{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998855180309101, "eval_steps": 500, "global_step": 4367, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002289639381797367, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1349, "step": 1 }, { "epoch": 0.0004579278763594734, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1263, "step": 2 }, { "epoch": 0.0006868918145392101, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.1207, "step": 3 }, { "epoch": 0.0009158557527189468, "grad_norm": 8.66817569732666, "learning_rate": 1.5151515151515152e-07, "loss": 2.1783, "step": 4 }, { "epoch": 0.0011448196908986834, "grad_norm": 8.842265129089355, "learning_rate": 3.0303030303030305e-07, "loss": 2.0996, "step": 5 }, { "epoch": 0.0013737836290784202, "grad_norm": 10.951105117797852, "learning_rate": 4.5454545454545457e-07, "loss": 2.2171, "step": 6 }, { "epoch": 0.0016027475672581567, "grad_norm": 9.442892074584961, "learning_rate": 6.060606060606061e-07, "loss": 2.0768, "step": 7 }, { "epoch": 0.0018317115054378936, "grad_norm": 8.939762115478516, "learning_rate": 7.575757575757576e-07, "loss": 2.1261, "step": 8 }, { "epoch": 0.00206067544361763, "grad_norm": 8.166585922241211, "learning_rate": 9.090909090909091e-07, "loss": 2.0497, "step": 9 }, { "epoch": 0.0022896393817973667, "grad_norm": 8.294096946716309, "learning_rate": 1.0606060606060608e-06, "loss": 2.0554, "step": 10 }, { "epoch": 0.0025186033199771037, "grad_norm": 7.692963123321533, "learning_rate": 1.2121212121212122e-06, "loss": 2.0066, "step": 11 }, { "epoch": 0.0027475672581568403, "grad_norm": 6.519928932189941, "learning_rate": 1.3636363636363636e-06, "loss": 1.804, "step": 12 }, { "epoch": 0.002976531196336577, "grad_norm": 6.0504631996154785, "learning_rate": 1.5151515151515152e-06, "loss": 1.728, "step": 13 }, { "epoch": 0.0032054951345163135, "grad_norm": 5.239335060119629, "learning_rate": 1.6666666666666667e-06, "loss": 1.703, "step": 14 }, { "epoch": 0.0034344590726960505, "grad_norm": 3.2573082447052, "learning_rate": 1.8181818181818183e-06, "loss": 1.6709, "step": 15 }, { "epoch": 0.003663423010875787, "grad_norm": 3.028881788253784, "learning_rate": 1.96969696969697e-06, "loss": 1.5926, "step": 16 }, { "epoch": 0.0038923869490555237, "grad_norm": 2.915285110473633, "learning_rate": 2.1212121212121216e-06, "loss": 1.5392, "step": 17 }, { "epoch": 0.00412135088723526, "grad_norm": 3.2491884231567383, "learning_rate": 2.2727272727272728e-06, "loss": 1.5468, "step": 18 }, { "epoch": 0.004350314825414997, "grad_norm": 2.825366497039795, "learning_rate": 2.4242424242424244e-06, "loss": 1.4983, "step": 19 }, { "epoch": 0.0045792787635947334, "grad_norm": 3.073415994644165, "learning_rate": 2.575757575757576e-06, "loss": 1.4596, "step": 20 }, { "epoch": 0.004808242701774471, "grad_norm": 4.21063756942749, "learning_rate": 2.7272727272727272e-06, "loss": 1.4261, "step": 21 }, { "epoch": 0.0050372066399542075, "grad_norm": 2.9680824279785156, "learning_rate": 2.8787878787878793e-06, "loss": 1.4863, "step": 22 }, { "epoch": 0.005266170578133944, "grad_norm": 3.912755250930786, "learning_rate": 3.0303030303030305e-06, "loss": 1.4069, "step": 23 }, { "epoch": 0.005495134516313681, "grad_norm": 1.9926663637161255, "learning_rate": 3.181818181818182e-06, "loss": 1.3404, "step": 24 }, { "epoch": 0.005724098454493417, "grad_norm": 5.043450355529785, "learning_rate": 3.3333333333333333e-06, "loss": 1.402, "step": 25 }, { "epoch": 0.005953062392673154, "grad_norm": 2.1705479621887207, "learning_rate": 3.4848484848484854e-06, "loss": 1.412, "step": 26 }, { "epoch": 0.00618202633085289, "grad_norm": 1.6663283109664917, "learning_rate": 3.6363636363636366e-06, "loss": 1.4681, "step": 27 }, { "epoch": 0.006410990269032627, "grad_norm": 1.9405096769332886, "learning_rate": 3.7878787878787882e-06, "loss": 1.4265, "step": 28 }, { "epoch": 0.0066399542072123644, "grad_norm": 2.0428900718688965, "learning_rate": 3.93939393939394e-06, "loss": 1.362, "step": 29 }, { "epoch": 0.006868918145392101, "grad_norm": 2.2596077919006348, "learning_rate": 4.0909090909090915e-06, "loss": 1.373, "step": 30 }, { "epoch": 0.007097882083571838, "grad_norm": 1.9293066263198853, "learning_rate": 4.242424242424243e-06, "loss": 1.3973, "step": 31 }, { "epoch": 0.007326846021751574, "grad_norm": 1.669158697128296, "learning_rate": 4.393939393939394e-06, "loss": 1.3654, "step": 32 }, { "epoch": 0.007555809959931311, "grad_norm": 1.5311822891235352, "learning_rate": 4.5454545454545455e-06, "loss": 1.3982, "step": 33 }, { "epoch": 0.007784773898111047, "grad_norm": 1.653910517692566, "learning_rate": 4.696969696969698e-06, "loss": 1.3749, "step": 34 }, { "epoch": 0.008013737836290785, "grad_norm": 1.5567728281021118, "learning_rate": 4.848484848484849e-06, "loss": 1.3699, "step": 35 }, { "epoch": 0.00824270177447052, "grad_norm": 1.6896796226501465, "learning_rate": 5e-06, "loss": 1.3356, "step": 36 }, { "epoch": 0.008471665712650258, "grad_norm": 1.6226534843444824, "learning_rate": 5.151515151515152e-06, "loss": 1.3602, "step": 37 }, { "epoch": 0.008700629650829994, "grad_norm": 1.81944739818573, "learning_rate": 5.303030303030303e-06, "loss": 1.3763, "step": 38 }, { "epoch": 0.008929593589009731, "grad_norm": 1.9701725244522095, "learning_rate": 5.4545454545454545e-06, "loss": 1.4414, "step": 39 }, { "epoch": 0.009158557527189467, "grad_norm": 1.9407984018325806, "learning_rate": 5.606060606060606e-06, "loss": 1.3173, "step": 40 }, { "epoch": 0.009387521465369204, "grad_norm": 1.5994501113891602, "learning_rate": 5.7575757575757586e-06, "loss": 1.3494, "step": 41 }, { "epoch": 0.009616485403548942, "grad_norm": 1.8734986782073975, "learning_rate": 5.90909090909091e-06, "loss": 1.3658, "step": 42 }, { "epoch": 0.009845449341728678, "grad_norm": 1.8831076622009277, "learning_rate": 6.060606060606061e-06, "loss": 1.3024, "step": 43 }, { "epoch": 0.010074413279908415, "grad_norm": 1.6908974647521973, "learning_rate": 6.212121212121213e-06, "loss": 1.4275, "step": 44 }, { "epoch": 0.01030337721808815, "grad_norm": 1.742285132408142, "learning_rate": 6.363636363636364e-06, "loss": 1.3254, "step": 45 }, { "epoch": 0.010532341156267888, "grad_norm": 1.9381937980651855, "learning_rate": 6.515151515151516e-06, "loss": 1.3249, "step": 46 }, { "epoch": 0.010761305094447624, "grad_norm": 1.445646047592163, "learning_rate": 6.666666666666667e-06, "loss": 1.325, "step": 47 }, { "epoch": 0.010990269032627361, "grad_norm": 1.6461410522460938, "learning_rate": 6.818181818181818e-06, "loss": 1.3958, "step": 48 }, { "epoch": 0.011219232970807097, "grad_norm": 2.5436205863952637, "learning_rate": 6.969696969696971e-06, "loss": 1.2518, "step": 49 }, { "epoch": 0.011448196908986834, "grad_norm": 1.5389620065689087, "learning_rate": 7.121212121212122e-06, "loss": 1.2919, "step": 50 }, { "epoch": 0.011677160847166572, "grad_norm": 1.7769535779953003, "learning_rate": 7.272727272727273e-06, "loss": 1.2673, "step": 51 }, { "epoch": 0.011906124785346308, "grad_norm": 2.1329517364501953, "learning_rate": 7.424242424242425e-06, "loss": 1.2995, "step": 52 }, { "epoch": 0.012135088723526045, "grad_norm": 1.5156912803649902, "learning_rate": 7.5757575757575764e-06, "loss": 1.3062, "step": 53 }, { "epoch": 0.01236405266170578, "grad_norm": 1.4951452016830444, "learning_rate": 7.727272727272727e-06, "loss": 1.3429, "step": 54 }, { "epoch": 0.012593016599885518, "grad_norm": 1.3737133741378784, "learning_rate": 7.87878787878788e-06, "loss": 1.2991, "step": 55 }, { "epoch": 0.012821980538065254, "grad_norm": 1.3582038879394531, "learning_rate": 8.03030303030303e-06, "loss": 1.2792, "step": 56 }, { "epoch": 0.013050944476244991, "grad_norm": 1.4764114618301392, "learning_rate": 8.181818181818183e-06, "loss": 1.3561, "step": 57 }, { "epoch": 0.013279908414424729, "grad_norm": 1.6100513935089111, "learning_rate": 8.333333333333334e-06, "loss": 1.2303, "step": 58 }, { "epoch": 0.013508872352604465, "grad_norm": 1.9281264543533325, "learning_rate": 8.484848484848486e-06, "loss": 1.3124, "step": 59 }, { "epoch": 0.013737836290784202, "grad_norm": 1.6992368698120117, "learning_rate": 8.636363636363637e-06, "loss": 1.3066, "step": 60 }, { "epoch": 0.013966800228963938, "grad_norm": 1.4923304319381714, "learning_rate": 8.787878787878788e-06, "loss": 1.3137, "step": 61 }, { "epoch": 0.014195764167143675, "grad_norm": 3.8915576934814453, "learning_rate": 8.93939393939394e-06, "loss": 1.2634, "step": 62 }, { "epoch": 0.014424728105323411, "grad_norm": 1.6618587970733643, "learning_rate": 9.090909090909091e-06, "loss": 1.2467, "step": 63 }, { "epoch": 0.014653692043503148, "grad_norm": 1.4143376350402832, "learning_rate": 9.242424242424244e-06, "loss": 1.2817, "step": 64 }, { "epoch": 0.014882655981682884, "grad_norm": 1.594414234161377, "learning_rate": 9.393939393939396e-06, "loss": 1.3015, "step": 65 }, { "epoch": 0.015111619919862622, "grad_norm": 1.4304990768432617, "learning_rate": 9.545454545454547e-06, "loss": 1.3633, "step": 66 }, { "epoch": 0.015340583858042359, "grad_norm": 2.2776196002960205, "learning_rate": 9.696969696969698e-06, "loss": 1.3109, "step": 67 }, { "epoch": 0.015569547796222095, "grad_norm": 1.789172649383545, "learning_rate": 9.84848484848485e-06, "loss": 1.3003, "step": 68 }, { "epoch": 0.015798511734401832, "grad_norm": 1.5805341005325317, "learning_rate": 1e-05, "loss": 1.3144, "step": 69 }, { "epoch": 0.01602747567258157, "grad_norm": 1.2724578380584717, "learning_rate": 1.0151515151515152e-05, "loss": 1.2855, "step": 70 }, { "epoch": 0.016256439610761304, "grad_norm": 1.451402187347412, "learning_rate": 1.0303030303030304e-05, "loss": 1.2442, "step": 71 }, { "epoch": 0.01648540354894104, "grad_norm": 2.8804028034210205, "learning_rate": 1.0454545454545455e-05, "loss": 1.2494, "step": 72 }, { "epoch": 0.01671436748712078, "grad_norm": 2.359454870223999, "learning_rate": 1.0606060606060606e-05, "loss": 1.1962, "step": 73 }, { "epoch": 0.016943331425300516, "grad_norm": 3.0040102005004883, "learning_rate": 1.0757575757575758e-05, "loss": 1.26, "step": 74 }, { "epoch": 0.017172295363480253, "grad_norm": 2.051539659500122, "learning_rate": 1.0909090909090909e-05, "loss": 1.2601, "step": 75 }, { "epoch": 0.017401259301659987, "grad_norm": 2.0666730403900146, "learning_rate": 1.1060606060606061e-05, "loss": 1.3369, "step": 76 }, { "epoch": 0.017630223239839725, "grad_norm": 1.3947479724884033, "learning_rate": 1.1212121212121212e-05, "loss": 1.2414, "step": 77 }, { "epoch": 0.017859187178019462, "grad_norm": 1.6999517679214478, "learning_rate": 1.1363636363636366e-05, "loss": 1.2773, "step": 78 }, { "epoch": 0.0180881511161992, "grad_norm": 1.3520228862762451, "learning_rate": 1.1515151515151517e-05, "loss": 1.2515, "step": 79 }, { "epoch": 0.018317115054378934, "grad_norm": 1.7271336317062378, "learning_rate": 1.1666666666666668e-05, "loss": 1.285, "step": 80 }, { "epoch": 0.01854607899255867, "grad_norm": 1.5026074647903442, "learning_rate": 1.181818181818182e-05, "loss": 1.2984, "step": 81 }, { "epoch": 0.01877504293073841, "grad_norm": 1.455031156539917, "learning_rate": 1.1969696969696971e-05, "loss": 1.1785, "step": 82 }, { "epoch": 0.019004006868918146, "grad_norm": 2.4134457111358643, "learning_rate": 1.2121212121212122e-05, "loss": 1.2861, "step": 83 }, { "epoch": 0.019232970807097884, "grad_norm": 1.3893452882766724, "learning_rate": 1.2272727272727274e-05, "loss": 1.3012, "step": 84 }, { "epoch": 0.019461934745277618, "grad_norm": 2.0484437942504883, "learning_rate": 1.2424242424242425e-05, "loss": 1.2981, "step": 85 }, { "epoch": 0.019690898683457355, "grad_norm": 1.8571381568908691, "learning_rate": 1.2575757575757576e-05, "loss": 1.2488, "step": 86 }, { "epoch": 0.019919862621637092, "grad_norm": 1.5972386598587036, "learning_rate": 1.2727272727272728e-05, "loss": 1.2762, "step": 87 }, { "epoch": 0.02014882655981683, "grad_norm": 1.9522424936294556, "learning_rate": 1.287878787878788e-05, "loss": 1.2176, "step": 88 }, { "epoch": 0.020377790497996564, "grad_norm": 1.641571044921875, "learning_rate": 1.3030303030303032e-05, "loss": 1.21, "step": 89 }, { "epoch": 0.0206067544361763, "grad_norm": 2.441856861114502, "learning_rate": 1.3181818181818183e-05, "loss": 1.2518, "step": 90 }, { "epoch": 0.02083571837435604, "grad_norm": 2.1330080032348633, "learning_rate": 1.3333333333333333e-05, "loss": 1.2659, "step": 91 }, { "epoch": 0.021064682312535776, "grad_norm": 1.3508576154708862, "learning_rate": 1.3484848484848486e-05, "loss": 1.2586, "step": 92 }, { "epoch": 0.021293646250715514, "grad_norm": 1.7117724418640137, "learning_rate": 1.3636363636363637e-05, "loss": 1.3094, "step": 93 }, { "epoch": 0.021522610188895248, "grad_norm": 1.690869927406311, "learning_rate": 1.378787878787879e-05, "loss": 1.1898, "step": 94 }, { "epoch": 0.021751574127074985, "grad_norm": 2.3677923679351807, "learning_rate": 1.3939393939393942e-05, "loss": 1.2628, "step": 95 }, { "epoch": 0.021980538065254723, "grad_norm": 1.5887540578842163, "learning_rate": 1.4090909090909092e-05, "loss": 1.2717, "step": 96 }, { "epoch": 0.02220950200343446, "grad_norm": 2.6575193405151367, "learning_rate": 1.4242424242424245e-05, "loss": 1.2429, "step": 97 }, { "epoch": 0.022438465941614194, "grad_norm": 1.425506830215454, "learning_rate": 1.4393939393939396e-05, "loss": 1.277, "step": 98 }, { "epoch": 0.02266742987979393, "grad_norm": 1.729892611503601, "learning_rate": 1.4545454545454546e-05, "loss": 1.2601, "step": 99 }, { "epoch": 0.02289639381797367, "grad_norm": 2.3861329555511475, "learning_rate": 1.4696969696969699e-05, "loss": 1.275, "step": 100 }, { "epoch": 0.023125357756153406, "grad_norm": 1.65578031539917, "learning_rate": 1.484848484848485e-05, "loss": 1.248, "step": 101 }, { "epoch": 0.023354321694333144, "grad_norm": 3.5684421062469482, "learning_rate": 1.5000000000000002e-05, "loss": 1.2647, "step": 102 }, { "epoch": 0.023583285632512878, "grad_norm": 19.274433135986328, "learning_rate": 1.5151515151515153e-05, "loss": 1.2598, "step": 103 }, { "epoch": 0.023812249570692615, "grad_norm": 1.5521622896194458, "learning_rate": 1.5303030303030304e-05, "loss": 1.2898, "step": 104 }, { "epoch": 0.024041213508872353, "grad_norm": 1.7420704364776611, "learning_rate": 1.5454545454545454e-05, "loss": 1.289, "step": 105 }, { "epoch": 0.02427017744705209, "grad_norm": 1.6716350317001343, "learning_rate": 1.5606060606060605e-05, "loss": 1.2162, "step": 106 }, { "epoch": 0.024499141385231828, "grad_norm": 1.7894171476364136, "learning_rate": 1.575757575757576e-05, "loss": 1.2512, "step": 107 }, { "epoch": 0.02472810532341156, "grad_norm": 2.7053062915802, "learning_rate": 1.590909090909091e-05, "loss": 1.2625, "step": 108 }, { "epoch": 0.0249570692615913, "grad_norm": 1.7497886419296265, "learning_rate": 1.606060606060606e-05, "loss": 1.2179, "step": 109 }, { "epoch": 0.025186033199771037, "grad_norm": 1.6836780309677124, "learning_rate": 1.6212121212121212e-05, "loss": 1.2076, "step": 110 }, { "epoch": 0.025414997137950774, "grad_norm": 1.893149733543396, "learning_rate": 1.6363636363636366e-05, "loss": 1.2613, "step": 111 }, { "epoch": 0.025643961076130508, "grad_norm": 1.6643422842025757, "learning_rate": 1.6515151515151517e-05, "loss": 1.2215, "step": 112 }, { "epoch": 0.025872925014310245, "grad_norm": 1.624299168586731, "learning_rate": 1.6666666666666667e-05, "loss": 1.2552, "step": 113 }, { "epoch": 0.026101888952489983, "grad_norm": 2.46860408782959, "learning_rate": 1.681818181818182e-05, "loss": 1.2901, "step": 114 }, { "epoch": 0.02633085289066972, "grad_norm": 2.3799362182617188, "learning_rate": 1.6969696969696972e-05, "loss": 1.198, "step": 115 }, { "epoch": 0.026559816828849458, "grad_norm": 1.3481436967849731, "learning_rate": 1.7121212121212123e-05, "loss": 1.2413, "step": 116 }, { "epoch": 0.026788780767029192, "grad_norm": 1.9589934349060059, "learning_rate": 1.7272727272727274e-05, "loss": 1.2845, "step": 117 }, { "epoch": 0.02701774470520893, "grad_norm": 2.889359712600708, "learning_rate": 1.7424242424242425e-05, "loss": 1.2236, "step": 118 }, { "epoch": 0.027246708643388667, "grad_norm": 2.009254217147827, "learning_rate": 1.7575757575757576e-05, "loss": 1.2147, "step": 119 }, { "epoch": 0.027475672581568404, "grad_norm": 1.872424602508545, "learning_rate": 1.772727272727273e-05, "loss": 1.2385, "step": 120 }, { "epoch": 0.027704636519748138, "grad_norm": 1.411062479019165, "learning_rate": 1.787878787878788e-05, "loss": 1.2238, "step": 121 }, { "epoch": 0.027933600457927876, "grad_norm": 1.9089847803115845, "learning_rate": 1.803030303030303e-05, "loss": 1.3142, "step": 122 }, { "epoch": 0.028162564396107613, "grad_norm": 1.5556221008300781, "learning_rate": 1.8181818181818182e-05, "loss": 1.3167, "step": 123 }, { "epoch": 0.02839152833428735, "grad_norm": 1.295181393623352, "learning_rate": 1.8333333333333333e-05, "loss": 1.2127, "step": 124 }, { "epoch": 0.028620492272467088, "grad_norm": 1.6264946460723877, "learning_rate": 1.8484848484848487e-05, "loss": 1.2766, "step": 125 }, { "epoch": 0.028849456210646822, "grad_norm": 1.6156526803970337, "learning_rate": 1.8636363636363638e-05, "loss": 1.2583, "step": 126 }, { "epoch": 0.02907842014882656, "grad_norm": 2.654879570007324, "learning_rate": 1.8787878787878792e-05, "loss": 1.223, "step": 127 }, { "epoch": 0.029307384087006297, "grad_norm": 2.6344661712646484, "learning_rate": 1.8939393939393943e-05, "loss": 1.2516, "step": 128 }, { "epoch": 0.029536348025186034, "grad_norm": 2.6589064598083496, "learning_rate": 1.9090909090909094e-05, "loss": 1.2435, "step": 129 }, { "epoch": 0.029765311963365768, "grad_norm": 2.257918357849121, "learning_rate": 1.9242424242424244e-05, "loss": 1.1835, "step": 130 }, { "epoch": 0.029994275901545506, "grad_norm": 1.8436076641082764, "learning_rate": 1.9393939393939395e-05, "loss": 1.1735, "step": 131 }, { "epoch": 0.030223239839725243, "grad_norm": 1.7338463068008423, "learning_rate": 1.9545454545454546e-05, "loss": 1.1817, "step": 132 }, { "epoch": 0.03045220377790498, "grad_norm": 2.028693675994873, "learning_rate": 1.96969696969697e-05, "loss": 1.2663, "step": 133 }, { "epoch": 0.030681167716084718, "grad_norm": 1.6048945188522339, "learning_rate": 1.984848484848485e-05, "loss": 1.237, "step": 134 }, { "epoch": 0.030910131654264452, "grad_norm": 1.475034236907959, "learning_rate": 2e-05, "loss": 1.2534, "step": 135 }, { "epoch": 0.03113909559244419, "grad_norm": 1.4430214166641235, "learning_rate": 1.9999997248541923e-05, "loss": 1.2202, "step": 136 }, { "epoch": 0.031368059530623923, "grad_norm": 1.705660343170166, "learning_rate": 1.9999988994169196e-05, "loss": 1.2087, "step": 137 }, { "epoch": 0.031597023468803664, "grad_norm": 1.6410300731658936, "learning_rate": 1.9999975236886367e-05, "loss": 1.1895, "step": 138 }, { "epoch": 0.0318259874069834, "grad_norm": 2.1050198078155518, "learning_rate": 1.9999955976701005e-05, "loss": 1.2435, "step": 139 }, { "epoch": 0.03205495134516314, "grad_norm": 2.0697057247161865, "learning_rate": 1.9999931213623708e-05, "loss": 1.2032, "step": 140 }, { "epoch": 0.03228391528334287, "grad_norm": 1.8335312604904175, "learning_rate": 1.9999900947668106e-05, "loss": 1.2381, "step": 141 }, { "epoch": 0.03251287922152261, "grad_norm": 1.9289928674697876, "learning_rate": 1.9999865178850847e-05, "loss": 1.2319, "step": 142 }, { "epoch": 0.03274184315970235, "grad_norm": 2.391378164291382, "learning_rate": 1.9999823907191623e-05, "loss": 1.2736, "step": 143 }, { "epoch": 0.03297080709788208, "grad_norm": 1.6855164766311646, "learning_rate": 1.9999777132713137e-05, "loss": 1.2463, "step": 144 }, { "epoch": 0.03319977103606182, "grad_norm": 1.5160446166992188, "learning_rate": 1.999972485544114e-05, "loss": 1.2183, "step": 145 }, { "epoch": 0.03342873497424156, "grad_norm": 1.6463406085968018, "learning_rate": 1.9999667075404385e-05, "loss": 1.2261, "step": 146 }, { "epoch": 0.03365769891242129, "grad_norm": 1.453431248664856, "learning_rate": 1.999960379263468e-05, "loss": 1.2804, "step": 147 }, { "epoch": 0.03388666285060103, "grad_norm": 4.494565486907959, "learning_rate": 1.9999535007166847e-05, "loss": 1.19, "step": 148 }, { "epoch": 0.034115626788780766, "grad_norm": 2.251443386077881, "learning_rate": 1.999946071903873e-05, "loss": 1.2244, "step": 149 }, { "epoch": 0.03434459072696051, "grad_norm": 1.86344575881958, "learning_rate": 1.999938092829122e-05, "loss": 1.2541, "step": 150 }, { "epoch": 0.03457355466514024, "grad_norm": 2.023585796356201, "learning_rate": 1.9999295634968216e-05, "loss": 1.2251, "step": 151 }, { "epoch": 0.034802518603319975, "grad_norm": 1.6223890781402588, "learning_rate": 1.999920483911666e-05, "loss": 1.2049, "step": 152 }, { "epoch": 0.035031482541499716, "grad_norm": 1.8935831785202026, "learning_rate": 1.9999108540786513e-05, "loss": 1.2355, "step": 153 }, { "epoch": 0.03526044647967945, "grad_norm": 1.2119618654251099, "learning_rate": 1.9999006740030774e-05, "loss": 1.2546, "step": 154 }, { "epoch": 0.035489410417859184, "grad_norm": 1.663476586341858, "learning_rate": 1.999889943690545e-05, "loss": 1.2184, "step": 155 }, { "epoch": 0.035718374356038925, "grad_norm": 1.5204002857208252, "learning_rate": 1.9998786631469602e-05, "loss": 1.2055, "step": 156 }, { "epoch": 0.03594733829421866, "grad_norm": 1.5706137418746948, "learning_rate": 1.9998668323785298e-05, "loss": 1.1985, "step": 157 }, { "epoch": 0.0361763022323984, "grad_norm": 2.2016441822052, "learning_rate": 1.9998544513917646e-05, "loss": 1.2262, "step": 158 }, { "epoch": 0.036405266170578134, "grad_norm": 1.5397722721099854, "learning_rate": 1.9998415201934775e-05, "loss": 1.1625, "step": 159 }, { "epoch": 0.03663423010875787, "grad_norm": 1.7281806468963623, "learning_rate": 1.9998280387907845e-05, "loss": 1.2658, "step": 160 }, { "epoch": 0.03686319404693761, "grad_norm": 1.4419702291488647, "learning_rate": 1.9998140071911044e-05, "loss": 1.1674, "step": 161 }, { "epoch": 0.03709215798511734, "grad_norm": 1.9832121133804321, "learning_rate": 1.9997994254021584e-05, "loss": 1.2543, "step": 162 }, { "epoch": 0.03732112192329708, "grad_norm": 1.2950409650802612, "learning_rate": 1.999784293431971e-05, "loss": 1.2494, "step": 163 }, { "epoch": 0.03755008586147682, "grad_norm": 1.6367920637130737, "learning_rate": 1.9997686112888694e-05, "loss": 1.1819, "step": 164 }, { "epoch": 0.03777904979965655, "grad_norm": 1.458843469619751, "learning_rate": 1.9997523789814827e-05, "loss": 1.2508, "step": 165 }, { "epoch": 0.03800801373783629, "grad_norm": 1.2992488145828247, "learning_rate": 1.999735596518744e-05, "loss": 1.2549, "step": 166 }, { "epoch": 0.038236977676016026, "grad_norm": 1.3639360666275024, "learning_rate": 1.999718263909888e-05, "loss": 1.3374, "step": 167 }, { "epoch": 0.03846594161419577, "grad_norm": 1.6184186935424805, "learning_rate": 1.9997003811644534e-05, "loss": 1.1888, "step": 168 }, { "epoch": 0.0386949055523755, "grad_norm": 1.2626984119415283, "learning_rate": 1.9996819482922804e-05, "loss": 1.1682, "step": 169 }, { "epoch": 0.038923869490555235, "grad_norm": 1.3981817960739136, "learning_rate": 1.9996629653035128e-05, "loss": 1.2318, "step": 170 }, { "epoch": 0.039152833428734976, "grad_norm": 1.3171190023422241, "learning_rate": 1.999643432208596e-05, "loss": 1.1496, "step": 171 }, { "epoch": 0.03938179736691471, "grad_norm": 1.2882895469665527, "learning_rate": 1.99962334901828e-05, "loss": 1.2616, "step": 172 }, { "epoch": 0.03961076130509445, "grad_norm": 1.8390611410140991, "learning_rate": 1.9996027157436154e-05, "loss": 1.3038, "step": 173 }, { "epoch": 0.039839725243274185, "grad_norm": 1.3590266704559326, "learning_rate": 1.9995815323959576e-05, "loss": 1.2718, "step": 174 }, { "epoch": 0.04006868918145392, "grad_norm": 1.7429498434066772, "learning_rate": 1.9995597989869625e-05, "loss": 1.2578, "step": 175 }, { "epoch": 0.04029765311963366, "grad_norm": 1.5614861249923706, "learning_rate": 1.9995375155285906e-05, "loss": 1.1698, "step": 176 }, { "epoch": 0.040526617057813394, "grad_norm": 2.0290753841400146, "learning_rate": 1.999514682033104e-05, "loss": 1.219, "step": 177 }, { "epoch": 0.04075558099599313, "grad_norm": 1.4158384799957275, "learning_rate": 1.9994912985130682e-05, "loss": 1.1945, "step": 178 }, { "epoch": 0.04098454493417287, "grad_norm": 1.7731130123138428, "learning_rate": 1.99946736498135e-05, "loss": 1.1985, "step": 179 }, { "epoch": 0.0412135088723526, "grad_norm": 1.9962372779846191, "learning_rate": 1.9994428814511205e-05, "loss": 1.2413, "step": 180 }, { "epoch": 0.041442472810532344, "grad_norm": 1.7486003637313843, "learning_rate": 1.9994178479358526e-05, "loss": 1.1465, "step": 181 }, { "epoch": 0.04167143674871208, "grad_norm": 1.415873646736145, "learning_rate": 1.9993922644493223e-05, "loss": 1.2388, "step": 182 }, { "epoch": 0.04190040068689181, "grad_norm": 1.6285479068756104, "learning_rate": 1.9993661310056076e-05, "loss": 1.286, "step": 183 }, { "epoch": 0.04212936462507155, "grad_norm": 1.3321681022644043, "learning_rate": 1.99933944761909e-05, "loss": 1.1976, "step": 184 }, { "epoch": 0.04235832856325129, "grad_norm": 1.5006210803985596, "learning_rate": 1.999312214304453e-05, "loss": 1.209, "step": 185 }, { "epoch": 0.04258729250143103, "grad_norm": 3.087294101715088, "learning_rate": 1.999284431076682e-05, "loss": 1.2405, "step": 186 }, { "epoch": 0.04281625643961076, "grad_norm": 2.618927240371704, "learning_rate": 1.999256097951067e-05, "loss": 1.2988, "step": 187 }, { "epoch": 0.043045220377790495, "grad_norm": 1.4773956537246704, "learning_rate": 1.999227214943199e-05, "loss": 1.1566, "step": 188 }, { "epoch": 0.043274184315970236, "grad_norm": 1.824690818786621, "learning_rate": 1.999197782068972e-05, "loss": 1.2149, "step": 189 }, { "epoch": 0.04350314825414997, "grad_norm": 1.7128965854644775, "learning_rate": 1.9991677993445832e-05, "loss": 1.1904, "step": 190 }, { "epoch": 0.04373211219232971, "grad_norm": 1.7998428344726562, "learning_rate": 1.999137266786531e-05, "loss": 1.2846, "step": 191 }, { "epoch": 0.043961076130509445, "grad_norm": 2.0802230834960938, "learning_rate": 1.9991061844116178e-05, "loss": 1.2073, "step": 192 }, { "epoch": 0.04419004006868918, "grad_norm": 1.7808719873428345, "learning_rate": 1.9990745522369482e-05, "loss": 1.2883, "step": 193 }, { "epoch": 0.04441900400686892, "grad_norm": 1.7531901597976685, "learning_rate": 1.9990423702799283e-05, "loss": 1.1568, "step": 194 }, { "epoch": 0.044647967945048654, "grad_norm": 1.5850685834884644, "learning_rate": 1.9990096385582682e-05, "loss": 1.252, "step": 195 }, { "epoch": 0.04487693188322839, "grad_norm": 1.3775941133499146, "learning_rate": 1.9989763570899795e-05, "loss": 1.2807, "step": 196 }, { "epoch": 0.04510589582140813, "grad_norm": 1.3636221885681152, "learning_rate": 1.998942525893377e-05, "loss": 1.2385, "step": 197 }, { "epoch": 0.04533485975958786, "grad_norm": 1.4277960062026978, "learning_rate": 1.998908144987078e-05, "loss": 1.166, "step": 198 }, { "epoch": 0.045563823697767604, "grad_norm": 1.4968419075012207, "learning_rate": 1.9988732143900013e-05, "loss": 1.203, "step": 199 }, { "epoch": 0.04579278763594734, "grad_norm": 1.6300857067108154, "learning_rate": 1.998837734121369e-05, "loss": 1.1981, "step": 200 }, { "epoch": 0.04602175157412707, "grad_norm": 1.3024663925170898, "learning_rate": 1.9988017042007066e-05, "loss": 1.2181, "step": 201 }, { "epoch": 0.04625071551230681, "grad_norm": 1.2611545324325562, "learning_rate": 1.99876512464784e-05, "loss": 1.2375, "step": 202 }, { "epoch": 0.04647967945048655, "grad_norm": 1.7449328899383545, "learning_rate": 1.9987279954828986e-05, "loss": 1.2178, "step": 203 }, { "epoch": 0.04670864338866629, "grad_norm": 1.4038939476013184, "learning_rate": 1.998690316726315e-05, "loss": 1.225, "step": 204 }, { "epoch": 0.04693760732684602, "grad_norm": 1.573346495628357, "learning_rate": 1.9986520883988233e-05, "loss": 1.2241, "step": 205 }, { "epoch": 0.047166571265025756, "grad_norm": 1.1829012632369995, "learning_rate": 1.99861331052146e-05, "loss": 1.1722, "step": 206 }, { "epoch": 0.0473955352032055, "grad_norm": 1.6095627546310425, "learning_rate": 1.9985739831155637e-05, "loss": 1.2481, "step": 207 }, { "epoch": 0.04762449914138523, "grad_norm": 1.8268073797225952, "learning_rate": 1.998534106202777e-05, "loss": 1.1972, "step": 208 }, { "epoch": 0.04785346307956497, "grad_norm": 1.7510496377944946, "learning_rate": 1.9984936798050435e-05, "loss": 1.1889, "step": 209 }, { "epoch": 0.048082427017744706, "grad_norm": 1.9209846258163452, "learning_rate": 1.9984527039446093e-05, "loss": 1.2393, "step": 210 }, { "epoch": 0.04831139095592444, "grad_norm": 1.3671090602874756, "learning_rate": 1.998411178644023e-05, "loss": 1.1783, "step": 211 }, { "epoch": 0.04854035489410418, "grad_norm": 1.4478919506072998, "learning_rate": 1.9983691039261358e-05, "loss": 1.269, "step": 212 }, { "epoch": 0.048769318832283914, "grad_norm": 1.8099792003631592, "learning_rate": 1.998326479814101e-05, "loss": 1.1632, "step": 213 }, { "epoch": 0.048998282770463655, "grad_norm": 1.502695083618164, "learning_rate": 1.9982833063313746e-05, "loss": 1.2305, "step": 214 }, { "epoch": 0.04922724670864339, "grad_norm": 1.1444593667984009, "learning_rate": 1.998239583501714e-05, "loss": 1.1981, "step": 215 }, { "epoch": 0.04945621064682312, "grad_norm": 1.7091630697250366, "learning_rate": 1.99819531134918e-05, "loss": 1.2201, "step": 216 }, { "epoch": 0.049685174585002864, "grad_norm": 1.4177758693695068, "learning_rate": 1.9981504898981352e-05, "loss": 1.1817, "step": 217 }, { "epoch": 0.0499141385231826, "grad_norm": 1.2633055448532104, "learning_rate": 1.998105119173244e-05, "loss": 1.191, "step": 218 }, { "epoch": 0.05014310246136233, "grad_norm": 1.7247159481048584, "learning_rate": 1.998059199199474e-05, "loss": 1.2037, "step": 219 }, { "epoch": 0.05037206639954207, "grad_norm": 1.7244751453399658, "learning_rate": 1.9980127300020946e-05, "loss": 1.1898, "step": 220 }, { "epoch": 0.05060103033772181, "grad_norm": 1.452858567237854, "learning_rate": 1.997965711606677e-05, "loss": 1.2408, "step": 221 }, { "epoch": 0.05082999427590155, "grad_norm": 1.4057929515838623, "learning_rate": 1.9979181440390954e-05, "loss": 1.2024, "step": 222 }, { "epoch": 0.05105895821408128, "grad_norm": 1.3213428258895874, "learning_rate": 1.9978700273255254e-05, "loss": 1.2326, "step": 223 }, { "epoch": 0.051287922152261016, "grad_norm": 1.3907994031906128, "learning_rate": 1.9978213614924456e-05, "loss": 1.169, "step": 224 }, { "epoch": 0.05151688609044076, "grad_norm": 1.4973134994506836, "learning_rate": 1.9977721465666365e-05, "loss": 1.2163, "step": 225 }, { "epoch": 0.05174585002862049, "grad_norm": 1.6128813028335571, "learning_rate": 1.9977223825751802e-05, "loss": 1.1489, "step": 226 }, { "epoch": 0.05197481396680023, "grad_norm": 1.228517770767212, "learning_rate": 1.997672069545462e-05, "loss": 1.1924, "step": 227 }, { "epoch": 0.052203777904979966, "grad_norm": 1.5707398653030396, "learning_rate": 1.9976212075051683e-05, "loss": 1.2292, "step": 228 }, { "epoch": 0.0524327418431597, "grad_norm": 1.605344533920288, "learning_rate": 1.997569796482288e-05, "loss": 1.3047, "step": 229 }, { "epoch": 0.05266170578133944, "grad_norm": 1.1628702878952026, "learning_rate": 1.9975178365051123e-05, "loss": 1.205, "step": 230 }, { "epoch": 0.052890669719519175, "grad_norm": 1.4649019241333008, "learning_rate": 1.9974653276022347e-05, "loss": 1.2032, "step": 231 }, { "epoch": 0.053119633657698916, "grad_norm": 1.4579243659973145, "learning_rate": 1.99741226980255e-05, "loss": 1.238, "step": 232 }, { "epoch": 0.05334859759587865, "grad_norm": 1.4899598360061646, "learning_rate": 1.997358663135255e-05, "loss": 1.2124, "step": 233 }, { "epoch": 0.053577561534058384, "grad_norm": 1.2446345090866089, "learning_rate": 1.99730450762985e-05, "loss": 1.2593, "step": 234 }, { "epoch": 0.053806525472238124, "grad_norm": 1.5485501289367676, "learning_rate": 1.997249803316136e-05, "loss": 1.1541, "step": 235 }, { "epoch": 0.05403548941041786, "grad_norm": 1.386314868927002, "learning_rate": 1.997194550224216e-05, "loss": 1.234, "step": 236 }, { "epoch": 0.05426445334859759, "grad_norm": 1.917922854423523, "learning_rate": 1.9971387483844956e-05, "loss": 1.2255, "step": 237 }, { "epoch": 0.05449341728677733, "grad_norm": 1.9101078510284424, "learning_rate": 1.9970823978276818e-05, "loss": 1.1737, "step": 238 }, { "epoch": 0.05472238122495707, "grad_norm": 2.0706257820129395, "learning_rate": 1.9970254985847842e-05, "loss": 1.1984, "step": 239 }, { "epoch": 0.05495134516313681, "grad_norm": 1.579801321029663, "learning_rate": 1.9969680506871138e-05, "loss": 1.1953, "step": 240 }, { "epoch": 0.05518030910131654, "grad_norm": 1.2857059240341187, "learning_rate": 1.9969100541662833e-05, "loss": 1.2072, "step": 241 }, { "epoch": 0.055409273039496276, "grad_norm": 1.162064552307129, "learning_rate": 1.996851509054208e-05, "loss": 1.1743, "step": 242 }, { "epoch": 0.05563823697767602, "grad_norm": 1.7716028690338135, "learning_rate": 1.9967924153831054e-05, "loss": 1.1876, "step": 243 }, { "epoch": 0.05586720091585575, "grad_norm": 1.3083211183547974, "learning_rate": 1.9967327731854937e-05, "loss": 1.2037, "step": 244 }, { "epoch": 0.05609616485403549, "grad_norm": 1.6880347728729248, "learning_rate": 1.9966725824941933e-05, "loss": 1.112, "step": 245 }, { "epoch": 0.056325128792215226, "grad_norm": 1.951745629310608, "learning_rate": 1.996611843342327e-05, "loss": 1.2209, "step": 246 }, { "epoch": 0.05655409273039496, "grad_norm": 2.4489657878875732, "learning_rate": 1.9965505557633188e-05, "loss": 1.1525, "step": 247 }, { "epoch": 0.0567830566685747, "grad_norm": 1.4747159481048584, "learning_rate": 1.996488719790895e-05, "loss": 1.1597, "step": 248 }, { "epoch": 0.057012020606754435, "grad_norm": 1.5697540044784546, "learning_rate": 1.9964263354590835e-05, "loss": 1.1907, "step": 249 }, { "epoch": 0.057240984544934176, "grad_norm": 1.4222530126571655, "learning_rate": 1.9963634028022133e-05, "loss": 1.2253, "step": 250 }, { "epoch": 0.05746994848311391, "grad_norm": 1.2914453744888306, "learning_rate": 1.9962999218549156e-05, "loss": 1.2175, "step": 251 }, { "epoch": 0.057698912421293644, "grad_norm": 1.646164059638977, "learning_rate": 1.9962358926521245e-05, "loss": 1.1746, "step": 252 }, { "epoch": 0.057927876359473385, "grad_norm": 2.148345947265625, "learning_rate": 1.996171315229074e-05, "loss": 1.1377, "step": 253 }, { "epoch": 0.05815684029765312, "grad_norm": 1.5290459394454956, "learning_rate": 1.9961061896213006e-05, "loss": 1.1186, "step": 254 }, { "epoch": 0.05838580423583286, "grad_norm": 1.3710248470306396, "learning_rate": 1.9960405158646425e-05, "loss": 1.1941, "step": 255 }, { "epoch": 0.058614768174012594, "grad_norm": 1.6446150541305542, "learning_rate": 1.9959742939952393e-05, "loss": 1.1667, "step": 256 }, { "epoch": 0.05884373211219233, "grad_norm": 1.3985366821289062, "learning_rate": 1.9959075240495322e-05, "loss": 1.1506, "step": 257 }, { "epoch": 0.05907269605037207, "grad_norm": 1.5973879098892212, "learning_rate": 1.9958402060642644e-05, "loss": 1.1326, "step": 258 }, { "epoch": 0.0593016599885518, "grad_norm": 1.499578833580017, "learning_rate": 1.9957723400764803e-05, "loss": 1.1938, "step": 259 }, { "epoch": 0.059530623926731537, "grad_norm": 1.3453638553619385, "learning_rate": 1.9957039261235263e-05, "loss": 1.2466, "step": 260 }, { "epoch": 0.05975958786491128, "grad_norm": 1.7341777086257935, "learning_rate": 1.9956349642430494e-05, "loss": 1.1251, "step": 261 }, { "epoch": 0.05998855180309101, "grad_norm": 1.2891796827316284, "learning_rate": 1.9955654544729992e-05, "loss": 1.1765, "step": 262 }, { "epoch": 0.06021751574127075, "grad_norm": 1.1669985055923462, "learning_rate": 1.9954953968516262e-05, "loss": 1.2109, "step": 263 }, { "epoch": 0.060446479679450486, "grad_norm": 1.5981719493865967, "learning_rate": 1.9954247914174828e-05, "loss": 1.1865, "step": 264 }, { "epoch": 0.06067544361763022, "grad_norm": 1.8978960514068604, "learning_rate": 1.9953536382094218e-05, "loss": 1.2389, "step": 265 }, { "epoch": 0.06090440755580996, "grad_norm": 1.3212120532989502, "learning_rate": 1.995281937266599e-05, "loss": 1.1844, "step": 266 }, { "epoch": 0.061133371493989695, "grad_norm": 2.165839672088623, "learning_rate": 1.995209688628471e-05, "loss": 1.196, "step": 267 }, { "epoch": 0.061362335432169436, "grad_norm": 1.4239095449447632, "learning_rate": 1.9951368923347945e-05, "loss": 1.177, "step": 268 }, { "epoch": 0.06159129937034917, "grad_norm": 1.4673502445220947, "learning_rate": 1.9950635484256296e-05, "loss": 1.2372, "step": 269 }, { "epoch": 0.061820263308528904, "grad_norm": 1.2616180181503296, "learning_rate": 1.9949896569413368e-05, "loss": 1.1906, "step": 270 }, { "epoch": 0.062049227246708645, "grad_norm": 1.8740971088409424, "learning_rate": 1.9949152179225776e-05, "loss": 1.2036, "step": 271 }, { "epoch": 0.06227819118488838, "grad_norm": 1.2659952640533447, "learning_rate": 1.9948402314103153e-05, "loss": 1.1836, "step": 272 }, { "epoch": 0.06250715512306812, "grad_norm": 1.446509599685669, "learning_rate": 1.994764697445815e-05, "loss": 1.2553, "step": 273 }, { "epoch": 0.06273611906124785, "grad_norm": 1.2037794589996338, "learning_rate": 1.994688616070641e-05, "loss": 1.209, "step": 274 }, { "epoch": 0.06296508299942759, "grad_norm": 1.1829689741134644, "learning_rate": 1.9946119873266615e-05, "loss": 1.192, "step": 275 }, { "epoch": 0.06319404693760733, "grad_norm": 1.409515619277954, "learning_rate": 1.994534811256044e-05, "loss": 1.2247, "step": 276 }, { "epoch": 0.06342301087578707, "grad_norm": 1.238846778869629, "learning_rate": 1.994457087901258e-05, "loss": 1.2204, "step": 277 }, { "epoch": 0.0636519748139668, "grad_norm": 1.5566067695617676, "learning_rate": 1.9943788173050743e-05, "loss": 1.2295, "step": 278 }, { "epoch": 0.06388093875214654, "grad_norm": 1.221211552619934, "learning_rate": 1.9942999995105646e-05, "loss": 1.1664, "step": 279 }, { "epoch": 0.06410990269032628, "grad_norm": 1.3222095966339111, "learning_rate": 1.9942206345611008e-05, "loss": 1.1554, "step": 280 }, { "epoch": 0.064338866628506, "grad_norm": 1.4909206628799438, "learning_rate": 1.9941407225003577e-05, "loss": 1.2614, "step": 281 }, { "epoch": 0.06456783056668575, "grad_norm": 1.290727972984314, "learning_rate": 1.9940602633723097e-05, "loss": 1.2267, "step": 282 }, { "epoch": 0.06479679450486549, "grad_norm": 2.3527672290802, "learning_rate": 1.9939792572212328e-05, "loss": 1.1742, "step": 283 }, { "epoch": 0.06502575844304521, "grad_norm": 1.4631099700927734, "learning_rate": 1.993897704091705e-05, "loss": 1.2321, "step": 284 }, { "epoch": 0.06525472238122496, "grad_norm": 1.4212102890014648, "learning_rate": 1.9938156040286027e-05, "loss": 1.208, "step": 285 }, { "epoch": 0.0654836863194047, "grad_norm": 1.9231419563293457, "learning_rate": 1.9937329570771057e-05, "loss": 1.2423, "step": 286 }, { "epoch": 0.06571265025758444, "grad_norm": 2.0737898349761963, "learning_rate": 1.9936497632826937e-05, "loss": 1.1687, "step": 287 }, { "epoch": 0.06594161419576416, "grad_norm": 1.1904573440551758, "learning_rate": 1.993566022691148e-05, "loss": 1.2161, "step": 288 }, { "epoch": 0.0661705781339439, "grad_norm": 1.472386121749878, "learning_rate": 1.99348173534855e-05, "loss": 1.2219, "step": 289 }, { "epoch": 0.06639954207212365, "grad_norm": 2.0884242057800293, "learning_rate": 1.993396901301283e-05, "loss": 1.1996, "step": 290 }, { "epoch": 0.06662850601030337, "grad_norm": 1.3736237287521362, "learning_rate": 1.9933115205960295e-05, "loss": 1.1683, "step": 291 }, { "epoch": 0.06685746994848311, "grad_norm": 1.3148783445358276, "learning_rate": 1.9932255932797736e-05, "loss": 1.2628, "step": 292 }, { "epoch": 0.06708643388666286, "grad_norm": 1.330783486366272, "learning_rate": 1.9931391193998015e-05, "loss": 1.1524, "step": 293 }, { "epoch": 0.06731539782484258, "grad_norm": 1.5354028940200806, "learning_rate": 1.9930520990036984e-05, "loss": 1.2563, "step": 294 }, { "epoch": 0.06754436176302232, "grad_norm": 1.3578846454620361, "learning_rate": 1.9929645321393505e-05, "loss": 1.2097, "step": 295 }, { "epoch": 0.06777332570120206, "grad_norm": 1.1557707786560059, "learning_rate": 1.9928764188549462e-05, "loss": 1.1626, "step": 296 }, { "epoch": 0.06800228963938179, "grad_norm": 1.2463425397872925, "learning_rate": 1.9927877591989727e-05, "loss": 1.2083, "step": 297 }, { "epoch": 0.06823125357756153, "grad_norm": 1.1633163690567017, "learning_rate": 1.992698553220219e-05, "loss": 1.1593, "step": 298 }, { "epoch": 0.06846021751574127, "grad_norm": 1.3011326789855957, "learning_rate": 1.992608800967774e-05, "loss": 1.2601, "step": 299 }, { "epoch": 0.06868918145392101, "grad_norm": 1.7210752964019775, "learning_rate": 1.992518502491028e-05, "loss": 1.2187, "step": 300 }, { "epoch": 0.06891814539210074, "grad_norm": 1.1964160203933716, "learning_rate": 1.992427657839671e-05, "loss": 1.1478, "step": 301 }, { "epoch": 0.06914710933028048, "grad_norm": 1.2960009574890137, "learning_rate": 1.992336267063695e-05, "loss": 1.207, "step": 302 }, { "epoch": 0.06937607326846022, "grad_norm": 1.6698633432388306, "learning_rate": 1.9922443302133906e-05, "loss": 1.133, "step": 303 }, { "epoch": 0.06960503720663995, "grad_norm": 1.4826613664627075, "learning_rate": 1.9921518473393502e-05, "loss": 1.1586, "step": 304 }, { "epoch": 0.06983400114481969, "grad_norm": 1.3968387842178345, "learning_rate": 1.9920588184924664e-05, "loss": 1.1621, "step": 305 }, { "epoch": 0.07006296508299943, "grad_norm": 1.6237884759902954, "learning_rate": 1.9919652437239326e-05, "loss": 1.2013, "step": 306 }, { "epoch": 0.07029192902117916, "grad_norm": 1.5827672481536865, "learning_rate": 1.9918711230852416e-05, "loss": 1.1804, "step": 307 }, { "epoch": 0.0705208929593589, "grad_norm": 3.0790855884552, "learning_rate": 1.9917764566281874e-05, "loss": 1.1535, "step": 308 }, { "epoch": 0.07074985689753864, "grad_norm": 1.3645802736282349, "learning_rate": 1.9916812444048642e-05, "loss": 1.1941, "step": 309 }, { "epoch": 0.07097882083571837, "grad_norm": 1.1226625442504883, "learning_rate": 1.9915854864676665e-05, "loss": 1.1721, "step": 310 }, { "epoch": 0.07120778477389811, "grad_norm": 1.386254906654358, "learning_rate": 1.991489182869289e-05, "loss": 1.0778, "step": 311 }, { "epoch": 0.07143674871207785, "grad_norm": 1.1469475030899048, "learning_rate": 1.9913923336627267e-05, "loss": 1.1466, "step": 312 }, { "epoch": 0.07166571265025759, "grad_norm": 1.7655390501022339, "learning_rate": 1.9912949389012754e-05, "loss": 1.1651, "step": 313 }, { "epoch": 0.07189467658843732, "grad_norm": 1.2459030151367188, "learning_rate": 1.9911969986385297e-05, "loss": 1.2548, "step": 314 }, { "epoch": 0.07212364052661706, "grad_norm": 1.4673657417297363, "learning_rate": 1.991098512928386e-05, "loss": 1.1818, "step": 315 }, { "epoch": 0.0723526044647968, "grad_norm": 1.3704054355621338, "learning_rate": 1.9909994818250403e-05, "loss": 1.1223, "step": 316 }, { "epoch": 0.07258156840297653, "grad_norm": 1.2631467580795288, "learning_rate": 1.990899905382988e-05, "loss": 1.1135, "step": 317 }, { "epoch": 0.07281053234115627, "grad_norm": 1.255850076675415, "learning_rate": 1.990799783657026e-05, "loss": 1.1675, "step": 318 }, { "epoch": 0.07303949627933601, "grad_norm": 1.2794235944747925, "learning_rate": 1.9906991167022496e-05, "loss": 1.145, "step": 319 }, { "epoch": 0.07326846021751574, "grad_norm": 1.3548914194107056, "learning_rate": 1.990597904574055e-05, "loss": 1.2148, "step": 320 }, { "epoch": 0.07349742415569548, "grad_norm": 1.7542753219604492, "learning_rate": 1.990496147328139e-05, "loss": 1.164, "step": 321 }, { "epoch": 0.07372638809387522, "grad_norm": 1.3537834882736206, "learning_rate": 1.9903938450204972e-05, "loss": 1.2505, "step": 322 }, { "epoch": 0.07395535203205496, "grad_norm": 1.2525663375854492, "learning_rate": 1.9902909977074267e-05, "loss": 1.2464, "step": 323 }, { "epoch": 0.07418431597023468, "grad_norm": 1.4234391450881958, "learning_rate": 1.9901876054455217e-05, "loss": 1.1412, "step": 324 }, { "epoch": 0.07441327990841443, "grad_norm": 1.5655121803283691, "learning_rate": 1.9900836682916796e-05, "loss": 1.2074, "step": 325 }, { "epoch": 0.07464224384659417, "grad_norm": 1.317477822303772, "learning_rate": 1.989979186303096e-05, "loss": 1.1611, "step": 326 }, { "epoch": 0.0748712077847739, "grad_norm": 1.2287671566009521, "learning_rate": 1.989874159537266e-05, "loss": 1.1862, "step": 327 }, { "epoch": 0.07510017172295363, "grad_norm": 1.3680412769317627, "learning_rate": 1.989768588051985e-05, "loss": 1.0958, "step": 328 }, { "epoch": 0.07532913566113338, "grad_norm": 1.3635417222976685, "learning_rate": 1.9896624719053483e-05, "loss": 1.1807, "step": 329 }, { "epoch": 0.0755580995993131, "grad_norm": 1.7830880880355835, "learning_rate": 1.9895558111557503e-05, "loss": 1.2397, "step": 330 }, { "epoch": 0.07578706353749284, "grad_norm": 1.3236726522445679, "learning_rate": 1.9894486058618863e-05, "loss": 1.1608, "step": 331 }, { "epoch": 0.07601602747567258, "grad_norm": 1.733843207359314, "learning_rate": 1.98934085608275e-05, "loss": 1.2268, "step": 332 }, { "epoch": 0.07624499141385231, "grad_norm": 1.5161333084106445, "learning_rate": 1.9892325618776353e-05, "loss": 1.2225, "step": 333 }, { "epoch": 0.07647395535203205, "grad_norm": 1.2248483896255493, "learning_rate": 1.9891237233061354e-05, "loss": 1.1838, "step": 334 }, { "epoch": 0.0767029192902118, "grad_norm": 1.2949497699737549, "learning_rate": 1.989014340428143e-05, "loss": 1.1763, "step": 335 }, { "epoch": 0.07693188322839153, "grad_norm": 1.6322613954544067, "learning_rate": 1.9889044133038514e-05, "loss": 1.1773, "step": 336 }, { "epoch": 0.07716084716657126, "grad_norm": 1.5641744136810303, "learning_rate": 1.9887939419937522e-05, "loss": 1.2079, "step": 337 }, { "epoch": 0.077389811104751, "grad_norm": 1.1563526391983032, "learning_rate": 1.9886829265586368e-05, "loss": 1.1775, "step": 338 }, { "epoch": 0.07761877504293074, "grad_norm": 1.9111676216125488, "learning_rate": 1.9885713670595958e-05, "loss": 1.1931, "step": 339 }, { "epoch": 0.07784773898111047, "grad_norm": 1.3635084629058838, "learning_rate": 1.98845926355802e-05, "loss": 1.207, "step": 340 }, { "epoch": 0.07807670291929021, "grad_norm": 1.4956107139587402, "learning_rate": 1.988346616115598e-05, "loss": 1.1679, "step": 341 }, { "epoch": 0.07830566685746995, "grad_norm": 1.2649390697479248, "learning_rate": 1.98823342479432e-05, "loss": 1.1901, "step": 342 }, { "epoch": 0.07853463079564968, "grad_norm": 1.5861189365386963, "learning_rate": 1.9881196896564735e-05, "loss": 1.2169, "step": 343 }, { "epoch": 0.07876359473382942, "grad_norm": 1.2659194469451904, "learning_rate": 1.9880054107646467e-05, "loss": 1.2246, "step": 344 }, { "epoch": 0.07899255867200916, "grad_norm": 1.5740238428115845, "learning_rate": 1.9878905881817254e-05, "loss": 1.2331, "step": 345 }, { "epoch": 0.0792215226101889, "grad_norm": 1.9337862730026245, "learning_rate": 1.9877752219708956e-05, "loss": 1.1526, "step": 346 }, { "epoch": 0.07945048654836863, "grad_norm": 1.5918827056884766, "learning_rate": 1.987659312195643e-05, "loss": 1.1238, "step": 347 }, { "epoch": 0.07967945048654837, "grad_norm": 1.6437872648239136, "learning_rate": 1.9875428589197513e-05, "loss": 1.1845, "step": 348 }, { "epoch": 0.07990841442472811, "grad_norm": 1.2831597328186035, "learning_rate": 1.9874258622073044e-05, "loss": 1.1319, "step": 349 }, { "epoch": 0.08013737836290784, "grad_norm": 1.4037307500839233, "learning_rate": 1.9873083221226833e-05, "loss": 1.2522, "step": 350 }, { "epoch": 0.08036634230108758, "grad_norm": 1.400461196899414, "learning_rate": 1.9871902387305707e-05, "loss": 1.1294, "step": 351 }, { "epoch": 0.08059530623926732, "grad_norm": 1.286853551864624, "learning_rate": 1.9870716120959462e-05, "loss": 1.1933, "step": 352 }, { "epoch": 0.08082427017744705, "grad_norm": 1.2215043306350708, "learning_rate": 1.9869524422840893e-05, "loss": 1.1226, "step": 353 }, { "epoch": 0.08105323411562679, "grad_norm": 1.3354159593582153, "learning_rate": 1.9868327293605778e-05, "loss": 1.1322, "step": 354 }, { "epoch": 0.08128219805380653, "grad_norm": 1.2361743450164795, "learning_rate": 1.986712473391289e-05, "loss": 1.2118, "step": 355 }, { "epoch": 0.08151116199198626, "grad_norm": 1.4514862298965454, "learning_rate": 1.986591674442399e-05, "loss": 1.1469, "step": 356 }, { "epoch": 0.081740125930166, "grad_norm": 1.2745472192764282, "learning_rate": 1.9864703325803818e-05, "loss": 1.1403, "step": 357 }, { "epoch": 0.08196908986834574, "grad_norm": 1.3737826347351074, "learning_rate": 1.986348447872011e-05, "loss": 1.1907, "step": 358 }, { "epoch": 0.08219805380652548, "grad_norm": 1.354296326637268, "learning_rate": 1.986226020384359e-05, "loss": 1.1332, "step": 359 }, { "epoch": 0.0824270177447052, "grad_norm": 1.3757176399230957, "learning_rate": 1.986103050184797e-05, "loss": 1.2131, "step": 360 }, { "epoch": 0.08265598168288495, "grad_norm": 1.770670771598816, "learning_rate": 1.9859795373409934e-05, "loss": 1.1724, "step": 361 }, { "epoch": 0.08288494562106469, "grad_norm": 1.3382035493850708, "learning_rate": 1.985855481920917e-05, "loss": 1.1411, "step": 362 }, { "epoch": 0.08311390955924441, "grad_norm": 1.1877765655517578, "learning_rate": 1.9857308839928346e-05, "loss": 1.2451, "step": 363 }, { "epoch": 0.08334287349742416, "grad_norm": 1.3735177516937256, "learning_rate": 1.9856057436253105e-05, "loss": 1.1821, "step": 364 }, { "epoch": 0.0835718374356039, "grad_norm": 1.5288448333740234, "learning_rate": 1.9854800608872096e-05, "loss": 1.1408, "step": 365 }, { "epoch": 0.08380080137378362, "grad_norm": 1.8656703233718872, "learning_rate": 1.9853538358476933e-05, "loss": 1.2307, "step": 366 }, { "epoch": 0.08402976531196336, "grad_norm": 1.520883560180664, "learning_rate": 1.9852270685762222e-05, "loss": 1.1503, "step": 367 }, { "epoch": 0.0842587292501431, "grad_norm": 1.1496034860610962, "learning_rate": 1.9850997591425555e-05, "loss": 1.2023, "step": 368 }, { "epoch": 0.08448769318832285, "grad_norm": 1.2435383796691895, "learning_rate": 1.9849719076167502e-05, "loss": 1.1091, "step": 369 }, { "epoch": 0.08471665712650257, "grad_norm": 1.5105100870132446, "learning_rate": 1.9848435140691627e-05, "loss": 1.2194, "step": 370 }, { "epoch": 0.08494562106468231, "grad_norm": 1.2073237895965576, "learning_rate": 1.9847145785704457e-05, "loss": 1.2124, "step": 371 }, { "epoch": 0.08517458500286205, "grad_norm": 1.3399255275726318, "learning_rate": 1.9845851011915526e-05, "loss": 1.2134, "step": 372 }, { "epoch": 0.08540354894104178, "grad_norm": 1.1257169246673584, "learning_rate": 1.9844550820037326e-05, "loss": 1.2316, "step": 373 }, { "epoch": 0.08563251287922152, "grad_norm": 1.2916021347045898, "learning_rate": 1.9843245210785348e-05, "loss": 1.1766, "step": 374 }, { "epoch": 0.08586147681740126, "grad_norm": 1.1885099411010742, "learning_rate": 1.9841934184878056e-05, "loss": 1.1737, "step": 375 }, { "epoch": 0.08609044075558099, "grad_norm": 1.5768221616744995, "learning_rate": 1.98406177430369e-05, "loss": 1.1579, "step": 376 }, { "epoch": 0.08631940469376073, "grad_norm": 1.2692902088165283, "learning_rate": 1.98392958859863e-05, "loss": 1.1498, "step": 377 }, { "epoch": 0.08654836863194047, "grad_norm": 1.360911250114441, "learning_rate": 1.9837968614453666e-05, "loss": 1.1672, "step": 378 }, { "epoch": 0.0867773325701202, "grad_norm": 1.3675583600997925, "learning_rate": 1.9836635929169388e-05, "loss": 1.2173, "step": 379 }, { "epoch": 0.08700629650829994, "grad_norm": 1.1969804763793945, "learning_rate": 1.9835297830866827e-05, "loss": 1.1914, "step": 380 }, { "epoch": 0.08723526044647968, "grad_norm": 25.4526424407959, "learning_rate": 1.983395432028233e-05, "loss": 1.2027, "step": 381 }, { "epoch": 0.08746422438465942, "grad_norm": 1.52403724193573, "learning_rate": 1.9832605398155217e-05, "loss": 1.2035, "step": 382 }, { "epoch": 0.08769318832283915, "grad_norm": 4.724262237548828, "learning_rate": 1.9831251065227792e-05, "loss": 1.2138, "step": 383 }, { "epoch": 0.08792215226101889, "grad_norm": 2.621134042739868, "learning_rate": 1.9829891322245326e-05, "loss": 1.1642, "step": 384 }, { "epoch": 0.08815111619919863, "grad_norm": 3.4685866832733154, "learning_rate": 1.9828526169956083e-05, "loss": 1.1791, "step": 385 }, { "epoch": 0.08838008013737836, "grad_norm": 1.3899956941604614, "learning_rate": 1.982715560911129e-05, "loss": 1.1847, "step": 386 }, { "epoch": 0.0886090440755581, "grad_norm": 1.1521110534667969, "learning_rate": 1.9825779640465157e-05, "loss": 1.129, "step": 387 }, { "epoch": 0.08883800801373784, "grad_norm": 1.7473269701004028, "learning_rate": 1.982439826477487e-05, "loss": 1.1534, "step": 388 }, { "epoch": 0.08906697195191757, "grad_norm": 1.4069769382476807, "learning_rate": 1.9823011482800584e-05, "loss": 1.1092, "step": 389 }, { "epoch": 0.08929593589009731, "grad_norm": 1.2800990343093872, "learning_rate": 1.9821619295305432e-05, "loss": 1.1823, "step": 390 }, { "epoch": 0.08952489982827705, "grad_norm": 1.3700530529022217, "learning_rate": 1.9820221703055528e-05, "loss": 1.1921, "step": 391 }, { "epoch": 0.08975386376645678, "grad_norm": 1.9944264888763428, "learning_rate": 1.9818818706819955e-05, "loss": 1.1447, "step": 392 }, { "epoch": 0.08998282770463652, "grad_norm": 2.0315797328948975, "learning_rate": 1.981741030737077e-05, "loss": 1.1634, "step": 393 }, { "epoch": 0.09021179164281626, "grad_norm": 1.4205044507980347, "learning_rate": 1.9815996505483e-05, "loss": 1.1817, "step": 394 }, { "epoch": 0.090440755580996, "grad_norm": 1.4898089170455933, "learning_rate": 1.9814577301934647e-05, "loss": 1.1931, "step": 395 }, { "epoch": 0.09066971951917573, "grad_norm": 1.5351992845535278, "learning_rate": 1.9813152697506696e-05, "loss": 1.2307, "step": 396 }, { "epoch": 0.09089868345735547, "grad_norm": 1.2297886610031128, "learning_rate": 1.9811722692983088e-05, "loss": 1.161, "step": 397 }, { "epoch": 0.09112764739553521, "grad_norm": 1.5338091850280762, "learning_rate": 1.981028728915074e-05, "loss": 1.2151, "step": 398 }, { "epoch": 0.09135661133371493, "grad_norm": 1.4196423292160034, "learning_rate": 1.980884648679955e-05, "loss": 1.2312, "step": 399 }, { "epoch": 0.09158557527189468, "grad_norm": 1.2989675998687744, "learning_rate": 1.980740028672237e-05, "loss": 1.1532, "step": 400 }, { "epoch": 0.09181453921007442, "grad_norm": 1.5549927949905396, "learning_rate": 1.9805948689715043e-05, "loss": 1.1157, "step": 401 }, { "epoch": 0.09204350314825414, "grad_norm": 1.2734893560409546, "learning_rate": 1.9804491696576364e-05, "loss": 1.1663, "step": 402 }, { "epoch": 0.09227246708643388, "grad_norm": 1.376017689704895, "learning_rate": 1.9803029308108105e-05, "loss": 1.1519, "step": 403 }, { "epoch": 0.09250143102461363, "grad_norm": 1.1765146255493164, "learning_rate": 1.9801561525115006e-05, "loss": 1.1376, "step": 404 }, { "epoch": 0.09273039496279337, "grad_norm": 1.1429637670516968, "learning_rate": 1.9800088348404778e-05, "loss": 1.1182, "step": 405 }, { "epoch": 0.0929593589009731, "grad_norm": 1.3974339962005615, "learning_rate": 1.9798609778788094e-05, "loss": 1.141, "step": 406 }, { "epoch": 0.09318832283915283, "grad_norm": 1.5140388011932373, "learning_rate": 1.97971258170786e-05, "loss": 1.2011, "step": 407 }, { "epoch": 0.09341728677733258, "grad_norm": 1.161309838294983, "learning_rate": 1.979563646409291e-05, "loss": 1.1788, "step": 408 }, { "epoch": 0.0936462507155123, "grad_norm": 1.2854355573654175, "learning_rate": 1.97941417206506e-05, "loss": 1.2321, "step": 409 }, { "epoch": 0.09387521465369204, "grad_norm": 1.1982879638671875, "learning_rate": 1.9792641587574212e-05, "loss": 1.1316, "step": 410 }, { "epoch": 0.09410417859187178, "grad_norm": 1.1869289875030518, "learning_rate": 1.979113606568926e-05, "loss": 1.1207, "step": 411 }, { "epoch": 0.09433314253005151, "grad_norm": 1.159303069114685, "learning_rate": 1.9789625155824226e-05, "loss": 1.0941, "step": 412 }, { "epoch": 0.09456210646823125, "grad_norm": 1.4857593774795532, "learning_rate": 1.978810885881054e-05, "loss": 1.1209, "step": 413 }, { "epoch": 0.094791070406411, "grad_norm": 1.3553754091262817, "learning_rate": 1.9786587175482613e-05, "loss": 1.154, "step": 414 }, { "epoch": 0.09502003434459072, "grad_norm": 1.219914197921753, "learning_rate": 1.9785060106677818e-05, "loss": 1.1627, "step": 415 }, { "epoch": 0.09524899828277046, "grad_norm": 2.246405839920044, "learning_rate": 1.978352765323648e-05, "loss": 1.2182, "step": 416 }, { "epoch": 0.0954779622209502, "grad_norm": 1.3677849769592285, "learning_rate": 1.97819898160019e-05, "loss": 1.1858, "step": 417 }, { "epoch": 0.09570692615912994, "grad_norm": 1.52046537399292, "learning_rate": 1.9780446595820336e-05, "loss": 1.2089, "step": 418 }, { "epoch": 0.09593589009730967, "grad_norm": 1.4131478071212769, "learning_rate": 1.9778897993541014e-05, "loss": 1.1796, "step": 419 }, { "epoch": 0.09616485403548941, "grad_norm": 1.3519349098205566, "learning_rate": 1.977734401001611e-05, "loss": 1.148, "step": 420 }, { "epoch": 0.09639381797366915, "grad_norm": 1.985721230506897, "learning_rate": 1.9775784646100768e-05, "loss": 1.1838, "step": 421 }, { "epoch": 0.09662278191184888, "grad_norm": 1.2649965286254883, "learning_rate": 1.97742199026531e-05, "loss": 1.1589, "step": 422 }, { "epoch": 0.09685174585002862, "grad_norm": 1.4421800374984741, "learning_rate": 1.977264978053416e-05, "loss": 1.0934, "step": 423 }, { "epoch": 0.09708070978820836, "grad_norm": 1.287441611289978, "learning_rate": 1.977107428060799e-05, "loss": 1.1881, "step": 424 }, { "epoch": 0.09730967372638809, "grad_norm": 1.2831951379776, "learning_rate": 1.9769493403741556e-05, "loss": 1.1283, "step": 425 }, { "epoch": 0.09753863766456783, "grad_norm": 1.2015918493270874, "learning_rate": 1.976790715080481e-05, "loss": 1.1961, "step": 426 }, { "epoch": 0.09776760160274757, "grad_norm": 1.4825067520141602, "learning_rate": 1.9766315522670654e-05, "loss": 1.1451, "step": 427 }, { "epoch": 0.09799656554092731, "grad_norm": 1.2135714292526245, "learning_rate": 1.976471852021495e-05, "loss": 1.0926, "step": 428 }, { "epoch": 0.09822552947910704, "grad_norm": 1.2095484733581543, "learning_rate": 1.9763116144316506e-05, "loss": 1.1423, "step": 429 }, { "epoch": 0.09845449341728678, "grad_norm": 1.321748971939087, "learning_rate": 1.9761508395857106e-05, "loss": 1.1583, "step": 430 }, { "epoch": 0.09868345735546652, "grad_norm": 1.3187049627304077, "learning_rate": 1.9759895275721476e-05, "loss": 1.1745, "step": 431 }, { "epoch": 0.09891242129364625, "grad_norm": 1.3241872787475586, "learning_rate": 1.97582767847973e-05, "loss": 1.1689, "step": 432 }, { "epoch": 0.09914138523182599, "grad_norm": 1.2542831897735596, "learning_rate": 1.9756652923975227e-05, "loss": 1.2044, "step": 433 }, { "epoch": 0.09937034917000573, "grad_norm": 1.2562075853347778, "learning_rate": 1.9755023694148846e-05, "loss": 1.1629, "step": 434 }, { "epoch": 0.09959931310818546, "grad_norm": 1.2366596460342407, "learning_rate": 1.9753389096214716e-05, "loss": 1.0803, "step": 435 }, { "epoch": 0.0998282770463652, "grad_norm": 1.4391214847564697, "learning_rate": 1.9751749131072335e-05, "loss": 1.1797, "step": 436 }, { "epoch": 0.10005724098454494, "grad_norm": 4.435413360595703, "learning_rate": 1.9750103799624165e-05, "loss": 1.1591, "step": 437 }, { "epoch": 0.10028620492272466, "grad_norm": 1.376836895942688, "learning_rate": 1.974845310277562e-05, "loss": 1.2096, "step": 438 }, { "epoch": 0.1005151688609044, "grad_norm": 1.4392584562301636, "learning_rate": 1.974679704143507e-05, "loss": 1.0871, "step": 439 }, { "epoch": 0.10074413279908415, "grad_norm": 1.1275047063827515, "learning_rate": 1.974513561651382e-05, "loss": 1.1411, "step": 440 }, { "epoch": 0.10097309673726389, "grad_norm": 1.4672117233276367, "learning_rate": 1.974346882892614e-05, "loss": 1.138, "step": 441 }, { "epoch": 0.10120206067544361, "grad_norm": 1.2267554998397827, "learning_rate": 1.974179667958926e-05, "loss": 1.1537, "step": 442 }, { "epoch": 0.10143102461362336, "grad_norm": 1.3913313150405884, "learning_rate": 1.9740119169423337e-05, "loss": 1.1623, "step": 443 }, { "epoch": 0.1016599885518031, "grad_norm": 1.6974395513534546, "learning_rate": 1.97384362993515e-05, "loss": 1.208, "step": 444 }, { "epoch": 0.10188895248998282, "grad_norm": 1.3473076820373535, "learning_rate": 1.9736748070299813e-05, "loss": 1.1479, "step": 445 }, { "epoch": 0.10211791642816256, "grad_norm": 1.2066441774368286, "learning_rate": 1.9735054483197295e-05, "loss": 1.2317, "step": 446 }, { "epoch": 0.1023468803663423, "grad_norm": 1.5624324083328247, "learning_rate": 1.973335553897591e-05, "loss": 1.1282, "step": 447 }, { "epoch": 0.10257584430452203, "grad_norm": 1.9326655864715576, "learning_rate": 1.9731651238570582e-05, "loss": 1.1848, "step": 448 }, { "epoch": 0.10280480824270177, "grad_norm": 2.2776360511779785, "learning_rate": 1.9729941582919168e-05, "loss": 1.1599, "step": 449 }, { "epoch": 0.10303377218088151, "grad_norm": 1.2282005548477173, "learning_rate": 1.9728226572962474e-05, "loss": 1.1843, "step": 450 }, { "epoch": 0.10326273611906125, "grad_norm": 1.4309104681015015, "learning_rate": 1.972650620964426e-05, "loss": 1.1689, "step": 451 }, { "epoch": 0.10349170005724098, "grad_norm": 1.427143931388855, "learning_rate": 1.9724780493911227e-05, "loss": 1.1471, "step": 452 }, { "epoch": 0.10372066399542072, "grad_norm": 1.2180664539337158, "learning_rate": 1.9723049426713018e-05, "loss": 1.1932, "step": 453 }, { "epoch": 0.10394962793360046, "grad_norm": 1.2000045776367188, "learning_rate": 1.9721313009002228e-05, "loss": 1.1751, "step": 454 }, { "epoch": 0.10417859187178019, "grad_norm": 1.3412353992462158, "learning_rate": 1.9719571241734395e-05, "loss": 1.147, "step": 455 }, { "epoch": 0.10440755580995993, "grad_norm": 1.2962464094161987, "learning_rate": 1.9717824125867993e-05, "loss": 1.2452, "step": 456 }, { "epoch": 0.10463651974813967, "grad_norm": 1.10789155960083, "learning_rate": 1.9716071662364454e-05, "loss": 1.2014, "step": 457 }, { "epoch": 0.1048654836863194, "grad_norm": 1.4773873090744019, "learning_rate": 1.9714313852188133e-05, "loss": 1.2092, "step": 458 }, { "epoch": 0.10509444762449914, "grad_norm": 1.4391075372695923, "learning_rate": 1.9712550696306346e-05, "loss": 1.1431, "step": 459 }, { "epoch": 0.10532341156267888, "grad_norm": 1.7549716234207153, "learning_rate": 1.9710782195689343e-05, "loss": 1.1395, "step": 460 }, { "epoch": 0.10555237550085861, "grad_norm": 1.5657472610473633, "learning_rate": 1.970900835131031e-05, "loss": 1.132, "step": 461 }, { "epoch": 0.10578133943903835, "grad_norm": 1.4908957481384277, "learning_rate": 1.9707229164145386e-05, "loss": 1.1533, "step": 462 }, { "epoch": 0.10601030337721809, "grad_norm": 1.2966023683547974, "learning_rate": 1.9705444635173635e-05, "loss": 1.1855, "step": 463 }, { "epoch": 0.10623926731539783, "grad_norm": 1.271470546722412, "learning_rate": 1.970365476537707e-05, "loss": 1.2443, "step": 464 }, { "epoch": 0.10646823125357756, "grad_norm": 1.3924498558044434, "learning_rate": 1.9701859555740647e-05, "loss": 1.1831, "step": 465 }, { "epoch": 0.1066971951917573, "grad_norm": 1.053683876991272, "learning_rate": 1.9700059007252248e-05, "loss": 1.2135, "step": 466 }, { "epoch": 0.10692615912993704, "grad_norm": 1.3220330476760864, "learning_rate": 1.9698253120902703e-05, "loss": 1.1772, "step": 467 }, { "epoch": 0.10715512306811677, "grad_norm": 1.528494119644165, "learning_rate": 1.9696441897685777e-05, "loss": 1.2416, "step": 468 }, { "epoch": 0.10738408700629651, "grad_norm": 1.3539191484451294, "learning_rate": 1.969462533859817e-05, "loss": 1.1354, "step": 469 }, { "epoch": 0.10761305094447625, "grad_norm": 1.5194259881973267, "learning_rate": 1.9692803444639517e-05, "loss": 1.208, "step": 470 }, { "epoch": 0.10784201488265598, "grad_norm": 1.2938241958618164, "learning_rate": 1.9690976216812397e-05, "loss": 1.0596, "step": 471 }, { "epoch": 0.10807097882083572, "grad_norm": 1.2351398468017578, "learning_rate": 1.968914365612231e-05, "loss": 1.2107, "step": 472 }, { "epoch": 0.10829994275901546, "grad_norm": 1.381752371788025, "learning_rate": 1.9687305763577705e-05, "loss": 1.1274, "step": 473 }, { "epoch": 0.10852890669719518, "grad_norm": 1.4342790842056274, "learning_rate": 1.9685462540189955e-05, "loss": 1.1483, "step": 474 }, { "epoch": 0.10875787063537493, "grad_norm": 1.2768388986587524, "learning_rate": 1.9683613986973373e-05, "loss": 1.2039, "step": 475 }, { "epoch": 0.10898683457355467, "grad_norm": 1.2173471450805664, "learning_rate": 1.9681760104945203e-05, "loss": 1.1969, "step": 476 }, { "epoch": 0.10921579851173441, "grad_norm": 1.2570933103561401, "learning_rate": 1.967990089512562e-05, "loss": 1.1701, "step": 477 }, { "epoch": 0.10944476244991413, "grad_norm": 1.45657479763031, "learning_rate": 1.9678036358537726e-05, "loss": 1.1106, "step": 478 }, { "epoch": 0.10967372638809388, "grad_norm": 1.4095118045806885, "learning_rate": 1.9676166496207567e-05, "loss": 1.1978, "step": 479 }, { "epoch": 0.10990269032627362, "grad_norm": 1.445336937904358, "learning_rate": 1.967429130916411e-05, "loss": 1.1609, "step": 480 }, { "epoch": 0.11013165426445334, "grad_norm": 1.2025765180587769, "learning_rate": 1.9672410798439256e-05, "loss": 1.1806, "step": 481 }, { "epoch": 0.11036061820263308, "grad_norm": 1.2341455221176147, "learning_rate": 1.9670524965067832e-05, "loss": 1.1765, "step": 482 }, { "epoch": 0.11058958214081283, "grad_norm": 1.1721457242965698, "learning_rate": 1.96686338100876e-05, "loss": 1.1521, "step": 483 }, { "epoch": 0.11081854607899255, "grad_norm": 1.2636065483093262, "learning_rate": 1.9666737334539237e-05, "loss": 1.1089, "step": 484 }, { "epoch": 0.1110475100171723, "grad_norm": 1.7780942916870117, "learning_rate": 1.966483553946637e-05, "loss": 1.1039, "step": 485 }, { "epoch": 0.11127647395535203, "grad_norm": 1.289139747619629, "learning_rate": 1.9662928425915536e-05, "loss": 1.2103, "step": 486 }, { "epoch": 0.11150543789353178, "grad_norm": 1.5748257637023926, "learning_rate": 1.9661015994936204e-05, "loss": 1.1882, "step": 487 }, { "epoch": 0.1117344018317115, "grad_norm": 1.3127802610397339, "learning_rate": 1.9659098247580765e-05, "loss": 1.0901, "step": 488 }, { "epoch": 0.11196336576989124, "grad_norm": 1.2186063528060913, "learning_rate": 1.9657175184904545e-05, "loss": 1.1095, "step": 489 }, { "epoch": 0.11219232970807098, "grad_norm": 1.644958734512329, "learning_rate": 1.9655246807965786e-05, "loss": 1.1162, "step": 490 }, { "epoch": 0.11242129364625071, "grad_norm": 1.3639802932739258, "learning_rate": 1.9653313117825657e-05, "loss": 1.1439, "step": 491 }, { "epoch": 0.11265025758443045, "grad_norm": 1.1354119777679443, "learning_rate": 1.9651374115548255e-05, "loss": 1.1214, "step": 492 }, { "epoch": 0.11287922152261019, "grad_norm": 1.509704351425171, "learning_rate": 1.964942980220059e-05, "loss": 1.154, "step": 493 }, { "epoch": 0.11310818546078992, "grad_norm": 1.605154037475586, "learning_rate": 1.9647480178852606e-05, "loss": 1.2066, "step": 494 }, { "epoch": 0.11333714939896966, "grad_norm": 1.1039700508117676, "learning_rate": 1.9645525246577168e-05, "loss": 1.2157, "step": 495 }, { "epoch": 0.1135661133371494, "grad_norm": 1.5542914867401123, "learning_rate": 1.9643565006450055e-05, "loss": 1.1465, "step": 496 }, { "epoch": 0.11379507727532913, "grad_norm": 1.450024127960205, "learning_rate": 1.9641599459549966e-05, "loss": 1.1535, "step": 497 }, { "epoch": 0.11402404121350887, "grad_norm": 1.6595662832260132, "learning_rate": 1.9639628606958535e-05, "loss": 1.1892, "step": 498 }, { "epoch": 0.11425300515168861, "grad_norm": 1.6177374124526978, "learning_rate": 1.9637652449760297e-05, "loss": 1.114, "step": 499 }, { "epoch": 0.11448196908986835, "grad_norm": 1.6932591199874878, "learning_rate": 1.963567098904272e-05, "loss": 1.1176, "step": 500 }, { "epoch": 0.11471093302804808, "grad_norm": 1.1896140575408936, "learning_rate": 1.963368422589618e-05, "loss": 1.1573, "step": 501 }, { "epoch": 0.11493989696622782, "grad_norm": 1.246862769126892, "learning_rate": 1.9631692161413985e-05, "loss": 1.2009, "step": 502 }, { "epoch": 0.11516886090440756, "grad_norm": 1.422579288482666, "learning_rate": 1.962969479669234e-05, "loss": 1.1447, "step": 503 }, { "epoch": 0.11539782484258729, "grad_norm": 1.2170311212539673, "learning_rate": 1.962769213283039e-05, "loss": 1.1696, "step": 504 }, { "epoch": 0.11562678878076703, "grad_norm": 1.365478754043579, "learning_rate": 1.9625684170930172e-05, "loss": 1.1165, "step": 505 }, { "epoch": 0.11585575271894677, "grad_norm": 1.4074455499649048, "learning_rate": 1.9623670912096656e-05, "loss": 1.1835, "step": 506 }, { "epoch": 0.1160847166571265, "grad_norm": 1.3482747077941895, "learning_rate": 1.9621652357437723e-05, "loss": 1.1344, "step": 507 }, { "epoch": 0.11631368059530624, "grad_norm": 1.7042218446731567, "learning_rate": 1.961962850806417e-05, "loss": 1.1432, "step": 508 }, { "epoch": 0.11654264453348598, "grad_norm": 1.3767738342285156, "learning_rate": 1.9617599365089693e-05, "loss": 1.1334, "step": 509 }, { "epoch": 0.11677160847166572, "grad_norm": 1.317917823791504, "learning_rate": 1.9615564929630925e-05, "loss": 1.1883, "step": 510 }, { "epoch": 0.11700057240984545, "grad_norm": 1.2787598371505737, "learning_rate": 1.9613525202807392e-05, "loss": 1.1198, "step": 511 }, { "epoch": 0.11722953634802519, "grad_norm": 1.6827375888824463, "learning_rate": 1.961148018574154e-05, "loss": 1.1579, "step": 512 }, { "epoch": 0.11745850028620493, "grad_norm": 1.2459272146224976, "learning_rate": 1.9609429879558726e-05, "loss": 1.1789, "step": 513 }, { "epoch": 0.11768746422438466, "grad_norm": 1.3571677207946777, "learning_rate": 1.960737428538721e-05, "loss": 1.139, "step": 514 }, { "epoch": 0.1179164281625644, "grad_norm": 1.3986880779266357, "learning_rate": 1.9605313404358176e-05, "loss": 1.1719, "step": 515 }, { "epoch": 0.11814539210074414, "grad_norm": 1.2968988418579102, "learning_rate": 1.9603247237605706e-05, "loss": 1.1753, "step": 516 }, { "epoch": 0.11837435603892386, "grad_norm": 1.434787631034851, "learning_rate": 1.9601175786266796e-05, "loss": 1.1497, "step": 517 }, { "epoch": 0.1186033199771036, "grad_norm": 1.595548152923584, "learning_rate": 1.9599099051481345e-05, "loss": 1.111, "step": 518 }, { "epoch": 0.11883228391528335, "grad_norm": 1.3125005960464478, "learning_rate": 1.959701703439217e-05, "loss": 1.1453, "step": 519 }, { "epoch": 0.11906124785346307, "grad_norm": 1.6637828350067139, "learning_rate": 1.9594929736144978e-05, "loss": 1.1657, "step": 520 }, { "epoch": 0.11929021179164281, "grad_norm": 1.1127067804336548, "learning_rate": 1.9592837157888396e-05, "loss": 1.2167, "step": 521 }, { "epoch": 0.11951917572982255, "grad_norm": 1.3453141450881958, "learning_rate": 1.959073930077395e-05, "loss": 1.1839, "step": 522 }, { "epoch": 0.1197481396680023, "grad_norm": 1.2498703002929688, "learning_rate": 1.958863616595608e-05, "loss": 1.1712, "step": 523 }, { "epoch": 0.11997710360618202, "grad_norm": 1.6796958446502686, "learning_rate": 1.958652775459212e-05, "loss": 1.1745, "step": 524 }, { "epoch": 0.12020606754436176, "grad_norm": 1.1889232397079468, "learning_rate": 1.9584414067842304e-05, "loss": 1.1619, "step": 525 }, { "epoch": 0.1204350314825415, "grad_norm": 1.389387607574463, "learning_rate": 1.9582295106869788e-05, "loss": 1.1106, "step": 526 }, { "epoch": 0.12066399542072123, "grad_norm": 1.8008201122283936, "learning_rate": 1.958017087284061e-05, "loss": 1.1603, "step": 527 }, { "epoch": 0.12089295935890097, "grad_norm": 1.4966647624969482, "learning_rate": 1.9578041366923722e-05, "loss": 1.1037, "step": 528 }, { "epoch": 0.12112192329708071, "grad_norm": 1.1903491020202637, "learning_rate": 1.957590659029097e-05, "loss": 1.1838, "step": 529 }, { "epoch": 0.12135088723526044, "grad_norm": 1.38507080078125, "learning_rate": 1.957376654411711e-05, "loss": 1.1818, "step": 530 }, { "epoch": 0.12157985117344018, "grad_norm": 1.4420480728149414, "learning_rate": 1.957162122957978e-05, "loss": 1.1738, "step": 531 }, { "epoch": 0.12180881511161992, "grad_norm": 1.5511441230773926, "learning_rate": 1.9569470647859544e-05, "loss": 1.183, "step": 532 }, { "epoch": 0.12203777904979966, "grad_norm": 1.3438152074813843, "learning_rate": 1.9567314800139838e-05, "loss": 1.1897, "step": 533 }, { "epoch": 0.12226674298797939, "grad_norm": 1.281836748123169, "learning_rate": 1.9565153687607006e-05, "loss": 1.1887, "step": 534 }, { "epoch": 0.12249570692615913, "grad_norm": 1.3364907503128052, "learning_rate": 1.9562987311450298e-05, "loss": 1.1386, "step": 535 }, { "epoch": 0.12272467086433887, "grad_norm": 1.3706281185150146, "learning_rate": 1.956081567286185e-05, "loss": 1.1379, "step": 536 }, { "epoch": 0.1229536348025186, "grad_norm": 1.3901708126068115, "learning_rate": 1.9558638773036694e-05, "loss": 1.124, "step": 537 }, { "epoch": 0.12318259874069834, "grad_norm": 1.1462208032608032, "learning_rate": 1.955645661317276e-05, "loss": 1.1561, "step": 538 }, { "epoch": 0.12341156267887808, "grad_norm": 1.1496376991271973, "learning_rate": 1.9554269194470872e-05, "loss": 1.1725, "step": 539 }, { "epoch": 0.12364052661705781, "grad_norm": 1.095733404159546, "learning_rate": 1.955207651813475e-05, "loss": 1.1946, "step": 540 }, { "epoch": 0.12386949055523755, "grad_norm": 1.4160492420196533, "learning_rate": 1.9549878585371006e-05, "loss": 1.0487, "step": 541 }, { "epoch": 0.12409845449341729, "grad_norm": 1.1869254112243652, "learning_rate": 1.9547675397389144e-05, "loss": 1.1571, "step": 542 }, { "epoch": 0.12432741843159702, "grad_norm": 2.6226820945739746, "learning_rate": 1.9545466955401555e-05, "loss": 1.1698, "step": 543 }, { "epoch": 0.12455638236977676, "grad_norm": 1.3678830862045288, "learning_rate": 1.954325326062353e-05, "loss": 1.1191, "step": 544 }, { "epoch": 0.1247853463079565, "grad_norm": 1.2578891515731812, "learning_rate": 1.954103431427325e-05, "loss": 1.1159, "step": 545 }, { "epoch": 0.12501431024613624, "grad_norm": 1.258697509765625, "learning_rate": 1.9538810117571775e-05, "loss": 1.1422, "step": 546 }, { "epoch": 0.12524327418431597, "grad_norm": 1.3794115781784058, "learning_rate": 1.9536580671743067e-05, "loss": 1.1376, "step": 547 }, { "epoch": 0.1254722381224957, "grad_norm": 1.2176522016525269, "learning_rate": 1.9534345978013972e-05, "loss": 1.1487, "step": 548 }, { "epoch": 0.12570120206067545, "grad_norm": 1.7151718139648438, "learning_rate": 1.9532106037614218e-05, "loss": 1.1334, "step": 549 }, { "epoch": 0.12593016599885518, "grad_norm": 1.7247742414474487, "learning_rate": 1.952986085177643e-05, "loss": 1.1493, "step": 550 }, { "epoch": 0.12615912993703493, "grad_norm": 1.252971887588501, "learning_rate": 1.9527610421736115e-05, "loss": 1.1741, "step": 551 }, { "epoch": 0.12638809387521466, "grad_norm": 1.716004729270935, "learning_rate": 1.9525354748731665e-05, "loss": 1.1638, "step": 552 }, { "epoch": 0.12661705781339438, "grad_norm": 1.1862410306930542, "learning_rate": 1.9523093834004358e-05, "loss": 1.1476, "step": 553 }, { "epoch": 0.12684602175157414, "grad_norm": 1.4751574993133545, "learning_rate": 1.952082767879835e-05, "loss": 1.1203, "step": 554 }, { "epoch": 0.12707498568975387, "grad_norm": 1.6074382066726685, "learning_rate": 1.9518556284360696e-05, "loss": 1.1421, "step": 555 }, { "epoch": 0.1273039496279336, "grad_norm": 1.5327030420303345, "learning_rate": 1.9516279651941324e-05, "loss": 1.1892, "step": 556 }, { "epoch": 0.12753291356611335, "grad_norm": 1.531124472618103, "learning_rate": 1.9513997782793045e-05, "loss": 1.1348, "step": 557 }, { "epoch": 0.12776187750429308, "grad_norm": 1.5858008861541748, "learning_rate": 1.9511710678171548e-05, "loss": 1.19, "step": 558 }, { "epoch": 0.1279908414424728, "grad_norm": 1.3005167245864868, "learning_rate": 1.9509418339335415e-05, "loss": 1.1556, "step": 559 }, { "epoch": 0.12821980538065256, "grad_norm": 1.6737267971038818, "learning_rate": 1.950712076754609e-05, "loss": 1.2304, "step": 560 }, { "epoch": 0.12844876931883228, "grad_norm": 1.2671421766281128, "learning_rate": 1.950481796406792e-05, "loss": 1.1869, "step": 561 }, { "epoch": 0.128677733257012, "grad_norm": 1.6968939304351807, "learning_rate": 1.9502509930168113e-05, "loss": 1.1252, "step": 562 }, { "epoch": 0.12890669719519177, "grad_norm": 1.406265377998352, "learning_rate": 1.950019666711676e-05, "loss": 1.1424, "step": 563 }, { "epoch": 0.1291356611333715, "grad_norm": 1.4663190841674805, "learning_rate": 1.949787817618683e-05, "loss": 1.1995, "step": 564 }, { "epoch": 0.12936462507155122, "grad_norm": 1.5765471458435059, "learning_rate": 1.949555445865417e-05, "loss": 1.1302, "step": 565 }, { "epoch": 0.12959358900973098, "grad_norm": 1.7354494333267212, "learning_rate": 1.94932255157975e-05, "loss": 1.1729, "step": 566 }, { "epoch": 0.1298225529479107, "grad_norm": 1.379272699356079, "learning_rate": 1.9490891348898423e-05, "loss": 1.1507, "step": 567 }, { "epoch": 0.13005151688609043, "grad_norm": 1.2708396911621094, "learning_rate": 1.9488551959241405e-05, "loss": 1.1516, "step": 568 }, { "epoch": 0.13028048082427018, "grad_norm": 1.5942022800445557, "learning_rate": 1.9486207348113803e-05, "loss": 1.1358, "step": 569 }, { "epoch": 0.1305094447624499, "grad_norm": 1.2558735609054565, "learning_rate": 1.9483857516805823e-05, "loss": 1.154, "step": 570 }, { "epoch": 0.13073840870062964, "grad_norm": 1.1783373355865479, "learning_rate": 1.9481502466610568e-05, "loss": 1.1798, "step": 571 }, { "epoch": 0.1309673726388094, "grad_norm": 1.2193893194198608, "learning_rate": 1.9479142198823996e-05, "loss": 1.1767, "step": 572 }, { "epoch": 0.13119633657698912, "grad_norm": 1.1724305152893066, "learning_rate": 1.9476776714744945e-05, "loss": 1.113, "step": 573 }, { "epoch": 0.13142530051516887, "grad_norm": 1.3040038347244263, "learning_rate": 1.947440601567512e-05, "loss": 1.1216, "step": 574 }, { "epoch": 0.1316542644533486, "grad_norm": 1.3851133584976196, "learning_rate": 1.9472030102919102e-05, "loss": 1.1914, "step": 575 }, { "epoch": 0.13188322839152833, "grad_norm": 1.331370234489441, "learning_rate": 1.946964897778433e-05, "loss": 1.135, "step": 576 }, { "epoch": 0.13211219232970808, "grad_norm": 1.2417669296264648, "learning_rate": 1.946726264158112e-05, "loss": 1.0824, "step": 577 }, { "epoch": 0.1323411562678878, "grad_norm": 1.1429842710494995, "learning_rate": 1.9464871095622652e-05, "loss": 1.1327, "step": 578 }, { "epoch": 0.13257012020606754, "grad_norm": 1.2447996139526367, "learning_rate": 1.9462474341224974e-05, "loss": 1.1308, "step": 579 }, { "epoch": 0.1327990841442473, "grad_norm": 1.3931182622909546, "learning_rate": 1.9460072379706995e-05, "loss": 1.1119, "step": 580 }, { "epoch": 0.13302804808242702, "grad_norm": 1.3399399518966675, "learning_rate": 1.9457665212390502e-05, "loss": 1.1726, "step": 581 }, { "epoch": 0.13325701202060675, "grad_norm": 1.1147905588150024, "learning_rate": 1.9455252840600137e-05, "loss": 1.0929, "step": 582 }, { "epoch": 0.1334859759587865, "grad_norm": 1.4041277170181274, "learning_rate": 1.9452835265663404e-05, "loss": 1.1418, "step": 583 }, { "epoch": 0.13371493989696623, "grad_norm": 1.5064622163772583, "learning_rate": 1.945041248891068e-05, "loss": 1.1835, "step": 584 }, { "epoch": 0.13394390383514596, "grad_norm": 1.390039324760437, "learning_rate": 1.9447984511675192e-05, "loss": 1.089, "step": 585 }, { "epoch": 0.1341728677733257, "grad_norm": 2.3285298347473145, "learning_rate": 1.944555133529304e-05, "loss": 1.1771, "step": 586 }, { "epoch": 0.13440183171150544, "grad_norm": 1.5625582933425903, "learning_rate": 1.9443112961103182e-05, "loss": 1.1622, "step": 587 }, { "epoch": 0.13463079564968516, "grad_norm": 1.6119787693023682, "learning_rate": 1.9440669390447433e-05, "loss": 1.1831, "step": 588 }, { "epoch": 0.13485975958786492, "grad_norm": 1.1473219394683838, "learning_rate": 1.943822062467047e-05, "loss": 1.0934, "step": 589 }, { "epoch": 0.13508872352604465, "grad_norm": 1.3341083526611328, "learning_rate": 1.9435766665119823e-05, "loss": 1.149, "step": 590 }, { "epoch": 0.13531768746422437, "grad_norm": 1.3681514263153076, "learning_rate": 1.943330751314589e-05, "loss": 1.2103, "step": 591 }, { "epoch": 0.13554665140240413, "grad_norm": 2.2560174465179443, "learning_rate": 1.9430843170101924e-05, "loss": 1.0948, "step": 592 }, { "epoch": 0.13577561534058386, "grad_norm": 1.165104866027832, "learning_rate": 1.9428373637344027e-05, "loss": 1.1378, "step": 593 }, { "epoch": 0.13600457927876358, "grad_norm": 1.2633001804351807, "learning_rate": 1.9425898916231166e-05, "loss": 1.0987, "step": 594 }, { "epoch": 0.13623354321694334, "grad_norm": 1.2817325592041016, "learning_rate": 1.942341900812516e-05, "loss": 1.1535, "step": 595 }, { "epoch": 0.13646250715512306, "grad_norm": 1.2196224927902222, "learning_rate": 1.9420933914390677e-05, "loss": 1.1257, "step": 596 }, { "epoch": 0.1366914710933028, "grad_norm": 1.3916923999786377, "learning_rate": 1.941844363639525e-05, "loss": 1.1056, "step": 597 }, { "epoch": 0.13692043503148255, "grad_norm": 1.571192741394043, "learning_rate": 1.941594817550925e-05, "loss": 1.1894, "step": 598 }, { "epoch": 0.13714939896966227, "grad_norm": 1.3344855308532715, "learning_rate": 1.9413447533105913e-05, "loss": 1.1814, "step": 599 }, { "epoch": 0.13737836290784203, "grad_norm": 2.034505605697632, "learning_rate": 1.9410941710561323e-05, "loss": 1.2126, "step": 600 }, { "epoch": 0.13760732684602175, "grad_norm": 1.2162837982177734, "learning_rate": 1.940843070925441e-05, "loss": 1.1775, "step": 601 }, { "epoch": 0.13783629078420148, "grad_norm": 1.2769267559051514, "learning_rate": 1.940591453056696e-05, "loss": 1.1882, "step": 602 }, { "epoch": 0.13806525472238124, "grad_norm": 1.438686490058899, "learning_rate": 1.9403393175883602e-05, "loss": 1.1577, "step": 603 }, { "epoch": 0.13829421866056096, "grad_norm": 1.3430330753326416, "learning_rate": 1.9400866646591816e-05, "loss": 1.1549, "step": 604 }, { "epoch": 0.1385231825987407, "grad_norm": 1.135581374168396, "learning_rate": 1.939833494408193e-05, "loss": 1.1509, "step": 605 }, { "epoch": 0.13875214653692045, "grad_norm": 1.433982014656067, "learning_rate": 1.939579806974712e-05, "loss": 1.1768, "step": 606 }, { "epoch": 0.13898111047510017, "grad_norm": 1.200698971748352, "learning_rate": 1.939325602498341e-05, "loss": 1.1196, "step": 607 }, { "epoch": 0.1392100744132799, "grad_norm": 1.2908475399017334, "learning_rate": 1.939070881118966e-05, "loss": 1.137, "step": 608 }, { "epoch": 0.13943903835145965, "grad_norm": 1.3260984420776367, "learning_rate": 1.9388156429767585e-05, "loss": 1.1106, "step": 609 }, { "epoch": 0.13966800228963938, "grad_norm": 1.310341238975525, "learning_rate": 1.9385598882121735e-05, "loss": 1.1857, "step": 610 }, { "epoch": 0.1398969662278191, "grad_norm": 1.109005331993103, "learning_rate": 1.9383036169659513e-05, "loss": 1.1895, "step": 611 }, { "epoch": 0.14012593016599886, "grad_norm": 1.230418086051941, "learning_rate": 1.938046829379115e-05, "loss": 1.1439, "step": 612 }, { "epoch": 0.1403548941041786, "grad_norm": 1.380399227142334, "learning_rate": 1.9377895255929734e-05, "loss": 1.1821, "step": 613 }, { "epoch": 0.14058385804235832, "grad_norm": 1.2389920949935913, "learning_rate": 1.937531705749118e-05, "loss": 1.1463, "step": 614 }, { "epoch": 0.14081282198053807, "grad_norm": 1.1333352327346802, "learning_rate": 1.9372733699894253e-05, "loss": 1.1391, "step": 615 }, { "epoch": 0.1410417859187178, "grad_norm": 1.2721819877624512, "learning_rate": 1.937014518456055e-05, "loss": 1.1266, "step": 616 }, { "epoch": 0.14127074985689753, "grad_norm": 1.2748703956604004, "learning_rate": 1.9367551512914513e-05, "loss": 1.1351, "step": 617 }, { "epoch": 0.14149971379507728, "grad_norm": 1.4715847969055176, "learning_rate": 1.936495268638342e-05, "loss": 1.1432, "step": 618 }, { "epoch": 0.141728677733257, "grad_norm": 1.2434061765670776, "learning_rate": 1.9362348706397374e-05, "loss": 1.1471, "step": 619 }, { "epoch": 0.14195764167143674, "grad_norm": 1.2716186046600342, "learning_rate": 1.935973957438933e-05, "loss": 1.1463, "step": 620 }, { "epoch": 0.1421866056096165, "grad_norm": 1.4149022102355957, "learning_rate": 1.935712529179507e-05, "loss": 1.1119, "step": 621 }, { "epoch": 0.14241556954779622, "grad_norm": 1.7188087701797485, "learning_rate": 1.9354505860053215e-05, "loss": 1.0757, "step": 622 }, { "epoch": 0.14264453348597597, "grad_norm": 1.2718242406845093, "learning_rate": 1.9351881280605212e-05, "loss": 1.1599, "step": 623 }, { "epoch": 0.1428734974241557, "grad_norm": 1.4921197891235352, "learning_rate": 1.9349251554895347e-05, "loss": 1.1168, "step": 624 }, { "epoch": 0.14310246136233543, "grad_norm": 1.3648992776870728, "learning_rate": 1.934661668437073e-05, "loss": 1.0909, "step": 625 }, { "epoch": 0.14333142530051518, "grad_norm": 1.2685736417770386, "learning_rate": 1.934397667048132e-05, "loss": 1.1921, "step": 626 }, { "epoch": 0.1435603892386949, "grad_norm": 1.4009802341461182, "learning_rate": 1.9341331514679887e-05, "loss": 1.13, "step": 627 }, { "epoch": 0.14378935317687463, "grad_norm": 2.8500163555145264, "learning_rate": 1.933868121842204e-05, "loss": 1.1513, "step": 628 }, { "epoch": 0.1440183171150544, "grad_norm": 1.7703518867492676, "learning_rate": 1.933602578316621e-05, "loss": 1.1431, "step": 629 }, { "epoch": 0.14424728105323412, "grad_norm": 1.2395981550216675, "learning_rate": 1.9333365210373668e-05, "loss": 1.1633, "step": 630 }, { "epoch": 0.14447624499141384, "grad_norm": 1.3300714492797852, "learning_rate": 1.9330699501508504e-05, "loss": 1.1318, "step": 631 }, { "epoch": 0.1447052089295936, "grad_norm": 1.4751075506210327, "learning_rate": 1.932802865803763e-05, "loss": 1.0868, "step": 632 }, { "epoch": 0.14493417286777333, "grad_norm": 1.5677204132080078, "learning_rate": 1.932535268143079e-05, "loss": 1.2013, "step": 633 }, { "epoch": 0.14516313680595305, "grad_norm": 1.3743131160736084, "learning_rate": 1.9322671573160556e-05, "loss": 1.1175, "step": 634 }, { "epoch": 0.1453921007441328, "grad_norm": 1.6558480262756348, "learning_rate": 1.9319985334702315e-05, "loss": 1.1751, "step": 635 }, { "epoch": 0.14562106468231253, "grad_norm": 1.3124221563339233, "learning_rate": 1.9317293967534283e-05, "loss": 1.1122, "step": 636 }, { "epoch": 0.14585002862049226, "grad_norm": 1.2808423042297363, "learning_rate": 1.9314597473137498e-05, "loss": 1.1417, "step": 637 }, { "epoch": 0.14607899255867202, "grad_norm": 1.250510334968567, "learning_rate": 1.9311895852995815e-05, "loss": 1.2279, "step": 638 }, { "epoch": 0.14630795649685174, "grad_norm": 1.366873860359192, "learning_rate": 1.930918910859592e-05, "loss": 1.1324, "step": 639 }, { "epoch": 0.14653692043503147, "grad_norm": 1.168998122215271, "learning_rate": 1.9306477241427303e-05, "loss": 1.0871, "step": 640 }, { "epoch": 0.14676588437321123, "grad_norm": 1.3328088521957397, "learning_rate": 1.9303760252982287e-05, "loss": 1.1846, "step": 641 }, { "epoch": 0.14699484831139095, "grad_norm": 1.1964495182037354, "learning_rate": 1.9301038144756007e-05, "loss": 1.1037, "step": 642 }, { "epoch": 0.14722381224957068, "grad_norm": 1.2240676879882812, "learning_rate": 1.9298310918246414e-05, "loss": 1.1069, "step": 643 }, { "epoch": 0.14745277618775043, "grad_norm": 1.3975204229354858, "learning_rate": 1.929557857495428e-05, "loss": 1.1145, "step": 644 }, { "epoch": 0.14768174012593016, "grad_norm": 1.3508495092391968, "learning_rate": 1.9292841116383192e-05, "loss": 1.1929, "step": 645 }, { "epoch": 0.14791070406410992, "grad_norm": 1.5650644302368164, "learning_rate": 1.9290098544039546e-05, "loss": 1.1214, "step": 646 }, { "epoch": 0.14813966800228964, "grad_norm": 1.208836555480957, "learning_rate": 1.9287350859432562e-05, "loss": 1.11, "step": 647 }, { "epoch": 0.14836863194046937, "grad_norm": 1.2838915586471558, "learning_rate": 1.9284598064074264e-05, "loss": 1.172, "step": 648 }, { "epoch": 0.14859759587864912, "grad_norm": 1.2721766233444214, "learning_rate": 1.9281840159479494e-05, "loss": 1.1024, "step": 649 }, { "epoch": 0.14882655981682885, "grad_norm": 1.6511443853378296, "learning_rate": 1.9279077147165903e-05, "loss": 1.14, "step": 650 }, { "epoch": 0.14905552375500858, "grad_norm": 1.3828959465026855, "learning_rate": 1.9276309028653954e-05, "loss": 1.1819, "step": 651 }, { "epoch": 0.14928448769318833, "grad_norm": 1.3660774230957031, "learning_rate": 1.927353580546692e-05, "loss": 1.1487, "step": 652 }, { "epoch": 0.14951345163136806, "grad_norm": 1.296252965927124, "learning_rate": 1.927075747913088e-05, "loss": 1.1442, "step": 653 }, { "epoch": 0.1497424155695478, "grad_norm": 1.1136078834533691, "learning_rate": 1.9267974051174727e-05, "loss": 1.0958, "step": 654 }, { "epoch": 0.14997137950772754, "grad_norm": 1.5701806545257568, "learning_rate": 1.9265185523130156e-05, "loss": 1.1934, "step": 655 }, { "epoch": 0.15020034344590727, "grad_norm": 1.439436912536621, "learning_rate": 1.9262391896531668e-05, "loss": 1.1336, "step": 656 }, { "epoch": 0.150429307384087, "grad_norm": 1.2246439456939697, "learning_rate": 1.9259593172916577e-05, "loss": 1.1292, "step": 657 }, { "epoch": 0.15065827132226675, "grad_norm": 1.5522711277008057, "learning_rate": 1.9256789353824996e-05, "loss": 1.1826, "step": 658 }, { "epoch": 0.15088723526044648, "grad_norm": 1.6734440326690674, "learning_rate": 1.9253980440799843e-05, "loss": 1.0926, "step": 659 }, { "epoch": 0.1511161991986262, "grad_norm": 1.276139736175537, "learning_rate": 1.9251166435386837e-05, "loss": 1.1941, "step": 660 }, { "epoch": 0.15134516313680596, "grad_norm": 1.4880852699279785, "learning_rate": 1.9248347339134508e-05, "loss": 1.1254, "step": 661 }, { "epoch": 0.1515741270749857, "grad_norm": 1.467468500137329, "learning_rate": 1.9245523153594172e-05, "loss": 1.1781, "step": 662 }, { "epoch": 0.15180309101316541, "grad_norm": 1.3000531196594238, "learning_rate": 1.9242693880319962e-05, "loss": 1.0907, "step": 663 }, { "epoch": 0.15203205495134517, "grad_norm": 1.569050908088684, "learning_rate": 1.92398595208688e-05, "loss": 1.2279, "step": 664 }, { "epoch": 0.1522610188895249, "grad_norm": 1.3132572174072266, "learning_rate": 1.9237020076800407e-05, "loss": 1.0786, "step": 665 }, { "epoch": 0.15248998282770462, "grad_norm": 1.4111486673355103, "learning_rate": 1.9234175549677313e-05, "loss": 1.1556, "step": 666 }, { "epoch": 0.15271894676588438, "grad_norm": 1.2233771085739136, "learning_rate": 1.923132594106483e-05, "loss": 1.138, "step": 667 }, { "epoch": 0.1529479107040641, "grad_norm": 1.5644503831863403, "learning_rate": 1.922847125253108e-05, "loss": 1.1911, "step": 668 }, { "epoch": 0.15317687464224386, "grad_norm": 1.8687934875488281, "learning_rate": 1.922561148564697e-05, "loss": 1.1343, "step": 669 }, { "epoch": 0.1534058385804236, "grad_norm": 1.1981557607650757, "learning_rate": 1.9222746641986207e-05, "loss": 1.1846, "step": 670 }, { "epoch": 0.15363480251860331, "grad_norm": 1.6271294355392456, "learning_rate": 1.921987672312529e-05, "loss": 1.167, "step": 671 }, { "epoch": 0.15386376645678307, "grad_norm": 1.3530622720718384, "learning_rate": 1.9217001730643514e-05, "loss": 1.1491, "step": 672 }, { "epoch": 0.1540927303949628, "grad_norm": 1.456268072128296, "learning_rate": 1.921412166612296e-05, "loss": 1.1595, "step": 673 }, { "epoch": 0.15432169433314252, "grad_norm": 1.2416306734085083, "learning_rate": 1.92112365311485e-05, "loss": 1.1396, "step": 674 }, { "epoch": 0.15455065827132228, "grad_norm": 1.4196218252182007, "learning_rate": 1.920834632730781e-05, "loss": 1.1493, "step": 675 }, { "epoch": 0.154779622209502, "grad_norm": 1.69566810131073, "learning_rate": 1.920545105619134e-05, "loss": 1.0956, "step": 676 }, { "epoch": 0.15500858614768173, "grad_norm": 1.6241673231124878, "learning_rate": 1.9202550719392324e-05, "loss": 1.1241, "step": 677 }, { "epoch": 0.1552375500858615, "grad_norm": 1.2214878797531128, "learning_rate": 1.919964531850681e-05, "loss": 1.1751, "step": 678 }, { "epoch": 0.1554665140240412, "grad_norm": 2.9216532707214355, "learning_rate": 1.9196734855133603e-05, "loss": 1.1578, "step": 679 }, { "epoch": 0.15569547796222094, "grad_norm": 1.8954651355743408, "learning_rate": 1.919381933087431e-05, "loss": 1.1785, "step": 680 }, { "epoch": 0.1559244419004007, "grad_norm": 1.424324870109558, "learning_rate": 1.919089874733332e-05, "loss": 1.0811, "step": 681 }, { "epoch": 0.15615340583858042, "grad_norm": 1.3659218549728394, "learning_rate": 1.9187973106117808e-05, "loss": 1.1433, "step": 682 }, { "epoch": 0.15638236977676015, "grad_norm": 1.2462931871414185, "learning_rate": 1.918504240883773e-05, "loss": 1.1522, "step": 683 }, { "epoch": 0.1566113337149399, "grad_norm": 1.390007495880127, "learning_rate": 1.9182106657105816e-05, "loss": 1.1282, "step": 684 }, { "epoch": 0.15684029765311963, "grad_norm": 1.1688913106918335, "learning_rate": 1.9179165852537596e-05, "loss": 1.1549, "step": 685 }, { "epoch": 0.15706926159129936, "grad_norm": 1.6128971576690674, "learning_rate": 1.917621999675136e-05, "loss": 1.1311, "step": 686 }, { "epoch": 0.1572982255294791, "grad_norm": 1.283479928970337, "learning_rate": 1.9173269091368197e-05, "loss": 1.109, "step": 687 }, { "epoch": 0.15752718946765884, "grad_norm": 1.3000928163528442, "learning_rate": 1.9170313138011964e-05, "loss": 1.1634, "step": 688 }, { "epoch": 0.15775615340583857, "grad_norm": 1.204128384590149, "learning_rate": 1.9167352138309294e-05, "loss": 1.0988, "step": 689 }, { "epoch": 0.15798511734401832, "grad_norm": 1.3165777921676636, "learning_rate": 1.9164386093889598e-05, "loss": 1.1393, "step": 690 }, { "epoch": 0.15821408128219805, "grad_norm": 1.433110237121582, "learning_rate": 1.9161415006385074e-05, "loss": 1.1621, "step": 691 }, { "epoch": 0.1584430452203778, "grad_norm": 1.2257025241851807, "learning_rate": 1.915843887743068e-05, "loss": 1.1692, "step": 692 }, { "epoch": 0.15867200915855753, "grad_norm": 1.773209571838379, "learning_rate": 1.915545770866415e-05, "loss": 1.1176, "step": 693 }, { "epoch": 0.15890097309673726, "grad_norm": 1.3696967363357544, "learning_rate": 1.9152471501726008e-05, "loss": 1.1944, "step": 694 }, { "epoch": 0.159129937034917, "grad_norm": 1.1072070598602295, "learning_rate": 1.9149480258259535e-05, "loss": 1.1889, "step": 695 }, { "epoch": 0.15935890097309674, "grad_norm": 1.5009992122650146, "learning_rate": 1.914648397991078e-05, "loss": 1.125, "step": 696 }, { "epoch": 0.15958786491127647, "grad_norm": 1.4128434658050537, "learning_rate": 1.9143482668328577e-05, "loss": 1.1271, "step": 697 }, { "epoch": 0.15981682884945622, "grad_norm": 1.0920000076293945, "learning_rate": 1.9140476325164522e-05, "loss": 1.1275, "step": 698 }, { "epoch": 0.16004579278763595, "grad_norm": 1.100367546081543, "learning_rate": 1.9137464952072977e-05, "loss": 1.1063, "step": 699 }, { "epoch": 0.16027475672581568, "grad_norm": 1.1449311971664429, "learning_rate": 1.9134448550711077e-05, "loss": 1.1795, "step": 700 }, { "epoch": 0.16050372066399543, "grad_norm": 1.3512089252471924, "learning_rate": 1.9131427122738725e-05, "loss": 1.0984, "step": 701 }, { "epoch": 0.16073268460217516, "grad_norm": 1.5637792348861694, "learning_rate": 1.9128400669818586e-05, "loss": 1.106, "step": 702 }, { "epoch": 0.16096164854035488, "grad_norm": 1.6217029094696045, "learning_rate": 1.9125369193616085e-05, "loss": 1.1792, "step": 703 }, { "epoch": 0.16119061247853464, "grad_norm": 1.6088597774505615, "learning_rate": 1.9122332695799432e-05, "loss": 1.1579, "step": 704 }, { "epoch": 0.16141957641671437, "grad_norm": 1.4255467653274536, "learning_rate": 1.9119291178039573e-05, "loss": 1.1689, "step": 705 }, { "epoch": 0.1616485403548941, "grad_norm": 2.3765337467193604, "learning_rate": 1.9116244642010234e-05, "loss": 1.2181, "step": 706 }, { "epoch": 0.16187750429307385, "grad_norm": 1.2266517877578735, "learning_rate": 1.91131930893879e-05, "loss": 1.1672, "step": 707 }, { "epoch": 0.16210646823125358, "grad_norm": 1.379189372062683, "learning_rate": 1.9110136521851815e-05, "loss": 1.1442, "step": 708 }, { "epoch": 0.1623354321694333, "grad_norm": 1.462998628616333, "learning_rate": 1.9107074941083987e-05, "loss": 1.1641, "step": 709 }, { "epoch": 0.16256439610761306, "grad_norm": 1.2941198348999023, "learning_rate": 1.9104008348769164e-05, "loss": 1.1731, "step": 710 }, { "epoch": 0.16279336004579278, "grad_norm": 1.5538750886917114, "learning_rate": 1.9100936746594878e-05, "loss": 1.0923, "step": 711 }, { "epoch": 0.1630223239839725, "grad_norm": 1.0926445722579956, "learning_rate": 1.9097860136251402e-05, "loss": 1.2141, "step": 712 }, { "epoch": 0.16325128792215227, "grad_norm": 1.3583141565322876, "learning_rate": 1.9094778519431773e-05, "loss": 1.1745, "step": 713 }, { "epoch": 0.163480251860332, "grad_norm": 1.200218915939331, "learning_rate": 1.9091691897831774e-05, "loss": 1.1723, "step": 714 }, { "epoch": 0.16370921579851175, "grad_norm": 1.428288221359253, "learning_rate": 1.9088600273149947e-05, "loss": 1.1924, "step": 715 }, { "epoch": 0.16393817973669148, "grad_norm": 1.488163709640503, "learning_rate": 1.9085503647087588e-05, "loss": 1.1357, "step": 716 }, { "epoch": 0.1641671436748712, "grad_norm": 1.282083511352539, "learning_rate": 1.9082402021348745e-05, "loss": 1.1102, "step": 717 }, { "epoch": 0.16439610761305096, "grad_norm": 1.33743155002594, "learning_rate": 1.9079295397640215e-05, "loss": 1.2115, "step": 718 }, { "epoch": 0.16462507155123068, "grad_norm": 1.618739128112793, "learning_rate": 1.9076183777671553e-05, "loss": 1.1347, "step": 719 }, { "epoch": 0.1648540354894104, "grad_norm": 2.2287635803222656, "learning_rate": 1.907306716315505e-05, "loss": 1.1554, "step": 720 }, { "epoch": 0.16508299942759017, "grad_norm": 1.8814674615859985, "learning_rate": 1.9069945555805756e-05, "loss": 1.1277, "step": 721 }, { "epoch": 0.1653119633657699, "grad_norm": 1.54645836353302, "learning_rate": 1.9066818957341463e-05, "loss": 1.1175, "step": 722 }, { "epoch": 0.16554092730394962, "grad_norm": 1.5343029499053955, "learning_rate": 1.906368736948272e-05, "loss": 1.1077, "step": 723 }, { "epoch": 0.16576989124212937, "grad_norm": 1.270850658416748, "learning_rate": 1.9060550793952803e-05, "loss": 1.1389, "step": 724 }, { "epoch": 0.1659988551803091, "grad_norm": 1.866256594657898, "learning_rate": 1.905740923247775e-05, "loss": 1.122, "step": 725 }, { "epoch": 0.16622781911848883, "grad_norm": 1.560256004333496, "learning_rate": 1.9054262686786332e-05, "loss": 1.1524, "step": 726 }, { "epoch": 0.16645678305666858, "grad_norm": 1.3043514490127563, "learning_rate": 1.905111115861007e-05, "loss": 1.1863, "step": 727 }, { "epoch": 0.1666857469948483, "grad_norm": 1.1351722478866577, "learning_rate": 1.9047954649683217e-05, "loss": 1.1149, "step": 728 }, { "epoch": 0.16691471093302804, "grad_norm": 1.6116639375686646, "learning_rate": 1.9044793161742783e-05, "loss": 1.1352, "step": 729 }, { "epoch": 0.1671436748712078, "grad_norm": 1.3612065315246582, "learning_rate": 1.9041626696528503e-05, "loss": 1.1206, "step": 730 }, { "epoch": 0.16737263880938752, "grad_norm": 1.5191378593444824, "learning_rate": 1.903845525578286e-05, "loss": 1.1476, "step": 731 }, { "epoch": 0.16760160274756725, "grad_norm": 1.3559706211090088, "learning_rate": 1.903527884125106e-05, "loss": 1.0739, "step": 732 }, { "epoch": 0.167830566685747, "grad_norm": 2.067094087600708, "learning_rate": 1.9032097454681074e-05, "loss": 1.1463, "step": 733 }, { "epoch": 0.16805953062392673, "grad_norm": 1.6561821699142456, "learning_rate": 1.9028911097823578e-05, "loss": 1.1236, "step": 734 }, { "epoch": 0.16828849456210646, "grad_norm": 1.81977117061615, "learning_rate": 1.9025719772432006e-05, "loss": 1.1047, "step": 735 }, { "epoch": 0.1685174585002862, "grad_norm": 1.2570257186889648, "learning_rate": 1.9022523480262517e-05, "loss": 1.1246, "step": 736 }, { "epoch": 0.16874642243846594, "grad_norm": 1.2872344255447388, "learning_rate": 1.9019322223073997e-05, "loss": 1.1549, "step": 737 }, { "epoch": 0.1689753863766457, "grad_norm": 1.111267328262329, "learning_rate": 1.901611600262808e-05, "loss": 1.0933, "step": 738 }, { "epoch": 0.16920435031482542, "grad_norm": 1.3469321727752686, "learning_rate": 1.9012904820689114e-05, "loss": 1.1454, "step": 739 }, { "epoch": 0.16943331425300515, "grad_norm": 1.3398330211639404, "learning_rate": 1.900968867902419e-05, "loss": 1.0773, "step": 740 }, { "epoch": 0.1696622781911849, "grad_norm": 1.4936128854751587, "learning_rate": 1.9006467579403128e-05, "loss": 1.1834, "step": 741 }, { "epoch": 0.16989124212936463, "grad_norm": 1.2364333868026733, "learning_rate": 1.9003241523598465e-05, "loss": 1.1359, "step": 742 }, { "epoch": 0.17012020606754436, "grad_norm": 2.011514663696289, "learning_rate": 1.900001051338547e-05, "loss": 1.1716, "step": 743 }, { "epoch": 0.1703491700057241, "grad_norm": 1.3461471796035767, "learning_rate": 1.899677455054215e-05, "loss": 1.1082, "step": 744 }, { "epoch": 0.17057813394390384, "grad_norm": 1.3014931678771973, "learning_rate": 1.8993533636849223e-05, "loss": 1.1362, "step": 745 }, { "epoch": 0.17080709788208356, "grad_norm": 1.2128814458847046, "learning_rate": 1.8990287774090137e-05, "loss": 1.0883, "step": 746 }, { "epoch": 0.17103606182026332, "grad_norm": 1.4403022527694702, "learning_rate": 1.8987036964051065e-05, "loss": 1.1065, "step": 747 }, { "epoch": 0.17126502575844305, "grad_norm": 1.252302885055542, "learning_rate": 1.8983781208520898e-05, "loss": 1.1181, "step": 748 }, { "epoch": 0.17149398969662277, "grad_norm": 1.318610429763794, "learning_rate": 1.8980520509291255e-05, "loss": 1.1492, "step": 749 }, { "epoch": 0.17172295363480253, "grad_norm": 1.5634231567382812, "learning_rate": 1.8977254868156465e-05, "loss": 1.0785, "step": 750 }, { "epoch": 0.17195191757298225, "grad_norm": 2.146759033203125, "learning_rate": 1.8973984286913584e-05, "loss": 1.1331, "step": 751 }, { "epoch": 0.17218088151116198, "grad_norm": 1.2476396560668945, "learning_rate": 1.897070876736239e-05, "loss": 1.1564, "step": 752 }, { "epoch": 0.17240984544934174, "grad_norm": 1.4595783948898315, "learning_rate": 1.8967428311305375e-05, "loss": 1.1155, "step": 753 }, { "epoch": 0.17263880938752146, "grad_norm": 1.1093003749847412, "learning_rate": 1.896414292054774e-05, "loss": 1.1345, "step": 754 }, { "epoch": 0.1728677733257012, "grad_norm": 1.1664748191833496, "learning_rate": 1.896085259689741e-05, "loss": 1.0583, "step": 755 }, { "epoch": 0.17309673726388095, "grad_norm": 1.5106245279312134, "learning_rate": 1.8957557342165026e-05, "loss": 1.1689, "step": 756 }, { "epoch": 0.17332570120206067, "grad_norm": 1.3270502090454102, "learning_rate": 1.8954257158163936e-05, "loss": 1.1115, "step": 757 }, { "epoch": 0.1735546651402404, "grad_norm": 1.5598188638687134, "learning_rate": 1.895095204671021e-05, "loss": 1.1486, "step": 758 }, { "epoch": 0.17378362907842015, "grad_norm": 1.5022259950637817, "learning_rate": 1.8947642009622612e-05, "loss": 1.1217, "step": 759 }, { "epoch": 0.17401259301659988, "grad_norm": 1.3121103048324585, "learning_rate": 1.8944327048722634e-05, "loss": 1.1604, "step": 760 }, { "epoch": 0.1742415569547796, "grad_norm": 1.4838922023773193, "learning_rate": 1.8941007165834468e-05, "loss": 1.2222, "step": 761 }, { "epoch": 0.17447052089295936, "grad_norm": 1.4497283697128296, "learning_rate": 1.8937682362785025e-05, "loss": 1.1726, "step": 762 }, { "epoch": 0.1746994848311391, "grad_norm": 1.3479863405227661, "learning_rate": 1.8934352641403907e-05, "loss": 1.1307, "step": 763 }, { "epoch": 0.17492844876931885, "grad_norm": 1.622117042541504, "learning_rate": 1.8931018003523438e-05, "loss": 1.0881, "step": 764 }, { "epoch": 0.17515741270749857, "grad_norm": 1.377864956855774, "learning_rate": 1.892767845097864e-05, "loss": 1.0872, "step": 765 }, { "epoch": 0.1753863766456783, "grad_norm": 1.4404441118240356, "learning_rate": 1.8924333985607237e-05, "loss": 1.1333, "step": 766 }, { "epoch": 0.17561534058385805, "grad_norm": 1.5697600841522217, "learning_rate": 1.8920984609249667e-05, "loss": 1.1069, "step": 767 }, { "epoch": 0.17584430452203778, "grad_norm": 1.4469575881958008, "learning_rate": 1.8917630323749056e-05, "loss": 1.1294, "step": 768 }, { "epoch": 0.1760732684602175, "grad_norm": 1.2435827255249023, "learning_rate": 1.8914271130951246e-05, "loss": 1.1437, "step": 769 }, { "epoch": 0.17630223239839726, "grad_norm": 1.4122284650802612, "learning_rate": 1.891090703270477e-05, "loss": 1.1627, "step": 770 }, { "epoch": 0.176531196336577, "grad_norm": 1.3929946422576904, "learning_rate": 1.8907538030860865e-05, "loss": 1.1148, "step": 771 }, { "epoch": 0.17676016027475672, "grad_norm": 1.9755219221115112, "learning_rate": 1.890416412727346e-05, "loss": 1.1699, "step": 772 }, { "epoch": 0.17698912421293647, "grad_norm": 1.6826586723327637, "learning_rate": 1.890078532379919e-05, "loss": 1.0846, "step": 773 }, { "epoch": 0.1772180881511162, "grad_norm": 1.2816320657730103, "learning_rate": 1.8897401622297375e-05, "loss": 1.2024, "step": 774 }, { "epoch": 0.17744705208929593, "grad_norm": 1.4006059169769287, "learning_rate": 1.889401302463005e-05, "loss": 1.116, "step": 775 }, { "epoch": 0.17767601602747568, "grad_norm": 1.476759910583496, "learning_rate": 1.889061953266192e-05, "loss": 1.0944, "step": 776 }, { "epoch": 0.1779049799656554, "grad_norm": 1.5726763010025024, "learning_rate": 1.8887221148260404e-05, "loss": 1.1472, "step": 777 }, { "epoch": 0.17813394390383513, "grad_norm": 1.337249517440796, "learning_rate": 1.8883817873295597e-05, "loss": 1.1403, "step": 778 }, { "epoch": 0.1783629078420149, "grad_norm": 1.338131308555603, "learning_rate": 1.88804097096403e-05, "loss": 1.0678, "step": 779 }, { "epoch": 0.17859187178019462, "grad_norm": 1.291359543800354, "learning_rate": 1.887699665916999e-05, "loss": 1.1212, "step": 780 }, { "epoch": 0.17882083571837434, "grad_norm": 1.7371107339859009, "learning_rate": 1.8873578723762845e-05, "loss": 1.0984, "step": 781 }, { "epoch": 0.1790497996565541, "grad_norm": 1.5285062789916992, "learning_rate": 1.8870155905299725e-05, "loss": 1.1511, "step": 782 }, { "epoch": 0.17927876359473383, "grad_norm": 1.693925380706787, "learning_rate": 1.8866728205664177e-05, "loss": 1.117, "step": 783 }, { "epoch": 0.17950772753291355, "grad_norm": 1.2213523387908936, "learning_rate": 1.8863295626742438e-05, "loss": 1.2201, "step": 784 }, { "epoch": 0.1797366914710933, "grad_norm": 1.410949945449829, "learning_rate": 1.8859858170423423e-05, "loss": 1.1321, "step": 785 }, { "epoch": 0.17996565540927303, "grad_norm": 1.3400486707687378, "learning_rate": 1.8856415838598738e-05, "loss": 1.2231, "step": 786 }, { "epoch": 0.1801946193474528, "grad_norm": 1.147056221961975, "learning_rate": 1.885296863316267e-05, "loss": 1.1623, "step": 787 }, { "epoch": 0.18042358328563252, "grad_norm": 1.1560708284378052, "learning_rate": 1.8849516556012183e-05, "loss": 1.1123, "step": 788 }, { "epoch": 0.18065254722381224, "grad_norm": 1.350136399269104, "learning_rate": 1.8846059609046935e-05, "loss": 1.1598, "step": 789 }, { "epoch": 0.180881511161992, "grad_norm": 1.242692232131958, "learning_rate": 1.8842597794169245e-05, "loss": 1.1087, "step": 790 }, { "epoch": 0.18111047510017173, "grad_norm": 1.2484183311462402, "learning_rate": 1.883913111328413e-05, "loss": 1.2007, "step": 791 }, { "epoch": 0.18133943903835145, "grad_norm": 1.3073456287384033, "learning_rate": 1.8835659568299268e-05, "loss": 1.1419, "step": 792 }, { "epoch": 0.1815684029765312, "grad_norm": 1.2637773752212524, "learning_rate": 1.8832183161125026e-05, "loss": 1.1126, "step": 793 }, { "epoch": 0.18179736691471093, "grad_norm": 2.1271276473999023, "learning_rate": 1.8828701893674435e-05, "loss": 1.0735, "step": 794 }, { "epoch": 0.18202633085289066, "grad_norm": 1.8395510911941528, "learning_rate": 1.8825215767863215e-05, "loss": 1.1168, "step": 795 }, { "epoch": 0.18225529479107042, "grad_norm": 1.3857455253601074, "learning_rate": 1.882172478560975e-05, "loss": 1.1164, "step": 796 }, { "epoch": 0.18248425872925014, "grad_norm": 1.313710331916809, "learning_rate": 1.8818228948835095e-05, "loss": 1.1883, "step": 797 }, { "epoch": 0.18271322266742987, "grad_norm": 1.5113545656204224, "learning_rate": 1.8814728259462978e-05, "loss": 1.1948, "step": 798 }, { "epoch": 0.18294218660560962, "grad_norm": 1.3891445398330688, "learning_rate": 1.8811222719419808e-05, "loss": 1.1182, "step": 799 }, { "epoch": 0.18317115054378935, "grad_norm": 1.672825574874878, "learning_rate": 1.8807712330634645e-05, "loss": 1.1114, "step": 800 }, { "epoch": 0.18340011448196908, "grad_norm": 1.299664855003357, "learning_rate": 1.880419709503923e-05, "loss": 1.1228, "step": 801 }, { "epoch": 0.18362907842014883, "grad_norm": 1.6080543994903564, "learning_rate": 1.8800677014567972e-05, "loss": 1.1325, "step": 802 }, { "epoch": 0.18385804235832856, "grad_norm": 1.4526350498199463, "learning_rate": 1.8797152091157935e-05, "loss": 1.109, "step": 803 }, { "epoch": 0.1840870062965083, "grad_norm": 1.2981843948364258, "learning_rate": 1.8793622326748857e-05, "loss": 1.1266, "step": 804 }, { "epoch": 0.18431597023468804, "grad_norm": 1.715675711631775, "learning_rate": 1.879008772328314e-05, "loss": 1.1661, "step": 805 }, { "epoch": 0.18454493417286777, "grad_norm": 1.5412567853927612, "learning_rate": 1.8786548282705847e-05, "loss": 1.0697, "step": 806 }, { "epoch": 0.1847738981110475, "grad_norm": 1.3529974222183228, "learning_rate": 1.87830040069647e-05, "loss": 1.1753, "step": 807 }, { "epoch": 0.18500286204922725, "grad_norm": 1.229610800743103, "learning_rate": 1.877945489801008e-05, "loss": 1.1039, "step": 808 }, { "epoch": 0.18523182598740698, "grad_norm": 1.704888939857483, "learning_rate": 1.8775900957795042e-05, "loss": 1.1366, "step": 809 }, { "epoch": 0.18546078992558673, "grad_norm": 1.142831802368164, "learning_rate": 1.877234218827528e-05, "loss": 1.0946, "step": 810 }, { "epoch": 0.18568975386376646, "grad_norm": 1.304379940032959, "learning_rate": 1.876877859140916e-05, "loss": 1.1063, "step": 811 }, { "epoch": 0.1859187178019462, "grad_norm": 1.1813594102859497, "learning_rate": 1.8765210169157703e-05, "loss": 1.1067, "step": 812 }, { "epoch": 0.18614768174012594, "grad_norm": 1.6390926837921143, "learning_rate": 1.876163692348457e-05, "loss": 1.1525, "step": 813 }, { "epoch": 0.18637664567830567, "grad_norm": 1.6113990545272827, "learning_rate": 1.87580588563561e-05, "loss": 1.1551, "step": 814 }, { "epoch": 0.1866056096164854, "grad_norm": 1.4802757501602173, "learning_rate": 1.8754475969741272e-05, "loss": 1.1503, "step": 815 }, { "epoch": 0.18683457355466515, "grad_norm": 1.2753384113311768, "learning_rate": 1.8750888265611708e-05, "loss": 1.1212, "step": 816 }, { "epoch": 0.18706353749284488, "grad_norm": 1.1978129148483276, "learning_rate": 1.8747295745941705e-05, "loss": 1.1429, "step": 817 }, { "epoch": 0.1872925014310246, "grad_norm": 1.1764678955078125, "learning_rate": 1.8743698412708187e-05, "loss": 1.1981, "step": 818 }, { "epoch": 0.18752146536920436, "grad_norm": 1.5134238004684448, "learning_rate": 1.874009626789074e-05, "loss": 1.1221, "step": 819 }, { "epoch": 0.1877504293073841, "grad_norm": 1.2333041429519653, "learning_rate": 1.87364893134716e-05, "loss": 1.1512, "step": 820 }, { "epoch": 0.18797939324556381, "grad_norm": 1.1870229244232178, "learning_rate": 1.873287755143563e-05, "loss": 1.1544, "step": 821 }, { "epoch": 0.18820835718374357, "grad_norm": 1.222976565361023, "learning_rate": 1.872926098377036e-05, "loss": 1.1167, "step": 822 }, { "epoch": 0.1884373211219233, "grad_norm": 1.2639892101287842, "learning_rate": 1.872563961246596e-05, "loss": 1.1357, "step": 823 }, { "epoch": 0.18866628506010302, "grad_norm": 1.1272286176681519, "learning_rate": 1.8722013439515236e-05, "loss": 1.1222, "step": 824 }, { "epoch": 0.18889524899828278, "grad_norm": 1.2517902851104736, "learning_rate": 1.8718382466913642e-05, "loss": 1.0577, "step": 825 }, { "epoch": 0.1891242129364625, "grad_norm": 1.3976292610168457, "learning_rate": 1.871474669665927e-05, "loss": 1.125, "step": 826 }, { "epoch": 0.18935317687464223, "grad_norm": 1.212823510169983, "learning_rate": 1.8711106130752855e-05, "loss": 1.1143, "step": 827 }, { "epoch": 0.189582140812822, "grad_norm": 1.4794899225234985, "learning_rate": 1.8707460771197773e-05, "loss": 1.1415, "step": 828 }, { "epoch": 0.1898111047510017, "grad_norm": 1.4691826105117798, "learning_rate": 1.8703810620000033e-05, "loss": 1.1055, "step": 829 }, { "epoch": 0.19004006868918144, "grad_norm": 1.352226972579956, "learning_rate": 1.8700155679168277e-05, "loss": 1.1638, "step": 830 }, { "epoch": 0.1902690326273612, "grad_norm": 1.3408719301223755, "learning_rate": 1.8696495950713794e-05, "loss": 1.1159, "step": 831 }, { "epoch": 0.19049799656554092, "grad_norm": 1.0246559381484985, "learning_rate": 1.8692831436650505e-05, "loss": 1.1375, "step": 832 }, { "epoch": 0.19072696050372068, "grad_norm": 1.1187310218811035, "learning_rate": 1.8689162138994952e-05, "loss": 1.1453, "step": 833 }, { "epoch": 0.1909559244419004, "grad_norm": 1.2305899858474731, "learning_rate": 1.868548805976633e-05, "loss": 1.1069, "step": 834 }, { "epoch": 0.19118488838008013, "grad_norm": 1.4860109090805054, "learning_rate": 1.868180920098644e-05, "loss": 1.0497, "step": 835 }, { "epoch": 0.19141385231825989, "grad_norm": 1.2232182025909424, "learning_rate": 1.867812556467974e-05, "loss": 1.131, "step": 836 }, { "epoch": 0.1916428162564396, "grad_norm": 1.3128330707550049, "learning_rate": 1.8674437152873296e-05, "loss": 1.2106, "step": 837 }, { "epoch": 0.19187178019461934, "grad_norm": 1.2296591997146606, "learning_rate": 1.8670743967596817e-05, "loss": 1.1277, "step": 838 }, { "epoch": 0.1921007441327991, "grad_norm": 1.332777976989746, "learning_rate": 1.8667046010882627e-05, "loss": 1.1003, "step": 839 }, { "epoch": 0.19232970807097882, "grad_norm": 1.8379496335983276, "learning_rate": 1.866334328476568e-05, "loss": 1.1812, "step": 840 }, { "epoch": 0.19255867200915855, "grad_norm": 1.1897050142288208, "learning_rate": 1.865963579128356e-05, "loss": 1.0811, "step": 841 }, { "epoch": 0.1927876359473383, "grad_norm": 1.7345019578933716, "learning_rate": 1.8655923532476463e-05, "loss": 1.1679, "step": 842 }, { "epoch": 0.19301659988551803, "grad_norm": 1.20344877243042, "learning_rate": 1.865220651038722e-05, "loss": 1.1832, "step": 843 }, { "epoch": 0.19324556382369776, "grad_norm": 1.2605164051055908, "learning_rate": 1.864848472706127e-05, "loss": 1.1355, "step": 844 }, { "epoch": 0.1934745277618775, "grad_norm": 1.1412873268127441, "learning_rate": 1.864475818454669e-05, "loss": 1.1202, "step": 845 }, { "epoch": 0.19370349170005724, "grad_norm": 1.1985411643981934, "learning_rate": 1.8641026884894156e-05, "loss": 1.1509, "step": 846 }, { "epoch": 0.19393245563823697, "grad_norm": 1.0788958072662354, "learning_rate": 1.8637290830156972e-05, "loss": 1.1141, "step": 847 }, { "epoch": 0.19416141957641672, "grad_norm": 1.2294611930847168, "learning_rate": 1.8633550022391062e-05, "loss": 1.1473, "step": 848 }, { "epoch": 0.19439038351459645, "grad_norm": 1.1396217346191406, "learning_rate": 1.8629804463654956e-05, "loss": 1.141, "step": 849 }, { "epoch": 0.19461934745277618, "grad_norm": 1.0818895101547241, "learning_rate": 1.8626054156009807e-05, "loss": 1.1195, "step": 850 }, { "epoch": 0.19484831139095593, "grad_norm": 1.287191390991211, "learning_rate": 1.862229910151938e-05, "loss": 1.1387, "step": 851 }, { "epoch": 0.19507727532913566, "grad_norm": 1.1082041263580322, "learning_rate": 1.8618539302250044e-05, "loss": 1.1418, "step": 852 }, { "epoch": 0.19530623926731538, "grad_norm": 1.2280734777450562, "learning_rate": 1.8614774760270785e-05, "loss": 1.157, "step": 853 }, { "epoch": 0.19553520320549514, "grad_norm": 1.1522899866104126, "learning_rate": 1.8611005477653204e-05, "loss": 1.0825, "step": 854 }, { "epoch": 0.19576416714367487, "grad_norm": 1.2309021949768066, "learning_rate": 1.8607231456471505e-05, "loss": 1.151, "step": 855 }, { "epoch": 0.19599313108185462, "grad_norm": 1.141215205192566, "learning_rate": 1.8603452698802498e-05, "loss": 1.0634, "step": 856 }, { "epoch": 0.19622209502003435, "grad_norm": 1.6352757215499878, "learning_rate": 1.85996692067256e-05, "loss": 1.14, "step": 857 }, { "epoch": 0.19645105895821408, "grad_norm": 1.2608399391174316, "learning_rate": 1.859588098232284e-05, "loss": 1.1651, "step": 858 }, { "epoch": 0.19668002289639383, "grad_norm": 1.3403522968292236, "learning_rate": 1.859208802767884e-05, "loss": 1.0751, "step": 859 }, { "epoch": 0.19690898683457356, "grad_norm": 1.5046757459640503, "learning_rate": 1.858829034488084e-05, "loss": 1.1455, "step": 860 }, { "epoch": 0.19713795077275328, "grad_norm": 1.1532163619995117, "learning_rate": 1.8584487936018663e-05, "loss": 1.0847, "step": 861 }, { "epoch": 0.19736691471093304, "grad_norm": 1.1010034084320068, "learning_rate": 1.858068080318475e-05, "loss": 1.1729, "step": 862 }, { "epoch": 0.19759587864911277, "grad_norm": 1.2024052143096924, "learning_rate": 1.857686894847413e-05, "loss": 1.1543, "step": 863 }, { "epoch": 0.1978248425872925, "grad_norm": 1.4353835582733154, "learning_rate": 1.8573052373984435e-05, "loss": 1.1061, "step": 864 }, { "epoch": 0.19805380652547225, "grad_norm": 1.281450867652893, "learning_rate": 1.8569231081815895e-05, "loss": 1.0886, "step": 865 }, { "epoch": 0.19828277046365198, "grad_norm": 1.2481147050857544, "learning_rate": 1.8565405074071338e-05, "loss": 1.0985, "step": 866 }, { "epoch": 0.1985117344018317, "grad_norm": 1.2200899124145508, "learning_rate": 1.8561574352856176e-05, "loss": 1.1767, "step": 867 }, { "epoch": 0.19874069834001146, "grad_norm": 1.187233328819275, "learning_rate": 1.855773892027843e-05, "loss": 1.0757, "step": 868 }, { "epoch": 0.19896966227819118, "grad_norm": 1.319388508796692, "learning_rate": 1.855389877844871e-05, "loss": 1.1752, "step": 869 }, { "epoch": 0.1991986262163709, "grad_norm": 1.2449707984924316, "learning_rate": 1.8550053929480202e-05, "loss": 1.1535, "step": 870 }, { "epoch": 0.19942759015455067, "grad_norm": 1.315616250038147, "learning_rate": 1.8546204375488702e-05, "loss": 1.0604, "step": 871 }, { "epoch": 0.1996565540927304, "grad_norm": 1.368754267692566, "learning_rate": 1.8542350118592585e-05, "loss": 1.1437, "step": 872 }, { "epoch": 0.19988551803091012, "grad_norm": 1.2946205139160156, "learning_rate": 1.853849116091282e-05, "loss": 1.1325, "step": 873 }, { "epoch": 0.20011448196908987, "grad_norm": 1.3224684000015259, "learning_rate": 1.853462750457295e-05, "loss": 1.0641, "step": 874 }, { "epoch": 0.2003434459072696, "grad_norm": 1.159767985343933, "learning_rate": 1.8530759151699122e-05, "loss": 1.0747, "step": 875 }, { "epoch": 0.20057240984544933, "grad_norm": 1.1444604396820068, "learning_rate": 1.8526886104420056e-05, "loss": 1.1565, "step": 876 }, { "epoch": 0.20080137378362908, "grad_norm": 1.265729308128357, "learning_rate": 1.8523008364867056e-05, "loss": 1.0907, "step": 877 }, { "epoch": 0.2010303377218088, "grad_norm": 1.0625101327896118, "learning_rate": 1.851912593517401e-05, "loss": 1.1093, "step": 878 }, { "epoch": 0.20125930165998857, "grad_norm": 1.4997196197509766, "learning_rate": 1.8515238817477382e-05, "loss": 1.1674, "step": 879 }, { "epoch": 0.2014882655981683, "grad_norm": 1.2591129541397095, "learning_rate": 1.8511347013916228e-05, "loss": 1.1546, "step": 880 }, { "epoch": 0.20171722953634802, "grad_norm": 1.1916770935058594, "learning_rate": 1.850745052663217e-05, "loss": 1.094, "step": 881 }, { "epoch": 0.20194619347452777, "grad_norm": 1.0707792043685913, "learning_rate": 1.850354935776941e-05, "loss": 1.0586, "step": 882 }, { "epoch": 0.2021751574127075, "grad_norm": 1.159035086631775, "learning_rate": 1.8499643509474738e-05, "loss": 1.1245, "step": 883 }, { "epoch": 0.20240412135088723, "grad_norm": 1.2937264442443848, "learning_rate": 1.8495732983897504e-05, "loss": 1.1293, "step": 884 }, { "epoch": 0.20263308528906698, "grad_norm": 1.2372575998306274, "learning_rate": 1.8491817783189636e-05, "loss": 1.1198, "step": 885 }, { "epoch": 0.2028620492272467, "grad_norm": 1.1078606843948364, "learning_rate": 1.8487897909505637e-05, "loss": 1.1512, "step": 886 }, { "epoch": 0.20309101316542644, "grad_norm": 1.2595610618591309, "learning_rate": 1.848397336500258e-05, "loss": 1.1433, "step": 887 }, { "epoch": 0.2033199771036062, "grad_norm": 1.1265288591384888, "learning_rate": 1.848004415184011e-05, "loss": 1.1761, "step": 888 }, { "epoch": 0.20354894104178592, "grad_norm": 1.1440848112106323, "learning_rate": 1.8476110272180443e-05, "loss": 1.173, "step": 889 }, { "epoch": 0.20377790497996565, "grad_norm": 1.281594157218933, "learning_rate": 1.8472171728188356e-05, "loss": 1.1097, "step": 890 }, { "epoch": 0.2040068689181454, "grad_norm": 1.2258485555648804, "learning_rate": 1.8468228522031197e-05, "loss": 1.0735, "step": 891 }, { "epoch": 0.20423583285632513, "grad_norm": 1.1039539575576782, "learning_rate": 1.8464280655878876e-05, "loss": 1.1628, "step": 892 }, { "epoch": 0.20446479679450486, "grad_norm": 1.0621813535690308, "learning_rate": 1.846032813190388e-05, "loss": 1.1138, "step": 893 }, { "epoch": 0.2046937607326846, "grad_norm": 1.2525299787521362, "learning_rate": 1.8456370952281243e-05, "loss": 1.1612, "step": 894 }, { "epoch": 0.20492272467086434, "grad_norm": 1.553027629852295, "learning_rate": 1.845240911918857e-05, "loss": 1.1157, "step": 895 }, { "epoch": 0.20515168860904406, "grad_norm": 1.309720754623413, "learning_rate": 1.844844263480602e-05, "loss": 1.1009, "step": 896 }, { "epoch": 0.20538065254722382, "grad_norm": 1.2567998170852661, "learning_rate": 1.8444471501316324e-05, "loss": 1.136, "step": 897 }, { "epoch": 0.20560961648540355, "grad_norm": 1.242303490638733, "learning_rate": 1.8440495720904758e-05, "loss": 1.1018, "step": 898 }, { "epoch": 0.20583858042358327, "grad_norm": 1.5786601305007935, "learning_rate": 1.843651529575916e-05, "loss": 1.126, "step": 899 }, { "epoch": 0.20606754436176303, "grad_norm": 1.179705262184143, "learning_rate": 1.843253022806993e-05, "loss": 1.1069, "step": 900 }, { "epoch": 0.20629650829994275, "grad_norm": 1.4622169733047485, "learning_rate": 1.842854052003001e-05, "loss": 1.0582, "step": 901 }, { "epoch": 0.2065254722381225, "grad_norm": 1.43120276927948, "learning_rate": 1.842454617383491e-05, "loss": 1.0877, "step": 902 }, { "epoch": 0.20675443617630224, "grad_norm": 1.0542099475860596, "learning_rate": 1.842054719168268e-05, "loss": 1.0746, "step": 903 }, { "epoch": 0.20698340011448196, "grad_norm": 1.2692487239837646, "learning_rate": 1.841654357577393e-05, "loss": 1.1247, "step": 904 }, { "epoch": 0.20721236405266172, "grad_norm": 1.1795223951339722, "learning_rate": 1.8412535328311813e-05, "loss": 1.186, "step": 905 }, { "epoch": 0.20744132799084145, "grad_norm": 1.6526825428009033, "learning_rate": 1.8408522451502038e-05, "loss": 1.2059, "step": 906 }, { "epoch": 0.20767029192902117, "grad_norm": 2.6972272396087646, "learning_rate": 1.840450494755285e-05, "loss": 1.186, "step": 907 }, { "epoch": 0.20789925586720093, "grad_norm": 1.294978141784668, "learning_rate": 1.840048281867506e-05, "loss": 1.1069, "step": 908 }, { "epoch": 0.20812821980538065, "grad_norm": 1.686403512954712, "learning_rate": 1.8396456067082e-05, "loss": 1.1007, "step": 909 }, { "epoch": 0.20835718374356038, "grad_norm": 1.4886486530303955, "learning_rate": 1.839242469498956e-05, "loss": 1.1646, "step": 910 }, { "epoch": 0.20858614768174014, "grad_norm": 1.4273146390914917, "learning_rate": 1.8388388704616177e-05, "loss": 1.0601, "step": 911 }, { "epoch": 0.20881511161991986, "grad_norm": 1.9175933599472046, "learning_rate": 1.8384348098182815e-05, "loss": 1.1559, "step": 912 }, { "epoch": 0.2090440755580996, "grad_norm": 1.3737765550613403, "learning_rate": 1.8380302877912993e-05, "loss": 1.185, "step": 913 }, { "epoch": 0.20927303949627934, "grad_norm": 1.3466029167175293, "learning_rate": 1.837625304603275e-05, "loss": 1.14, "step": 914 }, { "epoch": 0.20950200343445907, "grad_norm": 1.427209734916687, "learning_rate": 1.837219860477069e-05, "loss": 1.1444, "step": 915 }, { "epoch": 0.2097309673726388, "grad_norm": 1.7554882764816284, "learning_rate": 1.836813955635793e-05, "loss": 1.1354, "step": 916 }, { "epoch": 0.20995993131081855, "grad_norm": 1.5630890130996704, "learning_rate": 1.8364075903028128e-05, "loss": 1.0917, "step": 917 }, { "epoch": 0.21018889524899828, "grad_norm": 1.6305307149887085, "learning_rate": 1.836000764701748e-05, "loss": 1.0982, "step": 918 }, { "epoch": 0.210417859187178, "grad_norm": 1.154215931892395, "learning_rate": 1.8355934790564718e-05, "loss": 1.1199, "step": 919 }, { "epoch": 0.21064682312535776, "grad_norm": 1.440573811531067, "learning_rate": 1.8351857335911094e-05, "loss": 1.1621, "step": 920 }, { "epoch": 0.2108757870635375, "grad_norm": 1.2942014932632446, "learning_rate": 1.83477752853004e-05, "loss": 1.1378, "step": 921 }, { "epoch": 0.21110475100171722, "grad_norm": 3.2026820182800293, "learning_rate": 1.8343688640978955e-05, "loss": 1.2049, "step": 922 }, { "epoch": 0.21133371493989697, "grad_norm": 1.3892024755477905, "learning_rate": 1.8339597405195607e-05, "loss": 1.1328, "step": 923 }, { "epoch": 0.2115626788780767, "grad_norm": 1.5560742616653442, "learning_rate": 1.833550158020172e-05, "loss": 1.1212, "step": 924 }, { "epoch": 0.21179164281625643, "grad_norm": 1.1782360076904297, "learning_rate": 1.83314011682512e-05, "loss": 1.1153, "step": 925 }, { "epoch": 0.21202060675443618, "grad_norm": 1.5242149829864502, "learning_rate": 1.832729617160047e-05, "loss": 1.1059, "step": 926 }, { "epoch": 0.2122495706926159, "grad_norm": 1.157873272895813, "learning_rate": 1.8323186592508474e-05, "loss": 1.1489, "step": 927 }, { "epoch": 0.21247853463079566, "grad_norm": 1.1628938913345337, "learning_rate": 1.8319072433236677e-05, "loss": 1.0962, "step": 928 }, { "epoch": 0.2127074985689754, "grad_norm": 1.0474616289138794, "learning_rate": 1.831495369604907e-05, "loss": 1.1517, "step": 929 }, { "epoch": 0.21293646250715512, "grad_norm": 1.7184207439422607, "learning_rate": 1.831083038321215e-05, "loss": 1.1172, "step": 930 }, { "epoch": 0.21316542644533487, "grad_norm": 1.8269110918045044, "learning_rate": 1.830670249699495e-05, "loss": 1.1521, "step": 931 }, { "epoch": 0.2133943903835146, "grad_norm": 1.1638559103012085, "learning_rate": 1.830257003966901e-05, "loss": 1.1759, "step": 932 }, { "epoch": 0.21362335432169433, "grad_norm": 1.1705409288406372, "learning_rate": 1.8298433013508384e-05, "loss": 1.0605, "step": 933 }, { "epoch": 0.21385231825987408, "grad_norm": 1.2644610404968262, "learning_rate": 1.8294291420789648e-05, "loss": 1.1405, "step": 934 }, { "epoch": 0.2140812821980538, "grad_norm": 1.0992416143417358, "learning_rate": 1.8290145263791883e-05, "loss": 1.108, "step": 935 }, { "epoch": 0.21431024613623353, "grad_norm": 1.291539192199707, "learning_rate": 1.8285994544796685e-05, "loss": 1.084, "step": 936 }, { "epoch": 0.2145392100744133, "grad_norm": 1.244376540184021, "learning_rate": 1.8281839266088155e-05, "loss": 1.1151, "step": 937 }, { "epoch": 0.21476817401259302, "grad_norm": 1.0517001152038574, "learning_rate": 1.8277679429952913e-05, "loss": 1.1116, "step": 938 }, { "epoch": 0.21499713795077274, "grad_norm": 1.0727293491363525, "learning_rate": 1.827351503868008e-05, "loss": 1.1381, "step": 939 }, { "epoch": 0.2152261018889525, "grad_norm": 1.1680247783660889, "learning_rate": 1.826934609456129e-05, "loss": 1.1139, "step": 940 }, { "epoch": 0.21545506582713222, "grad_norm": 1.4314526319503784, "learning_rate": 1.826517259989067e-05, "loss": 1.1778, "step": 941 }, { "epoch": 0.21568402976531195, "grad_norm": 1.3765356540679932, "learning_rate": 1.8260994556964865e-05, "loss": 1.1226, "step": 942 }, { "epoch": 0.2159129937034917, "grad_norm": 1.1225863695144653, "learning_rate": 1.8256811968083016e-05, "loss": 1.0951, "step": 943 }, { "epoch": 0.21614195764167143, "grad_norm": 1.3155685663223267, "learning_rate": 1.8252624835546768e-05, "loss": 1.1253, "step": 944 }, { "epoch": 0.21637092157985116, "grad_norm": 1.3809353113174438, "learning_rate": 1.824843316166026e-05, "loss": 1.0972, "step": 945 }, { "epoch": 0.21659988551803092, "grad_norm": 1.5605497360229492, "learning_rate": 1.8244236948730138e-05, "loss": 1.1479, "step": 946 }, { "epoch": 0.21682884945621064, "grad_norm": 1.3077151775360107, "learning_rate": 1.8240036199065546e-05, "loss": 1.0822, "step": 947 }, { "epoch": 0.21705781339439037, "grad_norm": 1.3724560737609863, "learning_rate": 1.8235830914978113e-05, "loss": 1.1366, "step": 948 }, { "epoch": 0.21728677733257012, "grad_norm": 1.2070374488830566, "learning_rate": 1.8231621098781983e-05, "loss": 1.1371, "step": 949 }, { "epoch": 0.21751574127074985, "grad_norm": 1.3094055652618408, "learning_rate": 1.822740675279377e-05, "loss": 1.1442, "step": 950 }, { "epoch": 0.2177447052089296, "grad_norm": 1.3075108528137207, "learning_rate": 1.8223187879332604e-05, "loss": 1.1095, "step": 951 }, { "epoch": 0.21797366914710933, "grad_norm": 1.2486156225204468, "learning_rate": 1.8218964480720093e-05, "loss": 1.0987, "step": 952 }, { "epoch": 0.21820263308528906, "grad_norm": 1.1056116819381714, "learning_rate": 1.821473655928033e-05, "loss": 1.116, "step": 953 }, { "epoch": 0.21843159702346882, "grad_norm": 1.235161542892456, "learning_rate": 1.8210504117339917e-05, "loss": 1.062, "step": 954 }, { "epoch": 0.21866056096164854, "grad_norm": 1.3450078964233398, "learning_rate": 1.8206267157227918e-05, "loss": 1.0997, "step": 955 }, { "epoch": 0.21888952489982827, "grad_norm": 1.1602591276168823, "learning_rate": 1.820202568127591e-05, "loss": 1.124, "step": 956 }, { "epoch": 0.21911848883800802, "grad_norm": 1.4237440824508667, "learning_rate": 1.819777969181793e-05, "loss": 1.148, "step": 957 }, { "epoch": 0.21934745277618775, "grad_norm": 1.4019017219543457, "learning_rate": 1.819352919119052e-05, "loss": 1.1316, "step": 958 }, { "epoch": 0.21957641671436748, "grad_norm": 1.2780940532684326, "learning_rate": 1.818927418173269e-05, "loss": 1.1654, "step": 959 }, { "epoch": 0.21980538065254723, "grad_norm": 1.7911431789398193, "learning_rate": 1.8185014665785936e-05, "loss": 1.0586, "step": 960 }, { "epoch": 0.22003434459072696, "grad_norm": 2.0923030376434326, "learning_rate": 1.8180750645694236e-05, "loss": 1.146, "step": 961 }, { "epoch": 0.2202633085289067, "grad_norm": 1.1630254983901978, "learning_rate": 1.8176482123804042e-05, "loss": 1.1488, "step": 962 }, { "epoch": 0.22049227246708644, "grad_norm": 1.14417564868927, "learning_rate": 1.8172209102464288e-05, "loss": 1.089, "step": 963 }, { "epoch": 0.22072123640526617, "grad_norm": 1.4543980360031128, "learning_rate": 1.816793158402638e-05, "loss": 1.1678, "step": 964 }, { "epoch": 0.2209502003434459, "grad_norm": 1.025010108947754, "learning_rate": 1.8163649570844198e-05, "loss": 1.062, "step": 965 }, { "epoch": 0.22117916428162565, "grad_norm": 1.0538175106048584, "learning_rate": 1.8159363065274104e-05, "loss": 1.1371, "step": 966 }, { "epoch": 0.22140812821980538, "grad_norm": 1.9237316846847534, "learning_rate": 1.8155072069674923e-05, "loss": 1.1064, "step": 967 }, { "epoch": 0.2216370921579851, "grad_norm": 1.3368388414382935, "learning_rate": 1.8150776586407957e-05, "loss": 1.0746, "step": 968 }, { "epoch": 0.22186605609616486, "grad_norm": 1.297314167022705, "learning_rate": 1.814647661783697e-05, "loss": 1.1678, "step": 969 }, { "epoch": 0.2220950200343446, "grad_norm": 1.2015776634216309, "learning_rate": 1.8142172166328198e-05, "loss": 1.1321, "step": 970 }, { "epoch": 0.22232398397252431, "grad_norm": 1.1826279163360596, "learning_rate": 1.8137863234250346e-05, "loss": 1.0897, "step": 971 }, { "epoch": 0.22255294791070407, "grad_norm": 1.9954915046691895, "learning_rate": 1.813354982397459e-05, "loss": 1.0966, "step": 972 }, { "epoch": 0.2227819118488838, "grad_norm": 1.4948241710662842, "learning_rate": 1.8129231937874555e-05, "loss": 1.0648, "step": 973 }, { "epoch": 0.22301087578706355, "grad_norm": 1.5026756525039673, "learning_rate": 1.812490957832634e-05, "loss": 1.1085, "step": 974 }, { "epoch": 0.22323983972524328, "grad_norm": 1.6371681690216064, "learning_rate": 1.8120582747708503e-05, "loss": 1.085, "step": 975 }, { "epoch": 0.223468803663423, "grad_norm": 1.236262559890747, "learning_rate": 1.8116251448402062e-05, "loss": 1.1093, "step": 976 }, { "epoch": 0.22369776760160276, "grad_norm": 1.208084225654602, "learning_rate": 1.8111915682790494e-05, "loss": 1.192, "step": 977 }, { "epoch": 0.2239267315397825, "grad_norm": 1.4316471815109253, "learning_rate": 1.810757545325974e-05, "loss": 1.113, "step": 978 }, { "epoch": 0.2241556954779622, "grad_norm": 1.1135729551315308, "learning_rate": 1.8103230762198185e-05, "loss": 1.1644, "step": 979 }, { "epoch": 0.22438465941614197, "grad_norm": 1.1902830600738525, "learning_rate": 1.809888161199668e-05, "loss": 1.1535, "step": 980 }, { "epoch": 0.2246136233543217, "grad_norm": 1.5543137788772583, "learning_rate": 1.8094528005048527e-05, "loss": 1.077, "step": 981 }, { "epoch": 0.22484258729250142, "grad_norm": 1.174572229385376, "learning_rate": 1.8090169943749477e-05, "loss": 1.0657, "step": 982 }, { "epoch": 0.22507155123068118, "grad_norm": 1.3976424932479858, "learning_rate": 1.8085807430497734e-05, "loss": 1.1355, "step": 983 }, { "epoch": 0.2253005151688609, "grad_norm": 1.2085835933685303, "learning_rate": 1.808144046769395e-05, "loss": 1.2107, "step": 984 }, { "epoch": 0.22552947910704063, "grad_norm": 1.3082224130630493, "learning_rate": 1.8077069057741235e-05, "loss": 1.2256, "step": 985 }, { "epoch": 0.22575844304522039, "grad_norm": 1.0391132831573486, "learning_rate": 1.807269320304514e-05, "loss": 1.1506, "step": 986 }, { "epoch": 0.2259874069834001, "grad_norm": 1.0561646223068237, "learning_rate": 1.806831290601365e-05, "loss": 1.1065, "step": 987 }, { "epoch": 0.22621637092157984, "grad_norm": 1.2133493423461914, "learning_rate": 1.8063928169057214e-05, "loss": 1.1006, "step": 988 }, { "epoch": 0.2264453348597596, "grad_norm": 1.4187254905700684, "learning_rate": 1.8059538994588715e-05, "loss": 1.0748, "step": 989 }, { "epoch": 0.22667429879793932, "grad_norm": 1.3195863962173462, "learning_rate": 1.8055145385023477e-05, "loss": 1.1143, "step": 990 }, { "epoch": 0.22690326273611905, "grad_norm": 1.304660677909851, "learning_rate": 1.8050747342779274e-05, "loss": 1.2163, "step": 991 }, { "epoch": 0.2271322266742988, "grad_norm": 1.0397827625274658, "learning_rate": 1.80463448702763e-05, "loss": 1.0848, "step": 992 }, { "epoch": 0.22736119061247853, "grad_norm": 1.2411673069000244, "learning_rate": 1.8041937969937206e-05, "loss": 1.0982, "step": 993 }, { "epoch": 0.22759015455065826, "grad_norm": 1.261560320854187, "learning_rate": 1.803752664418707e-05, "loss": 1.096, "step": 994 }, { "epoch": 0.227819118488838, "grad_norm": 1.3280490636825562, "learning_rate": 1.803311089545341e-05, "loss": 1.1291, "step": 995 }, { "epoch": 0.22804808242701774, "grad_norm": 1.3528070449829102, "learning_rate": 1.8028690726166172e-05, "loss": 1.1006, "step": 996 }, { "epoch": 0.2282770463651975, "grad_norm": 1.1263134479522705, "learning_rate": 1.8024266138757746e-05, "loss": 1.1311, "step": 997 }, { "epoch": 0.22850601030337722, "grad_norm": 1.3571945428848267, "learning_rate": 1.8019837135662932e-05, "loss": 1.1336, "step": 998 }, { "epoch": 0.22873497424155695, "grad_norm": 1.3748575448989868, "learning_rate": 1.801540371931898e-05, "loss": 1.1514, "step": 999 }, { "epoch": 0.2289639381797367, "grad_norm": 1.1356844902038574, "learning_rate": 1.8010965892165568e-05, "loss": 1.1703, "step": 1000 }, { "epoch": 0.22919290211791643, "grad_norm": 1.219716191291809, "learning_rate": 1.8006523656644787e-05, "loss": 1.1123, "step": 1001 }, { "epoch": 0.22942186605609616, "grad_norm": 1.0204182863235474, "learning_rate": 1.8002077015201164e-05, "loss": 1.1016, "step": 1002 }, { "epoch": 0.2296508299942759, "grad_norm": 1.1001499891281128, "learning_rate": 1.7997625970281652e-05, "loss": 1.1063, "step": 1003 }, { "epoch": 0.22987979393245564, "grad_norm": 1.3529033660888672, "learning_rate": 1.7993170524335614e-05, "loss": 1.1498, "step": 1004 }, { "epoch": 0.23010875787063537, "grad_norm": 1.0929923057556152, "learning_rate": 1.7988710679814857e-05, "loss": 1.1105, "step": 1005 }, { "epoch": 0.23033772180881512, "grad_norm": 1.1180248260498047, "learning_rate": 1.798424643917359e-05, "loss": 1.0907, "step": 1006 }, { "epoch": 0.23056668574699485, "grad_norm": 1.199679970741272, "learning_rate": 1.7979777804868447e-05, "loss": 1.184, "step": 1007 }, { "epoch": 0.23079564968517458, "grad_norm": 1.0665282011032104, "learning_rate": 1.797530477935848e-05, "loss": 1.1398, "step": 1008 }, { "epoch": 0.23102461362335433, "grad_norm": 1.1431396007537842, "learning_rate": 1.7970827365105157e-05, "loss": 1.1085, "step": 1009 }, { "epoch": 0.23125357756153406, "grad_norm": 1.1866995096206665, "learning_rate": 1.796634556457236e-05, "loss": 1.1075, "step": 1010 }, { "epoch": 0.23148254149971378, "grad_norm": 1.0268393754959106, "learning_rate": 1.7961859380226395e-05, "loss": 1.1128, "step": 1011 }, { "epoch": 0.23171150543789354, "grad_norm": 1.1771196126937866, "learning_rate": 1.7957368814535963e-05, "loss": 1.0616, "step": 1012 }, { "epoch": 0.23194046937607327, "grad_norm": 1.1519027948379517, "learning_rate": 1.7952873869972183e-05, "loss": 1.1305, "step": 1013 }, { "epoch": 0.232169433314253, "grad_norm": 1.0065557956695557, "learning_rate": 1.7948374549008594e-05, "loss": 1.088, "step": 1014 }, { "epoch": 0.23239839725243275, "grad_norm": 1.4946367740631104, "learning_rate": 1.7943870854121126e-05, "loss": 1.1938, "step": 1015 }, { "epoch": 0.23262736119061247, "grad_norm": 1.5711524486541748, "learning_rate": 1.793936278778813e-05, "loss": 1.0989, "step": 1016 }, { "epoch": 0.2328563251287922, "grad_norm": 1.0987305641174316, "learning_rate": 1.793485035249036e-05, "loss": 1.1015, "step": 1017 }, { "epoch": 0.23308528906697196, "grad_norm": 1.1849825382232666, "learning_rate": 1.793033355071096e-05, "loss": 1.1036, "step": 1018 }, { "epoch": 0.23331425300515168, "grad_norm": 1.1868687868118286, "learning_rate": 1.79258123849355e-05, "loss": 1.0985, "step": 1019 }, { "epoch": 0.23354321694333144, "grad_norm": 1.2682559490203857, "learning_rate": 1.792128685765193e-05, "loss": 1.1574, "step": 1020 }, { "epoch": 0.23377218088151117, "grad_norm": 1.544455647468567, "learning_rate": 1.7916756971350618e-05, "loss": 1.1294, "step": 1021 }, { "epoch": 0.2340011448196909, "grad_norm": 1.103712558746338, "learning_rate": 1.7912222728524318e-05, "loss": 1.0987, "step": 1022 }, { "epoch": 0.23423010875787065, "grad_norm": 1.2829042673110962, "learning_rate": 1.7907684131668186e-05, "loss": 1.0943, "step": 1023 }, { "epoch": 0.23445907269605037, "grad_norm": 1.112669587135315, "learning_rate": 1.7903141183279776e-05, "loss": 1.1518, "step": 1024 }, { "epoch": 0.2346880366342301, "grad_norm": 1.0655471086502075, "learning_rate": 1.789859388585903e-05, "loss": 1.1347, "step": 1025 }, { "epoch": 0.23491700057240986, "grad_norm": 1.2130173444747925, "learning_rate": 1.7894042241908293e-05, "loss": 1.1555, "step": 1026 }, { "epoch": 0.23514596451058958, "grad_norm": 1.1426712274551392, "learning_rate": 1.7889486253932292e-05, "loss": 1.1248, "step": 1027 }, { "epoch": 0.2353749284487693, "grad_norm": 1.0837373733520508, "learning_rate": 1.7884925924438152e-05, "loss": 1.1218, "step": 1028 }, { "epoch": 0.23560389238694907, "grad_norm": 1.2106484174728394, "learning_rate": 1.7880361255935385e-05, "loss": 1.1164, "step": 1029 }, { "epoch": 0.2358328563251288, "grad_norm": 1.155698537826538, "learning_rate": 1.7875792250935883e-05, "loss": 1.1404, "step": 1030 }, { "epoch": 0.23606182026330852, "grad_norm": 1.0495966672897339, "learning_rate": 1.7871218911953942e-05, "loss": 1.1275, "step": 1031 }, { "epoch": 0.23629078420148827, "grad_norm": 1.108055591583252, "learning_rate": 1.7866641241506222e-05, "loss": 1.0906, "step": 1032 }, { "epoch": 0.236519748139668, "grad_norm": 1.1889424324035645, "learning_rate": 1.7862059242111782e-05, "loss": 1.1072, "step": 1033 }, { "epoch": 0.23674871207784773, "grad_norm": 2.039444923400879, "learning_rate": 1.7857472916292056e-05, "loss": 1.1747, "step": 1034 }, { "epoch": 0.23697767601602748, "grad_norm": 1.0959917306900024, "learning_rate": 1.785288226657086e-05, "loss": 1.0886, "step": 1035 }, { "epoch": 0.2372066399542072, "grad_norm": 1.1916193962097168, "learning_rate": 1.7848287295474397e-05, "loss": 1.1372, "step": 1036 }, { "epoch": 0.23743560389238694, "grad_norm": 1.3053346872329712, "learning_rate": 1.7843688005531227e-05, "loss": 1.1423, "step": 1037 }, { "epoch": 0.2376645678305667, "grad_norm": 1.1222914457321167, "learning_rate": 1.7839084399272317e-05, "loss": 1.079, "step": 1038 }, { "epoch": 0.23789353176874642, "grad_norm": 1.1740766763687134, "learning_rate": 1.7834476479230978e-05, "loss": 1.163, "step": 1039 }, { "epoch": 0.23812249570692615, "grad_norm": 2.025049924850464, "learning_rate": 1.782986424794292e-05, "loss": 1.1855, "step": 1040 }, { "epoch": 0.2383514596451059, "grad_norm": 1.2098287343978882, "learning_rate": 1.7825247707946212e-05, "loss": 1.1989, "step": 1041 }, { "epoch": 0.23858042358328563, "grad_norm": 1.0993343591690063, "learning_rate": 1.78206268617813e-05, "loss": 1.1303, "step": 1042 }, { "epoch": 0.23880938752146538, "grad_norm": 1.1957361698150635, "learning_rate": 1.781600171199099e-05, "loss": 1.1391, "step": 1043 }, { "epoch": 0.2390383514596451, "grad_norm": 1.0565906763076782, "learning_rate": 1.7811372261120468e-05, "loss": 1.0929, "step": 1044 }, { "epoch": 0.23926731539782484, "grad_norm": 1.4460198879241943, "learning_rate": 1.780673851171728e-05, "loss": 1.1438, "step": 1045 }, { "epoch": 0.2394962793360046, "grad_norm": 1.2659794092178345, "learning_rate": 1.7802100466331343e-05, "loss": 1.1188, "step": 1046 }, { "epoch": 0.23972524327418432, "grad_norm": 1.1114025115966797, "learning_rate": 1.7797458127514934e-05, "loss": 1.1157, "step": 1047 }, { "epoch": 0.23995420721236405, "grad_norm": 2.1384947299957275, "learning_rate": 1.779281149782269e-05, "loss": 1.0959, "step": 1048 }, { "epoch": 0.2401831711505438, "grad_norm": 1.1005151271820068, "learning_rate": 1.7788160579811614e-05, "loss": 1.0842, "step": 1049 }, { "epoch": 0.24041213508872353, "grad_norm": 1.1792573928833008, "learning_rate": 1.7783505376041063e-05, "loss": 1.0822, "step": 1050 }, { "epoch": 0.24064109902690325, "grad_norm": 1.1286247968673706, "learning_rate": 1.7778845889072764e-05, "loss": 1.1103, "step": 1051 }, { "epoch": 0.240870062965083, "grad_norm": 1.0358811616897583, "learning_rate": 1.777418212147079e-05, "loss": 1.0796, "step": 1052 }, { "epoch": 0.24109902690326274, "grad_norm": 1.1496690511703491, "learning_rate": 1.7769514075801573e-05, "loss": 1.1426, "step": 1053 }, { "epoch": 0.24132799084144246, "grad_norm": 0.995430588722229, "learning_rate": 1.77648417546339e-05, "loss": 1.0874, "step": 1054 }, { "epoch": 0.24155695477962222, "grad_norm": 1.176997423171997, "learning_rate": 1.7760165160538907e-05, "loss": 1.152, "step": 1055 }, { "epoch": 0.24178591871780195, "grad_norm": 1.2487221956253052, "learning_rate": 1.775548429609009e-05, "loss": 1.1166, "step": 1056 }, { "epoch": 0.24201488265598167, "grad_norm": 1.1057595014572144, "learning_rate": 1.7750799163863287e-05, "loss": 1.1729, "step": 1057 }, { "epoch": 0.24224384659416143, "grad_norm": 1.2305538654327393, "learning_rate": 1.774610976643669e-05, "loss": 1.1503, "step": 1058 }, { "epoch": 0.24247281053234115, "grad_norm": 1.1498816013336182, "learning_rate": 1.7741416106390828e-05, "loss": 1.0871, "step": 1059 }, { "epoch": 0.24270177447052088, "grad_norm": 1.7206319570541382, "learning_rate": 1.7736718186308585e-05, "loss": 1.1571, "step": 1060 }, { "epoch": 0.24293073840870064, "grad_norm": 1.1156868934631348, "learning_rate": 1.7732016008775193e-05, "loss": 1.087, "step": 1061 }, { "epoch": 0.24315970234688036, "grad_norm": 1.4100158214569092, "learning_rate": 1.7727309576378213e-05, "loss": 1.1383, "step": 1062 }, { "epoch": 0.2433886662850601, "grad_norm": 1.3357410430908203, "learning_rate": 1.7722598891707562e-05, "loss": 1.1269, "step": 1063 }, { "epoch": 0.24361763022323984, "grad_norm": 2.102534294128418, "learning_rate": 1.7717883957355484e-05, "loss": 1.0601, "step": 1064 }, { "epoch": 0.24384659416141957, "grad_norm": 1.1543803215026855, "learning_rate": 1.7713164775916573e-05, "loss": 1.1185, "step": 1065 }, { "epoch": 0.24407555809959933, "grad_norm": 1.2533857822418213, "learning_rate": 1.7708441349987753e-05, "loss": 1.0979, "step": 1066 }, { "epoch": 0.24430452203777905, "grad_norm": 1.1019551753997803, "learning_rate": 1.7703713682168288e-05, "loss": 1.0778, "step": 1067 }, { "epoch": 0.24453348597595878, "grad_norm": 1.085174798965454, "learning_rate": 1.7698981775059767e-05, "loss": 1.1028, "step": 1068 }, { "epoch": 0.24476244991413854, "grad_norm": 1.204268217086792, "learning_rate": 1.7694245631266124e-05, "loss": 1.0967, "step": 1069 }, { "epoch": 0.24499141385231826, "grad_norm": 1.129050850868225, "learning_rate": 1.768950525339362e-05, "loss": 1.0997, "step": 1070 }, { "epoch": 0.245220377790498, "grad_norm": 1.1998200416564941, "learning_rate": 1.7684760644050847e-05, "loss": 1.0759, "step": 1071 }, { "epoch": 0.24544934172867774, "grad_norm": 2.0567846298217773, "learning_rate": 1.7680011805848715e-05, "loss": 1.0642, "step": 1072 }, { "epoch": 0.24567830566685747, "grad_norm": 1.1932883262634277, "learning_rate": 1.767525874140048e-05, "loss": 1.0918, "step": 1073 }, { "epoch": 0.2459072696050372, "grad_norm": 1.18972909450531, "learning_rate": 1.7670501453321705e-05, "loss": 1.0716, "step": 1074 }, { "epoch": 0.24613623354321695, "grad_norm": 4.0288825035095215, "learning_rate": 1.7665739944230296e-05, "loss": 1.1541, "step": 1075 }, { "epoch": 0.24636519748139668, "grad_norm": 1.1446605920791626, "learning_rate": 1.766097421674646e-05, "loss": 1.0506, "step": 1076 }, { "epoch": 0.2465941614195764, "grad_norm": 0.995333731174469, "learning_rate": 1.7656204273492746e-05, "loss": 1.115, "step": 1077 }, { "epoch": 0.24682312535775616, "grad_norm": 1.2963595390319824, "learning_rate": 1.7651430117094005e-05, "loss": 1.1066, "step": 1078 }, { "epoch": 0.2470520892959359, "grad_norm": 1.3206766843795776, "learning_rate": 1.7646651750177424e-05, "loss": 1.1593, "step": 1079 }, { "epoch": 0.24728105323411562, "grad_norm": 1.1858936548233032, "learning_rate": 1.7641869175372493e-05, "loss": 1.1485, "step": 1080 }, { "epoch": 0.24751001717229537, "grad_norm": 1.3568990230560303, "learning_rate": 1.7637082395311024e-05, "loss": 1.1467, "step": 1081 }, { "epoch": 0.2477389811104751, "grad_norm": 1.4126697778701782, "learning_rate": 1.7632291412627146e-05, "loss": 1.1229, "step": 1082 }, { "epoch": 0.24796794504865483, "grad_norm": 1.0935779809951782, "learning_rate": 1.7627496229957288e-05, "loss": 1.1371, "step": 1083 }, { "epoch": 0.24819690898683458, "grad_norm": 1.2460694313049316, "learning_rate": 1.7622696849940204e-05, "loss": 1.0996, "step": 1084 }, { "epoch": 0.2484258729250143, "grad_norm": 1.2605793476104736, "learning_rate": 1.7617893275216953e-05, "loss": 1.0923, "step": 1085 }, { "epoch": 0.24865483686319403, "grad_norm": 1.437913417816162, "learning_rate": 1.76130855084309e-05, "loss": 1.1114, "step": 1086 }, { "epoch": 0.2488838008013738, "grad_norm": 1.0823016166687012, "learning_rate": 1.7608273552227723e-05, "loss": 1.1255, "step": 1087 }, { "epoch": 0.24911276473955352, "grad_norm": 1.158138632774353, "learning_rate": 1.7603457409255397e-05, "loss": 1.12, "step": 1088 }, { "epoch": 0.24934172867773327, "grad_norm": 1.2879137992858887, "learning_rate": 1.7598637082164204e-05, "loss": 1.0848, "step": 1089 }, { "epoch": 0.249570692615913, "grad_norm": 1.2041521072387695, "learning_rate": 1.759381257360673e-05, "loss": 1.1246, "step": 1090 }, { "epoch": 0.24979965655409272, "grad_norm": 1.3644989728927612, "learning_rate": 1.7588983886237868e-05, "loss": 1.0443, "step": 1091 }, { "epoch": 0.2500286204922725, "grad_norm": 1.232747197151184, "learning_rate": 1.75841510227148e-05, "loss": 1.131, "step": 1092 }, { "epoch": 0.2502575844304522, "grad_norm": 1.1022595167160034, "learning_rate": 1.7579313985697006e-05, "loss": 1.0583, "step": 1093 }, { "epoch": 0.25048654836863193, "grad_norm": 1.0636167526245117, "learning_rate": 1.7574472777846276e-05, "loss": 1.0952, "step": 1094 }, { "epoch": 0.25071551230681166, "grad_norm": 1.6139863729476929, "learning_rate": 1.7569627401826673e-05, "loss": 1.073, "step": 1095 }, { "epoch": 0.2509444762449914, "grad_norm": 1.2776731252670288, "learning_rate": 1.756477786030458e-05, "loss": 1.0649, "step": 1096 }, { "epoch": 0.25117344018317117, "grad_norm": 1.2996584177017212, "learning_rate": 1.7559924155948652e-05, "loss": 1.0911, "step": 1097 }, { "epoch": 0.2514024041213509, "grad_norm": 1.0315338373184204, "learning_rate": 1.7555066291429845e-05, "loss": 1.0903, "step": 1098 }, { "epoch": 0.2516313680595306, "grad_norm": 1.1848983764648438, "learning_rate": 1.7550204269421395e-05, "loss": 1.1727, "step": 1099 }, { "epoch": 0.25186033199771035, "grad_norm": 1.332460880279541, "learning_rate": 1.754533809259884e-05, "loss": 1.1459, "step": 1100 }, { "epoch": 0.2520892959358901, "grad_norm": 1.4366538524627686, "learning_rate": 1.7540467763639994e-05, "loss": 1.163, "step": 1101 }, { "epoch": 0.25231825987406986, "grad_norm": 1.1349256038665771, "learning_rate": 1.7535593285224958e-05, "loss": 1.1246, "step": 1102 }, { "epoch": 0.2525472238122496, "grad_norm": 1.2521072626113892, "learning_rate": 1.7530714660036112e-05, "loss": 1.147, "step": 1103 }, { "epoch": 0.2527761877504293, "grad_norm": 1.1671040058135986, "learning_rate": 1.7525831890758125e-05, "loss": 1.1429, "step": 1104 }, { "epoch": 0.25300515168860904, "grad_norm": 1.0925365686416626, "learning_rate": 1.7520944980077948e-05, "loss": 1.1264, "step": 1105 }, { "epoch": 0.25323411562678877, "grad_norm": 1.1112512350082397, "learning_rate": 1.7516053930684804e-05, "loss": 1.1713, "step": 1106 }, { "epoch": 0.2534630795649685, "grad_norm": 1.217628002166748, "learning_rate": 1.7511158745270197e-05, "loss": 1.0978, "step": 1107 }, { "epoch": 0.2536920435031483, "grad_norm": 1.3462601900100708, "learning_rate": 1.7506259426527903e-05, "loss": 1.0702, "step": 1108 }, { "epoch": 0.253921007441328, "grad_norm": 1.3914707899093628, "learning_rate": 1.750135597715398e-05, "loss": 1.1011, "step": 1109 }, { "epoch": 0.25414997137950773, "grad_norm": 1.2635843753814697, "learning_rate": 1.7496448399846757e-05, "loss": 1.047, "step": 1110 }, { "epoch": 0.25437893531768746, "grad_norm": 1.217361330986023, "learning_rate": 1.7491536697306828e-05, "loss": 1.1323, "step": 1111 }, { "epoch": 0.2546078992558672, "grad_norm": 1.3858387470245361, "learning_rate": 1.7486620872237064e-05, "loss": 1.1033, "step": 1112 }, { "epoch": 0.2548368631940469, "grad_norm": 1.247519612312317, "learning_rate": 1.7481700927342603e-05, "loss": 1.1039, "step": 1113 }, { "epoch": 0.2550658271322267, "grad_norm": 1.3646209239959717, "learning_rate": 1.7476776865330847e-05, "loss": 1.12, "step": 1114 }, { "epoch": 0.2552947910704064, "grad_norm": 1.1417921781539917, "learning_rate": 1.7471848688911465e-05, "loss": 1.1152, "step": 1115 }, { "epoch": 0.25552375500858615, "grad_norm": 2.9017693996429443, "learning_rate": 1.7466916400796398e-05, "loss": 1.117, "step": 1116 }, { "epoch": 0.2557527189467659, "grad_norm": 1.4879010915756226, "learning_rate": 1.7461980003699835e-05, "loss": 1.0987, "step": 1117 }, { "epoch": 0.2559816828849456, "grad_norm": 1.6178057193756104, "learning_rate": 1.7457039500338238e-05, "loss": 1.1345, "step": 1118 }, { "epoch": 0.25621064682312533, "grad_norm": 1.2162644863128662, "learning_rate": 1.7452094893430324e-05, "loss": 1.1063, "step": 1119 }, { "epoch": 0.2564396107613051, "grad_norm": 1.2172150611877441, "learning_rate": 1.744714618569707e-05, "loss": 1.1393, "step": 1120 }, { "epoch": 0.25666857469948484, "grad_norm": 1.195317029953003, "learning_rate": 1.74421933798617e-05, "loss": 1.1355, "step": 1121 }, { "epoch": 0.25689753863766457, "grad_norm": 1.0217382907867432, "learning_rate": 1.7437236478649718e-05, "loss": 1.1258, "step": 1122 }, { "epoch": 0.2571265025758443, "grad_norm": 1.2835495471954346, "learning_rate": 1.7432275484788852e-05, "loss": 1.1104, "step": 1123 }, { "epoch": 0.257355466514024, "grad_norm": 1.0786269903182983, "learning_rate": 1.74273104010091e-05, "loss": 1.066, "step": 1124 }, { "epoch": 0.2575844304522038, "grad_norm": 1.1824384927749634, "learning_rate": 1.74223412300427e-05, "loss": 1.0641, "step": 1125 }, { "epoch": 0.25781339439038353, "grad_norm": 1.0312379598617554, "learning_rate": 1.7417367974624153e-05, "loss": 1.0843, "step": 1126 }, { "epoch": 0.25804235832856326, "grad_norm": 1.2044585943222046, "learning_rate": 1.74123906374902e-05, "loss": 1.0899, "step": 1127 }, { "epoch": 0.258271322266743, "grad_norm": 1.1857991218566895, "learning_rate": 1.740740922137982e-05, "loss": 1.1682, "step": 1128 }, { "epoch": 0.2585002862049227, "grad_norm": 1.4564883708953857, "learning_rate": 1.7402423729034252e-05, "loss": 1.1455, "step": 1129 }, { "epoch": 0.25872925014310244, "grad_norm": 1.2754067182540894, "learning_rate": 1.7397434163196967e-05, "loss": 1.1081, "step": 1130 }, { "epoch": 0.2589582140812822, "grad_norm": 1.149965763092041, "learning_rate": 1.7392440526613684e-05, "loss": 1.1089, "step": 1131 }, { "epoch": 0.25918717801946195, "grad_norm": 1.430191993713379, "learning_rate": 1.7387442822032354e-05, "loss": 1.1204, "step": 1132 }, { "epoch": 0.2594161419576417, "grad_norm": 1.3282990455627441, "learning_rate": 1.738244105220318e-05, "loss": 1.1386, "step": 1133 }, { "epoch": 0.2596451058958214, "grad_norm": 1.1032613515853882, "learning_rate": 1.7377435219878586e-05, "loss": 1.0563, "step": 1134 }, { "epoch": 0.25987406983400113, "grad_norm": 1.0498651266098022, "learning_rate": 1.737242532781324e-05, "loss": 1.104, "step": 1135 }, { "epoch": 0.26010303377218086, "grad_norm": 1.2936218976974487, "learning_rate": 1.736741137876405e-05, "loss": 1.043, "step": 1136 }, { "epoch": 0.26033199771036064, "grad_norm": 1.025429129600525, "learning_rate": 1.736239337549015e-05, "loss": 1.1479, "step": 1137 }, { "epoch": 0.26056096164854037, "grad_norm": 1.1074275970458984, "learning_rate": 1.7357371320752896e-05, "loss": 1.0875, "step": 1138 }, { "epoch": 0.2607899255867201, "grad_norm": 1.2737194299697876, "learning_rate": 1.735234521731589e-05, "loss": 1.1193, "step": 1139 }, { "epoch": 0.2610188895248998, "grad_norm": 2.4709835052490234, "learning_rate": 1.7347315067944955e-05, "loss": 1.1418, "step": 1140 }, { "epoch": 0.26124785346307955, "grad_norm": 1.4513455629348755, "learning_rate": 1.7342280875408138e-05, "loss": 1.1594, "step": 1141 }, { "epoch": 0.2614768174012593, "grad_norm": 1.1992151737213135, "learning_rate": 1.7337242642475712e-05, "loss": 1.0609, "step": 1142 }, { "epoch": 0.26170578133943906, "grad_norm": 1.7710577249526978, "learning_rate": 1.7332200371920173e-05, "loss": 1.1702, "step": 1143 }, { "epoch": 0.2619347452776188, "grad_norm": 1.3110970258712769, "learning_rate": 1.7327154066516244e-05, "loss": 1.0921, "step": 1144 }, { "epoch": 0.2621637092157985, "grad_norm": 1.0681921243667603, "learning_rate": 1.7322103729040868e-05, "loss": 1.038, "step": 1145 }, { "epoch": 0.26239267315397824, "grad_norm": 1.1642048358917236, "learning_rate": 1.731704936227319e-05, "loss": 1.071, "step": 1146 }, { "epoch": 0.26262163709215797, "grad_norm": 1.287327766418457, "learning_rate": 1.7311990968994598e-05, "loss": 1.102, "step": 1147 }, { "epoch": 0.26285060103033775, "grad_norm": 1.2213733196258545, "learning_rate": 1.7306928551988683e-05, "loss": 1.1354, "step": 1148 }, { "epoch": 0.2630795649685175, "grad_norm": 1.3054442405700684, "learning_rate": 1.7301862114041244e-05, "loss": 1.1189, "step": 1149 }, { "epoch": 0.2633085289066972, "grad_norm": 1.8000365495681763, "learning_rate": 1.72967916579403e-05, "loss": 1.105, "step": 1150 }, { "epoch": 0.26353749284487693, "grad_norm": 1.155625343322754, "learning_rate": 1.7291717186476088e-05, "loss": 1.1213, "step": 1151 }, { "epoch": 0.26376645678305666, "grad_norm": 1.052958607673645, "learning_rate": 1.7286638702441037e-05, "loss": 1.1383, "step": 1152 }, { "epoch": 0.2639954207212364, "grad_norm": 1.0741239786148071, "learning_rate": 1.7281556208629802e-05, "loss": 1.1085, "step": 1153 }, { "epoch": 0.26422438465941617, "grad_norm": 1.1770488023757935, "learning_rate": 1.7276469707839235e-05, "loss": 1.0881, "step": 1154 }, { "epoch": 0.2644533485975959, "grad_norm": 1.1173039674758911, "learning_rate": 1.7271379202868394e-05, "loss": 1.162, "step": 1155 }, { "epoch": 0.2646823125357756, "grad_norm": 1.1196292638778687, "learning_rate": 1.7266284696518537e-05, "loss": 1.1199, "step": 1156 }, { "epoch": 0.26491127647395535, "grad_norm": 1.097383975982666, "learning_rate": 1.7261186191593135e-05, "loss": 1.1347, "step": 1157 }, { "epoch": 0.2651402404121351, "grad_norm": 2.0568525791168213, "learning_rate": 1.725608369089785e-05, "loss": 1.0951, "step": 1158 }, { "epoch": 0.2653692043503148, "grad_norm": 1.1759741306304932, "learning_rate": 1.7250977197240545e-05, "loss": 1.1429, "step": 1159 }, { "epoch": 0.2655981682884946, "grad_norm": 1.1705988645553589, "learning_rate": 1.7245866713431278e-05, "loss": 1.0861, "step": 1160 }, { "epoch": 0.2658271322266743, "grad_norm": 1.197718620300293, "learning_rate": 1.724075224228231e-05, "loss": 1.1353, "step": 1161 }, { "epoch": 0.26605609616485404, "grad_norm": 1.0807304382324219, "learning_rate": 1.7235633786608092e-05, "loss": 1.1407, "step": 1162 }, { "epoch": 0.26628506010303377, "grad_norm": 1.2779772281646729, "learning_rate": 1.723051134922526e-05, "loss": 1.1031, "step": 1163 }, { "epoch": 0.2665140240412135, "grad_norm": 1.2724177837371826, "learning_rate": 1.7225384932952655e-05, "loss": 1.0757, "step": 1164 }, { "epoch": 0.2667429879793932, "grad_norm": 1.0801130533218384, "learning_rate": 1.72202545406113e-05, "loss": 1.1328, "step": 1165 }, { "epoch": 0.266971951917573, "grad_norm": 1.0200031995773315, "learning_rate": 1.7215120175024405e-05, "loss": 1.0876, "step": 1166 }, { "epoch": 0.26720091585575273, "grad_norm": 1.1842114925384521, "learning_rate": 1.720998183901737e-05, "loss": 1.1401, "step": 1167 }, { "epoch": 0.26742987979393246, "grad_norm": 1.0764400959014893, "learning_rate": 1.7204839535417778e-05, "loss": 1.046, "step": 1168 }, { "epoch": 0.2676588437321122, "grad_norm": 1.2766287326812744, "learning_rate": 1.7199693267055392e-05, "loss": 1.0931, "step": 1169 }, { "epoch": 0.2678878076702919, "grad_norm": 1.282101035118103, "learning_rate": 1.7194543036762165e-05, "loss": 1.0584, "step": 1170 }, { "epoch": 0.2681167716084717, "grad_norm": 1.2437055110931396, "learning_rate": 1.7189388847372227e-05, "loss": 1.1001, "step": 1171 }, { "epoch": 0.2683457355466514, "grad_norm": 1.0838711261749268, "learning_rate": 1.7184230701721876e-05, "loss": 1.0871, "step": 1172 }, { "epoch": 0.26857469948483115, "grad_norm": 1.4530155658721924, "learning_rate": 1.717906860264961e-05, "loss": 1.1225, "step": 1173 }, { "epoch": 0.2688036634230109, "grad_norm": 1.1978572607040405, "learning_rate": 1.7173902552996075e-05, "loss": 1.1464, "step": 1174 }, { "epoch": 0.2690326273611906, "grad_norm": 1.1811645030975342, "learning_rate": 1.7168732555604114e-05, "loss": 1.1627, "step": 1175 }, { "epoch": 0.26926159129937033, "grad_norm": 1.530895709991455, "learning_rate": 1.716355861331873e-05, "loss": 1.0715, "step": 1176 }, { "epoch": 0.2694905552375501, "grad_norm": 1.2078356742858887, "learning_rate": 1.7158380728987102e-05, "loss": 1.1287, "step": 1177 }, { "epoch": 0.26971951917572984, "grad_norm": 1.318663477897644, "learning_rate": 1.715319890545857e-05, "loss": 1.1881, "step": 1178 }, { "epoch": 0.26994848311390957, "grad_norm": 1.174576759338379, "learning_rate": 1.7148013145584657e-05, "loss": 1.1113, "step": 1179 }, { "epoch": 0.2701774470520893, "grad_norm": 1.2301723957061768, "learning_rate": 1.7142823452219036e-05, "loss": 1.1051, "step": 1180 }, { "epoch": 0.270406410990269, "grad_norm": 1.097507119178772, "learning_rate": 1.7137629828217556e-05, "loss": 1.0563, "step": 1181 }, { "epoch": 0.27063537492844875, "grad_norm": 1.100091814994812, "learning_rate": 1.7132432276438228e-05, "loss": 1.0676, "step": 1182 }, { "epoch": 0.27086433886662853, "grad_norm": 1.1232244968414307, "learning_rate": 1.712723079974121e-05, "loss": 1.1441, "step": 1183 }, { "epoch": 0.27109330280480826, "grad_norm": 1.3553615808486938, "learning_rate": 1.712202540098884e-05, "loss": 1.1112, "step": 1184 }, { "epoch": 0.271322266742988, "grad_norm": 1.2029356956481934, "learning_rate": 1.7116816083045603e-05, "loss": 1.1263, "step": 1185 }, { "epoch": 0.2715512306811677, "grad_norm": 1.465103268623352, "learning_rate": 1.7111602848778143e-05, "loss": 1.1094, "step": 1186 }, { "epoch": 0.27178019461934744, "grad_norm": 1.3049366474151611, "learning_rate": 1.710638570105526e-05, "loss": 1.1015, "step": 1187 }, { "epoch": 0.27200915855752716, "grad_norm": 1.2271827459335327, "learning_rate": 1.7101164642747906e-05, "loss": 1.0745, "step": 1188 }, { "epoch": 0.27223812249570695, "grad_norm": 1.0381929874420166, "learning_rate": 1.7095939676729184e-05, "loss": 1.1369, "step": 1189 }, { "epoch": 0.2724670864338867, "grad_norm": 1.3117164373397827, "learning_rate": 1.709071080587435e-05, "loss": 1.1253, "step": 1190 }, { "epoch": 0.2726960503720664, "grad_norm": 1.409812092781067, "learning_rate": 1.7085478033060808e-05, "loss": 1.1859, "step": 1191 }, { "epoch": 0.27292501431024613, "grad_norm": 1.3818203210830688, "learning_rate": 1.7080241361168108e-05, "loss": 1.0365, "step": 1192 }, { "epoch": 0.27315397824842585, "grad_norm": 1.1283059120178223, "learning_rate": 1.707500079307795e-05, "loss": 1.0971, "step": 1193 }, { "epoch": 0.2733829421866056, "grad_norm": 1.1828192472457886, "learning_rate": 1.7069756331674172e-05, "loss": 1.1046, "step": 1194 }, { "epoch": 0.27361190612478536, "grad_norm": 1.287385106086731, "learning_rate": 1.7064507979842755e-05, "loss": 1.0766, "step": 1195 }, { "epoch": 0.2738408700629651, "grad_norm": 1.1248568296432495, "learning_rate": 1.705925574047183e-05, "loss": 1.068, "step": 1196 }, { "epoch": 0.2740698340011448, "grad_norm": 1.151267647743225, "learning_rate": 1.7053999616451653e-05, "loss": 1.0703, "step": 1197 }, { "epoch": 0.27429879793932455, "grad_norm": 1.1329370737075806, "learning_rate": 1.7048739610674626e-05, "loss": 1.0707, "step": 1198 }, { "epoch": 0.2745277618775043, "grad_norm": 1.0887163877487183, "learning_rate": 1.704347572603529e-05, "loss": 1.1708, "step": 1199 }, { "epoch": 0.27475672581568406, "grad_norm": 1.1625685691833496, "learning_rate": 1.7038207965430316e-05, "loss": 1.0947, "step": 1200 }, { "epoch": 0.2749856897538638, "grad_norm": 1.0462048053741455, "learning_rate": 1.7032936331758505e-05, "loss": 1.0896, "step": 1201 }, { "epoch": 0.2752146536920435, "grad_norm": 1.286412000656128, "learning_rate": 1.7027660827920798e-05, "loss": 1.0962, "step": 1202 }, { "epoch": 0.27544361763022324, "grad_norm": 1.4663245677947998, "learning_rate": 1.702238145682025e-05, "loss": 1.1375, "step": 1203 }, { "epoch": 0.27567258156840296, "grad_norm": 1.2858197689056396, "learning_rate": 1.701709822136207e-05, "loss": 1.1469, "step": 1204 }, { "epoch": 0.2759015455065827, "grad_norm": 1.4555660486221313, "learning_rate": 1.7011811124453567e-05, "loss": 1.096, "step": 1205 }, { "epoch": 0.2761305094447625, "grad_norm": 1.211330771446228, "learning_rate": 1.700652016900419e-05, "loss": 1.0777, "step": 1206 }, { "epoch": 0.2763594733829422, "grad_norm": 1.0660839080810547, "learning_rate": 1.7001225357925506e-05, "loss": 1.0831, "step": 1207 }, { "epoch": 0.2765884373211219, "grad_norm": 1.1060619354248047, "learning_rate": 1.6995926694131207e-05, "loss": 1.097, "step": 1208 }, { "epoch": 0.27681740125930165, "grad_norm": 1.301016926765442, "learning_rate": 1.6990624180537102e-05, "loss": 1.1579, "step": 1209 }, { "epoch": 0.2770463651974814, "grad_norm": 1.3062912225723267, "learning_rate": 1.6985317820061116e-05, "loss": 1.0382, "step": 1210 }, { "epoch": 0.2772753291356611, "grad_norm": 1.3300676345825195, "learning_rate": 1.69800076156233e-05, "loss": 1.2066, "step": 1211 }, { "epoch": 0.2775042930738409, "grad_norm": 1.5877501964569092, "learning_rate": 1.6974693570145818e-05, "loss": 1.0696, "step": 1212 }, { "epoch": 0.2777332570120206, "grad_norm": 1.235116720199585, "learning_rate": 1.696937568655294e-05, "loss": 1.121, "step": 1213 }, { "epoch": 0.27796222095020034, "grad_norm": 1.2792831659317017, "learning_rate": 1.6964053967771047e-05, "loss": 1.1357, "step": 1214 }, { "epoch": 0.27819118488838007, "grad_norm": 1.6012238264083862, "learning_rate": 1.6958728416728644e-05, "loss": 1.0489, "step": 1215 }, { "epoch": 0.2784201488265598, "grad_norm": 1.0891828536987305, "learning_rate": 1.6953399036356338e-05, "loss": 1.0707, "step": 1216 }, { "epoch": 0.2786491127647395, "grad_norm": 1.0039647817611694, "learning_rate": 1.6948065829586835e-05, "loss": 1.1229, "step": 1217 }, { "epoch": 0.2788780767029193, "grad_norm": 1.0819059610366821, "learning_rate": 1.6942728799354963e-05, "loss": 1.1374, "step": 1218 }, { "epoch": 0.27910704064109904, "grad_norm": 1.178362488746643, "learning_rate": 1.693738794859764e-05, "loss": 1.0953, "step": 1219 }, { "epoch": 0.27933600457927876, "grad_norm": 1.132528305053711, "learning_rate": 1.6932043280253892e-05, "loss": 1.1012, "step": 1220 }, { "epoch": 0.2795649685174585, "grad_norm": 1.2141227722167969, "learning_rate": 1.6926694797264844e-05, "loss": 1.1225, "step": 1221 }, { "epoch": 0.2797939324556382, "grad_norm": 1.3470580577850342, "learning_rate": 1.6921342502573723e-05, "loss": 1.1037, "step": 1222 }, { "epoch": 0.280022896393818, "grad_norm": 1.2042268514633179, "learning_rate": 1.6915986399125852e-05, "loss": 1.0931, "step": 1223 }, { "epoch": 0.2802518603319977, "grad_norm": 1.567338228225708, "learning_rate": 1.691062648986865e-05, "loss": 1.0394, "step": 1224 }, { "epoch": 0.28048082427017745, "grad_norm": 0.9561851024627686, "learning_rate": 1.6905262777751627e-05, "loss": 1.01, "step": 1225 }, { "epoch": 0.2807097882083572, "grad_norm": 1.5181394815444946, "learning_rate": 1.6899895265726392e-05, "loss": 1.1086, "step": 1226 }, { "epoch": 0.2809387521465369, "grad_norm": 1.1278475522994995, "learning_rate": 1.689452395674664e-05, "loss": 1.1136, "step": 1227 }, { "epoch": 0.28116771608471663, "grad_norm": 1.375020146369934, "learning_rate": 1.688914885376816e-05, "loss": 1.1157, "step": 1228 }, { "epoch": 0.2813966800228964, "grad_norm": 1.1723235845565796, "learning_rate": 1.6883769959748822e-05, "loss": 1.1582, "step": 1229 }, { "epoch": 0.28162564396107614, "grad_norm": 1.5612130165100098, "learning_rate": 1.6878387277648593e-05, "loss": 1.0827, "step": 1230 }, { "epoch": 0.28185460789925587, "grad_norm": 1.238914966583252, "learning_rate": 1.6873000810429512e-05, "loss": 1.0996, "step": 1231 }, { "epoch": 0.2820835718374356, "grad_norm": 1.1172726154327393, "learning_rate": 1.6867610561055707e-05, "loss": 1.1171, "step": 1232 }, { "epoch": 0.2823125357756153, "grad_norm": 1.265048861503601, "learning_rate": 1.686221653249339e-05, "loss": 1.1057, "step": 1233 }, { "epoch": 0.28254149971379505, "grad_norm": 1.3867113590240479, "learning_rate": 1.6856818727710847e-05, "loss": 1.1072, "step": 1234 }, { "epoch": 0.28277046365197483, "grad_norm": 1.2063997983932495, "learning_rate": 1.6851417149678442e-05, "loss": 1.1, "step": 1235 }, { "epoch": 0.28299942759015456, "grad_norm": 1.0993294715881348, "learning_rate": 1.6846011801368626e-05, "loss": 1.1188, "step": 1236 }, { "epoch": 0.2832283915283343, "grad_norm": 0.9886915683746338, "learning_rate": 1.6840602685755914e-05, "loss": 1.0935, "step": 1237 }, { "epoch": 0.283457355466514, "grad_norm": 1.2733272314071655, "learning_rate": 1.6835189805816894e-05, "loss": 1.0806, "step": 1238 }, { "epoch": 0.28368631940469374, "grad_norm": 1.0870503187179565, "learning_rate": 1.6829773164530226e-05, "loss": 1.1355, "step": 1239 }, { "epoch": 0.28391528334287347, "grad_norm": 1.102055311203003, "learning_rate": 1.6824352764876653e-05, "loss": 1.1119, "step": 1240 }, { "epoch": 0.28414424728105325, "grad_norm": 1.2194703817367554, "learning_rate": 1.6818928609838967e-05, "loss": 1.0878, "step": 1241 }, { "epoch": 0.284373211219233, "grad_norm": 1.2381644248962402, "learning_rate": 1.681350070240204e-05, "loss": 1.0619, "step": 1242 }, { "epoch": 0.2846021751574127, "grad_norm": 1.302049994468689, "learning_rate": 1.6808069045552793e-05, "loss": 1.117, "step": 1243 }, { "epoch": 0.28483113909559243, "grad_norm": 1.0947741270065308, "learning_rate": 1.6802633642280233e-05, "loss": 1.0865, "step": 1244 }, { "epoch": 0.28506010303377216, "grad_norm": 1.4091873168945312, "learning_rate": 1.6797194495575412e-05, "loss": 1.1398, "step": 1245 }, { "epoch": 0.28528906697195194, "grad_norm": 1.2080914974212646, "learning_rate": 1.679175160843145e-05, "loss": 1.0264, "step": 1246 }, { "epoch": 0.28551803091013167, "grad_norm": 1.3599348068237305, "learning_rate": 1.6786304983843517e-05, "loss": 1.1261, "step": 1247 }, { "epoch": 0.2857469948483114, "grad_norm": 1.1633814573287964, "learning_rate": 1.678085462480885e-05, "loss": 1.075, "step": 1248 }, { "epoch": 0.2859759587864911, "grad_norm": 1.1775692701339722, "learning_rate": 1.6775400534326735e-05, "loss": 1.0378, "step": 1249 }, { "epoch": 0.28620492272467085, "grad_norm": 1.3439725637435913, "learning_rate": 1.676994271539851e-05, "loss": 1.0889, "step": 1250 }, { "epoch": 0.2864338866628506, "grad_norm": 1.164843201637268, "learning_rate": 1.6764481171027566e-05, "loss": 1.2002, "step": 1251 }, { "epoch": 0.28666285060103036, "grad_norm": 1.5093419551849365, "learning_rate": 1.6759015904219348e-05, "loss": 1.0292, "step": 1252 }, { "epoch": 0.2868918145392101, "grad_norm": 1.3763619661331177, "learning_rate": 1.6753546917981346e-05, "loss": 1.0907, "step": 1253 }, { "epoch": 0.2871207784773898, "grad_norm": 1.3228497505187988, "learning_rate": 1.6748074215323096e-05, "loss": 1.1251, "step": 1254 }, { "epoch": 0.28734974241556954, "grad_norm": 1.9618299007415771, "learning_rate": 1.6742597799256182e-05, "loss": 1.0923, "step": 1255 }, { "epoch": 0.28757870635374927, "grad_norm": 1.724094033241272, "learning_rate": 1.673711767279423e-05, "loss": 1.0645, "step": 1256 }, { "epoch": 0.287807670291929, "grad_norm": 1.1787453889846802, "learning_rate": 1.6731633838952905e-05, "loss": 1.0869, "step": 1257 }, { "epoch": 0.2880366342301088, "grad_norm": 1.4934698343276978, "learning_rate": 1.672614630074992e-05, "loss": 1.0895, "step": 1258 }, { "epoch": 0.2882655981682885, "grad_norm": 1.7817822694778442, "learning_rate": 1.6720655061205013e-05, "loss": 1.0477, "step": 1259 }, { "epoch": 0.28849456210646823, "grad_norm": 1.1007169485092163, "learning_rate": 1.671516012333997e-05, "loss": 1.1114, "step": 1260 }, { "epoch": 0.28872352604464796, "grad_norm": 1.1942838430404663, "learning_rate": 1.6709661490178617e-05, "loss": 1.1364, "step": 1261 }, { "epoch": 0.2889524899828277, "grad_norm": 1.078680157661438, "learning_rate": 1.6704159164746797e-05, "loss": 1.1666, "step": 1262 }, { "epoch": 0.2891814539210074, "grad_norm": 3.27377986907959, "learning_rate": 1.6698653150072396e-05, "loss": 1.1181, "step": 1263 }, { "epoch": 0.2894104178591872, "grad_norm": 1.0961107015609741, "learning_rate": 1.6693143449185328e-05, "loss": 1.1408, "step": 1264 }, { "epoch": 0.2896393817973669, "grad_norm": 1.6030396223068237, "learning_rate": 1.6687630065117536e-05, "loss": 1.1411, "step": 1265 }, { "epoch": 0.28986834573554665, "grad_norm": 1.1883594989776611, "learning_rate": 1.6682113000902988e-05, "loss": 1.1251, "step": 1266 }, { "epoch": 0.2900973096737264, "grad_norm": 1.2973169088363647, "learning_rate": 1.667659225957768e-05, "loss": 1.0265, "step": 1267 }, { "epoch": 0.2903262736119061, "grad_norm": 1.2373725175857544, "learning_rate": 1.6671067844179625e-05, "loss": 1.0575, "step": 1268 }, { "epoch": 0.2905552375500859, "grad_norm": 1.0662227869033813, "learning_rate": 1.6665539757748866e-05, "loss": 1.1003, "step": 1269 }, { "epoch": 0.2907842014882656, "grad_norm": 1.0769942998886108, "learning_rate": 1.666000800332747e-05, "loss": 1.0941, "step": 1270 }, { "epoch": 0.29101316542644534, "grad_norm": 1.3059958219528198, "learning_rate": 1.6654472583959497e-05, "loss": 1.0998, "step": 1271 }, { "epoch": 0.29124212936462507, "grad_norm": 1.2797632217407227, "learning_rate": 1.664893350269106e-05, "loss": 1.1016, "step": 1272 }, { "epoch": 0.2914710933028048, "grad_norm": 1.2182438373565674, "learning_rate": 1.6643390762570254e-05, "loss": 1.1192, "step": 1273 }, { "epoch": 0.2917000572409845, "grad_norm": 1.4072870016098022, "learning_rate": 1.6637844366647216e-05, "loss": 1.1242, "step": 1274 }, { "epoch": 0.2919290211791643, "grad_norm": 1.3435471057891846, "learning_rate": 1.663229431797407e-05, "loss": 1.1807, "step": 1275 }, { "epoch": 0.29215798511734403, "grad_norm": 1.090842604637146, "learning_rate": 1.6626740619604967e-05, "loss": 1.1318, "step": 1276 }, { "epoch": 0.29238694905552376, "grad_norm": 1.192291021347046, "learning_rate": 1.6621183274596063e-05, "loss": 1.09, "step": 1277 }, { "epoch": 0.2926159129937035, "grad_norm": 1.1859331130981445, "learning_rate": 1.661562228600551e-05, "loss": 0.9964, "step": 1278 }, { "epoch": 0.2928448769318832, "grad_norm": 1.146068811416626, "learning_rate": 1.6610057656893483e-05, "loss": 1.1096, "step": 1279 }, { "epoch": 0.29307384087006294, "grad_norm": 1.1786830425262451, "learning_rate": 1.660448939032214e-05, "loss": 1.1826, "step": 1280 }, { "epoch": 0.2933028048082427, "grad_norm": 1.2743366956710815, "learning_rate": 1.6598917489355665e-05, "loss": 1.0824, "step": 1281 }, { "epoch": 0.29353176874642245, "grad_norm": 1.339068055152893, "learning_rate": 1.6593341957060218e-05, "loss": 1.1928, "step": 1282 }, { "epoch": 0.2937607326846022, "grad_norm": 1.1370441913604736, "learning_rate": 1.658776279650397e-05, "loss": 1.1, "step": 1283 }, { "epoch": 0.2939896966227819, "grad_norm": 1.2556294202804565, "learning_rate": 1.6582180010757082e-05, "loss": 1.0666, "step": 1284 }, { "epoch": 0.29421866056096163, "grad_norm": 1.174279808998108, "learning_rate": 1.6576593602891727e-05, "loss": 1.0921, "step": 1285 }, { "epoch": 0.29444762449914136, "grad_norm": 1.3296531438827515, "learning_rate": 1.6571003575982048e-05, "loss": 1.1534, "step": 1286 }, { "epoch": 0.29467658843732114, "grad_norm": 1.1834275722503662, "learning_rate": 1.656540993310419e-05, "loss": 1.1352, "step": 1287 }, { "epoch": 0.29490555237550087, "grad_norm": 1.2583208084106445, "learning_rate": 1.6559812677336293e-05, "loss": 1.1401, "step": 1288 }, { "epoch": 0.2951345163136806, "grad_norm": 1.1927133798599243, "learning_rate": 1.655421181175848e-05, "loss": 1.1334, "step": 1289 }, { "epoch": 0.2953634802518603, "grad_norm": 1.2883397340774536, "learning_rate": 1.6548607339452853e-05, "loss": 1.1677, "step": 1290 }, { "epoch": 0.29559244419004005, "grad_norm": 1.8883707523345947, "learning_rate": 1.654299926350351e-05, "loss": 1.0453, "step": 1291 }, { "epoch": 0.29582140812821983, "grad_norm": 1.2389609813690186, "learning_rate": 1.6537387586996532e-05, "loss": 1.1274, "step": 1292 }, { "epoch": 0.29605037206639956, "grad_norm": 0.9801892638206482, "learning_rate": 1.6531772313019972e-05, "loss": 1.144, "step": 1293 }, { "epoch": 0.2962793360045793, "grad_norm": 1.00913667678833, "learning_rate": 1.6526153444663873e-05, "loss": 1.087, "step": 1294 }, { "epoch": 0.296508299942759, "grad_norm": 1.0659477710723877, "learning_rate": 1.6520530985020243e-05, "loss": 1.0223, "step": 1295 }, { "epoch": 0.29673726388093874, "grad_norm": 1.5433529615402222, "learning_rate": 1.651490493718309e-05, "loss": 1.1148, "step": 1296 }, { "epoch": 0.29696622781911847, "grad_norm": 1.1053426265716553, "learning_rate": 1.6509275304248366e-05, "loss": 1.128, "step": 1297 }, { "epoch": 0.29719519175729825, "grad_norm": 1.0809921026229858, "learning_rate": 1.650364208931401e-05, "loss": 1.0149, "step": 1298 }, { "epoch": 0.297424155695478, "grad_norm": 1.0818578004837036, "learning_rate": 1.6498005295479946e-05, "loss": 1.0806, "step": 1299 }, { "epoch": 0.2976531196336577, "grad_norm": 2.2473480701446533, "learning_rate": 1.6492364925848045e-05, "loss": 1.1062, "step": 1300 }, { "epoch": 0.29788208357183743, "grad_norm": 1.107351541519165, "learning_rate": 1.6486720983522156e-05, "loss": 1.1262, "step": 1301 }, { "epoch": 0.29811104751001716, "grad_norm": 1.3999285697937012, "learning_rate": 1.6481073471608097e-05, "loss": 1.1821, "step": 1302 }, { "epoch": 0.2983400114481969, "grad_norm": 1.3839341402053833, "learning_rate": 1.647542239321364e-05, "loss": 1.0979, "step": 1303 }, { "epoch": 0.29856897538637667, "grad_norm": 1.4304099082946777, "learning_rate": 1.6469767751448538e-05, "loss": 1.1045, "step": 1304 }, { "epoch": 0.2987979393245564, "grad_norm": 1.2668440341949463, "learning_rate": 1.6464109549424477e-05, "loss": 1.1298, "step": 1305 }, { "epoch": 0.2990269032627361, "grad_norm": 1.2562881708145142, "learning_rate": 1.645844779025513e-05, "loss": 1.1475, "step": 1306 }, { "epoch": 0.29925586720091585, "grad_norm": 1.1980136632919312, "learning_rate": 1.6452782477056112e-05, "loss": 1.1703, "step": 1307 }, { "epoch": 0.2994848311390956, "grad_norm": 1.2106819152832031, "learning_rate": 1.6447113612944993e-05, "loss": 1.1087, "step": 1308 }, { "epoch": 0.2997137950772753, "grad_norm": 1.3096649646759033, "learning_rate": 1.6441441201041312e-05, "loss": 1.0965, "step": 1309 }, { "epoch": 0.2999427590154551, "grad_norm": 1.1836016178131104, "learning_rate": 1.6435765244466536e-05, "loss": 1.0473, "step": 1310 }, { "epoch": 0.3001717229536348, "grad_norm": 1.1866120100021362, "learning_rate": 1.6430085746344107e-05, "loss": 1.1249, "step": 1311 }, { "epoch": 0.30040068689181454, "grad_norm": 1.4427460432052612, "learning_rate": 1.6424402709799404e-05, "loss": 1.1176, "step": 1312 }, { "epoch": 0.30062965082999427, "grad_norm": 1.251099705696106, "learning_rate": 1.6418716137959746e-05, "loss": 1.1228, "step": 1313 }, { "epoch": 0.300858614768174, "grad_norm": 2.566626787185669, "learning_rate": 1.6413026033954418e-05, "loss": 1.1398, "step": 1314 }, { "epoch": 0.3010875787063538, "grad_norm": 1.1216868162155151, "learning_rate": 1.6407332400914625e-05, "loss": 1.0773, "step": 1315 }, { "epoch": 0.3013165426445335, "grad_norm": 1.1276382207870483, "learning_rate": 1.6401635241973533e-05, "loss": 1.0415, "step": 1316 }, { "epoch": 0.30154550658271323, "grad_norm": 1.1274714469909668, "learning_rate": 1.639593456026624e-05, "loss": 1.1136, "step": 1317 }, { "epoch": 0.30177447052089296, "grad_norm": 1.0719152688980103, "learning_rate": 1.639023035892978e-05, "loss": 1.1102, "step": 1318 }, { "epoch": 0.3020034344590727, "grad_norm": 1.0603184700012207, "learning_rate": 1.6384522641103133e-05, "loss": 1.1015, "step": 1319 }, { "epoch": 0.3022323983972524, "grad_norm": 1.0919266939163208, "learning_rate": 1.63788114099272e-05, "loss": 1.0915, "step": 1320 }, { "epoch": 0.3024613623354322, "grad_norm": 1.3398226499557495, "learning_rate": 1.637309666854483e-05, "loss": 1.1068, "step": 1321 }, { "epoch": 0.3026903262736119, "grad_norm": 1.094092845916748, "learning_rate": 1.6367378420100798e-05, "loss": 1.1188, "step": 1322 }, { "epoch": 0.30291929021179165, "grad_norm": 1.0754516124725342, "learning_rate": 1.63616566677418e-05, "loss": 1.1741, "step": 1323 }, { "epoch": 0.3031482541499714, "grad_norm": 1.155779242515564, "learning_rate": 1.6355931414616477e-05, "loss": 1.1085, "step": 1324 }, { "epoch": 0.3033772180881511, "grad_norm": 1.186424970626831, "learning_rate": 1.6350202663875385e-05, "loss": 1.0362, "step": 1325 }, { "epoch": 0.30360618202633083, "grad_norm": 1.1375372409820557, "learning_rate": 1.634447041867101e-05, "loss": 1.0349, "step": 1326 }, { "epoch": 0.3038351459645106, "grad_norm": 1.346127986907959, "learning_rate": 1.633873468215775e-05, "loss": 1.0951, "step": 1327 }, { "epoch": 0.30406410990269034, "grad_norm": 1.2048730850219727, "learning_rate": 1.633299545749194e-05, "loss": 1.0937, "step": 1328 }, { "epoch": 0.30429307384087007, "grad_norm": 1.2353206872940063, "learning_rate": 1.6327252747831824e-05, "loss": 1.1092, "step": 1329 }, { "epoch": 0.3045220377790498, "grad_norm": 1.1722041368484497, "learning_rate": 1.6321506556337575e-05, "loss": 1.0895, "step": 1330 }, { "epoch": 0.3047510017172295, "grad_norm": 1.1982693672180176, "learning_rate": 1.6315756886171264e-05, "loss": 1.1342, "step": 1331 }, { "epoch": 0.30497996565540925, "grad_norm": 1.1630750894546509, "learning_rate": 1.6310003740496887e-05, "loss": 1.1008, "step": 1332 }, { "epoch": 0.30520892959358903, "grad_norm": 1.6868027448654175, "learning_rate": 1.6304247122480355e-05, "loss": 1.1756, "step": 1333 }, { "epoch": 0.30543789353176876, "grad_norm": 1.2052215337753296, "learning_rate": 1.629848703528949e-05, "loss": 1.0759, "step": 1334 }, { "epoch": 0.3056668574699485, "grad_norm": 1.4629309177398682, "learning_rate": 1.6292723482094013e-05, "loss": 1.1375, "step": 1335 }, { "epoch": 0.3058958214081282, "grad_norm": 0.995369553565979, "learning_rate": 1.6286956466065566e-05, "loss": 1.1433, "step": 1336 }, { "epoch": 0.30612478534630794, "grad_norm": 1.3381826877593994, "learning_rate": 1.6281185990377683e-05, "loss": 1.0782, "step": 1337 }, { "epoch": 0.3063537492844877, "grad_norm": 1.22222101688385, "learning_rate": 1.627541205820581e-05, "loss": 1.0767, "step": 1338 }, { "epoch": 0.30658271322266745, "grad_norm": 1.1784082651138306, "learning_rate": 1.6269634672727296e-05, "loss": 1.0653, "step": 1339 }, { "epoch": 0.3068116771608472, "grad_norm": 1.214966058731079, "learning_rate": 1.6263853837121384e-05, "loss": 1.0808, "step": 1340 }, { "epoch": 0.3070406410990269, "grad_norm": 1.5941448211669922, "learning_rate": 1.6258069554569226e-05, "loss": 1.1365, "step": 1341 }, { "epoch": 0.30726960503720663, "grad_norm": 1.1932793855667114, "learning_rate": 1.6252281828253856e-05, "loss": 1.1333, "step": 1342 }, { "epoch": 0.30749856897538635, "grad_norm": 1.285779356956482, "learning_rate": 1.6246490661360215e-05, "loss": 1.0826, "step": 1343 }, { "epoch": 0.30772753291356614, "grad_norm": 1.9943314790725708, "learning_rate": 1.6240696057075138e-05, "loss": 1.0179, "step": 1344 }, { "epoch": 0.30795649685174586, "grad_norm": 1.3297176361083984, "learning_rate": 1.6234898018587336e-05, "loss": 1.0836, "step": 1345 }, { "epoch": 0.3081854607899256, "grad_norm": 1.1271708011627197, "learning_rate": 1.6229096549087434e-05, "loss": 1.0867, "step": 1346 }, { "epoch": 0.3084144247281053, "grad_norm": 1.003398060798645, "learning_rate": 1.6223291651767922e-05, "loss": 1.119, "step": 1347 }, { "epoch": 0.30864338866628505, "grad_norm": 1.4007704257965088, "learning_rate": 1.621748332982319e-05, "loss": 1.054, "step": 1348 }, { "epoch": 0.3088723526044648, "grad_norm": 1.2629410028457642, "learning_rate": 1.6211671586449512e-05, "loss": 1.0702, "step": 1349 }, { "epoch": 0.30910131654264456, "grad_norm": 1.1390570402145386, "learning_rate": 1.6205856424845038e-05, "loss": 1.0674, "step": 1350 }, { "epoch": 0.3093302804808243, "grad_norm": 1.0701704025268555, "learning_rate": 1.62000378482098e-05, "loss": 1.1055, "step": 1351 }, { "epoch": 0.309559244419004, "grad_norm": 1.0888673067092896, "learning_rate": 1.619421585974572e-05, "loss": 1.0241, "step": 1352 }, { "epoch": 0.30978820835718374, "grad_norm": 1.1905760765075684, "learning_rate": 1.618839046265658e-05, "loss": 1.1075, "step": 1353 }, { "epoch": 0.31001717229536346, "grad_norm": 1.1428126096725464, "learning_rate": 1.6182561660148053e-05, "loss": 1.1007, "step": 1354 }, { "epoch": 0.3102461362335432, "grad_norm": 0.9909505844116211, "learning_rate": 1.617672945542768e-05, "loss": 1.0782, "step": 1355 }, { "epoch": 0.310475100171723, "grad_norm": 1.0805736780166626, "learning_rate": 1.6170893851704875e-05, "loss": 1.1272, "step": 1356 }, { "epoch": 0.3107040641099027, "grad_norm": 1.3021117448806763, "learning_rate": 1.6165054852190917e-05, "loss": 1.0889, "step": 1357 }, { "epoch": 0.3109330280480824, "grad_norm": 1.0056633949279785, "learning_rate": 1.6159212460098968e-05, "loss": 1.0785, "step": 1358 }, { "epoch": 0.31116199198626215, "grad_norm": 1.1618127822875977, "learning_rate": 1.6153366678644035e-05, "loss": 1.0712, "step": 1359 }, { "epoch": 0.3113909559244419, "grad_norm": 1.7210270166397095, "learning_rate": 1.614751751104301e-05, "loss": 1.0849, "step": 1360 }, { "epoch": 0.31161991986262166, "grad_norm": 1.262199878692627, "learning_rate": 1.614166496051464e-05, "loss": 1.1274, "step": 1361 }, { "epoch": 0.3118488838008014, "grad_norm": 1.2134400606155396, "learning_rate": 1.6135809030279534e-05, "loss": 1.1211, "step": 1362 }, { "epoch": 0.3120778477389811, "grad_norm": 1.4023921489715576, "learning_rate": 1.6129949723560162e-05, "loss": 1.1045, "step": 1363 }, { "epoch": 0.31230681167716084, "grad_norm": 1.6166713237762451, "learning_rate": 1.612408704358085e-05, "loss": 1.0561, "step": 1364 }, { "epoch": 0.31253577561534057, "grad_norm": 1.3405228853225708, "learning_rate": 1.6118220993567783e-05, "loss": 1.1804, "step": 1365 }, { "epoch": 0.3127647395535203, "grad_norm": 1.1368193626403809, "learning_rate": 1.6112351576748994e-05, "loss": 1.1047, "step": 1366 }, { "epoch": 0.3129937034917001, "grad_norm": 1.1940534114837646, "learning_rate": 1.6106478796354382e-05, "loss": 1.0889, "step": 1367 }, { "epoch": 0.3132226674298798, "grad_norm": 1.7876060009002686, "learning_rate": 1.6100602655615683e-05, "loss": 1.1257, "step": 1368 }, { "epoch": 0.31345163136805954, "grad_norm": 1.4505313634872437, "learning_rate": 1.6094723157766493e-05, "loss": 1.1566, "step": 1369 }, { "epoch": 0.31368059530623926, "grad_norm": 1.2009062767028809, "learning_rate": 1.6088840306042247e-05, "loss": 1.1019, "step": 1370 }, { "epoch": 0.313909559244419, "grad_norm": 1.2727826833724976, "learning_rate": 1.608295410368023e-05, "loss": 1.1033, "step": 1371 }, { "epoch": 0.3141385231825987, "grad_norm": 1.1413002014160156, "learning_rate": 1.6077064553919565e-05, "loss": 1.0678, "step": 1372 }, { "epoch": 0.3143674871207785, "grad_norm": 1.2150791883468628, "learning_rate": 1.6071171660001232e-05, "loss": 1.093, "step": 1373 }, { "epoch": 0.3145964510589582, "grad_norm": 1.4832680225372314, "learning_rate": 1.6065275425168034e-05, "loss": 1.1531, "step": 1374 }, { "epoch": 0.31482541499713795, "grad_norm": 1.3526698350906372, "learning_rate": 1.605937585266462e-05, "loss": 1.104, "step": 1375 }, { "epoch": 0.3150543789353177, "grad_norm": 1.1165345907211304, "learning_rate": 1.6053472945737474e-05, "loss": 1.0644, "step": 1376 }, { "epoch": 0.3152833428734974, "grad_norm": 1.1160399913787842, "learning_rate": 1.6047566707634918e-05, "loss": 1.1056, "step": 1377 }, { "epoch": 0.31551230681167713, "grad_norm": 1.4708969593048096, "learning_rate": 1.604165714160711e-05, "loss": 1.049, "step": 1378 }, { "epoch": 0.3157412707498569, "grad_norm": 1.1713590621948242, "learning_rate": 1.6035744250906026e-05, "loss": 1.1238, "step": 1379 }, { "epoch": 0.31597023468803664, "grad_norm": 1.135960340499878, "learning_rate": 1.6029828038785486e-05, "loss": 1.1036, "step": 1380 }, { "epoch": 0.31619919862621637, "grad_norm": 1.1496847867965698, "learning_rate": 1.602390850850113e-05, "loss": 1.1226, "step": 1381 }, { "epoch": 0.3164281625643961, "grad_norm": 1.4546343088150024, "learning_rate": 1.6017985663310427e-05, "loss": 1.0588, "step": 1382 }, { "epoch": 0.3166571265025758, "grad_norm": 1.1785930395126343, "learning_rate": 1.6012059506472665e-05, "loss": 1.0823, "step": 1383 }, { "epoch": 0.3168860904407556, "grad_norm": 1.2727309465408325, "learning_rate": 1.6006130041248968e-05, "loss": 1.1149, "step": 1384 }, { "epoch": 0.31711505437893533, "grad_norm": 1.122978925704956, "learning_rate": 1.600019727090226e-05, "loss": 1.0961, "step": 1385 }, { "epoch": 0.31734401831711506, "grad_norm": 1.1095397472381592, "learning_rate": 1.59942611986973e-05, "loss": 1.1687, "step": 1386 }, { "epoch": 0.3175729822552948, "grad_norm": 1.2510476112365723, "learning_rate": 1.598832182790066e-05, "loss": 1.0486, "step": 1387 }, { "epoch": 0.3178019461934745, "grad_norm": 1.103319764137268, "learning_rate": 1.5982379161780722e-05, "loss": 1.1006, "step": 1388 }, { "epoch": 0.31803091013165424, "grad_norm": 1.4000933170318604, "learning_rate": 1.597643320360769e-05, "loss": 1.1743, "step": 1389 }, { "epoch": 0.318259874069834, "grad_norm": 1.0757001638412476, "learning_rate": 1.5970483956653572e-05, "loss": 1.0575, "step": 1390 }, { "epoch": 0.31848883800801375, "grad_norm": 1.2366151809692383, "learning_rate": 1.5964531424192187e-05, "loss": 1.0801, "step": 1391 }, { "epoch": 0.3187178019461935, "grad_norm": 1.175316572189331, "learning_rate": 1.595857560949917e-05, "loss": 1.0543, "step": 1392 }, { "epoch": 0.3189467658843732, "grad_norm": 1.2209275960922241, "learning_rate": 1.595261651585195e-05, "loss": 1.1517, "step": 1393 }, { "epoch": 0.31917572982255293, "grad_norm": 1.448627233505249, "learning_rate": 1.5946654146529766e-05, "loss": 1.137, "step": 1394 }, { "epoch": 0.31940469376073266, "grad_norm": 1.2850122451782227, "learning_rate": 1.5940688504813664e-05, "loss": 1.113, "step": 1395 }, { "epoch": 0.31963365769891244, "grad_norm": 1.3141731023788452, "learning_rate": 1.5934719593986483e-05, "loss": 1.0964, "step": 1396 }, { "epoch": 0.31986262163709217, "grad_norm": 1.1482245922088623, "learning_rate": 1.592874741733287e-05, "loss": 1.0624, "step": 1397 }, { "epoch": 0.3200915855752719, "grad_norm": 1.1503582000732422, "learning_rate": 1.5922771978139255e-05, "loss": 1.0634, "step": 1398 }, { "epoch": 0.3203205495134516, "grad_norm": 1.4972145557403564, "learning_rate": 1.5916793279693878e-05, "loss": 1.1168, "step": 1399 }, { "epoch": 0.32054951345163135, "grad_norm": 1.0118694305419922, "learning_rate": 1.5910811325286768e-05, "loss": 1.0714, "step": 1400 }, { "epoch": 0.3207784773898111, "grad_norm": 1.2603758573532104, "learning_rate": 1.590482611820974e-05, "loss": 1.1209, "step": 1401 }, { "epoch": 0.32100744132799086, "grad_norm": 0.941050112247467, "learning_rate": 1.5898837661756405e-05, "loss": 1.0597, "step": 1402 }, { "epoch": 0.3212364052661706, "grad_norm": 1.2156561613082886, "learning_rate": 1.5892845959222164e-05, "loss": 1.101, "step": 1403 }, { "epoch": 0.3214653692043503, "grad_norm": 1.0653733015060425, "learning_rate": 1.5886851013904193e-05, "loss": 1.0916, "step": 1404 }, { "epoch": 0.32169433314253004, "grad_norm": 1.1176843643188477, "learning_rate": 1.5880852829101464e-05, "loss": 1.1002, "step": 1405 }, { "epoch": 0.32192329708070977, "grad_norm": 1.3525538444519043, "learning_rate": 1.5874851408114733e-05, "loss": 1.0686, "step": 1406 }, { "epoch": 0.32215226101888955, "grad_norm": 1.2710604667663574, "learning_rate": 1.5868846754246524e-05, "loss": 1.0965, "step": 1407 }, { "epoch": 0.3223812249570693, "grad_norm": 1.1493090391159058, "learning_rate": 1.5862838870801153e-05, "loss": 1.095, "step": 1408 }, { "epoch": 0.322610188895249, "grad_norm": 1.2418208122253418, "learning_rate": 1.5856827761084698e-05, "loss": 1.0945, "step": 1409 }, { "epoch": 0.32283915283342873, "grad_norm": 1.0906857252120972, "learning_rate": 1.5850813428405036e-05, "loss": 1.1288, "step": 1410 }, { "epoch": 0.32306811677160846, "grad_norm": 1.1310313940048218, "learning_rate": 1.58447958760718e-05, "loss": 1.0776, "step": 1411 }, { "epoch": 0.3232970807097882, "grad_norm": 1.407239317893982, "learning_rate": 1.583877510739639e-05, "loss": 1.0595, "step": 1412 }, { "epoch": 0.32352604464796797, "grad_norm": 1.0133352279663086, "learning_rate": 1.5832751125691993e-05, "loss": 1.0473, "step": 1413 }, { "epoch": 0.3237550085861477, "grad_norm": 1.329931616783142, "learning_rate": 1.5826723934273555e-05, "loss": 1.0844, "step": 1414 }, { "epoch": 0.3239839725243274, "grad_norm": 1.4198311567306519, "learning_rate": 1.5820693536457787e-05, "loss": 1.0686, "step": 1415 }, { "epoch": 0.32421293646250715, "grad_norm": 1.604586124420166, "learning_rate": 1.5814659935563165e-05, "loss": 1.1034, "step": 1416 }, { "epoch": 0.3244419004006869, "grad_norm": 1.439122200012207, "learning_rate": 1.5808623134909932e-05, "loss": 1.1126, "step": 1417 }, { "epoch": 0.3246708643388666, "grad_norm": 1.158563256263733, "learning_rate": 1.5802583137820087e-05, "loss": 1.0924, "step": 1418 }, { "epoch": 0.3248998282770464, "grad_norm": 1.1076048612594604, "learning_rate": 1.579653994761739e-05, "loss": 1.079, "step": 1419 }, { "epoch": 0.3251287922152261, "grad_norm": 1.1624640226364136, "learning_rate": 1.5790493567627357e-05, "loss": 1.0644, "step": 1420 }, { "epoch": 0.32535775615340584, "grad_norm": 1.0896527767181396, "learning_rate": 1.5784444001177262e-05, "loss": 1.1104, "step": 1421 }, { "epoch": 0.32558672009158557, "grad_norm": 1.096172571182251, "learning_rate": 1.577839125159613e-05, "loss": 1.064, "step": 1422 }, { "epoch": 0.3258156840297653, "grad_norm": 1.2764605283737183, "learning_rate": 1.577233532221474e-05, "loss": 1.0389, "step": 1423 }, { "epoch": 0.326044647967945, "grad_norm": 1.2144206762313843, "learning_rate": 1.576627621636561e-05, "loss": 1.0894, "step": 1424 }, { "epoch": 0.3262736119061248, "grad_norm": 1.1059249639511108, "learning_rate": 1.5760213937383032e-05, "loss": 1.0988, "step": 1425 }, { "epoch": 0.32650257584430453, "grad_norm": 1.1343713998794556, "learning_rate": 1.5754148488603017e-05, "loss": 1.1237, "step": 1426 }, { "epoch": 0.32673153978248426, "grad_norm": 1.1411490440368652, "learning_rate": 1.5748079873363327e-05, "loss": 1.14, "step": 1427 }, { "epoch": 0.326960503720664, "grad_norm": 1.1777209043502808, "learning_rate": 1.5742008095003478e-05, "loss": 1.1442, "step": 1428 }, { "epoch": 0.3271894676588437, "grad_norm": 1.2743630409240723, "learning_rate": 1.573593315686471e-05, "loss": 1.127, "step": 1429 }, { "epoch": 0.3274184315970235, "grad_norm": 1.4254614114761353, "learning_rate": 1.5729855062290024e-05, "loss": 1.0446, "step": 1430 }, { "epoch": 0.3276473955352032, "grad_norm": 1.1998964548110962, "learning_rate": 1.572377381462413e-05, "loss": 1.0988, "step": 1431 }, { "epoch": 0.32787635947338295, "grad_norm": 1.0931754112243652, "learning_rate": 1.5717689417213495e-05, "loss": 1.0696, "step": 1432 }, { "epoch": 0.3281053234115627, "grad_norm": 1.1342476606369019, "learning_rate": 1.5711601873406314e-05, "loss": 1.0996, "step": 1433 }, { "epoch": 0.3283342873497424, "grad_norm": 1.8414933681488037, "learning_rate": 1.5705511186552506e-05, "loss": 1.1078, "step": 1434 }, { "epoch": 0.32856325128792213, "grad_norm": 1.0369212627410889, "learning_rate": 1.5699417360003725e-05, "loss": 1.0947, "step": 1435 }, { "epoch": 0.3287922152261019, "grad_norm": 1.1955866813659668, "learning_rate": 1.5693320397113358e-05, "loss": 1.1377, "step": 1436 }, { "epoch": 0.32902117916428164, "grad_norm": 1.1285663843154907, "learning_rate": 1.568722030123651e-05, "loss": 1.1109, "step": 1437 }, { "epoch": 0.32925014310246137, "grad_norm": 1.313095211982727, "learning_rate": 1.568111707573001e-05, "loss": 1.1211, "step": 1438 }, { "epoch": 0.3294791070406411, "grad_norm": 1.4031476974487305, "learning_rate": 1.567501072395241e-05, "loss": 1.0377, "step": 1439 }, { "epoch": 0.3297080709788208, "grad_norm": 1.2493641376495361, "learning_rate": 1.5668901249263996e-05, "loss": 1.0921, "step": 1440 }, { "epoch": 0.32993703491700055, "grad_norm": 1.0535095930099487, "learning_rate": 1.5662788655026745e-05, "loss": 1.0488, "step": 1441 }, { "epoch": 0.33016599885518033, "grad_norm": 1.165377140045166, "learning_rate": 1.565667294460438e-05, "loss": 1.0395, "step": 1442 }, { "epoch": 0.33039496279336006, "grad_norm": 1.4327694177627563, "learning_rate": 1.5650554121362315e-05, "loss": 1.1258, "step": 1443 }, { "epoch": 0.3306239267315398, "grad_norm": 1.1685681343078613, "learning_rate": 1.5644432188667695e-05, "loss": 1.0601, "step": 1444 }, { "epoch": 0.3308528906697195, "grad_norm": 1.4015522003173828, "learning_rate": 1.563830714988936e-05, "loss": 1.0852, "step": 1445 }, { "epoch": 0.33108185460789924, "grad_norm": 1.0831905603408813, "learning_rate": 1.5632179008397876e-05, "loss": 1.125, "step": 1446 }, { "epoch": 0.33131081854607897, "grad_norm": 1.9634826183319092, "learning_rate": 1.5626047767565503e-05, "loss": 1.0603, "step": 1447 }, { "epoch": 0.33153978248425875, "grad_norm": 1.1687532663345337, "learning_rate": 1.561991343076621e-05, "loss": 1.1012, "step": 1448 }, { "epoch": 0.3317687464224385, "grad_norm": 1.5196926593780518, "learning_rate": 1.5613776001375674e-05, "loss": 1.1178, "step": 1449 }, { "epoch": 0.3319977103606182, "grad_norm": 1.2472096681594849, "learning_rate": 1.5607635482771272e-05, "loss": 1.0965, "step": 1450 }, { "epoch": 0.33222667429879793, "grad_norm": 1.207923173904419, "learning_rate": 1.5601491878332077e-05, "loss": 1.0978, "step": 1451 }, { "epoch": 0.33245563823697766, "grad_norm": 1.1956391334533691, "learning_rate": 1.5595345191438864e-05, "loss": 1.1358, "step": 1452 }, { "epoch": 0.33268460217515744, "grad_norm": 1.1643397808074951, "learning_rate": 1.5589195425474105e-05, "loss": 1.0615, "step": 1453 }, { "epoch": 0.33291356611333717, "grad_norm": 1.332839846611023, "learning_rate": 1.5583042583821963e-05, "loss": 1.0744, "step": 1454 }, { "epoch": 0.3331425300515169, "grad_norm": 1.5693964958190918, "learning_rate": 1.5576886669868297e-05, "loss": 1.095, "step": 1455 }, { "epoch": 0.3333714939896966, "grad_norm": 1.3151133060455322, "learning_rate": 1.557072768700065e-05, "loss": 1.0147, "step": 1456 }, { "epoch": 0.33360045792787635, "grad_norm": 2.105339765548706, "learning_rate": 1.5564565638608264e-05, "loss": 1.154, "step": 1457 }, { "epoch": 0.3338294218660561, "grad_norm": 1.1660069227218628, "learning_rate": 1.5558400528082057e-05, "loss": 1.0835, "step": 1458 }, { "epoch": 0.33405838580423586, "grad_norm": 1.198183298110962, "learning_rate": 1.5552232358814646e-05, "loss": 1.0679, "step": 1459 }, { "epoch": 0.3342873497424156, "grad_norm": 0.9822539687156677, "learning_rate": 1.5546061134200316e-05, "loss": 1.0952, "step": 1460 }, { "epoch": 0.3345163136805953, "grad_norm": 1.082090973854065, "learning_rate": 1.5539886857635037e-05, "loss": 1.0979, "step": 1461 }, { "epoch": 0.33474527761877504, "grad_norm": 1.2123302221298218, "learning_rate": 1.5533709532516473e-05, "loss": 1.0944, "step": 1462 }, { "epoch": 0.33497424155695477, "grad_norm": 1.1472247838974, "learning_rate": 1.552752916224395e-05, "loss": 1.1239, "step": 1463 }, { "epoch": 0.3352032054951345, "grad_norm": 1.1450691223144531, "learning_rate": 1.5521345750218463e-05, "loss": 1.069, "step": 1464 }, { "epoch": 0.3354321694333143, "grad_norm": 1.3406015634536743, "learning_rate": 1.551515929984271e-05, "loss": 1.0496, "step": 1465 }, { "epoch": 0.335661133371494, "grad_norm": 1.2106640338897705, "learning_rate": 1.5508969814521026e-05, "loss": 1.1102, "step": 1466 }, { "epoch": 0.33589009730967373, "grad_norm": 1.483971118927002, "learning_rate": 1.5502777297659447e-05, "loss": 1.0899, "step": 1467 }, { "epoch": 0.33611906124785346, "grad_norm": 1.171859860420227, "learning_rate": 1.549658175266565e-05, "loss": 1.0654, "step": 1468 }, { "epoch": 0.3363480251860332, "grad_norm": 2.5037529468536377, "learning_rate": 1.5490383182949e-05, "loss": 1.0884, "step": 1469 }, { "epoch": 0.3365769891242129, "grad_norm": 1.0418339967727661, "learning_rate": 1.5484181591920516e-05, "loss": 1.0827, "step": 1470 }, { "epoch": 0.3368059530623927, "grad_norm": 1.2835181951522827, "learning_rate": 1.5477976982992883e-05, "loss": 1.0139, "step": 1471 }, { "epoch": 0.3370349170005724, "grad_norm": 1.2543227672576904, "learning_rate": 1.547176935958044e-05, "loss": 1.1055, "step": 1472 }, { "epoch": 0.33726388093875215, "grad_norm": 1.0822360515594482, "learning_rate": 1.54655587250992e-05, "loss": 1.0823, "step": 1473 }, { "epoch": 0.3374928448769319, "grad_norm": 1.016126036643982, "learning_rate": 1.5459345082966812e-05, "loss": 1.0929, "step": 1474 }, { "epoch": 0.3377218088151116, "grad_norm": 1.147747278213501, "learning_rate": 1.5453128436602597e-05, "loss": 1.1562, "step": 1475 }, { "epoch": 0.3379507727532914, "grad_norm": 1.3952897787094116, "learning_rate": 1.5446908789427522e-05, "loss": 1.0573, "step": 1476 }, { "epoch": 0.3381797366914711, "grad_norm": 1.0825663805007935, "learning_rate": 1.5440686144864207e-05, "loss": 1.1492, "step": 1477 }, { "epoch": 0.33840870062965084, "grad_norm": 1.1135352849960327, "learning_rate": 1.5434460506336922e-05, "loss": 1.0801, "step": 1478 }, { "epoch": 0.33863766456783057, "grad_norm": 1.1229122877120972, "learning_rate": 1.5428231877271584e-05, "loss": 1.1142, "step": 1479 }, { "epoch": 0.3388666285060103, "grad_norm": 1.1263865232467651, "learning_rate": 1.542200026109575e-05, "loss": 1.0628, "step": 1480 }, { "epoch": 0.33909559244419, "grad_norm": 1.3717758655548096, "learning_rate": 1.5415765661238635e-05, "loss": 1.106, "step": 1481 }, { "epoch": 0.3393245563823698, "grad_norm": 1.1073044538497925, "learning_rate": 1.540952808113108e-05, "loss": 1.0775, "step": 1482 }, { "epoch": 0.33955352032054953, "grad_norm": 1.1785542964935303, "learning_rate": 1.5403287524205577e-05, "loss": 1.0809, "step": 1483 }, { "epoch": 0.33978248425872926, "grad_norm": 1.185001254081726, "learning_rate": 1.539704399389625e-05, "loss": 0.996, "step": 1484 }, { "epoch": 0.340011448196909, "grad_norm": 1.155920386314392, "learning_rate": 1.5390797493638862e-05, "loss": 1.0949, "step": 1485 }, { "epoch": 0.3402404121350887, "grad_norm": 1.0656671524047852, "learning_rate": 1.538454802687081e-05, "loss": 1.0234, "step": 1486 }, { "epoch": 0.34046937607326844, "grad_norm": 1.3848119974136353, "learning_rate": 1.537829559703112e-05, "loss": 1.0846, "step": 1487 }, { "epoch": 0.3406983400114482, "grad_norm": 1.0920829772949219, "learning_rate": 1.5372040207560457e-05, "loss": 1.1337, "step": 1488 }, { "epoch": 0.34092730394962795, "grad_norm": 1.3589757680892944, "learning_rate": 1.536578186190111e-05, "loss": 1.0797, "step": 1489 }, { "epoch": 0.3411562678878077, "grad_norm": 1.4441230297088623, "learning_rate": 1.5359520563496985e-05, "loss": 1.131, "step": 1490 }, { "epoch": 0.3413852318259874, "grad_norm": 1.3296109437942505, "learning_rate": 1.5353256315793633e-05, "loss": 1.086, "step": 1491 }, { "epoch": 0.34161419576416713, "grad_norm": 1.065158486366272, "learning_rate": 1.534698912223821e-05, "loss": 1.0374, "step": 1492 }, { "epoch": 0.34184315970234685, "grad_norm": 2.7994022369384766, "learning_rate": 1.5340718986279505e-05, "loss": 1.0859, "step": 1493 }, { "epoch": 0.34207212364052664, "grad_norm": 1.2376704216003418, "learning_rate": 1.5334445911367915e-05, "loss": 1.0435, "step": 1494 }, { "epoch": 0.34230108757870636, "grad_norm": 1.0192632675170898, "learning_rate": 1.5328169900955463e-05, "loss": 1.114, "step": 1495 }, { "epoch": 0.3425300515168861, "grad_norm": 1.3984559774398804, "learning_rate": 1.5321890958495787e-05, "loss": 1.0808, "step": 1496 }, { "epoch": 0.3427590154550658, "grad_norm": 0.9953001141548157, "learning_rate": 1.5315609087444135e-05, "loss": 1.0632, "step": 1497 }, { "epoch": 0.34298797939324555, "grad_norm": 1.4686434268951416, "learning_rate": 1.5309324291257373e-05, "loss": 1.0939, "step": 1498 }, { "epoch": 0.34321694333142533, "grad_norm": 1.450624704360962, "learning_rate": 1.5303036573393964e-05, "loss": 1.0561, "step": 1499 }, { "epoch": 0.34344590726960506, "grad_norm": 1.1755778789520264, "learning_rate": 1.529674593731399e-05, "loss": 1.1257, "step": 1500 }, { "epoch": 0.3436748712077848, "grad_norm": 1.1907165050506592, "learning_rate": 1.5290452386479132e-05, "loss": 1.1242, "step": 1501 }, { "epoch": 0.3439038351459645, "grad_norm": 1.4106594324111938, "learning_rate": 1.5284155924352678e-05, "loss": 1.1147, "step": 1502 }, { "epoch": 0.34413279908414424, "grad_norm": 1.3477283716201782, "learning_rate": 1.5277856554399528e-05, "loss": 1.1203, "step": 1503 }, { "epoch": 0.34436176302232396, "grad_norm": 1.1035650968551636, "learning_rate": 1.5271554280086164e-05, "loss": 1.0926, "step": 1504 }, { "epoch": 0.34459072696050375, "grad_norm": 1.4535436630249023, "learning_rate": 1.5265249104880675e-05, "loss": 1.0157, "step": 1505 }, { "epoch": 0.3448196908986835, "grad_norm": 1.184928297996521, "learning_rate": 1.5258941032252747e-05, "loss": 1.0421, "step": 1506 }, { "epoch": 0.3450486548368632, "grad_norm": 1.1650036573410034, "learning_rate": 1.5252630065673662e-05, "loss": 1.0696, "step": 1507 }, { "epoch": 0.3452776187750429, "grad_norm": 1.272552490234375, "learning_rate": 1.5246316208616289e-05, "loss": 1.111, "step": 1508 }, { "epoch": 0.34550658271322265, "grad_norm": 1.1120789051055908, "learning_rate": 1.5239999464555092e-05, "loss": 1.1354, "step": 1509 }, { "epoch": 0.3457355466514024, "grad_norm": 1.6743152141571045, "learning_rate": 1.5233679836966122e-05, "loss": 1.0393, "step": 1510 }, { "epoch": 0.34596451058958216, "grad_norm": 1.243407130241394, "learning_rate": 1.5227357329327016e-05, "loss": 1.0646, "step": 1511 }, { "epoch": 0.3461934745277619, "grad_norm": 1.359520673751831, "learning_rate": 1.5221031945116998e-05, "loss": 1.1385, "step": 1512 }, { "epoch": 0.3464224384659416, "grad_norm": 1.18605637550354, "learning_rate": 1.5214703687816874e-05, "loss": 1.1231, "step": 1513 }, { "epoch": 0.34665140240412134, "grad_norm": 1.3147512674331665, "learning_rate": 1.5208372560909031e-05, "loss": 1.1246, "step": 1514 }, { "epoch": 0.34688036634230107, "grad_norm": 1.420447826385498, "learning_rate": 1.5202038567877436e-05, "loss": 1.1292, "step": 1515 }, { "epoch": 0.3471093302804808, "grad_norm": 1.1527974605560303, "learning_rate": 1.5195701712207627e-05, "loss": 1.0658, "step": 1516 }, { "epoch": 0.3473382942186606, "grad_norm": 1.1984782218933105, "learning_rate": 1.5189361997386729e-05, "loss": 1.0923, "step": 1517 }, { "epoch": 0.3475672581568403, "grad_norm": 1.3671753406524658, "learning_rate": 1.5183019426903434e-05, "loss": 1.2041, "step": 1518 }, { "epoch": 0.34779622209502004, "grad_norm": 1.3007023334503174, "learning_rate": 1.5176674004247998e-05, "loss": 1.0656, "step": 1519 }, { "epoch": 0.34802518603319976, "grad_norm": 1.2162976264953613, "learning_rate": 1.517032573291226e-05, "loss": 1.1123, "step": 1520 }, { "epoch": 0.3482541499713795, "grad_norm": 1.119856834411621, "learning_rate": 1.5163974616389621e-05, "loss": 1.0619, "step": 1521 }, { "epoch": 0.3484831139095592, "grad_norm": 1.3005276918411255, "learning_rate": 1.5157620658175046e-05, "loss": 1.0622, "step": 1522 }, { "epoch": 0.348712077847739, "grad_norm": 1.0568853616714478, "learning_rate": 1.515126386176506e-05, "loss": 1.0777, "step": 1523 }, { "epoch": 0.3489410417859187, "grad_norm": 1.326450228691101, "learning_rate": 1.5144904230657765e-05, "loss": 1.0952, "step": 1524 }, { "epoch": 0.34917000572409845, "grad_norm": 1.2039971351623535, "learning_rate": 1.5138541768352802e-05, "loss": 1.1044, "step": 1525 }, { "epoch": 0.3493989696622782, "grad_norm": 1.1191279888153076, "learning_rate": 1.5132176478351386e-05, "loss": 1.0931, "step": 1526 }, { "epoch": 0.3496279336004579, "grad_norm": 1.0349925756454468, "learning_rate": 1.5125808364156283e-05, "loss": 1.1128, "step": 1527 }, { "epoch": 0.3498568975386377, "grad_norm": 1.1004794836044312, "learning_rate": 1.5119437429271813e-05, "loss": 1.1096, "step": 1528 }, { "epoch": 0.3500858614768174, "grad_norm": 1.117506504058838, "learning_rate": 1.5113063677203847e-05, "loss": 1.0509, "step": 1529 }, { "epoch": 0.35031482541499714, "grad_norm": 1.113121509552002, "learning_rate": 1.5106687111459809e-05, "loss": 1.1017, "step": 1530 }, { "epoch": 0.35054378935317687, "grad_norm": 3.266343116760254, "learning_rate": 1.5100307735548662e-05, "loss": 1.1525, "step": 1531 }, { "epoch": 0.3507727532913566, "grad_norm": 1.3672198057174683, "learning_rate": 1.5093925552980934e-05, "loss": 1.0521, "step": 1532 }, { "epoch": 0.3510017172295363, "grad_norm": 1.2172337770462036, "learning_rate": 1.5087540567268682e-05, "loss": 1.0226, "step": 1533 }, { "epoch": 0.3512306811677161, "grad_norm": 0.9038141369819641, "learning_rate": 1.5081152781925508e-05, "loss": 1.0518, "step": 1534 }, { "epoch": 0.35145964510589583, "grad_norm": 1.2456227540969849, "learning_rate": 1.5074762200466557e-05, "loss": 1.121, "step": 1535 }, { "epoch": 0.35168860904407556, "grad_norm": 1.2485681772232056, "learning_rate": 1.5068368826408515e-05, "loss": 1.0517, "step": 1536 }, { "epoch": 0.3519175729822553, "grad_norm": 1.2374999523162842, "learning_rate": 1.5061972663269604e-05, "loss": 1.0035, "step": 1537 }, { "epoch": 0.352146536920435, "grad_norm": 1.0219475030899048, "learning_rate": 1.5055573714569574e-05, "loss": 1.0786, "step": 1538 }, { "epoch": 0.35237550085861474, "grad_norm": 1.2069019079208374, "learning_rate": 1.5049171983829714e-05, "loss": 1.0511, "step": 1539 }, { "epoch": 0.3526044647967945, "grad_norm": 1.256385087966919, "learning_rate": 1.5042767474572846e-05, "loss": 1.1174, "step": 1540 }, { "epoch": 0.35283342873497425, "grad_norm": 1.2299443483352661, "learning_rate": 1.5036360190323315e-05, "loss": 1.0537, "step": 1541 }, { "epoch": 0.353062392673154, "grad_norm": 1.5519248247146606, "learning_rate": 1.5029950134606991e-05, "loss": 1.131, "step": 1542 }, { "epoch": 0.3532913566113337, "grad_norm": 1.0618422031402588, "learning_rate": 1.5023537310951284e-05, "loss": 1.1499, "step": 1543 }, { "epoch": 0.35352032054951343, "grad_norm": 1.1327383518218994, "learning_rate": 1.501712172288511e-05, "loss": 1.1216, "step": 1544 }, { "epoch": 0.35374928448769316, "grad_norm": 1.2645106315612793, "learning_rate": 1.5010703373938915e-05, "loss": 1.108, "step": 1545 }, { "epoch": 0.35397824842587294, "grad_norm": 1.7846484184265137, "learning_rate": 1.5004282267644668e-05, "loss": 1.0953, "step": 1546 }, { "epoch": 0.35420721236405267, "grad_norm": 1.1685068607330322, "learning_rate": 1.4997858407535841e-05, "loss": 1.0658, "step": 1547 }, { "epoch": 0.3544361763022324, "grad_norm": 0.9860405921936035, "learning_rate": 1.4991431797147433e-05, "loss": 1.053, "step": 1548 }, { "epoch": 0.3546651402404121, "grad_norm": 1.2382588386535645, "learning_rate": 1.4985002440015959e-05, "loss": 1.112, "step": 1549 }, { "epoch": 0.35489410417859185, "grad_norm": 1.1630792617797852, "learning_rate": 1.4978570339679435e-05, "loss": 1.0488, "step": 1550 }, { "epoch": 0.35512306811677163, "grad_norm": 1.154436469078064, "learning_rate": 1.4972135499677395e-05, "loss": 1.0509, "step": 1551 }, { "epoch": 0.35535203205495136, "grad_norm": 1.0315396785736084, "learning_rate": 1.4965697923550873e-05, "loss": 1.0807, "step": 1552 }, { "epoch": 0.3555809959931311, "grad_norm": 0.9754356741905212, "learning_rate": 1.4959257614842416e-05, "loss": 1.1137, "step": 1553 }, { "epoch": 0.3558099599313108, "grad_norm": 1.0511184930801392, "learning_rate": 1.4952814577096073e-05, "loss": 1.1068, "step": 1554 }, { "epoch": 0.35603892386949054, "grad_norm": 1.3357048034667969, "learning_rate": 1.4946368813857393e-05, "loss": 1.053, "step": 1555 }, { "epoch": 0.35626788780767027, "grad_norm": 0.9737345576286316, "learning_rate": 1.4939920328673422e-05, "loss": 1.0613, "step": 1556 }, { "epoch": 0.35649685174585005, "grad_norm": 1.1738935708999634, "learning_rate": 1.4933469125092714e-05, "loss": 1.0548, "step": 1557 }, { "epoch": 0.3567258156840298, "grad_norm": 1.0252792835235596, "learning_rate": 1.4927015206665311e-05, "loss": 1.1409, "step": 1558 }, { "epoch": 0.3569547796222095, "grad_norm": 1.1485637426376343, "learning_rate": 1.4920558576942746e-05, "loss": 1.1199, "step": 1559 }, { "epoch": 0.35718374356038923, "grad_norm": 1.2590608596801758, "learning_rate": 1.4914099239478046e-05, "loss": 1.0939, "step": 1560 }, { "epoch": 0.35741270749856896, "grad_norm": 1.1943340301513672, "learning_rate": 1.490763719782574e-05, "loss": 1.049, "step": 1561 }, { "epoch": 0.3576416714367487, "grad_norm": 1.8277630805969238, "learning_rate": 1.4901172455541826e-05, "loss": 1.1446, "step": 1562 }, { "epoch": 0.35787063537492847, "grad_norm": 1.2777670621871948, "learning_rate": 1.4894705016183803e-05, "loss": 1.0863, "step": 1563 }, { "epoch": 0.3580995993131082, "grad_norm": 1.1132858991622925, "learning_rate": 1.4888234883310644e-05, "loss": 1.1255, "step": 1564 }, { "epoch": 0.3583285632512879, "grad_norm": 1.2168593406677246, "learning_rate": 1.4881762060482814e-05, "loss": 1.1271, "step": 1565 }, { "epoch": 0.35855752718946765, "grad_norm": 1.321545124053955, "learning_rate": 1.4875286551262252e-05, "loss": 1.0964, "step": 1566 }, { "epoch": 0.3587864911276474, "grad_norm": 1.1392179727554321, "learning_rate": 1.4868808359212373e-05, "loss": 1.0022, "step": 1567 }, { "epoch": 0.3590154550658271, "grad_norm": 1.227779746055603, "learning_rate": 1.4862327487898075e-05, "loss": 1.1225, "step": 1568 }, { "epoch": 0.3592444190040069, "grad_norm": 1.1016408205032349, "learning_rate": 1.4855843940885726e-05, "loss": 1.0832, "step": 1569 }, { "epoch": 0.3594733829421866, "grad_norm": 1.0965155363082886, "learning_rate": 1.4849357721743169e-05, "loss": 1.0853, "step": 1570 }, { "epoch": 0.35970234688036634, "grad_norm": 1.1590499877929688, "learning_rate": 1.484286883403971e-05, "loss": 1.0385, "step": 1571 }, { "epoch": 0.35993131081854607, "grad_norm": 1.0855969190597534, "learning_rate": 1.483637728134614e-05, "loss": 1.0943, "step": 1572 }, { "epoch": 0.3601602747567258, "grad_norm": 1.1280837059020996, "learning_rate": 1.4829883067234699e-05, "loss": 1.0209, "step": 1573 }, { "epoch": 0.3603892386949056, "grad_norm": 1.0733137130737305, "learning_rate": 1.4823386195279098e-05, "loss": 1.1372, "step": 1574 }, { "epoch": 0.3606182026330853, "grad_norm": 1.1513261795043945, "learning_rate": 1.4816886669054514e-05, "loss": 1.0584, "step": 1575 }, { "epoch": 0.36084716657126503, "grad_norm": 1.3716681003570557, "learning_rate": 1.4810384492137582e-05, "loss": 1.1219, "step": 1576 }, { "epoch": 0.36107613050944476, "grad_norm": 0.9845678806304932, "learning_rate": 1.4803879668106393e-05, "loss": 1.0914, "step": 1577 }, { "epoch": 0.3613050944476245, "grad_norm": 1.257351040840149, "learning_rate": 1.4797372200540497e-05, "loss": 1.1247, "step": 1578 }, { "epoch": 0.3615340583858042, "grad_norm": 1.4582107067108154, "learning_rate": 1.4790862093020903e-05, "loss": 1.0401, "step": 1579 }, { "epoch": 0.361763022323984, "grad_norm": 1.277854323387146, "learning_rate": 1.4784349349130063e-05, "loss": 1.0367, "step": 1580 }, { "epoch": 0.3619919862621637, "grad_norm": 1.1028213500976562, "learning_rate": 1.4777833972451889e-05, "loss": 1.0808, "step": 1581 }, { "epoch": 0.36222095020034345, "grad_norm": 1.0931235551834106, "learning_rate": 1.477131596657174e-05, "loss": 1.0286, "step": 1582 }, { "epoch": 0.3624499141385232, "grad_norm": 1.4668598175048828, "learning_rate": 1.4764795335076414e-05, "loss": 1.1162, "step": 1583 }, { "epoch": 0.3626788780767029, "grad_norm": 2.652813673019409, "learning_rate": 1.4758272081554168e-05, "loss": 1.1068, "step": 1584 }, { "epoch": 0.36290784201488263, "grad_norm": 1.1767535209655762, "learning_rate": 1.4751746209594683e-05, "loss": 1.0795, "step": 1585 }, { "epoch": 0.3631368059530624, "grad_norm": 1.0798718929290771, "learning_rate": 1.47452177227891e-05, "loss": 1.0911, "step": 1586 }, { "epoch": 0.36336576989124214, "grad_norm": 1.119814395904541, "learning_rate": 1.4738686624729987e-05, "loss": 1.0673, "step": 1587 }, { "epoch": 0.36359473382942187, "grad_norm": 1.2240833044052124, "learning_rate": 1.4732152919011355e-05, "loss": 1.0708, "step": 1588 }, { "epoch": 0.3638236977676016, "grad_norm": 1.1120808124542236, "learning_rate": 1.4725616609228648e-05, "loss": 1.0919, "step": 1589 }, { "epoch": 0.3640526617057813, "grad_norm": 1.2676600217819214, "learning_rate": 1.4719077698978737e-05, "loss": 1.0665, "step": 1590 }, { "epoch": 0.36428162564396105, "grad_norm": 1.1480255126953125, "learning_rate": 1.4712536191859934e-05, "loss": 1.0787, "step": 1591 }, { "epoch": 0.36451058958214083, "grad_norm": 1.7230538129806519, "learning_rate": 1.4705992091471975e-05, "loss": 1.1455, "step": 1592 }, { "epoch": 0.36473955352032056, "grad_norm": 1.1542283296585083, "learning_rate": 1.4699445401416024e-05, "loss": 1.1246, "step": 1593 }, { "epoch": 0.3649685174585003, "grad_norm": 1.230782389640808, "learning_rate": 1.4692896125294667e-05, "loss": 1.1106, "step": 1594 }, { "epoch": 0.36519748139668, "grad_norm": 1.2832047939300537, "learning_rate": 1.4686344266711916e-05, "loss": 1.1203, "step": 1595 }, { "epoch": 0.36542644533485974, "grad_norm": 1.213179111480713, "learning_rate": 1.467978982927321e-05, "loss": 1.1387, "step": 1596 }, { "epoch": 0.3656554092730395, "grad_norm": 1.2163344621658325, "learning_rate": 1.4673232816585392e-05, "loss": 1.1456, "step": 1597 }, { "epoch": 0.36588437321121925, "grad_norm": 1.1617780923843384, "learning_rate": 1.4666673232256738e-05, "loss": 1.091, "step": 1598 }, { "epoch": 0.366113337149399, "grad_norm": 1.3086128234863281, "learning_rate": 1.466011107989693e-05, "loss": 1.096, "step": 1599 }, { "epoch": 0.3663423010875787, "grad_norm": 1.1292928457260132, "learning_rate": 1.4653546363117063e-05, "loss": 1.0508, "step": 1600 }, { "epoch": 0.36657126502575843, "grad_norm": 1.2157028913497925, "learning_rate": 1.464697908552965e-05, "loss": 1.0545, "step": 1601 }, { "epoch": 0.36680022896393816, "grad_norm": 1.3973489999771118, "learning_rate": 1.4640409250748604e-05, "loss": 1.0731, "step": 1602 }, { "epoch": 0.36702919290211794, "grad_norm": 1.1359471082687378, "learning_rate": 1.4633836862389257e-05, "loss": 1.0881, "step": 1603 }, { "epoch": 0.36725815684029767, "grad_norm": 1.074826955795288, "learning_rate": 1.4627261924068329e-05, "loss": 1.0443, "step": 1604 }, { "epoch": 0.3674871207784774, "grad_norm": 1.4347003698349, "learning_rate": 1.4620684439403962e-05, "loss": 1.07, "step": 1605 }, { "epoch": 0.3677160847166571, "grad_norm": 1.0502759218215942, "learning_rate": 1.4614104412015688e-05, "loss": 1.0734, "step": 1606 }, { "epoch": 0.36794504865483685, "grad_norm": 1.6279913187026978, "learning_rate": 1.4607521845524439e-05, "loss": 1.0855, "step": 1607 }, { "epoch": 0.3681740125930166, "grad_norm": 1.2757513523101807, "learning_rate": 1.460093674355255e-05, "loss": 1.0385, "step": 1608 }, { "epoch": 0.36840297653119636, "grad_norm": 1.1416752338409424, "learning_rate": 1.4594349109723744e-05, "loss": 1.0783, "step": 1609 }, { "epoch": 0.3686319404693761, "grad_norm": 1.1832513809204102, "learning_rate": 1.4587758947663146e-05, "loss": 1.0836, "step": 1610 }, { "epoch": 0.3688609044075558, "grad_norm": 1.1325169801712036, "learning_rate": 1.4581166260997259e-05, "loss": 1.0372, "step": 1611 }, { "epoch": 0.36908986834573554, "grad_norm": 1.3580266237258911, "learning_rate": 1.4574571053353987e-05, "loss": 1.0604, "step": 1612 }, { "epoch": 0.36931883228391527, "grad_norm": 1.2545275688171387, "learning_rate": 1.4567973328362616e-05, "loss": 1.0859, "step": 1613 }, { "epoch": 0.369547796222095, "grad_norm": 1.1859338283538818, "learning_rate": 1.4561373089653823e-05, "loss": 1.1461, "step": 1614 }, { "epoch": 0.3697767601602748, "grad_norm": 1.2535377740859985, "learning_rate": 1.4554770340859661e-05, "loss": 1.0554, "step": 1615 }, { "epoch": 0.3700057240984545, "grad_norm": 1.490103840827942, "learning_rate": 1.4548165085613569e-05, "loss": 1.0364, "step": 1616 }, { "epoch": 0.37023468803663423, "grad_norm": 1.1767431497573853, "learning_rate": 1.454155732755036e-05, "loss": 1.0654, "step": 1617 }, { "epoch": 0.37046365197481396, "grad_norm": 1.525582194328308, "learning_rate": 1.453494707030623e-05, "loss": 1.0878, "step": 1618 }, { "epoch": 0.3706926159129937, "grad_norm": 1.5364714860916138, "learning_rate": 1.452833431751875e-05, "loss": 1.0832, "step": 1619 }, { "epoch": 0.37092157985117347, "grad_norm": 1.1335092782974243, "learning_rate": 1.4521719072826858e-05, "loss": 1.0521, "step": 1620 }, { "epoch": 0.3711505437893532, "grad_norm": 1.278962254524231, "learning_rate": 1.4515101339870871e-05, "loss": 1.0582, "step": 1621 }, { "epoch": 0.3713795077275329, "grad_norm": 1.424355387687683, "learning_rate": 1.4508481122292475e-05, "loss": 1.0583, "step": 1622 }, { "epoch": 0.37160847166571265, "grad_norm": 1.1989699602127075, "learning_rate": 1.4501858423734711e-05, "loss": 1.0488, "step": 1623 }, { "epoch": 0.3718374356038924, "grad_norm": 1.4514950513839722, "learning_rate": 1.4495233247842001e-05, "loss": 1.1174, "step": 1624 }, { "epoch": 0.3720663995420721, "grad_norm": 1.0482782125473022, "learning_rate": 1.4488605598260129e-05, "loss": 1.085, "step": 1625 }, { "epoch": 0.3722953634802519, "grad_norm": 1.019201636314392, "learning_rate": 1.448197547863622e-05, "loss": 1.0703, "step": 1626 }, { "epoch": 0.3725243274184316, "grad_norm": 1.3561880588531494, "learning_rate": 1.4475342892618792e-05, "loss": 1.0594, "step": 1627 }, { "epoch": 0.37275329135661134, "grad_norm": 1.0671643018722534, "learning_rate": 1.4468707843857683e-05, "loss": 1.0137, "step": 1628 }, { "epoch": 0.37298225529479107, "grad_norm": 1.2708206176757812, "learning_rate": 1.4462070336004117e-05, "loss": 1.0725, "step": 1629 }, { "epoch": 0.3732112192329708, "grad_norm": 2.9247419834136963, "learning_rate": 1.4455430372710652e-05, "loss": 1.144, "step": 1630 }, { "epoch": 0.3734401831711505, "grad_norm": 1.4302781820297241, "learning_rate": 1.4448787957631209e-05, "loss": 1.0814, "step": 1631 }, { "epoch": 0.3736691471093303, "grad_norm": 1.1992768049240112, "learning_rate": 1.4442143094421054e-05, "loss": 1.0457, "step": 1632 }, { "epoch": 0.37389811104751003, "grad_norm": 1.1916435956954956, "learning_rate": 1.4435495786736796e-05, "loss": 1.1704, "step": 1633 }, { "epoch": 0.37412707498568976, "grad_norm": 1.3071752786636353, "learning_rate": 1.4428846038236391e-05, "loss": 1.0534, "step": 1634 }, { "epoch": 0.3743560389238695, "grad_norm": 1.2490984201431274, "learning_rate": 1.4422193852579144e-05, "loss": 1.0872, "step": 1635 }, { "epoch": 0.3745850028620492, "grad_norm": 1.0837894678115845, "learning_rate": 1.4415539233425697e-05, "loss": 1.1409, "step": 1636 }, { "epoch": 0.37481396680022894, "grad_norm": 1.1508346796035767, "learning_rate": 1.4408882184438029e-05, "loss": 1.1122, "step": 1637 }, { "epoch": 0.3750429307384087, "grad_norm": 1.1703577041625977, "learning_rate": 1.4402222709279458e-05, "loss": 1.0457, "step": 1638 }, { "epoch": 0.37527189467658845, "grad_norm": 1.1867021322250366, "learning_rate": 1.439556081161464e-05, "loss": 1.0956, "step": 1639 }, { "epoch": 0.3755008586147682, "grad_norm": 1.2426538467407227, "learning_rate": 1.438889649510956e-05, "loss": 1.0897, "step": 1640 }, { "epoch": 0.3757298225529479, "grad_norm": 1.1953974962234497, "learning_rate": 1.4382229763431533e-05, "loss": 1.096, "step": 1641 }, { "epoch": 0.37595878649112763, "grad_norm": 1.0704538822174072, "learning_rate": 1.4375560620249209e-05, "loss": 1.079, "step": 1642 }, { "epoch": 0.3761877504293074, "grad_norm": 1.342177152633667, "learning_rate": 1.4368889069232559e-05, "loss": 1.1331, "step": 1643 }, { "epoch": 0.37641671436748714, "grad_norm": 1.2774497270584106, "learning_rate": 1.4362215114052887e-05, "loss": 1.0949, "step": 1644 }, { "epoch": 0.37664567830566686, "grad_norm": 1.1422805786132812, "learning_rate": 1.4355538758382805e-05, "loss": 1.087, "step": 1645 }, { "epoch": 0.3768746422438466, "grad_norm": 1.1978528499603271, "learning_rate": 1.4348860005896266e-05, "loss": 1.0468, "step": 1646 }, { "epoch": 0.3771036061820263, "grad_norm": 1.2687922716140747, "learning_rate": 1.4342178860268523e-05, "loss": 1.0217, "step": 1647 }, { "epoch": 0.37733257012020605, "grad_norm": 1.1119877099990845, "learning_rate": 1.433549532517616e-05, "loss": 1.0787, "step": 1648 }, { "epoch": 0.37756153405838583, "grad_norm": 0.9895734190940857, "learning_rate": 1.4328809404297068e-05, "loss": 1.0796, "step": 1649 }, { "epoch": 0.37779049799656556, "grad_norm": 1.1445213556289673, "learning_rate": 1.4322121101310454e-05, "loss": 1.0889, "step": 1650 }, { "epoch": 0.3780194619347453, "grad_norm": 1.115267276763916, "learning_rate": 1.4315430419896836e-05, "loss": 1.0857, "step": 1651 }, { "epoch": 0.378248425872925, "grad_norm": 1.3437042236328125, "learning_rate": 1.4308737363738035e-05, "loss": 1.0725, "step": 1652 }, { "epoch": 0.37847738981110474, "grad_norm": 1.1512742042541504, "learning_rate": 1.430204193651719e-05, "loss": 1.0748, "step": 1653 }, { "epoch": 0.37870635374928446, "grad_norm": 1.109114646911621, "learning_rate": 1.4295344141918734e-05, "loss": 1.1062, "step": 1654 }, { "epoch": 0.37893531768746425, "grad_norm": 1.2906250953674316, "learning_rate": 1.428864398362841e-05, "loss": 1.0753, "step": 1655 }, { "epoch": 0.379164281625644, "grad_norm": 1.2104160785675049, "learning_rate": 1.4281941465333255e-05, "loss": 1.0533, "step": 1656 }, { "epoch": 0.3793932455638237, "grad_norm": 1.1130504608154297, "learning_rate": 1.4275236590721615e-05, "loss": 1.0529, "step": 1657 }, { "epoch": 0.3796222095020034, "grad_norm": 1.3344502449035645, "learning_rate": 1.4268529363483124e-05, "loss": 1.1511, "step": 1658 }, { "epoch": 0.37985117344018315, "grad_norm": 1.1640992164611816, "learning_rate": 1.4261819787308708e-05, "loss": 1.0609, "step": 1659 }, { "epoch": 0.3800801373783629, "grad_norm": 1.4107555150985718, "learning_rate": 1.4255107865890597e-05, "loss": 1.0672, "step": 1660 }, { "epoch": 0.38030910131654266, "grad_norm": 1.214077115058899, "learning_rate": 1.4248393602922299e-05, "loss": 1.097, "step": 1661 }, { "epoch": 0.3805380652547224, "grad_norm": 1.1887096166610718, "learning_rate": 1.4241677002098622e-05, "loss": 1.0425, "step": 1662 }, { "epoch": 0.3807670291929021, "grad_norm": 1.1480146646499634, "learning_rate": 1.4234958067115652e-05, "loss": 1.0934, "step": 1663 }, { "epoch": 0.38099599313108184, "grad_norm": 1.6082152128219604, "learning_rate": 1.4228236801670762e-05, "loss": 1.0686, "step": 1664 }, { "epoch": 0.38122495706926157, "grad_norm": 1.2487170696258545, "learning_rate": 1.4221513209462615e-05, "loss": 1.041, "step": 1665 }, { "epoch": 0.38145392100744135, "grad_norm": 1.2354825735092163, "learning_rate": 1.4214787294191137e-05, "loss": 1.1181, "step": 1666 }, { "epoch": 0.3816828849456211, "grad_norm": 1.2729392051696777, "learning_rate": 1.4208059059557551e-05, "loss": 1.0707, "step": 1667 }, { "epoch": 0.3819118488838008, "grad_norm": 1.0977466106414795, "learning_rate": 1.420132850926434e-05, "loss": 1.087, "step": 1668 }, { "epoch": 0.38214081282198054, "grad_norm": 1.6709105968475342, "learning_rate": 1.419459564701528e-05, "loss": 1.1877, "step": 1669 }, { "epoch": 0.38236977676016026, "grad_norm": 1.3484159708023071, "learning_rate": 1.41878604765154e-05, "loss": 1.1141, "step": 1670 }, { "epoch": 0.38259874069834, "grad_norm": 1.3736979961395264, "learning_rate": 1.4181123001471012e-05, "loss": 1.0962, "step": 1671 }, { "epoch": 0.38282770463651977, "grad_norm": 2.105513572692871, "learning_rate": 1.4174383225589691e-05, "loss": 1.1048, "step": 1672 }, { "epoch": 0.3830566685746995, "grad_norm": 1.2961084842681885, "learning_rate": 1.4167641152580278e-05, "loss": 1.0663, "step": 1673 }, { "epoch": 0.3832856325128792, "grad_norm": 1.142513394355774, "learning_rate": 1.416089678615288e-05, "loss": 1.0686, "step": 1674 }, { "epoch": 0.38351459645105895, "grad_norm": 1.188399314880371, "learning_rate": 1.4154150130018867e-05, "loss": 1.1332, "step": 1675 }, { "epoch": 0.3837435603892387, "grad_norm": 1.2411218881607056, "learning_rate": 1.4147401187890863e-05, "loss": 1.1118, "step": 1676 }, { "epoch": 0.3839725243274184, "grad_norm": 1.4901659488677979, "learning_rate": 1.4140649963482763e-05, "loss": 1.0477, "step": 1677 }, { "epoch": 0.3842014882655982, "grad_norm": 1.0591933727264404, "learning_rate": 1.4133896460509695e-05, "loss": 1.0359, "step": 1678 }, { "epoch": 0.3844304522037779, "grad_norm": 1.8987795114517212, "learning_rate": 1.412714068268807e-05, "loss": 1.0438, "step": 1679 }, { "epoch": 0.38465941614195764, "grad_norm": 1.1370187997817993, "learning_rate": 1.4120382633735528e-05, "loss": 1.0796, "step": 1680 }, { "epoch": 0.38488838008013737, "grad_norm": 1.178002953529358, "learning_rate": 1.4113622317370965e-05, "loss": 1.0361, "step": 1681 }, { "epoch": 0.3851173440183171, "grad_norm": 0.9956271052360535, "learning_rate": 1.4106859737314532e-05, "loss": 1.0649, "step": 1682 }, { "epoch": 0.3853463079564968, "grad_norm": 1.285373330116272, "learning_rate": 1.4100094897287618e-05, "loss": 1.1628, "step": 1683 }, { "epoch": 0.3855752718946766, "grad_norm": 1.119768500328064, "learning_rate": 1.4093327801012854e-05, "loss": 1.1083, "step": 1684 }, { "epoch": 0.38580423583285633, "grad_norm": 1.304608702659607, "learning_rate": 1.4086558452214121e-05, "loss": 1.0883, "step": 1685 }, { "epoch": 0.38603319977103606, "grad_norm": 1.1423434019088745, "learning_rate": 1.4079786854616537e-05, "loss": 1.048, "step": 1686 }, { "epoch": 0.3862621637092158, "grad_norm": 1.159722089767456, "learning_rate": 1.4073013011946449e-05, "loss": 1.0642, "step": 1687 }, { "epoch": 0.3864911276473955, "grad_norm": 1.1652872562408447, "learning_rate": 1.4066236927931447e-05, "loss": 1.1185, "step": 1688 }, { "epoch": 0.3867200915855753, "grad_norm": 1.3069536685943604, "learning_rate": 1.4059458606300358e-05, "loss": 1.1017, "step": 1689 }, { "epoch": 0.386949055523755, "grad_norm": 1.2639847993850708, "learning_rate": 1.405267805078323e-05, "loss": 1.1014, "step": 1690 }, { "epoch": 0.38717801946193475, "grad_norm": 1.3884036540985107, "learning_rate": 1.4045895265111352e-05, "loss": 1.099, "step": 1691 }, { "epoch": 0.3874069834001145, "grad_norm": 1.3688089847564697, "learning_rate": 1.4039110253017225e-05, "loss": 1.0532, "step": 1692 }, { "epoch": 0.3876359473382942, "grad_norm": 1.2900769710540771, "learning_rate": 1.4032323018234592e-05, "loss": 1.0949, "step": 1693 }, { "epoch": 0.38786491127647393, "grad_norm": 1.3169381618499756, "learning_rate": 1.4025533564498411e-05, "loss": 1.1092, "step": 1694 }, { "epoch": 0.3880938752146537, "grad_norm": 1.2723472118377686, "learning_rate": 1.401874189554486e-05, "loss": 1.0769, "step": 1695 }, { "epoch": 0.38832283915283344, "grad_norm": 1.1920543909072876, "learning_rate": 1.4011948015111334e-05, "loss": 1.1574, "step": 1696 }, { "epoch": 0.38855180309101317, "grad_norm": 1.2525862455368042, "learning_rate": 1.400515192693645e-05, "loss": 1.1647, "step": 1697 }, { "epoch": 0.3887807670291929, "grad_norm": 9.487869262695312, "learning_rate": 1.3998353634760044e-05, "loss": 1.1278, "step": 1698 }, { "epoch": 0.3890097309673726, "grad_norm": 1.6089351177215576, "learning_rate": 1.3991553142323156e-05, "loss": 1.1101, "step": 1699 }, { "epoch": 0.38923869490555235, "grad_norm": 1.3901886940002441, "learning_rate": 1.3984750453368033e-05, "loss": 1.1249, "step": 1700 }, { "epoch": 0.38946765884373213, "grad_norm": 3.7920846939086914, "learning_rate": 1.397794557163815e-05, "loss": 1.0998, "step": 1701 }, { "epoch": 0.38969662278191186, "grad_norm": 1.1165345907211304, "learning_rate": 1.3971138500878166e-05, "loss": 1.0719, "step": 1702 }, { "epoch": 0.3899255867200916, "grad_norm": 1.2537471055984497, "learning_rate": 1.396432924483396e-05, "loss": 1.0601, "step": 1703 }, { "epoch": 0.3901545506582713, "grad_norm": 1.2301760911941528, "learning_rate": 1.3957517807252607e-05, "loss": 1.1573, "step": 1704 }, { "epoch": 0.39038351459645104, "grad_norm": 1.645586609840393, "learning_rate": 1.3950704191882388e-05, "loss": 1.1273, "step": 1705 }, { "epoch": 0.39061247853463077, "grad_norm": 1.3407573699951172, "learning_rate": 1.3943888402472771e-05, "loss": 1.0567, "step": 1706 }, { "epoch": 0.39084144247281055, "grad_norm": 2.350411891937256, "learning_rate": 1.393707044277443e-05, "loss": 1.0961, "step": 1707 }, { "epoch": 0.3910704064109903, "grad_norm": 1.187579870223999, "learning_rate": 1.3930250316539237e-05, "loss": 1.0545, "step": 1708 }, { "epoch": 0.39129937034917, "grad_norm": 1.091801404953003, "learning_rate": 1.3923428027520246e-05, "loss": 1.0505, "step": 1709 }, { "epoch": 0.39152833428734973, "grad_norm": 1.194849967956543, "learning_rate": 1.3916603579471705e-05, "loss": 1.0593, "step": 1710 }, { "epoch": 0.39175729822552946, "grad_norm": 1.1145169734954834, "learning_rate": 1.3909776976149047e-05, "loss": 1.051, "step": 1711 }, { "epoch": 0.39198626216370924, "grad_norm": 1.3400248289108276, "learning_rate": 1.3902948221308903e-05, "loss": 1.0907, "step": 1712 }, { "epoch": 0.39221522610188897, "grad_norm": 1.3348122835159302, "learning_rate": 1.3896117318709074e-05, "loss": 1.0244, "step": 1713 }, { "epoch": 0.3924441900400687, "grad_norm": 1.1804814338684082, "learning_rate": 1.388928427210855e-05, "loss": 1.1056, "step": 1714 }, { "epoch": 0.3926731539782484, "grad_norm": 1.4333142042160034, "learning_rate": 1.3882449085267497e-05, "loss": 1.1095, "step": 1715 }, { "epoch": 0.39290211791642815, "grad_norm": 1.10541570186615, "learning_rate": 1.3875611761947264e-05, "loss": 1.1461, "step": 1716 }, { "epoch": 0.3931310818546079, "grad_norm": 1.3302122354507446, "learning_rate": 1.3868772305910376e-05, "loss": 1.043, "step": 1717 }, { "epoch": 0.39336004579278766, "grad_norm": 1.2536112070083618, "learning_rate": 1.3861930720920518e-05, "loss": 1.0733, "step": 1718 }, { "epoch": 0.3935890097309674, "grad_norm": 1.1884063482284546, "learning_rate": 1.3855087010742563e-05, "loss": 1.125, "step": 1719 }, { "epoch": 0.3938179736691471, "grad_norm": 1.491746187210083, "learning_rate": 1.384824117914255e-05, "loss": 1.071, "step": 1720 }, { "epoch": 0.39404693760732684, "grad_norm": 1.331817865371704, "learning_rate": 1.3841393229887676e-05, "loss": 1.084, "step": 1721 }, { "epoch": 0.39427590154550657, "grad_norm": 1.232057809829712, "learning_rate": 1.3834543166746317e-05, "loss": 1.1158, "step": 1722 }, { "epoch": 0.3945048654836863, "grad_norm": 1.0441474914550781, "learning_rate": 1.3827690993488e-05, "loss": 1.1093, "step": 1723 }, { "epoch": 0.3947338294218661, "grad_norm": 1.1634544134140015, "learning_rate": 1.3820836713883424e-05, "loss": 1.0904, "step": 1724 }, { "epoch": 0.3949627933600458, "grad_norm": 1.3219141960144043, "learning_rate": 1.3813980331704437e-05, "loss": 1.0814, "step": 1725 }, { "epoch": 0.39519175729822553, "grad_norm": 1.0469324588775635, "learning_rate": 1.3807121850724045e-05, "loss": 1.0653, "step": 1726 }, { "epoch": 0.39542072123640526, "grad_norm": 1.1637883186340332, "learning_rate": 1.3800261274716424e-05, "loss": 1.0838, "step": 1727 }, { "epoch": 0.395649685174585, "grad_norm": 1.1523021459579468, "learning_rate": 1.3793398607456883e-05, "loss": 1.0906, "step": 1728 }, { "epoch": 0.3958786491127647, "grad_norm": 1.1947762966156006, "learning_rate": 1.3786533852721891e-05, "loss": 1.0937, "step": 1729 }, { "epoch": 0.3961076130509445, "grad_norm": 1.565125584602356, "learning_rate": 1.3779667014289067e-05, "loss": 1.0973, "step": 1730 }, { "epoch": 0.3963365769891242, "grad_norm": 1.6443101167678833, "learning_rate": 1.3772798095937172e-05, "loss": 1.0978, "step": 1731 }, { "epoch": 0.39656554092730395, "grad_norm": 1.2846298217773438, "learning_rate": 1.3765927101446121e-05, "loss": 1.057, "step": 1732 }, { "epoch": 0.3967945048654837, "grad_norm": 1.245452880859375, "learning_rate": 1.3759054034596953e-05, "loss": 1.1326, "step": 1733 }, { "epoch": 0.3970234688036634, "grad_norm": 1.2566457986831665, "learning_rate": 1.375217889917187e-05, "loss": 1.1039, "step": 1734 }, { "epoch": 0.3972524327418432, "grad_norm": 1.0944688320159912, "learning_rate": 1.3745301698954197e-05, "loss": 1.076, "step": 1735 }, { "epoch": 0.3974813966800229, "grad_norm": 1.7811460494995117, "learning_rate": 1.3738422437728398e-05, "loss": 1.0675, "step": 1736 }, { "epoch": 0.39771036061820264, "grad_norm": 2.1263771057128906, "learning_rate": 1.3731541119280073e-05, "loss": 1.0859, "step": 1737 }, { "epoch": 0.39793932455638237, "grad_norm": 1.2063853740692139, "learning_rate": 1.3724657747395957e-05, "loss": 1.1231, "step": 1738 }, { "epoch": 0.3981682884945621, "grad_norm": 1.1201328039169312, "learning_rate": 1.3717772325863913e-05, "loss": 1.1532, "step": 1739 }, { "epoch": 0.3983972524327418, "grad_norm": 1.3484069108963013, "learning_rate": 1.3710884858472926e-05, "loss": 1.0621, "step": 1740 }, { "epoch": 0.3986262163709216, "grad_norm": 1.4419175386428833, "learning_rate": 1.3703995349013113e-05, "loss": 1.0816, "step": 1741 }, { "epoch": 0.39885518030910133, "grad_norm": 1.026955246925354, "learning_rate": 1.3697103801275714e-05, "loss": 1.0184, "step": 1742 }, { "epoch": 0.39908414424728106, "grad_norm": 1.369357943534851, "learning_rate": 1.3690210219053088e-05, "loss": 1.0637, "step": 1743 }, { "epoch": 0.3993131081854608, "grad_norm": 1.0927389860153198, "learning_rate": 1.3683314606138718e-05, "loss": 1.0612, "step": 1744 }, { "epoch": 0.3995420721236405, "grad_norm": 1.3103286027908325, "learning_rate": 1.3676416966327201e-05, "loss": 1.0001, "step": 1745 }, { "epoch": 0.39977103606182024, "grad_norm": 1.1909055709838867, "learning_rate": 1.3669517303414254e-05, "loss": 1.0591, "step": 1746 }, { "epoch": 0.4, "grad_norm": 1.6235116720199585, "learning_rate": 1.3662615621196697e-05, "loss": 1.1164, "step": 1747 }, { "epoch": 0.40022896393817975, "grad_norm": 0.9482841491699219, "learning_rate": 1.3655711923472472e-05, "loss": 1.0182, "step": 1748 }, { "epoch": 0.4004579278763595, "grad_norm": 1.02751886844635, "learning_rate": 1.3648806214040625e-05, "loss": 1.0817, "step": 1749 }, { "epoch": 0.4006868918145392, "grad_norm": 1.257691502571106, "learning_rate": 1.3641898496701306e-05, "loss": 1.1007, "step": 1750 }, { "epoch": 0.40091585575271893, "grad_norm": 1.1339843273162842, "learning_rate": 1.3634988775255785e-05, "loss": 1.0948, "step": 1751 }, { "epoch": 0.40114481969089866, "grad_norm": 2.271824836730957, "learning_rate": 1.362807705350641e-05, "loss": 1.0535, "step": 1752 }, { "epoch": 0.40137378362907844, "grad_norm": 1.175581693649292, "learning_rate": 1.3621163335256655e-05, "loss": 1.0902, "step": 1753 }, { "epoch": 0.40160274756725817, "grad_norm": 0.9900003671646118, "learning_rate": 1.3614247624311076e-05, "loss": 1.0428, "step": 1754 }, { "epoch": 0.4018317115054379, "grad_norm": 1.2661924362182617, "learning_rate": 1.360732992447533e-05, "loss": 1.0597, "step": 1755 }, { "epoch": 0.4020606754436176, "grad_norm": 1.1470941305160522, "learning_rate": 1.360041023955617e-05, "loss": 1.1798, "step": 1756 }, { "epoch": 0.40228963938179735, "grad_norm": 1.8923383951187134, "learning_rate": 1.3593488573361442e-05, "loss": 1.0507, "step": 1757 }, { "epoch": 0.40251860331997713, "grad_norm": 1.0174728631973267, "learning_rate": 1.358656492970008e-05, "loss": 1.0249, "step": 1758 }, { "epoch": 0.40274756725815686, "grad_norm": 1.094590663909912, "learning_rate": 1.3579639312382105e-05, "loss": 1.0823, "step": 1759 }, { "epoch": 0.4029765311963366, "grad_norm": 1.2006431818008423, "learning_rate": 1.357271172521863e-05, "loss": 1.0586, "step": 1760 }, { "epoch": 0.4032054951345163, "grad_norm": 1.0652323961257935, "learning_rate": 1.3565782172021847e-05, "loss": 1.0759, "step": 1761 }, { "epoch": 0.40343445907269604, "grad_norm": 1.5044901371002197, "learning_rate": 1.3558850656605028e-05, "loss": 1.1158, "step": 1762 }, { "epoch": 0.40366342301087577, "grad_norm": 1.201694130897522, "learning_rate": 1.355191718278253e-05, "loss": 1.1144, "step": 1763 }, { "epoch": 0.40389238694905555, "grad_norm": 1.799466609954834, "learning_rate": 1.3544981754369789e-05, "loss": 1.0865, "step": 1764 }, { "epoch": 0.4041213508872353, "grad_norm": 1.0255722999572754, "learning_rate": 1.3538044375183308e-05, "loss": 1.071, "step": 1765 }, { "epoch": 0.404350314825415, "grad_norm": 1.148310661315918, "learning_rate": 1.3531105049040667e-05, "loss": 1.0982, "step": 1766 }, { "epoch": 0.40457927876359473, "grad_norm": 1.1922695636749268, "learning_rate": 1.3524163779760526e-05, "loss": 1.1228, "step": 1767 }, { "epoch": 0.40480824270177446, "grad_norm": 1.0259567499160767, "learning_rate": 1.3517220571162603e-05, "loss": 1.0426, "step": 1768 }, { "epoch": 0.4050372066399542, "grad_norm": 1.3682341575622559, "learning_rate": 1.3510275427067688e-05, "loss": 1.11, "step": 1769 }, { "epoch": 0.40526617057813397, "grad_norm": 1.16379976272583, "learning_rate": 1.3503328351297635e-05, "loss": 1.0049, "step": 1770 }, { "epoch": 0.4054951345163137, "grad_norm": 1.128479242324829, "learning_rate": 1.3496379347675364e-05, "loss": 1.0751, "step": 1771 }, { "epoch": 0.4057240984544934, "grad_norm": 1.2324275970458984, "learning_rate": 1.3489428420024851e-05, "loss": 1.0561, "step": 1772 }, { "epoch": 0.40595306239267315, "grad_norm": 1.2429150342941284, "learning_rate": 1.3482475572171132e-05, "loss": 1.0709, "step": 1773 }, { "epoch": 0.4061820263308529, "grad_norm": 1.3965815305709839, "learning_rate": 1.3475520807940303e-05, "loss": 1.1079, "step": 1774 }, { "epoch": 0.4064109902690326, "grad_norm": 1.0004675388336182, "learning_rate": 1.3468564131159515e-05, "loss": 1.0772, "step": 1775 }, { "epoch": 0.4066399542072124, "grad_norm": 1.2566231489181519, "learning_rate": 1.3461605545656961e-05, "loss": 1.0527, "step": 1776 }, { "epoch": 0.4068689181453921, "grad_norm": 1.2383748292922974, "learning_rate": 1.3454645055261903e-05, "loss": 1.0754, "step": 1777 }, { "epoch": 0.40709788208357184, "grad_norm": 1.3026020526885986, "learning_rate": 1.344768266380463e-05, "loss": 1.0667, "step": 1778 }, { "epoch": 0.40732684602175157, "grad_norm": 1.141505241394043, "learning_rate": 1.3440718375116497e-05, "loss": 1.184, "step": 1779 }, { "epoch": 0.4075558099599313, "grad_norm": 1.1129202842712402, "learning_rate": 1.3433752193029888e-05, "loss": 1.0463, "step": 1780 }, { "epoch": 0.4077847738981111, "grad_norm": 0.9912295341491699, "learning_rate": 1.3426784121378233e-05, "loss": 1.1277, "step": 1781 }, { "epoch": 0.4080137378362908, "grad_norm": 1.029266119003296, "learning_rate": 1.3419814163996007e-05, "loss": 1.0719, "step": 1782 }, { "epoch": 0.40824270177447053, "grad_norm": 1.1671353578567505, "learning_rate": 1.341284232471872e-05, "loss": 1.0912, "step": 1783 }, { "epoch": 0.40847166571265026, "grad_norm": 1.2449313402175903, "learning_rate": 1.3405868607382914e-05, "loss": 1.0755, "step": 1784 }, { "epoch": 0.40870062965083, "grad_norm": 1.078808069229126, "learning_rate": 1.3398893015826166e-05, "loss": 1.0852, "step": 1785 }, { "epoch": 0.4089295935890097, "grad_norm": 1.0642139911651611, "learning_rate": 1.3391915553887093e-05, "loss": 1.0864, "step": 1786 }, { "epoch": 0.4091585575271895, "grad_norm": 1.050003170967102, "learning_rate": 1.3384936225405326e-05, "loss": 1.095, "step": 1787 }, { "epoch": 0.4093875214653692, "grad_norm": 1.2181106805801392, "learning_rate": 1.3377955034221532e-05, "loss": 1.1276, "step": 1788 }, { "epoch": 0.40961648540354895, "grad_norm": 1.3170802593231201, "learning_rate": 1.3370971984177406e-05, "loss": 1.0547, "step": 1789 }, { "epoch": 0.4098454493417287, "grad_norm": 1.1851341724395752, "learning_rate": 1.3363987079115656e-05, "loss": 1.0538, "step": 1790 }, { "epoch": 0.4100744132799084, "grad_norm": 1.218276023864746, "learning_rate": 1.3357000322880024e-05, "loss": 1.0555, "step": 1791 }, { "epoch": 0.41030337721808813, "grad_norm": 1.2883524894714355, "learning_rate": 1.3350011719315257e-05, "loss": 1.0051, "step": 1792 }, { "epoch": 0.4105323411562679, "grad_norm": 1.1477938890457153, "learning_rate": 1.334302127226713e-05, "loss": 1.0466, "step": 1793 }, { "epoch": 0.41076130509444764, "grad_norm": 1.0580675601959229, "learning_rate": 1.333602898558242e-05, "loss": 1.0661, "step": 1794 }, { "epoch": 0.41099026903262736, "grad_norm": 1.4260722398757935, "learning_rate": 1.3329034863108932e-05, "loss": 1.0466, "step": 1795 }, { "epoch": 0.4112192329708071, "grad_norm": 1.1878693103790283, "learning_rate": 1.3322038908695466e-05, "loss": 1.1458, "step": 1796 }, { "epoch": 0.4114481969089868, "grad_norm": 1.031032919883728, "learning_rate": 1.3315041126191845e-05, "loss": 1.0114, "step": 1797 }, { "epoch": 0.41167716084716655, "grad_norm": 1.2380294799804688, "learning_rate": 1.3308041519448885e-05, "loss": 1.0351, "step": 1798 }, { "epoch": 0.41190612478534633, "grad_norm": 1.2293623685836792, "learning_rate": 1.3301040092318409e-05, "loss": 1.0952, "step": 1799 }, { "epoch": 0.41213508872352606, "grad_norm": 1.1843045949935913, "learning_rate": 1.3294036848653247e-05, "loss": 1.0966, "step": 1800 }, { "epoch": 0.4123640526617058, "grad_norm": 1.0566411018371582, "learning_rate": 1.3287031792307226e-05, "loss": 1.0596, "step": 1801 }, { "epoch": 0.4125930165998855, "grad_norm": 1.1404145956039429, "learning_rate": 1.3280024927135165e-05, "loss": 1.0058, "step": 1802 }, { "epoch": 0.41282198053806524, "grad_norm": 1.210115909576416, "learning_rate": 1.3273016256992888e-05, "loss": 1.0835, "step": 1803 }, { "epoch": 0.413050944476245, "grad_norm": 1.214924693107605, "learning_rate": 1.3266005785737206e-05, "loss": 1.0879, "step": 1804 }, { "epoch": 0.41327990841442475, "grad_norm": 1.529047966003418, "learning_rate": 1.3258993517225923e-05, "loss": 1.1117, "step": 1805 }, { "epoch": 0.4135088723526045, "grad_norm": 1.181199312210083, "learning_rate": 1.3251979455317831e-05, "loss": 1.1248, "step": 1806 }, { "epoch": 0.4137378362907842, "grad_norm": 1.1865763664245605, "learning_rate": 1.3244963603872707e-05, "loss": 1.1102, "step": 1807 }, { "epoch": 0.4139668002289639, "grad_norm": 1.2326041460037231, "learning_rate": 1.323794596675132e-05, "loss": 1.0253, "step": 1808 }, { "epoch": 0.41419576416714365, "grad_norm": 1.0496649742126465, "learning_rate": 1.3230926547815413e-05, "loss": 1.0957, "step": 1809 }, { "epoch": 0.41442472810532344, "grad_norm": 1.4039772748947144, "learning_rate": 1.3223905350927718e-05, "loss": 1.055, "step": 1810 }, { "epoch": 0.41465369204350316, "grad_norm": 1.393269658088684, "learning_rate": 1.3216882379951932e-05, "loss": 1.1233, "step": 1811 }, { "epoch": 0.4148826559816829, "grad_norm": 1.2282973527908325, "learning_rate": 1.3209857638752746e-05, "loss": 1.0997, "step": 1812 }, { "epoch": 0.4151116199198626, "grad_norm": 1.080151081085205, "learning_rate": 1.3202831131195812e-05, "loss": 1.0984, "step": 1813 }, { "epoch": 0.41534058385804234, "grad_norm": 1.2916617393493652, "learning_rate": 1.3195802861147759e-05, "loss": 1.1306, "step": 1814 }, { "epoch": 0.41556954779622207, "grad_norm": 1.297864317893982, "learning_rate": 1.318877283247619e-05, "loss": 1.1278, "step": 1815 }, { "epoch": 0.41579851173440185, "grad_norm": 1.1242778301239014, "learning_rate": 1.3181741049049659e-05, "loss": 1.0858, "step": 1816 }, { "epoch": 0.4160274756725816, "grad_norm": 1.1434422731399536, "learning_rate": 1.3174707514737706e-05, "loss": 1.0437, "step": 1817 }, { "epoch": 0.4162564396107613, "grad_norm": 1.0877010822296143, "learning_rate": 1.3167672233410826e-05, "loss": 1.1197, "step": 1818 }, { "epoch": 0.41648540354894104, "grad_norm": 1.1932451725006104, "learning_rate": 1.3160635208940473e-05, "loss": 1.1059, "step": 1819 }, { "epoch": 0.41671436748712076, "grad_norm": 1.4846535921096802, "learning_rate": 1.3153596445199063e-05, "loss": 1.0637, "step": 1820 }, { "epoch": 0.4169433314253005, "grad_norm": 1.1383739709854126, "learning_rate": 1.3146555946059971e-05, "loss": 1.1125, "step": 1821 }, { "epoch": 0.41717229536348027, "grad_norm": 1.1333290338516235, "learning_rate": 1.3139513715397521e-05, "loss": 1.0098, "step": 1822 }, { "epoch": 0.41740125930166, "grad_norm": 0.9758977293968201, "learning_rate": 1.3132469757086997e-05, "loss": 1.0442, "step": 1823 }, { "epoch": 0.4176302232398397, "grad_norm": 1.221085786819458, "learning_rate": 1.3125424075004624e-05, "loss": 1.0796, "step": 1824 }, { "epoch": 0.41785918717801945, "grad_norm": 1.830741286277771, "learning_rate": 1.3118376673027588e-05, "loss": 1.078, "step": 1825 }, { "epoch": 0.4180881511161992, "grad_norm": 1.274835467338562, "learning_rate": 1.3111327555034015e-05, "loss": 1.0829, "step": 1826 }, { "epoch": 0.41831711505437896, "grad_norm": 1.2003624439239502, "learning_rate": 1.3104276724902975e-05, "loss": 1.0118, "step": 1827 }, { "epoch": 0.4185460789925587, "grad_norm": 1.195296049118042, "learning_rate": 1.3097224186514476e-05, "loss": 1.0357, "step": 1828 }, { "epoch": 0.4187750429307384, "grad_norm": 1.2735960483551025, "learning_rate": 1.3090169943749475e-05, "loss": 1.0895, "step": 1829 }, { "epoch": 0.41900400686891814, "grad_norm": 1.1189090013504028, "learning_rate": 1.3083114000489863e-05, "loss": 1.0777, "step": 1830 }, { "epoch": 0.41923297080709787, "grad_norm": 1.1308891773223877, "learning_rate": 1.3076056360618465e-05, "loss": 1.0817, "step": 1831 }, { "epoch": 0.4194619347452776, "grad_norm": 1.2105430364608765, "learning_rate": 1.3068997028019043e-05, "loss": 1.0939, "step": 1832 }, { "epoch": 0.4196908986834574, "grad_norm": 1.4238054752349854, "learning_rate": 1.3061936006576284e-05, "loss": 1.0822, "step": 1833 }, { "epoch": 0.4199198626216371, "grad_norm": 1.037213683128357, "learning_rate": 1.3054873300175814e-05, "loss": 1.0473, "step": 1834 }, { "epoch": 0.42014882655981683, "grad_norm": 1.4768306016921997, "learning_rate": 1.3047808912704178e-05, "loss": 1.0661, "step": 1835 }, { "epoch": 0.42037779049799656, "grad_norm": 1.2641279697418213, "learning_rate": 1.304074284804885e-05, "loss": 1.0562, "step": 1836 }, { "epoch": 0.4206067544361763, "grad_norm": 1.1807743310928345, "learning_rate": 1.3033675110098227e-05, "loss": 1.1504, "step": 1837 }, { "epoch": 0.420835718374356, "grad_norm": 1.099843978881836, "learning_rate": 1.3026605702741625e-05, "loss": 1.0742, "step": 1838 }, { "epoch": 0.4210646823125358, "grad_norm": 1.1367111206054688, "learning_rate": 1.3019534629869281e-05, "loss": 1.1427, "step": 1839 }, { "epoch": 0.4212936462507155, "grad_norm": 1.4131443500518799, "learning_rate": 1.3012461895372343e-05, "loss": 1.0812, "step": 1840 }, { "epoch": 0.42152261018889525, "grad_norm": 1.1341830492019653, "learning_rate": 1.3005387503142884e-05, "loss": 1.0982, "step": 1841 }, { "epoch": 0.421751574127075, "grad_norm": 1.1615837812423706, "learning_rate": 1.2998311457073878e-05, "loss": 1.0308, "step": 1842 }, { "epoch": 0.4219805380652547, "grad_norm": 1.2500286102294922, "learning_rate": 1.2991233761059214e-05, "loss": 1.0998, "step": 1843 }, { "epoch": 0.42220950200343443, "grad_norm": 1.2073861360549927, "learning_rate": 1.2984154418993691e-05, "loss": 1.0588, "step": 1844 }, { "epoch": 0.4224384659416142, "grad_norm": 1.1415460109710693, "learning_rate": 1.2977073434773009e-05, "loss": 1.0693, "step": 1845 }, { "epoch": 0.42266742987979394, "grad_norm": 1.4329516887664795, "learning_rate": 1.2969990812293779e-05, "loss": 1.0938, "step": 1846 }, { "epoch": 0.42289639381797367, "grad_norm": 1.2857966423034668, "learning_rate": 1.29629065554535e-05, "loss": 1.0196, "step": 1847 }, { "epoch": 0.4231253577561534, "grad_norm": 1.0637885332107544, "learning_rate": 1.2955820668150587e-05, "loss": 1.0602, "step": 1848 }, { "epoch": 0.4233543216943331, "grad_norm": 1.3474278450012207, "learning_rate": 1.2948733154284343e-05, "loss": 1.135, "step": 1849 }, { "epoch": 0.42358328563251285, "grad_norm": 1.1365108489990234, "learning_rate": 1.2941644017754964e-05, "loss": 1.1279, "step": 1850 }, { "epoch": 0.42381224957069263, "grad_norm": 1.1364717483520508, "learning_rate": 1.2934553262463548e-05, "loss": 1.0466, "step": 1851 }, { "epoch": 0.42404121350887236, "grad_norm": 1.122428059577942, "learning_rate": 1.2927460892312071e-05, "loss": 1.0606, "step": 1852 }, { "epoch": 0.4242701774470521, "grad_norm": 1.1230154037475586, "learning_rate": 1.2920366911203414e-05, "loss": 1.0651, "step": 1853 }, { "epoch": 0.4244991413852318, "grad_norm": 1.150088906288147, "learning_rate": 1.2913271323041328e-05, "loss": 1.0204, "step": 1854 }, { "epoch": 0.42472810532341154, "grad_norm": 1.1731511354446411, "learning_rate": 1.2906174131730458e-05, "loss": 1.1506, "step": 1855 }, { "epoch": 0.4249570692615913, "grad_norm": 1.1663103103637695, "learning_rate": 1.2899075341176326e-05, "loss": 1.0903, "step": 1856 }, { "epoch": 0.42518603319977105, "grad_norm": 1.0978206396102905, "learning_rate": 1.289197495528534e-05, "loss": 1.0315, "step": 1857 }, { "epoch": 0.4254149971379508, "grad_norm": 1.842656135559082, "learning_rate": 1.2884872977964786e-05, "loss": 1.0962, "step": 1858 }, { "epoch": 0.4256439610761305, "grad_norm": 1.1557258367538452, "learning_rate": 1.2877769413122816e-05, "loss": 1.1225, "step": 1859 }, { "epoch": 0.42587292501431023, "grad_norm": 1.0806372165679932, "learning_rate": 1.2870664264668467e-05, "loss": 1.0553, "step": 1860 }, { "epoch": 0.42610188895248996, "grad_norm": 1.3559293746948242, "learning_rate": 1.2863557536511642e-05, "loss": 1.0912, "step": 1861 }, { "epoch": 0.42633085289066974, "grad_norm": 1.2274169921875, "learning_rate": 1.285644923256311e-05, "loss": 1.1026, "step": 1862 }, { "epoch": 0.42655981682884947, "grad_norm": 1.467261791229248, "learning_rate": 1.2849339356734513e-05, "loss": 1.0609, "step": 1863 }, { "epoch": 0.4267887807670292, "grad_norm": 1.08341646194458, "learning_rate": 1.284222791293836e-05, "loss": 1.0122, "step": 1864 }, { "epoch": 0.4270177447052089, "grad_norm": 1.18088960647583, "learning_rate": 1.2835114905088013e-05, "loss": 1.0016, "step": 1865 }, { "epoch": 0.42724670864338865, "grad_norm": 3.6483867168426514, "learning_rate": 1.2828000337097703e-05, "loss": 1.0826, "step": 1866 }, { "epoch": 0.4274756725815684, "grad_norm": 1.3066470623016357, "learning_rate": 1.2820884212882521e-05, "loss": 1.0831, "step": 1867 }, { "epoch": 0.42770463651974816, "grad_norm": 1.1415867805480957, "learning_rate": 1.2813766536358406e-05, "loss": 1.0411, "step": 1868 }, { "epoch": 0.4279336004579279, "grad_norm": 1.0170698165893555, "learning_rate": 1.2806647311442153e-05, "loss": 1.0385, "step": 1869 }, { "epoch": 0.4281625643961076, "grad_norm": 1.0501190423965454, "learning_rate": 1.2799526542051419e-05, "loss": 1.0581, "step": 1870 }, { "epoch": 0.42839152833428734, "grad_norm": 1.1775119304656982, "learning_rate": 1.2792404232104699e-05, "loss": 1.1704, "step": 1871 }, { "epoch": 0.42862049227246707, "grad_norm": 1.1157633066177368, "learning_rate": 1.2785280385521342e-05, "loss": 1.0301, "step": 1872 }, { "epoch": 0.4288494562106468, "grad_norm": 1.531984567642212, "learning_rate": 1.277815500622154e-05, "loss": 1.0217, "step": 1873 }, { "epoch": 0.4290784201488266, "grad_norm": 4.276288986206055, "learning_rate": 1.2771028098126333e-05, "loss": 1.0464, "step": 1874 }, { "epoch": 0.4293073840870063, "grad_norm": 1.2262746095657349, "learning_rate": 1.2763899665157591e-05, "loss": 1.0452, "step": 1875 }, { "epoch": 0.42953634802518603, "grad_norm": 1.0891977548599243, "learning_rate": 1.275676971123804e-05, "loss": 1.0315, "step": 1876 }, { "epoch": 0.42976531196336576, "grad_norm": 3.1425435543060303, "learning_rate": 1.2749638240291227e-05, "loss": 1.1258, "step": 1877 }, { "epoch": 0.4299942759015455, "grad_norm": 1.209585428237915, "learning_rate": 1.2742505256241543e-05, "loss": 1.074, "step": 1878 }, { "epoch": 0.43022323983972527, "grad_norm": 1.5271508693695068, "learning_rate": 1.2735370763014212e-05, "loss": 1.1113, "step": 1879 }, { "epoch": 0.430452203777905, "grad_norm": 1.2672470808029175, "learning_rate": 1.2728234764535283e-05, "loss": 1.1049, "step": 1880 }, { "epoch": 0.4306811677160847, "grad_norm": 1.2539156675338745, "learning_rate": 1.2721097264731634e-05, "loss": 1.0472, "step": 1881 }, { "epoch": 0.43091013165426445, "grad_norm": 1.542236328125, "learning_rate": 1.2713958267530976e-05, "loss": 1.0408, "step": 1882 }, { "epoch": 0.4311390955924442, "grad_norm": 1.2664871215820312, "learning_rate": 1.2706817776861838e-05, "loss": 1.1148, "step": 1883 }, { "epoch": 0.4313680595306239, "grad_norm": 1.2938625812530518, "learning_rate": 1.269967579665357e-05, "loss": 1.0531, "step": 1884 }, { "epoch": 0.4315970234688037, "grad_norm": 1.3394691944122314, "learning_rate": 1.2692532330836346e-05, "loss": 1.0794, "step": 1885 }, { "epoch": 0.4318259874069834, "grad_norm": 1.4612385034561157, "learning_rate": 1.2685387383341157e-05, "loss": 1.0415, "step": 1886 }, { "epoch": 0.43205495134516314, "grad_norm": 1.6666114330291748, "learning_rate": 1.2678240958099801e-05, "loss": 1.0744, "step": 1887 }, { "epoch": 0.43228391528334287, "grad_norm": 1.2210183143615723, "learning_rate": 1.2671093059044899e-05, "loss": 1.0129, "step": 1888 }, { "epoch": 0.4325128792215226, "grad_norm": 1.5127156972885132, "learning_rate": 1.2663943690109885e-05, "loss": 1.1413, "step": 1889 }, { "epoch": 0.4327418431597023, "grad_norm": 1.2559911012649536, "learning_rate": 1.2656792855228993e-05, "loss": 1.1116, "step": 1890 }, { "epoch": 0.4329708070978821, "grad_norm": 1.1968263387680054, "learning_rate": 1.2649640558337266e-05, "loss": 1.1159, "step": 1891 }, { "epoch": 0.43319977103606183, "grad_norm": 1.3571308851242065, "learning_rate": 1.2642486803370553e-05, "loss": 1.1034, "step": 1892 }, { "epoch": 0.43342873497424156, "grad_norm": 1.0277183055877686, "learning_rate": 1.2635331594265508e-05, "loss": 1.0414, "step": 1893 }, { "epoch": 0.4336576989124213, "grad_norm": 0.9604895114898682, "learning_rate": 1.2628174934959583e-05, "loss": 1.066, "step": 1894 }, { "epoch": 0.433886662850601, "grad_norm": 0.9915218353271484, "learning_rate": 1.2621016829391022e-05, "loss": 1.0006, "step": 1895 }, { "epoch": 0.43411562678878074, "grad_norm": 1.2676836252212524, "learning_rate": 1.2613857281498878e-05, "loss": 1.0798, "step": 1896 }, { "epoch": 0.4343445907269605, "grad_norm": 1.2389227151870728, "learning_rate": 1.2606696295222985e-05, "loss": 1.0692, "step": 1897 }, { "epoch": 0.43457355466514025, "grad_norm": 1.2961418628692627, "learning_rate": 1.2599533874503978e-05, "loss": 1.0435, "step": 1898 }, { "epoch": 0.43480251860332, "grad_norm": 1.2095304727554321, "learning_rate": 1.2592370023283268e-05, "loss": 1.0942, "step": 1899 }, { "epoch": 0.4350314825414997, "grad_norm": 0.9683312773704529, "learning_rate": 1.2585204745503072e-05, "loss": 1.1888, "step": 1900 }, { "epoch": 0.43526044647967943, "grad_norm": 1.135977029800415, "learning_rate": 1.2578038045106383e-05, "loss": 1.1095, "step": 1901 }, { "epoch": 0.4354894104178592, "grad_norm": 1.0904693603515625, "learning_rate": 1.2570869926036968e-05, "loss": 1.1369, "step": 1902 }, { "epoch": 0.43571837435603894, "grad_norm": 1.4972336292266846, "learning_rate": 1.2563700392239387e-05, "loss": 1.0466, "step": 1903 }, { "epoch": 0.43594733829421867, "grad_norm": 1.0306872129440308, "learning_rate": 1.2556529447658974e-05, "loss": 1.1408, "step": 1904 }, { "epoch": 0.4361763022323984, "grad_norm": 1.2561451196670532, "learning_rate": 1.2549357096241841e-05, "loss": 1.0328, "step": 1905 }, { "epoch": 0.4364052661705781, "grad_norm": 1.3269802331924438, "learning_rate": 1.2542183341934873e-05, "loss": 1.0962, "step": 1906 }, { "epoch": 0.43663423010875785, "grad_norm": 1.1158679723739624, "learning_rate": 1.253500818868572e-05, "loss": 1.0914, "step": 1907 }, { "epoch": 0.43686319404693763, "grad_norm": 1.4163261651992798, "learning_rate": 1.252783164044282e-05, "loss": 1.0665, "step": 1908 }, { "epoch": 0.43709215798511736, "grad_norm": 1.1557682752609253, "learning_rate": 1.2520653701155361e-05, "loss": 1.0816, "step": 1909 }, { "epoch": 0.4373211219232971, "grad_norm": 1.1793196201324463, "learning_rate": 1.25134743747733e-05, "loss": 1.0406, "step": 1910 }, { "epoch": 0.4375500858614768, "grad_norm": 1.505786657333374, "learning_rate": 1.2506293665247366e-05, "loss": 1.1074, "step": 1911 }, { "epoch": 0.43777904979965654, "grad_norm": 1.6487133502960205, "learning_rate": 1.2499111576529042e-05, "loss": 1.1171, "step": 1912 }, { "epoch": 0.43800801373783627, "grad_norm": 1.204660177230835, "learning_rate": 1.2491928112570568e-05, "loss": 0.9898, "step": 1913 }, { "epoch": 0.43823697767601605, "grad_norm": 1.1147689819335938, "learning_rate": 1.2484743277324945e-05, "loss": 0.9693, "step": 1914 }, { "epoch": 0.4384659416141958, "grad_norm": 1.136470079421997, "learning_rate": 1.2477557074745932e-05, "loss": 1.0785, "step": 1915 }, { "epoch": 0.4386949055523755, "grad_norm": 1.2387357950210571, "learning_rate": 1.247036950878803e-05, "loss": 1.1218, "step": 1916 }, { "epoch": 0.43892386949055523, "grad_norm": 1.831175446510315, "learning_rate": 1.24631805834065e-05, "loss": 1.0891, "step": 1917 }, { "epoch": 0.43915283342873496, "grad_norm": 1.2181199789047241, "learning_rate": 1.2455990302557346e-05, "loss": 1.0932, "step": 1918 }, { "epoch": 0.4393817973669147, "grad_norm": 1.262529730796814, "learning_rate": 1.2448798670197318e-05, "loss": 1.0351, "step": 1919 }, { "epoch": 0.43961076130509447, "grad_norm": 1.10959792137146, "learning_rate": 1.2441605690283915e-05, "loss": 1.1082, "step": 1920 }, { "epoch": 0.4398397252432742, "grad_norm": 1.1682922840118408, "learning_rate": 1.2434411366775367e-05, "loss": 1.1353, "step": 1921 }, { "epoch": 0.4400686891814539, "grad_norm": 1.2102086544036865, "learning_rate": 1.242721570363066e-05, "loss": 1.0569, "step": 1922 }, { "epoch": 0.44029765311963365, "grad_norm": 1.1970107555389404, "learning_rate": 1.2420018704809497e-05, "loss": 1.0502, "step": 1923 }, { "epoch": 0.4405266170578134, "grad_norm": 1.2726011276245117, "learning_rate": 1.2412820374272332e-05, "loss": 1.0845, "step": 1924 }, { "epoch": 0.44075558099599316, "grad_norm": 1.1796960830688477, "learning_rate": 1.2405620715980345e-05, "loss": 1.0449, "step": 1925 }, { "epoch": 0.4409845449341729, "grad_norm": 1.0175650119781494, "learning_rate": 1.2398419733895444e-05, "loss": 1.0563, "step": 1926 }, { "epoch": 0.4412135088723526, "grad_norm": 2.8849644660949707, "learning_rate": 1.2391217431980273e-05, "loss": 1.1089, "step": 1927 }, { "epoch": 0.44144247281053234, "grad_norm": 1.4662426710128784, "learning_rate": 1.2384013814198197e-05, "loss": 1.1497, "step": 1928 }, { "epoch": 0.44167143674871207, "grad_norm": 1.102353811264038, "learning_rate": 1.2376808884513306e-05, "loss": 1.1013, "step": 1929 }, { "epoch": 0.4419004006868918, "grad_norm": 2.3300585746765137, "learning_rate": 1.2369602646890415e-05, "loss": 1.1192, "step": 1930 }, { "epoch": 0.4421293646250716, "grad_norm": 1.2267271280288696, "learning_rate": 1.2362395105295054e-05, "loss": 1.069, "step": 1931 }, { "epoch": 0.4423583285632513, "grad_norm": 1.4739381074905396, "learning_rate": 1.235518626369347e-05, "loss": 1.0883, "step": 1932 }, { "epoch": 0.44258729250143103, "grad_norm": 1.5360037088394165, "learning_rate": 1.2347976126052631e-05, "loss": 1.0633, "step": 1933 }, { "epoch": 0.44281625643961076, "grad_norm": 1.3187732696533203, "learning_rate": 1.234076469634022e-05, "loss": 1.0849, "step": 1934 }, { "epoch": 0.4430452203777905, "grad_norm": 1.0551137924194336, "learning_rate": 1.2333551978524618e-05, "loss": 1.0977, "step": 1935 }, { "epoch": 0.4432741843159702, "grad_norm": 1.3932082653045654, "learning_rate": 1.2326337976574928e-05, "loss": 1.0336, "step": 1936 }, { "epoch": 0.44350314825415, "grad_norm": 1.1481599807739258, "learning_rate": 1.2319122694460952e-05, "loss": 1.0941, "step": 1937 }, { "epoch": 0.4437321121923297, "grad_norm": 1.5443516969680786, "learning_rate": 1.2311906136153202e-05, "loss": 1.0848, "step": 1938 }, { "epoch": 0.44396107613050945, "grad_norm": 1.0437678098678589, "learning_rate": 1.2304688305622889e-05, "loss": 1.1458, "step": 1939 }, { "epoch": 0.4441900400686892, "grad_norm": 1.0990891456604004, "learning_rate": 1.2297469206841921e-05, "loss": 1.066, "step": 1940 }, { "epoch": 0.4444190040068689, "grad_norm": 1.1834927797317505, "learning_rate": 1.2290248843782915e-05, "loss": 1.0384, "step": 1941 }, { "epoch": 0.44464796794504863, "grad_norm": 1.1914305686950684, "learning_rate": 1.2283027220419169e-05, "loss": 1.0453, "step": 1942 }, { "epoch": 0.4448769318832284, "grad_norm": 1.607132911682129, "learning_rate": 1.2275804340724684e-05, "loss": 1.033, "step": 1943 }, { "epoch": 0.44510589582140814, "grad_norm": 1.1510486602783203, "learning_rate": 1.2268580208674151e-05, "loss": 1.0786, "step": 1944 }, { "epoch": 0.44533485975958786, "grad_norm": 1.0266963243484497, "learning_rate": 1.2261354828242948e-05, "loss": 1.0426, "step": 1945 }, { "epoch": 0.4455638236977676, "grad_norm": 1.0529321432113647, "learning_rate": 1.2254128203407146e-05, "loss": 1.0221, "step": 1946 }, { "epoch": 0.4457927876359473, "grad_norm": 1.0336111783981323, "learning_rate": 1.2246900338143485e-05, "loss": 1.0285, "step": 1947 }, { "epoch": 0.4460217515741271, "grad_norm": 1.5184695720672607, "learning_rate": 1.2239671236429413e-05, "loss": 1.0077, "step": 1948 }, { "epoch": 0.44625071551230683, "grad_norm": 1.1687562465667725, "learning_rate": 1.2232440902243037e-05, "loss": 1.0827, "step": 1949 }, { "epoch": 0.44647967945048656, "grad_norm": 1.2201441526412964, "learning_rate": 1.2225209339563144e-05, "loss": 1.0207, "step": 1950 }, { "epoch": 0.4467086433886663, "grad_norm": 1.1205480098724365, "learning_rate": 1.2217976552369213e-05, "loss": 1.0461, "step": 1951 }, { "epoch": 0.446937607326846, "grad_norm": 2.142512083053589, "learning_rate": 1.2210742544641377e-05, "loss": 1.1122, "step": 1952 }, { "epoch": 0.44716657126502574, "grad_norm": 1.682520866394043, "learning_rate": 1.2203507320360458e-05, "loss": 1.0742, "step": 1953 }, { "epoch": 0.4473955352032055, "grad_norm": 1.2594259977340698, "learning_rate": 1.2196270883507927e-05, "loss": 1.0582, "step": 1954 }, { "epoch": 0.44762449914138525, "grad_norm": 1.0667154788970947, "learning_rate": 1.218903323806595e-05, "loss": 1.0335, "step": 1955 }, { "epoch": 0.447853463079565, "grad_norm": 1.5725624561309814, "learning_rate": 1.2181794388017332e-05, "loss": 1.0779, "step": 1956 }, { "epoch": 0.4480824270177447, "grad_norm": 1.0927304029464722, "learning_rate": 1.2174554337345555e-05, "loss": 1.0951, "step": 1957 }, { "epoch": 0.4483113909559244, "grad_norm": 1.3435699939727783, "learning_rate": 1.2167313090034756e-05, "loss": 1.0554, "step": 1958 }, { "epoch": 0.44854035489410415, "grad_norm": 1.0855443477630615, "learning_rate": 1.2160070650069735e-05, "loss": 1.1022, "step": 1959 }, { "epoch": 0.44876931883228394, "grad_norm": 1.2444900274276733, "learning_rate": 1.2152827021435946e-05, "loss": 1.0268, "step": 1960 }, { "epoch": 0.44899828277046366, "grad_norm": 1.4539769887924194, "learning_rate": 1.2145582208119497e-05, "loss": 1.1199, "step": 1961 }, { "epoch": 0.4492272467086434, "grad_norm": 1.8856399059295654, "learning_rate": 1.2138336214107148e-05, "loss": 1.0528, "step": 1962 }, { "epoch": 0.4494562106468231, "grad_norm": 1.1333612203598022, "learning_rate": 1.2131089043386305e-05, "loss": 1.0599, "step": 1963 }, { "epoch": 0.44968517458500284, "grad_norm": 1.1188855171203613, "learning_rate": 1.212384069994503e-05, "loss": 1.0356, "step": 1964 }, { "epoch": 0.44991413852318257, "grad_norm": 1.4274768829345703, "learning_rate": 1.2116591187772026e-05, "loss": 1.0944, "step": 1965 }, { "epoch": 0.45014310246136235, "grad_norm": 1.637333631515503, "learning_rate": 1.2109340510856633e-05, "loss": 1.098, "step": 1966 }, { "epoch": 0.4503720663995421, "grad_norm": 1.5847759246826172, "learning_rate": 1.2102088673188845e-05, "loss": 1.0764, "step": 1967 }, { "epoch": 0.4506010303377218, "grad_norm": 1.5479519367218018, "learning_rate": 1.2094835678759282e-05, "loss": 1.118, "step": 1968 }, { "epoch": 0.45082999427590154, "grad_norm": 1.1684271097183228, "learning_rate": 1.2087581531559208e-05, "loss": 1.0399, "step": 1969 }, { "epoch": 0.45105895821408126, "grad_norm": 1.4510235786437988, "learning_rate": 1.2080326235580521e-05, "loss": 1.0827, "step": 1970 }, { "epoch": 0.45128792215226105, "grad_norm": 1.1053940057754517, "learning_rate": 1.2073069794815748e-05, "loss": 1.0701, "step": 1971 }, { "epoch": 0.45151688609044077, "grad_norm": 1.083375334739685, "learning_rate": 1.2065812213258051e-05, "loss": 1.0956, "step": 1972 }, { "epoch": 0.4517458500286205, "grad_norm": 1.1666209697723389, "learning_rate": 1.2058553494901203e-05, "loss": 1.0781, "step": 1973 }, { "epoch": 0.4519748139668002, "grad_norm": 1.2016594409942627, "learning_rate": 1.2051293643739634e-05, "loss": 1.0406, "step": 1974 }, { "epoch": 0.45220377790497995, "grad_norm": 1.4277703762054443, "learning_rate": 1.204403266376837e-05, "loss": 1.0543, "step": 1975 }, { "epoch": 0.4524327418431597, "grad_norm": 1.206367015838623, "learning_rate": 1.2036770558983067e-05, "loss": 1.0423, "step": 1976 }, { "epoch": 0.45266170578133946, "grad_norm": 1.0165696144104004, "learning_rate": 1.202950733338e-05, "loss": 0.9945, "step": 1977 }, { "epoch": 0.4528906697195192, "grad_norm": 0.9572392702102661, "learning_rate": 1.2022242990956064e-05, "loss": 1.07, "step": 1978 }, { "epoch": 0.4531196336576989, "grad_norm": 2.232534885406494, "learning_rate": 1.2014977535708767e-05, "loss": 1.1249, "step": 1979 }, { "epoch": 0.45334859759587864, "grad_norm": 1.0949493646621704, "learning_rate": 1.2007710971636221e-05, "loss": 0.9862, "step": 1980 }, { "epoch": 0.45357756153405837, "grad_norm": 1.493613600730896, "learning_rate": 1.2000443302737162e-05, "loss": 1.0943, "step": 1981 }, { "epoch": 0.4538065254722381, "grad_norm": 1.0247379541397095, "learning_rate": 1.1993174533010928e-05, "loss": 1.0921, "step": 1982 }, { "epoch": 0.4540354894104179, "grad_norm": 1.1227061748504639, "learning_rate": 1.1985904666457455e-05, "loss": 1.0995, "step": 1983 }, { "epoch": 0.4542644533485976, "grad_norm": 1.3855504989624023, "learning_rate": 1.1978633707077296e-05, "loss": 1.0868, "step": 1984 }, { "epoch": 0.45449341728677733, "grad_norm": 1.058700680732727, "learning_rate": 1.1971361658871597e-05, "loss": 1.0761, "step": 1985 }, { "epoch": 0.45472238122495706, "grad_norm": 1.199990153312683, "learning_rate": 1.1964088525842108e-05, "loss": 1.0743, "step": 1986 }, { "epoch": 0.4549513451631368, "grad_norm": 1.074346661567688, "learning_rate": 1.1956814311991164e-05, "loss": 1.0791, "step": 1987 }, { "epoch": 0.4551803091013165, "grad_norm": 1.4295293092727661, "learning_rate": 1.1949539021321713e-05, "loss": 1.0344, "step": 1988 }, { "epoch": 0.4554092730394963, "grad_norm": 1.2947413921356201, "learning_rate": 1.1942262657837285e-05, "loss": 1.0676, "step": 1989 }, { "epoch": 0.455638236977676, "grad_norm": 1.270447015762329, "learning_rate": 1.1934985225541998e-05, "loss": 1.0626, "step": 1990 }, { "epoch": 0.45586720091585575, "grad_norm": 1.2195264101028442, "learning_rate": 1.1927706728440565e-05, "loss": 1.0508, "step": 1991 }, { "epoch": 0.4560961648540355, "grad_norm": 1.2984168529510498, "learning_rate": 1.192042717053828e-05, "loss": 1.0517, "step": 1992 }, { "epoch": 0.4563251287922152, "grad_norm": 1.2059777975082397, "learning_rate": 1.1913146555841027e-05, "loss": 1.0639, "step": 1993 }, { "epoch": 0.456554092730395, "grad_norm": 1.2496461868286133, "learning_rate": 1.1905864888355264e-05, "loss": 1.1124, "step": 1994 }, { "epoch": 0.4567830566685747, "grad_norm": 1.3210448026657104, "learning_rate": 1.1898582172088027e-05, "loss": 1.1523, "step": 1995 }, { "epoch": 0.45701202060675444, "grad_norm": 1.288273811340332, "learning_rate": 1.1891298411046943e-05, "loss": 1.0392, "step": 1996 }, { "epoch": 0.45724098454493417, "grad_norm": 1.6366710662841797, "learning_rate": 1.18840136092402e-05, "loss": 1.1138, "step": 1997 }, { "epoch": 0.4574699484831139, "grad_norm": 0.9694288372993469, "learning_rate": 1.1876727770676562e-05, "loss": 1.0257, "step": 1998 }, { "epoch": 0.4576989124212936, "grad_norm": 1.0713609457015991, "learning_rate": 1.1869440899365365e-05, "loss": 1.0785, "step": 1999 }, { "epoch": 0.4579278763594734, "grad_norm": 0.9268698692321777, "learning_rate": 1.1862152999316517e-05, "loss": 1.0387, "step": 2000 }, { "epoch": 0.45815684029765313, "grad_norm": 1.066428780555725, "learning_rate": 1.1854864074540484e-05, "loss": 1.0959, "step": 2001 }, { "epoch": 0.45838580423583286, "grad_norm": 1.2952028512954712, "learning_rate": 1.18475741290483e-05, "loss": 1.0896, "step": 2002 }, { "epoch": 0.4586147681740126, "grad_norm": 1.1627318859100342, "learning_rate": 1.184028316685157e-05, "loss": 1.0602, "step": 2003 }, { "epoch": 0.4588437321121923, "grad_norm": 1.2692722082138062, "learning_rate": 1.1832991191962435e-05, "loss": 1.1016, "step": 2004 }, { "epoch": 0.45907269605037204, "grad_norm": 1.1311182975769043, "learning_rate": 1.182569820839362e-05, "loss": 1.0275, "step": 2005 }, { "epoch": 0.4593016599885518, "grad_norm": 1.1217960119247437, "learning_rate": 1.1818404220158382e-05, "loss": 1.1007, "step": 2006 }, { "epoch": 0.45953062392673155, "grad_norm": 1.1474188566207886, "learning_rate": 1.181110923127055e-05, "loss": 1.0613, "step": 2007 }, { "epoch": 0.4597595878649113, "grad_norm": 1.1906862258911133, "learning_rate": 1.1803813245744495e-05, "loss": 1.1043, "step": 2008 }, { "epoch": 0.459988551803091, "grad_norm": 1.391391634941101, "learning_rate": 1.179651626759513e-05, "loss": 1.1015, "step": 2009 }, { "epoch": 0.46021751574127073, "grad_norm": 1.3041538000106812, "learning_rate": 1.1789218300837929e-05, "loss": 1.0601, "step": 2010 }, { "epoch": 0.46044647967945046, "grad_norm": 1.484880805015564, "learning_rate": 1.1781919349488894e-05, "loss": 1.0303, "step": 2011 }, { "epoch": 0.46067544361763024, "grad_norm": 1.271309733390808, "learning_rate": 1.177461941756458e-05, "loss": 1.0293, "step": 2012 }, { "epoch": 0.46090440755580997, "grad_norm": 1.0927857160568237, "learning_rate": 1.1767318509082083e-05, "loss": 1.0673, "step": 2013 }, { "epoch": 0.4611333714939897, "grad_norm": 1.1591389179229736, "learning_rate": 1.1760016628059026e-05, "loss": 1.1406, "step": 2014 }, { "epoch": 0.4613623354321694, "grad_norm": 1.0383858680725098, "learning_rate": 1.1752713778513576e-05, "loss": 1.0687, "step": 2015 }, { "epoch": 0.46159129937034915, "grad_norm": 1.1415190696716309, "learning_rate": 1.1745409964464425e-05, "loss": 1.094, "step": 2016 }, { "epoch": 0.46182026330852893, "grad_norm": 1.0855273008346558, "learning_rate": 1.1738105189930807e-05, "loss": 1.1396, "step": 2017 }, { "epoch": 0.46204922724670866, "grad_norm": 1.1211069822311401, "learning_rate": 1.1730799458932473e-05, "loss": 1.0809, "step": 2018 }, { "epoch": 0.4622781911848884, "grad_norm": 1.233792781829834, "learning_rate": 1.1723492775489709e-05, "loss": 1.0569, "step": 2019 }, { "epoch": 0.4625071551230681, "grad_norm": 1.1317369937896729, "learning_rate": 1.1716185143623322e-05, "loss": 1.0652, "step": 2020 }, { "epoch": 0.46273611906124784, "grad_norm": 1.083250880241394, "learning_rate": 1.1708876567354635e-05, "loss": 1.0556, "step": 2021 }, { "epoch": 0.46296508299942757, "grad_norm": 1.1673812866210938, "learning_rate": 1.1701567050705504e-05, "loss": 1.0599, "step": 2022 }, { "epoch": 0.46319404693760735, "grad_norm": 1.2285277843475342, "learning_rate": 1.1694256597698288e-05, "loss": 1.0872, "step": 2023 }, { "epoch": 0.4634230108757871, "grad_norm": 1.4135609865188599, "learning_rate": 1.168694521235587e-05, "loss": 1.0408, "step": 2024 }, { "epoch": 0.4636519748139668, "grad_norm": 1.0683159828186035, "learning_rate": 1.1679632898701649e-05, "loss": 1.0787, "step": 2025 }, { "epoch": 0.46388093875214653, "grad_norm": 1.5515210628509521, "learning_rate": 1.1672319660759523e-05, "loss": 1.0343, "step": 2026 }, { "epoch": 0.46410990269032626, "grad_norm": 1.0499638319015503, "learning_rate": 1.1665005502553912e-05, "loss": 1.0641, "step": 2027 }, { "epoch": 0.464338866628506, "grad_norm": 1.0508882999420166, "learning_rate": 1.165769042810973e-05, "loss": 1.0326, "step": 2028 }, { "epoch": 0.46456783056668577, "grad_norm": 1.1158897876739502, "learning_rate": 1.1650374441452403e-05, "loss": 1.0767, "step": 2029 }, { "epoch": 0.4647967945048655, "grad_norm": 1.1368999481201172, "learning_rate": 1.1643057546607858e-05, "loss": 1.0745, "step": 2030 }, { "epoch": 0.4650257584430452, "grad_norm": 1.5753467082977295, "learning_rate": 1.1635739747602522e-05, "loss": 1.0078, "step": 2031 }, { "epoch": 0.46525472238122495, "grad_norm": 1.261248230934143, "learning_rate": 1.1628421048463315e-05, "loss": 0.9994, "step": 2032 }, { "epoch": 0.4654836863194047, "grad_norm": 1.1466697454452515, "learning_rate": 1.1621101453217656e-05, "loss": 1.0402, "step": 2033 }, { "epoch": 0.4657126502575844, "grad_norm": 1.1229275465011597, "learning_rate": 1.1613780965893465e-05, "loss": 1.0619, "step": 2034 }, { "epoch": 0.4659416141957642, "grad_norm": 1.118383526802063, "learning_rate": 1.1606459590519132e-05, "loss": 1.0626, "step": 2035 }, { "epoch": 0.4661705781339439, "grad_norm": 1.139142632484436, "learning_rate": 1.1599137331123558e-05, "loss": 1.0958, "step": 2036 }, { "epoch": 0.46639954207212364, "grad_norm": 1.29563570022583, "learning_rate": 1.1591814191736117e-05, "loss": 1.042, "step": 2037 }, { "epoch": 0.46662850601030337, "grad_norm": 1.1359105110168457, "learning_rate": 1.1584490176386671e-05, "loss": 1.081, "step": 2038 }, { "epoch": 0.4668574699484831, "grad_norm": 1.0732991695404053, "learning_rate": 1.1577165289105565e-05, "loss": 0.9929, "step": 2039 }, { "epoch": 0.4670864338866629, "grad_norm": 1.1904350519180298, "learning_rate": 1.1569839533923626e-05, "loss": 1.0244, "step": 2040 }, { "epoch": 0.4673153978248426, "grad_norm": 1.1562821865081787, "learning_rate": 1.1562512914872152e-05, "loss": 1.1229, "step": 2041 }, { "epoch": 0.46754436176302233, "grad_norm": 1.1359809637069702, "learning_rate": 1.1555185435982923e-05, "loss": 1.0961, "step": 2042 }, { "epoch": 0.46777332570120206, "grad_norm": 1.1743630170822144, "learning_rate": 1.1547857101288185e-05, "loss": 1.1129, "step": 2043 }, { "epoch": 0.4680022896393818, "grad_norm": 1.1456128358840942, "learning_rate": 1.154052791482066e-05, "loss": 1.0655, "step": 2044 }, { "epoch": 0.4682312535775615, "grad_norm": 1.235987663269043, "learning_rate": 1.153319788061354e-05, "loss": 0.9827, "step": 2045 }, { "epoch": 0.4684602175157413, "grad_norm": 1.3438743352890015, "learning_rate": 1.1525867002700484e-05, "loss": 1.0906, "step": 2046 }, { "epoch": 0.468689181453921, "grad_norm": 1.1594905853271484, "learning_rate": 1.1518535285115604e-05, "loss": 1.0886, "step": 2047 }, { "epoch": 0.46891814539210075, "grad_norm": 1.3096404075622559, "learning_rate": 1.1511202731893493e-05, "loss": 1.108, "step": 2048 }, { "epoch": 0.4691471093302805, "grad_norm": 1.1685811281204224, "learning_rate": 1.1503869347069186e-05, "loss": 1.0289, "step": 2049 }, { "epoch": 0.4693760732684602, "grad_norm": 1.7220717668533325, "learning_rate": 1.1496535134678186e-05, "loss": 1.0488, "step": 2050 }, { "epoch": 0.46960503720663993, "grad_norm": 1.417108416557312, "learning_rate": 1.1489200098756447e-05, "loss": 1.0304, "step": 2051 }, { "epoch": 0.4698340011448197, "grad_norm": 1.0246206521987915, "learning_rate": 1.1481864243340381e-05, "loss": 1.0297, "step": 2052 }, { "epoch": 0.47006296508299944, "grad_norm": 1.2509313821792603, "learning_rate": 1.1474527572466847e-05, "loss": 1.1336, "step": 2053 }, { "epoch": 0.47029192902117917, "grad_norm": 1.0300573110580444, "learning_rate": 1.1467190090173147e-05, "loss": 1.0812, "step": 2054 }, { "epoch": 0.4705208929593589, "grad_norm": 1.067595362663269, "learning_rate": 1.1459851800497048e-05, "loss": 1.1246, "step": 2055 }, { "epoch": 0.4707498568975386, "grad_norm": 1.1726340055465698, "learning_rate": 1.145251270747674e-05, "loss": 1.0196, "step": 2056 }, { "epoch": 0.47097882083571835, "grad_norm": 1.0472357273101807, "learning_rate": 1.1445172815150864e-05, "loss": 1.0323, "step": 2057 }, { "epoch": 0.47120778477389813, "grad_norm": 2.4607527256011963, "learning_rate": 1.1437832127558508e-05, "loss": 1.073, "step": 2058 }, { "epoch": 0.47143674871207786, "grad_norm": 0.9941400289535522, "learning_rate": 1.1430490648739185e-05, "loss": 1.0418, "step": 2059 }, { "epoch": 0.4716657126502576, "grad_norm": 1.4227428436279297, "learning_rate": 1.1423148382732854e-05, "loss": 1.1564, "step": 2060 }, { "epoch": 0.4718946765884373, "grad_norm": 1.135663390159607, "learning_rate": 1.1415805333579895e-05, "loss": 1.0869, "step": 2061 }, { "epoch": 0.47212364052661704, "grad_norm": 1.1211762428283691, "learning_rate": 1.1408461505321136e-05, "loss": 1.0782, "step": 2062 }, { "epoch": 0.4723526044647968, "grad_norm": 1.7367572784423828, "learning_rate": 1.1401116901997815e-05, "loss": 1.0039, "step": 2063 }, { "epoch": 0.47258156840297655, "grad_norm": 1.2560068368911743, "learning_rate": 1.1393771527651614e-05, "loss": 1.0838, "step": 2064 }, { "epoch": 0.4728105323411563, "grad_norm": 1.2819138765335083, "learning_rate": 1.1386425386324622e-05, "loss": 0.999, "step": 2065 }, { "epoch": 0.473039496279336, "grad_norm": 1.2946174144744873, "learning_rate": 1.1379078482059367e-05, "loss": 1.0082, "step": 2066 }, { "epoch": 0.47326846021751573, "grad_norm": 1.1478831768035889, "learning_rate": 1.1371730818898785e-05, "loss": 1.114, "step": 2067 }, { "epoch": 0.47349742415569546, "grad_norm": 1.1708440780639648, "learning_rate": 1.1364382400886233e-05, "loss": 1.0527, "step": 2068 }, { "epoch": 0.47372638809387524, "grad_norm": 1.5635682344436646, "learning_rate": 1.1357033232065484e-05, "loss": 1.0343, "step": 2069 }, { "epoch": 0.47395535203205497, "grad_norm": 1.4606600999832153, "learning_rate": 1.134968331648073e-05, "loss": 1.0216, "step": 2070 }, { "epoch": 0.4741843159702347, "grad_norm": 1.3325625658035278, "learning_rate": 1.1342332658176556e-05, "loss": 1.0921, "step": 2071 }, { "epoch": 0.4744132799084144, "grad_norm": 1.0679417848587036, "learning_rate": 1.1334981261197977e-05, "loss": 1.1267, "step": 2072 }, { "epoch": 0.47464224384659415, "grad_norm": 1.4468427896499634, "learning_rate": 1.1327629129590402e-05, "loss": 1.0597, "step": 2073 }, { "epoch": 0.4748712077847739, "grad_norm": 1.1622424125671387, "learning_rate": 1.132027626739965e-05, "loss": 1.0962, "step": 2074 }, { "epoch": 0.47510017172295366, "grad_norm": 1.3740299940109253, "learning_rate": 1.1312922678671935e-05, "loss": 1.0671, "step": 2075 }, { "epoch": 0.4753291356611334, "grad_norm": 1.3251038789749146, "learning_rate": 1.1305568367453877e-05, "loss": 1.0959, "step": 2076 }, { "epoch": 0.4755580995993131, "grad_norm": 1.244946002960205, "learning_rate": 1.1298213337792494e-05, "loss": 1.0705, "step": 2077 }, { "epoch": 0.47578706353749284, "grad_norm": 1.1645523309707642, "learning_rate": 1.1290857593735196e-05, "loss": 1.097, "step": 2078 }, { "epoch": 0.47601602747567257, "grad_norm": 1.0410329103469849, "learning_rate": 1.1283501139329787e-05, "loss": 1.0381, "step": 2079 }, { "epoch": 0.4762449914138523, "grad_norm": 1.137269377708435, "learning_rate": 1.1276143978624457e-05, "loss": 1.0412, "step": 2080 }, { "epoch": 0.4764739553520321, "grad_norm": 1.1206343173980713, "learning_rate": 1.1268786115667798e-05, "loss": 1.0825, "step": 2081 }, { "epoch": 0.4767029192902118, "grad_norm": 1.176672339439392, "learning_rate": 1.126142755450878e-05, "loss": 1.0497, "step": 2082 }, { "epoch": 0.47693188322839153, "grad_norm": 1.2728939056396484, "learning_rate": 1.125406829919675e-05, "loss": 1.0416, "step": 2083 }, { "epoch": 0.47716084716657126, "grad_norm": 1.3541243076324463, "learning_rate": 1.1246708353781453e-05, "loss": 1.0776, "step": 2084 }, { "epoch": 0.477389811104751, "grad_norm": 1.3378325700759888, "learning_rate": 1.1239347722312997e-05, "loss": 1.0741, "step": 2085 }, { "epoch": 0.47761877504293077, "grad_norm": 1.2494585514068604, "learning_rate": 1.1231986408841882e-05, "loss": 1.072, "step": 2086 }, { "epoch": 0.4778477389811105, "grad_norm": 1.4159393310546875, "learning_rate": 1.1224624417418976e-05, "loss": 1.1049, "step": 2087 }, { "epoch": 0.4780767029192902, "grad_norm": 1.2381250858306885, "learning_rate": 1.1217261752095518e-05, "loss": 1.0467, "step": 2088 }, { "epoch": 0.47830566685746995, "grad_norm": 1.128997802734375, "learning_rate": 1.1209898416923129e-05, "loss": 1.0505, "step": 2089 }, { "epoch": 0.4785346307956497, "grad_norm": 1.1534404754638672, "learning_rate": 1.120253441595378e-05, "loss": 0.9887, "step": 2090 }, { "epoch": 0.4787635947338294, "grad_norm": 1.047783613204956, "learning_rate": 1.1195169753239825e-05, "loss": 1.0748, "step": 2091 }, { "epoch": 0.4789925586720092, "grad_norm": 1.0474052429199219, "learning_rate": 1.1187804432833976e-05, "loss": 1.0393, "step": 2092 }, { "epoch": 0.4792215226101889, "grad_norm": 1.389976978302002, "learning_rate": 1.1180438458789305e-05, "loss": 1.0733, "step": 2093 }, { "epoch": 0.47945048654836864, "grad_norm": 1.16421377658844, "learning_rate": 1.1173071835159248e-05, "loss": 1.0366, "step": 2094 }, { "epoch": 0.47967945048654836, "grad_norm": 1.5794599056243896, "learning_rate": 1.1165704565997593e-05, "loss": 1.0504, "step": 2095 }, { "epoch": 0.4799084144247281, "grad_norm": 1.2971510887145996, "learning_rate": 1.115833665535849e-05, "loss": 1.0369, "step": 2096 }, { "epoch": 0.4801373783629078, "grad_norm": 1.2570523023605347, "learning_rate": 1.1150968107296438e-05, "loss": 1.046, "step": 2097 }, { "epoch": 0.4803663423010876, "grad_norm": 1.7457414865493774, "learning_rate": 1.1143598925866286e-05, "loss": 1.0883, "step": 2098 }, { "epoch": 0.48059530623926733, "grad_norm": 1.236221432685852, "learning_rate": 1.1136229115123232e-05, "loss": 1.0506, "step": 2099 }, { "epoch": 0.48082427017744706, "grad_norm": 1.4669848680496216, "learning_rate": 1.1128858679122822e-05, "loss": 1.0371, "step": 2100 }, { "epoch": 0.4810532341156268, "grad_norm": 1.0517163276672363, "learning_rate": 1.1121487621920948e-05, "loss": 1.0899, "step": 2101 }, { "epoch": 0.4812821980538065, "grad_norm": 1.3375537395477295, "learning_rate": 1.1114115947573834e-05, "loss": 1.0915, "step": 2102 }, { "epoch": 0.48151116199198624, "grad_norm": 1.2560817003250122, "learning_rate": 1.1106743660138057e-05, "loss": 1.0988, "step": 2103 }, { "epoch": 0.481740125930166, "grad_norm": 1.2373180389404297, "learning_rate": 1.1099370763670523e-05, "loss": 0.9765, "step": 2104 }, { "epoch": 0.48196908986834575, "grad_norm": 1.2725884914398193, "learning_rate": 1.1091997262228473e-05, "loss": 1.0098, "step": 2105 }, { "epoch": 0.4821980538065255, "grad_norm": 1.7785913944244385, "learning_rate": 1.1084623159869488e-05, "loss": 1.0742, "step": 2106 }, { "epoch": 0.4824270177447052, "grad_norm": 1.4811958074569702, "learning_rate": 1.1077248460651468e-05, "loss": 1.0411, "step": 2107 }, { "epoch": 0.4826559816828849, "grad_norm": 1.2428529262542725, "learning_rate": 1.1069873168632657e-05, "loss": 1.1439, "step": 2108 }, { "epoch": 0.4828849456210647, "grad_norm": 1.0219333171844482, "learning_rate": 1.1062497287871606e-05, "loss": 1.0631, "step": 2109 }, { "epoch": 0.48311390955924444, "grad_norm": 1.0447986125946045, "learning_rate": 1.1055120822427208e-05, "loss": 1.0794, "step": 2110 }, { "epoch": 0.48334287349742416, "grad_norm": 1.1982734203338623, "learning_rate": 1.1047743776358666e-05, "loss": 1.1079, "step": 2111 }, { "epoch": 0.4835718374356039, "grad_norm": 1.3162384033203125, "learning_rate": 1.104036615372551e-05, "loss": 1.1181, "step": 2112 }, { "epoch": 0.4838008013737836, "grad_norm": 1.0572696924209595, "learning_rate": 1.103298795858758e-05, "loss": 1.0138, "step": 2113 }, { "epoch": 0.48402976531196334, "grad_norm": 1.1792926788330078, "learning_rate": 1.1025609195005038e-05, "loss": 1.0455, "step": 2114 }, { "epoch": 0.4842587292501431, "grad_norm": 2.336987018585205, "learning_rate": 1.1018229867038358e-05, "loss": 1.1053, "step": 2115 }, { "epoch": 0.48448769318832285, "grad_norm": 1.0690840482711792, "learning_rate": 1.1010849978748314e-05, "loss": 1.0366, "step": 2116 }, { "epoch": 0.4847166571265026, "grad_norm": 1.3050801753997803, "learning_rate": 1.1003469534196003e-05, "loss": 1.0121, "step": 2117 }, { "epoch": 0.4849456210646823, "grad_norm": 1.054900884628296, "learning_rate": 1.099608853744282e-05, "loss": 1.0639, "step": 2118 }, { "epoch": 0.48517458500286204, "grad_norm": 1.2549351453781128, "learning_rate": 1.0988706992550467e-05, "loss": 1.0572, "step": 2119 }, { "epoch": 0.48540354894104176, "grad_norm": 1.4606761932373047, "learning_rate": 1.0981324903580945e-05, "loss": 1.0981, "step": 2120 }, { "epoch": 0.48563251287922155, "grad_norm": 1.153498649597168, "learning_rate": 1.0973942274596557e-05, "loss": 1.0869, "step": 2121 }, { "epoch": 0.48586147681740127, "grad_norm": 1.411656141281128, "learning_rate": 1.09665591096599e-05, "loss": 1.0528, "step": 2122 }, { "epoch": 0.486090440755581, "grad_norm": 1.040209174156189, "learning_rate": 1.0959175412833869e-05, "loss": 1.0702, "step": 2123 }, { "epoch": 0.4863194046937607, "grad_norm": 0.9708045721054077, "learning_rate": 1.0951791188181648e-05, "loss": 1.1015, "step": 2124 }, { "epoch": 0.48654836863194045, "grad_norm": 1.1500178575515747, "learning_rate": 1.0944406439766719e-05, "loss": 1.0767, "step": 2125 }, { "epoch": 0.4867773325701202, "grad_norm": 3.3348257541656494, "learning_rate": 1.0937021171652842e-05, "loss": 1.146, "step": 2126 }, { "epoch": 0.48700629650829996, "grad_norm": 1.2559481859207153, "learning_rate": 1.0929635387904075e-05, "loss": 1.0668, "step": 2127 }, { "epoch": 0.4872352604464797, "grad_norm": 2.0190439224243164, "learning_rate": 1.092224909258474e-05, "loss": 1.0488, "step": 2128 }, { "epoch": 0.4874642243846594, "grad_norm": 1.2366149425506592, "learning_rate": 1.0914862289759467e-05, "loss": 1.0883, "step": 2129 }, { "epoch": 0.48769318832283914, "grad_norm": 1.206701397895813, "learning_rate": 1.0907474983493144e-05, "loss": 1.0949, "step": 2130 }, { "epoch": 0.48792215226101887, "grad_norm": 1.0292967557907104, "learning_rate": 1.0900087177850946e-05, "loss": 1.0862, "step": 2131 }, { "epoch": 0.48815111619919865, "grad_norm": 1.2705435752868652, "learning_rate": 1.0892698876898322e-05, "loss": 1.0859, "step": 2132 }, { "epoch": 0.4883800801373784, "grad_norm": 1.5828287601470947, "learning_rate": 1.0885310084700989e-05, "loss": 1.0841, "step": 2133 }, { "epoch": 0.4886090440755581, "grad_norm": 1.2173969745635986, "learning_rate": 1.087792080532494e-05, "loss": 1.0741, "step": 2134 }, { "epoch": 0.48883800801373783, "grad_norm": 1.191078543663025, "learning_rate": 1.087053104283643e-05, "loss": 1.0468, "step": 2135 }, { "epoch": 0.48906697195191756, "grad_norm": 1.1816260814666748, "learning_rate": 1.0863140801301988e-05, "loss": 1.0349, "step": 2136 }, { "epoch": 0.4892959358900973, "grad_norm": 1.1615114212036133, "learning_rate": 1.08557500847884e-05, "loss": 1.0635, "step": 2137 }, { "epoch": 0.48952489982827707, "grad_norm": 1.2740709781646729, "learning_rate": 1.0848358897362713e-05, "loss": 0.9996, "step": 2138 }, { "epoch": 0.4897538637664568, "grad_norm": 1.2105414867401123, "learning_rate": 1.0840967243092237e-05, "loss": 1.1426, "step": 2139 }, { "epoch": 0.4899828277046365, "grad_norm": 1.668809175491333, "learning_rate": 1.083357512604454e-05, "loss": 1.0329, "step": 2140 }, { "epoch": 0.49021179164281625, "grad_norm": 1.245213508605957, "learning_rate": 1.082618255028744e-05, "loss": 1.0716, "step": 2141 }, { "epoch": 0.490440755580996, "grad_norm": 1.3269602060317993, "learning_rate": 1.0818789519889006e-05, "loss": 1.0416, "step": 2142 }, { "epoch": 0.4906697195191757, "grad_norm": 1.0687472820281982, "learning_rate": 1.0811396038917568e-05, "loss": 1.0823, "step": 2143 }, { "epoch": 0.4908986834573555, "grad_norm": 1.3689026832580566, "learning_rate": 1.080400211144169e-05, "loss": 1.0581, "step": 2144 }, { "epoch": 0.4911276473955352, "grad_norm": 1.1712907552719116, "learning_rate": 1.0796607741530191e-05, "loss": 1.0461, "step": 2145 }, { "epoch": 0.49135661133371494, "grad_norm": 2.368657350540161, "learning_rate": 1.078921293325213e-05, "loss": 1.1125, "step": 2146 }, { "epoch": 0.49158557527189467, "grad_norm": 1.0939091444015503, "learning_rate": 1.078181769067681e-05, "loss": 1.1154, "step": 2147 }, { "epoch": 0.4918145392100744, "grad_norm": 1.1558587551116943, "learning_rate": 1.077442201787377e-05, "loss": 1.049, "step": 2148 }, { "epoch": 0.4920435031482541, "grad_norm": 1.6609954833984375, "learning_rate": 1.0767025918912785e-05, "loss": 1.0641, "step": 2149 }, { "epoch": 0.4922724670864339, "grad_norm": 1.109490990638733, "learning_rate": 1.075962939786387e-05, "loss": 1.0407, "step": 2150 }, { "epoch": 0.49250143102461363, "grad_norm": 1.1975339651107788, "learning_rate": 1.0752232458797262e-05, "loss": 1.1138, "step": 2151 }, { "epoch": 0.49273039496279336, "grad_norm": 2.040076971054077, "learning_rate": 1.0744835105783443e-05, "loss": 1.0248, "step": 2152 }, { "epoch": 0.4929593589009731, "grad_norm": 1.3864123821258545, "learning_rate": 1.0737437342893107e-05, "loss": 1.089, "step": 2153 }, { "epoch": 0.4931883228391528, "grad_norm": 1.306090235710144, "learning_rate": 1.0730039174197185e-05, "loss": 1.0915, "step": 2154 }, { "epoch": 0.4934172867773326, "grad_norm": 1.2275519371032715, "learning_rate": 1.0722640603766825e-05, "loss": 1.0502, "step": 2155 }, { "epoch": 0.4936462507155123, "grad_norm": 1.4218353033065796, "learning_rate": 1.0715241635673401e-05, "loss": 1.1642, "step": 2156 }, { "epoch": 0.49387521465369205, "grad_norm": 1.3406203985214233, "learning_rate": 1.0707842273988498e-05, "loss": 1.1157, "step": 2157 }, { "epoch": 0.4941041785918718, "grad_norm": 1.8006969690322876, "learning_rate": 1.070044252278393e-05, "loss": 1.0964, "step": 2158 }, { "epoch": 0.4943331425300515, "grad_norm": 1.0731760263442993, "learning_rate": 1.0693042386131713e-05, "loss": 1.0523, "step": 2159 }, { "epoch": 0.49456210646823123, "grad_norm": 1.6657426357269287, "learning_rate": 1.0685641868104085e-05, "loss": 1.0756, "step": 2160 }, { "epoch": 0.494791070406411, "grad_norm": 1.2945160865783691, "learning_rate": 1.0678240972773479e-05, "loss": 1.0694, "step": 2161 }, { "epoch": 0.49502003434459074, "grad_norm": 1.229271650314331, "learning_rate": 1.0670839704212555e-05, "loss": 1.084, "step": 2162 }, { "epoch": 0.49524899828277047, "grad_norm": 1.339217185974121, "learning_rate": 1.0663438066494168e-05, "loss": 1.1103, "step": 2163 }, { "epoch": 0.4954779622209502, "grad_norm": 2.237689971923828, "learning_rate": 1.0656036063691373e-05, "loss": 1.0424, "step": 2164 }, { "epoch": 0.4957069261591299, "grad_norm": 1.0860393047332764, "learning_rate": 1.064863369987743e-05, "loss": 1.0646, "step": 2165 }, { "epoch": 0.49593589009730965, "grad_norm": 1.190271258354187, "learning_rate": 1.0641230979125804e-05, "loss": 1.0778, "step": 2166 }, { "epoch": 0.49616485403548943, "grad_norm": 1.1193439960479736, "learning_rate": 1.0633827905510146e-05, "loss": 1.0855, "step": 2167 }, { "epoch": 0.49639381797366916, "grad_norm": 1.3502209186553955, "learning_rate": 1.0626424483104302e-05, "loss": 1.0405, "step": 2168 }, { "epoch": 0.4966227819118489, "grad_norm": 1.0666968822479248, "learning_rate": 1.061902071598232e-05, "loss": 1.0146, "step": 2169 }, { "epoch": 0.4968517458500286, "grad_norm": 1.291035771369934, "learning_rate": 1.0611616608218429e-05, "loss": 1.058, "step": 2170 }, { "epoch": 0.49708070978820834, "grad_norm": 1.1638990640640259, "learning_rate": 1.0604212163887044e-05, "loss": 1.0792, "step": 2171 }, { "epoch": 0.49730967372638807, "grad_norm": 1.4782902002334595, "learning_rate": 1.0596807387062772e-05, "loss": 1.0652, "step": 2172 }, { "epoch": 0.49753863766456785, "grad_norm": 1.5571351051330566, "learning_rate": 1.0589402281820397e-05, "loss": 1.1116, "step": 2173 }, { "epoch": 0.4977676016027476, "grad_norm": 1.5529664754867554, "learning_rate": 1.058199685223489e-05, "loss": 1.0767, "step": 2174 }, { "epoch": 0.4979965655409273, "grad_norm": 1.0995012521743774, "learning_rate": 1.0574591102381395e-05, "loss": 1.0161, "step": 2175 }, { "epoch": 0.49822552947910703, "grad_norm": 1.1900322437286377, "learning_rate": 1.056718503633523e-05, "loss": 0.9899, "step": 2176 }, { "epoch": 0.49845449341728676, "grad_norm": 1.1697393655776978, "learning_rate": 1.05597786581719e-05, "loss": 1.0988, "step": 2177 }, { "epoch": 0.49868345735546654, "grad_norm": 1.67704176902771, "learning_rate": 1.0552371971967064e-05, "loss": 1.0819, "step": 2178 }, { "epoch": 0.49891242129364627, "grad_norm": 1.1780916452407837, "learning_rate": 1.0544964981796563e-05, "loss": 1.0438, "step": 2179 }, { "epoch": 0.499141385231826, "grad_norm": 1.136317491531372, "learning_rate": 1.0537557691736402e-05, "loss": 1.0522, "step": 2180 }, { "epoch": 0.4993703491700057, "grad_norm": 1.273093581199646, "learning_rate": 1.0530150105862748e-05, "loss": 1.0864, "step": 2181 }, { "epoch": 0.49959931310818545, "grad_norm": 1.1058335304260254, "learning_rate": 1.052274222825194e-05, "loss": 1.0664, "step": 2182 }, { "epoch": 0.4998282770463652, "grad_norm": 1.14626944065094, "learning_rate": 1.0515334062980463e-05, "loss": 1.0342, "step": 2183 }, { "epoch": 0.500057240984545, "grad_norm": 1.7187449932098389, "learning_rate": 1.0507925614124977e-05, "loss": 1.0868, "step": 2184 }, { "epoch": 0.5002862049227247, "grad_norm": 1.194671630859375, "learning_rate": 1.0500516885762278e-05, "loss": 1.1066, "step": 2185 }, { "epoch": 0.5005151688609044, "grad_norm": 1.1433876752853394, "learning_rate": 1.0493107881969335e-05, "loss": 1.0736, "step": 2186 }, { "epoch": 0.5007441327990841, "grad_norm": 1.1823675632476807, "learning_rate": 1.0485698606823258e-05, "loss": 1.0713, "step": 2187 }, { "epoch": 0.5009730967372639, "grad_norm": 1.105976939201355, "learning_rate": 1.047828906440131e-05, "loss": 1.06, "step": 2188 }, { "epoch": 0.5012020606754436, "grad_norm": 1.8626004457473755, "learning_rate": 1.0470879258780904e-05, "loss": 1.0403, "step": 2189 }, { "epoch": 0.5014310246136233, "grad_norm": 0.9595720171928406, "learning_rate": 1.0463469194039584e-05, "loss": 1.0738, "step": 2190 }, { "epoch": 0.501659988551803, "grad_norm": 1.0298045873641968, "learning_rate": 1.0456058874255055e-05, "loss": 1.0297, "step": 2191 }, { "epoch": 0.5018889524899828, "grad_norm": 1.3229492902755737, "learning_rate": 1.044864830350515e-05, "loss": 1.0169, "step": 2192 }, { "epoch": 0.5021179164281626, "grad_norm": 1.3693296909332275, "learning_rate": 1.0441237485867845e-05, "loss": 1.0533, "step": 2193 }, { "epoch": 0.5023468803663423, "grad_norm": 1.587257981300354, "learning_rate": 1.0433826425421252e-05, "loss": 1.0314, "step": 2194 }, { "epoch": 0.5025758443045221, "grad_norm": 1.2772449254989624, "learning_rate": 1.0426415126243615e-05, "loss": 1.1135, "step": 2195 }, { "epoch": 0.5028048082427018, "grad_norm": 1.3835172653198242, "learning_rate": 1.0419003592413308e-05, "loss": 1.0587, "step": 2196 }, { "epoch": 0.5030337721808815, "grad_norm": 1.1872873306274414, "learning_rate": 1.0411591828008839e-05, "loss": 1.092, "step": 2197 }, { "epoch": 0.5032627361190612, "grad_norm": 1.5710023641586304, "learning_rate": 1.0404179837108833e-05, "loss": 1.0778, "step": 2198 }, { "epoch": 0.503491700057241, "grad_norm": 1.2492884397506714, "learning_rate": 1.0396767623792054e-05, "loss": 1.0466, "step": 2199 }, { "epoch": 0.5037206639954207, "grad_norm": 1.0260891914367676, "learning_rate": 1.0389355192137379e-05, "loss": 1.1209, "step": 2200 }, { "epoch": 0.5039496279336004, "grad_norm": 1.9055163860321045, "learning_rate": 1.0381942546223805e-05, "loss": 1.0782, "step": 2201 }, { "epoch": 0.5041785918717802, "grad_norm": 1.1072444915771484, "learning_rate": 1.0374529690130448e-05, "loss": 1.1312, "step": 2202 }, { "epoch": 0.5044075558099599, "grad_norm": 1.2415649890899658, "learning_rate": 1.0367116627936549e-05, "loss": 1.0609, "step": 2203 }, { "epoch": 0.5046365197481397, "grad_norm": 1.1317917108535767, "learning_rate": 1.0359703363721443e-05, "loss": 1.0392, "step": 2204 }, { "epoch": 0.5048654836863194, "grad_norm": 1.2405906915664673, "learning_rate": 1.0352289901564592e-05, "loss": 1.0132, "step": 2205 }, { "epoch": 0.5050944476244992, "grad_norm": 1.1400092840194702, "learning_rate": 1.034487624554556e-05, "loss": 1.026, "step": 2206 }, { "epoch": 0.5053234115626789, "grad_norm": 1.1699836254119873, "learning_rate": 1.0337462399744025e-05, "loss": 1.077, "step": 2207 }, { "epoch": 0.5055523755008586, "grad_norm": 1.1276884078979492, "learning_rate": 1.033004836823976e-05, "loss": 1.1002, "step": 2208 }, { "epoch": 0.5057813394390384, "grad_norm": 1.2098451852798462, "learning_rate": 1.032263415511264e-05, "loss": 1.006, "step": 2209 }, { "epoch": 0.5060103033772181, "grad_norm": 1.1760305166244507, "learning_rate": 1.0315219764442657e-05, "loss": 1.0193, "step": 2210 }, { "epoch": 0.5062392673153978, "grad_norm": 1.0920625925064087, "learning_rate": 1.0307805200309877e-05, "loss": 1.0141, "step": 2211 }, { "epoch": 0.5064682312535775, "grad_norm": 1.2013732194900513, "learning_rate": 1.0300390466794477e-05, "loss": 1.0508, "step": 2212 }, { "epoch": 0.5066971951917573, "grad_norm": 1.3105764389038086, "learning_rate": 1.0292975567976719e-05, "loss": 1.0373, "step": 2213 }, { "epoch": 0.506926159129937, "grad_norm": 1.2512943744659424, "learning_rate": 1.0285560507936962e-05, "loss": 1.0673, "step": 2214 }, { "epoch": 0.5071551230681167, "grad_norm": 1.5378952026367188, "learning_rate": 1.0278145290755657e-05, "loss": 1.0772, "step": 2215 }, { "epoch": 0.5073840870062966, "grad_norm": 1.0876376628875732, "learning_rate": 1.0270729920513326e-05, "loss": 1.0603, "step": 2216 }, { "epoch": 0.5076130509444763, "grad_norm": 1.0932906866073608, "learning_rate": 1.0263314401290589e-05, "loss": 1.0401, "step": 2217 }, { "epoch": 0.507842014882656, "grad_norm": 1.3708820343017578, "learning_rate": 1.0255898737168147e-05, "loss": 1.073, "step": 2218 }, { "epoch": 0.5080709788208357, "grad_norm": 1.1124688386917114, "learning_rate": 1.0248482932226775e-05, "loss": 1.0008, "step": 2219 }, { "epoch": 0.5082999427590155, "grad_norm": 2.375359058380127, "learning_rate": 1.0241066990547328e-05, "loss": 1.0608, "step": 2220 }, { "epoch": 0.5085289066971952, "grad_norm": 1.3283129930496216, "learning_rate": 1.0233650916210736e-05, "loss": 1.0725, "step": 2221 }, { "epoch": 0.5087578706353749, "grad_norm": 1.1090928316116333, "learning_rate": 1.0226234713298007e-05, "loss": 1.0388, "step": 2222 }, { "epoch": 0.5089868345735546, "grad_norm": 1.2009447813034058, "learning_rate": 1.021881838589021e-05, "loss": 1.0969, "step": 2223 }, { "epoch": 0.5092157985117344, "grad_norm": 1.144832968711853, "learning_rate": 1.021140193806849e-05, "loss": 1.0553, "step": 2224 }, { "epoch": 0.5094447624499141, "grad_norm": 1.072203516960144, "learning_rate": 1.0203985373914056e-05, "loss": 1.0466, "step": 2225 }, { "epoch": 0.5096737263880938, "grad_norm": 1.0581434965133667, "learning_rate": 1.019656869750818e-05, "loss": 1.0244, "step": 2226 }, { "epoch": 0.5099026903262737, "grad_norm": 1.2225676774978638, "learning_rate": 1.0189151912932199e-05, "loss": 1.0231, "step": 2227 }, { "epoch": 0.5101316542644534, "grad_norm": 1.196749210357666, "learning_rate": 1.0181735024267504e-05, "loss": 1.128, "step": 2228 }, { "epoch": 0.5103606182026331, "grad_norm": 1.0913293361663818, "learning_rate": 1.0174318035595551e-05, "loss": 1.1325, "step": 2229 }, { "epoch": 0.5105895821408128, "grad_norm": 1.296054720878601, "learning_rate": 1.0166900950997845e-05, "loss": 1.0734, "step": 2230 }, { "epoch": 0.5108185460789926, "grad_norm": 1.2411744594573975, "learning_rate": 1.0159483774555945e-05, "loss": 1.033, "step": 2231 }, { "epoch": 0.5110475100171723, "grad_norm": 1.2513041496276855, "learning_rate": 1.015206651035146e-05, "loss": 1.1326, "step": 2232 }, { "epoch": 0.511276473955352, "grad_norm": 1.182816743850708, "learning_rate": 1.0144649162466047e-05, "loss": 1.0539, "step": 2233 }, { "epoch": 0.5115054378935318, "grad_norm": 1.555910587310791, "learning_rate": 1.0137231734981417e-05, "loss": 1.0438, "step": 2234 }, { "epoch": 0.5117344018317115, "grad_norm": 1.1628386974334717, "learning_rate": 1.012981423197931e-05, "loss": 1.1111, "step": 2235 }, { "epoch": 0.5119633657698912, "grad_norm": 1.134666919708252, "learning_rate": 1.0122396657541522e-05, "loss": 1.0665, "step": 2236 }, { "epoch": 0.5121923297080709, "grad_norm": 1.291595458984375, "learning_rate": 1.011497901574988e-05, "loss": 0.956, "step": 2237 }, { "epoch": 0.5124212936462507, "grad_norm": 1.0975548028945923, "learning_rate": 1.0107561310686247e-05, "loss": 1.0621, "step": 2238 }, { "epoch": 0.5126502575844305, "grad_norm": 1.4685457944869995, "learning_rate": 1.0100143546432527e-05, "loss": 1.11, "step": 2239 }, { "epoch": 0.5128792215226102, "grad_norm": 1.5499354600906372, "learning_rate": 1.0092725727070653e-05, "loss": 1.0941, "step": 2240 }, { "epoch": 0.51310818546079, "grad_norm": 1.6682047843933105, "learning_rate": 1.0085307856682593e-05, "loss": 1.0784, "step": 2241 }, { "epoch": 0.5133371493989697, "grad_norm": 1.0723470449447632, "learning_rate": 1.007788993935033e-05, "loss": 1.0805, "step": 2242 }, { "epoch": 0.5135661133371494, "grad_norm": 1.175240397453308, "learning_rate": 1.007047197915589e-05, "loss": 1.098, "step": 2243 }, { "epoch": 0.5137950772753291, "grad_norm": 1.1597743034362793, "learning_rate": 1.0063053980181305e-05, "loss": 1.0723, "step": 2244 }, { "epoch": 0.5140240412135089, "grad_norm": 1.0197961330413818, "learning_rate": 1.0055635946508649e-05, "loss": 1.0706, "step": 2245 }, { "epoch": 0.5142530051516886, "grad_norm": 1.1021794080734253, "learning_rate": 1.0048217882219995e-05, "loss": 1.0715, "step": 2246 }, { "epoch": 0.5144819690898683, "grad_norm": 1.0522361993789673, "learning_rate": 1.0040799791397444e-05, "loss": 1.0604, "step": 2247 }, { "epoch": 0.514710933028048, "grad_norm": 1.32160484790802, "learning_rate": 1.0033381678123113e-05, "loss": 1.0638, "step": 2248 }, { "epoch": 0.5149398969662278, "grad_norm": 1.5277987718582153, "learning_rate": 1.002596354647912e-05, "loss": 1.1117, "step": 2249 }, { "epoch": 0.5151688609044076, "grad_norm": 1.1318628787994385, "learning_rate": 1.0018545400547609e-05, "loss": 1.0124, "step": 2250 }, { "epoch": 0.5153978248425873, "grad_norm": 1.8370736837387085, "learning_rate": 1.001112724441072e-05, "loss": 1.0463, "step": 2251 }, { "epoch": 0.5156267887807671, "grad_norm": 2.369612693786621, "learning_rate": 1.0003709082150598e-05, "loss": 1.0653, "step": 2252 }, { "epoch": 0.5158557527189468, "grad_norm": 1.059601902961731, "learning_rate": 9.996290917849405e-06, "loss": 1.1053, "step": 2253 }, { "epoch": 0.5160847166571265, "grad_norm": 1.0960521697998047, "learning_rate": 9.988872755589283e-06, "loss": 1.0594, "step": 2254 }, { "epoch": 0.5163136805953062, "grad_norm": 1.3870129585266113, "learning_rate": 9.981454599452391e-06, "loss": 1.0288, "step": 2255 }, { "epoch": 0.516542644533486, "grad_norm": 1.0221267938613892, "learning_rate": 9.974036453520881e-06, "loss": 0.9995, "step": 2256 }, { "epoch": 0.5167716084716657, "grad_norm": 1.0023499727249146, "learning_rate": 9.966618321876889e-06, "loss": 1.0261, "step": 2257 }, { "epoch": 0.5170005724098454, "grad_norm": 1.196207880973816, "learning_rate": 9.95920020860256e-06, "loss": 1.0774, "step": 2258 }, { "epoch": 0.5172295363480252, "grad_norm": 1.261536717414856, "learning_rate": 9.951782117780008e-06, "loss": 1.0645, "step": 2259 }, { "epoch": 0.5174585002862049, "grad_norm": 1.8544546365737915, "learning_rate": 9.944364053491356e-06, "loss": 1.0541, "step": 2260 }, { "epoch": 0.5176874642243846, "grad_norm": 1.036802053451538, "learning_rate": 9.936946019818698e-06, "loss": 1.0645, "step": 2261 }, { "epoch": 0.5179164281625644, "grad_norm": 1.515656590461731, "learning_rate": 9.929528020844114e-06, "loss": 1.0333, "step": 2262 }, { "epoch": 0.5181453921007442, "grad_norm": 1.1658852100372314, "learning_rate": 9.922110060649672e-06, "loss": 1.0802, "step": 2263 }, { "epoch": 0.5183743560389239, "grad_norm": 1.0310983657836914, "learning_rate": 9.914692143317412e-06, "loss": 1.0777, "step": 2264 }, { "epoch": 0.5186033199771036, "grad_norm": 1.1228355169296265, "learning_rate": 9.90727427292935e-06, "loss": 1.0623, "step": 2265 }, { "epoch": 0.5188322839152834, "grad_norm": 1.1571909189224243, "learning_rate": 9.899856453567474e-06, "loss": 1.0168, "step": 2266 }, { "epoch": 0.5190612478534631, "grad_norm": 1.233893871307373, "learning_rate": 9.892438689313757e-06, "loss": 1.0401, "step": 2267 }, { "epoch": 0.5192902117916428, "grad_norm": 2.297332525253296, "learning_rate": 9.885020984250124e-06, "loss": 1.0835, "step": 2268 }, { "epoch": 0.5195191757298225, "grad_norm": 1.0272998809814453, "learning_rate": 9.877603342458483e-06, "loss": 1.0059, "step": 2269 }, { "epoch": 0.5197481396680023, "grad_norm": 1.3607406616210938, "learning_rate": 9.870185768020694e-06, "loss": 1.072, "step": 2270 }, { "epoch": 0.519977103606182, "grad_norm": 1.6629681587219238, "learning_rate": 9.862768265018585e-06, "loss": 1.0697, "step": 2271 }, { "epoch": 0.5202060675443617, "grad_norm": 1.2140170335769653, "learning_rate": 9.855350837533958e-06, "loss": 1.0702, "step": 2272 }, { "epoch": 0.5204350314825416, "grad_norm": 1.465503454208374, "learning_rate": 9.847933489648545e-06, "loss": 1.1246, "step": 2273 }, { "epoch": 0.5206639954207213, "grad_norm": 1.1289547681808472, "learning_rate": 9.840516225444059e-06, "loss": 1.0491, "step": 2274 }, { "epoch": 0.520892959358901, "grad_norm": 1.1703757047653198, "learning_rate": 9.83309904900216e-06, "loss": 1.0422, "step": 2275 }, { "epoch": 0.5211219232970807, "grad_norm": 1.3183718919754028, "learning_rate": 9.82568196440445e-06, "loss": 1.1088, "step": 2276 }, { "epoch": 0.5213508872352605, "grad_norm": 1.2612817287445068, "learning_rate": 9.818264975732497e-06, "loss": 1.0711, "step": 2277 }, { "epoch": 0.5215798511734402, "grad_norm": 1.238250732421875, "learning_rate": 9.810848087067805e-06, "loss": 1.0843, "step": 2278 }, { "epoch": 0.5218088151116199, "grad_norm": 1.1225922107696533, "learning_rate": 9.803431302491823e-06, "loss": 1.0751, "step": 2279 }, { "epoch": 0.5220377790497996, "grad_norm": 1.3378745317459106, "learning_rate": 9.79601462608595e-06, "loss": 1.0344, "step": 2280 }, { "epoch": 0.5222667429879794, "grad_norm": 1.1414393186569214, "learning_rate": 9.788598061931513e-06, "loss": 1.0469, "step": 2281 }, { "epoch": 0.5224957069261591, "grad_norm": 1.9631890058517456, "learning_rate": 9.781181614109793e-06, "loss": 1.1085, "step": 2282 }, { "epoch": 0.5227246708643388, "grad_norm": 1.1475551128387451, "learning_rate": 9.773765286701998e-06, "loss": 1.0458, "step": 2283 }, { "epoch": 0.5229536348025186, "grad_norm": 1.056517481803894, "learning_rate": 9.766349083789266e-06, "loss": 1.0451, "step": 2284 }, { "epoch": 0.5231825987406984, "grad_norm": 1.2475084066390991, "learning_rate": 9.758933009452674e-06, "loss": 1.0261, "step": 2285 }, { "epoch": 0.5234115626788781, "grad_norm": 1.041703701019287, "learning_rate": 9.751517067773228e-06, "loss": 1.0591, "step": 2286 }, { "epoch": 0.5236405266170578, "grad_norm": 1.334044337272644, "learning_rate": 9.744101262831855e-06, "loss": 1.0793, "step": 2287 }, { "epoch": 0.5238694905552376, "grad_norm": 1.1858912706375122, "learning_rate": 9.736685598709413e-06, "loss": 1.084, "step": 2288 }, { "epoch": 0.5240984544934173, "grad_norm": 1.2032268047332764, "learning_rate": 9.72927007948668e-06, "loss": 1.0218, "step": 2289 }, { "epoch": 0.524327418431597, "grad_norm": 1.705446720123291, "learning_rate": 9.721854709244346e-06, "loss": 1.0121, "step": 2290 }, { "epoch": 0.5245563823697768, "grad_norm": 1.374306559562683, "learning_rate": 9.71443949206304e-06, "loss": 1.0152, "step": 2291 }, { "epoch": 0.5247853463079565, "grad_norm": 1.2469069957733154, "learning_rate": 9.707024432023284e-06, "loss": 1.0331, "step": 2292 }, { "epoch": 0.5250143102461362, "grad_norm": 1.5407803058624268, "learning_rate": 9.699609533205527e-06, "loss": 1.0996, "step": 2293 }, { "epoch": 0.5252432741843159, "grad_norm": 1.1991982460021973, "learning_rate": 9.692194799690128e-06, "loss": 1.1026, "step": 2294 }, { "epoch": 0.5254722381224957, "grad_norm": 1.3273327350616455, "learning_rate": 9.684780235557346e-06, "loss": 1.0099, "step": 2295 }, { "epoch": 0.5257012020606755, "grad_norm": 1.2710803747177124, "learning_rate": 9.67736584488736e-06, "loss": 1.0538, "step": 2296 }, { "epoch": 0.5259301659988552, "grad_norm": 1.1919000148773193, "learning_rate": 9.669951631760245e-06, "loss": 1.06, "step": 2297 }, { "epoch": 0.526159129937035, "grad_norm": 1.2854589223861694, "learning_rate": 9.662537600255979e-06, "loss": 1.044, "step": 2298 }, { "epoch": 0.5263880938752147, "grad_norm": 1.4513083696365356, "learning_rate": 9.65512375445444e-06, "loss": 1.0597, "step": 2299 }, { "epoch": 0.5266170578133944, "grad_norm": 1.1614129543304443, "learning_rate": 9.647710098435413e-06, "loss": 0.9531, "step": 2300 }, { "epoch": 0.5268460217515741, "grad_norm": 1.165914535522461, "learning_rate": 9.64029663627856e-06, "loss": 1.0857, "step": 2301 }, { "epoch": 0.5270749856897539, "grad_norm": 1.8047739267349243, "learning_rate": 9.632883372063458e-06, "loss": 1.029, "step": 2302 }, { "epoch": 0.5273039496279336, "grad_norm": 1.2753591537475586, "learning_rate": 9.625470309869554e-06, "loss": 0.9939, "step": 2303 }, { "epoch": 0.5275329135661133, "grad_norm": 1.525112509727478, "learning_rate": 9.618057453776196e-06, "loss": 1.0816, "step": 2304 }, { "epoch": 0.527761877504293, "grad_norm": 1.4662785530090332, "learning_rate": 9.610644807862625e-06, "loss": 1.0653, "step": 2305 }, { "epoch": 0.5279908414424728, "grad_norm": 3.009976387023926, "learning_rate": 9.603232376207948e-06, "loss": 0.9941, "step": 2306 }, { "epoch": 0.5282198053806525, "grad_norm": 1.217646837234497, "learning_rate": 9.59582016289117e-06, "loss": 1.0431, "step": 2307 }, { "epoch": 0.5284487693188323, "grad_norm": 1.1886101961135864, "learning_rate": 9.588408171991168e-06, "loss": 1.0745, "step": 2308 }, { "epoch": 0.5286777332570121, "grad_norm": 1.1897996664047241, "learning_rate": 9.580996407586695e-06, "loss": 1.0537, "step": 2309 }, { "epoch": 0.5289066971951918, "grad_norm": 1.416695237159729, "learning_rate": 9.573584873756387e-06, "loss": 1.0699, "step": 2310 }, { "epoch": 0.5291356611333715, "grad_norm": 2.577252149581909, "learning_rate": 9.566173574578751e-06, "loss": 1.0533, "step": 2311 }, { "epoch": 0.5293646250715512, "grad_norm": 1.102662444114685, "learning_rate": 9.558762514132157e-06, "loss": 1.1236, "step": 2312 }, { "epoch": 0.529593589009731, "grad_norm": 1.6311894655227661, "learning_rate": 9.551351696494854e-06, "loss": 1.1093, "step": 2313 }, { "epoch": 0.5298225529479107, "grad_norm": 1.327171802520752, "learning_rate": 9.543941125744947e-06, "loss": 1.0756, "step": 2314 }, { "epoch": 0.5300515168860904, "grad_norm": 1.0158864259719849, "learning_rate": 9.536530805960418e-06, "loss": 1.0776, "step": 2315 }, { "epoch": 0.5302804808242702, "grad_norm": 1.1998199224472046, "learning_rate": 9.529120741219103e-06, "loss": 1.0484, "step": 2316 }, { "epoch": 0.5305094447624499, "grad_norm": 1.073457956314087, "learning_rate": 9.521710935598693e-06, "loss": 0.9798, "step": 2317 }, { "epoch": 0.5307384087006296, "grad_norm": 1.1644235849380493, "learning_rate": 9.514301393176742e-06, "loss": 1.0871, "step": 2318 }, { "epoch": 0.5309673726388094, "grad_norm": 1.074922800064087, "learning_rate": 9.506892118030668e-06, "loss": 1.0178, "step": 2319 }, { "epoch": 0.5311963365769892, "grad_norm": 1.2031996250152588, "learning_rate": 9.499483114237726e-06, "loss": 1.1128, "step": 2320 }, { "epoch": 0.5314253005151689, "grad_norm": 1.1941750049591064, "learning_rate": 9.492074385875025e-06, "loss": 1.0887, "step": 2321 }, { "epoch": 0.5316542644533486, "grad_norm": 1.6538817882537842, "learning_rate": 9.484665937019539e-06, "loss": 1.0918, "step": 2322 }, { "epoch": 0.5318832283915284, "grad_norm": 1.1676061153411865, "learning_rate": 9.477257771748061e-06, "loss": 1.0102, "step": 2323 }, { "epoch": 0.5321121923297081, "grad_norm": 1.8035333156585693, "learning_rate": 9.469849894137254e-06, "loss": 1.0648, "step": 2324 }, { "epoch": 0.5323411562678878, "grad_norm": 1.2286866903305054, "learning_rate": 9.4624423082636e-06, "loss": 1.0543, "step": 2325 }, { "epoch": 0.5325701202060675, "grad_norm": 1.29671311378479, "learning_rate": 9.455035018203439e-06, "loss": 1.0887, "step": 2326 }, { "epoch": 0.5327990841442473, "grad_norm": 1.1480847597122192, "learning_rate": 9.44762802803294e-06, "loss": 1.125, "step": 2327 }, { "epoch": 0.533028048082427, "grad_norm": 1.277753233909607, "learning_rate": 9.440221341828104e-06, "loss": 1.0196, "step": 2328 }, { "epoch": 0.5332570120206067, "grad_norm": 1.2395761013031006, "learning_rate": 9.43281496366477e-06, "loss": 1.0753, "step": 2329 }, { "epoch": 0.5334859759587864, "grad_norm": 1.0635128021240234, "learning_rate": 9.42540889761861e-06, "loss": 1.0173, "step": 2330 }, { "epoch": 0.5337149398969663, "grad_norm": 1.3224315643310547, "learning_rate": 9.418003147765113e-06, "loss": 0.992, "step": 2331 }, { "epoch": 0.533943903835146, "grad_norm": 0.9989613890647888, "learning_rate": 9.410597718179603e-06, "loss": 1.0676, "step": 2332 }, { "epoch": 0.5341728677733257, "grad_norm": 1.0474404096603394, "learning_rate": 9.40319261293723e-06, "loss": 1.0012, "step": 2333 }, { "epoch": 0.5344018317115055, "grad_norm": 1.0710350275039673, "learning_rate": 9.395787836112958e-06, "loss": 1.0822, "step": 2334 }, { "epoch": 0.5346307956496852, "grad_norm": 1.1103330850601196, "learning_rate": 9.388383391781576e-06, "loss": 1.05, "step": 2335 }, { "epoch": 0.5348597595878649, "grad_norm": 1.2348805665969849, "learning_rate": 9.380979284017682e-06, "loss": 1.0656, "step": 2336 }, { "epoch": 0.5350887235260446, "grad_norm": 1.1286513805389404, "learning_rate": 9.373575516895698e-06, "loss": 1.0303, "step": 2337 }, { "epoch": 0.5353176874642244, "grad_norm": 1.173068881034851, "learning_rate": 9.366172094489858e-06, "loss": 1.0615, "step": 2338 }, { "epoch": 0.5355466514024041, "grad_norm": 1.185383915901184, "learning_rate": 9.358769020874198e-06, "loss": 1.0928, "step": 2339 }, { "epoch": 0.5357756153405838, "grad_norm": 1.1078901290893555, "learning_rate": 9.351366300122569e-06, "loss": 1.081, "step": 2340 }, { "epoch": 0.5360045792787635, "grad_norm": 1.2607088088989258, "learning_rate": 9.343963936308634e-06, "loss": 1.017, "step": 2341 }, { "epoch": 0.5362335432169434, "grad_norm": 1.102243423461914, "learning_rate": 9.336561933505836e-06, "loss": 1.0839, "step": 2342 }, { "epoch": 0.5364625071551231, "grad_norm": 1.3678319454193115, "learning_rate": 9.329160295787447e-06, "loss": 1.073, "step": 2343 }, { "epoch": 0.5366914710933028, "grad_norm": 1.1074059009552002, "learning_rate": 9.321759027226525e-06, "loss": 1.0501, "step": 2344 }, { "epoch": 0.5369204350314826, "grad_norm": 1.2822704315185547, "learning_rate": 9.314358131895919e-06, "loss": 1.0713, "step": 2345 }, { "epoch": 0.5371493989696623, "grad_norm": 1.119148850440979, "learning_rate": 9.306957613868292e-06, "loss": 1.0374, "step": 2346 }, { "epoch": 0.537378362907842, "grad_norm": 1.2142623662948608, "learning_rate": 9.299557477216073e-06, "loss": 1.0243, "step": 2347 }, { "epoch": 0.5376073268460217, "grad_norm": 1.343944787979126, "learning_rate": 9.292157726011502e-06, "loss": 1.1066, "step": 2348 }, { "epoch": 0.5378362907842015, "grad_norm": 1.2517242431640625, "learning_rate": 9.284758364326604e-06, "loss": 1.0373, "step": 2349 }, { "epoch": 0.5380652547223812, "grad_norm": 1.8333529233932495, "learning_rate": 9.277359396233176e-06, "loss": 1.04, "step": 2350 }, { "epoch": 0.5382942186605609, "grad_norm": 1.1809799671173096, "learning_rate": 9.269960825802817e-06, "loss": 1.0577, "step": 2351 }, { "epoch": 0.5385231825987407, "grad_norm": 1.1359210014343262, "learning_rate": 9.262562657106898e-06, "loss": 1.0731, "step": 2352 }, { "epoch": 0.5387521465369204, "grad_norm": 1.6872447729110718, "learning_rate": 9.255164894216562e-06, "loss": 1.0443, "step": 2353 }, { "epoch": 0.5389811104751002, "grad_norm": 1.3061941862106323, "learning_rate": 9.247767541202738e-06, "loss": 1.0525, "step": 2354 }, { "epoch": 0.53921007441328, "grad_norm": 1.4695810079574585, "learning_rate": 9.240370602136134e-06, "loss": 1.0589, "step": 2355 }, { "epoch": 0.5394390383514597, "grad_norm": 1.203343152999878, "learning_rate": 9.232974081087216e-06, "loss": 1.0574, "step": 2356 }, { "epoch": 0.5396680022896394, "grad_norm": 1.3898227214813232, "learning_rate": 9.225577982126234e-06, "loss": 1.0697, "step": 2357 }, { "epoch": 0.5398969662278191, "grad_norm": 1.1392302513122559, "learning_rate": 9.218182309323193e-06, "loss": 1.0639, "step": 2358 }, { "epoch": 0.5401259301659989, "grad_norm": 0.9744246602058411, "learning_rate": 9.21078706674787e-06, "loss": 1.0744, "step": 2359 }, { "epoch": 0.5403548941041786, "grad_norm": 1.147935152053833, "learning_rate": 9.203392258469814e-06, "loss": 1.0258, "step": 2360 }, { "epoch": 0.5405838580423583, "grad_norm": 1.1611018180847168, "learning_rate": 9.195997888558312e-06, "loss": 1.0645, "step": 2361 }, { "epoch": 0.540812821980538, "grad_norm": 1.4205443859100342, "learning_rate": 9.188603961082436e-06, "loss": 1.0319, "step": 2362 }, { "epoch": 0.5410417859187178, "grad_norm": 1.160415768623352, "learning_rate": 9.181210480110997e-06, "loss": 1.1, "step": 2363 }, { "epoch": 0.5412707498568975, "grad_norm": 1.692002296447754, "learning_rate": 9.173817449712563e-06, "loss": 1.0234, "step": 2364 }, { "epoch": 0.5414997137950772, "grad_norm": 1.0437875986099243, "learning_rate": 9.166424873955462e-06, "loss": 1.0846, "step": 2365 }, { "epoch": 0.5417286777332571, "grad_norm": 2.888340473175049, "learning_rate": 9.159032756907765e-06, "loss": 1.0399, "step": 2366 }, { "epoch": 0.5419576416714368, "grad_norm": 1.0444552898406982, "learning_rate": 9.151641102637289e-06, "loss": 1.0821, "step": 2367 }, { "epoch": 0.5421866056096165, "grad_norm": 1.2841830253601074, "learning_rate": 9.144249915211605e-06, "loss": 1.0579, "step": 2368 }, { "epoch": 0.5424155695477962, "grad_norm": 1.8997342586517334, "learning_rate": 9.136859198698014e-06, "loss": 1.1244, "step": 2369 }, { "epoch": 0.542644533485976, "grad_norm": 1.4083105325698853, "learning_rate": 9.12946895716357e-06, "loss": 1.0149, "step": 2370 }, { "epoch": 0.5428734974241557, "grad_norm": 1.2661566734313965, "learning_rate": 9.122079194675064e-06, "loss": 1.0958, "step": 2371 }, { "epoch": 0.5431024613623354, "grad_norm": 1.254859209060669, "learning_rate": 9.114689915299015e-06, "loss": 1.1392, "step": 2372 }, { "epoch": 0.5433314253005151, "grad_norm": 1.7026736736297607, "learning_rate": 9.10730112310168e-06, "loss": 1.0332, "step": 2373 }, { "epoch": 0.5435603892386949, "grad_norm": 1.1682655811309814, "learning_rate": 9.099912822149056e-06, "loss": 1.0609, "step": 2374 }, { "epoch": 0.5437893531768746, "grad_norm": 1.0199397802352905, "learning_rate": 9.092525016506858e-06, "loss": 1.0164, "step": 2375 }, { "epoch": 0.5440183171150543, "grad_norm": 1.0826716423034668, "learning_rate": 9.085137710240536e-06, "loss": 1.0245, "step": 2376 }, { "epoch": 0.5442472810532342, "grad_norm": 1.1761521100997925, "learning_rate": 9.077750907415264e-06, "loss": 1.0278, "step": 2377 }, { "epoch": 0.5444762449914139, "grad_norm": 1.112975001335144, "learning_rate": 9.07036461209593e-06, "loss": 1.021, "step": 2378 }, { "epoch": 0.5447052089295936, "grad_norm": 1.1769195795059204, "learning_rate": 9.06297882834716e-06, "loss": 1.0348, "step": 2379 }, { "epoch": 0.5449341728677733, "grad_norm": 1.3177459239959717, "learning_rate": 9.055593560233284e-06, "loss": 1.0599, "step": 2380 }, { "epoch": 0.5451631368059531, "grad_norm": 1.2103499174118042, "learning_rate": 9.048208811818353e-06, "loss": 1.0701, "step": 2381 }, { "epoch": 0.5453921007441328, "grad_norm": 1.5910011529922485, "learning_rate": 9.040824587166136e-06, "loss": 1.1076, "step": 2382 }, { "epoch": 0.5456210646823125, "grad_norm": 1.020159125328064, "learning_rate": 9.033440890340102e-06, "loss": 1.0, "step": 2383 }, { "epoch": 0.5458500286204923, "grad_norm": 1.280670166015625, "learning_rate": 9.026057725403445e-06, "loss": 1.015, "step": 2384 }, { "epoch": 0.546078992558672, "grad_norm": 1.4677191972732544, "learning_rate": 9.018675096419059e-06, "loss": 1.0787, "step": 2385 }, { "epoch": 0.5463079564968517, "grad_norm": 1.1645160913467407, "learning_rate": 9.011293007449535e-06, "loss": 1.0954, "step": 2386 }, { "epoch": 0.5465369204350314, "grad_norm": 1.4413548707962036, "learning_rate": 9.003911462557185e-06, "loss": 1.0522, "step": 2387 }, { "epoch": 0.5467658843732112, "grad_norm": 1.051669955253601, "learning_rate": 8.996530465804e-06, "loss": 1.0343, "step": 2388 }, { "epoch": 0.546994848311391, "grad_norm": 1.315771222114563, "learning_rate": 8.98915002125169e-06, "loss": 1.0497, "step": 2389 }, { "epoch": 0.5472238122495707, "grad_norm": 1.1893324851989746, "learning_rate": 8.981770132961649e-06, "loss": 1.0671, "step": 2390 }, { "epoch": 0.5474527761877505, "grad_norm": 1.8192094564437866, "learning_rate": 8.974390804994966e-06, "loss": 1.0127, "step": 2391 }, { "epoch": 0.5476817401259302, "grad_norm": 1.2040550708770752, "learning_rate": 8.96701204141242e-06, "loss": 1.1041, "step": 2392 }, { "epoch": 0.5479107040641099, "grad_norm": 1.1481995582580566, "learning_rate": 8.959633846274494e-06, "loss": 1.0686, "step": 2393 }, { "epoch": 0.5481396680022896, "grad_norm": 1.2036935091018677, "learning_rate": 8.952256223641337e-06, "loss": 1.0603, "step": 2394 }, { "epoch": 0.5483686319404694, "grad_norm": 1.1734176874160767, "learning_rate": 8.944879177572796e-06, "loss": 1.0554, "step": 2395 }, { "epoch": 0.5485975958786491, "grad_norm": 1.3413574695587158, "learning_rate": 8.9375027121284e-06, "loss": 0.9716, "step": 2396 }, { "epoch": 0.5488265598168288, "grad_norm": 1.8128085136413574, "learning_rate": 8.930126831367346e-06, "loss": 1.0691, "step": 2397 }, { "epoch": 0.5490555237550085, "grad_norm": 1.5389128923416138, "learning_rate": 8.922751539348534e-06, "loss": 1.0215, "step": 2398 }, { "epoch": 0.5492844876931883, "grad_norm": 1.3885033130645752, "learning_rate": 8.915376840130517e-06, "loss": 1.1011, "step": 2399 }, { "epoch": 0.5495134516313681, "grad_norm": 1.6749422550201416, "learning_rate": 8.908002737771529e-06, "loss": 1.0884, "step": 2400 }, { "epoch": 0.5497424155695478, "grad_norm": 1.39671790599823, "learning_rate": 8.900629236329482e-06, "loss": 1.0296, "step": 2401 }, { "epoch": 0.5499713795077276, "grad_norm": 1.2996166944503784, "learning_rate": 8.893256339861946e-06, "loss": 1.0335, "step": 2402 }, { "epoch": 0.5502003434459073, "grad_norm": 1.482348084449768, "learning_rate": 8.885884052426168e-06, "loss": 1.0564, "step": 2403 }, { "epoch": 0.550429307384087, "grad_norm": 1.385023593902588, "learning_rate": 8.878512378079057e-06, "loss": 1.0633, "step": 2404 }, { "epoch": 0.5506582713222667, "grad_norm": 1.3737043142318726, "learning_rate": 8.871141320877181e-06, "loss": 1.0962, "step": 2405 }, { "epoch": 0.5508872352604465, "grad_norm": 1.2021304368972778, "learning_rate": 8.86377088487677e-06, "loss": 1.0578, "step": 2406 }, { "epoch": 0.5511161991986262, "grad_norm": 1.1903609037399292, "learning_rate": 8.856401074133718e-06, "loss": 1.0453, "step": 2407 }, { "epoch": 0.5513451631368059, "grad_norm": 1.103837013244629, "learning_rate": 8.849031892703564e-06, "loss": 0.9953, "step": 2408 }, { "epoch": 0.5515741270749857, "grad_norm": 1.2637211084365845, "learning_rate": 8.841663344641514e-06, "loss": 1.0946, "step": 2409 }, { "epoch": 0.5518030910131654, "grad_norm": 1.2190263271331787, "learning_rate": 8.83429543400241e-06, "loss": 1.0259, "step": 2410 }, { "epoch": 0.5520320549513451, "grad_norm": 1.3978723287582397, "learning_rate": 8.826928164840755e-06, "loss": 1.0372, "step": 2411 }, { "epoch": 0.552261018889525, "grad_norm": 1.1146042346954346, "learning_rate": 8.819561541210698e-06, "loss": 1.0455, "step": 2412 }, { "epoch": 0.5524899828277047, "grad_norm": 1.5733097791671753, "learning_rate": 8.812195567166028e-06, "loss": 1.0123, "step": 2413 }, { "epoch": 0.5527189467658844, "grad_norm": 1.4020459651947021, "learning_rate": 8.804830246760175e-06, "loss": 1.0588, "step": 2414 }, { "epoch": 0.5529479107040641, "grad_norm": 1.1088488101959229, "learning_rate": 8.797465584046225e-06, "loss": 1.0534, "step": 2415 }, { "epoch": 0.5531768746422439, "grad_norm": 1.1872738599777222, "learning_rate": 8.790101583076874e-06, "loss": 1.0252, "step": 2416 }, { "epoch": 0.5534058385804236, "grad_norm": 1.0902388095855713, "learning_rate": 8.78273824790448e-06, "loss": 1.0794, "step": 2417 }, { "epoch": 0.5536348025186033, "grad_norm": 1.246376633644104, "learning_rate": 8.775375582581027e-06, "loss": 1.0609, "step": 2418 }, { "epoch": 0.553863766456783, "grad_norm": 1.1201062202453613, "learning_rate": 8.76801359115812e-06, "loss": 1.0713, "step": 2419 }, { "epoch": 0.5540927303949628, "grad_norm": 1.1334254741668701, "learning_rate": 8.760652277687007e-06, "loss": 1.0681, "step": 2420 }, { "epoch": 0.5543216943331425, "grad_norm": 1.380236029624939, "learning_rate": 8.75329164621855e-06, "loss": 1.0457, "step": 2421 }, { "epoch": 0.5545506582713222, "grad_norm": 1.3209164142608643, "learning_rate": 8.745931700803251e-06, "loss": 1.0322, "step": 2422 }, { "epoch": 0.554779622209502, "grad_norm": 1.5157129764556885, "learning_rate": 8.738572445491225e-06, "loss": 1.0049, "step": 2423 }, { "epoch": 0.5550085861476818, "grad_norm": 1.1138639450073242, "learning_rate": 8.731213884332205e-06, "loss": 1.0625, "step": 2424 }, { "epoch": 0.5552375500858615, "grad_norm": 1.2549896240234375, "learning_rate": 8.723856021375543e-06, "loss": 1.0304, "step": 2425 }, { "epoch": 0.5554665140240412, "grad_norm": 1.2935527563095093, "learning_rate": 8.716498860670218e-06, "loss": 1.0602, "step": 2426 }, { "epoch": 0.555695477962221, "grad_norm": 1.2371066808700562, "learning_rate": 8.709142406264807e-06, "loss": 1.0347, "step": 2427 }, { "epoch": 0.5559244419004007, "grad_norm": 1.1444132328033447, "learning_rate": 8.701786662207506e-06, "loss": 1.0418, "step": 2428 }, { "epoch": 0.5561534058385804, "grad_norm": 1.1240417957305908, "learning_rate": 8.694431632546127e-06, "loss": 0.9742, "step": 2429 }, { "epoch": 0.5563823697767601, "grad_norm": 1.1536635160446167, "learning_rate": 8.687077321328066e-06, "loss": 1.0736, "step": 2430 }, { "epoch": 0.5566113337149399, "grad_norm": 1.3496026992797852, "learning_rate": 8.679723732600355e-06, "loss": 1.0392, "step": 2431 }, { "epoch": 0.5568402976531196, "grad_norm": 1.3216404914855957, "learning_rate": 8.672370870409601e-06, "loss": 1.053, "step": 2432 }, { "epoch": 0.5570692615912993, "grad_norm": 1.1046510934829712, "learning_rate": 8.665018738802023e-06, "loss": 1.0534, "step": 2433 }, { "epoch": 0.557298225529479, "grad_norm": 1.7113914489746094, "learning_rate": 8.657667341823449e-06, "loss": 1.092, "step": 2434 }, { "epoch": 0.5575271894676589, "grad_norm": 1.2329431772232056, "learning_rate": 8.650316683519276e-06, "loss": 1.0156, "step": 2435 }, { "epoch": 0.5577561534058386, "grad_norm": 1.0992010831832886, "learning_rate": 8.642966767934516e-06, "loss": 1.0104, "step": 2436 }, { "epoch": 0.5579851173440183, "grad_norm": 1.1873196363449097, "learning_rate": 8.63561759911377e-06, "loss": 0.9733, "step": 2437 }, { "epoch": 0.5582140812821981, "grad_norm": 1.360651969909668, "learning_rate": 8.628269181101216e-06, "loss": 1.0369, "step": 2438 }, { "epoch": 0.5584430452203778, "grad_norm": 1.1703392267227173, "learning_rate": 8.620921517940635e-06, "loss": 1.0778, "step": 2439 }, { "epoch": 0.5586720091585575, "grad_norm": 1.1417691707611084, "learning_rate": 8.61357461367538e-06, "loss": 1.0593, "step": 2440 }, { "epoch": 0.5589009730967373, "grad_norm": 1.130734920501709, "learning_rate": 8.60622847234839e-06, "loss": 1.0531, "step": 2441 }, { "epoch": 0.559129937034917, "grad_norm": 1.1957651376724243, "learning_rate": 8.598883098002188e-06, "loss": 1.0961, "step": 2442 }, { "epoch": 0.5593589009730967, "grad_norm": 1.3269649744033813, "learning_rate": 8.591538494678867e-06, "loss": 1.0608, "step": 2443 }, { "epoch": 0.5595878649112764, "grad_norm": 1.2087531089782715, "learning_rate": 8.584194666420105e-06, "loss": 1.1281, "step": 2444 }, { "epoch": 0.5598168288494562, "grad_norm": 1.5364724397659302, "learning_rate": 8.576851617267151e-06, "loss": 1.0366, "step": 2445 }, { "epoch": 0.560045792787636, "grad_norm": 1.299095630645752, "learning_rate": 8.569509351260817e-06, "loss": 1.0942, "step": 2446 }, { "epoch": 0.5602747567258157, "grad_norm": 1.0818737745285034, "learning_rate": 8.562167872441493e-06, "loss": 1.1259, "step": 2447 }, { "epoch": 0.5605037206639955, "grad_norm": 1.1360951662063599, "learning_rate": 8.554827184849139e-06, "loss": 1.1018, "step": 2448 }, { "epoch": 0.5607326846021752, "grad_norm": 1.383863925933838, "learning_rate": 8.547487292523264e-06, "loss": 1.0885, "step": 2449 }, { "epoch": 0.5609616485403549, "grad_norm": 1.0860772132873535, "learning_rate": 8.540148199502955e-06, "loss": 1.0822, "step": 2450 }, { "epoch": 0.5611906124785346, "grad_norm": 1.1337288618087769, "learning_rate": 8.532809909826854e-06, "loss": 1.0814, "step": 2451 }, { "epoch": 0.5614195764167144, "grad_norm": 1.118790626525879, "learning_rate": 8.525472427533156e-06, "loss": 1.0809, "step": 2452 }, { "epoch": 0.5616485403548941, "grad_norm": 1.1470918655395508, "learning_rate": 8.518135756659624e-06, "loss": 1.0631, "step": 2453 }, { "epoch": 0.5618775042930738, "grad_norm": 1.0622974634170532, "learning_rate": 8.510799901243554e-06, "loss": 1.0763, "step": 2454 }, { "epoch": 0.5621064682312535, "grad_norm": 1.0906604528427124, "learning_rate": 8.503464865321817e-06, "loss": 1.069, "step": 2455 }, { "epoch": 0.5623354321694333, "grad_norm": 1.1250942945480347, "learning_rate": 8.496130652930818e-06, "loss": 1.0974, "step": 2456 }, { "epoch": 0.562564396107613, "grad_norm": 1.2028555870056152, "learning_rate": 8.48879726810651e-06, "loss": 1.0351, "step": 2457 }, { "epoch": 0.5627933600457928, "grad_norm": 1.8054412603378296, "learning_rate": 8.481464714884396e-06, "loss": 1.0841, "step": 2458 }, { "epoch": 0.5630223239839726, "grad_norm": 1.1389802694320679, "learning_rate": 8.474132997299521e-06, "loss": 1.0583, "step": 2459 }, { "epoch": 0.5632512879221523, "grad_norm": 1.328384280204773, "learning_rate": 8.466802119386462e-06, "loss": 1.0583, "step": 2460 }, { "epoch": 0.563480251860332, "grad_norm": 1.3099395036697388, "learning_rate": 8.459472085179342e-06, "loss": 1.0417, "step": 2461 }, { "epoch": 0.5637092157985117, "grad_norm": 1.1930067539215088, "learning_rate": 8.45214289871182e-06, "loss": 1.0465, "step": 2462 }, { "epoch": 0.5639381797366915, "grad_norm": 1.0933071374893188, "learning_rate": 8.44481456401708e-06, "loss": 1.0736, "step": 2463 }, { "epoch": 0.5641671436748712, "grad_norm": 1.0359748601913452, "learning_rate": 8.437487085127851e-06, "loss": 1.0145, "step": 2464 }, { "epoch": 0.5643961076130509, "grad_norm": 1.7698050737380981, "learning_rate": 8.430160466076378e-06, "loss": 1.0021, "step": 2465 }, { "epoch": 0.5646250715512307, "grad_norm": 1.2962058782577515, "learning_rate": 8.422834710894434e-06, "loss": 1.05, "step": 2466 }, { "epoch": 0.5648540354894104, "grad_norm": 1.083851933479309, "learning_rate": 8.415509823613332e-06, "loss": 0.9928, "step": 2467 }, { "epoch": 0.5650829994275901, "grad_norm": 1.2015832662582397, "learning_rate": 8.408185808263886e-06, "loss": 1.0259, "step": 2468 }, { "epoch": 0.5653119633657699, "grad_norm": 1.0598255395889282, "learning_rate": 8.400862668876445e-06, "loss": 1.0091, "step": 2469 }, { "epoch": 0.5655409273039497, "grad_norm": 1.0813366174697876, "learning_rate": 8.393540409480873e-06, "loss": 1.0822, "step": 2470 }, { "epoch": 0.5657698912421294, "grad_norm": 1.1584067344665527, "learning_rate": 8.38621903410654e-06, "loss": 1.0001, "step": 2471 }, { "epoch": 0.5659988551803091, "grad_norm": 1.030510425567627, "learning_rate": 8.378898546782344e-06, "loss": 0.9853, "step": 2472 }, { "epoch": 0.5662278191184889, "grad_norm": 1.317218542098999, "learning_rate": 8.371578951536689e-06, "loss": 1.0422, "step": 2473 }, { "epoch": 0.5664567830566686, "grad_norm": 1.258917212486267, "learning_rate": 8.364260252397483e-06, "loss": 1.0164, "step": 2474 }, { "epoch": 0.5666857469948483, "grad_norm": 1.1054089069366455, "learning_rate": 8.356942453392147e-06, "loss": 1.01, "step": 2475 }, { "epoch": 0.566914710933028, "grad_norm": 1.110060691833496, "learning_rate": 8.349625558547599e-06, "loss": 1.0202, "step": 2476 }, { "epoch": 0.5671436748712078, "grad_norm": 1.5175520181655884, "learning_rate": 8.342309571890272e-06, "loss": 1.0423, "step": 2477 }, { "epoch": 0.5673726388093875, "grad_norm": 1.2970167398452759, "learning_rate": 8.334994497446091e-06, "loss": 1.1032, "step": 2478 }, { "epoch": 0.5676016027475672, "grad_norm": 1.1656423807144165, "learning_rate": 8.327680339240478e-06, "loss": 1.0052, "step": 2479 }, { "epoch": 0.5678305666857469, "grad_norm": 1.0043197870254517, "learning_rate": 8.320367101298351e-06, "loss": 1.0508, "step": 2480 }, { "epoch": 0.5680595306239268, "grad_norm": 1.8934231996536255, "learning_rate": 8.313054787644131e-06, "loss": 1.0589, "step": 2481 }, { "epoch": 0.5682884945621065, "grad_norm": 1.1924573183059692, "learning_rate": 8.305743402301714e-06, "loss": 1.0504, "step": 2482 }, { "epoch": 0.5685174585002862, "grad_norm": 2.6312386989593506, "learning_rate": 8.298432949294499e-06, "loss": 0.9884, "step": 2483 }, { "epoch": 0.568746422438466, "grad_norm": 1.0763286352157593, "learning_rate": 8.29112343264537e-06, "loss": 1.075, "step": 2484 }, { "epoch": 0.5689753863766457, "grad_norm": 2.5799460411071777, "learning_rate": 8.283814856376681e-06, "loss": 1.0406, "step": 2485 }, { "epoch": 0.5692043503148254, "grad_norm": 1.2226494550704956, "learning_rate": 8.276507224510294e-06, "loss": 1.1118, "step": 2486 }, { "epoch": 0.5694333142530051, "grad_norm": 1.126726746559143, "learning_rate": 8.26920054106753e-06, "loss": 1.0838, "step": 2487 }, { "epoch": 0.5696622781911849, "grad_norm": 1.5410242080688477, "learning_rate": 8.261894810069197e-06, "loss": 1.0668, "step": 2488 }, { "epoch": 0.5698912421293646, "grad_norm": 1.1347763538360596, "learning_rate": 8.25459003553558e-06, "loss": 1.1072, "step": 2489 }, { "epoch": 0.5701202060675443, "grad_norm": 1.095646858215332, "learning_rate": 8.247286221486429e-06, "loss": 1.0138, "step": 2490 }, { "epoch": 0.570349170005724, "grad_norm": 1.7708925008773804, "learning_rate": 8.239983371940975e-06, "loss": 1.0268, "step": 2491 }, { "epoch": 0.5705781339439039, "grad_norm": 1.2089495658874512, "learning_rate": 8.232681490917919e-06, "loss": 1.0141, "step": 2492 }, { "epoch": 0.5708070978820836, "grad_norm": 1.059664249420166, "learning_rate": 8.22538058243542e-06, "loss": 1.0827, "step": 2493 }, { "epoch": 0.5710360618202633, "grad_norm": 1.3616527318954468, "learning_rate": 8.218080650511107e-06, "loss": 1.0793, "step": 2494 }, { "epoch": 0.5712650257584431, "grad_norm": 1.2198219299316406, "learning_rate": 8.210781699162075e-06, "loss": 1.0753, "step": 2495 }, { "epoch": 0.5714939896966228, "grad_norm": 1.3348844051361084, "learning_rate": 8.203483732404872e-06, "loss": 1.0341, "step": 2496 }, { "epoch": 0.5717229536348025, "grad_norm": 1.160372257232666, "learning_rate": 8.196186754255509e-06, "loss": 1.0313, "step": 2497 }, { "epoch": 0.5719519175729822, "grad_norm": 1.0435636043548584, "learning_rate": 8.188890768729452e-06, "loss": 1.0094, "step": 2498 }, { "epoch": 0.572180881511162, "grad_norm": 1.090612530708313, "learning_rate": 8.181595779841618e-06, "loss": 1.0399, "step": 2499 }, { "epoch": 0.5724098454493417, "grad_norm": 1.496956467628479, "learning_rate": 8.174301791606384e-06, "loss": 0.9976, "step": 2500 }, { "epoch": 0.5726388093875214, "grad_norm": 1.1636426448822021, "learning_rate": 8.167008808037568e-06, "loss": 1.0563, "step": 2501 }, { "epoch": 0.5728677733257012, "grad_norm": 1.6248148679733276, "learning_rate": 8.159716833148432e-06, "loss": 1.0554, "step": 2502 }, { "epoch": 0.5730967372638809, "grad_norm": 1.1249679327011108, "learning_rate": 8.152425870951701e-06, "loss": 1.0476, "step": 2503 }, { "epoch": 0.5733257012020607, "grad_norm": 1.0898033380508423, "learning_rate": 8.145135925459518e-06, "loss": 1.1075, "step": 2504 }, { "epoch": 0.5735546651402404, "grad_norm": 1.1172502040863037, "learning_rate": 8.137847000683485e-06, "loss": 1.0698, "step": 2505 }, { "epoch": 0.5737836290784202, "grad_norm": 1.1264135837554932, "learning_rate": 8.130559100634639e-06, "loss": 1.0334, "step": 2506 }, { "epoch": 0.5740125930165999, "grad_norm": 1.164330005645752, "learning_rate": 8.123272229323443e-06, "loss": 0.9811, "step": 2507 }, { "epoch": 0.5742415569547796, "grad_norm": 1.1172728538513184, "learning_rate": 8.115986390759805e-06, "loss": 1.087, "step": 2508 }, { "epoch": 0.5744705208929594, "grad_norm": 1.0987147092819214, "learning_rate": 8.108701588953059e-06, "loss": 1.1124, "step": 2509 }, { "epoch": 0.5746994848311391, "grad_norm": 1.3992830514907837, "learning_rate": 8.101417827911975e-06, "loss": 1.0272, "step": 2510 }, { "epoch": 0.5749284487693188, "grad_norm": 1.1009325981140137, "learning_rate": 8.094135111644741e-06, "loss": 1.0467, "step": 2511 }, { "epoch": 0.5751574127074985, "grad_norm": 1.2195725440979004, "learning_rate": 8.086853444158977e-06, "loss": 1.0822, "step": 2512 }, { "epoch": 0.5753863766456783, "grad_norm": 1.24541175365448, "learning_rate": 8.07957282946172e-06, "loss": 1.0975, "step": 2513 }, { "epoch": 0.575615340583858, "grad_norm": 1.9513866901397705, "learning_rate": 8.072293271559439e-06, "loss": 1.0539, "step": 2514 }, { "epoch": 0.5758443045220378, "grad_norm": 1.3148459196090698, "learning_rate": 8.065014774458004e-06, "loss": 1.0509, "step": 2515 }, { "epoch": 0.5760732684602176, "grad_norm": 1.848658800125122, "learning_rate": 8.05773734216272e-06, "loss": 1.071, "step": 2516 }, { "epoch": 0.5763022323983973, "grad_norm": 1.1349958181381226, "learning_rate": 8.05046097867829e-06, "loss": 1.0761, "step": 2517 }, { "epoch": 0.576531196336577, "grad_norm": 1.2456731796264648, "learning_rate": 8.043185688008837e-06, "loss": 1.0541, "step": 2518 }, { "epoch": 0.5767601602747567, "grad_norm": 1.2999017238616943, "learning_rate": 8.035911474157897e-06, "loss": 1.0678, "step": 2519 }, { "epoch": 0.5769891242129365, "grad_norm": 1.1113868951797485, "learning_rate": 8.028638341128405e-06, "loss": 1.0296, "step": 2520 }, { "epoch": 0.5772180881511162, "grad_norm": 1.351574182510376, "learning_rate": 8.021366292922704e-06, "loss": 0.9984, "step": 2521 }, { "epoch": 0.5774470520892959, "grad_norm": 1.0689737796783447, "learning_rate": 8.014095333542548e-06, "loss": 1.0453, "step": 2522 }, { "epoch": 0.5776760160274756, "grad_norm": 1.2941296100616455, "learning_rate": 8.006825466989075e-06, "loss": 1.0933, "step": 2523 }, { "epoch": 0.5779049799656554, "grad_norm": 1.1887894868850708, "learning_rate": 7.999556697262838e-06, "loss": 0.9727, "step": 2524 }, { "epoch": 0.5781339439038351, "grad_norm": 1.1151421070098877, "learning_rate": 7.992289028363782e-06, "loss": 1.037, "step": 2525 }, { "epoch": 0.5783629078420148, "grad_norm": 1.1871955394744873, "learning_rate": 7.985022464291236e-06, "loss": 1.0871, "step": 2526 }, { "epoch": 0.5785918717801947, "grad_norm": 1.3512312173843384, "learning_rate": 7.977757009043941e-06, "loss": 1.0276, "step": 2527 }, { "epoch": 0.5788208357183744, "grad_norm": 1.1984821557998657, "learning_rate": 7.970492666620002e-06, "loss": 1.0461, "step": 2528 }, { "epoch": 0.5790497996565541, "grad_norm": 1.1962758302688599, "learning_rate": 7.963229441016938e-06, "loss": 1.0625, "step": 2529 }, { "epoch": 0.5792787635947338, "grad_norm": 1.174294352531433, "learning_rate": 7.955967336231635e-06, "loss": 0.9852, "step": 2530 }, { "epoch": 0.5795077275329136, "grad_norm": 1.1075918674468994, "learning_rate": 7.948706356260367e-06, "loss": 1.1239, "step": 2531 }, { "epoch": 0.5797366914710933, "grad_norm": 1.2722827196121216, "learning_rate": 7.941446505098795e-06, "loss": 1.0525, "step": 2532 }, { "epoch": 0.579965655409273, "grad_norm": 1.085821509361267, "learning_rate": 7.934187786741956e-06, "loss": 1.0824, "step": 2533 }, { "epoch": 0.5801946193474528, "grad_norm": 1.2172048091888428, "learning_rate": 7.926930205184254e-06, "loss": 1.0138, "step": 2534 }, { "epoch": 0.5804235832856325, "grad_norm": 1.2068073749542236, "learning_rate": 7.919673764419479e-06, "loss": 1.0394, "step": 2535 }, { "epoch": 0.5806525472238122, "grad_norm": 1.9181028604507446, "learning_rate": 7.912418468440794e-06, "loss": 1.0098, "step": 2536 }, { "epoch": 0.5808815111619919, "grad_norm": 2.7652781009674072, "learning_rate": 7.90516432124072e-06, "loss": 1.1164, "step": 2537 }, { "epoch": 0.5811104751001718, "grad_norm": 1.5410016775131226, "learning_rate": 7.89791132681116e-06, "loss": 1.0504, "step": 2538 }, { "epoch": 0.5813394390383515, "grad_norm": 1.0692219734191895, "learning_rate": 7.89065948914337e-06, "loss": 1.0947, "step": 2539 }, { "epoch": 0.5815684029765312, "grad_norm": 1.367213249206543, "learning_rate": 7.883408812227977e-06, "loss": 1.0337, "step": 2540 }, { "epoch": 0.581797366914711, "grad_norm": 1.8254352807998657, "learning_rate": 7.876159300054974e-06, "loss": 1.0041, "step": 2541 }, { "epoch": 0.5820263308528907, "grad_norm": 1.1613630056381226, "learning_rate": 7.868910956613697e-06, "loss": 1.0489, "step": 2542 }, { "epoch": 0.5822552947910704, "grad_norm": 1.2703049182891846, "learning_rate": 7.861663785892857e-06, "loss": 1.0312, "step": 2543 }, { "epoch": 0.5824842587292501, "grad_norm": 1.263065218925476, "learning_rate": 7.854417791880508e-06, "loss": 1.031, "step": 2544 }, { "epoch": 0.5827132226674299, "grad_norm": 1.1359111070632935, "learning_rate": 7.847172978564055e-06, "loss": 1.0146, "step": 2545 }, { "epoch": 0.5829421866056096, "grad_norm": 1.2447282075881958, "learning_rate": 7.839929349930266e-06, "loss": 1.045, "step": 2546 }, { "epoch": 0.5831711505437893, "grad_norm": 1.083162546157837, "learning_rate": 7.832686909965248e-06, "loss": 1.0664, "step": 2547 }, { "epoch": 0.583400114481969, "grad_norm": 1.1830934286117554, "learning_rate": 7.82544566265445e-06, "loss": 0.9864, "step": 2548 }, { "epoch": 0.5836290784201488, "grad_norm": 1.7827184200286865, "learning_rate": 7.818205611982673e-06, "loss": 1.0378, "step": 2549 }, { "epoch": 0.5838580423583286, "grad_norm": 1.157387614250183, "learning_rate": 7.810966761934053e-06, "loss": 1.033, "step": 2550 }, { "epoch": 0.5840870062965083, "grad_norm": 1.540801763534546, "learning_rate": 7.803729116492072e-06, "loss": 1.0447, "step": 2551 }, { "epoch": 0.5843159702346881, "grad_norm": 1.1215074062347412, "learning_rate": 7.796492679639549e-06, "loss": 1.0606, "step": 2552 }, { "epoch": 0.5845449341728678, "grad_norm": 1.2133134603500366, "learning_rate": 7.789257455358625e-06, "loss": 1.0275, "step": 2553 }, { "epoch": 0.5847738981110475, "grad_norm": 1.3863023519515991, "learning_rate": 7.782023447630789e-06, "loss": 1.0156, "step": 2554 }, { "epoch": 0.5850028620492272, "grad_norm": 1.131239891052246, "learning_rate": 7.774790660436857e-06, "loss": 0.996, "step": 2555 }, { "epoch": 0.585231825987407, "grad_norm": 1.179404377937317, "learning_rate": 7.767559097756966e-06, "loss": 1.0682, "step": 2556 }, { "epoch": 0.5854607899255867, "grad_norm": 2.166602373123169, "learning_rate": 7.760328763570589e-06, "loss": 1.0358, "step": 2557 }, { "epoch": 0.5856897538637664, "grad_norm": 1.535658836364746, "learning_rate": 7.753099661856516e-06, "loss": 1.0759, "step": 2558 }, { "epoch": 0.5859187178019462, "grad_norm": 1.1416999101638794, "learning_rate": 7.745871796592857e-06, "loss": 1.0636, "step": 2559 }, { "epoch": 0.5861476817401259, "grad_norm": 1.214870572090149, "learning_rate": 7.738645171757054e-06, "loss": 1.0911, "step": 2560 }, { "epoch": 0.5863766456783057, "grad_norm": 1.7018429040908813, "learning_rate": 7.731419791325852e-06, "loss": 1.0343, "step": 2561 }, { "epoch": 0.5866056096164854, "grad_norm": 1.0340094566345215, "learning_rate": 7.72419565927532e-06, "loss": 1.0418, "step": 2562 }, { "epoch": 0.5868345735546652, "grad_norm": 1.4168850183486938, "learning_rate": 7.716972779580836e-06, "loss": 1.0363, "step": 2563 }, { "epoch": 0.5870635374928449, "grad_norm": 1.3030794858932495, "learning_rate": 7.709751156217088e-06, "loss": 1.0039, "step": 2564 }, { "epoch": 0.5872925014310246, "grad_norm": 1.357153296470642, "learning_rate": 7.702530793158079e-06, "loss": 1.1281, "step": 2565 }, { "epoch": 0.5875214653692044, "grad_norm": 1.1770304441452026, "learning_rate": 7.695311694377116e-06, "loss": 1.037, "step": 2566 }, { "epoch": 0.5877504293073841, "grad_norm": 1.2851818799972534, "learning_rate": 7.688093863846801e-06, "loss": 1.0687, "step": 2567 }, { "epoch": 0.5879793932455638, "grad_norm": 1.1500670909881592, "learning_rate": 7.680877305539048e-06, "loss": 1.0786, "step": 2568 }, { "epoch": 0.5882083571837435, "grad_norm": 1.660108208656311, "learning_rate": 7.673662023425074e-06, "loss": 1.0077, "step": 2569 }, { "epoch": 0.5884373211219233, "grad_norm": 1.0523499250411987, "learning_rate": 7.666448021475385e-06, "loss": 1.0091, "step": 2570 }, { "epoch": 0.588666285060103, "grad_norm": 1.180342197418213, "learning_rate": 7.659235303659784e-06, "loss": 1.0668, "step": 2571 }, { "epoch": 0.5888952489982827, "grad_norm": 1.269478440284729, "learning_rate": 7.65202387394737e-06, "loss": 1.0988, "step": 2572 }, { "epoch": 0.5891242129364626, "grad_norm": 1.2310328483581543, "learning_rate": 7.644813736306531e-06, "loss": 1.0087, "step": 2573 }, { "epoch": 0.5893531768746423, "grad_norm": 1.2201181650161743, "learning_rate": 7.637604894704951e-06, "loss": 1.097, "step": 2574 }, { "epoch": 0.589582140812822, "grad_norm": 1.19076406955719, "learning_rate": 7.630397353109588e-06, "loss": 1.0493, "step": 2575 }, { "epoch": 0.5898111047510017, "grad_norm": 1.3227699995040894, "learning_rate": 7.623191115486695e-06, "loss": 1.0598, "step": 2576 }, { "epoch": 0.5900400686891815, "grad_norm": 1.1145106554031372, "learning_rate": 7.615986185801807e-06, "loss": 1.0267, "step": 2577 }, { "epoch": 0.5902690326273612, "grad_norm": 1.8626641035079956, "learning_rate": 7.608782568019729e-06, "loss": 1.0034, "step": 2578 }, { "epoch": 0.5904979965655409, "grad_norm": 1.3820207118988037, "learning_rate": 7.601580266104558e-06, "loss": 1.0034, "step": 2579 }, { "epoch": 0.5907269605037206, "grad_norm": 1.131319522857666, "learning_rate": 7.594379284019659e-06, "loss": 1.0725, "step": 2580 }, { "epoch": 0.5909559244419004, "grad_norm": 1.295155644416809, "learning_rate": 7.587179625727671e-06, "loss": 1.0437, "step": 2581 }, { "epoch": 0.5911848883800801, "grad_norm": 1.355495572090149, "learning_rate": 7.579981295190506e-06, "loss": 1.0339, "step": 2582 }, { "epoch": 0.5914138523182598, "grad_norm": 1.4129412174224854, "learning_rate": 7.572784296369342e-06, "loss": 1.0228, "step": 2583 }, { "epoch": 0.5916428162564397, "grad_norm": 1.3346372842788696, "learning_rate": 7.565588633224632e-06, "loss": 1.0627, "step": 2584 }, { "epoch": 0.5918717801946194, "grad_norm": 1.4779704809188843, "learning_rate": 7.558394309716088e-06, "loss": 1.1153, "step": 2585 }, { "epoch": 0.5921007441327991, "grad_norm": 1.397422194480896, "learning_rate": 7.551201329802684e-06, "loss": 1.0255, "step": 2586 }, { "epoch": 0.5923297080709788, "grad_norm": 1.3591479063034058, "learning_rate": 7.544009697442656e-06, "loss": 1.0635, "step": 2587 }, { "epoch": 0.5925586720091586, "grad_norm": 1.7575359344482422, "learning_rate": 7.536819416593504e-06, "loss": 1.0804, "step": 2588 }, { "epoch": 0.5927876359473383, "grad_norm": 1.3760896921157837, "learning_rate": 7.529630491211972e-06, "loss": 1.0748, "step": 2589 }, { "epoch": 0.593016599885518, "grad_norm": 1.791036605834961, "learning_rate": 7.522442925254068e-06, "loss": 1.0705, "step": 2590 }, { "epoch": 0.5932455638236978, "grad_norm": 1.271471619606018, "learning_rate": 7.515256722675058e-06, "loss": 1.0509, "step": 2591 }, { "epoch": 0.5934745277618775, "grad_norm": 1.1143954992294312, "learning_rate": 7.508071887429433e-06, "loss": 1.0352, "step": 2592 }, { "epoch": 0.5937034917000572, "grad_norm": 1.292467474937439, "learning_rate": 7.500888423470962e-06, "loss": 1.013, "step": 2593 }, { "epoch": 0.5939324556382369, "grad_norm": 1.4147076606750488, "learning_rate": 7.493706334752637e-06, "loss": 1.1661, "step": 2594 }, { "epoch": 0.5941614195764167, "grad_norm": 1.6669336557388306, "learning_rate": 7.486525625226699e-06, "loss": 1.0527, "step": 2595 }, { "epoch": 0.5943903835145965, "grad_norm": 1.3241775035858154, "learning_rate": 7.479346298844645e-06, "loss": 1.0268, "step": 2596 }, { "epoch": 0.5946193474527762, "grad_norm": 1.1176531314849854, "learning_rate": 7.472168359557183e-06, "loss": 1.0257, "step": 2597 }, { "epoch": 0.594848311390956, "grad_norm": 1.194197654724121, "learning_rate": 7.46499181131428e-06, "loss": 1.073, "step": 2598 }, { "epoch": 0.5950772753291357, "grad_norm": 1.1093398332595825, "learning_rate": 7.4578166580651335e-06, "loss": 0.9844, "step": 2599 }, { "epoch": 0.5953062392673154, "grad_norm": 1.3040740489959717, "learning_rate": 7.450642903758163e-06, "loss": 1.1006, "step": 2600 }, { "epoch": 0.5955352032054951, "grad_norm": 1.1911894083023071, "learning_rate": 7.443470552341028e-06, "loss": 1.1261, "step": 2601 }, { "epoch": 0.5957641671436749, "grad_norm": 1.2414566278457642, "learning_rate": 7.436299607760616e-06, "loss": 1.0389, "step": 2602 }, { "epoch": 0.5959931310818546, "grad_norm": 1.163601040840149, "learning_rate": 7.429130073963036e-06, "loss": 0.9994, "step": 2603 }, { "epoch": 0.5962220950200343, "grad_norm": 1.274039626121521, "learning_rate": 7.421961954893622e-06, "loss": 1.0788, "step": 2604 }, { "epoch": 0.596451058958214, "grad_norm": 1.1208449602127075, "learning_rate": 7.414795254496929e-06, "loss": 1.032, "step": 2605 }, { "epoch": 0.5966800228963938, "grad_norm": 1.4800190925598145, "learning_rate": 7.4076299767167325e-06, "loss": 1.0526, "step": 2606 }, { "epoch": 0.5969089868345736, "grad_norm": 1.2557833194732666, "learning_rate": 7.400466125496027e-06, "loss": 1.1016, "step": 2607 }, { "epoch": 0.5971379507727533, "grad_norm": 1.3543636798858643, "learning_rate": 7.393303704777017e-06, "loss": 1.0346, "step": 2608 }, { "epoch": 0.5973669147109331, "grad_norm": 1.2229251861572266, "learning_rate": 7.386142718501122e-06, "loss": 1.0234, "step": 2609 }, { "epoch": 0.5975958786491128, "grad_norm": 1.217532753944397, "learning_rate": 7.378983170608982e-06, "loss": 1.039, "step": 2610 }, { "epoch": 0.5978248425872925, "grad_norm": 1.3155670166015625, "learning_rate": 7.37182506504042e-06, "loss": 1.0595, "step": 2611 }, { "epoch": 0.5980538065254722, "grad_norm": 1.3033359050750732, "learning_rate": 7.364668405734493e-06, "loss": 1.0695, "step": 2612 }, { "epoch": 0.598282770463652, "grad_norm": 1.1440081596374512, "learning_rate": 7.35751319662945e-06, "loss": 1.0842, "step": 2613 }, { "epoch": 0.5985117344018317, "grad_norm": 1.1137559413909912, "learning_rate": 7.350359441662735e-06, "loss": 1.007, "step": 2614 }, { "epoch": 0.5987406983400114, "grad_norm": 1.325477957725525, "learning_rate": 7.343207144771012e-06, "loss": 1.0835, "step": 2615 }, { "epoch": 0.5989696622781912, "grad_norm": 1.1493194103240967, "learning_rate": 7.336056309890116e-06, "loss": 1.0678, "step": 2616 }, { "epoch": 0.5991986262163709, "grad_norm": 1.2086422443389893, "learning_rate": 7.3289069409551e-06, "loss": 1.1063, "step": 2617 }, { "epoch": 0.5994275901545506, "grad_norm": 1.3805280923843384, "learning_rate": 7.321759041900204e-06, "loss": 1.0846, "step": 2618 }, { "epoch": 0.5996565540927304, "grad_norm": 1.2356138229370117, "learning_rate": 7.314612616658847e-06, "loss": 1.0757, "step": 2619 }, { "epoch": 0.5998855180309102, "grad_norm": 1.2783619165420532, "learning_rate": 7.307467669163655e-06, "loss": 1.0257, "step": 2620 }, { "epoch": 0.6001144819690899, "grad_norm": 1.2165898084640503, "learning_rate": 7.3003242033464314e-06, "loss": 1.0641, "step": 2621 }, { "epoch": 0.6003434459072696, "grad_norm": 1.589637279510498, "learning_rate": 7.293182223138164e-06, "loss": 1.1101, "step": 2622 }, { "epoch": 0.6005724098454494, "grad_norm": 1.0955222845077515, "learning_rate": 7.286041732469024e-06, "loss": 1.0198, "step": 2623 }, { "epoch": 0.6008013737836291, "grad_norm": 1.270451307296753, "learning_rate": 7.278902735268367e-06, "loss": 1.0718, "step": 2624 }, { "epoch": 0.6010303377218088, "grad_norm": 1.114689826965332, "learning_rate": 7.27176523546472e-06, "loss": 1.0091, "step": 2625 }, { "epoch": 0.6012593016599885, "grad_norm": 1.1657631397247314, "learning_rate": 7.264629236985792e-06, "loss": 1.1009, "step": 2626 }, { "epoch": 0.6014882655981683, "grad_norm": 1.2354736328125, "learning_rate": 7.25749474375846e-06, "loss": 1.0566, "step": 2627 }, { "epoch": 0.601717229536348, "grad_norm": 1.1252378225326538, "learning_rate": 7.250361759708775e-06, "loss": 1.038, "step": 2628 }, { "epoch": 0.6019461934745277, "grad_norm": 1.6679290533065796, "learning_rate": 7.243230288761966e-06, "loss": 1.0852, "step": 2629 }, { "epoch": 0.6021751574127076, "grad_norm": 1.029215693473816, "learning_rate": 7.2361003348424105e-06, "loss": 1.0707, "step": 2630 }, { "epoch": 0.6024041213508873, "grad_norm": 1.261576771736145, "learning_rate": 7.2289719018736715e-06, "loss": 0.9972, "step": 2631 }, { "epoch": 0.602633085289067, "grad_norm": 1.4089431762695312, "learning_rate": 7.221844993778464e-06, "loss": 1.0427, "step": 2632 }, { "epoch": 0.6028620492272467, "grad_norm": 1.418616771697998, "learning_rate": 7.21471961447866e-06, "loss": 1.1127, "step": 2633 }, { "epoch": 0.6030910131654265, "grad_norm": 1.258452296257019, "learning_rate": 7.207595767895303e-06, "loss": 1.0767, "step": 2634 }, { "epoch": 0.6033199771036062, "grad_norm": 1.0797663927078247, "learning_rate": 7.200473457948584e-06, "loss": 1.0761, "step": 2635 }, { "epoch": 0.6035489410417859, "grad_norm": 1.2073185443878174, "learning_rate": 7.193352688557849e-06, "loss": 1.0367, "step": 2636 }, { "epoch": 0.6037779049799656, "grad_norm": 1.2464630603790283, "learning_rate": 7.186233463641601e-06, "loss": 1.0441, "step": 2637 }, { "epoch": 0.6040068689181454, "grad_norm": 2.0551130771636963, "learning_rate": 7.179115787117482e-06, "loss": 1.0626, "step": 2638 }, { "epoch": 0.6042358328563251, "grad_norm": 1.3895354270935059, "learning_rate": 7.171999662902298e-06, "loss": 1.0775, "step": 2639 }, { "epoch": 0.6044647967945048, "grad_norm": 1.0203369855880737, "learning_rate": 7.164885094911991e-06, "loss": 1.0592, "step": 2640 }, { "epoch": 0.6046937607326845, "grad_norm": 1.9111573696136475, "learning_rate": 7.157772087061645e-06, "loss": 1.0904, "step": 2641 }, { "epoch": 0.6049227246708644, "grad_norm": 1.0934786796569824, "learning_rate": 7.150660643265488e-06, "loss": 1.0466, "step": 2642 }, { "epoch": 0.6051516886090441, "grad_norm": 1.2626543045043945, "learning_rate": 7.143550767436894e-06, "loss": 1.0527, "step": 2643 }, { "epoch": 0.6053806525472238, "grad_norm": 1.443155288696289, "learning_rate": 7.136442463488362e-06, "loss": 1.0411, "step": 2644 }, { "epoch": 0.6056096164854036, "grad_norm": 1.5223894119262695, "learning_rate": 7.129335735331537e-06, "loss": 1.0087, "step": 2645 }, { "epoch": 0.6058385804235833, "grad_norm": 1.1826412677764893, "learning_rate": 7.122230586877188e-06, "loss": 1.0613, "step": 2646 }, { "epoch": 0.606067544361763, "grad_norm": 3.0586001873016357, "learning_rate": 7.115127022035215e-06, "loss": 1.0597, "step": 2647 }, { "epoch": 0.6062965082999427, "grad_norm": 1.441426157951355, "learning_rate": 7.108025044714661e-06, "loss": 1.1143, "step": 2648 }, { "epoch": 0.6065254722381225, "grad_norm": 1.2049249410629272, "learning_rate": 7.100924658823677e-06, "loss": 1.082, "step": 2649 }, { "epoch": 0.6067544361763022, "grad_norm": 1.1801356077194214, "learning_rate": 7.093825868269546e-06, "loss": 1.0233, "step": 2650 }, { "epoch": 0.6069834001144819, "grad_norm": 1.2054147720336914, "learning_rate": 7.0867286769586775e-06, "loss": 1.015, "step": 2651 }, { "epoch": 0.6072123640526617, "grad_norm": 3.0669796466827393, "learning_rate": 7.0796330887965884e-06, "loss": 1.0719, "step": 2652 }, { "epoch": 0.6074413279908415, "grad_norm": 1.0845890045166016, "learning_rate": 7.072539107687928e-06, "loss": 1.0525, "step": 2653 }, { "epoch": 0.6076702919290212, "grad_norm": 1.1807281970977783, "learning_rate": 7.065446737536455e-06, "loss": 1.0256, "step": 2654 }, { "epoch": 0.607899255867201, "grad_norm": 1.2322497367858887, "learning_rate": 7.058355982245038e-06, "loss": 1.0678, "step": 2655 }, { "epoch": 0.6081282198053807, "grad_norm": 1.065679907798767, "learning_rate": 7.051266845715663e-06, "loss": 1.0384, "step": 2656 }, { "epoch": 0.6083571837435604, "grad_norm": 1.2442513704299927, "learning_rate": 7.044179331849415e-06, "loss": 1.0565, "step": 2657 }, { "epoch": 0.6085861476817401, "grad_norm": 1.1693778038024902, "learning_rate": 7.0370934445465026e-06, "loss": 0.9849, "step": 2658 }, { "epoch": 0.6088151116199199, "grad_norm": 1.1018726825714111, "learning_rate": 7.030009187706228e-06, "loss": 1.0543, "step": 2659 }, { "epoch": 0.6090440755580996, "grad_norm": 1.5267049074172974, "learning_rate": 7.022926565226995e-06, "loss": 1.0282, "step": 2660 }, { "epoch": 0.6092730394962793, "grad_norm": 1.2858092784881592, "learning_rate": 7.015845581006311e-06, "loss": 0.991, "step": 2661 }, { "epoch": 0.609502003434459, "grad_norm": 1.529943585395813, "learning_rate": 7.00876623894079e-06, "loss": 1.0248, "step": 2662 }, { "epoch": 0.6097309673726388, "grad_norm": 1.2061338424682617, "learning_rate": 7.001688542926126e-06, "loss": 1.0418, "step": 2663 }, { "epoch": 0.6099599313108185, "grad_norm": 1.2191417217254639, "learning_rate": 6.994612496857118e-06, "loss": 1.0346, "step": 2664 }, { "epoch": 0.6101888952489983, "grad_norm": 1.3246591091156006, "learning_rate": 6.9875381046276605e-06, "loss": 1.051, "step": 2665 }, { "epoch": 0.6104178591871781, "grad_norm": 1.1738966703414917, "learning_rate": 6.9804653701307225e-06, "loss": 1.073, "step": 2666 }, { "epoch": 0.6106468231253578, "grad_norm": 1.06193208694458, "learning_rate": 6.973394297258378e-06, "loss": 1.0468, "step": 2667 }, { "epoch": 0.6108757870635375, "grad_norm": 1.585971713066101, "learning_rate": 6.966324889901776e-06, "loss": 1.0083, "step": 2668 }, { "epoch": 0.6111047510017172, "grad_norm": 1.149452805519104, "learning_rate": 6.959257151951153e-06, "loss": 1.0624, "step": 2669 }, { "epoch": 0.611333714939897, "grad_norm": 1.240146279335022, "learning_rate": 6.952191087295827e-06, "loss": 1.0515, "step": 2670 }, { "epoch": 0.6115626788780767, "grad_norm": 1.2787004709243774, "learning_rate": 6.945126699824189e-06, "loss": 0.9782, "step": 2671 }, { "epoch": 0.6117916428162564, "grad_norm": 1.338744878768921, "learning_rate": 6.938063993423718e-06, "loss": 1.0323, "step": 2672 }, { "epoch": 0.6120206067544361, "grad_norm": 2.4173684120178223, "learning_rate": 6.9310029719809625e-06, "loss": 1.022, "step": 2673 }, { "epoch": 0.6122495706926159, "grad_norm": 1.3075268268585205, "learning_rate": 6.923943639381539e-06, "loss": 1.0476, "step": 2674 }, { "epoch": 0.6124785346307956, "grad_norm": 1.0469852685928345, "learning_rate": 6.916885999510137e-06, "loss": 0.9744, "step": 2675 }, { "epoch": 0.6127074985689754, "grad_norm": 1.373605489730835, "learning_rate": 6.909830056250527e-06, "loss": 1.0191, "step": 2676 }, { "epoch": 0.6129364625071552, "grad_norm": 1.5057294368743896, "learning_rate": 6.902775813485527e-06, "loss": 1.061, "step": 2677 }, { "epoch": 0.6131654264453349, "grad_norm": 1.0761992931365967, "learning_rate": 6.895723275097031e-06, "loss": 1.0406, "step": 2678 }, { "epoch": 0.6133943903835146, "grad_norm": 1.1914690732955933, "learning_rate": 6.888672444965988e-06, "loss": 1.0442, "step": 2679 }, { "epoch": 0.6136233543216943, "grad_norm": 1.2478915452957153, "learning_rate": 6.881623326972412e-06, "loss": 0.9698, "step": 2680 }, { "epoch": 0.6138523182598741, "grad_norm": 1.6330108642578125, "learning_rate": 6.874575924995378e-06, "loss": 1.0203, "step": 2681 }, { "epoch": 0.6140812821980538, "grad_norm": 1.1003386974334717, "learning_rate": 6.867530242913008e-06, "loss": 1.0006, "step": 2682 }, { "epoch": 0.6143102461362335, "grad_norm": 1.3207449913024902, "learning_rate": 6.860486284602479e-06, "loss": 1.1075, "step": 2683 }, { "epoch": 0.6145392100744133, "grad_norm": 1.2823762893676758, "learning_rate": 6.853444053940034e-06, "loss": 0.9907, "step": 2684 }, { "epoch": 0.614768174012593, "grad_norm": 1.167484164237976, "learning_rate": 6.846403554800938e-06, "loss": 1.0652, "step": 2685 }, { "epoch": 0.6149971379507727, "grad_norm": 1.0674535036087036, "learning_rate": 6.839364791059529e-06, "loss": 1.0155, "step": 2686 }, { "epoch": 0.6152261018889524, "grad_norm": 1.2486037015914917, "learning_rate": 6.832327766589177e-06, "loss": 1.1036, "step": 2687 }, { "epoch": 0.6154550658271323, "grad_norm": 1.2341771125793457, "learning_rate": 6.825292485262296e-06, "loss": 1.0722, "step": 2688 }, { "epoch": 0.615684029765312, "grad_norm": 1.7849056720733643, "learning_rate": 6.818258950950346e-06, "loss": 1.0679, "step": 2689 }, { "epoch": 0.6159129937034917, "grad_norm": 1.1976414918899536, "learning_rate": 6.8112271675238154e-06, "loss": 1.0202, "step": 2690 }, { "epoch": 0.6161419576416715, "grad_norm": 1.1759425401687622, "learning_rate": 6.804197138852242e-06, "loss": 1.1098, "step": 2691 }, { "epoch": 0.6163709215798512, "grad_norm": 1.9811475276947021, "learning_rate": 6.797168868804192e-06, "loss": 1.0115, "step": 2692 }, { "epoch": 0.6165998855180309, "grad_norm": 1.4365376234054565, "learning_rate": 6.790142361247258e-06, "loss": 1.0826, "step": 2693 }, { "epoch": 0.6168288494562106, "grad_norm": 0.9954733848571777, "learning_rate": 6.7831176200480686e-06, "loss": 1.024, "step": 2694 }, { "epoch": 0.6170578133943904, "grad_norm": 1.2109805345535278, "learning_rate": 6.776094649072286e-06, "loss": 1.0394, "step": 2695 }, { "epoch": 0.6172867773325701, "grad_norm": 1.5378385782241821, "learning_rate": 6.769073452184589e-06, "loss": 1.0604, "step": 2696 }, { "epoch": 0.6175157412707498, "grad_norm": 1.6136606931686401, "learning_rate": 6.762054033248681e-06, "loss": 1.0747, "step": 2697 }, { "epoch": 0.6177447052089295, "grad_norm": 1.2883368730545044, "learning_rate": 6.755036396127297e-06, "loss": 0.9892, "step": 2698 }, { "epoch": 0.6179736691471094, "grad_norm": 1.2920852899551392, "learning_rate": 6.748020544682172e-06, "loss": 1.0611, "step": 2699 }, { "epoch": 0.6182026330852891, "grad_norm": 1.2162961959838867, "learning_rate": 6.7410064827740805e-06, "loss": 0.9927, "step": 2700 }, { "epoch": 0.6184315970234688, "grad_norm": 1.3566070795059204, "learning_rate": 6.733994214262797e-06, "loss": 0.9985, "step": 2701 }, { "epoch": 0.6186605609616486, "grad_norm": 1.4097816944122314, "learning_rate": 6.726983743007112e-06, "loss": 1.0129, "step": 2702 }, { "epoch": 0.6188895248998283, "grad_norm": 1.3612055778503418, "learning_rate": 6.7199750728648395e-06, "loss": 1.0485, "step": 2703 }, { "epoch": 0.619118488838008, "grad_norm": 1.2182425260543823, "learning_rate": 6.712968207692778e-06, "loss": 1.0371, "step": 2704 }, { "epoch": 0.6193474527761877, "grad_norm": 1.3078467845916748, "learning_rate": 6.705963151346755e-06, "loss": 0.9789, "step": 2705 }, { "epoch": 0.6195764167143675, "grad_norm": 1.150083065032959, "learning_rate": 6.698959907681595e-06, "loss": 1.0607, "step": 2706 }, { "epoch": 0.6198053806525472, "grad_norm": 1.1655128002166748, "learning_rate": 6.6919584805511175e-06, "loss": 1.0277, "step": 2707 }, { "epoch": 0.6200343445907269, "grad_norm": 1.273138165473938, "learning_rate": 6.684958873808156e-06, "loss": 1.0686, "step": 2708 }, { "epoch": 0.6202633085289067, "grad_norm": 1.1581090688705444, "learning_rate": 6.6779610913045344e-06, "loss": 1.0464, "step": 2709 }, { "epoch": 0.6204922724670864, "grad_norm": 1.240431547164917, "learning_rate": 6.670965136891072e-06, "loss": 1.051, "step": 2710 }, { "epoch": 0.6207212364052662, "grad_norm": 1.4974451065063477, "learning_rate": 6.663971014417585e-06, "loss": 1.1388, "step": 2711 }, { "epoch": 0.620950200343446, "grad_norm": 1.2878180742263794, "learning_rate": 6.6569787277328745e-06, "loss": 1.0783, "step": 2712 }, { "epoch": 0.6211791642816257, "grad_norm": 1.712733507156372, "learning_rate": 6.6499882806847445e-06, "loss": 1.007, "step": 2713 }, { "epoch": 0.6214081282198054, "grad_norm": 1.2414849996566772, "learning_rate": 6.64299967711998e-06, "loss": 1.0117, "step": 2714 }, { "epoch": 0.6216370921579851, "grad_norm": 1.0478038787841797, "learning_rate": 6.636012920884346e-06, "loss": 1.0522, "step": 2715 }, { "epoch": 0.6218660560961649, "grad_norm": 1.2143323421478271, "learning_rate": 6.629028015822596e-06, "loss": 1.0254, "step": 2716 }, { "epoch": 0.6220950200343446, "grad_norm": 1.4381064176559448, "learning_rate": 6.622044965778471e-06, "loss": 1.1201, "step": 2717 }, { "epoch": 0.6223239839725243, "grad_norm": 1.1734505891799927, "learning_rate": 6.615063774594677e-06, "loss": 1.1097, "step": 2718 }, { "epoch": 0.622552947910704, "grad_norm": 1.087598204612732, "learning_rate": 6.608084446112909e-06, "loss": 1.0212, "step": 2719 }, { "epoch": 0.6227819118488838, "grad_norm": 1.2489269971847534, "learning_rate": 6.601106984173835e-06, "loss": 1.0581, "step": 2720 }, { "epoch": 0.6230108757870635, "grad_norm": 1.7127981185913086, "learning_rate": 6.594131392617087e-06, "loss": 1.0913, "step": 2721 }, { "epoch": 0.6232398397252433, "grad_norm": 2.1938893795013428, "learning_rate": 6.5871576752812845e-06, "loss": 1.0303, "step": 2722 }, { "epoch": 0.623468803663423, "grad_norm": 1.364212989807129, "learning_rate": 6.580185836003995e-06, "loss": 1.0338, "step": 2723 }, { "epoch": 0.6236977676016028, "grad_norm": 1.1102733612060547, "learning_rate": 6.573215878621769e-06, "loss": 1.076, "step": 2724 }, { "epoch": 0.6239267315397825, "grad_norm": 1.3565003871917725, "learning_rate": 6.566247806970119e-06, "loss": 1.0169, "step": 2725 }, { "epoch": 0.6241556954779622, "grad_norm": 1.652982234954834, "learning_rate": 6.559281624883506e-06, "loss": 0.9878, "step": 2726 }, { "epoch": 0.624384659416142, "grad_norm": 1.271844506263733, "learning_rate": 6.552317336195371e-06, "loss": 1.0609, "step": 2727 }, { "epoch": 0.6246136233543217, "grad_norm": 1.2755295038223267, "learning_rate": 6.5453549447381e-06, "loss": 1.0662, "step": 2728 }, { "epoch": 0.6248425872925014, "grad_norm": 1.265498399734497, "learning_rate": 6.53839445434304e-06, "loss": 1.0132, "step": 2729 }, { "epoch": 0.6250715512306811, "grad_norm": 1.5743672847747803, "learning_rate": 6.531435868840488e-06, "loss": 1.0412, "step": 2730 }, { "epoch": 0.6253005151688609, "grad_norm": 1.1162136793136597, "learning_rate": 6.524479192059699e-06, "loss": 1.0259, "step": 2731 }, { "epoch": 0.6255294791070406, "grad_norm": 1.2399483919143677, "learning_rate": 6.5175244278288705e-06, "loss": 1.0109, "step": 2732 }, { "epoch": 0.6257584430452203, "grad_norm": 1.169719934463501, "learning_rate": 6.510571579975155e-06, "loss": 0.9764, "step": 2733 }, { "epoch": 0.6259874069834002, "grad_norm": 1.7997078895568848, "learning_rate": 6.5036206523246404e-06, "loss": 1.0591, "step": 2734 }, { "epoch": 0.6262163709215799, "grad_norm": 1.2209597826004028, "learning_rate": 6.496671648702366e-06, "loss": 1.0484, "step": 2735 }, { "epoch": 0.6264453348597596, "grad_norm": 1.2143415212631226, "learning_rate": 6.489724572932314e-06, "loss": 1.0438, "step": 2736 }, { "epoch": 0.6266742987979393, "grad_norm": 1.148642897605896, "learning_rate": 6.4827794288374e-06, "loss": 1.028, "step": 2737 }, { "epoch": 0.6269032627361191, "grad_norm": 1.284192681312561, "learning_rate": 6.475836220239475e-06, "loss": 1.0812, "step": 2738 }, { "epoch": 0.6271322266742988, "grad_norm": 1.0696079730987549, "learning_rate": 6.468894950959336e-06, "loss": 1.1179, "step": 2739 }, { "epoch": 0.6273611906124785, "grad_norm": 1.2461477518081665, "learning_rate": 6.461955624816696e-06, "loss": 1.0082, "step": 2740 }, { "epoch": 0.6275901545506583, "grad_norm": 1.2268809080123901, "learning_rate": 6.455018245630214e-06, "loss": 1.0832, "step": 2741 }, { "epoch": 0.627819118488838, "grad_norm": 1.1206541061401367, "learning_rate": 6.4480828172174714e-06, "loss": 1.0686, "step": 2742 }, { "epoch": 0.6280480824270177, "grad_norm": 1.9735060930252075, "learning_rate": 6.441149343394975e-06, "loss": 1.0499, "step": 2743 }, { "epoch": 0.6282770463651974, "grad_norm": 1.1669235229492188, "learning_rate": 6.4342178279781584e-06, "loss": 1.0743, "step": 2744 }, { "epoch": 0.6285060103033773, "grad_norm": 1.3031091690063477, "learning_rate": 6.427288274781372e-06, "loss": 1.0164, "step": 2745 }, { "epoch": 0.628734974241557, "grad_norm": 1.2017693519592285, "learning_rate": 6.420360687617897e-06, "loss": 0.9844, "step": 2746 }, { "epoch": 0.6289639381797367, "grad_norm": 1.155867576599121, "learning_rate": 6.413435070299925e-06, "loss": 1.0669, "step": 2747 }, { "epoch": 0.6291929021179165, "grad_norm": 1.315988302230835, "learning_rate": 6.406511426638562e-06, "loss": 1.0525, "step": 2748 }, { "epoch": 0.6294218660560962, "grad_norm": 1.3917714357376099, "learning_rate": 6.3995897604438315e-06, "loss": 1.0961, "step": 2749 }, { "epoch": 0.6296508299942759, "grad_norm": 1.3738504648208618, "learning_rate": 6.392670075524674e-06, "loss": 1.0626, "step": 2750 }, { "epoch": 0.6298797939324556, "grad_norm": 1.1680269241333008, "learning_rate": 6.385752375688927e-06, "loss": 1.0226, "step": 2751 }, { "epoch": 0.6301087578706354, "grad_norm": 1.6877174377441406, "learning_rate": 6.378836664743347e-06, "loss": 1.036, "step": 2752 }, { "epoch": 0.6303377218088151, "grad_norm": 1.2052552700042725, "learning_rate": 6.3719229464935915e-06, "loss": 1.0776, "step": 2753 }, { "epoch": 0.6305666857469948, "grad_norm": 1.3558235168457031, "learning_rate": 6.365011224744218e-06, "loss": 1.0233, "step": 2754 }, { "epoch": 0.6307956496851745, "grad_norm": 1.0907037258148193, "learning_rate": 6.3581015032986945e-06, "loss": 0.9963, "step": 2755 }, { "epoch": 0.6310246136233543, "grad_norm": 1.4145336151123047, "learning_rate": 6.35119378595938e-06, "loss": 1.0576, "step": 2756 }, { "epoch": 0.6312535775615341, "grad_norm": 1.3490455150604248, "learning_rate": 6.344288076527532e-06, "loss": 1.0378, "step": 2757 }, { "epoch": 0.6314825414997138, "grad_norm": 1.4001924991607666, "learning_rate": 6.337384378803309e-06, "loss": 1.108, "step": 2758 }, { "epoch": 0.6317115054378936, "grad_norm": 1.187118411064148, "learning_rate": 6.330482696585749e-06, "loss": 1.0318, "step": 2759 }, { "epoch": 0.6319404693760733, "grad_norm": 1.5335361957550049, "learning_rate": 6.323583033672799e-06, "loss": 1.0561, "step": 2760 }, { "epoch": 0.632169433314253, "grad_norm": 1.461372971534729, "learning_rate": 6.316685393861284e-06, "loss": 0.988, "step": 2761 }, { "epoch": 0.6323983972524327, "grad_norm": 1.2753512859344482, "learning_rate": 6.309789780946916e-06, "loss": 1.07, "step": 2762 }, { "epoch": 0.6326273611906125, "grad_norm": 1.0639042854309082, "learning_rate": 6.302896198724288e-06, "loss": 1.122, "step": 2763 }, { "epoch": 0.6328563251287922, "grad_norm": 1.267246961593628, "learning_rate": 6.29600465098689e-06, "loss": 1.068, "step": 2764 }, { "epoch": 0.6330852890669719, "grad_norm": 1.2320417165756226, "learning_rate": 6.289115141527077e-06, "loss": 1.0611, "step": 2765 }, { "epoch": 0.6333142530051517, "grad_norm": 1.517449975013733, "learning_rate": 6.282227674136091e-06, "loss": 0.9906, "step": 2766 }, { "epoch": 0.6335432169433314, "grad_norm": 1.2116914987564087, "learning_rate": 6.275342252604044e-06, "loss": 1.0568, "step": 2767 }, { "epoch": 0.6337721808815112, "grad_norm": 1.178650140762329, "learning_rate": 6.2684588807199265e-06, "loss": 1.0065, "step": 2768 }, { "epoch": 0.6340011448196909, "grad_norm": 1.3236886262893677, "learning_rate": 6.261577562271605e-06, "loss": 0.9728, "step": 2769 }, { "epoch": 0.6342301087578707, "grad_norm": 1.2734071016311646, "learning_rate": 6.254698301045806e-06, "loss": 1.05, "step": 2770 }, { "epoch": 0.6344590726960504, "grad_norm": 1.281076431274414, "learning_rate": 6.247821100828131e-06, "loss": 1.0518, "step": 2771 }, { "epoch": 0.6346880366342301, "grad_norm": 1.1672574281692505, "learning_rate": 6.240945965403049e-06, "loss": 1.0612, "step": 2772 }, { "epoch": 0.6349170005724099, "grad_norm": 1.2151325941085815, "learning_rate": 6.234072898553882e-06, "loss": 1.0501, "step": 2773 }, { "epoch": 0.6351459645105896, "grad_norm": 1.215021014213562, "learning_rate": 6.22720190406283e-06, "loss": 1.0501, "step": 2774 }, { "epoch": 0.6353749284487693, "grad_norm": 1.1667354106903076, "learning_rate": 6.220332985710936e-06, "loss": 1.0068, "step": 2775 }, { "epoch": 0.635603892386949, "grad_norm": 1.4257439374923706, "learning_rate": 6.213466147278111e-06, "loss": 1.085, "step": 2776 }, { "epoch": 0.6358328563251288, "grad_norm": 1.5628901720046997, "learning_rate": 6.206601392543121e-06, "loss": 0.9748, "step": 2777 }, { "epoch": 0.6360618202633085, "grad_norm": 1.2145711183547974, "learning_rate": 6.199738725283578e-06, "loss": 1.0345, "step": 2778 }, { "epoch": 0.6362907842014882, "grad_norm": 1.2755681276321411, "learning_rate": 6.192878149275954e-06, "loss": 1.068, "step": 2779 }, { "epoch": 0.636519748139668, "grad_norm": 1.1560249328613281, "learning_rate": 6.186019668295568e-06, "loss": 1.0608, "step": 2780 }, { "epoch": 0.6367487120778478, "grad_norm": 1.1933772563934326, "learning_rate": 6.179163286116581e-06, "loss": 1.0713, "step": 2781 }, { "epoch": 0.6369776760160275, "grad_norm": 2.3430261611938477, "learning_rate": 6.172309006511999e-06, "loss": 0.9809, "step": 2782 }, { "epoch": 0.6372066399542072, "grad_norm": 1.1244544982910156, "learning_rate": 6.165456833253686e-06, "loss": 1.0319, "step": 2783 }, { "epoch": 0.637435603892387, "grad_norm": 1.41416597366333, "learning_rate": 6.1586067701123255e-06, "loss": 1.0276, "step": 2784 }, { "epoch": 0.6376645678305667, "grad_norm": 1.5440860986709595, "learning_rate": 6.151758820857455e-06, "loss": 1.0168, "step": 2785 }, { "epoch": 0.6378935317687464, "grad_norm": 1.2612404823303223, "learning_rate": 6.144912989257441e-06, "loss": 0.9923, "step": 2786 }, { "epoch": 0.6381224957069261, "grad_norm": 1.4268378019332886, "learning_rate": 6.138069279079484e-06, "loss": 1.0428, "step": 2787 }, { "epoch": 0.6383514596451059, "grad_norm": 1.4409279823303223, "learning_rate": 6.13122769408963e-06, "loss": 1.0381, "step": 2788 }, { "epoch": 0.6385804235832856, "grad_norm": 1.529488205909729, "learning_rate": 6.124388238052737e-06, "loss": 1.0536, "step": 2789 }, { "epoch": 0.6388093875214653, "grad_norm": 1.03910231590271, "learning_rate": 6.1175509147325015e-06, "loss": 0.986, "step": 2790 }, { "epoch": 0.6390383514596452, "grad_norm": 1.074388027191162, "learning_rate": 6.1107157278914545e-06, "loss": 1.066, "step": 2791 }, { "epoch": 0.6392673153978249, "grad_norm": 1.1778825521469116, "learning_rate": 6.1038826812909265e-06, "loss": 1.1038, "step": 2792 }, { "epoch": 0.6394962793360046, "grad_norm": 1.192124843597412, "learning_rate": 6.097051778691099e-06, "loss": 1.0755, "step": 2793 }, { "epoch": 0.6397252432741843, "grad_norm": 1.2130941152572632, "learning_rate": 6.090223023850954e-06, "loss": 1.0033, "step": 2794 }, { "epoch": 0.6399542072123641, "grad_norm": 1.1291038990020752, "learning_rate": 6.083396420528298e-06, "loss": 1.04, "step": 2795 }, { "epoch": 0.6401831711505438, "grad_norm": 1.3231033086776733, "learning_rate": 6.0765719724797586e-06, "loss": 1.0925, "step": 2796 }, { "epoch": 0.6404121350887235, "grad_norm": 1.6609188318252563, "learning_rate": 6.069749683460765e-06, "loss": 1.0937, "step": 2797 }, { "epoch": 0.6406410990269032, "grad_norm": 1.21103036403656, "learning_rate": 6.0629295572255695e-06, "loss": 0.951, "step": 2798 }, { "epoch": 0.640870062965083, "grad_norm": 1.0261344909667969, "learning_rate": 6.056111597527235e-06, "loss": 1.0446, "step": 2799 }, { "epoch": 0.6410990269032627, "grad_norm": 1.1342116594314575, "learning_rate": 6.0492958081176155e-06, "loss": 1.0956, "step": 2800 }, { "epoch": 0.6413279908414424, "grad_norm": 1.2922279834747314, "learning_rate": 6.042482192747394e-06, "loss": 1.0102, "step": 2801 }, { "epoch": 0.6415569547796222, "grad_norm": 1.2681461572647095, "learning_rate": 6.0356707551660434e-06, "loss": 0.9613, "step": 2802 }, { "epoch": 0.641785918717802, "grad_norm": 1.3258183002471924, "learning_rate": 6.0288614991218366e-06, "loss": 1.0068, "step": 2803 }, { "epoch": 0.6420148826559817, "grad_norm": 1.2088922262191772, "learning_rate": 6.022054428361852e-06, "loss": 1.0253, "step": 2804 }, { "epoch": 0.6422438465941614, "grad_norm": 1.0741603374481201, "learning_rate": 6.015249546631969e-06, "loss": 0.9941, "step": 2805 }, { "epoch": 0.6424728105323412, "grad_norm": 1.408013105392456, "learning_rate": 6.008446857676849e-06, "loss": 1.0913, "step": 2806 }, { "epoch": 0.6427017744705209, "grad_norm": 1.087931752204895, "learning_rate": 6.001646365239959e-06, "loss": 1.0355, "step": 2807 }, { "epoch": 0.6429307384087006, "grad_norm": 1.1895891427993774, "learning_rate": 5.994848073063552e-06, "loss": 1.1122, "step": 2808 }, { "epoch": 0.6431597023468804, "grad_norm": 1.3358407020568848, "learning_rate": 5.988051984888668e-06, "loss": 1.0171, "step": 2809 }, { "epoch": 0.6433886662850601, "grad_norm": 1.3957514762878418, "learning_rate": 5.9812581044551475e-06, "loss": 1.0719, "step": 2810 }, { "epoch": 0.6436176302232398, "grad_norm": 2.103109836578369, "learning_rate": 5.974466435501591e-06, "loss": 1.0735, "step": 2811 }, { "epoch": 0.6438465941614195, "grad_norm": 1.2422171831130981, "learning_rate": 5.967676981765409e-06, "loss": 1.0396, "step": 2812 }, { "epoch": 0.6440755580995993, "grad_norm": 1.1619694232940674, "learning_rate": 5.960889746982778e-06, "loss": 1.063, "step": 2813 }, { "epoch": 0.6443045220377791, "grad_norm": 1.4143716096878052, "learning_rate": 5.954104734888653e-06, "loss": 1.0359, "step": 2814 }, { "epoch": 0.6445334859759588, "grad_norm": 1.296014666557312, "learning_rate": 5.947321949216771e-06, "loss": 1.0085, "step": 2815 }, { "epoch": 0.6447624499141386, "grad_norm": 1.0912615060806274, "learning_rate": 5.940541393699646e-06, "loss": 1.0417, "step": 2816 }, { "epoch": 0.6449914138523183, "grad_norm": 1.079500675201416, "learning_rate": 5.933763072068554e-06, "loss": 0.9998, "step": 2817 }, { "epoch": 0.645220377790498, "grad_norm": 1.2979693412780762, "learning_rate": 5.926986988053557e-06, "loss": 1.0842, "step": 2818 }, { "epoch": 0.6454493417286777, "grad_norm": 1.6231458187103271, "learning_rate": 5.9202131453834664e-06, "loss": 1.0377, "step": 2819 }, { "epoch": 0.6456783056668575, "grad_norm": 1.2327483892440796, "learning_rate": 5.913441547785879e-06, "loss": 1.0495, "step": 2820 }, { "epoch": 0.6459072696050372, "grad_norm": 1.5749176740646362, "learning_rate": 5.906672198987149e-06, "loss": 1.0473, "step": 2821 }, { "epoch": 0.6461362335432169, "grad_norm": 1.3406803607940674, "learning_rate": 5.899905102712386e-06, "loss": 1.062, "step": 2822 }, { "epoch": 0.6463651974813966, "grad_norm": 1.0731439590454102, "learning_rate": 5.893140262685469e-06, "loss": 1.0155, "step": 2823 }, { "epoch": 0.6465941614195764, "grad_norm": 1.2645176649093628, "learning_rate": 5.886377682629037e-06, "loss": 0.9567, "step": 2824 }, { "epoch": 0.6468231253577561, "grad_norm": 1.1633598804473877, "learning_rate": 5.879617366264476e-06, "loss": 0.9959, "step": 2825 }, { "epoch": 0.6470520892959359, "grad_norm": 1.266831874847412, "learning_rate": 5.872859317311933e-06, "loss": 1.035, "step": 2826 }, { "epoch": 0.6472810532341157, "grad_norm": 1.426615595817566, "learning_rate": 5.866103539490307e-06, "loss": 1.0236, "step": 2827 }, { "epoch": 0.6475100171722954, "grad_norm": 1.0592693090438843, "learning_rate": 5.859350036517242e-06, "loss": 1.0293, "step": 2828 }, { "epoch": 0.6477389811104751, "grad_norm": 1.2604130506515503, "learning_rate": 5.852598812109139e-06, "loss": 1.0833, "step": 2829 }, { "epoch": 0.6479679450486548, "grad_norm": 1.2905731201171875, "learning_rate": 5.845849869981137e-06, "loss": 1.109, "step": 2830 }, { "epoch": 0.6481969089868346, "grad_norm": 1.1893059015274048, "learning_rate": 5.839103213847123e-06, "loss": 1.0075, "step": 2831 }, { "epoch": 0.6484258729250143, "grad_norm": 1.5178680419921875, "learning_rate": 5.832358847419728e-06, "loss": 1.0779, "step": 2832 }, { "epoch": 0.648654836863194, "grad_norm": 1.2127212285995483, "learning_rate": 5.8256167744103144e-06, "loss": 1.0206, "step": 2833 }, { "epoch": 0.6488838008013738, "grad_norm": 1.2147332429885864, "learning_rate": 5.818876998528988e-06, "loss": 1.019, "step": 2834 }, { "epoch": 0.6491127647395535, "grad_norm": 1.7146003246307373, "learning_rate": 5.812139523484604e-06, "loss": 0.9919, "step": 2835 }, { "epoch": 0.6493417286777332, "grad_norm": 1.2911676168441772, "learning_rate": 5.805404352984724e-06, "loss": 1.0572, "step": 2836 }, { "epoch": 0.649570692615913, "grad_norm": 1.3293169736862183, "learning_rate": 5.7986714907356614e-06, "loss": 0.9918, "step": 2837 }, { "epoch": 0.6497996565540928, "grad_norm": 1.9981685876846313, "learning_rate": 5.791940940442453e-06, "loss": 1.0236, "step": 2838 }, { "epoch": 0.6500286204922725, "grad_norm": 1.25807785987854, "learning_rate": 5.785212705808865e-06, "loss": 1.0795, "step": 2839 }, { "epoch": 0.6502575844304522, "grad_norm": 1.261753797531128, "learning_rate": 5.778486790537392e-06, "loss": 1.087, "step": 2840 }, { "epoch": 0.650486548368632, "grad_norm": 1.0655620098114014, "learning_rate": 5.7717631983292375e-06, "loss": 1.0383, "step": 2841 }, { "epoch": 0.6507155123068117, "grad_norm": 1.2953264713287354, "learning_rate": 5.76504193288435e-06, "loss": 1.0587, "step": 2842 }, { "epoch": 0.6509444762449914, "grad_norm": 1.2501654624938965, "learning_rate": 5.758322997901384e-06, "loss": 1.0403, "step": 2843 }, { "epoch": 0.6511734401831711, "grad_norm": 1.1158286333084106, "learning_rate": 5.751606397077703e-06, "loss": 1.0854, "step": 2844 }, { "epoch": 0.6514024041213509, "grad_norm": 1.357792615890503, "learning_rate": 5.744892134109406e-06, "loss": 1.1166, "step": 2845 }, { "epoch": 0.6516313680595306, "grad_norm": 1.070070505142212, "learning_rate": 5.738180212691296e-06, "loss": 1.0345, "step": 2846 }, { "epoch": 0.6518603319977103, "grad_norm": 1.1419059038162231, "learning_rate": 5.7314706365168806e-06, "loss": 1.0482, "step": 2847 }, { "epoch": 0.65208929593589, "grad_norm": 1.5696709156036377, "learning_rate": 5.724763409278383e-06, "loss": 1.0254, "step": 2848 }, { "epoch": 0.6523182598740699, "grad_norm": 1.2937999963760376, "learning_rate": 5.718058534666746e-06, "loss": 1.078, "step": 2849 }, { "epoch": 0.6525472238122496, "grad_norm": 1.8056284189224243, "learning_rate": 5.711356016371593e-06, "loss": 1.0889, "step": 2850 }, { "epoch": 0.6527761877504293, "grad_norm": 1.4800915718078613, "learning_rate": 5.704655858081268e-06, "loss": 1.0542, "step": 2851 }, { "epoch": 0.6530051516886091, "grad_norm": 1.4089901447296143, "learning_rate": 5.6979580634828125e-06, "loss": 1.0553, "step": 2852 }, { "epoch": 0.6532341156267888, "grad_norm": 1.154541254043579, "learning_rate": 5.691262636261967e-06, "loss": 1.0012, "step": 2853 }, { "epoch": 0.6534630795649685, "grad_norm": 1.3811694383621216, "learning_rate": 5.684569580103171e-06, "loss": 0.9693, "step": 2854 }, { "epoch": 0.6536920435031482, "grad_norm": 2.098176956176758, "learning_rate": 5.6778788986895464e-06, "loss": 1.1379, "step": 2855 }, { "epoch": 0.653921007441328, "grad_norm": 1.450215458869934, "learning_rate": 5.671190595702932e-06, "loss": 1.0369, "step": 2856 }, { "epoch": 0.6541499713795077, "grad_norm": 1.3346037864685059, "learning_rate": 5.664504674823844e-06, "loss": 1.128, "step": 2857 }, { "epoch": 0.6543789353176874, "grad_norm": 2.112135171890259, "learning_rate": 5.6578211397314765e-06, "loss": 1.0601, "step": 2858 }, { "epoch": 0.6546078992558672, "grad_norm": 1.497977614402771, "learning_rate": 5.6511399941037344e-06, "loss": 1.0149, "step": 2859 }, { "epoch": 0.654836863194047, "grad_norm": 1.1439969539642334, "learning_rate": 5.6444612416171976e-06, "loss": 0.9617, "step": 2860 }, { "epoch": 0.6550658271322267, "grad_norm": 1.8310574293136597, "learning_rate": 5.637784885947117e-06, "loss": 1.0273, "step": 2861 }, { "epoch": 0.6552947910704064, "grad_norm": 1.3659378290176392, "learning_rate": 5.631110930767443e-06, "loss": 1.058, "step": 2862 }, { "epoch": 0.6555237550085862, "grad_norm": 1.355661153793335, "learning_rate": 5.6244393797507944e-06, "loss": 1.0411, "step": 2863 }, { "epoch": 0.6557527189467659, "grad_norm": 0.9867006540298462, "learning_rate": 5.617770236568469e-06, "loss": 0.9702, "step": 2864 }, { "epoch": 0.6559816828849456, "grad_norm": 1.066946029663086, "learning_rate": 5.611103504890444e-06, "loss": 1.0521, "step": 2865 }, { "epoch": 0.6562106468231254, "grad_norm": 1.2276383638381958, "learning_rate": 5.604439188385362e-06, "loss": 0.9953, "step": 2866 }, { "epoch": 0.6564396107613051, "grad_norm": 1.2933861017227173, "learning_rate": 5.597777290720543e-06, "loss": 1.0319, "step": 2867 }, { "epoch": 0.6566685746994848, "grad_norm": 1.144028902053833, "learning_rate": 5.591117815561973e-06, "loss": 1.0391, "step": 2868 }, { "epoch": 0.6568975386376645, "grad_norm": 1.4104443788528442, "learning_rate": 5.584460766574304e-06, "loss": 1.0806, "step": 2869 }, { "epoch": 0.6571265025758443, "grad_norm": 2.108306884765625, "learning_rate": 5.5778061474208565e-06, "loss": 1.0714, "step": 2870 }, { "epoch": 0.657355466514024, "grad_norm": 1.3675113916397095, "learning_rate": 5.571153961763613e-06, "loss": 1.0704, "step": 2871 }, { "epoch": 0.6575844304522038, "grad_norm": 1.1373252868652344, "learning_rate": 5.564504213263205e-06, "loss": 1.0022, "step": 2872 }, { "epoch": 0.6578133943903836, "grad_norm": 5.982883930206299, "learning_rate": 5.55785690557895e-06, "loss": 1.1152, "step": 2873 }, { "epoch": 0.6580423583285633, "grad_norm": 1.589943289756775, "learning_rate": 5.551212042368792e-06, "loss": 1.0449, "step": 2874 }, { "epoch": 0.658271322266743, "grad_norm": 1.2605445384979248, "learning_rate": 5.54456962728935e-06, "loss": 1.0192, "step": 2875 }, { "epoch": 0.6585002862049227, "grad_norm": 1.27881920337677, "learning_rate": 5.537929663995887e-06, "loss": 1.1066, "step": 2876 }, { "epoch": 0.6587292501431025, "grad_norm": 2.3701119422912598, "learning_rate": 5.531292156142319e-06, "loss": 1.0039, "step": 2877 }, { "epoch": 0.6589582140812822, "grad_norm": 2.732001543045044, "learning_rate": 5.5246571073812124e-06, "loss": 1.0069, "step": 2878 }, { "epoch": 0.6591871780194619, "grad_norm": 1.3857358694076538, "learning_rate": 5.5180245213637785e-06, "loss": 1.0746, "step": 2879 }, { "epoch": 0.6594161419576416, "grad_norm": 1.0118176937103271, "learning_rate": 5.511394401739874e-06, "loss": 1.0039, "step": 2880 }, { "epoch": 0.6596451058958214, "grad_norm": 1.1465644836425781, "learning_rate": 5.504766752157997e-06, "loss": 1.0596, "step": 2881 }, { "epoch": 0.6598740698340011, "grad_norm": 1.1057982444763184, "learning_rate": 5.498141576265289e-06, "loss": 1.0616, "step": 2882 }, { "epoch": 0.6601030337721809, "grad_norm": 1.2650837898254395, "learning_rate": 5.491518877707527e-06, "loss": 1.045, "step": 2883 }, { "epoch": 0.6603319977103607, "grad_norm": 1.2822803258895874, "learning_rate": 5.484898660129132e-06, "loss": 1.0242, "step": 2884 }, { "epoch": 0.6605609616485404, "grad_norm": 1.1227819919586182, "learning_rate": 5.478280927173145e-06, "loss": 0.9838, "step": 2885 }, { "epoch": 0.6607899255867201, "grad_norm": 2.5325989723205566, "learning_rate": 5.4716656824812505e-06, "loss": 1.0738, "step": 2886 }, { "epoch": 0.6610188895248998, "grad_norm": 2.1725833415985107, "learning_rate": 5.465052929693774e-06, "loss": 1.0192, "step": 2887 }, { "epoch": 0.6612478534630796, "grad_norm": 1.7834242582321167, "learning_rate": 5.458442672449644e-06, "loss": 1.0923, "step": 2888 }, { "epoch": 0.6614768174012593, "grad_norm": 1.2370448112487793, "learning_rate": 5.451834914386435e-06, "loss": 1.0404, "step": 2889 }, { "epoch": 0.661705781339439, "grad_norm": 1.3691270351409912, "learning_rate": 5.445229659140341e-06, "loss": 1.0226, "step": 2890 }, { "epoch": 0.6619347452776188, "grad_norm": 1.2318962812423706, "learning_rate": 5.4386269103461785e-06, "loss": 1.039, "step": 2891 }, { "epoch": 0.6621637092157985, "grad_norm": 1.2632200717926025, "learning_rate": 5.432026671637385e-06, "loss": 1.077, "step": 2892 }, { "epoch": 0.6623926731539782, "grad_norm": 1.4517078399658203, "learning_rate": 5.425428946646016e-06, "loss": 1.0304, "step": 2893 }, { "epoch": 0.6626216370921579, "grad_norm": 1.1040912866592407, "learning_rate": 5.418833739002745e-06, "loss": 1.0163, "step": 2894 }, { "epoch": 0.6628506010303378, "grad_norm": 2.0394599437713623, "learning_rate": 5.4122410523368615e-06, "loss": 1.0289, "step": 2895 }, { "epoch": 0.6630795649685175, "grad_norm": 1.1909575462341309, "learning_rate": 5.405650890276255e-06, "loss": 1.0632, "step": 2896 }, { "epoch": 0.6633085289066972, "grad_norm": 1.359932541847229, "learning_rate": 5.39906325644745e-06, "loss": 0.9915, "step": 2897 }, { "epoch": 0.663537492844877, "grad_norm": 1.3327099084854126, "learning_rate": 5.392478154475565e-06, "loss": 1.0265, "step": 2898 }, { "epoch": 0.6637664567830567, "grad_norm": 1.2770649194717407, "learning_rate": 5.3858955879843155e-06, "loss": 1.0492, "step": 2899 }, { "epoch": 0.6639954207212364, "grad_norm": 1.136519193649292, "learning_rate": 5.379315560596038e-06, "loss": 1.027, "step": 2900 }, { "epoch": 0.6642243846594161, "grad_norm": 1.217628836631775, "learning_rate": 5.372738075931674e-06, "loss": 0.9827, "step": 2901 }, { "epoch": 0.6644533485975959, "grad_norm": 1.2686570882797241, "learning_rate": 5.366163137610749e-06, "loss": 1.063, "step": 2902 }, { "epoch": 0.6646823125357756, "grad_norm": 1.29387629032135, "learning_rate": 5.359590749251397e-06, "loss": 1.0409, "step": 2903 }, { "epoch": 0.6649112764739553, "grad_norm": 1.0357317924499512, "learning_rate": 5.353020914470353e-06, "loss": 1.0741, "step": 2904 }, { "epoch": 0.665140240412135, "grad_norm": 1.323845624923706, "learning_rate": 5.346453636882939e-06, "loss": 1.1121, "step": 2905 }, { "epoch": 0.6653692043503149, "grad_norm": 1.1600542068481445, "learning_rate": 5.339888920103074e-06, "loss": 1.0454, "step": 2906 }, { "epoch": 0.6655981682884946, "grad_norm": 1.1847002506256104, "learning_rate": 5.333326767743263e-06, "loss": 1.0733, "step": 2907 }, { "epoch": 0.6658271322266743, "grad_norm": 1.0907032489776611, "learning_rate": 5.326767183414609e-06, "loss": 1.0321, "step": 2908 }, { "epoch": 0.6660560961648541, "grad_norm": 1.4598631858825684, "learning_rate": 5.320210170726796e-06, "loss": 1.0596, "step": 2909 }, { "epoch": 0.6662850601030338, "grad_norm": 1.2866328954696655, "learning_rate": 5.313655733288083e-06, "loss": 1.0658, "step": 2910 }, { "epoch": 0.6665140240412135, "grad_norm": 1.1192216873168945, "learning_rate": 5.307103874705335e-06, "loss": 1.0721, "step": 2911 }, { "epoch": 0.6667429879793932, "grad_norm": 1.190312385559082, "learning_rate": 5.300554598583982e-06, "loss": 1.0158, "step": 2912 }, { "epoch": 0.666971951917573, "grad_norm": 1.0897172689437866, "learning_rate": 5.294007908528029e-06, "loss": 0.9763, "step": 2913 }, { "epoch": 0.6672009158557527, "grad_norm": 1.4546432495117188, "learning_rate": 5.287463808140069e-06, "loss": 1.0753, "step": 2914 }, { "epoch": 0.6674298797939324, "grad_norm": 1.2672525644302368, "learning_rate": 5.280922301021267e-06, "loss": 1.0099, "step": 2915 }, { "epoch": 0.6676588437321122, "grad_norm": 1.5939594507217407, "learning_rate": 5.274383390771356e-06, "loss": 1.1524, "step": 2916 }, { "epoch": 0.6678878076702919, "grad_norm": 1.3578407764434814, "learning_rate": 5.267847080988647e-06, "loss": 0.9973, "step": 2917 }, { "epoch": 0.6681167716084717, "grad_norm": 1.221137285232544, "learning_rate": 5.2613133752700145e-06, "loss": 1.0632, "step": 2918 }, { "epoch": 0.6683457355466514, "grad_norm": 1.5474742650985718, "learning_rate": 5.254782277210901e-06, "loss": 1.0569, "step": 2919 }, { "epoch": 0.6685746994848312, "grad_norm": 1.647705078125, "learning_rate": 5.2482537904053185e-06, "loss": 1.0202, "step": 2920 }, { "epoch": 0.6688036634230109, "grad_norm": 1.6697813272476196, "learning_rate": 5.241727918445836e-06, "loss": 1.0308, "step": 2921 }, { "epoch": 0.6690326273611906, "grad_norm": 1.1288517713546753, "learning_rate": 5.235204664923586e-06, "loss": 1.0139, "step": 2922 }, { "epoch": 0.6692615912993704, "grad_norm": 0.9977933168411255, "learning_rate": 5.228684033428265e-06, "loss": 1.0381, "step": 2923 }, { "epoch": 0.6694905552375501, "grad_norm": 1.2061574459075928, "learning_rate": 5.22216602754811e-06, "loss": 1.0258, "step": 2924 }, { "epoch": 0.6697195191757298, "grad_norm": 1.1679553985595703, "learning_rate": 5.215650650869941e-06, "loss": 1.0316, "step": 2925 }, { "epoch": 0.6699484831139095, "grad_norm": 1.209476351737976, "learning_rate": 5.209137906979102e-06, "loss": 1.0115, "step": 2926 }, { "epoch": 0.6701774470520893, "grad_norm": 1.3832190036773682, "learning_rate": 5.202627799459503e-06, "loss": 1.0837, "step": 2927 }, { "epoch": 0.670406410990269, "grad_norm": 1.2740558385849, "learning_rate": 5.1961203318936116e-06, "loss": 1.0364, "step": 2928 }, { "epoch": 0.6706353749284488, "grad_norm": 1.1675924062728882, "learning_rate": 5.1896155078624225e-06, "loss": 1.0948, "step": 2929 }, { "epoch": 0.6708643388666286, "grad_norm": 1.704378604888916, "learning_rate": 5.183113330945488e-06, "loss": 1.0411, "step": 2930 }, { "epoch": 0.6710933028048083, "grad_norm": 1.6397212743759155, "learning_rate": 5.176613804720905e-06, "loss": 1.0228, "step": 2931 }, { "epoch": 0.671322266742988, "grad_norm": 1.287341833114624, "learning_rate": 5.170116932765304e-06, "loss": 1.0504, "step": 2932 }, { "epoch": 0.6715512306811677, "grad_norm": 1.3021739721298218, "learning_rate": 5.1636227186538625e-06, "loss": 1.0174, "step": 2933 }, { "epoch": 0.6717801946193475, "grad_norm": 1.6247448921203613, "learning_rate": 5.157131165960289e-06, "loss": 1.0317, "step": 2934 }, { "epoch": 0.6720091585575272, "grad_norm": 2.791722536087036, "learning_rate": 5.1506422782568345e-06, "loss": 1.0632, "step": 2935 }, { "epoch": 0.6722381224957069, "grad_norm": 1.033670425415039, "learning_rate": 5.144156059114279e-06, "loss": 1.0681, "step": 2936 }, { "epoch": 0.6724670864338866, "grad_norm": 1.6212668418884277, "learning_rate": 5.137672512101925e-06, "loss": 1.0611, "step": 2937 }, { "epoch": 0.6726960503720664, "grad_norm": 1.806549310684204, "learning_rate": 5.131191640787627e-06, "loss": 1.0404, "step": 2938 }, { "epoch": 0.6729250143102461, "grad_norm": 1.3719223737716675, "learning_rate": 5.124713448737753e-06, "loss": 1.0456, "step": 2939 }, { "epoch": 0.6731539782484258, "grad_norm": 1.4912806749343872, "learning_rate": 5.11823793951719e-06, "loss": 1.0434, "step": 2940 }, { "epoch": 0.6733829421866057, "grad_norm": 1.3837188482284546, "learning_rate": 5.111765116689355e-06, "loss": 1.024, "step": 2941 }, { "epoch": 0.6736119061247854, "grad_norm": 1.8850486278533936, "learning_rate": 5.105294983816203e-06, "loss": 1.0059, "step": 2942 }, { "epoch": 0.6738408700629651, "grad_norm": 1.2506767511367798, "learning_rate": 5.098827544458178e-06, "loss": 1.0762, "step": 2943 }, { "epoch": 0.6740698340011448, "grad_norm": 1.9231312274932861, "learning_rate": 5.0923628021742644e-06, "loss": 1.0204, "step": 2944 }, { "epoch": 0.6742987979393246, "grad_norm": 1.2224328517913818, "learning_rate": 5.085900760521955e-06, "loss": 1.0187, "step": 2945 }, { "epoch": 0.6745277618775043, "grad_norm": 1.4136698246002197, "learning_rate": 5.079441423057259e-06, "loss": 1.0467, "step": 2946 }, { "epoch": 0.674756725815684, "grad_norm": 1.244694709777832, "learning_rate": 5.072984793334696e-06, "loss": 1.0298, "step": 2947 }, { "epoch": 0.6749856897538637, "grad_norm": 1.417830467224121, "learning_rate": 5.066530874907285e-06, "loss": 1.0746, "step": 2948 }, { "epoch": 0.6752146536920435, "grad_norm": 1.1816048622131348, "learning_rate": 5.060079671326577e-06, "loss": 1.0203, "step": 2949 }, { "epoch": 0.6754436176302232, "grad_norm": 1.4444321393966675, "learning_rate": 5.053631186142612e-06, "loss": 1.0502, "step": 2950 }, { "epoch": 0.6756725815684029, "grad_norm": 1.1188122034072876, "learning_rate": 5.0471854229039286e-06, "loss": 1.0968, "step": 2951 }, { "epoch": 0.6759015455065828, "grad_norm": 1.4322268962860107, "learning_rate": 5.040742385157584e-06, "loss": 1.0962, "step": 2952 }, { "epoch": 0.6761305094447625, "grad_norm": 1.2078114748001099, "learning_rate": 5.034302076449132e-06, "loss": 1.0694, "step": 2953 }, { "epoch": 0.6763594733829422, "grad_norm": 4.1897292137146, "learning_rate": 5.027864500322611e-06, "loss": 1.0731, "step": 2954 }, { "epoch": 0.676588437321122, "grad_norm": 1.3412197828292847, "learning_rate": 5.021429660320565e-06, "loss": 1.074, "step": 2955 }, { "epoch": 0.6768174012593017, "grad_norm": 2.866015911102295, "learning_rate": 5.014997559984045e-06, "loss": 0.9959, "step": 2956 }, { "epoch": 0.6770463651974814, "grad_norm": 1.168531894683838, "learning_rate": 5.008568202852569e-06, "loss": 1.0117, "step": 2957 }, { "epoch": 0.6772753291356611, "grad_norm": 1.4421766996383667, "learning_rate": 5.002141592464162e-06, "loss": 1.0038, "step": 2958 }, { "epoch": 0.6775042930738409, "grad_norm": 1.2987374067306519, "learning_rate": 4.995717732355335e-06, "loss": 1.03, "step": 2959 }, { "epoch": 0.6777332570120206, "grad_norm": 1.224198818206787, "learning_rate": 4.989296626061084e-06, "loss": 1.0346, "step": 2960 }, { "epoch": 0.6779622209502003, "grad_norm": 2.0996627807617188, "learning_rate": 4.982878277114891e-06, "loss": 1.0766, "step": 2961 }, { "epoch": 0.67819118488838, "grad_norm": 1.3319966793060303, "learning_rate": 4.976462689048718e-06, "loss": 1.002, "step": 2962 }, { "epoch": 0.6784201488265598, "grad_norm": 1.1799633502960205, "learning_rate": 4.970049865393009e-06, "loss": 1.0168, "step": 2963 }, { "epoch": 0.6786491127647396, "grad_norm": 1.1029752492904663, "learning_rate": 4.963639809676692e-06, "loss": 1.0343, "step": 2964 }, { "epoch": 0.6788780767029193, "grad_norm": 1.0645133256912231, "learning_rate": 4.957232525427156e-06, "loss": 1.0062, "step": 2965 }, { "epoch": 0.6791070406410991, "grad_norm": 1.2702126502990723, "learning_rate": 4.950828016170286e-06, "loss": 1.0674, "step": 2966 }, { "epoch": 0.6793360045792788, "grad_norm": 1.3423794507980347, "learning_rate": 4.94442628543043e-06, "loss": 1.0065, "step": 2967 }, { "epoch": 0.6795649685174585, "grad_norm": 1.3491889238357544, "learning_rate": 4.9380273367304e-06, "loss": 1.0039, "step": 2968 }, { "epoch": 0.6797939324556382, "grad_norm": 1.332449197769165, "learning_rate": 4.931631173591487e-06, "loss": 1.015, "step": 2969 }, { "epoch": 0.680022896393818, "grad_norm": 1.2349896430969238, "learning_rate": 4.925237799533445e-06, "loss": 1.0143, "step": 2970 }, { "epoch": 0.6802518603319977, "grad_norm": 1.296811580657959, "learning_rate": 4.918847218074495e-06, "loss": 1.0468, "step": 2971 }, { "epoch": 0.6804808242701774, "grad_norm": 1.2404383420944214, "learning_rate": 4.912459432731322e-06, "loss": 1.0548, "step": 2972 }, { "epoch": 0.6807097882083571, "grad_norm": 1.2925716638565063, "learning_rate": 4.906074447019068e-06, "loss": 1.0182, "step": 2973 }, { "epoch": 0.6809387521465369, "grad_norm": 1.7651426792144775, "learning_rate": 4.899692264451339e-06, "loss": 1.028, "step": 2974 }, { "epoch": 0.6811677160847167, "grad_norm": 2.1795103549957275, "learning_rate": 4.893312888540195e-06, "loss": 1.0185, "step": 2975 }, { "epoch": 0.6813966800228964, "grad_norm": 1.4040412902832031, "learning_rate": 4.886936322796154e-06, "loss": 1.015, "step": 2976 }, { "epoch": 0.6816256439610762, "grad_norm": 2.831342935562134, "learning_rate": 4.880562570728188e-06, "loss": 1.0135, "step": 2977 }, { "epoch": 0.6818546078992559, "grad_norm": 1.828460454940796, "learning_rate": 4.87419163584372e-06, "loss": 1.0305, "step": 2978 }, { "epoch": 0.6820835718374356, "grad_norm": 1.3774365186691284, "learning_rate": 4.867823521648613e-06, "loss": 1.0625, "step": 2979 }, { "epoch": 0.6823125357756153, "grad_norm": 1.7624365091323853, "learning_rate": 4.861458231647202e-06, "loss": 1.0516, "step": 2980 }, { "epoch": 0.6825414997137951, "grad_norm": 1.3166933059692383, "learning_rate": 4.855095769342241e-06, "loss": 1.0544, "step": 2981 }, { "epoch": 0.6827704636519748, "grad_norm": 2.098736047744751, "learning_rate": 4.848736138234943e-06, "loss": 1.0258, "step": 2982 }, { "epoch": 0.6829994275901545, "grad_norm": 1.0946608781814575, "learning_rate": 4.842379341824958e-06, "loss": 1.0407, "step": 2983 }, { "epoch": 0.6832283915283343, "grad_norm": 1.1885132789611816, "learning_rate": 4.836025383610382e-06, "loss": 1.0462, "step": 2984 }, { "epoch": 0.683457355466514, "grad_norm": 5.834176540374756, "learning_rate": 4.829674267087742e-06, "loss": 1.0261, "step": 2985 }, { "epoch": 0.6836863194046937, "grad_norm": 3.0932862758636475, "learning_rate": 4.823325995752005e-06, "loss": 1.0829, "step": 2986 }, { "epoch": 0.6839152833428735, "grad_norm": 1.27007257938385, "learning_rate": 4.816980573096571e-06, "loss": 1.1001, "step": 2987 }, { "epoch": 0.6841442472810533, "grad_norm": 1.145909070968628, "learning_rate": 4.810638002613273e-06, "loss": 1.0904, "step": 2988 }, { "epoch": 0.684373211219233, "grad_norm": 1.4148883819580078, "learning_rate": 4.804298287792374e-06, "loss": 1.0352, "step": 2989 }, { "epoch": 0.6846021751574127, "grad_norm": 1.51638925075531, "learning_rate": 4.797961432122568e-06, "loss": 1.0851, "step": 2990 }, { "epoch": 0.6848311390955925, "grad_norm": 2.6201398372650146, "learning_rate": 4.791627439090975e-06, "loss": 1.0082, "step": 2991 }, { "epoch": 0.6850601030337722, "grad_norm": 1.3478920459747314, "learning_rate": 4.785296312183131e-06, "loss": 1.0192, "step": 2992 }, { "epoch": 0.6852890669719519, "grad_norm": 2.5813474655151367, "learning_rate": 4.778968054883002e-06, "loss": 1.0566, "step": 2993 }, { "epoch": 0.6855180309101316, "grad_norm": 1.4003512859344482, "learning_rate": 4.772642670672988e-06, "loss": 1.0256, "step": 2994 }, { "epoch": 0.6857469948483114, "grad_norm": 1.65157151222229, "learning_rate": 4.766320163033882e-06, "loss": 1.081, "step": 2995 }, { "epoch": 0.6859759587864911, "grad_norm": 1.354181170463562, "learning_rate": 4.7600005354449075e-06, "loss": 0.9829, "step": 2996 }, { "epoch": 0.6862049227246708, "grad_norm": 1.2161715030670166, "learning_rate": 4.753683791383713e-06, "loss": 1.0763, "step": 2997 }, { "epoch": 0.6864338866628507, "grad_norm": 1.2519667148590088, "learning_rate": 4.74736993432634e-06, "loss": 1.0668, "step": 2998 }, { "epoch": 0.6866628506010304, "grad_norm": 1.2776641845703125, "learning_rate": 4.741058967747254e-06, "loss": 1.0412, "step": 2999 }, { "epoch": 0.6868918145392101, "grad_norm": 1.4053977727890015, "learning_rate": 4.734750895119327e-06, "loss": 1.079, "step": 3000 }, { "epoch": 0.6871207784773898, "grad_norm": 1.4398205280303955, "learning_rate": 4.7284457199138374e-06, "loss": 1.0251, "step": 3001 }, { "epoch": 0.6873497424155696, "grad_norm": 1.4499075412750244, "learning_rate": 4.722143445600477e-06, "loss": 1.0771, "step": 3002 }, { "epoch": 0.6875787063537493, "grad_norm": 1.4174058437347412, "learning_rate": 4.71584407564732e-06, "loss": 1.0093, "step": 3003 }, { "epoch": 0.687807670291929, "grad_norm": 1.425048589706421, "learning_rate": 4.70954761352087e-06, "loss": 1.0117, "step": 3004 }, { "epoch": 0.6880366342301087, "grad_norm": 1.1394058465957642, "learning_rate": 4.703254062686017e-06, "loss": 1.0435, "step": 3005 }, { "epoch": 0.6882655981682885, "grad_norm": 1.4277303218841553, "learning_rate": 4.696963426606041e-06, "loss": 1.0903, "step": 3006 }, { "epoch": 0.6884945621064682, "grad_norm": 1.744752287864685, "learning_rate": 4.690675708742628e-06, "loss": 1.0468, "step": 3007 }, { "epoch": 0.6887235260446479, "grad_norm": 1.4332195520401, "learning_rate": 4.684390912555866e-06, "loss": 1.0169, "step": 3008 }, { "epoch": 0.6889524899828277, "grad_norm": 1.3215627670288086, "learning_rate": 4.678109041504215e-06, "loss": 1.0733, "step": 3009 }, { "epoch": 0.6891814539210075, "grad_norm": 1.8956875801086426, "learning_rate": 4.671830099044536e-06, "loss": 1.0317, "step": 3010 }, { "epoch": 0.6894104178591872, "grad_norm": 1.260425090789795, "learning_rate": 4.665554088632089e-06, "loss": 1.0633, "step": 3011 }, { "epoch": 0.689639381797367, "grad_norm": 1.27289617061615, "learning_rate": 4.6592810137205e-06, "loss": 1.0074, "step": 3012 }, { "epoch": 0.6898683457355467, "grad_norm": 1.8777307271957397, "learning_rate": 4.653010877761793e-06, "loss": 1.0374, "step": 3013 }, { "epoch": 0.6900973096737264, "grad_norm": 1.1621068716049194, "learning_rate": 4.64674368420637e-06, "loss": 1.0394, "step": 3014 }, { "epoch": 0.6903262736119061, "grad_norm": 1.2484198808670044, "learning_rate": 4.640479436503016e-06, "loss": 1.0031, "step": 3015 }, { "epoch": 0.6905552375500859, "grad_norm": 1.564257025718689, "learning_rate": 4.634218138098897e-06, "loss": 1.0898, "step": 3016 }, { "epoch": 0.6907842014882656, "grad_norm": 1.2701921463012695, "learning_rate": 4.6279597924395434e-06, "loss": 1.0803, "step": 3017 }, { "epoch": 0.6910131654264453, "grad_norm": 1.458882212638855, "learning_rate": 4.621704402968881e-06, "loss": 1.0771, "step": 3018 }, { "epoch": 0.691242129364625, "grad_norm": 1.2890880107879639, "learning_rate": 4.615451973129196e-06, "loss": 1.0809, "step": 3019 }, { "epoch": 0.6914710933028048, "grad_norm": 1.1623772382736206, "learning_rate": 4.60920250636114e-06, "loss": 1.0524, "step": 3020 }, { "epoch": 0.6917000572409846, "grad_norm": 1.1945689916610718, "learning_rate": 4.602956006103752e-06, "loss": 1.0345, "step": 3021 }, { "epoch": 0.6919290211791643, "grad_norm": 1.1116697788238525, "learning_rate": 4.596712475794427e-06, "loss": 0.9965, "step": 3022 }, { "epoch": 0.692157985117344, "grad_norm": 1.092020869255066, "learning_rate": 4.590471918868923e-06, "loss": 1.019, "step": 3023 }, { "epoch": 0.6923869490555238, "grad_norm": 1.2264333963394165, "learning_rate": 4.584234338761368e-06, "loss": 1.0246, "step": 3024 }, { "epoch": 0.6926159129937035, "grad_norm": 1.1611523628234863, "learning_rate": 4.5779997389042514e-06, "loss": 1.0514, "step": 3025 }, { "epoch": 0.6928448769318832, "grad_norm": 1.5043096542358398, "learning_rate": 4.571768122728421e-06, "loss": 0.9878, "step": 3026 }, { "epoch": 0.693073840870063, "grad_norm": 1.4404031038284302, "learning_rate": 4.56553949366308e-06, "loss": 1.0665, "step": 3027 }, { "epoch": 0.6933028048082427, "grad_norm": 1.6433451175689697, "learning_rate": 4.559313855135795e-06, "loss": 1.0795, "step": 3028 }, { "epoch": 0.6935317687464224, "grad_norm": 1.2414443492889404, "learning_rate": 4.55309121057248e-06, "loss": 1.0612, "step": 3029 }, { "epoch": 0.6937607326846021, "grad_norm": 1.7606821060180664, "learning_rate": 4.546871563397409e-06, "loss": 1.0517, "step": 3030 }, { "epoch": 0.6939896966227819, "grad_norm": 1.563098430633545, "learning_rate": 4.54065491703319e-06, "loss": 1.0475, "step": 3031 }, { "epoch": 0.6942186605609616, "grad_norm": 1.3073487281799316, "learning_rate": 4.534441274900807e-06, "loss": 1.0028, "step": 3032 }, { "epoch": 0.6944476244991414, "grad_norm": 1.2197542190551758, "learning_rate": 4.528230640419562e-06, "loss": 1.027, "step": 3033 }, { "epoch": 0.6946765884373212, "grad_norm": 1.2171616554260254, "learning_rate": 4.522023017007118e-06, "loss": 1.0114, "step": 3034 }, { "epoch": 0.6949055523755009, "grad_norm": 1.2713412046432495, "learning_rate": 4.515818408079487e-06, "loss": 1.0541, "step": 3035 }, { "epoch": 0.6951345163136806, "grad_norm": 1.235053539276123, "learning_rate": 4.5096168170510036e-06, "loss": 1.0338, "step": 3036 }, { "epoch": 0.6953634802518603, "grad_norm": 1.3829389810562134, "learning_rate": 4.503418247334353e-06, "loss": 1.0417, "step": 3037 }, { "epoch": 0.6955924441900401, "grad_norm": 1.107755422592163, "learning_rate": 4.497222702340559e-06, "loss": 1.0538, "step": 3038 }, { "epoch": 0.6958214081282198, "grad_norm": 1.2136812210083008, "learning_rate": 4.491030185478976e-06, "loss": 1.0574, "step": 3039 }, { "epoch": 0.6960503720663995, "grad_norm": 1.9772335290908813, "learning_rate": 4.4848407001572945e-06, "loss": 1.0413, "step": 3040 }, { "epoch": 0.6962793360045793, "grad_norm": 1.3861563205718994, "learning_rate": 4.4786542497815365e-06, "loss": 1.0276, "step": 3041 }, { "epoch": 0.696508299942759, "grad_norm": 1.2986854314804077, "learning_rate": 4.472470837756055e-06, "loss": 1.0022, "step": 3042 }, { "epoch": 0.6967372638809387, "grad_norm": 1.8411191701889038, "learning_rate": 4.466290467483531e-06, "loss": 1.0107, "step": 3043 }, { "epoch": 0.6969662278191184, "grad_norm": 1.371367335319519, "learning_rate": 4.460113142364961e-06, "loss": 1.017, "step": 3044 }, { "epoch": 0.6971951917572983, "grad_norm": 1.3172252178192139, "learning_rate": 4.453938865799686e-06, "loss": 1.0403, "step": 3045 }, { "epoch": 0.697424155695478, "grad_norm": 1.2504808902740479, "learning_rate": 4.447767641185359e-06, "loss": 1.0093, "step": 3046 }, { "epoch": 0.6976531196336577, "grad_norm": 1.1979867219924927, "learning_rate": 4.441599471917946e-06, "loss": 1.0421, "step": 3047 }, { "epoch": 0.6978820835718375, "grad_norm": 1.6098335981369019, "learning_rate": 4.435434361391737e-06, "loss": 1.091, "step": 3048 }, { "epoch": 0.6981110475100172, "grad_norm": 1.1873350143432617, "learning_rate": 4.429272312999353e-06, "loss": 1.0796, "step": 3049 }, { "epoch": 0.6983400114481969, "grad_norm": 1.3173235654830933, "learning_rate": 4.423113330131708e-06, "loss": 1.0352, "step": 3050 }, { "epoch": 0.6985689753863766, "grad_norm": 1.3406671285629272, "learning_rate": 4.4169574161780395e-06, "loss": 1.0448, "step": 3051 }, { "epoch": 0.6987979393245564, "grad_norm": 1.2191475629806519, "learning_rate": 4.4108045745258966e-06, "loss": 1.0402, "step": 3052 }, { "epoch": 0.6990269032627361, "grad_norm": 1.2168577909469604, "learning_rate": 4.404654808561137e-06, "loss": 1.0253, "step": 3053 }, { "epoch": 0.6992558672009158, "grad_norm": 9.12889575958252, "learning_rate": 4.398508121667925e-06, "loss": 1.0162, "step": 3054 }, { "epoch": 0.6994848311390955, "grad_norm": 1.6077773571014404, "learning_rate": 4.39236451722873e-06, "loss": 1.0926, "step": 3055 }, { "epoch": 0.6997137950772754, "grad_norm": 1.5294177532196045, "learning_rate": 4.386223998624327e-06, "loss": 1.0349, "step": 3056 }, { "epoch": 0.6999427590154551, "grad_norm": 1.079950213432312, "learning_rate": 4.380086569233796e-06, "loss": 0.9882, "step": 3057 }, { "epoch": 0.7001717229536348, "grad_norm": 1.1813180446624756, "learning_rate": 4.3739522324344994e-06, "loss": 1.1365, "step": 3058 }, { "epoch": 0.7004006868918146, "grad_norm": 1.2818435430526733, "learning_rate": 4.3678209916021264e-06, "loss": 1.0429, "step": 3059 }, { "epoch": 0.7006296508299943, "grad_norm": 1.6467807292938232, "learning_rate": 4.361692850110644e-06, "loss": 1.0304, "step": 3060 }, { "epoch": 0.700858614768174, "grad_norm": 1.1530770063400269, "learning_rate": 4.355567811332311e-06, "loss": 1.1026, "step": 3061 }, { "epoch": 0.7010875787063537, "grad_norm": 1.2358185052871704, "learning_rate": 4.3494458786376845e-06, "loss": 1.0106, "step": 3062 }, { "epoch": 0.7013165426445335, "grad_norm": 2.5719728469848633, "learning_rate": 4.3433270553956245e-06, "loss": 0.9823, "step": 3063 }, { "epoch": 0.7015455065827132, "grad_norm": 1.4278590679168701, "learning_rate": 4.337211344973256e-06, "loss": 1.0915, "step": 3064 }, { "epoch": 0.7017744705208929, "grad_norm": 1.1447930335998535, "learning_rate": 4.331098750736008e-06, "loss": 1.0208, "step": 3065 }, { "epoch": 0.7020034344590727, "grad_norm": 1.5032339096069336, "learning_rate": 4.3249892760475894e-06, "loss": 1.0192, "step": 3066 }, { "epoch": 0.7022323983972524, "grad_norm": 1.17005455493927, "learning_rate": 4.318882924269994e-06, "loss": 0.9827, "step": 3067 }, { "epoch": 0.7024613623354322, "grad_norm": 1.24888014793396, "learning_rate": 4.312779698763493e-06, "loss": 1.0159, "step": 3068 }, { "epoch": 0.7026903262736119, "grad_norm": 1.3852883577346802, "learning_rate": 4.306679602886643e-06, "loss": 1.0457, "step": 3069 }, { "epoch": 0.7029192902117917, "grad_norm": 1.4950320720672607, "learning_rate": 4.300582639996274e-06, "loss": 1.0019, "step": 3070 }, { "epoch": 0.7031482541499714, "grad_norm": 1.2468384504318237, "learning_rate": 4.2944888134474995e-06, "loss": 1.0566, "step": 3071 }, { "epoch": 0.7033772180881511, "grad_norm": 1.0590832233428955, "learning_rate": 4.2883981265936884e-06, "loss": 1.0033, "step": 3072 }, { "epoch": 0.7036061820263309, "grad_norm": 1.5195868015289307, "learning_rate": 4.282310582786504e-06, "loss": 1.0192, "step": 3073 }, { "epoch": 0.7038351459645106, "grad_norm": 1.6989542245864868, "learning_rate": 4.276226185375874e-06, "loss": 1.0511, "step": 3074 }, { "epoch": 0.7040641099026903, "grad_norm": 1.4525377750396729, "learning_rate": 4.270144937709981e-06, "loss": 1.0511, "step": 3075 }, { "epoch": 0.70429307384087, "grad_norm": 1.3085981607437134, "learning_rate": 4.264066843135292e-06, "loss": 1.0334, "step": 3076 }, { "epoch": 0.7045220377790498, "grad_norm": 1.2663421630859375, "learning_rate": 4.257991904996527e-06, "loss": 1.0987, "step": 3077 }, { "epoch": 0.7047510017172295, "grad_norm": 1.1956284046173096, "learning_rate": 4.251920126636676e-06, "loss": 1.0384, "step": 3078 }, { "epoch": 0.7049799656554093, "grad_norm": 1.3076503276824951, "learning_rate": 4.245851511396988e-06, "loss": 1.1324, "step": 3079 }, { "epoch": 0.705208929593589, "grad_norm": 1.2820242643356323, "learning_rate": 4.23978606261697e-06, "loss": 1.0065, "step": 3080 }, { "epoch": 0.7054378935317688, "grad_norm": 1.707970142364502, "learning_rate": 4.233723783634388e-06, "loss": 1.0478, "step": 3081 }, { "epoch": 0.7056668574699485, "grad_norm": 1.9648287296295166, "learning_rate": 4.227664677785264e-06, "loss": 0.9916, "step": 3082 }, { "epoch": 0.7058958214081282, "grad_norm": 1.4425021409988403, "learning_rate": 4.221608748403872e-06, "loss": 1.0077, "step": 3083 }, { "epoch": 0.706124785346308, "grad_norm": 1.1991877555847168, "learning_rate": 4.21555599882274e-06, "loss": 1.085, "step": 3084 }, { "epoch": 0.7063537492844877, "grad_norm": 1.6011837720870972, "learning_rate": 4.2095064323726485e-06, "loss": 1.0511, "step": 3085 }, { "epoch": 0.7065827132226674, "grad_norm": 1.2623403072357178, "learning_rate": 4.2034600523826116e-06, "loss": 0.992, "step": 3086 }, { "epoch": 0.7068116771608471, "grad_norm": 1.3289179801940918, "learning_rate": 4.1974168621799185e-06, "loss": 1.0386, "step": 3087 }, { "epoch": 0.7070406410990269, "grad_norm": 1.3888287544250488, "learning_rate": 4.191376865090073e-06, "loss": 1.0181, "step": 3088 }, { "epoch": 0.7072696050372066, "grad_norm": 1.3241780996322632, "learning_rate": 4.1853400644368395e-06, "loss": 1.0285, "step": 3089 }, { "epoch": 0.7074985689753863, "grad_norm": 1.2323023080825806, "learning_rate": 4.179306463542217e-06, "loss": 1.0286, "step": 3090 }, { "epoch": 0.7077275329135662, "grad_norm": 1.290425419807434, "learning_rate": 4.173276065726448e-06, "loss": 1.0194, "step": 3091 }, { "epoch": 0.7079564968517459, "grad_norm": 1.7319308519363403, "learning_rate": 4.167248874308009e-06, "loss": 1.0092, "step": 3092 }, { "epoch": 0.7081854607899256, "grad_norm": 1.0908766984939575, "learning_rate": 4.161224892603613e-06, "loss": 1.0487, "step": 3093 }, { "epoch": 0.7084144247281053, "grad_norm": 1.2904776334762573, "learning_rate": 4.155204123928205e-06, "loss": 0.9839, "step": 3094 }, { "epoch": 0.7086433886662851, "grad_norm": 1.3205946683883667, "learning_rate": 4.149186571594965e-06, "loss": 1.0494, "step": 3095 }, { "epoch": 0.7088723526044648, "grad_norm": 1.6448436975479126, "learning_rate": 4.143172238915302e-06, "loss": 1.0911, "step": 3096 }, { "epoch": 0.7091013165426445, "grad_norm": 1.1375172138214111, "learning_rate": 4.137161129198852e-06, "loss": 0.9825, "step": 3097 }, { "epoch": 0.7093302804808242, "grad_norm": 1.2753404378890991, "learning_rate": 4.131153245753482e-06, "loss": 1.0859, "step": 3098 }, { "epoch": 0.709559244419004, "grad_norm": 1.3567954301834106, "learning_rate": 4.1251485918852725e-06, "loss": 1.029, "step": 3099 }, { "epoch": 0.7097882083571837, "grad_norm": 1.057942271232605, "learning_rate": 4.119147170898535e-06, "loss": 1.0965, "step": 3100 }, { "epoch": 0.7100171722953634, "grad_norm": 1.2383803129196167, "learning_rate": 4.113148986095812e-06, "loss": 1.0577, "step": 3101 }, { "epoch": 0.7102461362335433, "grad_norm": 1.1256804466247559, "learning_rate": 4.107154040777842e-06, "loss": 0.9798, "step": 3102 }, { "epoch": 0.710475100171723, "grad_norm": 1.5061252117156982, "learning_rate": 4.101162338243595e-06, "loss": 1.1058, "step": 3103 }, { "epoch": 0.7107040641099027, "grad_norm": 1.094844102859497, "learning_rate": 4.095173881790265e-06, "loss": 0.9926, "step": 3104 }, { "epoch": 0.7109330280480824, "grad_norm": 1.3939733505249023, "learning_rate": 4.0891886747132356e-06, "loss": 1.061, "step": 3105 }, { "epoch": 0.7111619919862622, "grad_norm": 2.069545030593872, "learning_rate": 4.0832067203061244e-06, "loss": 1.0444, "step": 3106 }, { "epoch": 0.7113909559244419, "grad_norm": 1.2419723272323608, "learning_rate": 4.0772280218607485e-06, "loss": 1.0661, "step": 3107 }, { "epoch": 0.7116199198626216, "grad_norm": 2.2456400394439697, "learning_rate": 4.071252582667135e-06, "loss": 1.0287, "step": 3108 }, { "epoch": 0.7118488838008014, "grad_norm": 1.2714403867721558, "learning_rate": 4.065280406013522e-06, "loss": 0.9891, "step": 3109 }, { "epoch": 0.7120778477389811, "grad_norm": 1.4441180229187012, "learning_rate": 4.059311495186338e-06, "loss": 1.0688, "step": 3110 }, { "epoch": 0.7123068116771608, "grad_norm": 1.442212700843811, "learning_rate": 4.0533458534702354e-06, "loss": 1.0711, "step": 3111 }, { "epoch": 0.7125357756153405, "grad_norm": 1.2680224180221558, "learning_rate": 4.0473834841480565e-06, "loss": 1.0097, "step": 3112 }, { "epoch": 0.7127647395535203, "grad_norm": 1.0516926050186157, "learning_rate": 4.0414243905008325e-06, "loss": 1.0449, "step": 3113 }, { "epoch": 0.7129937034917001, "grad_norm": 1.1603946685791016, "learning_rate": 4.035468575807812e-06, "loss": 1.0299, "step": 3114 }, { "epoch": 0.7132226674298798, "grad_norm": 1.0879015922546387, "learning_rate": 4.029516043346432e-06, "loss": 1.0125, "step": 3115 }, { "epoch": 0.7134516313680596, "grad_norm": 2.325087070465088, "learning_rate": 4.023566796392313e-06, "loss": 1.0965, "step": 3116 }, { "epoch": 0.7136805953062393, "grad_norm": 1.1928337812423706, "learning_rate": 4.017620838219276e-06, "loss": 1.0108, "step": 3117 }, { "epoch": 0.713909559244419, "grad_norm": 1.2268211841583252, "learning_rate": 4.011678172099343e-06, "loss": 0.9763, "step": 3118 }, { "epoch": 0.7141385231825987, "grad_norm": 1.3314909934997559, "learning_rate": 4.005738801302701e-06, "loss": 1.0531, "step": 3119 }, { "epoch": 0.7143674871207785, "grad_norm": 1.3090684413909912, "learning_rate": 3.999802729097743e-06, "loss": 1.0713, "step": 3120 }, { "epoch": 0.7145964510589582, "grad_norm": 1.1635836362838745, "learning_rate": 3.993869958751036e-06, "loss": 1.0654, "step": 3121 }, { "epoch": 0.7148254149971379, "grad_norm": 1.3543496131896973, "learning_rate": 3.9879404935273346e-06, "loss": 1.0163, "step": 3122 }, { "epoch": 0.7150543789353176, "grad_norm": 1.3396997451782227, "learning_rate": 3.982014336689579e-06, "loss": 1.0441, "step": 3123 }, { "epoch": 0.7152833428734974, "grad_norm": 1.4751423597335815, "learning_rate": 3.9760914914988716e-06, "loss": 1.0307, "step": 3124 }, { "epoch": 0.7155123068116772, "grad_norm": 1.3802671432495117, "learning_rate": 3.970171961214515e-06, "loss": 1.0171, "step": 3125 }, { "epoch": 0.7157412707498569, "grad_norm": 1.8824442625045776, "learning_rate": 3.964255749093979e-06, "loss": 0.969, "step": 3126 }, { "epoch": 0.7159702346880367, "grad_norm": 1.3549731969833374, "learning_rate": 3.958342858392893e-06, "loss": 1.0393, "step": 3127 }, { "epoch": 0.7161991986262164, "grad_norm": 1.1898926496505737, "learning_rate": 3.952433292365081e-06, "loss": 1.0948, "step": 3128 }, { "epoch": 0.7164281625643961, "grad_norm": 1.109851360321045, "learning_rate": 3.94652705426253e-06, "loss": 1.0621, "step": 3129 }, { "epoch": 0.7166571265025758, "grad_norm": 1.4158560037612915, "learning_rate": 3.940624147335386e-06, "loss": 1.0273, "step": 3130 }, { "epoch": 0.7168860904407556, "grad_norm": 1.319413661956787, "learning_rate": 3.93472457483197e-06, "loss": 0.9908, "step": 3131 }, { "epoch": 0.7171150543789353, "grad_norm": 2.840635299682617, "learning_rate": 3.92882833999877e-06, "loss": 1.0255, "step": 3132 }, { "epoch": 0.717344018317115, "grad_norm": 1.2576954364776611, "learning_rate": 3.9229354460804345e-06, "loss": 1.0252, "step": 3133 }, { "epoch": 0.7175729822552948, "grad_norm": 1.3615483045578003, "learning_rate": 3.917045896319772e-06, "loss": 1.0324, "step": 3134 }, { "epoch": 0.7178019461934745, "grad_norm": 1.6669081449508667, "learning_rate": 3.911159693957755e-06, "loss": 1.0628, "step": 3135 }, { "epoch": 0.7180309101316542, "grad_norm": 1.2302170991897583, "learning_rate": 3.905276842233508e-06, "loss": 1.0182, "step": 3136 }, { "epoch": 0.718259874069834, "grad_norm": 1.0562607049942017, "learning_rate": 3.899397344384316e-06, "loss": 1.01, "step": 3137 }, { "epoch": 0.7184888380080138, "grad_norm": 2.51627516746521, "learning_rate": 3.893521203645618e-06, "loss": 1.1183, "step": 3138 }, { "epoch": 0.7187178019461935, "grad_norm": 1.2331818342208862, "learning_rate": 3.887648423251006e-06, "loss": 1.0323, "step": 3139 }, { "epoch": 0.7189467658843732, "grad_norm": 1.2647502422332764, "learning_rate": 3.881779006432223e-06, "loss": 1.0546, "step": 3140 }, { "epoch": 0.719175729822553, "grad_norm": 1.3590573072433472, "learning_rate": 3.875912956419152e-06, "loss": 0.9891, "step": 3141 }, { "epoch": 0.7194046937607327, "grad_norm": 1.4926403760910034, "learning_rate": 3.870050276439843e-06, "loss": 1.0391, "step": 3142 }, { "epoch": 0.7196336576989124, "grad_norm": 1.504622459411621, "learning_rate": 3.864190969720469e-06, "loss": 0.9936, "step": 3143 }, { "epoch": 0.7198626216370921, "grad_norm": 1.0495116710662842, "learning_rate": 3.858335039485362e-06, "loss": 1.0307, "step": 3144 }, { "epoch": 0.7200915855752719, "grad_norm": 1.2690739631652832, "learning_rate": 3.852482488956992e-06, "loss": 1.0355, "step": 3145 }, { "epoch": 0.7203205495134516, "grad_norm": 2.033017158508301, "learning_rate": 3.846633321355967e-06, "loss": 1.0948, "step": 3146 }, { "epoch": 0.7205495134516313, "grad_norm": 1.1862678527832031, "learning_rate": 3.840787539901037e-06, "loss": 1.055, "step": 3147 }, { "epoch": 0.7207784773898112, "grad_norm": 1.3250430822372437, "learning_rate": 3.834945147809082e-06, "loss": 1.052, "step": 3148 }, { "epoch": 0.7210074413279909, "grad_norm": 1.126364827156067, "learning_rate": 3.829106148295127e-06, "loss": 1.0445, "step": 3149 }, { "epoch": 0.7212364052661706, "grad_norm": 1.6518840789794922, "learning_rate": 3.823270544572319e-06, "loss": 1.04, "step": 3150 }, { "epoch": 0.7214653692043503, "grad_norm": 1.127027988433838, "learning_rate": 3.817438339851947e-06, "loss": 1.0113, "step": 3151 }, { "epoch": 0.7216943331425301, "grad_norm": 1.3487000465393066, "learning_rate": 3.8116095373434204e-06, "loss": 0.9984, "step": 3152 }, { "epoch": 0.7219232970807098, "grad_norm": 1.3868497610092163, "learning_rate": 3.805784140254286e-06, "loss": 1.0504, "step": 3153 }, { "epoch": 0.7221522610188895, "grad_norm": 1.4739601612091064, "learning_rate": 3.799962151790203e-06, "loss": 1.0158, "step": 3154 }, { "epoch": 0.7223812249570692, "grad_norm": 1.0389552116394043, "learning_rate": 3.794143575154964e-06, "loss": 1.0075, "step": 3155 }, { "epoch": 0.722610188895249, "grad_norm": 1.0425493717193604, "learning_rate": 3.788328413550493e-06, "loss": 0.9877, "step": 3156 }, { "epoch": 0.7228391528334287, "grad_norm": 1.1646589040756226, "learning_rate": 3.7825166701768125e-06, "loss": 0.959, "step": 3157 }, { "epoch": 0.7230681167716084, "grad_norm": 1.1060878038406372, "learning_rate": 3.776708348232081e-06, "loss": 1.0858, "step": 3158 }, { "epoch": 0.7232970807097882, "grad_norm": 1.2601068019866943, "learning_rate": 3.7709034509125706e-06, "loss": 1.0423, "step": 3159 }, { "epoch": 0.723526044647968, "grad_norm": 2.284247875213623, "learning_rate": 3.7651019814126656e-06, "loss": 1.0213, "step": 3160 }, { "epoch": 0.7237550085861477, "grad_norm": 1.5774025917053223, "learning_rate": 3.7593039429248667e-06, "loss": 1.0469, "step": 3161 }, { "epoch": 0.7239839725243274, "grad_norm": 1.9497007131576538, "learning_rate": 3.7535093386397868e-06, "loss": 1.0541, "step": 3162 }, { "epoch": 0.7242129364625072, "grad_norm": 1.517136812210083, "learning_rate": 3.7477181717461463e-06, "loss": 0.9846, "step": 3163 }, { "epoch": 0.7244419004006869, "grad_norm": 1.3717387914657593, "learning_rate": 3.74193044543078e-06, "loss": 1.0027, "step": 3164 }, { "epoch": 0.7246708643388666, "grad_norm": 1.2264940738677979, "learning_rate": 3.7361461628786167e-06, "loss": 1.1155, "step": 3165 }, { "epoch": 0.7248998282770464, "grad_norm": 1.5513440370559692, "learning_rate": 3.7303653272727057e-06, "loss": 1.0442, "step": 3166 }, { "epoch": 0.7251287922152261, "grad_norm": 1.0587714910507202, "learning_rate": 3.7245879417941943e-06, "loss": 1.1098, "step": 3167 }, { "epoch": 0.7253577561534058, "grad_norm": 1.0646835565567017, "learning_rate": 3.7188140096223225e-06, "loss": 0.9877, "step": 3168 }, { "epoch": 0.7255867200915855, "grad_norm": 1.0646835565567017, "learning_rate": 3.7188140096223225e-06, "loss": 1.0259, "step": 3169 }, { "epoch": 0.7258156840297653, "grad_norm": 1.221613883972168, "learning_rate": 3.713043533934435e-06, "loss": 0.96, "step": 3170 }, { "epoch": 0.7260446479679451, "grad_norm": 1.373716950416565, "learning_rate": 3.707276517905989e-06, "loss": 1.0587, "step": 3171 }, { "epoch": 0.7262736119061248, "grad_norm": 1.5889724493026733, "learning_rate": 3.701512964710513e-06, "loss": 1.0572, "step": 3172 }, { "epoch": 0.7265025758443046, "grad_norm": 1.3366655111312866, "learning_rate": 3.695752877519646e-06, "loss": 1.0241, "step": 3173 }, { "epoch": 0.7267315397824843, "grad_norm": 1.3003686666488647, "learning_rate": 3.689996259503116e-06, "loss": 1.0352, "step": 3174 }, { "epoch": 0.726960503720664, "grad_norm": 1.2446143627166748, "learning_rate": 3.6842431138287415e-06, "loss": 1.0475, "step": 3175 }, { "epoch": 0.7271894676588437, "grad_norm": 1.0979483127593994, "learning_rate": 3.67849344366243e-06, "loss": 1.0735, "step": 3176 }, { "epoch": 0.7274184315970235, "grad_norm": 1.3841794729232788, "learning_rate": 3.672747252168176e-06, "loss": 1.0478, "step": 3177 }, { "epoch": 0.7276473955352032, "grad_norm": 1.3376761674880981, "learning_rate": 3.6670045425080626e-06, "loss": 1.0732, "step": 3178 }, { "epoch": 0.7278763594733829, "grad_norm": 2.0364394187927246, "learning_rate": 3.6612653178422564e-06, "loss": 1.05, "step": 3179 }, { "epoch": 0.7281053234115626, "grad_norm": 1.3383004665374756, "learning_rate": 3.655529581328995e-06, "loss": 1.0408, "step": 3180 }, { "epoch": 0.7283342873497424, "grad_norm": 1.368977427482605, "learning_rate": 3.6497973361246153e-06, "loss": 1.0406, "step": 3181 }, { "epoch": 0.7285632512879221, "grad_norm": 1.3485641479492188, "learning_rate": 3.6440685853835266e-06, "loss": 1.0455, "step": 3182 }, { "epoch": 0.7287922152261019, "grad_norm": 1.3723440170288086, "learning_rate": 3.6383433322582028e-06, "loss": 1.0398, "step": 3183 }, { "epoch": 0.7290211791642817, "grad_norm": 1.250652551651001, "learning_rate": 3.632621579899207e-06, "loss": 1.0, "step": 3184 }, { "epoch": 0.7292501431024614, "grad_norm": 1.3991602659225464, "learning_rate": 3.6269033314551725e-06, "loss": 1.0197, "step": 3185 }, { "epoch": 0.7294791070406411, "grad_norm": 1.7109068632125854, "learning_rate": 3.6211885900728017e-06, "loss": 1.0875, "step": 3186 }, { "epoch": 0.7297080709788208, "grad_norm": 1.0364817380905151, "learning_rate": 3.6154773588968704e-06, "loss": 1.1141, "step": 3187 }, { "epoch": 0.7299370349170006, "grad_norm": 1.2963764667510986, "learning_rate": 3.609769641070221e-06, "loss": 1.0609, "step": 3188 }, { "epoch": 0.7301659988551803, "grad_norm": 1.1506567001342773, "learning_rate": 3.6040654397337614e-06, "loss": 1.0173, "step": 3189 }, { "epoch": 0.73039496279336, "grad_norm": 1.3218399286270142, "learning_rate": 3.598364758026467e-06, "loss": 1.093, "step": 3190 }, { "epoch": 0.7306239267315398, "grad_norm": 1.5998913049697876, "learning_rate": 3.5926675990853752e-06, "loss": 1.0072, "step": 3191 }, { "epoch": 0.7308528906697195, "grad_norm": 1.9323956966400146, "learning_rate": 3.5869739660455847e-06, "loss": 0.9571, "step": 3192 }, { "epoch": 0.7310818546078992, "grad_norm": 1.1179009675979614, "learning_rate": 3.581283862040257e-06, "loss": 1.0337, "step": 3193 }, { "epoch": 0.731310818546079, "grad_norm": 1.3210819959640503, "learning_rate": 3.575597290200599e-06, "loss": 1.0332, "step": 3194 }, { "epoch": 0.7315397824842588, "grad_norm": 1.1699495315551758, "learning_rate": 3.569914253655896e-06, "loss": 1.0276, "step": 3195 }, { "epoch": 0.7317687464224385, "grad_norm": 1.2185953855514526, "learning_rate": 3.5642347555334665e-06, "loss": 1.0249, "step": 3196 }, { "epoch": 0.7319977103606182, "grad_norm": 1.1740138530731201, "learning_rate": 3.55855879895869e-06, "loss": 0.9732, "step": 3197 }, { "epoch": 0.732226674298798, "grad_norm": 1.19003427028656, "learning_rate": 3.552886387055009e-06, "loss": 0.993, "step": 3198 }, { "epoch": 0.7324556382369777, "grad_norm": 1.3954188823699951, "learning_rate": 3.547217522943892e-06, "loss": 1.0497, "step": 3199 }, { "epoch": 0.7326846021751574, "grad_norm": 1.2999389171600342, "learning_rate": 3.5415522097448717e-06, "loss": 1.0348, "step": 3200 }, { "epoch": 0.7329135661133371, "grad_norm": 1.2359155416488647, "learning_rate": 3.5358904505755243e-06, "loss": 1.0308, "step": 3201 }, { "epoch": 0.7331425300515169, "grad_norm": 1.3111592531204224, "learning_rate": 3.530232248551466e-06, "loss": 1.0899, "step": 3202 }, { "epoch": 0.7333714939896966, "grad_norm": 1.2703155279159546, "learning_rate": 3.524577606786358e-06, "loss": 1.0127, "step": 3203 }, { "epoch": 0.7336004579278763, "grad_norm": 1.2910027503967285, "learning_rate": 3.518926528391904e-06, "loss": 1.0473, "step": 3204 }, { "epoch": 0.733829421866056, "grad_norm": 1.4213367700576782, "learning_rate": 3.513279016477844e-06, "loss": 0.9976, "step": 3205 }, { "epoch": 0.7340583858042359, "grad_norm": 1.4030628204345703, "learning_rate": 3.5076350741519595e-06, "loss": 1.051, "step": 3206 }, { "epoch": 0.7342873497424156, "grad_norm": 1.3458659648895264, "learning_rate": 3.5019947045200553e-06, "loss": 1.0091, "step": 3207 }, { "epoch": 0.7345163136805953, "grad_norm": 1.5768972635269165, "learning_rate": 3.496357910685989e-06, "loss": 1.0405, "step": 3208 }, { "epoch": 0.7347452776187751, "grad_norm": 1.4291819334030151, "learning_rate": 3.4907246957516416e-06, "loss": 1.0732, "step": 3209 }, { "epoch": 0.7349742415569548, "grad_norm": 1.2574915885925293, "learning_rate": 3.4850950628169167e-06, "loss": 1.0461, "step": 3210 }, { "epoch": 0.7352032054951345, "grad_norm": 1.2337363958358765, "learning_rate": 3.479469014979754e-06, "loss": 1.1096, "step": 3211 }, { "epoch": 0.7354321694333142, "grad_norm": 1.4060138463974, "learning_rate": 3.473846555336131e-06, "loss": 1.0116, "step": 3212 }, { "epoch": 0.735661133371494, "grad_norm": 1.3836556673049927, "learning_rate": 3.46822768698003e-06, "loss": 1.0388, "step": 3213 }, { "epoch": 0.7358900973096737, "grad_norm": 1.2583669424057007, "learning_rate": 3.4626124130034713e-06, "loss": 1.0596, "step": 3214 }, { "epoch": 0.7361190612478534, "grad_norm": 1.593520164489746, "learning_rate": 3.457000736496492e-06, "loss": 1.062, "step": 3215 }, { "epoch": 0.7363480251860332, "grad_norm": 1.5272831916809082, "learning_rate": 3.4513926605471504e-06, "loss": 1.0255, "step": 3216 }, { "epoch": 0.736576989124213, "grad_norm": 2.2148725986480713, "learning_rate": 3.445788188241527e-06, "loss": 1.0377, "step": 3217 }, { "epoch": 0.7368059530623927, "grad_norm": 1.665127158164978, "learning_rate": 3.440187322663707e-06, "loss": 1.0202, "step": 3218 }, { "epoch": 0.7370349170005724, "grad_norm": 1.2639302015304565, "learning_rate": 3.4345900668958088e-06, "loss": 1.0474, "step": 3219 }, { "epoch": 0.7372638809387522, "grad_norm": 1.2613798379898071, "learning_rate": 3.428996424017956e-06, "loss": 1.0672, "step": 3220 }, { "epoch": 0.7374928448769319, "grad_norm": 10.604727745056152, "learning_rate": 3.423406397108273e-06, "loss": 0.9956, "step": 3221 }, { "epoch": 0.7377218088151116, "grad_norm": 1.3740556240081787, "learning_rate": 3.417819989242915e-06, "loss": 1.0367, "step": 3222 }, { "epoch": 0.7379507727532914, "grad_norm": 1.2875322103500366, "learning_rate": 3.412237203496036e-06, "loss": 1.0304, "step": 3223 }, { "epoch": 0.7381797366914711, "grad_norm": 1.4357035160064697, "learning_rate": 3.4066580429397877e-06, "loss": 0.9638, "step": 3224 }, { "epoch": 0.7384087006296508, "grad_norm": 1.247023344039917, "learning_rate": 3.401082510644337e-06, "loss": 0.9908, "step": 3225 }, { "epoch": 0.7386376645678305, "grad_norm": 1.2303351163864136, "learning_rate": 3.395510609677861e-06, "loss": 1.0369, "step": 3226 }, { "epoch": 0.7388666285060103, "grad_norm": 1.5605922937393188, "learning_rate": 3.3899423431065215e-06, "loss": 1.0669, "step": 3227 }, { "epoch": 0.73909559244419, "grad_norm": 1.551808476448059, "learning_rate": 3.384377713994492e-06, "loss": 0.9855, "step": 3228 }, { "epoch": 0.7393245563823698, "grad_norm": 1.3470834493637085, "learning_rate": 3.3788167254039417e-06, "loss": 1.0534, "step": 3229 }, { "epoch": 0.7395535203205496, "grad_norm": 2.2554211616516113, "learning_rate": 3.3732593803950354e-06, "loss": 1.0309, "step": 3230 }, { "epoch": 0.7397824842587293, "grad_norm": 1.2798614501953125, "learning_rate": 3.3677056820259324e-06, "loss": 0.9978, "step": 3231 }, { "epoch": 0.740011448196909, "grad_norm": 1.184571385383606, "learning_rate": 3.3621556333527884e-06, "loss": 1.053, "step": 3232 }, { "epoch": 0.7402404121350887, "grad_norm": 1.5115231275558472, "learning_rate": 3.3566092374297465e-06, "loss": 0.9638, "step": 3233 }, { "epoch": 0.7404693760732685, "grad_norm": 1.1283025741577148, "learning_rate": 3.3510664973089467e-06, "loss": 1.022, "step": 3234 }, { "epoch": 0.7406983400114482, "grad_norm": 1.4040166139602661, "learning_rate": 3.3455274160405025e-06, "loss": 1.037, "step": 3235 }, { "epoch": 0.7409273039496279, "grad_norm": 1.2837587594985962, "learning_rate": 3.3399919966725335e-06, "loss": 1.0439, "step": 3236 }, { "epoch": 0.7411562678878076, "grad_norm": 1.1922607421875, "learning_rate": 3.3344602422511343e-06, "loss": 0.9965, "step": 3237 }, { "epoch": 0.7413852318259874, "grad_norm": 1.1945897340774536, "learning_rate": 3.3289321558203767e-06, "loss": 1.0592, "step": 3238 }, { "epoch": 0.7416141957641671, "grad_norm": 1.1530017852783203, "learning_rate": 3.323407740422323e-06, "loss": 1.0469, "step": 3239 }, { "epoch": 0.7418431597023469, "grad_norm": 1.1255567073822021, "learning_rate": 3.317886999097014e-06, "loss": 1.0633, "step": 3240 }, { "epoch": 0.7420721236405267, "grad_norm": 1.1872222423553467, "learning_rate": 3.3123699348824654e-06, "loss": 0.9804, "step": 3241 }, { "epoch": 0.7423010875787064, "grad_norm": 1.289146900177002, "learning_rate": 3.306856550814673e-06, "loss": 1.0055, "step": 3242 }, { "epoch": 0.7425300515168861, "grad_norm": 1.5255881547927856, "learning_rate": 3.3013468499276057e-06, "loss": 1.0347, "step": 3243 }, { "epoch": 0.7427590154550658, "grad_norm": 1.0099772214889526, "learning_rate": 3.2958408352532055e-06, "loss": 1.0634, "step": 3244 }, { "epoch": 0.7429879793932456, "grad_norm": 1.8835973739624023, "learning_rate": 3.290338509821386e-06, "loss": 1.024, "step": 3245 }, { "epoch": 0.7432169433314253, "grad_norm": 1.2783852815628052, "learning_rate": 3.2848398766600298e-06, "loss": 1.0316, "step": 3246 }, { "epoch": 0.743445907269605, "grad_norm": 1.3650398254394531, "learning_rate": 3.2793449387949907e-06, "loss": 1.0741, "step": 3247 }, { "epoch": 0.7436748712077847, "grad_norm": 1.6553008556365967, "learning_rate": 3.273853699250088e-06, "loss": 0.9974, "step": 3248 }, { "epoch": 0.7439038351459645, "grad_norm": 1.0704493522644043, "learning_rate": 3.268366161047096e-06, "loss": 1.08, "step": 3249 }, { "epoch": 0.7441327990841442, "grad_norm": 1.375232219696045, "learning_rate": 3.2628823272057753e-06, "loss": 1.0434, "step": 3250 }, { "epoch": 0.7443617630223239, "grad_norm": 1.122849464416504, "learning_rate": 3.257402200743821e-06, "loss": 1.0497, "step": 3251 }, { "epoch": 0.7445907269605038, "grad_norm": 1.2734968662261963, "learning_rate": 3.251925784676907e-06, "loss": 1.0542, "step": 3252 }, { "epoch": 0.7448196908986835, "grad_norm": 1.3681397438049316, "learning_rate": 3.246453082018658e-06, "loss": 0.9997, "step": 3253 }, { "epoch": 0.7450486548368632, "grad_norm": 1.1173583269119263, "learning_rate": 3.2409840957806548e-06, "loss": 1.0651, "step": 3254 }, { "epoch": 0.745277618775043, "grad_norm": 1.235009789466858, "learning_rate": 3.235518828972437e-06, "loss": 0.9709, "step": 3255 }, { "epoch": 0.7455065827132227, "grad_norm": 1.1784312725067139, "learning_rate": 3.2300572846014945e-06, "loss": 1.0648, "step": 3256 }, { "epoch": 0.7457355466514024, "grad_norm": 1.190245270729065, "learning_rate": 3.224599465673268e-06, "loss": 1.0308, "step": 3257 }, { "epoch": 0.7459645105895821, "grad_norm": 1.3046700954437256, "learning_rate": 3.2191453751911505e-06, "loss": 0.9958, "step": 3258 }, { "epoch": 0.7461934745277619, "grad_norm": 1.363734483718872, "learning_rate": 3.213695016156484e-06, "loss": 1.0139, "step": 3259 }, { "epoch": 0.7464224384659416, "grad_norm": 1.3983644247055054, "learning_rate": 3.2082483915685526e-06, "loss": 1.0878, "step": 3260 }, { "epoch": 0.7466514024041213, "grad_norm": 1.4021762609481812, "learning_rate": 3.202805504424592e-06, "loss": 1.0088, "step": 3261 }, { "epoch": 0.746880366342301, "grad_norm": 1.5322551727294922, "learning_rate": 3.197366357719772e-06, "loss": 1.0558, "step": 3262 }, { "epoch": 0.7471093302804809, "grad_norm": 1.4262042045593262, "learning_rate": 3.191930954447209e-06, "loss": 1.0341, "step": 3263 }, { "epoch": 0.7473382942186606, "grad_norm": 1.3308278322219849, "learning_rate": 3.186499297597968e-06, "loss": 1.0309, "step": 3264 }, { "epoch": 0.7475672581568403, "grad_norm": 1.1862602233886719, "learning_rate": 3.1810713901610367e-06, "loss": 1.0448, "step": 3265 }, { "epoch": 0.7477962220950201, "grad_norm": 1.3563153743743896, "learning_rate": 3.175647235123347e-06, "loss": 1.0332, "step": 3266 }, { "epoch": 0.7480251860331998, "grad_norm": 1.3230400085449219, "learning_rate": 3.170226835469774e-06, "loss": 1.0438, "step": 3267 }, { "epoch": 0.7482541499713795, "grad_norm": 1.3605960607528687, "learning_rate": 3.16481019418311e-06, "loss": 1.001, "step": 3268 }, { "epoch": 0.7484831139095592, "grad_norm": 1.1855884790420532, "learning_rate": 3.159397314244089e-06, "loss": 0.9966, "step": 3269 }, { "epoch": 0.748712077847739, "grad_norm": 1.5543913841247559, "learning_rate": 3.1539881986313758e-06, "loss": 0.9855, "step": 3270 }, { "epoch": 0.7489410417859187, "grad_norm": 1.125836968421936, "learning_rate": 3.1485828503215588e-06, "loss": 1.1038, "step": 3271 }, { "epoch": 0.7491700057240984, "grad_norm": 2.7458279132843018, "learning_rate": 3.1431812722891598e-06, "loss": 0.9994, "step": 3272 }, { "epoch": 0.7493989696622781, "grad_norm": 1.4484138488769531, "learning_rate": 3.137783467506613e-06, "loss": 0.9574, "step": 3273 }, { "epoch": 0.7496279336004579, "grad_norm": 1.4349960088729858, "learning_rate": 3.1323894389442942e-06, "loss": 1.0307, "step": 3274 }, { "epoch": 0.7498568975386377, "grad_norm": 1.287442922592163, "learning_rate": 3.126999189570493e-06, "loss": 1.0006, "step": 3275 }, { "epoch": 0.7500858614768174, "grad_norm": 1.1122688055038452, "learning_rate": 3.1216127223514116e-06, "loss": 1.046, "step": 3276 }, { "epoch": 0.7503148254149972, "grad_norm": 1.196319580078125, "learning_rate": 3.116230040251177e-06, "loss": 1.0686, "step": 3277 }, { "epoch": 0.7505437893531769, "grad_norm": 1.5629481077194214, "learning_rate": 3.1108511462318437e-06, "loss": 1.0516, "step": 3278 }, { "epoch": 0.7507727532913566, "grad_norm": 1.118003487586975, "learning_rate": 3.1054760432533626e-06, "loss": 1.0298, "step": 3279 }, { "epoch": 0.7510017172295363, "grad_norm": 1.118003487586975, "learning_rate": 3.1054760432533626e-06, "loss": 1.0198, "step": 3280 }, { "epoch": 0.7512306811677161, "grad_norm": 1.319196343421936, "learning_rate": 3.100104734273608e-06, "loss": 1.0704, "step": 3281 }, { "epoch": 0.7514596451058958, "grad_norm": 1.3036201000213623, "learning_rate": 3.0947372222483762e-06, "loss": 1.0307, "step": 3282 }, { "epoch": 0.7516886090440755, "grad_norm": 1.2368963956832886, "learning_rate": 3.089373510131354e-06, "loss": 1.0142, "step": 3283 }, { "epoch": 0.7519175729822553, "grad_norm": 1.583749771118164, "learning_rate": 3.0840136008741505e-06, "loss": 1.0126, "step": 3284 }, { "epoch": 0.752146536920435, "grad_norm": 1.174692988395691, "learning_rate": 3.0786574974262784e-06, "loss": 1.0675, "step": 3285 }, { "epoch": 0.7523755008586148, "grad_norm": 1.5759061574935913, "learning_rate": 3.073305202735157e-06, "loss": 1.0071, "step": 3286 }, { "epoch": 0.7526044647967945, "grad_norm": 1.3044160604476929, "learning_rate": 3.0679567197461135e-06, "loss": 0.9839, "step": 3287 }, { "epoch": 0.7528334287349743, "grad_norm": 1.2453902959823608, "learning_rate": 3.0626120514023605e-06, "loss": 1.0634, "step": 3288 }, { "epoch": 0.753062392673154, "grad_norm": 1.130800485610962, "learning_rate": 3.057271200645037e-06, "loss": 1.0402, "step": 3289 }, { "epoch": 0.7532913566113337, "grad_norm": 1.4683923721313477, "learning_rate": 3.0519341704131666e-06, "loss": 1.0274, "step": 3290 }, { "epoch": 0.7535203205495135, "grad_norm": 1.3023918867111206, "learning_rate": 3.0466009636436633e-06, "loss": 1.0431, "step": 3291 }, { "epoch": 0.7537492844876932, "grad_norm": 2.7885706424713135, "learning_rate": 3.0412715832713592e-06, "loss": 1.0291, "step": 3292 }, { "epoch": 0.7539782484258729, "grad_norm": 1.2981799840927124, "learning_rate": 3.035946032228957e-06, "loss": 0.9623, "step": 3293 }, { "epoch": 0.7542072123640526, "grad_norm": 1.2832386493682861, "learning_rate": 3.0306243134470668e-06, "loss": 1.0626, "step": 3294 }, { "epoch": 0.7544361763022324, "grad_norm": 1.5354857444763184, "learning_rate": 3.0253064298541857e-06, "loss": 1.0211, "step": 3295 }, { "epoch": 0.7546651402404121, "grad_norm": 1.4440540075302124, "learning_rate": 3.0199923843767007e-06, "loss": 1.0351, "step": 3296 }, { "epoch": 0.7548941041785918, "grad_norm": 1.2906075716018677, "learning_rate": 3.014682179938886e-06, "loss": 1.0806, "step": 3297 }, { "epoch": 0.7551230681167717, "grad_norm": 1.2191280126571655, "learning_rate": 3.009375819462902e-06, "loss": 1.0078, "step": 3298 }, { "epoch": 0.7553520320549514, "grad_norm": 1.3844983577728271, "learning_rate": 3.0040733058687956e-06, "loss": 1.056, "step": 3299 }, { "epoch": 0.7555809959931311, "grad_norm": 1.1661230325698853, "learning_rate": 2.998774642074496e-06, "loss": 1.061, "step": 3300 }, { "epoch": 0.7558099599313108, "grad_norm": 1.2717007398605347, "learning_rate": 2.993479830995815e-06, "loss": 0.9824, "step": 3301 }, { "epoch": 0.7560389238694906, "grad_norm": 1.4554436206817627, "learning_rate": 2.9881888755464337e-06, "loss": 1.0202, "step": 3302 }, { "epoch": 0.7562678878076703, "grad_norm": 1.251201868057251, "learning_rate": 2.9829017786379333e-06, "loss": 1.0595, "step": 3303 }, { "epoch": 0.75649685174585, "grad_norm": 1.0925774574279785, "learning_rate": 2.97761854317975e-06, "loss": 1.0848, "step": 3304 }, { "epoch": 0.7567258156840297, "grad_norm": 1.3679059743881226, "learning_rate": 2.972339172079204e-06, "loss": 1.0356, "step": 3305 }, { "epoch": 0.7569547796222095, "grad_norm": 1.140267014503479, "learning_rate": 2.9670636682414966e-06, "loss": 1.04, "step": 3306 }, { "epoch": 0.7571837435603892, "grad_norm": 1.2260184288024902, "learning_rate": 2.961792034569686e-06, "loss": 1.0593, "step": 3307 }, { "epoch": 0.7574127074985689, "grad_norm": 1.3413991928100586, "learning_rate": 2.9565242739647115e-06, "loss": 1.0271, "step": 3308 }, { "epoch": 0.7576416714367488, "grad_norm": 1.2301392555236816, "learning_rate": 2.9512603893253756e-06, "loss": 1.0802, "step": 3309 }, { "epoch": 0.7578706353749285, "grad_norm": 1.1220687627792358, "learning_rate": 2.9460003835483497e-06, "loss": 1.0152, "step": 3310 }, { "epoch": 0.7580995993131082, "grad_norm": 2.1697607040405273, "learning_rate": 2.940744259528173e-06, "loss": 1.0526, "step": 3311 }, { "epoch": 0.758328563251288, "grad_norm": 1.1174249649047852, "learning_rate": 2.9354920201572457e-06, "loss": 0.9836, "step": 3312 }, { "epoch": 0.7585575271894677, "grad_norm": 1.1898833513259888, "learning_rate": 2.9302436683258306e-06, "loss": 1.0391, "step": 3313 }, { "epoch": 0.7587864911276474, "grad_norm": 1.1694586277008057, "learning_rate": 2.9249992069220557e-06, "loss": 1.0203, "step": 3314 }, { "epoch": 0.7590154550658271, "grad_norm": 1.1864376068115234, "learning_rate": 2.919758638831893e-06, "loss": 1.046, "step": 3315 }, { "epoch": 0.7592444190040069, "grad_norm": 1.4337023496627808, "learning_rate": 2.9145219669391944e-06, "loss": 1.1258, "step": 3316 }, { "epoch": 0.7594733829421866, "grad_norm": 1.0952881574630737, "learning_rate": 2.909289194125655e-06, "loss": 1.0528, "step": 3317 }, { "epoch": 0.7597023468803663, "grad_norm": 1.3980028629302979, "learning_rate": 2.904060323270822e-06, "loss": 1.0476, "step": 3318 }, { "epoch": 0.759931310818546, "grad_norm": 1.3246995210647583, "learning_rate": 2.898835357252097e-06, "loss": 1.0199, "step": 3319 }, { "epoch": 0.7601602747567258, "grad_norm": 1.449928879737854, "learning_rate": 2.8936142989447434e-06, "loss": 1.0102, "step": 3320 }, { "epoch": 0.7603892386949056, "grad_norm": 1.211899757385254, "learning_rate": 2.8883971512218588e-06, "loss": 1.0177, "step": 3321 }, { "epoch": 0.7606182026330853, "grad_norm": 1.1149224042892456, "learning_rate": 2.8831839169543998e-06, "loss": 1.0456, "step": 3322 }, { "epoch": 0.760847166571265, "grad_norm": 1.3038837909698486, "learning_rate": 2.877974599011162e-06, "loss": 1.0935, "step": 3323 }, { "epoch": 0.7610761305094448, "grad_norm": 1.0868488550186157, "learning_rate": 2.8727692002587914e-06, "loss": 1.0169, "step": 3324 }, { "epoch": 0.7613050944476245, "grad_norm": 1.4088412523269653, "learning_rate": 2.867567723561776e-06, "loss": 1.0274, "step": 3325 }, { "epoch": 0.7615340583858042, "grad_norm": 1.7433838844299316, "learning_rate": 2.8623701717824435e-06, "loss": 0.9796, "step": 3326 }, { "epoch": 0.761763022323984, "grad_norm": 1.1715034246444702, "learning_rate": 2.8571765477809645e-06, "loss": 1.0241, "step": 3327 }, { "epoch": 0.7619919862621637, "grad_norm": 1.7508958578109741, "learning_rate": 2.851986854415347e-06, "loss": 0.9302, "step": 3328 }, { "epoch": 0.7622209502003434, "grad_norm": 1.3008875846862793, "learning_rate": 2.84680109454143e-06, "loss": 1.0152, "step": 3329 }, { "epoch": 0.7624499141385231, "grad_norm": 1.7418900728225708, "learning_rate": 2.841619271012901e-06, "loss": 0.97, "step": 3330 }, { "epoch": 0.7626788780767029, "grad_norm": 1.3994221687316895, "learning_rate": 2.8364413866812733e-06, "loss": 1.1088, "step": 3331 }, { "epoch": 0.7629078420148827, "grad_norm": 1.5729641914367676, "learning_rate": 2.83126744439589e-06, "loss": 1.0114, "step": 3332 }, { "epoch": 0.7631368059530624, "grad_norm": 1.295013666152954, "learning_rate": 2.826097447003925e-06, "loss": 1.0756, "step": 3333 }, { "epoch": 0.7633657698912422, "grad_norm": 1.3600553274154663, "learning_rate": 2.820931397350395e-06, "loss": 1.0254, "step": 3334 }, { "epoch": 0.7635947338294219, "grad_norm": 1.1421211957931519, "learning_rate": 2.815769298278125e-06, "loss": 1.0173, "step": 3335 }, { "epoch": 0.7638236977676016, "grad_norm": 2.1977944374084473, "learning_rate": 2.810611152627777e-06, "loss": 1.0828, "step": 3336 }, { "epoch": 0.7640526617057813, "grad_norm": 1.382452368736267, "learning_rate": 2.8054569632378358e-06, "loss": 1.0648, "step": 3337 }, { "epoch": 0.7642816256439611, "grad_norm": 1.372998833656311, "learning_rate": 2.80030673294461e-06, "loss": 1.0375, "step": 3338 }, { "epoch": 0.7645105895821408, "grad_norm": 1.1051013469696045, "learning_rate": 2.795160464582225e-06, "loss": 1.0433, "step": 3339 }, { "epoch": 0.7647395535203205, "grad_norm": 1.265097737312317, "learning_rate": 2.7900181609826325e-06, "loss": 0.9865, "step": 3340 }, { "epoch": 0.7649685174585003, "grad_norm": 1.2459075450897217, "learning_rate": 2.784879824975597e-06, "loss": 1.0751, "step": 3341 }, { "epoch": 0.76519748139668, "grad_norm": 1.2201124429702759, "learning_rate": 2.779745459388705e-06, "loss": 1.0641, "step": 3342 }, { "epoch": 0.7654264453348597, "grad_norm": 1.1874167919158936, "learning_rate": 2.774615067047346e-06, "loss": 1.0337, "step": 3343 }, { "epoch": 0.7656554092730395, "grad_norm": 1.3858647346496582, "learning_rate": 2.769488650774741e-06, "loss": 1.0191, "step": 3344 }, { "epoch": 0.7658843732112193, "grad_norm": 1.2821847200393677, "learning_rate": 2.7643662133919136e-06, "loss": 1.0342, "step": 3345 }, { "epoch": 0.766113337149399, "grad_norm": 1.4728732109069824, "learning_rate": 2.7592477577176924e-06, "loss": 0.9817, "step": 3346 }, { "epoch": 0.7663423010875787, "grad_norm": 1.484055519104004, "learning_rate": 2.7541332865687245e-06, "loss": 1.1209, "step": 3347 }, { "epoch": 0.7665712650257585, "grad_norm": 1.1936273574829102, "learning_rate": 2.749022802759459e-06, "loss": 1.0309, "step": 3348 }, { "epoch": 0.7668002289639382, "grad_norm": 1.9332703351974487, "learning_rate": 2.7439163091021525e-06, "loss": 1.0065, "step": 3349 }, { "epoch": 0.7670291929021179, "grad_norm": 1.1286215782165527, "learning_rate": 2.738813808406866e-06, "loss": 0.9393, "step": 3350 }, { "epoch": 0.7672581568402976, "grad_norm": 1.4261751174926758, "learning_rate": 2.7337153034814636e-06, "loss": 1.051, "step": 3351 }, { "epoch": 0.7674871207784774, "grad_norm": 1.2641804218292236, "learning_rate": 2.7286207971316094e-06, "loss": 1.0007, "step": 3352 }, { "epoch": 0.7677160847166571, "grad_norm": 1.276326298713684, "learning_rate": 2.7235302921607665e-06, "loss": 1.0195, "step": 3353 }, { "epoch": 0.7679450486548368, "grad_norm": 1.2994381189346313, "learning_rate": 2.7184437913701977e-06, "loss": 1.0581, "step": 3354 }, { "epoch": 0.7681740125930167, "grad_norm": 1.2071455717086792, "learning_rate": 2.713361297558963e-06, "loss": 1.0235, "step": 3355 }, { "epoch": 0.7684029765311964, "grad_norm": 1.246546745300293, "learning_rate": 2.708282813523917e-06, "loss": 1.0011, "step": 3356 }, { "epoch": 0.7686319404693761, "grad_norm": 1.2827038764953613, "learning_rate": 2.7032083420597e-06, "loss": 1.0674, "step": 3357 }, { "epoch": 0.7688609044075558, "grad_norm": 1.9928056001663208, "learning_rate": 2.6981378859587614e-06, "loss": 1.0078, "step": 3358 }, { "epoch": 0.7690898683457356, "grad_norm": 1.672621250152588, "learning_rate": 2.6930714480113217e-06, "loss": 1.0532, "step": 3359 }, { "epoch": 0.7693188322839153, "grad_norm": 1.4071228504180908, "learning_rate": 2.688009031005403e-06, "loss": 1.1329, "step": 3360 }, { "epoch": 0.769547796222095, "grad_norm": 2.0468266010284424, "learning_rate": 2.6829506377268122e-06, "loss": 1.063, "step": 3361 }, { "epoch": 0.7697767601602747, "grad_norm": 1.948828935623169, "learning_rate": 2.6778962709591382e-06, "loss": 1.0897, "step": 3362 }, { "epoch": 0.7700057240984545, "grad_norm": 1.4286307096481323, "learning_rate": 2.6728459334837576e-06, "loss": 1.0155, "step": 3363 }, { "epoch": 0.7702346880366342, "grad_norm": 1.1092466115951538, "learning_rate": 2.667799628079829e-06, "loss": 1.0039, "step": 3364 }, { "epoch": 0.7704636519748139, "grad_norm": 1.3140974044799805, "learning_rate": 2.6627573575242917e-06, "loss": 1.0641, "step": 3365 }, { "epoch": 0.7706926159129937, "grad_norm": 1.29413640499115, "learning_rate": 2.6577191245918654e-06, "loss": 1.01, "step": 3366 }, { "epoch": 0.7709215798511735, "grad_norm": 1.2063698768615723, "learning_rate": 2.6526849320550474e-06, "loss": 1.0791, "step": 3367 }, { "epoch": 0.7711505437893532, "grad_norm": 1.2607606649398804, "learning_rate": 2.6476547826841106e-06, "loss": 1.0255, "step": 3368 }, { "epoch": 0.7713795077275329, "grad_norm": 1.187766194343567, "learning_rate": 2.642628679247109e-06, "loss": 1.066, "step": 3369 }, { "epoch": 0.7716084716657127, "grad_norm": 1.3988637924194336, "learning_rate": 2.6376066245098565e-06, "loss": 1.1261, "step": 3370 }, { "epoch": 0.7718374356038924, "grad_norm": 1.4937584400177002, "learning_rate": 2.6325886212359496e-06, "loss": 0.9689, "step": 3371 }, { "epoch": 0.7720663995420721, "grad_norm": 1.2917104959487915, "learning_rate": 2.6275746721867624e-06, "loss": 1.0493, "step": 3372 }, { "epoch": 0.7722953634802519, "grad_norm": 1.5106794834136963, "learning_rate": 2.6225647801214203e-06, "loss": 0.9849, "step": 3373 }, { "epoch": 0.7725243274184316, "grad_norm": 1.2963145971298218, "learning_rate": 2.6175589477968234e-06, "loss": 1.0687, "step": 3374 }, { "epoch": 0.7727532913566113, "grad_norm": 1.9042096138000488, "learning_rate": 2.6125571779676493e-06, "loss": 1.0122, "step": 3375 }, { "epoch": 0.772982255294791, "grad_norm": 14.797686576843262, "learning_rate": 2.607559473386321e-06, "loss": 1.0177, "step": 3376 }, { "epoch": 0.7732112192329708, "grad_norm": 1.1996756792068481, "learning_rate": 2.602565836803036e-06, "loss": 1.1003, "step": 3377 }, { "epoch": 0.7734401831711506, "grad_norm": 1.0620721578598022, "learning_rate": 2.5975762709657506e-06, "loss": 1.0226, "step": 3378 }, { "epoch": 0.7736691471093303, "grad_norm": 1.3966255187988281, "learning_rate": 2.5925907786201808e-06, "loss": 1.0354, "step": 3379 }, { "epoch": 0.77389811104751, "grad_norm": 1.4668009281158447, "learning_rate": 2.5876093625098066e-06, "loss": 1.0616, "step": 3380 }, { "epoch": 0.7741270749856898, "grad_norm": 1.1744608879089355, "learning_rate": 2.5826320253758477e-06, "loss": 1.0438, "step": 3381 }, { "epoch": 0.7743560389238695, "grad_norm": 1.5106372833251953, "learning_rate": 2.5776587699573007e-06, "loss": 1.066, "step": 3382 }, { "epoch": 0.7745850028620492, "grad_norm": 1.2056074142456055, "learning_rate": 2.5726895989909063e-06, "loss": 1.0526, "step": 3383 }, { "epoch": 0.774813966800229, "grad_norm": 1.2208433151245117, "learning_rate": 2.5677245152111497e-06, "loss": 1.0644, "step": 3384 }, { "epoch": 0.7750429307384087, "grad_norm": 1.3252755403518677, "learning_rate": 2.5627635213502832e-06, "loss": 1.0094, "step": 3385 }, { "epoch": 0.7752718946765884, "grad_norm": 1.3273080587387085, "learning_rate": 2.5578066201383e-06, "loss": 1.1036, "step": 3386 }, { "epoch": 0.7755008586147681, "grad_norm": 1.2906368970870972, "learning_rate": 2.552853814302936e-06, "loss": 1.0622, "step": 3387 }, { "epoch": 0.7757298225529479, "grad_norm": 1.388886570930481, "learning_rate": 2.547905106569677e-06, "loss": 1.0753, "step": 3388 }, { "epoch": 0.7759587864911276, "grad_norm": 1.221598744392395, "learning_rate": 2.5429604996617653e-06, "loss": 1.0204, "step": 3389 }, { "epoch": 0.7761877504293074, "grad_norm": 1.2661128044128418, "learning_rate": 2.5380199963001684e-06, "loss": 1.0461, "step": 3390 }, { "epoch": 0.7764167143674872, "grad_norm": 3.6229851245880127, "learning_rate": 2.5330835992036062e-06, "loss": 1.1383, "step": 3391 }, { "epoch": 0.7766456783056669, "grad_norm": 1.637839913368225, "learning_rate": 2.528151311088537e-06, "loss": 1.0802, "step": 3392 }, { "epoch": 0.7768746422438466, "grad_norm": 1.3283582925796509, "learning_rate": 2.523223134669157e-06, "loss": 1.0888, "step": 3393 }, { "epoch": 0.7771036061820263, "grad_norm": 1.1502894163131714, "learning_rate": 2.518299072657403e-06, "loss": 1.0402, "step": 3394 }, { "epoch": 0.7773325701202061, "grad_norm": 1.397196888923645, "learning_rate": 2.513379127762937e-06, "loss": 1.048, "step": 3395 }, { "epoch": 0.7775615340583858, "grad_norm": 1.6322966814041138, "learning_rate": 2.5084633026931727e-06, "loss": 1.0891, "step": 3396 }, { "epoch": 0.7777904979965655, "grad_norm": 1.2721960544586182, "learning_rate": 2.5035516001532467e-06, "loss": 1.0438, "step": 3397 }, { "epoch": 0.7780194619347452, "grad_norm": 1.2815996408462524, "learning_rate": 2.4986440228460185e-06, "loss": 1.0351, "step": 3398 }, { "epoch": 0.778248425872925, "grad_norm": 1.0550155639648438, "learning_rate": 2.4937405734720964e-06, "loss": 1.0648, "step": 3399 }, { "epoch": 0.7784773898111047, "grad_norm": 1.814061164855957, "learning_rate": 2.488841254729808e-06, "loss": 1.0793, "step": 3400 }, { "epoch": 0.7787063537492845, "grad_norm": 1.3419748544692993, "learning_rate": 2.4839460693151994e-06, "loss": 1.092, "step": 3401 }, { "epoch": 0.7789353176874643, "grad_norm": 1.4719575643539429, "learning_rate": 2.4790550199220543e-06, "loss": 1.0483, "step": 3402 }, { "epoch": 0.779164281625644, "grad_norm": 1.003715991973877, "learning_rate": 2.474168109241877e-06, "loss": 0.984, "step": 3403 }, { "epoch": 0.7793932455638237, "grad_norm": 1.2123103141784668, "learning_rate": 2.469285339963892e-06, "loss": 1.042, "step": 3404 }, { "epoch": 0.7796222095020034, "grad_norm": 1.2390992641448975, "learning_rate": 2.4644067147750462e-06, "loss": 0.9107, "step": 3405 }, { "epoch": 0.7798511734401832, "grad_norm": 1.2249623537063599, "learning_rate": 2.459532236360007e-06, "loss": 1.0057, "step": 3406 }, { "epoch": 0.7800801373783629, "grad_norm": 1.6221455335617065, "learning_rate": 2.4546619074011603e-06, "loss": 0.9997, "step": 3407 }, { "epoch": 0.7803091013165426, "grad_norm": 1.1947602033615112, "learning_rate": 2.4497957305786046e-06, "loss": 0.9835, "step": 3408 }, { "epoch": 0.7805380652547224, "grad_norm": 1.270727515220642, "learning_rate": 2.4449337085701573e-06, "loss": 1.0053, "step": 3409 }, { "epoch": 0.7807670291929021, "grad_norm": 1.52069890499115, "learning_rate": 2.4400758440513516e-06, "loss": 1.0603, "step": 3410 }, { "epoch": 0.7809959931310818, "grad_norm": 1.472410798072815, "learning_rate": 2.4352221396954233e-06, "loss": 1.0715, "step": 3411 }, { "epoch": 0.7812249570692615, "grad_norm": 1.2387964725494385, "learning_rate": 2.430372598173326e-06, "loss": 1.0548, "step": 3412 }, { "epoch": 0.7814539210074414, "grad_norm": 1.2153174877166748, "learning_rate": 2.4255272221537295e-06, "loss": 1.0096, "step": 3413 }, { "epoch": 0.7816828849456211, "grad_norm": 1.296165943145752, "learning_rate": 2.4206860143029954e-06, "loss": 1.0653, "step": 3414 }, { "epoch": 0.7819118488838008, "grad_norm": 1.3834319114685059, "learning_rate": 2.4158489772852035e-06, "loss": 0.9823, "step": 3415 }, { "epoch": 0.7821408128219806, "grad_norm": 1.151049017906189, "learning_rate": 2.4110161137621325e-06, "loss": 1.051, "step": 3416 }, { "epoch": 0.7823697767601603, "grad_norm": 2.3822290897369385, "learning_rate": 2.406187426393269e-06, "loss": 1.0159, "step": 3417 }, { "epoch": 0.78259874069834, "grad_norm": 1.6371638774871826, "learning_rate": 2.401362917835798e-06, "loss": 0.969, "step": 3418 }, { "epoch": 0.7828277046365197, "grad_norm": 1.29193913936615, "learning_rate": 2.396542590744606e-06, "loss": 1.0759, "step": 3419 }, { "epoch": 0.7830566685746995, "grad_norm": 1.1832704544067383, "learning_rate": 2.391726447772279e-06, "loss": 1.0444, "step": 3420 }, { "epoch": 0.7832856325128792, "grad_norm": 1.0548069477081299, "learning_rate": 2.3869144915691033e-06, "loss": 1.0141, "step": 3421 }, { "epoch": 0.7835145964510589, "grad_norm": 1.3056319952011108, "learning_rate": 2.3821067247830488e-06, "loss": 1.0866, "step": 3422 }, { "epoch": 0.7837435603892386, "grad_norm": 1.4193148612976074, "learning_rate": 2.3773031500597974e-06, "loss": 1.0747, "step": 3423 }, { "epoch": 0.7839725243274185, "grad_norm": 1.3721299171447754, "learning_rate": 2.3725037700427168e-06, "loss": 1.0653, "step": 3424 }, { "epoch": 0.7842014882655982, "grad_norm": 1.2656069993972778, "learning_rate": 2.3677085873728602e-06, "loss": 1.0436, "step": 3425 }, { "epoch": 0.7844304522037779, "grad_norm": 1.1870973110198975, "learning_rate": 2.3629176046889755e-06, "loss": 1.0881, "step": 3426 }, { "epoch": 0.7846594161419577, "grad_norm": 2.626173257827759, "learning_rate": 2.3581308246275103e-06, "loss": 1.0576, "step": 3427 }, { "epoch": 0.7848883800801374, "grad_norm": 1.569612741470337, "learning_rate": 2.353348249822579e-06, "loss": 1.0122, "step": 3428 }, { "epoch": 0.7851173440183171, "grad_norm": 1.4496647119522095, "learning_rate": 2.3485698829059967e-06, "loss": 1.0401, "step": 3429 }, { "epoch": 0.7853463079564968, "grad_norm": 1.2720706462860107, "learning_rate": 2.3437957265072587e-06, "loss": 1.0253, "step": 3430 }, { "epoch": 0.7855752718946766, "grad_norm": 1.1991873979568481, "learning_rate": 2.339025783253541e-06, "loss": 1.075, "step": 3431 }, { "epoch": 0.7858042358328563, "grad_norm": 1.2989721298217773, "learning_rate": 2.334260055769707e-06, "loss": 1.0634, "step": 3432 }, { "epoch": 0.786033199771036, "grad_norm": 1.156073808670044, "learning_rate": 2.3294985466782937e-06, "loss": 1.0094, "step": 3433 }, { "epoch": 0.7862621637092158, "grad_norm": 1.2679800987243652, "learning_rate": 2.324741258599521e-06, "loss": 1.0954, "step": 3434 }, { "epoch": 0.7864911276473955, "grad_norm": 1.283085823059082, "learning_rate": 2.319988194151287e-06, "loss": 0.9946, "step": 3435 }, { "epoch": 0.7867200915855753, "grad_norm": 1.6633352041244507, "learning_rate": 2.3152393559491546e-06, "loss": 1.0509, "step": 3436 }, { "epoch": 0.786949055523755, "grad_norm": 1.3661447763442993, "learning_rate": 2.3104947466063785e-06, "loss": 1.0671, "step": 3437 }, { "epoch": 0.7871780194619348, "grad_norm": 1.2480249404907227, "learning_rate": 2.305754368733878e-06, "loss": 1.0529, "step": 3438 }, { "epoch": 0.7874069834001145, "grad_norm": 1.286214828491211, "learning_rate": 2.3010182249402368e-06, "loss": 1.0186, "step": 3439 }, { "epoch": 0.7876359473382942, "grad_norm": 1.268823504447937, "learning_rate": 2.2962863178317154e-06, "loss": 1.0093, "step": 3440 }, { "epoch": 0.787864911276474, "grad_norm": 1.215895652770996, "learning_rate": 2.29155865001225e-06, "loss": 1.0696, "step": 3441 }, { "epoch": 0.7880938752146537, "grad_norm": 1.2569650411605835, "learning_rate": 2.2868352240834304e-06, "loss": 1.0109, "step": 3442 }, { "epoch": 0.7883228391528334, "grad_norm": 1.3006694316864014, "learning_rate": 2.282116042644519e-06, "loss": 1.0191, "step": 3443 }, { "epoch": 0.7885518030910131, "grad_norm": 1.2813971042633057, "learning_rate": 2.2774011082924417e-06, "loss": 1.0308, "step": 3444 }, { "epoch": 0.7887807670291929, "grad_norm": 1.4007090330123901, "learning_rate": 2.2726904236217895e-06, "loss": 1.0365, "step": 3445 }, { "epoch": 0.7890097309673726, "grad_norm": 1.9624810218811035, "learning_rate": 2.2679839912248104e-06, "loss": 1.0467, "step": 3446 }, { "epoch": 0.7892386949055524, "grad_norm": 1.6809523105621338, "learning_rate": 2.263281813691417e-06, "loss": 1.025, "step": 3447 }, { "epoch": 0.7894676588437322, "grad_norm": 1.4995564222335815, "learning_rate": 2.2585838936091753e-06, "loss": 1.0324, "step": 3448 }, { "epoch": 0.7896966227819119, "grad_norm": 1.4374549388885498, "learning_rate": 2.253890233563316e-06, "loss": 1.0693, "step": 3449 }, { "epoch": 0.7899255867200916, "grad_norm": 1.204321265220642, "learning_rate": 2.2492008361367133e-06, "loss": 1.0055, "step": 3450 }, { "epoch": 0.7901545506582713, "grad_norm": 1.4027067422866821, "learning_rate": 2.2445157039099096e-06, "loss": 1.0308, "step": 3451 }, { "epoch": 0.7903835145964511, "grad_norm": 1.5836561918258667, "learning_rate": 2.2398348394610947e-06, "loss": 1.0295, "step": 3452 }, { "epoch": 0.7906124785346308, "grad_norm": 1.5846806764602661, "learning_rate": 2.235158245366105e-06, "loss": 1.0415, "step": 3453 }, { "epoch": 0.7908414424728105, "grad_norm": 1.1941750049591064, "learning_rate": 2.2304859241984313e-06, "loss": 1.0086, "step": 3454 }, { "epoch": 0.7910704064109902, "grad_norm": 1.2206698656082153, "learning_rate": 2.225817878529214e-06, "loss": 1.0253, "step": 3455 }, { "epoch": 0.79129937034917, "grad_norm": 1.6405779123306274, "learning_rate": 2.2211541109272383e-06, "loss": 0.983, "step": 3456 }, { "epoch": 0.7915283342873497, "grad_norm": 1.773629903793335, "learning_rate": 2.216494623958939e-06, "loss": 0.9978, "step": 3457 }, { "epoch": 0.7917572982255294, "grad_norm": 1.5641894340515137, "learning_rate": 2.2118394201883907e-06, "loss": 1.0389, "step": 3458 }, { "epoch": 0.7919862621637093, "grad_norm": 1.358713984489441, "learning_rate": 2.207188502177313e-06, "loss": 1.0336, "step": 3459 }, { "epoch": 0.792215226101889, "grad_norm": 1.1877168416976929, "learning_rate": 2.2025418724850678e-06, "loss": 1.1138, "step": 3460 }, { "epoch": 0.7924441900400687, "grad_norm": 1.3562843799591064, "learning_rate": 2.197899533668657e-06, "loss": 1.0429, "step": 3461 }, { "epoch": 0.7926731539782484, "grad_norm": 1.2236860990524292, "learning_rate": 2.1932614882827196e-06, "loss": 1.0147, "step": 3462 }, { "epoch": 0.7929021179164282, "grad_norm": 1.2904807329177856, "learning_rate": 2.1886277388795363e-06, "loss": 1.0654, "step": 3463 }, { "epoch": 0.7931310818546079, "grad_norm": 1.3184075355529785, "learning_rate": 2.1839982880090115e-06, "loss": 1.1185, "step": 3464 }, { "epoch": 0.7933600457927876, "grad_norm": 1.4988008737564087, "learning_rate": 2.1793731382187056e-06, "loss": 1.0399, "step": 3465 }, { "epoch": 0.7935890097309674, "grad_norm": 1.8388975858688354, "learning_rate": 2.1747522920537913e-06, "loss": 1.0708, "step": 3466 }, { "epoch": 0.7938179736691471, "grad_norm": 1.239767074584961, "learning_rate": 2.1701357520570797e-06, "loss": 1.0156, "step": 3467 }, { "epoch": 0.7940469376073268, "grad_norm": 1.1681170463562012, "learning_rate": 2.165523520769024e-06, "loss": 1.0439, "step": 3468 }, { "epoch": 0.7942759015455065, "grad_norm": 1.1504888534545898, "learning_rate": 2.160915600727688e-06, "loss": 1.0851, "step": 3469 }, { "epoch": 0.7945048654836864, "grad_norm": 1.3817356824874878, "learning_rate": 2.156311994468774e-06, "loss": 1.0873, "step": 3470 }, { "epoch": 0.7947338294218661, "grad_norm": 1.3247697353363037, "learning_rate": 2.151712704525608e-06, "loss": 0.9901, "step": 3471 }, { "epoch": 0.7949627933600458, "grad_norm": 1.5269098281860352, "learning_rate": 2.1471177334291404e-06, "loss": 1.0721, "step": 3472 }, { "epoch": 0.7951917572982256, "grad_norm": 1.195955753326416, "learning_rate": 2.142527083707946e-06, "loss": 1.0325, "step": 3473 }, { "epoch": 0.7954207212364053, "grad_norm": 1.3172709941864014, "learning_rate": 2.1379407578882206e-06, "loss": 1.0282, "step": 3474 }, { "epoch": 0.795649685174585, "grad_norm": 1.4341003894805908, "learning_rate": 2.13335875849378e-06, "loss": 1.098, "step": 3475 }, { "epoch": 0.7958786491127647, "grad_norm": 1.1495006084442139, "learning_rate": 2.1287810880460636e-06, "loss": 1.0147, "step": 3476 }, { "epoch": 0.7961076130509445, "grad_norm": 1.1693551540374756, "learning_rate": 2.1242077490641157e-06, "loss": 1.0005, "step": 3477 }, { "epoch": 0.7963365769891242, "grad_norm": 1.5552064180374146, "learning_rate": 2.119638744064617e-06, "loss": 1.0714, "step": 3478 }, { "epoch": 0.7965655409273039, "grad_norm": 1.519784688949585, "learning_rate": 2.1150740755618505e-06, "loss": 1.04, "step": 3479 }, { "epoch": 0.7967945048654836, "grad_norm": 1.1543006896972656, "learning_rate": 2.1105137460677093e-06, "loss": 1.0897, "step": 3480 }, { "epoch": 0.7970234688036634, "grad_norm": 1.7196766138076782, "learning_rate": 2.1059577580917067e-06, "loss": 1.0252, "step": 3481 }, { "epoch": 0.7972524327418432, "grad_norm": 1.2323554754257202, "learning_rate": 2.1014061141409715e-06, "loss": 1.0197, "step": 3482 }, { "epoch": 0.7974813966800229, "grad_norm": 1.1643589735031128, "learning_rate": 2.0968588167202265e-06, "loss": 1.023, "step": 3483 }, { "epoch": 0.7977103606182027, "grad_norm": 1.2324453592300415, "learning_rate": 2.0923158683318157e-06, "loss": 0.9976, "step": 3484 }, { "epoch": 0.7979393245563824, "grad_norm": 1.3599796295166016, "learning_rate": 2.087777271475684e-06, "loss": 1.0451, "step": 3485 }, { "epoch": 0.7981682884945621, "grad_norm": 1.2370396852493286, "learning_rate": 2.0832430286493834e-06, "loss": 0.9734, "step": 3486 }, { "epoch": 0.7983972524327418, "grad_norm": 1.3840800523757935, "learning_rate": 2.0787131423480722e-06, "loss": 0.9833, "step": 3487 }, { "epoch": 0.7986262163709216, "grad_norm": 1.1826318502426147, "learning_rate": 2.0741876150645025e-06, "loss": 0.9921, "step": 3488 }, { "epoch": 0.7988551803091013, "grad_norm": 1.1588865518569946, "learning_rate": 2.0696664492890394e-06, "loss": 1.0858, "step": 3489 }, { "epoch": 0.799084144247281, "grad_norm": 1.4045047760009766, "learning_rate": 2.0651496475096455e-06, "loss": 1.0409, "step": 3490 }, { "epoch": 0.7993131081854608, "grad_norm": 1.2877663373947144, "learning_rate": 2.060637212211869e-06, "loss": 0.9848, "step": 3491 }, { "epoch": 0.7995420721236405, "grad_norm": 1.3673158884048462, "learning_rate": 2.0561291458788736e-06, "loss": 0.9969, "step": 3492 }, { "epoch": 0.7997710360618203, "grad_norm": 1.3965160846710205, "learning_rate": 2.0516254509914103e-06, "loss": 1.0477, "step": 3493 }, { "epoch": 0.8, "grad_norm": 1.1263400316238403, "learning_rate": 2.047126130027819e-06, "loss": 1.0591, "step": 3494 }, { "epoch": 0.8002289639381798, "grad_norm": 1.2847234010696411, "learning_rate": 2.042631185464039e-06, "loss": 0.9993, "step": 3495 }, { "epoch": 0.8004579278763595, "grad_norm": 2.184096336364746, "learning_rate": 2.038140619773609e-06, "loss": 1.0074, "step": 3496 }, { "epoch": 0.8006868918145392, "grad_norm": 1.3483428955078125, "learning_rate": 2.03365443542764e-06, "loss": 1.0989, "step": 3497 }, { "epoch": 0.800915855752719, "grad_norm": 1.5538264513015747, "learning_rate": 2.029172634894846e-06, "loss": 1.0901, "step": 3498 }, { "epoch": 0.8011448196908987, "grad_norm": 1.2614027261734009, "learning_rate": 2.024695220641524e-06, "loss": 1.0144, "step": 3499 }, { "epoch": 0.8013737836290784, "grad_norm": 1.2823106050491333, "learning_rate": 2.020222195131556e-06, "loss": 0.9797, "step": 3500 }, { "epoch": 0.8016027475672581, "grad_norm": 1.257866382598877, "learning_rate": 2.0157535608264123e-06, "loss": 1.0476, "step": 3501 }, { "epoch": 0.8018317115054379, "grad_norm": 1.226118803024292, "learning_rate": 2.0112893201851435e-06, "loss": 0.9959, "step": 3502 }, { "epoch": 0.8020606754436176, "grad_norm": 1.5508838891983032, "learning_rate": 2.0068294756643846e-06, "loss": 0.9996, "step": 3503 }, { "epoch": 0.8022896393817973, "grad_norm": 1.1606311798095703, "learning_rate": 2.0023740297183536e-06, "loss": 1.0322, "step": 3504 }, { "epoch": 0.8025186033199772, "grad_norm": 1.516457200050354, "learning_rate": 1.997922984798836e-06, "loss": 1.0247, "step": 3505 }, { "epoch": 0.8027475672581569, "grad_norm": 2.4328088760375977, "learning_rate": 1.993476343355213e-06, "loss": 1.0883, "step": 3506 }, { "epoch": 0.8029765311963366, "grad_norm": 1.4516451358795166, "learning_rate": 1.9890341078344343e-06, "loss": 1.0693, "step": 3507 }, { "epoch": 0.8032054951345163, "grad_norm": 1.3910856246948242, "learning_rate": 1.9845962806810205e-06, "loss": 1.0984, "step": 3508 }, { "epoch": 0.8034344590726961, "grad_norm": 1.184085488319397, "learning_rate": 1.980162864337071e-06, "loss": 1.0698, "step": 3509 }, { "epoch": 0.8036634230108758, "grad_norm": 1.3776863813400269, "learning_rate": 1.9757338612422594e-06, "loss": 0.9953, "step": 3510 }, { "epoch": 0.8038923869490555, "grad_norm": 1.102057933807373, "learning_rate": 1.971309273833828e-06, "loss": 1.0835, "step": 3511 }, { "epoch": 0.8041213508872352, "grad_norm": 1.2618690729141235, "learning_rate": 1.966889104546591e-06, "loss": 1.0462, "step": 3512 }, { "epoch": 0.804350314825415, "grad_norm": 1.2333935499191284, "learning_rate": 1.9624733558129304e-06, "loss": 1.0208, "step": 3513 }, { "epoch": 0.8045792787635947, "grad_norm": 1.2381764650344849, "learning_rate": 1.958062030062795e-06, "loss": 1.0503, "step": 3514 }, { "epoch": 0.8048082427017744, "grad_norm": 1.2884087562561035, "learning_rate": 1.9536551297237018e-06, "loss": 1.0545, "step": 3515 }, { "epoch": 0.8050372066399543, "grad_norm": 1.294965147972107, "learning_rate": 1.9492526572207294e-06, "loss": 1.0354, "step": 3516 }, { "epoch": 0.805266170578134, "grad_norm": 1.5084460973739624, "learning_rate": 1.944854614976521e-06, "loss": 0.9814, "step": 3517 }, { "epoch": 0.8054951345163137, "grad_norm": 1.1529735326766968, "learning_rate": 1.940461005411288e-06, "loss": 1.0445, "step": 3518 }, { "epoch": 0.8057240984544934, "grad_norm": 1.3352361917495728, "learning_rate": 1.9360718309427863e-06, "loss": 0.9968, "step": 3519 }, { "epoch": 0.8059530623926732, "grad_norm": 1.3624452352523804, "learning_rate": 1.931687093986354e-06, "loss": 1.0275, "step": 3520 }, { "epoch": 0.8061820263308529, "grad_norm": 1.2383012771606445, "learning_rate": 1.9273067969548664e-06, "loss": 1.0416, "step": 3521 }, { "epoch": 0.8064109902690326, "grad_norm": 1.340103030204773, "learning_rate": 1.922930942258766e-06, "loss": 1.0579, "step": 3522 }, { "epoch": 0.8066399542072124, "grad_norm": 2.9526262283325195, "learning_rate": 1.918559532306051e-06, "loss": 1.0087, "step": 3523 }, { "epoch": 0.8068689181453921, "grad_norm": 1.7514522075653076, "learning_rate": 1.91419256950227e-06, "loss": 1.0383, "step": 3524 }, { "epoch": 0.8070978820835718, "grad_norm": 1.2114307880401611, "learning_rate": 1.9098300562505266e-06, "loss": 0.9728, "step": 3525 }, { "epoch": 0.8073268460217515, "grad_norm": 1.102329969406128, "learning_rate": 1.9054719949514756e-06, "loss": 1.0953, "step": 3526 }, { "epoch": 0.8075558099599313, "grad_norm": 1.2721331119537354, "learning_rate": 1.9011183880033203e-06, "loss": 1.0068, "step": 3527 }, { "epoch": 0.8077847738981111, "grad_norm": 1.7620582580566406, "learning_rate": 1.8967692378018155e-06, "loss": 0.9938, "step": 3528 }, { "epoch": 0.8080137378362908, "grad_norm": 1.253893494606018, "learning_rate": 1.8924245467402612e-06, "loss": 1.0459, "step": 3529 }, { "epoch": 0.8082427017744706, "grad_norm": 1.549721360206604, "learning_rate": 1.8880843172095066e-06, "loss": 1.0437, "step": 3530 }, { "epoch": 0.8084716657126503, "grad_norm": 1.4003283977508545, "learning_rate": 1.8837485515979425e-06, "loss": 1.0505, "step": 3531 }, { "epoch": 0.80870062965083, "grad_norm": 1.227433681488037, "learning_rate": 1.8794172522915022e-06, "loss": 1.0111, "step": 3532 }, { "epoch": 0.8089295935890097, "grad_norm": 1.263609528541565, "learning_rate": 1.875090421673662e-06, "loss": 1.0436, "step": 3533 }, { "epoch": 0.8091585575271895, "grad_norm": 1.4125272035598755, "learning_rate": 1.8707680621254487e-06, "loss": 1.061, "step": 3534 }, { "epoch": 0.8093875214653692, "grad_norm": 1.1441245079040527, "learning_rate": 1.8664501760254128e-06, "loss": 1.0341, "step": 3535 }, { "epoch": 0.8096164854035489, "grad_norm": 1.3769537210464478, "learning_rate": 1.8621367657496504e-06, "loss": 1.0348, "step": 3536 }, { "epoch": 0.8098454493417286, "grad_norm": 1.3794772624969482, "learning_rate": 1.8578278336718037e-06, "loss": 1.0156, "step": 3537 }, { "epoch": 0.8100744132799084, "grad_norm": 1.4838491678237915, "learning_rate": 1.8535233821630338e-06, "loss": 0.9982, "step": 3538 }, { "epoch": 0.8103033772180882, "grad_norm": 1.272406816482544, "learning_rate": 1.849223413592046e-06, "loss": 1.0323, "step": 3539 }, { "epoch": 0.8105323411562679, "grad_norm": 1.1917959451675415, "learning_rate": 1.8449279303250777e-06, "loss": 1.0565, "step": 3540 }, { "epoch": 0.8107613050944477, "grad_norm": 1.2484959363937378, "learning_rate": 1.8406369347258968e-06, "loss": 1.0529, "step": 3541 }, { "epoch": 0.8109902690326274, "grad_norm": 1.6294903755187988, "learning_rate": 1.8363504291558053e-06, "loss": 1.0013, "step": 3542 }, { "epoch": 0.8112192329708071, "grad_norm": 1.355380654335022, "learning_rate": 1.8320684159736236e-06, "loss": 0.9698, "step": 3543 }, { "epoch": 0.8114481969089868, "grad_norm": 1.0993623733520508, "learning_rate": 1.827790897535715e-06, "loss": 1.0257, "step": 3544 }, { "epoch": 0.8116771608471666, "grad_norm": 1.8703913688659668, "learning_rate": 1.8235178761959626e-06, "loss": 1.0786, "step": 3545 }, { "epoch": 0.8119061247853463, "grad_norm": 1.225956678390503, "learning_rate": 1.8192493543057676e-06, "loss": 1.0386, "step": 3546 }, { "epoch": 0.812135088723526, "grad_norm": 1.302952527999878, "learning_rate": 1.8149853342140644e-06, "loss": 1.011, "step": 3547 }, { "epoch": 0.8123640526617057, "grad_norm": 1.4825295209884644, "learning_rate": 1.8107258182673127e-06, "loss": 1.0508, "step": 3548 }, { "epoch": 0.8125930165998855, "grad_norm": 1.4160746335983276, "learning_rate": 1.8064708088094829e-06, "loss": 1.0045, "step": 3549 }, { "epoch": 0.8128219805380652, "grad_norm": 1.3948523998260498, "learning_rate": 1.802220308182071e-06, "loss": 1.0562, "step": 3550 }, { "epoch": 0.813050944476245, "grad_norm": 1.3531475067138672, "learning_rate": 1.797974318724094e-06, "loss": 0.9929, "step": 3551 }, { "epoch": 0.8132799084144248, "grad_norm": 1.301820993423462, "learning_rate": 1.7937328427720834e-06, "loss": 1.0954, "step": 3552 }, { "epoch": 0.8135088723526045, "grad_norm": 1.1766568422317505, "learning_rate": 1.7894958826600884e-06, "loss": 1.0252, "step": 3553 }, { "epoch": 0.8137378362907842, "grad_norm": 1.2368574142456055, "learning_rate": 1.7852634407196723e-06, "loss": 0.9867, "step": 3554 }, { "epoch": 0.813966800228964, "grad_norm": 1.2918037176132202, "learning_rate": 1.7810355192799122e-06, "loss": 1.0437, "step": 3555 }, { "epoch": 0.8141957641671437, "grad_norm": 1.3767991065979004, "learning_rate": 1.7768121206674006e-06, "loss": 1.0281, "step": 3556 }, { "epoch": 0.8144247281053234, "grad_norm": 1.165218472480774, "learning_rate": 1.7725932472062302e-06, "loss": 0.9958, "step": 3557 }, { "epoch": 0.8146536920435031, "grad_norm": 1.1615934371948242, "learning_rate": 1.7683789012180196e-06, "loss": 1.0294, "step": 3558 }, { "epoch": 0.8148826559816829, "grad_norm": 1.3397858142852783, "learning_rate": 1.7641690850218884e-06, "loss": 1.0477, "step": 3559 }, { "epoch": 0.8151116199198626, "grad_norm": 1.2796862125396729, "learning_rate": 1.7599638009344566e-06, "loss": 0.9805, "step": 3560 }, { "epoch": 0.8153405838580423, "grad_norm": 1.1992462873458862, "learning_rate": 1.7557630512698642e-06, "loss": 1.0074, "step": 3561 }, { "epoch": 0.8155695477962221, "grad_norm": 1.330796480178833, "learning_rate": 1.7515668383397433e-06, "loss": 1.0369, "step": 3562 }, { "epoch": 0.8157985117344019, "grad_norm": 1.7938005924224854, "learning_rate": 1.7473751644532366e-06, "loss": 1.0118, "step": 3563 }, { "epoch": 0.8160274756725816, "grad_norm": 1.2056411504745483, "learning_rate": 1.7431880319169858e-06, "loss": 1.0215, "step": 3564 }, { "epoch": 0.8162564396107613, "grad_norm": 1.4476250410079956, "learning_rate": 1.7390054430351366e-06, "loss": 0.9641, "step": 3565 }, { "epoch": 0.8164854035489411, "grad_norm": 1.1308528184890747, "learning_rate": 1.7348274001093324e-06, "loss": 1.0338, "step": 3566 }, { "epoch": 0.8167143674871208, "grad_norm": 1.440428614616394, "learning_rate": 1.730653905438714e-06, "loss": 1.0503, "step": 3567 }, { "epoch": 0.8169433314253005, "grad_norm": 1.2121968269348145, "learning_rate": 1.7264849613199208e-06, "loss": 1.0444, "step": 3568 }, { "epoch": 0.8171722953634802, "grad_norm": 1.3329687118530273, "learning_rate": 1.722320570047089e-06, "loss": 1.0456, "step": 3569 }, { "epoch": 0.81740125930166, "grad_norm": 1.237636923789978, "learning_rate": 1.7181607339118488e-06, "loss": 1.0376, "step": 3570 }, { "epoch": 0.8176302232398397, "grad_norm": 7.456436634063721, "learning_rate": 1.714005455203318e-06, "loss": 1.0065, "step": 3571 }, { "epoch": 0.8178591871780194, "grad_norm": 1.594241976737976, "learning_rate": 1.7098547362081197e-06, "loss": 0.9686, "step": 3572 }, { "epoch": 0.8180881511161991, "grad_norm": 1.2208811044692993, "learning_rate": 1.7057085792103534e-06, "loss": 0.965, "step": 3573 }, { "epoch": 0.818317115054379, "grad_norm": 1.1306706666946411, "learning_rate": 1.701566986491614e-06, "loss": 1.0411, "step": 3574 }, { "epoch": 0.8185460789925587, "grad_norm": 1.4703606367111206, "learning_rate": 1.697429960330993e-06, "loss": 1.063, "step": 3575 }, { "epoch": 0.8187750429307384, "grad_norm": 1.7712504863739014, "learning_rate": 1.6932975030050524e-06, "loss": 1.03, "step": 3576 }, { "epoch": 0.8190040068689182, "grad_norm": 1.5078486204147339, "learning_rate": 1.6891696167878535e-06, "loss": 1.0125, "step": 3577 }, { "epoch": 0.8192329708070979, "grad_norm": 1.5069183111190796, "learning_rate": 1.6850463039509356e-06, "loss": 1.0039, "step": 3578 }, { "epoch": 0.8194619347452776, "grad_norm": 1.3289002180099487, "learning_rate": 1.680927566763325e-06, "loss": 1.0696, "step": 3579 }, { "epoch": 0.8196908986834573, "grad_norm": 1.418169617652893, "learning_rate": 1.6768134074915277e-06, "loss": 1.0456, "step": 3580 }, { "epoch": 0.8199198626216371, "grad_norm": 1.5103626251220703, "learning_rate": 1.672703828399529e-06, "loss": 1.0815, "step": 3581 }, { "epoch": 0.8201488265598168, "grad_norm": 1.2844433784484863, "learning_rate": 1.6685988317487988e-06, "loss": 1.0614, "step": 3582 }, { "epoch": 0.8203777904979965, "grad_norm": 1.187903881072998, "learning_rate": 1.6644984197982828e-06, "loss": 1.0207, "step": 3583 }, { "epoch": 0.8206067544361763, "grad_norm": 1.2992799282073975, "learning_rate": 1.6604025948043966e-06, "loss": 1.0443, "step": 3584 }, { "epoch": 0.8208357183743561, "grad_norm": 1.6380910873413086, "learning_rate": 1.6563113590210455e-06, "loss": 1.112, "step": 3585 }, { "epoch": 0.8210646823125358, "grad_norm": 1.1910545825958252, "learning_rate": 1.652224714699603e-06, "loss": 1.0636, "step": 3586 }, { "epoch": 0.8212936462507155, "grad_norm": 1.3377147912979126, "learning_rate": 1.6481426640889098e-06, "loss": 1.0045, "step": 3587 }, { "epoch": 0.8215226101888953, "grad_norm": 1.2766714096069336, "learning_rate": 1.6440652094352838e-06, "loss": 1.017, "step": 3588 }, { "epoch": 0.821751574127075, "grad_norm": 1.1633610725402832, "learning_rate": 1.6399923529825213e-06, "loss": 1.0152, "step": 3589 }, { "epoch": 0.8219805380652547, "grad_norm": 1.4114861488342285, "learning_rate": 1.6359240969718748e-06, "loss": 1.0769, "step": 3590 }, { "epoch": 0.8222095020034345, "grad_norm": 1.3905683755874634, "learning_rate": 1.6318604436420738e-06, "loss": 1.0795, "step": 3591 }, { "epoch": 0.8224384659416142, "grad_norm": 1.3522437810897827, "learning_rate": 1.6278013952293115e-06, "loss": 1.0389, "step": 3592 }, { "epoch": 0.8226674298797939, "grad_norm": 1.1281168460845947, "learning_rate": 1.6237469539672479e-06, "loss": 1.0212, "step": 3593 }, { "epoch": 0.8228963938179736, "grad_norm": 1.1827605962753296, "learning_rate": 1.6196971220870105e-06, "loss": 0.9905, "step": 3594 }, { "epoch": 0.8231253577561534, "grad_norm": 1.626893401145935, "learning_rate": 1.6156519018171856e-06, "loss": 1.0374, "step": 3595 }, { "epoch": 0.8233543216943331, "grad_norm": 1.1740373373031616, "learning_rate": 1.6116112953838247e-06, "loss": 1.0092, "step": 3596 }, { "epoch": 0.8235832856325129, "grad_norm": 1.1388554573059082, "learning_rate": 1.6075753050104426e-06, "loss": 1.0141, "step": 3597 }, { "epoch": 0.8238122495706927, "grad_norm": 1.3180172443389893, "learning_rate": 1.6035439329180025e-06, "loss": 1.0853, "step": 3598 }, { "epoch": 0.8240412135088724, "grad_norm": 1.3404852151870728, "learning_rate": 1.5995171813249433e-06, "loss": 1.0734, "step": 3599 }, { "epoch": 0.8242701774470521, "grad_norm": 1.2810182571411133, "learning_rate": 1.5954950524471513e-06, "loss": 1.0471, "step": 3600 }, { "epoch": 0.8244991413852318, "grad_norm": 1.1049981117248535, "learning_rate": 1.591477548497966e-06, "loss": 1.0101, "step": 3601 }, { "epoch": 0.8247281053234116, "grad_norm": 1.3329854011535645, "learning_rate": 1.587464671688187e-06, "loss": 1.058, "step": 3602 }, { "epoch": 0.8249570692615913, "grad_norm": 1.1104480028152466, "learning_rate": 1.583456424226073e-06, "loss": 1.0087, "step": 3603 }, { "epoch": 0.825186033199771, "grad_norm": 1.2844616174697876, "learning_rate": 1.5794528083173223e-06, "loss": 1.0039, "step": 3604 }, { "epoch": 0.8254149971379507, "grad_norm": 1.207993984222412, "learning_rate": 1.575453826165093e-06, "loss": 1.0599, "step": 3605 }, { "epoch": 0.8256439610761305, "grad_norm": 1.4137972593307495, "learning_rate": 1.5714594799699912e-06, "loss": 1.0705, "step": 3606 }, { "epoch": 0.8258729250143102, "grad_norm": 1.3582916259765625, "learning_rate": 1.5674697719300735e-06, "loss": 1.0247, "step": 3607 }, { "epoch": 0.82610188895249, "grad_norm": 1.240177869796753, "learning_rate": 1.5634847042408408e-06, "loss": 1.0207, "step": 3608 }, { "epoch": 0.8263308528906698, "grad_norm": 1.2895662784576416, "learning_rate": 1.5595042790952442e-06, "loss": 0.9647, "step": 3609 }, { "epoch": 0.8265598168288495, "grad_norm": 1.8447550535202026, "learning_rate": 1.5555284986836782e-06, "loss": 1.0683, "step": 3610 }, { "epoch": 0.8267887807670292, "grad_norm": 1.2016741037368774, "learning_rate": 1.551557365193983e-06, "loss": 1.0238, "step": 3611 }, { "epoch": 0.827017744705209, "grad_norm": 1.8298025131225586, "learning_rate": 1.5475908808114325e-06, "loss": 1.0688, "step": 3612 }, { "epoch": 0.8272467086433887, "grad_norm": 1.6220577955245972, "learning_rate": 1.5436290477187589e-06, "loss": 1.0334, "step": 3613 }, { "epoch": 0.8274756725815684, "grad_norm": 1.505828619003296, "learning_rate": 1.539671868096123e-06, "loss": 1.0641, "step": 3614 }, { "epoch": 0.8277046365197481, "grad_norm": 1.131373643875122, "learning_rate": 1.535719344121125e-06, "loss": 1.0156, "step": 3615 }, { "epoch": 0.8279336004579279, "grad_norm": 1.5130774974822998, "learning_rate": 1.5317714779688076e-06, "loss": 1.104, "step": 3616 }, { "epoch": 0.8281625643961076, "grad_norm": 1.2293245792388916, "learning_rate": 1.5278282718116477e-06, "loss": 1.0824, "step": 3617 }, { "epoch": 0.8283915283342873, "grad_norm": 1.3308606147766113, "learning_rate": 1.5238897278195597e-06, "loss": 1.0208, "step": 3618 }, { "epoch": 0.828620492272467, "grad_norm": 1.5616285800933838, "learning_rate": 1.5199558481598908e-06, "loss": 1.0327, "step": 3619 }, { "epoch": 0.8288494562106469, "grad_norm": 1.1585637331008911, "learning_rate": 1.5160266349974207e-06, "loss": 0.9957, "step": 3620 }, { "epoch": 0.8290784201488266, "grad_norm": 1.2559548616409302, "learning_rate": 1.5121020904943651e-06, "loss": 1.0655, "step": 3621 }, { "epoch": 0.8293073840870063, "grad_norm": 1.3206498622894287, "learning_rate": 1.5081822168103654e-06, "loss": 1.0465, "step": 3622 }, { "epoch": 0.829536348025186, "grad_norm": 1.3651444911956787, "learning_rate": 1.5042670161024975e-06, "loss": 1.0738, "step": 3623 }, { "epoch": 0.8297653119633658, "grad_norm": 1.3507568836212158, "learning_rate": 1.500356490525261e-06, "loss": 1.0927, "step": 3624 }, { "epoch": 0.8299942759015455, "grad_norm": 1.1345727443695068, "learning_rate": 1.4964506422305902e-06, "loss": 1.0417, "step": 3625 }, { "epoch": 0.8302232398397252, "grad_norm": 1.2418663501739502, "learning_rate": 1.4925494733678324e-06, "loss": 1.0467, "step": 3626 }, { "epoch": 0.830452203777905, "grad_norm": 1.6180311441421509, "learning_rate": 1.4886529860837772e-06, "loss": 1.0142, "step": 3627 }, { "epoch": 0.8306811677160847, "grad_norm": 1.9630390405654907, "learning_rate": 1.4847611825226227e-06, "loss": 1.1022, "step": 3628 }, { "epoch": 0.8309101316542644, "grad_norm": 1.488043189048767, "learning_rate": 1.4808740648259967e-06, "loss": 1.0521, "step": 3629 }, { "epoch": 0.8311390955924441, "grad_norm": 1.2796021699905396, "learning_rate": 1.4769916351329495e-06, "loss": 1.0249, "step": 3630 }, { "epoch": 0.831368059530624, "grad_norm": 1.1457350254058838, "learning_rate": 1.4731138955799474e-06, "loss": 1.1018, "step": 3631 }, { "epoch": 0.8315970234688037, "grad_norm": 1.1872814893722534, "learning_rate": 1.4692408483008803e-06, "loss": 0.9932, "step": 3632 }, { "epoch": 0.8318259874069834, "grad_norm": 1.2328518629074097, "learning_rate": 1.465372495427052e-06, "loss": 0.9768, "step": 3633 }, { "epoch": 0.8320549513451632, "grad_norm": 1.285885214805603, "learning_rate": 1.4615088390871846e-06, "loss": 1.0506, "step": 3634 }, { "epoch": 0.8322839152833429, "grad_norm": 1.0765000581741333, "learning_rate": 1.457649881407417e-06, "loss": 1.084, "step": 3635 }, { "epoch": 0.8325128792215226, "grad_norm": 1.3982456922531128, "learning_rate": 1.4537956245113006e-06, "loss": 0.9873, "step": 3636 }, { "epoch": 0.8327418431597023, "grad_norm": 1.536367654800415, "learning_rate": 1.4499460705198e-06, "loss": 1.0396, "step": 3637 }, { "epoch": 0.8329708070978821, "grad_norm": 1.5141808986663818, "learning_rate": 1.446101221551296e-06, "loss": 1.0472, "step": 3638 }, { "epoch": 0.8331997710360618, "grad_norm": 1.16829252243042, "learning_rate": 1.4422610797215707e-06, "loss": 1.0189, "step": 3639 }, { "epoch": 0.8334287349742415, "grad_norm": 1.6448733806610107, "learning_rate": 1.4384256471438241e-06, "loss": 1.0127, "step": 3640 }, { "epoch": 0.8336576989124213, "grad_norm": 1.396384358406067, "learning_rate": 1.4345949259286673e-06, "loss": 1.0597, "step": 3641 }, { "epoch": 0.833886662850601, "grad_norm": 1.3862338066101074, "learning_rate": 1.4307689181841077e-06, "loss": 1.0525, "step": 3642 }, { "epoch": 0.8341156267887808, "grad_norm": 1.409687876701355, "learning_rate": 1.4269476260155668e-06, "loss": 1.0834, "step": 3643 }, { "epoch": 0.8343445907269605, "grad_norm": 1.310608983039856, "learning_rate": 1.4231310515258745e-06, "loss": 1.0362, "step": 3644 }, { "epoch": 0.8345735546651403, "grad_norm": 1.2230523824691772, "learning_rate": 1.4193191968152543e-06, "loss": 0.9915, "step": 3645 }, { "epoch": 0.83480251860332, "grad_norm": 1.3876858949661255, "learning_rate": 1.4155120639813392e-06, "loss": 0.9453, "step": 3646 }, { "epoch": 0.8350314825414997, "grad_norm": 1.223950743675232, "learning_rate": 1.4117096551191633e-06, "loss": 1.011, "step": 3647 }, { "epoch": 0.8352604464796795, "grad_norm": 1.2806096076965332, "learning_rate": 1.4079119723211599e-06, "loss": 1.0136, "step": 3648 }, { "epoch": 0.8354894104178592, "grad_norm": 1.2026371955871582, "learning_rate": 1.4041190176771635e-06, "loss": 0.9929, "step": 3649 }, { "epoch": 0.8357183743560389, "grad_norm": 1.4160505533218384, "learning_rate": 1.4003307932744003e-06, "loss": 1.0733, "step": 3650 }, { "epoch": 0.8359473382942186, "grad_norm": 1.2385631799697876, "learning_rate": 1.396547301197504e-06, "loss": 1.0789, "step": 3651 }, { "epoch": 0.8361763022323984, "grad_norm": 1.875428557395935, "learning_rate": 1.3927685435284977e-06, "loss": 1.0522, "step": 3652 }, { "epoch": 0.8364052661705781, "grad_norm": 1.4530951976776123, "learning_rate": 1.388994522346796e-06, "loss": 1.0203, "step": 3653 }, { "epoch": 0.8366342301087579, "grad_norm": 1.4752883911132812, "learning_rate": 1.3852252397292143e-06, "loss": 0.9734, "step": 3654 }, { "epoch": 0.8368631940469377, "grad_norm": 1.1157587766647339, "learning_rate": 1.38146069774996e-06, "loss": 0.9831, "step": 3655 }, { "epoch": 0.8370921579851174, "grad_norm": 1.351131558418274, "learning_rate": 1.377700898480624e-06, "loss": 1.0292, "step": 3656 }, { "epoch": 0.8373211219232971, "grad_norm": 1.5354865789413452, "learning_rate": 1.373945843990192e-06, "loss": 0.9554, "step": 3657 }, { "epoch": 0.8375500858614768, "grad_norm": 1.1356061697006226, "learning_rate": 1.3701955363450447e-06, "loss": 1.0667, "step": 3658 }, { "epoch": 0.8377790497996566, "grad_norm": 1.2410647869110107, "learning_rate": 1.3664499776089401e-06, "loss": 1.0254, "step": 3659 }, { "epoch": 0.8380080137378363, "grad_norm": 1.2936856746673584, "learning_rate": 1.3627091698430284e-06, "loss": 0.9986, "step": 3660 }, { "epoch": 0.838236977676016, "grad_norm": 1.256300687789917, "learning_rate": 1.3589731151058461e-06, "loss": 1.0607, "step": 3661 }, { "epoch": 0.8384659416141957, "grad_norm": 1.4257123470306396, "learning_rate": 1.355241815453312e-06, "loss": 1.0198, "step": 3662 }, { "epoch": 0.8386949055523755, "grad_norm": 1.3414442539215088, "learning_rate": 1.3515152729387315e-06, "loss": 1.045, "step": 3663 }, { "epoch": 0.8389238694905552, "grad_norm": 1.2866592407226562, "learning_rate": 1.347793489612782e-06, "loss": 0.9788, "step": 3664 }, { "epoch": 0.8391528334287349, "grad_norm": 2.8504953384399414, "learning_rate": 1.3440764675235384e-06, "loss": 0.9914, "step": 3665 }, { "epoch": 0.8393817973669148, "grad_norm": 1.2402368783950806, "learning_rate": 1.3403642087164447e-06, "loss": 1.0267, "step": 3666 }, { "epoch": 0.8396107613050945, "grad_norm": 1.9054492712020874, "learning_rate": 1.33665671523432e-06, "loss": 1.0689, "step": 3667 }, { "epoch": 0.8398397252432742, "grad_norm": 1.5320415496826172, "learning_rate": 1.332953989117377e-06, "loss": 1.025, "step": 3668 }, { "epoch": 0.8400686891814539, "grad_norm": 1.2526971101760864, "learning_rate": 1.3292560324031867e-06, "loss": 1.0578, "step": 3669 }, { "epoch": 0.8402976531196337, "grad_norm": 1.3369795083999634, "learning_rate": 1.3255628471267056e-06, "loss": 1.0097, "step": 3670 }, { "epoch": 0.8405266170578134, "grad_norm": 1.3616974353790283, "learning_rate": 1.321874435320264e-06, "loss": 1.0137, "step": 3671 }, { "epoch": 0.8407555809959931, "grad_norm": 1.156179428100586, "learning_rate": 1.3181907990135624e-06, "loss": 1.0683, "step": 3672 }, { "epoch": 0.8409845449341729, "grad_norm": 1.5307315587997437, "learning_rate": 1.3145119402336758e-06, "loss": 1.0354, "step": 3673 }, { "epoch": 0.8412135088723526, "grad_norm": 1.272509217262268, "learning_rate": 1.3108378610050498e-06, "loss": 1.068, "step": 3674 }, { "epoch": 0.8414424728105323, "grad_norm": 1.1754751205444336, "learning_rate": 1.307168563349499e-06, "loss": 1.0448, "step": 3675 }, { "epoch": 0.841671436748712, "grad_norm": 1.483634352684021, "learning_rate": 1.3035040492862061e-06, "loss": 1.0705, "step": 3676 }, { "epoch": 0.8419004006868919, "grad_norm": 1.1576098203659058, "learning_rate": 1.2998443208317246e-06, "loss": 1.1061, "step": 3677 }, { "epoch": 0.8421293646250716, "grad_norm": 1.2855072021484375, "learning_rate": 1.2961893799999703e-06, "loss": 1.0096, "step": 3678 }, { "epoch": 0.8423583285632513, "grad_norm": 1.4712800979614258, "learning_rate": 1.2925392288022299e-06, "loss": 0.9488, "step": 3679 }, { "epoch": 0.842587292501431, "grad_norm": 1.539665937423706, "learning_rate": 1.2888938692471464e-06, "loss": 1.0338, "step": 3680 }, { "epoch": 0.8428162564396108, "grad_norm": 1.1315950155258179, "learning_rate": 1.28525330334073e-06, "loss": 1.0255, "step": 3681 }, { "epoch": 0.8430452203777905, "grad_norm": 1.3212192058563232, "learning_rate": 1.2816175330863613e-06, "loss": 0.9955, "step": 3682 }, { "epoch": 0.8432741843159702, "grad_norm": 1.3658416271209717, "learning_rate": 1.2779865604847674e-06, "loss": 1.0136, "step": 3683 }, { "epoch": 0.84350314825415, "grad_norm": 1.8101247549057007, "learning_rate": 1.2743603875340426e-06, "loss": 1.0044, "step": 3684 }, { "epoch": 0.8437321121923297, "grad_norm": 1.4984769821166992, "learning_rate": 1.270739016229642e-06, "loss": 1.0216, "step": 3685 }, { "epoch": 0.8439610761305094, "grad_norm": 1.2391111850738525, "learning_rate": 1.267122448564374e-06, "loss": 1.0306, "step": 3686 }, { "epoch": 0.8441900400686891, "grad_norm": 1.167498230934143, "learning_rate": 1.2635106865284063e-06, "loss": 0.9658, "step": 3687 }, { "epoch": 0.8444190040068689, "grad_norm": 1.7382392883300781, "learning_rate": 1.259903732109261e-06, "loss": 0.9892, "step": 3688 }, { "epoch": 0.8446479679450487, "grad_norm": 1.2146246433258057, "learning_rate": 1.256301587291815e-06, "loss": 1.0496, "step": 3689 }, { "epoch": 0.8448769318832284, "grad_norm": 1.7516138553619385, "learning_rate": 1.2527042540583e-06, "loss": 1.0987, "step": 3690 }, { "epoch": 0.8451058958214082, "grad_norm": 1.5586003065109253, "learning_rate": 1.249111734388292e-06, "loss": 1.0333, "step": 3691 }, { "epoch": 0.8453348597595879, "grad_norm": 1.4014748334884644, "learning_rate": 1.2455240302587325e-06, "loss": 1.0038, "step": 3692 }, { "epoch": 0.8455638236977676, "grad_norm": 1.3031091690063477, "learning_rate": 1.2419411436439021e-06, "loss": 1.0135, "step": 3693 }, { "epoch": 0.8457927876359473, "grad_norm": 1.1367318630218506, "learning_rate": 1.238363076515432e-06, "loss": 1.0154, "step": 3694 }, { "epoch": 0.8460217515741271, "grad_norm": 1.4096239805221558, "learning_rate": 1.2347898308423012e-06, "loss": 1.0172, "step": 3695 }, { "epoch": 0.8462507155123068, "grad_norm": 1.2547544240951538, "learning_rate": 1.2312214085908424e-06, "loss": 0.9901, "step": 3696 }, { "epoch": 0.8464796794504865, "grad_norm": 1.4089115858078003, "learning_rate": 1.227657811724723e-06, "loss": 1.0461, "step": 3697 }, { "epoch": 0.8467086433886662, "grad_norm": 1.6553548574447632, "learning_rate": 1.2240990422049625e-06, "loss": 1.0419, "step": 3698 }, { "epoch": 0.846937607326846, "grad_norm": 1.4773091077804565, "learning_rate": 1.2205451019899217e-06, "loss": 1.0423, "step": 3699 }, { "epoch": 0.8471665712650257, "grad_norm": 1.2673395872116089, "learning_rate": 1.2169959930353049e-06, "loss": 1.0627, "step": 3700 }, { "epoch": 0.8473955352032055, "grad_norm": 1.941409707069397, "learning_rate": 1.2134517172941563e-06, "loss": 0.9947, "step": 3701 }, { "epoch": 0.8476244991413853, "grad_norm": 2.137348175048828, "learning_rate": 1.2099122767168602e-06, "loss": 1.009, "step": 3702 }, { "epoch": 0.847853463079565, "grad_norm": 1.4081718921661377, "learning_rate": 1.2063776732511434e-06, "loss": 0.9911, "step": 3703 }, { "epoch": 0.8480824270177447, "grad_norm": 1.2987362146377563, "learning_rate": 1.2028479088420686e-06, "loss": 1.0329, "step": 3704 }, { "epoch": 0.8483113909559244, "grad_norm": 1.554642677307129, "learning_rate": 1.19932298543203e-06, "loss": 1.0258, "step": 3705 }, { "epoch": 0.8485403548941042, "grad_norm": 1.3535888195037842, "learning_rate": 1.19580290496077e-06, "loss": 1.0652, "step": 3706 }, { "epoch": 0.8487693188322839, "grad_norm": 1.258093237876892, "learning_rate": 1.1922876693653584e-06, "loss": 1.0323, "step": 3707 }, { "epoch": 0.8489982827704636, "grad_norm": 1.1866981983184814, "learning_rate": 1.1887772805801967e-06, "loss": 1.0644, "step": 3708 }, { "epoch": 0.8492272467086434, "grad_norm": 1.5237160921096802, "learning_rate": 1.1852717405370228e-06, "loss": 0.9952, "step": 3709 }, { "epoch": 0.8494562106468231, "grad_norm": 1.3863621950149536, "learning_rate": 1.1817710511649105e-06, "loss": 0.9737, "step": 3710 }, { "epoch": 0.8496851745850028, "grad_norm": 1.380892038345337, "learning_rate": 1.1782752143902553e-06, "loss": 0.9958, "step": 3711 }, { "epoch": 0.8499141385231826, "grad_norm": 1.3540898561477661, "learning_rate": 1.1747842321367886e-06, "loss": 1.0295, "step": 3712 }, { "epoch": 0.8501431024613624, "grad_norm": 1.2603505849838257, "learning_rate": 1.171298106325568e-06, "loss": 1.0327, "step": 3713 }, { "epoch": 0.8503720663995421, "grad_norm": 1.7046961784362793, "learning_rate": 1.1678168388749788e-06, "loss": 1.0873, "step": 3714 }, { "epoch": 0.8506010303377218, "grad_norm": 1.4969935417175293, "learning_rate": 1.1643404317007345e-06, "loss": 1.0087, "step": 3715 }, { "epoch": 0.8508299942759016, "grad_norm": 1.0810832977294922, "learning_rate": 1.1608688867158724e-06, "loss": 0.9596, "step": 3716 }, { "epoch": 0.8510589582140813, "grad_norm": 1.7231873273849487, "learning_rate": 1.1574022058307555e-06, "loss": 1.0446, "step": 3717 }, { "epoch": 0.851287922152261, "grad_norm": 1.4240765571594238, "learning_rate": 1.1539403909530688e-06, "loss": 1.0087, "step": 3718 }, { "epoch": 0.8515168860904407, "grad_norm": 1.1195824146270752, "learning_rate": 1.1504834439878166e-06, "loss": 1.006, "step": 3719 }, { "epoch": 0.8517458500286205, "grad_norm": 1.3876497745513916, "learning_rate": 1.1470313668373324e-06, "loss": 1.0934, "step": 3720 }, { "epoch": 0.8519748139668002, "grad_norm": 1.3672479391098022, "learning_rate": 1.1435841614012666e-06, "loss": 1.0914, "step": 3721 }, { "epoch": 0.8522037779049799, "grad_norm": 1.1950000524520874, "learning_rate": 1.140141829576582e-06, "loss": 1.0035, "step": 3722 }, { "epoch": 0.8524327418431596, "grad_norm": 1.6234666109085083, "learning_rate": 1.1367043732575666e-06, "loss": 1.0398, "step": 3723 }, { "epoch": 0.8526617057813395, "grad_norm": 1.2520289421081543, "learning_rate": 1.1332717943358263e-06, "loss": 0.9795, "step": 3724 }, { "epoch": 0.8528906697195192, "grad_norm": 1.3402252197265625, "learning_rate": 1.1298440947002775e-06, "loss": 1.0576, "step": 3725 }, { "epoch": 0.8531196336576989, "grad_norm": 1.3803917169570923, "learning_rate": 1.1264212762371563e-06, "loss": 1.0388, "step": 3726 }, { "epoch": 0.8533485975958787, "grad_norm": 1.1965564489364624, "learning_rate": 1.1230033408300111e-06, "loss": 1.0416, "step": 3727 }, { "epoch": 0.8535775615340584, "grad_norm": 1.4472531080245972, "learning_rate": 1.1195902903597023e-06, "loss": 1.0545, "step": 3728 }, { "epoch": 0.8538065254722381, "grad_norm": 2.9633491039276123, "learning_rate": 1.1161821267044038e-06, "loss": 1.0308, "step": 3729 }, { "epoch": 0.8540354894104178, "grad_norm": 3.640618324279785, "learning_rate": 1.1127788517395987e-06, "loss": 0.9939, "step": 3730 }, { "epoch": 0.8542644533485976, "grad_norm": 1.3619310855865479, "learning_rate": 1.1093804673380804e-06, "loss": 1.0465, "step": 3731 }, { "epoch": 0.8544934172867773, "grad_norm": 1.3596311807632446, "learning_rate": 1.1059869753699547e-06, "loss": 1.0179, "step": 3732 }, { "epoch": 0.854722381224957, "grad_norm": 1.5578163862228394, "learning_rate": 1.102598377702625e-06, "loss": 0.9975, "step": 3733 }, { "epoch": 0.8549513451631368, "grad_norm": 1.3997864723205566, "learning_rate": 1.099214676200816e-06, "loss": 1.023, "step": 3734 }, { "epoch": 0.8551803091013166, "grad_norm": 1.4430853128433228, "learning_rate": 1.0958358727265438e-06, "loss": 1.0851, "step": 3735 }, { "epoch": 0.8554092730394963, "grad_norm": 1.0692206621170044, "learning_rate": 1.092461969139137e-06, "loss": 1.0646, "step": 3736 }, { "epoch": 0.855638236977676, "grad_norm": 1.4968456029891968, "learning_rate": 1.089092967295231e-06, "loss": 1.0341, "step": 3737 }, { "epoch": 0.8558672009158558, "grad_norm": 1.3507084846496582, "learning_rate": 1.0857288690487555e-06, "loss": 1.0051, "step": 3738 }, { "epoch": 0.8560961648540355, "grad_norm": 1.3483790159225464, "learning_rate": 1.082369676250945e-06, "loss": 1.0424, "step": 3739 }, { "epoch": 0.8563251287922152, "grad_norm": 1.4069323539733887, "learning_rate": 1.0790153907503364e-06, "loss": 1.0422, "step": 3740 }, { "epoch": 0.856554092730395, "grad_norm": 1.629030704498291, "learning_rate": 1.0756660143927644e-06, "loss": 1.0405, "step": 3741 }, { "epoch": 0.8567830566685747, "grad_norm": 1.409432291984558, "learning_rate": 1.0723215490213635e-06, "loss": 1.0972, "step": 3742 }, { "epoch": 0.8570120206067544, "grad_norm": 1.2074227333068848, "learning_rate": 1.0689819964765646e-06, "loss": 1.0264, "step": 3743 }, { "epoch": 0.8572409845449341, "grad_norm": 1.379439115524292, "learning_rate": 1.0656473585960946e-06, "loss": 0.9715, "step": 3744 }, { "epoch": 0.8574699484831139, "grad_norm": 1.0981643199920654, "learning_rate": 1.0623176372149802e-06, "loss": 1.0236, "step": 3745 }, { "epoch": 0.8576989124212936, "grad_norm": 1.303014874458313, "learning_rate": 1.0589928341655342e-06, "loss": 1.0092, "step": 3746 }, { "epoch": 0.8579278763594734, "grad_norm": 1.1583458185195923, "learning_rate": 1.0556729512773679e-06, "loss": 1.0262, "step": 3747 }, { "epoch": 0.8581568402976532, "grad_norm": 1.2592904567718506, "learning_rate": 1.0523579903773917e-06, "loss": 1.0341, "step": 3748 }, { "epoch": 0.8583858042358329, "grad_norm": 1.2319550514221191, "learning_rate": 1.0490479532897946e-06, "loss": 1.056, "step": 3749 }, { "epoch": 0.8586147681740126, "grad_norm": 1.1954131126403809, "learning_rate": 1.0457428418360616e-06, "loss": 1.0415, "step": 3750 }, { "epoch": 0.8588437321121923, "grad_norm": 1.6190708875656128, "learning_rate": 1.0424426578349733e-06, "loss": 1.0591, "step": 3751 }, { "epoch": 0.8590726960503721, "grad_norm": 1.1923547983169556, "learning_rate": 1.0391474031025895e-06, "loss": 1.0615, "step": 3752 }, { "epoch": 0.8593016599885518, "grad_norm": 1.4358948469161987, "learning_rate": 1.0358570794522615e-06, "loss": 0.9408, "step": 3753 }, { "epoch": 0.8595306239267315, "grad_norm": 1.3710299730300903, "learning_rate": 1.0325716886946268e-06, "loss": 1.0515, "step": 3754 }, { "epoch": 0.8597595878649112, "grad_norm": 1.4297199249267578, "learning_rate": 1.0292912326376091e-06, "loss": 0.9476, "step": 3755 }, { "epoch": 0.859988551803091, "grad_norm": 1.2614961862564087, "learning_rate": 1.0260157130864178e-06, "loss": 1.1242, "step": 3756 }, { "epoch": 0.8602175157412707, "grad_norm": 1.7146847248077393, "learning_rate": 1.0227451318435378e-06, "loss": 1.0826, "step": 3757 }, { "epoch": 0.8604464796794505, "grad_norm": 1.4658128023147583, "learning_rate": 1.019479490708748e-06, "loss": 1.0325, "step": 3758 }, { "epoch": 0.8606754436176303, "grad_norm": 1.323593258857727, "learning_rate": 1.0162187914791045e-06, "loss": 1.0018, "step": 3759 }, { "epoch": 0.86090440755581, "grad_norm": 1.5773340463638306, "learning_rate": 1.0129630359489352e-06, "loss": 0.941, "step": 3760 }, { "epoch": 0.8611333714939897, "grad_norm": 1.2028287649154663, "learning_rate": 1.0097122259098625e-06, "loss": 0.9942, "step": 3761 }, { "epoch": 0.8613623354321694, "grad_norm": 1.2747315168380737, "learning_rate": 1.0064663631507787e-06, "loss": 1.0371, "step": 3762 }, { "epoch": 0.8615912993703492, "grad_norm": 1.2724723815917969, "learning_rate": 1.0032254494578519e-06, "loss": 1.0712, "step": 3763 }, { "epoch": 0.8618202633085289, "grad_norm": 1.3098310232162476, "learning_rate": 9.999894866145287e-07, "loss": 1.0679, "step": 3764 }, { "epoch": 0.8620492272467086, "grad_norm": 1.2499535083770752, "learning_rate": 9.967584764015392e-07, "loss": 1.0339, "step": 3765 }, { "epoch": 0.8622781911848884, "grad_norm": 1.4696881771087646, "learning_rate": 9.935324205968744e-07, "loss": 1.0509, "step": 3766 }, { "epoch": 0.8625071551230681, "grad_norm": 1.3763833045959473, "learning_rate": 9.903113209758098e-07, "loss": 1.0162, "step": 3767 }, { "epoch": 0.8627361190612478, "grad_norm": 1.3132450580596924, "learning_rate": 9.870951793108863e-07, "loss": 1.031, "step": 3768 }, { "epoch": 0.8629650829994275, "grad_norm": 1.5283464193344116, "learning_rate": 9.838839973719227e-07, "loss": 1.0361, "step": 3769 }, { "epoch": 0.8631940469376074, "grad_norm": 1.428132176399231, "learning_rate": 9.806777769260034e-07, "loss": 1.0075, "step": 3770 }, { "epoch": 0.8634230108757871, "grad_norm": 1.229541540145874, "learning_rate": 9.774765197374847e-07, "loss": 1.0586, "step": 3771 }, { "epoch": 0.8636519748139668, "grad_norm": 1.8992414474487305, "learning_rate": 9.742802275679941e-07, "loss": 1.0236, "step": 3772 }, { "epoch": 0.8638809387521466, "grad_norm": 1.2058573961257935, "learning_rate": 9.710889021764235e-07, "loss": 0.9526, "step": 3773 }, { "epoch": 0.8641099026903263, "grad_norm": 1.2776075601577759, "learning_rate": 9.679025453189273e-07, "loss": 1.0135, "step": 3774 }, { "epoch": 0.864338866628506, "grad_norm": 1.5076816082000732, "learning_rate": 9.64721158748938e-07, "loss": 1.035, "step": 3775 }, { "epoch": 0.8645678305666857, "grad_norm": 3.022805690765381, "learning_rate": 9.615447442171444e-07, "loss": 1.0242, "step": 3776 }, { "epoch": 0.8647967945048655, "grad_norm": 1.037968635559082, "learning_rate": 9.583733034714982e-07, "loss": 1.0302, "step": 3777 }, { "epoch": 0.8650257584430452, "grad_norm": 1.6912962198257446, "learning_rate": 9.55206838257219e-07, "loss": 0.9981, "step": 3778 }, { "epoch": 0.8652547223812249, "grad_norm": 1.1713199615478516, "learning_rate": 9.520453503167837e-07, "loss": 0.9741, "step": 3779 }, { "epoch": 0.8654836863194046, "grad_norm": 1.2378156185150146, "learning_rate": 9.488888413899345e-07, "loss": 1.0817, "step": 3780 }, { "epoch": 0.8657126502575845, "grad_norm": 1.2005176544189453, "learning_rate": 9.457373132136716e-07, "loss": 1.0458, "step": 3781 }, { "epoch": 0.8659416141957642, "grad_norm": 2.108745574951172, "learning_rate": 9.425907675222535e-07, "loss": 1.1055, "step": 3782 }, { "epoch": 0.8661705781339439, "grad_norm": 1.631432294845581, "learning_rate": 9.394492060471994e-07, "loss": 1.0514, "step": 3783 }, { "epoch": 0.8663995420721237, "grad_norm": 1.4168133735656738, "learning_rate": 9.363126305172831e-07, "loss": 1.0468, "step": 3784 }, { "epoch": 0.8666285060103034, "grad_norm": 1.2942928075790405, "learning_rate": 9.331810426585364e-07, "loss": 1.0393, "step": 3785 }, { "epoch": 0.8668574699484831, "grad_norm": 1.3363865613937378, "learning_rate": 9.300544441942461e-07, "loss": 1.0378, "step": 3786 }, { "epoch": 0.8670864338866628, "grad_norm": 1.569215178489685, "learning_rate": 9.269328368449538e-07, "loss": 1.046, "step": 3787 }, { "epoch": 0.8673153978248426, "grad_norm": 1.158187985420227, "learning_rate": 9.238162223284497e-07, "loss": 1.0903, "step": 3788 }, { "epoch": 0.8675443617630223, "grad_norm": 2.184824228286743, "learning_rate": 9.207046023597866e-07, "loss": 1.0087, "step": 3789 }, { "epoch": 0.867773325701202, "grad_norm": 1.4404144287109375, "learning_rate": 9.175979786512589e-07, "loss": 1.0473, "step": 3790 }, { "epoch": 0.8680022896393818, "grad_norm": 1.384673833847046, "learning_rate": 9.144963529124163e-07, "loss": 1.0535, "step": 3791 }, { "epoch": 0.8682312535775615, "grad_norm": 1.3582277297973633, "learning_rate": 9.113997268500574e-07, "loss": 1.0592, "step": 3792 }, { "epoch": 0.8684602175157413, "grad_norm": 1.1262242794036865, "learning_rate": 9.083081021682305e-07, "loss": 1.0654, "step": 3793 }, { "epoch": 0.868689181453921, "grad_norm": 1.1574608087539673, "learning_rate": 9.052214805682303e-07, "loss": 1.065, "step": 3794 }, { "epoch": 0.8689181453921008, "grad_norm": 1.3190420866012573, "learning_rate": 9.021398637485979e-07, "loss": 0.9962, "step": 3795 }, { "epoch": 0.8691471093302805, "grad_norm": 4.816401958465576, "learning_rate": 8.990632534051224e-07, "loss": 0.9806, "step": 3796 }, { "epoch": 0.8693760732684602, "grad_norm": 1.8604696989059448, "learning_rate": 8.959916512308387e-07, "loss": 1.0577, "step": 3797 }, { "epoch": 0.86960503720664, "grad_norm": 1.3026139736175537, "learning_rate": 8.929250589160166e-07, "loss": 1.0274, "step": 3798 }, { "epoch": 0.8698340011448197, "grad_norm": 1.3048547506332397, "learning_rate": 8.898634781481829e-07, "loss": 1.0608, "step": 3799 }, { "epoch": 0.8700629650829994, "grad_norm": 1.4611790180206299, "learning_rate": 8.868069106121002e-07, "loss": 1.0704, "step": 3800 }, { "epoch": 0.8702919290211791, "grad_norm": 1.360741138458252, "learning_rate": 8.837553579897673e-07, "loss": 1.018, "step": 3801 }, { "epoch": 0.8705208929593589, "grad_norm": 1.1581453084945679, "learning_rate": 8.807088219604288e-07, "loss": 1.0178, "step": 3802 }, { "epoch": 0.8707498568975386, "grad_norm": 1.437386393547058, "learning_rate": 8.776673042005724e-07, "loss": 1.062, "step": 3803 }, { "epoch": 0.8709788208357184, "grad_norm": 1.3421489000320435, "learning_rate": 8.74630806383916e-07, "loss": 0.9713, "step": 3804 }, { "epoch": 0.8712077847738982, "grad_norm": 1.095431923866272, "learning_rate": 8.715993301814174e-07, "loss": 0.9333, "step": 3805 }, { "epoch": 0.8714367487120779, "grad_norm": 1.4182848930358887, "learning_rate": 8.685728772612778e-07, "loss": 1.0209, "step": 3806 }, { "epoch": 0.8716657126502576, "grad_norm": 1.2892870903015137, "learning_rate": 8.655514492889249e-07, "loss": 1.048, "step": 3807 }, { "epoch": 0.8718946765884373, "grad_norm": 1.217413306236267, "learning_rate": 8.625350479270255e-07, "loss": 1.0019, "step": 3808 }, { "epoch": 0.8721236405266171, "grad_norm": 1.2366745471954346, "learning_rate": 8.595236748354807e-07, "loss": 1.042, "step": 3809 }, { "epoch": 0.8723526044647968, "grad_norm": 1.248071312904358, "learning_rate": 8.565173316714249e-07, "loss": 1.0241, "step": 3810 }, { "epoch": 0.8725815684029765, "grad_norm": 1.2598704099655151, "learning_rate": 8.535160200892234e-07, "loss": 1.0653, "step": 3811 }, { "epoch": 0.8728105323411562, "grad_norm": 1.4132360219955444, "learning_rate": 8.505197417404687e-07, "loss": 1.0395, "step": 3812 }, { "epoch": 0.873039496279336, "grad_norm": 1.2241437435150146, "learning_rate": 8.475284982739917e-07, "loss": 1.0056, "step": 3813 }, { "epoch": 0.8732684602175157, "grad_norm": 3.2831687927246094, "learning_rate": 8.445422913358503e-07, "loss": 1.047, "step": 3814 }, { "epoch": 0.8734974241556954, "grad_norm": 1.535780668258667, "learning_rate": 8.415611225693254e-07, "loss": 1.0187, "step": 3815 }, { "epoch": 0.8737263880938753, "grad_norm": 1.2877063751220703, "learning_rate": 8.385849936149282e-07, "loss": 0.9162, "step": 3816 }, { "epoch": 0.873955352032055, "grad_norm": 1.2718114852905273, "learning_rate": 8.356139061104029e-07, "loss": 1.0094, "step": 3817 }, { "epoch": 0.8741843159702347, "grad_norm": 1.4520769119262695, "learning_rate": 8.326478616907085e-07, "loss": 1.0206, "step": 3818 }, { "epoch": 0.8744132799084144, "grad_norm": 2.025461435317993, "learning_rate": 8.296868619880372e-07, "loss": 1.0811, "step": 3819 }, { "epoch": 0.8746422438465942, "grad_norm": 1.302669882774353, "learning_rate": 8.26730908631802e-07, "loss": 1.038, "step": 3820 }, { "epoch": 0.8748712077847739, "grad_norm": 1.3168150186538696, "learning_rate": 8.237800032486387e-07, "loss": 1.0384, "step": 3821 }, { "epoch": 0.8751001717229536, "grad_norm": 1.5246906280517578, "learning_rate": 8.208341474624071e-07, "loss": 1.0494, "step": 3822 }, { "epoch": 0.8753291356611334, "grad_norm": 2.0702121257781982, "learning_rate": 8.178933428941849e-07, "loss": 1.0735, "step": 3823 }, { "epoch": 0.8755580995993131, "grad_norm": 1.1960113048553467, "learning_rate": 8.149575911622731e-07, "loss": 1.0311, "step": 3824 }, { "epoch": 0.8757870635374928, "grad_norm": 1.2554943561553955, "learning_rate": 8.12026893882194e-07, "loss": 1.0279, "step": 3825 }, { "epoch": 0.8760160274756725, "grad_norm": 1.2481484413146973, "learning_rate": 8.091012526666797e-07, "loss": 1.0473, "step": 3826 }, { "epoch": 0.8762449914138524, "grad_norm": 1.5274428129196167, "learning_rate": 8.061806691256913e-07, "loss": 1.0147, "step": 3827 }, { "epoch": 0.8764739553520321, "grad_norm": 1.132452130317688, "learning_rate": 8.032651448664009e-07, "loss": 1.0577, "step": 3828 }, { "epoch": 0.8767029192902118, "grad_norm": 1.3138935565948486, "learning_rate": 8.003546814931917e-07, "loss": 1.0043, "step": 3829 }, { "epoch": 0.8769318832283916, "grad_norm": 1.1806259155273438, "learning_rate": 7.974492806076762e-07, "loss": 1.0525, "step": 3830 }, { "epoch": 0.8771608471665713, "grad_norm": 1.3222166299819946, "learning_rate": 7.945489438086651e-07, "loss": 1.092, "step": 3831 }, { "epoch": 0.877389811104751, "grad_norm": 1.3098158836364746, "learning_rate": 7.91653672692192e-07, "loss": 1.0704, "step": 3832 }, { "epoch": 0.8776187750429307, "grad_norm": 1.5300372838974, "learning_rate": 7.887634688515e-07, "loss": 1.0192, "step": 3833 }, { "epoch": 0.8778477389811105, "grad_norm": 1.5282765626907349, "learning_rate": 7.858783338770437e-07, "loss": 1.0689, "step": 3834 }, { "epoch": 0.8780767029192902, "grad_norm": 1.0936659574508667, "learning_rate": 7.829982693564886e-07, "loss": 1.0448, "step": 3835 }, { "epoch": 0.8783056668574699, "grad_norm": 1.1930863857269287, "learning_rate": 7.801232768747113e-07, "loss": 0.9976, "step": 3836 }, { "epoch": 0.8785346307956496, "grad_norm": 1.5860801935195923, "learning_rate": 7.772533580137942e-07, "loss": 0.9733, "step": 3837 }, { "epoch": 0.8787635947338294, "grad_norm": 1.3473610877990723, "learning_rate": 7.743885143530317e-07, "loss": 1.0056, "step": 3838 }, { "epoch": 0.8789925586720092, "grad_norm": 1.2198975086212158, "learning_rate": 7.715287474689237e-07, "loss": 1.0589, "step": 3839 }, { "epoch": 0.8792215226101889, "grad_norm": 2.675382614135742, "learning_rate": 7.686740589351704e-07, "loss": 1.021, "step": 3840 }, { "epoch": 0.8794504865483687, "grad_norm": 1.468649983406067, "learning_rate": 7.658244503226909e-07, "loss": 1.0378, "step": 3841 }, { "epoch": 0.8796794504865484, "grad_norm": 1.561955213546753, "learning_rate": 7.629799231995961e-07, "loss": 1.0173, "step": 3842 }, { "epoch": 0.8799084144247281, "grad_norm": 1.2610421180725098, "learning_rate": 7.601404791312028e-07, "loss": 1.0568, "step": 3843 }, { "epoch": 0.8801373783629078, "grad_norm": 1.5121142864227295, "learning_rate": 7.573061196800414e-07, "loss": 0.9959, "step": 3844 }, { "epoch": 0.8803663423010876, "grad_norm": 1.1338526010513306, "learning_rate": 7.544768464058294e-07, "loss": 0.9543, "step": 3845 }, { "epoch": 0.8805953062392673, "grad_norm": 1.4656891822814941, "learning_rate": 7.51652660865495e-07, "loss": 1.0345, "step": 3846 }, { "epoch": 0.880824270177447, "grad_norm": 1.067690372467041, "learning_rate": 7.488335646131628e-07, "loss": 1.001, "step": 3847 }, { "epoch": 0.8810532341156267, "grad_norm": 1.3866279125213623, "learning_rate": 7.460195592001585e-07, "loss": 0.9599, "step": 3848 }, { "epoch": 0.8812821980538065, "grad_norm": 1.3089559078216553, "learning_rate": 7.432106461750044e-07, "loss": 1.0318, "step": 3849 }, { "epoch": 0.8815111619919863, "grad_norm": 1.1162192821502686, "learning_rate": 7.40406827083423e-07, "loss": 1.0282, "step": 3850 }, { "epoch": 0.881740125930166, "grad_norm": 1.7417439222335815, "learning_rate": 7.376081034683325e-07, "loss": 1.0748, "step": 3851 }, { "epoch": 0.8819690898683458, "grad_norm": 1.5468460321426392, "learning_rate": 7.348144768698485e-07, "loss": 1.0649, "step": 3852 }, { "epoch": 0.8821980538065255, "grad_norm": 1.8217270374298096, "learning_rate": 7.320259488252757e-07, "loss": 1.0416, "step": 3853 }, { "epoch": 0.8824270177447052, "grad_norm": 1.378803014755249, "learning_rate": 7.292425208691212e-07, "loss": 1.0032, "step": 3854 }, { "epoch": 0.882655981682885, "grad_norm": 1.5163785219192505, "learning_rate": 7.26464194533083e-07, "loss": 1.0449, "step": 3855 }, { "epoch": 0.8828849456210647, "grad_norm": 1.268101692199707, "learning_rate": 7.236909713460482e-07, "loss": 1.0759, "step": 3856 }, { "epoch": 0.8831139095592444, "grad_norm": 1.5246165990829468, "learning_rate": 7.209228528340972e-07, "loss": 1.0565, "step": 3857 }, { "epoch": 0.8833428734974241, "grad_norm": 1.3627903461456299, "learning_rate": 7.181598405205082e-07, "loss": 1.0487, "step": 3858 }, { "epoch": 0.8835718374356039, "grad_norm": 1.2992089986801147, "learning_rate": 7.154019359257381e-07, "loss": 1.0759, "step": 3859 }, { "epoch": 0.8838008013737836, "grad_norm": 1.353250503540039, "learning_rate": 7.126491405674407e-07, "loss": 1.0197, "step": 3860 }, { "epoch": 0.8840297653119633, "grad_norm": 1.206063985824585, "learning_rate": 7.099014559604556e-07, "loss": 1.0341, "step": 3861 }, { "epoch": 0.8842587292501431, "grad_norm": 1.488411784172058, "learning_rate": 7.071588836168109e-07, "loss": 0.992, "step": 3862 }, { "epoch": 0.8844876931883229, "grad_norm": 1.6317309141159058, "learning_rate": 7.044214250457216e-07, "loss": 1.0674, "step": 3863 }, { "epoch": 0.8847166571265026, "grad_norm": 4.539884090423584, "learning_rate": 7.016890817535881e-07, "loss": 1.0251, "step": 3864 }, { "epoch": 0.8849456210646823, "grad_norm": 1.4975309371948242, "learning_rate": 6.989618552439958e-07, "loss": 1.0532, "step": 3865 }, { "epoch": 0.8851745850028621, "grad_norm": 1.3167531490325928, "learning_rate": 6.962397470177162e-07, "loss": 1.0001, "step": 3866 }, { "epoch": 0.8854035489410418, "grad_norm": 1.4427505731582642, "learning_rate": 6.935227585726978e-07, "loss": 1.1204, "step": 3867 }, { "epoch": 0.8856325128792215, "grad_norm": 1.2467824220657349, "learning_rate": 6.908108914040823e-07, "loss": 0.9937, "step": 3868 }, { "epoch": 0.8858614768174012, "grad_norm": 1.390540361404419, "learning_rate": 6.881041470041849e-07, "loss": 0.9756, "step": 3869 }, { "epoch": 0.886090440755581, "grad_norm": 1.3013540506362915, "learning_rate": 6.854025268625042e-07, "loss": 1.0061, "step": 3870 }, { "epoch": 0.8863194046937607, "grad_norm": 1.362518548965454, "learning_rate": 6.827060324657164e-07, "loss": 1.0281, "step": 3871 }, { "epoch": 0.8865483686319404, "grad_norm": 1.2985994815826416, "learning_rate": 6.800146652976869e-07, "loss": 1.049, "step": 3872 }, { "epoch": 0.8867773325701203, "grad_norm": 1.0657109022140503, "learning_rate": 6.773284268394464e-07, "loss": 1.0451, "step": 3873 }, { "epoch": 0.8870062965083, "grad_norm": 1.4140193462371826, "learning_rate": 6.746473185692115e-07, "loss": 0.9702, "step": 3874 }, { "epoch": 0.8872352604464797, "grad_norm": 1.1937177181243896, "learning_rate": 6.71971341962373e-07, "loss": 1.0247, "step": 3875 }, { "epoch": 0.8874642243846594, "grad_norm": 1.1080400943756104, "learning_rate": 6.693004984914997e-07, "loss": 1.039, "step": 3876 }, { "epoch": 0.8876931883228392, "grad_norm": 1.7587668895721436, "learning_rate": 6.666347896263326e-07, "loss": 1.0278, "step": 3877 }, { "epoch": 0.8879221522610189, "grad_norm": 1.3932582139968872, "learning_rate": 6.639742168337904e-07, "loss": 1.027, "step": 3878 }, { "epoch": 0.8881511161991986, "grad_norm": 1.1661114692687988, "learning_rate": 6.61318781577962e-07, "loss": 1.0093, "step": 3879 }, { "epoch": 0.8883800801373783, "grad_norm": 1.9590517282485962, "learning_rate": 6.586684853201153e-07, "loss": 1.0055, "step": 3880 }, { "epoch": 0.8886090440755581, "grad_norm": 1.1533890962600708, "learning_rate": 6.560233295186802e-07, "loss": 1.0544, "step": 3881 }, { "epoch": 0.8888380080137378, "grad_norm": 1.3643183708190918, "learning_rate": 6.53383315629268e-07, "loss": 1.0934, "step": 3882 }, { "epoch": 0.8890669719519175, "grad_norm": 1.3094185590744019, "learning_rate": 6.507484451046575e-07, "loss": 1.0203, "step": 3883 }, { "epoch": 0.8892959358900973, "grad_norm": 1.2927873134613037, "learning_rate": 6.481187193947913e-07, "loss": 1.0636, "step": 3884 }, { "epoch": 0.8895248998282771, "grad_norm": 1.2240914106369019, "learning_rate": 6.454941399467873e-07, "loss": 1.0583, "step": 3885 }, { "epoch": 0.8897538637664568, "grad_norm": 1.765049934387207, "learning_rate": 6.428747082049313e-07, "loss": 1.0131, "step": 3886 }, { "epoch": 0.8899828277046365, "grad_norm": 1.369654893875122, "learning_rate": 6.402604256106715e-07, "loss": 1.0498, "step": 3887 }, { "epoch": 0.8902117916428163, "grad_norm": 1.3459523916244507, "learning_rate": 6.37651293602628e-07, "loss": 0.9947, "step": 3888 }, { "epoch": 0.890440755580996, "grad_norm": 1.3815542459487915, "learning_rate": 6.350473136165836e-07, "loss": 0.9885, "step": 3889 }, { "epoch": 0.8906697195191757, "grad_norm": 1.2464871406555176, "learning_rate": 6.324484870854874e-07, "loss": 1.0349, "step": 3890 }, { "epoch": 0.8908986834573555, "grad_norm": 1.1414371728897095, "learning_rate": 6.298548154394501e-07, "loss": 0.9818, "step": 3891 }, { "epoch": 0.8911276473955352, "grad_norm": 1.2838653326034546, "learning_rate": 6.2726630010575e-07, "loss": 1.0227, "step": 3892 }, { "epoch": 0.8913566113337149, "grad_norm": 1.3109924793243408, "learning_rate": 6.246829425088229e-07, "loss": 1.0274, "step": 3893 }, { "epoch": 0.8915855752718946, "grad_norm": 1.4073375463485718, "learning_rate": 6.22104744070271e-07, "loss": 1.0466, "step": 3894 }, { "epoch": 0.8918145392100744, "grad_norm": 1.345703125, "learning_rate": 6.195317062088513e-07, "loss": 1.0208, "step": 3895 }, { "epoch": 0.8920435031482542, "grad_norm": 1.3727253675460815, "learning_rate": 6.169638303404912e-07, "loss": 1.0646, "step": 3896 }, { "epoch": 0.8922724670864339, "grad_norm": 1.132949709892273, "learning_rate": 6.14401117878266e-07, "loss": 0.9631, "step": 3897 }, { "epoch": 0.8925014310246137, "grad_norm": 1.202333688735962, "learning_rate": 6.118435702324166e-07, "loss": 0.9988, "step": 3898 }, { "epoch": 0.8927303949627934, "grad_norm": 1.380020022392273, "learning_rate": 6.092911888103404e-07, "loss": 1.0655, "step": 3899 }, { "epoch": 0.8929593589009731, "grad_norm": 1.3880457878112793, "learning_rate": 6.067439750165916e-07, "loss": 1.004, "step": 3900 }, { "epoch": 0.8931883228391528, "grad_norm": 1.0868477821350098, "learning_rate": 6.042019302528801e-07, "loss": 0.9799, "step": 3901 }, { "epoch": 0.8934172867773326, "grad_norm": 1.2509794235229492, "learning_rate": 6.016650559180715e-07, "loss": 1.0058, "step": 3902 }, { "epoch": 0.8936462507155123, "grad_norm": 1.0911234617233276, "learning_rate": 5.991333534081878e-07, "loss": 1.0277, "step": 3903 }, { "epoch": 0.893875214653692, "grad_norm": 1.3828800916671753, "learning_rate": 5.966068241164025e-07, "loss": 1.0678, "step": 3904 }, { "epoch": 0.8941041785918717, "grad_norm": 1.1840590238571167, "learning_rate": 5.940854694330433e-07, "loss": 0.9748, "step": 3905 }, { "epoch": 0.8943331425300515, "grad_norm": 1.447941541671753, "learning_rate": 5.915692907455905e-07, "loss": 0.9746, "step": 3906 }, { "epoch": 0.8945621064682312, "grad_norm": 2.082156181335449, "learning_rate": 5.890582894386798e-07, "loss": 1.0355, "step": 3907 }, { "epoch": 0.894791070406411, "grad_norm": 1.1880125999450684, "learning_rate": 5.86552466894088e-07, "loss": 1.0326, "step": 3908 }, { "epoch": 0.8950200343445908, "grad_norm": 1.3864256143569946, "learning_rate": 5.840518244907512e-07, "loss": 0.9975, "step": 3909 }, { "epoch": 0.8952489982827705, "grad_norm": 1.368095874786377, "learning_rate": 5.815563636047539e-07, "loss": 0.9745, "step": 3910 }, { "epoch": 0.8954779622209502, "grad_norm": 1.2576544284820557, "learning_rate": 5.790660856093245e-07, "loss": 1.0147, "step": 3911 }, { "epoch": 0.89570692615913, "grad_norm": 1.2153538465499878, "learning_rate": 5.76580991874841e-07, "loss": 1.0201, "step": 3912 }, { "epoch": 0.8959358900973097, "grad_norm": 1.7341593503952026, "learning_rate": 5.741010837688354e-07, "loss": 0.9843, "step": 3913 }, { "epoch": 0.8961648540354894, "grad_norm": 1.151334285736084, "learning_rate": 5.716263626559748e-07, "loss": 0.9381, "step": 3914 }, { "epoch": 0.8963938179736691, "grad_norm": 1.2881344556808472, "learning_rate": 5.691568298980799e-07, "loss": 0.9995, "step": 3915 }, { "epoch": 0.8966227819118489, "grad_norm": 1.0738816261291504, "learning_rate": 5.666924868541124e-07, "loss": 0.9905, "step": 3916 }, { "epoch": 0.8968517458500286, "grad_norm": 1.1380635499954224, "learning_rate": 5.64233334880181e-07, "loss": 1.0218, "step": 3917 }, { "epoch": 0.8970807097882083, "grad_norm": 1.9478411674499512, "learning_rate": 5.617793753295364e-07, "loss": 1.0623, "step": 3918 }, { "epoch": 0.8973096737263881, "grad_norm": 3.0823843479156494, "learning_rate": 5.593306095525697e-07, "loss": 1.0533, "step": 3919 }, { "epoch": 0.8975386376645679, "grad_norm": 1.3105992078781128, "learning_rate": 5.568870388968184e-07, "loss": 1.0027, "step": 3920 }, { "epoch": 0.8977676016027476, "grad_norm": 1.4492088556289673, "learning_rate": 5.544486647069614e-07, "loss": 0.9894, "step": 3921 }, { "epoch": 0.8979965655409273, "grad_norm": 1.8873605728149414, "learning_rate": 5.520154883248086e-07, "loss": 1.0517, "step": 3922 }, { "epoch": 0.898225529479107, "grad_norm": 1.1515185832977295, "learning_rate": 5.495875110893223e-07, "loss": 1.0118, "step": 3923 }, { "epoch": 0.8984544934172868, "grad_norm": 1.1645504236221313, "learning_rate": 5.471647343365982e-07, "loss": 0.9882, "step": 3924 }, { "epoch": 0.8986834573554665, "grad_norm": 1.2297289371490479, "learning_rate": 5.447471593998666e-07, "loss": 0.9947, "step": 3925 }, { "epoch": 0.8989124212936462, "grad_norm": 1.699040412902832, "learning_rate": 5.423347876094998e-07, "loss": 0.9676, "step": 3926 }, { "epoch": 0.899141385231826, "grad_norm": 1.485076665878296, "learning_rate": 5.399276202930071e-07, "loss": 1.0582, "step": 3927 }, { "epoch": 0.8993703491700057, "grad_norm": 2.078399419784546, "learning_rate": 5.375256587750311e-07, "loss": 1.0306, "step": 3928 }, { "epoch": 0.8995993131081854, "grad_norm": 1.3613694906234741, "learning_rate": 5.351289043773511e-07, "loss": 1.0463, "step": 3929 }, { "epoch": 0.8998282770463651, "grad_norm": 1.1392529010772705, "learning_rate": 5.327373584188822e-07, "loss": 1.0335, "step": 3930 }, { "epoch": 0.900057240984545, "grad_norm": 1.9740110635757446, "learning_rate": 5.303510222156716e-07, "loss": 1.0021, "step": 3931 }, { "epoch": 0.9002862049227247, "grad_norm": 1.254390835762024, "learning_rate": 5.279698970809011e-07, "loss": 1.0522, "step": 3932 }, { "epoch": 0.9005151688609044, "grad_norm": 1.3042449951171875, "learning_rate": 5.255939843248792e-07, "loss": 1.0814, "step": 3933 }, { "epoch": 0.9007441327990842, "grad_norm": 1.3471471071243286, "learning_rate": 5.232232852550568e-07, "loss": 1.0309, "step": 3934 }, { "epoch": 0.9009730967372639, "grad_norm": 1.9291504621505737, "learning_rate": 5.208578011760079e-07, "loss": 1.0439, "step": 3935 }, { "epoch": 0.9012020606754436, "grad_norm": 1.2869789600372314, "learning_rate": 5.184975333894349e-07, "loss": 1.1011, "step": 3936 }, { "epoch": 0.9014310246136233, "grad_norm": 1.4123051166534424, "learning_rate": 5.161424831941797e-07, "loss": 1.0704, "step": 3937 }, { "epoch": 0.9016599885518031, "grad_norm": 1.2700828313827515, "learning_rate": 5.137926518862013e-07, "loss": 1.053, "step": 3938 }, { "epoch": 0.9018889524899828, "grad_norm": 1.2948448657989502, "learning_rate": 5.114480407585942e-07, "loss": 1.0921, "step": 3939 }, { "epoch": 0.9021179164281625, "grad_norm": 1.1770613193511963, "learning_rate": 5.091086511015786e-07, "loss": 1.0008, "step": 3940 }, { "epoch": 0.9023468803663423, "grad_norm": 1.1652601957321167, "learning_rate": 5.067744842025002e-07, "loss": 1.0086, "step": 3941 }, { "epoch": 0.9025758443045221, "grad_norm": 1.2411365509033203, "learning_rate": 5.044455413458327e-07, "loss": 1.0527, "step": 3942 }, { "epoch": 0.9028048082427018, "grad_norm": 1.4037675857543945, "learning_rate": 5.02121823813172e-07, "loss": 1.0761, "step": 3943 }, { "epoch": 0.9030337721808815, "grad_norm": 1.2515357732772827, "learning_rate": 4.998033328832419e-07, "loss": 0.9981, "step": 3944 }, { "epoch": 0.9032627361190613, "grad_norm": 1.2006301879882812, "learning_rate": 4.974900698318885e-07, "loss": 0.983, "step": 3945 }, { "epoch": 0.903491700057241, "grad_norm": 1.3678110837936401, "learning_rate": 4.951820359320802e-07, "loss": 1.0476, "step": 3946 }, { "epoch": 0.9037206639954207, "grad_norm": 1.4648295640945435, "learning_rate": 4.928792324539089e-07, "loss": 0.9633, "step": 3947 }, { "epoch": 0.9039496279336005, "grad_norm": 1.2273361682891846, "learning_rate": 4.905816606645896e-07, "loss": 1.038, "step": 3948 }, { "epoch": 0.9041785918717802, "grad_norm": 1.4598244428634644, "learning_rate": 4.882893218284546e-07, "loss": 1.0626, "step": 3949 }, { "epoch": 0.9044075558099599, "grad_norm": 1.1587828397750854, "learning_rate": 4.860022172069579e-07, "loss": 1.0123, "step": 3950 }, { "epoch": 0.9046365197481396, "grad_norm": 1.4061542749404907, "learning_rate": 4.837203480586782e-07, "loss": 0.9873, "step": 3951 }, { "epoch": 0.9048654836863194, "grad_norm": 1.5064465999603271, "learning_rate": 4.814437156393048e-07, "loss": 1.0661, "step": 3952 }, { "epoch": 0.9050944476244991, "grad_norm": 1.5875314474105835, "learning_rate": 4.791723212016508e-07, "loss": 0.9952, "step": 3953 }, { "epoch": 0.9053234115626789, "grad_norm": 1.572794795036316, "learning_rate": 4.769061659956464e-07, "loss": 1.0625, "step": 3954 }, { "epoch": 0.9055523755008587, "grad_norm": 1.485347032546997, "learning_rate": 4.746452512683375e-07, "loss": 0.9975, "step": 3955 }, { "epoch": 0.9057813394390384, "grad_norm": 1.1230580806732178, "learning_rate": 4.7238957826388656e-07, "loss": 1.0865, "step": 3956 }, { "epoch": 0.9060103033772181, "grad_norm": 1.2839785814285278, "learning_rate": 4.7013914822356956e-07, "loss": 1.0588, "step": 3957 }, { "epoch": 0.9062392673153978, "grad_norm": 1.29798424243927, "learning_rate": 4.6789396238578255e-07, "loss": 1.0536, "step": 3958 }, { "epoch": 0.9064682312535776, "grad_norm": 1.4684815406799316, "learning_rate": 4.656540219860317e-07, "loss": 1.0095, "step": 3959 }, { "epoch": 0.9066971951917573, "grad_norm": 3.565049409866333, "learning_rate": 4.6341932825693326e-07, "loss": 1.0161, "step": 3960 }, { "epoch": 0.906926159129937, "grad_norm": 1.6227142810821533, "learning_rate": 4.611898824282257e-07, "loss": 1.0105, "step": 3961 }, { "epoch": 0.9071551230681167, "grad_norm": 1.3278828859329224, "learning_rate": 4.5896568572675327e-07, "loss": 0.9983, "step": 3962 }, { "epoch": 0.9073840870062965, "grad_norm": 1.2562534809112549, "learning_rate": 4.567467393764702e-07, "loss": 1.0176, "step": 3963 }, { "epoch": 0.9076130509444762, "grad_norm": 1.5097713470458984, "learning_rate": 4.5453304459844525e-07, "loss": 1.0569, "step": 3964 }, { "epoch": 0.907842014882656, "grad_norm": 1.313685655593872, "learning_rate": 4.5232460261085966e-07, "loss": 1.0136, "step": 3965 }, { "epoch": 0.9080709788208358, "grad_norm": 1.2489078044891357, "learning_rate": 4.501214146289956e-07, "loss": 1.0279, "step": 3966 }, { "epoch": 0.9082999427590155, "grad_norm": 1.3103182315826416, "learning_rate": 4.4792348186525e-07, "loss": 1.0426, "step": 3967 }, { "epoch": 0.9085289066971952, "grad_norm": 1.1325465440750122, "learning_rate": 4.4573080552912874e-07, "loss": 0.9801, "step": 3968 }, { "epoch": 0.9087578706353749, "grad_norm": 1.951433539390564, "learning_rate": 4.435433868272421e-07, "loss": 0.9769, "step": 3969 }, { "epoch": 0.9089868345735547, "grad_norm": 1.1956815719604492, "learning_rate": 4.413612269633083e-07, "loss": 1.0412, "step": 3970 }, { "epoch": 0.9092157985117344, "grad_norm": 1.2107232809066772, "learning_rate": 4.391843271381513e-07, "loss": 0.9577, "step": 3971 }, { "epoch": 0.9094447624499141, "grad_norm": 1.252339482307434, "learning_rate": 4.370126885497017e-07, "loss": 1.0642, "step": 3972 }, { "epoch": 0.9096737263880939, "grad_norm": 1.6101003885269165, "learning_rate": 4.3484631239299356e-07, "loss": 0.9944, "step": 3973 }, { "epoch": 0.9099026903262736, "grad_norm": 1.0703437328338623, "learning_rate": 4.3268519986016444e-07, "loss": 1.0407, "step": 3974 }, { "epoch": 0.9101316542644533, "grad_norm": 1.2428948879241943, "learning_rate": 4.3052935214045745e-07, "loss": 1.0025, "step": 3975 }, { "epoch": 0.910360618202633, "grad_norm": 1.3559765815734863, "learning_rate": 4.2837877042021915e-07, "loss": 1.0398, "step": 3976 }, { "epoch": 0.9105895821408129, "grad_norm": 1.2042170763015747, "learning_rate": 4.26233455882894e-07, "loss": 0.9451, "step": 3977 }, { "epoch": 0.9108185460789926, "grad_norm": 1.347455620765686, "learning_rate": 4.240934097090299e-07, "loss": 0.9805, "step": 3978 }, { "epoch": 0.9110475100171723, "grad_norm": 2.1769297122955322, "learning_rate": 4.219586330762815e-07, "loss": 1.0007, "step": 3979 }, { "epoch": 0.911276473955352, "grad_norm": 1.392369270324707, "learning_rate": 4.198291271593924e-07, "loss": 1.0211, "step": 3980 }, { "epoch": 0.9115054378935318, "grad_norm": 1.1754943132400513, "learning_rate": 4.177048931302152e-07, "loss": 1.008, "step": 3981 }, { "epoch": 0.9117344018317115, "grad_norm": 1.4287794828414917, "learning_rate": 4.1558593215769714e-07, "loss": 1.0554, "step": 3982 }, { "epoch": 0.9119633657698912, "grad_norm": 1.3779202699661255, "learning_rate": 4.134722454078843e-07, "loss": 1.0852, "step": 3983 }, { "epoch": 0.912192329708071, "grad_norm": 1.3808211088180542, "learning_rate": 4.1136383404392187e-07, "loss": 1.0345, "step": 3984 }, { "epoch": 0.9124212936462507, "grad_norm": 1.3731261491775513, "learning_rate": 4.092606992260506e-07, "loss": 1.0508, "step": 3985 }, { "epoch": 0.9126502575844304, "grad_norm": 1.3647085428237915, "learning_rate": 4.0716284211160807e-07, "loss": 1.0739, "step": 3986 }, { "epoch": 0.9128792215226101, "grad_norm": 1.5696361064910889, "learning_rate": 4.0507026385502747e-07, "loss": 1.0594, "step": 3987 }, { "epoch": 0.91310818546079, "grad_norm": 1.408075213432312, "learning_rate": 4.029829656078343e-07, "loss": 1.0911, "step": 3988 }, { "epoch": 0.9133371493989697, "grad_norm": 1.2420895099639893, "learning_rate": 4.0090094851865547e-07, "loss": 0.9449, "step": 3989 }, { "epoch": 0.9135661133371494, "grad_norm": 1.551822543144226, "learning_rate": 3.9882421373320655e-07, "loss": 0.9516, "step": 3990 }, { "epoch": 0.9137950772753292, "grad_norm": 2.8030037879943848, "learning_rate": 3.967527623942957e-07, "loss": 1.0588, "step": 3991 }, { "epoch": 0.9140240412135089, "grad_norm": 1.239384651184082, "learning_rate": 3.9468659564182554e-07, "loss": 1.0079, "step": 3992 }, { "epoch": 0.9142530051516886, "grad_norm": 1.3504375219345093, "learning_rate": 3.9262571461279097e-07, "loss": 0.9982, "step": 3993 }, { "epoch": 0.9144819690898683, "grad_norm": 1.2237106561660767, "learning_rate": 3.9057012044127817e-07, "loss": 1.0647, "step": 3994 }, { "epoch": 0.9147109330280481, "grad_norm": 1.3022246360778809, "learning_rate": 3.885198142584612e-07, "loss": 1.04, "step": 3995 }, { "epoch": 0.9149398969662278, "grad_norm": 1.3392308950424194, "learning_rate": 3.864747971926086e-07, "loss": 1.0708, "step": 3996 }, { "epoch": 0.9151688609044075, "grad_norm": 1.2771753072738647, "learning_rate": 3.844350703690758e-07, "loss": 1.0403, "step": 3997 }, { "epoch": 0.9153978248425872, "grad_norm": 1.0660115480422974, "learning_rate": 3.82400634910306e-07, "loss": 1.0296, "step": 3998 }, { "epoch": 0.915626788780767, "grad_norm": 1.3524727821350098, "learning_rate": 3.8037149193583257e-07, "loss": 1.0209, "step": 3999 }, { "epoch": 0.9158557527189468, "grad_norm": 1.2420437335968018, "learning_rate": 3.7834764256227674e-07, "loss": 1.0699, "step": 4000 }, { "epoch": 0.9160847166571265, "grad_norm": 1.2622954845428467, "learning_rate": 3.7632908790334656e-07, "loss": 1.038, "step": 4001 }, { "epoch": 0.9163136805953063, "grad_norm": 1.260853886604309, "learning_rate": 3.7431582906983124e-07, "loss": 1.0352, "step": 4002 }, { "epoch": 0.916542644533486, "grad_norm": 1.2435685396194458, "learning_rate": 3.723078671696168e-07, "loss": 1.0368, "step": 4003 }, { "epoch": 0.9167716084716657, "grad_norm": 1.2323977947235107, "learning_rate": 3.7030520330766264e-07, "loss": 0.9676, "step": 4004 }, { "epoch": 0.9170005724098454, "grad_norm": 1.191890001296997, "learning_rate": 3.6830783858601835e-07, "loss": 0.9949, "step": 4005 }, { "epoch": 0.9172295363480252, "grad_norm": 1.2182968854904175, "learning_rate": 3.6631577410382034e-07, "loss": 1.0146, "step": 4006 }, { "epoch": 0.9174585002862049, "grad_norm": 1.3352956771850586, "learning_rate": 3.643290109572828e-07, "loss": 1.0326, "step": 4007 }, { "epoch": 0.9176874642243846, "grad_norm": 1.6930725574493408, "learning_rate": 3.6234755023970447e-07, "loss": 1.0013, "step": 4008 }, { "epoch": 0.9179164281625644, "grad_norm": 1.251842737197876, "learning_rate": 3.603713930414676e-07, "loss": 1.0314, "step": 4009 }, { "epoch": 0.9181453921007441, "grad_norm": 1.3096429109573364, "learning_rate": 3.5840054045003346e-07, "loss": 1.024, "step": 4010 }, { "epoch": 0.9183743560389239, "grad_norm": 1.7176495790481567, "learning_rate": 3.564349935499478e-07, "loss": 0.9741, "step": 4011 }, { "epoch": 0.9186033199771036, "grad_norm": 1.384275197982788, "learning_rate": 3.5447475342283324e-07, "loss": 1.0004, "step": 4012 }, { "epoch": 0.9188322839152834, "grad_norm": 1.2257745265960693, "learning_rate": 3.525198211473935e-07, "loss": 0.9716, "step": 4013 }, { "epoch": 0.9190612478534631, "grad_norm": 1.544185757637024, "learning_rate": 3.5057019779941246e-07, "loss": 1.0022, "step": 4014 }, { "epoch": 0.9192902117916428, "grad_norm": 2.072894334793091, "learning_rate": 3.4862588445174985e-07, "loss": 0.9794, "step": 4015 }, { "epoch": 0.9195191757298226, "grad_norm": 1.132996678352356, "learning_rate": 3.466868821743452e-07, "loss": 1.021, "step": 4016 }, { "epoch": 0.9197481396680023, "grad_norm": 1.3861950635910034, "learning_rate": 3.447531920342173e-07, "loss": 1.0204, "step": 4017 }, { "epoch": 0.919977103606182, "grad_norm": 1.2314361333847046, "learning_rate": 3.428248150954583e-07, "loss": 1.0447, "step": 4018 }, { "epoch": 0.9202060675443617, "grad_norm": 1.2820926904678345, "learning_rate": 3.4090175241923593e-07, "loss": 1.0334, "step": 4019 }, { "epoch": 0.9204350314825415, "grad_norm": 1.4982575178146362, "learning_rate": 3.3898400506379937e-07, "loss": 1.1002, "step": 4020 }, { "epoch": 0.9206639954207212, "grad_norm": 2.526170492172241, "learning_rate": 3.3707157408446547e-07, "loss": 1.0653, "step": 4021 }, { "epoch": 0.9208929593589009, "grad_norm": 2.164984703063965, "learning_rate": 3.3516446053363015e-07, "loss": 0.9579, "step": 4022 }, { "epoch": 0.9211219232970808, "grad_norm": 1.396539330482483, "learning_rate": 3.3326266546076293e-07, "loss": 0.9919, "step": 4023 }, { "epoch": 0.9213508872352605, "grad_norm": 1.2713897228240967, "learning_rate": 3.3136618991240544e-07, "loss": 1.0049, "step": 4024 }, { "epoch": 0.9215798511734402, "grad_norm": 1.5097935199737549, "learning_rate": 3.294750349321718e-07, "loss": 1.0192, "step": 4025 }, { "epoch": 0.9218088151116199, "grad_norm": 1.3932673931121826, "learning_rate": 3.2758920156074624e-07, "loss": 0.9681, "step": 4026 }, { "epoch": 0.9220377790497997, "grad_norm": 1.4764078855514526, "learning_rate": 3.2570869083589196e-07, "loss": 1.0264, "step": 4027 }, { "epoch": 0.9222667429879794, "grad_norm": 1.1931257247924805, "learning_rate": 3.238335037924356e-07, "loss": 1.0474, "step": 4028 }, { "epoch": 0.9224957069261591, "grad_norm": 1.2183254957199097, "learning_rate": 3.219636414622751e-07, "loss": 1.0471, "step": 4029 }, { "epoch": 0.9227246708643388, "grad_norm": 1.6249024868011475, "learning_rate": 3.20099104874384e-07, "loss": 1.0004, "step": 4030 }, { "epoch": 0.9229536348025186, "grad_norm": 1.2928696870803833, "learning_rate": 3.1823989505479934e-07, "loss": 0.969, "step": 4031 }, { "epoch": 0.9231825987406983, "grad_norm": 1.292734980583191, "learning_rate": 3.163860130266283e-07, "loss": 1.0344, "step": 4032 }, { "epoch": 0.923411562678878, "grad_norm": 1.6928443908691406, "learning_rate": 3.145374598100448e-07, "loss": 1.02, "step": 4033 }, { "epoch": 0.9236405266170579, "grad_norm": 1.1957359313964844, "learning_rate": 3.1269423642229737e-07, "loss": 1.0593, "step": 4034 }, { "epoch": 0.9238694905552376, "grad_norm": 1.3396857976913452, "learning_rate": 3.1085634387769124e-07, "loss": 1.0156, "step": 4035 }, { "epoch": 0.9240984544934173, "grad_norm": 1.7538094520568848, "learning_rate": 3.090237831876053e-07, "loss": 1.0305, "step": 4036 }, { "epoch": 0.924327418431597, "grad_norm": 1.2242697477340698, "learning_rate": 3.0719655536048276e-07, "loss": 1.027, "step": 4037 }, { "epoch": 0.9245563823697768, "grad_norm": 1.4524619579315186, "learning_rate": 3.053746614018305e-07, "loss": 1.0006, "step": 4038 }, { "epoch": 0.9247853463079565, "grad_norm": 1.2049915790557861, "learning_rate": 3.035581023142231e-07, "loss": 1.043, "step": 4039 }, { "epoch": 0.9250143102461362, "grad_norm": 1.5115872621536255, "learning_rate": 3.0174687909729663e-07, "loss": 1.0647, "step": 4040 }, { "epoch": 0.925243274184316, "grad_norm": 1.2160587310791016, "learning_rate": 2.9994099274775256e-07, "loss": 1.0233, "step": 4041 }, { "epoch": 0.9254722381224957, "grad_norm": 1.3652007579803467, "learning_rate": 2.9814044425935605e-07, "loss": 1.0257, "step": 4042 }, { "epoch": 0.9257012020606754, "grad_norm": 1.5073693990707397, "learning_rate": 2.9634523462293005e-07, "loss": 1.0529, "step": 4043 }, { "epoch": 0.9259301659988551, "grad_norm": 1.4959176778793335, "learning_rate": 2.945553648263677e-07, "loss": 1.0633, "step": 4044 }, { "epoch": 0.9261591299370349, "grad_norm": 1.320374846458435, "learning_rate": 2.9277083585461776e-07, "loss": 0.9744, "step": 4045 }, { "epoch": 0.9263880938752147, "grad_norm": 1.4175662994384766, "learning_rate": 2.909916486896913e-07, "loss": 0.9728, "step": 4046 }, { "epoch": 0.9266170578133944, "grad_norm": 1.35358464717865, "learning_rate": 2.8921780431065973e-07, "loss": 1.0241, "step": 4047 }, { "epoch": 0.9268460217515742, "grad_norm": 1.674445390701294, "learning_rate": 2.874493036936554e-07, "loss": 0.9889, "step": 4048 }, { "epoch": 0.9270749856897539, "grad_norm": 1.1121513843536377, "learning_rate": 2.8568614781186886e-07, "loss": 1.0436, "step": 4049 }, { "epoch": 0.9273039496279336, "grad_norm": 1.7877289056777954, "learning_rate": 2.839283376355506e-07, "loss": 1.0327, "step": 4050 }, { "epoch": 0.9275329135661133, "grad_norm": 1.4056133031845093, "learning_rate": 2.82175874132008e-07, "loss": 1.0264, "step": 4051 }, { "epoch": 0.9277618775042931, "grad_norm": 1.2976934909820557, "learning_rate": 2.804287582656073e-07, "loss": 1.1032, "step": 4052 }, { "epoch": 0.9279908414424728, "grad_norm": 1.4245258569717407, "learning_rate": 2.7868699099777295e-07, "loss": 1.0262, "step": 4053 }, { "epoch": 0.9282198053806525, "grad_norm": 1.3631470203399658, "learning_rate": 2.769505732869837e-07, "loss": 1.0073, "step": 4054 }, { "epoch": 0.9284487693188322, "grad_norm": 2.766822576522827, "learning_rate": 2.752195060887775e-07, "loss": 0.9939, "step": 4055 }, { "epoch": 0.928677733257012, "grad_norm": 1.360872507095337, "learning_rate": 2.7349379035574217e-07, "loss": 1.0607, "step": 4056 }, { "epoch": 0.9289066971951918, "grad_norm": 1.0225962400436401, "learning_rate": 2.717734270375272e-07, "loss": 1.0532, "step": 4057 }, { "epoch": 0.9291356611333715, "grad_norm": 1.3829964399337769, "learning_rate": 2.7005841708083516e-07, "loss": 1.057, "step": 4058 }, { "epoch": 0.9293646250715513, "grad_norm": 1.5266767740249634, "learning_rate": 2.683487614294189e-07, "loss": 1.0197, "step": 4059 }, { "epoch": 0.929593589009731, "grad_norm": 1.3430898189544678, "learning_rate": 2.6664446102408924e-07, "loss": 1.046, "step": 4060 }, { "epoch": 0.9298225529479107, "grad_norm": 1.3258440494537354, "learning_rate": 2.649455168027082e-07, "loss": 1.0198, "step": 4061 }, { "epoch": 0.9300515168860904, "grad_norm": 1.0601249933242798, "learning_rate": 2.632519297001912e-07, "loss": 1.0252, "step": 4062 }, { "epoch": 0.9302804808242702, "grad_norm": 1.4658838510513306, "learning_rate": 2.615637006485039e-07, "loss": 1.0577, "step": 4063 }, { "epoch": 0.9305094447624499, "grad_norm": 1.3812763690948486, "learning_rate": 2.5988083057666534e-07, "loss": 1.0447, "step": 4064 }, { "epoch": 0.9307384087006296, "grad_norm": 1.4836375713348389, "learning_rate": 2.582033204107437e-07, "loss": 0.9993, "step": 4065 }, { "epoch": 0.9309673726388094, "grad_norm": 1.3811873197555542, "learning_rate": 2.565311710738616e-07, "loss": 1.0034, "step": 4066 }, { "epoch": 0.9311963365769891, "grad_norm": 1.320236325263977, "learning_rate": 2.5486438348618416e-07, "loss": 0.9957, "step": 4067 }, { "epoch": 0.9314253005151688, "grad_norm": 1.1758593320846558, "learning_rate": 2.532029585649343e-07, "loss": 1.0828, "step": 4068 }, { "epoch": 0.9316542644533486, "grad_norm": 1.2372307777404785, "learning_rate": 2.515468972243795e-07, "loss": 1.0289, "step": 4069 }, { "epoch": 0.9318832283915284, "grad_norm": 1.5127670764923096, "learning_rate": 2.4989620037583653e-07, "loss": 1.116, "step": 4070 }, { "epoch": 0.9321121923297081, "grad_norm": 1.2241275310516357, "learning_rate": 2.4825086892766745e-07, "loss": 0.9408, "step": 4071 }, { "epoch": 0.9323411562678878, "grad_norm": 1.1708391904830933, "learning_rate": 2.466109037852893e-07, "loss": 1.0203, "step": 4072 }, { "epoch": 0.9325701202060676, "grad_norm": 1.14144766330719, "learning_rate": 2.4497630585115673e-07, "loss": 0.9937, "step": 4073 }, { "epoch": 0.9327990841442473, "grad_norm": 2.2938714027404785, "learning_rate": 2.4334707602477693e-07, "loss": 1.0365, "step": 4074 }, { "epoch": 0.933028048082427, "grad_norm": 1.596992015838623, "learning_rate": 2.417232152027016e-07, "loss": 1.0269, "step": 4075 }, { "epoch": 0.9332570120206067, "grad_norm": 1.3394994735717773, "learning_rate": 2.4010472427852706e-07, "loss": 1.0008, "step": 4076 }, { "epoch": 0.9334859759587865, "grad_norm": 2.9255053997039795, "learning_rate": 2.384916041428964e-07, "loss": 1.0246, "step": 4077 }, { "epoch": 0.9337149398969662, "grad_norm": 2.5200510025024414, "learning_rate": 2.3688385568349515e-07, "loss": 1.0253, "step": 4078 }, { "epoch": 0.9339439038351459, "grad_norm": 1.3866181373596191, "learning_rate": 2.3528147978505334e-07, "loss": 1.0208, "step": 4079 }, { "epoch": 0.9341728677733258, "grad_norm": 1.7252721786499023, "learning_rate": 2.3368447732934785e-07, "loss": 1.0771, "step": 4080 }, { "epoch": 0.9344018317115055, "grad_norm": 1.2650654315948486, "learning_rate": 2.3209284919519127e-07, "loss": 1.0407, "step": 4081 }, { "epoch": 0.9346307956496852, "grad_norm": 1.2546926736831665, "learning_rate": 2.305065962584463e-07, "loss": 1.0684, "step": 4082 }, { "epoch": 0.9348597595878649, "grad_norm": 1.51604425907135, "learning_rate": 2.289257193920158e-07, "loss": 1.0321, "step": 4083 }, { "epoch": 0.9350887235260447, "grad_norm": 1.2213822603225708, "learning_rate": 2.2735021946583946e-07, "loss": 1.0326, "step": 4084 }, { "epoch": 0.9353176874642244, "grad_norm": 1.4243481159210205, "learning_rate": 2.2578009734690264e-07, "loss": 1.0401, "step": 4085 }, { "epoch": 0.9355466514024041, "grad_norm": 1.6007384061813354, "learning_rate": 2.242153538992331e-07, "loss": 0.9734, "step": 4086 }, { "epoch": 0.9357756153405838, "grad_norm": 2.3834850788116455, "learning_rate": 2.2265598998389316e-07, "loss": 0.988, "step": 4087 }, { "epoch": 0.9360045792787636, "grad_norm": 1.524840235710144, "learning_rate": 2.211020064589886e-07, "loss": 1.0204, "step": 4088 }, { "epoch": 0.9362335432169433, "grad_norm": 1.3676830530166626, "learning_rate": 2.195534041796632e-07, "loss": 1.0464, "step": 4089 }, { "epoch": 0.936462507155123, "grad_norm": 1.5607987642288208, "learning_rate": 2.180101839981008e-07, "loss": 1.0407, "step": 4090 }, { "epoch": 0.9366914710933028, "grad_norm": 1.0320096015930176, "learning_rate": 2.1647234676352213e-07, "loss": 1.0463, "step": 4091 }, { "epoch": 0.9369204350314826, "grad_norm": 1.3179558515548706, "learning_rate": 2.1493989332218468e-07, "loss": 1.0596, "step": 4092 }, { "epoch": 0.9371493989696623, "grad_norm": 1.3272641897201538, "learning_rate": 2.134128245173872e-07, "loss": 1.0372, "step": 4093 }, { "epoch": 0.937378362907842, "grad_norm": 1.2094119787216187, "learning_rate": 2.1189114118946196e-07, "loss": 1.032, "step": 4094 }, { "epoch": 0.9376073268460218, "grad_norm": 1.2302625179290771, "learning_rate": 2.103748441757758e-07, "loss": 0.9675, "step": 4095 }, { "epoch": 0.9378362907842015, "grad_norm": 1.7436269521713257, "learning_rate": 2.0886393431073794e-07, "loss": 1.0837, "step": 4096 }, { "epoch": 0.9380652547223812, "grad_norm": 1.3275481462478638, "learning_rate": 2.0735841242578992e-07, "loss": 1.0377, "step": 4097 }, { "epoch": 0.938294218660561, "grad_norm": 1.3621660470962524, "learning_rate": 2.0585827934940238e-07, "loss": 1.0369, "step": 4098 }, { "epoch": 0.9385231825987407, "grad_norm": 1.739432692527771, "learning_rate": 2.043635359070928e-07, "loss": 1.0002, "step": 4099 }, { "epoch": 0.9387521465369204, "grad_norm": 1.273725986480713, "learning_rate": 2.0287418292140204e-07, "loss": 0.9611, "step": 4100 }, { "epoch": 0.9389811104751001, "grad_norm": 1.3180021047592163, "learning_rate": 2.0139022121190788e-07, "loss": 1.0431, "step": 4101 }, { "epoch": 0.9392100744132799, "grad_norm": 1.3748250007629395, "learning_rate": 1.9991165159522485e-07, "loss": 0.9487, "step": 4102 }, { "epoch": 0.9394390383514597, "grad_norm": 1.186481237411499, "learning_rate": 1.9843847488499545e-07, "loss": 1.0343, "step": 4103 }, { "epoch": 0.9396680022896394, "grad_norm": 1.2737975120544434, "learning_rate": 1.9697069189189678e-07, "loss": 1.0535, "step": 4104 }, { "epoch": 0.9398969662278192, "grad_norm": 1.1873112916946411, "learning_rate": 1.9550830342363714e-07, "loss": 1.0056, "step": 4105 }, { "epoch": 0.9401259301659989, "grad_norm": 1.3513420820236206, "learning_rate": 1.9405131028495838e-07, "loss": 1.0278, "step": 4106 }, { "epoch": 0.9403548941041786, "grad_norm": 1.1599657535552979, "learning_rate": 1.9259971327763028e-07, "loss": 1.0013, "step": 4107 }, { "epoch": 0.9405838580423583, "grad_norm": 2.088212728500366, "learning_rate": 1.9115351320045495e-07, "loss": 1.0098, "step": 4108 }, { "epoch": 0.9408128219805381, "grad_norm": 1.281222939491272, "learning_rate": 1.8971271084926245e-07, "loss": 0.9935, "step": 4109 }, { "epoch": 0.9410417859187178, "grad_norm": 1.147822618484497, "learning_rate": 1.8827730701691749e-07, "loss": 1.0115, "step": 4110 }, { "epoch": 0.9412707498568975, "grad_norm": 2.1663131713867188, "learning_rate": 1.868473024933082e-07, "loss": 1.1268, "step": 4111 }, { "epoch": 0.9414997137950772, "grad_norm": 1.2913000583648682, "learning_rate": 1.8542269806535285e-07, "loss": 1.0183, "step": 4112 }, { "epoch": 0.941728677733257, "grad_norm": 1.3916893005371094, "learning_rate": 1.8400349451700438e-07, "loss": 1.0304, "step": 4113 }, { "epoch": 0.9419576416714367, "grad_norm": 1.3906985521316528, "learning_rate": 1.8258969262923366e-07, "loss": 1.0088, "step": 4114 }, { "epoch": 0.9421866056096165, "grad_norm": 1.523729681968689, "learning_rate": 1.8118129318004717e-07, "loss": 1.0386, "step": 4115 }, { "epoch": 0.9424155695477963, "grad_norm": 1.3909317255020142, "learning_rate": 1.7977829694447278e-07, "loss": 0.9631, "step": 4116 }, { "epoch": 0.942644533485976, "grad_norm": 1.4848453998565674, "learning_rate": 1.7838070469456958e-07, "loss": 0.9619, "step": 4117 }, { "epoch": 0.9428734974241557, "grad_norm": 1.3331187963485718, "learning_rate": 1.769885171994201e-07, "loss": 1.0474, "step": 4118 }, { "epoch": 0.9431024613623354, "grad_norm": 1.6051127910614014, "learning_rate": 1.7560173522513268e-07, "loss": 1.0188, "step": 4119 }, { "epoch": 0.9433314253005152, "grad_norm": 1.2383366823196411, "learning_rate": 1.742203595348435e-07, "loss": 1.0819, "step": 4120 }, { "epoch": 0.9435603892386949, "grad_norm": 1.607271671295166, "learning_rate": 1.728443908887112e-07, "loss": 0.9864, "step": 4121 }, { "epoch": 0.9437893531768746, "grad_norm": 1.5015615224838257, "learning_rate": 1.7147383004391782e-07, "loss": 1.0301, "step": 4122 }, { "epoch": 0.9440183171150544, "grad_norm": 1.2938905954360962, "learning_rate": 1.7010867775467454e-07, "loss": 0.9931, "step": 4123 }, { "epoch": 0.9442472810532341, "grad_norm": 1.4441864490509033, "learning_rate": 1.6874893477221376e-07, "loss": 1.0832, "step": 4124 }, { "epoch": 0.9444762449914138, "grad_norm": 1.309255599975586, "learning_rate": 1.6739460184478694e-07, "loss": 1.0367, "step": 4125 }, { "epoch": 0.9447052089295936, "grad_norm": 1.3141146898269653, "learning_rate": 1.660456797176735e-07, "loss": 1.0261, "step": 4126 }, { "epoch": 0.9449341728677734, "grad_norm": 1.307094931602478, "learning_rate": 1.6470216913317628e-07, "loss": 1.0172, "step": 4127 }, { "epoch": 0.9451631368059531, "grad_norm": 1.344239592552185, "learning_rate": 1.63364070830615e-07, "loss": 1.0416, "step": 4128 }, { "epoch": 0.9453921007441328, "grad_norm": 1.2404433488845825, "learning_rate": 1.6203138554633625e-07, "loss": 1.0191, "step": 4129 }, { "epoch": 0.9456210646823126, "grad_norm": 1.1200016736984253, "learning_rate": 1.6070411401370335e-07, "loss": 0.9745, "step": 4130 }, { "epoch": 0.9458500286204923, "grad_norm": 3.205339193344116, "learning_rate": 1.5938225696310427e-07, "loss": 1.0064, "step": 4131 }, { "epoch": 0.946078992558672, "grad_norm": 1.468671441078186, "learning_rate": 1.580658151219461e-07, "loss": 1.0288, "step": 4132 }, { "epoch": 0.9463079564968517, "grad_norm": 1.7323161363601685, "learning_rate": 1.567547892146537e-07, "loss": 0.9814, "step": 4133 }, { "epoch": 0.9465369204350315, "grad_norm": 1.4511128664016724, "learning_rate": 1.5544917996267562e-07, "loss": 0.9879, "step": 4134 }, { "epoch": 0.9467658843732112, "grad_norm": 1.2336581945419312, "learning_rate": 1.541489880844782e-07, "loss": 1.0541, "step": 4135 }, { "epoch": 0.9469948483113909, "grad_norm": 1.2576466798782349, "learning_rate": 1.528542142955436e-07, "loss": 1.0047, "step": 4136 }, { "epoch": 0.9472238122495706, "grad_norm": 1.4360203742980957, "learning_rate": 1.5156485930837628e-07, "loss": 1.0658, "step": 4137 }, { "epoch": 0.9474527761877505, "grad_norm": 1.8652851581573486, "learning_rate": 1.5028092383249871e-07, "loss": 1.0568, "step": 4138 }, { "epoch": 0.9476817401259302, "grad_norm": 1.3658671379089355, "learning_rate": 1.4900240857444792e-07, "loss": 1.095, "step": 4139 }, { "epoch": 0.9479107040641099, "grad_norm": 1.3814666271209717, "learning_rate": 1.4772931423778003e-07, "loss": 1.089, "step": 4140 }, { "epoch": 0.9481396680022897, "grad_norm": 1.3728240728378296, "learning_rate": 1.464616415230702e-07, "loss": 1.0431, "step": 4141 }, { "epoch": 0.9483686319404694, "grad_norm": 1.6586147546768188, "learning_rate": 1.4519939112790705e-07, "loss": 1.0047, "step": 4142 }, { "epoch": 0.9485975958786491, "grad_norm": 1.2365854978561401, "learning_rate": 1.4394256374689498e-07, "loss": 1.0841, "step": 4143 }, { "epoch": 0.9488265598168288, "grad_norm": 1.3631367683410645, "learning_rate": 1.426911600716574e-07, "loss": 1.0188, "step": 4144 }, { "epoch": 0.9490555237550086, "grad_norm": 1.1804825067520142, "learning_rate": 1.4144518079083125e-07, "loss": 1.0877, "step": 4145 }, { "epoch": 0.9492844876931883, "grad_norm": 1.24893057346344, "learning_rate": 1.4020462659006806e-07, "loss": 1.0689, "step": 4146 }, { "epoch": 0.949513451631368, "grad_norm": 1.317393183708191, "learning_rate": 1.3896949815203398e-07, "loss": 1.0285, "step": 4147 }, { "epoch": 0.9497424155695477, "grad_norm": 1.233035683631897, "learning_rate": 1.3773979615640976e-07, "loss": 1.0676, "step": 4148 }, { "epoch": 0.9499713795077276, "grad_norm": 1.0426533222198486, "learning_rate": 1.3651552127989186e-07, "loss": 1.0019, "step": 4149 }, { "epoch": 0.9502003434459073, "grad_norm": 1.5621211528778076, "learning_rate": 1.3529667419618475e-07, "loss": 1.0579, "step": 4150 }, { "epoch": 0.950429307384087, "grad_norm": 1.3285621404647827, "learning_rate": 1.3408325557601297e-07, "loss": 1.0285, "step": 4151 }, { "epoch": 0.9506582713222668, "grad_norm": 1.1618311405181885, "learning_rate": 1.3287526608711132e-07, "loss": 1.046, "step": 4152 }, { "epoch": 0.9508872352604465, "grad_norm": 1.317858338356018, "learning_rate": 1.3167270639422357e-07, "loss": 1.0425, "step": 4153 }, { "epoch": 0.9511161991986262, "grad_norm": 1.2323342561721802, "learning_rate": 1.304755771591093e-07, "loss": 1.0702, "step": 4154 }, { "epoch": 0.951345163136806, "grad_norm": 1.3113363981246948, "learning_rate": 1.292838790405393e-07, "loss": 1.026, "step": 4155 }, { "epoch": 0.9515741270749857, "grad_norm": 1.145249366760254, "learning_rate": 1.2809761269429343e-07, "loss": 1.0076, "step": 4156 }, { "epoch": 0.9518030910131654, "grad_norm": 1.527535319328308, "learning_rate": 1.269167787731662e-07, "loss": 0.9754, "step": 4157 }, { "epoch": 0.9520320549513451, "grad_norm": 1.321520209312439, "learning_rate": 1.2574137792695895e-07, "loss": 1.0286, "step": 4158 }, { "epoch": 0.9522610188895249, "grad_norm": 1.5232340097427368, "learning_rate": 1.2457141080248647e-07, "loss": 0.9744, "step": 4159 }, { "epoch": 0.9524899828277046, "grad_norm": 1.3998576402664185, "learning_rate": 1.2340687804357044e-07, "loss": 1.0796, "step": 4160 }, { "epoch": 0.9527189467658844, "grad_norm": 1.2090296745300293, "learning_rate": 1.2224778029104377e-07, "loss": 1.0844, "step": 4161 }, { "epoch": 0.9529479107040641, "grad_norm": 1.319123387336731, "learning_rate": 1.2109411818274851e-07, "loss": 1.0412, "step": 4162 }, { "epoch": 0.9531768746422439, "grad_norm": 1.2785385847091675, "learning_rate": 1.1994589235353682e-07, "loss": 1.0274, "step": 4163 }, { "epoch": 0.9534058385804236, "grad_norm": 1.2551246881484985, "learning_rate": 1.1880310343526324e-07, "loss": 1.0395, "step": 4164 }, { "epoch": 0.9536348025186033, "grad_norm": 1.755634069442749, "learning_rate": 1.1766575205680031e-07, "loss": 1.0167, "step": 4165 }, { "epoch": 0.9538637664567831, "grad_norm": 1.1135053634643555, "learning_rate": 1.1653383884401959e-07, "loss": 0.9704, "step": 4166 }, { "epoch": 0.9540927303949628, "grad_norm": 1.2887723445892334, "learning_rate": 1.1540736441980505e-07, "loss": 1.0541, "step": 4167 }, { "epoch": 0.9543216943331425, "grad_norm": 1.2668262720108032, "learning_rate": 1.1428632940404416e-07, "loss": 1.0312, "step": 4168 }, { "epoch": 0.9545506582713222, "grad_norm": 1.4527376890182495, "learning_rate": 1.1317073441363458e-07, "loss": 1.0343, "step": 4169 }, { "epoch": 0.954779622209502, "grad_norm": 1.378683090209961, "learning_rate": 1.1206058006247966e-07, "loss": 0.9851, "step": 4170 }, { "epoch": 0.9550085861476817, "grad_norm": 1.202966570854187, "learning_rate": 1.109558669614863e-07, "loss": 1.027, "step": 4171 }, { "epoch": 0.9552375500858615, "grad_norm": 1.4312129020690918, "learning_rate": 1.0985659571856933e-07, "loss": 1.0712, "step": 4172 }, { "epoch": 0.9554665140240413, "grad_norm": 1.2038168907165527, "learning_rate": 1.0876276693864818e-07, "loss": 1.0783, "step": 4173 }, { "epoch": 0.955695477962221, "grad_norm": 1.1261094808578491, "learning_rate": 1.0767438122364914e-07, "loss": 1.0094, "step": 4174 }, { "epoch": 0.9559244419004007, "grad_norm": 2.3489320278167725, "learning_rate": 1.0659143917250092e-07, "loss": 1.0653, "step": 4175 }, { "epoch": 0.9561534058385804, "grad_norm": 1.5417100191116333, "learning_rate": 1.055139413811379e-07, "loss": 0.9797, "step": 4176 }, { "epoch": 0.9563823697767602, "grad_norm": 1.4111857414245605, "learning_rate": 1.0444188844249691e-07, "loss": 0.997, "step": 4177 }, { "epoch": 0.9566113337149399, "grad_norm": 1.4079347848892212, "learning_rate": 1.0337528094651938e-07, "loss": 0.9685, "step": 4178 }, { "epoch": 0.9568402976531196, "grad_norm": 1.3267221450805664, "learning_rate": 1.0231411948015247e-07, "loss": 1.1016, "step": 4179 }, { "epoch": 0.9570692615912993, "grad_norm": 1.3851131200790405, "learning_rate": 1.0125840462734238e-07, "loss": 1.0756, "step": 4180 }, { "epoch": 0.9572982255294791, "grad_norm": 1.3021942377090454, "learning_rate": 1.0020813696904108e-07, "loss": 0.9716, "step": 4181 }, { "epoch": 0.9575271894676588, "grad_norm": 1.1864266395568848, "learning_rate": 9.916331708320403e-08, "loss": 0.9976, "step": 4182 }, { "epoch": 0.9577561534058385, "grad_norm": 13.135167121887207, "learning_rate": 9.812394554478355e-08, "loss": 1.0404, "step": 4183 }, { "epoch": 0.9579851173440184, "grad_norm": 1.3869199752807617, "learning_rate": 9.709002292573876e-08, "loss": 1.0379, "step": 4184 }, { "epoch": 0.9582140812821981, "grad_norm": 1.8574575185775757, "learning_rate": 9.606154979502791e-08, "loss": 1.0384, "step": 4185 }, { "epoch": 0.9584430452203778, "grad_norm": 1.1653584241867065, "learning_rate": 9.503852671861158e-08, "loss": 1.0695, "step": 4186 }, { "epoch": 0.9586720091585575, "grad_norm": 1.499384880065918, "learning_rate": 9.402095425945168e-08, "loss": 1.0062, "step": 4187 }, { "epoch": 0.9589009730967373, "grad_norm": 1.988095760345459, "learning_rate": 9.300883297750696e-08, "loss": 1.0296, "step": 4188 }, { "epoch": 0.959129937034917, "grad_norm": 1.2225043773651123, "learning_rate": 9.2002163429743e-08, "loss": 0.9754, "step": 4189 }, { "epoch": 0.9593589009730967, "grad_norm": 1.299344778060913, "learning_rate": 9.10009461701189e-08, "loss": 1.0025, "step": 4190 }, { "epoch": 0.9595878649112765, "grad_norm": 1.1545172929763794, "learning_rate": 9.000518174959726e-08, "loss": 1.0316, "step": 4191 }, { "epoch": 0.9598168288494562, "grad_norm": 1.2487175464630127, "learning_rate": 8.901487071613868e-08, "loss": 1.06, "step": 4192 }, { "epoch": 0.9600457927876359, "grad_norm": 1.9878804683685303, "learning_rate": 8.803001361470386e-08, "loss": 1.053, "step": 4193 }, { "epoch": 0.9602747567258156, "grad_norm": 1.282788634300232, "learning_rate": 8.705061098724932e-08, "loss": 0.9953, "step": 4194 }, { "epoch": 0.9605037206639955, "grad_norm": 1.5588258504867554, "learning_rate": 8.60766633727339e-08, "loss": 1.0122, "step": 4195 }, { "epoch": 0.9607326846021752, "grad_norm": 2.007049798965454, "learning_rate": 8.510817130711224e-08, "loss": 1.0305, "step": 4196 }, { "epoch": 0.9609616485403549, "grad_norm": 1.2135330438613892, "learning_rate": 8.41451353233369e-08, "loss": 1.0765, "step": 4197 }, { "epoch": 0.9611906124785347, "grad_norm": 1.5741900205612183, "learning_rate": 8.318755595135952e-08, "loss": 1.0657, "step": 4198 }, { "epoch": 0.9614195764167144, "grad_norm": 1.8799227476119995, "learning_rate": 8.223543371812748e-08, "loss": 1.0397, "step": 4199 }, { "epoch": 0.9616485403548941, "grad_norm": 2.1442928314208984, "learning_rate": 8.128876914758499e-08, "loss": 1.0042, "step": 4200 }, { "epoch": 0.9618775042930738, "grad_norm": 1.2874436378479004, "learning_rate": 8.034756276067534e-08, "loss": 0.9996, "step": 4201 }, { "epoch": 0.9621064682312536, "grad_norm": 1.346038579940796, "learning_rate": 7.941181507533424e-08, "loss": 1.0304, "step": 4202 }, { "epoch": 0.9623354321694333, "grad_norm": 1.7882304191589355, "learning_rate": 7.848152660649866e-08, "loss": 1.0538, "step": 4203 }, { "epoch": 0.962564396107613, "grad_norm": 1.237015724182129, "learning_rate": 7.755669786609688e-08, "loss": 1.0272, "step": 4204 }, { "epoch": 0.9627933600457927, "grad_norm": 1.6121935844421387, "learning_rate": 7.663732936305291e-08, "loss": 1.015, "step": 4205 }, { "epoch": 0.9630223239839725, "grad_norm": 1.267065405845642, "learning_rate": 7.572342160328982e-08, "loss": 1.0515, "step": 4206 }, { "epoch": 0.9632512879221523, "grad_norm": 1.263258934020996, "learning_rate": 7.481497508972313e-08, "loss": 1.0207, "step": 4207 }, { "epoch": 0.963480251860332, "grad_norm": 1.343643069267273, "learning_rate": 7.391199032226182e-08, "loss": 1.0493, "step": 4208 }, { "epoch": 0.9637092157985118, "grad_norm": 1.558370590209961, "learning_rate": 7.301446779781285e-08, "loss": 1.0701, "step": 4209 }, { "epoch": 0.9639381797366915, "grad_norm": 1.1900969743728638, "learning_rate": 7.212240801027337e-08, "loss": 0.9938, "step": 4210 }, { "epoch": 0.9641671436748712, "grad_norm": 2.1308228969573975, "learning_rate": 7.123581145053849e-08, "loss": 1.0434, "step": 4211 }, { "epoch": 0.964396107613051, "grad_norm": 1.5577213764190674, "learning_rate": 7.035467860649348e-08, "loss": 1.0161, "step": 4212 }, { "epoch": 0.9646250715512307, "grad_norm": 1.1246519088745117, "learning_rate": 6.947900996301826e-08, "loss": 1.0164, "step": 4213 }, { "epoch": 0.9648540354894104, "grad_norm": 1.6314594745635986, "learning_rate": 6.860880600198627e-08, "loss": 1.0539, "step": 4214 }, { "epoch": 0.9650829994275901, "grad_norm": 1.6893818378448486, "learning_rate": 6.774406720226335e-08, "loss": 1.0481, "step": 4215 }, { "epoch": 0.9653119633657699, "grad_norm": 1.3539258241653442, "learning_rate": 6.688479403970883e-08, "loss": 1.0231, "step": 4216 }, { "epoch": 0.9655409273039496, "grad_norm": 1.230441927909851, "learning_rate": 6.603098698717336e-08, "loss": 1.0163, "step": 4217 }, { "epoch": 0.9657698912421294, "grad_norm": 1.3217246532440186, "learning_rate": 6.51826465144978e-08, "loss": 0.98, "step": 4218 }, { "epoch": 0.9659988551803091, "grad_norm": 1.1363701820373535, "learning_rate": 6.433977308851869e-08, "loss": 1.0446, "step": 4219 }, { "epoch": 0.9662278191184889, "grad_norm": 1.5187417268753052, "learning_rate": 6.35023671730628e-08, "loss": 1.0349, "step": 4220 }, { "epoch": 0.9664567830566686, "grad_norm": 1.2305715084075928, "learning_rate": 6.267042922894595e-08, "loss": 1.0425, "step": 4221 }, { "epoch": 0.9666857469948483, "grad_norm": 1.2499189376831055, "learning_rate": 6.18439597139775e-08, "loss": 0.9435, "step": 4222 }, { "epoch": 0.966914710933028, "grad_norm": 1.2281574010849, "learning_rate": 6.102295908295585e-08, "loss": 1.0268, "step": 4223 }, { "epoch": 0.9671436748712078, "grad_norm": 1.3302663564682007, "learning_rate": 6.020742778767185e-08, "loss": 1.012, "step": 4224 }, { "epoch": 0.9673726388093875, "grad_norm": 1.1993911266326904, "learning_rate": 5.93973662769054e-08, "loss": 1.0542, "step": 4225 }, { "epoch": 0.9676016027475672, "grad_norm": 1.2014667987823486, "learning_rate": 5.859277499642546e-08, "loss": 0.9975, "step": 4226 }, { "epoch": 0.967830566685747, "grad_norm": 1.501274585723877, "learning_rate": 5.7793654388993426e-08, "loss": 1.0324, "step": 4227 }, { "epoch": 0.9680595306239267, "grad_norm": 1.7279783487319946, "learning_rate": 5.700000489435753e-08, "loss": 1.0626, "step": 4228 }, { "epoch": 0.9682884945621064, "grad_norm": 1.46021568775177, "learning_rate": 5.621182694925731e-08, "loss": 1.0041, "step": 4229 }, { "epoch": 0.9685174585002863, "grad_norm": 1.1859759092330933, "learning_rate": 5.542912098741915e-08, "loss": 1.0243, "step": 4230 }, { "epoch": 0.968746422438466, "grad_norm": 1.5773413181304932, "learning_rate": 5.465188743956073e-08, "loss": 1.0268, "step": 4231 }, { "epoch": 0.9689753863766457, "grad_norm": 1.4995766878128052, "learning_rate": 5.388012673338661e-08, "loss": 1.0252, "step": 4232 }, { "epoch": 0.9692043503148254, "grad_norm": 1.340819239616394, "learning_rate": 5.3113839293590374e-08, "loss": 1.0199, "step": 4233 }, { "epoch": 0.9694333142530052, "grad_norm": 1.144403338432312, "learning_rate": 5.235302554185362e-08, "loss": 0.9865, "step": 4234 }, { "epoch": 0.9696622781911849, "grad_norm": 1.210654854774475, "learning_rate": 5.159768589684699e-08, "loss": 0.978, "step": 4235 }, { "epoch": 0.9698912421293646, "grad_norm": 1.2872427701950073, "learning_rate": 5.084782077422468e-08, "loss": 1.0102, "step": 4236 }, { "epoch": 0.9701202060675443, "grad_norm": 1.495758295059204, "learning_rate": 5.010343058663325e-08, "loss": 1.0315, "step": 4237 }, { "epoch": 0.9703491700057241, "grad_norm": 1.3226579427719116, "learning_rate": 4.9364515743705046e-08, "loss": 1.0261, "step": 4238 }, { "epoch": 0.9705781339439038, "grad_norm": 1.2678083181381226, "learning_rate": 4.863107665205702e-08, "loss": 1.0226, "step": 4239 }, { "epoch": 0.9708070978820835, "grad_norm": 1.6307822465896606, "learning_rate": 4.79031137152941e-08, "loss": 1.0238, "step": 4240 }, { "epoch": 0.9710360618202634, "grad_norm": 1.3601518869400024, "learning_rate": 4.71806273340103e-08, "loss": 0.9714, "step": 4241 }, { "epoch": 0.9712650257584431, "grad_norm": 1.3477270603179932, "learning_rate": 4.646361790578313e-08, "loss": 1.0751, "step": 4242 }, { "epoch": 0.9714939896966228, "grad_norm": 1.1436761617660522, "learning_rate": 4.575208582517587e-08, "loss": 1.0135, "step": 4243 }, { "epoch": 0.9717229536348025, "grad_norm": 2.0135879516601562, "learning_rate": 4.504603148373976e-08, "loss": 0.9799, "step": 4244 }, { "epoch": 0.9719519175729823, "grad_norm": 2.1591763496398926, "learning_rate": 4.4345455270010665e-08, "loss": 1.0147, "step": 4245 }, { "epoch": 0.972180881511162, "grad_norm": 1.2410547733306885, "learning_rate": 4.365035756950797e-08, "loss": 1.0154, "step": 4246 }, { "epoch": 0.9724098454493417, "grad_norm": 1.4873212575912476, "learning_rate": 4.296073876474016e-08, "loss": 1.0427, "step": 4247 }, { "epoch": 0.9726388093875215, "grad_norm": 1.2260034084320068, "learning_rate": 4.227659923519811e-08, "loss": 1.0528, "step": 4248 }, { "epoch": 0.9728677733257012, "grad_norm": 1.4164505004882812, "learning_rate": 4.159793935735734e-08, "loss": 1.0673, "step": 4249 }, { "epoch": 0.9730967372638809, "grad_norm": 1.2214871644973755, "learning_rate": 4.092475950468022e-08, "loss": 1.0818, "step": 4250 }, { "epoch": 0.9733257012020606, "grad_norm": 1.3067419528961182, "learning_rate": 4.025706004760932e-08, "loss": 0.9782, "step": 4251 }, { "epoch": 0.9735546651402404, "grad_norm": 1.2343987226486206, "learning_rate": 3.9594841353577384e-08, "loss": 1.093, "step": 4252 }, { "epoch": 0.9737836290784202, "grad_norm": 1.420095443725586, "learning_rate": 3.8938103786995144e-08, "loss": 1.0679, "step": 4253 }, { "epoch": 0.9740125930165999, "grad_norm": 1.1239619255065918, "learning_rate": 3.8286847709261276e-08, "loss": 1.0286, "step": 4254 }, { "epoch": 0.9742415569547797, "grad_norm": 1.2686529159545898, "learning_rate": 3.7641073478755786e-08, "loss": 0.9945, "step": 4255 }, { "epoch": 0.9744705208929594, "grad_norm": 2.8842384815216064, "learning_rate": 3.7000781450844405e-08, "loss": 1.0213, "step": 4256 }, { "epoch": 0.9746994848311391, "grad_norm": 1.8031467199325562, "learning_rate": 3.636597197787084e-08, "loss": 1.0022, "step": 4257 }, { "epoch": 0.9749284487693188, "grad_norm": 1.7563955783843994, "learning_rate": 3.573664540916899e-08, "loss": 0.9997, "step": 4258 }, { "epoch": 0.9751574127074986, "grad_norm": 2.2838706970214844, "learning_rate": 3.5112802091051834e-08, "loss": 1.0434, "step": 4259 }, { "epoch": 0.9753863766456783, "grad_norm": 1.1979697942733765, "learning_rate": 3.449444236681254e-08, "loss": 1.0448, "step": 4260 }, { "epoch": 0.975615340583858, "grad_norm": 1.2877243757247925, "learning_rate": 3.388156657673114e-08, "loss": 1.0362, "step": 4261 }, { "epoch": 0.9758443045220377, "grad_norm": 1.5697972774505615, "learning_rate": 3.327417505806785e-08, "loss": 1.0681, "step": 4262 }, { "epoch": 0.9760732684602175, "grad_norm": 1.3261923789978027, "learning_rate": 3.267226814506419e-08, "loss": 1.0489, "step": 4263 }, { "epoch": 0.9763022323983973, "grad_norm": 1.2220418453216553, "learning_rate": 3.2075846168946326e-08, "loss": 1.0713, "step": 4264 }, { "epoch": 0.976531196336577, "grad_norm": 1.7401745319366455, "learning_rate": 3.148490945791838e-08, "loss": 1.0271, "step": 4265 }, { "epoch": 0.9767601602747568, "grad_norm": 1.2124825716018677, "learning_rate": 3.089945833716912e-08, "loss": 1.0562, "step": 4266 }, { "epoch": 0.9769891242129365, "grad_norm": 1.3442974090576172, "learning_rate": 3.03194931288664e-08, "loss": 1.0586, "step": 4267 }, { "epoch": 0.9772180881511162, "grad_norm": 1.2674909830093384, "learning_rate": 2.9745014152161578e-08, "loss": 0.9957, "step": 4268 }, { "epoch": 0.9774470520892959, "grad_norm": 1.6897748708724976, "learning_rate": 2.917602172318401e-08, "loss": 0.9973, "step": 4269 }, { "epoch": 0.9776760160274757, "grad_norm": 1.3805571794509888, "learning_rate": 2.8612516155047674e-08, "loss": 0.9844, "step": 4270 }, { "epoch": 0.9779049799656554, "grad_norm": 1.2225171327590942, "learning_rate": 2.8054497757842304e-08, "loss": 1.034, "step": 4271 }, { "epoch": 0.9781339439038351, "grad_norm": 1.321537733078003, "learning_rate": 2.7501966838642258e-08, "loss": 1.0053, "step": 4272 }, { "epoch": 0.9783629078420149, "grad_norm": 1.2543835639953613, "learning_rate": 2.695492370149988e-08, "loss": 1.0385, "step": 4273 }, { "epoch": 0.9785918717801946, "grad_norm": 1.189489483833313, "learning_rate": 2.641336864744992e-08, "loss": 0.9956, "step": 4274 }, { "epoch": 0.9788208357183743, "grad_norm": 1.2529815435409546, "learning_rate": 2.5877301974503998e-08, "loss": 1.0218, "step": 4275 }, { "epoch": 0.9790497996565541, "grad_norm": 1.217818260192871, "learning_rate": 2.534672397765614e-08, "loss": 0.9919, "step": 4276 }, { "epoch": 0.9792787635947339, "grad_norm": 1.363022804260254, "learning_rate": 2.482163494887724e-08, "loss": 1.0145, "step": 4277 }, { "epoch": 0.9795077275329136, "grad_norm": 1.5168522596359253, "learning_rate": 2.430203517712171e-08, "loss": 1.0786, "step": 4278 }, { "epoch": 0.9797366914710933, "grad_norm": 1.2632395029067993, "learning_rate": 2.3787924948319718e-08, "loss": 1.023, "step": 4279 }, { "epoch": 0.979965655409273, "grad_norm": 1.2734144926071167, "learning_rate": 2.327930454538274e-08, "loss": 1.0511, "step": 4280 }, { "epoch": 0.9801946193474528, "grad_norm": 1.4147753715515137, "learning_rate": 2.2776174248199114e-08, "loss": 0.9804, "step": 4281 }, { "epoch": 0.9804235832856325, "grad_norm": 1.5188604593276978, "learning_rate": 2.227853433363736e-08, "loss": 1.0049, "step": 4282 }, { "epoch": 0.9806525472238122, "grad_norm": 1.0858154296875, "learning_rate": 2.1786385075545093e-08, "loss": 1.0509, "step": 4283 }, { "epoch": 0.980881511161992, "grad_norm": 1.3833143711090088, "learning_rate": 2.1299726744747896e-08, "loss": 0.9949, "step": 4284 }, { "epoch": 0.9811104751001717, "grad_norm": 1.2171403169631958, "learning_rate": 2.0818559609049327e-08, "loss": 1.04, "step": 4285 }, { "epoch": 0.9813394390383514, "grad_norm": 1.468177080154419, "learning_rate": 2.0342883933232027e-08, "loss": 0.985, "step": 4286 }, { "epoch": 0.9815684029765313, "grad_norm": 1.3929141759872437, "learning_rate": 1.987269997905661e-08, "loss": 0.9983, "step": 4287 }, { "epoch": 0.981797366914711, "grad_norm": 1.1607084274291992, "learning_rate": 1.9408008005260548e-08, "loss": 1.0426, "step": 4288 }, { "epoch": 0.9820263308528907, "grad_norm": 1.3756016492843628, "learning_rate": 1.8948808267560405e-08, "loss": 1.0324, "step": 4289 }, { "epoch": 0.9822552947910704, "grad_norm": 1.2399051189422607, "learning_rate": 1.8495101018649598e-08, "loss": 0.994, "step": 4290 }, { "epoch": 0.9824842587292502, "grad_norm": 1.5608595609664917, "learning_rate": 1.8046886508200633e-08, "loss": 1.0074, "step": 4291 }, { "epoch": 0.9827132226674299, "grad_norm": 1.3537348508834839, "learning_rate": 1.7604164982860662e-08, "loss": 1.0612, "step": 4292 }, { "epoch": 0.9829421866056096, "grad_norm": 1.5062490701675415, "learning_rate": 1.716693668625591e-08, "loss": 1.0275, "step": 4293 }, { "epoch": 0.9831711505437893, "grad_norm": 1.5584570169448853, "learning_rate": 1.673520185899058e-08, "loss": 1.0148, "step": 4294 }, { "epoch": 0.9834001144819691, "grad_norm": 1.1560457944869995, "learning_rate": 1.630896073864352e-08, "loss": 0.986, "step": 4295 }, { "epoch": 0.9836290784201488, "grad_norm": 1.2290360927581787, "learning_rate": 1.5888213559771548e-08, "loss": 1.0749, "step": 4296 }, { "epoch": 0.9838580423583285, "grad_norm": 1.4205414056777954, "learning_rate": 1.5472960553909456e-08, "loss": 0.9877, "step": 4297 }, { "epoch": 0.9840870062965082, "grad_norm": 1.6074973344802856, "learning_rate": 1.5063201949566674e-08, "loss": 1.0362, "step": 4298 }, { "epoch": 0.9843159702346881, "grad_norm": 1.3286057710647583, "learning_rate": 1.4658937972230613e-08, "loss": 0.9793, "step": 4299 }, { "epoch": 0.9845449341728678, "grad_norm": 1.3167859315872192, "learning_rate": 1.426016884436332e-08, "loss": 1.039, "step": 4300 }, { "epoch": 0.9847738981110475, "grad_norm": 1.1874357461929321, "learning_rate": 1.3866894785404816e-08, "loss": 0.9651, "step": 4301 }, { "epoch": 0.9850028620492273, "grad_norm": 1.4577040672302246, "learning_rate": 1.3479116011769766e-08, "loss": 1.0622, "step": 4302 }, { "epoch": 0.985231825987407, "grad_norm": 1.3572076559066772, "learning_rate": 1.3096832736850806e-08, "loss": 1.0132, "step": 4303 }, { "epoch": 0.9854607899255867, "grad_norm": 1.22506844997406, "learning_rate": 1.2720045171014106e-08, "loss": 1.0378, "step": 4304 }, { "epoch": 0.9856897538637664, "grad_norm": 1.5919448137283325, "learning_rate": 1.2348753521602696e-08, "loss": 1.0365, "step": 4305 }, { "epoch": 0.9859187178019462, "grad_norm": 1.5855189561843872, "learning_rate": 1.1982957992936472e-08, "loss": 1.0479, "step": 4306 }, { "epoch": 0.9861476817401259, "grad_norm": 1.5232701301574707, "learning_rate": 1.162265878630886e-08, "loss": 1.0253, "step": 4307 }, { "epoch": 0.9863766456783056, "grad_norm": 1.51606023311615, "learning_rate": 1.1267856099989039e-08, "loss": 0.9984, "step": 4308 }, { "epoch": 0.9866056096164854, "grad_norm": 1.5284004211425781, "learning_rate": 1.0918550129223049e-08, "loss": 1.0555, "step": 4309 }, { "epoch": 0.9868345735546652, "grad_norm": 1.2388665676116943, "learning_rate": 1.0574741066230465e-08, "loss": 0.9951, "step": 4310 }, { "epoch": 0.9870635374928449, "grad_norm": 1.161252737045288, "learning_rate": 1.0236429100206612e-08, "loss": 1.0634, "step": 4311 }, { "epoch": 0.9872925014310246, "grad_norm": 1.1357179880142212, "learning_rate": 9.903614417320351e-09, "loss": 1.0417, "step": 4312 }, { "epoch": 0.9875214653692044, "grad_norm": 1.3556911945343018, "learning_rate": 9.57629720071962e-09, "loss": 1.0326, "step": 4313 }, { "epoch": 0.9877504293073841, "grad_norm": 1.2056077718734741, "learning_rate": 9.254477630521452e-09, "loss": 0.945, "step": 4314 }, { "epoch": 0.9879793932455638, "grad_norm": 1.208575963973999, "learning_rate": 8.938155883823074e-09, "loss": 1.0396, "step": 4315 }, { "epoch": 0.9882083571837436, "grad_norm": 1.3659850358963013, "learning_rate": 8.627332134690802e-09, "loss": 1.0029, "step": 4316 }, { "epoch": 0.9884373211219233, "grad_norm": 1.363781213760376, "learning_rate": 8.322006554171147e-09, "loss": 1.0407, "step": 4317 }, { "epoch": 0.988666285060103, "grad_norm": 1.3712806701660156, "learning_rate": 8.02217931028082e-09, "loss": 0.9999, "step": 4318 }, { "epoch": 0.9888952489982827, "grad_norm": 1.421749234199524, "learning_rate": 7.727850568012286e-09, "loss": 1.0482, "step": 4319 }, { "epoch": 0.9891242129364625, "grad_norm": 1.1671499013900757, "learning_rate": 7.439020489332649e-09, "loss": 1.044, "step": 4320 }, { "epoch": 0.9893531768746422, "grad_norm": 1.8170863389968872, "learning_rate": 7.1556892331814394e-09, "loss": 0.9375, "step": 4321 }, { "epoch": 0.989582140812822, "grad_norm": 1.3475590944290161, "learning_rate": 6.8778569554750484e-09, "loss": 0.9789, "step": 4322 }, { "epoch": 0.9898111047510018, "grad_norm": 1.19892418384552, "learning_rate": 6.605523809102288e-09, "loss": 1.0129, "step": 4323 }, { "epoch": 0.9900400686891815, "grad_norm": 1.4870357513427734, "learning_rate": 6.3386899439243925e-09, "loss": 1.0412, "step": 4324 }, { "epoch": 0.9902690326273612, "grad_norm": 1.529685378074646, "learning_rate": 6.0773555067783485e-09, "loss": 1.0188, "step": 4325 }, { "epoch": 0.9904979965655409, "grad_norm": 1.2781044244766235, "learning_rate": 5.8215206414746764e-09, "loss": 1.0064, "step": 4326 }, { "epoch": 0.9907269605037207, "grad_norm": 1.5567418336868286, "learning_rate": 5.571185488797426e-09, "loss": 1.0069, "step": 4327 }, { "epoch": 0.9909559244419004, "grad_norm": 1.4737282991409302, "learning_rate": 5.326350186503071e-09, "loss": 1.0737, "step": 4328 }, { "epoch": 0.9911848883800801, "grad_norm": 1.3786917924880981, "learning_rate": 5.087014869322726e-09, "loss": 0.991, "step": 4329 }, { "epoch": 0.9914138523182598, "grad_norm": 1.3202197551727295, "learning_rate": 4.853179668959928e-09, "loss": 1.0897, "step": 4330 }, { "epoch": 0.9916428162564396, "grad_norm": 1.275343656539917, "learning_rate": 4.6248447140939675e-09, "loss": 1.0553, "step": 4331 }, { "epoch": 0.9918717801946193, "grad_norm": 1.365133285522461, "learning_rate": 4.4020101303743345e-09, "loss": 1.024, "step": 4332 }, { "epoch": 0.9921007441327991, "grad_norm": 1.2677979469299316, "learning_rate": 4.184676040426272e-09, "loss": 1.0297, "step": 4333 }, { "epoch": 0.9923297080709789, "grad_norm": 1.360640525817871, "learning_rate": 3.972842563845225e-09, "loss": 1.0112, "step": 4334 }, { "epoch": 0.9925586720091586, "grad_norm": 1.6422892808914185, "learning_rate": 3.7665098172023905e-09, "loss": 1.023, "step": 4335 }, { "epoch": 0.9927876359473383, "grad_norm": 1.0084567070007324, "learning_rate": 3.5656779140402777e-09, "loss": 0.974, "step": 4336 }, { "epoch": 0.993016599885518, "grad_norm": 1.3576377630233765, "learning_rate": 3.3703469648760367e-09, "loss": 0.9808, "step": 4337 }, { "epoch": 0.9932455638236978, "grad_norm": 1.804397463798523, "learning_rate": 3.1805170771970207e-09, "loss": 1.0415, "step": 4338 }, { "epoch": 0.9934745277618775, "grad_norm": 1.3295351266860962, "learning_rate": 2.9961883554674443e-09, "loss": 1.0812, "step": 4339 }, { "epoch": 0.9937034917000572, "grad_norm": 2.1188831329345703, "learning_rate": 2.8173609011195035e-09, "loss": 0.964, "step": 4340 }, { "epoch": 0.993932455638237, "grad_norm": 1.1726047992706299, "learning_rate": 2.6440348125622574e-09, "loss": 0.9524, "step": 4341 }, { "epoch": 0.9941614195764167, "grad_norm": 1.23749840259552, "learning_rate": 2.476210185173855e-09, "loss": 0.979, "step": 4342 }, { "epoch": 0.9943903835145964, "grad_norm": 1.6526784896850586, "learning_rate": 2.3138871113081997e-09, "loss": 1.0302, "step": 4343 }, { "epoch": 0.9946193474527761, "grad_norm": 1.434206485748291, "learning_rate": 2.1570656802905042e-09, "loss": 1.0205, "step": 4344 }, { "epoch": 0.994848311390956, "grad_norm": 1.5796558856964111, "learning_rate": 2.0057459784161848e-09, "loss": 1.0317, "step": 4345 }, { "epoch": 0.9950772753291357, "grad_norm": 1.5953503847122192, "learning_rate": 1.859928088957519e-09, "loss": 1.0114, "step": 4346 }, { "epoch": 0.9953062392673154, "grad_norm": 1.3271796703338623, "learning_rate": 1.7196120921558757e-09, "loss": 1.0554, "step": 4347 }, { "epoch": 0.9955352032054952, "grad_norm": 3.4375476837158203, "learning_rate": 1.5847980652261563e-09, "loss": 1.0024, "step": 4348 }, { "epoch": 0.9957641671436749, "grad_norm": 1.3996182680130005, "learning_rate": 1.4554860823556838e-09, "loss": 1.0361, "step": 4349 }, { "epoch": 0.9959931310818546, "grad_norm": 1.1781070232391357, "learning_rate": 1.3316762147030925e-09, "loss": 1.0539, "step": 4350 }, { "epoch": 0.9962220950200343, "grad_norm": 1.5741220712661743, "learning_rate": 1.213368530399439e-09, "loss": 0.9569, "step": 4351 }, { "epoch": 0.9964510589582141, "grad_norm": 1.3308751583099365, "learning_rate": 1.100563094550422e-09, "loss": 0.9695, "step": 4352 }, { "epoch": 0.9966800228963938, "grad_norm": 1.3270615339279175, "learning_rate": 9.932599692297207e-10, "loss": 1.0147, "step": 4353 }, { "epoch": 0.9969089868345735, "grad_norm": 1.6848193407058716, "learning_rate": 8.914592134867672e-10, "loss": 0.9697, "step": 4354 }, { "epoch": 0.9971379507727532, "grad_norm": 1.465806245803833, "learning_rate": 7.95160883341195e-10, "loss": 0.9577, "step": 4355 }, { "epoch": 0.9973669147109331, "grad_norm": 1.743776798248291, "learning_rate": 7.043650317850592e-10, "loss": 1.0717, "step": 4356 }, { "epoch": 0.9975958786491128, "grad_norm": 1.4670666456222534, "learning_rate": 6.190717087828368e-10, "loss": 0.9676, "step": 4357 }, { "epoch": 0.9978248425872925, "grad_norm": 1.3044846057891846, "learning_rate": 5.392809612703165e-10, "loss": 1.0857, "step": 4358 }, { "epoch": 0.9980538065254723, "grad_norm": 1.9425315856933594, "learning_rate": 4.649928331557085e-10, "loss": 1.061, "step": 4359 }, { "epoch": 0.998282770463652, "grad_norm": 1.157950520515442, "learning_rate": 3.9620736532075543e-10, "loss": 1.0149, "step": 4360 }, { "epoch": 0.9985117344018317, "grad_norm": 1.157373309135437, "learning_rate": 3.3292459561518055e-10, "loss": 1.0196, "step": 4361 }, { "epoch": 0.9987406983400114, "grad_norm": 1.302017331123352, "learning_rate": 2.7514455886334945e-10, "loss": 1.0031, "step": 4362 }, { "epoch": 0.9989696622781912, "grad_norm": 1.2691115140914917, "learning_rate": 2.2286728686315984e-10, "loss": 1.0935, "step": 4363 }, { "epoch": 0.9991986262163709, "grad_norm": 1.2917944192886353, "learning_rate": 1.7609280838049026e-10, "loss": 1.0505, "step": 4364 }, { "epoch": 0.9994275901545506, "grad_norm": 2.0502283573150635, "learning_rate": 1.3482114915475132e-10, "loss": 1.0274, "step": 4365 }, { "epoch": 0.9996565540927304, "grad_norm": 1.4327635765075684, "learning_rate": 9.905233189888563e-11, "loss": 1.0067, "step": 4366 }, { "epoch": 0.9998855180309101, "grad_norm": 1.210883617401123, "learning_rate": 6.87863762938168e-11, "loss": 1.04, "step": 4367 }, { "epoch": 0.9998855180309101, "step": 4367, "total_flos": 2.9591670167426826e+18, "train_loss": 1.0860715442400435, "train_runtime": 124901.2572, "train_samples_per_second": 4.476, "train_steps_per_second": 0.035 } ], "logging_steps": 1.0, "max_steps": 4367, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "total_flos": 2.9591670167426826e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }