diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30599 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998855180309101, + "eval_steps": 500, + "global_step": 4367, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002289639381797367, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1349, + "step": 1 + }, + { + "epoch": 0.0004579278763594734, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1263, + "step": 2 + }, + { + "epoch": 0.0006868918145392101, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1207, + "step": 3 + }, + { + "epoch": 0.0009158557527189468, + "grad_norm": 8.66817569732666, + "learning_rate": 1.5151515151515152e-07, + "loss": 2.1783, + "step": 4 + }, + { + "epoch": 0.0011448196908986834, + "grad_norm": 8.842265129089355, + "learning_rate": 3.0303030303030305e-07, + "loss": 2.0996, + "step": 5 + }, + { + "epoch": 0.0013737836290784202, + "grad_norm": 10.951105117797852, + "learning_rate": 4.5454545454545457e-07, + "loss": 2.2171, + "step": 6 + }, + { + "epoch": 0.0016027475672581567, + "grad_norm": 9.442892074584961, + "learning_rate": 6.060606060606061e-07, + "loss": 2.0768, + "step": 7 + }, + { + "epoch": 0.0018317115054378936, + "grad_norm": 8.939762115478516, + "learning_rate": 7.575757575757576e-07, + "loss": 2.1261, + "step": 8 + }, + { + "epoch": 0.00206067544361763, + "grad_norm": 8.166585922241211, + "learning_rate": 9.090909090909091e-07, + "loss": 2.0497, + "step": 9 + }, + { + "epoch": 0.0022896393817973667, + "grad_norm": 8.294096946716309, + "learning_rate": 1.0606060606060608e-06, + "loss": 2.0554, + "step": 10 + }, + { + "epoch": 0.0025186033199771037, + "grad_norm": 7.692963123321533, + "learning_rate": 1.2121212121212122e-06, + "loss": 2.0066, + "step": 11 + }, + { + "epoch": 0.0027475672581568403, + "grad_norm": 6.519928932189941, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.804, + "step": 12 + }, + { + "epoch": 0.002976531196336577, + "grad_norm": 6.0504631996154785, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.728, + "step": 13 + }, + { + "epoch": 0.0032054951345163135, + "grad_norm": 5.239335060119629, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.703, + "step": 14 + }, + { + "epoch": 0.0034344590726960505, + "grad_norm": 3.2573082447052, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.6709, + "step": 15 + }, + { + "epoch": 0.003663423010875787, + "grad_norm": 3.028881788253784, + "learning_rate": 1.96969696969697e-06, + "loss": 1.5926, + "step": 16 + }, + { + "epoch": 0.0038923869490555237, + "grad_norm": 2.915285110473633, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.5392, + "step": 17 + }, + { + "epoch": 0.00412135088723526, + "grad_norm": 3.2491884231567383, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.5468, + "step": 18 + }, + { + "epoch": 0.004350314825414997, + "grad_norm": 2.825366497039795, + "learning_rate": 2.4242424242424244e-06, + "loss": 1.4983, + "step": 19 + }, + { + "epoch": 0.0045792787635947334, + "grad_norm": 3.073415994644165, + "learning_rate": 2.575757575757576e-06, + "loss": 1.4596, + "step": 20 + }, + { + "epoch": 0.004808242701774471, + "grad_norm": 4.21063756942749, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.4261, + "step": 21 + }, + { + "epoch": 0.0050372066399542075, + "grad_norm": 2.9680824279785156, + "learning_rate": 2.8787878787878793e-06, + "loss": 1.4863, + "step": 22 + }, + { + "epoch": 0.005266170578133944, + "grad_norm": 3.912755250930786, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.4069, + "step": 23 + }, + { + "epoch": 0.005495134516313681, + "grad_norm": 1.9926663637161255, + "learning_rate": 3.181818181818182e-06, + "loss": 1.3404, + "step": 24 + }, + { + "epoch": 0.005724098454493417, + "grad_norm": 5.043450355529785, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.402, + "step": 25 + }, + { + "epoch": 0.005953062392673154, + "grad_norm": 2.1705479621887207, + "learning_rate": 3.4848484848484854e-06, + "loss": 1.412, + "step": 26 + }, + { + "epoch": 0.00618202633085289, + "grad_norm": 1.6663283109664917, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.4681, + "step": 27 + }, + { + "epoch": 0.006410990269032627, + "grad_norm": 1.9405096769332886, + "learning_rate": 3.7878787878787882e-06, + "loss": 1.4265, + "step": 28 + }, + { + "epoch": 0.0066399542072123644, + "grad_norm": 2.0428900718688965, + "learning_rate": 3.93939393939394e-06, + "loss": 1.362, + "step": 29 + }, + { + "epoch": 0.006868918145392101, + "grad_norm": 2.2596077919006348, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.373, + "step": 30 + }, + { + "epoch": 0.007097882083571838, + "grad_norm": 1.9293066263198853, + "learning_rate": 4.242424242424243e-06, + "loss": 1.3973, + "step": 31 + }, + { + "epoch": 0.007326846021751574, + "grad_norm": 1.669158697128296, + "learning_rate": 4.393939393939394e-06, + "loss": 1.3654, + "step": 32 + }, + { + "epoch": 0.007555809959931311, + "grad_norm": 1.5311822891235352, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.3982, + "step": 33 + }, + { + "epoch": 0.007784773898111047, + "grad_norm": 1.653910517692566, + "learning_rate": 4.696969696969698e-06, + "loss": 1.3749, + "step": 34 + }, + { + "epoch": 0.008013737836290785, + "grad_norm": 1.5567728281021118, + "learning_rate": 4.848484848484849e-06, + "loss": 1.3699, + "step": 35 + }, + { + "epoch": 0.00824270177447052, + "grad_norm": 1.6896796226501465, + "learning_rate": 5e-06, + "loss": 1.3356, + "step": 36 + }, + { + "epoch": 0.008471665712650258, + "grad_norm": 1.6226534843444824, + "learning_rate": 5.151515151515152e-06, + "loss": 1.3602, + "step": 37 + }, + { + "epoch": 0.008700629650829994, + "grad_norm": 1.81944739818573, + "learning_rate": 5.303030303030303e-06, + "loss": 1.3763, + "step": 38 + }, + { + "epoch": 0.008929593589009731, + "grad_norm": 1.9701725244522095, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.4414, + "step": 39 + }, + { + "epoch": 0.009158557527189467, + "grad_norm": 1.9407984018325806, + "learning_rate": 5.606060606060606e-06, + "loss": 1.3173, + "step": 40 + }, + { + "epoch": 0.009387521465369204, + "grad_norm": 1.5994501113891602, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.3494, + "step": 41 + }, + { + "epoch": 0.009616485403548942, + "grad_norm": 1.8734986782073975, + "learning_rate": 5.90909090909091e-06, + "loss": 1.3658, + "step": 42 + }, + { + "epoch": 0.009845449341728678, + "grad_norm": 1.8831076622009277, + "learning_rate": 6.060606060606061e-06, + "loss": 1.3024, + "step": 43 + }, + { + "epoch": 0.010074413279908415, + "grad_norm": 1.6908974647521973, + "learning_rate": 6.212121212121213e-06, + "loss": 1.4275, + "step": 44 + }, + { + "epoch": 0.01030337721808815, + "grad_norm": 1.742285132408142, + "learning_rate": 6.363636363636364e-06, + "loss": 1.3254, + "step": 45 + }, + { + "epoch": 0.010532341156267888, + "grad_norm": 1.9381937980651855, + "learning_rate": 6.515151515151516e-06, + "loss": 1.3249, + "step": 46 + }, + { + "epoch": 0.010761305094447624, + "grad_norm": 1.445646047592163, + "learning_rate": 6.666666666666667e-06, + "loss": 1.325, + "step": 47 + }, + { + "epoch": 0.010990269032627361, + "grad_norm": 1.6461410522460938, + "learning_rate": 6.818181818181818e-06, + "loss": 1.3958, + "step": 48 + }, + { + "epoch": 0.011219232970807097, + "grad_norm": 2.5436205863952637, + "learning_rate": 6.969696969696971e-06, + "loss": 1.2518, + "step": 49 + }, + { + "epoch": 0.011448196908986834, + "grad_norm": 1.5389620065689087, + "learning_rate": 7.121212121212122e-06, + "loss": 1.2919, + "step": 50 + }, + { + "epoch": 0.011677160847166572, + "grad_norm": 1.7769535779953003, + "learning_rate": 7.272727272727273e-06, + "loss": 1.2673, + "step": 51 + }, + { + "epoch": 0.011906124785346308, + "grad_norm": 2.1329517364501953, + "learning_rate": 7.424242424242425e-06, + "loss": 1.2995, + "step": 52 + }, + { + "epoch": 0.012135088723526045, + "grad_norm": 1.5156912803649902, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.3062, + "step": 53 + }, + { + "epoch": 0.01236405266170578, + "grad_norm": 1.4951452016830444, + "learning_rate": 7.727272727272727e-06, + "loss": 1.3429, + "step": 54 + }, + { + "epoch": 0.012593016599885518, + "grad_norm": 1.3737133741378784, + "learning_rate": 7.87878787878788e-06, + "loss": 1.2991, + "step": 55 + }, + { + "epoch": 0.012821980538065254, + "grad_norm": 1.3582038879394531, + "learning_rate": 8.03030303030303e-06, + "loss": 1.2792, + "step": 56 + }, + { + "epoch": 0.013050944476244991, + "grad_norm": 1.4764114618301392, + "learning_rate": 8.181818181818183e-06, + "loss": 1.3561, + "step": 57 + }, + { + "epoch": 0.013279908414424729, + "grad_norm": 1.6100513935089111, + "learning_rate": 8.333333333333334e-06, + "loss": 1.2303, + "step": 58 + }, + { + "epoch": 0.013508872352604465, + "grad_norm": 1.9281264543533325, + "learning_rate": 8.484848484848486e-06, + "loss": 1.3124, + "step": 59 + }, + { + "epoch": 0.013737836290784202, + "grad_norm": 1.6992368698120117, + "learning_rate": 8.636363636363637e-06, + "loss": 1.3066, + "step": 60 + }, + { + "epoch": 0.013966800228963938, + "grad_norm": 1.4923304319381714, + "learning_rate": 8.787878787878788e-06, + "loss": 1.3137, + "step": 61 + }, + { + "epoch": 0.014195764167143675, + "grad_norm": 3.8915576934814453, + "learning_rate": 8.93939393939394e-06, + "loss": 1.2634, + "step": 62 + }, + { + "epoch": 0.014424728105323411, + "grad_norm": 1.6618587970733643, + "learning_rate": 9.090909090909091e-06, + "loss": 1.2467, + "step": 63 + }, + { + "epoch": 0.014653692043503148, + "grad_norm": 1.4143376350402832, + "learning_rate": 9.242424242424244e-06, + "loss": 1.2817, + "step": 64 + }, + { + "epoch": 0.014882655981682884, + "grad_norm": 1.594414234161377, + "learning_rate": 9.393939393939396e-06, + "loss": 1.3015, + "step": 65 + }, + { + "epoch": 0.015111619919862622, + "grad_norm": 1.4304990768432617, + "learning_rate": 9.545454545454547e-06, + "loss": 1.3633, + "step": 66 + }, + { + "epoch": 0.015340583858042359, + "grad_norm": 2.2776196002960205, + "learning_rate": 9.696969696969698e-06, + "loss": 1.3109, + "step": 67 + }, + { + "epoch": 0.015569547796222095, + "grad_norm": 1.789172649383545, + "learning_rate": 9.84848484848485e-06, + "loss": 1.3003, + "step": 68 + }, + { + "epoch": 0.015798511734401832, + "grad_norm": 1.5805341005325317, + "learning_rate": 1e-05, + "loss": 1.3144, + "step": 69 + }, + { + "epoch": 0.01602747567258157, + "grad_norm": 1.2724578380584717, + "learning_rate": 1.0151515151515152e-05, + "loss": 1.2855, + "step": 70 + }, + { + "epoch": 0.016256439610761304, + "grad_norm": 1.451402187347412, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.2442, + "step": 71 + }, + { + "epoch": 0.01648540354894104, + "grad_norm": 2.8804028034210205, + "learning_rate": 1.0454545454545455e-05, + "loss": 1.2494, + "step": 72 + }, + { + "epoch": 0.01671436748712078, + "grad_norm": 2.359454870223999, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.1962, + "step": 73 + }, + { + "epoch": 0.016943331425300516, + "grad_norm": 3.0040102005004883, + "learning_rate": 1.0757575757575758e-05, + "loss": 1.26, + "step": 74 + }, + { + "epoch": 0.017172295363480253, + "grad_norm": 2.051539659500122, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.2601, + "step": 75 + }, + { + "epoch": 0.017401259301659987, + "grad_norm": 2.0666730403900146, + "learning_rate": 1.1060606060606061e-05, + "loss": 1.3369, + "step": 76 + }, + { + "epoch": 0.017630223239839725, + "grad_norm": 1.3947479724884033, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.2414, + "step": 77 + }, + { + "epoch": 0.017859187178019462, + "grad_norm": 1.6999517679214478, + "learning_rate": 1.1363636363636366e-05, + "loss": 1.2773, + "step": 78 + }, + { + "epoch": 0.0180881511161992, + "grad_norm": 1.3520228862762451, + "learning_rate": 1.1515151515151517e-05, + "loss": 1.2515, + "step": 79 + }, + { + "epoch": 0.018317115054378934, + "grad_norm": 1.7271336317062378, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.285, + "step": 80 + }, + { + "epoch": 0.01854607899255867, + "grad_norm": 1.5026074647903442, + "learning_rate": 1.181818181818182e-05, + "loss": 1.2984, + "step": 81 + }, + { + "epoch": 0.01877504293073841, + "grad_norm": 1.455031156539917, + "learning_rate": 1.1969696969696971e-05, + "loss": 1.1785, + "step": 82 + }, + { + "epoch": 0.019004006868918146, + "grad_norm": 2.4134457111358643, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.2861, + "step": 83 + }, + { + "epoch": 0.019232970807097884, + "grad_norm": 1.3893452882766724, + "learning_rate": 1.2272727272727274e-05, + "loss": 1.3012, + "step": 84 + }, + { + "epoch": 0.019461934745277618, + "grad_norm": 2.0484437942504883, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.2981, + "step": 85 + }, + { + "epoch": 0.019690898683457355, + "grad_norm": 1.8571381568908691, + "learning_rate": 1.2575757575757576e-05, + "loss": 1.2488, + "step": 86 + }, + { + "epoch": 0.019919862621637092, + "grad_norm": 1.5972386598587036, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.2762, + "step": 87 + }, + { + "epoch": 0.02014882655981683, + "grad_norm": 1.9522424936294556, + "learning_rate": 1.287878787878788e-05, + "loss": 1.2176, + "step": 88 + }, + { + "epoch": 0.020377790497996564, + "grad_norm": 1.641571044921875, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.21, + "step": 89 + }, + { + "epoch": 0.0206067544361763, + "grad_norm": 2.441856861114502, + "learning_rate": 1.3181818181818183e-05, + "loss": 1.2518, + "step": 90 + }, + { + "epoch": 0.02083571837435604, + "grad_norm": 2.1330080032348633, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.2659, + "step": 91 + }, + { + "epoch": 0.021064682312535776, + "grad_norm": 1.3508576154708862, + "learning_rate": 1.3484848484848486e-05, + "loss": 1.2586, + "step": 92 + }, + { + "epoch": 0.021293646250715514, + "grad_norm": 1.7117724418640137, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.3094, + "step": 93 + }, + { + "epoch": 0.021522610188895248, + "grad_norm": 1.690869927406311, + "learning_rate": 1.378787878787879e-05, + "loss": 1.1898, + "step": 94 + }, + { + "epoch": 0.021751574127074985, + "grad_norm": 2.3677923679351807, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.2628, + "step": 95 + }, + { + "epoch": 0.021980538065254723, + "grad_norm": 1.5887540578842163, + "learning_rate": 1.4090909090909092e-05, + "loss": 1.2717, + "step": 96 + }, + { + "epoch": 0.02220950200343446, + "grad_norm": 2.6575193405151367, + "learning_rate": 1.4242424242424245e-05, + "loss": 1.2429, + "step": 97 + }, + { + "epoch": 0.022438465941614194, + "grad_norm": 1.425506830215454, + "learning_rate": 1.4393939393939396e-05, + "loss": 1.277, + "step": 98 + }, + { + "epoch": 0.02266742987979393, + "grad_norm": 1.729892611503601, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.2601, + "step": 99 + }, + { + "epoch": 0.02289639381797367, + "grad_norm": 2.3861329555511475, + "learning_rate": 1.4696969696969699e-05, + "loss": 1.275, + "step": 100 + }, + { + "epoch": 0.023125357756153406, + "grad_norm": 1.65578031539917, + "learning_rate": 1.484848484848485e-05, + "loss": 1.248, + "step": 101 + }, + { + "epoch": 0.023354321694333144, + "grad_norm": 3.5684421062469482, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.2647, + "step": 102 + }, + { + "epoch": 0.023583285632512878, + "grad_norm": 19.274433135986328, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.2598, + "step": 103 + }, + { + "epoch": 0.023812249570692615, + "grad_norm": 1.5521622896194458, + "learning_rate": 1.5303030303030304e-05, + "loss": 1.2898, + "step": 104 + }, + { + "epoch": 0.024041213508872353, + "grad_norm": 1.7420704364776611, + "learning_rate": 1.5454545454545454e-05, + "loss": 1.289, + "step": 105 + }, + { + "epoch": 0.02427017744705209, + "grad_norm": 1.6716350317001343, + "learning_rate": 1.5606060606060605e-05, + "loss": 1.2162, + "step": 106 + }, + { + "epoch": 0.024499141385231828, + "grad_norm": 1.7894171476364136, + "learning_rate": 1.575757575757576e-05, + "loss": 1.2512, + "step": 107 + }, + { + "epoch": 0.02472810532341156, + "grad_norm": 2.7053062915802, + "learning_rate": 1.590909090909091e-05, + "loss": 1.2625, + "step": 108 + }, + { + "epoch": 0.0249570692615913, + "grad_norm": 1.7497886419296265, + "learning_rate": 1.606060606060606e-05, + "loss": 1.2179, + "step": 109 + }, + { + "epoch": 0.025186033199771037, + "grad_norm": 1.6836780309677124, + "learning_rate": 1.6212121212121212e-05, + "loss": 1.2076, + "step": 110 + }, + { + "epoch": 0.025414997137950774, + "grad_norm": 1.893149733543396, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.2613, + "step": 111 + }, + { + "epoch": 0.025643961076130508, + "grad_norm": 1.6643422842025757, + "learning_rate": 1.6515151515151517e-05, + "loss": 1.2215, + "step": 112 + }, + { + "epoch": 0.025872925014310245, + "grad_norm": 1.624299168586731, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.2552, + "step": 113 + }, + { + "epoch": 0.026101888952489983, + "grad_norm": 2.46860408782959, + "learning_rate": 1.681818181818182e-05, + "loss": 1.2901, + "step": 114 + }, + { + "epoch": 0.02633085289066972, + "grad_norm": 2.3799362182617188, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.198, + "step": 115 + }, + { + "epoch": 0.026559816828849458, + "grad_norm": 1.3481436967849731, + "learning_rate": 1.7121212121212123e-05, + "loss": 1.2413, + "step": 116 + }, + { + "epoch": 0.026788780767029192, + "grad_norm": 1.9589934349060059, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.2845, + "step": 117 + }, + { + "epoch": 0.02701774470520893, + "grad_norm": 2.889359712600708, + "learning_rate": 1.7424242424242425e-05, + "loss": 1.2236, + "step": 118 + }, + { + "epoch": 0.027246708643388667, + "grad_norm": 2.009254217147827, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.2147, + "step": 119 + }, + { + "epoch": 0.027475672581568404, + "grad_norm": 1.872424602508545, + "learning_rate": 1.772727272727273e-05, + "loss": 1.2385, + "step": 120 + }, + { + "epoch": 0.027704636519748138, + "grad_norm": 1.411062479019165, + "learning_rate": 1.787878787878788e-05, + "loss": 1.2238, + "step": 121 + }, + { + "epoch": 0.027933600457927876, + "grad_norm": 1.9089847803115845, + "learning_rate": 1.803030303030303e-05, + "loss": 1.3142, + "step": 122 + }, + { + "epoch": 0.028162564396107613, + "grad_norm": 1.5556221008300781, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.3167, + "step": 123 + }, + { + "epoch": 0.02839152833428735, + "grad_norm": 1.295181393623352, + "learning_rate": 1.8333333333333333e-05, + "loss": 1.2127, + "step": 124 + }, + { + "epoch": 0.028620492272467088, + "grad_norm": 1.6264946460723877, + "learning_rate": 1.8484848484848487e-05, + "loss": 1.2766, + "step": 125 + }, + { + "epoch": 0.028849456210646822, + "grad_norm": 1.6156526803970337, + "learning_rate": 1.8636363636363638e-05, + "loss": 1.2583, + "step": 126 + }, + { + "epoch": 0.02907842014882656, + "grad_norm": 2.654879570007324, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.223, + "step": 127 + }, + { + "epoch": 0.029307384087006297, + "grad_norm": 2.6344661712646484, + "learning_rate": 1.8939393939393943e-05, + "loss": 1.2516, + "step": 128 + }, + { + "epoch": 0.029536348025186034, + "grad_norm": 2.6589064598083496, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.2435, + "step": 129 + }, + { + "epoch": 0.029765311963365768, + "grad_norm": 2.257918357849121, + "learning_rate": 1.9242424242424244e-05, + "loss": 1.1835, + "step": 130 + }, + { + "epoch": 0.029994275901545506, + "grad_norm": 1.8436076641082764, + "learning_rate": 1.9393939393939395e-05, + "loss": 1.1735, + "step": 131 + }, + { + "epoch": 0.030223239839725243, + "grad_norm": 1.7338463068008423, + "learning_rate": 1.9545454545454546e-05, + "loss": 1.1817, + "step": 132 + }, + { + "epoch": 0.03045220377790498, + "grad_norm": 2.028693675994873, + "learning_rate": 1.96969696969697e-05, + "loss": 1.2663, + "step": 133 + }, + { + "epoch": 0.030681167716084718, + "grad_norm": 1.6048945188522339, + "learning_rate": 1.984848484848485e-05, + "loss": 1.237, + "step": 134 + }, + { + "epoch": 0.030910131654264452, + "grad_norm": 1.475034236907959, + "learning_rate": 2e-05, + "loss": 1.2534, + "step": 135 + }, + { + "epoch": 0.03113909559244419, + "grad_norm": 1.4430214166641235, + "learning_rate": 1.9999997248541923e-05, + "loss": 1.2202, + "step": 136 + }, + { + "epoch": 0.031368059530623923, + "grad_norm": 1.705660343170166, + "learning_rate": 1.9999988994169196e-05, + "loss": 1.2087, + "step": 137 + }, + { + "epoch": 0.031597023468803664, + "grad_norm": 1.6410300731658936, + "learning_rate": 1.9999975236886367e-05, + "loss": 1.1895, + "step": 138 + }, + { + "epoch": 0.0318259874069834, + "grad_norm": 2.1050198078155518, + "learning_rate": 1.9999955976701005e-05, + "loss": 1.2435, + "step": 139 + }, + { + "epoch": 0.03205495134516314, + "grad_norm": 2.0697057247161865, + "learning_rate": 1.9999931213623708e-05, + "loss": 1.2032, + "step": 140 + }, + { + "epoch": 0.03228391528334287, + "grad_norm": 1.8335312604904175, + "learning_rate": 1.9999900947668106e-05, + "loss": 1.2381, + "step": 141 + }, + { + "epoch": 0.03251287922152261, + "grad_norm": 1.9289928674697876, + "learning_rate": 1.9999865178850847e-05, + "loss": 1.2319, + "step": 142 + }, + { + "epoch": 0.03274184315970235, + "grad_norm": 2.391378164291382, + "learning_rate": 1.9999823907191623e-05, + "loss": 1.2736, + "step": 143 + }, + { + "epoch": 0.03297080709788208, + "grad_norm": 1.6855164766311646, + "learning_rate": 1.9999777132713137e-05, + "loss": 1.2463, + "step": 144 + }, + { + "epoch": 0.03319977103606182, + "grad_norm": 1.5160446166992188, + "learning_rate": 1.999972485544114e-05, + "loss": 1.2183, + "step": 145 + }, + { + "epoch": 0.03342873497424156, + "grad_norm": 1.6463406085968018, + "learning_rate": 1.9999667075404385e-05, + "loss": 1.2261, + "step": 146 + }, + { + "epoch": 0.03365769891242129, + "grad_norm": 1.453431248664856, + "learning_rate": 1.999960379263468e-05, + "loss": 1.2804, + "step": 147 + }, + { + "epoch": 0.03388666285060103, + "grad_norm": 4.494565486907959, + "learning_rate": 1.9999535007166847e-05, + "loss": 1.19, + "step": 148 + }, + { + "epoch": 0.034115626788780766, + "grad_norm": 2.251443386077881, + "learning_rate": 1.999946071903873e-05, + "loss": 1.2244, + "step": 149 + }, + { + "epoch": 0.03434459072696051, + "grad_norm": 1.86344575881958, + "learning_rate": 1.999938092829122e-05, + "loss": 1.2541, + "step": 150 + }, + { + "epoch": 0.03457355466514024, + "grad_norm": 2.023585796356201, + "learning_rate": 1.9999295634968216e-05, + "loss": 1.2251, + "step": 151 + }, + { + "epoch": 0.034802518603319975, + "grad_norm": 1.6223890781402588, + "learning_rate": 1.999920483911666e-05, + "loss": 1.2049, + "step": 152 + }, + { + "epoch": 0.035031482541499716, + "grad_norm": 1.8935831785202026, + "learning_rate": 1.9999108540786513e-05, + "loss": 1.2355, + "step": 153 + }, + { + "epoch": 0.03526044647967945, + "grad_norm": 1.2119618654251099, + "learning_rate": 1.9999006740030774e-05, + "loss": 1.2546, + "step": 154 + }, + { + "epoch": 0.035489410417859184, + "grad_norm": 1.663476586341858, + "learning_rate": 1.999889943690545e-05, + "loss": 1.2184, + "step": 155 + }, + { + "epoch": 0.035718374356038925, + "grad_norm": 1.5204002857208252, + "learning_rate": 1.9998786631469602e-05, + "loss": 1.2055, + "step": 156 + }, + { + "epoch": 0.03594733829421866, + "grad_norm": 1.5706137418746948, + "learning_rate": 1.9998668323785298e-05, + "loss": 1.1985, + "step": 157 + }, + { + "epoch": 0.0361763022323984, + "grad_norm": 2.2016441822052, + "learning_rate": 1.9998544513917646e-05, + "loss": 1.2262, + "step": 158 + }, + { + "epoch": 0.036405266170578134, + "grad_norm": 1.5397722721099854, + "learning_rate": 1.9998415201934775e-05, + "loss": 1.1625, + "step": 159 + }, + { + "epoch": 0.03663423010875787, + "grad_norm": 1.7281806468963623, + "learning_rate": 1.9998280387907845e-05, + "loss": 1.2658, + "step": 160 + }, + { + "epoch": 0.03686319404693761, + "grad_norm": 1.4419702291488647, + "learning_rate": 1.9998140071911044e-05, + "loss": 1.1674, + "step": 161 + }, + { + "epoch": 0.03709215798511734, + "grad_norm": 1.9832121133804321, + "learning_rate": 1.9997994254021584e-05, + "loss": 1.2543, + "step": 162 + }, + { + "epoch": 0.03732112192329708, + "grad_norm": 1.2950409650802612, + "learning_rate": 1.999784293431971e-05, + "loss": 1.2494, + "step": 163 + }, + { + "epoch": 0.03755008586147682, + "grad_norm": 1.6367920637130737, + "learning_rate": 1.9997686112888694e-05, + "loss": 1.1819, + "step": 164 + }, + { + "epoch": 0.03777904979965655, + "grad_norm": 1.458843469619751, + "learning_rate": 1.9997523789814827e-05, + "loss": 1.2508, + "step": 165 + }, + { + "epoch": 0.03800801373783629, + "grad_norm": 1.2992488145828247, + "learning_rate": 1.999735596518744e-05, + "loss": 1.2549, + "step": 166 + }, + { + "epoch": 0.038236977676016026, + "grad_norm": 1.3639360666275024, + "learning_rate": 1.999718263909888e-05, + "loss": 1.3374, + "step": 167 + }, + { + "epoch": 0.03846594161419577, + "grad_norm": 1.6184186935424805, + "learning_rate": 1.9997003811644534e-05, + "loss": 1.1888, + "step": 168 + }, + { + "epoch": 0.0386949055523755, + "grad_norm": 1.2626984119415283, + "learning_rate": 1.9996819482922804e-05, + "loss": 1.1682, + "step": 169 + }, + { + "epoch": 0.038923869490555235, + "grad_norm": 1.3981817960739136, + "learning_rate": 1.9996629653035128e-05, + "loss": 1.2318, + "step": 170 + }, + { + "epoch": 0.039152833428734976, + "grad_norm": 1.3171190023422241, + "learning_rate": 1.999643432208596e-05, + "loss": 1.1496, + "step": 171 + }, + { + "epoch": 0.03938179736691471, + "grad_norm": 1.2882895469665527, + "learning_rate": 1.99962334901828e-05, + "loss": 1.2616, + "step": 172 + }, + { + "epoch": 0.03961076130509445, + "grad_norm": 1.8390611410140991, + "learning_rate": 1.9996027157436154e-05, + "loss": 1.3038, + "step": 173 + }, + { + "epoch": 0.039839725243274185, + "grad_norm": 1.3590266704559326, + "learning_rate": 1.9995815323959576e-05, + "loss": 1.2718, + "step": 174 + }, + { + "epoch": 0.04006868918145392, + "grad_norm": 1.7429498434066772, + "learning_rate": 1.9995597989869625e-05, + "loss": 1.2578, + "step": 175 + }, + { + "epoch": 0.04029765311963366, + "grad_norm": 1.5614861249923706, + "learning_rate": 1.9995375155285906e-05, + "loss": 1.1698, + "step": 176 + }, + { + "epoch": 0.040526617057813394, + "grad_norm": 2.0290753841400146, + "learning_rate": 1.999514682033104e-05, + "loss": 1.219, + "step": 177 + }, + { + "epoch": 0.04075558099599313, + "grad_norm": 1.4158384799957275, + "learning_rate": 1.9994912985130682e-05, + "loss": 1.1945, + "step": 178 + }, + { + "epoch": 0.04098454493417287, + "grad_norm": 1.7731130123138428, + "learning_rate": 1.99946736498135e-05, + "loss": 1.1985, + "step": 179 + }, + { + "epoch": 0.0412135088723526, + "grad_norm": 1.9962372779846191, + "learning_rate": 1.9994428814511205e-05, + "loss": 1.2413, + "step": 180 + }, + { + "epoch": 0.041442472810532344, + "grad_norm": 1.7486003637313843, + "learning_rate": 1.9994178479358526e-05, + "loss": 1.1465, + "step": 181 + }, + { + "epoch": 0.04167143674871208, + "grad_norm": 1.415873646736145, + "learning_rate": 1.9993922644493223e-05, + "loss": 1.2388, + "step": 182 + }, + { + "epoch": 0.04190040068689181, + "grad_norm": 1.6285479068756104, + "learning_rate": 1.9993661310056076e-05, + "loss": 1.286, + "step": 183 + }, + { + "epoch": 0.04212936462507155, + "grad_norm": 1.3321681022644043, + "learning_rate": 1.99933944761909e-05, + "loss": 1.1976, + "step": 184 + }, + { + "epoch": 0.04235832856325129, + "grad_norm": 1.5006210803985596, + "learning_rate": 1.999312214304453e-05, + "loss": 1.209, + "step": 185 + }, + { + "epoch": 0.04258729250143103, + "grad_norm": 3.087294101715088, + "learning_rate": 1.999284431076682e-05, + "loss": 1.2405, + "step": 186 + }, + { + "epoch": 0.04281625643961076, + "grad_norm": 2.618927240371704, + "learning_rate": 1.999256097951067e-05, + "loss": 1.2988, + "step": 187 + }, + { + "epoch": 0.043045220377790495, + "grad_norm": 1.4773956537246704, + "learning_rate": 1.999227214943199e-05, + "loss": 1.1566, + "step": 188 + }, + { + "epoch": 0.043274184315970236, + "grad_norm": 1.824690818786621, + "learning_rate": 1.999197782068972e-05, + "loss": 1.2149, + "step": 189 + }, + { + "epoch": 0.04350314825414997, + "grad_norm": 1.7128965854644775, + "learning_rate": 1.9991677993445832e-05, + "loss": 1.1904, + "step": 190 + }, + { + "epoch": 0.04373211219232971, + "grad_norm": 1.7998428344726562, + "learning_rate": 1.999137266786531e-05, + "loss": 1.2846, + "step": 191 + }, + { + "epoch": 0.043961076130509445, + "grad_norm": 2.0802230834960938, + "learning_rate": 1.9991061844116178e-05, + "loss": 1.2073, + "step": 192 + }, + { + "epoch": 0.04419004006868918, + "grad_norm": 1.7808719873428345, + "learning_rate": 1.9990745522369482e-05, + "loss": 1.2883, + "step": 193 + }, + { + "epoch": 0.04441900400686892, + "grad_norm": 1.7531901597976685, + "learning_rate": 1.9990423702799283e-05, + "loss": 1.1568, + "step": 194 + }, + { + "epoch": 0.044647967945048654, + "grad_norm": 1.5850685834884644, + "learning_rate": 1.9990096385582682e-05, + "loss": 1.252, + "step": 195 + }, + { + "epoch": 0.04487693188322839, + "grad_norm": 1.3775941133499146, + "learning_rate": 1.9989763570899795e-05, + "loss": 1.2807, + "step": 196 + }, + { + "epoch": 0.04510589582140813, + "grad_norm": 1.3636221885681152, + "learning_rate": 1.998942525893377e-05, + "loss": 1.2385, + "step": 197 + }, + { + "epoch": 0.04533485975958786, + "grad_norm": 1.4277960062026978, + "learning_rate": 1.998908144987078e-05, + "loss": 1.166, + "step": 198 + }, + { + "epoch": 0.045563823697767604, + "grad_norm": 1.4968419075012207, + "learning_rate": 1.9988732143900013e-05, + "loss": 1.203, + "step": 199 + }, + { + "epoch": 0.04579278763594734, + "grad_norm": 1.6300857067108154, + "learning_rate": 1.998837734121369e-05, + "loss": 1.1981, + "step": 200 + }, + { + "epoch": 0.04602175157412707, + "grad_norm": 1.3024663925170898, + "learning_rate": 1.9988017042007066e-05, + "loss": 1.2181, + "step": 201 + }, + { + "epoch": 0.04625071551230681, + "grad_norm": 1.2611545324325562, + "learning_rate": 1.99876512464784e-05, + "loss": 1.2375, + "step": 202 + }, + { + "epoch": 0.04647967945048655, + "grad_norm": 1.7449328899383545, + "learning_rate": 1.9987279954828986e-05, + "loss": 1.2178, + "step": 203 + }, + { + "epoch": 0.04670864338866629, + "grad_norm": 1.4038939476013184, + "learning_rate": 1.998690316726315e-05, + "loss": 1.225, + "step": 204 + }, + { + "epoch": 0.04693760732684602, + "grad_norm": 1.573346495628357, + "learning_rate": 1.9986520883988233e-05, + "loss": 1.2241, + "step": 205 + }, + { + "epoch": 0.047166571265025756, + "grad_norm": 1.1829012632369995, + "learning_rate": 1.99861331052146e-05, + "loss": 1.1722, + "step": 206 + }, + { + "epoch": 0.0473955352032055, + "grad_norm": 1.6095627546310425, + "learning_rate": 1.9985739831155637e-05, + "loss": 1.2481, + "step": 207 + }, + { + "epoch": 0.04762449914138523, + "grad_norm": 1.8268073797225952, + "learning_rate": 1.998534106202777e-05, + "loss": 1.1972, + "step": 208 + }, + { + "epoch": 0.04785346307956497, + "grad_norm": 1.7510496377944946, + "learning_rate": 1.9984936798050435e-05, + "loss": 1.1889, + "step": 209 + }, + { + "epoch": 0.048082427017744706, + "grad_norm": 1.9209846258163452, + "learning_rate": 1.9984527039446093e-05, + "loss": 1.2393, + "step": 210 + }, + { + "epoch": 0.04831139095592444, + "grad_norm": 1.3671090602874756, + "learning_rate": 1.998411178644023e-05, + "loss": 1.1783, + "step": 211 + }, + { + "epoch": 0.04854035489410418, + "grad_norm": 1.4478919506072998, + "learning_rate": 1.9983691039261358e-05, + "loss": 1.269, + "step": 212 + }, + { + "epoch": 0.048769318832283914, + "grad_norm": 1.8099792003631592, + "learning_rate": 1.998326479814101e-05, + "loss": 1.1632, + "step": 213 + }, + { + "epoch": 0.048998282770463655, + "grad_norm": 1.502695083618164, + "learning_rate": 1.9982833063313746e-05, + "loss": 1.2305, + "step": 214 + }, + { + "epoch": 0.04922724670864339, + "grad_norm": 1.1444593667984009, + "learning_rate": 1.998239583501714e-05, + "loss": 1.1981, + "step": 215 + }, + { + "epoch": 0.04945621064682312, + "grad_norm": 1.7091630697250366, + "learning_rate": 1.99819531134918e-05, + "loss": 1.2201, + "step": 216 + }, + { + "epoch": 0.049685174585002864, + "grad_norm": 1.4177758693695068, + "learning_rate": 1.9981504898981352e-05, + "loss": 1.1817, + "step": 217 + }, + { + "epoch": 0.0499141385231826, + "grad_norm": 1.2633055448532104, + "learning_rate": 1.998105119173244e-05, + "loss": 1.191, + "step": 218 + }, + { + "epoch": 0.05014310246136233, + "grad_norm": 1.7247159481048584, + "learning_rate": 1.998059199199474e-05, + "loss": 1.2037, + "step": 219 + }, + { + "epoch": 0.05037206639954207, + "grad_norm": 1.7244751453399658, + "learning_rate": 1.9980127300020946e-05, + "loss": 1.1898, + "step": 220 + }, + { + "epoch": 0.05060103033772181, + "grad_norm": 1.452858567237854, + "learning_rate": 1.997965711606677e-05, + "loss": 1.2408, + "step": 221 + }, + { + "epoch": 0.05082999427590155, + "grad_norm": 1.4057929515838623, + "learning_rate": 1.9979181440390954e-05, + "loss": 1.2024, + "step": 222 + }, + { + "epoch": 0.05105895821408128, + "grad_norm": 1.3213428258895874, + "learning_rate": 1.9978700273255254e-05, + "loss": 1.2326, + "step": 223 + }, + { + "epoch": 0.051287922152261016, + "grad_norm": 1.3907994031906128, + "learning_rate": 1.9978213614924456e-05, + "loss": 1.169, + "step": 224 + }, + { + "epoch": 0.05151688609044076, + "grad_norm": 1.4973134994506836, + "learning_rate": 1.9977721465666365e-05, + "loss": 1.2163, + "step": 225 + }, + { + "epoch": 0.05174585002862049, + "grad_norm": 1.6128813028335571, + "learning_rate": 1.9977223825751802e-05, + "loss": 1.1489, + "step": 226 + }, + { + "epoch": 0.05197481396680023, + "grad_norm": 1.228517770767212, + "learning_rate": 1.997672069545462e-05, + "loss": 1.1924, + "step": 227 + }, + { + "epoch": 0.052203777904979966, + "grad_norm": 1.5707398653030396, + "learning_rate": 1.9976212075051683e-05, + "loss": 1.2292, + "step": 228 + }, + { + "epoch": 0.0524327418431597, + "grad_norm": 1.605344533920288, + "learning_rate": 1.997569796482288e-05, + "loss": 1.3047, + "step": 229 + }, + { + "epoch": 0.05266170578133944, + "grad_norm": 1.1628702878952026, + "learning_rate": 1.9975178365051123e-05, + "loss": 1.205, + "step": 230 + }, + { + "epoch": 0.052890669719519175, + "grad_norm": 1.4649019241333008, + "learning_rate": 1.9974653276022347e-05, + "loss": 1.2032, + "step": 231 + }, + { + "epoch": 0.053119633657698916, + "grad_norm": 1.4579243659973145, + "learning_rate": 1.99741226980255e-05, + "loss": 1.238, + "step": 232 + }, + { + "epoch": 0.05334859759587865, + "grad_norm": 1.4899598360061646, + "learning_rate": 1.997358663135255e-05, + "loss": 1.2124, + "step": 233 + }, + { + "epoch": 0.053577561534058384, + "grad_norm": 1.2446345090866089, + "learning_rate": 1.99730450762985e-05, + "loss": 1.2593, + "step": 234 + }, + { + "epoch": 0.053806525472238124, + "grad_norm": 1.5485501289367676, + "learning_rate": 1.997249803316136e-05, + "loss": 1.1541, + "step": 235 + }, + { + "epoch": 0.05403548941041786, + "grad_norm": 1.386314868927002, + "learning_rate": 1.997194550224216e-05, + "loss": 1.234, + "step": 236 + }, + { + "epoch": 0.05426445334859759, + "grad_norm": 1.917922854423523, + "learning_rate": 1.9971387483844956e-05, + "loss": 1.2255, + "step": 237 + }, + { + "epoch": 0.05449341728677733, + "grad_norm": 1.9101078510284424, + "learning_rate": 1.9970823978276818e-05, + "loss": 1.1737, + "step": 238 + }, + { + "epoch": 0.05472238122495707, + "grad_norm": 2.0706257820129395, + "learning_rate": 1.9970254985847842e-05, + "loss": 1.1984, + "step": 239 + }, + { + "epoch": 0.05495134516313681, + "grad_norm": 1.579801321029663, + "learning_rate": 1.9969680506871138e-05, + "loss": 1.1953, + "step": 240 + }, + { + "epoch": 0.05518030910131654, + "grad_norm": 1.2857059240341187, + "learning_rate": 1.9969100541662833e-05, + "loss": 1.2072, + "step": 241 + }, + { + "epoch": 0.055409273039496276, + "grad_norm": 1.162064552307129, + "learning_rate": 1.996851509054208e-05, + "loss": 1.1743, + "step": 242 + }, + { + "epoch": 0.05563823697767602, + "grad_norm": 1.7716028690338135, + "learning_rate": 1.9967924153831054e-05, + "loss": 1.1876, + "step": 243 + }, + { + "epoch": 0.05586720091585575, + "grad_norm": 1.3083211183547974, + "learning_rate": 1.9967327731854937e-05, + "loss": 1.2037, + "step": 244 + }, + { + "epoch": 0.05609616485403549, + "grad_norm": 1.6880347728729248, + "learning_rate": 1.9966725824941933e-05, + "loss": 1.112, + "step": 245 + }, + { + "epoch": 0.056325128792215226, + "grad_norm": 1.951745629310608, + "learning_rate": 1.996611843342327e-05, + "loss": 1.2209, + "step": 246 + }, + { + "epoch": 0.05655409273039496, + "grad_norm": 2.4489657878875732, + "learning_rate": 1.9965505557633188e-05, + "loss": 1.1525, + "step": 247 + }, + { + "epoch": 0.0567830566685747, + "grad_norm": 1.4747159481048584, + "learning_rate": 1.996488719790895e-05, + "loss": 1.1597, + "step": 248 + }, + { + "epoch": 0.057012020606754435, + "grad_norm": 1.5697540044784546, + "learning_rate": 1.9964263354590835e-05, + "loss": 1.1907, + "step": 249 + }, + { + "epoch": 0.057240984544934176, + "grad_norm": 1.4222530126571655, + "learning_rate": 1.9963634028022133e-05, + "loss": 1.2253, + "step": 250 + }, + { + "epoch": 0.05746994848311391, + "grad_norm": 1.2914453744888306, + "learning_rate": 1.9962999218549156e-05, + "loss": 1.2175, + "step": 251 + }, + { + "epoch": 0.057698912421293644, + "grad_norm": 1.646164059638977, + "learning_rate": 1.9962358926521245e-05, + "loss": 1.1746, + "step": 252 + }, + { + "epoch": 0.057927876359473385, + "grad_norm": 2.148345947265625, + "learning_rate": 1.996171315229074e-05, + "loss": 1.1377, + "step": 253 + }, + { + "epoch": 0.05815684029765312, + "grad_norm": 1.5290459394454956, + "learning_rate": 1.9961061896213006e-05, + "loss": 1.1186, + "step": 254 + }, + { + "epoch": 0.05838580423583286, + "grad_norm": 1.3710248470306396, + "learning_rate": 1.9960405158646425e-05, + "loss": 1.1941, + "step": 255 + }, + { + "epoch": 0.058614768174012594, + "grad_norm": 1.6446150541305542, + "learning_rate": 1.9959742939952393e-05, + "loss": 1.1667, + "step": 256 + }, + { + "epoch": 0.05884373211219233, + "grad_norm": 1.3985366821289062, + "learning_rate": 1.9959075240495322e-05, + "loss": 1.1506, + "step": 257 + }, + { + "epoch": 0.05907269605037207, + "grad_norm": 1.5973879098892212, + "learning_rate": 1.9958402060642644e-05, + "loss": 1.1326, + "step": 258 + }, + { + "epoch": 0.0593016599885518, + "grad_norm": 1.499578833580017, + "learning_rate": 1.9957723400764803e-05, + "loss": 1.1938, + "step": 259 + }, + { + "epoch": 0.059530623926731537, + "grad_norm": 1.3453638553619385, + "learning_rate": 1.9957039261235263e-05, + "loss": 1.2466, + "step": 260 + }, + { + "epoch": 0.05975958786491128, + "grad_norm": 1.7341777086257935, + "learning_rate": 1.9956349642430494e-05, + "loss": 1.1251, + "step": 261 + }, + { + "epoch": 0.05998855180309101, + "grad_norm": 1.2891796827316284, + "learning_rate": 1.9955654544729992e-05, + "loss": 1.1765, + "step": 262 + }, + { + "epoch": 0.06021751574127075, + "grad_norm": 1.1669985055923462, + "learning_rate": 1.9954953968516262e-05, + "loss": 1.2109, + "step": 263 + }, + { + "epoch": 0.060446479679450486, + "grad_norm": 1.5981719493865967, + "learning_rate": 1.9954247914174828e-05, + "loss": 1.1865, + "step": 264 + }, + { + "epoch": 0.06067544361763022, + "grad_norm": 1.8978960514068604, + "learning_rate": 1.9953536382094218e-05, + "loss": 1.2389, + "step": 265 + }, + { + "epoch": 0.06090440755580996, + "grad_norm": 1.3212120532989502, + "learning_rate": 1.995281937266599e-05, + "loss": 1.1844, + "step": 266 + }, + { + "epoch": 0.061133371493989695, + "grad_norm": 2.165839672088623, + "learning_rate": 1.995209688628471e-05, + "loss": 1.196, + "step": 267 + }, + { + "epoch": 0.061362335432169436, + "grad_norm": 1.4239095449447632, + "learning_rate": 1.9951368923347945e-05, + "loss": 1.177, + "step": 268 + }, + { + "epoch": 0.06159129937034917, + "grad_norm": 1.4673502445220947, + "learning_rate": 1.9950635484256296e-05, + "loss": 1.2372, + "step": 269 + }, + { + "epoch": 0.061820263308528904, + "grad_norm": 1.2616180181503296, + "learning_rate": 1.9949896569413368e-05, + "loss": 1.1906, + "step": 270 + }, + { + "epoch": 0.062049227246708645, + "grad_norm": 1.8740971088409424, + "learning_rate": 1.9949152179225776e-05, + "loss": 1.2036, + "step": 271 + }, + { + "epoch": 0.06227819118488838, + "grad_norm": 1.2659952640533447, + "learning_rate": 1.9948402314103153e-05, + "loss": 1.1836, + "step": 272 + }, + { + "epoch": 0.06250715512306812, + "grad_norm": 1.446509599685669, + "learning_rate": 1.994764697445815e-05, + "loss": 1.2553, + "step": 273 + }, + { + "epoch": 0.06273611906124785, + "grad_norm": 1.2037794589996338, + "learning_rate": 1.994688616070641e-05, + "loss": 1.209, + "step": 274 + }, + { + "epoch": 0.06296508299942759, + "grad_norm": 1.1829689741134644, + "learning_rate": 1.9946119873266615e-05, + "loss": 1.192, + "step": 275 + }, + { + "epoch": 0.06319404693760733, + "grad_norm": 1.409515619277954, + "learning_rate": 1.994534811256044e-05, + "loss": 1.2247, + "step": 276 + }, + { + "epoch": 0.06342301087578707, + "grad_norm": 1.238846778869629, + "learning_rate": 1.994457087901258e-05, + "loss": 1.2204, + "step": 277 + }, + { + "epoch": 0.0636519748139668, + "grad_norm": 1.5566067695617676, + "learning_rate": 1.9943788173050743e-05, + "loss": 1.2295, + "step": 278 + }, + { + "epoch": 0.06388093875214654, + "grad_norm": 1.221211552619934, + "learning_rate": 1.9942999995105646e-05, + "loss": 1.1664, + "step": 279 + }, + { + "epoch": 0.06410990269032628, + "grad_norm": 1.3222095966339111, + "learning_rate": 1.9942206345611008e-05, + "loss": 1.1554, + "step": 280 + }, + { + "epoch": 0.064338866628506, + "grad_norm": 1.4909206628799438, + "learning_rate": 1.9941407225003577e-05, + "loss": 1.2614, + "step": 281 + }, + { + "epoch": 0.06456783056668575, + "grad_norm": 1.290727972984314, + "learning_rate": 1.9940602633723097e-05, + "loss": 1.2267, + "step": 282 + }, + { + "epoch": 0.06479679450486549, + "grad_norm": 2.3527672290802, + "learning_rate": 1.9939792572212328e-05, + "loss": 1.1742, + "step": 283 + }, + { + "epoch": 0.06502575844304521, + "grad_norm": 1.4631099700927734, + "learning_rate": 1.993897704091705e-05, + "loss": 1.2321, + "step": 284 + }, + { + "epoch": 0.06525472238122496, + "grad_norm": 1.4212102890014648, + "learning_rate": 1.9938156040286027e-05, + "loss": 1.208, + "step": 285 + }, + { + "epoch": 0.0654836863194047, + "grad_norm": 1.9231419563293457, + "learning_rate": 1.9937329570771057e-05, + "loss": 1.2423, + "step": 286 + }, + { + "epoch": 0.06571265025758444, + "grad_norm": 2.0737898349761963, + "learning_rate": 1.9936497632826937e-05, + "loss": 1.1687, + "step": 287 + }, + { + "epoch": 0.06594161419576416, + "grad_norm": 1.1904573440551758, + "learning_rate": 1.993566022691148e-05, + "loss": 1.2161, + "step": 288 + }, + { + "epoch": 0.0661705781339439, + "grad_norm": 1.472386121749878, + "learning_rate": 1.99348173534855e-05, + "loss": 1.2219, + "step": 289 + }, + { + "epoch": 0.06639954207212365, + "grad_norm": 2.0884242057800293, + "learning_rate": 1.993396901301283e-05, + "loss": 1.1996, + "step": 290 + }, + { + "epoch": 0.06662850601030337, + "grad_norm": 1.3736237287521362, + "learning_rate": 1.9933115205960295e-05, + "loss": 1.1683, + "step": 291 + }, + { + "epoch": 0.06685746994848311, + "grad_norm": 1.3148783445358276, + "learning_rate": 1.9932255932797736e-05, + "loss": 1.2628, + "step": 292 + }, + { + "epoch": 0.06708643388666286, + "grad_norm": 1.330783486366272, + "learning_rate": 1.9931391193998015e-05, + "loss": 1.1524, + "step": 293 + }, + { + "epoch": 0.06731539782484258, + "grad_norm": 1.5354028940200806, + "learning_rate": 1.9930520990036984e-05, + "loss": 1.2563, + "step": 294 + }, + { + "epoch": 0.06754436176302232, + "grad_norm": 1.3578846454620361, + "learning_rate": 1.9929645321393505e-05, + "loss": 1.2097, + "step": 295 + }, + { + "epoch": 0.06777332570120206, + "grad_norm": 1.1557707786560059, + "learning_rate": 1.9928764188549462e-05, + "loss": 1.1626, + "step": 296 + }, + { + "epoch": 0.06800228963938179, + "grad_norm": 1.2463425397872925, + "learning_rate": 1.9927877591989727e-05, + "loss": 1.2083, + "step": 297 + }, + { + "epoch": 0.06823125357756153, + "grad_norm": 1.1633163690567017, + "learning_rate": 1.992698553220219e-05, + "loss": 1.1593, + "step": 298 + }, + { + "epoch": 0.06846021751574127, + "grad_norm": 1.3011326789855957, + "learning_rate": 1.992608800967774e-05, + "loss": 1.2601, + "step": 299 + }, + { + "epoch": 0.06868918145392101, + "grad_norm": 1.7210752964019775, + "learning_rate": 1.992518502491028e-05, + "loss": 1.2187, + "step": 300 + }, + { + "epoch": 0.06891814539210074, + "grad_norm": 1.1964160203933716, + "learning_rate": 1.992427657839671e-05, + "loss": 1.1478, + "step": 301 + }, + { + "epoch": 0.06914710933028048, + "grad_norm": 1.2960009574890137, + "learning_rate": 1.992336267063695e-05, + "loss": 1.207, + "step": 302 + }, + { + "epoch": 0.06937607326846022, + "grad_norm": 1.6698633432388306, + "learning_rate": 1.9922443302133906e-05, + "loss": 1.133, + "step": 303 + }, + { + "epoch": 0.06960503720663995, + "grad_norm": 1.4826613664627075, + "learning_rate": 1.9921518473393502e-05, + "loss": 1.1586, + "step": 304 + }, + { + "epoch": 0.06983400114481969, + "grad_norm": 1.3968387842178345, + "learning_rate": 1.9920588184924664e-05, + "loss": 1.1621, + "step": 305 + }, + { + "epoch": 0.07006296508299943, + "grad_norm": 1.6237884759902954, + "learning_rate": 1.9919652437239326e-05, + "loss": 1.2013, + "step": 306 + }, + { + "epoch": 0.07029192902117916, + "grad_norm": 1.5827672481536865, + "learning_rate": 1.9918711230852416e-05, + "loss": 1.1804, + "step": 307 + }, + { + "epoch": 0.0705208929593589, + "grad_norm": 3.0790855884552, + "learning_rate": 1.9917764566281874e-05, + "loss": 1.1535, + "step": 308 + }, + { + "epoch": 0.07074985689753864, + "grad_norm": 1.3645802736282349, + "learning_rate": 1.9916812444048642e-05, + "loss": 1.1941, + "step": 309 + }, + { + "epoch": 0.07097882083571837, + "grad_norm": 1.1226625442504883, + "learning_rate": 1.9915854864676665e-05, + "loss": 1.1721, + "step": 310 + }, + { + "epoch": 0.07120778477389811, + "grad_norm": 1.386254906654358, + "learning_rate": 1.991489182869289e-05, + "loss": 1.0778, + "step": 311 + }, + { + "epoch": 0.07143674871207785, + "grad_norm": 1.1469475030899048, + "learning_rate": 1.9913923336627267e-05, + "loss": 1.1466, + "step": 312 + }, + { + "epoch": 0.07166571265025759, + "grad_norm": 1.7655390501022339, + "learning_rate": 1.9912949389012754e-05, + "loss": 1.1651, + "step": 313 + }, + { + "epoch": 0.07189467658843732, + "grad_norm": 1.2459030151367188, + "learning_rate": 1.9911969986385297e-05, + "loss": 1.2548, + "step": 314 + }, + { + "epoch": 0.07212364052661706, + "grad_norm": 1.4673657417297363, + "learning_rate": 1.991098512928386e-05, + "loss": 1.1818, + "step": 315 + }, + { + "epoch": 0.0723526044647968, + "grad_norm": 1.3704054355621338, + "learning_rate": 1.9909994818250403e-05, + "loss": 1.1223, + "step": 316 + }, + { + "epoch": 0.07258156840297653, + "grad_norm": 1.2631467580795288, + "learning_rate": 1.990899905382988e-05, + "loss": 1.1135, + "step": 317 + }, + { + "epoch": 0.07281053234115627, + "grad_norm": 1.255850076675415, + "learning_rate": 1.990799783657026e-05, + "loss": 1.1675, + "step": 318 + }, + { + "epoch": 0.07303949627933601, + "grad_norm": 1.2794235944747925, + "learning_rate": 1.9906991167022496e-05, + "loss": 1.145, + "step": 319 + }, + { + "epoch": 0.07326846021751574, + "grad_norm": 1.3548914194107056, + "learning_rate": 1.990597904574055e-05, + "loss": 1.2148, + "step": 320 + }, + { + "epoch": 0.07349742415569548, + "grad_norm": 1.7542753219604492, + "learning_rate": 1.990496147328139e-05, + "loss": 1.164, + "step": 321 + }, + { + "epoch": 0.07372638809387522, + "grad_norm": 1.3537834882736206, + "learning_rate": 1.9903938450204972e-05, + "loss": 1.2505, + "step": 322 + }, + { + "epoch": 0.07395535203205496, + "grad_norm": 1.2525663375854492, + "learning_rate": 1.9902909977074267e-05, + "loss": 1.2464, + "step": 323 + }, + { + "epoch": 0.07418431597023468, + "grad_norm": 1.4234391450881958, + "learning_rate": 1.9901876054455217e-05, + "loss": 1.1412, + "step": 324 + }, + { + "epoch": 0.07441327990841443, + "grad_norm": 1.5655121803283691, + "learning_rate": 1.9900836682916796e-05, + "loss": 1.2074, + "step": 325 + }, + { + "epoch": 0.07464224384659417, + "grad_norm": 1.317477822303772, + "learning_rate": 1.989979186303096e-05, + "loss": 1.1611, + "step": 326 + }, + { + "epoch": 0.0748712077847739, + "grad_norm": 1.2287671566009521, + "learning_rate": 1.989874159537266e-05, + "loss": 1.1862, + "step": 327 + }, + { + "epoch": 0.07510017172295363, + "grad_norm": 1.3680412769317627, + "learning_rate": 1.989768588051985e-05, + "loss": 1.0958, + "step": 328 + }, + { + "epoch": 0.07532913566113338, + "grad_norm": 1.3635417222976685, + "learning_rate": 1.9896624719053483e-05, + "loss": 1.1807, + "step": 329 + }, + { + "epoch": 0.0755580995993131, + "grad_norm": 1.7830880880355835, + "learning_rate": 1.9895558111557503e-05, + "loss": 1.2397, + "step": 330 + }, + { + "epoch": 0.07578706353749284, + "grad_norm": 1.3236726522445679, + "learning_rate": 1.9894486058618863e-05, + "loss": 1.1608, + "step": 331 + }, + { + "epoch": 0.07601602747567258, + "grad_norm": 1.733843207359314, + "learning_rate": 1.98934085608275e-05, + "loss": 1.2268, + "step": 332 + }, + { + "epoch": 0.07624499141385231, + "grad_norm": 1.5161333084106445, + "learning_rate": 1.9892325618776353e-05, + "loss": 1.2225, + "step": 333 + }, + { + "epoch": 0.07647395535203205, + "grad_norm": 1.2248483896255493, + "learning_rate": 1.9891237233061354e-05, + "loss": 1.1838, + "step": 334 + }, + { + "epoch": 0.0767029192902118, + "grad_norm": 1.2949497699737549, + "learning_rate": 1.989014340428143e-05, + "loss": 1.1763, + "step": 335 + }, + { + "epoch": 0.07693188322839153, + "grad_norm": 1.6322613954544067, + "learning_rate": 1.9889044133038514e-05, + "loss": 1.1773, + "step": 336 + }, + { + "epoch": 0.07716084716657126, + "grad_norm": 1.5641744136810303, + "learning_rate": 1.9887939419937522e-05, + "loss": 1.2079, + "step": 337 + }, + { + "epoch": 0.077389811104751, + "grad_norm": 1.1563526391983032, + "learning_rate": 1.9886829265586368e-05, + "loss": 1.1775, + "step": 338 + }, + { + "epoch": 0.07761877504293074, + "grad_norm": 1.9111676216125488, + "learning_rate": 1.9885713670595958e-05, + "loss": 1.1931, + "step": 339 + }, + { + "epoch": 0.07784773898111047, + "grad_norm": 1.3635084629058838, + "learning_rate": 1.98845926355802e-05, + "loss": 1.207, + "step": 340 + }, + { + "epoch": 0.07807670291929021, + "grad_norm": 1.4956107139587402, + "learning_rate": 1.988346616115598e-05, + "loss": 1.1679, + "step": 341 + }, + { + "epoch": 0.07830566685746995, + "grad_norm": 1.2649390697479248, + "learning_rate": 1.98823342479432e-05, + "loss": 1.1901, + "step": 342 + }, + { + "epoch": 0.07853463079564968, + "grad_norm": 1.5861189365386963, + "learning_rate": 1.9881196896564735e-05, + "loss": 1.2169, + "step": 343 + }, + { + "epoch": 0.07876359473382942, + "grad_norm": 1.2659194469451904, + "learning_rate": 1.9880054107646467e-05, + "loss": 1.2246, + "step": 344 + }, + { + "epoch": 0.07899255867200916, + "grad_norm": 1.5740238428115845, + "learning_rate": 1.9878905881817254e-05, + "loss": 1.2331, + "step": 345 + }, + { + "epoch": 0.0792215226101889, + "grad_norm": 1.9337862730026245, + "learning_rate": 1.9877752219708956e-05, + "loss": 1.1526, + "step": 346 + }, + { + "epoch": 0.07945048654836863, + "grad_norm": 1.5918827056884766, + "learning_rate": 1.987659312195643e-05, + "loss": 1.1238, + "step": 347 + }, + { + "epoch": 0.07967945048654837, + "grad_norm": 1.6437872648239136, + "learning_rate": 1.9875428589197513e-05, + "loss": 1.1845, + "step": 348 + }, + { + "epoch": 0.07990841442472811, + "grad_norm": 1.2831597328186035, + "learning_rate": 1.9874258622073044e-05, + "loss": 1.1319, + "step": 349 + }, + { + "epoch": 0.08013737836290784, + "grad_norm": 1.4037307500839233, + "learning_rate": 1.9873083221226833e-05, + "loss": 1.2522, + "step": 350 + }, + { + "epoch": 0.08036634230108758, + "grad_norm": 1.400461196899414, + "learning_rate": 1.9871902387305707e-05, + "loss": 1.1294, + "step": 351 + }, + { + "epoch": 0.08059530623926732, + "grad_norm": 1.286853551864624, + "learning_rate": 1.9870716120959462e-05, + "loss": 1.1933, + "step": 352 + }, + { + "epoch": 0.08082427017744705, + "grad_norm": 1.2215043306350708, + "learning_rate": 1.9869524422840893e-05, + "loss": 1.1226, + "step": 353 + }, + { + "epoch": 0.08105323411562679, + "grad_norm": 1.3354159593582153, + "learning_rate": 1.9868327293605778e-05, + "loss": 1.1322, + "step": 354 + }, + { + "epoch": 0.08128219805380653, + "grad_norm": 1.2361743450164795, + "learning_rate": 1.986712473391289e-05, + "loss": 1.2118, + "step": 355 + }, + { + "epoch": 0.08151116199198626, + "grad_norm": 1.4514862298965454, + "learning_rate": 1.986591674442399e-05, + "loss": 1.1469, + "step": 356 + }, + { + "epoch": 0.081740125930166, + "grad_norm": 1.2745472192764282, + "learning_rate": 1.9864703325803818e-05, + "loss": 1.1403, + "step": 357 + }, + { + "epoch": 0.08196908986834574, + "grad_norm": 1.3737826347351074, + "learning_rate": 1.986348447872011e-05, + "loss": 1.1907, + "step": 358 + }, + { + "epoch": 0.08219805380652548, + "grad_norm": 1.354296326637268, + "learning_rate": 1.986226020384359e-05, + "loss": 1.1332, + "step": 359 + }, + { + "epoch": 0.0824270177447052, + "grad_norm": 1.3757176399230957, + "learning_rate": 1.986103050184797e-05, + "loss": 1.2131, + "step": 360 + }, + { + "epoch": 0.08265598168288495, + "grad_norm": 1.770670771598816, + "learning_rate": 1.9859795373409934e-05, + "loss": 1.1724, + "step": 361 + }, + { + "epoch": 0.08288494562106469, + "grad_norm": 1.3382035493850708, + "learning_rate": 1.985855481920917e-05, + "loss": 1.1411, + "step": 362 + }, + { + "epoch": 0.08311390955924441, + "grad_norm": 1.1877765655517578, + "learning_rate": 1.9857308839928346e-05, + "loss": 1.2451, + "step": 363 + }, + { + "epoch": 0.08334287349742416, + "grad_norm": 1.3735177516937256, + "learning_rate": 1.9856057436253105e-05, + "loss": 1.1821, + "step": 364 + }, + { + "epoch": 0.0835718374356039, + "grad_norm": 1.5288448333740234, + "learning_rate": 1.9854800608872096e-05, + "loss": 1.1408, + "step": 365 + }, + { + "epoch": 0.08380080137378362, + "grad_norm": 1.8656703233718872, + "learning_rate": 1.9853538358476933e-05, + "loss": 1.2307, + "step": 366 + }, + { + "epoch": 0.08402976531196336, + "grad_norm": 1.520883560180664, + "learning_rate": 1.9852270685762222e-05, + "loss": 1.1503, + "step": 367 + }, + { + "epoch": 0.0842587292501431, + "grad_norm": 1.1496034860610962, + "learning_rate": 1.9850997591425555e-05, + "loss": 1.2023, + "step": 368 + }, + { + "epoch": 0.08448769318832285, + "grad_norm": 1.2435383796691895, + "learning_rate": 1.9849719076167502e-05, + "loss": 1.1091, + "step": 369 + }, + { + "epoch": 0.08471665712650257, + "grad_norm": 1.5105100870132446, + "learning_rate": 1.9848435140691627e-05, + "loss": 1.2194, + "step": 370 + }, + { + "epoch": 0.08494562106468231, + "grad_norm": 1.2073237895965576, + "learning_rate": 1.9847145785704457e-05, + "loss": 1.2124, + "step": 371 + }, + { + "epoch": 0.08517458500286205, + "grad_norm": 1.3399255275726318, + "learning_rate": 1.9845851011915526e-05, + "loss": 1.2134, + "step": 372 + }, + { + "epoch": 0.08540354894104178, + "grad_norm": 1.1257169246673584, + "learning_rate": 1.9844550820037326e-05, + "loss": 1.2316, + "step": 373 + }, + { + "epoch": 0.08563251287922152, + "grad_norm": 1.2916021347045898, + "learning_rate": 1.9843245210785348e-05, + "loss": 1.1766, + "step": 374 + }, + { + "epoch": 0.08586147681740126, + "grad_norm": 1.1885099411010742, + "learning_rate": 1.9841934184878056e-05, + "loss": 1.1737, + "step": 375 + }, + { + "epoch": 0.08609044075558099, + "grad_norm": 1.5768221616744995, + "learning_rate": 1.98406177430369e-05, + "loss": 1.1579, + "step": 376 + }, + { + "epoch": 0.08631940469376073, + "grad_norm": 1.2692902088165283, + "learning_rate": 1.98392958859863e-05, + "loss": 1.1498, + "step": 377 + }, + { + "epoch": 0.08654836863194047, + "grad_norm": 1.360911250114441, + "learning_rate": 1.9837968614453666e-05, + "loss": 1.1672, + "step": 378 + }, + { + "epoch": 0.0867773325701202, + "grad_norm": 1.3675583600997925, + "learning_rate": 1.9836635929169388e-05, + "loss": 1.2173, + "step": 379 + }, + { + "epoch": 0.08700629650829994, + "grad_norm": 1.1969804763793945, + "learning_rate": 1.9835297830866827e-05, + "loss": 1.1914, + "step": 380 + }, + { + "epoch": 0.08723526044647968, + "grad_norm": 25.4526424407959, + "learning_rate": 1.983395432028233e-05, + "loss": 1.2027, + "step": 381 + }, + { + "epoch": 0.08746422438465942, + "grad_norm": 1.52403724193573, + "learning_rate": 1.9832605398155217e-05, + "loss": 1.2035, + "step": 382 + }, + { + "epoch": 0.08769318832283915, + "grad_norm": 4.724262237548828, + "learning_rate": 1.9831251065227792e-05, + "loss": 1.2138, + "step": 383 + }, + { + "epoch": 0.08792215226101889, + "grad_norm": 2.621134042739868, + "learning_rate": 1.9829891322245326e-05, + "loss": 1.1642, + "step": 384 + }, + { + "epoch": 0.08815111619919863, + "grad_norm": 3.4685866832733154, + "learning_rate": 1.9828526169956083e-05, + "loss": 1.1791, + "step": 385 + }, + { + "epoch": 0.08838008013737836, + "grad_norm": 1.3899956941604614, + "learning_rate": 1.982715560911129e-05, + "loss": 1.1847, + "step": 386 + }, + { + "epoch": 0.0886090440755581, + "grad_norm": 1.1521110534667969, + "learning_rate": 1.9825779640465157e-05, + "loss": 1.129, + "step": 387 + }, + { + "epoch": 0.08883800801373784, + "grad_norm": 1.7473269701004028, + "learning_rate": 1.982439826477487e-05, + "loss": 1.1534, + "step": 388 + }, + { + "epoch": 0.08906697195191757, + "grad_norm": 1.4069769382476807, + "learning_rate": 1.9823011482800584e-05, + "loss": 1.1092, + "step": 389 + }, + { + "epoch": 0.08929593589009731, + "grad_norm": 1.2800990343093872, + "learning_rate": 1.9821619295305432e-05, + "loss": 1.1823, + "step": 390 + }, + { + "epoch": 0.08952489982827705, + "grad_norm": 1.3700530529022217, + "learning_rate": 1.9820221703055528e-05, + "loss": 1.1921, + "step": 391 + }, + { + "epoch": 0.08975386376645678, + "grad_norm": 1.9944264888763428, + "learning_rate": 1.9818818706819955e-05, + "loss": 1.1447, + "step": 392 + }, + { + "epoch": 0.08998282770463652, + "grad_norm": 2.0315797328948975, + "learning_rate": 1.981741030737077e-05, + "loss": 1.1634, + "step": 393 + }, + { + "epoch": 0.09021179164281626, + "grad_norm": 1.4205044507980347, + "learning_rate": 1.9815996505483e-05, + "loss": 1.1817, + "step": 394 + }, + { + "epoch": 0.090440755580996, + "grad_norm": 1.4898089170455933, + "learning_rate": 1.9814577301934647e-05, + "loss": 1.1931, + "step": 395 + }, + { + "epoch": 0.09066971951917573, + "grad_norm": 1.5351992845535278, + "learning_rate": 1.9813152697506696e-05, + "loss": 1.2307, + "step": 396 + }, + { + "epoch": 0.09089868345735547, + "grad_norm": 1.2297886610031128, + "learning_rate": 1.9811722692983088e-05, + "loss": 1.161, + "step": 397 + }, + { + "epoch": 0.09112764739553521, + "grad_norm": 1.5338091850280762, + "learning_rate": 1.981028728915074e-05, + "loss": 1.2151, + "step": 398 + }, + { + "epoch": 0.09135661133371493, + "grad_norm": 1.4196423292160034, + "learning_rate": 1.980884648679955e-05, + "loss": 1.2312, + "step": 399 + }, + { + "epoch": 0.09158557527189468, + "grad_norm": 1.2989675998687744, + "learning_rate": 1.980740028672237e-05, + "loss": 1.1532, + "step": 400 + }, + { + "epoch": 0.09181453921007442, + "grad_norm": 1.5549927949905396, + "learning_rate": 1.9805948689715043e-05, + "loss": 1.1157, + "step": 401 + }, + { + "epoch": 0.09204350314825414, + "grad_norm": 1.2734893560409546, + "learning_rate": 1.9804491696576364e-05, + "loss": 1.1663, + "step": 402 + }, + { + "epoch": 0.09227246708643388, + "grad_norm": 1.376017689704895, + "learning_rate": 1.9803029308108105e-05, + "loss": 1.1519, + "step": 403 + }, + { + "epoch": 0.09250143102461363, + "grad_norm": 1.1765146255493164, + "learning_rate": 1.9801561525115006e-05, + "loss": 1.1376, + "step": 404 + }, + { + "epoch": 0.09273039496279337, + "grad_norm": 1.1429637670516968, + "learning_rate": 1.9800088348404778e-05, + "loss": 1.1182, + "step": 405 + }, + { + "epoch": 0.0929593589009731, + "grad_norm": 1.3974339962005615, + "learning_rate": 1.9798609778788094e-05, + "loss": 1.141, + "step": 406 + }, + { + "epoch": 0.09318832283915283, + "grad_norm": 1.5140388011932373, + "learning_rate": 1.97971258170786e-05, + "loss": 1.2011, + "step": 407 + }, + { + "epoch": 0.09341728677733258, + "grad_norm": 1.161309838294983, + "learning_rate": 1.979563646409291e-05, + "loss": 1.1788, + "step": 408 + }, + { + "epoch": 0.0936462507155123, + "grad_norm": 1.2854355573654175, + "learning_rate": 1.97941417206506e-05, + "loss": 1.2321, + "step": 409 + }, + { + "epoch": 0.09387521465369204, + "grad_norm": 1.1982879638671875, + "learning_rate": 1.9792641587574212e-05, + "loss": 1.1316, + "step": 410 + }, + { + "epoch": 0.09410417859187178, + "grad_norm": 1.1869289875030518, + "learning_rate": 1.979113606568926e-05, + "loss": 1.1207, + "step": 411 + }, + { + "epoch": 0.09433314253005151, + "grad_norm": 1.159303069114685, + "learning_rate": 1.9789625155824226e-05, + "loss": 1.0941, + "step": 412 + }, + { + "epoch": 0.09456210646823125, + "grad_norm": 1.4857593774795532, + "learning_rate": 1.978810885881054e-05, + "loss": 1.1209, + "step": 413 + }, + { + "epoch": 0.094791070406411, + "grad_norm": 1.3553754091262817, + "learning_rate": 1.9786587175482613e-05, + "loss": 1.154, + "step": 414 + }, + { + "epoch": 0.09502003434459072, + "grad_norm": 1.219914197921753, + "learning_rate": 1.9785060106677818e-05, + "loss": 1.1627, + "step": 415 + }, + { + "epoch": 0.09524899828277046, + "grad_norm": 2.246405839920044, + "learning_rate": 1.978352765323648e-05, + "loss": 1.2182, + "step": 416 + }, + { + "epoch": 0.0954779622209502, + "grad_norm": 1.3677849769592285, + "learning_rate": 1.97819898160019e-05, + "loss": 1.1858, + "step": 417 + }, + { + "epoch": 0.09570692615912994, + "grad_norm": 1.52046537399292, + "learning_rate": 1.9780446595820336e-05, + "loss": 1.2089, + "step": 418 + }, + { + "epoch": 0.09593589009730967, + "grad_norm": 1.4131478071212769, + "learning_rate": 1.9778897993541014e-05, + "loss": 1.1796, + "step": 419 + }, + { + "epoch": 0.09616485403548941, + "grad_norm": 1.3519349098205566, + "learning_rate": 1.977734401001611e-05, + "loss": 1.148, + "step": 420 + }, + { + "epoch": 0.09639381797366915, + "grad_norm": 1.985721230506897, + "learning_rate": 1.9775784646100768e-05, + "loss": 1.1838, + "step": 421 + }, + { + "epoch": 0.09662278191184888, + "grad_norm": 1.2649965286254883, + "learning_rate": 1.97742199026531e-05, + "loss": 1.1589, + "step": 422 + }, + { + "epoch": 0.09685174585002862, + "grad_norm": 1.4421800374984741, + "learning_rate": 1.977264978053416e-05, + "loss": 1.0934, + "step": 423 + }, + { + "epoch": 0.09708070978820836, + "grad_norm": 1.287441611289978, + "learning_rate": 1.977107428060799e-05, + "loss": 1.1881, + "step": 424 + }, + { + "epoch": 0.09730967372638809, + "grad_norm": 1.2831951379776, + "learning_rate": 1.9769493403741556e-05, + "loss": 1.1283, + "step": 425 + }, + { + "epoch": 0.09753863766456783, + "grad_norm": 1.2015918493270874, + "learning_rate": 1.976790715080481e-05, + "loss": 1.1961, + "step": 426 + }, + { + "epoch": 0.09776760160274757, + "grad_norm": 1.4825067520141602, + "learning_rate": 1.9766315522670654e-05, + "loss": 1.1451, + "step": 427 + }, + { + "epoch": 0.09799656554092731, + "grad_norm": 1.2135714292526245, + "learning_rate": 1.976471852021495e-05, + "loss": 1.0926, + "step": 428 + }, + { + "epoch": 0.09822552947910704, + "grad_norm": 1.2095484733581543, + "learning_rate": 1.9763116144316506e-05, + "loss": 1.1423, + "step": 429 + }, + { + "epoch": 0.09845449341728678, + "grad_norm": 1.321748971939087, + "learning_rate": 1.9761508395857106e-05, + "loss": 1.1583, + "step": 430 + }, + { + "epoch": 0.09868345735546652, + "grad_norm": 1.3187049627304077, + "learning_rate": 1.9759895275721476e-05, + "loss": 1.1745, + "step": 431 + }, + { + "epoch": 0.09891242129364625, + "grad_norm": 1.3241872787475586, + "learning_rate": 1.97582767847973e-05, + "loss": 1.1689, + "step": 432 + }, + { + "epoch": 0.09914138523182599, + "grad_norm": 1.2542831897735596, + "learning_rate": 1.9756652923975227e-05, + "loss": 1.2044, + "step": 433 + }, + { + "epoch": 0.09937034917000573, + "grad_norm": 1.2562075853347778, + "learning_rate": 1.9755023694148846e-05, + "loss": 1.1629, + "step": 434 + }, + { + "epoch": 0.09959931310818546, + "grad_norm": 1.2366596460342407, + "learning_rate": 1.9753389096214716e-05, + "loss": 1.0803, + "step": 435 + }, + { + "epoch": 0.0998282770463652, + "grad_norm": 1.4391214847564697, + "learning_rate": 1.9751749131072335e-05, + "loss": 1.1797, + "step": 436 + }, + { + "epoch": 0.10005724098454494, + "grad_norm": 4.435413360595703, + "learning_rate": 1.9750103799624165e-05, + "loss": 1.1591, + "step": 437 + }, + { + "epoch": 0.10028620492272466, + "grad_norm": 1.376836895942688, + "learning_rate": 1.974845310277562e-05, + "loss": 1.2096, + "step": 438 + }, + { + "epoch": 0.1005151688609044, + "grad_norm": 1.4392584562301636, + "learning_rate": 1.974679704143507e-05, + "loss": 1.0871, + "step": 439 + }, + { + "epoch": 0.10074413279908415, + "grad_norm": 1.1275047063827515, + "learning_rate": 1.974513561651382e-05, + "loss": 1.1411, + "step": 440 + }, + { + "epoch": 0.10097309673726389, + "grad_norm": 1.4672117233276367, + "learning_rate": 1.974346882892614e-05, + "loss": 1.138, + "step": 441 + }, + { + "epoch": 0.10120206067544361, + "grad_norm": 1.2267554998397827, + "learning_rate": 1.974179667958926e-05, + "loss": 1.1537, + "step": 442 + }, + { + "epoch": 0.10143102461362336, + "grad_norm": 1.3913313150405884, + "learning_rate": 1.9740119169423337e-05, + "loss": 1.1623, + "step": 443 + }, + { + "epoch": 0.1016599885518031, + "grad_norm": 1.6974395513534546, + "learning_rate": 1.97384362993515e-05, + "loss": 1.208, + "step": 444 + }, + { + "epoch": 0.10188895248998282, + "grad_norm": 1.3473076820373535, + "learning_rate": 1.9736748070299813e-05, + "loss": 1.1479, + "step": 445 + }, + { + "epoch": 0.10211791642816256, + "grad_norm": 1.2066441774368286, + "learning_rate": 1.9735054483197295e-05, + "loss": 1.2317, + "step": 446 + }, + { + "epoch": 0.1023468803663423, + "grad_norm": 1.5624324083328247, + "learning_rate": 1.973335553897591e-05, + "loss": 1.1282, + "step": 447 + }, + { + "epoch": 0.10257584430452203, + "grad_norm": 1.9326655864715576, + "learning_rate": 1.9731651238570582e-05, + "loss": 1.1848, + "step": 448 + }, + { + "epoch": 0.10280480824270177, + "grad_norm": 2.2776360511779785, + "learning_rate": 1.9729941582919168e-05, + "loss": 1.1599, + "step": 449 + }, + { + "epoch": 0.10303377218088151, + "grad_norm": 1.2282005548477173, + "learning_rate": 1.9728226572962474e-05, + "loss": 1.1843, + "step": 450 + }, + { + "epoch": 0.10326273611906125, + "grad_norm": 1.4309104681015015, + "learning_rate": 1.972650620964426e-05, + "loss": 1.1689, + "step": 451 + }, + { + "epoch": 0.10349170005724098, + "grad_norm": 1.427143931388855, + "learning_rate": 1.9724780493911227e-05, + "loss": 1.1471, + "step": 452 + }, + { + "epoch": 0.10372066399542072, + "grad_norm": 1.2180664539337158, + "learning_rate": 1.9723049426713018e-05, + "loss": 1.1932, + "step": 453 + }, + { + "epoch": 0.10394962793360046, + "grad_norm": 1.2000045776367188, + "learning_rate": 1.9721313009002228e-05, + "loss": 1.1751, + "step": 454 + }, + { + "epoch": 0.10417859187178019, + "grad_norm": 1.3412353992462158, + "learning_rate": 1.9719571241734395e-05, + "loss": 1.147, + "step": 455 + }, + { + "epoch": 0.10440755580995993, + "grad_norm": 1.2962464094161987, + "learning_rate": 1.9717824125867993e-05, + "loss": 1.2452, + "step": 456 + }, + { + "epoch": 0.10463651974813967, + "grad_norm": 1.10789155960083, + "learning_rate": 1.9716071662364454e-05, + "loss": 1.2014, + "step": 457 + }, + { + "epoch": 0.1048654836863194, + "grad_norm": 1.4773873090744019, + "learning_rate": 1.9714313852188133e-05, + "loss": 1.2092, + "step": 458 + }, + { + "epoch": 0.10509444762449914, + "grad_norm": 1.4391075372695923, + "learning_rate": 1.9712550696306346e-05, + "loss": 1.1431, + "step": 459 + }, + { + "epoch": 0.10532341156267888, + "grad_norm": 1.7549716234207153, + "learning_rate": 1.9710782195689343e-05, + "loss": 1.1395, + "step": 460 + }, + { + "epoch": 0.10555237550085861, + "grad_norm": 1.5657472610473633, + "learning_rate": 1.970900835131031e-05, + "loss": 1.132, + "step": 461 + }, + { + "epoch": 0.10578133943903835, + "grad_norm": 1.4908957481384277, + "learning_rate": 1.9707229164145386e-05, + "loss": 1.1533, + "step": 462 + }, + { + "epoch": 0.10601030337721809, + "grad_norm": 1.2966023683547974, + "learning_rate": 1.9705444635173635e-05, + "loss": 1.1855, + "step": 463 + }, + { + "epoch": 0.10623926731539783, + "grad_norm": 1.271470546722412, + "learning_rate": 1.970365476537707e-05, + "loss": 1.2443, + "step": 464 + }, + { + "epoch": 0.10646823125357756, + "grad_norm": 1.3924498558044434, + "learning_rate": 1.9701859555740647e-05, + "loss": 1.1831, + "step": 465 + }, + { + "epoch": 0.1066971951917573, + "grad_norm": 1.053683876991272, + "learning_rate": 1.9700059007252248e-05, + "loss": 1.2135, + "step": 466 + }, + { + "epoch": 0.10692615912993704, + "grad_norm": 1.3220330476760864, + "learning_rate": 1.9698253120902703e-05, + "loss": 1.1772, + "step": 467 + }, + { + "epoch": 0.10715512306811677, + "grad_norm": 1.528494119644165, + "learning_rate": 1.9696441897685777e-05, + "loss": 1.2416, + "step": 468 + }, + { + "epoch": 0.10738408700629651, + "grad_norm": 1.3539191484451294, + "learning_rate": 1.969462533859817e-05, + "loss": 1.1354, + "step": 469 + }, + { + "epoch": 0.10761305094447625, + "grad_norm": 1.5194259881973267, + "learning_rate": 1.9692803444639517e-05, + "loss": 1.208, + "step": 470 + }, + { + "epoch": 0.10784201488265598, + "grad_norm": 1.2938241958618164, + "learning_rate": 1.9690976216812397e-05, + "loss": 1.0596, + "step": 471 + }, + { + "epoch": 0.10807097882083572, + "grad_norm": 1.2351398468017578, + "learning_rate": 1.968914365612231e-05, + "loss": 1.2107, + "step": 472 + }, + { + "epoch": 0.10829994275901546, + "grad_norm": 1.381752371788025, + "learning_rate": 1.9687305763577705e-05, + "loss": 1.1274, + "step": 473 + }, + { + "epoch": 0.10852890669719518, + "grad_norm": 1.4342790842056274, + "learning_rate": 1.9685462540189955e-05, + "loss": 1.1483, + "step": 474 + }, + { + "epoch": 0.10875787063537493, + "grad_norm": 1.2768388986587524, + "learning_rate": 1.9683613986973373e-05, + "loss": 1.2039, + "step": 475 + }, + { + "epoch": 0.10898683457355467, + "grad_norm": 1.2173471450805664, + "learning_rate": 1.9681760104945203e-05, + "loss": 1.1969, + "step": 476 + }, + { + "epoch": 0.10921579851173441, + "grad_norm": 1.2570933103561401, + "learning_rate": 1.967990089512562e-05, + "loss": 1.1701, + "step": 477 + }, + { + "epoch": 0.10944476244991413, + "grad_norm": 1.45657479763031, + "learning_rate": 1.9678036358537726e-05, + "loss": 1.1106, + "step": 478 + }, + { + "epoch": 0.10967372638809388, + "grad_norm": 1.4095118045806885, + "learning_rate": 1.9676166496207567e-05, + "loss": 1.1978, + "step": 479 + }, + { + "epoch": 0.10990269032627362, + "grad_norm": 1.445336937904358, + "learning_rate": 1.967429130916411e-05, + "loss": 1.1609, + "step": 480 + }, + { + "epoch": 0.11013165426445334, + "grad_norm": 1.2025765180587769, + "learning_rate": 1.9672410798439256e-05, + "loss": 1.1806, + "step": 481 + }, + { + "epoch": 0.11036061820263308, + "grad_norm": 1.2341455221176147, + "learning_rate": 1.9670524965067832e-05, + "loss": 1.1765, + "step": 482 + }, + { + "epoch": 0.11058958214081283, + "grad_norm": 1.1721457242965698, + "learning_rate": 1.96686338100876e-05, + "loss": 1.1521, + "step": 483 + }, + { + "epoch": 0.11081854607899255, + "grad_norm": 1.2636065483093262, + "learning_rate": 1.9666737334539237e-05, + "loss": 1.1089, + "step": 484 + }, + { + "epoch": 0.1110475100171723, + "grad_norm": 1.7780942916870117, + "learning_rate": 1.966483553946637e-05, + "loss": 1.1039, + "step": 485 + }, + { + "epoch": 0.11127647395535203, + "grad_norm": 1.289139747619629, + "learning_rate": 1.9662928425915536e-05, + "loss": 1.2103, + "step": 486 + }, + { + "epoch": 0.11150543789353178, + "grad_norm": 1.5748257637023926, + "learning_rate": 1.9661015994936204e-05, + "loss": 1.1882, + "step": 487 + }, + { + "epoch": 0.1117344018317115, + "grad_norm": 1.3127802610397339, + "learning_rate": 1.9659098247580765e-05, + "loss": 1.0901, + "step": 488 + }, + { + "epoch": 0.11196336576989124, + "grad_norm": 1.2186063528060913, + "learning_rate": 1.9657175184904545e-05, + "loss": 1.1095, + "step": 489 + }, + { + "epoch": 0.11219232970807098, + "grad_norm": 1.644958734512329, + "learning_rate": 1.9655246807965786e-05, + "loss": 1.1162, + "step": 490 + }, + { + "epoch": 0.11242129364625071, + "grad_norm": 1.3639802932739258, + "learning_rate": 1.9653313117825657e-05, + "loss": 1.1439, + "step": 491 + }, + { + "epoch": 0.11265025758443045, + "grad_norm": 1.1354119777679443, + "learning_rate": 1.9651374115548255e-05, + "loss": 1.1214, + "step": 492 + }, + { + "epoch": 0.11287922152261019, + "grad_norm": 1.509704351425171, + "learning_rate": 1.964942980220059e-05, + "loss": 1.154, + "step": 493 + }, + { + "epoch": 0.11310818546078992, + "grad_norm": 1.605154037475586, + "learning_rate": 1.9647480178852606e-05, + "loss": 1.2066, + "step": 494 + }, + { + "epoch": 0.11333714939896966, + "grad_norm": 1.1039700508117676, + "learning_rate": 1.9645525246577168e-05, + "loss": 1.2157, + "step": 495 + }, + { + "epoch": 0.1135661133371494, + "grad_norm": 1.5542914867401123, + "learning_rate": 1.9643565006450055e-05, + "loss": 1.1465, + "step": 496 + }, + { + "epoch": 0.11379507727532913, + "grad_norm": 1.450024127960205, + "learning_rate": 1.9641599459549966e-05, + "loss": 1.1535, + "step": 497 + }, + { + "epoch": 0.11402404121350887, + "grad_norm": 1.6595662832260132, + "learning_rate": 1.9639628606958535e-05, + "loss": 1.1892, + "step": 498 + }, + { + "epoch": 0.11425300515168861, + "grad_norm": 1.6177374124526978, + "learning_rate": 1.9637652449760297e-05, + "loss": 1.114, + "step": 499 + }, + { + "epoch": 0.11448196908986835, + "grad_norm": 1.6932591199874878, + "learning_rate": 1.963567098904272e-05, + "loss": 1.1176, + "step": 500 + }, + { + "epoch": 0.11471093302804808, + "grad_norm": 1.1896140575408936, + "learning_rate": 1.963368422589618e-05, + "loss": 1.1573, + "step": 501 + }, + { + "epoch": 0.11493989696622782, + "grad_norm": 1.246862769126892, + "learning_rate": 1.9631692161413985e-05, + "loss": 1.2009, + "step": 502 + }, + { + "epoch": 0.11516886090440756, + "grad_norm": 1.422579288482666, + "learning_rate": 1.962969479669234e-05, + "loss": 1.1447, + "step": 503 + }, + { + "epoch": 0.11539782484258729, + "grad_norm": 1.2170311212539673, + "learning_rate": 1.962769213283039e-05, + "loss": 1.1696, + "step": 504 + }, + { + "epoch": 0.11562678878076703, + "grad_norm": 1.365478754043579, + "learning_rate": 1.9625684170930172e-05, + "loss": 1.1165, + "step": 505 + }, + { + "epoch": 0.11585575271894677, + "grad_norm": 1.4074455499649048, + "learning_rate": 1.9623670912096656e-05, + "loss": 1.1835, + "step": 506 + }, + { + "epoch": 0.1160847166571265, + "grad_norm": 1.3482747077941895, + "learning_rate": 1.9621652357437723e-05, + "loss": 1.1344, + "step": 507 + }, + { + "epoch": 0.11631368059530624, + "grad_norm": 1.7042218446731567, + "learning_rate": 1.961962850806417e-05, + "loss": 1.1432, + "step": 508 + }, + { + "epoch": 0.11654264453348598, + "grad_norm": 1.3767738342285156, + "learning_rate": 1.9617599365089693e-05, + "loss": 1.1334, + "step": 509 + }, + { + "epoch": 0.11677160847166572, + "grad_norm": 1.317917823791504, + "learning_rate": 1.9615564929630925e-05, + "loss": 1.1883, + "step": 510 + }, + { + "epoch": 0.11700057240984545, + "grad_norm": 1.2787598371505737, + "learning_rate": 1.9613525202807392e-05, + "loss": 1.1198, + "step": 511 + }, + { + "epoch": 0.11722953634802519, + "grad_norm": 1.6827375888824463, + "learning_rate": 1.961148018574154e-05, + "loss": 1.1579, + "step": 512 + }, + { + "epoch": 0.11745850028620493, + "grad_norm": 1.2459272146224976, + "learning_rate": 1.9609429879558726e-05, + "loss": 1.1789, + "step": 513 + }, + { + "epoch": 0.11768746422438466, + "grad_norm": 1.3571677207946777, + "learning_rate": 1.960737428538721e-05, + "loss": 1.139, + "step": 514 + }, + { + "epoch": 0.1179164281625644, + "grad_norm": 1.3986880779266357, + "learning_rate": 1.9605313404358176e-05, + "loss": 1.1719, + "step": 515 + }, + { + "epoch": 0.11814539210074414, + "grad_norm": 1.2968988418579102, + "learning_rate": 1.9603247237605706e-05, + "loss": 1.1753, + "step": 516 + }, + { + "epoch": 0.11837435603892386, + "grad_norm": 1.434787631034851, + "learning_rate": 1.9601175786266796e-05, + "loss": 1.1497, + "step": 517 + }, + { + "epoch": 0.1186033199771036, + "grad_norm": 1.595548152923584, + "learning_rate": 1.9599099051481345e-05, + "loss": 1.111, + "step": 518 + }, + { + "epoch": 0.11883228391528335, + "grad_norm": 1.3125005960464478, + "learning_rate": 1.959701703439217e-05, + "loss": 1.1453, + "step": 519 + }, + { + "epoch": 0.11906124785346307, + "grad_norm": 1.6637828350067139, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.1657, + "step": 520 + }, + { + "epoch": 0.11929021179164281, + "grad_norm": 1.1127067804336548, + "learning_rate": 1.9592837157888396e-05, + "loss": 1.2167, + "step": 521 + }, + { + "epoch": 0.11951917572982255, + "grad_norm": 1.3453141450881958, + "learning_rate": 1.959073930077395e-05, + "loss": 1.1839, + "step": 522 + }, + { + "epoch": 0.1197481396680023, + "grad_norm": 1.2498703002929688, + "learning_rate": 1.958863616595608e-05, + "loss": 1.1712, + "step": 523 + }, + { + "epoch": 0.11997710360618202, + "grad_norm": 1.6796958446502686, + "learning_rate": 1.958652775459212e-05, + "loss": 1.1745, + "step": 524 + }, + { + "epoch": 0.12020606754436176, + "grad_norm": 1.1889232397079468, + "learning_rate": 1.9584414067842304e-05, + "loss": 1.1619, + "step": 525 + }, + { + "epoch": 0.1204350314825415, + "grad_norm": 1.389387607574463, + "learning_rate": 1.9582295106869788e-05, + "loss": 1.1106, + "step": 526 + }, + { + "epoch": 0.12066399542072123, + "grad_norm": 1.8008201122283936, + "learning_rate": 1.958017087284061e-05, + "loss": 1.1603, + "step": 527 + }, + { + "epoch": 0.12089295935890097, + "grad_norm": 1.4966647624969482, + "learning_rate": 1.9578041366923722e-05, + "loss": 1.1037, + "step": 528 + }, + { + "epoch": 0.12112192329708071, + "grad_norm": 1.1903491020202637, + "learning_rate": 1.957590659029097e-05, + "loss": 1.1838, + "step": 529 + }, + { + "epoch": 0.12135088723526044, + "grad_norm": 1.38507080078125, + "learning_rate": 1.957376654411711e-05, + "loss": 1.1818, + "step": 530 + }, + { + "epoch": 0.12157985117344018, + "grad_norm": 1.4420480728149414, + "learning_rate": 1.957162122957978e-05, + "loss": 1.1738, + "step": 531 + }, + { + "epoch": 0.12180881511161992, + "grad_norm": 1.5511441230773926, + "learning_rate": 1.9569470647859544e-05, + "loss": 1.183, + "step": 532 + }, + { + "epoch": 0.12203777904979966, + "grad_norm": 1.3438152074813843, + "learning_rate": 1.9567314800139838e-05, + "loss": 1.1897, + "step": 533 + }, + { + "epoch": 0.12226674298797939, + "grad_norm": 1.281836748123169, + "learning_rate": 1.9565153687607006e-05, + "loss": 1.1887, + "step": 534 + }, + { + "epoch": 0.12249570692615913, + "grad_norm": 1.3364907503128052, + "learning_rate": 1.9562987311450298e-05, + "loss": 1.1386, + "step": 535 + }, + { + "epoch": 0.12272467086433887, + "grad_norm": 1.3706281185150146, + "learning_rate": 1.956081567286185e-05, + "loss": 1.1379, + "step": 536 + }, + { + "epoch": 0.1229536348025186, + "grad_norm": 1.3901708126068115, + "learning_rate": 1.9558638773036694e-05, + "loss": 1.124, + "step": 537 + }, + { + "epoch": 0.12318259874069834, + "grad_norm": 1.1462208032608032, + "learning_rate": 1.955645661317276e-05, + "loss": 1.1561, + "step": 538 + }, + { + "epoch": 0.12341156267887808, + "grad_norm": 1.1496376991271973, + "learning_rate": 1.9554269194470872e-05, + "loss": 1.1725, + "step": 539 + }, + { + "epoch": 0.12364052661705781, + "grad_norm": 1.095733404159546, + "learning_rate": 1.955207651813475e-05, + "loss": 1.1946, + "step": 540 + }, + { + "epoch": 0.12386949055523755, + "grad_norm": 1.4160492420196533, + "learning_rate": 1.9549878585371006e-05, + "loss": 1.0487, + "step": 541 + }, + { + "epoch": 0.12409845449341729, + "grad_norm": 1.1869254112243652, + "learning_rate": 1.9547675397389144e-05, + "loss": 1.1571, + "step": 542 + }, + { + "epoch": 0.12432741843159702, + "grad_norm": 2.6226820945739746, + "learning_rate": 1.9545466955401555e-05, + "loss": 1.1698, + "step": 543 + }, + { + "epoch": 0.12455638236977676, + "grad_norm": 1.3678830862045288, + "learning_rate": 1.954325326062353e-05, + "loss": 1.1191, + "step": 544 + }, + { + "epoch": 0.1247853463079565, + "grad_norm": 1.2578891515731812, + "learning_rate": 1.954103431427325e-05, + "loss": 1.1159, + "step": 545 + }, + { + "epoch": 0.12501431024613624, + "grad_norm": 1.258697509765625, + "learning_rate": 1.9538810117571775e-05, + "loss": 1.1422, + "step": 546 + }, + { + "epoch": 0.12524327418431597, + "grad_norm": 1.3794115781784058, + "learning_rate": 1.9536580671743067e-05, + "loss": 1.1376, + "step": 547 + }, + { + "epoch": 0.1254722381224957, + "grad_norm": 1.2176522016525269, + "learning_rate": 1.9534345978013972e-05, + "loss": 1.1487, + "step": 548 + }, + { + "epoch": 0.12570120206067545, + "grad_norm": 1.7151718139648438, + "learning_rate": 1.9532106037614218e-05, + "loss": 1.1334, + "step": 549 + }, + { + "epoch": 0.12593016599885518, + "grad_norm": 1.7247742414474487, + "learning_rate": 1.952986085177643e-05, + "loss": 1.1493, + "step": 550 + }, + { + "epoch": 0.12615912993703493, + "grad_norm": 1.252971887588501, + "learning_rate": 1.9527610421736115e-05, + "loss": 1.1741, + "step": 551 + }, + { + "epoch": 0.12638809387521466, + "grad_norm": 1.716004729270935, + "learning_rate": 1.9525354748731665e-05, + "loss": 1.1638, + "step": 552 + }, + { + "epoch": 0.12661705781339438, + "grad_norm": 1.1862410306930542, + "learning_rate": 1.9523093834004358e-05, + "loss": 1.1476, + "step": 553 + }, + { + "epoch": 0.12684602175157414, + "grad_norm": 1.4751574993133545, + "learning_rate": 1.952082767879835e-05, + "loss": 1.1203, + "step": 554 + }, + { + "epoch": 0.12707498568975387, + "grad_norm": 1.6074382066726685, + "learning_rate": 1.9518556284360696e-05, + "loss": 1.1421, + "step": 555 + }, + { + "epoch": 0.1273039496279336, + "grad_norm": 1.5327030420303345, + "learning_rate": 1.9516279651941324e-05, + "loss": 1.1892, + "step": 556 + }, + { + "epoch": 0.12753291356611335, + "grad_norm": 1.531124472618103, + "learning_rate": 1.9513997782793045e-05, + "loss": 1.1348, + "step": 557 + }, + { + "epoch": 0.12776187750429308, + "grad_norm": 1.5858008861541748, + "learning_rate": 1.9511710678171548e-05, + "loss": 1.19, + "step": 558 + }, + { + "epoch": 0.1279908414424728, + "grad_norm": 1.3005167245864868, + "learning_rate": 1.9509418339335415e-05, + "loss": 1.1556, + "step": 559 + }, + { + "epoch": 0.12821980538065256, + "grad_norm": 1.6737267971038818, + "learning_rate": 1.950712076754609e-05, + "loss": 1.2304, + "step": 560 + }, + { + "epoch": 0.12844876931883228, + "grad_norm": 1.2671421766281128, + "learning_rate": 1.950481796406792e-05, + "loss": 1.1869, + "step": 561 + }, + { + "epoch": 0.128677733257012, + "grad_norm": 1.6968939304351807, + "learning_rate": 1.9502509930168113e-05, + "loss": 1.1252, + "step": 562 + }, + { + "epoch": 0.12890669719519177, + "grad_norm": 1.406265377998352, + "learning_rate": 1.950019666711676e-05, + "loss": 1.1424, + "step": 563 + }, + { + "epoch": 0.1291356611333715, + "grad_norm": 1.4663190841674805, + "learning_rate": 1.949787817618683e-05, + "loss": 1.1995, + "step": 564 + }, + { + "epoch": 0.12936462507155122, + "grad_norm": 1.5765471458435059, + "learning_rate": 1.949555445865417e-05, + "loss": 1.1302, + "step": 565 + }, + { + "epoch": 0.12959358900973098, + "grad_norm": 1.7354494333267212, + "learning_rate": 1.94932255157975e-05, + "loss": 1.1729, + "step": 566 + }, + { + "epoch": 0.1298225529479107, + "grad_norm": 1.379272699356079, + "learning_rate": 1.9490891348898423e-05, + "loss": 1.1507, + "step": 567 + }, + { + "epoch": 0.13005151688609043, + "grad_norm": 1.2708396911621094, + "learning_rate": 1.9488551959241405e-05, + "loss": 1.1516, + "step": 568 + }, + { + "epoch": 0.13028048082427018, + "grad_norm": 1.5942022800445557, + "learning_rate": 1.9486207348113803e-05, + "loss": 1.1358, + "step": 569 + }, + { + "epoch": 0.1305094447624499, + "grad_norm": 1.2558735609054565, + "learning_rate": 1.9483857516805823e-05, + "loss": 1.154, + "step": 570 + }, + { + "epoch": 0.13073840870062964, + "grad_norm": 1.1783373355865479, + "learning_rate": 1.9481502466610568e-05, + "loss": 1.1798, + "step": 571 + }, + { + "epoch": 0.1309673726388094, + "grad_norm": 1.2193893194198608, + "learning_rate": 1.9479142198823996e-05, + "loss": 1.1767, + "step": 572 + }, + { + "epoch": 0.13119633657698912, + "grad_norm": 1.1724305152893066, + "learning_rate": 1.9476776714744945e-05, + "loss": 1.113, + "step": 573 + }, + { + "epoch": 0.13142530051516887, + "grad_norm": 1.3040038347244263, + "learning_rate": 1.947440601567512e-05, + "loss": 1.1216, + "step": 574 + }, + { + "epoch": 0.1316542644533486, + "grad_norm": 1.3851133584976196, + "learning_rate": 1.9472030102919102e-05, + "loss": 1.1914, + "step": 575 + }, + { + "epoch": 0.13188322839152833, + "grad_norm": 1.331370234489441, + "learning_rate": 1.946964897778433e-05, + "loss": 1.135, + "step": 576 + }, + { + "epoch": 0.13211219232970808, + "grad_norm": 1.2417669296264648, + "learning_rate": 1.946726264158112e-05, + "loss": 1.0824, + "step": 577 + }, + { + "epoch": 0.1323411562678878, + "grad_norm": 1.1429842710494995, + "learning_rate": 1.9464871095622652e-05, + "loss": 1.1327, + "step": 578 + }, + { + "epoch": 0.13257012020606754, + "grad_norm": 1.2447996139526367, + "learning_rate": 1.9462474341224974e-05, + "loss": 1.1308, + "step": 579 + }, + { + "epoch": 0.1327990841442473, + "grad_norm": 1.3931182622909546, + "learning_rate": 1.9460072379706995e-05, + "loss": 1.1119, + "step": 580 + }, + { + "epoch": 0.13302804808242702, + "grad_norm": 1.3399399518966675, + "learning_rate": 1.9457665212390502e-05, + "loss": 1.1726, + "step": 581 + }, + { + "epoch": 0.13325701202060675, + "grad_norm": 1.1147905588150024, + "learning_rate": 1.9455252840600137e-05, + "loss": 1.0929, + "step": 582 + }, + { + "epoch": 0.1334859759587865, + "grad_norm": 1.4041277170181274, + "learning_rate": 1.9452835265663404e-05, + "loss": 1.1418, + "step": 583 + }, + { + "epoch": 0.13371493989696623, + "grad_norm": 1.5064622163772583, + "learning_rate": 1.945041248891068e-05, + "loss": 1.1835, + "step": 584 + }, + { + "epoch": 0.13394390383514596, + "grad_norm": 1.390039324760437, + "learning_rate": 1.9447984511675192e-05, + "loss": 1.089, + "step": 585 + }, + { + "epoch": 0.1341728677733257, + "grad_norm": 2.3285298347473145, + "learning_rate": 1.944555133529304e-05, + "loss": 1.1771, + "step": 586 + }, + { + "epoch": 0.13440183171150544, + "grad_norm": 1.5625582933425903, + "learning_rate": 1.9443112961103182e-05, + "loss": 1.1622, + "step": 587 + }, + { + "epoch": 0.13463079564968516, + "grad_norm": 1.6119787693023682, + "learning_rate": 1.9440669390447433e-05, + "loss": 1.1831, + "step": 588 + }, + { + "epoch": 0.13485975958786492, + "grad_norm": 1.1473219394683838, + "learning_rate": 1.943822062467047e-05, + "loss": 1.0934, + "step": 589 + }, + { + "epoch": 0.13508872352604465, + "grad_norm": 1.3341083526611328, + "learning_rate": 1.9435766665119823e-05, + "loss": 1.149, + "step": 590 + }, + { + "epoch": 0.13531768746422437, + "grad_norm": 1.3681514263153076, + "learning_rate": 1.943330751314589e-05, + "loss": 1.2103, + "step": 591 + }, + { + "epoch": 0.13554665140240413, + "grad_norm": 2.2560174465179443, + "learning_rate": 1.9430843170101924e-05, + "loss": 1.0948, + "step": 592 + }, + { + "epoch": 0.13577561534058386, + "grad_norm": 1.165104866027832, + "learning_rate": 1.9428373637344027e-05, + "loss": 1.1378, + "step": 593 + }, + { + "epoch": 0.13600457927876358, + "grad_norm": 1.2633001804351807, + "learning_rate": 1.9425898916231166e-05, + "loss": 1.0987, + "step": 594 + }, + { + "epoch": 0.13623354321694334, + "grad_norm": 1.2817325592041016, + "learning_rate": 1.942341900812516e-05, + "loss": 1.1535, + "step": 595 + }, + { + "epoch": 0.13646250715512306, + "grad_norm": 1.2196224927902222, + "learning_rate": 1.9420933914390677e-05, + "loss": 1.1257, + "step": 596 + }, + { + "epoch": 0.1366914710933028, + "grad_norm": 1.3916923999786377, + "learning_rate": 1.941844363639525e-05, + "loss": 1.1056, + "step": 597 + }, + { + "epoch": 0.13692043503148255, + "grad_norm": 1.571192741394043, + "learning_rate": 1.941594817550925e-05, + "loss": 1.1894, + "step": 598 + }, + { + "epoch": 0.13714939896966227, + "grad_norm": 1.3344855308532715, + "learning_rate": 1.9413447533105913e-05, + "loss": 1.1814, + "step": 599 + }, + { + "epoch": 0.13737836290784203, + "grad_norm": 2.034505605697632, + "learning_rate": 1.9410941710561323e-05, + "loss": 1.2126, + "step": 600 + }, + { + "epoch": 0.13760732684602175, + "grad_norm": 1.2162837982177734, + "learning_rate": 1.940843070925441e-05, + "loss": 1.1775, + "step": 601 + }, + { + "epoch": 0.13783629078420148, + "grad_norm": 1.2769267559051514, + "learning_rate": 1.940591453056696e-05, + "loss": 1.1882, + "step": 602 + }, + { + "epoch": 0.13806525472238124, + "grad_norm": 1.438686490058899, + "learning_rate": 1.9403393175883602e-05, + "loss": 1.1577, + "step": 603 + }, + { + "epoch": 0.13829421866056096, + "grad_norm": 1.3430330753326416, + "learning_rate": 1.9400866646591816e-05, + "loss": 1.1549, + "step": 604 + }, + { + "epoch": 0.1385231825987407, + "grad_norm": 1.135581374168396, + "learning_rate": 1.939833494408193e-05, + "loss": 1.1509, + "step": 605 + }, + { + "epoch": 0.13875214653692045, + "grad_norm": 1.433982014656067, + "learning_rate": 1.939579806974712e-05, + "loss": 1.1768, + "step": 606 + }, + { + "epoch": 0.13898111047510017, + "grad_norm": 1.200698971748352, + "learning_rate": 1.939325602498341e-05, + "loss": 1.1196, + "step": 607 + }, + { + "epoch": 0.1392100744132799, + "grad_norm": 1.2908475399017334, + "learning_rate": 1.939070881118966e-05, + "loss": 1.137, + "step": 608 + }, + { + "epoch": 0.13943903835145965, + "grad_norm": 1.3260984420776367, + "learning_rate": 1.9388156429767585e-05, + "loss": 1.1106, + "step": 609 + }, + { + "epoch": 0.13966800228963938, + "grad_norm": 1.310341238975525, + "learning_rate": 1.9385598882121735e-05, + "loss": 1.1857, + "step": 610 + }, + { + "epoch": 0.1398969662278191, + "grad_norm": 1.109005331993103, + "learning_rate": 1.9383036169659513e-05, + "loss": 1.1895, + "step": 611 + }, + { + "epoch": 0.14012593016599886, + "grad_norm": 1.230418086051941, + "learning_rate": 1.938046829379115e-05, + "loss": 1.1439, + "step": 612 + }, + { + "epoch": 0.1403548941041786, + "grad_norm": 1.380399227142334, + "learning_rate": 1.9377895255929734e-05, + "loss": 1.1821, + "step": 613 + }, + { + "epoch": 0.14058385804235832, + "grad_norm": 1.2389920949935913, + "learning_rate": 1.937531705749118e-05, + "loss": 1.1463, + "step": 614 + }, + { + "epoch": 0.14081282198053807, + "grad_norm": 1.1333352327346802, + "learning_rate": 1.9372733699894253e-05, + "loss": 1.1391, + "step": 615 + }, + { + "epoch": 0.1410417859187178, + "grad_norm": 1.2721819877624512, + "learning_rate": 1.937014518456055e-05, + "loss": 1.1266, + "step": 616 + }, + { + "epoch": 0.14127074985689753, + "grad_norm": 1.2748703956604004, + "learning_rate": 1.9367551512914513e-05, + "loss": 1.1351, + "step": 617 + }, + { + "epoch": 0.14149971379507728, + "grad_norm": 1.4715847969055176, + "learning_rate": 1.936495268638342e-05, + "loss": 1.1432, + "step": 618 + }, + { + "epoch": 0.141728677733257, + "grad_norm": 1.2434061765670776, + "learning_rate": 1.9362348706397374e-05, + "loss": 1.1471, + "step": 619 + }, + { + "epoch": 0.14195764167143674, + "grad_norm": 1.2716186046600342, + "learning_rate": 1.935973957438933e-05, + "loss": 1.1463, + "step": 620 + }, + { + "epoch": 0.1421866056096165, + "grad_norm": 1.4149022102355957, + "learning_rate": 1.935712529179507e-05, + "loss": 1.1119, + "step": 621 + }, + { + "epoch": 0.14241556954779622, + "grad_norm": 1.7188087701797485, + "learning_rate": 1.9354505860053215e-05, + "loss": 1.0757, + "step": 622 + }, + { + "epoch": 0.14264453348597597, + "grad_norm": 1.2718242406845093, + "learning_rate": 1.9351881280605212e-05, + "loss": 1.1599, + "step": 623 + }, + { + "epoch": 0.1428734974241557, + "grad_norm": 1.4921197891235352, + "learning_rate": 1.9349251554895347e-05, + "loss": 1.1168, + "step": 624 + }, + { + "epoch": 0.14310246136233543, + "grad_norm": 1.3648992776870728, + "learning_rate": 1.934661668437073e-05, + "loss": 1.0909, + "step": 625 + }, + { + "epoch": 0.14333142530051518, + "grad_norm": 1.2685736417770386, + "learning_rate": 1.934397667048132e-05, + "loss": 1.1921, + "step": 626 + }, + { + "epoch": 0.1435603892386949, + "grad_norm": 1.4009802341461182, + "learning_rate": 1.9341331514679887e-05, + "loss": 1.13, + "step": 627 + }, + { + "epoch": 0.14378935317687463, + "grad_norm": 2.8500163555145264, + "learning_rate": 1.933868121842204e-05, + "loss": 1.1513, + "step": 628 + }, + { + "epoch": 0.1440183171150544, + "grad_norm": 1.7703518867492676, + "learning_rate": 1.933602578316621e-05, + "loss": 1.1431, + "step": 629 + }, + { + "epoch": 0.14424728105323412, + "grad_norm": 1.2395981550216675, + "learning_rate": 1.9333365210373668e-05, + "loss": 1.1633, + "step": 630 + }, + { + "epoch": 0.14447624499141384, + "grad_norm": 1.3300714492797852, + "learning_rate": 1.9330699501508504e-05, + "loss": 1.1318, + "step": 631 + }, + { + "epoch": 0.1447052089295936, + "grad_norm": 1.4751075506210327, + "learning_rate": 1.932802865803763e-05, + "loss": 1.0868, + "step": 632 + }, + { + "epoch": 0.14493417286777333, + "grad_norm": 1.5677204132080078, + "learning_rate": 1.932535268143079e-05, + "loss": 1.2013, + "step": 633 + }, + { + "epoch": 0.14516313680595305, + "grad_norm": 1.3743131160736084, + "learning_rate": 1.9322671573160556e-05, + "loss": 1.1175, + "step": 634 + }, + { + "epoch": 0.1453921007441328, + "grad_norm": 1.6558480262756348, + "learning_rate": 1.9319985334702315e-05, + "loss": 1.1751, + "step": 635 + }, + { + "epoch": 0.14562106468231253, + "grad_norm": 1.3124221563339233, + "learning_rate": 1.9317293967534283e-05, + "loss": 1.1122, + "step": 636 + }, + { + "epoch": 0.14585002862049226, + "grad_norm": 1.2808423042297363, + "learning_rate": 1.9314597473137498e-05, + "loss": 1.1417, + "step": 637 + }, + { + "epoch": 0.14607899255867202, + "grad_norm": 1.250510334968567, + "learning_rate": 1.9311895852995815e-05, + "loss": 1.2279, + "step": 638 + }, + { + "epoch": 0.14630795649685174, + "grad_norm": 1.366873860359192, + "learning_rate": 1.930918910859592e-05, + "loss": 1.1324, + "step": 639 + }, + { + "epoch": 0.14653692043503147, + "grad_norm": 1.168998122215271, + "learning_rate": 1.9306477241427303e-05, + "loss": 1.0871, + "step": 640 + }, + { + "epoch": 0.14676588437321123, + "grad_norm": 1.3328088521957397, + "learning_rate": 1.9303760252982287e-05, + "loss": 1.1846, + "step": 641 + }, + { + "epoch": 0.14699484831139095, + "grad_norm": 1.1964495182037354, + "learning_rate": 1.9301038144756007e-05, + "loss": 1.1037, + "step": 642 + }, + { + "epoch": 0.14722381224957068, + "grad_norm": 1.2240676879882812, + "learning_rate": 1.9298310918246414e-05, + "loss": 1.1069, + "step": 643 + }, + { + "epoch": 0.14745277618775043, + "grad_norm": 1.3975204229354858, + "learning_rate": 1.929557857495428e-05, + "loss": 1.1145, + "step": 644 + }, + { + "epoch": 0.14768174012593016, + "grad_norm": 1.3508495092391968, + "learning_rate": 1.9292841116383192e-05, + "loss": 1.1929, + "step": 645 + }, + { + "epoch": 0.14791070406410992, + "grad_norm": 1.5650644302368164, + "learning_rate": 1.9290098544039546e-05, + "loss": 1.1214, + "step": 646 + }, + { + "epoch": 0.14813966800228964, + "grad_norm": 1.208836555480957, + "learning_rate": 1.9287350859432562e-05, + "loss": 1.11, + "step": 647 + }, + { + "epoch": 0.14836863194046937, + "grad_norm": 1.2838915586471558, + "learning_rate": 1.9284598064074264e-05, + "loss": 1.172, + "step": 648 + }, + { + "epoch": 0.14859759587864912, + "grad_norm": 1.2721766233444214, + "learning_rate": 1.9281840159479494e-05, + "loss": 1.1024, + "step": 649 + }, + { + "epoch": 0.14882655981682885, + "grad_norm": 1.6511443853378296, + "learning_rate": 1.9279077147165903e-05, + "loss": 1.14, + "step": 650 + }, + { + "epoch": 0.14905552375500858, + "grad_norm": 1.3828959465026855, + "learning_rate": 1.9276309028653954e-05, + "loss": 1.1819, + "step": 651 + }, + { + "epoch": 0.14928448769318833, + "grad_norm": 1.3660774230957031, + "learning_rate": 1.927353580546692e-05, + "loss": 1.1487, + "step": 652 + }, + { + "epoch": 0.14951345163136806, + "grad_norm": 1.296252965927124, + "learning_rate": 1.927075747913088e-05, + "loss": 1.1442, + "step": 653 + }, + { + "epoch": 0.1497424155695478, + "grad_norm": 1.1136078834533691, + "learning_rate": 1.9267974051174727e-05, + "loss": 1.0958, + "step": 654 + }, + { + "epoch": 0.14997137950772754, + "grad_norm": 1.5701806545257568, + "learning_rate": 1.9265185523130156e-05, + "loss": 1.1934, + "step": 655 + }, + { + "epoch": 0.15020034344590727, + "grad_norm": 1.439436912536621, + "learning_rate": 1.9262391896531668e-05, + "loss": 1.1336, + "step": 656 + }, + { + "epoch": 0.150429307384087, + "grad_norm": 1.2246439456939697, + "learning_rate": 1.9259593172916577e-05, + "loss": 1.1292, + "step": 657 + }, + { + "epoch": 0.15065827132226675, + "grad_norm": 1.5522711277008057, + "learning_rate": 1.9256789353824996e-05, + "loss": 1.1826, + "step": 658 + }, + { + "epoch": 0.15088723526044648, + "grad_norm": 1.6734440326690674, + "learning_rate": 1.9253980440799843e-05, + "loss": 1.0926, + "step": 659 + }, + { + "epoch": 0.1511161991986262, + "grad_norm": 1.276139736175537, + "learning_rate": 1.9251166435386837e-05, + "loss": 1.1941, + "step": 660 + }, + { + "epoch": 0.15134516313680596, + "grad_norm": 1.4880852699279785, + "learning_rate": 1.9248347339134508e-05, + "loss": 1.1254, + "step": 661 + }, + { + "epoch": 0.1515741270749857, + "grad_norm": 1.467468500137329, + "learning_rate": 1.9245523153594172e-05, + "loss": 1.1781, + "step": 662 + }, + { + "epoch": 0.15180309101316541, + "grad_norm": 1.3000531196594238, + "learning_rate": 1.9242693880319962e-05, + "loss": 1.0907, + "step": 663 + }, + { + "epoch": 0.15203205495134517, + "grad_norm": 1.569050908088684, + "learning_rate": 1.92398595208688e-05, + "loss": 1.2279, + "step": 664 + }, + { + "epoch": 0.1522610188895249, + "grad_norm": 1.3132572174072266, + "learning_rate": 1.9237020076800407e-05, + "loss": 1.0786, + "step": 665 + }, + { + "epoch": 0.15248998282770462, + "grad_norm": 1.4111486673355103, + "learning_rate": 1.9234175549677313e-05, + "loss": 1.1556, + "step": 666 + }, + { + "epoch": 0.15271894676588438, + "grad_norm": 1.2233771085739136, + "learning_rate": 1.923132594106483e-05, + "loss": 1.138, + "step": 667 + }, + { + "epoch": 0.1529479107040641, + "grad_norm": 1.5644503831863403, + "learning_rate": 1.922847125253108e-05, + "loss": 1.1911, + "step": 668 + }, + { + "epoch": 0.15317687464224386, + "grad_norm": 1.8687934875488281, + "learning_rate": 1.922561148564697e-05, + "loss": 1.1343, + "step": 669 + }, + { + "epoch": 0.1534058385804236, + "grad_norm": 1.1981557607650757, + "learning_rate": 1.9222746641986207e-05, + "loss": 1.1846, + "step": 670 + }, + { + "epoch": 0.15363480251860331, + "grad_norm": 1.6271294355392456, + "learning_rate": 1.921987672312529e-05, + "loss": 1.167, + "step": 671 + }, + { + "epoch": 0.15386376645678307, + "grad_norm": 1.3530622720718384, + "learning_rate": 1.9217001730643514e-05, + "loss": 1.1491, + "step": 672 + }, + { + "epoch": 0.1540927303949628, + "grad_norm": 1.456268072128296, + "learning_rate": 1.921412166612296e-05, + "loss": 1.1595, + "step": 673 + }, + { + "epoch": 0.15432169433314252, + "grad_norm": 1.2416306734085083, + "learning_rate": 1.92112365311485e-05, + "loss": 1.1396, + "step": 674 + }, + { + "epoch": 0.15455065827132228, + "grad_norm": 1.4196218252182007, + "learning_rate": 1.920834632730781e-05, + "loss": 1.1493, + "step": 675 + }, + { + "epoch": 0.154779622209502, + "grad_norm": 1.69566810131073, + "learning_rate": 1.920545105619134e-05, + "loss": 1.0956, + "step": 676 + }, + { + "epoch": 0.15500858614768173, + "grad_norm": 1.6241673231124878, + "learning_rate": 1.9202550719392324e-05, + "loss": 1.1241, + "step": 677 + }, + { + "epoch": 0.1552375500858615, + "grad_norm": 1.2214878797531128, + "learning_rate": 1.919964531850681e-05, + "loss": 1.1751, + "step": 678 + }, + { + "epoch": 0.1554665140240412, + "grad_norm": 2.9216532707214355, + "learning_rate": 1.9196734855133603e-05, + "loss": 1.1578, + "step": 679 + }, + { + "epoch": 0.15569547796222094, + "grad_norm": 1.8954651355743408, + "learning_rate": 1.919381933087431e-05, + "loss": 1.1785, + "step": 680 + }, + { + "epoch": 0.1559244419004007, + "grad_norm": 1.424324870109558, + "learning_rate": 1.919089874733332e-05, + "loss": 1.0811, + "step": 681 + }, + { + "epoch": 0.15615340583858042, + "grad_norm": 1.3659218549728394, + "learning_rate": 1.9187973106117808e-05, + "loss": 1.1433, + "step": 682 + }, + { + "epoch": 0.15638236977676015, + "grad_norm": 1.2462931871414185, + "learning_rate": 1.918504240883773e-05, + "loss": 1.1522, + "step": 683 + }, + { + "epoch": 0.1566113337149399, + "grad_norm": 1.390007495880127, + "learning_rate": 1.9182106657105816e-05, + "loss": 1.1282, + "step": 684 + }, + { + "epoch": 0.15684029765311963, + "grad_norm": 1.1688913106918335, + "learning_rate": 1.9179165852537596e-05, + "loss": 1.1549, + "step": 685 + }, + { + "epoch": 0.15706926159129936, + "grad_norm": 1.6128971576690674, + "learning_rate": 1.917621999675136e-05, + "loss": 1.1311, + "step": 686 + }, + { + "epoch": 0.1572982255294791, + "grad_norm": 1.283479928970337, + "learning_rate": 1.9173269091368197e-05, + "loss": 1.109, + "step": 687 + }, + { + "epoch": 0.15752718946765884, + "grad_norm": 1.3000928163528442, + "learning_rate": 1.9170313138011964e-05, + "loss": 1.1634, + "step": 688 + }, + { + "epoch": 0.15775615340583857, + "grad_norm": 1.204128384590149, + "learning_rate": 1.9167352138309294e-05, + "loss": 1.0988, + "step": 689 + }, + { + "epoch": 0.15798511734401832, + "grad_norm": 1.3165777921676636, + "learning_rate": 1.9164386093889598e-05, + "loss": 1.1393, + "step": 690 + }, + { + "epoch": 0.15821408128219805, + "grad_norm": 1.433110237121582, + "learning_rate": 1.9161415006385074e-05, + "loss": 1.1621, + "step": 691 + }, + { + "epoch": 0.1584430452203778, + "grad_norm": 1.2257025241851807, + "learning_rate": 1.915843887743068e-05, + "loss": 1.1692, + "step": 692 + }, + { + "epoch": 0.15867200915855753, + "grad_norm": 1.773209571838379, + "learning_rate": 1.915545770866415e-05, + "loss": 1.1176, + "step": 693 + }, + { + "epoch": 0.15890097309673726, + "grad_norm": 1.3696967363357544, + "learning_rate": 1.9152471501726008e-05, + "loss": 1.1944, + "step": 694 + }, + { + "epoch": 0.159129937034917, + "grad_norm": 1.1072070598602295, + "learning_rate": 1.9149480258259535e-05, + "loss": 1.1889, + "step": 695 + }, + { + "epoch": 0.15935890097309674, + "grad_norm": 1.5009992122650146, + "learning_rate": 1.914648397991078e-05, + "loss": 1.125, + "step": 696 + }, + { + "epoch": 0.15958786491127647, + "grad_norm": 1.4128434658050537, + "learning_rate": 1.9143482668328577e-05, + "loss": 1.1271, + "step": 697 + }, + { + "epoch": 0.15981682884945622, + "grad_norm": 1.0920000076293945, + "learning_rate": 1.9140476325164522e-05, + "loss": 1.1275, + "step": 698 + }, + { + "epoch": 0.16004579278763595, + "grad_norm": 1.100367546081543, + "learning_rate": 1.9137464952072977e-05, + "loss": 1.1063, + "step": 699 + }, + { + "epoch": 0.16027475672581568, + "grad_norm": 1.1449311971664429, + "learning_rate": 1.9134448550711077e-05, + "loss": 1.1795, + "step": 700 + }, + { + "epoch": 0.16050372066399543, + "grad_norm": 1.3512089252471924, + "learning_rate": 1.9131427122738725e-05, + "loss": 1.0984, + "step": 701 + }, + { + "epoch": 0.16073268460217516, + "grad_norm": 1.5637792348861694, + "learning_rate": 1.9128400669818586e-05, + "loss": 1.106, + "step": 702 + }, + { + "epoch": 0.16096164854035488, + "grad_norm": 1.6217029094696045, + "learning_rate": 1.9125369193616085e-05, + "loss": 1.1792, + "step": 703 + }, + { + "epoch": 0.16119061247853464, + "grad_norm": 1.6088597774505615, + "learning_rate": 1.9122332695799432e-05, + "loss": 1.1579, + "step": 704 + }, + { + "epoch": 0.16141957641671437, + "grad_norm": 1.4255467653274536, + "learning_rate": 1.9119291178039573e-05, + "loss": 1.1689, + "step": 705 + }, + { + "epoch": 0.1616485403548941, + "grad_norm": 2.3765337467193604, + "learning_rate": 1.9116244642010234e-05, + "loss": 1.2181, + "step": 706 + }, + { + "epoch": 0.16187750429307385, + "grad_norm": 1.2266517877578735, + "learning_rate": 1.91131930893879e-05, + "loss": 1.1672, + "step": 707 + }, + { + "epoch": 0.16210646823125358, + "grad_norm": 1.379189372062683, + "learning_rate": 1.9110136521851815e-05, + "loss": 1.1442, + "step": 708 + }, + { + "epoch": 0.1623354321694333, + "grad_norm": 1.462998628616333, + "learning_rate": 1.9107074941083987e-05, + "loss": 1.1641, + "step": 709 + }, + { + "epoch": 0.16256439610761306, + "grad_norm": 1.2941198348999023, + "learning_rate": 1.9104008348769164e-05, + "loss": 1.1731, + "step": 710 + }, + { + "epoch": 0.16279336004579278, + "grad_norm": 1.5538750886917114, + "learning_rate": 1.9100936746594878e-05, + "loss": 1.0923, + "step": 711 + }, + { + "epoch": 0.1630223239839725, + "grad_norm": 1.0926445722579956, + "learning_rate": 1.9097860136251402e-05, + "loss": 1.2141, + "step": 712 + }, + { + "epoch": 0.16325128792215227, + "grad_norm": 1.3583141565322876, + "learning_rate": 1.9094778519431773e-05, + "loss": 1.1745, + "step": 713 + }, + { + "epoch": 0.163480251860332, + "grad_norm": 1.200218915939331, + "learning_rate": 1.9091691897831774e-05, + "loss": 1.1723, + "step": 714 + }, + { + "epoch": 0.16370921579851175, + "grad_norm": 1.428288221359253, + "learning_rate": 1.9088600273149947e-05, + "loss": 1.1924, + "step": 715 + }, + { + "epoch": 0.16393817973669148, + "grad_norm": 1.488163709640503, + "learning_rate": 1.9085503647087588e-05, + "loss": 1.1357, + "step": 716 + }, + { + "epoch": 0.1641671436748712, + "grad_norm": 1.282083511352539, + "learning_rate": 1.9082402021348745e-05, + "loss": 1.1102, + "step": 717 + }, + { + "epoch": 0.16439610761305096, + "grad_norm": 1.33743155002594, + "learning_rate": 1.9079295397640215e-05, + "loss": 1.2115, + "step": 718 + }, + { + "epoch": 0.16462507155123068, + "grad_norm": 1.618739128112793, + "learning_rate": 1.9076183777671553e-05, + "loss": 1.1347, + "step": 719 + }, + { + "epoch": 0.1648540354894104, + "grad_norm": 2.2287635803222656, + "learning_rate": 1.907306716315505e-05, + "loss": 1.1554, + "step": 720 + }, + { + "epoch": 0.16508299942759017, + "grad_norm": 1.8814674615859985, + "learning_rate": 1.9069945555805756e-05, + "loss": 1.1277, + "step": 721 + }, + { + "epoch": 0.1653119633657699, + "grad_norm": 1.54645836353302, + "learning_rate": 1.9066818957341463e-05, + "loss": 1.1175, + "step": 722 + }, + { + "epoch": 0.16554092730394962, + "grad_norm": 1.5343029499053955, + "learning_rate": 1.906368736948272e-05, + "loss": 1.1077, + "step": 723 + }, + { + "epoch": 0.16576989124212937, + "grad_norm": 1.270850658416748, + "learning_rate": 1.9060550793952803e-05, + "loss": 1.1389, + "step": 724 + }, + { + "epoch": 0.1659988551803091, + "grad_norm": 1.866256594657898, + "learning_rate": 1.905740923247775e-05, + "loss": 1.122, + "step": 725 + }, + { + "epoch": 0.16622781911848883, + "grad_norm": 1.560256004333496, + "learning_rate": 1.9054262686786332e-05, + "loss": 1.1524, + "step": 726 + }, + { + "epoch": 0.16645678305666858, + "grad_norm": 1.3043514490127563, + "learning_rate": 1.905111115861007e-05, + "loss": 1.1863, + "step": 727 + }, + { + "epoch": 0.1666857469948483, + "grad_norm": 1.1351722478866577, + "learning_rate": 1.9047954649683217e-05, + "loss": 1.1149, + "step": 728 + }, + { + "epoch": 0.16691471093302804, + "grad_norm": 1.6116639375686646, + "learning_rate": 1.9044793161742783e-05, + "loss": 1.1352, + "step": 729 + }, + { + "epoch": 0.1671436748712078, + "grad_norm": 1.3612065315246582, + "learning_rate": 1.9041626696528503e-05, + "loss": 1.1206, + "step": 730 + }, + { + "epoch": 0.16737263880938752, + "grad_norm": 1.5191378593444824, + "learning_rate": 1.903845525578286e-05, + "loss": 1.1476, + "step": 731 + }, + { + "epoch": 0.16760160274756725, + "grad_norm": 1.3559706211090088, + "learning_rate": 1.903527884125106e-05, + "loss": 1.0739, + "step": 732 + }, + { + "epoch": 0.167830566685747, + "grad_norm": 2.067094087600708, + "learning_rate": 1.9032097454681074e-05, + "loss": 1.1463, + "step": 733 + }, + { + "epoch": 0.16805953062392673, + "grad_norm": 1.6561821699142456, + "learning_rate": 1.9028911097823578e-05, + "loss": 1.1236, + "step": 734 + }, + { + "epoch": 0.16828849456210646, + "grad_norm": 1.81977117061615, + "learning_rate": 1.9025719772432006e-05, + "loss": 1.1047, + "step": 735 + }, + { + "epoch": 0.1685174585002862, + "grad_norm": 1.2570257186889648, + "learning_rate": 1.9022523480262517e-05, + "loss": 1.1246, + "step": 736 + }, + { + "epoch": 0.16874642243846594, + "grad_norm": 1.2872344255447388, + "learning_rate": 1.9019322223073997e-05, + "loss": 1.1549, + "step": 737 + }, + { + "epoch": 0.1689753863766457, + "grad_norm": 1.111267328262329, + "learning_rate": 1.901611600262808e-05, + "loss": 1.0933, + "step": 738 + }, + { + "epoch": 0.16920435031482542, + "grad_norm": 1.3469321727752686, + "learning_rate": 1.9012904820689114e-05, + "loss": 1.1454, + "step": 739 + }, + { + "epoch": 0.16943331425300515, + "grad_norm": 1.3398330211639404, + "learning_rate": 1.900968867902419e-05, + "loss": 1.0773, + "step": 740 + }, + { + "epoch": 0.1696622781911849, + "grad_norm": 1.4936128854751587, + "learning_rate": 1.9006467579403128e-05, + "loss": 1.1834, + "step": 741 + }, + { + "epoch": 0.16989124212936463, + "grad_norm": 1.2364333868026733, + "learning_rate": 1.9003241523598465e-05, + "loss": 1.1359, + "step": 742 + }, + { + "epoch": 0.17012020606754436, + "grad_norm": 2.011514663696289, + "learning_rate": 1.900001051338547e-05, + "loss": 1.1716, + "step": 743 + }, + { + "epoch": 0.1703491700057241, + "grad_norm": 1.3461471796035767, + "learning_rate": 1.899677455054215e-05, + "loss": 1.1082, + "step": 744 + }, + { + "epoch": 0.17057813394390384, + "grad_norm": 1.3014931678771973, + "learning_rate": 1.8993533636849223e-05, + "loss": 1.1362, + "step": 745 + }, + { + "epoch": 0.17080709788208356, + "grad_norm": 1.2128814458847046, + "learning_rate": 1.8990287774090137e-05, + "loss": 1.0883, + "step": 746 + }, + { + "epoch": 0.17103606182026332, + "grad_norm": 1.4403022527694702, + "learning_rate": 1.8987036964051065e-05, + "loss": 1.1065, + "step": 747 + }, + { + "epoch": 0.17126502575844305, + "grad_norm": 1.252302885055542, + "learning_rate": 1.8983781208520898e-05, + "loss": 1.1181, + "step": 748 + }, + { + "epoch": 0.17149398969662277, + "grad_norm": 1.318610429763794, + "learning_rate": 1.8980520509291255e-05, + "loss": 1.1492, + "step": 749 + }, + { + "epoch": 0.17172295363480253, + "grad_norm": 1.5634231567382812, + "learning_rate": 1.8977254868156465e-05, + "loss": 1.0785, + "step": 750 + }, + { + "epoch": 0.17195191757298225, + "grad_norm": 2.146759033203125, + "learning_rate": 1.8973984286913584e-05, + "loss": 1.1331, + "step": 751 + }, + { + "epoch": 0.17218088151116198, + "grad_norm": 1.2476396560668945, + "learning_rate": 1.897070876736239e-05, + "loss": 1.1564, + "step": 752 + }, + { + "epoch": 0.17240984544934174, + "grad_norm": 1.4595783948898315, + "learning_rate": 1.8967428311305375e-05, + "loss": 1.1155, + "step": 753 + }, + { + "epoch": 0.17263880938752146, + "grad_norm": 1.1093003749847412, + "learning_rate": 1.896414292054774e-05, + "loss": 1.1345, + "step": 754 + }, + { + "epoch": 0.1728677733257012, + "grad_norm": 1.1664748191833496, + "learning_rate": 1.896085259689741e-05, + "loss": 1.0583, + "step": 755 + }, + { + "epoch": 0.17309673726388095, + "grad_norm": 1.5106245279312134, + "learning_rate": 1.8957557342165026e-05, + "loss": 1.1689, + "step": 756 + }, + { + "epoch": 0.17332570120206067, + "grad_norm": 1.3270502090454102, + "learning_rate": 1.8954257158163936e-05, + "loss": 1.1115, + "step": 757 + }, + { + "epoch": 0.1735546651402404, + "grad_norm": 1.5598188638687134, + "learning_rate": 1.895095204671021e-05, + "loss": 1.1486, + "step": 758 + }, + { + "epoch": 0.17378362907842015, + "grad_norm": 1.5022259950637817, + "learning_rate": 1.8947642009622612e-05, + "loss": 1.1217, + "step": 759 + }, + { + "epoch": 0.17401259301659988, + "grad_norm": 1.3121103048324585, + "learning_rate": 1.8944327048722634e-05, + "loss": 1.1604, + "step": 760 + }, + { + "epoch": 0.1742415569547796, + "grad_norm": 1.4838922023773193, + "learning_rate": 1.8941007165834468e-05, + "loss": 1.2222, + "step": 761 + }, + { + "epoch": 0.17447052089295936, + "grad_norm": 1.4497283697128296, + "learning_rate": 1.8937682362785025e-05, + "loss": 1.1726, + "step": 762 + }, + { + "epoch": 0.1746994848311391, + "grad_norm": 1.3479863405227661, + "learning_rate": 1.8934352641403907e-05, + "loss": 1.1307, + "step": 763 + }, + { + "epoch": 0.17492844876931885, + "grad_norm": 1.622117042541504, + "learning_rate": 1.8931018003523438e-05, + "loss": 1.0881, + "step": 764 + }, + { + "epoch": 0.17515741270749857, + "grad_norm": 1.377864956855774, + "learning_rate": 1.892767845097864e-05, + "loss": 1.0872, + "step": 765 + }, + { + "epoch": 0.1753863766456783, + "grad_norm": 1.4404441118240356, + "learning_rate": 1.8924333985607237e-05, + "loss": 1.1333, + "step": 766 + }, + { + "epoch": 0.17561534058385805, + "grad_norm": 1.5697600841522217, + "learning_rate": 1.8920984609249667e-05, + "loss": 1.1069, + "step": 767 + }, + { + "epoch": 0.17584430452203778, + "grad_norm": 1.4469575881958008, + "learning_rate": 1.8917630323749056e-05, + "loss": 1.1294, + "step": 768 + }, + { + "epoch": 0.1760732684602175, + "grad_norm": 1.2435827255249023, + "learning_rate": 1.8914271130951246e-05, + "loss": 1.1437, + "step": 769 + }, + { + "epoch": 0.17630223239839726, + "grad_norm": 1.4122284650802612, + "learning_rate": 1.891090703270477e-05, + "loss": 1.1627, + "step": 770 + }, + { + "epoch": 0.176531196336577, + "grad_norm": 1.3929946422576904, + "learning_rate": 1.8907538030860865e-05, + "loss": 1.1148, + "step": 771 + }, + { + "epoch": 0.17676016027475672, + "grad_norm": 1.9755219221115112, + "learning_rate": 1.890416412727346e-05, + "loss": 1.1699, + "step": 772 + }, + { + "epoch": 0.17698912421293647, + "grad_norm": 1.6826586723327637, + "learning_rate": 1.890078532379919e-05, + "loss": 1.0846, + "step": 773 + }, + { + "epoch": 0.1772180881511162, + "grad_norm": 1.2816320657730103, + "learning_rate": 1.8897401622297375e-05, + "loss": 1.2024, + "step": 774 + }, + { + "epoch": 0.17744705208929593, + "grad_norm": 1.4006059169769287, + "learning_rate": 1.889401302463005e-05, + "loss": 1.116, + "step": 775 + }, + { + "epoch": 0.17767601602747568, + "grad_norm": 1.476759910583496, + "learning_rate": 1.889061953266192e-05, + "loss": 1.0944, + "step": 776 + }, + { + "epoch": 0.1779049799656554, + "grad_norm": 1.5726763010025024, + "learning_rate": 1.8887221148260404e-05, + "loss": 1.1472, + "step": 777 + }, + { + "epoch": 0.17813394390383513, + "grad_norm": 1.337249517440796, + "learning_rate": 1.8883817873295597e-05, + "loss": 1.1403, + "step": 778 + }, + { + "epoch": 0.1783629078420149, + "grad_norm": 1.338131308555603, + "learning_rate": 1.88804097096403e-05, + "loss": 1.0678, + "step": 779 + }, + { + "epoch": 0.17859187178019462, + "grad_norm": 1.291359543800354, + "learning_rate": 1.887699665916999e-05, + "loss": 1.1212, + "step": 780 + }, + { + "epoch": 0.17882083571837434, + "grad_norm": 1.7371107339859009, + "learning_rate": 1.8873578723762845e-05, + "loss": 1.0984, + "step": 781 + }, + { + "epoch": 0.1790497996565541, + "grad_norm": 1.5285062789916992, + "learning_rate": 1.8870155905299725e-05, + "loss": 1.1511, + "step": 782 + }, + { + "epoch": 0.17927876359473383, + "grad_norm": 1.693925380706787, + "learning_rate": 1.8866728205664177e-05, + "loss": 1.117, + "step": 783 + }, + { + "epoch": 0.17950772753291355, + "grad_norm": 1.2213523387908936, + "learning_rate": 1.8863295626742438e-05, + "loss": 1.2201, + "step": 784 + }, + { + "epoch": 0.1797366914710933, + "grad_norm": 1.410949945449829, + "learning_rate": 1.8859858170423423e-05, + "loss": 1.1321, + "step": 785 + }, + { + "epoch": 0.17996565540927303, + "grad_norm": 1.3400486707687378, + "learning_rate": 1.8856415838598738e-05, + "loss": 1.2231, + "step": 786 + }, + { + "epoch": 0.1801946193474528, + "grad_norm": 1.147056221961975, + "learning_rate": 1.885296863316267e-05, + "loss": 1.1623, + "step": 787 + }, + { + "epoch": 0.18042358328563252, + "grad_norm": 1.1560708284378052, + "learning_rate": 1.8849516556012183e-05, + "loss": 1.1123, + "step": 788 + }, + { + "epoch": 0.18065254722381224, + "grad_norm": 1.350136399269104, + "learning_rate": 1.8846059609046935e-05, + "loss": 1.1598, + "step": 789 + }, + { + "epoch": 0.180881511161992, + "grad_norm": 1.242692232131958, + "learning_rate": 1.8842597794169245e-05, + "loss": 1.1087, + "step": 790 + }, + { + "epoch": 0.18111047510017173, + "grad_norm": 1.2484183311462402, + "learning_rate": 1.883913111328413e-05, + "loss": 1.2007, + "step": 791 + }, + { + "epoch": 0.18133943903835145, + "grad_norm": 1.3073456287384033, + "learning_rate": 1.8835659568299268e-05, + "loss": 1.1419, + "step": 792 + }, + { + "epoch": 0.1815684029765312, + "grad_norm": 1.2637773752212524, + "learning_rate": 1.8832183161125026e-05, + "loss": 1.1126, + "step": 793 + }, + { + "epoch": 0.18179736691471093, + "grad_norm": 2.1271276473999023, + "learning_rate": 1.8828701893674435e-05, + "loss": 1.0735, + "step": 794 + }, + { + "epoch": 0.18202633085289066, + "grad_norm": 1.8395510911941528, + "learning_rate": 1.8825215767863215e-05, + "loss": 1.1168, + "step": 795 + }, + { + "epoch": 0.18225529479107042, + "grad_norm": 1.3857455253601074, + "learning_rate": 1.882172478560975e-05, + "loss": 1.1164, + "step": 796 + }, + { + "epoch": 0.18248425872925014, + "grad_norm": 1.313710331916809, + "learning_rate": 1.8818228948835095e-05, + "loss": 1.1883, + "step": 797 + }, + { + "epoch": 0.18271322266742987, + "grad_norm": 1.5113545656204224, + "learning_rate": 1.8814728259462978e-05, + "loss": 1.1948, + "step": 798 + }, + { + "epoch": 0.18294218660560962, + "grad_norm": 1.3891445398330688, + "learning_rate": 1.8811222719419808e-05, + "loss": 1.1182, + "step": 799 + }, + { + "epoch": 0.18317115054378935, + "grad_norm": 1.672825574874878, + "learning_rate": 1.8807712330634645e-05, + "loss": 1.1114, + "step": 800 + }, + { + "epoch": 0.18340011448196908, + "grad_norm": 1.299664855003357, + "learning_rate": 1.880419709503923e-05, + "loss": 1.1228, + "step": 801 + }, + { + "epoch": 0.18362907842014883, + "grad_norm": 1.6080543994903564, + "learning_rate": 1.8800677014567972e-05, + "loss": 1.1325, + "step": 802 + }, + { + "epoch": 0.18385804235832856, + "grad_norm": 1.4526350498199463, + "learning_rate": 1.8797152091157935e-05, + "loss": 1.109, + "step": 803 + }, + { + "epoch": 0.1840870062965083, + "grad_norm": 1.2981843948364258, + "learning_rate": 1.8793622326748857e-05, + "loss": 1.1266, + "step": 804 + }, + { + "epoch": 0.18431597023468804, + "grad_norm": 1.715675711631775, + "learning_rate": 1.879008772328314e-05, + "loss": 1.1661, + "step": 805 + }, + { + "epoch": 0.18454493417286777, + "grad_norm": 1.5412567853927612, + "learning_rate": 1.8786548282705847e-05, + "loss": 1.0697, + "step": 806 + }, + { + "epoch": 0.1847738981110475, + "grad_norm": 1.3529974222183228, + "learning_rate": 1.87830040069647e-05, + "loss": 1.1753, + "step": 807 + }, + { + "epoch": 0.18500286204922725, + "grad_norm": 1.229610800743103, + "learning_rate": 1.877945489801008e-05, + "loss": 1.1039, + "step": 808 + }, + { + "epoch": 0.18523182598740698, + "grad_norm": 1.704888939857483, + "learning_rate": 1.8775900957795042e-05, + "loss": 1.1366, + "step": 809 + }, + { + "epoch": 0.18546078992558673, + "grad_norm": 1.142831802368164, + "learning_rate": 1.877234218827528e-05, + "loss": 1.0946, + "step": 810 + }, + { + "epoch": 0.18568975386376646, + "grad_norm": 1.304379940032959, + "learning_rate": 1.876877859140916e-05, + "loss": 1.1063, + "step": 811 + }, + { + "epoch": 0.1859187178019462, + "grad_norm": 1.1813594102859497, + "learning_rate": 1.8765210169157703e-05, + "loss": 1.1067, + "step": 812 + }, + { + "epoch": 0.18614768174012594, + "grad_norm": 1.6390926837921143, + "learning_rate": 1.876163692348457e-05, + "loss": 1.1525, + "step": 813 + }, + { + "epoch": 0.18637664567830567, + "grad_norm": 1.6113990545272827, + "learning_rate": 1.87580588563561e-05, + "loss": 1.1551, + "step": 814 + }, + { + "epoch": 0.1866056096164854, + "grad_norm": 1.4802757501602173, + "learning_rate": 1.8754475969741272e-05, + "loss": 1.1503, + "step": 815 + }, + { + "epoch": 0.18683457355466515, + "grad_norm": 1.2753384113311768, + "learning_rate": 1.8750888265611708e-05, + "loss": 1.1212, + "step": 816 + }, + { + "epoch": 0.18706353749284488, + "grad_norm": 1.1978129148483276, + "learning_rate": 1.8747295745941705e-05, + "loss": 1.1429, + "step": 817 + }, + { + "epoch": 0.1872925014310246, + "grad_norm": 1.1764678955078125, + "learning_rate": 1.8743698412708187e-05, + "loss": 1.1981, + "step": 818 + }, + { + "epoch": 0.18752146536920436, + "grad_norm": 1.5134238004684448, + "learning_rate": 1.874009626789074e-05, + "loss": 1.1221, + "step": 819 + }, + { + "epoch": 0.1877504293073841, + "grad_norm": 1.2333041429519653, + "learning_rate": 1.87364893134716e-05, + "loss": 1.1512, + "step": 820 + }, + { + "epoch": 0.18797939324556381, + "grad_norm": 1.1870229244232178, + "learning_rate": 1.873287755143563e-05, + "loss": 1.1544, + "step": 821 + }, + { + "epoch": 0.18820835718374357, + "grad_norm": 1.222976565361023, + "learning_rate": 1.872926098377036e-05, + "loss": 1.1167, + "step": 822 + }, + { + "epoch": 0.1884373211219233, + "grad_norm": 1.2639892101287842, + "learning_rate": 1.872563961246596e-05, + "loss": 1.1357, + "step": 823 + }, + { + "epoch": 0.18866628506010302, + "grad_norm": 1.1272286176681519, + "learning_rate": 1.8722013439515236e-05, + "loss": 1.1222, + "step": 824 + }, + { + "epoch": 0.18889524899828278, + "grad_norm": 1.2517902851104736, + "learning_rate": 1.8718382466913642e-05, + "loss": 1.0577, + "step": 825 + }, + { + "epoch": 0.1891242129364625, + "grad_norm": 1.3976292610168457, + "learning_rate": 1.871474669665927e-05, + "loss": 1.125, + "step": 826 + }, + { + "epoch": 0.18935317687464223, + "grad_norm": 1.212823510169983, + "learning_rate": 1.8711106130752855e-05, + "loss": 1.1143, + "step": 827 + }, + { + "epoch": 0.189582140812822, + "grad_norm": 1.4794899225234985, + "learning_rate": 1.8707460771197773e-05, + "loss": 1.1415, + "step": 828 + }, + { + "epoch": 0.1898111047510017, + "grad_norm": 1.4691826105117798, + "learning_rate": 1.8703810620000033e-05, + "loss": 1.1055, + "step": 829 + }, + { + "epoch": 0.19004006868918144, + "grad_norm": 1.352226972579956, + "learning_rate": 1.8700155679168277e-05, + "loss": 1.1638, + "step": 830 + }, + { + "epoch": 0.1902690326273612, + "grad_norm": 1.3408719301223755, + "learning_rate": 1.8696495950713794e-05, + "loss": 1.1159, + "step": 831 + }, + { + "epoch": 0.19049799656554092, + "grad_norm": 1.0246559381484985, + "learning_rate": 1.8692831436650505e-05, + "loss": 1.1375, + "step": 832 + }, + { + "epoch": 0.19072696050372068, + "grad_norm": 1.1187310218811035, + "learning_rate": 1.8689162138994952e-05, + "loss": 1.1453, + "step": 833 + }, + { + "epoch": 0.1909559244419004, + "grad_norm": 1.2305899858474731, + "learning_rate": 1.868548805976633e-05, + "loss": 1.1069, + "step": 834 + }, + { + "epoch": 0.19118488838008013, + "grad_norm": 1.4860109090805054, + "learning_rate": 1.868180920098644e-05, + "loss": 1.0497, + "step": 835 + }, + { + "epoch": 0.19141385231825989, + "grad_norm": 1.2232182025909424, + "learning_rate": 1.867812556467974e-05, + "loss": 1.131, + "step": 836 + }, + { + "epoch": 0.1916428162564396, + "grad_norm": 1.3128330707550049, + "learning_rate": 1.8674437152873296e-05, + "loss": 1.2106, + "step": 837 + }, + { + "epoch": 0.19187178019461934, + "grad_norm": 1.2296591997146606, + "learning_rate": 1.8670743967596817e-05, + "loss": 1.1277, + "step": 838 + }, + { + "epoch": 0.1921007441327991, + "grad_norm": 1.332777976989746, + "learning_rate": 1.8667046010882627e-05, + "loss": 1.1003, + "step": 839 + }, + { + "epoch": 0.19232970807097882, + "grad_norm": 1.8379496335983276, + "learning_rate": 1.866334328476568e-05, + "loss": 1.1812, + "step": 840 + }, + { + "epoch": 0.19255867200915855, + "grad_norm": 1.1897050142288208, + "learning_rate": 1.865963579128356e-05, + "loss": 1.0811, + "step": 841 + }, + { + "epoch": 0.1927876359473383, + "grad_norm": 1.7345019578933716, + "learning_rate": 1.8655923532476463e-05, + "loss": 1.1679, + "step": 842 + }, + { + "epoch": 0.19301659988551803, + "grad_norm": 1.20344877243042, + "learning_rate": 1.865220651038722e-05, + "loss": 1.1832, + "step": 843 + }, + { + "epoch": 0.19324556382369776, + "grad_norm": 1.2605164051055908, + "learning_rate": 1.864848472706127e-05, + "loss": 1.1355, + "step": 844 + }, + { + "epoch": 0.1934745277618775, + "grad_norm": 1.1412873268127441, + "learning_rate": 1.864475818454669e-05, + "loss": 1.1202, + "step": 845 + }, + { + "epoch": 0.19370349170005724, + "grad_norm": 1.1985411643981934, + "learning_rate": 1.8641026884894156e-05, + "loss": 1.1509, + "step": 846 + }, + { + "epoch": 0.19393245563823697, + "grad_norm": 1.0788958072662354, + "learning_rate": 1.8637290830156972e-05, + "loss": 1.1141, + "step": 847 + }, + { + "epoch": 0.19416141957641672, + "grad_norm": 1.2294611930847168, + "learning_rate": 1.8633550022391062e-05, + "loss": 1.1473, + "step": 848 + }, + { + "epoch": 0.19439038351459645, + "grad_norm": 1.1396217346191406, + "learning_rate": 1.8629804463654956e-05, + "loss": 1.141, + "step": 849 + }, + { + "epoch": 0.19461934745277618, + "grad_norm": 1.0818895101547241, + "learning_rate": 1.8626054156009807e-05, + "loss": 1.1195, + "step": 850 + }, + { + "epoch": 0.19484831139095593, + "grad_norm": 1.287191390991211, + "learning_rate": 1.862229910151938e-05, + "loss": 1.1387, + "step": 851 + }, + { + "epoch": 0.19507727532913566, + "grad_norm": 1.1082041263580322, + "learning_rate": 1.8618539302250044e-05, + "loss": 1.1418, + "step": 852 + }, + { + "epoch": 0.19530623926731538, + "grad_norm": 1.2280734777450562, + "learning_rate": 1.8614774760270785e-05, + "loss": 1.157, + "step": 853 + }, + { + "epoch": 0.19553520320549514, + "grad_norm": 1.1522899866104126, + "learning_rate": 1.8611005477653204e-05, + "loss": 1.0825, + "step": 854 + }, + { + "epoch": 0.19576416714367487, + "grad_norm": 1.2309021949768066, + "learning_rate": 1.8607231456471505e-05, + "loss": 1.151, + "step": 855 + }, + { + "epoch": 0.19599313108185462, + "grad_norm": 1.141215205192566, + "learning_rate": 1.8603452698802498e-05, + "loss": 1.0634, + "step": 856 + }, + { + "epoch": 0.19622209502003435, + "grad_norm": 1.6352757215499878, + "learning_rate": 1.85996692067256e-05, + "loss": 1.14, + "step": 857 + }, + { + "epoch": 0.19645105895821408, + "grad_norm": 1.2608399391174316, + "learning_rate": 1.859588098232284e-05, + "loss": 1.1651, + "step": 858 + }, + { + "epoch": 0.19668002289639383, + "grad_norm": 1.3403522968292236, + "learning_rate": 1.859208802767884e-05, + "loss": 1.0751, + "step": 859 + }, + { + "epoch": 0.19690898683457356, + "grad_norm": 1.5046757459640503, + "learning_rate": 1.858829034488084e-05, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.19713795077275328, + "grad_norm": 1.1532163619995117, + "learning_rate": 1.8584487936018663e-05, + "loss": 1.0847, + "step": 861 + }, + { + "epoch": 0.19736691471093304, + "grad_norm": 1.1010034084320068, + "learning_rate": 1.858068080318475e-05, + "loss": 1.1729, + "step": 862 + }, + { + "epoch": 0.19759587864911277, + "grad_norm": 1.2024052143096924, + "learning_rate": 1.857686894847413e-05, + "loss": 1.1543, + "step": 863 + }, + { + "epoch": 0.1978248425872925, + "grad_norm": 1.4353835582733154, + "learning_rate": 1.8573052373984435e-05, + "loss": 1.1061, + "step": 864 + }, + { + "epoch": 0.19805380652547225, + "grad_norm": 1.281450867652893, + "learning_rate": 1.8569231081815895e-05, + "loss": 1.0886, + "step": 865 + }, + { + "epoch": 0.19828277046365198, + "grad_norm": 1.2481147050857544, + "learning_rate": 1.8565405074071338e-05, + "loss": 1.0985, + "step": 866 + }, + { + "epoch": 0.1985117344018317, + "grad_norm": 1.2200899124145508, + "learning_rate": 1.8561574352856176e-05, + "loss": 1.1767, + "step": 867 + }, + { + "epoch": 0.19874069834001146, + "grad_norm": 1.187233328819275, + "learning_rate": 1.855773892027843e-05, + "loss": 1.0757, + "step": 868 + }, + { + "epoch": 0.19896966227819118, + "grad_norm": 1.319388508796692, + "learning_rate": 1.855389877844871e-05, + "loss": 1.1752, + "step": 869 + }, + { + "epoch": 0.1991986262163709, + "grad_norm": 1.2449707984924316, + "learning_rate": 1.8550053929480202e-05, + "loss": 1.1535, + "step": 870 + }, + { + "epoch": 0.19942759015455067, + "grad_norm": 1.315616250038147, + "learning_rate": 1.8546204375488702e-05, + "loss": 1.0604, + "step": 871 + }, + { + "epoch": 0.1996565540927304, + "grad_norm": 1.368754267692566, + "learning_rate": 1.8542350118592585e-05, + "loss": 1.1437, + "step": 872 + }, + { + "epoch": 0.19988551803091012, + "grad_norm": 1.2946205139160156, + "learning_rate": 1.853849116091282e-05, + "loss": 1.1325, + "step": 873 + }, + { + "epoch": 0.20011448196908987, + "grad_norm": 1.3224684000015259, + "learning_rate": 1.853462750457295e-05, + "loss": 1.0641, + "step": 874 + }, + { + "epoch": 0.2003434459072696, + "grad_norm": 1.159767985343933, + "learning_rate": 1.8530759151699122e-05, + "loss": 1.0747, + "step": 875 + }, + { + "epoch": 0.20057240984544933, + "grad_norm": 1.1444604396820068, + "learning_rate": 1.8526886104420056e-05, + "loss": 1.1565, + "step": 876 + }, + { + "epoch": 0.20080137378362908, + "grad_norm": 1.265729308128357, + "learning_rate": 1.8523008364867056e-05, + "loss": 1.0907, + "step": 877 + }, + { + "epoch": 0.2010303377218088, + "grad_norm": 1.0625101327896118, + "learning_rate": 1.851912593517401e-05, + "loss": 1.1093, + "step": 878 + }, + { + "epoch": 0.20125930165998857, + "grad_norm": 1.4997196197509766, + "learning_rate": 1.8515238817477382e-05, + "loss": 1.1674, + "step": 879 + }, + { + "epoch": 0.2014882655981683, + "grad_norm": 1.2591129541397095, + "learning_rate": 1.8511347013916228e-05, + "loss": 1.1546, + "step": 880 + }, + { + "epoch": 0.20171722953634802, + "grad_norm": 1.1916770935058594, + "learning_rate": 1.850745052663217e-05, + "loss": 1.094, + "step": 881 + }, + { + "epoch": 0.20194619347452777, + "grad_norm": 1.0707792043685913, + "learning_rate": 1.850354935776941e-05, + "loss": 1.0586, + "step": 882 + }, + { + "epoch": 0.2021751574127075, + "grad_norm": 1.159035086631775, + "learning_rate": 1.8499643509474738e-05, + "loss": 1.1245, + "step": 883 + }, + { + "epoch": 0.20240412135088723, + "grad_norm": 1.2937264442443848, + "learning_rate": 1.8495732983897504e-05, + "loss": 1.1293, + "step": 884 + }, + { + "epoch": 0.20263308528906698, + "grad_norm": 1.2372575998306274, + "learning_rate": 1.8491817783189636e-05, + "loss": 1.1198, + "step": 885 + }, + { + "epoch": 0.2028620492272467, + "grad_norm": 1.1078606843948364, + "learning_rate": 1.8487897909505637e-05, + "loss": 1.1512, + "step": 886 + }, + { + "epoch": 0.20309101316542644, + "grad_norm": 1.2595610618591309, + "learning_rate": 1.848397336500258e-05, + "loss": 1.1433, + "step": 887 + }, + { + "epoch": 0.2033199771036062, + "grad_norm": 1.1265288591384888, + "learning_rate": 1.848004415184011e-05, + "loss": 1.1761, + "step": 888 + }, + { + "epoch": 0.20354894104178592, + "grad_norm": 1.1440848112106323, + "learning_rate": 1.8476110272180443e-05, + "loss": 1.173, + "step": 889 + }, + { + "epoch": 0.20377790497996565, + "grad_norm": 1.281594157218933, + "learning_rate": 1.8472171728188356e-05, + "loss": 1.1097, + "step": 890 + }, + { + "epoch": 0.2040068689181454, + "grad_norm": 1.2258485555648804, + "learning_rate": 1.8468228522031197e-05, + "loss": 1.0735, + "step": 891 + }, + { + "epoch": 0.20423583285632513, + "grad_norm": 1.1039539575576782, + "learning_rate": 1.8464280655878876e-05, + "loss": 1.1628, + "step": 892 + }, + { + "epoch": 0.20446479679450486, + "grad_norm": 1.0621813535690308, + "learning_rate": 1.846032813190388e-05, + "loss": 1.1138, + "step": 893 + }, + { + "epoch": 0.2046937607326846, + "grad_norm": 1.2525299787521362, + "learning_rate": 1.8456370952281243e-05, + "loss": 1.1612, + "step": 894 + }, + { + "epoch": 0.20492272467086434, + "grad_norm": 1.553027629852295, + "learning_rate": 1.845240911918857e-05, + "loss": 1.1157, + "step": 895 + }, + { + "epoch": 0.20515168860904406, + "grad_norm": 1.309720754623413, + "learning_rate": 1.844844263480602e-05, + "loss": 1.1009, + "step": 896 + }, + { + "epoch": 0.20538065254722382, + "grad_norm": 1.2567998170852661, + "learning_rate": 1.8444471501316324e-05, + "loss": 1.136, + "step": 897 + }, + { + "epoch": 0.20560961648540355, + "grad_norm": 1.242303490638733, + "learning_rate": 1.8440495720904758e-05, + "loss": 1.1018, + "step": 898 + }, + { + "epoch": 0.20583858042358327, + "grad_norm": 1.5786601305007935, + "learning_rate": 1.843651529575916e-05, + "loss": 1.126, + "step": 899 + }, + { + "epoch": 0.20606754436176303, + "grad_norm": 1.179705262184143, + "learning_rate": 1.843253022806993e-05, + "loss": 1.1069, + "step": 900 + }, + { + "epoch": 0.20629650829994275, + "grad_norm": 1.4622169733047485, + "learning_rate": 1.842854052003001e-05, + "loss": 1.0582, + "step": 901 + }, + { + "epoch": 0.2065254722381225, + "grad_norm": 1.43120276927948, + "learning_rate": 1.842454617383491e-05, + "loss": 1.0877, + "step": 902 + }, + { + "epoch": 0.20675443617630224, + "grad_norm": 1.0542099475860596, + "learning_rate": 1.842054719168268e-05, + "loss": 1.0746, + "step": 903 + }, + { + "epoch": 0.20698340011448196, + "grad_norm": 1.2692487239837646, + "learning_rate": 1.841654357577393e-05, + "loss": 1.1247, + "step": 904 + }, + { + "epoch": 0.20721236405266172, + "grad_norm": 1.1795223951339722, + "learning_rate": 1.8412535328311813e-05, + "loss": 1.186, + "step": 905 + }, + { + "epoch": 0.20744132799084145, + "grad_norm": 1.6526825428009033, + "learning_rate": 1.8408522451502038e-05, + "loss": 1.2059, + "step": 906 + }, + { + "epoch": 0.20767029192902117, + "grad_norm": 2.6972272396087646, + "learning_rate": 1.840450494755285e-05, + "loss": 1.186, + "step": 907 + }, + { + "epoch": 0.20789925586720093, + "grad_norm": 1.294978141784668, + "learning_rate": 1.840048281867506e-05, + "loss": 1.1069, + "step": 908 + }, + { + "epoch": 0.20812821980538065, + "grad_norm": 1.686403512954712, + "learning_rate": 1.8396456067082e-05, + "loss": 1.1007, + "step": 909 + }, + { + "epoch": 0.20835718374356038, + "grad_norm": 1.4886486530303955, + "learning_rate": 1.839242469498956e-05, + "loss": 1.1646, + "step": 910 + }, + { + "epoch": 0.20858614768174014, + "grad_norm": 1.4273146390914917, + "learning_rate": 1.8388388704616177e-05, + "loss": 1.0601, + "step": 911 + }, + { + "epoch": 0.20881511161991986, + "grad_norm": 1.9175933599472046, + "learning_rate": 1.8384348098182815e-05, + "loss": 1.1559, + "step": 912 + }, + { + "epoch": 0.2090440755580996, + "grad_norm": 1.3737765550613403, + "learning_rate": 1.8380302877912993e-05, + "loss": 1.185, + "step": 913 + }, + { + "epoch": 0.20927303949627934, + "grad_norm": 1.3466029167175293, + "learning_rate": 1.837625304603275e-05, + "loss": 1.14, + "step": 914 + }, + { + "epoch": 0.20950200343445907, + "grad_norm": 1.427209734916687, + "learning_rate": 1.837219860477069e-05, + "loss": 1.1444, + "step": 915 + }, + { + "epoch": 0.2097309673726388, + "grad_norm": 1.7554882764816284, + "learning_rate": 1.836813955635793e-05, + "loss": 1.1354, + "step": 916 + }, + { + "epoch": 0.20995993131081855, + "grad_norm": 1.5630890130996704, + "learning_rate": 1.8364075903028128e-05, + "loss": 1.0917, + "step": 917 + }, + { + "epoch": 0.21018889524899828, + "grad_norm": 1.6305307149887085, + "learning_rate": 1.836000764701748e-05, + "loss": 1.0982, + "step": 918 + }, + { + "epoch": 0.210417859187178, + "grad_norm": 1.154215931892395, + "learning_rate": 1.8355934790564718e-05, + "loss": 1.1199, + "step": 919 + }, + { + "epoch": 0.21064682312535776, + "grad_norm": 1.440573811531067, + "learning_rate": 1.8351857335911094e-05, + "loss": 1.1621, + "step": 920 + }, + { + "epoch": 0.2108757870635375, + "grad_norm": 1.2942014932632446, + "learning_rate": 1.83477752853004e-05, + "loss": 1.1378, + "step": 921 + }, + { + "epoch": 0.21110475100171722, + "grad_norm": 3.2026820182800293, + "learning_rate": 1.8343688640978955e-05, + "loss": 1.2049, + "step": 922 + }, + { + "epoch": 0.21133371493989697, + "grad_norm": 1.3892024755477905, + "learning_rate": 1.8339597405195607e-05, + "loss": 1.1328, + "step": 923 + }, + { + "epoch": 0.2115626788780767, + "grad_norm": 1.5560742616653442, + "learning_rate": 1.833550158020172e-05, + "loss": 1.1212, + "step": 924 + }, + { + "epoch": 0.21179164281625643, + "grad_norm": 1.1782360076904297, + "learning_rate": 1.83314011682512e-05, + "loss": 1.1153, + "step": 925 + }, + { + "epoch": 0.21202060675443618, + "grad_norm": 1.5242149829864502, + "learning_rate": 1.832729617160047e-05, + "loss": 1.1059, + "step": 926 + }, + { + "epoch": 0.2122495706926159, + "grad_norm": 1.157873272895813, + "learning_rate": 1.8323186592508474e-05, + "loss": 1.1489, + "step": 927 + }, + { + "epoch": 0.21247853463079566, + "grad_norm": 1.1628938913345337, + "learning_rate": 1.8319072433236677e-05, + "loss": 1.0962, + "step": 928 + }, + { + "epoch": 0.2127074985689754, + "grad_norm": 1.0474616289138794, + "learning_rate": 1.831495369604907e-05, + "loss": 1.1517, + "step": 929 + }, + { + "epoch": 0.21293646250715512, + "grad_norm": 1.7184207439422607, + "learning_rate": 1.831083038321215e-05, + "loss": 1.1172, + "step": 930 + }, + { + "epoch": 0.21316542644533487, + "grad_norm": 1.8269110918045044, + "learning_rate": 1.830670249699495e-05, + "loss": 1.1521, + "step": 931 + }, + { + "epoch": 0.2133943903835146, + "grad_norm": 1.1638559103012085, + "learning_rate": 1.830257003966901e-05, + "loss": 1.1759, + "step": 932 + }, + { + "epoch": 0.21362335432169433, + "grad_norm": 1.1705409288406372, + "learning_rate": 1.8298433013508384e-05, + "loss": 1.0605, + "step": 933 + }, + { + "epoch": 0.21385231825987408, + "grad_norm": 1.2644610404968262, + "learning_rate": 1.8294291420789648e-05, + "loss": 1.1405, + "step": 934 + }, + { + "epoch": 0.2140812821980538, + "grad_norm": 1.0992416143417358, + "learning_rate": 1.8290145263791883e-05, + "loss": 1.108, + "step": 935 + }, + { + "epoch": 0.21431024613623353, + "grad_norm": 1.291539192199707, + "learning_rate": 1.8285994544796685e-05, + "loss": 1.084, + "step": 936 + }, + { + "epoch": 0.2145392100744133, + "grad_norm": 1.244376540184021, + "learning_rate": 1.8281839266088155e-05, + "loss": 1.1151, + "step": 937 + }, + { + "epoch": 0.21476817401259302, + "grad_norm": 1.0517001152038574, + "learning_rate": 1.8277679429952913e-05, + "loss": 1.1116, + "step": 938 + }, + { + "epoch": 0.21499713795077274, + "grad_norm": 1.0727293491363525, + "learning_rate": 1.827351503868008e-05, + "loss": 1.1381, + "step": 939 + }, + { + "epoch": 0.2152261018889525, + "grad_norm": 1.1680247783660889, + "learning_rate": 1.826934609456129e-05, + "loss": 1.1139, + "step": 940 + }, + { + "epoch": 0.21545506582713222, + "grad_norm": 1.4314526319503784, + "learning_rate": 1.826517259989067e-05, + "loss": 1.1778, + "step": 941 + }, + { + "epoch": 0.21568402976531195, + "grad_norm": 1.3765356540679932, + "learning_rate": 1.8260994556964865e-05, + "loss": 1.1226, + "step": 942 + }, + { + "epoch": 0.2159129937034917, + "grad_norm": 1.1225863695144653, + "learning_rate": 1.8256811968083016e-05, + "loss": 1.0951, + "step": 943 + }, + { + "epoch": 0.21614195764167143, + "grad_norm": 1.3155685663223267, + "learning_rate": 1.8252624835546768e-05, + "loss": 1.1253, + "step": 944 + }, + { + "epoch": 0.21637092157985116, + "grad_norm": 1.3809353113174438, + "learning_rate": 1.824843316166026e-05, + "loss": 1.0972, + "step": 945 + }, + { + "epoch": 0.21659988551803092, + "grad_norm": 1.5605497360229492, + "learning_rate": 1.8244236948730138e-05, + "loss": 1.1479, + "step": 946 + }, + { + "epoch": 0.21682884945621064, + "grad_norm": 1.3077151775360107, + "learning_rate": 1.8240036199065546e-05, + "loss": 1.0822, + "step": 947 + }, + { + "epoch": 0.21705781339439037, + "grad_norm": 1.3724560737609863, + "learning_rate": 1.8235830914978113e-05, + "loss": 1.1366, + "step": 948 + }, + { + "epoch": 0.21728677733257012, + "grad_norm": 1.2070374488830566, + "learning_rate": 1.8231621098781983e-05, + "loss": 1.1371, + "step": 949 + }, + { + "epoch": 0.21751574127074985, + "grad_norm": 1.3094055652618408, + "learning_rate": 1.822740675279377e-05, + "loss": 1.1442, + "step": 950 + }, + { + "epoch": 0.2177447052089296, + "grad_norm": 1.3075108528137207, + "learning_rate": 1.8223187879332604e-05, + "loss": 1.1095, + "step": 951 + }, + { + "epoch": 0.21797366914710933, + "grad_norm": 1.2486156225204468, + "learning_rate": 1.8218964480720093e-05, + "loss": 1.0987, + "step": 952 + }, + { + "epoch": 0.21820263308528906, + "grad_norm": 1.1056116819381714, + "learning_rate": 1.821473655928033e-05, + "loss": 1.116, + "step": 953 + }, + { + "epoch": 0.21843159702346882, + "grad_norm": 1.235161542892456, + "learning_rate": 1.8210504117339917e-05, + "loss": 1.062, + "step": 954 + }, + { + "epoch": 0.21866056096164854, + "grad_norm": 1.3450078964233398, + "learning_rate": 1.8206267157227918e-05, + "loss": 1.0997, + "step": 955 + }, + { + "epoch": 0.21888952489982827, + "grad_norm": 1.1602591276168823, + "learning_rate": 1.820202568127591e-05, + "loss": 1.124, + "step": 956 + }, + { + "epoch": 0.21911848883800802, + "grad_norm": 1.4237440824508667, + "learning_rate": 1.819777969181793e-05, + "loss": 1.148, + "step": 957 + }, + { + "epoch": 0.21934745277618775, + "grad_norm": 1.4019017219543457, + "learning_rate": 1.819352919119052e-05, + "loss": 1.1316, + "step": 958 + }, + { + "epoch": 0.21957641671436748, + "grad_norm": 1.2780940532684326, + "learning_rate": 1.818927418173269e-05, + "loss": 1.1654, + "step": 959 + }, + { + "epoch": 0.21980538065254723, + "grad_norm": 1.7911431789398193, + "learning_rate": 1.8185014665785936e-05, + "loss": 1.0586, + "step": 960 + }, + { + "epoch": 0.22003434459072696, + "grad_norm": 2.0923030376434326, + "learning_rate": 1.8180750645694236e-05, + "loss": 1.146, + "step": 961 + }, + { + "epoch": 0.2202633085289067, + "grad_norm": 1.1630254983901978, + "learning_rate": 1.8176482123804042e-05, + "loss": 1.1488, + "step": 962 + }, + { + "epoch": 0.22049227246708644, + "grad_norm": 1.14417564868927, + "learning_rate": 1.8172209102464288e-05, + "loss": 1.089, + "step": 963 + }, + { + "epoch": 0.22072123640526617, + "grad_norm": 1.4543980360031128, + "learning_rate": 1.816793158402638e-05, + "loss": 1.1678, + "step": 964 + }, + { + "epoch": 0.2209502003434459, + "grad_norm": 1.025010108947754, + "learning_rate": 1.8163649570844198e-05, + "loss": 1.062, + "step": 965 + }, + { + "epoch": 0.22117916428162565, + "grad_norm": 1.0538175106048584, + "learning_rate": 1.8159363065274104e-05, + "loss": 1.1371, + "step": 966 + }, + { + "epoch": 0.22140812821980538, + "grad_norm": 1.9237316846847534, + "learning_rate": 1.8155072069674923e-05, + "loss": 1.1064, + "step": 967 + }, + { + "epoch": 0.2216370921579851, + "grad_norm": 1.3368388414382935, + "learning_rate": 1.8150776586407957e-05, + "loss": 1.0746, + "step": 968 + }, + { + "epoch": 0.22186605609616486, + "grad_norm": 1.297314167022705, + "learning_rate": 1.814647661783697e-05, + "loss": 1.1678, + "step": 969 + }, + { + "epoch": 0.2220950200343446, + "grad_norm": 1.2015776634216309, + "learning_rate": 1.8142172166328198e-05, + "loss": 1.1321, + "step": 970 + }, + { + "epoch": 0.22232398397252431, + "grad_norm": 1.1826279163360596, + "learning_rate": 1.8137863234250346e-05, + "loss": 1.0897, + "step": 971 + }, + { + "epoch": 0.22255294791070407, + "grad_norm": 1.9954915046691895, + "learning_rate": 1.813354982397459e-05, + "loss": 1.0966, + "step": 972 + }, + { + "epoch": 0.2227819118488838, + "grad_norm": 1.4948241710662842, + "learning_rate": 1.8129231937874555e-05, + "loss": 1.0648, + "step": 973 + }, + { + "epoch": 0.22301087578706355, + "grad_norm": 1.5026756525039673, + "learning_rate": 1.812490957832634e-05, + "loss": 1.1085, + "step": 974 + }, + { + "epoch": 0.22323983972524328, + "grad_norm": 1.6371681690216064, + "learning_rate": 1.8120582747708503e-05, + "loss": 1.085, + "step": 975 + }, + { + "epoch": 0.223468803663423, + "grad_norm": 1.236262559890747, + "learning_rate": 1.8116251448402062e-05, + "loss": 1.1093, + "step": 976 + }, + { + "epoch": 0.22369776760160276, + "grad_norm": 1.208084225654602, + "learning_rate": 1.8111915682790494e-05, + "loss": 1.192, + "step": 977 + }, + { + "epoch": 0.2239267315397825, + "grad_norm": 1.4316471815109253, + "learning_rate": 1.810757545325974e-05, + "loss": 1.113, + "step": 978 + }, + { + "epoch": 0.2241556954779622, + "grad_norm": 1.1135729551315308, + "learning_rate": 1.8103230762198185e-05, + "loss": 1.1644, + "step": 979 + }, + { + "epoch": 0.22438465941614197, + "grad_norm": 1.1902830600738525, + "learning_rate": 1.809888161199668e-05, + "loss": 1.1535, + "step": 980 + }, + { + "epoch": 0.2246136233543217, + "grad_norm": 1.5543137788772583, + "learning_rate": 1.8094528005048527e-05, + "loss": 1.077, + "step": 981 + }, + { + "epoch": 0.22484258729250142, + "grad_norm": 1.174572229385376, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.0657, + "step": 982 + }, + { + "epoch": 0.22507155123068118, + "grad_norm": 1.3976424932479858, + "learning_rate": 1.8085807430497734e-05, + "loss": 1.1355, + "step": 983 + }, + { + "epoch": 0.2253005151688609, + "grad_norm": 1.2085835933685303, + "learning_rate": 1.808144046769395e-05, + "loss": 1.2107, + "step": 984 + }, + { + "epoch": 0.22552947910704063, + "grad_norm": 1.3082224130630493, + "learning_rate": 1.8077069057741235e-05, + "loss": 1.2256, + "step": 985 + }, + { + "epoch": 0.22575844304522039, + "grad_norm": 1.0391132831573486, + "learning_rate": 1.807269320304514e-05, + "loss": 1.1506, + "step": 986 + }, + { + "epoch": 0.2259874069834001, + "grad_norm": 1.0561646223068237, + "learning_rate": 1.806831290601365e-05, + "loss": 1.1065, + "step": 987 + }, + { + "epoch": 0.22621637092157984, + "grad_norm": 1.2133493423461914, + "learning_rate": 1.8063928169057214e-05, + "loss": 1.1006, + "step": 988 + }, + { + "epoch": 0.2264453348597596, + "grad_norm": 1.4187254905700684, + "learning_rate": 1.8059538994588715e-05, + "loss": 1.0748, + "step": 989 + }, + { + "epoch": 0.22667429879793932, + "grad_norm": 1.3195863962173462, + "learning_rate": 1.8055145385023477e-05, + "loss": 1.1143, + "step": 990 + }, + { + "epoch": 0.22690326273611905, + "grad_norm": 1.304660677909851, + "learning_rate": 1.8050747342779274e-05, + "loss": 1.2163, + "step": 991 + }, + { + "epoch": 0.2271322266742988, + "grad_norm": 1.0397827625274658, + "learning_rate": 1.80463448702763e-05, + "loss": 1.0848, + "step": 992 + }, + { + "epoch": 0.22736119061247853, + "grad_norm": 1.2411673069000244, + "learning_rate": 1.8041937969937206e-05, + "loss": 1.0982, + "step": 993 + }, + { + "epoch": 0.22759015455065826, + "grad_norm": 1.261560320854187, + "learning_rate": 1.803752664418707e-05, + "loss": 1.096, + "step": 994 + }, + { + "epoch": 0.227819118488838, + "grad_norm": 1.3280490636825562, + "learning_rate": 1.803311089545341e-05, + "loss": 1.1291, + "step": 995 + }, + { + "epoch": 0.22804808242701774, + "grad_norm": 1.3528070449829102, + "learning_rate": 1.8028690726166172e-05, + "loss": 1.1006, + "step": 996 + }, + { + "epoch": 0.2282770463651975, + "grad_norm": 1.1263134479522705, + "learning_rate": 1.8024266138757746e-05, + "loss": 1.1311, + "step": 997 + }, + { + "epoch": 0.22850601030337722, + "grad_norm": 1.3571945428848267, + "learning_rate": 1.8019837135662932e-05, + "loss": 1.1336, + "step": 998 + }, + { + "epoch": 0.22873497424155695, + "grad_norm": 1.3748575448989868, + "learning_rate": 1.801540371931898e-05, + "loss": 1.1514, + "step": 999 + }, + { + "epoch": 0.2289639381797367, + "grad_norm": 1.1356844902038574, + "learning_rate": 1.8010965892165568e-05, + "loss": 1.1703, + "step": 1000 + }, + { + "epoch": 0.22919290211791643, + "grad_norm": 1.219716191291809, + "learning_rate": 1.8006523656644787e-05, + "loss": 1.1123, + "step": 1001 + }, + { + "epoch": 0.22942186605609616, + "grad_norm": 1.0204182863235474, + "learning_rate": 1.8002077015201164e-05, + "loss": 1.1016, + "step": 1002 + }, + { + "epoch": 0.2296508299942759, + "grad_norm": 1.1001499891281128, + "learning_rate": 1.7997625970281652e-05, + "loss": 1.1063, + "step": 1003 + }, + { + "epoch": 0.22987979393245564, + "grad_norm": 1.3529033660888672, + "learning_rate": 1.7993170524335614e-05, + "loss": 1.1498, + "step": 1004 + }, + { + "epoch": 0.23010875787063537, + "grad_norm": 1.0929923057556152, + "learning_rate": 1.7988710679814857e-05, + "loss": 1.1105, + "step": 1005 + }, + { + "epoch": 0.23033772180881512, + "grad_norm": 1.1180248260498047, + "learning_rate": 1.798424643917359e-05, + "loss": 1.0907, + "step": 1006 + }, + { + "epoch": 0.23056668574699485, + "grad_norm": 1.199679970741272, + "learning_rate": 1.7979777804868447e-05, + "loss": 1.184, + "step": 1007 + }, + { + "epoch": 0.23079564968517458, + "grad_norm": 1.0665282011032104, + "learning_rate": 1.797530477935848e-05, + "loss": 1.1398, + "step": 1008 + }, + { + "epoch": 0.23102461362335433, + "grad_norm": 1.1431396007537842, + "learning_rate": 1.7970827365105157e-05, + "loss": 1.1085, + "step": 1009 + }, + { + "epoch": 0.23125357756153406, + "grad_norm": 1.1866995096206665, + "learning_rate": 1.796634556457236e-05, + "loss": 1.1075, + "step": 1010 + }, + { + "epoch": 0.23148254149971378, + "grad_norm": 1.0268393754959106, + "learning_rate": 1.7961859380226395e-05, + "loss": 1.1128, + "step": 1011 + }, + { + "epoch": 0.23171150543789354, + "grad_norm": 1.1771196126937866, + "learning_rate": 1.7957368814535963e-05, + "loss": 1.0616, + "step": 1012 + }, + { + "epoch": 0.23194046937607327, + "grad_norm": 1.1519027948379517, + "learning_rate": 1.7952873869972183e-05, + "loss": 1.1305, + "step": 1013 + }, + { + "epoch": 0.232169433314253, + "grad_norm": 1.0065557956695557, + "learning_rate": 1.7948374549008594e-05, + "loss": 1.088, + "step": 1014 + }, + { + "epoch": 0.23239839725243275, + "grad_norm": 1.4946367740631104, + "learning_rate": 1.7943870854121126e-05, + "loss": 1.1938, + "step": 1015 + }, + { + "epoch": 0.23262736119061247, + "grad_norm": 1.5711524486541748, + "learning_rate": 1.793936278778813e-05, + "loss": 1.0989, + "step": 1016 + }, + { + "epoch": 0.2328563251287922, + "grad_norm": 1.0987305641174316, + "learning_rate": 1.793485035249036e-05, + "loss": 1.1015, + "step": 1017 + }, + { + "epoch": 0.23308528906697196, + "grad_norm": 1.1849825382232666, + "learning_rate": 1.793033355071096e-05, + "loss": 1.1036, + "step": 1018 + }, + { + "epoch": 0.23331425300515168, + "grad_norm": 1.1868687868118286, + "learning_rate": 1.79258123849355e-05, + "loss": 1.0985, + "step": 1019 + }, + { + "epoch": 0.23354321694333144, + "grad_norm": 1.2682559490203857, + "learning_rate": 1.792128685765193e-05, + "loss": 1.1574, + "step": 1020 + }, + { + "epoch": 0.23377218088151117, + "grad_norm": 1.544455647468567, + "learning_rate": 1.7916756971350618e-05, + "loss": 1.1294, + "step": 1021 + }, + { + "epoch": 0.2340011448196909, + "grad_norm": 1.103712558746338, + "learning_rate": 1.7912222728524318e-05, + "loss": 1.0987, + "step": 1022 + }, + { + "epoch": 0.23423010875787065, + "grad_norm": 1.2829042673110962, + "learning_rate": 1.7907684131668186e-05, + "loss": 1.0943, + "step": 1023 + }, + { + "epoch": 0.23445907269605037, + "grad_norm": 1.112669587135315, + "learning_rate": 1.7903141183279776e-05, + "loss": 1.1518, + "step": 1024 + }, + { + "epoch": 0.2346880366342301, + "grad_norm": 1.0655471086502075, + "learning_rate": 1.789859388585903e-05, + "loss": 1.1347, + "step": 1025 + }, + { + "epoch": 0.23491700057240986, + "grad_norm": 1.2130173444747925, + "learning_rate": 1.7894042241908293e-05, + "loss": 1.1555, + "step": 1026 + }, + { + "epoch": 0.23514596451058958, + "grad_norm": 1.1426712274551392, + "learning_rate": 1.7889486253932292e-05, + "loss": 1.1248, + "step": 1027 + }, + { + "epoch": 0.2353749284487693, + "grad_norm": 1.0837373733520508, + "learning_rate": 1.7884925924438152e-05, + "loss": 1.1218, + "step": 1028 + }, + { + "epoch": 0.23560389238694907, + "grad_norm": 1.2106484174728394, + "learning_rate": 1.7880361255935385e-05, + "loss": 1.1164, + "step": 1029 + }, + { + "epoch": 0.2358328563251288, + "grad_norm": 1.155698537826538, + "learning_rate": 1.7875792250935883e-05, + "loss": 1.1404, + "step": 1030 + }, + { + "epoch": 0.23606182026330852, + "grad_norm": 1.0495966672897339, + "learning_rate": 1.7871218911953942e-05, + "loss": 1.1275, + "step": 1031 + }, + { + "epoch": 0.23629078420148827, + "grad_norm": 1.108055591583252, + "learning_rate": 1.7866641241506222e-05, + "loss": 1.0906, + "step": 1032 + }, + { + "epoch": 0.236519748139668, + "grad_norm": 1.1889424324035645, + "learning_rate": 1.7862059242111782e-05, + "loss": 1.1072, + "step": 1033 + }, + { + "epoch": 0.23674871207784773, + "grad_norm": 2.039444923400879, + "learning_rate": 1.7857472916292056e-05, + "loss": 1.1747, + "step": 1034 + }, + { + "epoch": 0.23697767601602748, + "grad_norm": 1.0959917306900024, + "learning_rate": 1.785288226657086e-05, + "loss": 1.0886, + "step": 1035 + }, + { + "epoch": 0.2372066399542072, + "grad_norm": 1.1916193962097168, + "learning_rate": 1.7848287295474397e-05, + "loss": 1.1372, + "step": 1036 + }, + { + "epoch": 0.23743560389238694, + "grad_norm": 1.3053346872329712, + "learning_rate": 1.7843688005531227e-05, + "loss": 1.1423, + "step": 1037 + }, + { + "epoch": 0.2376645678305667, + "grad_norm": 1.1222914457321167, + "learning_rate": 1.7839084399272317e-05, + "loss": 1.079, + "step": 1038 + }, + { + "epoch": 0.23789353176874642, + "grad_norm": 1.1740766763687134, + "learning_rate": 1.7834476479230978e-05, + "loss": 1.163, + "step": 1039 + }, + { + "epoch": 0.23812249570692615, + "grad_norm": 2.025049924850464, + "learning_rate": 1.782986424794292e-05, + "loss": 1.1855, + "step": 1040 + }, + { + "epoch": 0.2383514596451059, + "grad_norm": 1.2098287343978882, + "learning_rate": 1.7825247707946212e-05, + "loss": 1.1989, + "step": 1041 + }, + { + "epoch": 0.23858042358328563, + "grad_norm": 1.0993343591690063, + "learning_rate": 1.78206268617813e-05, + "loss": 1.1303, + "step": 1042 + }, + { + "epoch": 0.23880938752146538, + "grad_norm": 1.1957361698150635, + "learning_rate": 1.781600171199099e-05, + "loss": 1.1391, + "step": 1043 + }, + { + "epoch": 0.2390383514596451, + "grad_norm": 1.0565906763076782, + "learning_rate": 1.7811372261120468e-05, + "loss": 1.0929, + "step": 1044 + }, + { + "epoch": 0.23926731539782484, + "grad_norm": 1.4460198879241943, + "learning_rate": 1.780673851171728e-05, + "loss": 1.1438, + "step": 1045 + }, + { + "epoch": 0.2394962793360046, + "grad_norm": 1.2659794092178345, + "learning_rate": 1.7802100466331343e-05, + "loss": 1.1188, + "step": 1046 + }, + { + "epoch": 0.23972524327418432, + "grad_norm": 1.1114025115966797, + "learning_rate": 1.7797458127514934e-05, + "loss": 1.1157, + "step": 1047 + }, + { + "epoch": 0.23995420721236405, + "grad_norm": 2.1384947299957275, + "learning_rate": 1.779281149782269e-05, + "loss": 1.0959, + "step": 1048 + }, + { + "epoch": 0.2401831711505438, + "grad_norm": 1.1005151271820068, + "learning_rate": 1.7788160579811614e-05, + "loss": 1.0842, + "step": 1049 + }, + { + "epoch": 0.24041213508872353, + "grad_norm": 1.1792573928833008, + "learning_rate": 1.7783505376041063e-05, + "loss": 1.0822, + "step": 1050 + }, + { + "epoch": 0.24064109902690325, + "grad_norm": 1.1286247968673706, + "learning_rate": 1.7778845889072764e-05, + "loss": 1.1103, + "step": 1051 + }, + { + "epoch": 0.240870062965083, + "grad_norm": 1.0358811616897583, + "learning_rate": 1.777418212147079e-05, + "loss": 1.0796, + "step": 1052 + }, + { + "epoch": 0.24109902690326274, + "grad_norm": 1.1496690511703491, + "learning_rate": 1.7769514075801573e-05, + "loss": 1.1426, + "step": 1053 + }, + { + "epoch": 0.24132799084144246, + "grad_norm": 0.995430588722229, + "learning_rate": 1.77648417546339e-05, + "loss": 1.0874, + "step": 1054 + }, + { + "epoch": 0.24155695477962222, + "grad_norm": 1.176997423171997, + "learning_rate": 1.7760165160538907e-05, + "loss": 1.152, + "step": 1055 + }, + { + "epoch": 0.24178591871780195, + "grad_norm": 1.2487221956253052, + "learning_rate": 1.775548429609009e-05, + "loss": 1.1166, + "step": 1056 + }, + { + "epoch": 0.24201488265598167, + "grad_norm": 1.1057595014572144, + "learning_rate": 1.7750799163863287e-05, + "loss": 1.1729, + "step": 1057 + }, + { + "epoch": 0.24224384659416143, + "grad_norm": 1.2305538654327393, + "learning_rate": 1.774610976643669e-05, + "loss": 1.1503, + "step": 1058 + }, + { + "epoch": 0.24247281053234115, + "grad_norm": 1.1498816013336182, + "learning_rate": 1.7741416106390828e-05, + "loss": 1.0871, + "step": 1059 + }, + { + "epoch": 0.24270177447052088, + "grad_norm": 1.7206319570541382, + "learning_rate": 1.7736718186308585e-05, + "loss": 1.1571, + "step": 1060 + }, + { + "epoch": 0.24293073840870064, + "grad_norm": 1.1156868934631348, + "learning_rate": 1.7732016008775193e-05, + "loss": 1.087, + "step": 1061 + }, + { + "epoch": 0.24315970234688036, + "grad_norm": 1.4100158214569092, + "learning_rate": 1.7727309576378213e-05, + "loss": 1.1383, + "step": 1062 + }, + { + "epoch": 0.2433886662850601, + "grad_norm": 1.3357410430908203, + "learning_rate": 1.7722598891707562e-05, + "loss": 1.1269, + "step": 1063 + }, + { + "epoch": 0.24361763022323984, + "grad_norm": 2.102534294128418, + "learning_rate": 1.7717883957355484e-05, + "loss": 1.0601, + "step": 1064 + }, + { + "epoch": 0.24384659416141957, + "grad_norm": 1.1543803215026855, + "learning_rate": 1.7713164775916573e-05, + "loss": 1.1185, + "step": 1065 + }, + { + "epoch": 0.24407555809959933, + "grad_norm": 1.2533857822418213, + "learning_rate": 1.7708441349987753e-05, + "loss": 1.0979, + "step": 1066 + }, + { + "epoch": 0.24430452203777905, + "grad_norm": 1.1019551753997803, + "learning_rate": 1.7703713682168288e-05, + "loss": 1.0778, + "step": 1067 + }, + { + "epoch": 0.24453348597595878, + "grad_norm": 1.085174798965454, + "learning_rate": 1.7698981775059767e-05, + "loss": 1.1028, + "step": 1068 + }, + { + "epoch": 0.24476244991413854, + "grad_norm": 1.204268217086792, + "learning_rate": 1.7694245631266124e-05, + "loss": 1.0967, + "step": 1069 + }, + { + "epoch": 0.24499141385231826, + "grad_norm": 1.129050850868225, + "learning_rate": 1.768950525339362e-05, + "loss": 1.0997, + "step": 1070 + }, + { + "epoch": 0.245220377790498, + "grad_norm": 1.1998200416564941, + "learning_rate": 1.7684760644050847e-05, + "loss": 1.0759, + "step": 1071 + }, + { + "epoch": 0.24544934172867774, + "grad_norm": 2.0567846298217773, + "learning_rate": 1.7680011805848715e-05, + "loss": 1.0642, + "step": 1072 + }, + { + "epoch": 0.24567830566685747, + "grad_norm": 1.1932883262634277, + "learning_rate": 1.767525874140048e-05, + "loss": 1.0918, + "step": 1073 + }, + { + "epoch": 0.2459072696050372, + "grad_norm": 1.18972909450531, + "learning_rate": 1.7670501453321705e-05, + "loss": 1.0716, + "step": 1074 + }, + { + "epoch": 0.24613623354321695, + "grad_norm": 4.0288825035095215, + "learning_rate": 1.7665739944230296e-05, + "loss": 1.1541, + "step": 1075 + }, + { + "epoch": 0.24636519748139668, + "grad_norm": 1.1446605920791626, + "learning_rate": 1.766097421674646e-05, + "loss": 1.0506, + "step": 1076 + }, + { + "epoch": 0.2465941614195764, + "grad_norm": 0.995333731174469, + "learning_rate": 1.7656204273492746e-05, + "loss": 1.115, + "step": 1077 + }, + { + "epoch": 0.24682312535775616, + "grad_norm": 1.2963595390319824, + "learning_rate": 1.7651430117094005e-05, + "loss": 1.1066, + "step": 1078 + }, + { + "epoch": 0.2470520892959359, + "grad_norm": 1.3206766843795776, + "learning_rate": 1.7646651750177424e-05, + "loss": 1.1593, + "step": 1079 + }, + { + "epoch": 0.24728105323411562, + "grad_norm": 1.1858936548233032, + "learning_rate": 1.7641869175372493e-05, + "loss": 1.1485, + "step": 1080 + }, + { + "epoch": 0.24751001717229537, + "grad_norm": 1.3568990230560303, + "learning_rate": 1.7637082395311024e-05, + "loss": 1.1467, + "step": 1081 + }, + { + "epoch": 0.2477389811104751, + "grad_norm": 1.4126697778701782, + "learning_rate": 1.7632291412627146e-05, + "loss": 1.1229, + "step": 1082 + }, + { + "epoch": 0.24796794504865483, + "grad_norm": 1.0935779809951782, + "learning_rate": 1.7627496229957288e-05, + "loss": 1.1371, + "step": 1083 + }, + { + "epoch": 0.24819690898683458, + "grad_norm": 1.2460694313049316, + "learning_rate": 1.7622696849940204e-05, + "loss": 1.0996, + "step": 1084 + }, + { + "epoch": 0.2484258729250143, + "grad_norm": 1.2605793476104736, + "learning_rate": 1.7617893275216953e-05, + "loss": 1.0923, + "step": 1085 + }, + { + "epoch": 0.24865483686319403, + "grad_norm": 1.437913417816162, + "learning_rate": 1.76130855084309e-05, + "loss": 1.1114, + "step": 1086 + }, + { + "epoch": 0.2488838008013738, + "grad_norm": 1.0823016166687012, + "learning_rate": 1.7608273552227723e-05, + "loss": 1.1255, + "step": 1087 + }, + { + "epoch": 0.24911276473955352, + "grad_norm": 1.158138632774353, + "learning_rate": 1.7603457409255397e-05, + "loss": 1.12, + "step": 1088 + }, + { + "epoch": 0.24934172867773327, + "grad_norm": 1.2879137992858887, + "learning_rate": 1.7598637082164204e-05, + "loss": 1.0848, + "step": 1089 + }, + { + "epoch": 0.249570692615913, + "grad_norm": 1.2041521072387695, + "learning_rate": 1.759381257360673e-05, + "loss": 1.1246, + "step": 1090 + }, + { + "epoch": 0.24979965655409272, + "grad_norm": 1.3644989728927612, + "learning_rate": 1.7588983886237868e-05, + "loss": 1.0443, + "step": 1091 + }, + { + "epoch": 0.2500286204922725, + "grad_norm": 1.232747197151184, + "learning_rate": 1.75841510227148e-05, + "loss": 1.131, + "step": 1092 + }, + { + "epoch": 0.2502575844304522, + "grad_norm": 1.1022595167160034, + "learning_rate": 1.7579313985697006e-05, + "loss": 1.0583, + "step": 1093 + }, + { + "epoch": 0.25048654836863193, + "grad_norm": 1.0636167526245117, + "learning_rate": 1.7574472777846276e-05, + "loss": 1.0952, + "step": 1094 + }, + { + "epoch": 0.25071551230681166, + "grad_norm": 1.6139863729476929, + "learning_rate": 1.7569627401826673e-05, + "loss": 1.073, + "step": 1095 + }, + { + "epoch": 0.2509444762449914, + "grad_norm": 1.2776731252670288, + "learning_rate": 1.756477786030458e-05, + "loss": 1.0649, + "step": 1096 + }, + { + "epoch": 0.25117344018317117, + "grad_norm": 1.2996584177017212, + "learning_rate": 1.7559924155948652e-05, + "loss": 1.0911, + "step": 1097 + }, + { + "epoch": 0.2514024041213509, + "grad_norm": 1.0315338373184204, + "learning_rate": 1.7555066291429845e-05, + "loss": 1.0903, + "step": 1098 + }, + { + "epoch": 0.2516313680595306, + "grad_norm": 1.1848983764648438, + "learning_rate": 1.7550204269421395e-05, + "loss": 1.1727, + "step": 1099 + }, + { + "epoch": 0.25186033199771035, + "grad_norm": 1.332460880279541, + "learning_rate": 1.754533809259884e-05, + "loss": 1.1459, + "step": 1100 + }, + { + "epoch": 0.2520892959358901, + "grad_norm": 1.4366538524627686, + "learning_rate": 1.7540467763639994e-05, + "loss": 1.163, + "step": 1101 + }, + { + "epoch": 0.25231825987406986, + "grad_norm": 1.1349256038665771, + "learning_rate": 1.7535593285224958e-05, + "loss": 1.1246, + "step": 1102 + }, + { + "epoch": 0.2525472238122496, + "grad_norm": 1.2521072626113892, + "learning_rate": 1.7530714660036112e-05, + "loss": 1.147, + "step": 1103 + }, + { + "epoch": 0.2527761877504293, + "grad_norm": 1.1671040058135986, + "learning_rate": 1.7525831890758125e-05, + "loss": 1.1429, + "step": 1104 + }, + { + "epoch": 0.25300515168860904, + "grad_norm": 1.0925365686416626, + "learning_rate": 1.7520944980077948e-05, + "loss": 1.1264, + "step": 1105 + }, + { + "epoch": 0.25323411562678877, + "grad_norm": 1.1112512350082397, + "learning_rate": 1.7516053930684804e-05, + "loss": 1.1713, + "step": 1106 + }, + { + "epoch": 0.2534630795649685, + "grad_norm": 1.217628002166748, + "learning_rate": 1.7511158745270197e-05, + "loss": 1.0978, + "step": 1107 + }, + { + "epoch": 0.2536920435031483, + "grad_norm": 1.3462601900100708, + "learning_rate": 1.7506259426527903e-05, + "loss": 1.0702, + "step": 1108 + }, + { + "epoch": 0.253921007441328, + "grad_norm": 1.3914707899093628, + "learning_rate": 1.750135597715398e-05, + "loss": 1.1011, + "step": 1109 + }, + { + "epoch": 0.25414997137950773, + "grad_norm": 1.2635843753814697, + "learning_rate": 1.7496448399846757e-05, + "loss": 1.047, + "step": 1110 + }, + { + "epoch": 0.25437893531768746, + "grad_norm": 1.217361330986023, + "learning_rate": 1.7491536697306828e-05, + "loss": 1.1323, + "step": 1111 + }, + { + "epoch": 0.2546078992558672, + "grad_norm": 1.3858387470245361, + "learning_rate": 1.7486620872237064e-05, + "loss": 1.1033, + "step": 1112 + }, + { + "epoch": 0.2548368631940469, + "grad_norm": 1.247519612312317, + "learning_rate": 1.7481700927342603e-05, + "loss": 1.1039, + "step": 1113 + }, + { + "epoch": 0.2550658271322267, + "grad_norm": 1.3646209239959717, + "learning_rate": 1.7476776865330847e-05, + "loss": 1.12, + "step": 1114 + }, + { + "epoch": 0.2552947910704064, + "grad_norm": 1.1417921781539917, + "learning_rate": 1.7471848688911465e-05, + "loss": 1.1152, + "step": 1115 + }, + { + "epoch": 0.25552375500858615, + "grad_norm": 2.9017693996429443, + "learning_rate": 1.7466916400796398e-05, + "loss": 1.117, + "step": 1116 + }, + { + "epoch": 0.2557527189467659, + "grad_norm": 1.4879010915756226, + "learning_rate": 1.7461980003699835e-05, + "loss": 1.0987, + "step": 1117 + }, + { + "epoch": 0.2559816828849456, + "grad_norm": 1.6178057193756104, + "learning_rate": 1.7457039500338238e-05, + "loss": 1.1345, + "step": 1118 + }, + { + "epoch": 0.25621064682312533, + "grad_norm": 1.2162644863128662, + "learning_rate": 1.7452094893430324e-05, + "loss": 1.1063, + "step": 1119 + }, + { + "epoch": 0.2564396107613051, + "grad_norm": 1.2172150611877441, + "learning_rate": 1.744714618569707e-05, + "loss": 1.1393, + "step": 1120 + }, + { + "epoch": 0.25666857469948484, + "grad_norm": 1.195317029953003, + "learning_rate": 1.74421933798617e-05, + "loss": 1.1355, + "step": 1121 + }, + { + "epoch": 0.25689753863766457, + "grad_norm": 1.0217382907867432, + "learning_rate": 1.7437236478649718e-05, + "loss": 1.1258, + "step": 1122 + }, + { + "epoch": 0.2571265025758443, + "grad_norm": 1.2835495471954346, + "learning_rate": 1.7432275484788852e-05, + "loss": 1.1104, + "step": 1123 + }, + { + "epoch": 0.257355466514024, + "grad_norm": 1.0786269903182983, + "learning_rate": 1.74273104010091e-05, + "loss": 1.066, + "step": 1124 + }, + { + "epoch": 0.2575844304522038, + "grad_norm": 1.1824384927749634, + "learning_rate": 1.74223412300427e-05, + "loss": 1.0641, + "step": 1125 + }, + { + "epoch": 0.25781339439038353, + "grad_norm": 1.0312379598617554, + "learning_rate": 1.7417367974624153e-05, + "loss": 1.0843, + "step": 1126 + }, + { + "epoch": 0.25804235832856326, + "grad_norm": 1.2044585943222046, + "learning_rate": 1.74123906374902e-05, + "loss": 1.0899, + "step": 1127 + }, + { + "epoch": 0.258271322266743, + "grad_norm": 1.1857991218566895, + "learning_rate": 1.740740922137982e-05, + "loss": 1.1682, + "step": 1128 + }, + { + "epoch": 0.2585002862049227, + "grad_norm": 1.4564883708953857, + "learning_rate": 1.7402423729034252e-05, + "loss": 1.1455, + "step": 1129 + }, + { + "epoch": 0.25872925014310244, + "grad_norm": 1.2754067182540894, + "learning_rate": 1.7397434163196967e-05, + "loss": 1.1081, + "step": 1130 + }, + { + "epoch": 0.2589582140812822, + "grad_norm": 1.149965763092041, + "learning_rate": 1.7392440526613684e-05, + "loss": 1.1089, + "step": 1131 + }, + { + "epoch": 0.25918717801946195, + "grad_norm": 1.430191993713379, + "learning_rate": 1.7387442822032354e-05, + "loss": 1.1204, + "step": 1132 + }, + { + "epoch": 0.2594161419576417, + "grad_norm": 1.3282990455627441, + "learning_rate": 1.738244105220318e-05, + "loss": 1.1386, + "step": 1133 + }, + { + "epoch": 0.2596451058958214, + "grad_norm": 1.1032613515853882, + "learning_rate": 1.7377435219878586e-05, + "loss": 1.0563, + "step": 1134 + }, + { + "epoch": 0.25987406983400113, + "grad_norm": 1.0498651266098022, + "learning_rate": 1.737242532781324e-05, + "loss": 1.104, + "step": 1135 + }, + { + "epoch": 0.26010303377218086, + "grad_norm": 1.2936218976974487, + "learning_rate": 1.736741137876405e-05, + "loss": 1.043, + "step": 1136 + }, + { + "epoch": 0.26033199771036064, + "grad_norm": 1.025429129600525, + "learning_rate": 1.736239337549015e-05, + "loss": 1.1479, + "step": 1137 + }, + { + "epoch": 0.26056096164854037, + "grad_norm": 1.1074275970458984, + "learning_rate": 1.7357371320752896e-05, + "loss": 1.0875, + "step": 1138 + }, + { + "epoch": 0.2607899255867201, + "grad_norm": 1.2737194299697876, + "learning_rate": 1.735234521731589e-05, + "loss": 1.1193, + "step": 1139 + }, + { + "epoch": 0.2610188895248998, + "grad_norm": 2.4709835052490234, + "learning_rate": 1.7347315067944955e-05, + "loss": 1.1418, + "step": 1140 + }, + { + "epoch": 0.26124785346307955, + "grad_norm": 1.4513455629348755, + "learning_rate": 1.7342280875408138e-05, + "loss": 1.1594, + "step": 1141 + }, + { + "epoch": 0.2614768174012593, + "grad_norm": 1.1992151737213135, + "learning_rate": 1.7337242642475712e-05, + "loss": 1.0609, + "step": 1142 + }, + { + "epoch": 0.26170578133943906, + "grad_norm": 1.7710577249526978, + "learning_rate": 1.7332200371920173e-05, + "loss": 1.1702, + "step": 1143 + }, + { + "epoch": 0.2619347452776188, + "grad_norm": 1.3110970258712769, + "learning_rate": 1.7327154066516244e-05, + "loss": 1.0921, + "step": 1144 + }, + { + "epoch": 0.2621637092157985, + "grad_norm": 1.0681921243667603, + "learning_rate": 1.7322103729040868e-05, + "loss": 1.038, + "step": 1145 + }, + { + "epoch": 0.26239267315397824, + "grad_norm": 1.1642048358917236, + "learning_rate": 1.731704936227319e-05, + "loss": 1.071, + "step": 1146 + }, + { + "epoch": 0.26262163709215797, + "grad_norm": 1.287327766418457, + "learning_rate": 1.7311990968994598e-05, + "loss": 1.102, + "step": 1147 + }, + { + "epoch": 0.26285060103033775, + "grad_norm": 1.2213733196258545, + "learning_rate": 1.7306928551988683e-05, + "loss": 1.1354, + "step": 1148 + }, + { + "epoch": 0.2630795649685175, + "grad_norm": 1.3054442405700684, + "learning_rate": 1.7301862114041244e-05, + "loss": 1.1189, + "step": 1149 + }, + { + "epoch": 0.2633085289066972, + "grad_norm": 1.8000365495681763, + "learning_rate": 1.72967916579403e-05, + "loss": 1.105, + "step": 1150 + }, + { + "epoch": 0.26353749284487693, + "grad_norm": 1.155625343322754, + "learning_rate": 1.7291717186476088e-05, + "loss": 1.1213, + "step": 1151 + }, + { + "epoch": 0.26376645678305666, + "grad_norm": 1.052958607673645, + "learning_rate": 1.7286638702441037e-05, + "loss": 1.1383, + "step": 1152 + }, + { + "epoch": 0.2639954207212364, + "grad_norm": 1.0741239786148071, + "learning_rate": 1.7281556208629802e-05, + "loss": 1.1085, + "step": 1153 + }, + { + "epoch": 0.26422438465941617, + "grad_norm": 1.1770488023757935, + "learning_rate": 1.7276469707839235e-05, + "loss": 1.0881, + "step": 1154 + }, + { + "epoch": 0.2644533485975959, + "grad_norm": 1.1173039674758911, + "learning_rate": 1.7271379202868394e-05, + "loss": 1.162, + "step": 1155 + }, + { + "epoch": 0.2646823125357756, + "grad_norm": 1.1196292638778687, + "learning_rate": 1.7266284696518537e-05, + "loss": 1.1199, + "step": 1156 + }, + { + "epoch": 0.26491127647395535, + "grad_norm": 1.097383975982666, + "learning_rate": 1.7261186191593135e-05, + "loss": 1.1347, + "step": 1157 + }, + { + "epoch": 0.2651402404121351, + "grad_norm": 2.0568525791168213, + "learning_rate": 1.725608369089785e-05, + "loss": 1.0951, + "step": 1158 + }, + { + "epoch": 0.2653692043503148, + "grad_norm": 1.1759741306304932, + "learning_rate": 1.7250977197240545e-05, + "loss": 1.1429, + "step": 1159 + }, + { + "epoch": 0.2655981682884946, + "grad_norm": 1.1705988645553589, + "learning_rate": 1.7245866713431278e-05, + "loss": 1.0861, + "step": 1160 + }, + { + "epoch": 0.2658271322266743, + "grad_norm": 1.197718620300293, + "learning_rate": 1.724075224228231e-05, + "loss": 1.1353, + "step": 1161 + }, + { + "epoch": 0.26605609616485404, + "grad_norm": 1.0807304382324219, + "learning_rate": 1.7235633786608092e-05, + "loss": 1.1407, + "step": 1162 + }, + { + "epoch": 0.26628506010303377, + "grad_norm": 1.2779772281646729, + "learning_rate": 1.723051134922526e-05, + "loss": 1.1031, + "step": 1163 + }, + { + "epoch": 0.2665140240412135, + "grad_norm": 1.2724177837371826, + "learning_rate": 1.7225384932952655e-05, + "loss": 1.0757, + "step": 1164 + }, + { + "epoch": 0.2667429879793932, + "grad_norm": 1.0801130533218384, + "learning_rate": 1.72202545406113e-05, + "loss": 1.1328, + "step": 1165 + }, + { + "epoch": 0.266971951917573, + "grad_norm": 1.0200031995773315, + "learning_rate": 1.7215120175024405e-05, + "loss": 1.0876, + "step": 1166 + }, + { + "epoch": 0.26720091585575273, + "grad_norm": 1.1842114925384521, + "learning_rate": 1.720998183901737e-05, + "loss": 1.1401, + "step": 1167 + }, + { + "epoch": 0.26742987979393246, + "grad_norm": 1.0764400959014893, + "learning_rate": 1.7204839535417778e-05, + "loss": 1.046, + "step": 1168 + }, + { + "epoch": 0.2676588437321122, + "grad_norm": 1.2766287326812744, + "learning_rate": 1.7199693267055392e-05, + "loss": 1.0931, + "step": 1169 + }, + { + "epoch": 0.2678878076702919, + "grad_norm": 1.282101035118103, + "learning_rate": 1.7194543036762165e-05, + "loss": 1.0584, + "step": 1170 + }, + { + "epoch": 0.2681167716084717, + "grad_norm": 1.2437055110931396, + "learning_rate": 1.7189388847372227e-05, + "loss": 1.1001, + "step": 1171 + }, + { + "epoch": 0.2683457355466514, + "grad_norm": 1.0838711261749268, + "learning_rate": 1.7184230701721876e-05, + "loss": 1.0871, + "step": 1172 + }, + { + "epoch": 0.26857469948483115, + "grad_norm": 1.4530155658721924, + "learning_rate": 1.717906860264961e-05, + "loss": 1.1225, + "step": 1173 + }, + { + "epoch": 0.2688036634230109, + "grad_norm": 1.1978572607040405, + "learning_rate": 1.7173902552996075e-05, + "loss": 1.1464, + "step": 1174 + }, + { + "epoch": 0.2690326273611906, + "grad_norm": 1.1811645030975342, + "learning_rate": 1.7168732555604114e-05, + "loss": 1.1627, + "step": 1175 + }, + { + "epoch": 0.26926159129937033, + "grad_norm": 1.530895709991455, + "learning_rate": 1.716355861331873e-05, + "loss": 1.0715, + "step": 1176 + }, + { + "epoch": 0.2694905552375501, + "grad_norm": 1.2078356742858887, + "learning_rate": 1.7158380728987102e-05, + "loss": 1.1287, + "step": 1177 + }, + { + "epoch": 0.26971951917572984, + "grad_norm": 1.318663477897644, + "learning_rate": 1.715319890545857e-05, + "loss": 1.1881, + "step": 1178 + }, + { + "epoch": 0.26994848311390957, + "grad_norm": 1.174576759338379, + "learning_rate": 1.7148013145584657e-05, + "loss": 1.1113, + "step": 1179 + }, + { + "epoch": 0.2701774470520893, + "grad_norm": 1.2301723957061768, + "learning_rate": 1.7142823452219036e-05, + "loss": 1.1051, + "step": 1180 + }, + { + "epoch": 0.270406410990269, + "grad_norm": 1.097507119178772, + "learning_rate": 1.7137629828217556e-05, + "loss": 1.0563, + "step": 1181 + }, + { + "epoch": 0.27063537492844875, + "grad_norm": 1.100091814994812, + "learning_rate": 1.7132432276438228e-05, + "loss": 1.0676, + "step": 1182 + }, + { + "epoch": 0.27086433886662853, + "grad_norm": 1.1232244968414307, + "learning_rate": 1.712723079974121e-05, + "loss": 1.1441, + "step": 1183 + }, + { + "epoch": 0.27109330280480826, + "grad_norm": 1.3553615808486938, + "learning_rate": 1.712202540098884e-05, + "loss": 1.1112, + "step": 1184 + }, + { + "epoch": 0.271322266742988, + "grad_norm": 1.2029356956481934, + "learning_rate": 1.7116816083045603e-05, + "loss": 1.1263, + "step": 1185 + }, + { + "epoch": 0.2715512306811677, + "grad_norm": 1.465103268623352, + "learning_rate": 1.7111602848778143e-05, + "loss": 1.1094, + "step": 1186 + }, + { + "epoch": 0.27178019461934744, + "grad_norm": 1.3049366474151611, + "learning_rate": 1.710638570105526e-05, + "loss": 1.1015, + "step": 1187 + }, + { + "epoch": 0.27200915855752716, + "grad_norm": 1.2271827459335327, + "learning_rate": 1.7101164642747906e-05, + "loss": 1.0745, + "step": 1188 + }, + { + "epoch": 0.27223812249570695, + "grad_norm": 1.0381929874420166, + "learning_rate": 1.7095939676729184e-05, + "loss": 1.1369, + "step": 1189 + }, + { + "epoch": 0.2724670864338867, + "grad_norm": 1.3117164373397827, + "learning_rate": 1.709071080587435e-05, + "loss": 1.1253, + "step": 1190 + }, + { + "epoch": 0.2726960503720664, + "grad_norm": 1.409812092781067, + "learning_rate": 1.7085478033060808e-05, + "loss": 1.1859, + "step": 1191 + }, + { + "epoch": 0.27292501431024613, + "grad_norm": 1.3818203210830688, + "learning_rate": 1.7080241361168108e-05, + "loss": 1.0365, + "step": 1192 + }, + { + "epoch": 0.27315397824842585, + "grad_norm": 1.1283059120178223, + "learning_rate": 1.707500079307795e-05, + "loss": 1.0971, + "step": 1193 + }, + { + "epoch": 0.2733829421866056, + "grad_norm": 1.1828192472457886, + "learning_rate": 1.7069756331674172e-05, + "loss": 1.1046, + "step": 1194 + }, + { + "epoch": 0.27361190612478536, + "grad_norm": 1.287385106086731, + "learning_rate": 1.7064507979842755e-05, + "loss": 1.0766, + "step": 1195 + }, + { + "epoch": 0.2738408700629651, + "grad_norm": 1.1248568296432495, + "learning_rate": 1.705925574047183e-05, + "loss": 1.068, + "step": 1196 + }, + { + "epoch": 0.2740698340011448, + "grad_norm": 1.151267647743225, + "learning_rate": 1.7053999616451653e-05, + "loss": 1.0703, + "step": 1197 + }, + { + "epoch": 0.27429879793932455, + "grad_norm": 1.1329370737075806, + "learning_rate": 1.7048739610674626e-05, + "loss": 1.0707, + "step": 1198 + }, + { + "epoch": 0.2745277618775043, + "grad_norm": 1.0887163877487183, + "learning_rate": 1.704347572603529e-05, + "loss": 1.1708, + "step": 1199 + }, + { + "epoch": 0.27475672581568406, + "grad_norm": 1.1625685691833496, + "learning_rate": 1.7038207965430316e-05, + "loss": 1.0947, + "step": 1200 + }, + { + "epoch": 0.2749856897538638, + "grad_norm": 1.0462048053741455, + "learning_rate": 1.7032936331758505e-05, + "loss": 1.0896, + "step": 1201 + }, + { + "epoch": 0.2752146536920435, + "grad_norm": 1.286412000656128, + "learning_rate": 1.7027660827920798e-05, + "loss": 1.0962, + "step": 1202 + }, + { + "epoch": 0.27544361763022324, + "grad_norm": 1.4663245677947998, + "learning_rate": 1.702238145682025e-05, + "loss": 1.1375, + "step": 1203 + }, + { + "epoch": 0.27567258156840296, + "grad_norm": 1.2858197689056396, + "learning_rate": 1.701709822136207e-05, + "loss": 1.1469, + "step": 1204 + }, + { + "epoch": 0.2759015455065827, + "grad_norm": 1.4555660486221313, + "learning_rate": 1.7011811124453567e-05, + "loss": 1.096, + "step": 1205 + }, + { + "epoch": 0.2761305094447625, + "grad_norm": 1.211330771446228, + "learning_rate": 1.700652016900419e-05, + "loss": 1.0777, + "step": 1206 + }, + { + "epoch": 0.2763594733829422, + "grad_norm": 1.0660839080810547, + "learning_rate": 1.7001225357925506e-05, + "loss": 1.0831, + "step": 1207 + }, + { + "epoch": 0.2765884373211219, + "grad_norm": 1.1060619354248047, + "learning_rate": 1.6995926694131207e-05, + "loss": 1.097, + "step": 1208 + }, + { + "epoch": 0.27681740125930165, + "grad_norm": 1.301016926765442, + "learning_rate": 1.6990624180537102e-05, + "loss": 1.1579, + "step": 1209 + }, + { + "epoch": 0.2770463651974814, + "grad_norm": 1.3062912225723267, + "learning_rate": 1.6985317820061116e-05, + "loss": 1.0382, + "step": 1210 + }, + { + "epoch": 0.2772753291356611, + "grad_norm": 1.3300676345825195, + "learning_rate": 1.69800076156233e-05, + "loss": 1.2066, + "step": 1211 + }, + { + "epoch": 0.2775042930738409, + "grad_norm": 1.5877501964569092, + "learning_rate": 1.6974693570145818e-05, + "loss": 1.0696, + "step": 1212 + }, + { + "epoch": 0.2777332570120206, + "grad_norm": 1.235116720199585, + "learning_rate": 1.696937568655294e-05, + "loss": 1.121, + "step": 1213 + }, + { + "epoch": 0.27796222095020034, + "grad_norm": 1.2792831659317017, + "learning_rate": 1.6964053967771047e-05, + "loss": 1.1357, + "step": 1214 + }, + { + "epoch": 0.27819118488838007, + "grad_norm": 1.6012238264083862, + "learning_rate": 1.6958728416728644e-05, + "loss": 1.0489, + "step": 1215 + }, + { + "epoch": 0.2784201488265598, + "grad_norm": 1.0891828536987305, + "learning_rate": 1.6953399036356338e-05, + "loss": 1.0707, + "step": 1216 + }, + { + "epoch": 0.2786491127647395, + "grad_norm": 1.0039647817611694, + "learning_rate": 1.6948065829586835e-05, + "loss": 1.1229, + "step": 1217 + }, + { + "epoch": 0.2788780767029193, + "grad_norm": 1.0819059610366821, + "learning_rate": 1.6942728799354963e-05, + "loss": 1.1374, + "step": 1218 + }, + { + "epoch": 0.27910704064109904, + "grad_norm": 1.178362488746643, + "learning_rate": 1.693738794859764e-05, + "loss": 1.0953, + "step": 1219 + }, + { + "epoch": 0.27933600457927876, + "grad_norm": 1.132528305053711, + "learning_rate": 1.6932043280253892e-05, + "loss": 1.1012, + "step": 1220 + }, + { + "epoch": 0.2795649685174585, + "grad_norm": 1.2141227722167969, + "learning_rate": 1.6926694797264844e-05, + "loss": 1.1225, + "step": 1221 + }, + { + "epoch": 0.2797939324556382, + "grad_norm": 1.3470580577850342, + "learning_rate": 1.6921342502573723e-05, + "loss": 1.1037, + "step": 1222 + }, + { + "epoch": 0.280022896393818, + "grad_norm": 1.2042268514633179, + "learning_rate": 1.6915986399125852e-05, + "loss": 1.0931, + "step": 1223 + }, + { + "epoch": 0.2802518603319977, + "grad_norm": 1.567338228225708, + "learning_rate": 1.691062648986865e-05, + "loss": 1.0394, + "step": 1224 + }, + { + "epoch": 0.28048082427017745, + "grad_norm": 0.9561851024627686, + "learning_rate": 1.6905262777751627e-05, + "loss": 1.01, + "step": 1225 + }, + { + "epoch": 0.2807097882083572, + "grad_norm": 1.5181394815444946, + "learning_rate": 1.6899895265726392e-05, + "loss": 1.1086, + "step": 1226 + }, + { + "epoch": 0.2809387521465369, + "grad_norm": 1.1278475522994995, + "learning_rate": 1.689452395674664e-05, + "loss": 1.1136, + "step": 1227 + }, + { + "epoch": 0.28116771608471663, + "grad_norm": 1.375020146369934, + "learning_rate": 1.688914885376816e-05, + "loss": 1.1157, + "step": 1228 + }, + { + "epoch": 0.2813966800228964, + "grad_norm": 1.1723235845565796, + "learning_rate": 1.6883769959748822e-05, + "loss": 1.1582, + "step": 1229 + }, + { + "epoch": 0.28162564396107614, + "grad_norm": 1.5612130165100098, + "learning_rate": 1.6878387277648593e-05, + "loss": 1.0827, + "step": 1230 + }, + { + "epoch": 0.28185460789925587, + "grad_norm": 1.238914966583252, + "learning_rate": 1.6873000810429512e-05, + "loss": 1.0996, + "step": 1231 + }, + { + "epoch": 0.2820835718374356, + "grad_norm": 1.1172726154327393, + "learning_rate": 1.6867610561055707e-05, + "loss": 1.1171, + "step": 1232 + }, + { + "epoch": 0.2823125357756153, + "grad_norm": 1.265048861503601, + "learning_rate": 1.686221653249339e-05, + "loss": 1.1057, + "step": 1233 + }, + { + "epoch": 0.28254149971379505, + "grad_norm": 1.3867113590240479, + "learning_rate": 1.6856818727710847e-05, + "loss": 1.1072, + "step": 1234 + }, + { + "epoch": 0.28277046365197483, + "grad_norm": 1.2063997983932495, + "learning_rate": 1.6851417149678442e-05, + "loss": 1.1, + "step": 1235 + }, + { + "epoch": 0.28299942759015456, + "grad_norm": 1.0993294715881348, + "learning_rate": 1.6846011801368626e-05, + "loss": 1.1188, + "step": 1236 + }, + { + "epoch": 0.2832283915283343, + "grad_norm": 0.9886915683746338, + "learning_rate": 1.6840602685755914e-05, + "loss": 1.0935, + "step": 1237 + }, + { + "epoch": 0.283457355466514, + "grad_norm": 1.2733272314071655, + "learning_rate": 1.6835189805816894e-05, + "loss": 1.0806, + "step": 1238 + }, + { + "epoch": 0.28368631940469374, + "grad_norm": 1.0870503187179565, + "learning_rate": 1.6829773164530226e-05, + "loss": 1.1355, + "step": 1239 + }, + { + "epoch": 0.28391528334287347, + "grad_norm": 1.102055311203003, + "learning_rate": 1.6824352764876653e-05, + "loss": 1.1119, + "step": 1240 + }, + { + "epoch": 0.28414424728105325, + "grad_norm": 1.2194703817367554, + "learning_rate": 1.6818928609838967e-05, + "loss": 1.0878, + "step": 1241 + }, + { + "epoch": 0.284373211219233, + "grad_norm": 1.2381644248962402, + "learning_rate": 1.681350070240204e-05, + "loss": 1.0619, + "step": 1242 + }, + { + "epoch": 0.2846021751574127, + "grad_norm": 1.302049994468689, + "learning_rate": 1.6808069045552793e-05, + "loss": 1.117, + "step": 1243 + }, + { + "epoch": 0.28483113909559243, + "grad_norm": 1.0947741270065308, + "learning_rate": 1.6802633642280233e-05, + "loss": 1.0865, + "step": 1244 + }, + { + "epoch": 0.28506010303377216, + "grad_norm": 1.4091873168945312, + "learning_rate": 1.6797194495575412e-05, + "loss": 1.1398, + "step": 1245 + }, + { + "epoch": 0.28528906697195194, + "grad_norm": 1.2080914974212646, + "learning_rate": 1.679175160843145e-05, + "loss": 1.0264, + "step": 1246 + }, + { + "epoch": 0.28551803091013167, + "grad_norm": 1.3599348068237305, + "learning_rate": 1.6786304983843517e-05, + "loss": 1.1261, + "step": 1247 + }, + { + "epoch": 0.2857469948483114, + "grad_norm": 1.1633814573287964, + "learning_rate": 1.678085462480885e-05, + "loss": 1.075, + "step": 1248 + }, + { + "epoch": 0.2859759587864911, + "grad_norm": 1.1775692701339722, + "learning_rate": 1.6775400534326735e-05, + "loss": 1.0378, + "step": 1249 + }, + { + "epoch": 0.28620492272467085, + "grad_norm": 1.3439725637435913, + "learning_rate": 1.676994271539851e-05, + "loss": 1.0889, + "step": 1250 + }, + { + "epoch": 0.2864338866628506, + "grad_norm": 1.164843201637268, + "learning_rate": 1.6764481171027566e-05, + "loss": 1.2002, + "step": 1251 + }, + { + "epoch": 0.28666285060103036, + "grad_norm": 1.5093419551849365, + "learning_rate": 1.6759015904219348e-05, + "loss": 1.0292, + "step": 1252 + }, + { + "epoch": 0.2868918145392101, + "grad_norm": 1.3763619661331177, + "learning_rate": 1.6753546917981346e-05, + "loss": 1.0907, + "step": 1253 + }, + { + "epoch": 0.2871207784773898, + "grad_norm": 1.3228497505187988, + "learning_rate": 1.6748074215323096e-05, + "loss": 1.1251, + "step": 1254 + }, + { + "epoch": 0.28734974241556954, + "grad_norm": 1.9618299007415771, + "learning_rate": 1.6742597799256182e-05, + "loss": 1.0923, + "step": 1255 + }, + { + "epoch": 0.28757870635374927, + "grad_norm": 1.724094033241272, + "learning_rate": 1.673711767279423e-05, + "loss": 1.0645, + "step": 1256 + }, + { + "epoch": 0.287807670291929, + "grad_norm": 1.1787453889846802, + "learning_rate": 1.6731633838952905e-05, + "loss": 1.0869, + "step": 1257 + }, + { + "epoch": 0.2880366342301088, + "grad_norm": 1.4934698343276978, + "learning_rate": 1.672614630074992e-05, + "loss": 1.0895, + "step": 1258 + }, + { + "epoch": 0.2882655981682885, + "grad_norm": 1.7817822694778442, + "learning_rate": 1.6720655061205013e-05, + "loss": 1.0477, + "step": 1259 + }, + { + "epoch": 0.28849456210646823, + "grad_norm": 1.1007169485092163, + "learning_rate": 1.671516012333997e-05, + "loss": 1.1114, + "step": 1260 + }, + { + "epoch": 0.28872352604464796, + "grad_norm": 1.1942838430404663, + "learning_rate": 1.6709661490178617e-05, + "loss": 1.1364, + "step": 1261 + }, + { + "epoch": 0.2889524899828277, + "grad_norm": 1.078680157661438, + "learning_rate": 1.6704159164746797e-05, + "loss": 1.1666, + "step": 1262 + }, + { + "epoch": 0.2891814539210074, + "grad_norm": 3.27377986907959, + "learning_rate": 1.6698653150072396e-05, + "loss": 1.1181, + "step": 1263 + }, + { + "epoch": 0.2894104178591872, + "grad_norm": 1.0961107015609741, + "learning_rate": 1.6693143449185328e-05, + "loss": 1.1408, + "step": 1264 + }, + { + "epoch": 0.2896393817973669, + "grad_norm": 1.6030396223068237, + "learning_rate": 1.6687630065117536e-05, + "loss": 1.1411, + "step": 1265 + }, + { + "epoch": 0.28986834573554665, + "grad_norm": 1.1883594989776611, + "learning_rate": 1.6682113000902988e-05, + "loss": 1.1251, + "step": 1266 + }, + { + "epoch": 0.2900973096737264, + "grad_norm": 1.2973169088363647, + "learning_rate": 1.667659225957768e-05, + "loss": 1.0265, + "step": 1267 + }, + { + "epoch": 0.2903262736119061, + "grad_norm": 1.2373725175857544, + "learning_rate": 1.6671067844179625e-05, + "loss": 1.0575, + "step": 1268 + }, + { + "epoch": 0.2905552375500859, + "grad_norm": 1.0662227869033813, + "learning_rate": 1.6665539757748866e-05, + "loss": 1.1003, + "step": 1269 + }, + { + "epoch": 0.2907842014882656, + "grad_norm": 1.0769942998886108, + "learning_rate": 1.666000800332747e-05, + "loss": 1.0941, + "step": 1270 + }, + { + "epoch": 0.29101316542644534, + "grad_norm": 1.3059958219528198, + "learning_rate": 1.6654472583959497e-05, + "loss": 1.0998, + "step": 1271 + }, + { + "epoch": 0.29124212936462507, + "grad_norm": 1.2797632217407227, + "learning_rate": 1.664893350269106e-05, + "loss": 1.1016, + "step": 1272 + }, + { + "epoch": 0.2914710933028048, + "grad_norm": 1.2182438373565674, + "learning_rate": 1.6643390762570254e-05, + "loss": 1.1192, + "step": 1273 + }, + { + "epoch": 0.2917000572409845, + "grad_norm": 1.4072870016098022, + "learning_rate": 1.6637844366647216e-05, + "loss": 1.1242, + "step": 1274 + }, + { + "epoch": 0.2919290211791643, + "grad_norm": 1.3435471057891846, + "learning_rate": 1.663229431797407e-05, + "loss": 1.1807, + "step": 1275 + }, + { + "epoch": 0.29215798511734403, + "grad_norm": 1.090842604637146, + "learning_rate": 1.6626740619604967e-05, + "loss": 1.1318, + "step": 1276 + }, + { + "epoch": 0.29238694905552376, + "grad_norm": 1.192291021347046, + "learning_rate": 1.6621183274596063e-05, + "loss": 1.09, + "step": 1277 + }, + { + "epoch": 0.2926159129937035, + "grad_norm": 1.1859331130981445, + "learning_rate": 1.661562228600551e-05, + "loss": 0.9964, + "step": 1278 + }, + { + "epoch": 0.2928448769318832, + "grad_norm": 1.146068811416626, + "learning_rate": 1.6610057656893483e-05, + "loss": 1.1096, + "step": 1279 + }, + { + "epoch": 0.29307384087006294, + "grad_norm": 1.1786830425262451, + "learning_rate": 1.660448939032214e-05, + "loss": 1.1826, + "step": 1280 + }, + { + "epoch": 0.2933028048082427, + "grad_norm": 1.2743366956710815, + "learning_rate": 1.6598917489355665e-05, + "loss": 1.0824, + "step": 1281 + }, + { + "epoch": 0.29353176874642245, + "grad_norm": 1.339068055152893, + "learning_rate": 1.6593341957060218e-05, + "loss": 1.1928, + "step": 1282 + }, + { + "epoch": 0.2937607326846022, + "grad_norm": 1.1370441913604736, + "learning_rate": 1.658776279650397e-05, + "loss": 1.1, + "step": 1283 + }, + { + "epoch": 0.2939896966227819, + "grad_norm": 1.2556294202804565, + "learning_rate": 1.6582180010757082e-05, + "loss": 1.0666, + "step": 1284 + }, + { + "epoch": 0.29421866056096163, + "grad_norm": 1.174279808998108, + "learning_rate": 1.6576593602891727e-05, + "loss": 1.0921, + "step": 1285 + }, + { + "epoch": 0.29444762449914136, + "grad_norm": 1.3296531438827515, + "learning_rate": 1.6571003575982048e-05, + "loss": 1.1534, + "step": 1286 + }, + { + "epoch": 0.29467658843732114, + "grad_norm": 1.1834275722503662, + "learning_rate": 1.656540993310419e-05, + "loss": 1.1352, + "step": 1287 + }, + { + "epoch": 0.29490555237550087, + "grad_norm": 1.2583208084106445, + "learning_rate": 1.6559812677336293e-05, + "loss": 1.1401, + "step": 1288 + }, + { + "epoch": 0.2951345163136806, + "grad_norm": 1.1927133798599243, + "learning_rate": 1.655421181175848e-05, + "loss": 1.1334, + "step": 1289 + }, + { + "epoch": 0.2953634802518603, + "grad_norm": 1.2883397340774536, + "learning_rate": 1.6548607339452853e-05, + "loss": 1.1677, + "step": 1290 + }, + { + "epoch": 0.29559244419004005, + "grad_norm": 1.8883707523345947, + "learning_rate": 1.654299926350351e-05, + "loss": 1.0453, + "step": 1291 + }, + { + "epoch": 0.29582140812821983, + "grad_norm": 1.2389609813690186, + "learning_rate": 1.6537387586996532e-05, + "loss": 1.1274, + "step": 1292 + }, + { + "epoch": 0.29605037206639956, + "grad_norm": 0.9801892638206482, + "learning_rate": 1.6531772313019972e-05, + "loss": 1.144, + "step": 1293 + }, + { + "epoch": 0.2962793360045793, + "grad_norm": 1.00913667678833, + "learning_rate": 1.6526153444663873e-05, + "loss": 1.087, + "step": 1294 + }, + { + "epoch": 0.296508299942759, + "grad_norm": 1.0659477710723877, + "learning_rate": 1.6520530985020243e-05, + "loss": 1.0223, + "step": 1295 + }, + { + "epoch": 0.29673726388093874, + "grad_norm": 1.5433529615402222, + "learning_rate": 1.651490493718309e-05, + "loss": 1.1148, + "step": 1296 + }, + { + "epoch": 0.29696622781911847, + "grad_norm": 1.1053426265716553, + "learning_rate": 1.6509275304248366e-05, + "loss": 1.128, + "step": 1297 + }, + { + "epoch": 0.29719519175729825, + "grad_norm": 1.0809921026229858, + "learning_rate": 1.650364208931401e-05, + "loss": 1.0149, + "step": 1298 + }, + { + "epoch": 0.297424155695478, + "grad_norm": 1.0818578004837036, + "learning_rate": 1.6498005295479946e-05, + "loss": 1.0806, + "step": 1299 + }, + { + "epoch": 0.2976531196336577, + "grad_norm": 2.2473480701446533, + "learning_rate": 1.6492364925848045e-05, + "loss": 1.1062, + "step": 1300 + }, + { + "epoch": 0.29788208357183743, + "grad_norm": 1.107351541519165, + "learning_rate": 1.6486720983522156e-05, + "loss": 1.1262, + "step": 1301 + }, + { + "epoch": 0.29811104751001716, + "grad_norm": 1.3999285697937012, + "learning_rate": 1.6481073471608097e-05, + "loss": 1.1821, + "step": 1302 + }, + { + "epoch": 0.2983400114481969, + "grad_norm": 1.3839341402053833, + "learning_rate": 1.647542239321364e-05, + "loss": 1.0979, + "step": 1303 + }, + { + "epoch": 0.29856897538637667, + "grad_norm": 1.4304099082946777, + "learning_rate": 1.6469767751448538e-05, + "loss": 1.1045, + "step": 1304 + }, + { + "epoch": 0.2987979393245564, + "grad_norm": 1.2668440341949463, + "learning_rate": 1.6464109549424477e-05, + "loss": 1.1298, + "step": 1305 + }, + { + "epoch": 0.2990269032627361, + "grad_norm": 1.2562881708145142, + "learning_rate": 1.645844779025513e-05, + "loss": 1.1475, + "step": 1306 + }, + { + "epoch": 0.29925586720091585, + "grad_norm": 1.1980136632919312, + "learning_rate": 1.6452782477056112e-05, + "loss": 1.1703, + "step": 1307 + }, + { + "epoch": 0.2994848311390956, + "grad_norm": 1.2106819152832031, + "learning_rate": 1.6447113612944993e-05, + "loss": 1.1087, + "step": 1308 + }, + { + "epoch": 0.2997137950772753, + "grad_norm": 1.3096649646759033, + "learning_rate": 1.6441441201041312e-05, + "loss": 1.0965, + "step": 1309 + }, + { + "epoch": 0.2999427590154551, + "grad_norm": 1.1836016178131104, + "learning_rate": 1.6435765244466536e-05, + "loss": 1.0473, + "step": 1310 + }, + { + "epoch": 0.3001717229536348, + "grad_norm": 1.1866120100021362, + "learning_rate": 1.6430085746344107e-05, + "loss": 1.1249, + "step": 1311 + }, + { + "epoch": 0.30040068689181454, + "grad_norm": 1.4427460432052612, + "learning_rate": 1.6424402709799404e-05, + "loss": 1.1176, + "step": 1312 + }, + { + "epoch": 0.30062965082999427, + "grad_norm": 1.251099705696106, + "learning_rate": 1.6418716137959746e-05, + "loss": 1.1228, + "step": 1313 + }, + { + "epoch": 0.300858614768174, + "grad_norm": 2.566626787185669, + "learning_rate": 1.6413026033954418e-05, + "loss": 1.1398, + "step": 1314 + }, + { + "epoch": 0.3010875787063538, + "grad_norm": 1.1216868162155151, + "learning_rate": 1.6407332400914625e-05, + "loss": 1.0773, + "step": 1315 + }, + { + "epoch": 0.3013165426445335, + "grad_norm": 1.1276382207870483, + "learning_rate": 1.6401635241973533e-05, + "loss": 1.0415, + "step": 1316 + }, + { + "epoch": 0.30154550658271323, + "grad_norm": 1.1274714469909668, + "learning_rate": 1.639593456026624e-05, + "loss": 1.1136, + "step": 1317 + }, + { + "epoch": 0.30177447052089296, + "grad_norm": 1.0719152688980103, + "learning_rate": 1.639023035892978e-05, + "loss": 1.1102, + "step": 1318 + }, + { + "epoch": 0.3020034344590727, + "grad_norm": 1.0603184700012207, + "learning_rate": 1.6384522641103133e-05, + "loss": 1.1015, + "step": 1319 + }, + { + "epoch": 0.3022323983972524, + "grad_norm": 1.0919266939163208, + "learning_rate": 1.63788114099272e-05, + "loss": 1.0915, + "step": 1320 + }, + { + "epoch": 0.3024613623354322, + "grad_norm": 1.3398226499557495, + "learning_rate": 1.637309666854483e-05, + "loss": 1.1068, + "step": 1321 + }, + { + "epoch": 0.3026903262736119, + "grad_norm": 1.094092845916748, + "learning_rate": 1.6367378420100798e-05, + "loss": 1.1188, + "step": 1322 + }, + { + "epoch": 0.30291929021179165, + "grad_norm": 1.0754516124725342, + "learning_rate": 1.63616566677418e-05, + "loss": 1.1741, + "step": 1323 + }, + { + "epoch": 0.3031482541499714, + "grad_norm": 1.155779242515564, + "learning_rate": 1.6355931414616477e-05, + "loss": 1.1085, + "step": 1324 + }, + { + "epoch": 0.3033772180881511, + "grad_norm": 1.186424970626831, + "learning_rate": 1.6350202663875385e-05, + "loss": 1.0362, + "step": 1325 + }, + { + "epoch": 0.30360618202633083, + "grad_norm": 1.1375372409820557, + "learning_rate": 1.634447041867101e-05, + "loss": 1.0349, + "step": 1326 + }, + { + "epoch": 0.3038351459645106, + "grad_norm": 1.346127986907959, + "learning_rate": 1.633873468215775e-05, + "loss": 1.0951, + "step": 1327 + }, + { + "epoch": 0.30406410990269034, + "grad_norm": 1.2048730850219727, + "learning_rate": 1.633299545749194e-05, + "loss": 1.0937, + "step": 1328 + }, + { + "epoch": 0.30429307384087007, + "grad_norm": 1.2353206872940063, + "learning_rate": 1.6327252747831824e-05, + "loss": 1.1092, + "step": 1329 + }, + { + "epoch": 0.3045220377790498, + "grad_norm": 1.1722041368484497, + "learning_rate": 1.6321506556337575e-05, + "loss": 1.0895, + "step": 1330 + }, + { + "epoch": 0.3047510017172295, + "grad_norm": 1.1982693672180176, + "learning_rate": 1.6315756886171264e-05, + "loss": 1.1342, + "step": 1331 + }, + { + "epoch": 0.30497996565540925, + "grad_norm": 1.1630750894546509, + "learning_rate": 1.6310003740496887e-05, + "loss": 1.1008, + "step": 1332 + }, + { + "epoch": 0.30520892959358903, + "grad_norm": 1.6868027448654175, + "learning_rate": 1.6304247122480355e-05, + "loss": 1.1756, + "step": 1333 + }, + { + "epoch": 0.30543789353176876, + "grad_norm": 1.2052215337753296, + "learning_rate": 1.629848703528949e-05, + "loss": 1.0759, + "step": 1334 + }, + { + "epoch": 0.3056668574699485, + "grad_norm": 1.4629309177398682, + "learning_rate": 1.6292723482094013e-05, + "loss": 1.1375, + "step": 1335 + }, + { + "epoch": 0.3058958214081282, + "grad_norm": 0.995369553565979, + "learning_rate": 1.6286956466065566e-05, + "loss": 1.1433, + "step": 1336 + }, + { + "epoch": 0.30612478534630794, + "grad_norm": 1.3381826877593994, + "learning_rate": 1.6281185990377683e-05, + "loss": 1.0782, + "step": 1337 + }, + { + "epoch": 0.3063537492844877, + "grad_norm": 1.22222101688385, + "learning_rate": 1.627541205820581e-05, + "loss": 1.0767, + "step": 1338 + }, + { + "epoch": 0.30658271322266745, + "grad_norm": 1.1784082651138306, + "learning_rate": 1.6269634672727296e-05, + "loss": 1.0653, + "step": 1339 + }, + { + "epoch": 0.3068116771608472, + "grad_norm": 1.214966058731079, + "learning_rate": 1.6263853837121384e-05, + "loss": 1.0808, + "step": 1340 + }, + { + "epoch": 0.3070406410990269, + "grad_norm": 1.5941448211669922, + "learning_rate": 1.6258069554569226e-05, + "loss": 1.1365, + "step": 1341 + }, + { + "epoch": 0.30726960503720663, + "grad_norm": 1.1932793855667114, + "learning_rate": 1.6252281828253856e-05, + "loss": 1.1333, + "step": 1342 + }, + { + "epoch": 0.30749856897538635, + "grad_norm": 1.285779356956482, + "learning_rate": 1.6246490661360215e-05, + "loss": 1.0826, + "step": 1343 + }, + { + "epoch": 0.30772753291356614, + "grad_norm": 1.9943314790725708, + "learning_rate": 1.6240696057075138e-05, + "loss": 1.0179, + "step": 1344 + }, + { + "epoch": 0.30795649685174586, + "grad_norm": 1.3297176361083984, + "learning_rate": 1.6234898018587336e-05, + "loss": 1.0836, + "step": 1345 + }, + { + "epoch": 0.3081854607899256, + "grad_norm": 1.1271708011627197, + "learning_rate": 1.6229096549087434e-05, + "loss": 1.0867, + "step": 1346 + }, + { + "epoch": 0.3084144247281053, + "grad_norm": 1.003398060798645, + "learning_rate": 1.6223291651767922e-05, + "loss": 1.119, + "step": 1347 + }, + { + "epoch": 0.30864338866628505, + "grad_norm": 1.4007704257965088, + "learning_rate": 1.621748332982319e-05, + "loss": 1.054, + "step": 1348 + }, + { + "epoch": 0.3088723526044648, + "grad_norm": 1.2629410028457642, + "learning_rate": 1.6211671586449512e-05, + "loss": 1.0702, + "step": 1349 + }, + { + "epoch": 0.30910131654264456, + "grad_norm": 1.1390570402145386, + "learning_rate": 1.6205856424845038e-05, + "loss": 1.0674, + "step": 1350 + }, + { + "epoch": 0.3093302804808243, + "grad_norm": 1.0701704025268555, + "learning_rate": 1.62000378482098e-05, + "loss": 1.1055, + "step": 1351 + }, + { + "epoch": 0.309559244419004, + "grad_norm": 1.0888673067092896, + "learning_rate": 1.619421585974572e-05, + "loss": 1.0241, + "step": 1352 + }, + { + "epoch": 0.30978820835718374, + "grad_norm": 1.1905760765075684, + "learning_rate": 1.618839046265658e-05, + "loss": 1.1075, + "step": 1353 + }, + { + "epoch": 0.31001717229536346, + "grad_norm": 1.1428126096725464, + "learning_rate": 1.6182561660148053e-05, + "loss": 1.1007, + "step": 1354 + }, + { + "epoch": 0.3102461362335432, + "grad_norm": 0.9909505844116211, + "learning_rate": 1.617672945542768e-05, + "loss": 1.0782, + "step": 1355 + }, + { + "epoch": 0.310475100171723, + "grad_norm": 1.0805736780166626, + "learning_rate": 1.6170893851704875e-05, + "loss": 1.1272, + "step": 1356 + }, + { + "epoch": 0.3107040641099027, + "grad_norm": 1.3021117448806763, + "learning_rate": 1.6165054852190917e-05, + "loss": 1.0889, + "step": 1357 + }, + { + "epoch": 0.3109330280480824, + "grad_norm": 1.0056633949279785, + "learning_rate": 1.6159212460098968e-05, + "loss": 1.0785, + "step": 1358 + }, + { + "epoch": 0.31116199198626215, + "grad_norm": 1.1618127822875977, + "learning_rate": 1.6153366678644035e-05, + "loss": 1.0712, + "step": 1359 + }, + { + "epoch": 0.3113909559244419, + "grad_norm": 1.7210270166397095, + "learning_rate": 1.614751751104301e-05, + "loss": 1.0849, + "step": 1360 + }, + { + "epoch": 0.31161991986262166, + "grad_norm": 1.262199878692627, + "learning_rate": 1.614166496051464e-05, + "loss": 1.1274, + "step": 1361 + }, + { + "epoch": 0.3118488838008014, + "grad_norm": 1.2134400606155396, + "learning_rate": 1.6135809030279534e-05, + "loss": 1.1211, + "step": 1362 + }, + { + "epoch": 0.3120778477389811, + "grad_norm": 1.4023921489715576, + "learning_rate": 1.6129949723560162e-05, + "loss": 1.1045, + "step": 1363 + }, + { + "epoch": 0.31230681167716084, + "grad_norm": 1.6166713237762451, + "learning_rate": 1.612408704358085e-05, + "loss": 1.0561, + "step": 1364 + }, + { + "epoch": 0.31253577561534057, + "grad_norm": 1.3405228853225708, + "learning_rate": 1.6118220993567783e-05, + "loss": 1.1804, + "step": 1365 + }, + { + "epoch": 0.3127647395535203, + "grad_norm": 1.1368193626403809, + "learning_rate": 1.6112351576748994e-05, + "loss": 1.1047, + "step": 1366 + }, + { + "epoch": 0.3129937034917001, + "grad_norm": 1.1940534114837646, + "learning_rate": 1.6106478796354382e-05, + "loss": 1.0889, + "step": 1367 + }, + { + "epoch": 0.3132226674298798, + "grad_norm": 1.7876060009002686, + "learning_rate": 1.6100602655615683e-05, + "loss": 1.1257, + "step": 1368 + }, + { + "epoch": 0.31345163136805954, + "grad_norm": 1.4505313634872437, + "learning_rate": 1.6094723157766493e-05, + "loss": 1.1566, + "step": 1369 + }, + { + "epoch": 0.31368059530623926, + "grad_norm": 1.2009062767028809, + "learning_rate": 1.6088840306042247e-05, + "loss": 1.1019, + "step": 1370 + }, + { + "epoch": 0.313909559244419, + "grad_norm": 1.2727826833724976, + "learning_rate": 1.608295410368023e-05, + "loss": 1.1033, + "step": 1371 + }, + { + "epoch": 0.3141385231825987, + "grad_norm": 1.1413002014160156, + "learning_rate": 1.6077064553919565e-05, + "loss": 1.0678, + "step": 1372 + }, + { + "epoch": 0.3143674871207785, + "grad_norm": 1.2150791883468628, + "learning_rate": 1.6071171660001232e-05, + "loss": 1.093, + "step": 1373 + }, + { + "epoch": 0.3145964510589582, + "grad_norm": 1.4832680225372314, + "learning_rate": 1.6065275425168034e-05, + "loss": 1.1531, + "step": 1374 + }, + { + "epoch": 0.31482541499713795, + "grad_norm": 1.3526698350906372, + "learning_rate": 1.605937585266462e-05, + "loss": 1.104, + "step": 1375 + }, + { + "epoch": 0.3150543789353177, + "grad_norm": 1.1165345907211304, + "learning_rate": 1.6053472945737474e-05, + "loss": 1.0644, + "step": 1376 + }, + { + "epoch": 0.3152833428734974, + "grad_norm": 1.1160399913787842, + "learning_rate": 1.6047566707634918e-05, + "loss": 1.1056, + "step": 1377 + }, + { + "epoch": 0.31551230681167713, + "grad_norm": 1.4708969593048096, + "learning_rate": 1.604165714160711e-05, + "loss": 1.049, + "step": 1378 + }, + { + "epoch": 0.3157412707498569, + "grad_norm": 1.1713590621948242, + "learning_rate": 1.6035744250906026e-05, + "loss": 1.1238, + "step": 1379 + }, + { + "epoch": 0.31597023468803664, + "grad_norm": 1.135960340499878, + "learning_rate": 1.6029828038785486e-05, + "loss": 1.1036, + "step": 1380 + }, + { + "epoch": 0.31619919862621637, + "grad_norm": 1.1496847867965698, + "learning_rate": 1.602390850850113e-05, + "loss": 1.1226, + "step": 1381 + }, + { + "epoch": 0.3164281625643961, + "grad_norm": 1.4546343088150024, + "learning_rate": 1.6017985663310427e-05, + "loss": 1.0588, + "step": 1382 + }, + { + "epoch": 0.3166571265025758, + "grad_norm": 1.1785930395126343, + "learning_rate": 1.6012059506472665e-05, + "loss": 1.0823, + "step": 1383 + }, + { + "epoch": 0.3168860904407556, + "grad_norm": 1.2727309465408325, + "learning_rate": 1.6006130041248968e-05, + "loss": 1.1149, + "step": 1384 + }, + { + "epoch": 0.31711505437893533, + "grad_norm": 1.122978925704956, + "learning_rate": 1.600019727090226e-05, + "loss": 1.0961, + "step": 1385 + }, + { + "epoch": 0.31734401831711506, + "grad_norm": 1.1095397472381592, + "learning_rate": 1.59942611986973e-05, + "loss": 1.1687, + "step": 1386 + }, + { + "epoch": 0.3175729822552948, + "grad_norm": 1.2510476112365723, + "learning_rate": 1.598832182790066e-05, + "loss": 1.0486, + "step": 1387 + }, + { + "epoch": 0.3178019461934745, + "grad_norm": 1.103319764137268, + "learning_rate": 1.5982379161780722e-05, + "loss": 1.1006, + "step": 1388 + }, + { + "epoch": 0.31803091013165424, + "grad_norm": 1.4000933170318604, + "learning_rate": 1.597643320360769e-05, + "loss": 1.1743, + "step": 1389 + }, + { + "epoch": 0.318259874069834, + "grad_norm": 1.0757001638412476, + "learning_rate": 1.5970483956653572e-05, + "loss": 1.0575, + "step": 1390 + }, + { + "epoch": 0.31848883800801375, + "grad_norm": 1.2366151809692383, + "learning_rate": 1.5964531424192187e-05, + "loss": 1.0801, + "step": 1391 + }, + { + "epoch": 0.3187178019461935, + "grad_norm": 1.175316572189331, + "learning_rate": 1.595857560949917e-05, + "loss": 1.0543, + "step": 1392 + }, + { + "epoch": 0.3189467658843732, + "grad_norm": 1.2209275960922241, + "learning_rate": 1.595261651585195e-05, + "loss": 1.1517, + "step": 1393 + }, + { + "epoch": 0.31917572982255293, + "grad_norm": 1.448627233505249, + "learning_rate": 1.5946654146529766e-05, + "loss": 1.137, + "step": 1394 + }, + { + "epoch": 0.31940469376073266, + "grad_norm": 1.2850122451782227, + "learning_rate": 1.5940688504813664e-05, + "loss": 1.113, + "step": 1395 + }, + { + "epoch": 0.31963365769891244, + "grad_norm": 1.3141731023788452, + "learning_rate": 1.5934719593986483e-05, + "loss": 1.0964, + "step": 1396 + }, + { + "epoch": 0.31986262163709217, + "grad_norm": 1.1482245922088623, + "learning_rate": 1.592874741733287e-05, + "loss": 1.0624, + "step": 1397 + }, + { + "epoch": 0.3200915855752719, + "grad_norm": 1.1503582000732422, + "learning_rate": 1.5922771978139255e-05, + "loss": 1.0634, + "step": 1398 + }, + { + "epoch": 0.3203205495134516, + "grad_norm": 1.4972145557403564, + "learning_rate": 1.5916793279693878e-05, + "loss": 1.1168, + "step": 1399 + }, + { + "epoch": 0.32054951345163135, + "grad_norm": 1.0118694305419922, + "learning_rate": 1.5910811325286768e-05, + "loss": 1.0714, + "step": 1400 + }, + { + "epoch": 0.3207784773898111, + "grad_norm": 1.2603758573532104, + "learning_rate": 1.590482611820974e-05, + "loss": 1.1209, + "step": 1401 + }, + { + "epoch": 0.32100744132799086, + "grad_norm": 0.941050112247467, + "learning_rate": 1.5898837661756405e-05, + "loss": 1.0597, + "step": 1402 + }, + { + "epoch": 0.3212364052661706, + "grad_norm": 1.2156561613082886, + "learning_rate": 1.5892845959222164e-05, + "loss": 1.101, + "step": 1403 + }, + { + "epoch": 0.3214653692043503, + "grad_norm": 1.0653733015060425, + "learning_rate": 1.5886851013904193e-05, + "loss": 1.0916, + "step": 1404 + }, + { + "epoch": 0.32169433314253004, + "grad_norm": 1.1176843643188477, + "learning_rate": 1.5880852829101464e-05, + "loss": 1.1002, + "step": 1405 + }, + { + "epoch": 0.32192329708070977, + "grad_norm": 1.3525538444519043, + "learning_rate": 1.5874851408114733e-05, + "loss": 1.0686, + "step": 1406 + }, + { + "epoch": 0.32215226101888955, + "grad_norm": 1.2710604667663574, + "learning_rate": 1.5868846754246524e-05, + "loss": 1.0965, + "step": 1407 + }, + { + "epoch": 0.3223812249570693, + "grad_norm": 1.1493090391159058, + "learning_rate": 1.5862838870801153e-05, + "loss": 1.095, + "step": 1408 + }, + { + "epoch": 0.322610188895249, + "grad_norm": 1.2418208122253418, + "learning_rate": 1.5856827761084698e-05, + "loss": 1.0945, + "step": 1409 + }, + { + "epoch": 0.32283915283342873, + "grad_norm": 1.0906857252120972, + "learning_rate": 1.5850813428405036e-05, + "loss": 1.1288, + "step": 1410 + }, + { + "epoch": 0.32306811677160846, + "grad_norm": 1.1310313940048218, + "learning_rate": 1.58447958760718e-05, + "loss": 1.0776, + "step": 1411 + }, + { + "epoch": 0.3232970807097882, + "grad_norm": 1.407239317893982, + "learning_rate": 1.583877510739639e-05, + "loss": 1.0595, + "step": 1412 + }, + { + "epoch": 0.32352604464796797, + "grad_norm": 1.0133352279663086, + "learning_rate": 1.5832751125691993e-05, + "loss": 1.0473, + "step": 1413 + }, + { + "epoch": 0.3237550085861477, + "grad_norm": 1.329931616783142, + "learning_rate": 1.5826723934273555e-05, + "loss": 1.0844, + "step": 1414 + }, + { + "epoch": 0.3239839725243274, + "grad_norm": 1.4198311567306519, + "learning_rate": 1.5820693536457787e-05, + "loss": 1.0686, + "step": 1415 + }, + { + "epoch": 0.32421293646250715, + "grad_norm": 1.604586124420166, + "learning_rate": 1.5814659935563165e-05, + "loss": 1.1034, + "step": 1416 + }, + { + "epoch": 0.3244419004006869, + "grad_norm": 1.439122200012207, + "learning_rate": 1.5808623134909932e-05, + "loss": 1.1126, + "step": 1417 + }, + { + "epoch": 0.3246708643388666, + "grad_norm": 1.158563256263733, + "learning_rate": 1.5802583137820087e-05, + "loss": 1.0924, + "step": 1418 + }, + { + "epoch": 0.3248998282770464, + "grad_norm": 1.1076048612594604, + "learning_rate": 1.579653994761739e-05, + "loss": 1.079, + "step": 1419 + }, + { + "epoch": 0.3251287922152261, + "grad_norm": 1.1624640226364136, + "learning_rate": 1.5790493567627357e-05, + "loss": 1.0644, + "step": 1420 + }, + { + "epoch": 0.32535775615340584, + "grad_norm": 1.0896527767181396, + "learning_rate": 1.5784444001177262e-05, + "loss": 1.1104, + "step": 1421 + }, + { + "epoch": 0.32558672009158557, + "grad_norm": 1.096172571182251, + "learning_rate": 1.577839125159613e-05, + "loss": 1.064, + "step": 1422 + }, + { + "epoch": 0.3258156840297653, + "grad_norm": 1.2764605283737183, + "learning_rate": 1.577233532221474e-05, + "loss": 1.0389, + "step": 1423 + }, + { + "epoch": 0.326044647967945, + "grad_norm": 1.2144206762313843, + "learning_rate": 1.576627621636561e-05, + "loss": 1.0894, + "step": 1424 + }, + { + "epoch": 0.3262736119061248, + "grad_norm": 1.1059249639511108, + "learning_rate": 1.5760213937383032e-05, + "loss": 1.0988, + "step": 1425 + }, + { + "epoch": 0.32650257584430453, + "grad_norm": 1.1343713998794556, + "learning_rate": 1.5754148488603017e-05, + "loss": 1.1237, + "step": 1426 + }, + { + "epoch": 0.32673153978248426, + "grad_norm": 1.1411490440368652, + "learning_rate": 1.5748079873363327e-05, + "loss": 1.14, + "step": 1427 + }, + { + "epoch": 0.326960503720664, + "grad_norm": 1.1777209043502808, + "learning_rate": 1.5742008095003478e-05, + "loss": 1.1442, + "step": 1428 + }, + { + "epoch": 0.3271894676588437, + "grad_norm": 1.2743630409240723, + "learning_rate": 1.573593315686471e-05, + "loss": 1.127, + "step": 1429 + }, + { + "epoch": 0.3274184315970235, + "grad_norm": 1.4254614114761353, + "learning_rate": 1.5729855062290024e-05, + "loss": 1.0446, + "step": 1430 + }, + { + "epoch": 0.3276473955352032, + "grad_norm": 1.1998964548110962, + "learning_rate": 1.572377381462413e-05, + "loss": 1.0988, + "step": 1431 + }, + { + "epoch": 0.32787635947338295, + "grad_norm": 1.0931754112243652, + "learning_rate": 1.5717689417213495e-05, + "loss": 1.0696, + "step": 1432 + }, + { + "epoch": 0.3281053234115627, + "grad_norm": 1.1342476606369019, + "learning_rate": 1.5711601873406314e-05, + "loss": 1.0996, + "step": 1433 + }, + { + "epoch": 0.3283342873497424, + "grad_norm": 1.8414933681488037, + "learning_rate": 1.5705511186552506e-05, + "loss": 1.1078, + "step": 1434 + }, + { + "epoch": 0.32856325128792213, + "grad_norm": 1.0369212627410889, + "learning_rate": 1.5699417360003725e-05, + "loss": 1.0947, + "step": 1435 + }, + { + "epoch": 0.3287922152261019, + "grad_norm": 1.1955866813659668, + "learning_rate": 1.5693320397113358e-05, + "loss": 1.1377, + "step": 1436 + }, + { + "epoch": 0.32902117916428164, + "grad_norm": 1.1285663843154907, + "learning_rate": 1.568722030123651e-05, + "loss": 1.1109, + "step": 1437 + }, + { + "epoch": 0.32925014310246137, + "grad_norm": 1.313095211982727, + "learning_rate": 1.568111707573001e-05, + "loss": 1.1211, + "step": 1438 + }, + { + "epoch": 0.3294791070406411, + "grad_norm": 1.4031476974487305, + "learning_rate": 1.567501072395241e-05, + "loss": 1.0377, + "step": 1439 + }, + { + "epoch": 0.3297080709788208, + "grad_norm": 1.2493641376495361, + "learning_rate": 1.5668901249263996e-05, + "loss": 1.0921, + "step": 1440 + }, + { + "epoch": 0.32993703491700055, + "grad_norm": 1.0535095930099487, + "learning_rate": 1.5662788655026745e-05, + "loss": 1.0488, + "step": 1441 + }, + { + "epoch": 0.33016599885518033, + "grad_norm": 1.165377140045166, + "learning_rate": 1.565667294460438e-05, + "loss": 1.0395, + "step": 1442 + }, + { + "epoch": 0.33039496279336006, + "grad_norm": 1.4327694177627563, + "learning_rate": 1.5650554121362315e-05, + "loss": 1.1258, + "step": 1443 + }, + { + "epoch": 0.3306239267315398, + "grad_norm": 1.1685681343078613, + "learning_rate": 1.5644432188667695e-05, + "loss": 1.0601, + "step": 1444 + }, + { + "epoch": 0.3308528906697195, + "grad_norm": 1.4015522003173828, + "learning_rate": 1.563830714988936e-05, + "loss": 1.0852, + "step": 1445 + }, + { + "epoch": 0.33108185460789924, + "grad_norm": 1.0831905603408813, + "learning_rate": 1.5632179008397876e-05, + "loss": 1.125, + "step": 1446 + }, + { + "epoch": 0.33131081854607897, + "grad_norm": 1.9634826183319092, + "learning_rate": 1.5626047767565503e-05, + "loss": 1.0603, + "step": 1447 + }, + { + "epoch": 0.33153978248425875, + "grad_norm": 1.1687532663345337, + "learning_rate": 1.561991343076621e-05, + "loss": 1.1012, + "step": 1448 + }, + { + "epoch": 0.3317687464224385, + "grad_norm": 1.5196926593780518, + "learning_rate": 1.5613776001375674e-05, + "loss": 1.1178, + "step": 1449 + }, + { + "epoch": 0.3319977103606182, + "grad_norm": 1.2472096681594849, + "learning_rate": 1.5607635482771272e-05, + "loss": 1.0965, + "step": 1450 + }, + { + "epoch": 0.33222667429879793, + "grad_norm": 1.207923173904419, + "learning_rate": 1.5601491878332077e-05, + "loss": 1.0978, + "step": 1451 + }, + { + "epoch": 0.33245563823697766, + "grad_norm": 1.1956391334533691, + "learning_rate": 1.5595345191438864e-05, + "loss": 1.1358, + "step": 1452 + }, + { + "epoch": 0.33268460217515744, + "grad_norm": 1.1643397808074951, + "learning_rate": 1.5589195425474105e-05, + "loss": 1.0615, + "step": 1453 + }, + { + "epoch": 0.33291356611333717, + "grad_norm": 1.332839846611023, + "learning_rate": 1.5583042583821963e-05, + "loss": 1.0744, + "step": 1454 + }, + { + "epoch": 0.3331425300515169, + "grad_norm": 1.5693964958190918, + "learning_rate": 1.5576886669868297e-05, + "loss": 1.095, + "step": 1455 + }, + { + "epoch": 0.3333714939896966, + "grad_norm": 1.3151133060455322, + "learning_rate": 1.557072768700065e-05, + "loss": 1.0147, + "step": 1456 + }, + { + "epoch": 0.33360045792787635, + "grad_norm": 2.105339765548706, + "learning_rate": 1.5564565638608264e-05, + "loss": 1.154, + "step": 1457 + }, + { + "epoch": 0.3338294218660561, + "grad_norm": 1.1660069227218628, + "learning_rate": 1.5558400528082057e-05, + "loss": 1.0835, + "step": 1458 + }, + { + "epoch": 0.33405838580423586, + "grad_norm": 1.198183298110962, + "learning_rate": 1.5552232358814646e-05, + "loss": 1.0679, + "step": 1459 + }, + { + "epoch": 0.3342873497424156, + "grad_norm": 0.9822539687156677, + "learning_rate": 1.5546061134200316e-05, + "loss": 1.0952, + "step": 1460 + }, + { + "epoch": 0.3345163136805953, + "grad_norm": 1.082090973854065, + "learning_rate": 1.5539886857635037e-05, + "loss": 1.0979, + "step": 1461 + }, + { + "epoch": 0.33474527761877504, + "grad_norm": 1.2123302221298218, + "learning_rate": 1.5533709532516473e-05, + "loss": 1.0944, + "step": 1462 + }, + { + "epoch": 0.33497424155695477, + "grad_norm": 1.1472247838974, + "learning_rate": 1.552752916224395e-05, + "loss": 1.1239, + "step": 1463 + }, + { + "epoch": 0.3352032054951345, + "grad_norm": 1.1450691223144531, + "learning_rate": 1.5521345750218463e-05, + "loss": 1.069, + "step": 1464 + }, + { + "epoch": 0.3354321694333143, + "grad_norm": 1.3406015634536743, + "learning_rate": 1.551515929984271e-05, + "loss": 1.0496, + "step": 1465 + }, + { + "epoch": 0.335661133371494, + "grad_norm": 1.2106640338897705, + "learning_rate": 1.5508969814521026e-05, + "loss": 1.1102, + "step": 1466 + }, + { + "epoch": 0.33589009730967373, + "grad_norm": 1.483971118927002, + "learning_rate": 1.5502777297659447e-05, + "loss": 1.0899, + "step": 1467 + }, + { + "epoch": 0.33611906124785346, + "grad_norm": 1.171859860420227, + "learning_rate": 1.549658175266565e-05, + "loss": 1.0654, + "step": 1468 + }, + { + "epoch": 0.3363480251860332, + "grad_norm": 2.5037529468536377, + "learning_rate": 1.5490383182949e-05, + "loss": 1.0884, + "step": 1469 + }, + { + "epoch": 0.3365769891242129, + "grad_norm": 1.0418339967727661, + "learning_rate": 1.5484181591920516e-05, + "loss": 1.0827, + "step": 1470 + }, + { + "epoch": 0.3368059530623927, + "grad_norm": 1.2835181951522827, + "learning_rate": 1.5477976982992883e-05, + "loss": 1.0139, + "step": 1471 + }, + { + "epoch": 0.3370349170005724, + "grad_norm": 1.2543227672576904, + "learning_rate": 1.547176935958044e-05, + "loss": 1.1055, + "step": 1472 + }, + { + "epoch": 0.33726388093875215, + "grad_norm": 1.0822360515594482, + "learning_rate": 1.54655587250992e-05, + "loss": 1.0823, + "step": 1473 + }, + { + "epoch": 0.3374928448769319, + "grad_norm": 1.016126036643982, + "learning_rate": 1.5459345082966812e-05, + "loss": 1.0929, + "step": 1474 + }, + { + "epoch": 0.3377218088151116, + "grad_norm": 1.147747278213501, + "learning_rate": 1.5453128436602597e-05, + "loss": 1.1562, + "step": 1475 + }, + { + "epoch": 0.3379507727532914, + "grad_norm": 1.3952897787094116, + "learning_rate": 1.5446908789427522e-05, + "loss": 1.0573, + "step": 1476 + }, + { + "epoch": 0.3381797366914711, + "grad_norm": 1.0825663805007935, + "learning_rate": 1.5440686144864207e-05, + "loss": 1.1492, + "step": 1477 + }, + { + "epoch": 0.33840870062965084, + "grad_norm": 1.1135352849960327, + "learning_rate": 1.5434460506336922e-05, + "loss": 1.0801, + "step": 1478 + }, + { + "epoch": 0.33863766456783057, + "grad_norm": 1.1229122877120972, + "learning_rate": 1.5428231877271584e-05, + "loss": 1.1142, + "step": 1479 + }, + { + "epoch": 0.3388666285060103, + "grad_norm": 1.1263865232467651, + "learning_rate": 1.542200026109575e-05, + "loss": 1.0628, + "step": 1480 + }, + { + "epoch": 0.33909559244419, + "grad_norm": 1.3717758655548096, + "learning_rate": 1.5415765661238635e-05, + "loss": 1.106, + "step": 1481 + }, + { + "epoch": 0.3393245563823698, + "grad_norm": 1.1073044538497925, + "learning_rate": 1.540952808113108e-05, + "loss": 1.0775, + "step": 1482 + }, + { + "epoch": 0.33955352032054953, + "grad_norm": 1.1785542964935303, + "learning_rate": 1.5403287524205577e-05, + "loss": 1.0809, + "step": 1483 + }, + { + "epoch": 0.33978248425872926, + "grad_norm": 1.185001254081726, + "learning_rate": 1.539704399389625e-05, + "loss": 0.996, + "step": 1484 + }, + { + "epoch": 0.340011448196909, + "grad_norm": 1.155920386314392, + "learning_rate": 1.5390797493638862e-05, + "loss": 1.0949, + "step": 1485 + }, + { + "epoch": 0.3402404121350887, + "grad_norm": 1.0656671524047852, + "learning_rate": 1.538454802687081e-05, + "loss": 1.0234, + "step": 1486 + }, + { + "epoch": 0.34046937607326844, + "grad_norm": 1.3848119974136353, + "learning_rate": 1.537829559703112e-05, + "loss": 1.0846, + "step": 1487 + }, + { + "epoch": 0.3406983400114482, + "grad_norm": 1.0920829772949219, + "learning_rate": 1.5372040207560457e-05, + "loss": 1.1337, + "step": 1488 + }, + { + "epoch": 0.34092730394962795, + "grad_norm": 1.3589757680892944, + "learning_rate": 1.536578186190111e-05, + "loss": 1.0797, + "step": 1489 + }, + { + "epoch": 0.3411562678878077, + "grad_norm": 1.4441230297088623, + "learning_rate": 1.5359520563496985e-05, + "loss": 1.131, + "step": 1490 + }, + { + "epoch": 0.3413852318259874, + "grad_norm": 1.3296109437942505, + "learning_rate": 1.5353256315793633e-05, + "loss": 1.086, + "step": 1491 + }, + { + "epoch": 0.34161419576416713, + "grad_norm": 1.065158486366272, + "learning_rate": 1.534698912223821e-05, + "loss": 1.0374, + "step": 1492 + }, + { + "epoch": 0.34184315970234685, + "grad_norm": 2.7994022369384766, + "learning_rate": 1.5340718986279505e-05, + "loss": 1.0859, + "step": 1493 + }, + { + "epoch": 0.34207212364052664, + "grad_norm": 1.2376704216003418, + "learning_rate": 1.5334445911367915e-05, + "loss": 1.0435, + "step": 1494 + }, + { + "epoch": 0.34230108757870636, + "grad_norm": 1.0192632675170898, + "learning_rate": 1.5328169900955463e-05, + "loss": 1.114, + "step": 1495 + }, + { + "epoch": 0.3425300515168861, + "grad_norm": 1.3984559774398804, + "learning_rate": 1.5321890958495787e-05, + "loss": 1.0808, + "step": 1496 + }, + { + "epoch": 0.3427590154550658, + "grad_norm": 0.9953001141548157, + "learning_rate": 1.5315609087444135e-05, + "loss": 1.0632, + "step": 1497 + }, + { + "epoch": 0.34298797939324555, + "grad_norm": 1.4686434268951416, + "learning_rate": 1.5309324291257373e-05, + "loss": 1.0939, + "step": 1498 + }, + { + "epoch": 0.34321694333142533, + "grad_norm": 1.450624704360962, + "learning_rate": 1.5303036573393964e-05, + "loss": 1.0561, + "step": 1499 + }, + { + "epoch": 0.34344590726960506, + "grad_norm": 1.1755778789520264, + "learning_rate": 1.529674593731399e-05, + "loss": 1.1257, + "step": 1500 + }, + { + "epoch": 0.3436748712077848, + "grad_norm": 1.1907165050506592, + "learning_rate": 1.5290452386479132e-05, + "loss": 1.1242, + "step": 1501 + }, + { + "epoch": 0.3439038351459645, + "grad_norm": 1.4106594324111938, + "learning_rate": 1.5284155924352678e-05, + "loss": 1.1147, + "step": 1502 + }, + { + "epoch": 0.34413279908414424, + "grad_norm": 1.3477283716201782, + "learning_rate": 1.5277856554399528e-05, + "loss": 1.1203, + "step": 1503 + }, + { + "epoch": 0.34436176302232396, + "grad_norm": 1.1035650968551636, + "learning_rate": 1.5271554280086164e-05, + "loss": 1.0926, + "step": 1504 + }, + { + "epoch": 0.34459072696050375, + "grad_norm": 1.4535436630249023, + "learning_rate": 1.5265249104880675e-05, + "loss": 1.0157, + "step": 1505 + }, + { + "epoch": 0.3448196908986835, + "grad_norm": 1.184928297996521, + "learning_rate": 1.5258941032252747e-05, + "loss": 1.0421, + "step": 1506 + }, + { + "epoch": 0.3450486548368632, + "grad_norm": 1.1650036573410034, + "learning_rate": 1.5252630065673662e-05, + "loss": 1.0696, + "step": 1507 + }, + { + "epoch": 0.3452776187750429, + "grad_norm": 1.272552490234375, + "learning_rate": 1.5246316208616289e-05, + "loss": 1.111, + "step": 1508 + }, + { + "epoch": 0.34550658271322265, + "grad_norm": 1.1120789051055908, + "learning_rate": 1.5239999464555092e-05, + "loss": 1.1354, + "step": 1509 + }, + { + "epoch": 0.3457355466514024, + "grad_norm": 1.6743152141571045, + "learning_rate": 1.5233679836966122e-05, + "loss": 1.0393, + "step": 1510 + }, + { + "epoch": 0.34596451058958216, + "grad_norm": 1.243407130241394, + "learning_rate": 1.5227357329327016e-05, + "loss": 1.0646, + "step": 1511 + }, + { + "epoch": 0.3461934745277619, + "grad_norm": 1.359520673751831, + "learning_rate": 1.5221031945116998e-05, + "loss": 1.1385, + "step": 1512 + }, + { + "epoch": 0.3464224384659416, + "grad_norm": 1.18605637550354, + "learning_rate": 1.5214703687816874e-05, + "loss": 1.1231, + "step": 1513 + }, + { + "epoch": 0.34665140240412134, + "grad_norm": 1.3147512674331665, + "learning_rate": 1.5208372560909031e-05, + "loss": 1.1246, + "step": 1514 + }, + { + "epoch": 0.34688036634230107, + "grad_norm": 1.420447826385498, + "learning_rate": 1.5202038567877436e-05, + "loss": 1.1292, + "step": 1515 + }, + { + "epoch": 0.3471093302804808, + "grad_norm": 1.1527974605560303, + "learning_rate": 1.5195701712207627e-05, + "loss": 1.0658, + "step": 1516 + }, + { + "epoch": 0.3473382942186606, + "grad_norm": 1.1984782218933105, + "learning_rate": 1.5189361997386729e-05, + "loss": 1.0923, + "step": 1517 + }, + { + "epoch": 0.3475672581568403, + "grad_norm": 1.3671753406524658, + "learning_rate": 1.5183019426903434e-05, + "loss": 1.2041, + "step": 1518 + }, + { + "epoch": 0.34779622209502004, + "grad_norm": 1.3007023334503174, + "learning_rate": 1.5176674004247998e-05, + "loss": 1.0656, + "step": 1519 + }, + { + "epoch": 0.34802518603319976, + "grad_norm": 1.2162976264953613, + "learning_rate": 1.517032573291226e-05, + "loss": 1.1123, + "step": 1520 + }, + { + "epoch": 0.3482541499713795, + "grad_norm": 1.119856834411621, + "learning_rate": 1.5163974616389621e-05, + "loss": 1.0619, + "step": 1521 + }, + { + "epoch": 0.3484831139095592, + "grad_norm": 1.3005276918411255, + "learning_rate": 1.5157620658175046e-05, + "loss": 1.0622, + "step": 1522 + }, + { + "epoch": 0.348712077847739, + "grad_norm": 1.0568853616714478, + "learning_rate": 1.515126386176506e-05, + "loss": 1.0777, + "step": 1523 + }, + { + "epoch": 0.3489410417859187, + "grad_norm": 1.326450228691101, + "learning_rate": 1.5144904230657765e-05, + "loss": 1.0952, + "step": 1524 + }, + { + "epoch": 0.34917000572409845, + "grad_norm": 1.2039971351623535, + "learning_rate": 1.5138541768352802e-05, + "loss": 1.1044, + "step": 1525 + }, + { + "epoch": 0.3493989696622782, + "grad_norm": 1.1191279888153076, + "learning_rate": 1.5132176478351386e-05, + "loss": 1.0931, + "step": 1526 + }, + { + "epoch": 0.3496279336004579, + "grad_norm": 1.0349925756454468, + "learning_rate": 1.5125808364156283e-05, + "loss": 1.1128, + "step": 1527 + }, + { + "epoch": 0.3498568975386377, + "grad_norm": 1.1004794836044312, + "learning_rate": 1.5119437429271813e-05, + "loss": 1.1096, + "step": 1528 + }, + { + "epoch": 0.3500858614768174, + "grad_norm": 1.117506504058838, + "learning_rate": 1.5113063677203847e-05, + "loss": 1.0509, + "step": 1529 + }, + { + "epoch": 0.35031482541499714, + "grad_norm": 1.113121509552002, + "learning_rate": 1.5106687111459809e-05, + "loss": 1.1017, + "step": 1530 + }, + { + "epoch": 0.35054378935317687, + "grad_norm": 3.266343116760254, + "learning_rate": 1.5100307735548662e-05, + "loss": 1.1525, + "step": 1531 + }, + { + "epoch": 0.3507727532913566, + "grad_norm": 1.3672198057174683, + "learning_rate": 1.5093925552980934e-05, + "loss": 1.0521, + "step": 1532 + }, + { + "epoch": 0.3510017172295363, + "grad_norm": 1.2172337770462036, + "learning_rate": 1.5087540567268682e-05, + "loss": 1.0226, + "step": 1533 + }, + { + "epoch": 0.3512306811677161, + "grad_norm": 0.9038141369819641, + "learning_rate": 1.5081152781925508e-05, + "loss": 1.0518, + "step": 1534 + }, + { + "epoch": 0.35145964510589583, + "grad_norm": 1.2456227540969849, + "learning_rate": 1.5074762200466557e-05, + "loss": 1.121, + "step": 1535 + }, + { + "epoch": 0.35168860904407556, + "grad_norm": 1.2485681772232056, + "learning_rate": 1.5068368826408515e-05, + "loss": 1.0517, + "step": 1536 + }, + { + "epoch": 0.3519175729822553, + "grad_norm": 1.2374999523162842, + "learning_rate": 1.5061972663269604e-05, + "loss": 1.0035, + "step": 1537 + }, + { + "epoch": 0.352146536920435, + "grad_norm": 1.0219475030899048, + "learning_rate": 1.5055573714569574e-05, + "loss": 1.0786, + "step": 1538 + }, + { + "epoch": 0.35237550085861474, + "grad_norm": 1.2069019079208374, + "learning_rate": 1.5049171983829714e-05, + "loss": 1.0511, + "step": 1539 + }, + { + "epoch": 0.3526044647967945, + "grad_norm": 1.256385087966919, + "learning_rate": 1.5042767474572846e-05, + "loss": 1.1174, + "step": 1540 + }, + { + "epoch": 0.35283342873497425, + "grad_norm": 1.2299443483352661, + "learning_rate": 1.5036360190323315e-05, + "loss": 1.0537, + "step": 1541 + }, + { + "epoch": 0.353062392673154, + "grad_norm": 1.5519248247146606, + "learning_rate": 1.5029950134606991e-05, + "loss": 1.131, + "step": 1542 + }, + { + "epoch": 0.3532913566113337, + "grad_norm": 1.0618422031402588, + "learning_rate": 1.5023537310951284e-05, + "loss": 1.1499, + "step": 1543 + }, + { + "epoch": 0.35352032054951343, + "grad_norm": 1.1327383518218994, + "learning_rate": 1.501712172288511e-05, + "loss": 1.1216, + "step": 1544 + }, + { + "epoch": 0.35374928448769316, + "grad_norm": 1.2645106315612793, + "learning_rate": 1.5010703373938915e-05, + "loss": 1.108, + "step": 1545 + }, + { + "epoch": 0.35397824842587294, + "grad_norm": 1.7846484184265137, + "learning_rate": 1.5004282267644668e-05, + "loss": 1.0953, + "step": 1546 + }, + { + "epoch": 0.35420721236405267, + "grad_norm": 1.1685068607330322, + "learning_rate": 1.4997858407535841e-05, + "loss": 1.0658, + "step": 1547 + }, + { + "epoch": 0.3544361763022324, + "grad_norm": 0.9860405921936035, + "learning_rate": 1.4991431797147433e-05, + "loss": 1.053, + "step": 1548 + }, + { + "epoch": 0.3546651402404121, + "grad_norm": 1.2382588386535645, + "learning_rate": 1.4985002440015959e-05, + "loss": 1.112, + "step": 1549 + }, + { + "epoch": 0.35489410417859185, + "grad_norm": 1.1630792617797852, + "learning_rate": 1.4978570339679435e-05, + "loss": 1.0488, + "step": 1550 + }, + { + "epoch": 0.35512306811677163, + "grad_norm": 1.154436469078064, + "learning_rate": 1.4972135499677395e-05, + "loss": 1.0509, + "step": 1551 + }, + { + "epoch": 0.35535203205495136, + "grad_norm": 1.0315396785736084, + "learning_rate": 1.4965697923550873e-05, + "loss": 1.0807, + "step": 1552 + }, + { + "epoch": 0.3555809959931311, + "grad_norm": 0.9754356741905212, + "learning_rate": 1.4959257614842416e-05, + "loss": 1.1137, + "step": 1553 + }, + { + "epoch": 0.3558099599313108, + "grad_norm": 1.0511184930801392, + "learning_rate": 1.4952814577096073e-05, + "loss": 1.1068, + "step": 1554 + }, + { + "epoch": 0.35603892386949054, + "grad_norm": 1.3357048034667969, + "learning_rate": 1.4946368813857393e-05, + "loss": 1.053, + "step": 1555 + }, + { + "epoch": 0.35626788780767027, + "grad_norm": 0.9737345576286316, + "learning_rate": 1.4939920328673422e-05, + "loss": 1.0613, + "step": 1556 + }, + { + "epoch": 0.35649685174585005, + "grad_norm": 1.1738935708999634, + "learning_rate": 1.4933469125092714e-05, + "loss": 1.0548, + "step": 1557 + }, + { + "epoch": 0.3567258156840298, + "grad_norm": 1.0252792835235596, + "learning_rate": 1.4927015206665311e-05, + "loss": 1.1409, + "step": 1558 + }, + { + "epoch": 0.3569547796222095, + "grad_norm": 1.1485637426376343, + "learning_rate": 1.4920558576942746e-05, + "loss": 1.1199, + "step": 1559 + }, + { + "epoch": 0.35718374356038923, + "grad_norm": 1.2590608596801758, + "learning_rate": 1.4914099239478046e-05, + "loss": 1.0939, + "step": 1560 + }, + { + "epoch": 0.35741270749856896, + "grad_norm": 1.1943340301513672, + "learning_rate": 1.490763719782574e-05, + "loss": 1.049, + "step": 1561 + }, + { + "epoch": 0.3576416714367487, + "grad_norm": 1.8277630805969238, + "learning_rate": 1.4901172455541826e-05, + "loss": 1.1446, + "step": 1562 + }, + { + "epoch": 0.35787063537492847, + "grad_norm": 1.2777670621871948, + "learning_rate": 1.4894705016183803e-05, + "loss": 1.0863, + "step": 1563 + }, + { + "epoch": 0.3580995993131082, + "grad_norm": 1.1132858991622925, + "learning_rate": 1.4888234883310644e-05, + "loss": 1.1255, + "step": 1564 + }, + { + "epoch": 0.3583285632512879, + "grad_norm": 1.2168593406677246, + "learning_rate": 1.4881762060482814e-05, + "loss": 1.1271, + "step": 1565 + }, + { + "epoch": 0.35855752718946765, + "grad_norm": 1.321545124053955, + "learning_rate": 1.4875286551262252e-05, + "loss": 1.0964, + "step": 1566 + }, + { + "epoch": 0.3587864911276474, + "grad_norm": 1.1392179727554321, + "learning_rate": 1.4868808359212373e-05, + "loss": 1.0022, + "step": 1567 + }, + { + "epoch": 0.3590154550658271, + "grad_norm": 1.227779746055603, + "learning_rate": 1.4862327487898075e-05, + "loss": 1.1225, + "step": 1568 + }, + { + "epoch": 0.3592444190040069, + "grad_norm": 1.1016408205032349, + "learning_rate": 1.4855843940885726e-05, + "loss": 1.0832, + "step": 1569 + }, + { + "epoch": 0.3594733829421866, + "grad_norm": 1.0965155363082886, + "learning_rate": 1.4849357721743169e-05, + "loss": 1.0853, + "step": 1570 + }, + { + "epoch": 0.35970234688036634, + "grad_norm": 1.1590499877929688, + "learning_rate": 1.484286883403971e-05, + "loss": 1.0385, + "step": 1571 + }, + { + "epoch": 0.35993131081854607, + "grad_norm": 1.0855969190597534, + "learning_rate": 1.483637728134614e-05, + "loss": 1.0943, + "step": 1572 + }, + { + "epoch": 0.3601602747567258, + "grad_norm": 1.1280837059020996, + "learning_rate": 1.4829883067234699e-05, + "loss": 1.0209, + "step": 1573 + }, + { + "epoch": 0.3603892386949056, + "grad_norm": 1.0733137130737305, + "learning_rate": 1.4823386195279098e-05, + "loss": 1.1372, + "step": 1574 + }, + { + "epoch": 0.3606182026330853, + "grad_norm": 1.1513261795043945, + "learning_rate": 1.4816886669054514e-05, + "loss": 1.0584, + "step": 1575 + }, + { + "epoch": 0.36084716657126503, + "grad_norm": 1.3716681003570557, + "learning_rate": 1.4810384492137582e-05, + "loss": 1.1219, + "step": 1576 + }, + { + "epoch": 0.36107613050944476, + "grad_norm": 0.9845678806304932, + "learning_rate": 1.4803879668106393e-05, + "loss": 1.0914, + "step": 1577 + }, + { + "epoch": 0.3613050944476245, + "grad_norm": 1.257351040840149, + "learning_rate": 1.4797372200540497e-05, + "loss": 1.1247, + "step": 1578 + }, + { + "epoch": 0.3615340583858042, + "grad_norm": 1.4582107067108154, + "learning_rate": 1.4790862093020903e-05, + "loss": 1.0401, + "step": 1579 + }, + { + "epoch": 0.361763022323984, + "grad_norm": 1.277854323387146, + "learning_rate": 1.4784349349130063e-05, + "loss": 1.0367, + "step": 1580 + }, + { + "epoch": 0.3619919862621637, + "grad_norm": 1.1028213500976562, + "learning_rate": 1.4777833972451889e-05, + "loss": 1.0808, + "step": 1581 + }, + { + "epoch": 0.36222095020034345, + "grad_norm": 1.0931235551834106, + "learning_rate": 1.477131596657174e-05, + "loss": 1.0286, + "step": 1582 + }, + { + "epoch": 0.3624499141385232, + "grad_norm": 1.4668598175048828, + "learning_rate": 1.4764795335076414e-05, + "loss": 1.1162, + "step": 1583 + }, + { + "epoch": 0.3626788780767029, + "grad_norm": 2.652813673019409, + "learning_rate": 1.4758272081554168e-05, + "loss": 1.1068, + "step": 1584 + }, + { + "epoch": 0.36290784201488263, + "grad_norm": 1.1767535209655762, + "learning_rate": 1.4751746209594683e-05, + "loss": 1.0795, + "step": 1585 + }, + { + "epoch": 0.3631368059530624, + "grad_norm": 1.0798718929290771, + "learning_rate": 1.47452177227891e-05, + "loss": 1.0911, + "step": 1586 + }, + { + "epoch": 0.36336576989124214, + "grad_norm": 1.119814395904541, + "learning_rate": 1.4738686624729987e-05, + "loss": 1.0673, + "step": 1587 + }, + { + "epoch": 0.36359473382942187, + "grad_norm": 1.2240833044052124, + "learning_rate": 1.4732152919011355e-05, + "loss": 1.0708, + "step": 1588 + }, + { + "epoch": 0.3638236977676016, + "grad_norm": 1.1120808124542236, + "learning_rate": 1.4725616609228648e-05, + "loss": 1.0919, + "step": 1589 + }, + { + "epoch": 0.3640526617057813, + "grad_norm": 1.2676600217819214, + "learning_rate": 1.4719077698978737e-05, + "loss": 1.0665, + "step": 1590 + }, + { + "epoch": 0.36428162564396105, + "grad_norm": 1.1480255126953125, + "learning_rate": 1.4712536191859934e-05, + "loss": 1.0787, + "step": 1591 + }, + { + "epoch": 0.36451058958214083, + "grad_norm": 1.7230538129806519, + "learning_rate": 1.4705992091471975e-05, + "loss": 1.1455, + "step": 1592 + }, + { + "epoch": 0.36473955352032056, + "grad_norm": 1.1542283296585083, + "learning_rate": 1.4699445401416024e-05, + "loss": 1.1246, + "step": 1593 + }, + { + "epoch": 0.3649685174585003, + "grad_norm": 1.230782389640808, + "learning_rate": 1.4692896125294667e-05, + "loss": 1.1106, + "step": 1594 + }, + { + "epoch": 0.36519748139668, + "grad_norm": 1.2832047939300537, + "learning_rate": 1.4686344266711916e-05, + "loss": 1.1203, + "step": 1595 + }, + { + "epoch": 0.36542644533485974, + "grad_norm": 1.213179111480713, + "learning_rate": 1.467978982927321e-05, + "loss": 1.1387, + "step": 1596 + }, + { + "epoch": 0.3656554092730395, + "grad_norm": 1.2163344621658325, + "learning_rate": 1.4673232816585392e-05, + "loss": 1.1456, + "step": 1597 + }, + { + "epoch": 0.36588437321121925, + "grad_norm": 1.1617780923843384, + "learning_rate": 1.4666673232256738e-05, + "loss": 1.091, + "step": 1598 + }, + { + "epoch": 0.366113337149399, + "grad_norm": 1.3086128234863281, + "learning_rate": 1.466011107989693e-05, + "loss": 1.096, + "step": 1599 + }, + { + "epoch": 0.3663423010875787, + "grad_norm": 1.1292928457260132, + "learning_rate": 1.4653546363117063e-05, + "loss": 1.0508, + "step": 1600 + }, + { + "epoch": 0.36657126502575843, + "grad_norm": 1.2157028913497925, + "learning_rate": 1.464697908552965e-05, + "loss": 1.0545, + "step": 1601 + }, + { + "epoch": 0.36680022896393816, + "grad_norm": 1.3973489999771118, + "learning_rate": 1.4640409250748604e-05, + "loss": 1.0731, + "step": 1602 + }, + { + "epoch": 0.36702919290211794, + "grad_norm": 1.1359471082687378, + "learning_rate": 1.4633836862389257e-05, + "loss": 1.0881, + "step": 1603 + }, + { + "epoch": 0.36725815684029767, + "grad_norm": 1.074826955795288, + "learning_rate": 1.4627261924068329e-05, + "loss": 1.0443, + "step": 1604 + }, + { + "epoch": 0.3674871207784774, + "grad_norm": 1.4347003698349, + "learning_rate": 1.4620684439403962e-05, + "loss": 1.07, + "step": 1605 + }, + { + "epoch": 0.3677160847166571, + "grad_norm": 1.0502759218215942, + "learning_rate": 1.4614104412015688e-05, + "loss": 1.0734, + "step": 1606 + }, + { + "epoch": 0.36794504865483685, + "grad_norm": 1.6279913187026978, + "learning_rate": 1.4607521845524439e-05, + "loss": 1.0855, + "step": 1607 + }, + { + "epoch": 0.3681740125930166, + "grad_norm": 1.2757513523101807, + "learning_rate": 1.460093674355255e-05, + "loss": 1.0385, + "step": 1608 + }, + { + "epoch": 0.36840297653119636, + "grad_norm": 1.1416752338409424, + "learning_rate": 1.4594349109723744e-05, + "loss": 1.0783, + "step": 1609 + }, + { + "epoch": 0.3686319404693761, + "grad_norm": 1.1832513809204102, + "learning_rate": 1.4587758947663146e-05, + "loss": 1.0836, + "step": 1610 + }, + { + "epoch": 0.3688609044075558, + "grad_norm": 1.1325169801712036, + "learning_rate": 1.4581166260997259e-05, + "loss": 1.0372, + "step": 1611 + }, + { + "epoch": 0.36908986834573554, + "grad_norm": 1.3580266237258911, + "learning_rate": 1.4574571053353987e-05, + "loss": 1.0604, + "step": 1612 + }, + { + "epoch": 0.36931883228391527, + "grad_norm": 1.2545275688171387, + "learning_rate": 1.4567973328362616e-05, + "loss": 1.0859, + "step": 1613 + }, + { + "epoch": 0.369547796222095, + "grad_norm": 1.1859338283538818, + "learning_rate": 1.4561373089653823e-05, + "loss": 1.1461, + "step": 1614 + }, + { + "epoch": 0.3697767601602748, + "grad_norm": 1.2535377740859985, + "learning_rate": 1.4554770340859661e-05, + "loss": 1.0554, + "step": 1615 + }, + { + "epoch": 0.3700057240984545, + "grad_norm": 1.490103840827942, + "learning_rate": 1.4548165085613569e-05, + "loss": 1.0364, + "step": 1616 + }, + { + "epoch": 0.37023468803663423, + "grad_norm": 1.1767431497573853, + "learning_rate": 1.454155732755036e-05, + "loss": 1.0654, + "step": 1617 + }, + { + "epoch": 0.37046365197481396, + "grad_norm": 1.525582194328308, + "learning_rate": 1.453494707030623e-05, + "loss": 1.0878, + "step": 1618 + }, + { + "epoch": 0.3706926159129937, + "grad_norm": 1.5364714860916138, + "learning_rate": 1.452833431751875e-05, + "loss": 1.0832, + "step": 1619 + }, + { + "epoch": 0.37092157985117347, + "grad_norm": 1.1335092782974243, + "learning_rate": 1.4521719072826858e-05, + "loss": 1.0521, + "step": 1620 + }, + { + "epoch": 0.3711505437893532, + "grad_norm": 1.278962254524231, + "learning_rate": 1.4515101339870871e-05, + "loss": 1.0582, + "step": 1621 + }, + { + "epoch": 0.3713795077275329, + "grad_norm": 1.424355387687683, + "learning_rate": 1.4508481122292475e-05, + "loss": 1.0583, + "step": 1622 + }, + { + "epoch": 0.37160847166571265, + "grad_norm": 1.1989699602127075, + "learning_rate": 1.4501858423734711e-05, + "loss": 1.0488, + "step": 1623 + }, + { + "epoch": 0.3718374356038924, + "grad_norm": 1.4514950513839722, + "learning_rate": 1.4495233247842001e-05, + "loss": 1.1174, + "step": 1624 + }, + { + "epoch": 0.3720663995420721, + "grad_norm": 1.0482782125473022, + "learning_rate": 1.4488605598260129e-05, + "loss": 1.085, + "step": 1625 + }, + { + "epoch": 0.3722953634802519, + "grad_norm": 1.019201636314392, + "learning_rate": 1.448197547863622e-05, + "loss": 1.0703, + "step": 1626 + }, + { + "epoch": 0.3725243274184316, + "grad_norm": 1.3561880588531494, + "learning_rate": 1.4475342892618792e-05, + "loss": 1.0594, + "step": 1627 + }, + { + "epoch": 0.37275329135661134, + "grad_norm": 1.0671643018722534, + "learning_rate": 1.4468707843857683e-05, + "loss": 1.0137, + "step": 1628 + }, + { + "epoch": 0.37298225529479107, + "grad_norm": 1.2708206176757812, + "learning_rate": 1.4462070336004117e-05, + "loss": 1.0725, + "step": 1629 + }, + { + "epoch": 0.3732112192329708, + "grad_norm": 2.9247419834136963, + "learning_rate": 1.4455430372710652e-05, + "loss": 1.144, + "step": 1630 + }, + { + "epoch": 0.3734401831711505, + "grad_norm": 1.4302781820297241, + "learning_rate": 1.4448787957631209e-05, + "loss": 1.0814, + "step": 1631 + }, + { + "epoch": 0.3736691471093303, + "grad_norm": 1.1992768049240112, + "learning_rate": 1.4442143094421054e-05, + "loss": 1.0457, + "step": 1632 + }, + { + "epoch": 0.37389811104751003, + "grad_norm": 1.1916435956954956, + "learning_rate": 1.4435495786736796e-05, + "loss": 1.1704, + "step": 1633 + }, + { + "epoch": 0.37412707498568976, + "grad_norm": 1.3071752786636353, + "learning_rate": 1.4428846038236391e-05, + "loss": 1.0534, + "step": 1634 + }, + { + "epoch": 0.3743560389238695, + "grad_norm": 1.2490984201431274, + "learning_rate": 1.4422193852579144e-05, + "loss": 1.0872, + "step": 1635 + }, + { + "epoch": 0.3745850028620492, + "grad_norm": 1.0837894678115845, + "learning_rate": 1.4415539233425697e-05, + "loss": 1.1409, + "step": 1636 + }, + { + "epoch": 0.37481396680022894, + "grad_norm": 1.1508346796035767, + "learning_rate": 1.4408882184438029e-05, + "loss": 1.1122, + "step": 1637 + }, + { + "epoch": 0.3750429307384087, + "grad_norm": 1.1703577041625977, + "learning_rate": 1.4402222709279458e-05, + "loss": 1.0457, + "step": 1638 + }, + { + "epoch": 0.37527189467658845, + "grad_norm": 1.1867021322250366, + "learning_rate": 1.439556081161464e-05, + "loss": 1.0956, + "step": 1639 + }, + { + "epoch": 0.3755008586147682, + "grad_norm": 1.2426538467407227, + "learning_rate": 1.438889649510956e-05, + "loss": 1.0897, + "step": 1640 + }, + { + "epoch": 0.3757298225529479, + "grad_norm": 1.1953974962234497, + "learning_rate": 1.4382229763431533e-05, + "loss": 1.096, + "step": 1641 + }, + { + "epoch": 0.37595878649112763, + "grad_norm": 1.0704538822174072, + "learning_rate": 1.4375560620249209e-05, + "loss": 1.079, + "step": 1642 + }, + { + "epoch": 0.3761877504293074, + "grad_norm": 1.342177152633667, + "learning_rate": 1.4368889069232559e-05, + "loss": 1.1331, + "step": 1643 + }, + { + "epoch": 0.37641671436748714, + "grad_norm": 1.2774497270584106, + "learning_rate": 1.4362215114052887e-05, + "loss": 1.0949, + "step": 1644 + }, + { + "epoch": 0.37664567830566686, + "grad_norm": 1.1422805786132812, + "learning_rate": 1.4355538758382805e-05, + "loss": 1.087, + "step": 1645 + }, + { + "epoch": 0.3768746422438466, + "grad_norm": 1.1978528499603271, + "learning_rate": 1.4348860005896266e-05, + "loss": 1.0468, + "step": 1646 + }, + { + "epoch": 0.3771036061820263, + "grad_norm": 1.2687922716140747, + "learning_rate": 1.4342178860268523e-05, + "loss": 1.0217, + "step": 1647 + }, + { + "epoch": 0.37733257012020605, + "grad_norm": 1.1119877099990845, + "learning_rate": 1.433549532517616e-05, + "loss": 1.0787, + "step": 1648 + }, + { + "epoch": 0.37756153405838583, + "grad_norm": 0.9895734190940857, + "learning_rate": 1.4328809404297068e-05, + "loss": 1.0796, + "step": 1649 + }, + { + "epoch": 0.37779049799656556, + "grad_norm": 1.1445213556289673, + "learning_rate": 1.4322121101310454e-05, + "loss": 1.0889, + "step": 1650 + }, + { + "epoch": 0.3780194619347453, + "grad_norm": 1.115267276763916, + "learning_rate": 1.4315430419896836e-05, + "loss": 1.0857, + "step": 1651 + }, + { + "epoch": 0.378248425872925, + "grad_norm": 1.3437042236328125, + "learning_rate": 1.4308737363738035e-05, + "loss": 1.0725, + "step": 1652 + }, + { + "epoch": 0.37847738981110474, + "grad_norm": 1.1512742042541504, + "learning_rate": 1.430204193651719e-05, + "loss": 1.0748, + "step": 1653 + }, + { + "epoch": 0.37870635374928446, + "grad_norm": 1.109114646911621, + "learning_rate": 1.4295344141918734e-05, + "loss": 1.1062, + "step": 1654 + }, + { + "epoch": 0.37893531768746425, + "grad_norm": 1.2906250953674316, + "learning_rate": 1.428864398362841e-05, + "loss": 1.0753, + "step": 1655 + }, + { + "epoch": 0.379164281625644, + "grad_norm": 1.2104160785675049, + "learning_rate": 1.4281941465333255e-05, + "loss": 1.0533, + "step": 1656 + }, + { + "epoch": 0.3793932455638237, + "grad_norm": 1.1130504608154297, + "learning_rate": 1.4275236590721615e-05, + "loss": 1.0529, + "step": 1657 + }, + { + "epoch": 0.3796222095020034, + "grad_norm": 1.3344502449035645, + "learning_rate": 1.4268529363483124e-05, + "loss": 1.1511, + "step": 1658 + }, + { + "epoch": 0.37985117344018315, + "grad_norm": 1.1640992164611816, + "learning_rate": 1.4261819787308708e-05, + "loss": 1.0609, + "step": 1659 + }, + { + "epoch": 0.3800801373783629, + "grad_norm": 1.4107555150985718, + "learning_rate": 1.4255107865890597e-05, + "loss": 1.0672, + "step": 1660 + }, + { + "epoch": 0.38030910131654266, + "grad_norm": 1.214077115058899, + "learning_rate": 1.4248393602922299e-05, + "loss": 1.097, + "step": 1661 + }, + { + "epoch": 0.3805380652547224, + "grad_norm": 1.1887096166610718, + "learning_rate": 1.4241677002098622e-05, + "loss": 1.0425, + "step": 1662 + }, + { + "epoch": 0.3807670291929021, + "grad_norm": 1.1480146646499634, + "learning_rate": 1.4234958067115652e-05, + "loss": 1.0934, + "step": 1663 + }, + { + "epoch": 0.38099599313108184, + "grad_norm": 1.6082152128219604, + "learning_rate": 1.4228236801670762e-05, + "loss": 1.0686, + "step": 1664 + }, + { + "epoch": 0.38122495706926157, + "grad_norm": 1.2487170696258545, + "learning_rate": 1.4221513209462615e-05, + "loss": 1.041, + "step": 1665 + }, + { + "epoch": 0.38145392100744135, + "grad_norm": 1.2354825735092163, + "learning_rate": 1.4214787294191137e-05, + "loss": 1.1181, + "step": 1666 + }, + { + "epoch": 0.3816828849456211, + "grad_norm": 1.2729392051696777, + "learning_rate": 1.4208059059557551e-05, + "loss": 1.0707, + "step": 1667 + }, + { + "epoch": 0.3819118488838008, + "grad_norm": 1.0977466106414795, + "learning_rate": 1.420132850926434e-05, + "loss": 1.087, + "step": 1668 + }, + { + "epoch": 0.38214081282198054, + "grad_norm": 1.6709105968475342, + "learning_rate": 1.419459564701528e-05, + "loss": 1.1877, + "step": 1669 + }, + { + "epoch": 0.38236977676016026, + "grad_norm": 1.3484159708023071, + "learning_rate": 1.41878604765154e-05, + "loss": 1.1141, + "step": 1670 + }, + { + "epoch": 0.38259874069834, + "grad_norm": 1.3736979961395264, + "learning_rate": 1.4181123001471012e-05, + "loss": 1.0962, + "step": 1671 + }, + { + "epoch": 0.38282770463651977, + "grad_norm": 2.105513572692871, + "learning_rate": 1.4174383225589691e-05, + "loss": 1.1048, + "step": 1672 + }, + { + "epoch": 0.3830566685746995, + "grad_norm": 1.2961084842681885, + "learning_rate": 1.4167641152580278e-05, + "loss": 1.0663, + "step": 1673 + }, + { + "epoch": 0.3832856325128792, + "grad_norm": 1.142513394355774, + "learning_rate": 1.416089678615288e-05, + "loss": 1.0686, + "step": 1674 + }, + { + "epoch": 0.38351459645105895, + "grad_norm": 1.188399314880371, + "learning_rate": 1.4154150130018867e-05, + "loss": 1.1332, + "step": 1675 + }, + { + "epoch": 0.3837435603892387, + "grad_norm": 1.2411218881607056, + "learning_rate": 1.4147401187890863e-05, + "loss": 1.1118, + "step": 1676 + }, + { + "epoch": 0.3839725243274184, + "grad_norm": 1.4901659488677979, + "learning_rate": 1.4140649963482763e-05, + "loss": 1.0477, + "step": 1677 + }, + { + "epoch": 0.3842014882655982, + "grad_norm": 1.0591933727264404, + "learning_rate": 1.4133896460509695e-05, + "loss": 1.0359, + "step": 1678 + }, + { + "epoch": 0.3844304522037779, + "grad_norm": 1.8987795114517212, + "learning_rate": 1.412714068268807e-05, + "loss": 1.0438, + "step": 1679 + }, + { + "epoch": 0.38465941614195764, + "grad_norm": 1.1370187997817993, + "learning_rate": 1.4120382633735528e-05, + "loss": 1.0796, + "step": 1680 + }, + { + "epoch": 0.38488838008013737, + "grad_norm": 1.178002953529358, + "learning_rate": 1.4113622317370965e-05, + "loss": 1.0361, + "step": 1681 + }, + { + "epoch": 0.3851173440183171, + "grad_norm": 0.9956271052360535, + "learning_rate": 1.4106859737314532e-05, + "loss": 1.0649, + "step": 1682 + }, + { + "epoch": 0.3853463079564968, + "grad_norm": 1.285373330116272, + "learning_rate": 1.4100094897287618e-05, + "loss": 1.1628, + "step": 1683 + }, + { + "epoch": 0.3855752718946766, + "grad_norm": 1.119768500328064, + "learning_rate": 1.4093327801012854e-05, + "loss": 1.1083, + "step": 1684 + }, + { + "epoch": 0.38580423583285633, + "grad_norm": 1.304608702659607, + "learning_rate": 1.4086558452214121e-05, + "loss": 1.0883, + "step": 1685 + }, + { + "epoch": 0.38603319977103606, + "grad_norm": 1.1423434019088745, + "learning_rate": 1.4079786854616537e-05, + "loss": 1.048, + "step": 1686 + }, + { + "epoch": 0.3862621637092158, + "grad_norm": 1.159722089767456, + "learning_rate": 1.4073013011946449e-05, + "loss": 1.0642, + "step": 1687 + }, + { + "epoch": 0.3864911276473955, + "grad_norm": 1.1652872562408447, + "learning_rate": 1.4066236927931447e-05, + "loss": 1.1185, + "step": 1688 + }, + { + "epoch": 0.3867200915855753, + "grad_norm": 1.3069536685943604, + "learning_rate": 1.4059458606300358e-05, + "loss": 1.1017, + "step": 1689 + }, + { + "epoch": 0.386949055523755, + "grad_norm": 1.2639847993850708, + "learning_rate": 1.405267805078323e-05, + "loss": 1.1014, + "step": 1690 + }, + { + "epoch": 0.38717801946193475, + "grad_norm": 1.3884036540985107, + "learning_rate": 1.4045895265111352e-05, + "loss": 1.099, + "step": 1691 + }, + { + "epoch": 0.3874069834001145, + "grad_norm": 1.3688089847564697, + "learning_rate": 1.4039110253017225e-05, + "loss": 1.0532, + "step": 1692 + }, + { + "epoch": 0.3876359473382942, + "grad_norm": 1.2900769710540771, + "learning_rate": 1.4032323018234592e-05, + "loss": 1.0949, + "step": 1693 + }, + { + "epoch": 0.38786491127647393, + "grad_norm": 1.3169381618499756, + "learning_rate": 1.4025533564498411e-05, + "loss": 1.1092, + "step": 1694 + }, + { + "epoch": 0.3880938752146537, + "grad_norm": 1.2723472118377686, + "learning_rate": 1.401874189554486e-05, + "loss": 1.0769, + "step": 1695 + }, + { + "epoch": 0.38832283915283344, + "grad_norm": 1.1920543909072876, + "learning_rate": 1.4011948015111334e-05, + "loss": 1.1574, + "step": 1696 + }, + { + "epoch": 0.38855180309101317, + "grad_norm": 1.2525862455368042, + "learning_rate": 1.400515192693645e-05, + "loss": 1.1647, + "step": 1697 + }, + { + "epoch": 0.3887807670291929, + "grad_norm": 9.487869262695312, + "learning_rate": 1.3998353634760044e-05, + "loss": 1.1278, + "step": 1698 + }, + { + "epoch": 0.3890097309673726, + "grad_norm": 1.6089351177215576, + "learning_rate": 1.3991553142323156e-05, + "loss": 1.1101, + "step": 1699 + }, + { + "epoch": 0.38923869490555235, + "grad_norm": 1.3901886940002441, + "learning_rate": 1.3984750453368033e-05, + "loss": 1.1249, + "step": 1700 + }, + { + "epoch": 0.38946765884373213, + "grad_norm": 3.7920846939086914, + "learning_rate": 1.397794557163815e-05, + "loss": 1.0998, + "step": 1701 + }, + { + "epoch": 0.38969662278191186, + "grad_norm": 1.1165345907211304, + "learning_rate": 1.3971138500878166e-05, + "loss": 1.0719, + "step": 1702 + }, + { + "epoch": 0.3899255867200916, + "grad_norm": 1.2537471055984497, + "learning_rate": 1.396432924483396e-05, + "loss": 1.0601, + "step": 1703 + }, + { + "epoch": 0.3901545506582713, + "grad_norm": 1.2301760911941528, + "learning_rate": 1.3957517807252607e-05, + "loss": 1.1573, + "step": 1704 + }, + { + "epoch": 0.39038351459645104, + "grad_norm": 1.645586609840393, + "learning_rate": 1.3950704191882388e-05, + "loss": 1.1273, + "step": 1705 + }, + { + "epoch": 0.39061247853463077, + "grad_norm": 1.3407573699951172, + "learning_rate": 1.3943888402472771e-05, + "loss": 1.0567, + "step": 1706 + }, + { + "epoch": 0.39084144247281055, + "grad_norm": 2.350411891937256, + "learning_rate": 1.393707044277443e-05, + "loss": 1.0961, + "step": 1707 + }, + { + "epoch": 0.3910704064109903, + "grad_norm": 1.187579870223999, + "learning_rate": 1.3930250316539237e-05, + "loss": 1.0545, + "step": 1708 + }, + { + "epoch": 0.39129937034917, + "grad_norm": 1.091801404953003, + "learning_rate": 1.3923428027520246e-05, + "loss": 1.0505, + "step": 1709 + }, + { + "epoch": 0.39152833428734973, + "grad_norm": 1.194849967956543, + "learning_rate": 1.3916603579471705e-05, + "loss": 1.0593, + "step": 1710 + }, + { + "epoch": 0.39175729822552946, + "grad_norm": 1.1145169734954834, + "learning_rate": 1.3909776976149047e-05, + "loss": 1.051, + "step": 1711 + }, + { + "epoch": 0.39198626216370924, + "grad_norm": 1.3400248289108276, + "learning_rate": 1.3902948221308903e-05, + "loss": 1.0907, + "step": 1712 + }, + { + "epoch": 0.39221522610188897, + "grad_norm": 1.3348122835159302, + "learning_rate": 1.3896117318709074e-05, + "loss": 1.0244, + "step": 1713 + }, + { + "epoch": 0.3924441900400687, + "grad_norm": 1.1804814338684082, + "learning_rate": 1.388928427210855e-05, + "loss": 1.1056, + "step": 1714 + }, + { + "epoch": 0.3926731539782484, + "grad_norm": 1.4333142042160034, + "learning_rate": 1.3882449085267497e-05, + "loss": 1.1095, + "step": 1715 + }, + { + "epoch": 0.39290211791642815, + "grad_norm": 1.10541570186615, + "learning_rate": 1.3875611761947264e-05, + "loss": 1.1461, + "step": 1716 + }, + { + "epoch": 0.3931310818546079, + "grad_norm": 1.3302122354507446, + "learning_rate": 1.3868772305910376e-05, + "loss": 1.043, + "step": 1717 + }, + { + "epoch": 0.39336004579278766, + "grad_norm": 1.2536112070083618, + "learning_rate": 1.3861930720920518e-05, + "loss": 1.0733, + "step": 1718 + }, + { + "epoch": 0.3935890097309674, + "grad_norm": 1.1884063482284546, + "learning_rate": 1.3855087010742563e-05, + "loss": 1.125, + "step": 1719 + }, + { + "epoch": 0.3938179736691471, + "grad_norm": 1.491746187210083, + "learning_rate": 1.384824117914255e-05, + "loss": 1.071, + "step": 1720 + }, + { + "epoch": 0.39404693760732684, + "grad_norm": 1.331817865371704, + "learning_rate": 1.3841393229887676e-05, + "loss": 1.084, + "step": 1721 + }, + { + "epoch": 0.39427590154550657, + "grad_norm": 1.232057809829712, + "learning_rate": 1.3834543166746317e-05, + "loss": 1.1158, + "step": 1722 + }, + { + "epoch": 0.3945048654836863, + "grad_norm": 1.0441474914550781, + "learning_rate": 1.3827690993488e-05, + "loss": 1.1093, + "step": 1723 + }, + { + "epoch": 0.3947338294218661, + "grad_norm": 1.1634544134140015, + "learning_rate": 1.3820836713883424e-05, + "loss": 1.0904, + "step": 1724 + }, + { + "epoch": 0.3949627933600458, + "grad_norm": 1.3219141960144043, + "learning_rate": 1.3813980331704437e-05, + "loss": 1.0814, + "step": 1725 + }, + { + "epoch": 0.39519175729822553, + "grad_norm": 1.0469324588775635, + "learning_rate": 1.3807121850724045e-05, + "loss": 1.0653, + "step": 1726 + }, + { + "epoch": 0.39542072123640526, + "grad_norm": 1.1637883186340332, + "learning_rate": 1.3800261274716424e-05, + "loss": 1.0838, + "step": 1727 + }, + { + "epoch": 0.395649685174585, + "grad_norm": 1.1523021459579468, + "learning_rate": 1.3793398607456883e-05, + "loss": 1.0906, + "step": 1728 + }, + { + "epoch": 0.3958786491127647, + "grad_norm": 1.1947762966156006, + "learning_rate": 1.3786533852721891e-05, + "loss": 1.0937, + "step": 1729 + }, + { + "epoch": 0.3961076130509445, + "grad_norm": 1.565125584602356, + "learning_rate": 1.3779667014289067e-05, + "loss": 1.0973, + "step": 1730 + }, + { + "epoch": 0.3963365769891242, + "grad_norm": 1.6443101167678833, + "learning_rate": 1.3772798095937172e-05, + "loss": 1.0978, + "step": 1731 + }, + { + "epoch": 0.39656554092730395, + "grad_norm": 1.2846298217773438, + "learning_rate": 1.3765927101446121e-05, + "loss": 1.057, + "step": 1732 + }, + { + "epoch": 0.3967945048654837, + "grad_norm": 1.245452880859375, + "learning_rate": 1.3759054034596953e-05, + "loss": 1.1326, + "step": 1733 + }, + { + "epoch": 0.3970234688036634, + "grad_norm": 1.2566457986831665, + "learning_rate": 1.375217889917187e-05, + "loss": 1.1039, + "step": 1734 + }, + { + "epoch": 0.3972524327418432, + "grad_norm": 1.0944688320159912, + "learning_rate": 1.3745301698954197e-05, + "loss": 1.076, + "step": 1735 + }, + { + "epoch": 0.3974813966800229, + "grad_norm": 1.7811460494995117, + "learning_rate": 1.3738422437728398e-05, + "loss": 1.0675, + "step": 1736 + }, + { + "epoch": 0.39771036061820264, + "grad_norm": 2.1263771057128906, + "learning_rate": 1.3731541119280073e-05, + "loss": 1.0859, + "step": 1737 + }, + { + "epoch": 0.39793932455638237, + "grad_norm": 1.2063853740692139, + "learning_rate": 1.3724657747395957e-05, + "loss": 1.1231, + "step": 1738 + }, + { + "epoch": 0.3981682884945621, + "grad_norm": 1.1201328039169312, + "learning_rate": 1.3717772325863913e-05, + "loss": 1.1532, + "step": 1739 + }, + { + "epoch": 0.3983972524327418, + "grad_norm": 1.3484069108963013, + "learning_rate": 1.3710884858472926e-05, + "loss": 1.0621, + "step": 1740 + }, + { + "epoch": 0.3986262163709216, + "grad_norm": 1.4419175386428833, + "learning_rate": 1.3703995349013113e-05, + "loss": 1.0816, + "step": 1741 + }, + { + "epoch": 0.39885518030910133, + "grad_norm": 1.026955246925354, + "learning_rate": 1.3697103801275714e-05, + "loss": 1.0184, + "step": 1742 + }, + { + "epoch": 0.39908414424728106, + "grad_norm": 1.369357943534851, + "learning_rate": 1.3690210219053088e-05, + "loss": 1.0637, + "step": 1743 + }, + { + "epoch": 0.3993131081854608, + "grad_norm": 1.0927389860153198, + "learning_rate": 1.3683314606138718e-05, + "loss": 1.0612, + "step": 1744 + }, + { + "epoch": 0.3995420721236405, + "grad_norm": 1.3103286027908325, + "learning_rate": 1.3676416966327201e-05, + "loss": 1.0001, + "step": 1745 + }, + { + "epoch": 0.39977103606182024, + "grad_norm": 1.1909055709838867, + "learning_rate": 1.3669517303414254e-05, + "loss": 1.0591, + "step": 1746 + }, + { + "epoch": 0.4, + "grad_norm": 1.6235116720199585, + "learning_rate": 1.3662615621196697e-05, + "loss": 1.1164, + "step": 1747 + }, + { + "epoch": 0.40022896393817975, + "grad_norm": 0.9482841491699219, + "learning_rate": 1.3655711923472472e-05, + "loss": 1.0182, + "step": 1748 + }, + { + "epoch": 0.4004579278763595, + "grad_norm": 1.02751886844635, + "learning_rate": 1.3648806214040625e-05, + "loss": 1.0817, + "step": 1749 + }, + { + "epoch": 0.4006868918145392, + "grad_norm": 1.257691502571106, + "learning_rate": 1.3641898496701306e-05, + "loss": 1.1007, + "step": 1750 + }, + { + "epoch": 0.40091585575271893, + "grad_norm": 1.1339843273162842, + "learning_rate": 1.3634988775255785e-05, + "loss": 1.0948, + "step": 1751 + }, + { + "epoch": 0.40114481969089866, + "grad_norm": 2.271824836730957, + "learning_rate": 1.362807705350641e-05, + "loss": 1.0535, + "step": 1752 + }, + { + "epoch": 0.40137378362907844, + "grad_norm": 1.175581693649292, + "learning_rate": 1.3621163335256655e-05, + "loss": 1.0902, + "step": 1753 + }, + { + "epoch": 0.40160274756725817, + "grad_norm": 0.9900003671646118, + "learning_rate": 1.3614247624311076e-05, + "loss": 1.0428, + "step": 1754 + }, + { + "epoch": 0.4018317115054379, + "grad_norm": 1.2661924362182617, + "learning_rate": 1.360732992447533e-05, + "loss": 1.0597, + "step": 1755 + }, + { + "epoch": 0.4020606754436176, + "grad_norm": 1.1470941305160522, + "learning_rate": 1.360041023955617e-05, + "loss": 1.1798, + "step": 1756 + }, + { + "epoch": 0.40228963938179735, + "grad_norm": 1.8923383951187134, + "learning_rate": 1.3593488573361442e-05, + "loss": 1.0507, + "step": 1757 + }, + { + "epoch": 0.40251860331997713, + "grad_norm": 1.0174728631973267, + "learning_rate": 1.358656492970008e-05, + "loss": 1.0249, + "step": 1758 + }, + { + "epoch": 0.40274756725815686, + "grad_norm": 1.094590663909912, + "learning_rate": 1.3579639312382105e-05, + "loss": 1.0823, + "step": 1759 + }, + { + "epoch": 0.4029765311963366, + "grad_norm": 1.2006431818008423, + "learning_rate": 1.357271172521863e-05, + "loss": 1.0586, + "step": 1760 + }, + { + "epoch": 0.4032054951345163, + "grad_norm": 1.0652323961257935, + "learning_rate": 1.3565782172021847e-05, + "loss": 1.0759, + "step": 1761 + }, + { + "epoch": 0.40343445907269604, + "grad_norm": 1.5044901371002197, + "learning_rate": 1.3558850656605028e-05, + "loss": 1.1158, + "step": 1762 + }, + { + "epoch": 0.40366342301087577, + "grad_norm": 1.201694130897522, + "learning_rate": 1.355191718278253e-05, + "loss": 1.1144, + "step": 1763 + }, + { + "epoch": 0.40389238694905555, + "grad_norm": 1.799466609954834, + "learning_rate": 1.3544981754369789e-05, + "loss": 1.0865, + "step": 1764 + }, + { + "epoch": 0.4041213508872353, + "grad_norm": 1.0255722999572754, + "learning_rate": 1.3538044375183308e-05, + "loss": 1.071, + "step": 1765 + }, + { + "epoch": 0.404350314825415, + "grad_norm": 1.148310661315918, + "learning_rate": 1.3531105049040667e-05, + "loss": 1.0982, + "step": 1766 + }, + { + "epoch": 0.40457927876359473, + "grad_norm": 1.1922695636749268, + "learning_rate": 1.3524163779760526e-05, + "loss": 1.1228, + "step": 1767 + }, + { + "epoch": 0.40480824270177446, + "grad_norm": 1.0259567499160767, + "learning_rate": 1.3517220571162603e-05, + "loss": 1.0426, + "step": 1768 + }, + { + "epoch": 0.4050372066399542, + "grad_norm": 1.3682341575622559, + "learning_rate": 1.3510275427067688e-05, + "loss": 1.11, + "step": 1769 + }, + { + "epoch": 0.40526617057813397, + "grad_norm": 1.16379976272583, + "learning_rate": 1.3503328351297635e-05, + "loss": 1.0049, + "step": 1770 + }, + { + "epoch": 0.4054951345163137, + "grad_norm": 1.128479242324829, + "learning_rate": 1.3496379347675364e-05, + "loss": 1.0751, + "step": 1771 + }, + { + "epoch": 0.4057240984544934, + "grad_norm": 1.2324275970458984, + "learning_rate": 1.3489428420024851e-05, + "loss": 1.0561, + "step": 1772 + }, + { + "epoch": 0.40595306239267315, + "grad_norm": 1.2429150342941284, + "learning_rate": 1.3482475572171132e-05, + "loss": 1.0709, + "step": 1773 + }, + { + "epoch": 0.4061820263308529, + "grad_norm": 1.3965815305709839, + "learning_rate": 1.3475520807940303e-05, + "loss": 1.1079, + "step": 1774 + }, + { + "epoch": 0.4064109902690326, + "grad_norm": 1.0004675388336182, + "learning_rate": 1.3468564131159515e-05, + "loss": 1.0772, + "step": 1775 + }, + { + "epoch": 0.4066399542072124, + "grad_norm": 1.2566231489181519, + "learning_rate": 1.3461605545656961e-05, + "loss": 1.0527, + "step": 1776 + }, + { + "epoch": 0.4068689181453921, + "grad_norm": 1.2383748292922974, + "learning_rate": 1.3454645055261903e-05, + "loss": 1.0754, + "step": 1777 + }, + { + "epoch": 0.40709788208357184, + "grad_norm": 1.3026020526885986, + "learning_rate": 1.344768266380463e-05, + "loss": 1.0667, + "step": 1778 + }, + { + "epoch": 0.40732684602175157, + "grad_norm": 1.141505241394043, + "learning_rate": 1.3440718375116497e-05, + "loss": 1.184, + "step": 1779 + }, + { + "epoch": 0.4075558099599313, + "grad_norm": 1.1129202842712402, + "learning_rate": 1.3433752193029888e-05, + "loss": 1.0463, + "step": 1780 + }, + { + "epoch": 0.4077847738981111, + "grad_norm": 0.9912295341491699, + "learning_rate": 1.3426784121378233e-05, + "loss": 1.1277, + "step": 1781 + }, + { + "epoch": 0.4080137378362908, + "grad_norm": 1.029266119003296, + "learning_rate": 1.3419814163996007e-05, + "loss": 1.0719, + "step": 1782 + }, + { + "epoch": 0.40824270177447053, + "grad_norm": 1.1671353578567505, + "learning_rate": 1.341284232471872e-05, + "loss": 1.0912, + "step": 1783 + }, + { + "epoch": 0.40847166571265026, + "grad_norm": 1.2449313402175903, + "learning_rate": 1.3405868607382914e-05, + "loss": 1.0755, + "step": 1784 + }, + { + "epoch": 0.40870062965083, + "grad_norm": 1.078808069229126, + "learning_rate": 1.3398893015826166e-05, + "loss": 1.0852, + "step": 1785 + }, + { + "epoch": 0.4089295935890097, + "grad_norm": 1.0642139911651611, + "learning_rate": 1.3391915553887093e-05, + "loss": 1.0864, + "step": 1786 + }, + { + "epoch": 0.4091585575271895, + "grad_norm": 1.050003170967102, + "learning_rate": 1.3384936225405326e-05, + "loss": 1.095, + "step": 1787 + }, + { + "epoch": 0.4093875214653692, + "grad_norm": 1.2181106805801392, + "learning_rate": 1.3377955034221532e-05, + "loss": 1.1276, + "step": 1788 + }, + { + "epoch": 0.40961648540354895, + "grad_norm": 1.3170802593231201, + "learning_rate": 1.3370971984177406e-05, + "loss": 1.0547, + "step": 1789 + }, + { + "epoch": 0.4098454493417287, + "grad_norm": 1.1851341724395752, + "learning_rate": 1.3363987079115656e-05, + "loss": 1.0538, + "step": 1790 + }, + { + "epoch": 0.4100744132799084, + "grad_norm": 1.218276023864746, + "learning_rate": 1.3357000322880024e-05, + "loss": 1.0555, + "step": 1791 + }, + { + "epoch": 0.41030337721808813, + "grad_norm": 1.2883524894714355, + "learning_rate": 1.3350011719315257e-05, + "loss": 1.0051, + "step": 1792 + }, + { + "epoch": 0.4105323411562679, + "grad_norm": 1.1477938890457153, + "learning_rate": 1.334302127226713e-05, + "loss": 1.0466, + "step": 1793 + }, + { + "epoch": 0.41076130509444764, + "grad_norm": 1.0580675601959229, + "learning_rate": 1.333602898558242e-05, + "loss": 1.0661, + "step": 1794 + }, + { + "epoch": 0.41099026903262736, + "grad_norm": 1.4260722398757935, + "learning_rate": 1.3329034863108932e-05, + "loss": 1.0466, + "step": 1795 + }, + { + "epoch": 0.4112192329708071, + "grad_norm": 1.1878693103790283, + "learning_rate": 1.3322038908695466e-05, + "loss": 1.1458, + "step": 1796 + }, + { + "epoch": 0.4114481969089868, + "grad_norm": 1.031032919883728, + "learning_rate": 1.3315041126191845e-05, + "loss": 1.0114, + "step": 1797 + }, + { + "epoch": 0.41167716084716655, + "grad_norm": 1.2380294799804688, + "learning_rate": 1.3308041519448885e-05, + "loss": 1.0351, + "step": 1798 + }, + { + "epoch": 0.41190612478534633, + "grad_norm": 1.2293623685836792, + "learning_rate": 1.3301040092318409e-05, + "loss": 1.0952, + "step": 1799 + }, + { + "epoch": 0.41213508872352606, + "grad_norm": 1.1843045949935913, + "learning_rate": 1.3294036848653247e-05, + "loss": 1.0966, + "step": 1800 + }, + { + "epoch": 0.4123640526617058, + "grad_norm": 1.0566411018371582, + "learning_rate": 1.3287031792307226e-05, + "loss": 1.0596, + "step": 1801 + }, + { + "epoch": 0.4125930165998855, + "grad_norm": 1.1404145956039429, + "learning_rate": 1.3280024927135165e-05, + "loss": 1.0058, + "step": 1802 + }, + { + "epoch": 0.41282198053806524, + "grad_norm": 1.210115909576416, + "learning_rate": 1.3273016256992888e-05, + "loss": 1.0835, + "step": 1803 + }, + { + "epoch": 0.413050944476245, + "grad_norm": 1.214924693107605, + "learning_rate": 1.3266005785737206e-05, + "loss": 1.0879, + "step": 1804 + }, + { + "epoch": 0.41327990841442475, + "grad_norm": 1.529047966003418, + "learning_rate": 1.3258993517225923e-05, + "loss": 1.1117, + "step": 1805 + }, + { + "epoch": 0.4135088723526045, + "grad_norm": 1.181199312210083, + "learning_rate": 1.3251979455317831e-05, + "loss": 1.1248, + "step": 1806 + }, + { + "epoch": 0.4137378362907842, + "grad_norm": 1.1865763664245605, + "learning_rate": 1.3244963603872707e-05, + "loss": 1.1102, + "step": 1807 + }, + { + "epoch": 0.4139668002289639, + "grad_norm": 1.2326041460037231, + "learning_rate": 1.323794596675132e-05, + "loss": 1.0253, + "step": 1808 + }, + { + "epoch": 0.41419576416714365, + "grad_norm": 1.0496649742126465, + "learning_rate": 1.3230926547815413e-05, + "loss": 1.0957, + "step": 1809 + }, + { + "epoch": 0.41442472810532344, + "grad_norm": 1.4039772748947144, + "learning_rate": 1.3223905350927718e-05, + "loss": 1.055, + "step": 1810 + }, + { + "epoch": 0.41465369204350316, + "grad_norm": 1.393269658088684, + "learning_rate": 1.3216882379951932e-05, + "loss": 1.1233, + "step": 1811 + }, + { + "epoch": 0.4148826559816829, + "grad_norm": 1.2282973527908325, + "learning_rate": 1.3209857638752746e-05, + "loss": 1.0997, + "step": 1812 + }, + { + "epoch": 0.4151116199198626, + "grad_norm": 1.080151081085205, + "learning_rate": 1.3202831131195812e-05, + "loss": 1.0984, + "step": 1813 + }, + { + "epoch": 0.41534058385804234, + "grad_norm": 1.2916617393493652, + "learning_rate": 1.3195802861147759e-05, + "loss": 1.1306, + "step": 1814 + }, + { + "epoch": 0.41556954779622207, + "grad_norm": 1.297864317893982, + "learning_rate": 1.318877283247619e-05, + "loss": 1.1278, + "step": 1815 + }, + { + "epoch": 0.41579851173440185, + "grad_norm": 1.1242778301239014, + "learning_rate": 1.3181741049049659e-05, + "loss": 1.0858, + "step": 1816 + }, + { + "epoch": 0.4160274756725816, + "grad_norm": 1.1434422731399536, + "learning_rate": 1.3174707514737706e-05, + "loss": 1.0437, + "step": 1817 + }, + { + "epoch": 0.4162564396107613, + "grad_norm": 1.0877010822296143, + "learning_rate": 1.3167672233410826e-05, + "loss": 1.1197, + "step": 1818 + }, + { + "epoch": 0.41648540354894104, + "grad_norm": 1.1932451725006104, + "learning_rate": 1.3160635208940473e-05, + "loss": 1.1059, + "step": 1819 + }, + { + "epoch": 0.41671436748712076, + "grad_norm": 1.4846535921096802, + "learning_rate": 1.3153596445199063e-05, + "loss": 1.0637, + "step": 1820 + }, + { + "epoch": 0.4169433314253005, + "grad_norm": 1.1383739709854126, + "learning_rate": 1.3146555946059971e-05, + "loss": 1.1125, + "step": 1821 + }, + { + "epoch": 0.41717229536348027, + "grad_norm": 1.1333290338516235, + "learning_rate": 1.3139513715397521e-05, + "loss": 1.0098, + "step": 1822 + }, + { + "epoch": 0.41740125930166, + "grad_norm": 0.9758977293968201, + "learning_rate": 1.3132469757086997e-05, + "loss": 1.0442, + "step": 1823 + }, + { + "epoch": 0.4176302232398397, + "grad_norm": 1.221085786819458, + "learning_rate": 1.3125424075004624e-05, + "loss": 1.0796, + "step": 1824 + }, + { + "epoch": 0.41785918717801945, + "grad_norm": 1.830741286277771, + "learning_rate": 1.3118376673027588e-05, + "loss": 1.078, + "step": 1825 + }, + { + "epoch": 0.4180881511161992, + "grad_norm": 1.274835467338562, + "learning_rate": 1.3111327555034015e-05, + "loss": 1.0829, + "step": 1826 + }, + { + "epoch": 0.41831711505437896, + "grad_norm": 1.2003624439239502, + "learning_rate": 1.3104276724902975e-05, + "loss": 1.0118, + "step": 1827 + }, + { + "epoch": 0.4185460789925587, + "grad_norm": 1.195296049118042, + "learning_rate": 1.3097224186514476e-05, + "loss": 1.0357, + "step": 1828 + }, + { + "epoch": 0.4187750429307384, + "grad_norm": 1.2735960483551025, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.0895, + "step": 1829 + }, + { + "epoch": 0.41900400686891814, + "grad_norm": 1.1189090013504028, + "learning_rate": 1.3083114000489863e-05, + "loss": 1.0777, + "step": 1830 + }, + { + "epoch": 0.41923297080709787, + "grad_norm": 1.1308891773223877, + "learning_rate": 1.3076056360618465e-05, + "loss": 1.0817, + "step": 1831 + }, + { + "epoch": 0.4194619347452776, + "grad_norm": 1.2105430364608765, + "learning_rate": 1.3068997028019043e-05, + "loss": 1.0939, + "step": 1832 + }, + { + "epoch": 0.4196908986834574, + "grad_norm": 1.4238054752349854, + "learning_rate": 1.3061936006576284e-05, + "loss": 1.0822, + "step": 1833 + }, + { + "epoch": 0.4199198626216371, + "grad_norm": 1.037213683128357, + "learning_rate": 1.3054873300175814e-05, + "loss": 1.0473, + "step": 1834 + }, + { + "epoch": 0.42014882655981683, + "grad_norm": 1.4768306016921997, + "learning_rate": 1.3047808912704178e-05, + "loss": 1.0661, + "step": 1835 + }, + { + "epoch": 0.42037779049799656, + "grad_norm": 1.2641279697418213, + "learning_rate": 1.304074284804885e-05, + "loss": 1.0562, + "step": 1836 + }, + { + "epoch": 0.4206067544361763, + "grad_norm": 1.1807743310928345, + "learning_rate": 1.3033675110098227e-05, + "loss": 1.1504, + "step": 1837 + }, + { + "epoch": 0.420835718374356, + "grad_norm": 1.099843978881836, + "learning_rate": 1.3026605702741625e-05, + "loss": 1.0742, + "step": 1838 + }, + { + "epoch": 0.4210646823125358, + "grad_norm": 1.1367111206054688, + "learning_rate": 1.3019534629869281e-05, + "loss": 1.1427, + "step": 1839 + }, + { + "epoch": 0.4212936462507155, + "grad_norm": 1.4131443500518799, + "learning_rate": 1.3012461895372343e-05, + "loss": 1.0812, + "step": 1840 + }, + { + "epoch": 0.42152261018889525, + "grad_norm": 1.1341830492019653, + "learning_rate": 1.3005387503142884e-05, + "loss": 1.0982, + "step": 1841 + }, + { + "epoch": 0.421751574127075, + "grad_norm": 1.1615837812423706, + "learning_rate": 1.2998311457073878e-05, + "loss": 1.0308, + "step": 1842 + }, + { + "epoch": 0.4219805380652547, + "grad_norm": 1.2500286102294922, + "learning_rate": 1.2991233761059214e-05, + "loss": 1.0998, + "step": 1843 + }, + { + "epoch": 0.42220950200343443, + "grad_norm": 1.2073861360549927, + "learning_rate": 1.2984154418993691e-05, + "loss": 1.0588, + "step": 1844 + }, + { + "epoch": 0.4224384659416142, + "grad_norm": 1.1415460109710693, + "learning_rate": 1.2977073434773009e-05, + "loss": 1.0693, + "step": 1845 + }, + { + "epoch": 0.42266742987979394, + "grad_norm": 1.4329516887664795, + "learning_rate": 1.2969990812293779e-05, + "loss": 1.0938, + "step": 1846 + }, + { + "epoch": 0.42289639381797367, + "grad_norm": 1.2857966423034668, + "learning_rate": 1.29629065554535e-05, + "loss": 1.0196, + "step": 1847 + }, + { + "epoch": 0.4231253577561534, + "grad_norm": 1.0637885332107544, + "learning_rate": 1.2955820668150587e-05, + "loss": 1.0602, + "step": 1848 + }, + { + "epoch": 0.4233543216943331, + "grad_norm": 1.3474278450012207, + "learning_rate": 1.2948733154284343e-05, + "loss": 1.135, + "step": 1849 + }, + { + "epoch": 0.42358328563251285, + "grad_norm": 1.1365108489990234, + "learning_rate": 1.2941644017754964e-05, + "loss": 1.1279, + "step": 1850 + }, + { + "epoch": 0.42381224957069263, + "grad_norm": 1.1364717483520508, + "learning_rate": 1.2934553262463548e-05, + "loss": 1.0466, + "step": 1851 + }, + { + "epoch": 0.42404121350887236, + "grad_norm": 1.122428059577942, + "learning_rate": 1.2927460892312071e-05, + "loss": 1.0606, + "step": 1852 + }, + { + "epoch": 0.4242701774470521, + "grad_norm": 1.1230154037475586, + "learning_rate": 1.2920366911203414e-05, + "loss": 1.0651, + "step": 1853 + }, + { + "epoch": 0.4244991413852318, + "grad_norm": 1.150088906288147, + "learning_rate": 1.2913271323041328e-05, + "loss": 1.0204, + "step": 1854 + }, + { + "epoch": 0.42472810532341154, + "grad_norm": 1.1731511354446411, + "learning_rate": 1.2906174131730458e-05, + "loss": 1.1506, + "step": 1855 + }, + { + "epoch": 0.4249570692615913, + "grad_norm": 1.1663103103637695, + "learning_rate": 1.2899075341176326e-05, + "loss": 1.0903, + "step": 1856 + }, + { + "epoch": 0.42518603319977105, + "grad_norm": 1.0978206396102905, + "learning_rate": 1.289197495528534e-05, + "loss": 1.0315, + "step": 1857 + }, + { + "epoch": 0.4254149971379508, + "grad_norm": 1.842656135559082, + "learning_rate": 1.2884872977964786e-05, + "loss": 1.0962, + "step": 1858 + }, + { + "epoch": 0.4256439610761305, + "grad_norm": 1.1557258367538452, + "learning_rate": 1.2877769413122816e-05, + "loss": 1.1225, + "step": 1859 + }, + { + "epoch": 0.42587292501431023, + "grad_norm": 1.0806372165679932, + "learning_rate": 1.2870664264668467e-05, + "loss": 1.0553, + "step": 1860 + }, + { + "epoch": 0.42610188895248996, + "grad_norm": 1.3559293746948242, + "learning_rate": 1.2863557536511642e-05, + "loss": 1.0912, + "step": 1861 + }, + { + "epoch": 0.42633085289066974, + "grad_norm": 1.2274169921875, + "learning_rate": 1.285644923256311e-05, + "loss": 1.1026, + "step": 1862 + }, + { + "epoch": 0.42655981682884947, + "grad_norm": 1.467261791229248, + "learning_rate": 1.2849339356734513e-05, + "loss": 1.0609, + "step": 1863 + }, + { + "epoch": 0.4267887807670292, + "grad_norm": 1.08341646194458, + "learning_rate": 1.284222791293836e-05, + "loss": 1.0122, + "step": 1864 + }, + { + "epoch": 0.4270177447052089, + "grad_norm": 1.18088960647583, + "learning_rate": 1.2835114905088013e-05, + "loss": 1.0016, + "step": 1865 + }, + { + "epoch": 0.42724670864338865, + "grad_norm": 3.6483867168426514, + "learning_rate": 1.2828000337097703e-05, + "loss": 1.0826, + "step": 1866 + }, + { + "epoch": 0.4274756725815684, + "grad_norm": 1.3066470623016357, + "learning_rate": 1.2820884212882521e-05, + "loss": 1.0831, + "step": 1867 + }, + { + "epoch": 0.42770463651974816, + "grad_norm": 1.1415867805480957, + "learning_rate": 1.2813766536358406e-05, + "loss": 1.0411, + "step": 1868 + }, + { + "epoch": 0.4279336004579279, + "grad_norm": 1.0170698165893555, + "learning_rate": 1.2806647311442153e-05, + "loss": 1.0385, + "step": 1869 + }, + { + "epoch": 0.4281625643961076, + "grad_norm": 1.0501190423965454, + "learning_rate": 1.2799526542051419e-05, + "loss": 1.0581, + "step": 1870 + }, + { + "epoch": 0.42839152833428734, + "grad_norm": 1.1775119304656982, + "learning_rate": 1.2792404232104699e-05, + "loss": 1.1704, + "step": 1871 + }, + { + "epoch": 0.42862049227246707, + "grad_norm": 1.1157633066177368, + "learning_rate": 1.2785280385521342e-05, + "loss": 1.0301, + "step": 1872 + }, + { + "epoch": 0.4288494562106468, + "grad_norm": 1.531984567642212, + "learning_rate": 1.277815500622154e-05, + "loss": 1.0217, + "step": 1873 + }, + { + "epoch": 0.4290784201488266, + "grad_norm": 4.276288986206055, + "learning_rate": 1.2771028098126333e-05, + "loss": 1.0464, + "step": 1874 + }, + { + "epoch": 0.4293073840870063, + "grad_norm": 1.2262746095657349, + "learning_rate": 1.2763899665157591e-05, + "loss": 1.0452, + "step": 1875 + }, + { + "epoch": 0.42953634802518603, + "grad_norm": 1.0891977548599243, + "learning_rate": 1.275676971123804e-05, + "loss": 1.0315, + "step": 1876 + }, + { + "epoch": 0.42976531196336576, + "grad_norm": 3.1425435543060303, + "learning_rate": 1.2749638240291227e-05, + "loss": 1.1258, + "step": 1877 + }, + { + "epoch": 0.4299942759015455, + "grad_norm": 1.209585428237915, + "learning_rate": 1.2742505256241543e-05, + "loss": 1.074, + "step": 1878 + }, + { + "epoch": 0.43022323983972527, + "grad_norm": 1.5271508693695068, + "learning_rate": 1.2735370763014212e-05, + "loss": 1.1113, + "step": 1879 + }, + { + "epoch": 0.430452203777905, + "grad_norm": 1.2672470808029175, + "learning_rate": 1.2728234764535283e-05, + "loss": 1.1049, + "step": 1880 + }, + { + "epoch": 0.4306811677160847, + "grad_norm": 1.2539156675338745, + "learning_rate": 1.2721097264731634e-05, + "loss": 1.0472, + "step": 1881 + }, + { + "epoch": 0.43091013165426445, + "grad_norm": 1.542236328125, + "learning_rate": 1.2713958267530976e-05, + "loss": 1.0408, + "step": 1882 + }, + { + "epoch": 0.4311390955924442, + "grad_norm": 1.2664871215820312, + "learning_rate": 1.2706817776861838e-05, + "loss": 1.1148, + "step": 1883 + }, + { + "epoch": 0.4313680595306239, + "grad_norm": 1.2938625812530518, + "learning_rate": 1.269967579665357e-05, + "loss": 1.0531, + "step": 1884 + }, + { + "epoch": 0.4315970234688037, + "grad_norm": 1.3394691944122314, + "learning_rate": 1.2692532330836346e-05, + "loss": 1.0794, + "step": 1885 + }, + { + "epoch": 0.4318259874069834, + "grad_norm": 1.4612385034561157, + "learning_rate": 1.2685387383341157e-05, + "loss": 1.0415, + "step": 1886 + }, + { + "epoch": 0.43205495134516314, + "grad_norm": 1.6666114330291748, + "learning_rate": 1.2678240958099801e-05, + "loss": 1.0744, + "step": 1887 + }, + { + "epoch": 0.43228391528334287, + "grad_norm": 1.2210183143615723, + "learning_rate": 1.2671093059044899e-05, + "loss": 1.0129, + "step": 1888 + }, + { + "epoch": 0.4325128792215226, + "grad_norm": 1.5127156972885132, + "learning_rate": 1.2663943690109885e-05, + "loss": 1.1413, + "step": 1889 + }, + { + "epoch": 0.4327418431597023, + "grad_norm": 1.2559911012649536, + "learning_rate": 1.2656792855228993e-05, + "loss": 1.1116, + "step": 1890 + }, + { + "epoch": 0.4329708070978821, + "grad_norm": 1.1968263387680054, + "learning_rate": 1.2649640558337266e-05, + "loss": 1.1159, + "step": 1891 + }, + { + "epoch": 0.43319977103606183, + "grad_norm": 1.3571308851242065, + "learning_rate": 1.2642486803370553e-05, + "loss": 1.1034, + "step": 1892 + }, + { + "epoch": 0.43342873497424156, + "grad_norm": 1.0277183055877686, + "learning_rate": 1.2635331594265508e-05, + "loss": 1.0414, + "step": 1893 + }, + { + "epoch": 0.4336576989124213, + "grad_norm": 0.9604895114898682, + "learning_rate": 1.2628174934959583e-05, + "loss": 1.066, + "step": 1894 + }, + { + "epoch": 0.433886662850601, + "grad_norm": 0.9915218353271484, + "learning_rate": 1.2621016829391022e-05, + "loss": 1.0006, + "step": 1895 + }, + { + "epoch": 0.43411562678878074, + "grad_norm": 1.2676836252212524, + "learning_rate": 1.2613857281498878e-05, + "loss": 1.0798, + "step": 1896 + }, + { + "epoch": 0.4343445907269605, + "grad_norm": 1.2389227151870728, + "learning_rate": 1.2606696295222985e-05, + "loss": 1.0692, + "step": 1897 + }, + { + "epoch": 0.43457355466514025, + "grad_norm": 1.2961418628692627, + "learning_rate": 1.2599533874503978e-05, + "loss": 1.0435, + "step": 1898 + }, + { + "epoch": 0.43480251860332, + "grad_norm": 1.2095304727554321, + "learning_rate": 1.2592370023283268e-05, + "loss": 1.0942, + "step": 1899 + }, + { + "epoch": 0.4350314825414997, + "grad_norm": 0.9683312773704529, + "learning_rate": 1.2585204745503072e-05, + "loss": 1.1888, + "step": 1900 + }, + { + "epoch": 0.43526044647967943, + "grad_norm": 1.135977029800415, + "learning_rate": 1.2578038045106383e-05, + "loss": 1.1095, + "step": 1901 + }, + { + "epoch": 0.4354894104178592, + "grad_norm": 1.0904693603515625, + "learning_rate": 1.2570869926036968e-05, + "loss": 1.1369, + "step": 1902 + }, + { + "epoch": 0.43571837435603894, + "grad_norm": 1.4972336292266846, + "learning_rate": 1.2563700392239387e-05, + "loss": 1.0466, + "step": 1903 + }, + { + "epoch": 0.43594733829421867, + "grad_norm": 1.0306872129440308, + "learning_rate": 1.2556529447658974e-05, + "loss": 1.1408, + "step": 1904 + }, + { + "epoch": 0.4361763022323984, + "grad_norm": 1.2561451196670532, + "learning_rate": 1.2549357096241841e-05, + "loss": 1.0328, + "step": 1905 + }, + { + "epoch": 0.4364052661705781, + "grad_norm": 1.3269802331924438, + "learning_rate": 1.2542183341934873e-05, + "loss": 1.0962, + "step": 1906 + }, + { + "epoch": 0.43663423010875785, + "grad_norm": 1.1158679723739624, + "learning_rate": 1.253500818868572e-05, + "loss": 1.0914, + "step": 1907 + }, + { + "epoch": 0.43686319404693763, + "grad_norm": 1.4163261651992798, + "learning_rate": 1.252783164044282e-05, + "loss": 1.0665, + "step": 1908 + }, + { + "epoch": 0.43709215798511736, + "grad_norm": 1.1557682752609253, + "learning_rate": 1.2520653701155361e-05, + "loss": 1.0816, + "step": 1909 + }, + { + "epoch": 0.4373211219232971, + "grad_norm": 1.1793196201324463, + "learning_rate": 1.25134743747733e-05, + "loss": 1.0406, + "step": 1910 + }, + { + "epoch": 0.4375500858614768, + "grad_norm": 1.505786657333374, + "learning_rate": 1.2506293665247366e-05, + "loss": 1.1074, + "step": 1911 + }, + { + "epoch": 0.43777904979965654, + "grad_norm": 1.6487133502960205, + "learning_rate": 1.2499111576529042e-05, + "loss": 1.1171, + "step": 1912 + }, + { + "epoch": 0.43800801373783627, + "grad_norm": 1.204660177230835, + "learning_rate": 1.2491928112570568e-05, + "loss": 0.9898, + "step": 1913 + }, + { + "epoch": 0.43823697767601605, + "grad_norm": 1.1147689819335938, + "learning_rate": 1.2484743277324945e-05, + "loss": 0.9693, + "step": 1914 + }, + { + "epoch": 0.4384659416141958, + "grad_norm": 1.136470079421997, + "learning_rate": 1.2477557074745932e-05, + "loss": 1.0785, + "step": 1915 + }, + { + "epoch": 0.4386949055523755, + "grad_norm": 1.2387357950210571, + "learning_rate": 1.247036950878803e-05, + "loss": 1.1218, + "step": 1916 + }, + { + "epoch": 0.43892386949055523, + "grad_norm": 1.831175446510315, + "learning_rate": 1.24631805834065e-05, + "loss": 1.0891, + "step": 1917 + }, + { + "epoch": 0.43915283342873496, + "grad_norm": 1.2181199789047241, + "learning_rate": 1.2455990302557346e-05, + "loss": 1.0932, + "step": 1918 + }, + { + "epoch": 0.4393817973669147, + "grad_norm": 1.262529730796814, + "learning_rate": 1.2448798670197318e-05, + "loss": 1.0351, + "step": 1919 + }, + { + "epoch": 0.43961076130509447, + "grad_norm": 1.10959792137146, + "learning_rate": 1.2441605690283915e-05, + "loss": 1.1082, + "step": 1920 + }, + { + "epoch": 0.4398397252432742, + "grad_norm": 1.1682922840118408, + "learning_rate": 1.2434411366775367e-05, + "loss": 1.1353, + "step": 1921 + }, + { + "epoch": 0.4400686891814539, + "grad_norm": 1.2102086544036865, + "learning_rate": 1.242721570363066e-05, + "loss": 1.0569, + "step": 1922 + }, + { + "epoch": 0.44029765311963365, + "grad_norm": 1.1970107555389404, + "learning_rate": 1.2420018704809497e-05, + "loss": 1.0502, + "step": 1923 + }, + { + "epoch": 0.4405266170578134, + "grad_norm": 1.2726011276245117, + "learning_rate": 1.2412820374272332e-05, + "loss": 1.0845, + "step": 1924 + }, + { + "epoch": 0.44075558099599316, + "grad_norm": 1.1796960830688477, + "learning_rate": 1.2405620715980345e-05, + "loss": 1.0449, + "step": 1925 + }, + { + "epoch": 0.4409845449341729, + "grad_norm": 1.0175650119781494, + "learning_rate": 1.2398419733895444e-05, + "loss": 1.0563, + "step": 1926 + }, + { + "epoch": 0.4412135088723526, + "grad_norm": 2.8849644660949707, + "learning_rate": 1.2391217431980273e-05, + "loss": 1.1089, + "step": 1927 + }, + { + "epoch": 0.44144247281053234, + "grad_norm": 1.4662426710128784, + "learning_rate": 1.2384013814198197e-05, + "loss": 1.1497, + "step": 1928 + }, + { + "epoch": 0.44167143674871207, + "grad_norm": 1.102353811264038, + "learning_rate": 1.2376808884513306e-05, + "loss": 1.1013, + "step": 1929 + }, + { + "epoch": 0.4419004006868918, + "grad_norm": 2.3300585746765137, + "learning_rate": 1.2369602646890415e-05, + "loss": 1.1192, + "step": 1930 + }, + { + "epoch": 0.4421293646250716, + "grad_norm": 1.2267271280288696, + "learning_rate": 1.2362395105295054e-05, + "loss": 1.069, + "step": 1931 + }, + { + "epoch": 0.4423583285632513, + "grad_norm": 1.4739381074905396, + "learning_rate": 1.235518626369347e-05, + "loss": 1.0883, + "step": 1932 + }, + { + "epoch": 0.44258729250143103, + "grad_norm": 1.5360037088394165, + "learning_rate": 1.2347976126052631e-05, + "loss": 1.0633, + "step": 1933 + }, + { + "epoch": 0.44281625643961076, + "grad_norm": 1.3187732696533203, + "learning_rate": 1.234076469634022e-05, + "loss": 1.0849, + "step": 1934 + }, + { + "epoch": 0.4430452203777905, + "grad_norm": 1.0551137924194336, + "learning_rate": 1.2333551978524618e-05, + "loss": 1.0977, + "step": 1935 + }, + { + "epoch": 0.4432741843159702, + "grad_norm": 1.3932082653045654, + "learning_rate": 1.2326337976574928e-05, + "loss": 1.0336, + "step": 1936 + }, + { + "epoch": 0.44350314825415, + "grad_norm": 1.1481599807739258, + "learning_rate": 1.2319122694460952e-05, + "loss": 1.0941, + "step": 1937 + }, + { + "epoch": 0.4437321121923297, + "grad_norm": 1.5443516969680786, + "learning_rate": 1.2311906136153202e-05, + "loss": 1.0848, + "step": 1938 + }, + { + "epoch": 0.44396107613050945, + "grad_norm": 1.0437678098678589, + "learning_rate": 1.2304688305622889e-05, + "loss": 1.1458, + "step": 1939 + }, + { + "epoch": 0.4441900400686892, + "grad_norm": 1.0990891456604004, + "learning_rate": 1.2297469206841921e-05, + "loss": 1.066, + "step": 1940 + }, + { + "epoch": 0.4444190040068689, + "grad_norm": 1.1834927797317505, + "learning_rate": 1.2290248843782915e-05, + "loss": 1.0384, + "step": 1941 + }, + { + "epoch": 0.44464796794504863, + "grad_norm": 1.1914305686950684, + "learning_rate": 1.2283027220419169e-05, + "loss": 1.0453, + "step": 1942 + }, + { + "epoch": 0.4448769318832284, + "grad_norm": 1.607132911682129, + "learning_rate": 1.2275804340724684e-05, + "loss": 1.033, + "step": 1943 + }, + { + "epoch": 0.44510589582140814, + "grad_norm": 1.1510486602783203, + "learning_rate": 1.2268580208674151e-05, + "loss": 1.0786, + "step": 1944 + }, + { + "epoch": 0.44533485975958786, + "grad_norm": 1.0266963243484497, + "learning_rate": 1.2261354828242948e-05, + "loss": 1.0426, + "step": 1945 + }, + { + "epoch": 0.4455638236977676, + "grad_norm": 1.0529321432113647, + "learning_rate": 1.2254128203407146e-05, + "loss": 1.0221, + "step": 1946 + }, + { + "epoch": 0.4457927876359473, + "grad_norm": 1.0336111783981323, + "learning_rate": 1.2246900338143485e-05, + "loss": 1.0285, + "step": 1947 + }, + { + "epoch": 0.4460217515741271, + "grad_norm": 1.5184695720672607, + "learning_rate": 1.2239671236429413e-05, + "loss": 1.0077, + "step": 1948 + }, + { + "epoch": 0.44625071551230683, + "grad_norm": 1.1687562465667725, + "learning_rate": 1.2232440902243037e-05, + "loss": 1.0827, + "step": 1949 + }, + { + "epoch": 0.44647967945048656, + "grad_norm": 1.2201441526412964, + "learning_rate": 1.2225209339563144e-05, + "loss": 1.0207, + "step": 1950 + }, + { + "epoch": 0.4467086433886663, + "grad_norm": 1.1205480098724365, + "learning_rate": 1.2217976552369213e-05, + "loss": 1.0461, + "step": 1951 + }, + { + "epoch": 0.446937607326846, + "grad_norm": 2.142512083053589, + "learning_rate": 1.2210742544641377e-05, + "loss": 1.1122, + "step": 1952 + }, + { + "epoch": 0.44716657126502574, + "grad_norm": 1.682520866394043, + "learning_rate": 1.2203507320360458e-05, + "loss": 1.0742, + "step": 1953 + }, + { + "epoch": 0.4473955352032055, + "grad_norm": 1.2594259977340698, + "learning_rate": 1.2196270883507927e-05, + "loss": 1.0582, + "step": 1954 + }, + { + "epoch": 0.44762449914138525, + "grad_norm": 1.0667154788970947, + "learning_rate": 1.218903323806595e-05, + "loss": 1.0335, + "step": 1955 + }, + { + "epoch": 0.447853463079565, + "grad_norm": 1.5725624561309814, + "learning_rate": 1.2181794388017332e-05, + "loss": 1.0779, + "step": 1956 + }, + { + "epoch": 0.4480824270177447, + "grad_norm": 1.0927304029464722, + "learning_rate": 1.2174554337345555e-05, + "loss": 1.0951, + "step": 1957 + }, + { + "epoch": 0.4483113909559244, + "grad_norm": 1.3435699939727783, + "learning_rate": 1.2167313090034756e-05, + "loss": 1.0554, + "step": 1958 + }, + { + "epoch": 0.44854035489410415, + "grad_norm": 1.0855443477630615, + "learning_rate": 1.2160070650069735e-05, + "loss": 1.1022, + "step": 1959 + }, + { + "epoch": 0.44876931883228394, + "grad_norm": 1.2444900274276733, + "learning_rate": 1.2152827021435946e-05, + "loss": 1.0268, + "step": 1960 + }, + { + "epoch": 0.44899828277046366, + "grad_norm": 1.4539769887924194, + "learning_rate": 1.2145582208119497e-05, + "loss": 1.1199, + "step": 1961 + }, + { + "epoch": 0.4492272467086434, + "grad_norm": 1.8856399059295654, + "learning_rate": 1.2138336214107148e-05, + "loss": 1.0528, + "step": 1962 + }, + { + "epoch": 0.4494562106468231, + "grad_norm": 1.1333612203598022, + "learning_rate": 1.2131089043386305e-05, + "loss": 1.0599, + "step": 1963 + }, + { + "epoch": 0.44968517458500284, + "grad_norm": 1.1188855171203613, + "learning_rate": 1.212384069994503e-05, + "loss": 1.0356, + "step": 1964 + }, + { + "epoch": 0.44991413852318257, + "grad_norm": 1.4274768829345703, + "learning_rate": 1.2116591187772026e-05, + "loss": 1.0944, + "step": 1965 + }, + { + "epoch": 0.45014310246136235, + "grad_norm": 1.637333631515503, + "learning_rate": 1.2109340510856633e-05, + "loss": 1.098, + "step": 1966 + }, + { + "epoch": 0.4503720663995421, + "grad_norm": 1.5847759246826172, + "learning_rate": 1.2102088673188845e-05, + "loss": 1.0764, + "step": 1967 + }, + { + "epoch": 0.4506010303377218, + "grad_norm": 1.5479519367218018, + "learning_rate": 1.2094835678759282e-05, + "loss": 1.118, + "step": 1968 + }, + { + "epoch": 0.45082999427590154, + "grad_norm": 1.1684271097183228, + "learning_rate": 1.2087581531559208e-05, + "loss": 1.0399, + "step": 1969 + }, + { + "epoch": 0.45105895821408126, + "grad_norm": 1.4510235786437988, + "learning_rate": 1.2080326235580521e-05, + "loss": 1.0827, + "step": 1970 + }, + { + "epoch": 0.45128792215226105, + "grad_norm": 1.1053940057754517, + "learning_rate": 1.2073069794815748e-05, + "loss": 1.0701, + "step": 1971 + }, + { + "epoch": 0.45151688609044077, + "grad_norm": 1.083375334739685, + "learning_rate": 1.2065812213258051e-05, + "loss": 1.0956, + "step": 1972 + }, + { + "epoch": 0.4517458500286205, + "grad_norm": 1.1666209697723389, + "learning_rate": 1.2058553494901203e-05, + "loss": 1.0781, + "step": 1973 + }, + { + "epoch": 0.4519748139668002, + "grad_norm": 1.2016594409942627, + "learning_rate": 1.2051293643739634e-05, + "loss": 1.0406, + "step": 1974 + }, + { + "epoch": 0.45220377790497995, + "grad_norm": 1.4277703762054443, + "learning_rate": 1.204403266376837e-05, + "loss": 1.0543, + "step": 1975 + }, + { + "epoch": 0.4524327418431597, + "grad_norm": 1.206367015838623, + "learning_rate": 1.2036770558983067e-05, + "loss": 1.0423, + "step": 1976 + }, + { + "epoch": 0.45266170578133946, + "grad_norm": 1.0165696144104004, + "learning_rate": 1.202950733338e-05, + "loss": 0.9945, + "step": 1977 + }, + { + "epoch": 0.4528906697195192, + "grad_norm": 0.9572392702102661, + "learning_rate": 1.2022242990956064e-05, + "loss": 1.07, + "step": 1978 + }, + { + "epoch": 0.4531196336576989, + "grad_norm": 2.232534885406494, + "learning_rate": 1.2014977535708767e-05, + "loss": 1.1249, + "step": 1979 + }, + { + "epoch": 0.45334859759587864, + "grad_norm": 1.0949493646621704, + "learning_rate": 1.2007710971636221e-05, + "loss": 0.9862, + "step": 1980 + }, + { + "epoch": 0.45357756153405837, + "grad_norm": 1.493613600730896, + "learning_rate": 1.2000443302737162e-05, + "loss": 1.0943, + "step": 1981 + }, + { + "epoch": 0.4538065254722381, + "grad_norm": 1.0247379541397095, + "learning_rate": 1.1993174533010928e-05, + "loss": 1.0921, + "step": 1982 + }, + { + "epoch": 0.4540354894104179, + "grad_norm": 1.1227061748504639, + "learning_rate": 1.1985904666457455e-05, + "loss": 1.0995, + "step": 1983 + }, + { + "epoch": 0.4542644533485976, + "grad_norm": 1.3855504989624023, + "learning_rate": 1.1978633707077296e-05, + "loss": 1.0868, + "step": 1984 + }, + { + "epoch": 0.45449341728677733, + "grad_norm": 1.058700680732727, + "learning_rate": 1.1971361658871597e-05, + "loss": 1.0761, + "step": 1985 + }, + { + "epoch": 0.45472238122495706, + "grad_norm": 1.199990153312683, + "learning_rate": 1.1964088525842108e-05, + "loss": 1.0743, + "step": 1986 + }, + { + "epoch": 0.4549513451631368, + "grad_norm": 1.074346661567688, + "learning_rate": 1.1956814311991164e-05, + "loss": 1.0791, + "step": 1987 + }, + { + "epoch": 0.4551803091013165, + "grad_norm": 1.4295293092727661, + "learning_rate": 1.1949539021321713e-05, + "loss": 1.0344, + "step": 1988 + }, + { + "epoch": 0.4554092730394963, + "grad_norm": 1.2947413921356201, + "learning_rate": 1.1942262657837285e-05, + "loss": 1.0676, + "step": 1989 + }, + { + "epoch": 0.455638236977676, + "grad_norm": 1.270447015762329, + "learning_rate": 1.1934985225541998e-05, + "loss": 1.0626, + "step": 1990 + }, + { + "epoch": 0.45586720091585575, + "grad_norm": 1.2195264101028442, + "learning_rate": 1.1927706728440565e-05, + "loss": 1.0508, + "step": 1991 + }, + { + "epoch": 0.4560961648540355, + "grad_norm": 1.2984168529510498, + "learning_rate": 1.192042717053828e-05, + "loss": 1.0517, + "step": 1992 + }, + { + "epoch": 0.4563251287922152, + "grad_norm": 1.2059777975082397, + "learning_rate": 1.1913146555841027e-05, + "loss": 1.0639, + "step": 1993 + }, + { + "epoch": 0.456554092730395, + "grad_norm": 1.2496461868286133, + "learning_rate": 1.1905864888355264e-05, + "loss": 1.1124, + "step": 1994 + }, + { + "epoch": 0.4567830566685747, + "grad_norm": 1.3210448026657104, + "learning_rate": 1.1898582172088027e-05, + "loss": 1.1523, + "step": 1995 + }, + { + "epoch": 0.45701202060675444, + "grad_norm": 1.288273811340332, + "learning_rate": 1.1891298411046943e-05, + "loss": 1.0392, + "step": 1996 + }, + { + "epoch": 0.45724098454493417, + "grad_norm": 1.6366710662841797, + "learning_rate": 1.18840136092402e-05, + "loss": 1.1138, + "step": 1997 + }, + { + "epoch": 0.4574699484831139, + "grad_norm": 0.9694288372993469, + "learning_rate": 1.1876727770676562e-05, + "loss": 1.0257, + "step": 1998 + }, + { + "epoch": 0.4576989124212936, + "grad_norm": 1.0713609457015991, + "learning_rate": 1.1869440899365365e-05, + "loss": 1.0785, + "step": 1999 + }, + { + "epoch": 0.4579278763594734, + "grad_norm": 0.9268698692321777, + "learning_rate": 1.1862152999316517e-05, + "loss": 1.0387, + "step": 2000 + }, + { + "epoch": 0.45815684029765313, + "grad_norm": 1.066428780555725, + "learning_rate": 1.1854864074540484e-05, + "loss": 1.0959, + "step": 2001 + }, + { + "epoch": 0.45838580423583286, + "grad_norm": 1.2952028512954712, + "learning_rate": 1.18475741290483e-05, + "loss": 1.0896, + "step": 2002 + }, + { + "epoch": 0.4586147681740126, + "grad_norm": 1.1627318859100342, + "learning_rate": 1.184028316685157e-05, + "loss": 1.0602, + "step": 2003 + }, + { + "epoch": 0.4588437321121923, + "grad_norm": 1.2692722082138062, + "learning_rate": 1.1832991191962435e-05, + "loss": 1.1016, + "step": 2004 + }, + { + "epoch": 0.45907269605037204, + "grad_norm": 1.1311182975769043, + "learning_rate": 1.182569820839362e-05, + "loss": 1.0275, + "step": 2005 + }, + { + "epoch": 0.4593016599885518, + "grad_norm": 1.1217960119247437, + "learning_rate": 1.1818404220158382e-05, + "loss": 1.1007, + "step": 2006 + }, + { + "epoch": 0.45953062392673155, + "grad_norm": 1.1474188566207886, + "learning_rate": 1.181110923127055e-05, + "loss": 1.0613, + "step": 2007 + }, + { + "epoch": 0.4597595878649113, + "grad_norm": 1.1906862258911133, + "learning_rate": 1.1803813245744495e-05, + "loss": 1.1043, + "step": 2008 + }, + { + "epoch": 0.459988551803091, + "grad_norm": 1.391391634941101, + "learning_rate": 1.179651626759513e-05, + "loss": 1.1015, + "step": 2009 + }, + { + "epoch": 0.46021751574127073, + "grad_norm": 1.3041538000106812, + "learning_rate": 1.1789218300837929e-05, + "loss": 1.0601, + "step": 2010 + }, + { + "epoch": 0.46044647967945046, + "grad_norm": 1.484880805015564, + "learning_rate": 1.1781919349488894e-05, + "loss": 1.0303, + "step": 2011 + }, + { + "epoch": 0.46067544361763024, + "grad_norm": 1.271309733390808, + "learning_rate": 1.177461941756458e-05, + "loss": 1.0293, + "step": 2012 + }, + { + "epoch": 0.46090440755580997, + "grad_norm": 1.0927857160568237, + "learning_rate": 1.1767318509082083e-05, + "loss": 1.0673, + "step": 2013 + }, + { + "epoch": 0.4611333714939897, + "grad_norm": 1.1591389179229736, + "learning_rate": 1.1760016628059026e-05, + "loss": 1.1406, + "step": 2014 + }, + { + "epoch": 0.4613623354321694, + "grad_norm": 1.0383858680725098, + "learning_rate": 1.1752713778513576e-05, + "loss": 1.0687, + "step": 2015 + }, + { + "epoch": 0.46159129937034915, + "grad_norm": 1.1415190696716309, + "learning_rate": 1.1745409964464425e-05, + "loss": 1.094, + "step": 2016 + }, + { + "epoch": 0.46182026330852893, + "grad_norm": 1.0855273008346558, + "learning_rate": 1.1738105189930807e-05, + "loss": 1.1396, + "step": 2017 + }, + { + "epoch": 0.46204922724670866, + "grad_norm": 1.1211069822311401, + "learning_rate": 1.1730799458932473e-05, + "loss": 1.0809, + "step": 2018 + }, + { + "epoch": 0.4622781911848884, + "grad_norm": 1.233792781829834, + "learning_rate": 1.1723492775489709e-05, + "loss": 1.0569, + "step": 2019 + }, + { + "epoch": 0.4625071551230681, + "grad_norm": 1.1317369937896729, + "learning_rate": 1.1716185143623322e-05, + "loss": 1.0652, + "step": 2020 + }, + { + "epoch": 0.46273611906124784, + "grad_norm": 1.083250880241394, + "learning_rate": 1.1708876567354635e-05, + "loss": 1.0556, + "step": 2021 + }, + { + "epoch": 0.46296508299942757, + "grad_norm": 1.1673812866210938, + "learning_rate": 1.1701567050705504e-05, + "loss": 1.0599, + "step": 2022 + }, + { + "epoch": 0.46319404693760735, + "grad_norm": 1.2285277843475342, + "learning_rate": 1.1694256597698288e-05, + "loss": 1.0872, + "step": 2023 + }, + { + "epoch": 0.4634230108757871, + "grad_norm": 1.4135609865188599, + "learning_rate": 1.168694521235587e-05, + "loss": 1.0408, + "step": 2024 + }, + { + "epoch": 0.4636519748139668, + "grad_norm": 1.0683159828186035, + "learning_rate": 1.1679632898701649e-05, + "loss": 1.0787, + "step": 2025 + }, + { + "epoch": 0.46388093875214653, + "grad_norm": 1.5515210628509521, + "learning_rate": 1.1672319660759523e-05, + "loss": 1.0343, + "step": 2026 + }, + { + "epoch": 0.46410990269032626, + "grad_norm": 1.0499638319015503, + "learning_rate": 1.1665005502553912e-05, + "loss": 1.0641, + "step": 2027 + }, + { + "epoch": 0.464338866628506, + "grad_norm": 1.0508882999420166, + "learning_rate": 1.165769042810973e-05, + "loss": 1.0326, + "step": 2028 + }, + { + "epoch": 0.46456783056668577, + "grad_norm": 1.1158897876739502, + "learning_rate": 1.1650374441452403e-05, + "loss": 1.0767, + "step": 2029 + }, + { + "epoch": 0.4647967945048655, + "grad_norm": 1.1368999481201172, + "learning_rate": 1.1643057546607858e-05, + "loss": 1.0745, + "step": 2030 + }, + { + "epoch": 0.4650257584430452, + "grad_norm": 1.5753467082977295, + "learning_rate": 1.1635739747602522e-05, + "loss": 1.0078, + "step": 2031 + }, + { + "epoch": 0.46525472238122495, + "grad_norm": 1.261248230934143, + "learning_rate": 1.1628421048463315e-05, + "loss": 0.9994, + "step": 2032 + }, + { + "epoch": 0.4654836863194047, + "grad_norm": 1.1466697454452515, + "learning_rate": 1.1621101453217656e-05, + "loss": 1.0402, + "step": 2033 + }, + { + "epoch": 0.4657126502575844, + "grad_norm": 1.1229275465011597, + "learning_rate": 1.1613780965893465e-05, + "loss": 1.0619, + "step": 2034 + }, + { + "epoch": 0.4659416141957642, + "grad_norm": 1.118383526802063, + "learning_rate": 1.1606459590519132e-05, + "loss": 1.0626, + "step": 2035 + }, + { + "epoch": 0.4661705781339439, + "grad_norm": 1.139142632484436, + "learning_rate": 1.1599137331123558e-05, + "loss": 1.0958, + "step": 2036 + }, + { + "epoch": 0.46639954207212364, + "grad_norm": 1.29563570022583, + "learning_rate": 1.1591814191736117e-05, + "loss": 1.042, + "step": 2037 + }, + { + "epoch": 0.46662850601030337, + "grad_norm": 1.1359105110168457, + "learning_rate": 1.1584490176386671e-05, + "loss": 1.081, + "step": 2038 + }, + { + "epoch": 0.4668574699484831, + "grad_norm": 1.0732991695404053, + "learning_rate": 1.1577165289105565e-05, + "loss": 0.9929, + "step": 2039 + }, + { + "epoch": 0.4670864338866629, + "grad_norm": 1.1904350519180298, + "learning_rate": 1.1569839533923626e-05, + "loss": 1.0244, + "step": 2040 + }, + { + "epoch": 0.4673153978248426, + "grad_norm": 1.1562821865081787, + "learning_rate": 1.1562512914872152e-05, + "loss": 1.1229, + "step": 2041 + }, + { + "epoch": 0.46754436176302233, + "grad_norm": 1.1359809637069702, + "learning_rate": 1.1555185435982923e-05, + "loss": 1.0961, + "step": 2042 + }, + { + "epoch": 0.46777332570120206, + "grad_norm": 1.1743630170822144, + "learning_rate": 1.1547857101288185e-05, + "loss": 1.1129, + "step": 2043 + }, + { + "epoch": 0.4680022896393818, + "grad_norm": 1.1456128358840942, + "learning_rate": 1.154052791482066e-05, + "loss": 1.0655, + "step": 2044 + }, + { + "epoch": 0.4682312535775615, + "grad_norm": 1.235987663269043, + "learning_rate": 1.153319788061354e-05, + "loss": 0.9827, + "step": 2045 + }, + { + "epoch": 0.4684602175157413, + "grad_norm": 1.3438743352890015, + "learning_rate": 1.1525867002700484e-05, + "loss": 1.0906, + "step": 2046 + }, + { + "epoch": 0.468689181453921, + "grad_norm": 1.1594905853271484, + "learning_rate": 1.1518535285115604e-05, + "loss": 1.0886, + "step": 2047 + }, + { + "epoch": 0.46891814539210075, + "grad_norm": 1.3096404075622559, + "learning_rate": 1.1511202731893493e-05, + "loss": 1.108, + "step": 2048 + }, + { + "epoch": 0.4691471093302805, + "grad_norm": 1.1685811281204224, + "learning_rate": 1.1503869347069186e-05, + "loss": 1.0289, + "step": 2049 + }, + { + "epoch": 0.4693760732684602, + "grad_norm": 1.7220717668533325, + "learning_rate": 1.1496535134678186e-05, + "loss": 1.0488, + "step": 2050 + }, + { + "epoch": 0.46960503720663993, + "grad_norm": 1.417108416557312, + "learning_rate": 1.1489200098756447e-05, + "loss": 1.0304, + "step": 2051 + }, + { + "epoch": 0.4698340011448197, + "grad_norm": 1.0246206521987915, + "learning_rate": 1.1481864243340381e-05, + "loss": 1.0297, + "step": 2052 + }, + { + "epoch": 0.47006296508299944, + "grad_norm": 1.2509313821792603, + "learning_rate": 1.1474527572466847e-05, + "loss": 1.1336, + "step": 2053 + }, + { + "epoch": 0.47029192902117917, + "grad_norm": 1.0300573110580444, + "learning_rate": 1.1467190090173147e-05, + "loss": 1.0812, + "step": 2054 + }, + { + "epoch": 0.4705208929593589, + "grad_norm": 1.067595362663269, + "learning_rate": 1.1459851800497048e-05, + "loss": 1.1246, + "step": 2055 + }, + { + "epoch": 0.4707498568975386, + "grad_norm": 1.1726340055465698, + "learning_rate": 1.145251270747674e-05, + "loss": 1.0196, + "step": 2056 + }, + { + "epoch": 0.47097882083571835, + "grad_norm": 1.0472357273101807, + "learning_rate": 1.1445172815150864e-05, + "loss": 1.0323, + "step": 2057 + }, + { + "epoch": 0.47120778477389813, + "grad_norm": 2.4607527256011963, + "learning_rate": 1.1437832127558508e-05, + "loss": 1.073, + "step": 2058 + }, + { + "epoch": 0.47143674871207786, + "grad_norm": 0.9941400289535522, + "learning_rate": 1.1430490648739185e-05, + "loss": 1.0418, + "step": 2059 + }, + { + "epoch": 0.4716657126502576, + "grad_norm": 1.4227428436279297, + "learning_rate": 1.1423148382732854e-05, + "loss": 1.1564, + "step": 2060 + }, + { + "epoch": 0.4718946765884373, + "grad_norm": 1.135663390159607, + "learning_rate": 1.1415805333579895e-05, + "loss": 1.0869, + "step": 2061 + }, + { + "epoch": 0.47212364052661704, + "grad_norm": 1.1211762428283691, + "learning_rate": 1.1408461505321136e-05, + "loss": 1.0782, + "step": 2062 + }, + { + "epoch": 0.4723526044647968, + "grad_norm": 1.7367572784423828, + "learning_rate": 1.1401116901997815e-05, + "loss": 1.0039, + "step": 2063 + }, + { + "epoch": 0.47258156840297655, + "grad_norm": 1.2560068368911743, + "learning_rate": 1.1393771527651614e-05, + "loss": 1.0838, + "step": 2064 + }, + { + "epoch": 0.4728105323411563, + "grad_norm": 1.2819138765335083, + "learning_rate": 1.1386425386324622e-05, + "loss": 0.999, + "step": 2065 + }, + { + "epoch": 0.473039496279336, + "grad_norm": 1.2946174144744873, + "learning_rate": 1.1379078482059367e-05, + "loss": 1.0082, + "step": 2066 + }, + { + "epoch": 0.47326846021751573, + "grad_norm": 1.1478831768035889, + "learning_rate": 1.1371730818898785e-05, + "loss": 1.114, + "step": 2067 + }, + { + "epoch": 0.47349742415569546, + "grad_norm": 1.1708440780639648, + "learning_rate": 1.1364382400886233e-05, + "loss": 1.0527, + "step": 2068 + }, + { + "epoch": 0.47372638809387524, + "grad_norm": 1.5635682344436646, + "learning_rate": 1.1357033232065484e-05, + "loss": 1.0343, + "step": 2069 + }, + { + "epoch": 0.47395535203205497, + "grad_norm": 1.4606600999832153, + "learning_rate": 1.134968331648073e-05, + "loss": 1.0216, + "step": 2070 + }, + { + "epoch": 0.4741843159702347, + "grad_norm": 1.3325625658035278, + "learning_rate": 1.1342332658176556e-05, + "loss": 1.0921, + "step": 2071 + }, + { + "epoch": 0.4744132799084144, + "grad_norm": 1.0679417848587036, + "learning_rate": 1.1334981261197977e-05, + "loss": 1.1267, + "step": 2072 + }, + { + "epoch": 0.47464224384659415, + "grad_norm": 1.4468427896499634, + "learning_rate": 1.1327629129590402e-05, + "loss": 1.0597, + "step": 2073 + }, + { + "epoch": 0.4748712077847739, + "grad_norm": 1.1622424125671387, + "learning_rate": 1.132027626739965e-05, + "loss": 1.0962, + "step": 2074 + }, + { + "epoch": 0.47510017172295366, + "grad_norm": 1.3740299940109253, + "learning_rate": 1.1312922678671935e-05, + "loss": 1.0671, + "step": 2075 + }, + { + "epoch": 0.4753291356611334, + "grad_norm": 1.3251038789749146, + "learning_rate": 1.1305568367453877e-05, + "loss": 1.0959, + "step": 2076 + }, + { + "epoch": 0.4755580995993131, + "grad_norm": 1.244946002960205, + "learning_rate": 1.1298213337792494e-05, + "loss": 1.0705, + "step": 2077 + }, + { + "epoch": 0.47578706353749284, + "grad_norm": 1.1645523309707642, + "learning_rate": 1.1290857593735196e-05, + "loss": 1.097, + "step": 2078 + }, + { + "epoch": 0.47601602747567257, + "grad_norm": 1.0410329103469849, + "learning_rate": 1.1283501139329787e-05, + "loss": 1.0381, + "step": 2079 + }, + { + "epoch": 0.4762449914138523, + "grad_norm": 1.137269377708435, + "learning_rate": 1.1276143978624457e-05, + "loss": 1.0412, + "step": 2080 + }, + { + "epoch": 0.4764739553520321, + "grad_norm": 1.1206343173980713, + "learning_rate": 1.1268786115667798e-05, + "loss": 1.0825, + "step": 2081 + }, + { + "epoch": 0.4767029192902118, + "grad_norm": 1.176672339439392, + "learning_rate": 1.126142755450878e-05, + "loss": 1.0497, + "step": 2082 + }, + { + "epoch": 0.47693188322839153, + "grad_norm": 1.2728939056396484, + "learning_rate": 1.125406829919675e-05, + "loss": 1.0416, + "step": 2083 + }, + { + "epoch": 0.47716084716657126, + "grad_norm": 1.3541243076324463, + "learning_rate": 1.1246708353781453e-05, + "loss": 1.0776, + "step": 2084 + }, + { + "epoch": 0.477389811104751, + "grad_norm": 1.3378325700759888, + "learning_rate": 1.1239347722312997e-05, + "loss": 1.0741, + "step": 2085 + }, + { + "epoch": 0.47761877504293077, + "grad_norm": 1.2494585514068604, + "learning_rate": 1.1231986408841882e-05, + "loss": 1.072, + "step": 2086 + }, + { + "epoch": 0.4778477389811105, + "grad_norm": 1.4159393310546875, + "learning_rate": 1.1224624417418976e-05, + "loss": 1.1049, + "step": 2087 + }, + { + "epoch": 0.4780767029192902, + "grad_norm": 1.2381250858306885, + "learning_rate": 1.1217261752095518e-05, + "loss": 1.0467, + "step": 2088 + }, + { + "epoch": 0.47830566685746995, + "grad_norm": 1.128997802734375, + "learning_rate": 1.1209898416923129e-05, + "loss": 1.0505, + "step": 2089 + }, + { + "epoch": 0.4785346307956497, + "grad_norm": 1.1534404754638672, + "learning_rate": 1.120253441595378e-05, + "loss": 0.9887, + "step": 2090 + }, + { + "epoch": 0.4787635947338294, + "grad_norm": 1.047783613204956, + "learning_rate": 1.1195169753239825e-05, + "loss": 1.0748, + "step": 2091 + }, + { + "epoch": 0.4789925586720092, + "grad_norm": 1.0474052429199219, + "learning_rate": 1.1187804432833976e-05, + "loss": 1.0393, + "step": 2092 + }, + { + "epoch": 0.4792215226101889, + "grad_norm": 1.389976978302002, + "learning_rate": 1.1180438458789305e-05, + "loss": 1.0733, + "step": 2093 + }, + { + "epoch": 0.47945048654836864, + "grad_norm": 1.16421377658844, + "learning_rate": 1.1173071835159248e-05, + "loss": 1.0366, + "step": 2094 + }, + { + "epoch": 0.47967945048654836, + "grad_norm": 1.5794599056243896, + "learning_rate": 1.1165704565997593e-05, + "loss": 1.0504, + "step": 2095 + }, + { + "epoch": 0.4799084144247281, + "grad_norm": 1.2971510887145996, + "learning_rate": 1.115833665535849e-05, + "loss": 1.0369, + "step": 2096 + }, + { + "epoch": 0.4801373783629078, + "grad_norm": 1.2570523023605347, + "learning_rate": 1.1150968107296438e-05, + "loss": 1.046, + "step": 2097 + }, + { + "epoch": 0.4803663423010876, + "grad_norm": 1.7457414865493774, + "learning_rate": 1.1143598925866286e-05, + "loss": 1.0883, + "step": 2098 + }, + { + "epoch": 0.48059530623926733, + "grad_norm": 1.236221432685852, + "learning_rate": 1.1136229115123232e-05, + "loss": 1.0506, + "step": 2099 + }, + { + "epoch": 0.48082427017744706, + "grad_norm": 1.4669848680496216, + "learning_rate": 1.1128858679122822e-05, + "loss": 1.0371, + "step": 2100 + }, + { + "epoch": 0.4810532341156268, + "grad_norm": 1.0517163276672363, + "learning_rate": 1.1121487621920948e-05, + "loss": 1.0899, + "step": 2101 + }, + { + "epoch": 0.4812821980538065, + "grad_norm": 1.3375537395477295, + "learning_rate": 1.1114115947573834e-05, + "loss": 1.0915, + "step": 2102 + }, + { + "epoch": 0.48151116199198624, + "grad_norm": 1.2560817003250122, + "learning_rate": 1.1106743660138057e-05, + "loss": 1.0988, + "step": 2103 + }, + { + "epoch": 0.481740125930166, + "grad_norm": 1.2373180389404297, + "learning_rate": 1.1099370763670523e-05, + "loss": 0.9765, + "step": 2104 + }, + { + "epoch": 0.48196908986834575, + "grad_norm": 1.2725884914398193, + "learning_rate": 1.1091997262228473e-05, + "loss": 1.0098, + "step": 2105 + }, + { + "epoch": 0.4821980538065255, + "grad_norm": 1.7785913944244385, + "learning_rate": 1.1084623159869488e-05, + "loss": 1.0742, + "step": 2106 + }, + { + "epoch": 0.4824270177447052, + "grad_norm": 1.4811958074569702, + "learning_rate": 1.1077248460651468e-05, + "loss": 1.0411, + "step": 2107 + }, + { + "epoch": 0.4826559816828849, + "grad_norm": 1.2428529262542725, + "learning_rate": 1.1069873168632657e-05, + "loss": 1.1439, + "step": 2108 + }, + { + "epoch": 0.4828849456210647, + "grad_norm": 1.0219333171844482, + "learning_rate": 1.1062497287871606e-05, + "loss": 1.0631, + "step": 2109 + }, + { + "epoch": 0.48311390955924444, + "grad_norm": 1.0447986125946045, + "learning_rate": 1.1055120822427208e-05, + "loss": 1.0794, + "step": 2110 + }, + { + "epoch": 0.48334287349742416, + "grad_norm": 1.1982734203338623, + "learning_rate": 1.1047743776358666e-05, + "loss": 1.1079, + "step": 2111 + }, + { + "epoch": 0.4835718374356039, + "grad_norm": 1.3162384033203125, + "learning_rate": 1.104036615372551e-05, + "loss": 1.1181, + "step": 2112 + }, + { + "epoch": 0.4838008013737836, + "grad_norm": 1.0572696924209595, + "learning_rate": 1.103298795858758e-05, + "loss": 1.0138, + "step": 2113 + }, + { + "epoch": 0.48402976531196334, + "grad_norm": 1.1792926788330078, + "learning_rate": 1.1025609195005038e-05, + "loss": 1.0455, + "step": 2114 + }, + { + "epoch": 0.4842587292501431, + "grad_norm": 2.336987018585205, + "learning_rate": 1.1018229867038358e-05, + "loss": 1.1053, + "step": 2115 + }, + { + "epoch": 0.48448769318832285, + "grad_norm": 1.0690840482711792, + "learning_rate": 1.1010849978748314e-05, + "loss": 1.0366, + "step": 2116 + }, + { + "epoch": 0.4847166571265026, + "grad_norm": 1.3050801753997803, + "learning_rate": 1.1003469534196003e-05, + "loss": 1.0121, + "step": 2117 + }, + { + "epoch": 0.4849456210646823, + "grad_norm": 1.054900884628296, + "learning_rate": 1.099608853744282e-05, + "loss": 1.0639, + "step": 2118 + }, + { + "epoch": 0.48517458500286204, + "grad_norm": 1.2549351453781128, + "learning_rate": 1.0988706992550467e-05, + "loss": 1.0572, + "step": 2119 + }, + { + "epoch": 0.48540354894104176, + "grad_norm": 1.4606761932373047, + "learning_rate": 1.0981324903580945e-05, + "loss": 1.0981, + "step": 2120 + }, + { + "epoch": 0.48563251287922155, + "grad_norm": 1.153498649597168, + "learning_rate": 1.0973942274596557e-05, + "loss": 1.0869, + "step": 2121 + }, + { + "epoch": 0.48586147681740127, + "grad_norm": 1.411656141281128, + "learning_rate": 1.09665591096599e-05, + "loss": 1.0528, + "step": 2122 + }, + { + "epoch": 0.486090440755581, + "grad_norm": 1.040209174156189, + "learning_rate": 1.0959175412833869e-05, + "loss": 1.0702, + "step": 2123 + }, + { + "epoch": 0.4863194046937607, + "grad_norm": 0.9708045721054077, + "learning_rate": 1.0951791188181648e-05, + "loss": 1.1015, + "step": 2124 + }, + { + "epoch": 0.48654836863194045, + "grad_norm": 1.1500178575515747, + "learning_rate": 1.0944406439766719e-05, + "loss": 1.0767, + "step": 2125 + }, + { + "epoch": 0.4867773325701202, + "grad_norm": 3.3348257541656494, + "learning_rate": 1.0937021171652842e-05, + "loss": 1.146, + "step": 2126 + }, + { + "epoch": 0.48700629650829996, + "grad_norm": 1.2559481859207153, + "learning_rate": 1.0929635387904075e-05, + "loss": 1.0668, + "step": 2127 + }, + { + "epoch": 0.4872352604464797, + "grad_norm": 2.0190439224243164, + "learning_rate": 1.092224909258474e-05, + "loss": 1.0488, + "step": 2128 + }, + { + "epoch": 0.4874642243846594, + "grad_norm": 1.2366149425506592, + "learning_rate": 1.0914862289759467e-05, + "loss": 1.0883, + "step": 2129 + }, + { + "epoch": 0.48769318832283914, + "grad_norm": 1.206701397895813, + "learning_rate": 1.0907474983493144e-05, + "loss": 1.0949, + "step": 2130 + }, + { + "epoch": 0.48792215226101887, + "grad_norm": 1.0292967557907104, + "learning_rate": 1.0900087177850946e-05, + "loss": 1.0862, + "step": 2131 + }, + { + "epoch": 0.48815111619919865, + "grad_norm": 1.2705435752868652, + "learning_rate": 1.0892698876898322e-05, + "loss": 1.0859, + "step": 2132 + }, + { + "epoch": 0.4883800801373784, + "grad_norm": 1.5828287601470947, + "learning_rate": 1.0885310084700989e-05, + "loss": 1.0841, + "step": 2133 + }, + { + "epoch": 0.4886090440755581, + "grad_norm": 1.2173969745635986, + "learning_rate": 1.087792080532494e-05, + "loss": 1.0741, + "step": 2134 + }, + { + "epoch": 0.48883800801373783, + "grad_norm": 1.191078543663025, + "learning_rate": 1.087053104283643e-05, + "loss": 1.0468, + "step": 2135 + }, + { + "epoch": 0.48906697195191756, + "grad_norm": 1.1816260814666748, + "learning_rate": 1.0863140801301988e-05, + "loss": 1.0349, + "step": 2136 + }, + { + "epoch": 0.4892959358900973, + "grad_norm": 1.1615114212036133, + "learning_rate": 1.08557500847884e-05, + "loss": 1.0635, + "step": 2137 + }, + { + "epoch": 0.48952489982827707, + "grad_norm": 1.2740709781646729, + "learning_rate": 1.0848358897362713e-05, + "loss": 0.9996, + "step": 2138 + }, + { + "epoch": 0.4897538637664568, + "grad_norm": 1.2105414867401123, + "learning_rate": 1.0840967243092237e-05, + "loss": 1.1426, + "step": 2139 + }, + { + "epoch": 0.4899828277046365, + "grad_norm": 1.668809175491333, + "learning_rate": 1.083357512604454e-05, + "loss": 1.0329, + "step": 2140 + }, + { + "epoch": 0.49021179164281625, + "grad_norm": 1.245213508605957, + "learning_rate": 1.082618255028744e-05, + "loss": 1.0716, + "step": 2141 + }, + { + "epoch": 0.490440755580996, + "grad_norm": 1.3269602060317993, + "learning_rate": 1.0818789519889006e-05, + "loss": 1.0416, + "step": 2142 + }, + { + "epoch": 0.4906697195191757, + "grad_norm": 1.0687472820281982, + "learning_rate": 1.0811396038917568e-05, + "loss": 1.0823, + "step": 2143 + }, + { + "epoch": 0.4908986834573555, + "grad_norm": 1.3689026832580566, + "learning_rate": 1.080400211144169e-05, + "loss": 1.0581, + "step": 2144 + }, + { + "epoch": 0.4911276473955352, + "grad_norm": 1.1712907552719116, + "learning_rate": 1.0796607741530191e-05, + "loss": 1.0461, + "step": 2145 + }, + { + "epoch": 0.49135661133371494, + "grad_norm": 2.368657350540161, + "learning_rate": 1.078921293325213e-05, + "loss": 1.1125, + "step": 2146 + }, + { + "epoch": 0.49158557527189467, + "grad_norm": 1.0939091444015503, + "learning_rate": 1.078181769067681e-05, + "loss": 1.1154, + "step": 2147 + }, + { + "epoch": 0.4918145392100744, + "grad_norm": 1.1558587551116943, + "learning_rate": 1.077442201787377e-05, + "loss": 1.049, + "step": 2148 + }, + { + "epoch": 0.4920435031482541, + "grad_norm": 1.6609954833984375, + "learning_rate": 1.0767025918912785e-05, + "loss": 1.0641, + "step": 2149 + }, + { + "epoch": 0.4922724670864339, + "grad_norm": 1.109490990638733, + "learning_rate": 1.075962939786387e-05, + "loss": 1.0407, + "step": 2150 + }, + { + "epoch": 0.49250143102461363, + "grad_norm": 1.1975339651107788, + "learning_rate": 1.0752232458797262e-05, + "loss": 1.1138, + "step": 2151 + }, + { + "epoch": 0.49273039496279336, + "grad_norm": 2.040076971054077, + "learning_rate": 1.0744835105783443e-05, + "loss": 1.0248, + "step": 2152 + }, + { + "epoch": 0.4929593589009731, + "grad_norm": 1.3864123821258545, + "learning_rate": 1.0737437342893107e-05, + "loss": 1.089, + "step": 2153 + }, + { + "epoch": 0.4931883228391528, + "grad_norm": 1.306090235710144, + "learning_rate": 1.0730039174197185e-05, + "loss": 1.0915, + "step": 2154 + }, + { + "epoch": 0.4934172867773326, + "grad_norm": 1.2275519371032715, + "learning_rate": 1.0722640603766825e-05, + "loss": 1.0502, + "step": 2155 + }, + { + "epoch": 0.4936462507155123, + "grad_norm": 1.4218353033065796, + "learning_rate": 1.0715241635673401e-05, + "loss": 1.1642, + "step": 2156 + }, + { + "epoch": 0.49387521465369205, + "grad_norm": 1.3406203985214233, + "learning_rate": 1.0707842273988498e-05, + "loss": 1.1157, + "step": 2157 + }, + { + "epoch": 0.4941041785918718, + "grad_norm": 1.8006969690322876, + "learning_rate": 1.070044252278393e-05, + "loss": 1.0964, + "step": 2158 + }, + { + "epoch": 0.4943331425300515, + "grad_norm": 1.0731760263442993, + "learning_rate": 1.0693042386131713e-05, + "loss": 1.0523, + "step": 2159 + }, + { + "epoch": 0.49456210646823123, + "grad_norm": 1.6657426357269287, + "learning_rate": 1.0685641868104085e-05, + "loss": 1.0756, + "step": 2160 + }, + { + "epoch": 0.494791070406411, + "grad_norm": 1.2945160865783691, + "learning_rate": 1.0678240972773479e-05, + "loss": 1.0694, + "step": 2161 + }, + { + "epoch": 0.49502003434459074, + "grad_norm": 1.229271650314331, + "learning_rate": 1.0670839704212555e-05, + "loss": 1.084, + "step": 2162 + }, + { + "epoch": 0.49524899828277047, + "grad_norm": 1.339217185974121, + "learning_rate": 1.0663438066494168e-05, + "loss": 1.1103, + "step": 2163 + }, + { + "epoch": 0.4954779622209502, + "grad_norm": 2.237689971923828, + "learning_rate": 1.0656036063691373e-05, + "loss": 1.0424, + "step": 2164 + }, + { + "epoch": 0.4957069261591299, + "grad_norm": 1.0860393047332764, + "learning_rate": 1.064863369987743e-05, + "loss": 1.0646, + "step": 2165 + }, + { + "epoch": 0.49593589009730965, + "grad_norm": 1.190271258354187, + "learning_rate": 1.0641230979125804e-05, + "loss": 1.0778, + "step": 2166 + }, + { + "epoch": 0.49616485403548943, + "grad_norm": 1.1193439960479736, + "learning_rate": 1.0633827905510146e-05, + "loss": 1.0855, + "step": 2167 + }, + { + "epoch": 0.49639381797366916, + "grad_norm": 1.3502209186553955, + "learning_rate": 1.0626424483104302e-05, + "loss": 1.0405, + "step": 2168 + }, + { + "epoch": 0.4966227819118489, + "grad_norm": 1.0666968822479248, + "learning_rate": 1.061902071598232e-05, + "loss": 1.0146, + "step": 2169 + }, + { + "epoch": 0.4968517458500286, + "grad_norm": 1.291035771369934, + "learning_rate": 1.0611616608218429e-05, + "loss": 1.058, + "step": 2170 + }, + { + "epoch": 0.49708070978820834, + "grad_norm": 1.1638990640640259, + "learning_rate": 1.0604212163887044e-05, + "loss": 1.0792, + "step": 2171 + }, + { + "epoch": 0.49730967372638807, + "grad_norm": 1.4782902002334595, + "learning_rate": 1.0596807387062772e-05, + "loss": 1.0652, + "step": 2172 + }, + { + "epoch": 0.49753863766456785, + "grad_norm": 1.5571351051330566, + "learning_rate": 1.0589402281820397e-05, + "loss": 1.1116, + "step": 2173 + }, + { + "epoch": 0.4977676016027476, + "grad_norm": 1.5529664754867554, + "learning_rate": 1.058199685223489e-05, + "loss": 1.0767, + "step": 2174 + }, + { + "epoch": 0.4979965655409273, + "grad_norm": 1.0995012521743774, + "learning_rate": 1.0574591102381395e-05, + "loss": 1.0161, + "step": 2175 + }, + { + "epoch": 0.49822552947910703, + "grad_norm": 1.1900322437286377, + "learning_rate": 1.056718503633523e-05, + "loss": 0.9899, + "step": 2176 + }, + { + "epoch": 0.49845449341728676, + "grad_norm": 1.1697393655776978, + "learning_rate": 1.05597786581719e-05, + "loss": 1.0988, + "step": 2177 + }, + { + "epoch": 0.49868345735546654, + "grad_norm": 1.67704176902771, + "learning_rate": 1.0552371971967064e-05, + "loss": 1.0819, + "step": 2178 + }, + { + "epoch": 0.49891242129364627, + "grad_norm": 1.1780916452407837, + "learning_rate": 1.0544964981796563e-05, + "loss": 1.0438, + "step": 2179 + }, + { + "epoch": 0.499141385231826, + "grad_norm": 1.136317491531372, + "learning_rate": 1.0537557691736402e-05, + "loss": 1.0522, + "step": 2180 + }, + { + "epoch": 0.4993703491700057, + "grad_norm": 1.273093581199646, + "learning_rate": 1.0530150105862748e-05, + "loss": 1.0864, + "step": 2181 + }, + { + "epoch": 0.49959931310818545, + "grad_norm": 1.1058335304260254, + "learning_rate": 1.052274222825194e-05, + "loss": 1.0664, + "step": 2182 + }, + { + "epoch": 0.4998282770463652, + "grad_norm": 1.14626944065094, + "learning_rate": 1.0515334062980463e-05, + "loss": 1.0342, + "step": 2183 + }, + { + "epoch": 0.500057240984545, + "grad_norm": 1.7187449932098389, + "learning_rate": 1.0507925614124977e-05, + "loss": 1.0868, + "step": 2184 + }, + { + "epoch": 0.5002862049227247, + "grad_norm": 1.194671630859375, + "learning_rate": 1.0500516885762278e-05, + "loss": 1.1066, + "step": 2185 + }, + { + "epoch": 0.5005151688609044, + "grad_norm": 1.1433876752853394, + "learning_rate": 1.0493107881969335e-05, + "loss": 1.0736, + "step": 2186 + }, + { + "epoch": 0.5007441327990841, + "grad_norm": 1.1823675632476807, + "learning_rate": 1.0485698606823258e-05, + "loss": 1.0713, + "step": 2187 + }, + { + "epoch": 0.5009730967372639, + "grad_norm": 1.105976939201355, + "learning_rate": 1.047828906440131e-05, + "loss": 1.06, + "step": 2188 + }, + { + "epoch": 0.5012020606754436, + "grad_norm": 1.8626004457473755, + "learning_rate": 1.0470879258780904e-05, + "loss": 1.0403, + "step": 2189 + }, + { + "epoch": 0.5014310246136233, + "grad_norm": 0.9595720171928406, + "learning_rate": 1.0463469194039584e-05, + "loss": 1.0738, + "step": 2190 + }, + { + "epoch": 0.501659988551803, + "grad_norm": 1.0298045873641968, + "learning_rate": 1.0456058874255055e-05, + "loss": 1.0297, + "step": 2191 + }, + { + "epoch": 0.5018889524899828, + "grad_norm": 1.3229492902755737, + "learning_rate": 1.044864830350515e-05, + "loss": 1.0169, + "step": 2192 + }, + { + "epoch": 0.5021179164281626, + "grad_norm": 1.3693296909332275, + "learning_rate": 1.0441237485867845e-05, + "loss": 1.0533, + "step": 2193 + }, + { + "epoch": 0.5023468803663423, + "grad_norm": 1.587257981300354, + "learning_rate": 1.0433826425421252e-05, + "loss": 1.0314, + "step": 2194 + }, + { + "epoch": 0.5025758443045221, + "grad_norm": 1.2772449254989624, + "learning_rate": 1.0426415126243615e-05, + "loss": 1.1135, + "step": 2195 + }, + { + "epoch": 0.5028048082427018, + "grad_norm": 1.3835172653198242, + "learning_rate": 1.0419003592413308e-05, + "loss": 1.0587, + "step": 2196 + }, + { + "epoch": 0.5030337721808815, + "grad_norm": 1.1872873306274414, + "learning_rate": 1.0411591828008839e-05, + "loss": 1.092, + "step": 2197 + }, + { + "epoch": 0.5032627361190612, + "grad_norm": 1.5710023641586304, + "learning_rate": 1.0404179837108833e-05, + "loss": 1.0778, + "step": 2198 + }, + { + "epoch": 0.503491700057241, + "grad_norm": 1.2492884397506714, + "learning_rate": 1.0396767623792054e-05, + "loss": 1.0466, + "step": 2199 + }, + { + "epoch": 0.5037206639954207, + "grad_norm": 1.0260891914367676, + "learning_rate": 1.0389355192137379e-05, + "loss": 1.1209, + "step": 2200 + }, + { + "epoch": 0.5039496279336004, + "grad_norm": 1.9055163860321045, + "learning_rate": 1.0381942546223805e-05, + "loss": 1.0782, + "step": 2201 + }, + { + "epoch": 0.5041785918717802, + "grad_norm": 1.1072444915771484, + "learning_rate": 1.0374529690130448e-05, + "loss": 1.1312, + "step": 2202 + }, + { + "epoch": 0.5044075558099599, + "grad_norm": 1.2415649890899658, + "learning_rate": 1.0367116627936549e-05, + "loss": 1.0609, + "step": 2203 + }, + { + "epoch": 0.5046365197481397, + "grad_norm": 1.1317917108535767, + "learning_rate": 1.0359703363721443e-05, + "loss": 1.0392, + "step": 2204 + }, + { + "epoch": 0.5048654836863194, + "grad_norm": 1.2405906915664673, + "learning_rate": 1.0352289901564592e-05, + "loss": 1.0132, + "step": 2205 + }, + { + "epoch": 0.5050944476244992, + "grad_norm": 1.1400092840194702, + "learning_rate": 1.034487624554556e-05, + "loss": 1.026, + "step": 2206 + }, + { + "epoch": 0.5053234115626789, + "grad_norm": 1.1699836254119873, + "learning_rate": 1.0337462399744025e-05, + "loss": 1.077, + "step": 2207 + }, + { + "epoch": 0.5055523755008586, + "grad_norm": 1.1276884078979492, + "learning_rate": 1.033004836823976e-05, + "loss": 1.1002, + "step": 2208 + }, + { + "epoch": 0.5057813394390384, + "grad_norm": 1.2098451852798462, + "learning_rate": 1.032263415511264e-05, + "loss": 1.006, + "step": 2209 + }, + { + "epoch": 0.5060103033772181, + "grad_norm": 1.1760305166244507, + "learning_rate": 1.0315219764442657e-05, + "loss": 1.0193, + "step": 2210 + }, + { + "epoch": 0.5062392673153978, + "grad_norm": 1.0920625925064087, + "learning_rate": 1.0307805200309877e-05, + "loss": 1.0141, + "step": 2211 + }, + { + "epoch": 0.5064682312535775, + "grad_norm": 1.2013732194900513, + "learning_rate": 1.0300390466794477e-05, + "loss": 1.0508, + "step": 2212 + }, + { + "epoch": 0.5066971951917573, + "grad_norm": 1.3105764389038086, + "learning_rate": 1.0292975567976719e-05, + "loss": 1.0373, + "step": 2213 + }, + { + "epoch": 0.506926159129937, + "grad_norm": 1.2512943744659424, + "learning_rate": 1.0285560507936962e-05, + "loss": 1.0673, + "step": 2214 + }, + { + "epoch": 0.5071551230681167, + "grad_norm": 1.5378952026367188, + "learning_rate": 1.0278145290755657e-05, + "loss": 1.0772, + "step": 2215 + }, + { + "epoch": 0.5073840870062966, + "grad_norm": 1.0876376628875732, + "learning_rate": 1.0270729920513326e-05, + "loss": 1.0603, + "step": 2216 + }, + { + "epoch": 0.5076130509444763, + "grad_norm": 1.0932906866073608, + "learning_rate": 1.0263314401290589e-05, + "loss": 1.0401, + "step": 2217 + }, + { + "epoch": 0.507842014882656, + "grad_norm": 1.3708820343017578, + "learning_rate": 1.0255898737168147e-05, + "loss": 1.073, + "step": 2218 + }, + { + "epoch": 0.5080709788208357, + "grad_norm": 1.1124688386917114, + "learning_rate": 1.0248482932226775e-05, + "loss": 1.0008, + "step": 2219 + }, + { + "epoch": 0.5082999427590155, + "grad_norm": 2.375359058380127, + "learning_rate": 1.0241066990547328e-05, + "loss": 1.0608, + "step": 2220 + }, + { + "epoch": 0.5085289066971952, + "grad_norm": 1.3283129930496216, + "learning_rate": 1.0233650916210736e-05, + "loss": 1.0725, + "step": 2221 + }, + { + "epoch": 0.5087578706353749, + "grad_norm": 1.1090928316116333, + "learning_rate": 1.0226234713298007e-05, + "loss": 1.0388, + "step": 2222 + }, + { + "epoch": 0.5089868345735546, + "grad_norm": 1.2009447813034058, + "learning_rate": 1.021881838589021e-05, + "loss": 1.0969, + "step": 2223 + }, + { + "epoch": 0.5092157985117344, + "grad_norm": 1.144832968711853, + "learning_rate": 1.021140193806849e-05, + "loss": 1.0553, + "step": 2224 + }, + { + "epoch": 0.5094447624499141, + "grad_norm": 1.072203516960144, + "learning_rate": 1.0203985373914056e-05, + "loss": 1.0466, + "step": 2225 + }, + { + "epoch": 0.5096737263880938, + "grad_norm": 1.0581434965133667, + "learning_rate": 1.019656869750818e-05, + "loss": 1.0244, + "step": 2226 + }, + { + "epoch": 0.5099026903262737, + "grad_norm": 1.2225676774978638, + "learning_rate": 1.0189151912932199e-05, + "loss": 1.0231, + "step": 2227 + }, + { + "epoch": 0.5101316542644534, + "grad_norm": 1.196749210357666, + "learning_rate": 1.0181735024267504e-05, + "loss": 1.128, + "step": 2228 + }, + { + "epoch": 0.5103606182026331, + "grad_norm": 1.0913293361663818, + "learning_rate": 1.0174318035595551e-05, + "loss": 1.1325, + "step": 2229 + }, + { + "epoch": 0.5105895821408128, + "grad_norm": 1.296054720878601, + "learning_rate": 1.0166900950997845e-05, + "loss": 1.0734, + "step": 2230 + }, + { + "epoch": 0.5108185460789926, + "grad_norm": 1.2411744594573975, + "learning_rate": 1.0159483774555945e-05, + "loss": 1.033, + "step": 2231 + }, + { + "epoch": 0.5110475100171723, + "grad_norm": 1.2513041496276855, + "learning_rate": 1.015206651035146e-05, + "loss": 1.1326, + "step": 2232 + }, + { + "epoch": 0.511276473955352, + "grad_norm": 1.182816743850708, + "learning_rate": 1.0144649162466047e-05, + "loss": 1.0539, + "step": 2233 + }, + { + "epoch": 0.5115054378935318, + "grad_norm": 1.555910587310791, + "learning_rate": 1.0137231734981417e-05, + "loss": 1.0438, + "step": 2234 + }, + { + "epoch": 0.5117344018317115, + "grad_norm": 1.1628386974334717, + "learning_rate": 1.012981423197931e-05, + "loss": 1.1111, + "step": 2235 + }, + { + "epoch": 0.5119633657698912, + "grad_norm": 1.134666919708252, + "learning_rate": 1.0122396657541522e-05, + "loss": 1.0665, + "step": 2236 + }, + { + "epoch": 0.5121923297080709, + "grad_norm": 1.291595458984375, + "learning_rate": 1.011497901574988e-05, + "loss": 0.956, + "step": 2237 + }, + { + "epoch": 0.5124212936462507, + "grad_norm": 1.0975548028945923, + "learning_rate": 1.0107561310686247e-05, + "loss": 1.0621, + "step": 2238 + }, + { + "epoch": 0.5126502575844305, + "grad_norm": 1.4685457944869995, + "learning_rate": 1.0100143546432527e-05, + "loss": 1.11, + "step": 2239 + }, + { + "epoch": 0.5128792215226102, + "grad_norm": 1.5499354600906372, + "learning_rate": 1.0092725727070653e-05, + "loss": 1.0941, + "step": 2240 + }, + { + "epoch": 0.51310818546079, + "grad_norm": 1.6682047843933105, + "learning_rate": 1.0085307856682593e-05, + "loss": 1.0784, + "step": 2241 + }, + { + "epoch": 0.5133371493989697, + "grad_norm": 1.0723470449447632, + "learning_rate": 1.007788993935033e-05, + "loss": 1.0805, + "step": 2242 + }, + { + "epoch": 0.5135661133371494, + "grad_norm": 1.175240397453308, + "learning_rate": 1.007047197915589e-05, + "loss": 1.098, + "step": 2243 + }, + { + "epoch": 0.5137950772753291, + "grad_norm": 1.1597743034362793, + "learning_rate": 1.0063053980181305e-05, + "loss": 1.0723, + "step": 2244 + }, + { + "epoch": 0.5140240412135089, + "grad_norm": 1.0197961330413818, + "learning_rate": 1.0055635946508649e-05, + "loss": 1.0706, + "step": 2245 + }, + { + "epoch": 0.5142530051516886, + "grad_norm": 1.1021794080734253, + "learning_rate": 1.0048217882219995e-05, + "loss": 1.0715, + "step": 2246 + }, + { + "epoch": 0.5144819690898683, + "grad_norm": 1.0522361993789673, + "learning_rate": 1.0040799791397444e-05, + "loss": 1.0604, + "step": 2247 + }, + { + "epoch": 0.514710933028048, + "grad_norm": 1.32160484790802, + "learning_rate": 1.0033381678123113e-05, + "loss": 1.0638, + "step": 2248 + }, + { + "epoch": 0.5149398969662278, + "grad_norm": 1.5277987718582153, + "learning_rate": 1.002596354647912e-05, + "loss": 1.1117, + "step": 2249 + }, + { + "epoch": 0.5151688609044076, + "grad_norm": 1.1318628787994385, + "learning_rate": 1.0018545400547609e-05, + "loss": 1.0124, + "step": 2250 + }, + { + "epoch": 0.5153978248425873, + "grad_norm": 1.8370736837387085, + "learning_rate": 1.001112724441072e-05, + "loss": 1.0463, + "step": 2251 + }, + { + "epoch": 0.5156267887807671, + "grad_norm": 2.369612693786621, + "learning_rate": 1.0003709082150598e-05, + "loss": 1.0653, + "step": 2252 + }, + { + "epoch": 0.5158557527189468, + "grad_norm": 1.059601902961731, + "learning_rate": 9.996290917849405e-06, + "loss": 1.1053, + "step": 2253 + }, + { + "epoch": 0.5160847166571265, + "grad_norm": 1.0960521697998047, + "learning_rate": 9.988872755589283e-06, + "loss": 1.0594, + "step": 2254 + }, + { + "epoch": 0.5163136805953062, + "grad_norm": 1.3870129585266113, + "learning_rate": 9.981454599452391e-06, + "loss": 1.0288, + "step": 2255 + }, + { + "epoch": 0.516542644533486, + "grad_norm": 1.0221267938613892, + "learning_rate": 9.974036453520881e-06, + "loss": 0.9995, + "step": 2256 + }, + { + "epoch": 0.5167716084716657, + "grad_norm": 1.0023499727249146, + "learning_rate": 9.966618321876889e-06, + "loss": 1.0261, + "step": 2257 + }, + { + "epoch": 0.5170005724098454, + "grad_norm": 1.196207880973816, + "learning_rate": 9.95920020860256e-06, + "loss": 1.0774, + "step": 2258 + }, + { + "epoch": 0.5172295363480252, + "grad_norm": 1.261536717414856, + "learning_rate": 9.951782117780008e-06, + "loss": 1.0645, + "step": 2259 + }, + { + "epoch": 0.5174585002862049, + "grad_norm": 1.8544546365737915, + "learning_rate": 9.944364053491356e-06, + "loss": 1.0541, + "step": 2260 + }, + { + "epoch": 0.5176874642243846, + "grad_norm": 1.036802053451538, + "learning_rate": 9.936946019818698e-06, + "loss": 1.0645, + "step": 2261 + }, + { + "epoch": 0.5179164281625644, + "grad_norm": 1.515656590461731, + "learning_rate": 9.929528020844114e-06, + "loss": 1.0333, + "step": 2262 + }, + { + "epoch": 0.5181453921007442, + "grad_norm": 1.1658852100372314, + "learning_rate": 9.922110060649672e-06, + "loss": 1.0802, + "step": 2263 + }, + { + "epoch": 0.5183743560389239, + "grad_norm": 1.0310983657836914, + "learning_rate": 9.914692143317412e-06, + "loss": 1.0777, + "step": 2264 + }, + { + "epoch": 0.5186033199771036, + "grad_norm": 1.1228355169296265, + "learning_rate": 9.90727427292935e-06, + "loss": 1.0623, + "step": 2265 + }, + { + "epoch": 0.5188322839152834, + "grad_norm": 1.1571909189224243, + "learning_rate": 9.899856453567474e-06, + "loss": 1.0168, + "step": 2266 + }, + { + "epoch": 0.5190612478534631, + "grad_norm": 1.233893871307373, + "learning_rate": 9.892438689313757e-06, + "loss": 1.0401, + "step": 2267 + }, + { + "epoch": 0.5192902117916428, + "grad_norm": 2.297332525253296, + "learning_rate": 9.885020984250124e-06, + "loss": 1.0835, + "step": 2268 + }, + { + "epoch": 0.5195191757298225, + "grad_norm": 1.0272998809814453, + "learning_rate": 9.877603342458483e-06, + "loss": 1.0059, + "step": 2269 + }, + { + "epoch": 0.5197481396680023, + "grad_norm": 1.3607406616210938, + "learning_rate": 9.870185768020694e-06, + "loss": 1.072, + "step": 2270 + }, + { + "epoch": 0.519977103606182, + "grad_norm": 1.6629681587219238, + "learning_rate": 9.862768265018585e-06, + "loss": 1.0697, + "step": 2271 + }, + { + "epoch": 0.5202060675443617, + "grad_norm": 1.2140170335769653, + "learning_rate": 9.855350837533958e-06, + "loss": 1.0702, + "step": 2272 + }, + { + "epoch": 0.5204350314825416, + "grad_norm": 1.465503454208374, + "learning_rate": 9.847933489648545e-06, + "loss": 1.1246, + "step": 2273 + }, + { + "epoch": 0.5206639954207213, + "grad_norm": 1.1289547681808472, + "learning_rate": 9.840516225444059e-06, + "loss": 1.0491, + "step": 2274 + }, + { + "epoch": 0.520892959358901, + "grad_norm": 1.1703757047653198, + "learning_rate": 9.83309904900216e-06, + "loss": 1.0422, + "step": 2275 + }, + { + "epoch": 0.5211219232970807, + "grad_norm": 1.3183718919754028, + "learning_rate": 9.82568196440445e-06, + "loss": 1.1088, + "step": 2276 + }, + { + "epoch": 0.5213508872352605, + "grad_norm": 1.2612817287445068, + "learning_rate": 9.818264975732497e-06, + "loss": 1.0711, + "step": 2277 + }, + { + "epoch": 0.5215798511734402, + "grad_norm": 1.238250732421875, + "learning_rate": 9.810848087067805e-06, + "loss": 1.0843, + "step": 2278 + }, + { + "epoch": 0.5218088151116199, + "grad_norm": 1.1225922107696533, + "learning_rate": 9.803431302491823e-06, + "loss": 1.0751, + "step": 2279 + }, + { + "epoch": 0.5220377790497996, + "grad_norm": 1.3378745317459106, + "learning_rate": 9.79601462608595e-06, + "loss": 1.0344, + "step": 2280 + }, + { + "epoch": 0.5222667429879794, + "grad_norm": 1.1414393186569214, + "learning_rate": 9.788598061931513e-06, + "loss": 1.0469, + "step": 2281 + }, + { + "epoch": 0.5224957069261591, + "grad_norm": 1.9631890058517456, + "learning_rate": 9.781181614109793e-06, + "loss": 1.1085, + "step": 2282 + }, + { + "epoch": 0.5227246708643388, + "grad_norm": 1.1475551128387451, + "learning_rate": 9.773765286701998e-06, + "loss": 1.0458, + "step": 2283 + }, + { + "epoch": 0.5229536348025186, + "grad_norm": 1.056517481803894, + "learning_rate": 9.766349083789266e-06, + "loss": 1.0451, + "step": 2284 + }, + { + "epoch": 0.5231825987406984, + "grad_norm": 1.2475084066390991, + "learning_rate": 9.758933009452674e-06, + "loss": 1.0261, + "step": 2285 + }, + { + "epoch": 0.5234115626788781, + "grad_norm": 1.041703701019287, + "learning_rate": 9.751517067773228e-06, + "loss": 1.0591, + "step": 2286 + }, + { + "epoch": 0.5236405266170578, + "grad_norm": 1.334044337272644, + "learning_rate": 9.744101262831855e-06, + "loss": 1.0793, + "step": 2287 + }, + { + "epoch": 0.5238694905552376, + "grad_norm": 1.1858912706375122, + "learning_rate": 9.736685598709413e-06, + "loss": 1.084, + "step": 2288 + }, + { + "epoch": 0.5240984544934173, + "grad_norm": 1.2032268047332764, + "learning_rate": 9.72927007948668e-06, + "loss": 1.0218, + "step": 2289 + }, + { + "epoch": 0.524327418431597, + "grad_norm": 1.705446720123291, + "learning_rate": 9.721854709244346e-06, + "loss": 1.0121, + "step": 2290 + }, + { + "epoch": 0.5245563823697768, + "grad_norm": 1.374306559562683, + "learning_rate": 9.71443949206304e-06, + "loss": 1.0152, + "step": 2291 + }, + { + "epoch": 0.5247853463079565, + "grad_norm": 1.2469069957733154, + "learning_rate": 9.707024432023284e-06, + "loss": 1.0331, + "step": 2292 + }, + { + "epoch": 0.5250143102461362, + "grad_norm": 1.5407803058624268, + "learning_rate": 9.699609533205527e-06, + "loss": 1.0996, + "step": 2293 + }, + { + "epoch": 0.5252432741843159, + "grad_norm": 1.1991982460021973, + "learning_rate": 9.692194799690128e-06, + "loss": 1.1026, + "step": 2294 + }, + { + "epoch": 0.5254722381224957, + "grad_norm": 1.3273327350616455, + "learning_rate": 9.684780235557346e-06, + "loss": 1.0099, + "step": 2295 + }, + { + "epoch": 0.5257012020606755, + "grad_norm": 1.2710803747177124, + "learning_rate": 9.67736584488736e-06, + "loss": 1.0538, + "step": 2296 + }, + { + "epoch": 0.5259301659988552, + "grad_norm": 1.1919000148773193, + "learning_rate": 9.669951631760245e-06, + "loss": 1.06, + "step": 2297 + }, + { + "epoch": 0.526159129937035, + "grad_norm": 1.2854589223861694, + "learning_rate": 9.662537600255979e-06, + "loss": 1.044, + "step": 2298 + }, + { + "epoch": 0.5263880938752147, + "grad_norm": 1.4513083696365356, + "learning_rate": 9.65512375445444e-06, + "loss": 1.0597, + "step": 2299 + }, + { + "epoch": 0.5266170578133944, + "grad_norm": 1.1614129543304443, + "learning_rate": 9.647710098435413e-06, + "loss": 0.9531, + "step": 2300 + }, + { + "epoch": 0.5268460217515741, + "grad_norm": 1.165914535522461, + "learning_rate": 9.64029663627856e-06, + "loss": 1.0857, + "step": 2301 + }, + { + "epoch": 0.5270749856897539, + "grad_norm": 1.8047739267349243, + "learning_rate": 9.632883372063458e-06, + "loss": 1.029, + "step": 2302 + }, + { + "epoch": 0.5273039496279336, + "grad_norm": 1.2753591537475586, + "learning_rate": 9.625470309869554e-06, + "loss": 0.9939, + "step": 2303 + }, + { + "epoch": 0.5275329135661133, + "grad_norm": 1.525112509727478, + "learning_rate": 9.618057453776196e-06, + "loss": 1.0816, + "step": 2304 + }, + { + "epoch": 0.527761877504293, + "grad_norm": 1.4662785530090332, + "learning_rate": 9.610644807862625e-06, + "loss": 1.0653, + "step": 2305 + }, + { + "epoch": 0.5279908414424728, + "grad_norm": 3.009976387023926, + "learning_rate": 9.603232376207948e-06, + "loss": 0.9941, + "step": 2306 + }, + { + "epoch": 0.5282198053806525, + "grad_norm": 1.217646837234497, + "learning_rate": 9.59582016289117e-06, + "loss": 1.0431, + "step": 2307 + }, + { + "epoch": 0.5284487693188323, + "grad_norm": 1.1886101961135864, + "learning_rate": 9.588408171991168e-06, + "loss": 1.0745, + "step": 2308 + }, + { + "epoch": 0.5286777332570121, + "grad_norm": 1.1897996664047241, + "learning_rate": 9.580996407586695e-06, + "loss": 1.0537, + "step": 2309 + }, + { + "epoch": 0.5289066971951918, + "grad_norm": 1.416695237159729, + "learning_rate": 9.573584873756387e-06, + "loss": 1.0699, + "step": 2310 + }, + { + "epoch": 0.5291356611333715, + "grad_norm": 2.577252149581909, + "learning_rate": 9.566173574578751e-06, + "loss": 1.0533, + "step": 2311 + }, + { + "epoch": 0.5293646250715512, + "grad_norm": 1.102662444114685, + "learning_rate": 9.558762514132157e-06, + "loss": 1.1236, + "step": 2312 + }, + { + "epoch": 0.529593589009731, + "grad_norm": 1.6311894655227661, + "learning_rate": 9.551351696494854e-06, + "loss": 1.1093, + "step": 2313 + }, + { + "epoch": 0.5298225529479107, + "grad_norm": 1.327171802520752, + "learning_rate": 9.543941125744947e-06, + "loss": 1.0756, + "step": 2314 + }, + { + "epoch": 0.5300515168860904, + "grad_norm": 1.0158864259719849, + "learning_rate": 9.536530805960418e-06, + "loss": 1.0776, + "step": 2315 + }, + { + "epoch": 0.5302804808242702, + "grad_norm": 1.1998199224472046, + "learning_rate": 9.529120741219103e-06, + "loss": 1.0484, + "step": 2316 + }, + { + "epoch": 0.5305094447624499, + "grad_norm": 1.073457956314087, + "learning_rate": 9.521710935598693e-06, + "loss": 0.9798, + "step": 2317 + }, + { + "epoch": 0.5307384087006296, + "grad_norm": 1.1644235849380493, + "learning_rate": 9.514301393176742e-06, + "loss": 1.0871, + "step": 2318 + }, + { + "epoch": 0.5309673726388094, + "grad_norm": 1.074922800064087, + "learning_rate": 9.506892118030668e-06, + "loss": 1.0178, + "step": 2319 + }, + { + "epoch": 0.5311963365769892, + "grad_norm": 1.2031996250152588, + "learning_rate": 9.499483114237726e-06, + "loss": 1.1128, + "step": 2320 + }, + { + "epoch": 0.5314253005151689, + "grad_norm": 1.1941750049591064, + "learning_rate": 9.492074385875025e-06, + "loss": 1.0887, + "step": 2321 + }, + { + "epoch": 0.5316542644533486, + "grad_norm": 1.6538817882537842, + "learning_rate": 9.484665937019539e-06, + "loss": 1.0918, + "step": 2322 + }, + { + "epoch": 0.5318832283915284, + "grad_norm": 1.1676061153411865, + "learning_rate": 9.477257771748061e-06, + "loss": 1.0102, + "step": 2323 + }, + { + "epoch": 0.5321121923297081, + "grad_norm": 1.8035333156585693, + "learning_rate": 9.469849894137254e-06, + "loss": 1.0648, + "step": 2324 + }, + { + "epoch": 0.5323411562678878, + "grad_norm": 1.2286866903305054, + "learning_rate": 9.4624423082636e-06, + "loss": 1.0543, + "step": 2325 + }, + { + "epoch": 0.5325701202060675, + "grad_norm": 1.29671311378479, + "learning_rate": 9.455035018203439e-06, + "loss": 1.0887, + "step": 2326 + }, + { + "epoch": 0.5327990841442473, + "grad_norm": 1.1480847597122192, + "learning_rate": 9.44762802803294e-06, + "loss": 1.125, + "step": 2327 + }, + { + "epoch": 0.533028048082427, + "grad_norm": 1.277753233909607, + "learning_rate": 9.440221341828104e-06, + "loss": 1.0196, + "step": 2328 + }, + { + "epoch": 0.5332570120206067, + "grad_norm": 1.2395761013031006, + "learning_rate": 9.43281496366477e-06, + "loss": 1.0753, + "step": 2329 + }, + { + "epoch": 0.5334859759587864, + "grad_norm": 1.0635128021240234, + "learning_rate": 9.42540889761861e-06, + "loss": 1.0173, + "step": 2330 + }, + { + "epoch": 0.5337149398969663, + "grad_norm": 1.3224315643310547, + "learning_rate": 9.418003147765113e-06, + "loss": 0.992, + "step": 2331 + }, + { + "epoch": 0.533943903835146, + "grad_norm": 0.9989613890647888, + "learning_rate": 9.410597718179603e-06, + "loss": 1.0676, + "step": 2332 + }, + { + "epoch": 0.5341728677733257, + "grad_norm": 1.0474404096603394, + "learning_rate": 9.40319261293723e-06, + "loss": 1.0012, + "step": 2333 + }, + { + "epoch": 0.5344018317115055, + "grad_norm": 1.0710350275039673, + "learning_rate": 9.395787836112958e-06, + "loss": 1.0822, + "step": 2334 + }, + { + "epoch": 0.5346307956496852, + "grad_norm": 1.1103330850601196, + "learning_rate": 9.388383391781576e-06, + "loss": 1.05, + "step": 2335 + }, + { + "epoch": 0.5348597595878649, + "grad_norm": 1.2348805665969849, + "learning_rate": 9.380979284017682e-06, + "loss": 1.0656, + "step": 2336 + }, + { + "epoch": 0.5350887235260446, + "grad_norm": 1.1286513805389404, + "learning_rate": 9.373575516895698e-06, + "loss": 1.0303, + "step": 2337 + }, + { + "epoch": 0.5353176874642244, + "grad_norm": 1.173068881034851, + "learning_rate": 9.366172094489858e-06, + "loss": 1.0615, + "step": 2338 + }, + { + "epoch": 0.5355466514024041, + "grad_norm": 1.185383915901184, + "learning_rate": 9.358769020874198e-06, + "loss": 1.0928, + "step": 2339 + }, + { + "epoch": 0.5357756153405838, + "grad_norm": 1.1078901290893555, + "learning_rate": 9.351366300122569e-06, + "loss": 1.081, + "step": 2340 + }, + { + "epoch": 0.5360045792787635, + "grad_norm": 1.2607088088989258, + "learning_rate": 9.343963936308634e-06, + "loss": 1.017, + "step": 2341 + }, + { + "epoch": 0.5362335432169434, + "grad_norm": 1.102243423461914, + "learning_rate": 9.336561933505836e-06, + "loss": 1.0839, + "step": 2342 + }, + { + "epoch": 0.5364625071551231, + "grad_norm": 1.3678319454193115, + "learning_rate": 9.329160295787447e-06, + "loss": 1.073, + "step": 2343 + }, + { + "epoch": 0.5366914710933028, + "grad_norm": 1.1074059009552002, + "learning_rate": 9.321759027226525e-06, + "loss": 1.0501, + "step": 2344 + }, + { + "epoch": 0.5369204350314826, + "grad_norm": 1.2822704315185547, + "learning_rate": 9.314358131895919e-06, + "loss": 1.0713, + "step": 2345 + }, + { + "epoch": 0.5371493989696623, + "grad_norm": 1.119148850440979, + "learning_rate": 9.306957613868292e-06, + "loss": 1.0374, + "step": 2346 + }, + { + "epoch": 0.537378362907842, + "grad_norm": 1.2142623662948608, + "learning_rate": 9.299557477216073e-06, + "loss": 1.0243, + "step": 2347 + }, + { + "epoch": 0.5376073268460217, + "grad_norm": 1.343944787979126, + "learning_rate": 9.292157726011502e-06, + "loss": 1.1066, + "step": 2348 + }, + { + "epoch": 0.5378362907842015, + "grad_norm": 1.2517242431640625, + "learning_rate": 9.284758364326604e-06, + "loss": 1.0373, + "step": 2349 + }, + { + "epoch": 0.5380652547223812, + "grad_norm": 1.8333529233932495, + "learning_rate": 9.277359396233176e-06, + "loss": 1.04, + "step": 2350 + }, + { + "epoch": 0.5382942186605609, + "grad_norm": 1.1809799671173096, + "learning_rate": 9.269960825802817e-06, + "loss": 1.0577, + "step": 2351 + }, + { + "epoch": 0.5385231825987407, + "grad_norm": 1.1359210014343262, + "learning_rate": 9.262562657106898e-06, + "loss": 1.0731, + "step": 2352 + }, + { + "epoch": 0.5387521465369204, + "grad_norm": 1.6872447729110718, + "learning_rate": 9.255164894216562e-06, + "loss": 1.0443, + "step": 2353 + }, + { + "epoch": 0.5389811104751002, + "grad_norm": 1.3061941862106323, + "learning_rate": 9.247767541202738e-06, + "loss": 1.0525, + "step": 2354 + }, + { + "epoch": 0.53921007441328, + "grad_norm": 1.4695810079574585, + "learning_rate": 9.240370602136134e-06, + "loss": 1.0589, + "step": 2355 + }, + { + "epoch": 0.5394390383514597, + "grad_norm": 1.203343152999878, + "learning_rate": 9.232974081087216e-06, + "loss": 1.0574, + "step": 2356 + }, + { + "epoch": 0.5396680022896394, + "grad_norm": 1.3898227214813232, + "learning_rate": 9.225577982126234e-06, + "loss": 1.0697, + "step": 2357 + }, + { + "epoch": 0.5398969662278191, + "grad_norm": 1.1392302513122559, + "learning_rate": 9.218182309323193e-06, + "loss": 1.0639, + "step": 2358 + }, + { + "epoch": 0.5401259301659989, + "grad_norm": 0.9744246602058411, + "learning_rate": 9.21078706674787e-06, + "loss": 1.0744, + "step": 2359 + }, + { + "epoch": 0.5403548941041786, + "grad_norm": 1.147935152053833, + "learning_rate": 9.203392258469814e-06, + "loss": 1.0258, + "step": 2360 + }, + { + "epoch": 0.5405838580423583, + "grad_norm": 1.1611018180847168, + "learning_rate": 9.195997888558312e-06, + "loss": 1.0645, + "step": 2361 + }, + { + "epoch": 0.540812821980538, + "grad_norm": 1.4205443859100342, + "learning_rate": 9.188603961082436e-06, + "loss": 1.0319, + "step": 2362 + }, + { + "epoch": 0.5410417859187178, + "grad_norm": 1.160415768623352, + "learning_rate": 9.181210480110997e-06, + "loss": 1.1, + "step": 2363 + }, + { + "epoch": 0.5412707498568975, + "grad_norm": 1.692002296447754, + "learning_rate": 9.173817449712563e-06, + "loss": 1.0234, + "step": 2364 + }, + { + "epoch": 0.5414997137950772, + "grad_norm": 1.0437875986099243, + "learning_rate": 9.166424873955462e-06, + "loss": 1.0846, + "step": 2365 + }, + { + "epoch": 0.5417286777332571, + "grad_norm": 2.888340473175049, + "learning_rate": 9.159032756907765e-06, + "loss": 1.0399, + "step": 2366 + }, + { + "epoch": 0.5419576416714368, + "grad_norm": 1.0444552898406982, + "learning_rate": 9.151641102637289e-06, + "loss": 1.0821, + "step": 2367 + }, + { + "epoch": 0.5421866056096165, + "grad_norm": 1.2841830253601074, + "learning_rate": 9.144249915211605e-06, + "loss": 1.0579, + "step": 2368 + }, + { + "epoch": 0.5424155695477962, + "grad_norm": 1.8997342586517334, + "learning_rate": 9.136859198698014e-06, + "loss": 1.1244, + "step": 2369 + }, + { + "epoch": 0.542644533485976, + "grad_norm": 1.4083105325698853, + "learning_rate": 9.12946895716357e-06, + "loss": 1.0149, + "step": 2370 + }, + { + "epoch": 0.5428734974241557, + "grad_norm": 1.2661566734313965, + "learning_rate": 9.122079194675064e-06, + "loss": 1.0958, + "step": 2371 + }, + { + "epoch": 0.5431024613623354, + "grad_norm": 1.254859209060669, + "learning_rate": 9.114689915299015e-06, + "loss": 1.1392, + "step": 2372 + }, + { + "epoch": 0.5433314253005151, + "grad_norm": 1.7026736736297607, + "learning_rate": 9.10730112310168e-06, + "loss": 1.0332, + "step": 2373 + }, + { + "epoch": 0.5435603892386949, + "grad_norm": 1.1682655811309814, + "learning_rate": 9.099912822149056e-06, + "loss": 1.0609, + "step": 2374 + }, + { + "epoch": 0.5437893531768746, + "grad_norm": 1.0199397802352905, + "learning_rate": 9.092525016506858e-06, + "loss": 1.0164, + "step": 2375 + }, + { + "epoch": 0.5440183171150543, + "grad_norm": 1.0826716423034668, + "learning_rate": 9.085137710240536e-06, + "loss": 1.0245, + "step": 2376 + }, + { + "epoch": 0.5442472810532342, + "grad_norm": 1.1761521100997925, + "learning_rate": 9.077750907415264e-06, + "loss": 1.0278, + "step": 2377 + }, + { + "epoch": 0.5444762449914139, + "grad_norm": 1.112975001335144, + "learning_rate": 9.07036461209593e-06, + "loss": 1.021, + "step": 2378 + }, + { + "epoch": 0.5447052089295936, + "grad_norm": 1.1769195795059204, + "learning_rate": 9.06297882834716e-06, + "loss": 1.0348, + "step": 2379 + }, + { + "epoch": 0.5449341728677733, + "grad_norm": 1.3177459239959717, + "learning_rate": 9.055593560233284e-06, + "loss": 1.0599, + "step": 2380 + }, + { + "epoch": 0.5451631368059531, + "grad_norm": 1.2103499174118042, + "learning_rate": 9.048208811818353e-06, + "loss": 1.0701, + "step": 2381 + }, + { + "epoch": 0.5453921007441328, + "grad_norm": 1.5910011529922485, + "learning_rate": 9.040824587166136e-06, + "loss": 1.1076, + "step": 2382 + }, + { + "epoch": 0.5456210646823125, + "grad_norm": 1.020159125328064, + "learning_rate": 9.033440890340102e-06, + "loss": 1.0, + "step": 2383 + }, + { + "epoch": 0.5458500286204923, + "grad_norm": 1.280670166015625, + "learning_rate": 9.026057725403445e-06, + "loss": 1.015, + "step": 2384 + }, + { + "epoch": 0.546078992558672, + "grad_norm": 1.4677191972732544, + "learning_rate": 9.018675096419059e-06, + "loss": 1.0787, + "step": 2385 + }, + { + "epoch": 0.5463079564968517, + "grad_norm": 1.1645160913467407, + "learning_rate": 9.011293007449535e-06, + "loss": 1.0954, + "step": 2386 + }, + { + "epoch": 0.5465369204350314, + "grad_norm": 1.4413548707962036, + "learning_rate": 9.003911462557185e-06, + "loss": 1.0522, + "step": 2387 + }, + { + "epoch": 0.5467658843732112, + "grad_norm": 1.051669955253601, + "learning_rate": 8.996530465804e-06, + "loss": 1.0343, + "step": 2388 + }, + { + "epoch": 0.546994848311391, + "grad_norm": 1.315771222114563, + "learning_rate": 8.98915002125169e-06, + "loss": 1.0497, + "step": 2389 + }, + { + "epoch": 0.5472238122495707, + "grad_norm": 1.1893324851989746, + "learning_rate": 8.981770132961649e-06, + "loss": 1.0671, + "step": 2390 + }, + { + "epoch": 0.5474527761877505, + "grad_norm": 1.8192094564437866, + "learning_rate": 8.974390804994966e-06, + "loss": 1.0127, + "step": 2391 + }, + { + "epoch": 0.5476817401259302, + "grad_norm": 1.2040550708770752, + "learning_rate": 8.96701204141242e-06, + "loss": 1.1041, + "step": 2392 + }, + { + "epoch": 0.5479107040641099, + "grad_norm": 1.1481995582580566, + "learning_rate": 8.959633846274494e-06, + "loss": 1.0686, + "step": 2393 + }, + { + "epoch": 0.5481396680022896, + "grad_norm": 1.2036935091018677, + "learning_rate": 8.952256223641337e-06, + "loss": 1.0603, + "step": 2394 + }, + { + "epoch": 0.5483686319404694, + "grad_norm": 1.1734176874160767, + "learning_rate": 8.944879177572796e-06, + "loss": 1.0554, + "step": 2395 + }, + { + "epoch": 0.5485975958786491, + "grad_norm": 1.3413574695587158, + "learning_rate": 8.9375027121284e-06, + "loss": 0.9716, + "step": 2396 + }, + { + "epoch": 0.5488265598168288, + "grad_norm": 1.8128085136413574, + "learning_rate": 8.930126831367346e-06, + "loss": 1.0691, + "step": 2397 + }, + { + "epoch": 0.5490555237550085, + "grad_norm": 1.5389128923416138, + "learning_rate": 8.922751539348534e-06, + "loss": 1.0215, + "step": 2398 + }, + { + "epoch": 0.5492844876931883, + "grad_norm": 1.3885033130645752, + "learning_rate": 8.915376840130517e-06, + "loss": 1.1011, + "step": 2399 + }, + { + "epoch": 0.5495134516313681, + "grad_norm": 1.6749422550201416, + "learning_rate": 8.908002737771529e-06, + "loss": 1.0884, + "step": 2400 + }, + { + "epoch": 0.5497424155695478, + "grad_norm": 1.39671790599823, + "learning_rate": 8.900629236329482e-06, + "loss": 1.0296, + "step": 2401 + }, + { + "epoch": 0.5499713795077276, + "grad_norm": 1.2996166944503784, + "learning_rate": 8.893256339861946e-06, + "loss": 1.0335, + "step": 2402 + }, + { + "epoch": 0.5502003434459073, + "grad_norm": 1.482348084449768, + "learning_rate": 8.885884052426168e-06, + "loss": 1.0564, + "step": 2403 + }, + { + "epoch": 0.550429307384087, + "grad_norm": 1.385023593902588, + "learning_rate": 8.878512378079057e-06, + "loss": 1.0633, + "step": 2404 + }, + { + "epoch": 0.5506582713222667, + "grad_norm": 1.3737043142318726, + "learning_rate": 8.871141320877181e-06, + "loss": 1.0962, + "step": 2405 + }, + { + "epoch": 0.5508872352604465, + "grad_norm": 1.2021304368972778, + "learning_rate": 8.86377088487677e-06, + "loss": 1.0578, + "step": 2406 + }, + { + "epoch": 0.5511161991986262, + "grad_norm": 1.1903609037399292, + "learning_rate": 8.856401074133718e-06, + "loss": 1.0453, + "step": 2407 + }, + { + "epoch": 0.5513451631368059, + "grad_norm": 1.103837013244629, + "learning_rate": 8.849031892703564e-06, + "loss": 0.9953, + "step": 2408 + }, + { + "epoch": 0.5515741270749857, + "grad_norm": 1.2637211084365845, + "learning_rate": 8.841663344641514e-06, + "loss": 1.0946, + "step": 2409 + }, + { + "epoch": 0.5518030910131654, + "grad_norm": 1.2190263271331787, + "learning_rate": 8.83429543400241e-06, + "loss": 1.0259, + "step": 2410 + }, + { + "epoch": 0.5520320549513451, + "grad_norm": 1.3978723287582397, + "learning_rate": 8.826928164840755e-06, + "loss": 1.0372, + "step": 2411 + }, + { + "epoch": 0.552261018889525, + "grad_norm": 1.1146042346954346, + "learning_rate": 8.819561541210698e-06, + "loss": 1.0455, + "step": 2412 + }, + { + "epoch": 0.5524899828277047, + "grad_norm": 1.5733097791671753, + "learning_rate": 8.812195567166028e-06, + "loss": 1.0123, + "step": 2413 + }, + { + "epoch": 0.5527189467658844, + "grad_norm": 1.4020459651947021, + "learning_rate": 8.804830246760175e-06, + "loss": 1.0588, + "step": 2414 + }, + { + "epoch": 0.5529479107040641, + "grad_norm": 1.1088488101959229, + "learning_rate": 8.797465584046225e-06, + "loss": 1.0534, + "step": 2415 + }, + { + "epoch": 0.5531768746422439, + "grad_norm": 1.1872738599777222, + "learning_rate": 8.790101583076874e-06, + "loss": 1.0252, + "step": 2416 + }, + { + "epoch": 0.5534058385804236, + "grad_norm": 1.0902388095855713, + "learning_rate": 8.78273824790448e-06, + "loss": 1.0794, + "step": 2417 + }, + { + "epoch": 0.5536348025186033, + "grad_norm": 1.246376633644104, + "learning_rate": 8.775375582581027e-06, + "loss": 1.0609, + "step": 2418 + }, + { + "epoch": 0.553863766456783, + "grad_norm": 1.1201062202453613, + "learning_rate": 8.76801359115812e-06, + "loss": 1.0713, + "step": 2419 + }, + { + "epoch": 0.5540927303949628, + "grad_norm": 1.1334254741668701, + "learning_rate": 8.760652277687007e-06, + "loss": 1.0681, + "step": 2420 + }, + { + "epoch": 0.5543216943331425, + "grad_norm": 1.380236029624939, + "learning_rate": 8.75329164621855e-06, + "loss": 1.0457, + "step": 2421 + }, + { + "epoch": 0.5545506582713222, + "grad_norm": 1.3209164142608643, + "learning_rate": 8.745931700803251e-06, + "loss": 1.0322, + "step": 2422 + }, + { + "epoch": 0.554779622209502, + "grad_norm": 1.5157129764556885, + "learning_rate": 8.738572445491225e-06, + "loss": 1.0049, + "step": 2423 + }, + { + "epoch": 0.5550085861476818, + "grad_norm": 1.1138639450073242, + "learning_rate": 8.731213884332205e-06, + "loss": 1.0625, + "step": 2424 + }, + { + "epoch": 0.5552375500858615, + "grad_norm": 1.2549896240234375, + "learning_rate": 8.723856021375543e-06, + "loss": 1.0304, + "step": 2425 + }, + { + "epoch": 0.5554665140240412, + "grad_norm": 1.2935527563095093, + "learning_rate": 8.716498860670218e-06, + "loss": 1.0602, + "step": 2426 + }, + { + "epoch": 0.555695477962221, + "grad_norm": 1.2371066808700562, + "learning_rate": 8.709142406264807e-06, + "loss": 1.0347, + "step": 2427 + }, + { + "epoch": 0.5559244419004007, + "grad_norm": 1.1444132328033447, + "learning_rate": 8.701786662207506e-06, + "loss": 1.0418, + "step": 2428 + }, + { + "epoch": 0.5561534058385804, + "grad_norm": 1.1240417957305908, + "learning_rate": 8.694431632546127e-06, + "loss": 0.9742, + "step": 2429 + }, + { + "epoch": 0.5563823697767601, + "grad_norm": 1.1536635160446167, + "learning_rate": 8.687077321328066e-06, + "loss": 1.0736, + "step": 2430 + }, + { + "epoch": 0.5566113337149399, + "grad_norm": 1.3496026992797852, + "learning_rate": 8.679723732600355e-06, + "loss": 1.0392, + "step": 2431 + }, + { + "epoch": 0.5568402976531196, + "grad_norm": 1.3216404914855957, + "learning_rate": 8.672370870409601e-06, + "loss": 1.053, + "step": 2432 + }, + { + "epoch": 0.5570692615912993, + "grad_norm": 1.1046510934829712, + "learning_rate": 8.665018738802023e-06, + "loss": 1.0534, + "step": 2433 + }, + { + "epoch": 0.557298225529479, + "grad_norm": 1.7113914489746094, + "learning_rate": 8.657667341823449e-06, + "loss": 1.092, + "step": 2434 + }, + { + "epoch": 0.5575271894676589, + "grad_norm": 1.2329431772232056, + "learning_rate": 8.650316683519276e-06, + "loss": 1.0156, + "step": 2435 + }, + { + "epoch": 0.5577561534058386, + "grad_norm": 1.0992010831832886, + "learning_rate": 8.642966767934516e-06, + "loss": 1.0104, + "step": 2436 + }, + { + "epoch": 0.5579851173440183, + "grad_norm": 1.1873196363449097, + "learning_rate": 8.63561759911377e-06, + "loss": 0.9733, + "step": 2437 + }, + { + "epoch": 0.5582140812821981, + "grad_norm": 1.360651969909668, + "learning_rate": 8.628269181101216e-06, + "loss": 1.0369, + "step": 2438 + }, + { + "epoch": 0.5584430452203778, + "grad_norm": 1.1703392267227173, + "learning_rate": 8.620921517940635e-06, + "loss": 1.0778, + "step": 2439 + }, + { + "epoch": 0.5586720091585575, + "grad_norm": 1.1417691707611084, + "learning_rate": 8.61357461367538e-06, + "loss": 1.0593, + "step": 2440 + }, + { + "epoch": 0.5589009730967373, + "grad_norm": 1.130734920501709, + "learning_rate": 8.60622847234839e-06, + "loss": 1.0531, + "step": 2441 + }, + { + "epoch": 0.559129937034917, + "grad_norm": 1.1957651376724243, + "learning_rate": 8.598883098002188e-06, + "loss": 1.0961, + "step": 2442 + }, + { + "epoch": 0.5593589009730967, + "grad_norm": 1.3269649744033813, + "learning_rate": 8.591538494678867e-06, + "loss": 1.0608, + "step": 2443 + }, + { + "epoch": 0.5595878649112764, + "grad_norm": 1.2087531089782715, + "learning_rate": 8.584194666420105e-06, + "loss": 1.1281, + "step": 2444 + }, + { + "epoch": 0.5598168288494562, + "grad_norm": 1.5364724397659302, + "learning_rate": 8.576851617267151e-06, + "loss": 1.0366, + "step": 2445 + }, + { + "epoch": 0.560045792787636, + "grad_norm": 1.299095630645752, + "learning_rate": 8.569509351260817e-06, + "loss": 1.0942, + "step": 2446 + }, + { + "epoch": 0.5602747567258157, + "grad_norm": 1.0818737745285034, + "learning_rate": 8.562167872441493e-06, + "loss": 1.1259, + "step": 2447 + }, + { + "epoch": 0.5605037206639955, + "grad_norm": 1.1360951662063599, + "learning_rate": 8.554827184849139e-06, + "loss": 1.1018, + "step": 2448 + }, + { + "epoch": 0.5607326846021752, + "grad_norm": 1.383863925933838, + "learning_rate": 8.547487292523264e-06, + "loss": 1.0885, + "step": 2449 + }, + { + "epoch": 0.5609616485403549, + "grad_norm": 1.0860772132873535, + "learning_rate": 8.540148199502955e-06, + "loss": 1.0822, + "step": 2450 + }, + { + "epoch": 0.5611906124785346, + "grad_norm": 1.1337288618087769, + "learning_rate": 8.532809909826854e-06, + "loss": 1.0814, + "step": 2451 + }, + { + "epoch": 0.5614195764167144, + "grad_norm": 1.118790626525879, + "learning_rate": 8.525472427533156e-06, + "loss": 1.0809, + "step": 2452 + }, + { + "epoch": 0.5616485403548941, + "grad_norm": 1.1470918655395508, + "learning_rate": 8.518135756659624e-06, + "loss": 1.0631, + "step": 2453 + }, + { + "epoch": 0.5618775042930738, + "grad_norm": 1.0622974634170532, + "learning_rate": 8.510799901243554e-06, + "loss": 1.0763, + "step": 2454 + }, + { + "epoch": 0.5621064682312535, + "grad_norm": 1.0906604528427124, + "learning_rate": 8.503464865321817e-06, + "loss": 1.069, + "step": 2455 + }, + { + "epoch": 0.5623354321694333, + "grad_norm": 1.1250942945480347, + "learning_rate": 8.496130652930818e-06, + "loss": 1.0974, + "step": 2456 + }, + { + "epoch": 0.562564396107613, + "grad_norm": 1.2028555870056152, + "learning_rate": 8.48879726810651e-06, + "loss": 1.0351, + "step": 2457 + }, + { + "epoch": 0.5627933600457928, + "grad_norm": 1.8054412603378296, + "learning_rate": 8.481464714884396e-06, + "loss": 1.0841, + "step": 2458 + }, + { + "epoch": 0.5630223239839726, + "grad_norm": 1.1389802694320679, + "learning_rate": 8.474132997299521e-06, + "loss": 1.0583, + "step": 2459 + }, + { + "epoch": 0.5632512879221523, + "grad_norm": 1.328384280204773, + "learning_rate": 8.466802119386462e-06, + "loss": 1.0583, + "step": 2460 + }, + { + "epoch": 0.563480251860332, + "grad_norm": 1.3099395036697388, + "learning_rate": 8.459472085179342e-06, + "loss": 1.0417, + "step": 2461 + }, + { + "epoch": 0.5637092157985117, + "grad_norm": 1.1930067539215088, + "learning_rate": 8.45214289871182e-06, + "loss": 1.0465, + "step": 2462 + }, + { + "epoch": 0.5639381797366915, + "grad_norm": 1.0933071374893188, + "learning_rate": 8.44481456401708e-06, + "loss": 1.0736, + "step": 2463 + }, + { + "epoch": 0.5641671436748712, + "grad_norm": 1.0359748601913452, + "learning_rate": 8.437487085127851e-06, + "loss": 1.0145, + "step": 2464 + }, + { + "epoch": 0.5643961076130509, + "grad_norm": 1.7698050737380981, + "learning_rate": 8.430160466076378e-06, + "loss": 1.0021, + "step": 2465 + }, + { + "epoch": 0.5646250715512307, + "grad_norm": 1.2962058782577515, + "learning_rate": 8.422834710894434e-06, + "loss": 1.05, + "step": 2466 + }, + { + "epoch": 0.5648540354894104, + "grad_norm": 1.083851933479309, + "learning_rate": 8.415509823613332e-06, + "loss": 0.9928, + "step": 2467 + }, + { + "epoch": 0.5650829994275901, + "grad_norm": 1.2015832662582397, + "learning_rate": 8.408185808263886e-06, + "loss": 1.0259, + "step": 2468 + }, + { + "epoch": 0.5653119633657699, + "grad_norm": 1.0598255395889282, + "learning_rate": 8.400862668876445e-06, + "loss": 1.0091, + "step": 2469 + }, + { + "epoch": 0.5655409273039497, + "grad_norm": 1.0813366174697876, + "learning_rate": 8.393540409480873e-06, + "loss": 1.0822, + "step": 2470 + }, + { + "epoch": 0.5657698912421294, + "grad_norm": 1.1584067344665527, + "learning_rate": 8.38621903410654e-06, + "loss": 1.0001, + "step": 2471 + }, + { + "epoch": 0.5659988551803091, + "grad_norm": 1.030510425567627, + "learning_rate": 8.378898546782344e-06, + "loss": 0.9853, + "step": 2472 + }, + { + "epoch": 0.5662278191184889, + "grad_norm": 1.317218542098999, + "learning_rate": 8.371578951536689e-06, + "loss": 1.0422, + "step": 2473 + }, + { + "epoch": 0.5664567830566686, + "grad_norm": 1.258917212486267, + "learning_rate": 8.364260252397483e-06, + "loss": 1.0164, + "step": 2474 + }, + { + "epoch": 0.5666857469948483, + "grad_norm": 1.1054089069366455, + "learning_rate": 8.356942453392147e-06, + "loss": 1.01, + "step": 2475 + }, + { + "epoch": 0.566914710933028, + "grad_norm": 1.110060691833496, + "learning_rate": 8.349625558547599e-06, + "loss": 1.0202, + "step": 2476 + }, + { + "epoch": 0.5671436748712078, + "grad_norm": 1.5175520181655884, + "learning_rate": 8.342309571890272e-06, + "loss": 1.0423, + "step": 2477 + }, + { + "epoch": 0.5673726388093875, + "grad_norm": 1.2970167398452759, + "learning_rate": 8.334994497446091e-06, + "loss": 1.1032, + "step": 2478 + }, + { + "epoch": 0.5676016027475672, + "grad_norm": 1.1656423807144165, + "learning_rate": 8.327680339240478e-06, + "loss": 1.0052, + "step": 2479 + }, + { + "epoch": 0.5678305666857469, + "grad_norm": 1.0043197870254517, + "learning_rate": 8.320367101298351e-06, + "loss": 1.0508, + "step": 2480 + }, + { + "epoch": 0.5680595306239268, + "grad_norm": 1.8934231996536255, + "learning_rate": 8.313054787644131e-06, + "loss": 1.0589, + "step": 2481 + }, + { + "epoch": 0.5682884945621065, + "grad_norm": 1.1924573183059692, + "learning_rate": 8.305743402301714e-06, + "loss": 1.0504, + "step": 2482 + }, + { + "epoch": 0.5685174585002862, + "grad_norm": 2.6312386989593506, + "learning_rate": 8.298432949294499e-06, + "loss": 0.9884, + "step": 2483 + }, + { + "epoch": 0.568746422438466, + "grad_norm": 1.0763286352157593, + "learning_rate": 8.29112343264537e-06, + "loss": 1.075, + "step": 2484 + }, + { + "epoch": 0.5689753863766457, + "grad_norm": 2.5799460411071777, + "learning_rate": 8.283814856376681e-06, + "loss": 1.0406, + "step": 2485 + }, + { + "epoch": 0.5692043503148254, + "grad_norm": 1.2226494550704956, + "learning_rate": 8.276507224510294e-06, + "loss": 1.1118, + "step": 2486 + }, + { + "epoch": 0.5694333142530051, + "grad_norm": 1.126726746559143, + "learning_rate": 8.26920054106753e-06, + "loss": 1.0838, + "step": 2487 + }, + { + "epoch": 0.5696622781911849, + "grad_norm": 1.5410242080688477, + "learning_rate": 8.261894810069197e-06, + "loss": 1.0668, + "step": 2488 + }, + { + "epoch": 0.5698912421293646, + "grad_norm": 1.1347763538360596, + "learning_rate": 8.25459003553558e-06, + "loss": 1.1072, + "step": 2489 + }, + { + "epoch": 0.5701202060675443, + "grad_norm": 1.095646858215332, + "learning_rate": 8.247286221486429e-06, + "loss": 1.0138, + "step": 2490 + }, + { + "epoch": 0.570349170005724, + "grad_norm": 1.7708925008773804, + "learning_rate": 8.239983371940975e-06, + "loss": 1.0268, + "step": 2491 + }, + { + "epoch": 0.5705781339439039, + "grad_norm": 1.2089495658874512, + "learning_rate": 8.232681490917919e-06, + "loss": 1.0141, + "step": 2492 + }, + { + "epoch": 0.5708070978820836, + "grad_norm": 1.059664249420166, + "learning_rate": 8.22538058243542e-06, + "loss": 1.0827, + "step": 2493 + }, + { + "epoch": 0.5710360618202633, + "grad_norm": 1.3616527318954468, + "learning_rate": 8.218080650511107e-06, + "loss": 1.0793, + "step": 2494 + }, + { + "epoch": 0.5712650257584431, + "grad_norm": 1.2198219299316406, + "learning_rate": 8.210781699162075e-06, + "loss": 1.0753, + "step": 2495 + }, + { + "epoch": 0.5714939896966228, + "grad_norm": 1.3348844051361084, + "learning_rate": 8.203483732404872e-06, + "loss": 1.0341, + "step": 2496 + }, + { + "epoch": 0.5717229536348025, + "grad_norm": 1.160372257232666, + "learning_rate": 8.196186754255509e-06, + "loss": 1.0313, + "step": 2497 + }, + { + "epoch": 0.5719519175729822, + "grad_norm": 1.0435636043548584, + "learning_rate": 8.188890768729452e-06, + "loss": 1.0094, + "step": 2498 + }, + { + "epoch": 0.572180881511162, + "grad_norm": 1.090612530708313, + "learning_rate": 8.181595779841618e-06, + "loss": 1.0399, + "step": 2499 + }, + { + "epoch": 0.5724098454493417, + "grad_norm": 1.496956467628479, + "learning_rate": 8.174301791606384e-06, + "loss": 0.9976, + "step": 2500 + }, + { + "epoch": 0.5726388093875214, + "grad_norm": 1.1636426448822021, + "learning_rate": 8.167008808037568e-06, + "loss": 1.0563, + "step": 2501 + }, + { + "epoch": 0.5728677733257012, + "grad_norm": 1.6248148679733276, + "learning_rate": 8.159716833148432e-06, + "loss": 1.0554, + "step": 2502 + }, + { + "epoch": 0.5730967372638809, + "grad_norm": 1.1249679327011108, + "learning_rate": 8.152425870951701e-06, + "loss": 1.0476, + "step": 2503 + }, + { + "epoch": 0.5733257012020607, + "grad_norm": 1.0898033380508423, + "learning_rate": 8.145135925459518e-06, + "loss": 1.1075, + "step": 2504 + }, + { + "epoch": 0.5735546651402404, + "grad_norm": 1.1172502040863037, + "learning_rate": 8.137847000683485e-06, + "loss": 1.0698, + "step": 2505 + }, + { + "epoch": 0.5737836290784202, + "grad_norm": 1.1264135837554932, + "learning_rate": 8.130559100634639e-06, + "loss": 1.0334, + "step": 2506 + }, + { + "epoch": 0.5740125930165999, + "grad_norm": 1.164330005645752, + "learning_rate": 8.123272229323443e-06, + "loss": 0.9811, + "step": 2507 + }, + { + "epoch": 0.5742415569547796, + "grad_norm": 1.1172728538513184, + "learning_rate": 8.115986390759805e-06, + "loss": 1.087, + "step": 2508 + }, + { + "epoch": 0.5744705208929594, + "grad_norm": 1.0987147092819214, + "learning_rate": 8.108701588953059e-06, + "loss": 1.1124, + "step": 2509 + }, + { + "epoch": 0.5746994848311391, + "grad_norm": 1.3992830514907837, + "learning_rate": 8.101417827911975e-06, + "loss": 1.0272, + "step": 2510 + }, + { + "epoch": 0.5749284487693188, + "grad_norm": 1.1009325981140137, + "learning_rate": 8.094135111644741e-06, + "loss": 1.0467, + "step": 2511 + }, + { + "epoch": 0.5751574127074985, + "grad_norm": 1.2195725440979004, + "learning_rate": 8.086853444158977e-06, + "loss": 1.0822, + "step": 2512 + }, + { + "epoch": 0.5753863766456783, + "grad_norm": 1.24541175365448, + "learning_rate": 8.07957282946172e-06, + "loss": 1.0975, + "step": 2513 + }, + { + "epoch": 0.575615340583858, + "grad_norm": 1.9513866901397705, + "learning_rate": 8.072293271559439e-06, + "loss": 1.0539, + "step": 2514 + }, + { + "epoch": 0.5758443045220378, + "grad_norm": 1.3148459196090698, + "learning_rate": 8.065014774458004e-06, + "loss": 1.0509, + "step": 2515 + }, + { + "epoch": 0.5760732684602176, + "grad_norm": 1.848658800125122, + "learning_rate": 8.05773734216272e-06, + "loss": 1.071, + "step": 2516 + }, + { + "epoch": 0.5763022323983973, + "grad_norm": 1.1349958181381226, + "learning_rate": 8.05046097867829e-06, + "loss": 1.0761, + "step": 2517 + }, + { + "epoch": 0.576531196336577, + "grad_norm": 1.2456731796264648, + "learning_rate": 8.043185688008837e-06, + "loss": 1.0541, + "step": 2518 + }, + { + "epoch": 0.5767601602747567, + "grad_norm": 1.2999017238616943, + "learning_rate": 8.035911474157897e-06, + "loss": 1.0678, + "step": 2519 + }, + { + "epoch": 0.5769891242129365, + "grad_norm": 1.1113868951797485, + "learning_rate": 8.028638341128405e-06, + "loss": 1.0296, + "step": 2520 + }, + { + "epoch": 0.5772180881511162, + "grad_norm": 1.351574182510376, + "learning_rate": 8.021366292922704e-06, + "loss": 0.9984, + "step": 2521 + }, + { + "epoch": 0.5774470520892959, + "grad_norm": 1.0689737796783447, + "learning_rate": 8.014095333542548e-06, + "loss": 1.0453, + "step": 2522 + }, + { + "epoch": 0.5776760160274756, + "grad_norm": 1.2941296100616455, + "learning_rate": 8.006825466989075e-06, + "loss": 1.0933, + "step": 2523 + }, + { + "epoch": 0.5779049799656554, + "grad_norm": 1.1887894868850708, + "learning_rate": 7.999556697262838e-06, + "loss": 0.9727, + "step": 2524 + }, + { + "epoch": 0.5781339439038351, + "grad_norm": 1.1151421070098877, + "learning_rate": 7.992289028363782e-06, + "loss": 1.037, + "step": 2525 + }, + { + "epoch": 0.5783629078420148, + "grad_norm": 1.1871955394744873, + "learning_rate": 7.985022464291236e-06, + "loss": 1.0871, + "step": 2526 + }, + { + "epoch": 0.5785918717801947, + "grad_norm": 1.3512312173843384, + "learning_rate": 7.977757009043941e-06, + "loss": 1.0276, + "step": 2527 + }, + { + "epoch": 0.5788208357183744, + "grad_norm": 1.1984821557998657, + "learning_rate": 7.970492666620002e-06, + "loss": 1.0461, + "step": 2528 + }, + { + "epoch": 0.5790497996565541, + "grad_norm": 1.1962758302688599, + "learning_rate": 7.963229441016938e-06, + "loss": 1.0625, + "step": 2529 + }, + { + "epoch": 0.5792787635947338, + "grad_norm": 1.174294352531433, + "learning_rate": 7.955967336231635e-06, + "loss": 0.9852, + "step": 2530 + }, + { + "epoch": 0.5795077275329136, + "grad_norm": 1.1075918674468994, + "learning_rate": 7.948706356260367e-06, + "loss": 1.1239, + "step": 2531 + }, + { + "epoch": 0.5797366914710933, + "grad_norm": 1.2722827196121216, + "learning_rate": 7.941446505098795e-06, + "loss": 1.0525, + "step": 2532 + }, + { + "epoch": 0.579965655409273, + "grad_norm": 1.085821509361267, + "learning_rate": 7.934187786741956e-06, + "loss": 1.0824, + "step": 2533 + }, + { + "epoch": 0.5801946193474528, + "grad_norm": 1.2172048091888428, + "learning_rate": 7.926930205184254e-06, + "loss": 1.0138, + "step": 2534 + }, + { + "epoch": 0.5804235832856325, + "grad_norm": 1.2068073749542236, + "learning_rate": 7.919673764419479e-06, + "loss": 1.0394, + "step": 2535 + }, + { + "epoch": 0.5806525472238122, + "grad_norm": 1.9181028604507446, + "learning_rate": 7.912418468440794e-06, + "loss": 1.0098, + "step": 2536 + }, + { + "epoch": 0.5808815111619919, + "grad_norm": 2.7652781009674072, + "learning_rate": 7.90516432124072e-06, + "loss": 1.1164, + "step": 2537 + }, + { + "epoch": 0.5811104751001718, + "grad_norm": 1.5410016775131226, + "learning_rate": 7.89791132681116e-06, + "loss": 1.0504, + "step": 2538 + }, + { + "epoch": 0.5813394390383515, + "grad_norm": 1.0692219734191895, + "learning_rate": 7.89065948914337e-06, + "loss": 1.0947, + "step": 2539 + }, + { + "epoch": 0.5815684029765312, + "grad_norm": 1.367213249206543, + "learning_rate": 7.883408812227977e-06, + "loss": 1.0337, + "step": 2540 + }, + { + "epoch": 0.581797366914711, + "grad_norm": 1.8254352807998657, + "learning_rate": 7.876159300054974e-06, + "loss": 1.0041, + "step": 2541 + }, + { + "epoch": 0.5820263308528907, + "grad_norm": 1.1613630056381226, + "learning_rate": 7.868910956613697e-06, + "loss": 1.0489, + "step": 2542 + }, + { + "epoch": 0.5822552947910704, + "grad_norm": 1.2703049182891846, + "learning_rate": 7.861663785892857e-06, + "loss": 1.0312, + "step": 2543 + }, + { + "epoch": 0.5824842587292501, + "grad_norm": 1.263065218925476, + "learning_rate": 7.854417791880508e-06, + "loss": 1.031, + "step": 2544 + }, + { + "epoch": 0.5827132226674299, + "grad_norm": 1.1359111070632935, + "learning_rate": 7.847172978564055e-06, + "loss": 1.0146, + "step": 2545 + }, + { + "epoch": 0.5829421866056096, + "grad_norm": 1.2447282075881958, + "learning_rate": 7.839929349930266e-06, + "loss": 1.045, + "step": 2546 + }, + { + "epoch": 0.5831711505437893, + "grad_norm": 1.083162546157837, + "learning_rate": 7.832686909965248e-06, + "loss": 1.0664, + "step": 2547 + }, + { + "epoch": 0.583400114481969, + "grad_norm": 1.1830934286117554, + "learning_rate": 7.82544566265445e-06, + "loss": 0.9864, + "step": 2548 + }, + { + "epoch": 0.5836290784201488, + "grad_norm": 1.7827184200286865, + "learning_rate": 7.818205611982673e-06, + "loss": 1.0378, + "step": 2549 + }, + { + "epoch": 0.5838580423583286, + "grad_norm": 1.157387614250183, + "learning_rate": 7.810966761934053e-06, + "loss": 1.033, + "step": 2550 + }, + { + "epoch": 0.5840870062965083, + "grad_norm": 1.540801763534546, + "learning_rate": 7.803729116492072e-06, + "loss": 1.0447, + "step": 2551 + }, + { + "epoch": 0.5843159702346881, + "grad_norm": 1.1215074062347412, + "learning_rate": 7.796492679639549e-06, + "loss": 1.0606, + "step": 2552 + }, + { + "epoch": 0.5845449341728678, + "grad_norm": 1.2133134603500366, + "learning_rate": 7.789257455358625e-06, + "loss": 1.0275, + "step": 2553 + }, + { + "epoch": 0.5847738981110475, + "grad_norm": 1.3863023519515991, + "learning_rate": 7.782023447630789e-06, + "loss": 1.0156, + "step": 2554 + }, + { + "epoch": 0.5850028620492272, + "grad_norm": 1.131239891052246, + "learning_rate": 7.774790660436857e-06, + "loss": 0.996, + "step": 2555 + }, + { + "epoch": 0.585231825987407, + "grad_norm": 1.179404377937317, + "learning_rate": 7.767559097756966e-06, + "loss": 1.0682, + "step": 2556 + }, + { + "epoch": 0.5854607899255867, + "grad_norm": 2.166602373123169, + "learning_rate": 7.760328763570589e-06, + "loss": 1.0358, + "step": 2557 + }, + { + "epoch": 0.5856897538637664, + "grad_norm": 1.535658836364746, + "learning_rate": 7.753099661856516e-06, + "loss": 1.0759, + "step": 2558 + }, + { + "epoch": 0.5859187178019462, + "grad_norm": 1.1416999101638794, + "learning_rate": 7.745871796592857e-06, + "loss": 1.0636, + "step": 2559 + }, + { + "epoch": 0.5861476817401259, + "grad_norm": 1.214870572090149, + "learning_rate": 7.738645171757054e-06, + "loss": 1.0911, + "step": 2560 + }, + { + "epoch": 0.5863766456783057, + "grad_norm": 1.7018429040908813, + "learning_rate": 7.731419791325852e-06, + "loss": 1.0343, + "step": 2561 + }, + { + "epoch": 0.5866056096164854, + "grad_norm": 1.0340094566345215, + "learning_rate": 7.72419565927532e-06, + "loss": 1.0418, + "step": 2562 + }, + { + "epoch": 0.5868345735546652, + "grad_norm": 1.4168850183486938, + "learning_rate": 7.716972779580836e-06, + "loss": 1.0363, + "step": 2563 + }, + { + "epoch": 0.5870635374928449, + "grad_norm": 1.3030794858932495, + "learning_rate": 7.709751156217088e-06, + "loss": 1.0039, + "step": 2564 + }, + { + "epoch": 0.5872925014310246, + "grad_norm": 1.357153296470642, + "learning_rate": 7.702530793158079e-06, + "loss": 1.1281, + "step": 2565 + }, + { + "epoch": 0.5875214653692044, + "grad_norm": 1.1770304441452026, + "learning_rate": 7.695311694377116e-06, + "loss": 1.037, + "step": 2566 + }, + { + "epoch": 0.5877504293073841, + "grad_norm": 1.2851818799972534, + "learning_rate": 7.688093863846801e-06, + "loss": 1.0687, + "step": 2567 + }, + { + "epoch": 0.5879793932455638, + "grad_norm": 1.1500670909881592, + "learning_rate": 7.680877305539048e-06, + "loss": 1.0786, + "step": 2568 + }, + { + "epoch": 0.5882083571837435, + "grad_norm": 1.660108208656311, + "learning_rate": 7.673662023425074e-06, + "loss": 1.0077, + "step": 2569 + }, + { + "epoch": 0.5884373211219233, + "grad_norm": 1.0523499250411987, + "learning_rate": 7.666448021475385e-06, + "loss": 1.0091, + "step": 2570 + }, + { + "epoch": 0.588666285060103, + "grad_norm": 1.180342197418213, + "learning_rate": 7.659235303659784e-06, + "loss": 1.0668, + "step": 2571 + }, + { + "epoch": 0.5888952489982827, + "grad_norm": 1.269478440284729, + "learning_rate": 7.65202387394737e-06, + "loss": 1.0988, + "step": 2572 + }, + { + "epoch": 0.5891242129364626, + "grad_norm": 1.2310328483581543, + "learning_rate": 7.644813736306531e-06, + "loss": 1.0087, + "step": 2573 + }, + { + "epoch": 0.5893531768746423, + "grad_norm": 1.2201181650161743, + "learning_rate": 7.637604894704951e-06, + "loss": 1.097, + "step": 2574 + }, + { + "epoch": 0.589582140812822, + "grad_norm": 1.19076406955719, + "learning_rate": 7.630397353109588e-06, + "loss": 1.0493, + "step": 2575 + }, + { + "epoch": 0.5898111047510017, + "grad_norm": 1.3227699995040894, + "learning_rate": 7.623191115486695e-06, + "loss": 1.0598, + "step": 2576 + }, + { + "epoch": 0.5900400686891815, + "grad_norm": 1.1145106554031372, + "learning_rate": 7.615986185801807e-06, + "loss": 1.0267, + "step": 2577 + }, + { + "epoch": 0.5902690326273612, + "grad_norm": 1.8626641035079956, + "learning_rate": 7.608782568019729e-06, + "loss": 1.0034, + "step": 2578 + }, + { + "epoch": 0.5904979965655409, + "grad_norm": 1.3820207118988037, + "learning_rate": 7.601580266104558e-06, + "loss": 1.0034, + "step": 2579 + }, + { + "epoch": 0.5907269605037206, + "grad_norm": 1.131319522857666, + "learning_rate": 7.594379284019659e-06, + "loss": 1.0725, + "step": 2580 + }, + { + "epoch": 0.5909559244419004, + "grad_norm": 1.295155644416809, + "learning_rate": 7.587179625727671e-06, + "loss": 1.0437, + "step": 2581 + }, + { + "epoch": 0.5911848883800801, + "grad_norm": 1.355495572090149, + "learning_rate": 7.579981295190506e-06, + "loss": 1.0339, + "step": 2582 + }, + { + "epoch": 0.5914138523182598, + "grad_norm": 1.4129412174224854, + "learning_rate": 7.572784296369342e-06, + "loss": 1.0228, + "step": 2583 + }, + { + "epoch": 0.5916428162564397, + "grad_norm": 1.3346372842788696, + "learning_rate": 7.565588633224632e-06, + "loss": 1.0627, + "step": 2584 + }, + { + "epoch": 0.5918717801946194, + "grad_norm": 1.4779704809188843, + "learning_rate": 7.558394309716088e-06, + "loss": 1.1153, + "step": 2585 + }, + { + "epoch": 0.5921007441327991, + "grad_norm": 1.397422194480896, + "learning_rate": 7.551201329802684e-06, + "loss": 1.0255, + "step": 2586 + }, + { + "epoch": 0.5923297080709788, + "grad_norm": 1.3591479063034058, + "learning_rate": 7.544009697442656e-06, + "loss": 1.0635, + "step": 2587 + }, + { + "epoch": 0.5925586720091586, + "grad_norm": 1.7575359344482422, + "learning_rate": 7.536819416593504e-06, + "loss": 1.0804, + "step": 2588 + }, + { + "epoch": 0.5927876359473383, + "grad_norm": 1.3760896921157837, + "learning_rate": 7.529630491211972e-06, + "loss": 1.0748, + "step": 2589 + }, + { + "epoch": 0.593016599885518, + "grad_norm": 1.791036605834961, + "learning_rate": 7.522442925254068e-06, + "loss": 1.0705, + "step": 2590 + }, + { + "epoch": 0.5932455638236978, + "grad_norm": 1.271471619606018, + "learning_rate": 7.515256722675058e-06, + "loss": 1.0509, + "step": 2591 + }, + { + "epoch": 0.5934745277618775, + "grad_norm": 1.1143954992294312, + "learning_rate": 7.508071887429433e-06, + "loss": 1.0352, + "step": 2592 + }, + { + "epoch": 0.5937034917000572, + "grad_norm": 1.292467474937439, + "learning_rate": 7.500888423470962e-06, + "loss": 1.013, + "step": 2593 + }, + { + "epoch": 0.5939324556382369, + "grad_norm": 1.4147076606750488, + "learning_rate": 7.493706334752637e-06, + "loss": 1.1661, + "step": 2594 + }, + { + "epoch": 0.5941614195764167, + "grad_norm": 1.6669336557388306, + "learning_rate": 7.486525625226699e-06, + "loss": 1.0527, + "step": 2595 + }, + { + "epoch": 0.5943903835145965, + "grad_norm": 1.3241775035858154, + "learning_rate": 7.479346298844645e-06, + "loss": 1.0268, + "step": 2596 + }, + { + "epoch": 0.5946193474527762, + "grad_norm": 1.1176531314849854, + "learning_rate": 7.472168359557183e-06, + "loss": 1.0257, + "step": 2597 + }, + { + "epoch": 0.594848311390956, + "grad_norm": 1.194197654724121, + "learning_rate": 7.46499181131428e-06, + "loss": 1.073, + "step": 2598 + }, + { + "epoch": 0.5950772753291357, + "grad_norm": 1.1093398332595825, + "learning_rate": 7.4578166580651335e-06, + "loss": 0.9844, + "step": 2599 + }, + { + "epoch": 0.5953062392673154, + "grad_norm": 1.3040740489959717, + "learning_rate": 7.450642903758163e-06, + "loss": 1.1006, + "step": 2600 + }, + { + "epoch": 0.5955352032054951, + "grad_norm": 1.1911894083023071, + "learning_rate": 7.443470552341028e-06, + "loss": 1.1261, + "step": 2601 + }, + { + "epoch": 0.5957641671436749, + "grad_norm": 1.2414566278457642, + "learning_rate": 7.436299607760616e-06, + "loss": 1.0389, + "step": 2602 + }, + { + "epoch": 0.5959931310818546, + "grad_norm": 1.163601040840149, + "learning_rate": 7.429130073963036e-06, + "loss": 0.9994, + "step": 2603 + }, + { + "epoch": 0.5962220950200343, + "grad_norm": 1.274039626121521, + "learning_rate": 7.421961954893622e-06, + "loss": 1.0788, + "step": 2604 + }, + { + "epoch": 0.596451058958214, + "grad_norm": 1.1208449602127075, + "learning_rate": 7.414795254496929e-06, + "loss": 1.032, + "step": 2605 + }, + { + "epoch": 0.5966800228963938, + "grad_norm": 1.4800190925598145, + "learning_rate": 7.4076299767167325e-06, + "loss": 1.0526, + "step": 2606 + }, + { + "epoch": 0.5969089868345736, + "grad_norm": 1.2557833194732666, + "learning_rate": 7.400466125496027e-06, + "loss": 1.1016, + "step": 2607 + }, + { + "epoch": 0.5971379507727533, + "grad_norm": 1.3543636798858643, + "learning_rate": 7.393303704777017e-06, + "loss": 1.0346, + "step": 2608 + }, + { + "epoch": 0.5973669147109331, + "grad_norm": 1.2229251861572266, + "learning_rate": 7.386142718501122e-06, + "loss": 1.0234, + "step": 2609 + }, + { + "epoch": 0.5975958786491128, + "grad_norm": 1.217532753944397, + "learning_rate": 7.378983170608982e-06, + "loss": 1.039, + "step": 2610 + }, + { + "epoch": 0.5978248425872925, + "grad_norm": 1.3155670166015625, + "learning_rate": 7.37182506504042e-06, + "loss": 1.0595, + "step": 2611 + }, + { + "epoch": 0.5980538065254722, + "grad_norm": 1.3033359050750732, + "learning_rate": 7.364668405734493e-06, + "loss": 1.0695, + "step": 2612 + }, + { + "epoch": 0.598282770463652, + "grad_norm": 1.1440081596374512, + "learning_rate": 7.35751319662945e-06, + "loss": 1.0842, + "step": 2613 + }, + { + "epoch": 0.5985117344018317, + "grad_norm": 1.1137559413909912, + "learning_rate": 7.350359441662735e-06, + "loss": 1.007, + "step": 2614 + }, + { + "epoch": 0.5987406983400114, + "grad_norm": 1.325477957725525, + "learning_rate": 7.343207144771012e-06, + "loss": 1.0835, + "step": 2615 + }, + { + "epoch": 0.5989696622781912, + "grad_norm": 1.1493194103240967, + "learning_rate": 7.336056309890116e-06, + "loss": 1.0678, + "step": 2616 + }, + { + "epoch": 0.5991986262163709, + "grad_norm": 1.2086422443389893, + "learning_rate": 7.3289069409551e-06, + "loss": 1.1063, + "step": 2617 + }, + { + "epoch": 0.5994275901545506, + "grad_norm": 1.3805280923843384, + "learning_rate": 7.321759041900204e-06, + "loss": 1.0846, + "step": 2618 + }, + { + "epoch": 0.5996565540927304, + "grad_norm": 1.2356138229370117, + "learning_rate": 7.314612616658847e-06, + "loss": 1.0757, + "step": 2619 + }, + { + "epoch": 0.5998855180309102, + "grad_norm": 1.2783619165420532, + "learning_rate": 7.307467669163655e-06, + "loss": 1.0257, + "step": 2620 + }, + { + "epoch": 0.6001144819690899, + "grad_norm": 1.2165898084640503, + "learning_rate": 7.3003242033464314e-06, + "loss": 1.0641, + "step": 2621 + }, + { + "epoch": 0.6003434459072696, + "grad_norm": 1.589637279510498, + "learning_rate": 7.293182223138164e-06, + "loss": 1.1101, + "step": 2622 + }, + { + "epoch": 0.6005724098454494, + "grad_norm": 1.0955222845077515, + "learning_rate": 7.286041732469024e-06, + "loss": 1.0198, + "step": 2623 + }, + { + "epoch": 0.6008013737836291, + "grad_norm": 1.270451307296753, + "learning_rate": 7.278902735268367e-06, + "loss": 1.0718, + "step": 2624 + }, + { + "epoch": 0.6010303377218088, + "grad_norm": 1.114689826965332, + "learning_rate": 7.27176523546472e-06, + "loss": 1.0091, + "step": 2625 + }, + { + "epoch": 0.6012593016599885, + "grad_norm": 1.1657631397247314, + "learning_rate": 7.264629236985792e-06, + "loss": 1.1009, + "step": 2626 + }, + { + "epoch": 0.6014882655981683, + "grad_norm": 1.2354736328125, + "learning_rate": 7.25749474375846e-06, + "loss": 1.0566, + "step": 2627 + }, + { + "epoch": 0.601717229536348, + "grad_norm": 1.1252378225326538, + "learning_rate": 7.250361759708775e-06, + "loss": 1.038, + "step": 2628 + }, + { + "epoch": 0.6019461934745277, + "grad_norm": 1.6679290533065796, + "learning_rate": 7.243230288761966e-06, + "loss": 1.0852, + "step": 2629 + }, + { + "epoch": 0.6021751574127076, + "grad_norm": 1.029215693473816, + "learning_rate": 7.2361003348424105e-06, + "loss": 1.0707, + "step": 2630 + }, + { + "epoch": 0.6024041213508873, + "grad_norm": 1.261576771736145, + "learning_rate": 7.2289719018736715e-06, + "loss": 0.9972, + "step": 2631 + }, + { + "epoch": 0.602633085289067, + "grad_norm": 1.4089431762695312, + "learning_rate": 7.221844993778464e-06, + "loss": 1.0427, + "step": 2632 + }, + { + "epoch": 0.6028620492272467, + "grad_norm": 1.418616771697998, + "learning_rate": 7.21471961447866e-06, + "loss": 1.1127, + "step": 2633 + }, + { + "epoch": 0.6030910131654265, + "grad_norm": 1.258452296257019, + "learning_rate": 7.207595767895303e-06, + "loss": 1.0767, + "step": 2634 + }, + { + "epoch": 0.6033199771036062, + "grad_norm": 1.0797663927078247, + "learning_rate": 7.200473457948584e-06, + "loss": 1.0761, + "step": 2635 + }, + { + "epoch": 0.6035489410417859, + "grad_norm": 1.2073185443878174, + "learning_rate": 7.193352688557849e-06, + "loss": 1.0367, + "step": 2636 + }, + { + "epoch": 0.6037779049799656, + "grad_norm": 1.2464630603790283, + "learning_rate": 7.186233463641601e-06, + "loss": 1.0441, + "step": 2637 + }, + { + "epoch": 0.6040068689181454, + "grad_norm": 2.0551130771636963, + "learning_rate": 7.179115787117482e-06, + "loss": 1.0626, + "step": 2638 + }, + { + "epoch": 0.6042358328563251, + "grad_norm": 1.3895354270935059, + "learning_rate": 7.171999662902298e-06, + "loss": 1.0775, + "step": 2639 + }, + { + "epoch": 0.6044647967945048, + "grad_norm": 1.0203369855880737, + "learning_rate": 7.164885094911991e-06, + "loss": 1.0592, + "step": 2640 + }, + { + "epoch": 0.6046937607326845, + "grad_norm": 1.9111573696136475, + "learning_rate": 7.157772087061645e-06, + "loss": 1.0904, + "step": 2641 + }, + { + "epoch": 0.6049227246708644, + "grad_norm": 1.0934786796569824, + "learning_rate": 7.150660643265488e-06, + "loss": 1.0466, + "step": 2642 + }, + { + "epoch": 0.6051516886090441, + "grad_norm": 1.2626543045043945, + "learning_rate": 7.143550767436894e-06, + "loss": 1.0527, + "step": 2643 + }, + { + "epoch": 0.6053806525472238, + "grad_norm": 1.443155288696289, + "learning_rate": 7.136442463488362e-06, + "loss": 1.0411, + "step": 2644 + }, + { + "epoch": 0.6056096164854036, + "grad_norm": 1.5223894119262695, + "learning_rate": 7.129335735331537e-06, + "loss": 1.0087, + "step": 2645 + }, + { + "epoch": 0.6058385804235833, + "grad_norm": 1.1826412677764893, + "learning_rate": 7.122230586877188e-06, + "loss": 1.0613, + "step": 2646 + }, + { + "epoch": 0.606067544361763, + "grad_norm": 3.0586001873016357, + "learning_rate": 7.115127022035215e-06, + "loss": 1.0597, + "step": 2647 + }, + { + "epoch": 0.6062965082999427, + "grad_norm": 1.441426157951355, + "learning_rate": 7.108025044714661e-06, + "loss": 1.1143, + "step": 2648 + }, + { + "epoch": 0.6065254722381225, + "grad_norm": 1.2049249410629272, + "learning_rate": 7.100924658823677e-06, + "loss": 1.082, + "step": 2649 + }, + { + "epoch": 0.6067544361763022, + "grad_norm": 1.1801356077194214, + "learning_rate": 7.093825868269546e-06, + "loss": 1.0233, + "step": 2650 + }, + { + "epoch": 0.6069834001144819, + "grad_norm": 1.2054147720336914, + "learning_rate": 7.0867286769586775e-06, + "loss": 1.015, + "step": 2651 + }, + { + "epoch": 0.6072123640526617, + "grad_norm": 3.0669796466827393, + "learning_rate": 7.0796330887965884e-06, + "loss": 1.0719, + "step": 2652 + }, + { + "epoch": 0.6074413279908415, + "grad_norm": 1.0845890045166016, + "learning_rate": 7.072539107687928e-06, + "loss": 1.0525, + "step": 2653 + }, + { + "epoch": 0.6076702919290212, + "grad_norm": 1.1807281970977783, + "learning_rate": 7.065446737536455e-06, + "loss": 1.0256, + "step": 2654 + }, + { + "epoch": 0.607899255867201, + "grad_norm": 1.2322497367858887, + "learning_rate": 7.058355982245038e-06, + "loss": 1.0678, + "step": 2655 + }, + { + "epoch": 0.6081282198053807, + "grad_norm": 1.065679907798767, + "learning_rate": 7.051266845715663e-06, + "loss": 1.0384, + "step": 2656 + }, + { + "epoch": 0.6083571837435604, + "grad_norm": 1.2442513704299927, + "learning_rate": 7.044179331849415e-06, + "loss": 1.0565, + "step": 2657 + }, + { + "epoch": 0.6085861476817401, + "grad_norm": 1.1693778038024902, + "learning_rate": 7.0370934445465026e-06, + "loss": 0.9849, + "step": 2658 + }, + { + "epoch": 0.6088151116199199, + "grad_norm": 1.1018726825714111, + "learning_rate": 7.030009187706228e-06, + "loss": 1.0543, + "step": 2659 + }, + { + "epoch": 0.6090440755580996, + "grad_norm": 1.5267049074172974, + "learning_rate": 7.022926565226995e-06, + "loss": 1.0282, + "step": 2660 + }, + { + "epoch": 0.6092730394962793, + "grad_norm": 1.2858092784881592, + "learning_rate": 7.015845581006311e-06, + "loss": 0.991, + "step": 2661 + }, + { + "epoch": 0.609502003434459, + "grad_norm": 1.529943585395813, + "learning_rate": 7.00876623894079e-06, + "loss": 1.0248, + "step": 2662 + }, + { + "epoch": 0.6097309673726388, + "grad_norm": 1.2061338424682617, + "learning_rate": 7.001688542926126e-06, + "loss": 1.0418, + "step": 2663 + }, + { + "epoch": 0.6099599313108185, + "grad_norm": 1.2191417217254639, + "learning_rate": 6.994612496857118e-06, + "loss": 1.0346, + "step": 2664 + }, + { + "epoch": 0.6101888952489983, + "grad_norm": 1.3246591091156006, + "learning_rate": 6.9875381046276605e-06, + "loss": 1.051, + "step": 2665 + }, + { + "epoch": 0.6104178591871781, + "grad_norm": 1.1738966703414917, + "learning_rate": 6.9804653701307225e-06, + "loss": 1.073, + "step": 2666 + }, + { + "epoch": 0.6106468231253578, + "grad_norm": 1.06193208694458, + "learning_rate": 6.973394297258378e-06, + "loss": 1.0468, + "step": 2667 + }, + { + "epoch": 0.6108757870635375, + "grad_norm": 1.585971713066101, + "learning_rate": 6.966324889901776e-06, + "loss": 1.0083, + "step": 2668 + }, + { + "epoch": 0.6111047510017172, + "grad_norm": 1.149452805519104, + "learning_rate": 6.959257151951153e-06, + "loss": 1.0624, + "step": 2669 + }, + { + "epoch": 0.611333714939897, + "grad_norm": 1.240146279335022, + "learning_rate": 6.952191087295827e-06, + "loss": 1.0515, + "step": 2670 + }, + { + "epoch": 0.6115626788780767, + "grad_norm": 1.2787004709243774, + "learning_rate": 6.945126699824189e-06, + "loss": 0.9782, + "step": 2671 + }, + { + "epoch": 0.6117916428162564, + "grad_norm": 1.338744878768921, + "learning_rate": 6.938063993423718e-06, + "loss": 1.0323, + "step": 2672 + }, + { + "epoch": 0.6120206067544361, + "grad_norm": 2.4173684120178223, + "learning_rate": 6.9310029719809625e-06, + "loss": 1.022, + "step": 2673 + }, + { + "epoch": 0.6122495706926159, + "grad_norm": 1.3075268268585205, + "learning_rate": 6.923943639381539e-06, + "loss": 1.0476, + "step": 2674 + }, + { + "epoch": 0.6124785346307956, + "grad_norm": 1.0469852685928345, + "learning_rate": 6.916885999510137e-06, + "loss": 0.9744, + "step": 2675 + }, + { + "epoch": 0.6127074985689754, + "grad_norm": 1.373605489730835, + "learning_rate": 6.909830056250527e-06, + "loss": 1.0191, + "step": 2676 + }, + { + "epoch": 0.6129364625071552, + "grad_norm": 1.5057294368743896, + "learning_rate": 6.902775813485527e-06, + "loss": 1.061, + "step": 2677 + }, + { + "epoch": 0.6131654264453349, + "grad_norm": 1.0761992931365967, + "learning_rate": 6.895723275097031e-06, + "loss": 1.0406, + "step": 2678 + }, + { + "epoch": 0.6133943903835146, + "grad_norm": 1.1914690732955933, + "learning_rate": 6.888672444965988e-06, + "loss": 1.0442, + "step": 2679 + }, + { + "epoch": 0.6136233543216943, + "grad_norm": 1.2478915452957153, + "learning_rate": 6.881623326972412e-06, + "loss": 0.9698, + "step": 2680 + }, + { + "epoch": 0.6138523182598741, + "grad_norm": 1.6330108642578125, + "learning_rate": 6.874575924995378e-06, + "loss": 1.0203, + "step": 2681 + }, + { + "epoch": 0.6140812821980538, + "grad_norm": 1.1003386974334717, + "learning_rate": 6.867530242913008e-06, + "loss": 1.0006, + "step": 2682 + }, + { + "epoch": 0.6143102461362335, + "grad_norm": 1.3207449913024902, + "learning_rate": 6.860486284602479e-06, + "loss": 1.1075, + "step": 2683 + }, + { + "epoch": 0.6145392100744133, + "grad_norm": 1.2823762893676758, + "learning_rate": 6.853444053940034e-06, + "loss": 0.9907, + "step": 2684 + }, + { + "epoch": 0.614768174012593, + "grad_norm": 1.167484164237976, + "learning_rate": 6.846403554800938e-06, + "loss": 1.0652, + "step": 2685 + }, + { + "epoch": 0.6149971379507727, + "grad_norm": 1.0674535036087036, + "learning_rate": 6.839364791059529e-06, + "loss": 1.0155, + "step": 2686 + }, + { + "epoch": 0.6152261018889524, + "grad_norm": 1.2486037015914917, + "learning_rate": 6.832327766589177e-06, + "loss": 1.1036, + "step": 2687 + }, + { + "epoch": 0.6154550658271323, + "grad_norm": 1.2341771125793457, + "learning_rate": 6.825292485262296e-06, + "loss": 1.0722, + "step": 2688 + }, + { + "epoch": 0.615684029765312, + "grad_norm": 1.7849056720733643, + "learning_rate": 6.818258950950346e-06, + "loss": 1.0679, + "step": 2689 + }, + { + "epoch": 0.6159129937034917, + "grad_norm": 1.1976414918899536, + "learning_rate": 6.8112271675238154e-06, + "loss": 1.0202, + "step": 2690 + }, + { + "epoch": 0.6161419576416715, + "grad_norm": 1.1759425401687622, + "learning_rate": 6.804197138852242e-06, + "loss": 1.1098, + "step": 2691 + }, + { + "epoch": 0.6163709215798512, + "grad_norm": 1.9811475276947021, + "learning_rate": 6.797168868804192e-06, + "loss": 1.0115, + "step": 2692 + }, + { + "epoch": 0.6165998855180309, + "grad_norm": 1.4365376234054565, + "learning_rate": 6.790142361247258e-06, + "loss": 1.0826, + "step": 2693 + }, + { + "epoch": 0.6168288494562106, + "grad_norm": 0.9954733848571777, + "learning_rate": 6.7831176200480686e-06, + "loss": 1.024, + "step": 2694 + }, + { + "epoch": 0.6170578133943904, + "grad_norm": 1.2109805345535278, + "learning_rate": 6.776094649072286e-06, + "loss": 1.0394, + "step": 2695 + }, + { + "epoch": 0.6172867773325701, + "grad_norm": 1.5378385782241821, + "learning_rate": 6.769073452184589e-06, + "loss": 1.0604, + "step": 2696 + }, + { + "epoch": 0.6175157412707498, + "grad_norm": 1.6136606931686401, + "learning_rate": 6.762054033248681e-06, + "loss": 1.0747, + "step": 2697 + }, + { + "epoch": 0.6177447052089295, + "grad_norm": 1.2883368730545044, + "learning_rate": 6.755036396127297e-06, + "loss": 0.9892, + "step": 2698 + }, + { + "epoch": 0.6179736691471094, + "grad_norm": 1.2920852899551392, + "learning_rate": 6.748020544682172e-06, + "loss": 1.0611, + "step": 2699 + }, + { + "epoch": 0.6182026330852891, + "grad_norm": 1.2162961959838867, + "learning_rate": 6.7410064827740805e-06, + "loss": 0.9927, + "step": 2700 + }, + { + "epoch": 0.6184315970234688, + "grad_norm": 1.3566070795059204, + "learning_rate": 6.733994214262797e-06, + "loss": 0.9985, + "step": 2701 + }, + { + "epoch": 0.6186605609616486, + "grad_norm": 1.4097816944122314, + "learning_rate": 6.726983743007112e-06, + "loss": 1.0129, + "step": 2702 + }, + { + "epoch": 0.6188895248998283, + "grad_norm": 1.3612055778503418, + "learning_rate": 6.7199750728648395e-06, + "loss": 1.0485, + "step": 2703 + }, + { + "epoch": 0.619118488838008, + "grad_norm": 1.2182425260543823, + "learning_rate": 6.712968207692778e-06, + "loss": 1.0371, + "step": 2704 + }, + { + "epoch": 0.6193474527761877, + "grad_norm": 1.3078467845916748, + "learning_rate": 6.705963151346755e-06, + "loss": 0.9789, + "step": 2705 + }, + { + "epoch": 0.6195764167143675, + "grad_norm": 1.150083065032959, + "learning_rate": 6.698959907681595e-06, + "loss": 1.0607, + "step": 2706 + }, + { + "epoch": 0.6198053806525472, + "grad_norm": 1.1655128002166748, + "learning_rate": 6.6919584805511175e-06, + "loss": 1.0277, + "step": 2707 + }, + { + "epoch": 0.6200343445907269, + "grad_norm": 1.273138165473938, + "learning_rate": 6.684958873808156e-06, + "loss": 1.0686, + "step": 2708 + }, + { + "epoch": 0.6202633085289067, + "grad_norm": 1.1581090688705444, + "learning_rate": 6.6779610913045344e-06, + "loss": 1.0464, + "step": 2709 + }, + { + "epoch": 0.6204922724670864, + "grad_norm": 1.240431547164917, + "learning_rate": 6.670965136891072e-06, + "loss": 1.051, + "step": 2710 + }, + { + "epoch": 0.6207212364052662, + "grad_norm": 1.4974451065063477, + "learning_rate": 6.663971014417585e-06, + "loss": 1.1388, + "step": 2711 + }, + { + "epoch": 0.620950200343446, + "grad_norm": 1.2878180742263794, + "learning_rate": 6.6569787277328745e-06, + "loss": 1.0783, + "step": 2712 + }, + { + "epoch": 0.6211791642816257, + "grad_norm": 1.712733507156372, + "learning_rate": 6.6499882806847445e-06, + "loss": 1.007, + "step": 2713 + }, + { + "epoch": 0.6214081282198054, + "grad_norm": 1.2414849996566772, + "learning_rate": 6.64299967711998e-06, + "loss": 1.0117, + "step": 2714 + }, + { + "epoch": 0.6216370921579851, + "grad_norm": 1.0478038787841797, + "learning_rate": 6.636012920884346e-06, + "loss": 1.0522, + "step": 2715 + }, + { + "epoch": 0.6218660560961649, + "grad_norm": 1.2143323421478271, + "learning_rate": 6.629028015822596e-06, + "loss": 1.0254, + "step": 2716 + }, + { + "epoch": 0.6220950200343446, + "grad_norm": 1.4381064176559448, + "learning_rate": 6.622044965778471e-06, + "loss": 1.1201, + "step": 2717 + }, + { + "epoch": 0.6223239839725243, + "grad_norm": 1.1734505891799927, + "learning_rate": 6.615063774594677e-06, + "loss": 1.1097, + "step": 2718 + }, + { + "epoch": 0.622552947910704, + "grad_norm": 1.087598204612732, + "learning_rate": 6.608084446112909e-06, + "loss": 1.0212, + "step": 2719 + }, + { + "epoch": 0.6227819118488838, + "grad_norm": 1.2489269971847534, + "learning_rate": 6.601106984173835e-06, + "loss": 1.0581, + "step": 2720 + }, + { + "epoch": 0.6230108757870635, + "grad_norm": 1.7127981185913086, + "learning_rate": 6.594131392617087e-06, + "loss": 1.0913, + "step": 2721 + }, + { + "epoch": 0.6232398397252433, + "grad_norm": 2.1938893795013428, + "learning_rate": 6.5871576752812845e-06, + "loss": 1.0303, + "step": 2722 + }, + { + "epoch": 0.623468803663423, + "grad_norm": 1.364212989807129, + "learning_rate": 6.580185836003995e-06, + "loss": 1.0338, + "step": 2723 + }, + { + "epoch": 0.6236977676016028, + "grad_norm": 1.1102733612060547, + "learning_rate": 6.573215878621769e-06, + "loss": 1.076, + "step": 2724 + }, + { + "epoch": 0.6239267315397825, + "grad_norm": 1.3565003871917725, + "learning_rate": 6.566247806970119e-06, + "loss": 1.0169, + "step": 2725 + }, + { + "epoch": 0.6241556954779622, + "grad_norm": 1.652982234954834, + "learning_rate": 6.559281624883506e-06, + "loss": 0.9878, + "step": 2726 + }, + { + "epoch": 0.624384659416142, + "grad_norm": 1.271844506263733, + "learning_rate": 6.552317336195371e-06, + "loss": 1.0609, + "step": 2727 + }, + { + "epoch": 0.6246136233543217, + "grad_norm": 1.2755295038223267, + "learning_rate": 6.5453549447381e-06, + "loss": 1.0662, + "step": 2728 + }, + { + "epoch": 0.6248425872925014, + "grad_norm": 1.265498399734497, + "learning_rate": 6.53839445434304e-06, + "loss": 1.0132, + "step": 2729 + }, + { + "epoch": 0.6250715512306811, + "grad_norm": 1.5743672847747803, + "learning_rate": 6.531435868840488e-06, + "loss": 1.0412, + "step": 2730 + }, + { + "epoch": 0.6253005151688609, + "grad_norm": 1.1162136793136597, + "learning_rate": 6.524479192059699e-06, + "loss": 1.0259, + "step": 2731 + }, + { + "epoch": 0.6255294791070406, + "grad_norm": 1.2399483919143677, + "learning_rate": 6.5175244278288705e-06, + "loss": 1.0109, + "step": 2732 + }, + { + "epoch": 0.6257584430452203, + "grad_norm": 1.169719934463501, + "learning_rate": 6.510571579975155e-06, + "loss": 0.9764, + "step": 2733 + }, + { + "epoch": 0.6259874069834002, + "grad_norm": 1.7997078895568848, + "learning_rate": 6.5036206523246404e-06, + "loss": 1.0591, + "step": 2734 + }, + { + "epoch": 0.6262163709215799, + "grad_norm": 1.2209597826004028, + "learning_rate": 6.496671648702366e-06, + "loss": 1.0484, + "step": 2735 + }, + { + "epoch": 0.6264453348597596, + "grad_norm": 1.2143415212631226, + "learning_rate": 6.489724572932314e-06, + "loss": 1.0438, + "step": 2736 + }, + { + "epoch": 0.6266742987979393, + "grad_norm": 1.148642897605896, + "learning_rate": 6.4827794288374e-06, + "loss": 1.028, + "step": 2737 + }, + { + "epoch": 0.6269032627361191, + "grad_norm": 1.284192681312561, + "learning_rate": 6.475836220239475e-06, + "loss": 1.0812, + "step": 2738 + }, + { + "epoch": 0.6271322266742988, + "grad_norm": 1.0696079730987549, + "learning_rate": 6.468894950959336e-06, + "loss": 1.1179, + "step": 2739 + }, + { + "epoch": 0.6273611906124785, + "grad_norm": 1.2461477518081665, + "learning_rate": 6.461955624816696e-06, + "loss": 1.0082, + "step": 2740 + }, + { + "epoch": 0.6275901545506583, + "grad_norm": 1.2268809080123901, + "learning_rate": 6.455018245630214e-06, + "loss": 1.0832, + "step": 2741 + }, + { + "epoch": 0.627819118488838, + "grad_norm": 1.1206541061401367, + "learning_rate": 6.4480828172174714e-06, + "loss": 1.0686, + "step": 2742 + }, + { + "epoch": 0.6280480824270177, + "grad_norm": 1.9735060930252075, + "learning_rate": 6.441149343394975e-06, + "loss": 1.0499, + "step": 2743 + }, + { + "epoch": 0.6282770463651974, + "grad_norm": 1.1669235229492188, + "learning_rate": 6.4342178279781584e-06, + "loss": 1.0743, + "step": 2744 + }, + { + "epoch": 0.6285060103033773, + "grad_norm": 1.3031091690063477, + "learning_rate": 6.427288274781372e-06, + "loss": 1.0164, + "step": 2745 + }, + { + "epoch": 0.628734974241557, + "grad_norm": 1.2017693519592285, + "learning_rate": 6.420360687617897e-06, + "loss": 0.9844, + "step": 2746 + }, + { + "epoch": 0.6289639381797367, + "grad_norm": 1.155867576599121, + "learning_rate": 6.413435070299925e-06, + "loss": 1.0669, + "step": 2747 + }, + { + "epoch": 0.6291929021179165, + "grad_norm": 1.315988302230835, + "learning_rate": 6.406511426638562e-06, + "loss": 1.0525, + "step": 2748 + }, + { + "epoch": 0.6294218660560962, + "grad_norm": 1.3917714357376099, + "learning_rate": 6.3995897604438315e-06, + "loss": 1.0961, + "step": 2749 + }, + { + "epoch": 0.6296508299942759, + "grad_norm": 1.3738504648208618, + "learning_rate": 6.392670075524674e-06, + "loss": 1.0626, + "step": 2750 + }, + { + "epoch": 0.6298797939324556, + "grad_norm": 1.1680269241333008, + "learning_rate": 6.385752375688927e-06, + "loss": 1.0226, + "step": 2751 + }, + { + "epoch": 0.6301087578706354, + "grad_norm": 1.6877174377441406, + "learning_rate": 6.378836664743347e-06, + "loss": 1.036, + "step": 2752 + }, + { + "epoch": 0.6303377218088151, + "grad_norm": 1.2052552700042725, + "learning_rate": 6.3719229464935915e-06, + "loss": 1.0776, + "step": 2753 + }, + { + "epoch": 0.6305666857469948, + "grad_norm": 1.3558235168457031, + "learning_rate": 6.365011224744218e-06, + "loss": 1.0233, + "step": 2754 + }, + { + "epoch": 0.6307956496851745, + "grad_norm": 1.0907037258148193, + "learning_rate": 6.3581015032986945e-06, + "loss": 0.9963, + "step": 2755 + }, + { + "epoch": 0.6310246136233543, + "grad_norm": 1.4145336151123047, + "learning_rate": 6.35119378595938e-06, + "loss": 1.0576, + "step": 2756 + }, + { + "epoch": 0.6312535775615341, + "grad_norm": 1.3490455150604248, + "learning_rate": 6.344288076527532e-06, + "loss": 1.0378, + "step": 2757 + }, + { + "epoch": 0.6314825414997138, + "grad_norm": 1.4001924991607666, + "learning_rate": 6.337384378803309e-06, + "loss": 1.108, + "step": 2758 + }, + { + "epoch": 0.6317115054378936, + "grad_norm": 1.187118411064148, + "learning_rate": 6.330482696585749e-06, + "loss": 1.0318, + "step": 2759 + }, + { + "epoch": 0.6319404693760733, + "grad_norm": 1.5335361957550049, + "learning_rate": 6.323583033672799e-06, + "loss": 1.0561, + "step": 2760 + }, + { + "epoch": 0.632169433314253, + "grad_norm": 1.461372971534729, + "learning_rate": 6.316685393861284e-06, + "loss": 0.988, + "step": 2761 + }, + { + "epoch": 0.6323983972524327, + "grad_norm": 1.2753512859344482, + "learning_rate": 6.309789780946916e-06, + "loss": 1.07, + "step": 2762 + }, + { + "epoch": 0.6326273611906125, + "grad_norm": 1.0639042854309082, + "learning_rate": 6.302896198724288e-06, + "loss": 1.122, + "step": 2763 + }, + { + "epoch": 0.6328563251287922, + "grad_norm": 1.267246961593628, + "learning_rate": 6.29600465098689e-06, + "loss": 1.068, + "step": 2764 + }, + { + "epoch": 0.6330852890669719, + "grad_norm": 1.2320417165756226, + "learning_rate": 6.289115141527077e-06, + "loss": 1.0611, + "step": 2765 + }, + { + "epoch": 0.6333142530051517, + "grad_norm": 1.517449975013733, + "learning_rate": 6.282227674136091e-06, + "loss": 0.9906, + "step": 2766 + }, + { + "epoch": 0.6335432169433314, + "grad_norm": 1.2116914987564087, + "learning_rate": 6.275342252604044e-06, + "loss": 1.0568, + "step": 2767 + }, + { + "epoch": 0.6337721808815112, + "grad_norm": 1.178650140762329, + "learning_rate": 6.2684588807199265e-06, + "loss": 1.0065, + "step": 2768 + }, + { + "epoch": 0.6340011448196909, + "grad_norm": 1.3236886262893677, + "learning_rate": 6.261577562271605e-06, + "loss": 0.9728, + "step": 2769 + }, + { + "epoch": 0.6342301087578707, + "grad_norm": 1.2734071016311646, + "learning_rate": 6.254698301045806e-06, + "loss": 1.05, + "step": 2770 + }, + { + "epoch": 0.6344590726960504, + "grad_norm": 1.281076431274414, + "learning_rate": 6.247821100828131e-06, + "loss": 1.0518, + "step": 2771 + }, + { + "epoch": 0.6346880366342301, + "grad_norm": 1.1672574281692505, + "learning_rate": 6.240945965403049e-06, + "loss": 1.0612, + "step": 2772 + }, + { + "epoch": 0.6349170005724099, + "grad_norm": 1.2151325941085815, + "learning_rate": 6.234072898553882e-06, + "loss": 1.0501, + "step": 2773 + }, + { + "epoch": 0.6351459645105896, + "grad_norm": 1.215021014213562, + "learning_rate": 6.22720190406283e-06, + "loss": 1.0501, + "step": 2774 + }, + { + "epoch": 0.6353749284487693, + "grad_norm": 1.1667354106903076, + "learning_rate": 6.220332985710936e-06, + "loss": 1.0068, + "step": 2775 + }, + { + "epoch": 0.635603892386949, + "grad_norm": 1.4257439374923706, + "learning_rate": 6.213466147278111e-06, + "loss": 1.085, + "step": 2776 + }, + { + "epoch": 0.6358328563251288, + "grad_norm": 1.5628901720046997, + "learning_rate": 6.206601392543121e-06, + "loss": 0.9748, + "step": 2777 + }, + { + "epoch": 0.6360618202633085, + "grad_norm": 1.2145711183547974, + "learning_rate": 6.199738725283578e-06, + "loss": 1.0345, + "step": 2778 + }, + { + "epoch": 0.6362907842014882, + "grad_norm": 1.2755681276321411, + "learning_rate": 6.192878149275954e-06, + "loss": 1.068, + "step": 2779 + }, + { + "epoch": 0.636519748139668, + "grad_norm": 1.1560249328613281, + "learning_rate": 6.186019668295568e-06, + "loss": 1.0608, + "step": 2780 + }, + { + "epoch": 0.6367487120778478, + "grad_norm": 1.1933772563934326, + "learning_rate": 6.179163286116581e-06, + "loss": 1.0713, + "step": 2781 + }, + { + "epoch": 0.6369776760160275, + "grad_norm": 2.3430261611938477, + "learning_rate": 6.172309006511999e-06, + "loss": 0.9809, + "step": 2782 + }, + { + "epoch": 0.6372066399542072, + "grad_norm": 1.1244544982910156, + "learning_rate": 6.165456833253686e-06, + "loss": 1.0319, + "step": 2783 + }, + { + "epoch": 0.637435603892387, + "grad_norm": 1.41416597366333, + "learning_rate": 6.1586067701123255e-06, + "loss": 1.0276, + "step": 2784 + }, + { + "epoch": 0.6376645678305667, + "grad_norm": 1.5440860986709595, + "learning_rate": 6.151758820857455e-06, + "loss": 1.0168, + "step": 2785 + }, + { + "epoch": 0.6378935317687464, + "grad_norm": 1.2612404823303223, + "learning_rate": 6.144912989257441e-06, + "loss": 0.9923, + "step": 2786 + }, + { + "epoch": 0.6381224957069261, + "grad_norm": 1.4268378019332886, + "learning_rate": 6.138069279079484e-06, + "loss": 1.0428, + "step": 2787 + }, + { + "epoch": 0.6383514596451059, + "grad_norm": 1.4409279823303223, + "learning_rate": 6.13122769408963e-06, + "loss": 1.0381, + "step": 2788 + }, + { + "epoch": 0.6385804235832856, + "grad_norm": 1.529488205909729, + "learning_rate": 6.124388238052737e-06, + "loss": 1.0536, + "step": 2789 + }, + { + "epoch": 0.6388093875214653, + "grad_norm": 1.03910231590271, + "learning_rate": 6.1175509147325015e-06, + "loss": 0.986, + "step": 2790 + }, + { + "epoch": 0.6390383514596452, + "grad_norm": 1.074388027191162, + "learning_rate": 6.1107157278914545e-06, + "loss": 1.066, + "step": 2791 + }, + { + "epoch": 0.6392673153978249, + "grad_norm": 1.1778825521469116, + "learning_rate": 6.1038826812909265e-06, + "loss": 1.1038, + "step": 2792 + }, + { + "epoch": 0.6394962793360046, + "grad_norm": 1.192124843597412, + "learning_rate": 6.097051778691099e-06, + "loss": 1.0755, + "step": 2793 + }, + { + "epoch": 0.6397252432741843, + "grad_norm": 1.2130941152572632, + "learning_rate": 6.090223023850954e-06, + "loss": 1.0033, + "step": 2794 + }, + { + "epoch": 0.6399542072123641, + "grad_norm": 1.1291038990020752, + "learning_rate": 6.083396420528298e-06, + "loss": 1.04, + "step": 2795 + }, + { + "epoch": 0.6401831711505438, + "grad_norm": 1.3231033086776733, + "learning_rate": 6.0765719724797586e-06, + "loss": 1.0925, + "step": 2796 + }, + { + "epoch": 0.6404121350887235, + "grad_norm": 1.6609188318252563, + "learning_rate": 6.069749683460765e-06, + "loss": 1.0937, + "step": 2797 + }, + { + "epoch": 0.6406410990269032, + "grad_norm": 1.21103036403656, + "learning_rate": 6.0629295572255695e-06, + "loss": 0.951, + "step": 2798 + }, + { + "epoch": 0.640870062965083, + "grad_norm": 1.0261344909667969, + "learning_rate": 6.056111597527235e-06, + "loss": 1.0446, + "step": 2799 + }, + { + "epoch": 0.6410990269032627, + "grad_norm": 1.1342116594314575, + "learning_rate": 6.0492958081176155e-06, + "loss": 1.0956, + "step": 2800 + }, + { + "epoch": 0.6413279908414424, + "grad_norm": 1.2922279834747314, + "learning_rate": 6.042482192747394e-06, + "loss": 1.0102, + "step": 2801 + }, + { + "epoch": 0.6415569547796222, + "grad_norm": 1.2681461572647095, + "learning_rate": 6.0356707551660434e-06, + "loss": 0.9613, + "step": 2802 + }, + { + "epoch": 0.641785918717802, + "grad_norm": 1.3258183002471924, + "learning_rate": 6.0288614991218366e-06, + "loss": 1.0068, + "step": 2803 + }, + { + "epoch": 0.6420148826559817, + "grad_norm": 1.2088922262191772, + "learning_rate": 6.022054428361852e-06, + "loss": 1.0253, + "step": 2804 + }, + { + "epoch": 0.6422438465941614, + "grad_norm": 1.0741603374481201, + "learning_rate": 6.015249546631969e-06, + "loss": 0.9941, + "step": 2805 + }, + { + "epoch": 0.6424728105323412, + "grad_norm": 1.408013105392456, + "learning_rate": 6.008446857676849e-06, + "loss": 1.0913, + "step": 2806 + }, + { + "epoch": 0.6427017744705209, + "grad_norm": 1.087931752204895, + "learning_rate": 6.001646365239959e-06, + "loss": 1.0355, + "step": 2807 + }, + { + "epoch": 0.6429307384087006, + "grad_norm": 1.1895891427993774, + "learning_rate": 5.994848073063552e-06, + "loss": 1.1122, + "step": 2808 + }, + { + "epoch": 0.6431597023468804, + "grad_norm": 1.3358407020568848, + "learning_rate": 5.988051984888668e-06, + "loss": 1.0171, + "step": 2809 + }, + { + "epoch": 0.6433886662850601, + "grad_norm": 1.3957514762878418, + "learning_rate": 5.9812581044551475e-06, + "loss": 1.0719, + "step": 2810 + }, + { + "epoch": 0.6436176302232398, + "grad_norm": 2.103109836578369, + "learning_rate": 5.974466435501591e-06, + "loss": 1.0735, + "step": 2811 + }, + { + "epoch": 0.6438465941614195, + "grad_norm": 1.2422171831130981, + "learning_rate": 5.967676981765409e-06, + "loss": 1.0396, + "step": 2812 + }, + { + "epoch": 0.6440755580995993, + "grad_norm": 1.1619694232940674, + "learning_rate": 5.960889746982778e-06, + "loss": 1.063, + "step": 2813 + }, + { + "epoch": 0.6443045220377791, + "grad_norm": 1.4143716096878052, + "learning_rate": 5.954104734888653e-06, + "loss": 1.0359, + "step": 2814 + }, + { + "epoch": 0.6445334859759588, + "grad_norm": 1.296014666557312, + "learning_rate": 5.947321949216771e-06, + "loss": 1.0085, + "step": 2815 + }, + { + "epoch": 0.6447624499141386, + "grad_norm": 1.0912615060806274, + "learning_rate": 5.940541393699646e-06, + "loss": 1.0417, + "step": 2816 + }, + { + "epoch": 0.6449914138523183, + "grad_norm": 1.079500675201416, + "learning_rate": 5.933763072068554e-06, + "loss": 0.9998, + "step": 2817 + }, + { + "epoch": 0.645220377790498, + "grad_norm": 1.2979693412780762, + "learning_rate": 5.926986988053557e-06, + "loss": 1.0842, + "step": 2818 + }, + { + "epoch": 0.6454493417286777, + "grad_norm": 1.6231458187103271, + "learning_rate": 5.9202131453834664e-06, + "loss": 1.0377, + "step": 2819 + }, + { + "epoch": 0.6456783056668575, + "grad_norm": 1.2327483892440796, + "learning_rate": 5.913441547785879e-06, + "loss": 1.0495, + "step": 2820 + }, + { + "epoch": 0.6459072696050372, + "grad_norm": 1.5749176740646362, + "learning_rate": 5.906672198987149e-06, + "loss": 1.0473, + "step": 2821 + }, + { + "epoch": 0.6461362335432169, + "grad_norm": 1.3406803607940674, + "learning_rate": 5.899905102712386e-06, + "loss": 1.062, + "step": 2822 + }, + { + "epoch": 0.6463651974813966, + "grad_norm": 1.0731439590454102, + "learning_rate": 5.893140262685469e-06, + "loss": 1.0155, + "step": 2823 + }, + { + "epoch": 0.6465941614195764, + "grad_norm": 1.2645176649093628, + "learning_rate": 5.886377682629037e-06, + "loss": 0.9567, + "step": 2824 + }, + { + "epoch": 0.6468231253577561, + "grad_norm": 1.1633598804473877, + "learning_rate": 5.879617366264476e-06, + "loss": 0.9959, + "step": 2825 + }, + { + "epoch": 0.6470520892959359, + "grad_norm": 1.266831874847412, + "learning_rate": 5.872859317311933e-06, + "loss": 1.035, + "step": 2826 + }, + { + "epoch": 0.6472810532341157, + "grad_norm": 1.426615595817566, + "learning_rate": 5.866103539490307e-06, + "loss": 1.0236, + "step": 2827 + }, + { + "epoch": 0.6475100171722954, + "grad_norm": 1.0592693090438843, + "learning_rate": 5.859350036517242e-06, + "loss": 1.0293, + "step": 2828 + }, + { + "epoch": 0.6477389811104751, + "grad_norm": 1.2604130506515503, + "learning_rate": 5.852598812109139e-06, + "loss": 1.0833, + "step": 2829 + }, + { + "epoch": 0.6479679450486548, + "grad_norm": 1.2905731201171875, + "learning_rate": 5.845849869981137e-06, + "loss": 1.109, + "step": 2830 + }, + { + "epoch": 0.6481969089868346, + "grad_norm": 1.1893059015274048, + "learning_rate": 5.839103213847123e-06, + "loss": 1.0075, + "step": 2831 + }, + { + "epoch": 0.6484258729250143, + "grad_norm": 1.5178680419921875, + "learning_rate": 5.832358847419728e-06, + "loss": 1.0779, + "step": 2832 + }, + { + "epoch": 0.648654836863194, + "grad_norm": 1.2127212285995483, + "learning_rate": 5.8256167744103144e-06, + "loss": 1.0206, + "step": 2833 + }, + { + "epoch": 0.6488838008013738, + "grad_norm": 1.2147332429885864, + "learning_rate": 5.818876998528988e-06, + "loss": 1.019, + "step": 2834 + }, + { + "epoch": 0.6491127647395535, + "grad_norm": 1.7146003246307373, + "learning_rate": 5.812139523484604e-06, + "loss": 0.9919, + "step": 2835 + }, + { + "epoch": 0.6493417286777332, + "grad_norm": 1.2911676168441772, + "learning_rate": 5.805404352984724e-06, + "loss": 1.0572, + "step": 2836 + }, + { + "epoch": 0.649570692615913, + "grad_norm": 1.3293169736862183, + "learning_rate": 5.7986714907356614e-06, + "loss": 0.9918, + "step": 2837 + }, + { + "epoch": 0.6497996565540928, + "grad_norm": 1.9981685876846313, + "learning_rate": 5.791940940442453e-06, + "loss": 1.0236, + "step": 2838 + }, + { + "epoch": 0.6500286204922725, + "grad_norm": 1.25807785987854, + "learning_rate": 5.785212705808865e-06, + "loss": 1.0795, + "step": 2839 + }, + { + "epoch": 0.6502575844304522, + "grad_norm": 1.261753797531128, + "learning_rate": 5.778486790537392e-06, + "loss": 1.087, + "step": 2840 + }, + { + "epoch": 0.650486548368632, + "grad_norm": 1.0655620098114014, + "learning_rate": 5.7717631983292375e-06, + "loss": 1.0383, + "step": 2841 + }, + { + "epoch": 0.6507155123068117, + "grad_norm": 1.2953264713287354, + "learning_rate": 5.76504193288435e-06, + "loss": 1.0587, + "step": 2842 + }, + { + "epoch": 0.6509444762449914, + "grad_norm": 1.2501654624938965, + "learning_rate": 5.758322997901384e-06, + "loss": 1.0403, + "step": 2843 + }, + { + "epoch": 0.6511734401831711, + "grad_norm": 1.1158286333084106, + "learning_rate": 5.751606397077703e-06, + "loss": 1.0854, + "step": 2844 + }, + { + "epoch": 0.6514024041213509, + "grad_norm": 1.357792615890503, + "learning_rate": 5.744892134109406e-06, + "loss": 1.1166, + "step": 2845 + }, + { + "epoch": 0.6516313680595306, + "grad_norm": 1.070070505142212, + "learning_rate": 5.738180212691296e-06, + "loss": 1.0345, + "step": 2846 + }, + { + "epoch": 0.6518603319977103, + "grad_norm": 1.1419059038162231, + "learning_rate": 5.7314706365168806e-06, + "loss": 1.0482, + "step": 2847 + }, + { + "epoch": 0.65208929593589, + "grad_norm": 1.5696709156036377, + "learning_rate": 5.724763409278383e-06, + "loss": 1.0254, + "step": 2848 + }, + { + "epoch": 0.6523182598740699, + "grad_norm": 1.2937999963760376, + "learning_rate": 5.718058534666746e-06, + "loss": 1.078, + "step": 2849 + }, + { + "epoch": 0.6525472238122496, + "grad_norm": 1.8056284189224243, + "learning_rate": 5.711356016371593e-06, + "loss": 1.0889, + "step": 2850 + }, + { + "epoch": 0.6527761877504293, + "grad_norm": 1.4800915718078613, + "learning_rate": 5.704655858081268e-06, + "loss": 1.0542, + "step": 2851 + }, + { + "epoch": 0.6530051516886091, + "grad_norm": 1.4089901447296143, + "learning_rate": 5.6979580634828125e-06, + "loss": 1.0553, + "step": 2852 + }, + { + "epoch": 0.6532341156267888, + "grad_norm": 1.154541254043579, + "learning_rate": 5.691262636261967e-06, + "loss": 1.0012, + "step": 2853 + }, + { + "epoch": 0.6534630795649685, + "grad_norm": 1.3811694383621216, + "learning_rate": 5.684569580103171e-06, + "loss": 0.9693, + "step": 2854 + }, + { + "epoch": 0.6536920435031482, + "grad_norm": 2.098176956176758, + "learning_rate": 5.6778788986895464e-06, + "loss": 1.1379, + "step": 2855 + }, + { + "epoch": 0.653921007441328, + "grad_norm": 1.450215458869934, + "learning_rate": 5.671190595702932e-06, + "loss": 1.0369, + "step": 2856 + }, + { + "epoch": 0.6541499713795077, + "grad_norm": 1.3346037864685059, + "learning_rate": 5.664504674823844e-06, + "loss": 1.128, + "step": 2857 + }, + { + "epoch": 0.6543789353176874, + "grad_norm": 2.112135171890259, + "learning_rate": 5.6578211397314765e-06, + "loss": 1.0601, + "step": 2858 + }, + { + "epoch": 0.6546078992558672, + "grad_norm": 1.497977614402771, + "learning_rate": 5.6511399941037344e-06, + "loss": 1.0149, + "step": 2859 + }, + { + "epoch": 0.654836863194047, + "grad_norm": 1.1439969539642334, + "learning_rate": 5.6444612416171976e-06, + "loss": 0.9617, + "step": 2860 + }, + { + "epoch": 0.6550658271322267, + "grad_norm": 1.8310574293136597, + "learning_rate": 5.637784885947117e-06, + "loss": 1.0273, + "step": 2861 + }, + { + "epoch": 0.6552947910704064, + "grad_norm": 1.3659378290176392, + "learning_rate": 5.631110930767443e-06, + "loss": 1.058, + "step": 2862 + }, + { + "epoch": 0.6555237550085862, + "grad_norm": 1.355661153793335, + "learning_rate": 5.6244393797507944e-06, + "loss": 1.0411, + "step": 2863 + }, + { + "epoch": 0.6557527189467659, + "grad_norm": 0.9867006540298462, + "learning_rate": 5.617770236568469e-06, + "loss": 0.9702, + "step": 2864 + }, + { + "epoch": 0.6559816828849456, + "grad_norm": 1.066946029663086, + "learning_rate": 5.611103504890444e-06, + "loss": 1.0521, + "step": 2865 + }, + { + "epoch": 0.6562106468231254, + "grad_norm": 1.2276383638381958, + "learning_rate": 5.604439188385362e-06, + "loss": 0.9953, + "step": 2866 + }, + { + "epoch": 0.6564396107613051, + "grad_norm": 1.2933861017227173, + "learning_rate": 5.597777290720543e-06, + "loss": 1.0319, + "step": 2867 + }, + { + "epoch": 0.6566685746994848, + "grad_norm": 1.144028902053833, + "learning_rate": 5.591117815561973e-06, + "loss": 1.0391, + "step": 2868 + }, + { + "epoch": 0.6568975386376645, + "grad_norm": 1.4104443788528442, + "learning_rate": 5.584460766574304e-06, + "loss": 1.0806, + "step": 2869 + }, + { + "epoch": 0.6571265025758443, + "grad_norm": 2.108306884765625, + "learning_rate": 5.5778061474208565e-06, + "loss": 1.0714, + "step": 2870 + }, + { + "epoch": 0.657355466514024, + "grad_norm": 1.3675113916397095, + "learning_rate": 5.571153961763613e-06, + "loss": 1.0704, + "step": 2871 + }, + { + "epoch": 0.6575844304522038, + "grad_norm": 1.1373252868652344, + "learning_rate": 5.564504213263205e-06, + "loss": 1.0022, + "step": 2872 + }, + { + "epoch": 0.6578133943903836, + "grad_norm": 5.982883930206299, + "learning_rate": 5.55785690557895e-06, + "loss": 1.1152, + "step": 2873 + }, + { + "epoch": 0.6580423583285633, + "grad_norm": 1.589943289756775, + "learning_rate": 5.551212042368792e-06, + "loss": 1.0449, + "step": 2874 + }, + { + "epoch": 0.658271322266743, + "grad_norm": 1.2605445384979248, + "learning_rate": 5.54456962728935e-06, + "loss": 1.0192, + "step": 2875 + }, + { + "epoch": 0.6585002862049227, + "grad_norm": 1.27881920337677, + "learning_rate": 5.537929663995887e-06, + "loss": 1.1066, + "step": 2876 + }, + { + "epoch": 0.6587292501431025, + "grad_norm": 2.3701119422912598, + "learning_rate": 5.531292156142319e-06, + "loss": 1.0039, + "step": 2877 + }, + { + "epoch": 0.6589582140812822, + "grad_norm": 2.732001543045044, + "learning_rate": 5.5246571073812124e-06, + "loss": 1.0069, + "step": 2878 + }, + { + "epoch": 0.6591871780194619, + "grad_norm": 1.3857358694076538, + "learning_rate": 5.5180245213637785e-06, + "loss": 1.0746, + "step": 2879 + }, + { + "epoch": 0.6594161419576416, + "grad_norm": 1.0118176937103271, + "learning_rate": 5.511394401739874e-06, + "loss": 1.0039, + "step": 2880 + }, + { + "epoch": 0.6596451058958214, + "grad_norm": 1.1465644836425781, + "learning_rate": 5.504766752157997e-06, + "loss": 1.0596, + "step": 2881 + }, + { + "epoch": 0.6598740698340011, + "grad_norm": 1.1057982444763184, + "learning_rate": 5.498141576265289e-06, + "loss": 1.0616, + "step": 2882 + }, + { + "epoch": 0.6601030337721809, + "grad_norm": 1.2650837898254395, + "learning_rate": 5.491518877707527e-06, + "loss": 1.045, + "step": 2883 + }, + { + "epoch": 0.6603319977103607, + "grad_norm": 1.2822803258895874, + "learning_rate": 5.484898660129132e-06, + "loss": 1.0242, + "step": 2884 + }, + { + "epoch": 0.6605609616485404, + "grad_norm": 1.1227819919586182, + "learning_rate": 5.478280927173145e-06, + "loss": 0.9838, + "step": 2885 + }, + { + "epoch": 0.6607899255867201, + "grad_norm": 2.5325989723205566, + "learning_rate": 5.4716656824812505e-06, + "loss": 1.0738, + "step": 2886 + }, + { + "epoch": 0.6610188895248998, + "grad_norm": 2.1725833415985107, + "learning_rate": 5.465052929693774e-06, + "loss": 1.0192, + "step": 2887 + }, + { + "epoch": 0.6612478534630796, + "grad_norm": 1.7834242582321167, + "learning_rate": 5.458442672449644e-06, + "loss": 1.0923, + "step": 2888 + }, + { + "epoch": 0.6614768174012593, + "grad_norm": 1.2370448112487793, + "learning_rate": 5.451834914386435e-06, + "loss": 1.0404, + "step": 2889 + }, + { + "epoch": 0.661705781339439, + "grad_norm": 1.3691270351409912, + "learning_rate": 5.445229659140341e-06, + "loss": 1.0226, + "step": 2890 + }, + { + "epoch": 0.6619347452776188, + "grad_norm": 1.2318962812423706, + "learning_rate": 5.4386269103461785e-06, + "loss": 1.039, + "step": 2891 + }, + { + "epoch": 0.6621637092157985, + "grad_norm": 1.2632200717926025, + "learning_rate": 5.432026671637385e-06, + "loss": 1.077, + "step": 2892 + }, + { + "epoch": 0.6623926731539782, + "grad_norm": 1.4517078399658203, + "learning_rate": 5.425428946646016e-06, + "loss": 1.0304, + "step": 2893 + }, + { + "epoch": 0.6626216370921579, + "grad_norm": 1.1040912866592407, + "learning_rate": 5.418833739002745e-06, + "loss": 1.0163, + "step": 2894 + }, + { + "epoch": 0.6628506010303378, + "grad_norm": 2.0394599437713623, + "learning_rate": 5.4122410523368615e-06, + "loss": 1.0289, + "step": 2895 + }, + { + "epoch": 0.6630795649685175, + "grad_norm": 1.1909575462341309, + "learning_rate": 5.405650890276255e-06, + "loss": 1.0632, + "step": 2896 + }, + { + "epoch": 0.6633085289066972, + "grad_norm": 1.359932541847229, + "learning_rate": 5.39906325644745e-06, + "loss": 0.9915, + "step": 2897 + }, + { + "epoch": 0.663537492844877, + "grad_norm": 1.3327099084854126, + "learning_rate": 5.392478154475565e-06, + "loss": 1.0265, + "step": 2898 + }, + { + "epoch": 0.6637664567830567, + "grad_norm": 1.2770649194717407, + "learning_rate": 5.3858955879843155e-06, + "loss": 1.0492, + "step": 2899 + }, + { + "epoch": 0.6639954207212364, + "grad_norm": 1.136519193649292, + "learning_rate": 5.379315560596038e-06, + "loss": 1.027, + "step": 2900 + }, + { + "epoch": 0.6642243846594161, + "grad_norm": 1.217628836631775, + "learning_rate": 5.372738075931674e-06, + "loss": 0.9827, + "step": 2901 + }, + { + "epoch": 0.6644533485975959, + "grad_norm": 1.2686570882797241, + "learning_rate": 5.366163137610749e-06, + "loss": 1.063, + "step": 2902 + }, + { + "epoch": 0.6646823125357756, + "grad_norm": 1.29387629032135, + "learning_rate": 5.359590749251397e-06, + "loss": 1.0409, + "step": 2903 + }, + { + "epoch": 0.6649112764739553, + "grad_norm": 1.0357317924499512, + "learning_rate": 5.353020914470353e-06, + "loss": 1.0741, + "step": 2904 + }, + { + "epoch": 0.665140240412135, + "grad_norm": 1.323845624923706, + "learning_rate": 5.346453636882939e-06, + "loss": 1.1121, + "step": 2905 + }, + { + "epoch": 0.6653692043503149, + "grad_norm": 1.1600542068481445, + "learning_rate": 5.339888920103074e-06, + "loss": 1.0454, + "step": 2906 + }, + { + "epoch": 0.6655981682884946, + "grad_norm": 1.1847002506256104, + "learning_rate": 5.333326767743263e-06, + "loss": 1.0733, + "step": 2907 + }, + { + "epoch": 0.6658271322266743, + "grad_norm": 1.0907032489776611, + "learning_rate": 5.326767183414609e-06, + "loss": 1.0321, + "step": 2908 + }, + { + "epoch": 0.6660560961648541, + "grad_norm": 1.4598631858825684, + "learning_rate": 5.320210170726796e-06, + "loss": 1.0596, + "step": 2909 + }, + { + "epoch": 0.6662850601030338, + "grad_norm": 1.2866328954696655, + "learning_rate": 5.313655733288083e-06, + "loss": 1.0658, + "step": 2910 + }, + { + "epoch": 0.6665140240412135, + "grad_norm": 1.1192216873168945, + "learning_rate": 5.307103874705335e-06, + "loss": 1.0721, + "step": 2911 + }, + { + "epoch": 0.6667429879793932, + "grad_norm": 1.190312385559082, + "learning_rate": 5.300554598583982e-06, + "loss": 1.0158, + "step": 2912 + }, + { + "epoch": 0.666971951917573, + "grad_norm": 1.0897172689437866, + "learning_rate": 5.294007908528029e-06, + "loss": 0.9763, + "step": 2913 + }, + { + "epoch": 0.6672009158557527, + "grad_norm": 1.4546432495117188, + "learning_rate": 5.287463808140069e-06, + "loss": 1.0753, + "step": 2914 + }, + { + "epoch": 0.6674298797939324, + "grad_norm": 1.2672525644302368, + "learning_rate": 5.280922301021267e-06, + "loss": 1.0099, + "step": 2915 + }, + { + "epoch": 0.6676588437321122, + "grad_norm": 1.5939594507217407, + "learning_rate": 5.274383390771356e-06, + "loss": 1.1524, + "step": 2916 + }, + { + "epoch": 0.6678878076702919, + "grad_norm": 1.3578407764434814, + "learning_rate": 5.267847080988647e-06, + "loss": 0.9973, + "step": 2917 + }, + { + "epoch": 0.6681167716084717, + "grad_norm": 1.221137285232544, + "learning_rate": 5.2613133752700145e-06, + "loss": 1.0632, + "step": 2918 + }, + { + "epoch": 0.6683457355466514, + "grad_norm": 1.5474742650985718, + "learning_rate": 5.254782277210901e-06, + "loss": 1.0569, + "step": 2919 + }, + { + "epoch": 0.6685746994848312, + "grad_norm": 1.647705078125, + "learning_rate": 5.2482537904053185e-06, + "loss": 1.0202, + "step": 2920 + }, + { + "epoch": 0.6688036634230109, + "grad_norm": 1.6697813272476196, + "learning_rate": 5.241727918445836e-06, + "loss": 1.0308, + "step": 2921 + }, + { + "epoch": 0.6690326273611906, + "grad_norm": 1.1288517713546753, + "learning_rate": 5.235204664923586e-06, + "loss": 1.0139, + "step": 2922 + }, + { + "epoch": 0.6692615912993704, + "grad_norm": 0.9977933168411255, + "learning_rate": 5.228684033428265e-06, + "loss": 1.0381, + "step": 2923 + }, + { + "epoch": 0.6694905552375501, + "grad_norm": 1.2061574459075928, + "learning_rate": 5.22216602754811e-06, + "loss": 1.0258, + "step": 2924 + }, + { + "epoch": 0.6697195191757298, + "grad_norm": 1.1679553985595703, + "learning_rate": 5.215650650869941e-06, + "loss": 1.0316, + "step": 2925 + }, + { + "epoch": 0.6699484831139095, + "grad_norm": 1.209476351737976, + "learning_rate": 5.209137906979102e-06, + "loss": 1.0115, + "step": 2926 + }, + { + "epoch": 0.6701774470520893, + "grad_norm": 1.3832190036773682, + "learning_rate": 5.202627799459503e-06, + "loss": 1.0837, + "step": 2927 + }, + { + "epoch": 0.670406410990269, + "grad_norm": 1.2740558385849, + "learning_rate": 5.1961203318936116e-06, + "loss": 1.0364, + "step": 2928 + }, + { + "epoch": 0.6706353749284488, + "grad_norm": 1.1675924062728882, + "learning_rate": 5.1896155078624225e-06, + "loss": 1.0948, + "step": 2929 + }, + { + "epoch": 0.6708643388666286, + "grad_norm": 1.704378604888916, + "learning_rate": 5.183113330945488e-06, + "loss": 1.0411, + "step": 2930 + }, + { + "epoch": 0.6710933028048083, + "grad_norm": 1.6397212743759155, + "learning_rate": 5.176613804720905e-06, + "loss": 1.0228, + "step": 2931 + }, + { + "epoch": 0.671322266742988, + "grad_norm": 1.287341833114624, + "learning_rate": 5.170116932765304e-06, + "loss": 1.0504, + "step": 2932 + }, + { + "epoch": 0.6715512306811677, + "grad_norm": 1.3021739721298218, + "learning_rate": 5.1636227186538625e-06, + "loss": 1.0174, + "step": 2933 + }, + { + "epoch": 0.6717801946193475, + "grad_norm": 1.6247448921203613, + "learning_rate": 5.157131165960289e-06, + "loss": 1.0317, + "step": 2934 + }, + { + "epoch": 0.6720091585575272, + "grad_norm": 2.791722536087036, + "learning_rate": 5.1506422782568345e-06, + "loss": 1.0632, + "step": 2935 + }, + { + "epoch": 0.6722381224957069, + "grad_norm": 1.033670425415039, + "learning_rate": 5.144156059114279e-06, + "loss": 1.0681, + "step": 2936 + }, + { + "epoch": 0.6724670864338866, + "grad_norm": 1.6212668418884277, + "learning_rate": 5.137672512101925e-06, + "loss": 1.0611, + "step": 2937 + }, + { + "epoch": 0.6726960503720664, + "grad_norm": 1.806549310684204, + "learning_rate": 5.131191640787627e-06, + "loss": 1.0404, + "step": 2938 + }, + { + "epoch": 0.6729250143102461, + "grad_norm": 1.3719223737716675, + "learning_rate": 5.124713448737753e-06, + "loss": 1.0456, + "step": 2939 + }, + { + "epoch": 0.6731539782484258, + "grad_norm": 1.4912806749343872, + "learning_rate": 5.11823793951719e-06, + "loss": 1.0434, + "step": 2940 + }, + { + "epoch": 0.6733829421866057, + "grad_norm": 1.3837188482284546, + "learning_rate": 5.111765116689355e-06, + "loss": 1.024, + "step": 2941 + }, + { + "epoch": 0.6736119061247854, + "grad_norm": 1.8850486278533936, + "learning_rate": 5.105294983816203e-06, + "loss": 1.0059, + "step": 2942 + }, + { + "epoch": 0.6738408700629651, + "grad_norm": 1.2506767511367798, + "learning_rate": 5.098827544458178e-06, + "loss": 1.0762, + "step": 2943 + }, + { + "epoch": 0.6740698340011448, + "grad_norm": 1.9231312274932861, + "learning_rate": 5.0923628021742644e-06, + "loss": 1.0204, + "step": 2944 + }, + { + "epoch": 0.6742987979393246, + "grad_norm": 1.2224328517913818, + "learning_rate": 5.085900760521955e-06, + "loss": 1.0187, + "step": 2945 + }, + { + "epoch": 0.6745277618775043, + "grad_norm": 1.4136698246002197, + "learning_rate": 5.079441423057259e-06, + "loss": 1.0467, + "step": 2946 + }, + { + "epoch": 0.674756725815684, + "grad_norm": 1.244694709777832, + "learning_rate": 5.072984793334696e-06, + "loss": 1.0298, + "step": 2947 + }, + { + "epoch": 0.6749856897538637, + "grad_norm": 1.417830467224121, + "learning_rate": 5.066530874907285e-06, + "loss": 1.0746, + "step": 2948 + }, + { + "epoch": 0.6752146536920435, + "grad_norm": 1.1816048622131348, + "learning_rate": 5.060079671326577e-06, + "loss": 1.0203, + "step": 2949 + }, + { + "epoch": 0.6754436176302232, + "grad_norm": 1.4444321393966675, + "learning_rate": 5.053631186142612e-06, + "loss": 1.0502, + "step": 2950 + }, + { + "epoch": 0.6756725815684029, + "grad_norm": 1.1188122034072876, + "learning_rate": 5.0471854229039286e-06, + "loss": 1.0968, + "step": 2951 + }, + { + "epoch": 0.6759015455065828, + "grad_norm": 1.4322268962860107, + "learning_rate": 5.040742385157584e-06, + "loss": 1.0962, + "step": 2952 + }, + { + "epoch": 0.6761305094447625, + "grad_norm": 1.2078114748001099, + "learning_rate": 5.034302076449132e-06, + "loss": 1.0694, + "step": 2953 + }, + { + "epoch": 0.6763594733829422, + "grad_norm": 4.1897292137146, + "learning_rate": 5.027864500322611e-06, + "loss": 1.0731, + "step": 2954 + }, + { + "epoch": 0.676588437321122, + "grad_norm": 1.3412197828292847, + "learning_rate": 5.021429660320565e-06, + "loss": 1.074, + "step": 2955 + }, + { + "epoch": 0.6768174012593017, + "grad_norm": 2.866015911102295, + "learning_rate": 5.014997559984045e-06, + "loss": 0.9959, + "step": 2956 + }, + { + "epoch": 0.6770463651974814, + "grad_norm": 1.168531894683838, + "learning_rate": 5.008568202852569e-06, + "loss": 1.0117, + "step": 2957 + }, + { + "epoch": 0.6772753291356611, + "grad_norm": 1.4421766996383667, + "learning_rate": 5.002141592464162e-06, + "loss": 1.0038, + "step": 2958 + }, + { + "epoch": 0.6775042930738409, + "grad_norm": 1.2987374067306519, + "learning_rate": 4.995717732355335e-06, + "loss": 1.03, + "step": 2959 + }, + { + "epoch": 0.6777332570120206, + "grad_norm": 1.224198818206787, + "learning_rate": 4.989296626061084e-06, + "loss": 1.0346, + "step": 2960 + }, + { + "epoch": 0.6779622209502003, + "grad_norm": 2.0996627807617188, + "learning_rate": 4.982878277114891e-06, + "loss": 1.0766, + "step": 2961 + }, + { + "epoch": 0.67819118488838, + "grad_norm": 1.3319966793060303, + "learning_rate": 4.976462689048718e-06, + "loss": 1.002, + "step": 2962 + }, + { + "epoch": 0.6784201488265598, + "grad_norm": 1.1799633502960205, + "learning_rate": 4.970049865393009e-06, + "loss": 1.0168, + "step": 2963 + }, + { + "epoch": 0.6786491127647396, + "grad_norm": 1.1029752492904663, + "learning_rate": 4.963639809676692e-06, + "loss": 1.0343, + "step": 2964 + }, + { + "epoch": 0.6788780767029193, + "grad_norm": 1.0645133256912231, + "learning_rate": 4.957232525427156e-06, + "loss": 1.0062, + "step": 2965 + }, + { + "epoch": 0.6791070406410991, + "grad_norm": 1.2702126502990723, + "learning_rate": 4.950828016170286e-06, + "loss": 1.0674, + "step": 2966 + }, + { + "epoch": 0.6793360045792788, + "grad_norm": 1.3423794507980347, + "learning_rate": 4.94442628543043e-06, + "loss": 1.0065, + "step": 2967 + }, + { + "epoch": 0.6795649685174585, + "grad_norm": 1.3491889238357544, + "learning_rate": 4.9380273367304e-06, + "loss": 1.0039, + "step": 2968 + }, + { + "epoch": 0.6797939324556382, + "grad_norm": 1.332449197769165, + "learning_rate": 4.931631173591487e-06, + "loss": 1.015, + "step": 2969 + }, + { + "epoch": 0.680022896393818, + "grad_norm": 1.2349896430969238, + "learning_rate": 4.925237799533445e-06, + "loss": 1.0143, + "step": 2970 + }, + { + "epoch": 0.6802518603319977, + "grad_norm": 1.296811580657959, + "learning_rate": 4.918847218074495e-06, + "loss": 1.0468, + "step": 2971 + }, + { + "epoch": 0.6804808242701774, + "grad_norm": 1.2404383420944214, + "learning_rate": 4.912459432731322e-06, + "loss": 1.0548, + "step": 2972 + }, + { + "epoch": 0.6807097882083571, + "grad_norm": 1.2925716638565063, + "learning_rate": 4.906074447019068e-06, + "loss": 1.0182, + "step": 2973 + }, + { + "epoch": 0.6809387521465369, + "grad_norm": 1.7651426792144775, + "learning_rate": 4.899692264451339e-06, + "loss": 1.028, + "step": 2974 + }, + { + "epoch": 0.6811677160847167, + "grad_norm": 2.1795103549957275, + "learning_rate": 4.893312888540195e-06, + "loss": 1.0185, + "step": 2975 + }, + { + "epoch": 0.6813966800228964, + "grad_norm": 1.4040412902832031, + "learning_rate": 4.886936322796154e-06, + "loss": 1.015, + "step": 2976 + }, + { + "epoch": 0.6816256439610762, + "grad_norm": 2.831342935562134, + "learning_rate": 4.880562570728188e-06, + "loss": 1.0135, + "step": 2977 + }, + { + "epoch": 0.6818546078992559, + "grad_norm": 1.828460454940796, + "learning_rate": 4.87419163584372e-06, + "loss": 1.0305, + "step": 2978 + }, + { + "epoch": 0.6820835718374356, + "grad_norm": 1.3774365186691284, + "learning_rate": 4.867823521648613e-06, + "loss": 1.0625, + "step": 2979 + }, + { + "epoch": 0.6823125357756153, + "grad_norm": 1.7624365091323853, + "learning_rate": 4.861458231647202e-06, + "loss": 1.0516, + "step": 2980 + }, + { + "epoch": 0.6825414997137951, + "grad_norm": 1.3166933059692383, + "learning_rate": 4.855095769342241e-06, + "loss": 1.0544, + "step": 2981 + }, + { + "epoch": 0.6827704636519748, + "grad_norm": 2.098736047744751, + "learning_rate": 4.848736138234943e-06, + "loss": 1.0258, + "step": 2982 + }, + { + "epoch": 0.6829994275901545, + "grad_norm": 1.0946608781814575, + "learning_rate": 4.842379341824958e-06, + "loss": 1.0407, + "step": 2983 + }, + { + "epoch": 0.6832283915283343, + "grad_norm": 1.1885132789611816, + "learning_rate": 4.836025383610382e-06, + "loss": 1.0462, + "step": 2984 + }, + { + "epoch": 0.683457355466514, + "grad_norm": 5.834176540374756, + "learning_rate": 4.829674267087742e-06, + "loss": 1.0261, + "step": 2985 + }, + { + "epoch": 0.6836863194046937, + "grad_norm": 3.0932862758636475, + "learning_rate": 4.823325995752005e-06, + "loss": 1.0829, + "step": 2986 + }, + { + "epoch": 0.6839152833428735, + "grad_norm": 1.27007257938385, + "learning_rate": 4.816980573096571e-06, + "loss": 1.1001, + "step": 2987 + }, + { + "epoch": 0.6841442472810533, + "grad_norm": 1.145909070968628, + "learning_rate": 4.810638002613273e-06, + "loss": 1.0904, + "step": 2988 + }, + { + "epoch": 0.684373211219233, + "grad_norm": 1.4148883819580078, + "learning_rate": 4.804298287792374e-06, + "loss": 1.0352, + "step": 2989 + }, + { + "epoch": 0.6846021751574127, + "grad_norm": 1.51638925075531, + "learning_rate": 4.797961432122568e-06, + "loss": 1.0851, + "step": 2990 + }, + { + "epoch": 0.6848311390955925, + "grad_norm": 2.6201398372650146, + "learning_rate": 4.791627439090975e-06, + "loss": 1.0082, + "step": 2991 + }, + { + "epoch": 0.6850601030337722, + "grad_norm": 1.3478920459747314, + "learning_rate": 4.785296312183131e-06, + "loss": 1.0192, + "step": 2992 + }, + { + "epoch": 0.6852890669719519, + "grad_norm": 2.5813474655151367, + "learning_rate": 4.778968054883002e-06, + "loss": 1.0566, + "step": 2993 + }, + { + "epoch": 0.6855180309101316, + "grad_norm": 1.4003512859344482, + "learning_rate": 4.772642670672988e-06, + "loss": 1.0256, + "step": 2994 + }, + { + "epoch": 0.6857469948483114, + "grad_norm": 1.65157151222229, + "learning_rate": 4.766320163033882e-06, + "loss": 1.081, + "step": 2995 + }, + { + "epoch": 0.6859759587864911, + "grad_norm": 1.354181170463562, + "learning_rate": 4.7600005354449075e-06, + "loss": 0.9829, + "step": 2996 + }, + { + "epoch": 0.6862049227246708, + "grad_norm": 1.2161715030670166, + "learning_rate": 4.753683791383713e-06, + "loss": 1.0763, + "step": 2997 + }, + { + "epoch": 0.6864338866628507, + "grad_norm": 1.2519667148590088, + "learning_rate": 4.74736993432634e-06, + "loss": 1.0668, + "step": 2998 + }, + { + "epoch": 0.6866628506010304, + "grad_norm": 1.2776641845703125, + "learning_rate": 4.741058967747254e-06, + "loss": 1.0412, + "step": 2999 + }, + { + "epoch": 0.6868918145392101, + "grad_norm": 1.4053977727890015, + "learning_rate": 4.734750895119327e-06, + "loss": 1.079, + "step": 3000 + }, + { + "epoch": 0.6871207784773898, + "grad_norm": 1.4398205280303955, + "learning_rate": 4.7284457199138374e-06, + "loss": 1.0251, + "step": 3001 + }, + { + "epoch": 0.6873497424155696, + "grad_norm": 1.4499075412750244, + "learning_rate": 4.722143445600477e-06, + "loss": 1.0771, + "step": 3002 + }, + { + "epoch": 0.6875787063537493, + "grad_norm": 1.4174058437347412, + "learning_rate": 4.71584407564732e-06, + "loss": 1.0093, + "step": 3003 + }, + { + "epoch": 0.687807670291929, + "grad_norm": 1.425048589706421, + "learning_rate": 4.70954761352087e-06, + "loss": 1.0117, + "step": 3004 + }, + { + "epoch": 0.6880366342301087, + "grad_norm": 1.1394058465957642, + "learning_rate": 4.703254062686017e-06, + "loss": 1.0435, + "step": 3005 + }, + { + "epoch": 0.6882655981682885, + "grad_norm": 1.4277303218841553, + "learning_rate": 4.696963426606041e-06, + "loss": 1.0903, + "step": 3006 + }, + { + "epoch": 0.6884945621064682, + "grad_norm": 1.744752287864685, + "learning_rate": 4.690675708742628e-06, + "loss": 1.0468, + "step": 3007 + }, + { + "epoch": 0.6887235260446479, + "grad_norm": 1.4332195520401, + "learning_rate": 4.684390912555866e-06, + "loss": 1.0169, + "step": 3008 + }, + { + "epoch": 0.6889524899828277, + "grad_norm": 1.3215627670288086, + "learning_rate": 4.678109041504215e-06, + "loss": 1.0733, + "step": 3009 + }, + { + "epoch": 0.6891814539210075, + "grad_norm": 1.8956875801086426, + "learning_rate": 4.671830099044536e-06, + "loss": 1.0317, + "step": 3010 + }, + { + "epoch": 0.6894104178591872, + "grad_norm": 1.260425090789795, + "learning_rate": 4.665554088632089e-06, + "loss": 1.0633, + "step": 3011 + }, + { + "epoch": 0.689639381797367, + "grad_norm": 1.27289617061615, + "learning_rate": 4.6592810137205e-06, + "loss": 1.0074, + "step": 3012 + }, + { + "epoch": 0.6898683457355467, + "grad_norm": 1.8777307271957397, + "learning_rate": 4.653010877761793e-06, + "loss": 1.0374, + "step": 3013 + }, + { + "epoch": 0.6900973096737264, + "grad_norm": 1.1621068716049194, + "learning_rate": 4.64674368420637e-06, + "loss": 1.0394, + "step": 3014 + }, + { + "epoch": 0.6903262736119061, + "grad_norm": 1.2484198808670044, + "learning_rate": 4.640479436503016e-06, + "loss": 1.0031, + "step": 3015 + }, + { + "epoch": 0.6905552375500859, + "grad_norm": 1.564257025718689, + "learning_rate": 4.634218138098897e-06, + "loss": 1.0898, + "step": 3016 + }, + { + "epoch": 0.6907842014882656, + "grad_norm": 1.2701921463012695, + "learning_rate": 4.6279597924395434e-06, + "loss": 1.0803, + "step": 3017 + }, + { + "epoch": 0.6910131654264453, + "grad_norm": 1.458882212638855, + "learning_rate": 4.621704402968881e-06, + "loss": 1.0771, + "step": 3018 + }, + { + "epoch": 0.691242129364625, + "grad_norm": 1.2890880107879639, + "learning_rate": 4.615451973129196e-06, + "loss": 1.0809, + "step": 3019 + }, + { + "epoch": 0.6914710933028048, + "grad_norm": 1.1623772382736206, + "learning_rate": 4.60920250636114e-06, + "loss": 1.0524, + "step": 3020 + }, + { + "epoch": 0.6917000572409846, + "grad_norm": 1.1945689916610718, + "learning_rate": 4.602956006103752e-06, + "loss": 1.0345, + "step": 3021 + }, + { + "epoch": 0.6919290211791643, + "grad_norm": 1.1116697788238525, + "learning_rate": 4.596712475794427e-06, + "loss": 0.9965, + "step": 3022 + }, + { + "epoch": 0.692157985117344, + "grad_norm": 1.092020869255066, + "learning_rate": 4.590471918868923e-06, + "loss": 1.019, + "step": 3023 + }, + { + "epoch": 0.6923869490555238, + "grad_norm": 1.2264333963394165, + "learning_rate": 4.584234338761368e-06, + "loss": 1.0246, + "step": 3024 + }, + { + "epoch": 0.6926159129937035, + "grad_norm": 1.1611523628234863, + "learning_rate": 4.5779997389042514e-06, + "loss": 1.0514, + "step": 3025 + }, + { + "epoch": 0.6928448769318832, + "grad_norm": 1.5043096542358398, + "learning_rate": 4.571768122728421e-06, + "loss": 0.9878, + "step": 3026 + }, + { + "epoch": 0.693073840870063, + "grad_norm": 1.4404031038284302, + "learning_rate": 4.56553949366308e-06, + "loss": 1.0665, + "step": 3027 + }, + { + "epoch": 0.6933028048082427, + "grad_norm": 1.6433451175689697, + "learning_rate": 4.559313855135795e-06, + "loss": 1.0795, + "step": 3028 + }, + { + "epoch": 0.6935317687464224, + "grad_norm": 1.2414443492889404, + "learning_rate": 4.55309121057248e-06, + "loss": 1.0612, + "step": 3029 + }, + { + "epoch": 0.6937607326846021, + "grad_norm": 1.7606821060180664, + "learning_rate": 4.546871563397409e-06, + "loss": 1.0517, + "step": 3030 + }, + { + "epoch": 0.6939896966227819, + "grad_norm": 1.563098430633545, + "learning_rate": 4.54065491703319e-06, + "loss": 1.0475, + "step": 3031 + }, + { + "epoch": 0.6942186605609616, + "grad_norm": 1.3073487281799316, + "learning_rate": 4.534441274900807e-06, + "loss": 1.0028, + "step": 3032 + }, + { + "epoch": 0.6944476244991414, + "grad_norm": 1.2197542190551758, + "learning_rate": 4.528230640419562e-06, + "loss": 1.027, + "step": 3033 + }, + { + "epoch": 0.6946765884373212, + "grad_norm": 1.2171616554260254, + "learning_rate": 4.522023017007118e-06, + "loss": 1.0114, + "step": 3034 + }, + { + "epoch": 0.6949055523755009, + "grad_norm": 1.2713412046432495, + "learning_rate": 4.515818408079487e-06, + "loss": 1.0541, + "step": 3035 + }, + { + "epoch": 0.6951345163136806, + "grad_norm": 1.235053539276123, + "learning_rate": 4.5096168170510036e-06, + "loss": 1.0338, + "step": 3036 + }, + { + "epoch": 0.6953634802518603, + "grad_norm": 1.3829389810562134, + "learning_rate": 4.503418247334353e-06, + "loss": 1.0417, + "step": 3037 + }, + { + "epoch": 0.6955924441900401, + "grad_norm": 1.107755422592163, + "learning_rate": 4.497222702340559e-06, + "loss": 1.0538, + "step": 3038 + }, + { + "epoch": 0.6958214081282198, + "grad_norm": 1.2136812210083008, + "learning_rate": 4.491030185478976e-06, + "loss": 1.0574, + "step": 3039 + }, + { + "epoch": 0.6960503720663995, + "grad_norm": 1.9772335290908813, + "learning_rate": 4.4848407001572945e-06, + "loss": 1.0413, + "step": 3040 + }, + { + "epoch": 0.6962793360045793, + "grad_norm": 1.3861563205718994, + "learning_rate": 4.4786542497815365e-06, + "loss": 1.0276, + "step": 3041 + }, + { + "epoch": 0.696508299942759, + "grad_norm": 1.2986854314804077, + "learning_rate": 4.472470837756055e-06, + "loss": 1.0022, + "step": 3042 + }, + { + "epoch": 0.6967372638809387, + "grad_norm": 1.8411191701889038, + "learning_rate": 4.466290467483531e-06, + "loss": 1.0107, + "step": 3043 + }, + { + "epoch": 0.6969662278191184, + "grad_norm": 1.371367335319519, + "learning_rate": 4.460113142364961e-06, + "loss": 1.017, + "step": 3044 + }, + { + "epoch": 0.6971951917572983, + "grad_norm": 1.3172252178192139, + "learning_rate": 4.453938865799686e-06, + "loss": 1.0403, + "step": 3045 + }, + { + "epoch": 0.697424155695478, + "grad_norm": 1.2504808902740479, + "learning_rate": 4.447767641185359e-06, + "loss": 1.0093, + "step": 3046 + }, + { + "epoch": 0.6976531196336577, + "grad_norm": 1.1979867219924927, + "learning_rate": 4.441599471917946e-06, + "loss": 1.0421, + "step": 3047 + }, + { + "epoch": 0.6978820835718375, + "grad_norm": 1.6098335981369019, + "learning_rate": 4.435434361391737e-06, + "loss": 1.091, + "step": 3048 + }, + { + "epoch": 0.6981110475100172, + "grad_norm": 1.1873350143432617, + "learning_rate": 4.429272312999353e-06, + "loss": 1.0796, + "step": 3049 + }, + { + "epoch": 0.6983400114481969, + "grad_norm": 1.3173235654830933, + "learning_rate": 4.423113330131708e-06, + "loss": 1.0352, + "step": 3050 + }, + { + "epoch": 0.6985689753863766, + "grad_norm": 1.3406671285629272, + "learning_rate": 4.4169574161780395e-06, + "loss": 1.0448, + "step": 3051 + }, + { + "epoch": 0.6987979393245564, + "grad_norm": 1.2191475629806519, + "learning_rate": 4.4108045745258966e-06, + "loss": 1.0402, + "step": 3052 + }, + { + "epoch": 0.6990269032627361, + "grad_norm": 1.2168577909469604, + "learning_rate": 4.404654808561137e-06, + "loss": 1.0253, + "step": 3053 + }, + { + "epoch": 0.6992558672009158, + "grad_norm": 9.12889575958252, + "learning_rate": 4.398508121667925e-06, + "loss": 1.0162, + "step": 3054 + }, + { + "epoch": 0.6994848311390955, + "grad_norm": 1.6077773571014404, + "learning_rate": 4.39236451722873e-06, + "loss": 1.0926, + "step": 3055 + }, + { + "epoch": 0.6997137950772754, + "grad_norm": 1.5294177532196045, + "learning_rate": 4.386223998624327e-06, + "loss": 1.0349, + "step": 3056 + }, + { + "epoch": 0.6999427590154551, + "grad_norm": 1.079950213432312, + "learning_rate": 4.380086569233796e-06, + "loss": 0.9882, + "step": 3057 + }, + { + "epoch": 0.7001717229536348, + "grad_norm": 1.1813180446624756, + "learning_rate": 4.3739522324344994e-06, + "loss": 1.1365, + "step": 3058 + }, + { + "epoch": 0.7004006868918146, + "grad_norm": 1.2818435430526733, + "learning_rate": 4.3678209916021264e-06, + "loss": 1.0429, + "step": 3059 + }, + { + "epoch": 0.7006296508299943, + "grad_norm": 1.6467807292938232, + "learning_rate": 4.361692850110644e-06, + "loss": 1.0304, + "step": 3060 + }, + { + "epoch": 0.700858614768174, + "grad_norm": 1.1530770063400269, + "learning_rate": 4.355567811332311e-06, + "loss": 1.1026, + "step": 3061 + }, + { + "epoch": 0.7010875787063537, + "grad_norm": 1.2358185052871704, + "learning_rate": 4.3494458786376845e-06, + "loss": 1.0106, + "step": 3062 + }, + { + "epoch": 0.7013165426445335, + "grad_norm": 2.5719728469848633, + "learning_rate": 4.3433270553956245e-06, + "loss": 0.9823, + "step": 3063 + }, + { + "epoch": 0.7015455065827132, + "grad_norm": 1.4278590679168701, + "learning_rate": 4.337211344973256e-06, + "loss": 1.0915, + "step": 3064 + }, + { + "epoch": 0.7017744705208929, + "grad_norm": 1.1447930335998535, + "learning_rate": 4.331098750736008e-06, + "loss": 1.0208, + "step": 3065 + }, + { + "epoch": 0.7020034344590727, + "grad_norm": 1.5032339096069336, + "learning_rate": 4.3249892760475894e-06, + "loss": 1.0192, + "step": 3066 + }, + { + "epoch": 0.7022323983972524, + "grad_norm": 1.17005455493927, + "learning_rate": 4.318882924269994e-06, + "loss": 0.9827, + "step": 3067 + }, + { + "epoch": 0.7024613623354322, + "grad_norm": 1.24888014793396, + "learning_rate": 4.312779698763493e-06, + "loss": 1.0159, + "step": 3068 + }, + { + "epoch": 0.7026903262736119, + "grad_norm": 1.3852883577346802, + "learning_rate": 4.306679602886643e-06, + "loss": 1.0457, + "step": 3069 + }, + { + "epoch": 0.7029192902117917, + "grad_norm": 1.4950320720672607, + "learning_rate": 4.300582639996274e-06, + "loss": 1.0019, + "step": 3070 + }, + { + "epoch": 0.7031482541499714, + "grad_norm": 1.2468384504318237, + "learning_rate": 4.2944888134474995e-06, + "loss": 1.0566, + "step": 3071 + }, + { + "epoch": 0.7033772180881511, + "grad_norm": 1.0590832233428955, + "learning_rate": 4.2883981265936884e-06, + "loss": 1.0033, + "step": 3072 + }, + { + "epoch": 0.7036061820263309, + "grad_norm": 1.5195868015289307, + "learning_rate": 4.282310582786504e-06, + "loss": 1.0192, + "step": 3073 + }, + { + "epoch": 0.7038351459645106, + "grad_norm": 1.6989542245864868, + "learning_rate": 4.276226185375874e-06, + "loss": 1.0511, + "step": 3074 + }, + { + "epoch": 0.7040641099026903, + "grad_norm": 1.4525377750396729, + "learning_rate": 4.270144937709981e-06, + "loss": 1.0511, + "step": 3075 + }, + { + "epoch": 0.70429307384087, + "grad_norm": 1.3085981607437134, + "learning_rate": 4.264066843135292e-06, + "loss": 1.0334, + "step": 3076 + }, + { + "epoch": 0.7045220377790498, + "grad_norm": 1.2663421630859375, + "learning_rate": 4.257991904996527e-06, + "loss": 1.0987, + "step": 3077 + }, + { + "epoch": 0.7047510017172295, + "grad_norm": 1.1956284046173096, + "learning_rate": 4.251920126636676e-06, + "loss": 1.0384, + "step": 3078 + }, + { + "epoch": 0.7049799656554093, + "grad_norm": 1.3076503276824951, + "learning_rate": 4.245851511396988e-06, + "loss": 1.1324, + "step": 3079 + }, + { + "epoch": 0.705208929593589, + "grad_norm": 1.2820242643356323, + "learning_rate": 4.23978606261697e-06, + "loss": 1.0065, + "step": 3080 + }, + { + "epoch": 0.7054378935317688, + "grad_norm": 1.707970142364502, + "learning_rate": 4.233723783634388e-06, + "loss": 1.0478, + "step": 3081 + }, + { + "epoch": 0.7056668574699485, + "grad_norm": 1.9648287296295166, + "learning_rate": 4.227664677785264e-06, + "loss": 0.9916, + "step": 3082 + }, + { + "epoch": 0.7058958214081282, + "grad_norm": 1.4425021409988403, + "learning_rate": 4.221608748403872e-06, + "loss": 1.0077, + "step": 3083 + }, + { + "epoch": 0.706124785346308, + "grad_norm": 1.1991877555847168, + "learning_rate": 4.21555599882274e-06, + "loss": 1.085, + "step": 3084 + }, + { + "epoch": 0.7063537492844877, + "grad_norm": 1.6011837720870972, + "learning_rate": 4.2095064323726485e-06, + "loss": 1.0511, + "step": 3085 + }, + { + "epoch": 0.7065827132226674, + "grad_norm": 1.2623403072357178, + "learning_rate": 4.2034600523826116e-06, + "loss": 0.992, + "step": 3086 + }, + { + "epoch": 0.7068116771608471, + "grad_norm": 1.3289179801940918, + "learning_rate": 4.1974168621799185e-06, + "loss": 1.0386, + "step": 3087 + }, + { + "epoch": 0.7070406410990269, + "grad_norm": 1.3888287544250488, + "learning_rate": 4.191376865090073e-06, + "loss": 1.0181, + "step": 3088 + }, + { + "epoch": 0.7072696050372066, + "grad_norm": 1.3241780996322632, + "learning_rate": 4.1853400644368395e-06, + "loss": 1.0285, + "step": 3089 + }, + { + "epoch": 0.7074985689753863, + "grad_norm": 1.2323023080825806, + "learning_rate": 4.179306463542217e-06, + "loss": 1.0286, + "step": 3090 + }, + { + "epoch": 0.7077275329135662, + "grad_norm": 1.290425419807434, + "learning_rate": 4.173276065726448e-06, + "loss": 1.0194, + "step": 3091 + }, + { + "epoch": 0.7079564968517459, + "grad_norm": 1.7319308519363403, + "learning_rate": 4.167248874308009e-06, + "loss": 1.0092, + "step": 3092 + }, + { + "epoch": 0.7081854607899256, + "grad_norm": 1.0908766984939575, + "learning_rate": 4.161224892603613e-06, + "loss": 1.0487, + "step": 3093 + }, + { + "epoch": 0.7084144247281053, + "grad_norm": 1.2904776334762573, + "learning_rate": 4.155204123928205e-06, + "loss": 0.9839, + "step": 3094 + }, + { + "epoch": 0.7086433886662851, + "grad_norm": 1.3205946683883667, + "learning_rate": 4.149186571594965e-06, + "loss": 1.0494, + "step": 3095 + }, + { + "epoch": 0.7088723526044648, + "grad_norm": 1.6448436975479126, + "learning_rate": 4.143172238915302e-06, + "loss": 1.0911, + "step": 3096 + }, + { + "epoch": 0.7091013165426445, + "grad_norm": 1.1375172138214111, + "learning_rate": 4.137161129198852e-06, + "loss": 0.9825, + "step": 3097 + }, + { + "epoch": 0.7093302804808242, + "grad_norm": 1.2753404378890991, + "learning_rate": 4.131153245753482e-06, + "loss": 1.0859, + "step": 3098 + }, + { + "epoch": 0.709559244419004, + "grad_norm": 1.3567954301834106, + "learning_rate": 4.1251485918852725e-06, + "loss": 1.029, + "step": 3099 + }, + { + "epoch": 0.7097882083571837, + "grad_norm": 1.057942271232605, + "learning_rate": 4.119147170898535e-06, + "loss": 1.0965, + "step": 3100 + }, + { + "epoch": 0.7100171722953634, + "grad_norm": 1.2383803129196167, + "learning_rate": 4.113148986095812e-06, + "loss": 1.0577, + "step": 3101 + }, + { + "epoch": 0.7102461362335433, + "grad_norm": 1.1256804466247559, + "learning_rate": 4.107154040777842e-06, + "loss": 0.9798, + "step": 3102 + }, + { + "epoch": 0.710475100171723, + "grad_norm": 1.5061252117156982, + "learning_rate": 4.101162338243595e-06, + "loss": 1.1058, + "step": 3103 + }, + { + "epoch": 0.7107040641099027, + "grad_norm": 1.094844102859497, + "learning_rate": 4.095173881790265e-06, + "loss": 0.9926, + "step": 3104 + }, + { + "epoch": 0.7109330280480824, + "grad_norm": 1.3939733505249023, + "learning_rate": 4.0891886747132356e-06, + "loss": 1.061, + "step": 3105 + }, + { + "epoch": 0.7111619919862622, + "grad_norm": 2.069545030593872, + "learning_rate": 4.0832067203061244e-06, + "loss": 1.0444, + "step": 3106 + }, + { + "epoch": 0.7113909559244419, + "grad_norm": 1.2419723272323608, + "learning_rate": 4.0772280218607485e-06, + "loss": 1.0661, + "step": 3107 + }, + { + "epoch": 0.7116199198626216, + "grad_norm": 2.2456400394439697, + "learning_rate": 4.071252582667135e-06, + "loss": 1.0287, + "step": 3108 + }, + { + "epoch": 0.7118488838008014, + "grad_norm": 1.2714403867721558, + "learning_rate": 4.065280406013522e-06, + "loss": 0.9891, + "step": 3109 + }, + { + "epoch": 0.7120778477389811, + "grad_norm": 1.4441180229187012, + "learning_rate": 4.059311495186338e-06, + "loss": 1.0688, + "step": 3110 + }, + { + "epoch": 0.7123068116771608, + "grad_norm": 1.442212700843811, + "learning_rate": 4.0533458534702354e-06, + "loss": 1.0711, + "step": 3111 + }, + { + "epoch": 0.7125357756153405, + "grad_norm": 1.2680224180221558, + "learning_rate": 4.0473834841480565e-06, + "loss": 1.0097, + "step": 3112 + }, + { + "epoch": 0.7127647395535203, + "grad_norm": 1.0516926050186157, + "learning_rate": 4.0414243905008325e-06, + "loss": 1.0449, + "step": 3113 + }, + { + "epoch": 0.7129937034917001, + "grad_norm": 1.1603946685791016, + "learning_rate": 4.035468575807812e-06, + "loss": 1.0299, + "step": 3114 + }, + { + "epoch": 0.7132226674298798, + "grad_norm": 1.0879015922546387, + "learning_rate": 4.029516043346432e-06, + "loss": 1.0125, + "step": 3115 + }, + { + "epoch": 0.7134516313680596, + "grad_norm": 2.325087070465088, + "learning_rate": 4.023566796392313e-06, + "loss": 1.0965, + "step": 3116 + }, + { + "epoch": 0.7136805953062393, + "grad_norm": 1.1928337812423706, + "learning_rate": 4.017620838219276e-06, + "loss": 1.0108, + "step": 3117 + }, + { + "epoch": 0.713909559244419, + "grad_norm": 1.2268211841583252, + "learning_rate": 4.011678172099343e-06, + "loss": 0.9763, + "step": 3118 + }, + { + "epoch": 0.7141385231825987, + "grad_norm": 1.3314909934997559, + "learning_rate": 4.005738801302701e-06, + "loss": 1.0531, + "step": 3119 + }, + { + "epoch": 0.7143674871207785, + "grad_norm": 1.3090684413909912, + "learning_rate": 3.999802729097743e-06, + "loss": 1.0713, + "step": 3120 + }, + { + "epoch": 0.7145964510589582, + "grad_norm": 1.1635836362838745, + "learning_rate": 3.993869958751036e-06, + "loss": 1.0654, + "step": 3121 + }, + { + "epoch": 0.7148254149971379, + "grad_norm": 1.3543496131896973, + "learning_rate": 3.9879404935273346e-06, + "loss": 1.0163, + "step": 3122 + }, + { + "epoch": 0.7150543789353176, + "grad_norm": 1.3396997451782227, + "learning_rate": 3.982014336689579e-06, + "loss": 1.0441, + "step": 3123 + }, + { + "epoch": 0.7152833428734974, + "grad_norm": 1.4751423597335815, + "learning_rate": 3.9760914914988716e-06, + "loss": 1.0307, + "step": 3124 + }, + { + "epoch": 0.7155123068116772, + "grad_norm": 1.3802671432495117, + "learning_rate": 3.970171961214515e-06, + "loss": 1.0171, + "step": 3125 + }, + { + "epoch": 0.7157412707498569, + "grad_norm": 1.8824442625045776, + "learning_rate": 3.964255749093979e-06, + "loss": 0.969, + "step": 3126 + }, + { + "epoch": 0.7159702346880367, + "grad_norm": 1.3549731969833374, + "learning_rate": 3.958342858392893e-06, + "loss": 1.0393, + "step": 3127 + }, + { + "epoch": 0.7161991986262164, + "grad_norm": 1.1898926496505737, + "learning_rate": 3.952433292365081e-06, + "loss": 1.0948, + "step": 3128 + }, + { + "epoch": 0.7164281625643961, + "grad_norm": 1.109851360321045, + "learning_rate": 3.94652705426253e-06, + "loss": 1.0621, + "step": 3129 + }, + { + "epoch": 0.7166571265025758, + "grad_norm": 1.4158560037612915, + "learning_rate": 3.940624147335386e-06, + "loss": 1.0273, + "step": 3130 + }, + { + "epoch": 0.7168860904407556, + "grad_norm": 1.319413661956787, + "learning_rate": 3.93472457483197e-06, + "loss": 0.9908, + "step": 3131 + }, + { + "epoch": 0.7171150543789353, + "grad_norm": 2.840635299682617, + "learning_rate": 3.92882833999877e-06, + "loss": 1.0255, + "step": 3132 + }, + { + "epoch": 0.717344018317115, + "grad_norm": 1.2576954364776611, + "learning_rate": 3.9229354460804345e-06, + "loss": 1.0252, + "step": 3133 + }, + { + "epoch": 0.7175729822552948, + "grad_norm": 1.3615483045578003, + "learning_rate": 3.917045896319772e-06, + "loss": 1.0324, + "step": 3134 + }, + { + "epoch": 0.7178019461934745, + "grad_norm": 1.6669081449508667, + "learning_rate": 3.911159693957755e-06, + "loss": 1.0628, + "step": 3135 + }, + { + "epoch": 0.7180309101316542, + "grad_norm": 1.2302170991897583, + "learning_rate": 3.905276842233508e-06, + "loss": 1.0182, + "step": 3136 + }, + { + "epoch": 0.718259874069834, + "grad_norm": 1.0562607049942017, + "learning_rate": 3.899397344384316e-06, + "loss": 1.01, + "step": 3137 + }, + { + "epoch": 0.7184888380080138, + "grad_norm": 2.51627516746521, + "learning_rate": 3.893521203645618e-06, + "loss": 1.1183, + "step": 3138 + }, + { + "epoch": 0.7187178019461935, + "grad_norm": 1.2331818342208862, + "learning_rate": 3.887648423251006e-06, + "loss": 1.0323, + "step": 3139 + }, + { + "epoch": 0.7189467658843732, + "grad_norm": 1.2647502422332764, + "learning_rate": 3.881779006432223e-06, + "loss": 1.0546, + "step": 3140 + }, + { + "epoch": 0.719175729822553, + "grad_norm": 1.3590573072433472, + "learning_rate": 3.875912956419152e-06, + "loss": 0.9891, + "step": 3141 + }, + { + "epoch": 0.7194046937607327, + "grad_norm": 1.4926403760910034, + "learning_rate": 3.870050276439843e-06, + "loss": 1.0391, + "step": 3142 + }, + { + "epoch": 0.7196336576989124, + "grad_norm": 1.504622459411621, + "learning_rate": 3.864190969720469e-06, + "loss": 0.9936, + "step": 3143 + }, + { + "epoch": 0.7198626216370921, + "grad_norm": 1.0495116710662842, + "learning_rate": 3.858335039485362e-06, + "loss": 1.0307, + "step": 3144 + }, + { + "epoch": 0.7200915855752719, + "grad_norm": 1.2690739631652832, + "learning_rate": 3.852482488956992e-06, + "loss": 1.0355, + "step": 3145 + }, + { + "epoch": 0.7203205495134516, + "grad_norm": 2.033017158508301, + "learning_rate": 3.846633321355967e-06, + "loss": 1.0948, + "step": 3146 + }, + { + "epoch": 0.7205495134516313, + "grad_norm": 1.1862678527832031, + "learning_rate": 3.840787539901037e-06, + "loss": 1.055, + "step": 3147 + }, + { + "epoch": 0.7207784773898112, + "grad_norm": 1.3250430822372437, + "learning_rate": 3.834945147809082e-06, + "loss": 1.052, + "step": 3148 + }, + { + "epoch": 0.7210074413279909, + "grad_norm": 1.126364827156067, + "learning_rate": 3.829106148295127e-06, + "loss": 1.0445, + "step": 3149 + }, + { + "epoch": 0.7212364052661706, + "grad_norm": 1.6518840789794922, + "learning_rate": 3.823270544572319e-06, + "loss": 1.04, + "step": 3150 + }, + { + "epoch": 0.7214653692043503, + "grad_norm": 1.127027988433838, + "learning_rate": 3.817438339851947e-06, + "loss": 1.0113, + "step": 3151 + }, + { + "epoch": 0.7216943331425301, + "grad_norm": 1.3487000465393066, + "learning_rate": 3.8116095373434204e-06, + "loss": 0.9984, + "step": 3152 + }, + { + "epoch": 0.7219232970807098, + "grad_norm": 1.3868497610092163, + "learning_rate": 3.805784140254286e-06, + "loss": 1.0504, + "step": 3153 + }, + { + "epoch": 0.7221522610188895, + "grad_norm": 1.4739601612091064, + "learning_rate": 3.799962151790203e-06, + "loss": 1.0158, + "step": 3154 + }, + { + "epoch": 0.7223812249570692, + "grad_norm": 1.0389552116394043, + "learning_rate": 3.794143575154964e-06, + "loss": 1.0075, + "step": 3155 + }, + { + "epoch": 0.722610188895249, + "grad_norm": 1.0425493717193604, + "learning_rate": 3.788328413550493e-06, + "loss": 0.9877, + "step": 3156 + }, + { + "epoch": 0.7228391528334287, + "grad_norm": 1.1646589040756226, + "learning_rate": 3.7825166701768125e-06, + "loss": 0.959, + "step": 3157 + }, + { + "epoch": 0.7230681167716084, + "grad_norm": 1.1060878038406372, + "learning_rate": 3.776708348232081e-06, + "loss": 1.0858, + "step": 3158 + }, + { + "epoch": 0.7232970807097882, + "grad_norm": 1.2601068019866943, + "learning_rate": 3.7709034509125706e-06, + "loss": 1.0423, + "step": 3159 + }, + { + "epoch": 0.723526044647968, + "grad_norm": 2.284247875213623, + "learning_rate": 3.7651019814126656e-06, + "loss": 1.0213, + "step": 3160 + }, + { + "epoch": 0.7237550085861477, + "grad_norm": 1.5774025917053223, + "learning_rate": 3.7593039429248667e-06, + "loss": 1.0469, + "step": 3161 + }, + { + "epoch": 0.7239839725243274, + "grad_norm": 1.9497007131576538, + "learning_rate": 3.7535093386397868e-06, + "loss": 1.0541, + "step": 3162 + }, + { + "epoch": 0.7242129364625072, + "grad_norm": 1.517136812210083, + "learning_rate": 3.7477181717461463e-06, + "loss": 0.9846, + "step": 3163 + }, + { + "epoch": 0.7244419004006869, + "grad_norm": 1.3717387914657593, + "learning_rate": 3.74193044543078e-06, + "loss": 1.0027, + "step": 3164 + }, + { + "epoch": 0.7246708643388666, + "grad_norm": 1.2264940738677979, + "learning_rate": 3.7361461628786167e-06, + "loss": 1.1155, + "step": 3165 + }, + { + "epoch": 0.7248998282770464, + "grad_norm": 1.5513440370559692, + "learning_rate": 3.7303653272727057e-06, + "loss": 1.0442, + "step": 3166 + }, + { + "epoch": 0.7251287922152261, + "grad_norm": 1.0587714910507202, + "learning_rate": 3.7245879417941943e-06, + "loss": 1.1098, + "step": 3167 + }, + { + "epoch": 0.7253577561534058, + "grad_norm": 1.0646835565567017, + "learning_rate": 3.7188140096223225e-06, + "loss": 0.9877, + "step": 3168 + }, + { + "epoch": 0.7255867200915855, + "grad_norm": 1.0646835565567017, + "learning_rate": 3.7188140096223225e-06, + "loss": 1.0259, + "step": 3169 + }, + { + "epoch": 0.7258156840297653, + "grad_norm": 1.221613883972168, + "learning_rate": 3.713043533934435e-06, + "loss": 0.96, + "step": 3170 + }, + { + "epoch": 0.7260446479679451, + "grad_norm": 1.373716950416565, + "learning_rate": 3.707276517905989e-06, + "loss": 1.0587, + "step": 3171 + }, + { + "epoch": 0.7262736119061248, + "grad_norm": 1.5889724493026733, + "learning_rate": 3.701512964710513e-06, + "loss": 1.0572, + "step": 3172 + }, + { + "epoch": 0.7265025758443046, + "grad_norm": 1.3366655111312866, + "learning_rate": 3.695752877519646e-06, + "loss": 1.0241, + "step": 3173 + }, + { + "epoch": 0.7267315397824843, + "grad_norm": 1.3003686666488647, + "learning_rate": 3.689996259503116e-06, + "loss": 1.0352, + "step": 3174 + }, + { + "epoch": 0.726960503720664, + "grad_norm": 1.2446143627166748, + "learning_rate": 3.6842431138287415e-06, + "loss": 1.0475, + "step": 3175 + }, + { + "epoch": 0.7271894676588437, + "grad_norm": 1.0979483127593994, + "learning_rate": 3.67849344366243e-06, + "loss": 1.0735, + "step": 3176 + }, + { + "epoch": 0.7274184315970235, + "grad_norm": 1.3841794729232788, + "learning_rate": 3.672747252168176e-06, + "loss": 1.0478, + "step": 3177 + }, + { + "epoch": 0.7276473955352032, + "grad_norm": 1.3376761674880981, + "learning_rate": 3.6670045425080626e-06, + "loss": 1.0732, + "step": 3178 + }, + { + "epoch": 0.7278763594733829, + "grad_norm": 2.0364394187927246, + "learning_rate": 3.6612653178422564e-06, + "loss": 1.05, + "step": 3179 + }, + { + "epoch": 0.7281053234115626, + "grad_norm": 1.3383004665374756, + "learning_rate": 3.655529581328995e-06, + "loss": 1.0408, + "step": 3180 + }, + { + "epoch": 0.7283342873497424, + "grad_norm": 1.368977427482605, + "learning_rate": 3.6497973361246153e-06, + "loss": 1.0406, + "step": 3181 + }, + { + "epoch": 0.7285632512879221, + "grad_norm": 1.3485641479492188, + "learning_rate": 3.6440685853835266e-06, + "loss": 1.0455, + "step": 3182 + }, + { + "epoch": 0.7287922152261019, + "grad_norm": 1.3723440170288086, + "learning_rate": 3.6383433322582028e-06, + "loss": 1.0398, + "step": 3183 + }, + { + "epoch": 0.7290211791642817, + "grad_norm": 1.250652551651001, + "learning_rate": 3.632621579899207e-06, + "loss": 1.0, + "step": 3184 + }, + { + "epoch": 0.7292501431024614, + "grad_norm": 1.3991602659225464, + "learning_rate": 3.6269033314551725e-06, + "loss": 1.0197, + "step": 3185 + }, + { + "epoch": 0.7294791070406411, + "grad_norm": 1.7109068632125854, + "learning_rate": 3.6211885900728017e-06, + "loss": 1.0875, + "step": 3186 + }, + { + "epoch": 0.7297080709788208, + "grad_norm": 1.0364817380905151, + "learning_rate": 3.6154773588968704e-06, + "loss": 1.1141, + "step": 3187 + }, + { + "epoch": 0.7299370349170006, + "grad_norm": 1.2963764667510986, + "learning_rate": 3.609769641070221e-06, + "loss": 1.0609, + "step": 3188 + }, + { + "epoch": 0.7301659988551803, + "grad_norm": 1.1506567001342773, + "learning_rate": 3.6040654397337614e-06, + "loss": 1.0173, + "step": 3189 + }, + { + "epoch": 0.73039496279336, + "grad_norm": 1.3218399286270142, + "learning_rate": 3.598364758026467e-06, + "loss": 1.093, + "step": 3190 + }, + { + "epoch": 0.7306239267315398, + "grad_norm": 1.5998913049697876, + "learning_rate": 3.5926675990853752e-06, + "loss": 1.0072, + "step": 3191 + }, + { + "epoch": 0.7308528906697195, + "grad_norm": 1.9323956966400146, + "learning_rate": 3.5869739660455847e-06, + "loss": 0.9571, + "step": 3192 + }, + { + "epoch": 0.7310818546078992, + "grad_norm": 1.1179009675979614, + "learning_rate": 3.581283862040257e-06, + "loss": 1.0337, + "step": 3193 + }, + { + "epoch": 0.731310818546079, + "grad_norm": 1.3210819959640503, + "learning_rate": 3.575597290200599e-06, + "loss": 1.0332, + "step": 3194 + }, + { + "epoch": 0.7315397824842588, + "grad_norm": 1.1699495315551758, + "learning_rate": 3.569914253655896e-06, + "loss": 1.0276, + "step": 3195 + }, + { + "epoch": 0.7317687464224385, + "grad_norm": 1.2185953855514526, + "learning_rate": 3.5642347555334665e-06, + "loss": 1.0249, + "step": 3196 + }, + { + "epoch": 0.7319977103606182, + "grad_norm": 1.1740138530731201, + "learning_rate": 3.55855879895869e-06, + "loss": 0.9732, + "step": 3197 + }, + { + "epoch": 0.732226674298798, + "grad_norm": 1.19003427028656, + "learning_rate": 3.552886387055009e-06, + "loss": 0.993, + "step": 3198 + }, + { + "epoch": 0.7324556382369777, + "grad_norm": 1.3954188823699951, + "learning_rate": 3.547217522943892e-06, + "loss": 1.0497, + "step": 3199 + }, + { + "epoch": 0.7326846021751574, + "grad_norm": 1.2999389171600342, + "learning_rate": 3.5415522097448717e-06, + "loss": 1.0348, + "step": 3200 + }, + { + "epoch": 0.7329135661133371, + "grad_norm": 1.2359155416488647, + "learning_rate": 3.5358904505755243e-06, + "loss": 1.0308, + "step": 3201 + }, + { + "epoch": 0.7331425300515169, + "grad_norm": 1.3111592531204224, + "learning_rate": 3.530232248551466e-06, + "loss": 1.0899, + "step": 3202 + }, + { + "epoch": 0.7333714939896966, + "grad_norm": 1.2703155279159546, + "learning_rate": 3.524577606786358e-06, + "loss": 1.0127, + "step": 3203 + }, + { + "epoch": 0.7336004579278763, + "grad_norm": 1.2910027503967285, + "learning_rate": 3.518926528391904e-06, + "loss": 1.0473, + "step": 3204 + }, + { + "epoch": 0.733829421866056, + "grad_norm": 1.4213367700576782, + "learning_rate": 3.513279016477844e-06, + "loss": 0.9976, + "step": 3205 + }, + { + "epoch": 0.7340583858042359, + "grad_norm": 1.4030628204345703, + "learning_rate": 3.5076350741519595e-06, + "loss": 1.051, + "step": 3206 + }, + { + "epoch": 0.7342873497424156, + "grad_norm": 1.3458659648895264, + "learning_rate": 3.5019947045200553e-06, + "loss": 1.0091, + "step": 3207 + }, + { + "epoch": 0.7345163136805953, + "grad_norm": 1.5768972635269165, + "learning_rate": 3.496357910685989e-06, + "loss": 1.0405, + "step": 3208 + }, + { + "epoch": 0.7347452776187751, + "grad_norm": 1.4291819334030151, + "learning_rate": 3.4907246957516416e-06, + "loss": 1.0732, + "step": 3209 + }, + { + "epoch": 0.7349742415569548, + "grad_norm": 1.2574915885925293, + "learning_rate": 3.4850950628169167e-06, + "loss": 1.0461, + "step": 3210 + }, + { + "epoch": 0.7352032054951345, + "grad_norm": 1.2337363958358765, + "learning_rate": 3.479469014979754e-06, + "loss": 1.1096, + "step": 3211 + }, + { + "epoch": 0.7354321694333142, + "grad_norm": 1.4060138463974, + "learning_rate": 3.473846555336131e-06, + "loss": 1.0116, + "step": 3212 + }, + { + "epoch": 0.735661133371494, + "grad_norm": 1.3836556673049927, + "learning_rate": 3.46822768698003e-06, + "loss": 1.0388, + "step": 3213 + }, + { + "epoch": 0.7358900973096737, + "grad_norm": 1.2583669424057007, + "learning_rate": 3.4626124130034713e-06, + "loss": 1.0596, + "step": 3214 + }, + { + "epoch": 0.7361190612478534, + "grad_norm": 1.593520164489746, + "learning_rate": 3.457000736496492e-06, + "loss": 1.062, + "step": 3215 + }, + { + "epoch": 0.7363480251860332, + "grad_norm": 1.5272831916809082, + "learning_rate": 3.4513926605471504e-06, + "loss": 1.0255, + "step": 3216 + }, + { + "epoch": 0.736576989124213, + "grad_norm": 2.2148725986480713, + "learning_rate": 3.445788188241527e-06, + "loss": 1.0377, + "step": 3217 + }, + { + "epoch": 0.7368059530623927, + "grad_norm": 1.665127158164978, + "learning_rate": 3.440187322663707e-06, + "loss": 1.0202, + "step": 3218 + }, + { + "epoch": 0.7370349170005724, + "grad_norm": 1.2639302015304565, + "learning_rate": 3.4345900668958088e-06, + "loss": 1.0474, + "step": 3219 + }, + { + "epoch": 0.7372638809387522, + "grad_norm": 1.2613798379898071, + "learning_rate": 3.428996424017956e-06, + "loss": 1.0672, + "step": 3220 + }, + { + "epoch": 0.7374928448769319, + "grad_norm": 10.604727745056152, + "learning_rate": 3.423406397108273e-06, + "loss": 0.9956, + "step": 3221 + }, + { + "epoch": 0.7377218088151116, + "grad_norm": 1.3740556240081787, + "learning_rate": 3.417819989242915e-06, + "loss": 1.0367, + "step": 3222 + }, + { + "epoch": 0.7379507727532914, + "grad_norm": 1.2875322103500366, + "learning_rate": 3.412237203496036e-06, + "loss": 1.0304, + "step": 3223 + }, + { + "epoch": 0.7381797366914711, + "grad_norm": 1.4357035160064697, + "learning_rate": 3.4066580429397877e-06, + "loss": 0.9638, + "step": 3224 + }, + { + "epoch": 0.7384087006296508, + "grad_norm": 1.247023344039917, + "learning_rate": 3.401082510644337e-06, + "loss": 0.9908, + "step": 3225 + }, + { + "epoch": 0.7386376645678305, + "grad_norm": 1.2303351163864136, + "learning_rate": 3.395510609677861e-06, + "loss": 1.0369, + "step": 3226 + }, + { + "epoch": 0.7388666285060103, + "grad_norm": 1.5605922937393188, + "learning_rate": 3.3899423431065215e-06, + "loss": 1.0669, + "step": 3227 + }, + { + "epoch": 0.73909559244419, + "grad_norm": 1.551808476448059, + "learning_rate": 3.384377713994492e-06, + "loss": 0.9855, + "step": 3228 + }, + { + "epoch": 0.7393245563823698, + "grad_norm": 1.3470834493637085, + "learning_rate": 3.3788167254039417e-06, + "loss": 1.0534, + "step": 3229 + }, + { + "epoch": 0.7395535203205496, + "grad_norm": 2.2554211616516113, + "learning_rate": 3.3732593803950354e-06, + "loss": 1.0309, + "step": 3230 + }, + { + "epoch": 0.7397824842587293, + "grad_norm": 1.2798614501953125, + "learning_rate": 3.3677056820259324e-06, + "loss": 0.9978, + "step": 3231 + }, + { + "epoch": 0.740011448196909, + "grad_norm": 1.184571385383606, + "learning_rate": 3.3621556333527884e-06, + "loss": 1.053, + "step": 3232 + }, + { + "epoch": 0.7402404121350887, + "grad_norm": 1.5115231275558472, + "learning_rate": 3.3566092374297465e-06, + "loss": 0.9638, + "step": 3233 + }, + { + "epoch": 0.7404693760732685, + "grad_norm": 1.1283025741577148, + "learning_rate": 3.3510664973089467e-06, + "loss": 1.022, + "step": 3234 + }, + { + "epoch": 0.7406983400114482, + "grad_norm": 1.4040166139602661, + "learning_rate": 3.3455274160405025e-06, + "loss": 1.037, + "step": 3235 + }, + { + "epoch": 0.7409273039496279, + "grad_norm": 1.2837587594985962, + "learning_rate": 3.3399919966725335e-06, + "loss": 1.0439, + "step": 3236 + }, + { + "epoch": 0.7411562678878076, + "grad_norm": 1.1922607421875, + "learning_rate": 3.3344602422511343e-06, + "loss": 0.9965, + "step": 3237 + }, + { + "epoch": 0.7413852318259874, + "grad_norm": 1.1945897340774536, + "learning_rate": 3.3289321558203767e-06, + "loss": 1.0592, + "step": 3238 + }, + { + "epoch": 0.7416141957641671, + "grad_norm": 1.1530017852783203, + "learning_rate": 3.323407740422323e-06, + "loss": 1.0469, + "step": 3239 + }, + { + "epoch": 0.7418431597023469, + "grad_norm": 1.1255567073822021, + "learning_rate": 3.317886999097014e-06, + "loss": 1.0633, + "step": 3240 + }, + { + "epoch": 0.7420721236405267, + "grad_norm": 1.1872222423553467, + "learning_rate": 3.3123699348824654e-06, + "loss": 0.9804, + "step": 3241 + }, + { + "epoch": 0.7423010875787064, + "grad_norm": 1.289146900177002, + "learning_rate": 3.306856550814673e-06, + "loss": 1.0055, + "step": 3242 + }, + { + "epoch": 0.7425300515168861, + "grad_norm": 1.5255881547927856, + "learning_rate": 3.3013468499276057e-06, + "loss": 1.0347, + "step": 3243 + }, + { + "epoch": 0.7427590154550658, + "grad_norm": 1.0099772214889526, + "learning_rate": 3.2958408352532055e-06, + "loss": 1.0634, + "step": 3244 + }, + { + "epoch": 0.7429879793932456, + "grad_norm": 1.8835973739624023, + "learning_rate": 3.290338509821386e-06, + "loss": 1.024, + "step": 3245 + }, + { + "epoch": 0.7432169433314253, + "grad_norm": 1.2783852815628052, + "learning_rate": 3.2848398766600298e-06, + "loss": 1.0316, + "step": 3246 + }, + { + "epoch": 0.743445907269605, + "grad_norm": 1.3650398254394531, + "learning_rate": 3.2793449387949907e-06, + "loss": 1.0741, + "step": 3247 + }, + { + "epoch": 0.7436748712077847, + "grad_norm": 1.6553008556365967, + "learning_rate": 3.273853699250088e-06, + "loss": 0.9974, + "step": 3248 + }, + { + "epoch": 0.7439038351459645, + "grad_norm": 1.0704493522644043, + "learning_rate": 3.268366161047096e-06, + "loss": 1.08, + "step": 3249 + }, + { + "epoch": 0.7441327990841442, + "grad_norm": 1.375232219696045, + "learning_rate": 3.2628823272057753e-06, + "loss": 1.0434, + "step": 3250 + }, + { + "epoch": 0.7443617630223239, + "grad_norm": 1.122849464416504, + "learning_rate": 3.257402200743821e-06, + "loss": 1.0497, + "step": 3251 + }, + { + "epoch": 0.7445907269605038, + "grad_norm": 1.2734968662261963, + "learning_rate": 3.251925784676907e-06, + "loss": 1.0542, + "step": 3252 + }, + { + "epoch": 0.7448196908986835, + "grad_norm": 1.3681397438049316, + "learning_rate": 3.246453082018658e-06, + "loss": 0.9997, + "step": 3253 + }, + { + "epoch": 0.7450486548368632, + "grad_norm": 1.1173583269119263, + "learning_rate": 3.2409840957806548e-06, + "loss": 1.0651, + "step": 3254 + }, + { + "epoch": 0.745277618775043, + "grad_norm": 1.235009789466858, + "learning_rate": 3.235518828972437e-06, + "loss": 0.9709, + "step": 3255 + }, + { + "epoch": 0.7455065827132227, + "grad_norm": 1.1784312725067139, + "learning_rate": 3.2300572846014945e-06, + "loss": 1.0648, + "step": 3256 + }, + { + "epoch": 0.7457355466514024, + "grad_norm": 1.190245270729065, + "learning_rate": 3.224599465673268e-06, + "loss": 1.0308, + "step": 3257 + }, + { + "epoch": 0.7459645105895821, + "grad_norm": 1.3046700954437256, + "learning_rate": 3.2191453751911505e-06, + "loss": 0.9958, + "step": 3258 + }, + { + "epoch": 0.7461934745277619, + "grad_norm": 1.363734483718872, + "learning_rate": 3.213695016156484e-06, + "loss": 1.0139, + "step": 3259 + }, + { + "epoch": 0.7464224384659416, + "grad_norm": 1.3983644247055054, + "learning_rate": 3.2082483915685526e-06, + "loss": 1.0878, + "step": 3260 + }, + { + "epoch": 0.7466514024041213, + "grad_norm": 1.4021762609481812, + "learning_rate": 3.202805504424592e-06, + "loss": 1.0088, + "step": 3261 + }, + { + "epoch": 0.746880366342301, + "grad_norm": 1.5322551727294922, + "learning_rate": 3.197366357719772e-06, + "loss": 1.0558, + "step": 3262 + }, + { + "epoch": 0.7471093302804809, + "grad_norm": 1.4262042045593262, + "learning_rate": 3.191930954447209e-06, + "loss": 1.0341, + "step": 3263 + }, + { + "epoch": 0.7473382942186606, + "grad_norm": 1.3308278322219849, + "learning_rate": 3.186499297597968e-06, + "loss": 1.0309, + "step": 3264 + }, + { + "epoch": 0.7475672581568403, + "grad_norm": 1.1862602233886719, + "learning_rate": 3.1810713901610367e-06, + "loss": 1.0448, + "step": 3265 + }, + { + "epoch": 0.7477962220950201, + "grad_norm": 1.3563153743743896, + "learning_rate": 3.175647235123347e-06, + "loss": 1.0332, + "step": 3266 + }, + { + "epoch": 0.7480251860331998, + "grad_norm": 1.3230400085449219, + "learning_rate": 3.170226835469774e-06, + "loss": 1.0438, + "step": 3267 + }, + { + "epoch": 0.7482541499713795, + "grad_norm": 1.3605960607528687, + "learning_rate": 3.16481019418311e-06, + "loss": 1.001, + "step": 3268 + }, + { + "epoch": 0.7484831139095592, + "grad_norm": 1.1855884790420532, + "learning_rate": 3.159397314244089e-06, + "loss": 0.9966, + "step": 3269 + }, + { + "epoch": 0.748712077847739, + "grad_norm": 1.5543913841247559, + "learning_rate": 3.1539881986313758e-06, + "loss": 0.9855, + "step": 3270 + }, + { + "epoch": 0.7489410417859187, + "grad_norm": 1.125836968421936, + "learning_rate": 3.1485828503215588e-06, + "loss": 1.1038, + "step": 3271 + }, + { + "epoch": 0.7491700057240984, + "grad_norm": 2.7458279132843018, + "learning_rate": 3.1431812722891598e-06, + "loss": 0.9994, + "step": 3272 + }, + { + "epoch": 0.7493989696622781, + "grad_norm": 1.4484138488769531, + "learning_rate": 3.137783467506613e-06, + "loss": 0.9574, + "step": 3273 + }, + { + "epoch": 0.7496279336004579, + "grad_norm": 1.4349960088729858, + "learning_rate": 3.1323894389442942e-06, + "loss": 1.0307, + "step": 3274 + }, + { + "epoch": 0.7498568975386377, + "grad_norm": 1.287442922592163, + "learning_rate": 3.126999189570493e-06, + "loss": 1.0006, + "step": 3275 + }, + { + "epoch": 0.7500858614768174, + "grad_norm": 1.1122688055038452, + "learning_rate": 3.1216127223514116e-06, + "loss": 1.046, + "step": 3276 + }, + { + "epoch": 0.7503148254149972, + "grad_norm": 1.196319580078125, + "learning_rate": 3.116230040251177e-06, + "loss": 1.0686, + "step": 3277 + }, + { + "epoch": 0.7505437893531769, + "grad_norm": 1.5629481077194214, + "learning_rate": 3.1108511462318437e-06, + "loss": 1.0516, + "step": 3278 + }, + { + "epoch": 0.7507727532913566, + "grad_norm": 1.118003487586975, + "learning_rate": 3.1054760432533626e-06, + "loss": 1.0298, + "step": 3279 + }, + { + "epoch": 0.7510017172295363, + "grad_norm": 1.118003487586975, + "learning_rate": 3.1054760432533626e-06, + "loss": 1.0198, + "step": 3280 + }, + { + "epoch": 0.7512306811677161, + "grad_norm": 1.319196343421936, + "learning_rate": 3.100104734273608e-06, + "loss": 1.0704, + "step": 3281 + }, + { + "epoch": 0.7514596451058958, + "grad_norm": 1.3036201000213623, + "learning_rate": 3.0947372222483762e-06, + "loss": 1.0307, + "step": 3282 + }, + { + "epoch": 0.7516886090440755, + "grad_norm": 1.2368963956832886, + "learning_rate": 3.089373510131354e-06, + "loss": 1.0142, + "step": 3283 + }, + { + "epoch": 0.7519175729822553, + "grad_norm": 1.583749771118164, + "learning_rate": 3.0840136008741505e-06, + "loss": 1.0126, + "step": 3284 + }, + { + "epoch": 0.752146536920435, + "grad_norm": 1.174692988395691, + "learning_rate": 3.0786574974262784e-06, + "loss": 1.0675, + "step": 3285 + }, + { + "epoch": 0.7523755008586148, + "grad_norm": 1.5759061574935913, + "learning_rate": 3.073305202735157e-06, + "loss": 1.0071, + "step": 3286 + }, + { + "epoch": 0.7526044647967945, + "grad_norm": 1.3044160604476929, + "learning_rate": 3.0679567197461135e-06, + "loss": 0.9839, + "step": 3287 + }, + { + "epoch": 0.7528334287349743, + "grad_norm": 1.2453902959823608, + "learning_rate": 3.0626120514023605e-06, + "loss": 1.0634, + "step": 3288 + }, + { + "epoch": 0.753062392673154, + "grad_norm": 1.130800485610962, + "learning_rate": 3.057271200645037e-06, + "loss": 1.0402, + "step": 3289 + }, + { + "epoch": 0.7532913566113337, + "grad_norm": 1.4683923721313477, + "learning_rate": 3.0519341704131666e-06, + "loss": 1.0274, + "step": 3290 + }, + { + "epoch": 0.7535203205495135, + "grad_norm": 1.3023918867111206, + "learning_rate": 3.0466009636436633e-06, + "loss": 1.0431, + "step": 3291 + }, + { + "epoch": 0.7537492844876932, + "grad_norm": 2.7885706424713135, + "learning_rate": 3.0412715832713592e-06, + "loss": 1.0291, + "step": 3292 + }, + { + "epoch": 0.7539782484258729, + "grad_norm": 1.2981799840927124, + "learning_rate": 3.035946032228957e-06, + "loss": 0.9623, + "step": 3293 + }, + { + "epoch": 0.7542072123640526, + "grad_norm": 1.2832386493682861, + "learning_rate": 3.0306243134470668e-06, + "loss": 1.0626, + "step": 3294 + }, + { + "epoch": 0.7544361763022324, + "grad_norm": 1.5354857444763184, + "learning_rate": 3.0253064298541857e-06, + "loss": 1.0211, + "step": 3295 + }, + { + "epoch": 0.7546651402404121, + "grad_norm": 1.4440540075302124, + "learning_rate": 3.0199923843767007e-06, + "loss": 1.0351, + "step": 3296 + }, + { + "epoch": 0.7548941041785918, + "grad_norm": 1.2906075716018677, + "learning_rate": 3.014682179938886e-06, + "loss": 1.0806, + "step": 3297 + }, + { + "epoch": 0.7551230681167717, + "grad_norm": 1.2191280126571655, + "learning_rate": 3.009375819462902e-06, + "loss": 1.0078, + "step": 3298 + }, + { + "epoch": 0.7553520320549514, + "grad_norm": 1.3844983577728271, + "learning_rate": 3.0040733058687956e-06, + "loss": 1.056, + "step": 3299 + }, + { + "epoch": 0.7555809959931311, + "grad_norm": 1.1661230325698853, + "learning_rate": 2.998774642074496e-06, + "loss": 1.061, + "step": 3300 + }, + { + "epoch": 0.7558099599313108, + "grad_norm": 1.2717007398605347, + "learning_rate": 2.993479830995815e-06, + "loss": 0.9824, + "step": 3301 + }, + { + "epoch": 0.7560389238694906, + "grad_norm": 1.4554436206817627, + "learning_rate": 2.9881888755464337e-06, + "loss": 1.0202, + "step": 3302 + }, + { + "epoch": 0.7562678878076703, + "grad_norm": 1.251201868057251, + "learning_rate": 2.9829017786379333e-06, + "loss": 1.0595, + "step": 3303 + }, + { + "epoch": 0.75649685174585, + "grad_norm": 1.0925774574279785, + "learning_rate": 2.97761854317975e-06, + "loss": 1.0848, + "step": 3304 + }, + { + "epoch": 0.7567258156840297, + "grad_norm": 1.3679059743881226, + "learning_rate": 2.972339172079204e-06, + "loss": 1.0356, + "step": 3305 + }, + { + "epoch": 0.7569547796222095, + "grad_norm": 1.140267014503479, + "learning_rate": 2.9670636682414966e-06, + "loss": 1.04, + "step": 3306 + }, + { + "epoch": 0.7571837435603892, + "grad_norm": 1.2260184288024902, + "learning_rate": 2.961792034569686e-06, + "loss": 1.0593, + "step": 3307 + }, + { + "epoch": 0.7574127074985689, + "grad_norm": 1.3413991928100586, + "learning_rate": 2.9565242739647115e-06, + "loss": 1.0271, + "step": 3308 + }, + { + "epoch": 0.7576416714367488, + "grad_norm": 1.2301392555236816, + "learning_rate": 2.9512603893253756e-06, + "loss": 1.0802, + "step": 3309 + }, + { + "epoch": 0.7578706353749285, + "grad_norm": 1.1220687627792358, + "learning_rate": 2.9460003835483497e-06, + "loss": 1.0152, + "step": 3310 + }, + { + "epoch": 0.7580995993131082, + "grad_norm": 2.1697607040405273, + "learning_rate": 2.940744259528173e-06, + "loss": 1.0526, + "step": 3311 + }, + { + "epoch": 0.758328563251288, + "grad_norm": 1.1174249649047852, + "learning_rate": 2.9354920201572457e-06, + "loss": 0.9836, + "step": 3312 + }, + { + "epoch": 0.7585575271894677, + "grad_norm": 1.1898833513259888, + "learning_rate": 2.9302436683258306e-06, + "loss": 1.0391, + "step": 3313 + }, + { + "epoch": 0.7587864911276474, + "grad_norm": 1.1694586277008057, + "learning_rate": 2.9249992069220557e-06, + "loss": 1.0203, + "step": 3314 + }, + { + "epoch": 0.7590154550658271, + "grad_norm": 1.1864376068115234, + "learning_rate": 2.919758638831893e-06, + "loss": 1.046, + "step": 3315 + }, + { + "epoch": 0.7592444190040069, + "grad_norm": 1.4337023496627808, + "learning_rate": 2.9145219669391944e-06, + "loss": 1.1258, + "step": 3316 + }, + { + "epoch": 0.7594733829421866, + "grad_norm": 1.0952881574630737, + "learning_rate": 2.909289194125655e-06, + "loss": 1.0528, + "step": 3317 + }, + { + "epoch": 0.7597023468803663, + "grad_norm": 1.3980028629302979, + "learning_rate": 2.904060323270822e-06, + "loss": 1.0476, + "step": 3318 + }, + { + "epoch": 0.759931310818546, + "grad_norm": 1.3246995210647583, + "learning_rate": 2.898835357252097e-06, + "loss": 1.0199, + "step": 3319 + }, + { + "epoch": 0.7601602747567258, + "grad_norm": 1.449928879737854, + "learning_rate": 2.8936142989447434e-06, + "loss": 1.0102, + "step": 3320 + }, + { + "epoch": 0.7603892386949056, + "grad_norm": 1.211899757385254, + "learning_rate": 2.8883971512218588e-06, + "loss": 1.0177, + "step": 3321 + }, + { + "epoch": 0.7606182026330853, + "grad_norm": 1.1149224042892456, + "learning_rate": 2.8831839169543998e-06, + "loss": 1.0456, + "step": 3322 + }, + { + "epoch": 0.760847166571265, + "grad_norm": 1.3038837909698486, + "learning_rate": 2.877974599011162e-06, + "loss": 1.0935, + "step": 3323 + }, + { + "epoch": 0.7610761305094448, + "grad_norm": 1.0868488550186157, + "learning_rate": 2.8727692002587914e-06, + "loss": 1.0169, + "step": 3324 + }, + { + "epoch": 0.7613050944476245, + "grad_norm": 1.4088412523269653, + "learning_rate": 2.867567723561776e-06, + "loss": 1.0274, + "step": 3325 + }, + { + "epoch": 0.7615340583858042, + "grad_norm": 1.7433838844299316, + "learning_rate": 2.8623701717824435e-06, + "loss": 0.9796, + "step": 3326 + }, + { + "epoch": 0.761763022323984, + "grad_norm": 1.1715034246444702, + "learning_rate": 2.8571765477809645e-06, + "loss": 1.0241, + "step": 3327 + }, + { + "epoch": 0.7619919862621637, + "grad_norm": 1.7508958578109741, + "learning_rate": 2.851986854415347e-06, + "loss": 0.9302, + "step": 3328 + }, + { + "epoch": 0.7622209502003434, + "grad_norm": 1.3008875846862793, + "learning_rate": 2.84680109454143e-06, + "loss": 1.0152, + "step": 3329 + }, + { + "epoch": 0.7624499141385231, + "grad_norm": 1.7418900728225708, + "learning_rate": 2.841619271012901e-06, + "loss": 0.97, + "step": 3330 + }, + { + "epoch": 0.7626788780767029, + "grad_norm": 1.3994221687316895, + "learning_rate": 2.8364413866812733e-06, + "loss": 1.1088, + "step": 3331 + }, + { + "epoch": 0.7629078420148827, + "grad_norm": 1.5729641914367676, + "learning_rate": 2.83126744439589e-06, + "loss": 1.0114, + "step": 3332 + }, + { + "epoch": 0.7631368059530624, + "grad_norm": 1.295013666152954, + "learning_rate": 2.826097447003925e-06, + "loss": 1.0756, + "step": 3333 + }, + { + "epoch": 0.7633657698912422, + "grad_norm": 1.3600553274154663, + "learning_rate": 2.820931397350395e-06, + "loss": 1.0254, + "step": 3334 + }, + { + "epoch": 0.7635947338294219, + "grad_norm": 1.1421211957931519, + "learning_rate": 2.815769298278125e-06, + "loss": 1.0173, + "step": 3335 + }, + { + "epoch": 0.7638236977676016, + "grad_norm": 2.1977944374084473, + "learning_rate": 2.810611152627777e-06, + "loss": 1.0828, + "step": 3336 + }, + { + "epoch": 0.7640526617057813, + "grad_norm": 1.382452368736267, + "learning_rate": 2.8054569632378358e-06, + "loss": 1.0648, + "step": 3337 + }, + { + "epoch": 0.7642816256439611, + "grad_norm": 1.372998833656311, + "learning_rate": 2.80030673294461e-06, + "loss": 1.0375, + "step": 3338 + }, + { + "epoch": 0.7645105895821408, + "grad_norm": 1.1051013469696045, + "learning_rate": 2.795160464582225e-06, + "loss": 1.0433, + "step": 3339 + }, + { + "epoch": 0.7647395535203205, + "grad_norm": 1.265097737312317, + "learning_rate": 2.7900181609826325e-06, + "loss": 0.9865, + "step": 3340 + }, + { + "epoch": 0.7649685174585003, + "grad_norm": 1.2459075450897217, + "learning_rate": 2.784879824975597e-06, + "loss": 1.0751, + "step": 3341 + }, + { + "epoch": 0.76519748139668, + "grad_norm": 1.2201124429702759, + "learning_rate": 2.779745459388705e-06, + "loss": 1.0641, + "step": 3342 + }, + { + "epoch": 0.7654264453348597, + "grad_norm": 1.1874167919158936, + "learning_rate": 2.774615067047346e-06, + "loss": 1.0337, + "step": 3343 + }, + { + "epoch": 0.7656554092730395, + "grad_norm": 1.3858647346496582, + "learning_rate": 2.769488650774741e-06, + "loss": 1.0191, + "step": 3344 + }, + { + "epoch": 0.7658843732112193, + "grad_norm": 1.2821847200393677, + "learning_rate": 2.7643662133919136e-06, + "loss": 1.0342, + "step": 3345 + }, + { + "epoch": 0.766113337149399, + "grad_norm": 1.4728732109069824, + "learning_rate": 2.7592477577176924e-06, + "loss": 0.9817, + "step": 3346 + }, + { + "epoch": 0.7663423010875787, + "grad_norm": 1.484055519104004, + "learning_rate": 2.7541332865687245e-06, + "loss": 1.1209, + "step": 3347 + }, + { + "epoch": 0.7665712650257585, + "grad_norm": 1.1936273574829102, + "learning_rate": 2.749022802759459e-06, + "loss": 1.0309, + "step": 3348 + }, + { + "epoch": 0.7668002289639382, + "grad_norm": 1.9332703351974487, + "learning_rate": 2.7439163091021525e-06, + "loss": 1.0065, + "step": 3349 + }, + { + "epoch": 0.7670291929021179, + "grad_norm": 1.1286215782165527, + "learning_rate": 2.738813808406866e-06, + "loss": 0.9393, + "step": 3350 + }, + { + "epoch": 0.7672581568402976, + "grad_norm": 1.4261751174926758, + "learning_rate": 2.7337153034814636e-06, + "loss": 1.051, + "step": 3351 + }, + { + "epoch": 0.7674871207784774, + "grad_norm": 1.2641804218292236, + "learning_rate": 2.7286207971316094e-06, + "loss": 1.0007, + "step": 3352 + }, + { + "epoch": 0.7677160847166571, + "grad_norm": 1.276326298713684, + "learning_rate": 2.7235302921607665e-06, + "loss": 1.0195, + "step": 3353 + }, + { + "epoch": 0.7679450486548368, + "grad_norm": 1.2994381189346313, + "learning_rate": 2.7184437913701977e-06, + "loss": 1.0581, + "step": 3354 + }, + { + "epoch": 0.7681740125930167, + "grad_norm": 1.2071455717086792, + "learning_rate": 2.713361297558963e-06, + "loss": 1.0235, + "step": 3355 + }, + { + "epoch": 0.7684029765311964, + "grad_norm": 1.246546745300293, + "learning_rate": 2.708282813523917e-06, + "loss": 1.0011, + "step": 3356 + }, + { + "epoch": 0.7686319404693761, + "grad_norm": 1.2827038764953613, + "learning_rate": 2.7032083420597e-06, + "loss": 1.0674, + "step": 3357 + }, + { + "epoch": 0.7688609044075558, + "grad_norm": 1.9928056001663208, + "learning_rate": 2.6981378859587614e-06, + "loss": 1.0078, + "step": 3358 + }, + { + "epoch": 0.7690898683457356, + "grad_norm": 1.672621250152588, + "learning_rate": 2.6930714480113217e-06, + "loss": 1.0532, + "step": 3359 + }, + { + "epoch": 0.7693188322839153, + "grad_norm": 1.4071228504180908, + "learning_rate": 2.688009031005403e-06, + "loss": 1.1329, + "step": 3360 + }, + { + "epoch": 0.769547796222095, + "grad_norm": 2.0468266010284424, + "learning_rate": 2.6829506377268122e-06, + "loss": 1.063, + "step": 3361 + }, + { + "epoch": 0.7697767601602747, + "grad_norm": 1.948828935623169, + "learning_rate": 2.6778962709591382e-06, + "loss": 1.0897, + "step": 3362 + }, + { + "epoch": 0.7700057240984545, + "grad_norm": 1.4286307096481323, + "learning_rate": 2.6728459334837576e-06, + "loss": 1.0155, + "step": 3363 + }, + { + "epoch": 0.7702346880366342, + "grad_norm": 1.1092466115951538, + "learning_rate": 2.667799628079829e-06, + "loss": 1.0039, + "step": 3364 + }, + { + "epoch": 0.7704636519748139, + "grad_norm": 1.3140974044799805, + "learning_rate": 2.6627573575242917e-06, + "loss": 1.0641, + "step": 3365 + }, + { + "epoch": 0.7706926159129937, + "grad_norm": 1.29413640499115, + "learning_rate": 2.6577191245918654e-06, + "loss": 1.01, + "step": 3366 + }, + { + "epoch": 0.7709215798511735, + "grad_norm": 1.2063698768615723, + "learning_rate": 2.6526849320550474e-06, + "loss": 1.0791, + "step": 3367 + }, + { + "epoch": 0.7711505437893532, + "grad_norm": 1.2607606649398804, + "learning_rate": 2.6476547826841106e-06, + "loss": 1.0255, + "step": 3368 + }, + { + "epoch": 0.7713795077275329, + "grad_norm": 1.187766194343567, + "learning_rate": 2.642628679247109e-06, + "loss": 1.066, + "step": 3369 + }, + { + "epoch": 0.7716084716657127, + "grad_norm": 1.3988637924194336, + "learning_rate": 2.6376066245098565e-06, + "loss": 1.1261, + "step": 3370 + }, + { + "epoch": 0.7718374356038924, + "grad_norm": 1.4937584400177002, + "learning_rate": 2.6325886212359496e-06, + "loss": 0.9689, + "step": 3371 + }, + { + "epoch": 0.7720663995420721, + "grad_norm": 1.2917104959487915, + "learning_rate": 2.6275746721867624e-06, + "loss": 1.0493, + "step": 3372 + }, + { + "epoch": 0.7722953634802519, + "grad_norm": 1.5106794834136963, + "learning_rate": 2.6225647801214203e-06, + "loss": 0.9849, + "step": 3373 + }, + { + "epoch": 0.7725243274184316, + "grad_norm": 1.2963145971298218, + "learning_rate": 2.6175589477968234e-06, + "loss": 1.0687, + "step": 3374 + }, + { + "epoch": 0.7727532913566113, + "grad_norm": 1.9042096138000488, + "learning_rate": 2.6125571779676493e-06, + "loss": 1.0122, + "step": 3375 + }, + { + "epoch": 0.772982255294791, + "grad_norm": 14.797686576843262, + "learning_rate": 2.607559473386321e-06, + "loss": 1.0177, + "step": 3376 + }, + { + "epoch": 0.7732112192329708, + "grad_norm": 1.1996756792068481, + "learning_rate": 2.602565836803036e-06, + "loss": 1.1003, + "step": 3377 + }, + { + "epoch": 0.7734401831711506, + "grad_norm": 1.0620721578598022, + "learning_rate": 2.5975762709657506e-06, + "loss": 1.0226, + "step": 3378 + }, + { + "epoch": 0.7736691471093303, + "grad_norm": 1.3966255187988281, + "learning_rate": 2.5925907786201808e-06, + "loss": 1.0354, + "step": 3379 + }, + { + "epoch": 0.77389811104751, + "grad_norm": 1.4668009281158447, + "learning_rate": 2.5876093625098066e-06, + "loss": 1.0616, + "step": 3380 + }, + { + "epoch": 0.7741270749856898, + "grad_norm": 1.1744608879089355, + "learning_rate": 2.5826320253758477e-06, + "loss": 1.0438, + "step": 3381 + }, + { + "epoch": 0.7743560389238695, + "grad_norm": 1.5106372833251953, + "learning_rate": 2.5776587699573007e-06, + "loss": 1.066, + "step": 3382 + }, + { + "epoch": 0.7745850028620492, + "grad_norm": 1.2056074142456055, + "learning_rate": 2.5726895989909063e-06, + "loss": 1.0526, + "step": 3383 + }, + { + "epoch": 0.774813966800229, + "grad_norm": 1.2208433151245117, + "learning_rate": 2.5677245152111497e-06, + "loss": 1.0644, + "step": 3384 + }, + { + "epoch": 0.7750429307384087, + "grad_norm": 1.3252755403518677, + "learning_rate": 2.5627635213502832e-06, + "loss": 1.0094, + "step": 3385 + }, + { + "epoch": 0.7752718946765884, + "grad_norm": 1.3273080587387085, + "learning_rate": 2.5578066201383e-06, + "loss": 1.1036, + "step": 3386 + }, + { + "epoch": 0.7755008586147681, + "grad_norm": 1.2906368970870972, + "learning_rate": 2.552853814302936e-06, + "loss": 1.0622, + "step": 3387 + }, + { + "epoch": 0.7757298225529479, + "grad_norm": 1.388886570930481, + "learning_rate": 2.547905106569677e-06, + "loss": 1.0753, + "step": 3388 + }, + { + "epoch": 0.7759587864911276, + "grad_norm": 1.221598744392395, + "learning_rate": 2.5429604996617653e-06, + "loss": 1.0204, + "step": 3389 + }, + { + "epoch": 0.7761877504293074, + "grad_norm": 1.2661128044128418, + "learning_rate": 2.5380199963001684e-06, + "loss": 1.0461, + "step": 3390 + }, + { + "epoch": 0.7764167143674872, + "grad_norm": 3.6229851245880127, + "learning_rate": 2.5330835992036062e-06, + "loss": 1.1383, + "step": 3391 + }, + { + "epoch": 0.7766456783056669, + "grad_norm": 1.637839913368225, + "learning_rate": 2.528151311088537e-06, + "loss": 1.0802, + "step": 3392 + }, + { + "epoch": 0.7768746422438466, + "grad_norm": 1.3283582925796509, + "learning_rate": 2.523223134669157e-06, + "loss": 1.0888, + "step": 3393 + }, + { + "epoch": 0.7771036061820263, + "grad_norm": 1.1502894163131714, + "learning_rate": 2.518299072657403e-06, + "loss": 1.0402, + "step": 3394 + }, + { + "epoch": 0.7773325701202061, + "grad_norm": 1.397196888923645, + "learning_rate": 2.513379127762937e-06, + "loss": 1.048, + "step": 3395 + }, + { + "epoch": 0.7775615340583858, + "grad_norm": 1.6322966814041138, + "learning_rate": 2.5084633026931727e-06, + "loss": 1.0891, + "step": 3396 + }, + { + "epoch": 0.7777904979965655, + "grad_norm": 1.2721960544586182, + "learning_rate": 2.5035516001532467e-06, + "loss": 1.0438, + "step": 3397 + }, + { + "epoch": 0.7780194619347452, + "grad_norm": 1.2815996408462524, + "learning_rate": 2.4986440228460185e-06, + "loss": 1.0351, + "step": 3398 + }, + { + "epoch": 0.778248425872925, + "grad_norm": 1.0550155639648438, + "learning_rate": 2.4937405734720964e-06, + "loss": 1.0648, + "step": 3399 + }, + { + "epoch": 0.7784773898111047, + "grad_norm": 1.814061164855957, + "learning_rate": 2.488841254729808e-06, + "loss": 1.0793, + "step": 3400 + }, + { + "epoch": 0.7787063537492845, + "grad_norm": 1.3419748544692993, + "learning_rate": 2.4839460693151994e-06, + "loss": 1.092, + "step": 3401 + }, + { + "epoch": 0.7789353176874643, + "grad_norm": 1.4719575643539429, + "learning_rate": 2.4790550199220543e-06, + "loss": 1.0483, + "step": 3402 + }, + { + "epoch": 0.779164281625644, + "grad_norm": 1.003715991973877, + "learning_rate": 2.474168109241877e-06, + "loss": 0.984, + "step": 3403 + }, + { + "epoch": 0.7793932455638237, + "grad_norm": 1.2123103141784668, + "learning_rate": 2.469285339963892e-06, + "loss": 1.042, + "step": 3404 + }, + { + "epoch": 0.7796222095020034, + "grad_norm": 1.2390992641448975, + "learning_rate": 2.4644067147750462e-06, + "loss": 0.9107, + "step": 3405 + }, + { + "epoch": 0.7798511734401832, + "grad_norm": 1.2249623537063599, + "learning_rate": 2.459532236360007e-06, + "loss": 1.0057, + "step": 3406 + }, + { + "epoch": 0.7800801373783629, + "grad_norm": 1.6221455335617065, + "learning_rate": 2.4546619074011603e-06, + "loss": 0.9997, + "step": 3407 + }, + { + "epoch": 0.7803091013165426, + "grad_norm": 1.1947602033615112, + "learning_rate": 2.4497957305786046e-06, + "loss": 0.9835, + "step": 3408 + }, + { + "epoch": 0.7805380652547224, + "grad_norm": 1.270727515220642, + "learning_rate": 2.4449337085701573e-06, + "loss": 1.0053, + "step": 3409 + }, + { + "epoch": 0.7807670291929021, + "grad_norm": 1.52069890499115, + "learning_rate": 2.4400758440513516e-06, + "loss": 1.0603, + "step": 3410 + }, + { + "epoch": 0.7809959931310818, + "grad_norm": 1.472410798072815, + "learning_rate": 2.4352221396954233e-06, + "loss": 1.0715, + "step": 3411 + }, + { + "epoch": 0.7812249570692615, + "grad_norm": 1.2387964725494385, + "learning_rate": 2.430372598173326e-06, + "loss": 1.0548, + "step": 3412 + }, + { + "epoch": 0.7814539210074414, + "grad_norm": 1.2153174877166748, + "learning_rate": 2.4255272221537295e-06, + "loss": 1.0096, + "step": 3413 + }, + { + "epoch": 0.7816828849456211, + "grad_norm": 1.296165943145752, + "learning_rate": 2.4206860143029954e-06, + "loss": 1.0653, + "step": 3414 + }, + { + "epoch": 0.7819118488838008, + "grad_norm": 1.3834319114685059, + "learning_rate": 2.4158489772852035e-06, + "loss": 0.9823, + "step": 3415 + }, + { + "epoch": 0.7821408128219806, + "grad_norm": 1.151049017906189, + "learning_rate": 2.4110161137621325e-06, + "loss": 1.051, + "step": 3416 + }, + { + "epoch": 0.7823697767601603, + "grad_norm": 2.3822290897369385, + "learning_rate": 2.406187426393269e-06, + "loss": 1.0159, + "step": 3417 + }, + { + "epoch": 0.78259874069834, + "grad_norm": 1.6371638774871826, + "learning_rate": 2.401362917835798e-06, + "loss": 0.969, + "step": 3418 + }, + { + "epoch": 0.7828277046365197, + "grad_norm": 1.29193913936615, + "learning_rate": 2.396542590744606e-06, + "loss": 1.0759, + "step": 3419 + }, + { + "epoch": 0.7830566685746995, + "grad_norm": 1.1832704544067383, + "learning_rate": 2.391726447772279e-06, + "loss": 1.0444, + "step": 3420 + }, + { + "epoch": 0.7832856325128792, + "grad_norm": 1.0548069477081299, + "learning_rate": 2.3869144915691033e-06, + "loss": 1.0141, + "step": 3421 + }, + { + "epoch": 0.7835145964510589, + "grad_norm": 1.3056319952011108, + "learning_rate": 2.3821067247830488e-06, + "loss": 1.0866, + "step": 3422 + }, + { + "epoch": 0.7837435603892386, + "grad_norm": 1.4193148612976074, + "learning_rate": 2.3773031500597974e-06, + "loss": 1.0747, + "step": 3423 + }, + { + "epoch": 0.7839725243274185, + "grad_norm": 1.3721299171447754, + "learning_rate": 2.3725037700427168e-06, + "loss": 1.0653, + "step": 3424 + }, + { + "epoch": 0.7842014882655982, + "grad_norm": 1.2656069993972778, + "learning_rate": 2.3677085873728602e-06, + "loss": 1.0436, + "step": 3425 + }, + { + "epoch": 0.7844304522037779, + "grad_norm": 1.1870973110198975, + "learning_rate": 2.3629176046889755e-06, + "loss": 1.0881, + "step": 3426 + }, + { + "epoch": 0.7846594161419577, + "grad_norm": 2.626173257827759, + "learning_rate": 2.3581308246275103e-06, + "loss": 1.0576, + "step": 3427 + }, + { + "epoch": 0.7848883800801374, + "grad_norm": 1.569612741470337, + "learning_rate": 2.353348249822579e-06, + "loss": 1.0122, + "step": 3428 + }, + { + "epoch": 0.7851173440183171, + "grad_norm": 1.4496647119522095, + "learning_rate": 2.3485698829059967e-06, + "loss": 1.0401, + "step": 3429 + }, + { + "epoch": 0.7853463079564968, + "grad_norm": 1.2720706462860107, + "learning_rate": 2.3437957265072587e-06, + "loss": 1.0253, + "step": 3430 + }, + { + "epoch": 0.7855752718946766, + "grad_norm": 1.1991873979568481, + "learning_rate": 2.339025783253541e-06, + "loss": 1.075, + "step": 3431 + }, + { + "epoch": 0.7858042358328563, + "grad_norm": 1.2989721298217773, + "learning_rate": 2.334260055769707e-06, + "loss": 1.0634, + "step": 3432 + }, + { + "epoch": 0.786033199771036, + "grad_norm": 1.156073808670044, + "learning_rate": 2.3294985466782937e-06, + "loss": 1.0094, + "step": 3433 + }, + { + "epoch": 0.7862621637092158, + "grad_norm": 1.2679800987243652, + "learning_rate": 2.324741258599521e-06, + "loss": 1.0954, + "step": 3434 + }, + { + "epoch": 0.7864911276473955, + "grad_norm": 1.283085823059082, + "learning_rate": 2.319988194151287e-06, + "loss": 0.9946, + "step": 3435 + }, + { + "epoch": 0.7867200915855753, + "grad_norm": 1.6633352041244507, + "learning_rate": 2.3152393559491546e-06, + "loss": 1.0509, + "step": 3436 + }, + { + "epoch": 0.786949055523755, + "grad_norm": 1.3661447763442993, + "learning_rate": 2.3104947466063785e-06, + "loss": 1.0671, + "step": 3437 + }, + { + "epoch": 0.7871780194619348, + "grad_norm": 1.2480249404907227, + "learning_rate": 2.305754368733878e-06, + "loss": 1.0529, + "step": 3438 + }, + { + "epoch": 0.7874069834001145, + "grad_norm": 1.286214828491211, + "learning_rate": 2.3010182249402368e-06, + "loss": 1.0186, + "step": 3439 + }, + { + "epoch": 0.7876359473382942, + "grad_norm": 1.268823504447937, + "learning_rate": 2.2962863178317154e-06, + "loss": 1.0093, + "step": 3440 + }, + { + "epoch": 0.787864911276474, + "grad_norm": 1.215895652770996, + "learning_rate": 2.29155865001225e-06, + "loss": 1.0696, + "step": 3441 + }, + { + "epoch": 0.7880938752146537, + "grad_norm": 1.2569650411605835, + "learning_rate": 2.2868352240834304e-06, + "loss": 1.0109, + "step": 3442 + }, + { + "epoch": 0.7883228391528334, + "grad_norm": 1.3006694316864014, + "learning_rate": 2.282116042644519e-06, + "loss": 1.0191, + "step": 3443 + }, + { + "epoch": 0.7885518030910131, + "grad_norm": 1.2813971042633057, + "learning_rate": 2.2774011082924417e-06, + "loss": 1.0308, + "step": 3444 + }, + { + "epoch": 0.7887807670291929, + "grad_norm": 1.4007090330123901, + "learning_rate": 2.2726904236217895e-06, + "loss": 1.0365, + "step": 3445 + }, + { + "epoch": 0.7890097309673726, + "grad_norm": 1.9624810218811035, + "learning_rate": 2.2679839912248104e-06, + "loss": 1.0467, + "step": 3446 + }, + { + "epoch": 0.7892386949055524, + "grad_norm": 1.6809523105621338, + "learning_rate": 2.263281813691417e-06, + "loss": 1.025, + "step": 3447 + }, + { + "epoch": 0.7894676588437322, + "grad_norm": 1.4995564222335815, + "learning_rate": 2.2585838936091753e-06, + "loss": 1.0324, + "step": 3448 + }, + { + "epoch": 0.7896966227819119, + "grad_norm": 1.4374549388885498, + "learning_rate": 2.253890233563316e-06, + "loss": 1.0693, + "step": 3449 + }, + { + "epoch": 0.7899255867200916, + "grad_norm": 1.204321265220642, + "learning_rate": 2.2492008361367133e-06, + "loss": 1.0055, + "step": 3450 + }, + { + "epoch": 0.7901545506582713, + "grad_norm": 1.4027067422866821, + "learning_rate": 2.2445157039099096e-06, + "loss": 1.0308, + "step": 3451 + }, + { + "epoch": 0.7903835145964511, + "grad_norm": 1.5836561918258667, + "learning_rate": 2.2398348394610947e-06, + "loss": 1.0295, + "step": 3452 + }, + { + "epoch": 0.7906124785346308, + "grad_norm": 1.5846806764602661, + "learning_rate": 2.235158245366105e-06, + "loss": 1.0415, + "step": 3453 + }, + { + "epoch": 0.7908414424728105, + "grad_norm": 1.1941750049591064, + "learning_rate": 2.2304859241984313e-06, + "loss": 1.0086, + "step": 3454 + }, + { + "epoch": 0.7910704064109902, + "grad_norm": 1.2206698656082153, + "learning_rate": 2.225817878529214e-06, + "loss": 1.0253, + "step": 3455 + }, + { + "epoch": 0.79129937034917, + "grad_norm": 1.6405779123306274, + "learning_rate": 2.2211541109272383e-06, + "loss": 0.983, + "step": 3456 + }, + { + "epoch": 0.7915283342873497, + "grad_norm": 1.773629903793335, + "learning_rate": 2.216494623958939e-06, + "loss": 0.9978, + "step": 3457 + }, + { + "epoch": 0.7917572982255294, + "grad_norm": 1.5641894340515137, + "learning_rate": 2.2118394201883907e-06, + "loss": 1.0389, + "step": 3458 + }, + { + "epoch": 0.7919862621637093, + "grad_norm": 1.358713984489441, + "learning_rate": 2.207188502177313e-06, + "loss": 1.0336, + "step": 3459 + }, + { + "epoch": 0.792215226101889, + "grad_norm": 1.1877168416976929, + "learning_rate": 2.2025418724850678e-06, + "loss": 1.1138, + "step": 3460 + }, + { + "epoch": 0.7924441900400687, + "grad_norm": 1.3562843799591064, + "learning_rate": 2.197899533668657e-06, + "loss": 1.0429, + "step": 3461 + }, + { + "epoch": 0.7926731539782484, + "grad_norm": 1.2236860990524292, + "learning_rate": 2.1932614882827196e-06, + "loss": 1.0147, + "step": 3462 + }, + { + "epoch": 0.7929021179164282, + "grad_norm": 1.2904807329177856, + "learning_rate": 2.1886277388795363e-06, + "loss": 1.0654, + "step": 3463 + }, + { + "epoch": 0.7931310818546079, + "grad_norm": 1.3184075355529785, + "learning_rate": 2.1839982880090115e-06, + "loss": 1.1185, + "step": 3464 + }, + { + "epoch": 0.7933600457927876, + "grad_norm": 1.4988008737564087, + "learning_rate": 2.1793731382187056e-06, + "loss": 1.0399, + "step": 3465 + }, + { + "epoch": 0.7935890097309674, + "grad_norm": 1.8388975858688354, + "learning_rate": 2.1747522920537913e-06, + "loss": 1.0708, + "step": 3466 + }, + { + "epoch": 0.7938179736691471, + "grad_norm": 1.239767074584961, + "learning_rate": 2.1701357520570797e-06, + "loss": 1.0156, + "step": 3467 + }, + { + "epoch": 0.7940469376073268, + "grad_norm": 1.1681170463562012, + "learning_rate": 2.165523520769024e-06, + "loss": 1.0439, + "step": 3468 + }, + { + "epoch": 0.7942759015455065, + "grad_norm": 1.1504888534545898, + "learning_rate": 2.160915600727688e-06, + "loss": 1.0851, + "step": 3469 + }, + { + "epoch": 0.7945048654836864, + "grad_norm": 1.3817356824874878, + "learning_rate": 2.156311994468774e-06, + "loss": 1.0873, + "step": 3470 + }, + { + "epoch": 0.7947338294218661, + "grad_norm": 1.3247697353363037, + "learning_rate": 2.151712704525608e-06, + "loss": 0.9901, + "step": 3471 + }, + { + "epoch": 0.7949627933600458, + "grad_norm": 1.5269098281860352, + "learning_rate": 2.1471177334291404e-06, + "loss": 1.0721, + "step": 3472 + }, + { + "epoch": 0.7951917572982256, + "grad_norm": 1.195955753326416, + "learning_rate": 2.142527083707946e-06, + "loss": 1.0325, + "step": 3473 + }, + { + "epoch": 0.7954207212364053, + "grad_norm": 1.3172709941864014, + "learning_rate": 2.1379407578882206e-06, + "loss": 1.0282, + "step": 3474 + }, + { + "epoch": 0.795649685174585, + "grad_norm": 1.4341003894805908, + "learning_rate": 2.13335875849378e-06, + "loss": 1.098, + "step": 3475 + }, + { + "epoch": 0.7958786491127647, + "grad_norm": 1.1495006084442139, + "learning_rate": 2.1287810880460636e-06, + "loss": 1.0147, + "step": 3476 + }, + { + "epoch": 0.7961076130509445, + "grad_norm": 1.1693551540374756, + "learning_rate": 2.1242077490641157e-06, + "loss": 1.0005, + "step": 3477 + }, + { + "epoch": 0.7963365769891242, + "grad_norm": 1.5552064180374146, + "learning_rate": 2.119638744064617e-06, + "loss": 1.0714, + "step": 3478 + }, + { + "epoch": 0.7965655409273039, + "grad_norm": 1.519784688949585, + "learning_rate": 2.1150740755618505e-06, + "loss": 1.04, + "step": 3479 + }, + { + "epoch": 0.7967945048654836, + "grad_norm": 1.1543006896972656, + "learning_rate": 2.1105137460677093e-06, + "loss": 1.0897, + "step": 3480 + }, + { + "epoch": 0.7970234688036634, + "grad_norm": 1.7196766138076782, + "learning_rate": 2.1059577580917067e-06, + "loss": 1.0252, + "step": 3481 + }, + { + "epoch": 0.7972524327418432, + "grad_norm": 1.2323554754257202, + "learning_rate": 2.1014061141409715e-06, + "loss": 1.0197, + "step": 3482 + }, + { + "epoch": 0.7974813966800229, + "grad_norm": 1.1643589735031128, + "learning_rate": 2.0968588167202265e-06, + "loss": 1.023, + "step": 3483 + }, + { + "epoch": 0.7977103606182027, + "grad_norm": 1.2324453592300415, + "learning_rate": 2.0923158683318157e-06, + "loss": 0.9976, + "step": 3484 + }, + { + "epoch": 0.7979393245563824, + "grad_norm": 1.3599796295166016, + "learning_rate": 2.087777271475684e-06, + "loss": 1.0451, + "step": 3485 + }, + { + "epoch": 0.7981682884945621, + "grad_norm": 1.2370396852493286, + "learning_rate": 2.0832430286493834e-06, + "loss": 0.9734, + "step": 3486 + }, + { + "epoch": 0.7983972524327418, + "grad_norm": 1.3840800523757935, + "learning_rate": 2.0787131423480722e-06, + "loss": 0.9833, + "step": 3487 + }, + { + "epoch": 0.7986262163709216, + "grad_norm": 1.1826318502426147, + "learning_rate": 2.0741876150645025e-06, + "loss": 0.9921, + "step": 3488 + }, + { + "epoch": 0.7988551803091013, + "grad_norm": 1.1588865518569946, + "learning_rate": 2.0696664492890394e-06, + "loss": 1.0858, + "step": 3489 + }, + { + "epoch": 0.799084144247281, + "grad_norm": 1.4045047760009766, + "learning_rate": 2.0651496475096455e-06, + "loss": 1.0409, + "step": 3490 + }, + { + "epoch": 0.7993131081854608, + "grad_norm": 1.2877663373947144, + "learning_rate": 2.060637212211869e-06, + "loss": 0.9848, + "step": 3491 + }, + { + "epoch": 0.7995420721236405, + "grad_norm": 1.3673158884048462, + "learning_rate": 2.0561291458788736e-06, + "loss": 0.9969, + "step": 3492 + }, + { + "epoch": 0.7997710360618203, + "grad_norm": 1.3965160846710205, + "learning_rate": 2.0516254509914103e-06, + "loss": 1.0477, + "step": 3493 + }, + { + "epoch": 0.8, + "grad_norm": 1.1263400316238403, + "learning_rate": 2.047126130027819e-06, + "loss": 1.0591, + "step": 3494 + }, + { + "epoch": 0.8002289639381798, + "grad_norm": 1.2847234010696411, + "learning_rate": 2.042631185464039e-06, + "loss": 0.9993, + "step": 3495 + }, + { + "epoch": 0.8004579278763595, + "grad_norm": 2.184096336364746, + "learning_rate": 2.038140619773609e-06, + "loss": 1.0074, + "step": 3496 + }, + { + "epoch": 0.8006868918145392, + "grad_norm": 1.3483428955078125, + "learning_rate": 2.03365443542764e-06, + "loss": 1.0989, + "step": 3497 + }, + { + "epoch": 0.800915855752719, + "grad_norm": 1.5538264513015747, + "learning_rate": 2.029172634894846e-06, + "loss": 1.0901, + "step": 3498 + }, + { + "epoch": 0.8011448196908987, + "grad_norm": 1.2614027261734009, + "learning_rate": 2.024695220641524e-06, + "loss": 1.0144, + "step": 3499 + }, + { + "epoch": 0.8013737836290784, + "grad_norm": 1.2823106050491333, + "learning_rate": 2.020222195131556e-06, + "loss": 0.9797, + "step": 3500 + }, + { + "epoch": 0.8016027475672581, + "grad_norm": 1.257866382598877, + "learning_rate": 2.0157535608264123e-06, + "loss": 1.0476, + "step": 3501 + }, + { + "epoch": 0.8018317115054379, + "grad_norm": 1.226118803024292, + "learning_rate": 2.0112893201851435e-06, + "loss": 0.9959, + "step": 3502 + }, + { + "epoch": 0.8020606754436176, + "grad_norm": 1.5508838891983032, + "learning_rate": 2.0068294756643846e-06, + "loss": 0.9996, + "step": 3503 + }, + { + "epoch": 0.8022896393817973, + "grad_norm": 1.1606311798095703, + "learning_rate": 2.0023740297183536e-06, + "loss": 1.0322, + "step": 3504 + }, + { + "epoch": 0.8025186033199772, + "grad_norm": 1.516457200050354, + "learning_rate": 1.997922984798836e-06, + "loss": 1.0247, + "step": 3505 + }, + { + "epoch": 0.8027475672581569, + "grad_norm": 2.4328088760375977, + "learning_rate": 1.993476343355213e-06, + "loss": 1.0883, + "step": 3506 + }, + { + "epoch": 0.8029765311963366, + "grad_norm": 1.4516451358795166, + "learning_rate": 1.9890341078344343e-06, + "loss": 1.0693, + "step": 3507 + }, + { + "epoch": 0.8032054951345163, + "grad_norm": 1.3910856246948242, + "learning_rate": 1.9845962806810205e-06, + "loss": 1.0984, + "step": 3508 + }, + { + "epoch": 0.8034344590726961, + "grad_norm": 1.184085488319397, + "learning_rate": 1.980162864337071e-06, + "loss": 1.0698, + "step": 3509 + }, + { + "epoch": 0.8036634230108758, + "grad_norm": 1.3776863813400269, + "learning_rate": 1.9757338612422594e-06, + "loss": 0.9953, + "step": 3510 + }, + { + "epoch": 0.8038923869490555, + "grad_norm": 1.102057933807373, + "learning_rate": 1.971309273833828e-06, + "loss": 1.0835, + "step": 3511 + }, + { + "epoch": 0.8041213508872352, + "grad_norm": 1.2618690729141235, + "learning_rate": 1.966889104546591e-06, + "loss": 1.0462, + "step": 3512 + }, + { + "epoch": 0.804350314825415, + "grad_norm": 1.2333935499191284, + "learning_rate": 1.9624733558129304e-06, + "loss": 1.0208, + "step": 3513 + }, + { + "epoch": 0.8045792787635947, + "grad_norm": 1.2381764650344849, + "learning_rate": 1.958062030062795e-06, + "loss": 1.0503, + "step": 3514 + }, + { + "epoch": 0.8048082427017744, + "grad_norm": 1.2884087562561035, + "learning_rate": 1.9536551297237018e-06, + "loss": 1.0545, + "step": 3515 + }, + { + "epoch": 0.8050372066399543, + "grad_norm": 1.294965147972107, + "learning_rate": 1.9492526572207294e-06, + "loss": 1.0354, + "step": 3516 + }, + { + "epoch": 0.805266170578134, + "grad_norm": 1.5084460973739624, + "learning_rate": 1.944854614976521e-06, + "loss": 0.9814, + "step": 3517 + }, + { + "epoch": 0.8054951345163137, + "grad_norm": 1.1529735326766968, + "learning_rate": 1.940461005411288e-06, + "loss": 1.0445, + "step": 3518 + }, + { + "epoch": 0.8057240984544934, + "grad_norm": 1.3352361917495728, + "learning_rate": 1.9360718309427863e-06, + "loss": 0.9968, + "step": 3519 + }, + { + "epoch": 0.8059530623926732, + "grad_norm": 1.3624452352523804, + "learning_rate": 1.931687093986354e-06, + "loss": 1.0275, + "step": 3520 + }, + { + "epoch": 0.8061820263308529, + "grad_norm": 1.2383012771606445, + "learning_rate": 1.9273067969548664e-06, + "loss": 1.0416, + "step": 3521 + }, + { + "epoch": 0.8064109902690326, + "grad_norm": 1.340103030204773, + "learning_rate": 1.922930942258766e-06, + "loss": 1.0579, + "step": 3522 + }, + { + "epoch": 0.8066399542072124, + "grad_norm": 2.9526262283325195, + "learning_rate": 1.918559532306051e-06, + "loss": 1.0087, + "step": 3523 + }, + { + "epoch": 0.8068689181453921, + "grad_norm": 1.7514522075653076, + "learning_rate": 1.91419256950227e-06, + "loss": 1.0383, + "step": 3524 + }, + { + "epoch": 0.8070978820835718, + "grad_norm": 1.2114307880401611, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.9728, + "step": 3525 + }, + { + "epoch": 0.8073268460217515, + "grad_norm": 1.102329969406128, + "learning_rate": 1.9054719949514756e-06, + "loss": 1.0953, + "step": 3526 + }, + { + "epoch": 0.8075558099599313, + "grad_norm": 1.2721331119537354, + "learning_rate": 1.9011183880033203e-06, + "loss": 1.0068, + "step": 3527 + }, + { + "epoch": 0.8077847738981111, + "grad_norm": 1.7620582580566406, + "learning_rate": 1.8967692378018155e-06, + "loss": 0.9938, + "step": 3528 + }, + { + "epoch": 0.8080137378362908, + "grad_norm": 1.253893494606018, + "learning_rate": 1.8924245467402612e-06, + "loss": 1.0459, + "step": 3529 + }, + { + "epoch": 0.8082427017744706, + "grad_norm": 1.549721360206604, + "learning_rate": 1.8880843172095066e-06, + "loss": 1.0437, + "step": 3530 + }, + { + "epoch": 0.8084716657126503, + "grad_norm": 1.4003283977508545, + "learning_rate": 1.8837485515979425e-06, + "loss": 1.0505, + "step": 3531 + }, + { + "epoch": 0.80870062965083, + "grad_norm": 1.227433681488037, + "learning_rate": 1.8794172522915022e-06, + "loss": 1.0111, + "step": 3532 + }, + { + "epoch": 0.8089295935890097, + "grad_norm": 1.263609528541565, + "learning_rate": 1.875090421673662e-06, + "loss": 1.0436, + "step": 3533 + }, + { + "epoch": 0.8091585575271895, + "grad_norm": 1.4125272035598755, + "learning_rate": 1.8707680621254487e-06, + "loss": 1.061, + "step": 3534 + }, + { + "epoch": 0.8093875214653692, + "grad_norm": 1.1441245079040527, + "learning_rate": 1.8664501760254128e-06, + "loss": 1.0341, + "step": 3535 + }, + { + "epoch": 0.8096164854035489, + "grad_norm": 1.3769537210464478, + "learning_rate": 1.8621367657496504e-06, + "loss": 1.0348, + "step": 3536 + }, + { + "epoch": 0.8098454493417286, + "grad_norm": 1.3794772624969482, + "learning_rate": 1.8578278336718037e-06, + "loss": 1.0156, + "step": 3537 + }, + { + "epoch": 0.8100744132799084, + "grad_norm": 1.4838491678237915, + "learning_rate": 1.8535233821630338e-06, + "loss": 0.9982, + "step": 3538 + }, + { + "epoch": 0.8103033772180882, + "grad_norm": 1.272406816482544, + "learning_rate": 1.849223413592046e-06, + "loss": 1.0323, + "step": 3539 + }, + { + "epoch": 0.8105323411562679, + "grad_norm": 1.1917959451675415, + "learning_rate": 1.8449279303250777e-06, + "loss": 1.0565, + "step": 3540 + }, + { + "epoch": 0.8107613050944477, + "grad_norm": 1.2484959363937378, + "learning_rate": 1.8406369347258968e-06, + "loss": 1.0529, + "step": 3541 + }, + { + "epoch": 0.8109902690326274, + "grad_norm": 1.6294903755187988, + "learning_rate": 1.8363504291558053e-06, + "loss": 1.0013, + "step": 3542 + }, + { + "epoch": 0.8112192329708071, + "grad_norm": 1.355380654335022, + "learning_rate": 1.8320684159736236e-06, + "loss": 0.9698, + "step": 3543 + }, + { + "epoch": 0.8114481969089868, + "grad_norm": 1.0993623733520508, + "learning_rate": 1.827790897535715e-06, + "loss": 1.0257, + "step": 3544 + }, + { + "epoch": 0.8116771608471666, + "grad_norm": 1.8703913688659668, + "learning_rate": 1.8235178761959626e-06, + "loss": 1.0786, + "step": 3545 + }, + { + "epoch": 0.8119061247853463, + "grad_norm": 1.225956678390503, + "learning_rate": 1.8192493543057676e-06, + "loss": 1.0386, + "step": 3546 + }, + { + "epoch": 0.812135088723526, + "grad_norm": 1.302952527999878, + "learning_rate": 1.8149853342140644e-06, + "loss": 1.011, + "step": 3547 + }, + { + "epoch": 0.8123640526617057, + "grad_norm": 1.4825295209884644, + "learning_rate": 1.8107258182673127e-06, + "loss": 1.0508, + "step": 3548 + }, + { + "epoch": 0.8125930165998855, + "grad_norm": 1.4160746335983276, + "learning_rate": 1.8064708088094829e-06, + "loss": 1.0045, + "step": 3549 + }, + { + "epoch": 0.8128219805380652, + "grad_norm": 1.3948523998260498, + "learning_rate": 1.802220308182071e-06, + "loss": 1.0562, + "step": 3550 + }, + { + "epoch": 0.813050944476245, + "grad_norm": 1.3531475067138672, + "learning_rate": 1.797974318724094e-06, + "loss": 0.9929, + "step": 3551 + }, + { + "epoch": 0.8132799084144248, + "grad_norm": 1.301820993423462, + "learning_rate": 1.7937328427720834e-06, + "loss": 1.0954, + "step": 3552 + }, + { + "epoch": 0.8135088723526045, + "grad_norm": 1.1766568422317505, + "learning_rate": 1.7894958826600884e-06, + "loss": 1.0252, + "step": 3553 + }, + { + "epoch": 0.8137378362907842, + "grad_norm": 1.2368574142456055, + "learning_rate": 1.7852634407196723e-06, + "loss": 0.9867, + "step": 3554 + }, + { + "epoch": 0.813966800228964, + "grad_norm": 1.2918037176132202, + "learning_rate": 1.7810355192799122e-06, + "loss": 1.0437, + "step": 3555 + }, + { + "epoch": 0.8141957641671437, + "grad_norm": 1.3767991065979004, + "learning_rate": 1.7768121206674006e-06, + "loss": 1.0281, + "step": 3556 + }, + { + "epoch": 0.8144247281053234, + "grad_norm": 1.165218472480774, + "learning_rate": 1.7725932472062302e-06, + "loss": 0.9958, + "step": 3557 + }, + { + "epoch": 0.8146536920435031, + "grad_norm": 1.1615934371948242, + "learning_rate": 1.7683789012180196e-06, + "loss": 1.0294, + "step": 3558 + }, + { + "epoch": 0.8148826559816829, + "grad_norm": 1.3397858142852783, + "learning_rate": 1.7641690850218884e-06, + "loss": 1.0477, + "step": 3559 + }, + { + "epoch": 0.8151116199198626, + "grad_norm": 1.2796862125396729, + "learning_rate": 1.7599638009344566e-06, + "loss": 0.9805, + "step": 3560 + }, + { + "epoch": 0.8153405838580423, + "grad_norm": 1.1992462873458862, + "learning_rate": 1.7557630512698642e-06, + "loss": 1.0074, + "step": 3561 + }, + { + "epoch": 0.8155695477962221, + "grad_norm": 1.330796480178833, + "learning_rate": 1.7515668383397433e-06, + "loss": 1.0369, + "step": 3562 + }, + { + "epoch": 0.8157985117344019, + "grad_norm": 1.7938005924224854, + "learning_rate": 1.7473751644532366e-06, + "loss": 1.0118, + "step": 3563 + }, + { + "epoch": 0.8160274756725816, + "grad_norm": 1.2056411504745483, + "learning_rate": 1.7431880319169858e-06, + "loss": 1.0215, + "step": 3564 + }, + { + "epoch": 0.8162564396107613, + "grad_norm": 1.4476250410079956, + "learning_rate": 1.7390054430351366e-06, + "loss": 0.9641, + "step": 3565 + }, + { + "epoch": 0.8164854035489411, + "grad_norm": 1.1308528184890747, + "learning_rate": 1.7348274001093324e-06, + "loss": 1.0338, + "step": 3566 + }, + { + "epoch": 0.8167143674871208, + "grad_norm": 1.440428614616394, + "learning_rate": 1.730653905438714e-06, + "loss": 1.0503, + "step": 3567 + }, + { + "epoch": 0.8169433314253005, + "grad_norm": 1.2121968269348145, + "learning_rate": 1.7264849613199208e-06, + "loss": 1.0444, + "step": 3568 + }, + { + "epoch": 0.8171722953634802, + "grad_norm": 1.3329687118530273, + "learning_rate": 1.722320570047089e-06, + "loss": 1.0456, + "step": 3569 + }, + { + "epoch": 0.81740125930166, + "grad_norm": 1.237636923789978, + "learning_rate": 1.7181607339118488e-06, + "loss": 1.0376, + "step": 3570 + }, + { + "epoch": 0.8176302232398397, + "grad_norm": 7.456436634063721, + "learning_rate": 1.714005455203318e-06, + "loss": 1.0065, + "step": 3571 + }, + { + "epoch": 0.8178591871780194, + "grad_norm": 1.594241976737976, + "learning_rate": 1.7098547362081197e-06, + "loss": 0.9686, + "step": 3572 + }, + { + "epoch": 0.8180881511161991, + "grad_norm": 1.2208811044692993, + "learning_rate": 1.7057085792103534e-06, + "loss": 0.965, + "step": 3573 + }, + { + "epoch": 0.818317115054379, + "grad_norm": 1.1306706666946411, + "learning_rate": 1.701566986491614e-06, + "loss": 1.0411, + "step": 3574 + }, + { + "epoch": 0.8185460789925587, + "grad_norm": 1.4703606367111206, + "learning_rate": 1.697429960330993e-06, + "loss": 1.063, + "step": 3575 + }, + { + "epoch": 0.8187750429307384, + "grad_norm": 1.7712504863739014, + "learning_rate": 1.6932975030050524e-06, + "loss": 1.03, + "step": 3576 + }, + { + "epoch": 0.8190040068689182, + "grad_norm": 1.5078486204147339, + "learning_rate": 1.6891696167878535e-06, + "loss": 1.0125, + "step": 3577 + }, + { + "epoch": 0.8192329708070979, + "grad_norm": 1.5069183111190796, + "learning_rate": 1.6850463039509356e-06, + "loss": 1.0039, + "step": 3578 + }, + { + "epoch": 0.8194619347452776, + "grad_norm": 1.3289002180099487, + "learning_rate": 1.680927566763325e-06, + "loss": 1.0696, + "step": 3579 + }, + { + "epoch": 0.8196908986834573, + "grad_norm": 1.418169617652893, + "learning_rate": 1.6768134074915277e-06, + "loss": 1.0456, + "step": 3580 + }, + { + "epoch": 0.8199198626216371, + "grad_norm": 1.5103626251220703, + "learning_rate": 1.672703828399529e-06, + "loss": 1.0815, + "step": 3581 + }, + { + "epoch": 0.8201488265598168, + "grad_norm": 1.2844433784484863, + "learning_rate": 1.6685988317487988e-06, + "loss": 1.0614, + "step": 3582 + }, + { + "epoch": 0.8203777904979965, + "grad_norm": 1.187903881072998, + "learning_rate": 1.6644984197982828e-06, + "loss": 1.0207, + "step": 3583 + }, + { + "epoch": 0.8206067544361763, + "grad_norm": 1.2992799282073975, + "learning_rate": 1.6604025948043966e-06, + "loss": 1.0443, + "step": 3584 + }, + { + "epoch": 0.8208357183743561, + "grad_norm": 1.6380910873413086, + "learning_rate": 1.6563113590210455e-06, + "loss": 1.112, + "step": 3585 + }, + { + "epoch": 0.8210646823125358, + "grad_norm": 1.1910545825958252, + "learning_rate": 1.652224714699603e-06, + "loss": 1.0636, + "step": 3586 + }, + { + "epoch": 0.8212936462507155, + "grad_norm": 1.3377147912979126, + "learning_rate": 1.6481426640889098e-06, + "loss": 1.0045, + "step": 3587 + }, + { + "epoch": 0.8215226101888953, + "grad_norm": 1.2766714096069336, + "learning_rate": 1.6440652094352838e-06, + "loss": 1.017, + "step": 3588 + }, + { + "epoch": 0.821751574127075, + "grad_norm": 1.1633610725402832, + "learning_rate": 1.6399923529825213e-06, + "loss": 1.0152, + "step": 3589 + }, + { + "epoch": 0.8219805380652547, + "grad_norm": 1.4114861488342285, + "learning_rate": 1.6359240969718748e-06, + "loss": 1.0769, + "step": 3590 + }, + { + "epoch": 0.8222095020034345, + "grad_norm": 1.3905683755874634, + "learning_rate": 1.6318604436420738e-06, + "loss": 1.0795, + "step": 3591 + }, + { + "epoch": 0.8224384659416142, + "grad_norm": 1.3522437810897827, + "learning_rate": 1.6278013952293115e-06, + "loss": 1.0389, + "step": 3592 + }, + { + "epoch": 0.8226674298797939, + "grad_norm": 1.1281168460845947, + "learning_rate": 1.6237469539672479e-06, + "loss": 1.0212, + "step": 3593 + }, + { + "epoch": 0.8228963938179736, + "grad_norm": 1.1827605962753296, + "learning_rate": 1.6196971220870105e-06, + "loss": 0.9905, + "step": 3594 + }, + { + "epoch": 0.8231253577561534, + "grad_norm": 1.626893401145935, + "learning_rate": 1.6156519018171856e-06, + "loss": 1.0374, + "step": 3595 + }, + { + "epoch": 0.8233543216943331, + "grad_norm": 1.1740373373031616, + "learning_rate": 1.6116112953838247e-06, + "loss": 1.0092, + "step": 3596 + }, + { + "epoch": 0.8235832856325129, + "grad_norm": 1.1388554573059082, + "learning_rate": 1.6075753050104426e-06, + "loss": 1.0141, + "step": 3597 + }, + { + "epoch": 0.8238122495706927, + "grad_norm": 1.3180172443389893, + "learning_rate": 1.6035439329180025e-06, + "loss": 1.0853, + "step": 3598 + }, + { + "epoch": 0.8240412135088724, + "grad_norm": 1.3404852151870728, + "learning_rate": 1.5995171813249433e-06, + "loss": 1.0734, + "step": 3599 + }, + { + "epoch": 0.8242701774470521, + "grad_norm": 1.2810182571411133, + "learning_rate": 1.5954950524471513e-06, + "loss": 1.0471, + "step": 3600 + }, + { + "epoch": 0.8244991413852318, + "grad_norm": 1.1049981117248535, + "learning_rate": 1.591477548497966e-06, + "loss": 1.0101, + "step": 3601 + }, + { + "epoch": 0.8247281053234116, + "grad_norm": 1.3329854011535645, + "learning_rate": 1.587464671688187e-06, + "loss": 1.058, + "step": 3602 + }, + { + "epoch": 0.8249570692615913, + "grad_norm": 1.1104480028152466, + "learning_rate": 1.583456424226073e-06, + "loss": 1.0087, + "step": 3603 + }, + { + "epoch": 0.825186033199771, + "grad_norm": 1.2844616174697876, + "learning_rate": 1.5794528083173223e-06, + "loss": 1.0039, + "step": 3604 + }, + { + "epoch": 0.8254149971379507, + "grad_norm": 1.207993984222412, + "learning_rate": 1.575453826165093e-06, + "loss": 1.0599, + "step": 3605 + }, + { + "epoch": 0.8256439610761305, + "grad_norm": 1.4137972593307495, + "learning_rate": 1.5714594799699912e-06, + "loss": 1.0705, + "step": 3606 + }, + { + "epoch": 0.8258729250143102, + "grad_norm": 1.3582916259765625, + "learning_rate": 1.5674697719300735e-06, + "loss": 1.0247, + "step": 3607 + }, + { + "epoch": 0.82610188895249, + "grad_norm": 1.240177869796753, + "learning_rate": 1.5634847042408408e-06, + "loss": 1.0207, + "step": 3608 + }, + { + "epoch": 0.8263308528906698, + "grad_norm": 1.2895662784576416, + "learning_rate": 1.5595042790952442e-06, + "loss": 0.9647, + "step": 3609 + }, + { + "epoch": 0.8265598168288495, + "grad_norm": 1.8447550535202026, + "learning_rate": 1.5555284986836782e-06, + "loss": 1.0683, + "step": 3610 + }, + { + "epoch": 0.8267887807670292, + "grad_norm": 1.2016741037368774, + "learning_rate": 1.551557365193983e-06, + "loss": 1.0238, + "step": 3611 + }, + { + "epoch": 0.827017744705209, + "grad_norm": 1.8298025131225586, + "learning_rate": 1.5475908808114325e-06, + "loss": 1.0688, + "step": 3612 + }, + { + "epoch": 0.8272467086433887, + "grad_norm": 1.6220577955245972, + "learning_rate": 1.5436290477187589e-06, + "loss": 1.0334, + "step": 3613 + }, + { + "epoch": 0.8274756725815684, + "grad_norm": 1.505828619003296, + "learning_rate": 1.539671868096123e-06, + "loss": 1.0641, + "step": 3614 + }, + { + "epoch": 0.8277046365197481, + "grad_norm": 1.131373643875122, + "learning_rate": 1.535719344121125e-06, + "loss": 1.0156, + "step": 3615 + }, + { + "epoch": 0.8279336004579279, + "grad_norm": 1.5130774974822998, + "learning_rate": 1.5317714779688076e-06, + "loss": 1.104, + "step": 3616 + }, + { + "epoch": 0.8281625643961076, + "grad_norm": 1.2293245792388916, + "learning_rate": 1.5278282718116477e-06, + "loss": 1.0824, + "step": 3617 + }, + { + "epoch": 0.8283915283342873, + "grad_norm": 1.3308606147766113, + "learning_rate": 1.5238897278195597e-06, + "loss": 1.0208, + "step": 3618 + }, + { + "epoch": 0.828620492272467, + "grad_norm": 1.5616285800933838, + "learning_rate": 1.5199558481598908e-06, + "loss": 1.0327, + "step": 3619 + }, + { + "epoch": 0.8288494562106469, + "grad_norm": 1.1585637331008911, + "learning_rate": 1.5160266349974207e-06, + "loss": 0.9957, + "step": 3620 + }, + { + "epoch": 0.8290784201488266, + "grad_norm": 1.2559548616409302, + "learning_rate": 1.5121020904943651e-06, + "loss": 1.0655, + "step": 3621 + }, + { + "epoch": 0.8293073840870063, + "grad_norm": 1.3206498622894287, + "learning_rate": 1.5081822168103654e-06, + "loss": 1.0465, + "step": 3622 + }, + { + "epoch": 0.829536348025186, + "grad_norm": 1.3651444911956787, + "learning_rate": 1.5042670161024975e-06, + "loss": 1.0738, + "step": 3623 + }, + { + "epoch": 0.8297653119633658, + "grad_norm": 1.3507568836212158, + "learning_rate": 1.500356490525261e-06, + "loss": 1.0927, + "step": 3624 + }, + { + "epoch": 0.8299942759015455, + "grad_norm": 1.1345727443695068, + "learning_rate": 1.4964506422305902e-06, + "loss": 1.0417, + "step": 3625 + }, + { + "epoch": 0.8302232398397252, + "grad_norm": 1.2418663501739502, + "learning_rate": 1.4925494733678324e-06, + "loss": 1.0467, + "step": 3626 + }, + { + "epoch": 0.830452203777905, + "grad_norm": 1.6180311441421509, + "learning_rate": 1.4886529860837772e-06, + "loss": 1.0142, + "step": 3627 + }, + { + "epoch": 0.8306811677160847, + "grad_norm": 1.9630390405654907, + "learning_rate": 1.4847611825226227e-06, + "loss": 1.1022, + "step": 3628 + }, + { + "epoch": 0.8309101316542644, + "grad_norm": 1.488043189048767, + "learning_rate": 1.4808740648259967e-06, + "loss": 1.0521, + "step": 3629 + }, + { + "epoch": 0.8311390955924441, + "grad_norm": 1.2796021699905396, + "learning_rate": 1.4769916351329495e-06, + "loss": 1.0249, + "step": 3630 + }, + { + "epoch": 0.831368059530624, + "grad_norm": 1.1457350254058838, + "learning_rate": 1.4731138955799474e-06, + "loss": 1.1018, + "step": 3631 + }, + { + "epoch": 0.8315970234688037, + "grad_norm": 1.1872814893722534, + "learning_rate": 1.4692408483008803e-06, + "loss": 0.9932, + "step": 3632 + }, + { + "epoch": 0.8318259874069834, + "grad_norm": 1.2328518629074097, + "learning_rate": 1.465372495427052e-06, + "loss": 0.9768, + "step": 3633 + }, + { + "epoch": 0.8320549513451632, + "grad_norm": 1.285885214805603, + "learning_rate": 1.4615088390871846e-06, + "loss": 1.0506, + "step": 3634 + }, + { + "epoch": 0.8322839152833429, + "grad_norm": 1.0765000581741333, + "learning_rate": 1.457649881407417e-06, + "loss": 1.084, + "step": 3635 + }, + { + "epoch": 0.8325128792215226, + "grad_norm": 1.3982456922531128, + "learning_rate": 1.4537956245113006e-06, + "loss": 0.9873, + "step": 3636 + }, + { + "epoch": 0.8327418431597023, + "grad_norm": 1.536367654800415, + "learning_rate": 1.4499460705198e-06, + "loss": 1.0396, + "step": 3637 + }, + { + "epoch": 0.8329708070978821, + "grad_norm": 1.5141808986663818, + "learning_rate": 1.446101221551296e-06, + "loss": 1.0472, + "step": 3638 + }, + { + "epoch": 0.8331997710360618, + "grad_norm": 1.16829252243042, + "learning_rate": 1.4422610797215707e-06, + "loss": 1.0189, + "step": 3639 + }, + { + "epoch": 0.8334287349742415, + "grad_norm": 1.6448733806610107, + "learning_rate": 1.4384256471438241e-06, + "loss": 1.0127, + "step": 3640 + }, + { + "epoch": 0.8336576989124213, + "grad_norm": 1.396384358406067, + "learning_rate": 1.4345949259286673e-06, + "loss": 1.0597, + "step": 3641 + }, + { + "epoch": 0.833886662850601, + "grad_norm": 1.3862338066101074, + "learning_rate": 1.4307689181841077e-06, + "loss": 1.0525, + "step": 3642 + }, + { + "epoch": 0.8341156267887808, + "grad_norm": 1.409687876701355, + "learning_rate": 1.4269476260155668e-06, + "loss": 1.0834, + "step": 3643 + }, + { + "epoch": 0.8343445907269605, + "grad_norm": 1.310608983039856, + "learning_rate": 1.4231310515258745e-06, + "loss": 1.0362, + "step": 3644 + }, + { + "epoch": 0.8345735546651403, + "grad_norm": 1.2230523824691772, + "learning_rate": 1.4193191968152543e-06, + "loss": 0.9915, + "step": 3645 + }, + { + "epoch": 0.83480251860332, + "grad_norm": 1.3876858949661255, + "learning_rate": 1.4155120639813392e-06, + "loss": 0.9453, + "step": 3646 + }, + { + "epoch": 0.8350314825414997, + "grad_norm": 1.223950743675232, + "learning_rate": 1.4117096551191633e-06, + "loss": 1.011, + "step": 3647 + }, + { + "epoch": 0.8352604464796795, + "grad_norm": 1.2806096076965332, + "learning_rate": 1.4079119723211599e-06, + "loss": 1.0136, + "step": 3648 + }, + { + "epoch": 0.8354894104178592, + "grad_norm": 1.2026371955871582, + "learning_rate": 1.4041190176771635e-06, + "loss": 0.9929, + "step": 3649 + }, + { + "epoch": 0.8357183743560389, + "grad_norm": 1.4160505533218384, + "learning_rate": 1.4003307932744003e-06, + "loss": 1.0733, + "step": 3650 + }, + { + "epoch": 0.8359473382942186, + "grad_norm": 1.2385631799697876, + "learning_rate": 1.396547301197504e-06, + "loss": 1.0789, + "step": 3651 + }, + { + "epoch": 0.8361763022323984, + "grad_norm": 1.875428557395935, + "learning_rate": 1.3927685435284977e-06, + "loss": 1.0522, + "step": 3652 + }, + { + "epoch": 0.8364052661705781, + "grad_norm": 1.4530951976776123, + "learning_rate": 1.388994522346796e-06, + "loss": 1.0203, + "step": 3653 + }, + { + "epoch": 0.8366342301087579, + "grad_norm": 1.4752883911132812, + "learning_rate": 1.3852252397292143e-06, + "loss": 0.9734, + "step": 3654 + }, + { + "epoch": 0.8368631940469377, + "grad_norm": 1.1157587766647339, + "learning_rate": 1.38146069774996e-06, + "loss": 0.9831, + "step": 3655 + }, + { + "epoch": 0.8370921579851174, + "grad_norm": 1.351131558418274, + "learning_rate": 1.377700898480624e-06, + "loss": 1.0292, + "step": 3656 + }, + { + "epoch": 0.8373211219232971, + "grad_norm": 1.5354865789413452, + "learning_rate": 1.373945843990192e-06, + "loss": 0.9554, + "step": 3657 + }, + { + "epoch": 0.8375500858614768, + "grad_norm": 1.1356061697006226, + "learning_rate": 1.3701955363450447e-06, + "loss": 1.0667, + "step": 3658 + }, + { + "epoch": 0.8377790497996566, + "grad_norm": 1.2410647869110107, + "learning_rate": 1.3664499776089401e-06, + "loss": 1.0254, + "step": 3659 + }, + { + "epoch": 0.8380080137378363, + "grad_norm": 1.2936856746673584, + "learning_rate": 1.3627091698430284e-06, + "loss": 0.9986, + "step": 3660 + }, + { + "epoch": 0.838236977676016, + "grad_norm": 1.256300687789917, + "learning_rate": 1.3589731151058461e-06, + "loss": 1.0607, + "step": 3661 + }, + { + "epoch": 0.8384659416141957, + "grad_norm": 1.4257123470306396, + "learning_rate": 1.355241815453312e-06, + "loss": 1.0198, + "step": 3662 + }, + { + "epoch": 0.8386949055523755, + "grad_norm": 1.3414442539215088, + "learning_rate": 1.3515152729387315e-06, + "loss": 1.045, + "step": 3663 + }, + { + "epoch": 0.8389238694905552, + "grad_norm": 1.2866592407226562, + "learning_rate": 1.347793489612782e-06, + "loss": 0.9788, + "step": 3664 + }, + { + "epoch": 0.8391528334287349, + "grad_norm": 2.8504953384399414, + "learning_rate": 1.3440764675235384e-06, + "loss": 0.9914, + "step": 3665 + }, + { + "epoch": 0.8393817973669148, + "grad_norm": 1.2402368783950806, + "learning_rate": 1.3403642087164447e-06, + "loss": 1.0267, + "step": 3666 + }, + { + "epoch": 0.8396107613050945, + "grad_norm": 1.9054492712020874, + "learning_rate": 1.33665671523432e-06, + "loss": 1.0689, + "step": 3667 + }, + { + "epoch": 0.8398397252432742, + "grad_norm": 1.5320415496826172, + "learning_rate": 1.332953989117377e-06, + "loss": 1.025, + "step": 3668 + }, + { + "epoch": 0.8400686891814539, + "grad_norm": 1.2526971101760864, + "learning_rate": 1.3292560324031867e-06, + "loss": 1.0578, + "step": 3669 + }, + { + "epoch": 0.8402976531196337, + "grad_norm": 1.3369795083999634, + "learning_rate": 1.3255628471267056e-06, + "loss": 1.0097, + "step": 3670 + }, + { + "epoch": 0.8405266170578134, + "grad_norm": 1.3616974353790283, + "learning_rate": 1.321874435320264e-06, + "loss": 1.0137, + "step": 3671 + }, + { + "epoch": 0.8407555809959931, + "grad_norm": 1.156179428100586, + "learning_rate": 1.3181907990135624e-06, + "loss": 1.0683, + "step": 3672 + }, + { + "epoch": 0.8409845449341729, + "grad_norm": 1.5307315587997437, + "learning_rate": 1.3145119402336758e-06, + "loss": 1.0354, + "step": 3673 + }, + { + "epoch": 0.8412135088723526, + "grad_norm": 1.272509217262268, + "learning_rate": 1.3108378610050498e-06, + "loss": 1.068, + "step": 3674 + }, + { + "epoch": 0.8414424728105323, + "grad_norm": 1.1754751205444336, + "learning_rate": 1.307168563349499e-06, + "loss": 1.0448, + "step": 3675 + }, + { + "epoch": 0.841671436748712, + "grad_norm": 1.483634352684021, + "learning_rate": 1.3035040492862061e-06, + "loss": 1.0705, + "step": 3676 + }, + { + "epoch": 0.8419004006868919, + "grad_norm": 1.1576098203659058, + "learning_rate": 1.2998443208317246e-06, + "loss": 1.1061, + "step": 3677 + }, + { + "epoch": 0.8421293646250716, + "grad_norm": 1.2855072021484375, + "learning_rate": 1.2961893799999703e-06, + "loss": 1.0096, + "step": 3678 + }, + { + "epoch": 0.8423583285632513, + "grad_norm": 1.4712800979614258, + "learning_rate": 1.2925392288022299e-06, + "loss": 0.9488, + "step": 3679 + }, + { + "epoch": 0.842587292501431, + "grad_norm": 1.539665937423706, + "learning_rate": 1.2888938692471464e-06, + "loss": 1.0338, + "step": 3680 + }, + { + "epoch": 0.8428162564396108, + "grad_norm": 1.1315950155258179, + "learning_rate": 1.28525330334073e-06, + "loss": 1.0255, + "step": 3681 + }, + { + "epoch": 0.8430452203777905, + "grad_norm": 1.3212192058563232, + "learning_rate": 1.2816175330863613e-06, + "loss": 0.9955, + "step": 3682 + }, + { + "epoch": 0.8432741843159702, + "grad_norm": 1.3658416271209717, + "learning_rate": 1.2779865604847674e-06, + "loss": 1.0136, + "step": 3683 + }, + { + "epoch": 0.84350314825415, + "grad_norm": 1.8101247549057007, + "learning_rate": 1.2743603875340426e-06, + "loss": 1.0044, + "step": 3684 + }, + { + "epoch": 0.8437321121923297, + "grad_norm": 1.4984769821166992, + "learning_rate": 1.270739016229642e-06, + "loss": 1.0216, + "step": 3685 + }, + { + "epoch": 0.8439610761305094, + "grad_norm": 1.2391111850738525, + "learning_rate": 1.267122448564374e-06, + "loss": 1.0306, + "step": 3686 + }, + { + "epoch": 0.8441900400686891, + "grad_norm": 1.167498230934143, + "learning_rate": 1.2635106865284063e-06, + "loss": 0.9658, + "step": 3687 + }, + { + "epoch": 0.8444190040068689, + "grad_norm": 1.7382392883300781, + "learning_rate": 1.259903732109261e-06, + "loss": 0.9892, + "step": 3688 + }, + { + "epoch": 0.8446479679450487, + "grad_norm": 1.2146246433258057, + "learning_rate": 1.256301587291815e-06, + "loss": 1.0496, + "step": 3689 + }, + { + "epoch": 0.8448769318832284, + "grad_norm": 1.7516138553619385, + "learning_rate": 1.2527042540583e-06, + "loss": 1.0987, + "step": 3690 + }, + { + "epoch": 0.8451058958214082, + "grad_norm": 1.5586003065109253, + "learning_rate": 1.249111734388292e-06, + "loss": 1.0333, + "step": 3691 + }, + { + "epoch": 0.8453348597595879, + "grad_norm": 1.4014748334884644, + "learning_rate": 1.2455240302587325e-06, + "loss": 1.0038, + "step": 3692 + }, + { + "epoch": 0.8455638236977676, + "grad_norm": 1.3031091690063477, + "learning_rate": 1.2419411436439021e-06, + "loss": 1.0135, + "step": 3693 + }, + { + "epoch": 0.8457927876359473, + "grad_norm": 1.1367318630218506, + "learning_rate": 1.238363076515432e-06, + "loss": 1.0154, + "step": 3694 + }, + { + "epoch": 0.8460217515741271, + "grad_norm": 1.4096239805221558, + "learning_rate": 1.2347898308423012e-06, + "loss": 1.0172, + "step": 3695 + }, + { + "epoch": 0.8462507155123068, + "grad_norm": 1.2547544240951538, + "learning_rate": 1.2312214085908424e-06, + "loss": 0.9901, + "step": 3696 + }, + { + "epoch": 0.8464796794504865, + "grad_norm": 1.4089115858078003, + "learning_rate": 1.227657811724723e-06, + "loss": 1.0461, + "step": 3697 + }, + { + "epoch": 0.8467086433886662, + "grad_norm": 1.6553548574447632, + "learning_rate": 1.2240990422049625e-06, + "loss": 1.0419, + "step": 3698 + }, + { + "epoch": 0.846937607326846, + "grad_norm": 1.4773091077804565, + "learning_rate": 1.2205451019899217e-06, + "loss": 1.0423, + "step": 3699 + }, + { + "epoch": 0.8471665712650257, + "grad_norm": 1.2673395872116089, + "learning_rate": 1.2169959930353049e-06, + "loss": 1.0627, + "step": 3700 + }, + { + "epoch": 0.8473955352032055, + "grad_norm": 1.941409707069397, + "learning_rate": 1.2134517172941563e-06, + "loss": 0.9947, + "step": 3701 + }, + { + "epoch": 0.8476244991413853, + "grad_norm": 2.137348175048828, + "learning_rate": 1.2099122767168602e-06, + "loss": 1.009, + "step": 3702 + }, + { + "epoch": 0.847853463079565, + "grad_norm": 1.4081718921661377, + "learning_rate": 1.2063776732511434e-06, + "loss": 0.9911, + "step": 3703 + }, + { + "epoch": 0.8480824270177447, + "grad_norm": 1.2987362146377563, + "learning_rate": 1.2028479088420686e-06, + "loss": 1.0329, + "step": 3704 + }, + { + "epoch": 0.8483113909559244, + "grad_norm": 1.554642677307129, + "learning_rate": 1.19932298543203e-06, + "loss": 1.0258, + "step": 3705 + }, + { + "epoch": 0.8485403548941042, + "grad_norm": 1.3535888195037842, + "learning_rate": 1.19580290496077e-06, + "loss": 1.0652, + "step": 3706 + }, + { + "epoch": 0.8487693188322839, + "grad_norm": 1.258093237876892, + "learning_rate": 1.1922876693653584e-06, + "loss": 1.0323, + "step": 3707 + }, + { + "epoch": 0.8489982827704636, + "grad_norm": 1.1866981983184814, + "learning_rate": 1.1887772805801967e-06, + "loss": 1.0644, + "step": 3708 + }, + { + "epoch": 0.8492272467086434, + "grad_norm": 1.5237160921096802, + "learning_rate": 1.1852717405370228e-06, + "loss": 0.9952, + "step": 3709 + }, + { + "epoch": 0.8494562106468231, + "grad_norm": 1.3863621950149536, + "learning_rate": 1.1817710511649105e-06, + "loss": 0.9737, + "step": 3710 + }, + { + "epoch": 0.8496851745850028, + "grad_norm": 1.380892038345337, + "learning_rate": 1.1782752143902553e-06, + "loss": 0.9958, + "step": 3711 + }, + { + "epoch": 0.8499141385231826, + "grad_norm": 1.3540898561477661, + "learning_rate": 1.1747842321367886e-06, + "loss": 1.0295, + "step": 3712 + }, + { + "epoch": 0.8501431024613624, + "grad_norm": 1.2603505849838257, + "learning_rate": 1.171298106325568e-06, + "loss": 1.0327, + "step": 3713 + }, + { + "epoch": 0.8503720663995421, + "grad_norm": 1.7046961784362793, + "learning_rate": 1.1678168388749788e-06, + "loss": 1.0873, + "step": 3714 + }, + { + "epoch": 0.8506010303377218, + "grad_norm": 1.4969935417175293, + "learning_rate": 1.1643404317007345e-06, + "loss": 1.0087, + "step": 3715 + }, + { + "epoch": 0.8508299942759016, + "grad_norm": 1.0810832977294922, + "learning_rate": 1.1608688867158724e-06, + "loss": 0.9596, + "step": 3716 + }, + { + "epoch": 0.8510589582140813, + "grad_norm": 1.7231873273849487, + "learning_rate": 1.1574022058307555e-06, + "loss": 1.0446, + "step": 3717 + }, + { + "epoch": 0.851287922152261, + "grad_norm": 1.4240765571594238, + "learning_rate": 1.1539403909530688e-06, + "loss": 1.0087, + "step": 3718 + }, + { + "epoch": 0.8515168860904407, + "grad_norm": 1.1195824146270752, + "learning_rate": 1.1504834439878166e-06, + "loss": 1.006, + "step": 3719 + }, + { + "epoch": 0.8517458500286205, + "grad_norm": 1.3876497745513916, + "learning_rate": 1.1470313668373324e-06, + "loss": 1.0934, + "step": 3720 + }, + { + "epoch": 0.8519748139668002, + "grad_norm": 1.3672479391098022, + "learning_rate": 1.1435841614012666e-06, + "loss": 1.0914, + "step": 3721 + }, + { + "epoch": 0.8522037779049799, + "grad_norm": 1.1950000524520874, + "learning_rate": 1.140141829576582e-06, + "loss": 1.0035, + "step": 3722 + }, + { + "epoch": 0.8524327418431596, + "grad_norm": 1.6234666109085083, + "learning_rate": 1.1367043732575666e-06, + "loss": 1.0398, + "step": 3723 + }, + { + "epoch": 0.8526617057813395, + "grad_norm": 1.2520289421081543, + "learning_rate": 1.1332717943358263e-06, + "loss": 0.9795, + "step": 3724 + }, + { + "epoch": 0.8528906697195192, + "grad_norm": 1.3402252197265625, + "learning_rate": 1.1298440947002775e-06, + "loss": 1.0576, + "step": 3725 + }, + { + "epoch": 0.8531196336576989, + "grad_norm": 1.3803917169570923, + "learning_rate": 1.1264212762371563e-06, + "loss": 1.0388, + "step": 3726 + }, + { + "epoch": 0.8533485975958787, + "grad_norm": 1.1965564489364624, + "learning_rate": 1.1230033408300111e-06, + "loss": 1.0416, + "step": 3727 + }, + { + "epoch": 0.8535775615340584, + "grad_norm": 1.4472531080245972, + "learning_rate": 1.1195902903597023e-06, + "loss": 1.0545, + "step": 3728 + }, + { + "epoch": 0.8538065254722381, + "grad_norm": 2.9633491039276123, + "learning_rate": 1.1161821267044038e-06, + "loss": 1.0308, + "step": 3729 + }, + { + "epoch": 0.8540354894104178, + "grad_norm": 3.640618324279785, + "learning_rate": 1.1127788517395987e-06, + "loss": 0.9939, + "step": 3730 + }, + { + "epoch": 0.8542644533485976, + "grad_norm": 1.3619310855865479, + "learning_rate": 1.1093804673380804e-06, + "loss": 1.0465, + "step": 3731 + }, + { + "epoch": 0.8544934172867773, + "grad_norm": 1.3596311807632446, + "learning_rate": 1.1059869753699547e-06, + "loss": 1.0179, + "step": 3732 + }, + { + "epoch": 0.854722381224957, + "grad_norm": 1.5578163862228394, + "learning_rate": 1.102598377702625e-06, + "loss": 0.9975, + "step": 3733 + }, + { + "epoch": 0.8549513451631368, + "grad_norm": 1.3997864723205566, + "learning_rate": 1.099214676200816e-06, + "loss": 1.023, + "step": 3734 + }, + { + "epoch": 0.8551803091013166, + "grad_norm": 1.4430853128433228, + "learning_rate": 1.0958358727265438e-06, + "loss": 1.0851, + "step": 3735 + }, + { + "epoch": 0.8554092730394963, + "grad_norm": 1.0692206621170044, + "learning_rate": 1.092461969139137e-06, + "loss": 1.0646, + "step": 3736 + }, + { + "epoch": 0.855638236977676, + "grad_norm": 1.4968456029891968, + "learning_rate": 1.089092967295231e-06, + "loss": 1.0341, + "step": 3737 + }, + { + "epoch": 0.8558672009158558, + "grad_norm": 1.3507084846496582, + "learning_rate": 1.0857288690487555e-06, + "loss": 1.0051, + "step": 3738 + }, + { + "epoch": 0.8560961648540355, + "grad_norm": 1.3483790159225464, + "learning_rate": 1.082369676250945e-06, + "loss": 1.0424, + "step": 3739 + }, + { + "epoch": 0.8563251287922152, + "grad_norm": 1.4069323539733887, + "learning_rate": 1.0790153907503364e-06, + "loss": 1.0422, + "step": 3740 + }, + { + "epoch": 0.856554092730395, + "grad_norm": 1.629030704498291, + "learning_rate": 1.0756660143927644e-06, + "loss": 1.0405, + "step": 3741 + }, + { + "epoch": 0.8567830566685747, + "grad_norm": 1.409432291984558, + "learning_rate": 1.0723215490213635e-06, + "loss": 1.0972, + "step": 3742 + }, + { + "epoch": 0.8570120206067544, + "grad_norm": 1.2074227333068848, + "learning_rate": 1.0689819964765646e-06, + "loss": 1.0264, + "step": 3743 + }, + { + "epoch": 0.8572409845449341, + "grad_norm": 1.379439115524292, + "learning_rate": 1.0656473585960946e-06, + "loss": 0.9715, + "step": 3744 + }, + { + "epoch": 0.8574699484831139, + "grad_norm": 1.0981643199920654, + "learning_rate": 1.0623176372149802e-06, + "loss": 1.0236, + "step": 3745 + }, + { + "epoch": 0.8576989124212936, + "grad_norm": 1.303014874458313, + "learning_rate": 1.0589928341655342e-06, + "loss": 1.0092, + "step": 3746 + }, + { + "epoch": 0.8579278763594734, + "grad_norm": 1.1583458185195923, + "learning_rate": 1.0556729512773679e-06, + "loss": 1.0262, + "step": 3747 + }, + { + "epoch": 0.8581568402976532, + "grad_norm": 1.2592904567718506, + "learning_rate": 1.0523579903773917e-06, + "loss": 1.0341, + "step": 3748 + }, + { + "epoch": 0.8583858042358329, + "grad_norm": 1.2319550514221191, + "learning_rate": 1.0490479532897946e-06, + "loss": 1.056, + "step": 3749 + }, + { + "epoch": 0.8586147681740126, + "grad_norm": 1.1954131126403809, + "learning_rate": 1.0457428418360616e-06, + "loss": 1.0415, + "step": 3750 + }, + { + "epoch": 0.8588437321121923, + "grad_norm": 1.6190708875656128, + "learning_rate": 1.0424426578349733e-06, + "loss": 1.0591, + "step": 3751 + }, + { + "epoch": 0.8590726960503721, + "grad_norm": 1.1923547983169556, + "learning_rate": 1.0391474031025895e-06, + "loss": 1.0615, + "step": 3752 + }, + { + "epoch": 0.8593016599885518, + "grad_norm": 1.4358948469161987, + "learning_rate": 1.0358570794522615e-06, + "loss": 0.9408, + "step": 3753 + }, + { + "epoch": 0.8595306239267315, + "grad_norm": 1.3710299730300903, + "learning_rate": 1.0325716886946268e-06, + "loss": 1.0515, + "step": 3754 + }, + { + "epoch": 0.8597595878649112, + "grad_norm": 1.4297199249267578, + "learning_rate": 1.0292912326376091e-06, + "loss": 0.9476, + "step": 3755 + }, + { + "epoch": 0.859988551803091, + "grad_norm": 1.2614961862564087, + "learning_rate": 1.0260157130864178e-06, + "loss": 1.1242, + "step": 3756 + }, + { + "epoch": 0.8602175157412707, + "grad_norm": 1.7146847248077393, + "learning_rate": 1.0227451318435378e-06, + "loss": 1.0826, + "step": 3757 + }, + { + "epoch": 0.8604464796794505, + "grad_norm": 1.4658128023147583, + "learning_rate": 1.019479490708748e-06, + "loss": 1.0325, + "step": 3758 + }, + { + "epoch": 0.8606754436176303, + "grad_norm": 1.323593258857727, + "learning_rate": 1.0162187914791045e-06, + "loss": 1.0018, + "step": 3759 + }, + { + "epoch": 0.86090440755581, + "grad_norm": 1.5773340463638306, + "learning_rate": 1.0129630359489352e-06, + "loss": 0.941, + "step": 3760 + }, + { + "epoch": 0.8611333714939897, + "grad_norm": 1.2028287649154663, + "learning_rate": 1.0097122259098625e-06, + "loss": 0.9942, + "step": 3761 + }, + { + "epoch": 0.8613623354321694, + "grad_norm": 1.2747315168380737, + "learning_rate": 1.0064663631507787e-06, + "loss": 1.0371, + "step": 3762 + }, + { + "epoch": 0.8615912993703492, + "grad_norm": 1.2724723815917969, + "learning_rate": 1.0032254494578519e-06, + "loss": 1.0712, + "step": 3763 + }, + { + "epoch": 0.8618202633085289, + "grad_norm": 1.3098310232162476, + "learning_rate": 9.999894866145287e-07, + "loss": 1.0679, + "step": 3764 + }, + { + "epoch": 0.8620492272467086, + "grad_norm": 1.2499535083770752, + "learning_rate": 9.967584764015392e-07, + "loss": 1.0339, + "step": 3765 + }, + { + "epoch": 0.8622781911848884, + "grad_norm": 1.4696881771087646, + "learning_rate": 9.935324205968744e-07, + "loss": 1.0509, + "step": 3766 + }, + { + "epoch": 0.8625071551230681, + "grad_norm": 1.3763833045959473, + "learning_rate": 9.903113209758098e-07, + "loss": 1.0162, + "step": 3767 + }, + { + "epoch": 0.8627361190612478, + "grad_norm": 1.3132450580596924, + "learning_rate": 9.870951793108863e-07, + "loss": 1.031, + "step": 3768 + }, + { + "epoch": 0.8629650829994275, + "grad_norm": 1.5283464193344116, + "learning_rate": 9.838839973719227e-07, + "loss": 1.0361, + "step": 3769 + }, + { + "epoch": 0.8631940469376074, + "grad_norm": 1.428132176399231, + "learning_rate": 9.806777769260034e-07, + "loss": 1.0075, + "step": 3770 + }, + { + "epoch": 0.8634230108757871, + "grad_norm": 1.229541540145874, + "learning_rate": 9.774765197374847e-07, + "loss": 1.0586, + "step": 3771 + }, + { + "epoch": 0.8636519748139668, + "grad_norm": 1.8992414474487305, + "learning_rate": 9.742802275679941e-07, + "loss": 1.0236, + "step": 3772 + }, + { + "epoch": 0.8638809387521466, + "grad_norm": 1.2058573961257935, + "learning_rate": 9.710889021764235e-07, + "loss": 0.9526, + "step": 3773 + }, + { + "epoch": 0.8641099026903263, + "grad_norm": 1.2776075601577759, + "learning_rate": 9.679025453189273e-07, + "loss": 1.0135, + "step": 3774 + }, + { + "epoch": 0.864338866628506, + "grad_norm": 1.5076816082000732, + "learning_rate": 9.64721158748938e-07, + "loss": 1.035, + "step": 3775 + }, + { + "epoch": 0.8645678305666857, + "grad_norm": 3.022805690765381, + "learning_rate": 9.615447442171444e-07, + "loss": 1.0242, + "step": 3776 + }, + { + "epoch": 0.8647967945048655, + "grad_norm": 1.037968635559082, + "learning_rate": 9.583733034714982e-07, + "loss": 1.0302, + "step": 3777 + }, + { + "epoch": 0.8650257584430452, + "grad_norm": 1.6912962198257446, + "learning_rate": 9.55206838257219e-07, + "loss": 0.9981, + "step": 3778 + }, + { + "epoch": 0.8652547223812249, + "grad_norm": 1.1713199615478516, + "learning_rate": 9.520453503167837e-07, + "loss": 0.9741, + "step": 3779 + }, + { + "epoch": 0.8654836863194046, + "grad_norm": 1.2378156185150146, + "learning_rate": 9.488888413899345e-07, + "loss": 1.0817, + "step": 3780 + }, + { + "epoch": 0.8657126502575845, + "grad_norm": 1.2005176544189453, + "learning_rate": 9.457373132136716e-07, + "loss": 1.0458, + "step": 3781 + }, + { + "epoch": 0.8659416141957642, + "grad_norm": 2.108745574951172, + "learning_rate": 9.425907675222535e-07, + "loss": 1.1055, + "step": 3782 + }, + { + "epoch": 0.8661705781339439, + "grad_norm": 1.631432294845581, + "learning_rate": 9.394492060471994e-07, + "loss": 1.0514, + "step": 3783 + }, + { + "epoch": 0.8663995420721237, + "grad_norm": 1.4168133735656738, + "learning_rate": 9.363126305172831e-07, + "loss": 1.0468, + "step": 3784 + }, + { + "epoch": 0.8666285060103034, + "grad_norm": 1.2942928075790405, + "learning_rate": 9.331810426585364e-07, + "loss": 1.0393, + "step": 3785 + }, + { + "epoch": 0.8668574699484831, + "grad_norm": 1.3363865613937378, + "learning_rate": 9.300544441942461e-07, + "loss": 1.0378, + "step": 3786 + }, + { + "epoch": 0.8670864338866628, + "grad_norm": 1.569215178489685, + "learning_rate": 9.269328368449538e-07, + "loss": 1.046, + "step": 3787 + }, + { + "epoch": 0.8673153978248426, + "grad_norm": 1.158187985420227, + "learning_rate": 9.238162223284497e-07, + "loss": 1.0903, + "step": 3788 + }, + { + "epoch": 0.8675443617630223, + "grad_norm": 2.184824228286743, + "learning_rate": 9.207046023597866e-07, + "loss": 1.0087, + "step": 3789 + }, + { + "epoch": 0.867773325701202, + "grad_norm": 1.4404144287109375, + "learning_rate": 9.175979786512589e-07, + "loss": 1.0473, + "step": 3790 + }, + { + "epoch": 0.8680022896393818, + "grad_norm": 1.384673833847046, + "learning_rate": 9.144963529124163e-07, + "loss": 1.0535, + "step": 3791 + }, + { + "epoch": 0.8682312535775615, + "grad_norm": 1.3582277297973633, + "learning_rate": 9.113997268500574e-07, + "loss": 1.0592, + "step": 3792 + }, + { + "epoch": 0.8684602175157413, + "grad_norm": 1.1262242794036865, + "learning_rate": 9.083081021682305e-07, + "loss": 1.0654, + "step": 3793 + }, + { + "epoch": 0.868689181453921, + "grad_norm": 1.1574608087539673, + "learning_rate": 9.052214805682303e-07, + "loss": 1.065, + "step": 3794 + }, + { + "epoch": 0.8689181453921008, + "grad_norm": 1.3190420866012573, + "learning_rate": 9.021398637485979e-07, + "loss": 0.9962, + "step": 3795 + }, + { + "epoch": 0.8691471093302805, + "grad_norm": 4.816401958465576, + "learning_rate": 8.990632534051224e-07, + "loss": 0.9806, + "step": 3796 + }, + { + "epoch": 0.8693760732684602, + "grad_norm": 1.8604696989059448, + "learning_rate": 8.959916512308387e-07, + "loss": 1.0577, + "step": 3797 + }, + { + "epoch": 0.86960503720664, + "grad_norm": 1.3026139736175537, + "learning_rate": 8.929250589160166e-07, + "loss": 1.0274, + "step": 3798 + }, + { + "epoch": 0.8698340011448197, + "grad_norm": 1.3048547506332397, + "learning_rate": 8.898634781481829e-07, + "loss": 1.0608, + "step": 3799 + }, + { + "epoch": 0.8700629650829994, + "grad_norm": 1.4611790180206299, + "learning_rate": 8.868069106121002e-07, + "loss": 1.0704, + "step": 3800 + }, + { + "epoch": 0.8702919290211791, + "grad_norm": 1.360741138458252, + "learning_rate": 8.837553579897673e-07, + "loss": 1.018, + "step": 3801 + }, + { + "epoch": 0.8705208929593589, + "grad_norm": 1.1581453084945679, + "learning_rate": 8.807088219604288e-07, + "loss": 1.0178, + "step": 3802 + }, + { + "epoch": 0.8707498568975386, + "grad_norm": 1.437386393547058, + "learning_rate": 8.776673042005724e-07, + "loss": 1.062, + "step": 3803 + }, + { + "epoch": 0.8709788208357184, + "grad_norm": 1.3421489000320435, + "learning_rate": 8.74630806383916e-07, + "loss": 0.9713, + "step": 3804 + }, + { + "epoch": 0.8712077847738982, + "grad_norm": 1.095431923866272, + "learning_rate": 8.715993301814174e-07, + "loss": 0.9333, + "step": 3805 + }, + { + "epoch": 0.8714367487120779, + "grad_norm": 1.4182848930358887, + "learning_rate": 8.685728772612778e-07, + "loss": 1.0209, + "step": 3806 + }, + { + "epoch": 0.8716657126502576, + "grad_norm": 1.2892870903015137, + "learning_rate": 8.655514492889249e-07, + "loss": 1.048, + "step": 3807 + }, + { + "epoch": 0.8718946765884373, + "grad_norm": 1.217413306236267, + "learning_rate": 8.625350479270255e-07, + "loss": 1.0019, + "step": 3808 + }, + { + "epoch": 0.8721236405266171, + "grad_norm": 1.2366745471954346, + "learning_rate": 8.595236748354807e-07, + "loss": 1.042, + "step": 3809 + }, + { + "epoch": 0.8723526044647968, + "grad_norm": 1.248071312904358, + "learning_rate": 8.565173316714249e-07, + "loss": 1.0241, + "step": 3810 + }, + { + "epoch": 0.8725815684029765, + "grad_norm": 1.2598704099655151, + "learning_rate": 8.535160200892234e-07, + "loss": 1.0653, + "step": 3811 + }, + { + "epoch": 0.8728105323411562, + "grad_norm": 1.4132360219955444, + "learning_rate": 8.505197417404687e-07, + "loss": 1.0395, + "step": 3812 + }, + { + "epoch": 0.873039496279336, + "grad_norm": 1.2241437435150146, + "learning_rate": 8.475284982739917e-07, + "loss": 1.0056, + "step": 3813 + }, + { + "epoch": 0.8732684602175157, + "grad_norm": 3.2831687927246094, + "learning_rate": 8.445422913358503e-07, + "loss": 1.047, + "step": 3814 + }, + { + "epoch": 0.8734974241556954, + "grad_norm": 1.535780668258667, + "learning_rate": 8.415611225693254e-07, + "loss": 1.0187, + "step": 3815 + }, + { + "epoch": 0.8737263880938753, + "grad_norm": 1.2877063751220703, + "learning_rate": 8.385849936149282e-07, + "loss": 0.9162, + "step": 3816 + }, + { + "epoch": 0.873955352032055, + "grad_norm": 1.2718114852905273, + "learning_rate": 8.356139061104029e-07, + "loss": 1.0094, + "step": 3817 + }, + { + "epoch": 0.8741843159702347, + "grad_norm": 1.4520769119262695, + "learning_rate": 8.326478616907085e-07, + "loss": 1.0206, + "step": 3818 + }, + { + "epoch": 0.8744132799084144, + "grad_norm": 2.025461435317993, + "learning_rate": 8.296868619880372e-07, + "loss": 1.0811, + "step": 3819 + }, + { + "epoch": 0.8746422438465942, + "grad_norm": 1.302669882774353, + "learning_rate": 8.26730908631802e-07, + "loss": 1.038, + "step": 3820 + }, + { + "epoch": 0.8748712077847739, + "grad_norm": 1.3168150186538696, + "learning_rate": 8.237800032486387e-07, + "loss": 1.0384, + "step": 3821 + }, + { + "epoch": 0.8751001717229536, + "grad_norm": 1.5246906280517578, + "learning_rate": 8.208341474624071e-07, + "loss": 1.0494, + "step": 3822 + }, + { + "epoch": 0.8753291356611334, + "grad_norm": 2.0702121257781982, + "learning_rate": 8.178933428941849e-07, + "loss": 1.0735, + "step": 3823 + }, + { + "epoch": 0.8755580995993131, + "grad_norm": 1.1960113048553467, + "learning_rate": 8.149575911622731e-07, + "loss": 1.0311, + "step": 3824 + }, + { + "epoch": 0.8757870635374928, + "grad_norm": 1.2554943561553955, + "learning_rate": 8.12026893882194e-07, + "loss": 1.0279, + "step": 3825 + }, + { + "epoch": 0.8760160274756725, + "grad_norm": 1.2481484413146973, + "learning_rate": 8.091012526666797e-07, + "loss": 1.0473, + "step": 3826 + }, + { + "epoch": 0.8762449914138524, + "grad_norm": 1.5274428129196167, + "learning_rate": 8.061806691256913e-07, + "loss": 1.0147, + "step": 3827 + }, + { + "epoch": 0.8764739553520321, + "grad_norm": 1.132452130317688, + "learning_rate": 8.032651448664009e-07, + "loss": 1.0577, + "step": 3828 + }, + { + "epoch": 0.8767029192902118, + "grad_norm": 1.3138935565948486, + "learning_rate": 8.003546814931917e-07, + "loss": 1.0043, + "step": 3829 + }, + { + "epoch": 0.8769318832283916, + "grad_norm": 1.1806259155273438, + "learning_rate": 7.974492806076762e-07, + "loss": 1.0525, + "step": 3830 + }, + { + "epoch": 0.8771608471665713, + "grad_norm": 1.3222166299819946, + "learning_rate": 7.945489438086651e-07, + "loss": 1.092, + "step": 3831 + }, + { + "epoch": 0.877389811104751, + "grad_norm": 1.3098158836364746, + "learning_rate": 7.91653672692192e-07, + "loss": 1.0704, + "step": 3832 + }, + { + "epoch": 0.8776187750429307, + "grad_norm": 1.5300372838974, + "learning_rate": 7.887634688515e-07, + "loss": 1.0192, + "step": 3833 + }, + { + "epoch": 0.8778477389811105, + "grad_norm": 1.5282765626907349, + "learning_rate": 7.858783338770437e-07, + "loss": 1.0689, + "step": 3834 + }, + { + "epoch": 0.8780767029192902, + "grad_norm": 1.0936659574508667, + "learning_rate": 7.829982693564886e-07, + "loss": 1.0448, + "step": 3835 + }, + { + "epoch": 0.8783056668574699, + "grad_norm": 1.1930863857269287, + "learning_rate": 7.801232768747113e-07, + "loss": 0.9976, + "step": 3836 + }, + { + "epoch": 0.8785346307956496, + "grad_norm": 1.5860801935195923, + "learning_rate": 7.772533580137942e-07, + "loss": 0.9733, + "step": 3837 + }, + { + "epoch": 0.8787635947338294, + "grad_norm": 1.3473610877990723, + "learning_rate": 7.743885143530317e-07, + "loss": 1.0056, + "step": 3838 + }, + { + "epoch": 0.8789925586720092, + "grad_norm": 1.2198975086212158, + "learning_rate": 7.715287474689237e-07, + "loss": 1.0589, + "step": 3839 + }, + { + "epoch": 0.8792215226101889, + "grad_norm": 2.675382614135742, + "learning_rate": 7.686740589351704e-07, + "loss": 1.021, + "step": 3840 + }, + { + "epoch": 0.8794504865483687, + "grad_norm": 1.468649983406067, + "learning_rate": 7.658244503226909e-07, + "loss": 1.0378, + "step": 3841 + }, + { + "epoch": 0.8796794504865484, + "grad_norm": 1.561955213546753, + "learning_rate": 7.629799231995961e-07, + "loss": 1.0173, + "step": 3842 + }, + { + "epoch": 0.8799084144247281, + "grad_norm": 1.2610421180725098, + "learning_rate": 7.601404791312028e-07, + "loss": 1.0568, + "step": 3843 + }, + { + "epoch": 0.8801373783629078, + "grad_norm": 1.5121142864227295, + "learning_rate": 7.573061196800414e-07, + "loss": 0.9959, + "step": 3844 + }, + { + "epoch": 0.8803663423010876, + "grad_norm": 1.1338526010513306, + "learning_rate": 7.544768464058294e-07, + "loss": 0.9543, + "step": 3845 + }, + { + "epoch": 0.8805953062392673, + "grad_norm": 1.4656891822814941, + "learning_rate": 7.51652660865495e-07, + "loss": 1.0345, + "step": 3846 + }, + { + "epoch": 0.880824270177447, + "grad_norm": 1.067690372467041, + "learning_rate": 7.488335646131628e-07, + "loss": 1.001, + "step": 3847 + }, + { + "epoch": 0.8810532341156267, + "grad_norm": 1.3866279125213623, + "learning_rate": 7.460195592001585e-07, + "loss": 0.9599, + "step": 3848 + }, + { + "epoch": 0.8812821980538065, + "grad_norm": 1.3089559078216553, + "learning_rate": 7.432106461750044e-07, + "loss": 1.0318, + "step": 3849 + }, + { + "epoch": 0.8815111619919863, + "grad_norm": 1.1162192821502686, + "learning_rate": 7.40406827083423e-07, + "loss": 1.0282, + "step": 3850 + }, + { + "epoch": 0.881740125930166, + "grad_norm": 1.7417439222335815, + "learning_rate": 7.376081034683325e-07, + "loss": 1.0748, + "step": 3851 + }, + { + "epoch": 0.8819690898683458, + "grad_norm": 1.5468460321426392, + "learning_rate": 7.348144768698485e-07, + "loss": 1.0649, + "step": 3852 + }, + { + "epoch": 0.8821980538065255, + "grad_norm": 1.8217270374298096, + "learning_rate": 7.320259488252757e-07, + "loss": 1.0416, + "step": 3853 + }, + { + "epoch": 0.8824270177447052, + "grad_norm": 1.378803014755249, + "learning_rate": 7.292425208691212e-07, + "loss": 1.0032, + "step": 3854 + }, + { + "epoch": 0.882655981682885, + "grad_norm": 1.5163785219192505, + "learning_rate": 7.26464194533083e-07, + "loss": 1.0449, + "step": 3855 + }, + { + "epoch": 0.8828849456210647, + "grad_norm": 1.268101692199707, + "learning_rate": 7.236909713460482e-07, + "loss": 1.0759, + "step": 3856 + }, + { + "epoch": 0.8831139095592444, + "grad_norm": 1.5246165990829468, + "learning_rate": 7.209228528340972e-07, + "loss": 1.0565, + "step": 3857 + }, + { + "epoch": 0.8833428734974241, + "grad_norm": 1.3627903461456299, + "learning_rate": 7.181598405205082e-07, + "loss": 1.0487, + "step": 3858 + }, + { + "epoch": 0.8835718374356039, + "grad_norm": 1.2992089986801147, + "learning_rate": 7.154019359257381e-07, + "loss": 1.0759, + "step": 3859 + }, + { + "epoch": 0.8838008013737836, + "grad_norm": 1.353250503540039, + "learning_rate": 7.126491405674407e-07, + "loss": 1.0197, + "step": 3860 + }, + { + "epoch": 0.8840297653119633, + "grad_norm": 1.206063985824585, + "learning_rate": 7.099014559604556e-07, + "loss": 1.0341, + "step": 3861 + }, + { + "epoch": 0.8842587292501431, + "grad_norm": 1.488411784172058, + "learning_rate": 7.071588836168109e-07, + "loss": 0.992, + "step": 3862 + }, + { + "epoch": 0.8844876931883229, + "grad_norm": 1.6317309141159058, + "learning_rate": 7.044214250457216e-07, + "loss": 1.0674, + "step": 3863 + }, + { + "epoch": 0.8847166571265026, + "grad_norm": 4.539884090423584, + "learning_rate": 7.016890817535881e-07, + "loss": 1.0251, + "step": 3864 + }, + { + "epoch": 0.8849456210646823, + "grad_norm": 1.4975309371948242, + "learning_rate": 6.989618552439958e-07, + "loss": 1.0532, + "step": 3865 + }, + { + "epoch": 0.8851745850028621, + "grad_norm": 1.3167531490325928, + "learning_rate": 6.962397470177162e-07, + "loss": 1.0001, + "step": 3866 + }, + { + "epoch": 0.8854035489410418, + "grad_norm": 1.4427505731582642, + "learning_rate": 6.935227585726978e-07, + "loss": 1.1204, + "step": 3867 + }, + { + "epoch": 0.8856325128792215, + "grad_norm": 1.2467824220657349, + "learning_rate": 6.908108914040823e-07, + "loss": 0.9937, + "step": 3868 + }, + { + "epoch": 0.8858614768174012, + "grad_norm": 1.390540361404419, + "learning_rate": 6.881041470041849e-07, + "loss": 0.9756, + "step": 3869 + }, + { + "epoch": 0.886090440755581, + "grad_norm": 1.3013540506362915, + "learning_rate": 6.854025268625042e-07, + "loss": 1.0061, + "step": 3870 + }, + { + "epoch": 0.8863194046937607, + "grad_norm": 1.362518548965454, + "learning_rate": 6.827060324657164e-07, + "loss": 1.0281, + "step": 3871 + }, + { + "epoch": 0.8865483686319404, + "grad_norm": 1.2985994815826416, + "learning_rate": 6.800146652976869e-07, + "loss": 1.049, + "step": 3872 + }, + { + "epoch": 0.8867773325701203, + "grad_norm": 1.0657109022140503, + "learning_rate": 6.773284268394464e-07, + "loss": 1.0451, + "step": 3873 + }, + { + "epoch": 0.8870062965083, + "grad_norm": 1.4140193462371826, + "learning_rate": 6.746473185692115e-07, + "loss": 0.9702, + "step": 3874 + }, + { + "epoch": 0.8872352604464797, + "grad_norm": 1.1937177181243896, + "learning_rate": 6.71971341962373e-07, + "loss": 1.0247, + "step": 3875 + }, + { + "epoch": 0.8874642243846594, + "grad_norm": 1.1080400943756104, + "learning_rate": 6.693004984914997e-07, + "loss": 1.039, + "step": 3876 + }, + { + "epoch": 0.8876931883228392, + "grad_norm": 1.7587668895721436, + "learning_rate": 6.666347896263326e-07, + "loss": 1.0278, + "step": 3877 + }, + { + "epoch": 0.8879221522610189, + "grad_norm": 1.3932582139968872, + "learning_rate": 6.639742168337904e-07, + "loss": 1.027, + "step": 3878 + }, + { + "epoch": 0.8881511161991986, + "grad_norm": 1.1661114692687988, + "learning_rate": 6.61318781577962e-07, + "loss": 1.0093, + "step": 3879 + }, + { + "epoch": 0.8883800801373783, + "grad_norm": 1.9590517282485962, + "learning_rate": 6.586684853201153e-07, + "loss": 1.0055, + "step": 3880 + }, + { + "epoch": 0.8886090440755581, + "grad_norm": 1.1533890962600708, + "learning_rate": 6.560233295186802e-07, + "loss": 1.0544, + "step": 3881 + }, + { + "epoch": 0.8888380080137378, + "grad_norm": 1.3643183708190918, + "learning_rate": 6.53383315629268e-07, + "loss": 1.0934, + "step": 3882 + }, + { + "epoch": 0.8890669719519175, + "grad_norm": 1.3094185590744019, + "learning_rate": 6.507484451046575e-07, + "loss": 1.0203, + "step": 3883 + }, + { + "epoch": 0.8892959358900973, + "grad_norm": 1.2927873134613037, + "learning_rate": 6.481187193947913e-07, + "loss": 1.0636, + "step": 3884 + }, + { + "epoch": 0.8895248998282771, + "grad_norm": 1.2240914106369019, + "learning_rate": 6.454941399467873e-07, + "loss": 1.0583, + "step": 3885 + }, + { + "epoch": 0.8897538637664568, + "grad_norm": 1.765049934387207, + "learning_rate": 6.428747082049313e-07, + "loss": 1.0131, + "step": 3886 + }, + { + "epoch": 0.8899828277046365, + "grad_norm": 1.369654893875122, + "learning_rate": 6.402604256106715e-07, + "loss": 1.0498, + "step": 3887 + }, + { + "epoch": 0.8902117916428163, + "grad_norm": 1.3459523916244507, + "learning_rate": 6.37651293602628e-07, + "loss": 0.9947, + "step": 3888 + }, + { + "epoch": 0.890440755580996, + "grad_norm": 1.3815542459487915, + "learning_rate": 6.350473136165836e-07, + "loss": 0.9885, + "step": 3889 + }, + { + "epoch": 0.8906697195191757, + "grad_norm": 1.2464871406555176, + "learning_rate": 6.324484870854874e-07, + "loss": 1.0349, + "step": 3890 + }, + { + "epoch": 0.8908986834573555, + "grad_norm": 1.1414371728897095, + "learning_rate": 6.298548154394501e-07, + "loss": 0.9818, + "step": 3891 + }, + { + "epoch": 0.8911276473955352, + "grad_norm": 1.2838653326034546, + "learning_rate": 6.2726630010575e-07, + "loss": 1.0227, + "step": 3892 + }, + { + "epoch": 0.8913566113337149, + "grad_norm": 1.3109924793243408, + "learning_rate": 6.246829425088229e-07, + "loss": 1.0274, + "step": 3893 + }, + { + "epoch": 0.8915855752718946, + "grad_norm": 1.4073375463485718, + "learning_rate": 6.22104744070271e-07, + "loss": 1.0466, + "step": 3894 + }, + { + "epoch": 0.8918145392100744, + "grad_norm": 1.345703125, + "learning_rate": 6.195317062088513e-07, + "loss": 1.0208, + "step": 3895 + }, + { + "epoch": 0.8920435031482542, + "grad_norm": 1.3727253675460815, + "learning_rate": 6.169638303404912e-07, + "loss": 1.0646, + "step": 3896 + }, + { + "epoch": 0.8922724670864339, + "grad_norm": 1.132949709892273, + "learning_rate": 6.14401117878266e-07, + "loss": 0.9631, + "step": 3897 + }, + { + "epoch": 0.8925014310246137, + "grad_norm": 1.202333688735962, + "learning_rate": 6.118435702324166e-07, + "loss": 0.9988, + "step": 3898 + }, + { + "epoch": 0.8927303949627934, + "grad_norm": 1.380020022392273, + "learning_rate": 6.092911888103404e-07, + "loss": 1.0655, + "step": 3899 + }, + { + "epoch": 0.8929593589009731, + "grad_norm": 1.3880457878112793, + "learning_rate": 6.067439750165916e-07, + "loss": 1.004, + "step": 3900 + }, + { + "epoch": 0.8931883228391528, + "grad_norm": 1.0868477821350098, + "learning_rate": 6.042019302528801e-07, + "loss": 0.9799, + "step": 3901 + }, + { + "epoch": 0.8934172867773326, + "grad_norm": 1.2509794235229492, + "learning_rate": 6.016650559180715e-07, + "loss": 1.0058, + "step": 3902 + }, + { + "epoch": 0.8936462507155123, + "grad_norm": 1.0911234617233276, + "learning_rate": 5.991333534081878e-07, + "loss": 1.0277, + "step": 3903 + }, + { + "epoch": 0.893875214653692, + "grad_norm": 1.3828800916671753, + "learning_rate": 5.966068241164025e-07, + "loss": 1.0678, + "step": 3904 + }, + { + "epoch": 0.8941041785918717, + "grad_norm": 1.1840590238571167, + "learning_rate": 5.940854694330433e-07, + "loss": 0.9748, + "step": 3905 + }, + { + "epoch": 0.8943331425300515, + "grad_norm": 1.447941541671753, + "learning_rate": 5.915692907455905e-07, + "loss": 0.9746, + "step": 3906 + }, + { + "epoch": 0.8945621064682312, + "grad_norm": 2.082156181335449, + "learning_rate": 5.890582894386798e-07, + "loss": 1.0355, + "step": 3907 + }, + { + "epoch": 0.894791070406411, + "grad_norm": 1.1880125999450684, + "learning_rate": 5.86552466894088e-07, + "loss": 1.0326, + "step": 3908 + }, + { + "epoch": 0.8950200343445908, + "grad_norm": 1.3864256143569946, + "learning_rate": 5.840518244907512e-07, + "loss": 0.9975, + "step": 3909 + }, + { + "epoch": 0.8952489982827705, + "grad_norm": 1.368095874786377, + "learning_rate": 5.815563636047539e-07, + "loss": 0.9745, + "step": 3910 + }, + { + "epoch": 0.8954779622209502, + "grad_norm": 1.2576544284820557, + "learning_rate": 5.790660856093245e-07, + "loss": 1.0147, + "step": 3911 + }, + { + "epoch": 0.89570692615913, + "grad_norm": 1.2153538465499878, + "learning_rate": 5.76580991874841e-07, + "loss": 1.0201, + "step": 3912 + }, + { + "epoch": 0.8959358900973097, + "grad_norm": 1.7341593503952026, + "learning_rate": 5.741010837688354e-07, + "loss": 0.9843, + "step": 3913 + }, + { + "epoch": 0.8961648540354894, + "grad_norm": 1.151334285736084, + "learning_rate": 5.716263626559748e-07, + "loss": 0.9381, + "step": 3914 + }, + { + "epoch": 0.8963938179736691, + "grad_norm": 1.2881344556808472, + "learning_rate": 5.691568298980799e-07, + "loss": 0.9995, + "step": 3915 + }, + { + "epoch": 0.8966227819118489, + "grad_norm": 1.0738816261291504, + "learning_rate": 5.666924868541124e-07, + "loss": 0.9905, + "step": 3916 + }, + { + "epoch": 0.8968517458500286, + "grad_norm": 1.1380635499954224, + "learning_rate": 5.64233334880181e-07, + "loss": 1.0218, + "step": 3917 + }, + { + "epoch": 0.8970807097882083, + "grad_norm": 1.9478411674499512, + "learning_rate": 5.617793753295364e-07, + "loss": 1.0623, + "step": 3918 + }, + { + "epoch": 0.8973096737263881, + "grad_norm": 3.0823843479156494, + "learning_rate": 5.593306095525697e-07, + "loss": 1.0533, + "step": 3919 + }, + { + "epoch": 0.8975386376645679, + "grad_norm": 1.3105992078781128, + "learning_rate": 5.568870388968184e-07, + "loss": 1.0027, + "step": 3920 + }, + { + "epoch": 0.8977676016027476, + "grad_norm": 1.4492088556289673, + "learning_rate": 5.544486647069614e-07, + "loss": 0.9894, + "step": 3921 + }, + { + "epoch": 0.8979965655409273, + "grad_norm": 1.8873605728149414, + "learning_rate": 5.520154883248086e-07, + "loss": 1.0517, + "step": 3922 + }, + { + "epoch": 0.898225529479107, + "grad_norm": 1.1515185832977295, + "learning_rate": 5.495875110893223e-07, + "loss": 1.0118, + "step": 3923 + }, + { + "epoch": 0.8984544934172868, + "grad_norm": 1.1645504236221313, + "learning_rate": 5.471647343365982e-07, + "loss": 0.9882, + "step": 3924 + }, + { + "epoch": 0.8986834573554665, + "grad_norm": 1.2297289371490479, + "learning_rate": 5.447471593998666e-07, + "loss": 0.9947, + "step": 3925 + }, + { + "epoch": 0.8989124212936462, + "grad_norm": 1.699040412902832, + "learning_rate": 5.423347876094998e-07, + "loss": 0.9676, + "step": 3926 + }, + { + "epoch": 0.899141385231826, + "grad_norm": 1.485076665878296, + "learning_rate": 5.399276202930071e-07, + "loss": 1.0582, + "step": 3927 + }, + { + "epoch": 0.8993703491700057, + "grad_norm": 2.078399419784546, + "learning_rate": 5.375256587750311e-07, + "loss": 1.0306, + "step": 3928 + }, + { + "epoch": 0.8995993131081854, + "grad_norm": 1.3613694906234741, + "learning_rate": 5.351289043773511e-07, + "loss": 1.0463, + "step": 3929 + }, + { + "epoch": 0.8998282770463651, + "grad_norm": 1.1392529010772705, + "learning_rate": 5.327373584188822e-07, + "loss": 1.0335, + "step": 3930 + }, + { + "epoch": 0.900057240984545, + "grad_norm": 1.9740110635757446, + "learning_rate": 5.303510222156716e-07, + "loss": 1.0021, + "step": 3931 + }, + { + "epoch": 0.9002862049227247, + "grad_norm": 1.254390835762024, + "learning_rate": 5.279698970809011e-07, + "loss": 1.0522, + "step": 3932 + }, + { + "epoch": 0.9005151688609044, + "grad_norm": 1.3042449951171875, + "learning_rate": 5.255939843248792e-07, + "loss": 1.0814, + "step": 3933 + }, + { + "epoch": 0.9007441327990842, + "grad_norm": 1.3471471071243286, + "learning_rate": 5.232232852550568e-07, + "loss": 1.0309, + "step": 3934 + }, + { + "epoch": 0.9009730967372639, + "grad_norm": 1.9291504621505737, + "learning_rate": 5.208578011760079e-07, + "loss": 1.0439, + "step": 3935 + }, + { + "epoch": 0.9012020606754436, + "grad_norm": 1.2869789600372314, + "learning_rate": 5.184975333894349e-07, + "loss": 1.1011, + "step": 3936 + }, + { + "epoch": 0.9014310246136233, + "grad_norm": 1.4123051166534424, + "learning_rate": 5.161424831941797e-07, + "loss": 1.0704, + "step": 3937 + }, + { + "epoch": 0.9016599885518031, + "grad_norm": 1.2700828313827515, + "learning_rate": 5.137926518862013e-07, + "loss": 1.053, + "step": 3938 + }, + { + "epoch": 0.9018889524899828, + "grad_norm": 1.2948448657989502, + "learning_rate": 5.114480407585942e-07, + "loss": 1.0921, + "step": 3939 + }, + { + "epoch": 0.9021179164281625, + "grad_norm": 1.1770613193511963, + "learning_rate": 5.091086511015786e-07, + "loss": 1.0008, + "step": 3940 + }, + { + "epoch": 0.9023468803663423, + "grad_norm": 1.1652601957321167, + "learning_rate": 5.067744842025002e-07, + "loss": 1.0086, + "step": 3941 + }, + { + "epoch": 0.9025758443045221, + "grad_norm": 1.2411365509033203, + "learning_rate": 5.044455413458327e-07, + "loss": 1.0527, + "step": 3942 + }, + { + "epoch": 0.9028048082427018, + "grad_norm": 1.4037675857543945, + "learning_rate": 5.02121823813172e-07, + "loss": 1.0761, + "step": 3943 + }, + { + "epoch": 0.9030337721808815, + "grad_norm": 1.2515357732772827, + "learning_rate": 4.998033328832419e-07, + "loss": 0.9981, + "step": 3944 + }, + { + "epoch": 0.9032627361190613, + "grad_norm": 1.2006301879882812, + "learning_rate": 4.974900698318885e-07, + "loss": 0.983, + "step": 3945 + }, + { + "epoch": 0.903491700057241, + "grad_norm": 1.3678110837936401, + "learning_rate": 4.951820359320802e-07, + "loss": 1.0476, + "step": 3946 + }, + { + "epoch": 0.9037206639954207, + "grad_norm": 1.4648295640945435, + "learning_rate": 4.928792324539089e-07, + "loss": 0.9633, + "step": 3947 + }, + { + "epoch": 0.9039496279336005, + "grad_norm": 1.2273361682891846, + "learning_rate": 4.905816606645896e-07, + "loss": 1.038, + "step": 3948 + }, + { + "epoch": 0.9041785918717802, + "grad_norm": 1.4598244428634644, + "learning_rate": 4.882893218284546e-07, + "loss": 1.0626, + "step": 3949 + }, + { + "epoch": 0.9044075558099599, + "grad_norm": 1.1587828397750854, + "learning_rate": 4.860022172069579e-07, + "loss": 1.0123, + "step": 3950 + }, + { + "epoch": 0.9046365197481396, + "grad_norm": 1.4061542749404907, + "learning_rate": 4.837203480586782e-07, + "loss": 0.9873, + "step": 3951 + }, + { + "epoch": 0.9048654836863194, + "grad_norm": 1.5064465999603271, + "learning_rate": 4.814437156393048e-07, + "loss": 1.0661, + "step": 3952 + }, + { + "epoch": 0.9050944476244991, + "grad_norm": 1.5875314474105835, + "learning_rate": 4.791723212016508e-07, + "loss": 0.9952, + "step": 3953 + }, + { + "epoch": 0.9053234115626789, + "grad_norm": 1.572794795036316, + "learning_rate": 4.769061659956464e-07, + "loss": 1.0625, + "step": 3954 + }, + { + "epoch": 0.9055523755008587, + "grad_norm": 1.485347032546997, + "learning_rate": 4.746452512683375e-07, + "loss": 0.9975, + "step": 3955 + }, + { + "epoch": 0.9057813394390384, + "grad_norm": 1.1230580806732178, + "learning_rate": 4.7238957826388656e-07, + "loss": 1.0865, + "step": 3956 + }, + { + "epoch": 0.9060103033772181, + "grad_norm": 1.2839785814285278, + "learning_rate": 4.7013914822356956e-07, + "loss": 1.0588, + "step": 3957 + }, + { + "epoch": 0.9062392673153978, + "grad_norm": 1.29798424243927, + "learning_rate": 4.6789396238578255e-07, + "loss": 1.0536, + "step": 3958 + }, + { + "epoch": 0.9064682312535776, + "grad_norm": 1.4684815406799316, + "learning_rate": 4.656540219860317e-07, + "loss": 1.0095, + "step": 3959 + }, + { + "epoch": 0.9066971951917573, + "grad_norm": 3.565049409866333, + "learning_rate": 4.6341932825693326e-07, + "loss": 1.0161, + "step": 3960 + }, + { + "epoch": 0.906926159129937, + "grad_norm": 1.6227142810821533, + "learning_rate": 4.611898824282257e-07, + "loss": 1.0105, + "step": 3961 + }, + { + "epoch": 0.9071551230681167, + "grad_norm": 1.3278828859329224, + "learning_rate": 4.5896568572675327e-07, + "loss": 0.9983, + "step": 3962 + }, + { + "epoch": 0.9073840870062965, + "grad_norm": 1.2562534809112549, + "learning_rate": 4.567467393764702e-07, + "loss": 1.0176, + "step": 3963 + }, + { + "epoch": 0.9076130509444762, + "grad_norm": 1.5097713470458984, + "learning_rate": 4.5453304459844525e-07, + "loss": 1.0569, + "step": 3964 + }, + { + "epoch": 0.907842014882656, + "grad_norm": 1.313685655593872, + "learning_rate": 4.5232460261085966e-07, + "loss": 1.0136, + "step": 3965 + }, + { + "epoch": 0.9080709788208358, + "grad_norm": 1.2489078044891357, + "learning_rate": 4.501214146289956e-07, + "loss": 1.0279, + "step": 3966 + }, + { + "epoch": 0.9082999427590155, + "grad_norm": 1.3103182315826416, + "learning_rate": 4.4792348186525e-07, + "loss": 1.0426, + "step": 3967 + }, + { + "epoch": 0.9085289066971952, + "grad_norm": 1.1325465440750122, + "learning_rate": 4.4573080552912874e-07, + "loss": 0.9801, + "step": 3968 + }, + { + "epoch": 0.9087578706353749, + "grad_norm": 1.951433539390564, + "learning_rate": 4.435433868272421e-07, + "loss": 0.9769, + "step": 3969 + }, + { + "epoch": 0.9089868345735547, + "grad_norm": 1.1956815719604492, + "learning_rate": 4.413612269633083e-07, + "loss": 1.0412, + "step": 3970 + }, + { + "epoch": 0.9092157985117344, + "grad_norm": 1.2107232809066772, + "learning_rate": 4.391843271381513e-07, + "loss": 0.9577, + "step": 3971 + }, + { + "epoch": 0.9094447624499141, + "grad_norm": 1.252339482307434, + "learning_rate": 4.370126885497017e-07, + "loss": 1.0642, + "step": 3972 + }, + { + "epoch": 0.9096737263880939, + "grad_norm": 1.6101003885269165, + "learning_rate": 4.3484631239299356e-07, + "loss": 0.9944, + "step": 3973 + }, + { + "epoch": 0.9099026903262736, + "grad_norm": 1.0703437328338623, + "learning_rate": 4.3268519986016444e-07, + "loss": 1.0407, + "step": 3974 + }, + { + "epoch": 0.9101316542644533, + "grad_norm": 1.2428948879241943, + "learning_rate": 4.3052935214045745e-07, + "loss": 1.0025, + "step": 3975 + }, + { + "epoch": 0.910360618202633, + "grad_norm": 1.3559765815734863, + "learning_rate": 4.2837877042021915e-07, + "loss": 1.0398, + "step": 3976 + }, + { + "epoch": 0.9105895821408129, + "grad_norm": 1.2042170763015747, + "learning_rate": 4.26233455882894e-07, + "loss": 0.9451, + "step": 3977 + }, + { + "epoch": 0.9108185460789926, + "grad_norm": 1.347455620765686, + "learning_rate": 4.240934097090299e-07, + "loss": 0.9805, + "step": 3978 + }, + { + "epoch": 0.9110475100171723, + "grad_norm": 2.1769297122955322, + "learning_rate": 4.219586330762815e-07, + "loss": 1.0007, + "step": 3979 + }, + { + "epoch": 0.911276473955352, + "grad_norm": 1.392369270324707, + "learning_rate": 4.198291271593924e-07, + "loss": 1.0211, + "step": 3980 + }, + { + "epoch": 0.9115054378935318, + "grad_norm": 1.1754943132400513, + "learning_rate": 4.177048931302152e-07, + "loss": 1.008, + "step": 3981 + }, + { + "epoch": 0.9117344018317115, + "grad_norm": 1.4287794828414917, + "learning_rate": 4.1558593215769714e-07, + "loss": 1.0554, + "step": 3982 + }, + { + "epoch": 0.9119633657698912, + "grad_norm": 1.3779202699661255, + "learning_rate": 4.134722454078843e-07, + "loss": 1.0852, + "step": 3983 + }, + { + "epoch": 0.912192329708071, + "grad_norm": 1.3808211088180542, + "learning_rate": 4.1136383404392187e-07, + "loss": 1.0345, + "step": 3984 + }, + { + "epoch": 0.9124212936462507, + "grad_norm": 1.3731261491775513, + "learning_rate": 4.092606992260506e-07, + "loss": 1.0508, + "step": 3985 + }, + { + "epoch": 0.9126502575844304, + "grad_norm": 1.3647085428237915, + "learning_rate": 4.0716284211160807e-07, + "loss": 1.0739, + "step": 3986 + }, + { + "epoch": 0.9128792215226101, + "grad_norm": 1.5696361064910889, + "learning_rate": 4.0507026385502747e-07, + "loss": 1.0594, + "step": 3987 + }, + { + "epoch": 0.91310818546079, + "grad_norm": 1.408075213432312, + "learning_rate": 4.029829656078343e-07, + "loss": 1.0911, + "step": 3988 + }, + { + "epoch": 0.9133371493989697, + "grad_norm": 1.2420895099639893, + "learning_rate": 4.0090094851865547e-07, + "loss": 0.9449, + "step": 3989 + }, + { + "epoch": 0.9135661133371494, + "grad_norm": 1.551822543144226, + "learning_rate": 3.9882421373320655e-07, + "loss": 0.9516, + "step": 3990 + }, + { + "epoch": 0.9137950772753292, + "grad_norm": 2.8030037879943848, + "learning_rate": 3.967527623942957e-07, + "loss": 1.0588, + "step": 3991 + }, + { + "epoch": 0.9140240412135089, + "grad_norm": 1.239384651184082, + "learning_rate": 3.9468659564182554e-07, + "loss": 1.0079, + "step": 3992 + }, + { + "epoch": 0.9142530051516886, + "grad_norm": 1.3504375219345093, + "learning_rate": 3.9262571461279097e-07, + "loss": 0.9982, + "step": 3993 + }, + { + "epoch": 0.9144819690898683, + "grad_norm": 1.2237106561660767, + "learning_rate": 3.9057012044127817e-07, + "loss": 1.0647, + "step": 3994 + }, + { + "epoch": 0.9147109330280481, + "grad_norm": 1.3022246360778809, + "learning_rate": 3.885198142584612e-07, + "loss": 1.04, + "step": 3995 + }, + { + "epoch": 0.9149398969662278, + "grad_norm": 1.3392308950424194, + "learning_rate": 3.864747971926086e-07, + "loss": 1.0708, + "step": 3996 + }, + { + "epoch": 0.9151688609044075, + "grad_norm": 1.2771753072738647, + "learning_rate": 3.844350703690758e-07, + "loss": 1.0403, + "step": 3997 + }, + { + "epoch": 0.9153978248425872, + "grad_norm": 1.0660115480422974, + "learning_rate": 3.82400634910306e-07, + "loss": 1.0296, + "step": 3998 + }, + { + "epoch": 0.915626788780767, + "grad_norm": 1.3524727821350098, + "learning_rate": 3.8037149193583257e-07, + "loss": 1.0209, + "step": 3999 + }, + { + "epoch": 0.9158557527189468, + "grad_norm": 1.2420437335968018, + "learning_rate": 3.7834764256227674e-07, + "loss": 1.0699, + "step": 4000 + }, + { + "epoch": 0.9160847166571265, + "grad_norm": 1.2622954845428467, + "learning_rate": 3.7632908790334656e-07, + "loss": 1.038, + "step": 4001 + }, + { + "epoch": 0.9163136805953063, + "grad_norm": 1.260853886604309, + "learning_rate": 3.7431582906983124e-07, + "loss": 1.0352, + "step": 4002 + }, + { + "epoch": 0.916542644533486, + "grad_norm": 1.2435685396194458, + "learning_rate": 3.723078671696168e-07, + "loss": 1.0368, + "step": 4003 + }, + { + "epoch": 0.9167716084716657, + "grad_norm": 1.2323977947235107, + "learning_rate": 3.7030520330766264e-07, + "loss": 0.9676, + "step": 4004 + }, + { + "epoch": 0.9170005724098454, + "grad_norm": 1.191890001296997, + "learning_rate": 3.6830783858601835e-07, + "loss": 0.9949, + "step": 4005 + }, + { + "epoch": 0.9172295363480252, + "grad_norm": 1.2182968854904175, + "learning_rate": 3.6631577410382034e-07, + "loss": 1.0146, + "step": 4006 + }, + { + "epoch": 0.9174585002862049, + "grad_norm": 1.3352956771850586, + "learning_rate": 3.643290109572828e-07, + "loss": 1.0326, + "step": 4007 + }, + { + "epoch": 0.9176874642243846, + "grad_norm": 1.6930725574493408, + "learning_rate": 3.6234755023970447e-07, + "loss": 1.0013, + "step": 4008 + }, + { + "epoch": 0.9179164281625644, + "grad_norm": 1.251842737197876, + "learning_rate": 3.603713930414676e-07, + "loss": 1.0314, + "step": 4009 + }, + { + "epoch": 0.9181453921007441, + "grad_norm": 1.3096429109573364, + "learning_rate": 3.5840054045003346e-07, + "loss": 1.024, + "step": 4010 + }, + { + "epoch": 0.9183743560389239, + "grad_norm": 1.7176495790481567, + "learning_rate": 3.564349935499478e-07, + "loss": 0.9741, + "step": 4011 + }, + { + "epoch": 0.9186033199771036, + "grad_norm": 1.384275197982788, + "learning_rate": 3.5447475342283324e-07, + "loss": 1.0004, + "step": 4012 + }, + { + "epoch": 0.9188322839152834, + "grad_norm": 1.2257745265960693, + "learning_rate": 3.525198211473935e-07, + "loss": 0.9716, + "step": 4013 + }, + { + "epoch": 0.9190612478534631, + "grad_norm": 1.544185757637024, + "learning_rate": 3.5057019779941246e-07, + "loss": 1.0022, + "step": 4014 + }, + { + "epoch": 0.9192902117916428, + "grad_norm": 2.072894334793091, + "learning_rate": 3.4862588445174985e-07, + "loss": 0.9794, + "step": 4015 + }, + { + "epoch": 0.9195191757298226, + "grad_norm": 1.132996678352356, + "learning_rate": 3.466868821743452e-07, + "loss": 1.021, + "step": 4016 + }, + { + "epoch": 0.9197481396680023, + "grad_norm": 1.3861950635910034, + "learning_rate": 3.447531920342173e-07, + "loss": 1.0204, + "step": 4017 + }, + { + "epoch": 0.919977103606182, + "grad_norm": 1.2314361333847046, + "learning_rate": 3.428248150954583e-07, + "loss": 1.0447, + "step": 4018 + }, + { + "epoch": 0.9202060675443617, + "grad_norm": 1.2820926904678345, + "learning_rate": 3.4090175241923593e-07, + "loss": 1.0334, + "step": 4019 + }, + { + "epoch": 0.9204350314825415, + "grad_norm": 1.4982575178146362, + "learning_rate": 3.3898400506379937e-07, + "loss": 1.1002, + "step": 4020 + }, + { + "epoch": 0.9206639954207212, + "grad_norm": 2.526170492172241, + "learning_rate": 3.3707157408446547e-07, + "loss": 1.0653, + "step": 4021 + }, + { + "epoch": 0.9208929593589009, + "grad_norm": 2.164984703063965, + "learning_rate": 3.3516446053363015e-07, + "loss": 0.9579, + "step": 4022 + }, + { + "epoch": 0.9211219232970808, + "grad_norm": 1.396539330482483, + "learning_rate": 3.3326266546076293e-07, + "loss": 0.9919, + "step": 4023 + }, + { + "epoch": 0.9213508872352605, + "grad_norm": 1.2713897228240967, + "learning_rate": 3.3136618991240544e-07, + "loss": 1.0049, + "step": 4024 + }, + { + "epoch": 0.9215798511734402, + "grad_norm": 1.5097935199737549, + "learning_rate": 3.294750349321718e-07, + "loss": 1.0192, + "step": 4025 + }, + { + "epoch": 0.9218088151116199, + "grad_norm": 1.3932673931121826, + "learning_rate": 3.2758920156074624e-07, + "loss": 0.9681, + "step": 4026 + }, + { + "epoch": 0.9220377790497997, + "grad_norm": 1.4764078855514526, + "learning_rate": 3.2570869083589196e-07, + "loss": 1.0264, + "step": 4027 + }, + { + "epoch": 0.9222667429879794, + "grad_norm": 1.1931257247924805, + "learning_rate": 3.238335037924356e-07, + "loss": 1.0474, + "step": 4028 + }, + { + "epoch": 0.9224957069261591, + "grad_norm": 1.2183254957199097, + "learning_rate": 3.219636414622751e-07, + "loss": 1.0471, + "step": 4029 + }, + { + "epoch": 0.9227246708643388, + "grad_norm": 1.6249024868011475, + "learning_rate": 3.20099104874384e-07, + "loss": 1.0004, + "step": 4030 + }, + { + "epoch": 0.9229536348025186, + "grad_norm": 1.2928696870803833, + "learning_rate": 3.1823989505479934e-07, + "loss": 0.969, + "step": 4031 + }, + { + "epoch": 0.9231825987406983, + "grad_norm": 1.292734980583191, + "learning_rate": 3.163860130266283e-07, + "loss": 1.0344, + "step": 4032 + }, + { + "epoch": 0.923411562678878, + "grad_norm": 1.6928443908691406, + "learning_rate": 3.145374598100448e-07, + "loss": 1.02, + "step": 4033 + }, + { + "epoch": 0.9236405266170579, + "grad_norm": 1.1957359313964844, + "learning_rate": 3.1269423642229737e-07, + "loss": 1.0593, + "step": 4034 + }, + { + "epoch": 0.9238694905552376, + "grad_norm": 1.3396857976913452, + "learning_rate": 3.1085634387769124e-07, + "loss": 1.0156, + "step": 4035 + }, + { + "epoch": 0.9240984544934173, + "grad_norm": 1.7538094520568848, + "learning_rate": 3.090237831876053e-07, + "loss": 1.0305, + "step": 4036 + }, + { + "epoch": 0.924327418431597, + "grad_norm": 1.2242697477340698, + "learning_rate": 3.0719655536048276e-07, + "loss": 1.027, + "step": 4037 + }, + { + "epoch": 0.9245563823697768, + "grad_norm": 1.4524619579315186, + "learning_rate": 3.053746614018305e-07, + "loss": 1.0006, + "step": 4038 + }, + { + "epoch": 0.9247853463079565, + "grad_norm": 1.2049915790557861, + "learning_rate": 3.035581023142231e-07, + "loss": 1.043, + "step": 4039 + }, + { + "epoch": 0.9250143102461362, + "grad_norm": 1.5115872621536255, + "learning_rate": 3.0174687909729663e-07, + "loss": 1.0647, + "step": 4040 + }, + { + "epoch": 0.925243274184316, + "grad_norm": 1.2160587310791016, + "learning_rate": 2.9994099274775256e-07, + "loss": 1.0233, + "step": 4041 + }, + { + "epoch": 0.9254722381224957, + "grad_norm": 1.3652007579803467, + "learning_rate": 2.9814044425935605e-07, + "loss": 1.0257, + "step": 4042 + }, + { + "epoch": 0.9257012020606754, + "grad_norm": 1.5073693990707397, + "learning_rate": 2.9634523462293005e-07, + "loss": 1.0529, + "step": 4043 + }, + { + "epoch": 0.9259301659988551, + "grad_norm": 1.4959176778793335, + "learning_rate": 2.945553648263677e-07, + "loss": 1.0633, + "step": 4044 + }, + { + "epoch": 0.9261591299370349, + "grad_norm": 1.320374846458435, + "learning_rate": 2.9277083585461776e-07, + "loss": 0.9744, + "step": 4045 + }, + { + "epoch": 0.9263880938752147, + "grad_norm": 1.4175662994384766, + "learning_rate": 2.909916486896913e-07, + "loss": 0.9728, + "step": 4046 + }, + { + "epoch": 0.9266170578133944, + "grad_norm": 1.35358464717865, + "learning_rate": 2.8921780431065973e-07, + "loss": 1.0241, + "step": 4047 + }, + { + "epoch": 0.9268460217515742, + "grad_norm": 1.674445390701294, + "learning_rate": 2.874493036936554e-07, + "loss": 0.9889, + "step": 4048 + }, + { + "epoch": 0.9270749856897539, + "grad_norm": 1.1121513843536377, + "learning_rate": 2.8568614781186886e-07, + "loss": 1.0436, + "step": 4049 + }, + { + "epoch": 0.9273039496279336, + "grad_norm": 1.7877289056777954, + "learning_rate": 2.839283376355506e-07, + "loss": 1.0327, + "step": 4050 + }, + { + "epoch": 0.9275329135661133, + "grad_norm": 1.4056133031845093, + "learning_rate": 2.82175874132008e-07, + "loss": 1.0264, + "step": 4051 + }, + { + "epoch": 0.9277618775042931, + "grad_norm": 1.2976934909820557, + "learning_rate": 2.804287582656073e-07, + "loss": 1.1032, + "step": 4052 + }, + { + "epoch": 0.9279908414424728, + "grad_norm": 1.4245258569717407, + "learning_rate": 2.7868699099777295e-07, + "loss": 1.0262, + "step": 4053 + }, + { + "epoch": 0.9282198053806525, + "grad_norm": 1.3631470203399658, + "learning_rate": 2.769505732869837e-07, + "loss": 1.0073, + "step": 4054 + }, + { + "epoch": 0.9284487693188322, + "grad_norm": 2.766822576522827, + "learning_rate": 2.752195060887775e-07, + "loss": 0.9939, + "step": 4055 + }, + { + "epoch": 0.928677733257012, + "grad_norm": 1.360872507095337, + "learning_rate": 2.7349379035574217e-07, + "loss": 1.0607, + "step": 4056 + }, + { + "epoch": 0.9289066971951918, + "grad_norm": 1.0225962400436401, + "learning_rate": 2.717734270375272e-07, + "loss": 1.0532, + "step": 4057 + }, + { + "epoch": 0.9291356611333715, + "grad_norm": 1.3829964399337769, + "learning_rate": 2.7005841708083516e-07, + "loss": 1.057, + "step": 4058 + }, + { + "epoch": 0.9293646250715513, + "grad_norm": 1.5266767740249634, + "learning_rate": 2.683487614294189e-07, + "loss": 1.0197, + "step": 4059 + }, + { + "epoch": 0.929593589009731, + "grad_norm": 1.3430898189544678, + "learning_rate": 2.6664446102408924e-07, + "loss": 1.046, + "step": 4060 + }, + { + "epoch": 0.9298225529479107, + "grad_norm": 1.3258440494537354, + "learning_rate": 2.649455168027082e-07, + "loss": 1.0198, + "step": 4061 + }, + { + "epoch": 0.9300515168860904, + "grad_norm": 1.0601249933242798, + "learning_rate": 2.632519297001912e-07, + "loss": 1.0252, + "step": 4062 + }, + { + "epoch": 0.9302804808242702, + "grad_norm": 1.4658838510513306, + "learning_rate": 2.615637006485039e-07, + "loss": 1.0577, + "step": 4063 + }, + { + "epoch": 0.9305094447624499, + "grad_norm": 1.3812763690948486, + "learning_rate": 2.5988083057666534e-07, + "loss": 1.0447, + "step": 4064 + }, + { + "epoch": 0.9307384087006296, + "grad_norm": 1.4836375713348389, + "learning_rate": 2.582033204107437e-07, + "loss": 0.9993, + "step": 4065 + }, + { + "epoch": 0.9309673726388094, + "grad_norm": 1.3811873197555542, + "learning_rate": 2.565311710738616e-07, + "loss": 1.0034, + "step": 4066 + }, + { + "epoch": 0.9311963365769891, + "grad_norm": 1.320236325263977, + "learning_rate": 2.5486438348618416e-07, + "loss": 0.9957, + "step": 4067 + }, + { + "epoch": 0.9314253005151688, + "grad_norm": 1.1758593320846558, + "learning_rate": 2.532029585649343e-07, + "loss": 1.0828, + "step": 4068 + }, + { + "epoch": 0.9316542644533486, + "grad_norm": 1.2372307777404785, + "learning_rate": 2.515468972243795e-07, + "loss": 1.0289, + "step": 4069 + }, + { + "epoch": 0.9318832283915284, + "grad_norm": 1.5127670764923096, + "learning_rate": 2.4989620037583653e-07, + "loss": 1.116, + "step": 4070 + }, + { + "epoch": 0.9321121923297081, + "grad_norm": 1.2241275310516357, + "learning_rate": 2.4825086892766745e-07, + "loss": 0.9408, + "step": 4071 + }, + { + "epoch": 0.9323411562678878, + "grad_norm": 1.1708391904830933, + "learning_rate": 2.466109037852893e-07, + "loss": 1.0203, + "step": 4072 + }, + { + "epoch": 0.9325701202060676, + "grad_norm": 1.14144766330719, + "learning_rate": 2.4497630585115673e-07, + "loss": 0.9937, + "step": 4073 + }, + { + "epoch": 0.9327990841442473, + "grad_norm": 2.2938714027404785, + "learning_rate": 2.4334707602477693e-07, + "loss": 1.0365, + "step": 4074 + }, + { + "epoch": 0.933028048082427, + "grad_norm": 1.596992015838623, + "learning_rate": 2.417232152027016e-07, + "loss": 1.0269, + "step": 4075 + }, + { + "epoch": 0.9332570120206067, + "grad_norm": 1.3394994735717773, + "learning_rate": 2.4010472427852706e-07, + "loss": 1.0008, + "step": 4076 + }, + { + "epoch": 0.9334859759587865, + "grad_norm": 2.9255053997039795, + "learning_rate": 2.384916041428964e-07, + "loss": 1.0246, + "step": 4077 + }, + { + "epoch": 0.9337149398969662, + "grad_norm": 2.5200510025024414, + "learning_rate": 2.3688385568349515e-07, + "loss": 1.0253, + "step": 4078 + }, + { + "epoch": 0.9339439038351459, + "grad_norm": 1.3866181373596191, + "learning_rate": 2.3528147978505334e-07, + "loss": 1.0208, + "step": 4079 + }, + { + "epoch": 0.9341728677733258, + "grad_norm": 1.7252721786499023, + "learning_rate": 2.3368447732934785e-07, + "loss": 1.0771, + "step": 4080 + }, + { + "epoch": 0.9344018317115055, + "grad_norm": 1.2650654315948486, + "learning_rate": 2.3209284919519127e-07, + "loss": 1.0407, + "step": 4081 + }, + { + "epoch": 0.9346307956496852, + "grad_norm": 1.2546926736831665, + "learning_rate": 2.305065962584463e-07, + "loss": 1.0684, + "step": 4082 + }, + { + "epoch": 0.9348597595878649, + "grad_norm": 1.51604425907135, + "learning_rate": 2.289257193920158e-07, + "loss": 1.0321, + "step": 4083 + }, + { + "epoch": 0.9350887235260447, + "grad_norm": 1.2213822603225708, + "learning_rate": 2.2735021946583946e-07, + "loss": 1.0326, + "step": 4084 + }, + { + "epoch": 0.9353176874642244, + "grad_norm": 1.4243481159210205, + "learning_rate": 2.2578009734690264e-07, + "loss": 1.0401, + "step": 4085 + }, + { + "epoch": 0.9355466514024041, + "grad_norm": 1.6007384061813354, + "learning_rate": 2.242153538992331e-07, + "loss": 0.9734, + "step": 4086 + }, + { + "epoch": 0.9357756153405838, + "grad_norm": 2.3834850788116455, + "learning_rate": 2.2265598998389316e-07, + "loss": 0.988, + "step": 4087 + }, + { + "epoch": 0.9360045792787636, + "grad_norm": 1.524840235710144, + "learning_rate": 2.211020064589886e-07, + "loss": 1.0204, + "step": 4088 + }, + { + "epoch": 0.9362335432169433, + "grad_norm": 1.3676830530166626, + "learning_rate": 2.195534041796632e-07, + "loss": 1.0464, + "step": 4089 + }, + { + "epoch": 0.936462507155123, + "grad_norm": 1.5607987642288208, + "learning_rate": 2.180101839981008e-07, + "loss": 1.0407, + "step": 4090 + }, + { + "epoch": 0.9366914710933028, + "grad_norm": 1.0320096015930176, + "learning_rate": 2.1647234676352213e-07, + "loss": 1.0463, + "step": 4091 + }, + { + "epoch": 0.9369204350314826, + "grad_norm": 1.3179558515548706, + "learning_rate": 2.1493989332218468e-07, + "loss": 1.0596, + "step": 4092 + }, + { + "epoch": 0.9371493989696623, + "grad_norm": 1.3272641897201538, + "learning_rate": 2.134128245173872e-07, + "loss": 1.0372, + "step": 4093 + }, + { + "epoch": 0.937378362907842, + "grad_norm": 1.2094119787216187, + "learning_rate": 2.1189114118946196e-07, + "loss": 1.032, + "step": 4094 + }, + { + "epoch": 0.9376073268460218, + "grad_norm": 1.2302625179290771, + "learning_rate": 2.103748441757758e-07, + "loss": 0.9675, + "step": 4095 + }, + { + "epoch": 0.9378362907842015, + "grad_norm": 1.7436269521713257, + "learning_rate": 2.0886393431073794e-07, + "loss": 1.0837, + "step": 4096 + }, + { + "epoch": 0.9380652547223812, + "grad_norm": 1.3275481462478638, + "learning_rate": 2.0735841242578992e-07, + "loss": 1.0377, + "step": 4097 + }, + { + "epoch": 0.938294218660561, + "grad_norm": 1.3621660470962524, + "learning_rate": 2.0585827934940238e-07, + "loss": 1.0369, + "step": 4098 + }, + { + "epoch": 0.9385231825987407, + "grad_norm": 1.739432692527771, + "learning_rate": 2.043635359070928e-07, + "loss": 1.0002, + "step": 4099 + }, + { + "epoch": 0.9387521465369204, + "grad_norm": 1.273725986480713, + "learning_rate": 2.0287418292140204e-07, + "loss": 0.9611, + "step": 4100 + }, + { + "epoch": 0.9389811104751001, + "grad_norm": 1.3180021047592163, + "learning_rate": 2.0139022121190788e-07, + "loss": 1.0431, + "step": 4101 + }, + { + "epoch": 0.9392100744132799, + "grad_norm": 1.3748250007629395, + "learning_rate": 1.9991165159522485e-07, + "loss": 0.9487, + "step": 4102 + }, + { + "epoch": 0.9394390383514597, + "grad_norm": 1.186481237411499, + "learning_rate": 1.9843847488499545e-07, + "loss": 1.0343, + "step": 4103 + }, + { + "epoch": 0.9396680022896394, + "grad_norm": 1.2737975120544434, + "learning_rate": 1.9697069189189678e-07, + "loss": 1.0535, + "step": 4104 + }, + { + "epoch": 0.9398969662278192, + "grad_norm": 1.1873112916946411, + "learning_rate": 1.9550830342363714e-07, + "loss": 1.0056, + "step": 4105 + }, + { + "epoch": 0.9401259301659989, + "grad_norm": 1.3513420820236206, + "learning_rate": 1.9405131028495838e-07, + "loss": 1.0278, + "step": 4106 + }, + { + "epoch": 0.9403548941041786, + "grad_norm": 1.1599657535552979, + "learning_rate": 1.9259971327763028e-07, + "loss": 1.0013, + "step": 4107 + }, + { + "epoch": 0.9405838580423583, + "grad_norm": 2.088212728500366, + "learning_rate": 1.9115351320045495e-07, + "loss": 1.0098, + "step": 4108 + }, + { + "epoch": 0.9408128219805381, + "grad_norm": 1.281222939491272, + "learning_rate": 1.8971271084926245e-07, + "loss": 0.9935, + "step": 4109 + }, + { + "epoch": 0.9410417859187178, + "grad_norm": 1.147822618484497, + "learning_rate": 1.8827730701691749e-07, + "loss": 1.0115, + "step": 4110 + }, + { + "epoch": 0.9412707498568975, + "grad_norm": 2.1663131713867188, + "learning_rate": 1.868473024933082e-07, + "loss": 1.1268, + "step": 4111 + }, + { + "epoch": 0.9414997137950772, + "grad_norm": 1.2913000583648682, + "learning_rate": 1.8542269806535285e-07, + "loss": 1.0183, + "step": 4112 + }, + { + "epoch": 0.941728677733257, + "grad_norm": 1.3916893005371094, + "learning_rate": 1.8400349451700438e-07, + "loss": 1.0304, + "step": 4113 + }, + { + "epoch": 0.9419576416714367, + "grad_norm": 1.3906985521316528, + "learning_rate": 1.8258969262923366e-07, + "loss": 1.0088, + "step": 4114 + }, + { + "epoch": 0.9421866056096165, + "grad_norm": 1.523729681968689, + "learning_rate": 1.8118129318004717e-07, + "loss": 1.0386, + "step": 4115 + }, + { + "epoch": 0.9424155695477963, + "grad_norm": 1.3909317255020142, + "learning_rate": 1.7977829694447278e-07, + "loss": 0.9631, + "step": 4116 + }, + { + "epoch": 0.942644533485976, + "grad_norm": 1.4848453998565674, + "learning_rate": 1.7838070469456958e-07, + "loss": 0.9619, + "step": 4117 + }, + { + "epoch": 0.9428734974241557, + "grad_norm": 1.3331187963485718, + "learning_rate": 1.769885171994201e-07, + "loss": 1.0474, + "step": 4118 + }, + { + "epoch": 0.9431024613623354, + "grad_norm": 1.6051127910614014, + "learning_rate": 1.7560173522513268e-07, + "loss": 1.0188, + "step": 4119 + }, + { + "epoch": 0.9433314253005152, + "grad_norm": 1.2383366823196411, + "learning_rate": 1.742203595348435e-07, + "loss": 1.0819, + "step": 4120 + }, + { + "epoch": 0.9435603892386949, + "grad_norm": 1.607271671295166, + "learning_rate": 1.728443908887112e-07, + "loss": 0.9864, + "step": 4121 + }, + { + "epoch": 0.9437893531768746, + "grad_norm": 1.5015615224838257, + "learning_rate": 1.7147383004391782e-07, + "loss": 1.0301, + "step": 4122 + }, + { + "epoch": 0.9440183171150544, + "grad_norm": 1.2938905954360962, + "learning_rate": 1.7010867775467454e-07, + "loss": 0.9931, + "step": 4123 + }, + { + "epoch": 0.9442472810532341, + "grad_norm": 1.4441864490509033, + "learning_rate": 1.6874893477221376e-07, + "loss": 1.0832, + "step": 4124 + }, + { + "epoch": 0.9444762449914138, + "grad_norm": 1.309255599975586, + "learning_rate": 1.6739460184478694e-07, + "loss": 1.0367, + "step": 4125 + }, + { + "epoch": 0.9447052089295936, + "grad_norm": 1.3141146898269653, + "learning_rate": 1.660456797176735e-07, + "loss": 1.0261, + "step": 4126 + }, + { + "epoch": 0.9449341728677734, + "grad_norm": 1.307094931602478, + "learning_rate": 1.6470216913317628e-07, + "loss": 1.0172, + "step": 4127 + }, + { + "epoch": 0.9451631368059531, + "grad_norm": 1.344239592552185, + "learning_rate": 1.63364070830615e-07, + "loss": 1.0416, + "step": 4128 + }, + { + "epoch": 0.9453921007441328, + "grad_norm": 1.2404433488845825, + "learning_rate": 1.6203138554633625e-07, + "loss": 1.0191, + "step": 4129 + }, + { + "epoch": 0.9456210646823126, + "grad_norm": 1.1200016736984253, + "learning_rate": 1.6070411401370335e-07, + "loss": 0.9745, + "step": 4130 + }, + { + "epoch": 0.9458500286204923, + "grad_norm": 3.205339193344116, + "learning_rate": 1.5938225696310427e-07, + "loss": 1.0064, + "step": 4131 + }, + { + "epoch": 0.946078992558672, + "grad_norm": 1.468671441078186, + "learning_rate": 1.580658151219461e-07, + "loss": 1.0288, + "step": 4132 + }, + { + "epoch": 0.9463079564968517, + "grad_norm": 1.7323161363601685, + "learning_rate": 1.567547892146537e-07, + "loss": 0.9814, + "step": 4133 + }, + { + "epoch": 0.9465369204350315, + "grad_norm": 1.4511128664016724, + "learning_rate": 1.5544917996267562e-07, + "loss": 0.9879, + "step": 4134 + }, + { + "epoch": 0.9467658843732112, + "grad_norm": 1.2336581945419312, + "learning_rate": 1.541489880844782e-07, + "loss": 1.0541, + "step": 4135 + }, + { + "epoch": 0.9469948483113909, + "grad_norm": 1.2576466798782349, + "learning_rate": 1.528542142955436e-07, + "loss": 1.0047, + "step": 4136 + }, + { + "epoch": 0.9472238122495706, + "grad_norm": 1.4360203742980957, + "learning_rate": 1.5156485930837628e-07, + "loss": 1.0658, + "step": 4137 + }, + { + "epoch": 0.9474527761877505, + "grad_norm": 1.8652851581573486, + "learning_rate": 1.5028092383249871e-07, + "loss": 1.0568, + "step": 4138 + }, + { + "epoch": 0.9476817401259302, + "grad_norm": 1.3658671379089355, + "learning_rate": 1.4900240857444792e-07, + "loss": 1.095, + "step": 4139 + }, + { + "epoch": 0.9479107040641099, + "grad_norm": 1.3814666271209717, + "learning_rate": 1.4772931423778003e-07, + "loss": 1.089, + "step": 4140 + }, + { + "epoch": 0.9481396680022897, + "grad_norm": 1.3728240728378296, + "learning_rate": 1.464616415230702e-07, + "loss": 1.0431, + "step": 4141 + }, + { + "epoch": 0.9483686319404694, + "grad_norm": 1.6586147546768188, + "learning_rate": 1.4519939112790705e-07, + "loss": 1.0047, + "step": 4142 + }, + { + "epoch": 0.9485975958786491, + "grad_norm": 1.2365854978561401, + "learning_rate": 1.4394256374689498e-07, + "loss": 1.0841, + "step": 4143 + }, + { + "epoch": 0.9488265598168288, + "grad_norm": 1.3631367683410645, + "learning_rate": 1.426911600716574e-07, + "loss": 1.0188, + "step": 4144 + }, + { + "epoch": 0.9490555237550086, + "grad_norm": 1.1804825067520142, + "learning_rate": 1.4144518079083125e-07, + "loss": 1.0877, + "step": 4145 + }, + { + "epoch": 0.9492844876931883, + "grad_norm": 1.24893057346344, + "learning_rate": 1.4020462659006806e-07, + "loss": 1.0689, + "step": 4146 + }, + { + "epoch": 0.949513451631368, + "grad_norm": 1.317393183708191, + "learning_rate": 1.3896949815203398e-07, + "loss": 1.0285, + "step": 4147 + }, + { + "epoch": 0.9497424155695477, + "grad_norm": 1.233035683631897, + "learning_rate": 1.3773979615640976e-07, + "loss": 1.0676, + "step": 4148 + }, + { + "epoch": 0.9499713795077276, + "grad_norm": 1.0426533222198486, + "learning_rate": 1.3651552127989186e-07, + "loss": 1.0019, + "step": 4149 + }, + { + "epoch": 0.9502003434459073, + "grad_norm": 1.5621211528778076, + "learning_rate": 1.3529667419618475e-07, + "loss": 1.0579, + "step": 4150 + }, + { + "epoch": 0.950429307384087, + "grad_norm": 1.3285621404647827, + "learning_rate": 1.3408325557601297e-07, + "loss": 1.0285, + "step": 4151 + }, + { + "epoch": 0.9506582713222668, + "grad_norm": 1.1618311405181885, + "learning_rate": 1.3287526608711132e-07, + "loss": 1.046, + "step": 4152 + }, + { + "epoch": 0.9508872352604465, + "grad_norm": 1.317858338356018, + "learning_rate": 1.3167270639422357e-07, + "loss": 1.0425, + "step": 4153 + }, + { + "epoch": 0.9511161991986262, + "grad_norm": 1.2323342561721802, + "learning_rate": 1.304755771591093e-07, + "loss": 1.0702, + "step": 4154 + }, + { + "epoch": 0.951345163136806, + "grad_norm": 1.3113363981246948, + "learning_rate": 1.292838790405393e-07, + "loss": 1.026, + "step": 4155 + }, + { + "epoch": 0.9515741270749857, + "grad_norm": 1.145249366760254, + "learning_rate": 1.2809761269429343e-07, + "loss": 1.0076, + "step": 4156 + }, + { + "epoch": 0.9518030910131654, + "grad_norm": 1.527535319328308, + "learning_rate": 1.269167787731662e-07, + "loss": 0.9754, + "step": 4157 + }, + { + "epoch": 0.9520320549513451, + "grad_norm": 1.321520209312439, + "learning_rate": 1.2574137792695895e-07, + "loss": 1.0286, + "step": 4158 + }, + { + "epoch": 0.9522610188895249, + "grad_norm": 1.5232340097427368, + "learning_rate": 1.2457141080248647e-07, + "loss": 0.9744, + "step": 4159 + }, + { + "epoch": 0.9524899828277046, + "grad_norm": 1.3998576402664185, + "learning_rate": 1.2340687804357044e-07, + "loss": 1.0796, + "step": 4160 + }, + { + "epoch": 0.9527189467658844, + "grad_norm": 1.2090296745300293, + "learning_rate": 1.2224778029104377e-07, + "loss": 1.0844, + "step": 4161 + }, + { + "epoch": 0.9529479107040641, + "grad_norm": 1.319123387336731, + "learning_rate": 1.2109411818274851e-07, + "loss": 1.0412, + "step": 4162 + }, + { + "epoch": 0.9531768746422439, + "grad_norm": 1.2785385847091675, + "learning_rate": 1.1994589235353682e-07, + "loss": 1.0274, + "step": 4163 + }, + { + "epoch": 0.9534058385804236, + "grad_norm": 1.2551246881484985, + "learning_rate": 1.1880310343526324e-07, + "loss": 1.0395, + "step": 4164 + }, + { + "epoch": 0.9536348025186033, + "grad_norm": 1.755634069442749, + "learning_rate": 1.1766575205680031e-07, + "loss": 1.0167, + "step": 4165 + }, + { + "epoch": 0.9538637664567831, + "grad_norm": 1.1135053634643555, + "learning_rate": 1.1653383884401959e-07, + "loss": 0.9704, + "step": 4166 + }, + { + "epoch": 0.9540927303949628, + "grad_norm": 1.2887723445892334, + "learning_rate": 1.1540736441980505e-07, + "loss": 1.0541, + "step": 4167 + }, + { + "epoch": 0.9543216943331425, + "grad_norm": 1.2668262720108032, + "learning_rate": 1.1428632940404416e-07, + "loss": 1.0312, + "step": 4168 + }, + { + "epoch": 0.9545506582713222, + "grad_norm": 1.4527376890182495, + "learning_rate": 1.1317073441363458e-07, + "loss": 1.0343, + "step": 4169 + }, + { + "epoch": 0.954779622209502, + "grad_norm": 1.378683090209961, + "learning_rate": 1.1206058006247966e-07, + "loss": 0.9851, + "step": 4170 + }, + { + "epoch": 0.9550085861476817, + "grad_norm": 1.202966570854187, + "learning_rate": 1.109558669614863e-07, + "loss": 1.027, + "step": 4171 + }, + { + "epoch": 0.9552375500858615, + "grad_norm": 1.4312129020690918, + "learning_rate": 1.0985659571856933e-07, + "loss": 1.0712, + "step": 4172 + }, + { + "epoch": 0.9554665140240413, + "grad_norm": 1.2038168907165527, + "learning_rate": 1.0876276693864818e-07, + "loss": 1.0783, + "step": 4173 + }, + { + "epoch": 0.955695477962221, + "grad_norm": 1.1261094808578491, + "learning_rate": 1.0767438122364914e-07, + "loss": 1.0094, + "step": 4174 + }, + { + "epoch": 0.9559244419004007, + "grad_norm": 2.3489320278167725, + "learning_rate": 1.0659143917250092e-07, + "loss": 1.0653, + "step": 4175 + }, + { + "epoch": 0.9561534058385804, + "grad_norm": 1.5417100191116333, + "learning_rate": 1.055139413811379e-07, + "loss": 0.9797, + "step": 4176 + }, + { + "epoch": 0.9563823697767602, + "grad_norm": 1.4111857414245605, + "learning_rate": 1.0444188844249691e-07, + "loss": 0.997, + "step": 4177 + }, + { + "epoch": 0.9566113337149399, + "grad_norm": 1.4079347848892212, + "learning_rate": 1.0337528094651938e-07, + "loss": 0.9685, + "step": 4178 + }, + { + "epoch": 0.9568402976531196, + "grad_norm": 1.3267221450805664, + "learning_rate": 1.0231411948015247e-07, + "loss": 1.1016, + "step": 4179 + }, + { + "epoch": 0.9570692615912993, + "grad_norm": 1.3851131200790405, + "learning_rate": 1.0125840462734238e-07, + "loss": 1.0756, + "step": 4180 + }, + { + "epoch": 0.9572982255294791, + "grad_norm": 1.3021942377090454, + "learning_rate": 1.0020813696904108e-07, + "loss": 0.9716, + "step": 4181 + }, + { + "epoch": 0.9575271894676588, + "grad_norm": 1.1864266395568848, + "learning_rate": 9.916331708320403e-08, + "loss": 0.9976, + "step": 4182 + }, + { + "epoch": 0.9577561534058385, + "grad_norm": 13.135167121887207, + "learning_rate": 9.812394554478355e-08, + "loss": 1.0404, + "step": 4183 + }, + { + "epoch": 0.9579851173440184, + "grad_norm": 1.3869199752807617, + "learning_rate": 9.709002292573876e-08, + "loss": 1.0379, + "step": 4184 + }, + { + "epoch": 0.9582140812821981, + "grad_norm": 1.8574575185775757, + "learning_rate": 9.606154979502791e-08, + "loss": 1.0384, + "step": 4185 + }, + { + "epoch": 0.9584430452203778, + "grad_norm": 1.1653584241867065, + "learning_rate": 9.503852671861158e-08, + "loss": 1.0695, + "step": 4186 + }, + { + "epoch": 0.9586720091585575, + "grad_norm": 1.499384880065918, + "learning_rate": 9.402095425945168e-08, + "loss": 1.0062, + "step": 4187 + }, + { + "epoch": 0.9589009730967373, + "grad_norm": 1.988095760345459, + "learning_rate": 9.300883297750696e-08, + "loss": 1.0296, + "step": 4188 + }, + { + "epoch": 0.959129937034917, + "grad_norm": 1.2225043773651123, + "learning_rate": 9.2002163429743e-08, + "loss": 0.9754, + "step": 4189 + }, + { + "epoch": 0.9593589009730967, + "grad_norm": 1.299344778060913, + "learning_rate": 9.10009461701189e-08, + "loss": 1.0025, + "step": 4190 + }, + { + "epoch": 0.9595878649112765, + "grad_norm": 1.1545172929763794, + "learning_rate": 9.000518174959726e-08, + "loss": 1.0316, + "step": 4191 + }, + { + "epoch": 0.9598168288494562, + "grad_norm": 1.2487175464630127, + "learning_rate": 8.901487071613868e-08, + "loss": 1.06, + "step": 4192 + }, + { + "epoch": 0.9600457927876359, + "grad_norm": 1.9878804683685303, + "learning_rate": 8.803001361470386e-08, + "loss": 1.053, + "step": 4193 + }, + { + "epoch": 0.9602747567258156, + "grad_norm": 1.282788634300232, + "learning_rate": 8.705061098724932e-08, + "loss": 0.9953, + "step": 4194 + }, + { + "epoch": 0.9605037206639955, + "grad_norm": 1.5588258504867554, + "learning_rate": 8.60766633727339e-08, + "loss": 1.0122, + "step": 4195 + }, + { + "epoch": 0.9607326846021752, + "grad_norm": 2.007049798965454, + "learning_rate": 8.510817130711224e-08, + "loss": 1.0305, + "step": 4196 + }, + { + "epoch": 0.9609616485403549, + "grad_norm": 1.2135330438613892, + "learning_rate": 8.41451353233369e-08, + "loss": 1.0765, + "step": 4197 + }, + { + "epoch": 0.9611906124785347, + "grad_norm": 1.5741900205612183, + "learning_rate": 8.318755595135952e-08, + "loss": 1.0657, + "step": 4198 + }, + { + "epoch": 0.9614195764167144, + "grad_norm": 1.8799227476119995, + "learning_rate": 8.223543371812748e-08, + "loss": 1.0397, + "step": 4199 + }, + { + "epoch": 0.9616485403548941, + "grad_norm": 2.1442928314208984, + "learning_rate": 8.128876914758499e-08, + "loss": 1.0042, + "step": 4200 + }, + { + "epoch": 0.9618775042930738, + "grad_norm": 1.2874436378479004, + "learning_rate": 8.034756276067534e-08, + "loss": 0.9996, + "step": 4201 + }, + { + "epoch": 0.9621064682312536, + "grad_norm": 1.346038579940796, + "learning_rate": 7.941181507533424e-08, + "loss": 1.0304, + "step": 4202 + }, + { + "epoch": 0.9623354321694333, + "grad_norm": 1.7882304191589355, + "learning_rate": 7.848152660649866e-08, + "loss": 1.0538, + "step": 4203 + }, + { + "epoch": 0.962564396107613, + "grad_norm": 1.237015724182129, + "learning_rate": 7.755669786609688e-08, + "loss": 1.0272, + "step": 4204 + }, + { + "epoch": 0.9627933600457927, + "grad_norm": 1.6121935844421387, + "learning_rate": 7.663732936305291e-08, + "loss": 1.015, + "step": 4205 + }, + { + "epoch": 0.9630223239839725, + "grad_norm": 1.267065405845642, + "learning_rate": 7.572342160328982e-08, + "loss": 1.0515, + "step": 4206 + }, + { + "epoch": 0.9632512879221523, + "grad_norm": 1.263258934020996, + "learning_rate": 7.481497508972313e-08, + "loss": 1.0207, + "step": 4207 + }, + { + "epoch": 0.963480251860332, + "grad_norm": 1.343643069267273, + "learning_rate": 7.391199032226182e-08, + "loss": 1.0493, + "step": 4208 + }, + { + "epoch": 0.9637092157985118, + "grad_norm": 1.558370590209961, + "learning_rate": 7.301446779781285e-08, + "loss": 1.0701, + "step": 4209 + }, + { + "epoch": 0.9639381797366915, + "grad_norm": 1.1900969743728638, + "learning_rate": 7.212240801027337e-08, + "loss": 0.9938, + "step": 4210 + }, + { + "epoch": 0.9641671436748712, + "grad_norm": 2.1308228969573975, + "learning_rate": 7.123581145053849e-08, + "loss": 1.0434, + "step": 4211 + }, + { + "epoch": 0.964396107613051, + "grad_norm": 1.5577213764190674, + "learning_rate": 7.035467860649348e-08, + "loss": 1.0161, + "step": 4212 + }, + { + "epoch": 0.9646250715512307, + "grad_norm": 1.1246519088745117, + "learning_rate": 6.947900996301826e-08, + "loss": 1.0164, + "step": 4213 + }, + { + "epoch": 0.9648540354894104, + "grad_norm": 1.6314594745635986, + "learning_rate": 6.860880600198627e-08, + "loss": 1.0539, + "step": 4214 + }, + { + "epoch": 0.9650829994275901, + "grad_norm": 1.6893818378448486, + "learning_rate": 6.774406720226335e-08, + "loss": 1.0481, + "step": 4215 + }, + { + "epoch": 0.9653119633657699, + "grad_norm": 1.3539258241653442, + "learning_rate": 6.688479403970883e-08, + "loss": 1.0231, + "step": 4216 + }, + { + "epoch": 0.9655409273039496, + "grad_norm": 1.230441927909851, + "learning_rate": 6.603098698717336e-08, + "loss": 1.0163, + "step": 4217 + }, + { + "epoch": 0.9657698912421294, + "grad_norm": 1.3217246532440186, + "learning_rate": 6.51826465144978e-08, + "loss": 0.98, + "step": 4218 + }, + { + "epoch": 0.9659988551803091, + "grad_norm": 1.1363701820373535, + "learning_rate": 6.433977308851869e-08, + "loss": 1.0446, + "step": 4219 + }, + { + "epoch": 0.9662278191184889, + "grad_norm": 1.5187417268753052, + "learning_rate": 6.35023671730628e-08, + "loss": 1.0349, + "step": 4220 + }, + { + "epoch": 0.9664567830566686, + "grad_norm": 1.2305715084075928, + "learning_rate": 6.267042922894595e-08, + "loss": 1.0425, + "step": 4221 + }, + { + "epoch": 0.9666857469948483, + "grad_norm": 1.2499189376831055, + "learning_rate": 6.18439597139775e-08, + "loss": 0.9435, + "step": 4222 + }, + { + "epoch": 0.966914710933028, + "grad_norm": 1.2281574010849, + "learning_rate": 6.102295908295585e-08, + "loss": 1.0268, + "step": 4223 + }, + { + "epoch": 0.9671436748712078, + "grad_norm": 1.3302663564682007, + "learning_rate": 6.020742778767185e-08, + "loss": 1.012, + "step": 4224 + }, + { + "epoch": 0.9673726388093875, + "grad_norm": 1.1993911266326904, + "learning_rate": 5.93973662769054e-08, + "loss": 1.0542, + "step": 4225 + }, + { + "epoch": 0.9676016027475672, + "grad_norm": 1.2014667987823486, + "learning_rate": 5.859277499642546e-08, + "loss": 0.9975, + "step": 4226 + }, + { + "epoch": 0.967830566685747, + "grad_norm": 1.501274585723877, + "learning_rate": 5.7793654388993426e-08, + "loss": 1.0324, + "step": 4227 + }, + { + "epoch": 0.9680595306239267, + "grad_norm": 1.7279783487319946, + "learning_rate": 5.700000489435753e-08, + "loss": 1.0626, + "step": 4228 + }, + { + "epoch": 0.9682884945621064, + "grad_norm": 1.46021568775177, + "learning_rate": 5.621182694925731e-08, + "loss": 1.0041, + "step": 4229 + }, + { + "epoch": 0.9685174585002863, + "grad_norm": 1.1859759092330933, + "learning_rate": 5.542912098741915e-08, + "loss": 1.0243, + "step": 4230 + }, + { + "epoch": 0.968746422438466, + "grad_norm": 1.5773413181304932, + "learning_rate": 5.465188743956073e-08, + "loss": 1.0268, + "step": 4231 + }, + { + "epoch": 0.9689753863766457, + "grad_norm": 1.4995766878128052, + "learning_rate": 5.388012673338661e-08, + "loss": 1.0252, + "step": 4232 + }, + { + "epoch": 0.9692043503148254, + "grad_norm": 1.340819239616394, + "learning_rate": 5.3113839293590374e-08, + "loss": 1.0199, + "step": 4233 + }, + { + "epoch": 0.9694333142530052, + "grad_norm": 1.144403338432312, + "learning_rate": 5.235302554185362e-08, + "loss": 0.9865, + "step": 4234 + }, + { + "epoch": 0.9696622781911849, + "grad_norm": 1.210654854774475, + "learning_rate": 5.159768589684699e-08, + "loss": 0.978, + "step": 4235 + }, + { + "epoch": 0.9698912421293646, + "grad_norm": 1.2872427701950073, + "learning_rate": 5.084782077422468e-08, + "loss": 1.0102, + "step": 4236 + }, + { + "epoch": 0.9701202060675443, + "grad_norm": 1.495758295059204, + "learning_rate": 5.010343058663325e-08, + "loss": 1.0315, + "step": 4237 + }, + { + "epoch": 0.9703491700057241, + "grad_norm": 1.3226579427719116, + "learning_rate": 4.9364515743705046e-08, + "loss": 1.0261, + "step": 4238 + }, + { + "epoch": 0.9705781339439038, + "grad_norm": 1.2678083181381226, + "learning_rate": 4.863107665205702e-08, + "loss": 1.0226, + "step": 4239 + }, + { + "epoch": 0.9708070978820835, + "grad_norm": 1.6307822465896606, + "learning_rate": 4.79031137152941e-08, + "loss": 1.0238, + "step": 4240 + }, + { + "epoch": 0.9710360618202634, + "grad_norm": 1.3601518869400024, + "learning_rate": 4.71806273340103e-08, + "loss": 0.9714, + "step": 4241 + }, + { + "epoch": 0.9712650257584431, + "grad_norm": 1.3477270603179932, + "learning_rate": 4.646361790578313e-08, + "loss": 1.0751, + "step": 4242 + }, + { + "epoch": 0.9714939896966228, + "grad_norm": 1.1436761617660522, + "learning_rate": 4.575208582517587e-08, + "loss": 1.0135, + "step": 4243 + }, + { + "epoch": 0.9717229536348025, + "grad_norm": 2.0135879516601562, + "learning_rate": 4.504603148373976e-08, + "loss": 0.9799, + "step": 4244 + }, + { + "epoch": 0.9719519175729823, + "grad_norm": 2.1591763496398926, + "learning_rate": 4.4345455270010665e-08, + "loss": 1.0147, + "step": 4245 + }, + { + "epoch": 0.972180881511162, + "grad_norm": 1.2410547733306885, + "learning_rate": 4.365035756950797e-08, + "loss": 1.0154, + "step": 4246 + }, + { + "epoch": 0.9724098454493417, + "grad_norm": 1.4873212575912476, + "learning_rate": 4.296073876474016e-08, + "loss": 1.0427, + "step": 4247 + }, + { + "epoch": 0.9726388093875215, + "grad_norm": 1.2260034084320068, + "learning_rate": 4.227659923519811e-08, + "loss": 1.0528, + "step": 4248 + }, + { + "epoch": 0.9728677733257012, + "grad_norm": 1.4164505004882812, + "learning_rate": 4.159793935735734e-08, + "loss": 1.0673, + "step": 4249 + }, + { + "epoch": 0.9730967372638809, + "grad_norm": 1.2214871644973755, + "learning_rate": 4.092475950468022e-08, + "loss": 1.0818, + "step": 4250 + }, + { + "epoch": 0.9733257012020606, + "grad_norm": 1.3067419528961182, + "learning_rate": 4.025706004760932e-08, + "loss": 0.9782, + "step": 4251 + }, + { + "epoch": 0.9735546651402404, + "grad_norm": 1.2343987226486206, + "learning_rate": 3.9594841353577384e-08, + "loss": 1.093, + "step": 4252 + }, + { + "epoch": 0.9737836290784202, + "grad_norm": 1.420095443725586, + "learning_rate": 3.8938103786995144e-08, + "loss": 1.0679, + "step": 4253 + }, + { + "epoch": 0.9740125930165999, + "grad_norm": 1.1239619255065918, + "learning_rate": 3.8286847709261276e-08, + "loss": 1.0286, + "step": 4254 + }, + { + "epoch": 0.9742415569547797, + "grad_norm": 1.2686529159545898, + "learning_rate": 3.7641073478755786e-08, + "loss": 0.9945, + "step": 4255 + }, + { + "epoch": 0.9744705208929594, + "grad_norm": 2.8842384815216064, + "learning_rate": 3.7000781450844405e-08, + "loss": 1.0213, + "step": 4256 + }, + { + "epoch": 0.9746994848311391, + "grad_norm": 1.8031467199325562, + "learning_rate": 3.636597197787084e-08, + "loss": 1.0022, + "step": 4257 + }, + { + "epoch": 0.9749284487693188, + "grad_norm": 1.7563955783843994, + "learning_rate": 3.573664540916899e-08, + "loss": 0.9997, + "step": 4258 + }, + { + "epoch": 0.9751574127074986, + "grad_norm": 2.2838706970214844, + "learning_rate": 3.5112802091051834e-08, + "loss": 1.0434, + "step": 4259 + }, + { + "epoch": 0.9753863766456783, + "grad_norm": 1.1979697942733765, + "learning_rate": 3.449444236681254e-08, + "loss": 1.0448, + "step": 4260 + }, + { + "epoch": 0.975615340583858, + "grad_norm": 1.2877243757247925, + "learning_rate": 3.388156657673114e-08, + "loss": 1.0362, + "step": 4261 + }, + { + "epoch": 0.9758443045220377, + "grad_norm": 1.5697972774505615, + "learning_rate": 3.327417505806785e-08, + "loss": 1.0681, + "step": 4262 + }, + { + "epoch": 0.9760732684602175, + "grad_norm": 1.3261923789978027, + "learning_rate": 3.267226814506419e-08, + "loss": 1.0489, + "step": 4263 + }, + { + "epoch": 0.9763022323983973, + "grad_norm": 1.2220418453216553, + "learning_rate": 3.2075846168946326e-08, + "loss": 1.0713, + "step": 4264 + }, + { + "epoch": 0.976531196336577, + "grad_norm": 1.7401745319366455, + "learning_rate": 3.148490945791838e-08, + "loss": 1.0271, + "step": 4265 + }, + { + "epoch": 0.9767601602747568, + "grad_norm": 1.2124825716018677, + "learning_rate": 3.089945833716912e-08, + "loss": 1.0562, + "step": 4266 + }, + { + "epoch": 0.9769891242129365, + "grad_norm": 1.3442974090576172, + "learning_rate": 3.03194931288664e-08, + "loss": 1.0586, + "step": 4267 + }, + { + "epoch": 0.9772180881511162, + "grad_norm": 1.2674909830093384, + "learning_rate": 2.9745014152161578e-08, + "loss": 0.9957, + "step": 4268 + }, + { + "epoch": 0.9774470520892959, + "grad_norm": 1.6897748708724976, + "learning_rate": 2.917602172318401e-08, + "loss": 0.9973, + "step": 4269 + }, + { + "epoch": 0.9776760160274757, + "grad_norm": 1.3805571794509888, + "learning_rate": 2.8612516155047674e-08, + "loss": 0.9844, + "step": 4270 + }, + { + "epoch": 0.9779049799656554, + "grad_norm": 1.2225171327590942, + "learning_rate": 2.8054497757842304e-08, + "loss": 1.034, + "step": 4271 + }, + { + "epoch": 0.9781339439038351, + "grad_norm": 1.321537733078003, + "learning_rate": 2.7501966838642258e-08, + "loss": 1.0053, + "step": 4272 + }, + { + "epoch": 0.9783629078420149, + "grad_norm": 1.2543835639953613, + "learning_rate": 2.695492370149988e-08, + "loss": 1.0385, + "step": 4273 + }, + { + "epoch": 0.9785918717801946, + "grad_norm": 1.189489483833313, + "learning_rate": 2.641336864744992e-08, + "loss": 0.9956, + "step": 4274 + }, + { + "epoch": 0.9788208357183743, + "grad_norm": 1.2529815435409546, + "learning_rate": 2.5877301974503998e-08, + "loss": 1.0218, + "step": 4275 + }, + { + "epoch": 0.9790497996565541, + "grad_norm": 1.217818260192871, + "learning_rate": 2.534672397765614e-08, + "loss": 0.9919, + "step": 4276 + }, + { + "epoch": 0.9792787635947339, + "grad_norm": 1.363022804260254, + "learning_rate": 2.482163494887724e-08, + "loss": 1.0145, + "step": 4277 + }, + { + "epoch": 0.9795077275329136, + "grad_norm": 1.5168522596359253, + "learning_rate": 2.430203517712171e-08, + "loss": 1.0786, + "step": 4278 + }, + { + "epoch": 0.9797366914710933, + "grad_norm": 1.2632395029067993, + "learning_rate": 2.3787924948319718e-08, + "loss": 1.023, + "step": 4279 + }, + { + "epoch": 0.979965655409273, + "grad_norm": 1.2734144926071167, + "learning_rate": 2.327930454538274e-08, + "loss": 1.0511, + "step": 4280 + }, + { + "epoch": 0.9801946193474528, + "grad_norm": 1.4147753715515137, + "learning_rate": 2.2776174248199114e-08, + "loss": 0.9804, + "step": 4281 + }, + { + "epoch": 0.9804235832856325, + "grad_norm": 1.5188604593276978, + "learning_rate": 2.227853433363736e-08, + "loss": 1.0049, + "step": 4282 + }, + { + "epoch": 0.9806525472238122, + "grad_norm": 1.0858154296875, + "learning_rate": 2.1786385075545093e-08, + "loss": 1.0509, + "step": 4283 + }, + { + "epoch": 0.980881511161992, + "grad_norm": 1.3833143711090088, + "learning_rate": 2.1299726744747896e-08, + "loss": 0.9949, + "step": 4284 + }, + { + "epoch": 0.9811104751001717, + "grad_norm": 1.2171403169631958, + "learning_rate": 2.0818559609049327e-08, + "loss": 1.04, + "step": 4285 + }, + { + "epoch": 0.9813394390383514, + "grad_norm": 1.468177080154419, + "learning_rate": 2.0342883933232027e-08, + "loss": 0.985, + "step": 4286 + }, + { + "epoch": 0.9815684029765313, + "grad_norm": 1.3929141759872437, + "learning_rate": 1.987269997905661e-08, + "loss": 0.9983, + "step": 4287 + }, + { + "epoch": 0.981797366914711, + "grad_norm": 1.1607084274291992, + "learning_rate": 1.9408008005260548e-08, + "loss": 1.0426, + "step": 4288 + }, + { + "epoch": 0.9820263308528907, + "grad_norm": 1.3756016492843628, + "learning_rate": 1.8948808267560405e-08, + "loss": 1.0324, + "step": 4289 + }, + { + "epoch": 0.9822552947910704, + "grad_norm": 1.2399051189422607, + "learning_rate": 1.8495101018649598e-08, + "loss": 0.994, + "step": 4290 + }, + { + "epoch": 0.9824842587292502, + "grad_norm": 1.5608595609664917, + "learning_rate": 1.8046886508200633e-08, + "loss": 1.0074, + "step": 4291 + }, + { + "epoch": 0.9827132226674299, + "grad_norm": 1.3537348508834839, + "learning_rate": 1.7604164982860662e-08, + "loss": 1.0612, + "step": 4292 + }, + { + "epoch": 0.9829421866056096, + "grad_norm": 1.5062490701675415, + "learning_rate": 1.716693668625591e-08, + "loss": 1.0275, + "step": 4293 + }, + { + "epoch": 0.9831711505437893, + "grad_norm": 1.5584570169448853, + "learning_rate": 1.673520185899058e-08, + "loss": 1.0148, + "step": 4294 + }, + { + "epoch": 0.9834001144819691, + "grad_norm": 1.1560457944869995, + "learning_rate": 1.630896073864352e-08, + "loss": 0.986, + "step": 4295 + }, + { + "epoch": 0.9836290784201488, + "grad_norm": 1.2290360927581787, + "learning_rate": 1.5888213559771548e-08, + "loss": 1.0749, + "step": 4296 + }, + { + "epoch": 0.9838580423583285, + "grad_norm": 1.4205414056777954, + "learning_rate": 1.5472960553909456e-08, + "loss": 0.9877, + "step": 4297 + }, + { + "epoch": 0.9840870062965082, + "grad_norm": 1.6074973344802856, + "learning_rate": 1.5063201949566674e-08, + "loss": 1.0362, + "step": 4298 + }, + { + "epoch": 0.9843159702346881, + "grad_norm": 1.3286057710647583, + "learning_rate": 1.4658937972230613e-08, + "loss": 0.9793, + "step": 4299 + }, + { + "epoch": 0.9845449341728678, + "grad_norm": 1.3167859315872192, + "learning_rate": 1.426016884436332e-08, + "loss": 1.039, + "step": 4300 + }, + { + "epoch": 0.9847738981110475, + "grad_norm": 1.1874357461929321, + "learning_rate": 1.3866894785404816e-08, + "loss": 0.9651, + "step": 4301 + }, + { + "epoch": 0.9850028620492273, + "grad_norm": 1.4577040672302246, + "learning_rate": 1.3479116011769766e-08, + "loss": 1.0622, + "step": 4302 + }, + { + "epoch": 0.985231825987407, + "grad_norm": 1.3572076559066772, + "learning_rate": 1.3096832736850806e-08, + "loss": 1.0132, + "step": 4303 + }, + { + "epoch": 0.9854607899255867, + "grad_norm": 1.22506844997406, + "learning_rate": 1.2720045171014106e-08, + "loss": 1.0378, + "step": 4304 + }, + { + "epoch": 0.9856897538637664, + "grad_norm": 1.5919448137283325, + "learning_rate": 1.2348753521602696e-08, + "loss": 1.0365, + "step": 4305 + }, + { + "epoch": 0.9859187178019462, + "grad_norm": 1.5855189561843872, + "learning_rate": 1.1982957992936472e-08, + "loss": 1.0479, + "step": 4306 + }, + { + "epoch": 0.9861476817401259, + "grad_norm": 1.5232701301574707, + "learning_rate": 1.162265878630886e-08, + "loss": 1.0253, + "step": 4307 + }, + { + "epoch": 0.9863766456783056, + "grad_norm": 1.51606023311615, + "learning_rate": 1.1267856099989039e-08, + "loss": 0.9984, + "step": 4308 + }, + { + "epoch": 0.9866056096164854, + "grad_norm": 1.5284004211425781, + "learning_rate": 1.0918550129223049e-08, + "loss": 1.0555, + "step": 4309 + }, + { + "epoch": 0.9868345735546652, + "grad_norm": 1.2388665676116943, + "learning_rate": 1.0574741066230465e-08, + "loss": 0.9951, + "step": 4310 + }, + { + "epoch": 0.9870635374928449, + "grad_norm": 1.161252737045288, + "learning_rate": 1.0236429100206612e-08, + "loss": 1.0634, + "step": 4311 + }, + { + "epoch": 0.9872925014310246, + "grad_norm": 1.1357179880142212, + "learning_rate": 9.903614417320351e-09, + "loss": 1.0417, + "step": 4312 + }, + { + "epoch": 0.9875214653692044, + "grad_norm": 1.3556911945343018, + "learning_rate": 9.57629720071962e-09, + "loss": 1.0326, + "step": 4313 + }, + { + "epoch": 0.9877504293073841, + "grad_norm": 1.2056077718734741, + "learning_rate": 9.254477630521452e-09, + "loss": 0.945, + "step": 4314 + }, + { + "epoch": 0.9879793932455638, + "grad_norm": 1.208575963973999, + "learning_rate": 8.938155883823074e-09, + "loss": 1.0396, + "step": 4315 + }, + { + "epoch": 0.9882083571837436, + "grad_norm": 1.3659850358963013, + "learning_rate": 8.627332134690802e-09, + "loss": 1.0029, + "step": 4316 + }, + { + "epoch": 0.9884373211219233, + "grad_norm": 1.363781213760376, + "learning_rate": 8.322006554171147e-09, + "loss": 1.0407, + "step": 4317 + }, + { + "epoch": 0.988666285060103, + "grad_norm": 1.3712806701660156, + "learning_rate": 8.02217931028082e-09, + "loss": 0.9999, + "step": 4318 + }, + { + "epoch": 0.9888952489982827, + "grad_norm": 1.421749234199524, + "learning_rate": 7.727850568012286e-09, + "loss": 1.0482, + "step": 4319 + }, + { + "epoch": 0.9891242129364625, + "grad_norm": 1.1671499013900757, + "learning_rate": 7.439020489332649e-09, + "loss": 1.044, + "step": 4320 + }, + { + "epoch": 0.9893531768746422, + "grad_norm": 1.8170863389968872, + "learning_rate": 7.1556892331814394e-09, + "loss": 0.9375, + "step": 4321 + }, + { + "epoch": 0.989582140812822, + "grad_norm": 1.3475590944290161, + "learning_rate": 6.8778569554750484e-09, + "loss": 0.9789, + "step": 4322 + }, + { + "epoch": 0.9898111047510018, + "grad_norm": 1.19892418384552, + "learning_rate": 6.605523809102288e-09, + "loss": 1.0129, + "step": 4323 + }, + { + "epoch": 0.9900400686891815, + "grad_norm": 1.4870357513427734, + "learning_rate": 6.3386899439243925e-09, + "loss": 1.0412, + "step": 4324 + }, + { + "epoch": 0.9902690326273612, + "grad_norm": 1.529685378074646, + "learning_rate": 6.0773555067783485e-09, + "loss": 1.0188, + "step": 4325 + }, + { + "epoch": 0.9904979965655409, + "grad_norm": 1.2781044244766235, + "learning_rate": 5.8215206414746764e-09, + "loss": 1.0064, + "step": 4326 + }, + { + "epoch": 0.9907269605037207, + "grad_norm": 1.5567418336868286, + "learning_rate": 5.571185488797426e-09, + "loss": 1.0069, + "step": 4327 + }, + { + "epoch": 0.9909559244419004, + "grad_norm": 1.4737282991409302, + "learning_rate": 5.326350186503071e-09, + "loss": 1.0737, + "step": 4328 + }, + { + "epoch": 0.9911848883800801, + "grad_norm": 1.3786917924880981, + "learning_rate": 5.087014869322726e-09, + "loss": 0.991, + "step": 4329 + }, + { + "epoch": 0.9914138523182598, + "grad_norm": 1.3202197551727295, + "learning_rate": 4.853179668959928e-09, + "loss": 1.0897, + "step": 4330 + }, + { + "epoch": 0.9916428162564396, + "grad_norm": 1.275343656539917, + "learning_rate": 4.6248447140939675e-09, + "loss": 1.0553, + "step": 4331 + }, + { + "epoch": 0.9918717801946193, + "grad_norm": 1.365133285522461, + "learning_rate": 4.4020101303743345e-09, + "loss": 1.024, + "step": 4332 + }, + { + "epoch": 0.9921007441327991, + "grad_norm": 1.2677979469299316, + "learning_rate": 4.184676040426272e-09, + "loss": 1.0297, + "step": 4333 + }, + { + "epoch": 0.9923297080709789, + "grad_norm": 1.360640525817871, + "learning_rate": 3.972842563845225e-09, + "loss": 1.0112, + "step": 4334 + }, + { + "epoch": 0.9925586720091586, + "grad_norm": 1.6422892808914185, + "learning_rate": 3.7665098172023905e-09, + "loss": 1.023, + "step": 4335 + }, + { + "epoch": 0.9927876359473383, + "grad_norm": 1.0084567070007324, + "learning_rate": 3.5656779140402777e-09, + "loss": 0.974, + "step": 4336 + }, + { + "epoch": 0.993016599885518, + "grad_norm": 1.3576377630233765, + "learning_rate": 3.3703469648760367e-09, + "loss": 0.9808, + "step": 4337 + }, + { + "epoch": 0.9932455638236978, + "grad_norm": 1.804397463798523, + "learning_rate": 3.1805170771970207e-09, + "loss": 1.0415, + "step": 4338 + }, + { + "epoch": 0.9934745277618775, + "grad_norm": 1.3295351266860962, + "learning_rate": 2.9961883554674443e-09, + "loss": 1.0812, + "step": 4339 + }, + { + "epoch": 0.9937034917000572, + "grad_norm": 2.1188831329345703, + "learning_rate": 2.8173609011195035e-09, + "loss": 0.964, + "step": 4340 + }, + { + "epoch": 0.993932455638237, + "grad_norm": 1.1726047992706299, + "learning_rate": 2.6440348125622574e-09, + "loss": 0.9524, + "step": 4341 + }, + { + "epoch": 0.9941614195764167, + "grad_norm": 1.23749840259552, + "learning_rate": 2.476210185173855e-09, + "loss": 0.979, + "step": 4342 + }, + { + "epoch": 0.9943903835145964, + "grad_norm": 1.6526784896850586, + "learning_rate": 2.3138871113081997e-09, + "loss": 1.0302, + "step": 4343 + }, + { + "epoch": 0.9946193474527761, + "grad_norm": 1.434206485748291, + "learning_rate": 2.1570656802905042e-09, + "loss": 1.0205, + "step": 4344 + }, + { + "epoch": 0.994848311390956, + "grad_norm": 1.5796558856964111, + "learning_rate": 2.0057459784161848e-09, + "loss": 1.0317, + "step": 4345 + }, + { + "epoch": 0.9950772753291357, + "grad_norm": 1.5953503847122192, + "learning_rate": 1.859928088957519e-09, + "loss": 1.0114, + "step": 4346 + }, + { + "epoch": 0.9953062392673154, + "grad_norm": 1.3271796703338623, + "learning_rate": 1.7196120921558757e-09, + "loss": 1.0554, + "step": 4347 + }, + { + "epoch": 0.9955352032054952, + "grad_norm": 3.4375476837158203, + "learning_rate": 1.5847980652261563e-09, + "loss": 1.0024, + "step": 4348 + }, + { + "epoch": 0.9957641671436749, + "grad_norm": 1.3996182680130005, + "learning_rate": 1.4554860823556838e-09, + "loss": 1.0361, + "step": 4349 + }, + { + "epoch": 0.9959931310818546, + "grad_norm": 1.1781070232391357, + "learning_rate": 1.3316762147030925e-09, + "loss": 1.0539, + "step": 4350 + }, + { + "epoch": 0.9962220950200343, + "grad_norm": 1.5741220712661743, + "learning_rate": 1.213368530399439e-09, + "loss": 0.9569, + "step": 4351 + }, + { + "epoch": 0.9964510589582141, + "grad_norm": 1.3308751583099365, + "learning_rate": 1.100563094550422e-09, + "loss": 0.9695, + "step": 4352 + }, + { + "epoch": 0.9966800228963938, + "grad_norm": 1.3270615339279175, + "learning_rate": 9.932599692297207e-10, + "loss": 1.0147, + "step": 4353 + }, + { + "epoch": 0.9969089868345735, + "grad_norm": 1.6848193407058716, + "learning_rate": 8.914592134867672e-10, + "loss": 0.9697, + "step": 4354 + }, + { + "epoch": 0.9971379507727532, + "grad_norm": 1.465806245803833, + "learning_rate": 7.95160883341195e-10, + "loss": 0.9577, + "step": 4355 + }, + { + "epoch": 0.9973669147109331, + "grad_norm": 1.743776798248291, + "learning_rate": 7.043650317850592e-10, + "loss": 1.0717, + "step": 4356 + }, + { + "epoch": 0.9975958786491128, + "grad_norm": 1.4670666456222534, + "learning_rate": 6.190717087828368e-10, + "loss": 0.9676, + "step": 4357 + }, + { + "epoch": 0.9978248425872925, + "grad_norm": 1.3044846057891846, + "learning_rate": 5.392809612703165e-10, + "loss": 1.0857, + "step": 4358 + }, + { + "epoch": 0.9980538065254723, + "grad_norm": 1.9425315856933594, + "learning_rate": 4.649928331557085e-10, + "loss": 1.061, + "step": 4359 + }, + { + "epoch": 0.998282770463652, + "grad_norm": 1.157950520515442, + "learning_rate": 3.9620736532075543e-10, + "loss": 1.0149, + "step": 4360 + }, + { + "epoch": 0.9985117344018317, + "grad_norm": 1.157373309135437, + "learning_rate": 3.3292459561518055e-10, + "loss": 1.0196, + "step": 4361 + }, + { + "epoch": 0.9987406983400114, + "grad_norm": 1.302017331123352, + "learning_rate": 2.7514455886334945e-10, + "loss": 1.0031, + "step": 4362 + }, + { + "epoch": 0.9989696622781912, + "grad_norm": 1.2691115140914917, + "learning_rate": 2.2286728686315984e-10, + "loss": 1.0935, + "step": 4363 + }, + { + "epoch": 0.9991986262163709, + "grad_norm": 1.2917944192886353, + "learning_rate": 1.7609280838049026e-10, + "loss": 1.0505, + "step": 4364 + }, + { + "epoch": 0.9994275901545506, + "grad_norm": 2.0502283573150635, + "learning_rate": 1.3482114915475132e-10, + "loss": 1.0274, + "step": 4365 + }, + { + "epoch": 0.9996565540927304, + "grad_norm": 1.4327635765075684, + "learning_rate": 9.905233189888563e-11, + "loss": 1.0067, + "step": 4366 + }, + { + "epoch": 0.9998855180309101, + "grad_norm": 1.210883617401123, + "learning_rate": 6.87863762938168e-11, + "loss": 1.04, + "step": 4367 + }, + { + "epoch": 0.9998855180309101, + "step": 4367, + "total_flos": 2.9591670167426826e+18, + "train_loss": 1.0860715442400435, + "train_runtime": 124901.2572, + "train_samples_per_second": 4.476, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1.0, + "max_steps": 4367, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "total_flos": 2.9591670167426826e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}