{ "best_metric": 1.8095293045043945, "best_model_checkpoint": "miner_id_24/checkpoint-3000", "epoch": 2.0003243067942273, "eval_steps": 100, "global_step": 3084, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006486135884546781, "grad_norm": 3.839925765991211, "learning_rate": 2e-05, "loss": 5.3671, "step": 1 }, { "epoch": 0.0006486135884546781, "eval_loss": 5.341527462005615, "eval_runtime": 35.046, "eval_samples_per_second": 58.666, "eval_steps_per_second": 14.666, "step": 1 }, { "epoch": 0.0012972271769093563, "grad_norm": 3.8855299949645996, "learning_rate": 4e-05, "loss": 5.2735, "step": 2 }, { "epoch": 0.0019458407653640344, "grad_norm": 3.7578446865081787, "learning_rate": 6e-05, "loss": 4.8599, "step": 3 }, { "epoch": 0.0025944543538187126, "grad_norm": 2.840263843536377, "learning_rate": 8e-05, "loss": 4.3011, "step": 4 }, { "epoch": 0.0032430679422733905, "grad_norm": 3.2963178157806396, "learning_rate": 0.0001, "loss": 4.7049, "step": 5 }, { "epoch": 0.003891681530728069, "grad_norm": 2.3765435218811035, "learning_rate": 0.00012, "loss": 4.4563, "step": 6 }, { "epoch": 0.004540295119182747, "grad_norm": 3.072453022003174, "learning_rate": 0.00014, "loss": 4.4638, "step": 7 }, { "epoch": 0.005188908707637425, "grad_norm": 3.7620046138763428, "learning_rate": 0.00016, "loss": 4.8635, "step": 8 }, { "epoch": 0.0058375222960921035, "grad_norm": 2.274141311645508, "learning_rate": 0.00018, "loss": 4.0879, "step": 9 }, { "epoch": 0.006486135884546781, "grad_norm": 2.1749279499053955, "learning_rate": 0.0002, "loss": 4.0116, "step": 10 }, { "epoch": 0.007134749473001459, "grad_norm": 2.7090888023376465, "learning_rate": 0.0001999999477769797, "loss": 4.1352, "step": 11 }, { "epoch": 0.007783363061456138, "grad_norm": 2.8192594051361084, "learning_rate": 0.00019999979110797331, "loss": 3.7506, "step": 12 }, { "epoch": 0.008431976649910815, "grad_norm": 2.1979215145111084, "learning_rate": 0.00019999952999314444, "loss": 4.0342, "step": 13 }, { "epoch": 0.009080590238365494, "grad_norm": 1.9726805686950684, "learning_rate": 0.0001999991644327659, "loss": 3.6903, "step": 14 }, { "epoch": 0.009729203826820172, "grad_norm": 2.4161455631256104, "learning_rate": 0.0001999986944272194, "loss": 3.2464, "step": 15 }, { "epoch": 0.01037781741527485, "grad_norm": 1.8676812648773193, "learning_rate": 0.00019999811997699593, "loss": 3.7741, "step": 16 }, { "epoch": 0.011026431003729529, "grad_norm": 1.7903848886489868, "learning_rate": 0.00019999744108269542, "loss": 3.7455, "step": 17 }, { "epoch": 0.011675044592184207, "grad_norm": 2.043623924255371, "learning_rate": 0.00019999665774502696, "loss": 3.8827, "step": 18 }, { "epoch": 0.012323658180638884, "grad_norm": 1.8445640802383423, "learning_rate": 0.00019999576996480872, "loss": 3.9371, "step": 19 }, { "epoch": 0.012972271769093562, "grad_norm": 1.5927119255065918, "learning_rate": 0.000199994777742968, "loss": 3.5422, "step": 20 }, { "epoch": 0.01362088535754824, "grad_norm": 1.3268831968307495, "learning_rate": 0.0001999936810805411, "loss": 3.286, "step": 21 }, { "epoch": 0.014269498946002919, "grad_norm": 1.617830514907837, "learning_rate": 0.0001999924799786734, "loss": 3.3049, "step": 22 }, { "epoch": 0.014918112534457597, "grad_norm": 1.592024803161621, "learning_rate": 0.00019999117443861942, "loss": 3.3464, "step": 23 }, { "epoch": 0.015566726122912275, "grad_norm": 1.5516711473464966, "learning_rate": 0.00019998976446174277, "loss": 3.399, "step": 24 }, { "epoch": 0.016215339711366954, "grad_norm": 1.7870945930480957, "learning_rate": 0.00019998825004951612, "loss": 3.5775, "step": 25 }, { "epoch": 0.01686395329982163, "grad_norm": 1.6130069494247437, "learning_rate": 0.00019998663120352118, "loss": 3.6604, "step": 26 }, { "epoch": 0.01751256688827631, "grad_norm": 1.4511877298355103, "learning_rate": 0.00019998490792544883, "loss": 3.243, "step": 27 }, { "epoch": 0.018161180476730987, "grad_norm": 1.6252033710479736, "learning_rate": 0.0001999830802170989, "loss": 3.6021, "step": 28 }, { "epoch": 0.018809794065185667, "grad_norm": 1.7277567386627197, "learning_rate": 0.00019998114808038043, "loss": 3.5944, "step": 29 }, { "epoch": 0.019458407653640344, "grad_norm": 1.6731386184692383, "learning_rate": 0.00019997911151731134, "loss": 3.3969, "step": 30 }, { "epoch": 0.02010702124209502, "grad_norm": 1.6243277788162231, "learning_rate": 0.00019997697053001886, "loss": 3.035, "step": 31 }, { "epoch": 0.0207556348305497, "grad_norm": 1.773876667022705, "learning_rate": 0.00019997472512073912, "loss": 3.6559, "step": 32 }, { "epoch": 0.021404248419004377, "grad_norm": 1.47771155834198, "learning_rate": 0.00019997237529181737, "loss": 3.103, "step": 33 }, { "epoch": 0.022052862007459057, "grad_norm": 1.3901262283325195, "learning_rate": 0.0001999699210457079, "loss": 2.8133, "step": 34 }, { "epoch": 0.022701475595913734, "grad_norm": 1.6069719791412354, "learning_rate": 0.00019996736238497406, "loss": 3.3517, "step": 35 }, { "epoch": 0.023350089184368414, "grad_norm": 1.5524790287017822, "learning_rate": 0.0001999646993122883, "loss": 3.3211, "step": 36 }, { "epoch": 0.02399870277282309, "grad_norm": 1.6193372011184692, "learning_rate": 0.0001999619318304321, "loss": 3.5321, "step": 37 }, { "epoch": 0.024647316361277767, "grad_norm": 1.6295925378799438, "learning_rate": 0.00019995905994229593, "loss": 3.4257, "step": 38 }, { "epoch": 0.025295929949732447, "grad_norm": 1.9165164232254028, "learning_rate": 0.00019995608365087946, "loss": 3.9521, "step": 39 }, { "epoch": 0.025944543538187124, "grad_norm": 1.5593171119689941, "learning_rate": 0.0001999530029592912, "loss": 3.4656, "step": 40 }, { "epoch": 0.026593157126641804, "grad_norm": 1.691826581954956, "learning_rate": 0.0001999498178707489, "loss": 3.2846, "step": 41 }, { "epoch": 0.02724177071509648, "grad_norm": 1.709882140159607, "learning_rate": 0.00019994652838857917, "loss": 3.4973, "step": 42 }, { "epoch": 0.02789038430355116, "grad_norm": 1.5011334419250488, "learning_rate": 0.00019994313451621783, "loss": 3.0221, "step": 43 }, { "epoch": 0.028538997892005837, "grad_norm": 1.3794457912445068, "learning_rate": 0.0001999396362572096, "loss": 3.5346, "step": 44 }, { "epoch": 0.029187611480460517, "grad_norm": 1.6306489706039429, "learning_rate": 0.00019993603361520828, "loss": 3.4094, "step": 45 }, { "epoch": 0.029836225068915194, "grad_norm": 1.2462743520736694, "learning_rate": 0.00019993232659397666, "loss": 3.1227, "step": 46 }, { "epoch": 0.03048483865736987, "grad_norm": 1.4848414659500122, "learning_rate": 0.00019992851519738664, "loss": 3.2734, "step": 47 }, { "epoch": 0.03113345224582455, "grad_norm": 1.7278729677200317, "learning_rate": 0.00019992459942941906, "loss": 3.3184, "step": 48 }, { "epoch": 0.03178206583427923, "grad_norm": 1.4307690858840942, "learning_rate": 0.00019992057929416371, "loss": 3.5227, "step": 49 }, { "epoch": 0.03243067942273391, "grad_norm": 1.530920147895813, "learning_rate": 0.00019991645479581956, "loss": 3.1131, "step": 50 }, { "epoch": 0.03307929301118859, "grad_norm": 1.5259414911270142, "learning_rate": 0.00019991222593869444, "loss": 3.7062, "step": 51 }, { "epoch": 0.03372790659964326, "grad_norm": 1.562098503112793, "learning_rate": 0.0001999078927272052, "loss": 3.3501, "step": 52 }, { "epoch": 0.03437652018809794, "grad_norm": 1.539969801902771, "learning_rate": 0.00019990345516587775, "loss": 3.7154, "step": 53 }, { "epoch": 0.03502513377655262, "grad_norm": 1.8281248807907104, "learning_rate": 0.00019989891325934692, "loss": 3.5978, "step": 54 }, { "epoch": 0.035673747365007294, "grad_norm": 1.5411529541015625, "learning_rate": 0.00019989426701235653, "loss": 3.3103, "step": 55 }, { "epoch": 0.036322360953461974, "grad_norm": 1.3391532897949219, "learning_rate": 0.00019988951642975947, "loss": 2.9469, "step": 56 }, { "epoch": 0.036970974541916654, "grad_norm": 1.5546962022781372, "learning_rate": 0.00019988466151651748, "loss": 3.5001, "step": 57 }, { "epoch": 0.037619588130371334, "grad_norm": 1.673667550086975, "learning_rate": 0.00019987970227770135, "loss": 3.5981, "step": 58 }, { "epoch": 0.03826820171882601, "grad_norm": 1.5357543230056763, "learning_rate": 0.00019987463871849078, "loss": 3.2209, "step": 59 }, { "epoch": 0.03891681530728069, "grad_norm": 1.5452758073806763, "learning_rate": 0.0001998694708441745, "loss": 2.9202, "step": 60 }, { "epoch": 0.03956542889573537, "grad_norm": 1.5281524658203125, "learning_rate": 0.00019986419866015013, "loss": 3.1143, "step": 61 }, { "epoch": 0.04021404248419004, "grad_norm": 1.6060402393341064, "learning_rate": 0.00019985882217192423, "loss": 3.5452, "step": 62 }, { "epoch": 0.04086265607264472, "grad_norm": 1.6011849641799927, "learning_rate": 0.00019985334138511237, "loss": 3.5304, "step": 63 }, { "epoch": 0.0415112696610994, "grad_norm": 1.8889275789260864, "learning_rate": 0.00019984775630543902, "loss": 3.2174, "step": 64 }, { "epoch": 0.04215988324955408, "grad_norm": 1.7960286140441895, "learning_rate": 0.00019984206693873753, "loss": 3.6113, "step": 65 }, { "epoch": 0.042808496838008754, "grad_norm": 1.4898135662078857, "learning_rate": 0.00019983627329095028, "loss": 3.2973, "step": 66 }, { "epoch": 0.043457110426463434, "grad_norm": 1.5716447830200195, "learning_rate": 0.00019983037536812842, "loss": 3.011, "step": 67 }, { "epoch": 0.044105724014918114, "grad_norm": 1.4708287715911865, "learning_rate": 0.00019982437317643217, "loss": 3.3225, "step": 68 }, { "epoch": 0.04475433760337279, "grad_norm": 1.4985527992248535, "learning_rate": 0.00019981826672213054, "loss": 3.1602, "step": 69 }, { "epoch": 0.04540295119182747, "grad_norm": 1.4717998504638672, "learning_rate": 0.0001998120560116015, "loss": 3.2304, "step": 70 }, { "epoch": 0.04605156478028215, "grad_norm": 1.9946448802947998, "learning_rate": 0.00019980574105133187, "loss": 3.3665, "step": 71 }, { "epoch": 0.04670017836873683, "grad_norm": 1.5488526821136475, "learning_rate": 0.00019979932184791742, "loss": 3.0899, "step": 72 }, { "epoch": 0.0473487919571915, "grad_norm": 1.3588461875915527, "learning_rate": 0.00019979279840806271, "loss": 3.0037, "step": 73 }, { "epoch": 0.04799740554564618, "grad_norm": 1.368888258934021, "learning_rate": 0.00019978617073858123, "loss": 3.1688, "step": 74 }, { "epoch": 0.04864601913410086, "grad_norm": 1.7766389846801758, "learning_rate": 0.00019977943884639534, "loss": 3.0139, "step": 75 }, { "epoch": 0.049294632722555534, "grad_norm": 1.4920904636383057, "learning_rate": 0.0001997726027385362, "loss": 3.2154, "step": 76 }, { "epoch": 0.049943246311010214, "grad_norm": 1.707010269165039, "learning_rate": 0.00019976566242214388, "loss": 3.3891, "step": 77 }, { "epoch": 0.050591859899464894, "grad_norm": 1.7228715419769287, "learning_rate": 0.00019975861790446722, "loss": 3.4335, "step": 78 }, { "epoch": 0.051240473487919574, "grad_norm": 1.423966646194458, "learning_rate": 0.000199751469192864, "loss": 3.0397, "step": 79 }, { "epoch": 0.05188908707637425, "grad_norm": 1.5038292407989502, "learning_rate": 0.00019974421629480075, "loss": 3.0318, "step": 80 }, { "epoch": 0.05253770066482893, "grad_norm": 1.4808934926986694, "learning_rate": 0.00019973685921785282, "loss": 3.1422, "step": 81 }, { "epoch": 0.05318631425328361, "grad_norm": 1.4777363538742065, "learning_rate": 0.00019972939796970436, "loss": 3.3775, "step": 82 }, { "epoch": 0.05383492784173829, "grad_norm": 1.7979999780654907, "learning_rate": 0.00019972183255814843, "loss": 3.3686, "step": 83 }, { "epoch": 0.05448354143019296, "grad_norm": 1.5056769847869873, "learning_rate": 0.00019971416299108672, "loss": 3.2013, "step": 84 }, { "epoch": 0.05513215501864764, "grad_norm": 1.389157772064209, "learning_rate": 0.0001997063892765298, "loss": 3.0217, "step": 85 }, { "epoch": 0.05578076860710232, "grad_norm": 1.7881611585617065, "learning_rate": 0.00019969851142259706, "loss": 3.1829, "step": 86 }, { "epoch": 0.056429382195556994, "grad_norm": 1.3970147371292114, "learning_rate": 0.0001996905294375166, "loss": 3.059, "step": 87 }, { "epoch": 0.057077995784011674, "grad_norm": 1.655731439590454, "learning_rate": 0.0001996824433296252, "loss": 3.3041, "step": 88 }, { "epoch": 0.057726609372466355, "grad_norm": 1.7357268333435059, "learning_rate": 0.0001996742531073686, "loss": 3.4154, "step": 89 }, { "epoch": 0.058375222960921035, "grad_norm": 1.6906143426895142, "learning_rate": 0.00019966595877930106, "loss": 3.186, "step": 90 }, { "epoch": 0.05902383654937571, "grad_norm": 1.3434598445892334, "learning_rate": 0.00019965756035408573, "loss": 3.0845, "step": 91 }, { "epoch": 0.05967245013783039, "grad_norm": 1.5761163234710693, "learning_rate": 0.00019964905784049442, "loss": 3.2079, "step": 92 }, { "epoch": 0.06032106372628507, "grad_norm": 1.366127371788025, "learning_rate": 0.00019964045124740772, "loss": 3.0052, "step": 93 }, { "epoch": 0.06096967731473974, "grad_norm": 1.5109351873397827, "learning_rate": 0.0001996317405838148, "loss": 3.0047, "step": 94 }, { "epoch": 0.06161829090319442, "grad_norm": 1.2576245069503784, "learning_rate": 0.0001996229258588136, "loss": 3.1236, "step": 95 }, { "epoch": 0.0622669044916491, "grad_norm": 1.4992951154708862, "learning_rate": 0.0001996140070816108, "loss": 3.1032, "step": 96 }, { "epoch": 0.06291551808010377, "grad_norm": 1.4879511594772339, "learning_rate": 0.0001996049842615217, "loss": 3.2732, "step": 97 }, { "epoch": 0.06356413166855845, "grad_norm": 1.2676115036010742, "learning_rate": 0.00019959585740797028, "loss": 3.0975, "step": 98 }, { "epoch": 0.06421274525701313, "grad_norm": 1.826397180557251, "learning_rate": 0.00019958662653048913, "loss": 3.3263, "step": 99 }, { "epoch": 0.06486135884546781, "grad_norm": 1.3498917818069458, "learning_rate": 0.00019957729163871962, "loss": 3.1325, "step": 100 }, { "epoch": 0.06486135884546781, "eval_loss": 3.3947603702545166, "eval_runtime": 35.0846, "eval_samples_per_second": 58.601, "eval_steps_per_second": 14.65, "step": 100 }, { "epoch": 0.0655099724339225, "grad_norm": 1.445410966873169, "learning_rate": 0.00019956785274241164, "loss": 3.236, "step": 101 }, { "epoch": 0.06615858602237717, "grad_norm": 1.6176825761795044, "learning_rate": 0.00019955830985142367, "loss": 3.0255, "step": 102 }, { "epoch": 0.06680719961083184, "grad_norm": 1.61312735080719, "learning_rate": 0.000199548662975723, "loss": 3.2001, "step": 103 }, { "epoch": 0.06745581319928652, "grad_norm": 1.36332368850708, "learning_rate": 0.00019953891212538534, "loss": 3.0994, "step": 104 }, { "epoch": 0.0681044267877412, "grad_norm": 1.7672650814056396, "learning_rate": 0.00019952905731059506, "loss": 3.2036, "step": 105 }, { "epoch": 0.06875304037619588, "grad_norm": 1.484741449356079, "learning_rate": 0.00019951909854164517, "loss": 3.0926, "step": 106 }, { "epoch": 0.06940165396465056, "grad_norm": 1.6508300304412842, "learning_rate": 0.00019950903582893718, "loss": 3.3921, "step": 107 }, { "epoch": 0.07005026755310524, "grad_norm": 1.5160471200942993, "learning_rate": 0.0001994988691829812, "loss": 3.2314, "step": 108 }, { "epoch": 0.07069888114155992, "grad_norm": 1.6546351909637451, "learning_rate": 0.00019948859861439587, "loss": 3.6566, "step": 109 }, { "epoch": 0.07134749473001459, "grad_norm": 1.401430368423462, "learning_rate": 0.00019947822413390843, "loss": 3.0595, "step": 110 }, { "epoch": 0.07199610831846927, "grad_norm": 1.4741144180297852, "learning_rate": 0.0001994677457523546, "loss": 2.5762, "step": 111 }, { "epoch": 0.07264472190692395, "grad_norm": 1.4336308240890503, "learning_rate": 0.0001994571634806786, "loss": 3.2864, "step": 112 }, { "epoch": 0.07329333549537863, "grad_norm": 1.3772560358047485, "learning_rate": 0.00019944647732993324, "loss": 2.9711, "step": 113 }, { "epoch": 0.07394194908383331, "grad_norm": 1.8636032342910767, "learning_rate": 0.0001994356873112798, "loss": 3.6032, "step": 114 }, { "epoch": 0.07459056267228799, "grad_norm": 1.5523511171340942, "learning_rate": 0.00019942479343598794, "loss": 2.9138, "step": 115 }, { "epoch": 0.07523917626074267, "grad_norm": 1.3805291652679443, "learning_rate": 0.00019941379571543596, "loss": 3.1352, "step": 116 }, { "epoch": 0.07588778984919733, "grad_norm": 1.7657650709152222, "learning_rate": 0.00019940269416111054, "loss": 3.6572, "step": 117 }, { "epoch": 0.07653640343765201, "grad_norm": 1.0355079174041748, "learning_rate": 0.00019939148878460677, "loss": 3.1429, "step": 118 }, { "epoch": 0.0771850170261067, "grad_norm": 2.0980677604675293, "learning_rate": 0.00019938017959762822, "loss": 3.6367, "step": 119 }, { "epoch": 0.07783363061456137, "grad_norm": 1.554826259613037, "learning_rate": 0.00019936876661198692, "loss": 3.0011, "step": 120 }, { "epoch": 0.07848224420301605, "grad_norm": 1.5516928434371948, "learning_rate": 0.0001993572498396033, "loss": 2.8849, "step": 121 }, { "epoch": 0.07913085779147074, "grad_norm": 1.8022054433822632, "learning_rate": 0.00019934562929250612, "loss": 3.3661, "step": 122 }, { "epoch": 0.07977947137992542, "grad_norm": 1.523295521736145, "learning_rate": 0.00019933390498283262, "loss": 3.37, "step": 123 }, { "epoch": 0.08042808496838008, "grad_norm": 1.5638176202774048, "learning_rate": 0.0001993220769228284, "loss": 3.0696, "step": 124 }, { "epoch": 0.08107669855683476, "grad_norm": 1.4551162719726562, "learning_rate": 0.00019931014512484732, "loss": 2.9773, "step": 125 }, { "epoch": 0.08172531214528944, "grad_norm": 1.8125625848770142, "learning_rate": 0.00019929810960135172, "loss": 3.5409, "step": 126 }, { "epoch": 0.08237392573374412, "grad_norm": 1.598616361618042, "learning_rate": 0.0001992859703649122, "loss": 3.2174, "step": 127 }, { "epoch": 0.0830225393221988, "grad_norm": 1.6178065538406372, "learning_rate": 0.00019927372742820779, "loss": 3.1763, "step": 128 }, { "epoch": 0.08367115291065348, "grad_norm": 1.5398484468460083, "learning_rate": 0.00019926138080402566, "loss": 3.174, "step": 129 }, { "epoch": 0.08431976649910816, "grad_norm": 1.6358362436294556, "learning_rate": 0.0001992489305052614, "loss": 3.3533, "step": 130 }, { "epoch": 0.08496838008756283, "grad_norm": 1.5181739330291748, "learning_rate": 0.00019923637654491888, "loss": 3.0561, "step": 131 }, { "epoch": 0.08561699367601751, "grad_norm": 1.4952844381332397, "learning_rate": 0.00019922371893611022, "loss": 2.8942, "step": 132 }, { "epoch": 0.08626560726447219, "grad_norm": 1.55109441280365, "learning_rate": 0.00019921095769205574, "loss": 2.9548, "step": 133 }, { "epoch": 0.08691422085292687, "grad_norm": 1.258230209350586, "learning_rate": 0.00019919809282608407, "loss": 2.6494, "step": 134 }, { "epoch": 0.08756283444138155, "grad_norm": 1.5716472864151, "learning_rate": 0.0001991851243516321, "loss": 3.0505, "step": 135 }, { "epoch": 0.08821144802983623, "grad_norm": 2.0663163661956787, "learning_rate": 0.00019917205228224481, "loss": 3.699, "step": 136 }, { "epoch": 0.08886006161829091, "grad_norm": 1.6701781749725342, "learning_rate": 0.00019915887663157555, "loss": 3.3952, "step": 137 }, { "epoch": 0.08950867520674557, "grad_norm": 1.5281134843826294, "learning_rate": 0.0001991455974133857, "loss": 3.1934, "step": 138 }, { "epoch": 0.09015728879520025, "grad_norm": 1.577763557434082, "learning_rate": 0.00019913221464154488, "loss": 3.266, "step": 139 }, { "epoch": 0.09080590238365494, "grad_norm": 1.510722041130066, "learning_rate": 0.0001991187283300309, "loss": 3.0935, "step": 140 }, { "epoch": 0.09145451597210962, "grad_norm": 1.5357921123504639, "learning_rate": 0.00019910513849292962, "loss": 3.1124, "step": 141 }, { "epoch": 0.0921031295605643, "grad_norm": 1.441179633140564, "learning_rate": 0.00019909144514443518, "loss": 3.0029, "step": 142 }, { "epoch": 0.09275174314901898, "grad_norm": 1.5612683296203613, "learning_rate": 0.00019907764829884964, "loss": 2.9997, "step": 143 }, { "epoch": 0.09340035673747366, "grad_norm": 1.4757554531097412, "learning_rate": 0.00019906374797058334, "loss": 3.0236, "step": 144 }, { "epoch": 0.09404897032592832, "grad_norm": 1.4126044511795044, "learning_rate": 0.00019904974417415456, "loss": 2.7912, "step": 145 }, { "epoch": 0.094697583914383, "grad_norm": 1.751099944114685, "learning_rate": 0.00019903563692418976, "loss": 3.2925, "step": 146 }, { "epoch": 0.09534619750283768, "grad_norm": 1.6099004745483398, "learning_rate": 0.00019902142623542336, "loss": 3.0836, "step": 147 }, { "epoch": 0.09599481109129236, "grad_norm": 1.4375733137130737, "learning_rate": 0.0001990071121226979, "loss": 3.0289, "step": 148 }, { "epoch": 0.09664342467974704, "grad_norm": 1.8070977926254272, "learning_rate": 0.0001989926946009639, "loss": 3.1598, "step": 149 }, { "epoch": 0.09729203826820172, "grad_norm": 1.9119011163711548, "learning_rate": 0.00019897817368527985, "loss": 3.484, "step": 150 }, { "epoch": 0.0979406518566564, "grad_norm": 1.6774824857711792, "learning_rate": 0.00019896354939081233, "loss": 3.3953, "step": 151 }, { "epoch": 0.09858926544511107, "grad_norm": 1.5909019708633423, "learning_rate": 0.00019894882173283578, "loss": 3.3354, "step": 152 }, { "epoch": 0.09923787903356575, "grad_norm": 1.9849345684051514, "learning_rate": 0.00019893399072673268, "loss": 3.6584, "step": 153 }, { "epoch": 0.09988649262202043, "grad_norm": 1.6064544916152954, "learning_rate": 0.00019891905638799346, "loss": 2.8525, "step": 154 }, { "epoch": 0.10053510621047511, "grad_norm": 1.518021821975708, "learning_rate": 0.0001989040187322164, "loss": 3.1254, "step": 155 }, { "epoch": 0.10118371979892979, "grad_norm": 1.3452222347259521, "learning_rate": 0.00019888887777510776, "loss": 2.8711, "step": 156 }, { "epoch": 0.10183233338738447, "grad_norm": 1.5667740106582642, "learning_rate": 0.00019887363353248168, "loss": 2.8838, "step": 157 }, { "epoch": 0.10248094697583915, "grad_norm": 1.9317669868469238, "learning_rate": 0.0001988582860202601, "loss": 3.6402, "step": 158 }, { "epoch": 0.10312956056429382, "grad_norm": 1.5328751802444458, "learning_rate": 0.00019884283525447297, "loss": 3.4408, "step": 159 }, { "epoch": 0.1037781741527485, "grad_norm": 1.5413289070129395, "learning_rate": 0.00019882728125125799, "loss": 3.2537, "step": 160 }, { "epoch": 0.10442678774120318, "grad_norm": 1.7009034156799316, "learning_rate": 0.00019881162402686064, "loss": 3.2318, "step": 161 }, { "epoch": 0.10507540132965786, "grad_norm": 1.3361696004867554, "learning_rate": 0.00019879586359763436, "loss": 3.042, "step": 162 }, { "epoch": 0.10572401491811254, "grad_norm": 1.5617996454238892, "learning_rate": 0.0001987799999800402, "loss": 3.2803, "step": 163 }, { "epoch": 0.10637262850656722, "grad_norm": 1.399532437324524, "learning_rate": 0.00019876403319064715, "loss": 3.0391, "step": 164 }, { "epoch": 0.1070212420950219, "grad_norm": 1.429004192352295, "learning_rate": 0.0001987479632461319, "loss": 2.9923, "step": 165 }, { "epoch": 0.10766985568347658, "grad_norm": 1.4386272430419922, "learning_rate": 0.0001987317901632788, "loss": 2.7631, "step": 166 }, { "epoch": 0.10831846927193124, "grad_norm": 1.4610167741775513, "learning_rate": 0.00019871551395898003, "loss": 3.152, "step": 167 }, { "epoch": 0.10896708286038592, "grad_norm": 1.275974154472351, "learning_rate": 0.00019869913465023548, "loss": 3.0508, "step": 168 }, { "epoch": 0.1096156964488406, "grad_norm": 1.895013451576233, "learning_rate": 0.00019868265225415265, "loss": 3.4938, "step": 169 }, { "epoch": 0.11026431003729528, "grad_norm": 1.966009497642517, "learning_rate": 0.0001986660667879467, "loss": 3.4529, "step": 170 }, { "epoch": 0.11091292362574996, "grad_norm": 1.5443345308303833, "learning_rate": 0.0001986493782689406, "loss": 2.8802, "step": 171 }, { "epoch": 0.11156153721420464, "grad_norm": 1.4930187463760376, "learning_rate": 0.00019863258671456478, "loss": 3.1778, "step": 172 }, { "epoch": 0.11221015080265932, "grad_norm": 1.4915255308151245, "learning_rate": 0.00019861569214235737, "loss": 3.0042, "step": 173 }, { "epoch": 0.11285876439111399, "grad_norm": 1.735092043876648, "learning_rate": 0.00019859869456996407, "loss": 3.5419, "step": 174 }, { "epoch": 0.11350737797956867, "grad_norm": 1.5523368120193481, "learning_rate": 0.00019858159401513819, "loss": 3.0072, "step": 175 }, { "epoch": 0.11415599156802335, "grad_norm": 1.5719223022460938, "learning_rate": 0.00019856439049574057, "loss": 3.0198, "step": 176 }, { "epoch": 0.11480460515647803, "grad_norm": 1.835192084312439, "learning_rate": 0.0001985470840297396, "loss": 3.0267, "step": 177 }, { "epoch": 0.11545321874493271, "grad_norm": 1.9231184720993042, "learning_rate": 0.00019852967463521124, "loss": 3.1645, "step": 178 }, { "epoch": 0.11610183233338739, "grad_norm": 1.5600292682647705, "learning_rate": 0.00019851216233033884, "loss": 3.0769, "step": 179 }, { "epoch": 0.11675044592184207, "grad_norm": 1.7042049169540405, "learning_rate": 0.00019849454713341338, "loss": 3.4191, "step": 180 }, { "epoch": 0.11739905951029674, "grad_norm": 1.5651507377624512, "learning_rate": 0.0001984768290628332, "loss": 2.922, "step": 181 }, { "epoch": 0.11804767309875142, "grad_norm": 1.469074010848999, "learning_rate": 0.0001984590081371041, "loss": 2.9323, "step": 182 }, { "epoch": 0.1186962866872061, "grad_norm": 1.4901165962219238, "learning_rate": 0.00019844108437483938, "loss": 2.9711, "step": 183 }, { "epoch": 0.11934490027566078, "grad_norm": 1.219331979751587, "learning_rate": 0.00019842305779475968, "loss": 2.4844, "step": 184 }, { "epoch": 0.11999351386411546, "grad_norm": 1.562193751335144, "learning_rate": 0.00019840492841569307, "loss": 3.0347, "step": 185 }, { "epoch": 0.12064212745257014, "grad_norm": 1.3141801357269287, "learning_rate": 0.0001983866962565749, "loss": 2.8955, "step": 186 }, { "epoch": 0.12129074104102482, "grad_norm": 1.7868186235427856, "learning_rate": 0.00019836836133644802, "loss": 3.371, "step": 187 }, { "epoch": 0.12193935462947948, "grad_norm": 1.5469980239868164, "learning_rate": 0.0001983499236744625, "loss": 3.1649, "step": 188 }, { "epoch": 0.12258796821793416, "grad_norm": 1.5688115358352661, "learning_rate": 0.00019833138328987572, "loss": 3.102, "step": 189 }, { "epoch": 0.12323658180638884, "grad_norm": 1.9711377620697021, "learning_rate": 0.00019831274020205242, "loss": 3.2971, "step": 190 }, { "epoch": 0.12388519539484352, "grad_norm": 1.5055005550384521, "learning_rate": 0.00019829399443046454, "loss": 3.0587, "step": 191 }, { "epoch": 0.1245338089832982, "grad_norm": 1.5554633140563965, "learning_rate": 0.00019827514599469128, "loss": 3.2015, "step": 192 }, { "epoch": 0.12518242257175288, "grad_norm": 1.476672887802124, "learning_rate": 0.00019825619491441914, "loss": 3.0183, "step": 193 }, { "epoch": 0.12583103616020755, "grad_norm": 1.6196556091308594, "learning_rate": 0.0001982371412094417, "loss": 3.1581, "step": 194 }, { "epoch": 0.12647964974866224, "grad_norm": 1.4986144304275513, "learning_rate": 0.0001982179848996599, "loss": 3.2815, "step": 195 }, { "epoch": 0.1271282633371169, "grad_norm": 1.511244773864746, "learning_rate": 0.00019819872600508162, "loss": 3.1595, "step": 196 }, { "epoch": 0.1277768769255716, "grad_norm": 1.9068719148635864, "learning_rate": 0.00019817936454582212, "loss": 3.3151, "step": 197 }, { "epoch": 0.12842549051402627, "grad_norm": 2.2623164653778076, "learning_rate": 0.00019815990054210361, "loss": 3.6853, "step": 198 }, { "epoch": 0.12907410410248094, "grad_norm": 1.4253089427947998, "learning_rate": 0.00019814033401425554, "loss": 2.919, "step": 199 }, { "epoch": 0.12972271769093563, "grad_norm": 1.7597752809524536, "learning_rate": 0.0001981206649827143, "loss": 3.3967, "step": 200 }, { "epoch": 0.12972271769093563, "eval_loss": 3.2505762577056885, "eval_runtime": 35.0313, "eval_samples_per_second": 58.69, "eval_steps_per_second": 14.673, "step": 200 }, { "epoch": 0.1303713312793903, "grad_norm": 1.5361113548278809, "learning_rate": 0.00019810089346802346, "loss": 2.9761, "step": 201 }, { "epoch": 0.131019944867845, "grad_norm": 1.7124556303024292, "learning_rate": 0.0001980810194908336, "loss": 3.1084, "step": 202 }, { "epoch": 0.13166855845629966, "grad_norm": 1.9076920747756958, "learning_rate": 0.00019806104307190222, "loss": 3.4621, "step": 203 }, { "epoch": 0.13231717204475435, "grad_norm": 1.381435751914978, "learning_rate": 0.00019804096423209398, "loss": 2.7652, "step": 204 }, { "epoch": 0.13296578563320902, "grad_norm": 1.352182149887085, "learning_rate": 0.00019802078299238044, "loss": 2.717, "step": 205 }, { "epoch": 0.13361439922166368, "grad_norm": 1.5664377212524414, "learning_rate": 0.00019800049937384004, "loss": 2.9769, "step": 206 }, { "epoch": 0.13426301281011838, "grad_norm": 1.786026120185852, "learning_rate": 0.00019798011339765826, "loss": 3.341, "step": 207 }, { "epoch": 0.13491162639857304, "grad_norm": 1.4288690090179443, "learning_rate": 0.00019795962508512742, "loss": 2.676, "step": 208 }, { "epoch": 0.13556023998702774, "grad_norm": 1.6291303634643555, "learning_rate": 0.00019793903445764675, "loss": 3.1092, "step": 209 }, { "epoch": 0.1362088535754824, "grad_norm": 1.7784898281097412, "learning_rate": 0.0001979183415367224, "loss": 3.4292, "step": 210 }, { "epoch": 0.1368574671639371, "grad_norm": 1.7724665403366089, "learning_rate": 0.00019789754634396724, "loss": 3.3831, "step": 211 }, { "epoch": 0.13750608075239176, "grad_norm": 1.4297045469284058, "learning_rate": 0.00019787664890110108, "loss": 2.8106, "step": 212 }, { "epoch": 0.13815469434084643, "grad_norm": 1.650215983390808, "learning_rate": 0.0001978556492299504, "loss": 3.1152, "step": 213 }, { "epoch": 0.13880330792930112, "grad_norm": 1.4097318649291992, "learning_rate": 0.0001978345473524486, "loss": 3.1504, "step": 214 }, { "epoch": 0.1394519215177558, "grad_norm": 1.5713872909545898, "learning_rate": 0.00019781334329063572, "loss": 3.5687, "step": 215 }, { "epoch": 0.14010053510621048, "grad_norm": 1.2881985902786255, "learning_rate": 0.00019779203706665857, "loss": 3.0303, "step": 216 }, { "epoch": 0.14074914869466515, "grad_norm": 1.3899292945861816, "learning_rate": 0.00019777062870277064, "loss": 2.7401, "step": 217 }, { "epoch": 0.14139776228311984, "grad_norm": 1.5206934213638306, "learning_rate": 0.00019774911822133216, "loss": 2.8458, "step": 218 }, { "epoch": 0.1420463758715745, "grad_norm": 1.777827262878418, "learning_rate": 0.00019772750564480993, "loss": 3.1399, "step": 219 }, { "epoch": 0.14269498946002918, "grad_norm": 1.5830134153366089, "learning_rate": 0.00019770579099577743, "loss": 2.9685, "step": 220 }, { "epoch": 0.14334360304848387, "grad_norm": 1.619513750076294, "learning_rate": 0.0001976839742969148, "loss": 3.1719, "step": 221 }, { "epoch": 0.14399221663693854, "grad_norm": 1.6449589729309082, "learning_rate": 0.00019766205557100868, "loss": 3.1031, "step": 222 }, { "epoch": 0.14464083022539323, "grad_norm": 1.5686246156692505, "learning_rate": 0.0001976400348409523, "loss": 3.1232, "step": 223 }, { "epoch": 0.1452894438138479, "grad_norm": 1.781656265258789, "learning_rate": 0.0001976179121297455, "loss": 3.3703, "step": 224 }, { "epoch": 0.1459380574023026, "grad_norm": 1.6864426136016846, "learning_rate": 0.00019759568746049452, "loss": 3.3688, "step": 225 }, { "epoch": 0.14658667099075726, "grad_norm": 1.545371651649475, "learning_rate": 0.00019757336085641218, "loss": 3.3151, "step": 226 }, { "epoch": 0.14723528457921192, "grad_norm": 1.3052035570144653, "learning_rate": 0.0001975509323408177, "loss": 2.8913, "step": 227 }, { "epoch": 0.14788389816766662, "grad_norm": 1.2720938920974731, "learning_rate": 0.0001975284019371368, "loss": 3.0601, "step": 228 }, { "epoch": 0.14853251175612128, "grad_norm": 1.3967403173446655, "learning_rate": 0.00019750576966890158, "loss": 3.1093, "step": 229 }, { "epoch": 0.14918112534457598, "grad_norm": 1.4218616485595703, "learning_rate": 0.00019748303555975057, "loss": 2.8957, "step": 230 }, { "epoch": 0.14982973893303064, "grad_norm": 1.502661943435669, "learning_rate": 0.0001974601996334286, "loss": 3.1115, "step": 231 }, { "epoch": 0.15047835252148534, "grad_norm": 1.4609853029251099, "learning_rate": 0.00019743726191378698, "loss": 2.9707, "step": 232 }, { "epoch": 0.15112696610994, "grad_norm": 1.4173173904418945, "learning_rate": 0.00019741422242478316, "loss": 2.7085, "step": 233 }, { "epoch": 0.15177557969839467, "grad_norm": 1.6199264526367188, "learning_rate": 0.000197391081190481, "loss": 3.1486, "step": 234 }, { "epoch": 0.15242419328684936, "grad_norm": 1.5446836948394775, "learning_rate": 0.00019736783823505065, "loss": 2.884, "step": 235 }, { "epoch": 0.15307280687530403, "grad_norm": 1.59959876537323, "learning_rate": 0.00019734449358276842, "loss": 3.1943, "step": 236 }, { "epoch": 0.15372142046375872, "grad_norm": 1.547925353050232, "learning_rate": 0.00019732104725801684, "loss": 3.1038, "step": 237 }, { "epoch": 0.1543700340522134, "grad_norm": 1.5789741277694702, "learning_rate": 0.0001972974992852847, "loss": 3.0996, "step": 238 }, { "epoch": 0.15501864764066808, "grad_norm": 1.675233244895935, "learning_rate": 0.00019727384968916693, "loss": 3.3453, "step": 239 }, { "epoch": 0.15566726122912275, "grad_norm": 1.5575461387634277, "learning_rate": 0.00019725009849436463, "loss": 3.0918, "step": 240 }, { "epoch": 0.15631587481757742, "grad_norm": 1.6142653226852417, "learning_rate": 0.00019722624572568492, "loss": 3.0376, "step": 241 }, { "epoch": 0.1569644884060321, "grad_norm": 1.6989195346832275, "learning_rate": 0.0001972022914080411, "loss": 3.4667, "step": 242 }, { "epoch": 0.15761310199448678, "grad_norm": 1.5104671716690063, "learning_rate": 0.0001971782355664525, "loss": 2.9191, "step": 243 }, { "epoch": 0.15826171558294147, "grad_norm": 1.792704463005066, "learning_rate": 0.00019715407822604451, "loss": 3.2042, "step": 244 }, { "epoch": 0.15891032917139614, "grad_norm": 1.798724889755249, "learning_rate": 0.00019712981941204848, "loss": 3.1007, "step": 245 }, { "epoch": 0.15955894275985083, "grad_norm": 1.3790630102157593, "learning_rate": 0.00019710545914980183, "loss": 3.0556, "step": 246 }, { "epoch": 0.1602075563483055, "grad_norm": 1.8706490993499756, "learning_rate": 0.00019708099746474785, "loss": 3.3396, "step": 247 }, { "epoch": 0.16085616993676016, "grad_norm": 1.6114487648010254, "learning_rate": 0.00019705643438243584, "loss": 2.6075, "step": 248 }, { "epoch": 0.16150478352521486, "grad_norm": 1.5196325778961182, "learning_rate": 0.0001970317699285209, "loss": 3.0033, "step": 249 }, { "epoch": 0.16215339711366952, "grad_norm": 1.6828662157058716, "learning_rate": 0.00019700700412876416, "loss": 3.0495, "step": 250 }, { "epoch": 0.16280201070212422, "grad_norm": 1.5269091129302979, "learning_rate": 0.00019698213700903246, "loss": 3.0902, "step": 251 }, { "epoch": 0.16345062429057888, "grad_norm": 1.3918476104736328, "learning_rate": 0.00019695716859529855, "loss": 2.9452, "step": 252 }, { "epoch": 0.16409923787903358, "grad_norm": 1.6424914598464966, "learning_rate": 0.00019693209891364093, "loss": 3.2129, "step": 253 }, { "epoch": 0.16474785146748824, "grad_norm": 2.0256524085998535, "learning_rate": 0.0001969069279902439, "loss": 3.6181, "step": 254 }, { "epoch": 0.1653964650559429, "grad_norm": 1.7617406845092773, "learning_rate": 0.00019688165585139748, "loss": 3.2462, "step": 255 }, { "epoch": 0.1660450786443976, "grad_norm": 1.5007916688919067, "learning_rate": 0.00019685628252349744, "loss": 3.0225, "step": 256 }, { "epoch": 0.16669369223285227, "grad_norm": 1.6564241647720337, "learning_rate": 0.0001968308080330452, "loss": 3.0276, "step": 257 }, { "epoch": 0.16734230582130696, "grad_norm": 1.571022868156433, "learning_rate": 0.00019680523240664786, "loss": 2.853, "step": 258 }, { "epoch": 0.16799091940976163, "grad_norm": 1.3106783628463745, "learning_rate": 0.00019677955567101813, "loss": 2.539, "step": 259 }, { "epoch": 0.16863953299821632, "grad_norm": 1.4485841989517212, "learning_rate": 0.0001967537778529744, "loss": 2.9644, "step": 260 }, { "epoch": 0.169288146586671, "grad_norm": 1.4725185632705688, "learning_rate": 0.00019672789897944056, "loss": 2.8638, "step": 261 }, { "epoch": 0.16993676017512566, "grad_norm": 1.7089518308639526, "learning_rate": 0.00019670191907744598, "loss": 3.2726, "step": 262 }, { "epoch": 0.17058537376358035, "grad_norm": 2.0805907249450684, "learning_rate": 0.00019667583817412578, "loss": 3.5152, "step": 263 }, { "epoch": 0.17123398735203502, "grad_norm": 1.42578125, "learning_rate": 0.00019664965629672033, "loss": 2.7068, "step": 264 }, { "epoch": 0.1718826009404897, "grad_norm": 1.70866858959198, "learning_rate": 0.0001966233734725756, "loss": 2.9573, "step": 265 }, { "epoch": 0.17253121452894438, "grad_norm": 1.55576491355896, "learning_rate": 0.000196596989729143, "loss": 3.1079, "step": 266 }, { "epoch": 0.17317982811739907, "grad_norm": 1.4486907720565796, "learning_rate": 0.00019657050509397923, "loss": 2.9568, "step": 267 }, { "epoch": 0.17382844170585374, "grad_norm": 2.7671091556549072, "learning_rate": 0.00019654391959474647, "loss": 3.3971, "step": 268 }, { "epoch": 0.1744770552943084, "grad_norm": 1.6282739639282227, "learning_rate": 0.00019651723325921224, "loss": 3.3782, "step": 269 }, { "epoch": 0.1751256688827631, "grad_norm": 1.6156924962997437, "learning_rate": 0.00019649044611524933, "loss": 3.1748, "step": 270 }, { "epoch": 0.17577428247121776, "grad_norm": 1.4323394298553467, "learning_rate": 0.00019646355819083589, "loss": 2.9028, "step": 271 }, { "epoch": 0.17642289605967246, "grad_norm": 1.5577207803726196, "learning_rate": 0.00019643656951405525, "loss": 3.1325, "step": 272 }, { "epoch": 0.17707150964812712, "grad_norm": 1.5941506624221802, "learning_rate": 0.00019640948011309604, "loss": 3.0872, "step": 273 }, { "epoch": 0.17772012323658182, "grad_norm": 1.3285024166107178, "learning_rate": 0.00019638229001625205, "loss": 2.6622, "step": 274 }, { "epoch": 0.17836873682503648, "grad_norm": 1.4673588275909424, "learning_rate": 0.0001963549992519223, "loss": 3.1656, "step": 275 }, { "epoch": 0.17901735041349115, "grad_norm": 1.9262672662734985, "learning_rate": 0.00019632760784861087, "loss": 2.8955, "step": 276 }, { "epoch": 0.17966596400194584, "grad_norm": 1.5531195402145386, "learning_rate": 0.00019630011583492702, "loss": 2.9895, "step": 277 }, { "epoch": 0.1803145775904005, "grad_norm": 1.259700059890747, "learning_rate": 0.00019627252323958504, "loss": 2.4601, "step": 278 }, { "epoch": 0.1809631911788552, "grad_norm": 1.4137221574783325, "learning_rate": 0.00019624483009140435, "loss": 2.7425, "step": 279 }, { "epoch": 0.18161180476730987, "grad_norm": 1.2611486911773682, "learning_rate": 0.0001962170364193093, "loss": 2.7371, "step": 280 }, { "epoch": 0.18226041835576456, "grad_norm": 1.7357769012451172, "learning_rate": 0.00019618914225232934, "loss": 3.0614, "step": 281 }, { "epoch": 0.18290903194421923, "grad_norm": 1.706430435180664, "learning_rate": 0.00019616114761959874, "loss": 2.9595, "step": 282 }, { "epoch": 0.1835576455326739, "grad_norm": 1.6613085269927979, "learning_rate": 0.00019613305255035686, "loss": 2.9299, "step": 283 }, { "epoch": 0.1842062591211286, "grad_norm": 1.4859459400177002, "learning_rate": 0.00019610485707394784, "loss": 2.8886, "step": 284 }, { "epoch": 0.18485487270958326, "grad_norm": 1.5165742635726929, "learning_rate": 0.00019607656121982075, "loss": 2.7321, "step": 285 }, { "epoch": 0.18550348629803795, "grad_norm": 1.711745023727417, "learning_rate": 0.00019604816501752947, "loss": 2.8954, "step": 286 }, { "epoch": 0.18615209988649262, "grad_norm": 1.5451345443725586, "learning_rate": 0.00019601966849673276, "loss": 2.9563, "step": 287 }, { "epoch": 0.1868007134749473, "grad_norm": 1.3915531635284424, "learning_rate": 0.00019599107168719412, "loss": 3.0462, "step": 288 }, { "epoch": 0.18744932706340198, "grad_norm": 1.2463390827178955, "learning_rate": 0.0001959623746187817, "loss": 2.7958, "step": 289 }, { "epoch": 0.18809794065185664, "grad_norm": 1.683665156364441, "learning_rate": 0.0001959335773214685, "loss": 2.8798, "step": 290 }, { "epoch": 0.18874655424031134, "grad_norm": 1.5754495859146118, "learning_rate": 0.00019590467982533215, "loss": 2.8401, "step": 291 }, { "epoch": 0.189395167828766, "grad_norm": 1.6469128131866455, "learning_rate": 0.000195875682160555, "loss": 3.0458, "step": 292 }, { "epoch": 0.1900437814172207, "grad_norm": 1.4030909538269043, "learning_rate": 0.00019584658435742384, "loss": 2.8845, "step": 293 }, { "epoch": 0.19069239500567536, "grad_norm": 1.4269638061523438, "learning_rate": 0.00019581738644633027, "loss": 3.1745, "step": 294 }, { "epoch": 0.19134100859413006, "grad_norm": 1.8712362051010132, "learning_rate": 0.00019578808845777034, "loss": 2.9603, "step": 295 }, { "epoch": 0.19198962218258472, "grad_norm": 1.3823246955871582, "learning_rate": 0.0001957586904223446, "loss": 2.8134, "step": 296 }, { "epoch": 0.1926382357710394, "grad_norm": 1.6640461683273315, "learning_rate": 0.00019572919237075817, "loss": 3.0434, "step": 297 }, { "epoch": 0.19328684935949408, "grad_norm": 1.68658447265625, "learning_rate": 0.0001956995943338206, "loss": 3.2863, "step": 298 }, { "epoch": 0.19393546294794875, "grad_norm": 1.4889813661575317, "learning_rate": 0.00019566989634244584, "loss": 2.7607, "step": 299 }, { "epoch": 0.19458407653640344, "grad_norm": 1.526188850402832, "learning_rate": 0.00019564009842765225, "loss": 2.9834, "step": 300 }, { "epoch": 0.19458407653640344, "eval_loss": 3.1460793018341064, "eval_runtime": 35.0284, "eval_samples_per_second": 58.695, "eval_steps_per_second": 14.674, "step": 300 }, { "epoch": 0.1952326901248581, "grad_norm": 1.5348420143127441, "learning_rate": 0.00019561020062056262, "loss": 3.0085, "step": 301 }, { "epoch": 0.1958813037133128, "grad_norm": 1.731803297996521, "learning_rate": 0.000195580202952404, "loss": 2.6935, "step": 302 }, { "epoch": 0.19652991730176747, "grad_norm": 1.4084587097167969, "learning_rate": 0.0001955501054545078, "loss": 2.686, "step": 303 }, { "epoch": 0.19717853089022214, "grad_norm": 1.6902164220809937, "learning_rate": 0.00019551990815830958, "loss": 3.081, "step": 304 }, { "epoch": 0.19782714447867683, "grad_norm": 1.8202259540557861, "learning_rate": 0.00019548961109534934, "loss": 3.3835, "step": 305 }, { "epoch": 0.1984757580671315, "grad_norm": 1.5406153202056885, "learning_rate": 0.00019545921429727105, "loss": 2.7855, "step": 306 }, { "epoch": 0.1991243716555862, "grad_norm": 1.4835028648376465, "learning_rate": 0.00019542871779582304, "loss": 2.8638, "step": 307 }, { "epoch": 0.19977298524404086, "grad_norm": 1.9609777927398682, "learning_rate": 0.00019539812162285767, "loss": 3.0011, "step": 308 }, { "epoch": 0.20042159883249555, "grad_norm": 1.6606584787368774, "learning_rate": 0.00019536742581033144, "loss": 2.8927, "step": 309 }, { "epoch": 0.20107021242095022, "grad_norm": 1.4777995347976685, "learning_rate": 0.0001953366303903049, "loss": 2.9018, "step": 310 }, { "epoch": 0.20171882600940488, "grad_norm": 1.5207997560501099, "learning_rate": 0.00019530573539494262, "loss": 2.7984, "step": 311 }, { "epoch": 0.20236743959785958, "grad_norm": 1.5775551795959473, "learning_rate": 0.00019527474085651324, "loss": 2.9266, "step": 312 }, { "epoch": 0.20301605318631424, "grad_norm": 1.8596768379211426, "learning_rate": 0.00019524364680738933, "loss": 3.2169, "step": 313 }, { "epoch": 0.20366466677476894, "grad_norm": 2.019535541534424, "learning_rate": 0.00019521245328004737, "loss": 2.8426, "step": 314 }, { "epoch": 0.2043132803632236, "grad_norm": 1.4515011310577393, "learning_rate": 0.00019518116030706778, "loss": 2.8278, "step": 315 }, { "epoch": 0.2049618939516783, "grad_norm": 1.6668334007263184, "learning_rate": 0.00019514976792113484, "loss": 2.9151, "step": 316 }, { "epoch": 0.20561050754013296, "grad_norm": 1.6750513315200806, "learning_rate": 0.00019511827615503662, "loss": 3.0335, "step": 317 }, { "epoch": 0.20625912112858763, "grad_norm": 1.3898333311080933, "learning_rate": 0.00019508668504166505, "loss": 2.7486, "step": 318 }, { "epoch": 0.20690773471704232, "grad_norm": 1.9349443912506104, "learning_rate": 0.0001950549946140158, "loss": 3.3656, "step": 319 }, { "epoch": 0.207556348305497, "grad_norm": 1.46927809715271, "learning_rate": 0.00019502320490518823, "loss": 2.8108, "step": 320 }, { "epoch": 0.20820496189395168, "grad_norm": 1.5501346588134766, "learning_rate": 0.0001949913159483855, "loss": 2.7632, "step": 321 }, { "epoch": 0.20885357548240635, "grad_norm": 1.5428237915039062, "learning_rate": 0.00019495932777691428, "loss": 2.9241, "step": 322 }, { "epoch": 0.20950218907086104, "grad_norm": 1.4161127805709839, "learning_rate": 0.000194927240424185, "loss": 2.7479, "step": 323 }, { "epoch": 0.2101508026593157, "grad_norm": 1.6062211990356445, "learning_rate": 0.00019489505392371163, "loss": 3.0915, "step": 324 }, { "epoch": 0.21079941624777038, "grad_norm": 1.6449527740478516, "learning_rate": 0.00019486276830911166, "loss": 3.1093, "step": 325 }, { "epoch": 0.21144802983622507, "grad_norm": 1.6712406873703003, "learning_rate": 0.00019483038361410622, "loss": 3.2132, "step": 326 }, { "epoch": 0.21209664342467974, "grad_norm": 1.9225763082504272, "learning_rate": 0.00019479789987251973, "loss": 3.334, "step": 327 }, { "epoch": 0.21274525701313443, "grad_norm": 1.6560297012329102, "learning_rate": 0.00019476531711828027, "loss": 3.1673, "step": 328 }, { "epoch": 0.2133938706015891, "grad_norm": 1.6719776391983032, "learning_rate": 0.00019473263538541914, "loss": 2.8777, "step": 329 }, { "epoch": 0.2140424841900438, "grad_norm": 1.66377854347229, "learning_rate": 0.00019469985470807122, "loss": 3.0038, "step": 330 }, { "epoch": 0.21469109777849846, "grad_norm": 1.5589053630828857, "learning_rate": 0.00019466697512047452, "loss": 2.8917, "step": 331 }, { "epoch": 0.21533971136695315, "grad_norm": 1.5291959047317505, "learning_rate": 0.00019463399665697057, "loss": 2.9973, "step": 332 }, { "epoch": 0.21598832495540782, "grad_norm": 1.274047613143921, "learning_rate": 0.000194600919352004, "loss": 2.6853, "step": 333 }, { "epoch": 0.21663693854386248, "grad_norm": 1.4769723415374756, "learning_rate": 0.00019456774324012278, "loss": 2.6683, "step": 334 }, { "epoch": 0.21728555213231718, "grad_norm": 1.3692103624343872, "learning_rate": 0.00019453446835597803, "loss": 2.5512, "step": 335 }, { "epoch": 0.21793416572077184, "grad_norm": 1.6415315866470337, "learning_rate": 0.00019450109473432406, "loss": 3.1087, "step": 336 }, { "epoch": 0.21858277930922654, "grad_norm": 1.6555556058883667, "learning_rate": 0.00019446762241001828, "loss": 2.9557, "step": 337 }, { "epoch": 0.2192313928976812, "grad_norm": 1.7784076929092407, "learning_rate": 0.0001944340514180212, "loss": 3.3665, "step": 338 }, { "epoch": 0.2198800064861359, "grad_norm": 1.7411212921142578, "learning_rate": 0.00019440038179339647, "loss": 3.1573, "step": 339 }, { "epoch": 0.22052862007459056, "grad_norm": 1.2555419206619263, "learning_rate": 0.00019436661357131056, "loss": 2.3548, "step": 340 }, { "epoch": 0.22117723366304523, "grad_norm": 1.7457014322280884, "learning_rate": 0.00019433274678703314, "loss": 3.1, "step": 341 }, { "epoch": 0.22182584725149992, "grad_norm": 1.761592149734497, "learning_rate": 0.00019429878147593667, "loss": 3.0195, "step": 342 }, { "epoch": 0.2224744608399546, "grad_norm": 1.766256332397461, "learning_rate": 0.0001942647176734966, "loss": 2.9552, "step": 343 }, { "epoch": 0.22312307442840928, "grad_norm": 1.3047394752502441, "learning_rate": 0.0001942305554152912, "loss": 2.6313, "step": 344 }, { "epoch": 0.22377168801686395, "grad_norm": 1.9204014539718628, "learning_rate": 0.0001941962947370016, "loss": 3.2288, "step": 345 }, { "epoch": 0.22442030160531865, "grad_norm": 2.016052484512329, "learning_rate": 0.00019416193567441172, "loss": 3.0854, "step": 346 }, { "epoch": 0.2250689151937733, "grad_norm": 1.4872554540634155, "learning_rate": 0.00019412747826340828, "loss": 2.707, "step": 347 }, { "epoch": 0.22571752878222798, "grad_norm": 1.4982898235321045, "learning_rate": 0.00019409292253998062, "loss": 2.8002, "step": 348 }, { "epoch": 0.22636614237068267, "grad_norm": 1.6852301359176636, "learning_rate": 0.0001940582685402209, "loss": 2.7447, "step": 349 }, { "epoch": 0.22701475595913734, "grad_norm": 1.5387024879455566, "learning_rate": 0.00019402351630032377, "loss": 2.8884, "step": 350 }, { "epoch": 0.22766336954759203, "grad_norm": 1.4964500665664673, "learning_rate": 0.00019398866585658658, "loss": 2.8696, "step": 351 }, { "epoch": 0.2283119831360467, "grad_norm": 1.7827928066253662, "learning_rate": 0.00019395371724540935, "loss": 3.1649, "step": 352 }, { "epoch": 0.2289605967245014, "grad_norm": 1.6043105125427246, "learning_rate": 0.00019391867050329436, "loss": 2.773, "step": 353 }, { "epoch": 0.22960921031295606, "grad_norm": 1.7371394634246826, "learning_rate": 0.00019388352566684664, "loss": 2.5968, "step": 354 }, { "epoch": 0.23025782390141072, "grad_norm": 1.862587571144104, "learning_rate": 0.00019384828277277356, "loss": 3.0649, "step": 355 }, { "epoch": 0.23090643748986542, "grad_norm": 1.7257881164550781, "learning_rate": 0.00019381294185788494, "loss": 2.9117, "step": 356 }, { "epoch": 0.23155505107832008, "grad_norm": 1.5261613130569458, "learning_rate": 0.00019377750295909295, "loss": 3.0148, "step": 357 }, { "epoch": 0.23220366466677478, "grad_norm": 1.1886473894119263, "learning_rate": 0.0001937419661134121, "loss": 2.9267, "step": 358 }, { "epoch": 0.23285227825522944, "grad_norm": 1.715752124786377, "learning_rate": 0.00019370633135795922, "loss": 2.8664, "step": 359 }, { "epoch": 0.23350089184368414, "grad_norm": 1.4864920377731323, "learning_rate": 0.00019367059872995348, "loss": 3.0009, "step": 360 }, { "epoch": 0.2341495054321388, "grad_norm": 1.4579834938049316, "learning_rate": 0.00019363476826671612, "loss": 2.7175, "step": 361 }, { "epoch": 0.23479811902059347, "grad_norm": 1.9683144092559814, "learning_rate": 0.0001935988400056706, "loss": 3.2889, "step": 362 }, { "epoch": 0.23544673260904816, "grad_norm": 2.1681151390075684, "learning_rate": 0.00019356281398434267, "loss": 3.3214, "step": 363 }, { "epoch": 0.23609534619750283, "grad_norm": 1.5372897386550903, "learning_rate": 0.00019352669024036003, "loss": 2.7409, "step": 364 }, { "epoch": 0.23674395978595753, "grad_norm": 1.6058359146118164, "learning_rate": 0.0001934904688114525, "loss": 2.9069, "step": 365 }, { "epoch": 0.2373925733744122, "grad_norm": 1.6840300559997559, "learning_rate": 0.00019345414973545192, "loss": 2.9605, "step": 366 }, { "epoch": 0.23804118696286689, "grad_norm": 1.4882802963256836, "learning_rate": 0.00019341773305029216, "loss": 2.8192, "step": 367 }, { "epoch": 0.23868980055132155, "grad_norm": 1.947954535484314, "learning_rate": 0.00019338121879400896, "loss": 3.295, "step": 368 }, { "epoch": 0.23933841413977622, "grad_norm": 2.0685973167419434, "learning_rate": 0.00019334460700474006, "loss": 2.9202, "step": 369 }, { "epoch": 0.2399870277282309, "grad_norm": 1.392995834350586, "learning_rate": 0.000193307897720725, "loss": 2.6796, "step": 370 }, { "epoch": 0.24063564131668558, "grad_norm": 2.002079486846924, "learning_rate": 0.00019327109098030517, "loss": 3.2781, "step": 371 }, { "epoch": 0.24128425490514027, "grad_norm": 1.7302005290985107, "learning_rate": 0.00019323418682192376, "loss": 2.8825, "step": 372 }, { "epoch": 0.24193286849359494, "grad_norm": 1.3565232753753662, "learning_rate": 0.0001931971852841257, "loss": 2.876, "step": 373 }, { "epoch": 0.24258148208204963, "grad_norm": 1.7181309461593628, "learning_rate": 0.00019316008640555765, "loss": 2.8888, "step": 374 }, { "epoch": 0.2432300956705043, "grad_norm": 1.7642203569412231, "learning_rate": 0.0001931228902249679, "loss": 3.335, "step": 375 }, { "epoch": 0.24387870925895896, "grad_norm": 1.6514211893081665, "learning_rate": 0.00019308559678120641, "loss": 2.7494, "step": 376 }, { "epoch": 0.24452732284741366, "grad_norm": 1.8369169235229492, "learning_rate": 0.00019304820611322466, "loss": 3.3083, "step": 377 }, { "epoch": 0.24517593643586832, "grad_norm": 1.6943202018737793, "learning_rate": 0.00019301071826007576, "loss": 2.8296, "step": 378 }, { "epoch": 0.24582455002432302, "grad_norm": 1.3199753761291504, "learning_rate": 0.00019297313326091428, "loss": 2.7518, "step": 379 }, { "epoch": 0.24647316361277768, "grad_norm": 1.4667972326278687, "learning_rate": 0.00019293545115499624, "loss": 3.0344, "step": 380 }, { "epoch": 0.24712177720123238, "grad_norm": 1.3710405826568604, "learning_rate": 0.00019289767198167916, "loss": 2.6783, "step": 381 }, { "epoch": 0.24777039078968704, "grad_norm": 1.6002377271652222, "learning_rate": 0.00019285979578042184, "loss": 2.9566, "step": 382 }, { "epoch": 0.2484190043781417, "grad_norm": 1.3481214046478271, "learning_rate": 0.0001928218225907845, "loss": 2.5909, "step": 383 }, { "epoch": 0.2490676179665964, "grad_norm": 1.6726258993148804, "learning_rate": 0.0001927837524524286, "loss": 2.8626, "step": 384 }, { "epoch": 0.24971623155505107, "grad_norm": 1.9543962478637695, "learning_rate": 0.00019274558540511696, "loss": 2.8986, "step": 385 }, { "epoch": 0.25036484514350577, "grad_norm": 1.5243573188781738, "learning_rate": 0.0001927073214887135, "loss": 2.7777, "step": 386 }, { "epoch": 0.25101345873196046, "grad_norm": 1.5934795141220093, "learning_rate": 0.00019266896074318334, "loss": 2.8004, "step": 387 }, { "epoch": 0.2516620723204151, "grad_norm": 1.762434959411621, "learning_rate": 0.00019263050320859283, "loss": 3.0764, "step": 388 }, { "epoch": 0.2523106859088698, "grad_norm": 1.614300012588501, "learning_rate": 0.0001925919489251093, "loss": 2.8265, "step": 389 }, { "epoch": 0.2529592994973245, "grad_norm": 1.491695761680603, "learning_rate": 0.00019255329793300114, "loss": 2.6368, "step": 390 }, { "epoch": 0.2536079130857791, "grad_norm": 1.3432161808013916, "learning_rate": 0.00019251455027263786, "loss": 2.4541, "step": 391 }, { "epoch": 0.2542565266742338, "grad_norm": 1.573061466217041, "learning_rate": 0.0001924757059844898, "loss": 2.8716, "step": 392 }, { "epoch": 0.2549051402626885, "grad_norm": 1.695497989654541, "learning_rate": 0.0001924367651091283, "loss": 2.9639, "step": 393 }, { "epoch": 0.2555537538511432, "grad_norm": 1.673969030380249, "learning_rate": 0.00019239772768722558, "loss": 2.7697, "step": 394 }, { "epoch": 0.25620236743959784, "grad_norm": 1.746987223625183, "learning_rate": 0.00019235859375955463, "loss": 2.7487, "step": 395 }, { "epoch": 0.25685098102805254, "grad_norm": 1.5612534284591675, "learning_rate": 0.00019231936336698932, "loss": 2.6166, "step": 396 }, { "epoch": 0.25749959461650723, "grad_norm": 1.751975417137146, "learning_rate": 0.00019228003655050424, "loss": 3.0322, "step": 397 }, { "epoch": 0.25814820820496187, "grad_norm": 1.908344030380249, "learning_rate": 0.00019224061335117472, "loss": 2.9928, "step": 398 }, { "epoch": 0.25879682179341656, "grad_norm": 1.5650019645690918, "learning_rate": 0.0001922010938101767, "loss": 2.9637, "step": 399 }, { "epoch": 0.25944543538187126, "grad_norm": 1.636045217514038, "learning_rate": 0.00019216147796878675, "loss": 2.6376, "step": 400 }, { "epoch": 0.25944543538187126, "eval_loss": 3.0807607173919678, "eval_runtime": 35.0392, "eval_samples_per_second": 58.677, "eval_steps_per_second": 14.669, "step": 400 }, { "epoch": 0.26009404897032595, "grad_norm": 1.5578237771987915, "learning_rate": 0.0001921217658683821, "loss": 2.8666, "step": 401 }, { "epoch": 0.2607426625587806, "grad_norm": 1.553233027458191, "learning_rate": 0.00019208195755044042, "loss": 3.0263, "step": 402 }, { "epoch": 0.2613912761472353, "grad_norm": 1.6139765977859497, "learning_rate": 0.00019204205305653997, "loss": 2.8051, "step": 403 }, { "epoch": 0.26203988973569, "grad_norm": 1.8325905799865723, "learning_rate": 0.00019200205242835938, "loss": 3.2565, "step": 404 }, { "epoch": 0.2626885033241446, "grad_norm": 1.861401915550232, "learning_rate": 0.00019196195570767775, "loss": 2.8737, "step": 405 }, { "epoch": 0.2633371169125993, "grad_norm": 1.9898242950439453, "learning_rate": 0.00019192176293637448, "loss": 2.9564, "step": 406 }, { "epoch": 0.263985730501054, "grad_norm": 1.7285724878311157, "learning_rate": 0.00019188147415642934, "loss": 2.7479, "step": 407 }, { "epoch": 0.2646343440895087, "grad_norm": 1.913542628288269, "learning_rate": 0.0001918410894099224, "loss": 3.2177, "step": 408 }, { "epoch": 0.26528295767796334, "grad_norm": 1.4639517068862915, "learning_rate": 0.0001918006087390339, "loss": 2.507, "step": 409 }, { "epoch": 0.26593157126641803, "grad_norm": 1.6322999000549316, "learning_rate": 0.00019176003218604427, "loss": 2.7468, "step": 410 }, { "epoch": 0.2665801848548727, "grad_norm": 1.634833574295044, "learning_rate": 0.00019171935979333418, "loss": 3.0316, "step": 411 }, { "epoch": 0.26722879844332736, "grad_norm": 1.6071871519088745, "learning_rate": 0.00019167859160338426, "loss": 2.9154, "step": 412 }, { "epoch": 0.26787741203178206, "grad_norm": 1.5342798233032227, "learning_rate": 0.00019163772765877534, "loss": 2.774, "step": 413 }, { "epoch": 0.26852602562023675, "grad_norm": 2.07749080657959, "learning_rate": 0.00019159676800218814, "loss": 3.1722, "step": 414 }, { "epoch": 0.26917463920869145, "grad_norm": 1.5742781162261963, "learning_rate": 0.00019155571267640342, "loss": 2.7114, "step": 415 }, { "epoch": 0.2698232527971461, "grad_norm": 2.0750064849853516, "learning_rate": 0.00019151456172430183, "loss": 3.0305, "step": 416 }, { "epoch": 0.2704718663856008, "grad_norm": 1.4500852823257446, "learning_rate": 0.00019147331518886394, "loss": 2.7075, "step": 417 }, { "epoch": 0.2711204799740555, "grad_norm": 1.4853321313858032, "learning_rate": 0.00019143197311317014, "loss": 2.6766, "step": 418 }, { "epoch": 0.2717690935625101, "grad_norm": 1.6843584775924683, "learning_rate": 0.00019139053554040051, "loss": 2.9589, "step": 419 }, { "epoch": 0.2724177071509648, "grad_norm": 1.4509408473968506, "learning_rate": 0.000191349002513835, "loss": 2.7141, "step": 420 }, { "epoch": 0.2730663207394195, "grad_norm": 1.3282841444015503, "learning_rate": 0.00019130737407685322, "loss": 2.7155, "step": 421 }, { "epoch": 0.2737149343278742, "grad_norm": 2.0540008544921875, "learning_rate": 0.0001912656502729344, "loss": 3.0081, "step": 422 }, { "epoch": 0.27436354791632883, "grad_norm": 1.7258481979370117, "learning_rate": 0.00019122383114565745, "loss": 3.0919, "step": 423 }, { "epoch": 0.2750121615047835, "grad_norm": 1.791343331336975, "learning_rate": 0.00019118191673870075, "loss": 3.1804, "step": 424 }, { "epoch": 0.2756607750932382, "grad_norm": 1.437330961227417, "learning_rate": 0.0001911399070958422, "loss": 2.7125, "step": 425 }, { "epoch": 0.27630938868169286, "grad_norm": 1.4045196771621704, "learning_rate": 0.00019109780226095932, "loss": 2.6503, "step": 426 }, { "epoch": 0.27695800227014755, "grad_norm": 1.5944185256958008, "learning_rate": 0.00019105560227802886, "loss": 2.5278, "step": 427 }, { "epoch": 0.27760661585860225, "grad_norm": 1.549092411994934, "learning_rate": 0.00019101330719112705, "loss": 2.7544, "step": 428 }, { "epoch": 0.27825522944705694, "grad_norm": 1.571057677268982, "learning_rate": 0.00019097091704442943, "loss": 2.7882, "step": 429 }, { "epoch": 0.2789038430355116, "grad_norm": 1.3524292707443237, "learning_rate": 0.00019092843188221083, "loss": 2.7014, "step": 430 }, { "epoch": 0.2795524566239663, "grad_norm": 1.6462427377700806, "learning_rate": 0.00019088585174884532, "loss": 2.8716, "step": 431 }, { "epoch": 0.28020107021242097, "grad_norm": 1.873026728630066, "learning_rate": 0.0001908431766888062, "loss": 3.125, "step": 432 }, { "epoch": 0.2808496838008756, "grad_norm": 1.6041051149368286, "learning_rate": 0.0001908004067466658, "loss": 2.7251, "step": 433 }, { "epoch": 0.2814982973893303, "grad_norm": 2.2293577194213867, "learning_rate": 0.00019075754196709572, "loss": 3.2629, "step": 434 }, { "epoch": 0.282146910977785, "grad_norm": 1.5311048030853271, "learning_rate": 0.00019071458239486647, "loss": 2.5779, "step": 435 }, { "epoch": 0.2827955245662397, "grad_norm": 1.599424958229065, "learning_rate": 0.00019067152807484763, "loss": 2.8575, "step": 436 }, { "epoch": 0.2834441381546943, "grad_norm": 1.244439721107483, "learning_rate": 0.00019062837905200775, "loss": 2.5516, "step": 437 }, { "epoch": 0.284092751743149, "grad_norm": 1.7165900468826294, "learning_rate": 0.00019058513537141428, "loss": 2.6784, "step": 438 }, { "epoch": 0.2847413653316037, "grad_norm": 1.5756560564041138, "learning_rate": 0.00019054179707823349, "loss": 2.8271, "step": 439 }, { "epoch": 0.28538997892005835, "grad_norm": 1.6109158992767334, "learning_rate": 0.00019049836421773054, "loss": 2.8693, "step": 440 }, { "epoch": 0.28603859250851305, "grad_norm": 1.7730863094329834, "learning_rate": 0.00019045483683526935, "loss": 2.7913, "step": 441 }, { "epoch": 0.28668720609696774, "grad_norm": 1.6210626363754272, "learning_rate": 0.00019041121497631253, "loss": 2.7582, "step": 442 }, { "epoch": 0.28733581968542243, "grad_norm": 1.9615261554718018, "learning_rate": 0.00019036749868642133, "loss": 2.6966, "step": 443 }, { "epoch": 0.28798443327387707, "grad_norm": 1.6248594522476196, "learning_rate": 0.00019032368801125577, "loss": 2.6116, "step": 444 }, { "epoch": 0.28863304686233177, "grad_norm": 1.7763419151306152, "learning_rate": 0.00019027978299657436, "loss": 2.7513, "step": 445 }, { "epoch": 0.28928166045078646, "grad_norm": 1.6963870525360107, "learning_rate": 0.00019023578368823412, "loss": 2.7999, "step": 446 }, { "epoch": 0.2899302740392411, "grad_norm": 1.6345908641815186, "learning_rate": 0.00019019169013219055, "loss": 2.8505, "step": 447 }, { "epoch": 0.2905788876276958, "grad_norm": 1.9350310564041138, "learning_rate": 0.0001901475023744977, "loss": 3.1872, "step": 448 }, { "epoch": 0.2912275012161505, "grad_norm": 1.5780411958694458, "learning_rate": 0.00019010322046130788, "loss": 2.6037, "step": 449 }, { "epoch": 0.2918761148046052, "grad_norm": 1.9846161603927612, "learning_rate": 0.0001900588444388718, "loss": 2.6862, "step": 450 }, { "epoch": 0.2925247283930598, "grad_norm": 1.9463258981704712, "learning_rate": 0.00019001437435353847, "loss": 3.1947, "step": 451 }, { "epoch": 0.2931733419815145, "grad_norm": 1.9290223121643066, "learning_rate": 0.00018996981025175519, "loss": 3.1036, "step": 452 }, { "epoch": 0.2938219555699692, "grad_norm": 1.4379712343215942, "learning_rate": 0.00018992515218006726, "loss": 2.4148, "step": 453 }, { "epoch": 0.29447056915842384, "grad_norm": 1.5647400617599487, "learning_rate": 0.00018988040018511838, "loss": 2.771, "step": 454 }, { "epoch": 0.29511918274687854, "grad_norm": 1.7547186613082886, "learning_rate": 0.0001898355543136502, "loss": 2.9179, "step": 455 }, { "epoch": 0.29576779633533323, "grad_norm": 2.1263155937194824, "learning_rate": 0.0001897906146125025, "loss": 2.8847, "step": 456 }, { "epoch": 0.2964164099237879, "grad_norm": 1.5842361450195312, "learning_rate": 0.00018974558112861293, "loss": 2.6552, "step": 457 }, { "epoch": 0.29706502351224257, "grad_norm": 1.5192675590515137, "learning_rate": 0.00018970045390901728, "loss": 2.7666, "step": 458 }, { "epoch": 0.29771363710069726, "grad_norm": 1.7092015743255615, "learning_rate": 0.00018965523300084906, "loss": 2.7522, "step": 459 }, { "epoch": 0.29836225068915195, "grad_norm": 1.6913198232650757, "learning_rate": 0.0001896099184513398, "loss": 2.4689, "step": 460 }, { "epoch": 0.2990108642776066, "grad_norm": 1.7776695489883423, "learning_rate": 0.00018956451030781866, "loss": 2.9348, "step": 461 }, { "epoch": 0.2996594778660613, "grad_norm": 1.8394848108291626, "learning_rate": 0.00018951900861771272, "loss": 2.9898, "step": 462 }, { "epoch": 0.300308091454516, "grad_norm": 1.7781721353530884, "learning_rate": 0.00018947341342854667, "loss": 3.0998, "step": 463 }, { "epoch": 0.3009567050429707, "grad_norm": 1.9235535860061646, "learning_rate": 0.0001894277247879429, "loss": 3.2562, "step": 464 }, { "epoch": 0.3016053186314253, "grad_norm": 1.9180947542190552, "learning_rate": 0.00018938194274362128, "loss": 3.0752, "step": 465 }, { "epoch": 0.30225393221988, "grad_norm": 1.8241761922836304, "learning_rate": 0.00018933606734339954, "loss": 3.0885, "step": 466 }, { "epoch": 0.3029025458083347, "grad_norm": 1.6399977207183838, "learning_rate": 0.00018929009863519251, "loss": 2.6054, "step": 467 }, { "epoch": 0.30355115939678934, "grad_norm": 1.5864003896713257, "learning_rate": 0.00018924403666701286, "loss": 2.7837, "step": 468 }, { "epoch": 0.30419977298524403, "grad_norm": 1.562268614768982, "learning_rate": 0.00018919788148697038, "loss": 2.4986, "step": 469 }, { "epoch": 0.3048483865736987, "grad_norm": 1.7865949869155884, "learning_rate": 0.00018915163314327235, "loss": 2.9248, "step": 470 }, { "epoch": 0.3054970001621534, "grad_norm": 1.4853394031524658, "learning_rate": 0.00018910529168422336, "loss": 2.5685, "step": 471 }, { "epoch": 0.30614561375060806, "grad_norm": 1.7445790767669678, "learning_rate": 0.00018905885715822524, "loss": 2.6873, "step": 472 }, { "epoch": 0.30679422733906275, "grad_norm": 1.549039363861084, "learning_rate": 0.00018901232961377696, "loss": 2.5288, "step": 473 }, { "epoch": 0.30744284092751745, "grad_norm": 1.7520999908447266, "learning_rate": 0.00018896570909947475, "loss": 2.8631, "step": 474 }, { "epoch": 0.3080914545159721, "grad_norm": 2.0441670417785645, "learning_rate": 0.00018891899566401185, "loss": 2.9225, "step": 475 }, { "epoch": 0.3087400681044268, "grad_norm": 2.1516342163085938, "learning_rate": 0.00018887218935617864, "loss": 3.1289, "step": 476 }, { "epoch": 0.3093886816928815, "grad_norm": 1.6816316843032837, "learning_rate": 0.0001888252902248624, "loss": 3.0072, "step": 477 }, { "epoch": 0.31003729528133617, "grad_norm": 1.9715406894683838, "learning_rate": 0.00018877829831904746, "loss": 3.1758, "step": 478 }, { "epoch": 0.3106859088697908, "grad_norm": 2.013761281967163, "learning_rate": 0.00018873121368781495, "loss": 3.1089, "step": 479 }, { "epoch": 0.3113345224582455, "grad_norm": 1.7637441158294678, "learning_rate": 0.000188684036380343, "loss": 3.0456, "step": 480 }, { "epoch": 0.3119831360467002, "grad_norm": 2.099621534347534, "learning_rate": 0.00018863676644590634, "loss": 3.0646, "step": 481 }, { "epoch": 0.31263174963515483, "grad_norm": 1.895135760307312, "learning_rate": 0.0001885894039338766, "loss": 2.873, "step": 482 }, { "epoch": 0.3132803632236095, "grad_norm": 1.6207704544067383, "learning_rate": 0.00018854194889372203, "loss": 2.891, "step": 483 }, { "epoch": 0.3139289768120642, "grad_norm": 1.5354077816009521, "learning_rate": 0.00018849440137500757, "loss": 2.5351, "step": 484 }, { "epoch": 0.3145775904005189, "grad_norm": 1.7981899976730347, "learning_rate": 0.00018844676142739468, "loss": 2.6389, "step": 485 }, { "epoch": 0.31522620398897355, "grad_norm": 1.8740936517715454, "learning_rate": 0.0001883990291006414, "loss": 2.8595, "step": 486 }, { "epoch": 0.31587481757742825, "grad_norm": 1.5358887910842896, "learning_rate": 0.0001883512044446023, "loss": 2.8182, "step": 487 }, { "epoch": 0.31652343116588294, "grad_norm": 1.634946584701538, "learning_rate": 0.0001883032875092283, "loss": 2.8051, "step": 488 }, { "epoch": 0.3171720447543376, "grad_norm": 1.5934207439422607, "learning_rate": 0.00018825527834456676, "loss": 2.567, "step": 489 }, { "epoch": 0.3178206583427923, "grad_norm": 2.1626904010772705, "learning_rate": 0.00018820717700076132, "loss": 3.3013, "step": 490 }, { "epoch": 0.31846927193124697, "grad_norm": 1.679886817932129, "learning_rate": 0.00018815898352805198, "loss": 2.7692, "step": 491 }, { "epoch": 0.31911788551970166, "grad_norm": 1.5849850177764893, "learning_rate": 0.0001881106979767749, "loss": 2.5478, "step": 492 }, { "epoch": 0.3197664991081563, "grad_norm": 1.5100640058517456, "learning_rate": 0.00018806232039736238, "loss": 2.9153, "step": 493 }, { "epoch": 0.320415112696611, "grad_norm": 1.6818066835403442, "learning_rate": 0.00018801385084034293, "loss": 2.5523, "step": 494 }, { "epoch": 0.3210637262850657, "grad_norm": 1.7340106964111328, "learning_rate": 0.00018796528935634106, "loss": 2.6807, "step": 495 }, { "epoch": 0.3217123398735203, "grad_norm": 2.0395491123199463, "learning_rate": 0.00018791663599607733, "loss": 2.8422, "step": 496 }, { "epoch": 0.322360953461975, "grad_norm": 2.242358684539795, "learning_rate": 0.0001878678908103683, "loss": 3.1986, "step": 497 }, { "epoch": 0.3230095670504297, "grad_norm": 1.7941346168518066, "learning_rate": 0.00018781905385012627, "loss": 2.7526, "step": 498 }, { "epoch": 0.3236581806388844, "grad_norm": 1.8280810117721558, "learning_rate": 0.00018777012516635962, "loss": 3.0354, "step": 499 }, { "epoch": 0.32430679422733905, "grad_norm": 2.011113405227661, "learning_rate": 0.00018772110481017236, "loss": 3.4206, "step": 500 }, { "epoch": 0.32430679422733905, "eval_loss": 2.9965591430664062, "eval_runtime": 35.1456, "eval_samples_per_second": 58.5, "eval_steps_per_second": 14.625, "step": 500 }, { "epoch": 0.32495540781579374, "grad_norm": 1.837101936340332, "learning_rate": 0.00018767199283276434, "loss": 3.0649, "step": 501 }, { "epoch": 0.32560402140424843, "grad_norm": 1.6489636898040771, "learning_rate": 0.00018762278928543107, "loss": 2.8305, "step": 502 }, { "epoch": 0.32625263499270307, "grad_norm": 1.6956580877304077, "learning_rate": 0.0001875734942195637, "loss": 2.8062, "step": 503 }, { "epoch": 0.32690124858115777, "grad_norm": 1.6304137706756592, "learning_rate": 0.00018752410768664896, "loss": 2.9245, "step": 504 }, { "epoch": 0.32754986216961246, "grad_norm": 1.58011794090271, "learning_rate": 0.00018747462973826918, "loss": 3.0477, "step": 505 }, { "epoch": 0.32819847575806715, "grad_norm": 1.6205480098724365, "learning_rate": 0.00018742506042610207, "loss": 2.7484, "step": 506 }, { "epoch": 0.3288470893465218, "grad_norm": 1.7571868896484375, "learning_rate": 0.0001873753998019208, "loss": 2.7491, "step": 507 }, { "epoch": 0.3294957029349765, "grad_norm": 1.8142462968826294, "learning_rate": 0.000187325647917594, "loss": 2.951, "step": 508 }, { "epoch": 0.3301443165234312, "grad_norm": 2.0688979625701904, "learning_rate": 0.00018727580482508547, "loss": 3.1116, "step": 509 }, { "epoch": 0.3307929301118858, "grad_norm": 1.7499206066131592, "learning_rate": 0.0001872258705764544, "loss": 2.6538, "step": 510 }, { "epoch": 0.3314415437003405, "grad_norm": 1.4673819541931152, "learning_rate": 0.00018717584522385508, "loss": 2.5399, "step": 511 }, { "epoch": 0.3320901572887952, "grad_norm": 1.7545253038406372, "learning_rate": 0.00018712572881953704, "loss": 2.8087, "step": 512 }, { "epoch": 0.3327387708772499, "grad_norm": 1.840064287185669, "learning_rate": 0.0001870755214158449, "loss": 2.9165, "step": 513 }, { "epoch": 0.33338738446570454, "grad_norm": 1.818017601966858, "learning_rate": 0.0001870252230652183, "loss": 2.6241, "step": 514 }, { "epoch": 0.33403599805415923, "grad_norm": 1.8099747896194458, "learning_rate": 0.0001869748338201918, "loss": 2.8133, "step": 515 }, { "epoch": 0.3346846116426139, "grad_norm": 1.8657416105270386, "learning_rate": 0.0001869243537333951, "loss": 3.2439, "step": 516 }, { "epoch": 0.33533322523106857, "grad_norm": 1.7444634437561035, "learning_rate": 0.00018687378285755254, "loss": 2.8965, "step": 517 }, { "epoch": 0.33598183881952326, "grad_norm": 1.6002898216247559, "learning_rate": 0.00018682312124548346, "loss": 2.3373, "step": 518 }, { "epoch": 0.33663045240797795, "grad_norm": 1.763842225074768, "learning_rate": 0.0001867723689501019, "loss": 2.7576, "step": 519 }, { "epoch": 0.33727906599643265, "grad_norm": 1.4681367874145508, "learning_rate": 0.00018672152602441661, "loss": 2.4674, "step": 520 }, { "epoch": 0.3379276795848873, "grad_norm": 1.7198644876480103, "learning_rate": 0.00018667059252153097, "loss": 2.9537, "step": 521 }, { "epoch": 0.338576293173342, "grad_norm": 2.017458915710449, "learning_rate": 0.0001866195684946431, "loss": 2.929, "step": 522 }, { "epoch": 0.3392249067617967, "grad_norm": 2.0059356689453125, "learning_rate": 0.00018656845399704554, "loss": 3.092, "step": 523 }, { "epoch": 0.3398735203502513, "grad_norm": 1.466575026512146, "learning_rate": 0.00018651724908212534, "loss": 2.618, "step": 524 }, { "epoch": 0.340522133938706, "grad_norm": 1.72751784324646, "learning_rate": 0.00018646595380336398, "loss": 2.7469, "step": 525 }, { "epoch": 0.3411707475271607, "grad_norm": 1.6940511465072632, "learning_rate": 0.0001864145682143374, "loss": 2.7195, "step": 526 }, { "epoch": 0.3418193611156154, "grad_norm": 1.6156834363937378, "learning_rate": 0.00018636309236871578, "loss": 2.8783, "step": 527 }, { "epoch": 0.34246797470407003, "grad_norm": 1.7460261583328247, "learning_rate": 0.00018631152632026364, "loss": 2.953, "step": 528 }, { "epoch": 0.3431165882925247, "grad_norm": 1.9199987649917603, "learning_rate": 0.00018625987012283965, "loss": 3.2277, "step": 529 }, { "epoch": 0.3437652018809794, "grad_norm": 1.7437705993652344, "learning_rate": 0.00018620812383039666, "loss": 2.9584, "step": 530 }, { "epoch": 0.34441381546943406, "grad_norm": 1.624984860420227, "learning_rate": 0.00018615628749698164, "loss": 2.5822, "step": 531 }, { "epoch": 0.34506242905788875, "grad_norm": 1.4664303064346313, "learning_rate": 0.00018610436117673555, "loss": 2.4913, "step": 532 }, { "epoch": 0.34571104264634345, "grad_norm": 1.8447574377059937, "learning_rate": 0.00018605234492389343, "loss": 2.8984, "step": 533 }, { "epoch": 0.34635965623479814, "grad_norm": 1.6688429117202759, "learning_rate": 0.00018600023879278415, "loss": 2.6631, "step": 534 }, { "epoch": 0.3470082698232528, "grad_norm": 2.222217082977295, "learning_rate": 0.00018594804283783055, "loss": 2.5823, "step": 535 }, { "epoch": 0.3476568834117075, "grad_norm": 1.899994134902954, "learning_rate": 0.0001858957571135492, "loss": 2.857, "step": 536 }, { "epoch": 0.34830549700016217, "grad_norm": 1.9467296600341797, "learning_rate": 0.00018584338167455043, "loss": 3.0544, "step": 537 }, { "epoch": 0.3489541105886168, "grad_norm": 1.872937798500061, "learning_rate": 0.00018579091657553844, "loss": 2.8376, "step": 538 }, { "epoch": 0.3496027241770715, "grad_norm": 1.4617030620574951, "learning_rate": 0.00018573836187131082, "loss": 2.5929, "step": 539 }, { "epoch": 0.3502513377655262, "grad_norm": 1.541212558746338, "learning_rate": 0.00018568571761675893, "loss": 2.7098, "step": 540 }, { "epoch": 0.3508999513539809, "grad_norm": 1.751999020576477, "learning_rate": 0.00018563298386686762, "loss": 2.9904, "step": 541 }, { "epoch": 0.3515485649424355, "grad_norm": 1.8048698902130127, "learning_rate": 0.00018558016067671517, "loss": 2.9375, "step": 542 }, { "epoch": 0.3521971785308902, "grad_norm": 1.6260299682617188, "learning_rate": 0.00018552724810147334, "loss": 2.5476, "step": 543 }, { "epoch": 0.3528457921193449, "grad_norm": 1.5425095558166504, "learning_rate": 0.00018547424619640723, "loss": 2.5694, "step": 544 }, { "epoch": 0.35349440570779955, "grad_norm": 1.9954092502593994, "learning_rate": 0.0001854211550168752, "loss": 2.7663, "step": 545 }, { "epoch": 0.35414301929625425, "grad_norm": 1.525377631187439, "learning_rate": 0.00018536797461832886, "loss": 2.5976, "step": 546 }, { "epoch": 0.35479163288470894, "grad_norm": 1.8690515756607056, "learning_rate": 0.0001853147050563131, "loss": 3.024, "step": 547 }, { "epoch": 0.35544024647316363, "grad_norm": 2.028202533721924, "learning_rate": 0.00018526134638646583, "loss": 2.7786, "step": 548 }, { "epoch": 0.3560888600616183, "grad_norm": 2.0508041381835938, "learning_rate": 0.00018520789866451808, "loss": 2.8, "step": 549 }, { "epoch": 0.35673747365007297, "grad_norm": 2.111360788345337, "learning_rate": 0.00018515436194629388, "loss": 2.8509, "step": 550 }, { "epoch": 0.35738608723852766, "grad_norm": 2.0007760524749756, "learning_rate": 0.0001851007362877102, "loss": 3.2225, "step": 551 }, { "epoch": 0.3580347008269823, "grad_norm": 1.7218207120895386, "learning_rate": 0.0001850470217447769, "loss": 2.7467, "step": 552 }, { "epoch": 0.358683314415437, "grad_norm": 1.3164633512496948, "learning_rate": 0.00018499321837359673, "loss": 2.9617, "step": 553 }, { "epoch": 0.3593319280038917, "grad_norm": 1.6897423267364502, "learning_rate": 0.0001849393262303652, "loss": 2.8252, "step": 554 }, { "epoch": 0.3599805415923464, "grad_norm": 1.7059605121612549, "learning_rate": 0.00018488534537137042, "loss": 2.6184, "step": 555 }, { "epoch": 0.360629155180801, "grad_norm": 1.5643235445022583, "learning_rate": 0.00018483127585299338, "loss": 2.5362, "step": 556 }, { "epoch": 0.3612777687692557, "grad_norm": 2.067978858947754, "learning_rate": 0.00018477711773170748, "loss": 3.1775, "step": 557 }, { "epoch": 0.3619263823577104, "grad_norm": 1.9548184871673584, "learning_rate": 0.00018472287106407876, "loss": 2.7611, "step": 558 }, { "epoch": 0.36257499594616505, "grad_norm": 1.5673408508300781, "learning_rate": 0.0001846685359067657, "loss": 2.8096, "step": 559 }, { "epoch": 0.36322360953461974, "grad_norm": 1.6013007164001465, "learning_rate": 0.00018461411231651922, "loss": 2.8868, "step": 560 }, { "epoch": 0.36387222312307443, "grad_norm": 1.5211323499679565, "learning_rate": 0.0001845596003501826, "loss": 2.4753, "step": 561 }, { "epoch": 0.36452083671152913, "grad_norm": 1.6809226274490356, "learning_rate": 0.00018450500006469148, "loss": 2.6591, "step": 562 }, { "epoch": 0.36516945029998377, "grad_norm": 1.6600697040557861, "learning_rate": 0.00018445031151707365, "loss": 2.6906, "step": 563 }, { "epoch": 0.36581806388843846, "grad_norm": 1.6027389764785767, "learning_rate": 0.00018439553476444913, "loss": 2.8732, "step": 564 }, { "epoch": 0.36646667747689315, "grad_norm": 1.4227046966552734, "learning_rate": 0.0001843406698640301, "loss": 2.392, "step": 565 }, { "epoch": 0.3671152910653478, "grad_norm": 1.5342984199523926, "learning_rate": 0.00018428571687312072, "loss": 2.5136, "step": 566 }, { "epoch": 0.3677639046538025, "grad_norm": 1.7692079544067383, "learning_rate": 0.00018423067584911727, "loss": 2.6271, "step": 567 }, { "epoch": 0.3684125182422572, "grad_norm": 1.6541763544082642, "learning_rate": 0.00018417554684950794, "loss": 2.7254, "step": 568 }, { "epoch": 0.3690611318307119, "grad_norm": 1.5763524770736694, "learning_rate": 0.00018412032993187272, "loss": 2.5029, "step": 569 }, { "epoch": 0.3697097454191665, "grad_norm": 1.6308703422546387, "learning_rate": 0.00018406502515388353, "loss": 2.9408, "step": 570 }, { "epoch": 0.3703583590076212, "grad_norm": 1.692257046699524, "learning_rate": 0.000184009632573304, "loss": 2.9273, "step": 571 }, { "epoch": 0.3710069725960759, "grad_norm": 2.2380211353302, "learning_rate": 0.0001839541522479895, "loss": 2.8422, "step": 572 }, { "epoch": 0.37165558618453054, "grad_norm": 2.141791582107544, "learning_rate": 0.00018389858423588704, "loss": 2.5891, "step": 573 }, { "epoch": 0.37230419977298523, "grad_norm": 1.792551040649414, "learning_rate": 0.0001838429285950352, "loss": 2.4832, "step": 574 }, { "epoch": 0.3729528133614399, "grad_norm": 1.6737881898880005, "learning_rate": 0.0001837871853835641, "loss": 2.7309, "step": 575 }, { "epoch": 0.3736014269498946, "grad_norm": 1.777600646018982, "learning_rate": 0.00018373135465969532, "loss": 2.7503, "step": 576 }, { "epoch": 0.37425004053834926, "grad_norm": 2.1143133640289307, "learning_rate": 0.00018367543648174184, "loss": 2.846, "step": 577 }, { "epoch": 0.37489865412680395, "grad_norm": 1.7452971935272217, "learning_rate": 0.00018361943090810796, "loss": 2.942, "step": 578 }, { "epoch": 0.37554726771525865, "grad_norm": 1.4994680881500244, "learning_rate": 0.00018356333799728931, "loss": 2.3738, "step": 579 }, { "epoch": 0.3761958813037133, "grad_norm": 1.645715594291687, "learning_rate": 0.0001835071578078727, "loss": 2.7813, "step": 580 }, { "epoch": 0.376844494892168, "grad_norm": 1.5714319944381714, "learning_rate": 0.00018345089039853614, "loss": 2.5876, "step": 581 }, { "epoch": 0.3774931084806227, "grad_norm": 1.9836387634277344, "learning_rate": 0.00018339453582804865, "loss": 2.7991, "step": 582 }, { "epoch": 0.37814172206907737, "grad_norm": 1.7780382633209229, "learning_rate": 0.00018333809415527043, "loss": 2.6186, "step": 583 }, { "epoch": 0.378790335657532, "grad_norm": 1.7474172115325928, "learning_rate": 0.0001832815654391525, "loss": 2.9986, "step": 584 }, { "epoch": 0.3794389492459867, "grad_norm": 1.7414286136627197, "learning_rate": 0.0001832249497387369, "loss": 2.4334, "step": 585 }, { "epoch": 0.3800875628344414, "grad_norm": 1.456046462059021, "learning_rate": 0.0001831682471131565, "loss": 2.7362, "step": 586 }, { "epoch": 0.38073617642289603, "grad_norm": 2.0054612159729004, "learning_rate": 0.00018311145762163494, "loss": 2.8001, "step": 587 }, { "epoch": 0.3813847900113507, "grad_norm": 1.7399264574050903, "learning_rate": 0.00018305458132348657, "loss": 2.7319, "step": 588 }, { "epoch": 0.3820334035998054, "grad_norm": 2.101745843887329, "learning_rate": 0.00018299761827811643, "loss": 3.0083, "step": 589 }, { "epoch": 0.3826820171882601, "grad_norm": 1.816279649734497, "learning_rate": 0.0001829405685450202, "loss": 2.9274, "step": 590 }, { "epoch": 0.38333063077671475, "grad_norm": 1.7195055484771729, "learning_rate": 0.00018288343218378404, "loss": 2.8652, "step": 591 }, { "epoch": 0.38397924436516945, "grad_norm": 1.7244223356246948, "learning_rate": 0.00018282620925408463, "loss": 2.9323, "step": 592 }, { "epoch": 0.38462785795362414, "grad_norm": 1.9450119733810425, "learning_rate": 0.00018276889981568906, "loss": 3.0536, "step": 593 }, { "epoch": 0.3852764715420788, "grad_norm": 1.5440584421157837, "learning_rate": 0.00018271150392845477, "loss": 2.9146, "step": 594 }, { "epoch": 0.3859250851305335, "grad_norm": 1.770662784576416, "learning_rate": 0.00018265402165232946, "loss": 2.9072, "step": 595 }, { "epoch": 0.38657369871898817, "grad_norm": 1.89310884475708, "learning_rate": 0.00018259645304735114, "loss": 3.1776, "step": 596 }, { "epoch": 0.38722231230744286, "grad_norm": 1.6811450719833374, "learning_rate": 0.00018253879817364788, "loss": 3.0617, "step": 597 }, { "epoch": 0.3878709258958975, "grad_norm": 1.6450713872909546, "learning_rate": 0.00018248105709143799, "loss": 2.5922, "step": 598 }, { "epoch": 0.3885195394843522, "grad_norm": 1.7148988246917725, "learning_rate": 0.00018242322986102968, "loss": 2.7235, "step": 599 }, { "epoch": 0.3891681530728069, "grad_norm": 2.009889841079712, "learning_rate": 0.00018236531654282123, "loss": 3.0461, "step": 600 }, { "epoch": 0.3891681530728069, "eval_loss": 2.933032989501953, "eval_runtime": 35.1356, "eval_samples_per_second": 58.516, "eval_steps_per_second": 14.629, "step": 600 }, { "epoch": 0.3898167666612615, "grad_norm": 1.6049710512161255, "learning_rate": 0.00018230731719730078, "loss": 3.0594, "step": 601 }, { "epoch": 0.3904653802497162, "grad_norm": 2.0074195861816406, "learning_rate": 0.0001822492318850464, "loss": 2.8586, "step": 602 }, { "epoch": 0.3911139938381709, "grad_norm": 1.7622570991516113, "learning_rate": 0.00018219106066672582, "loss": 2.9857, "step": 603 }, { "epoch": 0.3917626074266256, "grad_norm": 2.027636766433716, "learning_rate": 0.00018213280360309666, "loss": 3.0782, "step": 604 }, { "epoch": 0.39241122101508025, "grad_norm": 1.7770642042160034, "learning_rate": 0.0001820744607550061, "loss": 2.7027, "step": 605 }, { "epoch": 0.39305983460353494, "grad_norm": 1.9147592782974243, "learning_rate": 0.00018201603218339087, "loss": 3.0756, "step": 606 }, { "epoch": 0.39370844819198964, "grad_norm": 1.7803279161453247, "learning_rate": 0.00018195751794927736, "loss": 2.6723, "step": 607 }, { "epoch": 0.3943570617804443, "grad_norm": 1.7743611335754395, "learning_rate": 0.00018189891811378137, "loss": 2.9055, "step": 608 }, { "epoch": 0.39500567536889897, "grad_norm": 1.8400771617889404, "learning_rate": 0.0001818402327381081, "loss": 2.7791, "step": 609 }, { "epoch": 0.39565428895735366, "grad_norm": 1.8678261041641235, "learning_rate": 0.00018178146188355205, "loss": 3.0098, "step": 610 }, { "epoch": 0.39630290254580836, "grad_norm": 1.8353296518325806, "learning_rate": 0.00018172260561149711, "loss": 2.9813, "step": 611 }, { "epoch": 0.396951516134263, "grad_norm": 1.625322699546814, "learning_rate": 0.00018166366398341633, "loss": 2.6379, "step": 612 }, { "epoch": 0.3976001297227177, "grad_norm": 1.6621862649917603, "learning_rate": 0.0001816046370608719, "loss": 3.0066, "step": 613 }, { "epoch": 0.3982487433111724, "grad_norm": 1.427992820739746, "learning_rate": 0.00018154552490551507, "loss": 2.4205, "step": 614 }, { "epoch": 0.398897356899627, "grad_norm": 1.9729347229003906, "learning_rate": 0.00018148632757908616, "loss": 2.7996, "step": 615 }, { "epoch": 0.3995459704880817, "grad_norm": 1.6517467498779297, "learning_rate": 0.00018142704514341446, "loss": 2.8187, "step": 616 }, { "epoch": 0.4001945840765364, "grad_norm": 1.609317660331726, "learning_rate": 0.0001813676776604181, "loss": 2.6935, "step": 617 }, { "epoch": 0.4008431976649911, "grad_norm": 1.7710105180740356, "learning_rate": 0.0001813082251921041, "loss": 2.8423, "step": 618 }, { "epoch": 0.40149181125344574, "grad_norm": 1.6651195287704468, "learning_rate": 0.00018124868780056814, "loss": 2.7972, "step": 619 }, { "epoch": 0.40214042484190043, "grad_norm": 1.9904202222824097, "learning_rate": 0.00018118906554799474, "loss": 2.8755, "step": 620 }, { "epoch": 0.40278903843035513, "grad_norm": 1.853272557258606, "learning_rate": 0.0001811293584966569, "loss": 2.8479, "step": 621 }, { "epoch": 0.40343765201880977, "grad_norm": 1.6893529891967773, "learning_rate": 0.0001810695667089164, "loss": 2.8064, "step": 622 }, { "epoch": 0.40408626560726446, "grad_norm": 1.7378567457199097, "learning_rate": 0.00018100969024722327, "loss": 2.6438, "step": 623 }, { "epoch": 0.40473487919571915, "grad_norm": 1.5746333599090576, "learning_rate": 0.00018094972917411615, "loss": 2.3556, "step": 624 }, { "epoch": 0.40538349278417385, "grad_norm": 1.6189936399459839, "learning_rate": 0.000180889683552222, "loss": 2.7307, "step": 625 }, { "epoch": 0.4060321063726285, "grad_norm": 1.676355242729187, "learning_rate": 0.00018082955344425616, "loss": 2.6473, "step": 626 }, { "epoch": 0.4066807199610832, "grad_norm": 1.6809967756271362, "learning_rate": 0.00018076933891302205, "loss": 2.58, "step": 627 }, { "epoch": 0.4073293335495379, "grad_norm": 2.239384174346924, "learning_rate": 0.0001807090400214114, "loss": 2.9977, "step": 628 }, { "epoch": 0.4079779471379925, "grad_norm": 1.7096261978149414, "learning_rate": 0.00018064865683240403, "loss": 2.6852, "step": 629 }, { "epoch": 0.4086265607264472, "grad_norm": 1.6960233449935913, "learning_rate": 0.00018058818940906778, "loss": 2.5462, "step": 630 }, { "epoch": 0.4092751743149019, "grad_norm": 1.784436583518982, "learning_rate": 0.00018052763781455848, "loss": 2.5808, "step": 631 }, { "epoch": 0.4099237879033566, "grad_norm": 1.686411738395691, "learning_rate": 0.00018046700211211988, "loss": 2.4902, "step": 632 }, { "epoch": 0.41057240149181123, "grad_norm": 1.9172080755233765, "learning_rate": 0.00018040628236508355, "loss": 2.818, "step": 633 }, { "epoch": 0.41122101508026593, "grad_norm": 1.8704211711883545, "learning_rate": 0.00018034547863686888, "loss": 2.6007, "step": 634 }, { "epoch": 0.4118696286687206, "grad_norm": 1.9697834253311157, "learning_rate": 0.00018028459099098294, "loss": 2.6415, "step": 635 }, { "epoch": 0.41251824225717526, "grad_norm": 1.6072981357574463, "learning_rate": 0.00018022361949102047, "loss": 2.5671, "step": 636 }, { "epoch": 0.41316685584562995, "grad_norm": 2.0946097373962402, "learning_rate": 0.0001801625642006638, "loss": 2.9608, "step": 637 }, { "epoch": 0.41381546943408465, "grad_norm": 1.8869754076004028, "learning_rate": 0.00018010142518368278, "loss": 2.4696, "step": 638 }, { "epoch": 0.41446408302253934, "grad_norm": 1.6254740953445435, "learning_rate": 0.00018004020250393462, "loss": 2.5122, "step": 639 }, { "epoch": 0.415112696610994, "grad_norm": 1.7860641479492188, "learning_rate": 0.00017997889622536407, "loss": 2.7656, "step": 640 }, { "epoch": 0.4157613101994487, "grad_norm": 1.880562424659729, "learning_rate": 0.00017991750641200308, "loss": 2.6534, "step": 641 }, { "epoch": 0.41640992378790337, "grad_norm": 1.9303995370864868, "learning_rate": 0.00017985603312797087, "loss": 2.6272, "step": 642 }, { "epoch": 0.417058537376358, "grad_norm": 1.7239816188812256, "learning_rate": 0.00017979447643747386, "loss": 2.1161, "step": 643 }, { "epoch": 0.4177071509648127, "grad_norm": 2.192125082015991, "learning_rate": 0.00017973283640480554, "loss": 2.9204, "step": 644 }, { "epoch": 0.4183557645532674, "grad_norm": 2.1280171871185303, "learning_rate": 0.00017967111309434653, "loss": 2.876, "step": 645 }, { "epoch": 0.4190043781417221, "grad_norm": 1.4464986324310303, "learning_rate": 0.00017960930657056438, "loss": 2.4648, "step": 646 }, { "epoch": 0.4196529917301767, "grad_norm": 1.9137802124023438, "learning_rate": 0.00017954741689801354, "loss": 2.9846, "step": 647 }, { "epoch": 0.4203016053186314, "grad_norm": 2.0788402557373047, "learning_rate": 0.00017948544414133534, "loss": 2.8715, "step": 648 }, { "epoch": 0.4209502189070861, "grad_norm": 1.603805422782898, "learning_rate": 0.00017942338836525785, "loss": 2.6986, "step": 649 }, { "epoch": 0.42159883249554075, "grad_norm": 2.4646081924438477, "learning_rate": 0.00017936124963459585, "loss": 3.0402, "step": 650 }, { "epoch": 0.42224744608399545, "grad_norm": 2.0049445629119873, "learning_rate": 0.00017929902801425086, "loss": 2.8503, "step": 651 }, { "epoch": 0.42289605967245014, "grad_norm": 1.6906543970108032, "learning_rate": 0.00017923672356921083, "loss": 2.3713, "step": 652 }, { "epoch": 0.42354467326090484, "grad_norm": 2.0179948806762695, "learning_rate": 0.00017917433636455035, "loss": 3.0244, "step": 653 }, { "epoch": 0.4241932868493595, "grad_norm": 1.578054428100586, "learning_rate": 0.00017911186646543028, "loss": 2.4165, "step": 654 }, { "epoch": 0.42484190043781417, "grad_norm": 1.7836250066757202, "learning_rate": 0.00017904931393709805, "loss": 2.4845, "step": 655 }, { "epoch": 0.42549051402626886, "grad_norm": 2.1490225791931152, "learning_rate": 0.00017898667884488727, "loss": 2.7724, "step": 656 }, { "epoch": 0.4261391276147235, "grad_norm": 1.8153955936431885, "learning_rate": 0.00017892396125421782, "loss": 2.5802, "step": 657 }, { "epoch": 0.4267877412031782, "grad_norm": 1.8560956716537476, "learning_rate": 0.00017886116123059574, "loss": 2.7596, "step": 658 }, { "epoch": 0.4274363547916329, "grad_norm": 2.0156941413879395, "learning_rate": 0.0001787982788396131, "loss": 2.6246, "step": 659 }, { "epoch": 0.4280849683800876, "grad_norm": 1.5855635404586792, "learning_rate": 0.0001787353141469482, "loss": 2.6171, "step": 660 }, { "epoch": 0.4287335819685422, "grad_norm": 1.9964396953582764, "learning_rate": 0.00017867226721836508, "loss": 2.6852, "step": 661 }, { "epoch": 0.4293821955569969, "grad_norm": 1.763782024383545, "learning_rate": 0.00017860913811971378, "loss": 2.7468, "step": 662 }, { "epoch": 0.4300308091454516, "grad_norm": 1.9695777893066406, "learning_rate": 0.00017854592691693013, "loss": 2.9564, "step": 663 }, { "epoch": 0.4306794227339063, "grad_norm": 1.990429162979126, "learning_rate": 0.00017848263367603576, "loss": 2.7851, "step": 664 }, { "epoch": 0.43132803632236094, "grad_norm": 1.7005643844604492, "learning_rate": 0.00017841925846313788, "loss": 2.6023, "step": 665 }, { "epoch": 0.43197664991081564, "grad_norm": 1.8780453205108643, "learning_rate": 0.00017835580134442953, "loss": 2.6858, "step": 666 }, { "epoch": 0.43262526349927033, "grad_norm": 1.9776185750961304, "learning_rate": 0.000178292262386189, "loss": 2.8049, "step": 667 }, { "epoch": 0.43327387708772497, "grad_norm": 2.2262673377990723, "learning_rate": 0.00017822864165478034, "loss": 3.0098, "step": 668 }, { "epoch": 0.43392249067617966, "grad_norm": 2.442150115966797, "learning_rate": 0.0001781649392166528, "loss": 3.1744, "step": 669 }, { "epoch": 0.43457110426463436, "grad_norm": 1.940292477607727, "learning_rate": 0.00017810115513834112, "loss": 2.6796, "step": 670 }, { "epoch": 0.43521971785308905, "grad_norm": 1.8976854085922241, "learning_rate": 0.00017803728948646518, "loss": 2.9376, "step": 671 }, { "epoch": 0.4358683314415437, "grad_norm": 1.970984935760498, "learning_rate": 0.0001779733423277302, "loss": 2.6614, "step": 672 }, { "epoch": 0.4365169450299984, "grad_norm": 2.067122459411621, "learning_rate": 0.00017790931372892638, "loss": 2.8135, "step": 673 }, { "epoch": 0.4371655586184531, "grad_norm": 1.887774109840393, "learning_rate": 0.0001778452037569291, "loss": 2.663, "step": 674 }, { "epoch": 0.4378141722069077, "grad_norm": 1.676389455795288, "learning_rate": 0.00017778101247869867, "loss": 2.762, "step": 675 }, { "epoch": 0.4384627857953624, "grad_norm": 1.9964745044708252, "learning_rate": 0.00017771673996128036, "loss": 3.2542, "step": 676 }, { "epoch": 0.4391113993838171, "grad_norm": 2.2475292682647705, "learning_rate": 0.00017765238627180424, "loss": 2.7994, "step": 677 }, { "epoch": 0.4397600129722718, "grad_norm": 1.7947001457214355, "learning_rate": 0.00017758795147748523, "loss": 2.8031, "step": 678 }, { "epoch": 0.44040862656072643, "grad_norm": 2.071796178817749, "learning_rate": 0.0001775234356456229, "loss": 2.8581, "step": 679 }, { "epoch": 0.44105724014918113, "grad_norm": 2.3688528537750244, "learning_rate": 0.00017745883884360143, "loss": 3.0703, "step": 680 }, { "epoch": 0.4417058537376358, "grad_norm": 1.7595100402832031, "learning_rate": 0.0001773941611388897, "loss": 2.7228, "step": 681 }, { "epoch": 0.44235446732609046, "grad_norm": 1.669471263885498, "learning_rate": 0.00017732940259904101, "loss": 2.6771, "step": 682 }, { "epoch": 0.44300308091454516, "grad_norm": 1.671545386314392, "learning_rate": 0.00017726456329169302, "loss": 2.8939, "step": 683 }, { "epoch": 0.44365169450299985, "grad_norm": 2.1762986183166504, "learning_rate": 0.0001771996432845679, "loss": 2.9404, "step": 684 }, { "epoch": 0.44430030809145454, "grad_norm": 1.9645555019378662, "learning_rate": 0.00017713464264547198, "loss": 2.8707, "step": 685 }, { "epoch": 0.4449489216799092, "grad_norm": 1.8939317464828491, "learning_rate": 0.00017706956144229586, "loss": 2.811, "step": 686 }, { "epoch": 0.4455975352683639, "grad_norm": 2.0793728828430176, "learning_rate": 0.00017700439974301427, "loss": 2.5266, "step": 687 }, { "epoch": 0.44624614885681857, "grad_norm": 1.9912395477294922, "learning_rate": 0.00017693915761568608, "loss": 2.8944, "step": 688 }, { "epoch": 0.4468947624452732, "grad_norm": 1.769579291343689, "learning_rate": 0.00017687383512845405, "loss": 2.6281, "step": 689 }, { "epoch": 0.4475433760337279, "grad_norm": 1.7188599109649658, "learning_rate": 0.00017680843234954495, "loss": 2.709, "step": 690 }, { "epoch": 0.4481919896221826, "grad_norm": 1.7006970643997192, "learning_rate": 0.00017674294934726939, "loss": 2.6594, "step": 691 }, { "epoch": 0.4488406032106373, "grad_norm": 1.7506557703018188, "learning_rate": 0.00017667738619002175, "loss": 2.4676, "step": 692 }, { "epoch": 0.44948921679909193, "grad_norm": 1.9103237390518188, "learning_rate": 0.0001766117429462802, "loss": 2.6517, "step": 693 }, { "epoch": 0.4501378303875466, "grad_norm": 1.9856759309768677, "learning_rate": 0.00017654601968460648, "loss": 2.733, "step": 694 }, { "epoch": 0.4507864439760013, "grad_norm": 2.2881221771240234, "learning_rate": 0.00017648021647364597, "loss": 3.0138, "step": 695 }, { "epoch": 0.45143505756445595, "grad_norm": 1.7466799020767212, "learning_rate": 0.00017641433338212744, "loss": 2.6415, "step": 696 }, { "epoch": 0.45208367115291065, "grad_norm": 1.826073169708252, "learning_rate": 0.00017634837047886324, "loss": 2.4217, "step": 697 }, { "epoch": 0.45273228474136534, "grad_norm": 1.7518538236618042, "learning_rate": 0.000176282327832749, "loss": 2.6648, "step": 698 }, { "epoch": 0.45338089832982004, "grad_norm": 1.8038562536239624, "learning_rate": 0.00017621620551276366, "loss": 2.5656, "step": 699 }, { "epoch": 0.4540295119182747, "grad_norm": 1.6857868432998657, "learning_rate": 0.0001761500035879693, "loss": 2.2323, "step": 700 }, { "epoch": 0.4540295119182747, "eval_loss": 2.860482931137085, "eval_runtime": 35.1828, "eval_samples_per_second": 58.438, "eval_steps_per_second": 14.609, "step": 700 }, { "epoch": 0.45467812550672937, "grad_norm": 2.8103723526000977, "learning_rate": 0.0001760837221275113, "loss": 3.3218, "step": 701 }, { "epoch": 0.45532673909518406, "grad_norm": 1.8184293508529663, "learning_rate": 0.00017601736120061796, "loss": 2.6172, "step": 702 }, { "epoch": 0.4559753526836387, "grad_norm": 1.6888784170150757, "learning_rate": 0.00017595092087660064, "loss": 2.5546, "step": 703 }, { "epoch": 0.4566239662720934, "grad_norm": 1.5799273252487183, "learning_rate": 0.00017588440122485365, "loss": 2.6794, "step": 704 }, { "epoch": 0.4572725798605481, "grad_norm": 2.066277027130127, "learning_rate": 0.00017581780231485413, "loss": 3.0023, "step": 705 }, { "epoch": 0.4579211934490028, "grad_norm": 1.6376198530197144, "learning_rate": 0.00017575112421616202, "loss": 2.5459, "step": 706 }, { "epoch": 0.4585698070374574, "grad_norm": 2.1497507095336914, "learning_rate": 0.00017568436699841992, "loss": 2.8139, "step": 707 }, { "epoch": 0.4592184206259121, "grad_norm": 2.116893768310547, "learning_rate": 0.0001756175307313531, "loss": 2.7252, "step": 708 }, { "epoch": 0.4598670342143668, "grad_norm": 2.1601617336273193, "learning_rate": 0.00017555061548476944, "loss": 2.7944, "step": 709 }, { "epoch": 0.46051564780282145, "grad_norm": 1.6564985513687134, "learning_rate": 0.00017548362132855923, "loss": 2.5264, "step": 710 }, { "epoch": 0.46116426139127614, "grad_norm": 1.8431380987167358, "learning_rate": 0.0001754165483326952, "loss": 2.6722, "step": 711 }, { "epoch": 0.46181287497973084, "grad_norm": 1.7818156480789185, "learning_rate": 0.00017534939656723247, "loss": 2.5168, "step": 712 }, { "epoch": 0.46246148856818553, "grad_norm": 1.7254563570022583, "learning_rate": 0.0001752821661023084, "loss": 2.3754, "step": 713 }, { "epoch": 0.46311010215664017, "grad_norm": 2.039100170135498, "learning_rate": 0.00017521485700814252, "loss": 3.0401, "step": 714 }, { "epoch": 0.46375871574509486, "grad_norm": 1.957342505455017, "learning_rate": 0.00017514746935503653, "loss": 2.6507, "step": 715 }, { "epoch": 0.46440732933354956, "grad_norm": 1.6347419023513794, "learning_rate": 0.00017508000321337418, "loss": 2.3161, "step": 716 }, { "epoch": 0.4650559429220042, "grad_norm": 1.7550560235977173, "learning_rate": 0.00017501245865362116, "loss": 2.3331, "step": 717 }, { "epoch": 0.4657045565104589, "grad_norm": 1.5809965133666992, "learning_rate": 0.00017494483574632513, "loss": 2.3326, "step": 718 }, { "epoch": 0.4663531700989136, "grad_norm": 1.7528107166290283, "learning_rate": 0.00017487713456211545, "loss": 2.3505, "step": 719 }, { "epoch": 0.4670017836873683, "grad_norm": 2.278684139251709, "learning_rate": 0.00017480935517170344, "loss": 2.8061, "step": 720 }, { "epoch": 0.4676503972758229, "grad_norm": 2.1842806339263916, "learning_rate": 0.0001747414976458819, "loss": 2.7589, "step": 721 }, { "epoch": 0.4682990108642776, "grad_norm": 2.1465959548950195, "learning_rate": 0.00017467356205552536, "loss": 2.8726, "step": 722 }, { "epoch": 0.4689476244527323, "grad_norm": 1.7664055824279785, "learning_rate": 0.00017460554847158988, "loss": 2.7915, "step": 723 }, { "epoch": 0.46959623804118694, "grad_norm": 1.7545201778411865, "learning_rate": 0.0001745374569651129, "loss": 2.4965, "step": 724 }, { "epoch": 0.47024485162964164, "grad_norm": 2.0252699851989746, "learning_rate": 0.0001744692876072134, "loss": 2.6414, "step": 725 }, { "epoch": 0.47089346521809633, "grad_norm": 2.818993091583252, "learning_rate": 0.00017440104046909146, "loss": 3.2413, "step": 726 }, { "epoch": 0.471542078806551, "grad_norm": 2.1532135009765625, "learning_rate": 0.00017433271562202857, "loss": 2.882, "step": 727 }, { "epoch": 0.47219069239500566, "grad_norm": 2.0322587490081787, "learning_rate": 0.00017426431313738734, "loss": 2.8182, "step": 728 }, { "epoch": 0.47283930598346036, "grad_norm": 2.150658369064331, "learning_rate": 0.0001741958330866114, "loss": 2.805, "step": 729 }, { "epoch": 0.47348791957191505, "grad_norm": 1.9341611862182617, "learning_rate": 0.00017412727554122555, "loss": 2.6659, "step": 730 }, { "epoch": 0.4741365331603697, "grad_norm": 2.9606316089630127, "learning_rate": 0.00017405864057283533, "loss": 2.8061, "step": 731 }, { "epoch": 0.4747851467488244, "grad_norm": 1.7531644105911255, "learning_rate": 0.0001739899282531273, "loss": 2.3735, "step": 732 }, { "epoch": 0.4754337603372791, "grad_norm": 1.7819886207580566, "learning_rate": 0.00017392113865386875, "loss": 2.5763, "step": 733 }, { "epoch": 0.47608237392573377, "grad_norm": 1.7732908725738525, "learning_rate": 0.00017385227184690767, "loss": 2.5113, "step": 734 }, { "epoch": 0.4767309875141884, "grad_norm": 2.105950117111206, "learning_rate": 0.00017378332790417273, "loss": 3.0511, "step": 735 }, { "epoch": 0.4773796011026431, "grad_norm": 1.6725560426712036, "learning_rate": 0.00017371430689767315, "loss": 2.4889, "step": 736 }, { "epoch": 0.4780282146910978, "grad_norm": 1.7272483110427856, "learning_rate": 0.0001736452088994987, "loss": 2.4335, "step": 737 }, { "epoch": 0.47867682827955244, "grad_norm": 1.90679132938385, "learning_rate": 0.00017357603398181936, "loss": 2.6486, "step": 738 }, { "epoch": 0.47932544186800713, "grad_norm": 1.7886470556259155, "learning_rate": 0.00017350678221688575, "loss": 2.8445, "step": 739 }, { "epoch": 0.4799740554564618, "grad_norm": 1.8590137958526611, "learning_rate": 0.00017343745367702845, "loss": 2.6151, "step": 740 }, { "epoch": 0.4806226690449165, "grad_norm": 1.7846184968948364, "learning_rate": 0.0001733680484346585, "loss": 2.4468, "step": 741 }, { "epoch": 0.48127128263337116, "grad_norm": 2.1245336532592773, "learning_rate": 0.00017329856656226683, "loss": 3.0435, "step": 742 }, { "epoch": 0.48191989622182585, "grad_norm": 1.5738229751586914, "learning_rate": 0.0001732290081324246, "loss": 2.3764, "step": 743 }, { "epoch": 0.48256850981028054, "grad_norm": 2.1271040439605713, "learning_rate": 0.00017315937321778276, "loss": 2.832, "step": 744 }, { "epoch": 0.4832171233987352, "grad_norm": 1.8022990226745605, "learning_rate": 0.00017308966189107222, "loss": 2.4129, "step": 745 }, { "epoch": 0.4838657369871899, "grad_norm": 1.615537166595459, "learning_rate": 0.00017301987422510377, "loss": 2.3864, "step": 746 }, { "epoch": 0.48451435057564457, "grad_norm": 1.9763058423995972, "learning_rate": 0.0001729500102927678, "loss": 2.4206, "step": 747 }, { "epoch": 0.48516296416409926, "grad_norm": 1.7219407558441162, "learning_rate": 0.00017288007016703444, "loss": 2.5351, "step": 748 }, { "epoch": 0.4858115777525539, "grad_norm": 1.8550753593444824, "learning_rate": 0.0001728100539209534, "loss": 2.7959, "step": 749 }, { "epoch": 0.4864601913410086, "grad_norm": 1.9887588024139404, "learning_rate": 0.00017273996162765385, "loss": 2.817, "step": 750 }, { "epoch": 0.4871088049294633, "grad_norm": 2.0434343814849854, "learning_rate": 0.0001726697933603444, "loss": 2.6956, "step": 751 }, { "epoch": 0.48775741851791793, "grad_norm": 1.9442192316055298, "learning_rate": 0.0001725995491923131, "loss": 2.7617, "step": 752 }, { "epoch": 0.4884060321063726, "grad_norm": 1.7185665369033813, "learning_rate": 0.0001725292291969271, "loss": 2.6301, "step": 753 }, { "epoch": 0.4890546456948273, "grad_norm": 2.0023398399353027, "learning_rate": 0.00017245883344763297, "loss": 3.0376, "step": 754 }, { "epoch": 0.489703259283282, "grad_norm": 1.9929579496383667, "learning_rate": 0.00017238836201795618, "loss": 2.409, "step": 755 }, { "epoch": 0.49035187287173665, "grad_norm": 2.058685779571533, "learning_rate": 0.0001723178149815014, "loss": 2.5951, "step": 756 }, { "epoch": 0.49100048646019134, "grad_norm": 2.1027729511260986, "learning_rate": 0.0001722471924119522, "loss": 2.8047, "step": 757 }, { "epoch": 0.49164910004864604, "grad_norm": 1.7806205749511719, "learning_rate": 0.00017217649438307106, "loss": 2.2973, "step": 758 }, { "epoch": 0.4922977136371007, "grad_norm": 1.5680922269821167, "learning_rate": 0.00017210572096869927, "loss": 2.3274, "step": 759 }, { "epoch": 0.49294632722555537, "grad_norm": 2.411433458328247, "learning_rate": 0.00017203487224275686, "loss": 2.6766, "step": 760 }, { "epoch": 0.49359494081401006, "grad_norm": 1.8723822832107544, "learning_rate": 0.0001719639482792425, "loss": 2.4925, "step": 761 }, { "epoch": 0.49424355440246476, "grad_norm": 2.7825822830200195, "learning_rate": 0.00017189294915223355, "loss": 2.9721, "step": 762 }, { "epoch": 0.4948921679909194, "grad_norm": 1.9583919048309326, "learning_rate": 0.00017182187493588566, "loss": 2.3496, "step": 763 }, { "epoch": 0.4955407815793741, "grad_norm": 2.0904908180236816, "learning_rate": 0.00017175072570443312, "loss": 2.9076, "step": 764 }, { "epoch": 0.4961893951678288, "grad_norm": 1.7156928777694702, "learning_rate": 0.00017167950153218847, "loss": 2.744, "step": 765 }, { "epoch": 0.4968380087562834, "grad_norm": 2.0886523723602295, "learning_rate": 0.0001716082024935425, "loss": 2.8517, "step": 766 }, { "epoch": 0.4974866223447381, "grad_norm": 1.905518651008606, "learning_rate": 0.00017153682866296428, "loss": 2.6008, "step": 767 }, { "epoch": 0.4981352359331928, "grad_norm": 1.7719477415084839, "learning_rate": 0.00017146538011500093, "loss": 2.4845, "step": 768 }, { "epoch": 0.4987838495216475, "grad_norm": 2.297144889831543, "learning_rate": 0.0001713938569242776, "loss": 2.7606, "step": 769 }, { "epoch": 0.49943246311010214, "grad_norm": 2.0028140544891357, "learning_rate": 0.00017132225916549748, "loss": 2.5758, "step": 770 }, { "epoch": 0.5000810766985568, "grad_norm": 2.407991647720337, "learning_rate": 0.0001712505869134416, "loss": 2.9426, "step": 771 }, { "epoch": 0.5007296902870115, "grad_norm": 1.942800760269165, "learning_rate": 0.00017117884024296876, "loss": 2.7087, "step": 772 }, { "epoch": 0.5013783038754662, "grad_norm": 2.014552593231201, "learning_rate": 0.00017110701922901547, "loss": 2.402, "step": 773 }, { "epoch": 0.5020269174639209, "grad_norm": 1.5824172496795654, "learning_rate": 0.00017103512394659603, "loss": 2.3163, "step": 774 }, { "epoch": 0.5026755310523755, "grad_norm": 1.9979703426361084, "learning_rate": 0.00017096315447080217, "loss": 2.8207, "step": 775 }, { "epoch": 0.5033241446408302, "grad_norm": 2.047609806060791, "learning_rate": 0.00017089111087680318, "loss": 2.7548, "step": 776 }, { "epoch": 0.5039727582292849, "grad_norm": 1.373998761177063, "learning_rate": 0.0001708189932398457, "loss": 2.4864, "step": 777 }, { "epoch": 0.5046213718177396, "grad_norm": 2.039827346801758, "learning_rate": 0.00017074680163525375, "loss": 2.574, "step": 778 }, { "epoch": 0.5052699854061943, "grad_norm": 2.025698184967041, "learning_rate": 0.00017067453613842866, "loss": 3.0488, "step": 779 }, { "epoch": 0.505918598994649, "grad_norm": 1.8299282789230347, "learning_rate": 0.00017060219682484885, "loss": 2.5391, "step": 780 }, { "epoch": 0.5065672125831037, "grad_norm": 1.9974464178085327, "learning_rate": 0.0001705297837700698, "loss": 2.6035, "step": 781 }, { "epoch": 0.5072158261715582, "grad_norm": 1.8642154932022095, "learning_rate": 0.00017045729704972417, "loss": 2.5576, "step": 782 }, { "epoch": 0.5078644397600129, "grad_norm": 2.192844867706299, "learning_rate": 0.00017038473673952145, "loss": 2.6334, "step": 783 }, { "epoch": 0.5085130533484676, "grad_norm": 1.6969176530838013, "learning_rate": 0.00017031210291524798, "loss": 2.2498, "step": 784 }, { "epoch": 0.5091616669369223, "grad_norm": 1.8505398035049438, "learning_rate": 0.00017023939565276693, "loss": 2.409, "step": 785 }, { "epoch": 0.509810280525377, "grad_norm": 2.1385858058929443, "learning_rate": 0.00017016661502801818, "loss": 2.6331, "step": 786 }, { "epoch": 0.5104588941138317, "grad_norm": 2.121183156967163, "learning_rate": 0.0001700937611170182, "loss": 2.9105, "step": 787 }, { "epoch": 0.5111075077022864, "grad_norm": 2.307201862335205, "learning_rate": 0.00017002083399586, "loss": 2.8832, "step": 788 }, { "epoch": 0.511756121290741, "grad_norm": 1.9347745180130005, "learning_rate": 0.00016994783374071304, "loss": 2.3185, "step": 789 }, { "epoch": 0.5124047348791957, "grad_norm": 2.135420083999634, "learning_rate": 0.00016987476042782326, "loss": 2.4629, "step": 790 }, { "epoch": 0.5130533484676504, "grad_norm": 2.292529821395874, "learning_rate": 0.0001698016141335128, "loss": 2.8452, "step": 791 }, { "epoch": 0.5137019620561051, "grad_norm": 2.1734936237335205, "learning_rate": 0.00016972839493418012, "loss": 2.5095, "step": 792 }, { "epoch": 0.5143505756445598, "grad_norm": 1.981299877166748, "learning_rate": 0.00016965510290629972, "loss": 2.7166, "step": 793 }, { "epoch": 0.5149991892330145, "grad_norm": 1.7044910192489624, "learning_rate": 0.00016958173812642224, "loss": 2.3918, "step": 794 }, { "epoch": 0.5156478028214692, "grad_norm": 2.169282913208008, "learning_rate": 0.00016950830067117428, "loss": 2.6025, "step": 795 }, { "epoch": 0.5162964164099237, "grad_norm": 1.78818678855896, "learning_rate": 0.00016943479061725837, "loss": 2.438, "step": 796 }, { "epoch": 0.5169450299983784, "grad_norm": 1.7287423610687256, "learning_rate": 0.00016936120804145283, "loss": 2.5884, "step": 797 }, { "epoch": 0.5175936435868331, "grad_norm": 1.9279563426971436, "learning_rate": 0.00016928755302061173, "loss": 2.6079, "step": 798 }, { "epoch": 0.5182422571752878, "grad_norm": 1.795711874961853, "learning_rate": 0.00016921382563166493, "loss": 2.6937, "step": 799 }, { "epoch": 0.5188908707637425, "grad_norm": 1.8513976335525513, "learning_rate": 0.00016914002595161764, "loss": 2.7525, "step": 800 }, { "epoch": 0.5188908707637425, "eval_loss": 2.7908430099487305, "eval_runtime": 35.1125, "eval_samples_per_second": 58.555, "eval_steps_per_second": 14.639, "step": 800 }, { "epoch": 0.5195394843521972, "grad_norm": 1.7002214193344116, "learning_rate": 0.00016906615405755076, "loss": 2.4483, "step": 801 }, { "epoch": 0.5201880979406519, "grad_norm": 1.766008734703064, "learning_rate": 0.00016899221002662055, "loss": 2.5278, "step": 802 }, { "epoch": 0.5208367115291065, "grad_norm": 2.3250269889831543, "learning_rate": 0.00016891819393605864, "loss": 2.8772, "step": 803 }, { "epoch": 0.5214853251175612, "grad_norm": 1.6832720041275024, "learning_rate": 0.00016884410586317188, "loss": 2.5574, "step": 804 }, { "epoch": 0.5221339387060159, "grad_norm": 1.8285645246505737, "learning_rate": 0.00016876994588534234, "loss": 2.9363, "step": 805 }, { "epoch": 0.5227825522944706, "grad_norm": 1.9659390449523926, "learning_rate": 0.0001686957140800272, "loss": 2.6583, "step": 806 }, { "epoch": 0.5234311658829253, "grad_norm": 1.683066487312317, "learning_rate": 0.0001686214105247586, "loss": 2.6224, "step": 807 }, { "epoch": 0.52407977947138, "grad_norm": 2.1758179664611816, "learning_rate": 0.0001685470352971437, "loss": 2.7589, "step": 808 }, { "epoch": 0.5247283930598347, "grad_norm": 2.082282066345215, "learning_rate": 0.00016847258847486446, "loss": 2.698, "step": 809 }, { "epoch": 0.5253770066482892, "grad_norm": 1.8239572048187256, "learning_rate": 0.00016839807013567764, "loss": 2.6484, "step": 810 }, { "epoch": 0.5260256202367439, "grad_norm": 1.9416550397872925, "learning_rate": 0.00016832348035741466, "loss": 2.5934, "step": 811 }, { "epoch": 0.5266742338251986, "grad_norm": 2.1960136890411377, "learning_rate": 0.0001682488192179817, "loss": 2.8175, "step": 812 }, { "epoch": 0.5273228474136533, "grad_norm": 1.9475816488265991, "learning_rate": 0.00016817408679535922, "loss": 2.4741, "step": 813 }, { "epoch": 0.527971461002108, "grad_norm": 1.8553614616394043, "learning_rate": 0.0001680992831676024, "loss": 2.1402, "step": 814 }, { "epoch": 0.5286200745905627, "grad_norm": 2.0549750328063965, "learning_rate": 0.00016802440841284062, "loss": 2.8977, "step": 815 }, { "epoch": 0.5292686881790174, "grad_norm": 2.0518736839294434, "learning_rate": 0.00016794946260927755, "loss": 2.4652, "step": 816 }, { "epoch": 0.529917301767472, "grad_norm": 2.099036931991577, "learning_rate": 0.00016787444583519118, "loss": 3.2066, "step": 817 }, { "epoch": 0.5305659153559267, "grad_norm": 1.6188719272613525, "learning_rate": 0.00016779935816893353, "loss": 2.4488, "step": 818 }, { "epoch": 0.5312145289443814, "grad_norm": 1.6918838024139404, "learning_rate": 0.0001677241996889307, "loss": 2.4458, "step": 819 }, { "epoch": 0.5318631425328361, "grad_norm": 1.904426097869873, "learning_rate": 0.00016764897047368278, "loss": 2.5795, "step": 820 }, { "epoch": 0.5325117561212908, "grad_norm": 1.885549783706665, "learning_rate": 0.00016757367060176364, "loss": 2.7087, "step": 821 }, { "epoch": 0.5331603697097455, "grad_norm": 1.8176720142364502, "learning_rate": 0.00016749830015182107, "loss": 2.2846, "step": 822 }, { "epoch": 0.5338089832982001, "grad_norm": 1.8654227256774902, "learning_rate": 0.0001674228592025765, "loss": 2.6544, "step": 823 }, { "epoch": 0.5344575968866547, "grad_norm": 2.117871046066284, "learning_rate": 0.000167347347832825, "loss": 2.487, "step": 824 }, { "epoch": 0.5351062104751094, "grad_norm": 2.4831976890563965, "learning_rate": 0.00016727176612143523, "loss": 2.6658, "step": 825 }, { "epoch": 0.5357548240635641, "grad_norm": 2.1280226707458496, "learning_rate": 0.00016719611414734925, "loss": 2.3715, "step": 826 }, { "epoch": 0.5364034376520188, "grad_norm": 2.020052671432495, "learning_rate": 0.00016712039198958265, "loss": 2.6436, "step": 827 }, { "epoch": 0.5370520512404735, "grad_norm": 2.2182655334472656, "learning_rate": 0.00016704459972722414, "loss": 2.7887, "step": 828 }, { "epoch": 0.5377006648289282, "grad_norm": 2.0764174461364746, "learning_rate": 0.00016696873743943575, "loss": 2.7486, "step": 829 }, { "epoch": 0.5383492784173829, "grad_norm": 1.9241282939910889, "learning_rate": 0.00016689280520545263, "loss": 2.4796, "step": 830 }, { "epoch": 0.5389978920058375, "grad_norm": 2.1257684230804443, "learning_rate": 0.00016681680310458305, "loss": 2.5978, "step": 831 }, { "epoch": 0.5396465055942922, "grad_norm": 1.6599854230880737, "learning_rate": 0.00016674073121620815, "loss": 2.1928, "step": 832 }, { "epoch": 0.5402951191827469, "grad_norm": 1.7232425212860107, "learning_rate": 0.00016666458961978194, "loss": 2.6277, "step": 833 }, { "epoch": 0.5409437327712016, "grad_norm": 1.8201569318771362, "learning_rate": 0.00016658837839483143, "loss": 2.2378, "step": 834 }, { "epoch": 0.5415923463596563, "grad_norm": 1.8493746519088745, "learning_rate": 0.00016651209762095617, "loss": 2.5452, "step": 835 }, { "epoch": 0.542240959948111, "grad_norm": 2.146618366241455, "learning_rate": 0.00016643574737782842, "loss": 2.534, "step": 836 }, { "epoch": 0.5428895735365656, "grad_norm": 2.040282964706421, "learning_rate": 0.00016635932774519295, "loss": 2.6128, "step": 837 }, { "epoch": 0.5435381871250202, "grad_norm": 2.0830955505371094, "learning_rate": 0.00016628283880286703, "loss": 2.7162, "step": 838 }, { "epoch": 0.5441868007134749, "grad_norm": 2.4236154556274414, "learning_rate": 0.00016620628063074042, "loss": 2.4665, "step": 839 }, { "epoch": 0.5448354143019296, "grad_norm": 2.307872772216797, "learning_rate": 0.00016612965330877502, "loss": 2.8138, "step": 840 }, { "epoch": 0.5454840278903843, "grad_norm": 2.213623523712158, "learning_rate": 0.00016605295691700506, "loss": 2.7249, "step": 841 }, { "epoch": 0.546132641478839, "grad_norm": 2.143096923828125, "learning_rate": 0.00016597619153553689, "loss": 2.6388, "step": 842 }, { "epoch": 0.5467812550672937, "grad_norm": 1.8381470441818237, "learning_rate": 0.0001658993572445489, "loss": 2.5095, "step": 843 }, { "epoch": 0.5474298686557484, "grad_norm": 2.278886079788208, "learning_rate": 0.00016582245412429146, "loss": 2.8616, "step": 844 }, { "epoch": 0.548078482244203, "grad_norm": 1.8392088413238525, "learning_rate": 0.00016574548225508685, "loss": 2.3217, "step": 845 }, { "epoch": 0.5487270958326577, "grad_norm": 1.6857413053512573, "learning_rate": 0.00016566844171732914, "loss": 2.4047, "step": 846 }, { "epoch": 0.5493757094211124, "grad_norm": 1.9785003662109375, "learning_rate": 0.0001655913325914841, "loss": 2.544, "step": 847 }, { "epoch": 0.550024323009567, "grad_norm": 1.9097630977630615, "learning_rate": 0.00016551415495808915, "loss": 2.5956, "step": 848 }, { "epoch": 0.5506729365980217, "grad_norm": 1.9379462003707886, "learning_rate": 0.00016543690889775334, "loss": 2.5121, "step": 849 }, { "epoch": 0.5513215501864764, "grad_norm": 1.8445897102355957, "learning_rate": 0.00016535959449115705, "loss": 2.2496, "step": 850 }, { "epoch": 0.5519701637749311, "grad_norm": 2.416783332824707, "learning_rate": 0.00016528221181905217, "loss": 3.0578, "step": 851 }, { "epoch": 0.5526187773633857, "grad_norm": 2.3340001106262207, "learning_rate": 0.0001652047609622618, "loss": 2.7235, "step": 852 }, { "epoch": 0.5532673909518404, "grad_norm": 1.8974119424819946, "learning_rate": 0.00016512724200168033, "loss": 2.8078, "step": 853 }, { "epoch": 0.5539160045402951, "grad_norm": 1.7799787521362305, "learning_rate": 0.00016504965501827317, "loss": 2.3376, "step": 854 }, { "epoch": 0.5545646181287498, "grad_norm": 2.031590223312378, "learning_rate": 0.0001649720000930769, "loss": 2.8774, "step": 855 }, { "epoch": 0.5552132317172045, "grad_norm": 1.985558032989502, "learning_rate": 0.00016489427730719905, "loss": 2.7342, "step": 856 }, { "epoch": 0.5558618453056592, "grad_norm": 2.4429149627685547, "learning_rate": 0.00016481648674181794, "loss": 2.9052, "step": 857 }, { "epoch": 0.5565104588941139, "grad_norm": 1.832931637763977, "learning_rate": 0.00016473862847818277, "loss": 2.6417, "step": 858 }, { "epoch": 0.5571590724825685, "grad_norm": 2.2543516159057617, "learning_rate": 0.0001646607025976134, "loss": 2.9164, "step": 859 }, { "epoch": 0.5578076860710232, "grad_norm": 1.9577391147613525, "learning_rate": 0.0001645827091815003, "loss": 2.3698, "step": 860 }, { "epoch": 0.5584562996594779, "grad_norm": 1.6750187873840332, "learning_rate": 0.00016450464831130457, "loss": 2.2941, "step": 861 }, { "epoch": 0.5591049132479325, "grad_norm": 2.3008785247802734, "learning_rate": 0.00016442652006855762, "loss": 2.6143, "step": 862 }, { "epoch": 0.5597535268363872, "grad_norm": 2.8179540634155273, "learning_rate": 0.00016434832453486134, "loss": 2.7987, "step": 863 }, { "epoch": 0.5604021404248419, "grad_norm": 1.9579061269760132, "learning_rate": 0.00016427006179188787, "loss": 2.6678, "step": 864 }, { "epoch": 0.5610507540132966, "grad_norm": 1.7792688608169556, "learning_rate": 0.00016419173192137962, "loss": 2.583, "step": 865 }, { "epoch": 0.5616993676017512, "grad_norm": 1.8606637716293335, "learning_rate": 0.00016411333500514888, "loss": 2.4707, "step": 866 }, { "epoch": 0.5623479811902059, "grad_norm": 1.9141956567764282, "learning_rate": 0.00016403487112507826, "loss": 2.338, "step": 867 }, { "epoch": 0.5629965947786606, "grad_norm": 2.4477109909057617, "learning_rate": 0.00016395634036312013, "loss": 2.8729, "step": 868 }, { "epoch": 0.5636452083671153, "grad_norm": 1.9421724081039429, "learning_rate": 0.00016387774280129675, "loss": 2.3365, "step": 869 }, { "epoch": 0.56429382195557, "grad_norm": 1.895973563194275, "learning_rate": 0.00016379907852170016, "loss": 2.3755, "step": 870 }, { "epoch": 0.5649424355440247, "grad_norm": 1.9399441480636597, "learning_rate": 0.0001637203476064921, "loss": 2.5109, "step": 871 }, { "epoch": 0.5655910491324794, "grad_norm": 1.9453812837600708, "learning_rate": 0.00016364155013790391, "loss": 2.2947, "step": 872 }, { "epoch": 0.566239662720934, "grad_norm": 1.9297842979431152, "learning_rate": 0.0001635626861982364, "loss": 2.407, "step": 873 }, { "epoch": 0.5668882763093887, "grad_norm": 2.359004020690918, "learning_rate": 0.00016348375586985983, "loss": 2.6511, "step": 874 }, { "epoch": 0.5675368898978433, "grad_norm": 2.020916700363159, "learning_rate": 0.00016340475923521384, "loss": 2.4603, "step": 875 }, { "epoch": 0.568185503486298, "grad_norm": 1.781226396560669, "learning_rate": 0.0001633256963768072, "loss": 2.6044, "step": 876 }, { "epoch": 0.5688341170747527, "grad_norm": 2.2149884700775146, "learning_rate": 0.00016324656737721807, "loss": 2.8704, "step": 877 }, { "epoch": 0.5694827306632074, "grad_norm": 2.051330327987671, "learning_rate": 0.00016316737231909342, "loss": 2.3622, "step": 878 }, { "epoch": 0.5701313442516621, "grad_norm": 1.8624647855758667, "learning_rate": 0.00016308811128514945, "loss": 2.271, "step": 879 }, { "epoch": 0.5707799578401167, "grad_norm": 1.8709050416946411, "learning_rate": 0.00016300878435817113, "loss": 2.5855, "step": 880 }, { "epoch": 0.5714285714285714, "grad_norm": 1.630293607711792, "learning_rate": 0.00016292939162101228, "loss": 2.6065, "step": 881 }, { "epoch": 0.5720771850170261, "grad_norm": 2.4564857482910156, "learning_rate": 0.00016284993315659552, "loss": 2.6145, "step": 882 }, { "epoch": 0.5727257986054808, "grad_norm": 2.180867910385132, "learning_rate": 0.00016277040904791199, "loss": 2.6335, "step": 883 }, { "epoch": 0.5733744121939355, "grad_norm": 2.075727939605713, "learning_rate": 0.00016269081937802157, "loss": 2.8165, "step": 884 }, { "epoch": 0.5740230257823902, "grad_norm": 1.9940731525421143, "learning_rate": 0.00016261116423005247, "loss": 2.6012, "step": 885 }, { "epoch": 0.5746716393708449, "grad_norm": 2.3178861141204834, "learning_rate": 0.0001625314436872013, "loss": 2.4274, "step": 886 }, { "epoch": 0.5753202529592994, "grad_norm": 2.374150276184082, "learning_rate": 0.00016245165783273312, "loss": 2.819, "step": 887 }, { "epoch": 0.5759688665477541, "grad_norm": 2.485283851623535, "learning_rate": 0.000162371806749981, "loss": 2.8611, "step": 888 }, { "epoch": 0.5766174801362088, "grad_norm": 1.7687296867370605, "learning_rate": 0.00016229189052234625, "loss": 2.3727, "step": 889 }, { "epoch": 0.5772660937246635, "grad_norm": 2.347965955734253, "learning_rate": 0.0001622119092332982, "loss": 2.3925, "step": 890 }, { "epoch": 0.5779147073131182, "grad_norm": 2.2159085273742676, "learning_rate": 0.00016213186296637418, "loss": 2.813, "step": 891 }, { "epoch": 0.5785633209015729, "grad_norm": 2.174755811691284, "learning_rate": 0.00016205175180517933, "loss": 2.8767, "step": 892 }, { "epoch": 0.5792119344900276, "grad_norm": 2.3282101154327393, "learning_rate": 0.00016197157583338657, "loss": 2.6352, "step": 893 }, { "epoch": 0.5798605480784822, "grad_norm": 2.26180362701416, "learning_rate": 0.00016189133513473656, "loss": 2.9565, "step": 894 }, { "epoch": 0.5805091616669369, "grad_norm": 2.0725808143615723, "learning_rate": 0.0001618110297930375, "loss": 2.4181, "step": 895 }, { "epoch": 0.5811577752553916, "grad_norm": 1.8791955709457397, "learning_rate": 0.00016173065989216512, "loss": 2.4613, "step": 896 }, { "epoch": 0.5818063888438463, "grad_norm": 1.8205071687698364, "learning_rate": 0.00016165022551606266, "loss": 2.4702, "step": 897 }, { "epoch": 0.582455002432301, "grad_norm": 2.206456184387207, "learning_rate": 0.00016156972674874056, "loss": 3.0161, "step": 898 }, { "epoch": 0.5831036160207557, "grad_norm": 2.196382761001587, "learning_rate": 0.0001614891636742767, "loss": 2.5711, "step": 899 }, { "epoch": 0.5837522296092104, "grad_norm": 2.239133596420288, "learning_rate": 0.00016140853637681594, "loss": 2.8358, "step": 900 }, { "epoch": 0.5837522296092104, "eval_loss": 2.71309232711792, "eval_runtime": 35.0486, "eval_samples_per_second": 58.661, "eval_steps_per_second": 14.665, "step": 900 }, { "epoch": 0.5844008431976649, "grad_norm": 2.4057347774505615, "learning_rate": 0.00016132784494057037, "loss": 2.7037, "step": 901 }, { "epoch": 0.5850494567861196, "grad_norm": 1.9970471858978271, "learning_rate": 0.0001612470894498189, "loss": 2.1508, "step": 902 }, { "epoch": 0.5856980703745743, "grad_norm": 1.9484702348709106, "learning_rate": 0.00016116626998890753, "loss": 2.42, "step": 903 }, { "epoch": 0.586346683963029, "grad_norm": 2.6620540618896484, "learning_rate": 0.00016108538664224895, "loss": 2.9392, "step": 904 }, { "epoch": 0.5869952975514837, "grad_norm": 1.8398857116699219, "learning_rate": 0.0001610044394943226, "loss": 2.2753, "step": 905 }, { "epoch": 0.5876439111399384, "grad_norm": 2.0386431217193604, "learning_rate": 0.00016092342862967463, "loss": 2.3637, "step": 906 }, { "epoch": 0.5882925247283931, "grad_norm": 2.0229485034942627, "learning_rate": 0.00016084235413291764, "loss": 2.5873, "step": 907 }, { "epoch": 0.5889411383168477, "grad_norm": 1.9943222999572754, "learning_rate": 0.00016076121608873072, "loss": 2.5657, "step": 908 }, { "epoch": 0.5895897519053024, "grad_norm": 2.2625303268432617, "learning_rate": 0.00016068001458185936, "loss": 2.4297, "step": 909 }, { "epoch": 0.5902383654937571, "grad_norm": 1.6178410053253174, "learning_rate": 0.0001605987496971153, "loss": 2.3811, "step": 910 }, { "epoch": 0.5908869790822118, "grad_norm": 2.008234739303589, "learning_rate": 0.00016051742151937655, "loss": 2.6696, "step": 911 }, { "epoch": 0.5915355926706665, "grad_norm": 2.067573308944702, "learning_rate": 0.00016043603013358708, "loss": 2.6264, "step": 912 }, { "epoch": 0.5921842062591212, "grad_norm": 2.2784266471862793, "learning_rate": 0.00016035457562475704, "loss": 2.5608, "step": 913 }, { "epoch": 0.5928328198475759, "grad_norm": 1.8012956380844116, "learning_rate": 0.00016027305807796247, "loss": 2.2227, "step": 914 }, { "epoch": 0.5934814334360304, "grad_norm": 2.263453245162964, "learning_rate": 0.00016019147757834513, "loss": 2.9776, "step": 915 }, { "epoch": 0.5941300470244851, "grad_norm": 2.3760533332824707, "learning_rate": 0.00016010983421111264, "loss": 2.8201, "step": 916 }, { "epoch": 0.5947786606129398, "grad_norm": 2.3448662757873535, "learning_rate": 0.00016002812806153834, "loss": 2.6808, "step": 917 }, { "epoch": 0.5954272742013945, "grad_norm": 2.7986111640930176, "learning_rate": 0.000159946359214961, "loss": 3.0013, "step": 918 }, { "epoch": 0.5960758877898492, "grad_norm": 1.9401862621307373, "learning_rate": 0.00015986452775678496, "loss": 2.4262, "step": 919 }, { "epoch": 0.5967245013783039, "grad_norm": 2.164109230041504, "learning_rate": 0.00015978263377247994, "loss": 2.7278, "step": 920 }, { "epoch": 0.5973731149667586, "grad_norm": 2.031909465789795, "learning_rate": 0.00015970067734758094, "loss": 2.321, "step": 921 }, { "epoch": 0.5980217285552132, "grad_norm": 1.8662800788879395, "learning_rate": 0.00015961865856768825, "loss": 2.558, "step": 922 }, { "epoch": 0.5986703421436679, "grad_norm": 2.2298877239227295, "learning_rate": 0.00015953657751846718, "loss": 2.8875, "step": 923 }, { "epoch": 0.5993189557321226, "grad_norm": 1.8821171522140503, "learning_rate": 0.0001594544342856482, "loss": 2.3213, "step": 924 }, { "epoch": 0.5999675693205773, "grad_norm": 2.823119640350342, "learning_rate": 0.00015937222895502661, "loss": 2.8922, "step": 925 }, { "epoch": 0.600616182909032, "grad_norm": 1.9157410860061646, "learning_rate": 0.00015928996161246263, "loss": 2.6328, "step": 926 }, { "epoch": 0.6012647964974867, "grad_norm": 1.9722377061843872, "learning_rate": 0.00015920763234388127, "loss": 2.4277, "step": 927 }, { "epoch": 0.6019134100859413, "grad_norm": 2.05491304397583, "learning_rate": 0.00015912524123527221, "loss": 2.7735, "step": 928 }, { "epoch": 0.6025620236743959, "grad_norm": 1.8120936155319214, "learning_rate": 0.0001590427883726896, "loss": 2.4808, "step": 929 }, { "epoch": 0.6032106372628506, "grad_norm": 2.2335927486419678, "learning_rate": 0.00015896027384225235, "loss": 2.7079, "step": 930 }, { "epoch": 0.6038592508513053, "grad_norm": 2.2155191898345947, "learning_rate": 0.00015887769773014347, "loss": 2.8518, "step": 931 }, { "epoch": 0.60450786443976, "grad_norm": 2.2017664909362793, "learning_rate": 0.00015879506012261052, "loss": 2.62, "step": 932 }, { "epoch": 0.6051564780282147, "grad_norm": 1.9267942905426025, "learning_rate": 0.00015871236110596515, "loss": 2.4708, "step": 933 }, { "epoch": 0.6058050916166694, "grad_norm": 2.4583592414855957, "learning_rate": 0.0001586296007665833, "loss": 2.3994, "step": 934 }, { "epoch": 0.6064537052051241, "grad_norm": 2.005619764328003, "learning_rate": 0.00015854677919090477, "loss": 2.5359, "step": 935 }, { "epoch": 0.6071023187935787, "grad_norm": 1.6709439754486084, "learning_rate": 0.00015846389646543348, "loss": 2.2574, "step": 936 }, { "epoch": 0.6077509323820334, "grad_norm": 2.0372586250305176, "learning_rate": 0.00015838095267673712, "loss": 2.6288, "step": 937 }, { "epoch": 0.6083995459704881, "grad_norm": 1.981662631034851, "learning_rate": 0.0001582979479114472, "loss": 2.625, "step": 938 }, { "epoch": 0.6090481595589428, "grad_norm": 1.9726930856704712, "learning_rate": 0.00015821488225625896, "loss": 2.2981, "step": 939 }, { "epoch": 0.6096967731473975, "grad_norm": 2.2656047344207764, "learning_rate": 0.00015813175579793112, "loss": 2.6303, "step": 940 }, { "epoch": 0.6103453867358521, "grad_norm": 2.3176369667053223, "learning_rate": 0.00015804856862328598, "loss": 2.6425, "step": 941 }, { "epoch": 0.6109940003243068, "grad_norm": 2.239445924758911, "learning_rate": 0.00015796532081920934, "loss": 2.6754, "step": 942 }, { "epoch": 0.6116426139127614, "grad_norm": 2.164426803588867, "learning_rate": 0.00015788201247265011, "loss": 2.7497, "step": 943 }, { "epoch": 0.6122912275012161, "grad_norm": 1.971767783164978, "learning_rate": 0.00015779864367062064, "loss": 2.3136, "step": 944 }, { "epoch": 0.6129398410896708, "grad_norm": 1.8716953992843628, "learning_rate": 0.0001577152145001963, "loss": 2.8075, "step": 945 }, { "epoch": 0.6135884546781255, "grad_norm": 2.133183240890503, "learning_rate": 0.0001576317250485156, "loss": 2.3824, "step": 946 }, { "epoch": 0.6142370682665802, "grad_norm": 2.0376219749450684, "learning_rate": 0.00015754817540277992, "loss": 2.4396, "step": 947 }, { "epoch": 0.6148856818550349, "grad_norm": 2.009746789932251, "learning_rate": 0.0001574645656502536, "loss": 2.3153, "step": 948 }, { "epoch": 0.6155342954434896, "grad_norm": 2.2997593879699707, "learning_rate": 0.00015738089587826365, "loss": 2.3117, "step": 949 }, { "epoch": 0.6161829090319442, "grad_norm": 2.345745801925659, "learning_rate": 0.00015729716617419995, "loss": 2.7221, "step": 950 }, { "epoch": 0.6168315226203989, "grad_norm": 2.584408760070801, "learning_rate": 0.00015721337662551475, "loss": 2.6463, "step": 951 }, { "epoch": 0.6174801362088536, "grad_norm": 2.4927022457122803, "learning_rate": 0.00015712952731972295, "loss": 2.6563, "step": 952 }, { "epoch": 0.6181287497973083, "grad_norm": 2.3778746128082275, "learning_rate": 0.00015704561834440183, "loss": 2.5953, "step": 953 }, { "epoch": 0.618777363385763, "grad_norm": 2.2131829261779785, "learning_rate": 0.00015696164978719102, "loss": 2.365, "step": 954 }, { "epoch": 0.6194259769742176, "grad_norm": 1.973899006843567, "learning_rate": 0.0001568776217357923, "loss": 2.1784, "step": 955 }, { "epoch": 0.6200745905626723, "grad_norm": 2.3028757572174072, "learning_rate": 0.00015679353427796967, "loss": 2.158, "step": 956 }, { "epoch": 0.6207232041511269, "grad_norm": 2.5411038398742676, "learning_rate": 0.0001567093875015492, "loss": 2.9572, "step": 957 }, { "epoch": 0.6213718177395816, "grad_norm": 2.2682900428771973, "learning_rate": 0.0001566251814944188, "loss": 2.5505, "step": 958 }, { "epoch": 0.6220204313280363, "grad_norm": 1.782820463180542, "learning_rate": 0.00015654091634452835, "loss": 2.3828, "step": 959 }, { "epoch": 0.622669044916491, "grad_norm": 2.541433095932007, "learning_rate": 0.0001564565921398894, "loss": 2.6932, "step": 960 }, { "epoch": 0.6233176585049457, "grad_norm": 2.8258216381073, "learning_rate": 0.0001563722089685753, "loss": 2.7276, "step": 961 }, { "epoch": 0.6239662720934004, "grad_norm": 2.6248385906219482, "learning_rate": 0.0001562877669187209, "loss": 2.5222, "step": 962 }, { "epoch": 0.6246148856818551, "grad_norm": 2.2808005809783936, "learning_rate": 0.00015620326607852265, "loss": 2.6974, "step": 963 }, { "epoch": 0.6252634992703097, "grad_norm": 2.674485683441162, "learning_rate": 0.00015611870653623825, "loss": 2.8484, "step": 964 }, { "epoch": 0.6259121128587644, "grad_norm": 2.3913564682006836, "learning_rate": 0.00015603408838018684, "loss": 2.5638, "step": 965 }, { "epoch": 0.626560726447219, "grad_norm": 2.08746600151062, "learning_rate": 0.0001559494116987487, "loss": 2.4275, "step": 966 }, { "epoch": 0.6272093400356737, "grad_norm": 2.110572099685669, "learning_rate": 0.00015586467658036524, "loss": 2.6626, "step": 967 }, { "epoch": 0.6278579536241284, "grad_norm": 2.177456855773926, "learning_rate": 0.00015577988311353904, "loss": 2.6193, "step": 968 }, { "epoch": 0.6285065672125831, "grad_norm": 2.267613649368286, "learning_rate": 0.00015569503138683345, "loss": 2.5349, "step": 969 }, { "epoch": 0.6291551808010378, "grad_norm": 2.223992109298706, "learning_rate": 0.00015561012148887274, "loss": 2.5358, "step": 970 }, { "epoch": 0.6298037943894924, "grad_norm": 2.304708242416382, "learning_rate": 0.00015552515350834197, "loss": 2.9059, "step": 971 }, { "epoch": 0.6304524079779471, "grad_norm": 2.364778995513916, "learning_rate": 0.00015544012753398678, "loss": 2.7984, "step": 972 }, { "epoch": 0.6311010215664018, "grad_norm": 2.1268465518951416, "learning_rate": 0.00015535504365461346, "loss": 2.4436, "step": 973 }, { "epoch": 0.6317496351548565, "grad_norm": 2.3930370807647705, "learning_rate": 0.00015526990195908878, "loss": 2.6227, "step": 974 }, { "epoch": 0.6323982487433112, "grad_norm": 2.3842310905456543, "learning_rate": 0.00015518470253633986, "loss": 2.5071, "step": 975 }, { "epoch": 0.6330468623317659, "grad_norm": 1.9541676044464111, "learning_rate": 0.00015509944547535405, "loss": 2.4086, "step": 976 }, { "epoch": 0.6336954759202206, "grad_norm": 2.475834846496582, "learning_rate": 0.00015501413086517905, "loss": 2.5644, "step": 977 }, { "epoch": 0.6343440895086752, "grad_norm": 1.875421166419983, "learning_rate": 0.0001549287587949226, "loss": 2.4876, "step": 978 }, { "epoch": 0.6349927030971299, "grad_norm": 2.6315712928771973, "learning_rate": 0.00015484332935375244, "loss": 2.6129, "step": 979 }, { "epoch": 0.6356413166855845, "grad_norm": 2.140617847442627, "learning_rate": 0.00015475784263089618, "loss": 2.716, "step": 980 }, { "epoch": 0.6362899302740392, "grad_norm": 2.2375760078430176, "learning_rate": 0.00015467229871564137, "loss": 2.5743, "step": 981 }, { "epoch": 0.6369385438624939, "grad_norm": 2.190585136413574, "learning_rate": 0.00015458669769733522, "loss": 2.6336, "step": 982 }, { "epoch": 0.6375871574509486, "grad_norm": 2.7392513751983643, "learning_rate": 0.00015450103966538464, "loss": 2.9417, "step": 983 }, { "epoch": 0.6382357710394033, "grad_norm": 2.1198604106903076, "learning_rate": 0.00015441532470925602, "loss": 2.8481, "step": 984 }, { "epoch": 0.6388843846278579, "grad_norm": 2.1409387588500977, "learning_rate": 0.00015432955291847525, "loss": 2.4726, "step": 985 }, { "epoch": 0.6395329982163126, "grad_norm": 2.2652640342712402, "learning_rate": 0.00015424372438262756, "loss": 2.5788, "step": 986 }, { "epoch": 0.6401816118047673, "grad_norm": 1.8990882635116577, "learning_rate": 0.00015415783919135747, "loss": 2.3454, "step": 987 }, { "epoch": 0.640830225393222, "grad_norm": 2.663339853286743, "learning_rate": 0.00015407189743436864, "loss": 2.9276, "step": 988 }, { "epoch": 0.6414788389816767, "grad_norm": 2.127061128616333, "learning_rate": 0.0001539858992014239, "loss": 2.3861, "step": 989 }, { "epoch": 0.6421274525701314, "grad_norm": 2.5524849891662598, "learning_rate": 0.00015389984458234488, "loss": 2.8867, "step": 990 }, { "epoch": 0.6427760661585861, "grad_norm": 2.432252883911133, "learning_rate": 0.00015381373366701227, "loss": 2.5198, "step": 991 }, { "epoch": 0.6434246797470407, "grad_norm": 1.9449174404144287, "learning_rate": 0.0001537275665453656, "loss": 2.1787, "step": 992 }, { "epoch": 0.6440732933354953, "grad_norm": 1.8984159231185913, "learning_rate": 0.00015364134330740292, "loss": 2.1472, "step": 993 }, { "epoch": 0.64472190692395, "grad_norm": 2.226346015930176, "learning_rate": 0.00015355506404318104, "loss": 2.6574, "step": 994 }, { "epoch": 0.6453705205124047, "grad_norm": 2.1505062580108643, "learning_rate": 0.00015346872884281518, "loss": 2.4511, "step": 995 }, { "epoch": 0.6460191341008594, "grad_norm": 3.126215696334839, "learning_rate": 0.0001533823377964791, "loss": 3.2088, "step": 996 }, { "epoch": 0.6466677476893141, "grad_norm": 2.350383996963501, "learning_rate": 0.00015329589099440476, "loss": 2.596, "step": 997 }, { "epoch": 0.6473163612777688, "grad_norm": 2.0301599502563477, "learning_rate": 0.00015320938852688248, "loss": 2.3248, "step": 998 }, { "epoch": 0.6479649748662234, "grad_norm": 2.394653797149658, "learning_rate": 0.00015312283048426063, "loss": 2.7805, "step": 999 }, { "epoch": 0.6486135884546781, "grad_norm": 2.457226037979126, "learning_rate": 0.00015303621695694566, "loss": 2.8312, "step": 1000 }, { "epoch": 0.6486135884546781, "eval_loss": 2.6432583332061768, "eval_runtime": 35.0996, "eval_samples_per_second": 58.576, "eval_steps_per_second": 14.644, "step": 1000 }, { "epoch": 0.6492622020431328, "grad_norm": 2.4642865657806396, "learning_rate": 0.00015294954803540201, "loss": 2.5842, "step": 1001 }, { "epoch": 0.6499108156315875, "grad_norm": 2.280331611633301, "learning_rate": 0.00015286282381015188, "loss": 2.5772, "step": 1002 }, { "epoch": 0.6505594292200422, "grad_norm": 2.446798801422119, "learning_rate": 0.00015277604437177535, "loss": 2.3743, "step": 1003 }, { "epoch": 0.6512080428084969, "grad_norm": 2.17205810546875, "learning_rate": 0.00015268920981091006, "loss": 2.5112, "step": 1004 }, { "epoch": 0.6518566563969516, "grad_norm": 2.6025631427764893, "learning_rate": 0.00015260232021825128, "loss": 2.6536, "step": 1005 }, { "epoch": 0.6525052699854061, "grad_norm": 2.247659206390381, "learning_rate": 0.00015251537568455173, "loss": 2.6427, "step": 1006 }, { "epoch": 0.6531538835738608, "grad_norm": 2.3733818531036377, "learning_rate": 0.0001524283763006216, "loss": 2.5537, "step": 1007 }, { "epoch": 0.6538024971623155, "grad_norm": 1.9836475849151611, "learning_rate": 0.00015234132215732822, "loss": 2.1503, "step": 1008 }, { "epoch": 0.6544511107507702, "grad_norm": 2.3204004764556885, "learning_rate": 0.00015225421334559626, "loss": 2.4757, "step": 1009 }, { "epoch": 0.6550997243392249, "grad_norm": 2.1223583221435547, "learning_rate": 0.0001521670499564074, "loss": 2.436, "step": 1010 }, { "epoch": 0.6557483379276796, "grad_norm": 2.065255641937256, "learning_rate": 0.00015207983208080034, "loss": 2.6815, "step": 1011 }, { "epoch": 0.6563969515161343, "grad_norm": 2.7279276847839355, "learning_rate": 0.0001519925598098707, "loss": 2.6822, "step": 1012 }, { "epoch": 0.6570455651045889, "grad_norm": 3.1544482707977295, "learning_rate": 0.00015190523323477094, "loss": 2.3543, "step": 1013 }, { "epoch": 0.6576941786930436, "grad_norm": 1.9720402956008911, "learning_rate": 0.00015181785244671017, "loss": 2.2591, "step": 1014 }, { "epoch": 0.6583427922814983, "grad_norm": 2.548261880874634, "learning_rate": 0.0001517304175369542, "loss": 2.6285, "step": 1015 }, { "epoch": 0.658991405869953, "grad_norm": 2.411700487136841, "learning_rate": 0.00015164292859682528, "loss": 2.8611, "step": 1016 }, { "epoch": 0.6596400194584077, "grad_norm": 2.1039950847625732, "learning_rate": 0.00015155538571770218, "loss": 2.6611, "step": 1017 }, { "epoch": 0.6602886330468624, "grad_norm": 2.013720750808716, "learning_rate": 0.00015146778899102, "loss": 2.2983, "step": 1018 }, { "epoch": 0.660937246635317, "grad_norm": 1.9716508388519287, "learning_rate": 0.00015138013850827, "loss": 2.3059, "step": 1019 }, { "epoch": 0.6615858602237716, "grad_norm": 2.187788724899292, "learning_rate": 0.00015129243436099964, "loss": 2.5764, "step": 1020 }, { "epoch": 0.6622344738122263, "grad_norm": 1.8586186170578003, "learning_rate": 0.00015120467664081247, "loss": 2.3396, "step": 1021 }, { "epoch": 0.662883087400681, "grad_norm": 2.022642135620117, "learning_rate": 0.00015111686543936795, "loss": 2.5443, "step": 1022 }, { "epoch": 0.6635317009891357, "grad_norm": 2.5416646003723145, "learning_rate": 0.00015102900084838135, "loss": 2.4268, "step": 1023 }, { "epoch": 0.6641803145775904, "grad_norm": 2.6829476356506348, "learning_rate": 0.0001509410829596238, "loss": 2.5134, "step": 1024 }, { "epoch": 0.6648289281660451, "grad_norm": 2.125669240951538, "learning_rate": 0.00015085311186492206, "loss": 2.3724, "step": 1025 }, { "epoch": 0.6654775417544998, "grad_norm": 2.1708943843841553, "learning_rate": 0.00015076508765615845, "loss": 2.5586, "step": 1026 }, { "epoch": 0.6661261553429544, "grad_norm": 2.269397735595703, "learning_rate": 0.00015067701042527075, "loss": 2.2569, "step": 1027 }, { "epoch": 0.6667747689314091, "grad_norm": 2.504974603652954, "learning_rate": 0.00015058888026425212, "loss": 2.7459, "step": 1028 }, { "epoch": 0.6674233825198638, "grad_norm": 2.0300753116607666, "learning_rate": 0.0001505006972651511, "loss": 2.3028, "step": 1029 }, { "epoch": 0.6680719961083185, "grad_norm": 2.9403915405273438, "learning_rate": 0.0001504124615200713, "loss": 2.6278, "step": 1030 }, { "epoch": 0.6687206096967732, "grad_norm": 2.079052448272705, "learning_rate": 0.00015032417312117142, "loss": 2.5203, "step": 1031 }, { "epoch": 0.6693692232852279, "grad_norm": 2.2372522354125977, "learning_rate": 0.00015023583216066525, "loss": 2.2446, "step": 1032 }, { "epoch": 0.6700178368736825, "grad_norm": 1.8166232109069824, "learning_rate": 0.00015014743873082145, "loss": 2.0312, "step": 1033 }, { "epoch": 0.6706664504621371, "grad_norm": 2.386690139770508, "learning_rate": 0.00015005899292396335, "loss": 2.4561, "step": 1034 }, { "epoch": 0.6713150640505918, "grad_norm": 2.3613104820251465, "learning_rate": 0.00014997049483246922, "loss": 2.8016, "step": 1035 }, { "epoch": 0.6719636776390465, "grad_norm": 2.3698136806488037, "learning_rate": 0.00014988194454877173, "loss": 2.719, "step": 1036 }, { "epoch": 0.6726122912275012, "grad_norm": 2.12869930267334, "learning_rate": 0.00014979334216535817, "loss": 2.5146, "step": 1037 }, { "epoch": 0.6732609048159559, "grad_norm": 2.1055350303649902, "learning_rate": 0.00014970468777477026, "loss": 2.4697, "step": 1038 }, { "epoch": 0.6739095184044106, "grad_norm": 2.2801716327667236, "learning_rate": 0.0001496159814696039, "loss": 2.5333, "step": 1039 }, { "epoch": 0.6745581319928653, "grad_norm": 1.980212688446045, "learning_rate": 0.00014952722334250944, "loss": 2.2694, "step": 1040 }, { "epoch": 0.6752067455813199, "grad_norm": 2.357842445373535, "learning_rate": 0.00014943841348619112, "loss": 2.5925, "step": 1041 }, { "epoch": 0.6758553591697746, "grad_norm": 2.3249998092651367, "learning_rate": 0.0001493495519934074, "loss": 2.5554, "step": 1042 }, { "epoch": 0.6765039727582293, "grad_norm": 1.858232855796814, "learning_rate": 0.00014926063895697052, "loss": 2.3822, "step": 1043 }, { "epoch": 0.677152586346684, "grad_norm": 2.028215169906616, "learning_rate": 0.00014917167446974668, "loss": 2.2575, "step": 1044 }, { "epoch": 0.6778011999351387, "grad_norm": 2.5688393115997314, "learning_rate": 0.00014908265862465577, "loss": 2.5325, "step": 1045 }, { "epoch": 0.6784498135235933, "grad_norm": 1.9057042598724365, "learning_rate": 0.00014899359151467127, "loss": 2.0173, "step": 1046 }, { "epoch": 0.679098427112048, "grad_norm": 2.113856077194214, "learning_rate": 0.00014890447323282025, "loss": 2.5823, "step": 1047 }, { "epoch": 0.6797470407005026, "grad_norm": 2.114788055419922, "learning_rate": 0.00014881530387218325, "loss": 2.4123, "step": 1048 }, { "epoch": 0.6803956542889573, "grad_norm": 2.254091501235962, "learning_rate": 0.00014872608352589414, "loss": 2.5016, "step": 1049 }, { "epoch": 0.681044267877412, "grad_norm": 2.6602749824523926, "learning_rate": 0.00014863681228714006, "loss": 2.8041, "step": 1050 }, { "epoch": 0.6816928814658667, "grad_norm": 2.2850215435028076, "learning_rate": 0.00014854749024916127, "loss": 2.4741, "step": 1051 }, { "epoch": 0.6823414950543214, "grad_norm": 2.111722707748413, "learning_rate": 0.00014845811750525105, "loss": 2.3893, "step": 1052 }, { "epoch": 0.6829901086427761, "grad_norm": 2.2083775997161865, "learning_rate": 0.00014836869414875574, "loss": 2.5281, "step": 1053 }, { "epoch": 0.6836387222312308, "grad_norm": 1.834399700164795, "learning_rate": 0.00014827922027307451, "loss": 2.1503, "step": 1054 }, { "epoch": 0.6842873358196854, "grad_norm": 2.2304556369781494, "learning_rate": 0.00014818969597165922, "loss": 2.5701, "step": 1055 }, { "epoch": 0.6849359494081401, "grad_norm": 2.9219696521759033, "learning_rate": 0.00014810012133801453, "loss": 2.6086, "step": 1056 }, { "epoch": 0.6855845629965948, "grad_norm": 2.4242303371429443, "learning_rate": 0.00014801049646569756, "loss": 2.7595, "step": 1057 }, { "epoch": 0.6862331765850495, "grad_norm": 2.4718270301818848, "learning_rate": 0.00014792082144831793, "loss": 2.5084, "step": 1058 }, { "epoch": 0.6868817901735041, "grad_norm": 2.371413230895996, "learning_rate": 0.0001478310963795377, "loss": 2.6398, "step": 1059 }, { "epoch": 0.6875304037619588, "grad_norm": 2.0543532371520996, "learning_rate": 0.0001477413213530711, "loss": 2.2261, "step": 1060 }, { "epoch": 0.6881790173504135, "grad_norm": 2.694807291030884, "learning_rate": 0.0001476514964626846, "loss": 2.6597, "step": 1061 }, { "epoch": 0.6888276309388681, "grad_norm": 2.224252223968506, "learning_rate": 0.00014756162180219672, "loss": 2.6955, "step": 1062 }, { "epoch": 0.6894762445273228, "grad_norm": 2.3304057121276855, "learning_rate": 0.00014747169746547802, "loss": 2.3894, "step": 1063 }, { "epoch": 0.6901248581157775, "grad_norm": 2.1676759719848633, "learning_rate": 0.00014738172354645087, "loss": 2.2289, "step": 1064 }, { "epoch": 0.6907734717042322, "grad_norm": 2.1403555870056152, "learning_rate": 0.00014729170013908955, "loss": 2.4913, "step": 1065 }, { "epoch": 0.6914220852926869, "grad_norm": 2.491506576538086, "learning_rate": 0.00014720162733741987, "loss": 2.438, "step": 1066 }, { "epoch": 0.6920706988811416, "grad_norm": 2.3070244789123535, "learning_rate": 0.00014711150523551934, "loss": 2.504, "step": 1067 }, { "epoch": 0.6927193124695963, "grad_norm": 2.5060532093048096, "learning_rate": 0.00014702133392751688, "loss": 2.4193, "step": 1068 }, { "epoch": 0.6933679260580509, "grad_norm": 2.2390241622924805, "learning_rate": 0.0001469311135075929, "loss": 2.5739, "step": 1069 }, { "epoch": 0.6940165396465056, "grad_norm": 3.0148630142211914, "learning_rate": 0.00014684084406997903, "loss": 2.5851, "step": 1070 }, { "epoch": 0.6946651532349603, "grad_norm": 2.4393041133880615, "learning_rate": 0.00014675052570895813, "loss": 2.4117, "step": 1071 }, { "epoch": 0.695313766823415, "grad_norm": 2.4812159538269043, "learning_rate": 0.00014666015851886414, "loss": 2.4349, "step": 1072 }, { "epoch": 0.6959623804118696, "grad_norm": 2.2978785037994385, "learning_rate": 0.00014656974259408208, "loss": 2.4117, "step": 1073 }, { "epoch": 0.6966109940003243, "grad_norm": 2.825817346572876, "learning_rate": 0.0001464792780290477, "loss": 2.403, "step": 1074 }, { "epoch": 0.697259607588779, "grad_norm": 2.235614061355591, "learning_rate": 0.00014638876491824773, "loss": 2.4884, "step": 1075 }, { "epoch": 0.6979082211772336, "grad_norm": 2.5563125610351562, "learning_rate": 0.0001462982033562195, "loss": 2.5922, "step": 1076 }, { "epoch": 0.6985568347656883, "grad_norm": 3.3240959644317627, "learning_rate": 0.000146207593437551, "loss": 2.3523, "step": 1077 }, { "epoch": 0.699205448354143, "grad_norm": 2.142399787902832, "learning_rate": 0.00014611693525688066, "loss": 2.569, "step": 1078 }, { "epoch": 0.6998540619425977, "grad_norm": 2.3642191886901855, "learning_rate": 0.0001460262289088974, "loss": 2.4369, "step": 1079 }, { "epoch": 0.7005026755310524, "grad_norm": 1.8554115295410156, "learning_rate": 0.00014593547448834036, "loss": 2.1434, "step": 1080 }, { "epoch": 0.7011512891195071, "grad_norm": 2.424924850463867, "learning_rate": 0.000145844672089999, "loss": 2.4921, "step": 1081 }, { "epoch": 0.7017999027079618, "grad_norm": 2.1045143604278564, "learning_rate": 0.0001457538218087128, "loss": 2.5413, "step": 1082 }, { "epoch": 0.7024485162964164, "grad_norm": 2.1705238819122314, "learning_rate": 0.0001456629237393713, "loss": 2.504, "step": 1083 }, { "epoch": 0.703097129884871, "grad_norm": 2.357515811920166, "learning_rate": 0.00014557197797691394, "loss": 2.646, "step": 1084 }, { "epoch": 0.7037457434733257, "grad_norm": 2.097329616546631, "learning_rate": 0.0001454809846163299, "loss": 2.0735, "step": 1085 }, { "epoch": 0.7043943570617804, "grad_norm": 2.336643695831299, "learning_rate": 0.00014538994375265822, "loss": 2.3785, "step": 1086 }, { "epoch": 0.7050429706502351, "grad_norm": 2.4383277893066406, "learning_rate": 0.00014529885548098743, "loss": 2.7388, "step": 1087 }, { "epoch": 0.7056915842386898, "grad_norm": 2.3092379570007324, "learning_rate": 0.00014520771989645563, "loss": 2.4622, "step": 1088 }, { "epoch": 0.7063401978271445, "grad_norm": 2.3851819038391113, "learning_rate": 0.00014511653709425038, "loss": 2.5224, "step": 1089 }, { "epoch": 0.7069888114155991, "grad_norm": 2.216942071914673, "learning_rate": 0.00014502530716960842, "loss": 2.5186, "step": 1090 }, { "epoch": 0.7076374250040538, "grad_norm": 2.5116093158721924, "learning_rate": 0.00014493403021781587, "loss": 2.5118, "step": 1091 }, { "epoch": 0.7082860385925085, "grad_norm": 2.5489773750305176, "learning_rate": 0.00014484270633420785, "loss": 2.4453, "step": 1092 }, { "epoch": 0.7089346521809632, "grad_norm": 2.2868235111236572, "learning_rate": 0.00014475133561416855, "loss": 2.5527, "step": 1093 }, { "epoch": 0.7095832657694179, "grad_norm": 2.466740846633911, "learning_rate": 0.00014465991815313108, "loss": 2.4462, "step": 1094 }, { "epoch": 0.7102318793578726, "grad_norm": 2.185421943664551, "learning_rate": 0.00014456845404657738, "loss": 2.2608, "step": 1095 }, { "epoch": 0.7108804929463273, "grad_norm": 2.5879385471343994, "learning_rate": 0.000144476943390038, "loss": 2.7189, "step": 1096 }, { "epoch": 0.7115291065347819, "grad_norm": 2.3383841514587402, "learning_rate": 0.0001443853862790923, "loss": 2.4742, "step": 1097 }, { "epoch": 0.7121777201232365, "grad_norm": 2.228130340576172, "learning_rate": 0.00014429378280936804, "loss": 2.5369, "step": 1098 }, { "epoch": 0.7128263337116912, "grad_norm": 2.544971227645874, "learning_rate": 0.00014420213307654134, "loss": 2.5713, "step": 1099 }, { "epoch": 0.7134749473001459, "grad_norm": 2.1553821563720703, "learning_rate": 0.0001441104371763368, "loss": 2.4678, "step": 1100 }, { "epoch": 0.7134749473001459, "eval_loss": 2.5773675441741943, "eval_runtime": 35.0629, "eval_samples_per_second": 58.637, "eval_steps_per_second": 14.659, "step": 1100 }, { "epoch": 0.7141235608886006, "grad_norm": 2.1089272499084473, "learning_rate": 0.0001440186952045271, "loss": 2.4097, "step": 1101 }, { "epoch": 0.7147721744770553, "grad_norm": 2.070720672607422, "learning_rate": 0.00014392690725693313, "loss": 2.2625, "step": 1102 }, { "epoch": 0.71542078806551, "grad_norm": 2.149669647216797, "learning_rate": 0.00014383507342942376, "loss": 2.1797, "step": 1103 }, { "epoch": 0.7160694016539646, "grad_norm": 2.524801015853882, "learning_rate": 0.0001437431938179158, "loss": 2.4521, "step": 1104 }, { "epoch": 0.7167180152424193, "grad_norm": 2.1104750633239746, "learning_rate": 0.00014365126851837383, "loss": 2.309, "step": 1105 }, { "epoch": 0.717366628830874, "grad_norm": 2.5360360145568848, "learning_rate": 0.0001435592976268102, "loss": 2.6206, "step": 1106 }, { "epoch": 0.7180152424193287, "grad_norm": 2.81925368309021, "learning_rate": 0.0001434672812392849, "loss": 2.8028, "step": 1107 }, { "epoch": 0.7186638560077834, "grad_norm": 2.341125965118408, "learning_rate": 0.0001433752194519054, "loss": 2.3247, "step": 1108 }, { "epoch": 0.7193124695962381, "grad_norm": 2.1391282081604004, "learning_rate": 0.00014328311236082655, "loss": 2.5974, "step": 1109 }, { "epoch": 0.7199610831846928, "grad_norm": 2.807884693145752, "learning_rate": 0.00014319096006225055, "loss": 2.6009, "step": 1110 }, { "epoch": 0.7206096967731473, "grad_norm": 2.4300692081451416, "learning_rate": 0.0001430987626524269, "loss": 2.553, "step": 1111 }, { "epoch": 0.721258310361602, "grad_norm": 2.1681888103485107, "learning_rate": 0.00014300652022765207, "loss": 2.3673, "step": 1112 }, { "epoch": 0.7219069239500567, "grad_norm": 3.1575734615325928, "learning_rate": 0.00014291423288426963, "loss": 2.8007, "step": 1113 }, { "epoch": 0.7225555375385114, "grad_norm": 2.3539319038391113, "learning_rate": 0.0001428219007186701, "loss": 2.3168, "step": 1114 }, { "epoch": 0.7232041511269661, "grad_norm": 3.1537773609161377, "learning_rate": 0.00014272952382729076, "loss": 2.8854, "step": 1115 }, { "epoch": 0.7238527647154208, "grad_norm": 2.659494400024414, "learning_rate": 0.0001426371023066156, "loss": 2.6897, "step": 1116 }, { "epoch": 0.7245013783038755, "grad_norm": 1.7911912202835083, "learning_rate": 0.00014254463625317524, "loss": 2.1254, "step": 1117 }, { "epoch": 0.7251499918923301, "grad_norm": 2.4076085090637207, "learning_rate": 0.00014245212576354682, "loss": 2.4101, "step": 1118 }, { "epoch": 0.7257986054807848, "grad_norm": 2.504896879196167, "learning_rate": 0.00014235957093435388, "loss": 2.5749, "step": 1119 }, { "epoch": 0.7264472190692395, "grad_norm": 2.2043542861938477, "learning_rate": 0.00014226697186226625, "loss": 2.4805, "step": 1120 }, { "epoch": 0.7270958326576942, "grad_norm": 2.2740418910980225, "learning_rate": 0.000142174328644, "loss": 2.1746, "step": 1121 }, { "epoch": 0.7277444462461489, "grad_norm": 2.2913689613342285, "learning_rate": 0.00014208164137631736, "loss": 2.6415, "step": 1122 }, { "epoch": 0.7283930598346036, "grad_norm": 2.4523983001708984, "learning_rate": 0.00014198891015602646, "loss": 2.1421, "step": 1123 }, { "epoch": 0.7290416734230583, "grad_norm": 2.0742244720458984, "learning_rate": 0.0001418961350799814, "loss": 2.3531, "step": 1124 }, { "epoch": 0.7296902870115128, "grad_norm": 2.62817120552063, "learning_rate": 0.00014180331624508207, "loss": 2.4457, "step": 1125 }, { "epoch": 0.7303389005999675, "grad_norm": 2.8968143463134766, "learning_rate": 0.00014171045374827409, "loss": 2.6835, "step": 1126 }, { "epoch": 0.7309875141884222, "grad_norm": 2.455381155014038, "learning_rate": 0.00014161754768654862, "loss": 2.5579, "step": 1127 }, { "epoch": 0.7316361277768769, "grad_norm": 2.2377424240112305, "learning_rate": 0.0001415245981569424, "loss": 2.3832, "step": 1128 }, { "epoch": 0.7322847413653316, "grad_norm": 2.1349565982818604, "learning_rate": 0.00014143160525653746, "loss": 2.2958, "step": 1129 }, { "epoch": 0.7329333549537863, "grad_norm": 2.5063793659210205, "learning_rate": 0.00014133856908246135, "loss": 2.2773, "step": 1130 }, { "epoch": 0.733581968542241, "grad_norm": 2.268751621246338, "learning_rate": 0.00014124548973188655, "loss": 2.3114, "step": 1131 }, { "epoch": 0.7342305821306956, "grad_norm": 2.72029709815979, "learning_rate": 0.00014115236730203077, "loss": 2.634, "step": 1132 }, { "epoch": 0.7348791957191503, "grad_norm": 2.3108718395233154, "learning_rate": 0.00014105920189015674, "loss": 2.3512, "step": 1133 }, { "epoch": 0.735527809307605, "grad_norm": 2.2641985416412354, "learning_rate": 0.000140965993593572, "loss": 2.1644, "step": 1134 }, { "epoch": 0.7361764228960597, "grad_norm": 2.7570831775665283, "learning_rate": 0.00014087274250962895, "loss": 2.5593, "step": 1135 }, { "epoch": 0.7368250364845144, "grad_norm": 2.6672446727752686, "learning_rate": 0.00014077944873572466, "loss": 2.367, "step": 1136 }, { "epoch": 0.7374736500729691, "grad_norm": 2.4296653270721436, "learning_rate": 0.00014068611236930077, "loss": 2.3527, "step": 1137 }, { "epoch": 0.7381222636614237, "grad_norm": 2.323241949081421, "learning_rate": 0.00014059273350784342, "loss": 2.4478, "step": 1138 }, { "epoch": 0.7387708772498783, "grad_norm": 2.3554646968841553, "learning_rate": 0.00014049931224888317, "loss": 2.5404, "step": 1139 }, { "epoch": 0.739419490838333, "grad_norm": 2.4682397842407227, "learning_rate": 0.00014040584868999477, "loss": 2.4046, "step": 1140 }, { "epoch": 0.7400681044267877, "grad_norm": 2.774336814880371, "learning_rate": 0.00014031234292879725, "loss": 2.6765, "step": 1141 }, { "epoch": 0.7407167180152424, "grad_norm": 3.020242214202881, "learning_rate": 0.0001402187950629536, "loss": 2.7665, "step": 1142 }, { "epoch": 0.7413653316036971, "grad_norm": 2.138206720352173, "learning_rate": 0.00014012520519017098, "loss": 2.3586, "step": 1143 }, { "epoch": 0.7420139451921518, "grad_norm": 2.667858839035034, "learning_rate": 0.00014003157340820022, "loss": 2.1228, "step": 1144 }, { "epoch": 0.7426625587806065, "grad_norm": 1.9645453691482544, "learning_rate": 0.00013993789981483606, "loss": 2.0815, "step": 1145 }, { "epoch": 0.7433111723690611, "grad_norm": 2.2368223667144775, "learning_rate": 0.0001398441845079168, "loss": 2.3474, "step": 1146 }, { "epoch": 0.7439597859575158, "grad_norm": 3.167555809020996, "learning_rate": 0.00013975042758532442, "loss": 2.6056, "step": 1147 }, { "epoch": 0.7446083995459705, "grad_norm": 2.2531092166900635, "learning_rate": 0.00013965662914498428, "loss": 2.4848, "step": 1148 }, { "epoch": 0.7452570131344252, "grad_norm": 2.2312138080596924, "learning_rate": 0.00013956278928486517, "loss": 2.4139, "step": 1149 }, { "epoch": 0.7459056267228799, "grad_norm": 2.196377754211426, "learning_rate": 0.00013946890810297909, "loss": 2.4152, "step": 1150 }, { "epoch": 0.7465542403113345, "grad_norm": 1.8897932767868042, "learning_rate": 0.00013937498569738122, "loss": 1.7618, "step": 1151 }, { "epoch": 0.7472028538997892, "grad_norm": 2.03973650932312, "learning_rate": 0.00013928102216616975, "loss": 2.1865, "step": 1152 }, { "epoch": 0.7478514674882438, "grad_norm": 2.341304063796997, "learning_rate": 0.00013918701760748592, "loss": 2.5282, "step": 1153 }, { "epoch": 0.7485000810766985, "grad_norm": 3.1422858238220215, "learning_rate": 0.0001390929721195138, "loss": 2.5298, "step": 1154 }, { "epoch": 0.7491486946651532, "grad_norm": 2.296104907989502, "learning_rate": 0.00013899888580048013, "loss": 2.3464, "step": 1155 }, { "epoch": 0.7497973082536079, "grad_norm": 2.044914960861206, "learning_rate": 0.00013890475874865432, "loss": 2.2842, "step": 1156 }, { "epoch": 0.7504459218420626, "grad_norm": 2.295339345932007, "learning_rate": 0.0001388105910623484, "loss": 2.1611, "step": 1157 }, { "epoch": 0.7510945354305173, "grad_norm": 1.9470664262771606, "learning_rate": 0.00013871638283991677, "loss": 2.2582, "step": 1158 }, { "epoch": 0.751743149018972, "grad_norm": 2.307607412338257, "learning_rate": 0.0001386221341797562, "loss": 2.2545, "step": 1159 }, { "epoch": 0.7523917626074266, "grad_norm": 2.5277693271636963, "learning_rate": 0.00013852784518030568, "loss": 2.3755, "step": 1160 }, { "epoch": 0.7530403761958813, "grad_norm": 2.450094223022461, "learning_rate": 0.0001384335159400463, "loss": 2.3814, "step": 1161 }, { "epoch": 0.753688989784336, "grad_norm": 2.489305019378662, "learning_rate": 0.00013833914655750126, "loss": 2.266, "step": 1162 }, { "epoch": 0.7543376033727907, "grad_norm": 2.1400020122528076, "learning_rate": 0.0001382447371312356, "loss": 2.4149, "step": 1163 }, { "epoch": 0.7549862169612453, "grad_norm": 2.539252758026123, "learning_rate": 0.0001381502877598563, "loss": 2.4504, "step": 1164 }, { "epoch": 0.7556348305497, "grad_norm": 2.4654245376586914, "learning_rate": 0.00013805579854201194, "loss": 2.3976, "step": 1165 }, { "epoch": 0.7562834441381547, "grad_norm": 2.1683602333068848, "learning_rate": 0.00013796126957639276, "loss": 2.3712, "step": 1166 }, { "epoch": 0.7569320577266093, "grad_norm": 2.0974810123443604, "learning_rate": 0.00013786670096173058, "loss": 2.0485, "step": 1167 }, { "epoch": 0.757580671315064, "grad_norm": 2.3853375911712646, "learning_rate": 0.0001377720927967985, "loss": 2.3085, "step": 1168 }, { "epoch": 0.7582292849035187, "grad_norm": 2.3424267768859863, "learning_rate": 0.00013767744518041098, "loss": 2.4662, "step": 1169 }, { "epoch": 0.7588778984919734, "grad_norm": 2.741757392883301, "learning_rate": 0.00013758275821142382, "loss": 2.5631, "step": 1170 }, { "epoch": 0.7595265120804281, "grad_norm": 2.836474895477295, "learning_rate": 0.00013748803198873372, "loss": 2.4569, "step": 1171 }, { "epoch": 0.7601751256688828, "grad_norm": 2.304793119430542, "learning_rate": 0.00013739326661127855, "loss": 2.2398, "step": 1172 }, { "epoch": 0.7608237392573375, "grad_norm": 2.611055374145508, "learning_rate": 0.0001372984621780369, "loss": 2.3038, "step": 1173 }, { "epoch": 0.7614723528457921, "grad_norm": 2.802955150604248, "learning_rate": 0.0001372036187880283, "loss": 2.4535, "step": 1174 }, { "epoch": 0.7621209664342468, "grad_norm": 1.970030426979065, "learning_rate": 0.0001371087365403129, "loss": 2.1952, "step": 1175 }, { "epoch": 0.7627695800227015, "grad_norm": 2.4060895442962646, "learning_rate": 0.00013701381553399145, "loss": 2.4232, "step": 1176 }, { "epoch": 0.7634181936111561, "grad_norm": 2.2072536945343018, "learning_rate": 0.0001369188558682052, "loss": 2.4983, "step": 1177 }, { "epoch": 0.7640668071996108, "grad_norm": 2.413482189178467, "learning_rate": 0.00013682385764213572, "loss": 2.3419, "step": 1178 }, { "epoch": 0.7647154207880655, "grad_norm": 2.2544052600860596, "learning_rate": 0.00013672882095500495, "loss": 2.3998, "step": 1179 }, { "epoch": 0.7653640343765202, "grad_norm": 2.29129958152771, "learning_rate": 0.00013663374590607496, "loss": 2.2509, "step": 1180 }, { "epoch": 0.7660126479649748, "grad_norm": 3.020617961883545, "learning_rate": 0.00013653863259464782, "loss": 2.6206, "step": 1181 }, { "epoch": 0.7666612615534295, "grad_norm": 2.1808910369873047, "learning_rate": 0.00013644348112006562, "loss": 2.5462, "step": 1182 }, { "epoch": 0.7673098751418842, "grad_norm": 2.851975679397583, "learning_rate": 0.00013634829158171033, "loss": 2.6786, "step": 1183 }, { "epoch": 0.7679584887303389, "grad_norm": 2.732391119003296, "learning_rate": 0.00013625306407900366, "loss": 2.6931, "step": 1184 }, { "epoch": 0.7686071023187936, "grad_norm": 2.61332368850708, "learning_rate": 0.0001361577987114069, "loss": 2.54, "step": 1185 }, { "epoch": 0.7692557159072483, "grad_norm": 2.5611138343811035, "learning_rate": 0.00013606249557842102, "loss": 2.5078, "step": 1186 }, { "epoch": 0.769904329495703, "grad_norm": 2.338277816772461, "learning_rate": 0.00013596715477958639, "loss": 2.6028, "step": 1187 }, { "epoch": 0.7705529430841576, "grad_norm": 2.5733425617218018, "learning_rate": 0.00013587177641448265, "loss": 2.463, "step": 1188 }, { "epoch": 0.7712015566726123, "grad_norm": 3.148705005645752, "learning_rate": 0.00013577636058272876, "loss": 2.8474, "step": 1189 }, { "epoch": 0.771850170261067, "grad_norm": 2.5615646839141846, "learning_rate": 0.00013568090738398276, "loss": 2.2384, "step": 1190 }, { "epoch": 0.7724987838495216, "grad_norm": 2.5868325233459473, "learning_rate": 0.00013558541691794174, "loss": 2.3262, "step": 1191 }, { "epoch": 0.7731473974379763, "grad_norm": 2.0615079402923584, "learning_rate": 0.00013548988928434167, "loss": 2.1882, "step": 1192 }, { "epoch": 0.773796011026431, "grad_norm": 2.806894540786743, "learning_rate": 0.00013539432458295743, "loss": 2.4151, "step": 1193 }, { "epoch": 0.7744446246148857, "grad_norm": 2.0107595920562744, "learning_rate": 0.00013529872291360257, "loss": 2.204, "step": 1194 }, { "epoch": 0.7750932382033403, "grad_norm": 2.2639808654785156, "learning_rate": 0.00013520308437612924, "loss": 2.4427, "step": 1195 }, { "epoch": 0.775741851791795, "grad_norm": 2.3818392753601074, "learning_rate": 0.00013510740907042812, "loss": 2.653, "step": 1196 }, { "epoch": 0.7763904653802497, "grad_norm": 1.9760150909423828, "learning_rate": 0.00013501169709642823, "loss": 1.8716, "step": 1197 }, { "epoch": 0.7770390789687044, "grad_norm": 2.419508218765259, "learning_rate": 0.00013491594855409697, "loss": 2.3192, "step": 1198 }, { "epoch": 0.7776876925571591, "grad_norm": 2.227017879486084, "learning_rate": 0.0001348201635434399, "loss": 2.448, "step": 1199 }, { "epoch": 0.7783363061456138, "grad_norm": 2.833954095840454, "learning_rate": 0.00013472434216450064, "loss": 2.5489, "step": 1200 }, { "epoch": 0.7783363061456138, "eval_loss": 2.507995128631592, "eval_runtime": 35.0953, "eval_samples_per_second": 58.583, "eval_steps_per_second": 14.646, "step": 1200 }, { "epoch": 0.7789849197340685, "grad_norm": 2.5480222702026367, "learning_rate": 0.0001346284845173609, "loss": 2.3433, "step": 1201 }, { "epoch": 0.779633533322523, "grad_norm": 2.6179025173187256, "learning_rate": 0.00013453259070214012, "loss": 2.4039, "step": 1202 }, { "epoch": 0.7802821469109777, "grad_norm": 2.6602606773376465, "learning_rate": 0.00013443666081899567, "loss": 2.3777, "step": 1203 }, { "epoch": 0.7809307604994324, "grad_norm": 2.419031858444214, "learning_rate": 0.00013434069496812243, "loss": 2.421, "step": 1204 }, { "epoch": 0.7815793740878871, "grad_norm": 2.3668181896209717, "learning_rate": 0.00013424469324975298, "loss": 2.4142, "step": 1205 }, { "epoch": 0.7822279876763418, "grad_norm": 2.29980206489563, "learning_rate": 0.00013414865576415728, "loss": 2.0517, "step": 1206 }, { "epoch": 0.7828766012647965, "grad_norm": 2.560058355331421, "learning_rate": 0.00013405258261164275, "loss": 2.2945, "step": 1207 }, { "epoch": 0.7835252148532512, "grad_norm": 2.4130055904388428, "learning_rate": 0.00013395647389255396, "loss": 2.1076, "step": 1208 }, { "epoch": 0.7841738284417058, "grad_norm": 3.231400489807129, "learning_rate": 0.00013386032970727263, "loss": 2.5379, "step": 1209 }, { "epoch": 0.7848224420301605, "grad_norm": 2.077465057373047, "learning_rate": 0.00013376415015621754, "loss": 2.196, "step": 1210 }, { "epoch": 0.7854710556186152, "grad_norm": 2.7595620155334473, "learning_rate": 0.0001336679353398445, "loss": 2.3361, "step": 1211 }, { "epoch": 0.7861196692070699, "grad_norm": 2.4860260486602783, "learning_rate": 0.00013357168535864603, "loss": 2.3604, "step": 1212 }, { "epoch": 0.7867682827955246, "grad_norm": 2.6833608150482178, "learning_rate": 0.00013347540031315145, "loss": 2.7017, "step": 1213 }, { "epoch": 0.7874168963839793, "grad_norm": 2.592604637145996, "learning_rate": 0.00013337908030392663, "loss": 2.3755, "step": 1214 }, { "epoch": 0.788065509972434, "grad_norm": 2.6695525646209717, "learning_rate": 0.00013328272543157405, "loss": 2.4251, "step": 1215 }, { "epoch": 0.7887141235608885, "grad_norm": 2.6484742164611816, "learning_rate": 0.00013318633579673255, "loss": 2.4892, "step": 1216 }, { "epoch": 0.7893627371493432, "grad_norm": 2.5846235752105713, "learning_rate": 0.0001330899115000773, "loss": 2.5035, "step": 1217 }, { "epoch": 0.7900113507377979, "grad_norm": 3.073356866836548, "learning_rate": 0.00013299345264231957, "loss": 2.3143, "step": 1218 }, { "epoch": 0.7906599643262526, "grad_norm": 2.5657074451446533, "learning_rate": 0.00013289695932420693, "loss": 2.3942, "step": 1219 }, { "epoch": 0.7913085779147073, "grad_norm": 2.4712636470794678, "learning_rate": 0.0001328004316465228, "loss": 2.373, "step": 1220 }, { "epoch": 0.791957191503162, "grad_norm": 2.659590005874634, "learning_rate": 0.0001327038697100865, "loss": 2.4869, "step": 1221 }, { "epoch": 0.7926058050916167, "grad_norm": 2.396930456161499, "learning_rate": 0.00013260727361575313, "loss": 2.1233, "step": 1222 }, { "epoch": 0.7932544186800713, "grad_norm": 2.9727060794830322, "learning_rate": 0.00013251064346441355, "loss": 2.2691, "step": 1223 }, { "epoch": 0.793903032268526, "grad_norm": 2.7200422286987305, "learning_rate": 0.00013241397935699406, "loss": 2.7116, "step": 1224 }, { "epoch": 0.7945516458569807, "grad_norm": 2.4535746574401855, "learning_rate": 0.00013231728139445655, "loss": 2.1634, "step": 1225 }, { "epoch": 0.7952002594454354, "grad_norm": 2.258160352706909, "learning_rate": 0.00013222054967779816, "loss": 2.1694, "step": 1226 }, { "epoch": 0.7958488730338901, "grad_norm": 2.7618281841278076, "learning_rate": 0.00013212378430805136, "loss": 2.4903, "step": 1227 }, { "epoch": 0.7964974866223448, "grad_norm": 3.050058603286743, "learning_rate": 0.00013202698538628376, "loss": 2.3809, "step": 1228 }, { "epoch": 0.7971461002107995, "grad_norm": 2.4234302043914795, "learning_rate": 0.000131930153013598, "loss": 2.5015, "step": 1229 }, { "epoch": 0.797794713799254, "grad_norm": 2.250849723815918, "learning_rate": 0.00013183328729113165, "loss": 2.066, "step": 1230 }, { "epoch": 0.7984433273877087, "grad_norm": 2.900230884552002, "learning_rate": 0.00013173638832005713, "loss": 2.7898, "step": 1231 }, { "epoch": 0.7990919409761634, "grad_norm": 2.699423313140869, "learning_rate": 0.00013163945620158155, "loss": 2.4043, "step": 1232 }, { "epoch": 0.7997405545646181, "grad_norm": 2.5210211277008057, "learning_rate": 0.0001315424910369467, "loss": 2.4904, "step": 1233 }, { "epoch": 0.8003891681530728, "grad_norm": 2.376856565475464, "learning_rate": 0.00013144549292742885, "loss": 2.511, "step": 1234 }, { "epoch": 0.8010377817415275, "grad_norm": 2.8324451446533203, "learning_rate": 0.00013134846197433866, "loss": 2.784, "step": 1235 }, { "epoch": 0.8016863953299822, "grad_norm": 2.427661657333374, "learning_rate": 0.00013125139827902115, "loss": 2.4783, "step": 1236 }, { "epoch": 0.8023350089184368, "grad_norm": 2.4515533447265625, "learning_rate": 0.0001311543019428555, "loss": 2.4615, "step": 1237 }, { "epoch": 0.8029836225068915, "grad_norm": 2.6180105209350586, "learning_rate": 0.00013105717306725501, "loss": 2.6502, "step": 1238 }, { "epoch": 0.8036322360953462, "grad_norm": 2.5390195846557617, "learning_rate": 0.00013096001175366692, "loss": 2.2904, "step": 1239 }, { "epoch": 0.8042808496838009, "grad_norm": 2.410534620285034, "learning_rate": 0.00013086281810357236, "loss": 2.3858, "step": 1240 }, { "epoch": 0.8049294632722556, "grad_norm": 2.6688618659973145, "learning_rate": 0.00013076559221848627, "loss": 2.5565, "step": 1241 }, { "epoch": 0.8055780768607103, "grad_norm": 2.645551919937134, "learning_rate": 0.00013066833419995722, "loss": 2.5737, "step": 1242 }, { "epoch": 0.806226690449165, "grad_norm": 3.0038886070251465, "learning_rate": 0.00013057104414956736, "loss": 2.5184, "step": 1243 }, { "epoch": 0.8068753040376195, "grad_norm": 2.292436361312866, "learning_rate": 0.00013047372216893233, "loss": 2.5096, "step": 1244 }, { "epoch": 0.8075239176260742, "grad_norm": 2.426621437072754, "learning_rate": 0.00013037636835970106, "loss": 2.3853, "step": 1245 }, { "epoch": 0.8081725312145289, "grad_norm": 2.409891128540039, "learning_rate": 0.00013027898282355575, "loss": 2.3684, "step": 1246 }, { "epoch": 0.8088211448029836, "grad_norm": 2.8206863403320312, "learning_rate": 0.00013018156566221173, "loss": 2.5703, "step": 1247 }, { "epoch": 0.8094697583914383, "grad_norm": 2.360962390899658, "learning_rate": 0.0001300841169774174, "loss": 2.2034, "step": 1248 }, { "epoch": 0.810118371979893, "grad_norm": 3.016183614730835, "learning_rate": 0.000129986636870954, "loss": 2.4776, "step": 1249 }, { "epoch": 0.8107669855683477, "grad_norm": 2.2755393981933594, "learning_rate": 0.00012988912544463567, "loss": 2.2995, "step": 1250 }, { "epoch": 0.8114155991568023, "grad_norm": 3.014244318008423, "learning_rate": 0.00012979158280030926, "loss": 2.5089, "step": 1251 }, { "epoch": 0.812064212745257, "grad_norm": 2.3656747341156006, "learning_rate": 0.00012969400903985415, "loss": 2.1939, "step": 1252 }, { "epoch": 0.8127128263337117, "grad_norm": 2.6046266555786133, "learning_rate": 0.0001295964042651823, "loss": 2.3183, "step": 1253 }, { "epoch": 0.8133614399221664, "grad_norm": 2.7731997966766357, "learning_rate": 0.00012949876857823805, "loss": 2.211, "step": 1254 }, { "epoch": 0.8140100535106211, "grad_norm": 2.1547765731811523, "learning_rate": 0.000129401102080998, "loss": 2.1934, "step": 1255 }, { "epoch": 0.8146586670990758, "grad_norm": 3.0415127277374268, "learning_rate": 0.00012930340487547087, "loss": 2.4377, "step": 1256 }, { "epoch": 0.8153072806875304, "grad_norm": 2.021466016769409, "learning_rate": 0.00012920567706369758, "loss": 2.1597, "step": 1257 }, { "epoch": 0.815955894275985, "grad_norm": 3.250190258026123, "learning_rate": 0.000129107918747751, "loss": 2.4312, "step": 1258 }, { "epoch": 0.8166045078644397, "grad_norm": 2.783698320388794, "learning_rate": 0.00012901013002973574, "loss": 2.3175, "step": 1259 }, { "epoch": 0.8172531214528944, "grad_norm": 2.6531429290771484, "learning_rate": 0.0001289123110117883, "loss": 2.5148, "step": 1260 }, { "epoch": 0.8179017350413491, "grad_norm": 2.3012263774871826, "learning_rate": 0.0001288144617960768, "loss": 2.3275, "step": 1261 }, { "epoch": 0.8185503486298038, "grad_norm": 2.2047152519226074, "learning_rate": 0.00012871658248480076, "loss": 2.1403, "step": 1262 }, { "epoch": 0.8191989622182585, "grad_norm": 2.4006264209747314, "learning_rate": 0.00012861867318019135, "loss": 2.4712, "step": 1263 }, { "epoch": 0.8198475758067132, "grad_norm": 2.667694568634033, "learning_rate": 0.0001285207339845109, "loss": 2.2733, "step": 1264 }, { "epoch": 0.8204961893951678, "grad_norm": 2.4242568016052246, "learning_rate": 0.00012842276500005305, "loss": 2.4602, "step": 1265 }, { "epoch": 0.8211448029836225, "grad_norm": 2.208620309829712, "learning_rate": 0.00012832476632914253, "loss": 2.308, "step": 1266 }, { "epoch": 0.8217934165720772, "grad_norm": 2.2818360328674316, "learning_rate": 0.00012822673807413504, "loss": 2.3658, "step": 1267 }, { "epoch": 0.8224420301605319, "grad_norm": 2.0980873107910156, "learning_rate": 0.00012812868033741724, "loss": 2.1861, "step": 1268 }, { "epoch": 0.8230906437489866, "grad_norm": 2.4625985622406006, "learning_rate": 0.00012803059322140657, "loss": 2.3836, "step": 1269 }, { "epoch": 0.8237392573374412, "grad_norm": 2.6560587882995605, "learning_rate": 0.00012793247682855113, "loss": 2.2713, "step": 1270 }, { "epoch": 0.8243878709258959, "grad_norm": 2.1424713134765625, "learning_rate": 0.00012783433126132952, "loss": 2.3024, "step": 1271 }, { "epoch": 0.8250364845143505, "grad_norm": 2.5408236980438232, "learning_rate": 0.00012773615662225104, "loss": 2.3144, "step": 1272 }, { "epoch": 0.8256850981028052, "grad_norm": 2.226816177368164, "learning_rate": 0.00012763795301385514, "loss": 2.4711, "step": 1273 }, { "epoch": 0.8263337116912599, "grad_norm": 2.702565908432007, "learning_rate": 0.00012753972053871157, "loss": 2.3688, "step": 1274 }, { "epoch": 0.8269823252797146, "grad_norm": 2.5809807777404785, "learning_rate": 0.00012744145929942033, "loss": 2.3776, "step": 1275 }, { "epoch": 0.8276309388681693, "grad_norm": 1.9908251762390137, "learning_rate": 0.00012734316939861135, "loss": 2.1862, "step": 1276 }, { "epoch": 0.828279552456624, "grad_norm": 2.185211658477783, "learning_rate": 0.00012724485093894457, "loss": 2.2627, "step": 1277 }, { "epoch": 0.8289281660450787, "grad_norm": 2.9738476276397705, "learning_rate": 0.00012714650402310967, "loss": 2.4709, "step": 1278 }, { "epoch": 0.8295767796335333, "grad_norm": 2.5706405639648438, "learning_rate": 0.00012704812875382614, "loss": 2.237, "step": 1279 }, { "epoch": 0.830225393221988, "grad_norm": 2.8274548053741455, "learning_rate": 0.0001269497252338431, "loss": 2.2875, "step": 1280 }, { "epoch": 0.8308740068104427, "grad_norm": 3.408583641052246, "learning_rate": 0.0001268512935659391, "loss": 2.5795, "step": 1281 }, { "epoch": 0.8315226203988973, "grad_norm": 3.543614149093628, "learning_rate": 0.00012675283385292212, "loss": 2.6821, "step": 1282 }, { "epoch": 0.832171233987352, "grad_norm": 2.822291135787964, "learning_rate": 0.00012665434619762937, "loss": 2.4279, "step": 1283 }, { "epoch": 0.8328198475758067, "grad_norm": 3.3415355682373047, "learning_rate": 0.0001265558307029274, "loss": 2.4599, "step": 1284 }, { "epoch": 0.8334684611642614, "grad_norm": 3.0435562133789062, "learning_rate": 0.00012645728747171168, "loss": 2.6046, "step": 1285 }, { "epoch": 0.834117074752716, "grad_norm": 2.548673391342163, "learning_rate": 0.00012635871660690676, "loss": 2.2026, "step": 1286 }, { "epoch": 0.8347656883411707, "grad_norm": 2.464386463165283, "learning_rate": 0.000126260118211466, "loss": 2.4223, "step": 1287 }, { "epoch": 0.8354143019296254, "grad_norm": 2.657822847366333, "learning_rate": 0.00012616149238837146, "loss": 2.3175, "step": 1288 }, { "epoch": 0.8360629155180801, "grad_norm": 2.4349546432495117, "learning_rate": 0.000126062839240634, "loss": 2.3012, "step": 1289 }, { "epoch": 0.8367115291065348, "grad_norm": 2.0549161434173584, "learning_rate": 0.00012596415887129286, "loss": 2.0065, "step": 1290 }, { "epoch": 0.8373601426949895, "grad_norm": 2.4180831909179688, "learning_rate": 0.0001258654513834158, "loss": 2.2861, "step": 1291 }, { "epoch": 0.8380087562834442, "grad_norm": 2.416142702102661, "learning_rate": 0.00012576671688009885, "loss": 2.3555, "step": 1292 }, { "epoch": 0.8386573698718988, "grad_norm": 2.6806013584136963, "learning_rate": 0.00012566795546446632, "loss": 2.3155, "step": 1293 }, { "epoch": 0.8393059834603535, "grad_norm": 2.607876777648926, "learning_rate": 0.00012556916723967062, "loss": 2.4867, "step": 1294 }, { "epoch": 0.8399545970488081, "grad_norm": 2.036471366882324, "learning_rate": 0.00012547035230889213, "loss": 2.2461, "step": 1295 }, { "epoch": 0.8406032106372628, "grad_norm": 2.4334611892700195, "learning_rate": 0.00012537151077533911, "loss": 2.0928, "step": 1296 }, { "epoch": 0.8412518242257175, "grad_norm": 3.007021903991699, "learning_rate": 0.00012527264274224764, "loss": 2.4552, "step": 1297 }, { "epoch": 0.8419004378141722, "grad_norm": 2.514653444290161, "learning_rate": 0.00012517374831288146, "loss": 2.288, "step": 1298 }, { "epoch": 0.8425490514026269, "grad_norm": 2.493276596069336, "learning_rate": 0.00012507482759053186, "loss": 2.4474, "step": 1299 }, { "epoch": 0.8431976649910815, "grad_norm": 2.9481067657470703, "learning_rate": 0.00012497588067851768, "loss": 2.3458, "step": 1300 }, { "epoch": 0.8431976649910815, "eval_loss": 2.4472556114196777, "eval_runtime": 35.0786, "eval_samples_per_second": 58.611, "eval_steps_per_second": 14.653, "step": 1300 }, { "epoch": 0.8438462785795362, "grad_norm": 2.0098061561584473, "learning_rate": 0.000124876907680185, "loss": 2.1488, "step": 1301 }, { "epoch": 0.8444948921679909, "grad_norm": 2.7381789684295654, "learning_rate": 0.00012477790869890723, "loss": 2.3503, "step": 1302 }, { "epoch": 0.8451435057564456, "grad_norm": 2.9024243354797363, "learning_rate": 0.0001246788838380849, "loss": 2.2406, "step": 1303 }, { "epoch": 0.8457921193449003, "grad_norm": 2.852903127670288, "learning_rate": 0.0001245798332011455, "loss": 2.6625, "step": 1304 }, { "epoch": 0.846440732933355, "grad_norm": 2.153883695602417, "learning_rate": 0.00012448075689154353, "loss": 1.9289, "step": 1305 }, { "epoch": 0.8470893465218097, "grad_norm": 2.4564383029937744, "learning_rate": 0.00012438165501276027, "loss": 2.0509, "step": 1306 }, { "epoch": 0.8477379601102643, "grad_norm": 2.9788084030151367, "learning_rate": 0.00012428252766830372, "loss": 2.3384, "step": 1307 }, { "epoch": 0.848386573698719, "grad_norm": 2.659809112548828, "learning_rate": 0.00012418337496170842, "loss": 2.3646, "step": 1308 }, { "epoch": 0.8490351872871736, "grad_norm": 2.709117889404297, "learning_rate": 0.0001240841969965355, "loss": 2.4318, "step": 1309 }, { "epoch": 0.8496838008756283, "grad_norm": 2.656294345855713, "learning_rate": 0.00012398499387637242, "loss": 2.4029, "step": 1310 }, { "epoch": 0.850332414464083, "grad_norm": 2.5548534393310547, "learning_rate": 0.0001238857657048329, "loss": 2.2124, "step": 1311 }, { "epoch": 0.8509810280525377, "grad_norm": 2.234127998352051, "learning_rate": 0.00012378651258555681, "loss": 2.0491, "step": 1312 }, { "epoch": 0.8516296416409924, "grad_norm": 2.901419162750244, "learning_rate": 0.00012368723462221013, "loss": 2.5911, "step": 1313 }, { "epoch": 0.852278255229447, "grad_norm": 2.82485294342041, "learning_rate": 0.00012358793191848472, "loss": 2.5056, "step": 1314 }, { "epoch": 0.8529268688179017, "grad_norm": 2.6819112300872803, "learning_rate": 0.00012348860457809838, "loss": 2.213, "step": 1315 }, { "epoch": 0.8535754824063564, "grad_norm": 2.63716983795166, "learning_rate": 0.00012338925270479454, "loss": 2.2735, "step": 1316 }, { "epoch": 0.8542240959948111, "grad_norm": 2.2220935821533203, "learning_rate": 0.00012328987640234236, "loss": 2.2579, "step": 1317 }, { "epoch": 0.8548727095832658, "grad_norm": 2.3605754375457764, "learning_rate": 0.00012319047577453638, "loss": 2.1064, "step": 1318 }, { "epoch": 0.8555213231717205, "grad_norm": 3.5681967735290527, "learning_rate": 0.00012309105092519665, "loss": 2.7274, "step": 1319 }, { "epoch": 0.8561699367601752, "grad_norm": 2.190582036972046, "learning_rate": 0.0001229916019581685, "loss": 2.2929, "step": 1320 }, { "epoch": 0.8568185503486297, "grad_norm": 2.3365941047668457, "learning_rate": 0.00012289212897732244, "loss": 2.165, "step": 1321 }, { "epoch": 0.8574671639370844, "grad_norm": 3.1625869274139404, "learning_rate": 0.00012279263208655404, "loss": 2.2552, "step": 1322 }, { "epoch": 0.8581157775255391, "grad_norm": 2.7186431884765625, "learning_rate": 0.00012269311138978384, "loss": 2.1678, "step": 1323 }, { "epoch": 0.8587643911139938, "grad_norm": 3.058908700942993, "learning_rate": 0.00012259356699095733, "loss": 2.41, "step": 1324 }, { "epoch": 0.8594130047024485, "grad_norm": 2.504408836364746, "learning_rate": 0.00012249399899404463, "loss": 2.3778, "step": 1325 }, { "epoch": 0.8600616182909032, "grad_norm": 2.87089204788208, "learning_rate": 0.00012239440750304062, "loss": 2.4919, "step": 1326 }, { "epoch": 0.8607102318793579, "grad_norm": 2.2586331367492676, "learning_rate": 0.00012229479262196468, "loss": 2.1072, "step": 1327 }, { "epoch": 0.8613588454678126, "grad_norm": 2.727515697479248, "learning_rate": 0.00012219515445486054, "loss": 2.3569, "step": 1328 }, { "epoch": 0.8620074590562672, "grad_norm": 3.055591106414795, "learning_rate": 0.00012209549310579636, "loss": 2.7093, "step": 1329 }, { "epoch": 0.8626560726447219, "grad_norm": 2.563161611557007, "learning_rate": 0.00012199580867886451, "loss": 2.3266, "step": 1330 }, { "epoch": 0.8633046862331766, "grad_norm": 3.566985845565796, "learning_rate": 0.00012189610127818138, "loss": 2.2936, "step": 1331 }, { "epoch": 0.8639532998216313, "grad_norm": 2.773505687713623, "learning_rate": 0.00012179637100788739, "loss": 2.3725, "step": 1332 }, { "epoch": 0.864601913410086, "grad_norm": 2.55859375, "learning_rate": 0.0001216966179721469, "loss": 2.2233, "step": 1333 }, { "epoch": 0.8652505269985407, "grad_norm": 3.174692392349243, "learning_rate": 0.00012159684227514798, "loss": 2.3059, "step": 1334 }, { "epoch": 0.8658991405869954, "grad_norm": 2.622833251953125, "learning_rate": 0.00012149704402110243, "loss": 2.3714, "step": 1335 }, { "epoch": 0.8665477541754499, "grad_norm": 2.7736048698425293, "learning_rate": 0.0001213972233142455, "loss": 2.2567, "step": 1336 }, { "epoch": 0.8671963677639046, "grad_norm": 2.366366386413574, "learning_rate": 0.00012129738025883606, "loss": 2.216, "step": 1337 }, { "epoch": 0.8678449813523593, "grad_norm": 2.6889476776123047, "learning_rate": 0.00012119751495915617, "loss": 2.1073, "step": 1338 }, { "epoch": 0.868493594940814, "grad_norm": 2.2877416610717773, "learning_rate": 0.00012109762751951118, "loss": 2.0964, "step": 1339 }, { "epoch": 0.8691422085292687, "grad_norm": 2.254711151123047, "learning_rate": 0.00012099771804422956, "loss": 2.1213, "step": 1340 }, { "epoch": 0.8697908221177234, "grad_norm": 2.7491915225982666, "learning_rate": 0.00012089778663766286, "loss": 2.444, "step": 1341 }, { "epoch": 0.8704394357061781, "grad_norm": 2.2575454711914062, "learning_rate": 0.00012079783340418543, "loss": 2.0181, "step": 1342 }, { "epoch": 0.8710880492946327, "grad_norm": 3.0659279823303223, "learning_rate": 0.00012069785844819446, "loss": 2.4101, "step": 1343 }, { "epoch": 0.8717366628830874, "grad_norm": 2.737976551055908, "learning_rate": 0.00012059786187410984, "loss": 2.2344, "step": 1344 }, { "epoch": 0.8723852764715421, "grad_norm": 2.3673593997955322, "learning_rate": 0.00012049784378637406, "loss": 2.1484, "step": 1345 }, { "epoch": 0.8730338900599968, "grad_norm": 2.6234099864959717, "learning_rate": 0.00012039780428945202, "loss": 2.2664, "step": 1346 }, { "epoch": 0.8736825036484515, "grad_norm": 3.783639669418335, "learning_rate": 0.00012029774348783105, "loss": 2.5983, "step": 1347 }, { "epoch": 0.8743311172369062, "grad_norm": 2.597580909729004, "learning_rate": 0.00012019766148602062, "loss": 2.3014, "step": 1348 }, { "epoch": 0.8749797308253608, "grad_norm": 2.445458173751831, "learning_rate": 0.0001200975583885525, "loss": 2.1725, "step": 1349 }, { "epoch": 0.8756283444138154, "grad_norm": 2.6002631187438965, "learning_rate": 0.00011999743429998036, "loss": 2.3073, "step": 1350 }, { "epoch": 0.8762769580022701, "grad_norm": 2.409813165664673, "learning_rate": 0.00011989728932487988, "loss": 2.1673, "step": 1351 }, { "epoch": 0.8769255715907248, "grad_norm": 2.99670672416687, "learning_rate": 0.00011979712356784853, "loss": 2.3082, "step": 1352 }, { "epoch": 0.8775741851791795, "grad_norm": 3.4744348526000977, "learning_rate": 0.00011969693713350545, "loss": 2.6459, "step": 1353 }, { "epoch": 0.8782227987676342, "grad_norm": 2.5937678813934326, "learning_rate": 0.0001195967301264914, "loss": 2.2489, "step": 1354 }, { "epoch": 0.8788714123560889, "grad_norm": 3.1032259464263916, "learning_rate": 0.00011949650265146863, "loss": 2.4483, "step": 1355 }, { "epoch": 0.8795200259445436, "grad_norm": 2.8258488178253174, "learning_rate": 0.00011939625481312075, "loss": 2.0937, "step": 1356 }, { "epoch": 0.8801686395329982, "grad_norm": 2.7051444053649902, "learning_rate": 0.00011929598671615272, "loss": 2.1481, "step": 1357 }, { "epoch": 0.8808172531214529, "grad_norm": 3.1604318618774414, "learning_rate": 0.00011919569846529057, "loss": 2.3598, "step": 1358 }, { "epoch": 0.8814658667099076, "grad_norm": 2.950549364089966, "learning_rate": 0.00011909539016528141, "loss": 2.4348, "step": 1359 }, { "epoch": 0.8821144802983623, "grad_norm": 3.1726768016815186, "learning_rate": 0.00011899506192089327, "loss": 2.262, "step": 1360 }, { "epoch": 0.882763093886817, "grad_norm": 2.3717453479766846, "learning_rate": 0.00011889471383691506, "loss": 2.3302, "step": 1361 }, { "epoch": 0.8834117074752716, "grad_norm": 2.686929225921631, "learning_rate": 0.00011879434601815634, "loss": 2.3075, "step": 1362 }, { "epoch": 0.8840603210637263, "grad_norm": 2.7507166862487793, "learning_rate": 0.00011869395856944734, "loss": 2.1629, "step": 1363 }, { "epoch": 0.8847089346521809, "grad_norm": 2.963787794113159, "learning_rate": 0.00011859355159563876, "loss": 2.3872, "step": 1364 }, { "epoch": 0.8853575482406356, "grad_norm": 2.6169145107269287, "learning_rate": 0.00011849312520160172, "loss": 2.2173, "step": 1365 }, { "epoch": 0.8860061618290903, "grad_norm": 2.5377180576324463, "learning_rate": 0.00011839267949222765, "loss": 2.2952, "step": 1366 }, { "epoch": 0.886654775417545, "grad_norm": 2.741016149520874, "learning_rate": 0.00011829221457242809, "loss": 2.2287, "step": 1367 }, { "epoch": 0.8873033890059997, "grad_norm": 2.678703546524048, "learning_rate": 0.00011819173054713466, "loss": 2.0928, "step": 1368 }, { "epoch": 0.8879520025944544, "grad_norm": 2.8006253242492676, "learning_rate": 0.00011809122752129895, "loss": 2.2164, "step": 1369 }, { "epoch": 0.8886006161829091, "grad_norm": 2.8502776622772217, "learning_rate": 0.0001179907055998924, "loss": 2.2518, "step": 1370 }, { "epoch": 0.8892492297713637, "grad_norm": 2.929751396179199, "learning_rate": 0.00011789016488790616, "loss": 2.2639, "step": 1371 }, { "epoch": 0.8898978433598184, "grad_norm": 2.7852799892425537, "learning_rate": 0.00011778960549035101, "loss": 2.6305, "step": 1372 }, { "epoch": 0.8905464569482731, "grad_norm": 2.3066766262054443, "learning_rate": 0.0001176890275122573, "loss": 2.1192, "step": 1373 }, { "epoch": 0.8911950705367278, "grad_norm": 2.774937391281128, "learning_rate": 0.00011758843105867474, "loss": 2.4058, "step": 1374 }, { "epoch": 0.8918436841251824, "grad_norm": 2.1911017894744873, "learning_rate": 0.00011748781623467231, "loss": 2.1424, "step": 1375 }, { "epoch": 0.8924922977136371, "grad_norm": 2.5702404975891113, "learning_rate": 0.00011738718314533826, "loss": 2.1588, "step": 1376 }, { "epoch": 0.8931409113020918, "grad_norm": 2.8232014179229736, "learning_rate": 0.00011728653189577982, "loss": 2.2131, "step": 1377 }, { "epoch": 0.8937895248905464, "grad_norm": 2.727078914642334, "learning_rate": 0.00011718586259112326, "loss": 2.3537, "step": 1378 }, { "epoch": 0.8944381384790011, "grad_norm": 3.2957241535186768, "learning_rate": 0.00011708517533651367, "loss": 2.4289, "step": 1379 }, { "epoch": 0.8950867520674558, "grad_norm": 2.6409926414489746, "learning_rate": 0.0001169844702371149, "loss": 2.1962, "step": 1380 }, { "epoch": 0.8957353656559105, "grad_norm": 2.246046304702759, "learning_rate": 0.00011688374739810944, "loss": 2.0953, "step": 1381 }, { "epoch": 0.8963839792443652, "grad_norm": 2.4557595252990723, "learning_rate": 0.00011678300692469832, "loss": 2.0189, "step": 1382 }, { "epoch": 0.8970325928328199, "grad_norm": 2.9679806232452393, "learning_rate": 0.00011668224892210098, "loss": 2.1831, "step": 1383 }, { "epoch": 0.8976812064212746, "grad_norm": 2.6609697341918945, "learning_rate": 0.00011658147349555514, "loss": 2.1171, "step": 1384 }, { "epoch": 0.8983298200097292, "grad_norm": 2.3603599071502686, "learning_rate": 0.00011648068075031679, "loss": 2.1292, "step": 1385 }, { "epoch": 0.8989784335981839, "grad_norm": 2.384096622467041, "learning_rate": 0.00011637987079165988, "loss": 2.2376, "step": 1386 }, { "epoch": 0.8996270471866386, "grad_norm": 2.5582213401794434, "learning_rate": 0.0001162790437248765, "loss": 2.1226, "step": 1387 }, { "epoch": 0.9002756607750932, "grad_norm": 2.4638381004333496, "learning_rate": 0.0001161781996552765, "loss": 2.2524, "step": 1388 }, { "epoch": 0.9009242743635479, "grad_norm": 2.3635599613189697, "learning_rate": 0.00011607733868818749, "loss": 1.9257, "step": 1389 }, { "epoch": 0.9015728879520026, "grad_norm": 2.4746227264404297, "learning_rate": 0.00011597646092895478, "loss": 2.4001, "step": 1390 }, { "epoch": 0.9022215015404573, "grad_norm": 2.5965683460235596, "learning_rate": 0.00011587556648294123, "loss": 2.1818, "step": 1391 }, { "epoch": 0.9028701151289119, "grad_norm": 2.6644184589385986, "learning_rate": 0.00011577465545552703, "loss": 2.3987, "step": 1392 }, { "epoch": 0.9035187287173666, "grad_norm": 3.5527167320251465, "learning_rate": 0.0001156737279521098, "loss": 2.4003, "step": 1393 }, { "epoch": 0.9041673423058213, "grad_norm": 2.1018991470336914, "learning_rate": 0.00011557278407810431, "loss": 2.29, "step": 1394 }, { "epoch": 0.904815955894276, "grad_norm": 2.4119088649749756, "learning_rate": 0.00011547182393894243, "loss": 1.9079, "step": 1395 }, { "epoch": 0.9054645694827307, "grad_norm": 3.1918206214904785, "learning_rate": 0.00011537084764007302, "loss": 2.3082, "step": 1396 }, { "epoch": 0.9061131830711854, "grad_norm": 2.2965595722198486, "learning_rate": 0.00011526985528696184, "loss": 1.9587, "step": 1397 }, { "epoch": 0.9067617966596401, "grad_norm": 2.67322039604187, "learning_rate": 0.00011516884698509143, "loss": 2.5107, "step": 1398 }, { "epoch": 0.9074104102480947, "grad_norm": 2.3477673530578613, "learning_rate": 0.00011506782283996091, "loss": 2.0445, "step": 1399 }, { "epoch": 0.9080590238365494, "grad_norm": 2.4770901203155518, "learning_rate": 0.000114966782957086, "loss": 2.3761, "step": 1400 }, { "epoch": 0.9080590238365494, "eval_loss": 2.37962007522583, "eval_runtime": 35.1239, "eval_samples_per_second": 58.536, "eval_steps_per_second": 14.634, "step": 1400 }, { "epoch": 0.908707637425004, "grad_norm": 3.1679563522338867, "learning_rate": 0.00011486572744199895, "loss": 2.3374, "step": 1401 }, { "epoch": 0.9093562510134587, "grad_norm": 2.86329984664917, "learning_rate": 0.00011476465640024814, "loss": 2.0861, "step": 1402 }, { "epoch": 0.9100048646019134, "grad_norm": 2.8228843212127686, "learning_rate": 0.00011466356993739831, "loss": 2.3439, "step": 1403 }, { "epoch": 0.9106534781903681, "grad_norm": 3.723388433456421, "learning_rate": 0.00011456246815903027, "loss": 2.4151, "step": 1404 }, { "epoch": 0.9113020917788228, "grad_norm": 2.5116336345672607, "learning_rate": 0.00011446135117074079, "loss": 2.1727, "step": 1405 }, { "epoch": 0.9119507053672774, "grad_norm": 2.8641600608825684, "learning_rate": 0.00011436021907814263, "loss": 2.3122, "step": 1406 }, { "epoch": 0.9125993189557321, "grad_norm": 2.8923842906951904, "learning_rate": 0.00011425907198686419, "loss": 2.1642, "step": 1407 }, { "epoch": 0.9132479325441868, "grad_norm": 2.6967973709106445, "learning_rate": 0.00011415791000254964, "loss": 2.3025, "step": 1408 }, { "epoch": 0.9138965461326415, "grad_norm": 2.8836705684661865, "learning_rate": 0.00011405673323085866, "loss": 2.2665, "step": 1409 }, { "epoch": 0.9145451597210962, "grad_norm": 2.9426329135894775, "learning_rate": 0.00011395554177746638, "loss": 2.323, "step": 1410 }, { "epoch": 0.9151937733095509, "grad_norm": 2.8421013355255127, "learning_rate": 0.00011385433574806327, "loss": 2.2184, "step": 1411 }, { "epoch": 0.9158423868980056, "grad_norm": 3.0411062240600586, "learning_rate": 0.00011375311524835501, "loss": 2.4687, "step": 1412 }, { "epoch": 0.9164910004864602, "grad_norm": 2.386579990386963, "learning_rate": 0.0001136518803840624, "loss": 2.0759, "step": 1413 }, { "epoch": 0.9171396140749148, "grad_norm": 2.822800397872925, "learning_rate": 0.00011355063126092127, "loss": 2.3796, "step": 1414 }, { "epoch": 0.9177882276633695, "grad_norm": 2.5839641094207764, "learning_rate": 0.00011344936798468228, "loss": 2.0011, "step": 1415 }, { "epoch": 0.9184368412518242, "grad_norm": 2.960855722427368, "learning_rate": 0.00011334809066111098, "loss": 2.2969, "step": 1416 }, { "epoch": 0.9190854548402789, "grad_norm": 2.875777006149292, "learning_rate": 0.00011324679939598748, "loss": 2.4424, "step": 1417 }, { "epoch": 0.9197340684287336, "grad_norm": 3.0501344203948975, "learning_rate": 0.0001131454942951065, "loss": 2.3683, "step": 1418 }, { "epoch": 0.9203826820171883, "grad_norm": 2.374938488006592, "learning_rate": 0.00011304417546427722, "loss": 2.2456, "step": 1419 }, { "epoch": 0.9210312956056429, "grad_norm": 2.8600172996520996, "learning_rate": 0.00011294284300932309, "loss": 2.1685, "step": 1420 }, { "epoch": 0.9216799091940976, "grad_norm": 3.081756830215454, "learning_rate": 0.00011284149703608192, "loss": 2.3933, "step": 1421 }, { "epoch": 0.9223285227825523, "grad_norm": 2.341628313064575, "learning_rate": 0.0001127401376504055, "loss": 2.165, "step": 1422 }, { "epoch": 0.922977136371007, "grad_norm": 3.0492992401123047, "learning_rate": 0.00011263876495815979, "loss": 2.4199, "step": 1423 }, { "epoch": 0.9236257499594617, "grad_norm": 2.391216278076172, "learning_rate": 0.00011253737906522448, "loss": 2.3134, "step": 1424 }, { "epoch": 0.9242743635479164, "grad_norm": 2.4839046001434326, "learning_rate": 0.00011243598007749314, "loss": 2.1542, "step": 1425 }, { "epoch": 0.9249229771363711, "grad_norm": 2.4912936687469482, "learning_rate": 0.000112334568100873, "loss": 1.7934, "step": 1426 }, { "epoch": 0.9255715907248256, "grad_norm": 2.9923453330993652, "learning_rate": 0.00011223314324128482, "loss": 2.1803, "step": 1427 }, { "epoch": 0.9262202043132803, "grad_norm": 2.87125825881958, "learning_rate": 0.0001121317056046629, "loss": 2.2536, "step": 1428 }, { "epoch": 0.926868817901735, "grad_norm": 2.7513790130615234, "learning_rate": 0.00011203025529695481, "loss": 2.0449, "step": 1429 }, { "epoch": 0.9275174314901897, "grad_norm": 3.287181854248047, "learning_rate": 0.00011192879242412137, "loss": 2.1261, "step": 1430 }, { "epoch": 0.9281660450786444, "grad_norm": 2.913548231124878, "learning_rate": 0.00011182731709213659, "loss": 2.2672, "step": 1431 }, { "epoch": 0.9288146586670991, "grad_norm": 2.636482000350952, "learning_rate": 0.00011172582940698736, "loss": 2.3362, "step": 1432 }, { "epoch": 0.9294632722555538, "grad_norm": 2.975858449935913, "learning_rate": 0.0001116243294746736, "loss": 2.3561, "step": 1433 }, { "epoch": 0.9301118858440084, "grad_norm": 2.8193869590759277, "learning_rate": 0.00011152281740120795, "loss": 2.274, "step": 1434 }, { "epoch": 0.9307604994324631, "grad_norm": 2.9799084663391113, "learning_rate": 0.0001114212932926158, "loss": 2.1122, "step": 1435 }, { "epoch": 0.9314091130209178, "grad_norm": 2.929915428161621, "learning_rate": 0.00011131975725493493, "loss": 2.2549, "step": 1436 }, { "epoch": 0.9320577266093725, "grad_norm": 2.910146713256836, "learning_rate": 0.00011121820939421585, "loss": 2.1701, "step": 1437 }, { "epoch": 0.9327063401978272, "grad_norm": 3.160346746444702, "learning_rate": 0.00011111664981652121, "loss": 2.3943, "step": 1438 }, { "epoch": 0.9333549537862819, "grad_norm": 2.966280937194824, "learning_rate": 0.000111015078627926, "loss": 2.5059, "step": 1439 }, { "epoch": 0.9340035673747366, "grad_norm": 2.5267651081085205, "learning_rate": 0.0001109134959345173, "loss": 2.2292, "step": 1440 }, { "epoch": 0.9346521809631911, "grad_norm": 2.637760877609253, "learning_rate": 0.00011081190184239419, "loss": 2.487, "step": 1441 }, { "epoch": 0.9353007945516458, "grad_norm": 3.1283209323883057, "learning_rate": 0.0001107102964576677, "loss": 2.6824, "step": 1442 }, { "epoch": 0.9359494081401005, "grad_norm": 2.7695422172546387, "learning_rate": 0.0001106086798864606, "loss": 2.3728, "step": 1443 }, { "epoch": 0.9365980217285552, "grad_norm": 3.2953104972839355, "learning_rate": 0.0001105070522349074, "loss": 2.3574, "step": 1444 }, { "epoch": 0.9372466353170099, "grad_norm": 2.839682102203369, "learning_rate": 0.00011040541360915418, "loss": 2.0427, "step": 1445 }, { "epoch": 0.9378952489054646, "grad_norm": 3.3990519046783447, "learning_rate": 0.0001103037641153584, "loss": 2.6258, "step": 1446 }, { "epoch": 0.9385438624939193, "grad_norm": 2.3462929725646973, "learning_rate": 0.00011020210385968897, "loss": 2.188, "step": 1447 }, { "epoch": 0.9391924760823739, "grad_norm": 2.668840169906616, "learning_rate": 0.00011010043294832601, "loss": 2.1619, "step": 1448 }, { "epoch": 0.9398410896708286, "grad_norm": 2.456411838531494, "learning_rate": 0.00010999875148746075, "loss": 2.1247, "step": 1449 }, { "epoch": 0.9404897032592833, "grad_norm": 4.119238376617432, "learning_rate": 0.00010989705958329546, "loss": 2.5057, "step": 1450 }, { "epoch": 0.941138316847738, "grad_norm": 2.5969901084899902, "learning_rate": 0.00010979535734204327, "loss": 2.141, "step": 1451 }, { "epoch": 0.9417869304361927, "grad_norm": 3.214310646057129, "learning_rate": 0.00010969364486992819, "loss": 2.3752, "step": 1452 }, { "epoch": 0.9424355440246474, "grad_norm": 2.271165609359741, "learning_rate": 0.00010959192227318484, "loss": 2.0017, "step": 1453 }, { "epoch": 0.943084157613102, "grad_norm": 2.786717653274536, "learning_rate": 0.00010949018965805843, "loss": 2.2987, "step": 1454 }, { "epoch": 0.9437327712015566, "grad_norm": 3.335977554321289, "learning_rate": 0.00010938844713080472, "loss": 2.3358, "step": 1455 }, { "epoch": 0.9443813847900113, "grad_norm": 2.8724164962768555, "learning_rate": 0.00010928669479768969, "loss": 2.3065, "step": 1456 }, { "epoch": 0.945029998378466, "grad_norm": 2.8408761024475098, "learning_rate": 0.00010918493276498964, "loss": 2.2499, "step": 1457 }, { "epoch": 0.9456786119669207, "grad_norm": 2.6420578956604004, "learning_rate": 0.00010908316113899097, "loss": 2.062, "step": 1458 }, { "epoch": 0.9463272255553754, "grad_norm": 3.4038913249969482, "learning_rate": 0.00010898138002599015, "loss": 2.3121, "step": 1459 }, { "epoch": 0.9469758391438301, "grad_norm": 2.528416395187378, "learning_rate": 0.00010887958953229349, "loss": 2.0033, "step": 1460 }, { "epoch": 0.9476244527322848, "grad_norm": 2.500972270965576, "learning_rate": 0.00010877778976421715, "loss": 2.2569, "step": 1461 }, { "epoch": 0.9482730663207394, "grad_norm": 2.6802618503570557, "learning_rate": 0.00010867598082808692, "loss": 2.1504, "step": 1462 }, { "epoch": 0.9489216799091941, "grad_norm": 2.8642585277557373, "learning_rate": 0.00010857416283023825, "loss": 2.2568, "step": 1463 }, { "epoch": 0.9495702934976488, "grad_norm": 3.0792908668518066, "learning_rate": 0.00010847233587701598, "loss": 2.2981, "step": 1464 }, { "epoch": 0.9502189070861035, "grad_norm": 2.910794258117676, "learning_rate": 0.00010837050007477432, "loss": 2.4581, "step": 1465 }, { "epoch": 0.9508675206745582, "grad_norm": 3.024482250213623, "learning_rate": 0.00010826865552987677, "loss": 2.4451, "step": 1466 }, { "epoch": 0.9515161342630128, "grad_norm": 3.1212029457092285, "learning_rate": 0.00010816680234869591, "loss": 2.3028, "step": 1467 }, { "epoch": 0.9521647478514675, "grad_norm": 3.2346534729003906, "learning_rate": 0.00010806494063761335, "loss": 2.2808, "step": 1468 }, { "epoch": 0.9528133614399221, "grad_norm": 2.798114538192749, "learning_rate": 0.00010796307050301961, "loss": 2.2759, "step": 1469 }, { "epoch": 0.9534619750283768, "grad_norm": 2.716041088104248, "learning_rate": 0.000107861192051314, "loss": 2.2166, "step": 1470 }, { "epoch": 0.9541105886168315, "grad_norm": 2.692410469055176, "learning_rate": 0.00010775930538890458, "loss": 2.2679, "step": 1471 }, { "epoch": 0.9547592022052862, "grad_norm": 3.0163207054138184, "learning_rate": 0.00010765741062220787, "loss": 2.1849, "step": 1472 }, { "epoch": 0.9554078157937409, "grad_norm": 2.473099946975708, "learning_rate": 0.00010755550785764896, "loss": 1.9016, "step": 1473 }, { "epoch": 0.9560564293821956, "grad_norm": 2.8906147480010986, "learning_rate": 0.00010745359720166125, "loss": 2.4415, "step": 1474 }, { "epoch": 0.9567050429706503, "grad_norm": 2.884521245956421, "learning_rate": 0.00010735167876068639, "loss": 2.1613, "step": 1475 }, { "epoch": 0.9573536565591049, "grad_norm": 2.504941463470459, "learning_rate": 0.00010724975264117415, "loss": 2.0177, "step": 1476 }, { "epoch": 0.9580022701475596, "grad_norm": 3.3962528705596924, "learning_rate": 0.0001071478189495823, "loss": 2.1185, "step": 1477 }, { "epoch": 0.9586508837360143, "grad_norm": 3.3360044956207275, "learning_rate": 0.00010704587779237654, "loss": 2.4986, "step": 1478 }, { "epoch": 0.959299497324469, "grad_norm": 2.7830538749694824, "learning_rate": 0.0001069439292760304, "loss": 2.1103, "step": 1479 }, { "epoch": 0.9599481109129236, "grad_norm": 2.688582181930542, "learning_rate": 0.00010684197350702512, "loss": 2.2462, "step": 1480 }, { "epoch": 0.9605967245013783, "grad_norm": 2.859369993209839, "learning_rate": 0.00010674001059184939, "loss": 2.3044, "step": 1481 }, { "epoch": 0.961245338089833, "grad_norm": 4.421907424926758, "learning_rate": 0.00010663804063699947, "loss": 2.4036, "step": 1482 }, { "epoch": 0.9618939516782876, "grad_norm": 2.192413806915283, "learning_rate": 0.00010653606374897891, "loss": 1.8703, "step": 1483 }, { "epoch": 0.9625425652667423, "grad_norm": 3.2039551734924316, "learning_rate": 0.00010643408003429856, "loss": 2.4661, "step": 1484 }, { "epoch": 0.963191178855197, "grad_norm": 2.080410957336426, "learning_rate": 0.00010633208959947635, "loss": 1.9677, "step": 1485 }, { "epoch": 0.9638397924436517, "grad_norm": 2.931468963623047, "learning_rate": 0.00010623009255103726, "loss": 2.3481, "step": 1486 }, { "epoch": 0.9644884060321064, "grad_norm": 2.418475866317749, "learning_rate": 0.00010612808899551319, "loss": 2.1327, "step": 1487 }, { "epoch": 0.9651370196205611, "grad_norm": 2.494847536087036, "learning_rate": 0.00010602607903944279, "loss": 2.4237, "step": 1488 }, { "epoch": 0.9657856332090158, "grad_norm": 2.632746934890747, "learning_rate": 0.00010592406278937144, "loss": 2.4381, "step": 1489 }, { "epoch": 0.9664342467974704, "grad_norm": 2.8051106929779053, "learning_rate": 0.00010582204035185106, "loss": 2.2001, "step": 1490 }, { "epoch": 0.9670828603859251, "grad_norm": 2.816913366317749, "learning_rate": 0.00010572001183344004, "loss": 2.2778, "step": 1491 }, { "epoch": 0.9677314739743798, "grad_norm": 3.0832695960998535, "learning_rate": 0.00010561797734070316, "loss": 2.3058, "step": 1492 }, { "epoch": 0.9683800875628344, "grad_norm": 3.424374580383301, "learning_rate": 0.00010551593698021136, "loss": 2.4284, "step": 1493 }, { "epoch": 0.9690287011512891, "grad_norm": 3.237190008163452, "learning_rate": 0.00010541389085854176, "loss": 2.4673, "step": 1494 }, { "epoch": 0.9696773147397438, "grad_norm": 2.645200729370117, "learning_rate": 0.00010531183908227753, "loss": 2.2648, "step": 1495 }, { "epoch": 0.9703259283281985, "grad_norm": 2.56109356880188, "learning_rate": 0.00010520978175800771, "loss": 1.9921, "step": 1496 }, { "epoch": 0.9709745419166531, "grad_norm": 3.311659574508667, "learning_rate": 0.00010510771899232712, "loss": 2.485, "step": 1497 }, { "epoch": 0.9716231555051078, "grad_norm": 2.732013463973999, "learning_rate": 0.00010500565089183627, "loss": 2.2056, "step": 1498 }, { "epoch": 0.9722717690935625, "grad_norm": 3.28397798538208, "learning_rate": 0.00010490357756314127, "loss": 2.2909, "step": 1499 }, { "epoch": 0.9729203826820172, "grad_norm": 2.805410385131836, "learning_rate": 0.00010480149911285364, "loss": 2.0236, "step": 1500 }, { "epoch": 0.9729203826820172, "eval_loss": 2.3124866485595703, "eval_runtime": 35.147, "eval_samples_per_second": 58.497, "eval_steps_per_second": 14.624, "step": 1500 }, { "epoch": 0.9735689962704719, "grad_norm": 3.3805577754974365, "learning_rate": 0.00010469941564759027, "loss": 2.469, "step": 1501 }, { "epoch": 0.9742176098589266, "grad_norm": 2.9957621097564697, "learning_rate": 0.00010459732727397336, "loss": 2.1591, "step": 1502 }, { "epoch": 0.9748662234473813, "grad_norm": 3.0814363956451416, "learning_rate": 0.00010449523409863011, "loss": 2.2245, "step": 1503 }, { "epoch": 0.9755148370358359, "grad_norm": 2.3660271167755127, "learning_rate": 0.00010439313622819284, "loss": 1.9534, "step": 1504 }, { "epoch": 0.9761634506242906, "grad_norm": 3.426288604736328, "learning_rate": 0.00010429103376929874, "loss": 2.2492, "step": 1505 }, { "epoch": 0.9768120642127452, "grad_norm": 3.190302848815918, "learning_rate": 0.00010418892682858975, "loss": 2.4716, "step": 1506 }, { "epoch": 0.9774606778011999, "grad_norm": 3.087360382080078, "learning_rate": 0.00010408681551271256, "loss": 2.1342, "step": 1507 }, { "epoch": 0.9781092913896546, "grad_norm": 3.266953468322754, "learning_rate": 0.00010398469992831832, "loss": 2.4686, "step": 1508 }, { "epoch": 0.9787579049781093, "grad_norm": 2.8588433265686035, "learning_rate": 0.00010388258018206283, "loss": 2.0403, "step": 1509 }, { "epoch": 0.979406518566564, "grad_norm": 2.885345458984375, "learning_rate": 0.00010378045638060605, "loss": 2.2458, "step": 1510 }, { "epoch": 0.9800551321550186, "grad_norm": 2.6360859870910645, "learning_rate": 0.00010367832863061227, "loss": 1.9501, "step": 1511 }, { "epoch": 0.9807037457434733, "grad_norm": 3.0104072093963623, "learning_rate": 0.00010357619703874986, "loss": 2.3123, "step": 1512 }, { "epoch": 0.981352359331928, "grad_norm": 2.8297128677368164, "learning_rate": 0.00010347406171169125, "loss": 2.2189, "step": 1513 }, { "epoch": 0.9820009729203827, "grad_norm": 3.027043104171753, "learning_rate": 0.00010337192275611274, "loss": 2.281, "step": 1514 }, { "epoch": 0.9826495865088374, "grad_norm": 3.5282554626464844, "learning_rate": 0.00010326978027869439, "loss": 2.4496, "step": 1515 }, { "epoch": 0.9832982000972921, "grad_norm": 3.6257994174957275, "learning_rate": 0.00010316763438612002, "loss": 2.3929, "step": 1516 }, { "epoch": 0.9839468136857468, "grad_norm": 3.4494125843048096, "learning_rate": 0.00010306548518507695, "loss": 2.2083, "step": 1517 }, { "epoch": 0.9845954272742014, "grad_norm": 3.3319547176361084, "learning_rate": 0.00010296333278225599, "loss": 2.7246, "step": 1518 }, { "epoch": 0.985244040862656, "grad_norm": 2.7109665870666504, "learning_rate": 0.00010286117728435125, "loss": 2.1897, "step": 1519 }, { "epoch": 0.9858926544511107, "grad_norm": 2.7485601902008057, "learning_rate": 0.00010275901879806014, "loss": 1.9106, "step": 1520 }, { "epoch": 0.9865412680395654, "grad_norm": 2.7985074520111084, "learning_rate": 0.00010265685743008311, "loss": 2.1642, "step": 1521 }, { "epoch": 0.9871898816280201, "grad_norm": 1.9505937099456787, "learning_rate": 0.00010255469328712371, "loss": 2.2099, "step": 1522 }, { "epoch": 0.9878384952164748, "grad_norm": 2.9058632850646973, "learning_rate": 0.0001024525264758883, "loss": 2.1847, "step": 1523 }, { "epoch": 0.9884871088049295, "grad_norm": 3.175062656402588, "learning_rate": 0.00010235035710308611, "loss": 2.2391, "step": 1524 }, { "epoch": 0.9891357223933841, "grad_norm": 3.0369863510131836, "learning_rate": 0.00010224818527542899, "loss": 1.9514, "step": 1525 }, { "epoch": 0.9897843359818388, "grad_norm": 3.1444361209869385, "learning_rate": 0.00010214601109963135, "loss": 2.3488, "step": 1526 }, { "epoch": 0.9904329495702935, "grad_norm": 3.477829694747925, "learning_rate": 0.00010204383468241007, "loss": 2.2842, "step": 1527 }, { "epoch": 0.9910815631587482, "grad_norm": 3.400541305541992, "learning_rate": 0.00010194165613048444, "loss": 2.1575, "step": 1528 }, { "epoch": 0.9917301767472029, "grad_norm": 3.2055649757385254, "learning_rate": 0.00010183947555057582, "loss": 2.3558, "step": 1529 }, { "epoch": 0.9923787903356576, "grad_norm": 2.9880716800689697, "learning_rate": 0.0001017372930494078, "loss": 2.1039, "step": 1530 }, { "epoch": 0.9930274039241123, "grad_norm": 3.288238286972046, "learning_rate": 0.00010163510873370601, "loss": 2.4636, "step": 1531 }, { "epoch": 0.9936760175125668, "grad_norm": 2.681642532348633, "learning_rate": 0.0001015329227101979, "loss": 1.9191, "step": 1532 }, { "epoch": 0.9943246311010215, "grad_norm": 2.8686816692352295, "learning_rate": 0.0001014307350856127, "loss": 2.2624, "step": 1533 }, { "epoch": 0.9949732446894762, "grad_norm": 2.733706474304199, "learning_rate": 0.00010132854596668133, "loss": 2.0953, "step": 1534 }, { "epoch": 0.9956218582779309, "grad_norm": 2.374807834625244, "learning_rate": 0.0001012263554601363, "loss": 2.0617, "step": 1535 }, { "epoch": 0.9962704718663856, "grad_norm": 3.8467745780944824, "learning_rate": 0.00010112416367271156, "loss": 2.3556, "step": 1536 }, { "epoch": 0.9969190854548403, "grad_norm": 2.803708791732788, "learning_rate": 0.00010102197071114236, "loss": 2.245, "step": 1537 }, { "epoch": 0.997567699043295, "grad_norm": 3.4458584785461426, "learning_rate": 0.00010091977668216524, "loss": 2.3693, "step": 1538 }, { "epoch": 0.9982163126317496, "grad_norm": 2.929656744003296, "learning_rate": 0.00010081758169251778, "loss": 2.0552, "step": 1539 }, { "epoch": 0.9988649262202043, "grad_norm": 3.5934321880340576, "learning_rate": 0.00010071538584893863, "loss": 2.177, "step": 1540 }, { "epoch": 0.999513539808659, "grad_norm": 3.452805280685425, "learning_rate": 0.00010061318925816728, "loss": 2.114, "step": 1541 }, { "epoch": 1.0001621533971137, "grad_norm": 4.757358074188232, "learning_rate": 0.00010051099202694403, "loss": 2.4829, "step": 1542 }, { "epoch": 1.0008107669855684, "grad_norm": 3.1391735076904297, "learning_rate": 0.00010040879426200982, "loss": 1.9174, "step": 1543 }, { "epoch": 1.001459380574023, "grad_norm": 3.067674398422241, "learning_rate": 0.00010030659607010617, "loss": 1.7887, "step": 1544 }, { "epoch": 1.0021079941624778, "grad_norm": 2.703613758087158, "learning_rate": 0.00010020439755797511, "loss": 1.7965, "step": 1545 }, { "epoch": 1.0027566077509324, "grad_norm": 2.480989456176758, "learning_rate": 0.00010010219883235887, "loss": 1.9512, "step": 1546 }, { "epoch": 1.0034052213393871, "grad_norm": 2.9696717262268066, "learning_rate": 0.0001, "loss": 1.9098, "step": 1547 }, { "epoch": 1.0040538349278418, "grad_norm": 3.463980197906494, "learning_rate": 9.989780116764115e-05, "loss": 2.272, "step": 1548 }, { "epoch": 1.0047024485162963, "grad_norm": 3.322113513946533, "learning_rate": 9.979560244202493e-05, "loss": 2.075, "step": 1549 }, { "epoch": 1.005351062104751, "grad_norm": 2.8073513507843018, "learning_rate": 9.969340392989384e-05, "loss": 2.1272, "step": 1550 }, { "epoch": 1.0059996756932057, "grad_norm": 2.758537530899048, "learning_rate": 9.959120573799022e-05, "loss": 1.925, "step": 1551 }, { "epoch": 1.0066482892816604, "grad_norm": 2.939807415008545, "learning_rate": 9.948900797305601e-05, "loss": 1.969, "step": 1552 }, { "epoch": 1.007296902870115, "grad_norm": 2.7651724815368652, "learning_rate": 9.938681074183277e-05, "loss": 1.9098, "step": 1553 }, { "epoch": 1.0079455164585698, "grad_norm": 2.5402278900146484, "learning_rate": 9.928461415106142e-05, "loss": 1.5434, "step": 1554 }, { "epoch": 1.0085941300470245, "grad_norm": 3.258568286895752, "learning_rate": 9.91824183074822e-05, "loss": 2.0138, "step": 1555 }, { "epoch": 1.0092427436354792, "grad_norm": 2.709629774093628, "learning_rate": 9.908022331783476e-05, "loss": 1.7002, "step": 1556 }, { "epoch": 1.0098913572239339, "grad_norm": 2.930220603942871, "learning_rate": 9.897802928885763e-05, "loss": 1.8909, "step": 1557 }, { "epoch": 1.0105399708123886, "grad_norm": 3.9990627765655518, "learning_rate": 9.887583632728845e-05, "loss": 1.9064, "step": 1558 }, { "epoch": 1.0111885844008432, "grad_norm": 4.014188289642334, "learning_rate": 9.877364453986371e-05, "loss": 2.2555, "step": 1559 }, { "epoch": 1.011837197989298, "grad_norm": 3.5029964447021484, "learning_rate": 9.867145403331869e-05, "loss": 2.0679, "step": 1560 }, { "epoch": 1.0124858115777526, "grad_norm": 2.509709358215332, "learning_rate": 9.856926491438734e-05, "loss": 1.3961, "step": 1561 }, { "epoch": 1.0131344251662073, "grad_norm": 2.852165460586548, "learning_rate": 9.846707728980214e-05, "loss": 1.6452, "step": 1562 }, { "epoch": 1.0137830387546618, "grad_norm": 3.6528263092041016, "learning_rate": 9.8364891266294e-05, "loss": 2.093, "step": 1563 }, { "epoch": 1.0144316523431165, "grad_norm": 2.7031829357147217, "learning_rate": 9.826270695059221e-05, "loss": 1.9237, "step": 1564 }, { "epoch": 1.0150802659315712, "grad_norm": 2.556868314743042, "learning_rate": 9.816052444942422e-05, "loss": 1.5903, "step": 1565 }, { "epoch": 1.0157288795200259, "grad_norm": 2.8520004749298096, "learning_rate": 9.805834386951561e-05, "loss": 1.6552, "step": 1566 }, { "epoch": 1.0163774931084806, "grad_norm": 3.1155073642730713, "learning_rate": 9.795616531758993e-05, "loss": 1.9539, "step": 1567 }, { "epoch": 1.0170261066969353, "grad_norm": 3.119288444519043, "learning_rate": 9.785398890036867e-05, "loss": 1.9254, "step": 1568 }, { "epoch": 1.01767472028539, "grad_norm": 2.404115915298462, "learning_rate": 9.7751814724571e-05, "loss": 1.6682, "step": 1569 }, { "epoch": 1.0183233338738447, "grad_norm": 2.9738729000091553, "learning_rate": 9.764964289691389e-05, "loss": 2.0583, "step": 1570 }, { "epoch": 1.0189719474622994, "grad_norm": 3.5042972564697266, "learning_rate": 9.754747352411169e-05, "loss": 2.004, "step": 1571 }, { "epoch": 1.019620561050754, "grad_norm": 2.8745548725128174, "learning_rate": 9.74453067128763e-05, "loss": 1.6072, "step": 1572 }, { "epoch": 1.0202691746392087, "grad_norm": 3.4794466495513916, "learning_rate": 9.73431425699169e-05, "loss": 2.0015, "step": 1573 }, { "epoch": 1.0209177882276634, "grad_norm": 3.572443962097168, "learning_rate": 9.72409812019399e-05, "loss": 2.0072, "step": 1574 }, { "epoch": 1.0215664018161181, "grad_norm": 2.9821250438690186, "learning_rate": 9.713882271564879e-05, "loss": 1.736, "step": 1575 }, { "epoch": 1.0222150154045728, "grad_norm": 2.8369014263153076, "learning_rate": 9.703666721774402e-05, "loss": 1.8708, "step": 1576 }, { "epoch": 1.0228636289930273, "grad_norm": 2.7988221645355225, "learning_rate": 9.693451481492306e-05, "loss": 1.9037, "step": 1577 }, { "epoch": 1.023512242581482, "grad_norm": 2.8663995265960693, "learning_rate": 9.683236561388e-05, "loss": 1.9212, "step": 1578 }, { "epoch": 1.0241608561699367, "grad_norm": 2.9756734371185303, "learning_rate": 9.673021972130563e-05, "loss": 1.9153, "step": 1579 }, { "epoch": 1.0248094697583914, "grad_norm": 3.4735498428344727, "learning_rate": 9.662807724388731e-05, "loss": 1.9042, "step": 1580 }, { "epoch": 1.025458083346846, "grad_norm": 3.5060696601867676, "learning_rate": 9.652593828830879e-05, "loss": 1.8981, "step": 1581 }, { "epoch": 1.0261066969353008, "grad_norm": 3.037895917892456, "learning_rate": 9.642380296125019e-05, "loss": 1.824, "step": 1582 }, { "epoch": 1.0267553105237555, "grad_norm": 3.1835851669311523, "learning_rate": 9.632167136938774e-05, "loss": 1.9822, "step": 1583 }, { "epoch": 1.0274039241122102, "grad_norm": 2.7022721767425537, "learning_rate": 9.621954361939396e-05, "loss": 1.6392, "step": 1584 }, { "epoch": 1.0280525377006648, "grad_norm": 3.651803970336914, "learning_rate": 9.611741981793717e-05, "loss": 2.1823, "step": 1585 }, { "epoch": 1.0287011512891195, "grad_norm": 3.333892822265625, "learning_rate": 9.601530007168166e-05, "loss": 1.6954, "step": 1586 }, { "epoch": 1.0293497648775742, "grad_norm": 3.276643991470337, "learning_rate": 9.591318448728748e-05, "loss": 2.1116, "step": 1587 }, { "epoch": 1.029998378466029, "grad_norm": 2.830047130584717, "learning_rate": 9.581107317141026e-05, "loss": 1.5735, "step": 1588 }, { "epoch": 1.0306469920544836, "grad_norm": 2.7472352981567383, "learning_rate": 9.57089662307013e-05, "loss": 2.0467, "step": 1589 }, { "epoch": 1.0312956056429383, "grad_norm": 2.9230573177337646, "learning_rate": 9.560686377180718e-05, "loss": 1.8485, "step": 1590 }, { "epoch": 1.0319442192313928, "grad_norm": 2.9430761337280273, "learning_rate": 9.550476590136991e-05, "loss": 1.7057, "step": 1591 }, { "epoch": 1.0325928328198475, "grad_norm": 3.3265395164489746, "learning_rate": 9.540267272602666e-05, "loss": 2.0616, "step": 1592 }, { "epoch": 1.0332414464083022, "grad_norm": 2.731788396835327, "learning_rate": 9.530058435240975e-05, "loss": 1.7425, "step": 1593 }, { "epoch": 1.0338900599967569, "grad_norm": 2.7669782638549805, "learning_rate": 9.519850088714641e-05, "loss": 1.8668, "step": 1594 }, { "epoch": 1.0345386735852116, "grad_norm": 2.9217612743377686, "learning_rate": 9.50964224368588e-05, "loss": 1.7899, "step": 1595 }, { "epoch": 1.0351872871736663, "grad_norm": 4.093738555908203, "learning_rate": 9.499434910816378e-05, "loss": 2.1509, "step": 1596 }, { "epoch": 1.035835900762121, "grad_norm": 2.873826742172241, "learning_rate": 9.489228100767294e-05, "loss": 1.7444, "step": 1597 }, { "epoch": 1.0364845143505756, "grad_norm": 2.9690091609954834, "learning_rate": 9.479021824199229e-05, "loss": 1.656, "step": 1598 }, { "epoch": 1.0371331279390303, "grad_norm": 2.7292227745056152, "learning_rate": 9.468816091772246e-05, "loss": 1.6384, "step": 1599 }, { "epoch": 1.037781741527485, "grad_norm": 3.3454012870788574, "learning_rate": 9.458610914145826e-05, "loss": 1.9383, "step": 1600 }, { "epoch": 1.037781741527485, "eval_loss": 2.2606089115142822, "eval_runtime": 35.0762, "eval_samples_per_second": 58.615, "eval_steps_per_second": 14.654, "step": 1600 }, { "epoch": 1.0384303551159397, "grad_norm": 3.463085412979126, "learning_rate": 9.448406301978866e-05, "loss": 2.1843, "step": 1601 }, { "epoch": 1.0390789687043944, "grad_norm": 2.8203296661376953, "learning_rate": 9.438202265929688e-05, "loss": 1.7024, "step": 1602 }, { "epoch": 1.0397275822928491, "grad_norm": 3.8935906887054443, "learning_rate": 9.427998816655997e-05, "loss": 2.2453, "step": 1603 }, { "epoch": 1.0403761958813038, "grad_norm": 3.650148630142212, "learning_rate": 9.417795964814897e-05, "loss": 1.9086, "step": 1604 }, { "epoch": 1.0410248094697585, "grad_norm": 3.2131192684173584, "learning_rate": 9.407593721062859e-05, "loss": 2.0812, "step": 1605 }, { "epoch": 1.041673423058213, "grad_norm": 3.1165382862091064, "learning_rate": 9.397392096055724e-05, "loss": 1.8238, "step": 1606 }, { "epoch": 1.0423220366466677, "grad_norm": 3.1514523029327393, "learning_rate": 9.387191100448685e-05, "loss": 1.8326, "step": 1607 }, { "epoch": 1.0429706502351224, "grad_norm": 3.85775089263916, "learning_rate": 9.376990744896276e-05, "loss": 2.3191, "step": 1608 }, { "epoch": 1.043619263823577, "grad_norm": 3.088756561279297, "learning_rate": 9.366791040052368e-05, "loss": 1.9159, "step": 1609 }, { "epoch": 1.0442678774120318, "grad_norm": 2.8240625858306885, "learning_rate": 9.356591996570148e-05, "loss": 1.5689, "step": 1610 }, { "epoch": 1.0449164910004864, "grad_norm": 3.2539751529693604, "learning_rate": 9.346393625102114e-05, "loss": 1.9528, "step": 1611 }, { "epoch": 1.0455651045889411, "grad_norm": 3.0957069396972656, "learning_rate": 9.336195936300054e-05, "loss": 2.1473, "step": 1612 }, { "epoch": 1.0462137181773958, "grad_norm": 2.9586756229400635, "learning_rate": 9.325998940815062e-05, "loss": 1.8947, "step": 1613 }, { "epoch": 1.0468623317658505, "grad_norm": 3.2391159534454346, "learning_rate": 9.315802649297488e-05, "loss": 1.9296, "step": 1614 }, { "epoch": 1.0475109453543052, "grad_norm": 2.534924030303955, "learning_rate": 9.305607072396958e-05, "loss": 1.6653, "step": 1615 }, { "epoch": 1.04815955894276, "grad_norm": 2.945472240447998, "learning_rate": 9.295412220762347e-05, "loss": 1.7748, "step": 1616 }, { "epoch": 1.0488081725312146, "grad_norm": 3.704002857208252, "learning_rate": 9.285218105041773e-05, "loss": 2.0761, "step": 1617 }, { "epoch": 1.0494567861196693, "grad_norm": 3.1357100009918213, "learning_rate": 9.275024735882588e-05, "loss": 1.7378, "step": 1618 }, { "epoch": 1.0501053997081238, "grad_norm": 3.197866439819336, "learning_rate": 9.264832123931363e-05, "loss": 2.2471, "step": 1619 }, { "epoch": 1.0507540132965785, "grad_norm": 3.1937649250030518, "learning_rate": 9.254640279833876e-05, "loss": 1.6929, "step": 1620 }, { "epoch": 1.0514026268850332, "grad_norm": 4.325114727020264, "learning_rate": 9.244449214235105e-05, "loss": 2.2179, "step": 1621 }, { "epoch": 1.0520512404734879, "grad_norm": 2.911024570465088, "learning_rate": 9.234258937779215e-05, "loss": 1.5538, "step": 1622 }, { "epoch": 1.0526998540619426, "grad_norm": 3.568424701690674, "learning_rate": 9.224069461109547e-05, "loss": 2.0871, "step": 1623 }, { "epoch": 1.0533484676503972, "grad_norm": 4.273421764373779, "learning_rate": 9.213880794868601e-05, "loss": 2.1506, "step": 1624 }, { "epoch": 1.053997081238852, "grad_norm": 3.561208963394165, "learning_rate": 9.203692949698043e-05, "loss": 2.148, "step": 1625 }, { "epoch": 1.0546456948273066, "grad_norm": 3.430603504180908, "learning_rate": 9.193505936238669e-05, "loss": 2.0024, "step": 1626 }, { "epoch": 1.0552943084157613, "grad_norm": 3.5299758911132812, "learning_rate": 9.18331976513041e-05, "loss": 1.9753, "step": 1627 }, { "epoch": 1.055942922004216, "grad_norm": 3.1212453842163086, "learning_rate": 9.173134447012322e-05, "loss": 1.9789, "step": 1628 }, { "epoch": 1.0565915355926707, "grad_norm": 3.029099225997925, "learning_rate": 9.162949992522569e-05, "loss": 1.5735, "step": 1629 }, { "epoch": 1.0572401491811254, "grad_norm": 3.144139289855957, "learning_rate": 9.152766412298404e-05, "loss": 1.7973, "step": 1630 }, { "epoch": 1.05788876276958, "grad_norm": 3.8304193019866943, "learning_rate": 9.142583716976179e-05, "loss": 2.2012, "step": 1631 }, { "epoch": 1.0585373763580348, "grad_norm": 3.6837079524993896, "learning_rate": 9.132401917191312e-05, "loss": 2.0158, "step": 1632 }, { "epoch": 1.0591859899464895, "grad_norm": 3.7940714359283447, "learning_rate": 9.122221023578287e-05, "loss": 2.2844, "step": 1633 }, { "epoch": 1.059834603534944, "grad_norm": 3.6124017238616943, "learning_rate": 9.112041046770653e-05, "loss": 2.107, "step": 1634 }, { "epoch": 1.0604832171233987, "grad_norm": 2.8182005882263184, "learning_rate": 9.101861997400986e-05, "loss": 1.4453, "step": 1635 }, { "epoch": 1.0611318307118534, "grad_norm": 3.2007431983947754, "learning_rate": 9.091683886100905e-05, "loss": 1.8972, "step": 1636 }, { "epoch": 1.061780444300308, "grad_norm": 3.3300485610961914, "learning_rate": 9.081506723501038e-05, "loss": 2.1854, "step": 1637 }, { "epoch": 1.0624290578887627, "grad_norm": 3.257756471633911, "learning_rate": 9.071330520231033e-05, "loss": 1.7916, "step": 1638 }, { "epoch": 1.0630776714772174, "grad_norm": 4.046993732452393, "learning_rate": 9.061155286919532e-05, "loss": 2.2605, "step": 1639 }, { "epoch": 1.0637262850656721, "grad_norm": 3.3148815631866455, "learning_rate": 9.050981034194158e-05, "loss": 1.909, "step": 1640 }, { "epoch": 1.0643748986541268, "grad_norm": 3.2830381393432617, "learning_rate": 9.040807772681517e-05, "loss": 1.8806, "step": 1641 }, { "epoch": 1.0650235122425815, "grad_norm": 2.94028902053833, "learning_rate": 9.030635513007182e-05, "loss": 1.5503, "step": 1642 }, { "epoch": 1.0656721258310362, "grad_norm": 3.6414058208465576, "learning_rate": 9.020464265795674e-05, "loss": 1.9927, "step": 1643 }, { "epoch": 1.066320739419491, "grad_norm": 3.0167534351348877, "learning_rate": 9.010294041670456e-05, "loss": 1.8539, "step": 1644 }, { "epoch": 1.0669693530079456, "grad_norm": 3.5465660095214844, "learning_rate": 9.000124851253928e-05, "loss": 2.0003, "step": 1645 }, { "epoch": 1.0676179665964003, "grad_norm": 3.4539973735809326, "learning_rate": 8.989956705167401e-05, "loss": 1.6849, "step": 1646 }, { "epoch": 1.0682665801848548, "grad_norm": 3.413740396499634, "learning_rate": 8.979789614031105e-05, "loss": 1.8535, "step": 1647 }, { "epoch": 1.0689151937733095, "grad_norm": 3.4457616806030273, "learning_rate": 8.969623588464163e-05, "loss": 1.847, "step": 1648 }, { "epoch": 1.0695638073617642, "grad_norm": 3.546393632888794, "learning_rate": 8.959458639084586e-05, "loss": 1.8482, "step": 1649 }, { "epoch": 1.0702124209502188, "grad_norm": 2.86091947555542, "learning_rate": 8.949294776509262e-05, "loss": 1.8877, "step": 1650 }, { "epoch": 1.0708610345386735, "grad_norm": 3.2060577869415283, "learning_rate": 8.939132011353943e-05, "loss": 2.0384, "step": 1651 }, { "epoch": 1.0715096481271282, "grad_norm": 3.089614152908325, "learning_rate": 8.928970354233235e-05, "loss": 1.6746, "step": 1652 }, { "epoch": 1.072158261715583, "grad_norm": 3.161879062652588, "learning_rate": 8.918809815760585e-05, "loss": 1.9112, "step": 1653 }, { "epoch": 1.0728068753040376, "grad_norm": 3.193826913833618, "learning_rate": 8.908650406548275e-05, "loss": 1.811, "step": 1654 }, { "epoch": 1.0734554888924923, "grad_norm": 3.107208490371704, "learning_rate": 8.898492137207399e-05, "loss": 1.8096, "step": 1655 }, { "epoch": 1.074104102480947, "grad_norm": 3.91662335395813, "learning_rate": 8.888335018347878e-05, "loss": 1.9704, "step": 1656 }, { "epoch": 1.0747527160694017, "grad_norm": 2.9578254222869873, "learning_rate": 8.878179060578417e-05, "loss": 1.7025, "step": 1657 }, { "epoch": 1.0754013296578564, "grad_norm": 3.0146644115448, "learning_rate": 8.868024274506505e-05, "loss": 1.8808, "step": 1658 }, { "epoch": 1.076049943246311, "grad_norm": 3.5208287239074707, "learning_rate": 8.857870670738424e-05, "loss": 2.1348, "step": 1659 }, { "epoch": 1.0766985568347658, "grad_norm": 3.313014268875122, "learning_rate": 8.847718259879206e-05, "loss": 1.7466, "step": 1660 }, { "epoch": 1.0773471704232205, "grad_norm": 3.2565646171569824, "learning_rate": 8.83756705253264e-05, "loss": 1.7593, "step": 1661 }, { "epoch": 1.077995784011675, "grad_norm": 3.2182021141052246, "learning_rate": 8.827417059301265e-05, "loss": 1.6764, "step": 1662 }, { "epoch": 1.0786443976001296, "grad_norm": 2.871675968170166, "learning_rate": 8.817268290786343e-05, "loss": 1.8204, "step": 1663 }, { "epoch": 1.0792930111885843, "grad_norm": 3.157430648803711, "learning_rate": 8.807120757587865e-05, "loss": 1.7203, "step": 1664 }, { "epoch": 1.079941624777039, "grad_norm": 3.1763226985931396, "learning_rate": 8.796974470304521e-05, "loss": 2.2859, "step": 1665 }, { "epoch": 1.0805902383654937, "grad_norm": 3.25197696685791, "learning_rate": 8.786829439533712e-05, "loss": 2.2049, "step": 1666 }, { "epoch": 1.0812388519539484, "grad_norm": 2.6246166229248047, "learning_rate": 8.776685675871522e-05, "loss": 1.7751, "step": 1667 }, { "epoch": 1.0818874655424031, "grad_norm": 2.9587862491607666, "learning_rate": 8.766543189912705e-05, "loss": 1.7023, "step": 1668 }, { "epoch": 1.0825360791308578, "grad_norm": 3.871548652648926, "learning_rate": 8.756401992250691e-05, "loss": 1.8907, "step": 1669 }, { "epoch": 1.0831846927193125, "grad_norm": 2.508451461791992, "learning_rate": 8.746262093477553e-05, "loss": 2.0086, "step": 1670 }, { "epoch": 1.0838333063077672, "grad_norm": 3.0046660900115967, "learning_rate": 8.736123504184022e-05, "loss": 1.8008, "step": 1671 }, { "epoch": 1.084481919896222, "grad_norm": 3.2727181911468506, "learning_rate": 8.72598623495945e-05, "loss": 2.1021, "step": 1672 }, { "epoch": 1.0851305334846766, "grad_norm": 3.3095524311065674, "learning_rate": 8.715850296391811e-05, "loss": 2.0405, "step": 1673 }, { "epoch": 1.0857791470731313, "grad_norm": 3.5616602897644043, "learning_rate": 8.705715699067692e-05, "loss": 1.9126, "step": 1674 }, { "epoch": 1.0864277606615858, "grad_norm": 3.621194839477539, "learning_rate": 8.695582453572282e-05, "loss": 1.7997, "step": 1675 }, { "epoch": 1.0870763742500404, "grad_norm": 3.6697275638580322, "learning_rate": 8.685450570489353e-05, "loss": 2.0915, "step": 1676 }, { "epoch": 1.0877249878384951, "grad_norm": 4.016453742980957, "learning_rate": 8.675320060401254e-05, "loss": 1.9676, "step": 1677 }, { "epoch": 1.0883736014269498, "grad_norm": 3.2040913105010986, "learning_rate": 8.665190933888904e-05, "loss": 1.9684, "step": 1678 }, { "epoch": 1.0890222150154045, "grad_norm": 2.899991512298584, "learning_rate": 8.655063201531773e-05, "loss": 1.6359, "step": 1679 }, { "epoch": 1.0896708286038592, "grad_norm": 3.7530014514923096, "learning_rate": 8.644936873907877e-05, "loss": 2.2172, "step": 1680 }, { "epoch": 1.090319442192314, "grad_norm": 3.914391040802002, "learning_rate": 8.634811961593762e-05, "loss": 1.9096, "step": 1681 }, { "epoch": 1.0909680557807686, "grad_norm": 2.807934522628784, "learning_rate": 8.624688475164502e-05, "loss": 1.648, "step": 1682 }, { "epoch": 1.0916166693692233, "grad_norm": 2.4991159439086914, "learning_rate": 8.614566425193676e-05, "loss": 1.4971, "step": 1683 }, { "epoch": 1.092265282957678, "grad_norm": 3.4715545177459717, "learning_rate": 8.604445822253361e-05, "loss": 2.1269, "step": 1684 }, { "epoch": 1.0929138965461327, "grad_norm": 2.9209651947021484, "learning_rate": 8.594326676914132e-05, "loss": 1.8752, "step": 1685 }, { "epoch": 1.0935625101345874, "grad_norm": 2.9050798416137695, "learning_rate": 8.584208999745037e-05, "loss": 1.7498, "step": 1686 }, { "epoch": 1.094211123723042, "grad_norm": 3.491727113723755, "learning_rate": 8.574092801313582e-05, "loss": 1.7328, "step": 1687 }, { "epoch": 1.0948597373114968, "grad_norm": 3.5687386989593506, "learning_rate": 8.56397809218574e-05, "loss": 2.3336, "step": 1688 }, { "epoch": 1.0955083508999515, "grad_norm": 3.013150215148926, "learning_rate": 8.553864882925922e-05, "loss": 1.7177, "step": 1689 }, { "epoch": 1.096156964488406, "grad_norm": 3.3549015522003174, "learning_rate": 8.543753184096976e-05, "loss": 2.7786, "step": 1690 }, { "epoch": 1.0968055780768606, "grad_norm": 2.587169647216797, "learning_rate": 8.533643006260171e-05, "loss": 1.1181, "step": 1691 }, { "epoch": 1.0974541916653153, "grad_norm": 3.5929863452911377, "learning_rate": 8.523534359975189e-05, "loss": 2.0019, "step": 1692 }, { "epoch": 1.09810280525377, "grad_norm": 3.8406782150268555, "learning_rate": 8.513427255800109e-05, "loss": 2.1886, "step": 1693 }, { "epoch": 1.0987514188422247, "grad_norm": 3.071868658065796, "learning_rate": 8.503321704291401e-05, "loss": 1.637, "step": 1694 }, { "epoch": 1.0994000324306794, "grad_norm": 3.554957866668701, "learning_rate": 8.493217716003913e-05, "loss": 2.6372, "step": 1695 }, { "epoch": 1.100048646019134, "grad_norm": 2.5973799228668213, "learning_rate": 8.483115301490862e-05, "loss": 1.4881, "step": 1696 }, { "epoch": 1.1006972596075888, "grad_norm": 3.2988367080688477, "learning_rate": 8.473014471303818e-05, "loss": 1.7535, "step": 1697 }, { "epoch": 1.1013458731960435, "grad_norm": 3.3347957134246826, "learning_rate": 8.4629152359927e-05, "loss": 1.7584, "step": 1698 }, { "epoch": 1.1019944867844982, "grad_norm": 3.197293758392334, "learning_rate": 8.452817606105758e-05, "loss": 1.8049, "step": 1699 }, { "epoch": 1.1026431003729529, "grad_norm": 3.618504524230957, "learning_rate": 8.44272159218957e-05, "loss": 1.8816, "step": 1700 }, { "epoch": 1.1026431003729529, "eval_loss": 2.1879794597625732, "eval_runtime": 35.0955, "eval_samples_per_second": 58.583, "eval_steps_per_second": 14.646, "step": 1700 }, { "epoch": 1.1032917139614076, "grad_norm": 2.7549734115600586, "learning_rate": 8.432627204789022e-05, "loss": 1.7389, "step": 1701 }, { "epoch": 1.1039403275498623, "grad_norm": 2.8839144706726074, "learning_rate": 8.422534454447299e-05, "loss": 1.4802, "step": 1702 }, { "epoch": 1.1045889411383167, "grad_norm": 4.072040557861328, "learning_rate": 8.41244335170588e-05, "loss": 2.1208, "step": 1703 }, { "epoch": 1.1052375547267714, "grad_norm": 3.043339967727661, "learning_rate": 8.402353907104523e-05, "loss": 1.5048, "step": 1704 }, { "epoch": 1.1058861683152261, "grad_norm": 3.51377534866333, "learning_rate": 8.392266131181255e-05, "loss": 2.0098, "step": 1705 }, { "epoch": 1.1065347819036808, "grad_norm": 3.6939690113067627, "learning_rate": 8.382180034472353e-05, "loss": 2.1789, "step": 1706 }, { "epoch": 1.1071833954921355, "grad_norm": 3.0726406574249268, "learning_rate": 8.372095627512353e-05, "loss": 1.5154, "step": 1707 }, { "epoch": 1.1078320090805902, "grad_norm": 2.352734327316284, "learning_rate": 8.362012920834014e-05, "loss": 1.5846, "step": 1708 }, { "epoch": 1.108480622669045, "grad_norm": 2.6789486408233643, "learning_rate": 8.351931924968326e-05, "loss": 1.7513, "step": 1709 }, { "epoch": 1.1091292362574996, "grad_norm": 2.5697178840637207, "learning_rate": 8.341852650444488e-05, "loss": 1.5103, "step": 1710 }, { "epoch": 1.1097778498459543, "grad_norm": 3.963867425918579, "learning_rate": 8.331775107789906e-05, "loss": 1.9428, "step": 1711 }, { "epoch": 1.110426463434409, "grad_norm": 3.482707977294922, "learning_rate": 8.321699307530172e-05, "loss": 2.0756, "step": 1712 }, { "epoch": 1.1110750770228637, "grad_norm": 2.8577158451080322, "learning_rate": 8.311625260189057e-05, "loss": 1.892, "step": 1713 }, { "epoch": 1.1117236906113184, "grad_norm": 3.8578054904937744, "learning_rate": 8.301552976288512e-05, "loss": 1.9592, "step": 1714 }, { "epoch": 1.112372304199773, "grad_norm": 3.5636231899261475, "learning_rate": 8.291482466348634e-05, "loss": 1.9179, "step": 1715 }, { "epoch": 1.1130209177882278, "grad_norm": 3.1694111824035645, "learning_rate": 8.281413740887675e-05, "loss": 1.7542, "step": 1716 }, { "epoch": 1.1136695313766825, "grad_norm": 3.29695725440979, "learning_rate": 8.27134681042202e-05, "loss": 1.892, "step": 1717 }, { "epoch": 1.114318144965137, "grad_norm": 3.617048740386963, "learning_rate": 8.261281685466177e-05, "loss": 2.0342, "step": 1718 }, { "epoch": 1.1149667585535916, "grad_norm": 3.1209442615509033, "learning_rate": 8.25121837653277e-05, "loss": 1.7294, "step": 1719 }, { "epoch": 1.1156153721420463, "grad_norm": 2.974846363067627, "learning_rate": 8.24115689413253e-05, "loss": 1.7431, "step": 1720 }, { "epoch": 1.116263985730501, "grad_norm": 3.3295867443084717, "learning_rate": 8.231097248774274e-05, "loss": 2.0937, "step": 1721 }, { "epoch": 1.1169125993189557, "grad_norm": 2.6555190086364746, "learning_rate": 8.2210394509649e-05, "loss": 1.5566, "step": 1722 }, { "epoch": 1.1175612129074104, "grad_norm": 3.059607744216919, "learning_rate": 8.210983511209388e-05, "loss": 1.8924, "step": 1723 }, { "epoch": 1.118209826495865, "grad_norm": 3.4758877754211426, "learning_rate": 8.200929440010765e-05, "loss": 1.9471, "step": 1724 }, { "epoch": 1.1188584400843198, "grad_norm": 3.1908211708068848, "learning_rate": 8.190877247870108e-05, "loss": 1.9874, "step": 1725 }, { "epoch": 1.1195070536727745, "grad_norm": 2.7493531703948975, "learning_rate": 8.180826945286539e-05, "loss": 1.4748, "step": 1726 }, { "epoch": 1.1201556672612292, "grad_norm": 3.6533265113830566, "learning_rate": 8.170778542757192e-05, "loss": 2.0339, "step": 1727 }, { "epoch": 1.1208042808496839, "grad_norm": 3.0872814655303955, "learning_rate": 8.160732050777235e-05, "loss": 1.6466, "step": 1728 }, { "epoch": 1.1214528944381386, "grad_norm": 3.5196774005889893, "learning_rate": 8.150687479839827e-05, "loss": 2.147, "step": 1729 }, { "epoch": 1.1221015080265933, "grad_norm": 3.603255271911621, "learning_rate": 8.140644840436126e-05, "loss": 2.0698, "step": 1730 }, { "epoch": 1.1227501216150477, "grad_norm": 3.048560380935669, "learning_rate": 8.130604143055268e-05, "loss": 2.061, "step": 1731 }, { "epoch": 1.1233987352035024, "grad_norm": 3.536052942276001, "learning_rate": 8.120565398184369e-05, "loss": 1.6523, "step": 1732 }, { "epoch": 1.1240473487919571, "grad_norm": 3.4674997329711914, "learning_rate": 8.110528616308497e-05, "loss": 1.798, "step": 1733 }, { "epoch": 1.1246959623804118, "grad_norm": 3.437767267227173, "learning_rate": 8.100493807910674e-05, "loss": 1.873, "step": 1734 }, { "epoch": 1.1253445759688665, "grad_norm": 3.3794450759887695, "learning_rate": 8.090460983471861e-05, "loss": 1.827, "step": 1735 }, { "epoch": 1.1259931895573212, "grad_norm": 4.27205753326416, "learning_rate": 8.080430153470945e-05, "loss": 2.1369, "step": 1736 }, { "epoch": 1.1266418031457759, "grad_norm": 3.591155529022217, "learning_rate": 8.07040132838473e-05, "loss": 1.9334, "step": 1737 }, { "epoch": 1.1272904167342306, "grad_norm": 4.0804290771484375, "learning_rate": 8.060374518687926e-05, "loss": 2.4296, "step": 1738 }, { "epoch": 1.1279390303226853, "grad_norm": 3.929328203201294, "learning_rate": 8.050349734853142e-05, "loss": 2.0161, "step": 1739 }, { "epoch": 1.12858764391114, "grad_norm": 3.5734262466430664, "learning_rate": 8.040326987350865e-05, "loss": 2.3901, "step": 1740 }, { "epoch": 1.1292362574995947, "grad_norm": 2.5662105083465576, "learning_rate": 8.030306286649456e-05, "loss": 1.4458, "step": 1741 }, { "epoch": 1.1298848710880494, "grad_norm": 3.518500328063965, "learning_rate": 8.020287643215148e-05, "loss": 1.6458, "step": 1742 }, { "epoch": 1.130533484676504, "grad_norm": 3.902670383453369, "learning_rate": 8.010271067512011e-05, "loss": 2.3542, "step": 1743 }, { "epoch": 1.1311820982649587, "grad_norm": 2.7158684730529785, "learning_rate": 8.000256570001964e-05, "loss": 1.5332, "step": 1744 }, { "epoch": 1.1318307118534134, "grad_norm": 3.4691672325134277, "learning_rate": 7.990244161144752e-05, "loss": 2.1497, "step": 1745 }, { "epoch": 1.132479325441868, "grad_norm": 2.848093271255493, "learning_rate": 7.98023385139794e-05, "loss": 1.5466, "step": 1746 }, { "epoch": 1.1331279390303226, "grad_norm": 3.2027299404144287, "learning_rate": 7.970225651216899e-05, "loss": 1.8938, "step": 1747 }, { "epoch": 1.1337765526187773, "grad_norm": 3.0930826663970947, "learning_rate": 7.960219571054799e-05, "loss": 1.694, "step": 1748 }, { "epoch": 1.134425166207232, "grad_norm": 3.7159385681152344, "learning_rate": 7.950215621362595e-05, "loss": 1.5674, "step": 1749 }, { "epoch": 1.1350737797956867, "grad_norm": 3.7922158241271973, "learning_rate": 7.940213812589018e-05, "loss": 1.6991, "step": 1750 }, { "epoch": 1.1357223933841414, "grad_norm": 3.4015872478485107, "learning_rate": 7.930214155180558e-05, "loss": 1.7586, "step": 1751 }, { "epoch": 1.136371006972596, "grad_norm": 3.61306095123291, "learning_rate": 7.920216659581462e-05, "loss": 1.7758, "step": 1752 }, { "epoch": 1.1370196205610508, "grad_norm": 4.124576091766357, "learning_rate": 7.910221336233719e-05, "loss": 1.5408, "step": 1753 }, { "epoch": 1.1376682341495055, "grad_norm": 4.318497657775879, "learning_rate": 7.900228195577046e-05, "loss": 2.1257, "step": 1754 }, { "epoch": 1.1383168477379602, "grad_norm": 3.9991707801818848, "learning_rate": 7.890237248048886e-05, "loss": 1.9977, "step": 1755 }, { "epoch": 1.1389654613264149, "grad_norm": 3.8899405002593994, "learning_rate": 7.880248504084384e-05, "loss": 1.7637, "step": 1756 }, { "epoch": 1.1396140749148695, "grad_norm": 3.27457332611084, "learning_rate": 7.870261974116395e-05, "loss": 1.8326, "step": 1757 }, { "epoch": 1.1402626885033242, "grad_norm": 3.6218090057373047, "learning_rate": 7.860277668575449e-05, "loss": 1.8657, "step": 1758 }, { "epoch": 1.1409113020917787, "grad_norm": 2.6100270748138428, "learning_rate": 7.85029559788976e-05, "loss": 1.5204, "step": 1759 }, { "epoch": 1.1415599156802334, "grad_norm": 4.04969596862793, "learning_rate": 7.840315772485203e-05, "loss": 2.1493, "step": 1760 }, { "epoch": 1.142208529268688, "grad_norm": 3.818507432937622, "learning_rate": 7.830338202785312e-05, "loss": 2.0508, "step": 1761 }, { "epoch": 1.1428571428571428, "grad_norm": 3.6501264572143555, "learning_rate": 7.820362899211264e-05, "loss": 1.8982, "step": 1762 }, { "epoch": 1.1435057564455975, "grad_norm": 3.2152748107910156, "learning_rate": 7.810389872181864e-05, "loss": 1.509, "step": 1763 }, { "epoch": 1.1441543700340522, "grad_norm": 3.389416217803955, "learning_rate": 7.80041913211355e-05, "loss": 2.0092, "step": 1764 }, { "epoch": 1.1448029836225069, "grad_norm": 2.967541217803955, "learning_rate": 7.790450689420363e-05, "loss": 1.7677, "step": 1765 }, { "epoch": 1.1454515972109616, "grad_norm": 2.999128818511963, "learning_rate": 7.780484554513948e-05, "loss": 1.6206, "step": 1766 }, { "epoch": 1.1461002107994163, "grad_norm": 3.0433456897735596, "learning_rate": 7.770520737803536e-05, "loss": 1.6247, "step": 1767 }, { "epoch": 1.146748824387871, "grad_norm": 3.5057733058929443, "learning_rate": 7.76055924969594e-05, "loss": 1.9273, "step": 1768 }, { "epoch": 1.1473974379763257, "grad_norm": 2.9957528114318848, "learning_rate": 7.75060010059554e-05, "loss": 1.6813, "step": 1769 }, { "epoch": 1.1480460515647803, "grad_norm": 3.606004476547241, "learning_rate": 7.74064330090427e-05, "loss": 1.735, "step": 1770 }, { "epoch": 1.148694665153235, "grad_norm": 3.0331177711486816, "learning_rate": 7.730688861021618e-05, "loss": 1.6498, "step": 1771 }, { "epoch": 1.1493432787416897, "grad_norm": 3.540494203567505, "learning_rate": 7.720736791344599e-05, "loss": 1.9597, "step": 1772 }, { "epoch": 1.1499918923301444, "grad_norm": 3.3496036529541016, "learning_rate": 7.710787102267757e-05, "loss": 1.968, "step": 1773 }, { "epoch": 1.150640505918599, "grad_norm": 3.0179667472839355, "learning_rate": 7.70083980418315e-05, "loss": 1.9445, "step": 1774 }, { "epoch": 1.1512891195070536, "grad_norm": 3.055924415588379, "learning_rate": 7.690894907480336e-05, "loss": 1.927, "step": 1775 }, { "epoch": 1.1519377330955083, "grad_norm": 3.437929630279541, "learning_rate": 7.680952422546365e-05, "loss": 1.8838, "step": 1776 }, { "epoch": 1.152586346683963, "grad_norm": 3.819762945175171, "learning_rate": 7.671012359765768e-05, "loss": 2.0261, "step": 1777 }, { "epoch": 1.1532349602724177, "grad_norm": 3.2069027423858643, "learning_rate": 7.661074729520548e-05, "loss": 1.66, "step": 1778 }, { "epoch": 1.1538835738608724, "grad_norm": 3.4103593826293945, "learning_rate": 7.651139542190164e-05, "loss": 1.7969, "step": 1779 }, { "epoch": 1.154532187449327, "grad_norm": 3.4866232872009277, "learning_rate": 7.64120680815153e-05, "loss": 1.8652, "step": 1780 }, { "epoch": 1.1551808010377818, "grad_norm": 3.7092230319976807, "learning_rate": 7.631276537778992e-05, "loss": 2.1207, "step": 1781 }, { "epoch": 1.1558294146262365, "grad_norm": 3.472687005996704, "learning_rate": 7.621348741444324e-05, "loss": 1.9565, "step": 1782 }, { "epoch": 1.1564780282146911, "grad_norm": 3.288837194442749, "learning_rate": 7.611423429516715e-05, "loss": 2.0108, "step": 1783 }, { "epoch": 1.1571266418031458, "grad_norm": 2.9107871055603027, "learning_rate": 7.601500612362762e-05, "loss": 1.5616, "step": 1784 }, { "epoch": 1.1577752553916005, "grad_norm": 3.23193097114563, "learning_rate": 7.59158030034645e-05, "loss": 1.7925, "step": 1785 }, { "epoch": 1.1584238689800552, "grad_norm": 3.2363452911376953, "learning_rate": 7.581662503829158e-05, "loss": 2.0746, "step": 1786 }, { "epoch": 1.1590724825685097, "grad_norm": 3.243356227874756, "learning_rate": 7.571747233169633e-05, "loss": 1.6092, "step": 1787 }, { "epoch": 1.1597210961569644, "grad_norm": 3.884089469909668, "learning_rate": 7.561834498723974e-05, "loss": 2.1068, "step": 1788 }, { "epoch": 1.160369709745419, "grad_norm": 3.465823173522949, "learning_rate": 7.55192431084565e-05, "loss": 2.003, "step": 1789 }, { "epoch": 1.1610183233338738, "grad_norm": 3.0565216541290283, "learning_rate": 7.542016679885453e-05, "loss": 1.6195, "step": 1790 }, { "epoch": 1.1616669369223285, "grad_norm": 3.5745460987091064, "learning_rate": 7.532111616191514e-05, "loss": 1.8331, "step": 1791 }, { "epoch": 1.1623155505107832, "grad_norm": 2.926725149154663, "learning_rate": 7.522209130109279e-05, "loss": 1.7666, "step": 1792 }, { "epoch": 1.1629641640992379, "grad_norm": 3.663959264755249, "learning_rate": 7.512309231981502e-05, "loss": 2.1074, "step": 1793 }, { "epoch": 1.1636127776876926, "grad_norm": 3.14648699760437, "learning_rate": 7.502411932148236e-05, "loss": 1.7941, "step": 1794 }, { "epoch": 1.1642613912761473, "grad_norm": 4.011041641235352, "learning_rate": 7.492517240946815e-05, "loss": 2.1362, "step": 1795 }, { "epoch": 1.164910004864602, "grad_norm": 3.347409725189209, "learning_rate": 7.48262516871186e-05, "loss": 1.7614, "step": 1796 }, { "epoch": 1.1655586184530566, "grad_norm": 3.747483968734741, "learning_rate": 7.472735725775241e-05, "loss": 1.7117, "step": 1797 }, { "epoch": 1.1662072320415113, "grad_norm": 3.653857707977295, "learning_rate": 7.462848922466092e-05, "loss": 1.8692, "step": 1798 }, { "epoch": 1.166855845629966, "grad_norm": 3.081038236618042, "learning_rate": 7.452964769110786e-05, "loss": 1.5437, "step": 1799 }, { "epoch": 1.1675044592184207, "grad_norm": 2.873063087463379, "learning_rate": 7.443083276032937e-05, "loss": 1.8313, "step": 1800 }, { "epoch": 1.1675044592184207, "eval_loss": 2.1418817043304443, "eval_runtime": 35.1289, "eval_samples_per_second": 58.527, "eval_steps_per_second": 14.632, "step": 1800 }, { "epoch": 1.1681530728068754, "grad_norm": 3.8776912689208984, "learning_rate": 7.433204453553366e-05, "loss": 1.7784, "step": 1801 }, { "epoch": 1.1688016863953299, "grad_norm": 3.7379465103149414, "learning_rate": 7.423328311990116e-05, "loss": 1.9556, "step": 1802 }, { "epoch": 1.1694502999837846, "grad_norm": 3.9918289184570312, "learning_rate": 7.413454861658424e-05, "loss": 1.896, "step": 1803 }, { "epoch": 1.1700989135722393, "grad_norm": 4.035956382751465, "learning_rate": 7.403584112870716e-05, "loss": 2.0763, "step": 1804 }, { "epoch": 1.170747527160694, "grad_norm": 4.307534217834473, "learning_rate": 7.3937160759366e-05, "loss": 2.0676, "step": 1805 }, { "epoch": 1.1713961407491487, "grad_norm": 3.65425968170166, "learning_rate": 7.383850761162853e-05, "loss": 1.6914, "step": 1806 }, { "epoch": 1.1720447543376034, "grad_norm": 3.7392771244049072, "learning_rate": 7.373988178853403e-05, "loss": 2.0052, "step": 1807 }, { "epoch": 1.172693367926058, "grad_norm": 3.6620025634765625, "learning_rate": 7.364128339309326e-05, "loss": 1.7354, "step": 1808 }, { "epoch": 1.1733419815145127, "grad_norm": 3.9420547485351562, "learning_rate": 7.354271252828833e-05, "loss": 2.0831, "step": 1809 }, { "epoch": 1.1739905951029674, "grad_norm": 3.5152528285980225, "learning_rate": 7.344416929707264e-05, "loss": 1.8795, "step": 1810 }, { "epoch": 1.1746392086914221, "grad_norm": 3.746999502182007, "learning_rate": 7.334565380237065e-05, "loss": 1.914, "step": 1811 }, { "epoch": 1.1752878222798768, "grad_norm": 2.676220178604126, "learning_rate": 7.324716614707793e-05, "loss": 1.4885, "step": 1812 }, { "epoch": 1.1759364358683315, "grad_norm": 2.928553819656372, "learning_rate": 7.31487064340609e-05, "loss": 1.6738, "step": 1813 }, { "epoch": 1.1765850494567862, "grad_norm": 3.2144322395324707, "learning_rate": 7.305027476615689e-05, "loss": 2.1104, "step": 1814 }, { "epoch": 1.1772336630452407, "grad_norm": 2.8432633876800537, "learning_rate": 7.295187124617384e-05, "loss": 1.6358, "step": 1815 }, { "epoch": 1.1778822766336954, "grad_norm": 3.4077272415161133, "learning_rate": 7.285349597689035e-05, "loss": 1.6474, "step": 1816 }, { "epoch": 1.17853089022215, "grad_norm": 3.828331470489502, "learning_rate": 7.275514906105547e-05, "loss": 2.0732, "step": 1817 }, { "epoch": 1.1791795038106048, "grad_norm": 3.580864667892456, "learning_rate": 7.265683060138868e-05, "loss": 1.7516, "step": 1818 }, { "epoch": 1.1798281173990595, "grad_norm": 3.3466989994049072, "learning_rate": 7.25585407005797e-05, "loss": 1.6254, "step": 1819 }, { "epoch": 1.1804767309875142, "grad_norm": 3.0358641147613525, "learning_rate": 7.246027946128844e-05, "loss": 1.7323, "step": 1820 }, { "epoch": 1.1811253445759688, "grad_norm": 3.517976999282837, "learning_rate": 7.23620469861449e-05, "loss": 1.9087, "step": 1821 }, { "epoch": 1.1817739581644235, "grad_norm": 3.2168514728546143, "learning_rate": 7.226384337774898e-05, "loss": 1.9275, "step": 1822 }, { "epoch": 1.1824225717528782, "grad_norm": 3.2475240230560303, "learning_rate": 7.216566873867048e-05, "loss": 1.8244, "step": 1823 }, { "epoch": 1.183071185341333, "grad_norm": 3.1169238090515137, "learning_rate": 7.206752317144893e-05, "loss": 1.7604, "step": 1824 }, { "epoch": 1.1837197989297876, "grad_norm": 3.31949520111084, "learning_rate": 7.196940677859346e-05, "loss": 1.7384, "step": 1825 }, { "epoch": 1.1843684125182423, "grad_norm": 3.2452340126037598, "learning_rate": 7.187131966258278e-05, "loss": 1.9212, "step": 1826 }, { "epoch": 1.185017026106697, "grad_norm": 2.961420774459839, "learning_rate": 7.177326192586498e-05, "loss": 1.488, "step": 1827 }, { "epoch": 1.1856656396951517, "grad_norm": 3.0133421421051025, "learning_rate": 7.167523367085749e-05, "loss": 1.9129, "step": 1828 }, { "epoch": 1.1863142532836064, "grad_norm": 3.047515392303467, "learning_rate": 7.157723499994695e-05, "loss": 1.5641, "step": 1829 }, { "epoch": 1.1869628668720609, "grad_norm": 4.146703243255615, "learning_rate": 7.147926601548911e-05, "loss": 1.9863, "step": 1830 }, { "epoch": 1.1876114804605156, "grad_norm": 3.6120903491973877, "learning_rate": 7.138132681980867e-05, "loss": 1.7842, "step": 1831 }, { "epoch": 1.1882600940489703, "grad_norm": 4.044733047485352, "learning_rate": 7.128341751519924e-05, "loss": 1.9796, "step": 1832 }, { "epoch": 1.188908707637425, "grad_norm": 3.40850567817688, "learning_rate": 7.118553820392325e-05, "loss": 1.5536, "step": 1833 }, { "epoch": 1.1895573212258796, "grad_norm": 3.5287880897521973, "learning_rate": 7.10876889882117e-05, "loss": 1.8914, "step": 1834 }, { "epoch": 1.1902059348143343, "grad_norm": 3.7855634689331055, "learning_rate": 7.098986997026428e-05, "loss": 1.9389, "step": 1835 }, { "epoch": 1.190854548402789, "grad_norm": 4.025291442871094, "learning_rate": 7.089208125224903e-05, "loss": 1.859, "step": 1836 }, { "epoch": 1.1915031619912437, "grad_norm": 4.16865873336792, "learning_rate": 7.079432293630244e-05, "loss": 1.9416, "step": 1837 }, { "epoch": 1.1921517755796984, "grad_norm": 3.69059681892395, "learning_rate": 7.069659512452918e-05, "loss": 2.1033, "step": 1838 }, { "epoch": 1.1928003891681531, "grad_norm": 3.3533573150634766, "learning_rate": 7.059889791900207e-05, "loss": 1.6723, "step": 1839 }, { "epoch": 1.1934490027566078, "grad_norm": 3.554982900619507, "learning_rate": 7.0501231421762e-05, "loss": 1.6126, "step": 1840 }, { "epoch": 1.1940976163450625, "grad_norm": 3.2600605487823486, "learning_rate": 7.040359573481774e-05, "loss": 1.8086, "step": 1841 }, { "epoch": 1.1947462299335172, "grad_norm": 3.430548906326294, "learning_rate": 7.030599096014585e-05, "loss": 1.8034, "step": 1842 }, { "epoch": 1.1953948435219717, "grad_norm": 3.1032180786132812, "learning_rate": 7.020841719969076e-05, "loss": 1.4674, "step": 1843 }, { "epoch": 1.1960434571104264, "grad_norm": 3.234858274459839, "learning_rate": 7.011087455536433e-05, "loss": 1.9497, "step": 1844 }, { "epoch": 1.196692070698881, "grad_norm": 3.262610912322998, "learning_rate": 7.0013363129046e-05, "loss": 1.6844, "step": 1845 }, { "epoch": 1.1973406842873358, "grad_norm": 3.3395044803619385, "learning_rate": 6.991588302258262e-05, "loss": 1.9027, "step": 1846 }, { "epoch": 1.1979892978757904, "grad_norm": 4.3748250007629395, "learning_rate": 6.981843433778828e-05, "loss": 2.2491, "step": 1847 }, { "epoch": 1.1986379114642451, "grad_norm": 3.9556214809417725, "learning_rate": 6.972101717644429e-05, "loss": 1.82, "step": 1848 }, { "epoch": 1.1992865250526998, "grad_norm": 3.800960063934326, "learning_rate": 6.962363164029896e-05, "loss": 1.8458, "step": 1849 }, { "epoch": 1.1999351386411545, "grad_norm": 3.597313404083252, "learning_rate": 6.952627783106771e-05, "loss": 1.9514, "step": 1850 }, { "epoch": 1.2005837522296092, "grad_norm": 3.3537988662719727, "learning_rate": 6.942895585043267e-05, "loss": 1.7374, "step": 1851 }, { "epoch": 1.201232365818064, "grad_norm": 3.077211856842041, "learning_rate": 6.933166580004282e-05, "loss": 1.618, "step": 1852 }, { "epoch": 1.2018809794065186, "grad_norm": 3.6400246620178223, "learning_rate": 6.923440778151378e-05, "loss": 1.7998, "step": 1853 }, { "epoch": 1.2025295929949733, "grad_norm": 3.7369632720947266, "learning_rate": 6.913718189642769e-05, "loss": 1.658, "step": 1854 }, { "epoch": 1.203178206583428, "grad_norm": 3.377979040145874, "learning_rate": 6.903998824633312e-05, "loss": 2.0871, "step": 1855 }, { "epoch": 1.2038268201718827, "grad_norm": 2.948380708694458, "learning_rate": 6.894282693274498e-05, "loss": 1.6113, "step": 1856 }, { "epoch": 1.2044754337603374, "grad_norm": 3.4764108657836914, "learning_rate": 6.884569805714447e-05, "loss": 1.8711, "step": 1857 }, { "epoch": 1.2051240473487919, "grad_norm": 2.819741725921631, "learning_rate": 6.874860172097883e-05, "loss": 1.6616, "step": 1858 }, { "epoch": 1.2057726609372466, "grad_norm": 2.8407464027404785, "learning_rate": 6.865153802566133e-05, "loss": 1.8897, "step": 1859 }, { "epoch": 1.2064212745257012, "grad_norm": 3.3834033012390137, "learning_rate": 6.855450707257117e-05, "loss": 1.9087, "step": 1860 }, { "epoch": 1.207069888114156, "grad_norm": 3.0513904094696045, "learning_rate": 6.84575089630533e-05, "loss": 1.5701, "step": 1861 }, { "epoch": 1.2077185017026106, "grad_norm": 3.2753725051879883, "learning_rate": 6.836054379841846e-05, "loss": 1.5959, "step": 1862 }, { "epoch": 1.2083671152910653, "grad_norm": 3.225700855255127, "learning_rate": 6.82636116799429e-05, "loss": 1.9895, "step": 1863 }, { "epoch": 1.20901572887952, "grad_norm": 3.690739631652832, "learning_rate": 6.816671270886836e-05, "loss": 1.8869, "step": 1864 }, { "epoch": 1.2096643424679747, "grad_norm": 3.8880887031555176, "learning_rate": 6.806984698640202e-05, "loss": 1.9326, "step": 1865 }, { "epoch": 1.2103129560564294, "grad_norm": 4.027405261993408, "learning_rate": 6.797301461371625e-05, "loss": 1.9912, "step": 1866 }, { "epoch": 1.210961569644884, "grad_norm": 3.8782289028167725, "learning_rate": 6.787621569194866e-05, "loss": 1.6445, "step": 1867 }, { "epoch": 1.2116101832333388, "grad_norm": 4.589846611022949, "learning_rate": 6.777945032220187e-05, "loss": 2.3369, "step": 1868 }, { "epoch": 1.2122587968217935, "grad_norm": 3.5106916427612305, "learning_rate": 6.76827186055435e-05, "loss": 1.9925, "step": 1869 }, { "epoch": 1.2129074104102482, "grad_norm": 3.6947474479675293, "learning_rate": 6.758602064300597e-05, "loss": 1.8589, "step": 1870 }, { "epoch": 1.2135560239987027, "grad_norm": 3.768291473388672, "learning_rate": 6.748935653558646e-05, "loss": 1.8413, "step": 1871 }, { "epoch": 1.2142046375871574, "grad_norm": 3.442101240158081, "learning_rate": 6.739272638424687e-05, "loss": 2.0776, "step": 1872 }, { "epoch": 1.214853251175612, "grad_norm": 2.8098862171173096, "learning_rate": 6.729613028991353e-05, "loss": 1.3106, "step": 1873 }, { "epoch": 1.2155018647640667, "grad_norm": 3.125943660736084, "learning_rate": 6.719956835347723e-05, "loss": 1.7558, "step": 1874 }, { "epoch": 1.2161504783525214, "grad_norm": 2.91395902633667, "learning_rate": 6.71030406757931e-05, "loss": 1.4933, "step": 1875 }, { "epoch": 1.2167990919409761, "grad_norm": 3.1882665157318115, "learning_rate": 6.700654735768046e-05, "loss": 1.6818, "step": 1876 }, { "epoch": 1.2174477055294308, "grad_norm": 3.260629653930664, "learning_rate": 6.691008849992276e-05, "loss": 2.0246, "step": 1877 }, { "epoch": 1.2180963191178855, "grad_norm": 3.480231285095215, "learning_rate": 6.681366420326747e-05, "loss": 1.7292, "step": 1878 }, { "epoch": 1.2187449327063402, "grad_norm": 3.2030067443847656, "learning_rate": 6.671727456842597e-05, "loss": 1.6421, "step": 1879 }, { "epoch": 1.219393546294795, "grad_norm": 3.2332823276519775, "learning_rate": 6.662091969607339e-05, "loss": 1.7449, "step": 1880 }, { "epoch": 1.2200421598832496, "grad_norm": 4.286528587341309, "learning_rate": 6.652459968684859e-05, "loss": 1.8587, "step": 1881 }, { "epoch": 1.2206907734717043, "grad_norm": 3.3137097358703613, "learning_rate": 6.6428314641354e-05, "loss": 1.5391, "step": 1882 }, { "epoch": 1.221339387060159, "grad_norm": 3.5908029079437256, "learning_rate": 6.633206466015554e-05, "loss": 2.0696, "step": 1883 }, { "epoch": 1.2219880006486137, "grad_norm": 3.581099510192871, "learning_rate": 6.623584984378247e-05, "loss": 1.6594, "step": 1884 }, { "epoch": 1.2226366142370684, "grad_norm": 3.6829140186309814, "learning_rate": 6.613967029272741e-05, "loss": 1.59, "step": 1885 }, { "epoch": 1.223285227825523, "grad_norm": 4.24811315536499, "learning_rate": 6.604352610744605e-05, "loss": 1.8838, "step": 1886 }, { "epoch": 1.2239338414139775, "grad_norm": 2.746631145477295, "learning_rate": 6.594741738835725e-05, "loss": 1.3413, "step": 1887 }, { "epoch": 1.2245824550024322, "grad_norm": 3.5681722164154053, "learning_rate": 6.58513442358427e-05, "loss": 1.9438, "step": 1888 }, { "epoch": 1.225231068590887, "grad_norm": 3.325854539871216, "learning_rate": 6.575530675024704e-05, "loss": 1.8716, "step": 1889 }, { "epoch": 1.2258796821793416, "grad_norm": 3.417414903640747, "learning_rate": 6.56593050318776e-05, "loss": 1.7574, "step": 1890 }, { "epoch": 1.2265282957677963, "grad_norm": 4.2380571365356445, "learning_rate": 6.556333918100436e-05, "loss": 2.1978, "step": 1891 }, { "epoch": 1.227176909356251, "grad_norm": 3.921210527420044, "learning_rate": 6.54674092978599e-05, "loss": 1.9623, "step": 1892 }, { "epoch": 1.2278255229447057, "grad_norm": 2.817203998565674, "learning_rate": 6.537151548263911e-05, "loss": 1.6916, "step": 1893 }, { "epoch": 1.2284741365331604, "grad_norm": 3.2463769912719727, "learning_rate": 6.527565783549936e-05, "loss": 1.4784, "step": 1894 }, { "epoch": 1.229122750121615, "grad_norm": 3.348630905151367, "learning_rate": 6.517983645656014e-05, "loss": 1.9391, "step": 1895 }, { "epoch": 1.2297713637100698, "grad_norm": 3.3819868564605713, "learning_rate": 6.508405144590307e-05, "loss": 1.5138, "step": 1896 }, { "epoch": 1.2304199772985245, "grad_norm": 3.5647220611572266, "learning_rate": 6.498830290357182e-05, "loss": 1.8771, "step": 1897 }, { "epoch": 1.2310685908869792, "grad_norm": 2.8127102851867676, "learning_rate": 6.489259092957193e-05, "loss": 1.6066, "step": 1898 }, { "epoch": 1.2317172044754336, "grad_norm": 3.6445772647857666, "learning_rate": 6.479691562387076e-05, "loss": 1.8968, "step": 1899 }, { "epoch": 1.2323658180638883, "grad_norm": 3.800100564956665, "learning_rate": 6.470127708639742e-05, "loss": 1.9847, "step": 1900 }, { "epoch": 1.2323658180638883, "eval_loss": 2.094085693359375, "eval_runtime": 35.1162, "eval_samples_per_second": 58.548, "eval_steps_per_second": 14.637, "step": 1900 }, { "epoch": 1.233014431652343, "grad_norm": 3.508455753326416, "learning_rate": 6.460567541704256e-05, "loss": 1.846, "step": 1901 }, { "epoch": 1.2336630452407977, "grad_norm": 2.815027952194214, "learning_rate": 6.451011071565834e-05, "loss": 1.4219, "step": 1902 }, { "epoch": 1.2343116588292524, "grad_norm": 4.307042598724365, "learning_rate": 6.441458308205829e-05, "loss": 2.2902, "step": 1903 }, { "epoch": 1.2349602724177071, "grad_norm": 4.005605697631836, "learning_rate": 6.431909261601726e-05, "loss": 1.7675, "step": 1904 }, { "epoch": 1.2356088860061618, "grad_norm": 4.776045799255371, "learning_rate": 6.422363941727126e-05, "loss": 1.8473, "step": 1905 }, { "epoch": 1.2362574995946165, "grad_norm": 4.088295936584473, "learning_rate": 6.412822358551735e-05, "loss": 1.7692, "step": 1906 }, { "epoch": 1.2369061131830712, "grad_norm": 3.2851200103759766, "learning_rate": 6.403284522041364e-05, "loss": 1.6023, "step": 1907 }, { "epoch": 1.237554726771526, "grad_norm": 3.8041205406188965, "learning_rate": 6.3937504421579e-05, "loss": 1.9276, "step": 1908 }, { "epoch": 1.2382033403599806, "grad_norm": 4.022549152374268, "learning_rate": 6.384220128859312e-05, "loss": 1.8326, "step": 1909 }, { "epoch": 1.2388519539484353, "grad_norm": 4.490525722503662, "learning_rate": 6.37469359209964e-05, "loss": 1.7128, "step": 1910 }, { "epoch": 1.23950056753689, "grad_norm": 3.155466318130493, "learning_rate": 6.365170841828972e-05, "loss": 1.6361, "step": 1911 }, { "epoch": 1.2401491811253447, "grad_norm": 3.871823787689209, "learning_rate": 6.355651887993443e-05, "loss": 1.9832, "step": 1912 }, { "epoch": 1.2407977947137994, "grad_norm": 3.618861198425293, "learning_rate": 6.346136740535224e-05, "loss": 1.5877, "step": 1913 }, { "epoch": 1.241446408302254, "grad_norm": 2.9137308597564697, "learning_rate": 6.336625409392505e-05, "loss": 1.6455, "step": 1914 }, { "epoch": 1.2420950218907085, "grad_norm": 3.302041530609131, "learning_rate": 6.327117904499503e-05, "loss": 1.6157, "step": 1915 }, { "epoch": 1.2427436354791632, "grad_norm": 2.973400354385376, "learning_rate": 6.317614235786426e-05, "loss": 1.3795, "step": 1916 }, { "epoch": 1.243392249067618, "grad_norm": 3.974292278289795, "learning_rate": 6.308114413179483e-05, "loss": 2.008, "step": 1917 }, { "epoch": 1.2440408626560726, "grad_norm": 3.872140884399414, "learning_rate": 6.298618446600856e-05, "loss": 1.9316, "step": 1918 }, { "epoch": 1.2446894762445273, "grad_norm": 3.3570444583892822, "learning_rate": 6.289126345968712e-05, "loss": 1.8585, "step": 1919 }, { "epoch": 1.245338089832982, "grad_norm": 2.773864507675171, "learning_rate": 6.279638121197174e-05, "loss": 1.4592, "step": 1920 }, { "epoch": 1.2459867034214367, "grad_norm": 3.365849733352661, "learning_rate": 6.270153782196313e-05, "loss": 1.7536, "step": 1921 }, { "epoch": 1.2466353170098914, "grad_norm": 3.676769495010376, "learning_rate": 6.26067333887215e-05, "loss": 1.8841, "step": 1922 }, { "epoch": 1.247283930598346, "grad_norm": 3.586636781692505, "learning_rate": 6.251196801126629e-05, "loss": 1.96, "step": 1923 }, { "epoch": 1.2479325441868008, "grad_norm": 3.1699347496032715, "learning_rate": 6.24172417885762e-05, "loss": 1.5797, "step": 1924 }, { "epoch": 1.2485811577752555, "grad_norm": 3.2571425437927246, "learning_rate": 6.232255481958902e-05, "loss": 1.6784, "step": 1925 }, { "epoch": 1.2492297713637102, "grad_norm": 3.6678390502929688, "learning_rate": 6.222790720320156e-05, "loss": 1.8337, "step": 1926 }, { "epoch": 1.2498783849521646, "grad_norm": 3.4892377853393555, "learning_rate": 6.213329903826947e-05, "loss": 1.8508, "step": 1927 }, { "epoch": 1.2505269985406193, "grad_norm": 2.786478042602539, "learning_rate": 6.203873042360722e-05, "loss": 1.4537, "step": 1928 }, { "epoch": 1.251175612129074, "grad_norm": 4.075290203094482, "learning_rate": 6.194420145798804e-05, "loss": 2.2873, "step": 1929 }, { "epoch": 1.2518242257175287, "grad_norm": 3.69903564453125, "learning_rate": 6.184971224014369e-05, "loss": 2.1355, "step": 1930 }, { "epoch": 1.2524728393059834, "grad_norm": 3.569366455078125, "learning_rate": 6.175526286876439e-05, "loss": 1.8278, "step": 1931 }, { "epoch": 1.253121452894438, "grad_norm": 3.2949295043945312, "learning_rate": 6.166085344249876e-05, "loss": 1.7428, "step": 1932 }, { "epoch": 1.2537700664828928, "grad_norm": 3.0604870319366455, "learning_rate": 6.156648405995373e-05, "loss": 1.5376, "step": 1933 }, { "epoch": 1.2544186800713475, "grad_norm": 2.9408717155456543, "learning_rate": 6.147215481969435e-05, "loss": 1.4931, "step": 1934 }, { "epoch": 1.2550672936598022, "grad_norm": 3.056992530822754, "learning_rate": 6.137786582024382e-05, "loss": 1.3855, "step": 1935 }, { "epoch": 1.2557159072482569, "grad_norm": 3.8870999813079834, "learning_rate": 6.128361716008324e-05, "loss": 1.7622, "step": 1936 }, { "epoch": 1.2563645208367116, "grad_norm": 4.669191837310791, "learning_rate": 6.118940893765162e-05, "loss": 2.2121, "step": 1937 }, { "epoch": 1.2570131344251663, "grad_norm": 4.113475322723389, "learning_rate": 6.109524125134571e-05, "loss": 1.754, "step": 1938 }, { "epoch": 1.257661748013621, "grad_norm": 3.718883514404297, "learning_rate": 6.100111419951992e-05, "loss": 2.0206, "step": 1939 }, { "epoch": 1.2583103616020757, "grad_norm": 3.817570447921753, "learning_rate": 6.0907027880486234e-05, "loss": 2.0613, "step": 1940 }, { "epoch": 1.2589589751905303, "grad_norm": 3.4652676582336426, "learning_rate": 6.081298239251407e-05, "loss": 1.8895, "step": 1941 }, { "epoch": 1.259607588778985, "grad_norm": 2.9336230754852295, "learning_rate": 6.071897783383027e-05, "loss": 1.4043, "step": 1942 }, { "epoch": 1.2602562023674395, "grad_norm": 3.8246753215789795, "learning_rate": 6.06250143026188e-05, "loss": 1.9911, "step": 1943 }, { "epoch": 1.2609048159558942, "grad_norm": 2.880711078643799, "learning_rate": 6.053109189702093e-05, "loss": 1.4626, "step": 1944 }, { "epoch": 1.261553429544349, "grad_norm": 3.242635726928711, "learning_rate": 6.043721071513484e-05, "loss": 1.787, "step": 1945 }, { "epoch": 1.2622020431328036, "grad_norm": 2.6144473552703857, "learning_rate": 6.0343370855015725e-05, "loss": 1.4048, "step": 1946 }, { "epoch": 1.2628506567212583, "grad_norm": 3.3847968578338623, "learning_rate": 6.0249572414675594e-05, "loss": 1.7854, "step": 1947 }, { "epoch": 1.263499270309713, "grad_norm": 3.7292299270629883, "learning_rate": 6.015581549208322e-05, "loss": 1.7866, "step": 1948 }, { "epoch": 1.2641478838981677, "grad_norm": 3.4160943031311035, "learning_rate": 6.0062100185163986e-05, "loss": 1.9025, "step": 1949 }, { "epoch": 1.2647964974866224, "grad_norm": 3.658334255218506, "learning_rate": 5.996842659179977e-05, "loss": 1.8745, "step": 1950 }, { "epoch": 1.265445111075077, "grad_norm": 3.2078793048858643, "learning_rate": 5.9874794809829026e-05, "loss": 1.6561, "step": 1951 }, { "epoch": 1.2660937246635318, "grad_norm": 3.6777424812316895, "learning_rate": 5.97812049370464e-05, "loss": 1.8133, "step": 1952 }, { "epoch": 1.2667423382519865, "grad_norm": 3.2769994735717773, "learning_rate": 5.96876570712028e-05, "loss": 1.5465, "step": 1953 }, { "epoch": 1.267390951840441, "grad_norm": 3.8856887817382812, "learning_rate": 5.9594151310005275e-05, "loss": 2.0164, "step": 1954 }, { "epoch": 1.2680395654288956, "grad_norm": 3.552018642425537, "learning_rate": 5.9500687751116877e-05, "loss": 1.5889, "step": 1955 }, { "epoch": 1.2686881790173503, "grad_norm": 3.264456033706665, "learning_rate": 5.940726649215661e-05, "loss": 1.5883, "step": 1956 }, { "epoch": 1.269336792605805, "grad_norm": 3.649216413497925, "learning_rate": 5.931388763069924e-05, "loss": 2.0696, "step": 1957 }, { "epoch": 1.2699854061942597, "grad_norm": 3.310223340988159, "learning_rate": 5.9220551264275356e-05, "loss": 1.7099, "step": 1958 }, { "epoch": 1.2706340197827144, "grad_norm": 3.9755542278289795, "learning_rate": 5.9127257490371044e-05, "loss": 2.0587, "step": 1959 }, { "epoch": 1.271282633371169, "grad_norm": 4.148164749145508, "learning_rate": 5.9034006406428025e-05, "loss": 1.9249, "step": 1960 }, { "epoch": 1.2719312469596238, "grad_norm": 3.830314874649048, "learning_rate": 5.89407981098433e-05, "loss": 1.7712, "step": 1961 }, { "epoch": 1.2725798605480785, "grad_norm": 3.3320674896240234, "learning_rate": 5.8847632697969246e-05, "loss": 1.7464, "step": 1962 }, { "epoch": 1.2732284741365332, "grad_norm": 3.0370733737945557, "learning_rate": 5.8754510268113496e-05, "loss": 1.4753, "step": 1963 }, { "epoch": 1.2738770877249879, "grad_norm": 4.017465114593506, "learning_rate": 5.8661430917538685e-05, "loss": 1.9693, "step": 1964 }, { "epoch": 1.2745257013134426, "grad_norm": 3.50209903717041, "learning_rate": 5.856839474346254e-05, "loss": 1.7481, "step": 1965 }, { "epoch": 1.2751743149018973, "grad_norm": 3.6932497024536133, "learning_rate": 5.8475401843057634e-05, "loss": 2.1304, "step": 1966 }, { "epoch": 1.275822928490352, "grad_norm": 4.391726493835449, "learning_rate": 5.838245231345141e-05, "loss": 1.8628, "step": 1967 }, { "epoch": 1.2764715420788066, "grad_norm": 3.4755752086639404, "learning_rate": 5.828954625172597e-05, "loss": 1.8748, "step": 1968 }, { "epoch": 1.2771201556672613, "grad_norm": 3.6913866996765137, "learning_rate": 5.8196683754917957e-05, "loss": 1.7439, "step": 1969 }, { "epoch": 1.277768769255716, "grad_norm": 3.543081045150757, "learning_rate": 5.810386492001861e-05, "loss": 1.63, "step": 1970 }, { "epoch": 1.2784173828441705, "grad_norm": 5.300014972686768, "learning_rate": 5.801108984397354e-05, "loss": 2.2083, "step": 1971 }, { "epoch": 1.2790659964326252, "grad_norm": 3.132617950439453, "learning_rate": 5.791835862368262e-05, "loss": 1.3592, "step": 1972 }, { "epoch": 1.27971461002108, "grad_norm": 3.7092294692993164, "learning_rate": 5.782567135599999e-05, "loss": 1.8477, "step": 1973 }, { "epoch": 1.2803632236095346, "grad_norm": 3.9332468509674072, "learning_rate": 5.773302813773376e-05, "loss": 1.68, "step": 1974 }, { "epoch": 1.2810118371979893, "grad_norm": 3.606058120727539, "learning_rate": 5.764042906564616e-05, "loss": 1.7567, "step": 1975 }, { "epoch": 1.281660450786444, "grad_norm": 3.0142011642456055, "learning_rate": 5.7547874236453205e-05, "loss": 1.61, "step": 1976 }, { "epoch": 1.2823090643748987, "grad_norm": 3.7815258502960205, "learning_rate": 5.74553637468248e-05, "loss": 1.7499, "step": 1977 }, { "epoch": 1.2829576779633534, "grad_norm": 3.697462797164917, "learning_rate": 5.736289769338441e-05, "loss": 1.6691, "step": 1978 }, { "epoch": 1.283606291551808, "grad_norm": 3.5001537799835205, "learning_rate": 5.727047617270924e-05, "loss": 1.7189, "step": 1979 }, { "epoch": 1.2842549051402627, "grad_norm": 3.455692768096924, "learning_rate": 5.717809928132991e-05, "loss": 1.5348, "step": 1980 }, { "epoch": 1.2849035187287174, "grad_norm": 5.130237579345703, "learning_rate": 5.7085767115730374e-05, "loss": 2.1009, "step": 1981 }, { "epoch": 1.285552132317172, "grad_norm": 3.1796584129333496, "learning_rate": 5.699347977234799e-05, "loss": 1.7231, "step": 1982 }, { "epoch": 1.2862007459056266, "grad_norm": 3.455136775970459, "learning_rate": 5.6901237347573144e-05, "loss": 2.0303, "step": 1983 }, { "epoch": 1.2868493594940813, "grad_norm": 3.3914473056793213, "learning_rate": 5.6809039937749496e-05, "loss": 1.7844, "step": 1984 }, { "epoch": 1.287497973082536, "grad_norm": 3.3445935249328613, "learning_rate": 5.671688763917351e-05, "loss": 1.738, "step": 1985 }, { "epoch": 1.2881465866709907, "grad_norm": 3.4687821865081787, "learning_rate": 5.6624780548094636e-05, "loss": 1.6426, "step": 1986 }, { "epoch": 1.2887952002594454, "grad_norm": 3.618929624557495, "learning_rate": 5.653271876071508e-05, "loss": 2.1458, "step": 1987 }, { "epoch": 1.2894438138479, "grad_norm": 3.7624285221099854, "learning_rate": 5.644070237318977e-05, "loss": 1.7929, "step": 1988 }, { "epoch": 1.2900924274363548, "grad_norm": 4.638845443725586, "learning_rate": 5.634873148162618e-05, "loss": 1.9391, "step": 1989 }, { "epoch": 1.2907410410248095, "grad_norm": 3.007538318634033, "learning_rate": 5.625680618208422e-05, "loss": 1.6695, "step": 1990 }, { "epoch": 1.2913896546132642, "grad_norm": 2.9628562927246094, "learning_rate": 5.616492657057627e-05, "loss": 1.5124, "step": 1991 }, { "epoch": 1.2920382682017189, "grad_norm": 3.4413094520568848, "learning_rate": 5.607309274306689e-05, "loss": 1.6985, "step": 1992 }, { "epoch": 1.2926868817901735, "grad_norm": 3.0776283740997314, "learning_rate": 5.5981304795472944e-05, "loss": 1.6798, "step": 1993 }, { "epoch": 1.2933354953786282, "grad_norm": 3.2071259021759033, "learning_rate": 5.5889562823663244e-05, "loss": 1.7205, "step": 1994 }, { "epoch": 1.293984108967083, "grad_norm": 3.1568660736083984, "learning_rate": 5.5797866923458674e-05, "loss": 1.3985, "step": 1995 }, { "epoch": 1.2946327225555376, "grad_norm": 3.8732593059539795, "learning_rate": 5.5706217190632014e-05, "loss": 2.0659, "step": 1996 }, { "epoch": 1.2952813361439923, "grad_norm": 4.650418758392334, "learning_rate": 5.5614613720907705e-05, "loss": 1.937, "step": 1997 }, { "epoch": 1.295929949732447, "grad_norm": 3.7891266345977783, "learning_rate": 5.552305660996202e-05, "loss": 1.4898, "step": 1998 }, { "epoch": 1.2965785633209017, "grad_norm": 4.1587653160095215, "learning_rate": 5.543154595342266e-05, "loss": 1.7931, "step": 1999 }, { "epoch": 1.2972271769093562, "grad_norm": 3.8399646282196045, "learning_rate": 5.534008184686892e-05, "loss": 1.8436, "step": 2000 }, { "epoch": 1.2972271769093562, "eval_loss": 2.043125867843628, "eval_runtime": 35.0774, "eval_samples_per_second": 58.613, "eval_steps_per_second": 14.653, "step": 2000 }, { "epoch": 1.2978757904978109, "grad_norm": 3.1133577823638916, "learning_rate": 5.524866438583143e-05, "loss": 1.4449, "step": 2001 }, { "epoch": 1.2985244040862656, "grad_norm": 3.7278459072113037, "learning_rate": 5.5157293665792164e-05, "loss": 1.8444, "step": 2002 }, { "epoch": 1.2991730176747203, "grad_norm": 4.393215179443359, "learning_rate": 5.5065969782184145e-05, "loss": 1.7674, "step": 2003 }, { "epoch": 1.299821631263175, "grad_norm": 4.188038349151611, "learning_rate": 5.4974692830391574e-05, "loss": 1.7193, "step": 2004 }, { "epoch": 1.3004702448516297, "grad_norm": 3.4436049461364746, "learning_rate": 5.488346290574966e-05, "loss": 1.5855, "step": 2005 }, { "epoch": 1.3011188584400843, "grad_norm": 3.5161731243133545, "learning_rate": 5.479228010354437e-05, "loss": 1.9395, "step": 2006 }, { "epoch": 1.301767472028539, "grad_norm": 3.175755262374878, "learning_rate": 5.4701144519012605e-05, "loss": 1.6176, "step": 2007 }, { "epoch": 1.3024160856169937, "grad_norm": 3.217761516571045, "learning_rate": 5.4610056247341814e-05, "loss": 1.6734, "step": 2008 }, { "epoch": 1.3030646992054484, "grad_norm": 3.924492359161377, "learning_rate": 5.4519015383670145e-05, "loss": 1.5242, "step": 2009 }, { "epoch": 1.303713312793903, "grad_norm": 4.628222465515137, "learning_rate": 5.44280220230861e-05, "loss": 2.3928, "step": 2010 }, { "epoch": 1.3043619263823576, "grad_norm": 3.142000675201416, "learning_rate": 5.43370762606287e-05, "loss": 1.5446, "step": 2011 }, { "epoch": 1.3050105399708123, "grad_norm": 3.327075481414795, "learning_rate": 5.4246178191287215e-05, "loss": 1.7909, "step": 2012 }, { "epoch": 1.305659153559267, "grad_norm": 2.8218281269073486, "learning_rate": 5.4155327910001e-05, "loss": 1.2737, "step": 2013 }, { "epoch": 1.3063077671477217, "grad_norm": 2.8458263874053955, "learning_rate": 5.4064525511659636e-05, "loss": 1.641, "step": 2014 }, { "epoch": 1.3069563807361764, "grad_norm": 3.791328191757202, "learning_rate": 5.39737710911026e-05, "loss": 2.2467, "step": 2015 }, { "epoch": 1.307604994324631, "grad_norm": 3.8327720165252686, "learning_rate": 5.388306474311937e-05, "loss": 1.6769, "step": 2016 }, { "epoch": 1.3082536079130858, "grad_norm": 4.301083564758301, "learning_rate": 5.379240656244901e-05, "loss": 2.1983, "step": 2017 }, { "epoch": 1.3089022215015405, "grad_norm": 3.4168057441711426, "learning_rate": 5.3701796643780524e-05, "loss": 1.6809, "step": 2018 }, { "epoch": 1.3095508350899951, "grad_norm": 3.307873010635376, "learning_rate": 5.3611235081752274e-05, "loss": 1.534, "step": 2019 }, { "epoch": 1.3101994486784498, "grad_norm": 3.939237117767334, "learning_rate": 5.352072197095229e-05, "loss": 2.0292, "step": 2020 }, { "epoch": 1.3108480622669045, "grad_norm": 3.979771852493286, "learning_rate": 5.343025740591796e-05, "loss": 1.6932, "step": 2021 }, { "epoch": 1.3114966758553592, "grad_norm": 3.5983948707580566, "learning_rate": 5.333984148113584e-05, "loss": 1.704, "step": 2022 }, { "epoch": 1.312145289443814, "grad_norm": 3.429224729537964, "learning_rate": 5.3249474291041904e-05, "loss": 1.7802, "step": 2023 }, { "epoch": 1.3127939030322686, "grad_norm": 3.371872663497925, "learning_rate": 5.3159155930021e-05, "loss": 1.4778, "step": 2024 }, { "epoch": 1.3134425166207233, "grad_norm": 4.9890851974487305, "learning_rate": 5.306888649240715e-05, "loss": 2.1199, "step": 2025 }, { "epoch": 1.314091130209178, "grad_norm": 3.793229341506958, "learning_rate": 5.2978666072483166e-05, "loss": 1.7585, "step": 2026 }, { "epoch": 1.3147397437976327, "grad_norm": 3.821183443069458, "learning_rate": 5.28884947644807e-05, "loss": 1.8377, "step": 2027 }, { "epoch": 1.3153883573860872, "grad_norm": 3.6032888889312744, "learning_rate": 5.279837266258016e-05, "loss": 1.7614, "step": 2028 }, { "epoch": 1.3160369709745419, "grad_norm": 3.7340643405914307, "learning_rate": 5.270829986091044e-05, "loss": 1.9951, "step": 2029 }, { "epoch": 1.3166855845629966, "grad_norm": 3.324342966079712, "learning_rate": 5.261827645354912e-05, "loss": 1.6215, "step": 2030 }, { "epoch": 1.3173341981514513, "grad_norm": 3.9157752990722656, "learning_rate": 5.2528302534521987e-05, "loss": 2.0033, "step": 2031 }, { "epoch": 1.317982811739906, "grad_norm": 3.353072166442871, "learning_rate": 5.243837819780332e-05, "loss": 1.456, "step": 2032 }, { "epoch": 1.3186314253283606, "grad_norm": 3.3384482860565186, "learning_rate": 5.234850353731544e-05, "loss": 1.8107, "step": 2033 }, { "epoch": 1.3192800389168153, "grad_norm": 3.7296571731567383, "learning_rate": 5.225867864692895e-05, "loss": 1.8491, "step": 2034 }, { "epoch": 1.31992865250527, "grad_norm": 3.5766191482543945, "learning_rate": 5.216890362046233e-05, "loss": 1.7089, "step": 2035 }, { "epoch": 1.3205772660937247, "grad_norm": 3.308260440826416, "learning_rate": 5.207917855168205e-05, "loss": 1.8135, "step": 2036 }, { "epoch": 1.3212258796821794, "grad_norm": 3.241671323776245, "learning_rate": 5.198950353430246e-05, "loss": 1.6325, "step": 2037 }, { "epoch": 1.3218744932706339, "grad_norm": 3.519355297088623, "learning_rate": 5.189987866198548e-05, "loss": 1.7301, "step": 2038 }, { "epoch": 1.3225231068590886, "grad_norm": 3.0608015060424805, "learning_rate": 5.1810304028340806e-05, "loss": 1.5383, "step": 2039 }, { "epoch": 1.3231717204475433, "grad_norm": 3.3508996963500977, "learning_rate": 5.172077972692553e-05, "loss": 1.792, "step": 2040 }, { "epoch": 1.323820334035998, "grad_norm": 4.155913829803467, "learning_rate": 5.1631305851244316e-05, "loss": 1.5546, "step": 2041 }, { "epoch": 1.3244689476244527, "grad_norm": 3.8740527629852295, "learning_rate": 5.1541882494748996e-05, "loss": 1.8142, "step": 2042 }, { "epoch": 1.3251175612129074, "grad_norm": 3.6615548133850098, "learning_rate": 5.145250975083877e-05, "loss": 1.854, "step": 2043 }, { "epoch": 1.325766174801362, "grad_norm": 3.854388475418091, "learning_rate": 5.136318771285994e-05, "loss": 2.0503, "step": 2044 }, { "epoch": 1.3264147883898167, "grad_norm": 3.579725742340088, "learning_rate": 5.127391647410583e-05, "loss": 1.774, "step": 2045 }, { "epoch": 1.3270634019782714, "grad_norm": 3.921165943145752, "learning_rate": 5.118469612781676e-05, "loss": 1.9953, "step": 2046 }, { "epoch": 1.3277120155667261, "grad_norm": 3.514629602432251, "learning_rate": 5.109552676717976e-05, "loss": 1.6384, "step": 2047 }, { "epoch": 1.3283606291551808, "grad_norm": 3.1642324924468994, "learning_rate": 5.100640848532878e-05, "loss": 1.5911, "step": 2048 }, { "epoch": 1.3290092427436355, "grad_norm": 3.250716209411621, "learning_rate": 5.091734137534426e-05, "loss": 1.8094, "step": 2049 }, { "epoch": 1.3296578563320902, "grad_norm": 4.189258098602295, "learning_rate": 5.082832553025335e-05, "loss": 1.8997, "step": 2050 }, { "epoch": 1.330306469920545, "grad_norm": 2.9619927406311035, "learning_rate": 5.07393610430295e-05, "loss": 1.5376, "step": 2051 }, { "epoch": 1.3309550835089996, "grad_norm": 3.7196571826934814, "learning_rate": 5.0650448006592624e-05, "loss": 2.4425, "step": 2052 }, { "epoch": 1.3316036970974543, "grad_norm": 3.490706443786621, "learning_rate": 5.05615865138089e-05, "loss": 1.4862, "step": 2053 }, { "epoch": 1.332252310685909, "grad_norm": 3.211367607116699, "learning_rate": 5.0472776657490586e-05, "loss": 1.5928, "step": 2054 }, { "epoch": 1.3329009242743637, "grad_norm": 3.890547275543213, "learning_rate": 5.0384018530396114e-05, "loss": 1.6615, "step": 2055 }, { "epoch": 1.3335495378628182, "grad_norm": 3.4924685955047607, "learning_rate": 5.0295312225229786e-05, "loss": 1.9003, "step": 2056 }, { "epoch": 1.3341981514512729, "grad_norm": 3.4535317420959473, "learning_rate": 5.020665783464182e-05, "loss": 1.4413, "step": 2057 }, { "epoch": 1.3348467650397275, "grad_norm": 3.7064075469970703, "learning_rate": 5.011805545122826e-05, "loss": 1.8275, "step": 2058 }, { "epoch": 1.3354953786281822, "grad_norm": 3.642709970474243, "learning_rate": 5.00295051675308e-05, "loss": 1.6604, "step": 2059 }, { "epoch": 1.336143992216637, "grad_norm": 3.728032350540161, "learning_rate": 4.994100707603664e-05, "loss": 1.6009, "step": 2060 }, { "epoch": 1.3367926058050916, "grad_norm": 3.2465975284576416, "learning_rate": 4.9852561269178564e-05, "loss": 1.7703, "step": 2061 }, { "epoch": 1.3374412193935463, "grad_norm": 3.1201417446136475, "learning_rate": 4.976416783933476e-05, "loss": 1.6243, "step": 2062 }, { "epoch": 1.338089832982001, "grad_norm": 3.5770230293273926, "learning_rate": 4.967582687882859e-05, "loss": 1.7997, "step": 2063 }, { "epoch": 1.3387384465704557, "grad_norm": 2.899506092071533, "learning_rate": 4.958753847992875e-05, "loss": 1.3897, "step": 2064 }, { "epoch": 1.3393870601589104, "grad_norm": 3.2437689304351807, "learning_rate": 4.9499302734848916e-05, "loss": 1.471, "step": 2065 }, { "epoch": 1.3400356737473649, "grad_norm": 5.016238689422607, "learning_rate": 4.941111973574791e-05, "loss": 2.1055, "step": 2066 }, { "epoch": 1.3406842873358196, "grad_norm": 3.434286117553711, "learning_rate": 4.9322989574729295e-05, "loss": 1.7253, "step": 2067 }, { "epoch": 1.3413329009242743, "grad_norm": 3.5316224098205566, "learning_rate": 4.923491234384158e-05, "loss": 1.8691, "step": 2068 }, { "epoch": 1.341981514512729, "grad_norm": 3.5542209148406982, "learning_rate": 4.914688813507797e-05, "loss": 1.7076, "step": 2069 }, { "epoch": 1.3426301281011837, "grad_norm": 4.496135711669922, "learning_rate": 4.9058917040376216e-05, "loss": 2.1759, "step": 2070 }, { "epoch": 1.3432787416896383, "grad_norm": 5.085272312164307, "learning_rate": 4.89709991516187e-05, "loss": 1.7519, "step": 2071 }, { "epoch": 1.343927355278093, "grad_norm": 4.378961086273193, "learning_rate": 4.888313456063206e-05, "loss": 1.8451, "step": 2072 }, { "epoch": 1.3445759688665477, "grad_norm": 3.867076873779297, "learning_rate": 4.879532335918755e-05, "loss": 1.5901, "step": 2073 }, { "epoch": 1.3452245824550024, "grad_norm": 3.353153944015503, "learning_rate": 4.8707565639000366e-05, "loss": 1.7846, "step": 2074 }, { "epoch": 1.3458731960434571, "grad_norm": 4.43692684173584, "learning_rate": 4.861986149173005e-05, "loss": 1.906, "step": 2075 }, { "epoch": 1.3465218096319118, "grad_norm": 3.5207293033599854, "learning_rate": 4.853221100898003e-05, "loss": 1.5007, "step": 2076 }, { "epoch": 1.3471704232203665, "grad_norm": 3.273021697998047, "learning_rate": 4.844461428229782e-05, "loss": 1.564, "step": 2077 }, { "epoch": 1.3478190368088212, "grad_norm": 3.3903870582580566, "learning_rate": 4.8357071403174746e-05, "loss": 1.6403, "step": 2078 }, { "epoch": 1.348467650397276, "grad_norm": 3.6990256309509277, "learning_rate": 4.8269582463045835e-05, "loss": 1.714, "step": 2079 }, { "epoch": 1.3491162639857306, "grad_norm": 3.671762228012085, "learning_rate": 4.818214755328987e-05, "loss": 1.6446, "step": 2080 }, { "epoch": 1.3497648775741853, "grad_norm": 3.531496524810791, "learning_rate": 4.809476676522909e-05, "loss": 1.5116, "step": 2081 }, { "epoch": 1.35041349116264, "grad_norm": 4.370388031005859, "learning_rate": 4.800744019012934e-05, "loss": 1.998, "step": 2082 }, { "epoch": 1.3510621047510947, "grad_norm": 4.056191921234131, "learning_rate": 4.7920167919199686e-05, "loss": 1.804, "step": 2083 }, { "epoch": 1.3517107183395491, "grad_norm": 4.105471611022949, "learning_rate": 4.7832950043592616e-05, "loss": 1.6518, "step": 2084 }, { "epoch": 1.3523593319280038, "grad_norm": 3.358036518096924, "learning_rate": 4.7745786654403765e-05, "loss": 1.6125, "step": 2085 }, { "epoch": 1.3530079455164585, "grad_norm": 2.9293599128723145, "learning_rate": 4.765867784267175e-05, "loss": 1.446, "step": 2086 }, { "epoch": 1.3536565591049132, "grad_norm": 3.4014084339141846, "learning_rate": 4.757162369937842e-05, "loss": 1.5982, "step": 2087 }, { "epoch": 1.354305172693368, "grad_norm": 3.9015140533447266, "learning_rate": 4.748462431544826e-05, "loss": 1.7017, "step": 2088 }, { "epoch": 1.3549537862818226, "grad_norm": 3.9982011318206787, "learning_rate": 4.739767978174875e-05, "loss": 1.6529, "step": 2089 }, { "epoch": 1.3556023998702773, "grad_norm": 4.065764904022217, "learning_rate": 4.731079018908995e-05, "loss": 1.8086, "step": 2090 }, { "epoch": 1.356251013458732, "grad_norm": 3.5262956619262695, "learning_rate": 4.722395562822468e-05, "loss": 1.6989, "step": 2091 }, { "epoch": 1.3568996270471867, "grad_norm": 3.936694622039795, "learning_rate": 4.713717618984812e-05, "loss": 1.9688, "step": 2092 }, { "epoch": 1.3575482406356414, "grad_norm": 2.8225741386413574, "learning_rate": 4.705045196459799e-05, "loss": 1.3642, "step": 2093 }, { "epoch": 1.3581968542240959, "grad_norm": 3.529682159423828, "learning_rate": 4.696378304305435e-05, "loss": 1.5984, "step": 2094 }, { "epoch": 1.3588454678125506, "grad_norm": 3.4405035972595215, "learning_rate": 4.687716951573938e-05, "loss": 1.3801, "step": 2095 }, { "epoch": 1.3594940814010052, "grad_norm": 3.3005824089050293, "learning_rate": 4.679061147311756e-05, "loss": 1.6307, "step": 2096 }, { "epoch": 1.36014269498946, "grad_norm": 4.486237525939941, "learning_rate": 4.6704109005595266e-05, "loss": 1.9482, "step": 2097 }, { "epoch": 1.3607913085779146, "grad_norm": 3.415778398513794, "learning_rate": 4.661766220352097e-05, "loss": 1.4412, "step": 2098 }, { "epoch": 1.3614399221663693, "grad_norm": 3.507199764251709, "learning_rate": 4.653127115718485e-05, "loss": 1.5225, "step": 2099 }, { "epoch": 1.362088535754824, "grad_norm": 3.454383134841919, "learning_rate": 4.644493595681899e-05, "loss": 1.6931, "step": 2100 }, { "epoch": 1.362088535754824, "eval_loss": 2.002316951751709, "eval_runtime": 35.0945, "eval_samples_per_second": 58.585, "eval_steps_per_second": 14.646, "step": 2100 }, { "epoch": 1.3627371493432787, "grad_norm": 4.0902910232543945, "learning_rate": 4.635865669259708e-05, "loss": 1.6002, "step": 2101 }, { "epoch": 1.3633857629317334, "grad_norm": 4.234682559967041, "learning_rate": 4.627243345463439e-05, "loss": 1.9327, "step": 2102 }, { "epoch": 1.364034376520188, "grad_norm": 3.34226655960083, "learning_rate": 4.6186266332987715e-05, "loss": 1.5594, "step": 2103 }, { "epoch": 1.3646829901086428, "grad_norm": 2.917745351791382, "learning_rate": 4.6100155417655134e-05, "loss": 1.5182, "step": 2104 }, { "epoch": 1.3653316036970975, "grad_norm": 3.669440746307373, "learning_rate": 4.601410079857617e-05, "loss": 1.5694, "step": 2105 }, { "epoch": 1.3659802172855522, "grad_norm": 3.749854803085327, "learning_rate": 4.592810256563137e-05, "loss": 1.8499, "step": 2106 }, { "epoch": 1.3666288308740069, "grad_norm": 4.099685192108154, "learning_rate": 4.584216080864258e-05, "loss": 1.8458, "step": 2107 }, { "epoch": 1.3672774444624616, "grad_norm": 3.6814818382263184, "learning_rate": 4.5756275617372465e-05, "loss": 1.5491, "step": 2108 }, { "epoch": 1.3679260580509163, "grad_norm": 3.773793935775757, "learning_rate": 4.567044708152476e-05, "loss": 1.8323, "step": 2109 }, { "epoch": 1.368574671639371, "grad_norm": 3.3476083278656006, "learning_rate": 4.558467529074401e-05, "loss": 1.4292, "step": 2110 }, { "epoch": 1.3692232852278257, "grad_norm": 3.893867015838623, "learning_rate": 4.549896033461538e-05, "loss": 1.7224, "step": 2111 }, { "epoch": 1.3698718988162801, "grad_norm": 3.378614902496338, "learning_rate": 4.541330230266481e-05, "loss": 1.8221, "step": 2112 }, { "epoch": 1.3705205124047348, "grad_norm": 3.1137964725494385, "learning_rate": 4.5327701284358656e-05, "loss": 1.4875, "step": 2113 }, { "epoch": 1.3711691259931895, "grad_norm": 3.6080522537231445, "learning_rate": 4.524215736910388e-05, "loss": 1.9737, "step": 2114 }, { "epoch": 1.3718177395816442, "grad_norm": 3.105175256729126, "learning_rate": 4.515667064624758e-05, "loss": 1.6649, "step": 2115 }, { "epoch": 1.372466353170099, "grad_norm": 4.224860668182373, "learning_rate": 4.507124120507742e-05, "loss": 1.7872, "step": 2116 }, { "epoch": 1.3731149667585536, "grad_norm": 3.4784505367279053, "learning_rate": 4.498586913482095e-05, "loss": 1.6713, "step": 2117 }, { "epoch": 1.3737635803470083, "grad_norm": 3.8677048683166504, "learning_rate": 4.490055452464594e-05, "loss": 1.8984, "step": 2118 }, { "epoch": 1.374412193935463, "grad_norm": 3.49123215675354, "learning_rate": 4.481529746366019e-05, "loss": 1.6243, "step": 2119 }, { "epoch": 1.3750608075239177, "grad_norm": 2.7699098587036133, "learning_rate": 4.4730098040911226e-05, "loss": 1.6554, "step": 2120 }, { "epoch": 1.3757094211123724, "grad_norm": 4.3785014152526855, "learning_rate": 4.464495634538656e-05, "loss": 1.7963, "step": 2121 }, { "epoch": 1.3763580347008268, "grad_norm": 3.4200305938720703, "learning_rate": 4.4559872466013245e-05, "loss": 1.5709, "step": 2122 }, { "epoch": 1.3770066482892815, "grad_norm": 3.0476019382476807, "learning_rate": 4.447484649165808e-05, "loss": 1.4232, "step": 2123 }, { "epoch": 1.3776552618777362, "grad_norm": 3.3558449745178223, "learning_rate": 4.438987851112728e-05, "loss": 1.4483, "step": 2124 }, { "epoch": 1.378303875466191, "grad_norm": 3.893054485321045, "learning_rate": 4.4304968613166564e-05, "loss": 1.6281, "step": 2125 }, { "epoch": 1.3789524890546456, "grad_norm": 4.136235237121582, "learning_rate": 4.4220116886460993e-05, "loss": 1.9169, "step": 2126 }, { "epoch": 1.3796011026431003, "grad_norm": 3.6908533573150635, "learning_rate": 4.4135323419634766e-05, "loss": 1.5266, "step": 2127 }, { "epoch": 1.380249716231555, "grad_norm": 4.079684257507324, "learning_rate": 4.405058830125137e-05, "loss": 1.846, "step": 2128 }, { "epoch": 1.3808983298200097, "grad_norm": 3.9350478649139404, "learning_rate": 4.396591161981317e-05, "loss": 1.6342, "step": 2129 }, { "epoch": 1.3815469434084644, "grad_norm": 4.489711284637451, "learning_rate": 4.388129346376178e-05, "loss": 1.829, "step": 2130 }, { "epoch": 1.382195556996919, "grad_norm": 4.542858123779297, "learning_rate": 4.379673392147735e-05, "loss": 1.6816, "step": 2131 }, { "epoch": 1.3828441705853738, "grad_norm": 3.4624056816101074, "learning_rate": 4.3712233081279095e-05, "loss": 1.8358, "step": 2132 }, { "epoch": 1.3834927841738285, "grad_norm": 4.125976085662842, "learning_rate": 4.362779103142472e-05, "loss": 1.7568, "step": 2133 }, { "epoch": 1.3841413977622832, "grad_norm": 3.749516725540161, "learning_rate": 4.354340786011061e-05, "loss": 1.7374, "step": 2134 }, { "epoch": 1.3847900113507379, "grad_norm": 3.6913464069366455, "learning_rate": 4.34590836554717e-05, "loss": 1.4426, "step": 2135 }, { "epoch": 1.3854386249391926, "grad_norm": 4.070128440856934, "learning_rate": 4.337481850558122e-05, "loss": 1.9334, "step": 2136 }, { "epoch": 1.3860872385276473, "grad_norm": 3.5387516021728516, "learning_rate": 4.329061249845083e-05, "loss": 1.5679, "step": 2137 }, { "epoch": 1.386735852116102, "grad_norm": 3.7760870456695557, "learning_rate": 4.320646572203033e-05, "loss": 2.1245, "step": 2138 }, { "epoch": 1.3873844657045566, "grad_norm": 3.5190494060516357, "learning_rate": 4.312237826420774e-05, "loss": 1.3052, "step": 2139 }, { "epoch": 1.3880330792930111, "grad_norm": 3.7962303161621094, "learning_rate": 4.303835021280903e-05, "loss": 1.5814, "step": 2140 }, { "epoch": 1.3886816928814658, "grad_norm": 3.219163656234741, "learning_rate": 4.29543816555982e-05, "loss": 1.6245, "step": 2141 }, { "epoch": 1.3893303064699205, "grad_norm": 3.5009872913360596, "learning_rate": 4.287047268027711e-05, "loss": 1.6944, "step": 2142 }, { "epoch": 1.3899789200583752, "grad_norm": 3.0797958374023438, "learning_rate": 4.2786623374485305e-05, "loss": 1.2273, "step": 2143 }, { "epoch": 1.39062753364683, "grad_norm": 3.5116994380950928, "learning_rate": 4.270283382580008e-05, "loss": 1.8139, "step": 2144 }, { "epoch": 1.3912761472352846, "grad_norm": 3.368184804916382, "learning_rate": 4.261910412173633e-05, "loss": 1.7232, "step": 2145 }, { "epoch": 1.3919247608237393, "grad_norm": 3.4655721187591553, "learning_rate": 4.253543434974643e-05, "loss": 1.6674, "step": 2146 }, { "epoch": 1.392573374412194, "grad_norm": 4.059392929077148, "learning_rate": 4.245182459722008e-05, "loss": 1.9942, "step": 2147 }, { "epoch": 1.3932219880006487, "grad_norm": 5.60744047164917, "learning_rate": 4.236827495148443e-05, "loss": 1.8047, "step": 2148 }, { "epoch": 1.3938706015891034, "grad_norm": 4.040340900421143, "learning_rate": 4.2284785499803714e-05, "loss": 1.7566, "step": 2149 }, { "epoch": 1.3945192151775578, "grad_norm": 4.131582260131836, "learning_rate": 4.220135632937937e-05, "loss": 2.0004, "step": 2150 }, { "epoch": 1.3951678287660125, "grad_norm": 3.6701483726501465, "learning_rate": 4.211798752734991e-05, "loss": 1.7452, "step": 2151 }, { "epoch": 1.3958164423544672, "grad_norm": 3.180025577545166, "learning_rate": 4.2034679180790694e-05, "loss": 1.5429, "step": 2152 }, { "epoch": 1.396465055942922, "grad_norm": 4.198643207550049, "learning_rate": 4.1951431376714025e-05, "loss": 2.0123, "step": 2153 }, { "epoch": 1.3971136695313766, "grad_norm": 3.607287645339966, "learning_rate": 4.1868244202068905e-05, "loss": 1.4438, "step": 2154 }, { "epoch": 1.3977622831198313, "grad_norm": 3.6625301837921143, "learning_rate": 4.178511774374109e-05, "loss": 1.6574, "step": 2155 }, { "epoch": 1.398410896708286, "grad_norm": 4.374930381774902, "learning_rate": 4.170205208855281e-05, "loss": 1.8132, "step": 2156 }, { "epoch": 1.3990595102967407, "grad_norm": 3.823245048522949, "learning_rate": 4.161904732326289e-05, "loss": 1.7528, "step": 2157 }, { "epoch": 1.3997081238851954, "grad_norm": 3.507200241088867, "learning_rate": 4.153610353456654e-05, "loss": 1.8062, "step": 2158 }, { "epoch": 1.40035673747365, "grad_norm": 3.5892202854156494, "learning_rate": 4.145322080909523e-05, "loss": 1.8998, "step": 2159 }, { "epoch": 1.4010053510621048, "grad_norm": 3.3999083042144775, "learning_rate": 4.137039923341674e-05, "loss": 1.7142, "step": 2160 }, { "epoch": 1.4016539646505595, "grad_norm": 4.105923652648926, "learning_rate": 4.1287638894034844e-05, "loss": 1.7403, "step": 2161 }, { "epoch": 1.4023025782390142, "grad_norm": 4.3477373123168945, "learning_rate": 4.120493987738951e-05, "loss": 1.6526, "step": 2162 }, { "epoch": 1.4029511918274689, "grad_norm": 3.892319679260254, "learning_rate": 4.112230226985654e-05, "loss": 1.6508, "step": 2163 }, { "epoch": 1.4035998054159236, "grad_norm": 3.772798776626587, "learning_rate": 4.103972615774769e-05, "loss": 1.4889, "step": 2164 }, { "epoch": 1.4042484190043782, "grad_norm": 4.167153358459473, "learning_rate": 4.0957211627310376e-05, "loss": 1.9027, "step": 2165 }, { "epoch": 1.404897032592833, "grad_norm": 3.806922435760498, "learning_rate": 4.087475876472781e-05, "loss": 1.7716, "step": 2166 }, { "epoch": 1.4055456461812876, "grad_norm": 3.4960336685180664, "learning_rate": 4.079236765611874e-05, "loss": 1.6891, "step": 2167 }, { "epoch": 1.406194259769742, "grad_norm": 3.4638662338256836, "learning_rate": 4.071003838753737e-05, "loss": 1.4375, "step": 2168 }, { "epoch": 1.4068428733581968, "grad_norm": 3.8627383708953857, "learning_rate": 4.0627771044973436e-05, "loss": 1.703, "step": 2169 }, { "epoch": 1.4074914869466515, "grad_norm": 3.3079304695129395, "learning_rate": 4.054556571435184e-05, "loss": 1.4178, "step": 2170 }, { "epoch": 1.4081401005351062, "grad_norm": 3.1823318004608154, "learning_rate": 4.046342248153287e-05, "loss": 1.7309, "step": 2171 }, { "epoch": 1.4087887141235609, "grad_norm": 3.8023147583007812, "learning_rate": 4.038134143231176e-05, "loss": 1.7318, "step": 2172 }, { "epoch": 1.4094373277120156, "grad_norm": 3.0713319778442383, "learning_rate": 4.029932265241908e-05, "loss": 1.5386, "step": 2173 }, { "epoch": 1.4100859413004703, "grad_norm": 3.323232650756836, "learning_rate": 4.021736622752008e-05, "loss": 1.5542, "step": 2174 }, { "epoch": 1.410734554888925, "grad_norm": 3.548295736312866, "learning_rate": 4.013547224321505e-05, "loss": 1.7217, "step": 2175 }, { "epoch": 1.4113831684773797, "grad_norm": 3.9045448303222656, "learning_rate": 4.005364078503903e-05, "loss": 1.7654, "step": 2176 }, { "epoch": 1.4120317820658344, "grad_norm": 3.148836612701416, "learning_rate": 3.9971871938461667e-05, "loss": 1.5483, "step": 2177 }, { "epoch": 1.4126803956542888, "grad_norm": 3.2153265476226807, "learning_rate": 3.9890165788887365e-05, "loss": 1.5301, "step": 2178 }, { "epoch": 1.4133290092427435, "grad_norm": 4.266164302825928, "learning_rate": 3.9808522421654895e-05, "loss": 1.8858, "step": 2179 }, { "epoch": 1.4139776228311982, "grad_norm": 4.208529472351074, "learning_rate": 3.9726941922037586e-05, "loss": 1.987, "step": 2180 }, { "epoch": 1.414626236419653, "grad_norm": 4.653675079345703, "learning_rate": 3.9645424375242965e-05, "loss": 1.909, "step": 2181 }, { "epoch": 1.4152748500081076, "grad_norm": 3.7570486068725586, "learning_rate": 3.9563969866412917e-05, "loss": 1.5453, "step": 2182 }, { "epoch": 1.4159234635965623, "grad_norm": 4.871485710144043, "learning_rate": 3.948257848062351e-05, "loss": 1.4993, "step": 2183 }, { "epoch": 1.416572077185017, "grad_norm": 3.5231685638427734, "learning_rate": 3.940125030288472e-05, "loss": 1.4617, "step": 2184 }, { "epoch": 1.4172206907734717, "grad_norm": 3.5659403800964355, "learning_rate": 3.931998541814069e-05, "loss": 1.6923, "step": 2185 }, { "epoch": 1.4178693043619264, "grad_norm": 3.388287305831909, "learning_rate": 3.923878391126932e-05, "loss": 1.6154, "step": 2186 }, { "epoch": 1.418517917950381, "grad_norm": 3.704702377319336, "learning_rate": 3.915764586708238e-05, "loss": 1.4855, "step": 2187 }, { "epoch": 1.4191665315388358, "grad_norm": 4.736812114715576, "learning_rate": 3.9076571370325364e-05, "loss": 1.748, "step": 2188 }, { "epoch": 1.4198151451272905, "grad_norm": 4.265902042388916, "learning_rate": 3.8995560505677396e-05, "loss": 1.5184, "step": 2189 }, { "epoch": 1.4204637587157452, "grad_norm": 3.644395351409912, "learning_rate": 3.8914613357751064e-05, "loss": 1.72, "step": 2190 }, { "epoch": 1.4211123723041998, "grad_norm": 4.458390235900879, "learning_rate": 3.8833730011092475e-05, "loss": 1.7412, "step": 2191 }, { "epoch": 1.4217609858926545, "grad_norm": 4.660626411437988, "learning_rate": 3.8752910550181124e-05, "loss": 1.9901, "step": 2192 }, { "epoch": 1.4224095994811092, "grad_norm": 4.261714458465576, "learning_rate": 3.867215505942966e-05, "loss": 1.6944, "step": 2193 }, { "epoch": 1.423058213069564, "grad_norm": 3.668905258178711, "learning_rate": 3.859146362318408e-05, "loss": 1.5252, "step": 2194 }, { "epoch": 1.4237068266580186, "grad_norm": 4.081768035888672, "learning_rate": 3.8510836325723324e-05, "loss": 1.9369, "step": 2195 }, { "epoch": 1.424355440246473, "grad_norm": 3.144137144088745, "learning_rate": 3.8430273251259466e-05, "loss": 1.343, "step": 2196 }, { "epoch": 1.4250040538349278, "grad_norm": 3.3499345779418945, "learning_rate": 3.834977448393739e-05, "loss": 1.6606, "step": 2197 }, { "epoch": 1.4256526674233825, "grad_norm": 3.9143521785736084, "learning_rate": 3.82693401078349e-05, "loss": 2.0279, "step": 2198 }, { "epoch": 1.4263012810118372, "grad_norm": 3.729210138320923, "learning_rate": 3.818897020696256e-05, "loss": 1.5266, "step": 2199 }, { "epoch": 1.4269498946002919, "grad_norm": 3.004948139190674, "learning_rate": 3.810866486526348e-05, "loss": 1.593, "step": 2200 }, { "epoch": 1.4269498946002919, "eval_loss": 1.9542957544326782, "eval_runtime": 35.1115, "eval_samples_per_second": 58.556, "eval_steps_per_second": 14.639, "step": 2200 }, { "epoch": 1.4275985081887466, "grad_norm": 3.540048837661743, "learning_rate": 3.802842416661343e-05, "loss": 1.6745, "step": 2201 }, { "epoch": 1.4282471217772013, "grad_norm": 3.7538070678710938, "learning_rate": 3.794824819482067e-05, "loss": 1.5808, "step": 2202 }, { "epoch": 1.428895735365656, "grad_norm": 5.4453125, "learning_rate": 3.786813703362583e-05, "loss": 2.1144, "step": 2203 }, { "epoch": 1.4295443489541106, "grad_norm": 5.0782060623168945, "learning_rate": 3.778809076670179e-05, "loss": 2.1446, "step": 2204 }, { "epoch": 1.4301929625425653, "grad_norm": 4.326142311096191, "learning_rate": 3.770810947765379e-05, "loss": 1.8443, "step": 2205 }, { "epoch": 1.4308415761310198, "grad_norm": 3.880082607269287, "learning_rate": 3.762819325001903e-05, "loss": 1.663, "step": 2206 }, { "epoch": 1.4314901897194745, "grad_norm": 3.546368360519409, "learning_rate": 3.754834216726688e-05, "loss": 1.8069, "step": 2207 }, { "epoch": 1.4321388033079292, "grad_norm": 3.3262813091278076, "learning_rate": 3.7468556312798685e-05, "loss": 1.4974, "step": 2208 }, { "epoch": 1.432787416896384, "grad_norm": 4.183609962463379, "learning_rate": 3.738883576994754e-05, "loss": 1.926, "step": 2209 }, { "epoch": 1.4334360304848386, "grad_norm": 3.422820806503296, "learning_rate": 3.730918062197846e-05, "loss": 1.4387, "step": 2210 }, { "epoch": 1.4340846440732933, "grad_norm": 3.828866958618164, "learning_rate": 3.722959095208801e-05, "loss": 1.8877, "step": 2211 }, { "epoch": 1.434733257661748, "grad_norm": 3.5452375411987305, "learning_rate": 3.715006684340454e-05, "loss": 1.8055, "step": 2212 }, { "epoch": 1.4353818712502027, "grad_norm": 3.450061082839966, "learning_rate": 3.707060837898775e-05, "loss": 1.7473, "step": 2213 }, { "epoch": 1.4360304848386574, "grad_norm": 3.812938690185547, "learning_rate": 3.69912156418289e-05, "loss": 1.7659, "step": 2214 }, { "epoch": 1.436679098427112, "grad_norm": 3.9397966861724854, "learning_rate": 3.6911888714850564e-05, "loss": 1.7709, "step": 2215 }, { "epoch": 1.4373277120155667, "grad_norm": 3.9813451766967773, "learning_rate": 3.683262768090656e-05, "loss": 1.7464, "step": 2216 }, { "epoch": 1.4379763256040214, "grad_norm": 3.228898525238037, "learning_rate": 3.675343262278196e-05, "loss": 1.5045, "step": 2217 }, { "epoch": 1.4386249391924761, "grad_norm": 2.9506118297576904, "learning_rate": 3.667430362319277e-05, "loss": 1.5511, "step": 2218 }, { "epoch": 1.4392735527809308, "grad_norm": 3.4678609371185303, "learning_rate": 3.65952407647862e-05, "loss": 1.4726, "step": 2219 }, { "epoch": 1.4399221663693855, "grad_norm": 3.911971092224121, "learning_rate": 3.6516244130140176e-05, "loss": 1.4799, "step": 2220 }, { "epoch": 1.4405707799578402, "grad_norm": 3.714747667312622, "learning_rate": 3.643731380176363e-05, "loss": 1.821, "step": 2221 }, { "epoch": 1.441219393546295, "grad_norm": 2.9916281700134277, "learning_rate": 3.63584498620961e-05, "loss": 1.1812, "step": 2222 }, { "epoch": 1.4418680071347496, "grad_norm": 4.174739837646484, "learning_rate": 3.627965239350789e-05, "loss": 1.9549, "step": 2223 }, { "epoch": 1.442516620723204, "grad_norm": 3.8844964504241943, "learning_rate": 3.620092147829985e-05, "loss": 1.9314, "step": 2224 }, { "epoch": 1.4431652343116588, "grad_norm": 3.462688684463501, "learning_rate": 3.612225719870327e-05, "loss": 1.4707, "step": 2225 }, { "epoch": 1.4438138479001135, "grad_norm": 4.261932373046875, "learning_rate": 3.604365963687991e-05, "loss": 1.9117, "step": 2226 }, { "epoch": 1.4444624614885682, "grad_norm": 3.8103442192077637, "learning_rate": 3.5965128874921746e-05, "loss": 1.848, "step": 2227 }, { "epoch": 1.4451110750770229, "grad_norm": 3.0310921669006348, "learning_rate": 3.588666499485115e-05, "loss": 1.2677, "step": 2228 }, { "epoch": 1.4457596886654775, "grad_norm": 3.2992660999298096, "learning_rate": 3.580826807862043e-05, "loss": 1.7976, "step": 2229 }, { "epoch": 1.4464083022539322, "grad_norm": 5.16666841506958, "learning_rate": 3.5729938208112115e-05, "loss": 1.9375, "step": 2230 }, { "epoch": 1.447056915842387, "grad_norm": 3.4400851726531982, "learning_rate": 3.565167546513866e-05, "loss": 1.447, "step": 2231 }, { "epoch": 1.4477055294308416, "grad_norm": 3.6043946743011475, "learning_rate": 3.557347993144238e-05, "loss": 1.5262, "step": 2232 }, { "epoch": 1.4483541430192963, "grad_norm": 4.386716842651367, "learning_rate": 3.5495351688695464e-05, "loss": 1.719, "step": 2233 }, { "epoch": 1.4490027566077508, "grad_norm": 3.3811662197113037, "learning_rate": 3.541729081849969e-05, "loss": 1.373, "step": 2234 }, { "epoch": 1.4496513701962055, "grad_norm": 4.607582092285156, "learning_rate": 3.533929740238663e-05, "loss": 1.9897, "step": 2235 }, { "epoch": 1.4502999837846602, "grad_norm": 3.798954963684082, "learning_rate": 3.5261371521817244e-05, "loss": 2.0683, "step": 2236 }, { "epoch": 1.4509485973731149, "grad_norm": 3.3497581481933594, "learning_rate": 3.5183513258182075e-05, "loss": 1.5086, "step": 2237 }, { "epoch": 1.4515972109615696, "grad_norm": 4.09397029876709, "learning_rate": 3.510572269280097e-05, "loss": 1.7382, "step": 2238 }, { "epoch": 1.4522458245500243, "grad_norm": 4.357300281524658, "learning_rate": 3.502799990692309e-05, "loss": 1.8799, "step": 2239 }, { "epoch": 1.452894438138479, "grad_norm": 3.4024734497070312, "learning_rate": 3.495034498172687e-05, "loss": 1.5196, "step": 2240 }, { "epoch": 1.4535430517269337, "grad_norm": 3.3956453800201416, "learning_rate": 3.487275799831972e-05, "loss": 1.6579, "step": 2241 }, { "epoch": 1.4541916653153883, "grad_norm": 3.6668381690979004, "learning_rate": 3.4795239037738233e-05, "loss": 1.6173, "step": 2242 }, { "epoch": 1.454840278903843, "grad_norm": 3.540177822113037, "learning_rate": 3.471778818094785e-05, "loss": 1.4374, "step": 2243 }, { "epoch": 1.4554888924922977, "grad_norm": 3.013129234313965, "learning_rate": 3.464040550884294e-05, "loss": 1.453, "step": 2244 }, { "epoch": 1.4561375060807524, "grad_norm": 3.4033713340759277, "learning_rate": 3.456309110224666e-05, "loss": 1.645, "step": 2245 }, { "epoch": 1.4567861196692071, "grad_norm": 4.394961833953857, "learning_rate": 3.4485845041910856e-05, "loss": 1.7417, "step": 2246 }, { "epoch": 1.4574347332576618, "grad_norm": 3.7915172576904297, "learning_rate": 3.4408667408515924e-05, "loss": 1.6527, "step": 2247 }, { "epoch": 1.4580833468461165, "grad_norm": 4.056890487670898, "learning_rate": 3.433155828267089e-05, "loss": 1.8284, "step": 2248 }, { "epoch": 1.4587319604345712, "grad_norm": 3.412689447402954, "learning_rate": 3.425451774491319e-05, "loss": 1.6444, "step": 2249 }, { "epoch": 1.459380574023026, "grad_norm": 3.592686176300049, "learning_rate": 3.417754587570856e-05, "loss": 1.8249, "step": 2250 }, { "epoch": 1.4600291876114806, "grad_norm": 3.0686776638031006, "learning_rate": 3.4100642755451154e-05, "loss": 1.6212, "step": 2251 }, { "epoch": 1.460677801199935, "grad_norm": 3.612276792526245, "learning_rate": 3.402380846446314e-05, "loss": 1.5929, "step": 2252 }, { "epoch": 1.4613264147883898, "grad_norm": 3.7021632194519043, "learning_rate": 3.394704308299499e-05, "loss": 1.7232, "step": 2253 }, { "epoch": 1.4619750283768445, "grad_norm": 3.603787660598755, "learning_rate": 3.387034669122501e-05, "loss": 1.623, "step": 2254 }, { "epoch": 1.4626236419652991, "grad_norm": 3.8675572872161865, "learning_rate": 3.37937193692596e-05, "loss": 1.4882, "step": 2255 }, { "epoch": 1.4632722555537538, "grad_norm": 4.305935859680176, "learning_rate": 3.3717161197132996e-05, "loss": 1.6491, "step": 2256 }, { "epoch": 1.4639208691422085, "grad_norm": 4.108709335327148, "learning_rate": 3.3640672254807095e-05, "loss": 1.7008, "step": 2257 }, { "epoch": 1.4645694827306632, "grad_norm": 3.8892459869384766, "learning_rate": 3.356425262217164e-05, "loss": 1.7303, "step": 2258 }, { "epoch": 1.465218096319118, "grad_norm": 4.711791515350342, "learning_rate": 3.348790237904382e-05, "loss": 1.957, "step": 2259 }, { "epoch": 1.4658667099075726, "grad_norm": 3.3674638271331787, "learning_rate": 3.341162160516857e-05, "loss": 1.6295, "step": 2260 }, { "epoch": 1.4665153234960273, "grad_norm": 3.650325059890747, "learning_rate": 3.333541038021805e-05, "loss": 1.6111, "step": 2261 }, { "epoch": 1.4671639370844818, "grad_norm": 3.708380937576294, "learning_rate": 3.325926878379191e-05, "loss": 1.7754, "step": 2262 }, { "epoch": 1.4678125506729365, "grad_norm": 3.836378335952759, "learning_rate": 3.318319689541698e-05, "loss": 1.5054, "step": 2263 }, { "epoch": 1.4684611642613912, "grad_norm": 3.8078041076660156, "learning_rate": 3.310719479454736e-05, "loss": 1.5658, "step": 2264 }, { "epoch": 1.4691097778498459, "grad_norm": 3.518536329269409, "learning_rate": 3.303126256056428e-05, "loss": 1.6054, "step": 2265 }, { "epoch": 1.4697583914383006, "grad_norm": 4.631259918212891, "learning_rate": 3.295540027277588e-05, "loss": 1.7211, "step": 2266 }, { "epoch": 1.4704070050267553, "grad_norm": 4.6063151359558105, "learning_rate": 3.2879608010417385e-05, "loss": 2.1066, "step": 2267 }, { "epoch": 1.47105561861521, "grad_norm": 4.129861831665039, "learning_rate": 3.280388585265075e-05, "loss": 1.6216, "step": 2268 }, { "epoch": 1.4717042322036646, "grad_norm": 4.48596715927124, "learning_rate": 3.272823387856482e-05, "loss": 1.4518, "step": 2269 }, { "epoch": 1.4723528457921193, "grad_norm": 3.9528958797454834, "learning_rate": 3.265265216717503e-05, "loss": 1.729, "step": 2270 }, { "epoch": 1.473001459380574, "grad_norm": 3.80484676361084, "learning_rate": 3.2577140797423535e-05, "loss": 1.5422, "step": 2271 }, { "epoch": 1.4736500729690287, "grad_norm": 4.446285724639893, "learning_rate": 3.250169984817897e-05, "loss": 1.997, "step": 2272 }, { "epoch": 1.4742986865574834, "grad_norm": 3.814338207244873, "learning_rate": 3.2426329398236346e-05, "loss": 1.4046, "step": 2273 }, { "epoch": 1.4749473001459381, "grad_norm": 3.6371920108795166, "learning_rate": 3.2351029526317246e-05, "loss": 1.6304, "step": 2274 }, { "epoch": 1.4755959137343928, "grad_norm": 3.7074458599090576, "learning_rate": 3.227580031106928e-05, "loss": 1.6964, "step": 2275 }, { "epoch": 1.4762445273228475, "grad_norm": 3.774646282196045, "learning_rate": 3.220064183106648e-05, "loss": 1.4481, "step": 2276 }, { "epoch": 1.4768931409113022, "grad_norm": 3.5276026725769043, "learning_rate": 3.2125554164808824e-05, "loss": 1.5923, "step": 2277 }, { "epoch": 1.4775417544997569, "grad_norm": 3.2300214767456055, "learning_rate": 3.205053739072248e-05, "loss": 1.292, "step": 2278 }, { "epoch": 1.4781903680882116, "grad_norm": 3.32893705368042, "learning_rate": 3.197559158715941e-05, "loss": 2.1905, "step": 2279 }, { "epoch": 1.478838981676666, "grad_norm": 3.7948622703552246, "learning_rate": 3.19007168323976e-05, "loss": 1.6618, "step": 2280 }, { "epoch": 1.4794875952651207, "grad_norm": 3.4926388263702393, "learning_rate": 3.182591320464079e-05, "loss": 1.5551, "step": 2281 }, { "epoch": 1.4801362088535754, "grad_norm": 4.268977642059326, "learning_rate": 3.175118078201833e-05, "loss": 1.9959, "step": 2282 }, { "epoch": 1.4807848224420301, "grad_norm": 3.9173758029937744, "learning_rate": 3.1676519642585356e-05, "loss": 1.7569, "step": 2283 }, { "epoch": 1.4814334360304848, "grad_norm": 3.335625410079956, "learning_rate": 3.16019298643224e-05, "loss": 1.4476, "step": 2284 }, { "epoch": 1.4820820496189395, "grad_norm": 3.481259346008301, "learning_rate": 3.152741152513561e-05, "loss": 1.6568, "step": 2285 }, { "epoch": 1.4827306632073942, "grad_norm": 3.1565637588500977, "learning_rate": 3.1452964702856346e-05, "loss": 1.3805, "step": 2286 }, { "epoch": 1.483379276795849, "grad_norm": 4.270587921142578, "learning_rate": 3.137858947524143e-05, "loss": 1.7981, "step": 2287 }, { "epoch": 1.4840278903843036, "grad_norm": 3.5864787101745605, "learning_rate": 3.130428591997282e-05, "loss": 1.4996, "step": 2288 }, { "epoch": 1.4846765039727583, "grad_norm": 3.6623685359954834, "learning_rate": 3.123005411465766e-05, "loss": 1.6997, "step": 2289 }, { "epoch": 1.4853251175612128, "grad_norm": 3.166729688644409, "learning_rate": 3.115589413682813e-05, "loss": 1.431, "step": 2290 }, { "epoch": 1.4859737311496675, "grad_norm": 3.4171228408813477, "learning_rate": 3.108180606394138e-05, "loss": 1.4607, "step": 2291 }, { "epoch": 1.4866223447381222, "grad_norm": 4.274218559265137, "learning_rate": 3.100778997337947e-05, "loss": 1.633, "step": 2292 }, { "epoch": 1.4872709583265769, "grad_norm": 3.523789405822754, "learning_rate": 3.093384594244926e-05, "loss": 1.7018, "step": 2293 }, { "epoch": 1.4879195719150315, "grad_norm": 4.640697956085205, "learning_rate": 3.08599740483824e-05, "loss": 2.3105, "step": 2294 }, { "epoch": 1.4885681855034862, "grad_norm": 4.490864276885986, "learning_rate": 3.0786174368335105e-05, "loss": 1.5876, "step": 2295 }, { "epoch": 1.489216799091941, "grad_norm": 4.017573833465576, "learning_rate": 3.0712446979388256e-05, "loss": 1.6412, "step": 2296 }, { "epoch": 1.4898654126803956, "grad_norm": 3.8607804775238037, "learning_rate": 3.063879195854721e-05, "loss": 1.6057, "step": 2297 }, { "epoch": 1.4905140262688503, "grad_norm": 3.7884743213653564, "learning_rate": 3.0565209382741664e-05, "loss": 1.5468, "step": 2298 }, { "epoch": 1.491162639857305, "grad_norm": 3.378019094467163, "learning_rate": 3.049169932882576e-05, "loss": 1.4994, "step": 2299 }, { "epoch": 1.4918112534457597, "grad_norm": 3.6990740299224854, "learning_rate": 3.04182618735778e-05, "loss": 1.8932, "step": 2300 }, { "epoch": 1.4918112534457597, "eval_loss": 1.9170268774032593, "eval_runtime": 35.1001, "eval_samples_per_second": 58.575, "eval_steps_per_second": 14.644, "step": 2300 }, { "epoch": 1.4924598670342144, "grad_norm": 3.1823511123657227, "learning_rate": 3.034489709370033e-05, "loss": 1.3926, "step": 2301 }, { "epoch": 1.493108480622669, "grad_norm": 4.4710211753845215, "learning_rate": 3.0271605065819885e-05, "loss": 1.9308, "step": 2302 }, { "epoch": 1.4937570942111238, "grad_norm": 2.7638750076293945, "learning_rate": 3.01983858664872e-05, "loss": 1.0723, "step": 2303 }, { "epoch": 1.4944057077995785, "grad_norm": 3.371657133102417, "learning_rate": 3.0125239572176744e-05, "loss": 1.5902, "step": 2304 }, { "epoch": 1.4950543213880332, "grad_norm": 4.082417964935303, "learning_rate": 3.005216625928695e-05, "loss": 1.9383, "step": 2305 }, { "epoch": 1.4957029349764879, "grad_norm": 4.245962142944336, "learning_rate": 2.9979166004140036e-05, "loss": 1.7559, "step": 2306 }, { "epoch": 1.4963515485649426, "grad_norm": 3.646352767944336, "learning_rate": 2.9906238882981817e-05, "loss": 1.6593, "step": 2307 }, { "epoch": 1.497000162153397, "grad_norm": 4.262327671051025, "learning_rate": 2.9833384971981838e-05, "loss": 1.7694, "step": 2308 }, { "epoch": 1.4976487757418517, "grad_norm": 3.109628915786743, "learning_rate": 2.9760604347233068e-05, "loss": 1.5457, "step": 2309 }, { "epoch": 1.4982973893303064, "grad_norm": 3.7536780834198, "learning_rate": 2.968789708475205e-05, "loss": 1.7079, "step": 2310 }, { "epoch": 1.4989460029187611, "grad_norm": 3.22652268409729, "learning_rate": 2.9615263260478575e-05, "loss": 1.4773, "step": 2311 }, { "epoch": 1.4995946165072158, "grad_norm": 3.8230719566345215, "learning_rate": 2.9542702950275835e-05, "loss": 1.4638, "step": 2312 }, { "epoch": 1.5002432300956705, "grad_norm": 3.364539384841919, "learning_rate": 2.9470216229930226e-05, "loss": 1.3774, "step": 2313 }, { "epoch": 1.5008918436841252, "grad_norm": 3.5049262046813965, "learning_rate": 2.9397803175151207e-05, "loss": 1.4187, "step": 2314 }, { "epoch": 1.50154045727258, "grad_norm": 3.405932664871216, "learning_rate": 2.9325463861571378e-05, "loss": 1.6072, "step": 2315 }, { "epoch": 1.5021890708610346, "grad_norm": 3.5575273036956787, "learning_rate": 2.9253198364746225e-05, "loss": 1.6254, "step": 2316 }, { "epoch": 1.502837684449489, "grad_norm": 3.5173630714416504, "learning_rate": 2.9181006760154316e-05, "loss": 1.7185, "step": 2317 }, { "epoch": 1.5034862980379438, "grad_norm": 3.8847312927246094, "learning_rate": 2.9108889123196824e-05, "loss": 1.7295, "step": 2318 }, { "epoch": 1.5041349116263985, "grad_norm": 4.250885963439941, "learning_rate": 2.9036845529197843e-05, "loss": 1.6393, "step": 2319 }, { "epoch": 1.5047835252148531, "grad_norm": 3.787752389907837, "learning_rate": 2.8964876053403966e-05, "loss": 1.7064, "step": 2320 }, { "epoch": 1.5054321388033078, "grad_norm": 4.314201354980469, "learning_rate": 2.889298077098451e-05, "loss": 1.74, "step": 2321 }, { "epoch": 1.5060807523917625, "grad_norm": 4.269136428833008, "learning_rate": 2.882115975703128e-05, "loss": 1.8841, "step": 2322 }, { "epoch": 1.5067293659802172, "grad_norm": 4.8325676918029785, "learning_rate": 2.8749413086558407e-05, "loss": 1.7873, "step": 2323 }, { "epoch": 1.507377979568672, "grad_norm": 3.393559217453003, "learning_rate": 2.8677740834502532e-05, "loss": 1.5122, "step": 2324 }, { "epoch": 1.5080265931571266, "grad_norm": 3.6705055236816406, "learning_rate": 2.8606143075722402e-05, "loss": 1.848, "step": 2325 }, { "epoch": 1.5086752067455813, "grad_norm": 3.4050071239471436, "learning_rate": 2.8534619884999124e-05, "loss": 1.6375, "step": 2326 }, { "epoch": 1.509323820334036, "grad_norm": 3.533254861831665, "learning_rate": 2.8463171337035754e-05, "loss": 1.5469, "step": 2327 }, { "epoch": 1.5099724339224907, "grad_norm": 3.604917049407959, "learning_rate": 2.839179750645752e-05, "loss": 1.7284, "step": 2328 }, { "epoch": 1.5106210475109454, "grad_norm": 3.87251877784729, "learning_rate": 2.8320498467811573e-05, "loss": 1.8322, "step": 2329 }, { "epoch": 1.5112696610994, "grad_norm": 3.422940254211426, "learning_rate": 2.8249274295566864e-05, "loss": 1.7426, "step": 2330 }, { "epoch": 1.5119182746878548, "grad_norm": 3.3917782306671143, "learning_rate": 2.817812506411435e-05, "loss": 1.6137, "step": 2331 }, { "epoch": 1.5125668882763095, "grad_norm": 3.314363718032837, "learning_rate": 2.8107050847766458e-05, "loss": 1.4992, "step": 2332 }, { "epoch": 1.5132155018647642, "grad_norm": 3.6997299194335938, "learning_rate": 2.8036051720757482e-05, "loss": 1.9176, "step": 2333 }, { "epoch": 1.5138641154532189, "grad_norm": 3.6247336864471436, "learning_rate": 2.7965127757243147e-05, "loss": 1.2439, "step": 2334 }, { "epoch": 1.5145127290416736, "grad_norm": 3.1223981380462646, "learning_rate": 2.789427903130075e-05, "loss": 1.5438, "step": 2335 }, { "epoch": 1.5151613426301282, "grad_norm": 3.720625162124634, "learning_rate": 2.782350561692896e-05, "loss": 1.6673, "step": 2336 }, { "epoch": 1.515809956218583, "grad_norm": 3.3066697120666504, "learning_rate": 2.77528075880478e-05, "loss": 1.5007, "step": 2337 }, { "epoch": 1.5164585698070374, "grad_norm": 3.6418204307556152, "learning_rate": 2.768218501849862e-05, "loss": 1.9104, "step": 2338 }, { "epoch": 1.517107183395492, "grad_norm": 2.539001941680908, "learning_rate": 2.7611637982043825e-05, "loss": 1.4989, "step": 2339 }, { "epoch": 1.5177557969839468, "grad_norm": 3.566798210144043, "learning_rate": 2.7541166552367058e-05, "loss": 1.6642, "step": 2340 }, { "epoch": 1.5184044105724015, "grad_norm": 2.978632926940918, "learning_rate": 2.747077080307289e-05, "loss": 1.3699, "step": 2341 }, { "epoch": 1.5190530241608562, "grad_norm": 2.864914655685425, "learning_rate": 2.7400450807686938e-05, "loss": 1.4998, "step": 2342 }, { "epoch": 1.5197016377493109, "grad_norm": 3.8595798015594482, "learning_rate": 2.733020663965561e-05, "loss": 1.5566, "step": 2343 }, { "epoch": 1.5203502513377656, "grad_norm": 3.502196788787842, "learning_rate": 2.7260038372346174e-05, "loss": 1.6419, "step": 2344 }, { "epoch": 1.52099886492622, "grad_norm": 3.1398541927337646, "learning_rate": 2.7189946079046613e-05, "loss": 1.6131, "step": 2345 }, { "epoch": 1.5216474785146747, "grad_norm": 3.806169033050537, "learning_rate": 2.7119929832965553e-05, "loss": 1.6356, "step": 2346 }, { "epoch": 1.5222960921031294, "grad_norm": 3.17885160446167, "learning_rate": 2.7049989707232216e-05, "loss": 1.2607, "step": 2347 }, { "epoch": 1.5229447056915841, "grad_norm": 3.7036855220794678, "learning_rate": 2.6980125774896238e-05, "loss": 1.6674, "step": 2348 }, { "epoch": 1.5235933192800388, "grad_norm": 3.344797372817993, "learning_rate": 2.691033810892778e-05, "loss": 1.5075, "step": 2349 }, { "epoch": 1.5242419328684935, "grad_norm": 3.681107997894287, "learning_rate": 2.684062678221726e-05, "loss": 1.5516, "step": 2350 }, { "epoch": 1.5248905464569482, "grad_norm": 4.0922322273254395, "learning_rate": 2.6770991867575435e-05, "loss": 1.7942, "step": 2351 }, { "epoch": 1.525539160045403, "grad_norm": 3.4197933673858643, "learning_rate": 2.670143343773317e-05, "loss": 1.5173, "step": 2352 }, { "epoch": 1.5261877736338576, "grad_norm": 3.547905683517456, "learning_rate": 2.6631951565341517e-05, "loss": 1.7154, "step": 2353 }, { "epoch": 1.5268363872223123, "grad_norm": 3.555095672607422, "learning_rate": 2.656254632297156e-05, "loss": 1.5934, "step": 2354 }, { "epoch": 1.527485000810767, "grad_norm": 3.5101375579833984, "learning_rate": 2.6493217783114288e-05, "loss": 1.4744, "step": 2355 }, { "epoch": 1.5281336143992217, "grad_norm": 3.218090534210205, "learning_rate": 2.6423966018180658e-05, "loss": 1.3225, "step": 2356 }, { "epoch": 1.5287822279876764, "grad_norm": 4.146740913391113, "learning_rate": 2.6354791100501343e-05, "loss": 1.8845, "step": 2357 }, { "epoch": 1.529430841576131, "grad_norm": 3.682525873184204, "learning_rate": 2.6285693102326868e-05, "loss": 1.2668, "step": 2358 }, { "epoch": 1.5300794551645858, "grad_norm": 4.081378936767578, "learning_rate": 2.6216672095827266e-05, "loss": 1.6056, "step": 2359 }, { "epoch": 1.5307280687530405, "grad_norm": 3.6732709407806396, "learning_rate": 2.6147728153092356e-05, "loss": 1.4433, "step": 2360 }, { "epoch": 1.5313766823414952, "grad_norm": 5.12739896774292, "learning_rate": 2.6078861346131278e-05, "loss": 1.7995, "step": 2361 }, { "epoch": 1.5320252959299498, "grad_norm": 4.686697959899902, "learning_rate": 2.601007174687271e-05, "loss": 1.8866, "step": 2362 }, { "epoch": 1.5326739095184045, "grad_norm": 3.7649168968200684, "learning_rate": 2.5941359427164692e-05, "loss": 1.5273, "step": 2363 }, { "epoch": 1.5333225231068592, "grad_norm": 5.0795440673828125, "learning_rate": 2.5872724458774477e-05, "loss": 1.8222, "step": 2364 }, { "epoch": 1.533971136695314, "grad_norm": 3.224774122238159, "learning_rate": 2.5804166913388607e-05, "loss": 1.2669, "step": 2365 }, { "epoch": 1.5346197502837684, "grad_norm": 3.364097833633423, "learning_rate": 2.5735686862612685e-05, "loss": 1.6605, "step": 2366 }, { "epoch": 1.535268363872223, "grad_norm": 3.7051572799682617, "learning_rate": 2.5667284377971458e-05, "loss": 1.6973, "step": 2367 }, { "epoch": 1.5359169774606778, "grad_norm": 3.684174060821533, "learning_rate": 2.559895953090856e-05, "loss": 1.6887, "step": 2368 }, { "epoch": 1.5365655910491325, "grad_norm": 3.9025399684906006, "learning_rate": 2.5530712392786615e-05, "loss": 1.5601, "step": 2369 }, { "epoch": 1.5372142046375872, "grad_norm": 3.750572681427002, "learning_rate": 2.5462543034887098e-05, "loss": 1.4532, "step": 2370 }, { "epoch": 1.5378628182260419, "grad_norm": 4.434974670410156, "learning_rate": 2.5394451528410135e-05, "loss": 1.6951, "step": 2371 }, { "epoch": 1.5385114318144966, "grad_norm": 4.175408363342285, "learning_rate": 2.532643794447467e-05, "loss": 1.5469, "step": 2372 }, { "epoch": 1.539160045402951, "grad_norm": 4.211963653564453, "learning_rate": 2.5258502354118096e-05, "loss": 1.5961, "step": 2373 }, { "epoch": 1.5398086589914057, "grad_norm": 4.141477584838867, "learning_rate": 2.5190644828296585e-05, "loss": 2.4503, "step": 2374 }, { "epoch": 1.5404572725798604, "grad_norm": 3.1622188091278076, "learning_rate": 2.5122865437884545e-05, "loss": 1.2875, "step": 2375 }, { "epoch": 1.5411058861683151, "grad_norm": 3.3986353874206543, "learning_rate": 2.505516425367491e-05, "loss": 1.4184, "step": 2376 }, { "epoch": 1.5417544997567698, "grad_norm": 3.4594807624816895, "learning_rate": 2.498754134637885e-05, "loss": 1.782, "step": 2377 }, { "epoch": 1.5424031133452245, "grad_norm": 4.554396629333496, "learning_rate": 2.491999678662582e-05, "loss": 1.9458, "step": 2378 }, { "epoch": 1.5430517269336792, "grad_norm": 3.912311315536499, "learning_rate": 2.4852530644963478e-05, "loss": 1.604, "step": 2379 }, { "epoch": 1.543700340522134, "grad_norm": 2.881380319595337, "learning_rate": 2.478514299185749e-05, "loss": 1.2624, "step": 2380 }, { "epoch": 1.5443489541105886, "grad_norm": 3.0822060108184814, "learning_rate": 2.4717833897691632e-05, "loss": 1.4394, "step": 2381 }, { "epoch": 1.5449975676990433, "grad_norm": 3.763140916824341, "learning_rate": 2.465060343276755e-05, "loss": 1.7568, "step": 2382 }, { "epoch": 1.545646181287498, "grad_norm": 3.0965278148651123, "learning_rate": 2.4583451667304836e-05, "loss": 1.5074, "step": 2383 }, { "epoch": 1.5462947948759527, "grad_norm": 3.899507761001587, "learning_rate": 2.451637867144081e-05, "loss": 1.7153, "step": 2384 }, { "epoch": 1.5469434084644074, "grad_norm": 4.274811267852783, "learning_rate": 2.444938451523058e-05, "loss": 1.6902, "step": 2385 }, { "epoch": 1.547592022052862, "grad_norm": 3.304788112640381, "learning_rate": 2.4382469268646922e-05, "loss": 1.4539, "step": 2386 }, { "epoch": 1.5482406356413168, "grad_norm": 3.9734692573547363, "learning_rate": 2.4315633001580108e-05, "loss": 1.6484, "step": 2387 }, { "epoch": 1.5488892492297714, "grad_norm": 3.506962299346924, "learning_rate": 2.4248875783837987e-05, "loss": 1.5916, "step": 2388 }, { "epoch": 1.5495378628182261, "grad_norm": 3.58309268951416, "learning_rate": 2.4182197685145858e-05, "loss": 1.6675, "step": 2389 }, { "epoch": 1.5501864764066808, "grad_norm": 3.1961729526519775, "learning_rate": 2.4115598775146353e-05, "loss": 1.4959, "step": 2390 }, { "epoch": 1.5508350899951355, "grad_norm": 3.2499399185180664, "learning_rate": 2.4049079123399364e-05, "loss": 1.3962, "step": 2391 }, { "epoch": 1.5514837035835902, "grad_norm": 3.4459726810455322, "learning_rate": 2.3982638799382072e-05, "loss": 1.6871, "step": 2392 }, { "epoch": 1.552132317172045, "grad_norm": 3.6818599700927734, "learning_rate": 2.3916277872488714e-05, "loss": 1.8426, "step": 2393 }, { "epoch": 1.5527809307604994, "grad_norm": 3.3071799278259277, "learning_rate": 2.3849996412030683e-05, "loss": 1.4536, "step": 2394 }, { "epoch": 1.553429544348954, "grad_norm": 3.6428110599517822, "learning_rate": 2.3783794487236365e-05, "loss": 1.4551, "step": 2395 }, { "epoch": 1.5540781579374088, "grad_norm": 4.76463508605957, "learning_rate": 2.3717672167250994e-05, "loss": 2.0648, "step": 2396 }, { "epoch": 1.5547267715258635, "grad_norm": 4.521492004394531, "learning_rate": 2.3651629521136775e-05, "loss": 1.6332, "step": 2397 }, { "epoch": 1.5553753851143182, "grad_norm": 4.671876430511475, "learning_rate": 2.358566661787257e-05, "loss": 1.8109, "step": 2398 }, { "epoch": 1.5560239987027729, "grad_norm": 4.368535041809082, "learning_rate": 2.3519783526354088e-05, "loss": 1.9099, "step": 2399 }, { "epoch": 1.5566726122912276, "grad_norm": 4.65513801574707, "learning_rate": 2.3453980315393543e-05, "loss": 2.0395, "step": 2400 }, { "epoch": 1.5566726122912276, "eval_loss": 1.8830896615982056, "eval_runtime": 35.0699, "eval_samples_per_second": 58.626, "eval_steps_per_second": 14.656, "step": 2400 }, { "epoch": 1.557321225879682, "grad_norm": 3.737138032913208, "learning_rate": 2.338825705371982e-05, "loss": 1.2678, "step": 2401 }, { "epoch": 1.5579698394681367, "grad_norm": 3.6911299228668213, "learning_rate": 2.3322613809978256e-05, "loss": 1.8103, "step": 2402 }, { "epoch": 1.5586184530565914, "grad_norm": 3.982001781463623, "learning_rate": 2.3257050652730628e-05, "loss": 1.7312, "step": 2403 }, { "epoch": 1.559267066645046, "grad_norm": 3.526353120803833, "learning_rate": 2.3191567650455082e-05, "loss": 1.4309, "step": 2404 }, { "epoch": 1.5599156802335008, "grad_norm": 4.454493999481201, "learning_rate": 2.3126164871545963e-05, "loss": 1.9946, "step": 2405 }, { "epoch": 1.5605642938219555, "grad_norm": 4.153310775756836, "learning_rate": 2.306084238431394e-05, "loss": 1.424, "step": 2406 }, { "epoch": 1.5612129074104102, "grad_norm": 4.284578800201416, "learning_rate": 2.2995600256985717e-05, "loss": 1.6181, "step": 2407 }, { "epoch": 1.5618615209988649, "grad_norm": 3.964031934738159, "learning_rate": 2.293043855770416e-05, "loss": 1.6876, "step": 2408 }, { "epoch": 1.5625101345873196, "grad_norm": 3.7495861053466797, "learning_rate": 2.2865357354528038e-05, "loss": 1.5812, "step": 2409 }, { "epoch": 1.5631587481757743, "grad_norm": 3.6319618225097656, "learning_rate": 2.2800356715432104e-05, "loss": 1.5106, "step": 2410 }, { "epoch": 1.563807361764229, "grad_norm": 3.8865973949432373, "learning_rate": 2.273543670830699e-05, "loss": 1.345, "step": 2411 }, { "epoch": 1.5644559753526837, "grad_norm": 3.627671718597412, "learning_rate": 2.2670597400959005e-05, "loss": 1.678, "step": 2412 }, { "epoch": 1.5651045889411384, "grad_norm": 4.029452323913574, "learning_rate": 2.2605838861110317e-05, "loss": 1.6506, "step": 2413 }, { "epoch": 1.565753202529593, "grad_norm": 3.1634202003479004, "learning_rate": 2.2541161156398583e-05, "loss": 1.8153, "step": 2414 }, { "epoch": 1.5664018161180477, "grad_norm": 3.534687042236328, "learning_rate": 2.247656435437716e-05, "loss": 1.5676, "step": 2415 }, { "epoch": 1.5670504297065024, "grad_norm": 3.1331491470336914, "learning_rate": 2.2412048522514804e-05, "loss": 1.4642, "step": 2416 }, { "epoch": 1.5676990432949571, "grad_norm": 4.084421634674072, "learning_rate": 2.234761372819577e-05, "loss": 1.7161, "step": 2417 }, { "epoch": 1.5683476568834118, "grad_norm": 4.24366569519043, "learning_rate": 2.2283260038719646e-05, "loss": 1.7975, "step": 2418 }, { "epoch": 1.5689962704718665, "grad_norm": 4.248519420623779, "learning_rate": 2.2218987521301314e-05, "loss": 1.6336, "step": 2419 }, { "epoch": 1.5696448840603212, "grad_norm": 3.8891372680664062, "learning_rate": 2.215479624307092e-05, "loss": 1.6613, "step": 2420 }, { "epoch": 1.570293497648776, "grad_norm": 3.5946707725524902, "learning_rate": 2.2090686271073623e-05, "loss": 1.4469, "step": 2421 }, { "epoch": 1.5709421112372304, "grad_norm": 3.603620767593384, "learning_rate": 2.2026657672269836e-05, "loss": 1.4549, "step": 2422 }, { "epoch": 1.571590724825685, "grad_norm": 4.1350932121276855, "learning_rate": 2.1962710513534822e-05, "loss": 1.8215, "step": 2423 }, { "epoch": 1.5722393384141398, "grad_norm": 3.595306634902954, "learning_rate": 2.1898844861658917e-05, "loss": 1.4649, "step": 2424 }, { "epoch": 1.5728879520025945, "grad_norm": 4.014327526092529, "learning_rate": 2.1835060783347204e-05, "loss": 1.5964, "step": 2425 }, { "epoch": 1.5735365655910492, "grad_norm": 3.9278504848480225, "learning_rate": 2.1771358345219673e-05, "loss": 1.896, "step": 2426 }, { "epoch": 1.5741851791795038, "grad_norm": 3.2365915775299072, "learning_rate": 2.1707737613811007e-05, "loss": 1.5712, "step": 2427 }, { "epoch": 1.5748337927679585, "grad_norm": 2.767158269882202, "learning_rate": 2.1644198655570504e-05, "loss": 1.3234, "step": 2428 }, { "epoch": 1.575482406356413, "grad_norm": 4.471320629119873, "learning_rate": 2.1580741536862127e-05, "loss": 1.823, "step": 2429 }, { "epoch": 1.5761310199448677, "grad_norm": 3.4796979427337646, "learning_rate": 2.1517366323964283e-05, "loss": 1.2217, "step": 2430 }, { "epoch": 1.5767796335333224, "grad_norm": 3.4873430728912354, "learning_rate": 2.145407308306988e-05, "loss": 1.5245, "step": 2431 }, { "epoch": 1.577428247121777, "grad_norm": 3.757344961166382, "learning_rate": 2.139086188028623e-05, "loss": 1.5943, "step": 2432 }, { "epoch": 1.5780768607102318, "grad_norm": 3.618011236190796, "learning_rate": 2.1327732781634946e-05, "loss": 1.4303, "step": 2433 }, { "epoch": 1.5787254742986865, "grad_norm": 3.446916341781616, "learning_rate": 2.1264685853051803e-05, "loss": 1.5439, "step": 2434 }, { "epoch": 1.5793740878871412, "grad_norm": 4.3068366050720215, "learning_rate": 2.1201721160386866e-05, "loss": 1.8074, "step": 2435 }, { "epoch": 1.5800227014755959, "grad_norm": 3.950576066970825, "learning_rate": 2.11388387694043e-05, "loss": 1.5898, "step": 2436 }, { "epoch": 1.5806713150640506, "grad_norm": 3.2711849212646484, "learning_rate": 2.107603874578219e-05, "loss": 1.5258, "step": 2437 }, { "epoch": 1.5813199286525053, "grad_norm": 3.6027162075042725, "learning_rate": 2.1013321155112754e-05, "loss": 1.4562, "step": 2438 }, { "epoch": 1.58196854224096, "grad_norm": 4.601603031158447, "learning_rate": 2.095068606290196e-05, "loss": 1.8017, "step": 2439 }, { "epoch": 1.5826171558294146, "grad_norm": 4.1301045417785645, "learning_rate": 2.088813353456974e-05, "loss": 1.6224, "step": 2440 }, { "epoch": 1.5832657694178693, "grad_norm": 4.207179546356201, "learning_rate": 2.08256636354497e-05, "loss": 1.9334, "step": 2441 }, { "epoch": 1.583914383006324, "grad_norm": 4.3391947746276855, "learning_rate": 2.076327643078917e-05, "loss": 1.9571, "step": 2442 }, { "epoch": 1.5845629965947787, "grad_norm": 3.9490466117858887, "learning_rate": 2.0700971985749163e-05, "loss": 1.5645, "step": 2443 }, { "epoch": 1.5852116101832334, "grad_norm": 3.089162588119507, "learning_rate": 2.0638750365404148e-05, "loss": 1.3657, "step": 2444 }, { "epoch": 1.5858602237716881, "grad_norm": 4.410218715667725, "learning_rate": 2.057661163474216e-05, "loss": 1.7955, "step": 2445 }, { "epoch": 1.5865088373601428, "grad_norm": 4.010253429412842, "learning_rate": 2.0514555858664663e-05, "loss": 1.4708, "step": 2446 }, { "epoch": 1.5871574509485975, "grad_norm": 3.807661771774292, "learning_rate": 2.0452583101986468e-05, "loss": 1.5989, "step": 2447 }, { "epoch": 1.5878060645370522, "grad_norm": 4.1416730880737305, "learning_rate": 2.0390693429435627e-05, "loss": 1.9329, "step": 2448 }, { "epoch": 1.588454678125507, "grad_norm": 3.4773240089416504, "learning_rate": 2.0328886905653488e-05, "loss": 1.397, "step": 2449 }, { "epoch": 1.5891032917139614, "grad_norm": 3.6085777282714844, "learning_rate": 2.026716359519447e-05, "loss": 1.4052, "step": 2450 }, { "epoch": 1.589751905302416, "grad_norm": 3.738893747329712, "learning_rate": 2.0205523562526162e-05, "loss": 1.6958, "step": 2451 }, { "epoch": 1.5904005188908708, "grad_norm": 3.561540126800537, "learning_rate": 2.0143966872029163e-05, "loss": 1.5096, "step": 2452 }, { "epoch": 1.5910491324793254, "grad_norm": 4.80440092086792, "learning_rate": 2.008249358799693e-05, "loss": 1.9699, "step": 2453 }, { "epoch": 1.5916977460677801, "grad_norm": 4.6924004554748535, "learning_rate": 2.0021103774635952e-05, "loss": 1.5564, "step": 2454 }, { "epoch": 1.5923463596562348, "grad_norm": 3.9652812480926514, "learning_rate": 1.995979749606538e-05, "loss": 1.6335, "step": 2455 }, { "epoch": 1.5929949732446895, "grad_norm": 3.9821746349334717, "learning_rate": 1.989857481631727e-05, "loss": 1.517, "step": 2456 }, { "epoch": 1.593643586833144, "grad_norm": 3.9635255336761475, "learning_rate": 1.9837435799336224e-05, "loss": 1.5405, "step": 2457 }, { "epoch": 1.5942922004215987, "grad_norm": 3.3529088497161865, "learning_rate": 1.977638050897954e-05, "loss": 1.3552, "step": 2458 }, { "epoch": 1.5949408140100534, "grad_norm": 4.252659797668457, "learning_rate": 1.9715409009017105e-05, "loss": 1.6276, "step": 2459 }, { "epoch": 1.595589427598508, "grad_norm": 3.995776653289795, "learning_rate": 1.965452136313113e-05, "loss": 1.7081, "step": 2460 }, { "epoch": 1.5962380411869628, "grad_norm": 3.449315309524536, "learning_rate": 1.9593717634916475e-05, "loss": 1.721, "step": 2461 }, { "epoch": 1.5968866547754175, "grad_norm": 4.511311054229736, "learning_rate": 1.9532997887880135e-05, "loss": 1.6067, "step": 2462 }, { "epoch": 1.5975352683638722, "grad_norm": 4.121259689331055, "learning_rate": 1.947236218544154e-05, "loss": 1.6672, "step": 2463 }, { "epoch": 1.5981838819523269, "grad_norm": 3.5367753505706787, "learning_rate": 1.941181059093222e-05, "loss": 1.5568, "step": 2464 }, { "epoch": 1.5988324955407816, "grad_norm": 4.245017051696777, "learning_rate": 1.9351343167595992e-05, "loss": 1.6451, "step": 2465 }, { "epoch": 1.5994811091292362, "grad_norm": 4.779149055480957, "learning_rate": 1.929095997858861e-05, "loss": 1.7535, "step": 2466 }, { "epoch": 1.600129722717691, "grad_norm": 3.014092206954956, "learning_rate": 1.9230661086977963e-05, "loss": 1.3133, "step": 2467 }, { "epoch": 1.6007783363061456, "grad_norm": 4.39810848236084, "learning_rate": 1.917044655574387e-05, "loss": 2.1199, "step": 2468 }, { "epoch": 1.6014269498946003, "grad_norm": 3.506082773208618, "learning_rate": 1.9110316447777987e-05, "loss": 1.4521, "step": 2469 }, { "epoch": 1.602075563483055, "grad_norm": 4.344029903411865, "learning_rate": 1.9050270825883877e-05, "loss": 1.8427, "step": 2470 }, { "epoch": 1.6027241770715097, "grad_norm": 4.6020731925964355, "learning_rate": 1.899030975277676e-05, "loss": 2.0465, "step": 2471 }, { "epoch": 1.6033727906599644, "grad_norm": 3.9321534633636475, "learning_rate": 1.893043329108365e-05, "loss": 1.4024, "step": 2472 }, { "epoch": 1.604021404248419, "grad_norm": 4.427879333496094, "learning_rate": 1.8870641503343102e-05, "loss": 1.8526, "step": 2473 }, { "epoch": 1.6046700178368738, "grad_norm": 3.955268383026123, "learning_rate": 1.881093445200529e-05, "loss": 1.6716, "step": 2474 }, { "epoch": 1.6053186314253285, "grad_norm": 3.335338592529297, "learning_rate": 1.875131219943187e-05, "loss": 1.3222, "step": 2475 }, { "epoch": 1.6059672450137832, "grad_norm": 3.904433012008667, "learning_rate": 1.8691774807895914e-05, "loss": 1.6327, "step": 2476 }, { "epoch": 1.6066158586022379, "grad_norm": 4.228398323059082, "learning_rate": 1.863232233958191e-05, "loss": 1.7878, "step": 2477 }, { "epoch": 1.6072644721906924, "grad_norm": 3.2232577800750732, "learning_rate": 1.8572954856585535e-05, "loss": 1.3356, "step": 2478 }, { "epoch": 1.607913085779147, "grad_norm": 3.855698823928833, "learning_rate": 1.851367242091384e-05, "loss": 1.4704, "step": 2479 }, { "epoch": 1.6085616993676017, "grad_norm": 3.4431252479553223, "learning_rate": 1.8454475094484935e-05, "loss": 1.6174, "step": 2480 }, { "epoch": 1.6092103129560564, "grad_norm": 3.8528337478637695, "learning_rate": 1.8395362939128125e-05, "loss": 1.8742, "step": 2481 }, { "epoch": 1.6098589265445111, "grad_norm": 3.2518532276153564, "learning_rate": 1.8336336016583666e-05, "loss": 1.5104, "step": 2482 }, { "epoch": 1.6105075401329658, "grad_norm": 3.799945831298828, "learning_rate": 1.827739438850288e-05, "loss": 1.5691, "step": 2483 }, { "epoch": 1.6111561537214205, "grad_norm": 3.745877265930176, "learning_rate": 1.8218538116447958e-05, "loss": 1.4814, "step": 2484 }, { "epoch": 1.611804767309875, "grad_norm": 3.372673749923706, "learning_rate": 1.8159767261891937e-05, "loss": 1.4944, "step": 2485 }, { "epoch": 1.6124533808983297, "grad_norm": 3.3720028400421143, "learning_rate": 1.8101081886218663e-05, "loss": 1.5063, "step": 2486 }, { "epoch": 1.6131019944867844, "grad_norm": 3.5248827934265137, "learning_rate": 1.8042482050722653e-05, "loss": 1.6946, "step": 2487 }, { "epoch": 1.613750608075239, "grad_norm": 4.051091194152832, "learning_rate": 1.798396781660914e-05, "loss": 1.5956, "step": 2488 }, { "epoch": 1.6143992216636938, "grad_norm": 3.970662832260132, "learning_rate": 1.7925539244993915e-05, "loss": 2.0569, "step": 2489 }, { "epoch": 1.6150478352521485, "grad_norm": 4.228386402130127, "learning_rate": 1.7867196396903352e-05, "loss": 1.3472, "step": 2490 }, { "epoch": 1.6156964488406031, "grad_norm": 3.9117839336395264, "learning_rate": 1.7808939333274178e-05, "loss": 1.3907, "step": 2491 }, { "epoch": 1.6163450624290578, "grad_norm": 3.5734810829162598, "learning_rate": 1.7750768114953618e-05, "loss": 1.5225, "step": 2492 }, { "epoch": 1.6169936760175125, "grad_norm": 3.700122594833374, "learning_rate": 1.7692682802699234e-05, "loss": 1.5707, "step": 2493 }, { "epoch": 1.6176422896059672, "grad_norm": 3.6150190830230713, "learning_rate": 1.7634683457178792e-05, "loss": 1.579, "step": 2494 }, { "epoch": 1.618290903194422, "grad_norm": 3.6471548080444336, "learning_rate": 1.7576770138970343e-05, "loss": 1.4838, "step": 2495 }, { "epoch": 1.6189395167828766, "grad_norm": 4.11499547958374, "learning_rate": 1.751894290856203e-05, "loss": 1.8211, "step": 2496 }, { "epoch": 1.6195881303713313, "grad_norm": 3.5359299182891846, "learning_rate": 1.7461201826352124e-05, "loss": 1.5844, "step": 2497 }, { "epoch": 1.620236743959786, "grad_norm": 3.7360482215881348, "learning_rate": 1.7403546952648885e-05, "loss": 1.3751, "step": 2498 }, { "epoch": 1.6208853575482407, "grad_norm": 3.57149338722229, "learning_rate": 1.7345978347670544e-05, "loss": 1.4625, "step": 2499 }, { "epoch": 1.6215339711366954, "grad_norm": 4.885515213012695, "learning_rate": 1.7288496071545256e-05, "loss": 1.7951, "step": 2500 }, { "epoch": 1.6215339711366954, "eval_loss": 1.8597742319107056, "eval_runtime": 35.0374, "eval_samples_per_second": 58.68, "eval_steps_per_second": 14.67, "step": 2500 }, { "epoch": 1.62218258472515, "grad_norm": 3.838585376739502, "learning_rate": 1.7231100184310956e-05, "loss": 1.7625, "step": 2501 }, { "epoch": 1.6228311983136048, "grad_norm": 3.418085813522339, "learning_rate": 1.7173790745915398e-05, "loss": 1.4235, "step": 2502 }, { "epoch": 1.6234798119020595, "grad_norm": 4.029057502746582, "learning_rate": 1.7116567816215955e-05, "loss": 1.5914, "step": 2503 }, { "epoch": 1.6241284254905142, "grad_norm": 3.9772164821624756, "learning_rate": 1.7059431454979824e-05, "loss": 1.6713, "step": 2504 }, { "epoch": 1.6247770390789689, "grad_norm": 4.427797317504883, "learning_rate": 1.700238172188359e-05, "loss": 1.4003, "step": 2505 }, { "epoch": 1.6254256526674233, "grad_norm": 3.819469928741455, "learning_rate": 1.694541867651348e-05, "loss": 1.4897, "step": 2506 }, { "epoch": 1.626074266255878, "grad_norm": 4.537180423736572, "learning_rate": 1.6888542378365092e-05, "loss": 2.0955, "step": 2507 }, { "epoch": 1.6267228798443327, "grad_norm": 4.369129657745361, "learning_rate": 1.6831752886843512e-05, "loss": 1.9953, "step": 2508 }, { "epoch": 1.6273714934327874, "grad_norm": 3.277195930480957, "learning_rate": 1.6775050261263116e-05, "loss": 1.2, "step": 2509 }, { "epoch": 1.6280201070212421, "grad_norm": 3.6796250343322754, "learning_rate": 1.6718434560847506e-05, "loss": 1.7338, "step": 2510 }, { "epoch": 1.6286687206096968, "grad_norm": 3.9466397762298584, "learning_rate": 1.6661905844729608e-05, "loss": 1.6724, "step": 2511 }, { "epoch": 1.6293173341981515, "grad_norm": 3.536290407180786, "learning_rate": 1.6605464171951358e-05, "loss": 1.6365, "step": 2512 }, { "epoch": 1.629965947786606, "grad_norm": 4.062119483947754, "learning_rate": 1.6549109601463908e-05, "loss": 1.5393, "step": 2513 }, { "epoch": 1.6306145613750607, "grad_norm": 4.492953300476074, "learning_rate": 1.6492842192127324e-05, "loss": 1.7247, "step": 2514 }, { "epoch": 1.6312631749635154, "grad_norm": 3.8087146282196045, "learning_rate": 1.64366620027107e-05, "loss": 1.7503, "step": 2515 }, { "epoch": 1.63191178855197, "grad_norm": 3.3403923511505127, "learning_rate": 1.6380569091892062e-05, "loss": 1.4592, "step": 2516 }, { "epoch": 1.6325604021404247, "grad_norm": 3.538494110107422, "learning_rate": 1.632456351825816e-05, "loss": 1.8507, "step": 2517 }, { "epoch": 1.6332090157288794, "grad_norm": 3.6101748943328857, "learning_rate": 1.626864534030469e-05, "loss": 1.4374, "step": 2518 }, { "epoch": 1.6338576293173341, "grad_norm": 3.148545265197754, "learning_rate": 1.621281461643589e-05, "loss": 1.2234, "step": 2519 }, { "epoch": 1.6345062429057888, "grad_norm": 3.4423186779022217, "learning_rate": 1.6157071404964808e-05, "loss": 1.4434, "step": 2520 }, { "epoch": 1.6351548564942435, "grad_norm": 3.8528356552124023, "learning_rate": 1.610141576411296e-05, "loss": 1.7241, "step": 2521 }, { "epoch": 1.6358034700826982, "grad_norm": 3.990858316421509, "learning_rate": 1.6045847752010525e-05, "loss": 1.3552, "step": 2522 }, { "epoch": 1.636452083671153, "grad_norm": 4.455612659454346, "learning_rate": 1.5990367426696017e-05, "loss": 1.6761, "step": 2523 }, { "epoch": 1.6371006972596076, "grad_norm": 4.120962619781494, "learning_rate": 1.593497484611649e-05, "loss": 1.9268, "step": 2524 }, { "epoch": 1.6377493108480623, "grad_norm": 3.3944661617279053, "learning_rate": 1.5879670068127305e-05, "loss": 1.51, "step": 2525 }, { "epoch": 1.638397924436517, "grad_norm": 2.934962272644043, "learning_rate": 1.582445315049208e-05, "loss": 1.5863, "step": 2526 }, { "epoch": 1.6390465380249717, "grad_norm": 3.3539350032806396, "learning_rate": 1.5769324150882735e-05, "loss": 1.5283, "step": 2527 }, { "epoch": 1.6396951516134264, "grad_norm": 3.5773351192474365, "learning_rate": 1.571428312687928e-05, "loss": 1.4051, "step": 2528 }, { "epoch": 1.640343765201881, "grad_norm": 4.061117172241211, "learning_rate": 1.565933013596994e-05, "loss": 1.7975, "step": 2529 }, { "epoch": 1.6409923787903358, "grad_norm": 3.318239212036133, "learning_rate": 1.560446523555089e-05, "loss": 1.4807, "step": 2530 }, { "epoch": 1.6416409923787905, "grad_norm": 3.7695717811584473, "learning_rate": 1.5549688482926374e-05, "loss": 1.4743, "step": 2531 }, { "epoch": 1.6422896059672452, "grad_norm": 4.578890800476074, "learning_rate": 1.549499993530853e-05, "loss": 1.8317, "step": 2532 }, { "epoch": 1.6429382195556999, "grad_norm": 3.357616901397705, "learning_rate": 1.5440399649817385e-05, "loss": 1.5413, "step": 2533 }, { "epoch": 1.6435868331441543, "grad_norm": 3.6430342197418213, "learning_rate": 1.5385887683480794e-05, "loss": 1.4893, "step": 2534 }, { "epoch": 1.644235446732609, "grad_norm": 4.464782238006592, "learning_rate": 1.5331464093234314e-05, "loss": 1.6323, "step": 2535 }, { "epoch": 1.6448840603210637, "grad_norm": 4.207960605621338, "learning_rate": 1.527712893592127e-05, "loss": 1.5359, "step": 2536 }, { "epoch": 1.6455326739095184, "grad_norm": 4.160396099090576, "learning_rate": 1.5222882268292538e-05, "loss": 1.6815, "step": 2537 }, { "epoch": 1.646181287497973, "grad_norm": 3.4661993980407715, "learning_rate": 1.5168724147006652e-05, "loss": 1.712, "step": 2538 }, { "epoch": 1.6468299010864278, "grad_norm": 4.36069393157959, "learning_rate": 1.5114654628629576e-05, "loss": 1.684, "step": 2539 }, { "epoch": 1.6474785146748825, "grad_norm": 4.508609294891357, "learning_rate": 1.5060673769634825e-05, "loss": 1.6561, "step": 2540 }, { "epoch": 1.648127128263337, "grad_norm": 4.050048828125, "learning_rate": 1.5006781626403289e-05, "loss": 1.6386, "step": 2541 }, { "epoch": 1.6487757418517917, "grad_norm": 4.923355579376221, "learning_rate": 1.4952978255223104e-05, "loss": 2.0467, "step": 2542 }, { "epoch": 1.6494243554402463, "grad_norm": 4.099915027618408, "learning_rate": 1.4899263712289835e-05, "loss": 1.5092, "step": 2543 }, { "epoch": 1.650072969028701, "grad_norm": 3.8637373447418213, "learning_rate": 1.4845638053706146e-05, "loss": 1.5402, "step": 2544 }, { "epoch": 1.6507215826171557, "grad_norm": 4.021983623504639, "learning_rate": 1.4792101335481945e-05, "loss": 1.7063, "step": 2545 }, { "epoch": 1.6513701962056104, "grad_norm": 3.7282865047454834, "learning_rate": 1.473865361353416e-05, "loss": 1.3386, "step": 2546 }, { "epoch": 1.6520188097940651, "grad_norm": 4.368015766143799, "learning_rate": 1.4685294943686911e-05, "loss": 1.6597, "step": 2547 }, { "epoch": 1.6526674233825198, "grad_norm": 3.759490966796875, "learning_rate": 1.4632025381671133e-05, "loss": 1.8021, "step": 2548 }, { "epoch": 1.6533160369709745, "grad_norm": 3.5990772247314453, "learning_rate": 1.4578844983124818e-05, "loss": 1.4853, "step": 2549 }, { "epoch": 1.6539646505594292, "grad_norm": 3.760692834854126, "learning_rate": 1.45257538035928e-05, "loss": 1.5397, "step": 2550 }, { "epoch": 1.654613264147884, "grad_norm": 4.3784589767456055, "learning_rate": 1.447275189852666e-05, "loss": 1.6037, "step": 2551 }, { "epoch": 1.6552618777363386, "grad_norm": 3.6675422191619873, "learning_rate": 1.4419839323284844e-05, "loss": 1.4253, "step": 2552 }, { "epoch": 1.6559104913247933, "grad_norm": 3.7725980281829834, "learning_rate": 1.4367016133132394e-05, "loss": 1.5519, "step": 2553 }, { "epoch": 1.656559104913248, "grad_norm": 3.784635543823242, "learning_rate": 1.4314282383241096e-05, "loss": 1.3483, "step": 2554 }, { "epoch": 1.6572077185017027, "grad_norm": 3.4882090091705322, "learning_rate": 1.4261638128689204e-05, "loss": 1.3864, "step": 2555 }, { "epoch": 1.6578563320901574, "grad_norm": 3.91473126411438, "learning_rate": 1.4209083424461577e-05, "loss": 1.4945, "step": 2556 }, { "epoch": 1.658504945678612, "grad_norm": 2.969473361968994, "learning_rate": 1.4156618325449567e-05, "loss": 1.4067, "step": 2557 }, { "epoch": 1.6591535592670668, "grad_norm": 3.4114830493927, "learning_rate": 1.4104242886450824e-05, "loss": 1.5373, "step": 2558 }, { "epoch": 1.6598021728555215, "grad_norm": 3.417128324508667, "learning_rate": 1.4051957162169482e-05, "loss": 1.3506, "step": 2559 }, { "epoch": 1.6604507864439761, "grad_norm": 3.2905731201171875, "learning_rate": 1.3999761207215834e-05, "loss": 1.3987, "step": 2560 }, { "epoch": 1.6610994000324308, "grad_norm": 3.9280331134796143, "learning_rate": 1.394765507610658e-05, "loss": 1.6221, "step": 2561 }, { "epoch": 1.6617480136208853, "grad_norm": 4.317049503326416, "learning_rate": 1.3895638823264446e-05, "loss": 1.6635, "step": 2562 }, { "epoch": 1.66239662720934, "grad_norm": 3.6965043544769287, "learning_rate": 1.3843712503018392e-05, "loss": 1.5509, "step": 2563 }, { "epoch": 1.6630452407977947, "grad_norm": 3.5482099056243896, "learning_rate": 1.3791876169603357e-05, "loss": 1.4264, "step": 2564 }, { "epoch": 1.6636938543862494, "grad_norm": 3.816452980041504, "learning_rate": 1.3740129877160358e-05, "loss": 1.6647, "step": 2565 }, { "epoch": 1.664342467974704, "grad_norm": 4.050480365753174, "learning_rate": 1.368847367973638e-05, "loss": 1.4551, "step": 2566 }, { "epoch": 1.6649910815631588, "grad_norm": 4.2461066246032715, "learning_rate": 1.3636907631284224e-05, "loss": 1.6108, "step": 2567 }, { "epoch": 1.6656396951516135, "grad_norm": 3.7157883644104004, "learning_rate": 1.3585431785662627e-05, "loss": 1.5537, "step": 2568 }, { "epoch": 1.666288308740068, "grad_norm": 3.3504509925842285, "learning_rate": 1.3534046196636041e-05, "loss": 1.4025, "step": 2569 }, { "epoch": 1.6669369223285226, "grad_norm": 4.687771320343018, "learning_rate": 1.348275091787471e-05, "loss": 1.7712, "step": 2570 }, { "epoch": 1.6675855359169773, "grad_norm": 3.584503412246704, "learning_rate": 1.3431546002954487e-05, "loss": 1.4136, "step": 2571 }, { "epoch": 1.668234149505432, "grad_norm": 3.52327823638916, "learning_rate": 1.3380431505356905e-05, "loss": 1.3847, "step": 2572 }, { "epoch": 1.6688827630938867, "grad_norm": 4.815456390380859, "learning_rate": 1.3329407478469036e-05, "loss": 1.8604, "step": 2573 }, { "epoch": 1.6695313766823414, "grad_norm": 4.439941883087158, "learning_rate": 1.3278473975583417e-05, "loss": 1.7302, "step": 2574 }, { "epoch": 1.670179990270796, "grad_norm": 3.1460890769958496, "learning_rate": 1.3227631049898115e-05, "loss": 1.5178, "step": 2575 }, { "epoch": 1.6708286038592508, "grad_norm": 4.197859764099121, "learning_rate": 1.317687875451653e-05, "loss": 1.7153, "step": 2576 }, { "epoch": 1.6714772174477055, "grad_norm": 4.040067672729492, "learning_rate": 1.3126217142447462e-05, "loss": 1.2796, "step": 2577 }, { "epoch": 1.6721258310361602, "grad_norm": 3.2873566150665283, "learning_rate": 1.3075646266604913e-05, "loss": 1.3712, "step": 2578 }, { "epoch": 1.6727744446246149, "grad_norm": 4.139466762542725, "learning_rate": 1.3025166179808202e-05, "loss": 1.7301, "step": 2579 }, { "epoch": 1.6734230582130696, "grad_norm": 4.303050518035889, "learning_rate": 1.297477693478173e-05, "loss": 1.7653, "step": 2580 }, { "epoch": 1.6740716718015243, "grad_norm": 3.396998405456543, "learning_rate": 1.2924478584155098e-05, "loss": 1.523, "step": 2581 }, { "epoch": 1.674720285389979, "grad_norm": 3.4838624000549316, "learning_rate": 1.2874271180462972e-05, "loss": 1.475, "step": 2582 }, { "epoch": 1.6753688989784337, "grad_norm": 4.4181084632873535, "learning_rate": 1.2824154776144937e-05, "loss": 1.6512, "step": 2583 }, { "epoch": 1.6760175125668884, "grad_norm": 4.846231460571289, "learning_rate": 1.2774129423545633e-05, "loss": 1.6165, "step": 2584 }, { "epoch": 1.676666126155343, "grad_norm": 3.3758156299591064, "learning_rate": 1.2724195174914544e-05, "loss": 1.5059, "step": 2585 }, { "epoch": 1.6773147397437977, "grad_norm": 3.7439541816711426, "learning_rate": 1.2674352082406039e-05, "loss": 1.5947, "step": 2586 }, { "epoch": 1.6779633533322524, "grad_norm": 3.379343032836914, "learning_rate": 1.2624600198079206e-05, "loss": 1.2824, "step": 2587 }, { "epoch": 1.6786119669207071, "grad_norm": 4.223423004150391, "learning_rate": 1.257493957389796e-05, "loss": 1.4793, "step": 2588 }, { "epoch": 1.6792605805091618, "grad_norm": 4.197158336639404, "learning_rate": 1.252537026173084e-05, "loss": 1.6789, "step": 2589 }, { "epoch": 1.6799091940976163, "grad_norm": 3.1581029891967773, "learning_rate": 1.2475892313351035e-05, "loss": 1.4969, "step": 2590 }, { "epoch": 1.680557807686071, "grad_norm": 4.130486488342285, "learning_rate": 1.2426505780436326e-05, "loss": 1.5531, "step": 2591 }, { "epoch": 1.6812064212745257, "grad_norm": 4.139348983764648, "learning_rate": 1.2377210714568944e-05, "loss": 1.377, "step": 2592 }, { "epoch": 1.6818550348629804, "grad_norm": 4.147006511688232, "learning_rate": 1.2328007167235678e-05, "loss": 1.5007, "step": 2593 }, { "epoch": 1.682503648451435, "grad_norm": 4.634511470794678, "learning_rate": 1.2278895189827644e-05, "loss": 1.9015, "step": 2594 }, { "epoch": 1.6831522620398898, "grad_norm": 3.7980401515960693, "learning_rate": 1.222987483364041e-05, "loss": 1.4498, "step": 2595 }, { "epoch": 1.6838008756283445, "grad_norm": 4.201277256011963, "learning_rate": 1.218094614987374e-05, "loss": 1.5587, "step": 2596 }, { "epoch": 1.684449489216799, "grad_norm": 4.34132194519043, "learning_rate": 1.2132109189631724e-05, "loss": 1.8599, "step": 2597 }, { "epoch": 1.6850981028052536, "grad_norm": 4.639657974243164, "learning_rate": 1.208336400392268e-05, "loss": 1.5991, "step": 2598 }, { "epoch": 1.6857467163937083, "grad_norm": 4.19991397857666, "learning_rate": 1.2034710643658953e-05, "loss": 1.9065, "step": 2599 }, { "epoch": 1.686395329982163, "grad_norm": 4.338680267333984, "learning_rate": 1.1986149159657113e-05, "loss": 1.5655, "step": 2600 }, { "epoch": 1.686395329982163, "eval_loss": 1.8418598175048828, "eval_runtime": 35.1346, "eval_samples_per_second": 58.518, "eval_steps_per_second": 14.629, "step": 2600 }, { "epoch": 1.6870439435706177, "grad_norm": 4.300708770751953, "learning_rate": 1.1937679602637652e-05, "loss": 1.6837, "step": 2601 }, { "epoch": 1.6876925571590724, "grad_norm": 6.012038707733154, "learning_rate": 1.1889302023225158e-05, "loss": 1.8445, "step": 2602 }, { "epoch": 1.688341170747527, "grad_norm": 4.507540225982666, "learning_rate": 1.184101647194803e-05, "loss": 1.4588, "step": 2603 }, { "epoch": 1.6889897843359818, "grad_norm": 4.020684719085693, "learning_rate": 1.1792822999238685e-05, "loss": 1.6297, "step": 2604 }, { "epoch": 1.6896383979244365, "grad_norm": 3.000462293624878, "learning_rate": 1.1744721655433255e-05, "loss": 1.3941, "step": 2605 }, { "epoch": 1.6902870115128912, "grad_norm": 3.6341302394866943, "learning_rate": 1.16967124907717e-05, "loss": 1.5157, "step": 2606 }, { "epoch": 1.6909356251013459, "grad_norm": 3.717895746231079, "learning_rate": 1.1648795555397719e-05, "loss": 1.5356, "step": 2607 }, { "epoch": 1.6915842386898006, "grad_norm": 4.225353240966797, "learning_rate": 1.1600970899358588e-05, "loss": 1.6289, "step": 2608 }, { "epoch": 1.6922328522782553, "grad_norm": 4.336716175079346, "learning_rate": 1.155323857260534e-05, "loss": 1.6448, "step": 2609 }, { "epoch": 1.69288146586671, "grad_norm": 3.835988998413086, "learning_rate": 1.1505598624992442e-05, "loss": 1.4025, "step": 2610 }, { "epoch": 1.6935300794551646, "grad_norm": 3.512087821960449, "learning_rate": 1.145805110627799e-05, "loss": 1.3878, "step": 2611 }, { "epoch": 1.6941786930436193, "grad_norm": 3.964550733566284, "learning_rate": 1.1410596066123413e-05, "loss": 1.5992, "step": 2612 }, { "epoch": 1.694827306632074, "grad_norm": 4.985112190246582, "learning_rate": 1.1363233554093667e-05, "loss": 1.803, "step": 2613 }, { "epoch": 1.6954759202205287, "grad_norm": 4.148494243621826, "learning_rate": 1.1315963619657044e-05, "loss": 1.6521, "step": 2614 }, { "epoch": 1.6961245338089834, "grad_norm": 3.696659564971924, "learning_rate": 1.1268786312185053e-05, "loss": 1.3405, "step": 2615 }, { "epoch": 1.6967731473974381, "grad_norm": 4.742820739746094, "learning_rate": 1.1221701680952589e-05, "loss": 1.8169, "step": 2616 }, { "epoch": 1.6974217609858928, "grad_norm": 3.008798837661743, "learning_rate": 1.1174709775137637e-05, "loss": 1.3678, "step": 2617 }, { "epoch": 1.6980703745743473, "grad_norm": 4.038968086242676, "learning_rate": 1.1127810643821401e-05, "loss": 1.5457, "step": 2618 }, { "epoch": 1.698718988162802, "grad_norm": 4.360540866851807, "learning_rate": 1.1081004335988165e-05, "loss": 1.7292, "step": 2619 }, { "epoch": 1.6993676017512567, "grad_norm": 4.143523216247559, "learning_rate": 1.103429090052528e-05, "loss": 1.5325, "step": 2620 }, { "epoch": 1.7000162153397114, "grad_norm": 3.8044192790985107, "learning_rate": 1.0987670386223047e-05, "loss": 1.7413, "step": 2621 }, { "epoch": 1.700664828928166, "grad_norm": 3.9254343509674072, "learning_rate": 1.0941142841774776e-05, "loss": 1.4243, "step": 2622 }, { "epoch": 1.7013134425166208, "grad_norm": 3.72050404548645, "learning_rate": 1.0894708315776647e-05, "loss": 1.3721, "step": 2623 }, { "epoch": 1.7019620561050754, "grad_norm": 4.298069000244141, "learning_rate": 1.0848366856727654e-05, "loss": 1.8883, "step": 2624 }, { "epoch": 1.70261066969353, "grad_norm": 5.060550689697266, "learning_rate": 1.0802118513029647e-05, "loss": 1.7796, "step": 2625 }, { "epoch": 1.7032592832819846, "grad_norm": 3.3467857837677, "learning_rate": 1.0755963332987163e-05, "loss": 1.3085, "step": 2626 }, { "epoch": 1.7039078968704393, "grad_norm": 4.120709419250488, "learning_rate": 1.0709901364807495e-05, "loss": 1.6745, "step": 2627 }, { "epoch": 1.704556510458894, "grad_norm": 3.5117883682250977, "learning_rate": 1.0663932656600505e-05, "loss": 1.7402, "step": 2628 }, { "epoch": 1.7052051240473487, "grad_norm": 3.525233507156372, "learning_rate": 1.0618057256378711e-05, "loss": 1.3637, "step": 2629 }, { "epoch": 1.7058537376358034, "grad_norm": 4.2615814208984375, "learning_rate": 1.0572275212057158e-05, "loss": 1.8262, "step": 2630 }, { "epoch": 1.706502351224258, "grad_norm": 4.091869831085205, "learning_rate": 1.0526586571453356e-05, "loss": 1.6237, "step": 2631 }, { "epoch": 1.7071509648127128, "grad_norm": 3.2687876224517822, "learning_rate": 1.0480991382287298e-05, "loss": 1.4113, "step": 2632 }, { "epoch": 1.7077995784011675, "grad_norm": 3.579939842224121, "learning_rate": 1.0435489692181344e-05, "loss": 1.5848, "step": 2633 }, { "epoch": 1.7084481919896222, "grad_norm": 3.525017023086548, "learning_rate": 1.0390081548660235e-05, "loss": 1.4573, "step": 2634 }, { "epoch": 1.7090968055780769, "grad_norm": 3.7382800579071045, "learning_rate": 1.0344766999150935e-05, "loss": 1.5937, "step": 2635 }, { "epoch": 1.7097454191665316, "grad_norm": 3.763394355773926, "learning_rate": 1.0299546090982737e-05, "loss": 1.3845, "step": 2636 }, { "epoch": 1.7103940327549862, "grad_norm": 3.9812662601470947, "learning_rate": 1.0254418871387061e-05, "loss": 1.6569, "step": 2637 }, { "epoch": 1.711042646343441, "grad_norm": 3.87648344039917, "learning_rate": 1.0209385387497517e-05, "loss": 1.5705, "step": 2638 }, { "epoch": 1.7116912599318956, "grad_norm": 3.759092330932617, "learning_rate": 1.01644456863498e-05, "loss": 1.559, "step": 2639 }, { "epoch": 1.7123398735203503, "grad_norm": 4.964505672454834, "learning_rate": 1.0119599814881619e-05, "loss": 1.7194, "step": 2640 }, { "epoch": 1.712988487108805, "grad_norm": 3.9060440063476562, "learning_rate": 1.0074847819932754e-05, "loss": 1.7556, "step": 2641 }, { "epoch": 1.7136371006972597, "grad_norm": 3.8141658306121826, "learning_rate": 1.0030189748244856e-05, "loss": 1.6652, "step": 2642 }, { "epoch": 1.7142857142857144, "grad_norm": 3.8541340827941895, "learning_rate": 9.985625646461539e-06, "loss": 1.6501, "step": 2643 }, { "epoch": 1.714934327874169, "grad_norm": 3.9131546020507812, "learning_rate": 9.941155561128212e-06, "loss": 1.4757, "step": 2644 }, { "epoch": 1.7155829414626238, "grad_norm": 4.466082572937012, "learning_rate": 9.896779538692135e-06, "loss": 1.755, "step": 2645 }, { "epoch": 1.7162315550510783, "grad_norm": 3.972756862640381, "learning_rate": 9.852497625502311e-06, "loss": 1.7233, "step": 2646 }, { "epoch": 1.716880168639533, "grad_norm": 3.876030921936035, "learning_rate": 9.80830986780944e-06, "loss": 1.5529, "step": 2647 }, { "epoch": 1.7175287822279877, "grad_norm": 3.9666080474853516, "learning_rate": 9.764216311765905e-06, "loss": 1.7215, "step": 2648 }, { "epoch": 1.7181773958164424, "grad_norm": 4.445436954498291, "learning_rate": 9.720217003425647e-06, "loss": 1.5887, "step": 2649 }, { "epoch": 1.718826009404897, "grad_norm": 4.088064193725586, "learning_rate": 9.676311988744225e-06, "loss": 1.7734, "step": 2650 }, { "epoch": 1.7194746229933517, "grad_norm": 3.7458794116973877, "learning_rate": 9.632501313578667e-06, "loss": 1.6207, "step": 2651 }, { "epoch": 1.7201232365818064, "grad_norm": 3.3170173168182373, "learning_rate": 9.58878502368753e-06, "loss": 1.3867, "step": 2652 }, { "epoch": 1.720771850170261, "grad_norm": 4.409306526184082, "learning_rate": 9.545163164730687e-06, "loss": 1.6196, "step": 2653 }, { "epoch": 1.7214204637587156, "grad_norm": 3.950925827026367, "learning_rate": 9.50163578226948e-06, "loss": 1.6592, "step": 2654 }, { "epoch": 1.7220690773471703, "grad_norm": 3.964370012283325, "learning_rate": 9.458202921766546e-06, "loss": 1.2977, "step": 2655 }, { "epoch": 1.722717690935625, "grad_norm": 3.8704025745391846, "learning_rate": 9.41486462858575e-06, "loss": 1.3927, "step": 2656 }, { "epoch": 1.7233663045240797, "grad_norm": 3.4447779655456543, "learning_rate": 9.371620947992276e-06, "loss": 1.3024, "step": 2657 }, { "epoch": 1.7240149181125344, "grad_norm": 4.438530445098877, "learning_rate": 9.328471925152381e-06, "loss": 1.9476, "step": 2658 }, { "epoch": 1.724663531700989, "grad_norm": 3.542987585067749, "learning_rate": 9.285417605133562e-06, "loss": 1.3708, "step": 2659 }, { "epoch": 1.7253121452894438, "grad_norm": 4.158416271209717, "learning_rate": 9.242458032904311e-06, "loss": 1.4484, "step": 2660 }, { "epoch": 1.7259607588778985, "grad_norm": 3.7121734619140625, "learning_rate": 9.199593253334204e-06, "loss": 1.6229, "step": 2661 }, { "epoch": 1.7266093724663532, "grad_norm": 3.9140655994415283, "learning_rate": 9.156823311193818e-06, "loss": 1.622, "step": 2662 }, { "epoch": 1.7272579860548078, "grad_norm": 3.7158870697021484, "learning_rate": 9.114148251154675e-06, "loss": 1.4435, "step": 2663 }, { "epoch": 1.7279065996432625, "grad_norm": 3.5419411659240723, "learning_rate": 9.071568117789186e-06, "loss": 1.4125, "step": 2664 }, { "epoch": 1.7285552132317172, "grad_norm": 3.680100202560425, "learning_rate": 9.029082955570589e-06, "loss": 1.3984, "step": 2665 }, { "epoch": 1.729203826820172, "grad_norm": 3.583246946334839, "learning_rate": 8.986692808872976e-06, "loss": 1.4396, "step": 2666 }, { "epoch": 1.7298524404086266, "grad_norm": 3.144155263900757, "learning_rate": 8.944397721971154e-06, "loss": 1.223, "step": 2667 }, { "epoch": 1.7305010539970813, "grad_norm": 4.334980487823486, "learning_rate": 8.902197739040708e-06, "loss": 1.5693, "step": 2668 }, { "epoch": 1.731149667585536, "grad_norm": 3.9168457984924316, "learning_rate": 8.860092904157791e-06, "loss": 1.7046, "step": 2669 }, { "epoch": 1.7317982811739907, "grad_norm": 3.062551498413086, "learning_rate": 8.81808326129927e-06, "loss": 1.3234, "step": 2670 }, { "epoch": 1.7324468947624454, "grad_norm": 3.4754064083099365, "learning_rate": 8.77616885434258e-06, "loss": 1.7686, "step": 2671 }, { "epoch": 1.7330955083509, "grad_norm": 3.675621271133423, "learning_rate": 8.734349727065605e-06, "loss": 1.48, "step": 2672 }, { "epoch": 1.7337441219393548, "grad_norm": 3.7275962829589844, "learning_rate": 8.692625923146802e-06, "loss": 1.6251, "step": 2673 }, { "epoch": 1.7343927355278093, "grad_norm": 3.84548282623291, "learning_rate": 8.650997486165013e-06, "loss": 1.5206, "step": 2674 }, { "epoch": 1.735041349116264, "grad_norm": 4.1720123291015625, "learning_rate": 8.609464459599504e-06, "loss": 1.5657, "step": 2675 }, { "epoch": 1.7356899627047186, "grad_norm": 3.9853506088256836, "learning_rate": 8.568026886829883e-06, "loss": 1.4321, "step": 2676 }, { "epoch": 1.7363385762931733, "grad_norm": 4.783657550811768, "learning_rate": 8.526684811136054e-06, "loss": 1.8964, "step": 2677 }, { "epoch": 1.736987189881628, "grad_norm": 4.352350234985352, "learning_rate": 8.485438275698154e-06, "loss": 1.9714, "step": 2678 }, { "epoch": 1.7376358034700827, "grad_norm": 3.2816944122314453, "learning_rate": 8.444287323596578e-06, "loss": 1.1274, "step": 2679 }, { "epoch": 1.7382844170585374, "grad_norm": 3.661813735961914, "learning_rate": 8.403231997811867e-06, "loss": 1.5093, "step": 2680 }, { "epoch": 1.738933030646992, "grad_norm": 4.30817985534668, "learning_rate": 8.362272341224664e-06, "loss": 1.8728, "step": 2681 }, { "epoch": 1.7395816442354466, "grad_norm": 3.778003215789795, "learning_rate": 8.321408396615749e-06, "loss": 1.3492, "step": 2682 }, { "epoch": 1.7402302578239013, "grad_norm": 3.3843305110931396, "learning_rate": 8.280640206665835e-06, "loss": 1.5623, "step": 2683 }, { "epoch": 1.740878871412356, "grad_norm": 3.212092638015747, "learning_rate": 8.23996781395574e-06, "loss": 1.281, "step": 2684 }, { "epoch": 1.7415274850008107, "grad_norm": 3.670172929763794, "learning_rate": 8.199391260966126e-06, "loss": 1.5852, "step": 2685 }, { "epoch": 1.7421760985892654, "grad_norm": 4.704514026641846, "learning_rate": 8.158910590077606e-06, "loss": 1.629, "step": 2686 }, { "epoch": 1.74282471217772, "grad_norm": 4.549071788787842, "learning_rate": 8.118525843570668e-06, "loss": 1.4719, "step": 2687 }, { "epoch": 1.7434733257661748, "grad_norm": 4.572027683258057, "learning_rate": 8.078237063625538e-06, "loss": 1.8911, "step": 2688 }, { "epoch": 1.7441219393546294, "grad_norm": 3.688284397125244, "learning_rate": 8.038044292322266e-06, "loss": 1.4573, "step": 2689 }, { "epoch": 1.7447705529430841, "grad_norm": 3.3613474369049072, "learning_rate": 7.997947571640619e-06, "loss": 1.527, "step": 2690 }, { "epoch": 1.7454191665315388, "grad_norm": 3.538370132446289, "learning_rate": 7.957946943460048e-06, "loss": 1.4203, "step": 2691 }, { "epoch": 1.7460677801199935, "grad_norm": 4.3591837882995605, "learning_rate": 7.918042449559582e-06, "loss": 1.7317, "step": 2692 }, { "epoch": 1.7467163937084482, "grad_norm": 3.606415033340454, "learning_rate": 7.878234131617934e-06, "loss": 1.6418, "step": 2693 }, { "epoch": 1.747365007296903, "grad_norm": 3.8441548347473145, "learning_rate": 7.838522031213269e-06, "loss": 1.6053, "step": 2694 }, { "epoch": 1.7480136208853576, "grad_norm": 3.9034695625305176, "learning_rate": 7.79890618982333e-06, "loss": 1.6073, "step": 2695 }, { "epoch": 1.7486622344738123, "grad_norm": 3.7583513259887695, "learning_rate": 7.759386648825307e-06, "loss": 1.8827, "step": 2696 }, { "epoch": 1.749310848062267, "grad_norm": 3.4743611812591553, "learning_rate": 7.71996344949576e-06, "loss": 1.4905, "step": 2697 }, { "epoch": 1.7499594616507217, "grad_norm": 3.939840078353882, "learning_rate": 7.680636633010695e-06, "loss": 1.5308, "step": 2698 }, { "epoch": 1.7506080752391764, "grad_norm": 4.952948093414307, "learning_rate": 7.641406240445392e-06, "loss": 1.9657, "step": 2699 }, { "epoch": 1.751256688827631, "grad_norm": 3.4480223655700684, "learning_rate": 7.602272312774461e-06, "loss": 1.2855, "step": 2700 }, { "epoch": 1.751256688827631, "eval_loss": 1.8256381750106812, "eval_runtime": 35.1232, "eval_samples_per_second": 58.537, "eval_steps_per_second": 14.634, "step": 2700 }, { "epoch": 1.7519053024160858, "grad_norm": 4.24340295791626, "learning_rate": 7.563234890871718e-06, "loss": 1.6116, "step": 2701 }, { "epoch": 1.7525539160045402, "grad_norm": 3.4818055629730225, "learning_rate": 7.524294015510203e-06, "loss": 1.456, "step": 2702 }, { "epoch": 1.753202529592995, "grad_norm": 3.8919901847839355, "learning_rate": 7.485449727362159e-06, "loss": 1.5071, "step": 2703 }, { "epoch": 1.7538511431814496, "grad_norm": 5.132872104644775, "learning_rate": 7.446702066998845e-06, "loss": 2.0253, "step": 2704 }, { "epoch": 1.7544997567699043, "grad_norm": 4.504909992218018, "learning_rate": 7.408051074890721e-06, "loss": 1.6033, "step": 2705 }, { "epoch": 1.755148370358359, "grad_norm": 4.314545154571533, "learning_rate": 7.369496791407171e-06, "loss": 1.626, "step": 2706 }, { "epoch": 1.7557969839468137, "grad_norm": 3.724616765975952, "learning_rate": 7.331039256816663e-06, "loss": 1.6678, "step": 2707 }, { "epoch": 1.7564455975352684, "grad_norm": 3.686814785003662, "learning_rate": 7.292678511286522e-06, "loss": 1.3183, "step": 2708 }, { "epoch": 1.7570942111237229, "grad_norm": 3.6383635997772217, "learning_rate": 7.254414594883052e-06, "loss": 1.4893, "step": 2709 }, { "epoch": 1.7577428247121776, "grad_norm": 3.745144844055176, "learning_rate": 7.216247547571398e-06, "loss": 1.5152, "step": 2710 }, { "epoch": 1.7583914383006323, "grad_norm": 4.269280433654785, "learning_rate": 7.178177409215514e-06, "loss": 1.6344, "step": 2711 }, { "epoch": 1.759040051889087, "grad_norm": 4.295335292816162, "learning_rate": 7.140204219578184e-06, "loss": 1.6222, "step": 2712 }, { "epoch": 1.7596886654775417, "grad_norm": 3.6855738162994385, "learning_rate": 7.102328018320858e-06, "loss": 1.4766, "step": 2713 }, { "epoch": 1.7603372790659964, "grad_norm": 3.649822950363159, "learning_rate": 7.064548845003771e-06, "loss": 1.6598, "step": 2714 }, { "epoch": 1.760985892654451, "grad_norm": 3.867445945739746, "learning_rate": 7.026866739085747e-06, "loss": 1.9886, "step": 2715 }, { "epoch": 1.7616345062429057, "grad_norm": 2.9414334297180176, "learning_rate": 6.98928173992427e-06, "loss": 1.332, "step": 2716 }, { "epoch": 1.7622831198313604, "grad_norm": 3.0432345867156982, "learning_rate": 6.95179388677536e-06, "loss": 1.2541, "step": 2717 }, { "epoch": 1.7629317334198151, "grad_norm": 3.5335042476654053, "learning_rate": 6.914403218793608e-06, "loss": 1.5579, "step": 2718 }, { "epoch": 1.7635803470082698, "grad_norm": 3.1067028045654297, "learning_rate": 6.877109775032098e-06, "loss": 1.2953, "step": 2719 }, { "epoch": 1.7642289605967245, "grad_norm": 3.2547950744628906, "learning_rate": 6.839913594442338e-06, "loss": 1.604, "step": 2720 }, { "epoch": 1.7648775741851792, "grad_norm": 3.6031689643859863, "learning_rate": 6.802814715874295e-06, "loss": 1.4509, "step": 2721 }, { "epoch": 1.765526187773634, "grad_norm": 3.5760819911956787, "learning_rate": 6.76581317807623e-06, "loss": 1.4613, "step": 2722 }, { "epoch": 1.7661748013620886, "grad_norm": 3.749077320098877, "learning_rate": 6.7289090196948405e-06, "loss": 1.4942, "step": 2723 }, { "epoch": 1.7668234149505433, "grad_norm": 3.7442007064819336, "learning_rate": 6.692102279275014e-06, "loss": 1.4486, "step": 2724 }, { "epoch": 1.767472028538998, "grad_norm": 3.6555118560791016, "learning_rate": 6.655392995259957e-06, "loss": 1.6379, "step": 2725 }, { "epoch": 1.7681206421274527, "grad_norm": 5.535717487335205, "learning_rate": 6.6187812059910425e-06, "loss": 1.7141, "step": 2726 }, { "epoch": 1.7687692557159074, "grad_norm": 3.268592119216919, "learning_rate": 6.582266949707849e-06, "loss": 1.2133, "step": 2727 }, { "epoch": 1.769417869304362, "grad_norm": 5.176033973693848, "learning_rate": 6.5458502645480924e-06, "loss": 2.1423, "step": 2728 }, { "epoch": 1.7700664828928168, "grad_norm": 3.8671770095825195, "learning_rate": 6.509531188547513e-06, "loss": 1.3063, "step": 2729 }, { "epoch": 1.7707150964812712, "grad_norm": 3.598050355911255, "learning_rate": 6.473309759639989e-06, "loss": 1.3929, "step": 2730 }, { "epoch": 1.771363710069726, "grad_norm": 4.694198131561279, "learning_rate": 6.437186015657337e-06, "loss": 1.7056, "step": 2731 }, { "epoch": 1.7720123236581806, "grad_norm": 4.3623528480529785, "learning_rate": 6.401159994329409e-06, "loss": 1.7615, "step": 2732 }, { "epoch": 1.7726609372466353, "grad_norm": 3.756930351257324, "learning_rate": 6.365231733283905e-06, "loss": 1.6735, "step": 2733 }, { "epoch": 1.77330955083509, "grad_norm": 3.854763984680176, "learning_rate": 6.329401270046542e-06, "loss": 1.6676, "step": 2734 }, { "epoch": 1.7739581644235447, "grad_norm": 4.268932342529297, "learning_rate": 6.293668642040762e-06, "loss": 1.4795, "step": 2735 }, { "epoch": 1.7746067780119994, "grad_norm": 4.289312362670898, "learning_rate": 6.258033886587911e-06, "loss": 1.8378, "step": 2736 }, { "epoch": 1.7752553916004539, "grad_norm": 3.7266221046447754, "learning_rate": 6.222497040907083e-06, "loss": 1.4466, "step": 2737 }, { "epoch": 1.7759040051889086, "grad_norm": 3.735713481903076, "learning_rate": 6.187058142115077e-06, "loss": 1.5126, "step": 2738 }, { "epoch": 1.7765526187773633, "grad_norm": 3.4026243686676025, "learning_rate": 6.15171722722645e-06, "loss": 1.3415, "step": 2739 }, { "epoch": 1.777201232365818, "grad_norm": 4.45815896987915, "learning_rate": 6.116474333153366e-06, "loss": 1.7747, "step": 2740 }, { "epoch": 1.7778498459542726, "grad_norm": 4.493323802947998, "learning_rate": 6.081329496705667e-06, "loss": 1.5961, "step": 2741 }, { "epoch": 1.7784984595427273, "grad_norm": 4.176742076873779, "learning_rate": 6.046282754590693e-06, "loss": 1.5482, "step": 2742 }, { "epoch": 1.779147073131182, "grad_norm": 4.874648571014404, "learning_rate": 6.011334143413405e-06, "loss": 1.786, "step": 2743 }, { "epoch": 1.7797956867196367, "grad_norm": 3.82142972946167, "learning_rate": 5.976483699676261e-06, "loss": 1.5514, "step": 2744 }, { "epoch": 1.7804443003080914, "grad_norm": 3.8767080307006836, "learning_rate": 5.9417314597791315e-06, "loss": 1.4555, "step": 2745 }, { "epoch": 1.7810929138965461, "grad_norm": 4.044439792633057, "learning_rate": 5.907077460019394e-06, "loss": 1.6864, "step": 2746 }, { "epoch": 1.7817415274850008, "grad_norm": 4.042080402374268, "learning_rate": 5.87252173659173e-06, "loss": 1.6896, "step": 2747 }, { "epoch": 1.7823901410734555, "grad_norm": 3.7942934036254883, "learning_rate": 5.838064325588288e-06, "loss": 1.6385, "step": 2748 }, { "epoch": 1.7830387546619102, "grad_norm": 3.6866953372955322, "learning_rate": 5.803705262998415e-06, "loss": 1.3622, "step": 2749 }, { "epoch": 1.783687368250365, "grad_norm": 4.242663860321045, "learning_rate": 5.769444584708828e-06, "loss": 1.5446, "step": 2750 }, { "epoch": 1.7843359818388196, "grad_norm": 4.238768100738525, "learning_rate": 5.735282326503422e-06, "loss": 1.7951, "step": 2751 }, { "epoch": 1.7849845954272743, "grad_norm": 3.4801833629608154, "learning_rate": 5.701218524063334e-06, "loss": 1.4071, "step": 2752 }, { "epoch": 1.785633209015729, "grad_norm": 3.2863376140594482, "learning_rate": 5.667253212966872e-06, "loss": 1.3079, "step": 2753 }, { "epoch": 1.7862818226041837, "grad_norm": 3.7522573471069336, "learning_rate": 5.633386428689435e-06, "loss": 1.4624, "step": 2754 }, { "epoch": 1.7869304361926384, "grad_norm": 4.37391471862793, "learning_rate": 5.5996182066035625e-06, "loss": 1.8311, "step": 2755 }, { "epoch": 1.787579049781093, "grad_norm": 3.8527486324310303, "learning_rate": 5.565948581978786e-06, "loss": 1.7147, "step": 2756 }, { "epoch": 1.7882276633695477, "grad_norm": 3.761984348297119, "learning_rate": 5.532377589981741e-06, "loss": 1.5419, "step": 2757 }, { "epoch": 1.7888762769580022, "grad_norm": 4.236515998840332, "learning_rate": 5.498905265675958e-06, "loss": 1.4862, "step": 2758 }, { "epoch": 1.789524890546457, "grad_norm": 3.767507314682007, "learning_rate": 5.465531644021982e-06, "loss": 1.4897, "step": 2759 }, { "epoch": 1.7901735041349116, "grad_norm": 4.007956504821777, "learning_rate": 5.4322567598772415e-06, "loss": 1.5518, "step": 2760 }, { "epoch": 1.7908221177233663, "grad_norm": 4.644977569580078, "learning_rate": 5.399080647996002e-06, "loss": 1.7724, "step": 2761 }, { "epoch": 1.791470731311821, "grad_norm": 4.557041168212891, "learning_rate": 5.366003343029446e-06, "loss": 1.715, "step": 2762 }, { "epoch": 1.7921193449002757, "grad_norm": 4.906123161315918, "learning_rate": 5.333024879525472e-06, "loss": 1.7729, "step": 2763 }, { "epoch": 1.7927679584887304, "grad_norm": 3.682722568511963, "learning_rate": 5.300145291928815e-06, "loss": 1.5367, "step": 2764 }, { "epoch": 1.7934165720771849, "grad_norm": 5.035078048706055, "learning_rate": 5.267364614580861e-06, "loss": 1.7584, "step": 2765 }, { "epoch": 1.7940651856656396, "grad_norm": 3.285907506942749, "learning_rate": 5.2346828817197655e-06, "loss": 1.5103, "step": 2766 }, { "epoch": 1.7947137992540942, "grad_norm": 3.5945491790771484, "learning_rate": 5.202100127480269e-06, "loss": 1.4406, "step": 2767 }, { "epoch": 1.795362412842549, "grad_norm": 3.415757656097412, "learning_rate": 5.169616385893794e-06, "loss": 1.5342, "step": 2768 }, { "epoch": 1.7960110264310036, "grad_norm": 3.580531597137451, "learning_rate": 5.1372316908883225e-06, "loss": 1.5808, "step": 2769 }, { "epoch": 1.7966596400194583, "grad_norm": 4.725218772888184, "learning_rate": 5.104946076288375e-06, "loss": 1.8804, "step": 2770 }, { "epoch": 1.797308253607913, "grad_norm": 3.5706560611724854, "learning_rate": 5.072759575815011e-06, "loss": 1.4567, "step": 2771 }, { "epoch": 1.7979568671963677, "grad_norm": 4.379705905914307, "learning_rate": 5.0406722230857295e-06, "loss": 1.8243, "step": 2772 }, { "epoch": 1.7986054807848224, "grad_norm": 3.6834893226623535, "learning_rate": 5.008684051614543e-06, "loss": 1.439, "step": 2773 }, { "epoch": 1.799254094373277, "grad_norm": 3.667409658432007, "learning_rate": 4.976795094811782e-06, "loss": 1.4583, "step": 2774 }, { "epoch": 1.7999027079617318, "grad_norm": 3.3831920623779297, "learning_rate": 4.945005385984214e-06, "loss": 1.4414, "step": 2775 }, { "epoch": 1.8005513215501865, "grad_norm": 4.3380889892578125, "learning_rate": 4.913314958334958e-06, "loss": 1.7807, "step": 2776 }, { "epoch": 1.8011999351386412, "grad_norm": 4.456954479217529, "learning_rate": 4.881723844963382e-06, "loss": 1.5376, "step": 2777 }, { "epoch": 1.8018485487270959, "grad_norm": 3.7724268436431885, "learning_rate": 4.850232078865169e-06, "loss": 1.2568, "step": 2778 }, { "epoch": 1.8024971623155506, "grad_norm": 3.955535650253296, "learning_rate": 4.818839692932209e-06, "loss": 1.688, "step": 2779 }, { "epoch": 1.8031457759040053, "grad_norm": 4.5658488273620605, "learning_rate": 4.787546719952629e-06, "loss": 1.7955, "step": 2780 }, { "epoch": 1.80379438949246, "grad_norm": 3.6783533096313477, "learning_rate": 4.756353192610674e-06, "loss": 1.4188, "step": 2781 }, { "epoch": 1.8044430030809147, "grad_norm": 4.207545757293701, "learning_rate": 4.725259143486771e-06, "loss": 1.6307, "step": 2782 }, { "epoch": 1.8050916166693693, "grad_norm": 4.381121635437012, "learning_rate": 4.694264605057397e-06, "loss": 1.6546, "step": 2783 }, { "epoch": 1.805740230257824, "grad_norm": 3.977132797241211, "learning_rate": 4.663369609695123e-06, "loss": 1.5373, "step": 2784 }, { "epoch": 1.8063888438462787, "grad_norm": 4.055887699127197, "learning_rate": 4.6325741896685815e-06, "loss": 1.5111, "step": 2785 }, { "epoch": 1.8070374574347332, "grad_norm": 4.133464336395264, "learning_rate": 4.601878377142333e-06, "loss": 1.5271, "step": 2786 }, { "epoch": 1.807686071023188, "grad_norm": 3.9013161659240723, "learning_rate": 4.571282204176974e-06, "loss": 1.5208, "step": 2787 }, { "epoch": 1.8083346846116426, "grad_norm": 4.018560886383057, "learning_rate": 4.5407857027289555e-06, "loss": 1.539, "step": 2788 }, { "epoch": 1.8089832982000973, "grad_norm": 4.303539752960205, "learning_rate": 4.510388904650698e-06, "loss": 1.8491, "step": 2789 }, { "epoch": 1.809631911788552, "grad_norm": 3.6235921382904053, "learning_rate": 4.480091841690404e-06, "loss": 1.3633, "step": 2790 }, { "epoch": 1.8102805253770067, "grad_norm": 3.4056453704833984, "learning_rate": 4.449894545492228e-06, "loss": 1.3822, "step": 2791 }, { "epoch": 1.8109291389654614, "grad_norm": 3.6715948581695557, "learning_rate": 4.419797047595997e-06, "loss": 1.6809, "step": 2792 }, { "epoch": 1.8115777525539158, "grad_norm": 3.448446035385132, "learning_rate": 4.389799379437387e-06, "loss": 1.3331, "step": 2793 }, { "epoch": 1.8122263661423705, "grad_norm": 4.257687568664551, "learning_rate": 4.359901572347758e-06, "loss": 1.6666, "step": 2794 }, { "epoch": 1.8128749797308252, "grad_norm": 3.827152729034424, "learning_rate": 4.330103657554185e-06, "loss": 1.4224, "step": 2795 }, { "epoch": 1.81352359331928, "grad_norm": 3.659827709197998, "learning_rate": 4.30040566617943e-06, "loss": 1.444, "step": 2796 }, { "epoch": 1.8141722069077346, "grad_norm": 3.2429237365722656, "learning_rate": 4.270807629241835e-06, "loss": 1.3824, "step": 2797 }, { "epoch": 1.8148208204961893, "grad_norm": 3.651597023010254, "learning_rate": 4.241309577655406e-06, "loss": 1.3814, "step": 2798 }, { "epoch": 1.815469434084644, "grad_norm": 3.6668853759765625, "learning_rate": 4.211911542229674e-06, "loss": 1.5988, "step": 2799 }, { "epoch": 1.8161180476730987, "grad_norm": 4.040700912475586, "learning_rate": 4.1826135536697235e-06, "loss": 1.4709, "step": 2800 }, { "epoch": 1.8161180476730987, "eval_loss": 1.8161381483078003, "eval_runtime": 35.1406, "eval_samples_per_second": 58.508, "eval_steps_per_second": 14.627, "step": 2800 }, { "epoch": 1.8167666612615534, "grad_norm": 4.216310977935791, "learning_rate": 4.153415642576164e-06, "loss": 1.6415, "step": 2801 }, { "epoch": 1.817415274850008, "grad_norm": 3.6424665451049805, "learning_rate": 4.124317839445024e-06, "loss": 1.4601, "step": 2802 }, { "epoch": 1.8180638884384628, "grad_norm": 4.383656024932861, "learning_rate": 4.095320174667849e-06, "loss": 1.7885, "step": 2803 }, { "epoch": 1.8187125020269175, "grad_norm": 4.429874897003174, "learning_rate": 4.0664226785314895e-06, "loss": 1.6286, "step": 2804 }, { "epoch": 1.8193611156153722, "grad_norm": 4.311161041259766, "learning_rate": 4.037625381218313e-06, "loss": 1.5789, "step": 2805 }, { "epoch": 1.8200097292038269, "grad_norm": 4.205760955810547, "learning_rate": 4.0089283128059045e-06, "loss": 1.747, "step": 2806 }, { "epoch": 1.8206583427922816, "grad_norm": 4.295394420623779, "learning_rate": 3.9803315032672315e-06, "loss": 1.6269, "step": 2807 }, { "epoch": 1.8213069563807363, "grad_norm": 3.9077181816101074, "learning_rate": 3.951834982470526e-06, "loss": 1.5982, "step": 2808 }, { "epoch": 1.821955569969191, "grad_norm": 3.680891990661621, "learning_rate": 3.923438780179267e-06, "loss": 1.5134, "step": 2809 }, { "epoch": 1.8226041835576456, "grad_norm": 4.415660381317139, "learning_rate": 3.895142926052187e-06, "loss": 1.6817, "step": 2810 }, { "epoch": 1.8232527971461003, "grad_norm": 3.825579881668091, "learning_rate": 3.8669474496431655e-06, "loss": 1.5465, "step": 2811 }, { "epoch": 1.823901410734555, "grad_norm": 3.4160256385803223, "learning_rate": 3.838852380401281e-06, "loss": 1.366, "step": 2812 }, { "epoch": 1.8245500243230097, "grad_norm": 3.08737850189209, "learning_rate": 3.810857747670682e-06, "loss": 1.2301, "step": 2813 }, { "epoch": 1.8251986379114642, "grad_norm": 3.7413735389709473, "learning_rate": 3.7829635806907016e-06, "loss": 1.7276, "step": 2814 }, { "epoch": 1.8258472514999189, "grad_norm": 3.889486312866211, "learning_rate": 3.755169908595657e-06, "loss": 1.5712, "step": 2815 }, { "epoch": 1.8264958650883736, "grad_norm": 4.351959228515625, "learning_rate": 3.7274767604149494e-06, "loss": 1.5691, "step": 2816 }, { "epoch": 1.8271444786768283, "grad_norm": 4.314332485198975, "learning_rate": 3.6998841650729977e-06, "loss": 1.6118, "step": 2817 }, { "epoch": 1.827793092265283, "grad_norm": 3.6060192584991455, "learning_rate": 3.672392151389137e-06, "loss": 1.6721, "step": 2818 }, { "epoch": 1.8284417058537377, "grad_norm": 4.074169635772705, "learning_rate": 3.6450007480777093e-06, "loss": 1.4582, "step": 2819 }, { "epoch": 1.8290903194421924, "grad_norm": 5.092626094818115, "learning_rate": 3.617709983747941e-06, "loss": 2.2274, "step": 2820 }, { "epoch": 1.8297389330306468, "grad_norm": 3.8781027793884277, "learning_rate": 3.5905198869039757e-06, "loss": 1.3795, "step": 2821 }, { "epoch": 1.8303875466191015, "grad_norm": 3.2604570388793945, "learning_rate": 3.563430485944763e-06, "loss": 1.4297, "step": 2822 }, { "epoch": 1.8310361602075562, "grad_norm": 3.640106439590454, "learning_rate": 3.5364418091641373e-06, "loss": 1.4627, "step": 2823 }, { "epoch": 1.831684773796011, "grad_norm": 3.7133285999298096, "learning_rate": 3.5095538847506828e-06, "loss": 1.6186, "step": 2824 }, { "epoch": 1.8323333873844656, "grad_norm": 3.856898069381714, "learning_rate": 3.4827667407877796e-06, "loss": 1.3512, "step": 2825 }, { "epoch": 1.8329820009729203, "grad_norm": 3.420078754425049, "learning_rate": 3.4560804052535477e-06, "loss": 1.485, "step": 2826 }, { "epoch": 1.833630614561375, "grad_norm": 3.54903507232666, "learning_rate": 3.4294949060207916e-06, "loss": 1.4382, "step": 2827 }, { "epoch": 1.8342792281498297, "grad_norm": 3.9077694416046143, "learning_rate": 3.4030102708570212e-06, "loss": 1.5874, "step": 2828 }, { "epoch": 1.8349278417382844, "grad_norm": 3.691904306411743, "learning_rate": 3.376626527424387e-06, "loss": 1.3772, "step": 2829 }, { "epoch": 1.835576455326739, "grad_norm": 4.199033737182617, "learning_rate": 3.350343703279679e-06, "loss": 1.5458, "step": 2830 }, { "epoch": 1.8362250689151938, "grad_norm": 4.3504319190979, "learning_rate": 3.324161825874228e-06, "loss": 1.8798, "step": 2831 }, { "epoch": 1.8368736825036485, "grad_norm": 3.5428903102874756, "learning_rate": 3.2980809225540034e-06, "loss": 1.3349, "step": 2832 }, { "epoch": 1.8375222960921032, "grad_norm": 3.934295892715454, "learning_rate": 3.2721010205594706e-06, "loss": 1.6728, "step": 2833 }, { "epoch": 1.8381709096805579, "grad_norm": 3.5205702781677246, "learning_rate": 3.2462221470256015e-06, "loss": 1.6957, "step": 2834 }, { "epoch": 1.8388195232690125, "grad_norm": 4.042935848236084, "learning_rate": 3.220444328981864e-06, "loss": 1.563, "step": 2835 }, { "epoch": 1.8394681368574672, "grad_norm": 4.235220909118652, "learning_rate": 3.1947675933521548e-06, "loss": 1.7691, "step": 2836 }, { "epoch": 1.840116750445922, "grad_norm": 3.752261161804199, "learning_rate": 3.169191966954821e-06, "loss": 1.2874, "step": 2837 }, { "epoch": 1.8407653640343766, "grad_norm": 3.849437952041626, "learning_rate": 3.143717476502572e-06, "loss": 1.6385, "step": 2838 }, { "epoch": 1.8414139776228313, "grad_norm": 3.9681670665740967, "learning_rate": 3.118344148602537e-06, "loss": 1.7744, "step": 2839 }, { "epoch": 1.842062591211286, "grad_norm": 4.0448760986328125, "learning_rate": 3.093072009756115e-06, "loss": 1.6712, "step": 2840 }, { "epoch": 1.8427112047997407, "grad_norm": 3.8869118690490723, "learning_rate": 3.0679010863590816e-06, "loss": 1.5176, "step": 2841 }, { "epoch": 1.8433598183881952, "grad_norm": 4.638172149658203, "learning_rate": 3.0428314047014626e-06, "loss": 1.8094, "step": 2842 }, { "epoch": 1.8440084319766499, "grad_norm": 3.2408907413482666, "learning_rate": 3.017862990967546e-06, "loss": 1.4775, "step": 2843 }, { "epoch": 1.8446570455651046, "grad_norm": 3.930145263671875, "learning_rate": 2.9929958712358486e-06, "loss": 1.6223, "step": 2844 }, { "epoch": 1.8453056591535593, "grad_norm": 3.928050994873047, "learning_rate": 2.9682300714790947e-06, "loss": 1.455, "step": 2845 }, { "epoch": 1.845954272742014, "grad_norm": 4.214625358581543, "learning_rate": 2.9435656175641923e-06, "loss": 1.4263, "step": 2846 }, { "epoch": 1.8466028863304687, "grad_norm": 3.271622896194458, "learning_rate": 2.919002535252147e-06, "loss": 1.2564, "step": 2847 }, { "epoch": 1.8472514999189233, "grad_norm": 2.682239294052124, "learning_rate": 2.8945408501981906e-06, "loss": 1.436, "step": 2848 }, { "epoch": 1.8479001135073778, "grad_norm": 3.028463840484619, "learning_rate": 2.870180587951521e-06, "loss": 1.3224, "step": 2849 }, { "epoch": 1.8485487270958325, "grad_norm": 3.6213815212249756, "learning_rate": 2.8459217739555068e-06, "loss": 1.5788, "step": 2850 }, { "epoch": 1.8491973406842872, "grad_norm": 3.2942731380462646, "learning_rate": 2.8217644335475245e-06, "loss": 1.4117, "step": 2851 }, { "epoch": 1.849845954272742, "grad_norm": 3.1236562728881836, "learning_rate": 2.7977085919589254e-06, "loss": 1.3038, "step": 2852 }, { "epoch": 1.8504945678611966, "grad_norm": 3.2051961421966553, "learning_rate": 2.77375427431511e-06, "loss": 1.491, "step": 2853 }, { "epoch": 1.8511431814496513, "grad_norm": 3.3913021087646484, "learning_rate": 2.749901505635388e-06, "loss": 1.2916, "step": 2854 }, { "epoch": 1.851791795038106, "grad_norm": 4.085671424865723, "learning_rate": 2.7261503108330753e-06, "loss": 1.6022, "step": 2855 }, { "epoch": 1.8524404086265607, "grad_norm": 5.2289228439331055, "learning_rate": 2.702500714715317e-06, "loss": 1.5289, "step": 2856 }, { "epoch": 1.8530890222150154, "grad_norm": 3.617866039276123, "learning_rate": 2.6789527419831872e-06, "loss": 1.4287, "step": 2857 }, { "epoch": 1.85373763580347, "grad_norm": 4.388442516326904, "learning_rate": 2.6555064172316234e-06, "loss": 1.7237, "step": 2858 }, { "epoch": 1.8543862493919248, "grad_norm": 4.3117146492004395, "learning_rate": 2.63216176494937e-06, "loss": 1.6899, "step": 2859 }, { "epoch": 1.8550348629803795, "grad_norm": 4.349372386932373, "learning_rate": 2.608918809519001e-06, "loss": 1.4575, "step": 2860 }, { "epoch": 1.8556834765688341, "grad_norm": 4.036871910095215, "learning_rate": 2.5857775752168522e-06, "loss": 1.5671, "step": 2861 }, { "epoch": 1.8563320901572888, "grad_norm": 4.291961669921875, "learning_rate": 2.5627380862130457e-06, "loss": 1.6764, "step": 2862 }, { "epoch": 1.8569807037457435, "grad_norm": 3.9272289276123047, "learning_rate": 2.5398003665713877e-06, "loss": 1.4866, "step": 2863 }, { "epoch": 1.8576293173341982, "grad_norm": 3.36185359954834, "learning_rate": 2.5169644402494584e-06, "loss": 1.404, "step": 2864 }, { "epoch": 1.858277930922653, "grad_norm": 3.601714849472046, "learning_rate": 2.4942303310984348e-06, "loss": 1.3809, "step": 2865 }, { "epoch": 1.8589265445111076, "grad_norm": 4.535726547241211, "learning_rate": 2.471598062863223e-06, "loss": 1.6576, "step": 2866 }, { "epoch": 1.8595751580995623, "grad_norm": 4.563507556915283, "learning_rate": 2.4490676591823248e-06, "loss": 1.8607, "step": 2867 }, { "epoch": 1.860223771688017, "grad_norm": 4.632997989654541, "learning_rate": 2.4266391435878387e-06, "loss": 1.5451, "step": 2868 }, { "epoch": 1.8608723852764717, "grad_norm": 3.6605536937713623, "learning_rate": 2.4043125395054934e-06, "loss": 1.5181, "step": 2869 }, { "epoch": 1.8615209988649262, "grad_norm": 4.37591028213501, "learning_rate": 2.3820878702545125e-06, "loss": 1.5279, "step": 2870 }, { "epoch": 1.8621696124533809, "grad_norm": 3.5915894508361816, "learning_rate": 2.3599651590476945e-06, "loss": 1.5668, "step": 2871 }, { "epoch": 1.8628182260418356, "grad_norm": 3.708045721054077, "learning_rate": 2.3379444289913342e-06, "loss": 1.4916, "step": 2872 }, { "epoch": 1.8634668396302903, "grad_norm": 3.2057130336761475, "learning_rate": 2.3160257030852116e-06, "loss": 1.4906, "step": 2873 }, { "epoch": 1.864115453218745, "grad_norm": 4.309330940246582, "learning_rate": 2.2942090042225804e-06, "loss": 1.7608, "step": 2874 }, { "epoch": 1.8647640668071996, "grad_norm": 3.901855230331421, "learning_rate": 2.2724943551901024e-06, "loss": 1.5248, "step": 2875 }, { "epoch": 1.8654126803956543, "grad_norm": 3.6462326049804688, "learning_rate": 2.250881778667868e-06, "loss": 1.4709, "step": 2876 }, { "epoch": 1.8660612939841088, "grad_norm": 3.9734044075012207, "learning_rate": 2.2293712972293657e-06, "loss": 1.3347, "step": 2877 }, { "epoch": 1.8667099075725635, "grad_norm": 3.6062216758728027, "learning_rate": 2.2079629333414453e-06, "loss": 1.5284, "step": 2878 }, { "epoch": 1.8673585211610182, "grad_norm": 3.1091322898864746, "learning_rate": 2.1866567093642874e-06, "loss": 0.9221, "step": 2879 }, { "epoch": 1.8680071347494729, "grad_norm": 3.7485857009887695, "learning_rate": 2.1654526475514135e-06, "loss": 1.7938, "step": 2880 }, { "epoch": 1.8686557483379276, "grad_norm": 3.604536294937134, "learning_rate": 2.144350770049597e-06, "loss": 1.4852, "step": 2881 }, { "epoch": 1.8693043619263823, "grad_norm": 3.637744903564453, "learning_rate": 2.12335109889894e-06, "loss": 1.7894, "step": 2882 }, { "epoch": 1.869952975514837, "grad_norm": 3.121673107147217, "learning_rate": 2.1024536560327656e-06, "loss": 1.1893, "step": 2883 }, { "epoch": 1.8706015891032917, "grad_norm": 3.4892287254333496, "learning_rate": 2.081658463277614e-06, "loss": 1.4814, "step": 2884 }, { "epoch": 1.8712502026917464, "grad_norm": 4.109448432922363, "learning_rate": 2.0609655423532436e-06, "loss": 1.6677, "step": 2885 }, { "epoch": 1.871898816280201, "grad_norm": 3.8102729320526123, "learning_rate": 2.0403749148725895e-06, "loss": 1.742, "step": 2886 }, { "epoch": 1.8725474298686557, "grad_norm": 3.8127777576446533, "learning_rate": 2.0198866023417585e-06, "loss": 1.7789, "step": 2887 }, { "epoch": 1.8731960434571104, "grad_norm": 3.2487311363220215, "learning_rate": 1.999500626159967e-06, "loss": 1.2434, "step": 2888 }, { "epoch": 1.8738446570455651, "grad_norm": 4.213447570800781, "learning_rate": 1.9792170076195716e-06, "loss": 1.6229, "step": 2889 }, { "epoch": 1.8744932706340198, "grad_norm": 4.874112129211426, "learning_rate": 1.9590357679060034e-06, "loss": 1.6795, "step": 2890 }, { "epoch": 1.8751418842224745, "grad_norm": 4.201356887817383, "learning_rate": 1.938956928097757e-06, "loss": 1.5263, "step": 2891 }, { "epoch": 1.8757904978109292, "grad_norm": 4.355547904968262, "learning_rate": 1.9189805091664124e-06, "loss": 1.561, "step": 2892 }, { "epoch": 1.876439111399384, "grad_norm": 3.5807785987854004, "learning_rate": 1.8991065319765244e-06, "loss": 1.4988, "step": 2893 }, { "epoch": 1.8770877249878386, "grad_norm": 4.124091148376465, "learning_rate": 1.8793350172856994e-06, "loss": 1.6951, "step": 2894 }, { "epoch": 1.8777363385762933, "grad_norm": 3.8594167232513428, "learning_rate": 1.8596659857444743e-06, "loss": 1.8178, "step": 2895 }, { "epoch": 1.878384952164748, "grad_norm": 3.54828143119812, "learning_rate": 1.8400994578963826e-06, "loss": 1.4517, "step": 2896 }, { "epoch": 1.8790335657532027, "grad_norm": 3.5863959789276123, "learning_rate": 1.8206354541778992e-06, "loss": 1.5309, "step": 2897 }, { "epoch": 1.8796821793416572, "grad_norm": 3.7424983978271484, "learning_rate": 1.8012739949183844e-06, "loss": 1.5264, "step": 2898 }, { "epoch": 1.8803307929301118, "grad_norm": 3.724041700363159, "learning_rate": 1.7820151003401508e-06, "loss": 1.3492, "step": 2899 }, { "epoch": 1.8809794065185665, "grad_norm": 4.089436054229736, "learning_rate": 1.7628587905583083e-06, "loss": 1.7402, "step": 2900 }, { "epoch": 1.8809794065185665, "eval_loss": 1.8115514516830444, "eval_runtime": 35.1035, "eval_samples_per_second": 58.57, "eval_steps_per_second": 14.642, "step": 2900 }, { "epoch": 1.8816280201070212, "grad_norm": 3.746548891067505, "learning_rate": 1.7438050855808963e-06, "loss": 1.6207, "step": 2901 }, { "epoch": 1.882276633695476, "grad_norm": 4.537253379821777, "learning_rate": 1.7248540053087402e-06, "loss": 1.7536, "step": 2902 }, { "epoch": 1.8829252472839306, "grad_norm": 3.8964309692382812, "learning_rate": 1.706005569535496e-06, "loss": 1.704, "step": 2903 }, { "epoch": 1.8835738608723853, "grad_norm": 4.050978660583496, "learning_rate": 1.6872597979476047e-06, "loss": 1.6277, "step": 2904 }, { "epoch": 1.8842224744608398, "grad_norm": 3.7590744495391846, "learning_rate": 1.6686167101242932e-06, "loss": 1.5309, "step": 2905 }, { "epoch": 1.8848710880492945, "grad_norm": 4.313365936279297, "learning_rate": 1.6500763255375196e-06, "loss": 1.7797, "step": 2906 }, { "epoch": 1.8855197016377492, "grad_norm": 5.272026062011719, "learning_rate": 1.631638663551982e-06, "loss": 1.6233, "step": 2907 }, { "epoch": 1.8861683152262039, "grad_norm": 4.108143329620361, "learning_rate": 1.6133037434250985e-06, "loss": 1.6755, "step": 2908 }, { "epoch": 1.8868169288146586, "grad_norm": 4.803829193115234, "learning_rate": 1.5950715843069508e-06, "loss": 1.7261, "step": 2909 }, { "epoch": 1.8874655424031133, "grad_norm": 4.1437225341796875, "learning_rate": 1.576942205240317e-06, "loss": 1.6364, "step": 2910 }, { "epoch": 1.888114155991568, "grad_norm": 4.040375709533691, "learning_rate": 1.5589156251606174e-06, "loss": 1.5142, "step": 2911 }, { "epoch": 1.8887627695800226, "grad_norm": 3.6698975563049316, "learning_rate": 1.540991862895902e-06, "loss": 1.4443, "step": 2912 }, { "epoch": 1.8894113831684773, "grad_norm": 4.066771984100342, "learning_rate": 1.5231709371668179e-06, "loss": 1.4185, "step": 2913 }, { "epoch": 1.890059996756932, "grad_norm": 3.654670476913452, "learning_rate": 1.5054528665866319e-06, "loss": 1.4717, "step": 2914 }, { "epoch": 1.8907086103453867, "grad_norm": 4.294985294342041, "learning_rate": 1.487837669661163e-06, "loss": 1.8005, "step": 2915 }, { "epoch": 1.8913572239338414, "grad_norm": 3.5719339847564697, "learning_rate": 1.4703253647887827e-06, "loss": 1.4356, "step": 2916 }, { "epoch": 1.8920058375222961, "grad_norm": 3.8719818592071533, "learning_rate": 1.4529159702604044e-06, "loss": 1.6041, "step": 2917 }, { "epoch": 1.8926544511107508, "grad_norm": 4.003454208374023, "learning_rate": 1.4356095042594386e-06, "loss": 1.645, "step": 2918 }, { "epoch": 1.8933030646992055, "grad_norm": 4.036367416381836, "learning_rate": 1.4184059848618147e-06, "loss": 1.4652, "step": 2919 }, { "epoch": 1.8939516782876602, "grad_norm": 3.894151210784912, "learning_rate": 1.4013054300359373e-06, "loss": 1.5748, "step": 2920 }, { "epoch": 1.894600291876115, "grad_norm": 4.419712543487549, "learning_rate": 1.3843078576426416e-06, "loss": 1.8465, "step": 2921 }, { "epoch": 1.8952489054645696, "grad_norm": 3.418888807296753, "learning_rate": 1.3674132854352373e-06, "loss": 1.6784, "step": 2922 }, { "epoch": 1.8958975190530243, "grad_norm": 3.127779722213745, "learning_rate": 1.3506217310594094e-06, "loss": 1.2829, "step": 2923 }, { "epoch": 1.896546132641479, "grad_norm": 3.896705150604248, "learning_rate": 1.3339332120532956e-06, "loss": 1.6556, "step": 2924 }, { "epoch": 1.8971947462299337, "grad_norm": 3.5791563987731934, "learning_rate": 1.317347745847386e-06, "loss": 1.3423, "step": 2925 }, { "epoch": 1.8978433598183881, "grad_norm": 3.264252185821533, "learning_rate": 1.3008653497645462e-06, "loss": 1.2332, "step": 2926 }, { "epoch": 1.8984919734068428, "grad_norm": 3.5697648525238037, "learning_rate": 1.2844860410199722e-06, "loss": 1.552, "step": 2927 }, { "epoch": 1.8991405869952975, "grad_norm": 5.202948570251465, "learning_rate": 1.2682098367212237e-06, "loss": 1.6393, "step": 2928 }, { "epoch": 1.8997892005837522, "grad_norm": 4.674209117889404, "learning_rate": 1.2520367538681243e-06, "loss": 2.1186, "step": 2929 }, { "epoch": 1.900437814172207, "grad_norm": 3.7318437099456787, "learning_rate": 1.235966809352851e-06, "loss": 1.6782, "step": 2930 }, { "epoch": 1.9010864277606616, "grad_norm": 3.304704427719116, "learning_rate": 1.2200000199598104e-06, "loss": 1.3808, "step": 2931 }, { "epoch": 1.9017350413491163, "grad_norm": 4.361791133880615, "learning_rate": 1.2041364023656742e-06, "loss": 1.6709, "step": 2932 }, { "epoch": 1.9023836549375708, "grad_norm": 3.821343183517456, "learning_rate": 1.1883759731393663e-06, "loss": 1.874, "step": 2933 }, { "epoch": 1.9030322685260255, "grad_norm": 4.168802261352539, "learning_rate": 1.17271874874203e-06, "loss": 1.6697, "step": 2934 }, { "epoch": 1.9036808821144802, "grad_norm": 3.6853387355804443, "learning_rate": 1.1571647455270396e-06, "loss": 1.5875, "step": 2935 }, { "epoch": 1.9043294957029349, "grad_norm": 4.00960636138916, "learning_rate": 1.1417139797399002e-06, "loss": 1.44, "step": 2936 }, { "epoch": 1.9049781092913896, "grad_norm": 4.284857273101807, "learning_rate": 1.1263664675183583e-06, "loss": 1.6697, "step": 2937 }, { "epoch": 1.9056267228798442, "grad_norm": 3.5892515182495117, "learning_rate": 1.1111222248922471e-06, "loss": 1.6763, "step": 2938 }, { "epoch": 1.906275336468299, "grad_norm": 3.6422317028045654, "learning_rate": 1.0959812677835968e-06, "loss": 1.4571, "step": 2939 }, { "epoch": 1.9069239500567536, "grad_norm": 3.755175828933716, "learning_rate": 1.0809436120065464e-06, "loss": 1.5506, "step": 2940 }, { "epoch": 1.9075725636452083, "grad_norm": 4.224865436553955, "learning_rate": 1.06600927326731e-06, "loss": 1.6119, "step": 2941 }, { "epoch": 1.908221177233663, "grad_norm": 4.608331680297852, "learning_rate": 1.051178267164221e-06, "loss": 1.6249, "step": 2942 }, { "epoch": 1.9088697908221177, "grad_norm": 4.360371112823486, "learning_rate": 1.0364506091876892e-06, "loss": 1.376, "step": 2943 }, { "epoch": 1.9095184044105724, "grad_norm": 3.842301607131958, "learning_rate": 1.0218263147201534e-06, "loss": 1.5608, "step": 2944 }, { "epoch": 1.910167017999027, "grad_norm": 3.4179539680480957, "learning_rate": 1.0073053990361182e-06, "loss": 1.4498, "step": 2945 }, { "epoch": 1.9108156315874818, "grad_norm": 4.120007038116455, "learning_rate": 9.928878773020956e-07, "loss": 1.5156, "step": 2946 }, { "epoch": 1.9114642451759365, "grad_norm": 4.100413799285889, "learning_rate": 9.785737645766403e-07, "loss": 1.6076, "step": 2947 }, { "epoch": 1.9121128587643912, "grad_norm": 3.398078680038452, "learning_rate": 9.643630758102484e-07, "loss": 1.4822, "step": 2948 }, { "epoch": 1.9127614723528459, "grad_norm": 3.187776803970337, "learning_rate": 9.502558258454364e-07, "loss": 1.7457, "step": 2949 }, { "epoch": 1.9134100859413006, "grad_norm": 2.963242769241333, "learning_rate": 9.362520294166733e-07, "loss": 1.2732, "step": 2950 }, { "epoch": 1.9140586995297553, "grad_norm": 3.782763957977295, "learning_rate": 9.223517011503591e-07, "loss": 1.518, "step": 2951 }, { "epoch": 1.91470731311821, "grad_norm": 3.964421272277832, "learning_rate": 9.08554855564836e-07, "loss": 2.0361, "step": 2952 }, { "epoch": 1.9153559267066647, "grad_norm": 3.2636027336120605, "learning_rate": 8.948615070703769e-07, "loss": 1.09, "step": 2953 }, { "epoch": 1.9160045402951191, "grad_norm": 3.895223617553711, "learning_rate": 8.812716699691193e-07, "loss": 1.6104, "step": 2954 }, { "epoch": 1.9166531538835738, "grad_norm": 4.242066383361816, "learning_rate": 8.677853584551309e-07, "loss": 1.7069, "step": 2955 }, { "epoch": 1.9173017674720285, "grad_norm": 4.717459678649902, "learning_rate": 8.544025866143224e-07, "loss": 1.8142, "step": 2956 }, { "epoch": 1.9179503810604832, "grad_norm": 3.7411346435546875, "learning_rate": 8.41123368424468e-07, "loss": 1.5264, "step": 2957 }, { "epoch": 1.918598994648938, "grad_norm": 4.391357898712158, "learning_rate": 8.279477177551842e-07, "loss": 1.4961, "step": 2958 }, { "epoch": 1.9192476082373926, "grad_norm": 4.828644275665283, "learning_rate": 8.148756483679187e-07, "loss": 1.6426, "step": 2959 }, { "epoch": 1.9198962218258473, "grad_norm": 4.502415657043457, "learning_rate": 8.019071739159278e-07, "loss": 1.6672, "step": 2960 }, { "epoch": 1.9205448354143018, "grad_norm": 3.5933210849761963, "learning_rate": 7.890423079442766e-07, "loss": 1.4816, "step": 2961 }, { "epoch": 1.9211934490027565, "grad_norm": 4.128708362579346, "learning_rate": 7.762810638898055e-07, "loss": 1.9086, "step": 2962 }, { "epoch": 1.9218420625912112, "grad_norm": 5.281210422515869, "learning_rate": 7.636234550811194e-07, "loss": 1.7385, "step": 2963 }, { "epoch": 1.9224906761796658, "grad_norm": 5.041767120361328, "learning_rate": 7.510694947385877e-07, "loss": 1.5938, "step": 2964 }, { "epoch": 1.9231392897681205, "grad_norm": 3.4296810626983643, "learning_rate": 7.386191959743549e-07, "loss": 1.7191, "step": 2965 }, { "epoch": 1.9237879033565752, "grad_norm": 3.5881218910217285, "learning_rate": 7.262725717922303e-07, "loss": 1.4608, "step": 2966 }, { "epoch": 1.92443651694503, "grad_norm": 3.7223548889160156, "learning_rate": 7.140296350877985e-07, "loss": 1.5195, "step": 2967 }, { "epoch": 1.9250851305334846, "grad_norm": 4.246870994567871, "learning_rate": 7.018903986483083e-07, "loss": 1.811, "step": 2968 }, { "epoch": 1.9257337441219393, "grad_norm": 3.8602375984191895, "learning_rate": 6.898548751527068e-07, "loss": 1.5002, "step": 2969 }, { "epoch": 1.926382357710394, "grad_norm": 3.662970542907715, "learning_rate": 6.779230771716383e-07, "loss": 1.5217, "step": 2970 }, { "epoch": 1.9270309712988487, "grad_norm": 4.314441204071045, "learning_rate": 6.660950171673786e-07, "loss": 1.7055, "step": 2971 }, { "epoch": 1.9276795848873034, "grad_norm": 3.59952974319458, "learning_rate": 6.543707074938787e-07, "loss": 1.4693, "step": 2972 }, { "epoch": 1.928328198475758, "grad_norm": 3.5033376216888428, "learning_rate": 6.427501603967101e-07, "loss": 1.4613, "step": 2973 }, { "epoch": 1.9289768120642128, "grad_norm": 3.699728488922119, "learning_rate": 6.31233388013075e-07, "loss": 1.8418, "step": 2974 }, { "epoch": 1.9296254256526675, "grad_norm": 3.287628412246704, "learning_rate": 6.198204023717847e-07, "loss": 1.3444, "step": 2975 }, { "epoch": 1.9302740392411222, "grad_norm": 4.232908725738525, "learning_rate": 6.085112153932593e-07, "loss": 1.8365, "step": 2976 }, { "epoch": 1.9309226528295769, "grad_norm": 4.041594982147217, "learning_rate": 5.973058388894837e-07, "loss": 1.584, "step": 2977 }, { "epoch": 1.9315712664180316, "grad_norm": 3.646378755569458, "learning_rate": 5.862042845640403e-07, "loss": 1.3818, "step": 2978 }, { "epoch": 1.9322198800064863, "grad_norm": 4.270051956176758, "learning_rate": 5.752065640120541e-07, "loss": 1.4899, "step": 2979 }, { "epoch": 1.932868493594941, "grad_norm": 4.203458309173584, "learning_rate": 5.643126887202143e-07, "loss": 1.8083, "step": 2980 }, { "epoch": 1.9335171071833956, "grad_norm": 3.4022433757781982, "learning_rate": 5.535226700667528e-07, "loss": 1.4982, "step": 2981 }, { "epoch": 1.9341657207718501, "grad_norm": 3.855419635772705, "learning_rate": 5.428365193213992e-07, "loss": 1.5489, "step": 2982 }, { "epoch": 1.9348143343603048, "grad_norm": 3.259767770767212, "learning_rate": 5.32254247645425e-07, "loss": 1.2992, "step": 2983 }, { "epoch": 1.9354629479487595, "grad_norm": 3.3422834873199463, "learning_rate": 5.217758660915784e-07, "loss": 1.6832, "step": 2984 }, { "epoch": 1.9361115615372142, "grad_norm": 3.184936761856079, "learning_rate": 5.114013856041377e-07, "loss": 1.3683, "step": 2985 }, { "epoch": 1.936760175125669, "grad_norm": 3.9365456104278564, "learning_rate": 5.011308170188245e-07, "loss": 1.7342, "step": 2986 }, { "epoch": 1.9374087887141236, "grad_norm": 4.059337139129639, "learning_rate": 4.909641710628354e-07, "loss": 1.7243, "step": 2987 }, { "epoch": 1.9380574023025783, "grad_norm": 3.693351984024048, "learning_rate": 4.809014583548432e-07, "loss": 1.4151, "step": 2988 }, { "epoch": 1.9387060158910328, "grad_norm": 4.075037002563477, "learning_rate": 4.709426894049407e-07, "loss": 1.9239, "step": 2989 }, { "epoch": 1.9393546294794874, "grad_norm": 3.035301446914673, "learning_rate": 4.6108787461468516e-07, "loss": 1.1466, "step": 2990 }, { "epoch": 1.9400032430679421, "grad_norm": 3.820017099380493, "learning_rate": 4.5133702427700986e-07, "loss": 1.387, "step": 2991 }, { "epoch": 1.9406518566563968, "grad_norm": 3.9929842948913574, "learning_rate": 4.4169014857632364e-07, "loss": 1.5571, "step": 2992 }, { "epoch": 1.9413004702448515, "grad_norm": 4.609864711761475, "learning_rate": 4.321472575883889e-07, "loss": 1.7019, "step": 2993 }, { "epoch": 1.9419490838333062, "grad_norm": 4.620673656463623, "learning_rate": 4.227083612803884e-07, "loss": 1.6932, "step": 2994 }, { "epoch": 1.942597697421761, "grad_norm": 3.9064674377441406, "learning_rate": 4.133734695108582e-07, "loss": 1.4116, "step": 2995 }, { "epoch": 1.9432463110102156, "grad_norm": 3.848545789718628, "learning_rate": 4.041425920297326e-07, "loss": 1.7602, "step": 2996 }, { "epoch": 1.9438949245986703, "grad_norm": 4.10701322555542, "learning_rate": 3.950157384783104e-07, "loss": 1.8215, "step": 2997 }, { "epoch": 1.944543538187125, "grad_norm": 3.1775357723236084, "learning_rate": 3.859929183892108e-07, "loss": 1.3084, "step": 2998 }, { "epoch": 1.9451921517755797, "grad_norm": 3.5141162872314453, "learning_rate": 3.770741411864176e-07, "loss": 1.7156, "step": 2999 }, { "epoch": 1.9458407653640344, "grad_norm": 4.541618347167969, "learning_rate": 3.6825941618524594e-07, "loss": 1.7177, "step": 3000 }, { "epoch": 1.9458407653640344, "eval_loss": 1.8095293045043945, "eval_runtime": 35.1011, "eval_samples_per_second": 58.574, "eval_steps_per_second": 14.643, "step": 3000 }, { "epoch": 1.946489378952489, "grad_norm": 3.6778409481048584, "learning_rate": 3.5954875259232023e-07, "loss": 1.42, "step": 3001 }, { "epoch": 1.9471379925409438, "grad_norm": 4.308028697967529, "learning_rate": 3.5094215950557394e-07, "loss": 1.6654, "step": 3002 }, { "epoch": 1.9477866061293985, "grad_norm": 4.574370384216309, "learning_rate": 3.424396459142831e-07, "loss": 1.6826, "step": 3003 }, { "epoch": 1.9484352197178532, "grad_norm": 3.678739070892334, "learning_rate": 3.3404122069895515e-07, "loss": 1.5386, "step": 3004 }, { "epoch": 1.9490838333063079, "grad_norm": 4.089027404785156, "learning_rate": 3.257468926314289e-07, "loss": 1.6729, "step": 3005 }, { "epoch": 1.9497324468947625, "grad_norm": 4.182556629180908, "learning_rate": 3.1755667037479676e-07, "loss": 1.6968, "step": 3006 }, { "epoch": 1.9503810604832172, "grad_norm": 4.085065841674805, "learning_rate": 3.094705624834271e-07, "loss": 1.7223, "step": 3007 }, { "epoch": 1.951029674071672, "grad_norm": 3.7673091888427734, "learning_rate": 3.014885774029419e-07, "loss": 1.6627, "step": 3008 }, { "epoch": 1.9516782876601266, "grad_norm": 3.997030258178711, "learning_rate": 2.936107234701946e-07, "loss": 1.5503, "step": 3009 }, { "epoch": 1.952326901248581, "grad_norm": 3.7440600395202637, "learning_rate": 2.8583700891330335e-07, "loss": 1.5001, "step": 3010 }, { "epoch": 1.9529755148370358, "grad_norm": 4.037206649780273, "learning_rate": 2.781674418515956e-07, "loss": 1.6562, "step": 3011 }, { "epoch": 1.9536241284254905, "grad_norm": 2.904554843902588, "learning_rate": 2.7060203029564135e-07, "loss": 1.3864, "step": 3012 }, { "epoch": 1.9542727420139452, "grad_norm": 4.295997619628906, "learning_rate": 2.6314078214719764e-07, "loss": 1.8303, "step": 3013 }, { "epoch": 1.9549213556023999, "grad_norm": 3.855299472808838, "learning_rate": 2.55783705199264e-07, "loss": 1.4629, "step": 3014 }, { "epoch": 1.9555699691908546, "grad_norm": 4.455493450164795, "learning_rate": 2.4853080713600483e-07, "loss": 1.6651, "step": 3015 }, { "epoch": 1.9562185827793093, "grad_norm": 3.1951048374176025, "learning_rate": 2.413820955327828e-07, "loss": 1.2877, "step": 3016 }, { "epoch": 1.9568671963677637, "grad_norm": 3.2502670288085938, "learning_rate": 2.343375778561474e-07, "loss": 1.3391, "step": 3017 }, { "epoch": 1.9575158099562184, "grad_norm": 4.558046817779541, "learning_rate": 2.2739726146381311e-07, "loss": 1.8658, "step": 3018 }, { "epoch": 1.9581644235446731, "grad_norm": 3.8116796016693115, "learning_rate": 2.2056115360468143e-07, "loss": 1.7049, "step": 3019 }, { "epoch": 1.9588130371331278, "grad_norm": 3.8781213760375977, "learning_rate": 2.1382926141877425e-07, "loss": 1.4922, "step": 3020 }, { "epoch": 1.9594616507215825, "grad_norm": 4.510586261749268, "learning_rate": 2.0720159193730048e-07, "loss": 1.6788, "step": 3021 }, { "epoch": 1.9601102643100372, "grad_norm": 3.9747118949890137, "learning_rate": 2.0067815208260066e-07, "loss": 1.6454, "step": 3022 }, { "epoch": 1.960758877898492, "grad_norm": 4.142552375793457, "learning_rate": 1.9425894866813566e-07, "loss": 1.5863, "step": 3023 }, { "epoch": 1.9614074914869466, "grad_norm": 3.511263370513916, "learning_rate": 1.8794398839853121e-07, "loss": 1.5666, "step": 3024 }, { "epoch": 1.9620561050754013, "grad_norm": 3.404165506362915, "learning_rate": 1.8173327786948912e-07, "loss": 1.3809, "step": 3025 }, { "epoch": 1.962704718663856, "grad_norm": 4.866592884063721, "learning_rate": 1.7562682356786487e-07, "loss": 1.619, "step": 3026 }, { "epoch": 1.9633533322523107, "grad_norm": 3.441409111022949, "learning_rate": 1.6962463187160106e-07, "loss": 1.4755, "step": 3027 }, { "epoch": 1.9640019458407654, "grad_norm": 3.0417816638946533, "learning_rate": 1.6372670904974963e-07, "loss": 1.2714, "step": 3028 }, { "epoch": 1.96465055942922, "grad_norm": 4.760983943939209, "learning_rate": 1.5793306126247187e-07, "loss": 1.9, "step": 3029 }, { "epoch": 1.9652991730176748, "grad_norm": 4.103235244750977, "learning_rate": 1.5224369456098285e-07, "loss": 1.743, "step": 3030 }, { "epoch": 1.9659477866061295, "grad_norm": 2.864081859588623, "learning_rate": 1.4665861488761813e-07, "loss": 1.1274, "step": 3031 }, { "epoch": 1.9665964001945841, "grad_norm": 4.193395614624023, "learning_rate": 1.4117782807575585e-07, "loss": 1.5352, "step": 3032 }, { "epoch": 1.9672450137830388, "grad_norm": 3.922750949859619, "learning_rate": 1.3580133984987254e-07, "loss": 1.4893, "step": 3033 }, { "epoch": 1.9678936273714935, "grad_norm": 4.047073841094971, "learning_rate": 1.305291558254984e-07, "loss": 1.4842, "step": 3034 }, { "epoch": 1.9685422409599482, "grad_norm": 3.948975086212158, "learning_rate": 1.2536128150921757e-07, "loss": 1.6871, "step": 3035 }, { "epoch": 1.969190854548403, "grad_norm": 3.684391498565674, "learning_rate": 1.2029772229865676e-07, "loss": 1.5092, "step": 3036 }, { "epoch": 1.9698394681368576, "grad_norm": 4.096811294555664, "learning_rate": 1.1533848348252996e-07, "loss": 1.4593, "step": 3037 }, { "epoch": 1.970488081725312, "grad_norm": 3.7814786434173584, "learning_rate": 1.1048357024054934e-07, "loss": 1.6341, "step": 3038 }, { "epoch": 1.9711366953137668, "grad_norm": 3.643556833267212, "learning_rate": 1.0573298764348094e-07, "loss": 1.5072, "step": 3039 }, { "epoch": 1.9717853089022215, "grad_norm": 4.24396276473999, "learning_rate": 1.010867406531113e-07, "loss": 1.5855, "step": 3040 }, { "epoch": 1.9724339224906762, "grad_norm": 4.070800304412842, "learning_rate": 9.654483412228078e-08, "loss": 1.6089, "step": 3041 }, { "epoch": 1.9730825360791309, "grad_norm": 3.6172778606414795, "learning_rate": 9.21072727948169e-08, "loss": 1.4299, "step": 3042 }, { "epoch": 1.9737311496675856, "grad_norm": 4.127065658569336, "learning_rate": 8.777406130558996e-08, "loss": 1.7877, "step": 3043 }, { "epoch": 1.9743797632560403, "grad_norm": 4.066019535064697, "learning_rate": 8.354520418044631e-08, "loss": 1.6611, "step": 3044 }, { "epoch": 1.9750283768444947, "grad_norm": 3.6341562271118164, "learning_rate": 7.942070583627503e-08, "loss": 1.5374, "step": 3045 }, { "epoch": 1.9756769904329494, "grad_norm": 3.6916756629943848, "learning_rate": 7.54005705809524e-08, "loss": 1.6682, "step": 3046 }, { "epoch": 1.9763256040214041, "grad_norm": 3.2920279502868652, "learning_rate": 7.1484802613353e-08, "loss": 1.319, "step": 3047 }, { "epoch": 1.9769742176098588, "grad_norm": 3.9806532859802246, "learning_rate": 6.76734060233275e-08, "loss": 1.7824, "step": 3048 }, { "epoch": 1.9776228311983135, "grad_norm": 4.110844612121582, "learning_rate": 6.3966384791736e-08, "loss": 1.9061, "step": 3049 }, { "epoch": 1.9782714447867682, "grad_norm": 3.641789436340332, "learning_rate": 6.036374279041469e-08, "loss": 1.2725, "step": 3050 }, { "epoch": 1.978920058375223, "grad_norm": 3.0695769786834717, "learning_rate": 5.686548378218692e-08, "loss": 1.3112, "step": 3051 }, { "epoch": 1.9795686719636776, "grad_norm": 4.121557712554932, "learning_rate": 5.347161142083001e-08, "loss": 1.785, "step": 3052 }, { "epoch": 1.9802172855521323, "grad_norm": 3.6425161361694336, "learning_rate": 5.018212925113064e-08, "loss": 1.4396, "step": 3053 }, { "epoch": 1.980865899140587, "grad_norm": 3.896867513656616, "learning_rate": 4.699704070880717e-08, "loss": 1.6528, "step": 3054 }, { "epoch": 1.9815145127290417, "grad_norm": 3.6801159381866455, "learning_rate": 4.391634912056519e-08, "loss": 1.451, "step": 3055 }, { "epoch": 1.9821631263174964, "grad_norm": 4.225161552429199, "learning_rate": 4.094005770406417e-08, "loss": 1.5651, "step": 3056 }, { "epoch": 1.982811739905951, "grad_norm": 3.9316306114196777, "learning_rate": 3.806816956791748e-08, "loss": 1.5246, "step": 3057 }, { "epoch": 1.9834603534944057, "grad_norm": 3.4197158813476562, "learning_rate": 3.5300687711703475e-08, "loss": 1.3627, "step": 3058 }, { "epoch": 1.9841089670828604, "grad_norm": 4.311890125274658, "learning_rate": 3.263761502594331e-08, "loss": 1.7444, "step": 3059 }, { "epoch": 1.9847575806713151, "grad_norm": 3.8272769451141357, "learning_rate": 3.0078954292123154e-08, "loss": 1.6508, "step": 3060 }, { "epoch": 1.9854061942597698, "grad_norm": 3.39658260345459, "learning_rate": 2.7624708182649728e-08, "loss": 1.1715, "step": 3061 }, { "epoch": 1.9860548078482245, "grad_norm": 4.383815288543701, "learning_rate": 2.5274879260883677e-08, "loss": 1.8981, "step": 3062 }, { "epoch": 1.9867034214366792, "grad_norm": 4.106340408325195, "learning_rate": 2.302946998113953e-08, "loss": 1.4404, "step": 3063 }, { "epoch": 1.987352035025134, "grad_norm": 3.506497859954834, "learning_rate": 2.08884826886524e-08, "loss": 1.5293, "step": 3064 }, { "epoch": 1.9880006486135886, "grad_norm": 3.8884854316711426, "learning_rate": 1.88519196196002e-08, "loss": 1.7149, "step": 3065 }, { "epoch": 1.988649262202043, "grad_norm": 3.66325306892395, "learning_rate": 1.6919782901092527e-08, "loss": 1.4065, "step": 3066 }, { "epoch": 1.9892978757904978, "grad_norm": 3.464769124984741, "learning_rate": 1.5092074551170676e-08, "loss": 1.461, "step": 3067 }, { "epoch": 1.9899464893789525, "grad_norm": 4.370641231536865, "learning_rate": 1.3368796478807621e-08, "loss": 1.9601, "step": 3068 }, { "epoch": 1.9905951029674072, "grad_norm": 3.9992072582244873, "learning_rate": 1.174995048388583e-08, "loss": 1.353, "step": 3069 }, { "epoch": 1.9912437165558619, "grad_norm": 3.9411115646362305, "learning_rate": 1.0235538257230558e-08, "loss": 1.6022, "step": 3070 }, { "epoch": 1.9918923301443165, "grad_norm": 4.436995506286621, "learning_rate": 8.825561380598757e-09, "loss": 1.6347, "step": 3071 }, { "epoch": 1.9925409437327712, "grad_norm": 3.9146671295166016, "learning_rate": 7.520021326634652e-09, "loss": 1.5475, "step": 3072 }, { "epoch": 1.9931895573212257, "grad_norm": 3.825161933898926, "learning_rate": 6.318919458936367e-09, "loss": 1.4209, "step": 3073 }, { "epoch": 1.9938381709096804, "grad_norm": 3.7990143299102783, "learning_rate": 5.222257032011513e-09, "loss": 1.6408, "step": 3074 }, { "epoch": 1.994486784498135, "grad_norm": 3.7868599891662598, "learning_rate": 4.23003519126608e-09, "loss": 1.2884, "step": 3075 }, { "epoch": 1.9951353980865898, "grad_norm": 4.650010585784912, "learning_rate": 3.3422549730377506e-09, "loss": 1.5425, "step": 3076 }, { "epoch": 1.9957840116750445, "grad_norm": 4.488104820251465, "learning_rate": 2.5589173045958983e-09, "loss": 1.6848, "step": 3077 }, { "epoch": 1.9964326252634992, "grad_norm": 2.9422812461853027, "learning_rate": 1.8800230040860733e-09, "loss": 1.3039, "step": 3078 }, { "epoch": 1.9970812388519539, "grad_norm": 3.495547294616699, "learning_rate": 1.3055727806077222e-09, "loss": 1.4678, "step": 3079 }, { "epoch": 1.9977298524404086, "grad_norm": 3.8827977180480957, "learning_rate": 8.355672341253673e-10, "loss": 1.7146, "step": 3080 }, { "epoch": 1.9983784660288633, "grad_norm": 4.386503219604492, "learning_rate": 4.700068555574255e-10, "loss": 1.723, "step": 3081 }, { "epoch": 1.999027079617318, "grad_norm": 3.5366950035095215, "learning_rate": 2.0889202670959506e-10, "loss": 1.359, "step": 3082 }, { "epoch": 1.9996756932057727, "grad_norm": 5.078452110290527, "learning_rate": 5.222302031926418e-11, "loss": 1.8279, "step": 3083 }, { "epoch": 2.0003243067942273, "grad_norm": 6.576585292816162, "learning_rate": 0.0, "loss": 2.2408, "step": 3084 } ], "logging_steps": 1, "max_steps": 3084, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2233780041023488e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }