{ "best_metric": 2.8474695682525635, "best_model_checkpoint": "miner_id_24/checkpoint-3600", "epoch": 1.0575712248234599, "eval_steps": 100, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002782899085121926, "grad_norm": 17.002941131591797, "learning_rate": 2e-05, "loss": 5.9812, "step": 1 }, { "epoch": 0.0002782899085121926, "eval_loss": 5.98688268661499, "eval_runtime": 84.9746, "eval_samples_per_second": 58.841, "eval_steps_per_second": 14.71, "step": 1 }, { "epoch": 0.0005565798170243852, "grad_norm": 19.287992477416992, "learning_rate": 4e-05, "loss": 5.9342, "step": 2 }, { "epoch": 0.0008348697255365778, "grad_norm": 21.941875457763672, "learning_rate": 6e-05, "loss": 6.385, "step": 3 }, { "epoch": 0.0011131596340487704, "grad_norm": 16.80741310119629, "learning_rate": 8e-05, "loss": 5.6794, "step": 4 }, { "epoch": 0.0013914495425609629, "grad_norm": 12.198833465576172, "learning_rate": 0.0001, "loss": 5.4401, "step": 5 }, { "epoch": 0.0016697394510731556, "grad_norm": 7.938112735748291, "learning_rate": 0.00012, "loss": 5.3999, "step": 6 }, { "epoch": 0.001948029359585348, "grad_norm": 14.917191505432129, "learning_rate": 0.00014, "loss": 5.7897, "step": 7 }, { "epoch": 0.0022263192680975407, "grad_norm": 7.297199726104736, "learning_rate": 0.00016, "loss": 4.7889, "step": 8 }, { "epoch": 0.002504609176609733, "grad_norm": 7.74416446685791, "learning_rate": 0.00018, "loss": 4.7504, "step": 9 }, { "epoch": 0.0027828990851219257, "grad_norm": 6.026649475097656, "learning_rate": 0.0002, "loss": 3.9825, "step": 10 }, { "epoch": 0.0030611889936341184, "grad_norm": 9.58944320678711, "learning_rate": 0.0001999999904195954, "loss": 4.3154, "step": 11 }, { "epoch": 0.003339478902146311, "grad_norm": 7.8204426765441895, "learning_rate": 0.00019999996167838346, "loss": 4.0044, "step": 12 }, { "epoch": 0.0036177688106585034, "grad_norm": 11.4631929397583, "learning_rate": 0.0001999999137763697, "loss": 4.0882, "step": 13 }, { "epoch": 0.003896058719170696, "grad_norm": 9.104846000671387, "learning_rate": 0.00019999984671356322, "loss": 4.1362, "step": 14 }, { "epoch": 0.004174348627682889, "grad_norm": 4.854076385498047, "learning_rate": 0.00019999976048997695, "loss": 3.8936, "step": 15 }, { "epoch": 0.0044526385361950815, "grad_norm": 4.761904239654541, "learning_rate": 0.0001999996551056274, "loss": 3.7275, "step": 16 }, { "epoch": 0.004730928444707274, "grad_norm": 5.670674800872803, "learning_rate": 0.0001999995305605347, "loss": 3.7384, "step": 17 }, { "epoch": 0.005009218353219466, "grad_norm": 3.3813259601593018, "learning_rate": 0.00019999938685472278, "loss": 3.717, "step": 18 }, { "epoch": 0.005287508261731659, "grad_norm": 3.5495376586914062, "learning_rate": 0.0001999992239882192, "loss": 3.6773, "step": 19 }, { "epoch": 0.005565798170243851, "grad_norm": 2.8463995456695557, "learning_rate": 0.00019999904196105507, "loss": 3.4787, "step": 20 }, { "epoch": 0.005844088078756044, "grad_norm": 3.221832513809204, "learning_rate": 0.00019999884077326533, "loss": 3.6683, "step": 21 }, { "epoch": 0.006122377987268237, "grad_norm": 3.0998988151550293, "learning_rate": 0.00019999862042488853, "loss": 3.9501, "step": 22 }, { "epoch": 0.0064006678957804295, "grad_norm": 3.077512502670288, "learning_rate": 0.00019999838091596688, "loss": 3.6844, "step": 23 }, { "epoch": 0.006678957804292622, "grad_norm": 2.7098445892333984, "learning_rate": 0.00019999812224654625, "loss": 3.3924, "step": 24 }, { "epoch": 0.006957247712804814, "grad_norm": 3.481621026992798, "learning_rate": 0.00019999784441667627, "loss": 3.6165, "step": 25 }, { "epoch": 0.007235537621317007, "grad_norm": 2.7846627235412598, "learning_rate": 0.0001999975474264101, "loss": 3.6616, "step": 26 }, { "epoch": 0.0075138275298291994, "grad_norm": 2.986056089401245, "learning_rate": 0.00019999723127580468, "loss": 3.6636, "step": 27 }, { "epoch": 0.007792117438341392, "grad_norm": 2.3427329063415527, "learning_rate": 0.00019999689596492058, "loss": 3.3375, "step": 28 }, { "epoch": 0.008070407346853584, "grad_norm": 2.994873523712158, "learning_rate": 0.00019999654149382206, "loss": 3.4239, "step": 29 }, { "epoch": 0.008348697255365778, "grad_norm": 2.916687250137329, "learning_rate": 0.00019999616786257703, "loss": 3.6612, "step": 30 }, { "epoch": 0.00862698716387797, "grad_norm": 3.2186973094940186, "learning_rate": 0.00019999577507125705, "loss": 3.4924, "step": 31 }, { "epoch": 0.008905277072390163, "grad_norm": 2.7801923751831055, "learning_rate": 0.00019999536311993742, "loss": 3.5651, "step": 32 }, { "epoch": 0.009183566980902355, "grad_norm": 3.049238443374634, "learning_rate": 0.00019999493200869713, "loss": 3.6423, "step": 33 }, { "epoch": 0.009461856889414548, "grad_norm": 2.719590425491333, "learning_rate": 0.00019999448173761865, "loss": 3.2166, "step": 34 }, { "epoch": 0.00974014679792674, "grad_norm": 2.965956211090088, "learning_rate": 0.00019999401230678837, "loss": 3.66, "step": 35 }, { "epoch": 0.010018436706438932, "grad_norm": 2.7884485721588135, "learning_rate": 0.00019999352371629617, "loss": 3.3686, "step": 36 }, { "epoch": 0.010296726614951126, "grad_norm": 3.150559186935425, "learning_rate": 0.00019999301596623567, "loss": 3.6838, "step": 37 }, { "epoch": 0.010575016523463317, "grad_norm": 3.7389256954193115, "learning_rate": 0.0001999924890567042, "loss": 3.2839, "step": 38 }, { "epoch": 0.010853306431975511, "grad_norm": 2.6185200214385986, "learning_rate": 0.00019999194298780273, "loss": 3.4399, "step": 39 }, { "epoch": 0.011131596340487703, "grad_norm": 2.8291258811950684, "learning_rate": 0.0001999913777596358, "loss": 3.3369, "step": 40 }, { "epoch": 0.011409886248999896, "grad_norm": 3.6246542930603027, "learning_rate": 0.00019999079337231185, "loss": 3.2463, "step": 41 }, { "epoch": 0.011688176157512088, "grad_norm": 2.834122657775879, "learning_rate": 0.0001999901898259427, "loss": 3.2887, "step": 42 }, { "epoch": 0.01196646606602428, "grad_norm": 2.774077892303467, "learning_rate": 0.00019998956712064412, "loss": 3.2401, "step": 43 }, { "epoch": 0.012244755974536474, "grad_norm": 3.3849713802337646, "learning_rate": 0.00019998892525653535, "loss": 3.599, "step": 44 }, { "epoch": 0.012523045883048665, "grad_norm": 3.1346535682678223, "learning_rate": 0.00019998826423373942, "loss": 3.4501, "step": 45 }, { "epoch": 0.012801335791560859, "grad_norm": 2.7817208766937256, "learning_rate": 0.00019998758405238295, "loss": 3.405, "step": 46 }, { "epoch": 0.01307962570007305, "grad_norm": 3.800513744354248, "learning_rate": 0.0001999868847125963, "loss": 3.285, "step": 47 }, { "epoch": 0.013357915608585244, "grad_norm": 3.1255245208740234, "learning_rate": 0.00019998616621451349, "loss": 3.2415, "step": 48 }, { "epoch": 0.013636205517097436, "grad_norm": 3.535891056060791, "learning_rate": 0.0001999854285582721, "loss": 3.5676, "step": 49 }, { "epoch": 0.013914495425609628, "grad_norm": 3.263808012008667, "learning_rate": 0.00019998467174401355, "loss": 3.525, "step": 50 }, { "epoch": 0.014192785334121822, "grad_norm": 2.7439327239990234, "learning_rate": 0.00019998389577188284, "loss": 3.4441, "step": 51 }, { "epoch": 0.014471075242634013, "grad_norm": 3.0952131748199463, "learning_rate": 0.00019998310064202866, "loss": 3.2142, "step": 52 }, { "epoch": 0.014749365151146207, "grad_norm": 2.5860517024993896, "learning_rate": 0.00019998228635460336, "loss": 3.3881, "step": 53 }, { "epoch": 0.015027655059658399, "grad_norm": 2.5894296169281006, "learning_rate": 0.00019998145290976287, "loss": 3.4558, "step": 54 }, { "epoch": 0.015305944968170592, "grad_norm": 3.080479621887207, "learning_rate": 0.000199980600307667, "loss": 3.427, "step": 55 }, { "epoch": 0.015584234876682784, "grad_norm": 3.147078037261963, "learning_rate": 0.00019997972854847912, "loss": 3.539, "step": 56 }, { "epoch": 0.015862524785194978, "grad_norm": 2.9708526134490967, "learning_rate": 0.0001999788376323662, "loss": 3.3065, "step": 57 }, { "epoch": 0.016140814693707168, "grad_norm": 2.3979759216308594, "learning_rate": 0.000199977927559499, "loss": 3.3108, "step": 58 }, { "epoch": 0.01641910460221936, "grad_norm": 2.7668023109436035, "learning_rate": 0.0001999769983300518, "loss": 3.2346, "step": 59 }, { "epoch": 0.016697394510731555, "grad_norm": 3.3077287673950195, "learning_rate": 0.00019997604994420276, "loss": 3.0784, "step": 60 }, { "epoch": 0.01697568441924375, "grad_norm": 3.038454055786133, "learning_rate": 0.0001999750824021336, "loss": 3.2051, "step": 61 }, { "epoch": 0.01725397432775594, "grad_norm": 2.8979313373565674, "learning_rate": 0.00019997409570402961, "loss": 3.3868, "step": 62 }, { "epoch": 0.017532264236268132, "grad_norm": 2.8584742546081543, "learning_rate": 0.0001999730898500799, "loss": 3.4808, "step": 63 }, { "epoch": 0.017810554144780326, "grad_norm": 2.4810538291931152, "learning_rate": 0.0001999720648404772, "loss": 3.2123, "step": 64 }, { "epoch": 0.018088844053292516, "grad_norm": 2.732379674911499, "learning_rate": 0.00019997102067541796, "loss": 3.1345, "step": 65 }, { "epoch": 0.01836713396180471, "grad_norm": 3.2496063709259033, "learning_rate": 0.0001999699573551022, "loss": 3.5387, "step": 66 }, { "epoch": 0.018645423870316903, "grad_norm": 2.8145253658294678, "learning_rate": 0.00019996887487973365, "loss": 3.1685, "step": 67 }, { "epoch": 0.018923713778829097, "grad_norm": 2.7135744094848633, "learning_rate": 0.00019996777324951973, "loss": 3.1059, "step": 68 }, { "epoch": 0.019202003687341287, "grad_norm": 3.733142137527466, "learning_rate": 0.00019996665246467155, "loss": 3.2716, "step": 69 }, { "epoch": 0.01948029359585348, "grad_norm": 2.9407293796539307, "learning_rate": 0.00019996551252540382, "loss": 3.3199, "step": 70 }, { "epoch": 0.019758583504365674, "grad_norm": 2.5942862033843994, "learning_rate": 0.000199964353431935, "loss": 3.0171, "step": 71 }, { "epoch": 0.020036873412877864, "grad_norm": 3.57987904548645, "learning_rate": 0.00019996317518448714, "loss": 3.3179, "step": 72 }, { "epoch": 0.020315163321390058, "grad_norm": 3.3481435775756836, "learning_rate": 0.00019996197778328602, "loss": 2.9369, "step": 73 }, { "epoch": 0.02059345322990225, "grad_norm": 3.3575656414031982, "learning_rate": 0.0001999607612285611, "loss": 3.4142, "step": 74 }, { "epoch": 0.020871743138414445, "grad_norm": 2.5996809005737305, "learning_rate": 0.00019995952552054544, "loss": 3.2364, "step": 75 }, { "epoch": 0.021150033046926635, "grad_norm": 2.7393929958343506, "learning_rate": 0.00019995827065947584, "loss": 3.2158, "step": 76 }, { "epoch": 0.02142832295543883, "grad_norm": 3.2959396839141846, "learning_rate": 0.00019995699664559276, "loss": 3.561, "step": 77 }, { "epoch": 0.021706612863951022, "grad_norm": 3.0207879543304443, "learning_rate": 0.00019995570347914026, "loss": 3.367, "step": 78 }, { "epoch": 0.021984902772463212, "grad_norm": 3.2401602268218994, "learning_rate": 0.0001999543911603661, "loss": 3.1579, "step": 79 }, { "epoch": 0.022263192680975406, "grad_norm": 3.10953426361084, "learning_rate": 0.00019995305968952183, "loss": 3.3987, "step": 80 }, { "epoch": 0.0225414825894876, "grad_norm": 2.6518425941467285, "learning_rate": 0.00019995170906686251, "loss": 2.9922, "step": 81 }, { "epoch": 0.022819772497999793, "grad_norm": 2.9276158809661865, "learning_rate": 0.00019995033929264694, "loss": 3.3777, "step": 82 }, { "epoch": 0.023098062406511983, "grad_norm": 2.297638177871704, "learning_rate": 0.00019994895036713756, "loss": 3.131, "step": 83 }, { "epoch": 0.023376352315024176, "grad_norm": 3.2343451976776123, "learning_rate": 0.00019994754229060052, "loss": 3.3614, "step": 84 }, { "epoch": 0.02365464222353637, "grad_norm": 6.476233005523682, "learning_rate": 0.00019994611506330562, "loss": 3.5223, "step": 85 }, { "epoch": 0.02393293213204856, "grad_norm": 2.676745891571045, "learning_rate": 0.00019994466868552627, "loss": 3.4546, "step": 86 }, { "epoch": 0.024211222040560754, "grad_norm": 2.841125965118408, "learning_rate": 0.00019994320315753973, "loss": 3.5407, "step": 87 }, { "epoch": 0.024489511949072947, "grad_norm": 2.9516446590423584, "learning_rate": 0.0001999417184796267, "loss": 3.103, "step": 88 }, { "epoch": 0.02476780185758514, "grad_norm": 2.586479663848877, "learning_rate": 0.00019994021465207174, "loss": 3.4791, "step": 89 }, { "epoch": 0.02504609176609733, "grad_norm": 2.7659854888916016, "learning_rate": 0.00019993869167516287, "loss": 3.2204, "step": 90 }, { "epoch": 0.025324381674609524, "grad_norm": 2.610041379928589, "learning_rate": 0.00019993714954919206, "loss": 3.1517, "step": 91 }, { "epoch": 0.025602671583121718, "grad_norm": 2.58697247505188, "learning_rate": 0.0001999355882744547, "loss": 3.047, "step": 92 }, { "epoch": 0.025880961491633908, "grad_norm": 2.3641750812530518, "learning_rate": 0.00019993400785124995, "loss": 3.1796, "step": 93 }, { "epoch": 0.0261592514001461, "grad_norm": 2.2581377029418945, "learning_rate": 0.00019993240827988063, "loss": 2.8717, "step": 94 }, { "epoch": 0.026437541308658295, "grad_norm": 2.949000120162964, "learning_rate": 0.00019993078956065323, "loss": 3.4251, "step": 95 }, { "epoch": 0.02671583121717049, "grad_norm": 2.414064645767212, "learning_rate": 0.00019992915169387795, "loss": 3.1424, "step": 96 }, { "epoch": 0.02699412112568268, "grad_norm": 2.637364149093628, "learning_rate": 0.00019992749467986857, "loss": 3.0854, "step": 97 }, { "epoch": 0.027272411034194873, "grad_norm": 2.871019124984741, "learning_rate": 0.00019992581851894264, "loss": 3.1808, "step": 98 }, { "epoch": 0.027550700942707066, "grad_norm": 3.2153420448303223, "learning_rate": 0.00019992412321142127, "loss": 3.1763, "step": 99 }, { "epoch": 0.027828990851219256, "grad_norm": 2.187028169631958, "learning_rate": 0.00019992240875762934, "loss": 2.9023, "step": 100 }, { "epoch": 0.027828990851219256, "eval_loss": 3.233719825744629, "eval_runtime": 84.2876, "eval_samples_per_second": 59.321, "eval_steps_per_second": 14.83, "step": 100 }, { "epoch": 0.02810728075973145, "grad_norm": 2.4928641319274902, "learning_rate": 0.0001999206751578953, "loss": 3.1677, "step": 101 }, { "epoch": 0.028385570668243643, "grad_norm": 3.1369922161102295, "learning_rate": 0.00019991892241255136, "loss": 3.2865, "step": 102 }, { "epoch": 0.028663860576755837, "grad_norm": 2.0534613132476807, "learning_rate": 0.00019991715052193338, "loss": 2.9914, "step": 103 }, { "epoch": 0.028942150485268027, "grad_norm": 2.562408685684204, "learning_rate": 0.00019991535948638082, "loss": 3.2549, "step": 104 }, { "epoch": 0.02922044039378022, "grad_norm": 2.6675612926483154, "learning_rate": 0.0001999135493062369, "loss": 3.2924, "step": 105 }, { "epoch": 0.029498730302292414, "grad_norm": 2.7064266204833984, "learning_rate": 0.00019991171998184844, "loss": 2.9505, "step": 106 }, { "epoch": 0.029777020210804604, "grad_norm": 2.5236973762512207, "learning_rate": 0.00019990987151356594, "loss": 3.3463, "step": 107 }, { "epoch": 0.030055310119316798, "grad_norm": 3.177499771118164, "learning_rate": 0.00019990800390174362, "loss": 3.0391, "step": 108 }, { "epoch": 0.03033360002782899, "grad_norm": 2.4843997955322266, "learning_rate": 0.0001999061171467393, "loss": 3.0856, "step": 109 }, { "epoch": 0.030611889936341185, "grad_norm": 3.058932065963745, "learning_rate": 0.0001999042112489145, "loss": 3.4859, "step": 110 }, { "epoch": 0.030890179844853375, "grad_norm": 2.678724527359009, "learning_rate": 0.00019990228620863441, "loss": 3.2138, "step": 111 }, { "epoch": 0.03116846975336557, "grad_norm": 3.12869930267334, "learning_rate": 0.0001999003420262679, "loss": 3.4137, "step": 112 }, { "epoch": 0.03144675966187776, "grad_norm": 2.3533921241760254, "learning_rate": 0.00019989837870218748, "loss": 3.228, "step": 113 }, { "epoch": 0.031725049570389956, "grad_norm": 3.090468645095825, "learning_rate": 0.00019989639623676933, "loss": 3.2394, "step": 114 }, { "epoch": 0.032003339478902146, "grad_norm": 2.8647165298461914, "learning_rate": 0.00019989439463039333, "loss": 3.3222, "step": 115 }, { "epoch": 0.032281629387414336, "grad_norm": 2.5960564613342285, "learning_rate": 0.00019989237388344294, "loss": 3.2122, "step": 116 }, { "epoch": 0.03255991929592653, "grad_norm": 2.588900327682495, "learning_rate": 0.00019989033399630542, "loss": 3.1173, "step": 117 }, { "epoch": 0.03283820920443872, "grad_norm": 2.8869099617004395, "learning_rate": 0.0001998882749693716, "loss": 2.9443, "step": 118 }, { "epoch": 0.03311649911295092, "grad_norm": 2.2668545246124268, "learning_rate": 0.00019988619680303603, "loss": 2.8916, "step": 119 }, { "epoch": 0.03339478902146311, "grad_norm": 2.4536972045898438, "learning_rate": 0.0001998840994976969, "loss": 2.8507, "step": 120 }, { "epoch": 0.0336730789299753, "grad_norm": 3.33591365814209, "learning_rate": 0.00019988198305375603, "loss": 3.4067, "step": 121 }, { "epoch": 0.0339513688384875, "grad_norm": 2.859400987625122, "learning_rate": 0.00019987984747161898, "loss": 3.3268, "step": 122 }, { "epoch": 0.03422965874699969, "grad_norm": 2.5193753242492676, "learning_rate": 0.00019987769275169495, "loss": 3.0829, "step": 123 }, { "epoch": 0.03450794865551188, "grad_norm": 2.6370418071746826, "learning_rate": 0.00019987551889439676, "loss": 3.3234, "step": 124 }, { "epoch": 0.034786238564024075, "grad_norm": 2.57183837890625, "learning_rate": 0.00019987332590014098, "loss": 3.2239, "step": 125 }, { "epoch": 0.035064528472536265, "grad_norm": 2.590930461883545, "learning_rate": 0.00019987111376934782, "loss": 3.0664, "step": 126 }, { "epoch": 0.035342818381048455, "grad_norm": 2.990720272064209, "learning_rate": 0.0001998688825024411, "loss": 2.9522, "step": 127 }, { "epoch": 0.03562110828956065, "grad_norm": 2.7668814659118652, "learning_rate": 0.00019986663209984836, "loss": 2.9067, "step": 128 }, { "epoch": 0.03589939819807284, "grad_norm": 2.579965591430664, "learning_rate": 0.00019986436256200078, "loss": 3.0813, "step": 129 }, { "epoch": 0.03617768810658503, "grad_norm": 2.8013083934783936, "learning_rate": 0.0001998620738893333, "loss": 3.505, "step": 130 }, { "epoch": 0.03645597801509723, "grad_norm": 3.043736457824707, "learning_rate": 0.00019985976608228435, "loss": 3.3471, "step": 131 }, { "epoch": 0.03673426792360942, "grad_norm": 2.9704949855804443, "learning_rate": 0.00019985743914129617, "loss": 3.1441, "step": 132 }, { "epoch": 0.037012557832121616, "grad_norm": 3.0298256874084473, "learning_rate": 0.00019985509306681462, "loss": 3.2919, "step": 133 }, { "epoch": 0.037290847740633806, "grad_norm": 2.366316318511963, "learning_rate": 0.00019985272785928924, "loss": 3.0449, "step": 134 }, { "epoch": 0.037569137649145996, "grad_norm": 2.357591390609741, "learning_rate": 0.0001998503435191732, "loss": 2.9952, "step": 135 }, { "epoch": 0.03784742755765819, "grad_norm": 2.6317784786224365, "learning_rate": 0.00019984794004692335, "loss": 3.0721, "step": 136 }, { "epoch": 0.038125717466170383, "grad_norm": 3.0209872722625732, "learning_rate": 0.00019984551744300024, "loss": 3.4592, "step": 137 }, { "epoch": 0.038404007374682574, "grad_norm": 2.1290223598480225, "learning_rate": 0.00019984307570786804, "loss": 3.0236, "step": 138 }, { "epoch": 0.03868229728319477, "grad_norm": 2.6746068000793457, "learning_rate": 0.00019984061484199463, "loss": 2.9767, "step": 139 }, { "epoch": 0.03896058719170696, "grad_norm": 2.557413101196289, "learning_rate": 0.00019983813484585153, "loss": 3.1368, "step": 140 }, { "epoch": 0.03923887710021915, "grad_norm": 2.649019479751587, "learning_rate": 0.00019983563571991387, "loss": 3.2892, "step": 141 }, { "epoch": 0.03951716700873135, "grad_norm": 2.776559591293335, "learning_rate": 0.0001998331174646606, "loss": 3.0966, "step": 142 }, { "epoch": 0.03979545691724354, "grad_norm": 2.563878059387207, "learning_rate": 0.00019983058008057417, "loss": 3.0719, "step": 143 }, { "epoch": 0.04007374682575573, "grad_norm": 2.597567319869995, "learning_rate": 0.00019982802356814078, "loss": 3.1182, "step": 144 }, { "epoch": 0.040352036734267925, "grad_norm": 2.1810426712036133, "learning_rate": 0.00019982544792785024, "loss": 2.8473, "step": 145 }, { "epoch": 0.040630326642780115, "grad_norm": 2.612783670425415, "learning_rate": 0.00019982285316019613, "loss": 3.2294, "step": 146 }, { "epoch": 0.04090861655129231, "grad_norm": 2.298963785171509, "learning_rate": 0.00019982023926567563, "loss": 3.0684, "step": 147 }, { "epoch": 0.0411869064598045, "grad_norm": 3.169950246810913, "learning_rate": 0.0001998176062447895, "loss": 3.1138, "step": 148 }, { "epoch": 0.04146519636831669, "grad_norm": 2.783531665802002, "learning_rate": 0.00019981495409804233, "loss": 3.3083, "step": 149 }, { "epoch": 0.04174348627682889, "grad_norm": 2.1829240322113037, "learning_rate": 0.00019981228282594228, "loss": 3.0633, "step": 150 }, { "epoch": 0.04202177618534108, "grad_norm": 2.9619195461273193, "learning_rate": 0.00019980959242900116, "loss": 3.2574, "step": 151 }, { "epoch": 0.04230006609385327, "grad_norm": 2.8050758838653564, "learning_rate": 0.0001998068829077345, "loss": 3.2913, "step": 152 }, { "epoch": 0.04257835600236547, "grad_norm": 2.417833089828491, "learning_rate": 0.00019980415426266141, "loss": 3.2415, "step": 153 }, { "epoch": 0.04285664591087766, "grad_norm": 2.167005777359009, "learning_rate": 0.0001998014064943048, "loss": 3.0312, "step": 154 }, { "epoch": 0.04313493581938985, "grad_norm": 2.4490816593170166, "learning_rate": 0.0001997986396031911, "loss": 2.9634, "step": 155 }, { "epoch": 0.043413225727902044, "grad_norm": 2.7819173336029053, "learning_rate": 0.0001997958535898505, "loss": 3.4581, "step": 156 }, { "epoch": 0.043691515636414234, "grad_norm": 2.269476890563965, "learning_rate": 0.00019979304845481682, "loss": 3.1743, "step": 157 }, { "epoch": 0.043969805544926424, "grad_norm": 2.26863169670105, "learning_rate": 0.00019979022419862753, "loss": 3.05, "step": 158 }, { "epoch": 0.04424809545343862, "grad_norm": 2.423788547515869, "learning_rate": 0.0001997873808218238, "loss": 3.1894, "step": 159 }, { "epoch": 0.04452638536195081, "grad_norm": 3.6697404384613037, "learning_rate": 0.00019978451832495045, "loss": 3.4141, "step": 160 }, { "epoch": 0.04480467527046301, "grad_norm": 2.6646714210510254, "learning_rate": 0.00019978163670855592, "loss": 3.1432, "step": 161 }, { "epoch": 0.0450829651789752, "grad_norm": 2.732215404510498, "learning_rate": 0.0001997787359731924, "loss": 3.166, "step": 162 }, { "epoch": 0.04536125508748739, "grad_norm": 2.673656940460205, "learning_rate": 0.00019977581611941564, "loss": 3.1938, "step": 163 }, { "epoch": 0.045639544995999586, "grad_norm": 2.292940378189087, "learning_rate": 0.00019977287714778518, "loss": 3.126, "step": 164 }, { "epoch": 0.045917834904511776, "grad_norm": 2.3284780979156494, "learning_rate": 0.0001997699190588641, "loss": 3.285, "step": 165 }, { "epoch": 0.046196124813023966, "grad_norm": 2.654193878173828, "learning_rate": 0.00019976694185321918, "loss": 2.8519, "step": 166 }, { "epoch": 0.04647441472153616, "grad_norm": 2.5698466300964355, "learning_rate": 0.0001997639455314209, "loss": 3.3963, "step": 167 }, { "epoch": 0.04675270463004835, "grad_norm": 2.5178191661834717, "learning_rate": 0.0001997609300940434, "loss": 2.9886, "step": 168 }, { "epoch": 0.04703099453856054, "grad_norm": 2.562946081161499, "learning_rate": 0.00019975789554166443, "loss": 3.2732, "step": 169 }, { "epoch": 0.04730928444707274, "grad_norm": 2.9027211666107178, "learning_rate": 0.00019975484187486543, "loss": 3.3352, "step": 170 }, { "epoch": 0.04758757435558493, "grad_norm": 2.348240852355957, "learning_rate": 0.00019975176909423154, "loss": 3.1408, "step": 171 }, { "epoch": 0.04786586426409712, "grad_norm": 2.4577667713165283, "learning_rate": 0.00019974867720035154, "loss": 3.0454, "step": 172 }, { "epoch": 0.04814415417260932, "grad_norm": 2.442345380783081, "learning_rate": 0.00019974556619381778, "loss": 3.3547, "step": 173 }, { "epoch": 0.04842244408112151, "grad_norm": 2.2678701877593994, "learning_rate": 0.00019974243607522642, "loss": 3.2344, "step": 174 }, { "epoch": 0.048700733989633704, "grad_norm": 3.064833641052246, "learning_rate": 0.00019973928684517724, "loss": 3.1236, "step": 175 }, { "epoch": 0.048979023898145894, "grad_norm": 2.5488476753234863, "learning_rate": 0.00019973611850427361, "loss": 3.1378, "step": 176 }, { "epoch": 0.049257313806658085, "grad_norm": 2.449816942214966, "learning_rate": 0.00019973293105312258, "loss": 3.099, "step": 177 }, { "epoch": 0.04953560371517028, "grad_norm": 2.777306079864502, "learning_rate": 0.000199729724492335, "loss": 3.3206, "step": 178 }, { "epoch": 0.04981389362368247, "grad_norm": 2.5056986808776855, "learning_rate": 0.00019972649882252517, "loss": 3.0, "step": 179 }, { "epoch": 0.05009218353219466, "grad_norm": 2.486391305923462, "learning_rate": 0.0001997232540443112, "loss": 3.2105, "step": 180 }, { "epoch": 0.05037047344070686, "grad_norm": 2.8079428672790527, "learning_rate": 0.0001997199901583148, "loss": 3.1463, "step": 181 }, { "epoch": 0.05064876334921905, "grad_norm": 2.251539468765259, "learning_rate": 0.00019971670716516135, "loss": 2.7869, "step": 182 }, { "epoch": 0.05092705325773124, "grad_norm": 2.564356565475464, "learning_rate": 0.00019971340506547993, "loss": 2.8953, "step": 183 }, { "epoch": 0.051205343166243436, "grad_norm": 2.2237839698791504, "learning_rate": 0.00019971008385990323, "loss": 3.1079, "step": 184 }, { "epoch": 0.051483633074755626, "grad_norm": 2.5412509441375732, "learning_rate": 0.00019970674354906763, "loss": 3.1197, "step": 185 }, { "epoch": 0.051761922983267816, "grad_norm": 2.5886483192443848, "learning_rate": 0.00019970338413361312, "loss": 3.2883, "step": 186 }, { "epoch": 0.05204021289178001, "grad_norm": 2.5604493618011475, "learning_rate": 0.00019970000561418346, "loss": 3.0094, "step": 187 }, { "epoch": 0.0523185028002922, "grad_norm": 2.410118341445923, "learning_rate": 0.00019969660799142594, "loss": 3.2327, "step": 188 }, { "epoch": 0.0525967927088044, "grad_norm": 2.61869478225708, "learning_rate": 0.00019969319126599162, "loss": 3.5249, "step": 189 }, { "epoch": 0.05287508261731659, "grad_norm": 3.0196402072906494, "learning_rate": 0.00019968975543853512, "loss": 3.6002, "step": 190 }, { "epoch": 0.05315337252582878, "grad_norm": 2.399961471557617, "learning_rate": 0.00019968630050971482, "loss": 3.0321, "step": 191 }, { "epoch": 0.05343166243434098, "grad_norm": 2.5176491737365723, "learning_rate": 0.0001996828264801927, "loss": 3.2924, "step": 192 }, { "epoch": 0.05370995234285317, "grad_norm": 2.842106342315674, "learning_rate": 0.0001996793333506344, "loss": 3.2072, "step": 193 }, { "epoch": 0.05398824225136536, "grad_norm": 2.4583053588867188, "learning_rate": 0.00019967582112170922, "loss": 3.2504, "step": 194 }, { "epoch": 0.054266532159877555, "grad_norm": 2.680354595184326, "learning_rate": 0.00019967228979409014, "loss": 3.4597, "step": 195 }, { "epoch": 0.054544822068389745, "grad_norm": 2.476909637451172, "learning_rate": 0.00019966873936845382, "loss": 3.3429, "step": 196 }, { "epoch": 0.054823111976901935, "grad_norm": 2.686929225921631, "learning_rate": 0.00019966516984548053, "loss": 3.2888, "step": 197 }, { "epoch": 0.05510140188541413, "grad_norm": 2.863997459411621, "learning_rate": 0.00019966158122585421, "loss": 3.1925, "step": 198 }, { "epoch": 0.05537969179392632, "grad_norm": 2.2689263820648193, "learning_rate": 0.00019965797351026248, "loss": 2.9514, "step": 199 }, { "epoch": 0.05565798170243851, "grad_norm": 2.4391138553619385, "learning_rate": 0.00019965434669939663, "loss": 3.3221, "step": 200 }, { "epoch": 0.05565798170243851, "eval_loss": 3.1457667350769043, "eval_runtime": 84.3133, "eval_samples_per_second": 59.303, "eval_steps_per_second": 14.826, "step": 200 }, { "epoch": 0.05593627161095071, "grad_norm": 2.416696548461914, "learning_rate": 0.00019965070079395152, "loss": 3.0738, "step": 201 }, { "epoch": 0.0562145615194629, "grad_norm": 2.7304296493530273, "learning_rate": 0.00019964703579462582, "loss": 3.265, "step": 202 }, { "epoch": 0.056492851427975097, "grad_norm": 2.84639310836792, "learning_rate": 0.0001996433517021217, "loss": 3.1511, "step": 203 }, { "epoch": 0.05677114133648729, "grad_norm": 3.1984355449676514, "learning_rate": 0.0001996396485171451, "loss": 3.2948, "step": 204 }, { "epoch": 0.05704943124499948, "grad_norm": 2.538571834564209, "learning_rate": 0.00019963592624040556, "loss": 3.403, "step": 205 }, { "epoch": 0.057327721153511674, "grad_norm": 2.2732908725738525, "learning_rate": 0.00019963218487261634, "loss": 3.3002, "step": 206 }, { "epoch": 0.057606011062023864, "grad_norm": 2.5193586349487305, "learning_rate": 0.00019962842441449428, "loss": 3.2354, "step": 207 }, { "epoch": 0.057884300970536054, "grad_norm": 2.6292974948883057, "learning_rate": 0.0001996246448667599, "loss": 2.9179, "step": 208 }, { "epoch": 0.05816259087904825, "grad_norm": 2.610001802444458, "learning_rate": 0.00019962084623013744, "loss": 2.8539, "step": 209 }, { "epoch": 0.05844088078756044, "grad_norm": 2.0262327194213867, "learning_rate": 0.00019961702850535468, "loss": 2.7812, "step": 210 }, { "epoch": 0.05871917069607263, "grad_norm": 2.819694757461548, "learning_rate": 0.00019961319169314323, "loss": 3.2468, "step": 211 }, { "epoch": 0.05899746060458483, "grad_norm": 2.4672889709472656, "learning_rate": 0.00019960933579423815, "loss": 3.2287, "step": 212 }, { "epoch": 0.05927575051309702, "grad_norm": 2.5554051399230957, "learning_rate": 0.0001996054608093783, "loss": 3.0795, "step": 213 }, { "epoch": 0.05955404042160921, "grad_norm": 2.879462718963623, "learning_rate": 0.00019960156673930622, "loss": 3.4255, "step": 214 }, { "epoch": 0.059832330330121405, "grad_norm": 2.362056016921997, "learning_rate": 0.00019959765358476794, "loss": 3.1245, "step": 215 }, { "epoch": 0.060110620238633596, "grad_norm": 2.623589277267456, "learning_rate": 0.0001995937213465133, "loss": 3.2193, "step": 216 }, { "epoch": 0.06038891014714579, "grad_norm": 1.9338349103927612, "learning_rate": 0.00019958977002529577, "loss": 2.8042, "step": 217 }, { "epoch": 0.06066720005565798, "grad_norm": 2.3193845748901367, "learning_rate": 0.00019958579962187244, "loss": 3.0788, "step": 218 }, { "epoch": 0.06094548996417017, "grad_norm": 2.372978925704956, "learning_rate": 0.00019958181013700403, "loss": 2.9918, "step": 219 }, { "epoch": 0.06122377987268237, "grad_norm": 2.568800449371338, "learning_rate": 0.00019957780157145503, "loss": 3.2468, "step": 220 }, { "epoch": 0.06150206978119456, "grad_norm": 1.9775584936141968, "learning_rate": 0.00019957377392599348, "loss": 2.7487, "step": 221 }, { "epoch": 0.06178035968970675, "grad_norm": 2.5512070655822754, "learning_rate": 0.0001995697272013911, "loss": 2.7688, "step": 222 }, { "epoch": 0.06205864959821895, "grad_norm": 2.209963798522949, "learning_rate": 0.00019956566139842324, "loss": 3.154, "step": 223 }, { "epoch": 0.06233693950673114, "grad_norm": 2.8553903102874756, "learning_rate": 0.00019956157651786902, "loss": 3.3546, "step": 224 }, { "epoch": 0.06261522941524333, "grad_norm": 3.364727258682251, "learning_rate": 0.00019955747256051112, "loss": 3.0483, "step": 225 }, { "epoch": 0.06289351932375552, "grad_norm": 3.059723138809204, "learning_rate": 0.00019955334952713582, "loss": 3.1438, "step": 226 }, { "epoch": 0.06317180923226771, "grad_norm": 2.7689554691314697, "learning_rate": 0.00019954920741853323, "loss": 3.2509, "step": 227 }, { "epoch": 0.06345009914077991, "grad_norm": 2.462669610977173, "learning_rate": 0.00019954504623549696, "loss": 2.8673, "step": 228 }, { "epoch": 0.0637283890492921, "grad_norm": 2.82407546043396, "learning_rate": 0.00019954086597882427, "loss": 3.3771, "step": 229 }, { "epoch": 0.06400667895780429, "grad_norm": 2.258277416229248, "learning_rate": 0.00019953666664931623, "loss": 3.1994, "step": 230 }, { "epoch": 0.06428496886631649, "grad_norm": 2.442780017852783, "learning_rate": 0.0001995324482477774, "loss": 2.9306, "step": 231 }, { "epoch": 0.06456325877482867, "grad_norm": 2.246717929840088, "learning_rate": 0.0001995282107750161, "loss": 2.9427, "step": 232 }, { "epoch": 0.06484154868334087, "grad_norm": 2.0566117763519287, "learning_rate": 0.0001995239542318442, "loss": 2.9828, "step": 233 }, { "epoch": 0.06511983859185307, "grad_norm": 2.608257532119751, "learning_rate": 0.00019951967861907738, "loss": 3.2362, "step": 234 }, { "epoch": 0.06539812850036525, "grad_norm": 3.059354543685913, "learning_rate": 0.0001995153839375348, "loss": 3.2081, "step": 235 }, { "epoch": 0.06567641840887745, "grad_norm": 2.256696939468384, "learning_rate": 0.00019951107018803944, "loss": 2.9373, "step": 236 }, { "epoch": 0.06595470831738964, "grad_norm": 2.9037654399871826, "learning_rate": 0.0001995067373714178, "loss": 3.1212, "step": 237 }, { "epoch": 0.06623299822590184, "grad_norm": 2.6014556884765625, "learning_rate": 0.00019950238548850004, "loss": 3.1306, "step": 238 }, { "epoch": 0.06651128813441402, "grad_norm": 2.1499381065368652, "learning_rate": 0.0001994980145401201, "loss": 3.1654, "step": 239 }, { "epoch": 0.06678957804292622, "grad_norm": 2.589449882507324, "learning_rate": 0.00019949362452711543, "loss": 3.1417, "step": 240 }, { "epoch": 0.06706786795143842, "grad_norm": 3.630931854248047, "learning_rate": 0.0001994892154503272, "loss": 3.12, "step": 241 }, { "epoch": 0.0673461578599506, "grad_norm": 2.863436222076416, "learning_rate": 0.00019948478731060022, "loss": 3.2909, "step": 242 }, { "epoch": 0.0676244477684628, "grad_norm": 3.0917303562164307, "learning_rate": 0.000199480340108783, "loss": 3.3036, "step": 243 }, { "epoch": 0.067902737676975, "grad_norm": 2.857853651046753, "learning_rate": 0.00019947587384572764, "loss": 3.3592, "step": 244 }, { "epoch": 0.06818102758548718, "grad_norm": 3.0293962955474854, "learning_rate": 0.00019947138852228992, "loss": 3.2727, "step": 245 }, { "epoch": 0.06845931749399937, "grad_norm": 2.4538116455078125, "learning_rate": 0.0001994668841393292, "loss": 2.9585, "step": 246 }, { "epoch": 0.06873760740251157, "grad_norm": 2.7676730155944824, "learning_rate": 0.00019946236069770862, "loss": 3.2699, "step": 247 }, { "epoch": 0.06901589731102376, "grad_norm": 2.2552454471588135, "learning_rate": 0.0001994578181982949, "loss": 3.0268, "step": 248 }, { "epoch": 0.06929418721953595, "grad_norm": 2.6883902549743652, "learning_rate": 0.0001994532566419584, "loss": 3.5246, "step": 249 }, { "epoch": 0.06957247712804815, "grad_norm": 2.4788832664489746, "learning_rate": 0.0001994486760295732, "loss": 3.0449, "step": 250 }, { "epoch": 0.06985076703656033, "grad_norm": 2.687880277633667, "learning_rate": 0.00019944407636201693, "loss": 3.21, "step": 251 }, { "epoch": 0.07012905694507253, "grad_norm": 2.528339147567749, "learning_rate": 0.00019943945764017094, "loss": 3.0269, "step": 252 }, { "epoch": 0.07040734685358473, "grad_norm": 2.1724002361297607, "learning_rate": 0.00019943481986492022, "loss": 3.1538, "step": 253 }, { "epoch": 0.07068563676209691, "grad_norm": 2.7069270610809326, "learning_rate": 0.00019943016303715336, "loss": 3.1634, "step": 254 }, { "epoch": 0.0709639266706091, "grad_norm": 2.4778125286102295, "learning_rate": 0.00019942548715776273, "loss": 3.0475, "step": 255 }, { "epoch": 0.0712422165791213, "grad_norm": 2.0639779567718506, "learning_rate": 0.00019942079222764418, "loss": 3.1992, "step": 256 }, { "epoch": 0.07152050648763349, "grad_norm": 2.7047886848449707, "learning_rate": 0.0001994160782476974, "loss": 3.1037, "step": 257 }, { "epoch": 0.07179879639614568, "grad_norm": 2.497145891189575, "learning_rate": 0.00019941134521882548, "loss": 2.7647, "step": 258 }, { "epoch": 0.07207708630465788, "grad_norm": 2.5394625663757324, "learning_rate": 0.00019940659314193545, "loss": 2.9826, "step": 259 }, { "epoch": 0.07235537621317006, "grad_norm": 2.2167916297912598, "learning_rate": 0.0001994018220179378, "loss": 3.1098, "step": 260 }, { "epoch": 0.07263366612168226, "grad_norm": 2.145617961883545, "learning_rate": 0.00019939703184774666, "loss": 3.0937, "step": 261 }, { "epoch": 0.07291195603019446, "grad_norm": 2.4528911113739014, "learning_rate": 0.00019939222263227997, "loss": 2.8268, "step": 262 }, { "epoch": 0.07319024593870664, "grad_norm": 2.241543769836426, "learning_rate": 0.0001993873943724591, "loss": 2.8702, "step": 263 }, { "epoch": 0.07346853584721884, "grad_norm": 2.651615858078003, "learning_rate": 0.00019938254706920928, "loss": 3.1603, "step": 264 }, { "epoch": 0.07374682575573104, "grad_norm": 2.43216872215271, "learning_rate": 0.00019937768072345922, "loss": 2.9518, "step": 265 }, { "epoch": 0.07402511566424323, "grad_norm": 2.267383098602295, "learning_rate": 0.00019937279533614136, "loss": 3.0859, "step": 266 }, { "epoch": 0.07430340557275542, "grad_norm": 2.9553921222686768, "learning_rate": 0.00019936789090819183, "loss": 2.942, "step": 267 }, { "epoch": 0.07458169548126761, "grad_norm": 2.5467541217803955, "learning_rate": 0.00019936296744055035, "loss": 3.1312, "step": 268 }, { "epoch": 0.07485998538977981, "grad_norm": 2.402892827987671, "learning_rate": 0.00019935802493416022, "loss": 3.2853, "step": 269 }, { "epoch": 0.07513827529829199, "grad_norm": 2.4091978073120117, "learning_rate": 0.00019935306338996855, "loss": 3.1141, "step": 270 }, { "epoch": 0.07541656520680419, "grad_norm": 2.554346799850464, "learning_rate": 0.00019934808280892602, "loss": 3.0631, "step": 271 }, { "epoch": 0.07569485511531639, "grad_norm": 2.4397192001342773, "learning_rate": 0.00019934308319198687, "loss": 2.7703, "step": 272 }, { "epoch": 0.07597314502382857, "grad_norm": 2.2375779151916504, "learning_rate": 0.0001993380645401091, "loss": 3.0097, "step": 273 }, { "epoch": 0.07625143493234077, "grad_norm": 2.5792229175567627, "learning_rate": 0.00019933302685425436, "loss": 2.931, "step": 274 }, { "epoch": 0.07652972484085296, "grad_norm": 2.217116355895996, "learning_rate": 0.00019932797013538786, "loss": 2.8427, "step": 275 }, { "epoch": 0.07680801474936515, "grad_norm": 2.570390462875366, "learning_rate": 0.00019932289438447852, "loss": 3.0201, "step": 276 }, { "epoch": 0.07708630465787734, "grad_norm": 2.482693672180176, "learning_rate": 0.00019931779960249895, "loss": 3.0033, "step": 277 }, { "epoch": 0.07736459456638954, "grad_norm": 2.934562921524048, "learning_rate": 0.0001993126857904253, "loss": 3.3006, "step": 278 }, { "epoch": 0.07764288447490172, "grad_norm": 2.3979170322418213, "learning_rate": 0.00019930755294923742, "loss": 2.9612, "step": 279 }, { "epoch": 0.07792117438341392, "grad_norm": 2.08113431930542, "learning_rate": 0.00019930240107991879, "loss": 3.1123, "step": 280 }, { "epoch": 0.07819946429192612, "grad_norm": 2.5386717319488525, "learning_rate": 0.00019929723018345658, "loss": 3.2004, "step": 281 }, { "epoch": 0.0784777542004383, "grad_norm": 2.6750895977020264, "learning_rate": 0.0001992920402608416, "loss": 3.1406, "step": 282 }, { "epoch": 0.0787560441089505, "grad_norm": 2.848982334136963, "learning_rate": 0.0001992868313130682, "loss": 3.0589, "step": 283 }, { "epoch": 0.0790343340174627, "grad_norm": 2.4624407291412354, "learning_rate": 0.00019928160334113453, "loss": 3.1288, "step": 284 }, { "epoch": 0.07931262392597488, "grad_norm": 1.9524894952774048, "learning_rate": 0.0001992763563460423, "loss": 3.0679, "step": 285 }, { "epoch": 0.07959091383448708, "grad_norm": 2.863542318344116, "learning_rate": 0.00019927109032879682, "loss": 3.1985, "step": 286 }, { "epoch": 0.07986920374299927, "grad_norm": 2.142577886581421, "learning_rate": 0.0001992658052904072, "loss": 3.1598, "step": 287 }, { "epoch": 0.08014749365151146, "grad_norm": 1.9506819248199463, "learning_rate": 0.000199260501231886, "loss": 2.9892, "step": 288 }, { "epoch": 0.08042578356002365, "grad_norm": 2.029646158218384, "learning_rate": 0.00019925517815424953, "loss": 3.1497, "step": 289 }, { "epoch": 0.08070407346853585, "grad_norm": 2.071078062057495, "learning_rate": 0.0001992498360585178, "loss": 3.0307, "step": 290 }, { "epoch": 0.08098236337704803, "grad_norm": 2.042703628540039, "learning_rate": 0.00019924447494571438, "loss": 2.9602, "step": 291 }, { "epoch": 0.08126065328556023, "grad_norm": 2.004037380218506, "learning_rate": 0.0001992390948168665, "loss": 2.8187, "step": 292 }, { "epoch": 0.08153894319407243, "grad_norm": 3.014195442199707, "learning_rate": 0.00019923369567300498, "loss": 3.1527, "step": 293 }, { "epoch": 0.08181723310258462, "grad_norm": 2.090179204940796, "learning_rate": 0.00019922827751516438, "loss": 2.9583, "step": 294 }, { "epoch": 0.08209552301109681, "grad_norm": 2.5031628608703613, "learning_rate": 0.0001992228403443829, "loss": 3.2267, "step": 295 }, { "epoch": 0.082373812919609, "grad_norm": 2.428762912750244, "learning_rate": 0.00019921738416170227, "loss": 2.9677, "step": 296 }, { "epoch": 0.0826521028281212, "grad_norm": 2.581678628921509, "learning_rate": 0.00019921190896816801, "loss": 3.086, "step": 297 }, { "epoch": 0.08293039273663338, "grad_norm": 2.3357739448547363, "learning_rate": 0.00019920641476482917, "loss": 3.2235, "step": 298 }, { "epoch": 0.08320868264514558, "grad_norm": 2.3630545139312744, "learning_rate": 0.00019920090155273851, "loss": 3.0333, "step": 299 }, { "epoch": 0.08348697255365778, "grad_norm": 2.340430736541748, "learning_rate": 0.00019919536933295237, "loss": 3.0076, "step": 300 }, { "epoch": 0.08348697255365778, "eval_loss": 3.1012399196624756, "eval_runtime": 84.6843, "eval_samples_per_second": 59.043, "eval_steps_per_second": 14.761, "step": 300 }, { "epoch": 0.08376526246216996, "grad_norm": 2.3581724166870117, "learning_rate": 0.00019918981810653082, "loss": 3.0328, "step": 301 }, { "epoch": 0.08404355237068216, "grad_norm": 2.453171491622925, "learning_rate": 0.00019918424787453747, "loss": 2.9242, "step": 302 }, { "epoch": 0.08432184227919436, "grad_norm": 2.496574878692627, "learning_rate": 0.0001991786586380396, "loss": 3.0577, "step": 303 }, { "epoch": 0.08460013218770654, "grad_norm": 2.7812600135803223, "learning_rate": 0.00019917305039810824, "loss": 3.1012, "step": 304 }, { "epoch": 0.08487842209621874, "grad_norm": 2.7636566162109375, "learning_rate": 0.00019916742315581793, "loss": 3.1331, "step": 305 }, { "epoch": 0.08515671200473093, "grad_norm": 2.141082525253296, "learning_rate": 0.00019916177691224688, "loss": 3.0042, "step": 306 }, { "epoch": 0.08543500191324312, "grad_norm": 2.1939048767089844, "learning_rate": 0.00019915611166847698, "loss": 2.9426, "step": 307 }, { "epoch": 0.08571329182175531, "grad_norm": 2.4433774948120117, "learning_rate": 0.00019915042742559371, "loss": 2.8669, "step": 308 }, { "epoch": 0.08599158173026751, "grad_norm": 2.447218656539917, "learning_rate": 0.00019914472418468626, "loss": 3.0552, "step": 309 }, { "epoch": 0.0862698716387797, "grad_norm": 2.404257297515869, "learning_rate": 0.00019913900194684737, "loss": 2.9688, "step": 310 }, { "epoch": 0.08654816154729189, "grad_norm": 2.3225021362304688, "learning_rate": 0.00019913326071317345, "loss": 2.8411, "step": 311 }, { "epoch": 0.08682645145580409, "grad_norm": 2.2253050804138184, "learning_rate": 0.00019912750048476467, "loss": 3.0543, "step": 312 }, { "epoch": 0.08710474136431627, "grad_norm": 2.623739242553711, "learning_rate": 0.00019912172126272464, "loss": 3.1256, "step": 313 }, { "epoch": 0.08738303127282847, "grad_norm": 2.3449671268463135, "learning_rate": 0.00019911592304816073, "loss": 2.7866, "step": 314 }, { "epoch": 0.08766132118134067, "grad_norm": 3.2625606060028076, "learning_rate": 0.00019911010584218393, "loss": 3.2266, "step": 315 }, { "epoch": 0.08793961108985285, "grad_norm": 2.2077088356018066, "learning_rate": 0.00019910426964590884, "loss": 2.9329, "step": 316 }, { "epoch": 0.08821790099836505, "grad_norm": 2.1451663970947266, "learning_rate": 0.0001990984144604538, "loss": 2.7007, "step": 317 }, { "epoch": 0.08849619090687724, "grad_norm": 2.403789758682251, "learning_rate": 0.00019909254028694062, "loss": 3.2651, "step": 318 }, { "epoch": 0.08877448081538943, "grad_norm": 2.42313289642334, "learning_rate": 0.0001990866471264949, "loss": 3.0436, "step": 319 }, { "epoch": 0.08905277072390162, "grad_norm": 2.432422161102295, "learning_rate": 0.00019908073498024577, "loss": 2.9449, "step": 320 }, { "epoch": 0.08933106063241382, "grad_norm": 2.4560887813568115, "learning_rate": 0.0001990748038493261, "loss": 3.004, "step": 321 }, { "epoch": 0.08960935054092602, "grad_norm": 2.8377909660339355, "learning_rate": 0.0001990688537348723, "loss": 3.2206, "step": 322 }, { "epoch": 0.0898876404494382, "grad_norm": 2.5766313076019287, "learning_rate": 0.00019906288463802444, "loss": 3.3782, "step": 323 }, { "epoch": 0.0901659303579504, "grad_norm": 2.5259029865264893, "learning_rate": 0.0001990568965599263, "loss": 3.276, "step": 324 }, { "epoch": 0.0904442202664626, "grad_norm": 2.117607831954956, "learning_rate": 0.00019905088950172525, "loss": 2.9149, "step": 325 }, { "epoch": 0.09072251017497478, "grad_norm": 2.6952085494995117, "learning_rate": 0.0001990448634645722, "loss": 2.9134, "step": 326 }, { "epoch": 0.09100080008348697, "grad_norm": 2.1106913089752197, "learning_rate": 0.00019903881844962192, "loss": 2.9102, "step": 327 }, { "epoch": 0.09127908999199917, "grad_norm": 2.335932731628418, "learning_rate": 0.0001990327544580326, "loss": 3.3176, "step": 328 }, { "epoch": 0.09155737990051135, "grad_norm": 2.4216434955596924, "learning_rate": 0.00019902667149096614, "loss": 2.9336, "step": 329 }, { "epoch": 0.09183566980902355, "grad_norm": 2.345383882522583, "learning_rate": 0.0001990205695495881, "loss": 3.0631, "step": 330 }, { "epoch": 0.09211395971753575, "grad_norm": 2.338063955307007, "learning_rate": 0.0001990144486350677, "loss": 3.0655, "step": 331 }, { "epoch": 0.09239224962604793, "grad_norm": 2.448991298675537, "learning_rate": 0.00019900830874857773, "loss": 2.9162, "step": 332 }, { "epoch": 0.09267053953456013, "grad_norm": 2.1384122371673584, "learning_rate": 0.00019900214989129462, "loss": 3.0811, "step": 333 }, { "epoch": 0.09294882944307233, "grad_norm": 2.301331043243408, "learning_rate": 0.0001989959720643985, "loss": 2.9866, "step": 334 }, { "epoch": 0.09322711935158451, "grad_norm": 2.189103126525879, "learning_rate": 0.0001989897752690731, "loss": 3.0582, "step": 335 }, { "epoch": 0.0935054092600967, "grad_norm": 2.8342630863189697, "learning_rate": 0.00019898355950650568, "loss": 3.0671, "step": 336 }, { "epoch": 0.0937836991686089, "grad_norm": 2.1547491550445557, "learning_rate": 0.0001989773247778873, "loss": 2.9594, "step": 337 }, { "epoch": 0.09406198907712109, "grad_norm": 2.4930500984191895, "learning_rate": 0.00019897107108441262, "loss": 2.9101, "step": 338 }, { "epoch": 0.09434027898563328, "grad_norm": 2.285538673400879, "learning_rate": 0.00019896479842727982, "loss": 2.8896, "step": 339 }, { "epoch": 0.09461856889414548, "grad_norm": 2.716468334197998, "learning_rate": 0.00019895850680769087, "loss": 3.2421, "step": 340 }, { "epoch": 0.09489685880265766, "grad_norm": 2.3773834705352783, "learning_rate": 0.00019895219622685124, "loss": 3.2088, "step": 341 }, { "epoch": 0.09517514871116986, "grad_norm": 2.2655930519104004, "learning_rate": 0.00019894586668597008, "loss": 3.0896, "step": 342 }, { "epoch": 0.09545343861968206, "grad_norm": 2.474036931991577, "learning_rate": 0.00019893951818626023, "loss": 3.1729, "step": 343 }, { "epoch": 0.09573172852819424, "grad_norm": 2.4246020317077637, "learning_rate": 0.0001989331507289381, "loss": 3.0023, "step": 344 }, { "epoch": 0.09601001843670644, "grad_norm": 2.7881805896759033, "learning_rate": 0.00019892676431522373, "loss": 3.2201, "step": 345 }, { "epoch": 0.09628830834521863, "grad_norm": 3.2350337505340576, "learning_rate": 0.00019892035894634078, "loss": 2.8602, "step": 346 }, { "epoch": 0.09656659825373082, "grad_norm": 2.478100299835205, "learning_rate": 0.00019891393462351665, "loss": 3.2415, "step": 347 }, { "epoch": 0.09684488816224301, "grad_norm": 2.5819339752197266, "learning_rate": 0.00019890749134798222, "loss": 3.0491, "step": 348 }, { "epoch": 0.09712317807075521, "grad_norm": 2.5929932594299316, "learning_rate": 0.0001989010291209721, "loss": 2.9707, "step": 349 }, { "epoch": 0.09740146797926741, "grad_norm": 2.348785400390625, "learning_rate": 0.00019889454794372454, "loss": 3.1738, "step": 350 }, { "epoch": 0.09767975788777959, "grad_norm": 3.2977755069732666, "learning_rate": 0.00019888804781748132, "loss": 3.2136, "step": 351 }, { "epoch": 0.09795804779629179, "grad_norm": 2.4276630878448486, "learning_rate": 0.00019888152874348794, "loss": 3.2304, "step": 352 }, { "epoch": 0.09823633770480399, "grad_norm": 2.42805552482605, "learning_rate": 0.0001988749907229935, "loss": 3.0534, "step": 353 }, { "epoch": 0.09851462761331617, "grad_norm": 2.639913320541382, "learning_rate": 0.0001988684337572508, "loss": 2.8857, "step": 354 }, { "epoch": 0.09879291752182837, "grad_norm": 2.4938008785247803, "learning_rate": 0.00019886185784751613, "loss": 2.7434, "step": 355 }, { "epoch": 0.09907120743034056, "grad_norm": 2.3958933353424072, "learning_rate": 0.00019885526299504955, "loss": 3.0847, "step": 356 }, { "epoch": 0.09934949733885275, "grad_norm": 2.2492778301239014, "learning_rate": 0.00019884864920111466, "loss": 2.9559, "step": 357 }, { "epoch": 0.09962778724736494, "grad_norm": 2.0358526706695557, "learning_rate": 0.00019884201646697868, "loss": 3.3034, "step": 358 }, { "epoch": 0.09990607715587714, "grad_norm": 2.1167819499969482, "learning_rate": 0.00019883536479391251, "loss": 2.9596, "step": 359 }, { "epoch": 0.10018436706438932, "grad_norm": 2.303647756576538, "learning_rate": 0.00019882869418319074, "loss": 3.2392, "step": 360 }, { "epoch": 0.10046265697290152, "grad_norm": 2.3578591346740723, "learning_rate": 0.0001988220046360914, "loss": 3.114, "step": 361 }, { "epoch": 0.10074094688141372, "grad_norm": 2.3685109615325928, "learning_rate": 0.00019881529615389636, "loss": 2.8554, "step": 362 }, { "epoch": 0.1010192367899259, "grad_norm": 2.0713369846343994, "learning_rate": 0.00019880856873789093, "loss": 2.8518, "step": 363 }, { "epoch": 0.1012975266984381, "grad_norm": 2.388382911682129, "learning_rate": 0.00019880182238936423, "loss": 2.958, "step": 364 }, { "epoch": 0.1015758166069503, "grad_norm": 2.4314351081848145, "learning_rate": 0.00019879505710960883, "loss": 3.081, "step": 365 }, { "epoch": 0.10185410651546248, "grad_norm": 2.228017568588257, "learning_rate": 0.00019878827289992107, "loss": 2.9916, "step": 366 }, { "epoch": 0.10213239642397468, "grad_norm": 2.3488426208496094, "learning_rate": 0.00019878146976160083, "loss": 3.1225, "step": 367 }, { "epoch": 0.10241068633248687, "grad_norm": 2.4583683013916016, "learning_rate": 0.0001987746476959517, "loss": 3.076, "step": 368 }, { "epoch": 0.10268897624099906, "grad_norm": 2.3816723823547363, "learning_rate": 0.00019876780670428074, "loss": 3.1938, "step": 369 }, { "epoch": 0.10296726614951125, "grad_norm": 2.396934986114502, "learning_rate": 0.00019876094678789881, "loss": 3.2038, "step": 370 }, { "epoch": 0.10324555605802345, "grad_norm": 2.055657148361206, "learning_rate": 0.00019875406794812034, "loss": 2.9685, "step": 371 }, { "epoch": 0.10352384596653563, "grad_norm": 2.6715643405914307, "learning_rate": 0.00019874717018626332, "loss": 3.2049, "step": 372 }, { "epoch": 0.10380213587504783, "grad_norm": 2.3720312118530273, "learning_rate": 0.00019874025350364945, "loss": 2.9651, "step": 373 }, { "epoch": 0.10408042578356003, "grad_norm": 2.55594539642334, "learning_rate": 0.00019873331790160403, "loss": 2.9883, "step": 374 }, { "epoch": 0.10435871569207221, "grad_norm": 2.075500726699829, "learning_rate": 0.00019872636338145596, "loss": 2.9476, "step": 375 }, { "epoch": 0.1046370056005844, "grad_norm": 2.4690656661987305, "learning_rate": 0.0001987193899445378, "loss": 2.9478, "step": 376 }, { "epoch": 0.1049152955090966, "grad_norm": 2.007357597351074, "learning_rate": 0.00019871239759218566, "loss": 3.0422, "step": 377 }, { "epoch": 0.1051935854176088, "grad_norm": 2.5830485820770264, "learning_rate": 0.00019870538632573936, "loss": 3.0064, "step": 378 }, { "epoch": 0.10547187532612098, "grad_norm": 2.1962311267852783, "learning_rate": 0.00019869835614654238, "loss": 2.8906, "step": 379 }, { "epoch": 0.10575016523463318, "grad_norm": 2.478692054748535, "learning_rate": 0.00019869130705594168, "loss": 3.114, "step": 380 }, { "epoch": 0.10602845514314538, "grad_norm": 2.0583858489990234, "learning_rate": 0.00019868423905528796, "loss": 2.986, "step": 381 }, { "epoch": 0.10630674505165756, "grad_norm": 2.303459644317627, "learning_rate": 0.0001986771521459355, "loss": 3.0922, "step": 382 }, { "epoch": 0.10658503496016976, "grad_norm": 2.456014394760132, "learning_rate": 0.00019867004632924217, "loss": 3.0063, "step": 383 }, { "epoch": 0.10686332486868196, "grad_norm": 2.478271722793579, "learning_rate": 0.00019866292160656957, "loss": 3.1578, "step": 384 }, { "epoch": 0.10714161477719414, "grad_norm": 2.075045347213745, "learning_rate": 0.00019865577797928283, "loss": 2.906, "step": 385 }, { "epoch": 0.10741990468570634, "grad_norm": 2.3233718872070312, "learning_rate": 0.00019864861544875072, "loss": 3.1283, "step": 386 }, { "epoch": 0.10769819459421853, "grad_norm": 2.0878591537475586, "learning_rate": 0.00019864143401634562, "loss": 2.7757, "step": 387 }, { "epoch": 0.10797648450273072, "grad_norm": 3.101155996322632, "learning_rate": 0.00019863423368344355, "loss": 3.1719, "step": 388 }, { "epoch": 0.10825477441124291, "grad_norm": 2.4252467155456543, "learning_rate": 0.0001986270144514242, "loss": 3.118, "step": 389 }, { "epoch": 0.10853306431975511, "grad_norm": 2.3604631423950195, "learning_rate": 0.0001986197763216708, "loss": 2.8754, "step": 390 }, { "epoch": 0.10881135422826729, "grad_norm": 2.2223122119903564, "learning_rate": 0.00019861251929557022, "loss": 2.9746, "step": 391 }, { "epoch": 0.10908964413677949, "grad_norm": 2.5862038135528564, "learning_rate": 0.000198605243374513, "loss": 3.2785, "step": 392 }, { "epoch": 0.10936793404529169, "grad_norm": 2.1648552417755127, "learning_rate": 0.00019859794855989327, "loss": 2.9542, "step": 393 }, { "epoch": 0.10964622395380387, "grad_norm": 2.4264414310455322, "learning_rate": 0.0001985906348531087, "loss": 3.3219, "step": 394 }, { "epoch": 0.10992451386231607, "grad_norm": 2.119896173477173, "learning_rate": 0.00019858330225556076, "loss": 3.0327, "step": 395 }, { "epoch": 0.11020280377082826, "grad_norm": 2.4197657108306885, "learning_rate": 0.00019857595076865433, "loss": 2.8218, "step": 396 }, { "epoch": 0.11048109367934045, "grad_norm": 2.218339204788208, "learning_rate": 0.00019856858039379814, "loss": 2.7461, "step": 397 }, { "epoch": 0.11075938358785264, "grad_norm": 2.2768938541412354, "learning_rate": 0.00019856119113240427, "loss": 3.3579, "step": 398 }, { "epoch": 0.11103767349636484, "grad_norm": 2.108539581298828, "learning_rate": 0.00019855378298588868, "loss": 2.8677, "step": 399 }, { "epoch": 0.11131596340487702, "grad_norm": 2.125035524368286, "learning_rate": 0.00019854635595567077, "loss": 2.865, "step": 400 }, { "epoch": 0.11131596340487702, "eval_loss": 3.0699453353881836, "eval_runtime": 84.8456, "eval_samples_per_second": 58.931, "eval_steps_per_second": 14.733, "step": 400 }, { "epoch": 0.11159425331338922, "grad_norm": 2.108405590057373, "learning_rate": 0.00019853891004317363, "loss": 2.9478, "step": 401 }, { "epoch": 0.11187254322190142, "grad_norm": 3.114546060562134, "learning_rate": 0.00019853144524982396, "loss": 3.1217, "step": 402 }, { "epoch": 0.1121508331304136, "grad_norm": 2.260957956314087, "learning_rate": 0.0001985239615770521, "loss": 3.0552, "step": 403 }, { "epoch": 0.1124291230389258, "grad_norm": 2.7920634746551514, "learning_rate": 0.00019851645902629192, "loss": 3.1219, "step": 404 }, { "epoch": 0.112707412947438, "grad_norm": 2.332014560699463, "learning_rate": 0.00019850893759898103, "loss": 2.9827, "step": 405 }, { "epoch": 0.11298570285595019, "grad_norm": 2.1218183040618896, "learning_rate": 0.00019850139729656058, "loss": 2.5333, "step": 406 }, { "epoch": 0.11326399276446238, "grad_norm": 2.3233604431152344, "learning_rate": 0.00019849383812047534, "loss": 3.0452, "step": 407 }, { "epoch": 0.11354228267297457, "grad_norm": 2.8609113693237305, "learning_rate": 0.0001984862600721737, "loss": 3.4326, "step": 408 }, { "epoch": 0.11382057258148677, "grad_norm": 2.6440603733062744, "learning_rate": 0.0001984786631531077, "loss": 3.2617, "step": 409 }, { "epoch": 0.11409886248999895, "grad_norm": 2.6440229415893555, "learning_rate": 0.000198471047364733, "loss": 3.1632, "step": 410 }, { "epoch": 0.11437715239851115, "grad_norm": 2.524158477783203, "learning_rate": 0.00019846341270850877, "loss": 3.3356, "step": 411 }, { "epoch": 0.11465544230702335, "grad_norm": 2.2843925952911377, "learning_rate": 0.0001984557591858979, "loss": 3.005, "step": 412 }, { "epoch": 0.11493373221553553, "grad_norm": 2.0623104572296143, "learning_rate": 0.0001984480867983669, "loss": 3.1248, "step": 413 }, { "epoch": 0.11521202212404773, "grad_norm": 2.1712939739227295, "learning_rate": 0.00019844039554738584, "loss": 3.0611, "step": 414 }, { "epoch": 0.11549031203255992, "grad_norm": 2.7139744758605957, "learning_rate": 0.0001984326854344284, "loss": 2.892, "step": 415 }, { "epoch": 0.11576860194107211, "grad_norm": 2.380901336669922, "learning_rate": 0.00019842495646097197, "loss": 2.9651, "step": 416 }, { "epoch": 0.1160468918495843, "grad_norm": 2.27986741065979, "learning_rate": 0.00019841720862849742, "loss": 3.2873, "step": 417 }, { "epoch": 0.1163251817580965, "grad_norm": 2.4040000438690186, "learning_rate": 0.0001984094419384893, "loss": 3.1707, "step": 418 }, { "epoch": 0.11660347166660869, "grad_norm": 2.3718647956848145, "learning_rate": 0.0001984016563924358, "loss": 3.0166, "step": 419 }, { "epoch": 0.11688176157512088, "grad_norm": 2.2892348766326904, "learning_rate": 0.0001983938519918287, "loss": 2.8065, "step": 420 }, { "epoch": 0.11716005148363308, "grad_norm": 2.1373353004455566, "learning_rate": 0.00019838602873816337, "loss": 2.9926, "step": 421 }, { "epoch": 0.11743834139214526, "grad_norm": 2.592775821685791, "learning_rate": 0.0001983781866329388, "loss": 2.9556, "step": 422 }, { "epoch": 0.11771663130065746, "grad_norm": 2.836141347885132, "learning_rate": 0.00019837032567765757, "loss": 3.2806, "step": 423 }, { "epoch": 0.11799492120916966, "grad_norm": 2.5971944332122803, "learning_rate": 0.00019836244587382597, "loss": 3.0687, "step": 424 }, { "epoch": 0.11827321111768184, "grad_norm": 2.38938307762146, "learning_rate": 0.00019835454722295378, "loss": 3.0625, "step": 425 }, { "epoch": 0.11855150102619404, "grad_norm": 2.546592950820923, "learning_rate": 0.0001983466297265545, "loss": 3.2388, "step": 426 }, { "epoch": 0.11882979093470623, "grad_norm": 2.466273069381714, "learning_rate": 0.00019833869338614513, "loss": 2.8286, "step": 427 }, { "epoch": 0.11910808084321842, "grad_norm": 2.380063772201538, "learning_rate": 0.0001983307382032464, "loss": 3.3092, "step": 428 }, { "epoch": 0.11938637075173061, "grad_norm": 1.7745685577392578, "learning_rate": 0.00019832276417938252, "loss": 2.8905, "step": 429 }, { "epoch": 0.11966466066024281, "grad_norm": 2.346071243286133, "learning_rate": 0.0001983147713160814, "loss": 2.8016, "step": 430 }, { "epoch": 0.119942950568755, "grad_norm": 2.176030397415161, "learning_rate": 0.00019830675961487456, "loss": 2.8559, "step": 431 }, { "epoch": 0.12022124047726719, "grad_norm": 2.2990527153015137, "learning_rate": 0.0001982987290772971, "loss": 2.9844, "step": 432 }, { "epoch": 0.12049953038577939, "grad_norm": 2.4893555641174316, "learning_rate": 0.0001982906797048877, "loss": 3.0962, "step": 433 }, { "epoch": 0.12077782029429159, "grad_norm": 2.116438627243042, "learning_rate": 0.00019828261149918875, "loss": 2.8139, "step": 434 }, { "epoch": 0.12105611020280377, "grad_norm": 2.221937417984009, "learning_rate": 0.00019827452446174613, "loss": 3.0746, "step": 435 }, { "epoch": 0.12133440011131597, "grad_norm": 2.5545432567596436, "learning_rate": 0.00019826641859410938, "loss": 2.803, "step": 436 }, { "epoch": 0.12161269001982816, "grad_norm": 2.348519802093506, "learning_rate": 0.00019825829389783167, "loss": 2.7334, "step": 437 }, { "epoch": 0.12189097992834035, "grad_norm": 1.8105679750442505, "learning_rate": 0.00019825015037446976, "loss": 3.0343, "step": 438 }, { "epoch": 0.12216926983685254, "grad_norm": 2.871101140975952, "learning_rate": 0.00019824198802558405, "loss": 3.3523, "step": 439 }, { "epoch": 0.12244755974536474, "grad_norm": 2.2958691120147705, "learning_rate": 0.00019823380685273843, "loss": 3.2355, "step": 440 }, { "epoch": 0.12272584965387692, "grad_norm": 2.4496185779571533, "learning_rate": 0.00019822560685750054, "loss": 3.0625, "step": 441 }, { "epoch": 0.12300413956238912, "grad_norm": 2.224569082260132, "learning_rate": 0.00019821738804144153, "loss": 2.8401, "step": 442 }, { "epoch": 0.12328242947090132, "grad_norm": 2.1573524475097656, "learning_rate": 0.00019820915040613622, "loss": 3.1235, "step": 443 }, { "epoch": 0.1235607193794135, "grad_norm": 2.0801284313201904, "learning_rate": 0.00019820089395316298, "loss": 3.1397, "step": 444 }, { "epoch": 0.1238390092879257, "grad_norm": 2.1467182636260986, "learning_rate": 0.00019819261868410384, "loss": 2.7248, "step": 445 }, { "epoch": 0.1241172991964379, "grad_norm": 2.412508726119995, "learning_rate": 0.00019818432460054443, "loss": 2.954, "step": 446 }, { "epoch": 0.12439558910495008, "grad_norm": 1.9736499786376953, "learning_rate": 0.0001981760117040739, "loss": 2.9987, "step": 447 }, { "epoch": 0.12467387901346227, "grad_norm": 2.810506820678711, "learning_rate": 0.00019816767999628507, "loss": 2.8143, "step": 448 }, { "epoch": 0.12495216892197447, "grad_norm": 2.2019197940826416, "learning_rate": 0.00019815932947877442, "loss": 3.0292, "step": 449 }, { "epoch": 0.12523045883048667, "grad_norm": 2.210148334503174, "learning_rate": 0.00019815096015314196, "loss": 3.1336, "step": 450 }, { "epoch": 0.12550874873899887, "grad_norm": 2.269942045211792, "learning_rate": 0.00019814257202099128, "loss": 2.7951, "step": 451 }, { "epoch": 0.12578703864751103, "grad_norm": 2.1060009002685547, "learning_rate": 0.0001981341650839297, "loss": 2.8537, "step": 452 }, { "epoch": 0.12606532855602323, "grad_norm": 2.4027867317199707, "learning_rate": 0.00019812573934356795, "loss": 3.2334, "step": 453 }, { "epoch": 0.12634361846453543, "grad_norm": 5.312845230102539, "learning_rate": 0.00019811729480152053, "loss": 3.0747, "step": 454 }, { "epoch": 0.12662190837304763, "grad_norm": 2.1850552558898926, "learning_rate": 0.00019810883145940546, "loss": 2.9734, "step": 455 }, { "epoch": 0.12690019828155982, "grad_norm": 2.4432599544525146, "learning_rate": 0.00019810034931884443, "loss": 3.0887, "step": 456 }, { "epoch": 0.12717848819007202, "grad_norm": 2.4416098594665527, "learning_rate": 0.00019809184838146264, "loss": 3.1262, "step": 457 }, { "epoch": 0.1274567780985842, "grad_norm": 2.303316831588745, "learning_rate": 0.00019808332864888894, "loss": 2.8147, "step": 458 }, { "epoch": 0.1277350680070964, "grad_norm": 2.1671509742736816, "learning_rate": 0.0001980747901227558, "loss": 2.9563, "step": 459 }, { "epoch": 0.12801335791560858, "grad_norm": 2.7765119075775146, "learning_rate": 0.0001980662328046993, "loss": 2.9807, "step": 460 }, { "epoch": 0.12829164782412078, "grad_norm": 2.0430989265441895, "learning_rate": 0.000198057656696359, "loss": 2.6735, "step": 461 }, { "epoch": 0.12856993773263298, "grad_norm": 2.1812233924865723, "learning_rate": 0.00019804906179937822, "loss": 2.8773, "step": 462 }, { "epoch": 0.12884822764114517, "grad_norm": 2.371373176574707, "learning_rate": 0.0001980404481154038, "loss": 2.73, "step": 463 }, { "epoch": 0.12912651754965734, "grad_norm": 1.9812898635864258, "learning_rate": 0.00019803181564608622, "loss": 2.895, "step": 464 }, { "epoch": 0.12940480745816954, "grad_norm": 2.2081284523010254, "learning_rate": 0.0001980231643930795, "loss": 2.858, "step": 465 }, { "epoch": 0.12968309736668174, "grad_norm": 2.6150717735290527, "learning_rate": 0.00019801449435804126, "loss": 3.3826, "step": 466 }, { "epoch": 0.12996138727519393, "grad_norm": 2.64520001411438, "learning_rate": 0.00019800580554263277, "loss": 3.0603, "step": 467 }, { "epoch": 0.13023967718370613, "grad_norm": 2.532212734222412, "learning_rate": 0.00019799709794851892, "loss": 3.0784, "step": 468 }, { "epoch": 0.13051796709221833, "grad_norm": 1.9552390575408936, "learning_rate": 0.0001979883715773681, "loss": 2.7624, "step": 469 }, { "epoch": 0.1307962570007305, "grad_norm": 2.5089361667633057, "learning_rate": 0.0001979796264308524, "loss": 3.0881, "step": 470 }, { "epoch": 0.1310745469092427, "grad_norm": 2.5039610862731934, "learning_rate": 0.00019797086251064743, "loss": 3.0629, "step": 471 }, { "epoch": 0.1313528368177549, "grad_norm": 2.5699501037597656, "learning_rate": 0.00019796207981843242, "loss": 3.0622, "step": 472 }, { "epoch": 0.1316311267262671, "grad_norm": 2.2075252532958984, "learning_rate": 0.0001979532783558902, "loss": 2.9487, "step": 473 }, { "epoch": 0.1319094166347793, "grad_norm": 2.186957359313965, "learning_rate": 0.00019794445812470728, "loss": 2.7397, "step": 474 }, { "epoch": 0.13218770654329148, "grad_norm": 2.3244500160217285, "learning_rate": 0.0001979356191265736, "loss": 3.015, "step": 475 }, { "epoch": 0.13246599645180368, "grad_norm": 2.2182435989379883, "learning_rate": 0.0001979267613631828, "loss": 2.9736, "step": 476 }, { "epoch": 0.13274428636031585, "grad_norm": 2.18674373626709, "learning_rate": 0.00019791788483623213, "loss": 2.9665, "step": 477 }, { "epoch": 0.13302257626882805, "grad_norm": 1.9541887044906616, "learning_rate": 0.00019790898954742239, "loss": 3.0831, "step": 478 }, { "epoch": 0.13330086617734024, "grad_norm": 2.2709081172943115, "learning_rate": 0.00019790007549845795, "loss": 2.7109, "step": 479 }, { "epoch": 0.13357915608585244, "grad_norm": 2.02314829826355, "learning_rate": 0.00019789114269104686, "loss": 3.0052, "step": 480 }, { "epoch": 0.13385744599436464, "grad_norm": 2.4927501678466797, "learning_rate": 0.00019788219112690074, "loss": 3.1733, "step": 481 }, { "epoch": 0.13413573590287683, "grad_norm": 2.2295081615448, "learning_rate": 0.00019787322080773475, "loss": 2.8966, "step": 482 }, { "epoch": 0.134414025811389, "grad_norm": 2.372164249420166, "learning_rate": 0.00019786423173526763, "loss": 3.2377, "step": 483 }, { "epoch": 0.1346923157199012, "grad_norm": 2.6764392852783203, "learning_rate": 0.00019785522391122182, "loss": 2.8826, "step": 484 }, { "epoch": 0.1349706056284134, "grad_norm": 2.161125421524048, "learning_rate": 0.0001978461973373233, "loss": 2.9582, "step": 485 }, { "epoch": 0.1352488955369256, "grad_norm": 2.2367019653320312, "learning_rate": 0.0001978371520153016, "loss": 3.0724, "step": 486 }, { "epoch": 0.1355271854454378, "grad_norm": 2.6189560890197754, "learning_rate": 0.00019782808794688986, "loss": 2.8145, "step": 487 }, { "epoch": 0.13580547535395, "grad_norm": 2.3323240280151367, "learning_rate": 0.0001978190051338249, "loss": 3.0884, "step": 488 }, { "epoch": 0.13608376526246216, "grad_norm": 2.6934092044830322, "learning_rate": 0.000197809903577847, "loss": 2.9306, "step": 489 }, { "epoch": 0.13636205517097436, "grad_norm": 2.37654972076416, "learning_rate": 0.0001978007832807001, "loss": 3.0945, "step": 490 }, { "epoch": 0.13664034507948655, "grad_norm": 2.25034236907959, "learning_rate": 0.00019779164424413173, "loss": 3.0062, "step": 491 }, { "epoch": 0.13691863498799875, "grad_norm": 2.328068733215332, "learning_rate": 0.000197782486469893, "loss": 2.9326, "step": 492 }, { "epoch": 0.13719692489651095, "grad_norm": 2.2280008792877197, "learning_rate": 0.00019777330995973867, "loss": 3.0139, "step": 493 }, { "epoch": 0.13747521480502314, "grad_norm": 2.4439785480499268, "learning_rate": 0.0001977641147154269, "loss": 3.1008, "step": 494 }, { "epoch": 0.1377535047135353, "grad_norm": 2.1118569374084473, "learning_rate": 0.0001977549007387197, "loss": 2.9033, "step": 495 }, { "epoch": 0.1380317946220475, "grad_norm": 2.102998733520508, "learning_rate": 0.0001977456680313825, "loss": 2.9161, "step": 496 }, { "epoch": 0.1383100845305597, "grad_norm": 2.179610252380371, "learning_rate": 0.00019773641659518435, "loss": 3.137, "step": 497 }, { "epoch": 0.1385883744390719, "grad_norm": 2.334568738937378, "learning_rate": 0.00019772714643189794, "loss": 2.956, "step": 498 }, { "epoch": 0.1388666643475841, "grad_norm": 2.13036847114563, "learning_rate": 0.00019771785754329945, "loss": 2.8521, "step": 499 }, { "epoch": 0.1391449542560963, "grad_norm": 2.4586195945739746, "learning_rate": 0.00019770854993116874, "loss": 3.0561, "step": 500 }, { "epoch": 0.1391449542560963, "eval_loss": 3.0413191318511963, "eval_runtime": 84.8791, "eval_samples_per_second": 58.907, "eval_steps_per_second": 14.727, "step": 500 }, { "epoch": 0.13942324416460847, "grad_norm": 2.4837100505828857, "learning_rate": 0.0001976992235972892, "loss": 3.2499, "step": 501 }, { "epoch": 0.13970153407312066, "grad_norm": 2.6068382263183594, "learning_rate": 0.00019768987854344787, "loss": 3.186, "step": 502 }, { "epoch": 0.13997982398163286, "grad_norm": 2.9714605808258057, "learning_rate": 0.00019768051477143532, "loss": 3.0214, "step": 503 }, { "epoch": 0.14025811389014506, "grad_norm": 2.118037462234497, "learning_rate": 0.0001976711322830457, "loss": 3.096, "step": 504 }, { "epoch": 0.14053640379865726, "grad_norm": 2.149463176727295, "learning_rate": 0.0001976617310800768, "loss": 2.7824, "step": 505 }, { "epoch": 0.14081469370716945, "grad_norm": 2.3836264610290527, "learning_rate": 0.00019765231116433, "loss": 2.932, "step": 506 }, { "epoch": 0.14109298361568165, "grad_norm": 1.8851443529129028, "learning_rate": 0.00019764287253761013, "loss": 2.9245, "step": 507 }, { "epoch": 0.14137127352419382, "grad_norm": 2.2656989097595215, "learning_rate": 0.0001976334152017258, "loss": 2.8756, "step": 508 }, { "epoch": 0.14164956343270602, "grad_norm": 2.3422839641571045, "learning_rate": 0.00019762393915848906, "loss": 2.9436, "step": 509 }, { "epoch": 0.1419278533412182, "grad_norm": 2.4072458744049072, "learning_rate": 0.0001976144444097156, "loss": 3.1425, "step": 510 }, { "epoch": 0.1422061432497304, "grad_norm": 2.178051471710205, "learning_rate": 0.00019760493095722476, "loss": 3.0129, "step": 511 }, { "epoch": 0.1424844331582426, "grad_norm": 2.391737937927246, "learning_rate": 0.00019759539880283934, "loss": 2.8618, "step": 512 }, { "epoch": 0.1427627230667548, "grad_norm": 2.163343906402588, "learning_rate": 0.00019758584794838573, "loss": 2.9103, "step": 513 }, { "epoch": 0.14304101297526697, "grad_norm": 2.2700748443603516, "learning_rate": 0.00019757627839569404, "loss": 2.9762, "step": 514 }, { "epoch": 0.14331930288377917, "grad_norm": 2.173391580581665, "learning_rate": 0.00019756669014659781, "loss": 3.0224, "step": 515 }, { "epoch": 0.14359759279229137, "grad_norm": 2.1817431449890137, "learning_rate": 0.00019755708320293427, "loss": 2.9534, "step": 516 }, { "epoch": 0.14387588270080356, "grad_norm": 1.939325213432312, "learning_rate": 0.00019754745756654413, "loss": 3.0255, "step": 517 }, { "epoch": 0.14415417260931576, "grad_norm": 2.1195762157440186, "learning_rate": 0.00019753781323927184, "loss": 3.2482, "step": 518 }, { "epoch": 0.14443246251782796, "grad_norm": 2.3146917819976807, "learning_rate": 0.00019752815022296524, "loss": 3.0773, "step": 519 }, { "epoch": 0.14471075242634013, "grad_norm": 2.5443153381347656, "learning_rate": 0.00019751846851947584, "loss": 3.0918, "step": 520 }, { "epoch": 0.14498904233485232, "grad_norm": 3.5468006134033203, "learning_rate": 0.0001975087681306588, "loss": 3.3507, "step": 521 }, { "epoch": 0.14526733224336452, "grad_norm": 2.1524624824523926, "learning_rate": 0.00019749904905837275, "loss": 2.88, "step": 522 }, { "epoch": 0.14554562215187672, "grad_norm": 2.3025882244110107, "learning_rate": 0.00019748931130447996, "loss": 3.0398, "step": 523 }, { "epoch": 0.14582391206038892, "grad_norm": 2.139808416366577, "learning_rate": 0.00019747955487084623, "loss": 2.8448, "step": 524 }, { "epoch": 0.1461022019689011, "grad_norm": 2.265538454055786, "learning_rate": 0.000197469779759341, "loss": 3.1162, "step": 525 }, { "epoch": 0.14638049187741328, "grad_norm": 2.135514497756958, "learning_rate": 0.00019745998597183728, "loss": 2.8737, "step": 526 }, { "epoch": 0.14665878178592548, "grad_norm": 2.4391753673553467, "learning_rate": 0.00019745017351021157, "loss": 3.0548, "step": 527 }, { "epoch": 0.14693707169443768, "grad_norm": 2.447962760925293, "learning_rate": 0.0001974403423763441, "loss": 2.9866, "step": 528 }, { "epoch": 0.14721536160294987, "grad_norm": 2.3633787631988525, "learning_rate": 0.00019743049257211858, "loss": 3.2898, "step": 529 }, { "epoch": 0.14749365151146207, "grad_norm": 2.7165722846984863, "learning_rate": 0.00019742062409942222, "loss": 2.9024, "step": 530 }, { "epoch": 0.14777194141997427, "grad_norm": 2.2524967193603516, "learning_rate": 0.00019741073696014598, "loss": 2.6894, "step": 531 }, { "epoch": 0.14805023132848646, "grad_norm": 2.1831159591674805, "learning_rate": 0.00019740083115618435, "loss": 3.0673, "step": 532 }, { "epoch": 0.14832852123699863, "grad_norm": 2.2960596084594727, "learning_rate": 0.00019739090668943526, "loss": 3.2704, "step": 533 }, { "epoch": 0.14860681114551083, "grad_norm": 2.391859292984009, "learning_rate": 0.0001973809635618004, "loss": 3.0315, "step": 534 }, { "epoch": 0.14888510105402303, "grad_norm": 2.2195723056793213, "learning_rate": 0.00019737100177518496, "loss": 3.1945, "step": 535 }, { "epoch": 0.14916339096253523, "grad_norm": 2.13958740234375, "learning_rate": 0.0001973610213314976, "loss": 2.9415, "step": 536 }, { "epoch": 0.14944168087104742, "grad_norm": 2.390456199645996, "learning_rate": 0.00019735102223265077, "loss": 3.134, "step": 537 }, { "epoch": 0.14971997077955962, "grad_norm": 2.1612658500671387, "learning_rate": 0.00019734100448056028, "loss": 2.992, "step": 538 }, { "epoch": 0.1499982606880718, "grad_norm": 2.1035611629486084, "learning_rate": 0.00019733096807714572, "loss": 2.9315, "step": 539 }, { "epoch": 0.15027655059658399, "grad_norm": 2.4247665405273438, "learning_rate": 0.00019732091302433004, "loss": 3.1062, "step": 540 }, { "epoch": 0.15055484050509618, "grad_norm": 2.3431739807128906, "learning_rate": 0.00019731083932403992, "loss": 3.0715, "step": 541 }, { "epoch": 0.15083313041360838, "grad_norm": 2.3261477947235107, "learning_rate": 0.00019730074697820558, "loss": 2.6535, "step": 542 }, { "epoch": 0.15111142032212058, "grad_norm": 2.5420656204223633, "learning_rate": 0.00019729063598876074, "loss": 2.8965, "step": 543 }, { "epoch": 0.15138971023063277, "grad_norm": 2.440958261489868, "learning_rate": 0.0001972805063576428, "loss": 3.153, "step": 544 }, { "epoch": 0.15166800013914494, "grad_norm": 2.388324499130249, "learning_rate": 0.00019727035808679268, "loss": 3.3159, "step": 545 }, { "epoch": 0.15194629004765714, "grad_norm": 2.481550931930542, "learning_rate": 0.00019726019117815481, "loss": 3.0985, "step": 546 }, { "epoch": 0.15222457995616934, "grad_norm": 2.6005289554595947, "learning_rate": 0.00019725000563367733, "loss": 3.0063, "step": 547 }, { "epoch": 0.15250286986468153, "grad_norm": 2.5605878829956055, "learning_rate": 0.00019723980145531182, "loss": 3.5104, "step": 548 }, { "epoch": 0.15278115977319373, "grad_norm": 2.5355913639068604, "learning_rate": 0.00019722957864501352, "loss": 3.055, "step": 549 }, { "epoch": 0.15305944968170593, "grad_norm": 2.313845157623291, "learning_rate": 0.00019721933720474115, "loss": 3.1663, "step": 550 }, { "epoch": 0.1533377395902181, "grad_norm": 3.1426303386688232, "learning_rate": 0.0001972090771364571, "loss": 3.0302, "step": 551 }, { "epoch": 0.1536160294987303, "grad_norm": 2.7097275257110596, "learning_rate": 0.00019719879844212727, "loss": 3.2017, "step": 552 }, { "epoch": 0.1538943194072425, "grad_norm": 4.878807544708252, "learning_rate": 0.0001971885011237211, "loss": 2.825, "step": 553 }, { "epoch": 0.1541726093157547, "grad_norm": 2.0051393508911133, "learning_rate": 0.00019717818518321173, "loss": 3.1923, "step": 554 }, { "epoch": 0.15445089922426689, "grad_norm": 2.0709266662597656, "learning_rate": 0.00019716785062257573, "loss": 3.053, "step": 555 }, { "epoch": 0.15472918913277908, "grad_norm": 1.9326175451278687, "learning_rate": 0.00019715749744379324, "loss": 2.821, "step": 556 }, { "epoch": 0.15500747904129125, "grad_norm": 2.359337568283081, "learning_rate": 0.00019714712564884804, "loss": 2.9097, "step": 557 }, { "epoch": 0.15528576894980345, "grad_norm": 2.030411958694458, "learning_rate": 0.00019713673523972752, "loss": 2.7036, "step": 558 }, { "epoch": 0.15556405885831565, "grad_norm": 2.5988950729370117, "learning_rate": 0.00019712632621842247, "loss": 3.0849, "step": 559 }, { "epoch": 0.15584234876682784, "grad_norm": 2.1664576530456543, "learning_rate": 0.00019711589858692737, "loss": 3.1466, "step": 560 }, { "epoch": 0.15612063867534004, "grad_norm": 2.919633388519287, "learning_rate": 0.0001971054523472403, "loss": 2.9805, "step": 561 }, { "epoch": 0.15639892858385224, "grad_norm": 2.506777048110962, "learning_rate": 0.00019709498750136278, "loss": 2.9039, "step": 562 }, { "epoch": 0.15667721849236443, "grad_norm": 3.298003911972046, "learning_rate": 0.00019708450405129995, "loss": 2.9887, "step": 563 }, { "epoch": 0.1569555084008766, "grad_norm": 2.6447958946228027, "learning_rate": 0.00019707400199906058, "loss": 3.1474, "step": 564 }, { "epoch": 0.1572337983093888, "grad_norm": 2.804832696914673, "learning_rate": 0.00019706348134665688, "loss": 3.1065, "step": 565 }, { "epoch": 0.157512088217901, "grad_norm": 2.2039551734924316, "learning_rate": 0.00019705294209610474, "loss": 2.7588, "step": 566 }, { "epoch": 0.1577903781264132, "grad_norm": 2.244847059249878, "learning_rate": 0.00019704238424942357, "loss": 2.8133, "step": 567 }, { "epoch": 0.1580686680349254, "grad_norm": 2.735396385192871, "learning_rate": 0.0001970318078086363, "loss": 2.9023, "step": 568 }, { "epoch": 0.1583469579434376, "grad_norm": 2.4692294597625732, "learning_rate": 0.00019702121277576952, "loss": 2.725, "step": 569 }, { "epoch": 0.15862524785194976, "grad_norm": 2.1928467750549316, "learning_rate": 0.00019701059915285324, "loss": 3.0139, "step": 570 }, { "epoch": 0.15890353776046195, "grad_norm": 2.6561479568481445, "learning_rate": 0.00019699996694192123, "loss": 2.9993, "step": 571 }, { "epoch": 0.15918182766897415, "grad_norm": 2.1139893531799316, "learning_rate": 0.00019698931614501057, "loss": 2.9184, "step": 572 }, { "epoch": 0.15946011757748635, "grad_norm": 2.121431589126587, "learning_rate": 0.00019697864676416216, "loss": 2.8956, "step": 573 }, { "epoch": 0.15973840748599855, "grad_norm": 2.1429173946380615, "learning_rate": 0.00019696795880142027, "loss": 2.7773, "step": 574 }, { "epoch": 0.16001669739451074, "grad_norm": 2.9368703365325928, "learning_rate": 0.00019695725225883282, "loss": 3.0621, "step": 575 }, { "epoch": 0.1602949873030229, "grad_norm": 2.034986972808838, "learning_rate": 0.00019694652713845128, "loss": 3.0199, "step": 576 }, { "epoch": 0.1605732772115351, "grad_norm": 2.5348939895629883, "learning_rate": 0.00019693578344233065, "loss": 3.1451, "step": 577 }, { "epoch": 0.1608515671200473, "grad_norm": 2.2686927318573, "learning_rate": 0.00019692502117252953, "loss": 3.0964, "step": 578 }, { "epoch": 0.1611298570285595, "grad_norm": 2.703784465789795, "learning_rate": 0.00019691424033111007, "loss": 3.1578, "step": 579 }, { "epoch": 0.1614081469370717, "grad_norm": 2.3303580284118652, "learning_rate": 0.0001969034409201379, "loss": 3.0578, "step": 580 }, { "epoch": 0.1616864368455839, "grad_norm": 2.1466805934906006, "learning_rate": 0.00019689262294168237, "loss": 2.9607, "step": 581 }, { "epoch": 0.16196472675409607, "grad_norm": 2.0792548656463623, "learning_rate": 0.0001968817863978162, "loss": 3.0616, "step": 582 }, { "epoch": 0.16224301666260826, "grad_norm": 2.131519317626953, "learning_rate": 0.0001968709312906158, "loss": 2.7517, "step": 583 }, { "epoch": 0.16252130657112046, "grad_norm": 1.9856834411621094, "learning_rate": 0.00019686005762216112, "loss": 2.7995, "step": 584 }, { "epoch": 0.16279959647963266, "grad_norm": 2.2624948024749756, "learning_rate": 0.0001968491653945356, "loss": 2.8312, "step": 585 }, { "epoch": 0.16307788638814485, "grad_norm": 2.3447439670562744, "learning_rate": 0.00019683825460982632, "loss": 2.9798, "step": 586 }, { "epoch": 0.16335617629665705, "grad_norm": 2.1563222408294678, "learning_rate": 0.00019682732527012383, "loss": 3.0943, "step": 587 }, { "epoch": 0.16363446620516925, "grad_norm": 2.185161590576172, "learning_rate": 0.00019681637737752232, "loss": 2.8358, "step": 588 }, { "epoch": 0.16391275611368142, "grad_norm": 2.4883179664611816, "learning_rate": 0.00019680541093411948, "loss": 3.2151, "step": 589 }, { "epoch": 0.16419104602219362, "grad_norm": 2.479470729827881, "learning_rate": 0.00019679442594201655, "loss": 2.8993, "step": 590 }, { "epoch": 0.1644693359307058, "grad_norm": 2.2433178424835205, "learning_rate": 0.00019678342240331837, "loss": 3.0019, "step": 591 }, { "epoch": 0.164747625839218, "grad_norm": 2.070258378982544, "learning_rate": 0.0001967724003201333, "loss": 2.8878, "step": 592 }, { "epoch": 0.1650259157477302, "grad_norm": 2.597830295562744, "learning_rate": 0.00019676135969457325, "loss": 2.8476, "step": 593 }, { "epoch": 0.1653042056562424, "grad_norm": 2.3516111373901367, "learning_rate": 0.00019675030052875372, "loss": 3.1238, "step": 594 }, { "epoch": 0.16558249556475457, "grad_norm": 2.474061965942383, "learning_rate": 0.00019673922282479372, "loss": 3.2602, "step": 595 }, { "epoch": 0.16586078547326677, "grad_norm": 2.246753692626953, "learning_rate": 0.0001967281265848158, "loss": 2.9538, "step": 596 }, { "epoch": 0.16613907538177897, "grad_norm": 1.999294638633728, "learning_rate": 0.00019671701181094614, "loss": 2.9771, "step": 597 }, { "epoch": 0.16641736529029116, "grad_norm": 2.507982015609741, "learning_rate": 0.00019670587850531437, "loss": 3.4189, "step": 598 }, { "epoch": 0.16669565519880336, "grad_norm": 2.311464309692383, "learning_rate": 0.00019669472667005374, "loss": 2.9299, "step": 599 }, { "epoch": 0.16697394510731556, "grad_norm": 2.3999874591827393, "learning_rate": 0.00019668355630730107, "loss": 3.1306, "step": 600 }, { "epoch": 0.16697394510731556, "eval_loss": 3.025890588760376, "eval_runtime": 84.3182, "eval_samples_per_second": 59.299, "eval_steps_per_second": 14.825, "step": 600 }, { "epoch": 0.16725223501582773, "grad_norm": 2.3539016246795654, "learning_rate": 0.00019667236741919666, "loss": 3.2015, "step": 601 }, { "epoch": 0.16753052492433992, "grad_norm": 2.732624053955078, "learning_rate": 0.00019666116000788438, "loss": 3.0229, "step": 602 }, { "epoch": 0.16780881483285212, "grad_norm": 2.1932709217071533, "learning_rate": 0.00019664993407551167, "loss": 3.0041, "step": 603 }, { "epoch": 0.16808710474136432, "grad_norm": 2.112006902694702, "learning_rate": 0.0001966386896242295, "loss": 2.9041, "step": 604 }, { "epoch": 0.16836539464987652, "grad_norm": 2.3525311946868896, "learning_rate": 0.00019662742665619242, "loss": 2.9485, "step": 605 }, { "epoch": 0.1686436845583887, "grad_norm": 2.599879264831543, "learning_rate": 0.00019661614517355852, "loss": 3.5018, "step": 606 }, { "epoch": 0.16892197446690088, "grad_norm": 2.125608205795288, "learning_rate": 0.00019660484517848936, "loss": 3.2557, "step": 607 }, { "epoch": 0.16920026437541308, "grad_norm": 3.9239907264709473, "learning_rate": 0.0001965935266731502, "loss": 3.002, "step": 608 }, { "epoch": 0.16947855428392528, "grad_norm": 2.4615936279296875, "learning_rate": 0.00019658218965970965, "loss": 3.0159, "step": 609 }, { "epoch": 0.16975684419243747, "grad_norm": 2.317309617996216, "learning_rate": 0.00019657083414034005, "loss": 3.345, "step": 610 }, { "epoch": 0.17003513410094967, "grad_norm": 2.1813011169433594, "learning_rate": 0.0001965594601172172, "loss": 3.0024, "step": 611 }, { "epoch": 0.17031342400946187, "grad_norm": 2.632230043411255, "learning_rate": 0.00019654806759252042, "loss": 2.9065, "step": 612 }, { "epoch": 0.17059171391797404, "grad_norm": 2.1997859477996826, "learning_rate": 0.00019653665656843266, "loss": 2.7687, "step": 613 }, { "epoch": 0.17087000382648623, "grad_norm": 2.347308874130249, "learning_rate": 0.0001965252270471403, "loss": 2.9366, "step": 614 }, { "epoch": 0.17114829373499843, "grad_norm": 2.110773801803589, "learning_rate": 0.00019651377903083343, "loss": 2.7284, "step": 615 }, { "epoch": 0.17142658364351063, "grad_norm": 2.2888636589050293, "learning_rate": 0.0001965023125217055, "loss": 3.0952, "step": 616 }, { "epoch": 0.17170487355202282, "grad_norm": 2.207576274871826, "learning_rate": 0.0001964908275219536, "loss": 2.9366, "step": 617 }, { "epoch": 0.17198316346053502, "grad_norm": 2.4303977489471436, "learning_rate": 0.00019647932403377833, "loss": 2.9432, "step": 618 }, { "epoch": 0.17226145336904722, "grad_norm": 2.17824125289917, "learning_rate": 0.0001964678020593839, "loss": 2.9237, "step": 619 }, { "epoch": 0.1725397432775594, "grad_norm": 2.0226407051086426, "learning_rate": 0.00019645626160097796, "loss": 2.901, "step": 620 }, { "epoch": 0.17281803318607158, "grad_norm": 2.2777554988861084, "learning_rate": 0.00019644470266077182, "loss": 3.0199, "step": 621 }, { "epoch": 0.17309632309458378, "grad_norm": 2.380937337875366, "learning_rate": 0.00019643312524098022, "loss": 2.8169, "step": 622 }, { "epoch": 0.17337461300309598, "grad_norm": 2.3177664279937744, "learning_rate": 0.0001964215293438215, "loss": 3.1814, "step": 623 }, { "epoch": 0.17365290291160818, "grad_norm": 2.59035587310791, "learning_rate": 0.00019640991497151756, "loss": 3.4064, "step": 624 }, { "epoch": 0.17393119282012037, "grad_norm": 2.426198720932007, "learning_rate": 0.0001963982821262937, "loss": 2.8315, "step": 625 }, { "epoch": 0.17420948272863254, "grad_norm": 2.7167255878448486, "learning_rate": 0.000196386630810379, "loss": 2.9154, "step": 626 }, { "epoch": 0.17448777263714474, "grad_norm": 2.1342952251434326, "learning_rate": 0.00019637496102600587, "loss": 2.7981, "step": 627 }, { "epoch": 0.17476606254565694, "grad_norm": 2.1007769107818604, "learning_rate": 0.00019636327277541035, "loss": 2.6715, "step": 628 }, { "epoch": 0.17504435245416913, "grad_norm": 2.1944477558135986, "learning_rate": 0.00019635156606083201, "loss": 2.9031, "step": 629 }, { "epoch": 0.17532264236268133, "grad_norm": 2.32949161529541, "learning_rate": 0.00019633984088451395, "loss": 2.9302, "step": 630 }, { "epoch": 0.17560093227119353, "grad_norm": 2.76697039604187, "learning_rate": 0.0001963280972487028, "loss": 3.1214, "step": 631 }, { "epoch": 0.1758792221797057, "grad_norm": 2.9381232261657715, "learning_rate": 0.00019631633515564877, "loss": 3.2007, "step": 632 }, { "epoch": 0.1761575120882179, "grad_norm": 2.2798707485198975, "learning_rate": 0.00019630455460760554, "loss": 3.0884, "step": 633 }, { "epoch": 0.1764358019967301, "grad_norm": 2.508188247680664, "learning_rate": 0.00019629275560683032, "loss": 3.1608, "step": 634 }, { "epoch": 0.1767140919052423, "grad_norm": 2.66141414642334, "learning_rate": 0.00019628093815558394, "loss": 3.1747, "step": 635 }, { "epoch": 0.17699238181375448, "grad_norm": 2.307785987854004, "learning_rate": 0.00019626910225613076, "loss": 3.2577, "step": 636 }, { "epoch": 0.17727067172226668, "grad_norm": 2.0002317428588867, "learning_rate": 0.00019625724791073857, "loss": 3.0424, "step": 637 }, { "epoch": 0.17754896163077885, "grad_norm": 2.208745002746582, "learning_rate": 0.00019624537512167878, "loss": 3.0447, "step": 638 }, { "epoch": 0.17782725153929105, "grad_norm": 2.6376283168792725, "learning_rate": 0.0001962334838912263, "loss": 3.0685, "step": 639 }, { "epoch": 0.17810554144780325, "grad_norm": 2.0581912994384766, "learning_rate": 0.00019622157422165962, "loss": 2.7502, "step": 640 }, { "epoch": 0.17838383135631544, "grad_norm": 1.928661823272705, "learning_rate": 0.00019620964611526066, "loss": 2.8662, "step": 641 }, { "epoch": 0.17866212126482764, "grad_norm": 2.007415533065796, "learning_rate": 0.00019619769957431504, "loss": 2.9939, "step": 642 }, { "epoch": 0.17894041117333984, "grad_norm": 2.4597301483154297, "learning_rate": 0.00019618573460111174, "loss": 2.9446, "step": 643 }, { "epoch": 0.17921870108185203, "grad_norm": 2.11757493019104, "learning_rate": 0.00019617375119794337, "loss": 3.1847, "step": 644 }, { "epoch": 0.1794969909903642, "grad_norm": 2.220942735671997, "learning_rate": 0.00019616174936710605, "loss": 2.9784, "step": 645 }, { "epoch": 0.1797752808988764, "grad_norm": 2.1159889698028564, "learning_rate": 0.0001961497291108994, "loss": 2.9813, "step": 646 }, { "epoch": 0.1800535708073886, "grad_norm": 2.4251437187194824, "learning_rate": 0.00019613769043162665, "loss": 2.8813, "step": 647 }, { "epoch": 0.1803318607159008, "grad_norm": 2.219775676727295, "learning_rate": 0.00019612563333159443, "loss": 3.077, "step": 648 }, { "epoch": 0.180610150624413, "grad_norm": 2.2904911041259766, "learning_rate": 0.00019611355781311307, "loss": 2.9154, "step": 649 }, { "epoch": 0.1808884405329252, "grad_norm": 2.1819229125976562, "learning_rate": 0.00019610146387849628, "loss": 2.8944, "step": 650 }, { "epoch": 0.18116673044143736, "grad_norm": 2.1656594276428223, "learning_rate": 0.0001960893515300614, "loss": 2.8615, "step": 651 }, { "epoch": 0.18144502034994955, "grad_norm": 2.1331050395965576, "learning_rate": 0.0001960772207701292, "loss": 2.8295, "step": 652 }, { "epoch": 0.18172331025846175, "grad_norm": 2.5402050018310547, "learning_rate": 0.00019606507160102403, "loss": 2.9696, "step": 653 }, { "epoch": 0.18200160016697395, "grad_norm": 2.129232168197632, "learning_rate": 0.00019605290402507385, "loss": 2.8634, "step": 654 }, { "epoch": 0.18227989007548615, "grad_norm": 1.9406921863555908, "learning_rate": 0.00019604071804460996, "loss": 2.8994, "step": 655 }, { "epoch": 0.18255817998399834, "grad_norm": 2.1775248050689697, "learning_rate": 0.00019602851366196737, "loss": 3.0738, "step": 656 }, { "epoch": 0.1828364698925105, "grad_norm": 2.283561944961548, "learning_rate": 0.0001960162908794845, "loss": 3.135, "step": 657 }, { "epoch": 0.1831147598010227, "grad_norm": 2.545853614807129, "learning_rate": 0.00019600404969950333, "loss": 2.942, "step": 658 }, { "epoch": 0.1833930497095349, "grad_norm": 2.2506566047668457, "learning_rate": 0.00019599179012436943, "loss": 3.1367, "step": 659 }, { "epoch": 0.1836713396180471, "grad_norm": 2.1151535511016846, "learning_rate": 0.00019597951215643175, "loss": 2.708, "step": 660 }, { "epoch": 0.1839496295265593, "grad_norm": 2.453552722930908, "learning_rate": 0.00019596721579804292, "loss": 2.9115, "step": 661 }, { "epoch": 0.1842279194350715, "grad_norm": 2.4045965671539307, "learning_rate": 0.00019595490105155894, "loss": 3.0761, "step": 662 }, { "epoch": 0.18450620934358367, "grad_norm": 2.3597042560577393, "learning_rate": 0.0001959425679193395, "loss": 3.1204, "step": 663 }, { "epoch": 0.18478449925209586, "grad_norm": 2.3870129585266113, "learning_rate": 0.00019593021640374768, "loss": 2.9493, "step": 664 }, { "epoch": 0.18506278916060806, "grad_norm": 1.9003781080245972, "learning_rate": 0.00019591784650715014, "loss": 2.7553, "step": 665 }, { "epoch": 0.18534107906912026, "grad_norm": 2.293550968170166, "learning_rate": 0.00019590545823191708, "loss": 3.087, "step": 666 }, { "epoch": 0.18561936897763245, "grad_norm": 2.3235623836517334, "learning_rate": 0.00019589305158042213, "loss": 3.0288, "step": 667 }, { "epoch": 0.18589765888614465, "grad_norm": 2.3410675525665283, "learning_rate": 0.00019588062655504255, "loss": 2.8995, "step": 668 }, { "epoch": 0.18617594879465682, "grad_norm": 2.11531662940979, "learning_rate": 0.00019586818315815908, "loss": 2.9523, "step": 669 }, { "epoch": 0.18645423870316902, "grad_norm": 2.8850862979888916, "learning_rate": 0.00019585572139215596, "loss": 3.3075, "step": 670 }, { "epoch": 0.18673252861168121, "grad_norm": 2.3469364643096924, "learning_rate": 0.00019584324125942099, "loss": 3.3646, "step": 671 }, { "epoch": 0.1870108185201934, "grad_norm": 2.4118428230285645, "learning_rate": 0.00019583074276234542, "loss": 3.0454, "step": 672 }, { "epoch": 0.1872891084287056, "grad_norm": 3.5281238555908203, "learning_rate": 0.00019581822590332407, "loss": 2.9047, "step": 673 }, { "epoch": 0.1875673983372178, "grad_norm": 2.502401351928711, "learning_rate": 0.0001958056906847553, "loss": 2.9471, "step": 674 }, { "epoch": 0.18784568824573, "grad_norm": 2.110297679901123, "learning_rate": 0.00019579313710904096, "loss": 3.0767, "step": 675 }, { "epoch": 0.18812397815424217, "grad_norm": 2.6627931594848633, "learning_rate": 0.0001957805651785864, "loss": 2.8879, "step": 676 }, { "epoch": 0.18840226806275437, "grad_norm": 2.183412790298462, "learning_rate": 0.0001957679748958005, "loss": 3.0051, "step": 677 }, { "epoch": 0.18868055797126657, "grad_norm": 2.1290040016174316, "learning_rate": 0.00019575536626309568, "loss": 2.9257, "step": 678 }, { "epoch": 0.18895884787977876, "grad_norm": 1.9357424974441528, "learning_rate": 0.00019574273928288783, "loss": 2.7418, "step": 679 }, { "epoch": 0.18923713778829096, "grad_norm": 2.2572133541107178, "learning_rate": 0.0001957300939575964, "loss": 2.9118, "step": 680 }, { "epoch": 0.18951542769680316, "grad_norm": 2.370974540710449, "learning_rate": 0.00019571743028964434, "loss": 3.0587, "step": 681 }, { "epoch": 0.18979371760531533, "grad_norm": 2.255549669265747, "learning_rate": 0.00019570474828145806, "loss": 2.9537, "step": 682 }, { "epoch": 0.19007200751382752, "grad_norm": 2.3519108295440674, "learning_rate": 0.00019569204793546763, "loss": 3.046, "step": 683 }, { "epoch": 0.19035029742233972, "grad_norm": 2.2800424098968506, "learning_rate": 0.00019567932925410646, "loss": 3.1147, "step": 684 }, { "epoch": 0.19062858733085192, "grad_norm": 2.26926589012146, "learning_rate": 0.00019566659223981157, "loss": 2.8861, "step": 685 }, { "epoch": 0.19090687723936411, "grad_norm": 2.328157901763916, "learning_rate": 0.00019565383689502356, "loss": 3.1294, "step": 686 }, { "epoch": 0.1911851671478763, "grad_norm": 2.7283689975738525, "learning_rate": 0.00019564106322218628, "loss": 3.0066, "step": 687 }, { "epoch": 0.19146345705638848, "grad_norm": 2.129948854446411, "learning_rate": 0.00019562827122374742, "loss": 2.7559, "step": 688 }, { "epoch": 0.19174174696490068, "grad_norm": 2.2049500942230225, "learning_rate": 0.000195615460902158, "loss": 2.8441, "step": 689 }, { "epoch": 0.19202003687341287, "grad_norm": 2.194370985031128, "learning_rate": 0.0001956026322598725, "loss": 2.9248, "step": 690 }, { "epoch": 0.19229832678192507, "grad_norm": 2.5081229209899902, "learning_rate": 0.00019558978529934912, "loss": 3.139, "step": 691 }, { "epoch": 0.19257661669043727, "grad_norm": 2.4252994060516357, "learning_rate": 0.00019557692002304937, "loss": 2.9552, "step": 692 }, { "epoch": 0.19285490659894947, "grad_norm": 2.2137136459350586, "learning_rate": 0.0001955640364334383, "loss": 2.8912, "step": 693 }, { "epoch": 0.19313319650746164, "grad_norm": 2.655430555343628, "learning_rate": 0.0001955511345329846, "loss": 3.1298, "step": 694 }, { "epoch": 0.19341148641597383, "grad_norm": 2.1990854740142822, "learning_rate": 0.00019553821432416032, "loss": 3.072, "step": 695 }, { "epoch": 0.19368977632448603, "grad_norm": 2.1199026107788086, "learning_rate": 0.00019552527580944112, "loss": 2.8267, "step": 696 }, { "epoch": 0.19396806623299823, "grad_norm": 2.2154250144958496, "learning_rate": 0.00019551231899130603, "loss": 2.6921, "step": 697 }, { "epoch": 0.19424635614151042, "grad_norm": 2.4295618534088135, "learning_rate": 0.0001954993438722378, "loss": 2.9632, "step": 698 }, { "epoch": 0.19452464605002262, "grad_norm": 2.324199676513672, "learning_rate": 0.0001954863504547225, "loss": 3.1559, "step": 699 }, { "epoch": 0.19480293595853482, "grad_norm": 2.2634470462799072, "learning_rate": 0.00019547333874124978, "loss": 3.1343, "step": 700 }, { "epoch": 0.19480293595853482, "eval_loss": 3.014402389526367, "eval_runtime": 84.3624, "eval_samples_per_second": 59.268, "eval_steps_per_second": 14.817, "step": 700 }, { "epoch": 0.195081225867047, "grad_norm": 2.10640287399292, "learning_rate": 0.00019546030873431283, "loss": 2.9358, "step": 701 }, { "epoch": 0.19535951577555918, "grad_norm": 2.3610377311706543, "learning_rate": 0.00019544726043640824, "loss": 3.0397, "step": 702 }, { "epoch": 0.19563780568407138, "grad_norm": 2.332200527191162, "learning_rate": 0.00019543419385003623, "loss": 3.0246, "step": 703 }, { "epoch": 0.19591609559258358, "grad_norm": 2.4290270805358887, "learning_rate": 0.0001954211089777004, "loss": 2.9637, "step": 704 }, { "epoch": 0.19619438550109577, "grad_norm": 2.6105663776397705, "learning_rate": 0.000195408005821908, "loss": 2.9197, "step": 705 }, { "epoch": 0.19647267540960797, "grad_norm": 2.384455680847168, "learning_rate": 0.0001953948843851696, "loss": 3.0827, "step": 706 }, { "epoch": 0.19675096531812014, "grad_norm": 2.8150367736816406, "learning_rate": 0.00019538174466999947, "loss": 3.1737, "step": 707 }, { "epoch": 0.19702925522663234, "grad_norm": 3.0767524242401123, "learning_rate": 0.00019536858667891524, "loss": 3.2746, "step": 708 }, { "epoch": 0.19730754513514454, "grad_norm": 3.028341054916382, "learning_rate": 0.0001953554104144381, "loss": 3.1192, "step": 709 }, { "epoch": 0.19758583504365673, "grad_norm": 2.792232036590576, "learning_rate": 0.0001953422158790927, "loss": 3.337, "step": 710 }, { "epoch": 0.19786412495216893, "grad_norm": 2.4463818073272705, "learning_rate": 0.00019532900307540725, "loss": 2.8125, "step": 711 }, { "epoch": 0.19814241486068113, "grad_norm": 2.287376880645752, "learning_rate": 0.00019531577200591343, "loss": 2.8128, "step": 712 }, { "epoch": 0.1984207047691933, "grad_norm": 2.3140463829040527, "learning_rate": 0.00019530252267314636, "loss": 3.07, "step": 713 }, { "epoch": 0.1986989946777055, "grad_norm": 2.707022190093994, "learning_rate": 0.00019528925507964485, "loss": 3.127, "step": 714 }, { "epoch": 0.1989772845862177, "grad_norm": 2.233452558517456, "learning_rate": 0.00019527596922795093, "loss": 3.1148, "step": 715 }, { "epoch": 0.1992555744947299, "grad_norm": 2.2917845249176025, "learning_rate": 0.00019526266512061036, "loss": 2.6866, "step": 716 }, { "epoch": 0.19953386440324208, "grad_norm": 2.269742488861084, "learning_rate": 0.0001952493427601723, "loss": 3.07, "step": 717 }, { "epoch": 0.19981215431175428, "grad_norm": 2.1115143299102783, "learning_rate": 0.00019523600214918944, "loss": 2.9641, "step": 718 }, { "epoch": 0.20009044422026645, "grad_norm": 2.415621757507324, "learning_rate": 0.0001952226432902179, "loss": 2.9293, "step": 719 }, { "epoch": 0.20036873412877865, "grad_norm": 2.059229850769043, "learning_rate": 0.0001952092661858174, "loss": 2.8464, "step": 720 }, { "epoch": 0.20064702403729084, "grad_norm": 2.0873801708221436, "learning_rate": 0.00019519587083855112, "loss": 2.8541, "step": 721 }, { "epoch": 0.20092531394580304, "grad_norm": 2.260575771331787, "learning_rate": 0.00019518245725098557, "loss": 3.1808, "step": 722 }, { "epoch": 0.20120360385431524, "grad_norm": 2.1481099128723145, "learning_rate": 0.0001951690254256911, "loss": 2.868, "step": 723 }, { "epoch": 0.20148189376282744, "grad_norm": 2.202821731567383, "learning_rate": 0.0001951555753652412, "loss": 3.1555, "step": 724 }, { "epoch": 0.2017601836713396, "grad_norm": 2.3944413661956787, "learning_rate": 0.0001951421070722131, "loss": 2.9258, "step": 725 }, { "epoch": 0.2020384735798518, "grad_norm": 2.1731626987457275, "learning_rate": 0.00019512862054918743, "loss": 2.8616, "step": 726 }, { "epoch": 0.202316763488364, "grad_norm": 2.1008031368255615, "learning_rate": 0.00019511511579874827, "loss": 3.0853, "step": 727 }, { "epoch": 0.2025950533968762, "grad_norm": 2.5215277671813965, "learning_rate": 0.00019510159282348327, "loss": 3.1588, "step": 728 }, { "epoch": 0.2028733433053884, "grad_norm": 2.569723606109619, "learning_rate": 0.00019508805162598352, "loss": 3.2812, "step": 729 }, { "epoch": 0.2031516332139006, "grad_norm": 2.4406051635742188, "learning_rate": 0.00019507449220884364, "loss": 2.7893, "step": 730 }, { "epoch": 0.2034299231224128, "grad_norm": 2.470410108566284, "learning_rate": 0.0001950609145746617, "loss": 3.0455, "step": 731 }, { "epoch": 0.20370821303092496, "grad_norm": 2.491248607635498, "learning_rate": 0.00019504731872603935, "loss": 2.8495, "step": 732 }, { "epoch": 0.20398650293943715, "grad_norm": 2.3216116428375244, "learning_rate": 0.00019503370466558158, "loss": 3.0097, "step": 733 }, { "epoch": 0.20426479284794935, "grad_norm": 2.4635937213897705, "learning_rate": 0.00019502007239589704, "loss": 2.6961, "step": 734 }, { "epoch": 0.20454308275646155, "grad_norm": 2.3673110008239746, "learning_rate": 0.00019500642191959774, "loss": 2.8703, "step": 735 }, { "epoch": 0.20482137266497374, "grad_norm": 2.2671172618865967, "learning_rate": 0.00019499275323929917, "loss": 2.99, "step": 736 }, { "epoch": 0.20509966257348594, "grad_norm": 2.0761921405792236, "learning_rate": 0.00019497906635762045, "loss": 2.6422, "step": 737 }, { "epoch": 0.2053779524819981, "grad_norm": 2.3641774654388428, "learning_rate": 0.00019496536127718405, "loss": 2.9824, "step": 738 }, { "epoch": 0.2056562423905103, "grad_norm": 2.302015781402588, "learning_rate": 0.00019495163800061594, "loss": 2.7938, "step": 739 }, { "epoch": 0.2059345322990225, "grad_norm": 2.328127145767212, "learning_rate": 0.00019493789653054572, "loss": 2.9689, "step": 740 }, { "epoch": 0.2062128222075347, "grad_norm": 2.861438751220703, "learning_rate": 0.00019492413686960627, "loss": 3.0389, "step": 741 }, { "epoch": 0.2064911121160469, "grad_norm": 2.6095097064971924, "learning_rate": 0.00019491035902043406, "loss": 2.9596, "step": 742 }, { "epoch": 0.2067694020245591, "grad_norm": 2.349069595336914, "learning_rate": 0.00019489656298566908, "loss": 3.1141, "step": 743 }, { "epoch": 0.20704769193307127, "grad_norm": 2.0935702323913574, "learning_rate": 0.00019488274876795477, "loss": 2.7071, "step": 744 }, { "epoch": 0.20732598184158346, "grad_norm": 2.392406463623047, "learning_rate": 0.00019486891636993797, "loss": 3.0632, "step": 745 }, { "epoch": 0.20760427175009566, "grad_norm": 2.36651873588562, "learning_rate": 0.00019485506579426914, "loss": 2.8488, "step": 746 }, { "epoch": 0.20788256165860786, "grad_norm": 2.3972702026367188, "learning_rate": 0.00019484119704360215, "loss": 3.0111, "step": 747 }, { "epoch": 0.20816085156712005, "grad_norm": 2.15364146232605, "learning_rate": 0.00019482731012059436, "loss": 3.1008, "step": 748 }, { "epoch": 0.20843914147563225, "grad_norm": 2.368328809738159, "learning_rate": 0.00019481340502790662, "loss": 3.3395, "step": 749 }, { "epoch": 0.20871743138414442, "grad_norm": 2.3806376457214355, "learning_rate": 0.00019479948176820324, "loss": 3.031, "step": 750 }, { "epoch": 0.20899572129265662, "grad_norm": 2.263307809829712, "learning_rate": 0.00019478554034415207, "loss": 2.8215, "step": 751 }, { "epoch": 0.2092740112011688, "grad_norm": 2.246006488800049, "learning_rate": 0.0001947715807584244, "loss": 3.1377, "step": 752 }, { "epoch": 0.209552301109681, "grad_norm": 2.315937042236328, "learning_rate": 0.00019475760301369493, "loss": 2.799, "step": 753 }, { "epoch": 0.2098305910181932, "grad_norm": 2.2703065872192383, "learning_rate": 0.000194743607112642, "loss": 3.061, "step": 754 }, { "epoch": 0.2101088809267054, "grad_norm": 3.5324175357818604, "learning_rate": 0.00019472959305794727, "loss": 3.2445, "step": 755 }, { "epoch": 0.2103871708352176, "grad_norm": 2.6278202533721924, "learning_rate": 0.00019471556085229596, "loss": 3.0884, "step": 756 }, { "epoch": 0.21066546074372977, "grad_norm": 2.345980167388916, "learning_rate": 0.00019470151049837678, "loss": 3.17, "step": 757 }, { "epoch": 0.21094375065224197, "grad_norm": 2.1123218536376953, "learning_rate": 0.00019468744199888186, "loss": 2.8863, "step": 758 }, { "epoch": 0.21122204056075417, "grad_norm": 3.114107370376587, "learning_rate": 0.00019467335535650689, "loss": 3.2495, "step": 759 }, { "epoch": 0.21150033046926636, "grad_norm": 2.9116768836975098, "learning_rate": 0.0001946592505739509, "loss": 3.4303, "step": 760 }, { "epoch": 0.21177862037777856, "grad_norm": 2.4628231525421143, "learning_rate": 0.00019464512765391656, "loss": 3.1004, "step": 761 }, { "epoch": 0.21205691028629076, "grad_norm": 2.1384007930755615, "learning_rate": 0.0001946309865991099, "loss": 2.764, "step": 762 }, { "epoch": 0.21233520019480293, "grad_norm": 2.1018624305725098, "learning_rate": 0.00019461682741224044, "loss": 2.8667, "step": 763 }, { "epoch": 0.21261349010331512, "grad_norm": 3.2161452770233154, "learning_rate": 0.00019460265009602128, "loss": 3.1986, "step": 764 }, { "epoch": 0.21289178001182732, "grad_norm": 2.3462510108947754, "learning_rate": 0.0001945884546531688, "loss": 3.0255, "step": 765 }, { "epoch": 0.21317006992033952, "grad_norm": 2.6107935905456543, "learning_rate": 0.000194574241086403, "loss": 3.3451, "step": 766 }, { "epoch": 0.2134483598288517, "grad_norm": 2.222099781036377, "learning_rate": 0.0001945600093984474, "loss": 3.1542, "step": 767 }, { "epoch": 0.2137266497373639, "grad_norm": 1.935279369354248, "learning_rate": 0.00019454575959202878, "loss": 2.7982, "step": 768 }, { "epoch": 0.21400493964587608, "grad_norm": 2.2807111740112305, "learning_rate": 0.00019453149166987754, "loss": 2.6159, "step": 769 }, { "epoch": 0.21428322955438828, "grad_norm": 2.0912930965423584, "learning_rate": 0.00019451720563472757, "loss": 2.9352, "step": 770 }, { "epoch": 0.21456151946290047, "grad_norm": 2.3805131912231445, "learning_rate": 0.00019450290148931624, "loss": 3.1516, "step": 771 }, { "epoch": 0.21483980937141267, "grad_norm": 2.4347658157348633, "learning_rate": 0.00019448857923638418, "loss": 2.7806, "step": 772 }, { "epoch": 0.21511809927992487, "grad_norm": 2.4791994094848633, "learning_rate": 0.00019447423887867583, "loss": 3.1061, "step": 773 }, { "epoch": 0.21539638918843707, "grad_norm": 2.2799859046936035, "learning_rate": 0.0001944598804189388, "loss": 2.8213, "step": 774 }, { "epoch": 0.21567467909694923, "grad_norm": 2.2505621910095215, "learning_rate": 0.00019444550385992434, "loss": 3.2953, "step": 775 }, { "epoch": 0.21595296900546143, "grad_norm": 2.5306851863861084, "learning_rate": 0.0001944311092043871, "loss": 2.7749, "step": 776 }, { "epoch": 0.21623125891397363, "grad_norm": 2.2524096965789795, "learning_rate": 0.00019441669645508522, "loss": 2.8506, "step": 777 }, { "epoch": 0.21650954882248583, "grad_norm": 2.222785711288452, "learning_rate": 0.00019440226561478027, "loss": 2.8162, "step": 778 }, { "epoch": 0.21678783873099802, "grad_norm": 2.1942076683044434, "learning_rate": 0.00019438781668623732, "loss": 3.0405, "step": 779 }, { "epoch": 0.21706612863951022, "grad_norm": 2.0071957111358643, "learning_rate": 0.00019437334967222495, "loss": 3.1, "step": 780 }, { "epoch": 0.2173444185480224, "grad_norm": 2.4297375679016113, "learning_rate": 0.00019435886457551513, "loss": 3.0226, "step": 781 }, { "epoch": 0.21762270845653459, "grad_norm": 2.4541661739349365, "learning_rate": 0.0001943443613988833, "loss": 3.0937, "step": 782 }, { "epoch": 0.21790099836504678, "grad_norm": 2.229928970336914, "learning_rate": 0.0001943298401451084, "loss": 3.0116, "step": 783 }, { "epoch": 0.21817928827355898, "grad_norm": 2.2430613040924072, "learning_rate": 0.00019431530081697284, "loss": 2.9867, "step": 784 }, { "epoch": 0.21845757818207118, "grad_norm": 2.092585802078247, "learning_rate": 0.00019430074341726244, "loss": 3.1599, "step": 785 }, { "epoch": 0.21873586809058337, "grad_norm": 2.581699848175049, "learning_rate": 0.00019428616794876655, "loss": 3.3671, "step": 786 }, { "epoch": 0.21901415799909557, "grad_norm": 2.0766971111297607, "learning_rate": 0.00019427157441427793, "loss": 3.0163, "step": 787 }, { "epoch": 0.21929244790760774, "grad_norm": 2.2302350997924805, "learning_rate": 0.0001942569628165928, "loss": 2.894, "step": 788 }, { "epoch": 0.21957073781611994, "grad_norm": 2.3058876991271973, "learning_rate": 0.00019424233315851086, "loss": 3.1925, "step": 789 }, { "epoch": 0.21984902772463213, "grad_norm": 2.3643789291381836, "learning_rate": 0.00019422768544283533, "loss": 2.7517, "step": 790 }, { "epoch": 0.22012731763314433, "grad_norm": 2.4070327281951904, "learning_rate": 0.00019421301967237277, "loss": 3.1556, "step": 791 }, { "epoch": 0.22040560754165653, "grad_norm": 2.795344591140747, "learning_rate": 0.00019419833584993329, "loss": 3.0548, "step": 792 }, { "epoch": 0.22068389745016873, "grad_norm": 2.2769570350646973, "learning_rate": 0.00019418363397833043, "loss": 3.0606, "step": 793 }, { "epoch": 0.2209621873586809, "grad_norm": 2.4615914821624756, "learning_rate": 0.00019416891406038114, "loss": 3.0929, "step": 794 }, { "epoch": 0.2212404772671931, "grad_norm": 2.3979427814483643, "learning_rate": 0.00019415417609890595, "loss": 3.1033, "step": 795 }, { "epoch": 0.2215187671757053, "grad_norm": 2.171539068222046, "learning_rate": 0.0001941394200967287, "loss": 2.9031, "step": 796 }, { "epoch": 0.22179705708421749, "grad_norm": 2.9271600246429443, "learning_rate": 0.00019412464605667683, "loss": 3.3661, "step": 797 }, { "epoch": 0.22207534699272968, "grad_norm": 2.169682502746582, "learning_rate": 0.0001941098539815811, "loss": 3.0295, "step": 798 }, { "epoch": 0.22235363690124188, "grad_norm": 2.0674493312835693, "learning_rate": 0.00019409504387427583, "loss": 2.9494, "step": 799 }, { "epoch": 0.22263192680975405, "grad_norm": 2.272202491760254, "learning_rate": 0.00019408021573759875, "loss": 2.9414, "step": 800 }, { "epoch": 0.22263192680975405, "eval_loss": 2.998628616333008, "eval_runtime": 84.2627, "eval_samples_per_second": 59.338, "eval_steps_per_second": 14.835, "step": 800 }, { "epoch": 0.22291021671826625, "grad_norm": 2.2192418575286865, "learning_rate": 0.00019406536957439108, "loss": 3.1919, "step": 801 }, { "epoch": 0.22318850662677844, "grad_norm": 2.1852099895477295, "learning_rate": 0.0001940505053874974, "loss": 3.0429, "step": 802 }, { "epoch": 0.22346679653529064, "grad_norm": 2.452183723449707, "learning_rate": 0.00019403562317976586, "loss": 3.2538, "step": 803 }, { "epoch": 0.22374508644380284, "grad_norm": 2.3386006355285645, "learning_rate": 0.00019402072295404796, "loss": 2.9573, "step": 804 }, { "epoch": 0.22402337635231503, "grad_norm": 2.5900769233703613, "learning_rate": 0.00019400580471319877, "loss": 3.2044, "step": 805 }, { "epoch": 0.2243016662608272, "grad_norm": 2.3939337730407715, "learning_rate": 0.00019399086846007673, "loss": 2.9898, "step": 806 }, { "epoch": 0.2245799561693394, "grad_norm": 2.4106552600860596, "learning_rate": 0.0001939759141975437, "loss": 3.3784, "step": 807 }, { "epoch": 0.2248582460778516, "grad_norm": 2.3325350284576416, "learning_rate": 0.00019396094192846507, "loss": 2.7309, "step": 808 }, { "epoch": 0.2251365359863638, "grad_norm": 2.4056780338287354, "learning_rate": 0.00019394595165570967, "loss": 2.933, "step": 809 }, { "epoch": 0.225414825894876, "grad_norm": 2.8392417430877686, "learning_rate": 0.0001939309433821497, "loss": 2.8622, "step": 810 }, { "epoch": 0.2256931158033882, "grad_norm": 2.4220974445343018, "learning_rate": 0.0001939159171106609, "loss": 2.6285, "step": 811 }, { "epoch": 0.22597140571190039, "grad_norm": 2.304481267929077, "learning_rate": 0.00019390087284412247, "loss": 2.9031, "step": 812 }, { "epoch": 0.22624969562041256, "grad_norm": 2.1207478046417236, "learning_rate": 0.00019388581058541694, "loss": 2.9384, "step": 813 }, { "epoch": 0.22652798552892475, "grad_norm": 2.4876582622528076, "learning_rate": 0.00019387073033743038, "loss": 2.9424, "step": 814 }, { "epoch": 0.22680627543743695, "grad_norm": 2.723314046859741, "learning_rate": 0.00019385563210305232, "loss": 3.1015, "step": 815 }, { "epoch": 0.22708456534594915, "grad_norm": 1.925142765045166, "learning_rate": 0.00019384051588517567, "loss": 2.7193, "step": 816 }, { "epoch": 0.22736285525446134, "grad_norm": 2.064570903778076, "learning_rate": 0.00019382538168669685, "loss": 2.9302, "step": 817 }, { "epoch": 0.22764114516297354, "grad_norm": 2.087707042694092, "learning_rate": 0.00019381022951051562, "loss": 2.8706, "step": 818 }, { "epoch": 0.2279194350714857, "grad_norm": 2.3085482120513916, "learning_rate": 0.00019379505935953538, "loss": 2.8803, "step": 819 }, { "epoch": 0.2281977249799979, "grad_norm": 1.9882041215896606, "learning_rate": 0.00019377987123666275, "loss": 2.7262, "step": 820 }, { "epoch": 0.2284760148885101, "grad_norm": 2.2090253829956055, "learning_rate": 0.00019376466514480794, "loss": 2.8486, "step": 821 }, { "epoch": 0.2287543047970223, "grad_norm": 2.107933521270752, "learning_rate": 0.00019374944108688455, "loss": 2.9308, "step": 822 }, { "epoch": 0.2290325947055345, "grad_norm": 2.234562635421753, "learning_rate": 0.00019373419906580964, "loss": 2.7859, "step": 823 }, { "epoch": 0.2293108846140467, "grad_norm": 2.6875193119049072, "learning_rate": 0.00019371893908450374, "loss": 3.2001, "step": 824 }, { "epoch": 0.22958917452255886, "grad_norm": 2.2028822898864746, "learning_rate": 0.0001937036611458907, "loss": 3.0227, "step": 825 }, { "epoch": 0.22986746443107106, "grad_norm": 2.0872786045074463, "learning_rate": 0.00019368836525289796, "loss": 2.8062, "step": 826 }, { "epoch": 0.23014575433958326, "grad_norm": 2.183893918991089, "learning_rate": 0.00019367305140845635, "loss": 3.137, "step": 827 }, { "epoch": 0.23042404424809546, "grad_norm": 2.293381929397583, "learning_rate": 0.00019365771961550008, "loss": 2.9796, "step": 828 }, { "epoch": 0.23070233415660765, "grad_norm": 2.043055534362793, "learning_rate": 0.00019364236987696686, "loss": 2.6933, "step": 829 }, { "epoch": 0.23098062406511985, "grad_norm": 2.1304562091827393, "learning_rate": 0.00019362700219579786, "loss": 2.844, "step": 830 }, { "epoch": 0.23125891397363202, "grad_norm": 2.0563228130340576, "learning_rate": 0.0001936116165749376, "loss": 2.973, "step": 831 }, { "epoch": 0.23153720388214422, "grad_norm": 2.271160125732422, "learning_rate": 0.0001935962130173341, "loss": 3.0596, "step": 832 }, { "epoch": 0.2318154937906564, "grad_norm": 2.251871109008789, "learning_rate": 0.0001935807915259388, "loss": 2.7004, "step": 833 }, { "epoch": 0.2320937836991686, "grad_norm": 2.582554340362549, "learning_rate": 0.00019356535210370663, "loss": 2.8864, "step": 834 }, { "epoch": 0.2323720736076808, "grad_norm": 2.285609245300293, "learning_rate": 0.00019354989475359587, "loss": 3.0932, "step": 835 }, { "epoch": 0.232650363516193, "grad_norm": 2.3683583736419678, "learning_rate": 0.0001935344194785683, "loss": 3.0998, "step": 836 }, { "epoch": 0.23292865342470517, "grad_norm": 2.286287784576416, "learning_rate": 0.00019351892628158908, "loss": 3.054, "step": 837 }, { "epoch": 0.23320694333321737, "grad_norm": 2.080242872238159, "learning_rate": 0.0001935034151656268, "loss": 3.0147, "step": 838 }, { "epoch": 0.23348523324172957, "grad_norm": 2.432089328765869, "learning_rate": 0.0001934878861336536, "loss": 3.2254, "step": 839 }, { "epoch": 0.23376352315024176, "grad_norm": 3.0245442390441895, "learning_rate": 0.00019347233918864492, "loss": 3.1908, "step": 840 }, { "epoch": 0.23404181305875396, "grad_norm": 2.034923553466797, "learning_rate": 0.00019345677433357968, "loss": 3.0386, "step": 841 }, { "epoch": 0.23432010296726616, "grad_norm": 2.2210001945495605, "learning_rate": 0.00019344119157144022, "loss": 2.8296, "step": 842 }, { "epoch": 0.23459839287577836, "grad_norm": 2.0682013034820557, "learning_rate": 0.00019342559090521237, "loss": 2.9119, "step": 843 }, { "epoch": 0.23487668278429052, "grad_norm": 2.142770290374756, "learning_rate": 0.0001934099723378853, "loss": 3.1297, "step": 844 }, { "epoch": 0.23515497269280272, "grad_norm": 2.0137932300567627, "learning_rate": 0.00019339433587245165, "loss": 3.0619, "step": 845 }, { "epoch": 0.23543326260131492, "grad_norm": 2.3546319007873535, "learning_rate": 0.00019337868151190755, "loss": 2.8012, "step": 846 }, { "epoch": 0.23571155250982712, "grad_norm": 1.9808251857757568, "learning_rate": 0.00019336300925925243, "loss": 2.9213, "step": 847 }, { "epoch": 0.2359898424183393, "grad_norm": 2.1718761920928955, "learning_rate": 0.00019334731911748926, "loss": 2.6831, "step": 848 }, { "epoch": 0.2362681323268515, "grad_norm": 2.270195245742798, "learning_rate": 0.00019333161108962441, "loss": 3.0344, "step": 849 }, { "epoch": 0.23654642223536368, "grad_norm": 2.4823391437530518, "learning_rate": 0.00019331588517866764, "loss": 3.1725, "step": 850 }, { "epoch": 0.23682471214387588, "grad_norm": 2.836463689804077, "learning_rate": 0.00019330014138763216, "loss": 3.0252, "step": 851 }, { "epoch": 0.23710300205238807, "grad_norm": 2.2328147888183594, "learning_rate": 0.00019328437971953463, "loss": 2.9016, "step": 852 }, { "epoch": 0.23738129196090027, "grad_norm": 2.2391767501831055, "learning_rate": 0.00019326860017739508, "loss": 2.7987, "step": 853 }, { "epoch": 0.23765958186941247, "grad_norm": 2.4389960765838623, "learning_rate": 0.00019325280276423704, "loss": 2.8529, "step": 854 }, { "epoch": 0.23793787177792466, "grad_norm": 2.3144774436950684, "learning_rate": 0.0001932369874830874, "loss": 2.8671, "step": 855 }, { "epoch": 0.23821616168643683, "grad_norm": 2.2653298377990723, "learning_rate": 0.00019322115433697648, "loss": 2.7422, "step": 856 }, { "epoch": 0.23849445159494903, "grad_norm": 2.0711584091186523, "learning_rate": 0.00019320530332893803, "loss": 3.2618, "step": 857 }, { "epoch": 0.23877274150346123, "grad_norm": 2.499851942062378, "learning_rate": 0.00019318943446200933, "loss": 3.4878, "step": 858 }, { "epoch": 0.23905103141197342, "grad_norm": 2.3842647075653076, "learning_rate": 0.00019317354773923086, "loss": 2.8183, "step": 859 }, { "epoch": 0.23932932132048562, "grad_norm": 2.026024341583252, "learning_rate": 0.0001931576431636467, "loss": 2.8785, "step": 860 }, { "epoch": 0.23960761122899782, "grad_norm": 2.217151165008545, "learning_rate": 0.0001931417207383043, "loss": 2.9576, "step": 861 }, { "epoch": 0.23988590113751, "grad_norm": 2.83335542678833, "learning_rate": 0.00019312578046625452, "loss": 3.1622, "step": 862 }, { "epoch": 0.24016419104602219, "grad_norm": 2.193587303161621, "learning_rate": 0.0001931098223505516, "loss": 3.016, "step": 863 }, { "epoch": 0.24044248095453438, "grad_norm": 2.2684731483459473, "learning_rate": 0.00019309384639425333, "loss": 2.9497, "step": 864 }, { "epoch": 0.24072077086304658, "grad_norm": 2.149592876434326, "learning_rate": 0.00019307785260042076, "loss": 2.9131, "step": 865 }, { "epoch": 0.24099906077155878, "grad_norm": 2.0008645057678223, "learning_rate": 0.00019306184097211845, "loss": 2.8075, "step": 866 }, { "epoch": 0.24127735068007097, "grad_norm": 1.975762963294983, "learning_rate": 0.00019304581151241437, "loss": 2.8263, "step": 867 }, { "epoch": 0.24155564058858317, "grad_norm": 2.3389108180999756, "learning_rate": 0.00019302976422437993, "loss": 2.9782, "step": 868 }, { "epoch": 0.24183393049709534, "grad_norm": 2.241065502166748, "learning_rate": 0.00019301369911108984, "loss": 2.909, "step": 869 }, { "epoch": 0.24211222040560754, "grad_norm": 1.999258041381836, "learning_rate": 0.00019299761617562236, "loss": 2.9049, "step": 870 }, { "epoch": 0.24239051031411973, "grad_norm": 2.799419641494751, "learning_rate": 0.0001929815154210591, "loss": 2.7585, "step": 871 }, { "epoch": 0.24266880022263193, "grad_norm": 2.4495270252227783, "learning_rate": 0.00019296539685048506, "loss": 2.8304, "step": 872 }, { "epoch": 0.24294709013114413, "grad_norm": 3.619530200958252, "learning_rate": 0.00019294926046698876, "loss": 3.1709, "step": 873 }, { "epoch": 0.24322538003965632, "grad_norm": 2.235837697982788, "learning_rate": 0.000192933106273662, "loss": 2.7996, "step": 874 }, { "epoch": 0.2435036699481685, "grad_norm": 2.3480660915374756, "learning_rate": 0.0001929169342736001, "loss": 3.0552, "step": 875 }, { "epoch": 0.2437819598566807, "grad_norm": 2.3852174282073975, "learning_rate": 0.00019290074446990166, "loss": 3.0486, "step": 876 }, { "epoch": 0.2440602497651929, "grad_norm": 2.092909574508667, "learning_rate": 0.0001928845368656689, "loss": 2.5779, "step": 877 }, { "epoch": 0.24433853967370509, "grad_norm": 2.2215349674224854, "learning_rate": 0.00019286831146400724, "loss": 2.9381, "step": 878 }, { "epoch": 0.24461682958221728, "grad_norm": 2.1749627590179443, "learning_rate": 0.00019285206826802562, "loss": 2.8491, "step": 879 }, { "epoch": 0.24489511949072948, "grad_norm": 2.1214070320129395, "learning_rate": 0.00019283580728083637, "loss": 2.9312, "step": 880 }, { "epoch": 0.24517340939924165, "grad_norm": 2.1492879390716553, "learning_rate": 0.00019281952850555524, "loss": 3.0076, "step": 881 }, { "epoch": 0.24545169930775385, "grad_norm": 2.7832489013671875, "learning_rate": 0.00019280323194530136, "loss": 2.7958, "step": 882 }, { "epoch": 0.24572998921626604, "grad_norm": 2.103790760040283, "learning_rate": 0.0001927869176031973, "loss": 2.7018, "step": 883 }, { "epoch": 0.24600827912477824, "grad_norm": 2.4814155101776123, "learning_rate": 0.00019277058548236898, "loss": 3.0921, "step": 884 }, { "epoch": 0.24628656903329044, "grad_norm": 2.4500222206115723, "learning_rate": 0.00019275423558594583, "loss": 3.2264, "step": 885 }, { "epoch": 0.24656485894180263, "grad_norm": 2.0407135486602783, "learning_rate": 0.00019273786791706058, "loss": 2.721, "step": 886 }, { "epoch": 0.2468431488503148, "grad_norm": 2.3130295276641846, "learning_rate": 0.00019272148247884938, "loss": 2.9651, "step": 887 }, { "epoch": 0.247121438758827, "grad_norm": 2.0370211601257324, "learning_rate": 0.00019270507927445185, "loss": 2.94, "step": 888 }, { "epoch": 0.2473997286673392, "grad_norm": 2.6659724712371826, "learning_rate": 0.000192688658307011, "loss": 3.1618, "step": 889 }, { "epoch": 0.2476780185758514, "grad_norm": 2.37921142578125, "learning_rate": 0.0001926722195796732, "loss": 2.6071, "step": 890 }, { "epoch": 0.2479563084843636, "grad_norm": 2.111372709274292, "learning_rate": 0.0001926557630955882, "loss": 2.7348, "step": 891 }, { "epoch": 0.2482345983928758, "grad_norm": 2.299969434738159, "learning_rate": 0.00019263928885790925, "loss": 3.0014, "step": 892 }, { "epoch": 0.24851288830138796, "grad_norm": 1.949044108390808, "learning_rate": 0.00019262279686979293, "loss": 2.8624, "step": 893 }, { "epoch": 0.24879117820990015, "grad_norm": 2.4874696731567383, "learning_rate": 0.00019260628713439924, "loss": 3.0875, "step": 894 }, { "epoch": 0.24906946811841235, "grad_norm": 2.361778497695923, "learning_rate": 0.00019258975965489157, "loss": 3.1709, "step": 895 }, { "epoch": 0.24934775802692455, "grad_norm": 2.313771963119507, "learning_rate": 0.00019257321443443673, "loss": 2.7375, "step": 896 }, { "epoch": 0.24962604793543675, "grad_norm": 2.7089507579803467, "learning_rate": 0.0001925566514762049, "loss": 2.9833, "step": 897 }, { "epoch": 0.24990433784394894, "grad_norm": 2.3654909133911133, "learning_rate": 0.0001925400707833697, "loss": 3.0736, "step": 898 }, { "epoch": 0.25018262775246114, "grad_norm": 2.3672971725463867, "learning_rate": 0.00019252347235910813, "loss": 2.9923, "step": 899 }, { "epoch": 0.25046091766097334, "grad_norm": 2.363785982131958, "learning_rate": 0.00019250685620660055, "loss": 2.9757, "step": 900 }, { "epoch": 0.25046091766097334, "eval_loss": 2.991063356399536, "eval_runtime": 84.9794, "eval_samples_per_second": 58.838, "eval_steps_per_second": 14.709, "step": 900 }, { "epoch": 0.25073920756948553, "grad_norm": 1.9591429233551025, "learning_rate": 0.00019249022232903078, "loss": 2.9023, "step": 901 }, { "epoch": 0.25101749747799773, "grad_norm": 2.3338277339935303, "learning_rate": 0.00019247357072958599, "loss": 2.9266, "step": 902 }, { "epoch": 0.2512957873865099, "grad_norm": 2.050804376602173, "learning_rate": 0.00019245690141145674, "loss": 3.099, "step": 903 }, { "epoch": 0.25157407729502207, "grad_norm": 2.225813150405884, "learning_rate": 0.00019244021437783706, "loss": 3.124, "step": 904 }, { "epoch": 0.25185236720353427, "grad_norm": 2.3553507328033447, "learning_rate": 0.00019242350963192428, "loss": 2.9571, "step": 905 }, { "epoch": 0.25213065711204646, "grad_norm": 1.9857616424560547, "learning_rate": 0.00019240678717691916, "loss": 2.7771, "step": 906 }, { "epoch": 0.25240894702055866, "grad_norm": 2.295212984085083, "learning_rate": 0.00019239004701602589, "loss": 3.0302, "step": 907 }, { "epoch": 0.25268723692907086, "grad_norm": 2.264939308166504, "learning_rate": 0.00019237328915245198, "loss": 3.0364, "step": 908 }, { "epoch": 0.25296552683758305, "grad_norm": 2.696686267852783, "learning_rate": 0.00019235651358940844, "loss": 2.7149, "step": 909 }, { "epoch": 0.25324381674609525, "grad_norm": 2.4715704917907715, "learning_rate": 0.00019233972033010953, "loss": 3.1557, "step": 910 }, { "epoch": 0.25352210665460745, "grad_norm": 2.473724842071533, "learning_rate": 0.00019232290937777301, "loss": 2.9834, "step": 911 }, { "epoch": 0.25380039656311965, "grad_norm": 2.222715139389038, "learning_rate": 0.00019230608073561998, "loss": 2.9991, "step": 912 }, { "epoch": 0.25407868647163184, "grad_norm": 1.9690274000167847, "learning_rate": 0.00019228923440687496, "loss": 2.7934, "step": 913 }, { "epoch": 0.25435697638014404, "grad_norm": 2.653773784637451, "learning_rate": 0.00019227237039476583, "loss": 2.8512, "step": 914 }, { "epoch": 0.2546352662886562, "grad_norm": 2.414804697036743, "learning_rate": 0.00019225548870252385, "loss": 2.9129, "step": 915 }, { "epoch": 0.2549135561971684, "grad_norm": 2.4671685695648193, "learning_rate": 0.00019223858933338375, "loss": 2.919, "step": 916 }, { "epoch": 0.2551918461056806, "grad_norm": 2.154003381729126, "learning_rate": 0.00019222167229058354, "loss": 2.6941, "step": 917 }, { "epoch": 0.2554701360141928, "grad_norm": 2.1004443168640137, "learning_rate": 0.00019220473757736468, "loss": 3.1412, "step": 918 }, { "epoch": 0.25574842592270497, "grad_norm": 2.221973180770874, "learning_rate": 0.00019218778519697197, "loss": 2.9934, "step": 919 }, { "epoch": 0.25602671583121717, "grad_norm": 2.2630701065063477, "learning_rate": 0.00019217081515265368, "loss": 3.0316, "step": 920 }, { "epoch": 0.25630500573972936, "grad_norm": 2.2909319400787354, "learning_rate": 0.00019215382744766136, "loss": 2.9084, "step": 921 }, { "epoch": 0.25658329564824156, "grad_norm": 2.4857752323150635, "learning_rate": 0.00019213682208524998, "loss": 2.8905, "step": 922 }, { "epoch": 0.25686158555675376, "grad_norm": 2.259615421295166, "learning_rate": 0.00019211979906867793, "loss": 2.8035, "step": 923 }, { "epoch": 0.25713987546526595, "grad_norm": 2.365906238555908, "learning_rate": 0.000192102758401207, "loss": 2.9914, "step": 924 }, { "epoch": 0.25741816537377815, "grad_norm": 2.489870309829712, "learning_rate": 0.00019208570008610222, "loss": 3.1519, "step": 925 }, { "epoch": 0.25769645528229035, "grad_norm": 2.3857553005218506, "learning_rate": 0.0001920686241266322, "loss": 2.7944, "step": 926 }, { "epoch": 0.25797474519080255, "grad_norm": 2.9879558086395264, "learning_rate": 0.00019205153052606876, "loss": 3.0066, "step": 927 }, { "epoch": 0.2582530350993147, "grad_norm": 2.2600924968719482, "learning_rate": 0.00019203441928768722, "loss": 2.8593, "step": 928 }, { "epoch": 0.2585313250078269, "grad_norm": 2.295046329498291, "learning_rate": 0.0001920172904147662, "loss": 3.0173, "step": 929 }, { "epoch": 0.2588096149163391, "grad_norm": 2.0109050273895264, "learning_rate": 0.00019200014391058777, "loss": 2.9551, "step": 930 }, { "epoch": 0.2590879048248513, "grad_norm": 2.5349931716918945, "learning_rate": 0.00019198297977843728, "loss": 3.1603, "step": 931 }, { "epoch": 0.2593661947333635, "grad_norm": 2.7034881114959717, "learning_rate": 0.00019196579802160355, "loss": 3.0908, "step": 932 }, { "epoch": 0.2596444846418757, "grad_norm": 2.876826763153076, "learning_rate": 0.00019194859864337876, "loss": 3.2376, "step": 933 }, { "epoch": 0.25992277455038787, "grad_norm": 2.101888656616211, "learning_rate": 0.00019193138164705844, "loss": 2.8997, "step": 934 }, { "epoch": 0.26020106445890007, "grad_norm": 2.534095287322998, "learning_rate": 0.00019191414703594148, "loss": 2.979, "step": 935 }, { "epoch": 0.26047935436741226, "grad_norm": 2.2128708362579346, "learning_rate": 0.00019189689481333022, "loss": 2.9093, "step": 936 }, { "epoch": 0.26075764427592446, "grad_norm": 2.7324604988098145, "learning_rate": 0.00019187962498253027, "loss": 2.8689, "step": 937 }, { "epoch": 0.26103593418443666, "grad_norm": 2.311871290206909, "learning_rate": 0.0001918623375468507, "loss": 3.0875, "step": 938 }, { "epoch": 0.26131422409294885, "grad_norm": 2.4302868843078613, "learning_rate": 0.0001918450325096039, "loss": 2.9568, "step": 939 }, { "epoch": 0.261592514001461, "grad_norm": 2.3075382709503174, "learning_rate": 0.0001918277098741057, "loss": 2.9289, "step": 940 }, { "epoch": 0.2618708039099732, "grad_norm": 1.9859980344772339, "learning_rate": 0.00019181036964367523, "loss": 3.0259, "step": 941 }, { "epoch": 0.2621490938184854, "grad_norm": 2.0013375282287598, "learning_rate": 0.000191793011821635, "loss": 2.9212, "step": 942 }, { "epoch": 0.2624273837269976, "grad_norm": 2.3014297485351562, "learning_rate": 0.00019177563641131092, "loss": 2.7998, "step": 943 }, { "epoch": 0.2627056736355098, "grad_norm": 2.6545004844665527, "learning_rate": 0.0001917582434160323, "loss": 2.8732, "step": 944 }, { "epoch": 0.262983963544022, "grad_norm": 2.273261785507202, "learning_rate": 0.00019174083283913173, "loss": 3.2238, "step": 945 }, { "epoch": 0.2632622534525342, "grad_norm": 2.2668890953063965, "learning_rate": 0.00019172340468394522, "loss": 3.1532, "step": 946 }, { "epoch": 0.2635405433610464, "grad_norm": 2.1367313861846924, "learning_rate": 0.00019170595895381216, "loss": 2.606, "step": 947 }, { "epoch": 0.2638188332695586, "grad_norm": 2.221686363220215, "learning_rate": 0.0001916884956520753, "loss": 2.8747, "step": 948 }, { "epoch": 0.26409712317807077, "grad_norm": 2.092822313308716, "learning_rate": 0.00019167101478208073, "loss": 2.8658, "step": 949 }, { "epoch": 0.26437541308658297, "grad_norm": 2.37062406539917, "learning_rate": 0.00019165351634717798, "loss": 2.834, "step": 950 }, { "epoch": 0.26465370299509516, "grad_norm": 2.2493433952331543, "learning_rate": 0.0001916360003507198, "loss": 3.1328, "step": 951 }, { "epoch": 0.26493199290360736, "grad_norm": 2.183903455734253, "learning_rate": 0.00019161846679606245, "loss": 2.828, "step": 952 }, { "epoch": 0.2652102828121195, "grad_norm": 2.3612265586853027, "learning_rate": 0.00019160091568656552, "loss": 3.2564, "step": 953 }, { "epoch": 0.2654885727206317, "grad_norm": 2.2755420207977295, "learning_rate": 0.00019158334702559194, "loss": 3.1154, "step": 954 }, { "epoch": 0.2657668626291439, "grad_norm": 2.0507993698120117, "learning_rate": 0.00019156576081650792, "loss": 2.882, "step": 955 }, { "epoch": 0.2660451525376561, "grad_norm": 2.211333990097046, "learning_rate": 0.00019154815706268324, "loss": 2.7766, "step": 956 }, { "epoch": 0.2663234424461683, "grad_norm": 2.2490601539611816, "learning_rate": 0.00019153053576749084, "loss": 3.0217, "step": 957 }, { "epoch": 0.2666017323546805, "grad_norm": 2.658447265625, "learning_rate": 0.00019151289693430717, "loss": 3.1547, "step": 958 }, { "epoch": 0.2668800222631927, "grad_norm": 2.0015876293182373, "learning_rate": 0.0001914952405665119, "loss": 2.8196, "step": 959 }, { "epoch": 0.2671583121717049, "grad_norm": 2.3417866230010986, "learning_rate": 0.00019147756666748823, "loss": 2.709, "step": 960 }, { "epoch": 0.2674366020802171, "grad_norm": 1.907423973083496, "learning_rate": 0.0001914598752406225, "loss": 2.7507, "step": 961 }, { "epoch": 0.2677148919887293, "grad_norm": 2.3158528804779053, "learning_rate": 0.0001914421662893046, "loss": 3.1787, "step": 962 }, { "epoch": 0.2679931818972415, "grad_norm": 2.2121713161468506, "learning_rate": 0.0001914244398169277, "loss": 2.6823, "step": 963 }, { "epoch": 0.26827147180575367, "grad_norm": 2.310617446899414, "learning_rate": 0.00019140669582688835, "loss": 2.6731, "step": 964 }, { "epoch": 0.2685497617142658, "grad_norm": 3.4885268211364746, "learning_rate": 0.00019138893432258644, "loss": 3.0244, "step": 965 }, { "epoch": 0.268828051622778, "grad_norm": 2.508208990097046, "learning_rate": 0.0001913711553074252, "loss": 2.8748, "step": 966 }, { "epoch": 0.2691063415312902, "grad_norm": 2.513348340988159, "learning_rate": 0.00019135335878481122, "loss": 2.9602, "step": 967 }, { "epoch": 0.2693846314398024, "grad_norm": 2.1486382484436035, "learning_rate": 0.00019133554475815446, "loss": 2.91, "step": 968 }, { "epoch": 0.2696629213483146, "grad_norm": 2.4330828189849854, "learning_rate": 0.00019131771323086828, "loss": 3.1533, "step": 969 }, { "epoch": 0.2699412112568268, "grad_norm": 2.586414098739624, "learning_rate": 0.0001912998642063693, "loss": 3.0431, "step": 970 }, { "epoch": 0.270219501165339, "grad_norm": 2.0730769634246826, "learning_rate": 0.00019128199768807757, "loss": 2.9144, "step": 971 }, { "epoch": 0.2704977910738512, "grad_norm": 2.1281137466430664, "learning_rate": 0.00019126411367941643, "loss": 3.0665, "step": 972 }, { "epoch": 0.2707760809823634, "grad_norm": 2.5447998046875, "learning_rate": 0.0001912462121838126, "loss": 2.948, "step": 973 }, { "epoch": 0.2710543708908756, "grad_norm": 2.3098933696746826, "learning_rate": 0.00019122829320469616, "loss": 3.015, "step": 974 }, { "epoch": 0.2713326607993878, "grad_norm": 2.7119693756103516, "learning_rate": 0.00019121035674550058, "loss": 3.4292, "step": 975 }, { "epoch": 0.2716109507079, "grad_norm": 2.1214985847473145, "learning_rate": 0.00019119240280966255, "loss": 2.6641, "step": 976 }, { "epoch": 0.2718892406164121, "grad_norm": 2.10428524017334, "learning_rate": 0.0001911744314006222, "loss": 2.722, "step": 977 }, { "epoch": 0.2721675305249243, "grad_norm": 2.42034912109375, "learning_rate": 0.00019115644252182308, "loss": 2.8762, "step": 978 }, { "epoch": 0.2724458204334365, "grad_norm": 2.141751289367676, "learning_rate": 0.0001911384361767119, "loss": 2.8202, "step": 979 }, { "epoch": 0.2727241103419487, "grad_norm": 2.406116485595703, "learning_rate": 0.00019112041236873887, "loss": 2.9173, "step": 980 }, { "epoch": 0.2730024002504609, "grad_norm": 2.257107973098755, "learning_rate": 0.0001911023711013575, "loss": 2.9635, "step": 981 }, { "epoch": 0.2732806901589731, "grad_norm": 2.416607141494751, "learning_rate": 0.00019108431237802465, "loss": 2.9385, "step": 982 }, { "epoch": 0.2735589800674853, "grad_norm": 2.077342987060547, "learning_rate": 0.00019106623620220053, "loss": 2.9861, "step": 983 }, { "epoch": 0.2738372699759975, "grad_norm": 2.418398857116699, "learning_rate": 0.00019104814257734861, "loss": 3.2599, "step": 984 }, { "epoch": 0.2741155598845097, "grad_norm": 2.5574445724487305, "learning_rate": 0.00019103003150693583, "loss": 3.122, "step": 985 }, { "epoch": 0.2743938497930219, "grad_norm": 2.273277759552002, "learning_rate": 0.00019101190299443245, "loss": 2.8853, "step": 986 }, { "epoch": 0.2746721397015341, "grad_norm": 2.2766342163085938, "learning_rate": 0.00019099375704331191, "loss": 3.0543, "step": 987 }, { "epoch": 0.2749504296100463, "grad_norm": 2.471675395965576, "learning_rate": 0.0001909755936570513, "loss": 2.9018, "step": 988 }, { "epoch": 0.2752287195185585, "grad_norm": 2.387054681777954, "learning_rate": 0.00019095741283913076, "loss": 2.8184, "step": 989 }, { "epoch": 0.2755070094270706, "grad_norm": 2.3248586654663086, "learning_rate": 0.00019093921459303388, "loss": 3.1213, "step": 990 }, { "epoch": 0.2757852993355828, "grad_norm": 2.096325635910034, "learning_rate": 0.00019092099892224762, "loss": 2.9431, "step": 991 }, { "epoch": 0.276063589244095, "grad_norm": 2.027973175048828, "learning_rate": 0.00019090276583026225, "loss": 2.8891, "step": 992 }, { "epoch": 0.2763418791526072, "grad_norm": 2.1154844760894775, "learning_rate": 0.0001908845153205714, "loss": 2.881, "step": 993 }, { "epoch": 0.2766201690611194, "grad_norm": 2.1684629917144775, "learning_rate": 0.00019086624739667193, "loss": 2.7207, "step": 994 }, { "epoch": 0.2768984589696316, "grad_norm": 2.225539445877075, "learning_rate": 0.00019084796206206422, "loss": 2.8696, "step": 995 }, { "epoch": 0.2771767488781438, "grad_norm": 2.2602624893188477, "learning_rate": 0.00019082965932025188, "loss": 2.9483, "step": 996 }, { "epoch": 0.277455038786656, "grad_norm": 2.2472681999206543, "learning_rate": 0.0001908113391747418, "loss": 2.7872, "step": 997 }, { "epoch": 0.2777333286951682, "grad_norm": 2.4084396362304688, "learning_rate": 0.00019079300162904429, "loss": 3.0977, "step": 998 }, { "epoch": 0.2780116186036804, "grad_norm": 4.903784275054932, "learning_rate": 0.000190774646686673, "loss": 2.977, "step": 999 }, { "epoch": 0.2782899085121926, "grad_norm": 2.4104344844818115, "learning_rate": 0.0001907562743511448, "loss": 3.2082, "step": 1000 }, { "epoch": 0.2782899085121926, "eval_loss": 2.9812283515930176, "eval_runtime": 84.7136, "eval_samples_per_second": 59.022, "eval_steps_per_second": 14.756, "step": 1000 }, { "epoch": 0.2785681984207048, "grad_norm": 2.295063018798828, "learning_rate": 0.00019073788462598012, "loss": 3.3572, "step": 1001 }, { "epoch": 0.27884648832921693, "grad_norm": 2.9532876014709473, "learning_rate": 0.0001907194775147025, "loss": 2.8815, "step": 1002 }, { "epoch": 0.27912477823772913, "grad_norm": 2.7832155227661133, "learning_rate": 0.00019070105302083887, "loss": 3.0865, "step": 1003 }, { "epoch": 0.27940306814624133, "grad_norm": 2.1961820125579834, "learning_rate": 0.00019068261114791953, "loss": 3.0414, "step": 1004 }, { "epoch": 0.2796813580547535, "grad_norm": 2.2525787353515625, "learning_rate": 0.00019066415189947816, "loss": 3.2121, "step": 1005 }, { "epoch": 0.2799596479632657, "grad_norm": 2.1405348777770996, "learning_rate": 0.00019064567527905162, "loss": 2.782, "step": 1006 }, { "epoch": 0.2802379378717779, "grad_norm": 3.208451271057129, "learning_rate": 0.00019062718129018018, "loss": 3.0338, "step": 1007 }, { "epoch": 0.2805162277802901, "grad_norm": 2.3725242614746094, "learning_rate": 0.00019060866993640744, "loss": 2.7811, "step": 1008 }, { "epoch": 0.2807945176888023, "grad_norm": 2.251878499984741, "learning_rate": 0.00019059014122128041, "loss": 2.9813, "step": 1009 }, { "epoch": 0.2810728075973145, "grad_norm": 2.245824098587036, "learning_rate": 0.00019057159514834922, "loss": 3.0321, "step": 1010 }, { "epoch": 0.2813510975058267, "grad_norm": 2.740670680999756, "learning_rate": 0.00019055303172116754, "loss": 3.2232, "step": 1011 }, { "epoch": 0.2816293874143389, "grad_norm": 2.6457338333129883, "learning_rate": 0.00019053445094329224, "loss": 3.2815, "step": 1012 }, { "epoch": 0.2819076773228511, "grad_norm": 1.9184149503707886, "learning_rate": 0.00019051585281828354, "loss": 2.666, "step": 1013 }, { "epoch": 0.2821859672313633, "grad_norm": 2.2121214866638184, "learning_rate": 0.000190497237349705, "loss": 3.0131, "step": 1014 }, { "epoch": 0.28246425713987544, "grad_norm": 2.1868200302124023, "learning_rate": 0.00019047860454112346, "loss": 2.833, "step": 1015 }, { "epoch": 0.28274254704838764, "grad_norm": 2.3559131622314453, "learning_rate": 0.0001904599543961092, "loss": 3.1597, "step": 1016 }, { "epoch": 0.28302083695689984, "grad_norm": 2.4490432739257812, "learning_rate": 0.00019044128691823566, "loss": 2.9238, "step": 1017 }, { "epoch": 0.28329912686541203, "grad_norm": 2.268052577972412, "learning_rate": 0.0001904226021110797, "loss": 2.9538, "step": 1018 }, { "epoch": 0.28357741677392423, "grad_norm": 2.239588499069214, "learning_rate": 0.0001904038999782215, "loss": 2.9348, "step": 1019 }, { "epoch": 0.2838557066824364, "grad_norm": 2.179885149002075, "learning_rate": 0.0001903851805232445, "loss": 3.0642, "step": 1020 }, { "epoch": 0.2841339965909486, "grad_norm": 2.3981640338897705, "learning_rate": 0.00019036644374973555, "loss": 2.9009, "step": 1021 }, { "epoch": 0.2844122864994608, "grad_norm": 2.2204346656799316, "learning_rate": 0.00019034768966128476, "loss": 2.9819, "step": 1022 }, { "epoch": 0.284690576407973, "grad_norm": 2.3552000522613525, "learning_rate": 0.0001903289182614855, "loss": 2.8935, "step": 1023 }, { "epoch": 0.2849688663164852, "grad_norm": 2.280709743499756, "learning_rate": 0.00019031012955393462, "loss": 2.8733, "step": 1024 }, { "epoch": 0.2852471562249974, "grad_norm": 2.7742390632629395, "learning_rate": 0.0001902913235422321, "loss": 3.2554, "step": 1025 }, { "epoch": 0.2855254461335096, "grad_norm": 2.740758180618286, "learning_rate": 0.0001902725002299814, "loss": 2.8624, "step": 1026 }, { "epoch": 0.28580373604202175, "grad_norm": 2.5338869094848633, "learning_rate": 0.00019025365962078915, "loss": 3.0358, "step": 1027 }, { "epoch": 0.28608202595053395, "grad_norm": 2.250833034515381, "learning_rate": 0.0001902348017182654, "loss": 2.9373, "step": 1028 }, { "epoch": 0.28636031585904614, "grad_norm": 2.7110579013824463, "learning_rate": 0.0001902159265260235, "loss": 3.0516, "step": 1029 }, { "epoch": 0.28663860576755834, "grad_norm": 2.30458402633667, "learning_rate": 0.00019019703404768004, "loss": 2.78, "step": 1030 }, { "epoch": 0.28691689567607054, "grad_norm": 2.298051118850708, "learning_rate": 0.000190178124286855, "loss": 2.6448, "step": 1031 }, { "epoch": 0.28719518558458274, "grad_norm": 3.837308168411255, "learning_rate": 0.00019015919724717163, "loss": 3.0589, "step": 1032 }, { "epoch": 0.28747347549309493, "grad_norm": 2.2510111331939697, "learning_rate": 0.0001901402529322565, "loss": 2.854, "step": 1033 }, { "epoch": 0.28775176540160713, "grad_norm": 2.1007468700408936, "learning_rate": 0.00019012129134573952, "loss": 2.7978, "step": 1034 }, { "epoch": 0.2880300553101193, "grad_norm": 2.4168872833251953, "learning_rate": 0.00019010231249125386, "loss": 3.0707, "step": 1035 }, { "epoch": 0.2883083452186315, "grad_norm": 3.084479331970215, "learning_rate": 0.00019008331637243605, "loss": 3.0427, "step": 1036 }, { "epoch": 0.2885866351271437, "grad_norm": 2.3489291667938232, "learning_rate": 0.00019006430299292585, "loss": 3.0196, "step": 1037 }, { "epoch": 0.2888649250356559, "grad_norm": 2.4247078895568848, "learning_rate": 0.00019004527235636642, "loss": 2.8312, "step": 1038 }, { "epoch": 0.2891432149441681, "grad_norm": 2.2092173099517822, "learning_rate": 0.00019002622446640416, "loss": 2.939, "step": 1039 }, { "epoch": 0.28942150485268026, "grad_norm": 2.3561270236968994, "learning_rate": 0.00019000715932668884, "loss": 2.7269, "step": 1040 }, { "epoch": 0.28969979476119245, "grad_norm": 2.2443103790283203, "learning_rate": 0.00018998807694087347, "loss": 2.6341, "step": 1041 }, { "epoch": 0.28997808466970465, "grad_norm": 2.0303030014038086, "learning_rate": 0.00018996897731261435, "loss": 2.7886, "step": 1042 }, { "epoch": 0.29025637457821685, "grad_norm": 2.1284236907958984, "learning_rate": 0.00018994986044557118, "loss": 2.9591, "step": 1043 }, { "epoch": 0.29053466448672904, "grad_norm": 2.222538471221924, "learning_rate": 0.00018993072634340688, "loss": 2.8613, "step": 1044 }, { "epoch": 0.29081295439524124, "grad_norm": 2.3499810695648193, "learning_rate": 0.00018991157500978771, "loss": 3.0851, "step": 1045 }, { "epoch": 0.29109124430375344, "grad_norm": 2.5125091075897217, "learning_rate": 0.00018989240644838321, "loss": 3.1046, "step": 1046 }, { "epoch": 0.29136953421226564, "grad_norm": 2.4956893920898438, "learning_rate": 0.00018987322066286622, "loss": 3.0937, "step": 1047 }, { "epoch": 0.29164782412077783, "grad_norm": 2.574758768081665, "learning_rate": 0.00018985401765691292, "loss": 3.3111, "step": 1048 }, { "epoch": 0.29192611402929003, "grad_norm": 2.519538640975952, "learning_rate": 0.0001898347974342028, "loss": 3.0489, "step": 1049 }, { "epoch": 0.2922044039378022, "grad_norm": 2.4975109100341797, "learning_rate": 0.00018981555999841846, "loss": 3.2315, "step": 1050 }, { "epoch": 0.2924826938463144, "grad_norm": 2.383451461791992, "learning_rate": 0.00018979630535324613, "loss": 2.9348, "step": 1051 }, { "epoch": 0.29276098375482656, "grad_norm": 2.4152188301086426, "learning_rate": 0.00018977703350237505, "loss": 3.166, "step": 1052 }, { "epoch": 0.29303927366333876, "grad_norm": 2.2097294330596924, "learning_rate": 0.00018975774444949788, "loss": 2.7553, "step": 1053 }, { "epoch": 0.29331756357185096, "grad_norm": 2.4151086807250977, "learning_rate": 0.00018973843819831057, "loss": 2.8159, "step": 1054 }, { "epoch": 0.29359585348036316, "grad_norm": 2.438908338546753, "learning_rate": 0.00018971911475251235, "loss": 3.1621, "step": 1055 }, { "epoch": 0.29387414338887535, "grad_norm": 2.1655166149139404, "learning_rate": 0.00018969977411580573, "loss": 2.8609, "step": 1056 }, { "epoch": 0.29415243329738755, "grad_norm": 2.257509708404541, "learning_rate": 0.00018968041629189656, "loss": 2.8508, "step": 1057 }, { "epoch": 0.29443072320589975, "grad_norm": 2.3579158782958984, "learning_rate": 0.00018966104128449396, "loss": 3.2132, "step": 1058 }, { "epoch": 0.29470901311441194, "grad_norm": 2.1810085773468018, "learning_rate": 0.0001896416490973103, "loss": 2.9548, "step": 1059 }, { "epoch": 0.29498730302292414, "grad_norm": 1.7938565015792847, "learning_rate": 0.00018962223973406132, "loss": 2.8069, "step": 1060 }, { "epoch": 0.29526559293143634, "grad_norm": 2.4322869777679443, "learning_rate": 0.000189602813198466, "loss": 3.1546, "step": 1061 }, { "epoch": 0.29554388283994854, "grad_norm": 2.2390685081481934, "learning_rate": 0.0001895833694942466, "loss": 3.0933, "step": 1062 }, { "epoch": 0.29582217274846073, "grad_norm": 2.435523748397827, "learning_rate": 0.0001895639086251287, "loss": 2.9639, "step": 1063 }, { "epoch": 0.29610046265697293, "grad_norm": 2.668278455734253, "learning_rate": 0.00018954443059484118, "loss": 3.2604, "step": 1064 }, { "epoch": 0.29637875256548507, "grad_norm": 2.3723320960998535, "learning_rate": 0.0001895249354071162, "loss": 2.8976, "step": 1065 }, { "epoch": 0.29665704247399727, "grad_norm": 2.012861490249634, "learning_rate": 0.00018950542306568914, "loss": 2.8731, "step": 1066 }, { "epoch": 0.29693533238250946, "grad_norm": 2.2463841438293457, "learning_rate": 0.0001894858935742988, "loss": 2.9732, "step": 1067 }, { "epoch": 0.29721362229102166, "grad_norm": 3.3393445014953613, "learning_rate": 0.00018946634693668707, "loss": 2.9558, "step": 1068 }, { "epoch": 0.29749191219953386, "grad_norm": 2.0321619510650635, "learning_rate": 0.00018944678315659937, "loss": 3.0321, "step": 1069 }, { "epoch": 0.29777020210804606, "grad_norm": 2.6007144451141357, "learning_rate": 0.00018942720223778418, "loss": 2.9716, "step": 1070 }, { "epoch": 0.29804849201655825, "grad_norm": 2.194688320159912, "learning_rate": 0.00018940760418399344, "loss": 2.8576, "step": 1071 }, { "epoch": 0.29832678192507045, "grad_norm": 2.4894134998321533, "learning_rate": 0.00018938798899898225, "loss": 2.7962, "step": 1072 }, { "epoch": 0.29860507183358265, "grad_norm": 2.6302945613861084, "learning_rate": 0.00018936835668650908, "loss": 2.9588, "step": 1073 }, { "epoch": 0.29888336174209484, "grad_norm": 2.2776103019714355, "learning_rate": 0.00018934870725033557, "loss": 3.1319, "step": 1074 }, { "epoch": 0.29916165165060704, "grad_norm": 2.7816131114959717, "learning_rate": 0.00018932904069422679, "loss": 3.1834, "step": 1075 }, { "epoch": 0.29943994155911924, "grad_norm": 2.2944226264953613, "learning_rate": 0.00018930935702195093, "loss": 3.0898, "step": 1076 }, { "epoch": 0.2997182314676314, "grad_norm": 2.0931670665740967, "learning_rate": 0.00018928965623727964, "loss": 2.6973, "step": 1077 }, { "epoch": 0.2999965213761436, "grad_norm": 2.22282338142395, "learning_rate": 0.00018926993834398765, "loss": 3.0529, "step": 1078 }, { "epoch": 0.3002748112846558, "grad_norm": 2.131679058074951, "learning_rate": 0.00018925020334585311, "loss": 2.6117, "step": 1079 }, { "epoch": 0.30055310119316797, "grad_norm": 2.5342094898223877, "learning_rate": 0.00018923045124665745, "loss": 3.2762, "step": 1080 }, { "epoch": 0.30083139110168017, "grad_norm": 2.6795685291290283, "learning_rate": 0.00018921068205018525, "loss": 3.2019, "step": 1081 }, { "epoch": 0.30110968101019236, "grad_norm": 2.2270455360412598, "learning_rate": 0.0001891908957602245, "loss": 2.9762, "step": 1082 }, { "epoch": 0.30138797091870456, "grad_norm": 2.2827603816986084, "learning_rate": 0.0001891710923805664, "loss": 3.0479, "step": 1083 }, { "epoch": 0.30166626082721676, "grad_norm": 2.31645131111145, "learning_rate": 0.00018915127191500544, "loss": 3.1381, "step": 1084 }, { "epoch": 0.30194455073572896, "grad_norm": 2.195394277572632, "learning_rate": 0.00018913143436733937, "loss": 3.0308, "step": 1085 }, { "epoch": 0.30222284064424115, "grad_norm": 2.792379856109619, "learning_rate": 0.00018911157974136925, "loss": 2.8825, "step": 1086 }, { "epoch": 0.30250113055275335, "grad_norm": 2.6862266063690186, "learning_rate": 0.00018909170804089939, "loss": 3.0844, "step": 1087 }, { "epoch": 0.30277942046126555, "grad_norm": 2.170910596847534, "learning_rate": 0.0001890718192697373, "loss": 3.067, "step": 1088 }, { "epoch": 0.3030577103697777, "grad_norm": 2.0469236373901367, "learning_rate": 0.00018905191343169392, "loss": 2.8804, "step": 1089 }, { "epoch": 0.3033360002782899, "grad_norm": 2.0144686698913574, "learning_rate": 0.00018903199053058332, "loss": 2.9923, "step": 1090 }, { "epoch": 0.3036142901868021, "grad_norm": 2.469778060913086, "learning_rate": 0.00018901205057022288, "loss": 3.0029, "step": 1091 }, { "epoch": 0.3038925800953143, "grad_norm": 2.3326077461242676, "learning_rate": 0.0001889920935544333, "loss": 2.9103, "step": 1092 }, { "epoch": 0.3041708700038265, "grad_norm": 2.52431058883667, "learning_rate": 0.00018897211948703846, "loss": 2.9991, "step": 1093 }, { "epoch": 0.3044491599123387, "grad_norm": 2.962874412536621, "learning_rate": 0.0001889521283718656, "loss": 3.1715, "step": 1094 }, { "epoch": 0.30472744982085087, "grad_norm": 2.337473154067993, "learning_rate": 0.00018893212021274514, "loss": 2.9481, "step": 1095 }, { "epoch": 0.30500573972936307, "grad_norm": 2.0684142112731934, "learning_rate": 0.00018891209501351082, "loss": 2.9719, "step": 1096 }, { "epoch": 0.30528402963787526, "grad_norm": 2.54006290435791, "learning_rate": 0.00018889205277799963, "loss": 2.803, "step": 1097 }, { "epoch": 0.30556231954638746, "grad_norm": 2.356423854827881, "learning_rate": 0.00018887199351005183, "loss": 2.9109, "step": 1098 }, { "epoch": 0.30584060945489966, "grad_norm": 2.250021457672119, "learning_rate": 0.00018885191721351092, "loss": 3.092, "step": 1099 }, { "epoch": 0.30611889936341186, "grad_norm": 2.0202651023864746, "learning_rate": 0.0001888318238922237, "loss": 2.8002, "step": 1100 }, { "epoch": 0.30611889936341186, "eval_loss": 2.971851348876953, "eval_runtime": 84.2556, "eval_samples_per_second": 59.343, "eval_steps_per_second": 14.836, "step": 1100 }, { "epoch": 0.30639718927192405, "grad_norm": 2.3319437503814697, "learning_rate": 0.0001888117135500402, "loss": 2.956, "step": 1101 }, { "epoch": 0.3066754791804362, "grad_norm": 2.168853759765625, "learning_rate": 0.00018879158619081375, "loss": 2.7645, "step": 1102 }, { "epoch": 0.3069537690889484, "grad_norm": 2.405137777328491, "learning_rate": 0.00018877144181840086, "loss": 3.2334, "step": 1103 }, { "epoch": 0.3072320589974606, "grad_norm": 2.687662363052368, "learning_rate": 0.00018875128043666144, "loss": 2.9231, "step": 1104 }, { "epoch": 0.3075103489059728, "grad_norm": 1.9743692874908447, "learning_rate": 0.00018873110204945846, "loss": 2.9776, "step": 1105 }, { "epoch": 0.307788638814485, "grad_norm": 2.3342981338500977, "learning_rate": 0.00018871090666065842, "loss": 2.7816, "step": 1106 }, { "epoch": 0.3080669287229972, "grad_norm": 2.3802807331085205, "learning_rate": 0.00018869069427413076, "loss": 2.9028, "step": 1107 }, { "epoch": 0.3083452186315094, "grad_norm": 2.244115114212036, "learning_rate": 0.00018867046489374838, "loss": 3.1007, "step": 1108 }, { "epoch": 0.3086235085400216, "grad_norm": 2.1095259189605713, "learning_rate": 0.00018865021852338746, "loss": 2.792, "step": 1109 }, { "epoch": 0.30890179844853377, "grad_norm": 2.5294413566589355, "learning_rate": 0.00018862995516692732, "loss": 2.811, "step": 1110 }, { "epoch": 0.30918008835704597, "grad_norm": 2.3228917121887207, "learning_rate": 0.00018860967482825055, "loss": 2.9245, "step": 1111 }, { "epoch": 0.30945837826555816, "grad_norm": 2.133042812347412, "learning_rate": 0.0001885893775112431, "loss": 2.9268, "step": 1112 }, { "epoch": 0.30973666817407036, "grad_norm": 2.063772201538086, "learning_rate": 0.00018856906321979407, "loss": 2.9566, "step": 1113 }, { "epoch": 0.3100149580825825, "grad_norm": 2.2008044719696045, "learning_rate": 0.00018854873195779583, "loss": 2.574, "step": 1114 }, { "epoch": 0.3102932479910947, "grad_norm": 2.1742701530456543, "learning_rate": 0.000188528383729144, "loss": 2.9992, "step": 1115 }, { "epoch": 0.3105715378996069, "grad_norm": 2.2259178161621094, "learning_rate": 0.00018850801853773753, "loss": 2.8978, "step": 1116 }, { "epoch": 0.3108498278081191, "grad_norm": 2.128570318222046, "learning_rate": 0.00018848763638747848, "loss": 2.7417, "step": 1117 }, { "epoch": 0.3111281177166313, "grad_norm": 2.079465866088867, "learning_rate": 0.00018846723728227228, "loss": 2.6456, "step": 1118 }, { "epoch": 0.3114064076251435, "grad_norm": 2.38692045211792, "learning_rate": 0.00018844682122602752, "loss": 2.8159, "step": 1119 }, { "epoch": 0.3116846975336557, "grad_norm": 2.269820213317871, "learning_rate": 0.00018842638822265615, "loss": 3.1357, "step": 1120 }, { "epoch": 0.3119629874421679, "grad_norm": 2.6265504360198975, "learning_rate": 0.00018840593827607326, "loss": 2.9266, "step": 1121 }, { "epoch": 0.3122412773506801, "grad_norm": 2.151615858078003, "learning_rate": 0.00018838547139019723, "loss": 2.8285, "step": 1122 }, { "epoch": 0.3125195672591923, "grad_norm": 2.108973264694214, "learning_rate": 0.00018836498756894966, "loss": 2.9678, "step": 1123 }, { "epoch": 0.3127978571677045, "grad_norm": 2.114283561706543, "learning_rate": 0.00018834448681625542, "loss": 2.9171, "step": 1124 }, { "epoch": 0.31307614707621667, "grad_norm": 2.4875669479370117, "learning_rate": 0.00018832396913604264, "loss": 2.974, "step": 1125 }, { "epoch": 0.31335443698472887, "grad_norm": 2.299150228500366, "learning_rate": 0.0001883034345322427, "loss": 3.0238, "step": 1126 }, { "epoch": 0.313632726893241, "grad_norm": 2.2018871307373047, "learning_rate": 0.00018828288300879012, "loss": 2.9183, "step": 1127 }, { "epoch": 0.3139110168017532, "grad_norm": 2.27093243598938, "learning_rate": 0.0001882623145696228, "loss": 2.956, "step": 1128 }, { "epoch": 0.3141893067102654, "grad_norm": 2.498364210128784, "learning_rate": 0.0001882417292186818, "loss": 3.1549, "step": 1129 }, { "epoch": 0.3144675966187776, "grad_norm": 2.2145185470581055, "learning_rate": 0.00018822112695991144, "loss": 2.8288, "step": 1130 }, { "epoch": 0.3147458865272898, "grad_norm": 2.1318297386169434, "learning_rate": 0.00018820050779725929, "loss": 2.8094, "step": 1131 }, { "epoch": 0.315024176435802, "grad_norm": 2.2452354431152344, "learning_rate": 0.00018817987173467613, "loss": 2.877, "step": 1132 }, { "epoch": 0.3153024663443142, "grad_norm": 2.0873141288757324, "learning_rate": 0.000188159218776116, "loss": 3.1625, "step": 1133 }, { "epoch": 0.3155807562528264, "grad_norm": 2.624479055404663, "learning_rate": 0.0001881385489255362, "loss": 3.0381, "step": 1134 }, { "epoch": 0.3158590461613386, "grad_norm": 3.1240036487579346, "learning_rate": 0.00018811786218689717, "loss": 2.8823, "step": 1135 }, { "epoch": 0.3161373360698508, "grad_norm": 2.2257115840911865, "learning_rate": 0.00018809715856416275, "loss": 2.8804, "step": 1136 }, { "epoch": 0.316415625978363, "grad_norm": 2.2033560276031494, "learning_rate": 0.00018807643806129987, "loss": 3.0727, "step": 1137 }, { "epoch": 0.3166939158868752, "grad_norm": 2.2808096408843994, "learning_rate": 0.00018805570068227876, "loss": 2.7602, "step": 1138 }, { "epoch": 0.3169722057953873, "grad_norm": 2.203282117843628, "learning_rate": 0.00018803494643107284, "loss": 2.8903, "step": 1139 }, { "epoch": 0.3172504957038995, "grad_norm": 2.3106565475463867, "learning_rate": 0.00018801417531165885, "loss": 3.1969, "step": 1140 }, { "epoch": 0.3175287856124117, "grad_norm": 2.098233222961426, "learning_rate": 0.00018799338732801663, "loss": 2.7922, "step": 1141 }, { "epoch": 0.3178070755209239, "grad_norm": 1.968705654144287, "learning_rate": 0.0001879725824841294, "loss": 2.6316, "step": 1142 }, { "epoch": 0.3180853654294361, "grad_norm": 2.5677616596221924, "learning_rate": 0.00018795176078398348, "loss": 3.0551, "step": 1143 }, { "epoch": 0.3183636553379483, "grad_norm": 2.4083244800567627, "learning_rate": 0.0001879309222315685, "loss": 2.9688, "step": 1144 }, { "epoch": 0.3186419452464605, "grad_norm": 2.284806728363037, "learning_rate": 0.00018791006683087732, "loss": 2.9046, "step": 1145 }, { "epoch": 0.3189202351549727, "grad_norm": 2.285280704498291, "learning_rate": 0.00018788919458590596, "loss": 3.0153, "step": 1146 }, { "epoch": 0.3191985250634849, "grad_norm": 2.8007445335388184, "learning_rate": 0.00018786830550065372, "loss": 3.3478, "step": 1147 }, { "epoch": 0.3194768149719971, "grad_norm": 2.165300130844116, "learning_rate": 0.00018784739957912317, "loss": 2.6987, "step": 1148 }, { "epoch": 0.3197551048805093, "grad_norm": 2.224034309387207, "learning_rate": 0.00018782647682531996, "loss": 2.9192, "step": 1149 }, { "epoch": 0.3200333947890215, "grad_norm": 2.2775347232818604, "learning_rate": 0.00018780553724325315, "loss": 2.6095, "step": 1150 }, { "epoch": 0.3203116846975337, "grad_norm": 2.342355966567993, "learning_rate": 0.00018778458083693487, "loss": 2.7442, "step": 1151 }, { "epoch": 0.3205899746060458, "grad_norm": 2.2344112396240234, "learning_rate": 0.00018776360761038055, "loss": 2.9166, "step": 1152 }, { "epoch": 0.320868264514558, "grad_norm": 2.5537362098693848, "learning_rate": 0.00018774261756760886, "loss": 3.1212, "step": 1153 }, { "epoch": 0.3211465544230702, "grad_norm": 3.5446200370788574, "learning_rate": 0.0001877216107126416, "loss": 3.2037, "step": 1154 }, { "epoch": 0.3214248443315824, "grad_norm": 2.398542881011963, "learning_rate": 0.00018770058704950393, "loss": 2.709, "step": 1155 }, { "epoch": 0.3217031342400946, "grad_norm": 2.1771011352539062, "learning_rate": 0.00018767954658222412, "loss": 3.1763, "step": 1156 }, { "epoch": 0.3219814241486068, "grad_norm": 2.532365083694458, "learning_rate": 0.00018765848931483369, "loss": 3.0292, "step": 1157 }, { "epoch": 0.322259714057119, "grad_norm": 2.601168394088745, "learning_rate": 0.0001876374152513674, "loss": 2.929, "step": 1158 }, { "epoch": 0.3225380039656312, "grad_norm": 2.303968667984009, "learning_rate": 0.00018761632439586316, "loss": 2.8676, "step": 1159 }, { "epoch": 0.3228162938741434, "grad_norm": 2.7635085582733154, "learning_rate": 0.0001875952167523622, "loss": 2.7935, "step": 1160 }, { "epoch": 0.3230945837826556, "grad_norm": 2.1332292556762695, "learning_rate": 0.00018757409232490892, "loss": 2.699, "step": 1161 }, { "epoch": 0.3233728736911678, "grad_norm": 2.6221301555633545, "learning_rate": 0.00018755295111755088, "loss": 3.3359, "step": 1162 }, { "epoch": 0.32365116359968, "grad_norm": 2.0089375972747803, "learning_rate": 0.00018753179313433897, "loss": 2.7084, "step": 1163 }, { "epoch": 0.32392945350819213, "grad_norm": 2.3948702812194824, "learning_rate": 0.0001875106183793272, "loss": 2.7496, "step": 1164 }, { "epoch": 0.32420774341670433, "grad_norm": 2.054074287414551, "learning_rate": 0.00018748942685657281, "loss": 2.9357, "step": 1165 }, { "epoch": 0.3244860333252165, "grad_norm": 2.2625646591186523, "learning_rate": 0.0001874682185701363, "loss": 2.7322, "step": 1166 }, { "epoch": 0.3247643232337287, "grad_norm": 2.163066864013672, "learning_rate": 0.00018744699352408128, "loss": 3.2351, "step": 1167 }, { "epoch": 0.3250426131422409, "grad_norm": 2.724336624145508, "learning_rate": 0.00018742575172247473, "loss": 3.1954, "step": 1168 }, { "epoch": 0.3253209030507531, "grad_norm": 2.1193623542785645, "learning_rate": 0.00018740449316938669, "loss": 2.7972, "step": 1169 }, { "epoch": 0.3255991929592653, "grad_norm": 2.321559190750122, "learning_rate": 0.0001873832178688905, "loss": 2.9818, "step": 1170 }, { "epoch": 0.3258774828677775, "grad_norm": 2.205207109451294, "learning_rate": 0.0001873619258250627, "loss": 2.7382, "step": 1171 }, { "epoch": 0.3261557727762897, "grad_norm": 2.2333266735076904, "learning_rate": 0.00018734061704198297, "loss": 2.9347, "step": 1172 }, { "epoch": 0.3264340626848019, "grad_norm": 2.7027249336242676, "learning_rate": 0.00018731929152373424, "loss": 3.1438, "step": 1173 }, { "epoch": 0.3267123525933141, "grad_norm": 2.5843727588653564, "learning_rate": 0.0001872979492744027, "loss": 3.1782, "step": 1174 }, { "epoch": 0.3269906425018263, "grad_norm": 2.5205907821655273, "learning_rate": 0.00018727659029807767, "loss": 2.9564, "step": 1175 }, { "epoch": 0.3272689324103385, "grad_norm": 2.3286476135253906, "learning_rate": 0.0001872552145988517, "loss": 3.1941, "step": 1176 }, { "epoch": 0.32754722231885064, "grad_norm": 2.730647325515747, "learning_rate": 0.00018723382218082058, "loss": 3.1389, "step": 1177 }, { "epoch": 0.32782551222736284, "grad_norm": 2.7821812629699707, "learning_rate": 0.00018721241304808323, "loss": 3.3162, "step": 1178 }, { "epoch": 0.32810380213587503, "grad_norm": 2.2343802452087402, "learning_rate": 0.00018719098720474184, "loss": 2.9526, "step": 1179 }, { "epoch": 0.32838209204438723, "grad_norm": 2.2397286891937256, "learning_rate": 0.00018716954465490174, "loss": 3.0829, "step": 1180 }, { "epoch": 0.3286603819528994, "grad_norm": 2.359719753265381, "learning_rate": 0.00018714808540267153, "loss": 3.2031, "step": 1181 }, { "epoch": 0.3289386718614116, "grad_norm": 2.2687413692474365, "learning_rate": 0.00018712660945216297, "loss": 2.9213, "step": 1182 }, { "epoch": 0.3292169617699238, "grad_norm": 2.119237184524536, "learning_rate": 0.000187105116807491, "loss": 2.7226, "step": 1183 }, { "epoch": 0.329495251678436, "grad_norm": 2.2565367221832275, "learning_rate": 0.00018708360747277385, "loss": 2.7129, "step": 1184 }, { "epoch": 0.3297735415869482, "grad_norm": 2.4942169189453125, "learning_rate": 0.0001870620814521328, "loss": 2.8245, "step": 1185 }, { "epoch": 0.3300518314954604, "grad_norm": 2.0679166316986084, "learning_rate": 0.00018704053874969248, "loss": 2.7597, "step": 1186 }, { "epoch": 0.3303301214039726, "grad_norm": 2.4863128662109375, "learning_rate": 0.00018701897936958058, "loss": 2.8186, "step": 1187 }, { "epoch": 0.3306084113124848, "grad_norm": 2.167577028274536, "learning_rate": 0.0001869974033159281, "loss": 2.8108, "step": 1188 }, { "epoch": 0.33088670122099695, "grad_norm": 2.265469551086426, "learning_rate": 0.00018697581059286917, "loss": 2.7453, "step": 1189 }, { "epoch": 0.33116499112950915, "grad_norm": 2.6284773349761963, "learning_rate": 0.00018695420120454115, "loss": 3.1183, "step": 1190 }, { "epoch": 0.33144328103802134, "grad_norm": 2.2972939014434814, "learning_rate": 0.0001869325751550845, "loss": 2.6669, "step": 1191 }, { "epoch": 0.33172157094653354, "grad_norm": 2.1704702377319336, "learning_rate": 0.00018691093244864305, "loss": 2.758, "step": 1192 }, { "epoch": 0.33199986085504574, "grad_norm": 1.9892711639404297, "learning_rate": 0.0001868892730893637, "loss": 2.7054, "step": 1193 }, { "epoch": 0.33227815076355793, "grad_norm": 2.1082918643951416, "learning_rate": 0.0001868675970813965, "loss": 2.6969, "step": 1194 }, { "epoch": 0.33255644067207013, "grad_norm": 2.258547306060791, "learning_rate": 0.00018684590442889472, "loss": 2.9904, "step": 1195 }, { "epoch": 0.3328347305805823, "grad_norm": 2.319761276245117, "learning_rate": 0.000186824195136015, "loss": 2.8719, "step": 1196 }, { "epoch": 0.3331130204890945, "grad_norm": 2.0440516471862793, "learning_rate": 0.00018680246920691688, "loss": 2.6646, "step": 1197 }, { "epoch": 0.3333913103976067, "grad_norm": 2.343111276626587, "learning_rate": 0.00018678072664576325, "loss": 2.8313, "step": 1198 }, { "epoch": 0.3336696003061189, "grad_norm": 1.927234411239624, "learning_rate": 0.00018675896745672018, "loss": 2.921, "step": 1199 }, { "epoch": 0.3339478902146311, "grad_norm": 2.250185966491699, "learning_rate": 0.00018673719164395691, "loss": 2.7507, "step": 1200 }, { "epoch": 0.3339478902146311, "eval_loss": 2.9658923149108887, "eval_runtime": 85.0539, "eval_samples_per_second": 58.786, "eval_steps_per_second": 14.697, "step": 1200 }, { "epoch": 0.33422618012314326, "grad_norm": 2.2908971309661865, "learning_rate": 0.00018671539921164584, "loss": 2.9163, "step": 1201 }, { "epoch": 0.33450447003165545, "grad_norm": 2.0086634159088135, "learning_rate": 0.00018669359016396263, "loss": 2.5519, "step": 1202 }, { "epoch": 0.33478275994016765, "grad_norm": 2.2176084518432617, "learning_rate": 0.00018667176450508602, "loss": 2.8026, "step": 1203 }, { "epoch": 0.33506104984867985, "grad_norm": 2.002673625946045, "learning_rate": 0.00018664992223919798, "loss": 2.8022, "step": 1204 }, { "epoch": 0.33533933975719205, "grad_norm": 2.4368367195129395, "learning_rate": 0.00018662806337048368, "loss": 3.0865, "step": 1205 }, { "epoch": 0.33561762966570424, "grad_norm": 2.409742593765259, "learning_rate": 0.00018660618790313144, "loss": 3.2996, "step": 1206 }, { "epoch": 0.33589591957421644, "grad_norm": 2.15346097946167, "learning_rate": 0.00018658429584133282, "loss": 2.8494, "step": 1207 }, { "epoch": 0.33617420948272864, "grad_norm": 2.4256703853607178, "learning_rate": 0.00018656238718928246, "loss": 3.0829, "step": 1208 }, { "epoch": 0.33645249939124083, "grad_norm": 2.2442879676818848, "learning_rate": 0.00018654046195117828, "loss": 3.1098, "step": 1209 }, { "epoch": 0.33673078929975303, "grad_norm": 2.427661418914795, "learning_rate": 0.00018651852013122132, "loss": 2.9827, "step": 1210 }, { "epoch": 0.3370090792082652, "grad_norm": 2.412055253982544, "learning_rate": 0.00018649656173361578, "loss": 2.976, "step": 1211 }, { "epoch": 0.3372873691167774, "grad_norm": 2.4291296005249023, "learning_rate": 0.0001864745867625691, "loss": 2.7787, "step": 1212 }, { "epoch": 0.3375656590252896, "grad_norm": 2.6005985736846924, "learning_rate": 0.00018645259522229187, "loss": 3.0154, "step": 1213 }, { "epoch": 0.33784394893380176, "grad_norm": 2.0380475521087646, "learning_rate": 0.00018643058711699783, "loss": 3.1137, "step": 1214 }, { "epoch": 0.33812223884231396, "grad_norm": 2.2061920166015625, "learning_rate": 0.0001864085624509039, "loss": 3.1765, "step": 1215 }, { "epoch": 0.33840052875082616, "grad_norm": 2.370047092437744, "learning_rate": 0.00018638652122823023, "loss": 3.1502, "step": 1216 }, { "epoch": 0.33867881865933835, "grad_norm": 2.099005699157715, "learning_rate": 0.00018636446345320003, "loss": 3.084, "step": 1217 }, { "epoch": 0.33895710856785055, "grad_norm": 2.661806583404541, "learning_rate": 0.00018634238913003978, "loss": 2.8901, "step": 1218 }, { "epoch": 0.33923539847636275, "grad_norm": 2.3678128719329834, "learning_rate": 0.0001863202982629791, "loss": 2.9861, "step": 1219 }, { "epoch": 0.33951368838487495, "grad_norm": 2.0430805683135986, "learning_rate": 0.0001862981908562508, "loss": 2.6691, "step": 1220 }, { "epoch": 0.33979197829338714, "grad_norm": 2.113291025161743, "learning_rate": 0.00018627606691409084, "loss": 2.9175, "step": 1221 }, { "epoch": 0.34007026820189934, "grad_norm": 2.258105993270874, "learning_rate": 0.00018625392644073827, "loss": 2.8762, "step": 1222 }, { "epoch": 0.34034855811041154, "grad_norm": 2.689866304397583, "learning_rate": 0.00018623176944043547, "loss": 2.9577, "step": 1223 }, { "epoch": 0.34062684801892373, "grad_norm": 2.453442335128784, "learning_rate": 0.00018620959591742785, "loss": 3.0686, "step": 1224 }, { "epoch": 0.34090513792743593, "grad_norm": 2.409374952316284, "learning_rate": 0.00018618740587596406, "loss": 2.9682, "step": 1225 }, { "epoch": 0.34118342783594807, "grad_norm": 2.186138153076172, "learning_rate": 0.0001861651993202959, "loss": 2.661, "step": 1226 }, { "epoch": 0.34146171774446027, "grad_norm": 2.307302236557007, "learning_rate": 0.0001861429762546783, "loss": 2.693, "step": 1227 }, { "epoch": 0.34174000765297247, "grad_norm": 2.4118337631225586, "learning_rate": 0.0001861207366833694, "loss": 2.986, "step": 1228 }, { "epoch": 0.34201829756148466, "grad_norm": 2.199756383895874, "learning_rate": 0.00018609848061063047, "loss": 2.976, "step": 1229 }, { "epoch": 0.34229658746999686, "grad_norm": 2.42448353767395, "learning_rate": 0.00018607620804072595, "loss": 2.7873, "step": 1230 }, { "epoch": 0.34257487737850906, "grad_norm": 2.298520803451538, "learning_rate": 0.00018605391897792342, "loss": 2.7695, "step": 1231 }, { "epoch": 0.34285316728702125, "grad_norm": 2.01639723777771, "learning_rate": 0.00018603161342649376, "loss": 3.0318, "step": 1232 }, { "epoch": 0.34313145719553345, "grad_norm": 2.57551646232605, "learning_rate": 0.00018600929139071074, "loss": 3.013, "step": 1233 }, { "epoch": 0.34340974710404565, "grad_norm": 1.730008602142334, "learning_rate": 0.00018598695287485155, "loss": 2.7117, "step": 1234 }, { "epoch": 0.34368803701255785, "grad_norm": 2.3833367824554443, "learning_rate": 0.00018596459788319634, "loss": 2.7706, "step": 1235 }, { "epoch": 0.34396632692107004, "grad_norm": 2.1640148162841797, "learning_rate": 0.00018594222642002864, "loss": 2.778, "step": 1236 }, { "epoch": 0.34424461682958224, "grad_norm": 2.302320957183838, "learning_rate": 0.00018591983848963487, "loss": 2.7422, "step": 1237 }, { "epoch": 0.34452290673809444, "grad_norm": 2.096182346343994, "learning_rate": 0.0001858974340963048, "loss": 2.6134, "step": 1238 }, { "epoch": 0.3448011966466066, "grad_norm": 2.0788567066192627, "learning_rate": 0.0001858750132443313, "loss": 2.88, "step": 1239 }, { "epoch": 0.3450794865551188, "grad_norm": 2.3331961631774902, "learning_rate": 0.00018585257593801036, "loss": 2.8588, "step": 1240 }, { "epoch": 0.34535777646363097, "grad_norm": 2.0921545028686523, "learning_rate": 0.00018583012218164116, "loss": 3.0093, "step": 1241 }, { "epoch": 0.34563606637214317, "grad_norm": 2.7631418704986572, "learning_rate": 0.00018580765197952602, "loss": 3.0754, "step": 1242 }, { "epoch": 0.34591435628065537, "grad_norm": 2.3819169998168945, "learning_rate": 0.00018578516533597044, "loss": 3.0273, "step": 1243 }, { "epoch": 0.34619264618916756, "grad_norm": 2.3277432918548584, "learning_rate": 0.00018576266225528302, "loss": 3.1449, "step": 1244 }, { "epoch": 0.34647093609767976, "grad_norm": 2.1631033420562744, "learning_rate": 0.00018574014274177548, "loss": 2.7666, "step": 1245 }, { "epoch": 0.34674922600619196, "grad_norm": 2.7257604598999023, "learning_rate": 0.00018571760679976285, "loss": 3.1857, "step": 1246 }, { "epoch": 0.34702751591470415, "grad_norm": 2.586211681365967, "learning_rate": 0.00018569505443356312, "loss": 3.0004, "step": 1247 }, { "epoch": 0.34730580582321635, "grad_norm": 2.1937501430511475, "learning_rate": 0.00018567248564749754, "loss": 2.8664, "step": 1248 }, { "epoch": 0.34758409573172855, "grad_norm": 2.3358423709869385, "learning_rate": 0.00018564990044589043, "loss": 2.8615, "step": 1249 }, { "epoch": 0.34786238564024075, "grad_norm": 2.6659228801727295, "learning_rate": 0.00018562729883306932, "loss": 3.072, "step": 1250 }, { "epoch": 0.3481406755487529, "grad_norm": 2.256988286972046, "learning_rate": 0.0001856046808133649, "loss": 2.781, "step": 1251 }, { "epoch": 0.3484189654572651, "grad_norm": 2.5885608196258545, "learning_rate": 0.00018558204639111092, "loss": 2.9498, "step": 1252 }, { "epoch": 0.3486972553657773, "grad_norm": 2.403681755065918, "learning_rate": 0.00018555939557064432, "loss": 2.9491, "step": 1253 }, { "epoch": 0.3489755452742895, "grad_norm": 2.3021979331970215, "learning_rate": 0.00018553672835630523, "loss": 3.1435, "step": 1254 }, { "epoch": 0.3492538351828017, "grad_norm": 2.3900492191314697, "learning_rate": 0.0001855140447524368, "loss": 2.7499, "step": 1255 }, { "epoch": 0.34953212509131387, "grad_norm": 2.352330446243286, "learning_rate": 0.00018549134476338543, "loss": 3.0155, "step": 1256 }, { "epoch": 0.34981041499982607, "grad_norm": 2.046834707260132, "learning_rate": 0.00018546862839350064, "loss": 2.6364, "step": 1257 }, { "epoch": 0.35008870490833827, "grad_norm": 2.1589837074279785, "learning_rate": 0.00018544589564713503, "loss": 2.9764, "step": 1258 }, { "epoch": 0.35036699481685046, "grad_norm": 2.3603148460388184, "learning_rate": 0.0001854231465286444, "loss": 3.0283, "step": 1259 }, { "epoch": 0.35064528472536266, "grad_norm": 2.3142430782318115, "learning_rate": 0.00018540038104238767, "loss": 3.0008, "step": 1260 }, { "epoch": 0.35092357463387486, "grad_norm": 2.688842535018921, "learning_rate": 0.00018537759919272685, "loss": 3.0417, "step": 1261 }, { "epoch": 0.35120186454238705, "grad_norm": 3.017772912979126, "learning_rate": 0.0001853548009840272, "loss": 3.1862, "step": 1262 }, { "epoch": 0.35148015445089925, "grad_norm": 2.5841875076293945, "learning_rate": 0.000185331986420657, "loss": 2.9738, "step": 1263 }, { "epoch": 0.3517584443594114, "grad_norm": 2.1404669284820557, "learning_rate": 0.0001853091555069877, "loss": 3.0049, "step": 1264 }, { "epoch": 0.3520367342679236, "grad_norm": 2.4357874393463135, "learning_rate": 0.00018528630824739387, "loss": 3.2524, "step": 1265 }, { "epoch": 0.3523150241764358, "grad_norm": 2.330873727798462, "learning_rate": 0.00018526344464625324, "loss": 2.9843, "step": 1266 }, { "epoch": 0.352593314084948, "grad_norm": 2.2773749828338623, "learning_rate": 0.0001852405647079467, "loss": 3.0321, "step": 1267 }, { "epoch": 0.3528716039934602, "grad_norm": 2.774960994720459, "learning_rate": 0.00018521766843685824, "loss": 3.0679, "step": 1268 }, { "epoch": 0.3531498939019724, "grad_norm": 2.3752331733703613, "learning_rate": 0.00018519475583737488, "loss": 3.2168, "step": 1269 }, { "epoch": 0.3534281838104846, "grad_norm": 2.853553056716919, "learning_rate": 0.00018517182691388695, "loss": 2.831, "step": 1270 }, { "epoch": 0.3537064737189968, "grad_norm": 2.2781498432159424, "learning_rate": 0.00018514888167078775, "loss": 2.9166, "step": 1271 }, { "epoch": 0.35398476362750897, "grad_norm": 2.0867795944213867, "learning_rate": 0.00018512592011247386, "loss": 2.6106, "step": 1272 }, { "epoch": 0.35426305353602117, "grad_norm": 2.3298819065093994, "learning_rate": 0.00018510294224334482, "loss": 2.7466, "step": 1273 }, { "epoch": 0.35454134344453336, "grad_norm": 2.4238321781158447, "learning_rate": 0.00018507994806780343, "loss": 3.1735, "step": 1274 }, { "epoch": 0.35481963335304556, "grad_norm": 2.2738101482391357, "learning_rate": 0.00018505693759025547, "loss": 2.9927, "step": 1275 }, { "epoch": 0.3550979232615577, "grad_norm": 3.5487821102142334, "learning_rate": 0.00018503391081511005, "loss": 3.0086, "step": 1276 }, { "epoch": 0.3553762131700699, "grad_norm": 2.731464385986328, "learning_rate": 0.00018501086774677924, "loss": 3.0356, "step": 1277 }, { "epoch": 0.3556545030785821, "grad_norm": 2.3948864936828613, "learning_rate": 0.00018498780838967823, "loss": 2.8605, "step": 1278 }, { "epoch": 0.3559327929870943, "grad_norm": 1.9798177480697632, "learning_rate": 0.00018496473274822546, "loss": 2.8806, "step": 1279 }, { "epoch": 0.3562110828956065, "grad_norm": 2.6703426837921143, "learning_rate": 0.00018494164082684235, "loss": 3.128, "step": 1280 }, { "epoch": 0.3564893728041187, "grad_norm": 2.574873208999634, "learning_rate": 0.00018491853262995352, "loss": 3.069, "step": 1281 }, { "epoch": 0.3567676627126309, "grad_norm": 2.4518730640411377, "learning_rate": 0.00018489540816198668, "loss": 3.114, "step": 1282 }, { "epoch": 0.3570459526211431, "grad_norm": 2.36063551902771, "learning_rate": 0.00018487226742737271, "loss": 2.9518, "step": 1283 }, { "epoch": 0.3573242425296553, "grad_norm": 2.0314536094665527, "learning_rate": 0.00018484911043054552, "loss": 2.7532, "step": 1284 }, { "epoch": 0.3576025324381675, "grad_norm": 2.2699084281921387, "learning_rate": 0.00018482593717594214, "loss": 3.2118, "step": 1285 }, { "epoch": 0.3578808223466797, "grad_norm": 2.3084986209869385, "learning_rate": 0.0001848027476680028, "loss": 2.7185, "step": 1286 }, { "epoch": 0.35815911225519187, "grad_norm": 2.1672518253326416, "learning_rate": 0.00018477954191117084, "loss": 2.8249, "step": 1287 }, { "epoch": 0.35843740216370407, "grad_norm": 2.4308488368988037, "learning_rate": 0.0001847563199098926, "loss": 2.7898, "step": 1288 }, { "epoch": 0.3587156920722162, "grad_norm": 2.3692209720611572, "learning_rate": 0.0001847330816686176, "loss": 2.7709, "step": 1289 }, { "epoch": 0.3589939819807284, "grad_norm": 2.290740489959717, "learning_rate": 0.00018470982719179853, "loss": 3.0153, "step": 1290 }, { "epoch": 0.3592722718892406, "grad_norm": 2.3506410121917725, "learning_rate": 0.00018468655648389113, "loss": 2.8632, "step": 1291 }, { "epoch": 0.3595505617977528, "grad_norm": 2.177306652069092, "learning_rate": 0.0001846632695493542, "loss": 2.9484, "step": 1292 }, { "epoch": 0.359828851706265, "grad_norm": 2.3661012649536133, "learning_rate": 0.00018463996639264977, "loss": 3.236, "step": 1293 }, { "epoch": 0.3601071416147772, "grad_norm": 2.252408742904663, "learning_rate": 0.00018461664701824285, "loss": 2.8989, "step": 1294 }, { "epoch": 0.3603854315232894, "grad_norm": 2.205371141433716, "learning_rate": 0.00018459331143060167, "loss": 2.7423, "step": 1295 }, { "epoch": 0.3606637214318016, "grad_norm": 1.961328387260437, "learning_rate": 0.00018456995963419753, "loss": 2.6397, "step": 1296 }, { "epoch": 0.3609420113403138, "grad_norm": 2.966557741165161, "learning_rate": 0.00018454659163350475, "loss": 3.0631, "step": 1297 }, { "epoch": 0.361220301248826, "grad_norm": 2.3127055168151855, "learning_rate": 0.00018452320743300087, "loss": 2.8856, "step": 1298 }, { "epoch": 0.3614985911573382, "grad_norm": 2.226360559463501, "learning_rate": 0.00018449980703716653, "loss": 2.871, "step": 1299 }, { "epoch": 0.3617768810658504, "grad_norm": 2.0332190990448, "learning_rate": 0.00018447639045048538, "loss": 2.6323, "step": 1300 }, { "epoch": 0.3617768810658504, "eval_loss": 2.9565489292144775, "eval_runtime": 84.8186, "eval_samples_per_second": 58.949, "eval_steps_per_second": 14.737, "step": 1300 }, { "epoch": 0.3620551709743625, "grad_norm": 2.044790029525757, "learning_rate": 0.00018445295767744423, "loss": 2.7435, "step": 1301 }, { "epoch": 0.3623334608828747, "grad_norm": 2.4415640830993652, "learning_rate": 0.00018442950872253304, "loss": 3.0958, "step": 1302 }, { "epoch": 0.3626117507913869, "grad_norm": 2.256223440170288, "learning_rate": 0.00018440604359024477, "loss": 3.0521, "step": 1303 }, { "epoch": 0.3628900406998991, "grad_norm": 2.5509045124053955, "learning_rate": 0.00018438256228507554, "loss": 3.1475, "step": 1304 }, { "epoch": 0.3631683306084113, "grad_norm": 2.2218732833862305, "learning_rate": 0.0001843590648115246, "loss": 2.8882, "step": 1305 }, { "epoch": 0.3634466205169235, "grad_norm": 2.1533758640289307, "learning_rate": 0.00018433555117409415, "loss": 2.7792, "step": 1306 }, { "epoch": 0.3637249104254357, "grad_norm": 2.6469674110412598, "learning_rate": 0.0001843120213772897, "loss": 3.0074, "step": 1307 }, { "epoch": 0.3640032003339479, "grad_norm": 2.3371036052703857, "learning_rate": 0.00018428847542561972, "loss": 3.0115, "step": 1308 }, { "epoch": 0.3642814902424601, "grad_norm": 2.3841440677642822, "learning_rate": 0.00018426491332359577, "loss": 2.8204, "step": 1309 }, { "epoch": 0.3645597801509723, "grad_norm": 2.222301959991455, "learning_rate": 0.0001842413350757326, "loss": 2.8691, "step": 1310 }, { "epoch": 0.3648380700594845, "grad_norm": 2.8514020442962646, "learning_rate": 0.00018421774068654793, "loss": 3.0531, "step": 1311 }, { "epoch": 0.3651163599679967, "grad_norm": 2.4131250381469727, "learning_rate": 0.00018419413016056269, "loss": 3.1359, "step": 1312 }, { "epoch": 0.3653946498765088, "grad_norm": 2.293592929840088, "learning_rate": 0.0001841705035023008, "loss": 2.8887, "step": 1313 }, { "epoch": 0.365672939785021, "grad_norm": 2.175602674484253, "learning_rate": 0.00018414686071628936, "loss": 2.9775, "step": 1314 }, { "epoch": 0.3659512296935332, "grad_norm": 3.348400592803955, "learning_rate": 0.0001841232018070585, "loss": 3.1677, "step": 1315 }, { "epoch": 0.3662295196020454, "grad_norm": 2.385610580444336, "learning_rate": 0.00018409952677914145, "loss": 2.7199, "step": 1316 }, { "epoch": 0.3665078095105576, "grad_norm": 2.138465642929077, "learning_rate": 0.00018407583563707455, "loss": 2.9353, "step": 1317 }, { "epoch": 0.3667860994190698, "grad_norm": 2.5164780616760254, "learning_rate": 0.0001840521283853972, "loss": 3.0106, "step": 1318 }, { "epoch": 0.367064389327582, "grad_norm": 2.4383127689361572, "learning_rate": 0.00018402840502865193, "loss": 3.0327, "step": 1319 }, { "epoch": 0.3673426792360942, "grad_norm": 2.5241663455963135, "learning_rate": 0.00018400466557138428, "loss": 3.2177, "step": 1320 }, { "epoch": 0.3676209691446064, "grad_norm": 2.467475175857544, "learning_rate": 0.000183980910018143, "loss": 2.7847, "step": 1321 }, { "epoch": 0.3678992590531186, "grad_norm": 2.362370491027832, "learning_rate": 0.00018395713837347978, "loss": 2.583, "step": 1322 }, { "epoch": 0.3681775489616308, "grad_norm": 1.8929369449615479, "learning_rate": 0.00018393335064194942, "loss": 2.8805, "step": 1323 }, { "epoch": 0.368455838870143, "grad_norm": 2.5111165046691895, "learning_rate": 0.00018390954682810998, "loss": 2.8882, "step": 1324 }, { "epoch": 0.3687341287786552, "grad_norm": 4.528006553649902, "learning_rate": 0.00018388572693652235, "loss": 2.8355, "step": 1325 }, { "epoch": 0.36901241868716733, "grad_norm": 2.292663335800171, "learning_rate": 0.00018386189097175062, "loss": 2.5853, "step": 1326 }, { "epoch": 0.36929070859567953, "grad_norm": 2.3462979793548584, "learning_rate": 0.000183838038938362, "loss": 2.9652, "step": 1327 }, { "epoch": 0.3695689985041917, "grad_norm": 2.1878912448883057, "learning_rate": 0.00018381417084092672, "loss": 3.1027, "step": 1328 }, { "epoch": 0.3698472884127039, "grad_norm": 2.509308338165283, "learning_rate": 0.0001837902866840181, "loss": 3.0436, "step": 1329 }, { "epoch": 0.3701255783212161, "grad_norm": 2.632751941680908, "learning_rate": 0.00018376638647221252, "loss": 2.9865, "step": 1330 }, { "epoch": 0.3704038682297283, "grad_norm": 2.833408832550049, "learning_rate": 0.00018374247021008946, "loss": 2.9175, "step": 1331 }, { "epoch": 0.3706821581382405, "grad_norm": 2.488046407699585, "learning_rate": 0.00018371853790223147, "loss": 3.3372, "step": 1332 }, { "epoch": 0.3709604480467527, "grad_norm": 2.5615451335906982, "learning_rate": 0.00018369458955322417, "loss": 3.1065, "step": 1333 }, { "epoch": 0.3712387379552649, "grad_norm": 2.0403623580932617, "learning_rate": 0.00018367062516765629, "loss": 2.7281, "step": 1334 }, { "epoch": 0.3715170278637771, "grad_norm": 2.0974042415618896, "learning_rate": 0.0001836466447501196, "loss": 2.8068, "step": 1335 }, { "epoch": 0.3717953177722893, "grad_norm": 2.064469814300537, "learning_rate": 0.00018362264830520884, "loss": 2.7345, "step": 1336 }, { "epoch": 0.3720736076808015, "grad_norm": 2.310638904571533, "learning_rate": 0.00018359863583752205, "loss": 3.0449, "step": 1337 }, { "epoch": 0.37235189758931364, "grad_norm": 2.5461196899414062, "learning_rate": 0.00018357460735166016, "loss": 2.8948, "step": 1338 }, { "epoch": 0.37263018749782584, "grad_norm": 2.4266250133514404, "learning_rate": 0.00018355056285222724, "loss": 3.0434, "step": 1339 }, { "epoch": 0.37290847740633803, "grad_norm": 2.2776870727539062, "learning_rate": 0.00018352650234383037, "loss": 2.8748, "step": 1340 }, { "epoch": 0.37318676731485023, "grad_norm": 2.034586191177368, "learning_rate": 0.00018350242583107978, "loss": 2.9048, "step": 1341 }, { "epoch": 0.37346505722336243, "grad_norm": 2.4456920623779297, "learning_rate": 0.00018347833331858873, "loss": 2.9367, "step": 1342 }, { "epoch": 0.3737433471318746, "grad_norm": 2.24980092048645, "learning_rate": 0.00018345422481097346, "loss": 2.8973, "step": 1343 }, { "epoch": 0.3740216370403868, "grad_norm": 2.620724678039551, "learning_rate": 0.00018343010031285348, "loss": 2.9836, "step": 1344 }, { "epoch": 0.374299926948899, "grad_norm": 2.2938642501831055, "learning_rate": 0.00018340595982885112, "loss": 2.7217, "step": 1345 }, { "epoch": 0.3745782168574112, "grad_norm": 2.3017830848693848, "learning_rate": 0.000183381803363592, "loss": 2.8486, "step": 1346 }, { "epoch": 0.3748565067659234, "grad_norm": 2.300840139389038, "learning_rate": 0.0001833576309217046, "loss": 3.0812, "step": 1347 }, { "epoch": 0.3751347966744356, "grad_norm": 2.4301412105560303, "learning_rate": 0.00018333344250782058, "loss": 3.0507, "step": 1348 }, { "epoch": 0.3754130865829478, "grad_norm": 2.183340311050415, "learning_rate": 0.0001833092381265747, "loss": 2.7738, "step": 1349 }, { "epoch": 0.37569137649146, "grad_norm": 2.429962396621704, "learning_rate": 0.00018328501778260464, "loss": 2.9313, "step": 1350 }, { "epoch": 0.37596966639997215, "grad_norm": 2.4895594120025635, "learning_rate": 0.00018326078148055123, "loss": 2.8792, "step": 1351 }, { "epoch": 0.37624795630848434, "grad_norm": 2.1060776710510254, "learning_rate": 0.0001832365292250584, "loss": 2.8478, "step": 1352 }, { "epoch": 0.37652624621699654, "grad_norm": 2.075732469558716, "learning_rate": 0.00018321226102077298, "loss": 2.6956, "step": 1353 }, { "epoch": 0.37680453612550874, "grad_norm": 2.378427028656006, "learning_rate": 0.00018318797687234503, "loss": 3.1764, "step": 1354 }, { "epoch": 0.37708282603402093, "grad_norm": 2.3772432804107666, "learning_rate": 0.00018316367678442757, "loss": 3.0689, "step": 1355 }, { "epoch": 0.37736111594253313, "grad_norm": 2.570065975189209, "learning_rate": 0.00018313936076167666, "loss": 2.8864, "step": 1356 }, { "epoch": 0.37763940585104533, "grad_norm": 2.3763134479522705, "learning_rate": 0.0001831150288087515, "loss": 3.0035, "step": 1357 }, { "epoch": 0.3779176957595575, "grad_norm": 2.29880690574646, "learning_rate": 0.00018309068093031426, "loss": 2.6676, "step": 1358 }, { "epoch": 0.3781959856680697, "grad_norm": 2.540256977081299, "learning_rate": 0.00018306631713103016, "loss": 2.8762, "step": 1359 }, { "epoch": 0.3784742755765819, "grad_norm": 2.2924606800079346, "learning_rate": 0.00018304193741556759, "loss": 3.0847, "step": 1360 }, { "epoch": 0.3787525654850941, "grad_norm": 2.3403427600860596, "learning_rate": 0.0001830175417885978, "loss": 2.9828, "step": 1361 }, { "epoch": 0.3790308553936063, "grad_norm": 2.344900131225586, "learning_rate": 0.00018299313025479529, "loss": 3.0659, "step": 1362 }, { "epoch": 0.37930914530211846, "grad_norm": 1.8387023210525513, "learning_rate": 0.0001829687028188374, "loss": 2.7874, "step": 1363 }, { "epoch": 0.37958743521063065, "grad_norm": 2.3810815811157227, "learning_rate": 0.0001829442594854047, "loss": 2.9831, "step": 1364 }, { "epoch": 0.37986572511914285, "grad_norm": 2.8099541664123535, "learning_rate": 0.00018291980025918073, "loss": 3.1119, "step": 1365 }, { "epoch": 0.38014401502765505, "grad_norm": 2.33979868888855, "learning_rate": 0.00018289532514485202, "loss": 2.7826, "step": 1366 }, { "epoch": 0.38042230493616724, "grad_norm": 2.4608840942382812, "learning_rate": 0.00018287083414710823, "loss": 2.9991, "step": 1367 }, { "epoch": 0.38070059484467944, "grad_norm": 1.9969059228897095, "learning_rate": 0.00018284632727064203, "loss": 2.9048, "step": 1368 }, { "epoch": 0.38097888475319164, "grad_norm": 2.0154013633728027, "learning_rate": 0.00018282180452014915, "loss": 2.7983, "step": 1369 }, { "epoch": 0.38125717466170383, "grad_norm": 2.377565860748291, "learning_rate": 0.00018279726590032837, "loss": 2.9977, "step": 1370 }, { "epoch": 0.38153546457021603, "grad_norm": 2.7190558910369873, "learning_rate": 0.00018277271141588144, "loss": 3.1377, "step": 1371 }, { "epoch": 0.38181375447872823, "grad_norm": 2.4035136699676514, "learning_rate": 0.00018274814107151324, "loss": 2.9027, "step": 1372 }, { "epoch": 0.3820920443872404, "grad_norm": 2.223487377166748, "learning_rate": 0.00018272355487193158, "loss": 2.8337, "step": 1373 }, { "epoch": 0.3823703342957526, "grad_norm": 2.5261576175689697, "learning_rate": 0.00018269895282184743, "loss": 3.3779, "step": 1374 }, { "epoch": 0.3826486242042648, "grad_norm": 2.3695907592773438, "learning_rate": 0.00018267433492597475, "loss": 2.9574, "step": 1375 }, { "epoch": 0.38292691411277696, "grad_norm": 2.470168352127075, "learning_rate": 0.0001826497011890305, "loss": 2.7848, "step": 1376 }, { "epoch": 0.38320520402128916, "grad_norm": 2.7634096145629883, "learning_rate": 0.0001826250516157347, "loss": 3.0878, "step": 1377 }, { "epoch": 0.38348349392980136, "grad_norm": 2.455291986465454, "learning_rate": 0.00018260038621081042, "loss": 3.2018, "step": 1378 }, { "epoch": 0.38376178383831355, "grad_norm": 2.4405229091644287, "learning_rate": 0.00018257570497898375, "loss": 2.9115, "step": 1379 }, { "epoch": 0.38404007374682575, "grad_norm": 2.1084277629852295, "learning_rate": 0.00018255100792498384, "loss": 2.8855, "step": 1380 }, { "epoch": 0.38431836365533795, "grad_norm": 2.243788003921509, "learning_rate": 0.00018252629505354276, "loss": 3.0133, "step": 1381 }, { "epoch": 0.38459665356385014, "grad_norm": 2.46830677986145, "learning_rate": 0.0001825015663693958, "loss": 3.0427, "step": 1382 }, { "epoch": 0.38487494347236234, "grad_norm": 2.2948153018951416, "learning_rate": 0.00018247682187728112, "loss": 2.6683, "step": 1383 }, { "epoch": 0.38515323338087454, "grad_norm": 2.787569999694824, "learning_rate": 0.00018245206158194, "loss": 3.2823, "step": 1384 }, { "epoch": 0.38543152328938673, "grad_norm": 2.697896957397461, "learning_rate": 0.00018242728548811666, "loss": 3.035, "step": 1385 }, { "epoch": 0.38570981319789893, "grad_norm": 2.236534357070923, "learning_rate": 0.00018240249360055842, "loss": 3.159, "step": 1386 }, { "epoch": 0.38598810310641113, "grad_norm": 2.1746883392333984, "learning_rate": 0.00018237768592401564, "loss": 2.8667, "step": 1387 }, { "epoch": 0.38626639301492327, "grad_norm": 2.9784903526306152, "learning_rate": 0.00018235286246324166, "loss": 3.0382, "step": 1388 }, { "epoch": 0.38654468292343547, "grad_norm": 2.4932610988616943, "learning_rate": 0.00018232802322299283, "loss": 3.0112, "step": 1389 }, { "epoch": 0.38682297283194766, "grad_norm": 2.500810146331787, "learning_rate": 0.00018230316820802854, "loss": 2.9854, "step": 1390 }, { "epoch": 0.38710126274045986, "grad_norm": 2.681528329849243, "learning_rate": 0.00018227829742311125, "loss": 2.9093, "step": 1391 }, { "epoch": 0.38737955264897206, "grad_norm": 2.3117756843566895, "learning_rate": 0.00018225341087300639, "loss": 3.1773, "step": 1392 }, { "epoch": 0.38765784255748426, "grad_norm": 2.189483165740967, "learning_rate": 0.00018222850856248243, "loss": 2.823, "step": 1393 }, { "epoch": 0.38793613246599645, "grad_norm": 2.4919378757476807, "learning_rate": 0.00018220359049631081, "loss": 2.9995, "step": 1394 }, { "epoch": 0.38821442237450865, "grad_norm": 2.5016541481018066, "learning_rate": 0.00018217865667926613, "loss": 2.7971, "step": 1395 }, { "epoch": 0.38849271228302085, "grad_norm": 2.149334192276001, "learning_rate": 0.0001821537071161258, "loss": 2.5613, "step": 1396 }, { "epoch": 0.38877100219153304, "grad_norm": 2.0272927284240723, "learning_rate": 0.00018212874181167043, "loss": 2.6662, "step": 1397 }, { "epoch": 0.38904929210004524, "grad_norm": 2.082967519760132, "learning_rate": 0.00018210376077068357, "loss": 2.9858, "step": 1398 }, { "epoch": 0.38932758200855744, "grad_norm": 2.2828497886657715, "learning_rate": 0.00018207876399795172, "loss": 3.1637, "step": 1399 }, { "epoch": 0.38960587191706963, "grad_norm": 2.3362624645233154, "learning_rate": 0.00018205375149826455, "loss": 3.0745, "step": 1400 }, { "epoch": 0.38960587191706963, "eval_loss": 2.9541642665863037, "eval_runtime": 85.0978, "eval_samples_per_second": 58.756, "eval_steps_per_second": 14.689, "step": 1400 }, { "epoch": 0.3898841618255818, "grad_norm": 2.4108872413635254, "learning_rate": 0.00018202872327641463, "loss": 3.0583, "step": 1401 }, { "epoch": 0.390162451734094, "grad_norm": 2.1015937328338623, "learning_rate": 0.00018200367933719758, "loss": 3.0834, "step": 1402 }, { "epoch": 0.39044074164260617, "grad_norm": 2.0665242671966553, "learning_rate": 0.00018197861968541194, "loss": 2.7568, "step": 1403 }, { "epoch": 0.39071903155111837, "grad_norm": 2.3367726802825928, "learning_rate": 0.00018195354432585948, "loss": 2.7112, "step": 1404 }, { "epoch": 0.39099732145963056, "grad_norm": 2.6426053047180176, "learning_rate": 0.00018192845326334473, "loss": 3.1849, "step": 1405 }, { "epoch": 0.39127561136814276, "grad_norm": 2.535113573074341, "learning_rate": 0.00018190334650267537, "loss": 2.7451, "step": 1406 }, { "epoch": 0.39155390127665496, "grad_norm": 2.4003348350524902, "learning_rate": 0.0001818782240486621, "loss": 2.8772, "step": 1407 }, { "epoch": 0.39183219118516716, "grad_norm": 2.5059261322021484, "learning_rate": 0.00018185308590611853, "loss": 2.8768, "step": 1408 }, { "epoch": 0.39211048109367935, "grad_norm": 2.162773609161377, "learning_rate": 0.0001818279320798614, "loss": 2.9664, "step": 1409 }, { "epoch": 0.39238877100219155, "grad_norm": 2.4576668739318848, "learning_rate": 0.0001818027625747103, "loss": 2.9933, "step": 1410 }, { "epoch": 0.39266706091070375, "grad_norm": 2.5110888481140137, "learning_rate": 0.00018177757739548794, "loss": 2.9718, "step": 1411 }, { "epoch": 0.39294535081921594, "grad_norm": 2.80082368850708, "learning_rate": 0.00018175237654702008, "loss": 2.6028, "step": 1412 }, { "epoch": 0.3932236407277281, "grad_norm": 2.3873651027679443, "learning_rate": 0.0001817271600341353, "loss": 3.296, "step": 1413 }, { "epoch": 0.3935019306362403, "grad_norm": 2.438391923904419, "learning_rate": 0.0001817019278616653, "loss": 2.843, "step": 1414 }, { "epoch": 0.3937802205447525, "grad_norm": 2.227487564086914, "learning_rate": 0.00018167668003444485, "loss": 2.7055, "step": 1415 }, { "epoch": 0.3940585104532647, "grad_norm": 2.178060531616211, "learning_rate": 0.00018165141655731158, "loss": 2.8064, "step": 1416 }, { "epoch": 0.3943368003617769, "grad_norm": 2.306391716003418, "learning_rate": 0.00018162613743510618, "loss": 2.7714, "step": 1417 }, { "epoch": 0.39461509027028907, "grad_norm": 2.22808837890625, "learning_rate": 0.00018160084267267233, "loss": 2.8243, "step": 1418 }, { "epoch": 0.39489338017880127, "grad_norm": 2.75333571434021, "learning_rate": 0.0001815755322748567, "loss": 2.9365, "step": 1419 }, { "epoch": 0.39517167008731346, "grad_norm": 2.7594099044799805, "learning_rate": 0.00018155020624650902, "loss": 2.9503, "step": 1420 }, { "epoch": 0.39544995999582566, "grad_norm": 1.9924191236495972, "learning_rate": 0.00018152486459248194, "loss": 2.9466, "step": 1421 }, { "epoch": 0.39572824990433786, "grad_norm": 1.8259479999542236, "learning_rate": 0.00018149950731763108, "loss": 2.7602, "step": 1422 }, { "epoch": 0.39600653981285006, "grad_norm": 3.152134895324707, "learning_rate": 0.00018147413442681516, "loss": 3.0988, "step": 1423 }, { "epoch": 0.39628482972136225, "grad_norm": 2.5566935539245605, "learning_rate": 0.00018144874592489579, "loss": 3.0311, "step": 1424 }, { "epoch": 0.3965631196298744, "grad_norm": 2.0833349227905273, "learning_rate": 0.00018142334181673762, "loss": 2.8655, "step": 1425 }, { "epoch": 0.3968414095383866, "grad_norm": 2.335747718811035, "learning_rate": 0.00018139792210720832, "loss": 2.6954, "step": 1426 }, { "epoch": 0.3971196994468988, "grad_norm": 3.1068949699401855, "learning_rate": 0.00018137248680117845, "loss": 3.0796, "step": 1427 }, { "epoch": 0.397397989355411, "grad_norm": 2.250394344329834, "learning_rate": 0.0001813470359035217, "loss": 3.0936, "step": 1428 }, { "epoch": 0.3976762792639232, "grad_norm": 2.428577184677124, "learning_rate": 0.00018132156941911456, "loss": 2.7821, "step": 1429 }, { "epoch": 0.3979545691724354, "grad_norm": 2.3767282962799072, "learning_rate": 0.0001812960873528367, "loss": 2.8127, "step": 1430 }, { "epoch": 0.3982328590809476, "grad_norm": 2.3917484283447266, "learning_rate": 0.00018127058970957068, "loss": 2.8294, "step": 1431 }, { "epoch": 0.3985111489894598, "grad_norm": 2.293307065963745, "learning_rate": 0.00018124507649420202, "loss": 2.8585, "step": 1432 }, { "epoch": 0.39878943889797197, "grad_norm": 2.6107118129730225, "learning_rate": 0.0001812195477116193, "loss": 3.0687, "step": 1433 }, { "epoch": 0.39906772880648417, "grad_norm": 2.404802083969116, "learning_rate": 0.00018119400336671403, "loss": 2.8775, "step": 1434 }, { "epoch": 0.39934601871499636, "grad_norm": 2.3755240440368652, "learning_rate": 0.00018116844346438064, "loss": 3.2697, "step": 1435 }, { "epoch": 0.39962430862350856, "grad_norm": 2.1464850902557373, "learning_rate": 0.00018114286800951672, "loss": 3.089, "step": 1436 }, { "epoch": 0.39990259853202076, "grad_norm": 2.3471181392669678, "learning_rate": 0.0001811172770070227, "loss": 3.2725, "step": 1437 }, { "epoch": 0.4001808884405329, "grad_norm": 2.137349843978882, "learning_rate": 0.00018109167046180202, "loss": 2.6994, "step": 1438 }, { "epoch": 0.4004591783490451, "grad_norm": 2.530853509902954, "learning_rate": 0.00018106604837876106, "loss": 3.0677, "step": 1439 }, { "epoch": 0.4007374682575573, "grad_norm": 3.030200242996216, "learning_rate": 0.00018104041076280928, "loss": 3.2513, "step": 1440 }, { "epoch": 0.4010157581660695, "grad_norm": 2.3604588508605957, "learning_rate": 0.00018101475761885902, "loss": 2.8667, "step": 1441 }, { "epoch": 0.4012940480745817, "grad_norm": 2.1922221183776855, "learning_rate": 0.00018098908895182564, "loss": 2.9412, "step": 1442 }, { "epoch": 0.4015723379830939, "grad_norm": 2.783278226852417, "learning_rate": 0.00018096340476662746, "loss": 3.1869, "step": 1443 }, { "epoch": 0.4018506278916061, "grad_norm": 2.3102259635925293, "learning_rate": 0.0001809377050681858, "loss": 3.0143, "step": 1444 }, { "epoch": 0.4021289178001183, "grad_norm": 2.044894218444824, "learning_rate": 0.00018091198986142489, "loss": 2.6595, "step": 1445 }, { "epoch": 0.4024072077086305, "grad_norm": 2.6115221977233887, "learning_rate": 0.00018088625915127203, "loss": 2.9902, "step": 1446 }, { "epoch": 0.4026854976171427, "grad_norm": 2.597118854522705, "learning_rate": 0.00018086051294265734, "loss": 3.0778, "step": 1447 }, { "epoch": 0.40296378752565487, "grad_norm": 2.434523105621338, "learning_rate": 0.0001808347512405141, "loss": 2.7931, "step": 1448 }, { "epoch": 0.40324207743416707, "grad_norm": 2.3377387523651123, "learning_rate": 0.00018080897404977838, "loss": 3.0064, "step": 1449 }, { "epoch": 0.4035203673426792, "grad_norm": 2.474585771560669, "learning_rate": 0.00018078318137538935, "loss": 3.2966, "step": 1450 }, { "epoch": 0.4037986572511914, "grad_norm": 2.1286842823028564, "learning_rate": 0.0001807573732222891, "loss": 2.8943, "step": 1451 }, { "epoch": 0.4040769471597036, "grad_norm": 2.2833456993103027, "learning_rate": 0.00018073154959542264, "loss": 3.0101, "step": 1452 }, { "epoch": 0.4043552370682158, "grad_norm": 2.3003273010253906, "learning_rate": 0.000180705710499738, "loss": 2.9603, "step": 1453 }, { "epoch": 0.404633526976728, "grad_norm": 2.6145360469818115, "learning_rate": 0.00018067985594018618, "loss": 2.9547, "step": 1454 }, { "epoch": 0.4049118168852402, "grad_norm": 2.5264997482299805, "learning_rate": 0.00018065398592172113, "loss": 3.0331, "step": 1455 }, { "epoch": 0.4051901067937524, "grad_norm": 2.5674047470092773, "learning_rate": 0.00018062810044929974, "loss": 3.1826, "step": 1456 }, { "epoch": 0.4054683967022646, "grad_norm": 2.6137619018554688, "learning_rate": 0.00018060219952788185, "loss": 3.1445, "step": 1457 }, { "epoch": 0.4057466866107768, "grad_norm": 2.243896007537842, "learning_rate": 0.0001805762831624303, "loss": 3.0134, "step": 1458 }, { "epoch": 0.406024976519289, "grad_norm": 2.181788921356201, "learning_rate": 0.0001805503513579109, "loss": 2.6341, "step": 1459 }, { "epoch": 0.4063032664278012, "grad_norm": 2.20489764213562, "learning_rate": 0.00018052440411929233, "loss": 2.9065, "step": 1460 }, { "epoch": 0.4065815563363134, "grad_norm": 2.16267728805542, "learning_rate": 0.0001804984414515464, "loss": 3.1237, "step": 1461 }, { "epoch": 0.4068598462448256, "grad_norm": 2.1957852840423584, "learning_rate": 0.0001804724633596477, "loss": 2.7409, "step": 1462 }, { "epoch": 0.4071381361533377, "grad_norm": 2.026777982711792, "learning_rate": 0.0001804464698485738, "loss": 2.6759, "step": 1463 }, { "epoch": 0.4074164260618499, "grad_norm": 2.3886356353759766, "learning_rate": 0.00018042046092330534, "loss": 2.7539, "step": 1464 }, { "epoch": 0.4076947159703621, "grad_norm": 8.196935653686523, "learning_rate": 0.0001803944365888258, "loss": 2.9382, "step": 1465 }, { "epoch": 0.4079730058788743, "grad_norm": 2.6158368587493896, "learning_rate": 0.0001803683968501217, "loss": 3.0821, "step": 1466 }, { "epoch": 0.4082512957873865, "grad_norm": 2.287116050720215, "learning_rate": 0.00018034234171218242, "loss": 2.9287, "step": 1467 }, { "epoch": 0.4085295856958987, "grad_norm": 2.333550214767456, "learning_rate": 0.00018031627118000033, "loss": 2.9122, "step": 1468 }, { "epoch": 0.4088078756044109, "grad_norm": 2.1937859058380127, "learning_rate": 0.0001802901852585708, "loss": 2.7261, "step": 1469 }, { "epoch": 0.4090861655129231, "grad_norm": 2.578164577484131, "learning_rate": 0.00018026408395289205, "loss": 3.0152, "step": 1470 }, { "epoch": 0.4093644554214353, "grad_norm": 2.1487278938293457, "learning_rate": 0.00018023796726796535, "loss": 2.9244, "step": 1471 }, { "epoch": 0.4096427453299475, "grad_norm": 2.3235023021698, "learning_rate": 0.00018021183520879484, "loss": 3.0808, "step": 1472 }, { "epoch": 0.4099210352384597, "grad_norm": 2.7452189922332764, "learning_rate": 0.00018018568778038763, "loss": 3.0424, "step": 1473 }, { "epoch": 0.4101993251469719, "grad_norm": 2.248053550720215, "learning_rate": 0.0001801595249877538, "loss": 2.8951, "step": 1474 }, { "epoch": 0.410477615055484, "grad_norm": 2.1908910274505615, "learning_rate": 0.0001801333468359063, "loss": 2.7813, "step": 1475 }, { "epoch": 0.4107559049639962, "grad_norm": 2.190885305404663, "learning_rate": 0.0001801071533298612, "loss": 2.9323, "step": 1476 }, { "epoch": 0.4110341948725084, "grad_norm": 2.3519840240478516, "learning_rate": 0.00018008094447463725, "loss": 2.9599, "step": 1477 }, { "epoch": 0.4113124847810206, "grad_norm": 2.7468619346618652, "learning_rate": 0.00018005472027525634, "loss": 2.9602, "step": 1478 }, { "epoch": 0.4115907746895328, "grad_norm": 2.930952787399292, "learning_rate": 0.0001800284807367432, "loss": 3.2061, "step": 1479 }, { "epoch": 0.411869064598045, "grad_norm": 2.1983838081359863, "learning_rate": 0.00018000222586412562, "loss": 2.9376, "step": 1480 }, { "epoch": 0.4121473545065572, "grad_norm": 2.3161911964416504, "learning_rate": 0.0001799759556624342, "loss": 3.0711, "step": 1481 }, { "epoch": 0.4124256444150694, "grad_norm": 2.4388632774353027, "learning_rate": 0.0001799496701367025, "loss": 3.0021, "step": 1482 }, { "epoch": 0.4127039343235816, "grad_norm": 2.499904155731201, "learning_rate": 0.00017992336929196706, "loss": 3.0225, "step": 1483 }, { "epoch": 0.4129822242320938, "grad_norm": 2.290944814682007, "learning_rate": 0.00017989705313326733, "loss": 3.0056, "step": 1484 }, { "epoch": 0.413260514140606, "grad_norm": 2.6194465160369873, "learning_rate": 0.00017987072166564572, "loss": 3.1444, "step": 1485 }, { "epoch": 0.4135388040491182, "grad_norm": 2.6370837688446045, "learning_rate": 0.00017984437489414749, "loss": 2.7918, "step": 1486 }, { "epoch": 0.4138170939576304, "grad_norm": 2.184260129928589, "learning_rate": 0.000179818012823821, "loss": 2.7755, "step": 1487 }, { "epoch": 0.41409538386614253, "grad_norm": 2.328777313232422, "learning_rate": 0.00017979163545971734, "loss": 2.7521, "step": 1488 }, { "epoch": 0.4143736737746547, "grad_norm": 2.630589246749878, "learning_rate": 0.00017976524280689064, "loss": 3.1421, "step": 1489 }, { "epoch": 0.4146519636831669, "grad_norm": 2.952857255935669, "learning_rate": 0.000179738834870398, "loss": 3.1612, "step": 1490 }, { "epoch": 0.4149302535916791, "grad_norm": 2.2236733436584473, "learning_rate": 0.00017971241165529937, "loss": 3.1822, "step": 1491 }, { "epoch": 0.4152085435001913, "grad_norm": 2.3405508995056152, "learning_rate": 0.00017968597316665762, "loss": 3.2457, "step": 1492 }, { "epoch": 0.4154868334087035, "grad_norm": 2.100637674331665, "learning_rate": 0.0001796595194095386, "loss": 2.9315, "step": 1493 }, { "epoch": 0.4157651233172157, "grad_norm": 2.5437636375427246, "learning_rate": 0.0001796330503890111, "loss": 3.2034, "step": 1494 }, { "epoch": 0.4160434132257279, "grad_norm": 2.3208770751953125, "learning_rate": 0.00017960656611014676, "loss": 2.7522, "step": 1495 }, { "epoch": 0.4163217031342401, "grad_norm": 2.535043716430664, "learning_rate": 0.00017958006657802015, "loss": 3.1059, "step": 1496 }, { "epoch": 0.4165999930427523, "grad_norm": 2.4839468002319336, "learning_rate": 0.00017955355179770886, "loss": 3.1948, "step": 1497 }, { "epoch": 0.4168782829512645, "grad_norm": 2.0940611362457275, "learning_rate": 0.0001795270217742933, "loss": 2.9136, "step": 1498 }, { "epoch": 0.4171565728597767, "grad_norm": 2.3662772178649902, "learning_rate": 0.00017950047651285681, "loss": 2.9115, "step": 1499 }, { "epoch": 0.41743486276828884, "grad_norm": 2.6298537254333496, "learning_rate": 0.0001794739160184858, "loss": 3.0388, "step": 1500 }, { "epoch": 0.41743486276828884, "eval_loss": 2.9435160160064697, "eval_runtime": 84.3342, "eval_samples_per_second": 59.288, "eval_steps_per_second": 14.822, "step": 1500 }, { "epoch": 0.41771315267680104, "grad_norm": 2.355015277862549, "learning_rate": 0.0001794473402962693, "loss": 3.0685, "step": 1501 }, { "epoch": 0.41799144258531323, "grad_norm": 2.1867897510528564, "learning_rate": 0.00017942074935129956, "loss": 2.8054, "step": 1502 }, { "epoch": 0.41826973249382543, "grad_norm": 2.3667449951171875, "learning_rate": 0.00017939414318867157, "loss": 2.8193, "step": 1503 }, { "epoch": 0.4185480224023376, "grad_norm": 2.2092385292053223, "learning_rate": 0.00017936752181348334, "loss": 3.0683, "step": 1504 }, { "epoch": 0.4188263123108498, "grad_norm": 2.6852476596832275, "learning_rate": 0.00017934088523083563, "loss": 3.056, "step": 1505 }, { "epoch": 0.419104602219362, "grad_norm": 2.1904942989349365, "learning_rate": 0.00017931423344583232, "loss": 3.0908, "step": 1506 }, { "epoch": 0.4193828921278742, "grad_norm": 2.3850483894348145, "learning_rate": 0.00017928756646358012, "loss": 2.8461, "step": 1507 }, { "epoch": 0.4196611820363864, "grad_norm": 1.9621740579605103, "learning_rate": 0.00017926088428918857, "loss": 2.8933, "step": 1508 }, { "epoch": 0.4199394719448986, "grad_norm": 2.3844521045684814, "learning_rate": 0.0001792341869277702, "loss": 2.6984, "step": 1509 }, { "epoch": 0.4202177618534108, "grad_norm": 2.2282097339630127, "learning_rate": 0.00017920747438444047, "loss": 2.9991, "step": 1510 }, { "epoch": 0.420496051761923, "grad_norm": 2.1430187225341797, "learning_rate": 0.00017918074666431775, "loss": 2.7905, "step": 1511 }, { "epoch": 0.4207743416704352, "grad_norm": 2.2480509281158447, "learning_rate": 0.00017915400377252323, "loss": 2.8307, "step": 1512 }, { "epoch": 0.42105263157894735, "grad_norm": 2.027968645095825, "learning_rate": 0.00017912724571418105, "loss": 2.9861, "step": 1513 }, { "epoch": 0.42133092148745954, "grad_norm": 2.4502811431884766, "learning_rate": 0.00017910047249441834, "loss": 2.871, "step": 1514 }, { "epoch": 0.42160921139597174, "grad_norm": 2.58532977104187, "learning_rate": 0.000179073684118365, "loss": 2.9273, "step": 1515 }, { "epoch": 0.42188750130448394, "grad_norm": 2.673048973083496, "learning_rate": 0.00017904688059115396, "loss": 3.0988, "step": 1516 }, { "epoch": 0.42216579121299613, "grad_norm": 1.960426688194275, "learning_rate": 0.00017902006191792094, "loss": 2.7689, "step": 1517 }, { "epoch": 0.42244408112150833, "grad_norm": 2.131484031677246, "learning_rate": 0.00017899322810380463, "loss": 3.032, "step": 1518 }, { "epoch": 0.4227223710300205, "grad_norm": 1.9887473583221436, "learning_rate": 0.0001789663791539466, "loss": 2.7202, "step": 1519 }, { "epoch": 0.4230006609385327, "grad_norm": 2.6431562900543213, "learning_rate": 0.00017893951507349137, "loss": 3.1044, "step": 1520 }, { "epoch": 0.4232789508470449, "grad_norm": 2.392472505569458, "learning_rate": 0.0001789126358675863, "loss": 3.2007, "step": 1521 }, { "epoch": 0.4235572407555571, "grad_norm": 2.0452799797058105, "learning_rate": 0.0001788857415413816, "loss": 2.8725, "step": 1522 }, { "epoch": 0.4238355306640693, "grad_norm": 2.0101158618927, "learning_rate": 0.0001788588321000305, "loss": 2.8801, "step": 1523 }, { "epoch": 0.4241138205725815, "grad_norm": 2.2835144996643066, "learning_rate": 0.00017883190754868907, "loss": 2.8852, "step": 1524 }, { "epoch": 0.42439211048109365, "grad_norm": 2.1649763584136963, "learning_rate": 0.00017880496789251623, "loss": 2.9195, "step": 1525 }, { "epoch": 0.42467040038960585, "grad_norm": 2.120333433151245, "learning_rate": 0.0001787780131366739, "loss": 2.7889, "step": 1526 }, { "epoch": 0.42494869029811805, "grad_norm": 2.2283036708831787, "learning_rate": 0.00017875104328632678, "loss": 2.9201, "step": 1527 }, { "epoch": 0.42522698020663025, "grad_norm": 2.597425699234009, "learning_rate": 0.00017872405834664252, "loss": 3.0986, "step": 1528 }, { "epoch": 0.42550527011514244, "grad_norm": 2.5603561401367188, "learning_rate": 0.00017869705832279165, "loss": 3.2406, "step": 1529 }, { "epoch": 0.42578356002365464, "grad_norm": 2.330618381500244, "learning_rate": 0.00017867004321994758, "loss": 2.8447, "step": 1530 }, { "epoch": 0.42606184993216684, "grad_norm": 2.3854737281799316, "learning_rate": 0.00017864301304328664, "loss": 3.105, "step": 1531 }, { "epoch": 0.42634013984067903, "grad_norm": 2.227254867553711, "learning_rate": 0.0001786159677979881, "loss": 3.0417, "step": 1532 }, { "epoch": 0.42661842974919123, "grad_norm": 2.4053795337677, "learning_rate": 0.00017858890748923388, "loss": 2.8188, "step": 1533 }, { "epoch": 0.4268967196577034, "grad_norm": 2.2057714462280273, "learning_rate": 0.00017856183212220912, "loss": 2.9842, "step": 1534 }, { "epoch": 0.4271750095662156, "grad_norm": 2.175656318664551, "learning_rate": 0.0001785347417021016, "loss": 2.9128, "step": 1535 }, { "epoch": 0.4274532994747278, "grad_norm": 2.4099249839782715, "learning_rate": 0.00017850763623410205, "loss": 2.8526, "step": 1536 }, { "epoch": 0.42773158938323996, "grad_norm": 2.112323522567749, "learning_rate": 0.00017848051572340415, "loss": 2.889, "step": 1537 }, { "epoch": 0.42800987929175216, "grad_norm": 2.391981601715088, "learning_rate": 0.00017845338017520438, "loss": 2.9846, "step": 1538 }, { "epoch": 0.42828816920026436, "grad_norm": 2.3300626277923584, "learning_rate": 0.0001784262295947021, "loss": 2.8795, "step": 1539 }, { "epoch": 0.42856645910877655, "grad_norm": 2.054837465286255, "learning_rate": 0.0001783990639870997, "loss": 2.649, "step": 1540 }, { "epoch": 0.42884474901728875, "grad_norm": 2.4826412200927734, "learning_rate": 0.00017837188335760216, "loss": 2.7401, "step": 1541 }, { "epoch": 0.42912303892580095, "grad_norm": 2.3597049713134766, "learning_rate": 0.0001783446877114176, "loss": 2.9801, "step": 1542 }, { "epoch": 0.42940132883431315, "grad_norm": 2.2011847496032715, "learning_rate": 0.00017831747705375698, "loss": 2.9762, "step": 1543 }, { "epoch": 0.42967961874282534, "grad_norm": 2.2656610012054443, "learning_rate": 0.00017829025138983397, "loss": 2.8144, "step": 1544 }, { "epoch": 0.42995790865133754, "grad_norm": 2.0530154705047607, "learning_rate": 0.0001782630107248653, "loss": 2.9091, "step": 1545 }, { "epoch": 0.43023619855984974, "grad_norm": 2.2800612449645996, "learning_rate": 0.00017823575506407046, "loss": 2.8905, "step": 1546 }, { "epoch": 0.43051448846836193, "grad_norm": 2.688504934310913, "learning_rate": 0.0001782084844126719, "loss": 3.1475, "step": 1547 }, { "epoch": 0.43079277837687413, "grad_norm": 2.4335620403289795, "learning_rate": 0.00017818119877589485, "loss": 2.6545, "step": 1548 }, { "epoch": 0.4310710682853863, "grad_norm": 2.1354079246520996, "learning_rate": 0.0001781538981589675, "loss": 2.7739, "step": 1549 }, { "epoch": 0.43134935819389847, "grad_norm": 2.6244606971740723, "learning_rate": 0.00017812658256712081, "loss": 3.1742, "step": 1550 }, { "epoch": 0.43162764810241067, "grad_norm": 2.0910539627075195, "learning_rate": 0.00017809925200558876, "loss": 3.0865, "step": 1551 }, { "epoch": 0.43190593801092286, "grad_norm": 2.3477671146392822, "learning_rate": 0.00017807190647960806, "loss": 3.2576, "step": 1552 }, { "epoch": 0.43218422791943506, "grad_norm": 2.5532398223876953, "learning_rate": 0.0001780445459944183, "loss": 3.0099, "step": 1553 }, { "epoch": 0.43246251782794726, "grad_norm": 2.4760825634002686, "learning_rate": 0.000178017170555262, "loss": 3.1642, "step": 1554 }, { "epoch": 0.43274080773645945, "grad_norm": 2.578453302383423, "learning_rate": 0.00017798978016738453, "loss": 3.197, "step": 1555 }, { "epoch": 0.43301909764497165, "grad_norm": 2.3926427364349365, "learning_rate": 0.00017796237483603408, "loss": 2.7734, "step": 1556 }, { "epoch": 0.43329738755348385, "grad_norm": 2.307000160217285, "learning_rate": 0.0001779349545664618, "loss": 3.2266, "step": 1557 }, { "epoch": 0.43357567746199605, "grad_norm": 2.1902518272399902, "learning_rate": 0.00017790751936392151, "loss": 2.812, "step": 1558 }, { "epoch": 0.43385396737050824, "grad_norm": 2.178658962249756, "learning_rate": 0.00017788006923367015, "loss": 2.8877, "step": 1559 }, { "epoch": 0.43413225727902044, "grad_norm": 1.8742996454238892, "learning_rate": 0.0001778526041809673, "loss": 2.6613, "step": 1560 }, { "epoch": 0.43441054718753264, "grad_norm": 2.5255515575408936, "learning_rate": 0.00017782512421107553, "loss": 2.9941, "step": 1561 }, { "epoch": 0.4346888370960448, "grad_norm": 2.3451552391052246, "learning_rate": 0.00017779762932926022, "loss": 2.984, "step": 1562 }, { "epoch": 0.434967127004557, "grad_norm": 2.5891788005828857, "learning_rate": 0.00017777011954078957, "loss": 2.5272, "step": 1563 }, { "epoch": 0.43524541691306917, "grad_norm": 2.427964448928833, "learning_rate": 0.00017774259485093472, "loss": 2.9097, "step": 1564 }, { "epoch": 0.43552370682158137, "grad_norm": 2.2478299140930176, "learning_rate": 0.0001777150552649696, "loss": 3.2605, "step": 1565 }, { "epoch": 0.43580199673009357, "grad_norm": 2.092874050140381, "learning_rate": 0.0001776875007881711, "loss": 3.0447, "step": 1566 }, { "epoch": 0.43608028663860576, "grad_norm": 2.724999189376831, "learning_rate": 0.00017765993142581873, "loss": 2.9951, "step": 1567 }, { "epoch": 0.43635857654711796, "grad_norm": 2.448967695236206, "learning_rate": 0.0001776323471831951, "loss": 2.9322, "step": 1568 }, { "epoch": 0.43663686645563016, "grad_norm": 2.169391393661499, "learning_rate": 0.00017760474806558557, "loss": 2.8168, "step": 1569 }, { "epoch": 0.43691515636414235, "grad_norm": 2.412353754043579, "learning_rate": 0.00017757713407827837, "loss": 3.0926, "step": 1570 }, { "epoch": 0.43719344627265455, "grad_norm": 2.1152477264404297, "learning_rate": 0.0001775495052265645, "loss": 2.9102, "step": 1571 }, { "epoch": 0.43747173618116675, "grad_norm": 2.273615837097168, "learning_rate": 0.00017752186151573794, "loss": 2.8067, "step": 1572 }, { "epoch": 0.43775002608967895, "grad_norm": 2.2339484691619873, "learning_rate": 0.00017749420295109537, "loss": 2.7762, "step": 1573 }, { "epoch": 0.43802831599819114, "grad_norm": 2.0991029739379883, "learning_rate": 0.0001774665295379365, "loss": 2.8056, "step": 1574 }, { "epoch": 0.4383066059067033, "grad_norm": 2.323793411254883, "learning_rate": 0.00017743884128156368, "loss": 2.7193, "step": 1575 }, { "epoch": 0.4385848958152155, "grad_norm": 2.1427690982818604, "learning_rate": 0.00017741113818728224, "loss": 2.9011, "step": 1576 }, { "epoch": 0.4388631857237277, "grad_norm": 2.667240858078003, "learning_rate": 0.00017738342026040032, "loss": 2.9217, "step": 1577 }, { "epoch": 0.4391414756322399, "grad_norm": 2.4806602001190186, "learning_rate": 0.00017735568750622895, "loss": 3.0521, "step": 1578 }, { "epoch": 0.43941976554075207, "grad_norm": 2.2599003314971924, "learning_rate": 0.00017732793993008186, "loss": 2.7249, "step": 1579 }, { "epoch": 0.43969805544926427, "grad_norm": 2.069152355194092, "learning_rate": 0.00017730017753727573, "loss": 2.5847, "step": 1580 }, { "epoch": 0.43997634535777647, "grad_norm": 2.539839029312134, "learning_rate": 0.00017727240033313014, "loss": 2.9386, "step": 1581 }, { "epoch": 0.44025463526628866, "grad_norm": 2.547626256942749, "learning_rate": 0.00017724460832296734, "loss": 3.1169, "step": 1582 }, { "epoch": 0.44053292517480086, "grad_norm": 2.6569392681121826, "learning_rate": 0.00017721680151211252, "loss": 3.1089, "step": 1583 }, { "epoch": 0.44081121508331306, "grad_norm": 1.8564096689224243, "learning_rate": 0.0001771889799058937, "loss": 2.7479, "step": 1584 }, { "epoch": 0.44108950499182525, "grad_norm": 2.853278875350952, "learning_rate": 0.00017716114350964173, "loss": 3.0734, "step": 1585 }, { "epoch": 0.44136779490033745, "grad_norm": 2.1204781532287598, "learning_rate": 0.0001771332923286903, "loss": 2.886, "step": 1586 }, { "epoch": 0.4416460848088496, "grad_norm": 2.0781548023223877, "learning_rate": 0.00017710542636837587, "loss": 2.805, "step": 1587 }, { "epoch": 0.4419243747173618, "grad_norm": 2.2901318073272705, "learning_rate": 0.00017707754563403788, "loss": 3.0889, "step": 1588 }, { "epoch": 0.442202664625874, "grad_norm": 2.003519058227539, "learning_rate": 0.00017704965013101842, "loss": 2.9303, "step": 1589 }, { "epoch": 0.4424809545343862, "grad_norm": 2.4573092460632324, "learning_rate": 0.00017702173986466248, "loss": 2.7841, "step": 1590 }, { "epoch": 0.4427592444428984, "grad_norm": 2.2894668579101562, "learning_rate": 0.00017699381484031799, "loss": 2.7556, "step": 1591 }, { "epoch": 0.4430375343514106, "grad_norm": 2.7957422733306885, "learning_rate": 0.00017696587506333552, "loss": 3.024, "step": 1592 }, { "epoch": 0.4433158242599228, "grad_norm": 2.393179178237915, "learning_rate": 0.0001769379205390686, "loss": 3.0884, "step": 1593 }, { "epoch": 0.44359411416843497, "grad_norm": 2.266326427459717, "learning_rate": 0.00017690995127287354, "loss": 2.7727, "step": 1594 }, { "epoch": 0.44387240407694717, "grad_norm": 2.2660281658172607, "learning_rate": 0.00017688196727010943, "loss": 2.7035, "step": 1595 }, { "epoch": 0.44415069398545937, "grad_norm": 2.773442029953003, "learning_rate": 0.0001768539685361383, "loss": 3.0823, "step": 1596 }, { "epoch": 0.44442898389397156, "grad_norm": 1.9260491132736206, "learning_rate": 0.0001768259550763249, "loss": 2.4221, "step": 1597 }, { "epoch": 0.44470727380248376, "grad_norm": 2.427091360092163, "learning_rate": 0.00017679792689603686, "loss": 2.8745, "step": 1598 }, { "epoch": 0.44498556371099596, "grad_norm": 4.168886661529541, "learning_rate": 0.00017676988400064456, "loss": 3.077, "step": 1599 }, { "epoch": 0.4452638536195081, "grad_norm": 2.2594480514526367, "learning_rate": 0.0001767418263955213, "loss": 2.9508, "step": 1600 }, { "epoch": 0.4452638536195081, "eval_loss": 2.9355742931365967, "eval_runtime": 84.3786, "eval_samples_per_second": 59.257, "eval_steps_per_second": 14.814, "step": 1600 }, { "epoch": 0.4455421435280203, "grad_norm": 2.4243345260620117, "learning_rate": 0.0001767137540860431, "loss": 2.7506, "step": 1601 }, { "epoch": 0.4458204334365325, "grad_norm": 2.1864430904388428, "learning_rate": 0.00017668566707758888, "loss": 2.7049, "step": 1602 }, { "epoch": 0.4460987233450447, "grad_norm": 2.722066879272461, "learning_rate": 0.0001766575653755403, "loss": 3.1618, "step": 1603 }, { "epoch": 0.4463770132535569, "grad_norm": 2.351907968521118, "learning_rate": 0.00017662944898528187, "loss": 2.8834, "step": 1604 }, { "epoch": 0.4466553031620691, "grad_norm": 2.2368266582489014, "learning_rate": 0.00017660131791220098, "loss": 2.8789, "step": 1605 }, { "epoch": 0.4469335930705813, "grad_norm": 2.610078811645508, "learning_rate": 0.0001765731721616877, "loss": 3.0625, "step": 1606 }, { "epoch": 0.4472118829790935, "grad_norm": 2.120957851409912, "learning_rate": 0.000176545011739135, "loss": 2.8811, "step": 1607 }, { "epoch": 0.4474901728876057, "grad_norm": 2.471010208129883, "learning_rate": 0.0001765168366499387, "loss": 2.8656, "step": 1608 }, { "epoch": 0.44776846279611787, "grad_norm": 2.2618563175201416, "learning_rate": 0.0001764886468994973, "loss": 3.0036, "step": 1609 }, { "epoch": 0.44804675270463007, "grad_norm": 2.091439962387085, "learning_rate": 0.00017646044249321224, "loss": 3.1173, "step": 1610 }, { "epoch": 0.44832504261314227, "grad_norm": 2.485386371612549, "learning_rate": 0.00017643222343648768, "loss": 3.1456, "step": 1611 }, { "epoch": 0.4486033325216544, "grad_norm": 2.4731528759002686, "learning_rate": 0.0001764039897347306, "loss": 3.1075, "step": 1612 }, { "epoch": 0.4488816224301666, "grad_norm": 2.559610605239868, "learning_rate": 0.0001763757413933509, "loss": 3.0695, "step": 1613 }, { "epoch": 0.4491599123386788, "grad_norm": 2.3829660415649414, "learning_rate": 0.0001763474784177611, "loss": 3.0681, "step": 1614 }, { "epoch": 0.449438202247191, "grad_norm": 2.4688544273376465, "learning_rate": 0.00017631920081337662, "loss": 3.0147, "step": 1615 }, { "epoch": 0.4497164921557032, "grad_norm": 2.0860488414764404, "learning_rate": 0.0001762909085856157, "loss": 2.9386, "step": 1616 }, { "epoch": 0.4499947820642154, "grad_norm": 2.3536834716796875, "learning_rate": 0.0001762626017398994, "loss": 2.7838, "step": 1617 }, { "epoch": 0.4502730719727276, "grad_norm": 2.3266372680664062, "learning_rate": 0.00017623428028165145, "loss": 2.9087, "step": 1618 }, { "epoch": 0.4505513618812398, "grad_norm": 2.3906242847442627, "learning_rate": 0.00017620594421629852, "loss": 2.9404, "step": 1619 }, { "epoch": 0.450829651789752, "grad_norm": 2.5523111820220947, "learning_rate": 0.00017617759354927005, "loss": 3.0738, "step": 1620 }, { "epoch": 0.4511079416982642, "grad_norm": 2.6853137016296387, "learning_rate": 0.00017614922828599822, "loss": 2.7857, "step": 1621 }, { "epoch": 0.4513862316067764, "grad_norm": 2.377764940261841, "learning_rate": 0.0001761208484319181, "loss": 3.1861, "step": 1622 }, { "epoch": 0.4516645215152886, "grad_norm": 2.2885422706604004, "learning_rate": 0.0001760924539924674, "loss": 2.9549, "step": 1623 }, { "epoch": 0.45194281142380077, "grad_norm": 2.169076681137085, "learning_rate": 0.00017606404497308683, "loss": 3.0951, "step": 1624 }, { "epoch": 0.4522211013323129, "grad_norm": 2.308926820755005, "learning_rate": 0.00017603562137921976, "loss": 3.3127, "step": 1625 }, { "epoch": 0.4524993912408251, "grad_norm": 2.438093662261963, "learning_rate": 0.00017600718321631234, "loss": 2.9343, "step": 1626 }, { "epoch": 0.4527776811493373, "grad_norm": 2.600412368774414, "learning_rate": 0.00017597873048981355, "loss": 2.7667, "step": 1627 }, { "epoch": 0.4530559710578495, "grad_norm": 2.2359116077423096, "learning_rate": 0.0001759502632051752, "loss": 2.588, "step": 1628 }, { "epoch": 0.4533342609663617, "grad_norm": 2.453897714614868, "learning_rate": 0.00017592178136785185, "loss": 2.8986, "step": 1629 }, { "epoch": 0.4536125508748739, "grad_norm": 2.4181478023529053, "learning_rate": 0.0001758932849833008, "loss": 2.7649, "step": 1630 }, { "epoch": 0.4538908407833861, "grad_norm": 2.176375150680542, "learning_rate": 0.0001758647740569823, "loss": 2.9607, "step": 1631 }, { "epoch": 0.4541691306918983, "grad_norm": 2.327392578125, "learning_rate": 0.00017583624859435917, "loss": 2.9129, "step": 1632 }, { "epoch": 0.4544474206004105, "grad_norm": 2.254091739654541, "learning_rate": 0.00017580770860089714, "loss": 3.029, "step": 1633 }, { "epoch": 0.4547257105089227, "grad_norm": 2.1933653354644775, "learning_rate": 0.00017577915408206472, "loss": 2.9775, "step": 1634 }, { "epoch": 0.4550040004174349, "grad_norm": 2.023534059524536, "learning_rate": 0.00017575058504333319, "loss": 2.737, "step": 1635 }, { "epoch": 0.4552822903259471, "grad_norm": 2.896252155303955, "learning_rate": 0.0001757220014901766, "loss": 2.9723, "step": 1636 }, { "epoch": 0.4555605802344592, "grad_norm": 2.4600837230682373, "learning_rate": 0.00017569340342807178, "loss": 2.9181, "step": 1637 }, { "epoch": 0.4558388701429714, "grad_norm": 1.8405340909957886, "learning_rate": 0.0001756647908624984, "loss": 2.7309, "step": 1638 }, { "epoch": 0.4561171600514836, "grad_norm": 2.1959388256073, "learning_rate": 0.00017563616379893878, "loss": 3.0731, "step": 1639 }, { "epoch": 0.4563954499599958, "grad_norm": 1.9801900386810303, "learning_rate": 0.00017560752224287813, "loss": 2.8508, "step": 1640 }, { "epoch": 0.456673739868508, "grad_norm": 2.4894955158233643, "learning_rate": 0.00017557886619980445, "loss": 3.1301, "step": 1641 }, { "epoch": 0.4569520297770202, "grad_norm": 2.240042209625244, "learning_rate": 0.00017555019567520842, "loss": 2.8062, "step": 1642 }, { "epoch": 0.4572303196855324, "grad_norm": 2.7217319011688232, "learning_rate": 0.00017552151067458353, "loss": 3.1598, "step": 1643 }, { "epoch": 0.4575086095940446, "grad_norm": 2.206531286239624, "learning_rate": 0.00017549281120342612, "loss": 2.7675, "step": 1644 }, { "epoch": 0.4577868995025568, "grad_norm": 2.2856290340423584, "learning_rate": 0.00017546409726723519, "loss": 2.9288, "step": 1645 }, { "epoch": 0.458065189411069, "grad_norm": 2.3853166103363037, "learning_rate": 0.00017543536887151256, "loss": 2.937, "step": 1646 }, { "epoch": 0.4583434793195812, "grad_norm": 2.459928035736084, "learning_rate": 0.00017540662602176282, "loss": 2.8787, "step": 1647 }, { "epoch": 0.4586217692280934, "grad_norm": 2.4138965606689453, "learning_rate": 0.00017537786872349342, "loss": 2.8926, "step": 1648 }, { "epoch": 0.45890005913660553, "grad_norm": 3.99428391456604, "learning_rate": 0.00017534909698221438, "loss": 3.0472, "step": 1649 }, { "epoch": 0.45917834904511773, "grad_norm": 2.7470438480377197, "learning_rate": 0.00017532031080343864, "loss": 2.826, "step": 1650 }, { "epoch": 0.4594566389536299, "grad_norm": 2.4054617881774902, "learning_rate": 0.0001752915101926819, "loss": 2.6596, "step": 1651 }, { "epoch": 0.4597349288621421, "grad_norm": 2.2495157718658447, "learning_rate": 0.00017526269515546255, "loss": 2.5785, "step": 1652 }, { "epoch": 0.4600132187706543, "grad_norm": 2.405332088470459, "learning_rate": 0.00017523386569730177, "loss": 3.0094, "step": 1653 }, { "epoch": 0.4602915086791665, "grad_norm": 2.4681594371795654, "learning_rate": 0.00017520502182372355, "loss": 2.724, "step": 1654 }, { "epoch": 0.4605697985876787, "grad_norm": 2.491534948348999, "learning_rate": 0.00017517616354025459, "loss": 2.8024, "step": 1655 }, { "epoch": 0.4608480884961909, "grad_norm": 2.2967476844787598, "learning_rate": 0.0001751472908524244, "loss": 2.8969, "step": 1656 }, { "epoch": 0.4611263784047031, "grad_norm": 2.5176398754119873, "learning_rate": 0.0001751184037657652, "loss": 2.9074, "step": 1657 }, { "epoch": 0.4614046683132153, "grad_norm": 2.1492137908935547, "learning_rate": 0.00017508950228581197, "loss": 2.8794, "step": 1658 }, { "epoch": 0.4616829582217275, "grad_norm": 2.1307902336120605, "learning_rate": 0.0001750605864181025, "loss": 2.6807, "step": 1659 }, { "epoch": 0.4619612481302397, "grad_norm": 2.7678709030151367, "learning_rate": 0.0001750316561681773, "loss": 3.1124, "step": 1660 }, { "epoch": 0.4622395380387519, "grad_norm": 2.0988805294036865, "learning_rate": 0.0001750027115415796, "loss": 2.8745, "step": 1661 }, { "epoch": 0.46251782794726404, "grad_norm": 2.491716146469116, "learning_rate": 0.00017497375254385547, "loss": 3.2251, "step": 1662 }, { "epoch": 0.46279611785577623, "grad_norm": 2.5652554035186768, "learning_rate": 0.00017494477918055367, "loss": 3.1244, "step": 1663 }, { "epoch": 0.46307440776428843, "grad_norm": 2.098722457885742, "learning_rate": 0.00017491579145722575, "loss": 2.9644, "step": 1664 }, { "epoch": 0.46335269767280063, "grad_norm": 2.2444772720336914, "learning_rate": 0.00017488678937942597, "loss": 3.0687, "step": 1665 }, { "epoch": 0.4636309875813128, "grad_norm": 2.2089200019836426, "learning_rate": 0.0001748577729527114, "loss": 2.8738, "step": 1666 }, { "epoch": 0.463909277489825, "grad_norm": 2.3964178562164307, "learning_rate": 0.00017482874218264173, "loss": 2.9886, "step": 1667 }, { "epoch": 0.4641875673983372, "grad_norm": 2.341874122619629, "learning_rate": 0.0001747996970747796, "loss": 2.8725, "step": 1668 }, { "epoch": 0.4644658573068494, "grad_norm": 2.014882802963257, "learning_rate": 0.00017477063763469023, "loss": 2.8374, "step": 1669 }, { "epoch": 0.4647441472153616, "grad_norm": 2.30665922164917, "learning_rate": 0.00017474156386794166, "loss": 2.8668, "step": 1670 }, { "epoch": 0.4650224371238738, "grad_norm": 2.320704460144043, "learning_rate": 0.00017471247578010462, "loss": 2.9172, "step": 1671 }, { "epoch": 0.465300727032386, "grad_norm": 1.8798365592956543, "learning_rate": 0.0001746833733767527, "loss": 2.7551, "step": 1672 }, { "epoch": 0.4655790169408982, "grad_norm": 2.8550612926483154, "learning_rate": 0.0001746542566634621, "loss": 3.1651, "step": 1673 }, { "epoch": 0.46585730684941035, "grad_norm": 2.015118360519409, "learning_rate": 0.00017462512564581185, "loss": 2.7932, "step": 1674 }, { "epoch": 0.46613559675792254, "grad_norm": 1.9119848012924194, "learning_rate": 0.0001745959803293836, "loss": 2.6291, "step": 1675 }, { "epoch": 0.46641388666643474, "grad_norm": 2.1400763988494873, "learning_rate": 0.00017456682071976198, "loss": 2.8828, "step": 1676 }, { "epoch": 0.46669217657494694, "grad_norm": 2.507437229156494, "learning_rate": 0.00017453764682253406, "loss": 2.9941, "step": 1677 }, { "epoch": 0.46697046648345913, "grad_norm": 2.4237325191497803, "learning_rate": 0.0001745084586432899, "loss": 2.9775, "step": 1678 }, { "epoch": 0.46724875639197133, "grad_norm": 1.9604809284210205, "learning_rate": 0.00017447925618762212, "loss": 2.946, "step": 1679 }, { "epoch": 0.46752704630048353, "grad_norm": 2.601691722869873, "learning_rate": 0.0001744500394611262, "loss": 3.1487, "step": 1680 }, { "epoch": 0.4678053362089957, "grad_norm": 2.130403518676758, "learning_rate": 0.00017442080846940026, "loss": 3.134, "step": 1681 }, { "epoch": 0.4680836261175079, "grad_norm": 2.36118221282959, "learning_rate": 0.00017439156321804523, "loss": 3.0812, "step": 1682 }, { "epoch": 0.4683619160260201, "grad_norm": 2.330289363861084, "learning_rate": 0.0001743623037126647, "loss": 2.7112, "step": 1683 }, { "epoch": 0.4686402059345323, "grad_norm": 2.326556921005249, "learning_rate": 0.0001743330299588651, "loss": 2.9237, "step": 1684 }, { "epoch": 0.4689184958430445, "grad_norm": 2.022028684616089, "learning_rate": 0.00017430374196225542, "loss": 3.02, "step": 1685 }, { "epoch": 0.4691967857515567, "grad_norm": 2.7578988075256348, "learning_rate": 0.00017427443972844753, "loss": 2.8873, "step": 1686 }, { "epoch": 0.46947507566006885, "grad_norm": 2.6402623653411865, "learning_rate": 0.00017424512326305597, "loss": 2.7798, "step": 1687 }, { "epoch": 0.46975336556858105, "grad_norm": 2.2084550857543945, "learning_rate": 0.00017421579257169802, "loss": 2.8937, "step": 1688 }, { "epoch": 0.47003165547709325, "grad_norm": 2.375795841217041, "learning_rate": 0.00017418644765999366, "loss": 2.6742, "step": 1689 }, { "epoch": 0.47030994538560544, "grad_norm": 1.9375183582305908, "learning_rate": 0.00017415708853356563, "loss": 2.8057, "step": 1690 }, { "epoch": 0.47058823529411764, "grad_norm": 2.6819851398468018, "learning_rate": 0.00017412771519803935, "loss": 3.0994, "step": 1691 }, { "epoch": 0.47086652520262984, "grad_norm": 2.433946132659912, "learning_rate": 0.00017409832765904302, "loss": 3.138, "step": 1692 }, { "epoch": 0.47114481511114203, "grad_norm": 2.5765674114227295, "learning_rate": 0.00017406892592220752, "loss": 3.0197, "step": 1693 }, { "epoch": 0.47142310501965423, "grad_norm": 2.0258610248565674, "learning_rate": 0.00017403950999316642, "loss": 2.8643, "step": 1694 }, { "epoch": 0.47170139492816643, "grad_norm": 2.5778284072875977, "learning_rate": 0.00017401007987755614, "loss": 3.1507, "step": 1695 }, { "epoch": 0.4719796848366786, "grad_norm": 1.949785590171814, "learning_rate": 0.00017398063558101562, "loss": 2.9896, "step": 1696 }, { "epoch": 0.4722579747451908, "grad_norm": 1.9877575635910034, "learning_rate": 0.0001739511771091867, "loss": 2.9534, "step": 1697 }, { "epoch": 0.472536264653703, "grad_norm": 2.313185453414917, "learning_rate": 0.00017392170446771388, "loss": 2.987, "step": 1698 }, { "epoch": 0.47281455456221516, "grad_norm": 2.3114078044891357, "learning_rate": 0.0001738922176622443, "loss": 2.7444, "step": 1699 }, { "epoch": 0.47309284447072736, "grad_norm": 2.197207450866699, "learning_rate": 0.00017386271669842787, "loss": 2.7546, "step": 1700 }, { "epoch": 0.47309284447072736, "eval_loss": 2.930652379989624, "eval_runtime": 84.3984, "eval_samples_per_second": 59.243, "eval_steps_per_second": 14.811, "step": 1700 }, { "epoch": 0.47337113437923956, "grad_norm": 2.262272834777832, "learning_rate": 0.00017383320158191726, "loss": 3.0414, "step": 1701 }, { "epoch": 0.47364942428775175, "grad_norm": 2.1728155612945557, "learning_rate": 0.00017380367231836774, "loss": 2.6885, "step": 1702 }, { "epoch": 0.47392771419626395, "grad_norm": 2.1178138256073, "learning_rate": 0.0001737741289134374, "loss": 2.9028, "step": 1703 }, { "epoch": 0.47420600410477615, "grad_norm": 2.2292842864990234, "learning_rate": 0.000173744571372787, "loss": 2.7798, "step": 1704 }, { "epoch": 0.47448429401328834, "grad_norm": 2.551589250564575, "learning_rate": 0.00017371499970208004, "loss": 3.1099, "step": 1705 }, { "epoch": 0.47476258392180054, "grad_norm": 2.214289903640747, "learning_rate": 0.00017368541390698256, "loss": 2.672, "step": 1706 }, { "epoch": 0.47504087383031274, "grad_norm": 2.5802297592163086, "learning_rate": 0.00017365581399316358, "loss": 2.8033, "step": 1707 }, { "epoch": 0.47531916373882493, "grad_norm": 2.428255796432495, "learning_rate": 0.0001736261999662946, "loss": 3.1137, "step": 1708 }, { "epoch": 0.47559745364733713, "grad_norm": 2.6427464485168457, "learning_rate": 0.00017359657183204993, "loss": 3.0413, "step": 1709 }, { "epoch": 0.47587574355584933, "grad_norm": 2.298693895339966, "learning_rate": 0.00017356692959610658, "loss": 3.0487, "step": 1710 }, { "epoch": 0.4761540334643615, "grad_norm": 2.2719123363494873, "learning_rate": 0.00017353727326414423, "loss": 2.8605, "step": 1711 }, { "epoch": 0.47643232337287367, "grad_norm": 2.2737016677856445, "learning_rate": 0.00017350760284184523, "loss": 2.8059, "step": 1712 }, { "epoch": 0.47671061328138586, "grad_norm": 2.596184730529785, "learning_rate": 0.00017347791833489476, "loss": 2.9065, "step": 1713 }, { "epoch": 0.47698890318989806, "grad_norm": 2.56683087348938, "learning_rate": 0.00017344821974898052, "loss": 2.9617, "step": 1714 }, { "epoch": 0.47726719309841026, "grad_norm": 2.7709975242614746, "learning_rate": 0.0001734185070897931, "loss": 3.1319, "step": 1715 }, { "epoch": 0.47754548300692246, "grad_norm": 3.552119016647339, "learning_rate": 0.0001733887803630256, "loss": 2.7462, "step": 1716 }, { "epoch": 0.47782377291543465, "grad_norm": 2.3296573162078857, "learning_rate": 0.0001733590395743739, "loss": 2.9805, "step": 1717 }, { "epoch": 0.47810206282394685, "grad_norm": 2.4791040420532227, "learning_rate": 0.0001733292847295366, "loss": 2.7301, "step": 1718 }, { "epoch": 0.47838035273245905, "grad_norm": 2.3433940410614014, "learning_rate": 0.00017329951583421505, "loss": 2.9946, "step": 1719 }, { "epoch": 0.47865864264097124, "grad_norm": 2.158147096633911, "learning_rate": 0.00017326973289411306, "loss": 2.7605, "step": 1720 }, { "epoch": 0.47893693254948344, "grad_norm": 2.784956932067871, "learning_rate": 0.0001732399359149374, "loss": 3.2857, "step": 1721 }, { "epoch": 0.47921522245799564, "grad_norm": 1.9880154132843018, "learning_rate": 0.00017321012490239732, "loss": 2.7633, "step": 1722 }, { "epoch": 0.47949351236650783, "grad_norm": 2.5646860599517822, "learning_rate": 0.00017318029986220494, "loss": 2.948, "step": 1723 }, { "epoch": 0.47977180227502, "grad_norm": 2.38944935798645, "learning_rate": 0.00017315046080007494, "loss": 3.0565, "step": 1724 }, { "epoch": 0.4800500921835322, "grad_norm": 3.5560412406921387, "learning_rate": 0.00017312060772172472, "loss": 2.7749, "step": 1725 }, { "epoch": 0.48032838209204437, "grad_norm": 2.3663554191589355, "learning_rate": 0.00017309074063287438, "loss": 2.9317, "step": 1726 }, { "epoch": 0.48060667200055657, "grad_norm": 2.7474215030670166, "learning_rate": 0.0001730608595392467, "loss": 2.971, "step": 1727 }, { "epoch": 0.48088496190906876, "grad_norm": 2.284649133682251, "learning_rate": 0.0001730309644465671, "loss": 2.7419, "step": 1728 }, { "epoch": 0.48116325181758096, "grad_norm": 2.377777099609375, "learning_rate": 0.00017300105536056378, "loss": 2.7605, "step": 1729 }, { "epoch": 0.48144154172609316, "grad_norm": 2.0597424507141113, "learning_rate": 0.00017297113228696755, "loss": 2.9235, "step": 1730 }, { "epoch": 0.48171983163460536, "grad_norm": 2.4781877994537354, "learning_rate": 0.00017294119523151192, "loss": 2.771, "step": 1731 }, { "epoch": 0.48199812154311755, "grad_norm": 2.089691162109375, "learning_rate": 0.000172911244199933, "loss": 2.7363, "step": 1732 }, { "epoch": 0.48227641145162975, "grad_norm": 2.2047555446624756, "learning_rate": 0.00017288127919796973, "loss": 3.0557, "step": 1733 }, { "epoch": 0.48255470136014195, "grad_norm": 2.3159735202789307, "learning_rate": 0.0001728513002313636, "loss": 2.8392, "step": 1734 }, { "epoch": 0.48283299126865414, "grad_norm": 2.2041749954223633, "learning_rate": 0.00017282130730585885, "loss": 2.5119, "step": 1735 }, { "epoch": 0.48311128117716634, "grad_norm": 2.5747697353363037, "learning_rate": 0.00017279130042720237, "loss": 2.7609, "step": 1736 }, { "epoch": 0.4833895710856785, "grad_norm": 2.4192728996276855, "learning_rate": 0.0001727612796011437, "loss": 2.9146, "step": 1737 }, { "epoch": 0.4836678609941907, "grad_norm": 2.3680379390716553, "learning_rate": 0.0001727312448334351, "loss": 3.0955, "step": 1738 }, { "epoch": 0.4839461509027029, "grad_norm": 2.0414390563964844, "learning_rate": 0.00017270119612983144, "loss": 2.7193, "step": 1739 }, { "epoch": 0.4842244408112151, "grad_norm": 2.3661298751831055, "learning_rate": 0.0001726711334960903, "loss": 2.9833, "step": 1740 }, { "epoch": 0.48450273071972727, "grad_norm": 2.6313931941986084, "learning_rate": 0.00017264105693797196, "loss": 2.9183, "step": 1741 }, { "epoch": 0.48478102062823947, "grad_norm": 2.7338039875030518, "learning_rate": 0.00017261096646123931, "loss": 3.1536, "step": 1742 }, { "epoch": 0.48505931053675166, "grad_norm": 2.570854663848877, "learning_rate": 0.0001725808620716579, "loss": 2.9497, "step": 1743 }, { "epoch": 0.48533760044526386, "grad_norm": 2.5261807441711426, "learning_rate": 0.00017255074377499604, "loss": 3.136, "step": 1744 }, { "epoch": 0.48561589035377606, "grad_norm": 2.261172294616699, "learning_rate": 0.0001725206115770246, "loss": 3.1068, "step": 1745 }, { "epoch": 0.48589418026228826, "grad_norm": 2.654038906097412, "learning_rate": 0.0001724904654835171, "loss": 2.6501, "step": 1746 }, { "epoch": 0.48617247017080045, "grad_norm": 2.481077194213867, "learning_rate": 0.00017246030550024987, "loss": 2.8668, "step": 1747 }, { "epoch": 0.48645076007931265, "grad_norm": 2.529261350631714, "learning_rate": 0.00017243013163300178, "loss": 2.6797, "step": 1748 }, { "epoch": 0.4867290499878248, "grad_norm": 2.230909824371338, "learning_rate": 0.00017239994388755435, "loss": 2.8353, "step": 1749 }, { "epoch": 0.487007339896337, "grad_norm": 2.351857900619507, "learning_rate": 0.00017236974226969183, "loss": 3.0757, "step": 1750 }, { "epoch": 0.4872856298048492, "grad_norm": 2.4003870487213135, "learning_rate": 0.00017233952678520108, "loss": 2.8741, "step": 1751 }, { "epoch": 0.4875639197133614, "grad_norm": 2.3478686809539795, "learning_rate": 0.00017230929743987164, "loss": 2.8393, "step": 1752 }, { "epoch": 0.4878422096218736, "grad_norm": 2.360248327255249, "learning_rate": 0.0001722790542394957, "loss": 3.0228, "step": 1753 }, { "epoch": 0.4881204995303858, "grad_norm": 2.730483055114746, "learning_rate": 0.00017224879718986803, "loss": 2.8846, "step": 1754 }, { "epoch": 0.488398789438898, "grad_norm": 2.590165376663208, "learning_rate": 0.00017221852629678629, "loss": 3.1608, "step": 1755 }, { "epoch": 0.48867707934741017, "grad_norm": 2.3240606784820557, "learning_rate": 0.00017218824156605046, "loss": 2.7745, "step": 1756 }, { "epoch": 0.48895536925592237, "grad_norm": 2.2719218730926514, "learning_rate": 0.00017215794300346344, "loss": 2.9773, "step": 1757 }, { "epoch": 0.48923365916443456, "grad_norm": 2.409031629562378, "learning_rate": 0.00017212763061483062, "loss": 2.8488, "step": 1758 }, { "epoch": 0.48951194907294676, "grad_norm": 2.5309038162231445, "learning_rate": 0.00017209730440596014, "loss": 3.0897, "step": 1759 }, { "epoch": 0.48979023898145896, "grad_norm": 2.206596851348877, "learning_rate": 0.00017206696438266272, "loss": 2.8838, "step": 1760 }, { "epoch": 0.4900685288899711, "grad_norm": 2.436613082885742, "learning_rate": 0.0001720366105507518, "loss": 2.5827, "step": 1761 }, { "epoch": 0.4903468187984833, "grad_norm": 1.9256775379180908, "learning_rate": 0.00017200624291604337, "loss": 2.7672, "step": 1762 }, { "epoch": 0.4906251087069955, "grad_norm": 2.382777452468872, "learning_rate": 0.00017197586148435612, "loss": 2.9402, "step": 1763 }, { "epoch": 0.4909033986155077, "grad_norm": 2.3583152294158936, "learning_rate": 0.00017194546626151142, "loss": 2.8626, "step": 1764 }, { "epoch": 0.4911816885240199, "grad_norm": 2.3444674015045166, "learning_rate": 0.0001719150572533332, "loss": 2.945, "step": 1765 }, { "epoch": 0.4914599784325321, "grad_norm": 2.3207318782806396, "learning_rate": 0.00017188463446564804, "loss": 2.9933, "step": 1766 }, { "epoch": 0.4917382683410443, "grad_norm": 2.1960461139678955, "learning_rate": 0.00017185419790428528, "loss": 2.9371, "step": 1767 }, { "epoch": 0.4920165582495565, "grad_norm": 2.0122897624969482, "learning_rate": 0.00017182374757507675, "loss": 2.8575, "step": 1768 }, { "epoch": 0.4922948481580687, "grad_norm": 2.1931400299072266, "learning_rate": 0.00017179328348385702, "loss": 2.7728, "step": 1769 }, { "epoch": 0.4925731380665809, "grad_norm": 3.04001784324646, "learning_rate": 0.00017176280563646322, "loss": 2.8343, "step": 1770 }, { "epoch": 0.49285142797509307, "grad_norm": 2.666628360748291, "learning_rate": 0.00017173231403873514, "loss": 2.9795, "step": 1771 }, { "epoch": 0.49312971788360527, "grad_norm": 2.751157522201538, "learning_rate": 0.00017170180869651523, "loss": 2.8306, "step": 1772 }, { "epoch": 0.49340800779211746, "grad_norm": 1.9222159385681152, "learning_rate": 0.00017167128961564863, "loss": 2.9515, "step": 1773 }, { "epoch": 0.4936862977006296, "grad_norm": 2.0998449325561523, "learning_rate": 0.00017164075680198296, "loss": 2.8609, "step": 1774 }, { "epoch": 0.4939645876091418, "grad_norm": 2.41192364692688, "learning_rate": 0.00017161021026136856, "loss": 2.8254, "step": 1775 }, { "epoch": 0.494242877517654, "grad_norm": 2.4543135166168213, "learning_rate": 0.00017157964999965842, "loss": 3.0242, "step": 1776 }, { "epoch": 0.4945211674261662, "grad_norm": 2.361844301223755, "learning_rate": 0.00017154907602270813, "loss": 2.9661, "step": 1777 }, { "epoch": 0.4947994573346784, "grad_norm": 2.0169484615325928, "learning_rate": 0.00017151848833637593, "loss": 2.7991, "step": 1778 }, { "epoch": 0.4950777472431906, "grad_norm": 2.2751924991607666, "learning_rate": 0.00017148788694652264, "loss": 3.0045, "step": 1779 }, { "epoch": 0.4953560371517028, "grad_norm": 2.202221155166626, "learning_rate": 0.0001714572718590117, "loss": 3.0661, "step": 1780 }, { "epoch": 0.495634327060215, "grad_norm": 2.545112133026123, "learning_rate": 0.00017142664307970927, "loss": 2.8825, "step": 1781 }, { "epoch": 0.4959126169687272, "grad_norm": 2.4622902870178223, "learning_rate": 0.00017139600061448406, "loss": 2.9365, "step": 1782 }, { "epoch": 0.4961909068772394, "grad_norm": 2.492636203765869, "learning_rate": 0.0001713653444692074, "loss": 3.0956, "step": 1783 }, { "epoch": 0.4964691967857516, "grad_norm": 2.5722527503967285, "learning_rate": 0.00017133467464975328, "loss": 3.0095, "step": 1784 }, { "epoch": 0.4967474866942638, "grad_norm": 1.9169038534164429, "learning_rate": 0.00017130399116199824, "loss": 2.6587, "step": 1785 }, { "epoch": 0.4970257766027759, "grad_norm": 2.3607869148254395, "learning_rate": 0.0001712732940118215, "loss": 2.7371, "step": 1786 }, { "epoch": 0.4973040665112881, "grad_norm": 2.2507214546203613, "learning_rate": 0.00017124258320510492, "loss": 2.9095, "step": 1787 }, { "epoch": 0.4975823564198003, "grad_norm": 2.531008005142212, "learning_rate": 0.0001712118587477329, "loss": 3.0213, "step": 1788 }, { "epoch": 0.4978606463283125, "grad_norm": 2.3737425804138184, "learning_rate": 0.00017118112064559251, "loss": 2.9686, "step": 1789 }, { "epoch": 0.4981389362368247, "grad_norm": 2.3264658451080322, "learning_rate": 0.00017115036890457338, "loss": 3.0977, "step": 1790 }, { "epoch": 0.4984172261453369, "grad_norm": 2.539376735687256, "learning_rate": 0.00017111960353056787, "loss": 3.3913, "step": 1791 }, { "epoch": 0.4986955160538491, "grad_norm": 2.2906954288482666, "learning_rate": 0.00017108882452947084, "loss": 2.684, "step": 1792 }, { "epoch": 0.4989738059623613, "grad_norm": 2.934720277786255, "learning_rate": 0.00017105803190717978, "loss": 3.0913, "step": 1793 }, { "epoch": 0.4992520958708735, "grad_norm": 2.6971898078918457, "learning_rate": 0.0001710272256695948, "loss": 2.9098, "step": 1794 }, { "epoch": 0.4995303857793857, "grad_norm": 2.277660846710205, "learning_rate": 0.00017099640582261865, "loss": 2.8744, "step": 1795 }, { "epoch": 0.4998086756878979, "grad_norm": 2.299502372741699, "learning_rate": 0.00017096557237215664, "loss": 2.8469, "step": 1796 }, { "epoch": 0.5000869655964101, "grad_norm": 2.2011513710021973, "learning_rate": 0.0001709347253241167, "loss": 2.764, "step": 1797 }, { "epoch": 0.5003652555049223, "grad_norm": 2.100465774536133, "learning_rate": 0.00017090386468440944, "loss": 2.7567, "step": 1798 }, { "epoch": 0.5006435454134345, "grad_norm": 2.1828315258026123, "learning_rate": 0.00017087299045894793, "loss": 2.7748, "step": 1799 }, { "epoch": 0.5009218353219467, "grad_norm": 2.711533546447754, "learning_rate": 0.00017084210265364794, "loss": 2.6985, "step": 1800 }, { "epoch": 0.5009218353219467, "eval_loss": 2.926023483276367, "eval_runtime": 84.2693, "eval_samples_per_second": 59.334, "eval_steps_per_second": 14.833, "step": 1800 }, { "epoch": 0.5012001252304589, "grad_norm": 2.3957035541534424, "learning_rate": 0.00017081120127442786, "loss": 2.9942, "step": 1801 }, { "epoch": 0.5014784151389711, "grad_norm": 2.4124081134796143, "learning_rate": 0.00017078028632720865, "loss": 2.9478, "step": 1802 }, { "epoch": 0.5017567050474833, "grad_norm": 2.211324453353882, "learning_rate": 0.00017074935781791377, "loss": 2.9605, "step": 1803 }, { "epoch": 0.5020349949559955, "grad_norm": 2.2002527713775635, "learning_rate": 0.00017071841575246947, "loss": 2.6132, "step": 1804 }, { "epoch": 0.5023132848645075, "grad_norm": 2.9506890773773193, "learning_rate": 0.00017068746013680446, "loss": 3.0015, "step": 1805 }, { "epoch": 0.5025915747730197, "grad_norm": 2.5539309978485107, "learning_rate": 0.0001706564909768501, "loss": 3.2394, "step": 1806 }, { "epoch": 0.5028698646815319, "grad_norm": 2.2777230739593506, "learning_rate": 0.0001706255082785403, "loss": 2.8863, "step": 1807 }, { "epoch": 0.5031481545900441, "grad_norm": 2.4090030193328857, "learning_rate": 0.00017059451204781166, "loss": 3.0215, "step": 1808 }, { "epoch": 0.5034264444985563, "grad_norm": 2.1513662338256836, "learning_rate": 0.00017056350229060328, "loss": 2.7575, "step": 1809 }, { "epoch": 0.5037047344070685, "grad_norm": 2.2303550243377686, "learning_rate": 0.00017053247901285682, "loss": 3.0058, "step": 1810 }, { "epoch": 0.5039830243155807, "grad_norm": 2.1075103282928467, "learning_rate": 0.00017050144222051667, "loss": 2.877, "step": 1811 }, { "epoch": 0.5042613142240929, "grad_norm": 2.4040331840515137, "learning_rate": 0.00017047039191952972, "loss": 2.725, "step": 1812 }, { "epoch": 0.5045396041326051, "grad_norm": 2.3204548358917236, "learning_rate": 0.0001704393281158454, "loss": 3.0308, "step": 1813 }, { "epoch": 0.5048178940411173, "grad_norm": 2.6136064529418945, "learning_rate": 0.00017040825081541584, "loss": 2.9317, "step": 1814 }, { "epoch": 0.5050961839496295, "grad_norm": 2.79492449760437, "learning_rate": 0.00017037716002419568, "loss": 2.924, "step": 1815 }, { "epoch": 0.5053744738581417, "grad_norm": 2.750577688217163, "learning_rate": 0.00017034605574814223, "loss": 3.2454, "step": 1816 }, { "epoch": 0.5056527637666539, "grad_norm": 2.4282729625701904, "learning_rate": 0.0001703149379932152, "loss": 2.9288, "step": 1817 }, { "epoch": 0.5059310536751661, "grad_norm": 2.457610845565796, "learning_rate": 0.0001702838067653771, "loss": 2.9543, "step": 1818 }, { "epoch": 0.5062093435836783, "grad_norm": 2.147484540939331, "learning_rate": 0.0001702526620705929, "loss": 2.7158, "step": 1819 }, { "epoch": 0.5064876334921905, "grad_norm": 2.2379133701324463, "learning_rate": 0.00017022150391483013, "loss": 2.9601, "step": 1820 }, { "epoch": 0.5067659234007027, "grad_norm": 2.3535096645355225, "learning_rate": 0.00017019033230405905, "loss": 3.0283, "step": 1821 }, { "epoch": 0.5070442133092149, "grad_norm": 2.256289482116699, "learning_rate": 0.0001701591472442523, "loss": 2.6044, "step": 1822 }, { "epoch": 0.5073225032177271, "grad_norm": 2.1072287559509277, "learning_rate": 0.0001701279487413852, "loss": 2.9001, "step": 1823 }, { "epoch": 0.5076007931262393, "grad_norm": 2.3810811042785645, "learning_rate": 0.00017009673680143568, "loss": 3.1999, "step": 1824 }, { "epoch": 0.5078790830347515, "grad_norm": 2.259049415588379, "learning_rate": 0.00017006551143038416, "loss": 2.9027, "step": 1825 }, { "epoch": 0.5081573729432637, "grad_norm": 2.4845969676971436, "learning_rate": 0.00017003427263421369, "loss": 3.0545, "step": 1826 }, { "epoch": 0.5084356628517759, "grad_norm": 2.3114805221557617, "learning_rate": 0.0001700030204189099, "loss": 2.9399, "step": 1827 }, { "epoch": 0.5087139527602881, "grad_norm": 2.0240015983581543, "learning_rate": 0.0001699717547904609, "loss": 2.6758, "step": 1828 }, { "epoch": 0.5089922426688003, "grad_norm": 1.9221512079238892, "learning_rate": 0.0001699404757548575, "loss": 2.6819, "step": 1829 }, { "epoch": 0.5092705325773124, "grad_norm": 2.1201674938201904, "learning_rate": 0.00016990918331809303, "loss": 2.7888, "step": 1830 }, { "epoch": 0.5095488224858246, "grad_norm": 2.3356354236602783, "learning_rate": 0.0001698778774861633, "loss": 2.8149, "step": 1831 }, { "epoch": 0.5098271123943368, "grad_norm": 2.2215800285339355, "learning_rate": 0.0001698465582650668, "loss": 2.8354, "step": 1832 }, { "epoch": 0.510105402302849, "grad_norm": 2.3393871784210205, "learning_rate": 0.00016981522566080455, "loss": 2.7693, "step": 1833 }, { "epoch": 0.5103836922113612, "grad_norm": 2.132838726043701, "learning_rate": 0.00016978387967938013, "loss": 2.7909, "step": 1834 }, { "epoch": 0.5106619821198733, "grad_norm": 2.0715861320495605, "learning_rate": 0.0001697525203267997, "loss": 2.8189, "step": 1835 }, { "epoch": 0.5109402720283855, "grad_norm": 2.388221502304077, "learning_rate": 0.00016972114760907192, "loss": 2.7097, "step": 1836 }, { "epoch": 0.5112185619368977, "grad_norm": 2.1698508262634277, "learning_rate": 0.00016968976153220808, "loss": 2.9579, "step": 1837 }, { "epoch": 0.5114968518454099, "grad_norm": 2.430748224258423, "learning_rate": 0.00016965836210222203, "loss": 2.8014, "step": 1838 }, { "epoch": 0.5117751417539221, "grad_norm": 2.5079500675201416, "learning_rate": 0.00016962694932513012, "loss": 2.9096, "step": 1839 }, { "epoch": 0.5120534316624343, "grad_norm": 2.0711987018585205, "learning_rate": 0.00016959552320695132, "loss": 2.7793, "step": 1840 }, { "epoch": 0.5123317215709465, "grad_norm": 2.4591636657714844, "learning_rate": 0.00016956408375370713, "loss": 3.0434, "step": 1841 }, { "epoch": 0.5126100114794587, "grad_norm": 2.143944501876831, "learning_rate": 0.00016953263097142156, "loss": 2.6824, "step": 1842 }, { "epoch": 0.5128883013879709, "grad_norm": 2.316941022872925, "learning_rate": 0.00016950116486612123, "loss": 3.0258, "step": 1843 }, { "epoch": 0.5131665912964831, "grad_norm": 2.2244391441345215, "learning_rate": 0.00016946968544383537, "loss": 2.9194, "step": 1844 }, { "epoch": 0.5134448812049953, "grad_norm": 2.04463791847229, "learning_rate": 0.0001694381927105956, "loss": 2.865, "step": 1845 }, { "epoch": 0.5137231711135075, "grad_norm": 2.798447608947754, "learning_rate": 0.0001694066866724362, "loss": 3.0306, "step": 1846 }, { "epoch": 0.5140014610220197, "grad_norm": 2.235954999923706, "learning_rate": 0.00016937516733539406, "loss": 3.135, "step": 1847 }, { "epoch": 0.5142797509305319, "grad_norm": 2.570878267288208, "learning_rate": 0.00016934363470550844, "loss": 3.4151, "step": 1848 }, { "epoch": 0.5145580408390441, "grad_norm": 2.407341480255127, "learning_rate": 0.00016931208878882131, "loss": 3.0041, "step": 1849 }, { "epoch": 0.5148363307475563, "grad_norm": 2.0636656284332275, "learning_rate": 0.00016928052959137713, "loss": 2.798, "step": 1850 }, { "epoch": 0.5151146206560685, "grad_norm": 2.2571423053741455, "learning_rate": 0.00016924895711922284, "loss": 2.6852, "step": 1851 }, { "epoch": 0.5153929105645807, "grad_norm": 2.1497771739959717, "learning_rate": 0.000169217371378408, "loss": 2.7295, "step": 1852 }, { "epoch": 0.5156712004730929, "grad_norm": 2.3534488677978516, "learning_rate": 0.00016918577237498472, "loss": 2.8954, "step": 1853 }, { "epoch": 0.5159494903816051, "grad_norm": 2.2533316612243652, "learning_rate": 0.0001691541601150076, "loss": 2.8116, "step": 1854 }, { "epoch": 0.5162277802901172, "grad_norm": 2.3317763805389404, "learning_rate": 0.00016912253460453386, "loss": 2.6978, "step": 1855 }, { "epoch": 0.5165060701986294, "grad_norm": 2.063188314437866, "learning_rate": 0.0001690908958496231, "loss": 2.6574, "step": 1856 }, { "epoch": 0.5167843601071416, "grad_norm": 2.6488149166107178, "learning_rate": 0.00016905924385633765, "loss": 3.0491, "step": 1857 }, { "epoch": 0.5170626500156538, "grad_norm": 3.1898932456970215, "learning_rate": 0.00016902757863074226, "loss": 2.7263, "step": 1858 }, { "epoch": 0.517340939924166, "grad_norm": 2.900926113128662, "learning_rate": 0.00016899590017890423, "loss": 2.9807, "step": 1859 }, { "epoch": 0.5176192298326782, "grad_norm": 2.507230758666992, "learning_rate": 0.00016896420850689344, "loss": 3.0007, "step": 1860 }, { "epoch": 0.5178975197411904, "grad_norm": 2.919523000717163, "learning_rate": 0.0001689325036207822, "loss": 3.1216, "step": 1861 }, { "epoch": 0.5181758096497026, "grad_norm": 2.483959436416626, "learning_rate": 0.00016890078552664553, "loss": 2.9017, "step": 1862 }, { "epoch": 0.5184540995582148, "grad_norm": 2.3976876735687256, "learning_rate": 0.00016886905423056078, "loss": 2.9407, "step": 1863 }, { "epoch": 0.518732389466727, "grad_norm": 2.503432035446167, "learning_rate": 0.00016883730973860797, "loss": 2.9821, "step": 1864 }, { "epoch": 0.5190106793752391, "grad_norm": 2.181992530822754, "learning_rate": 0.0001688055520568696, "loss": 3.1655, "step": 1865 }, { "epoch": 0.5192889692837513, "grad_norm": 2.6174073219299316, "learning_rate": 0.00016877378119143068, "loss": 2.8986, "step": 1866 }, { "epoch": 0.5195672591922635, "grad_norm": 2.5770742893218994, "learning_rate": 0.00016874199714837876, "loss": 2.7974, "step": 1867 }, { "epoch": 0.5198455491007757, "grad_norm": 2.667470932006836, "learning_rate": 0.00016871019993380397, "loss": 2.8337, "step": 1868 }, { "epoch": 0.5201238390092879, "grad_norm": 2.4528160095214844, "learning_rate": 0.00016867838955379885, "loss": 2.9383, "step": 1869 }, { "epoch": 0.5204021289178001, "grad_norm": 2.4041454792022705, "learning_rate": 0.00016864656601445858, "loss": 2.951, "step": 1870 }, { "epoch": 0.5206804188263123, "grad_norm": 2.1874783039093018, "learning_rate": 0.00016861472932188076, "loss": 2.7302, "step": 1871 }, { "epoch": 0.5209587087348245, "grad_norm": 2.2599010467529297, "learning_rate": 0.0001685828794821656, "loss": 2.7613, "step": 1872 }, { "epoch": 0.5212369986433367, "grad_norm": 2.070667028427124, "learning_rate": 0.00016855101650141573, "loss": 2.6955, "step": 1873 }, { "epoch": 0.5215152885518489, "grad_norm": 2.7473227977752686, "learning_rate": 0.0001685191403857364, "loss": 2.9847, "step": 1874 }, { "epoch": 0.5217935784603611, "grad_norm": 2.3491551876068115, "learning_rate": 0.00016848725114123537, "loss": 3.0091, "step": 1875 }, { "epoch": 0.5220718683688733, "grad_norm": 2.438403367996216, "learning_rate": 0.0001684553487740228, "loss": 2.7447, "step": 1876 }, { "epoch": 0.5223501582773855, "grad_norm": 2.4021010398864746, "learning_rate": 0.00016842343329021147, "loss": 2.7751, "step": 1877 }, { "epoch": 0.5226284481858977, "grad_norm": 2.7495365142822266, "learning_rate": 0.00016839150469591667, "loss": 2.9313, "step": 1878 }, { "epoch": 0.5229067380944099, "grad_norm": 2.0450022220611572, "learning_rate": 0.00016835956299725613, "loss": 2.6649, "step": 1879 }, { "epoch": 0.523185028002922, "grad_norm": 2.5452864170074463, "learning_rate": 0.00016832760820035015, "loss": 2.7744, "step": 1880 }, { "epoch": 0.5234633179114342, "grad_norm": 2.465223550796509, "learning_rate": 0.0001682956403113216, "loss": 2.7282, "step": 1881 }, { "epoch": 0.5237416078199464, "grad_norm": 2.496549606323242, "learning_rate": 0.00016826365933629567, "loss": 2.6839, "step": 1882 }, { "epoch": 0.5240198977284586, "grad_norm": 2.6692147254943848, "learning_rate": 0.00016823166528140025, "loss": 3.1479, "step": 1883 }, { "epoch": 0.5242981876369708, "grad_norm": 2.3386025428771973, "learning_rate": 0.00016819965815276564, "loss": 3.2468, "step": 1884 }, { "epoch": 0.524576477545483, "grad_norm": 2.6003293991088867, "learning_rate": 0.00016816763795652465, "loss": 2.814, "step": 1885 }, { "epoch": 0.5248547674539952, "grad_norm": 2.565321922302246, "learning_rate": 0.00016813560469881265, "loss": 2.9091, "step": 1886 }, { "epoch": 0.5251330573625074, "grad_norm": 2.319659948348999, "learning_rate": 0.0001681035583857674, "loss": 3.0594, "step": 1887 }, { "epoch": 0.5254113472710196, "grad_norm": 2.30830454826355, "learning_rate": 0.0001680714990235293, "loss": 2.8205, "step": 1888 }, { "epoch": 0.5256896371795318, "grad_norm": 2.4392049312591553, "learning_rate": 0.0001680394266182412, "loss": 3.0637, "step": 1889 }, { "epoch": 0.525967927088044, "grad_norm": 2.873981237411499, "learning_rate": 0.00016800734117604834, "loss": 2.9001, "step": 1890 }, { "epoch": 0.5262462169965562, "grad_norm": 2.465078830718994, "learning_rate": 0.00016797524270309864, "loss": 2.8419, "step": 1891 }, { "epoch": 0.5265245069050684, "grad_norm": 2.1811859607696533, "learning_rate": 0.0001679431312055424, "loss": 2.755, "step": 1892 }, { "epoch": 0.5268027968135806, "grad_norm": 2.7164580821990967, "learning_rate": 0.00016791100668953236, "loss": 3.12, "step": 1893 }, { "epoch": 0.5270810867220928, "grad_norm": 2.5228593349456787, "learning_rate": 0.000167878869161224, "loss": 3.0291, "step": 1894 }, { "epoch": 0.527359376630605, "grad_norm": 1.9620640277862549, "learning_rate": 0.00016784671862677498, "loss": 2.6265, "step": 1895 }, { "epoch": 0.5276376665391171, "grad_norm": 2.234213352203369, "learning_rate": 0.0001678145550923457, "loss": 3.1383, "step": 1896 }, { "epoch": 0.5279159564476293, "grad_norm": 2.142794132232666, "learning_rate": 0.00016778237856409888, "loss": 2.9021, "step": 1897 }, { "epoch": 0.5281942463561415, "grad_norm": 1.9027434587478638, "learning_rate": 0.00016775018904819988, "loss": 2.5357, "step": 1898 }, { "epoch": 0.5284725362646537, "grad_norm": 2.2427761554718018, "learning_rate": 0.00016771798655081642, "loss": 2.8729, "step": 1899 }, { "epoch": 0.5287508261731659, "grad_norm": 2.2922072410583496, "learning_rate": 0.00016768577107811877, "loss": 2.7927, "step": 1900 }, { "epoch": 0.5287508261731659, "eval_loss": 2.919917106628418, "eval_runtime": 84.8996, "eval_samples_per_second": 58.893, "eval_steps_per_second": 14.723, "step": 1900 }, { "epoch": 0.5290291160816781, "grad_norm": 2.0734360218048096, "learning_rate": 0.00016765354263627966, "loss": 2.8661, "step": 1901 }, { "epoch": 0.5293074059901903, "grad_norm": 2.1585662364959717, "learning_rate": 0.00016762130123147438, "loss": 2.8749, "step": 1902 }, { "epoch": 0.5295856958987025, "grad_norm": 2.696930408477783, "learning_rate": 0.00016758904686988056, "loss": 2.7665, "step": 1903 }, { "epoch": 0.5298639858072147, "grad_norm": 2.101065158843994, "learning_rate": 0.00016755677955767847, "loss": 2.9256, "step": 1904 }, { "epoch": 0.5301422757157268, "grad_norm": 2.152916193008423, "learning_rate": 0.0001675244993010507, "loss": 2.8687, "step": 1905 }, { "epoch": 0.530420565624239, "grad_norm": 2.252162456512451, "learning_rate": 0.0001674922061061825, "loss": 3.1117, "step": 1906 }, { "epoch": 0.5306988555327512, "grad_norm": 2.2085745334625244, "learning_rate": 0.00016745989997926145, "loss": 2.8976, "step": 1907 }, { "epoch": 0.5309771454412634, "grad_norm": 2.358081579208374, "learning_rate": 0.00016742758092647773, "loss": 2.7849, "step": 1908 }, { "epoch": 0.5312554353497756, "grad_norm": 2.1191911697387695, "learning_rate": 0.00016739524895402385, "loss": 3.1943, "step": 1909 }, { "epoch": 0.5315337252582878, "grad_norm": 2.4339487552642822, "learning_rate": 0.00016736290406809494, "loss": 2.9059, "step": 1910 }, { "epoch": 0.5318120151668, "grad_norm": 2.012509346008301, "learning_rate": 0.0001673305462748885, "loss": 2.9606, "step": 1911 }, { "epoch": 0.5320903050753122, "grad_norm": 2.5875980854034424, "learning_rate": 0.00016729817558060458, "loss": 3.2342, "step": 1912 }, { "epoch": 0.5323685949838244, "grad_norm": 2.442078113555908, "learning_rate": 0.00016726579199144564, "loss": 2.9874, "step": 1913 }, { "epoch": 0.5326468848923366, "grad_norm": 1.9513555765151978, "learning_rate": 0.00016723339551361668, "loss": 2.9293, "step": 1914 }, { "epoch": 0.5329251748008488, "grad_norm": 2.352891445159912, "learning_rate": 0.00016720098615332507, "loss": 2.7938, "step": 1915 }, { "epoch": 0.533203464709361, "grad_norm": 2.2850265502929688, "learning_rate": 0.00016716856391678074, "loss": 3.0194, "step": 1916 }, { "epoch": 0.5334817546178732, "grad_norm": 2.7878501415252686, "learning_rate": 0.000167136128810196, "loss": 2.6482, "step": 1917 }, { "epoch": 0.5337600445263854, "grad_norm": 2.3103487491607666, "learning_rate": 0.00016710368083978577, "loss": 3.0358, "step": 1918 }, { "epoch": 0.5340383344348976, "grad_norm": 2.094059467315674, "learning_rate": 0.00016707122001176727, "loss": 2.9417, "step": 1919 }, { "epoch": 0.5343166243434098, "grad_norm": 2.4515938758850098, "learning_rate": 0.0001670387463323603, "loss": 3.0914, "step": 1920 }, { "epoch": 0.534594914251922, "grad_norm": 2.387132406234741, "learning_rate": 0.00016700625980778705, "loss": 2.9089, "step": 1921 }, { "epoch": 0.5348732041604342, "grad_norm": 2.2919018268585205, "learning_rate": 0.00016697376044427225, "loss": 3.0042, "step": 1922 }, { "epoch": 0.5351514940689464, "grad_norm": 2.416548252105713, "learning_rate": 0.00016694124824804299, "loss": 3.0263, "step": 1923 }, { "epoch": 0.5354297839774586, "grad_norm": 2.296496629714966, "learning_rate": 0.00016690872322532884, "loss": 2.9959, "step": 1924 }, { "epoch": 0.5357080738859707, "grad_norm": 2.6415584087371826, "learning_rate": 0.00016687618538236193, "loss": 2.8099, "step": 1925 }, { "epoch": 0.535986363794483, "grad_norm": 2.3300039768218994, "learning_rate": 0.00016684363472537677, "loss": 3.0877, "step": 1926 }, { "epoch": 0.5362646537029951, "grad_norm": 2.418118715286255, "learning_rate": 0.00016681107126061029, "loss": 2.9904, "step": 1927 }, { "epoch": 0.5365429436115073, "grad_norm": 2.486640453338623, "learning_rate": 0.00016677849499430192, "loss": 3.0493, "step": 1928 }, { "epoch": 0.5368212335200194, "grad_norm": 2.04656720161438, "learning_rate": 0.00016674590593269354, "loss": 2.7882, "step": 1929 }, { "epoch": 0.5370995234285316, "grad_norm": 2.5102040767669678, "learning_rate": 0.00016671330408202952, "loss": 2.9057, "step": 1930 }, { "epoch": 0.5373778133370438, "grad_norm": 2.1595771312713623, "learning_rate": 0.0001666806894485566, "loss": 2.7766, "step": 1931 }, { "epoch": 0.537656103245556, "grad_norm": 3.017573595046997, "learning_rate": 0.00016664806203852396, "loss": 3.2916, "step": 1932 }, { "epoch": 0.5379343931540682, "grad_norm": 2.4415502548217773, "learning_rate": 0.00016661542185818337, "loss": 3.1499, "step": 1933 }, { "epoch": 0.5382126830625804, "grad_norm": 2.2937028408050537, "learning_rate": 0.0001665827689137889, "loss": 2.9484, "step": 1934 }, { "epoch": 0.5384909729710926, "grad_norm": 2.6735966205596924, "learning_rate": 0.00016655010321159713, "loss": 3.0873, "step": 1935 }, { "epoch": 0.5387692628796048, "grad_norm": 2.4005355834960938, "learning_rate": 0.00016651742475786704, "loss": 2.8047, "step": 1936 }, { "epoch": 0.539047552788117, "grad_norm": 1.9890570640563965, "learning_rate": 0.00016648473355886014, "loss": 2.7597, "step": 1937 }, { "epoch": 0.5393258426966292, "grad_norm": 2.2992310523986816, "learning_rate": 0.00016645202962084032, "loss": 3.0104, "step": 1938 }, { "epoch": 0.5396041326051414, "grad_norm": 4.309446334838867, "learning_rate": 0.00016641931295007388, "loss": 2.6869, "step": 1939 }, { "epoch": 0.5398824225136536, "grad_norm": 2.2391769886016846, "learning_rate": 0.00016638658355282962, "loss": 2.9755, "step": 1940 }, { "epoch": 0.5401607124221658, "grad_norm": 2.8339881896972656, "learning_rate": 0.00016635384143537878, "loss": 3.295, "step": 1941 }, { "epoch": 0.540439002330678, "grad_norm": 2.5695888996124268, "learning_rate": 0.00016632108660399498, "loss": 3.0481, "step": 1942 }, { "epoch": 0.5407172922391902, "grad_norm": 2.739103078842163, "learning_rate": 0.0001662883190649543, "loss": 2.8815, "step": 1943 }, { "epoch": 0.5409955821477024, "grad_norm": 2.817305326461792, "learning_rate": 0.00016625553882453534, "loss": 3.1115, "step": 1944 }, { "epoch": 0.5412738720562146, "grad_norm": 2.2619001865386963, "learning_rate": 0.00016622274588901895, "loss": 2.9478, "step": 1945 }, { "epoch": 0.5415521619647268, "grad_norm": 2.1543004512786865, "learning_rate": 0.0001661899402646886, "loss": 2.7077, "step": 1946 }, { "epoch": 0.541830451873239, "grad_norm": 2.647078514099121, "learning_rate": 0.00016615712195783012, "loss": 2.7137, "step": 1947 }, { "epoch": 0.5421087417817512, "grad_norm": 2.4697089195251465, "learning_rate": 0.00016612429097473175, "loss": 3.0418, "step": 1948 }, { "epoch": 0.5423870316902634, "grad_norm": 2.073970317840576, "learning_rate": 0.00016609144732168414, "loss": 2.9565, "step": 1949 }, { "epoch": 0.5426653215987756, "grad_norm": 2.152066707611084, "learning_rate": 0.0001660585910049804, "loss": 2.736, "step": 1950 }, { "epoch": 0.5429436115072878, "grad_norm": 2.4977447986602783, "learning_rate": 0.0001660257220309161, "loss": 2.6473, "step": 1951 }, { "epoch": 0.5432219014158, "grad_norm": 2.344604253768921, "learning_rate": 0.0001659928404057892, "loss": 2.8409, "step": 1952 }, { "epoch": 0.5435001913243122, "grad_norm": 2.3428380489349365, "learning_rate": 0.00016595994613590004, "loss": 2.7094, "step": 1953 }, { "epoch": 0.5437784812328242, "grad_norm": 2.1515395641326904, "learning_rate": 0.0001659270392275515, "loss": 3.0314, "step": 1954 }, { "epoch": 0.5440567711413364, "grad_norm": 2.9351179599761963, "learning_rate": 0.00016589411968704875, "loss": 2.9599, "step": 1955 }, { "epoch": 0.5443350610498486, "grad_norm": 2.3707497119903564, "learning_rate": 0.00016586118752069947, "loss": 2.7284, "step": 1956 }, { "epoch": 0.5446133509583608, "grad_norm": 2.5616343021392822, "learning_rate": 0.00016582824273481372, "loss": 2.9676, "step": 1957 }, { "epoch": 0.544891640866873, "grad_norm": 2.3641326427459717, "learning_rate": 0.00016579528533570395, "loss": 2.8919, "step": 1958 }, { "epoch": 0.5451699307753852, "grad_norm": 2.415510416030884, "learning_rate": 0.00016576231532968515, "loss": 3.1458, "step": 1959 }, { "epoch": 0.5454482206838974, "grad_norm": 1.9849399328231812, "learning_rate": 0.00016572933272307458, "loss": 2.8463, "step": 1960 }, { "epoch": 0.5457265105924096, "grad_norm": 2.2097907066345215, "learning_rate": 0.000165696337522192, "loss": 2.8365, "step": 1961 }, { "epoch": 0.5460048005009218, "grad_norm": 2.4192047119140625, "learning_rate": 0.00016566332973335952, "loss": 2.7035, "step": 1962 }, { "epoch": 0.546283090409434, "grad_norm": 2.4762537479400635, "learning_rate": 0.00016563030936290175, "loss": 2.7399, "step": 1963 }, { "epoch": 0.5465613803179462, "grad_norm": 2.128631591796875, "learning_rate": 0.0001655972764171456, "loss": 2.6706, "step": 1964 }, { "epoch": 0.5468396702264584, "grad_norm": 2.702988862991333, "learning_rate": 0.0001655642309024205, "loss": 2.922, "step": 1965 }, { "epoch": 0.5471179601349706, "grad_norm": 2.3204996585845947, "learning_rate": 0.00016553117282505818, "loss": 3.0058, "step": 1966 }, { "epoch": 0.5473962500434828, "grad_norm": 2.3878731727600098, "learning_rate": 0.00016549810219139291, "loss": 2.7881, "step": 1967 }, { "epoch": 0.547674539951995, "grad_norm": 2.1982181072235107, "learning_rate": 0.0001654650190077613, "loss": 2.9485, "step": 1968 }, { "epoch": 0.5479528298605072, "grad_norm": 2.3051490783691406, "learning_rate": 0.00016543192328050227, "loss": 2.9112, "step": 1969 }, { "epoch": 0.5482311197690194, "grad_norm": 2.2694547176361084, "learning_rate": 0.0001653988150159573, "loss": 2.7732, "step": 1970 }, { "epoch": 0.5485094096775316, "grad_norm": 2.1985435485839844, "learning_rate": 0.00016536569422047015, "loss": 2.9587, "step": 1971 }, { "epoch": 0.5487876995860438, "grad_norm": 2.042534112930298, "learning_rate": 0.00016533256090038706, "loss": 2.7207, "step": 1972 }, { "epoch": 0.549065989494556, "grad_norm": 1.9338102340698242, "learning_rate": 0.00016529941506205663, "loss": 2.852, "step": 1973 }, { "epoch": 0.5493442794030682, "grad_norm": 2.37833309173584, "learning_rate": 0.0001652662567118299, "loss": 2.8111, "step": 1974 }, { "epoch": 0.5496225693115804, "grad_norm": 2.6836705207824707, "learning_rate": 0.00016523308585606023, "loss": 3.0788, "step": 1975 }, { "epoch": 0.5499008592200926, "grad_norm": 2.722048044204712, "learning_rate": 0.00016519990250110346, "loss": 2.9574, "step": 1976 }, { "epoch": 0.5501791491286048, "grad_norm": 2.6264002323150635, "learning_rate": 0.0001651667066533178, "loss": 2.9347, "step": 1977 }, { "epoch": 0.550457439037117, "grad_norm": 2.259922981262207, "learning_rate": 0.0001651334983190638, "loss": 2.9512, "step": 1978 }, { "epoch": 0.550735728945629, "grad_norm": 2.156153917312622, "learning_rate": 0.0001651002775047045, "loss": 2.7466, "step": 1979 }, { "epoch": 0.5510140188541413, "grad_norm": 2.4331908226013184, "learning_rate": 0.00016506704421660523, "loss": 3.013, "step": 1980 }, { "epoch": 0.5512923087626534, "grad_norm": 2.750307559967041, "learning_rate": 0.00016503379846113378, "loss": 2.9857, "step": 1981 }, { "epoch": 0.5515705986711656, "grad_norm": 2.240656614303589, "learning_rate": 0.00016500054024466029, "loss": 2.949, "step": 1982 }, { "epoch": 0.5518488885796778, "grad_norm": 2.4127752780914307, "learning_rate": 0.0001649672695735573, "loss": 2.649, "step": 1983 }, { "epoch": 0.55212717848819, "grad_norm": 2.0091280937194824, "learning_rate": 0.00016493398645419977, "loss": 2.6543, "step": 1984 }, { "epoch": 0.5524054683967022, "grad_norm": 2.3334176540374756, "learning_rate": 0.000164900690892965, "loss": 3.0344, "step": 1985 }, { "epoch": 0.5526837583052144, "grad_norm": 2.313554048538208, "learning_rate": 0.00016486738289623267, "loss": 2.6717, "step": 1986 }, { "epoch": 0.5529620482137266, "grad_norm": 2.1962478160858154, "learning_rate": 0.0001648340624703849, "loss": 3.078, "step": 1987 }, { "epoch": 0.5532403381222388, "grad_norm": 2.2124226093292236, "learning_rate": 0.0001648007296218061, "loss": 2.6876, "step": 1988 }, { "epoch": 0.553518628030751, "grad_norm": 2.434992551803589, "learning_rate": 0.0001647673843568832, "loss": 2.917, "step": 1989 }, { "epoch": 0.5537969179392632, "grad_norm": 2.146813154220581, "learning_rate": 0.0001647340266820053, "loss": 2.8534, "step": 1990 }, { "epoch": 0.5540752078477754, "grad_norm": 2.5056605339050293, "learning_rate": 0.00016470065660356413, "loss": 2.7945, "step": 1991 }, { "epoch": 0.5543534977562876, "grad_norm": 2.23988676071167, "learning_rate": 0.0001646672741279536, "loss": 2.9043, "step": 1992 }, { "epoch": 0.5546317876647998, "grad_norm": 2.5383193492889404, "learning_rate": 0.00016463387926157001, "loss": 3.0039, "step": 1993 }, { "epoch": 0.554910077573312, "grad_norm": 2.192621946334839, "learning_rate": 0.00016460047201081218, "loss": 2.8835, "step": 1994 }, { "epoch": 0.5551883674818242, "grad_norm": 2.5819473266601562, "learning_rate": 0.00016456705238208116, "loss": 2.97, "step": 1995 }, { "epoch": 0.5554666573903364, "grad_norm": 3.4661834239959717, "learning_rate": 0.00016453362038178047, "loss": 2.8627, "step": 1996 }, { "epoch": 0.5557449472988486, "grad_norm": 2.4773688316345215, "learning_rate": 0.00016450017601631588, "loss": 2.8568, "step": 1997 }, { "epoch": 0.5560232372073608, "grad_norm": 2.2663443088531494, "learning_rate": 0.00016446671929209566, "loss": 2.9237, "step": 1998 }, { "epoch": 0.556301527115873, "grad_norm": 2.646900177001953, "learning_rate": 0.00016443325021553036, "loss": 2.9215, "step": 1999 }, { "epoch": 0.5565798170243852, "grad_norm": 2.4180891513824463, "learning_rate": 0.00016439976879303295, "loss": 2.841, "step": 2000 }, { "epoch": 0.5565798170243852, "eval_loss": 2.9150915145874023, "eval_runtime": 84.3126, "eval_samples_per_second": 59.303, "eval_steps_per_second": 14.826, "step": 2000 }, { "epoch": 0.5568581069328974, "grad_norm": 1.9937694072723389, "learning_rate": 0.0001643662750310187, "loss": 2.8497, "step": 2001 }, { "epoch": 0.5571363968414096, "grad_norm": 2.1709563732147217, "learning_rate": 0.00016433276893590532, "loss": 3.0176, "step": 2002 }, { "epoch": 0.5574146867499218, "grad_norm": 2.087261438369751, "learning_rate": 0.00016429925051411284, "loss": 2.8818, "step": 2003 }, { "epoch": 0.5576929766584339, "grad_norm": 2.778231143951416, "learning_rate": 0.00016426571977206365, "loss": 2.8642, "step": 2004 }, { "epoch": 0.5579712665669461, "grad_norm": 2.2054693698883057, "learning_rate": 0.0001642321767161825, "loss": 2.8595, "step": 2005 }, { "epoch": 0.5582495564754583, "grad_norm": 2.5143532752990723, "learning_rate": 0.00016419862135289657, "loss": 2.8715, "step": 2006 }, { "epoch": 0.5585278463839705, "grad_norm": 2.4755825996398926, "learning_rate": 0.0001641650536886353, "loss": 2.897, "step": 2007 }, { "epoch": 0.5588061362924827, "grad_norm": 2.620387077331543, "learning_rate": 0.00016413147372983051, "loss": 3.0051, "step": 2008 }, { "epoch": 0.5590844262009949, "grad_norm": 2.2283756732940674, "learning_rate": 0.00016409788148291643, "loss": 2.781, "step": 2009 }, { "epoch": 0.559362716109507, "grad_norm": 2.3019275665283203, "learning_rate": 0.00016406427695432957, "loss": 2.9187, "step": 2010 }, { "epoch": 0.5596410060180192, "grad_norm": 2.2935285568237305, "learning_rate": 0.00016403066015050884, "loss": 2.836, "step": 2011 }, { "epoch": 0.5599192959265314, "grad_norm": 2.356058359146118, "learning_rate": 0.0001639970310778955, "loss": 2.7482, "step": 2012 }, { "epoch": 0.5601975858350436, "grad_norm": 2.482983350753784, "learning_rate": 0.00016396338974293317, "loss": 2.7292, "step": 2013 }, { "epoch": 0.5604758757435558, "grad_norm": 2.1267011165618896, "learning_rate": 0.00016392973615206777, "loss": 2.6327, "step": 2014 }, { "epoch": 0.560754165652068, "grad_norm": 2.4464073181152344, "learning_rate": 0.0001638960703117476, "loss": 3.0038, "step": 2015 }, { "epoch": 0.5610324555605802, "grad_norm": 2.501739740371704, "learning_rate": 0.00016386239222842335, "loss": 2.8496, "step": 2016 }, { "epoch": 0.5613107454690924, "grad_norm": 2.0744946002960205, "learning_rate": 0.00016382870190854794, "loss": 2.7997, "step": 2017 }, { "epoch": 0.5615890353776046, "grad_norm": 2.5704474449157715, "learning_rate": 0.00016379499935857678, "loss": 2.7746, "step": 2018 }, { "epoch": 0.5618673252861168, "grad_norm": 2.200807809829712, "learning_rate": 0.00016376128458496752, "loss": 2.7004, "step": 2019 }, { "epoch": 0.562145615194629, "grad_norm": 4.663360118865967, "learning_rate": 0.00016372755759418018, "loss": 3.0707, "step": 2020 }, { "epoch": 0.5624239051031412, "grad_norm": 2.140141010284424, "learning_rate": 0.0001636938183926771, "loss": 2.8506, "step": 2021 }, { "epoch": 0.5627021950116534, "grad_norm": 2.590257406234741, "learning_rate": 0.00016366006698692305, "loss": 3.0344, "step": 2022 }, { "epoch": 0.5629804849201656, "grad_norm": 2.606826066970825, "learning_rate": 0.00016362630338338506, "loss": 3.0664, "step": 2023 }, { "epoch": 0.5632587748286778, "grad_norm": 2.107863426208496, "learning_rate": 0.00016359252758853243, "loss": 2.4226, "step": 2024 }, { "epoch": 0.56353706473719, "grad_norm": 2.0325253009796143, "learning_rate": 0.00016355873960883694, "loss": 2.5492, "step": 2025 }, { "epoch": 0.5638153546457022, "grad_norm": 2.4454360008239746, "learning_rate": 0.00016352493945077264, "loss": 2.7257, "step": 2026 }, { "epoch": 0.5640936445542144, "grad_norm": 2.6493520736694336, "learning_rate": 0.00016349112712081592, "loss": 3.0761, "step": 2027 }, { "epoch": 0.5643719344627266, "grad_norm": 2.120412588119507, "learning_rate": 0.00016345730262544547, "loss": 2.8448, "step": 2028 }, { "epoch": 0.5646502243712387, "grad_norm": 2.472684621810913, "learning_rate": 0.00016342346597114237, "loss": 3.0654, "step": 2029 }, { "epoch": 0.5649285142797509, "grad_norm": 2.2580957412719727, "learning_rate": 0.0001633896171643899, "loss": 2.8239, "step": 2030 }, { "epoch": 0.5652068041882631, "grad_norm": 2.6949515342712402, "learning_rate": 0.0001633557562116739, "loss": 2.7548, "step": 2031 }, { "epoch": 0.5654850940967753, "grad_norm": 2.790701150894165, "learning_rate": 0.00016332188311948234, "loss": 3.0079, "step": 2032 }, { "epoch": 0.5657633840052875, "grad_norm": 2.8903000354766846, "learning_rate": 0.0001632879978943056, "loss": 2.9429, "step": 2033 }, { "epoch": 0.5660416739137997, "grad_norm": 2.842350482940674, "learning_rate": 0.00016325410054263633, "loss": 2.8975, "step": 2034 }, { "epoch": 0.5663199638223119, "grad_norm": 2.1550087928771973, "learning_rate": 0.00016322019107096957, "loss": 2.9263, "step": 2035 }, { "epoch": 0.5665982537308241, "grad_norm": 2.109544277191162, "learning_rate": 0.00016318626948580258, "loss": 2.8787, "step": 2036 }, { "epoch": 0.5668765436393363, "grad_norm": 2.1769118309020996, "learning_rate": 0.0001631523357936351, "loss": 2.8881, "step": 2037 }, { "epoch": 0.5671548335478485, "grad_norm": 2.429656505584717, "learning_rate": 0.00016311839000096904, "loss": 2.7744, "step": 2038 }, { "epoch": 0.5674331234563607, "grad_norm": 2.260044813156128, "learning_rate": 0.00016308443211430872, "loss": 2.8389, "step": 2039 }, { "epoch": 0.5677114133648729, "grad_norm": 2.3132448196411133, "learning_rate": 0.00016305046214016073, "loss": 2.91, "step": 2040 }, { "epoch": 0.567989703273385, "grad_norm": 2.634324312210083, "learning_rate": 0.000163016480085034, "loss": 2.9771, "step": 2041 }, { "epoch": 0.5682679931818972, "grad_norm": 2.190044403076172, "learning_rate": 0.00016298248595543978, "loss": 2.5307, "step": 2042 }, { "epoch": 0.5685462830904094, "grad_norm": 2.3761801719665527, "learning_rate": 0.00016294847975789162, "loss": 3.079, "step": 2043 }, { "epoch": 0.5688245729989216, "grad_norm": 2.232034683227539, "learning_rate": 0.0001629144614989053, "loss": 3.0625, "step": 2044 }, { "epoch": 0.5691028629074338, "grad_norm": 2.079737424850464, "learning_rate": 0.0001628804311849991, "loss": 2.5829, "step": 2045 }, { "epoch": 0.569381152815946, "grad_norm": 2.3784873485565186, "learning_rate": 0.00016284638882269346, "loss": 2.7726, "step": 2046 }, { "epoch": 0.5696594427244582, "grad_norm": 2.390270948410034, "learning_rate": 0.0001628123344185112, "loss": 3.0228, "step": 2047 }, { "epoch": 0.5699377326329704, "grad_norm": 2.2414469718933105, "learning_rate": 0.00016277826797897737, "loss": 2.6981, "step": 2048 }, { "epoch": 0.5702160225414826, "grad_norm": 2.436793327331543, "learning_rate": 0.0001627441895106194, "loss": 2.936, "step": 2049 }, { "epoch": 0.5704943124499948, "grad_norm": 2.377934217453003, "learning_rate": 0.000162710099019967, "loss": 2.8147, "step": 2050 }, { "epoch": 0.570772602358507, "grad_norm": 2.30033016204834, "learning_rate": 0.00016267599651355221, "loss": 2.8028, "step": 2051 }, { "epoch": 0.5710508922670192, "grad_norm": 2.304966926574707, "learning_rate": 0.0001626418819979093, "loss": 2.9616, "step": 2052 }, { "epoch": 0.5713291821755314, "grad_norm": 2.323075771331787, "learning_rate": 0.0001626077554795749, "loss": 2.9705, "step": 2053 }, { "epoch": 0.5716074720840435, "grad_norm": 2.281712293624878, "learning_rate": 0.00016257361696508794, "loss": 2.7624, "step": 2054 }, { "epoch": 0.5718857619925557, "grad_norm": 2.581143617630005, "learning_rate": 0.00016253946646098966, "loss": 2.9806, "step": 2055 }, { "epoch": 0.5721640519010679, "grad_norm": 2.0533857345581055, "learning_rate": 0.00016250530397382353, "loss": 2.7637, "step": 2056 }, { "epoch": 0.5724423418095801, "grad_norm": 2.2935004234313965, "learning_rate": 0.00016247112951013536, "loss": 3.0307, "step": 2057 }, { "epoch": 0.5727206317180923, "grad_norm": 2.486053228378296, "learning_rate": 0.0001624369430764733, "loss": 2.7748, "step": 2058 }, { "epoch": 0.5729989216266045, "grad_norm": 2.610978126525879, "learning_rate": 0.00016240274467938767, "loss": 2.7952, "step": 2059 }, { "epoch": 0.5732772115351167, "grad_norm": 2.512754440307617, "learning_rate": 0.00016236853432543123, "loss": 2.7704, "step": 2060 }, { "epoch": 0.5735555014436289, "grad_norm": 2.4459645748138428, "learning_rate": 0.00016233431202115895, "loss": 2.9253, "step": 2061 }, { "epoch": 0.5738337913521411, "grad_norm": 2.58434796333313, "learning_rate": 0.00016230007777312807, "loss": 2.767, "step": 2062 }, { "epoch": 0.5741120812606533, "grad_norm": 2.4839210510253906, "learning_rate": 0.00016226583158789813, "loss": 3.06, "step": 2063 }, { "epoch": 0.5743903711691655, "grad_norm": 2.3217861652374268, "learning_rate": 0.00016223157347203105, "loss": 2.8039, "step": 2064 }, { "epoch": 0.5746686610776777, "grad_norm": 2.1571245193481445, "learning_rate": 0.0001621973034320909, "loss": 2.5247, "step": 2065 }, { "epoch": 0.5749469509861899, "grad_norm": 2.8123562335968018, "learning_rate": 0.00016216302147464414, "loss": 2.9259, "step": 2066 }, { "epoch": 0.5752252408947021, "grad_norm": 2.207058906555176, "learning_rate": 0.00016212872760625944, "loss": 2.7393, "step": 2067 }, { "epoch": 0.5755035308032143, "grad_norm": 2.9887354373931885, "learning_rate": 0.0001620944218335078, "loss": 2.8283, "step": 2068 }, { "epoch": 0.5757818207117265, "grad_norm": 2.41922664642334, "learning_rate": 0.00016206010416296244, "loss": 3.1678, "step": 2069 }, { "epoch": 0.5760601106202387, "grad_norm": 2.1267218589782715, "learning_rate": 0.000162025774601199, "loss": 2.7397, "step": 2070 }, { "epoch": 0.5763384005287508, "grad_norm": 2.311589479446411, "learning_rate": 0.00016199143315479518, "loss": 3.055, "step": 2071 }, { "epoch": 0.576616690437263, "grad_norm": 2.1903018951416016, "learning_rate": 0.00016195707983033118, "loss": 2.7724, "step": 2072 }, { "epoch": 0.5768949803457752, "grad_norm": 2.662034034729004, "learning_rate": 0.00016192271463438934, "loss": 2.8215, "step": 2073 }, { "epoch": 0.5771732702542874, "grad_norm": 2.6136488914489746, "learning_rate": 0.0001618883375735543, "loss": 2.9932, "step": 2074 }, { "epoch": 0.5774515601627996, "grad_norm": 2.6986947059631348, "learning_rate": 0.00016185394865441297, "loss": 2.9471, "step": 2075 }, { "epoch": 0.5777298500713118, "grad_norm": 2.858654260635376, "learning_rate": 0.00016181954788355458, "loss": 3.0513, "step": 2076 }, { "epoch": 0.578008139979824, "grad_norm": 2.45855975151062, "learning_rate": 0.0001617851352675706, "loss": 2.8446, "step": 2077 }, { "epoch": 0.5782864298883362, "grad_norm": 2.4820330142974854, "learning_rate": 0.0001617507108130547, "loss": 2.9324, "step": 2078 }, { "epoch": 0.5785647197968483, "grad_norm": 2.380343198776245, "learning_rate": 0.00016171627452660293, "loss": 2.8217, "step": 2079 }, { "epoch": 0.5788430097053605, "grad_norm": 2.5176961421966553, "learning_rate": 0.0001616818264148136, "loss": 2.9874, "step": 2080 }, { "epoch": 0.5791212996138727, "grad_norm": 2.63055157661438, "learning_rate": 0.0001616473664842872, "loss": 3.1282, "step": 2081 }, { "epoch": 0.5793995895223849, "grad_norm": 2.1881473064422607, "learning_rate": 0.00016161289474162652, "loss": 2.7393, "step": 2082 }, { "epoch": 0.5796778794308971, "grad_norm": 2.74493408203125, "learning_rate": 0.00016157841119343664, "loss": 2.8003, "step": 2083 }, { "epoch": 0.5799561693394093, "grad_norm": 2.464918613433838, "learning_rate": 0.0001615439158463249, "loss": 2.6401, "step": 2084 }, { "epoch": 0.5802344592479215, "grad_norm": 2.658651828765869, "learning_rate": 0.00016150940870690084, "loss": 2.8619, "step": 2085 }, { "epoch": 0.5805127491564337, "grad_norm": 2.411074638366699, "learning_rate": 0.0001614748897817764, "loss": 2.7545, "step": 2086 }, { "epoch": 0.5807910390649459, "grad_norm": 2.116706371307373, "learning_rate": 0.00016144035907756564, "loss": 2.8296, "step": 2087 }, { "epoch": 0.5810693289734581, "grad_norm": 3.1943864822387695, "learning_rate": 0.00016140581660088488, "loss": 3.0769, "step": 2088 }, { "epoch": 0.5813476188819703, "grad_norm": 2.0831809043884277, "learning_rate": 0.00016137126235835276, "loss": 2.8647, "step": 2089 }, { "epoch": 0.5816259087904825, "grad_norm": 2.9632339477539062, "learning_rate": 0.0001613366963565902, "loss": 2.7625, "step": 2090 }, { "epoch": 0.5819041986989947, "grad_norm": 2.417341470718384, "learning_rate": 0.00016130211860222024, "loss": 3.041, "step": 2091 }, { "epoch": 0.5821824886075069, "grad_norm": 2.3462271690368652, "learning_rate": 0.00016126752910186834, "loss": 2.9865, "step": 2092 }, { "epoch": 0.5824607785160191, "grad_norm": 2.123222589492798, "learning_rate": 0.00016123292786216208, "loss": 2.983, "step": 2093 }, { "epoch": 0.5827390684245313, "grad_norm": 2.4878523349761963, "learning_rate": 0.0001611983148897314, "loss": 3.095, "step": 2094 }, { "epoch": 0.5830173583330435, "grad_norm": 2.2196364402770996, "learning_rate": 0.0001611636901912083, "loss": 2.7891, "step": 2095 }, { "epoch": 0.5832956482415557, "grad_norm": 2.2717201709747314, "learning_rate": 0.00016112905377322728, "loss": 2.8791, "step": 2096 }, { "epoch": 0.5835739381500679, "grad_norm": 2.2012338638305664, "learning_rate": 0.00016109440564242492, "loss": 2.8546, "step": 2097 }, { "epoch": 0.5838522280585801, "grad_norm": 2.1055173873901367, "learning_rate": 0.00016105974580544005, "loss": 2.8989, "step": 2098 }, { "epoch": 0.5841305179670923, "grad_norm": 2.6595404148101807, "learning_rate": 0.0001610250742689138, "loss": 2.9118, "step": 2099 }, { "epoch": 0.5844088078756045, "grad_norm": 3.0985939502716064, "learning_rate": 0.00016099039103948951, "loss": 3.0443, "step": 2100 }, { "epoch": 0.5844088078756045, "eval_loss": 2.9115405082702637, "eval_runtime": 84.4657, "eval_samples_per_second": 59.196, "eval_steps_per_second": 14.799, "step": 2100 }, { "epoch": 0.5846870977841166, "grad_norm": 2.1743268966674805, "learning_rate": 0.0001609556961238128, "loss": 2.6993, "step": 2101 }, { "epoch": 0.5849653876926288, "grad_norm": 2.407303810119629, "learning_rate": 0.0001609209895285314, "loss": 2.971, "step": 2102 }, { "epoch": 0.585243677601141, "grad_norm": 2.494429588317871, "learning_rate": 0.00016088627126029548, "loss": 2.818, "step": 2103 }, { "epoch": 0.5855219675096531, "grad_norm": 2.112222671508789, "learning_rate": 0.0001608515413257573, "loss": 2.8534, "step": 2104 }, { "epoch": 0.5858002574181653, "grad_norm": 2.290632724761963, "learning_rate": 0.0001608167997315714, "loss": 2.7328, "step": 2105 }, { "epoch": 0.5860785473266775, "grad_norm": 2.2767767906188965, "learning_rate": 0.00016078204648439453, "loss": 2.8271, "step": 2106 }, { "epoch": 0.5863568372351897, "grad_norm": 2.3380379676818848, "learning_rate": 0.00016074728159088574, "loss": 3.0473, "step": 2107 }, { "epoch": 0.5866351271437019, "grad_norm": 2.144124984741211, "learning_rate": 0.00016071250505770624, "loss": 2.6827, "step": 2108 }, { "epoch": 0.5869134170522141, "grad_norm": 2.3525285720825195, "learning_rate": 0.00016067771689151948, "loss": 3.1214, "step": 2109 }, { "epoch": 0.5871917069607263, "grad_norm": 2.9980599880218506, "learning_rate": 0.00016064291709899116, "loss": 2.9473, "step": 2110 }, { "epoch": 0.5874699968692385, "grad_norm": 2.7081539630889893, "learning_rate": 0.00016060810568678923, "loss": 3.0321, "step": 2111 }, { "epoch": 0.5877482867777507, "grad_norm": 2.236114501953125, "learning_rate": 0.0001605732826615838, "loss": 2.9224, "step": 2112 }, { "epoch": 0.5880265766862629, "grad_norm": 1.9092764854431152, "learning_rate": 0.00016053844803004728, "loss": 2.5706, "step": 2113 }, { "epoch": 0.5883048665947751, "grad_norm": 2.2694549560546875, "learning_rate": 0.00016050360179885423, "loss": 2.9611, "step": 2114 }, { "epoch": 0.5885831565032873, "grad_norm": 2.4863059520721436, "learning_rate": 0.00016046874397468148, "loss": 2.9781, "step": 2115 }, { "epoch": 0.5888614464117995, "grad_norm": 5.281558036804199, "learning_rate": 0.00016043387456420808, "loss": 2.9867, "step": 2116 }, { "epoch": 0.5891397363203117, "grad_norm": 2.5232224464416504, "learning_rate": 0.0001603989935741153, "loss": 3.0309, "step": 2117 }, { "epoch": 0.5894180262288239, "grad_norm": 2.153146743774414, "learning_rate": 0.00016036410101108662, "loss": 2.7201, "step": 2118 }, { "epoch": 0.5896963161373361, "grad_norm": 2.1310219764709473, "learning_rate": 0.0001603291968818077, "loss": 2.9546, "step": 2119 }, { "epoch": 0.5899746060458483, "grad_norm": 2.357241153717041, "learning_rate": 0.0001602942811929665, "loss": 2.8963, "step": 2120 }, { "epoch": 0.5902528959543605, "grad_norm": 3.3376731872558594, "learning_rate": 0.00016025935395125313, "loss": 2.8033, "step": 2121 }, { "epoch": 0.5905311858628727, "grad_norm": 2.6821534633636475, "learning_rate": 0.00016022441516335992, "loss": 2.9784, "step": 2122 }, { "epoch": 0.5908094757713849, "grad_norm": 2.1204159259796143, "learning_rate": 0.00016018946483598148, "loss": 2.6149, "step": 2123 }, { "epoch": 0.5910877656798971, "grad_norm": 2.0825066566467285, "learning_rate": 0.00016015450297581448, "loss": 2.5005, "step": 2124 }, { "epoch": 0.5913660555884093, "grad_norm": 2.26924467086792, "learning_rate": 0.000160119529589558, "loss": 2.8727, "step": 2125 }, { "epoch": 0.5916443454969215, "grad_norm": 2.272739887237549, "learning_rate": 0.00016008454468391308, "loss": 2.9219, "step": 2126 }, { "epoch": 0.5919226354054337, "grad_norm": 2.393573522567749, "learning_rate": 0.00016004954826558327, "loss": 2.7882, "step": 2127 }, { "epoch": 0.5922009253139459, "grad_norm": 2.2192962169647217, "learning_rate": 0.0001600145403412741, "loss": 2.8759, "step": 2128 }, { "epoch": 0.5924792152224579, "grad_norm": 2.444972038269043, "learning_rate": 0.00015997952091769335, "loss": 2.8933, "step": 2129 }, { "epoch": 0.5927575051309701, "grad_norm": 2.541269063949585, "learning_rate": 0.00015994449000155105, "loss": 3.0452, "step": 2130 }, { "epoch": 0.5930357950394823, "grad_norm": 2.4052085876464844, "learning_rate": 0.0001599094475995594, "loss": 2.9314, "step": 2131 }, { "epoch": 0.5933140849479945, "grad_norm": 2.4039876461029053, "learning_rate": 0.00015987439371843284, "loss": 2.9293, "step": 2132 }, { "epoch": 0.5935923748565067, "grad_norm": 3.198533535003662, "learning_rate": 0.00015983932836488788, "loss": 3.0907, "step": 2133 }, { "epoch": 0.5938706647650189, "grad_norm": 2.560333490371704, "learning_rate": 0.00015980425154564344, "loss": 3.0959, "step": 2134 }, { "epoch": 0.5941489546735311, "grad_norm": 2.261425495147705, "learning_rate": 0.00015976916326742048, "loss": 2.9643, "step": 2135 }, { "epoch": 0.5944272445820433, "grad_norm": 2.1651453971862793, "learning_rate": 0.0001597340635369422, "loss": 2.8279, "step": 2136 }, { "epoch": 0.5947055344905555, "grad_norm": 2.2444875240325928, "learning_rate": 0.00015969895236093399, "loss": 2.7908, "step": 2137 }, { "epoch": 0.5949838243990677, "grad_norm": 2.2363498210906982, "learning_rate": 0.00015966382974612336, "loss": 2.7145, "step": 2138 }, { "epoch": 0.5952621143075799, "grad_norm": 2.556377649307251, "learning_rate": 0.00015962869569924025, "loss": 2.7909, "step": 2139 }, { "epoch": 0.5955404042160921, "grad_norm": 2.8886547088623047, "learning_rate": 0.00015959355022701648, "loss": 2.8417, "step": 2140 }, { "epoch": 0.5958186941246043, "grad_norm": 2.375072717666626, "learning_rate": 0.00015955839333618626, "loss": 3.0116, "step": 2141 }, { "epoch": 0.5960969840331165, "grad_norm": 2.344085693359375, "learning_rate": 0.00015952322503348597, "loss": 2.9525, "step": 2142 }, { "epoch": 0.5963752739416287, "grad_norm": 2.034959554672241, "learning_rate": 0.0001594880453256541, "loss": 2.7102, "step": 2143 }, { "epoch": 0.5966535638501409, "grad_norm": 2.3794124126434326, "learning_rate": 0.00015945285421943136, "loss": 2.8817, "step": 2144 }, { "epoch": 0.5969318537586531, "grad_norm": 2.544445276260376, "learning_rate": 0.0001594176517215607, "loss": 2.8628, "step": 2145 }, { "epoch": 0.5972101436671653, "grad_norm": 2.3793671131134033, "learning_rate": 0.0001593824378387871, "loss": 2.9377, "step": 2146 }, { "epoch": 0.5974884335756775, "grad_norm": 2.032205581665039, "learning_rate": 0.00015934721257785794, "loss": 2.696, "step": 2147 }, { "epoch": 0.5977667234841897, "grad_norm": 2.5437817573547363, "learning_rate": 0.00015931197594552262, "loss": 2.8596, "step": 2148 }, { "epoch": 0.5980450133927019, "grad_norm": 2.505988597869873, "learning_rate": 0.00015927672794853278, "loss": 2.7841, "step": 2149 }, { "epoch": 0.5983233033012141, "grad_norm": 2.2664225101470947, "learning_rate": 0.00015924146859364218, "loss": 2.7527, "step": 2150 }, { "epoch": 0.5986015932097263, "grad_norm": 2.3853771686553955, "learning_rate": 0.00015920619788760683, "loss": 2.8167, "step": 2151 }, { "epoch": 0.5988798831182385, "grad_norm": 2.420379877090454, "learning_rate": 0.00015917091583718488, "loss": 2.8705, "step": 2152 }, { "epoch": 0.5991581730267506, "grad_norm": 2.6942567825317383, "learning_rate": 0.00015913562244913667, "loss": 3.2588, "step": 2153 }, { "epoch": 0.5994364629352628, "grad_norm": 2.265284538269043, "learning_rate": 0.00015910031773022464, "loss": 2.7761, "step": 2154 }, { "epoch": 0.599714752843775, "grad_norm": 2.474555730819702, "learning_rate": 0.00015906500168721353, "loss": 2.7849, "step": 2155 }, { "epoch": 0.5999930427522872, "grad_norm": 2.5489721298217773, "learning_rate": 0.00015902967432687012, "loss": 2.8618, "step": 2156 }, { "epoch": 0.6002713326607994, "grad_norm": 2.2978951930999756, "learning_rate": 0.00015899433565596348, "loss": 3.0135, "step": 2157 }, { "epoch": 0.6005496225693115, "grad_norm": 2.064039945602417, "learning_rate": 0.00015895898568126475, "loss": 2.6934, "step": 2158 }, { "epoch": 0.6008279124778237, "grad_norm": 2.2669320106506348, "learning_rate": 0.00015892362440954726, "loss": 2.5319, "step": 2159 }, { "epoch": 0.6011062023863359, "grad_norm": 2.406996011734009, "learning_rate": 0.00015888825184758653, "loss": 3.1433, "step": 2160 }, { "epoch": 0.6013844922948481, "grad_norm": 2.2653729915618896, "learning_rate": 0.00015885286800216024, "loss": 2.9701, "step": 2161 }, { "epoch": 0.6016627822033603, "grad_norm": 2.445518970489502, "learning_rate": 0.00015881747288004822, "loss": 2.9713, "step": 2162 }, { "epoch": 0.6019410721118725, "grad_norm": 2.485112190246582, "learning_rate": 0.00015878206648803244, "loss": 3.0853, "step": 2163 }, { "epoch": 0.6022193620203847, "grad_norm": 2.3362393379211426, "learning_rate": 0.00015874664883289708, "loss": 3.0144, "step": 2164 }, { "epoch": 0.6024976519288969, "grad_norm": 2.296271324157715, "learning_rate": 0.00015871121992142842, "loss": 3.0965, "step": 2165 }, { "epoch": 0.6027759418374091, "grad_norm": 2.4204819202423096, "learning_rate": 0.0001586757797604149, "loss": 2.9477, "step": 2166 }, { "epoch": 0.6030542317459213, "grad_norm": 2.3704886436462402, "learning_rate": 0.00015864032835664725, "loss": 2.9468, "step": 2167 }, { "epoch": 0.6033325216544335, "grad_norm": 2.279327630996704, "learning_rate": 0.00015860486571691812, "loss": 2.5575, "step": 2168 }, { "epoch": 0.6036108115629457, "grad_norm": 2.201233386993408, "learning_rate": 0.00015856939184802254, "loss": 2.6708, "step": 2169 }, { "epoch": 0.6038891014714579, "grad_norm": 2.4510738849639893, "learning_rate": 0.0001585339067567575, "loss": 2.6958, "step": 2170 }, { "epoch": 0.6041673913799701, "grad_norm": 2.3983023166656494, "learning_rate": 0.0001584984104499223, "loss": 2.9562, "step": 2171 }, { "epoch": 0.6044456812884823, "grad_norm": 2.343332052230835, "learning_rate": 0.00015846290293431833, "loss": 2.9116, "step": 2172 }, { "epoch": 0.6047239711969945, "grad_norm": 2.3738043308258057, "learning_rate": 0.000158427384216749, "loss": 2.6765, "step": 2173 }, { "epoch": 0.6050022611055067, "grad_norm": 2.160430669784546, "learning_rate": 0.00015839185430402015, "loss": 2.9949, "step": 2174 }, { "epoch": 0.6052805510140189, "grad_norm": 2.3140578269958496, "learning_rate": 0.00015835631320293945, "loss": 2.8888, "step": 2175 }, { "epoch": 0.6055588409225311, "grad_norm": 2.4452648162841797, "learning_rate": 0.000158320760920317, "loss": 3.1985, "step": 2176 }, { "epoch": 0.6058371308310433, "grad_norm": 2.781312942504883, "learning_rate": 0.00015828519746296477, "loss": 3.2221, "step": 2177 }, { "epoch": 0.6061154207395554, "grad_norm": 2.1096351146698, "learning_rate": 0.00015824962283769712, "loss": 2.7647, "step": 2178 }, { "epoch": 0.6063937106480676, "grad_norm": 2.3364462852478027, "learning_rate": 0.00015821403705133035, "loss": 2.8477, "step": 2179 }, { "epoch": 0.6066720005565798, "grad_norm": 2.0927674770355225, "learning_rate": 0.00015817844011068303, "loss": 3.0232, "step": 2180 }, { "epoch": 0.606950290465092, "grad_norm": 2.2776570320129395, "learning_rate": 0.00015814283202257581, "loss": 3.0373, "step": 2181 }, { "epoch": 0.6072285803736042, "grad_norm": 2.5803277492523193, "learning_rate": 0.0001581072127938315, "loss": 2.8857, "step": 2182 }, { "epoch": 0.6075068702821164, "grad_norm": 2.1546480655670166, "learning_rate": 0.000158071582431275, "loss": 2.8789, "step": 2183 }, { "epoch": 0.6077851601906286, "grad_norm": 2.029275894165039, "learning_rate": 0.00015803594094173338, "loss": 2.6751, "step": 2184 }, { "epoch": 0.6080634500991408, "grad_norm": 2.071354627609253, "learning_rate": 0.00015800028833203592, "loss": 2.9265, "step": 2185 }, { "epoch": 0.608341740007653, "grad_norm": 1.9437540769577026, "learning_rate": 0.00015796462460901384, "loss": 2.7398, "step": 2186 }, { "epoch": 0.6086200299161652, "grad_norm": 2.18601393699646, "learning_rate": 0.00015792894977950065, "loss": 2.887, "step": 2187 }, { "epoch": 0.6088983198246773, "grad_norm": 2.006263017654419, "learning_rate": 0.00015789326385033195, "loss": 2.7292, "step": 2188 }, { "epoch": 0.6091766097331895, "grad_norm": 2.293865919113159, "learning_rate": 0.0001578575668283454, "loss": 2.7669, "step": 2189 }, { "epoch": 0.6094548996417017, "grad_norm": 2.5924177169799805, "learning_rate": 0.0001578218587203809, "loss": 2.871, "step": 2190 }, { "epoch": 0.6097331895502139, "grad_norm": 2.434884786605835, "learning_rate": 0.00015778613953328034, "loss": 3.017, "step": 2191 }, { "epoch": 0.6100114794587261, "grad_norm": 2.2052009105682373, "learning_rate": 0.00015775040927388788, "loss": 2.9211, "step": 2192 }, { "epoch": 0.6102897693672383, "grad_norm": 2.2576346397399902, "learning_rate": 0.0001577146679490497, "loss": 3.0011, "step": 2193 }, { "epoch": 0.6105680592757505, "grad_norm": 2.3031089305877686, "learning_rate": 0.00015767891556561412, "loss": 2.707, "step": 2194 }, { "epoch": 0.6108463491842627, "grad_norm": 2.285851001739502, "learning_rate": 0.00015764315213043158, "loss": 2.7782, "step": 2195 }, { "epoch": 0.6111246390927749, "grad_norm": 2.0939157009124756, "learning_rate": 0.00015760737765035463, "loss": 2.8451, "step": 2196 }, { "epoch": 0.6114029290012871, "grad_norm": 2.3285579681396484, "learning_rate": 0.00015757159213223798, "loss": 2.8875, "step": 2197 }, { "epoch": 0.6116812189097993, "grad_norm": 2.5175721645355225, "learning_rate": 0.0001575357955829384, "loss": 2.8349, "step": 2198 }, { "epoch": 0.6119595088183115, "grad_norm": 3.293410301208496, "learning_rate": 0.00015749998800931487, "loss": 3.0401, "step": 2199 }, { "epoch": 0.6122377987268237, "grad_norm": 2.4121453762054443, "learning_rate": 0.0001574641694182283, "loss": 2.7652, "step": 2200 }, { "epoch": 0.6122377987268237, "eval_loss": 2.902242660522461, "eval_runtime": 84.7294, "eval_samples_per_second": 59.011, "eval_steps_per_second": 14.753, "step": 2200 }, { "epoch": 0.6125160886353359, "grad_norm": 2.65045166015625, "learning_rate": 0.00015742833981654191, "loss": 3.1452, "step": 2201 }, { "epoch": 0.6127943785438481, "grad_norm": 2.2562432289123535, "learning_rate": 0.0001573924992111209, "loss": 2.7859, "step": 2202 }, { "epoch": 0.6130726684523602, "grad_norm": 2.15720534324646, "learning_rate": 0.00015735664760883262, "loss": 2.6121, "step": 2203 }, { "epoch": 0.6133509583608724, "grad_norm": 2.268415689468384, "learning_rate": 0.00015732078501654652, "loss": 2.8629, "step": 2204 }, { "epoch": 0.6136292482693846, "grad_norm": 2.2984490394592285, "learning_rate": 0.0001572849114411342, "loss": 2.8243, "step": 2205 }, { "epoch": 0.6139075381778968, "grad_norm": 2.3487210273742676, "learning_rate": 0.0001572490268894693, "loss": 2.9669, "step": 2206 }, { "epoch": 0.614185828086409, "grad_norm": 2.2375307083129883, "learning_rate": 0.0001572131313684276, "loss": 2.9135, "step": 2207 }, { "epoch": 0.6144641179949212, "grad_norm": 2.2432563304901123, "learning_rate": 0.00015717722488488696, "loss": 2.817, "step": 2208 }, { "epoch": 0.6147424079034334, "grad_norm": 2.4378702640533447, "learning_rate": 0.00015714130744572734, "loss": 2.8261, "step": 2209 }, { "epoch": 0.6150206978119456, "grad_norm": 2.5774154663085938, "learning_rate": 0.00015710537905783088, "loss": 3.0569, "step": 2210 }, { "epoch": 0.6152989877204578, "grad_norm": 2.7691664695739746, "learning_rate": 0.0001570694397280817, "loss": 3.1016, "step": 2211 }, { "epoch": 0.61557727762897, "grad_norm": 2.736478805541992, "learning_rate": 0.00015703348946336605, "loss": 2.8962, "step": 2212 }, { "epoch": 0.6158555675374822, "grad_norm": 2.300225019454956, "learning_rate": 0.0001569975282705723, "loss": 3.0661, "step": 2213 }, { "epoch": 0.6161338574459944, "grad_norm": 2.354440450668335, "learning_rate": 0.00015696155615659092, "loss": 2.6363, "step": 2214 }, { "epoch": 0.6164121473545066, "grad_norm": 2.4289002418518066, "learning_rate": 0.00015692557312831448, "loss": 2.9435, "step": 2215 }, { "epoch": 0.6166904372630188, "grad_norm": 2.489961862564087, "learning_rate": 0.00015688957919263757, "loss": 2.9762, "step": 2216 }, { "epoch": 0.616968727171531, "grad_norm": 2.6834914684295654, "learning_rate": 0.00015685357435645696, "loss": 2.8746, "step": 2217 }, { "epoch": 0.6172470170800431, "grad_norm": 3.5694053173065186, "learning_rate": 0.00015681755862667144, "loss": 2.8143, "step": 2218 }, { "epoch": 0.6175253069885553, "grad_norm": 2.336108446121216, "learning_rate": 0.00015678153201018194, "loss": 2.784, "step": 2219 }, { "epoch": 0.6178035968970675, "grad_norm": 2.424042224884033, "learning_rate": 0.0001567454945138914, "loss": 3.0074, "step": 2220 }, { "epoch": 0.6180818868055797, "grad_norm": 2.39479398727417, "learning_rate": 0.00015670944614470497, "loss": 2.6299, "step": 2221 }, { "epoch": 0.6183601767140919, "grad_norm": 3.448154926300049, "learning_rate": 0.00015667338690952977, "loss": 3.1881, "step": 2222 }, { "epoch": 0.6186384666226041, "grad_norm": 2.380462884902954, "learning_rate": 0.00015663731681527505, "loss": 3.0321, "step": 2223 }, { "epoch": 0.6189167565311163, "grad_norm": 2.4951112270355225, "learning_rate": 0.0001566012358688521, "loss": 3.1458, "step": 2224 }, { "epoch": 0.6191950464396285, "grad_norm": 2.554395914077759, "learning_rate": 0.00015656514407717435, "loss": 3.0326, "step": 2225 }, { "epoch": 0.6194733363481407, "grad_norm": 2.3989994525909424, "learning_rate": 0.0001565290414471573, "loss": 2.7585, "step": 2226 }, { "epoch": 0.6197516262566529, "grad_norm": 2.2682552337646484, "learning_rate": 0.00015649292798571845, "loss": 2.602, "step": 2227 }, { "epoch": 0.620029916165165, "grad_norm": 2.5908021926879883, "learning_rate": 0.0001564568036997775, "loss": 2.9183, "step": 2228 }, { "epoch": 0.6203082060736772, "grad_norm": 2.202775001525879, "learning_rate": 0.00015642066859625607, "loss": 2.9247, "step": 2229 }, { "epoch": 0.6205864959821894, "grad_norm": 2.418424606323242, "learning_rate": 0.00015638452268207807, "loss": 3.0348, "step": 2230 }, { "epoch": 0.6208647858907016, "grad_norm": 2.7670645713806152, "learning_rate": 0.00015634836596416918, "loss": 2.969, "step": 2231 }, { "epoch": 0.6211430757992138, "grad_norm": 2.3293581008911133, "learning_rate": 0.00015631219844945745, "loss": 2.7433, "step": 2232 }, { "epoch": 0.621421365707726, "grad_norm": 2.1599066257476807, "learning_rate": 0.00015627602014487284, "loss": 2.8558, "step": 2233 }, { "epoch": 0.6216996556162382, "grad_norm": 2.2230799198150635, "learning_rate": 0.00015623983105734736, "loss": 2.7859, "step": 2234 }, { "epoch": 0.6219779455247504, "grad_norm": 2.3161003589630127, "learning_rate": 0.00015620363119381517, "loss": 2.7508, "step": 2235 }, { "epoch": 0.6222562354332626, "grad_norm": 2.337570905685425, "learning_rate": 0.00015616742056121248, "loss": 3.1568, "step": 2236 }, { "epoch": 0.6225345253417748, "grad_norm": 2.2100698947906494, "learning_rate": 0.00015613119916647747, "loss": 3.1009, "step": 2237 }, { "epoch": 0.622812815250287, "grad_norm": 2.327989101409912, "learning_rate": 0.00015609496701655053, "loss": 2.8803, "step": 2238 }, { "epoch": 0.6230911051587992, "grad_norm": 2.295416831970215, "learning_rate": 0.00015605872411837398, "loss": 2.984, "step": 2239 }, { "epoch": 0.6233693950673114, "grad_norm": 2.265650987625122, "learning_rate": 0.00015602247047889227, "loss": 2.8083, "step": 2240 }, { "epoch": 0.6236476849758236, "grad_norm": 2.2248129844665527, "learning_rate": 0.00015598620610505188, "loss": 2.7934, "step": 2241 }, { "epoch": 0.6239259748843358, "grad_norm": 2.349078893661499, "learning_rate": 0.00015594993100380138, "loss": 2.8903, "step": 2242 }, { "epoch": 0.624204264792848, "grad_norm": 2.4927685260772705, "learning_rate": 0.00015591364518209133, "loss": 2.8402, "step": 2243 }, { "epoch": 0.6244825547013602, "grad_norm": 2.2496588230133057, "learning_rate": 0.00015587734864687443, "loss": 2.7092, "step": 2244 }, { "epoch": 0.6247608446098724, "grad_norm": 2.00419020652771, "learning_rate": 0.0001558410414051054, "loss": 3.0492, "step": 2245 }, { "epoch": 0.6250391345183846, "grad_norm": 2.056647300720215, "learning_rate": 0.00015580472346374096, "loss": 2.6807, "step": 2246 }, { "epoch": 0.6253174244268968, "grad_norm": 7.105856418609619, "learning_rate": 0.00015576839482973992, "loss": 3.0287, "step": 2247 }, { "epoch": 0.625595714335409, "grad_norm": 2.360283613204956, "learning_rate": 0.0001557320555100632, "loss": 2.9968, "step": 2248 }, { "epoch": 0.6258740042439211, "grad_norm": 2.5805556774139404, "learning_rate": 0.00015569570551167365, "loss": 3.1905, "step": 2249 }, { "epoch": 0.6261522941524333, "grad_norm": 2.4609901905059814, "learning_rate": 0.00015565934484153626, "loss": 2.8492, "step": 2250 }, { "epoch": 0.6264305840609455, "grad_norm": 2.316751003265381, "learning_rate": 0.00015562297350661796, "loss": 2.5976, "step": 2251 }, { "epoch": 0.6267088739694577, "grad_norm": 2.574906587600708, "learning_rate": 0.00015558659151388788, "loss": 2.8651, "step": 2252 }, { "epoch": 0.6269871638779698, "grad_norm": 2.1495351791381836, "learning_rate": 0.00015555019887031706, "loss": 2.8421, "step": 2253 }, { "epoch": 0.627265453786482, "grad_norm": 2.5074269771575928, "learning_rate": 0.00015551379558287863, "loss": 3.1858, "step": 2254 }, { "epoch": 0.6275437436949942, "grad_norm": 3.5621511936187744, "learning_rate": 0.00015547738165854776, "loss": 3.0722, "step": 2255 }, { "epoch": 0.6278220336035064, "grad_norm": 1.950539231300354, "learning_rate": 0.00015544095710430167, "loss": 2.7158, "step": 2256 }, { "epoch": 0.6281003235120186, "grad_norm": 2.508481979370117, "learning_rate": 0.00015540452192711954, "loss": 3.0394, "step": 2257 }, { "epoch": 0.6283786134205308, "grad_norm": 2.0491902828216553, "learning_rate": 0.00015536807613398268, "loss": 2.6455, "step": 2258 }, { "epoch": 0.628656903329043, "grad_norm": 2.2473723888397217, "learning_rate": 0.00015533161973187441, "loss": 2.6753, "step": 2259 }, { "epoch": 0.6289351932375552, "grad_norm": 2.468512773513794, "learning_rate": 0.00015529515272778009, "loss": 3.0679, "step": 2260 }, { "epoch": 0.6292134831460674, "grad_norm": 2.6676406860351562, "learning_rate": 0.00015525867512868703, "loss": 2.699, "step": 2261 }, { "epoch": 0.6294917730545796, "grad_norm": 2.2389016151428223, "learning_rate": 0.0001552221869415847, "loss": 2.776, "step": 2262 }, { "epoch": 0.6297700629630918, "grad_norm": 2.4624996185302734, "learning_rate": 0.0001551856881734645, "loss": 2.8546, "step": 2263 }, { "epoch": 0.630048352871604, "grad_norm": 2.418334722518921, "learning_rate": 0.00015514917883131986, "loss": 2.7462, "step": 2264 }, { "epoch": 0.6303266427801162, "grad_norm": 2.5704855918884277, "learning_rate": 0.0001551126589221463, "loss": 2.6054, "step": 2265 }, { "epoch": 0.6306049326886284, "grad_norm": 2.282357692718506, "learning_rate": 0.00015507612845294132, "loss": 2.7313, "step": 2266 }, { "epoch": 0.6308832225971406, "grad_norm": 1.98102867603302, "learning_rate": 0.0001550395874307045, "loss": 2.8492, "step": 2267 }, { "epoch": 0.6311615125056528, "grad_norm": 2.568166732788086, "learning_rate": 0.0001550030358624373, "loss": 2.9916, "step": 2268 }, { "epoch": 0.631439802414165, "grad_norm": 2.396833658218384, "learning_rate": 0.00015496647375514338, "loss": 2.8088, "step": 2269 }, { "epoch": 0.6317180923226772, "grad_norm": 2.2618460655212402, "learning_rate": 0.00015492990111582832, "loss": 2.7455, "step": 2270 }, { "epoch": 0.6319963822311894, "grad_norm": 2.450801372528076, "learning_rate": 0.0001548933179514997, "loss": 2.689, "step": 2271 }, { "epoch": 0.6322746721397016, "grad_norm": 2.4680655002593994, "learning_rate": 0.0001548567242691672, "loss": 2.8812, "step": 2272 }, { "epoch": 0.6325529620482138, "grad_norm": 2.2798874378204346, "learning_rate": 0.0001548201200758424, "loss": 2.9892, "step": 2273 }, { "epoch": 0.632831251956726, "grad_norm": 1.9100581407546997, "learning_rate": 0.000154783505378539, "loss": 2.4896, "step": 2274 }, { "epoch": 0.6331095418652382, "grad_norm": 2.296383857727051, "learning_rate": 0.00015474688018427267, "loss": 2.7327, "step": 2275 }, { "epoch": 0.6333878317737504, "grad_norm": 2.2902421951293945, "learning_rate": 0.00015471024450006114, "loss": 3.1119, "step": 2276 }, { "epoch": 0.6336661216822626, "grad_norm": 2.3841118812561035, "learning_rate": 0.00015467359833292398, "loss": 3.0054, "step": 2277 }, { "epoch": 0.6339444115907746, "grad_norm": 2.9186716079711914, "learning_rate": 0.00015463694168988301, "loss": 2.6672, "step": 2278 }, { "epoch": 0.6342227014992868, "grad_norm": 2.3415329456329346, "learning_rate": 0.00015460027457796192, "loss": 2.7147, "step": 2279 }, { "epoch": 0.634500991407799, "grad_norm": 2.281310796737671, "learning_rate": 0.00015456359700418635, "loss": 2.8268, "step": 2280 }, { "epoch": 0.6347792813163112, "grad_norm": 2.335850477218628, "learning_rate": 0.00015452690897558408, "loss": 2.7445, "step": 2281 }, { "epoch": 0.6350575712248234, "grad_norm": 2.1815695762634277, "learning_rate": 0.00015449021049918486, "loss": 2.9003, "step": 2282 }, { "epoch": 0.6353358611333356, "grad_norm": 2.5337257385253906, "learning_rate": 0.00015445350158202036, "loss": 2.8139, "step": 2283 }, { "epoch": 0.6356141510418478, "grad_norm": 2.448033094406128, "learning_rate": 0.0001544167822311243, "loss": 2.7762, "step": 2284 }, { "epoch": 0.63589244095036, "grad_norm": 2.373009204864502, "learning_rate": 0.00015438005245353245, "loss": 2.7112, "step": 2285 }, { "epoch": 0.6361707308588722, "grad_norm": 2.396712303161621, "learning_rate": 0.00015434331225628253, "loss": 2.7085, "step": 2286 }, { "epoch": 0.6364490207673844, "grad_norm": 2.8886046409606934, "learning_rate": 0.00015430656164641424, "loss": 3.1701, "step": 2287 }, { "epoch": 0.6367273106758966, "grad_norm": 3.3111822605133057, "learning_rate": 0.00015426980063096927, "loss": 3.2136, "step": 2288 }, { "epoch": 0.6370056005844088, "grad_norm": 2.5444042682647705, "learning_rate": 0.0001542330292169914, "loss": 2.9106, "step": 2289 }, { "epoch": 0.637283890492921, "grad_norm": 2.6024930477142334, "learning_rate": 0.00015419624741152624, "loss": 3.0734, "step": 2290 }, { "epoch": 0.6375621804014332, "grad_norm": 2.622558832168579, "learning_rate": 0.00015415945522162153, "loss": 3.1125, "step": 2291 }, { "epoch": 0.6378404703099454, "grad_norm": 2.4754395484924316, "learning_rate": 0.00015412265265432698, "loss": 2.9275, "step": 2292 }, { "epoch": 0.6381187602184576, "grad_norm": 2.328571319580078, "learning_rate": 0.00015408583971669422, "loss": 3.3255, "step": 2293 }, { "epoch": 0.6383970501269698, "grad_norm": 2.4436309337615967, "learning_rate": 0.0001540490164157769, "loss": 3.1541, "step": 2294 }, { "epoch": 0.638675340035482, "grad_norm": 2.0960569381713867, "learning_rate": 0.0001540121827586307, "loss": 2.9176, "step": 2295 }, { "epoch": 0.6389536299439942, "grad_norm": 2.5506656169891357, "learning_rate": 0.00015397533875231316, "loss": 2.9244, "step": 2296 }, { "epoch": 0.6392319198525064, "grad_norm": 2.141123056411743, "learning_rate": 0.00015393848440388403, "loss": 2.7763, "step": 2297 }, { "epoch": 0.6395102097610186, "grad_norm": 2.357213258743286, "learning_rate": 0.0001539016197204048, "loss": 2.6349, "step": 2298 }, { "epoch": 0.6397884996695308, "grad_norm": 2.3410000801086426, "learning_rate": 0.00015386474470893902, "loss": 2.7728, "step": 2299 }, { "epoch": 0.640066789578043, "grad_norm": 2.279019594192505, "learning_rate": 0.00015382785937655236, "loss": 3.068, "step": 2300 }, { "epoch": 0.640066789578043, "eval_loss": 2.9015514850616455, "eval_runtime": 84.8183, "eval_samples_per_second": 58.95, "eval_steps_per_second": 14.737, "step": 2300 }, { "epoch": 0.6403450794865552, "grad_norm": 2.6002635955810547, "learning_rate": 0.00015379096373031224, "loss": 2.8404, "step": 2301 }, { "epoch": 0.6406233693950674, "grad_norm": 2.3668243885040283, "learning_rate": 0.0001537540577772882, "loss": 2.6655, "step": 2302 }, { "epoch": 0.6409016593035795, "grad_norm": 2.1840412616729736, "learning_rate": 0.0001537171415245517, "loss": 2.8173, "step": 2303 }, { "epoch": 0.6411799492120916, "grad_norm": 2.3490986824035645, "learning_rate": 0.00015368021497917623, "loss": 2.8968, "step": 2304 }, { "epoch": 0.6414582391206038, "grad_norm": 5.5470123291015625, "learning_rate": 0.0001536432781482372, "loss": 3.0855, "step": 2305 }, { "epoch": 0.641736529029116, "grad_norm": 2.1480517387390137, "learning_rate": 0.00015360633103881198, "loss": 3.0977, "step": 2306 }, { "epoch": 0.6420148189376282, "grad_norm": 2.383373737335205, "learning_rate": 0.00015356937365798, "loss": 2.8948, "step": 2307 }, { "epoch": 0.6422931088461404, "grad_norm": 2.267979145050049, "learning_rate": 0.00015353240601282252, "loss": 2.8921, "step": 2308 }, { "epoch": 0.6425713987546526, "grad_norm": 2.7158210277557373, "learning_rate": 0.0001534954281104229, "loss": 2.8732, "step": 2309 }, { "epoch": 0.6428496886631648, "grad_norm": 2.194939136505127, "learning_rate": 0.00015345843995786634, "loss": 2.8439, "step": 2310 }, { "epoch": 0.643127978571677, "grad_norm": 2.405475616455078, "learning_rate": 0.0001534214415622401, "loss": 2.8406, "step": 2311 }, { "epoch": 0.6434062684801892, "grad_norm": 2.3297860622406006, "learning_rate": 0.0001533844329306334, "loss": 2.6847, "step": 2312 }, { "epoch": 0.6436845583887014, "grad_norm": 2.639130115509033, "learning_rate": 0.00015334741407013738, "loss": 2.7575, "step": 2313 }, { "epoch": 0.6439628482972136, "grad_norm": 2.407463550567627, "learning_rate": 0.00015331038498784513, "loss": 2.9092, "step": 2314 }, { "epoch": 0.6442411382057258, "grad_norm": 2.1769917011260986, "learning_rate": 0.00015327334569085173, "loss": 2.8175, "step": 2315 }, { "epoch": 0.644519428114238, "grad_norm": 2.7419755458831787, "learning_rate": 0.00015323629618625421, "loss": 3.0851, "step": 2316 }, { "epoch": 0.6447977180227502, "grad_norm": 2.834144115447998, "learning_rate": 0.00015319923648115158, "loss": 3.09, "step": 2317 }, { "epoch": 0.6450760079312624, "grad_norm": 2.088766574859619, "learning_rate": 0.00015316216658264474, "loss": 2.917, "step": 2318 }, { "epoch": 0.6453542978397746, "grad_norm": 2.168592691421509, "learning_rate": 0.0001531250864978366, "loss": 2.8261, "step": 2319 }, { "epoch": 0.6456325877482868, "grad_norm": 2.1610898971557617, "learning_rate": 0.00015308799623383202, "loss": 2.8089, "step": 2320 }, { "epoch": 0.645910877656799, "grad_norm": 2.4429521560668945, "learning_rate": 0.00015305089579773778, "loss": 2.7566, "step": 2321 }, { "epoch": 0.6461891675653112, "grad_norm": 2.464627742767334, "learning_rate": 0.00015301378519666262, "loss": 3.108, "step": 2322 }, { "epoch": 0.6464674574738234, "grad_norm": 2.3831465244293213, "learning_rate": 0.00015297666443771723, "loss": 2.9579, "step": 2323 }, { "epoch": 0.6467457473823356, "grad_norm": 2.1190881729125977, "learning_rate": 0.00015293953352801424, "loss": 2.933, "step": 2324 }, { "epoch": 0.6470240372908478, "grad_norm": 2.572755813598633, "learning_rate": 0.00015290239247466828, "loss": 3.1923, "step": 2325 }, { "epoch": 0.64730232719936, "grad_norm": 2.0943355560302734, "learning_rate": 0.0001528652412847958, "loss": 2.6704, "step": 2326 }, { "epoch": 0.6475806171078722, "grad_norm": 2.4893205165863037, "learning_rate": 0.00015282807996551534, "loss": 2.4621, "step": 2327 }, { "epoch": 0.6478589070163843, "grad_norm": 2.574781656265259, "learning_rate": 0.00015279090852394727, "loss": 2.9227, "step": 2328 }, { "epoch": 0.6481371969248965, "grad_norm": 2.334601402282715, "learning_rate": 0.00015275372696721396, "loss": 2.9333, "step": 2329 }, { "epoch": 0.6484154868334087, "grad_norm": 2.7247133255004883, "learning_rate": 0.00015271653530243967, "loss": 3.0956, "step": 2330 }, { "epoch": 0.6486937767419209, "grad_norm": 2.389068126678467, "learning_rate": 0.00015267933353675065, "loss": 2.721, "step": 2331 }, { "epoch": 0.648972066650433, "grad_norm": 2.323213577270508, "learning_rate": 0.00015264212167727506, "loss": 2.7577, "step": 2332 }, { "epoch": 0.6492503565589453, "grad_norm": 2.388552188873291, "learning_rate": 0.00015260489973114295, "loss": 2.6426, "step": 2333 }, { "epoch": 0.6495286464674574, "grad_norm": 2.219369649887085, "learning_rate": 0.0001525676677054864, "loss": 2.7152, "step": 2334 }, { "epoch": 0.6498069363759696, "grad_norm": 2.2115695476531982, "learning_rate": 0.00015253042560743938, "loss": 2.8658, "step": 2335 }, { "epoch": 0.6500852262844818, "grad_norm": 2.189544439315796, "learning_rate": 0.0001524931734441377, "loss": 2.8058, "step": 2336 }, { "epoch": 0.650363516192994, "grad_norm": 2.236215114593506, "learning_rate": 0.00015245591122271926, "loss": 2.6414, "step": 2337 }, { "epoch": 0.6506418061015062, "grad_norm": 2.340519905090332, "learning_rate": 0.00015241863895032372, "loss": 2.9146, "step": 2338 }, { "epoch": 0.6509200960100184, "grad_norm": 2.3819122314453125, "learning_rate": 0.0001523813566340928, "loss": 2.7891, "step": 2339 }, { "epoch": 0.6511983859185306, "grad_norm": 2.4419491291046143, "learning_rate": 0.0001523440642811701, "loss": 2.9194, "step": 2340 }, { "epoch": 0.6514766758270428, "grad_norm": 2.250020742416382, "learning_rate": 0.0001523067618987011, "loss": 2.7387, "step": 2341 }, { "epoch": 0.651754965735555, "grad_norm": 2.1364214420318604, "learning_rate": 0.00015226944949383328, "loss": 2.6882, "step": 2342 }, { "epoch": 0.6520332556440672, "grad_norm": 2.6239824295043945, "learning_rate": 0.00015223212707371597, "loss": 3.0637, "step": 2343 }, { "epoch": 0.6523115455525794, "grad_norm": 2.386368751525879, "learning_rate": 0.00015219479464550047, "loss": 2.9382, "step": 2344 }, { "epoch": 0.6525898354610916, "grad_norm": 2.6909332275390625, "learning_rate": 0.00015215745221633994, "loss": 2.8997, "step": 2345 }, { "epoch": 0.6528681253696038, "grad_norm": 2.54628849029541, "learning_rate": 0.00015212009979338954, "loss": 2.9346, "step": 2346 }, { "epoch": 0.653146415278116, "grad_norm": 2.520817279815674, "learning_rate": 0.00015208273738380625, "loss": 2.8938, "step": 2347 }, { "epoch": 0.6534247051866282, "grad_norm": 2.717228651046753, "learning_rate": 0.00015204536499474904, "loss": 2.9749, "step": 2348 }, { "epoch": 0.6537029950951404, "grad_norm": 2.5763633251190186, "learning_rate": 0.00015200798263337876, "loss": 2.8938, "step": 2349 }, { "epoch": 0.6539812850036526, "grad_norm": 2.5721492767333984, "learning_rate": 0.00015197059030685813, "loss": 2.9402, "step": 2350 }, { "epoch": 0.6542595749121648, "grad_norm": 2.6083104610443115, "learning_rate": 0.00015193318802235193, "loss": 2.6548, "step": 2351 }, { "epoch": 0.654537864820677, "grad_norm": 2.316977024078369, "learning_rate": 0.00015189577578702661, "loss": 2.8149, "step": 2352 }, { "epoch": 0.6548161547291891, "grad_norm": 2.385430335998535, "learning_rate": 0.0001518583536080507, "loss": 3.0826, "step": 2353 }, { "epoch": 0.6550944446377013, "grad_norm": 2.5307397842407227, "learning_rate": 0.00015182092149259465, "loss": 2.9633, "step": 2354 }, { "epoch": 0.6553727345462135, "grad_norm": 2.477802038192749, "learning_rate": 0.0001517834794478307, "loss": 2.8143, "step": 2355 }, { "epoch": 0.6556510244547257, "grad_norm": 2.484562397003174, "learning_rate": 0.00015174602748093305, "loss": 2.5985, "step": 2356 }, { "epoch": 0.6559293143632379, "grad_norm": 2.6254656314849854, "learning_rate": 0.00015170856559907785, "loss": 2.9757, "step": 2357 }, { "epoch": 0.6562076042717501, "grad_norm": 2.3044722080230713, "learning_rate": 0.000151671093809443, "loss": 2.9126, "step": 2358 }, { "epoch": 0.6564858941802623, "grad_norm": 2.3167834281921387, "learning_rate": 0.00015163361211920851, "loss": 2.7457, "step": 2359 }, { "epoch": 0.6567641840887745, "grad_norm": 2.3561513423919678, "learning_rate": 0.0001515961205355561, "loss": 2.817, "step": 2360 }, { "epoch": 0.6570424739972867, "grad_norm": 2.6068460941314697, "learning_rate": 0.00015155861906566948, "loss": 3.0642, "step": 2361 }, { "epoch": 0.6573207639057989, "grad_norm": 2.3626158237457275, "learning_rate": 0.00015152110771673427, "loss": 2.7431, "step": 2362 }, { "epoch": 0.657599053814311, "grad_norm": 2.8164634704589844, "learning_rate": 0.00015148358649593792, "loss": 3.2721, "step": 2363 }, { "epoch": 0.6578773437228232, "grad_norm": 2.4645943641662598, "learning_rate": 0.00015144605541046976, "loss": 2.9138, "step": 2364 }, { "epoch": 0.6581556336313354, "grad_norm": 2.318392515182495, "learning_rate": 0.00015140851446752114, "loss": 3.0198, "step": 2365 }, { "epoch": 0.6584339235398476, "grad_norm": 2.300506830215454, "learning_rate": 0.0001513709636742851, "loss": 2.8191, "step": 2366 }, { "epoch": 0.6587122134483598, "grad_norm": 2.0718917846679688, "learning_rate": 0.00015133340303795676, "loss": 2.7563, "step": 2367 }, { "epoch": 0.658990503356872, "grad_norm": 2.2541115283966064, "learning_rate": 0.00015129583256573304, "loss": 2.9671, "step": 2368 }, { "epoch": 0.6592687932653842, "grad_norm": 2.3011698722839355, "learning_rate": 0.00015125825226481267, "loss": 2.8813, "step": 2369 }, { "epoch": 0.6595470831738964, "grad_norm": 2.0344419479370117, "learning_rate": 0.00015122066214239642, "loss": 2.6706, "step": 2370 }, { "epoch": 0.6598253730824086, "grad_norm": 2.389373779296875, "learning_rate": 0.00015118306220568683, "loss": 2.7703, "step": 2371 }, { "epoch": 0.6601036629909208, "grad_norm": 2.303463935852051, "learning_rate": 0.00015114545246188837, "loss": 2.9107, "step": 2372 }, { "epoch": 0.660381952899433, "grad_norm": 2.0073657035827637, "learning_rate": 0.00015110783291820735, "loss": 2.65, "step": 2373 }, { "epoch": 0.6606602428079452, "grad_norm": 2.0742385387420654, "learning_rate": 0.00015107020358185195, "loss": 2.5528, "step": 2374 }, { "epoch": 0.6609385327164574, "grad_norm": 2.7990305423736572, "learning_rate": 0.00015103256446003234, "loss": 2.7044, "step": 2375 }, { "epoch": 0.6612168226249696, "grad_norm": 2.276743173599243, "learning_rate": 0.0001509949155599604, "loss": 2.7079, "step": 2376 }, { "epoch": 0.6614951125334817, "grad_norm": 2.340050220489502, "learning_rate": 0.00015095725688885, "loss": 2.6518, "step": 2377 }, { "epoch": 0.6617734024419939, "grad_norm": 2.345189332962036, "learning_rate": 0.00015091958845391682, "loss": 3.0324, "step": 2378 }, { "epoch": 0.6620516923505061, "grad_norm": 2.2916181087493896, "learning_rate": 0.0001508819102623785, "loss": 3.0087, "step": 2379 }, { "epoch": 0.6623299822590183, "grad_norm": 2.6377108097076416, "learning_rate": 0.0001508442223214544, "loss": 2.791, "step": 2380 }, { "epoch": 0.6626082721675305, "grad_norm": 2.470862865447998, "learning_rate": 0.00015080652463836592, "loss": 2.5594, "step": 2381 }, { "epoch": 0.6628865620760427, "grad_norm": 2.1913278102874756, "learning_rate": 0.00015076881722033618, "loss": 2.8299, "step": 2382 }, { "epoch": 0.6631648519845549, "grad_norm": 2.2359533309936523, "learning_rate": 0.00015073110007459022, "loss": 2.8544, "step": 2383 }, { "epoch": 0.6634431418930671, "grad_norm": 2.010138511657715, "learning_rate": 0.00015069337320835498, "loss": 2.4969, "step": 2384 }, { "epoch": 0.6637214318015793, "grad_norm": 2.291635274887085, "learning_rate": 0.00015065563662885928, "loss": 2.7543, "step": 2385 }, { "epoch": 0.6639997217100915, "grad_norm": 2.3367929458618164, "learning_rate": 0.00015061789034333364, "loss": 2.7591, "step": 2386 }, { "epoch": 0.6642780116186037, "grad_norm": 2.270887851715088, "learning_rate": 0.00015058013435901066, "loss": 2.7316, "step": 2387 }, { "epoch": 0.6645563015271159, "grad_norm": 2.530060291290283, "learning_rate": 0.00015054236868312465, "loss": 3.106, "step": 2388 }, { "epoch": 0.6648345914356281, "grad_norm": 2.285036325454712, "learning_rate": 0.00015050459332291175, "loss": 2.863, "step": 2389 }, { "epoch": 0.6651128813441403, "grad_norm": 2.0853681564331055, "learning_rate": 0.00015046680828561016, "loss": 2.6886, "step": 2390 }, { "epoch": 0.6653911712526525, "grad_norm": 2.4353253841400146, "learning_rate": 0.00015042901357845975, "loss": 3.0249, "step": 2391 }, { "epoch": 0.6656694611611647, "grad_norm": 2.398242235183716, "learning_rate": 0.00015039120920870222, "loss": 2.8617, "step": 2392 }, { "epoch": 0.6659477510696769, "grad_norm": 2.4063234329223633, "learning_rate": 0.00015035339518358127, "loss": 2.5518, "step": 2393 }, { "epoch": 0.666226040978189, "grad_norm": 2.5710017681121826, "learning_rate": 0.00015031557151034235, "loss": 3.0855, "step": 2394 }, { "epoch": 0.6665043308867012, "grad_norm": 2.8214316368103027, "learning_rate": 0.00015027773819623278, "loss": 3.061, "step": 2395 }, { "epoch": 0.6667826207952134, "grad_norm": 2.344674587249756, "learning_rate": 0.0001502398952485017, "loss": 2.5821, "step": 2396 }, { "epoch": 0.6670609107037256, "grad_norm": 2.5112979412078857, "learning_rate": 0.00015020204267440022, "loss": 2.9112, "step": 2397 }, { "epoch": 0.6673392006122378, "grad_norm": 2.5978240966796875, "learning_rate": 0.00015016418048118107, "loss": 2.876, "step": 2398 }, { "epoch": 0.66761749052075, "grad_norm": 2.6927216053009033, "learning_rate": 0.00015012630867609908, "loss": 2.6908, "step": 2399 }, { "epoch": 0.6678957804292622, "grad_norm": 2.4657297134399414, "learning_rate": 0.0001500884272664107, "loss": 2.8894, "step": 2400 }, { "epoch": 0.6678957804292622, "eval_loss": 2.8939549922943115, "eval_runtime": 84.481, "eval_samples_per_second": 59.185, "eval_steps_per_second": 14.796, "step": 2400 }, { "epoch": 0.6681740703377744, "grad_norm": 2.610689401626587, "learning_rate": 0.0001500505362593743, "loss": 2.6858, "step": 2401 }, { "epoch": 0.6684523602462865, "grad_norm": 2.948399066925049, "learning_rate": 0.0001500126356622502, "loss": 2.7656, "step": 2402 }, { "epoch": 0.6687306501547987, "grad_norm": 2.370637893676758, "learning_rate": 0.00014997472548230044, "loss": 2.767, "step": 2403 }, { "epoch": 0.6690089400633109, "grad_norm": 2.487337589263916, "learning_rate": 0.00014993680572678882, "loss": 3.0233, "step": 2404 }, { "epoch": 0.6692872299718231, "grad_norm": 2.3260087966918945, "learning_rate": 0.00014989887640298117, "loss": 2.8386, "step": 2405 }, { "epoch": 0.6695655198803353, "grad_norm": 2.3378124237060547, "learning_rate": 0.00014986093751814502, "loss": 2.9775, "step": 2406 }, { "epoch": 0.6698438097888475, "grad_norm": 2.269970655441284, "learning_rate": 0.0001498229890795498, "loss": 2.7789, "step": 2407 }, { "epoch": 0.6701220996973597, "grad_norm": 2.348436117172241, "learning_rate": 0.00014978503109446667, "loss": 2.9957, "step": 2408 }, { "epoch": 0.6704003896058719, "grad_norm": 2.4629931449890137, "learning_rate": 0.00014974706357016876, "loss": 2.6762, "step": 2409 }, { "epoch": 0.6706786795143841, "grad_norm": 2.473480224609375, "learning_rate": 0.0001497090865139309, "loss": 3.0556, "step": 2410 }, { "epoch": 0.6709569694228963, "grad_norm": 2.4975221157073975, "learning_rate": 0.00014967109993302983, "loss": 3.0659, "step": 2411 }, { "epoch": 0.6712352593314085, "grad_norm": 2.4952709674835205, "learning_rate": 0.00014963310383474412, "loss": 2.9398, "step": 2412 }, { "epoch": 0.6715135492399207, "grad_norm": 3.621741771697998, "learning_rate": 0.00014959509822635406, "loss": 2.6151, "step": 2413 }, { "epoch": 0.6717918391484329, "grad_norm": 2.323545217514038, "learning_rate": 0.00014955708311514186, "loss": 2.9012, "step": 2414 }, { "epoch": 0.6720701290569451, "grad_norm": 2.3927907943725586, "learning_rate": 0.00014951905850839152, "loss": 2.66, "step": 2415 }, { "epoch": 0.6723484189654573, "grad_norm": 2.3929362297058105, "learning_rate": 0.0001494810244133889, "loss": 2.9311, "step": 2416 }, { "epoch": 0.6726267088739695, "grad_norm": 2.1439931392669678, "learning_rate": 0.00014944298083742155, "loss": 2.6008, "step": 2417 }, { "epoch": 0.6729049987824817, "grad_norm": 2.428896188735962, "learning_rate": 0.000149404927787779, "loss": 2.9185, "step": 2418 }, { "epoch": 0.6731832886909939, "grad_norm": 2.6254031658172607, "learning_rate": 0.00014936686527175256, "loss": 3.0833, "step": 2419 }, { "epoch": 0.6734615785995061, "grad_norm": 2.185499429702759, "learning_rate": 0.0001493287932966352, "loss": 2.9353, "step": 2420 }, { "epoch": 0.6737398685080183, "grad_norm": 2.1270368099212646, "learning_rate": 0.00014929071186972192, "loss": 2.7926, "step": 2421 }, { "epoch": 0.6740181584165305, "grad_norm": 2.410911798477173, "learning_rate": 0.0001492526209983094, "loss": 2.7769, "step": 2422 }, { "epoch": 0.6742964483250427, "grad_norm": 2.228372097015381, "learning_rate": 0.00014921452068969614, "loss": 2.7122, "step": 2423 }, { "epoch": 0.6745747382335548, "grad_norm": 2.5815186500549316, "learning_rate": 0.00014917641095118245, "loss": 2.8742, "step": 2424 }, { "epoch": 0.674853028142067, "grad_norm": 2.1164791584014893, "learning_rate": 0.0001491382917900705, "loss": 2.7675, "step": 2425 }, { "epoch": 0.6751313180505792, "grad_norm": 2.5315847396850586, "learning_rate": 0.00014910016321366423, "loss": 3.0506, "step": 2426 }, { "epoch": 0.6754096079590913, "grad_norm": 2.216218948364258, "learning_rate": 0.00014906202522926937, "loss": 2.8143, "step": 2427 }, { "epoch": 0.6756878978676035, "grad_norm": 2.5821971893310547, "learning_rate": 0.0001490238778441935, "loss": 2.8844, "step": 2428 }, { "epoch": 0.6759661877761157, "grad_norm": 2.345128059387207, "learning_rate": 0.0001489857210657459, "loss": 2.8954, "step": 2429 }, { "epoch": 0.6762444776846279, "grad_norm": 2.28586483001709, "learning_rate": 0.0001489475549012378, "loss": 2.8965, "step": 2430 }, { "epoch": 0.6765227675931401, "grad_norm": 3.013997793197632, "learning_rate": 0.0001489093793579821, "loss": 2.7374, "step": 2431 }, { "epoch": 0.6768010575016523, "grad_norm": 2.253941774368286, "learning_rate": 0.0001488711944432935, "loss": 2.9957, "step": 2432 }, { "epoch": 0.6770793474101645, "grad_norm": 2.1783432960510254, "learning_rate": 0.0001488330001644886, "loss": 2.7934, "step": 2433 }, { "epoch": 0.6773576373186767, "grad_norm": 2.8319015502929688, "learning_rate": 0.00014879479652888576, "loss": 3.0161, "step": 2434 }, { "epoch": 0.6776359272271889, "grad_norm": 2.3244211673736572, "learning_rate": 0.00014875658354380502, "loss": 2.5827, "step": 2435 }, { "epoch": 0.6779142171357011, "grad_norm": 2.0005130767822266, "learning_rate": 0.00014871836121656837, "loss": 2.8291, "step": 2436 }, { "epoch": 0.6781925070442133, "grad_norm": 2.240949869155884, "learning_rate": 0.00014868012955449947, "loss": 2.803, "step": 2437 }, { "epoch": 0.6784707969527255, "grad_norm": 2.4109225273132324, "learning_rate": 0.00014864188856492384, "loss": 2.8899, "step": 2438 }, { "epoch": 0.6787490868612377, "grad_norm": 2.113454818725586, "learning_rate": 0.00014860363825516877, "loss": 3.0722, "step": 2439 }, { "epoch": 0.6790273767697499, "grad_norm": 2.356344223022461, "learning_rate": 0.0001485653786325633, "loss": 2.753, "step": 2440 }, { "epoch": 0.6793056666782621, "grad_norm": 2.589690923690796, "learning_rate": 0.0001485271097044383, "loss": 2.9803, "step": 2441 }, { "epoch": 0.6795839565867743, "grad_norm": 2.3920979499816895, "learning_rate": 0.0001484888314781264, "loss": 2.9495, "step": 2442 }, { "epoch": 0.6798622464952865, "grad_norm": 3.083099603652954, "learning_rate": 0.00014845054396096202, "loss": 2.8837, "step": 2443 }, { "epoch": 0.6801405364037987, "grad_norm": 2.2445390224456787, "learning_rate": 0.00014841224716028136, "loss": 2.9396, "step": 2444 }, { "epoch": 0.6804188263123109, "grad_norm": 2.732341766357422, "learning_rate": 0.0001483739410834224, "loss": 3.0811, "step": 2445 }, { "epoch": 0.6806971162208231, "grad_norm": 2.4688162803649902, "learning_rate": 0.0001483356257377249, "loss": 2.9208, "step": 2446 }, { "epoch": 0.6809754061293353, "grad_norm": 2.587488889694214, "learning_rate": 0.0001482973011305304, "loss": 3.0163, "step": 2447 }, { "epoch": 0.6812536960378475, "grad_norm": 2.1822240352630615, "learning_rate": 0.00014825896726918217, "loss": 3.0749, "step": 2448 }, { "epoch": 0.6815319859463597, "grad_norm": 2.258373498916626, "learning_rate": 0.0001482206241610253, "loss": 2.7075, "step": 2449 }, { "epoch": 0.6818102758548719, "grad_norm": 2.0289902687072754, "learning_rate": 0.00014818227181340667, "loss": 2.7572, "step": 2450 }, { "epoch": 0.6820885657633841, "grad_norm": 2.140697479248047, "learning_rate": 0.00014814391023367483, "loss": 2.7566, "step": 2451 }, { "epoch": 0.6823668556718961, "grad_norm": 3.741675853729248, "learning_rate": 0.00014810553942918024, "loss": 2.7537, "step": 2452 }, { "epoch": 0.6826451455804083, "grad_norm": 2.570993185043335, "learning_rate": 0.00014806715940727505, "loss": 2.821, "step": 2453 }, { "epoch": 0.6829234354889205, "grad_norm": 2.286104917526245, "learning_rate": 0.00014802877017531315, "loss": 3.1793, "step": 2454 }, { "epoch": 0.6832017253974327, "grad_norm": 2.3950703144073486, "learning_rate": 0.00014799037174065025, "loss": 2.9791, "step": 2455 }, { "epoch": 0.6834800153059449, "grad_norm": 2.0297555923461914, "learning_rate": 0.00014795196411064378, "loss": 2.944, "step": 2456 }, { "epoch": 0.6837583052144571, "grad_norm": 2.570373058319092, "learning_rate": 0.00014791354729265293, "loss": 2.6485, "step": 2457 }, { "epoch": 0.6840365951229693, "grad_norm": 2.507829427719116, "learning_rate": 0.00014787512129403878, "loss": 2.8306, "step": 2458 }, { "epoch": 0.6843148850314815, "grad_norm": 2.075848340988159, "learning_rate": 0.00014783668612216395, "loss": 2.7357, "step": 2459 }, { "epoch": 0.6845931749399937, "grad_norm": 2.8555119037628174, "learning_rate": 0.00014779824178439297, "loss": 3.1689, "step": 2460 }, { "epoch": 0.6848714648485059, "grad_norm": 2.7152645587921143, "learning_rate": 0.0001477597882880921, "loss": 2.8822, "step": 2461 }, { "epoch": 0.6851497547570181, "grad_norm": 2.066239356994629, "learning_rate": 0.00014772132564062933, "loss": 2.7502, "step": 2462 }, { "epoch": 0.6854280446655303, "grad_norm": 2.1994123458862305, "learning_rate": 0.00014768285384937437, "loss": 2.7208, "step": 2463 }, { "epoch": 0.6857063345740425, "grad_norm": 2.4326834678649902, "learning_rate": 0.0001476443729216988, "loss": 2.8392, "step": 2464 }, { "epoch": 0.6859846244825547, "grad_norm": 2.6524085998535156, "learning_rate": 0.00014760588286497588, "loss": 2.9474, "step": 2465 }, { "epoch": 0.6862629143910669, "grad_norm": 2.13771390914917, "learning_rate": 0.0001475673836865805, "loss": 2.6741, "step": 2466 }, { "epoch": 0.6865412042995791, "grad_norm": 2.4691250324249268, "learning_rate": 0.00014752887539388955, "loss": 2.5997, "step": 2467 }, { "epoch": 0.6868194942080913, "grad_norm": 2.407876968383789, "learning_rate": 0.0001474903579942815, "loss": 3.0856, "step": 2468 }, { "epoch": 0.6870977841166035, "grad_norm": 2.641387462615967, "learning_rate": 0.00014745183149513652, "loss": 2.9842, "step": 2469 }, { "epoch": 0.6873760740251157, "grad_norm": 2.5091779232025146, "learning_rate": 0.0001474132959038367, "loss": 2.8368, "step": 2470 }, { "epoch": 0.6876543639336279, "grad_norm": 2.1645874977111816, "learning_rate": 0.0001473747512277657, "loss": 2.7958, "step": 2471 }, { "epoch": 0.6879326538421401, "grad_norm": 2.25966739654541, "learning_rate": 0.000147336197474309, "loss": 2.9286, "step": 2472 }, { "epoch": 0.6882109437506523, "grad_norm": 2.5963797569274902, "learning_rate": 0.00014729763465085385, "loss": 3.0389, "step": 2473 }, { "epoch": 0.6884892336591645, "grad_norm": 2.4471237659454346, "learning_rate": 0.00014725906276478917, "loss": 3.2774, "step": 2474 }, { "epoch": 0.6887675235676767, "grad_norm": 2.4583840370178223, "learning_rate": 0.00014722048182350562, "loss": 2.8893, "step": 2475 }, { "epoch": 0.6890458134761889, "grad_norm": 2.3678646087646484, "learning_rate": 0.0001471818918343957, "loss": 2.8601, "step": 2476 }, { "epoch": 0.689324103384701, "grad_norm": 2.573274612426758, "learning_rate": 0.00014714329280485347, "loss": 2.895, "step": 2477 }, { "epoch": 0.6896023932932132, "grad_norm": 1.9784098863601685, "learning_rate": 0.00014710468474227485, "loss": 2.6719, "step": 2478 }, { "epoch": 0.6898806832017254, "grad_norm": 2.6651358604431152, "learning_rate": 0.00014706606765405752, "loss": 2.7941, "step": 2479 }, { "epoch": 0.6901589731102376, "grad_norm": 2.4018611907958984, "learning_rate": 0.00014702744154760074, "loss": 2.9355, "step": 2480 }, { "epoch": 0.6904372630187497, "grad_norm": 2.289295196533203, "learning_rate": 0.00014698880643030564, "loss": 3.0308, "step": 2481 }, { "epoch": 0.6907155529272619, "grad_norm": 3.54044508934021, "learning_rate": 0.00014695016230957498, "loss": 2.7342, "step": 2482 }, { "epoch": 0.6909938428357741, "grad_norm": 2.28523325920105, "learning_rate": 0.00014691150919281332, "loss": 2.9908, "step": 2483 }, { "epoch": 0.6912721327442863, "grad_norm": 2.798379421234131, "learning_rate": 0.00014687284708742688, "loss": 2.9022, "step": 2484 }, { "epoch": 0.6915504226527985, "grad_norm": 2.494239568710327, "learning_rate": 0.00014683417600082368, "loss": 3.1488, "step": 2485 }, { "epoch": 0.6918287125613107, "grad_norm": 2.4802982807159424, "learning_rate": 0.00014679549594041332, "loss": 2.9299, "step": 2486 }, { "epoch": 0.6921070024698229, "grad_norm": 2.3038530349731445, "learning_rate": 0.00014675680691360733, "loss": 2.9533, "step": 2487 }, { "epoch": 0.6923852923783351, "grad_norm": 1.997196078300476, "learning_rate": 0.00014671810892781878, "loss": 2.6415, "step": 2488 }, { "epoch": 0.6926635822868473, "grad_norm": 2.1693150997161865, "learning_rate": 0.00014667940199046252, "loss": 2.8172, "step": 2489 }, { "epoch": 0.6929418721953595, "grad_norm": 2.4507358074188232, "learning_rate": 0.00014664068610895512, "loss": 2.913, "step": 2490 }, { "epoch": 0.6932201621038717, "grad_norm": 2.844573736190796, "learning_rate": 0.0001466019612907148, "loss": 3.4969, "step": 2491 }, { "epoch": 0.6934984520123839, "grad_norm": 2.816178321838379, "learning_rate": 0.00014656322754316167, "loss": 2.9344, "step": 2492 }, { "epoch": 0.6937767419208961, "grad_norm": 2.684624671936035, "learning_rate": 0.00014652448487371733, "loss": 2.9696, "step": 2493 }, { "epoch": 0.6940550318294083, "grad_norm": 2.6339311599731445, "learning_rate": 0.00014648573328980522, "loss": 2.9094, "step": 2494 }, { "epoch": 0.6943333217379205, "grad_norm": 2.3940746784210205, "learning_rate": 0.00014644697279885045, "loss": 3.029, "step": 2495 }, { "epoch": 0.6946116116464327, "grad_norm": 2.0384929180145264, "learning_rate": 0.00014640820340827986, "loss": 2.6506, "step": 2496 }, { "epoch": 0.6948899015549449, "grad_norm": 2.7981414794921875, "learning_rate": 0.00014636942512552195, "loss": 2.6721, "step": 2497 }, { "epoch": 0.6951681914634571, "grad_norm": 2.78652286529541, "learning_rate": 0.000146330637958007, "loss": 2.7046, "step": 2498 }, { "epoch": 0.6954464813719693, "grad_norm": 2.182464838027954, "learning_rate": 0.00014629184191316687, "loss": 2.9706, "step": 2499 }, { "epoch": 0.6957247712804815, "grad_norm": 2.3756306171417236, "learning_rate": 0.00014625303699843526, "loss": 3.0927, "step": 2500 }, { "epoch": 0.6957247712804815, "eval_loss": 2.8911361694335938, "eval_runtime": 84.2655, "eval_samples_per_second": 59.336, "eval_steps_per_second": 14.834, "step": 2500 }, { "epoch": 0.6960030611889937, "grad_norm": 2.666254758834839, "learning_rate": 0.0001462142232212475, "loss": 2.852, "step": 2501 }, { "epoch": 0.6962813510975058, "grad_norm": 2.3860573768615723, "learning_rate": 0.0001461754005890406, "loss": 2.8238, "step": 2502 }, { "epoch": 0.696559641006018, "grad_norm": 2.713355541229248, "learning_rate": 0.00014613656910925332, "loss": 2.6728, "step": 2503 }, { "epoch": 0.6968379309145302, "grad_norm": 2.2631947994232178, "learning_rate": 0.00014609772878932604, "loss": 2.9454, "step": 2504 }, { "epoch": 0.6971162208230424, "grad_norm": 2.5548272132873535, "learning_rate": 0.00014605887963670093, "loss": 2.9289, "step": 2505 }, { "epoch": 0.6973945107315546, "grad_norm": 2.0470774173736572, "learning_rate": 0.00014602002165882175, "loss": 2.7692, "step": 2506 }, { "epoch": 0.6976728006400668, "grad_norm": 2.4039528369903564, "learning_rate": 0.00014598115486313403, "loss": 2.7638, "step": 2507 }, { "epoch": 0.697951090548579, "grad_norm": 2.36600661277771, "learning_rate": 0.00014594227925708502, "loss": 2.8034, "step": 2508 }, { "epoch": 0.6982293804570912, "grad_norm": 2.466883420944214, "learning_rate": 0.0001459033948481235, "loss": 2.9846, "step": 2509 }, { "epoch": 0.6985076703656034, "grad_norm": 2.598576307296753, "learning_rate": 0.0001458645016437001, "loss": 2.8845, "step": 2510 }, { "epoch": 0.6987859602741155, "grad_norm": 2.3111155033111572, "learning_rate": 0.00014582559965126704, "loss": 2.8694, "step": 2511 }, { "epoch": 0.6990642501826277, "grad_norm": 2.3255608081817627, "learning_rate": 0.00014578668887827828, "loss": 2.9541, "step": 2512 }, { "epoch": 0.6993425400911399, "grad_norm": 2.2398300170898438, "learning_rate": 0.0001457477693321894, "loss": 2.6816, "step": 2513 }, { "epoch": 0.6996208299996521, "grad_norm": 2.549151659011841, "learning_rate": 0.0001457088410204578, "loss": 2.9665, "step": 2514 }, { "epoch": 0.6998991199081643, "grad_norm": 2.2412123680114746, "learning_rate": 0.00014566990395054234, "loss": 2.7941, "step": 2515 }, { "epoch": 0.7001774098166765, "grad_norm": 2.4173076152801514, "learning_rate": 0.00014563095812990373, "loss": 2.9132, "step": 2516 }, { "epoch": 0.7004556997251887, "grad_norm": 2.4110422134399414, "learning_rate": 0.00014559200356600434, "loss": 3.1279, "step": 2517 }, { "epoch": 0.7007339896337009, "grad_norm": 2.126763343811035, "learning_rate": 0.0001455530402663081, "loss": 2.7537, "step": 2518 }, { "epoch": 0.7010122795422131, "grad_norm": 3.1615376472473145, "learning_rate": 0.00014551406823828074, "loss": 2.5437, "step": 2519 }, { "epoch": 0.7012905694507253, "grad_norm": 2.6250667572021484, "learning_rate": 0.00014547508748938962, "loss": 2.9512, "step": 2520 }, { "epoch": 0.7015688593592375, "grad_norm": 2.4776804447174072, "learning_rate": 0.00014543609802710375, "loss": 2.9456, "step": 2521 }, { "epoch": 0.7018471492677497, "grad_norm": 3.0254149436950684, "learning_rate": 0.00014539709985889385, "loss": 2.7601, "step": 2522 }, { "epoch": 0.7021254391762619, "grad_norm": 2.671382188796997, "learning_rate": 0.00014535809299223227, "loss": 2.8316, "step": 2523 }, { "epoch": 0.7024037290847741, "grad_norm": 2.5283820629119873, "learning_rate": 0.000145319077434593, "loss": 3.0017, "step": 2524 }, { "epoch": 0.7026820189932863, "grad_norm": 2.512965440750122, "learning_rate": 0.00014528005319345183, "loss": 2.8257, "step": 2525 }, { "epoch": 0.7029603089017985, "grad_norm": 3.0543930530548096, "learning_rate": 0.00014524102027628602, "loss": 3.0491, "step": 2526 }, { "epoch": 0.7032385988103106, "grad_norm": 3.301297903060913, "learning_rate": 0.00014520197869057468, "loss": 2.9661, "step": 2527 }, { "epoch": 0.7035168887188228, "grad_norm": 2.6495959758758545, "learning_rate": 0.00014516292844379844, "loss": 2.9416, "step": 2528 }, { "epoch": 0.703795178627335, "grad_norm": 2.6722586154937744, "learning_rate": 0.00014512386954343965, "loss": 2.9915, "step": 2529 }, { "epoch": 0.7040734685358472, "grad_norm": 2.713003396987915, "learning_rate": 0.00014508480199698236, "loss": 2.9126, "step": 2530 }, { "epoch": 0.7043517584443594, "grad_norm": 2.2845146656036377, "learning_rate": 0.00014504572581191212, "loss": 2.9811, "step": 2531 }, { "epoch": 0.7046300483528716, "grad_norm": 2.944228172302246, "learning_rate": 0.00014500664099571637, "loss": 3.1304, "step": 2532 }, { "epoch": 0.7049083382613838, "grad_norm": 2.6273791790008545, "learning_rate": 0.00014496754755588399, "loss": 2.752, "step": 2533 }, { "epoch": 0.705186628169896, "grad_norm": 2.3951497077941895, "learning_rate": 0.00014492844549990563, "loss": 2.9666, "step": 2534 }, { "epoch": 0.7054649180784082, "grad_norm": 2.677607774734497, "learning_rate": 0.00014488933483527356, "loss": 2.9505, "step": 2535 }, { "epoch": 0.7057432079869204, "grad_norm": 2.1963438987731934, "learning_rate": 0.0001448502155694817, "loss": 3.1214, "step": 2536 }, { "epoch": 0.7060214978954326, "grad_norm": 2.6803481578826904, "learning_rate": 0.00014481108771002558, "loss": 2.8798, "step": 2537 }, { "epoch": 0.7062997878039448, "grad_norm": 2.309009075164795, "learning_rate": 0.00014477195126440248, "loss": 2.736, "step": 2538 }, { "epoch": 0.706578077712457, "grad_norm": 2.3641457557678223, "learning_rate": 0.00014473280624011122, "loss": 2.9662, "step": 2539 }, { "epoch": 0.7068563676209692, "grad_norm": 2.99946928024292, "learning_rate": 0.00014469365264465234, "loss": 3.3244, "step": 2540 }, { "epoch": 0.7071346575294813, "grad_norm": 2.2323992252349854, "learning_rate": 0.00014465449048552793, "loss": 2.9267, "step": 2541 }, { "epoch": 0.7074129474379935, "grad_norm": 2.0293006896972656, "learning_rate": 0.00014461531977024182, "loss": 2.584, "step": 2542 }, { "epoch": 0.7076912373465057, "grad_norm": 2.2981114387512207, "learning_rate": 0.0001445761405062994, "loss": 3.2084, "step": 2543 }, { "epoch": 0.7079695272550179, "grad_norm": 2.6050686836242676, "learning_rate": 0.00014453695270120776, "loss": 3.067, "step": 2544 }, { "epoch": 0.7082478171635301, "grad_norm": 2.1764731407165527, "learning_rate": 0.0001444977563624756, "loss": 3.0972, "step": 2545 }, { "epoch": 0.7085261070720423, "grad_norm": 2.112128496170044, "learning_rate": 0.00014445855149761323, "loss": 2.7692, "step": 2546 }, { "epoch": 0.7088043969805545, "grad_norm": 2.2400307655334473, "learning_rate": 0.00014441933811413265, "loss": 2.8415, "step": 2547 }, { "epoch": 0.7090826868890667, "grad_norm": 2.0619518756866455, "learning_rate": 0.00014438011621954746, "loss": 2.7379, "step": 2548 }, { "epoch": 0.7093609767975789, "grad_norm": 2.9532580375671387, "learning_rate": 0.0001443408858213729, "loss": 2.9881, "step": 2549 }, { "epoch": 0.7096392667060911, "grad_norm": 2.6665666103363037, "learning_rate": 0.00014430164692712577, "loss": 2.8882, "step": 2550 }, { "epoch": 0.7099175566146033, "grad_norm": 2.2902913093566895, "learning_rate": 0.00014426239954432466, "loss": 2.7248, "step": 2551 }, { "epoch": 0.7101958465231154, "grad_norm": 2.467069149017334, "learning_rate": 0.00014422314368048962, "loss": 2.7093, "step": 2552 }, { "epoch": 0.7104741364316276, "grad_norm": 2.314922332763672, "learning_rate": 0.00014418387934314237, "loss": 2.8066, "step": 2553 }, { "epoch": 0.7107524263401398, "grad_norm": 2.3425540924072266, "learning_rate": 0.00014414460653980633, "loss": 3.0086, "step": 2554 }, { "epoch": 0.711030716248652, "grad_norm": 2.2888119220733643, "learning_rate": 0.00014410532527800646, "loss": 2.9697, "step": 2555 }, { "epoch": 0.7113090061571642, "grad_norm": 2.2812161445617676, "learning_rate": 0.00014406603556526936, "loss": 3.0698, "step": 2556 }, { "epoch": 0.7115872960656764, "grad_norm": 2.622490644454956, "learning_rate": 0.0001440267374091233, "loss": 2.779, "step": 2557 }, { "epoch": 0.7118655859741886, "grad_norm": 2.371513843536377, "learning_rate": 0.00014398743081709808, "loss": 2.9448, "step": 2558 }, { "epoch": 0.7121438758827008, "grad_norm": 2.6143295764923096, "learning_rate": 0.00014394811579672515, "loss": 3.1758, "step": 2559 }, { "epoch": 0.712422165791213, "grad_norm": 2.1905994415283203, "learning_rate": 0.00014390879235553765, "loss": 2.9169, "step": 2560 }, { "epoch": 0.7127004556997252, "grad_norm": 2.6578574180603027, "learning_rate": 0.0001438694605010702, "loss": 2.6499, "step": 2561 }, { "epoch": 0.7129787456082374, "grad_norm": 2.2438316345214844, "learning_rate": 0.00014383012024085914, "loss": 2.9687, "step": 2562 }, { "epoch": 0.7132570355167496, "grad_norm": 2.5514156818389893, "learning_rate": 0.0001437907715824424, "loss": 2.8051, "step": 2563 }, { "epoch": 0.7135353254252618, "grad_norm": 2.1935784816741943, "learning_rate": 0.00014375141453335942, "loss": 2.7845, "step": 2564 }, { "epoch": 0.713813615333774, "grad_norm": 2.192988634109497, "learning_rate": 0.00014371204910115142, "loss": 2.647, "step": 2565 }, { "epoch": 0.7140919052422862, "grad_norm": 2.640127182006836, "learning_rate": 0.0001436726752933611, "loss": 2.7477, "step": 2566 }, { "epoch": 0.7143701951507984, "grad_norm": 3.022184371948242, "learning_rate": 0.00014363329311753277, "loss": 2.5906, "step": 2567 }, { "epoch": 0.7146484850593106, "grad_norm": 2.5015792846679688, "learning_rate": 0.00014359390258121243, "loss": 2.8485, "step": 2568 }, { "epoch": 0.7149267749678228, "grad_norm": 2.216637372970581, "learning_rate": 0.0001435545036919476, "loss": 3.063, "step": 2569 }, { "epoch": 0.715205064876335, "grad_norm": 2.0902724266052246, "learning_rate": 0.00014351509645728743, "loss": 2.8108, "step": 2570 }, { "epoch": 0.7154833547848471, "grad_norm": 2.3379430770874023, "learning_rate": 0.00014347568088478264, "loss": 2.7082, "step": 2571 }, { "epoch": 0.7157616446933593, "grad_norm": 2.2452073097229004, "learning_rate": 0.00014343625698198562, "loss": 2.7432, "step": 2572 }, { "epoch": 0.7160399346018715, "grad_norm": 2.7764956951141357, "learning_rate": 0.0001433968247564503, "loss": 2.9776, "step": 2573 }, { "epoch": 0.7163182245103837, "grad_norm": 2.325338363647461, "learning_rate": 0.00014335738421573214, "loss": 2.7989, "step": 2574 }, { "epoch": 0.7165965144188959, "grad_norm": 2.076982259750366, "learning_rate": 0.00014331793536738834, "loss": 2.8058, "step": 2575 }, { "epoch": 0.7168748043274081, "grad_norm": 2.547137975692749, "learning_rate": 0.0001432784782189776, "loss": 2.9035, "step": 2576 }, { "epoch": 0.7171530942359202, "grad_norm": 2.0528392791748047, "learning_rate": 0.00014323901277806021, "loss": 2.6291, "step": 2577 }, { "epoch": 0.7174313841444324, "grad_norm": 2.69754958152771, "learning_rate": 0.00014319953905219814, "loss": 2.8087, "step": 2578 }, { "epoch": 0.7177096740529446, "grad_norm": 2.252431869506836, "learning_rate": 0.00014316005704895478, "loss": 2.5687, "step": 2579 }, { "epoch": 0.7179879639614568, "grad_norm": 2.2771408557891846, "learning_rate": 0.00014312056677589526, "loss": 2.6667, "step": 2580 }, { "epoch": 0.718266253869969, "grad_norm": 1.9985313415527344, "learning_rate": 0.00014308106824058623, "loss": 3.0709, "step": 2581 }, { "epoch": 0.7185445437784812, "grad_norm": 2.3912034034729004, "learning_rate": 0.0001430415614505959, "loss": 2.7862, "step": 2582 }, { "epoch": 0.7188228336869934, "grad_norm": 2.6025936603546143, "learning_rate": 0.0001430020464134941, "loss": 2.8372, "step": 2583 }, { "epoch": 0.7191011235955056, "grad_norm": 1.9390627145767212, "learning_rate": 0.00014296252313685226, "loss": 2.5845, "step": 2584 }, { "epoch": 0.7193794135040178, "grad_norm": 2.14170503616333, "learning_rate": 0.00014292299162824333, "loss": 2.7378, "step": 2585 }, { "epoch": 0.71965770341253, "grad_norm": 2.547639846801758, "learning_rate": 0.00014288345189524188, "loss": 2.9077, "step": 2586 }, { "epoch": 0.7199359933210422, "grad_norm": 2.3925774097442627, "learning_rate": 0.00014284390394542402, "loss": 2.9096, "step": 2587 }, { "epoch": 0.7202142832295544, "grad_norm": 2.330439805984497, "learning_rate": 0.0001428043477863675, "loss": 2.9604, "step": 2588 }, { "epoch": 0.7204925731380666, "grad_norm": 2.5451998710632324, "learning_rate": 0.00014276478342565159, "loss": 2.9989, "step": 2589 }, { "epoch": 0.7207708630465788, "grad_norm": 2.2671308517456055, "learning_rate": 0.00014272521087085708, "loss": 2.6208, "step": 2590 }, { "epoch": 0.721049152955091, "grad_norm": 2.3839287757873535, "learning_rate": 0.00014268563012956648, "loss": 2.8776, "step": 2591 }, { "epoch": 0.7213274428636032, "grad_norm": 2.4791343212127686, "learning_rate": 0.00014264604120936374, "loss": 2.7955, "step": 2592 }, { "epoch": 0.7216057327721154, "grad_norm": 2.0680160522460938, "learning_rate": 0.0001426064441178344, "loss": 2.6551, "step": 2593 }, { "epoch": 0.7218840226806276, "grad_norm": 2.217759609222412, "learning_rate": 0.00014256683886256562, "loss": 2.7938, "step": 2594 }, { "epoch": 0.7221623125891398, "grad_norm": 2.39816951751709, "learning_rate": 0.00014252722545114605, "loss": 2.9254, "step": 2595 }, { "epoch": 0.722440602497652, "grad_norm": 2.291930913925171, "learning_rate": 0.00014248760389116595, "loss": 2.9397, "step": 2596 }, { "epoch": 0.7227188924061642, "grad_norm": 2.6097464561462402, "learning_rate": 0.00014244797419021718, "loss": 3.0135, "step": 2597 }, { "epoch": 0.7229971823146764, "grad_norm": 2.229104518890381, "learning_rate": 0.00014240833635589304, "loss": 2.9638, "step": 2598 }, { "epoch": 0.7232754722231886, "grad_norm": 2.2221601009368896, "learning_rate": 0.0001423686903957885, "loss": 2.9406, "step": 2599 }, { "epoch": 0.7235537621317008, "grad_norm": 2.3216845989227295, "learning_rate": 0.00014232903631750002, "loss": 2.8307, "step": 2600 }, { "epoch": 0.7235537621317008, "eval_loss": 2.8894307613372803, "eval_runtime": 84.27, "eval_samples_per_second": 59.333, "eval_steps_per_second": 14.833, "step": 2600 }, { "epoch": 0.7238320520402128, "grad_norm": 2.8004348278045654, "learning_rate": 0.0001422893741286257, "loss": 3.1698, "step": 2601 }, { "epoch": 0.724110341948725, "grad_norm": 2.4075427055358887, "learning_rate": 0.00014224970383676505, "loss": 2.4907, "step": 2602 }, { "epoch": 0.7243886318572372, "grad_norm": 2.024212121963501, "learning_rate": 0.00014221002544951932, "loss": 2.702, "step": 2603 }, { "epoch": 0.7246669217657494, "grad_norm": 2.610654354095459, "learning_rate": 0.0001421703389744911, "loss": 2.9358, "step": 2604 }, { "epoch": 0.7249452116742616, "grad_norm": 2.383971691131592, "learning_rate": 0.00014213064441928472, "loss": 2.9066, "step": 2605 }, { "epoch": 0.7252235015827738, "grad_norm": 2.312650442123413, "learning_rate": 0.00014209094179150596, "loss": 2.6078, "step": 2606 }, { "epoch": 0.725501791491286, "grad_norm": 2.337151527404785, "learning_rate": 0.00014205123109876214, "loss": 2.7778, "step": 2607 }, { "epoch": 0.7257800813997982, "grad_norm": 2.4863641262054443, "learning_rate": 0.00014201151234866216, "loss": 2.8425, "step": 2608 }, { "epoch": 0.7260583713083104, "grad_norm": 2.4292097091674805, "learning_rate": 0.0001419717855488165, "loss": 2.8368, "step": 2609 }, { "epoch": 0.7263366612168226, "grad_norm": 2.1595864295959473, "learning_rate": 0.00014193205070683708, "loss": 2.6959, "step": 2610 }, { "epoch": 0.7266149511253348, "grad_norm": 2.668581008911133, "learning_rate": 0.00014189230783033744, "loss": 2.869, "step": 2611 }, { "epoch": 0.726893241033847, "grad_norm": 2.7287447452545166, "learning_rate": 0.00014185255692693261, "loss": 2.9375, "step": 2612 }, { "epoch": 0.7271715309423592, "grad_norm": 2.899278163909912, "learning_rate": 0.00014181279800423924, "loss": 2.8761, "step": 2613 }, { "epoch": 0.7274498208508714, "grad_norm": 2.1848256587982178, "learning_rate": 0.00014177303106987542, "loss": 2.5997, "step": 2614 }, { "epoch": 0.7277281107593836, "grad_norm": 2.304382801055908, "learning_rate": 0.00014173325613146083, "loss": 2.8768, "step": 2615 }, { "epoch": 0.7280064006678958, "grad_norm": 2.829383134841919, "learning_rate": 0.00014169347319661663, "loss": 2.8654, "step": 2616 }, { "epoch": 0.728284690576408, "grad_norm": 2.769886016845703, "learning_rate": 0.0001416536822729656, "loss": 3.0448, "step": 2617 }, { "epoch": 0.7285629804849202, "grad_norm": 2.743988037109375, "learning_rate": 0.00014161388336813202, "loss": 3.0869, "step": 2618 }, { "epoch": 0.7288412703934324, "grad_norm": 2.2690269947052, "learning_rate": 0.00014157407648974165, "loss": 2.8458, "step": 2619 }, { "epoch": 0.7291195603019446, "grad_norm": 2.5905332565307617, "learning_rate": 0.00014153426164542176, "loss": 2.7956, "step": 2620 }, { "epoch": 0.7293978502104568, "grad_norm": 2.682894468307495, "learning_rate": 0.0001414944388428013, "loss": 3.1649, "step": 2621 }, { "epoch": 0.729676140118969, "grad_norm": 2.2920444011688232, "learning_rate": 0.00014145460808951058, "loss": 2.8952, "step": 2622 }, { "epoch": 0.7299544300274812, "grad_norm": 2.2016422748565674, "learning_rate": 0.0001414147693931815, "loss": 2.7734, "step": 2623 }, { "epoch": 0.7302327199359934, "grad_norm": 2.5997700691223145, "learning_rate": 0.00014137492276144746, "loss": 2.8169, "step": 2624 }, { "epoch": 0.7305110098445056, "grad_norm": 2.2915844917297363, "learning_rate": 0.00014133506820194347, "loss": 2.7996, "step": 2625 }, { "epoch": 0.7307892997530177, "grad_norm": 2.5242722034454346, "learning_rate": 0.00014129520572230586, "loss": 2.9321, "step": 2626 }, { "epoch": 0.7310675896615298, "grad_norm": 2.1989855766296387, "learning_rate": 0.00014125533533017272, "loss": 2.6986, "step": 2627 }, { "epoch": 0.731345879570042, "grad_norm": 2.7659032344818115, "learning_rate": 0.0001412154570331835, "loss": 3.0791, "step": 2628 }, { "epoch": 0.7316241694785542, "grad_norm": 2.504997968673706, "learning_rate": 0.0001411755708389792, "loss": 2.9496, "step": 2629 }, { "epoch": 0.7319024593870664, "grad_norm": 2.805389165878296, "learning_rate": 0.00014113567675520233, "loss": 2.8601, "step": 2630 }, { "epoch": 0.7321807492955786, "grad_norm": 2.473259210586548, "learning_rate": 0.00014109577478949693, "loss": 2.7824, "step": 2631 }, { "epoch": 0.7324590392040908, "grad_norm": 2.484463691711426, "learning_rate": 0.00014105586494950857, "loss": 2.9093, "step": 2632 }, { "epoch": 0.732737329112603, "grad_norm": 2.420942783355713, "learning_rate": 0.00014101594724288423, "loss": 2.9642, "step": 2633 }, { "epoch": 0.7330156190211152, "grad_norm": 2.6652300357818604, "learning_rate": 0.00014097602167727254, "loss": 2.6762, "step": 2634 }, { "epoch": 0.7332939089296274, "grad_norm": 2.238347291946411, "learning_rate": 0.0001409360882603235, "loss": 3.1221, "step": 2635 }, { "epoch": 0.7335721988381396, "grad_norm": 2.679945945739746, "learning_rate": 0.00014089614699968868, "loss": 2.8431, "step": 2636 }, { "epoch": 0.7338504887466518, "grad_norm": 2.6287124156951904, "learning_rate": 0.00014085619790302117, "loss": 3.1644, "step": 2637 }, { "epoch": 0.734128778655164, "grad_norm": 2.263740062713623, "learning_rate": 0.0001408162409779756, "loss": 2.8599, "step": 2638 }, { "epoch": 0.7344070685636762, "grad_norm": 2.1983678340911865, "learning_rate": 0.0001407762762322079, "loss": 3.0291, "step": 2639 }, { "epoch": 0.7346853584721884, "grad_norm": 2.306008815765381, "learning_rate": 0.00014073630367337575, "loss": 2.649, "step": 2640 }, { "epoch": 0.7349636483807006, "grad_norm": 2.967658042907715, "learning_rate": 0.0001406963233091382, "loss": 2.7504, "step": 2641 }, { "epoch": 0.7352419382892128, "grad_norm": 2.288546323776245, "learning_rate": 0.00014065633514715576, "loss": 3.0376, "step": 2642 }, { "epoch": 0.735520228197725, "grad_norm": 2.0820252895355225, "learning_rate": 0.00014061633919509054, "loss": 2.5965, "step": 2643 }, { "epoch": 0.7357985181062372, "grad_norm": 2.4084553718566895, "learning_rate": 0.0001405763354606061, "loss": 2.8328, "step": 2644 }, { "epoch": 0.7360768080147494, "grad_norm": 2.461158514022827, "learning_rate": 0.00014053632395136738, "loss": 3.2982, "step": 2645 }, { "epoch": 0.7363550979232616, "grad_norm": 2.181114912033081, "learning_rate": 0.00014049630467504102, "loss": 2.6919, "step": 2646 }, { "epoch": 0.7366333878317738, "grad_norm": 2.018073797225952, "learning_rate": 0.00014045627763929498, "loss": 2.7487, "step": 2647 }, { "epoch": 0.736911677740286, "grad_norm": 2.795910596847534, "learning_rate": 0.00014041624285179876, "loss": 3.1671, "step": 2648 }, { "epoch": 0.7371899676487982, "grad_norm": 2.7975785732269287, "learning_rate": 0.00014037620032022338, "loss": 2.7447, "step": 2649 }, { "epoch": 0.7374682575573104, "grad_norm": 2.090843677520752, "learning_rate": 0.00014033615005224132, "loss": 2.6899, "step": 2650 }, { "epoch": 0.7377465474658225, "grad_norm": 2.373931646347046, "learning_rate": 0.00014029609205552652, "loss": 2.9698, "step": 2651 }, { "epoch": 0.7380248373743347, "grad_norm": 2.2109851837158203, "learning_rate": 0.00014025602633775436, "loss": 2.6409, "step": 2652 }, { "epoch": 0.7383031272828469, "grad_norm": 2.3415160179138184, "learning_rate": 0.00014021595290660186, "loss": 2.6524, "step": 2653 }, { "epoch": 0.7385814171913591, "grad_norm": 2.1563234329223633, "learning_rate": 0.00014017587176974732, "loss": 2.8997, "step": 2654 }, { "epoch": 0.7388597070998713, "grad_norm": 2.5548288822174072, "learning_rate": 0.00014013578293487066, "loss": 3.0395, "step": 2655 }, { "epoch": 0.7391379970083835, "grad_norm": 2.3461854457855225, "learning_rate": 0.00014009568640965326, "loss": 2.8754, "step": 2656 }, { "epoch": 0.7394162869168956, "grad_norm": 2.2088019847869873, "learning_rate": 0.00014005558220177782, "loss": 2.8318, "step": 2657 }, { "epoch": 0.7396945768254078, "grad_norm": 2.3841652870178223, "learning_rate": 0.00014001547031892877, "loss": 2.7747, "step": 2658 }, { "epoch": 0.73997286673392, "grad_norm": 2.5456504821777344, "learning_rate": 0.0001399753507687918, "loss": 3.1313, "step": 2659 }, { "epoch": 0.7402511566424322, "grad_norm": 2.6195147037506104, "learning_rate": 0.0001399352235590541, "loss": 2.8082, "step": 2660 }, { "epoch": 0.7405294465509444, "grad_norm": 2.132760763168335, "learning_rate": 0.00013989508869740447, "loss": 3.1209, "step": 2661 }, { "epoch": 0.7408077364594566, "grad_norm": 2.475168228149414, "learning_rate": 0.000139854946191533, "loss": 3.0419, "step": 2662 }, { "epoch": 0.7410860263679688, "grad_norm": 2.268988847732544, "learning_rate": 0.0001398147960491313, "loss": 3.0655, "step": 2663 }, { "epoch": 0.741364316276481, "grad_norm": 2.247349739074707, "learning_rate": 0.00013977463827789254, "loss": 2.8504, "step": 2664 }, { "epoch": 0.7416426061849932, "grad_norm": 2.3417980670928955, "learning_rate": 0.00013973447288551124, "loss": 2.9516, "step": 2665 }, { "epoch": 0.7419208960935054, "grad_norm": 2.6716747283935547, "learning_rate": 0.0001396942998796834, "loss": 2.8584, "step": 2666 }, { "epoch": 0.7421991860020176, "grad_norm": 2.004079580307007, "learning_rate": 0.00013965411926810647, "loss": 2.5631, "step": 2667 }, { "epoch": 0.7424774759105298, "grad_norm": 2.642584800720215, "learning_rate": 0.00013961393105847944, "loss": 3.2124, "step": 2668 }, { "epoch": 0.742755765819042, "grad_norm": 2.334580183029175, "learning_rate": 0.00013957373525850267, "loss": 2.92, "step": 2669 }, { "epoch": 0.7430340557275542, "grad_norm": 2.430765151977539, "learning_rate": 0.000139533531875878, "loss": 2.8986, "step": 2670 }, { "epoch": 0.7433123456360664, "grad_norm": 2.3277268409729004, "learning_rate": 0.00013949332091830875, "loss": 2.8168, "step": 2671 }, { "epoch": 0.7435906355445786, "grad_norm": 2.169621229171753, "learning_rate": 0.00013945310239349957, "loss": 2.7348, "step": 2672 }, { "epoch": 0.7438689254530908, "grad_norm": 2.1072797775268555, "learning_rate": 0.00013941287630915675, "loss": 2.7992, "step": 2673 }, { "epoch": 0.744147215361603, "grad_norm": 2.645357370376587, "learning_rate": 0.00013937264267298793, "loss": 2.9555, "step": 2674 }, { "epoch": 0.7444255052701152, "grad_norm": 1.926472783088684, "learning_rate": 0.00013933240149270216, "loss": 2.7531, "step": 2675 }, { "epoch": 0.7447037951786273, "grad_norm": 2.4407100677490234, "learning_rate": 0.00013929215277600998, "loss": 2.8698, "step": 2676 }, { "epoch": 0.7449820850871395, "grad_norm": 2.4590470790863037, "learning_rate": 0.0001392518965306234, "loss": 3.0502, "step": 2677 }, { "epoch": 0.7452603749956517, "grad_norm": 3.1190521717071533, "learning_rate": 0.00013921163276425582, "loss": 2.6229, "step": 2678 }, { "epoch": 0.7455386649041639, "grad_norm": 2.4012014865875244, "learning_rate": 0.00013917136148462207, "loss": 2.6994, "step": 2679 }, { "epoch": 0.7458169548126761, "grad_norm": 2.2991693019866943, "learning_rate": 0.00013913108269943852, "loss": 3.0587, "step": 2680 }, { "epoch": 0.7460952447211883, "grad_norm": 2.248157501220703, "learning_rate": 0.0001390907964164229, "loss": 2.9231, "step": 2681 }, { "epoch": 0.7463735346297005, "grad_norm": 2.3110578060150146, "learning_rate": 0.00013905050264329436, "loss": 3.0398, "step": 2682 }, { "epoch": 0.7466518245382127, "grad_norm": 2.76767635345459, "learning_rate": 0.00013901020138777352, "loss": 3.2035, "step": 2683 }, { "epoch": 0.7469301144467249, "grad_norm": 2.4784085750579834, "learning_rate": 0.00013896989265758242, "loss": 3.1686, "step": 2684 }, { "epoch": 0.747208404355237, "grad_norm": 2.5520057678222656, "learning_rate": 0.00013892957646044452, "loss": 3.0799, "step": 2685 }, { "epoch": 0.7474866942637493, "grad_norm": 2.453796863555908, "learning_rate": 0.0001388892528040848, "loss": 3.0264, "step": 2686 }, { "epoch": 0.7477649841722614, "grad_norm": 2.224375009536743, "learning_rate": 0.00013884892169622952, "loss": 2.4847, "step": 2687 }, { "epoch": 0.7480432740807736, "grad_norm": 2.3240966796875, "learning_rate": 0.00013880858314460653, "loss": 2.9901, "step": 2688 }, { "epoch": 0.7483215639892858, "grad_norm": 2.4380455017089844, "learning_rate": 0.0001387682371569449, "loss": 2.8329, "step": 2689 }, { "epoch": 0.748599853897798, "grad_norm": 2.739887237548828, "learning_rate": 0.0001387278837409754, "loss": 3.1922, "step": 2690 }, { "epoch": 0.7488781438063102, "grad_norm": 2.0786569118499756, "learning_rate": 0.00013868752290442996, "loss": 2.8163, "step": 2691 }, { "epoch": 0.7491564337148224, "grad_norm": 2.822497844696045, "learning_rate": 0.00013864715465504207, "loss": 2.9972, "step": 2692 }, { "epoch": 0.7494347236233346, "grad_norm": 2.298771381378174, "learning_rate": 0.00013860677900054663, "loss": 2.7292, "step": 2693 }, { "epoch": 0.7497130135318468, "grad_norm": 2.7373735904693604, "learning_rate": 0.00013856639594867995, "loss": 3.0687, "step": 2694 }, { "epoch": 0.749991303440359, "grad_norm": 2.7172389030456543, "learning_rate": 0.0001385260055071797, "loss": 2.8058, "step": 2695 }, { "epoch": 0.7502695933488712, "grad_norm": 2.2732293605804443, "learning_rate": 0.00013848560768378506, "loss": 2.9586, "step": 2696 }, { "epoch": 0.7505478832573834, "grad_norm": 2.2830874919891357, "learning_rate": 0.0001384452024862366, "loss": 2.9663, "step": 2697 }, { "epoch": 0.7508261731658956, "grad_norm": 2.336170196533203, "learning_rate": 0.0001384047899222762, "loss": 2.7111, "step": 2698 }, { "epoch": 0.7511044630744078, "grad_norm": 2.4633781909942627, "learning_rate": 0.0001383643699996473, "loss": 2.8409, "step": 2699 }, { "epoch": 0.75138275298292, "grad_norm": 2.323568344116211, "learning_rate": 0.00013832394272609468, "loss": 2.7508, "step": 2700 }, { "epoch": 0.75138275298292, "eval_loss": 2.8838729858398438, "eval_runtime": 84.3549, "eval_samples_per_second": 59.273, "eval_steps_per_second": 14.818, "step": 2700 }, { "epoch": 0.7516610428914321, "grad_norm": 2.1870124340057373, "learning_rate": 0.0001382835081093645, "loss": 2.6269, "step": 2701 }, { "epoch": 0.7519393327999443, "grad_norm": 2.108917474746704, "learning_rate": 0.00013824306615720441, "loss": 2.7686, "step": 2702 }, { "epoch": 0.7522176227084565, "grad_norm": 4.9598469734191895, "learning_rate": 0.00013820261687736338, "loss": 2.9284, "step": 2703 }, { "epoch": 0.7524959126169687, "grad_norm": 2.233964443206787, "learning_rate": 0.00013816216027759179, "loss": 2.7908, "step": 2704 }, { "epoch": 0.7527742025254809, "grad_norm": 2.3443915843963623, "learning_rate": 0.00013812169636564153, "loss": 2.7673, "step": 2705 }, { "epoch": 0.7530524924339931, "grad_norm": 3.093904972076416, "learning_rate": 0.00013808122514926576, "loss": 3.0042, "step": 2706 }, { "epoch": 0.7533307823425053, "grad_norm": 2.2356021404266357, "learning_rate": 0.00013804074663621908, "loss": 2.8657, "step": 2707 }, { "epoch": 0.7536090722510175, "grad_norm": 2.4930474758148193, "learning_rate": 0.00013800026083425753, "loss": 3.2387, "step": 2708 }, { "epoch": 0.7538873621595297, "grad_norm": 2.207085609436035, "learning_rate": 0.00013795976775113851, "loss": 2.6486, "step": 2709 }, { "epoch": 0.7541656520680419, "grad_norm": 2.2075977325439453, "learning_rate": 0.00013791926739462084, "loss": 2.8325, "step": 2710 }, { "epoch": 0.7544439419765541, "grad_norm": 2.063286066055298, "learning_rate": 0.0001378787597724647, "loss": 2.7184, "step": 2711 }, { "epoch": 0.7547222318850663, "grad_norm": 2.6886770725250244, "learning_rate": 0.00013783824489243165, "loss": 2.834, "step": 2712 }, { "epoch": 0.7550005217935785, "grad_norm": 2.3059167861938477, "learning_rate": 0.00013779772276228472, "loss": 2.6904, "step": 2713 }, { "epoch": 0.7552788117020907, "grad_norm": 2.7630717754364014, "learning_rate": 0.00013775719338978825, "loss": 3.0455, "step": 2714 }, { "epoch": 0.7555571016106029, "grad_norm": 2.417217493057251, "learning_rate": 0.00013771665678270802, "loss": 3.2447, "step": 2715 }, { "epoch": 0.755835391519115, "grad_norm": 2.4200637340545654, "learning_rate": 0.0001376761129488111, "loss": 2.8498, "step": 2716 }, { "epoch": 0.7561136814276272, "grad_norm": 2.1971914768218994, "learning_rate": 0.00013763556189586612, "loss": 3.0316, "step": 2717 }, { "epoch": 0.7563919713361394, "grad_norm": 2.3966140747070312, "learning_rate": 0.00013759500363164293, "loss": 2.8705, "step": 2718 }, { "epoch": 0.7566702612446516, "grad_norm": 2.3725390434265137, "learning_rate": 0.0001375544381639128, "loss": 3.0095, "step": 2719 }, { "epoch": 0.7569485511531638, "grad_norm": 2.063838243484497, "learning_rate": 0.00013751386550044846, "loss": 2.8169, "step": 2720 }, { "epoch": 0.757226841061676, "grad_norm": 2.404046058654785, "learning_rate": 0.00013747328564902393, "loss": 2.5408, "step": 2721 }, { "epoch": 0.7575051309701882, "grad_norm": 2.134890079498291, "learning_rate": 0.00013743269861741465, "loss": 3.0515, "step": 2722 }, { "epoch": 0.7577834208787004, "grad_norm": 2.4662671089172363, "learning_rate": 0.00013739210441339738, "loss": 2.9821, "step": 2723 }, { "epoch": 0.7580617107872126, "grad_norm": 2.660399913787842, "learning_rate": 0.00013735150304475035, "loss": 2.8721, "step": 2724 }, { "epoch": 0.7583400006957248, "grad_norm": 2.4874267578125, "learning_rate": 0.00013731089451925307, "loss": 2.7903, "step": 2725 }, { "epoch": 0.7586182906042369, "grad_norm": 2.1065773963928223, "learning_rate": 0.00013727027884468654, "loss": 2.757, "step": 2726 }, { "epoch": 0.7588965805127491, "grad_norm": 2.319070816040039, "learning_rate": 0.00013722965602883295, "loss": 2.8591, "step": 2727 }, { "epoch": 0.7591748704212613, "grad_norm": 2.35707426071167, "learning_rate": 0.00013718902607947603, "loss": 3.1987, "step": 2728 }, { "epoch": 0.7594531603297735, "grad_norm": 2.6881465911865234, "learning_rate": 0.00013714838900440074, "loss": 3.0864, "step": 2729 }, { "epoch": 0.7597314502382857, "grad_norm": 2.459284782409668, "learning_rate": 0.00013710774481139358, "loss": 2.998, "step": 2730 }, { "epoch": 0.7600097401467979, "grad_norm": 2.487508773803711, "learning_rate": 0.00013706709350824224, "loss": 2.9996, "step": 2731 }, { "epoch": 0.7602880300553101, "grad_norm": 2.491698980331421, "learning_rate": 0.0001370264351027358, "loss": 2.9012, "step": 2732 }, { "epoch": 0.7605663199638223, "grad_norm": 2.188878297805786, "learning_rate": 0.0001369857696026648, "loss": 2.7304, "step": 2733 }, { "epoch": 0.7608446098723345, "grad_norm": 1.9824713468551636, "learning_rate": 0.00013694509701582105, "loss": 2.5473, "step": 2734 }, { "epoch": 0.7611228997808467, "grad_norm": 2.4307992458343506, "learning_rate": 0.00013690441734999777, "loss": 2.888, "step": 2735 }, { "epoch": 0.7614011896893589, "grad_norm": 2.199894666671753, "learning_rate": 0.0001368637306129895, "loss": 2.4989, "step": 2736 }, { "epoch": 0.7616794795978711, "grad_norm": 2.589210033416748, "learning_rate": 0.00013682303681259215, "loss": 2.5932, "step": 2737 }, { "epoch": 0.7619577695063833, "grad_norm": 2.9834885597229004, "learning_rate": 0.00013678233595660296, "loss": 2.8705, "step": 2738 }, { "epoch": 0.7622360594148955, "grad_norm": 2.751154899597168, "learning_rate": 0.00013674162805282056, "loss": 2.8666, "step": 2739 }, { "epoch": 0.7625143493234077, "grad_norm": 2.5837810039520264, "learning_rate": 0.00013670091310904496, "loss": 2.8899, "step": 2740 }, { "epoch": 0.7627926392319199, "grad_norm": 2.0701000690460205, "learning_rate": 0.00013666019113307737, "loss": 2.823, "step": 2741 }, { "epoch": 0.7630709291404321, "grad_norm": 2.6528501510620117, "learning_rate": 0.00013661946213272053, "loss": 2.7506, "step": 2742 }, { "epoch": 0.7633492190489443, "grad_norm": 2.3916521072387695, "learning_rate": 0.00013657872611577847, "loss": 2.8499, "step": 2743 }, { "epoch": 0.7636275089574565, "grad_norm": 2.3901100158691406, "learning_rate": 0.00013653798309005643, "loss": 3.117, "step": 2744 }, { "epoch": 0.7639057988659687, "grad_norm": 2.4126222133636475, "learning_rate": 0.00013649723306336122, "loss": 2.9432, "step": 2745 }, { "epoch": 0.7641840887744809, "grad_norm": 2.4491443634033203, "learning_rate": 0.0001364564760435008, "loss": 2.8246, "step": 2746 }, { "epoch": 0.764462378682993, "grad_norm": 2.1873321533203125, "learning_rate": 0.00013641571203828454, "loss": 2.7621, "step": 2747 }, { "epoch": 0.7647406685915052, "grad_norm": 2.2900521755218506, "learning_rate": 0.00013637494105552323, "loss": 2.859, "step": 2748 }, { "epoch": 0.7650189585000174, "grad_norm": 2.7229080200195312, "learning_rate": 0.00013633416310302886, "loss": 2.8401, "step": 2749 }, { "epoch": 0.7652972484085296, "grad_norm": 2.195042848587036, "learning_rate": 0.00013629337818861481, "loss": 2.6483, "step": 2750 }, { "epoch": 0.7655755383170417, "grad_norm": 2.5500874519348145, "learning_rate": 0.00013625258632009584, "loss": 2.7797, "step": 2751 }, { "epoch": 0.7658538282255539, "grad_norm": 2.372342109680176, "learning_rate": 0.000136211787505288, "loss": 2.861, "step": 2752 }, { "epoch": 0.7661321181340661, "grad_norm": 2.394554376602173, "learning_rate": 0.00013617098175200857, "loss": 2.9702, "step": 2753 }, { "epoch": 0.7664104080425783, "grad_norm": 2.4847848415374756, "learning_rate": 0.00013613016906807642, "loss": 2.893, "step": 2754 }, { "epoch": 0.7666886979510905, "grad_norm": 2.192683696746826, "learning_rate": 0.00013608934946131152, "loss": 2.7701, "step": 2755 }, { "epoch": 0.7669669878596027, "grad_norm": 2.3759162425994873, "learning_rate": 0.00013604852293953518, "loss": 2.7531, "step": 2756 }, { "epoch": 0.7672452777681149, "grad_norm": 2.3491714000701904, "learning_rate": 0.0001360076895105702, "loss": 2.9028, "step": 2757 }, { "epoch": 0.7675235676766271, "grad_norm": 2.5839829444885254, "learning_rate": 0.0001359668491822405, "loss": 2.7521, "step": 2758 }, { "epoch": 0.7678018575851393, "grad_norm": 2.102311372756958, "learning_rate": 0.00013592600196237145, "loss": 2.7326, "step": 2759 }, { "epoch": 0.7680801474936515, "grad_norm": 2.3689069747924805, "learning_rate": 0.00013588514785878975, "loss": 2.8104, "step": 2760 }, { "epoch": 0.7683584374021637, "grad_norm": 2.4586732387542725, "learning_rate": 0.00013584428687932334, "loss": 2.828, "step": 2761 }, { "epoch": 0.7686367273106759, "grad_norm": 2.313019037246704, "learning_rate": 0.00013580341903180146, "loss": 2.8619, "step": 2762 }, { "epoch": 0.7689150172191881, "grad_norm": 2.50771164894104, "learning_rate": 0.00013576254432405484, "loss": 2.7772, "step": 2763 }, { "epoch": 0.7691933071277003, "grad_norm": 2.8097000122070312, "learning_rate": 0.00013572166276391533, "loss": 3.1496, "step": 2764 }, { "epoch": 0.7694715970362125, "grad_norm": 2.6353299617767334, "learning_rate": 0.00013568077435921616, "loss": 2.9974, "step": 2765 }, { "epoch": 0.7697498869447247, "grad_norm": 2.169436454772949, "learning_rate": 0.00013563987911779192, "loss": 2.7264, "step": 2766 }, { "epoch": 0.7700281768532369, "grad_norm": 2.1435797214508057, "learning_rate": 0.00013559897704747842, "loss": 2.8991, "step": 2767 }, { "epoch": 0.7703064667617491, "grad_norm": 2.1802544593811035, "learning_rate": 0.00013555806815611287, "loss": 2.9603, "step": 2768 }, { "epoch": 0.7705847566702613, "grad_norm": 2.4861719608306885, "learning_rate": 0.00013551715245153375, "loss": 2.8672, "step": 2769 }, { "epoch": 0.7708630465787735, "grad_norm": 2.1184139251708984, "learning_rate": 0.00013547622994158077, "loss": 2.8142, "step": 2770 }, { "epoch": 0.7711413364872857, "grad_norm": 2.651750326156616, "learning_rate": 0.00013543530063409514, "loss": 3.1313, "step": 2771 }, { "epoch": 0.7714196263957979, "grad_norm": 2.2516672611236572, "learning_rate": 0.0001353943645369191, "loss": 2.8378, "step": 2772 }, { "epoch": 0.7716979163043101, "grad_norm": 2.5009119510650635, "learning_rate": 0.00013535342165789643, "loss": 2.9966, "step": 2773 }, { "epoch": 0.7719762062128223, "grad_norm": 2.965012550354004, "learning_rate": 0.00013531247200487213, "loss": 2.977, "step": 2774 }, { "epoch": 0.7722544961213345, "grad_norm": 2.1602132320404053, "learning_rate": 0.00013527151558569237, "loss": 2.7641, "step": 2775 }, { "epoch": 0.7725327860298465, "grad_norm": 2.2205309867858887, "learning_rate": 0.0001352305524082049, "loss": 2.687, "step": 2776 }, { "epoch": 0.7728110759383587, "grad_norm": 2.868004560470581, "learning_rate": 0.00013518958248025847, "loss": 3.1157, "step": 2777 }, { "epoch": 0.7730893658468709, "grad_norm": 2.6935765743255615, "learning_rate": 0.00013514860580970327, "loss": 3.1045, "step": 2778 }, { "epoch": 0.7733676557553831, "grad_norm": 2.6112964153289795, "learning_rate": 0.00013510762240439078, "loss": 2.8188, "step": 2779 }, { "epoch": 0.7736459456638953, "grad_norm": 2.2648119926452637, "learning_rate": 0.00013506663227217378, "loss": 2.6937, "step": 2780 }, { "epoch": 0.7739242355724075, "grad_norm": 2.767502546310425, "learning_rate": 0.00013502563542090626, "loss": 3.0061, "step": 2781 }, { "epoch": 0.7742025254809197, "grad_norm": 2.7275753021240234, "learning_rate": 0.00013498463185844357, "loss": 2.8665, "step": 2782 }, { "epoch": 0.7744808153894319, "grad_norm": 2.000126361846924, "learning_rate": 0.00013494362159264232, "loss": 2.6032, "step": 2783 }, { "epoch": 0.7747591052979441, "grad_norm": 2.311316967010498, "learning_rate": 0.00013490260463136045, "loss": 2.8058, "step": 2784 }, { "epoch": 0.7750373952064563, "grad_norm": 2.261234998703003, "learning_rate": 0.00013486158098245706, "loss": 2.4667, "step": 2785 }, { "epoch": 0.7753156851149685, "grad_norm": 2.5440738201141357, "learning_rate": 0.00013482055065379268, "loss": 3.0884, "step": 2786 }, { "epoch": 0.7755939750234807, "grad_norm": 2.166848659515381, "learning_rate": 0.00013477951365322902, "loss": 2.8172, "step": 2787 }, { "epoch": 0.7758722649319929, "grad_norm": 2.030203342437744, "learning_rate": 0.00013473846998862912, "loss": 2.8992, "step": 2788 }, { "epoch": 0.7761505548405051, "grad_norm": 2.4010019302368164, "learning_rate": 0.00013469741966785727, "loss": 2.6293, "step": 2789 }, { "epoch": 0.7764288447490173, "grad_norm": 3.002753496170044, "learning_rate": 0.00013465636269877903, "loss": 3.0517, "step": 2790 }, { "epoch": 0.7767071346575295, "grad_norm": 2.9699501991271973, "learning_rate": 0.0001346152990892613, "loss": 2.9055, "step": 2791 }, { "epoch": 0.7769854245660417, "grad_norm": 2.690847396850586, "learning_rate": 0.00013457422884717213, "loss": 3.317, "step": 2792 }, { "epoch": 0.7772637144745539, "grad_norm": 2.401961326599121, "learning_rate": 0.00013453315198038094, "loss": 2.697, "step": 2793 }, { "epoch": 0.7775420043830661, "grad_norm": 2.7697112560272217, "learning_rate": 0.00013449206849675842, "loss": 3.1001, "step": 2794 }, { "epoch": 0.7778202942915783, "grad_norm": 2.2446179389953613, "learning_rate": 0.00013445097840417646, "loss": 2.9235, "step": 2795 }, { "epoch": 0.7780985842000905, "grad_norm": 2.5357322692871094, "learning_rate": 0.00013440988171050825, "loss": 2.7856, "step": 2796 }, { "epoch": 0.7783768741086027, "grad_norm": 2.336679697036743, "learning_rate": 0.00013436877842362826, "loss": 2.5776, "step": 2797 }, { "epoch": 0.7786551640171149, "grad_norm": 2.34267258644104, "learning_rate": 0.00013432766855141226, "loss": 2.6041, "step": 2798 }, { "epoch": 0.7789334539256271, "grad_norm": 2.6033670902252197, "learning_rate": 0.00013428655210173714, "loss": 2.8294, "step": 2799 }, { "epoch": 0.7792117438341393, "grad_norm": 2.4635872840881348, "learning_rate": 0.00013424542908248123, "loss": 3.1238, "step": 2800 }, { "epoch": 0.7792117438341393, "eval_loss": 2.882225275039673, "eval_runtime": 84.4673, "eval_samples_per_second": 59.195, "eval_steps_per_second": 14.799, "step": 2800 }, { "epoch": 0.7794900337426514, "grad_norm": 2.340282678604126, "learning_rate": 0.00013420429950152397, "loss": 2.8221, "step": 2801 }, { "epoch": 0.7797683236511636, "grad_norm": 2.2594597339630127, "learning_rate": 0.00013416316336674618, "loss": 2.954, "step": 2802 }, { "epoch": 0.7800466135596757, "grad_norm": 2.6319468021392822, "learning_rate": 0.00013412202068602982, "loss": 2.7503, "step": 2803 }, { "epoch": 0.780324903468188, "grad_norm": 2.456671714782715, "learning_rate": 0.00013408087146725818, "loss": 2.9226, "step": 2804 }, { "epoch": 0.7806031933767001, "grad_norm": 2.5043864250183105, "learning_rate": 0.0001340397157183158, "loss": 3.0368, "step": 2805 }, { "epoch": 0.7808814832852123, "grad_norm": 2.6903975009918213, "learning_rate": 0.00013399855344708845, "loss": 2.7342, "step": 2806 }, { "epoch": 0.7811597731937245, "grad_norm": 2.637270450592041, "learning_rate": 0.00013395738466146311, "loss": 2.9507, "step": 2807 }, { "epoch": 0.7814380631022367, "grad_norm": 2.363246440887451, "learning_rate": 0.00013391620936932815, "loss": 2.6864, "step": 2808 }, { "epoch": 0.7817163530107489, "grad_norm": 2.387957811355591, "learning_rate": 0.00013387502757857299, "loss": 2.7638, "step": 2809 }, { "epoch": 0.7819946429192611, "grad_norm": 2.5388855934143066, "learning_rate": 0.00013383383929708843, "loss": 2.8964, "step": 2810 }, { "epoch": 0.7822729328277733, "grad_norm": 2.333359956741333, "learning_rate": 0.00013379264453276652, "loss": 2.7608, "step": 2811 }, { "epoch": 0.7825512227362855, "grad_norm": 2.4610865116119385, "learning_rate": 0.00013375144329350042, "loss": 2.7976, "step": 2812 }, { "epoch": 0.7828295126447977, "grad_norm": 2.7037317752838135, "learning_rate": 0.0001337102355871847, "loss": 2.7819, "step": 2813 }, { "epoch": 0.7831078025533099, "grad_norm": 2.5659358501434326, "learning_rate": 0.00013366902142171507, "loss": 2.8921, "step": 2814 }, { "epoch": 0.7833860924618221, "grad_norm": 2.577279806137085, "learning_rate": 0.0001336278008049885, "loss": 2.7266, "step": 2815 }, { "epoch": 0.7836643823703343, "grad_norm": 2.594400644302368, "learning_rate": 0.00013358657374490314, "loss": 3.0125, "step": 2816 }, { "epoch": 0.7839426722788465, "grad_norm": 2.270775318145752, "learning_rate": 0.00013354534024935852, "loss": 3.1689, "step": 2817 }, { "epoch": 0.7842209621873587, "grad_norm": 2.4255192279815674, "learning_rate": 0.00013350410032625523, "loss": 2.787, "step": 2818 }, { "epoch": 0.7844992520958709, "grad_norm": 2.006786584854126, "learning_rate": 0.00013346285398349524, "loss": 2.5805, "step": 2819 }, { "epoch": 0.7847775420043831, "grad_norm": 2.311858892440796, "learning_rate": 0.0001334216012289816, "loss": 2.8503, "step": 2820 }, { "epoch": 0.7850558319128953, "grad_norm": 2.9306201934814453, "learning_rate": 0.00013338034207061874, "loss": 3.194, "step": 2821 }, { "epoch": 0.7853341218214075, "grad_norm": 2.5281338691711426, "learning_rate": 0.00013333907651631225, "loss": 2.9857, "step": 2822 }, { "epoch": 0.7856124117299197, "grad_norm": 2.4619383811950684, "learning_rate": 0.0001332978045739689, "loss": 2.8281, "step": 2823 }, { "epoch": 0.7858907016384319, "grad_norm": 2.2377512454986572, "learning_rate": 0.00013325652625149678, "loss": 2.6092, "step": 2824 }, { "epoch": 0.786168991546944, "grad_norm": 2.1621153354644775, "learning_rate": 0.00013321524155680508, "loss": 2.6982, "step": 2825 }, { "epoch": 0.7864472814554562, "grad_norm": 2.269160270690918, "learning_rate": 0.00013317395049780437, "loss": 2.6553, "step": 2826 }, { "epoch": 0.7867255713639684, "grad_norm": 2.1640732288360596, "learning_rate": 0.00013313265308240627, "loss": 2.8053, "step": 2827 }, { "epoch": 0.7870038612724806, "grad_norm": 2.1772875785827637, "learning_rate": 0.00013309134931852374, "loss": 3.0357, "step": 2828 }, { "epoch": 0.7872821511809928, "grad_norm": 2.2376163005828857, "learning_rate": 0.00013305003921407094, "loss": 2.754, "step": 2829 }, { "epoch": 0.787560441089505, "grad_norm": 2.2518310546875, "learning_rate": 0.00013300872277696317, "loss": 2.519, "step": 2830 }, { "epoch": 0.7878387309980172, "grad_norm": 2.025895118713379, "learning_rate": 0.000132967400015117, "loss": 2.6752, "step": 2831 }, { "epoch": 0.7881170209065294, "grad_norm": 2.4197428226470947, "learning_rate": 0.00013292607093645024, "loss": 2.6363, "step": 2832 }, { "epoch": 0.7883953108150416, "grad_norm": 2.4116156101226807, "learning_rate": 0.00013288473554888186, "loss": 2.891, "step": 2833 }, { "epoch": 0.7886736007235537, "grad_norm": 2.4579906463623047, "learning_rate": 0.000132843393860332, "loss": 2.8462, "step": 2834 }, { "epoch": 0.7889518906320659, "grad_norm": 2.279387950897217, "learning_rate": 0.00013280204587872216, "loss": 2.7628, "step": 2835 }, { "epoch": 0.7892301805405781, "grad_norm": 2.3259150981903076, "learning_rate": 0.00013276069161197488, "loss": 2.7153, "step": 2836 }, { "epoch": 0.7895084704490903, "grad_norm": 2.40762996673584, "learning_rate": 0.000132719331068014, "loss": 2.8512, "step": 2837 }, { "epoch": 0.7897867603576025, "grad_norm": 2.3239026069641113, "learning_rate": 0.00013267796425476454, "loss": 2.8924, "step": 2838 }, { "epoch": 0.7900650502661147, "grad_norm": 2.6349427700042725, "learning_rate": 0.00013263659118015268, "loss": 2.9701, "step": 2839 }, { "epoch": 0.7903433401746269, "grad_norm": 2.1970553398132324, "learning_rate": 0.00013259521185210587, "loss": 2.6071, "step": 2840 }, { "epoch": 0.7906216300831391, "grad_norm": 2.2803094387054443, "learning_rate": 0.0001325538262785527, "loss": 2.785, "step": 2841 }, { "epoch": 0.7908999199916513, "grad_norm": 2.1001508235931396, "learning_rate": 0.00013251243446742302, "loss": 2.797, "step": 2842 }, { "epoch": 0.7911782099001635, "grad_norm": 2.4026196002960205, "learning_rate": 0.00013247103642664778, "loss": 2.9509, "step": 2843 }, { "epoch": 0.7914564998086757, "grad_norm": 2.361757516860962, "learning_rate": 0.00013242963216415922, "loss": 2.5763, "step": 2844 }, { "epoch": 0.7917347897171879, "grad_norm": 2.7007718086242676, "learning_rate": 0.00013238822168789072, "loss": 2.9539, "step": 2845 }, { "epoch": 0.7920130796257001, "grad_norm": 2.4759132862091064, "learning_rate": 0.00013234680500577686, "loss": 2.8997, "step": 2846 }, { "epoch": 0.7922913695342123, "grad_norm": 2.2495381832122803, "learning_rate": 0.00013230538212575342, "loss": 2.967, "step": 2847 }, { "epoch": 0.7925696594427245, "grad_norm": 2.609720468521118, "learning_rate": 0.00013226395305575736, "loss": 2.8836, "step": 2848 }, { "epoch": 0.7928479493512367, "grad_norm": 2.7028651237487793, "learning_rate": 0.0001322225178037268, "loss": 2.8788, "step": 2849 }, { "epoch": 0.7931262392597488, "grad_norm": 2.349637269973755, "learning_rate": 0.00013218107637760112, "loss": 2.8447, "step": 2850 }, { "epoch": 0.793404529168261, "grad_norm": 2.7970075607299805, "learning_rate": 0.00013213962878532077, "loss": 2.7389, "step": 2851 }, { "epoch": 0.7936828190767732, "grad_norm": 2.313335657119751, "learning_rate": 0.00013209817503482746, "loss": 2.8769, "step": 2852 }, { "epoch": 0.7939611089852854, "grad_norm": 2.1612472534179688, "learning_rate": 0.00013205671513406413, "loss": 2.4192, "step": 2853 }, { "epoch": 0.7942393988937976, "grad_norm": 2.334526300430298, "learning_rate": 0.00013201524909097476, "loss": 2.7656, "step": 2854 }, { "epoch": 0.7945176888023098, "grad_norm": 2.521909475326538, "learning_rate": 0.00013197377691350458, "loss": 2.8994, "step": 2855 }, { "epoch": 0.794795978710822, "grad_norm": 2.223538398742676, "learning_rate": 0.00013193229860960004, "loss": 2.7074, "step": 2856 }, { "epoch": 0.7950742686193342, "grad_norm": 2.3317415714263916, "learning_rate": 0.0001318908141872087, "loss": 2.9723, "step": 2857 }, { "epoch": 0.7953525585278464, "grad_norm": 2.1646032333374023, "learning_rate": 0.00013184932365427926, "loss": 2.7442, "step": 2858 }, { "epoch": 0.7956308484363586, "grad_norm": 2.295118570327759, "learning_rate": 0.00013180782701876172, "loss": 2.6989, "step": 2859 }, { "epoch": 0.7959091383448708, "grad_norm": 2.5496535301208496, "learning_rate": 0.00013176632428860716, "loss": 3.0136, "step": 2860 }, { "epoch": 0.796187428253383, "grad_norm": 2.156581401824951, "learning_rate": 0.00013172481547176778, "loss": 2.8857, "step": 2861 }, { "epoch": 0.7964657181618952, "grad_norm": 2.612180233001709, "learning_rate": 0.00013168330057619705, "loss": 2.9784, "step": 2862 }, { "epoch": 0.7967440080704074, "grad_norm": 2.9062676429748535, "learning_rate": 0.00013164177960984957, "loss": 2.9663, "step": 2863 }, { "epoch": 0.7970222979789195, "grad_norm": 2.4610538482666016, "learning_rate": 0.00013160025258068106, "loss": 3.0692, "step": 2864 }, { "epoch": 0.7973005878874317, "grad_norm": 2.190492868423462, "learning_rate": 0.00013155871949664847, "loss": 2.749, "step": 2865 }, { "epoch": 0.7975788777959439, "grad_norm": 2.648954391479492, "learning_rate": 0.00013151718036570983, "loss": 2.989, "step": 2866 }, { "epoch": 0.7978571677044561, "grad_norm": 2.2997756004333496, "learning_rate": 0.0001314756351958244, "loss": 2.8007, "step": 2867 }, { "epoch": 0.7981354576129683, "grad_norm": 2.0664470195770264, "learning_rate": 0.00013143408399495256, "loss": 2.6175, "step": 2868 }, { "epoch": 0.7984137475214805, "grad_norm": 2.9490582942962646, "learning_rate": 0.0001313925267710559, "loss": 2.9622, "step": 2869 }, { "epoch": 0.7986920374299927, "grad_norm": 2.3423349857330322, "learning_rate": 0.00013135096353209707, "loss": 2.8389, "step": 2870 }, { "epoch": 0.7989703273385049, "grad_norm": 2.37735652923584, "learning_rate": 0.00013130939428603993, "loss": 2.7256, "step": 2871 }, { "epoch": 0.7992486172470171, "grad_norm": 2.6614909172058105, "learning_rate": 0.0001312678190408495, "loss": 2.8443, "step": 2872 }, { "epoch": 0.7995269071555293, "grad_norm": 2.5067646503448486, "learning_rate": 0.00013122623780449194, "loss": 3.1194, "step": 2873 }, { "epoch": 0.7998051970640415, "grad_norm": 2.8055484294891357, "learning_rate": 0.0001311846505849345, "loss": 3.0639, "step": 2874 }, { "epoch": 0.8000834869725536, "grad_norm": 2.3073270320892334, "learning_rate": 0.0001311430573901457, "loss": 2.6756, "step": 2875 }, { "epoch": 0.8003617768810658, "grad_norm": 2.37880802154541, "learning_rate": 0.00013110145822809512, "loss": 2.7545, "step": 2876 }, { "epoch": 0.800640066789578, "grad_norm": 2.1708486080169678, "learning_rate": 0.00013105985310675342, "loss": 2.6735, "step": 2877 }, { "epoch": 0.8009183566980902, "grad_norm": 2.6247198581695557, "learning_rate": 0.00013101824203409255, "loss": 2.9861, "step": 2878 }, { "epoch": 0.8011966466066024, "grad_norm": 2.6121935844421387, "learning_rate": 0.00013097662501808553, "loss": 2.8806, "step": 2879 }, { "epoch": 0.8014749365151146, "grad_norm": 2.517226457595825, "learning_rate": 0.00013093500206670647, "loss": 2.904, "step": 2880 }, { "epoch": 0.8017532264236268, "grad_norm": 2.3243541717529297, "learning_rate": 0.0001308933731879307, "loss": 2.8931, "step": 2881 }, { "epoch": 0.802031516332139, "grad_norm": 2.175309419631958, "learning_rate": 0.0001308517383897347, "loss": 2.8007, "step": 2882 }, { "epoch": 0.8023098062406512, "grad_norm": 2.624201774597168, "learning_rate": 0.0001308100976800959, "loss": 2.982, "step": 2883 }, { "epoch": 0.8025880961491634, "grad_norm": 2.8070223331451416, "learning_rate": 0.0001307684510669931, "loss": 3.1489, "step": 2884 }, { "epoch": 0.8028663860576756, "grad_norm": 2.3585872650146484, "learning_rate": 0.0001307267985584061, "loss": 2.7074, "step": 2885 }, { "epoch": 0.8031446759661878, "grad_norm": 2.3527615070343018, "learning_rate": 0.00013068514016231582, "loss": 3.1625, "step": 2886 }, { "epoch": 0.8034229658747, "grad_norm": 1.9742125272750854, "learning_rate": 0.00013064347588670442, "loss": 2.7315, "step": 2887 }, { "epoch": 0.8037012557832122, "grad_norm": 2.2947466373443604, "learning_rate": 0.0001306018057395551, "loss": 2.5298, "step": 2888 }, { "epoch": 0.8039795456917244, "grad_norm": 2.6458163261413574, "learning_rate": 0.00013056012972885214, "loss": 2.9331, "step": 2889 }, { "epoch": 0.8042578356002366, "grad_norm": 2.298686981201172, "learning_rate": 0.00013051844786258105, "loss": 2.5789, "step": 2890 }, { "epoch": 0.8045361255087488, "grad_norm": 2.630329132080078, "learning_rate": 0.00013047676014872842, "loss": 2.8828, "step": 2891 }, { "epoch": 0.804814415417261, "grad_norm": 2.21370530128479, "learning_rate": 0.0001304350665952819, "loss": 3.1229, "step": 2892 }, { "epoch": 0.8050927053257732, "grad_norm": 3.1550040245056152, "learning_rate": 0.00013039336721023035, "loss": 2.8838, "step": 2893 }, { "epoch": 0.8053709952342853, "grad_norm": 2.251923084259033, "learning_rate": 0.00013035166200156371, "loss": 2.9517, "step": 2894 }, { "epoch": 0.8056492851427975, "grad_norm": 2.8450841903686523, "learning_rate": 0.00013030995097727298, "loss": 2.8562, "step": 2895 }, { "epoch": 0.8059275750513097, "grad_norm": 2.4136178493499756, "learning_rate": 0.00013026823414535044, "loss": 2.9562, "step": 2896 }, { "epoch": 0.8062058649598219, "grad_norm": 2.5075128078460693, "learning_rate": 0.0001302265115137893, "loss": 2.6654, "step": 2897 }, { "epoch": 0.8064841548683341, "grad_norm": 2.547419309616089, "learning_rate": 0.00013018478309058395, "loss": 3.0058, "step": 2898 }, { "epoch": 0.8067624447768463, "grad_norm": 2.428328037261963, "learning_rate": 0.00013014304888372993, "loss": 2.8133, "step": 2899 }, { "epoch": 0.8070407346853584, "grad_norm": 2.653953790664673, "learning_rate": 0.00013010130890122383, "loss": 2.9479, "step": 2900 }, { "epoch": 0.8070407346853584, "eval_loss": 2.8736321926116943, "eval_runtime": 84.4393, "eval_samples_per_second": 59.214, "eval_steps_per_second": 14.804, "step": 2900 }, { "epoch": 0.8073190245938706, "grad_norm": 2.2900428771972656, "learning_rate": 0.00013005956315106335, "loss": 2.5466, "step": 2901 }, { "epoch": 0.8075973145023828, "grad_norm": 2.3615610599517822, "learning_rate": 0.00013001781164124733, "loss": 2.867, "step": 2902 }, { "epoch": 0.807875604410895, "grad_norm": 2.7149837017059326, "learning_rate": 0.0001299760543797757, "loss": 2.9569, "step": 2903 }, { "epoch": 0.8081538943194072, "grad_norm": 2.6153762340545654, "learning_rate": 0.00012993429137464949, "loss": 2.9417, "step": 2904 }, { "epoch": 0.8084321842279194, "grad_norm": 2.3927881717681885, "learning_rate": 0.00012989252263387083, "loss": 3.1241, "step": 2905 }, { "epoch": 0.8087104741364316, "grad_norm": 2.6768033504486084, "learning_rate": 0.00012985074816544297, "loss": 2.8434, "step": 2906 }, { "epoch": 0.8089887640449438, "grad_norm": 2.1067471504211426, "learning_rate": 0.00012980896797737016, "loss": 2.6729, "step": 2907 }, { "epoch": 0.809267053953456, "grad_norm": 2.4046382904052734, "learning_rate": 0.0001297671820776579, "loss": 2.9142, "step": 2908 }, { "epoch": 0.8095453438619682, "grad_norm": 2.1701953411102295, "learning_rate": 0.00012972539047431268, "loss": 2.8373, "step": 2909 }, { "epoch": 0.8098236337704804, "grad_norm": 2.529376268386841, "learning_rate": 0.00012968359317534214, "loss": 2.7294, "step": 2910 }, { "epoch": 0.8101019236789926, "grad_norm": 2.2581324577331543, "learning_rate": 0.0001296417901887549, "loss": 2.6804, "step": 2911 }, { "epoch": 0.8103802135875048, "grad_norm": 2.2644565105438232, "learning_rate": 0.00012959998152256086, "loss": 2.7853, "step": 2912 }, { "epoch": 0.810658503496017, "grad_norm": 2.2355926036834717, "learning_rate": 0.0001295581671847708, "loss": 3.0193, "step": 2913 }, { "epoch": 0.8109367934045292, "grad_norm": 2.5335633754730225, "learning_rate": 0.00012951634718339674, "loss": 2.8586, "step": 2914 }, { "epoch": 0.8112150833130414, "grad_norm": 2.048173666000366, "learning_rate": 0.00012947452152645172, "loss": 2.6922, "step": 2915 }, { "epoch": 0.8114933732215536, "grad_norm": 2.717226266860962, "learning_rate": 0.00012943269022194987, "loss": 2.8505, "step": 2916 }, { "epoch": 0.8117716631300658, "grad_norm": 2.7248053550720215, "learning_rate": 0.0001293908532779064, "loss": 3.0582, "step": 2917 }, { "epoch": 0.812049953038578, "grad_norm": 2.447498321533203, "learning_rate": 0.00012934901070233763, "loss": 2.8927, "step": 2918 }, { "epoch": 0.8123282429470902, "grad_norm": 2.317617893218994, "learning_rate": 0.0001293071625032609, "loss": 2.9816, "step": 2919 }, { "epoch": 0.8126065328556024, "grad_norm": 2.3242099285125732, "learning_rate": 0.0001292653086886947, "loss": 2.687, "step": 2920 }, { "epoch": 0.8128848227641146, "grad_norm": 2.8131978511810303, "learning_rate": 0.00012922344926665855, "loss": 3.0635, "step": 2921 }, { "epoch": 0.8131631126726268, "grad_norm": 2.195563793182373, "learning_rate": 0.00012918158424517304, "loss": 2.9898, "step": 2922 }, { "epoch": 0.813441402581139, "grad_norm": 2.259996175765991, "learning_rate": 0.00012913971363225985, "loss": 2.7602, "step": 2923 }, { "epoch": 0.8137196924896511, "grad_norm": 2.5215206146240234, "learning_rate": 0.00012909783743594175, "loss": 2.9604, "step": 2924 }, { "epoch": 0.8139979823981632, "grad_norm": 2.2148098945617676, "learning_rate": 0.00012905595566424253, "loss": 2.5609, "step": 2925 }, { "epoch": 0.8142762723066754, "grad_norm": 2.494105339050293, "learning_rate": 0.00012901406832518708, "loss": 2.9695, "step": 2926 }, { "epoch": 0.8145545622151876, "grad_norm": 3.6215951442718506, "learning_rate": 0.00012897217542680132, "loss": 3.0711, "step": 2927 }, { "epoch": 0.8148328521236998, "grad_norm": 2.013169765472412, "learning_rate": 0.00012893027697711236, "loss": 2.6423, "step": 2928 }, { "epoch": 0.815111142032212, "grad_norm": 2.0806689262390137, "learning_rate": 0.00012888837298414822, "loss": 2.6742, "step": 2929 }, { "epoch": 0.8153894319407242, "grad_norm": 3.037644624710083, "learning_rate": 0.0001288464634559381, "loss": 2.9785, "step": 2930 }, { "epoch": 0.8156677218492364, "grad_norm": 2.2214303016662598, "learning_rate": 0.00012880454840051212, "loss": 2.912, "step": 2931 }, { "epoch": 0.8159460117577486, "grad_norm": 2.9687399864196777, "learning_rate": 0.00012876262782590155, "loss": 3.006, "step": 2932 }, { "epoch": 0.8162243016662608, "grad_norm": 2.1185553073883057, "learning_rate": 0.0001287207017401388, "loss": 2.6617, "step": 2933 }, { "epoch": 0.816502591574773, "grad_norm": 2.43449330329895, "learning_rate": 0.00012867877015125718, "loss": 2.7363, "step": 2934 }, { "epoch": 0.8167808814832852, "grad_norm": 2.3914458751678467, "learning_rate": 0.0001286368330672911, "loss": 2.7851, "step": 2935 }, { "epoch": 0.8170591713917974, "grad_norm": 2.459439992904663, "learning_rate": 0.0001285948904962761, "loss": 2.7994, "step": 2936 }, { "epoch": 0.8173374613003096, "grad_norm": 2.418583869934082, "learning_rate": 0.0001285529424462487, "loss": 2.8681, "step": 2937 }, { "epoch": 0.8176157512088218, "grad_norm": 2.4265248775482178, "learning_rate": 0.0001285109889252465, "loss": 2.8279, "step": 2938 }, { "epoch": 0.817894041117334, "grad_norm": 2.392277479171753, "learning_rate": 0.0001284690299413081, "loss": 3.0317, "step": 2939 }, { "epoch": 0.8181723310258462, "grad_norm": 2.35749888420105, "learning_rate": 0.00012842706550247318, "loss": 2.8092, "step": 2940 }, { "epoch": 0.8184506209343584, "grad_norm": 2.120776891708374, "learning_rate": 0.00012838509561678246, "loss": 2.8071, "step": 2941 }, { "epoch": 0.8187289108428706, "grad_norm": 2.6026172637939453, "learning_rate": 0.0001283431202922778, "loss": 2.9375, "step": 2942 }, { "epoch": 0.8190072007513828, "grad_norm": 2.3582749366760254, "learning_rate": 0.00012830113953700192, "loss": 2.7249, "step": 2943 }, { "epoch": 0.819285490659895, "grad_norm": 2.6513803005218506, "learning_rate": 0.00012825915335899866, "loss": 3.0857, "step": 2944 }, { "epoch": 0.8195637805684072, "grad_norm": 2.2872297763824463, "learning_rate": 0.00012821716176631296, "loss": 2.8472, "step": 2945 }, { "epoch": 0.8198420704769194, "grad_norm": 2.4487850666046143, "learning_rate": 0.00012817516476699075, "loss": 2.7221, "step": 2946 }, { "epoch": 0.8201203603854316, "grad_norm": 2.5003488063812256, "learning_rate": 0.00012813316236907896, "loss": 3.02, "step": 2947 }, { "epoch": 0.8203986502939438, "grad_norm": 2.4596872329711914, "learning_rate": 0.0001280911545806256, "loss": 2.8776, "step": 2948 }, { "epoch": 0.820676940202456, "grad_norm": 2.368199348449707, "learning_rate": 0.00012804914140967974, "loss": 2.977, "step": 2949 }, { "epoch": 0.820955230110968, "grad_norm": 2.285266876220703, "learning_rate": 0.00012800712286429144, "loss": 2.9114, "step": 2950 }, { "epoch": 0.8212335200194802, "grad_norm": 2.3975143432617188, "learning_rate": 0.00012796509895251172, "loss": 2.8164, "step": 2951 }, { "epoch": 0.8215118099279924, "grad_norm": 2.1443240642547607, "learning_rate": 0.00012792306968239278, "loss": 2.7754, "step": 2952 }, { "epoch": 0.8217900998365046, "grad_norm": 2.710097312927246, "learning_rate": 0.00012788103506198775, "loss": 3.0806, "step": 2953 }, { "epoch": 0.8220683897450168, "grad_norm": 2.537108898162842, "learning_rate": 0.00012783899509935073, "loss": 2.9445, "step": 2954 }, { "epoch": 0.822346679653529, "grad_norm": 2.291429281234741, "learning_rate": 0.00012779694980253706, "loss": 2.9015, "step": 2955 }, { "epoch": 0.8226249695620412, "grad_norm": 2.1255440711975098, "learning_rate": 0.00012775489917960284, "loss": 2.684, "step": 2956 }, { "epoch": 0.8229032594705534, "grad_norm": 2.423882484436035, "learning_rate": 0.00012771284323860534, "loss": 2.7119, "step": 2957 }, { "epoch": 0.8231815493790656, "grad_norm": 2.761375665664673, "learning_rate": 0.00012767078198760282, "loss": 2.9379, "step": 2958 }, { "epoch": 0.8234598392875778, "grad_norm": 2.477085828781128, "learning_rate": 0.00012762871543465457, "loss": 2.9351, "step": 2959 }, { "epoch": 0.82373812919609, "grad_norm": 2.2762138843536377, "learning_rate": 0.00012758664358782088, "loss": 2.8119, "step": 2960 }, { "epoch": 0.8240164191046022, "grad_norm": 2.1999428272247314, "learning_rate": 0.00012754456645516307, "loss": 2.8722, "step": 2961 }, { "epoch": 0.8242947090131144, "grad_norm": 2.1070680618286133, "learning_rate": 0.00012750248404474343, "loss": 2.8316, "step": 2962 }, { "epoch": 0.8245729989216266, "grad_norm": 2.180941104888916, "learning_rate": 0.0001274603963646253, "loss": 2.9451, "step": 2963 }, { "epoch": 0.8248512888301388, "grad_norm": 2.32004451751709, "learning_rate": 0.000127418303422873, "loss": 2.9099, "step": 2964 }, { "epoch": 0.825129578738651, "grad_norm": 2.339617967605591, "learning_rate": 0.00012737620522755192, "loss": 2.5298, "step": 2965 }, { "epoch": 0.8254078686471632, "grad_norm": 2.1367099285125732, "learning_rate": 0.00012733410178672835, "loss": 2.857, "step": 2966 }, { "epoch": 0.8256861585556754, "grad_norm": 2.4242475032806396, "learning_rate": 0.00012729199310846975, "loss": 2.8728, "step": 2967 }, { "epoch": 0.8259644484641876, "grad_norm": 2.361701488494873, "learning_rate": 0.00012724987920084439, "loss": 2.9321, "step": 2968 }, { "epoch": 0.8262427383726998, "grad_norm": 2.600550889968872, "learning_rate": 0.0001272077600719217, "loss": 3.0485, "step": 2969 }, { "epoch": 0.826521028281212, "grad_norm": 2.272958278656006, "learning_rate": 0.00012716563572977199, "loss": 2.7445, "step": 2970 }, { "epoch": 0.8267993181897242, "grad_norm": 2.5893428325653076, "learning_rate": 0.00012712350618246667, "loss": 2.9214, "step": 2971 }, { "epoch": 0.8270776080982364, "grad_norm": 2.702472686767578, "learning_rate": 0.0001270813714380781, "loss": 3.0315, "step": 2972 }, { "epoch": 0.8273558980067486, "grad_norm": 2.2046597003936768, "learning_rate": 0.0001270392315046796, "loss": 2.9025, "step": 2973 }, { "epoch": 0.8276341879152608, "grad_norm": 2.4686124324798584, "learning_rate": 0.00012699708639034556, "loss": 3.0387, "step": 2974 }, { "epoch": 0.8279124778237729, "grad_norm": 2.0094878673553467, "learning_rate": 0.0001269549361031513, "loss": 2.56, "step": 2975 }, { "epoch": 0.8281907677322851, "grad_norm": 2.3929848670959473, "learning_rate": 0.00012691278065117315, "loss": 2.8399, "step": 2976 }, { "epoch": 0.8284690576407973, "grad_norm": 2.571901559829712, "learning_rate": 0.0001268706200424885, "loss": 2.6875, "step": 2977 }, { "epoch": 0.8287473475493095, "grad_norm": 2.186870813369751, "learning_rate": 0.0001268284542851756, "loss": 2.4539, "step": 2978 }, { "epoch": 0.8290256374578217, "grad_norm": 2.2831342220306396, "learning_rate": 0.00012678628338731374, "loss": 2.7489, "step": 2979 }, { "epoch": 0.8293039273663338, "grad_norm": 2.0514700412750244, "learning_rate": 0.00012674410735698325, "loss": 2.907, "step": 2980 }, { "epoch": 0.829582217274846, "grad_norm": 2.27148175239563, "learning_rate": 0.0001267019262022654, "loss": 2.7047, "step": 2981 }, { "epoch": 0.8298605071833582, "grad_norm": 2.184298276901245, "learning_rate": 0.00012665973993124236, "loss": 2.8486, "step": 2982 }, { "epoch": 0.8301387970918704, "grad_norm": 2.008800506591797, "learning_rate": 0.00012661754855199748, "loss": 2.6626, "step": 2983 }, { "epoch": 0.8304170870003826, "grad_norm": 2.3796048164367676, "learning_rate": 0.00012657535207261488, "loss": 3.0021, "step": 2984 }, { "epoch": 0.8306953769088948, "grad_norm": 2.356804370880127, "learning_rate": 0.00012653315050117979, "loss": 2.8353, "step": 2985 }, { "epoch": 0.830973666817407, "grad_norm": 2.007028579711914, "learning_rate": 0.00012649094384577838, "loss": 2.7404, "step": 2986 }, { "epoch": 0.8312519567259192, "grad_norm": 2.1832668781280518, "learning_rate": 0.00012644873211449772, "loss": 2.816, "step": 2987 }, { "epoch": 0.8315302466344314, "grad_norm": 2.8127596378326416, "learning_rate": 0.000126406515315426, "loss": 2.9661, "step": 2988 }, { "epoch": 0.8318085365429436, "grad_norm": 2.3283605575561523, "learning_rate": 0.00012636429345665227, "loss": 2.8493, "step": 2989 }, { "epoch": 0.8320868264514558, "grad_norm": 2.7549166679382324, "learning_rate": 0.00012632206654626657, "loss": 2.9809, "step": 2990 }, { "epoch": 0.832365116359968, "grad_norm": 2.3044545650482178, "learning_rate": 0.00012627983459235993, "loss": 2.7633, "step": 2991 }, { "epoch": 0.8326434062684802, "grad_norm": 2.1750566959381104, "learning_rate": 0.00012623759760302432, "loss": 2.8281, "step": 2992 }, { "epoch": 0.8329216961769924, "grad_norm": 2.454819440841675, "learning_rate": 0.00012619535558635271, "loss": 2.7568, "step": 2993 }, { "epoch": 0.8331999860855046, "grad_norm": 2.400097370147705, "learning_rate": 0.00012615310855043898, "loss": 3.0477, "step": 2994 }, { "epoch": 0.8334782759940168, "grad_norm": 2.1538872718811035, "learning_rate": 0.00012611085650337804, "loss": 2.7391, "step": 2995 }, { "epoch": 0.833756565902529, "grad_norm": 2.225940227508545, "learning_rate": 0.0001260685994532657, "loss": 2.6355, "step": 2996 }, { "epoch": 0.8340348558110412, "grad_norm": 2.35384464263916, "learning_rate": 0.00012602633740819877, "loss": 3.1432, "step": 2997 }, { "epoch": 0.8343131457195534, "grad_norm": 2.284543514251709, "learning_rate": 0.000125984070376275, "loss": 2.6465, "step": 2998 }, { "epoch": 0.8345914356280656, "grad_norm": 2.5718183517456055, "learning_rate": 0.00012594179836559308, "loss": 2.951, "step": 2999 }, { "epoch": 0.8348697255365777, "grad_norm": 2.0782413482666016, "learning_rate": 0.00012589952138425265, "loss": 2.8698, "step": 3000 }, { "epoch": 0.8348697255365777, "eval_loss": 2.8747780323028564, "eval_runtime": 84.954, "eval_samples_per_second": 58.855, "eval_steps_per_second": 14.714, "step": 3000 }, { "epoch": 0.8351480154450899, "grad_norm": 2.5280380249023438, "learning_rate": 0.00012585723944035437, "loss": 2.8799, "step": 3001 }, { "epoch": 0.8354263053536021, "grad_norm": 2.53306245803833, "learning_rate": 0.00012581495254199978, "loss": 2.9175, "step": 3002 }, { "epoch": 0.8357045952621143, "grad_norm": 2.2948436737060547, "learning_rate": 0.00012577266069729138, "loss": 2.8815, "step": 3003 }, { "epoch": 0.8359828851706265, "grad_norm": 2.2829959392547607, "learning_rate": 0.00012573036391433267, "loss": 2.584, "step": 3004 }, { "epoch": 0.8362611750791387, "grad_norm": 2.560807228088379, "learning_rate": 0.000125688062201228, "loss": 2.914, "step": 3005 }, { "epoch": 0.8365394649876509, "grad_norm": 2.3947746753692627, "learning_rate": 0.00012564575556608273, "loss": 3.066, "step": 3006 }, { "epoch": 0.8368177548961631, "grad_norm": 2.0135371685028076, "learning_rate": 0.00012560344401700322, "loss": 2.4776, "step": 3007 }, { "epoch": 0.8370960448046753, "grad_norm": 2.0847644805908203, "learning_rate": 0.0001255611275620966, "loss": 2.6528, "step": 3008 }, { "epoch": 0.8373743347131875, "grad_norm": 2.1330463886260986, "learning_rate": 0.00012551880620947114, "loss": 2.8387, "step": 3009 }, { "epoch": 0.8376526246216996, "grad_norm": 2.124899387359619, "learning_rate": 0.0001254764799672359, "loss": 2.6317, "step": 3010 }, { "epoch": 0.8379309145302118, "grad_norm": 2.4356443881988525, "learning_rate": 0.00012543414884350095, "loss": 2.7197, "step": 3011 }, { "epoch": 0.838209204438724, "grad_norm": 2.2458689212799072, "learning_rate": 0.0001253918128463773, "loss": 2.8825, "step": 3012 }, { "epoch": 0.8384874943472362, "grad_norm": 2.427018642425537, "learning_rate": 0.00012534947198397677, "loss": 2.7709, "step": 3013 }, { "epoch": 0.8387657842557484, "grad_norm": 2.500670909881592, "learning_rate": 0.00012530712626441235, "loss": 2.8101, "step": 3014 }, { "epoch": 0.8390440741642606, "grad_norm": 2.543105125427246, "learning_rate": 0.00012526477569579773, "loss": 2.8782, "step": 3015 }, { "epoch": 0.8393223640727728, "grad_norm": 2.4876961708068848, "learning_rate": 0.00012522242028624765, "loss": 3.1554, "step": 3016 }, { "epoch": 0.839600653981285, "grad_norm": 2.281028985977173, "learning_rate": 0.00012518006004387777, "loss": 2.7307, "step": 3017 }, { "epoch": 0.8398789438897972, "grad_norm": 2.2541873455047607, "learning_rate": 0.0001251376949768046, "loss": 2.7594, "step": 3018 }, { "epoch": 0.8401572337983094, "grad_norm": 2.364515542984009, "learning_rate": 0.00012509532509314566, "loss": 2.6418, "step": 3019 }, { "epoch": 0.8404355237068216, "grad_norm": 2.379120349884033, "learning_rate": 0.0001250529504010194, "loss": 3.0197, "step": 3020 }, { "epoch": 0.8407138136153338, "grad_norm": 2.1797425746917725, "learning_rate": 0.0001250105709085451, "loss": 2.8096, "step": 3021 }, { "epoch": 0.840992103523846, "grad_norm": 2.3542373180389404, "learning_rate": 0.000124968186623843, "loss": 2.9486, "step": 3022 }, { "epoch": 0.8412703934323582, "grad_norm": 2.3237035274505615, "learning_rate": 0.00012492579755503432, "loss": 2.6808, "step": 3023 }, { "epoch": 0.8415486833408704, "grad_norm": 2.626868963241577, "learning_rate": 0.00012488340371024116, "loss": 3.142, "step": 3024 }, { "epoch": 0.8418269732493825, "grad_norm": 2.4676411151885986, "learning_rate": 0.00012484100509758646, "loss": 2.7237, "step": 3025 }, { "epoch": 0.8421052631578947, "grad_norm": 5.243633270263672, "learning_rate": 0.0001247986017251942, "loss": 2.8905, "step": 3026 }, { "epoch": 0.8423835530664069, "grad_norm": 2.4676711559295654, "learning_rate": 0.00012475619360118918, "loss": 2.7752, "step": 3027 }, { "epoch": 0.8426618429749191, "grad_norm": 2.252593994140625, "learning_rate": 0.00012471378073369708, "loss": 2.8668, "step": 3028 }, { "epoch": 0.8429401328834313, "grad_norm": 2.475684642791748, "learning_rate": 0.00012467136313084468, "loss": 3.1256, "step": 3029 }, { "epoch": 0.8432184227919435, "grad_norm": 2.1612749099731445, "learning_rate": 0.00012462894080075945, "loss": 2.5412, "step": 3030 }, { "epoch": 0.8434967127004557, "grad_norm": 2.1267168521881104, "learning_rate": 0.00012458651375156987, "loss": 2.8114, "step": 3031 }, { "epoch": 0.8437750026089679, "grad_norm": 2.4037249088287354, "learning_rate": 0.00012454408199140532, "loss": 3.0696, "step": 3032 }, { "epoch": 0.8440532925174801, "grad_norm": 2.226177930831909, "learning_rate": 0.00012450164552839602, "loss": 2.821, "step": 3033 }, { "epoch": 0.8443315824259923, "grad_norm": 2.310986042022705, "learning_rate": 0.00012445920437067318, "loss": 2.5467, "step": 3034 }, { "epoch": 0.8446098723345045, "grad_norm": 2.371478319168091, "learning_rate": 0.00012441675852636888, "loss": 2.7967, "step": 3035 }, { "epoch": 0.8448881622430167, "grad_norm": 2.1629040241241455, "learning_rate": 0.00012437430800361606, "loss": 2.5896, "step": 3036 }, { "epoch": 0.8451664521515289, "grad_norm": 2.6354029178619385, "learning_rate": 0.00012433185281054856, "loss": 3.0851, "step": 3037 }, { "epoch": 0.845444742060041, "grad_norm": 2.9918041229248047, "learning_rate": 0.00012428939295530122, "loss": 2.9653, "step": 3038 }, { "epoch": 0.8457230319685533, "grad_norm": 2.479963541030884, "learning_rate": 0.00012424692844600964, "loss": 2.8725, "step": 3039 }, { "epoch": 0.8460013218770654, "grad_norm": 2.1917507648468018, "learning_rate": 0.00012420445929081032, "loss": 2.8818, "step": 3040 }, { "epoch": 0.8462796117855776, "grad_norm": 3.7117621898651123, "learning_rate": 0.0001241619854978408, "loss": 3.251, "step": 3041 }, { "epoch": 0.8465579016940898, "grad_norm": 2.224148988723755, "learning_rate": 0.0001241195070752393, "loss": 2.8597, "step": 3042 }, { "epoch": 0.846836191602602, "grad_norm": 2.2419686317443848, "learning_rate": 0.0001240770240311451, "loss": 2.9144, "step": 3043 }, { "epoch": 0.8471144815111142, "grad_norm": 2.3509795665740967, "learning_rate": 0.00012403453637369825, "loss": 2.7837, "step": 3044 }, { "epoch": 0.8473927714196264, "grad_norm": 2.326627016067505, "learning_rate": 0.00012399204411103976, "loss": 2.6767, "step": 3045 }, { "epoch": 0.8476710613281386, "grad_norm": 2.73604679107666, "learning_rate": 0.00012394954725131145, "loss": 2.8134, "step": 3046 }, { "epoch": 0.8479493512366508, "grad_norm": 2.317373752593994, "learning_rate": 0.0001239070458026561, "loss": 3.1149, "step": 3047 }, { "epoch": 0.848227641145163, "grad_norm": 2.0434160232543945, "learning_rate": 0.0001238645397732173, "loss": 2.6894, "step": 3048 }, { "epoch": 0.8485059310536751, "grad_norm": 2.5108156204223633, "learning_rate": 0.00012382202917113965, "loss": 2.9498, "step": 3049 }, { "epoch": 0.8487842209621873, "grad_norm": 2.167247772216797, "learning_rate": 0.00012377951400456837, "loss": 2.8796, "step": 3050 }, { "epoch": 0.8490625108706995, "grad_norm": 2.4433934688568115, "learning_rate": 0.00012373699428164982, "loss": 2.7639, "step": 3051 }, { "epoch": 0.8493408007792117, "grad_norm": 2.1768581867218018, "learning_rate": 0.0001236944700105311, "loss": 2.7802, "step": 3052 }, { "epoch": 0.8496190906877239, "grad_norm": 2.239447832107544, "learning_rate": 0.00012365194119936013, "loss": 2.9012, "step": 3053 }, { "epoch": 0.8498973805962361, "grad_norm": 2.254333257675171, "learning_rate": 0.0001236094078562859, "loss": 2.9264, "step": 3054 }, { "epoch": 0.8501756705047483, "grad_norm": 2.3298230171203613, "learning_rate": 0.00012356686998945808, "loss": 2.7002, "step": 3055 }, { "epoch": 0.8504539604132605, "grad_norm": 1.9509849548339844, "learning_rate": 0.00012352432760702725, "loss": 2.6613, "step": 3056 }, { "epoch": 0.8507322503217727, "grad_norm": 2.193026542663574, "learning_rate": 0.0001234817807171449, "loss": 2.9023, "step": 3057 }, { "epoch": 0.8510105402302849, "grad_norm": 2.4085371494293213, "learning_rate": 0.00012343922932796334, "loss": 2.8324, "step": 3058 }, { "epoch": 0.8512888301387971, "grad_norm": 2.1137948036193848, "learning_rate": 0.00012339667344763577, "loss": 2.6326, "step": 3059 }, { "epoch": 0.8515671200473093, "grad_norm": 2.7694218158721924, "learning_rate": 0.00012335411308431627, "loss": 2.8986, "step": 3060 }, { "epoch": 0.8518454099558215, "grad_norm": 2.226592540740967, "learning_rate": 0.0001233115482461597, "loss": 2.7708, "step": 3061 }, { "epoch": 0.8521236998643337, "grad_norm": 2.423302412033081, "learning_rate": 0.00012326897894132188, "loss": 2.7265, "step": 3062 }, { "epoch": 0.8524019897728459, "grad_norm": 2.834028720855713, "learning_rate": 0.00012322640517795937, "loss": 2.9336, "step": 3063 }, { "epoch": 0.8526802796813581, "grad_norm": 2.332402229309082, "learning_rate": 0.00012318382696422972, "loss": 2.9294, "step": 3064 }, { "epoch": 0.8529585695898703, "grad_norm": 2.6458802223205566, "learning_rate": 0.0001231412443082912, "loss": 2.8882, "step": 3065 }, { "epoch": 0.8532368594983825, "grad_norm": 2.6019880771636963, "learning_rate": 0.00012309865721830303, "loss": 2.7972, "step": 3066 }, { "epoch": 0.8535151494068947, "grad_norm": 2.33577299118042, "learning_rate": 0.0001230560657024252, "loss": 2.876, "step": 3067 }, { "epoch": 0.8537934393154069, "grad_norm": 1.9822306632995605, "learning_rate": 0.0001230134697688186, "loss": 2.6352, "step": 3068 }, { "epoch": 0.854071729223919, "grad_norm": 2.3735594749450684, "learning_rate": 0.00012297086942564503, "loss": 2.5594, "step": 3069 }, { "epoch": 0.8543500191324312, "grad_norm": 2.358985662460327, "learning_rate": 0.00012292826468106695, "loss": 3.0717, "step": 3070 }, { "epoch": 0.8546283090409434, "grad_norm": 2.745330810546875, "learning_rate": 0.00012288565554324785, "loss": 2.8932, "step": 3071 }, { "epoch": 0.8549065989494556, "grad_norm": 2.266486406326294, "learning_rate": 0.00012284304202035196, "loss": 2.7055, "step": 3072 }, { "epoch": 0.8551848888579678, "grad_norm": 2.645406723022461, "learning_rate": 0.00012280042412054438, "loss": 2.9937, "step": 3073 }, { "epoch": 0.8554631787664799, "grad_norm": 2.43501353263855, "learning_rate": 0.000122757801851991, "loss": 2.7979, "step": 3074 }, { "epoch": 0.8557414686749921, "grad_norm": 2.5453896522521973, "learning_rate": 0.00012271517522285868, "loss": 2.871, "step": 3075 }, { "epoch": 0.8560197585835043, "grad_norm": 2.2425076961517334, "learning_rate": 0.00012267254424131496, "loss": 2.9193, "step": 3076 }, { "epoch": 0.8562980484920165, "grad_norm": 2.391697406768799, "learning_rate": 0.00012262990891552829, "loss": 2.9995, "step": 3077 }, { "epoch": 0.8565763384005287, "grad_norm": 2.24037504196167, "learning_rate": 0.00012258726925366796, "loss": 2.8981, "step": 3078 }, { "epoch": 0.8568546283090409, "grad_norm": 2.24367618560791, "learning_rate": 0.00012254462526390409, "loss": 2.6902, "step": 3079 }, { "epoch": 0.8571329182175531, "grad_norm": 2.8459410667419434, "learning_rate": 0.00012250197695440757, "loss": 2.9446, "step": 3080 }, { "epoch": 0.8574112081260653, "grad_norm": 2.2806174755096436, "learning_rate": 0.0001224593243333502, "loss": 2.8494, "step": 3081 }, { "epoch": 0.8576894980345775, "grad_norm": 2.1955888271331787, "learning_rate": 0.00012241666740890454, "loss": 2.948, "step": 3082 }, { "epoch": 0.8579677879430897, "grad_norm": 2.1594016551971436, "learning_rate": 0.00012237400618924396, "loss": 2.8282, "step": 3083 }, { "epoch": 0.8582460778516019, "grad_norm": 2.1758382320404053, "learning_rate": 0.00012233134068254282, "loss": 2.8879, "step": 3084 }, { "epoch": 0.8585243677601141, "grad_norm": 2.64624285697937, "learning_rate": 0.0001222886708969761, "loss": 3.1521, "step": 3085 }, { "epoch": 0.8588026576686263, "grad_norm": 2.334070920944214, "learning_rate": 0.00012224599684071964, "loss": 2.9791, "step": 3086 }, { "epoch": 0.8590809475771385, "grad_norm": 2.7285430431365967, "learning_rate": 0.00012220331852195017, "loss": 2.8089, "step": 3087 }, { "epoch": 0.8593592374856507, "grad_norm": 2.3747029304504395, "learning_rate": 0.00012216063594884522, "loss": 2.8639, "step": 3088 }, { "epoch": 0.8596375273941629, "grad_norm": 2.80376935005188, "learning_rate": 0.00012211794912958312, "loss": 3.077, "step": 3089 }, { "epoch": 0.8599158173026751, "grad_norm": 2.9673192501068115, "learning_rate": 0.00012207525807234295, "loss": 2.8195, "step": 3090 }, { "epoch": 0.8601941072111873, "grad_norm": 2.3636767864227295, "learning_rate": 0.00012203256278530473, "loss": 2.758, "step": 3091 }, { "epoch": 0.8604723971196995, "grad_norm": 2.311436891555786, "learning_rate": 0.00012198986327664921, "loss": 2.6994, "step": 3092 }, { "epoch": 0.8607506870282117, "grad_norm": 2.3035452365875244, "learning_rate": 0.00012194715955455791, "loss": 2.8457, "step": 3093 }, { "epoch": 0.8610289769367239, "grad_norm": 2.2746169567108154, "learning_rate": 0.00012190445162721329, "loss": 2.8317, "step": 3094 }, { "epoch": 0.8613072668452361, "grad_norm": 2.390657424926758, "learning_rate": 0.00012186173950279847, "loss": 2.9185, "step": 3095 }, { "epoch": 0.8615855567537483, "grad_norm": 2.3525004386901855, "learning_rate": 0.00012181902318949742, "loss": 3.1317, "step": 3096 }, { "epoch": 0.8618638466622605, "grad_norm": 2.29316782951355, "learning_rate": 0.00012177630269549502, "loss": 2.621, "step": 3097 }, { "epoch": 0.8621421365707727, "grad_norm": 2.696976900100708, "learning_rate": 0.00012173357802897682, "loss": 2.8491, "step": 3098 }, { "epoch": 0.8624204264792847, "grad_norm": 2.0916619300842285, "learning_rate": 0.00012169084919812917, "loss": 2.7137, "step": 3099 }, { "epoch": 0.8626987163877969, "grad_norm": 2.3254196643829346, "learning_rate": 0.00012164811621113932, "loss": 2.7183, "step": 3100 }, { "epoch": 0.8626987163877969, "eval_loss": 2.870107412338257, "eval_runtime": 84.767, "eval_samples_per_second": 58.985, "eval_steps_per_second": 14.746, "step": 3100 }, { "epoch": 0.8629770062963091, "grad_norm": 2.399704933166504, "learning_rate": 0.0001216053790761952, "loss": 2.6526, "step": 3101 }, { "epoch": 0.8632552962048213, "grad_norm": 2.1048245429992676, "learning_rate": 0.00012156263780148563, "loss": 2.8232, "step": 3102 }, { "epoch": 0.8635335861133335, "grad_norm": 2.764892578125, "learning_rate": 0.00012151989239520019, "loss": 3.3538, "step": 3103 }, { "epoch": 0.8638118760218457, "grad_norm": 2.692272186279297, "learning_rate": 0.00012147714286552922, "loss": 3.0572, "step": 3104 }, { "epoch": 0.8640901659303579, "grad_norm": 2.6705734729766846, "learning_rate": 0.00012143438922066388, "loss": 2.998, "step": 3105 }, { "epoch": 0.8643684558388701, "grad_norm": 2.8965930938720703, "learning_rate": 0.00012139163146879615, "loss": 3.0505, "step": 3106 }, { "epoch": 0.8646467457473823, "grad_norm": 2.0954883098602295, "learning_rate": 0.00012134886961811872, "loss": 2.7269, "step": 3107 }, { "epoch": 0.8649250356558945, "grad_norm": 2.547647476196289, "learning_rate": 0.00012130610367682507, "loss": 2.955, "step": 3108 }, { "epoch": 0.8652033255644067, "grad_norm": 2.4457592964172363, "learning_rate": 0.0001212633336531096, "loss": 2.8715, "step": 3109 }, { "epoch": 0.8654816154729189, "grad_norm": 2.582653284072876, "learning_rate": 0.00012122055955516733, "loss": 2.8914, "step": 3110 }, { "epoch": 0.8657599053814311, "grad_norm": 2.279489517211914, "learning_rate": 0.00012117778139119414, "loss": 2.9532, "step": 3111 }, { "epoch": 0.8660381952899433, "grad_norm": 2.2489049434661865, "learning_rate": 0.00012113499916938667, "loss": 2.8506, "step": 3112 }, { "epoch": 0.8663164851984555, "grad_norm": 2.4785749912261963, "learning_rate": 0.00012109221289794233, "loss": 3.1786, "step": 3113 }, { "epoch": 0.8665947751069677, "grad_norm": 2.216130495071411, "learning_rate": 0.00012104942258505931, "loss": 2.5144, "step": 3114 }, { "epoch": 0.8668730650154799, "grad_norm": 2.232760190963745, "learning_rate": 0.00012100662823893662, "loss": 2.8763, "step": 3115 }, { "epoch": 0.8671513549239921, "grad_norm": 2.318591833114624, "learning_rate": 0.00012096382986777397, "loss": 2.9187, "step": 3116 }, { "epoch": 0.8674296448325043, "grad_norm": 2.0994932651519775, "learning_rate": 0.00012092102747977188, "loss": 2.6135, "step": 3117 }, { "epoch": 0.8677079347410165, "grad_norm": 2.304403305053711, "learning_rate": 0.00012087822108313162, "loss": 2.7303, "step": 3118 }, { "epoch": 0.8679862246495287, "grad_norm": 2.393878936767578, "learning_rate": 0.00012083541068605529, "loss": 2.9251, "step": 3119 }, { "epoch": 0.8682645145580409, "grad_norm": 2.2377734184265137, "learning_rate": 0.00012079259629674564, "loss": 2.9143, "step": 3120 }, { "epoch": 0.8685428044665531, "grad_norm": 2.3184397220611572, "learning_rate": 0.0001207497779234063, "loss": 2.6959, "step": 3121 }, { "epoch": 0.8688210943750653, "grad_norm": 3.0270893573760986, "learning_rate": 0.00012070695557424161, "loss": 3.0684, "step": 3122 }, { "epoch": 0.8690993842835775, "grad_norm": 2.3763606548309326, "learning_rate": 0.00012066412925745665, "loss": 2.7172, "step": 3123 }, { "epoch": 0.8693776741920896, "grad_norm": 2.301104784011841, "learning_rate": 0.00012062129898125736, "loss": 2.9164, "step": 3124 }, { "epoch": 0.8696559641006018, "grad_norm": 2.2643089294433594, "learning_rate": 0.00012057846475385028, "loss": 2.6864, "step": 3125 }, { "epoch": 0.869934254009114, "grad_norm": 2.223737955093384, "learning_rate": 0.00012053562658344284, "loss": 2.8567, "step": 3126 }, { "epoch": 0.8702125439176261, "grad_norm": 2.0737998485565186, "learning_rate": 0.00012049278447824319, "loss": 2.4913, "step": 3127 }, { "epoch": 0.8704908338261383, "grad_norm": 2.399003267288208, "learning_rate": 0.00012044993844646017, "loss": 2.7724, "step": 3128 }, { "epoch": 0.8707691237346505, "grad_norm": 2.3014771938323975, "learning_rate": 0.00012040708849630351, "loss": 2.7126, "step": 3129 }, { "epoch": 0.8710474136431627, "grad_norm": 2.372558832168579, "learning_rate": 0.00012036423463598354, "loss": 2.9925, "step": 3130 }, { "epoch": 0.8713257035516749, "grad_norm": 2.1995272636413574, "learning_rate": 0.00012032137687371145, "loss": 2.7987, "step": 3131 }, { "epoch": 0.8716039934601871, "grad_norm": 2.2565462589263916, "learning_rate": 0.0001202785152176991, "loss": 2.9212, "step": 3132 }, { "epoch": 0.8718822833686993, "grad_norm": 2.4183125495910645, "learning_rate": 0.00012023564967615915, "loss": 2.6801, "step": 3133 }, { "epoch": 0.8721605732772115, "grad_norm": 2.2728066444396973, "learning_rate": 0.00012019278025730496, "loss": 3.0712, "step": 3134 }, { "epoch": 0.8724388631857237, "grad_norm": 2.693830728530884, "learning_rate": 0.00012014990696935069, "loss": 2.8051, "step": 3135 }, { "epoch": 0.8727171530942359, "grad_norm": 2.34587025642395, "learning_rate": 0.00012010702982051118, "loss": 2.8794, "step": 3136 }, { "epoch": 0.8729954430027481, "grad_norm": 2.4217886924743652, "learning_rate": 0.00012006414881900208, "loss": 2.8431, "step": 3137 }, { "epoch": 0.8732737329112603, "grad_norm": 2.3363759517669678, "learning_rate": 0.0001200212639730397, "loss": 2.951, "step": 3138 }, { "epoch": 0.8735520228197725, "grad_norm": 2.6384646892547607, "learning_rate": 0.00011997837529084111, "loss": 2.9855, "step": 3139 }, { "epoch": 0.8738303127282847, "grad_norm": 2.448763370513916, "learning_rate": 0.00011993548278062416, "loss": 2.7724, "step": 3140 }, { "epoch": 0.8741086026367969, "grad_norm": 3.195932626724243, "learning_rate": 0.00011989258645060742, "loss": 3.15, "step": 3141 }, { "epoch": 0.8743868925453091, "grad_norm": 2.4808483123779297, "learning_rate": 0.00011984968630901012, "loss": 2.8513, "step": 3142 }, { "epoch": 0.8746651824538213, "grad_norm": 2.432471752166748, "learning_rate": 0.00011980678236405231, "loss": 2.6586, "step": 3143 }, { "epoch": 0.8749434723623335, "grad_norm": 2.429511547088623, "learning_rate": 0.00011976387462395475, "loss": 2.8241, "step": 3144 }, { "epoch": 0.8752217622708457, "grad_norm": 2.3698301315307617, "learning_rate": 0.00011972096309693883, "loss": 2.642, "step": 3145 }, { "epoch": 0.8755000521793579, "grad_norm": 2.2753422260284424, "learning_rate": 0.00011967804779122684, "loss": 2.7391, "step": 3146 }, { "epoch": 0.8757783420878701, "grad_norm": 2.5310845375061035, "learning_rate": 0.00011963512871504168, "loss": 2.9439, "step": 3147 }, { "epoch": 0.8760566319963823, "grad_norm": 2.3136539459228516, "learning_rate": 0.00011959220587660693, "loss": 2.827, "step": 3148 }, { "epoch": 0.8763349219048944, "grad_norm": 2.5879242420196533, "learning_rate": 0.00011954927928414699, "loss": 2.79, "step": 3149 }, { "epoch": 0.8766132118134066, "grad_norm": 2.783357620239258, "learning_rate": 0.00011950634894588697, "loss": 2.9125, "step": 3150 }, { "epoch": 0.8768915017219188, "grad_norm": 2.1605336666107178, "learning_rate": 0.00011946341487005268, "loss": 2.8057, "step": 3151 }, { "epoch": 0.877169791630431, "grad_norm": 2.070019006729126, "learning_rate": 0.00011942047706487056, "loss": 2.8694, "step": 3152 }, { "epoch": 0.8774480815389432, "grad_norm": 2.4996659755706787, "learning_rate": 0.0001193775355385679, "loss": 2.7311, "step": 3153 }, { "epoch": 0.8777263714474554, "grad_norm": 2.225355863571167, "learning_rate": 0.00011933459029937263, "loss": 2.7965, "step": 3154 }, { "epoch": 0.8780046613559676, "grad_norm": 2.6613411903381348, "learning_rate": 0.00011929164135551341, "loss": 2.8094, "step": 3155 }, { "epoch": 0.8782829512644797, "grad_norm": 2.3886003494262695, "learning_rate": 0.00011924868871521959, "loss": 2.8034, "step": 3156 }, { "epoch": 0.878561241172992, "grad_norm": 2.3685553073883057, "learning_rate": 0.00011920573238672128, "loss": 2.6923, "step": 3157 }, { "epoch": 0.8788395310815041, "grad_norm": 2.255863666534424, "learning_rate": 0.0001191627723782492, "loss": 2.687, "step": 3158 }, { "epoch": 0.8791178209900163, "grad_norm": 2.459873914718628, "learning_rate": 0.0001191198086980349, "loss": 2.8802, "step": 3159 }, { "epoch": 0.8793961108985285, "grad_norm": 2.6391382217407227, "learning_rate": 0.00011907684135431054, "loss": 3.0591, "step": 3160 }, { "epoch": 0.8796744008070407, "grad_norm": 2.3763396739959717, "learning_rate": 0.00011903387035530898, "loss": 2.9569, "step": 3161 }, { "epoch": 0.8799526907155529, "grad_norm": 2.3936374187469482, "learning_rate": 0.00011899089570926385, "loss": 2.7921, "step": 3162 }, { "epoch": 0.8802309806240651, "grad_norm": 2.412552833557129, "learning_rate": 0.00011894791742440946, "loss": 2.6865, "step": 3163 }, { "epoch": 0.8805092705325773, "grad_norm": 2.781595468521118, "learning_rate": 0.00011890493550898072, "loss": 2.9301, "step": 3164 }, { "epoch": 0.8807875604410895, "grad_norm": 2.285830020904541, "learning_rate": 0.00011886194997121339, "loss": 2.9814, "step": 3165 }, { "epoch": 0.8810658503496017, "grad_norm": 2.6993072032928467, "learning_rate": 0.00011881896081934383, "loss": 2.5662, "step": 3166 }, { "epoch": 0.8813441402581139, "grad_norm": 2.2598068714141846, "learning_rate": 0.00011877596806160904, "loss": 2.6818, "step": 3167 }, { "epoch": 0.8816224301666261, "grad_norm": 2.345642328262329, "learning_rate": 0.00011873297170624687, "loss": 2.959, "step": 3168 }, { "epoch": 0.8819007200751383, "grad_norm": 2.2420620918273926, "learning_rate": 0.00011868997176149575, "loss": 2.7087, "step": 3169 }, { "epoch": 0.8821790099836505, "grad_norm": 2.185966730117798, "learning_rate": 0.00011864696823559476, "loss": 2.7591, "step": 3170 }, { "epoch": 0.8824572998921627, "grad_norm": 2.3370463848114014, "learning_rate": 0.00011860396113678383, "loss": 2.7997, "step": 3171 }, { "epoch": 0.8827355898006749, "grad_norm": 2.2197234630584717, "learning_rate": 0.00011856095047330335, "loss": 2.8749, "step": 3172 }, { "epoch": 0.8830138797091871, "grad_norm": 2.6085100173950195, "learning_rate": 0.0001185179362533946, "loss": 2.5917, "step": 3173 }, { "epoch": 0.8832921696176992, "grad_norm": 2.385488986968994, "learning_rate": 0.00011847491848529942, "loss": 2.993, "step": 3174 }, { "epoch": 0.8835704595262114, "grad_norm": 2.335477590560913, "learning_rate": 0.00011843189717726036, "loss": 2.7259, "step": 3175 }, { "epoch": 0.8838487494347236, "grad_norm": 2.662012815475464, "learning_rate": 0.00011838887233752061, "loss": 2.9055, "step": 3176 }, { "epoch": 0.8841270393432358, "grad_norm": 2.5965843200683594, "learning_rate": 0.00011834584397432418, "loss": 2.7232, "step": 3177 }, { "epoch": 0.884405329251748, "grad_norm": 2.208580732345581, "learning_rate": 0.00011830281209591556, "loss": 2.5904, "step": 3178 }, { "epoch": 0.8846836191602602, "grad_norm": 2.3809196949005127, "learning_rate": 0.00011825977671054002, "loss": 2.6973, "step": 3179 }, { "epoch": 0.8849619090687724, "grad_norm": 2.418663263320923, "learning_rate": 0.00011821673782644356, "loss": 2.5733, "step": 3180 }, { "epoch": 0.8852401989772846, "grad_norm": 1.9989609718322754, "learning_rate": 0.0001181736954518727, "loss": 2.7866, "step": 3181 }, { "epoch": 0.8855184888857968, "grad_norm": 2.5542213916778564, "learning_rate": 0.0001181306495950747, "loss": 2.9548, "step": 3182 }, { "epoch": 0.885796778794309, "grad_norm": 2.4278435707092285, "learning_rate": 0.00011808760026429757, "loss": 2.8717, "step": 3183 }, { "epoch": 0.8860750687028212, "grad_norm": 2.550440549850464, "learning_rate": 0.00011804454746778986, "loss": 2.9638, "step": 3184 }, { "epoch": 0.8863533586113334, "grad_norm": 2.8156044483184814, "learning_rate": 0.00011800149121380082, "loss": 2.7627, "step": 3185 }, { "epoch": 0.8866316485198455, "grad_norm": 1.9831881523132324, "learning_rate": 0.00011795843151058041, "loss": 2.6911, "step": 3186 }, { "epoch": 0.8869099384283577, "grad_norm": 2.176304578781128, "learning_rate": 0.0001179153683663792, "loss": 2.8662, "step": 3187 }, { "epoch": 0.8871882283368699, "grad_norm": 2.246699571609497, "learning_rate": 0.00011787230178944844, "loss": 2.8479, "step": 3188 }, { "epoch": 0.8874665182453821, "grad_norm": 2.4771440029144287, "learning_rate": 0.00011782923178804001, "loss": 2.751, "step": 3189 }, { "epoch": 0.8877448081538943, "grad_norm": 2.597818613052368, "learning_rate": 0.00011778615837040656, "loss": 2.8729, "step": 3190 }, { "epoch": 0.8880230980624065, "grad_norm": 2.379552125930786, "learning_rate": 0.0001177430815448012, "loss": 2.9676, "step": 3191 }, { "epoch": 0.8883013879709187, "grad_norm": 2.0865752696990967, "learning_rate": 0.00011770000131947785, "loss": 2.6235, "step": 3192 }, { "epoch": 0.8885796778794309, "grad_norm": 2.5810279846191406, "learning_rate": 0.00011765691770269103, "loss": 2.8153, "step": 3193 }, { "epoch": 0.8888579677879431, "grad_norm": 2.4158856868743896, "learning_rate": 0.00011761383070269592, "loss": 2.8831, "step": 3194 }, { "epoch": 0.8891362576964553, "grad_norm": 2.472506523132324, "learning_rate": 0.00011757074032774826, "loss": 2.8188, "step": 3195 }, { "epoch": 0.8894145476049675, "grad_norm": 2.400904655456543, "learning_rate": 0.00011752764658610462, "loss": 2.9434, "step": 3196 }, { "epoch": 0.8896928375134797, "grad_norm": 2.082472324371338, "learning_rate": 0.00011748454948602206, "loss": 2.8787, "step": 3197 }, { "epoch": 0.8899711274219919, "grad_norm": 2.313159704208374, "learning_rate": 0.00011744144903575831, "loss": 2.9163, "step": 3198 }, { "epoch": 0.890249417330504, "grad_norm": 2.5241880416870117, "learning_rate": 0.00011739834524357183, "loss": 2.5254, "step": 3199 }, { "epoch": 0.8905277072390162, "grad_norm": 2.209603786468506, "learning_rate": 0.0001173552381177216, "loss": 2.7572, "step": 3200 }, { "epoch": 0.8905277072390162, "eval_loss": 2.8613476753234863, "eval_runtime": 84.381, "eval_samples_per_second": 59.255, "eval_steps_per_second": 14.814, "step": 3200 }, { "epoch": 0.8908059971475284, "grad_norm": 2.666059732437134, "learning_rate": 0.00011731212766646729, "loss": 2.8175, "step": 3201 }, { "epoch": 0.8910842870560406, "grad_norm": 2.4883222579956055, "learning_rate": 0.00011726901389806924, "loss": 3.1434, "step": 3202 }, { "epoch": 0.8913625769645528, "grad_norm": 2.305368661880493, "learning_rate": 0.0001172258968207884, "loss": 2.842, "step": 3203 }, { "epoch": 0.891640866873065, "grad_norm": 2.2838211059570312, "learning_rate": 0.00011718277644288631, "loss": 2.6889, "step": 3204 }, { "epoch": 0.8919191567815772, "grad_norm": 2.575415849685669, "learning_rate": 0.00011713965277262524, "loss": 2.8824, "step": 3205 }, { "epoch": 0.8921974466900894, "grad_norm": 2.3618688583374023, "learning_rate": 0.00011709652581826803, "loss": 2.8238, "step": 3206 }, { "epoch": 0.8924757365986016, "grad_norm": 2.1778440475463867, "learning_rate": 0.00011705339558807806, "loss": 2.779, "step": 3207 }, { "epoch": 0.8927540265071138, "grad_norm": 2.3154773712158203, "learning_rate": 0.00011701026209031956, "loss": 2.8788, "step": 3208 }, { "epoch": 0.893032316415626, "grad_norm": 2.654587984085083, "learning_rate": 0.00011696712533325718, "loss": 2.9652, "step": 3209 }, { "epoch": 0.8933106063241382, "grad_norm": 2.16239333152771, "learning_rate": 0.00011692398532515627, "loss": 2.6643, "step": 3210 }, { "epoch": 0.8935888962326504, "grad_norm": 2.2648394107818604, "learning_rate": 0.00011688084207428285, "loss": 2.8135, "step": 3211 }, { "epoch": 0.8938671861411626, "grad_norm": 2.097222328186035, "learning_rate": 0.0001168376955889035, "loss": 2.798, "step": 3212 }, { "epoch": 0.8941454760496748, "grad_norm": 2.3265035152435303, "learning_rate": 0.0001167945458772854, "loss": 3.1749, "step": 3213 }, { "epoch": 0.894423765958187, "grad_norm": 2.658252716064453, "learning_rate": 0.00011675139294769644, "loss": 2.9155, "step": 3214 }, { "epoch": 0.8947020558666992, "grad_norm": 2.5769951343536377, "learning_rate": 0.00011670823680840502, "loss": 2.8134, "step": 3215 }, { "epoch": 0.8949803457752114, "grad_norm": 2.2917299270629883, "learning_rate": 0.00011666507746768023, "loss": 2.8611, "step": 3216 }, { "epoch": 0.8952586356837235, "grad_norm": 2.6614186763763428, "learning_rate": 0.00011662191493379175, "loss": 2.9898, "step": 3217 }, { "epoch": 0.8955369255922357, "grad_norm": 2.381690502166748, "learning_rate": 0.00011657874921500986, "loss": 3.1203, "step": 3218 }, { "epoch": 0.8958152155007479, "grad_norm": 2.2761404514312744, "learning_rate": 0.00011653558031960547, "loss": 2.7251, "step": 3219 }, { "epoch": 0.8960935054092601, "grad_norm": 2.08563494682312, "learning_rate": 0.00011649240825585009, "loss": 2.6672, "step": 3220 }, { "epoch": 0.8963717953177723, "grad_norm": 2.159539222717285, "learning_rate": 0.00011644923303201585, "loss": 2.7854, "step": 3221 }, { "epoch": 0.8966500852262845, "grad_norm": 2.302506446838379, "learning_rate": 0.00011640605465637542, "loss": 2.6819, "step": 3222 }, { "epoch": 0.8969283751347967, "grad_norm": 2.6407859325408936, "learning_rate": 0.00011636287313720218, "loss": 2.8962, "step": 3223 }, { "epoch": 0.8972066650433088, "grad_norm": 2.363819122314453, "learning_rate": 0.00011631968848277004, "loss": 2.9683, "step": 3224 }, { "epoch": 0.897484954951821, "grad_norm": 2.6299755573272705, "learning_rate": 0.00011627650070135352, "loss": 2.9565, "step": 3225 }, { "epoch": 0.8977632448603332, "grad_norm": 2.565045118331909, "learning_rate": 0.00011623330980122777, "loss": 2.9152, "step": 3226 }, { "epoch": 0.8980415347688454, "grad_norm": 2.3795461654663086, "learning_rate": 0.00011619011579066851, "loss": 2.7894, "step": 3227 }, { "epoch": 0.8983198246773576, "grad_norm": 2.810457229614258, "learning_rate": 0.00011614691867795204, "loss": 3.14, "step": 3228 }, { "epoch": 0.8985981145858698, "grad_norm": 2.354898691177368, "learning_rate": 0.00011610371847135528, "loss": 2.8388, "step": 3229 }, { "epoch": 0.898876404494382, "grad_norm": 2.5870392322540283, "learning_rate": 0.00011606051517915579, "loss": 2.779, "step": 3230 }, { "epoch": 0.8991546944028942, "grad_norm": 2.542508602142334, "learning_rate": 0.00011601730880963162, "loss": 2.7397, "step": 3231 }, { "epoch": 0.8994329843114064, "grad_norm": 2.29947829246521, "learning_rate": 0.00011597409937106145, "loss": 2.7645, "step": 3232 }, { "epoch": 0.8997112742199186, "grad_norm": 2.263051986694336, "learning_rate": 0.0001159308868717246, "loss": 2.7811, "step": 3233 }, { "epoch": 0.8999895641284308, "grad_norm": 2.2374486923217773, "learning_rate": 0.00011588767131990092, "loss": 2.6576, "step": 3234 }, { "epoch": 0.900267854036943, "grad_norm": 2.802074432373047, "learning_rate": 0.00011584445272387083, "loss": 3.0678, "step": 3235 }, { "epoch": 0.9005461439454552, "grad_norm": 2.298988103866577, "learning_rate": 0.00011580123109191543, "loss": 2.7643, "step": 3236 }, { "epoch": 0.9008244338539674, "grad_norm": 2.4191951751708984, "learning_rate": 0.00011575800643231627, "loss": 2.9738, "step": 3237 }, { "epoch": 0.9011027237624796, "grad_norm": 2.216965436935425, "learning_rate": 0.00011571477875335554, "loss": 2.8942, "step": 3238 }, { "epoch": 0.9013810136709918, "grad_norm": 2.6007871627807617, "learning_rate": 0.00011567154806331605, "loss": 3.0146, "step": 3239 }, { "epoch": 0.901659303579504, "grad_norm": 2.256652355194092, "learning_rate": 0.00011562831437048116, "loss": 3.1227, "step": 3240 }, { "epoch": 0.9019375934880162, "grad_norm": 2.286311626434326, "learning_rate": 0.00011558507768313476, "loss": 2.7476, "step": 3241 }, { "epoch": 0.9022158833965284, "grad_norm": 2.1706249713897705, "learning_rate": 0.00011554183800956136, "loss": 2.6704, "step": 3242 }, { "epoch": 0.9024941733050406, "grad_norm": 2.1790919303894043, "learning_rate": 0.00011549859535804606, "loss": 2.7046, "step": 3243 }, { "epoch": 0.9027724632135528, "grad_norm": 2.3796074390411377, "learning_rate": 0.00011545534973687444, "loss": 2.5855, "step": 3244 }, { "epoch": 0.903050753122065, "grad_norm": 2.739600896835327, "learning_rate": 0.00011541210115433277, "loss": 3.1678, "step": 3245 }, { "epoch": 0.9033290430305772, "grad_norm": 2.442772388458252, "learning_rate": 0.0001153688496187078, "loss": 2.889, "step": 3246 }, { "epoch": 0.9036073329390893, "grad_norm": 2.186018943786621, "learning_rate": 0.00011532559513828686, "loss": 2.8301, "step": 3247 }, { "epoch": 0.9038856228476015, "grad_norm": 2.859795093536377, "learning_rate": 0.00011528233772135788, "loss": 2.692, "step": 3248 }, { "epoch": 0.9041639127561136, "grad_norm": 2.262990713119507, "learning_rate": 0.00011523907737620936, "loss": 3.0078, "step": 3249 }, { "epoch": 0.9044422026646258, "grad_norm": 2.5985515117645264, "learning_rate": 0.00011519581411113027, "loss": 2.6198, "step": 3250 }, { "epoch": 0.904720492573138, "grad_norm": 2.2888035774230957, "learning_rate": 0.00011515254793441026, "loss": 2.4328, "step": 3251 }, { "epoch": 0.9049987824816502, "grad_norm": 2.482410430908203, "learning_rate": 0.00011510927885433947, "loss": 3.0261, "step": 3252 }, { "epoch": 0.9052770723901624, "grad_norm": 3.122368574142456, "learning_rate": 0.00011506600687920855, "loss": 2.9247, "step": 3253 }, { "epoch": 0.9055553622986746, "grad_norm": 2.2355477809906006, "learning_rate": 0.00011502273201730883, "loss": 2.8199, "step": 3254 }, { "epoch": 0.9058336522071868, "grad_norm": 2.3419809341430664, "learning_rate": 0.00011497945427693207, "loss": 2.6171, "step": 3255 }, { "epoch": 0.906111942115699, "grad_norm": 2.1395649909973145, "learning_rate": 0.00011493617366637066, "loss": 2.5076, "step": 3256 }, { "epoch": 0.9063902320242112, "grad_norm": 2.2100508213043213, "learning_rate": 0.00011489289019391753, "loss": 2.7848, "step": 3257 }, { "epoch": 0.9066685219327234, "grad_norm": 2.4874353408813477, "learning_rate": 0.00011484960386786612, "loss": 2.9157, "step": 3258 }, { "epoch": 0.9069468118412356, "grad_norm": 2.282299041748047, "learning_rate": 0.00011480631469651044, "loss": 2.735, "step": 3259 }, { "epoch": 0.9072251017497478, "grad_norm": 2.33966064453125, "learning_rate": 0.00011476302268814508, "loss": 2.5904, "step": 3260 }, { "epoch": 0.90750339165826, "grad_norm": 2.0325934886932373, "learning_rate": 0.00011471972785106509, "loss": 2.8835, "step": 3261 }, { "epoch": 0.9077816815667722, "grad_norm": 2.2501611709594727, "learning_rate": 0.00011467643019356611, "loss": 2.6152, "step": 3262 }, { "epoch": 0.9080599714752844, "grad_norm": 2.3151445388793945, "learning_rate": 0.00011463312972394438, "loss": 2.7238, "step": 3263 }, { "epoch": 0.9083382613837966, "grad_norm": 2.465106725692749, "learning_rate": 0.00011458982645049659, "loss": 2.9745, "step": 3264 }, { "epoch": 0.9086165512923088, "grad_norm": 2.129293203353882, "learning_rate": 0.00011454652038151996, "loss": 2.7973, "step": 3265 }, { "epoch": 0.908894841200821, "grad_norm": 2.5864665508270264, "learning_rate": 0.00011450321152531236, "loss": 2.7476, "step": 3266 }, { "epoch": 0.9091731311093332, "grad_norm": 2.2388198375701904, "learning_rate": 0.00011445989989017205, "loss": 2.9773, "step": 3267 }, { "epoch": 0.9094514210178454, "grad_norm": 2.270639181137085, "learning_rate": 0.00011441658548439789, "loss": 2.8322, "step": 3268 }, { "epoch": 0.9097297109263576, "grad_norm": 2.5815067291259766, "learning_rate": 0.0001143732683162893, "loss": 2.9748, "step": 3269 }, { "epoch": 0.9100080008348698, "grad_norm": 2.0846245288848877, "learning_rate": 0.00011432994839414624, "loss": 2.6558, "step": 3270 }, { "epoch": 0.910286290743382, "grad_norm": 2.0546298027038574, "learning_rate": 0.00011428662572626907, "loss": 2.6025, "step": 3271 }, { "epoch": 0.9105645806518942, "grad_norm": 2.30173921585083, "learning_rate": 0.0001142433003209588, "loss": 2.8407, "step": 3272 }, { "epoch": 0.9108428705604062, "grad_norm": 2.4462785720825195, "learning_rate": 0.00011419997218651697, "loss": 2.7769, "step": 3273 }, { "epoch": 0.9111211604689184, "grad_norm": 2.323119640350342, "learning_rate": 0.00011415664133124555, "loss": 2.7279, "step": 3274 }, { "epoch": 0.9113994503774306, "grad_norm": 2.78416109085083, "learning_rate": 0.00011411330776344709, "loss": 2.6991, "step": 3275 }, { "epoch": 0.9116777402859428, "grad_norm": 2.159846305847168, "learning_rate": 0.00011406997149142467, "loss": 2.9925, "step": 3276 }, { "epoch": 0.911956030194455, "grad_norm": 2.3135643005371094, "learning_rate": 0.00011402663252348184, "loss": 2.9014, "step": 3277 }, { "epoch": 0.9122343201029672, "grad_norm": 2.158547878265381, "learning_rate": 0.00011398329086792272, "loss": 2.3815, "step": 3278 }, { "epoch": 0.9125126100114794, "grad_norm": 2.359222173690796, "learning_rate": 0.00011393994653305194, "loss": 2.7752, "step": 3279 }, { "epoch": 0.9127908999199916, "grad_norm": 2.7347259521484375, "learning_rate": 0.00011389659952717459, "loss": 2.7155, "step": 3280 }, { "epoch": 0.9130691898285038, "grad_norm": 1.9999991655349731, "learning_rate": 0.0001138532498585963, "loss": 2.466, "step": 3281 }, { "epoch": 0.913347479737016, "grad_norm": 2.111236572265625, "learning_rate": 0.00011380989753562328, "loss": 2.8205, "step": 3282 }, { "epoch": 0.9136257696455282, "grad_norm": 2.319601535797119, "learning_rate": 0.00011376654256656213, "loss": 2.8346, "step": 3283 }, { "epoch": 0.9139040595540404, "grad_norm": 2.19997501373291, "learning_rate": 0.00011372318495972001, "loss": 2.4643, "step": 3284 }, { "epoch": 0.9141823494625526, "grad_norm": 2.7047526836395264, "learning_rate": 0.00011367982472340462, "loss": 2.9018, "step": 3285 }, { "epoch": 0.9144606393710648, "grad_norm": 2.8374383449554443, "learning_rate": 0.00011363646186592412, "loss": 2.8884, "step": 3286 }, { "epoch": 0.914738929279577, "grad_norm": 2.4695358276367188, "learning_rate": 0.00011359309639558717, "loss": 2.8908, "step": 3287 }, { "epoch": 0.9150172191880892, "grad_norm": 2.2683334350585938, "learning_rate": 0.00011354972832070295, "loss": 2.8336, "step": 3288 }, { "epoch": 0.9152955090966014, "grad_norm": 2.3568077087402344, "learning_rate": 0.00011350635764958115, "loss": 2.7561, "step": 3289 }, { "epoch": 0.9155737990051136, "grad_norm": 2.2410595417022705, "learning_rate": 0.00011346298439053196, "loss": 2.9763, "step": 3290 }, { "epoch": 0.9158520889136258, "grad_norm": 2.2025701999664307, "learning_rate": 0.00011341960855186597, "loss": 2.7241, "step": 3291 }, { "epoch": 0.916130378822138, "grad_norm": 2.2833974361419678, "learning_rate": 0.00011337623014189443, "loss": 2.7399, "step": 3292 }, { "epoch": 0.9164086687306502, "grad_norm": 2.69035267829895, "learning_rate": 0.00011333284916892895, "loss": 2.6448, "step": 3293 }, { "epoch": 0.9166869586391624, "grad_norm": 2.296508550643921, "learning_rate": 0.00011328946564128167, "loss": 2.8063, "step": 3294 }, { "epoch": 0.9169652485476746, "grad_norm": 2.5324597358703613, "learning_rate": 0.00011324607956726524, "loss": 2.6951, "step": 3295 }, { "epoch": 0.9172435384561868, "grad_norm": 2.399184226989746, "learning_rate": 0.0001132026909551928, "loss": 2.8096, "step": 3296 }, { "epoch": 0.917521828364699, "grad_norm": 2.7564783096313477, "learning_rate": 0.00011315929981337789, "loss": 2.9802, "step": 3297 }, { "epoch": 0.9178001182732111, "grad_norm": 2.584537982940674, "learning_rate": 0.0001131159061501347, "loss": 3.1286, "step": 3298 }, { "epoch": 0.9180784081817233, "grad_norm": 2.528155565261841, "learning_rate": 0.00011307250997377776, "loss": 2.8737, "step": 3299 }, { "epoch": 0.9183566980902355, "grad_norm": 2.2671310901641846, "learning_rate": 0.00011302911129262208, "loss": 2.9963, "step": 3300 }, { "epoch": 0.9183566980902355, "eval_loss": 2.858048439025879, "eval_runtime": 84.9546, "eval_samples_per_second": 58.855, "eval_steps_per_second": 14.714, "step": 3300 }, { "epoch": 0.9186349879987477, "grad_norm": 2.335045576095581, "learning_rate": 0.00011298571011498328, "loss": 2.9065, "step": 3301 }, { "epoch": 0.9189132779072599, "grad_norm": 2.1657867431640625, "learning_rate": 0.00011294230644917736, "loss": 2.7108, "step": 3302 }, { "epoch": 0.919191567815772, "grad_norm": 2.217881679534912, "learning_rate": 0.00011289890030352077, "loss": 2.8423, "step": 3303 }, { "epoch": 0.9194698577242842, "grad_norm": 2.284163236618042, "learning_rate": 0.00011285549168633052, "loss": 2.7591, "step": 3304 }, { "epoch": 0.9197481476327964, "grad_norm": 2.2042033672332764, "learning_rate": 0.00011281208060592402, "loss": 2.9399, "step": 3305 }, { "epoch": 0.9200264375413086, "grad_norm": 2.6998393535614014, "learning_rate": 0.00011276866707061922, "loss": 3.2081, "step": 3306 }, { "epoch": 0.9203047274498208, "grad_norm": 2.962000608444214, "learning_rate": 0.00011272525108873448, "loss": 2.9937, "step": 3307 }, { "epoch": 0.920583017358333, "grad_norm": 2.3461766242980957, "learning_rate": 0.00011268183266858865, "loss": 2.8546, "step": 3308 }, { "epoch": 0.9208613072668452, "grad_norm": 2.7342071533203125, "learning_rate": 0.00011263841181850105, "loss": 2.9024, "step": 3309 }, { "epoch": 0.9211395971753574, "grad_norm": 1.9094696044921875, "learning_rate": 0.00011259498854679149, "loss": 2.5305, "step": 3310 }, { "epoch": 0.9214178870838696, "grad_norm": 2.179755210876465, "learning_rate": 0.00011255156286178023, "loss": 2.8066, "step": 3311 }, { "epoch": 0.9216961769923818, "grad_norm": 2.1006948947906494, "learning_rate": 0.00011250813477178791, "loss": 2.7481, "step": 3312 }, { "epoch": 0.921974466900894, "grad_norm": 2.832465410232544, "learning_rate": 0.00011246470428513583, "loss": 3.0574, "step": 3313 }, { "epoch": 0.9222527568094062, "grad_norm": 2.9960803985595703, "learning_rate": 0.00011242127141014548, "loss": 3.1441, "step": 3314 }, { "epoch": 0.9225310467179184, "grad_norm": 2.6043107509613037, "learning_rate": 0.00011237783615513904, "loss": 2.8598, "step": 3315 }, { "epoch": 0.9228093366264306, "grad_norm": 2.288594961166382, "learning_rate": 0.00011233439852843904, "loss": 2.9397, "step": 3316 }, { "epoch": 0.9230876265349428, "grad_norm": 2.3405494689941406, "learning_rate": 0.00011229095853836848, "loss": 2.7271, "step": 3317 }, { "epoch": 0.923365916443455, "grad_norm": 2.0119740962982178, "learning_rate": 0.00011224751619325078, "loss": 2.7206, "step": 3318 }, { "epoch": 0.9236442063519672, "grad_norm": 2.354665756225586, "learning_rate": 0.00011220407150140991, "loss": 2.885, "step": 3319 }, { "epoch": 0.9239224962604794, "grad_norm": 2.466855525970459, "learning_rate": 0.00011216062447117015, "loss": 2.8629, "step": 3320 }, { "epoch": 0.9242007861689916, "grad_norm": 2.5271036624908447, "learning_rate": 0.00011211717511085633, "loss": 3.0928, "step": 3321 }, { "epoch": 0.9244790760775038, "grad_norm": 2.865319013595581, "learning_rate": 0.00011207372342879374, "loss": 3.0588, "step": 3322 }, { "epoch": 0.9247573659860159, "grad_norm": 2.478585958480835, "learning_rate": 0.00011203026943330806, "loss": 2.9025, "step": 3323 }, { "epoch": 0.9250356558945281, "grad_norm": 2.293642520904541, "learning_rate": 0.00011198681313272535, "loss": 2.8712, "step": 3324 }, { "epoch": 0.9253139458030403, "grad_norm": 2.8248133659362793, "learning_rate": 0.00011194335453537227, "loss": 2.8739, "step": 3325 }, { "epoch": 0.9255922357115525, "grad_norm": 2.6668381690979004, "learning_rate": 0.00011189989364957586, "loss": 2.9767, "step": 3326 }, { "epoch": 0.9258705256200647, "grad_norm": 3.19988751411438, "learning_rate": 0.00011185643048366348, "loss": 3.2032, "step": 3327 }, { "epoch": 0.9261488155285769, "grad_norm": 2.1796481609344482, "learning_rate": 0.00011181296504596308, "loss": 2.6084, "step": 3328 }, { "epoch": 0.9264271054370891, "grad_norm": 2.262021780014038, "learning_rate": 0.00011176949734480302, "loss": 2.9607, "step": 3329 }, { "epoch": 0.9267053953456013, "grad_norm": 2.0772109031677246, "learning_rate": 0.00011172602738851201, "loss": 2.7654, "step": 3330 }, { "epoch": 0.9269836852541135, "grad_norm": 2.3915789127349854, "learning_rate": 0.00011168255518541926, "loss": 2.8657, "step": 3331 }, { "epoch": 0.9272619751626257, "grad_norm": 2.139936685562134, "learning_rate": 0.00011163908074385443, "loss": 2.6868, "step": 3332 }, { "epoch": 0.9275402650711378, "grad_norm": 2.1370015144348145, "learning_rate": 0.00011159560407214755, "loss": 2.7093, "step": 3333 }, { "epoch": 0.92781855497965, "grad_norm": 2.5584495067596436, "learning_rate": 0.00011155212517862908, "loss": 3.2027, "step": 3334 }, { "epoch": 0.9280968448881622, "grad_norm": 2.562694787979126, "learning_rate": 0.00011150864407162995, "loss": 2.8933, "step": 3335 }, { "epoch": 0.9283751347966744, "grad_norm": 2.264348030090332, "learning_rate": 0.0001114651607594815, "loss": 3.1107, "step": 3336 }, { "epoch": 0.9286534247051866, "grad_norm": 2.344406843185425, "learning_rate": 0.00011142167525051546, "loss": 2.8163, "step": 3337 }, { "epoch": 0.9289317146136988, "grad_norm": 2.187631368637085, "learning_rate": 0.00011137818755306401, "loss": 2.7269, "step": 3338 }, { "epoch": 0.929210004522211, "grad_norm": 2.2779104709625244, "learning_rate": 0.00011133469767545979, "loss": 2.6265, "step": 3339 }, { "epoch": 0.9294882944307232, "grad_norm": 2.5341391563415527, "learning_rate": 0.00011129120562603575, "loss": 2.5921, "step": 3340 }, { "epoch": 0.9297665843392354, "grad_norm": 2.2553391456604004, "learning_rate": 0.00011124771141312534, "loss": 2.6841, "step": 3341 }, { "epoch": 0.9300448742477476, "grad_norm": 2.232781410217285, "learning_rate": 0.00011120421504506242, "loss": 2.7506, "step": 3342 }, { "epoch": 0.9303231641562598, "grad_norm": 2.1760356426239014, "learning_rate": 0.00011116071653018123, "loss": 2.9451, "step": 3343 }, { "epoch": 0.930601454064772, "grad_norm": 3.2467339038848877, "learning_rate": 0.00011111721587681645, "loss": 2.6086, "step": 3344 }, { "epoch": 0.9308797439732842, "grad_norm": 2.293043613433838, "learning_rate": 0.00011107371309330314, "loss": 2.9329, "step": 3345 }, { "epoch": 0.9311580338817964, "grad_norm": 2.6064062118530273, "learning_rate": 0.00011103020818797679, "loss": 2.681, "step": 3346 }, { "epoch": 0.9314363237903086, "grad_norm": 2.2397329807281494, "learning_rate": 0.00011098670116917331, "loss": 2.9558, "step": 3347 }, { "epoch": 0.9317146136988207, "grad_norm": 2.2513034343719482, "learning_rate": 0.00011094319204522896, "loss": 2.8221, "step": 3348 }, { "epoch": 0.9319929036073329, "grad_norm": 3.109107732772827, "learning_rate": 0.00011089968082448046, "loss": 3.1987, "step": 3349 }, { "epoch": 0.9322711935158451, "grad_norm": 3.5638010501861572, "learning_rate": 0.00011085616751526495, "loss": 3.036, "step": 3350 }, { "epoch": 0.9325494834243573, "grad_norm": 2.168316125869751, "learning_rate": 0.0001108126521259199, "loss": 2.791, "step": 3351 }, { "epoch": 0.9328277733328695, "grad_norm": 2.0525906085968018, "learning_rate": 0.00011076913466478316, "loss": 2.8991, "step": 3352 }, { "epoch": 0.9331060632413817, "grad_norm": 2.5270113945007324, "learning_rate": 0.0001107256151401931, "loss": 2.7507, "step": 3353 }, { "epoch": 0.9333843531498939, "grad_norm": 2.3239269256591797, "learning_rate": 0.00011068209356048843, "loss": 2.6329, "step": 3354 }, { "epoch": 0.9336626430584061, "grad_norm": 2.1798810958862305, "learning_rate": 0.00011063856993400812, "loss": 2.4523, "step": 3355 }, { "epoch": 0.9339409329669183, "grad_norm": 2.218135118484497, "learning_rate": 0.00011059504426909178, "loss": 2.8171, "step": 3356 }, { "epoch": 0.9342192228754305, "grad_norm": 2.431818962097168, "learning_rate": 0.00011055151657407923, "loss": 2.7395, "step": 3357 }, { "epoch": 0.9344975127839427, "grad_norm": 2.3993406295776367, "learning_rate": 0.00011050798685731068, "loss": 2.8514, "step": 3358 }, { "epoch": 0.9347758026924549, "grad_norm": 2.7699503898620605, "learning_rate": 0.00011046445512712684, "loss": 2.8559, "step": 3359 }, { "epoch": 0.9350540926009671, "grad_norm": 2.2018282413482666, "learning_rate": 0.00011042092139186873, "loss": 2.5736, "step": 3360 }, { "epoch": 0.9353323825094793, "grad_norm": 2.754648208618164, "learning_rate": 0.00011037738565987773, "loss": 2.9668, "step": 3361 }, { "epoch": 0.9356106724179915, "grad_norm": 2.4192304611206055, "learning_rate": 0.0001103338479394957, "loss": 2.7228, "step": 3362 }, { "epoch": 0.9358889623265036, "grad_norm": 3.394136667251587, "learning_rate": 0.00011029030823906477, "loss": 3.0349, "step": 3363 }, { "epoch": 0.9361672522350158, "grad_norm": 2.588103771209717, "learning_rate": 0.00011024676656692749, "loss": 2.8637, "step": 3364 }, { "epoch": 0.936445542143528, "grad_norm": 2.3688783645629883, "learning_rate": 0.00011020322293142685, "loss": 2.9091, "step": 3365 }, { "epoch": 0.9367238320520402, "grad_norm": 2.125288486480713, "learning_rate": 0.00011015967734090613, "loss": 2.7945, "step": 3366 }, { "epoch": 0.9370021219605524, "grad_norm": 2.55336332321167, "learning_rate": 0.000110116129803709, "loss": 2.7737, "step": 3367 }, { "epoch": 0.9372804118690646, "grad_norm": 2.732656717300415, "learning_rate": 0.00011007258032817955, "loss": 2.8103, "step": 3368 }, { "epoch": 0.9375587017775768, "grad_norm": 2.160461187362671, "learning_rate": 0.0001100290289226622, "loss": 2.6442, "step": 3369 }, { "epoch": 0.937836991686089, "grad_norm": 2.4054534435272217, "learning_rate": 0.00010998547559550178, "loss": 2.7807, "step": 3370 }, { "epoch": 0.9381152815946012, "grad_norm": 2.438422203063965, "learning_rate": 0.00010994192035504341, "loss": 2.7416, "step": 3371 }, { "epoch": 0.9383935715031134, "grad_norm": 2.2534382343292236, "learning_rate": 0.00010989836320963267, "loss": 2.898, "step": 3372 }, { "epoch": 0.9386718614116255, "grad_norm": 2.581714391708374, "learning_rate": 0.00010985480416761543, "loss": 2.8967, "step": 3373 }, { "epoch": 0.9389501513201377, "grad_norm": 2.739999294281006, "learning_rate": 0.00010981124323733796, "loss": 2.6241, "step": 3374 }, { "epoch": 0.9392284412286499, "grad_norm": 2.684469699859619, "learning_rate": 0.00010976768042714691, "loss": 2.976, "step": 3375 }, { "epoch": 0.9395067311371621, "grad_norm": 2.4240050315856934, "learning_rate": 0.00010972411574538926, "loss": 2.7242, "step": 3376 }, { "epoch": 0.9397850210456743, "grad_norm": 3.2338616847991943, "learning_rate": 0.00010968054920041231, "loss": 3.0169, "step": 3377 }, { "epoch": 0.9400633109541865, "grad_norm": 2.4346389770507812, "learning_rate": 0.00010963698080056383, "loss": 2.9421, "step": 3378 }, { "epoch": 0.9403416008626987, "grad_norm": 2.279630422592163, "learning_rate": 0.00010959341055419186, "loss": 3.0487, "step": 3379 }, { "epoch": 0.9406198907712109, "grad_norm": 2.3373608589172363, "learning_rate": 0.00010954983846964475, "loss": 2.6994, "step": 3380 }, { "epoch": 0.9408981806797231, "grad_norm": 2.2228503227233887, "learning_rate": 0.00010950626455527136, "loss": 2.9121, "step": 3381 }, { "epoch": 0.9411764705882353, "grad_norm": 2.4826459884643555, "learning_rate": 0.00010946268881942075, "loss": 2.6557, "step": 3382 }, { "epoch": 0.9414547604967475, "grad_norm": 2.559644937515259, "learning_rate": 0.00010941911127044235, "loss": 3.0571, "step": 3383 }, { "epoch": 0.9417330504052597, "grad_norm": 2.498295545578003, "learning_rate": 0.00010937553191668606, "loss": 2.8011, "step": 3384 }, { "epoch": 0.9420113403137719, "grad_norm": 2.42797589302063, "learning_rate": 0.00010933195076650197, "loss": 3.2785, "step": 3385 }, { "epoch": 0.9422896302222841, "grad_norm": 2.4225103855133057, "learning_rate": 0.00010928836782824059, "loss": 2.8082, "step": 3386 }, { "epoch": 0.9425679201307963, "grad_norm": 2.2978909015655518, "learning_rate": 0.00010924478311025279, "loss": 2.8397, "step": 3387 }, { "epoch": 0.9428462100393085, "grad_norm": 2.238184690475464, "learning_rate": 0.00010920119662088975, "loss": 2.8608, "step": 3388 }, { "epoch": 0.9431244999478207, "grad_norm": 2.4247283935546875, "learning_rate": 0.00010915760836850293, "loss": 2.9992, "step": 3389 }, { "epoch": 0.9434027898563329, "grad_norm": 2.703320026397705, "learning_rate": 0.00010911401836144428, "loss": 2.7573, "step": 3390 }, { "epoch": 0.943681079764845, "grad_norm": 2.559495210647583, "learning_rate": 0.00010907042660806597, "loss": 2.7297, "step": 3391 }, { "epoch": 0.9439593696733573, "grad_norm": 2.3044445514678955, "learning_rate": 0.0001090268331167205, "loss": 2.6088, "step": 3392 }, { "epoch": 0.9442376595818694, "grad_norm": 2.8080151081085205, "learning_rate": 0.00010898323789576079, "loss": 2.8747, "step": 3393 }, { "epoch": 0.9445159494903816, "grad_norm": 2.473289966583252, "learning_rate": 0.00010893964095354001, "loss": 3.0389, "step": 3394 }, { "epoch": 0.9447942393988938, "grad_norm": 2.407241106033325, "learning_rate": 0.00010889604229841166, "loss": 2.6577, "step": 3395 }, { "epoch": 0.945072529307406, "grad_norm": 2.1695327758789062, "learning_rate": 0.00010885244193872964, "loss": 2.694, "step": 3396 }, { "epoch": 0.9453508192159182, "grad_norm": 2.1375181674957275, "learning_rate": 0.00010880883988284812, "loss": 2.609, "step": 3397 }, { "epoch": 0.9456291091244303, "grad_norm": 2.9132590293884277, "learning_rate": 0.00010876523613912156, "loss": 2.75, "step": 3398 }, { "epoch": 0.9459073990329425, "grad_norm": 1.878095030784607, "learning_rate": 0.00010872163071590484, "loss": 2.6727, "step": 3399 }, { "epoch": 0.9461856889414547, "grad_norm": 3.3226566314697266, "learning_rate": 0.00010867802362155312, "loss": 2.6802, "step": 3400 }, { "epoch": 0.9461856889414547, "eval_loss": 2.853184461593628, "eval_runtime": 84.7306, "eval_samples_per_second": 59.011, "eval_steps_per_second": 14.753, "step": 3400 }, { "epoch": 0.9464639788499669, "grad_norm": 2.4484403133392334, "learning_rate": 0.00010863441486442183, "loss": 2.9368, "step": 3401 }, { "epoch": 0.9467422687584791, "grad_norm": 2.4893407821655273, "learning_rate": 0.0001085908044528668, "loss": 2.7994, "step": 3402 }, { "epoch": 0.9470205586669913, "grad_norm": 2.34183406829834, "learning_rate": 0.00010854719239524413, "loss": 2.6994, "step": 3403 }, { "epoch": 0.9472988485755035, "grad_norm": 2.435349702835083, "learning_rate": 0.00010850357869991022, "loss": 3.0188, "step": 3404 }, { "epoch": 0.9475771384840157, "grad_norm": 2.160815477371216, "learning_rate": 0.00010845996337522181, "loss": 2.5216, "step": 3405 }, { "epoch": 0.9478554283925279, "grad_norm": 2.317697525024414, "learning_rate": 0.00010841634642953599, "loss": 2.7243, "step": 3406 }, { "epoch": 0.9481337183010401, "grad_norm": 2.4328453540802, "learning_rate": 0.00010837272787121007, "loss": 2.7284, "step": 3407 }, { "epoch": 0.9484120082095523, "grad_norm": 2.18982195854187, "learning_rate": 0.00010832910770860173, "loss": 2.5337, "step": 3408 }, { "epoch": 0.9486902981180645, "grad_norm": 2.2894012928009033, "learning_rate": 0.00010828548595006897, "loss": 2.8213, "step": 3409 }, { "epoch": 0.9489685880265767, "grad_norm": 2.2360222339630127, "learning_rate": 0.00010824186260397007, "loss": 2.7091, "step": 3410 }, { "epoch": 0.9492468779350889, "grad_norm": 2.0325064659118652, "learning_rate": 0.00010819823767866357, "loss": 2.7222, "step": 3411 }, { "epoch": 0.9495251678436011, "grad_norm": 2.430107593536377, "learning_rate": 0.0001081546111825084, "loss": 2.7026, "step": 3412 }, { "epoch": 0.9498034577521133, "grad_norm": 2.193359851837158, "learning_rate": 0.00010811098312386376, "loss": 2.8797, "step": 3413 }, { "epoch": 0.9500817476606255, "grad_norm": 2.255690574645996, "learning_rate": 0.0001080673535110891, "loss": 2.724, "step": 3414 }, { "epoch": 0.9503600375691377, "grad_norm": 2.5954506397247314, "learning_rate": 0.00010802372235254425, "loss": 2.7815, "step": 3415 }, { "epoch": 0.9506383274776499, "grad_norm": 2.6023714542388916, "learning_rate": 0.00010798008965658927, "loss": 2.9039, "step": 3416 }, { "epoch": 0.9509166173861621, "grad_norm": 2.5801782608032227, "learning_rate": 0.0001079364554315845, "loss": 3.3455, "step": 3417 }, { "epoch": 0.9511949072946743, "grad_norm": 3.276528835296631, "learning_rate": 0.00010789281968589069, "loss": 3.2265, "step": 3418 }, { "epoch": 0.9514731972031865, "grad_norm": 2.3223166465759277, "learning_rate": 0.00010784918242786874, "loss": 2.9946, "step": 3419 }, { "epoch": 0.9517514871116987, "grad_norm": 2.341273784637451, "learning_rate": 0.00010780554366587994, "loss": 2.8756, "step": 3420 }, { "epoch": 0.9520297770202109, "grad_norm": 2.3316421508789062, "learning_rate": 0.0001077619034082858, "loss": 2.9907, "step": 3421 }, { "epoch": 0.952308066928723, "grad_norm": 2.5908291339874268, "learning_rate": 0.00010771826166344817, "loss": 2.4577, "step": 3422 }, { "epoch": 0.9525863568372351, "grad_norm": 3.4217398166656494, "learning_rate": 0.00010767461843972913, "loss": 2.8214, "step": 3423 }, { "epoch": 0.9528646467457473, "grad_norm": 2.451514720916748, "learning_rate": 0.0001076309737454911, "loss": 2.8437, "step": 3424 }, { "epoch": 0.9531429366542595, "grad_norm": 2.30802059173584, "learning_rate": 0.00010758732758909676, "loss": 2.7413, "step": 3425 }, { "epoch": 0.9534212265627717, "grad_norm": 1.9844789505004883, "learning_rate": 0.00010754367997890905, "loss": 2.6411, "step": 3426 }, { "epoch": 0.9536995164712839, "grad_norm": 2.3322134017944336, "learning_rate": 0.00010750003092329121, "loss": 2.9509, "step": 3427 }, { "epoch": 0.9539778063797961, "grad_norm": 2.3097901344299316, "learning_rate": 0.00010745638043060677, "loss": 2.8728, "step": 3428 }, { "epoch": 0.9542560962883083, "grad_norm": 2.2025935649871826, "learning_rate": 0.00010741272850921948, "loss": 2.8397, "step": 3429 }, { "epoch": 0.9545343861968205, "grad_norm": 2.9165728092193604, "learning_rate": 0.00010736907516749341, "loss": 2.8681, "step": 3430 }, { "epoch": 0.9548126761053327, "grad_norm": 2.7683804035186768, "learning_rate": 0.00010732542041379296, "loss": 2.6116, "step": 3431 }, { "epoch": 0.9550909660138449, "grad_norm": 2.2428462505340576, "learning_rate": 0.00010728176425648266, "loss": 2.7152, "step": 3432 }, { "epoch": 0.9553692559223571, "grad_norm": 2.7460410594940186, "learning_rate": 0.0001072381067039274, "loss": 3.0348, "step": 3433 }, { "epoch": 0.9556475458308693, "grad_norm": 2.4041411876678467, "learning_rate": 0.00010719444776449234, "loss": 2.576, "step": 3434 }, { "epoch": 0.9559258357393815, "grad_norm": 2.5589759349823, "learning_rate": 0.00010715078744654286, "loss": 2.6222, "step": 3435 }, { "epoch": 0.9562041256478937, "grad_norm": 2.3054697513580322, "learning_rate": 0.00010710712575844464, "loss": 2.8066, "step": 3436 }, { "epoch": 0.9564824155564059, "grad_norm": 1.9223846197128296, "learning_rate": 0.00010706346270856362, "loss": 2.4603, "step": 3437 }, { "epoch": 0.9567607054649181, "grad_norm": 2.14790415763855, "learning_rate": 0.00010701979830526598, "loss": 2.6655, "step": 3438 }, { "epoch": 0.9570389953734303, "grad_norm": 2.1722042560577393, "learning_rate": 0.0001069761325569182, "loss": 2.5412, "step": 3439 }, { "epoch": 0.9573172852819425, "grad_norm": 2.389418840408325, "learning_rate": 0.00010693246547188697, "loss": 2.9063, "step": 3440 }, { "epoch": 0.9575955751904547, "grad_norm": 2.484543561935425, "learning_rate": 0.00010688879705853924, "loss": 2.8468, "step": 3441 }, { "epoch": 0.9578738650989669, "grad_norm": 1.9330151081085205, "learning_rate": 0.00010684512732524228, "loss": 2.6909, "step": 3442 }, { "epoch": 0.9581521550074791, "grad_norm": 2.348156690597534, "learning_rate": 0.0001068014562803635, "loss": 2.6621, "step": 3443 }, { "epoch": 0.9584304449159913, "grad_norm": 2.6134719848632812, "learning_rate": 0.00010675778393227067, "loss": 2.9911, "step": 3444 }, { "epoch": 0.9587087348245035, "grad_norm": 2.2595410346984863, "learning_rate": 0.00010671411028933178, "loss": 2.8151, "step": 3445 }, { "epoch": 0.9589870247330157, "grad_norm": 2.0615856647491455, "learning_rate": 0.00010667043535991501, "loss": 2.6959, "step": 3446 }, { "epoch": 0.9592653146415279, "grad_norm": 2.5475590229034424, "learning_rate": 0.00010662675915238884, "loss": 2.7886, "step": 3447 }, { "epoch": 0.95954360455004, "grad_norm": 2.409576177597046, "learning_rate": 0.00010658308167512201, "loss": 2.9314, "step": 3448 }, { "epoch": 0.9598218944585521, "grad_norm": 2.3006186485290527, "learning_rate": 0.00010653940293648343, "loss": 2.7994, "step": 3449 }, { "epoch": 0.9601001843670643, "grad_norm": 2.692882537841797, "learning_rate": 0.00010649572294484235, "loss": 2.7995, "step": 3450 }, { "epoch": 0.9603784742755765, "grad_norm": 2.340289354324341, "learning_rate": 0.00010645204170856819, "loss": 2.7389, "step": 3451 }, { "epoch": 0.9606567641840887, "grad_norm": 2.2093257904052734, "learning_rate": 0.00010640835923603065, "loss": 2.6672, "step": 3452 }, { "epoch": 0.9609350540926009, "grad_norm": 2.311194896697998, "learning_rate": 0.00010636467553559961, "loss": 3.1015, "step": 3453 }, { "epoch": 0.9612133440011131, "grad_norm": 2.6760976314544678, "learning_rate": 0.00010632099061564522, "loss": 2.9132, "step": 3454 }, { "epoch": 0.9614916339096253, "grad_norm": 2.238374948501587, "learning_rate": 0.00010627730448453792, "loss": 2.514, "step": 3455 }, { "epoch": 0.9617699238181375, "grad_norm": 2.9613614082336426, "learning_rate": 0.00010623361715064825, "loss": 2.8558, "step": 3456 }, { "epoch": 0.9620482137266497, "grad_norm": 2.384523391723633, "learning_rate": 0.00010618992862234712, "loss": 2.8143, "step": 3457 }, { "epoch": 0.9623265036351619, "grad_norm": 2.3256590366363525, "learning_rate": 0.00010614623890800554, "loss": 2.7558, "step": 3458 }, { "epoch": 0.9626047935436741, "grad_norm": 2.3488802909851074, "learning_rate": 0.00010610254801599488, "loss": 2.6528, "step": 3459 }, { "epoch": 0.9628830834521863, "grad_norm": 2.2439961433410645, "learning_rate": 0.00010605885595468663, "loss": 3.053, "step": 3460 }, { "epoch": 0.9631613733606985, "grad_norm": 2.6251027584075928, "learning_rate": 0.00010601516273245256, "loss": 2.6995, "step": 3461 }, { "epoch": 0.9634396632692107, "grad_norm": 2.3058249950408936, "learning_rate": 0.00010597146835766463, "loss": 2.8869, "step": 3462 }, { "epoch": 0.9637179531777229, "grad_norm": 2.5864367485046387, "learning_rate": 0.00010592777283869504, "loss": 2.7109, "step": 3463 }, { "epoch": 0.9639962430862351, "grad_norm": 2.3141210079193115, "learning_rate": 0.00010588407618391621, "loss": 2.7213, "step": 3464 }, { "epoch": 0.9642745329947473, "grad_norm": 2.498164176940918, "learning_rate": 0.00010584037840170078, "loss": 2.6419, "step": 3465 }, { "epoch": 0.9645528229032595, "grad_norm": 2.8334856033325195, "learning_rate": 0.00010579667950042156, "loss": 2.9573, "step": 3466 }, { "epoch": 0.9648311128117717, "grad_norm": 2.537562847137451, "learning_rate": 0.00010575297948845166, "loss": 2.8908, "step": 3467 }, { "epoch": 0.9651094027202839, "grad_norm": 2.225403070449829, "learning_rate": 0.00010570927837416435, "loss": 2.6236, "step": 3468 }, { "epoch": 0.9653876926287961, "grad_norm": 2.7487964630126953, "learning_rate": 0.00010566557616593306, "loss": 2.9078, "step": 3469 }, { "epoch": 0.9656659825373083, "grad_norm": 2.6212213039398193, "learning_rate": 0.00010562187287213157, "loss": 2.8571, "step": 3470 }, { "epoch": 0.9659442724458205, "grad_norm": 2.163294792175293, "learning_rate": 0.00010557816850113374, "loss": 2.7762, "step": 3471 }, { "epoch": 0.9662225623543327, "grad_norm": 2.286970615386963, "learning_rate": 0.0001055344630613137, "loss": 2.7906, "step": 3472 }, { "epoch": 0.9665008522628448, "grad_norm": 4.619243621826172, "learning_rate": 0.00010549075656104573, "loss": 2.9877, "step": 3473 }, { "epoch": 0.966779142171357, "grad_norm": 2.634018898010254, "learning_rate": 0.00010544704900870437, "loss": 2.8022, "step": 3474 }, { "epoch": 0.9670574320798692, "grad_norm": 1.981520652770996, "learning_rate": 0.00010540334041266437, "loss": 2.5547, "step": 3475 }, { "epoch": 0.9673357219883814, "grad_norm": 2.909611940383911, "learning_rate": 0.0001053596307813006, "loss": 3.0267, "step": 3476 }, { "epoch": 0.9676140118968936, "grad_norm": 2.3776512145996094, "learning_rate": 0.00010531592012298821, "loss": 3.1253, "step": 3477 }, { "epoch": 0.9678923018054058, "grad_norm": 2.416294813156128, "learning_rate": 0.00010527220844610253, "loss": 2.8826, "step": 3478 }, { "epoch": 0.968170591713918, "grad_norm": 2.4944708347320557, "learning_rate": 0.000105228495759019, "loss": 2.937, "step": 3479 }, { "epoch": 0.9684488816224301, "grad_norm": 2.3774211406707764, "learning_rate": 0.00010518478207011342, "loss": 2.8935, "step": 3480 }, { "epoch": 0.9687271715309423, "grad_norm": 2.497574806213379, "learning_rate": 0.00010514106738776162, "loss": 3.1926, "step": 3481 }, { "epoch": 0.9690054614394545, "grad_norm": 2.6420583724975586, "learning_rate": 0.0001050973517203397, "loss": 2.9894, "step": 3482 }, { "epoch": 0.9692837513479667, "grad_norm": 2.631377696990967, "learning_rate": 0.00010505363507622396, "loss": 2.7764, "step": 3483 }, { "epoch": 0.9695620412564789, "grad_norm": 2.26882266998291, "learning_rate": 0.00010500991746379085, "loss": 2.7499, "step": 3484 }, { "epoch": 0.9698403311649911, "grad_norm": 2.575061798095703, "learning_rate": 0.00010496619889141699, "loss": 2.8587, "step": 3485 }, { "epoch": 0.9701186210735033, "grad_norm": 2.343097686767578, "learning_rate": 0.00010492247936747927, "loss": 2.7622, "step": 3486 }, { "epoch": 0.9703969109820155, "grad_norm": 2.4070615768432617, "learning_rate": 0.00010487875890035466, "loss": 2.5154, "step": 3487 }, { "epoch": 0.9706752008905277, "grad_norm": 2.7661375999450684, "learning_rate": 0.00010483503749842034, "loss": 3.0374, "step": 3488 }, { "epoch": 0.9709534907990399, "grad_norm": 2.2526514530181885, "learning_rate": 0.00010479131517005372, "loss": 2.852, "step": 3489 }, { "epoch": 0.9712317807075521, "grad_norm": 2.4803273677825928, "learning_rate": 0.00010474759192363237, "loss": 2.8408, "step": 3490 }, { "epoch": 0.9715100706160643, "grad_norm": 2.3808817863464355, "learning_rate": 0.00010470386776753395, "loss": 2.8475, "step": 3491 }, { "epoch": 0.9717883605245765, "grad_norm": 2.629021406173706, "learning_rate": 0.00010466014271013645, "loss": 2.7204, "step": 3492 }, { "epoch": 0.9720666504330887, "grad_norm": 2.3725035190582275, "learning_rate": 0.00010461641675981786, "loss": 2.7147, "step": 3493 }, { "epoch": 0.9723449403416009, "grad_norm": 2.413064479827881, "learning_rate": 0.00010457268992495648, "loss": 3.0143, "step": 3494 }, { "epoch": 0.9726232302501131, "grad_norm": 2.187839984893799, "learning_rate": 0.00010452896221393071, "loss": 2.8, "step": 3495 }, { "epoch": 0.9729015201586253, "grad_norm": 2.276700019836426, "learning_rate": 0.00010448523363511913, "loss": 2.6035, "step": 3496 }, { "epoch": 0.9731798100671374, "grad_norm": 2.2283873558044434, "learning_rate": 0.00010444150419690048, "loss": 2.7933, "step": 3497 }, { "epoch": 0.9734580999756496, "grad_norm": 2.4954376220703125, "learning_rate": 0.00010439777390765371, "loss": 2.8615, "step": 3498 }, { "epoch": 0.9737363898841618, "grad_norm": 2.356783628463745, "learning_rate": 0.00010435404277575787, "loss": 2.7257, "step": 3499 }, { "epoch": 0.974014679792674, "grad_norm": 2.230088949203491, "learning_rate": 0.00010431031080959217, "loss": 2.8935, "step": 3500 }, { "epoch": 0.974014679792674, "eval_loss": 2.8504438400268555, "eval_runtime": 84.9148, "eval_samples_per_second": 58.883, "eval_steps_per_second": 14.721, "step": 3500 }, { "epoch": 0.9742929697011862, "grad_norm": 2.254199743270874, "learning_rate": 0.00010426657801753605, "loss": 2.6137, "step": 3501 }, { "epoch": 0.9745712596096984, "grad_norm": 2.2392194271087646, "learning_rate": 0.00010422284440796908, "loss": 2.6891, "step": 3502 }, { "epoch": 0.9748495495182106, "grad_norm": 2.2609758377075195, "learning_rate": 0.00010417910998927091, "loss": 3.0529, "step": 3503 }, { "epoch": 0.9751278394267228, "grad_norm": 2.17738938331604, "learning_rate": 0.00010413537476982152, "loss": 2.6903, "step": 3504 }, { "epoch": 0.975406129335235, "grad_norm": 2.2378740310668945, "learning_rate": 0.00010409163875800081, "loss": 2.9504, "step": 3505 }, { "epoch": 0.9756844192437472, "grad_norm": 2.469862937927246, "learning_rate": 0.00010404790196218901, "loss": 2.8363, "step": 3506 }, { "epoch": 0.9759627091522594, "grad_norm": 2.4737284183502197, "learning_rate": 0.00010400416439076644, "loss": 2.6723, "step": 3507 }, { "epoch": 0.9762409990607716, "grad_norm": 2.456648826599121, "learning_rate": 0.00010396042605211358, "loss": 2.6159, "step": 3508 }, { "epoch": 0.9765192889692837, "grad_norm": 2.6448042392730713, "learning_rate": 0.00010391668695461103, "loss": 2.9114, "step": 3509 }, { "epoch": 0.976797578877796, "grad_norm": 2.4260952472686768, "learning_rate": 0.00010387294710663955, "loss": 2.9968, "step": 3510 }, { "epoch": 0.9770758687863081, "grad_norm": 2.230940341949463, "learning_rate": 0.00010382920651658011, "loss": 2.7335, "step": 3511 }, { "epoch": 0.9773541586948203, "grad_norm": 2.644887685775757, "learning_rate": 0.0001037854651928137, "loss": 2.783, "step": 3512 }, { "epoch": 0.9776324486033325, "grad_norm": 2.2695231437683105, "learning_rate": 0.00010374172314372148, "loss": 2.7698, "step": 3513 }, { "epoch": 0.9779107385118447, "grad_norm": 2.5400569438934326, "learning_rate": 0.00010369798037768488, "loss": 2.8493, "step": 3514 }, { "epoch": 0.9781890284203569, "grad_norm": 2.4253506660461426, "learning_rate": 0.0001036542369030853, "loss": 2.9496, "step": 3515 }, { "epoch": 0.9784673183288691, "grad_norm": 2.4494292736053467, "learning_rate": 0.00010361049272830437, "loss": 2.8679, "step": 3516 }, { "epoch": 0.9787456082373813, "grad_norm": 2.2363533973693848, "learning_rate": 0.00010356674786172382, "loss": 2.8026, "step": 3517 }, { "epoch": 0.9790238981458935, "grad_norm": 2.5827839374542236, "learning_rate": 0.00010352300231172551, "loss": 3.405, "step": 3518 }, { "epoch": 0.9793021880544057, "grad_norm": 2.104947805404663, "learning_rate": 0.00010347925608669145, "loss": 2.6455, "step": 3519 }, { "epoch": 0.9795804779629179, "grad_norm": 2.5958423614501953, "learning_rate": 0.00010343550919500375, "loss": 2.9997, "step": 3520 }, { "epoch": 0.9798587678714301, "grad_norm": 2.499011754989624, "learning_rate": 0.00010339176164504474, "loss": 2.9692, "step": 3521 }, { "epoch": 0.9801370577799422, "grad_norm": 2.2796711921691895, "learning_rate": 0.00010334801344519668, "loss": 2.663, "step": 3522 }, { "epoch": 0.9804153476884544, "grad_norm": 2.489182233810425, "learning_rate": 0.00010330426460384221, "loss": 3.2549, "step": 3523 }, { "epoch": 0.9806936375969666, "grad_norm": 2.531644105911255, "learning_rate": 0.0001032605151293639, "loss": 2.8405, "step": 3524 }, { "epoch": 0.9809719275054788, "grad_norm": 2.3420217037200928, "learning_rate": 0.00010321676503014448, "loss": 2.6128, "step": 3525 }, { "epoch": 0.981250217413991, "grad_norm": 2.4011876583099365, "learning_rate": 0.00010317301431456688, "loss": 3.1994, "step": 3526 }, { "epoch": 0.9815285073225032, "grad_norm": 2.4028685092926025, "learning_rate": 0.00010312926299101404, "loss": 2.8196, "step": 3527 }, { "epoch": 0.9818067972310154, "grad_norm": 2.4062864780426025, "learning_rate": 0.0001030855110678691, "loss": 2.8253, "step": 3528 }, { "epoch": 0.9820850871395276, "grad_norm": 2.65921688079834, "learning_rate": 0.00010304175855351523, "loss": 3.0018, "step": 3529 }, { "epoch": 0.9823633770480398, "grad_norm": 2.0689477920532227, "learning_rate": 0.00010299800545633587, "loss": 2.7162, "step": 3530 }, { "epoch": 0.982641666956552, "grad_norm": 2.3299365043640137, "learning_rate": 0.00010295425178471437, "loss": 2.875, "step": 3531 }, { "epoch": 0.9829199568650642, "grad_norm": 2.1975953578948975, "learning_rate": 0.00010291049754703432, "loss": 2.8609, "step": 3532 }, { "epoch": 0.9831982467735764, "grad_norm": 2.2381927967071533, "learning_rate": 0.00010286674275167943, "loss": 2.6983, "step": 3533 }, { "epoch": 0.9834765366820886, "grad_norm": 2.5271706581115723, "learning_rate": 0.00010282298740703337, "loss": 2.6475, "step": 3534 }, { "epoch": 0.9837548265906008, "grad_norm": 2.404139995574951, "learning_rate": 0.00010277923152148013, "loss": 2.873, "step": 3535 }, { "epoch": 0.984033116499113, "grad_norm": 2.357858419418335, "learning_rate": 0.00010273547510340364, "loss": 2.7502, "step": 3536 }, { "epoch": 0.9843114064076252, "grad_norm": 2.25480055809021, "learning_rate": 0.00010269171816118794, "loss": 2.8542, "step": 3537 }, { "epoch": 0.9845896963161374, "grad_norm": 2.156928777694702, "learning_rate": 0.00010264796070321732, "loss": 2.8352, "step": 3538 }, { "epoch": 0.9848679862246495, "grad_norm": 2.1265952587127686, "learning_rate": 0.00010260420273787596, "loss": 2.7218, "step": 3539 }, { "epoch": 0.9851462761331617, "grad_norm": 2.5060293674468994, "learning_rate": 0.00010256044427354827, "loss": 2.7101, "step": 3540 }, { "epoch": 0.9854245660416739, "grad_norm": 2.3506336212158203, "learning_rate": 0.00010251668531861878, "loss": 2.6445, "step": 3541 }, { "epoch": 0.9857028559501861, "grad_norm": 2.3358585834503174, "learning_rate": 0.00010247292588147201, "loss": 2.9224, "step": 3542 }, { "epoch": 0.9859811458586983, "grad_norm": 2.3024113178253174, "learning_rate": 0.00010242916597049262, "loss": 2.7137, "step": 3543 }, { "epoch": 0.9862594357672105, "grad_norm": 2.3046987056732178, "learning_rate": 0.00010238540559406538, "loss": 2.8917, "step": 3544 }, { "epoch": 0.9865377256757227, "grad_norm": 2.615590810775757, "learning_rate": 0.0001023416447605751, "loss": 2.9021, "step": 3545 }, { "epoch": 0.9868160155842349, "grad_norm": 2.3845207691192627, "learning_rate": 0.00010229788347840677, "loss": 2.8677, "step": 3546 }, { "epoch": 0.987094305492747, "grad_norm": 2.3745594024658203, "learning_rate": 0.00010225412175594536, "loss": 2.7307, "step": 3547 }, { "epoch": 0.9873725954012592, "grad_norm": 2.154855728149414, "learning_rate": 0.00010221035960157597, "loss": 2.6082, "step": 3548 }, { "epoch": 0.9876508853097714, "grad_norm": 2.1277034282684326, "learning_rate": 0.00010216659702368378, "loss": 2.7401, "step": 3549 }, { "epoch": 0.9879291752182836, "grad_norm": 2.1559395790100098, "learning_rate": 0.00010212283403065408, "loss": 2.6309, "step": 3550 }, { "epoch": 0.9882074651267958, "grad_norm": 2.7748842239379883, "learning_rate": 0.00010207907063087223, "loss": 2.7656, "step": 3551 }, { "epoch": 0.988485755035308, "grad_norm": 2.795274257659912, "learning_rate": 0.00010203530683272363, "loss": 3.14, "step": 3552 }, { "epoch": 0.9887640449438202, "grad_norm": 2.5042433738708496, "learning_rate": 0.00010199154264459372, "loss": 3.2704, "step": 3553 }, { "epoch": 0.9890423348523324, "grad_norm": 2.2857401371002197, "learning_rate": 0.00010194777807486814, "loss": 2.7257, "step": 3554 }, { "epoch": 0.9893206247608446, "grad_norm": 2.787203073501587, "learning_rate": 0.00010190401313193256, "loss": 2.8598, "step": 3555 }, { "epoch": 0.9895989146693568, "grad_norm": 1.9844505786895752, "learning_rate": 0.0001018602478241726, "loss": 2.6898, "step": 3556 }, { "epoch": 0.989877204577869, "grad_norm": 2.403738498687744, "learning_rate": 0.00010181648215997415, "loss": 2.6947, "step": 3557 }, { "epoch": 0.9901554944863812, "grad_norm": 2.3970541954040527, "learning_rate": 0.00010177271614772302, "loss": 2.8986, "step": 3558 }, { "epoch": 0.9904337843948934, "grad_norm": 2.433960437774658, "learning_rate": 0.0001017289497958051, "loss": 2.9981, "step": 3559 }, { "epoch": 0.9907120743034056, "grad_norm": 3.0570759773254395, "learning_rate": 0.00010168518311260642, "loss": 2.6912, "step": 3560 }, { "epoch": 0.9909903642119178, "grad_norm": 2.1811139583587646, "learning_rate": 0.00010164141610651303, "loss": 2.7529, "step": 3561 }, { "epoch": 0.99126865412043, "grad_norm": 2.6839935779571533, "learning_rate": 0.000101597648785911, "loss": 2.6282, "step": 3562 }, { "epoch": 0.9915469440289422, "grad_norm": 2.157683849334717, "learning_rate": 0.00010155388115918659, "loss": 2.6186, "step": 3563 }, { "epoch": 0.9918252339374544, "grad_norm": 2.543529748916626, "learning_rate": 0.00010151011323472594, "loss": 2.829, "step": 3564 }, { "epoch": 0.9921035238459666, "grad_norm": 2.5193004608154297, "learning_rate": 0.00010146634502091537, "loss": 2.7702, "step": 3565 }, { "epoch": 0.9923818137544788, "grad_norm": 2.5317776203155518, "learning_rate": 0.00010142257652614125, "loss": 2.814, "step": 3566 }, { "epoch": 0.992660103662991, "grad_norm": 2.3581552505493164, "learning_rate": 0.00010137880775878994, "loss": 2.8386, "step": 3567 }, { "epoch": 0.9929383935715032, "grad_norm": 2.1867854595184326, "learning_rate": 0.0001013350387272479, "loss": 2.7001, "step": 3568 }, { "epoch": 0.9932166834800153, "grad_norm": 2.558553457260132, "learning_rate": 0.00010129126943990161, "loss": 2.8019, "step": 3569 }, { "epoch": 0.9934949733885275, "grad_norm": 2.5082204341888428, "learning_rate": 0.00010124749990513771, "loss": 2.805, "step": 3570 }, { "epoch": 0.9937732632970397, "grad_norm": 2.3257765769958496, "learning_rate": 0.00010120373013134267, "loss": 2.6393, "step": 3571 }, { "epoch": 0.9940515532055518, "grad_norm": 2.8094825744628906, "learning_rate": 0.00010115996012690325, "loss": 2.9302, "step": 3572 }, { "epoch": 0.994329843114064, "grad_norm": 2.636359691619873, "learning_rate": 0.00010111618990020609, "loss": 2.8701, "step": 3573 }, { "epoch": 0.9946081330225762, "grad_norm": 2.4539875984191895, "learning_rate": 0.00010107241945963785, "loss": 2.7971, "step": 3574 }, { "epoch": 0.9948864229310884, "grad_norm": 2.442060947418213, "learning_rate": 0.00010102864881358543, "loss": 2.872, "step": 3575 }, { "epoch": 0.9951647128396006, "grad_norm": 2.201049327850342, "learning_rate": 0.00010098487797043554, "loss": 2.9848, "step": 3576 }, { "epoch": 0.9954430027481128, "grad_norm": 2.342043161392212, "learning_rate": 0.00010094110693857506, "loss": 2.9098, "step": 3577 }, { "epoch": 0.995721292656625, "grad_norm": 2.060192584991455, "learning_rate": 0.00010089733572639088, "loss": 2.5572, "step": 3578 }, { "epoch": 0.9959995825651372, "grad_norm": 2.4076054096221924, "learning_rate": 0.00010085356434226994, "loss": 2.7625, "step": 3579 }, { "epoch": 0.9962778724736494, "grad_norm": 2.087200403213501, "learning_rate": 0.00010080979279459912, "loss": 2.5054, "step": 3580 }, { "epoch": 0.9965561623821616, "grad_norm": 2.457927703857422, "learning_rate": 0.00010076602109176548, "loss": 2.8075, "step": 3581 }, { "epoch": 0.9968344522906738, "grad_norm": 2.256704807281494, "learning_rate": 0.00010072224924215601, "loss": 2.7971, "step": 3582 }, { "epoch": 0.997112742199186, "grad_norm": 3.2414162158966064, "learning_rate": 0.00010067847725415769, "loss": 2.6674, "step": 3583 }, { "epoch": 0.9973910321076982, "grad_norm": 2.034857749938965, "learning_rate": 0.00010063470513615769, "loss": 2.6577, "step": 3584 }, { "epoch": 0.9976693220162104, "grad_norm": 2.1702258586883545, "learning_rate": 0.00010059093289654303, "loss": 2.4907, "step": 3585 }, { "epoch": 0.9979476119247226, "grad_norm": 2.2782490253448486, "learning_rate": 0.00010054716054370084, "loss": 2.8283, "step": 3586 }, { "epoch": 0.9982259018332348, "grad_norm": 2.5203466415405273, "learning_rate": 0.00010050338808601827, "loss": 2.8492, "step": 3587 }, { "epoch": 0.998504191741747, "grad_norm": 2.3866348266601562, "learning_rate": 0.00010045961553188248, "loss": 2.9176, "step": 3588 }, { "epoch": 0.9987824816502592, "grad_norm": 2.5512540340423584, "learning_rate": 0.0001004158428896806, "loss": 2.7893, "step": 3589 }, { "epoch": 0.9990607715587714, "grad_norm": 2.737086534500122, "learning_rate": 0.00010037207016779985, "loss": 2.9874, "step": 3590 }, { "epoch": 0.9993390614672836, "grad_norm": 2.202831983566284, "learning_rate": 0.0001003282973746275, "loss": 2.6883, "step": 3591 }, { "epoch": 0.9996173513757958, "grad_norm": 2.9956512451171875, "learning_rate": 0.00010028452451855068, "loss": 2.8314, "step": 3592 }, { "epoch": 0.999895641284308, "grad_norm": 2.298567533493042, "learning_rate": 0.00010024075160795665, "loss": 2.8097, "step": 3593 }, { "epoch": 1.0002435036699482, "grad_norm": 6.168365955352783, "learning_rate": 0.0001001969786512327, "loss": 4.8184, "step": 3594 }, { "epoch": 1.0005217935784603, "grad_norm": 2.1960833072662354, "learning_rate": 0.00010015320565676605, "loss": 2.4587, "step": 3595 }, { "epoch": 1.0008000834869726, "grad_norm": 2.397888422012329, "learning_rate": 0.00010010943263294393, "loss": 2.7209, "step": 3596 }, { "epoch": 1.0010783733954847, "grad_norm": 1.8872299194335938, "learning_rate": 0.00010006565958815364, "loss": 2.0356, "step": 3597 }, { "epoch": 1.001356663303997, "grad_norm": 2.219761848449707, "learning_rate": 0.00010002188653078244, "loss": 2.6035, "step": 3598 }, { "epoch": 1.001634953212509, "grad_norm": 2.422180414199829, "learning_rate": 9.997811346921758e-05, "loss": 2.7069, "step": 3599 }, { "epoch": 1.0019132431210214, "grad_norm": 2.084489107131958, "learning_rate": 9.993434041184637e-05, "loss": 2.6542, "step": 3600 }, { "epoch": 1.0019132431210214, "eval_loss": 2.8474695682525635, "eval_runtime": 84.587, "eval_samples_per_second": 59.111, "eval_steps_per_second": 14.778, "step": 3600 }, { "epoch": 1.0021915330295335, "grad_norm": 2.4727015495300293, "learning_rate": 9.989056736705608e-05, "loss": 2.7356, "step": 3601 }, { "epoch": 1.0024698229380458, "grad_norm": 2.1826629638671875, "learning_rate": 9.984679434323399e-05, "loss": 2.4974, "step": 3602 }, { "epoch": 1.0027481128465578, "grad_norm": 2.0469789505004883, "learning_rate": 9.98030213487673e-05, "loss": 2.3743, "step": 3603 }, { "epoch": 1.0030264027550702, "grad_norm": 2.1204702854156494, "learning_rate": 9.975924839204333e-05, "loss": 2.3914, "step": 3604 }, { "epoch": 1.0033046926635822, "grad_norm": 2.30515456199646, "learning_rate": 9.971547548144934e-05, "loss": 2.7669, "step": 3605 }, { "epoch": 1.0035829825720946, "grad_norm": 2.2537689208984375, "learning_rate": 9.96717026253725e-05, "loss": 2.2713, "step": 3606 }, { "epoch": 1.0038612724806066, "grad_norm": 2.1024792194366455, "learning_rate": 9.962792983220014e-05, "loss": 2.6992, "step": 3607 }, { "epoch": 1.004139562389119, "grad_norm": 2.6211812496185303, "learning_rate": 9.958415711031944e-05, "loss": 2.553, "step": 3608 }, { "epoch": 1.004417852297631, "grad_norm": 2.4872612953186035, "learning_rate": 9.954038446811755e-05, "loss": 2.6366, "step": 3609 }, { "epoch": 1.0046961422061433, "grad_norm": 2.327535629272461, "learning_rate": 9.949661191398175e-05, "loss": 2.7231, "step": 3610 }, { "epoch": 1.0049744321146554, "grad_norm": 2.1828579902648926, "learning_rate": 9.94528394562992e-05, "loss": 2.4099, "step": 3611 }, { "epoch": 1.0052527220231677, "grad_norm": 2.1962826251983643, "learning_rate": 9.940906710345698e-05, "loss": 2.4376, "step": 3612 }, { "epoch": 1.0055310119316798, "grad_norm": 2.220290422439575, "learning_rate": 9.936529486384234e-05, "loss": 2.3303, "step": 3613 }, { "epoch": 1.0058093018401921, "grad_norm": 2.429093599319458, "learning_rate": 9.932152274584232e-05, "loss": 2.5576, "step": 3614 }, { "epoch": 1.0060875917487042, "grad_norm": 2.545867681503296, "learning_rate": 9.927775075784403e-05, "loss": 2.5247, "step": 3615 }, { "epoch": 1.0063658816572163, "grad_norm": 2.177238702774048, "learning_rate": 9.923397890823453e-05, "loss": 2.2422, "step": 3616 }, { "epoch": 1.0066441715657286, "grad_norm": 2.516214370727539, "learning_rate": 9.91902072054009e-05, "loss": 2.6979, "step": 3617 }, { "epoch": 1.0069224614742407, "grad_norm": 2.583171844482422, "learning_rate": 9.914643565773008e-05, "loss": 2.4948, "step": 3618 }, { "epoch": 1.007200751382753, "grad_norm": 2.1928822994232178, "learning_rate": 9.910266427360913e-05, "loss": 2.4619, "step": 3619 }, { "epoch": 1.007479041291265, "grad_norm": 2.3109724521636963, "learning_rate": 9.905889306142497e-05, "loss": 2.4164, "step": 3620 }, { "epoch": 1.0077573311997774, "grad_norm": 2.3388402462005615, "learning_rate": 9.901512202956447e-05, "loss": 2.5941, "step": 3621 }, { "epoch": 1.0080356211082895, "grad_norm": 2.36480450630188, "learning_rate": 9.89713511864146e-05, "loss": 2.5486, "step": 3622 }, { "epoch": 1.0083139110168018, "grad_norm": 2.3684072494506836, "learning_rate": 9.892758054036216e-05, "loss": 2.6478, "step": 3623 }, { "epoch": 1.0085922009253139, "grad_norm": 2.2883782386779785, "learning_rate": 9.888381009979394e-05, "loss": 2.3874, "step": 3624 }, { "epoch": 1.0088704908338262, "grad_norm": 2.400454521179199, "learning_rate": 9.884003987309676e-05, "loss": 2.6688, "step": 3625 }, { "epoch": 1.0091487807423383, "grad_norm": 2.411891222000122, "learning_rate": 9.879626986865735e-05, "loss": 2.8327, "step": 3626 }, { "epoch": 1.0094270706508506, "grad_norm": 2.746245861053467, "learning_rate": 9.875250009486232e-05, "loss": 2.7147, "step": 3627 }, { "epoch": 1.0097053605593627, "grad_norm": 2.195439577102661, "learning_rate": 9.870873056009841e-05, "loss": 2.2958, "step": 3628 }, { "epoch": 1.009983650467875, "grad_norm": 2.4186742305755615, "learning_rate": 9.866496127275216e-05, "loss": 2.3944, "step": 3629 }, { "epoch": 1.010261940376387, "grad_norm": 2.4659276008605957, "learning_rate": 9.862119224121011e-05, "loss": 2.8833, "step": 3630 }, { "epoch": 1.0105402302848994, "grad_norm": 3.256303071975708, "learning_rate": 9.85774234738588e-05, "loss": 2.5439, "step": 3631 }, { "epoch": 1.0108185201934115, "grad_norm": 2.459998369216919, "learning_rate": 9.85336549790847e-05, "loss": 2.4565, "step": 3632 }, { "epoch": 1.0110968101019238, "grad_norm": 2.466240167617798, "learning_rate": 9.848988676527411e-05, "loss": 2.4005, "step": 3633 }, { "epoch": 1.0113751000104358, "grad_norm": 2.684922218322754, "learning_rate": 9.844611884081348e-05, "loss": 2.4699, "step": 3634 }, { "epoch": 1.0116533899189482, "grad_norm": 2.9135866165161133, "learning_rate": 9.840235121408902e-05, "loss": 3.0818, "step": 3635 }, { "epoch": 1.0119316798274602, "grad_norm": 2.5571932792663574, "learning_rate": 9.835858389348701e-05, "loss": 2.6709, "step": 3636 }, { "epoch": 1.0122099697359725, "grad_norm": 2.159505605697632, "learning_rate": 9.831481688739362e-05, "loss": 2.3602, "step": 3637 }, { "epoch": 1.0124882596444846, "grad_norm": 2.178410291671753, "learning_rate": 9.827105020419494e-05, "loss": 2.3776, "step": 3638 }, { "epoch": 1.012766549552997, "grad_norm": 2.4218437671661377, "learning_rate": 9.8227283852277e-05, "loss": 2.1694, "step": 3639 }, { "epoch": 1.013044839461509, "grad_norm": 3.1513845920562744, "learning_rate": 9.818351784002586e-05, "loss": 2.9845, "step": 3640 }, { "epoch": 1.0133231293700211, "grad_norm": 2.8680331707000732, "learning_rate": 9.813975217582739e-05, "loss": 2.5105, "step": 3641 }, { "epoch": 1.0136014192785334, "grad_norm": 2.3797199726104736, "learning_rate": 9.809598686806746e-05, "loss": 2.4647, "step": 3642 }, { "epoch": 1.0138797091870455, "grad_norm": 2.403562068939209, "learning_rate": 9.805222192513184e-05, "loss": 2.6245, "step": 3643 }, { "epoch": 1.0141579990955578, "grad_norm": 2.2409002780914307, "learning_rate": 9.800845735540627e-05, "loss": 2.393, "step": 3644 }, { "epoch": 1.01443628900407, "grad_norm": 2.2305660247802734, "learning_rate": 9.796469316727641e-05, "loss": 2.5389, "step": 3645 }, { "epoch": 1.0147145789125822, "grad_norm": 2.433889389038086, "learning_rate": 9.792092936912777e-05, "loss": 2.7513, "step": 3646 }, { "epoch": 1.0149928688210943, "grad_norm": 2.397838830947876, "learning_rate": 9.78771659693459e-05, "loss": 2.7513, "step": 3647 }, { "epoch": 1.0152711587296066, "grad_norm": 2.5803353786468506, "learning_rate": 9.783340297631623e-05, "loss": 2.5487, "step": 3648 }, { "epoch": 1.0155494486381187, "grad_norm": 2.5709424018859863, "learning_rate": 9.778964039842404e-05, "loss": 2.4367, "step": 3649 }, { "epoch": 1.015827738546631, "grad_norm": 2.5421626567840576, "learning_rate": 9.774587824405466e-05, "loss": 3.0246, "step": 3650 }, { "epoch": 1.016106028455143, "grad_norm": 2.4680778980255127, "learning_rate": 9.770211652159327e-05, "loss": 2.4571, "step": 3651 }, { "epoch": 1.0163843183636554, "grad_norm": 2.4402754306793213, "learning_rate": 9.76583552394249e-05, "loss": 2.3657, "step": 3652 }, { "epoch": 1.0166626082721675, "grad_norm": 2.582365036010742, "learning_rate": 9.761459440593466e-05, "loss": 2.6948, "step": 3653 }, { "epoch": 1.0169408981806798, "grad_norm": 2.3168914318084717, "learning_rate": 9.757083402950742e-05, "loss": 2.3346, "step": 3654 }, { "epoch": 1.0172191880891919, "grad_norm": 2.390580654144287, "learning_rate": 9.752707411852802e-05, "loss": 2.7386, "step": 3655 }, { "epoch": 1.0174974779977042, "grad_norm": 2.624013900756836, "learning_rate": 9.748331468138124e-05, "loss": 2.4795, "step": 3656 }, { "epoch": 1.0177757679062163, "grad_norm": 2.4292471408843994, "learning_rate": 9.743955572645174e-05, "loss": 2.7798, "step": 3657 }, { "epoch": 1.0180540578147286, "grad_norm": 2.503296136856079, "learning_rate": 9.739579726212406e-05, "loss": 2.6261, "step": 3658 }, { "epoch": 1.0183323477232407, "grad_norm": 2.2126271724700928, "learning_rate": 9.735203929678272e-05, "loss": 2.2868, "step": 3659 }, { "epoch": 1.018610637631753, "grad_norm": 2.1973748207092285, "learning_rate": 9.730828183881208e-05, "loss": 2.4527, "step": 3660 }, { "epoch": 1.018888927540265, "grad_norm": 2.435417652130127, "learning_rate": 9.726452489659638e-05, "loss": 2.5638, "step": 3661 }, { "epoch": 1.0191672174487774, "grad_norm": 2.363333225250244, "learning_rate": 9.722076847851988e-05, "loss": 2.4653, "step": 3662 }, { "epoch": 1.0194455073572894, "grad_norm": 2.7161386013031006, "learning_rate": 9.717701259296665e-05, "loss": 2.4748, "step": 3663 }, { "epoch": 1.0197237972658018, "grad_norm": 2.3049778938293457, "learning_rate": 9.713325724832059e-05, "loss": 2.5271, "step": 3664 }, { "epoch": 1.0200020871743138, "grad_norm": 2.264054775238037, "learning_rate": 9.708950245296569e-05, "loss": 2.4146, "step": 3665 }, { "epoch": 1.020280377082826, "grad_norm": 2.786287784576416, "learning_rate": 9.704574821528566e-05, "loss": 2.6383, "step": 3666 }, { "epoch": 1.0205586669913382, "grad_norm": 2.3080055713653564, "learning_rate": 9.700199454366415e-05, "loss": 2.5227, "step": 3667 }, { "epoch": 1.0208369568998503, "grad_norm": 2.4426419734954834, "learning_rate": 9.695824144648478e-05, "loss": 2.4908, "step": 3668 }, { "epoch": 1.0211152468083626, "grad_norm": 2.478846788406372, "learning_rate": 9.691448893213095e-05, "loss": 2.5778, "step": 3669 }, { "epoch": 1.0213935367168747, "grad_norm": 2.3766379356384277, "learning_rate": 9.687073700898598e-05, "loss": 2.5546, "step": 3670 }, { "epoch": 1.021671826625387, "grad_norm": 2.3187801837921143, "learning_rate": 9.682698568543317e-05, "loss": 2.5168, "step": 3671 }, { "epoch": 1.0219501165338991, "grad_norm": 2.4878695011138916, "learning_rate": 9.678323496985557e-05, "loss": 2.7055, "step": 3672 }, { "epoch": 1.0222284064424114, "grad_norm": 2.6090481281280518, "learning_rate": 9.673948487063614e-05, "loss": 2.6359, "step": 3673 }, { "epoch": 1.0225066963509235, "grad_norm": 2.3619978427886963, "learning_rate": 9.669573539615782e-05, "loss": 2.4851, "step": 3674 }, { "epoch": 1.0227849862594358, "grad_norm": 2.271355390548706, "learning_rate": 9.665198655480334e-05, "loss": 2.7666, "step": 3675 }, { "epoch": 1.023063276167948, "grad_norm": 2.36423659324646, "learning_rate": 9.660823835495531e-05, "loss": 2.3988, "step": 3676 }, { "epoch": 1.0233415660764602, "grad_norm": 2.4783449172973633, "learning_rate": 9.656449080499627e-05, "loss": 2.4864, "step": 3677 }, { "epoch": 1.0236198559849723, "grad_norm": 2.3957090377807617, "learning_rate": 9.65207439133086e-05, "loss": 2.3701, "step": 3678 }, { "epoch": 1.0238981458934846, "grad_norm": 2.900683879852295, "learning_rate": 9.64769976882745e-05, "loss": 2.5155, "step": 3679 }, { "epoch": 1.0241764358019967, "grad_norm": 2.5048580169677734, "learning_rate": 9.643325213827619e-05, "loss": 2.5991, "step": 3680 }, { "epoch": 1.024454725710509, "grad_norm": 2.4622344970703125, "learning_rate": 9.638950727169564e-05, "loss": 2.5138, "step": 3681 }, { "epoch": 1.024733015619021, "grad_norm": 2.480517625808716, "learning_rate": 9.63457630969147e-05, "loss": 2.6391, "step": 3682 }, { "epoch": 1.0250113055275334, "grad_norm": 2.688997268676758, "learning_rate": 9.630201962231511e-05, "loss": 2.3735, "step": 3683 }, { "epoch": 1.0252895954360455, "grad_norm": 2.426302909851074, "learning_rate": 9.62582768562785e-05, "loss": 2.2221, "step": 3684 }, { "epoch": 1.0255678853445578, "grad_norm": 2.492079496383667, "learning_rate": 9.621453480718634e-05, "loss": 2.9364, "step": 3685 }, { "epoch": 1.0258461752530699, "grad_norm": 2.756873846054077, "learning_rate": 9.61707934834199e-05, "loss": 2.6593, "step": 3686 }, { "epoch": 1.0261244651615822, "grad_norm": 2.2915287017822266, "learning_rate": 9.612705289336045e-05, "loss": 2.3577, "step": 3687 }, { "epoch": 1.0264027550700943, "grad_norm": 2.240419387817383, "learning_rate": 9.6083313045389e-05, "loss": 2.4756, "step": 3688 }, { "epoch": 1.0266810449786066, "grad_norm": 2.4698326587677, "learning_rate": 9.603957394788644e-05, "loss": 2.5557, "step": 3689 }, { "epoch": 1.0269593348871187, "grad_norm": 2.5737481117248535, "learning_rate": 9.599583560923359e-05, "loss": 2.6166, "step": 3690 }, { "epoch": 1.0272376247956307, "grad_norm": 2.226649761199951, "learning_rate": 9.595209803781102e-05, "loss": 2.4705, "step": 3691 }, { "epoch": 1.027515914704143, "grad_norm": 2.4698619842529297, "learning_rate": 9.590836124199921e-05, "loss": 2.4206, "step": 3692 }, { "epoch": 1.0277942046126551, "grad_norm": 2.4154977798461914, "learning_rate": 9.586462523017852e-05, "loss": 2.6235, "step": 3693 }, { "epoch": 1.0280724945211674, "grad_norm": 2.2395994663238525, "learning_rate": 9.58208900107291e-05, "loss": 2.5879, "step": 3694 }, { "epoch": 1.0283507844296795, "grad_norm": 2.263139486312866, "learning_rate": 9.577715559203095e-05, "loss": 2.794, "step": 3695 }, { "epoch": 1.0286290743381918, "grad_norm": 2.4009816646575928, "learning_rate": 9.573342198246397e-05, "loss": 2.8207, "step": 3696 }, { "epoch": 1.028907364246704, "grad_norm": 2.4740641117095947, "learning_rate": 9.568968919040787e-05, "loss": 2.7345, "step": 3697 }, { "epoch": 1.0291856541552162, "grad_norm": 2.556521415710449, "learning_rate": 9.564595722424217e-05, "loss": 2.6085, "step": 3698 }, { "epoch": 1.0294639440637283, "grad_norm": 2.379560708999634, "learning_rate": 9.560222609234633e-05, "loss": 2.4874, "step": 3699 }, { "epoch": 1.0297422339722406, "grad_norm": 2.2581701278686523, "learning_rate": 9.555849580309954e-05, "loss": 2.4456, "step": 3700 }, { "epoch": 1.0297422339722406, "eval_loss": 2.8660240173339844, "eval_runtime": 84.4219, "eval_samples_per_second": 59.226, "eval_steps_per_second": 14.807, "step": 3700 }, { "epoch": 1.0300205238807527, "grad_norm": 2.34928560256958, "learning_rate": 9.551476636488089e-05, "loss": 2.4443, "step": 3701 }, { "epoch": 1.030298813789265, "grad_norm": 2.5956695079803467, "learning_rate": 9.547103778606931e-05, "loss": 2.798, "step": 3702 }, { "epoch": 1.030577103697777, "grad_norm": 2.6470494270324707, "learning_rate": 9.542731007504356e-05, "loss": 2.7361, "step": 3703 }, { "epoch": 1.0308553936062894, "grad_norm": 2.35915207862854, "learning_rate": 9.538358324018215e-05, "loss": 2.4667, "step": 3704 }, { "epoch": 1.0311336835148015, "grad_norm": 2.26962947845459, "learning_rate": 9.533985728986359e-05, "loss": 2.5105, "step": 3705 }, { "epoch": 1.0314119734233138, "grad_norm": 2.273303747177124, "learning_rate": 9.529613223246607e-05, "loss": 2.7761, "step": 3706 }, { "epoch": 1.031690263331826, "grad_norm": 2.518406867980957, "learning_rate": 9.525240807636766e-05, "loss": 2.6669, "step": 3707 }, { "epoch": 1.0319685532403382, "grad_norm": 2.45633602142334, "learning_rate": 9.52086848299463e-05, "loss": 2.5069, "step": 3708 }, { "epoch": 1.0322468431488503, "grad_norm": 3.081700086593628, "learning_rate": 9.51649625015797e-05, "loss": 2.6667, "step": 3709 }, { "epoch": 1.0325251330573626, "grad_norm": 2.3902275562286377, "learning_rate": 9.512124109964539e-05, "loss": 2.5777, "step": 3710 }, { "epoch": 1.0328034229658747, "grad_norm": 2.087824583053589, "learning_rate": 9.507752063252078e-05, "loss": 2.4125, "step": 3711 }, { "epoch": 1.033081712874387, "grad_norm": 2.584324836730957, "learning_rate": 9.503380110858304e-05, "loss": 2.596, "step": 3712 }, { "epoch": 1.033360002782899, "grad_norm": 2.4409570693969727, "learning_rate": 9.499008253620919e-05, "loss": 2.5021, "step": 3713 }, { "epoch": 1.0336382926914114, "grad_norm": 2.357409954071045, "learning_rate": 9.494636492377607e-05, "loss": 2.4717, "step": 3714 }, { "epoch": 1.0339165825999235, "grad_norm": 2.637960195541382, "learning_rate": 9.490264827966033e-05, "loss": 2.7572, "step": 3715 }, { "epoch": 1.0341948725084356, "grad_norm": 2.5555248260498047, "learning_rate": 9.485893261223842e-05, "loss": 2.6943, "step": 3716 }, { "epoch": 1.0344731624169479, "grad_norm": 2.3676626682281494, "learning_rate": 9.481521792988664e-05, "loss": 2.7945, "step": 3717 }, { "epoch": 1.03475145232546, "grad_norm": 2.2306549549102783, "learning_rate": 9.477150424098105e-05, "loss": 2.3112, "step": 3718 }, { "epoch": 1.0350297422339723, "grad_norm": 2.835977792739868, "learning_rate": 9.47277915538975e-05, "loss": 2.8379, "step": 3719 }, { "epoch": 1.0353080321424843, "grad_norm": 2.4739816188812256, "learning_rate": 9.468407987701179e-05, "loss": 2.6375, "step": 3720 }, { "epoch": 1.0355863220509967, "grad_norm": 2.7540197372436523, "learning_rate": 9.464036921869941e-05, "loss": 2.416, "step": 3721 }, { "epoch": 1.0358646119595087, "grad_norm": 2.581390142440796, "learning_rate": 9.459665958733565e-05, "loss": 2.3666, "step": 3722 }, { "epoch": 1.036142901868021, "grad_norm": 2.3293237686157227, "learning_rate": 9.455295099129563e-05, "loss": 2.5112, "step": 3723 }, { "epoch": 1.0364211917765331, "grad_norm": 2.709442377090454, "learning_rate": 9.450924343895428e-05, "loss": 3.0548, "step": 3724 }, { "epoch": 1.0366994816850454, "grad_norm": 2.5723798274993896, "learning_rate": 9.446553693868633e-05, "loss": 2.7705, "step": 3725 }, { "epoch": 1.0369777715935575, "grad_norm": 2.4315478801727295, "learning_rate": 9.442183149886627e-05, "loss": 2.7446, "step": 3726 }, { "epoch": 1.0372560615020698, "grad_norm": 2.1737117767333984, "learning_rate": 9.437812712786844e-05, "loss": 2.534, "step": 3727 }, { "epoch": 1.037534351410582, "grad_norm": 2.1455442905426025, "learning_rate": 9.433442383406696e-05, "loss": 2.3731, "step": 3728 }, { "epoch": 1.0378126413190942, "grad_norm": 2.4659476280212402, "learning_rate": 9.429072162583567e-05, "loss": 2.3374, "step": 3729 }, { "epoch": 1.0380909312276063, "grad_norm": 2.5009284019470215, "learning_rate": 9.424702051154836e-05, "loss": 2.6745, "step": 3730 }, { "epoch": 1.0383692211361186, "grad_norm": 2.334843158721924, "learning_rate": 9.420332049957846e-05, "loss": 2.3093, "step": 3731 }, { "epoch": 1.0386475110446307, "grad_norm": 2.483473539352417, "learning_rate": 9.415962159829926e-05, "loss": 2.4984, "step": 3732 }, { "epoch": 1.038925800953143, "grad_norm": 2.4551875591278076, "learning_rate": 9.411592381608381e-05, "loss": 2.423, "step": 3733 }, { "epoch": 1.039204090861655, "grad_norm": 2.400089740753174, "learning_rate": 9.407222716130499e-05, "loss": 2.0407, "step": 3734 }, { "epoch": 1.0394823807701674, "grad_norm": 2.617560863494873, "learning_rate": 9.402853164233538e-05, "loss": 2.8515, "step": 3735 }, { "epoch": 1.0397606706786795, "grad_norm": 2.454972505569458, "learning_rate": 9.398483726754746e-05, "loss": 2.3012, "step": 3736 }, { "epoch": 1.0400389605871918, "grad_norm": 2.8407280445098877, "learning_rate": 9.394114404531338e-05, "loss": 2.7096, "step": 3737 }, { "epoch": 1.040317250495704, "grad_norm": 2.6554818153381348, "learning_rate": 9.389745198400513e-05, "loss": 2.7251, "step": 3738 }, { "epoch": 1.0405955404042162, "grad_norm": 2.69405460357666, "learning_rate": 9.385376109199448e-05, "loss": 2.3692, "step": 3739 }, { "epoch": 1.0408738303127283, "grad_norm": 2.626368761062622, "learning_rate": 9.381007137765292e-05, "loss": 2.6053, "step": 3740 }, { "epoch": 1.0411521202212404, "grad_norm": 2.5650789737701416, "learning_rate": 9.376638284935176e-05, "loss": 2.7401, "step": 3741 }, { "epoch": 1.0414304101297527, "grad_norm": 2.240908622741699, "learning_rate": 9.372269551546211e-05, "loss": 2.4553, "step": 3742 }, { "epoch": 1.0417087000382648, "grad_norm": 2.357799530029297, "learning_rate": 9.367900938435479e-05, "loss": 2.818, "step": 3743 }, { "epoch": 1.041986989946777, "grad_norm": 2.7151036262512207, "learning_rate": 9.36353244644004e-05, "loss": 2.7159, "step": 3744 }, { "epoch": 1.0422652798552892, "grad_norm": 2.4492530822753906, "learning_rate": 9.359164076396937e-05, "loss": 2.4194, "step": 3745 }, { "epoch": 1.0425435697638015, "grad_norm": 2.7243385314941406, "learning_rate": 9.354795829143182e-05, "loss": 2.739, "step": 3746 }, { "epoch": 1.0428218596723136, "grad_norm": 2.5599193572998047, "learning_rate": 9.350427705515766e-05, "loss": 2.6885, "step": 3747 }, { "epoch": 1.0431001495808259, "grad_norm": 2.4527359008789062, "learning_rate": 9.346059706351659e-05, "loss": 2.5548, "step": 3748 }, { "epoch": 1.043378439489338, "grad_norm": 2.3975369930267334, "learning_rate": 9.341691832487804e-05, "loss": 2.4877, "step": 3749 }, { "epoch": 1.0436567293978503, "grad_norm": 2.267547369003296, "learning_rate": 9.337324084761118e-05, "loss": 2.2493, "step": 3750 }, { "epoch": 1.0439350193063623, "grad_norm": 2.2563583850860596, "learning_rate": 9.332956464008503e-05, "loss": 2.4411, "step": 3751 }, { "epoch": 1.0442133092148747, "grad_norm": 2.401918649673462, "learning_rate": 9.328588971066827e-05, "loss": 2.8112, "step": 3752 }, { "epoch": 1.0444915991233867, "grad_norm": 2.5725367069244385, "learning_rate": 9.324221606772935e-05, "loss": 2.5735, "step": 3753 }, { "epoch": 1.044769889031899, "grad_norm": 2.5067310333251953, "learning_rate": 9.319854371963653e-05, "loss": 2.4901, "step": 3754 }, { "epoch": 1.0450481789404111, "grad_norm": 2.332139730453491, "learning_rate": 9.315487267475777e-05, "loss": 2.3954, "step": 3755 }, { "epoch": 1.0453264688489234, "grad_norm": 2.5820467472076416, "learning_rate": 9.311120294146078e-05, "loss": 2.685, "step": 3756 }, { "epoch": 1.0456047587574355, "grad_norm": 2.7918787002563477, "learning_rate": 9.306753452811308e-05, "loss": 2.8996, "step": 3757 }, { "epoch": 1.0458830486659478, "grad_norm": 2.6018383502960205, "learning_rate": 9.302386744308185e-05, "loss": 2.6813, "step": 3758 }, { "epoch": 1.04616133857446, "grad_norm": 2.2214136123657227, "learning_rate": 9.298020169473402e-05, "loss": 2.5446, "step": 3759 }, { "epoch": 1.0464396284829722, "grad_norm": 2.195864677429199, "learning_rate": 9.293653729143636e-05, "loss": 2.3378, "step": 3760 }, { "epoch": 1.0467179183914843, "grad_norm": 2.4155166149139404, "learning_rate": 9.289287424155536e-05, "loss": 2.5393, "step": 3761 }, { "epoch": 1.0469962082999966, "grad_norm": 2.280897617340088, "learning_rate": 9.284921255345715e-05, "loss": 2.472, "step": 3762 }, { "epoch": 1.0472744982085087, "grad_norm": 2.2828261852264404, "learning_rate": 9.280555223550767e-05, "loss": 2.4396, "step": 3763 }, { "epoch": 1.047552788117021, "grad_norm": 2.5520870685577393, "learning_rate": 9.27618932960726e-05, "loss": 2.5667, "step": 3764 }, { "epoch": 1.047831078025533, "grad_norm": 2.394723892211914, "learning_rate": 9.271823574351736e-05, "loss": 2.5952, "step": 3765 }, { "epoch": 1.0481093679340452, "grad_norm": 2.639838695526123, "learning_rate": 9.267457958620703e-05, "loss": 2.7062, "step": 3766 }, { "epoch": 1.0483876578425575, "grad_norm": 2.5440590381622314, "learning_rate": 9.263092483250657e-05, "loss": 2.5498, "step": 3767 }, { "epoch": 1.0486659477510696, "grad_norm": 2.683584213256836, "learning_rate": 9.258727149078055e-05, "loss": 2.6906, "step": 3768 }, { "epoch": 1.048944237659582, "grad_norm": 2.7254245281219482, "learning_rate": 9.254361956939327e-05, "loss": 2.6168, "step": 3769 }, { "epoch": 1.049222527568094, "grad_norm": 2.4922702312469482, "learning_rate": 9.249996907670881e-05, "loss": 2.7162, "step": 3770 }, { "epoch": 1.0495008174766063, "grad_norm": 2.6056606769561768, "learning_rate": 9.245632002109099e-05, "loss": 2.4848, "step": 3771 }, { "epoch": 1.0497791073851184, "grad_norm": 2.89192271232605, "learning_rate": 9.241267241090327e-05, "loss": 2.6764, "step": 3772 }, { "epoch": 1.0500573972936307, "grad_norm": 2.1888632774353027, "learning_rate": 9.236902625450893e-05, "loss": 2.3813, "step": 3773 }, { "epoch": 1.0503356872021428, "grad_norm": 2.4971957206726074, "learning_rate": 9.232538156027091e-05, "loss": 2.3947, "step": 3774 }, { "epoch": 1.050613977110655, "grad_norm": 2.0770082473754883, "learning_rate": 9.228173833655186e-05, "loss": 2.0168, "step": 3775 }, { "epoch": 1.0508922670191672, "grad_norm": 2.418344497680664, "learning_rate": 9.223809659171423e-05, "loss": 2.7327, "step": 3776 }, { "epoch": 1.0511705569276795, "grad_norm": 2.1940460205078125, "learning_rate": 9.21944563341201e-05, "loss": 2.3535, "step": 3777 }, { "epoch": 1.0514488468361916, "grad_norm": 2.1863420009613037, "learning_rate": 9.215081757213127e-05, "loss": 2.357, "step": 3778 }, { "epoch": 1.0517271367447039, "grad_norm": 2.4358723163604736, "learning_rate": 9.210718031410934e-05, "loss": 2.5809, "step": 3779 }, { "epoch": 1.052005426653216, "grad_norm": 2.3010590076446533, "learning_rate": 9.206354456841551e-05, "loss": 2.4569, "step": 3780 }, { "epoch": 1.0522837165617283, "grad_norm": 2.511343002319336, "learning_rate": 9.201991034341075e-05, "loss": 2.6625, "step": 3781 }, { "epoch": 1.0525620064702403, "grad_norm": 2.421273708343506, "learning_rate": 9.197627764745577e-05, "loss": 2.472, "step": 3782 }, { "epoch": 1.0528402963787526, "grad_norm": 2.5087594985961914, "learning_rate": 9.193264648891091e-05, "loss": 2.462, "step": 3783 }, { "epoch": 1.0531185862872647, "grad_norm": 2.3734774589538574, "learning_rate": 9.188901687613624e-05, "loss": 2.6545, "step": 3784 }, { "epoch": 1.053396876195777, "grad_norm": 2.4143643379211426, "learning_rate": 9.18453888174916e-05, "loss": 2.4521, "step": 3785 }, { "epoch": 1.0536751661042891, "grad_norm": 2.7103075981140137, "learning_rate": 9.180176232133647e-05, "loss": 2.9208, "step": 3786 }, { "epoch": 1.0539534560128014, "grad_norm": 2.6463193893432617, "learning_rate": 9.175813739602996e-05, "loss": 2.7337, "step": 3787 }, { "epoch": 1.0542317459213135, "grad_norm": 2.1172423362731934, "learning_rate": 9.171451404993105e-05, "loss": 2.3709, "step": 3788 }, { "epoch": 1.0545100358298258, "grad_norm": 2.3747429847717285, "learning_rate": 9.167089229139829e-05, "loss": 2.469, "step": 3789 }, { "epoch": 1.054788325738338, "grad_norm": 2.7493515014648438, "learning_rate": 9.162727212878996e-05, "loss": 2.7483, "step": 3790 }, { "epoch": 1.05506661564685, "grad_norm": 2.1751036643981934, "learning_rate": 9.158365357046405e-05, "loss": 2.3784, "step": 3791 }, { "epoch": 1.0553449055553623, "grad_norm": 2.5898613929748535, "learning_rate": 9.154003662477821e-05, "loss": 2.4678, "step": 3792 }, { "epoch": 1.0556231954638744, "grad_norm": 2.432840347290039, "learning_rate": 9.14964213000898e-05, "loss": 2.2535, "step": 3793 }, { "epoch": 1.0559014853723867, "grad_norm": 2.558281183242798, "learning_rate": 9.14528076047559e-05, "loss": 2.8376, "step": 3794 }, { "epoch": 1.0561797752808988, "grad_norm": 2.4376842975616455, "learning_rate": 9.140919554713323e-05, "loss": 2.5847, "step": 3795 }, { "epoch": 1.056458065189411, "grad_norm": 2.5773372650146484, "learning_rate": 9.136558513557818e-05, "loss": 2.3371, "step": 3796 }, { "epoch": 1.0567363550979232, "grad_norm": 2.790679454803467, "learning_rate": 9.132197637844691e-05, "loss": 2.5507, "step": 3797 }, { "epoch": 1.0570146450064355, "grad_norm": 2.5002150535583496, "learning_rate": 9.12783692840952e-05, "loss": 2.6795, "step": 3798 }, { "epoch": 1.0572929349149476, "grad_norm": 2.6883461475372314, "learning_rate": 9.123476386087844e-05, "loss": 2.6772, "step": 3799 }, { "epoch": 1.0575712248234599, "grad_norm": 2.8790204524993896, "learning_rate": 9.119116011715189e-05, "loss": 2.5414, "step": 3800 }, { "epoch": 1.0575712248234599, "eval_loss": 2.8708136081695557, "eval_runtime": 84.6585, "eval_samples_per_second": 59.061, "eval_steps_per_second": 14.765, "step": 3800 } ], "logging_steps": 1, "max_steps": 7187, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.73962634313728e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }