diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,280033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5197817566349329, + "eval_steps": 500, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.2994543915873322e-05, + "grad_norm": 0.7394225001335144, + "learning_rate": 1.25e-05, + "loss": 2.0735, + "step": 1 + }, + { + "epoch": 2.5989087831746645e-05, + "grad_norm": 1.3883064985275269, + "learning_rate": 2.5e-05, + "loss": 2.7196, + "step": 2 + }, + { + "epoch": 3.8983631747619965e-05, + "grad_norm": 1.988787055015564, + "learning_rate": 3.7500000000000003e-05, + "loss": 3.517, + "step": 3 + }, + { + "epoch": 5.197817566349329e-05, + "grad_norm": 0.7598587274551392, + "learning_rate": 5e-05, + "loss": 2.0302, + "step": 4 + }, + { + "epoch": 6.497271957936662e-05, + "grad_norm": 0.9281490445137024, + "learning_rate": 6.25e-05, + "loss": 2.5406, + "step": 5 + }, + { + "epoch": 7.796726349523993e-05, + "grad_norm": 1.0694752931594849, + "learning_rate": 7.500000000000001e-05, + "loss": 2.7612, + "step": 6 + }, + { + "epoch": 9.096180741111325e-05, + "grad_norm": 0.8068076968193054, + "learning_rate": 8.75e-05, + "loss": 2.6643, + "step": 7 + }, + { + "epoch": 0.00010395635132698658, + "grad_norm": 2.041363000869751, + "learning_rate": 0.0001, + "loss": 3.1038, + "step": 8 + }, + { + "epoch": 0.0001169508952428599, + "grad_norm": 1.9163001775741577, + "learning_rate": 0.00011250000000000001, + "loss": 2.7784, + "step": 9 + }, + { + "epoch": 0.00012994543915873324, + "grad_norm": 0.7601706385612488, + "learning_rate": 0.000125, + "loss": 2.0801, + "step": 10 + }, + { + "epoch": 0.00014293998307460655, + "grad_norm": 0.6098636388778687, + "learning_rate": 0.0001375, + "loss": 1.8692, + "step": 11 + }, + { + "epoch": 0.00015593452699047986, + "grad_norm": 2.0739760398864746, + "learning_rate": 0.00015000000000000001, + "loss": 2.4924, + "step": 12 + }, + { + "epoch": 0.0001689290709063532, + "grad_norm": 0.5784039497375488, + "learning_rate": 0.00016250000000000002, + "loss": 1.8504, + "step": 13 + }, + { + "epoch": 0.0001819236148222265, + "grad_norm": 0.9400786757469177, + "learning_rate": 0.000175, + "loss": 1.9691, + "step": 14 + }, + { + "epoch": 0.00019491815873809985, + "grad_norm": 0.9181386232376099, + "learning_rate": 0.0001875, + "loss": 1.9537, + "step": 15 + }, + { + "epoch": 0.00020791270265397316, + "grad_norm": 0.8861708641052246, + "learning_rate": 0.0002, + "loss": 2.1486, + "step": 16 + }, + { + "epoch": 0.0002209072465698465, + "grad_norm": 1.134639859199524, + "learning_rate": 0.0001999974005380886, + "loss": 2.1537, + "step": 17 + }, + { + "epoch": 0.0002339017904857198, + "grad_norm": 0.813442587852478, + "learning_rate": 0.00019999480107617726, + "loss": 2.0054, + "step": 18 + }, + { + "epoch": 0.00024689633440159314, + "grad_norm": 0.8912744522094727, + "learning_rate": 0.00019999220161426586, + "loss": 1.8618, + "step": 19 + }, + { + "epoch": 0.0002598908783174665, + "grad_norm": 0.5203714966773987, + "learning_rate": 0.00019998960215235448, + "loss": 1.6886, + "step": 20 + }, + { + "epoch": 0.00027288542223333976, + "grad_norm": 0.6847788691520691, + "learning_rate": 0.00019998700269044308, + "loss": 1.8225, + "step": 21 + }, + { + "epoch": 0.0002858799661492131, + "grad_norm": 0.5748642683029175, + "learning_rate": 0.0001999844032285317, + "loss": 1.9526, + "step": 22 + }, + { + "epoch": 0.00029887451006508644, + "grad_norm": 0.501183271408081, + "learning_rate": 0.00019998180376662033, + "loss": 1.7452, + "step": 23 + }, + { + "epoch": 0.0003118690539809597, + "grad_norm": 0.703778088092804, + "learning_rate": 0.00019997920430470892, + "loss": 1.8431, + "step": 24 + }, + { + "epoch": 0.00032486359789683306, + "grad_norm": 0.5046797394752502, + "learning_rate": 0.00019997660484279755, + "loss": 1.8238, + "step": 25 + }, + { + "epoch": 0.0003378581418127064, + "grad_norm": 0.7188125848770142, + "learning_rate": 0.00019997400538088617, + "loss": 1.9455, + "step": 26 + }, + { + "epoch": 0.00035085268572857973, + "grad_norm": 0.5912610292434692, + "learning_rate": 0.0001999714059189748, + "loss": 1.6048, + "step": 27 + }, + { + "epoch": 0.000363847229644453, + "grad_norm": 0.588297426700592, + "learning_rate": 0.0001999688064570634, + "loss": 1.809, + "step": 28 + }, + { + "epoch": 0.00037684177356032636, + "grad_norm": 0.7199362516403198, + "learning_rate": 0.000199966206995152, + "loss": 1.5913, + "step": 29 + }, + { + "epoch": 0.0003898363174761997, + "grad_norm": 0.5308181643486023, + "learning_rate": 0.00019996360753324064, + "loss": 1.7181, + "step": 30 + }, + { + "epoch": 0.000402830861392073, + "grad_norm": 0.7171155214309692, + "learning_rate": 0.00019996100807132924, + "loss": 1.8663, + "step": 31 + }, + { + "epoch": 0.0004158254053079463, + "grad_norm": 0.8405770659446716, + "learning_rate": 0.00019995840860941787, + "loss": 1.8455, + "step": 32 + }, + { + "epoch": 0.00042881994922381965, + "grad_norm": 1.0783236026763916, + "learning_rate": 0.00019995580914750646, + "loss": 1.9135, + "step": 33 + }, + { + "epoch": 0.000441814493139693, + "grad_norm": 0.5558031797409058, + "learning_rate": 0.0001999532096855951, + "loss": 1.611, + "step": 34 + }, + { + "epoch": 0.00045480903705556627, + "grad_norm": 0.643465518951416, + "learning_rate": 0.0001999506102236837, + "loss": 1.6633, + "step": 35 + }, + { + "epoch": 0.0004678035809714396, + "grad_norm": 0.5131320953369141, + "learning_rate": 0.0001999480107617723, + "loss": 1.4666, + "step": 36 + }, + { + "epoch": 0.00048079812488731295, + "grad_norm": 0.7401881814002991, + "learning_rate": 0.00019994541129986093, + "loss": 1.5295, + "step": 37 + }, + { + "epoch": 0.0004937926688031863, + "grad_norm": 0.5300176739692688, + "learning_rate": 0.00019994281183794956, + "loss": 1.7598, + "step": 38 + }, + { + "epoch": 0.0005067872127190596, + "grad_norm": 0.7013173699378967, + "learning_rate": 0.00019994021237603818, + "loss": 1.9747, + "step": 39 + }, + { + "epoch": 0.000519781756634933, + "grad_norm": 0.5502619743347168, + "learning_rate": 0.00019993761291412678, + "loss": 1.7822, + "step": 40 + }, + { + "epoch": 0.0005327763005508062, + "grad_norm": 0.7401593923568726, + "learning_rate": 0.0001999350134522154, + "loss": 1.7382, + "step": 41 + }, + { + "epoch": 0.0005457708444666795, + "grad_norm": 0.8741652965545654, + "learning_rate": 0.00019993241399030403, + "loss": 1.5306, + "step": 42 + }, + { + "epoch": 0.0005587653883825529, + "grad_norm": 1.000598430633545, + "learning_rate": 0.00019992981452839263, + "loss": 1.5486, + "step": 43 + }, + { + "epoch": 0.0005717599322984262, + "grad_norm": 0.9379377961158752, + "learning_rate": 0.00019992721506648125, + "loss": 1.8125, + "step": 44 + }, + { + "epoch": 0.0005847544762142995, + "grad_norm": 1.193866491317749, + "learning_rate": 0.00019992461560456985, + "loss": 1.7332, + "step": 45 + }, + { + "epoch": 0.0005977490201301729, + "grad_norm": 1.287405252456665, + "learning_rate": 0.00019992201614265847, + "loss": 1.9023, + "step": 46 + }, + { + "epoch": 0.0006107435640460462, + "grad_norm": 0.5383558869361877, + "learning_rate": 0.0001999194166807471, + "loss": 1.5893, + "step": 47 + }, + { + "epoch": 0.0006237381079619194, + "grad_norm": 0.7014543414115906, + "learning_rate": 0.0001999168172188357, + "loss": 1.6903, + "step": 48 + }, + { + "epoch": 0.0006367326518777928, + "grad_norm": 0.3466140031814575, + "learning_rate": 0.00019991421775692432, + "loss": 1.5747, + "step": 49 + }, + { + "epoch": 0.0006497271957936661, + "grad_norm": 0.39199456572532654, + "learning_rate": 0.00019991161829501294, + "loss": 1.5924, + "step": 50 + }, + { + "epoch": 0.0006627217397095395, + "grad_norm": 0.6264086365699768, + "learning_rate": 0.00019990901883310157, + "loss": 1.7289, + "step": 51 + }, + { + "epoch": 0.0006757162836254128, + "grad_norm": 0.774596631526947, + "learning_rate": 0.00019990641937119017, + "loss": 1.6478, + "step": 52 + }, + { + "epoch": 0.0006887108275412861, + "grad_norm": 1.0306848287582397, + "learning_rate": 0.0001999038199092788, + "loss": 1.8524, + "step": 53 + }, + { + "epoch": 0.0007017053714571595, + "grad_norm": 0.94454026222229, + "learning_rate": 0.00019990122044736741, + "loss": 1.401, + "step": 54 + }, + { + "epoch": 0.0007146999153730328, + "grad_norm": 1.3383725881576538, + "learning_rate": 0.000199898620985456, + "loss": 1.6518, + "step": 55 + }, + { + "epoch": 0.000727694459288906, + "grad_norm": 0.3218197226524353, + "learning_rate": 0.00019989602152354464, + "loss": 1.4379, + "step": 56 + }, + { + "epoch": 0.0007406890032047794, + "grad_norm": 0.6366977691650391, + "learning_rate": 0.00019989342206163326, + "loss": 1.6121, + "step": 57 + }, + { + "epoch": 0.0007536835471206527, + "grad_norm": 0.40514591336250305, + "learning_rate": 0.00019989082259972186, + "loss": 1.808, + "step": 58 + }, + { + "epoch": 0.000766678091036526, + "grad_norm": 0.4810250699520111, + "learning_rate": 0.00019988822313781048, + "loss": 1.5069, + "step": 59 + }, + { + "epoch": 0.0007796726349523994, + "grad_norm": 0.33548909425735474, + "learning_rate": 0.00019988562367589908, + "loss": 1.3861, + "step": 60 + }, + { + "epoch": 0.0007926671788682727, + "grad_norm": 0.37393638491630554, + "learning_rate": 0.00019988302421398773, + "loss": 1.4715, + "step": 61 + }, + { + "epoch": 0.000805661722784146, + "grad_norm": 0.5419512987136841, + "learning_rate": 0.00019988042475207633, + "loss": 1.5614, + "step": 62 + }, + { + "epoch": 0.0008186562667000193, + "grad_norm": 0.6855679154396057, + "learning_rate": 0.00019987782529016495, + "loss": 1.575, + "step": 63 + }, + { + "epoch": 0.0008316508106158926, + "grad_norm": 0.4336775541305542, + "learning_rate": 0.00019987522582825355, + "loss": 1.5964, + "step": 64 + }, + { + "epoch": 0.000844645354531766, + "grad_norm": 0.36566340923309326, + "learning_rate": 0.00019987262636634218, + "loss": 1.5594, + "step": 65 + }, + { + "epoch": 0.0008576398984476393, + "grad_norm": 0.36470988392829895, + "learning_rate": 0.0001998700269044308, + "loss": 1.5931, + "step": 66 + }, + { + "epoch": 0.0008706344423635126, + "grad_norm": 0.3844112455844879, + "learning_rate": 0.0001998674274425194, + "loss": 1.5849, + "step": 67 + }, + { + "epoch": 0.000883628986279386, + "grad_norm": 0.44522514939308167, + "learning_rate": 0.00019986482798060802, + "loss": 1.4193, + "step": 68 + }, + { + "epoch": 0.0008966235301952593, + "grad_norm": 0.2723512351512909, + "learning_rate": 0.00019986222851869665, + "loss": 1.4587, + "step": 69 + }, + { + "epoch": 0.0009096180741111325, + "grad_norm": 0.4704466760158539, + "learning_rate": 0.00019985962905678527, + "loss": 1.4766, + "step": 70 + }, + { + "epoch": 0.0009226126180270059, + "grad_norm": 0.4864993691444397, + "learning_rate": 0.00019985702959487387, + "loss": 1.7165, + "step": 71 + }, + { + "epoch": 0.0009356071619428792, + "grad_norm": 0.4624054431915283, + "learning_rate": 0.00019985443013296247, + "loss": 1.5708, + "step": 72 + }, + { + "epoch": 0.0009486017058587525, + "grad_norm": 0.4995464086532593, + "learning_rate": 0.00019985183067105112, + "loss": 1.6012, + "step": 73 + }, + { + "epoch": 0.0009615962497746259, + "grad_norm": 0.3099346160888672, + "learning_rate": 0.00019984923120913971, + "loss": 1.5089, + "step": 74 + }, + { + "epoch": 0.0009745907936904992, + "grad_norm": 0.5344899296760559, + "learning_rate": 0.00019984663174722834, + "loss": 1.4251, + "step": 75 + }, + { + "epoch": 0.0009875853376063726, + "grad_norm": 0.42957183718681335, + "learning_rate": 0.00019984403228531694, + "loss": 1.7298, + "step": 76 + }, + { + "epoch": 0.0010005798815222457, + "grad_norm": 0.31365153193473816, + "learning_rate": 0.00019984143282340556, + "loss": 1.3713, + "step": 77 + }, + { + "epoch": 0.0010135744254381191, + "grad_norm": 0.2500200867652893, + "learning_rate": 0.00019983883336149419, + "loss": 1.4603, + "step": 78 + }, + { + "epoch": 0.0010265689693539925, + "grad_norm": 0.4030626118183136, + "learning_rate": 0.00019983623389958278, + "loss": 1.6264, + "step": 79 + }, + { + "epoch": 0.001039563513269866, + "grad_norm": 0.5635514259338379, + "learning_rate": 0.0001998336344376714, + "loss": 1.7023, + "step": 80 + }, + { + "epoch": 0.001052558057185739, + "grad_norm": 0.3587415814399719, + "learning_rate": 0.00019983103497576003, + "loss": 1.4858, + "step": 81 + }, + { + "epoch": 0.0010655526011016125, + "grad_norm": 0.5135048031806946, + "learning_rate": 0.00019982843551384866, + "loss": 1.6953, + "step": 82 + }, + { + "epoch": 0.0010785471450174859, + "grad_norm": 0.5444433093070984, + "learning_rate": 0.00019982583605193725, + "loss": 1.5036, + "step": 83 + }, + { + "epoch": 0.001091541688933359, + "grad_norm": 0.36568179726600647, + "learning_rate": 0.00019982323659002585, + "loss": 1.6189, + "step": 84 + }, + { + "epoch": 0.0011045362328492324, + "grad_norm": 0.36495357751846313, + "learning_rate": 0.0001998206371281145, + "loss": 1.5315, + "step": 85 + }, + { + "epoch": 0.0011175307767651058, + "grad_norm": 0.42951181530952454, + "learning_rate": 0.0001998180376662031, + "loss": 1.5117, + "step": 86 + }, + { + "epoch": 0.001130525320680979, + "grad_norm": 0.3365992605686188, + "learning_rate": 0.00019981543820429172, + "loss": 1.2442, + "step": 87 + }, + { + "epoch": 0.0011435198645968524, + "grad_norm": 0.4303477704524994, + "learning_rate": 0.00019981283874238032, + "loss": 1.4972, + "step": 88 + }, + { + "epoch": 0.0011565144085127258, + "grad_norm": 0.5333189964294434, + "learning_rate": 0.00019981023928046895, + "loss": 1.6606, + "step": 89 + }, + { + "epoch": 0.001169508952428599, + "grad_norm": 0.45519933104515076, + "learning_rate": 0.00019980763981855757, + "loss": 1.2952, + "step": 90 + }, + { + "epoch": 0.0011825034963444724, + "grad_norm": 0.31117764115333557, + "learning_rate": 0.00019980504035664617, + "loss": 1.3897, + "step": 91 + }, + { + "epoch": 0.0011954980402603458, + "grad_norm": 0.4803959131240845, + "learning_rate": 0.00019980244089473482, + "loss": 1.5162, + "step": 92 + }, + { + "epoch": 0.001208492584176219, + "grad_norm": 0.397958368062973, + "learning_rate": 0.00019979984143282342, + "loss": 1.4509, + "step": 93 + }, + { + "epoch": 0.0012214871280920923, + "grad_norm": 0.4095950126647949, + "learning_rate": 0.00019979724197091204, + "loss": 1.519, + "step": 94 + }, + { + "epoch": 0.0012344816720079657, + "grad_norm": 0.27611222863197327, + "learning_rate": 0.00019979464250900064, + "loss": 1.307, + "step": 95 + }, + { + "epoch": 0.0012474762159238389, + "grad_norm": 0.47964999079704285, + "learning_rate": 0.00019979204304708926, + "loss": 1.5736, + "step": 96 + }, + { + "epoch": 0.0012604707598397123, + "grad_norm": 0.3905075788497925, + "learning_rate": 0.0001997894435851779, + "loss": 1.5931, + "step": 97 + }, + { + "epoch": 0.0012734653037555857, + "grad_norm": 0.3908260464668274, + "learning_rate": 0.00019978684412326649, + "loss": 1.3432, + "step": 98 + }, + { + "epoch": 0.0012864598476714588, + "grad_norm": 0.2977055013179779, + "learning_rate": 0.0001997842446613551, + "loss": 1.6434, + "step": 99 + }, + { + "epoch": 0.0012994543915873322, + "grad_norm": 0.41405099630355835, + "learning_rate": 0.00019978164519944373, + "loss": 1.5094, + "step": 100 + }, + { + "epoch": 0.0013124489355032056, + "grad_norm": 0.29386085271835327, + "learning_rate": 0.00019977904573753233, + "loss": 1.4693, + "step": 101 + }, + { + "epoch": 0.001325443479419079, + "grad_norm": 0.3862020969390869, + "learning_rate": 0.00019977644627562096, + "loss": 1.4884, + "step": 102 + }, + { + "epoch": 0.0013384380233349522, + "grad_norm": 0.35656118392944336, + "learning_rate": 0.00019977384681370955, + "loss": 1.394, + "step": 103 + }, + { + "epoch": 0.0013514325672508256, + "grad_norm": 0.4221237897872925, + "learning_rate": 0.0001997712473517982, + "loss": 1.6967, + "step": 104 + }, + { + "epoch": 0.001364427111166699, + "grad_norm": 0.3853652775287628, + "learning_rate": 0.0001997686478898868, + "loss": 1.5752, + "step": 105 + }, + { + "epoch": 0.0013774216550825722, + "grad_norm": 0.4113156199455261, + "learning_rate": 0.00019976604842797543, + "loss": 1.735, + "step": 106 + }, + { + "epoch": 0.0013904161989984455, + "grad_norm": 0.35804283618927, + "learning_rate": 0.00019976344896606402, + "loss": 1.5583, + "step": 107 + }, + { + "epoch": 0.001403410742914319, + "grad_norm": 0.36831387877464294, + "learning_rate": 0.00019976084950415265, + "loss": 1.34, + "step": 108 + }, + { + "epoch": 0.0014164052868301921, + "grad_norm": 0.31828340888023376, + "learning_rate": 0.00019975825004224127, + "loss": 1.4227, + "step": 109 + }, + { + "epoch": 0.0014293998307460655, + "grad_norm": 0.32072535157203674, + "learning_rate": 0.00019975565058032987, + "loss": 1.7136, + "step": 110 + }, + { + "epoch": 0.001442394374661939, + "grad_norm": 0.3531497120857239, + "learning_rate": 0.0001997530511184185, + "loss": 1.6733, + "step": 111 + }, + { + "epoch": 0.001455388918577812, + "grad_norm": 0.3910057246685028, + "learning_rate": 0.00019975045165650712, + "loss": 1.5452, + "step": 112 + }, + { + "epoch": 0.0014683834624936855, + "grad_norm": 0.39372771978378296, + "learning_rate": 0.00019974785219459572, + "loss": 1.4553, + "step": 113 + }, + { + "epoch": 0.0014813780064095589, + "grad_norm": 0.3863373398780823, + "learning_rate": 0.00019974525273268434, + "loss": 1.3832, + "step": 114 + }, + { + "epoch": 0.001494372550325432, + "grad_norm": 0.3262098729610443, + "learning_rate": 0.00019974265327077294, + "loss": 1.4632, + "step": 115 + }, + { + "epoch": 0.0015073670942413054, + "grad_norm": 0.3259226381778717, + "learning_rate": 0.0001997400538088616, + "loss": 1.474, + "step": 116 + }, + { + "epoch": 0.0015203616381571788, + "grad_norm": 0.31307917833328247, + "learning_rate": 0.0001997374543469502, + "loss": 1.3626, + "step": 117 + }, + { + "epoch": 0.001533356182073052, + "grad_norm": 0.3163079619407654, + "learning_rate": 0.0001997348548850388, + "loss": 1.3759, + "step": 118 + }, + { + "epoch": 0.0015463507259889254, + "grad_norm": 0.44436267018318176, + "learning_rate": 0.0001997322554231274, + "loss": 1.4845, + "step": 119 + }, + { + "epoch": 0.0015593452699047988, + "grad_norm": 0.4585951566696167, + "learning_rate": 0.00019972965596121603, + "loss": 1.5826, + "step": 120 + }, + { + "epoch": 0.001572339813820672, + "grad_norm": 0.37757980823516846, + "learning_rate": 0.00019972705649930466, + "loss": 1.5665, + "step": 121 + }, + { + "epoch": 0.0015853343577365453, + "grad_norm": 0.33806487917900085, + "learning_rate": 0.00019972445703739326, + "loss": 1.5042, + "step": 122 + }, + { + "epoch": 0.0015983289016524187, + "grad_norm": 0.44263142347335815, + "learning_rate": 0.00019972185757548188, + "loss": 1.5388, + "step": 123 + }, + { + "epoch": 0.001611323445568292, + "grad_norm": 0.31990745663642883, + "learning_rate": 0.0001997192581135705, + "loss": 1.5376, + "step": 124 + }, + { + "epoch": 0.0016243179894841653, + "grad_norm": 0.32597091794013977, + "learning_rate": 0.00019971665865165913, + "loss": 1.457, + "step": 125 + }, + { + "epoch": 0.0016373125334000387, + "grad_norm": 0.31376028060913086, + "learning_rate": 0.00019971405918974773, + "loss": 1.4279, + "step": 126 + }, + { + "epoch": 0.001650307077315912, + "grad_norm": 0.27025485038757324, + "learning_rate": 0.00019971145972783632, + "loss": 1.4572, + "step": 127 + }, + { + "epoch": 0.0016633016212317853, + "grad_norm": 0.22959962487220764, + "learning_rate": 0.00019970886026592498, + "loss": 1.2702, + "step": 128 + }, + { + "epoch": 0.0016762961651476586, + "grad_norm": 0.24560846388339996, + "learning_rate": 0.00019970626080401357, + "loss": 1.2481, + "step": 129 + }, + { + "epoch": 0.001689290709063532, + "grad_norm": 0.4410383105278015, + "learning_rate": 0.0001997036613421022, + "loss": 1.4782, + "step": 130 + }, + { + "epoch": 0.0017022852529794052, + "grad_norm": 0.35910242795944214, + "learning_rate": 0.00019970106188019082, + "loss": 1.5368, + "step": 131 + }, + { + "epoch": 0.0017152797968952786, + "grad_norm": 0.3717356026172638, + "learning_rate": 0.00019969846241827942, + "loss": 1.43, + "step": 132 + }, + { + "epoch": 0.001728274340811152, + "grad_norm": 0.34773606061935425, + "learning_rate": 0.00019969586295636804, + "loss": 1.5728, + "step": 133 + }, + { + "epoch": 0.0017412688847270252, + "grad_norm": 0.3884320557117462, + "learning_rate": 0.00019969326349445664, + "loss": 1.5506, + "step": 134 + }, + { + "epoch": 0.0017542634286428986, + "grad_norm": 0.30978694558143616, + "learning_rate": 0.0001996906640325453, + "loss": 1.5241, + "step": 135 + }, + { + "epoch": 0.001767257972558772, + "grad_norm": 0.36688876152038574, + "learning_rate": 0.0001996880645706339, + "loss": 1.4682, + "step": 136 + }, + { + "epoch": 0.0017802525164746451, + "grad_norm": 0.4215044677257538, + "learning_rate": 0.00019968546510872252, + "loss": 1.609, + "step": 137 + }, + { + "epoch": 0.0017932470603905185, + "grad_norm": 0.35323330760002136, + "learning_rate": 0.0001996828656468111, + "loss": 1.5944, + "step": 138 + }, + { + "epoch": 0.001806241604306392, + "grad_norm": 0.38223913311958313, + "learning_rate": 0.00019968026618489974, + "loss": 1.5231, + "step": 139 + }, + { + "epoch": 0.001819236148222265, + "grad_norm": 0.3340063989162445, + "learning_rate": 0.00019967766672298836, + "loss": 1.4547, + "step": 140 + }, + { + "epoch": 0.0018322306921381385, + "grad_norm": 0.2879827916622162, + "learning_rate": 0.00019967506726107696, + "loss": 1.3654, + "step": 141 + }, + { + "epoch": 0.0018452252360540119, + "grad_norm": 0.35720300674438477, + "learning_rate": 0.00019967246779916558, + "loss": 1.475, + "step": 142 + }, + { + "epoch": 0.001858219779969885, + "grad_norm": 0.45305946469306946, + "learning_rate": 0.0001996698683372542, + "loss": 1.462, + "step": 143 + }, + { + "epoch": 0.0018712143238857584, + "grad_norm": 0.41068610548973083, + "learning_rate": 0.0001996672688753428, + "loss": 1.6636, + "step": 144 + }, + { + "epoch": 0.0018842088678016318, + "grad_norm": 0.3500629663467407, + "learning_rate": 0.00019966466941343143, + "loss": 1.5408, + "step": 145 + }, + { + "epoch": 0.001897203411717505, + "grad_norm": 0.41315722465515137, + "learning_rate": 0.00019966206995152003, + "loss": 1.4691, + "step": 146 + }, + { + "epoch": 0.0019101979556333784, + "grad_norm": 0.3140808045864105, + "learning_rate": 0.00019965947048960868, + "loss": 1.3598, + "step": 147 + }, + { + "epoch": 0.0019231924995492518, + "grad_norm": 0.23439089953899384, + "learning_rate": 0.00019965687102769728, + "loss": 1.4139, + "step": 148 + }, + { + "epoch": 0.0019361870434651252, + "grad_norm": 0.40360027551651, + "learning_rate": 0.0001996542715657859, + "loss": 1.5104, + "step": 149 + }, + { + "epoch": 0.0019491815873809984, + "grad_norm": 0.3256491720676422, + "learning_rate": 0.0001996516721038745, + "loss": 1.6522, + "step": 150 + }, + { + "epoch": 0.0019621761312968717, + "grad_norm": 0.3488617539405823, + "learning_rate": 0.00019964907264196312, + "loss": 1.4508, + "step": 151 + }, + { + "epoch": 0.001975170675212745, + "grad_norm": 0.3948095440864563, + "learning_rate": 0.00019964647318005175, + "loss": 1.4633, + "step": 152 + }, + { + "epoch": 0.0019881652191286185, + "grad_norm": 0.3381510078907013, + "learning_rate": 0.00019964387371814034, + "loss": 1.3295, + "step": 153 + }, + { + "epoch": 0.0020011597630444915, + "grad_norm": 0.3446526527404785, + "learning_rate": 0.00019964127425622897, + "loss": 1.4926, + "step": 154 + }, + { + "epoch": 0.002014154306960365, + "grad_norm": 0.38809219002723694, + "learning_rate": 0.0001996386747943176, + "loss": 1.4912, + "step": 155 + }, + { + "epoch": 0.0020271488508762383, + "grad_norm": 0.363363653421402, + "learning_rate": 0.0001996360753324062, + "loss": 1.6432, + "step": 156 + }, + { + "epoch": 0.0020401433947921117, + "grad_norm": 0.4307781159877777, + "learning_rate": 0.00019963347587049482, + "loss": 1.6573, + "step": 157 + }, + { + "epoch": 0.002053137938707985, + "grad_norm": 0.4218001067638397, + "learning_rate": 0.0001996308764085834, + "loss": 1.6746, + "step": 158 + }, + { + "epoch": 0.0020661324826238585, + "grad_norm": 0.3414730727672577, + "learning_rate": 0.00019962827694667206, + "loss": 1.5121, + "step": 159 + }, + { + "epoch": 0.002079127026539732, + "grad_norm": 0.3359917104244232, + "learning_rate": 0.00019962567748476066, + "loss": 1.5004, + "step": 160 + }, + { + "epoch": 0.002092121570455605, + "grad_norm": 0.2820243239402771, + "learning_rate": 0.00019962307802284929, + "loss": 1.4176, + "step": 161 + }, + { + "epoch": 0.002105116114371478, + "grad_norm": 0.33084461092948914, + "learning_rate": 0.00019962047856093788, + "loss": 1.5205, + "step": 162 + }, + { + "epoch": 0.0021181106582873516, + "grad_norm": 0.41144001483917236, + "learning_rate": 0.0001996178790990265, + "loss": 1.6402, + "step": 163 + }, + { + "epoch": 0.002131105202203225, + "grad_norm": 0.3849426507949829, + "learning_rate": 0.00019961527963711513, + "loss": 1.6129, + "step": 164 + }, + { + "epoch": 0.0021440997461190984, + "grad_norm": 0.18820500373840332, + "learning_rate": 0.00019961268017520373, + "loss": 1.32, + "step": 165 + }, + { + "epoch": 0.0021570942900349718, + "grad_norm": 0.35013675689697266, + "learning_rate": 0.00019961008071329238, + "loss": 1.5776, + "step": 166 + }, + { + "epoch": 0.0021700888339508447, + "grad_norm": 0.3043151795864105, + "learning_rate": 0.00019960748125138098, + "loss": 1.5799, + "step": 167 + }, + { + "epoch": 0.002183083377866718, + "grad_norm": 0.3667580485343933, + "learning_rate": 0.00019960488178946958, + "loss": 1.6441, + "step": 168 + }, + { + "epoch": 0.0021960779217825915, + "grad_norm": 0.36724984645843506, + "learning_rate": 0.0001996022823275582, + "loss": 1.6838, + "step": 169 + }, + { + "epoch": 0.002209072465698465, + "grad_norm": 0.3321700990200043, + "learning_rate": 0.00019959968286564682, + "loss": 1.6015, + "step": 170 + }, + { + "epoch": 0.0022220670096143383, + "grad_norm": 0.34856680035591125, + "learning_rate": 0.00019959708340373545, + "loss": 1.4171, + "step": 171 + }, + { + "epoch": 0.0022350615535302117, + "grad_norm": 0.35499298572540283, + "learning_rate": 0.00019959448394182405, + "loss": 1.6336, + "step": 172 + }, + { + "epoch": 0.0022480560974460846, + "grad_norm": 0.320279985666275, + "learning_rate": 0.00019959188447991267, + "loss": 1.4745, + "step": 173 + }, + { + "epoch": 0.002261050641361958, + "grad_norm": 0.2296724021434784, + "learning_rate": 0.0001995892850180013, + "loss": 1.2762, + "step": 174 + }, + { + "epoch": 0.0022740451852778314, + "grad_norm": 0.4183747470378876, + "learning_rate": 0.0001995866855560899, + "loss": 1.5977, + "step": 175 + }, + { + "epoch": 0.002287039729193705, + "grad_norm": 0.3686603605747223, + "learning_rate": 0.00019958408609417852, + "loss": 1.5158, + "step": 176 + }, + { + "epoch": 0.002300034273109578, + "grad_norm": 0.4603264629840851, + "learning_rate": 0.00019958148663226711, + "loss": 1.5406, + "step": 177 + }, + { + "epoch": 0.0023130288170254516, + "grad_norm": 0.37625256180763245, + "learning_rate": 0.00019957888717035577, + "loss": 1.5891, + "step": 178 + }, + { + "epoch": 0.0023260233609413246, + "grad_norm": 0.3858341872692108, + "learning_rate": 0.00019957628770844436, + "loss": 1.4942, + "step": 179 + }, + { + "epoch": 0.002339017904857198, + "grad_norm": 0.36542120575904846, + "learning_rate": 0.000199573688246533, + "loss": 1.423, + "step": 180 + }, + { + "epoch": 0.0023520124487730713, + "grad_norm": 0.35841289162635803, + "learning_rate": 0.00019957108878462159, + "loss": 1.4763, + "step": 181 + }, + { + "epoch": 0.0023650069926889447, + "grad_norm": 0.35375916957855225, + "learning_rate": 0.0001995684893227102, + "loss": 1.5949, + "step": 182 + }, + { + "epoch": 0.002378001536604818, + "grad_norm": 0.334729939699173, + "learning_rate": 0.00019956588986079883, + "loss": 1.6127, + "step": 183 + }, + { + "epoch": 0.0023909960805206915, + "grad_norm": 0.283333957195282, + "learning_rate": 0.00019956329039888743, + "loss": 1.4284, + "step": 184 + }, + { + "epoch": 0.002403990624436565, + "grad_norm": 0.4093594253063202, + "learning_rate": 0.00019956069093697606, + "loss": 1.4232, + "step": 185 + }, + { + "epoch": 0.002416985168352438, + "grad_norm": 0.3956725299358368, + "learning_rate": 0.00019955809147506468, + "loss": 1.553, + "step": 186 + }, + { + "epoch": 0.0024299797122683113, + "grad_norm": 0.3651161193847656, + "learning_rate": 0.00019955549201315328, + "loss": 1.5034, + "step": 187 + }, + { + "epoch": 0.0024429742561841846, + "grad_norm": 0.39115169644355774, + "learning_rate": 0.0001995528925512419, + "loss": 1.6763, + "step": 188 + }, + { + "epoch": 0.002455968800100058, + "grad_norm": 0.2925562560558319, + "learning_rate": 0.0001995502930893305, + "loss": 1.441, + "step": 189 + }, + { + "epoch": 0.0024689633440159314, + "grad_norm": 0.37559300661087036, + "learning_rate": 0.00019954769362741915, + "loss": 1.4808, + "step": 190 + }, + { + "epoch": 0.002481957887931805, + "grad_norm": 0.3514731228351593, + "learning_rate": 0.00019954509416550775, + "loss": 1.4988, + "step": 191 + }, + { + "epoch": 0.0024949524318476778, + "grad_norm": 0.33247920870780945, + "learning_rate": 0.00019954249470359637, + "loss": 1.4973, + "step": 192 + }, + { + "epoch": 0.002507946975763551, + "grad_norm": 0.3307076096534729, + "learning_rate": 0.00019953989524168497, + "loss": 1.3719, + "step": 193 + }, + { + "epoch": 0.0025209415196794246, + "grad_norm": 0.33346524834632874, + "learning_rate": 0.0001995372957797736, + "loss": 1.5309, + "step": 194 + }, + { + "epoch": 0.002533936063595298, + "grad_norm": 0.3161267936229706, + "learning_rate": 0.00019953469631786222, + "loss": 1.3535, + "step": 195 + }, + { + "epoch": 0.0025469306075111713, + "grad_norm": 0.28719034790992737, + "learning_rate": 0.00019953209685595082, + "loss": 1.5653, + "step": 196 + }, + { + "epoch": 0.0025599251514270447, + "grad_norm": 0.36087697744369507, + "learning_rate": 0.00019952949739403944, + "loss": 1.5175, + "step": 197 + }, + { + "epoch": 0.0025729196953429177, + "grad_norm": 0.43930137157440186, + "learning_rate": 0.00019952689793212807, + "loss": 1.5531, + "step": 198 + }, + { + "epoch": 0.002585914239258791, + "grad_norm": 0.6818702816963196, + "learning_rate": 0.00019952429847021666, + "loss": 1.592, + "step": 199 + }, + { + "epoch": 0.0025989087831746645, + "grad_norm": 0.20841331779956818, + "learning_rate": 0.0001995216990083053, + "loss": 1.0124, + "step": 200 + }, + { + "epoch": 0.002611903327090538, + "grad_norm": 0.3404110372066498, + "learning_rate": 0.00019951909954639389, + "loss": 1.6634, + "step": 201 + }, + { + "epoch": 0.0026248978710064113, + "grad_norm": 0.39020222425460815, + "learning_rate": 0.00019951650008448254, + "loss": 1.4838, + "step": 202 + }, + { + "epoch": 0.0026378924149222847, + "grad_norm": 0.3787286579608917, + "learning_rate": 0.00019951390062257113, + "loss": 1.6008, + "step": 203 + }, + { + "epoch": 0.002650886958838158, + "grad_norm": 0.3658662438392639, + "learning_rate": 0.00019951130116065976, + "loss": 1.4642, + "step": 204 + }, + { + "epoch": 0.002663881502754031, + "grad_norm": 0.3418336808681488, + "learning_rate": 0.00019950870169874838, + "loss": 1.4158, + "step": 205 + }, + { + "epoch": 0.0026768760466699044, + "grad_norm": 0.3201123774051666, + "learning_rate": 0.00019950610223683698, + "loss": 1.6424, + "step": 206 + }, + { + "epoch": 0.002689870590585778, + "grad_norm": 0.4045439064502716, + "learning_rate": 0.0001995035027749256, + "loss": 1.7248, + "step": 207 + }, + { + "epoch": 0.002702865134501651, + "grad_norm": 0.30982545018196106, + "learning_rate": 0.0001995009033130142, + "loss": 1.348, + "step": 208 + }, + { + "epoch": 0.0027158596784175246, + "grad_norm": 0.3888842761516571, + "learning_rate": 0.00019949830385110285, + "loss": 1.6432, + "step": 209 + }, + { + "epoch": 0.002728854222333398, + "grad_norm": 0.37842032313346863, + "learning_rate": 0.00019949570438919145, + "loss": 1.56, + "step": 210 + }, + { + "epoch": 0.002741848766249271, + "grad_norm": 0.37506985664367676, + "learning_rate": 0.00019949310492728005, + "loss": 1.6661, + "step": 211 + }, + { + "epoch": 0.0027548433101651443, + "grad_norm": 0.36613574624061584, + "learning_rate": 0.00019949050546536867, + "loss": 1.538, + "step": 212 + }, + { + "epoch": 0.0027678378540810177, + "grad_norm": 0.2759266197681427, + "learning_rate": 0.0001994879060034573, + "loss": 1.5326, + "step": 213 + }, + { + "epoch": 0.002780832397996891, + "grad_norm": 0.28298652172088623, + "learning_rate": 0.00019948530654154592, + "loss": 1.5049, + "step": 214 + }, + { + "epoch": 0.0027938269419127645, + "grad_norm": 0.236300528049469, + "learning_rate": 0.00019948270707963452, + "loss": 1.4315, + "step": 215 + }, + { + "epoch": 0.002806821485828638, + "grad_norm": 0.36888444423675537, + "learning_rate": 0.00019948010761772314, + "loss": 1.4284, + "step": 216 + }, + { + "epoch": 0.002819816029744511, + "grad_norm": 0.42646467685699463, + "learning_rate": 0.00019947750815581177, + "loss": 1.6482, + "step": 217 + }, + { + "epoch": 0.0028328105736603842, + "grad_norm": 0.30057191848754883, + "learning_rate": 0.00019947490869390037, + "loss": 1.3229, + "step": 218 + }, + { + "epoch": 0.0028458051175762576, + "grad_norm": 0.39201629161834717, + "learning_rate": 0.000199472309231989, + "loss": 1.566, + "step": 219 + }, + { + "epoch": 0.002858799661492131, + "grad_norm": 0.32493525743484497, + "learning_rate": 0.0001994697097700776, + "loss": 1.4828, + "step": 220 + }, + { + "epoch": 0.0028717942054080044, + "grad_norm": 0.27081894874572754, + "learning_rate": 0.00019946711030816624, + "loss": 1.5155, + "step": 221 + }, + { + "epoch": 0.002884788749323878, + "grad_norm": 0.38155704736709595, + "learning_rate": 0.00019946451084625484, + "loss": 1.3287, + "step": 222 + }, + { + "epoch": 0.0028977832932397508, + "grad_norm": 0.364666610956192, + "learning_rate": 0.00019946191138434343, + "loss": 1.6249, + "step": 223 + }, + { + "epoch": 0.002910777837155624, + "grad_norm": 0.24264782667160034, + "learning_rate": 0.00019945931192243206, + "loss": 1.4499, + "step": 224 + }, + { + "epoch": 0.0029237723810714975, + "grad_norm": 0.39832785725593567, + "learning_rate": 0.00019945671246052068, + "loss": 1.5181, + "step": 225 + }, + { + "epoch": 0.002936766924987371, + "grad_norm": 0.351386696100235, + "learning_rate": 0.0001994541129986093, + "loss": 1.4117, + "step": 226 + }, + { + "epoch": 0.0029497614689032443, + "grad_norm": 0.3663950562477112, + "learning_rate": 0.0001994515135366979, + "loss": 1.5284, + "step": 227 + }, + { + "epoch": 0.0029627560128191177, + "grad_norm": 0.33718690276145935, + "learning_rate": 0.00019944891407478653, + "loss": 1.668, + "step": 228 + }, + { + "epoch": 0.002975750556734991, + "grad_norm": 0.28165867924690247, + "learning_rate": 0.00019944631461287515, + "loss": 1.3657, + "step": 229 + }, + { + "epoch": 0.002988745100650864, + "grad_norm": 0.4133395850658417, + "learning_rate": 0.00019944371515096375, + "loss": 1.4351, + "step": 230 + }, + { + "epoch": 0.0030017396445667375, + "grad_norm": 0.31343212723731995, + "learning_rate": 0.00019944111568905238, + "loss": 1.5072, + "step": 231 + }, + { + "epoch": 0.003014734188482611, + "grad_norm": 0.2588444650173187, + "learning_rate": 0.00019943851622714097, + "loss": 1.3898, + "step": 232 + }, + { + "epoch": 0.0030277287323984842, + "grad_norm": 0.3805907964706421, + "learning_rate": 0.00019943591676522963, + "loss": 1.7189, + "step": 233 + }, + { + "epoch": 0.0030407232763143576, + "grad_norm": 0.3293072581291199, + "learning_rate": 0.00019943331730331822, + "loss": 1.5578, + "step": 234 + }, + { + "epoch": 0.003053717820230231, + "grad_norm": 0.4328724443912506, + "learning_rate": 0.00019943071784140682, + "loss": 1.477, + "step": 235 + }, + { + "epoch": 0.003066712364146104, + "grad_norm": 0.32229161262512207, + "learning_rate": 0.00019942811837949544, + "loss": 1.3583, + "step": 236 + }, + { + "epoch": 0.0030797069080619774, + "grad_norm": 0.35365840792655945, + "learning_rate": 0.00019942551891758407, + "loss": 1.5048, + "step": 237 + }, + { + "epoch": 0.0030927014519778508, + "grad_norm": 0.3168087899684906, + "learning_rate": 0.0001994229194556727, + "loss": 1.239, + "step": 238 + }, + { + "epoch": 0.003105695995893724, + "grad_norm": 0.3821578621864319, + "learning_rate": 0.0001994203199937613, + "loss": 1.4687, + "step": 239 + }, + { + "epoch": 0.0031186905398095975, + "grad_norm": 0.3703784644603729, + "learning_rate": 0.00019941772053184992, + "loss": 1.4105, + "step": 240 + }, + { + "epoch": 0.003131685083725471, + "grad_norm": 0.3951148986816406, + "learning_rate": 0.00019941512106993854, + "loss": 1.4942, + "step": 241 + }, + { + "epoch": 0.003144679627641344, + "grad_norm": 0.32025468349456787, + "learning_rate": 0.00019941252160802714, + "loss": 1.7404, + "step": 242 + }, + { + "epoch": 0.0031576741715572173, + "grad_norm": 0.30971238017082214, + "learning_rate": 0.00019940992214611576, + "loss": 1.2907, + "step": 243 + }, + { + "epoch": 0.0031706687154730907, + "grad_norm": 0.34188809990882874, + "learning_rate": 0.00019940732268420439, + "loss": 1.4408, + "step": 244 + }, + { + "epoch": 0.003183663259388964, + "grad_norm": 0.3238825798034668, + "learning_rate": 0.000199404723222293, + "loss": 1.6224, + "step": 245 + }, + { + "epoch": 0.0031966578033048375, + "grad_norm": 0.5203685760498047, + "learning_rate": 0.0001994021237603816, + "loss": 1.5555, + "step": 246 + }, + { + "epoch": 0.003209652347220711, + "grad_norm": 0.3210241198539734, + "learning_rate": 0.00019939952429847023, + "loss": 1.5639, + "step": 247 + }, + { + "epoch": 0.003222646891136584, + "grad_norm": 0.29965224862098694, + "learning_rate": 0.00019939692483655886, + "loss": 1.5204, + "step": 248 + }, + { + "epoch": 0.003235641435052457, + "grad_norm": 0.36836618185043335, + "learning_rate": 0.00019939432537464745, + "loss": 1.3735, + "step": 249 + }, + { + "epoch": 0.0032486359789683306, + "grad_norm": 0.316834419965744, + "learning_rate": 0.00019939172591273608, + "loss": 1.6373, + "step": 250 + }, + { + "epoch": 0.003261630522884204, + "grad_norm": 0.3484581410884857, + "learning_rate": 0.00019938912645082468, + "loss": 1.4217, + "step": 251 + }, + { + "epoch": 0.0032746250668000774, + "grad_norm": 0.3081274628639221, + "learning_rate": 0.0001993865269889133, + "loss": 1.5174, + "step": 252 + }, + { + "epoch": 0.0032876196107159508, + "grad_norm": 0.35599973797798157, + "learning_rate": 0.00019938392752700193, + "loss": 1.3621, + "step": 253 + }, + { + "epoch": 0.003300614154631824, + "grad_norm": 0.3440588712692261, + "learning_rate": 0.00019938132806509052, + "loss": 1.6824, + "step": 254 + }, + { + "epoch": 0.003313608698547697, + "grad_norm": 0.34352511167526245, + "learning_rate": 0.00019937872860317915, + "loss": 1.5523, + "step": 255 + }, + { + "epoch": 0.0033266032424635705, + "grad_norm": 0.3553222119808197, + "learning_rate": 0.00019937612914126777, + "loss": 1.4915, + "step": 256 + }, + { + "epoch": 0.003339597786379444, + "grad_norm": 0.32171377539634705, + "learning_rate": 0.0001993735296793564, + "loss": 1.5895, + "step": 257 + }, + { + "epoch": 0.0033525923302953173, + "grad_norm": 0.48296815156936646, + "learning_rate": 0.000199370930217445, + "loss": 1.7365, + "step": 258 + }, + { + "epoch": 0.0033655868742111907, + "grad_norm": 0.3553713262081146, + "learning_rate": 0.00019936833075553362, + "loss": 1.6974, + "step": 259 + }, + { + "epoch": 0.003378581418127064, + "grad_norm": 0.2838060259819031, + "learning_rate": 0.00019936573129362224, + "loss": 1.3813, + "step": 260 + }, + { + "epoch": 0.003391575962042937, + "grad_norm": 0.3649294078350067, + "learning_rate": 0.00019936313183171084, + "loss": 1.5391, + "step": 261 + }, + { + "epoch": 0.0034045705059588104, + "grad_norm": 0.31763696670532227, + "learning_rate": 0.00019936053236979946, + "loss": 1.3523, + "step": 262 + }, + { + "epoch": 0.003417565049874684, + "grad_norm": 0.38990485668182373, + "learning_rate": 0.00019935793290788806, + "loss": 1.6409, + "step": 263 + }, + { + "epoch": 0.003430559593790557, + "grad_norm": 0.2968158721923828, + "learning_rate": 0.00019935533344597669, + "loss": 1.5148, + "step": 264 + }, + { + "epoch": 0.0034435541377064306, + "grad_norm": 0.3040262460708618, + "learning_rate": 0.0001993527339840653, + "loss": 1.5458, + "step": 265 + }, + { + "epoch": 0.003456548681622304, + "grad_norm": 0.4232870638370514, + "learning_rate": 0.0001993501345221539, + "loss": 1.6085, + "step": 266 + }, + { + "epoch": 0.003469543225538177, + "grad_norm": 0.3867042660713196, + "learning_rate": 0.00019934753506024253, + "loss": 1.5473, + "step": 267 + }, + { + "epoch": 0.0034825377694540503, + "grad_norm": 0.3499396741390228, + "learning_rate": 0.00019934493559833116, + "loss": 1.5009, + "step": 268 + }, + { + "epoch": 0.0034955323133699237, + "grad_norm": 0.38908466696739197, + "learning_rate": 0.00019934233613641978, + "loss": 1.4159, + "step": 269 + }, + { + "epoch": 0.003508526857285797, + "grad_norm": 0.29934537410736084, + "learning_rate": 0.00019933973667450838, + "loss": 1.2518, + "step": 270 + }, + { + "epoch": 0.0035215214012016705, + "grad_norm": 0.31104952096939087, + "learning_rate": 0.000199337137212597, + "loss": 1.4959, + "step": 271 + }, + { + "epoch": 0.003534515945117544, + "grad_norm": 0.4307188391685486, + "learning_rate": 0.00019933453775068563, + "loss": 1.675, + "step": 272 + }, + { + "epoch": 0.0035475104890334173, + "grad_norm": 0.30613699555397034, + "learning_rate": 0.00019933193828877423, + "loss": 1.5123, + "step": 273 + }, + { + "epoch": 0.0035605050329492903, + "grad_norm": 0.39022064208984375, + "learning_rate": 0.00019932933882686285, + "loss": 1.5492, + "step": 274 + }, + { + "epoch": 0.0035734995768651637, + "grad_norm": 0.4059942960739136, + "learning_rate": 0.00019932673936495145, + "loss": 1.5862, + "step": 275 + }, + { + "epoch": 0.003586494120781037, + "grad_norm": 0.33377179503440857, + "learning_rate": 0.0001993241399030401, + "loss": 1.6645, + "step": 276 + }, + { + "epoch": 0.0035994886646969104, + "grad_norm": 0.3957732915878296, + "learning_rate": 0.0001993215404411287, + "loss": 1.4876, + "step": 277 + }, + { + "epoch": 0.003612483208612784, + "grad_norm": 0.3599517047405243, + "learning_rate": 0.0001993189409792173, + "loss": 1.4725, + "step": 278 + }, + { + "epoch": 0.0036254777525286572, + "grad_norm": 0.28503838181495667, + "learning_rate": 0.00019931634151730594, + "loss": 1.3983, + "step": 279 + }, + { + "epoch": 0.00363847229644453, + "grad_norm": 0.3737494647502899, + "learning_rate": 0.00019931374205539454, + "loss": 1.5421, + "step": 280 + }, + { + "epoch": 0.0036514668403604036, + "grad_norm": 0.32609713077545166, + "learning_rate": 0.00019931114259348317, + "loss": 1.4412, + "step": 281 + }, + { + "epoch": 0.003664461384276277, + "grad_norm": 0.27020320296287537, + "learning_rate": 0.00019930854313157176, + "loss": 1.323, + "step": 282 + }, + { + "epoch": 0.0036774559281921504, + "grad_norm": 0.3037111759185791, + "learning_rate": 0.0001993059436696604, + "loss": 1.6002, + "step": 283 + }, + { + "epoch": 0.0036904504721080237, + "grad_norm": 0.319678395986557, + "learning_rate": 0.000199303344207749, + "loss": 1.6927, + "step": 284 + }, + { + "epoch": 0.003703445016023897, + "grad_norm": 0.3375207781791687, + "learning_rate": 0.0001993007447458376, + "loss": 1.6532, + "step": 285 + }, + { + "epoch": 0.00371643955993977, + "grad_norm": 0.43750518560409546, + "learning_rate": 0.00019929814528392624, + "loss": 1.5129, + "step": 286 + }, + { + "epoch": 0.0037294341038556435, + "grad_norm": 0.5206657648086548, + "learning_rate": 0.00019929554582201486, + "loss": 1.4792, + "step": 287 + }, + { + "epoch": 0.003742428647771517, + "grad_norm": 0.3427729904651642, + "learning_rate": 0.00019929294636010348, + "loss": 1.4153, + "step": 288 + }, + { + "epoch": 0.0037554231916873903, + "grad_norm": 0.2810385227203369, + "learning_rate": 0.00019929034689819208, + "loss": 1.4501, + "step": 289 + }, + { + "epoch": 0.0037684177356032637, + "grad_norm": 0.3164840340614319, + "learning_rate": 0.00019928774743628068, + "loss": 1.5437, + "step": 290 + }, + { + "epoch": 0.003781412279519137, + "grad_norm": 0.40451282262802124, + "learning_rate": 0.00019928514797436933, + "loss": 1.5069, + "step": 291 + }, + { + "epoch": 0.00379440682343501, + "grad_norm": 0.24449609220027924, + "learning_rate": 0.00019928254851245793, + "loss": 1.3157, + "step": 292 + }, + { + "epoch": 0.0038074013673508834, + "grad_norm": 0.3006196916103363, + "learning_rate": 0.00019927994905054655, + "loss": 1.4647, + "step": 293 + }, + { + "epoch": 0.003820395911266757, + "grad_norm": 0.2584126591682434, + "learning_rate": 0.00019927734958863515, + "loss": 1.5663, + "step": 294 + }, + { + "epoch": 0.00383339045518263, + "grad_norm": 0.3571203649044037, + "learning_rate": 0.00019927475012672377, + "loss": 1.3851, + "step": 295 + }, + { + "epoch": 0.0038463849990985036, + "grad_norm": 0.31314989924430847, + "learning_rate": 0.0001992721506648124, + "loss": 1.4645, + "step": 296 + }, + { + "epoch": 0.003859379543014377, + "grad_norm": 0.3966493010520935, + "learning_rate": 0.000199269551202901, + "loss": 1.2261, + "step": 297 + }, + { + "epoch": 0.0038723740869302504, + "grad_norm": 0.37772703170776367, + "learning_rate": 0.00019926695174098962, + "loss": 1.6829, + "step": 298 + }, + { + "epoch": 0.0038853686308461233, + "grad_norm": 0.34503284096717834, + "learning_rate": 0.00019926435227907824, + "loss": 1.5732, + "step": 299 + }, + { + "epoch": 0.0038983631747619967, + "grad_norm": 0.3639414608478546, + "learning_rate": 0.00019926175281716687, + "loss": 1.4561, + "step": 300 + }, + { + "epoch": 0.0039113577186778705, + "grad_norm": 0.36225298047065735, + "learning_rate": 0.00019925915335525547, + "loss": 1.5391, + "step": 301 + }, + { + "epoch": 0.0039243522625937435, + "grad_norm": 0.3410588204860687, + "learning_rate": 0.0001992565538933441, + "loss": 1.3652, + "step": 302 + }, + { + "epoch": 0.0039373468065096165, + "grad_norm": 0.3484848141670227, + "learning_rate": 0.00019925395443143272, + "loss": 1.3219, + "step": 303 + }, + { + "epoch": 0.00395034135042549, + "grad_norm": 0.3137170374393463, + "learning_rate": 0.0001992513549695213, + "loss": 1.7442, + "step": 304 + }, + { + "epoch": 0.003963335894341363, + "grad_norm": 0.27639421820640564, + "learning_rate": 0.00019924875550760994, + "loss": 1.4587, + "step": 305 + }, + { + "epoch": 0.003976330438257237, + "grad_norm": 0.4024495482444763, + "learning_rate": 0.00019924615604569854, + "loss": 1.4646, + "step": 306 + }, + { + "epoch": 0.00398932498217311, + "grad_norm": 0.3829977214336395, + "learning_rate": 0.00019924355658378716, + "loss": 1.476, + "step": 307 + }, + { + "epoch": 0.004002319526088983, + "grad_norm": 0.24776104092597961, + "learning_rate": 0.00019924095712187578, + "loss": 1.3956, + "step": 308 + }, + { + "epoch": 0.004015314070004857, + "grad_norm": 0.30229294300079346, + "learning_rate": 0.00019923835765996438, + "loss": 1.4949, + "step": 309 + }, + { + "epoch": 0.00402830861392073, + "grad_norm": 0.2815120220184326, + "learning_rate": 0.000199235758198053, + "loss": 1.4075, + "step": 310 + }, + { + "epoch": 0.004041303157836604, + "grad_norm": 0.3126884698867798, + "learning_rate": 0.00019923315873614163, + "loss": 1.2761, + "step": 311 + }, + { + "epoch": 0.0040542977017524766, + "grad_norm": 0.38102856278419495, + "learning_rate": 0.00019923055927423025, + "loss": 1.4739, + "step": 312 + }, + { + "epoch": 0.00406729224566835, + "grad_norm": 0.2952413260936737, + "learning_rate": 0.00019922795981231885, + "loss": 1.3963, + "step": 313 + }, + { + "epoch": 0.004080286789584223, + "grad_norm": 0.3740508556365967, + "learning_rate": 0.00019922536035040748, + "loss": 1.426, + "step": 314 + }, + { + "epoch": 0.004093281333500096, + "grad_norm": 0.3956066370010376, + "learning_rate": 0.0001992227608884961, + "loss": 1.396, + "step": 315 + }, + { + "epoch": 0.00410627587741597, + "grad_norm": 0.32432445883750916, + "learning_rate": 0.0001992201614265847, + "loss": 1.5289, + "step": 316 + }, + { + "epoch": 0.004119270421331843, + "grad_norm": 0.3189620077610016, + "learning_rate": 0.00019921756196467332, + "loss": 1.6106, + "step": 317 + }, + { + "epoch": 0.004132264965247717, + "grad_norm": 0.27923664450645447, + "learning_rate": 0.00019921496250276195, + "loss": 1.4541, + "step": 318 + }, + { + "epoch": 0.00414525950916359, + "grad_norm": 0.33311569690704346, + "learning_rate": 0.00019921236304085054, + "loss": 1.4914, + "step": 319 + }, + { + "epoch": 0.004158254053079464, + "grad_norm": 0.4120247960090637, + "learning_rate": 0.00019920976357893917, + "loss": 1.5054, + "step": 320 + }, + { + "epoch": 0.004171248596995337, + "grad_norm": 0.37923744320869446, + "learning_rate": 0.00019920716411702777, + "loss": 1.5037, + "step": 321 + }, + { + "epoch": 0.00418424314091121, + "grad_norm": 0.5349377989768982, + "learning_rate": 0.00019920456465511642, + "loss": 1.5083, + "step": 322 + }, + { + "epoch": 0.004197237684827083, + "grad_norm": 0.3078610599040985, + "learning_rate": 0.00019920196519320502, + "loss": 1.2024, + "step": 323 + }, + { + "epoch": 0.004210232228742956, + "grad_norm": 0.30870485305786133, + "learning_rate": 0.00019919936573129364, + "loss": 1.2176, + "step": 324 + }, + { + "epoch": 0.00422322677265883, + "grad_norm": 0.3730783760547638, + "learning_rate": 0.00019919676626938224, + "loss": 1.549, + "step": 325 + }, + { + "epoch": 0.004236221316574703, + "grad_norm": 0.4394433796405792, + "learning_rate": 0.00019919416680747086, + "loss": 1.6255, + "step": 326 + }, + { + "epoch": 0.004249215860490576, + "grad_norm": 0.35662856698036194, + "learning_rate": 0.0001991915673455595, + "loss": 1.5798, + "step": 327 + }, + { + "epoch": 0.00426221040440645, + "grad_norm": 0.38909202814102173, + "learning_rate": 0.00019918896788364808, + "loss": 1.7098, + "step": 328 + }, + { + "epoch": 0.004275204948322323, + "grad_norm": 0.2588720917701721, + "learning_rate": 0.0001991863684217367, + "loss": 1.4841, + "step": 329 + }, + { + "epoch": 0.004288199492238197, + "grad_norm": 0.26081642508506775, + "learning_rate": 0.00019918376895982533, + "loss": 1.3831, + "step": 330 + }, + { + "epoch": 0.00430119403615407, + "grad_norm": 0.3421750068664551, + "learning_rate": 0.00019918116949791396, + "loss": 1.4811, + "step": 331 + }, + { + "epoch": 0.0043141885800699435, + "grad_norm": 0.3355828821659088, + "learning_rate": 0.00019917857003600255, + "loss": 1.4944, + "step": 332 + }, + { + "epoch": 0.0043271831239858165, + "grad_norm": 0.36427485942840576, + "learning_rate": 0.00019917597057409115, + "loss": 1.4734, + "step": 333 + }, + { + "epoch": 0.0043401776679016894, + "grad_norm": 0.23450542986392975, + "learning_rate": 0.0001991733711121798, + "loss": 1.5442, + "step": 334 + }, + { + "epoch": 0.004353172211817563, + "grad_norm": 0.33973386883735657, + "learning_rate": 0.0001991707716502684, + "loss": 1.3849, + "step": 335 + }, + { + "epoch": 0.004366166755733436, + "grad_norm": 0.3666324019432068, + "learning_rate": 0.00019916817218835703, + "loss": 1.4994, + "step": 336 + }, + { + "epoch": 0.00437916129964931, + "grad_norm": 0.3015466332435608, + "learning_rate": 0.00019916557272644562, + "loss": 1.3411, + "step": 337 + }, + { + "epoch": 0.004392155843565183, + "grad_norm": 0.39283299446105957, + "learning_rate": 0.00019916297326453425, + "loss": 1.5291, + "step": 338 + }, + { + "epoch": 0.004405150387481056, + "grad_norm": 0.33914270997047424, + "learning_rate": 0.00019916037380262287, + "loss": 1.5784, + "step": 339 + }, + { + "epoch": 0.00441814493139693, + "grad_norm": 0.3376213014125824, + "learning_rate": 0.00019915777434071147, + "loss": 1.6643, + "step": 340 + }, + { + "epoch": 0.004431139475312803, + "grad_norm": 0.3161202073097229, + "learning_rate": 0.0001991551748788001, + "loss": 1.581, + "step": 341 + }, + { + "epoch": 0.004444134019228677, + "grad_norm": 0.281429260969162, + "learning_rate": 0.00019915257541688872, + "loss": 1.4395, + "step": 342 + }, + { + "epoch": 0.0044571285631445495, + "grad_norm": 0.30229508876800537, + "learning_rate": 0.00019914997595497734, + "loss": 1.3674, + "step": 343 + }, + { + "epoch": 0.004470123107060423, + "grad_norm": 0.38609886169433594, + "learning_rate": 0.00019914737649306594, + "loss": 1.4923, + "step": 344 + }, + { + "epoch": 0.004483117650976296, + "grad_norm": 0.3662254512310028, + "learning_rate": 0.00019914477703115454, + "loss": 1.4679, + "step": 345 + }, + { + "epoch": 0.004496112194892169, + "grad_norm": 0.41322511434555054, + "learning_rate": 0.0001991421775692432, + "loss": 1.6495, + "step": 346 + }, + { + "epoch": 0.004509106738808043, + "grad_norm": 0.33841001987457275, + "learning_rate": 0.0001991395781073318, + "loss": 1.4648, + "step": 347 + }, + { + "epoch": 0.004522101282723916, + "grad_norm": 0.2575288712978363, + "learning_rate": 0.0001991369786454204, + "loss": 1.4663, + "step": 348 + }, + { + "epoch": 0.00453509582663979, + "grad_norm": 0.29992565512657166, + "learning_rate": 0.000199134379183509, + "loss": 1.6909, + "step": 349 + }, + { + "epoch": 0.004548090370555663, + "grad_norm": 0.30328288674354553, + "learning_rate": 0.00019913177972159763, + "loss": 1.3658, + "step": 350 + }, + { + "epoch": 0.004561084914471537, + "grad_norm": 0.35181304812431335, + "learning_rate": 0.00019912918025968626, + "loss": 1.7398, + "step": 351 + }, + { + "epoch": 0.00457407945838741, + "grad_norm": 0.35897067189216614, + "learning_rate": 0.00019912658079777485, + "loss": 1.5741, + "step": 352 + }, + { + "epoch": 0.004587074002303283, + "grad_norm": 0.3016517758369446, + "learning_rate": 0.0001991239813358635, + "loss": 1.448, + "step": 353 + }, + { + "epoch": 0.004600068546219156, + "grad_norm": 0.4207736849784851, + "learning_rate": 0.0001991213818739521, + "loss": 1.6313, + "step": 354 + }, + { + "epoch": 0.004613063090135029, + "grad_norm": 0.5491371154785156, + "learning_rate": 0.00019911878241204073, + "loss": 1.7278, + "step": 355 + }, + { + "epoch": 0.004626057634050903, + "grad_norm": 0.4126145541667938, + "learning_rate": 0.00019911618295012933, + "loss": 1.4285, + "step": 356 + }, + { + "epoch": 0.004639052177966776, + "grad_norm": 0.3121761679649353, + "learning_rate": 0.00019911358348821795, + "loss": 1.5382, + "step": 357 + }, + { + "epoch": 0.004652046721882649, + "grad_norm": 0.35061636567115784, + "learning_rate": 0.00019911098402630657, + "loss": 1.5044, + "step": 358 + }, + { + "epoch": 0.004665041265798523, + "grad_norm": 0.30849018692970276, + "learning_rate": 0.00019910838456439517, + "loss": 1.4529, + "step": 359 + }, + { + "epoch": 0.004678035809714396, + "grad_norm": 0.33758828043937683, + "learning_rate": 0.0001991057851024838, + "loss": 1.489, + "step": 360 + }, + { + "epoch": 0.00469103035363027, + "grad_norm": 0.24260103702545166, + "learning_rate": 0.00019910318564057242, + "loss": 1.2288, + "step": 361 + }, + { + "epoch": 0.004704024897546143, + "grad_norm": 0.30140095949172974, + "learning_rate": 0.00019910058617866102, + "loss": 1.6953, + "step": 362 + }, + { + "epoch": 0.0047170194414620165, + "grad_norm": 0.43354475498199463, + "learning_rate": 0.00019909798671674964, + "loss": 1.5043, + "step": 363 + }, + { + "epoch": 0.0047300139853778895, + "grad_norm": 0.322436660528183, + "learning_rate": 0.00019909538725483824, + "loss": 1.4263, + "step": 364 + }, + { + "epoch": 0.004743008529293762, + "grad_norm": 0.3592729866504669, + "learning_rate": 0.0001990927877929269, + "loss": 1.4993, + "step": 365 + }, + { + "epoch": 0.004756003073209636, + "grad_norm": 0.3389509320259094, + "learning_rate": 0.0001990901883310155, + "loss": 1.5002, + "step": 366 + }, + { + "epoch": 0.004768997617125509, + "grad_norm": 0.35353222489356995, + "learning_rate": 0.00019908758886910411, + "loss": 1.4997, + "step": 367 + }, + { + "epoch": 0.004781992161041383, + "grad_norm": 0.2971359193325043, + "learning_rate": 0.0001990849894071927, + "loss": 1.5266, + "step": 368 + }, + { + "epoch": 0.004794986704957256, + "grad_norm": 0.2539112865924835, + "learning_rate": 0.00019908238994528134, + "loss": 1.4109, + "step": 369 + }, + { + "epoch": 0.00480798124887313, + "grad_norm": 0.3044717013835907, + "learning_rate": 0.00019907979048336996, + "loss": 1.3098, + "step": 370 + }, + { + "epoch": 0.004820975792789003, + "grad_norm": 0.2754577100276947, + "learning_rate": 0.00019907719102145856, + "loss": 1.2542, + "step": 371 + }, + { + "epoch": 0.004833970336704876, + "grad_norm": 0.4150601029396057, + "learning_rate": 0.00019907459155954718, + "loss": 1.629, + "step": 372 + }, + { + "epoch": 0.0048469648806207495, + "grad_norm": 0.4287967085838318, + "learning_rate": 0.0001990719920976358, + "loss": 1.5594, + "step": 373 + }, + { + "epoch": 0.0048599594245366225, + "grad_norm": 0.4358462691307068, + "learning_rate": 0.0001990693926357244, + "loss": 1.4678, + "step": 374 + }, + { + "epoch": 0.004872953968452496, + "grad_norm": 0.36713430285453796, + "learning_rate": 0.00019906679317381303, + "loss": 1.5234, + "step": 375 + }, + { + "epoch": 0.004885948512368369, + "grad_norm": 0.26237761974334717, + "learning_rate": 0.00019906419371190163, + "loss": 1.1133, + "step": 376 + }, + { + "epoch": 0.004898943056284242, + "grad_norm": 0.34526896476745605, + "learning_rate": 0.00019906159424999028, + "loss": 1.5015, + "step": 377 + }, + { + "epoch": 0.004911937600200116, + "grad_norm": 0.32094281911849976, + "learning_rate": 0.00019905899478807887, + "loss": 1.3854, + "step": 378 + }, + { + "epoch": 0.004924932144115989, + "grad_norm": 0.36162108182907104, + "learning_rate": 0.0001990563953261675, + "loss": 1.4663, + "step": 379 + }, + { + "epoch": 0.004937926688031863, + "grad_norm": 0.310148149728775, + "learning_rate": 0.0001990537958642561, + "loss": 1.3797, + "step": 380 + }, + { + "epoch": 0.004950921231947736, + "grad_norm": 0.30552294850349426, + "learning_rate": 0.00019905119640234472, + "loss": 1.5114, + "step": 381 + }, + { + "epoch": 0.00496391577586361, + "grad_norm": 0.4390285015106201, + "learning_rate": 0.00019904859694043335, + "loss": 1.6308, + "step": 382 + }, + { + "epoch": 0.004976910319779483, + "grad_norm": 0.4116781949996948, + "learning_rate": 0.00019904599747852194, + "loss": 1.5106, + "step": 383 + }, + { + "epoch": 0.0049899048636953556, + "grad_norm": 0.4091288149356842, + "learning_rate": 0.00019904339801661057, + "loss": 1.4398, + "step": 384 + }, + { + "epoch": 0.005002899407611229, + "grad_norm": 0.3653791546821594, + "learning_rate": 0.0001990407985546992, + "loss": 1.6652, + "step": 385 + }, + { + "epoch": 0.005015893951527102, + "grad_norm": 0.2884867191314697, + "learning_rate": 0.00019903819909278782, + "loss": 1.5647, + "step": 386 + }, + { + "epoch": 0.005028888495442976, + "grad_norm": 0.4264460504055023, + "learning_rate": 0.0001990355996308764, + "loss": 1.57, + "step": 387 + }, + { + "epoch": 0.005041883039358849, + "grad_norm": 0.418401837348938, + "learning_rate": 0.00019903300016896504, + "loss": 1.5994, + "step": 388 + }, + { + "epoch": 0.005054877583274723, + "grad_norm": 0.3586342930793762, + "learning_rate": 0.00019903040070705366, + "loss": 1.4049, + "step": 389 + }, + { + "epoch": 0.005067872127190596, + "grad_norm": 0.36481258273124695, + "learning_rate": 0.00019902780124514226, + "loss": 1.5148, + "step": 390 + }, + { + "epoch": 0.005080866671106469, + "grad_norm": 0.2749533951282501, + "learning_rate": 0.00019902520178323088, + "loss": 1.5359, + "step": 391 + }, + { + "epoch": 0.005093861215022343, + "grad_norm": 0.29476672410964966, + "learning_rate": 0.0001990226023213195, + "loss": 1.3621, + "step": 392 + }, + { + "epoch": 0.005106855758938216, + "grad_norm": 0.382303923368454, + "learning_rate": 0.0001990200028594081, + "loss": 1.4162, + "step": 393 + }, + { + "epoch": 0.0051198503028540895, + "grad_norm": 0.3639098107814789, + "learning_rate": 0.00019901740339749673, + "loss": 1.4234, + "step": 394 + }, + { + "epoch": 0.005132844846769962, + "grad_norm": 0.3024827241897583, + "learning_rate": 0.00019901480393558533, + "loss": 1.4654, + "step": 395 + }, + { + "epoch": 0.005145839390685835, + "grad_norm": 0.3591507375240326, + "learning_rate": 0.00019901220447367398, + "loss": 1.5982, + "step": 396 + }, + { + "epoch": 0.005158833934601709, + "grad_norm": 0.2867201566696167, + "learning_rate": 0.00019900960501176258, + "loss": 1.4561, + "step": 397 + }, + { + "epoch": 0.005171828478517582, + "grad_norm": 0.24428586661815643, + "learning_rate": 0.0001990070055498512, + "loss": 1.4842, + "step": 398 + }, + { + "epoch": 0.005184823022433456, + "grad_norm": 0.3388294577598572, + "learning_rate": 0.0001990044060879398, + "loss": 1.4982, + "step": 399 + }, + { + "epoch": 0.005197817566349329, + "grad_norm": 0.3624451458454132, + "learning_rate": 0.00019900180662602842, + "loss": 1.4539, + "step": 400 + }, + { + "epoch": 0.005210812110265203, + "grad_norm": 0.4799981415271759, + "learning_rate": 0.00019899920716411705, + "loss": 1.5812, + "step": 401 + }, + { + "epoch": 0.005223806654181076, + "grad_norm": 0.3657929301261902, + "learning_rate": 0.00019899660770220565, + "loss": 1.5602, + "step": 402 + }, + { + "epoch": 0.005236801198096949, + "grad_norm": 0.38014063239097595, + "learning_rate": 0.00019899400824029427, + "loss": 1.6159, + "step": 403 + }, + { + "epoch": 0.0052497957420128225, + "grad_norm": 0.392153263092041, + "learning_rate": 0.0001989914087783829, + "loss": 1.5834, + "step": 404 + }, + { + "epoch": 0.0052627902859286955, + "grad_norm": 0.39197036623954773, + "learning_rate": 0.0001989888093164715, + "loss": 1.5262, + "step": 405 + }, + { + "epoch": 0.005275784829844569, + "grad_norm": 0.36897343397140503, + "learning_rate": 0.00019898620985456012, + "loss": 1.4581, + "step": 406 + }, + { + "epoch": 0.005288779373760442, + "grad_norm": 0.3160710632801056, + "learning_rate": 0.0001989836103926487, + "loss": 1.3241, + "step": 407 + }, + { + "epoch": 0.005301773917676316, + "grad_norm": 0.32507070899009705, + "learning_rate": 0.00019898101093073737, + "loss": 1.4533, + "step": 408 + }, + { + "epoch": 0.005314768461592189, + "grad_norm": 0.3860207498073578, + "learning_rate": 0.00019897841146882596, + "loss": 1.6103, + "step": 409 + }, + { + "epoch": 0.005327763005508062, + "grad_norm": 0.28321462869644165, + "learning_rate": 0.0001989758120069146, + "loss": 1.4407, + "step": 410 + }, + { + "epoch": 0.005340757549423936, + "grad_norm": 0.36039674282073975, + "learning_rate": 0.00019897321254500318, + "loss": 1.5651, + "step": 411 + }, + { + "epoch": 0.005353752093339809, + "grad_norm": 0.38339999318122864, + "learning_rate": 0.0001989706130830918, + "loss": 1.4411, + "step": 412 + }, + { + "epoch": 0.005366746637255683, + "grad_norm": 0.3544124960899353, + "learning_rate": 0.00019896801362118043, + "loss": 1.4977, + "step": 413 + }, + { + "epoch": 0.005379741181171556, + "grad_norm": 0.3119560480117798, + "learning_rate": 0.00019896541415926903, + "loss": 1.5812, + "step": 414 + }, + { + "epoch": 0.0053927357250874285, + "grad_norm": 0.41256198287010193, + "learning_rate": 0.00019896281469735766, + "loss": 1.5193, + "step": 415 + }, + { + "epoch": 0.005405730269003302, + "grad_norm": 0.3362357020378113, + "learning_rate": 0.00019896021523544628, + "loss": 1.3828, + "step": 416 + }, + { + "epoch": 0.005418724812919175, + "grad_norm": 0.28691500425338745, + "learning_rate": 0.00019895761577353488, + "loss": 1.3371, + "step": 417 + }, + { + "epoch": 0.005431719356835049, + "grad_norm": 0.31148120760917664, + "learning_rate": 0.0001989550163116235, + "loss": 1.4256, + "step": 418 + }, + { + "epoch": 0.005444713900750922, + "grad_norm": 0.3507455587387085, + "learning_rate": 0.0001989524168497121, + "loss": 1.5591, + "step": 419 + }, + { + "epoch": 0.005457708444666796, + "grad_norm": 0.5161775946617126, + "learning_rate": 0.00019894981738780075, + "loss": 1.7213, + "step": 420 + }, + { + "epoch": 0.005470702988582669, + "grad_norm": 0.4539799094200134, + "learning_rate": 0.00019894721792588935, + "loss": 1.5054, + "step": 421 + }, + { + "epoch": 0.005483697532498542, + "grad_norm": 0.38446927070617676, + "learning_rate": 0.00019894461846397797, + "loss": 1.6458, + "step": 422 + }, + { + "epoch": 0.005496692076414416, + "grad_norm": 0.2804405987262726, + "learning_rate": 0.00019894201900206657, + "loss": 1.2219, + "step": 423 + }, + { + "epoch": 0.005509686620330289, + "grad_norm": 0.3562676012516022, + "learning_rate": 0.0001989394195401552, + "loss": 1.3673, + "step": 424 + }, + { + "epoch": 0.0055226811642461624, + "grad_norm": 0.39644521474838257, + "learning_rate": 0.00019893682007824382, + "loss": 1.6778, + "step": 425 + }, + { + "epoch": 0.005535675708162035, + "grad_norm": 0.35181400179862976, + "learning_rate": 0.00019893422061633242, + "loss": 1.4347, + "step": 426 + }, + { + "epoch": 0.005548670252077908, + "grad_norm": 0.33172330260276794, + "learning_rate": 0.00019893162115442107, + "loss": 1.5032, + "step": 427 + }, + { + "epoch": 0.005561664795993782, + "grad_norm": 0.29729771614074707, + "learning_rate": 0.00019892902169250966, + "loss": 1.3403, + "step": 428 + }, + { + "epoch": 0.005574659339909655, + "grad_norm": 0.4405791461467743, + "learning_rate": 0.00019892642223059826, + "loss": 1.7167, + "step": 429 + }, + { + "epoch": 0.005587653883825529, + "grad_norm": 0.31268784403800964, + "learning_rate": 0.0001989238227686869, + "loss": 1.3955, + "step": 430 + }, + { + "epoch": 0.005600648427741402, + "grad_norm": 0.2711225748062134, + "learning_rate": 0.0001989212233067755, + "loss": 1.5813, + "step": 431 + }, + { + "epoch": 0.005613642971657276, + "grad_norm": 0.3970961570739746, + "learning_rate": 0.00019891862384486414, + "loss": 1.4497, + "step": 432 + }, + { + "epoch": 0.005626637515573149, + "grad_norm": 0.3672778308391571, + "learning_rate": 0.00019891602438295273, + "loss": 1.4871, + "step": 433 + }, + { + "epoch": 0.005639632059489022, + "grad_norm": 0.3104008138179779, + "learning_rate": 0.00019891342492104136, + "loss": 1.4096, + "step": 434 + }, + { + "epoch": 0.0056526266034048955, + "grad_norm": 0.4113682806491852, + "learning_rate": 0.00019891082545912998, + "loss": 1.599, + "step": 435 + }, + { + "epoch": 0.0056656211473207685, + "grad_norm": 0.3929567337036133, + "learning_rate": 0.00019890822599721858, + "loss": 1.5196, + "step": 436 + }, + { + "epoch": 0.005678615691236642, + "grad_norm": 0.47488945722579956, + "learning_rate": 0.0001989056265353072, + "loss": 1.6349, + "step": 437 + }, + { + "epoch": 0.005691610235152515, + "grad_norm": 0.3263094127178192, + "learning_rate": 0.0001989030270733958, + "loss": 1.4518, + "step": 438 + }, + { + "epoch": 0.005704604779068389, + "grad_norm": 0.3672201633453369, + "learning_rate": 0.00019890042761148445, + "loss": 1.5772, + "step": 439 + }, + { + "epoch": 0.005717599322984262, + "grad_norm": 0.4115719199180603, + "learning_rate": 0.00019889782814957305, + "loss": 1.7124, + "step": 440 + }, + { + "epoch": 0.005730593866900135, + "grad_norm": 0.31117019057273865, + "learning_rate": 0.00019889522868766165, + "loss": 1.5545, + "step": 441 + }, + { + "epoch": 0.005743588410816009, + "grad_norm": 0.43034565448760986, + "learning_rate": 0.00019889262922575027, + "loss": 1.5853, + "step": 442 + }, + { + "epoch": 0.005756582954731882, + "grad_norm": 0.3980412781238556, + "learning_rate": 0.0001988900297638389, + "loss": 1.5383, + "step": 443 + }, + { + "epoch": 0.005769577498647756, + "grad_norm": 0.3163507282733917, + "learning_rate": 0.00019888743030192752, + "loss": 1.3385, + "step": 444 + }, + { + "epoch": 0.0057825720425636285, + "grad_norm": 0.37393853068351746, + "learning_rate": 0.00019888483084001612, + "loss": 1.4841, + "step": 445 + }, + { + "epoch": 0.0057955665864795015, + "grad_norm": 0.3804837763309479, + "learning_rate": 0.00019888223137810474, + "loss": 1.4943, + "step": 446 + }, + { + "epoch": 0.005808561130395375, + "grad_norm": 0.3611343204975128, + "learning_rate": 0.00019887963191619337, + "loss": 1.5363, + "step": 447 + }, + { + "epoch": 0.005821555674311248, + "grad_norm": 0.284523069858551, + "learning_rate": 0.00019887703245428196, + "loss": 1.5203, + "step": 448 + }, + { + "epoch": 0.005834550218227122, + "grad_norm": 0.398366779088974, + "learning_rate": 0.0001988744329923706, + "loss": 1.6513, + "step": 449 + }, + { + "epoch": 0.005847544762142995, + "grad_norm": 0.3861139416694641, + "learning_rate": 0.0001988718335304592, + "loss": 1.4824, + "step": 450 + }, + { + "epoch": 0.005860539306058869, + "grad_norm": 0.32513299584388733, + "learning_rate": 0.00019886923406854784, + "loss": 1.409, + "step": 451 + }, + { + "epoch": 0.005873533849974742, + "grad_norm": 0.3809260427951813, + "learning_rate": 0.00019886663460663644, + "loss": 1.5881, + "step": 452 + }, + { + "epoch": 0.005886528393890615, + "grad_norm": 0.3487527072429657, + "learning_rate": 0.00019886403514472506, + "loss": 1.5019, + "step": 453 + }, + { + "epoch": 0.005899522937806489, + "grad_norm": 0.31411534547805786, + "learning_rate": 0.00019886143568281366, + "loss": 1.5802, + "step": 454 + }, + { + "epoch": 0.005912517481722362, + "grad_norm": 0.4067882001399994, + "learning_rate": 0.00019885883622090228, + "loss": 1.4341, + "step": 455 + }, + { + "epoch": 0.005925512025638235, + "grad_norm": 0.32054728269577026, + "learning_rate": 0.0001988562367589909, + "loss": 1.4677, + "step": 456 + }, + { + "epoch": 0.005938506569554108, + "grad_norm": 0.336537629365921, + "learning_rate": 0.0001988536372970795, + "loss": 1.7248, + "step": 457 + }, + { + "epoch": 0.005951501113469982, + "grad_norm": 0.34672510623931885, + "learning_rate": 0.00019885103783516813, + "loss": 1.3866, + "step": 458 + }, + { + "epoch": 0.005964495657385855, + "grad_norm": 0.38182681798934937, + "learning_rate": 0.00019884843837325675, + "loss": 1.5306, + "step": 459 + }, + { + "epoch": 0.005977490201301728, + "grad_norm": 0.3976811468601227, + "learning_rate": 0.00019884583891134535, + "loss": 1.7088, + "step": 460 + }, + { + "epoch": 0.005990484745217602, + "grad_norm": 0.34490296244621277, + "learning_rate": 0.00019884323944943397, + "loss": 1.6008, + "step": 461 + }, + { + "epoch": 0.006003479289133475, + "grad_norm": 0.4401227831840515, + "learning_rate": 0.0001988406399875226, + "loss": 1.4556, + "step": 462 + }, + { + "epoch": 0.006016473833049349, + "grad_norm": 0.32190635800361633, + "learning_rate": 0.00019883804052561122, + "loss": 1.436, + "step": 463 + }, + { + "epoch": 0.006029468376965222, + "grad_norm": 0.39786359667778015, + "learning_rate": 0.00019883544106369982, + "loss": 1.9256, + "step": 464 + }, + { + "epoch": 0.006042462920881095, + "grad_norm": 0.32083266973495483, + "learning_rate": 0.00019883284160178845, + "loss": 1.5449, + "step": 465 + }, + { + "epoch": 0.0060554574647969685, + "grad_norm": 0.37164804339408875, + "learning_rate": 0.00019883024213987707, + "loss": 1.4283, + "step": 466 + }, + { + "epoch": 0.006068452008712841, + "grad_norm": 0.32890164852142334, + "learning_rate": 0.00019882764267796567, + "loss": 1.3723, + "step": 467 + }, + { + "epoch": 0.006081446552628715, + "grad_norm": 0.3165036737918854, + "learning_rate": 0.0001988250432160543, + "loss": 1.2878, + "step": 468 + }, + { + "epoch": 0.006094441096544588, + "grad_norm": 0.287625789642334, + "learning_rate": 0.0001988224437541429, + "loss": 1.5451, + "step": 469 + }, + { + "epoch": 0.006107435640460462, + "grad_norm": 0.31385481357574463, + "learning_rate": 0.00019881984429223154, + "loss": 1.3884, + "step": 470 + }, + { + "epoch": 0.006120430184376335, + "grad_norm": 0.48251134157180786, + "learning_rate": 0.00019881724483032014, + "loss": 1.4744, + "step": 471 + }, + { + "epoch": 0.006133424728292208, + "grad_norm": 0.38446444272994995, + "learning_rate": 0.00019881464536840874, + "loss": 1.5976, + "step": 472 + }, + { + "epoch": 0.006146419272208082, + "grad_norm": 0.40736618638038635, + "learning_rate": 0.00019881204590649736, + "loss": 1.6116, + "step": 473 + }, + { + "epoch": 0.006159413816123955, + "grad_norm": 0.3143475651741028, + "learning_rate": 0.00019880944644458598, + "loss": 1.5222, + "step": 474 + }, + { + "epoch": 0.006172408360039829, + "grad_norm": 0.4445915222167969, + "learning_rate": 0.0001988068469826746, + "loss": 1.5871, + "step": 475 + }, + { + "epoch": 0.0061854029039557015, + "grad_norm": 0.29055100679397583, + "learning_rate": 0.0001988042475207632, + "loss": 1.6007, + "step": 476 + }, + { + "epoch": 0.006198397447871575, + "grad_norm": 0.3698204457759857, + "learning_rate": 0.00019880164805885183, + "loss": 1.5176, + "step": 477 + }, + { + "epoch": 0.006211391991787448, + "grad_norm": 0.31237465143203735, + "learning_rate": 0.00019879904859694046, + "loss": 1.4398, + "step": 478 + }, + { + "epoch": 0.006224386535703321, + "grad_norm": 0.3039858043193817, + "learning_rate": 0.00019879644913502905, + "loss": 1.3464, + "step": 479 + }, + { + "epoch": 0.006237381079619195, + "grad_norm": 0.40471094846725464, + "learning_rate": 0.00019879384967311768, + "loss": 1.7087, + "step": 480 + }, + { + "epoch": 0.006250375623535068, + "grad_norm": 0.360324889421463, + "learning_rate": 0.00019879125021120627, + "loss": 1.523, + "step": 481 + }, + { + "epoch": 0.006263370167450942, + "grad_norm": 0.3731452226638794, + "learning_rate": 0.00019878865074929493, + "loss": 1.5483, + "step": 482 + }, + { + "epoch": 0.006276364711366815, + "grad_norm": 0.1972702145576477, + "learning_rate": 0.00019878605128738352, + "loss": 1.2281, + "step": 483 + }, + { + "epoch": 0.006289359255282688, + "grad_norm": 0.36422380805015564, + "learning_rate": 0.00019878345182547212, + "loss": 1.4624, + "step": 484 + }, + { + "epoch": 0.006302353799198562, + "grad_norm": 0.366229772567749, + "learning_rate": 0.00019878085236356075, + "loss": 1.4592, + "step": 485 + }, + { + "epoch": 0.006315348343114435, + "grad_norm": 0.295625239610672, + "learning_rate": 0.00019877825290164937, + "loss": 1.4777, + "step": 486 + }, + { + "epoch": 0.006328342887030308, + "grad_norm": 0.29468804597854614, + "learning_rate": 0.000198775653439738, + "loss": 1.4056, + "step": 487 + }, + { + "epoch": 0.006341337430946181, + "grad_norm": 0.4027608036994934, + "learning_rate": 0.0001987730539778266, + "loss": 1.5459, + "step": 488 + }, + { + "epoch": 0.006354331974862055, + "grad_norm": 0.39129120111465454, + "learning_rate": 0.00019877045451591522, + "loss": 1.4901, + "step": 489 + }, + { + "epoch": 0.006367326518777928, + "grad_norm": 0.2942184507846832, + "learning_rate": 0.00019876785505400384, + "loss": 1.5704, + "step": 490 + }, + { + "epoch": 0.006380321062693801, + "grad_norm": 0.3314521014690399, + "learning_rate": 0.00019876525559209244, + "loss": 1.4116, + "step": 491 + }, + { + "epoch": 0.006393315606609675, + "grad_norm": 0.3074198365211487, + "learning_rate": 0.00019876265613018106, + "loss": 1.1821, + "step": 492 + }, + { + "epoch": 0.006406310150525548, + "grad_norm": 0.2810860574245453, + "learning_rate": 0.00019876005666826966, + "loss": 1.472, + "step": 493 + }, + { + "epoch": 0.006419304694441422, + "grad_norm": 0.35208213329315186, + "learning_rate": 0.0001987574572063583, + "loss": 1.5873, + "step": 494 + }, + { + "epoch": 0.006432299238357295, + "grad_norm": 0.3943978548049927, + "learning_rate": 0.0001987548577444469, + "loss": 1.6375, + "step": 495 + }, + { + "epoch": 0.006445293782273168, + "grad_norm": 0.32735174894332886, + "learning_rate": 0.0001987522582825355, + "loss": 1.5516, + "step": 496 + }, + { + "epoch": 0.0064582883261890415, + "grad_norm": 0.361736923456192, + "learning_rate": 0.00019874965882062413, + "loss": 1.5815, + "step": 497 + }, + { + "epoch": 0.006471282870104914, + "grad_norm": 0.3275897800922394, + "learning_rate": 0.00019874705935871276, + "loss": 1.5671, + "step": 498 + }, + { + "epoch": 0.006484277414020788, + "grad_norm": 0.27669093012809753, + "learning_rate": 0.00019874445989680138, + "loss": 1.5036, + "step": 499 + }, + { + "epoch": 0.006497271957936661, + "grad_norm": 0.39098384976387024, + "learning_rate": 0.00019874186043488998, + "loss": 1.661, + "step": 500 + }, + { + "epoch": 0.006510266501852535, + "grad_norm": 0.32691437005996704, + "learning_rate": 0.0001987392609729786, + "loss": 1.3988, + "step": 501 + }, + { + "epoch": 0.006523261045768408, + "grad_norm": 0.37341952323913574, + "learning_rate": 0.00019873666151106723, + "loss": 1.4535, + "step": 502 + }, + { + "epoch": 0.006536255589684281, + "grad_norm": 0.3480914235115051, + "learning_rate": 0.00019873406204915582, + "loss": 1.419, + "step": 503 + }, + { + "epoch": 0.006549250133600155, + "grad_norm": 0.41433480381965637, + "learning_rate": 0.00019873146258724445, + "loss": 1.6009, + "step": 504 + }, + { + "epoch": 0.006562244677516028, + "grad_norm": 0.31330931186676025, + "learning_rate": 0.00019872886312533307, + "loss": 1.2479, + "step": 505 + }, + { + "epoch": 0.0065752392214319015, + "grad_norm": 0.32603132724761963, + "learning_rate": 0.0001987262636634217, + "loss": 1.3484, + "step": 506 + }, + { + "epoch": 0.0065882337653477745, + "grad_norm": 0.33049696683883667, + "learning_rate": 0.0001987236642015103, + "loss": 1.5938, + "step": 507 + }, + { + "epoch": 0.006601228309263648, + "grad_norm": 0.31383728981018066, + "learning_rate": 0.00019872106473959892, + "loss": 1.4817, + "step": 508 + }, + { + "epoch": 0.006614222853179521, + "grad_norm": 0.35135915875434875, + "learning_rate": 0.00019871846527768754, + "loss": 1.34, + "step": 509 + }, + { + "epoch": 0.006627217397095394, + "grad_norm": 0.32868465781211853, + "learning_rate": 0.00019871586581577614, + "loss": 1.5162, + "step": 510 + }, + { + "epoch": 0.006640211941011268, + "grad_norm": 0.3015967011451721, + "learning_rate": 0.00019871326635386477, + "loss": 1.3572, + "step": 511 + }, + { + "epoch": 0.006653206484927141, + "grad_norm": 0.4350413978099823, + "learning_rate": 0.00019871066689195336, + "loss": 1.5271, + "step": 512 + }, + { + "epoch": 0.006666201028843015, + "grad_norm": 0.42811042070388794, + "learning_rate": 0.000198708067430042, + "loss": 1.4912, + "step": 513 + }, + { + "epoch": 0.006679195572758888, + "grad_norm": 0.389029324054718, + "learning_rate": 0.0001987054679681306, + "loss": 1.5096, + "step": 514 + }, + { + "epoch": 0.006692190116674761, + "grad_norm": 0.26462414860725403, + "learning_rate": 0.0001987028685062192, + "loss": 1.1706, + "step": 515 + }, + { + "epoch": 0.006705184660590635, + "grad_norm": 0.3653124272823334, + "learning_rate": 0.00019870026904430783, + "loss": 1.5413, + "step": 516 + }, + { + "epoch": 0.0067181792045065076, + "grad_norm": 0.35684728622436523, + "learning_rate": 0.00019869766958239646, + "loss": 1.5449, + "step": 517 + }, + { + "epoch": 0.006731173748422381, + "grad_norm": 0.37269213795661926, + "learning_rate": 0.00019869507012048508, + "loss": 1.6315, + "step": 518 + }, + { + "epoch": 0.006744168292338254, + "grad_norm": 0.4106650948524475, + "learning_rate": 0.00019869247065857368, + "loss": 1.4719, + "step": 519 + }, + { + "epoch": 0.006757162836254128, + "grad_norm": 0.3294067680835724, + "learning_rate": 0.0001986898711966623, + "loss": 1.5744, + "step": 520 + }, + { + "epoch": 0.006770157380170001, + "grad_norm": 0.3181130290031433, + "learning_rate": 0.00019868727173475093, + "loss": 1.5229, + "step": 521 + }, + { + "epoch": 0.006783151924085874, + "grad_norm": 0.24196115136146545, + "learning_rate": 0.00019868467227283953, + "loss": 1.5383, + "step": 522 + }, + { + "epoch": 0.006796146468001748, + "grad_norm": 0.28848740458488464, + "learning_rate": 0.00019868207281092815, + "loss": 1.517, + "step": 523 + }, + { + "epoch": 0.006809141011917621, + "grad_norm": 0.37162500619888306, + "learning_rate": 0.00019867947334901675, + "loss": 1.4766, + "step": 524 + }, + { + "epoch": 0.006822135555833495, + "grad_norm": 0.4149519205093384, + "learning_rate": 0.00019867687388710537, + "loss": 1.4985, + "step": 525 + }, + { + "epoch": 0.006835130099749368, + "grad_norm": 0.4525688886642456, + "learning_rate": 0.000198674274425194, + "loss": 1.6895, + "step": 526 + }, + { + "epoch": 0.0068481246436652415, + "grad_norm": 0.3126329481601715, + "learning_rate": 0.0001986716749632826, + "loss": 1.3427, + "step": 527 + }, + { + "epoch": 0.006861119187581114, + "grad_norm": 0.49626997113227844, + "learning_rate": 0.00019866907550137122, + "loss": 1.5601, + "step": 528 + }, + { + "epoch": 0.006874113731496987, + "grad_norm": 0.33315759897232056, + "learning_rate": 0.00019866647603945984, + "loss": 1.3841, + "step": 529 + }, + { + "epoch": 0.006887108275412861, + "grad_norm": 0.33371663093566895, + "learning_rate": 0.00019866387657754847, + "loss": 1.5505, + "step": 530 + }, + { + "epoch": 0.006900102819328734, + "grad_norm": 0.3767539858818054, + "learning_rate": 0.00019866127711563707, + "loss": 1.5361, + "step": 531 + }, + { + "epoch": 0.006913097363244608, + "grad_norm": 0.38601014018058777, + "learning_rate": 0.0001986586776537257, + "loss": 1.469, + "step": 532 + }, + { + "epoch": 0.006926091907160481, + "grad_norm": 0.31352558732032776, + "learning_rate": 0.00019865607819181431, + "loss": 1.4213, + "step": 533 + }, + { + "epoch": 0.006939086451076354, + "grad_norm": 0.41270455718040466, + "learning_rate": 0.0001986534787299029, + "loss": 1.5892, + "step": 534 + }, + { + "epoch": 0.006952080994992228, + "grad_norm": 0.3458991050720215, + "learning_rate": 0.00019865087926799154, + "loss": 1.3762, + "step": 535 + }, + { + "epoch": 0.006965075538908101, + "grad_norm": 0.35102227330207825, + "learning_rate": 0.00019864827980608016, + "loss": 1.4757, + "step": 536 + }, + { + "epoch": 0.0069780700828239745, + "grad_norm": 0.3958595395088196, + "learning_rate": 0.00019864568034416879, + "loss": 1.5713, + "step": 537 + }, + { + "epoch": 0.0069910646267398475, + "grad_norm": 0.43266427516937256, + "learning_rate": 0.00019864308088225738, + "loss": 1.5958, + "step": 538 + }, + { + "epoch": 0.007004059170655721, + "grad_norm": 0.30778467655181885, + "learning_rate": 0.00019864048142034598, + "loss": 1.3696, + "step": 539 + }, + { + "epoch": 0.007017053714571594, + "grad_norm": 0.34286659955978394, + "learning_rate": 0.00019863788195843463, + "loss": 1.5121, + "step": 540 + }, + { + "epoch": 0.007030048258487467, + "grad_norm": 0.370387464761734, + "learning_rate": 0.00019863528249652323, + "loss": 1.6041, + "step": 541 + }, + { + "epoch": 0.007043042802403341, + "grad_norm": 0.35693231225013733, + "learning_rate": 0.00019863268303461185, + "loss": 1.2903, + "step": 542 + }, + { + "epoch": 0.007056037346319214, + "grad_norm": 0.39616748690605164, + "learning_rate": 0.00019863008357270045, + "loss": 1.4348, + "step": 543 + }, + { + "epoch": 0.007069031890235088, + "grad_norm": 0.4330069422721863, + "learning_rate": 0.00019862748411078908, + "loss": 1.581, + "step": 544 + }, + { + "epoch": 0.007082026434150961, + "grad_norm": 0.38319265842437744, + "learning_rate": 0.0001986248846488777, + "loss": 1.4199, + "step": 545 + }, + { + "epoch": 0.007095020978066835, + "grad_norm": 0.3927115201950073, + "learning_rate": 0.0001986222851869663, + "loss": 1.717, + "step": 546 + }, + { + "epoch": 0.007108015521982708, + "grad_norm": 0.3346995711326599, + "learning_rate": 0.00019861968572505492, + "loss": 1.5346, + "step": 547 + }, + { + "epoch": 0.0071210100658985805, + "grad_norm": 0.3665822744369507, + "learning_rate": 0.00019861708626314355, + "loss": 1.4155, + "step": 548 + }, + { + "epoch": 0.007134004609814454, + "grad_norm": 0.3435610234737396, + "learning_rate": 0.00019861448680123217, + "loss": 1.5495, + "step": 549 + }, + { + "epoch": 0.007146999153730327, + "grad_norm": 0.382671058177948, + "learning_rate": 0.00019861188733932077, + "loss": 1.445, + "step": 550 + }, + { + "epoch": 0.007159993697646201, + "grad_norm": 0.29679736495018005, + "learning_rate": 0.00019860928787740937, + "loss": 1.5509, + "step": 551 + }, + { + "epoch": 0.007172988241562074, + "grad_norm": 0.41203227639198303, + "learning_rate": 0.00019860668841549802, + "loss": 1.5446, + "step": 552 + }, + { + "epoch": 0.007185982785477947, + "grad_norm": 0.4276414215564728, + "learning_rate": 0.00019860408895358661, + "loss": 1.556, + "step": 553 + }, + { + "epoch": 0.007198977329393821, + "grad_norm": 0.30258363485336304, + "learning_rate": 0.00019860148949167524, + "loss": 1.3663, + "step": 554 + }, + { + "epoch": 0.007211971873309694, + "grad_norm": 0.4531673192977905, + "learning_rate": 0.00019859889002976384, + "loss": 1.5232, + "step": 555 + }, + { + "epoch": 0.007224966417225568, + "grad_norm": 0.38264134526252747, + "learning_rate": 0.00019859629056785246, + "loss": 1.4465, + "step": 556 + }, + { + "epoch": 0.007237960961141441, + "grad_norm": 0.3618118464946747, + "learning_rate": 0.00019859369110594109, + "loss": 1.4752, + "step": 557 + }, + { + "epoch": 0.0072509555050573144, + "grad_norm": 0.28785741329193115, + "learning_rate": 0.00019859109164402968, + "loss": 1.2222, + "step": 558 + }, + { + "epoch": 0.007263950048973187, + "grad_norm": 0.38898271322250366, + "learning_rate": 0.0001985884921821183, + "loss": 1.3929, + "step": 559 + }, + { + "epoch": 0.00727694459288906, + "grad_norm": 0.32042554020881653, + "learning_rate": 0.00019858589272020693, + "loss": 1.5251, + "step": 560 + }, + { + "epoch": 0.007289939136804934, + "grad_norm": 0.3364934027194977, + "learning_rate": 0.00019858329325829556, + "loss": 1.5811, + "step": 561 + }, + { + "epoch": 0.007302933680720807, + "grad_norm": 0.36081287264823914, + "learning_rate": 0.00019858069379638415, + "loss": 1.6106, + "step": 562 + }, + { + "epoch": 0.007315928224636681, + "grad_norm": 0.33980289101600647, + "learning_rate": 0.00019857809433447275, + "loss": 1.3746, + "step": 563 + }, + { + "epoch": 0.007328922768552554, + "grad_norm": 0.3250245749950409, + "learning_rate": 0.0001985754948725614, + "loss": 1.3789, + "step": 564 + }, + { + "epoch": 0.007341917312468427, + "grad_norm": 0.3220341205596924, + "learning_rate": 0.00019857289541065, + "loss": 1.7125, + "step": 565 + }, + { + "epoch": 0.007354911856384301, + "grad_norm": 0.3877193033695221, + "learning_rate": 0.00019857029594873862, + "loss": 1.6931, + "step": 566 + }, + { + "epoch": 0.007367906400300174, + "grad_norm": 0.3251323997974396, + "learning_rate": 0.00019856769648682722, + "loss": 1.396, + "step": 567 + }, + { + "epoch": 0.0073809009442160475, + "grad_norm": 0.294418066740036, + "learning_rate": 0.00019856509702491585, + "loss": 1.3975, + "step": 568 + }, + { + "epoch": 0.0073938954881319205, + "grad_norm": 0.3654199540615082, + "learning_rate": 0.00019856249756300447, + "loss": 1.5838, + "step": 569 + }, + { + "epoch": 0.007406890032047794, + "grad_norm": 0.4372889995574951, + "learning_rate": 0.00019855989810109307, + "loss": 1.6906, + "step": 570 + }, + { + "epoch": 0.007419884575963667, + "grad_norm": 0.34681013226509094, + "learning_rate": 0.0001985572986391817, + "loss": 1.4219, + "step": 571 + }, + { + "epoch": 0.00743287911987954, + "grad_norm": 0.28042349219322205, + "learning_rate": 0.00019855469917727032, + "loss": 1.3512, + "step": 572 + }, + { + "epoch": 0.007445873663795414, + "grad_norm": 0.37795329093933105, + "learning_rate": 0.00019855209971535894, + "loss": 1.4061, + "step": 573 + }, + { + "epoch": 0.007458868207711287, + "grad_norm": 0.3672502934932709, + "learning_rate": 0.00019854950025344754, + "loss": 1.5503, + "step": 574 + }, + { + "epoch": 0.007471862751627161, + "grad_norm": 0.34373030066490173, + "learning_rate": 0.00019854690079153616, + "loss": 1.6254, + "step": 575 + }, + { + "epoch": 0.007484857295543034, + "grad_norm": 0.40038031339645386, + "learning_rate": 0.0001985443013296248, + "loss": 1.5282, + "step": 576 + }, + { + "epoch": 0.007497851839458908, + "grad_norm": 0.3042319118976593, + "learning_rate": 0.00019854170186771338, + "loss": 1.4934, + "step": 577 + }, + { + "epoch": 0.0075108463833747805, + "grad_norm": 0.34824636578559875, + "learning_rate": 0.000198539102405802, + "loss": 1.6298, + "step": 578 + }, + { + "epoch": 0.0075238409272906535, + "grad_norm": 0.21115775406360626, + "learning_rate": 0.00019853650294389063, + "loss": 1.1295, + "step": 579 + }, + { + "epoch": 0.007536835471206527, + "grad_norm": 0.4801003932952881, + "learning_rate": 0.00019853390348197923, + "loss": 1.4094, + "step": 580 + }, + { + "epoch": 0.0075498300151224, + "grad_norm": 0.3953585922718048, + "learning_rate": 0.00019853130402006786, + "loss": 1.532, + "step": 581 + }, + { + "epoch": 0.007562824559038274, + "grad_norm": 0.4331101179122925, + "learning_rate": 0.00019852870455815645, + "loss": 1.5266, + "step": 582 + }, + { + "epoch": 0.007575819102954147, + "grad_norm": 0.3411102890968323, + "learning_rate": 0.0001985261050962451, + "loss": 1.5216, + "step": 583 + }, + { + "epoch": 0.00758881364687002, + "grad_norm": 0.3673463761806488, + "learning_rate": 0.0001985235056343337, + "loss": 1.5071, + "step": 584 + }, + { + "epoch": 0.007601808190785894, + "grad_norm": 0.34652939438819885, + "learning_rate": 0.00019852090617242233, + "loss": 1.2927, + "step": 585 + }, + { + "epoch": 0.007614802734701767, + "grad_norm": 0.3582465946674347, + "learning_rate": 0.00019851830671051092, + "loss": 1.4479, + "step": 586 + }, + { + "epoch": 0.007627797278617641, + "grad_norm": 0.33398643136024475, + "learning_rate": 0.00019851570724859955, + "loss": 1.6125, + "step": 587 + }, + { + "epoch": 0.007640791822533514, + "grad_norm": 0.4001072645187378, + "learning_rate": 0.00019851310778668817, + "loss": 1.4119, + "step": 588 + }, + { + "epoch": 0.007653786366449387, + "grad_norm": 0.3611069619655609, + "learning_rate": 0.00019851050832477677, + "loss": 1.5331, + "step": 589 + }, + { + "epoch": 0.00766678091036526, + "grad_norm": 0.36661073565483093, + "learning_rate": 0.0001985079088628654, + "loss": 1.5552, + "step": 590 + }, + { + "epoch": 0.007679775454281133, + "grad_norm": 0.40941113233566284, + "learning_rate": 0.00019850530940095402, + "loss": 1.4044, + "step": 591 + }, + { + "epoch": 0.007692769998197007, + "grad_norm": 0.3474884629249573, + "learning_rate": 0.00019850270993904264, + "loss": 1.3388, + "step": 592 + }, + { + "epoch": 0.00770576454211288, + "grad_norm": 0.3682302236557007, + "learning_rate": 0.00019850011047713124, + "loss": 1.4577, + "step": 593 + }, + { + "epoch": 0.007718759086028754, + "grad_norm": 0.4242841303348541, + "learning_rate": 0.00019849751101521984, + "loss": 1.824, + "step": 594 + }, + { + "epoch": 0.007731753629944627, + "grad_norm": 0.3588680624961853, + "learning_rate": 0.0001984949115533085, + "loss": 1.4645, + "step": 595 + }, + { + "epoch": 0.007744748173860501, + "grad_norm": 0.41413599252700806, + "learning_rate": 0.0001984923120913971, + "loss": 1.4754, + "step": 596 + }, + { + "epoch": 0.007757742717776374, + "grad_norm": 0.44178664684295654, + "learning_rate": 0.0001984897126294857, + "loss": 1.6119, + "step": 597 + }, + { + "epoch": 0.007770737261692247, + "grad_norm": 0.34459012746810913, + "learning_rate": 0.0001984871131675743, + "loss": 1.4424, + "step": 598 + }, + { + "epoch": 0.0077837318056081205, + "grad_norm": 0.3490884304046631, + "learning_rate": 0.00019848451370566293, + "loss": 1.531, + "step": 599 + }, + { + "epoch": 0.007796726349523993, + "grad_norm": 0.3423737585544586, + "learning_rate": 0.00019848191424375156, + "loss": 1.4427, + "step": 600 + }, + { + "epoch": 0.007809720893439867, + "grad_norm": 0.4252668619155884, + "learning_rate": 0.00019847931478184016, + "loss": 1.4669, + "step": 601 + }, + { + "epoch": 0.007822715437355741, + "grad_norm": 0.28277668356895447, + "learning_rate": 0.00019847671531992878, + "loss": 1.4039, + "step": 602 + }, + { + "epoch": 0.007835709981271614, + "grad_norm": 0.3732830584049225, + "learning_rate": 0.0001984741158580174, + "loss": 1.4148, + "step": 603 + }, + { + "epoch": 0.007848704525187487, + "grad_norm": 0.3279058337211609, + "learning_rate": 0.00019847151639610603, + "loss": 1.3893, + "step": 604 + }, + { + "epoch": 0.00786169906910336, + "grad_norm": 0.25931790471076965, + "learning_rate": 0.00019846891693419463, + "loss": 1.3712, + "step": 605 + }, + { + "epoch": 0.007874693613019233, + "grad_norm": 0.34375739097595215, + "learning_rate": 0.00019846631747228322, + "loss": 1.4624, + "step": 606 + }, + { + "epoch": 0.007887688156935108, + "grad_norm": 0.3998558819293976, + "learning_rate": 0.00019846371801037188, + "loss": 1.518, + "step": 607 + }, + { + "epoch": 0.00790068270085098, + "grad_norm": 0.31153321266174316, + "learning_rate": 0.00019846111854846047, + "loss": 1.4971, + "step": 608 + }, + { + "epoch": 0.007913677244766854, + "grad_norm": 0.4275885224342346, + "learning_rate": 0.0001984585190865491, + "loss": 1.6311, + "step": 609 + }, + { + "epoch": 0.007926671788682726, + "grad_norm": 0.3340792655944824, + "learning_rate": 0.00019845591962463772, + "loss": 1.6886, + "step": 610 + }, + { + "epoch": 0.0079396663325986, + "grad_norm": 0.3447030782699585, + "learning_rate": 0.00019845332016272632, + "loss": 1.4708, + "step": 611 + }, + { + "epoch": 0.007952660876514474, + "grad_norm": 0.33856722712516785, + "learning_rate": 0.00019845072070081494, + "loss": 1.5812, + "step": 612 + }, + { + "epoch": 0.007965655420430347, + "grad_norm": 0.4070112407207489, + "learning_rate": 0.00019844812123890354, + "loss": 1.522, + "step": 613 + }, + { + "epoch": 0.00797864996434622, + "grad_norm": 0.24768106639385223, + "learning_rate": 0.0001984455217769922, + "loss": 1.1529, + "step": 614 + }, + { + "epoch": 0.007991644508262093, + "grad_norm": 0.24420788884162903, + "learning_rate": 0.0001984429223150808, + "loss": 1.2745, + "step": 615 + }, + { + "epoch": 0.008004639052177966, + "grad_norm": 0.407650887966156, + "learning_rate": 0.00019844032285316941, + "loss": 1.4861, + "step": 616 + }, + { + "epoch": 0.00801763359609384, + "grad_norm": 0.31649330258369446, + "learning_rate": 0.000198437723391258, + "loss": 1.3632, + "step": 617 + }, + { + "epoch": 0.008030628140009714, + "grad_norm": 0.41149628162384033, + "learning_rate": 0.00019843512392934664, + "loss": 1.7538, + "step": 618 + }, + { + "epoch": 0.008043622683925587, + "grad_norm": 0.31875699758529663, + "learning_rate": 0.00019843252446743526, + "loss": 1.66, + "step": 619 + }, + { + "epoch": 0.00805661722784146, + "grad_norm": 0.3146108090877533, + "learning_rate": 0.00019842992500552386, + "loss": 1.6156, + "step": 620 + }, + { + "epoch": 0.008069611771757334, + "grad_norm": 0.3958038091659546, + "learning_rate": 0.00019842732554361248, + "loss": 1.4977, + "step": 621 + }, + { + "epoch": 0.008082606315673207, + "grad_norm": 0.419617235660553, + "learning_rate": 0.0001984247260817011, + "loss": 1.5449, + "step": 622 + }, + { + "epoch": 0.00809560085958908, + "grad_norm": 0.31068864464759827, + "learning_rate": 0.0001984221266197897, + "loss": 1.406, + "step": 623 + }, + { + "epoch": 0.008108595403504953, + "grad_norm": 0.3374605178833008, + "learning_rate": 0.00019841952715787833, + "loss": 1.5347, + "step": 624 + }, + { + "epoch": 0.008121589947420826, + "grad_norm": 0.4293529689311981, + "learning_rate": 0.00019841692769596693, + "loss": 1.5466, + "step": 625 + }, + { + "epoch": 0.0081345844913367, + "grad_norm": 0.3483525514602661, + "learning_rate": 0.00019841432823405558, + "loss": 1.3158, + "step": 626 + }, + { + "epoch": 0.008147579035252574, + "grad_norm": 0.3526119589805603, + "learning_rate": 0.00019841172877214418, + "loss": 1.5133, + "step": 627 + }, + { + "epoch": 0.008160573579168447, + "grad_norm": 0.40216967463493347, + "learning_rate": 0.0001984091293102328, + "loss": 1.4854, + "step": 628 + }, + { + "epoch": 0.00817356812308432, + "grad_norm": 0.3084598183631897, + "learning_rate": 0.0001984065298483214, + "loss": 1.2879, + "step": 629 + }, + { + "epoch": 0.008186562667000193, + "grad_norm": 0.37966227531433105, + "learning_rate": 0.00019840393038641002, + "loss": 1.6095, + "step": 630 + }, + { + "epoch": 0.008199557210916067, + "grad_norm": 0.43067601323127747, + "learning_rate": 0.00019840133092449865, + "loss": 1.356, + "step": 631 + }, + { + "epoch": 0.00821255175483194, + "grad_norm": 0.4464859962463379, + "learning_rate": 0.00019839873146258724, + "loss": 1.5336, + "step": 632 + }, + { + "epoch": 0.008225546298747813, + "grad_norm": 0.35634273290634155, + "learning_rate": 0.00019839613200067587, + "loss": 1.4559, + "step": 633 + }, + { + "epoch": 0.008238540842663686, + "grad_norm": 0.33753782510757446, + "learning_rate": 0.0001983935325387645, + "loss": 1.4193, + "step": 634 + }, + { + "epoch": 0.008251535386579559, + "grad_norm": 0.3455103933811188, + "learning_rate": 0.0001983909330768531, + "loss": 1.4038, + "step": 635 + }, + { + "epoch": 0.008264529930495434, + "grad_norm": 0.35408419370651245, + "learning_rate": 0.00019838833361494171, + "loss": 1.4354, + "step": 636 + }, + { + "epoch": 0.008277524474411307, + "grad_norm": 0.421975314617157, + "learning_rate": 0.0001983857341530303, + "loss": 1.1963, + "step": 637 + }, + { + "epoch": 0.00829051901832718, + "grad_norm": 0.42913416028022766, + "learning_rate": 0.00019838313469111896, + "loss": 1.3913, + "step": 638 + }, + { + "epoch": 0.008303513562243053, + "grad_norm": 0.3791632354259491, + "learning_rate": 0.00019838053522920756, + "loss": 1.4676, + "step": 639 + }, + { + "epoch": 0.008316508106158927, + "grad_norm": 0.31101372838020325, + "learning_rate": 0.00019837793576729619, + "loss": 1.5258, + "step": 640 + }, + { + "epoch": 0.0083295026500748, + "grad_norm": 0.40362784266471863, + "learning_rate": 0.00019837533630538478, + "loss": 1.7226, + "step": 641 + }, + { + "epoch": 0.008342497193990673, + "grad_norm": 0.2881763279438019, + "learning_rate": 0.0001983727368434734, + "loss": 1.3932, + "step": 642 + }, + { + "epoch": 0.008355491737906546, + "grad_norm": 0.39208993315696716, + "learning_rate": 0.00019837013738156203, + "loss": 1.4546, + "step": 643 + }, + { + "epoch": 0.00836848628182242, + "grad_norm": 0.38456860184669495, + "learning_rate": 0.00019836753791965063, + "loss": 1.5148, + "step": 644 + }, + { + "epoch": 0.008381480825738294, + "grad_norm": 0.23708252608776093, + "learning_rate": 0.00019836493845773925, + "loss": 1.0973, + "step": 645 + }, + { + "epoch": 0.008394475369654167, + "grad_norm": 0.4995286762714386, + "learning_rate": 0.00019836233899582788, + "loss": 1.4641, + "step": 646 + }, + { + "epoch": 0.00840746991357004, + "grad_norm": 0.3843645751476288, + "learning_rate": 0.00019835973953391648, + "loss": 1.4017, + "step": 647 + }, + { + "epoch": 0.008420464457485913, + "grad_norm": 0.3064028024673462, + "learning_rate": 0.0001983571400720051, + "loss": 1.5515, + "step": 648 + }, + { + "epoch": 0.008433459001401786, + "grad_norm": 0.38051947951316833, + "learning_rate": 0.00019835454061009372, + "loss": 1.5171, + "step": 649 + }, + { + "epoch": 0.00844645354531766, + "grad_norm": 0.36851462721824646, + "learning_rate": 0.00019835194114818235, + "loss": 1.6806, + "step": 650 + }, + { + "epoch": 0.008459448089233533, + "grad_norm": 0.4453751742839813, + "learning_rate": 0.00019834934168627095, + "loss": 1.5515, + "step": 651 + }, + { + "epoch": 0.008472442633149406, + "grad_norm": 0.415773868560791, + "learning_rate": 0.00019834674222435957, + "loss": 1.6913, + "step": 652 + }, + { + "epoch": 0.00848543717706528, + "grad_norm": 0.4147622585296631, + "learning_rate": 0.0001983441427624482, + "loss": 1.5182, + "step": 653 + }, + { + "epoch": 0.008498431720981152, + "grad_norm": 0.3704775869846344, + "learning_rate": 0.0001983415433005368, + "loss": 1.6625, + "step": 654 + }, + { + "epoch": 0.008511426264897027, + "grad_norm": 0.33425942063331604, + "learning_rate": 0.00019833894383862542, + "loss": 1.401, + "step": 655 + }, + { + "epoch": 0.0085244208088129, + "grad_norm": 0.4238104820251465, + "learning_rate": 0.00019833634437671401, + "loss": 1.4789, + "step": 656 + }, + { + "epoch": 0.008537415352728773, + "grad_norm": 0.37472251057624817, + "learning_rate": 0.00019833374491480267, + "loss": 1.5653, + "step": 657 + }, + { + "epoch": 0.008550409896644646, + "grad_norm": 0.33363428711891174, + "learning_rate": 0.00019833114545289126, + "loss": 1.3726, + "step": 658 + }, + { + "epoch": 0.00856340444056052, + "grad_norm": 0.3292219042778015, + "learning_rate": 0.0001983285459909799, + "loss": 1.5948, + "step": 659 + }, + { + "epoch": 0.008576398984476393, + "grad_norm": 0.41897693276405334, + "learning_rate": 0.00019832594652906849, + "loss": 1.534, + "step": 660 + }, + { + "epoch": 0.008589393528392266, + "grad_norm": 0.38180336356163025, + "learning_rate": 0.0001983233470671571, + "loss": 1.4522, + "step": 661 + }, + { + "epoch": 0.00860238807230814, + "grad_norm": 0.32229453325271606, + "learning_rate": 0.00019832074760524573, + "loss": 1.5321, + "step": 662 + }, + { + "epoch": 0.008615382616224012, + "grad_norm": 0.31744512915611267, + "learning_rate": 0.00019831814814333433, + "loss": 1.2042, + "step": 663 + }, + { + "epoch": 0.008628377160139887, + "grad_norm": 0.4684705138206482, + "learning_rate": 0.00019831554868142296, + "loss": 1.8078, + "step": 664 + }, + { + "epoch": 0.00864137170405576, + "grad_norm": 0.34191256761550903, + "learning_rate": 0.00019831294921951158, + "loss": 1.445, + "step": 665 + }, + { + "epoch": 0.008654366247971633, + "grad_norm": 0.33159396052360535, + "learning_rate": 0.00019831034975760018, + "loss": 1.4312, + "step": 666 + }, + { + "epoch": 0.008667360791887506, + "grad_norm": 0.3650842308998108, + "learning_rate": 0.0001983077502956888, + "loss": 1.6764, + "step": 667 + }, + { + "epoch": 0.008680355335803379, + "grad_norm": 0.40540340542793274, + "learning_rate": 0.0001983051508337774, + "loss": 1.4032, + "step": 668 + }, + { + "epoch": 0.008693349879719254, + "grad_norm": 0.35292646288871765, + "learning_rate": 0.00019830255137186605, + "loss": 1.3865, + "step": 669 + }, + { + "epoch": 0.008706344423635127, + "grad_norm": 0.42002010345458984, + "learning_rate": 0.00019829995190995465, + "loss": 1.4331, + "step": 670 + }, + { + "epoch": 0.008719338967551, + "grad_norm": 0.3328249156475067, + "learning_rate": 0.00019829735244804327, + "loss": 1.565, + "step": 671 + }, + { + "epoch": 0.008732333511466872, + "grad_norm": 0.3382279872894287, + "learning_rate": 0.00019829475298613187, + "loss": 1.3253, + "step": 672 + }, + { + "epoch": 0.008745328055382745, + "grad_norm": 0.402645081281662, + "learning_rate": 0.0001982921535242205, + "loss": 1.4082, + "step": 673 + }, + { + "epoch": 0.00875832259929862, + "grad_norm": 0.3478292226791382, + "learning_rate": 0.00019828955406230912, + "loss": 1.6943, + "step": 674 + }, + { + "epoch": 0.008771317143214493, + "grad_norm": 0.31939834356307983, + "learning_rate": 0.00019828695460039772, + "loss": 1.5331, + "step": 675 + }, + { + "epoch": 0.008784311687130366, + "grad_norm": 0.3915035128593445, + "learning_rate": 0.00019828435513848634, + "loss": 1.5543, + "step": 676 + }, + { + "epoch": 0.008797306231046239, + "grad_norm": 0.39418208599090576, + "learning_rate": 0.00019828175567657497, + "loss": 1.4944, + "step": 677 + }, + { + "epoch": 0.008810300774962112, + "grad_norm": 0.337128221988678, + "learning_rate": 0.00019827915621466356, + "loss": 1.3955, + "step": 678 + }, + { + "epoch": 0.008823295318877987, + "grad_norm": 0.2792261242866516, + "learning_rate": 0.0001982765567527522, + "loss": 1.3216, + "step": 679 + }, + { + "epoch": 0.00883628986279386, + "grad_norm": 0.310337632894516, + "learning_rate": 0.00019827395729084079, + "loss": 1.7405, + "step": 680 + }, + { + "epoch": 0.008849284406709733, + "grad_norm": 0.3071850538253784, + "learning_rate": 0.00019827135782892944, + "loss": 1.4601, + "step": 681 + }, + { + "epoch": 0.008862278950625605, + "grad_norm": 0.35270869731903076, + "learning_rate": 0.00019826875836701803, + "loss": 1.3447, + "step": 682 + }, + { + "epoch": 0.00887527349454148, + "grad_norm": 0.43033266067504883, + "learning_rate": 0.00019826615890510666, + "loss": 1.5967, + "step": 683 + }, + { + "epoch": 0.008888268038457353, + "grad_norm": 0.35486581921577454, + "learning_rate": 0.00019826355944319526, + "loss": 1.4615, + "step": 684 + }, + { + "epoch": 0.008901262582373226, + "grad_norm": 0.36915886402130127, + "learning_rate": 0.00019826095998128388, + "loss": 1.5589, + "step": 685 + }, + { + "epoch": 0.008914257126289099, + "grad_norm": 0.2822892367839813, + "learning_rate": 0.0001982583605193725, + "loss": 1.383, + "step": 686 + }, + { + "epoch": 0.008927251670204972, + "grad_norm": 0.4104083180427551, + "learning_rate": 0.0001982557610574611, + "loss": 1.5449, + "step": 687 + }, + { + "epoch": 0.008940246214120847, + "grad_norm": 0.3812151253223419, + "learning_rate": 0.00019825316159554975, + "loss": 1.5726, + "step": 688 + }, + { + "epoch": 0.00895324075803672, + "grad_norm": 0.33798748254776, + "learning_rate": 0.00019825056213363835, + "loss": 1.5412, + "step": 689 + }, + { + "epoch": 0.008966235301952593, + "grad_norm": 0.38268429040908813, + "learning_rate": 0.00019824796267172695, + "loss": 1.5783, + "step": 690 + }, + { + "epoch": 0.008979229845868466, + "grad_norm": 0.33230215311050415, + "learning_rate": 0.00019824536320981557, + "loss": 1.4306, + "step": 691 + }, + { + "epoch": 0.008992224389784339, + "grad_norm": 0.33637288212776184, + "learning_rate": 0.0001982427637479042, + "loss": 1.3414, + "step": 692 + }, + { + "epoch": 0.009005218933700213, + "grad_norm": 0.444703072309494, + "learning_rate": 0.00019824016428599282, + "loss": 1.4449, + "step": 693 + }, + { + "epoch": 0.009018213477616086, + "grad_norm": 0.30531391501426697, + "learning_rate": 0.00019823756482408142, + "loss": 1.4927, + "step": 694 + }, + { + "epoch": 0.00903120802153196, + "grad_norm": 0.3695407211780548, + "learning_rate": 0.00019823496536217004, + "loss": 1.5052, + "step": 695 + }, + { + "epoch": 0.009044202565447832, + "grad_norm": 0.3395458459854126, + "learning_rate": 0.00019823236590025867, + "loss": 1.3262, + "step": 696 + }, + { + "epoch": 0.009057197109363705, + "grad_norm": 0.33390477299690247, + "learning_rate": 0.00019822976643834727, + "loss": 1.5135, + "step": 697 + }, + { + "epoch": 0.00907019165327958, + "grad_norm": 0.33290162682533264, + "learning_rate": 0.0001982271669764359, + "loss": 1.2037, + "step": 698 + }, + { + "epoch": 0.009083186197195453, + "grad_norm": 0.36634618043899536, + "learning_rate": 0.0001982245675145245, + "loss": 1.5616, + "step": 699 + }, + { + "epoch": 0.009096180741111326, + "grad_norm": 0.3186809718608856, + "learning_rate": 0.00019822196805261314, + "loss": 1.4216, + "step": 700 + }, + { + "epoch": 0.009109175285027199, + "grad_norm": 0.3685540556907654, + "learning_rate": 0.00019821936859070174, + "loss": 1.5776, + "step": 701 + }, + { + "epoch": 0.009122169828943073, + "grad_norm": 0.35809457302093506, + "learning_rate": 0.00019821676912879033, + "loss": 1.5361, + "step": 702 + }, + { + "epoch": 0.009135164372858946, + "grad_norm": 0.36902254819869995, + "learning_rate": 0.00019821416966687896, + "loss": 1.563, + "step": 703 + }, + { + "epoch": 0.00914815891677482, + "grad_norm": 0.2929859757423401, + "learning_rate": 0.00019821157020496758, + "loss": 1.4686, + "step": 704 + }, + { + "epoch": 0.009161153460690692, + "grad_norm": 0.34742724895477295, + "learning_rate": 0.0001982089707430562, + "loss": 1.6509, + "step": 705 + }, + { + "epoch": 0.009174148004606565, + "grad_norm": 0.3738511800765991, + "learning_rate": 0.0001982063712811448, + "loss": 1.4771, + "step": 706 + }, + { + "epoch": 0.00918714254852244, + "grad_norm": 0.29607006907463074, + "learning_rate": 0.00019820377181923343, + "loss": 1.324, + "step": 707 + }, + { + "epoch": 0.009200137092438313, + "grad_norm": 0.3647070825099945, + "learning_rate": 0.00019820117235732205, + "loss": 1.549, + "step": 708 + }, + { + "epoch": 0.009213131636354186, + "grad_norm": 0.22530755400657654, + "learning_rate": 0.00019819857289541065, + "loss": 1.3984, + "step": 709 + }, + { + "epoch": 0.009226126180270059, + "grad_norm": 0.37032994627952576, + "learning_rate": 0.00019819597343349928, + "loss": 1.2236, + "step": 710 + }, + { + "epoch": 0.009239120724185932, + "grad_norm": 0.32561302185058594, + "learning_rate": 0.00019819337397158787, + "loss": 1.4317, + "step": 711 + }, + { + "epoch": 0.009252115268101806, + "grad_norm": 0.34994667768478394, + "learning_rate": 0.00019819077450967652, + "loss": 1.3987, + "step": 712 + }, + { + "epoch": 0.00926510981201768, + "grad_norm": 0.367980033159256, + "learning_rate": 0.00019818817504776512, + "loss": 1.5015, + "step": 713 + }, + { + "epoch": 0.009278104355933552, + "grad_norm": 0.4678630530834198, + "learning_rate": 0.00019818557558585375, + "loss": 1.5274, + "step": 714 + }, + { + "epoch": 0.009291098899849425, + "grad_norm": 0.36450856924057007, + "learning_rate": 0.00019818297612394234, + "loss": 1.6098, + "step": 715 + }, + { + "epoch": 0.009304093443765298, + "grad_norm": 0.36564064025878906, + "learning_rate": 0.00019818037666203097, + "loss": 1.3938, + "step": 716 + }, + { + "epoch": 0.009317087987681173, + "grad_norm": 0.3113052248954773, + "learning_rate": 0.0001981777772001196, + "loss": 1.5942, + "step": 717 + }, + { + "epoch": 0.009330082531597046, + "grad_norm": 0.28900638222694397, + "learning_rate": 0.0001981751777382082, + "loss": 1.487, + "step": 718 + }, + { + "epoch": 0.009343077075512919, + "grad_norm": 0.3917618691921234, + "learning_rate": 0.00019817257827629681, + "loss": 1.5005, + "step": 719 + }, + { + "epoch": 0.009356071619428792, + "grad_norm": 0.2771584093570709, + "learning_rate": 0.00019816997881438544, + "loss": 1.3067, + "step": 720 + }, + { + "epoch": 0.009369066163344666, + "grad_norm": 0.45164725184440613, + "learning_rate": 0.00019816737935247404, + "loss": 1.5159, + "step": 721 + }, + { + "epoch": 0.00938206070726054, + "grad_norm": 0.3692169785499573, + "learning_rate": 0.00019816477989056266, + "loss": 1.2412, + "step": 722 + }, + { + "epoch": 0.009395055251176412, + "grad_norm": 0.49505218863487244, + "learning_rate": 0.00019816218042865129, + "loss": 1.7205, + "step": 723 + }, + { + "epoch": 0.009408049795092285, + "grad_norm": 0.34234389662742615, + "learning_rate": 0.0001981595809667399, + "loss": 1.515, + "step": 724 + }, + { + "epoch": 0.009421044339008158, + "grad_norm": 0.38328883051872253, + "learning_rate": 0.0001981569815048285, + "loss": 1.5291, + "step": 725 + }, + { + "epoch": 0.009434038882924033, + "grad_norm": 0.29110851883888245, + "learning_rate": 0.00019815438204291713, + "loss": 1.4392, + "step": 726 + }, + { + "epoch": 0.009447033426839906, + "grad_norm": 0.3245619833469391, + "learning_rate": 0.00019815178258100576, + "loss": 1.48, + "step": 727 + }, + { + "epoch": 0.009460027970755779, + "grad_norm": 0.3311016261577606, + "learning_rate": 0.00019814918311909435, + "loss": 1.4422, + "step": 728 + }, + { + "epoch": 0.009473022514671652, + "grad_norm": 0.3260456919670105, + "learning_rate": 0.00019814658365718298, + "loss": 1.3742, + "step": 729 + }, + { + "epoch": 0.009486017058587525, + "grad_norm": 0.3841300904750824, + "learning_rate": 0.00019814398419527158, + "loss": 1.5685, + "step": 730 + }, + { + "epoch": 0.0094990116025034, + "grad_norm": 0.36956533789634705, + "learning_rate": 0.0001981413847333602, + "loss": 1.4435, + "step": 731 + }, + { + "epoch": 0.009512006146419272, + "grad_norm": 0.416090726852417, + "learning_rate": 0.00019813878527144882, + "loss": 1.5898, + "step": 732 + }, + { + "epoch": 0.009525000690335145, + "grad_norm": 0.3416915535926819, + "learning_rate": 0.00019813618580953742, + "loss": 1.3792, + "step": 733 + }, + { + "epoch": 0.009537995234251018, + "grad_norm": 0.35502490401268005, + "learning_rate": 0.00019813358634762605, + "loss": 1.5907, + "step": 734 + }, + { + "epoch": 0.009550989778166891, + "grad_norm": 0.39543843269348145, + "learning_rate": 0.00019813098688571467, + "loss": 1.5287, + "step": 735 + }, + { + "epoch": 0.009563984322082766, + "grad_norm": 0.3150179982185364, + "learning_rate": 0.0001981283874238033, + "loss": 1.4447, + "step": 736 + }, + { + "epoch": 0.009576978865998639, + "grad_norm": 0.37571099400520325, + "learning_rate": 0.0001981257879618919, + "loss": 1.3266, + "step": 737 + }, + { + "epoch": 0.009589973409914512, + "grad_norm": 0.36124467849731445, + "learning_rate": 0.00019812318849998052, + "loss": 1.4497, + "step": 738 + }, + { + "epoch": 0.009602967953830385, + "grad_norm": 0.3628777265548706, + "learning_rate": 0.00019812058903806914, + "loss": 1.4794, + "step": 739 + }, + { + "epoch": 0.00961596249774626, + "grad_norm": 0.3742762506008148, + "learning_rate": 0.00019811798957615774, + "loss": 1.7162, + "step": 740 + }, + { + "epoch": 0.009628957041662133, + "grad_norm": 0.5336025953292847, + "learning_rate": 0.00019811539011424636, + "loss": 1.5559, + "step": 741 + }, + { + "epoch": 0.009641951585578006, + "grad_norm": 0.4883899688720703, + "learning_rate": 0.00019811279065233496, + "loss": 1.5139, + "step": 742 + }, + { + "epoch": 0.009654946129493878, + "grad_norm": 0.45548561215400696, + "learning_rate": 0.0001981101911904236, + "loss": 1.4212, + "step": 743 + }, + { + "epoch": 0.009667940673409751, + "grad_norm": 0.31765982508659363, + "learning_rate": 0.0001981075917285122, + "loss": 1.4476, + "step": 744 + }, + { + "epoch": 0.009680935217325626, + "grad_norm": 0.3143158555030823, + "learning_rate": 0.0001981049922666008, + "loss": 1.7234, + "step": 745 + }, + { + "epoch": 0.009693929761241499, + "grad_norm": 0.3834339678287506, + "learning_rate": 0.00019810239280468943, + "loss": 1.7519, + "step": 746 + }, + { + "epoch": 0.009706924305157372, + "grad_norm": 0.33390897512435913, + "learning_rate": 0.00019809979334277806, + "loss": 1.5717, + "step": 747 + }, + { + "epoch": 0.009719918849073245, + "grad_norm": 0.40962928533554077, + "learning_rate": 0.00019809719388086668, + "loss": 1.6574, + "step": 748 + }, + { + "epoch": 0.009732913392989118, + "grad_norm": 0.39467093348503113, + "learning_rate": 0.00019809459441895528, + "loss": 1.6905, + "step": 749 + }, + { + "epoch": 0.009745907936904993, + "grad_norm": 0.3943708539009094, + "learning_rate": 0.0001980919949570439, + "loss": 1.5263, + "step": 750 + }, + { + "epoch": 0.009758902480820866, + "grad_norm": 0.2936602830886841, + "learning_rate": 0.00019808939549513253, + "loss": 1.3639, + "step": 751 + }, + { + "epoch": 0.009771897024736739, + "grad_norm": 0.36436378955841064, + "learning_rate": 0.00019808679603322112, + "loss": 1.522, + "step": 752 + }, + { + "epoch": 0.009784891568652612, + "grad_norm": 0.39485645294189453, + "learning_rate": 0.00019808419657130975, + "loss": 1.4362, + "step": 753 + }, + { + "epoch": 0.009797886112568484, + "grad_norm": 0.3032613694667816, + "learning_rate": 0.00019808159710939835, + "loss": 1.4838, + "step": 754 + }, + { + "epoch": 0.00981088065648436, + "grad_norm": 0.3500721752643585, + "learning_rate": 0.000198078997647487, + "loss": 1.4274, + "step": 755 + }, + { + "epoch": 0.009823875200400232, + "grad_norm": 0.3658673167228699, + "learning_rate": 0.0001980763981855756, + "loss": 1.422, + "step": 756 + }, + { + "epoch": 0.009836869744316105, + "grad_norm": 0.3680466413497925, + "learning_rate": 0.0001980737987236642, + "loss": 1.6103, + "step": 757 + }, + { + "epoch": 0.009849864288231978, + "grad_norm": 0.4412722885608673, + "learning_rate": 0.00019807119926175282, + "loss": 1.4037, + "step": 758 + }, + { + "epoch": 0.009862858832147853, + "grad_norm": 0.3488806188106537, + "learning_rate": 0.00019806859979984144, + "loss": 1.6904, + "step": 759 + }, + { + "epoch": 0.009875853376063726, + "grad_norm": 0.38538268208503723, + "learning_rate": 0.00019806600033793007, + "loss": 1.6448, + "step": 760 + }, + { + "epoch": 0.009888847919979599, + "grad_norm": 0.3275357484817505, + "learning_rate": 0.00019806340087601866, + "loss": 1.3492, + "step": 761 + }, + { + "epoch": 0.009901842463895472, + "grad_norm": 0.45382624864578247, + "learning_rate": 0.0001980608014141073, + "loss": 1.2303, + "step": 762 + }, + { + "epoch": 0.009914837007811345, + "grad_norm": 0.32894131541252136, + "learning_rate": 0.0001980582019521959, + "loss": 1.6423, + "step": 763 + }, + { + "epoch": 0.00992783155172722, + "grad_norm": 0.3140595853328705, + "learning_rate": 0.0001980556024902845, + "loss": 1.2432, + "step": 764 + }, + { + "epoch": 0.009940826095643092, + "grad_norm": 0.2719400227069855, + "learning_rate": 0.00019805300302837313, + "loss": 1.6344, + "step": 765 + }, + { + "epoch": 0.009953820639558965, + "grad_norm": 0.29642561078071594, + "learning_rate": 0.00019805040356646176, + "loss": 1.4362, + "step": 766 + }, + { + "epoch": 0.009966815183474838, + "grad_norm": 0.38486072421073914, + "learning_rate": 0.00019804780410455038, + "loss": 1.6494, + "step": 767 + }, + { + "epoch": 0.009979809727390711, + "grad_norm": 0.3146076798439026, + "learning_rate": 0.00019804520464263898, + "loss": 1.5003, + "step": 768 + }, + { + "epoch": 0.009992804271306586, + "grad_norm": 0.2924637496471405, + "learning_rate": 0.00019804260518072758, + "loss": 1.4728, + "step": 769 + }, + { + "epoch": 0.010005798815222459, + "grad_norm": 0.2947547435760498, + "learning_rate": 0.00019804000571881623, + "loss": 1.4447, + "step": 770 + }, + { + "epoch": 0.010018793359138332, + "grad_norm": 0.3057027757167816, + "learning_rate": 0.00019803740625690483, + "loss": 1.2061, + "step": 771 + }, + { + "epoch": 0.010031787903054205, + "grad_norm": 0.3925977647304535, + "learning_rate": 0.00019803480679499345, + "loss": 1.5638, + "step": 772 + }, + { + "epoch": 0.010044782446970078, + "grad_norm": 0.3047654330730438, + "learning_rate": 0.00019803220733308205, + "loss": 1.6397, + "step": 773 + }, + { + "epoch": 0.010057776990885952, + "grad_norm": 0.2800816297531128, + "learning_rate": 0.00019802960787117067, + "loss": 1.4873, + "step": 774 + }, + { + "epoch": 0.010070771534801825, + "grad_norm": 0.6156725287437439, + "learning_rate": 0.0001980270084092593, + "loss": 1.6791, + "step": 775 + }, + { + "epoch": 0.010083766078717698, + "grad_norm": 0.3589865565299988, + "learning_rate": 0.0001980244089473479, + "loss": 1.512, + "step": 776 + }, + { + "epoch": 0.010096760622633571, + "grad_norm": 0.2741207778453827, + "learning_rate": 0.00019802180948543652, + "loss": 1.6018, + "step": 777 + }, + { + "epoch": 0.010109755166549446, + "grad_norm": 0.31274279952049255, + "learning_rate": 0.00019801921002352514, + "loss": 1.3728, + "step": 778 + }, + { + "epoch": 0.010122749710465319, + "grad_norm": 0.427493691444397, + "learning_rate": 0.00019801661056161377, + "loss": 1.4774, + "step": 779 + }, + { + "epoch": 0.010135744254381192, + "grad_norm": 0.33083292841911316, + "learning_rate": 0.00019801401109970237, + "loss": 1.3489, + "step": 780 + }, + { + "epoch": 0.010148738798297065, + "grad_norm": 0.33251968026161194, + "learning_rate": 0.000198011411637791, + "loss": 1.5482, + "step": 781 + }, + { + "epoch": 0.010161733342212938, + "grad_norm": 0.2710573971271515, + "learning_rate": 0.00019800881217587962, + "loss": 1.1779, + "step": 782 + }, + { + "epoch": 0.010174727886128812, + "grad_norm": 0.354383647441864, + "learning_rate": 0.0001980062127139682, + "loss": 1.4562, + "step": 783 + }, + { + "epoch": 0.010187722430044685, + "grad_norm": 0.41652238368988037, + "learning_rate": 0.00019800361325205684, + "loss": 1.4855, + "step": 784 + }, + { + "epoch": 0.010200716973960558, + "grad_norm": 0.3225327134132385, + "learning_rate": 0.00019800101379014543, + "loss": 1.6558, + "step": 785 + }, + { + "epoch": 0.010213711517876431, + "grad_norm": 0.38271769881248474, + "learning_rate": 0.00019799841432823406, + "loss": 1.4385, + "step": 786 + }, + { + "epoch": 0.010226706061792304, + "grad_norm": 0.3630881607532501, + "learning_rate": 0.00019799581486632268, + "loss": 1.2769, + "step": 787 + }, + { + "epoch": 0.010239700605708179, + "grad_norm": 0.33851003646850586, + "learning_rate": 0.00019799321540441128, + "loss": 1.5789, + "step": 788 + }, + { + "epoch": 0.010252695149624052, + "grad_norm": 0.4176260828971863, + "learning_rate": 0.0001979906159424999, + "loss": 1.4785, + "step": 789 + }, + { + "epoch": 0.010265689693539925, + "grad_norm": 0.3471718430519104, + "learning_rate": 0.00019798801648058853, + "loss": 1.4152, + "step": 790 + }, + { + "epoch": 0.010278684237455798, + "grad_norm": 0.3253538906574249, + "learning_rate": 0.00019798541701867715, + "loss": 1.4411, + "step": 791 + }, + { + "epoch": 0.01029167878137167, + "grad_norm": 0.37727871537208557, + "learning_rate": 0.00019798281755676575, + "loss": 1.5329, + "step": 792 + }, + { + "epoch": 0.010304673325287545, + "grad_norm": 0.38673219084739685, + "learning_rate": 0.00019798021809485438, + "loss": 1.4563, + "step": 793 + }, + { + "epoch": 0.010317667869203418, + "grad_norm": 0.37884414196014404, + "learning_rate": 0.000197977618632943, + "loss": 1.4507, + "step": 794 + }, + { + "epoch": 0.010330662413119291, + "grad_norm": 0.3687722980976105, + "learning_rate": 0.0001979750191710316, + "loss": 1.402, + "step": 795 + }, + { + "epoch": 0.010343656957035164, + "grad_norm": 0.39976727962493896, + "learning_rate": 0.00019797241970912022, + "loss": 1.5013, + "step": 796 + }, + { + "epoch": 0.010356651500951039, + "grad_norm": 0.31597524881362915, + "learning_rate": 0.00019796982024720885, + "loss": 1.5531, + "step": 797 + }, + { + "epoch": 0.010369646044866912, + "grad_norm": 0.4013782739639282, + "learning_rate": 0.00019796722078529747, + "loss": 1.3796, + "step": 798 + }, + { + "epoch": 0.010382640588782785, + "grad_norm": 0.3744184076786041, + "learning_rate": 0.00019796462132338607, + "loss": 1.3908, + "step": 799 + }, + { + "epoch": 0.010395635132698658, + "grad_norm": 0.2818164527416229, + "learning_rate": 0.00019796202186147467, + "loss": 1.5691, + "step": 800 + }, + { + "epoch": 0.010408629676614531, + "grad_norm": 0.31891414523124695, + "learning_rate": 0.00019795942239956332, + "loss": 1.3117, + "step": 801 + }, + { + "epoch": 0.010421624220530406, + "grad_norm": 0.44532331824302673, + "learning_rate": 0.00019795682293765192, + "loss": 1.7584, + "step": 802 + }, + { + "epoch": 0.010434618764446279, + "grad_norm": 0.2924066185951233, + "learning_rate": 0.00019795422347574054, + "loss": 1.3124, + "step": 803 + }, + { + "epoch": 0.010447613308362151, + "grad_norm": 0.3202171325683594, + "learning_rate": 0.00019795162401382914, + "loss": 1.2046, + "step": 804 + }, + { + "epoch": 0.010460607852278024, + "grad_norm": 0.37899860739707947, + "learning_rate": 0.00019794902455191776, + "loss": 1.5393, + "step": 805 + }, + { + "epoch": 0.010473602396193897, + "grad_norm": 0.4298096299171448, + "learning_rate": 0.00019794642509000639, + "loss": 1.4334, + "step": 806 + }, + { + "epoch": 0.010486596940109772, + "grad_norm": 0.2809349596500397, + "learning_rate": 0.00019794382562809498, + "loss": 1.4074, + "step": 807 + }, + { + "epoch": 0.010499591484025645, + "grad_norm": 0.3504308760166168, + "learning_rate": 0.0001979412261661836, + "loss": 1.4385, + "step": 808 + }, + { + "epoch": 0.010512586027941518, + "grad_norm": 0.4143553078174591, + "learning_rate": 0.00019793862670427223, + "loss": 1.5984, + "step": 809 + }, + { + "epoch": 0.010525580571857391, + "grad_norm": 0.2987794876098633, + "learning_rate": 0.00019793602724236086, + "loss": 1.247, + "step": 810 + }, + { + "epoch": 0.010538575115773264, + "grad_norm": 0.3241748511791229, + "learning_rate": 0.00019793342778044945, + "loss": 1.3828, + "step": 811 + }, + { + "epoch": 0.010551569659689139, + "grad_norm": 0.4106390178203583, + "learning_rate": 0.00019793082831853805, + "loss": 1.6831, + "step": 812 + }, + { + "epoch": 0.010564564203605012, + "grad_norm": 0.4032892882823944, + "learning_rate": 0.0001979282288566267, + "loss": 1.5073, + "step": 813 + }, + { + "epoch": 0.010577558747520885, + "grad_norm": 0.378022164106369, + "learning_rate": 0.0001979256293947153, + "loss": 1.6767, + "step": 814 + }, + { + "epoch": 0.010590553291436757, + "grad_norm": 0.3420693278312683, + "learning_rate": 0.00019792302993280393, + "loss": 1.5153, + "step": 815 + }, + { + "epoch": 0.010603547835352632, + "grad_norm": 0.313639760017395, + "learning_rate": 0.00019792043047089252, + "loss": 1.339, + "step": 816 + }, + { + "epoch": 0.010616542379268505, + "grad_norm": 0.39957714080810547, + "learning_rate": 0.00019791783100898115, + "loss": 1.4186, + "step": 817 + }, + { + "epoch": 0.010629536923184378, + "grad_norm": 0.283723920583725, + "learning_rate": 0.00019791523154706977, + "loss": 1.2269, + "step": 818 + }, + { + "epoch": 0.010642531467100251, + "grad_norm": 0.2821674048900604, + "learning_rate": 0.00019791263208515837, + "loss": 1.5801, + "step": 819 + }, + { + "epoch": 0.010655526011016124, + "grad_norm": 0.4604765772819519, + "learning_rate": 0.000197910032623247, + "loss": 1.4114, + "step": 820 + }, + { + "epoch": 0.010668520554931999, + "grad_norm": 0.41818079352378845, + "learning_rate": 0.00019790743316133562, + "loss": 1.4538, + "step": 821 + }, + { + "epoch": 0.010681515098847872, + "grad_norm": 0.4463641941547394, + "learning_rate": 0.00019790483369942424, + "loss": 1.5206, + "step": 822 + }, + { + "epoch": 0.010694509642763745, + "grad_norm": 0.2832520008087158, + "learning_rate": 0.00019790223423751284, + "loss": 1.3815, + "step": 823 + }, + { + "epoch": 0.010707504186679618, + "grad_norm": 0.38248422741889954, + "learning_rate": 0.00019789963477560144, + "loss": 1.5137, + "step": 824 + }, + { + "epoch": 0.01072049873059549, + "grad_norm": 0.3523542284965515, + "learning_rate": 0.0001978970353136901, + "loss": 1.4657, + "step": 825 + }, + { + "epoch": 0.010733493274511365, + "grad_norm": 0.39274516701698303, + "learning_rate": 0.00019789443585177869, + "loss": 1.557, + "step": 826 + }, + { + "epoch": 0.010746487818427238, + "grad_norm": 0.3270724415779114, + "learning_rate": 0.0001978918363898673, + "loss": 1.2439, + "step": 827 + }, + { + "epoch": 0.010759482362343111, + "grad_norm": 0.4127451181411743, + "learning_rate": 0.0001978892369279559, + "loss": 1.631, + "step": 828 + }, + { + "epoch": 0.010772476906258984, + "grad_norm": 0.5745660662651062, + "learning_rate": 0.00019788663746604453, + "loss": 1.4533, + "step": 829 + }, + { + "epoch": 0.010785471450174857, + "grad_norm": 0.3350748121738434, + "learning_rate": 0.00019788403800413316, + "loss": 1.3469, + "step": 830 + }, + { + "epoch": 0.010798465994090732, + "grad_norm": 0.3463400602340698, + "learning_rate": 0.00019788143854222175, + "loss": 1.4159, + "step": 831 + }, + { + "epoch": 0.010811460538006605, + "grad_norm": 0.36435380578041077, + "learning_rate": 0.00019787883908031038, + "loss": 1.4794, + "step": 832 + }, + { + "epoch": 0.010824455081922478, + "grad_norm": 0.3727715313434601, + "learning_rate": 0.000197876239618399, + "loss": 1.6761, + "step": 833 + }, + { + "epoch": 0.01083744962583835, + "grad_norm": 0.49526360630989075, + "learning_rate": 0.00019787364015648763, + "loss": 1.4028, + "step": 834 + }, + { + "epoch": 0.010850444169754224, + "grad_norm": 0.3774344325065613, + "learning_rate": 0.00019787104069457623, + "loss": 1.4989, + "step": 835 + }, + { + "epoch": 0.010863438713670098, + "grad_norm": 0.297404408454895, + "learning_rate": 0.00019786844123266485, + "loss": 1.4746, + "step": 836 + }, + { + "epoch": 0.010876433257585971, + "grad_norm": 0.3675737977027893, + "learning_rate": 0.00019786584177075347, + "loss": 1.6151, + "step": 837 + }, + { + "epoch": 0.010889427801501844, + "grad_norm": 0.363771915435791, + "learning_rate": 0.00019786324230884207, + "loss": 1.4896, + "step": 838 + }, + { + "epoch": 0.010902422345417717, + "grad_norm": 0.2401675581932068, + "learning_rate": 0.0001978606428469307, + "loss": 1.2489, + "step": 839 + }, + { + "epoch": 0.010915416889333592, + "grad_norm": 0.39043956995010376, + "learning_rate": 0.00019785804338501932, + "loss": 1.6779, + "step": 840 + }, + { + "epoch": 0.010928411433249465, + "grad_norm": 0.3350941240787506, + "learning_rate": 0.00019785544392310792, + "loss": 1.4197, + "step": 841 + }, + { + "epoch": 0.010941405977165338, + "grad_norm": 0.3687950670719147, + "learning_rate": 0.00019785284446119654, + "loss": 1.2805, + "step": 842 + }, + { + "epoch": 0.01095440052108121, + "grad_norm": 0.3335752487182617, + "learning_rate": 0.00019785024499928514, + "loss": 1.534, + "step": 843 + }, + { + "epoch": 0.010967395064997084, + "grad_norm": 0.37645015120506287, + "learning_rate": 0.0001978476455373738, + "loss": 1.2933, + "step": 844 + }, + { + "epoch": 0.010980389608912958, + "grad_norm": 0.2943829596042633, + "learning_rate": 0.0001978450460754624, + "loss": 1.4492, + "step": 845 + }, + { + "epoch": 0.010993384152828831, + "grad_norm": 0.3535577654838562, + "learning_rate": 0.000197842446613551, + "loss": 1.1818, + "step": 846 + }, + { + "epoch": 0.011006378696744704, + "grad_norm": 0.347501277923584, + "learning_rate": 0.0001978398471516396, + "loss": 1.3698, + "step": 847 + }, + { + "epoch": 0.011019373240660577, + "grad_norm": 0.41602951288223267, + "learning_rate": 0.00019783724768972823, + "loss": 1.5016, + "step": 848 + }, + { + "epoch": 0.01103236778457645, + "grad_norm": 0.33995816111564636, + "learning_rate": 0.00019783464822781686, + "loss": 1.3072, + "step": 849 + }, + { + "epoch": 0.011045362328492325, + "grad_norm": 0.4060112237930298, + "learning_rate": 0.00019783204876590546, + "loss": 1.5691, + "step": 850 + }, + { + "epoch": 0.011058356872408198, + "grad_norm": 0.4026666581630707, + "learning_rate": 0.00019782944930399408, + "loss": 1.4755, + "step": 851 + }, + { + "epoch": 0.01107135141632407, + "grad_norm": 0.3719538450241089, + "learning_rate": 0.0001978268498420827, + "loss": 1.4684, + "step": 852 + }, + { + "epoch": 0.011084345960239944, + "grad_norm": 0.3730430006980896, + "learning_rate": 0.0001978242503801713, + "loss": 1.5605, + "step": 853 + }, + { + "epoch": 0.011097340504155817, + "grad_norm": 0.3669048547744751, + "learning_rate": 0.00019782165091825993, + "loss": 1.5665, + "step": 854 + }, + { + "epoch": 0.011110335048071691, + "grad_norm": 0.41444581747055054, + "learning_rate": 0.00019781905145634853, + "loss": 1.5279, + "step": 855 + }, + { + "epoch": 0.011123329591987564, + "grad_norm": 0.31362953782081604, + "learning_rate": 0.00019781645199443718, + "loss": 1.5469, + "step": 856 + }, + { + "epoch": 0.011136324135903437, + "grad_norm": 0.2954976260662079, + "learning_rate": 0.00019781385253252577, + "loss": 1.4712, + "step": 857 + }, + { + "epoch": 0.01114931867981931, + "grad_norm": 0.36397314071655273, + "learning_rate": 0.0001978112530706144, + "loss": 1.583, + "step": 858 + }, + { + "epoch": 0.011162313223735185, + "grad_norm": 0.3063300549983978, + "learning_rate": 0.000197808653608703, + "loss": 1.2616, + "step": 859 + }, + { + "epoch": 0.011175307767651058, + "grad_norm": 0.39166536927223206, + "learning_rate": 0.00019780605414679162, + "loss": 1.5711, + "step": 860 + }, + { + "epoch": 0.011188302311566931, + "grad_norm": 0.3681032657623291, + "learning_rate": 0.00019780345468488024, + "loss": 1.482, + "step": 861 + }, + { + "epoch": 0.011201296855482804, + "grad_norm": 0.30603691935539246, + "learning_rate": 0.00019780085522296884, + "loss": 1.3632, + "step": 862 + }, + { + "epoch": 0.011214291399398677, + "grad_norm": 0.48085567355155945, + "learning_rate": 0.00019779825576105747, + "loss": 1.564, + "step": 863 + }, + { + "epoch": 0.011227285943314552, + "grad_norm": 0.3470838963985443, + "learning_rate": 0.0001977956562991461, + "loss": 1.4226, + "step": 864 + }, + { + "epoch": 0.011240280487230424, + "grad_norm": 0.3490869104862213, + "learning_rate": 0.00019779305683723472, + "loss": 1.5778, + "step": 865 + }, + { + "epoch": 0.011253275031146297, + "grad_norm": 0.4007914066314697, + "learning_rate": 0.0001977904573753233, + "loss": 1.6429, + "step": 866 + }, + { + "epoch": 0.01126626957506217, + "grad_norm": 0.3783588707447052, + "learning_rate": 0.0001977878579134119, + "loss": 1.4628, + "step": 867 + }, + { + "epoch": 0.011279264118978043, + "grad_norm": 0.3523287773132324, + "learning_rate": 0.00019778525845150056, + "loss": 1.4835, + "step": 868 + }, + { + "epoch": 0.011292258662893918, + "grad_norm": 0.3539884686470032, + "learning_rate": 0.00019778265898958916, + "loss": 1.524, + "step": 869 + }, + { + "epoch": 0.011305253206809791, + "grad_norm": 0.3133487403392792, + "learning_rate": 0.00019778005952767778, + "loss": 1.2986, + "step": 870 + }, + { + "epoch": 0.011318247750725664, + "grad_norm": 0.4288972318172455, + "learning_rate": 0.0001977774600657664, + "loss": 1.7114, + "step": 871 + }, + { + "epoch": 0.011331242294641537, + "grad_norm": 0.2804165482521057, + "learning_rate": 0.000197774860603855, + "loss": 1.325, + "step": 872 + }, + { + "epoch": 0.01134423683855741, + "grad_norm": 0.4230891466140747, + "learning_rate": 0.00019777226114194363, + "loss": 1.5955, + "step": 873 + }, + { + "epoch": 0.011357231382473285, + "grad_norm": 0.40529075264930725, + "learning_rate": 0.00019776966168003223, + "loss": 1.4972, + "step": 874 + }, + { + "epoch": 0.011370225926389158, + "grad_norm": 0.4664103388786316, + "learning_rate": 0.00019776706221812088, + "loss": 1.5273, + "step": 875 + }, + { + "epoch": 0.01138322047030503, + "grad_norm": 0.37206152081489563, + "learning_rate": 0.00019776446275620948, + "loss": 1.4608, + "step": 876 + }, + { + "epoch": 0.011396215014220903, + "grad_norm": 0.48054125905036926, + "learning_rate": 0.0001977618632942981, + "loss": 1.5868, + "step": 877 + }, + { + "epoch": 0.011409209558136778, + "grad_norm": 0.3271986246109009, + "learning_rate": 0.0001977592638323867, + "loss": 1.3005, + "step": 878 + }, + { + "epoch": 0.011422204102052651, + "grad_norm": 0.3968530297279358, + "learning_rate": 0.00019775666437047532, + "loss": 1.3459, + "step": 879 + }, + { + "epoch": 0.011435198645968524, + "grad_norm": 0.32345154881477356, + "learning_rate": 0.00019775406490856395, + "loss": 1.5788, + "step": 880 + }, + { + "epoch": 0.011448193189884397, + "grad_norm": 0.4595682621002197, + "learning_rate": 0.00019775146544665254, + "loss": 1.4036, + "step": 881 + }, + { + "epoch": 0.01146118773380027, + "grad_norm": 0.34216704964637756, + "learning_rate": 0.00019774886598474117, + "loss": 1.591, + "step": 882 + }, + { + "epoch": 0.011474182277716145, + "grad_norm": 0.40196308493614197, + "learning_rate": 0.0001977462665228298, + "loss": 1.4519, + "step": 883 + }, + { + "epoch": 0.011487176821632018, + "grad_norm": 0.3939869701862335, + "learning_rate": 0.0001977436670609184, + "loss": 1.2319, + "step": 884 + }, + { + "epoch": 0.01150017136554789, + "grad_norm": 0.42931994795799255, + "learning_rate": 0.00019774106759900702, + "loss": 1.5851, + "step": 885 + }, + { + "epoch": 0.011513165909463764, + "grad_norm": 0.3681303560733795, + "learning_rate": 0.0001977384681370956, + "loss": 1.4846, + "step": 886 + }, + { + "epoch": 0.011526160453379636, + "grad_norm": 0.23849858343601227, + "learning_rate": 0.00019773586867518426, + "loss": 1.1437, + "step": 887 + }, + { + "epoch": 0.011539154997295511, + "grad_norm": 0.35961535573005676, + "learning_rate": 0.00019773326921327286, + "loss": 1.5136, + "step": 888 + }, + { + "epoch": 0.011552149541211384, + "grad_norm": 0.4356399476528168, + "learning_rate": 0.00019773066975136149, + "loss": 1.6493, + "step": 889 + }, + { + "epoch": 0.011565144085127257, + "grad_norm": 0.3308802843093872, + "learning_rate": 0.00019772807028945008, + "loss": 1.312, + "step": 890 + }, + { + "epoch": 0.01157813862904313, + "grad_norm": 0.29127219319343567, + "learning_rate": 0.0001977254708275387, + "loss": 1.3896, + "step": 891 + }, + { + "epoch": 0.011591133172959003, + "grad_norm": 0.2908242344856262, + "learning_rate": 0.00019772287136562733, + "loss": 1.4376, + "step": 892 + }, + { + "epoch": 0.011604127716874878, + "grad_norm": 0.3977406919002533, + "learning_rate": 0.00019772027190371593, + "loss": 1.5558, + "step": 893 + }, + { + "epoch": 0.01161712226079075, + "grad_norm": 0.26013070344924927, + "learning_rate": 0.00019771767244180455, + "loss": 1.2552, + "step": 894 + }, + { + "epoch": 0.011630116804706624, + "grad_norm": 0.32726550102233887, + "learning_rate": 0.00019771507297989318, + "loss": 1.5186, + "step": 895 + }, + { + "epoch": 0.011643111348622497, + "grad_norm": 0.39039891958236694, + "learning_rate": 0.00019771247351798178, + "loss": 1.3885, + "step": 896 + }, + { + "epoch": 0.011656105892538371, + "grad_norm": 0.23171035945415497, + "learning_rate": 0.0001977098740560704, + "loss": 1.2481, + "step": 897 + }, + { + "epoch": 0.011669100436454244, + "grad_norm": 0.3014744818210602, + "learning_rate": 0.000197707274594159, + "loss": 1.2552, + "step": 898 + }, + { + "epoch": 0.011682094980370117, + "grad_norm": 0.3523556590080261, + "learning_rate": 0.00019770467513224765, + "loss": 1.426, + "step": 899 + }, + { + "epoch": 0.01169508952428599, + "grad_norm": 0.4000703692436218, + "learning_rate": 0.00019770207567033625, + "loss": 1.3178, + "step": 900 + }, + { + "epoch": 0.011708084068201863, + "grad_norm": 0.36981841921806335, + "learning_rate": 0.00019769947620842487, + "loss": 1.4109, + "step": 901 + }, + { + "epoch": 0.011721078612117738, + "grad_norm": 0.36349907517433167, + "learning_rate": 0.00019769687674651347, + "loss": 1.5898, + "step": 902 + }, + { + "epoch": 0.01173407315603361, + "grad_norm": 0.363073468208313, + "learning_rate": 0.0001976942772846021, + "loss": 1.4734, + "step": 903 + }, + { + "epoch": 0.011747067699949484, + "grad_norm": 0.3224738538265228, + "learning_rate": 0.00019769167782269072, + "loss": 1.3555, + "step": 904 + }, + { + "epoch": 0.011760062243865357, + "grad_norm": 0.42104336619377136, + "learning_rate": 0.00019768907836077932, + "loss": 1.5197, + "step": 905 + }, + { + "epoch": 0.01177305678778123, + "grad_norm": 0.3163914084434509, + "learning_rate": 0.00019768647889886794, + "loss": 1.5051, + "step": 906 + }, + { + "epoch": 0.011786051331697104, + "grad_norm": 0.307299941778183, + "learning_rate": 0.00019768387943695656, + "loss": 1.4167, + "step": 907 + }, + { + "epoch": 0.011799045875612977, + "grad_norm": 0.3385258913040161, + "learning_rate": 0.00019768127997504516, + "loss": 1.3296, + "step": 908 + }, + { + "epoch": 0.01181204041952885, + "grad_norm": 0.3504544794559479, + "learning_rate": 0.00019767868051313379, + "loss": 1.5734, + "step": 909 + }, + { + "epoch": 0.011825034963444723, + "grad_norm": 0.28617730736732483, + "learning_rate": 0.0001976760810512224, + "loss": 1.458, + "step": 910 + }, + { + "epoch": 0.011838029507360596, + "grad_norm": 0.3386898934841156, + "learning_rate": 0.00019767348158931104, + "loss": 1.3342, + "step": 911 + }, + { + "epoch": 0.01185102405127647, + "grad_norm": 0.3196926712989807, + "learning_rate": 0.00019767088212739963, + "loss": 1.6146, + "step": 912 + }, + { + "epoch": 0.011864018595192344, + "grad_norm": 0.3252779245376587, + "learning_rate": 0.00019766828266548826, + "loss": 1.4038, + "step": 913 + }, + { + "epoch": 0.011877013139108217, + "grad_norm": 0.33069950342178345, + "learning_rate": 0.00019766568320357688, + "loss": 1.56, + "step": 914 + }, + { + "epoch": 0.01189000768302409, + "grad_norm": 0.3373044729232788, + "learning_rate": 0.00019766308374166548, + "loss": 1.4666, + "step": 915 + }, + { + "epoch": 0.011903002226939964, + "grad_norm": 0.33167558908462524, + "learning_rate": 0.0001976604842797541, + "loss": 1.5786, + "step": 916 + }, + { + "epoch": 0.011915996770855837, + "grad_norm": 0.33954721689224243, + "learning_rate": 0.0001976578848178427, + "loss": 1.5849, + "step": 917 + }, + { + "epoch": 0.01192899131477171, + "grad_norm": 0.3559896945953369, + "learning_rate": 0.00019765528535593135, + "loss": 1.3744, + "step": 918 + }, + { + "epoch": 0.011941985858687583, + "grad_norm": 0.31615790724754333, + "learning_rate": 0.00019765268589401995, + "loss": 1.488, + "step": 919 + }, + { + "epoch": 0.011954980402603456, + "grad_norm": 0.39266765117645264, + "learning_rate": 0.00019765008643210857, + "loss": 1.4314, + "step": 920 + }, + { + "epoch": 0.011967974946519331, + "grad_norm": 0.3904924690723419, + "learning_rate": 0.00019764748697019717, + "loss": 1.5633, + "step": 921 + }, + { + "epoch": 0.011980969490435204, + "grad_norm": 0.32989031076431274, + "learning_rate": 0.0001976448875082858, + "loss": 1.2155, + "step": 922 + }, + { + "epoch": 0.011993964034351077, + "grad_norm": 0.34584271907806396, + "learning_rate": 0.00019764228804637442, + "loss": 1.4938, + "step": 923 + }, + { + "epoch": 0.01200695857826695, + "grad_norm": 0.35245075821876526, + "learning_rate": 0.00019763968858446302, + "loss": 1.4404, + "step": 924 + }, + { + "epoch": 0.012019953122182823, + "grad_norm": 0.330208420753479, + "learning_rate": 0.00019763708912255164, + "loss": 1.8242, + "step": 925 + }, + { + "epoch": 0.012032947666098697, + "grad_norm": 0.3622221350669861, + "learning_rate": 0.00019763448966064027, + "loss": 1.6096, + "step": 926 + }, + { + "epoch": 0.01204594221001457, + "grad_norm": 0.3644472360610962, + "learning_rate": 0.00019763189019872886, + "loss": 1.3615, + "step": 927 + }, + { + "epoch": 0.012058936753930443, + "grad_norm": 0.33621031045913696, + "learning_rate": 0.0001976292907368175, + "loss": 1.6668, + "step": 928 + }, + { + "epoch": 0.012071931297846316, + "grad_norm": 0.30896222591400146, + "learning_rate": 0.00019762669127490609, + "loss": 1.3955, + "step": 929 + }, + { + "epoch": 0.01208492584176219, + "grad_norm": 0.38593050837516785, + "learning_rate": 0.00019762409181299474, + "loss": 1.5416, + "step": 930 + }, + { + "epoch": 0.012097920385678064, + "grad_norm": 0.3486964702606201, + "learning_rate": 0.00019762149235108334, + "loss": 1.4852, + "step": 931 + }, + { + "epoch": 0.012110914929593937, + "grad_norm": 0.4540570080280304, + "learning_rate": 0.00019761889288917196, + "loss": 1.6986, + "step": 932 + }, + { + "epoch": 0.01212390947350981, + "grad_norm": 0.3150416314601898, + "learning_rate": 0.00019761629342726056, + "loss": 1.3562, + "step": 933 + }, + { + "epoch": 0.012136904017425683, + "grad_norm": 0.2738734185695648, + "learning_rate": 0.00019761369396534918, + "loss": 1.4792, + "step": 934 + }, + { + "epoch": 0.012149898561341558, + "grad_norm": 0.33239731192588806, + "learning_rate": 0.0001976110945034378, + "loss": 1.3922, + "step": 935 + }, + { + "epoch": 0.01216289310525743, + "grad_norm": 0.3253883123397827, + "learning_rate": 0.0001976084950415264, + "loss": 1.443, + "step": 936 + }, + { + "epoch": 0.012175887649173303, + "grad_norm": 0.39133119583129883, + "learning_rate": 0.00019760589557961503, + "loss": 1.4235, + "step": 937 + }, + { + "epoch": 0.012188882193089176, + "grad_norm": 0.3229159414768219, + "learning_rate": 0.00019760329611770365, + "loss": 1.6038, + "step": 938 + }, + { + "epoch": 0.01220187673700505, + "grad_norm": 0.3184714615345001, + "learning_rate": 0.00019760069665579225, + "loss": 1.5701, + "step": 939 + }, + { + "epoch": 0.012214871280920924, + "grad_norm": 0.37220147252082825, + "learning_rate": 0.00019759809719388087, + "loss": 1.4988, + "step": 940 + }, + { + "epoch": 0.012227865824836797, + "grad_norm": 0.320101797580719, + "learning_rate": 0.00019759549773196947, + "loss": 1.4489, + "step": 941 + }, + { + "epoch": 0.01224086036875267, + "grad_norm": 0.3790021538734436, + "learning_rate": 0.00019759289827005812, + "loss": 1.4883, + "step": 942 + }, + { + "epoch": 0.012253854912668543, + "grad_norm": 0.390257865190506, + "learning_rate": 0.00019759029880814672, + "loss": 1.4613, + "step": 943 + }, + { + "epoch": 0.012266849456584416, + "grad_norm": 0.3869599401950836, + "learning_rate": 0.00019758769934623535, + "loss": 1.4212, + "step": 944 + }, + { + "epoch": 0.01227984400050029, + "grad_norm": 0.38254520297050476, + "learning_rate": 0.00019758509988432397, + "loss": 1.414, + "step": 945 + }, + { + "epoch": 0.012292838544416164, + "grad_norm": 0.36777254939079285, + "learning_rate": 0.00019758250042241257, + "loss": 1.4511, + "step": 946 + }, + { + "epoch": 0.012305833088332037, + "grad_norm": 0.4114663898944855, + "learning_rate": 0.0001975799009605012, + "loss": 1.5631, + "step": 947 + }, + { + "epoch": 0.01231882763224791, + "grad_norm": 0.4308132231235504, + "learning_rate": 0.0001975773014985898, + "loss": 1.4463, + "step": 948 + }, + { + "epoch": 0.012331822176163782, + "grad_norm": 0.47477850317955017, + "learning_rate": 0.00019757470203667844, + "loss": 1.6277, + "step": 949 + }, + { + "epoch": 0.012344816720079657, + "grad_norm": 0.37226349115371704, + "learning_rate": 0.00019757210257476704, + "loss": 1.5688, + "step": 950 + }, + { + "epoch": 0.01235781126399553, + "grad_norm": 0.3418992757797241, + "learning_rate": 0.00019756950311285564, + "loss": 1.3014, + "step": 951 + }, + { + "epoch": 0.012370805807911403, + "grad_norm": 0.32099708914756775, + "learning_rate": 0.00019756690365094426, + "loss": 1.4976, + "step": 952 + }, + { + "epoch": 0.012383800351827276, + "grad_norm": 0.3482179641723633, + "learning_rate": 0.00019756430418903288, + "loss": 1.383, + "step": 953 + }, + { + "epoch": 0.01239679489574315, + "grad_norm": 0.3348259925842285, + "learning_rate": 0.0001975617047271215, + "loss": 1.4764, + "step": 954 + }, + { + "epoch": 0.012409789439659024, + "grad_norm": 0.37045207619667053, + "learning_rate": 0.0001975591052652101, + "loss": 1.416, + "step": 955 + }, + { + "epoch": 0.012422783983574897, + "grad_norm": 0.4267137944698334, + "learning_rate": 0.00019755650580329873, + "loss": 1.5242, + "step": 956 + }, + { + "epoch": 0.01243577852749077, + "grad_norm": 0.3423871397972107, + "learning_rate": 0.00019755390634138736, + "loss": 1.5163, + "step": 957 + }, + { + "epoch": 0.012448773071406643, + "grad_norm": 0.4420553743839264, + "learning_rate": 0.00019755130687947595, + "loss": 1.3978, + "step": 958 + }, + { + "epoch": 0.012461767615322517, + "grad_norm": 0.39365220069885254, + "learning_rate": 0.00019754870741756458, + "loss": 1.4297, + "step": 959 + }, + { + "epoch": 0.01247476215923839, + "grad_norm": 0.3856015205383301, + "learning_rate": 0.00019754610795565317, + "loss": 1.3805, + "step": 960 + }, + { + "epoch": 0.012487756703154263, + "grad_norm": 0.40324947237968445, + "learning_rate": 0.00019754350849374183, + "loss": 1.5517, + "step": 961 + }, + { + "epoch": 0.012500751247070136, + "grad_norm": 0.2551697790622711, + "learning_rate": 0.00019754090903183042, + "loss": 1.3591, + "step": 962 + }, + { + "epoch": 0.012513745790986009, + "grad_norm": 0.3336125314235687, + "learning_rate": 0.00019753830956991902, + "loss": 1.7566, + "step": 963 + }, + { + "epoch": 0.012526740334901884, + "grad_norm": 0.3386547565460205, + "learning_rate": 0.00019753571010800765, + "loss": 1.5109, + "step": 964 + }, + { + "epoch": 0.012539734878817757, + "grad_norm": 0.3088414669036865, + "learning_rate": 0.00019753311064609627, + "loss": 1.3733, + "step": 965 + }, + { + "epoch": 0.01255272942273363, + "grad_norm": 0.3609013557434082, + "learning_rate": 0.0001975305111841849, + "loss": 1.3377, + "step": 966 + }, + { + "epoch": 0.012565723966649503, + "grad_norm": 0.35096341371536255, + "learning_rate": 0.0001975279117222735, + "loss": 1.4581, + "step": 967 + }, + { + "epoch": 0.012578718510565376, + "grad_norm": 0.3886059522628784, + "learning_rate": 0.00019752531226036212, + "loss": 1.4808, + "step": 968 + }, + { + "epoch": 0.01259171305448125, + "grad_norm": 0.4206385314464569, + "learning_rate": 0.00019752271279845074, + "loss": 1.4298, + "step": 969 + }, + { + "epoch": 0.012604707598397123, + "grad_norm": 0.31605201959609985, + "learning_rate": 0.00019752011333653934, + "loss": 1.3647, + "step": 970 + }, + { + "epoch": 0.012617702142312996, + "grad_norm": 0.2802874445915222, + "learning_rate": 0.00019751751387462796, + "loss": 1.2144, + "step": 971 + }, + { + "epoch": 0.01263069668622887, + "grad_norm": 0.2805991768836975, + "learning_rate": 0.00019751491441271656, + "loss": 1.3136, + "step": 972 + }, + { + "epoch": 0.012643691230144742, + "grad_norm": 0.38155031204223633, + "learning_rate": 0.0001975123149508052, + "loss": 1.4381, + "step": 973 + }, + { + "epoch": 0.012656685774060617, + "grad_norm": 0.38293060660362244, + "learning_rate": 0.0001975097154888938, + "loss": 1.4946, + "step": 974 + }, + { + "epoch": 0.01266968031797649, + "grad_norm": 0.3291340172290802, + "learning_rate": 0.0001975071160269824, + "loss": 1.4199, + "step": 975 + }, + { + "epoch": 0.012682674861892363, + "grad_norm": 0.401024729013443, + "learning_rate": 0.00019750451656507103, + "loss": 1.4644, + "step": 976 + }, + { + "epoch": 0.012695669405808236, + "grad_norm": 0.30952614545822144, + "learning_rate": 0.00019750191710315966, + "loss": 1.6848, + "step": 977 + }, + { + "epoch": 0.01270866394972411, + "grad_norm": 0.2506324052810669, + "learning_rate": 0.00019749931764124828, + "loss": 1.3772, + "step": 978 + }, + { + "epoch": 0.012721658493639983, + "grad_norm": 0.47377681732177734, + "learning_rate": 0.00019749671817933688, + "loss": 1.5883, + "step": 979 + }, + { + "epoch": 0.012734653037555856, + "grad_norm": 0.35284608602523804, + "learning_rate": 0.0001974941187174255, + "loss": 1.696, + "step": 980 + }, + { + "epoch": 0.01274764758147173, + "grad_norm": 0.41148173809051514, + "learning_rate": 0.00019749151925551413, + "loss": 1.4473, + "step": 981 + }, + { + "epoch": 0.012760642125387602, + "grad_norm": 0.3409004807472229, + "learning_rate": 0.00019748891979360272, + "loss": 1.3546, + "step": 982 + }, + { + "epoch": 0.012773636669303477, + "grad_norm": 0.2892957627773285, + "learning_rate": 0.00019748632033169135, + "loss": 1.413, + "step": 983 + }, + { + "epoch": 0.01278663121321935, + "grad_norm": 0.28190919756889343, + "learning_rate": 0.00019748372086977997, + "loss": 1.341, + "step": 984 + }, + { + "epoch": 0.012799625757135223, + "grad_norm": 0.2929656505584717, + "learning_rate": 0.0001974811214078686, + "loss": 1.361, + "step": 985 + }, + { + "epoch": 0.012812620301051096, + "grad_norm": 0.3396834433078766, + "learning_rate": 0.0001974785219459572, + "loss": 1.6593, + "step": 986 + }, + { + "epoch": 0.012825614844966969, + "grad_norm": 0.35800784826278687, + "learning_rate": 0.00019747592248404582, + "loss": 1.5808, + "step": 987 + }, + { + "epoch": 0.012838609388882843, + "grad_norm": 0.36989840865135193, + "learning_rate": 0.00019747332302213444, + "loss": 1.7048, + "step": 988 + }, + { + "epoch": 0.012851603932798716, + "grad_norm": 0.4329179525375366, + "learning_rate": 0.00019747072356022304, + "loss": 1.5349, + "step": 989 + }, + { + "epoch": 0.01286459847671459, + "grad_norm": 0.32291293144226074, + "learning_rate": 0.00019746812409831166, + "loss": 1.3178, + "step": 990 + }, + { + "epoch": 0.012877593020630462, + "grad_norm": 0.2734065055847168, + "learning_rate": 0.00019746552463640026, + "loss": 1.419, + "step": 991 + }, + { + "epoch": 0.012890587564546335, + "grad_norm": 0.32284560799598694, + "learning_rate": 0.0001974629251744889, + "loss": 1.221, + "step": 992 + }, + { + "epoch": 0.01290358210846221, + "grad_norm": 0.37148168683052063, + "learning_rate": 0.0001974603257125775, + "loss": 1.5621, + "step": 993 + }, + { + "epoch": 0.012916576652378083, + "grad_norm": 0.4166358709335327, + "learning_rate": 0.0001974577262506661, + "loss": 1.565, + "step": 994 + }, + { + "epoch": 0.012929571196293956, + "grad_norm": 0.41298240423202515, + "learning_rate": 0.00019745512678875473, + "loss": 1.4933, + "step": 995 + }, + { + "epoch": 0.012942565740209829, + "grad_norm": 0.41472184658050537, + "learning_rate": 0.00019745252732684336, + "loss": 1.5142, + "step": 996 + }, + { + "epoch": 0.012955560284125704, + "grad_norm": 0.365089476108551, + "learning_rate": 0.00019744992786493198, + "loss": 1.4963, + "step": 997 + }, + { + "epoch": 0.012968554828041576, + "grad_norm": 0.46369650959968567, + "learning_rate": 0.00019744732840302058, + "loss": 1.5349, + "step": 998 + }, + { + "epoch": 0.01298154937195745, + "grad_norm": 0.2969778776168823, + "learning_rate": 0.0001974447289411092, + "loss": 1.3989, + "step": 999 + }, + { + "epoch": 0.012994543915873322, + "grad_norm": 0.3416184186935425, + "learning_rate": 0.00019744212947919783, + "loss": 1.4271, + "step": 1000 + }, + { + "epoch": 0.013007538459789195, + "grad_norm": 0.3964928686618805, + "learning_rate": 0.00019743953001728643, + "loss": 1.4873, + "step": 1001 + }, + { + "epoch": 0.01302053300370507, + "grad_norm": 0.2853555977344513, + "learning_rate": 0.00019743693055537505, + "loss": 1.3868, + "step": 1002 + }, + { + "epoch": 0.013033527547620943, + "grad_norm": 0.38141801953315735, + "learning_rate": 0.00019743433109346365, + "loss": 1.5366, + "step": 1003 + }, + { + "epoch": 0.013046522091536816, + "grad_norm": 0.32994070649147034, + "learning_rate": 0.0001974317316315523, + "loss": 1.3386, + "step": 1004 + }, + { + "epoch": 0.013059516635452689, + "grad_norm": 0.372125506401062, + "learning_rate": 0.0001974291321696409, + "loss": 1.6295, + "step": 1005 + }, + { + "epoch": 0.013072511179368562, + "grad_norm": 0.357553094625473, + "learning_rate": 0.0001974265327077295, + "loss": 1.5544, + "step": 1006 + }, + { + "epoch": 0.013085505723284437, + "grad_norm": 0.33602210879325867, + "learning_rate": 0.00019742393324581812, + "loss": 1.4295, + "step": 1007 + }, + { + "epoch": 0.01309850026720031, + "grad_norm": 0.2846996486186981, + "learning_rate": 0.00019742133378390674, + "loss": 1.3163, + "step": 1008 + }, + { + "epoch": 0.013111494811116182, + "grad_norm": 0.2966218888759613, + "learning_rate": 0.00019741873432199537, + "loss": 1.1789, + "step": 1009 + }, + { + "epoch": 0.013124489355032055, + "grad_norm": 0.3746318519115448, + "learning_rate": 0.00019741613486008396, + "loss": 1.5497, + "step": 1010 + }, + { + "epoch": 0.013137483898947928, + "grad_norm": 0.3484114110469818, + "learning_rate": 0.0001974135353981726, + "loss": 1.5893, + "step": 1011 + }, + { + "epoch": 0.013150478442863803, + "grad_norm": 0.43390658497810364, + "learning_rate": 0.00019741093593626121, + "loss": 1.513, + "step": 1012 + }, + { + "epoch": 0.013163472986779676, + "grad_norm": 0.3467201590538025, + "learning_rate": 0.0001974083364743498, + "loss": 1.2857, + "step": 1013 + }, + { + "epoch": 0.013176467530695549, + "grad_norm": 0.2515032887458801, + "learning_rate": 0.00019740573701243844, + "loss": 1.5167, + "step": 1014 + }, + { + "epoch": 0.013189462074611422, + "grad_norm": 0.3913467824459076, + "learning_rate": 0.00019740313755052703, + "loss": 1.4647, + "step": 1015 + }, + { + "epoch": 0.013202456618527297, + "grad_norm": 0.4138876497745514, + "learning_rate": 0.00019740053808861568, + "loss": 1.5505, + "step": 1016 + }, + { + "epoch": 0.01321545116244317, + "grad_norm": 0.36367595195770264, + "learning_rate": 0.00019739793862670428, + "loss": 1.548, + "step": 1017 + }, + { + "epoch": 0.013228445706359043, + "grad_norm": 0.3937438726425171, + "learning_rate": 0.00019739533916479288, + "loss": 1.5888, + "step": 1018 + }, + { + "epoch": 0.013241440250274916, + "grad_norm": 0.3310936391353607, + "learning_rate": 0.00019739273970288153, + "loss": 1.4138, + "step": 1019 + }, + { + "epoch": 0.013254434794190788, + "grad_norm": 0.32583069801330566, + "learning_rate": 0.00019739014024097013, + "loss": 1.465, + "step": 1020 + }, + { + "epoch": 0.013267429338106663, + "grad_norm": 0.6564087271690369, + "learning_rate": 0.00019738754077905875, + "loss": 1.4758, + "step": 1021 + }, + { + "epoch": 0.013280423882022536, + "grad_norm": 0.4228411614894867, + "learning_rate": 0.00019738494131714735, + "loss": 1.7239, + "step": 1022 + }, + { + "epoch": 0.013293418425938409, + "grad_norm": 0.38515767455101013, + "learning_rate": 0.00019738234185523597, + "loss": 1.3104, + "step": 1023 + }, + { + "epoch": 0.013306412969854282, + "grad_norm": 0.4113703668117523, + "learning_rate": 0.0001973797423933246, + "loss": 1.5076, + "step": 1024 + }, + { + "epoch": 0.013319407513770155, + "grad_norm": 0.30311328172683716, + "learning_rate": 0.0001973771429314132, + "loss": 1.6749, + "step": 1025 + }, + { + "epoch": 0.01333240205768603, + "grad_norm": 0.38779404759407043, + "learning_rate": 0.00019737454346950182, + "loss": 1.5167, + "step": 1026 + }, + { + "epoch": 0.013345396601601903, + "grad_norm": 0.3705992102622986, + "learning_rate": 0.00019737194400759045, + "loss": 1.6297, + "step": 1027 + }, + { + "epoch": 0.013358391145517776, + "grad_norm": 0.3721454441547394, + "learning_rate": 0.00019736934454567907, + "loss": 1.6093, + "step": 1028 + }, + { + "epoch": 0.013371385689433649, + "grad_norm": 0.27784106135368347, + "learning_rate": 0.00019736674508376767, + "loss": 1.4481, + "step": 1029 + }, + { + "epoch": 0.013384380233349522, + "grad_norm": 0.4030505418777466, + "learning_rate": 0.00019736414562185626, + "loss": 1.571, + "step": 1030 + }, + { + "epoch": 0.013397374777265396, + "grad_norm": 0.4560116231441498, + "learning_rate": 0.00019736154615994492, + "loss": 1.5739, + "step": 1031 + }, + { + "epoch": 0.01341036932118127, + "grad_norm": 0.44230586290359497, + "learning_rate": 0.00019735894669803351, + "loss": 1.5975, + "step": 1032 + }, + { + "epoch": 0.013423363865097142, + "grad_norm": 0.29934000968933105, + "learning_rate": 0.00019735634723612214, + "loss": 1.337, + "step": 1033 + }, + { + "epoch": 0.013436358409013015, + "grad_norm": 0.36191225051879883, + "learning_rate": 0.00019735374777421074, + "loss": 1.5397, + "step": 1034 + }, + { + "epoch": 0.01344935295292889, + "grad_norm": 0.20351794362068176, + "learning_rate": 0.00019735114831229936, + "loss": 1.3109, + "step": 1035 + }, + { + "epoch": 0.013462347496844763, + "grad_norm": 0.39175131916999817, + "learning_rate": 0.00019734854885038798, + "loss": 1.5401, + "step": 1036 + }, + { + "epoch": 0.013475342040760636, + "grad_norm": 0.3709506094455719, + "learning_rate": 0.00019734594938847658, + "loss": 1.4169, + "step": 1037 + }, + { + "epoch": 0.013488336584676509, + "grad_norm": 0.34935545921325684, + "learning_rate": 0.0001973433499265652, + "loss": 1.6892, + "step": 1038 + }, + { + "epoch": 0.013501331128592382, + "grad_norm": 0.38193994760513306, + "learning_rate": 0.00019734075046465383, + "loss": 1.5982, + "step": 1039 + }, + { + "epoch": 0.013514325672508256, + "grad_norm": 0.41490989923477173, + "learning_rate": 0.00019733815100274246, + "loss": 1.5192, + "step": 1040 + }, + { + "epoch": 0.01352732021642413, + "grad_norm": 0.4238545000553131, + "learning_rate": 0.00019733555154083105, + "loss": 1.4892, + "step": 1041 + }, + { + "epoch": 0.013540314760340002, + "grad_norm": 0.35215967893600464, + "learning_rate": 0.00019733295207891968, + "loss": 1.3275, + "step": 1042 + }, + { + "epoch": 0.013553309304255875, + "grad_norm": 0.39302146434783936, + "learning_rate": 0.0001973303526170083, + "loss": 1.5638, + "step": 1043 + }, + { + "epoch": 0.013566303848171748, + "grad_norm": 0.3888627886772156, + "learning_rate": 0.0001973277531550969, + "loss": 1.7963, + "step": 1044 + }, + { + "epoch": 0.013579298392087623, + "grad_norm": 0.3744841516017914, + "learning_rate": 0.00019732515369318552, + "loss": 1.4169, + "step": 1045 + }, + { + "epoch": 0.013592292936003496, + "grad_norm": 0.36582887172698975, + "learning_rate": 0.00019732255423127412, + "loss": 1.4175, + "step": 1046 + }, + { + "epoch": 0.013605287479919369, + "grad_norm": 0.40386176109313965, + "learning_rate": 0.00019731995476936275, + "loss": 1.4876, + "step": 1047 + }, + { + "epoch": 0.013618282023835242, + "grad_norm": 0.38611915707588196, + "learning_rate": 0.00019731735530745137, + "loss": 1.5448, + "step": 1048 + }, + { + "epoch": 0.013631276567751115, + "grad_norm": 0.3901870846748352, + "learning_rate": 0.00019731475584553997, + "loss": 1.4399, + "step": 1049 + }, + { + "epoch": 0.01364427111166699, + "grad_norm": 0.4732416868209839, + "learning_rate": 0.0001973121563836286, + "loss": 1.4883, + "step": 1050 + }, + { + "epoch": 0.013657265655582862, + "grad_norm": 0.39601173996925354, + "learning_rate": 0.00019730955692171722, + "loss": 1.4738, + "step": 1051 + }, + { + "epoch": 0.013670260199498735, + "grad_norm": 0.3134896457195282, + "learning_rate": 0.00019730695745980584, + "loss": 1.5333, + "step": 1052 + }, + { + "epoch": 0.013683254743414608, + "grad_norm": 0.32899072766304016, + "learning_rate": 0.00019730435799789444, + "loss": 1.4485, + "step": 1053 + }, + { + "epoch": 0.013696249287330483, + "grad_norm": 0.33218157291412354, + "learning_rate": 0.00019730175853598306, + "loss": 1.2999, + "step": 1054 + }, + { + "epoch": 0.013709243831246356, + "grad_norm": 0.37725093960762024, + "learning_rate": 0.0001972991590740717, + "loss": 1.6862, + "step": 1055 + }, + { + "epoch": 0.013722238375162229, + "grad_norm": 0.3622485399246216, + "learning_rate": 0.00019729655961216028, + "loss": 1.3255, + "step": 1056 + }, + { + "epoch": 0.013735232919078102, + "grad_norm": 0.23080238699913025, + "learning_rate": 0.0001972939601502489, + "loss": 1.326, + "step": 1057 + }, + { + "epoch": 0.013748227462993975, + "grad_norm": 0.4250565469264984, + "learning_rate": 0.00019729136068833753, + "loss": 1.3157, + "step": 1058 + }, + { + "epoch": 0.01376122200690985, + "grad_norm": 0.37228456139564514, + "learning_rate": 0.00019728876122642613, + "loss": 1.523, + "step": 1059 + }, + { + "epoch": 0.013774216550825722, + "grad_norm": 0.432044118642807, + "learning_rate": 0.00019728616176451476, + "loss": 1.5878, + "step": 1060 + }, + { + "epoch": 0.013787211094741595, + "grad_norm": 0.3209286630153656, + "learning_rate": 0.00019728356230260335, + "loss": 1.2784, + "step": 1061 + }, + { + "epoch": 0.013800205638657468, + "grad_norm": 0.41788139939308167, + "learning_rate": 0.000197280962840692, + "loss": 1.5172, + "step": 1062 + }, + { + "epoch": 0.013813200182573341, + "grad_norm": 0.375265896320343, + "learning_rate": 0.0001972783633787806, + "loss": 1.3935, + "step": 1063 + }, + { + "epoch": 0.013826194726489216, + "grad_norm": 0.30664366483688354, + "learning_rate": 0.00019727576391686923, + "loss": 1.4017, + "step": 1064 + }, + { + "epoch": 0.013839189270405089, + "grad_norm": 0.38297995924949646, + "learning_rate": 0.00019727316445495782, + "loss": 1.3393, + "step": 1065 + }, + { + "epoch": 0.013852183814320962, + "grad_norm": 0.4578399956226349, + "learning_rate": 0.00019727056499304645, + "loss": 1.5749, + "step": 1066 + }, + { + "epoch": 0.013865178358236835, + "grad_norm": 0.3051646053791046, + "learning_rate": 0.00019726796553113507, + "loss": 1.3907, + "step": 1067 + }, + { + "epoch": 0.013878172902152708, + "grad_norm": 0.34466204047203064, + "learning_rate": 0.00019726536606922367, + "loss": 1.5721, + "step": 1068 + }, + { + "epoch": 0.013891167446068583, + "grad_norm": 0.3795184791088104, + "learning_rate": 0.0001972627666073123, + "loss": 1.6227, + "step": 1069 + }, + { + "epoch": 0.013904161989984455, + "grad_norm": 0.3206294775009155, + "learning_rate": 0.00019726016714540092, + "loss": 1.3982, + "step": 1070 + }, + { + "epoch": 0.013917156533900328, + "grad_norm": 0.3425387144088745, + "learning_rate": 0.00019725756768348954, + "loss": 1.4367, + "step": 1071 + }, + { + "epoch": 0.013930151077816201, + "grad_norm": 0.4226791560649872, + "learning_rate": 0.00019725496822157814, + "loss": 1.5182, + "step": 1072 + }, + { + "epoch": 0.013943145621732076, + "grad_norm": 0.3256654739379883, + "learning_rate": 0.00019725236875966674, + "loss": 1.5103, + "step": 1073 + }, + { + "epoch": 0.013956140165647949, + "grad_norm": 0.4334042966365814, + "learning_rate": 0.0001972497692977554, + "loss": 1.4865, + "step": 1074 + }, + { + "epoch": 0.013969134709563822, + "grad_norm": 0.3617881238460541, + "learning_rate": 0.000197247169835844, + "loss": 1.3425, + "step": 1075 + }, + { + "epoch": 0.013982129253479695, + "grad_norm": 0.35514402389526367, + "learning_rate": 0.0001972445703739326, + "loss": 1.3842, + "step": 1076 + }, + { + "epoch": 0.013995123797395568, + "grad_norm": 0.3968522250652313, + "learning_rate": 0.0001972419709120212, + "loss": 1.565, + "step": 1077 + }, + { + "epoch": 0.014008118341311443, + "grad_norm": 0.3910624086856842, + "learning_rate": 0.00019723937145010983, + "loss": 1.6575, + "step": 1078 + }, + { + "epoch": 0.014021112885227316, + "grad_norm": 0.3052786588668823, + "learning_rate": 0.00019723677198819846, + "loss": 1.4456, + "step": 1079 + }, + { + "epoch": 0.014034107429143189, + "grad_norm": 0.3568594455718994, + "learning_rate": 0.00019723417252628706, + "loss": 1.4607, + "step": 1080 + }, + { + "epoch": 0.014047101973059061, + "grad_norm": 0.35911497473716736, + "learning_rate": 0.00019723157306437568, + "loss": 1.4244, + "step": 1081 + }, + { + "epoch": 0.014060096516974934, + "grad_norm": 0.35595959424972534, + "learning_rate": 0.0001972289736024643, + "loss": 1.4078, + "step": 1082 + }, + { + "epoch": 0.01407309106089081, + "grad_norm": 0.4164542555809021, + "learning_rate": 0.00019722637414055293, + "loss": 1.5674, + "step": 1083 + }, + { + "epoch": 0.014086085604806682, + "grad_norm": 0.4948754608631134, + "learning_rate": 0.00019722377467864153, + "loss": 1.5765, + "step": 1084 + }, + { + "epoch": 0.014099080148722555, + "grad_norm": 0.4149811863899231, + "learning_rate": 0.00019722117521673012, + "loss": 1.5938, + "step": 1085 + }, + { + "epoch": 0.014112074692638428, + "grad_norm": 0.2832207679748535, + "learning_rate": 0.00019721857575481878, + "loss": 1.429, + "step": 1086 + }, + { + "epoch": 0.014125069236554301, + "grad_norm": 0.46026912331581116, + "learning_rate": 0.00019721597629290737, + "loss": 1.6396, + "step": 1087 + }, + { + "epoch": 0.014138063780470176, + "grad_norm": 0.4021912217140198, + "learning_rate": 0.000197213376830996, + "loss": 1.4451, + "step": 1088 + }, + { + "epoch": 0.014151058324386049, + "grad_norm": 0.33511781692504883, + "learning_rate": 0.0001972107773690846, + "loss": 1.6081, + "step": 1089 + }, + { + "epoch": 0.014164052868301922, + "grad_norm": 0.2907991409301758, + "learning_rate": 0.00019720817790717322, + "loss": 1.4162, + "step": 1090 + }, + { + "epoch": 0.014177047412217795, + "grad_norm": 0.44519951939582825, + "learning_rate": 0.00019720557844526184, + "loss": 1.5443, + "step": 1091 + }, + { + "epoch": 0.01419004195613367, + "grad_norm": 0.30131325125694275, + "learning_rate": 0.00019720297898335044, + "loss": 1.4325, + "step": 1092 + }, + { + "epoch": 0.014203036500049542, + "grad_norm": 0.32041850686073303, + "learning_rate": 0.0001972003795214391, + "loss": 1.6483, + "step": 1093 + }, + { + "epoch": 0.014216031043965415, + "grad_norm": 0.3435966372489929, + "learning_rate": 0.0001971977800595277, + "loss": 1.4087, + "step": 1094 + }, + { + "epoch": 0.014229025587881288, + "grad_norm": 0.36322468519210815, + "learning_rate": 0.00019719518059761631, + "loss": 1.3984, + "step": 1095 + }, + { + "epoch": 0.014242020131797161, + "grad_norm": 0.3201417028903961, + "learning_rate": 0.0001971925811357049, + "loss": 1.4159, + "step": 1096 + }, + { + "epoch": 0.014255014675713036, + "grad_norm": 0.371951699256897, + "learning_rate": 0.00019718998167379354, + "loss": 1.4836, + "step": 1097 + }, + { + "epoch": 0.014268009219628909, + "grad_norm": 0.5051089525222778, + "learning_rate": 0.00019718738221188216, + "loss": 1.6927, + "step": 1098 + }, + { + "epoch": 0.014281003763544782, + "grad_norm": 0.36920613050460815, + "learning_rate": 0.00019718478274997076, + "loss": 1.4733, + "step": 1099 + }, + { + "epoch": 0.014293998307460655, + "grad_norm": 0.42907023429870605, + "learning_rate": 0.00019718218328805938, + "loss": 1.5133, + "step": 1100 + }, + { + "epoch": 0.014306992851376528, + "grad_norm": 0.34839338064193726, + "learning_rate": 0.000197179583826148, + "loss": 1.5482, + "step": 1101 + }, + { + "epoch": 0.014319987395292402, + "grad_norm": 0.32832157611846924, + "learning_rate": 0.0001971769843642366, + "loss": 1.5275, + "step": 1102 + }, + { + "epoch": 0.014332981939208275, + "grad_norm": 0.4097306728363037, + "learning_rate": 0.00019717438490232523, + "loss": 1.3461, + "step": 1103 + }, + { + "epoch": 0.014345976483124148, + "grad_norm": 0.3983323276042938, + "learning_rate": 0.00019717178544041383, + "loss": 1.4898, + "step": 1104 + }, + { + "epoch": 0.014358971027040021, + "grad_norm": 0.385093092918396, + "learning_rate": 0.00019716918597850248, + "loss": 1.4728, + "step": 1105 + }, + { + "epoch": 0.014371965570955894, + "grad_norm": 0.42422205209732056, + "learning_rate": 0.00019716658651659108, + "loss": 1.6079, + "step": 1106 + }, + { + "epoch": 0.014384960114871769, + "grad_norm": 0.33223721385002136, + "learning_rate": 0.0001971639870546797, + "loss": 1.3487, + "step": 1107 + }, + { + "epoch": 0.014397954658787642, + "grad_norm": 0.34783658385276794, + "learning_rate": 0.0001971613875927683, + "loss": 1.6666, + "step": 1108 + }, + { + "epoch": 0.014410949202703515, + "grad_norm": 0.42093703150749207, + "learning_rate": 0.00019715878813085692, + "loss": 1.7062, + "step": 1109 + }, + { + "epoch": 0.014423943746619388, + "grad_norm": 0.3577468693256378, + "learning_rate": 0.00019715618866894555, + "loss": 1.4418, + "step": 1110 + }, + { + "epoch": 0.014436938290535262, + "grad_norm": 0.3317162096500397, + "learning_rate": 0.00019715358920703414, + "loss": 1.5098, + "step": 1111 + }, + { + "epoch": 0.014449932834451135, + "grad_norm": 0.39414602518081665, + "learning_rate": 0.00019715098974512277, + "loss": 1.5375, + "step": 1112 + }, + { + "epoch": 0.014462927378367008, + "grad_norm": 0.3096737861633301, + "learning_rate": 0.0001971483902832114, + "loss": 1.5748, + "step": 1113 + }, + { + "epoch": 0.014475921922282881, + "grad_norm": 0.31180667877197266, + "learning_rate": 0.0001971457908213, + "loss": 1.2854, + "step": 1114 + }, + { + "epoch": 0.014488916466198754, + "grad_norm": 0.3488854765892029, + "learning_rate": 0.00019714319135938861, + "loss": 1.3896, + "step": 1115 + }, + { + "epoch": 0.014501911010114629, + "grad_norm": 0.4322400689125061, + "learning_rate": 0.0001971405918974772, + "loss": 1.6063, + "step": 1116 + }, + { + "epoch": 0.014514905554030502, + "grad_norm": 0.3862898647785187, + "learning_rate": 0.00019713799243556586, + "loss": 1.3693, + "step": 1117 + }, + { + "epoch": 0.014527900097946375, + "grad_norm": 0.4383971095085144, + "learning_rate": 0.00019713539297365446, + "loss": 1.4469, + "step": 1118 + }, + { + "epoch": 0.014540894641862248, + "grad_norm": 0.40656328201293945, + "learning_rate": 0.00019713279351174308, + "loss": 1.6469, + "step": 1119 + }, + { + "epoch": 0.01455388918577812, + "grad_norm": 0.3286970257759094, + "learning_rate": 0.00019713019404983168, + "loss": 1.448, + "step": 1120 + }, + { + "epoch": 0.014566883729693995, + "grad_norm": 0.3503006100654602, + "learning_rate": 0.0001971275945879203, + "loss": 1.4671, + "step": 1121 + }, + { + "epoch": 0.014579878273609868, + "grad_norm": 0.4297201633453369, + "learning_rate": 0.00019712499512600893, + "loss": 1.3163, + "step": 1122 + }, + { + "epoch": 0.014592872817525741, + "grad_norm": 0.3751363456249237, + "learning_rate": 0.00019712239566409753, + "loss": 1.4478, + "step": 1123 + }, + { + "epoch": 0.014605867361441614, + "grad_norm": 0.38053035736083984, + "learning_rate": 0.00019711979620218615, + "loss": 1.2915, + "step": 1124 + }, + { + "epoch": 0.014618861905357487, + "grad_norm": 0.3267087936401367, + "learning_rate": 0.00019711719674027478, + "loss": 1.4562, + "step": 1125 + }, + { + "epoch": 0.014631856449273362, + "grad_norm": 0.3064032793045044, + "learning_rate": 0.0001971145972783634, + "loss": 1.4026, + "step": 1126 + }, + { + "epoch": 0.014644850993189235, + "grad_norm": 0.360404372215271, + "learning_rate": 0.000197111997816452, + "loss": 1.3682, + "step": 1127 + }, + { + "epoch": 0.014657845537105108, + "grad_norm": 0.5024319887161255, + "learning_rate": 0.0001971093983545406, + "loss": 1.6088, + "step": 1128 + }, + { + "epoch": 0.01467084008102098, + "grad_norm": 0.38353124260902405, + "learning_rate": 0.00019710679889262925, + "loss": 1.4739, + "step": 1129 + }, + { + "epoch": 0.014683834624936854, + "grad_norm": 0.35435613989830017, + "learning_rate": 0.00019710419943071785, + "loss": 1.3529, + "step": 1130 + }, + { + "epoch": 0.014696829168852728, + "grad_norm": 0.43681633472442627, + "learning_rate": 0.00019710159996880647, + "loss": 1.3106, + "step": 1131 + }, + { + "epoch": 0.014709823712768601, + "grad_norm": 0.4419565498828888, + "learning_rate": 0.0001970990005068951, + "loss": 1.5253, + "step": 1132 + }, + { + "epoch": 0.014722818256684474, + "grad_norm": 0.3059302270412445, + "learning_rate": 0.0001970964010449837, + "loss": 1.3594, + "step": 1133 + }, + { + "epoch": 0.014735812800600347, + "grad_norm": 0.38897067308425903, + "learning_rate": 0.00019709380158307232, + "loss": 1.4217, + "step": 1134 + }, + { + "epoch": 0.014748807344516222, + "grad_norm": 0.34824851155281067, + "learning_rate": 0.00019709120212116091, + "loss": 1.4551, + "step": 1135 + }, + { + "epoch": 0.014761801888432095, + "grad_norm": 0.3961305618286133, + "learning_rate": 0.00019708860265924957, + "loss": 1.6001, + "step": 1136 + }, + { + "epoch": 0.014774796432347968, + "grad_norm": 0.3158442974090576, + "learning_rate": 0.00019708600319733816, + "loss": 1.4555, + "step": 1137 + }, + { + "epoch": 0.014787790976263841, + "grad_norm": 0.3308698832988739, + "learning_rate": 0.0001970834037354268, + "loss": 1.5223, + "step": 1138 + }, + { + "epoch": 0.014800785520179714, + "grad_norm": 0.31649670004844666, + "learning_rate": 0.00019708080427351538, + "loss": 1.4427, + "step": 1139 + }, + { + "epoch": 0.014813780064095589, + "grad_norm": 0.30845457315444946, + "learning_rate": 0.000197078204811604, + "loss": 1.4735, + "step": 1140 + }, + { + "epoch": 0.014826774608011462, + "grad_norm": 0.34705907106399536, + "learning_rate": 0.00019707560534969263, + "loss": 1.4968, + "step": 1141 + }, + { + "epoch": 0.014839769151927334, + "grad_norm": 0.33273646235466003, + "learning_rate": 0.00019707300588778123, + "loss": 1.1341, + "step": 1142 + }, + { + "epoch": 0.014852763695843207, + "grad_norm": 0.37411919236183167, + "learning_rate": 0.00019707040642586986, + "loss": 1.6221, + "step": 1143 + }, + { + "epoch": 0.01486575823975908, + "grad_norm": 0.3228834271430969, + "learning_rate": 0.00019706780696395848, + "loss": 1.7656, + "step": 1144 + }, + { + "epoch": 0.014878752783674955, + "grad_norm": 0.24861685931682587, + "learning_rate": 0.00019706520750204708, + "loss": 1.219, + "step": 1145 + }, + { + "epoch": 0.014891747327590828, + "grad_norm": 0.3395216166973114, + "learning_rate": 0.0001970626080401357, + "loss": 1.3166, + "step": 1146 + }, + { + "epoch": 0.014904741871506701, + "grad_norm": 0.28913232684135437, + "learning_rate": 0.0001970600085782243, + "loss": 1.4134, + "step": 1147 + }, + { + "epoch": 0.014917736415422574, + "grad_norm": 0.399809330701828, + "learning_rate": 0.00019705740911631295, + "loss": 1.5225, + "step": 1148 + }, + { + "epoch": 0.014930730959338447, + "grad_norm": 0.43894335627555847, + "learning_rate": 0.00019705480965440155, + "loss": 1.6128, + "step": 1149 + }, + { + "epoch": 0.014943725503254322, + "grad_norm": 0.3003581166267395, + "learning_rate": 0.00019705221019249017, + "loss": 1.3618, + "step": 1150 + }, + { + "epoch": 0.014956720047170195, + "grad_norm": 0.5539864301681519, + "learning_rate": 0.00019704961073057877, + "loss": 1.5579, + "step": 1151 + }, + { + "epoch": 0.014969714591086068, + "grad_norm": 0.4360288083553314, + "learning_rate": 0.0001970470112686674, + "loss": 1.4598, + "step": 1152 + }, + { + "epoch": 0.01498270913500194, + "grad_norm": 0.3488319218158722, + "learning_rate": 0.00019704441180675602, + "loss": 1.4878, + "step": 1153 + }, + { + "epoch": 0.014995703678917815, + "grad_norm": 0.414284348487854, + "learning_rate": 0.00019704181234484462, + "loss": 1.3644, + "step": 1154 + }, + { + "epoch": 0.015008698222833688, + "grad_norm": 0.3985223174095154, + "learning_rate": 0.00019703921288293324, + "loss": 1.6135, + "step": 1155 + }, + { + "epoch": 0.015021692766749561, + "grad_norm": 0.3805556297302246, + "learning_rate": 0.00019703661342102187, + "loss": 1.5731, + "step": 1156 + }, + { + "epoch": 0.015034687310665434, + "grad_norm": 0.3703943192958832, + "learning_rate": 0.00019703401395911046, + "loss": 1.5537, + "step": 1157 + }, + { + "epoch": 0.015047681854581307, + "grad_norm": 0.3613834083080292, + "learning_rate": 0.0001970314144971991, + "loss": 1.6242, + "step": 1158 + }, + { + "epoch": 0.015060676398497182, + "grad_norm": 0.3949839770793915, + "learning_rate": 0.00019702881503528768, + "loss": 1.4553, + "step": 1159 + }, + { + "epoch": 0.015073670942413055, + "grad_norm": 0.34406524896621704, + "learning_rate": 0.00019702621557337634, + "loss": 1.1838, + "step": 1160 + }, + { + "epoch": 0.015086665486328928, + "grad_norm": 0.3827522099018097, + "learning_rate": 0.00019702361611146493, + "loss": 1.4566, + "step": 1161 + }, + { + "epoch": 0.0150996600302448, + "grad_norm": 0.2957545816898346, + "learning_rate": 0.00019702101664955356, + "loss": 1.7034, + "step": 1162 + }, + { + "epoch": 0.015112654574160674, + "grad_norm": 0.42204660177230835, + "learning_rate": 0.00019701841718764216, + "loss": 1.4461, + "step": 1163 + }, + { + "epoch": 0.015125649118076548, + "grad_norm": 0.44211307168006897, + "learning_rate": 0.00019701581772573078, + "loss": 1.6812, + "step": 1164 + }, + { + "epoch": 0.015138643661992421, + "grad_norm": 0.42191219329833984, + "learning_rate": 0.0001970132182638194, + "loss": 1.6109, + "step": 1165 + }, + { + "epoch": 0.015151638205908294, + "grad_norm": 0.4305635690689087, + "learning_rate": 0.000197010618801908, + "loss": 1.5388, + "step": 1166 + }, + { + "epoch": 0.015164632749824167, + "grad_norm": 0.3535262644290924, + "learning_rate": 0.00019700801933999665, + "loss": 1.5471, + "step": 1167 + }, + { + "epoch": 0.01517762729374004, + "grad_norm": 0.27894142270088196, + "learning_rate": 0.00019700541987808525, + "loss": 1.3392, + "step": 1168 + }, + { + "epoch": 0.015190621837655915, + "grad_norm": 0.34142574667930603, + "learning_rate": 0.00019700282041617385, + "loss": 1.285, + "step": 1169 + }, + { + "epoch": 0.015203616381571788, + "grad_norm": 0.30574700236320496, + "learning_rate": 0.00019700022095426247, + "loss": 1.1122, + "step": 1170 + }, + { + "epoch": 0.01521661092548766, + "grad_norm": 0.34303486347198486, + "learning_rate": 0.0001969976214923511, + "loss": 1.4074, + "step": 1171 + }, + { + "epoch": 0.015229605469403534, + "grad_norm": 0.39488857984542847, + "learning_rate": 0.00019699502203043972, + "loss": 1.5406, + "step": 1172 + }, + { + "epoch": 0.015242600013319408, + "grad_norm": 0.362435907125473, + "learning_rate": 0.00019699242256852832, + "loss": 1.4349, + "step": 1173 + }, + { + "epoch": 0.015255594557235281, + "grad_norm": 0.41773369908332825, + "learning_rate": 0.00019698982310661694, + "loss": 1.5431, + "step": 1174 + }, + { + "epoch": 0.015268589101151154, + "grad_norm": 0.4345804750919342, + "learning_rate": 0.00019698722364470557, + "loss": 1.4943, + "step": 1175 + }, + { + "epoch": 0.015281583645067027, + "grad_norm": 0.46782350540161133, + "learning_rate": 0.00019698462418279417, + "loss": 1.6564, + "step": 1176 + }, + { + "epoch": 0.0152945781889829, + "grad_norm": 0.40089139342308044, + "learning_rate": 0.0001969820247208828, + "loss": 1.6876, + "step": 1177 + }, + { + "epoch": 0.015307572732898775, + "grad_norm": 0.3707917034626007, + "learning_rate": 0.0001969794252589714, + "loss": 1.487, + "step": 1178 + }, + { + "epoch": 0.015320567276814648, + "grad_norm": 0.37704959511756897, + "learning_rate": 0.00019697682579706004, + "loss": 1.609, + "step": 1179 + }, + { + "epoch": 0.01533356182073052, + "grad_norm": 0.3373548090457916, + "learning_rate": 0.00019697422633514864, + "loss": 1.3733, + "step": 1180 + }, + { + "epoch": 0.015346556364646394, + "grad_norm": 0.3421562612056732, + "learning_rate": 0.00019697162687323723, + "loss": 1.5377, + "step": 1181 + }, + { + "epoch": 0.015359550908562267, + "grad_norm": 0.42313849925994873, + "learning_rate": 0.00019696902741132586, + "loss": 1.5373, + "step": 1182 + }, + { + "epoch": 0.015372545452478141, + "grad_norm": 0.3664761483669281, + "learning_rate": 0.00019696642794941448, + "loss": 1.5097, + "step": 1183 + }, + { + "epoch": 0.015385539996394014, + "grad_norm": 0.3784591257572174, + "learning_rate": 0.0001969638284875031, + "loss": 1.4573, + "step": 1184 + }, + { + "epoch": 0.015398534540309887, + "grad_norm": 0.36157453060150146, + "learning_rate": 0.0001969612290255917, + "loss": 1.5686, + "step": 1185 + }, + { + "epoch": 0.01541152908422576, + "grad_norm": 0.3741309642791748, + "learning_rate": 0.00019695862956368033, + "loss": 1.5118, + "step": 1186 + }, + { + "epoch": 0.015424523628141633, + "grad_norm": 0.3733425438404083, + "learning_rate": 0.00019695603010176895, + "loss": 1.4915, + "step": 1187 + }, + { + "epoch": 0.015437518172057508, + "grad_norm": 0.4223376214504242, + "learning_rate": 0.00019695343063985755, + "loss": 1.4806, + "step": 1188 + }, + { + "epoch": 0.01545051271597338, + "grad_norm": 0.4280458092689514, + "learning_rate": 0.00019695083117794618, + "loss": 1.5442, + "step": 1189 + }, + { + "epoch": 0.015463507259889254, + "grad_norm": 0.3798462152481079, + "learning_rate": 0.00019694823171603477, + "loss": 1.5422, + "step": 1190 + }, + { + "epoch": 0.015476501803805127, + "grad_norm": 0.22774185240268707, + "learning_rate": 0.00019694563225412342, + "loss": 1.2608, + "step": 1191 + }, + { + "epoch": 0.015489496347721001, + "grad_norm": 0.38605165481567383, + "learning_rate": 0.00019694303279221202, + "loss": 1.4247, + "step": 1192 + }, + { + "epoch": 0.015502490891636874, + "grad_norm": 0.4194793999195099, + "learning_rate": 0.00019694043333030065, + "loss": 1.508, + "step": 1193 + }, + { + "epoch": 0.015515485435552747, + "grad_norm": 0.3809635043144226, + "learning_rate": 0.00019693783386838924, + "loss": 1.6783, + "step": 1194 + }, + { + "epoch": 0.01552847997946862, + "grad_norm": 0.34799525141716003, + "learning_rate": 0.00019693523440647787, + "loss": 1.3097, + "step": 1195 + }, + { + "epoch": 0.015541474523384493, + "grad_norm": 0.30423760414123535, + "learning_rate": 0.0001969326349445665, + "loss": 1.215, + "step": 1196 + }, + { + "epoch": 0.015554469067300368, + "grad_norm": 0.3915634751319885, + "learning_rate": 0.0001969300354826551, + "loss": 1.5735, + "step": 1197 + }, + { + "epoch": 0.015567463611216241, + "grad_norm": 0.37613967061042786, + "learning_rate": 0.00019692743602074371, + "loss": 1.4058, + "step": 1198 + }, + { + "epoch": 0.015580458155132114, + "grad_norm": 0.40080973505973816, + "learning_rate": 0.00019692483655883234, + "loss": 1.5197, + "step": 1199 + }, + { + "epoch": 0.015593452699047987, + "grad_norm": 0.3446778357028961, + "learning_rate": 0.00019692223709692094, + "loss": 1.6041, + "step": 1200 + }, + { + "epoch": 0.01560644724296386, + "grad_norm": 0.366235613822937, + "learning_rate": 0.00019691963763500956, + "loss": 1.383, + "step": 1201 + }, + { + "epoch": 0.015619441786879735, + "grad_norm": 0.3766935467720032, + "learning_rate": 0.00019691703817309816, + "loss": 1.6052, + "step": 1202 + }, + { + "epoch": 0.015632436330795606, + "grad_norm": 0.3511297404766083, + "learning_rate": 0.0001969144387111868, + "loss": 1.3423, + "step": 1203 + }, + { + "epoch": 0.015645430874711482, + "grad_norm": 0.48626863956451416, + "learning_rate": 0.0001969118392492754, + "loss": 1.4677, + "step": 1204 + }, + { + "epoch": 0.015658425418627355, + "grad_norm": 0.34959498047828674, + "learning_rate": 0.00019690923978736403, + "loss": 1.3027, + "step": 1205 + }, + { + "epoch": 0.015671419962543228, + "grad_norm": 0.3932822644710541, + "learning_rate": 0.00019690664032545266, + "loss": 1.3391, + "step": 1206 + }, + { + "epoch": 0.0156844145064591, + "grad_norm": 0.21857501566410065, + "learning_rate": 0.00019690404086354125, + "loss": 1.1553, + "step": 1207 + }, + { + "epoch": 0.015697409050374974, + "grad_norm": 0.4064771831035614, + "learning_rate": 0.00019690144140162988, + "loss": 1.678, + "step": 1208 + }, + { + "epoch": 0.015710403594290847, + "grad_norm": 0.3586530387401581, + "learning_rate": 0.00019689884193971848, + "loss": 1.3805, + "step": 1209 + }, + { + "epoch": 0.01572339813820672, + "grad_norm": 0.3518249988555908, + "learning_rate": 0.00019689624247780713, + "loss": 1.2187, + "step": 1210 + }, + { + "epoch": 0.015736392682122593, + "grad_norm": 0.27563443779945374, + "learning_rate": 0.00019689364301589572, + "loss": 1.1782, + "step": 1211 + }, + { + "epoch": 0.015749387226038466, + "grad_norm": 0.3735935389995575, + "learning_rate": 0.00019689104355398432, + "loss": 1.5659, + "step": 1212 + }, + { + "epoch": 0.01576238176995434, + "grad_norm": 0.3346775472164154, + "learning_rate": 0.00019688844409207295, + "loss": 1.2095, + "step": 1213 + }, + { + "epoch": 0.015775376313870215, + "grad_norm": 0.3192962408065796, + "learning_rate": 0.00019688584463016157, + "loss": 1.2406, + "step": 1214 + }, + { + "epoch": 0.015788370857786088, + "grad_norm": 0.44287389516830444, + "learning_rate": 0.0001968832451682502, + "loss": 1.4406, + "step": 1215 + }, + { + "epoch": 0.01580136540170196, + "grad_norm": 0.2607724368572235, + "learning_rate": 0.0001968806457063388, + "loss": 1.6125, + "step": 1216 + }, + { + "epoch": 0.015814359945617834, + "grad_norm": 0.35675251483917236, + "learning_rate": 0.00019687804624442742, + "loss": 1.4747, + "step": 1217 + }, + { + "epoch": 0.015827354489533707, + "grad_norm": 0.45708343386650085, + "learning_rate": 0.00019687544678251604, + "loss": 1.4091, + "step": 1218 + }, + { + "epoch": 0.01584034903344958, + "grad_norm": 0.43029478192329407, + "learning_rate": 0.00019687284732060464, + "loss": 1.4042, + "step": 1219 + }, + { + "epoch": 0.015853343577365453, + "grad_norm": 0.3849797546863556, + "learning_rate": 0.00019687024785869326, + "loss": 1.4625, + "step": 1220 + }, + { + "epoch": 0.015866338121281326, + "grad_norm": 0.300675630569458, + "learning_rate": 0.00019686764839678186, + "loss": 1.4155, + "step": 1221 + }, + { + "epoch": 0.0158793326651972, + "grad_norm": 0.49140745401382446, + "learning_rate": 0.0001968650489348705, + "loss": 1.6745, + "step": 1222 + }, + { + "epoch": 0.015892327209113075, + "grad_norm": 0.4113255739212036, + "learning_rate": 0.0001968624494729591, + "loss": 1.4548, + "step": 1223 + }, + { + "epoch": 0.01590532175302895, + "grad_norm": 0.4348243474960327, + "learning_rate": 0.0001968598500110477, + "loss": 1.4758, + "step": 1224 + }, + { + "epoch": 0.01591831629694482, + "grad_norm": 0.47167786955833435, + "learning_rate": 0.00019685725054913633, + "loss": 1.5269, + "step": 1225 + }, + { + "epoch": 0.015931310840860694, + "grad_norm": 0.40507128834724426, + "learning_rate": 0.00019685465108722496, + "loss": 1.5768, + "step": 1226 + }, + { + "epoch": 0.015944305384776567, + "grad_norm": 0.34047117829322815, + "learning_rate": 0.00019685205162531358, + "loss": 1.5666, + "step": 1227 + }, + { + "epoch": 0.01595729992869244, + "grad_norm": 0.3125999867916107, + "learning_rate": 0.00019684945216340218, + "loss": 1.6384, + "step": 1228 + }, + { + "epoch": 0.015970294472608313, + "grad_norm": 0.41916677355766296, + "learning_rate": 0.0001968468527014908, + "loss": 1.4802, + "step": 1229 + }, + { + "epoch": 0.015983289016524186, + "grad_norm": 0.3740885853767395, + "learning_rate": 0.00019684425323957943, + "loss": 1.404, + "step": 1230 + }, + { + "epoch": 0.01599628356044006, + "grad_norm": 0.3712579607963562, + "learning_rate": 0.00019684165377766802, + "loss": 1.5274, + "step": 1231 + }, + { + "epoch": 0.016009278104355932, + "grad_norm": 0.4336252510547638, + "learning_rate": 0.00019683905431575665, + "loss": 1.5589, + "step": 1232 + }, + { + "epoch": 0.01602227264827181, + "grad_norm": 0.337321013212204, + "learning_rate": 0.00019683645485384525, + "loss": 1.2543, + "step": 1233 + }, + { + "epoch": 0.01603526719218768, + "grad_norm": 0.3992306590080261, + "learning_rate": 0.0001968338553919339, + "loss": 1.2617, + "step": 1234 + }, + { + "epoch": 0.016048261736103554, + "grad_norm": 0.33321884274482727, + "learning_rate": 0.0001968312559300225, + "loss": 1.5899, + "step": 1235 + }, + { + "epoch": 0.016061256280019427, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0001968286564681111, + "loss": 1.5195, + "step": 1236 + }, + { + "epoch": 0.0160742508239353, + "grad_norm": 0.3898663818836212, + "learning_rate": 0.00019682605700619972, + "loss": 1.7735, + "step": 1237 + }, + { + "epoch": 0.016087245367851173, + "grad_norm": 0.3646060526371002, + "learning_rate": 0.00019682345754428834, + "loss": 1.5273, + "step": 1238 + }, + { + "epoch": 0.016100239911767046, + "grad_norm": 0.43874886631965637, + "learning_rate": 0.00019682085808237697, + "loss": 1.5299, + "step": 1239 + }, + { + "epoch": 0.01611323445568292, + "grad_norm": 0.3499436378479004, + "learning_rate": 0.00019681825862046556, + "loss": 1.6245, + "step": 1240 + }, + { + "epoch": 0.016126228999598792, + "grad_norm": 0.475818395614624, + "learning_rate": 0.0001968156591585542, + "loss": 1.3729, + "step": 1241 + }, + { + "epoch": 0.01613922354351467, + "grad_norm": 0.3963949978351593, + "learning_rate": 0.0001968130596966428, + "loss": 1.3623, + "step": 1242 + }, + { + "epoch": 0.01615221808743054, + "grad_norm": 0.37632128596305847, + "learning_rate": 0.0001968104602347314, + "loss": 1.419, + "step": 1243 + }, + { + "epoch": 0.016165212631346414, + "grad_norm": 0.4357317090034485, + "learning_rate": 0.00019680786077282003, + "loss": 1.56, + "step": 1244 + }, + { + "epoch": 0.016178207175262287, + "grad_norm": 0.33209028840065, + "learning_rate": 0.00019680526131090866, + "loss": 1.3852, + "step": 1245 + }, + { + "epoch": 0.01619120171917816, + "grad_norm": 0.3596300482749939, + "learning_rate": 0.00019680266184899728, + "loss": 1.4985, + "step": 1246 + }, + { + "epoch": 0.016204196263094033, + "grad_norm": 0.3832753598690033, + "learning_rate": 0.00019680006238708588, + "loss": 1.4908, + "step": 1247 + }, + { + "epoch": 0.016217190807009906, + "grad_norm": 0.3142191767692566, + "learning_rate": 0.0001967974629251745, + "loss": 1.5067, + "step": 1248 + }, + { + "epoch": 0.01623018535092578, + "grad_norm": 0.36329564452171326, + "learning_rate": 0.00019679486346326313, + "loss": 1.4688, + "step": 1249 + }, + { + "epoch": 0.016243179894841652, + "grad_norm": 0.31218141317367554, + "learning_rate": 0.00019679226400135173, + "loss": 1.3132, + "step": 1250 + }, + { + "epoch": 0.016256174438757525, + "grad_norm": 0.4516143500804901, + "learning_rate": 0.00019678966453944035, + "loss": 1.5997, + "step": 1251 + }, + { + "epoch": 0.0162691689826734, + "grad_norm": 0.37211713194847107, + "learning_rate": 0.00019678706507752895, + "loss": 1.3798, + "step": 1252 + }, + { + "epoch": 0.016282163526589274, + "grad_norm": 0.4344126582145691, + "learning_rate": 0.00019678446561561757, + "loss": 1.5206, + "step": 1253 + }, + { + "epoch": 0.016295158070505147, + "grad_norm": 0.33741259574890137, + "learning_rate": 0.0001967818661537062, + "loss": 1.5059, + "step": 1254 + }, + { + "epoch": 0.01630815261442102, + "grad_norm": 0.4300345182418823, + "learning_rate": 0.0001967792666917948, + "loss": 1.624, + "step": 1255 + }, + { + "epoch": 0.016321147158336893, + "grad_norm": 0.33736440539360046, + "learning_rate": 0.00019677666722988342, + "loss": 1.4496, + "step": 1256 + }, + { + "epoch": 0.016334141702252766, + "grad_norm": 0.3926686644554138, + "learning_rate": 0.00019677406776797204, + "loss": 1.5301, + "step": 1257 + }, + { + "epoch": 0.01634713624616864, + "grad_norm": 0.426224023103714, + "learning_rate": 0.00019677146830606067, + "loss": 1.5169, + "step": 1258 + }, + { + "epoch": 0.016360130790084512, + "grad_norm": 0.4742068946361542, + "learning_rate": 0.00019676886884414927, + "loss": 1.5163, + "step": 1259 + }, + { + "epoch": 0.016373125334000385, + "grad_norm": 0.3262328505516052, + "learning_rate": 0.0001967662693822379, + "loss": 1.4143, + "step": 1260 + }, + { + "epoch": 0.01638611987791626, + "grad_norm": 0.2593839466571808, + "learning_rate": 0.00019676366992032651, + "loss": 1.23, + "step": 1261 + }, + { + "epoch": 0.016399114421832135, + "grad_norm": 0.38010913133621216, + "learning_rate": 0.0001967610704584151, + "loss": 1.6117, + "step": 1262 + }, + { + "epoch": 0.016412108965748008, + "grad_norm": 0.267661452293396, + "learning_rate": 0.00019675847099650374, + "loss": 1.315, + "step": 1263 + }, + { + "epoch": 0.01642510350966388, + "grad_norm": 0.3073880672454834, + "learning_rate": 0.00019675587153459233, + "loss": 1.4164, + "step": 1264 + }, + { + "epoch": 0.016438098053579753, + "grad_norm": 0.3029155135154724, + "learning_rate": 0.00019675327207268096, + "loss": 1.6044, + "step": 1265 + }, + { + "epoch": 0.016451092597495626, + "grad_norm": 0.31216323375701904, + "learning_rate": 0.00019675067261076958, + "loss": 1.4097, + "step": 1266 + }, + { + "epoch": 0.0164640871414115, + "grad_norm": 0.41255277395248413, + "learning_rate": 0.00019674807314885818, + "loss": 1.5876, + "step": 1267 + }, + { + "epoch": 0.016477081685327372, + "grad_norm": 0.3925461769104004, + "learning_rate": 0.0001967454736869468, + "loss": 1.4628, + "step": 1268 + }, + { + "epoch": 0.016490076229243245, + "grad_norm": 0.47023651003837585, + "learning_rate": 0.00019674287422503543, + "loss": 1.4258, + "step": 1269 + }, + { + "epoch": 0.016503070773159118, + "grad_norm": 0.3889276683330536, + "learning_rate": 0.00019674027476312405, + "loss": 1.4992, + "step": 1270 + }, + { + "epoch": 0.016516065317074995, + "grad_norm": 0.5152811408042908, + "learning_rate": 0.00019673767530121265, + "loss": 1.5095, + "step": 1271 + }, + { + "epoch": 0.016529059860990868, + "grad_norm": 0.3542300760746002, + "learning_rate": 0.00019673507583930128, + "loss": 1.5557, + "step": 1272 + }, + { + "epoch": 0.01654205440490674, + "grad_norm": 0.3644115626811981, + "learning_rate": 0.0001967324763773899, + "loss": 1.3361, + "step": 1273 + }, + { + "epoch": 0.016555048948822614, + "grad_norm": 0.4357520043849945, + "learning_rate": 0.0001967298769154785, + "loss": 1.4503, + "step": 1274 + }, + { + "epoch": 0.016568043492738486, + "grad_norm": 0.3494506776332855, + "learning_rate": 0.00019672727745356712, + "loss": 1.2671, + "step": 1275 + }, + { + "epoch": 0.01658103803665436, + "grad_norm": 0.39975765347480774, + "learning_rate": 0.00019672467799165572, + "loss": 1.5215, + "step": 1276 + }, + { + "epoch": 0.016594032580570232, + "grad_norm": 0.373329222202301, + "learning_rate": 0.00019672207852974437, + "loss": 1.4909, + "step": 1277 + }, + { + "epoch": 0.016607027124486105, + "grad_norm": 0.43159815669059753, + "learning_rate": 0.00019671947906783297, + "loss": 1.4468, + "step": 1278 + }, + { + "epoch": 0.01662002166840198, + "grad_norm": 0.41933301091194153, + "learning_rate": 0.00019671687960592157, + "loss": 1.3893, + "step": 1279 + }, + { + "epoch": 0.016633016212317855, + "grad_norm": 0.3944145739078522, + "learning_rate": 0.00019671428014401022, + "loss": 1.5484, + "step": 1280 + }, + { + "epoch": 0.016646010756233728, + "grad_norm": 0.38021165132522583, + "learning_rate": 0.00019671168068209881, + "loss": 1.4971, + "step": 1281 + }, + { + "epoch": 0.0166590053001496, + "grad_norm": 0.3604079782962799, + "learning_rate": 0.00019670908122018744, + "loss": 1.3135, + "step": 1282 + }, + { + "epoch": 0.016671999844065474, + "grad_norm": 0.2936398386955261, + "learning_rate": 0.00019670648175827604, + "loss": 1.3214, + "step": 1283 + }, + { + "epoch": 0.016684994387981347, + "grad_norm": 0.41944095492362976, + "learning_rate": 0.00019670388229636466, + "loss": 1.5769, + "step": 1284 + }, + { + "epoch": 0.01669798893189722, + "grad_norm": 0.3244532346725464, + "learning_rate": 0.00019670128283445329, + "loss": 1.4632, + "step": 1285 + }, + { + "epoch": 0.016710983475813092, + "grad_norm": 0.4025017023086548, + "learning_rate": 0.00019669868337254188, + "loss": 1.2391, + "step": 1286 + }, + { + "epoch": 0.016723978019728965, + "grad_norm": 0.2932943105697632, + "learning_rate": 0.0001966960839106305, + "loss": 1.2617, + "step": 1287 + }, + { + "epoch": 0.01673697256364484, + "grad_norm": 0.33363422751426697, + "learning_rate": 0.00019669348444871913, + "loss": 1.3775, + "step": 1288 + }, + { + "epoch": 0.01674996710756071, + "grad_norm": 0.3420233428478241, + "learning_rate": 0.00019669088498680776, + "loss": 1.4534, + "step": 1289 + }, + { + "epoch": 0.016762961651476588, + "grad_norm": 0.3920753002166748, + "learning_rate": 0.00019668828552489635, + "loss": 1.4119, + "step": 1290 + }, + { + "epoch": 0.01677595619539246, + "grad_norm": 0.1987222135066986, + "learning_rate": 0.00019668568606298495, + "loss": 1.4032, + "step": 1291 + }, + { + "epoch": 0.016788950739308334, + "grad_norm": 0.431986927986145, + "learning_rate": 0.0001966830866010736, + "loss": 1.4481, + "step": 1292 + }, + { + "epoch": 0.016801945283224207, + "grad_norm": 0.36777424812316895, + "learning_rate": 0.0001966804871391622, + "loss": 1.4234, + "step": 1293 + }, + { + "epoch": 0.01681493982714008, + "grad_norm": 0.41752204298973083, + "learning_rate": 0.00019667788767725082, + "loss": 1.4181, + "step": 1294 + }, + { + "epoch": 0.016827934371055953, + "grad_norm": 0.36617961525917053, + "learning_rate": 0.00019667528821533942, + "loss": 1.4269, + "step": 1295 + }, + { + "epoch": 0.016840928914971826, + "grad_norm": 0.35011550784111023, + "learning_rate": 0.00019667268875342805, + "loss": 1.3349, + "step": 1296 + }, + { + "epoch": 0.0168539234588877, + "grad_norm": 0.46133318543434143, + "learning_rate": 0.00019667008929151667, + "loss": 1.6471, + "step": 1297 + }, + { + "epoch": 0.01686691800280357, + "grad_norm": 0.31067320704460144, + "learning_rate": 0.00019666748982960527, + "loss": 1.4683, + "step": 1298 + }, + { + "epoch": 0.016879912546719448, + "grad_norm": 0.37132853269577026, + "learning_rate": 0.0001966648903676939, + "loss": 1.4418, + "step": 1299 + }, + { + "epoch": 0.01689290709063532, + "grad_norm": 0.4115432798862457, + "learning_rate": 0.00019666229090578252, + "loss": 1.473, + "step": 1300 + }, + { + "epoch": 0.016905901634551194, + "grad_norm": 0.38667991757392883, + "learning_rate": 0.00019665969144387114, + "loss": 1.8367, + "step": 1301 + }, + { + "epoch": 0.016918896178467067, + "grad_norm": 0.3053534924983978, + "learning_rate": 0.00019665709198195974, + "loss": 1.6643, + "step": 1302 + }, + { + "epoch": 0.01693189072238294, + "grad_norm": 0.47252416610717773, + "learning_rate": 0.00019665449252004834, + "loss": 1.4148, + "step": 1303 + }, + { + "epoch": 0.016944885266298813, + "grad_norm": 0.3782711327075958, + "learning_rate": 0.000196651893058137, + "loss": 1.5064, + "step": 1304 + }, + { + "epoch": 0.016957879810214686, + "grad_norm": 0.3270961344242096, + "learning_rate": 0.00019664929359622559, + "loss": 1.5107, + "step": 1305 + }, + { + "epoch": 0.01697087435413056, + "grad_norm": 0.46320730447769165, + "learning_rate": 0.0001966466941343142, + "loss": 1.5103, + "step": 1306 + }, + { + "epoch": 0.01698386889804643, + "grad_norm": 0.4325108230113983, + "learning_rate": 0.0001966440946724028, + "loss": 1.4994, + "step": 1307 + }, + { + "epoch": 0.016996863441962305, + "grad_norm": 0.4188390076160431, + "learning_rate": 0.00019664149521049143, + "loss": 1.6011, + "step": 1308 + }, + { + "epoch": 0.01700985798587818, + "grad_norm": 0.3909618854522705, + "learning_rate": 0.00019663889574858006, + "loss": 1.3284, + "step": 1309 + }, + { + "epoch": 0.017022852529794054, + "grad_norm": 0.33980676531791687, + "learning_rate": 0.00019663629628666865, + "loss": 1.4136, + "step": 1310 + }, + { + "epoch": 0.017035847073709927, + "grad_norm": 0.4157145917415619, + "learning_rate": 0.00019663369682475728, + "loss": 1.4706, + "step": 1311 + }, + { + "epoch": 0.0170488416176258, + "grad_norm": 0.38978928327560425, + "learning_rate": 0.0001966310973628459, + "loss": 1.4535, + "step": 1312 + }, + { + "epoch": 0.017061836161541673, + "grad_norm": 0.3986358642578125, + "learning_rate": 0.00019662849790093453, + "loss": 1.4836, + "step": 1313 + }, + { + "epoch": 0.017074830705457546, + "grad_norm": 0.35495102405548096, + "learning_rate": 0.00019662589843902312, + "loss": 1.468, + "step": 1314 + }, + { + "epoch": 0.01708782524937342, + "grad_norm": 0.3181716501712799, + "learning_rate": 0.00019662329897711175, + "loss": 1.4632, + "step": 1315 + }, + { + "epoch": 0.01710081979328929, + "grad_norm": 0.28140318393707275, + "learning_rate": 0.00019662069951520037, + "loss": 1.3824, + "step": 1316 + }, + { + "epoch": 0.017113814337205165, + "grad_norm": 0.47010791301727295, + "learning_rate": 0.00019661810005328897, + "loss": 1.5767, + "step": 1317 + }, + { + "epoch": 0.01712680888112104, + "grad_norm": 0.33159905672073364, + "learning_rate": 0.0001966155005913776, + "loss": 1.547, + "step": 1318 + }, + { + "epoch": 0.017139803425036914, + "grad_norm": 0.39347562193870544, + "learning_rate": 0.00019661290112946622, + "loss": 1.4111, + "step": 1319 + }, + { + "epoch": 0.017152797968952787, + "grad_norm": 0.38357362151145935, + "learning_rate": 0.00019661030166755482, + "loss": 1.5499, + "step": 1320 + }, + { + "epoch": 0.01716579251286866, + "grad_norm": 0.4207701086997986, + "learning_rate": 0.00019660770220564344, + "loss": 1.6153, + "step": 1321 + }, + { + "epoch": 0.017178787056784533, + "grad_norm": 0.3719491958618164, + "learning_rate": 0.00019660510274373204, + "loss": 1.323, + "step": 1322 + }, + { + "epoch": 0.017191781600700406, + "grad_norm": 0.3435841500759125, + "learning_rate": 0.0001966025032818207, + "loss": 1.5274, + "step": 1323 + }, + { + "epoch": 0.01720477614461628, + "grad_norm": 0.389879435300827, + "learning_rate": 0.0001965999038199093, + "loss": 1.3968, + "step": 1324 + }, + { + "epoch": 0.01721777068853215, + "grad_norm": 0.42259082198143005, + "learning_rate": 0.0001965973043579979, + "loss": 1.5433, + "step": 1325 + }, + { + "epoch": 0.017230765232448025, + "grad_norm": 0.35753822326660156, + "learning_rate": 0.0001965947048960865, + "loss": 1.709, + "step": 1326 + }, + { + "epoch": 0.017243759776363898, + "grad_norm": 0.26413440704345703, + "learning_rate": 0.00019659210543417513, + "loss": 1.378, + "step": 1327 + }, + { + "epoch": 0.017256754320279774, + "grad_norm": 0.42384201288223267, + "learning_rate": 0.00019658950597226376, + "loss": 1.4592, + "step": 1328 + }, + { + "epoch": 0.017269748864195647, + "grad_norm": 0.7451703548431396, + "learning_rate": 0.00019658690651035236, + "loss": 1.6224, + "step": 1329 + }, + { + "epoch": 0.01728274340811152, + "grad_norm": 0.32426488399505615, + "learning_rate": 0.00019658430704844098, + "loss": 1.2088, + "step": 1330 + }, + { + "epoch": 0.017295737952027393, + "grad_norm": 0.5340223908424377, + "learning_rate": 0.0001965817075865296, + "loss": 1.4398, + "step": 1331 + }, + { + "epoch": 0.017308732495943266, + "grad_norm": 0.3323982357978821, + "learning_rate": 0.00019657910812461823, + "loss": 1.5148, + "step": 1332 + }, + { + "epoch": 0.01732172703985914, + "grad_norm": 0.511139988899231, + "learning_rate": 0.00019657650866270683, + "loss": 1.4744, + "step": 1333 + }, + { + "epoch": 0.017334721583775012, + "grad_norm": 0.3836911618709564, + "learning_rate": 0.00019657390920079542, + "loss": 1.5983, + "step": 1334 + }, + { + "epoch": 0.017347716127690885, + "grad_norm": 0.3472544252872467, + "learning_rate": 0.00019657130973888408, + "loss": 1.3834, + "step": 1335 + }, + { + "epoch": 0.017360710671606758, + "grad_norm": 0.28959575295448303, + "learning_rate": 0.00019656871027697267, + "loss": 1.2538, + "step": 1336 + }, + { + "epoch": 0.017373705215522634, + "grad_norm": 0.4705945551395416, + "learning_rate": 0.0001965661108150613, + "loss": 1.4277, + "step": 1337 + }, + { + "epoch": 0.017386699759438507, + "grad_norm": 0.44887563586235046, + "learning_rate": 0.0001965635113531499, + "loss": 1.6429, + "step": 1338 + }, + { + "epoch": 0.01739969430335438, + "grad_norm": 0.39057818055152893, + "learning_rate": 0.00019656091189123852, + "loss": 1.4686, + "step": 1339 + }, + { + "epoch": 0.017412688847270253, + "grad_norm": 0.39304253458976746, + "learning_rate": 0.00019655831242932714, + "loss": 1.4984, + "step": 1340 + }, + { + "epoch": 0.017425683391186126, + "grad_norm": 0.44281622767448425, + "learning_rate": 0.00019655571296741574, + "loss": 1.5449, + "step": 1341 + }, + { + "epoch": 0.017438677935102, + "grad_norm": 0.38267067074775696, + "learning_rate": 0.00019655311350550437, + "loss": 1.4291, + "step": 1342 + }, + { + "epoch": 0.017451672479017872, + "grad_norm": 0.31228458881378174, + "learning_rate": 0.000196550514043593, + "loss": 1.544, + "step": 1343 + }, + { + "epoch": 0.017464667022933745, + "grad_norm": 0.3796466886997223, + "learning_rate": 0.00019654791458168162, + "loss": 1.5666, + "step": 1344 + }, + { + "epoch": 0.017477661566849618, + "grad_norm": 0.2629081904888153, + "learning_rate": 0.0001965453151197702, + "loss": 1.4495, + "step": 1345 + }, + { + "epoch": 0.01749065611076549, + "grad_norm": 0.44984155893325806, + "learning_rate": 0.0001965427156578588, + "loss": 1.6004, + "step": 1346 + }, + { + "epoch": 0.017503650654681367, + "grad_norm": 0.3668937385082245, + "learning_rate": 0.00019654011619594746, + "loss": 1.5419, + "step": 1347 + }, + { + "epoch": 0.01751664519859724, + "grad_norm": 0.41513144969940186, + "learning_rate": 0.00019653751673403606, + "loss": 1.5247, + "step": 1348 + }, + { + "epoch": 0.017529639742513113, + "grad_norm": 0.36529871821403503, + "learning_rate": 0.00019653491727212468, + "loss": 1.3889, + "step": 1349 + }, + { + "epoch": 0.017542634286428986, + "grad_norm": 0.272764652967453, + "learning_rate": 0.00019653231781021328, + "loss": 1.3127, + "step": 1350 + }, + { + "epoch": 0.01755562883034486, + "grad_norm": 0.4416327178478241, + "learning_rate": 0.0001965297183483019, + "loss": 1.4581, + "step": 1351 + }, + { + "epoch": 0.017568623374260732, + "grad_norm": 0.3854648470878601, + "learning_rate": 0.00019652711888639053, + "loss": 1.4115, + "step": 1352 + }, + { + "epoch": 0.017581617918176605, + "grad_norm": 0.3894107937812805, + "learning_rate": 0.00019652451942447913, + "loss": 1.5245, + "step": 1353 + }, + { + "epoch": 0.017594612462092478, + "grad_norm": 0.3270106613636017, + "learning_rate": 0.00019652191996256778, + "loss": 1.377, + "step": 1354 + }, + { + "epoch": 0.01760760700600835, + "grad_norm": 0.35854044556617737, + "learning_rate": 0.00019651932050065638, + "loss": 1.3939, + "step": 1355 + }, + { + "epoch": 0.017620601549924224, + "grad_norm": 0.391289621591568, + "learning_rate": 0.000196516721038745, + "loss": 1.4741, + "step": 1356 + }, + { + "epoch": 0.0176335960938401, + "grad_norm": 0.4587692618370056, + "learning_rate": 0.0001965141215768336, + "loss": 1.6093, + "step": 1357 + }, + { + "epoch": 0.017646590637755973, + "grad_norm": 0.5379094481468201, + "learning_rate": 0.00019651152211492222, + "loss": 1.526, + "step": 1358 + }, + { + "epoch": 0.017659585181671846, + "grad_norm": 0.37917113304138184, + "learning_rate": 0.00019650892265301085, + "loss": 1.44, + "step": 1359 + }, + { + "epoch": 0.01767257972558772, + "grad_norm": 0.48548614978790283, + "learning_rate": 0.00019650632319109944, + "loss": 1.5297, + "step": 1360 + }, + { + "epoch": 0.017685574269503592, + "grad_norm": 0.3773084878921509, + "learning_rate": 0.00019650372372918807, + "loss": 1.3303, + "step": 1361 + }, + { + "epoch": 0.017698568813419465, + "grad_norm": 0.30313122272491455, + "learning_rate": 0.0001965011242672767, + "loss": 1.3955, + "step": 1362 + }, + { + "epoch": 0.017711563357335338, + "grad_norm": 0.3642292320728302, + "learning_rate": 0.0001964985248053653, + "loss": 1.4358, + "step": 1363 + }, + { + "epoch": 0.01772455790125121, + "grad_norm": 0.34993597865104675, + "learning_rate": 0.00019649592534345392, + "loss": 1.4424, + "step": 1364 + }, + { + "epoch": 0.017737552445167084, + "grad_norm": 0.38262784481048584, + "learning_rate": 0.0001964933258815425, + "loss": 1.4008, + "step": 1365 + }, + { + "epoch": 0.01775054698908296, + "grad_norm": 0.36598148941993713, + "learning_rate": 0.00019649072641963116, + "loss": 1.3502, + "step": 1366 + }, + { + "epoch": 0.017763541532998833, + "grad_norm": 0.32614684104919434, + "learning_rate": 0.00019648812695771976, + "loss": 1.2557, + "step": 1367 + }, + { + "epoch": 0.017776536076914706, + "grad_norm": 0.2726497948169708, + "learning_rate": 0.00019648552749580839, + "loss": 1.4819, + "step": 1368 + }, + { + "epoch": 0.01778953062083058, + "grad_norm": 0.41167518496513367, + "learning_rate": 0.00019648292803389698, + "loss": 1.5286, + "step": 1369 + }, + { + "epoch": 0.017802525164746452, + "grad_norm": 0.42786210775375366, + "learning_rate": 0.0001964803285719856, + "loss": 1.4808, + "step": 1370 + }, + { + "epoch": 0.017815519708662325, + "grad_norm": 0.3530051112174988, + "learning_rate": 0.00019647772911007423, + "loss": 1.2793, + "step": 1371 + }, + { + "epoch": 0.017828514252578198, + "grad_norm": 0.6079438924789429, + "learning_rate": 0.00019647512964816283, + "loss": 1.5808, + "step": 1372 + }, + { + "epoch": 0.01784150879649407, + "grad_norm": 0.3473976254463196, + "learning_rate": 0.00019647253018625145, + "loss": 1.4315, + "step": 1373 + }, + { + "epoch": 0.017854503340409944, + "grad_norm": 0.3194151222705841, + "learning_rate": 0.00019646993072434008, + "loss": 1.5639, + "step": 1374 + }, + { + "epoch": 0.017867497884325817, + "grad_norm": 0.3278243839740753, + "learning_rate": 0.00019646733126242868, + "loss": 1.5066, + "step": 1375 + }, + { + "epoch": 0.017880492428241693, + "grad_norm": 0.4241548478603363, + "learning_rate": 0.0001964647318005173, + "loss": 1.5062, + "step": 1376 + }, + { + "epoch": 0.017893486972157566, + "grad_norm": 0.40428072214126587, + "learning_rate": 0.0001964621323386059, + "loss": 1.5999, + "step": 1377 + }, + { + "epoch": 0.01790648151607344, + "grad_norm": 0.402030348777771, + "learning_rate": 0.00019645953287669455, + "loss": 1.5519, + "step": 1378 + }, + { + "epoch": 0.017919476059989312, + "grad_norm": 0.33562710881233215, + "learning_rate": 0.00019645693341478315, + "loss": 1.3547, + "step": 1379 + }, + { + "epoch": 0.017932470603905185, + "grad_norm": 0.37471288442611694, + "learning_rate": 0.00019645433395287177, + "loss": 1.5516, + "step": 1380 + }, + { + "epoch": 0.017945465147821058, + "grad_norm": 0.45008721947669983, + "learning_rate": 0.00019645173449096037, + "loss": 1.4721, + "step": 1381 + }, + { + "epoch": 0.01795845969173693, + "grad_norm": 0.3916715979576111, + "learning_rate": 0.000196449135029049, + "loss": 1.3908, + "step": 1382 + }, + { + "epoch": 0.017971454235652804, + "grad_norm": 0.364383727312088, + "learning_rate": 0.00019644653556713762, + "loss": 1.59, + "step": 1383 + }, + { + "epoch": 0.017984448779568677, + "grad_norm": 0.41933467984199524, + "learning_rate": 0.00019644393610522622, + "loss": 1.6209, + "step": 1384 + }, + { + "epoch": 0.017997443323484554, + "grad_norm": 0.37265875935554504, + "learning_rate": 0.00019644133664331484, + "loss": 1.4868, + "step": 1385 + }, + { + "epoch": 0.018010437867400426, + "grad_norm": 0.3947279751300812, + "learning_rate": 0.00019643873718140346, + "loss": 1.3463, + "step": 1386 + }, + { + "epoch": 0.0180234324113163, + "grad_norm": 0.38589316606521606, + "learning_rate": 0.00019643613771949206, + "loss": 1.416, + "step": 1387 + }, + { + "epoch": 0.018036426955232172, + "grad_norm": 0.3667202889919281, + "learning_rate": 0.00019643353825758069, + "loss": 1.468, + "step": 1388 + }, + { + "epoch": 0.018049421499148045, + "grad_norm": 0.47952884435653687, + "learning_rate": 0.00019643093879566928, + "loss": 1.5815, + "step": 1389 + }, + { + "epoch": 0.01806241604306392, + "grad_norm": 0.4453055262565613, + "learning_rate": 0.00019642833933375793, + "loss": 1.601, + "step": 1390 + }, + { + "epoch": 0.01807541058697979, + "grad_norm": 0.36778724193573, + "learning_rate": 0.00019642573987184653, + "loss": 1.4059, + "step": 1391 + }, + { + "epoch": 0.018088405130895664, + "grad_norm": 0.3312399685382843, + "learning_rate": 0.00019642314040993516, + "loss": 1.6006, + "step": 1392 + }, + { + "epoch": 0.018101399674811537, + "grad_norm": 0.29925376176834106, + "learning_rate": 0.00019642054094802378, + "loss": 1.3347, + "step": 1393 + }, + { + "epoch": 0.01811439421872741, + "grad_norm": 0.32471317052841187, + "learning_rate": 0.00019641794148611238, + "loss": 1.6194, + "step": 1394 + }, + { + "epoch": 0.018127388762643287, + "grad_norm": 0.3405779004096985, + "learning_rate": 0.000196415342024201, + "loss": 1.2661, + "step": 1395 + }, + { + "epoch": 0.01814038330655916, + "grad_norm": 0.34910064935684204, + "learning_rate": 0.0001964127425622896, + "loss": 1.4643, + "step": 1396 + }, + { + "epoch": 0.018153377850475032, + "grad_norm": 0.3806057572364807, + "learning_rate": 0.00019641014310037825, + "loss": 1.4939, + "step": 1397 + }, + { + "epoch": 0.018166372394390905, + "grad_norm": 0.3816677927970886, + "learning_rate": 0.00019640754363846685, + "loss": 1.486, + "step": 1398 + }, + { + "epoch": 0.01817936693830678, + "grad_norm": 0.38400983810424805, + "learning_rate": 0.00019640494417655547, + "loss": 1.6257, + "step": 1399 + }, + { + "epoch": 0.01819236148222265, + "grad_norm": 0.4322017431259155, + "learning_rate": 0.00019640234471464407, + "loss": 1.6245, + "step": 1400 + }, + { + "epoch": 0.018205356026138524, + "grad_norm": 0.3365495204925537, + "learning_rate": 0.0001963997452527327, + "loss": 1.4112, + "step": 1401 + }, + { + "epoch": 0.018218350570054397, + "grad_norm": 0.32547080516815186, + "learning_rate": 0.00019639714579082132, + "loss": 1.5964, + "step": 1402 + }, + { + "epoch": 0.01823134511397027, + "grad_norm": 0.32175591588020325, + "learning_rate": 0.00019639454632890992, + "loss": 1.5287, + "step": 1403 + }, + { + "epoch": 0.018244339657886147, + "grad_norm": 0.5055533051490784, + "learning_rate": 0.00019639194686699854, + "loss": 1.5169, + "step": 1404 + }, + { + "epoch": 0.01825733420180202, + "grad_norm": 0.43116527795791626, + "learning_rate": 0.00019638934740508717, + "loss": 1.6879, + "step": 1405 + }, + { + "epoch": 0.018270328745717893, + "grad_norm": 0.44412800669670105, + "learning_rate": 0.00019638674794317576, + "loss": 1.5493, + "step": 1406 + }, + { + "epoch": 0.018283323289633766, + "grad_norm": 0.3559592664241791, + "learning_rate": 0.0001963841484812644, + "loss": 1.431, + "step": 1407 + }, + { + "epoch": 0.01829631783354964, + "grad_norm": 0.28410786390304565, + "learning_rate": 0.00019638154901935299, + "loss": 1.2386, + "step": 1408 + }, + { + "epoch": 0.01830931237746551, + "grad_norm": 0.3983698785305023, + "learning_rate": 0.00019637894955744164, + "loss": 1.4429, + "step": 1409 + }, + { + "epoch": 0.018322306921381384, + "grad_norm": 0.44524967670440674, + "learning_rate": 0.00019637635009553023, + "loss": 1.4997, + "step": 1410 + }, + { + "epoch": 0.018335301465297257, + "grad_norm": 0.41650405526161194, + "learning_rate": 0.00019637375063361886, + "loss": 1.4228, + "step": 1411 + }, + { + "epoch": 0.01834829600921313, + "grad_norm": 0.33032137155532837, + "learning_rate": 0.00019637115117170746, + "loss": 1.3543, + "step": 1412 + }, + { + "epoch": 0.018361290553129003, + "grad_norm": 0.37099602818489075, + "learning_rate": 0.00019636855170979608, + "loss": 1.4397, + "step": 1413 + }, + { + "epoch": 0.01837428509704488, + "grad_norm": 0.35451018810272217, + "learning_rate": 0.0001963659522478847, + "loss": 1.4941, + "step": 1414 + }, + { + "epoch": 0.018387279640960753, + "grad_norm": 0.39484304189682007, + "learning_rate": 0.0001963633527859733, + "loss": 1.4442, + "step": 1415 + }, + { + "epoch": 0.018400274184876626, + "grad_norm": 0.3444206118583679, + "learning_rate": 0.00019636075332406193, + "loss": 1.2892, + "step": 1416 + }, + { + "epoch": 0.0184132687287925, + "grad_norm": 0.39459744095802307, + "learning_rate": 0.00019635815386215055, + "loss": 1.3448, + "step": 1417 + }, + { + "epoch": 0.01842626327270837, + "grad_norm": 0.43979862332344055, + "learning_rate": 0.00019635555440023915, + "loss": 1.4319, + "step": 1418 + }, + { + "epoch": 0.018439257816624244, + "grad_norm": 0.37974467873573303, + "learning_rate": 0.00019635295493832777, + "loss": 1.408, + "step": 1419 + }, + { + "epoch": 0.018452252360540117, + "grad_norm": 0.4778790771961212, + "learning_rate": 0.00019635035547641637, + "loss": 1.5885, + "step": 1420 + }, + { + "epoch": 0.01846524690445599, + "grad_norm": 0.4614560604095459, + "learning_rate": 0.00019634775601450502, + "loss": 1.4592, + "step": 1421 + }, + { + "epoch": 0.018478241448371863, + "grad_norm": 0.3092544674873352, + "learning_rate": 0.00019634515655259362, + "loss": 1.4532, + "step": 1422 + }, + { + "epoch": 0.01849123599228774, + "grad_norm": 0.4220763146877289, + "learning_rate": 0.00019634255709068224, + "loss": 1.4722, + "step": 1423 + }, + { + "epoch": 0.018504230536203613, + "grad_norm": 0.3420048952102661, + "learning_rate": 0.00019633995762877084, + "loss": 1.6317, + "step": 1424 + }, + { + "epoch": 0.018517225080119486, + "grad_norm": 0.3050045371055603, + "learning_rate": 0.00019633735816685947, + "loss": 1.3881, + "step": 1425 + }, + { + "epoch": 0.01853021962403536, + "grad_norm": 0.34365832805633545, + "learning_rate": 0.0001963347587049481, + "loss": 1.4057, + "step": 1426 + }, + { + "epoch": 0.01854321416795123, + "grad_norm": 0.6004374027252197, + "learning_rate": 0.0001963321592430367, + "loss": 1.5394, + "step": 1427 + }, + { + "epoch": 0.018556208711867105, + "grad_norm": 0.318081259727478, + "learning_rate": 0.00019632955978112534, + "loss": 1.5718, + "step": 1428 + }, + { + "epoch": 0.018569203255782978, + "grad_norm": 0.33333227038383484, + "learning_rate": 0.00019632696031921394, + "loss": 1.2994, + "step": 1429 + }, + { + "epoch": 0.01858219779969885, + "grad_norm": 0.4516380727291107, + "learning_rate": 0.00019632436085730253, + "loss": 1.5457, + "step": 1430 + }, + { + "epoch": 0.018595192343614723, + "grad_norm": 0.4420582056045532, + "learning_rate": 0.00019632176139539116, + "loss": 1.3913, + "step": 1431 + }, + { + "epoch": 0.018608186887530596, + "grad_norm": 0.3033845126628876, + "learning_rate": 0.00019631916193347978, + "loss": 1.477, + "step": 1432 + }, + { + "epoch": 0.018621181431446473, + "grad_norm": 0.4648483693599701, + "learning_rate": 0.0001963165624715684, + "loss": 1.6043, + "step": 1433 + }, + { + "epoch": 0.018634175975362346, + "grad_norm": 0.28452369570732117, + "learning_rate": 0.000196313963009657, + "loss": 1.1945, + "step": 1434 + }, + { + "epoch": 0.01864717051927822, + "grad_norm": 0.3563484251499176, + "learning_rate": 0.00019631136354774563, + "loss": 1.5169, + "step": 1435 + }, + { + "epoch": 0.01866016506319409, + "grad_norm": 0.44836705923080444, + "learning_rate": 0.00019630876408583425, + "loss": 1.5318, + "step": 1436 + }, + { + "epoch": 0.018673159607109965, + "grad_norm": 0.3512181341648102, + "learning_rate": 0.00019630616462392285, + "loss": 1.4591, + "step": 1437 + }, + { + "epoch": 0.018686154151025838, + "grad_norm": 0.3802800476551056, + "learning_rate": 0.00019630356516201148, + "loss": 1.681, + "step": 1438 + }, + { + "epoch": 0.01869914869494171, + "grad_norm": 0.3871404826641083, + "learning_rate": 0.00019630096570010007, + "loss": 1.315, + "step": 1439 + }, + { + "epoch": 0.018712143238857584, + "grad_norm": 0.4375351369380951, + "learning_rate": 0.00019629836623818873, + "loss": 1.5924, + "step": 1440 + }, + { + "epoch": 0.018725137782773457, + "grad_norm": 0.3527708649635315, + "learning_rate": 0.00019629576677627732, + "loss": 1.519, + "step": 1441 + }, + { + "epoch": 0.018738132326689333, + "grad_norm": 0.48671025037765503, + "learning_rate": 0.00019629316731436592, + "loss": 1.4259, + "step": 1442 + }, + { + "epoch": 0.018751126870605206, + "grad_norm": 0.3766919672489166, + "learning_rate": 0.00019629056785245454, + "loss": 1.5339, + "step": 1443 + }, + { + "epoch": 0.01876412141452108, + "grad_norm": 0.40276792645454407, + "learning_rate": 0.00019628796839054317, + "loss": 1.4726, + "step": 1444 + }, + { + "epoch": 0.018777115958436952, + "grad_norm": 0.3981993496417999, + "learning_rate": 0.0001962853689286318, + "loss": 1.6222, + "step": 1445 + }, + { + "epoch": 0.018790110502352825, + "grad_norm": 0.41286009550094604, + "learning_rate": 0.0001962827694667204, + "loss": 1.4731, + "step": 1446 + }, + { + "epoch": 0.018803105046268698, + "grad_norm": 0.3539413809776306, + "learning_rate": 0.00019628017000480902, + "loss": 1.619, + "step": 1447 + }, + { + "epoch": 0.01881609959018457, + "grad_norm": 0.33750399947166443, + "learning_rate": 0.00019627757054289764, + "loss": 1.3662, + "step": 1448 + }, + { + "epoch": 0.018829094134100444, + "grad_norm": 0.3177066445350647, + "learning_rate": 0.00019627497108098624, + "loss": 1.1867, + "step": 1449 + }, + { + "epoch": 0.018842088678016317, + "grad_norm": 0.39732736349105835, + "learning_rate": 0.00019627237161907486, + "loss": 1.4744, + "step": 1450 + }, + { + "epoch": 0.01885508322193219, + "grad_norm": 0.41208726167678833, + "learning_rate": 0.00019626977215716346, + "loss": 1.4427, + "step": 1451 + }, + { + "epoch": 0.018868077765848066, + "grad_norm": 0.34757867455482483, + "learning_rate": 0.0001962671726952521, + "loss": 1.3379, + "step": 1452 + }, + { + "epoch": 0.01888107230976394, + "grad_norm": 0.37493589520454407, + "learning_rate": 0.0001962645732333407, + "loss": 1.5556, + "step": 1453 + }, + { + "epoch": 0.018894066853679812, + "grad_norm": 0.311355322599411, + "learning_rate": 0.00019626197377142933, + "loss": 1.1438, + "step": 1454 + }, + { + "epoch": 0.018907061397595685, + "grad_norm": 0.3531663119792938, + "learning_rate": 0.00019625937430951793, + "loss": 1.399, + "step": 1455 + }, + { + "epoch": 0.018920055941511558, + "grad_norm": 0.36858677864074707, + "learning_rate": 0.00019625677484760655, + "loss": 1.4933, + "step": 1456 + }, + { + "epoch": 0.01893305048542743, + "grad_norm": 0.4147689938545227, + "learning_rate": 0.00019625417538569518, + "loss": 1.424, + "step": 1457 + }, + { + "epoch": 0.018946045029343304, + "grad_norm": 0.3704078495502472, + "learning_rate": 0.00019625157592378378, + "loss": 1.4857, + "step": 1458 + }, + { + "epoch": 0.018959039573259177, + "grad_norm": 0.38175466656684875, + "learning_rate": 0.0001962489764618724, + "loss": 1.4635, + "step": 1459 + }, + { + "epoch": 0.01897203411717505, + "grad_norm": 0.4142589569091797, + "learning_rate": 0.00019624637699996103, + "loss": 1.3575, + "step": 1460 + }, + { + "epoch": 0.018985028661090926, + "grad_norm": 0.37956202030181885, + "learning_rate": 0.00019624377753804962, + "loss": 1.3203, + "step": 1461 + }, + { + "epoch": 0.0189980232050068, + "grad_norm": 0.34043657779693604, + "learning_rate": 0.00019624117807613825, + "loss": 1.6161, + "step": 1462 + }, + { + "epoch": 0.019011017748922672, + "grad_norm": 0.4602002203464508, + "learning_rate": 0.00019623857861422684, + "loss": 1.4248, + "step": 1463 + }, + { + "epoch": 0.019024012292838545, + "grad_norm": 0.39603501558303833, + "learning_rate": 0.0001962359791523155, + "loss": 1.6715, + "step": 1464 + }, + { + "epoch": 0.019037006836754418, + "grad_norm": 0.357126384973526, + "learning_rate": 0.0001962333796904041, + "loss": 1.6395, + "step": 1465 + }, + { + "epoch": 0.01905000138067029, + "grad_norm": 0.3500949442386627, + "learning_rate": 0.00019623078022849272, + "loss": 1.3647, + "step": 1466 + }, + { + "epoch": 0.019062995924586164, + "grad_norm": 0.36586612462997437, + "learning_rate": 0.00019622818076658134, + "loss": 1.3783, + "step": 1467 + }, + { + "epoch": 0.019075990468502037, + "grad_norm": 0.4651097357273102, + "learning_rate": 0.00019622558130466994, + "loss": 1.5214, + "step": 1468 + }, + { + "epoch": 0.01908898501241791, + "grad_norm": 0.5161360502243042, + "learning_rate": 0.00019622298184275856, + "loss": 1.5843, + "step": 1469 + }, + { + "epoch": 0.019101979556333783, + "grad_norm": 0.31639549136161804, + "learning_rate": 0.00019622038238084716, + "loss": 1.2787, + "step": 1470 + }, + { + "epoch": 0.01911497410024966, + "grad_norm": 0.3406965434551239, + "learning_rate": 0.00019621778291893579, + "loss": 1.5458, + "step": 1471 + }, + { + "epoch": 0.019127968644165532, + "grad_norm": 0.3634943664073944, + "learning_rate": 0.0001962151834570244, + "loss": 1.3922, + "step": 1472 + }, + { + "epoch": 0.019140963188081405, + "grad_norm": 0.4297401010990143, + "learning_rate": 0.000196212583995113, + "loss": 1.6699, + "step": 1473 + }, + { + "epoch": 0.019153957731997278, + "grad_norm": 0.3873206079006195, + "learning_rate": 0.00019620998453320163, + "loss": 1.3755, + "step": 1474 + }, + { + "epoch": 0.01916695227591315, + "grad_norm": 0.35218915343284607, + "learning_rate": 0.00019620738507129026, + "loss": 1.5802, + "step": 1475 + }, + { + "epoch": 0.019179946819829024, + "grad_norm": 0.4823857247829437, + "learning_rate": 0.00019620478560937888, + "loss": 1.3618, + "step": 1476 + }, + { + "epoch": 0.019192941363744897, + "grad_norm": 0.33877623081207275, + "learning_rate": 0.00019620218614746748, + "loss": 1.5062, + "step": 1477 + }, + { + "epoch": 0.01920593590766077, + "grad_norm": 0.33483630418777466, + "learning_rate": 0.0001961995866855561, + "loss": 1.6163, + "step": 1478 + }, + { + "epoch": 0.019218930451576643, + "grad_norm": 0.37369218468666077, + "learning_rate": 0.00019619698722364473, + "loss": 1.5505, + "step": 1479 + }, + { + "epoch": 0.01923192499549252, + "grad_norm": 0.3920283615589142, + "learning_rate": 0.00019619438776173333, + "loss": 1.6222, + "step": 1480 + }, + { + "epoch": 0.019244919539408392, + "grad_norm": 0.35777056217193604, + "learning_rate": 0.00019619178829982195, + "loss": 1.7302, + "step": 1481 + }, + { + "epoch": 0.019257914083324265, + "grad_norm": 0.4896351397037506, + "learning_rate": 0.00019618918883791055, + "loss": 1.5716, + "step": 1482 + }, + { + "epoch": 0.019270908627240138, + "grad_norm": 0.3160429894924164, + "learning_rate": 0.0001961865893759992, + "loss": 1.4115, + "step": 1483 + }, + { + "epoch": 0.01928390317115601, + "grad_norm": 0.3355870246887207, + "learning_rate": 0.0001961839899140878, + "loss": 1.3963, + "step": 1484 + }, + { + "epoch": 0.019296897715071884, + "grad_norm": 0.39011481404304504, + "learning_rate": 0.0001961813904521764, + "loss": 1.6994, + "step": 1485 + }, + { + "epoch": 0.019309892258987757, + "grad_norm": 0.3732297420501709, + "learning_rate": 0.00019617879099026502, + "loss": 1.4887, + "step": 1486 + }, + { + "epoch": 0.01932288680290363, + "grad_norm": 0.3787655532360077, + "learning_rate": 0.00019617619152835364, + "loss": 1.7666, + "step": 1487 + }, + { + "epoch": 0.019335881346819503, + "grad_norm": 0.45300403237342834, + "learning_rate": 0.00019617359206644227, + "loss": 1.5503, + "step": 1488 + }, + { + "epoch": 0.019348875890735376, + "grad_norm": 0.36539945006370544, + "learning_rate": 0.00019617099260453086, + "loss": 1.3619, + "step": 1489 + }, + { + "epoch": 0.019361870434651252, + "grad_norm": 0.38043802976608276, + "learning_rate": 0.0001961683931426195, + "loss": 1.5032, + "step": 1490 + }, + { + "epoch": 0.019374864978567125, + "grad_norm": 0.3876885175704956, + "learning_rate": 0.0001961657936807081, + "loss": 1.5929, + "step": 1491 + }, + { + "epoch": 0.019387859522482998, + "grad_norm": 0.31826695799827576, + "learning_rate": 0.0001961631942187967, + "loss": 1.2323, + "step": 1492 + }, + { + "epoch": 0.01940085406639887, + "grad_norm": 0.45770037174224854, + "learning_rate": 0.00019616059475688534, + "loss": 1.4443, + "step": 1493 + }, + { + "epoch": 0.019413848610314744, + "grad_norm": 0.39884278178215027, + "learning_rate": 0.00019615799529497393, + "loss": 1.3063, + "step": 1494 + }, + { + "epoch": 0.019426843154230617, + "grad_norm": 0.28220826387405396, + "learning_rate": 0.00019615539583306258, + "loss": 1.5326, + "step": 1495 + }, + { + "epoch": 0.01943983769814649, + "grad_norm": 0.47629642486572266, + "learning_rate": 0.00019615279637115118, + "loss": 1.5384, + "step": 1496 + }, + { + "epoch": 0.019452832242062363, + "grad_norm": 0.417201429605484, + "learning_rate": 0.00019615019690923978, + "loss": 1.4289, + "step": 1497 + }, + { + "epoch": 0.019465826785978236, + "grad_norm": 0.4086882770061493, + "learning_rate": 0.0001961475974473284, + "loss": 1.5168, + "step": 1498 + }, + { + "epoch": 0.019478821329894112, + "grad_norm": 0.3688398003578186, + "learning_rate": 0.00019614499798541703, + "loss": 1.5101, + "step": 1499 + }, + { + "epoch": 0.019491815873809985, + "grad_norm": 0.39062660932540894, + "learning_rate": 0.00019614239852350565, + "loss": 1.5113, + "step": 1500 + }, + { + "epoch": 0.01950481041772586, + "grad_norm": 0.395280659198761, + "learning_rate": 0.00019613979906159425, + "loss": 1.45, + "step": 1501 + }, + { + "epoch": 0.01951780496164173, + "grad_norm": 0.43302208185195923, + "learning_rate": 0.00019613719959968287, + "loss": 1.5149, + "step": 1502 + }, + { + "epoch": 0.019530799505557604, + "grad_norm": 0.37377139925956726, + "learning_rate": 0.0001961346001377715, + "loss": 1.6256, + "step": 1503 + }, + { + "epoch": 0.019543794049473477, + "grad_norm": 0.370684951543808, + "learning_rate": 0.0001961320006758601, + "loss": 1.5941, + "step": 1504 + }, + { + "epoch": 0.01955678859338935, + "grad_norm": 0.33560818433761597, + "learning_rate": 0.00019612940121394872, + "loss": 1.2448, + "step": 1505 + }, + { + "epoch": 0.019569783137305223, + "grad_norm": 0.3527664840221405, + "learning_rate": 0.00019612680175203735, + "loss": 1.4019, + "step": 1506 + }, + { + "epoch": 0.019582777681221096, + "grad_norm": 0.4073215126991272, + "learning_rate": 0.00019612420229012597, + "loss": 1.5824, + "step": 1507 + }, + { + "epoch": 0.01959577222513697, + "grad_norm": 0.3010920584201813, + "learning_rate": 0.00019612160282821457, + "loss": 1.3867, + "step": 1508 + }, + { + "epoch": 0.019608766769052845, + "grad_norm": 0.44669947028160095, + "learning_rate": 0.00019611900336630316, + "loss": 1.7077, + "step": 1509 + }, + { + "epoch": 0.01962176131296872, + "grad_norm": 0.28721189498901367, + "learning_rate": 0.00019611640390439182, + "loss": 1.2883, + "step": 1510 + }, + { + "epoch": 0.01963475585688459, + "grad_norm": 0.4460401237010956, + "learning_rate": 0.0001961138044424804, + "loss": 1.4707, + "step": 1511 + }, + { + "epoch": 0.019647750400800464, + "grad_norm": 0.4010676145553589, + "learning_rate": 0.00019611120498056904, + "loss": 1.7405, + "step": 1512 + }, + { + "epoch": 0.019660744944716337, + "grad_norm": 0.40383365750312805, + "learning_rate": 0.00019610860551865764, + "loss": 1.5814, + "step": 1513 + }, + { + "epoch": 0.01967373948863221, + "grad_norm": 0.37719887495040894, + "learning_rate": 0.00019610600605674626, + "loss": 1.4757, + "step": 1514 + }, + { + "epoch": 0.019686734032548083, + "grad_norm": 0.3334643244743347, + "learning_rate": 0.00019610340659483488, + "loss": 1.2733, + "step": 1515 + }, + { + "epoch": 0.019699728576463956, + "grad_norm": 0.2876072824001312, + "learning_rate": 0.00019610080713292348, + "loss": 1.54, + "step": 1516 + }, + { + "epoch": 0.01971272312037983, + "grad_norm": 0.41408345103263855, + "learning_rate": 0.0001960982076710121, + "loss": 1.4716, + "step": 1517 + }, + { + "epoch": 0.019725717664295706, + "grad_norm": 0.2929058372974396, + "learning_rate": 0.00019609560820910073, + "loss": 1.4984, + "step": 1518 + }, + { + "epoch": 0.01973871220821158, + "grad_norm": 0.3934227228164673, + "learning_rate": 0.00019609300874718935, + "loss": 1.5725, + "step": 1519 + }, + { + "epoch": 0.01975170675212745, + "grad_norm": 0.42947065830230713, + "learning_rate": 0.00019609040928527795, + "loss": 1.3895, + "step": 1520 + }, + { + "epoch": 0.019764701296043324, + "grad_norm": 0.3334328830242157, + "learning_rate": 0.00019608780982336658, + "loss": 1.491, + "step": 1521 + }, + { + "epoch": 0.019777695839959197, + "grad_norm": 0.494296133518219, + "learning_rate": 0.0001960852103614552, + "loss": 1.4017, + "step": 1522 + }, + { + "epoch": 0.01979069038387507, + "grad_norm": 0.2985383868217468, + "learning_rate": 0.0001960826108995438, + "loss": 1.3115, + "step": 1523 + }, + { + "epoch": 0.019803684927790943, + "grad_norm": 0.35311025381088257, + "learning_rate": 0.00019608001143763242, + "loss": 1.5029, + "step": 1524 + }, + { + "epoch": 0.019816679471706816, + "grad_norm": 0.40701058506965637, + "learning_rate": 0.00019607741197572102, + "loss": 1.4916, + "step": 1525 + }, + { + "epoch": 0.01982967401562269, + "grad_norm": 0.5032493472099304, + "learning_rate": 0.00019607481251380965, + "loss": 1.4567, + "step": 1526 + }, + { + "epoch": 0.019842668559538562, + "grad_norm": 0.4273280203342438, + "learning_rate": 0.00019607221305189827, + "loss": 1.5937, + "step": 1527 + }, + { + "epoch": 0.01985566310345444, + "grad_norm": 0.34755900502204895, + "learning_rate": 0.00019606961358998687, + "loss": 1.3849, + "step": 1528 + }, + { + "epoch": 0.01986865764737031, + "grad_norm": 0.4055110812187195, + "learning_rate": 0.0001960670141280755, + "loss": 1.6312, + "step": 1529 + }, + { + "epoch": 0.019881652191286184, + "grad_norm": 0.3750371038913727, + "learning_rate": 0.00019606441466616412, + "loss": 1.3506, + "step": 1530 + }, + { + "epoch": 0.019894646735202057, + "grad_norm": 0.39874982833862305, + "learning_rate": 0.00019606181520425274, + "loss": 1.6667, + "step": 1531 + }, + { + "epoch": 0.01990764127911793, + "grad_norm": 0.3380453288555145, + "learning_rate": 0.00019605921574234134, + "loss": 1.3847, + "step": 1532 + }, + { + "epoch": 0.019920635823033803, + "grad_norm": 0.37355825304985046, + "learning_rate": 0.00019605661628042996, + "loss": 1.349, + "step": 1533 + }, + { + "epoch": 0.019933630366949676, + "grad_norm": 0.36169975996017456, + "learning_rate": 0.0001960540168185186, + "loss": 1.3348, + "step": 1534 + }, + { + "epoch": 0.01994662491086555, + "grad_norm": 0.4419514834880829, + "learning_rate": 0.00019605141735660718, + "loss": 1.7414, + "step": 1535 + }, + { + "epoch": 0.019959619454781422, + "grad_norm": 0.37134990096092224, + "learning_rate": 0.0001960488178946958, + "loss": 1.4494, + "step": 1536 + }, + { + "epoch": 0.0199726139986973, + "grad_norm": 0.40228742361068726, + "learning_rate": 0.0001960462184327844, + "loss": 1.2943, + "step": 1537 + }, + { + "epoch": 0.01998560854261317, + "grad_norm": 0.3903040885925293, + "learning_rate": 0.00019604361897087306, + "loss": 1.5018, + "step": 1538 + }, + { + "epoch": 0.019998603086529045, + "grad_norm": 0.25189146399497986, + "learning_rate": 0.00019604101950896165, + "loss": 1.1487, + "step": 1539 + }, + { + "epoch": 0.020011597630444918, + "grad_norm": 0.3548290729522705, + "learning_rate": 0.00019603842004705025, + "loss": 1.2833, + "step": 1540 + }, + { + "epoch": 0.02002459217436079, + "grad_norm": 0.3860187828540802, + "learning_rate": 0.0001960358205851389, + "loss": 1.5513, + "step": 1541 + }, + { + "epoch": 0.020037586718276663, + "grad_norm": 0.3593614101409912, + "learning_rate": 0.0001960332211232275, + "loss": 1.3988, + "step": 1542 + }, + { + "epoch": 0.020050581262192536, + "grad_norm": 0.42542481422424316, + "learning_rate": 0.00019603062166131613, + "loss": 1.5644, + "step": 1543 + }, + { + "epoch": 0.02006357580610841, + "grad_norm": 0.4755573570728302, + "learning_rate": 0.00019602802219940472, + "loss": 1.6379, + "step": 1544 + }, + { + "epoch": 0.020076570350024282, + "grad_norm": 0.4325181543827057, + "learning_rate": 0.00019602542273749335, + "loss": 1.4713, + "step": 1545 + }, + { + "epoch": 0.020089564893940155, + "grad_norm": 0.4106459617614746, + "learning_rate": 0.00019602282327558197, + "loss": 1.5294, + "step": 1546 + }, + { + "epoch": 0.02010255943785603, + "grad_norm": 0.4240623116493225, + "learning_rate": 0.00019602022381367057, + "loss": 1.4604, + "step": 1547 + }, + { + "epoch": 0.020115553981771905, + "grad_norm": 0.4559037685394287, + "learning_rate": 0.0001960176243517592, + "loss": 1.6868, + "step": 1548 + }, + { + "epoch": 0.020128548525687778, + "grad_norm": 0.4125285744667053, + "learning_rate": 0.00019601502488984782, + "loss": 1.5499, + "step": 1549 + }, + { + "epoch": 0.02014154306960365, + "grad_norm": 0.3951132297515869, + "learning_rate": 0.00019601242542793644, + "loss": 1.3677, + "step": 1550 + }, + { + "epoch": 0.020154537613519524, + "grad_norm": 0.3750077784061432, + "learning_rate": 0.00019600982596602504, + "loss": 1.2986, + "step": 1551 + }, + { + "epoch": 0.020167532157435396, + "grad_norm": 0.3765091001987457, + "learning_rate": 0.00019600722650411364, + "loss": 1.3322, + "step": 1552 + }, + { + "epoch": 0.02018052670135127, + "grad_norm": 0.465188205242157, + "learning_rate": 0.0001960046270422023, + "loss": 1.3976, + "step": 1553 + }, + { + "epoch": 0.020193521245267142, + "grad_norm": 0.30513960123062134, + "learning_rate": 0.0001960020275802909, + "loss": 1.6173, + "step": 1554 + }, + { + "epoch": 0.020206515789183015, + "grad_norm": 0.35195010900497437, + "learning_rate": 0.0001959994281183795, + "loss": 1.4578, + "step": 1555 + }, + { + "epoch": 0.020219510333098892, + "grad_norm": 0.2966456711292267, + "learning_rate": 0.0001959968286564681, + "loss": 1.3418, + "step": 1556 + }, + { + "epoch": 0.020232504877014765, + "grad_norm": 0.4176999628543854, + "learning_rate": 0.00019599422919455673, + "loss": 1.4104, + "step": 1557 + }, + { + "epoch": 0.020245499420930638, + "grad_norm": 0.4266605079174042, + "learning_rate": 0.00019599162973264536, + "loss": 1.5092, + "step": 1558 + }, + { + "epoch": 0.02025849396484651, + "grad_norm": 0.358388751745224, + "learning_rate": 0.00019598903027073395, + "loss": 1.6631, + "step": 1559 + }, + { + "epoch": 0.020271488508762384, + "grad_norm": 0.30513235926628113, + "learning_rate": 0.00019598643080882258, + "loss": 1.1193, + "step": 1560 + }, + { + "epoch": 0.020284483052678257, + "grad_norm": 0.45252203941345215, + "learning_rate": 0.0001959838313469112, + "loss": 1.3373, + "step": 1561 + }, + { + "epoch": 0.02029747759659413, + "grad_norm": 0.41170284152030945, + "learning_rate": 0.00019598123188499983, + "loss": 1.4798, + "step": 1562 + }, + { + "epoch": 0.020310472140510002, + "grad_norm": 0.33470121026039124, + "learning_rate": 0.00019597863242308843, + "loss": 1.6219, + "step": 1563 + }, + { + "epoch": 0.020323466684425875, + "grad_norm": 0.3679245710372925, + "learning_rate": 0.00019597603296117702, + "loss": 1.4923, + "step": 1564 + }, + { + "epoch": 0.02033646122834175, + "grad_norm": 0.41525131464004517, + "learning_rate": 0.00019597343349926567, + "loss": 1.3043, + "step": 1565 + }, + { + "epoch": 0.020349455772257625, + "grad_norm": 0.3219340145587921, + "learning_rate": 0.00019597083403735427, + "loss": 1.3098, + "step": 1566 + }, + { + "epoch": 0.020362450316173498, + "grad_norm": 0.3557848334312439, + "learning_rate": 0.0001959682345754429, + "loss": 1.4068, + "step": 1567 + }, + { + "epoch": 0.02037544486008937, + "grad_norm": 0.34874600172042847, + "learning_rate": 0.0001959656351135315, + "loss": 1.4774, + "step": 1568 + }, + { + "epoch": 0.020388439404005244, + "grad_norm": 0.3569698929786682, + "learning_rate": 0.00019596303565162012, + "loss": 1.3676, + "step": 1569 + }, + { + "epoch": 0.020401433947921117, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.00019596043618970874, + "loss": 1.5767, + "step": 1570 + }, + { + "epoch": 0.02041442849183699, + "grad_norm": 0.3264305889606476, + "learning_rate": 0.00019595783672779734, + "loss": 1.4497, + "step": 1571 + }, + { + "epoch": 0.020427423035752863, + "grad_norm": 0.36391645669937134, + "learning_rate": 0.00019595523726588596, + "loss": 1.3669, + "step": 1572 + }, + { + "epoch": 0.020440417579668736, + "grad_norm": 0.28453534841537476, + "learning_rate": 0.0001959526378039746, + "loss": 1.2903, + "step": 1573 + }, + { + "epoch": 0.02045341212358461, + "grad_norm": 0.43484920263290405, + "learning_rate": 0.00019595003834206321, + "loss": 1.5088, + "step": 1574 + }, + { + "epoch": 0.020466406667500485, + "grad_norm": 0.42664283514022827, + "learning_rate": 0.0001959474388801518, + "loss": 1.4973, + "step": 1575 + }, + { + "epoch": 0.020479401211416358, + "grad_norm": 0.3683311641216278, + "learning_rate": 0.00019594483941824044, + "loss": 1.4963, + "step": 1576 + }, + { + "epoch": 0.02049239575533223, + "grad_norm": 0.44799497723579407, + "learning_rate": 0.00019594223995632906, + "loss": 1.6409, + "step": 1577 + }, + { + "epoch": 0.020505390299248104, + "grad_norm": 0.47593674063682556, + "learning_rate": 0.00019593964049441766, + "loss": 1.4892, + "step": 1578 + }, + { + "epoch": 0.020518384843163977, + "grad_norm": 0.3819369077682495, + "learning_rate": 0.00019593704103250628, + "loss": 1.3729, + "step": 1579 + }, + { + "epoch": 0.02053137938707985, + "grad_norm": 0.40040919184684753, + "learning_rate": 0.0001959344415705949, + "loss": 1.5173, + "step": 1580 + }, + { + "epoch": 0.020544373930995723, + "grad_norm": 0.35317182540893555, + "learning_rate": 0.0001959318421086835, + "loss": 1.4997, + "step": 1581 + }, + { + "epoch": 0.020557368474911596, + "grad_norm": 0.3929605782032013, + "learning_rate": 0.00019592924264677213, + "loss": 1.6177, + "step": 1582 + }, + { + "epoch": 0.02057036301882747, + "grad_norm": 0.2945997416973114, + "learning_rate": 0.00019592664318486073, + "loss": 1.4386, + "step": 1583 + }, + { + "epoch": 0.02058335756274334, + "grad_norm": 0.3660469055175781, + "learning_rate": 0.00019592404372294938, + "loss": 1.4455, + "step": 1584 + }, + { + "epoch": 0.020596352106659218, + "grad_norm": 0.3905063569545746, + "learning_rate": 0.00019592144426103797, + "loss": 1.5125, + "step": 1585 + }, + { + "epoch": 0.02060934665057509, + "grad_norm": 0.36867886781692505, + "learning_rate": 0.0001959188447991266, + "loss": 1.3826, + "step": 1586 + }, + { + "epoch": 0.020622341194490964, + "grad_norm": 0.4044784605503082, + "learning_rate": 0.0001959162453372152, + "loss": 1.4459, + "step": 1587 + }, + { + "epoch": 0.020635335738406837, + "grad_norm": 0.3518185317516327, + "learning_rate": 0.00019591364587530382, + "loss": 1.3904, + "step": 1588 + }, + { + "epoch": 0.02064833028232271, + "grad_norm": 0.386371910572052, + "learning_rate": 0.00019591104641339245, + "loss": 1.5087, + "step": 1589 + }, + { + "epoch": 0.020661324826238583, + "grad_norm": 0.3203730881214142, + "learning_rate": 0.00019590844695148104, + "loss": 1.148, + "step": 1590 + }, + { + "epoch": 0.020674319370154456, + "grad_norm": 0.2952377200126648, + "learning_rate": 0.00019590584748956967, + "loss": 1.4205, + "step": 1591 + }, + { + "epoch": 0.02068731391407033, + "grad_norm": 0.3668016195297241, + "learning_rate": 0.0001959032480276583, + "loss": 1.4445, + "step": 1592 + }, + { + "epoch": 0.0207003084579862, + "grad_norm": 0.3414691388607025, + "learning_rate": 0.0001959006485657469, + "loss": 1.4218, + "step": 1593 + }, + { + "epoch": 0.020713303001902078, + "grad_norm": 0.39005324244499207, + "learning_rate": 0.00019589804910383551, + "loss": 1.6387, + "step": 1594 + }, + { + "epoch": 0.02072629754581795, + "grad_norm": 0.3395806550979614, + "learning_rate": 0.0001958954496419241, + "loss": 1.3853, + "step": 1595 + }, + { + "epoch": 0.020739292089733824, + "grad_norm": 0.3177931308746338, + "learning_rate": 0.00019589285018001276, + "loss": 1.2602, + "step": 1596 + }, + { + "epoch": 0.020752286633649697, + "grad_norm": 0.4624570608139038, + "learning_rate": 0.00019589025071810136, + "loss": 1.4313, + "step": 1597 + }, + { + "epoch": 0.02076528117756557, + "grad_norm": 0.3250756859779358, + "learning_rate": 0.00019588765125618998, + "loss": 1.4142, + "step": 1598 + }, + { + "epoch": 0.020778275721481443, + "grad_norm": 0.3827402889728546, + "learning_rate": 0.00019588505179427858, + "loss": 1.4889, + "step": 1599 + }, + { + "epoch": 0.020791270265397316, + "grad_norm": 0.3126867115497589, + "learning_rate": 0.0001958824523323672, + "loss": 1.3279, + "step": 1600 + }, + { + "epoch": 0.02080426480931319, + "grad_norm": 0.3472033441066742, + "learning_rate": 0.00019587985287045583, + "loss": 1.4702, + "step": 1601 + }, + { + "epoch": 0.020817259353229062, + "grad_norm": 0.4711418151855469, + "learning_rate": 0.00019587725340854443, + "loss": 1.4495, + "step": 1602 + }, + { + "epoch": 0.020830253897144935, + "grad_norm": 0.36849603056907654, + "learning_rate": 0.00019587465394663305, + "loss": 1.748, + "step": 1603 + }, + { + "epoch": 0.02084324844106081, + "grad_norm": 0.38734176754951477, + "learning_rate": 0.00019587205448472168, + "loss": 1.6044, + "step": 1604 + }, + { + "epoch": 0.020856242984976684, + "grad_norm": 0.4070992171764374, + "learning_rate": 0.0001958694550228103, + "loss": 1.4472, + "step": 1605 + }, + { + "epoch": 0.020869237528892557, + "grad_norm": 0.44407230615615845, + "learning_rate": 0.0001958668555608989, + "loss": 1.4856, + "step": 1606 + }, + { + "epoch": 0.02088223207280843, + "grad_norm": 0.42099443078041077, + "learning_rate": 0.0001958642560989875, + "loss": 1.4157, + "step": 1607 + }, + { + "epoch": 0.020895226616724303, + "grad_norm": 0.3330783545970917, + "learning_rate": 0.00019586165663707615, + "loss": 1.269, + "step": 1608 + }, + { + "epoch": 0.020908221160640176, + "grad_norm": 0.3795165419578552, + "learning_rate": 0.00019585905717516475, + "loss": 1.6631, + "step": 1609 + }, + { + "epoch": 0.02092121570455605, + "grad_norm": 0.3347870409488678, + "learning_rate": 0.00019585645771325337, + "loss": 1.39, + "step": 1610 + }, + { + "epoch": 0.020934210248471922, + "grad_norm": 0.42705029249191284, + "learning_rate": 0.00019585385825134197, + "loss": 1.6396, + "step": 1611 + }, + { + "epoch": 0.020947204792387795, + "grad_norm": 0.3586139976978302, + "learning_rate": 0.0001958512587894306, + "loss": 1.5277, + "step": 1612 + }, + { + "epoch": 0.02096019933630367, + "grad_norm": 0.38245609402656555, + "learning_rate": 0.00019584865932751922, + "loss": 1.5861, + "step": 1613 + }, + { + "epoch": 0.020973193880219544, + "grad_norm": 0.3106684684753418, + "learning_rate": 0.00019584605986560781, + "loss": 1.1715, + "step": 1614 + }, + { + "epoch": 0.020986188424135417, + "grad_norm": 0.37953510880470276, + "learning_rate": 0.00019584346040369647, + "loss": 1.5584, + "step": 1615 + }, + { + "epoch": 0.02099918296805129, + "grad_norm": 0.2933439016342163, + "learning_rate": 0.00019584086094178506, + "loss": 1.2928, + "step": 1616 + }, + { + "epoch": 0.021012177511967163, + "grad_norm": 0.369454562664032, + "learning_rate": 0.0001958382614798737, + "loss": 1.453, + "step": 1617 + }, + { + "epoch": 0.021025172055883036, + "grad_norm": 0.34735023975372314, + "learning_rate": 0.00019583566201796228, + "loss": 1.4575, + "step": 1618 + }, + { + "epoch": 0.02103816659979891, + "grad_norm": 0.26747339963912964, + "learning_rate": 0.0001958330625560509, + "loss": 1.4802, + "step": 1619 + }, + { + "epoch": 0.021051161143714782, + "grad_norm": 0.44794002175331116, + "learning_rate": 0.00019583046309413953, + "loss": 1.5177, + "step": 1620 + }, + { + "epoch": 0.021064155687630655, + "grad_norm": 0.41062891483306885, + "learning_rate": 0.00019582786363222813, + "loss": 1.5903, + "step": 1621 + }, + { + "epoch": 0.021077150231546528, + "grad_norm": 0.3192037343978882, + "learning_rate": 0.00019582526417031676, + "loss": 1.322, + "step": 1622 + }, + { + "epoch": 0.021090144775462404, + "grad_norm": 0.47188833355903625, + "learning_rate": 0.00019582266470840538, + "loss": 1.5766, + "step": 1623 + }, + { + "epoch": 0.021103139319378277, + "grad_norm": 0.329072505235672, + "learning_rate": 0.00019582006524649398, + "loss": 1.3272, + "step": 1624 + }, + { + "epoch": 0.02111613386329415, + "grad_norm": 0.3218333125114441, + "learning_rate": 0.0001958174657845826, + "loss": 1.252, + "step": 1625 + }, + { + "epoch": 0.021129128407210023, + "grad_norm": 0.44932013750076294, + "learning_rate": 0.0001958148663226712, + "loss": 1.2511, + "step": 1626 + }, + { + "epoch": 0.021142122951125896, + "grad_norm": 0.4039768576622009, + "learning_rate": 0.00019581226686075985, + "loss": 1.3452, + "step": 1627 + }, + { + "epoch": 0.02115511749504177, + "grad_norm": 0.3648715019226074, + "learning_rate": 0.00019580966739884845, + "loss": 1.5494, + "step": 1628 + }, + { + "epoch": 0.021168112038957642, + "grad_norm": 0.30340489745140076, + "learning_rate": 0.00019580706793693707, + "loss": 1.3307, + "step": 1629 + }, + { + "epoch": 0.021181106582873515, + "grad_norm": 0.3260335624217987, + "learning_rate": 0.00019580446847502567, + "loss": 1.2421, + "step": 1630 + }, + { + "epoch": 0.021194101126789388, + "grad_norm": 0.39445722103118896, + "learning_rate": 0.0001958018690131143, + "loss": 1.5133, + "step": 1631 + }, + { + "epoch": 0.021207095670705264, + "grad_norm": 0.4508373439311981, + "learning_rate": 0.00019579926955120292, + "loss": 1.497, + "step": 1632 + }, + { + "epoch": 0.021220090214621137, + "grad_norm": 0.34700095653533936, + "learning_rate": 0.00019579667008929152, + "loss": 1.5436, + "step": 1633 + }, + { + "epoch": 0.02123308475853701, + "grad_norm": 0.3277294635772705, + "learning_rate": 0.00019579407062738014, + "loss": 1.5377, + "step": 1634 + }, + { + "epoch": 0.021246079302452883, + "grad_norm": 0.3692040741443634, + "learning_rate": 0.00019579147116546877, + "loss": 1.4643, + "step": 1635 + }, + { + "epoch": 0.021259073846368756, + "grad_norm": 0.33273717761039734, + "learning_rate": 0.00019578887170355736, + "loss": 1.4873, + "step": 1636 + }, + { + "epoch": 0.02127206839028463, + "grad_norm": 0.4648902714252472, + "learning_rate": 0.000195786272241646, + "loss": 1.5925, + "step": 1637 + }, + { + "epoch": 0.021285062934200502, + "grad_norm": 0.32959115505218506, + "learning_rate": 0.00019578367277973458, + "loss": 1.4733, + "step": 1638 + }, + { + "epoch": 0.021298057478116375, + "grad_norm": 0.3375896215438843, + "learning_rate": 0.00019578107331782324, + "loss": 1.3738, + "step": 1639 + }, + { + "epoch": 0.021311052022032248, + "grad_norm": 0.397700697183609, + "learning_rate": 0.00019577847385591183, + "loss": 1.4783, + "step": 1640 + }, + { + "epoch": 0.02132404656594812, + "grad_norm": 0.3494514226913452, + "learning_rate": 0.00019577587439400046, + "loss": 1.4699, + "step": 1641 + }, + { + "epoch": 0.021337041109863997, + "grad_norm": 0.3787671625614166, + "learning_rate": 0.00019577327493208906, + "loss": 1.3895, + "step": 1642 + }, + { + "epoch": 0.02135003565377987, + "grad_norm": 0.3944171965122223, + "learning_rate": 0.00019577067547017768, + "loss": 1.4478, + "step": 1643 + }, + { + "epoch": 0.021363030197695743, + "grad_norm": 0.3857954442501068, + "learning_rate": 0.0001957680760082663, + "loss": 1.5263, + "step": 1644 + }, + { + "epoch": 0.021376024741611616, + "grad_norm": 0.3964695334434509, + "learning_rate": 0.0001957654765463549, + "loss": 1.6049, + "step": 1645 + }, + { + "epoch": 0.02138901928552749, + "grad_norm": 0.4778573215007782, + "learning_rate": 0.00019576287708444353, + "loss": 1.5705, + "step": 1646 + }, + { + "epoch": 0.021402013829443362, + "grad_norm": 0.35294604301452637, + "learning_rate": 0.00019576027762253215, + "loss": 1.4666, + "step": 1647 + }, + { + "epoch": 0.021415008373359235, + "grad_norm": 0.31928515434265137, + "learning_rate": 0.00019575767816062075, + "loss": 1.4431, + "step": 1648 + }, + { + "epoch": 0.021428002917275108, + "grad_norm": 0.3613530099391937, + "learning_rate": 0.00019575507869870937, + "loss": 1.3176, + "step": 1649 + }, + { + "epoch": 0.02144099746119098, + "grad_norm": 0.2900174558162689, + "learning_rate": 0.000195752479236798, + "loss": 1.4382, + "step": 1650 + }, + { + "epoch": 0.021453992005106854, + "grad_norm": 0.3931158483028412, + "learning_rate": 0.00019574987977488662, + "loss": 1.5383, + "step": 1651 + }, + { + "epoch": 0.02146698654902273, + "grad_norm": 0.4072851240634918, + "learning_rate": 0.00019574728031297522, + "loss": 1.7854, + "step": 1652 + }, + { + "epoch": 0.021479981092938603, + "grad_norm": 0.3354068994522095, + "learning_rate": 0.00019574468085106384, + "loss": 1.4827, + "step": 1653 + }, + { + "epoch": 0.021492975636854476, + "grad_norm": 0.25913044810295105, + "learning_rate": 0.00019574208138915247, + "loss": 1.291, + "step": 1654 + }, + { + "epoch": 0.02150597018077035, + "grad_norm": 0.2384590059518814, + "learning_rate": 0.00019573948192724107, + "loss": 1.3029, + "step": 1655 + }, + { + "epoch": 0.021518964724686222, + "grad_norm": 0.31463539600372314, + "learning_rate": 0.0001957368824653297, + "loss": 1.3735, + "step": 1656 + }, + { + "epoch": 0.021531959268602095, + "grad_norm": 0.39537736773490906, + "learning_rate": 0.0001957342830034183, + "loss": 1.3781, + "step": 1657 + }, + { + "epoch": 0.021544953812517968, + "grad_norm": 0.33102867007255554, + "learning_rate": 0.00019573168354150694, + "loss": 1.5955, + "step": 1658 + }, + { + "epoch": 0.02155794835643384, + "grad_norm": 0.4276646077632904, + "learning_rate": 0.00019572908407959554, + "loss": 1.5761, + "step": 1659 + }, + { + "epoch": 0.021570942900349714, + "grad_norm": 0.41354063153266907, + "learning_rate": 0.00019572648461768416, + "loss": 1.444, + "step": 1660 + }, + { + "epoch": 0.02158393744426559, + "grad_norm": 0.37918925285339355, + "learning_rate": 0.00019572388515577276, + "loss": 1.4344, + "step": 1661 + }, + { + "epoch": 0.021596931988181464, + "grad_norm": 0.5014055371284485, + "learning_rate": 0.00019572128569386138, + "loss": 1.5845, + "step": 1662 + }, + { + "epoch": 0.021609926532097336, + "grad_norm": 0.42829424142837524, + "learning_rate": 0.00019571868623195, + "loss": 1.5156, + "step": 1663 + }, + { + "epoch": 0.02162292107601321, + "grad_norm": 0.43786388635635376, + "learning_rate": 0.0001957160867700386, + "loss": 1.6055, + "step": 1664 + }, + { + "epoch": 0.021635915619929082, + "grad_norm": 0.31251829862594604, + "learning_rate": 0.00019571348730812723, + "loss": 1.5896, + "step": 1665 + }, + { + "epoch": 0.021648910163844955, + "grad_norm": 0.31247207522392273, + "learning_rate": 0.00019571088784621585, + "loss": 1.4182, + "step": 1666 + }, + { + "epoch": 0.02166190470776083, + "grad_norm": 0.3295499086380005, + "learning_rate": 0.00019570828838430445, + "loss": 1.2062, + "step": 1667 + }, + { + "epoch": 0.0216748992516767, + "grad_norm": 0.3025866746902466, + "learning_rate": 0.00019570568892239307, + "loss": 1.5354, + "step": 1668 + }, + { + "epoch": 0.021687893795592574, + "grad_norm": 0.249764546751976, + "learning_rate": 0.00019570308946048167, + "loss": 1.346, + "step": 1669 + }, + { + "epoch": 0.021700888339508447, + "grad_norm": 0.4976350963115692, + "learning_rate": 0.00019570048999857032, + "loss": 1.5642, + "step": 1670 + }, + { + "epoch": 0.021713882883424324, + "grad_norm": 0.37651264667510986, + "learning_rate": 0.00019569789053665892, + "loss": 1.5268, + "step": 1671 + }, + { + "epoch": 0.021726877427340197, + "grad_norm": 0.3025478720664978, + "learning_rate": 0.00019569529107474755, + "loss": 1.309, + "step": 1672 + }, + { + "epoch": 0.02173987197125607, + "grad_norm": 0.5278252959251404, + "learning_rate": 0.00019569269161283614, + "loss": 1.5146, + "step": 1673 + }, + { + "epoch": 0.021752866515171942, + "grad_norm": 0.3368765711784363, + "learning_rate": 0.00019569009215092477, + "loss": 1.4717, + "step": 1674 + }, + { + "epoch": 0.021765861059087815, + "grad_norm": 0.37126728892326355, + "learning_rate": 0.0001956874926890134, + "loss": 1.3485, + "step": 1675 + }, + { + "epoch": 0.02177885560300369, + "grad_norm": 0.4636981785297394, + "learning_rate": 0.000195684893227102, + "loss": 1.5706, + "step": 1676 + }, + { + "epoch": 0.02179185014691956, + "grad_norm": 0.25153589248657227, + "learning_rate": 0.00019568229376519061, + "loss": 1.3659, + "step": 1677 + }, + { + "epoch": 0.021804844690835434, + "grad_norm": 0.41266581416130066, + "learning_rate": 0.00019567969430327924, + "loss": 1.4365, + "step": 1678 + }, + { + "epoch": 0.021817839234751307, + "grad_norm": 0.37196439504623413, + "learning_rate": 0.00019567709484136784, + "loss": 1.2218, + "step": 1679 + }, + { + "epoch": 0.021830833778667184, + "grad_norm": 0.43542590737342834, + "learning_rate": 0.00019567449537945646, + "loss": 1.5324, + "step": 1680 + }, + { + "epoch": 0.021843828322583057, + "grad_norm": 0.34926947951316833, + "learning_rate": 0.00019567189591754506, + "loss": 1.5377, + "step": 1681 + }, + { + "epoch": 0.02185682286649893, + "grad_norm": 0.41107818484306335, + "learning_rate": 0.0001956692964556337, + "loss": 1.5147, + "step": 1682 + }, + { + "epoch": 0.021869817410414803, + "grad_norm": 0.3859551250934601, + "learning_rate": 0.0001956666969937223, + "loss": 1.5414, + "step": 1683 + }, + { + "epoch": 0.021882811954330676, + "grad_norm": 0.3669387996196747, + "learning_rate": 0.00019566409753181093, + "loss": 1.4922, + "step": 1684 + }, + { + "epoch": 0.02189580649824655, + "grad_norm": 0.34603843092918396, + "learning_rate": 0.00019566149806989953, + "loss": 1.3241, + "step": 1685 + }, + { + "epoch": 0.02190880104216242, + "grad_norm": 0.37981685996055603, + "learning_rate": 0.00019565889860798815, + "loss": 1.3466, + "step": 1686 + }, + { + "epoch": 0.021921795586078294, + "grad_norm": 0.2920598089694977, + "learning_rate": 0.00019565629914607678, + "loss": 1.4988, + "step": 1687 + }, + { + "epoch": 0.021934790129994167, + "grad_norm": 0.40843528509140015, + "learning_rate": 0.00019565369968416537, + "loss": 1.304, + "step": 1688 + }, + { + "epoch": 0.02194778467391004, + "grad_norm": 0.48986881971359253, + "learning_rate": 0.00019565110022225403, + "loss": 1.3282, + "step": 1689 + }, + { + "epoch": 0.021960779217825917, + "grad_norm": 0.35337305068969727, + "learning_rate": 0.00019564850076034262, + "loss": 1.5884, + "step": 1690 + }, + { + "epoch": 0.02197377376174179, + "grad_norm": 0.37043288350105286, + "learning_rate": 0.00019564590129843122, + "loss": 1.3222, + "step": 1691 + }, + { + "epoch": 0.021986768305657663, + "grad_norm": 0.30725833773612976, + "learning_rate": 0.00019564330183651985, + "loss": 1.4441, + "step": 1692 + }, + { + "epoch": 0.021999762849573536, + "grad_norm": 0.3049827218055725, + "learning_rate": 0.00019564070237460847, + "loss": 1.3802, + "step": 1693 + }, + { + "epoch": 0.02201275739348941, + "grad_norm": 0.3680458962917328, + "learning_rate": 0.0001956381029126971, + "loss": 1.3028, + "step": 1694 + }, + { + "epoch": 0.02202575193740528, + "grad_norm": 0.3884005844593048, + "learning_rate": 0.0001956355034507857, + "loss": 1.5881, + "step": 1695 + }, + { + "epoch": 0.022038746481321154, + "grad_norm": 0.386909544467926, + "learning_rate": 0.00019563290398887432, + "loss": 1.4967, + "step": 1696 + }, + { + "epoch": 0.022051741025237027, + "grad_norm": 0.327727347612381, + "learning_rate": 0.00019563030452696294, + "loss": 1.4156, + "step": 1697 + }, + { + "epoch": 0.0220647355691529, + "grad_norm": 0.36065375804901123, + "learning_rate": 0.00019562770506505154, + "loss": 1.434, + "step": 1698 + }, + { + "epoch": 0.022077730113068777, + "grad_norm": 0.41146379709243774, + "learning_rate": 0.00019562510560314016, + "loss": 1.4004, + "step": 1699 + }, + { + "epoch": 0.02209072465698465, + "grad_norm": 0.28855806589126587, + "learning_rate": 0.00019562250614122876, + "loss": 1.352, + "step": 1700 + }, + { + "epoch": 0.022103719200900523, + "grad_norm": 0.35015758872032166, + "learning_rate": 0.0001956199066793174, + "loss": 1.4631, + "step": 1701 + }, + { + "epoch": 0.022116713744816396, + "grad_norm": 0.3388160169124603, + "learning_rate": 0.000195617307217406, + "loss": 1.5847, + "step": 1702 + }, + { + "epoch": 0.02212970828873227, + "grad_norm": 0.3232349157333374, + "learning_rate": 0.0001956147077554946, + "loss": 1.353, + "step": 1703 + }, + { + "epoch": 0.02214270283264814, + "grad_norm": 0.3832527995109558, + "learning_rate": 0.00019561210829358323, + "loss": 1.5827, + "step": 1704 + }, + { + "epoch": 0.022155697376564015, + "grad_norm": 0.321197509765625, + "learning_rate": 0.00019560950883167186, + "loss": 1.376, + "step": 1705 + }, + { + "epoch": 0.022168691920479888, + "grad_norm": 0.2841126024723053, + "learning_rate": 0.00019560690936976048, + "loss": 1.262, + "step": 1706 + }, + { + "epoch": 0.02218168646439576, + "grad_norm": 0.27810975909233093, + "learning_rate": 0.00019560430990784908, + "loss": 1.1901, + "step": 1707 + }, + { + "epoch": 0.022194681008311633, + "grad_norm": 0.40710029006004333, + "learning_rate": 0.0001956017104459377, + "loss": 1.3795, + "step": 1708 + }, + { + "epoch": 0.02220767555222751, + "grad_norm": 0.27838972210884094, + "learning_rate": 0.00019559911098402633, + "loss": 1.464, + "step": 1709 + }, + { + "epoch": 0.022220670096143383, + "grad_norm": 0.37907466292381287, + "learning_rate": 0.00019559651152211492, + "loss": 1.3841, + "step": 1710 + }, + { + "epoch": 0.022233664640059256, + "grad_norm": 0.4405648708343506, + "learning_rate": 0.00019559391206020355, + "loss": 1.4879, + "step": 1711 + }, + { + "epoch": 0.02224665918397513, + "grad_norm": 0.30526280403137207, + "learning_rate": 0.00019559131259829215, + "loss": 1.3064, + "step": 1712 + }, + { + "epoch": 0.022259653727891, + "grad_norm": 0.4097805619239807, + "learning_rate": 0.0001955887131363808, + "loss": 1.4996, + "step": 1713 + }, + { + "epoch": 0.022272648271806875, + "grad_norm": 0.3696689009666443, + "learning_rate": 0.0001955861136744694, + "loss": 1.3542, + "step": 1714 + }, + { + "epoch": 0.022285642815722748, + "grad_norm": 0.4285631775856018, + "learning_rate": 0.000195583514212558, + "loss": 1.465, + "step": 1715 + }, + { + "epoch": 0.02229863735963862, + "grad_norm": 0.39842674136161804, + "learning_rate": 0.00019558091475064662, + "loss": 1.4928, + "step": 1716 + }, + { + "epoch": 0.022311631903554494, + "grad_norm": 0.41399630904197693, + "learning_rate": 0.00019557831528873524, + "loss": 1.3706, + "step": 1717 + }, + { + "epoch": 0.02232462644747037, + "grad_norm": 0.31001347303390503, + "learning_rate": 0.00019557571582682387, + "loss": 1.3039, + "step": 1718 + }, + { + "epoch": 0.022337620991386243, + "grad_norm": 0.6012894511222839, + "learning_rate": 0.00019557311636491246, + "loss": 1.5732, + "step": 1719 + }, + { + "epoch": 0.022350615535302116, + "grad_norm": 0.4421006739139557, + "learning_rate": 0.0001955705169030011, + "loss": 1.4559, + "step": 1720 + }, + { + "epoch": 0.02236361007921799, + "grad_norm": 0.3570576310157776, + "learning_rate": 0.0001955679174410897, + "loss": 1.3467, + "step": 1721 + }, + { + "epoch": 0.022376604623133862, + "grad_norm": 0.3897778391838074, + "learning_rate": 0.0001955653179791783, + "loss": 1.3302, + "step": 1722 + }, + { + "epoch": 0.022389599167049735, + "grad_norm": 0.3074800968170166, + "learning_rate": 0.00019556271851726693, + "loss": 1.3689, + "step": 1723 + }, + { + "epoch": 0.022402593710965608, + "grad_norm": 0.29305726289749146, + "learning_rate": 0.00019556011905535556, + "loss": 1.4544, + "step": 1724 + }, + { + "epoch": 0.02241558825488148, + "grad_norm": 0.4414934515953064, + "learning_rate": 0.00019555751959344418, + "loss": 1.4851, + "step": 1725 + }, + { + "epoch": 0.022428582798797354, + "grad_norm": 0.27918463945388794, + "learning_rate": 0.00019555492013153278, + "loss": 1.4063, + "step": 1726 + }, + { + "epoch": 0.022441577342713227, + "grad_norm": 0.4117446541786194, + "learning_rate": 0.0001955523206696214, + "loss": 1.4085, + "step": 1727 + }, + { + "epoch": 0.022454571886629103, + "grad_norm": 0.42266303300857544, + "learning_rate": 0.00019554972120771003, + "loss": 1.5035, + "step": 1728 + }, + { + "epoch": 0.022467566430544976, + "grad_norm": 0.4759625792503357, + "learning_rate": 0.00019554712174579863, + "loss": 1.4085, + "step": 1729 + }, + { + "epoch": 0.02248056097446085, + "grad_norm": 0.374464213848114, + "learning_rate": 0.00019554452228388725, + "loss": 1.5305, + "step": 1730 + }, + { + "epoch": 0.022493555518376722, + "grad_norm": 0.31605151295661926, + "learning_rate": 0.00019554192282197585, + "loss": 1.5106, + "step": 1731 + }, + { + "epoch": 0.022506550062292595, + "grad_norm": 0.3102051913738251, + "learning_rate": 0.00019553932336006447, + "loss": 1.598, + "step": 1732 + }, + { + "epoch": 0.022519544606208468, + "grad_norm": 0.32919368147850037, + "learning_rate": 0.0001955367238981531, + "loss": 1.7339, + "step": 1733 + }, + { + "epoch": 0.02253253915012434, + "grad_norm": 0.37161538004875183, + "learning_rate": 0.0001955341244362417, + "loss": 1.3533, + "step": 1734 + }, + { + "epoch": 0.022545533694040214, + "grad_norm": 0.33915358781814575, + "learning_rate": 0.00019553152497433032, + "loss": 1.5635, + "step": 1735 + }, + { + "epoch": 0.022558528237956087, + "grad_norm": 0.23286893963813782, + "learning_rate": 0.00019552892551241894, + "loss": 1.229, + "step": 1736 + }, + { + "epoch": 0.022571522781871963, + "grad_norm": 0.30798688530921936, + "learning_rate": 0.00019552632605050757, + "loss": 1.4644, + "step": 1737 + }, + { + "epoch": 0.022584517325787836, + "grad_norm": 0.3687688410282135, + "learning_rate": 0.00019552372658859617, + "loss": 1.3613, + "step": 1738 + }, + { + "epoch": 0.02259751186970371, + "grad_norm": 0.3794326186180115, + "learning_rate": 0.0001955211271266848, + "loss": 1.5485, + "step": 1739 + }, + { + "epoch": 0.022610506413619582, + "grad_norm": 0.42618221044540405, + "learning_rate": 0.00019551852766477341, + "loss": 1.6085, + "step": 1740 + }, + { + "epoch": 0.022623500957535455, + "grad_norm": 0.4333156645298004, + "learning_rate": 0.000195515928202862, + "loss": 1.4613, + "step": 1741 + }, + { + "epoch": 0.022636495501451328, + "grad_norm": 0.35553714632987976, + "learning_rate": 0.00019551332874095064, + "loss": 1.5204, + "step": 1742 + }, + { + "epoch": 0.0226494900453672, + "grad_norm": 0.3447892367839813, + "learning_rate": 0.00019551072927903923, + "loss": 1.5232, + "step": 1743 + }, + { + "epoch": 0.022662484589283074, + "grad_norm": 0.408682256937027, + "learning_rate": 0.00019550812981712789, + "loss": 1.5445, + "step": 1744 + }, + { + "epoch": 0.022675479133198947, + "grad_norm": 0.28475674986839294, + "learning_rate": 0.00019550553035521648, + "loss": 1.4025, + "step": 1745 + }, + { + "epoch": 0.02268847367711482, + "grad_norm": 0.4748742878437042, + "learning_rate": 0.00019550293089330508, + "loss": 1.3943, + "step": 1746 + }, + { + "epoch": 0.022701468221030696, + "grad_norm": 0.30381184816360474, + "learning_rate": 0.0001955003314313937, + "loss": 1.3255, + "step": 1747 + }, + { + "epoch": 0.02271446276494657, + "grad_norm": 0.4273858964443207, + "learning_rate": 0.00019549773196948233, + "loss": 1.4897, + "step": 1748 + }, + { + "epoch": 0.022727457308862442, + "grad_norm": 0.44716984033584595, + "learning_rate": 0.00019549513250757095, + "loss": 1.5146, + "step": 1749 + }, + { + "epoch": 0.022740451852778315, + "grad_norm": 0.3693029284477234, + "learning_rate": 0.00019549253304565955, + "loss": 1.5719, + "step": 1750 + }, + { + "epoch": 0.022753446396694188, + "grad_norm": 0.3475962281227112, + "learning_rate": 0.00019548993358374818, + "loss": 1.5674, + "step": 1751 + }, + { + "epoch": 0.02276644094061006, + "grad_norm": 0.43653982877731323, + "learning_rate": 0.0001954873341218368, + "loss": 1.6131, + "step": 1752 + }, + { + "epoch": 0.022779435484525934, + "grad_norm": 0.2988130748271942, + "learning_rate": 0.0001954847346599254, + "loss": 1.4084, + "step": 1753 + }, + { + "epoch": 0.022792430028441807, + "grad_norm": 0.33502525091171265, + "learning_rate": 0.00019548213519801402, + "loss": 1.6681, + "step": 1754 + }, + { + "epoch": 0.02280542457235768, + "grad_norm": 0.3344237208366394, + "learning_rate": 0.00019547953573610262, + "loss": 1.4013, + "step": 1755 + }, + { + "epoch": 0.022818419116273556, + "grad_norm": 0.27920055389404297, + "learning_rate": 0.00019547693627419127, + "loss": 1.2838, + "step": 1756 + }, + { + "epoch": 0.02283141366018943, + "grad_norm": 0.3209424316883087, + "learning_rate": 0.00019547433681227987, + "loss": 1.3716, + "step": 1757 + }, + { + "epoch": 0.022844408204105302, + "grad_norm": 0.36769208312034607, + "learning_rate": 0.00019547173735036847, + "loss": 1.6245, + "step": 1758 + }, + { + "epoch": 0.022857402748021175, + "grad_norm": 0.33044371008872986, + "learning_rate": 0.0001954691378884571, + "loss": 1.4102, + "step": 1759 + }, + { + "epoch": 0.022870397291937048, + "grad_norm": 0.4227673411369324, + "learning_rate": 0.00019546653842654571, + "loss": 1.5309, + "step": 1760 + }, + { + "epoch": 0.02288339183585292, + "grad_norm": 0.3412001430988312, + "learning_rate": 0.00019546393896463434, + "loss": 1.453, + "step": 1761 + }, + { + "epoch": 0.022896386379768794, + "grad_norm": 0.31443512439727783, + "learning_rate": 0.00019546133950272294, + "loss": 1.1764, + "step": 1762 + }, + { + "epoch": 0.022909380923684667, + "grad_norm": 0.4690960645675659, + "learning_rate": 0.00019545874004081156, + "loss": 1.4996, + "step": 1763 + }, + { + "epoch": 0.02292237546760054, + "grad_norm": 0.23522229492664337, + "learning_rate": 0.00019545614057890019, + "loss": 1.3415, + "step": 1764 + }, + { + "epoch": 0.022935370011516413, + "grad_norm": 0.38632360100746155, + "learning_rate": 0.00019545354111698878, + "loss": 1.6867, + "step": 1765 + }, + { + "epoch": 0.02294836455543229, + "grad_norm": 0.447410523891449, + "learning_rate": 0.0001954509416550774, + "loss": 1.6823, + "step": 1766 + }, + { + "epoch": 0.022961359099348162, + "grad_norm": 0.3630903959274292, + "learning_rate": 0.00019544834219316603, + "loss": 1.456, + "step": 1767 + }, + { + "epoch": 0.022974353643264035, + "grad_norm": 0.37147101759910583, + "learning_rate": 0.00019544574273125466, + "loss": 1.4898, + "step": 1768 + }, + { + "epoch": 0.022987348187179908, + "grad_norm": 0.3553246557712555, + "learning_rate": 0.00019544314326934325, + "loss": 1.3276, + "step": 1769 + }, + { + "epoch": 0.02300034273109578, + "grad_norm": 0.42636096477508545, + "learning_rate": 0.00019544054380743185, + "loss": 1.3457, + "step": 1770 + }, + { + "epoch": 0.023013337275011654, + "grad_norm": 0.35853463411331177, + "learning_rate": 0.0001954379443455205, + "loss": 1.5554, + "step": 1771 + }, + { + "epoch": 0.023026331818927527, + "grad_norm": 0.40227043628692627, + "learning_rate": 0.0001954353448836091, + "loss": 1.5046, + "step": 1772 + }, + { + "epoch": 0.0230393263628434, + "grad_norm": 0.43165484070777893, + "learning_rate": 0.00019543274542169772, + "loss": 1.5448, + "step": 1773 + }, + { + "epoch": 0.023052320906759273, + "grad_norm": 0.40751728415489197, + "learning_rate": 0.00019543014595978632, + "loss": 1.4163, + "step": 1774 + }, + { + "epoch": 0.02306531545067515, + "grad_norm": 0.3644002377986908, + "learning_rate": 0.00019542754649787495, + "loss": 1.3576, + "step": 1775 + }, + { + "epoch": 0.023078309994591022, + "grad_norm": 0.40311771631240845, + "learning_rate": 0.00019542494703596357, + "loss": 1.5754, + "step": 1776 + }, + { + "epoch": 0.023091304538506895, + "grad_norm": 0.2770278751850128, + "learning_rate": 0.00019542234757405217, + "loss": 1.4552, + "step": 1777 + }, + { + "epoch": 0.02310429908242277, + "grad_norm": 0.3735068440437317, + "learning_rate": 0.0001954197481121408, + "loss": 1.411, + "step": 1778 + }, + { + "epoch": 0.02311729362633864, + "grad_norm": 0.40823933482170105, + "learning_rate": 0.00019541714865022942, + "loss": 1.3935, + "step": 1779 + }, + { + "epoch": 0.023130288170254514, + "grad_norm": 0.4334075450897217, + "learning_rate": 0.00019541454918831804, + "loss": 1.5425, + "step": 1780 + }, + { + "epoch": 0.023143282714170387, + "grad_norm": 0.4126792252063751, + "learning_rate": 0.00019541194972640664, + "loss": 1.5586, + "step": 1781 + }, + { + "epoch": 0.02315627725808626, + "grad_norm": 0.39981988072395325, + "learning_rate": 0.00019540935026449526, + "loss": 1.5523, + "step": 1782 + }, + { + "epoch": 0.023169271802002133, + "grad_norm": 0.34040069580078125, + "learning_rate": 0.0001954067508025839, + "loss": 1.3141, + "step": 1783 + }, + { + "epoch": 0.023182266345918006, + "grad_norm": 0.3204672038555145, + "learning_rate": 0.00019540415134067249, + "loss": 1.5524, + "step": 1784 + }, + { + "epoch": 0.023195260889833882, + "grad_norm": 0.394387811422348, + "learning_rate": 0.0001954015518787611, + "loss": 1.3762, + "step": 1785 + }, + { + "epoch": 0.023208255433749755, + "grad_norm": 0.4125596880912781, + "learning_rate": 0.0001953989524168497, + "loss": 1.3622, + "step": 1786 + }, + { + "epoch": 0.02322124997766563, + "grad_norm": 0.35644418001174927, + "learning_rate": 0.00019539635295493833, + "loss": 1.5096, + "step": 1787 + }, + { + "epoch": 0.0232342445215815, + "grad_norm": 0.3932243883609772, + "learning_rate": 0.00019539375349302696, + "loss": 1.3592, + "step": 1788 + }, + { + "epoch": 0.023247239065497374, + "grad_norm": 0.2546967566013336, + "learning_rate": 0.00019539115403111555, + "loss": 1.4257, + "step": 1789 + }, + { + "epoch": 0.023260233609413247, + "grad_norm": 0.3977675139904022, + "learning_rate": 0.00019538855456920418, + "loss": 1.3011, + "step": 1790 + }, + { + "epoch": 0.02327322815332912, + "grad_norm": 0.43489986658096313, + "learning_rate": 0.0001953859551072928, + "loss": 1.6674, + "step": 1791 + }, + { + "epoch": 0.023286222697244993, + "grad_norm": 0.4233644902706146, + "learning_rate": 0.00019538335564538143, + "loss": 1.4111, + "step": 1792 + }, + { + "epoch": 0.023299217241160866, + "grad_norm": 0.41177353262901306, + "learning_rate": 0.00019538075618347002, + "loss": 1.4682, + "step": 1793 + }, + { + "epoch": 0.023312211785076743, + "grad_norm": 0.38274767994880676, + "learning_rate": 0.00019537815672155865, + "loss": 1.3928, + "step": 1794 + }, + { + "epoch": 0.023325206328992616, + "grad_norm": 0.38090863823890686, + "learning_rate": 0.00019537555725964727, + "loss": 1.6614, + "step": 1795 + }, + { + "epoch": 0.02333820087290849, + "grad_norm": 0.35949671268463135, + "learning_rate": 0.00019537295779773587, + "loss": 1.58, + "step": 1796 + }, + { + "epoch": 0.02335119541682436, + "grad_norm": 0.3463672697544098, + "learning_rate": 0.0001953703583358245, + "loss": 1.5728, + "step": 1797 + }, + { + "epoch": 0.023364189960740234, + "grad_norm": 0.4024708569049835, + "learning_rate": 0.00019536775887391312, + "loss": 1.4199, + "step": 1798 + }, + { + "epoch": 0.023377184504656107, + "grad_norm": 0.3992582857608795, + "learning_rate": 0.00019536515941200172, + "loss": 1.4886, + "step": 1799 + }, + { + "epoch": 0.02339017904857198, + "grad_norm": 0.32905566692352295, + "learning_rate": 0.00019536255995009034, + "loss": 1.4478, + "step": 1800 + }, + { + "epoch": 0.023403173592487853, + "grad_norm": 0.5432623028755188, + "learning_rate": 0.00019535996048817894, + "loss": 1.3756, + "step": 1801 + }, + { + "epoch": 0.023416168136403726, + "grad_norm": 0.27649611234664917, + "learning_rate": 0.0001953573610262676, + "loss": 1.3959, + "step": 1802 + }, + { + "epoch": 0.0234291626803196, + "grad_norm": 0.39728596806526184, + "learning_rate": 0.0001953547615643562, + "loss": 1.3592, + "step": 1803 + }, + { + "epoch": 0.023442157224235476, + "grad_norm": 0.3490980267524719, + "learning_rate": 0.0001953521621024448, + "loss": 1.493, + "step": 1804 + }, + { + "epoch": 0.02345515176815135, + "grad_norm": 0.3728959858417511, + "learning_rate": 0.0001953495626405334, + "loss": 1.3163, + "step": 1805 + }, + { + "epoch": 0.02346814631206722, + "grad_norm": 0.3092458248138428, + "learning_rate": 0.00019534696317862203, + "loss": 1.3774, + "step": 1806 + }, + { + "epoch": 0.023481140855983094, + "grad_norm": 0.47971421480178833, + "learning_rate": 0.00019534436371671066, + "loss": 1.5847, + "step": 1807 + }, + { + "epoch": 0.023494135399898967, + "grad_norm": 0.3586556315422058, + "learning_rate": 0.00019534176425479926, + "loss": 1.2791, + "step": 1808 + }, + { + "epoch": 0.02350712994381484, + "grad_norm": 0.4068545699119568, + "learning_rate": 0.00019533916479288788, + "loss": 1.3263, + "step": 1809 + }, + { + "epoch": 0.023520124487730713, + "grad_norm": 0.44036608934402466, + "learning_rate": 0.0001953365653309765, + "loss": 1.5351, + "step": 1810 + }, + { + "epoch": 0.023533119031646586, + "grad_norm": 0.3685401976108551, + "learning_rate": 0.00019533396586906513, + "loss": 1.3645, + "step": 1811 + }, + { + "epoch": 0.02354611357556246, + "grad_norm": 0.4026589095592499, + "learning_rate": 0.00019533136640715373, + "loss": 1.3607, + "step": 1812 + }, + { + "epoch": 0.023559108119478336, + "grad_norm": 0.3264610469341278, + "learning_rate": 0.00019532876694524232, + "loss": 1.492, + "step": 1813 + }, + { + "epoch": 0.02357210266339421, + "grad_norm": 0.38049957156181335, + "learning_rate": 0.00019532616748333098, + "loss": 1.3762, + "step": 1814 + }, + { + "epoch": 0.02358509720731008, + "grad_norm": 0.44767656922340393, + "learning_rate": 0.00019532356802141957, + "loss": 1.5042, + "step": 1815 + }, + { + "epoch": 0.023598091751225955, + "grad_norm": 0.41038426756858826, + "learning_rate": 0.0001953209685595082, + "loss": 1.4511, + "step": 1816 + }, + { + "epoch": 0.023611086295141828, + "grad_norm": 0.45387715101242065, + "learning_rate": 0.0001953183690975968, + "loss": 1.5997, + "step": 1817 + }, + { + "epoch": 0.0236240808390577, + "grad_norm": 0.4338409900665283, + "learning_rate": 0.00019531576963568542, + "loss": 1.5321, + "step": 1818 + }, + { + "epoch": 0.023637075382973573, + "grad_norm": 0.3182096779346466, + "learning_rate": 0.00019531317017377404, + "loss": 1.331, + "step": 1819 + }, + { + "epoch": 0.023650069926889446, + "grad_norm": 0.39733293652534485, + "learning_rate": 0.00019531057071186264, + "loss": 1.3856, + "step": 1820 + }, + { + "epoch": 0.02366306447080532, + "grad_norm": 0.3503945469856262, + "learning_rate": 0.00019530797124995127, + "loss": 1.3635, + "step": 1821 + }, + { + "epoch": 0.023676059014721192, + "grad_norm": 0.3881324231624603, + "learning_rate": 0.0001953053717880399, + "loss": 1.4314, + "step": 1822 + }, + { + "epoch": 0.02368905355863707, + "grad_norm": 0.46018099784851074, + "learning_rate": 0.00019530277232612851, + "loss": 1.532, + "step": 1823 + }, + { + "epoch": 0.02370204810255294, + "grad_norm": 0.3509732782840729, + "learning_rate": 0.0001953001728642171, + "loss": 1.4576, + "step": 1824 + }, + { + "epoch": 0.023715042646468815, + "grad_norm": 0.38438957929611206, + "learning_rate": 0.0001952975734023057, + "loss": 1.4521, + "step": 1825 + }, + { + "epoch": 0.023728037190384688, + "grad_norm": 0.4050957262516022, + "learning_rate": 0.00019529497394039436, + "loss": 1.5061, + "step": 1826 + }, + { + "epoch": 0.02374103173430056, + "grad_norm": 0.33474844694137573, + "learning_rate": 0.00019529237447848296, + "loss": 1.5358, + "step": 1827 + }, + { + "epoch": 0.023754026278216434, + "grad_norm": 0.4549270570278168, + "learning_rate": 0.00019528977501657158, + "loss": 1.3857, + "step": 1828 + }, + { + "epoch": 0.023767020822132306, + "grad_norm": 0.4617195427417755, + "learning_rate": 0.00019528717555466018, + "loss": 1.5747, + "step": 1829 + }, + { + "epoch": 0.02378001536604818, + "grad_norm": 0.3896535634994507, + "learning_rate": 0.0001952845760927488, + "loss": 1.3527, + "step": 1830 + }, + { + "epoch": 0.023793009909964052, + "grad_norm": 0.43055301904678345, + "learning_rate": 0.00019528197663083743, + "loss": 1.5093, + "step": 1831 + }, + { + "epoch": 0.02380600445387993, + "grad_norm": 0.36662182211875916, + "learning_rate": 0.00019527937716892603, + "loss": 1.3762, + "step": 1832 + }, + { + "epoch": 0.023818998997795802, + "grad_norm": 0.2322726547718048, + "learning_rate": 0.00019527677770701465, + "loss": 1.3949, + "step": 1833 + }, + { + "epoch": 0.023831993541711675, + "grad_norm": 0.3672502636909485, + "learning_rate": 0.00019527417824510328, + "loss": 1.4479, + "step": 1834 + }, + { + "epoch": 0.023844988085627548, + "grad_norm": 0.2350219041109085, + "learning_rate": 0.0001952715787831919, + "loss": 1.2864, + "step": 1835 + }, + { + "epoch": 0.02385798262954342, + "grad_norm": 0.4465300738811493, + "learning_rate": 0.0001952689793212805, + "loss": 1.6505, + "step": 1836 + }, + { + "epoch": 0.023870977173459294, + "grad_norm": 0.4024832546710968, + "learning_rate": 0.00019526637985936912, + "loss": 1.5301, + "step": 1837 + }, + { + "epoch": 0.023883971717375167, + "grad_norm": 0.42621225118637085, + "learning_rate": 0.00019526378039745775, + "loss": 1.4636, + "step": 1838 + }, + { + "epoch": 0.02389696626129104, + "grad_norm": 0.5147488713264465, + "learning_rate": 0.00019526118093554634, + "loss": 1.6715, + "step": 1839 + }, + { + "epoch": 0.023909960805206912, + "grad_norm": 0.24608314037322998, + "learning_rate": 0.00019525858147363497, + "loss": 1.3956, + "step": 1840 + }, + { + "epoch": 0.023922955349122785, + "grad_norm": 0.35765427350997925, + "learning_rate": 0.0001952559820117236, + "loss": 1.4464, + "step": 1841 + }, + { + "epoch": 0.023935949893038662, + "grad_norm": 0.4051652252674103, + "learning_rate": 0.0001952533825498122, + "loss": 1.4202, + "step": 1842 + }, + { + "epoch": 0.023948944436954535, + "grad_norm": 0.35005876421928406, + "learning_rate": 0.00019525078308790081, + "loss": 1.6109, + "step": 1843 + }, + { + "epoch": 0.023961938980870408, + "grad_norm": 0.44820502400398254, + "learning_rate": 0.0001952481836259894, + "loss": 1.5186, + "step": 1844 + }, + { + "epoch": 0.02397493352478628, + "grad_norm": 0.5122503042221069, + "learning_rate": 0.00019524558416407806, + "loss": 1.6335, + "step": 1845 + }, + { + "epoch": 0.023987928068702154, + "grad_norm": 0.37109479308128357, + "learning_rate": 0.00019524298470216666, + "loss": 1.3491, + "step": 1846 + }, + { + "epoch": 0.024000922612618027, + "grad_norm": 0.3216516971588135, + "learning_rate": 0.00019524038524025529, + "loss": 1.4127, + "step": 1847 + }, + { + "epoch": 0.0240139171565339, + "grad_norm": 0.35651183128356934, + "learning_rate": 0.00019523778577834388, + "loss": 1.6087, + "step": 1848 + }, + { + "epoch": 0.024026911700449773, + "grad_norm": 0.3415221571922302, + "learning_rate": 0.0001952351863164325, + "loss": 1.4724, + "step": 1849 + }, + { + "epoch": 0.024039906244365646, + "grad_norm": 0.5249360799789429, + "learning_rate": 0.00019523258685452113, + "loss": 1.5401, + "step": 1850 + }, + { + "epoch": 0.024052900788281522, + "grad_norm": 0.40630602836608887, + "learning_rate": 0.00019522998739260973, + "loss": 1.5061, + "step": 1851 + }, + { + "epoch": 0.024065895332197395, + "grad_norm": 0.46728020906448364, + "learning_rate": 0.00019522738793069835, + "loss": 1.5968, + "step": 1852 + }, + { + "epoch": 0.024078889876113268, + "grad_norm": 0.41321757435798645, + "learning_rate": 0.00019522478846878698, + "loss": 1.6464, + "step": 1853 + }, + { + "epoch": 0.02409188442002914, + "grad_norm": 0.3643966019153595, + "learning_rate": 0.00019522218900687558, + "loss": 1.3056, + "step": 1854 + }, + { + "epoch": 0.024104878963945014, + "grad_norm": 0.3538874685764313, + "learning_rate": 0.0001952195895449642, + "loss": 1.4451, + "step": 1855 + }, + { + "epoch": 0.024117873507860887, + "grad_norm": 0.4620124399662018, + "learning_rate": 0.0001952169900830528, + "loss": 1.5543, + "step": 1856 + }, + { + "epoch": 0.02413086805177676, + "grad_norm": 0.3777616322040558, + "learning_rate": 0.00019521439062114145, + "loss": 1.7202, + "step": 1857 + }, + { + "epoch": 0.024143862595692633, + "grad_norm": 0.38026413321495056, + "learning_rate": 0.00019521179115923005, + "loss": 1.6294, + "step": 1858 + }, + { + "epoch": 0.024156857139608506, + "grad_norm": 0.3189028203487396, + "learning_rate": 0.00019520919169731867, + "loss": 1.3578, + "step": 1859 + }, + { + "epoch": 0.02416985168352438, + "grad_norm": 0.3826671540737152, + "learning_rate": 0.00019520659223540727, + "loss": 1.5216, + "step": 1860 + }, + { + "epoch": 0.024182846227440255, + "grad_norm": 0.44067713618278503, + "learning_rate": 0.0001952039927734959, + "loss": 1.6301, + "step": 1861 + }, + { + "epoch": 0.024195840771356128, + "grad_norm": 0.43638303875923157, + "learning_rate": 0.00019520139331158452, + "loss": 1.5364, + "step": 1862 + }, + { + "epoch": 0.024208835315272, + "grad_norm": 0.4482448101043701, + "learning_rate": 0.00019519879384967311, + "loss": 1.4069, + "step": 1863 + }, + { + "epoch": 0.024221829859187874, + "grad_norm": 0.4252583384513855, + "learning_rate": 0.00019519619438776174, + "loss": 1.3853, + "step": 1864 + }, + { + "epoch": 0.024234824403103747, + "grad_norm": 0.40959984064102173, + "learning_rate": 0.00019519359492585036, + "loss": 1.3477, + "step": 1865 + }, + { + "epoch": 0.02424781894701962, + "grad_norm": 0.4321820139884949, + "learning_rate": 0.000195190995463939, + "loss": 1.6339, + "step": 1866 + }, + { + "epoch": 0.024260813490935493, + "grad_norm": 0.37583255767822266, + "learning_rate": 0.00019518839600202759, + "loss": 1.5711, + "step": 1867 + }, + { + "epoch": 0.024273808034851366, + "grad_norm": 0.3513713479042053, + "learning_rate": 0.00019518579654011618, + "loss": 1.416, + "step": 1868 + }, + { + "epoch": 0.02428680257876724, + "grad_norm": 0.4160189628601074, + "learning_rate": 0.00019518319707820483, + "loss": 1.5614, + "step": 1869 + }, + { + "epoch": 0.024299797122683115, + "grad_norm": 0.21549619734287262, + "learning_rate": 0.00019518059761629343, + "loss": 1.4135, + "step": 1870 + }, + { + "epoch": 0.024312791666598988, + "grad_norm": 0.4574928879737854, + "learning_rate": 0.00019517799815438206, + "loss": 1.509, + "step": 1871 + }, + { + "epoch": 0.02432578621051486, + "grad_norm": 0.29601985216140747, + "learning_rate": 0.00019517539869247068, + "loss": 1.4798, + "step": 1872 + }, + { + "epoch": 0.024338780754430734, + "grad_norm": 0.33058714866638184, + "learning_rate": 0.00019517279923055928, + "loss": 1.5106, + "step": 1873 + }, + { + "epoch": 0.024351775298346607, + "grad_norm": 0.553209662437439, + "learning_rate": 0.0001951701997686479, + "loss": 1.6211, + "step": 1874 + }, + { + "epoch": 0.02436476984226248, + "grad_norm": 0.31298884749412537, + "learning_rate": 0.0001951676003067365, + "loss": 1.3592, + "step": 1875 + }, + { + "epoch": 0.024377764386178353, + "grad_norm": 0.3478134870529175, + "learning_rate": 0.00019516500084482515, + "loss": 1.3123, + "step": 1876 + }, + { + "epoch": 0.024390758930094226, + "grad_norm": 0.3965305984020233, + "learning_rate": 0.00019516240138291375, + "loss": 1.6281, + "step": 1877 + }, + { + "epoch": 0.0244037534740101, + "grad_norm": 0.3518056869506836, + "learning_rate": 0.00019515980192100237, + "loss": 1.5344, + "step": 1878 + }, + { + "epoch": 0.024416748017925972, + "grad_norm": 0.42089545726776123, + "learning_rate": 0.00019515720245909097, + "loss": 1.5098, + "step": 1879 + }, + { + "epoch": 0.024429742561841848, + "grad_norm": 0.3613037168979645, + "learning_rate": 0.0001951546029971796, + "loss": 1.6852, + "step": 1880 + }, + { + "epoch": 0.02444273710575772, + "grad_norm": 0.46798157691955566, + "learning_rate": 0.00019515200353526822, + "loss": 1.5023, + "step": 1881 + }, + { + "epoch": 0.024455731649673594, + "grad_norm": 0.3810153603553772, + "learning_rate": 0.00019514940407335682, + "loss": 1.4599, + "step": 1882 + }, + { + "epoch": 0.024468726193589467, + "grad_norm": 0.35787442326545715, + "learning_rate": 0.00019514680461144544, + "loss": 1.3997, + "step": 1883 + }, + { + "epoch": 0.02448172073750534, + "grad_norm": 0.3975280225276947, + "learning_rate": 0.00019514420514953407, + "loss": 1.3629, + "step": 1884 + }, + { + "epoch": 0.024494715281421213, + "grad_norm": 0.36241644620895386, + "learning_rate": 0.00019514160568762266, + "loss": 1.4633, + "step": 1885 + }, + { + "epoch": 0.024507709825337086, + "grad_norm": 0.40416109561920166, + "learning_rate": 0.0001951390062257113, + "loss": 1.3648, + "step": 1886 + }, + { + "epoch": 0.02452070436925296, + "grad_norm": 0.3390207290649414, + "learning_rate": 0.00019513640676379989, + "loss": 1.2775, + "step": 1887 + }, + { + "epoch": 0.024533698913168832, + "grad_norm": 0.37671610713005066, + "learning_rate": 0.00019513380730188854, + "loss": 1.496, + "step": 1888 + }, + { + "epoch": 0.024546693457084708, + "grad_norm": 0.3258741796016693, + "learning_rate": 0.00019513120783997713, + "loss": 1.3034, + "step": 1889 + }, + { + "epoch": 0.02455968800100058, + "grad_norm": 0.3780488967895508, + "learning_rate": 0.00019512860837806576, + "loss": 1.518, + "step": 1890 + }, + { + "epoch": 0.024572682544916454, + "grad_norm": 0.3583645820617676, + "learning_rate": 0.00019512600891615436, + "loss": 1.5031, + "step": 1891 + }, + { + "epoch": 0.024585677088832327, + "grad_norm": 0.28971540927886963, + "learning_rate": 0.00019512340945424298, + "loss": 1.2767, + "step": 1892 + }, + { + "epoch": 0.0245986716327482, + "grad_norm": 0.4159455895423889, + "learning_rate": 0.0001951208099923316, + "loss": 1.4147, + "step": 1893 + }, + { + "epoch": 0.024611666176664073, + "grad_norm": 0.45120856165885925, + "learning_rate": 0.0001951182105304202, + "loss": 1.5091, + "step": 1894 + }, + { + "epoch": 0.024624660720579946, + "grad_norm": 0.2567850649356842, + "learning_rate": 0.00019511561106850883, + "loss": 1.5043, + "step": 1895 + }, + { + "epoch": 0.02463765526449582, + "grad_norm": 0.4277418851852417, + "learning_rate": 0.00019511301160659745, + "loss": 1.5215, + "step": 1896 + }, + { + "epoch": 0.024650649808411692, + "grad_norm": 0.529853343963623, + "learning_rate": 0.00019511041214468605, + "loss": 1.607, + "step": 1897 + }, + { + "epoch": 0.024663644352327565, + "grad_norm": 0.415848970413208, + "learning_rate": 0.00019510781268277467, + "loss": 1.5035, + "step": 1898 + }, + { + "epoch": 0.02467663889624344, + "grad_norm": 0.3474975526332855, + "learning_rate": 0.00019510521322086327, + "loss": 1.4307, + "step": 1899 + }, + { + "epoch": 0.024689633440159314, + "grad_norm": 0.40108722448349, + "learning_rate": 0.00019510261375895192, + "loss": 1.6045, + "step": 1900 + }, + { + "epoch": 0.024702627984075187, + "grad_norm": 0.3662712574005127, + "learning_rate": 0.00019510001429704052, + "loss": 1.5101, + "step": 1901 + }, + { + "epoch": 0.02471562252799106, + "grad_norm": 0.3879152536392212, + "learning_rate": 0.00019509741483512914, + "loss": 1.513, + "step": 1902 + }, + { + "epoch": 0.024728617071906933, + "grad_norm": 0.4358268082141876, + "learning_rate": 0.00019509481537321774, + "loss": 1.4907, + "step": 1903 + }, + { + "epoch": 0.024741611615822806, + "grad_norm": 0.2656150162220001, + "learning_rate": 0.00019509221591130637, + "loss": 1.2582, + "step": 1904 + }, + { + "epoch": 0.02475460615973868, + "grad_norm": 0.3241492509841919, + "learning_rate": 0.000195089616449395, + "loss": 1.4139, + "step": 1905 + }, + { + "epoch": 0.024767600703654552, + "grad_norm": 0.33699047565460205, + "learning_rate": 0.0001950870169874836, + "loss": 1.4947, + "step": 1906 + }, + { + "epoch": 0.024780595247570425, + "grad_norm": 0.3858382999897003, + "learning_rate": 0.0001950844175255722, + "loss": 1.4898, + "step": 1907 + }, + { + "epoch": 0.0247935897914863, + "grad_norm": 0.42251360416412354, + "learning_rate": 0.00019508181806366084, + "loss": 1.5405, + "step": 1908 + }, + { + "epoch": 0.024806584335402174, + "grad_norm": 0.41407859325408936, + "learning_rate": 0.00019507921860174943, + "loss": 1.5812, + "step": 1909 + }, + { + "epoch": 0.024819578879318047, + "grad_norm": 0.4241170287132263, + "learning_rate": 0.00019507661913983806, + "loss": 1.6296, + "step": 1910 + }, + { + "epoch": 0.02483257342323392, + "grad_norm": 0.40611186623573303, + "learning_rate": 0.00019507401967792668, + "loss": 1.5777, + "step": 1911 + }, + { + "epoch": 0.024845567967149793, + "grad_norm": 0.31183817982673645, + "learning_rate": 0.0001950714202160153, + "loss": 1.4478, + "step": 1912 + }, + { + "epoch": 0.024858562511065666, + "grad_norm": 0.310866117477417, + "learning_rate": 0.0001950688207541039, + "loss": 1.393, + "step": 1913 + }, + { + "epoch": 0.02487155705498154, + "grad_norm": 0.3437785804271698, + "learning_rate": 0.00019506622129219253, + "loss": 1.3354, + "step": 1914 + }, + { + "epoch": 0.024884551598897412, + "grad_norm": 0.36962059140205383, + "learning_rate": 0.00019506362183028115, + "loss": 1.7343, + "step": 1915 + }, + { + "epoch": 0.024897546142813285, + "grad_norm": 0.3808988630771637, + "learning_rate": 0.00019506102236836975, + "loss": 1.5288, + "step": 1916 + }, + { + "epoch": 0.024910540686729158, + "grad_norm": 0.4140763282775879, + "learning_rate": 0.00019505842290645838, + "loss": 1.4053, + "step": 1917 + }, + { + "epoch": 0.024923535230645034, + "grad_norm": 0.4288756549358368, + "learning_rate": 0.00019505582344454697, + "loss": 1.499, + "step": 1918 + }, + { + "epoch": 0.024936529774560907, + "grad_norm": 0.3769247531890869, + "learning_rate": 0.00019505322398263562, + "loss": 1.3153, + "step": 1919 + }, + { + "epoch": 0.02494952431847678, + "grad_norm": 0.3923449218273163, + "learning_rate": 0.00019505062452072422, + "loss": 1.3731, + "step": 1920 + }, + { + "epoch": 0.024962518862392653, + "grad_norm": 0.40384578704833984, + "learning_rate": 0.00019504802505881285, + "loss": 1.6362, + "step": 1921 + }, + { + "epoch": 0.024975513406308526, + "grad_norm": 0.25997865200042725, + "learning_rate": 0.00019504542559690144, + "loss": 1.4529, + "step": 1922 + }, + { + "epoch": 0.0249885079502244, + "grad_norm": 0.32743462920188904, + "learning_rate": 0.00019504282613499007, + "loss": 1.318, + "step": 1923 + }, + { + "epoch": 0.025001502494140272, + "grad_norm": 0.4509304463863373, + "learning_rate": 0.0001950402266730787, + "loss": 1.5391, + "step": 1924 + }, + { + "epoch": 0.025014497038056145, + "grad_norm": 0.4460051953792572, + "learning_rate": 0.0001950376272111673, + "loss": 1.6095, + "step": 1925 + }, + { + "epoch": 0.025027491581972018, + "grad_norm": 0.3727940618991852, + "learning_rate": 0.00019503502774925592, + "loss": 1.3623, + "step": 1926 + }, + { + "epoch": 0.025040486125887895, + "grad_norm": 0.4771404564380646, + "learning_rate": 0.00019503242828734454, + "loss": 1.6147, + "step": 1927 + }, + { + "epoch": 0.025053480669803768, + "grad_norm": 0.40584996342658997, + "learning_rate": 0.00019502982882543314, + "loss": 1.5194, + "step": 1928 + }, + { + "epoch": 0.02506647521371964, + "grad_norm": 0.4106747806072235, + "learning_rate": 0.00019502722936352176, + "loss": 1.3099, + "step": 1929 + }, + { + "epoch": 0.025079469757635513, + "grad_norm": 0.4870680868625641, + "learning_rate": 0.00019502462990161036, + "loss": 1.5416, + "step": 1930 + }, + { + "epoch": 0.025092464301551386, + "grad_norm": 0.30915436148643494, + "learning_rate": 0.000195022030439699, + "loss": 1.3029, + "step": 1931 + }, + { + "epoch": 0.02510545884546726, + "grad_norm": 0.30829089879989624, + "learning_rate": 0.0001950194309777876, + "loss": 1.3773, + "step": 1932 + }, + { + "epoch": 0.025118453389383132, + "grad_norm": 0.3524402678012848, + "learning_rate": 0.00019501683151587623, + "loss": 1.4034, + "step": 1933 + }, + { + "epoch": 0.025131447933299005, + "grad_norm": 0.3223772644996643, + "learning_rate": 0.00019501423205396483, + "loss": 1.486, + "step": 1934 + }, + { + "epoch": 0.025144442477214878, + "grad_norm": 0.37729641795158386, + "learning_rate": 0.00019501163259205345, + "loss": 1.5304, + "step": 1935 + }, + { + "epoch": 0.02515743702113075, + "grad_norm": 0.4305570423603058, + "learning_rate": 0.00019500903313014208, + "loss": 1.4949, + "step": 1936 + }, + { + "epoch": 0.025170431565046628, + "grad_norm": 0.28514212369918823, + "learning_rate": 0.00019500643366823068, + "loss": 1.119, + "step": 1937 + }, + { + "epoch": 0.0251834261089625, + "grad_norm": 0.2823854088783264, + "learning_rate": 0.0001950038342063193, + "loss": 1.411, + "step": 1938 + }, + { + "epoch": 0.025196420652878374, + "grad_norm": 0.49034997820854187, + "learning_rate": 0.00019500123474440792, + "loss": 1.6508, + "step": 1939 + }, + { + "epoch": 0.025209415196794246, + "grad_norm": 0.37515679001808167, + "learning_rate": 0.00019499863528249652, + "loss": 1.507, + "step": 1940 + }, + { + "epoch": 0.02522240974071012, + "grad_norm": 0.4184080958366394, + "learning_rate": 0.00019499603582058515, + "loss": 1.4623, + "step": 1941 + }, + { + "epoch": 0.025235404284625992, + "grad_norm": 0.35954853892326355, + "learning_rate": 0.00019499343635867374, + "loss": 1.3605, + "step": 1942 + }, + { + "epoch": 0.025248398828541865, + "grad_norm": 0.39759063720703125, + "learning_rate": 0.0001949908368967624, + "loss": 1.4, + "step": 1943 + }, + { + "epoch": 0.02526139337245774, + "grad_norm": 0.3427615165710449, + "learning_rate": 0.000194988237434851, + "loss": 1.294, + "step": 1944 + }, + { + "epoch": 0.02527438791637361, + "grad_norm": 0.35400789976119995, + "learning_rate": 0.00019498563797293962, + "loss": 1.5946, + "step": 1945 + }, + { + "epoch": 0.025287382460289484, + "grad_norm": 0.3394954800605774, + "learning_rate": 0.00019498303851102824, + "loss": 1.2008, + "step": 1946 + }, + { + "epoch": 0.02530037700420536, + "grad_norm": 0.43507492542266846, + "learning_rate": 0.00019498043904911684, + "loss": 1.4453, + "step": 1947 + }, + { + "epoch": 0.025313371548121234, + "grad_norm": 0.3879145383834839, + "learning_rate": 0.00019497783958720546, + "loss": 1.451, + "step": 1948 + }, + { + "epoch": 0.025326366092037107, + "grad_norm": 0.4290544390678406, + "learning_rate": 0.00019497524012529406, + "loss": 1.5188, + "step": 1949 + }, + { + "epoch": 0.02533936063595298, + "grad_norm": 0.45724618434906006, + "learning_rate": 0.0001949726406633827, + "loss": 1.5062, + "step": 1950 + }, + { + "epoch": 0.025352355179868852, + "grad_norm": 0.3396241068840027, + "learning_rate": 0.0001949700412014713, + "loss": 1.4249, + "step": 1951 + }, + { + "epoch": 0.025365349723784725, + "grad_norm": 0.4424882233142853, + "learning_rate": 0.0001949674417395599, + "loss": 1.5906, + "step": 1952 + }, + { + "epoch": 0.0253783442677006, + "grad_norm": 0.49618202447891235, + "learning_rate": 0.00019496484227764853, + "loss": 1.6788, + "step": 1953 + }, + { + "epoch": 0.02539133881161647, + "grad_norm": 0.37435975670814514, + "learning_rate": 0.00019496224281573716, + "loss": 1.4677, + "step": 1954 + }, + { + "epoch": 0.025404333355532344, + "grad_norm": 0.3808096945285797, + "learning_rate": 0.00019495964335382578, + "loss": 1.5246, + "step": 1955 + }, + { + "epoch": 0.02541732789944822, + "grad_norm": 0.3840477764606476, + "learning_rate": 0.00019495704389191438, + "loss": 1.5503, + "step": 1956 + }, + { + "epoch": 0.025430322443364094, + "grad_norm": 0.3403218388557434, + "learning_rate": 0.000194954444430003, + "loss": 1.3534, + "step": 1957 + }, + { + "epoch": 0.025443316987279967, + "grad_norm": 0.3999299108982086, + "learning_rate": 0.00019495184496809163, + "loss": 1.5026, + "step": 1958 + }, + { + "epoch": 0.02545631153119584, + "grad_norm": 0.3052142858505249, + "learning_rate": 0.00019494924550618022, + "loss": 1.2833, + "step": 1959 + }, + { + "epoch": 0.025469306075111713, + "grad_norm": 0.35920408368110657, + "learning_rate": 0.00019494664604426885, + "loss": 1.5176, + "step": 1960 + }, + { + "epoch": 0.025482300619027586, + "grad_norm": 0.4335194528102875, + "learning_rate": 0.00019494404658235745, + "loss": 1.5712, + "step": 1961 + }, + { + "epoch": 0.02549529516294346, + "grad_norm": 0.31558382511138916, + "learning_rate": 0.0001949414471204461, + "loss": 1.4686, + "step": 1962 + }, + { + "epoch": 0.02550828970685933, + "grad_norm": 0.3751327693462372, + "learning_rate": 0.0001949388476585347, + "loss": 1.3645, + "step": 1963 + }, + { + "epoch": 0.025521284250775204, + "grad_norm": 0.365709513425827, + "learning_rate": 0.0001949362481966233, + "loss": 1.3094, + "step": 1964 + }, + { + "epoch": 0.025534278794691077, + "grad_norm": 0.35730892419815063, + "learning_rate": 0.00019493364873471192, + "loss": 1.4899, + "step": 1965 + }, + { + "epoch": 0.025547273338606954, + "grad_norm": 0.37063705921173096, + "learning_rate": 0.00019493104927280054, + "loss": 1.6645, + "step": 1966 + }, + { + "epoch": 0.025560267882522827, + "grad_norm": 0.4170796275138855, + "learning_rate": 0.00019492844981088917, + "loss": 1.2274, + "step": 1967 + }, + { + "epoch": 0.0255732624264387, + "grad_norm": 0.4714570939540863, + "learning_rate": 0.00019492585034897776, + "loss": 1.5636, + "step": 1968 + }, + { + "epoch": 0.025586256970354573, + "grad_norm": 0.41997700929641724, + "learning_rate": 0.0001949232508870664, + "loss": 1.3946, + "step": 1969 + }, + { + "epoch": 0.025599251514270446, + "grad_norm": 0.3678540587425232, + "learning_rate": 0.000194920651425155, + "loss": 1.48, + "step": 1970 + }, + { + "epoch": 0.02561224605818632, + "grad_norm": 0.39316561818122864, + "learning_rate": 0.0001949180519632436, + "loss": 1.53, + "step": 1971 + }, + { + "epoch": 0.02562524060210219, + "grad_norm": 0.3689624071121216, + "learning_rate": 0.00019491545250133223, + "loss": 1.332, + "step": 1972 + }, + { + "epoch": 0.025638235146018064, + "grad_norm": 0.3830846846103668, + "learning_rate": 0.00019491285303942083, + "loss": 1.4066, + "step": 1973 + }, + { + "epoch": 0.025651229689933937, + "grad_norm": 0.3716714382171631, + "learning_rate": 0.00019491025357750948, + "loss": 1.3653, + "step": 1974 + }, + { + "epoch": 0.025664224233849814, + "grad_norm": 0.3765179216861725, + "learning_rate": 0.00019490765411559808, + "loss": 1.5237, + "step": 1975 + }, + { + "epoch": 0.025677218777765687, + "grad_norm": 0.33990898728370667, + "learning_rate": 0.00019490505465368668, + "loss": 1.3687, + "step": 1976 + }, + { + "epoch": 0.02569021332168156, + "grad_norm": 0.5103486776351929, + "learning_rate": 0.0001949024551917753, + "loss": 1.4904, + "step": 1977 + }, + { + "epoch": 0.025703207865597433, + "grad_norm": 0.49304041266441345, + "learning_rate": 0.00019489985572986393, + "loss": 1.5119, + "step": 1978 + }, + { + "epoch": 0.025716202409513306, + "grad_norm": 0.33284151554107666, + "learning_rate": 0.00019489725626795255, + "loss": 1.4815, + "step": 1979 + }, + { + "epoch": 0.02572919695342918, + "grad_norm": 0.4878380298614502, + "learning_rate": 0.00019489465680604115, + "loss": 1.6747, + "step": 1980 + }, + { + "epoch": 0.02574219149734505, + "grad_norm": 0.607802152633667, + "learning_rate": 0.00019489205734412977, + "loss": 1.5806, + "step": 1981 + }, + { + "epoch": 0.025755186041260925, + "grad_norm": 0.2903883457183838, + "learning_rate": 0.0001948894578822184, + "loss": 1.4749, + "step": 1982 + }, + { + "epoch": 0.025768180585176798, + "grad_norm": 0.28316977620124817, + "learning_rate": 0.000194886858420307, + "loss": 1.4754, + "step": 1983 + }, + { + "epoch": 0.02578117512909267, + "grad_norm": 0.40020421147346497, + "learning_rate": 0.00019488425895839562, + "loss": 1.634, + "step": 1984 + }, + { + "epoch": 0.025794169673008547, + "grad_norm": 0.4239027500152588, + "learning_rate": 0.00019488165949648424, + "loss": 1.6033, + "step": 1985 + }, + { + "epoch": 0.02580716421692442, + "grad_norm": 0.4065403938293457, + "learning_rate": 0.00019487906003457287, + "loss": 1.4784, + "step": 1986 + }, + { + "epoch": 0.025820158760840293, + "grad_norm": 0.46506601572036743, + "learning_rate": 0.00019487646057266147, + "loss": 1.561, + "step": 1987 + }, + { + "epoch": 0.025833153304756166, + "grad_norm": 0.2645089030265808, + "learning_rate": 0.0001948738611107501, + "loss": 1.4561, + "step": 1988 + }, + { + "epoch": 0.02584614784867204, + "grad_norm": 0.34971901774406433, + "learning_rate": 0.00019487126164883872, + "loss": 1.4301, + "step": 1989 + }, + { + "epoch": 0.02585914239258791, + "grad_norm": 0.3946782946586609, + "learning_rate": 0.0001948686621869273, + "loss": 1.5002, + "step": 1990 + }, + { + "epoch": 0.025872136936503785, + "grad_norm": 0.3921728730201721, + "learning_rate": 0.00019486606272501594, + "loss": 1.5428, + "step": 1991 + }, + { + "epoch": 0.025885131480419658, + "grad_norm": 0.4433472454547882, + "learning_rate": 0.00019486346326310453, + "loss": 1.4727, + "step": 1992 + }, + { + "epoch": 0.02589812602433553, + "grad_norm": 0.42046865820884705, + "learning_rate": 0.00019486086380119316, + "loss": 1.5136, + "step": 1993 + }, + { + "epoch": 0.025911120568251407, + "grad_norm": 0.3857981860637665, + "learning_rate": 0.00019485826433928178, + "loss": 1.4629, + "step": 1994 + }, + { + "epoch": 0.02592411511216728, + "grad_norm": 0.43305647373199463, + "learning_rate": 0.00019485566487737038, + "loss": 1.4255, + "step": 1995 + }, + { + "epoch": 0.025937109656083153, + "grad_norm": 0.40418142080307007, + "learning_rate": 0.000194853065415459, + "loss": 1.5196, + "step": 1996 + }, + { + "epoch": 0.025950104199999026, + "grad_norm": 0.3565748333930969, + "learning_rate": 0.00019485046595354763, + "loss": 1.531, + "step": 1997 + }, + { + "epoch": 0.0259630987439149, + "grad_norm": 0.3650085926055908, + "learning_rate": 0.00019484786649163625, + "loss": 1.3057, + "step": 1998 + }, + { + "epoch": 0.025976093287830772, + "grad_norm": 0.3957287073135376, + "learning_rate": 0.00019484526702972485, + "loss": 1.4302, + "step": 1999 + }, + { + "epoch": 0.025989087831746645, + "grad_norm": 0.37043535709381104, + "learning_rate": 0.00019484266756781348, + "loss": 1.525, + "step": 2000 + }, + { + "epoch": 0.026002082375662518, + "grad_norm": 0.37735459208488464, + "learning_rate": 0.0001948400681059021, + "loss": 1.4637, + "step": 2001 + }, + { + "epoch": 0.02601507691957839, + "grad_norm": 0.3438260853290558, + "learning_rate": 0.0001948374686439907, + "loss": 1.5209, + "step": 2002 + }, + { + "epoch": 0.026028071463494264, + "grad_norm": 0.3096390664577484, + "learning_rate": 0.00019483486918207932, + "loss": 1.3843, + "step": 2003 + }, + { + "epoch": 0.02604106600741014, + "grad_norm": 0.38412708044052124, + "learning_rate": 0.00019483226972016792, + "loss": 1.5368, + "step": 2004 + }, + { + "epoch": 0.026054060551326013, + "grad_norm": 0.2826448976993561, + "learning_rate": 0.00019482967025825654, + "loss": 1.3217, + "step": 2005 + }, + { + "epoch": 0.026067055095241886, + "grad_norm": 0.27543365955352783, + "learning_rate": 0.00019482707079634517, + "loss": 1.3478, + "step": 2006 + }, + { + "epoch": 0.02608004963915776, + "grad_norm": 0.4210810363292694, + "learning_rate": 0.00019482447133443377, + "loss": 1.6229, + "step": 2007 + }, + { + "epoch": 0.026093044183073632, + "grad_norm": 0.37372809648513794, + "learning_rate": 0.0001948218718725224, + "loss": 1.5294, + "step": 2008 + }, + { + "epoch": 0.026106038726989505, + "grad_norm": 0.38795676827430725, + "learning_rate": 0.00019481927241061102, + "loss": 1.4597, + "step": 2009 + }, + { + "epoch": 0.026119033270905378, + "grad_norm": 0.49610623717308044, + "learning_rate": 0.00019481667294869964, + "loss": 1.7004, + "step": 2010 + }, + { + "epoch": 0.02613202781482125, + "grad_norm": 0.3665335476398468, + "learning_rate": 0.00019481407348678824, + "loss": 1.4932, + "step": 2011 + }, + { + "epoch": 0.026145022358737124, + "grad_norm": 0.3332715332508087, + "learning_rate": 0.00019481147402487686, + "loss": 1.4019, + "step": 2012 + }, + { + "epoch": 0.026158016902653, + "grad_norm": 0.47080639004707336, + "learning_rate": 0.00019480887456296549, + "loss": 1.5523, + "step": 2013 + }, + { + "epoch": 0.026171011446568873, + "grad_norm": 0.39187318086624146, + "learning_rate": 0.00019480627510105408, + "loss": 1.6874, + "step": 2014 + }, + { + "epoch": 0.026184005990484746, + "grad_norm": 0.3465287685394287, + "learning_rate": 0.0001948036756391427, + "loss": 1.2811, + "step": 2015 + }, + { + "epoch": 0.02619700053440062, + "grad_norm": 0.4644133746623993, + "learning_rate": 0.0001948010761772313, + "loss": 1.5755, + "step": 2016 + }, + { + "epoch": 0.026209995078316492, + "grad_norm": 0.37882065773010254, + "learning_rate": 0.00019479847671531996, + "loss": 1.5025, + "step": 2017 + }, + { + "epoch": 0.026222989622232365, + "grad_norm": 0.31355947256088257, + "learning_rate": 0.00019479587725340855, + "loss": 1.2981, + "step": 2018 + }, + { + "epoch": 0.026235984166148238, + "grad_norm": 0.39792558550834656, + "learning_rate": 0.00019479327779149715, + "loss": 1.4684, + "step": 2019 + }, + { + "epoch": 0.02624897871006411, + "grad_norm": 0.38897544145584106, + "learning_rate": 0.00019479067832958578, + "loss": 1.4391, + "step": 2020 + }, + { + "epoch": 0.026261973253979984, + "grad_norm": 0.33233538269996643, + "learning_rate": 0.0001947880788676744, + "loss": 1.4077, + "step": 2021 + }, + { + "epoch": 0.026274967797895857, + "grad_norm": 0.2888289988040924, + "learning_rate": 0.00019478547940576303, + "loss": 1.1484, + "step": 2022 + }, + { + "epoch": 0.026287962341811733, + "grad_norm": 0.3799711763858795, + "learning_rate": 0.00019478287994385162, + "loss": 1.6634, + "step": 2023 + }, + { + "epoch": 0.026300956885727606, + "grad_norm": 0.3666038513183594, + "learning_rate": 0.00019478028048194025, + "loss": 1.5296, + "step": 2024 + }, + { + "epoch": 0.02631395142964348, + "grad_norm": 0.3440206050872803, + "learning_rate": 0.00019477768102002887, + "loss": 1.6052, + "step": 2025 + }, + { + "epoch": 0.026326945973559352, + "grad_norm": 0.4239407479763031, + "learning_rate": 0.00019477508155811747, + "loss": 1.7206, + "step": 2026 + }, + { + "epoch": 0.026339940517475225, + "grad_norm": 0.37203747034072876, + "learning_rate": 0.0001947724820962061, + "loss": 1.5201, + "step": 2027 + }, + { + "epoch": 0.026352935061391098, + "grad_norm": 0.40613853931427, + "learning_rate": 0.00019476988263429472, + "loss": 1.327, + "step": 2028 + }, + { + "epoch": 0.02636592960530697, + "grad_norm": 0.3439273238182068, + "learning_rate": 0.00019476728317238334, + "loss": 1.4519, + "step": 2029 + }, + { + "epoch": 0.026378924149222844, + "grad_norm": 0.32606256008148193, + "learning_rate": 0.00019476468371047194, + "loss": 1.4166, + "step": 2030 + }, + { + "epoch": 0.026391918693138717, + "grad_norm": 0.41630515456199646, + "learning_rate": 0.00019476208424856054, + "loss": 1.4042, + "step": 2031 + }, + { + "epoch": 0.026404913237054593, + "grad_norm": 0.30288708209991455, + "learning_rate": 0.0001947594847866492, + "loss": 1.3639, + "step": 2032 + }, + { + "epoch": 0.026417907780970466, + "grad_norm": 0.3434416949748993, + "learning_rate": 0.00019475688532473779, + "loss": 1.4555, + "step": 2033 + }, + { + "epoch": 0.02643090232488634, + "grad_norm": 0.4458916485309601, + "learning_rate": 0.0001947542858628264, + "loss": 1.4114, + "step": 2034 + }, + { + "epoch": 0.026443896868802212, + "grad_norm": 0.3650971055030823, + "learning_rate": 0.000194751686400915, + "loss": 1.4294, + "step": 2035 + }, + { + "epoch": 0.026456891412718085, + "grad_norm": 0.4173720180988312, + "learning_rate": 0.00019474908693900363, + "loss": 1.6647, + "step": 2036 + }, + { + "epoch": 0.026469885956633958, + "grad_norm": 0.4249539375305176, + "learning_rate": 0.00019474648747709226, + "loss": 1.6043, + "step": 2037 + }, + { + "epoch": 0.02648288050054983, + "grad_norm": 0.35150468349456787, + "learning_rate": 0.00019474388801518085, + "loss": 1.5264, + "step": 2038 + }, + { + "epoch": 0.026495875044465704, + "grad_norm": 0.3060664236545563, + "learning_rate": 0.00019474128855326948, + "loss": 1.4176, + "step": 2039 + }, + { + "epoch": 0.026508869588381577, + "grad_norm": 0.3721410632133484, + "learning_rate": 0.0001947386890913581, + "loss": 1.5991, + "step": 2040 + }, + { + "epoch": 0.02652186413229745, + "grad_norm": 0.2834204435348511, + "learning_rate": 0.00019473608962944673, + "loss": 1.4996, + "step": 2041 + }, + { + "epoch": 0.026534858676213326, + "grad_norm": 0.3835855722427368, + "learning_rate": 0.00019473349016753533, + "loss": 1.6001, + "step": 2042 + }, + { + "epoch": 0.0265478532201292, + "grad_norm": 0.544060230255127, + "learning_rate": 0.00019473089070562395, + "loss": 1.4033, + "step": 2043 + }, + { + "epoch": 0.026560847764045072, + "grad_norm": 0.4304278790950775, + "learning_rate": 0.00019472829124371257, + "loss": 1.4974, + "step": 2044 + }, + { + "epoch": 0.026573842307960945, + "grad_norm": 0.36693310737609863, + "learning_rate": 0.00019472569178180117, + "loss": 1.4544, + "step": 2045 + }, + { + "epoch": 0.026586836851876818, + "grad_norm": 0.3819798231124878, + "learning_rate": 0.0001947230923198898, + "loss": 1.4423, + "step": 2046 + }, + { + "epoch": 0.02659983139579269, + "grad_norm": 0.43944051861763, + "learning_rate": 0.0001947204928579784, + "loss": 1.7041, + "step": 2047 + }, + { + "epoch": 0.026612825939708564, + "grad_norm": 0.4430011808872223, + "learning_rate": 0.00019471789339606702, + "loss": 1.405, + "step": 2048 + }, + { + "epoch": 0.026625820483624437, + "grad_norm": 0.4401518404483795, + "learning_rate": 0.00019471529393415564, + "loss": 1.6303, + "step": 2049 + }, + { + "epoch": 0.02663881502754031, + "grad_norm": 0.32380321621894836, + "learning_rate": 0.00019471269447224424, + "loss": 1.5584, + "step": 2050 + }, + { + "epoch": 0.026651809571456186, + "grad_norm": 0.3804262578487396, + "learning_rate": 0.00019471009501033286, + "loss": 1.4661, + "step": 2051 + }, + { + "epoch": 0.02666480411537206, + "grad_norm": 0.35763663053512573, + "learning_rate": 0.0001947074955484215, + "loss": 1.2534, + "step": 2052 + }, + { + "epoch": 0.026677798659287932, + "grad_norm": 0.4508529007434845, + "learning_rate": 0.0001947048960865101, + "loss": 1.5882, + "step": 2053 + }, + { + "epoch": 0.026690793203203805, + "grad_norm": 0.3657906651496887, + "learning_rate": 0.0001947022966245987, + "loss": 1.4638, + "step": 2054 + }, + { + "epoch": 0.02670378774711968, + "grad_norm": 0.3245506286621094, + "learning_rate": 0.00019469969716268734, + "loss": 1.3875, + "step": 2055 + }, + { + "epoch": 0.02671678229103555, + "grad_norm": 0.27691057324409485, + "learning_rate": 0.00019469709770077596, + "loss": 1.1873, + "step": 2056 + }, + { + "epoch": 0.026729776834951424, + "grad_norm": 0.29322341084480286, + "learning_rate": 0.00019469449823886456, + "loss": 1.3471, + "step": 2057 + }, + { + "epoch": 0.026742771378867297, + "grad_norm": 0.34089669585227966, + "learning_rate": 0.00019469189877695318, + "loss": 1.3804, + "step": 2058 + }, + { + "epoch": 0.02675576592278317, + "grad_norm": 0.3724285364151001, + "learning_rate": 0.0001946892993150418, + "loss": 1.5864, + "step": 2059 + }, + { + "epoch": 0.026768760466699043, + "grad_norm": 0.3318098783493042, + "learning_rate": 0.0001946866998531304, + "loss": 1.4543, + "step": 2060 + }, + { + "epoch": 0.02678175501061492, + "grad_norm": 0.3071412444114685, + "learning_rate": 0.00019468410039121903, + "loss": 1.249, + "step": 2061 + }, + { + "epoch": 0.026794749554530792, + "grad_norm": 0.31382542848587036, + "learning_rate": 0.00019468150092930763, + "loss": 1.2915, + "step": 2062 + }, + { + "epoch": 0.026807744098446665, + "grad_norm": 0.5058941841125488, + "learning_rate": 0.00019467890146739628, + "loss": 1.5519, + "step": 2063 + }, + { + "epoch": 0.02682073864236254, + "grad_norm": 0.4811553359031677, + "learning_rate": 0.00019467630200548487, + "loss": 1.4646, + "step": 2064 + }, + { + "epoch": 0.02683373318627841, + "grad_norm": 0.3544370234012604, + "learning_rate": 0.0001946737025435735, + "loss": 1.2672, + "step": 2065 + }, + { + "epoch": 0.026846727730194284, + "grad_norm": 0.3728879988193512, + "learning_rate": 0.0001946711030816621, + "loss": 1.4612, + "step": 2066 + }, + { + "epoch": 0.026859722274110157, + "grad_norm": 0.320218563079834, + "learning_rate": 0.00019466850361975072, + "loss": 1.4772, + "step": 2067 + }, + { + "epoch": 0.02687271681802603, + "grad_norm": 0.3651033937931061, + "learning_rate": 0.00019466590415783934, + "loss": 1.3758, + "step": 2068 + }, + { + "epoch": 0.026885711361941903, + "grad_norm": 0.32942336797714233, + "learning_rate": 0.00019466330469592794, + "loss": 1.3695, + "step": 2069 + }, + { + "epoch": 0.02689870590585778, + "grad_norm": 0.3832577168941498, + "learning_rate": 0.00019466070523401657, + "loss": 1.5221, + "step": 2070 + }, + { + "epoch": 0.026911700449773653, + "grad_norm": 0.4433085322380066, + "learning_rate": 0.0001946581057721052, + "loss": 1.3553, + "step": 2071 + }, + { + "epoch": 0.026924694993689526, + "grad_norm": 0.3786352574825287, + "learning_rate": 0.00019465550631019382, + "loss": 1.3309, + "step": 2072 + }, + { + "epoch": 0.0269376895376054, + "grad_norm": 0.22439265251159668, + "learning_rate": 0.0001946529068482824, + "loss": 1.1815, + "step": 2073 + }, + { + "epoch": 0.02695068408152127, + "grad_norm": 0.3922896087169647, + "learning_rate": 0.000194650307386371, + "loss": 1.4499, + "step": 2074 + }, + { + "epoch": 0.026963678625437144, + "grad_norm": 0.43656137585639954, + "learning_rate": 0.00019464770792445966, + "loss": 1.7509, + "step": 2075 + }, + { + "epoch": 0.026976673169353017, + "grad_norm": 0.3852211833000183, + "learning_rate": 0.00019464510846254826, + "loss": 1.3327, + "step": 2076 + }, + { + "epoch": 0.02698966771326889, + "grad_norm": 0.33573904633522034, + "learning_rate": 0.00019464250900063688, + "loss": 1.4515, + "step": 2077 + }, + { + "epoch": 0.027002662257184763, + "grad_norm": 0.32149389386177063, + "learning_rate": 0.00019463990953872548, + "loss": 1.2627, + "step": 2078 + }, + { + "epoch": 0.027015656801100636, + "grad_norm": 0.30632153153419495, + "learning_rate": 0.0001946373100768141, + "loss": 1.0881, + "step": 2079 + }, + { + "epoch": 0.027028651345016513, + "grad_norm": 0.40625059604644775, + "learning_rate": 0.00019463471061490273, + "loss": 1.3931, + "step": 2080 + }, + { + "epoch": 0.027041645888932386, + "grad_norm": 0.4922195076942444, + "learning_rate": 0.00019463211115299133, + "loss": 1.5984, + "step": 2081 + }, + { + "epoch": 0.02705464043284826, + "grad_norm": 0.36153602600097656, + "learning_rate": 0.00019462951169107995, + "loss": 1.1035, + "step": 2082 + }, + { + "epoch": 0.02706763497676413, + "grad_norm": 0.40944454073905945, + "learning_rate": 0.00019462691222916858, + "loss": 1.4335, + "step": 2083 + }, + { + "epoch": 0.027080629520680004, + "grad_norm": 0.3449093699455261, + "learning_rate": 0.0001946243127672572, + "loss": 1.4577, + "step": 2084 + }, + { + "epoch": 0.027093624064595877, + "grad_norm": 0.27790942788124084, + "learning_rate": 0.0001946217133053458, + "loss": 1.2703, + "step": 2085 + }, + { + "epoch": 0.02710661860851175, + "grad_norm": 0.37999653816223145, + "learning_rate": 0.0001946191138434344, + "loss": 1.3875, + "step": 2086 + }, + { + "epoch": 0.027119613152427623, + "grad_norm": 0.38547950983047485, + "learning_rate": 0.00019461651438152305, + "loss": 1.4481, + "step": 2087 + }, + { + "epoch": 0.027132607696343496, + "grad_norm": 0.37379974126815796, + "learning_rate": 0.00019461391491961164, + "loss": 1.3661, + "step": 2088 + }, + { + "epoch": 0.027145602240259373, + "grad_norm": 0.39012619853019714, + "learning_rate": 0.00019461131545770027, + "loss": 1.4049, + "step": 2089 + }, + { + "epoch": 0.027158596784175246, + "grad_norm": 0.46945053339004517, + "learning_rate": 0.00019460871599578887, + "loss": 1.5731, + "step": 2090 + }, + { + "epoch": 0.02717159132809112, + "grad_norm": 0.2752492427825928, + "learning_rate": 0.0001946061165338775, + "loss": 1.4742, + "step": 2091 + }, + { + "epoch": 0.02718458587200699, + "grad_norm": 0.3483410179615021, + "learning_rate": 0.00019460351707196612, + "loss": 1.363, + "step": 2092 + }, + { + "epoch": 0.027197580415922865, + "grad_norm": 0.3654286563396454, + "learning_rate": 0.0001946009176100547, + "loss": 1.6469, + "step": 2093 + }, + { + "epoch": 0.027210574959838738, + "grad_norm": 0.4013746380805969, + "learning_rate": 0.00019459831814814334, + "loss": 1.4361, + "step": 2094 + }, + { + "epoch": 0.02722356950375461, + "grad_norm": 0.33663803339004517, + "learning_rate": 0.00019459571868623196, + "loss": 1.343, + "step": 2095 + }, + { + "epoch": 0.027236564047670483, + "grad_norm": 0.33509793877601624, + "learning_rate": 0.0001945931192243206, + "loss": 1.469, + "step": 2096 + }, + { + "epoch": 0.027249558591586356, + "grad_norm": 0.36451658606529236, + "learning_rate": 0.00019459051976240918, + "loss": 1.4419, + "step": 2097 + }, + { + "epoch": 0.02726255313550223, + "grad_norm": 0.45805126428604126, + "learning_rate": 0.0001945879203004978, + "loss": 1.4376, + "step": 2098 + }, + { + "epoch": 0.027275547679418106, + "grad_norm": 0.3644254505634308, + "learning_rate": 0.00019458532083858643, + "loss": 1.4541, + "step": 2099 + }, + { + "epoch": 0.02728854222333398, + "grad_norm": 0.544037938117981, + "learning_rate": 0.00019458272137667503, + "loss": 1.5757, + "step": 2100 + }, + { + "epoch": 0.02730153676724985, + "grad_norm": 0.434401273727417, + "learning_rate": 0.00019458012191476365, + "loss": 1.5415, + "step": 2101 + }, + { + "epoch": 0.027314531311165725, + "grad_norm": 0.21660558879375458, + "learning_rate": 0.00019457752245285228, + "loss": 1.4633, + "step": 2102 + }, + { + "epoch": 0.027327525855081598, + "grad_norm": 0.5315440893173218, + "learning_rate": 0.00019457492299094088, + "loss": 1.5467, + "step": 2103 + }, + { + "epoch": 0.02734052039899747, + "grad_norm": 0.31882423162460327, + "learning_rate": 0.0001945723235290295, + "loss": 1.3665, + "step": 2104 + }, + { + "epoch": 0.027353514942913344, + "grad_norm": 0.3948776125907898, + "learning_rate": 0.0001945697240671181, + "loss": 1.477, + "step": 2105 + }, + { + "epoch": 0.027366509486829216, + "grad_norm": 0.3557848036289215, + "learning_rate": 0.00019456712460520675, + "loss": 1.4599, + "step": 2106 + }, + { + "epoch": 0.02737950403074509, + "grad_norm": 0.3059878945350647, + "learning_rate": 0.00019456452514329535, + "loss": 1.4175, + "step": 2107 + }, + { + "epoch": 0.027392498574660966, + "grad_norm": 0.38770779967308044, + "learning_rate": 0.00019456192568138397, + "loss": 1.5319, + "step": 2108 + }, + { + "epoch": 0.02740549311857684, + "grad_norm": 0.4011865258216858, + "learning_rate": 0.00019455932621947257, + "loss": 1.3311, + "step": 2109 + }, + { + "epoch": 0.027418487662492712, + "grad_norm": 0.2876737713813782, + "learning_rate": 0.0001945567267575612, + "loss": 1.1813, + "step": 2110 + }, + { + "epoch": 0.027431482206408585, + "grad_norm": 0.339067280292511, + "learning_rate": 0.00019455412729564982, + "loss": 1.2076, + "step": 2111 + }, + { + "epoch": 0.027444476750324458, + "grad_norm": 0.43654006719589233, + "learning_rate": 0.00019455152783373842, + "loss": 1.5403, + "step": 2112 + }, + { + "epoch": 0.02745747129424033, + "grad_norm": 0.4195444881916046, + "learning_rate": 0.00019454892837182704, + "loss": 1.5803, + "step": 2113 + }, + { + "epoch": 0.027470465838156204, + "grad_norm": 0.37961822748184204, + "learning_rate": 0.00019454632890991566, + "loss": 1.3663, + "step": 2114 + }, + { + "epoch": 0.027483460382072077, + "grad_norm": 0.46763333678245544, + "learning_rate": 0.00019454372944800426, + "loss": 1.6148, + "step": 2115 + }, + { + "epoch": 0.02749645492598795, + "grad_norm": 0.3583020567893982, + "learning_rate": 0.0001945411299860929, + "loss": 1.4051, + "step": 2116 + }, + { + "epoch": 0.027509449469903823, + "grad_norm": 0.3380371928215027, + "learning_rate": 0.00019453853052418148, + "loss": 1.2766, + "step": 2117 + }, + { + "epoch": 0.0275224440138197, + "grad_norm": 0.3549646735191345, + "learning_rate": 0.00019453593106227014, + "loss": 1.4812, + "step": 2118 + }, + { + "epoch": 0.027535438557735572, + "grad_norm": 0.39473992586135864, + "learning_rate": 0.00019453333160035873, + "loss": 1.4351, + "step": 2119 + }, + { + "epoch": 0.027548433101651445, + "grad_norm": 0.3086841106414795, + "learning_rate": 0.00019453073213844736, + "loss": 1.498, + "step": 2120 + }, + { + "epoch": 0.027561427645567318, + "grad_norm": 0.3433244526386261, + "learning_rate": 0.00019452813267653595, + "loss": 1.6794, + "step": 2121 + }, + { + "epoch": 0.02757442218948319, + "grad_norm": 0.3646318316459656, + "learning_rate": 0.00019452553321462458, + "loss": 1.4646, + "step": 2122 + }, + { + "epoch": 0.027587416733399064, + "grad_norm": 0.36260178685188293, + "learning_rate": 0.0001945229337527132, + "loss": 1.6132, + "step": 2123 + }, + { + "epoch": 0.027600411277314937, + "grad_norm": 0.3652835488319397, + "learning_rate": 0.0001945203342908018, + "loss": 1.2324, + "step": 2124 + }, + { + "epoch": 0.02761340582123081, + "grad_norm": 0.3084219992160797, + "learning_rate": 0.00019451773482889043, + "loss": 1.2231, + "step": 2125 + }, + { + "epoch": 0.027626400365146683, + "grad_norm": 0.36105242371559143, + "learning_rate": 0.00019451513536697905, + "loss": 1.3947, + "step": 2126 + }, + { + "epoch": 0.02763939490906256, + "grad_norm": 0.6619263887405396, + "learning_rate": 0.00019451253590506767, + "loss": 1.456, + "step": 2127 + }, + { + "epoch": 0.027652389452978432, + "grad_norm": 0.3887653052806854, + "learning_rate": 0.00019450993644315627, + "loss": 1.6393, + "step": 2128 + }, + { + "epoch": 0.027665383996894305, + "grad_norm": 0.32822638750076294, + "learning_rate": 0.00019450733698124487, + "loss": 1.3381, + "step": 2129 + }, + { + "epoch": 0.027678378540810178, + "grad_norm": 0.3243531584739685, + "learning_rate": 0.00019450473751933352, + "loss": 1.2432, + "step": 2130 + }, + { + "epoch": 0.02769137308472605, + "grad_norm": 0.38900068402290344, + "learning_rate": 0.00019450213805742212, + "loss": 1.6306, + "step": 2131 + }, + { + "epoch": 0.027704367628641924, + "grad_norm": 0.39967986941337585, + "learning_rate": 0.00019449953859551074, + "loss": 1.5413, + "step": 2132 + }, + { + "epoch": 0.027717362172557797, + "grad_norm": 0.3535040616989136, + "learning_rate": 0.00019449693913359937, + "loss": 1.4364, + "step": 2133 + }, + { + "epoch": 0.02773035671647367, + "grad_norm": 0.34658634662628174, + "learning_rate": 0.00019449433967168796, + "loss": 1.341, + "step": 2134 + }, + { + "epoch": 0.027743351260389543, + "grad_norm": 0.34029027819633484, + "learning_rate": 0.0001944917402097766, + "loss": 1.609, + "step": 2135 + }, + { + "epoch": 0.027756345804305416, + "grad_norm": 0.354667603969574, + "learning_rate": 0.0001944891407478652, + "loss": 1.3905, + "step": 2136 + }, + { + "epoch": 0.027769340348221292, + "grad_norm": 0.31113141775131226, + "learning_rate": 0.00019448654128595384, + "loss": 1.5166, + "step": 2137 + }, + { + "epoch": 0.027782334892137165, + "grad_norm": 0.3507639467716217, + "learning_rate": 0.00019448394182404244, + "loss": 1.4244, + "step": 2138 + }, + { + "epoch": 0.027795329436053038, + "grad_norm": 0.3752739727497101, + "learning_rate": 0.00019448134236213106, + "loss": 1.3482, + "step": 2139 + }, + { + "epoch": 0.02780832397996891, + "grad_norm": 0.4519732892513275, + "learning_rate": 0.00019447874290021966, + "loss": 1.578, + "step": 2140 + }, + { + "epoch": 0.027821318523884784, + "grad_norm": 0.4155772924423218, + "learning_rate": 0.00019447614343830828, + "loss": 1.4331, + "step": 2141 + }, + { + "epoch": 0.027834313067800657, + "grad_norm": 0.34380027651786804, + "learning_rate": 0.0001944735439763969, + "loss": 1.2967, + "step": 2142 + }, + { + "epoch": 0.02784730761171653, + "grad_norm": 0.4142053425312042, + "learning_rate": 0.0001944709445144855, + "loss": 1.4169, + "step": 2143 + }, + { + "epoch": 0.027860302155632403, + "grad_norm": 0.45165592432022095, + "learning_rate": 0.00019446834505257413, + "loss": 1.617, + "step": 2144 + }, + { + "epoch": 0.027873296699548276, + "grad_norm": 0.36684224009513855, + "learning_rate": 0.00019446574559066275, + "loss": 1.3973, + "step": 2145 + }, + { + "epoch": 0.027886291243464152, + "grad_norm": 0.352329820394516, + "learning_rate": 0.00019446314612875135, + "loss": 1.3445, + "step": 2146 + }, + { + "epoch": 0.027899285787380025, + "grad_norm": 0.3258627653121948, + "learning_rate": 0.00019446054666683997, + "loss": 1.3411, + "step": 2147 + }, + { + "epoch": 0.027912280331295898, + "grad_norm": 0.4311891198158264, + "learning_rate": 0.00019445794720492857, + "loss": 1.3768, + "step": 2148 + }, + { + "epoch": 0.02792527487521177, + "grad_norm": 0.33797669410705566, + "learning_rate": 0.00019445534774301722, + "loss": 1.4377, + "step": 2149 + }, + { + "epoch": 0.027938269419127644, + "grad_norm": 0.44946396350860596, + "learning_rate": 0.00019445274828110582, + "loss": 1.2503, + "step": 2150 + }, + { + "epoch": 0.027951263963043517, + "grad_norm": 0.395925372838974, + "learning_rate": 0.00019445014881919445, + "loss": 1.6571, + "step": 2151 + }, + { + "epoch": 0.02796425850695939, + "grad_norm": 0.3170452117919922, + "learning_rate": 0.00019444754935728304, + "loss": 1.5332, + "step": 2152 + }, + { + "epoch": 0.027977253050875263, + "grad_norm": 0.3092959523200989, + "learning_rate": 0.00019444494989537167, + "loss": 1.4253, + "step": 2153 + }, + { + "epoch": 0.027990247594791136, + "grad_norm": 0.44351643323898315, + "learning_rate": 0.0001944423504334603, + "loss": 1.6285, + "step": 2154 + }, + { + "epoch": 0.02800324213870701, + "grad_norm": 0.6147965788841248, + "learning_rate": 0.0001944397509715489, + "loss": 1.5539, + "step": 2155 + }, + { + "epoch": 0.028016236682622885, + "grad_norm": 0.42599916458129883, + "learning_rate": 0.0001944371515096375, + "loss": 1.3832, + "step": 2156 + }, + { + "epoch": 0.028029231226538758, + "grad_norm": 0.4279272258281708, + "learning_rate": 0.00019443455204772614, + "loss": 1.5309, + "step": 2157 + }, + { + "epoch": 0.02804222577045463, + "grad_norm": 0.3968813717365265, + "learning_rate": 0.00019443195258581474, + "loss": 1.5024, + "step": 2158 + }, + { + "epoch": 0.028055220314370504, + "grad_norm": 0.4060969352722168, + "learning_rate": 0.00019442935312390336, + "loss": 1.3264, + "step": 2159 + }, + { + "epoch": 0.028068214858286377, + "grad_norm": 0.29550233483314514, + "learning_rate": 0.00019442675366199196, + "loss": 1.3736, + "step": 2160 + }, + { + "epoch": 0.02808120940220225, + "grad_norm": 0.327310711145401, + "learning_rate": 0.0001944241542000806, + "loss": 1.4457, + "step": 2161 + }, + { + "epoch": 0.028094203946118123, + "grad_norm": 0.2951511740684509, + "learning_rate": 0.0001944215547381692, + "loss": 1.4906, + "step": 2162 + }, + { + "epoch": 0.028107198490033996, + "grad_norm": 0.46167200803756714, + "learning_rate": 0.00019441895527625783, + "loss": 1.5882, + "step": 2163 + }, + { + "epoch": 0.02812019303394987, + "grad_norm": 0.36212441325187683, + "learning_rate": 0.00019441635581434643, + "loss": 1.3374, + "step": 2164 + }, + { + "epoch": 0.028133187577865745, + "grad_norm": 0.4009593427181244, + "learning_rate": 0.00019441375635243505, + "loss": 1.3603, + "step": 2165 + }, + { + "epoch": 0.02814618212178162, + "grad_norm": 0.45525768399238586, + "learning_rate": 0.00019441115689052368, + "loss": 1.4289, + "step": 2166 + }, + { + "epoch": 0.02815917666569749, + "grad_norm": 0.3185199499130249, + "learning_rate": 0.00019440855742861227, + "loss": 1.4702, + "step": 2167 + }, + { + "epoch": 0.028172171209613364, + "grad_norm": 0.428203821182251, + "learning_rate": 0.0001944059579667009, + "loss": 1.4742, + "step": 2168 + }, + { + "epoch": 0.028185165753529237, + "grad_norm": 0.4030996561050415, + "learning_rate": 0.00019440335850478952, + "loss": 1.4453, + "step": 2169 + }, + { + "epoch": 0.02819816029744511, + "grad_norm": 0.33252498507499695, + "learning_rate": 0.00019440075904287812, + "loss": 1.501, + "step": 2170 + }, + { + "epoch": 0.028211154841360983, + "grad_norm": 0.35301727056503296, + "learning_rate": 0.00019439815958096675, + "loss": 1.4448, + "step": 2171 + }, + { + "epoch": 0.028224149385276856, + "grad_norm": 0.5131669044494629, + "learning_rate": 0.00019439556011905537, + "loss": 1.5222, + "step": 2172 + }, + { + "epoch": 0.02823714392919273, + "grad_norm": 0.4534998834133148, + "learning_rate": 0.000194392960657144, + "loss": 1.3583, + "step": 2173 + }, + { + "epoch": 0.028250138473108602, + "grad_norm": 0.3587299585342407, + "learning_rate": 0.0001943903611952326, + "loss": 1.3828, + "step": 2174 + }, + { + "epoch": 0.02826313301702448, + "grad_norm": 0.2546916604042053, + "learning_rate": 0.00019438776173332122, + "loss": 1.3169, + "step": 2175 + }, + { + "epoch": 0.02827612756094035, + "grad_norm": 0.3397449851036072, + "learning_rate": 0.00019438516227140984, + "loss": 1.3687, + "step": 2176 + }, + { + "epoch": 0.028289122104856224, + "grad_norm": 0.38214564323425293, + "learning_rate": 0.00019438256280949844, + "loss": 1.4583, + "step": 2177 + }, + { + "epoch": 0.028302116648772097, + "grad_norm": 0.3576605021953583, + "learning_rate": 0.00019437996334758706, + "loss": 1.3904, + "step": 2178 + }, + { + "epoch": 0.02831511119268797, + "grad_norm": 0.4208417534828186, + "learning_rate": 0.00019437736388567566, + "loss": 1.2989, + "step": 2179 + }, + { + "epoch": 0.028328105736603843, + "grad_norm": 0.33130720257759094, + "learning_rate": 0.0001943747644237643, + "loss": 1.3052, + "step": 2180 + }, + { + "epoch": 0.028341100280519716, + "grad_norm": 0.4892352223396301, + "learning_rate": 0.0001943721649618529, + "loss": 1.5524, + "step": 2181 + }, + { + "epoch": 0.02835409482443559, + "grad_norm": 0.46355971693992615, + "learning_rate": 0.0001943695654999415, + "loss": 1.6365, + "step": 2182 + }, + { + "epoch": 0.028367089368351462, + "grad_norm": 0.3080877959728241, + "learning_rate": 0.00019436696603803013, + "loss": 1.3449, + "step": 2183 + }, + { + "epoch": 0.02838008391226734, + "grad_norm": 0.34956327080726624, + "learning_rate": 0.00019436436657611876, + "loss": 1.4234, + "step": 2184 + }, + { + "epoch": 0.02839307845618321, + "grad_norm": 0.35542523860931396, + "learning_rate": 0.00019436176711420738, + "loss": 1.4423, + "step": 2185 + }, + { + "epoch": 0.028406073000099084, + "grad_norm": 0.4644271433353424, + "learning_rate": 0.00019435916765229598, + "loss": 1.481, + "step": 2186 + }, + { + "epoch": 0.028419067544014957, + "grad_norm": 0.4230409264564514, + "learning_rate": 0.0001943565681903846, + "loss": 1.5347, + "step": 2187 + }, + { + "epoch": 0.02843206208793083, + "grad_norm": 0.39285415410995483, + "learning_rate": 0.00019435396872847323, + "loss": 1.4178, + "step": 2188 + }, + { + "epoch": 0.028445056631846703, + "grad_norm": 0.41373884677886963, + "learning_rate": 0.00019435136926656182, + "loss": 1.47, + "step": 2189 + }, + { + "epoch": 0.028458051175762576, + "grad_norm": 0.4094409644603729, + "learning_rate": 0.00019434876980465045, + "loss": 1.5291, + "step": 2190 + }, + { + "epoch": 0.02847104571967845, + "grad_norm": 0.4535164535045624, + "learning_rate": 0.00019434617034273905, + "loss": 1.4188, + "step": 2191 + }, + { + "epoch": 0.028484040263594322, + "grad_norm": 0.36925461888313293, + "learning_rate": 0.0001943435708808277, + "loss": 1.4442, + "step": 2192 + }, + { + "epoch": 0.028497034807510195, + "grad_norm": 0.38038572669029236, + "learning_rate": 0.0001943409714189163, + "loss": 1.4644, + "step": 2193 + }, + { + "epoch": 0.02851002935142607, + "grad_norm": 0.3370724320411682, + "learning_rate": 0.00019433837195700492, + "loss": 1.2589, + "step": 2194 + }, + { + "epoch": 0.028523023895341944, + "grad_norm": 0.36902526021003723, + "learning_rate": 0.00019433577249509352, + "loss": 1.726, + "step": 2195 + }, + { + "epoch": 0.028536018439257817, + "grad_norm": 0.40908193588256836, + "learning_rate": 0.00019433317303318214, + "loss": 1.4635, + "step": 2196 + }, + { + "epoch": 0.02854901298317369, + "grad_norm": 0.36590951681137085, + "learning_rate": 0.00019433057357127077, + "loss": 1.4601, + "step": 2197 + }, + { + "epoch": 0.028562007527089563, + "grad_norm": 0.37208840250968933, + "learning_rate": 0.00019432797410935936, + "loss": 1.6144, + "step": 2198 + }, + { + "epoch": 0.028575002071005436, + "grad_norm": 0.35629284381866455, + "learning_rate": 0.000194325374647448, + "loss": 1.4232, + "step": 2199 + }, + { + "epoch": 0.02858799661492131, + "grad_norm": 0.4063398241996765, + "learning_rate": 0.0001943227751855366, + "loss": 1.5298, + "step": 2200 + }, + { + "epoch": 0.028600991158837182, + "grad_norm": 0.47579383850097656, + "learning_rate": 0.0001943201757236252, + "loss": 1.431, + "step": 2201 + }, + { + "epoch": 0.028613985702753055, + "grad_norm": 0.3551229238510132, + "learning_rate": 0.00019431757626171383, + "loss": 1.4623, + "step": 2202 + }, + { + "epoch": 0.02862698024666893, + "grad_norm": 0.3218754827976227, + "learning_rate": 0.00019431497679980243, + "loss": 1.5658, + "step": 2203 + }, + { + "epoch": 0.028639974790584805, + "grad_norm": 0.360439658164978, + "learning_rate": 0.00019431237733789108, + "loss": 1.3714, + "step": 2204 + }, + { + "epoch": 0.028652969334500678, + "grad_norm": 0.3467865586280823, + "learning_rate": 0.00019430977787597968, + "loss": 1.4742, + "step": 2205 + }, + { + "epoch": 0.02866596387841655, + "grad_norm": 0.33811742067337036, + "learning_rate": 0.0001943071784140683, + "loss": 1.33, + "step": 2206 + }, + { + "epoch": 0.028678958422332423, + "grad_norm": 0.41617485880851746, + "learning_rate": 0.00019430457895215693, + "loss": 1.5135, + "step": 2207 + }, + { + "epoch": 0.028691952966248296, + "grad_norm": 0.3327701985836029, + "learning_rate": 0.00019430197949024553, + "loss": 1.5446, + "step": 2208 + }, + { + "epoch": 0.02870494751016417, + "grad_norm": 0.3984528183937073, + "learning_rate": 0.00019429938002833415, + "loss": 1.3622, + "step": 2209 + }, + { + "epoch": 0.028717942054080042, + "grad_norm": 0.4298560321331024, + "learning_rate": 0.00019429678056642275, + "loss": 1.4283, + "step": 2210 + }, + { + "epoch": 0.028730936597995915, + "grad_norm": 0.35935840010643005, + "learning_rate": 0.00019429418110451137, + "loss": 1.4038, + "step": 2211 + }, + { + "epoch": 0.028743931141911788, + "grad_norm": 0.4088001549243927, + "learning_rate": 0.0001942915816426, + "loss": 1.6949, + "step": 2212 + }, + { + "epoch": 0.028756925685827665, + "grad_norm": 0.3564993739128113, + "learning_rate": 0.0001942889821806886, + "loss": 1.3841, + "step": 2213 + }, + { + "epoch": 0.028769920229743538, + "grad_norm": 0.27074694633483887, + "learning_rate": 0.00019428638271877722, + "loss": 1.231, + "step": 2214 + }, + { + "epoch": 0.02878291477365941, + "grad_norm": 0.3868809640407562, + "learning_rate": 0.00019428378325686584, + "loss": 1.4815, + "step": 2215 + }, + { + "epoch": 0.028795909317575284, + "grad_norm": 0.43486249446868896, + "learning_rate": 0.00019428118379495447, + "loss": 1.4355, + "step": 2216 + }, + { + "epoch": 0.028808903861491156, + "grad_norm": 0.5228997468948364, + "learning_rate": 0.00019427858433304306, + "loss": 1.5785, + "step": 2217 + }, + { + "epoch": 0.02882189840540703, + "grad_norm": 0.4311128854751587, + "learning_rate": 0.0001942759848711317, + "loss": 1.5378, + "step": 2218 + }, + { + "epoch": 0.028834892949322902, + "grad_norm": 0.36703139543533325, + "learning_rate": 0.00019427338540922031, + "loss": 1.4434, + "step": 2219 + }, + { + "epoch": 0.028847887493238775, + "grad_norm": 0.39816349744796753, + "learning_rate": 0.0001942707859473089, + "loss": 1.3951, + "step": 2220 + }, + { + "epoch": 0.02886088203715465, + "grad_norm": 0.4139325022697449, + "learning_rate": 0.00019426818648539754, + "loss": 1.591, + "step": 2221 + }, + { + "epoch": 0.028873876581070525, + "grad_norm": 0.4262898862361908, + "learning_rate": 0.00019426558702348613, + "loss": 1.4914, + "step": 2222 + }, + { + "epoch": 0.028886871124986398, + "grad_norm": 0.42953887581825256, + "learning_rate": 0.00019426298756157478, + "loss": 1.5241, + "step": 2223 + }, + { + "epoch": 0.02889986566890227, + "grad_norm": 0.7413663864135742, + "learning_rate": 0.00019426038809966338, + "loss": 1.455, + "step": 2224 + }, + { + "epoch": 0.028912860212818144, + "grad_norm": 0.35395124554634094, + "learning_rate": 0.00019425778863775198, + "loss": 1.5257, + "step": 2225 + }, + { + "epoch": 0.028925854756734017, + "grad_norm": 0.4739680290222168, + "learning_rate": 0.0001942551891758406, + "loss": 1.5969, + "step": 2226 + }, + { + "epoch": 0.02893884930064989, + "grad_norm": 0.3732389509677887, + "learning_rate": 0.00019425258971392923, + "loss": 1.4742, + "step": 2227 + }, + { + "epoch": 0.028951843844565762, + "grad_norm": 0.40652787685394287, + "learning_rate": 0.00019424999025201785, + "loss": 1.3275, + "step": 2228 + }, + { + "epoch": 0.028964838388481635, + "grad_norm": 0.3962986469268799, + "learning_rate": 0.00019424739079010645, + "loss": 1.6153, + "step": 2229 + }, + { + "epoch": 0.02897783293239751, + "grad_norm": 0.37969836592674255, + "learning_rate": 0.00019424479132819507, + "loss": 1.5725, + "step": 2230 + }, + { + "epoch": 0.02899082747631338, + "grad_norm": 0.6805177927017212, + "learning_rate": 0.0001942421918662837, + "loss": 1.4578, + "step": 2231 + }, + { + "epoch": 0.029003822020229258, + "grad_norm": 0.4655977189540863, + "learning_rate": 0.0001942395924043723, + "loss": 1.3158, + "step": 2232 + }, + { + "epoch": 0.02901681656414513, + "grad_norm": 0.5238486528396606, + "learning_rate": 0.00019423699294246092, + "loss": 1.4599, + "step": 2233 + }, + { + "epoch": 0.029029811108061004, + "grad_norm": 0.4091179072856903, + "learning_rate": 0.00019423439348054952, + "loss": 1.5138, + "step": 2234 + }, + { + "epoch": 0.029042805651976877, + "grad_norm": 0.3676810562610626, + "learning_rate": 0.00019423179401863817, + "loss": 1.6341, + "step": 2235 + }, + { + "epoch": 0.02905580019589275, + "grad_norm": 0.37930363416671753, + "learning_rate": 0.00019422919455672677, + "loss": 1.4647, + "step": 2236 + }, + { + "epoch": 0.029068794739808623, + "grad_norm": 0.3974941372871399, + "learning_rate": 0.00019422659509481536, + "loss": 1.6024, + "step": 2237 + }, + { + "epoch": 0.029081789283724496, + "grad_norm": 0.39573827385902405, + "learning_rate": 0.000194223995632904, + "loss": 1.4298, + "step": 2238 + }, + { + "epoch": 0.02909478382764037, + "grad_norm": 0.3913654685020447, + "learning_rate": 0.00019422139617099261, + "loss": 1.3641, + "step": 2239 + }, + { + "epoch": 0.02910777837155624, + "grad_norm": 0.3153485059738159, + "learning_rate": 0.00019421879670908124, + "loss": 1.2573, + "step": 2240 + }, + { + "epoch": 0.029120772915472114, + "grad_norm": 0.3856097161769867, + "learning_rate": 0.00019421619724716984, + "loss": 1.3456, + "step": 2241 + }, + { + "epoch": 0.02913376745938799, + "grad_norm": 0.308120995759964, + "learning_rate": 0.00019421359778525846, + "loss": 1.3074, + "step": 2242 + }, + { + "epoch": 0.029146762003303864, + "grad_norm": 0.37055110931396484, + "learning_rate": 0.00019421099832334708, + "loss": 1.5545, + "step": 2243 + }, + { + "epoch": 0.029159756547219737, + "grad_norm": 0.3175742030143738, + "learning_rate": 0.00019420839886143568, + "loss": 1.1888, + "step": 2244 + }, + { + "epoch": 0.02917275109113561, + "grad_norm": 0.32111236453056335, + "learning_rate": 0.0001942057993995243, + "loss": 1.6643, + "step": 2245 + }, + { + "epoch": 0.029185745635051483, + "grad_norm": 0.3519747853279114, + "learning_rate": 0.00019420319993761293, + "loss": 1.4373, + "step": 2246 + }, + { + "epoch": 0.029198740178967356, + "grad_norm": 0.3781045973300934, + "learning_rate": 0.00019420060047570156, + "loss": 1.3688, + "step": 2247 + }, + { + "epoch": 0.02921173472288323, + "grad_norm": 0.3909277617931366, + "learning_rate": 0.00019419800101379015, + "loss": 1.4453, + "step": 2248 + }, + { + "epoch": 0.0292247292667991, + "grad_norm": 0.41048064827919006, + "learning_rate": 0.00019419540155187878, + "loss": 1.3133, + "step": 2249 + }, + { + "epoch": 0.029237723810714975, + "grad_norm": 0.3009323477745056, + "learning_rate": 0.0001941928020899674, + "loss": 1.5612, + "step": 2250 + }, + { + "epoch": 0.02925071835463085, + "grad_norm": 0.4818994998931885, + "learning_rate": 0.000194190202628056, + "loss": 1.4417, + "step": 2251 + }, + { + "epoch": 0.029263712898546724, + "grad_norm": 0.46832817792892456, + "learning_rate": 0.00019418760316614462, + "loss": 1.5761, + "step": 2252 + }, + { + "epoch": 0.029276707442462597, + "grad_norm": 0.41898638010025024, + "learning_rate": 0.00019418500370423322, + "loss": 1.4214, + "step": 2253 + }, + { + "epoch": 0.02928970198637847, + "grad_norm": 0.3925115466117859, + "learning_rate": 0.00019418240424232185, + "loss": 1.5422, + "step": 2254 + }, + { + "epoch": 0.029302696530294343, + "grad_norm": 0.5036033987998962, + "learning_rate": 0.00019417980478041047, + "loss": 1.5198, + "step": 2255 + }, + { + "epoch": 0.029315691074210216, + "grad_norm": 0.34338614344596863, + "learning_rate": 0.00019417720531849907, + "loss": 1.4765, + "step": 2256 + }, + { + "epoch": 0.02932868561812609, + "grad_norm": 0.4155566394329071, + "learning_rate": 0.0001941746058565877, + "loss": 1.5128, + "step": 2257 + }, + { + "epoch": 0.02934168016204196, + "grad_norm": 0.3929993212223053, + "learning_rate": 0.00019417200639467632, + "loss": 1.4375, + "step": 2258 + }, + { + "epoch": 0.029354674705957835, + "grad_norm": 0.26651692390441895, + "learning_rate": 0.00019416940693276494, + "loss": 1.458, + "step": 2259 + }, + { + "epoch": 0.029367669249873708, + "grad_norm": 0.41451215744018555, + "learning_rate": 0.00019416680747085354, + "loss": 1.538, + "step": 2260 + }, + { + "epoch": 0.029380663793789584, + "grad_norm": 0.3747888207435608, + "learning_rate": 0.00019416420800894216, + "loss": 1.386, + "step": 2261 + }, + { + "epoch": 0.029393658337705457, + "grad_norm": 0.29739704728126526, + "learning_rate": 0.0001941616085470308, + "loss": 1.3246, + "step": 2262 + }, + { + "epoch": 0.02940665288162133, + "grad_norm": 0.49290746450424194, + "learning_rate": 0.00019415900908511938, + "loss": 1.407, + "step": 2263 + }, + { + "epoch": 0.029419647425537203, + "grad_norm": 0.28033435344696045, + "learning_rate": 0.000194156409623208, + "loss": 1.367, + "step": 2264 + }, + { + "epoch": 0.029432641969453076, + "grad_norm": 0.37781044840812683, + "learning_rate": 0.0001941538101612966, + "loss": 1.4717, + "step": 2265 + }, + { + "epoch": 0.02944563651336895, + "grad_norm": 0.40272921323776245, + "learning_rate": 0.00019415121069938523, + "loss": 1.3663, + "step": 2266 + }, + { + "epoch": 0.02945863105728482, + "grad_norm": 0.3746371567249298, + "learning_rate": 0.00019414861123747386, + "loss": 1.2733, + "step": 2267 + }, + { + "epoch": 0.029471625601200695, + "grad_norm": 0.3725739121437073, + "learning_rate": 0.00019414601177556245, + "loss": 1.4286, + "step": 2268 + }, + { + "epoch": 0.029484620145116568, + "grad_norm": 0.35759153962135315, + "learning_rate": 0.00019414341231365108, + "loss": 1.6504, + "step": 2269 + }, + { + "epoch": 0.029497614689032444, + "grad_norm": 0.37699389457702637, + "learning_rate": 0.0001941408128517397, + "loss": 1.5343, + "step": 2270 + }, + { + "epoch": 0.029510609232948317, + "grad_norm": 0.4187336564064026, + "learning_rate": 0.00019413821338982833, + "loss": 1.6269, + "step": 2271 + }, + { + "epoch": 0.02952360377686419, + "grad_norm": 0.46491625905036926, + "learning_rate": 0.00019413561392791692, + "loss": 1.692, + "step": 2272 + }, + { + "epoch": 0.029536598320780063, + "grad_norm": 0.3756006062030792, + "learning_rate": 0.00019413301446600555, + "loss": 1.5247, + "step": 2273 + }, + { + "epoch": 0.029549592864695936, + "grad_norm": 0.354198157787323, + "learning_rate": 0.00019413041500409417, + "loss": 1.4363, + "step": 2274 + }, + { + "epoch": 0.02956258740861181, + "grad_norm": 0.3795511722564697, + "learning_rate": 0.00019412781554218277, + "loss": 1.4965, + "step": 2275 + }, + { + "epoch": 0.029575581952527682, + "grad_norm": 0.37611326575279236, + "learning_rate": 0.0001941252160802714, + "loss": 1.4589, + "step": 2276 + }, + { + "epoch": 0.029588576496443555, + "grad_norm": 0.5443653464317322, + "learning_rate": 0.00019412261661836, + "loss": 1.4941, + "step": 2277 + }, + { + "epoch": 0.029601571040359428, + "grad_norm": 0.3725961744785309, + "learning_rate": 0.00019412001715644864, + "loss": 1.3932, + "step": 2278 + }, + { + "epoch": 0.0296145655842753, + "grad_norm": 0.3796549141407013, + "learning_rate": 0.00019411741769453724, + "loss": 1.3333, + "step": 2279 + }, + { + "epoch": 0.029627560128191177, + "grad_norm": 0.3841266334056854, + "learning_rate": 0.00019411481823262584, + "loss": 1.5165, + "step": 2280 + }, + { + "epoch": 0.02964055467210705, + "grad_norm": 0.3486854135990143, + "learning_rate": 0.0001941122187707145, + "loss": 1.2011, + "step": 2281 + }, + { + "epoch": 0.029653549216022923, + "grad_norm": 0.4347458481788635, + "learning_rate": 0.0001941096193088031, + "loss": 1.5416, + "step": 2282 + }, + { + "epoch": 0.029666543759938796, + "grad_norm": 0.3523584306240082, + "learning_rate": 0.0001941070198468917, + "loss": 1.5624, + "step": 2283 + }, + { + "epoch": 0.02967953830385467, + "grad_norm": 0.4369819462299347, + "learning_rate": 0.0001941044203849803, + "loss": 1.4357, + "step": 2284 + }, + { + "epoch": 0.029692532847770542, + "grad_norm": 0.47636428475379944, + "learning_rate": 0.00019410182092306893, + "loss": 1.6214, + "step": 2285 + }, + { + "epoch": 0.029705527391686415, + "grad_norm": 0.42865052819252014, + "learning_rate": 0.00019409922146115756, + "loss": 1.6789, + "step": 2286 + }, + { + "epoch": 0.029718521935602288, + "grad_norm": 0.3979128897190094, + "learning_rate": 0.00019409662199924616, + "loss": 1.4674, + "step": 2287 + }, + { + "epoch": 0.02973151647951816, + "grad_norm": 0.30139976739883423, + "learning_rate": 0.00019409402253733478, + "loss": 1.4787, + "step": 2288 + }, + { + "epoch": 0.029744511023434037, + "grad_norm": 0.38433244824409485, + "learning_rate": 0.0001940914230754234, + "loss": 1.3779, + "step": 2289 + }, + { + "epoch": 0.02975750556734991, + "grad_norm": 0.3885755240917206, + "learning_rate": 0.00019408882361351203, + "loss": 1.4913, + "step": 2290 + }, + { + "epoch": 0.029770500111265783, + "grad_norm": 0.35155102610588074, + "learning_rate": 0.00019408622415160063, + "loss": 1.4257, + "step": 2291 + }, + { + "epoch": 0.029783494655181656, + "grad_norm": 0.3547854423522949, + "learning_rate": 0.00019408362468968922, + "loss": 1.388, + "step": 2292 + }, + { + "epoch": 0.02979648919909753, + "grad_norm": 0.3712790310382843, + "learning_rate": 0.00019408102522777788, + "loss": 1.5405, + "step": 2293 + }, + { + "epoch": 0.029809483743013402, + "grad_norm": 0.3487582504749298, + "learning_rate": 0.00019407842576586647, + "loss": 1.3696, + "step": 2294 + }, + { + "epoch": 0.029822478286929275, + "grad_norm": 0.36178717017173767, + "learning_rate": 0.0001940758263039551, + "loss": 1.5208, + "step": 2295 + }, + { + "epoch": 0.029835472830845148, + "grad_norm": 0.4386369585990906, + "learning_rate": 0.0001940732268420437, + "loss": 1.381, + "step": 2296 + }, + { + "epoch": 0.02984846737476102, + "grad_norm": 0.36633753776550293, + "learning_rate": 0.00019407062738013232, + "loss": 1.4501, + "step": 2297 + }, + { + "epoch": 0.029861461918676894, + "grad_norm": 0.37941429018974304, + "learning_rate": 0.00019406802791822094, + "loss": 1.5131, + "step": 2298 + }, + { + "epoch": 0.02987445646259277, + "grad_norm": 0.3331683278083801, + "learning_rate": 0.00019406542845630954, + "loss": 1.3258, + "step": 2299 + }, + { + "epoch": 0.029887451006508643, + "grad_norm": 0.3746281862258911, + "learning_rate": 0.00019406282899439817, + "loss": 1.5902, + "step": 2300 + }, + { + "epoch": 0.029900445550424516, + "grad_norm": 0.41005784273147583, + "learning_rate": 0.0001940602295324868, + "loss": 1.5298, + "step": 2301 + }, + { + "epoch": 0.02991344009434039, + "grad_norm": 0.3656270503997803, + "learning_rate": 0.00019405763007057541, + "loss": 1.425, + "step": 2302 + }, + { + "epoch": 0.029926434638256262, + "grad_norm": 0.32205793261528015, + "learning_rate": 0.000194055030608664, + "loss": 1.5017, + "step": 2303 + }, + { + "epoch": 0.029939429182172135, + "grad_norm": 0.37533479928970337, + "learning_rate": 0.0001940524311467526, + "loss": 1.4854, + "step": 2304 + }, + { + "epoch": 0.029952423726088008, + "grad_norm": 0.364859014749527, + "learning_rate": 0.00019404983168484126, + "loss": 1.6089, + "step": 2305 + }, + { + "epoch": 0.02996541827000388, + "grad_norm": 0.4076573848724365, + "learning_rate": 0.00019404723222292986, + "loss": 1.5191, + "step": 2306 + }, + { + "epoch": 0.029978412813919754, + "grad_norm": 0.6191604137420654, + "learning_rate": 0.00019404463276101848, + "loss": 1.4567, + "step": 2307 + }, + { + "epoch": 0.02999140735783563, + "grad_norm": 0.4787641167640686, + "learning_rate": 0.00019404203329910708, + "loss": 1.5462, + "step": 2308 + }, + { + "epoch": 0.030004401901751503, + "grad_norm": 0.4141702651977539, + "learning_rate": 0.0001940394338371957, + "loss": 1.5666, + "step": 2309 + }, + { + "epoch": 0.030017396445667376, + "grad_norm": 0.4188869893550873, + "learning_rate": 0.00019403683437528433, + "loss": 1.5711, + "step": 2310 + }, + { + "epoch": 0.03003039098958325, + "grad_norm": 0.46232324838638306, + "learning_rate": 0.00019403423491337293, + "loss": 1.4781, + "step": 2311 + }, + { + "epoch": 0.030043385533499122, + "grad_norm": 0.39016687870025635, + "learning_rate": 0.00019403163545146155, + "loss": 1.5628, + "step": 2312 + }, + { + "epoch": 0.030056380077414995, + "grad_norm": 0.4115760922431946, + "learning_rate": 0.00019402903598955018, + "loss": 1.5846, + "step": 2313 + }, + { + "epoch": 0.030069374621330868, + "grad_norm": 0.38630566000938416, + "learning_rate": 0.0001940264365276388, + "loss": 1.4349, + "step": 2314 + }, + { + "epoch": 0.03008236916524674, + "grad_norm": 0.39953503012657166, + "learning_rate": 0.0001940238370657274, + "loss": 1.556, + "step": 2315 + }, + { + "epoch": 0.030095363709162614, + "grad_norm": 0.3413309156894684, + "learning_rate": 0.00019402123760381602, + "loss": 1.4068, + "step": 2316 + }, + { + "epoch": 0.030108358253078487, + "grad_norm": 0.43236130475997925, + "learning_rate": 0.00019401863814190465, + "loss": 1.5038, + "step": 2317 + }, + { + "epoch": 0.030121352796994363, + "grad_norm": 0.42266950011253357, + "learning_rate": 0.00019401603867999324, + "loss": 1.6838, + "step": 2318 + }, + { + "epoch": 0.030134347340910236, + "grad_norm": 0.385187029838562, + "learning_rate": 0.00019401343921808187, + "loss": 1.3582, + "step": 2319 + }, + { + "epoch": 0.03014734188482611, + "grad_norm": 0.3673193156719208, + "learning_rate": 0.0001940108397561705, + "loss": 1.1196, + "step": 2320 + }, + { + "epoch": 0.030160336428741982, + "grad_norm": 0.333638072013855, + "learning_rate": 0.0001940082402942591, + "loss": 1.616, + "step": 2321 + }, + { + "epoch": 0.030173330972657855, + "grad_norm": 0.34829479455947876, + "learning_rate": 0.00019400564083234771, + "loss": 1.541, + "step": 2322 + }, + { + "epoch": 0.030186325516573728, + "grad_norm": 0.4145924150943756, + "learning_rate": 0.0001940030413704363, + "loss": 1.6013, + "step": 2323 + }, + { + "epoch": 0.0301993200604896, + "grad_norm": 0.3042910397052765, + "learning_rate": 0.00019400044190852496, + "loss": 1.4189, + "step": 2324 + }, + { + "epoch": 0.030212314604405474, + "grad_norm": 0.4206189215183258, + "learning_rate": 0.00019399784244661356, + "loss": 1.4251, + "step": 2325 + }, + { + "epoch": 0.030225309148321347, + "grad_norm": 0.4113405644893646, + "learning_rate": 0.00019399524298470219, + "loss": 1.4679, + "step": 2326 + }, + { + "epoch": 0.030238303692237224, + "grad_norm": 0.4340955913066864, + "learning_rate": 0.00019399264352279078, + "loss": 1.4749, + "step": 2327 + }, + { + "epoch": 0.030251298236153096, + "grad_norm": 0.39315587282180786, + "learning_rate": 0.0001939900440608794, + "loss": 1.4331, + "step": 2328 + }, + { + "epoch": 0.03026429278006897, + "grad_norm": 0.39607760310173035, + "learning_rate": 0.00019398744459896803, + "loss": 1.6217, + "step": 2329 + }, + { + "epoch": 0.030277287323984842, + "grad_norm": 0.41378265619277954, + "learning_rate": 0.00019398484513705663, + "loss": 1.3429, + "step": 2330 + }, + { + "epoch": 0.030290281867900715, + "grad_norm": 0.3606838583946228, + "learning_rate": 0.00019398224567514525, + "loss": 1.5375, + "step": 2331 + }, + { + "epoch": 0.03030327641181659, + "grad_norm": 0.5275381803512573, + "learning_rate": 0.00019397964621323388, + "loss": 1.5635, + "step": 2332 + }, + { + "epoch": 0.03031627095573246, + "grad_norm": 0.2677003741264343, + "learning_rate": 0.0001939770467513225, + "loss": 1.213, + "step": 2333 + }, + { + "epoch": 0.030329265499648334, + "grad_norm": 0.3109504282474518, + "learning_rate": 0.0001939744472894111, + "loss": 1.2838, + "step": 2334 + }, + { + "epoch": 0.030342260043564207, + "grad_norm": 0.4251733720302582, + "learning_rate": 0.0001939718478274997, + "loss": 1.6817, + "step": 2335 + }, + { + "epoch": 0.03035525458748008, + "grad_norm": 0.4478415846824646, + "learning_rate": 0.00019396924836558835, + "loss": 1.4814, + "step": 2336 + }, + { + "epoch": 0.030368249131395957, + "grad_norm": 0.42527344822883606, + "learning_rate": 0.00019396664890367695, + "loss": 1.3118, + "step": 2337 + }, + { + "epoch": 0.03038124367531183, + "grad_norm": 0.44282853603363037, + "learning_rate": 0.00019396404944176557, + "loss": 1.6291, + "step": 2338 + }, + { + "epoch": 0.030394238219227702, + "grad_norm": 0.4052993655204773, + "learning_rate": 0.00019396144997985417, + "loss": 1.3936, + "step": 2339 + }, + { + "epoch": 0.030407232763143575, + "grad_norm": 0.3035629987716675, + "learning_rate": 0.0001939588505179428, + "loss": 1.3489, + "step": 2340 + }, + { + "epoch": 0.03042022730705945, + "grad_norm": 0.41601982712745667, + "learning_rate": 0.00019395625105603142, + "loss": 1.3657, + "step": 2341 + }, + { + "epoch": 0.03043322185097532, + "grad_norm": 0.37834465503692627, + "learning_rate": 0.00019395365159412001, + "loss": 1.2281, + "step": 2342 + }, + { + "epoch": 0.030446216394891194, + "grad_norm": 0.27632567286491394, + "learning_rate": 0.00019395105213220864, + "loss": 1.3653, + "step": 2343 + }, + { + "epoch": 0.030459210938807067, + "grad_norm": 0.4285818934440613, + "learning_rate": 0.00019394845267029726, + "loss": 1.3668, + "step": 2344 + }, + { + "epoch": 0.03047220548272294, + "grad_norm": 0.39858153462409973, + "learning_rate": 0.0001939458532083859, + "loss": 1.5637, + "step": 2345 + }, + { + "epoch": 0.030485200026638817, + "grad_norm": 0.40720924735069275, + "learning_rate": 0.00019394325374647449, + "loss": 1.5489, + "step": 2346 + }, + { + "epoch": 0.03049819457055469, + "grad_norm": 0.3333108723163605, + "learning_rate": 0.00019394065428456308, + "loss": 1.2593, + "step": 2347 + }, + { + "epoch": 0.030511189114470563, + "grad_norm": 0.3936966359615326, + "learning_rate": 0.00019393805482265173, + "loss": 1.4027, + "step": 2348 + }, + { + "epoch": 0.030524183658386436, + "grad_norm": 0.3745768368244171, + "learning_rate": 0.00019393545536074033, + "loss": 1.3695, + "step": 2349 + }, + { + "epoch": 0.03053717820230231, + "grad_norm": 0.357470840215683, + "learning_rate": 0.00019393285589882896, + "loss": 1.4901, + "step": 2350 + }, + { + "epoch": 0.03055017274621818, + "grad_norm": 0.4375747740268707, + "learning_rate": 0.00019393025643691755, + "loss": 1.499, + "step": 2351 + }, + { + "epoch": 0.030563167290134054, + "grad_norm": 0.40660470724105835, + "learning_rate": 0.00019392765697500618, + "loss": 1.3784, + "step": 2352 + }, + { + "epoch": 0.030576161834049927, + "grad_norm": 0.38084301352500916, + "learning_rate": 0.0001939250575130948, + "loss": 1.466, + "step": 2353 + }, + { + "epoch": 0.0305891563779658, + "grad_norm": 0.3898766040802002, + "learning_rate": 0.0001939224580511834, + "loss": 1.4877, + "step": 2354 + }, + { + "epoch": 0.030602150921881673, + "grad_norm": 0.5616552233695984, + "learning_rate": 0.00019391985858927205, + "loss": 1.7029, + "step": 2355 + }, + { + "epoch": 0.03061514546579755, + "grad_norm": 0.3264707028865814, + "learning_rate": 0.00019391725912736065, + "loss": 1.282, + "step": 2356 + }, + { + "epoch": 0.030628140009713423, + "grad_norm": 0.41132742166519165, + "learning_rate": 0.00019391465966544927, + "loss": 1.6009, + "step": 2357 + }, + { + "epoch": 0.030641134553629296, + "grad_norm": 0.3672133982181549, + "learning_rate": 0.00019391206020353787, + "loss": 1.4757, + "step": 2358 + }, + { + "epoch": 0.03065412909754517, + "grad_norm": 0.400573194026947, + "learning_rate": 0.0001939094607416265, + "loss": 1.4037, + "step": 2359 + }, + { + "epoch": 0.03066712364146104, + "grad_norm": 0.3660773038864136, + "learning_rate": 0.00019390686127971512, + "loss": 1.4139, + "step": 2360 + }, + { + "epoch": 0.030680118185376914, + "grad_norm": 0.38157641887664795, + "learning_rate": 0.00019390426181780372, + "loss": 1.4627, + "step": 2361 + }, + { + "epoch": 0.030693112729292787, + "grad_norm": 0.4337979853153229, + "learning_rate": 0.00019390166235589234, + "loss": 1.6154, + "step": 2362 + }, + { + "epoch": 0.03070610727320866, + "grad_norm": 0.3866462707519531, + "learning_rate": 0.00019389906289398097, + "loss": 1.5673, + "step": 2363 + }, + { + "epoch": 0.030719101817124533, + "grad_norm": 0.39175018668174744, + "learning_rate": 0.00019389646343206956, + "loss": 1.507, + "step": 2364 + }, + { + "epoch": 0.03073209636104041, + "grad_norm": 0.36119142174720764, + "learning_rate": 0.0001938938639701582, + "loss": 1.6573, + "step": 2365 + }, + { + "epoch": 0.030745090904956283, + "grad_norm": 0.40435540676116943, + "learning_rate": 0.00019389126450824678, + "loss": 1.4668, + "step": 2366 + }, + { + "epoch": 0.030758085448872156, + "grad_norm": 0.4037638306617737, + "learning_rate": 0.00019388866504633544, + "loss": 1.1692, + "step": 2367 + }, + { + "epoch": 0.03077107999278803, + "grad_norm": 0.4345413148403168, + "learning_rate": 0.00019388606558442403, + "loss": 1.5138, + "step": 2368 + }, + { + "epoch": 0.0307840745367039, + "grad_norm": 0.4004787802696228, + "learning_rate": 0.00019388346612251266, + "loss": 1.6534, + "step": 2369 + }, + { + "epoch": 0.030797069080619775, + "grad_norm": 0.3963693678379059, + "learning_rate": 0.00019388086666060126, + "loss": 1.4003, + "step": 2370 + }, + { + "epoch": 0.030810063624535648, + "grad_norm": 0.3286566138267517, + "learning_rate": 0.00019387826719868988, + "loss": 1.5337, + "step": 2371 + }, + { + "epoch": 0.03082305816845152, + "grad_norm": 0.31235527992248535, + "learning_rate": 0.0001938756677367785, + "loss": 1.3039, + "step": 2372 + }, + { + "epoch": 0.030836052712367393, + "grad_norm": 0.39822515845298767, + "learning_rate": 0.0001938730682748671, + "loss": 1.3909, + "step": 2373 + }, + { + "epoch": 0.030849047256283266, + "grad_norm": 0.4359123706817627, + "learning_rate": 0.00019387046881295573, + "loss": 1.4659, + "step": 2374 + }, + { + "epoch": 0.030862041800199143, + "grad_norm": 0.34830644726753235, + "learning_rate": 0.00019386786935104435, + "loss": 1.4766, + "step": 2375 + }, + { + "epoch": 0.030875036344115016, + "grad_norm": 0.42656728625297546, + "learning_rate": 0.00019386526988913295, + "loss": 1.4657, + "step": 2376 + }, + { + "epoch": 0.03088803088803089, + "grad_norm": 0.4401993155479431, + "learning_rate": 0.00019386267042722157, + "loss": 1.4028, + "step": 2377 + }, + { + "epoch": 0.03090102543194676, + "grad_norm": 0.37953370809555054, + "learning_rate": 0.00019386007096531017, + "loss": 1.3901, + "step": 2378 + }, + { + "epoch": 0.030914019975862635, + "grad_norm": 0.47079914808273315, + "learning_rate": 0.00019385747150339882, + "loss": 1.5176, + "step": 2379 + }, + { + "epoch": 0.030927014519778508, + "grad_norm": 0.39718374609947205, + "learning_rate": 0.00019385487204148742, + "loss": 1.4507, + "step": 2380 + }, + { + "epoch": 0.03094000906369438, + "grad_norm": 0.42498964071273804, + "learning_rate": 0.00019385227257957604, + "loss": 1.6366, + "step": 2381 + }, + { + "epoch": 0.030953003607610254, + "grad_norm": 0.4166160821914673, + "learning_rate": 0.00019384967311766464, + "loss": 1.6349, + "step": 2382 + }, + { + "epoch": 0.030965998151526127, + "grad_norm": 0.3568093180656433, + "learning_rate": 0.00019384707365575327, + "loss": 1.4379, + "step": 2383 + }, + { + "epoch": 0.030978992695442003, + "grad_norm": 0.41537830233573914, + "learning_rate": 0.0001938444741938419, + "loss": 1.5079, + "step": 2384 + }, + { + "epoch": 0.030991987239357876, + "grad_norm": 0.32123616337776184, + "learning_rate": 0.0001938418747319305, + "loss": 1.3794, + "step": 2385 + }, + { + "epoch": 0.03100498178327375, + "grad_norm": 0.3980812132358551, + "learning_rate": 0.0001938392752700191, + "loss": 1.4997, + "step": 2386 + }, + { + "epoch": 0.031017976327189622, + "grad_norm": 0.359386682510376, + "learning_rate": 0.00019383667580810774, + "loss": 1.5401, + "step": 2387 + }, + { + "epoch": 0.031030970871105495, + "grad_norm": 0.30126532912254333, + "learning_rate": 0.00019383407634619633, + "loss": 1.4162, + "step": 2388 + }, + { + "epoch": 0.031043965415021368, + "grad_norm": 0.2829444408416748, + "learning_rate": 0.00019383147688428496, + "loss": 1.5548, + "step": 2389 + }, + { + "epoch": 0.03105695995893724, + "grad_norm": 0.49797871708869934, + "learning_rate": 0.00019382887742237356, + "loss": 1.5038, + "step": 2390 + }, + { + "epoch": 0.031069954502853114, + "grad_norm": 0.3714198172092438, + "learning_rate": 0.0001938262779604622, + "loss": 1.4442, + "step": 2391 + }, + { + "epoch": 0.031082949046768987, + "grad_norm": 0.3066392242908478, + "learning_rate": 0.0001938236784985508, + "loss": 1.4485, + "step": 2392 + }, + { + "epoch": 0.03109594359068486, + "grad_norm": 0.4711465835571289, + "learning_rate": 0.00019382107903663943, + "loss": 1.4427, + "step": 2393 + }, + { + "epoch": 0.031108938134600736, + "grad_norm": 0.4103511869907379, + "learning_rate": 0.00019381847957472805, + "loss": 1.653, + "step": 2394 + }, + { + "epoch": 0.03112193267851661, + "grad_norm": 0.3232939541339874, + "learning_rate": 0.00019381588011281665, + "loss": 1.4108, + "step": 2395 + }, + { + "epoch": 0.031134927222432482, + "grad_norm": 0.3427291214466095, + "learning_rate": 0.00019381328065090528, + "loss": 1.5509, + "step": 2396 + }, + { + "epoch": 0.031147921766348355, + "grad_norm": 0.36916622519493103, + "learning_rate": 0.00019381068118899387, + "loss": 1.5009, + "step": 2397 + }, + { + "epoch": 0.031160916310264228, + "grad_norm": 0.38322317600250244, + "learning_rate": 0.00019380808172708252, + "loss": 1.36, + "step": 2398 + }, + { + "epoch": 0.0311739108541801, + "grad_norm": 0.5019955039024353, + "learning_rate": 0.00019380548226517112, + "loss": 1.4435, + "step": 2399 + }, + { + "epoch": 0.031186905398095974, + "grad_norm": 0.4351155161857605, + "learning_rate": 0.00019380288280325975, + "loss": 1.5429, + "step": 2400 + }, + { + "epoch": 0.031199899942011847, + "grad_norm": 0.3205655813217163, + "learning_rate": 0.00019380028334134834, + "loss": 1.558, + "step": 2401 + }, + { + "epoch": 0.03121289448592772, + "grad_norm": 0.4249109625816345, + "learning_rate": 0.00019379768387943697, + "loss": 1.3846, + "step": 2402 + }, + { + "epoch": 0.031225889029843596, + "grad_norm": 0.43097853660583496, + "learning_rate": 0.0001937950844175256, + "loss": 1.73, + "step": 2403 + }, + { + "epoch": 0.03123888357375947, + "grad_norm": 0.3964199125766754, + "learning_rate": 0.0001937924849556142, + "loss": 1.4712, + "step": 2404 + }, + { + "epoch": 0.03125187811767534, + "grad_norm": 0.40434059500694275, + "learning_rate": 0.00019378988549370281, + "loss": 1.4685, + "step": 2405 + }, + { + "epoch": 0.03126487266159121, + "grad_norm": 0.330955445766449, + "learning_rate": 0.00019378728603179144, + "loss": 1.3936, + "step": 2406 + }, + { + "epoch": 0.031277867205507084, + "grad_norm": 0.36570027470588684, + "learning_rate": 0.00019378468656988004, + "loss": 1.5513, + "step": 2407 + }, + { + "epoch": 0.031290861749422964, + "grad_norm": 0.31739145517349243, + "learning_rate": 0.00019378208710796866, + "loss": 1.5095, + "step": 2408 + }, + { + "epoch": 0.03130385629333884, + "grad_norm": 0.3888550102710724, + "learning_rate": 0.00019377948764605726, + "loss": 1.3702, + "step": 2409 + }, + { + "epoch": 0.03131685083725471, + "grad_norm": 0.3065565824508667, + "learning_rate": 0.0001937768881841459, + "loss": 1.3797, + "step": 2410 + }, + { + "epoch": 0.03132984538117058, + "grad_norm": 0.2960432767868042, + "learning_rate": 0.0001937742887222345, + "loss": 1.3027, + "step": 2411 + }, + { + "epoch": 0.031342839925086456, + "grad_norm": 0.3159444034099579, + "learning_rate": 0.00019377168926032313, + "loss": 1.3071, + "step": 2412 + }, + { + "epoch": 0.03135583446900233, + "grad_norm": 0.41638821363449097, + "learning_rate": 0.00019376908979841173, + "loss": 1.5492, + "step": 2413 + }, + { + "epoch": 0.0313688290129182, + "grad_norm": 0.42141085863113403, + "learning_rate": 0.00019376649033650035, + "loss": 1.4054, + "step": 2414 + }, + { + "epoch": 0.031381823556834075, + "grad_norm": 0.43707937002182007, + "learning_rate": 0.00019376389087458898, + "loss": 1.4755, + "step": 2415 + }, + { + "epoch": 0.03139481810074995, + "grad_norm": 0.3669463098049164, + "learning_rate": 0.00019376129141267758, + "loss": 1.4415, + "step": 2416 + }, + { + "epoch": 0.03140781264466582, + "grad_norm": 0.3679063022136688, + "learning_rate": 0.0001937586919507662, + "loss": 1.4681, + "step": 2417 + }, + { + "epoch": 0.031420807188581694, + "grad_norm": 0.33323216438293457, + "learning_rate": 0.00019375609248885482, + "loss": 1.5661, + "step": 2418 + }, + { + "epoch": 0.03143380173249757, + "grad_norm": 0.3823724687099457, + "learning_rate": 0.00019375349302694342, + "loss": 1.4424, + "step": 2419 + }, + { + "epoch": 0.03144679627641344, + "grad_norm": 0.37714922428131104, + "learning_rate": 0.00019375089356503205, + "loss": 1.5948, + "step": 2420 + }, + { + "epoch": 0.03145979082032931, + "grad_norm": 0.36402538418769836, + "learning_rate": 0.00019374829410312064, + "loss": 1.4183, + "step": 2421 + }, + { + "epoch": 0.031472785364245186, + "grad_norm": 0.5110093355178833, + "learning_rate": 0.0001937456946412093, + "loss": 1.4378, + "step": 2422 + }, + { + "epoch": 0.03148577990816106, + "grad_norm": 0.3924713730812073, + "learning_rate": 0.0001937430951792979, + "loss": 1.5022, + "step": 2423 + }, + { + "epoch": 0.03149877445207693, + "grad_norm": 0.3390837013721466, + "learning_rate": 0.00019374049571738652, + "loss": 1.3778, + "step": 2424 + }, + { + "epoch": 0.031511768995992805, + "grad_norm": 0.38642051815986633, + "learning_rate": 0.00019373789625547511, + "loss": 1.4276, + "step": 2425 + }, + { + "epoch": 0.03152476353990868, + "grad_norm": 0.36511752009391785, + "learning_rate": 0.00019373529679356374, + "loss": 1.2479, + "step": 2426 + }, + { + "epoch": 0.03153775808382456, + "grad_norm": 0.4753246009349823, + "learning_rate": 0.00019373269733165236, + "loss": 1.4581, + "step": 2427 + }, + { + "epoch": 0.03155075262774043, + "grad_norm": 0.507246732711792, + "learning_rate": 0.00019373009786974096, + "loss": 1.4634, + "step": 2428 + }, + { + "epoch": 0.0315637471716563, + "grad_norm": 0.4189542829990387, + "learning_rate": 0.0001937274984078296, + "loss": 1.5611, + "step": 2429 + }, + { + "epoch": 0.031576741715572176, + "grad_norm": 0.5271665453910828, + "learning_rate": 0.0001937248989459182, + "loss": 1.3618, + "step": 2430 + }, + { + "epoch": 0.03158973625948805, + "grad_norm": 0.30904582142829895, + "learning_rate": 0.0001937222994840068, + "loss": 1.3887, + "step": 2431 + }, + { + "epoch": 0.03160273080340392, + "grad_norm": 0.29169711470603943, + "learning_rate": 0.00019371970002209543, + "loss": 1.53, + "step": 2432 + }, + { + "epoch": 0.031615725347319795, + "grad_norm": 0.36168599128723145, + "learning_rate": 0.00019371710056018406, + "loss": 1.3156, + "step": 2433 + }, + { + "epoch": 0.03162871989123567, + "grad_norm": 0.3162071108818054, + "learning_rate": 0.00019371450109827268, + "loss": 1.4583, + "step": 2434 + }, + { + "epoch": 0.03164171443515154, + "grad_norm": 0.24736791849136353, + "learning_rate": 0.00019371190163636128, + "loss": 1.3349, + "step": 2435 + }, + { + "epoch": 0.031654708979067414, + "grad_norm": 0.2598811984062195, + "learning_rate": 0.0001937093021744499, + "loss": 1.2131, + "step": 2436 + }, + { + "epoch": 0.03166770352298329, + "grad_norm": 0.45681071281433105, + "learning_rate": 0.00019370670271253853, + "loss": 1.5545, + "step": 2437 + }, + { + "epoch": 0.03168069806689916, + "grad_norm": 0.3820514678955078, + "learning_rate": 0.00019370410325062712, + "loss": 1.3836, + "step": 2438 + }, + { + "epoch": 0.03169369261081503, + "grad_norm": 0.39064905047416687, + "learning_rate": 0.00019370150378871575, + "loss": 1.4368, + "step": 2439 + }, + { + "epoch": 0.031706687154730906, + "grad_norm": 0.4174213707447052, + "learning_rate": 0.00019369890432680435, + "loss": 1.587, + "step": 2440 + }, + { + "epoch": 0.03171968169864678, + "grad_norm": 0.3780830204486847, + "learning_rate": 0.000193696304864893, + "loss": 1.7156, + "step": 2441 + }, + { + "epoch": 0.03173267624256265, + "grad_norm": 0.29549428820610046, + "learning_rate": 0.0001936937054029816, + "loss": 1.4188, + "step": 2442 + }, + { + "epoch": 0.031745670786478525, + "grad_norm": 0.30807724595069885, + "learning_rate": 0.0001936911059410702, + "loss": 1.4814, + "step": 2443 + }, + { + "epoch": 0.0317586653303944, + "grad_norm": 0.38823434710502625, + "learning_rate": 0.00019368850647915882, + "loss": 1.5692, + "step": 2444 + }, + { + "epoch": 0.03177165987431027, + "grad_norm": 0.3719289004802704, + "learning_rate": 0.00019368590701724744, + "loss": 1.4046, + "step": 2445 + }, + { + "epoch": 0.03178465441822615, + "grad_norm": 0.4645630419254303, + "learning_rate": 0.00019368330755533607, + "loss": 1.4787, + "step": 2446 + }, + { + "epoch": 0.031797648962142024, + "grad_norm": 0.32714515924453735, + "learning_rate": 0.00019368070809342466, + "loss": 1.4412, + "step": 2447 + }, + { + "epoch": 0.0318106435060579, + "grad_norm": 0.3628447353839874, + "learning_rate": 0.0001936781086315133, + "loss": 1.4802, + "step": 2448 + }, + { + "epoch": 0.03182363804997377, + "grad_norm": 0.2753009498119354, + "learning_rate": 0.0001936755091696019, + "loss": 1.5107, + "step": 2449 + }, + { + "epoch": 0.03183663259388964, + "grad_norm": 0.36268672347068787, + "learning_rate": 0.0001936729097076905, + "loss": 1.5527, + "step": 2450 + }, + { + "epoch": 0.031849627137805515, + "grad_norm": 0.4151109457015991, + "learning_rate": 0.00019367031024577913, + "loss": 1.2486, + "step": 2451 + }, + { + "epoch": 0.03186262168172139, + "grad_norm": 0.36598706245422363, + "learning_rate": 0.00019366771078386773, + "loss": 1.6202, + "step": 2452 + }, + { + "epoch": 0.03187561622563726, + "grad_norm": 0.47833046317100525, + "learning_rate": 0.00019366511132195638, + "loss": 1.5974, + "step": 2453 + }, + { + "epoch": 0.031888610769553134, + "grad_norm": 0.392170786857605, + "learning_rate": 0.00019366251186004498, + "loss": 1.3854, + "step": 2454 + }, + { + "epoch": 0.03190160531346901, + "grad_norm": 0.4662458002567291, + "learning_rate": 0.0001936599123981336, + "loss": 1.6315, + "step": 2455 + }, + { + "epoch": 0.03191459985738488, + "grad_norm": 0.39337852597236633, + "learning_rate": 0.0001936573129362222, + "loss": 1.5138, + "step": 2456 + }, + { + "epoch": 0.03192759440130075, + "grad_norm": 0.3348299562931061, + "learning_rate": 0.00019365471347431083, + "loss": 1.4841, + "step": 2457 + }, + { + "epoch": 0.031940588945216626, + "grad_norm": 1.329984188079834, + "learning_rate": 0.00019365211401239945, + "loss": 1.2603, + "step": 2458 + }, + { + "epoch": 0.0319535834891325, + "grad_norm": 0.41095101833343506, + "learning_rate": 0.00019364951455048805, + "loss": 1.5679, + "step": 2459 + }, + { + "epoch": 0.03196657803304837, + "grad_norm": 0.42867493629455566, + "learning_rate": 0.00019364691508857667, + "loss": 1.3936, + "step": 2460 + }, + { + "epoch": 0.031979572576964245, + "grad_norm": 0.4014851152896881, + "learning_rate": 0.0001936443156266653, + "loss": 1.5077, + "step": 2461 + }, + { + "epoch": 0.03199256712088012, + "grad_norm": 0.32092902064323425, + "learning_rate": 0.0001936417161647539, + "loss": 1.39, + "step": 2462 + }, + { + "epoch": 0.03200556166479599, + "grad_norm": 0.31203359365463257, + "learning_rate": 0.00019363911670284252, + "loss": 1.4216, + "step": 2463 + }, + { + "epoch": 0.032018556208711864, + "grad_norm": 0.4010063707828522, + "learning_rate": 0.00019363651724093112, + "loss": 1.5438, + "step": 2464 + }, + { + "epoch": 0.032031550752627744, + "grad_norm": 0.45139941573143005, + "learning_rate": 0.00019363391777901977, + "loss": 1.5588, + "step": 2465 + }, + { + "epoch": 0.03204454529654362, + "grad_norm": 0.3555641770362854, + "learning_rate": 0.00019363131831710837, + "loss": 1.489, + "step": 2466 + }, + { + "epoch": 0.03205753984045949, + "grad_norm": 0.37799158692359924, + "learning_rate": 0.000193628718855197, + "loss": 1.3104, + "step": 2467 + }, + { + "epoch": 0.03207053438437536, + "grad_norm": 0.32376259565353394, + "learning_rate": 0.00019362611939328561, + "loss": 1.299, + "step": 2468 + }, + { + "epoch": 0.032083528928291236, + "grad_norm": 0.2891981303691864, + "learning_rate": 0.0001936235199313742, + "loss": 1.2869, + "step": 2469 + }, + { + "epoch": 0.03209652347220711, + "grad_norm": 0.30140420794487, + "learning_rate": 0.00019362092046946284, + "loss": 1.2806, + "step": 2470 + }, + { + "epoch": 0.03210951801612298, + "grad_norm": 0.3630262017250061, + "learning_rate": 0.00019361832100755143, + "loss": 1.4137, + "step": 2471 + }, + { + "epoch": 0.032122512560038854, + "grad_norm": 0.3762016296386719, + "learning_rate": 0.00019361572154564006, + "loss": 1.4943, + "step": 2472 + }, + { + "epoch": 0.03213550710395473, + "grad_norm": 0.34979432821273804, + "learning_rate": 0.00019361312208372868, + "loss": 1.2977, + "step": 2473 + }, + { + "epoch": 0.0321485016478706, + "grad_norm": 0.3361506462097168, + "learning_rate": 0.00019361052262181728, + "loss": 1.267, + "step": 2474 + }, + { + "epoch": 0.03216149619178647, + "grad_norm": 0.38225317001342773, + "learning_rate": 0.0001936079231599059, + "loss": 1.4333, + "step": 2475 + }, + { + "epoch": 0.032174490735702346, + "grad_norm": 0.4617854058742523, + "learning_rate": 0.00019360532369799453, + "loss": 1.5878, + "step": 2476 + }, + { + "epoch": 0.03218748527961822, + "grad_norm": 0.3995307385921478, + "learning_rate": 0.00019360272423608315, + "loss": 1.5246, + "step": 2477 + }, + { + "epoch": 0.03220047982353409, + "grad_norm": 0.37244245409965515, + "learning_rate": 0.00019360012477417175, + "loss": 1.2833, + "step": 2478 + }, + { + "epoch": 0.032213474367449965, + "grad_norm": 0.30733734369277954, + "learning_rate": 0.00019359752531226038, + "loss": 1.3623, + "step": 2479 + }, + { + "epoch": 0.03222646891136584, + "grad_norm": 0.4627993404865265, + "learning_rate": 0.000193594925850349, + "loss": 1.4876, + "step": 2480 + }, + { + "epoch": 0.03223946345528171, + "grad_norm": 0.33499887585639954, + "learning_rate": 0.0001935923263884376, + "loss": 1.5466, + "step": 2481 + }, + { + "epoch": 0.032252457999197584, + "grad_norm": 0.3215530514717102, + "learning_rate": 0.00019358972692652622, + "loss": 1.4848, + "step": 2482 + }, + { + "epoch": 0.03226545254311346, + "grad_norm": 0.3846237361431122, + "learning_rate": 0.00019358712746461482, + "loss": 1.4881, + "step": 2483 + }, + { + "epoch": 0.03227844708702934, + "grad_norm": 0.40432724356651306, + "learning_rate": 0.00019358452800270347, + "loss": 1.6301, + "step": 2484 + }, + { + "epoch": 0.03229144163094521, + "grad_norm": 0.4204750955104828, + "learning_rate": 0.00019358192854079207, + "loss": 1.5951, + "step": 2485 + }, + { + "epoch": 0.03230443617486108, + "grad_norm": 0.33970895409584045, + "learning_rate": 0.00019357932907888067, + "loss": 1.3571, + "step": 2486 + }, + { + "epoch": 0.032317430718776956, + "grad_norm": 0.3833889067173004, + "learning_rate": 0.0001935767296169693, + "loss": 1.6484, + "step": 2487 + }, + { + "epoch": 0.03233042526269283, + "grad_norm": 0.5447184443473816, + "learning_rate": 0.00019357413015505791, + "loss": 1.5994, + "step": 2488 + }, + { + "epoch": 0.0323434198066087, + "grad_norm": 0.39822641015052795, + "learning_rate": 0.00019357153069314654, + "loss": 1.3907, + "step": 2489 + }, + { + "epoch": 0.032356414350524575, + "grad_norm": 0.36578643321990967, + "learning_rate": 0.00019356893123123514, + "loss": 1.3455, + "step": 2490 + }, + { + "epoch": 0.03236940889444045, + "grad_norm": 0.31123805046081543, + "learning_rate": 0.00019356633176932376, + "loss": 1.289, + "step": 2491 + }, + { + "epoch": 0.03238240343835632, + "grad_norm": 0.6136845350265503, + "learning_rate": 0.00019356373230741239, + "loss": 1.639, + "step": 2492 + }, + { + "epoch": 0.032395397982272194, + "grad_norm": 0.32089829444885254, + "learning_rate": 0.00019356113284550098, + "loss": 1.3375, + "step": 2493 + }, + { + "epoch": 0.032408392526188066, + "grad_norm": 0.3012245297431946, + "learning_rate": 0.0001935585333835896, + "loss": 1.5459, + "step": 2494 + }, + { + "epoch": 0.03242138707010394, + "grad_norm": 0.3286607265472412, + "learning_rate": 0.0001935559339216782, + "loss": 1.4914, + "step": 2495 + }, + { + "epoch": 0.03243438161401981, + "grad_norm": 0.2875760793685913, + "learning_rate": 0.00019355333445976686, + "loss": 1.3503, + "step": 2496 + }, + { + "epoch": 0.032447376157935685, + "grad_norm": 0.3188101053237915, + "learning_rate": 0.00019355073499785545, + "loss": 1.3547, + "step": 2497 + }, + { + "epoch": 0.03246037070185156, + "grad_norm": 0.3665832579135895, + "learning_rate": 0.00019354813553594405, + "loss": 1.2182, + "step": 2498 + }, + { + "epoch": 0.03247336524576743, + "grad_norm": 0.3593955934047699, + "learning_rate": 0.00019354553607403268, + "loss": 1.3949, + "step": 2499 + }, + { + "epoch": 0.032486359789683304, + "grad_norm": 0.4981686472892761, + "learning_rate": 0.0001935429366121213, + "loss": 1.4597, + "step": 2500 + }, + { + "epoch": 0.03249935433359918, + "grad_norm": 0.29916468262672424, + "learning_rate": 0.00019354033715020992, + "loss": 1.4686, + "step": 2501 + }, + { + "epoch": 0.03251234887751505, + "grad_norm": 0.5272825956344604, + "learning_rate": 0.00019353773768829852, + "loss": 1.478, + "step": 2502 + }, + { + "epoch": 0.03252534342143093, + "grad_norm": 0.39652758836746216, + "learning_rate": 0.00019353513822638715, + "loss": 1.4928, + "step": 2503 + }, + { + "epoch": 0.0325383379653468, + "grad_norm": 0.43363118171691895, + "learning_rate": 0.00019353253876447577, + "loss": 1.6528, + "step": 2504 + }, + { + "epoch": 0.032551332509262676, + "grad_norm": 0.374214768409729, + "learning_rate": 0.00019352993930256437, + "loss": 1.5646, + "step": 2505 + }, + { + "epoch": 0.03256432705317855, + "grad_norm": 0.2884669303894043, + "learning_rate": 0.000193527339840653, + "loss": 1.4726, + "step": 2506 + }, + { + "epoch": 0.03257732159709442, + "grad_norm": 0.2782253324985504, + "learning_rate": 0.00019352474037874162, + "loss": 1.4378, + "step": 2507 + }, + { + "epoch": 0.032590316141010295, + "grad_norm": 0.4006049633026123, + "learning_rate": 0.00019352214091683024, + "loss": 1.6467, + "step": 2508 + }, + { + "epoch": 0.03260331068492617, + "grad_norm": 0.38108521699905396, + "learning_rate": 0.00019351954145491884, + "loss": 1.5763, + "step": 2509 + }, + { + "epoch": 0.03261630522884204, + "grad_norm": 0.3799011707305908, + "learning_rate": 0.00019351694199300744, + "loss": 1.4375, + "step": 2510 + }, + { + "epoch": 0.032629299772757914, + "grad_norm": 0.3035220205783844, + "learning_rate": 0.0001935143425310961, + "loss": 1.5254, + "step": 2511 + }, + { + "epoch": 0.03264229431667379, + "grad_norm": 0.36107808351516724, + "learning_rate": 0.00019351174306918469, + "loss": 1.561, + "step": 2512 + }, + { + "epoch": 0.03265528886058966, + "grad_norm": 0.42562344670295715, + "learning_rate": 0.0001935091436072733, + "loss": 1.4518, + "step": 2513 + }, + { + "epoch": 0.03266828340450553, + "grad_norm": 0.3229316473007202, + "learning_rate": 0.0001935065441453619, + "loss": 1.5941, + "step": 2514 + }, + { + "epoch": 0.032681277948421406, + "grad_norm": 0.27872514724731445, + "learning_rate": 0.00019350394468345053, + "loss": 1.1342, + "step": 2515 + }, + { + "epoch": 0.03269427249233728, + "grad_norm": 0.4430004060268402, + "learning_rate": 0.00019350134522153916, + "loss": 1.568, + "step": 2516 + }, + { + "epoch": 0.03270726703625315, + "grad_norm": 0.438882440328598, + "learning_rate": 0.00019349874575962775, + "loss": 1.5194, + "step": 2517 + }, + { + "epoch": 0.032720261580169024, + "grad_norm": 0.34737488627433777, + "learning_rate": 0.00019349614629771638, + "loss": 1.478, + "step": 2518 + }, + { + "epoch": 0.0327332561240849, + "grad_norm": 0.32929858565330505, + "learning_rate": 0.000193493546835805, + "loss": 1.4294, + "step": 2519 + }, + { + "epoch": 0.03274625066800077, + "grad_norm": 0.3582665026187897, + "learning_rate": 0.00019349094737389363, + "loss": 1.2878, + "step": 2520 + }, + { + "epoch": 0.03275924521191664, + "grad_norm": 0.4200032949447632, + "learning_rate": 0.00019348834791198222, + "loss": 1.6029, + "step": 2521 + }, + { + "epoch": 0.03277223975583252, + "grad_norm": 0.31368643045425415, + "learning_rate": 0.00019348574845007085, + "loss": 1.4605, + "step": 2522 + }, + { + "epoch": 0.032785234299748396, + "grad_norm": 0.3194805085659027, + "learning_rate": 0.00019348314898815947, + "loss": 1.3755, + "step": 2523 + }, + { + "epoch": 0.03279822884366427, + "grad_norm": 0.2415430098772049, + "learning_rate": 0.00019348054952624807, + "loss": 1.3457, + "step": 2524 + }, + { + "epoch": 0.03281122338758014, + "grad_norm": 0.3878587484359741, + "learning_rate": 0.0001934779500643367, + "loss": 1.4192, + "step": 2525 + }, + { + "epoch": 0.032824217931496015, + "grad_norm": 0.3380589485168457, + "learning_rate": 0.0001934753506024253, + "loss": 1.5376, + "step": 2526 + }, + { + "epoch": 0.03283721247541189, + "grad_norm": 0.3906152546405792, + "learning_rate": 0.00019347275114051392, + "loss": 1.4204, + "step": 2527 + }, + { + "epoch": 0.03285020701932776, + "grad_norm": 0.41287320852279663, + "learning_rate": 0.00019347015167860254, + "loss": 1.3862, + "step": 2528 + }, + { + "epoch": 0.032863201563243634, + "grad_norm": 0.27187255024909973, + "learning_rate": 0.00019346755221669114, + "loss": 1.282, + "step": 2529 + }, + { + "epoch": 0.03287619610715951, + "grad_norm": 0.41603827476501465, + "learning_rate": 0.00019346495275477976, + "loss": 1.6202, + "step": 2530 + }, + { + "epoch": 0.03288919065107538, + "grad_norm": 0.2614023685455322, + "learning_rate": 0.0001934623532928684, + "loss": 1.32, + "step": 2531 + }, + { + "epoch": 0.03290218519499125, + "grad_norm": 0.3348638117313385, + "learning_rate": 0.000193459753830957, + "loss": 1.349, + "step": 2532 + }, + { + "epoch": 0.032915179738907126, + "grad_norm": 0.42554065585136414, + "learning_rate": 0.0001934571543690456, + "loss": 1.4376, + "step": 2533 + }, + { + "epoch": 0.032928174282823, + "grad_norm": 0.4203813970088959, + "learning_rate": 0.00019345455490713423, + "loss": 1.5713, + "step": 2534 + }, + { + "epoch": 0.03294116882673887, + "grad_norm": 0.30855053663253784, + "learning_rate": 0.00019345195544522286, + "loss": 1.3682, + "step": 2535 + }, + { + "epoch": 0.032954163370654745, + "grad_norm": 0.3360995650291443, + "learning_rate": 0.00019344935598331146, + "loss": 1.3917, + "step": 2536 + }, + { + "epoch": 0.03296715791457062, + "grad_norm": 0.34986838698387146, + "learning_rate": 0.00019344675652140008, + "loss": 1.3935, + "step": 2537 + }, + { + "epoch": 0.03298015245848649, + "grad_norm": 0.3301599621772766, + "learning_rate": 0.00019344415705948868, + "loss": 1.4279, + "step": 2538 + }, + { + "epoch": 0.032993147002402363, + "grad_norm": 0.28528234362602234, + "learning_rate": 0.00019344155759757733, + "loss": 1.2924, + "step": 2539 + }, + { + "epoch": 0.033006141546318236, + "grad_norm": 0.404764324426651, + "learning_rate": 0.00019343895813566593, + "loss": 1.4343, + "step": 2540 + }, + { + "epoch": 0.033019136090234116, + "grad_norm": 0.3436245620250702, + "learning_rate": 0.00019343635867375452, + "loss": 1.2012, + "step": 2541 + }, + { + "epoch": 0.03303213063414999, + "grad_norm": 0.28586044907569885, + "learning_rate": 0.00019343375921184318, + "loss": 1.4012, + "step": 2542 + }, + { + "epoch": 0.03304512517806586, + "grad_norm": 0.3174670338630676, + "learning_rate": 0.00019343115974993177, + "loss": 1.1817, + "step": 2543 + }, + { + "epoch": 0.033058119721981735, + "grad_norm": 0.4420662224292755, + "learning_rate": 0.0001934285602880204, + "loss": 1.407, + "step": 2544 + }, + { + "epoch": 0.03307111426589761, + "grad_norm": 0.4695225954055786, + "learning_rate": 0.000193425960826109, + "loss": 1.5113, + "step": 2545 + }, + { + "epoch": 0.03308410880981348, + "grad_norm": 0.3479248583316803, + "learning_rate": 0.00019342336136419762, + "loss": 1.4837, + "step": 2546 + }, + { + "epoch": 0.033097103353729354, + "grad_norm": 0.42536017298698425, + "learning_rate": 0.00019342076190228624, + "loss": 1.4728, + "step": 2547 + }, + { + "epoch": 0.03311009789764523, + "grad_norm": 0.3892519474029541, + "learning_rate": 0.00019341816244037484, + "loss": 1.5892, + "step": 2548 + }, + { + "epoch": 0.0331230924415611, + "grad_norm": 0.5114793181419373, + "learning_rate": 0.00019341556297846347, + "loss": 1.5139, + "step": 2549 + }, + { + "epoch": 0.03313608698547697, + "grad_norm": 0.34405434131622314, + "learning_rate": 0.0001934129635165521, + "loss": 1.3389, + "step": 2550 + }, + { + "epoch": 0.033149081529392846, + "grad_norm": 0.38240694999694824, + "learning_rate": 0.00019341036405464072, + "loss": 1.437, + "step": 2551 + }, + { + "epoch": 0.03316207607330872, + "grad_norm": 0.35863661766052246, + "learning_rate": 0.0001934077645927293, + "loss": 1.4334, + "step": 2552 + }, + { + "epoch": 0.03317507061722459, + "grad_norm": 0.3211493492126465, + "learning_rate": 0.0001934051651308179, + "loss": 1.2768, + "step": 2553 + }, + { + "epoch": 0.033188065161140465, + "grad_norm": 0.3290875256061554, + "learning_rate": 0.00019340256566890656, + "loss": 1.1401, + "step": 2554 + }, + { + "epoch": 0.03320105970505634, + "grad_norm": 0.3791804611682892, + "learning_rate": 0.00019339996620699516, + "loss": 1.4081, + "step": 2555 + }, + { + "epoch": 0.03321405424897221, + "grad_norm": 0.2770865261554718, + "learning_rate": 0.00019339736674508378, + "loss": 1.3397, + "step": 2556 + }, + { + "epoch": 0.033227048792888084, + "grad_norm": 0.3526403307914734, + "learning_rate": 0.00019339476728317238, + "loss": 1.388, + "step": 2557 + }, + { + "epoch": 0.03324004333680396, + "grad_norm": 0.37430208921432495, + "learning_rate": 0.000193392167821261, + "loss": 1.5212, + "step": 2558 + }, + { + "epoch": 0.03325303788071983, + "grad_norm": 0.3494778275489807, + "learning_rate": 0.00019338956835934963, + "loss": 1.3671, + "step": 2559 + }, + { + "epoch": 0.03326603242463571, + "grad_norm": 0.4079028069972992, + "learning_rate": 0.00019338696889743823, + "loss": 1.402, + "step": 2560 + }, + { + "epoch": 0.03327902696855158, + "grad_norm": 0.46199265122413635, + "learning_rate": 0.00019338436943552685, + "loss": 1.4798, + "step": 2561 + }, + { + "epoch": 0.033292021512467455, + "grad_norm": 0.32505708932876587, + "learning_rate": 0.00019338176997361548, + "loss": 1.4432, + "step": 2562 + }, + { + "epoch": 0.03330501605638333, + "grad_norm": 0.3568376302719116, + "learning_rate": 0.0001933791705117041, + "loss": 1.3947, + "step": 2563 + }, + { + "epoch": 0.0333180106002992, + "grad_norm": 0.3708251118659973, + "learning_rate": 0.0001933765710497927, + "loss": 1.2981, + "step": 2564 + }, + { + "epoch": 0.033331005144215074, + "grad_norm": 0.39222443103790283, + "learning_rate": 0.0001933739715878813, + "loss": 1.3699, + "step": 2565 + }, + { + "epoch": 0.03334399968813095, + "grad_norm": 0.2827359139919281, + "learning_rate": 0.00019337137212596995, + "loss": 1.3626, + "step": 2566 + }, + { + "epoch": 0.03335699423204682, + "grad_norm": 0.3819025158882141, + "learning_rate": 0.00019336877266405854, + "loss": 1.3338, + "step": 2567 + }, + { + "epoch": 0.03336998877596269, + "grad_norm": 0.38024330139160156, + "learning_rate": 0.00019336617320214717, + "loss": 1.4123, + "step": 2568 + }, + { + "epoch": 0.033382983319878566, + "grad_norm": 0.3383539915084839, + "learning_rate": 0.00019336357374023577, + "loss": 1.391, + "step": 2569 + }, + { + "epoch": 0.03339597786379444, + "grad_norm": 0.2682119905948639, + "learning_rate": 0.0001933609742783244, + "loss": 1.575, + "step": 2570 + }, + { + "epoch": 0.03340897240771031, + "grad_norm": 0.40894395112991333, + "learning_rate": 0.00019335837481641302, + "loss": 1.3491, + "step": 2571 + }, + { + "epoch": 0.033421966951626185, + "grad_norm": 0.4955017864704132, + "learning_rate": 0.0001933557753545016, + "loss": 1.3903, + "step": 2572 + }, + { + "epoch": 0.03343496149554206, + "grad_norm": 0.40897059440612793, + "learning_rate": 0.00019335317589259024, + "loss": 1.2006, + "step": 2573 + }, + { + "epoch": 0.03344795603945793, + "grad_norm": 0.31796297430992126, + "learning_rate": 0.00019335057643067886, + "loss": 1.2861, + "step": 2574 + }, + { + "epoch": 0.033460950583373804, + "grad_norm": 0.3871162235736847, + "learning_rate": 0.00019334797696876749, + "loss": 1.5619, + "step": 2575 + }, + { + "epoch": 0.03347394512728968, + "grad_norm": 0.33858761191368103, + "learning_rate": 0.00019334537750685608, + "loss": 1.4107, + "step": 2576 + }, + { + "epoch": 0.03348693967120555, + "grad_norm": 0.35653814673423767, + "learning_rate": 0.0001933427780449447, + "loss": 1.6332, + "step": 2577 + }, + { + "epoch": 0.03349993421512142, + "grad_norm": 0.3362171947956085, + "learning_rate": 0.00019334017858303333, + "loss": 1.425, + "step": 2578 + }, + { + "epoch": 0.0335129287590373, + "grad_norm": 0.4000280797481537, + "learning_rate": 0.00019333757912112193, + "loss": 1.3239, + "step": 2579 + }, + { + "epoch": 0.033525923302953176, + "grad_norm": 0.42451217770576477, + "learning_rate": 0.00019333497965921055, + "loss": 1.5075, + "step": 2580 + }, + { + "epoch": 0.03353891784686905, + "grad_norm": 0.3276481032371521, + "learning_rate": 0.00019333238019729918, + "loss": 1.2976, + "step": 2581 + }, + { + "epoch": 0.03355191239078492, + "grad_norm": 0.4846782088279724, + "learning_rate": 0.00019332978073538778, + "loss": 1.5055, + "step": 2582 + }, + { + "epoch": 0.033564906934700794, + "grad_norm": 0.31280940771102905, + "learning_rate": 0.0001933271812734764, + "loss": 1.3257, + "step": 2583 + }, + { + "epoch": 0.03357790147861667, + "grad_norm": 0.3765902817249298, + "learning_rate": 0.000193324581811565, + "loss": 1.496, + "step": 2584 + }, + { + "epoch": 0.03359089602253254, + "grad_norm": 0.3513646721839905, + "learning_rate": 0.00019332198234965365, + "loss": 1.4519, + "step": 2585 + }, + { + "epoch": 0.03360389056644841, + "grad_norm": 0.3542924225330353, + "learning_rate": 0.00019331938288774225, + "loss": 1.4657, + "step": 2586 + }, + { + "epoch": 0.033616885110364286, + "grad_norm": 0.27782920002937317, + "learning_rate": 0.00019331678342583087, + "loss": 1.2721, + "step": 2587 + }, + { + "epoch": 0.03362987965428016, + "grad_norm": 0.4466197192668915, + "learning_rate": 0.00019331418396391947, + "loss": 1.4218, + "step": 2588 + }, + { + "epoch": 0.03364287419819603, + "grad_norm": 0.3801550269126892, + "learning_rate": 0.0001933115845020081, + "loss": 1.4564, + "step": 2589 + }, + { + "epoch": 0.033655868742111905, + "grad_norm": 0.5039145946502686, + "learning_rate": 0.00019330898504009672, + "loss": 1.5975, + "step": 2590 + }, + { + "epoch": 0.03366886328602778, + "grad_norm": 0.326337605714798, + "learning_rate": 0.00019330638557818532, + "loss": 1.3116, + "step": 2591 + }, + { + "epoch": 0.03368185782994365, + "grad_norm": 0.5112564563751221, + "learning_rate": 0.00019330378611627394, + "loss": 1.5561, + "step": 2592 + }, + { + "epoch": 0.033694852373859524, + "grad_norm": 0.404899924993515, + "learning_rate": 0.00019330118665436256, + "loss": 1.4605, + "step": 2593 + }, + { + "epoch": 0.0337078469177754, + "grad_norm": 0.2979227304458618, + "learning_rate": 0.00019329858719245116, + "loss": 1.2656, + "step": 2594 + }, + { + "epoch": 0.03372084146169127, + "grad_norm": 0.42224523425102234, + "learning_rate": 0.00019329598773053979, + "loss": 1.6135, + "step": 2595 + }, + { + "epoch": 0.03373383600560714, + "grad_norm": 0.2901502549648285, + "learning_rate": 0.00019329338826862838, + "loss": 1.4867, + "step": 2596 + }, + { + "epoch": 0.033746830549523016, + "grad_norm": 0.3446262776851654, + "learning_rate": 0.00019329078880671704, + "loss": 1.242, + "step": 2597 + }, + { + "epoch": 0.033759825093438896, + "grad_norm": 0.45019254088401794, + "learning_rate": 0.00019328818934480563, + "loss": 1.3692, + "step": 2598 + }, + { + "epoch": 0.03377281963735477, + "grad_norm": 0.49961885809898376, + "learning_rate": 0.00019328558988289426, + "loss": 1.5872, + "step": 2599 + }, + { + "epoch": 0.03378581418127064, + "grad_norm": 0.36805474758148193, + "learning_rate": 0.00019328299042098285, + "loss": 1.4729, + "step": 2600 + }, + { + "epoch": 0.033798808725186515, + "grad_norm": 0.4167197346687317, + "learning_rate": 0.00019328039095907148, + "loss": 1.6615, + "step": 2601 + }, + { + "epoch": 0.03381180326910239, + "grad_norm": 0.35889965295791626, + "learning_rate": 0.0001932777914971601, + "loss": 1.4583, + "step": 2602 + }, + { + "epoch": 0.03382479781301826, + "grad_norm": 0.3481581211090088, + "learning_rate": 0.0001932751920352487, + "loss": 1.4279, + "step": 2603 + }, + { + "epoch": 0.033837792356934134, + "grad_norm": 0.42533284425735474, + "learning_rate": 0.00019327259257333733, + "loss": 1.6325, + "step": 2604 + }, + { + "epoch": 0.033850786900850006, + "grad_norm": 0.43109431862831116, + "learning_rate": 0.00019326999311142595, + "loss": 1.3109, + "step": 2605 + }, + { + "epoch": 0.03386378144476588, + "grad_norm": 0.40868356823921204, + "learning_rate": 0.00019326739364951457, + "loss": 1.4989, + "step": 2606 + }, + { + "epoch": 0.03387677598868175, + "grad_norm": 0.3893938958644867, + "learning_rate": 0.00019326479418760317, + "loss": 1.4323, + "step": 2607 + }, + { + "epoch": 0.033889770532597625, + "grad_norm": 0.4579794406890869, + "learning_rate": 0.00019326219472569177, + "loss": 1.5663, + "step": 2608 + }, + { + "epoch": 0.0339027650765135, + "grad_norm": 0.3872573971748352, + "learning_rate": 0.00019325959526378042, + "loss": 1.6074, + "step": 2609 + }, + { + "epoch": 0.03391575962042937, + "grad_norm": 0.27037733793258667, + "learning_rate": 0.00019325699580186902, + "loss": 1.321, + "step": 2610 + }, + { + "epoch": 0.033928754164345244, + "grad_norm": 0.38498052954673767, + "learning_rate": 0.00019325439633995764, + "loss": 1.3941, + "step": 2611 + }, + { + "epoch": 0.03394174870826112, + "grad_norm": 0.2826327979564667, + "learning_rate": 0.00019325179687804624, + "loss": 1.3917, + "step": 2612 + }, + { + "epoch": 0.03395474325217699, + "grad_norm": 0.39286717772483826, + "learning_rate": 0.00019324919741613486, + "loss": 1.4006, + "step": 2613 + }, + { + "epoch": 0.03396773779609286, + "grad_norm": 0.32138383388519287, + "learning_rate": 0.0001932465979542235, + "loss": 1.3965, + "step": 2614 + }, + { + "epoch": 0.033980732340008736, + "grad_norm": 0.3826918303966522, + "learning_rate": 0.00019324399849231209, + "loss": 1.5142, + "step": 2615 + }, + { + "epoch": 0.03399372688392461, + "grad_norm": 0.44314083456993103, + "learning_rate": 0.00019324139903040074, + "loss": 1.4034, + "step": 2616 + }, + { + "epoch": 0.03400672142784049, + "grad_norm": 0.31664013862609863, + "learning_rate": 0.00019323879956848933, + "loss": 1.3188, + "step": 2617 + }, + { + "epoch": 0.03401971597175636, + "grad_norm": 0.5277805924415588, + "learning_rate": 0.00019323620010657796, + "loss": 1.3762, + "step": 2618 + }, + { + "epoch": 0.034032710515672235, + "grad_norm": 0.5130351781845093, + "learning_rate": 0.00019323360064466656, + "loss": 1.4866, + "step": 2619 + }, + { + "epoch": 0.03404570505958811, + "grad_norm": 0.3151296377182007, + "learning_rate": 0.00019323100118275518, + "loss": 1.389, + "step": 2620 + }, + { + "epoch": 0.03405869960350398, + "grad_norm": 0.535396933555603, + "learning_rate": 0.0001932284017208438, + "loss": 1.5263, + "step": 2621 + }, + { + "epoch": 0.034071694147419854, + "grad_norm": 0.48304587602615356, + "learning_rate": 0.0001932258022589324, + "loss": 1.5219, + "step": 2622 + }, + { + "epoch": 0.03408468869133573, + "grad_norm": 0.41263094544410706, + "learning_rate": 0.00019322320279702103, + "loss": 1.5241, + "step": 2623 + }, + { + "epoch": 0.0340976832352516, + "grad_norm": 0.41084033250808716, + "learning_rate": 0.00019322060333510965, + "loss": 1.3622, + "step": 2624 + }, + { + "epoch": 0.03411067777916747, + "grad_norm": 0.28344786167144775, + "learning_rate": 0.00019321800387319825, + "loss": 1.3885, + "step": 2625 + }, + { + "epoch": 0.034123672323083346, + "grad_norm": 0.37179088592529297, + "learning_rate": 0.00019321540441128687, + "loss": 1.5059, + "step": 2626 + }, + { + "epoch": 0.03413666686699922, + "grad_norm": 0.3244517147541046, + "learning_rate": 0.00019321280494937547, + "loss": 1.4318, + "step": 2627 + }, + { + "epoch": 0.03414966141091509, + "grad_norm": 0.4654742181301117, + "learning_rate": 0.00019321020548746412, + "loss": 1.4745, + "step": 2628 + }, + { + "epoch": 0.034162655954830964, + "grad_norm": 0.3580836355686188, + "learning_rate": 0.00019320760602555272, + "loss": 1.4278, + "step": 2629 + }, + { + "epoch": 0.03417565049874684, + "grad_norm": 0.45845532417297363, + "learning_rate": 0.00019320500656364134, + "loss": 1.5502, + "step": 2630 + }, + { + "epoch": 0.03418864504266271, + "grad_norm": 0.38971734046936035, + "learning_rate": 0.00019320240710172994, + "loss": 1.5486, + "step": 2631 + }, + { + "epoch": 0.03420163958657858, + "grad_norm": 0.3922237157821655, + "learning_rate": 0.00019319980763981857, + "loss": 1.3719, + "step": 2632 + }, + { + "epoch": 0.034214634130494456, + "grad_norm": 0.37390995025634766, + "learning_rate": 0.0001931972081779072, + "loss": 1.2856, + "step": 2633 + }, + { + "epoch": 0.03422762867441033, + "grad_norm": 0.46617215871810913, + "learning_rate": 0.0001931946087159958, + "loss": 1.4395, + "step": 2634 + }, + { + "epoch": 0.0342406232183262, + "grad_norm": 0.3563721477985382, + "learning_rate": 0.0001931920092540844, + "loss": 1.4457, + "step": 2635 + }, + { + "epoch": 0.03425361776224208, + "grad_norm": 0.409978449344635, + "learning_rate": 0.00019318940979217304, + "loss": 1.4947, + "step": 2636 + }, + { + "epoch": 0.034266612306157955, + "grad_norm": 0.4160659611225128, + "learning_rate": 0.00019318681033026163, + "loss": 1.4029, + "step": 2637 + }, + { + "epoch": 0.03427960685007383, + "grad_norm": 0.4420088827610016, + "learning_rate": 0.00019318421086835026, + "loss": 1.4389, + "step": 2638 + }, + { + "epoch": 0.0342926013939897, + "grad_norm": 0.40361663699150085, + "learning_rate": 0.00019318161140643886, + "loss": 1.4556, + "step": 2639 + }, + { + "epoch": 0.034305595937905574, + "grad_norm": 0.3836807906627655, + "learning_rate": 0.0001931790119445275, + "loss": 1.5375, + "step": 2640 + }, + { + "epoch": 0.03431859048182145, + "grad_norm": 0.21775692701339722, + "learning_rate": 0.0001931764124826161, + "loss": 1.3929, + "step": 2641 + }, + { + "epoch": 0.03433158502573732, + "grad_norm": 0.4751773476600647, + "learning_rate": 0.00019317381302070473, + "loss": 1.533, + "step": 2642 + }, + { + "epoch": 0.03434457956965319, + "grad_norm": 0.3994961380958557, + "learning_rate": 0.00019317121355879333, + "loss": 1.3528, + "step": 2643 + }, + { + "epoch": 0.034357574113569066, + "grad_norm": 0.33339062333106995, + "learning_rate": 0.00019316861409688195, + "loss": 1.4777, + "step": 2644 + }, + { + "epoch": 0.03437056865748494, + "grad_norm": 0.4146357774734497, + "learning_rate": 0.00019316601463497058, + "loss": 1.5178, + "step": 2645 + }, + { + "epoch": 0.03438356320140081, + "grad_norm": 0.4500183165073395, + "learning_rate": 0.00019316341517305917, + "loss": 1.5561, + "step": 2646 + }, + { + "epoch": 0.034396557745316685, + "grad_norm": 0.38291653990745544, + "learning_rate": 0.0001931608157111478, + "loss": 1.2276, + "step": 2647 + }, + { + "epoch": 0.03440955228923256, + "grad_norm": 0.41809695959091187, + "learning_rate": 0.00019315821624923642, + "loss": 1.5846, + "step": 2648 + }, + { + "epoch": 0.03442254683314843, + "grad_norm": 0.32560357451438904, + "learning_rate": 0.00019315561678732502, + "loss": 1.5845, + "step": 2649 + }, + { + "epoch": 0.0344355413770643, + "grad_norm": 0.46995043754577637, + "learning_rate": 0.00019315301732541364, + "loss": 1.5644, + "step": 2650 + }, + { + "epoch": 0.034448535920980176, + "grad_norm": 0.344234436750412, + "learning_rate": 0.00019315041786350224, + "loss": 1.3925, + "step": 2651 + }, + { + "epoch": 0.03446153046489605, + "grad_norm": 0.3973907232284546, + "learning_rate": 0.0001931478184015909, + "loss": 1.4672, + "step": 2652 + }, + { + "epoch": 0.03447452500881192, + "grad_norm": 0.4125625491142273, + "learning_rate": 0.0001931452189396795, + "loss": 1.3658, + "step": 2653 + }, + { + "epoch": 0.034487519552727795, + "grad_norm": 0.32643839716911316, + "learning_rate": 0.00019314261947776812, + "loss": 1.4776, + "step": 2654 + }, + { + "epoch": 0.034500514096643675, + "grad_norm": 0.3233036398887634, + "learning_rate": 0.00019314002001585674, + "loss": 1.4777, + "step": 2655 + }, + { + "epoch": 0.03451350864055955, + "grad_norm": 0.4476162791252136, + "learning_rate": 0.00019313742055394534, + "loss": 1.5353, + "step": 2656 + }, + { + "epoch": 0.03452650318447542, + "grad_norm": 0.42174360156059265, + "learning_rate": 0.00019313482109203396, + "loss": 1.3087, + "step": 2657 + }, + { + "epoch": 0.034539497728391294, + "grad_norm": 0.3945503830909729, + "learning_rate": 0.00019313222163012256, + "loss": 1.5998, + "step": 2658 + }, + { + "epoch": 0.03455249227230717, + "grad_norm": 0.4233253300189972, + "learning_rate": 0.0001931296221682112, + "loss": 1.638, + "step": 2659 + }, + { + "epoch": 0.03456548681622304, + "grad_norm": 0.4910193085670471, + "learning_rate": 0.0001931270227062998, + "loss": 1.4125, + "step": 2660 + }, + { + "epoch": 0.03457848136013891, + "grad_norm": 0.47878560423851013, + "learning_rate": 0.00019312442324438843, + "loss": 1.4589, + "step": 2661 + }, + { + "epoch": 0.034591475904054786, + "grad_norm": 0.3505452871322632, + "learning_rate": 0.00019312182378247703, + "loss": 1.2657, + "step": 2662 + }, + { + "epoch": 0.03460447044797066, + "grad_norm": 0.3399491012096405, + "learning_rate": 0.00019311922432056565, + "loss": 1.4723, + "step": 2663 + }, + { + "epoch": 0.03461746499188653, + "grad_norm": 0.38019537925720215, + "learning_rate": 0.00019311662485865428, + "loss": 1.5725, + "step": 2664 + }, + { + "epoch": 0.034630459535802405, + "grad_norm": 0.36103230714797974, + "learning_rate": 0.00019311402539674288, + "loss": 1.405, + "step": 2665 + }, + { + "epoch": 0.03464345407971828, + "grad_norm": 0.41101959347724915, + "learning_rate": 0.0001931114259348315, + "loss": 1.4975, + "step": 2666 + }, + { + "epoch": 0.03465644862363415, + "grad_norm": 0.4802800118923187, + "learning_rate": 0.00019310882647292013, + "loss": 1.4648, + "step": 2667 + }, + { + "epoch": 0.034669443167550024, + "grad_norm": 0.4229055345058441, + "learning_rate": 0.00019310622701100872, + "loss": 1.5491, + "step": 2668 + }, + { + "epoch": 0.0346824377114659, + "grad_norm": 0.3899267911911011, + "learning_rate": 0.00019310362754909735, + "loss": 1.3745, + "step": 2669 + }, + { + "epoch": 0.03469543225538177, + "grad_norm": 0.5346493124961853, + "learning_rate": 0.00019310102808718594, + "loss": 1.4355, + "step": 2670 + }, + { + "epoch": 0.03470842679929764, + "grad_norm": 0.425605833530426, + "learning_rate": 0.0001930984286252746, + "loss": 1.6465, + "step": 2671 + }, + { + "epoch": 0.034721421343213515, + "grad_norm": 0.4313183128833771, + "learning_rate": 0.0001930958291633632, + "loss": 1.4226, + "step": 2672 + }, + { + "epoch": 0.03473441588712939, + "grad_norm": 0.33624106645584106, + "learning_rate": 0.00019309322970145182, + "loss": 1.1646, + "step": 2673 + }, + { + "epoch": 0.03474741043104527, + "grad_norm": 0.37093326449394226, + "learning_rate": 0.00019309063023954042, + "loss": 1.526, + "step": 2674 + }, + { + "epoch": 0.03476040497496114, + "grad_norm": 0.44133490324020386, + "learning_rate": 0.00019308803077762904, + "loss": 1.5556, + "step": 2675 + }, + { + "epoch": 0.034773399518877014, + "grad_norm": 0.3195987045764923, + "learning_rate": 0.00019308543131571766, + "loss": 1.3604, + "step": 2676 + }, + { + "epoch": 0.03478639406279289, + "grad_norm": 0.41076597571372986, + "learning_rate": 0.00019308283185380626, + "loss": 1.4536, + "step": 2677 + }, + { + "epoch": 0.03479938860670876, + "grad_norm": 0.501983106136322, + "learning_rate": 0.00019308023239189489, + "loss": 1.4381, + "step": 2678 + }, + { + "epoch": 0.03481238315062463, + "grad_norm": 0.4325046241283417, + "learning_rate": 0.0001930776329299835, + "loss": 1.5159, + "step": 2679 + }, + { + "epoch": 0.034825377694540506, + "grad_norm": 0.32546481490135193, + "learning_rate": 0.0001930750334680721, + "loss": 1.2549, + "step": 2680 + }, + { + "epoch": 0.03483837223845638, + "grad_norm": 0.3376631736755371, + "learning_rate": 0.00019307243400616073, + "loss": 1.6918, + "step": 2681 + }, + { + "epoch": 0.03485136678237225, + "grad_norm": 0.40217649936676025, + "learning_rate": 0.00019306983454424933, + "loss": 1.3748, + "step": 2682 + }, + { + "epoch": 0.034864361326288125, + "grad_norm": 0.24640783667564392, + "learning_rate": 0.00019306723508233798, + "loss": 1.1395, + "step": 2683 + }, + { + "epoch": 0.034877355870204, + "grad_norm": 0.43814030289649963, + "learning_rate": 0.00019306463562042658, + "loss": 1.5176, + "step": 2684 + }, + { + "epoch": 0.03489035041411987, + "grad_norm": 0.2910681664943695, + "learning_rate": 0.0001930620361585152, + "loss": 1.4568, + "step": 2685 + }, + { + "epoch": 0.034903344958035744, + "grad_norm": 0.4540664851665497, + "learning_rate": 0.0001930594366966038, + "loss": 1.4671, + "step": 2686 + }, + { + "epoch": 0.03491633950195162, + "grad_norm": 0.34096863865852356, + "learning_rate": 0.00019305683723469243, + "loss": 1.2896, + "step": 2687 + }, + { + "epoch": 0.03492933404586749, + "grad_norm": 0.4042552709579468, + "learning_rate": 0.00019305423777278105, + "loss": 1.4341, + "step": 2688 + }, + { + "epoch": 0.03494232858978336, + "grad_norm": 0.33614152669906616, + "learning_rate": 0.00019305163831086965, + "loss": 1.3547, + "step": 2689 + }, + { + "epoch": 0.034955323133699236, + "grad_norm": 0.35462722182273865, + "learning_rate": 0.0001930490388489583, + "loss": 1.23, + "step": 2690 + }, + { + "epoch": 0.03496831767761511, + "grad_norm": 0.3538316786289215, + "learning_rate": 0.0001930464393870469, + "loss": 1.5511, + "step": 2691 + }, + { + "epoch": 0.03498131222153098, + "grad_norm": 0.2901199758052826, + "learning_rate": 0.0001930438399251355, + "loss": 1.346, + "step": 2692 + }, + { + "epoch": 0.034994306765446855, + "grad_norm": 0.40211308002471924, + "learning_rate": 0.00019304124046322412, + "loss": 1.6341, + "step": 2693 + }, + { + "epoch": 0.035007301309362734, + "grad_norm": 0.3229469358921051, + "learning_rate": 0.00019303864100131274, + "loss": 1.5874, + "step": 2694 + }, + { + "epoch": 0.03502029585327861, + "grad_norm": 0.4672679603099823, + "learning_rate": 0.00019303604153940137, + "loss": 1.3592, + "step": 2695 + }, + { + "epoch": 0.03503329039719448, + "grad_norm": 0.3832331597805023, + "learning_rate": 0.00019303344207748996, + "loss": 1.6367, + "step": 2696 + }, + { + "epoch": 0.03504628494111035, + "grad_norm": 0.4546045660972595, + "learning_rate": 0.0001930308426155786, + "loss": 1.3327, + "step": 2697 + }, + { + "epoch": 0.035059279485026226, + "grad_norm": 0.4661729037761688, + "learning_rate": 0.0001930282431536672, + "loss": 1.6591, + "step": 2698 + }, + { + "epoch": 0.0350722740289421, + "grad_norm": 0.4349871575832367, + "learning_rate": 0.0001930256436917558, + "loss": 1.4524, + "step": 2699 + }, + { + "epoch": 0.03508526857285797, + "grad_norm": 0.38665902614593506, + "learning_rate": 0.00019302304422984444, + "loss": 1.4058, + "step": 2700 + }, + { + "epoch": 0.035098263116773845, + "grad_norm": 0.38995254039764404, + "learning_rate": 0.00019302044476793303, + "loss": 1.3594, + "step": 2701 + }, + { + "epoch": 0.03511125766068972, + "grad_norm": 0.2993116080760956, + "learning_rate": 0.00019301784530602168, + "loss": 1.5534, + "step": 2702 + }, + { + "epoch": 0.03512425220460559, + "grad_norm": 0.3170860707759857, + "learning_rate": 0.00019301524584411028, + "loss": 1.413, + "step": 2703 + }, + { + "epoch": 0.035137246748521464, + "grad_norm": 0.4542633891105652, + "learning_rate": 0.00019301264638219888, + "loss": 1.5157, + "step": 2704 + }, + { + "epoch": 0.03515024129243734, + "grad_norm": 0.30418646335601807, + "learning_rate": 0.0001930100469202875, + "loss": 1.4775, + "step": 2705 + }, + { + "epoch": 0.03516323583635321, + "grad_norm": 0.4070758819580078, + "learning_rate": 0.00019300744745837613, + "loss": 1.3137, + "step": 2706 + }, + { + "epoch": 0.03517623038026908, + "grad_norm": 0.40993425250053406, + "learning_rate": 0.00019300484799646475, + "loss": 1.395, + "step": 2707 + }, + { + "epoch": 0.035189224924184956, + "grad_norm": 0.43533939123153687, + "learning_rate": 0.00019300224853455335, + "loss": 1.5093, + "step": 2708 + }, + { + "epoch": 0.03520221946810083, + "grad_norm": 0.34060242772102356, + "learning_rate": 0.00019299964907264197, + "loss": 1.5795, + "step": 2709 + }, + { + "epoch": 0.0352152140120167, + "grad_norm": 0.45110419392585754, + "learning_rate": 0.0001929970496107306, + "loss": 1.3807, + "step": 2710 + }, + { + "epoch": 0.035228208555932575, + "grad_norm": 0.2704315483570099, + "learning_rate": 0.0001929944501488192, + "loss": 1.3191, + "step": 2711 + }, + { + "epoch": 0.03524120309984845, + "grad_norm": 0.46195530891418457, + "learning_rate": 0.00019299185068690782, + "loss": 1.5613, + "step": 2712 + }, + { + "epoch": 0.03525419764376433, + "grad_norm": 0.40167585015296936, + "learning_rate": 0.00019298925122499642, + "loss": 1.3393, + "step": 2713 + }, + { + "epoch": 0.0352671921876802, + "grad_norm": 0.44363099336624146, + "learning_rate": 0.00019298665176308507, + "loss": 1.4231, + "step": 2714 + }, + { + "epoch": 0.035280186731596073, + "grad_norm": 0.40086689591407776, + "learning_rate": 0.00019298405230117367, + "loss": 1.3922, + "step": 2715 + }, + { + "epoch": 0.035293181275511946, + "grad_norm": 0.3890444040298462, + "learning_rate": 0.00019298145283926226, + "loss": 1.4934, + "step": 2716 + }, + { + "epoch": 0.03530617581942782, + "grad_norm": 0.35410887002944946, + "learning_rate": 0.0001929788533773509, + "loss": 1.4242, + "step": 2717 + }, + { + "epoch": 0.03531917036334369, + "grad_norm": 0.4561476707458496, + "learning_rate": 0.0001929762539154395, + "loss": 1.5194, + "step": 2718 + }, + { + "epoch": 0.035332164907259565, + "grad_norm": 0.3089006245136261, + "learning_rate": 0.00019297365445352814, + "loss": 1.5368, + "step": 2719 + }, + { + "epoch": 0.03534515945117544, + "grad_norm": 0.38267242908477783, + "learning_rate": 0.00019297105499161674, + "loss": 1.4586, + "step": 2720 + }, + { + "epoch": 0.03535815399509131, + "grad_norm": 0.32861995697021484, + "learning_rate": 0.00019296845552970536, + "loss": 1.4387, + "step": 2721 + }, + { + "epoch": 0.035371148539007184, + "grad_norm": 0.37558478116989136, + "learning_rate": 0.00019296585606779398, + "loss": 1.5579, + "step": 2722 + }, + { + "epoch": 0.03538414308292306, + "grad_norm": 0.3885676860809326, + "learning_rate": 0.00019296325660588258, + "loss": 1.5473, + "step": 2723 + }, + { + "epoch": 0.03539713762683893, + "grad_norm": 0.3887832462787628, + "learning_rate": 0.0001929606571439712, + "loss": 1.4257, + "step": 2724 + }, + { + "epoch": 0.0354101321707548, + "grad_norm": 0.25129011273384094, + "learning_rate": 0.0001929580576820598, + "loss": 1.2151, + "step": 2725 + }, + { + "epoch": 0.035423126714670676, + "grad_norm": 0.34469395875930786, + "learning_rate": 0.00019295545822014846, + "loss": 1.3625, + "step": 2726 + }, + { + "epoch": 0.03543612125858655, + "grad_norm": 0.29271090030670166, + "learning_rate": 0.00019295285875823705, + "loss": 1.6102, + "step": 2727 + }, + { + "epoch": 0.03544911580250242, + "grad_norm": 0.35284462571144104, + "learning_rate": 0.00019295025929632568, + "loss": 1.5273, + "step": 2728 + }, + { + "epoch": 0.035462110346418295, + "grad_norm": 0.30605068802833557, + "learning_rate": 0.0001929476598344143, + "loss": 1.322, + "step": 2729 + }, + { + "epoch": 0.03547510489033417, + "grad_norm": 0.380879670381546, + "learning_rate": 0.0001929450603725029, + "loss": 1.4431, + "step": 2730 + }, + { + "epoch": 0.03548809943425004, + "grad_norm": 0.3157254755496979, + "learning_rate": 0.00019294246091059152, + "loss": 1.5266, + "step": 2731 + }, + { + "epoch": 0.03550109397816592, + "grad_norm": 0.4257451295852661, + "learning_rate": 0.00019293986144868012, + "loss": 1.5895, + "step": 2732 + }, + { + "epoch": 0.035514088522081794, + "grad_norm": 0.4311240613460541, + "learning_rate": 0.00019293726198676875, + "loss": 1.5401, + "step": 2733 + }, + { + "epoch": 0.03552708306599767, + "grad_norm": 0.3798077404499054, + "learning_rate": 0.00019293466252485737, + "loss": 1.2725, + "step": 2734 + }, + { + "epoch": 0.03554007760991354, + "grad_norm": 0.38537341356277466, + "learning_rate": 0.00019293206306294597, + "loss": 1.5666, + "step": 2735 + }, + { + "epoch": 0.03555307215382941, + "grad_norm": 0.4559878706932068, + "learning_rate": 0.0001929294636010346, + "loss": 1.5698, + "step": 2736 + }, + { + "epoch": 0.035566066697745286, + "grad_norm": 0.3879867196083069, + "learning_rate": 0.00019292686413912322, + "loss": 1.3961, + "step": 2737 + }, + { + "epoch": 0.03557906124166116, + "grad_norm": 0.3555436432361603, + "learning_rate": 0.00019292426467721184, + "loss": 1.5106, + "step": 2738 + }, + { + "epoch": 0.03559205578557703, + "grad_norm": 0.4483688175678253, + "learning_rate": 0.00019292166521530044, + "loss": 1.5846, + "step": 2739 + }, + { + "epoch": 0.035605050329492904, + "grad_norm": 0.3611527383327484, + "learning_rate": 0.00019291906575338906, + "loss": 1.4275, + "step": 2740 + }, + { + "epoch": 0.03561804487340878, + "grad_norm": 0.3116562068462372, + "learning_rate": 0.0001929164662914777, + "loss": 1.2558, + "step": 2741 + }, + { + "epoch": 0.03563103941732465, + "grad_norm": 0.37978363037109375, + "learning_rate": 0.00019291386682956628, + "loss": 1.4643, + "step": 2742 + }, + { + "epoch": 0.03564403396124052, + "grad_norm": 0.24704952538013458, + "learning_rate": 0.0001929112673676549, + "loss": 1.146, + "step": 2743 + }, + { + "epoch": 0.035657028505156396, + "grad_norm": 0.3831765949726105, + "learning_rate": 0.0001929086679057435, + "loss": 1.5438, + "step": 2744 + }, + { + "epoch": 0.03567002304907227, + "grad_norm": 0.30055537819862366, + "learning_rate": 0.00019290606844383216, + "loss": 1.4412, + "step": 2745 + }, + { + "epoch": 0.03568301759298814, + "grad_norm": 0.2527606189250946, + "learning_rate": 0.00019290346898192076, + "loss": 1.4767, + "step": 2746 + }, + { + "epoch": 0.035696012136904015, + "grad_norm": 0.42268630862236023, + "learning_rate": 0.00019290086952000935, + "loss": 1.6555, + "step": 2747 + }, + { + "epoch": 0.03570900668081989, + "grad_norm": 0.47067180275917053, + "learning_rate": 0.00019289827005809798, + "loss": 1.4822, + "step": 2748 + }, + { + "epoch": 0.03572200122473576, + "grad_norm": 0.411670058965683, + "learning_rate": 0.0001928956705961866, + "loss": 1.4746, + "step": 2749 + }, + { + "epoch": 0.035734995768651634, + "grad_norm": 0.3741244971752167, + "learning_rate": 0.00019289307113427523, + "loss": 1.4604, + "step": 2750 + }, + { + "epoch": 0.035747990312567514, + "grad_norm": 0.4290016293525696, + "learning_rate": 0.00019289047167236382, + "loss": 1.4158, + "step": 2751 + }, + { + "epoch": 0.03576098485648339, + "grad_norm": 0.364467054605484, + "learning_rate": 0.00019288787221045245, + "loss": 1.3861, + "step": 2752 + }, + { + "epoch": 0.03577397940039926, + "grad_norm": 0.35490918159484863, + "learning_rate": 0.00019288527274854107, + "loss": 1.5629, + "step": 2753 + }, + { + "epoch": 0.03578697394431513, + "grad_norm": 0.2794831097126007, + "learning_rate": 0.00019288267328662967, + "loss": 1.4614, + "step": 2754 + }, + { + "epoch": 0.035799968488231006, + "grad_norm": 0.287809818983078, + "learning_rate": 0.0001928800738247183, + "loss": 1.2182, + "step": 2755 + }, + { + "epoch": 0.03581296303214688, + "grad_norm": 0.3844785690307617, + "learning_rate": 0.0001928774743628069, + "loss": 1.391, + "step": 2756 + }, + { + "epoch": 0.03582595757606275, + "grad_norm": 0.32365888357162476, + "learning_rate": 0.00019287487490089554, + "loss": 1.5859, + "step": 2757 + }, + { + "epoch": 0.035838952119978625, + "grad_norm": 0.37656038999557495, + "learning_rate": 0.00019287227543898414, + "loss": 1.4207, + "step": 2758 + }, + { + "epoch": 0.0358519466638945, + "grad_norm": 0.3249402642250061, + "learning_rate": 0.00019286967597707274, + "loss": 1.2407, + "step": 2759 + }, + { + "epoch": 0.03586494120781037, + "grad_norm": 0.6949344873428345, + "learning_rate": 0.00019286707651516136, + "loss": 1.718, + "step": 2760 + }, + { + "epoch": 0.03587793575172624, + "grad_norm": 0.3642655611038208, + "learning_rate": 0.00019286447705325, + "loss": 1.4663, + "step": 2761 + }, + { + "epoch": 0.035890930295642116, + "grad_norm": 0.3835026025772095, + "learning_rate": 0.0001928618775913386, + "loss": 1.6773, + "step": 2762 + }, + { + "epoch": 0.03590392483955799, + "grad_norm": 0.3899837136268616, + "learning_rate": 0.0001928592781294272, + "loss": 1.5875, + "step": 2763 + }, + { + "epoch": 0.03591691938347386, + "grad_norm": 1.368149995803833, + "learning_rate": 0.00019285667866751583, + "loss": 1.6732, + "step": 2764 + }, + { + "epoch": 0.035929913927389735, + "grad_norm": 0.449677437543869, + "learning_rate": 0.00019285407920560446, + "loss": 1.6178, + "step": 2765 + }, + { + "epoch": 0.03594290847130561, + "grad_norm": 0.34505805373191833, + "learning_rate": 0.00019285147974369305, + "loss": 1.487, + "step": 2766 + }, + { + "epoch": 0.03595590301522148, + "grad_norm": 0.3163648247718811, + "learning_rate": 0.00019284888028178168, + "loss": 1.307, + "step": 2767 + }, + { + "epoch": 0.035968897559137354, + "grad_norm": 0.43610504269599915, + "learning_rate": 0.0001928462808198703, + "loss": 1.4207, + "step": 2768 + }, + { + "epoch": 0.03598189210305323, + "grad_norm": 0.3650064468383789, + "learning_rate": 0.00019284368135795893, + "loss": 1.5816, + "step": 2769 + }, + { + "epoch": 0.03599488664696911, + "grad_norm": 0.45498794317245483, + "learning_rate": 0.00019284108189604753, + "loss": 1.6289, + "step": 2770 + }, + { + "epoch": 0.03600788119088498, + "grad_norm": 0.38969722390174866, + "learning_rate": 0.00019283848243413612, + "loss": 1.2897, + "step": 2771 + }, + { + "epoch": 0.03602087573480085, + "grad_norm": 0.3977135419845581, + "learning_rate": 0.00019283588297222477, + "loss": 1.3847, + "step": 2772 + }, + { + "epoch": 0.036033870278716726, + "grad_norm": 0.41410887241363525, + "learning_rate": 0.00019283328351031337, + "loss": 1.4496, + "step": 2773 + }, + { + "epoch": 0.0360468648226326, + "grad_norm": 0.3857017457485199, + "learning_rate": 0.000192830684048402, + "loss": 1.4659, + "step": 2774 + }, + { + "epoch": 0.03605985936654847, + "grad_norm": 0.49290332198143005, + "learning_rate": 0.0001928280845864906, + "loss": 1.3607, + "step": 2775 + }, + { + "epoch": 0.036072853910464345, + "grad_norm": 0.31616663932800293, + "learning_rate": 0.00019282548512457922, + "loss": 1.3968, + "step": 2776 + }, + { + "epoch": 0.03608584845438022, + "grad_norm": 0.5127238035202026, + "learning_rate": 0.00019282288566266784, + "loss": 1.5859, + "step": 2777 + }, + { + "epoch": 0.03609884299829609, + "grad_norm": 0.3986349403858185, + "learning_rate": 0.00019282028620075644, + "loss": 1.4152, + "step": 2778 + }, + { + "epoch": 0.036111837542211964, + "grad_norm": 0.32311007380485535, + "learning_rate": 0.00019281768673884506, + "loss": 1.2462, + "step": 2779 + }, + { + "epoch": 0.03612483208612784, + "grad_norm": 0.33841562271118164, + "learning_rate": 0.0001928150872769337, + "loss": 1.2382, + "step": 2780 + }, + { + "epoch": 0.03613782663004371, + "grad_norm": 0.30147987604141235, + "learning_rate": 0.00019281248781502231, + "loss": 1.3171, + "step": 2781 + }, + { + "epoch": 0.03615082117395958, + "grad_norm": 0.2974450886249542, + "learning_rate": 0.0001928098883531109, + "loss": 1.2797, + "step": 2782 + }, + { + "epoch": 0.036163815717875455, + "grad_norm": 0.452944815158844, + "learning_rate": 0.00019280728889119954, + "loss": 1.5388, + "step": 2783 + }, + { + "epoch": 0.03617681026179133, + "grad_norm": 0.35432595014572144, + "learning_rate": 0.00019280468942928816, + "loss": 1.4834, + "step": 2784 + }, + { + "epoch": 0.0361898048057072, + "grad_norm": 0.3833582401275635, + "learning_rate": 0.00019280208996737676, + "loss": 1.4581, + "step": 2785 + }, + { + "epoch": 0.036202799349623074, + "grad_norm": 0.438627153635025, + "learning_rate": 0.00019279949050546538, + "loss": 1.5574, + "step": 2786 + }, + { + "epoch": 0.03621579389353895, + "grad_norm": 0.4910653233528137, + "learning_rate": 0.00019279689104355398, + "loss": 1.2658, + "step": 2787 + }, + { + "epoch": 0.03622878843745482, + "grad_norm": 0.4092201292514801, + "learning_rate": 0.0001927942915816426, + "loss": 1.4815, + "step": 2788 + }, + { + "epoch": 0.0362417829813707, + "grad_norm": 0.36961793899536133, + "learning_rate": 0.00019279169211973123, + "loss": 1.4692, + "step": 2789 + }, + { + "epoch": 0.03625477752528657, + "grad_norm": 0.3870302736759186, + "learning_rate": 0.00019278909265781983, + "loss": 1.3644, + "step": 2790 + }, + { + "epoch": 0.036267772069202446, + "grad_norm": 0.4005250036716461, + "learning_rate": 0.00019278649319590845, + "loss": 1.4207, + "step": 2791 + }, + { + "epoch": 0.03628076661311832, + "grad_norm": 0.33879703283309937, + "learning_rate": 0.00019278389373399707, + "loss": 1.4814, + "step": 2792 + }, + { + "epoch": 0.03629376115703419, + "grad_norm": 0.39506030082702637, + "learning_rate": 0.0001927812942720857, + "loss": 1.4696, + "step": 2793 + }, + { + "epoch": 0.036306755700950065, + "grad_norm": 0.42061692476272583, + "learning_rate": 0.0001927786948101743, + "loss": 1.4733, + "step": 2794 + }, + { + "epoch": 0.03631975024486594, + "grad_norm": 0.42268291115760803, + "learning_rate": 0.00019277609534826292, + "loss": 1.4487, + "step": 2795 + }, + { + "epoch": 0.03633274478878181, + "grad_norm": 0.3141692876815796, + "learning_rate": 0.00019277349588635155, + "loss": 1.3469, + "step": 2796 + }, + { + "epoch": 0.036345739332697684, + "grad_norm": 0.3309340476989746, + "learning_rate": 0.00019277089642444014, + "loss": 1.0896, + "step": 2797 + }, + { + "epoch": 0.03635873387661356, + "grad_norm": 0.39377573132514954, + "learning_rate": 0.00019276829696252877, + "loss": 1.3871, + "step": 2798 + }, + { + "epoch": 0.03637172842052943, + "grad_norm": 0.36125418543815613, + "learning_rate": 0.00019276569750061736, + "loss": 1.4044, + "step": 2799 + }, + { + "epoch": 0.0363847229644453, + "grad_norm": 0.37107643485069275, + "learning_rate": 0.000192763098038706, + "loss": 1.4066, + "step": 2800 + }, + { + "epoch": 0.036397717508361176, + "grad_norm": 0.514762818813324, + "learning_rate": 0.00019276049857679461, + "loss": 1.4408, + "step": 2801 + }, + { + "epoch": 0.03641071205227705, + "grad_norm": 0.4846345782279968, + "learning_rate": 0.0001927578991148832, + "loss": 1.6813, + "step": 2802 + }, + { + "epoch": 0.03642370659619292, + "grad_norm": 0.36676642298698425, + "learning_rate": 0.00019275529965297186, + "loss": 1.4402, + "step": 2803 + }, + { + "epoch": 0.036436701140108795, + "grad_norm": 0.38358089327812195, + "learning_rate": 0.00019275270019106046, + "loss": 1.4207, + "step": 2804 + }, + { + "epoch": 0.03644969568402467, + "grad_norm": 0.43707358837127686, + "learning_rate": 0.00019275010072914908, + "loss": 1.6785, + "step": 2805 + }, + { + "epoch": 0.03646269022794054, + "grad_norm": 0.393779993057251, + "learning_rate": 0.00019274750126723768, + "loss": 1.428, + "step": 2806 + }, + { + "epoch": 0.03647568477185641, + "grad_norm": 0.3945956528186798, + "learning_rate": 0.0001927449018053263, + "loss": 1.3964, + "step": 2807 + }, + { + "epoch": 0.03648867931577229, + "grad_norm": 0.2623893618583679, + "learning_rate": 0.00019274230234341493, + "loss": 1.4994, + "step": 2808 + }, + { + "epoch": 0.036501673859688166, + "grad_norm": 0.3510804772377014, + "learning_rate": 0.00019273970288150353, + "loss": 1.271, + "step": 2809 + }, + { + "epoch": 0.03651466840360404, + "grad_norm": 0.4407918453216553, + "learning_rate": 0.00019273710341959215, + "loss": 1.3439, + "step": 2810 + }, + { + "epoch": 0.03652766294751991, + "grad_norm": 0.4164028465747833, + "learning_rate": 0.00019273450395768078, + "loss": 1.4009, + "step": 2811 + }, + { + "epoch": 0.036540657491435785, + "grad_norm": 0.38145989179611206, + "learning_rate": 0.0001927319044957694, + "loss": 1.3704, + "step": 2812 + }, + { + "epoch": 0.03655365203535166, + "grad_norm": 0.3335427939891815, + "learning_rate": 0.000192729305033858, + "loss": 1.3729, + "step": 2813 + }, + { + "epoch": 0.03656664657926753, + "grad_norm": 0.37175679206848145, + "learning_rate": 0.0001927267055719466, + "loss": 1.4238, + "step": 2814 + }, + { + "epoch": 0.036579641123183404, + "grad_norm": 0.34029579162597656, + "learning_rate": 0.00019272410611003525, + "loss": 1.3745, + "step": 2815 + }, + { + "epoch": 0.03659263566709928, + "grad_norm": 0.44117608666419983, + "learning_rate": 0.00019272150664812385, + "loss": 1.3927, + "step": 2816 + }, + { + "epoch": 0.03660563021101515, + "grad_norm": 0.37611329555511475, + "learning_rate": 0.00019271890718621247, + "loss": 1.3691, + "step": 2817 + }, + { + "epoch": 0.03661862475493102, + "grad_norm": 0.3684469163417816, + "learning_rate": 0.00019271630772430107, + "loss": 1.3714, + "step": 2818 + }, + { + "epoch": 0.036631619298846896, + "grad_norm": 0.41082513332366943, + "learning_rate": 0.0001927137082623897, + "loss": 1.4639, + "step": 2819 + }, + { + "epoch": 0.03664461384276277, + "grad_norm": 0.4682472348213196, + "learning_rate": 0.00019271110880047832, + "loss": 1.3318, + "step": 2820 + }, + { + "epoch": 0.03665760838667864, + "grad_norm": 0.34688398241996765, + "learning_rate": 0.00019270850933856691, + "loss": 1.3724, + "step": 2821 + }, + { + "epoch": 0.036670602930594515, + "grad_norm": 0.38562142848968506, + "learning_rate": 0.00019270590987665554, + "loss": 1.3346, + "step": 2822 + }, + { + "epoch": 0.03668359747451039, + "grad_norm": 0.3660123646259308, + "learning_rate": 0.00019270331041474416, + "loss": 1.5847, + "step": 2823 + }, + { + "epoch": 0.03669659201842626, + "grad_norm": 0.3025285303592682, + "learning_rate": 0.0001927007109528328, + "loss": 1.577, + "step": 2824 + }, + { + "epoch": 0.036709586562342134, + "grad_norm": 0.35528770089149475, + "learning_rate": 0.00019269811149092138, + "loss": 1.3957, + "step": 2825 + }, + { + "epoch": 0.03672258110625801, + "grad_norm": 0.3572424352169037, + "learning_rate": 0.00019269551202900998, + "loss": 1.5052, + "step": 2826 + }, + { + "epoch": 0.036735575650173886, + "grad_norm": 0.30051520466804504, + "learning_rate": 0.00019269291256709863, + "loss": 1.4398, + "step": 2827 + }, + { + "epoch": 0.03674857019408976, + "grad_norm": 0.42542964220046997, + "learning_rate": 0.00019269031310518723, + "loss": 1.4406, + "step": 2828 + }, + { + "epoch": 0.03676156473800563, + "grad_norm": 0.35860592126846313, + "learning_rate": 0.00019268771364327586, + "loss": 1.3644, + "step": 2829 + }, + { + "epoch": 0.036774559281921505, + "grad_norm": 0.43359726667404175, + "learning_rate": 0.00019268511418136445, + "loss": 1.4387, + "step": 2830 + }, + { + "epoch": 0.03678755382583738, + "grad_norm": 0.3515236973762512, + "learning_rate": 0.00019268251471945308, + "loss": 1.3023, + "step": 2831 + }, + { + "epoch": 0.03680054836975325, + "grad_norm": 0.28295955061912537, + "learning_rate": 0.0001926799152575417, + "loss": 1.2749, + "step": 2832 + }, + { + "epoch": 0.036813542913669124, + "grad_norm": 0.36652517318725586, + "learning_rate": 0.0001926773157956303, + "loss": 1.4286, + "step": 2833 + }, + { + "epoch": 0.036826537457585, + "grad_norm": 0.3952235281467438, + "learning_rate": 0.00019267471633371892, + "loss": 1.5777, + "step": 2834 + }, + { + "epoch": 0.03683953200150087, + "grad_norm": 0.31595343351364136, + "learning_rate": 0.00019267211687180755, + "loss": 1.4189, + "step": 2835 + }, + { + "epoch": 0.03685252654541674, + "grad_norm": 0.33102521300315857, + "learning_rate": 0.00019266951740989617, + "loss": 1.394, + "step": 2836 + }, + { + "epoch": 0.036865521089332616, + "grad_norm": 0.2974259555339813, + "learning_rate": 0.00019266691794798477, + "loss": 1.357, + "step": 2837 + }, + { + "epoch": 0.03687851563324849, + "grad_norm": 0.6384289264678955, + "learning_rate": 0.0001926643184860734, + "loss": 1.4339, + "step": 2838 + }, + { + "epoch": 0.03689151017716436, + "grad_norm": 0.4564565122127533, + "learning_rate": 0.00019266171902416202, + "loss": 1.5453, + "step": 2839 + }, + { + "epoch": 0.036904504721080235, + "grad_norm": 0.38966381549835205, + "learning_rate": 0.00019265911956225062, + "loss": 1.5627, + "step": 2840 + }, + { + "epoch": 0.03691749926499611, + "grad_norm": 0.4062577188014984, + "learning_rate": 0.00019265652010033924, + "loss": 1.4098, + "step": 2841 + }, + { + "epoch": 0.03693049380891198, + "grad_norm": 0.29419323801994324, + "learning_rate": 0.00019265392063842787, + "loss": 1.2312, + "step": 2842 + }, + { + "epoch": 0.036943488352827854, + "grad_norm": 0.4806249737739563, + "learning_rate": 0.00019265132117651646, + "loss": 1.2879, + "step": 2843 + }, + { + "epoch": 0.03695648289674373, + "grad_norm": 0.32719606161117554, + "learning_rate": 0.0001926487217146051, + "loss": 1.4757, + "step": 2844 + }, + { + "epoch": 0.0369694774406596, + "grad_norm": 0.4196498394012451, + "learning_rate": 0.00019264612225269368, + "loss": 1.2361, + "step": 2845 + }, + { + "epoch": 0.03698247198457548, + "grad_norm": 0.5088819861412048, + "learning_rate": 0.00019264352279078234, + "loss": 1.5954, + "step": 2846 + }, + { + "epoch": 0.03699546652849135, + "grad_norm": 0.378284215927124, + "learning_rate": 0.00019264092332887093, + "loss": 1.3907, + "step": 2847 + }, + { + "epoch": 0.037008461072407225, + "grad_norm": 0.442389577627182, + "learning_rate": 0.00019263832386695956, + "loss": 1.5548, + "step": 2848 + }, + { + "epoch": 0.0370214556163231, + "grad_norm": 0.39782026410102844, + "learning_rate": 0.00019263572440504816, + "loss": 1.4675, + "step": 2849 + }, + { + "epoch": 0.03703445016023897, + "grad_norm": 0.43371039628982544, + "learning_rate": 0.00019263312494313678, + "loss": 1.4222, + "step": 2850 + }, + { + "epoch": 0.037047444704154844, + "grad_norm": 0.41124409437179565, + "learning_rate": 0.0001926305254812254, + "loss": 1.4427, + "step": 2851 + }, + { + "epoch": 0.03706043924807072, + "grad_norm": 0.3725537061691284, + "learning_rate": 0.000192627926019314, + "loss": 1.4465, + "step": 2852 + }, + { + "epoch": 0.03707343379198659, + "grad_norm": 0.3172045648097992, + "learning_rate": 0.00019262532655740263, + "loss": 1.3537, + "step": 2853 + }, + { + "epoch": 0.03708642833590246, + "grad_norm": 0.35795867443084717, + "learning_rate": 0.00019262272709549125, + "loss": 1.7009, + "step": 2854 + }, + { + "epoch": 0.037099422879818336, + "grad_norm": 0.43789443373680115, + "learning_rate": 0.00019262012763357985, + "loss": 1.6143, + "step": 2855 + }, + { + "epoch": 0.03711241742373421, + "grad_norm": 0.33214282989501953, + "learning_rate": 0.00019261752817166847, + "loss": 1.4922, + "step": 2856 + }, + { + "epoch": 0.03712541196765008, + "grad_norm": 0.4682319760322571, + "learning_rate": 0.00019261492870975707, + "loss": 1.6605, + "step": 2857 + }, + { + "epoch": 0.037138406511565955, + "grad_norm": 0.48196035623550415, + "learning_rate": 0.00019261232924784572, + "loss": 1.5506, + "step": 2858 + }, + { + "epoch": 0.03715140105548183, + "grad_norm": 0.4416365325450897, + "learning_rate": 0.00019260972978593432, + "loss": 1.7541, + "step": 2859 + }, + { + "epoch": 0.0371643955993977, + "grad_norm": 0.41896694898605347, + "learning_rate": 0.00019260713032402294, + "loss": 1.4409, + "step": 2860 + }, + { + "epoch": 0.037177390143313574, + "grad_norm": 0.35514992475509644, + "learning_rate": 0.00019260453086211154, + "loss": 1.421, + "step": 2861 + }, + { + "epoch": 0.03719038468722945, + "grad_norm": 0.2940196692943573, + "learning_rate": 0.00019260193140020017, + "loss": 1.2734, + "step": 2862 + }, + { + "epoch": 0.03720337923114532, + "grad_norm": 0.3980550467967987, + "learning_rate": 0.0001925993319382888, + "loss": 1.7478, + "step": 2863 + }, + { + "epoch": 0.03721637377506119, + "grad_norm": 0.4403688907623291, + "learning_rate": 0.0001925967324763774, + "loss": 1.5199, + "step": 2864 + }, + { + "epoch": 0.03722936831897707, + "grad_norm": 0.38370171189308167, + "learning_rate": 0.000192594133014466, + "loss": 1.4741, + "step": 2865 + }, + { + "epoch": 0.037242362862892946, + "grad_norm": 0.3681994080543518, + "learning_rate": 0.00019259153355255464, + "loss": 1.4456, + "step": 2866 + }, + { + "epoch": 0.03725535740680882, + "grad_norm": 0.35437920689582825, + "learning_rate": 0.00019258893409064326, + "loss": 1.5705, + "step": 2867 + }, + { + "epoch": 0.03726835195072469, + "grad_norm": 0.49699491262435913, + "learning_rate": 0.00019258633462873186, + "loss": 1.4665, + "step": 2868 + }, + { + "epoch": 0.037281346494640565, + "grad_norm": 0.356911301612854, + "learning_rate": 0.00019258373516682046, + "loss": 1.4914, + "step": 2869 + }, + { + "epoch": 0.03729434103855644, + "grad_norm": 0.26467427611351013, + "learning_rate": 0.0001925811357049091, + "loss": 1.2276, + "step": 2870 + }, + { + "epoch": 0.03730733558247231, + "grad_norm": 0.4024367034435272, + "learning_rate": 0.0001925785362429977, + "loss": 1.476, + "step": 2871 + }, + { + "epoch": 0.03732033012638818, + "grad_norm": 0.4283798336982727, + "learning_rate": 0.00019257593678108633, + "loss": 1.4843, + "step": 2872 + }, + { + "epoch": 0.037333324670304056, + "grad_norm": 0.38268956542015076, + "learning_rate": 0.00019257333731917493, + "loss": 1.3179, + "step": 2873 + }, + { + "epoch": 0.03734631921421993, + "grad_norm": 0.4232378602027893, + "learning_rate": 0.00019257073785726355, + "loss": 1.5569, + "step": 2874 + }, + { + "epoch": 0.0373593137581358, + "grad_norm": 0.4544670581817627, + "learning_rate": 0.00019256813839535218, + "loss": 1.5626, + "step": 2875 + }, + { + "epoch": 0.037372308302051675, + "grad_norm": 0.34140169620513916, + "learning_rate": 0.00019256553893344077, + "loss": 1.4478, + "step": 2876 + }, + { + "epoch": 0.03738530284596755, + "grad_norm": 0.42812231183052063, + "learning_rate": 0.00019256293947152942, + "loss": 1.4363, + "step": 2877 + }, + { + "epoch": 0.03739829738988342, + "grad_norm": 0.3961136043071747, + "learning_rate": 0.00019256034000961802, + "loss": 1.3606, + "step": 2878 + }, + { + "epoch": 0.037411291933799294, + "grad_norm": 0.28746384382247925, + "learning_rate": 0.00019255774054770665, + "loss": 1.0565, + "step": 2879 + }, + { + "epoch": 0.03742428647771517, + "grad_norm": 0.37665361166000366, + "learning_rate": 0.00019255514108579524, + "loss": 1.5813, + "step": 2880 + }, + { + "epoch": 0.03743728102163104, + "grad_norm": 0.47615721821784973, + "learning_rate": 0.00019255254162388387, + "loss": 1.3915, + "step": 2881 + }, + { + "epoch": 0.03745027556554691, + "grad_norm": 0.40794748067855835, + "learning_rate": 0.0001925499421619725, + "loss": 1.5256, + "step": 2882 + }, + { + "epoch": 0.037463270109462786, + "grad_norm": 0.39038363099098206, + "learning_rate": 0.0001925473427000611, + "loss": 1.3619, + "step": 2883 + }, + { + "epoch": 0.037476264653378666, + "grad_norm": 0.4532313644886017, + "learning_rate": 0.00019254474323814971, + "loss": 1.5355, + "step": 2884 + }, + { + "epoch": 0.03748925919729454, + "grad_norm": 0.41911494731903076, + "learning_rate": 0.00019254214377623834, + "loss": 1.6976, + "step": 2885 + }, + { + "epoch": 0.03750225374121041, + "grad_norm": 0.45028582215309143, + "learning_rate": 0.00019253954431432694, + "loss": 1.5581, + "step": 2886 + }, + { + "epoch": 0.037515248285126285, + "grad_norm": 0.4192914664745331, + "learning_rate": 0.00019253694485241556, + "loss": 1.3167, + "step": 2887 + }, + { + "epoch": 0.03752824282904216, + "grad_norm": 0.38193854689598083, + "learning_rate": 0.00019253434539050416, + "loss": 1.4941, + "step": 2888 + }, + { + "epoch": 0.03754123737295803, + "grad_norm": 0.3629629611968994, + "learning_rate": 0.0001925317459285928, + "loss": 1.4815, + "step": 2889 + }, + { + "epoch": 0.037554231916873904, + "grad_norm": 0.33992719650268555, + "learning_rate": 0.0001925291464666814, + "loss": 1.5113, + "step": 2890 + }, + { + "epoch": 0.03756722646078978, + "grad_norm": 0.3796389400959015, + "learning_rate": 0.00019252654700477003, + "loss": 1.4956, + "step": 2891 + }, + { + "epoch": 0.03758022100470565, + "grad_norm": 0.3908306658267975, + "learning_rate": 0.00019252394754285863, + "loss": 1.4478, + "step": 2892 + }, + { + "epoch": 0.03759321554862152, + "grad_norm": 0.34537550806999207, + "learning_rate": 0.00019252134808094725, + "loss": 1.4948, + "step": 2893 + }, + { + "epoch": 0.037606210092537395, + "grad_norm": 0.43783748149871826, + "learning_rate": 0.00019251874861903588, + "loss": 1.3109, + "step": 2894 + }, + { + "epoch": 0.03761920463645327, + "grad_norm": 0.30554085969924927, + "learning_rate": 0.00019251614915712448, + "loss": 1.5971, + "step": 2895 + }, + { + "epoch": 0.03763219918036914, + "grad_norm": 0.38141849637031555, + "learning_rate": 0.0001925135496952131, + "loss": 1.5607, + "step": 2896 + }, + { + "epoch": 0.037645193724285014, + "grad_norm": 0.3407510817050934, + "learning_rate": 0.00019251095023330172, + "loss": 1.5818, + "step": 2897 + }, + { + "epoch": 0.03765818826820089, + "grad_norm": 0.39207011461257935, + "learning_rate": 0.00019250835077139032, + "loss": 1.4283, + "step": 2898 + }, + { + "epoch": 0.03767118281211676, + "grad_norm": 0.34534555673599243, + "learning_rate": 0.00019250575130947895, + "loss": 1.4144, + "step": 2899 + }, + { + "epoch": 0.03768417735603263, + "grad_norm": 0.4672044515609741, + "learning_rate": 0.00019250315184756754, + "loss": 1.7198, + "step": 2900 + }, + { + "epoch": 0.037697171899948506, + "grad_norm": 0.36498475074768066, + "learning_rate": 0.0001925005523856562, + "loss": 1.3947, + "step": 2901 + }, + { + "epoch": 0.03771016644386438, + "grad_norm": 0.4158748984336853, + "learning_rate": 0.0001924979529237448, + "loss": 1.5405, + "step": 2902 + }, + { + "epoch": 0.03772316098778026, + "grad_norm": 0.3219645619392395, + "learning_rate": 0.00019249535346183342, + "loss": 1.5222, + "step": 2903 + }, + { + "epoch": 0.03773615553169613, + "grad_norm": 0.38233116269111633, + "learning_rate": 0.00019249275399992201, + "loss": 1.3962, + "step": 2904 + }, + { + "epoch": 0.037749150075612005, + "grad_norm": 0.35942554473876953, + "learning_rate": 0.00019249015453801064, + "loss": 1.3824, + "step": 2905 + }, + { + "epoch": 0.03776214461952788, + "grad_norm": 0.36550280451774597, + "learning_rate": 0.00019248755507609926, + "loss": 1.48, + "step": 2906 + }, + { + "epoch": 0.03777513916344375, + "grad_norm": 0.4292823374271393, + "learning_rate": 0.00019248495561418786, + "loss": 1.5711, + "step": 2907 + }, + { + "epoch": 0.037788133707359624, + "grad_norm": 0.4042067229747772, + "learning_rate": 0.00019248235615227648, + "loss": 1.6332, + "step": 2908 + }, + { + "epoch": 0.0378011282512755, + "grad_norm": 0.31133267283439636, + "learning_rate": 0.0001924797566903651, + "loss": 1.3715, + "step": 2909 + }, + { + "epoch": 0.03781412279519137, + "grad_norm": 0.4283090829849243, + "learning_rate": 0.0001924771572284537, + "loss": 1.2436, + "step": 2910 + }, + { + "epoch": 0.03782711733910724, + "grad_norm": 0.4492538869380951, + "learning_rate": 0.00019247455776654233, + "loss": 1.5023, + "step": 2911 + }, + { + "epoch": 0.037840111883023116, + "grad_norm": 0.37573447823524475, + "learning_rate": 0.00019247195830463096, + "loss": 1.4719, + "step": 2912 + }, + { + "epoch": 0.03785310642693899, + "grad_norm": 0.3890881836414337, + "learning_rate": 0.00019246935884271958, + "loss": 1.3534, + "step": 2913 + }, + { + "epoch": 0.03786610097085486, + "grad_norm": 0.3864831030368805, + "learning_rate": 0.00019246675938080818, + "loss": 1.3381, + "step": 2914 + }, + { + "epoch": 0.037879095514770734, + "grad_norm": 0.3690125644207001, + "learning_rate": 0.0001924641599188968, + "loss": 1.4198, + "step": 2915 + }, + { + "epoch": 0.03789209005868661, + "grad_norm": 0.3906749188899994, + "learning_rate": 0.00019246156045698543, + "loss": 1.5224, + "step": 2916 + }, + { + "epoch": 0.03790508460260248, + "grad_norm": 0.49216577410697937, + "learning_rate": 0.00019245896099507402, + "loss": 1.4364, + "step": 2917 + }, + { + "epoch": 0.03791807914651835, + "grad_norm": 0.3060561418533325, + "learning_rate": 0.00019245636153316265, + "loss": 1.5562, + "step": 2918 + }, + { + "epoch": 0.037931073690434226, + "grad_norm": 0.37658512592315674, + "learning_rate": 0.00019245376207125125, + "loss": 1.4912, + "step": 2919 + }, + { + "epoch": 0.0379440682343501, + "grad_norm": 0.3828648030757904, + "learning_rate": 0.0001924511626093399, + "loss": 1.3431, + "step": 2920 + }, + { + "epoch": 0.03795706277826597, + "grad_norm": 0.45422253012657166, + "learning_rate": 0.0001924485631474285, + "loss": 1.5319, + "step": 2921 + }, + { + "epoch": 0.03797005732218185, + "grad_norm": 0.4186297357082367, + "learning_rate": 0.0001924459636855171, + "loss": 1.3389, + "step": 2922 + }, + { + "epoch": 0.037983051866097725, + "grad_norm": 0.4232546091079712, + "learning_rate": 0.00019244336422360572, + "loss": 1.3911, + "step": 2923 + }, + { + "epoch": 0.0379960464100136, + "grad_norm": 0.33594444394111633, + "learning_rate": 0.00019244076476169434, + "loss": 1.5508, + "step": 2924 + }, + { + "epoch": 0.03800904095392947, + "grad_norm": 0.37918147444725037, + "learning_rate": 0.00019243816529978297, + "loss": 1.4947, + "step": 2925 + }, + { + "epoch": 0.038022035497845344, + "grad_norm": 0.3648119866847992, + "learning_rate": 0.00019243556583787156, + "loss": 1.4514, + "step": 2926 + }, + { + "epoch": 0.03803503004176122, + "grad_norm": 0.3229885697364807, + "learning_rate": 0.0001924329663759602, + "loss": 1.3821, + "step": 2927 + }, + { + "epoch": 0.03804802458567709, + "grad_norm": 0.3869847357273102, + "learning_rate": 0.0001924303669140488, + "loss": 1.3408, + "step": 2928 + }, + { + "epoch": 0.03806101912959296, + "grad_norm": 0.3429639935493469, + "learning_rate": 0.0001924277674521374, + "loss": 1.4586, + "step": 2929 + }, + { + "epoch": 0.038074013673508836, + "grad_norm": 0.3165348768234253, + "learning_rate": 0.00019242516799022603, + "loss": 1.3305, + "step": 2930 + }, + { + "epoch": 0.03808700821742471, + "grad_norm": 0.42802226543426514, + "learning_rate": 0.00019242256852831463, + "loss": 1.4054, + "step": 2931 + }, + { + "epoch": 0.03810000276134058, + "grad_norm": 0.3656299412250519, + "learning_rate": 0.00019241996906640328, + "loss": 1.604, + "step": 2932 + }, + { + "epoch": 0.038112997305256455, + "grad_norm": 0.38018178939819336, + "learning_rate": 0.00019241736960449188, + "loss": 1.5177, + "step": 2933 + }, + { + "epoch": 0.03812599184917233, + "grad_norm": 0.4416995048522949, + "learning_rate": 0.0001924147701425805, + "loss": 1.4017, + "step": 2934 + }, + { + "epoch": 0.0381389863930882, + "grad_norm": 0.4500894844532013, + "learning_rate": 0.0001924121706806691, + "loss": 1.4532, + "step": 2935 + }, + { + "epoch": 0.038151980937004074, + "grad_norm": 0.27240195870399475, + "learning_rate": 0.00019240957121875773, + "loss": 1.4876, + "step": 2936 + }, + { + "epoch": 0.038164975480919947, + "grad_norm": 0.3285284638404846, + "learning_rate": 0.00019240697175684635, + "loss": 1.5121, + "step": 2937 + }, + { + "epoch": 0.03817797002483582, + "grad_norm": 0.37454378604888916, + "learning_rate": 0.00019240437229493495, + "loss": 1.2427, + "step": 2938 + }, + { + "epoch": 0.03819096456875169, + "grad_norm": 0.375453919172287, + "learning_rate": 0.00019240177283302357, + "loss": 1.3329, + "step": 2939 + }, + { + "epoch": 0.038203959112667565, + "grad_norm": 0.34264907240867615, + "learning_rate": 0.0001923991733711122, + "loss": 1.4839, + "step": 2940 + }, + { + "epoch": 0.038216953656583445, + "grad_norm": 0.3332982659339905, + "learning_rate": 0.0001923965739092008, + "loss": 1.6241, + "step": 2941 + }, + { + "epoch": 0.03822994820049932, + "grad_norm": 0.4663456082344055, + "learning_rate": 0.00019239397444728942, + "loss": 1.4251, + "step": 2942 + }, + { + "epoch": 0.03824294274441519, + "grad_norm": 0.29390889406204224, + "learning_rate": 0.00019239137498537802, + "loss": 1.3889, + "step": 2943 + }, + { + "epoch": 0.038255937288331064, + "grad_norm": 0.3493329584598541, + "learning_rate": 0.00019238877552346667, + "loss": 1.2042, + "step": 2944 + }, + { + "epoch": 0.03826893183224694, + "grad_norm": 0.41765937209129333, + "learning_rate": 0.00019238617606155527, + "loss": 1.4362, + "step": 2945 + }, + { + "epoch": 0.03828192637616281, + "grad_norm": 0.4229629635810852, + "learning_rate": 0.0001923835765996439, + "loss": 1.5947, + "step": 2946 + }, + { + "epoch": 0.03829492092007868, + "grad_norm": 0.39950308203697205, + "learning_rate": 0.0001923809771377325, + "loss": 1.3957, + "step": 2947 + }, + { + "epoch": 0.038307915463994556, + "grad_norm": 0.3801060616970062, + "learning_rate": 0.0001923783776758211, + "loss": 1.0505, + "step": 2948 + }, + { + "epoch": 0.03832091000791043, + "grad_norm": 0.3297862708568573, + "learning_rate": 0.00019237577821390974, + "loss": 1.3285, + "step": 2949 + }, + { + "epoch": 0.0383339045518263, + "grad_norm": 0.42620059847831726, + "learning_rate": 0.00019237317875199833, + "loss": 1.5758, + "step": 2950 + }, + { + "epoch": 0.038346899095742175, + "grad_norm": 0.40891823172569275, + "learning_rate": 0.00019237057929008699, + "loss": 1.4526, + "step": 2951 + }, + { + "epoch": 0.03835989363965805, + "grad_norm": 0.35171499848365784, + "learning_rate": 0.00019236797982817558, + "loss": 1.4504, + "step": 2952 + }, + { + "epoch": 0.03837288818357392, + "grad_norm": 0.30551573634147644, + "learning_rate": 0.00019236538036626418, + "loss": 1.4851, + "step": 2953 + }, + { + "epoch": 0.038385882727489794, + "grad_norm": 0.3448511064052582, + "learning_rate": 0.0001923627809043528, + "loss": 1.3106, + "step": 2954 + }, + { + "epoch": 0.03839887727140567, + "grad_norm": 0.4777507483959198, + "learning_rate": 0.00019236018144244143, + "loss": 1.6141, + "step": 2955 + }, + { + "epoch": 0.03841187181532154, + "grad_norm": 0.3326804041862488, + "learning_rate": 0.00019235758198053005, + "loss": 1.2672, + "step": 2956 + }, + { + "epoch": 0.03842486635923741, + "grad_norm": 0.4908417761325836, + "learning_rate": 0.00019235498251861865, + "loss": 1.6664, + "step": 2957 + }, + { + "epoch": 0.038437860903153286, + "grad_norm": 0.36089977622032166, + "learning_rate": 0.00019235238305670728, + "loss": 1.3454, + "step": 2958 + }, + { + "epoch": 0.03845085544706916, + "grad_norm": 0.3847794830799103, + "learning_rate": 0.0001923497835947959, + "loss": 1.4589, + "step": 2959 + }, + { + "epoch": 0.03846384999098504, + "grad_norm": 0.24992746114730835, + "learning_rate": 0.0001923471841328845, + "loss": 1.3842, + "step": 2960 + }, + { + "epoch": 0.03847684453490091, + "grad_norm": 0.38662829995155334, + "learning_rate": 0.00019234458467097312, + "loss": 1.4272, + "step": 2961 + }, + { + "epoch": 0.038489839078816784, + "grad_norm": 0.46076446771621704, + "learning_rate": 0.00019234198520906172, + "loss": 1.4969, + "step": 2962 + }, + { + "epoch": 0.03850283362273266, + "grad_norm": 0.30887776613235474, + "learning_rate": 0.00019233938574715037, + "loss": 1.4812, + "step": 2963 + }, + { + "epoch": 0.03851582816664853, + "grad_norm": 0.37549760937690735, + "learning_rate": 0.00019233678628523897, + "loss": 1.4241, + "step": 2964 + }, + { + "epoch": 0.0385288227105644, + "grad_norm": 0.3620559275150299, + "learning_rate": 0.00019233418682332757, + "loss": 1.5063, + "step": 2965 + }, + { + "epoch": 0.038541817254480276, + "grad_norm": 0.35685279965400696, + "learning_rate": 0.0001923315873614162, + "loss": 1.458, + "step": 2966 + }, + { + "epoch": 0.03855481179839615, + "grad_norm": 0.3742183744907379, + "learning_rate": 0.00019232898789950481, + "loss": 1.5527, + "step": 2967 + }, + { + "epoch": 0.03856780634231202, + "grad_norm": 0.3448207378387451, + "learning_rate": 0.00019232638843759344, + "loss": 1.5387, + "step": 2968 + }, + { + "epoch": 0.038580800886227895, + "grad_norm": 0.2890182435512543, + "learning_rate": 0.00019232378897568204, + "loss": 1.4247, + "step": 2969 + }, + { + "epoch": 0.03859379543014377, + "grad_norm": 0.38998499512672424, + "learning_rate": 0.00019232118951377066, + "loss": 1.3513, + "step": 2970 + }, + { + "epoch": 0.03860678997405964, + "grad_norm": 0.31292712688446045, + "learning_rate": 0.00019231859005185929, + "loss": 1.4684, + "step": 2971 + }, + { + "epoch": 0.038619784517975514, + "grad_norm": 0.3543795943260193, + "learning_rate": 0.00019231599058994788, + "loss": 1.6187, + "step": 2972 + }, + { + "epoch": 0.03863277906189139, + "grad_norm": 0.3005315661430359, + "learning_rate": 0.0001923133911280365, + "loss": 1.2564, + "step": 2973 + }, + { + "epoch": 0.03864577360580726, + "grad_norm": 0.40173181891441345, + "learning_rate": 0.0001923107916661251, + "loss": 1.6105, + "step": 2974 + }, + { + "epoch": 0.03865876814972313, + "grad_norm": 0.34406977891921997, + "learning_rate": 0.00019230819220421376, + "loss": 1.473, + "step": 2975 + }, + { + "epoch": 0.038671762693639006, + "grad_norm": 0.3398469090461731, + "learning_rate": 0.00019230559274230235, + "loss": 1.344, + "step": 2976 + }, + { + "epoch": 0.03868475723755488, + "grad_norm": 0.36122235655784607, + "learning_rate": 0.00019230299328039095, + "loss": 1.2835, + "step": 2977 + }, + { + "epoch": 0.03869775178147075, + "grad_norm": 0.38948532938957214, + "learning_rate": 0.00019230039381847958, + "loss": 1.3939, + "step": 2978 + }, + { + "epoch": 0.03871074632538663, + "grad_norm": 0.2958020567893982, + "learning_rate": 0.0001922977943565682, + "loss": 1.326, + "step": 2979 + }, + { + "epoch": 0.038723740869302505, + "grad_norm": 0.336001455783844, + "learning_rate": 0.00019229519489465682, + "loss": 1.4251, + "step": 2980 + }, + { + "epoch": 0.03873673541321838, + "grad_norm": 0.35965749621391296, + "learning_rate": 0.00019229259543274542, + "loss": 1.6125, + "step": 2981 + }, + { + "epoch": 0.03874972995713425, + "grad_norm": 0.3999517261981964, + "learning_rate": 0.00019228999597083405, + "loss": 1.2803, + "step": 2982 + }, + { + "epoch": 0.03876272450105012, + "grad_norm": 0.3699052035808563, + "learning_rate": 0.00019228739650892267, + "loss": 1.689, + "step": 2983 + }, + { + "epoch": 0.038775719044965996, + "grad_norm": 0.42240333557128906, + "learning_rate": 0.00019228479704701127, + "loss": 1.5546, + "step": 2984 + }, + { + "epoch": 0.03878871358888187, + "grad_norm": 0.3998093008995056, + "learning_rate": 0.0001922821975850999, + "loss": 1.5531, + "step": 2985 + }, + { + "epoch": 0.03880170813279774, + "grad_norm": 0.3260725438594818, + "learning_rate": 0.00019227959812318852, + "loss": 1.4899, + "step": 2986 + }, + { + "epoch": 0.038814702676713615, + "grad_norm": 0.3507775664329529, + "learning_rate": 0.00019227699866127714, + "loss": 1.3648, + "step": 2987 + }, + { + "epoch": 0.03882769722062949, + "grad_norm": 0.35958436131477356, + "learning_rate": 0.00019227439919936574, + "loss": 1.5094, + "step": 2988 + }, + { + "epoch": 0.03884069176454536, + "grad_norm": 0.28536874055862427, + "learning_rate": 0.00019227179973745436, + "loss": 1.4603, + "step": 2989 + }, + { + "epoch": 0.038853686308461234, + "grad_norm": 0.5363749861717224, + "learning_rate": 0.000192269200275543, + "loss": 1.4896, + "step": 2990 + }, + { + "epoch": 0.03886668085237711, + "grad_norm": 0.4216437339782715, + "learning_rate": 0.00019226660081363159, + "loss": 1.5247, + "step": 2991 + }, + { + "epoch": 0.03887967539629298, + "grad_norm": 0.2677660584449768, + "learning_rate": 0.0001922640013517202, + "loss": 1.4846, + "step": 2992 + }, + { + "epoch": 0.03889266994020885, + "grad_norm": 0.4548943042755127, + "learning_rate": 0.0001922614018898088, + "loss": 1.6278, + "step": 2993 + }, + { + "epoch": 0.038905664484124726, + "grad_norm": 0.3277578353881836, + "learning_rate": 0.00019225880242789743, + "loss": 1.4106, + "step": 2994 + }, + { + "epoch": 0.0389186590280406, + "grad_norm": 0.3157084286212921, + "learning_rate": 0.00019225620296598606, + "loss": 1.4095, + "step": 2995 + }, + { + "epoch": 0.03893165357195647, + "grad_norm": 0.36140143871307373, + "learning_rate": 0.00019225360350407465, + "loss": 1.4858, + "step": 2996 + }, + { + "epoch": 0.038944648115872345, + "grad_norm": 0.3800543546676636, + "learning_rate": 0.00019225100404216328, + "loss": 1.5974, + "step": 2997 + }, + { + "epoch": 0.038957642659788225, + "grad_norm": 0.36502912640571594, + "learning_rate": 0.0001922484045802519, + "loss": 1.5445, + "step": 2998 + }, + { + "epoch": 0.0389706372037041, + "grad_norm": 0.35446375608444214, + "learning_rate": 0.00019224580511834053, + "loss": 1.4025, + "step": 2999 + }, + { + "epoch": 0.03898363174761997, + "grad_norm": 0.3510723412036896, + "learning_rate": 0.00019224320565642912, + "loss": 1.2342, + "step": 3000 + }, + { + "epoch": 0.038996626291535844, + "grad_norm": 0.31108078360557556, + "learning_rate": 0.00019224060619451775, + "loss": 1.3674, + "step": 3001 + }, + { + "epoch": 0.03900962083545172, + "grad_norm": 0.45004478096961975, + "learning_rate": 0.00019223800673260637, + "loss": 1.585, + "step": 3002 + }, + { + "epoch": 0.03902261537936759, + "grad_norm": 0.372096449136734, + "learning_rate": 0.00019223540727069497, + "loss": 1.451, + "step": 3003 + }, + { + "epoch": 0.03903560992328346, + "grad_norm": 0.3879518508911133, + "learning_rate": 0.0001922328078087836, + "loss": 1.5369, + "step": 3004 + }, + { + "epoch": 0.039048604467199335, + "grad_norm": 0.39250648021698, + "learning_rate": 0.0001922302083468722, + "loss": 1.3992, + "step": 3005 + }, + { + "epoch": 0.03906159901111521, + "grad_norm": 0.4562216103076935, + "learning_rate": 0.00019222760888496082, + "loss": 1.555, + "step": 3006 + }, + { + "epoch": 0.03907459355503108, + "grad_norm": 0.32097911834716797, + "learning_rate": 0.00019222500942304944, + "loss": 1.4113, + "step": 3007 + }, + { + "epoch": 0.039087588098946954, + "grad_norm": 0.3371255099773407, + "learning_rate": 0.00019222240996113804, + "loss": 1.4397, + "step": 3008 + }, + { + "epoch": 0.03910058264286283, + "grad_norm": 0.35715731978416443, + "learning_rate": 0.00019221981049922666, + "loss": 1.4138, + "step": 3009 + }, + { + "epoch": 0.0391135771867787, + "grad_norm": 0.37619277834892273, + "learning_rate": 0.0001922172110373153, + "loss": 1.2273, + "step": 3010 + }, + { + "epoch": 0.03912657173069457, + "grad_norm": 0.36379989981651306, + "learning_rate": 0.0001922146115754039, + "loss": 1.4565, + "step": 3011 + }, + { + "epoch": 0.039139566274610446, + "grad_norm": 0.3899136185646057, + "learning_rate": 0.0001922120121134925, + "loss": 1.5098, + "step": 3012 + }, + { + "epoch": 0.03915256081852632, + "grad_norm": 0.32606860995292664, + "learning_rate": 0.00019220941265158113, + "loss": 1.2806, + "step": 3013 + }, + { + "epoch": 0.03916555536244219, + "grad_norm": 0.5540496706962585, + "learning_rate": 0.00019220681318966976, + "loss": 1.5175, + "step": 3014 + }, + { + "epoch": 0.039178549906358065, + "grad_norm": 0.4048491418361664, + "learning_rate": 0.00019220421372775836, + "loss": 1.4804, + "step": 3015 + }, + { + "epoch": 0.03919154445027394, + "grad_norm": 0.36790144443511963, + "learning_rate": 0.00019220161426584698, + "loss": 1.5979, + "step": 3016 + }, + { + "epoch": 0.03920453899418982, + "grad_norm": 0.5192331671714783, + "learning_rate": 0.00019219901480393558, + "loss": 1.5445, + "step": 3017 + }, + { + "epoch": 0.03921753353810569, + "grad_norm": 0.3224785327911377, + "learning_rate": 0.00019219641534202423, + "loss": 1.2415, + "step": 3018 + }, + { + "epoch": 0.039230528082021564, + "grad_norm": 0.4109537601470947, + "learning_rate": 0.00019219381588011283, + "loss": 1.4474, + "step": 3019 + }, + { + "epoch": 0.03924352262593744, + "grad_norm": 0.4873676896095276, + "learning_rate": 0.00019219121641820142, + "loss": 1.5665, + "step": 3020 + }, + { + "epoch": 0.03925651716985331, + "grad_norm": 0.36733099818229675, + "learning_rate": 0.00019218861695629005, + "loss": 1.4073, + "step": 3021 + }, + { + "epoch": 0.03926951171376918, + "grad_norm": 0.4428819417953491, + "learning_rate": 0.00019218601749437867, + "loss": 1.4578, + "step": 3022 + }, + { + "epoch": 0.039282506257685056, + "grad_norm": 0.41870808601379395, + "learning_rate": 0.0001921834180324673, + "loss": 1.524, + "step": 3023 + }, + { + "epoch": 0.03929550080160093, + "grad_norm": 0.4455831050872803, + "learning_rate": 0.0001921808185705559, + "loss": 1.3535, + "step": 3024 + }, + { + "epoch": 0.0393084953455168, + "grad_norm": 0.4161718189716339, + "learning_rate": 0.00019217821910864452, + "loss": 1.4771, + "step": 3025 + }, + { + "epoch": 0.039321489889432674, + "grad_norm": 0.3113771080970764, + "learning_rate": 0.00019217561964673314, + "loss": 1.3269, + "step": 3026 + }, + { + "epoch": 0.03933448443334855, + "grad_norm": 0.4568064212799072, + "learning_rate": 0.00019217302018482174, + "loss": 1.36, + "step": 3027 + }, + { + "epoch": 0.03934747897726442, + "grad_norm": 0.36629778146743774, + "learning_rate": 0.00019217042072291037, + "loss": 1.4161, + "step": 3028 + }, + { + "epoch": 0.03936047352118029, + "grad_norm": 0.4069276750087738, + "learning_rate": 0.000192167821260999, + "loss": 1.3755, + "step": 3029 + }, + { + "epoch": 0.039373468065096166, + "grad_norm": 0.26498275995254517, + "learning_rate": 0.00019216522179908761, + "loss": 1.1617, + "step": 3030 + }, + { + "epoch": 0.03938646260901204, + "grad_norm": 0.3571862578392029, + "learning_rate": 0.0001921626223371762, + "loss": 1.4352, + "step": 3031 + }, + { + "epoch": 0.03939945715292791, + "grad_norm": 0.3694959580898285, + "learning_rate": 0.0001921600228752648, + "loss": 1.4404, + "step": 3032 + }, + { + "epoch": 0.039412451696843785, + "grad_norm": 0.45220714807510376, + "learning_rate": 0.00019215742341335346, + "loss": 1.5745, + "step": 3033 + }, + { + "epoch": 0.03942544624075966, + "grad_norm": 0.37689951062202454, + "learning_rate": 0.00019215482395144206, + "loss": 1.3817, + "step": 3034 + }, + { + "epoch": 0.03943844078467553, + "grad_norm": 0.29421502351760864, + "learning_rate": 0.00019215222448953068, + "loss": 1.5907, + "step": 3035 + }, + { + "epoch": 0.03945143532859141, + "grad_norm": 0.3483215272426605, + "learning_rate": 0.00019214962502761928, + "loss": 1.4719, + "step": 3036 + }, + { + "epoch": 0.039464429872507284, + "grad_norm": 0.5257629752159119, + "learning_rate": 0.0001921470255657079, + "loss": 1.2928, + "step": 3037 + }, + { + "epoch": 0.03947742441642316, + "grad_norm": 0.42938506603240967, + "learning_rate": 0.00019214442610379653, + "loss": 1.4492, + "step": 3038 + }, + { + "epoch": 0.03949041896033903, + "grad_norm": 0.4321722686290741, + "learning_rate": 0.00019214182664188513, + "loss": 1.4239, + "step": 3039 + }, + { + "epoch": 0.0395034135042549, + "grad_norm": 0.3258979022502899, + "learning_rate": 0.00019213922717997375, + "loss": 1.5091, + "step": 3040 + }, + { + "epoch": 0.039516408048170776, + "grad_norm": 0.35038477182388306, + "learning_rate": 0.00019213662771806238, + "loss": 1.5664, + "step": 3041 + }, + { + "epoch": 0.03952940259208665, + "grad_norm": 0.38343197107315063, + "learning_rate": 0.000192134028256151, + "loss": 1.5419, + "step": 3042 + }, + { + "epoch": 0.03954239713600252, + "grad_norm": 0.317745566368103, + "learning_rate": 0.0001921314287942396, + "loss": 1.5216, + "step": 3043 + }, + { + "epoch": 0.039555391679918395, + "grad_norm": 0.3610340654850006, + "learning_rate": 0.0001921288293323282, + "loss": 1.378, + "step": 3044 + }, + { + "epoch": 0.03956838622383427, + "grad_norm": 0.5125117897987366, + "learning_rate": 0.00019212622987041685, + "loss": 1.5753, + "step": 3045 + }, + { + "epoch": 0.03958138076775014, + "grad_norm": 0.38834917545318604, + "learning_rate": 0.00019212363040850544, + "loss": 1.4358, + "step": 3046 + }, + { + "epoch": 0.039594375311666014, + "grad_norm": 0.39003828167915344, + "learning_rate": 0.00019212103094659407, + "loss": 1.505, + "step": 3047 + }, + { + "epoch": 0.039607369855581886, + "grad_norm": 0.3908798396587372, + "learning_rate": 0.00019211843148468267, + "loss": 1.4697, + "step": 3048 + }, + { + "epoch": 0.03962036439949776, + "grad_norm": 0.33871498703956604, + "learning_rate": 0.0001921158320227713, + "loss": 1.3073, + "step": 3049 + }, + { + "epoch": 0.03963335894341363, + "grad_norm": 0.5763903260231018, + "learning_rate": 0.00019211323256085991, + "loss": 1.5585, + "step": 3050 + }, + { + "epoch": 0.039646353487329505, + "grad_norm": 0.35756200551986694, + "learning_rate": 0.0001921106330989485, + "loss": 1.6285, + "step": 3051 + }, + { + "epoch": 0.03965934803124538, + "grad_norm": 0.29677605628967285, + "learning_rate": 0.00019210803363703714, + "loss": 1.4501, + "step": 3052 + }, + { + "epoch": 0.03967234257516125, + "grad_norm": 0.3514330983161926, + "learning_rate": 0.00019210543417512576, + "loss": 1.3728, + "step": 3053 + }, + { + "epoch": 0.039685337119077124, + "grad_norm": 0.33042457699775696, + "learning_rate": 0.00019210283471321439, + "loss": 1.5604, + "step": 3054 + }, + { + "epoch": 0.039698331662993004, + "grad_norm": 0.369961142539978, + "learning_rate": 0.00019210023525130298, + "loss": 1.4684, + "step": 3055 + }, + { + "epoch": 0.03971132620690888, + "grad_norm": 0.42035093903541565, + "learning_rate": 0.0001920976357893916, + "loss": 1.6284, + "step": 3056 + }, + { + "epoch": 0.03972432075082475, + "grad_norm": 0.44291654229164124, + "learning_rate": 0.00019209503632748023, + "loss": 1.385, + "step": 3057 + }, + { + "epoch": 0.03973731529474062, + "grad_norm": 0.42376652359962463, + "learning_rate": 0.00019209243686556883, + "loss": 1.7623, + "step": 3058 + }, + { + "epoch": 0.039750309838656496, + "grad_norm": 0.47812968492507935, + "learning_rate": 0.00019208983740365745, + "loss": 1.2825, + "step": 3059 + }, + { + "epoch": 0.03976330438257237, + "grad_norm": 0.40783625841140747, + "learning_rate": 0.00019208723794174608, + "loss": 1.4694, + "step": 3060 + }, + { + "epoch": 0.03977629892648824, + "grad_norm": 0.4069369435310364, + "learning_rate": 0.00019208463847983468, + "loss": 1.7472, + "step": 3061 + }, + { + "epoch": 0.039789293470404115, + "grad_norm": 0.33194565773010254, + "learning_rate": 0.0001920820390179233, + "loss": 1.3744, + "step": 3062 + }, + { + "epoch": 0.03980228801431999, + "grad_norm": 0.2969339191913605, + "learning_rate": 0.0001920794395560119, + "loss": 1.5694, + "step": 3063 + }, + { + "epoch": 0.03981528255823586, + "grad_norm": 0.3042122721672058, + "learning_rate": 0.00019207684009410055, + "loss": 1.4554, + "step": 3064 + }, + { + "epoch": 0.039828277102151734, + "grad_norm": 0.3624174892902374, + "learning_rate": 0.00019207424063218915, + "loss": 1.3019, + "step": 3065 + }, + { + "epoch": 0.03984127164606761, + "grad_norm": 0.36166122555732727, + "learning_rate": 0.00019207164117027777, + "loss": 1.4189, + "step": 3066 + }, + { + "epoch": 0.03985426618998348, + "grad_norm": 0.37450551986694336, + "learning_rate": 0.00019206904170836637, + "loss": 1.2247, + "step": 3067 + }, + { + "epoch": 0.03986726073389935, + "grad_norm": 0.44436436891555786, + "learning_rate": 0.000192066442246455, + "loss": 1.4524, + "step": 3068 + }, + { + "epoch": 0.039880255277815226, + "grad_norm": 0.23996558785438538, + "learning_rate": 0.00019206384278454362, + "loss": 1.3004, + "step": 3069 + }, + { + "epoch": 0.0398932498217311, + "grad_norm": 0.3490460515022278, + "learning_rate": 0.00019206124332263221, + "loss": 1.5343, + "step": 3070 + }, + { + "epoch": 0.03990624436564697, + "grad_norm": 0.3472658395767212, + "learning_rate": 0.00019205864386072084, + "loss": 1.5033, + "step": 3071 + }, + { + "epoch": 0.039919238909562844, + "grad_norm": 0.4355182647705078, + "learning_rate": 0.00019205604439880946, + "loss": 1.3936, + "step": 3072 + }, + { + "epoch": 0.03993223345347872, + "grad_norm": 0.41063302755355835, + "learning_rate": 0.0001920534449368981, + "loss": 1.4734, + "step": 3073 + }, + { + "epoch": 0.0399452279973946, + "grad_norm": 0.4038110673427582, + "learning_rate": 0.00019205084547498669, + "loss": 1.6693, + "step": 3074 + }, + { + "epoch": 0.03995822254131047, + "grad_norm": 0.41437745094299316, + "learning_rate": 0.00019204824601307528, + "loss": 1.4717, + "step": 3075 + }, + { + "epoch": 0.03997121708522634, + "grad_norm": 0.33500972390174866, + "learning_rate": 0.00019204564655116393, + "loss": 1.3303, + "step": 3076 + }, + { + "epoch": 0.039984211629142216, + "grad_norm": 0.3980722725391388, + "learning_rate": 0.00019204304708925253, + "loss": 1.4424, + "step": 3077 + }, + { + "epoch": 0.03999720617305809, + "grad_norm": 0.39202064275741577, + "learning_rate": 0.00019204044762734116, + "loss": 1.4207, + "step": 3078 + }, + { + "epoch": 0.04001020071697396, + "grad_norm": 0.448565274477005, + "learning_rate": 0.00019203784816542975, + "loss": 1.3661, + "step": 3079 + }, + { + "epoch": 0.040023195260889835, + "grad_norm": 0.29417359828948975, + "learning_rate": 0.00019203524870351838, + "loss": 1.2719, + "step": 3080 + }, + { + "epoch": 0.04003618980480571, + "grad_norm": 0.37672901153564453, + "learning_rate": 0.000192032649241607, + "loss": 1.4153, + "step": 3081 + }, + { + "epoch": 0.04004918434872158, + "grad_norm": 0.4291362464427948, + "learning_rate": 0.0001920300497796956, + "loss": 1.509, + "step": 3082 + }, + { + "epoch": 0.040062178892637454, + "grad_norm": 0.4519380033016205, + "learning_rate": 0.00019202745031778422, + "loss": 1.5726, + "step": 3083 + }, + { + "epoch": 0.04007517343655333, + "grad_norm": 0.33586302399635315, + "learning_rate": 0.00019202485085587285, + "loss": 1.5352, + "step": 3084 + }, + { + "epoch": 0.0400881679804692, + "grad_norm": 0.379517138004303, + "learning_rate": 0.00019202225139396147, + "loss": 1.5744, + "step": 3085 + }, + { + "epoch": 0.04010116252438507, + "grad_norm": 0.47037217020988464, + "learning_rate": 0.00019201965193205007, + "loss": 1.6572, + "step": 3086 + }, + { + "epoch": 0.040114157068300946, + "grad_norm": 0.3559758961200714, + "learning_rate": 0.00019201705247013867, + "loss": 1.3829, + "step": 3087 + }, + { + "epoch": 0.04012715161221682, + "grad_norm": 0.45659565925598145, + "learning_rate": 0.00019201445300822732, + "loss": 1.6783, + "step": 3088 + }, + { + "epoch": 0.04014014615613269, + "grad_norm": 0.29577526450157166, + "learning_rate": 0.00019201185354631592, + "loss": 1.3802, + "step": 3089 + }, + { + "epoch": 0.040153140700048565, + "grad_norm": 0.4399329125881195, + "learning_rate": 0.00019200925408440454, + "loss": 1.7204, + "step": 3090 + }, + { + "epoch": 0.04016613524396444, + "grad_norm": 0.5615129470825195, + "learning_rate": 0.00019200665462249314, + "loss": 1.4306, + "step": 3091 + }, + { + "epoch": 0.04017912978788031, + "grad_norm": 0.39894187450408936, + "learning_rate": 0.00019200405516058176, + "loss": 1.6013, + "step": 3092 + }, + { + "epoch": 0.04019212433179619, + "grad_norm": 0.49865931272506714, + "learning_rate": 0.0001920014556986704, + "loss": 1.5064, + "step": 3093 + }, + { + "epoch": 0.04020511887571206, + "grad_norm": 0.3973310887813568, + "learning_rate": 0.00019199885623675899, + "loss": 1.5184, + "step": 3094 + }, + { + "epoch": 0.040218113419627936, + "grad_norm": 0.3640269339084625, + "learning_rate": 0.0001919962567748476, + "loss": 1.3856, + "step": 3095 + }, + { + "epoch": 0.04023110796354381, + "grad_norm": 0.3153294324874878, + "learning_rate": 0.00019199365731293623, + "loss": 1.3646, + "step": 3096 + }, + { + "epoch": 0.04024410250745968, + "grad_norm": 0.42748406529426575, + "learning_rate": 0.00019199105785102486, + "loss": 1.3435, + "step": 3097 + }, + { + "epoch": 0.040257097051375555, + "grad_norm": 0.33205223083496094, + "learning_rate": 0.00019198845838911346, + "loss": 1.4495, + "step": 3098 + }, + { + "epoch": 0.04027009159529143, + "grad_norm": 0.3578440248966217, + "learning_rate": 0.00019198585892720208, + "loss": 1.2762, + "step": 3099 + }, + { + "epoch": 0.0402830861392073, + "grad_norm": 0.33170372247695923, + "learning_rate": 0.0001919832594652907, + "loss": 1.3112, + "step": 3100 + }, + { + "epoch": 0.040296080683123174, + "grad_norm": 0.3646683692932129, + "learning_rate": 0.0001919806600033793, + "loss": 1.6309, + "step": 3101 + }, + { + "epoch": 0.04030907522703905, + "grad_norm": 0.4076707363128662, + "learning_rate": 0.00019197806054146793, + "loss": 1.5324, + "step": 3102 + }, + { + "epoch": 0.04032206977095492, + "grad_norm": 0.38818368315696716, + "learning_rate": 0.00019197546107955655, + "loss": 1.4028, + "step": 3103 + }, + { + "epoch": 0.04033506431487079, + "grad_norm": 0.40092992782592773, + "learning_rate": 0.00019197286161764515, + "loss": 1.2861, + "step": 3104 + }, + { + "epoch": 0.040348058858786666, + "grad_norm": 0.3484092950820923, + "learning_rate": 0.00019197026215573377, + "loss": 1.5126, + "step": 3105 + }, + { + "epoch": 0.04036105340270254, + "grad_norm": 0.44686079025268555, + "learning_rate": 0.00019196766269382237, + "loss": 1.4899, + "step": 3106 + }, + { + "epoch": 0.04037404794661841, + "grad_norm": 1.2319358587265015, + "learning_rate": 0.00019196506323191102, + "loss": 1.4878, + "step": 3107 + }, + { + "epoch": 0.040387042490534285, + "grad_norm": 0.401244580745697, + "learning_rate": 0.00019196246376999962, + "loss": 1.3403, + "step": 3108 + }, + { + "epoch": 0.04040003703445016, + "grad_norm": 0.3605179488658905, + "learning_rate": 0.00019195986430808824, + "loss": 1.6123, + "step": 3109 + }, + { + "epoch": 0.04041303157836603, + "grad_norm": 0.449707567691803, + "learning_rate": 0.00019195726484617684, + "loss": 1.4513, + "step": 3110 + }, + { + "epoch": 0.040426026122281904, + "grad_norm": 0.437468022108078, + "learning_rate": 0.00019195466538426547, + "loss": 1.4019, + "step": 3111 + }, + { + "epoch": 0.040439020666197784, + "grad_norm": 0.38820868730545044, + "learning_rate": 0.0001919520659223541, + "loss": 1.3386, + "step": 3112 + }, + { + "epoch": 0.040452015210113657, + "grad_norm": 0.3916557729244232, + "learning_rate": 0.0001919494664604427, + "loss": 1.5633, + "step": 3113 + }, + { + "epoch": 0.04046500975402953, + "grad_norm": 0.2851991653442383, + "learning_rate": 0.0001919468669985313, + "loss": 1.3573, + "step": 3114 + }, + { + "epoch": 0.0404780042979454, + "grad_norm": 0.41225916147232056, + "learning_rate": 0.00019194426753661994, + "loss": 1.5041, + "step": 3115 + }, + { + "epoch": 0.040490998841861275, + "grad_norm": 0.43771931529045105, + "learning_rate": 0.00019194166807470853, + "loss": 1.5264, + "step": 3116 + }, + { + "epoch": 0.04050399338577715, + "grad_norm": 0.4076649844646454, + "learning_rate": 0.00019193906861279716, + "loss": 1.5627, + "step": 3117 + }, + { + "epoch": 0.04051698792969302, + "grad_norm": 0.4111360013484955, + "learning_rate": 0.00019193646915088576, + "loss": 1.497, + "step": 3118 + }, + { + "epoch": 0.040529982473608894, + "grad_norm": 0.3873785138130188, + "learning_rate": 0.0001919338696889744, + "loss": 1.4727, + "step": 3119 + }, + { + "epoch": 0.04054297701752477, + "grad_norm": 0.4220013916492462, + "learning_rate": 0.000191931270227063, + "loss": 1.4145, + "step": 3120 + }, + { + "epoch": 0.04055597156144064, + "grad_norm": 0.4144245684146881, + "learning_rate": 0.00019192867076515163, + "loss": 1.4805, + "step": 3121 + }, + { + "epoch": 0.04056896610535651, + "grad_norm": 0.4178270399570465, + "learning_rate": 0.00019192607130324023, + "loss": 1.3928, + "step": 3122 + }, + { + "epoch": 0.040581960649272386, + "grad_norm": 0.4145921468734741, + "learning_rate": 0.00019192347184132885, + "loss": 1.4666, + "step": 3123 + }, + { + "epoch": 0.04059495519318826, + "grad_norm": 0.40947577357292175, + "learning_rate": 0.00019192087237941748, + "loss": 1.4129, + "step": 3124 + }, + { + "epoch": 0.04060794973710413, + "grad_norm": 0.4217536747455597, + "learning_rate": 0.00019191827291750607, + "loss": 1.4506, + "step": 3125 + }, + { + "epoch": 0.040620944281020005, + "grad_norm": 0.43890392780303955, + "learning_rate": 0.0001919156734555947, + "loss": 1.4512, + "step": 3126 + }, + { + "epoch": 0.04063393882493588, + "grad_norm": 0.3964190185070038, + "learning_rate": 0.00019191307399368332, + "loss": 1.3763, + "step": 3127 + }, + { + "epoch": 0.04064693336885175, + "grad_norm": 0.4450424015522003, + "learning_rate": 0.00019191047453177192, + "loss": 1.5343, + "step": 3128 + }, + { + "epoch": 0.040659927912767624, + "grad_norm": 0.3186704218387604, + "learning_rate": 0.00019190787506986054, + "loss": 1.3483, + "step": 3129 + }, + { + "epoch": 0.0406729224566835, + "grad_norm": 0.4094642996788025, + "learning_rate": 0.00019190527560794914, + "loss": 1.5211, + "step": 3130 + }, + { + "epoch": 0.04068591700059938, + "grad_norm": 0.44076094031333923, + "learning_rate": 0.0001919026761460378, + "loss": 1.4394, + "step": 3131 + }, + { + "epoch": 0.04069891154451525, + "grad_norm": 0.5064018368721008, + "learning_rate": 0.0001919000766841264, + "loss": 1.5669, + "step": 3132 + }, + { + "epoch": 0.04071190608843112, + "grad_norm": 0.39235207438468933, + "learning_rate": 0.00019189747722221502, + "loss": 1.4464, + "step": 3133 + }, + { + "epoch": 0.040724900632346996, + "grad_norm": 0.3050629794597626, + "learning_rate": 0.00019189487776030364, + "loss": 1.1383, + "step": 3134 + }, + { + "epoch": 0.04073789517626287, + "grad_norm": 0.4484102129936218, + "learning_rate": 0.00019189227829839224, + "loss": 1.3698, + "step": 3135 + }, + { + "epoch": 0.04075088972017874, + "grad_norm": 0.46817225217819214, + "learning_rate": 0.00019188967883648086, + "loss": 1.4105, + "step": 3136 + }, + { + "epoch": 0.040763884264094614, + "grad_norm": 0.3172460198402405, + "learning_rate": 0.00019188707937456946, + "loss": 1.4185, + "step": 3137 + }, + { + "epoch": 0.04077687880801049, + "grad_norm": 0.3920261263847351, + "learning_rate": 0.0001918844799126581, + "loss": 1.2577, + "step": 3138 + }, + { + "epoch": 0.04078987335192636, + "grad_norm": 0.37091803550720215, + "learning_rate": 0.0001918818804507467, + "loss": 1.533, + "step": 3139 + }, + { + "epoch": 0.04080286789584223, + "grad_norm": 0.38436436653137207, + "learning_rate": 0.00019187928098883533, + "loss": 1.3562, + "step": 3140 + }, + { + "epoch": 0.040815862439758106, + "grad_norm": 0.34745246171951294, + "learning_rate": 0.00019187668152692393, + "loss": 1.4727, + "step": 3141 + }, + { + "epoch": 0.04082885698367398, + "grad_norm": 0.30579110980033875, + "learning_rate": 0.00019187408206501255, + "loss": 1.185, + "step": 3142 + }, + { + "epoch": 0.04084185152758985, + "grad_norm": 0.3694552481174469, + "learning_rate": 0.00019187148260310118, + "loss": 1.5686, + "step": 3143 + }, + { + "epoch": 0.040854846071505725, + "grad_norm": 0.3438752591609955, + "learning_rate": 0.00019186888314118978, + "loss": 1.4681, + "step": 3144 + }, + { + "epoch": 0.0408678406154216, + "grad_norm": 0.4971218705177307, + "learning_rate": 0.0001918662836792784, + "loss": 1.4305, + "step": 3145 + }, + { + "epoch": 0.04088083515933747, + "grad_norm": 0.31639137864112854, + "learning_rate": 0.00019186368421736703, + "loss": 1.3869, + "step": 3146 + }, + { + "epoch": 0.040893829703253344, + "grad_norm": 0.42337992787361145, + "learning_rate": 0.00019186108475545562, + "loss": 1.4166, + "step": 3147 + }, + { + "epoch": 0.04090682424716922, + "grad_norm": 0.43055403232574463, + "learning_rate": 0.00019185848529354425, + "loss": 1.4959, + "step": 3148 + }, + { + "epoch": 0.04091981879108509, + "grad_norm": 0.43309667706489563, + "learning_rate": 0.00019185588583163284, + "loss": 1.7316, + "step": 3149 + }, + { + "epoch": 0.04093281333500097, + "grad_norm": 0.46645283699035645, + "learning_rate": 0.0001918532863697215, + "loss": 1.4909, + "step": 3150 + }, + { + "epoch": 0.04094580787891684, + "grad_norm": 0.4567408263683319, + "learning_rate": 0.0001918506869078101, + "loss": 1.5053, + "step": 3151 + }, + { + "epoch": 0.040958802422832716, + "grad_norm": 0.4238909184932709, + "learning_rate": 0.00019184808744589872, + "loss": 1.594, + "step": 3152 + }, + { + "epoch": 0.04097179696674859, + "grad_norm": 0.3949819505214691, + "learning_rate": 0.00019184548798398732, + "loss": 1.5029, + "step": 3153 + }, + { + "epoch": 0.04098479151066446, + "grad_norm": 0.4502496123313904, + "learning_rate": 0.00019184288852207594, + "loss": 1.5892, + "step": 3154 + }, + { + "epoch": 0.040997786054580335, + "grad_norm": 0.3974790573120117, + "learning_rate": 0.00019184028906016456, + "loss": 1.6014, + "step": 3155 + }, + { + "epoch": 0.04101078059849621, + "grad_norm": 0.4246217608451843, + "learning_rate": 0.00019183768959825316, + "loss": 1.3469, + "step": 3156 + }, + { + "epoch": 0.04102377514241208, + "grad_norm": 0.3463131785392761, + "learning_rate": 0.00019183509013634179, + "loss": 1.5015, + "step": 3157 + }, + { + "epoch": 0.041036769686327954, + "grad_norm": 0.5264678597450256, + "learning_rate": 0.0001918324906744304, + "loss": 1.6415, + "step": 3158 + }, + { + "epoch": 0.041049764230243826, + "grad_norm": 0.4146251380443573, + "learning_rate": 0.000191829891212519, + "loss": 1.4009, + "step": 3159 + }, + { + "epoch": 0.0410627587741597, + "grad_norm": 0.3029043972492218, + "learning_rate": 0.00019182729175060763, + "loss": 1.4445, + "step": 3160 + }, + { + "epoch": 0.04107575331807557, + "grad_norm": 0.4535422623157501, + "learning_rate": 0.00019182469228869623, + "loss": 1.5333, + "step": 3161 + }, + { + "epoch": 0.041088747861991445, + "grad_norm": 0.37516871094703674, + "learning_rate": 0.00019182209282678488, + "loss": 1.6223, + "step": 3162 + }, + { + "epoch": 0.04110174240590732, + "grad_norm": 0.5137490630149841, + "learning_rate": 0.00019181949336487348, + "loss": 1.5471, + "step": 3163 + }, + { + "epoch": 0.04111473694982319, + "grad_norm": 0.4472859799861908, + "learning_rate": 0.0001918168939029621, + "loss": 1.4141, + "step": 3164 + }, + { + "epoch": 0.041127731493739064, + "grad_norm": 0.40392884612083435, + "learning_rate": 0.0001918142944410507, + "loss": 1.4995, + "step": 3165 + }, + { + "epoch": 0.04114072603765494, + "grad_norm": 0.390100359916687, + "learning_rate": 0.00019181169497913932, + "loss": 1.4495, + "step": 3166 + }, + { + "epoch": 0.04115372058157081, + "grad_norm": 0.3772204518318176, + "learning_rate": 0.00019180909551722795, + "loss": 1.4012, + "step": 3167 + }, + { + "epoch": 0.04116671512548668, + "grad_norm": 0.3527357876300812, + "learning_rate": 0.00019180649605531655, + "loss": 1.5031, + "step": 3168 + }, + { + "epoch": 0.04117970966940256, + "grad_norm": 0.43985220789909363, + "learning_rate": 0.00019180389659340517, + "loss": 1.7382, + "step": 3169 + }, + { + "epoch": 0.041192704213318436, + "grad_norm": 0.30340802669525146, + "learning_rate": 0.0001918012971314938, + "loss": 1.4302, + "step": 3170 + }, + { + "epoch": 0.04120569875723431, + "grad_norm": 0.43774303793907166, + "learning_rate": 0.0001917986976695824, + "loss": 1.4333, + "step": 3171 + }, + { + "epoch": 0.04121869330115018, + "grad_norm": 0.3428357243537903, + "learning_rate": 0.00019179609820767102, + "loss": 1.4309, + "step": 3172 + }, + { + "epoch": 0.041231687845066055, + "grad_norm": 0.3634168207645416, + "learning_rate": 0.00019179349874575964, + "loss": 1.4616, + "step": 3173 + }, + { + "epoch": 0.04124468238898193, + "grad_norm": 0.3627302646636963, + "learning_rate": 0.00019179089928384827, + "loss": 1.6259, + "step": 3174 + }, + { + "epoch": 0.0412576769328978, + "grad_norm": 0.2947049140930176, + "learning_rate": 0.00019178829982193686, + "loss": 1.4478, + "step": 3175 + }, + { + "epoch": 0.041270671476813674, + "grad_norm": 0.22655540704727173, + "learning_rate": 0.0001917857003600255, + "loss": 1.5733, + "step": 3176 + }, + { + "epoch": 0.04128366602072955, + "grad_norm": 0.44975316524505615, + "learning_rate": 0.0001917831008981141, + "loss": 1.5573, + "step": 3177 + }, + { + "epoch": 0.04129666056464542, + "grad_norm": 0.37253931164741516, + "learning_rate": 0.0001917805014362027, + "loss": 1.3003, + "step": 3178 + }, + { + "epoch": 0.04130965510856129, + "grad_norm": 0.42107266187667847, + "learning_rate": 0.00019177790197429133, + "loss": 1.4648, + "step": 3179 + }, + { + "epoch": 0.041322649652477166, + "grad_norm": 0.395936518907547, + "learning_rate": 0.00019177530251237993, + "loss": 1.6666, + "step": 3180 + }, + { + "epoch": 0.04133564419639304, + "grad_norm": 0.3395724892616272, + "learning_rate": 0.00019177270305046858, + "loss": 1.4189, + "step": 3181 + }, + { + "epoch": 0.04134863874030891, + "grad_norm": 0.34199321269989014, + "learning_rate": 0.00019177010358855718, + "loss": 1.3027, + "step": 3182 + }, + { + "epoch": 0.041361633284224784, + "grad_norm": 0.40141457319259644, + "learning_rate": 0.00019176750412664578, + "loss": 1.4258, + "step": 3183 + }, + { + "epoch": 0.04137462782814066, + "grad_norm": 0.2920992076396942, + "learning_rate": 0.0001917649046647344, + "loss": 1.3822, + "step": 3184 + }, + { + "epoch": 0.04138762237205653, + "grad_norm": 0.41035526990890503, + "learning_rate": 0.00019176230520282303, + "loss": 1.3713, + "step": 3185 + }, + { + "epoch": 0.0414006169159724, + "grad_norm": 0.5116333365440369, + "learning_rate": 0.00019175970574091165, + "loss": 1.5178, + "step": 3186 + }, + { + "epoch": 0.041413611459888276, + "grad_norm": 0.34667888283729553, + "learning_rate": 0.00019175710627900025, + "loss": 1.2822, + "step": 3187 + }, + { + "epoch": 0.041426606003804156, + "grad_norm": 0.35072648525238037, + "learning_rate": 0.00019175450681708887, + "loss": 1.2398, + "step": 3188 + }, + { + "epoch": 0.04143960054772003, + "grad_norm": 0.37003394961357117, + "learning_rate": 0.0001917519073551775, + "loss": 1.3389, + "step": 3189 + }, + { + "epoch": 0.0414525950916359, + "grad_norm": 0.37191593647003174, + "learning_rate": 0.0001917493078932661, + "loss": 1.4589, + "step": 3190 + }, + { + "epoch": 0.041465589635551775, + "grad_norm": 0.4373728930950165, + "learning_rate": 0.00019174670843135472, + "loss": 1.5607, + "step": 3191 + }, + { + "epoch": 0.04147858417946765, + "grad_norm": 0.37511971592903137, + "learning_rate": 0.00019174410896944332, + "loss": 1.4171, + "step": 3192 + }, + { + "epoch": 0.04149157872338352, + "grad_norm": 0.2950449287891388, + "learning_rate": 0.00019174150950753197, + "loss": 1.4469, + "step": 3193 + }, + { + "epoch": 0.041504573267299394, + "grad_norm": 0.3146478831768036, + "learning_rate": 0.00019173891004562057, + "loss": 1.4147, + "step": 3194 + }, + { + "epoch": 0.04151756781121527, + "grad_norm": 0.32311949133872986, + "learning_rate": 0.0001917363105837092, + "loss": 1.4398, + "step": 3195 + }, + { + "epoch": 0.04153056235513114, + "grad_norm": 0.41635018587112427, + "learning_rate": 0.0001917337111217978, + "loss": 1.4912, + "step": 3196 + }, + { + "epoch": 0.04154355689904701, + "grad_norm": 0.25399893522262573, + "learning_rate": 0.0001917311116598864, + "loss": 1.1599, + "step": 3197 + }, + { + "epoch": 0.041556551442962886, + "grad_norm": 0.4764833152294159, + "learning_rate": 0.00019172851219797504, + "loss": 1.5619, + "step": 3198 + }, + { + "epoch": 0.04156954598687876, + "grad_norm": 0.37993738055229187, + "learning_rate": 0.00019172591273606363, + "loss": 1.5158, + "step": 3199 + }, + { + "epoch": 0.04158254053079463, + "grad_norm": 0.37712740898132324, + "learning_rate": 0.00019172331327415226, + "loss": 1.2766, + "step": 3200 + }, + { + "epoch": 0.041595535074710505, + "grad_norm": 0.2939991056919098, + "learning_rate": 0.00019172071381224088, + "loss": 1.3083, + "step": 3201 + }, + { + "epoch": 0.04160852961862638, + "grad_norm": 0.41009974479675293, + "learning_rate": 0.00019171811435032948, + "loss": 1.4472, + "step": 3202 + }, + { + "epoch": 0.04162152416254225, + "grad_norm": 0.30324918031692505, + "learning_rate": 0.0001917155148884181, + "loss": 1.2577, + "step": 3203 + }, + { + "epoch": 0.041634518706458123, + "grad_norm": 0.346285879611969, + "learning_rate": 0.0001917129154265067, + "loss": 1.5461, + "step": 3204 + }, + { + "epoch": 0.041647513250373996, + "grad_norm": 0.3356819748878479, + "learning_rate": 0.00019171031596459535, + "loss": 1.5499, + "step": 3205 + }, + { + "epoch": 0.04166050779428987, + "grad_norm": 0.39320090413093567, + "learning_rate": 0.00019170771650268395, + "loss": 1.5392, + "step": 3206 + }, + { + "epoch": 0.04167350233820575, + "grad_norm": 0.42837026715278625, + "learning_rate": 0.00019170511704077258, + "loss": 1.635, + "step": 3207 + }, + { + "epoch": 0.04168649688212162, + "grad_norm": 0.4710414409637451, + "learning_rate": 0.0001917025175788612, + "loss": 1.3536, + "step": 3208 + }, + { + "epoch": 0.041699491426037495, + "grad_norm": 0.37710267305374146, + "learning_rate": 0.0001916999181169498, + "loss": 1.5275, + "step": 3209 + }, + { + "epoch": 0.04171248596995337, + "grad_norm": 0.5383466482162476, + "learning_rate": 0.00019169731865503842, + "loss": 1.3972, + "step": 3210 + }, + { + "epoch": 0.04172548051386924, + "grad_norm": 0.4248463213443756, + "learning_rate": 0.00019169471919312702, + "loss": 1.5377, + "step": 3211 + }, + { + "epoch": 0.041738475057785114, + "grad_norm": 0.3452415466308594, + "learning_rate": 0.00019169211973121564, + "loss": 1.4607, + "step": 3212 + }, + { + "epoch": 0.04175146960170099, + "grad_norm": 0.3828495442867279, + "learning_rate": 0.00019168952026930427, + "loss": 1.4264, + "step": 3213 + }, + { + "epoch": 0.04176446414561686, + "grad_norm": 0.40293848514556885, + "learning_rate": 0.00019168692080739287, + "loss": 1.5344, + "step": 3214 + }, + { + "epoch": 0.04177745868953273, + "grad_norm": 0.3619150221347809, + "learning_rate": 0.0001916843213454815, + "loss": 1.4832, + "step": 3215 + }, + { + "epoch": 0.041790453233448606, + "grad_norm": 0.40804359316825867, + "learning_rate": 0.00019168172188357012, + "loss": 1.5644, + "step": 3216 + }, + { + "epoch": 0.04180344777736448, + "grad_norm": 0.4149736166000366, + "learning_rate": 0.00019167912242165874, + "loss": 1.437, + "step": 3217 + }, + { + "epoch": 0.04181644232128035, + "grad_norm": 0.31111371517181396, + "learning_rate": 0.00019167652295974734, + "loss": 1.4111, + "step": 3218 + }, + { + "epoch": 0.041829436865196225, + "grad_norm": 0.3087538182735443, + "learning_rate": 0.00019167392349783596, + "loss": 1.3734, + "step": 3219 + }, + { + "epoch": 0.0418424314091121, + "grad_norm": 0.4850727617740631, + "learning_rate": 0.00019167132403592459, + "loss": 1.4706, + "step": 3220 + }, + { + "epoch": 0.04185542595302797, + "grad_norm": 0.38028115034103394, + "learning_rate": 0.00019166872457401318, + "loss": 1.3597, + "step": 3221 + }, + { + "epoch": 0.041868420496943844, + "grad_norm": 0.3625474274158478, + "learning_rate": 0.0001916661251121018, + "loss": 1.4015, + "step": 3222 + }, + { + "epoch": 0.04188141504085972, + "grad_norm": 0.4645829498767853, + "learning_rate": 0.0001916635256501904, + "loss": 1.5933, + "step": 3223 + }, + { + "epoch": 0.04189440958477559, + "grad_norm": 0.42849549651145935, + "learning_rate": 0.00019166092618827906, + "loss": 1.5038, + "step": 3224 + }, + { + "epoch": 0.04190740412869146, + "grad_norm": 0.38927534222602844, + "learning_rate": 0.00019165832672636765, + "loss": 1.4694, + "step": 3225 + }, + { + "epoch": 0.04192039867260734, + "grad_norm": 0.3418194353580475, + "learning_rate": 0.00019165572726445625, + "loss": 1.3263, + "step": 3226 + }, + { + "epoch": 0.041933393216523215, + "grad_norm": 0.28867998719215393, + "learning_rate": 0.00019165312780254488, + "loss": 1.2728, + "step": 3227 + }, + { + "epoch": 0.04194638776043909, + "grad_norm": 0.7567052841186523, + "learning_rate": 0.0001916505283406335, + "loss": 1.535, + "step": 3228 + }, + { + "epoch": 0.04195938230435496, + "grad_norm": 0.3170691728591919, + "learning_rate": 0.00019164792887872213, + "loss": 1.4031, + "step": 3229 + }, + { + "epoch": 0.041972376848270834, + "grad_norm": 0.4613756835460663, + "learning_rate": 0.00019164532941681072, + "loss": 1.4951, + "step": 3230 + }, + { + "epoch": 0.04198537139218671, + "grad_norm": 0.42037251591682434, + "learning_rate": 0.00019164272995489935, + "loss": 1.4238, + "step": 3231 + }, + { + "epoch": 0.04199836593610258, + "grad_norm": 0.39461565017700195, + "learning_rate": 0.00019164013049298797, + "loss": 1.5215, + "step": 3232 + }, + { + "epoch": 0.04201136048001845, + "grad_norm": 0.3436048924922943, + "learning_rate": 0.00019163753103107657, + "loss": 1.555, + "step": 3233 + }, + { + "epoch": 0.042024355023934326, + "grad_norm": 0.42129844427108765, + "learning_rate": 0.0001916349315691652, + "loss": 1.5112, + "step": 3234 + }, + { + "epoch": 0.0420373495678502, + "grad_norm": 0.39394626021385193, + "learning_rate": 0.0001916323321072538, + "loss": 1.254, + "step": 3235 + }, + { + "epoch": 0.04205034411176607, + "grad_norm": 0.4991368055343628, + "learning_rate": 0.00019162973264534244, + "loss": 1.509, + "step": 3236 + }, + { + "epoch": 0.042063338655681945, + "grad_norm": 0.4191257357597351, + "learning_rate": 0.00019162713318343104, + "loss": 1.4626, + "step": 3237 + }, + { + "epoch": 0.04207633319959782, + "grad_norm": 0.3565533459186554, + "learning_rate": 0.00019162453372151964, + "loss": 1.3298, + "step": 3238 + }, + { + "epoch": 0.04208932774351369, + "grad_norm": 0.3717363178730011, + "learning_rate": 0.00019162193425960826, + "loss": 1.4278, + "step": 3239 + }, + { + "epoch": 0.042102322287429564, + "grad_norm": 0.5267666578292847, + "learning_rate": 0.00019161933479769689, + "loss": 1.4161, + "step": 3240 + }, + { + "epoch": 0.04211531683134544, + "grad_norm": 0.3773806691169739, + "learning_rate": 0.0001916167353357855, + "loss": 1.388, + "step": 3241 + }, + { + "epoch": 0.04212831137526131, + "grad_norm": 0.4267769455909729, + "learning_rate": 0.0001916141358738741, + "loss": 1.3399, + "step": 3242 + }, + { + "epoch": 0.04214130591917718, + "grad_norm": 0.3078964948654175, + "learning_rate": 0.00019161153641196273, + "loss": 1.3957, + "step": 3243 + }, + { + "epoch": 0.042154300463093056, + "grad_norm": 0.44167131185531616, + "learning_rate": 0.00019160893695005136, + "loss": 1.6181, + "step": 3244 + }, + { + "epoch": 0.042167295007008936, + "grad_norm": 0.41440489888191223, + "learning_rate": 0.00019160633748813995, + "loss": 1.3941, + "step": 3245 + }, + { + "epoch": 0.04218028955092481, + "grad_norm": 0.3722870349884033, + "learning_rate": 0.00019160373802622858, + "loss": 1.3619, + "step": 3246 + }, + { + "epoch": 0.04219328409484068, + "grad_norm": 0.4042980372905731, + "learning_rate": 0.0001916011385643172, + "loss": 1.4128, + "step": 3247 + }, + { + "epoch": 0.042206278638756554, + "grad_norm": 0.415513813495636, + "learning_rate": 0.00019159853910240583, + "loss": 1.6317, + "step": 3248 + }, + { + "epoch": 0.04221927318267243, + "grad_norm": 0.4566916823387146, + "learning_rate": 0.00019159593964049443, + "loss": 1.4313, + "step": 3249 + }, + { + "epoch": 0.0422322677265883, + "grad_norm": 0.4161822497844696, + "learning_rate": 0.00019159334017858302, + "loss": 1.3289, + "step": 3250 + }, + { + "epoch": 0.04224526227050417, + "grad_norm": 0.319856196641922, + "learning_rate": 0.00019159074071667167, + "loss": 1.4485, + "step": 3251 + }, + { + "epoch": 0.042258256814420046, + "grad_norm": 0.37036123871803284, + "learning_rate": 0.00019158814125476027, + "loss": 1.4701, + "step": 3252 + }, + { + "epoch": 0.04227125135833592, + "grad_norm": 0.31518563628196716, + "learning_rate": 0.0001915855417928489, + "loss": 1.4222, + "step": 3253 + }, + { + "epoch": 0.04228424590225179, + "grad_norm": 0.38721492886543274, + "learning_rate": 0.0001915829423309375, + "loss": 1.5214, + "step": 3254 + }, + { + "epoch": 0.042297240446167665, + "grad_norm": 0.3673882782459259, + "learning_rate": 0.00019158034286902612, + "loss": 1.5217, + "step": 3255 + }, + { + "epoch": 0.04231023499008354, + "grad_norm": 0.3944181203842163, + "learning_rate": 0.00019157774340711474, + "loss": 1.3793, + "step": 3256 + }, + { + "epoch": 0.04232322953399941, + "grad_norm": 0.38716012239456177, + "learning_rate": 0.00019157514394520334, + "loss": 1.6818, + "step": 3257 + }, + { + "epoch": 0.042336224077915284, + "grad_norm": 0.4616550803184509, + "learning_rate": 0.00019157254448329196, + "loss": 1.2827, + "step": 3258 + }, + { + "epoch": 0.04234921862183116, + "grad_norm": 0.3793615698814392, + "learning_rate": 0.0001915699450213806, + "loss": 1.4161, + "step": 3259 + }, + { + "epoch": 0.04236221316574703, + "grad_norm": 0.24249446392059326, + "learning_rate": 0.0001915673455594692, + "loss": 1.1609, + "step": 3260 + }, + { + "epoch": 0.0423752077096629, + "grad_norm": 0.3935641050338745, + "learning_rate": 0.0001915647460975578, + "loss": 1.4688, + "step": 3261 + }, + { + "epoch": 0.042388202253578776, + "grad_norm": 0.4153493046760559, + "learning_rate": 0.00019156214663564644, + "loss": 1.4151, + "step": 3262 + }, + { + "epoch": 0.04240119679749465, + "grad_norm": 0.4026470482349396, + "learning_rate": 0.00019155954717373506, + "loss": 1.5416, + "step": 3263 + }, + { + "epoch": 0.04241419134141053, + "grad_norm": 0.3599541485309601, + "learning_rate": 0.00019155694771182366, + "loss": 1.4973, + "step": 3264 + }, + { + "epoch": 0.0424271858853264, + "grad_norm": 0.3831346929073334, + "learning_rate": 0.00019155434824991228, + "loss": 1.284, + "step": 3265 + }, + { + "epoch": 0.042440180429242275, + "grad_norm": 0.3427187204360962, + "learning_rate": 0.00019155174878800088, + "loss": 1.5802, + "step": 3266 + }, + { + "epoch": 0.04245317497315815, + "grad_norm": 0.4238213002681732, + "learning_rate": 0.0001915491493260895, + "loss": 1.4432, + "step": 3267 + }, + { + "epoch": 0.04246616951707402, + "grad_norm": 0.3959029018878937, + "learning_rate": 0.00019154654986417813, + "loss": 1.492, + "step": 3268 + }, + { + "epoch": 0.042479164060989894, + "grad_norm": 0.3928011655807495, + "learning_rate": 0.00019154395040226673, + "loss": 1.6269, + "step": 3269 + }, + { + "epoch": 0.042492158604905766, + "grad_norm": 0.4751276969909668, + "learning_rate": 0.00019154135094035535, + "loss": 1.451, + "step": 3270 + }, + { + "epoch": 0.04250515314882164, + "grad_norm": 0.3577234148979187, + "learning_rate": 0.00019153875147844397, + "loss": 1.383, + "step": 3271 + }, + { + "epoch": 0.04251814769273751, + "grad_norm": 0.3346809446811676, + "learning_rate": 0.0001915361520165326, + "loss": 1.442, + "step": 3272 + }, + { + "epoch": 0.042531142236653385, + "grad_norm": 0.39318138360977173, + "learning_rate": 0.0001915335525546212, + "loss": 1.2091, + "step": 3273 + }, + { + "epoch": 0.04254413678056926, + "grad_norm": 0.40168461203575134, + "learning_rate": 0.00019153095309270982, + "loss": 1.2523, + "step": 3274 + }, + { + "epoch": 0.04255713132448513, + "grad_norm": 0.3460966944694519, + "learning_rate": 0.00019152835363079845, + "loss": 1.2636, + "step": 3275 + }, + { + "epoch": 0.042570125868401004, + "grad_norm": 0.48361414670944214, + "learning_rate": 0.00019152575416888704, + "loss": 1.4194, + "step": 3276 + }, + { + "epoch": 0.04258312041231688, + "grad_norm": 0.31816691160202026, + "learning_rate": 0.00019152315470697567, + "loss": 1.1946, + "step": 3277 + }, + { + "epoch": 0.04259611495623275, + "grad_norm": 0.42274630069732666, + "learning_rate": 0.00019152055524506426, + "loss": 1.5731, + "step": 3278 + }, + { + "epoch": 0.04260910950014862, + "grad_norm": 0.35717618465423584, + "learning_rate": 0.00019151795578315292, + "loss": 1.313, + "step": 3279 + }, + { + "epoch": 0.042622104044064496, + "grad_norm": 0.3694474399089813, + "learning_rate": 0.0001915153563212415, + "loss": 1.2986, + "step": 3280 + }, + { + "epoch": 0.04263509858798037, + "grad_norm": 0.396898090839386, + "learning_rate": 0.0001915127568593301, + "loss": 1.4396, + "step": 3281 + }, + { + "epoch": 0.04264809313189624, + "grad_norm": 0.4371946454048157, + "learning_rate": 0.00019151015739741874, + "loss": 1.5678, + "step": 3282 + }, + { + "epoch": 0.04266108767581212, + "grad_norm": 0.4931833744049072, + "learning_rate": 0.00019150755793550736, + "loss": 1.4774, + "step": 3283 + }, + { + "epoch": 0.042674082219727995, + "grad_norm": 0.37313833832740784, + "learning_rate": 0.00019150495847359598, + "loss": 1.3354, + "step": 3284 + }, + { + "epoch": 0.04268707676364387, + "grad_norm": 0.3993110656738281, + "learning_rate": 0.00019150235901168458, + "loss": 1.4563, + "step": 3285 + }, + { + "epoch": 0.04270007130755974, + "grad_norm": 0.47115734219551086, + "learning_rate": 0.0001914997595497732, + "loss": 1.5814, + "step": 3286 + }, + { + "epoch": 0.042713065851475614, + "grad_norm": 0.39446935057640076, + "learning_rate": 0.00019149716008786183, + "loss": 1.48, + "step": 3287 + }, + { + "epoch": 0.04272606039539149, + "grad_norm": 0.38951924443244934, + "learning_rate": 0.00019149456062595043, + "loss": 1.519, + "step": 3288 + }, + { + "epoch": 0.04273905493930736, + "grad_norm": 0.4462415874004364, + "learning_rate": 0.00019149196116403905, + "loss": 1.4445, + "step": 3289 + }, + { + "epoch": 0.04275204948322323, + "grad_norm": 0.3566194176673889, + "learning_rate": 0.00019148936170212768, + "loss": 1.4556, + "step": 3290 + }, + { + "epoch": 0.042765044027139106, + "grad_norm": 0.4223826229572296, + "learning_rate": 0.0001914867622402163, + "loss": 1.6536, + "step": 3291 + }, + { + "epoch": 0.04277803857105498, + "grad_norm": 0.36028677225112915, + "learning_rate": 0.0001914841627783049, + "loss": 1.3713, + "step": 3292 + }, + { + "epoch": 0.04279103311497085, + "grad_norm": 0.3774397373199463, + "learning_rate": 0.0001914815633163935, + "loss": 1.1729, + "step": 3293 + }, + { + "epoch": 0.042804027658886724, + "grad_norm": 0.42678093910217285, + "learning_rate": 0.00019147896385448215, + "loss": 1.5057, + "step": 3294 + }, + { + "epoch": 0.0428170222028026, + "grad_norm": 0.4033164978027344, + "learning_rate": 0.00019147636439257075, + "loss": 1.3776, + "step": 3295 + }, + { + "epoch": 0.04283001674671847, + "grad_norm": 0.37973758578300476, + "learning_rate": 0.00019147376493065937, + "loss": 1.2883, + "step": 3296 + }, + { + "epoch": 0.04284301129063434, + "grad_norm": 0.4481126666069031, + "learning_rate": 0.00019147116546874797, + "loss": 1.5364, + "step": 3297 + }, + { + "epoch": 0.042856005834550216, + "grad_norm": 0.40619781613349915, + "learning_rate": 0.0001914685660068366, + "loss": 1.49, + "step": 3298 + }, + { + "epoch": 0.04286900037846609, + "grad_norm": 0.3777209222316742, + "learning_rate": 0.00019146596654492522, + "loss": 1.2836, + "step": 3299 + }, + { + "epoch": 0.04288199492238196, + "grad_norm": 0.39847204089164734, + "learning_rate": 0.0001914633670830138, + "loss": 1.3011, + "step": 3300 + }, + { + "epoch": 0.042894989466297835, + "grad_norm": 0.43236109614372253, + "learning_rate": 0.00019146076762110244, + "loss": 1.6102, + "step": 3301 + }, + { + "epoch": 0.04290798401021371, + "grad_norm": 0.4709397852420807, + "learning_rate": 0.00019145816815919106, + "loss": 1.6345, + "step": 3302 + }, + { + "epoch": 0.04292097855412959, + "grad_norm": 0.36808282136917114, + "learning_rate": 0.0001914555686972797, + "loss": 1.4226, + "step": 3303 + }, + { + "epoch": 0.04293397309804546, + "grad_norm": 0.39973029494285583, + "learning_rate": 0.00019145296923536828, + "loss": 1.4584, + "step": 3304 + }, + { + "epoch": 0.042946967641961334, + "grad_norm": 0.3749482333660126, + "learning_rate": 0.00019145036977345688, + "loss": 1.2796, + "step": 3305 + }, + { + "epoch": 0.04295996218587721, + "grad_norm": 0.32088154554367065, + "learning_rate": 0.00019144777031154553, + "loss": 1.4063, + "step": 3306 + }, + { + "epoch": 0.04297295672979308, + "grad_norm": 0.40059664845466614, + "learning_rate": 0.00019144517084963413, + "loss": 1.5157, + "step": 3307 + }, + { + "epoch": 0.04298595127370895, + "grad_norm": 0.40178564190864563, + "learning_rate": 0.00019144257138772275, + "loss": 1.5518, + "step": 3308 + }, + { + "epoch": 0.042998945817624826, + "grad_norm": 0.42714112997055054, + "learning_rate": 0.00019143997192581135, + "loss": 1.5906, + "step": 3309 + }, + { + "epoch": 0.0430119403615407, + "grad_norm": 0.4414735734462738, + "learning_rate": 0.00019143737246389998, + "loss": 1.556, + "step": 3310 + }, + { + "epoch": 0.04302493490545657, + "grad_norm": 0.34161970019340515, + "learning_rate": 0.0001914347730019886, + "loss": 1.3804, + "step": 3311 + }, + { + "epoch": 0.043037929449372445, + "grad_norm": 0.389321506023407, + "learning_rate": 0.0001914321735400772, + "loss": 1.3994, + "step": 3312 + }, + { + "epoch": 0.04305092399328832, + "grad_norm": 0.40419989824295044, + "learning_rate": 0.00019142957407816582, + "loss": 1.4814, + "step": 3313 + }, + { + "epoch": 0.04306391853720419, + "grad_norm": 0.3991282880306244, + "learning_rate": 0.00019142697461625445, + "loss": 1.5364, + "step": 3314 + }, + { + "epoch": 0.04307691308112006, + "grad_norm": 0.24831891059875488, + "learning_rate": 0.00019142437515434307, + "loss": 1.3061, + "step": 3315 + }, + { + "epoch": 0.043089907625035936, + "grad_norm": 0.32519426941871643, + "learning_rate": 0.00019142177569243167, + "loss": 1.428, + "step": 3316 + }, + { + "epoch": 0.04310290216895181, + "grad_norm": 0.3940295875072479, + "learning_rate": 0.0001914191762305203, + "loss": 1.2883, + "step": 3317 + }, + { + "epoch": 0.04311589671286768, + "grad_norm": 0.4799407720565796, + "learning_rate": 0.00019141657676860892, + "loss": 1.4101, + "step": 3318 + }, + { + "epoch": 0.043128891256783555, + "grad_norm": 0.4417022168636322, + "learning_rate": 0.00019141397730669752, + "loss": 1.4683, + "step": 3319 + }, + { + "epoch": 0.04314188580069943, + "grad_norm": 0.37678059935569763, + "learning_rate": 0.00019141137784478614, + "loss": 1.4166, + "step": 3320 + }, + { + "epoch": 0.0431548803446153, + "grad_norm": 0.3916042149066925, + "learning_rate": 0.00019140877838287476, + "loss": 1.5433, + "step": 3321 + }, + { + "epoch": 0.04316787488853118, + "grad_norm": 0.2916439473628998, + "learning_rate": 0.00019140617892096336, + "loss": 1.3216, + "step": 3322 + }, + { + "epoch": 0.043180869432447054, + "grad_norm": 0.43066203594207764, + "learning_rate": 0.000191403579459052, + "loss": 1.3716, + "step": 3323 + }, + { + "epoch": 0.04319386397636293, + "grad_norm": 0.3231244683265686, + "learning_rate": 0.00019140097999714058, + "loss": 1.4794, + "step": 3324 + }, + { + "epoch": 0.0432068585202788, + "grad_norm": 0.39126765727996826, + "learning_rate": 0.00019139838053522924, + "loss": 1.236, + "step": 3325 + }, + { + "epoch": 0.04321985306419467, + "grad_norm": 0.36046120524406433, + "learning_rate": 0.00019139578107331783, + "loss": 1.4241, + "step": 3326 + }, + { + "epoch": 0.043232847608110546, + "grad_norm": 0.3886171877384186, + "learning_rate": 0.00019139318161140646, + "loss": 1.3316, + "step": 3327 + }, + { + "epoch": 0.04324584215202642, + "grad_norm": 0.35811758041381836, + "learning_rate": 0.00019139058214949505, + "loss": 1.3552, + "step": 3328 + }, + { + "epoch": 0.04325883669594229, + "grad_norm": 0.480874627828598, + "learning_rate": 0.00019138798268758368, + "loss": 1.6968, + "step": 3329 + }, + { + "epoch": 0.043271831239858165, + "grad_norm": 0.3552871644496918, + "learning_rate": 0.0001913853832256723, + "loss": 1.4936, + "step": 3330 + }, + { + "epoch": 0.04328482578377404, + "grad_norm": 0.3589709401130676, + "learning_rate": 0.0001913827837637609, + "loss": 1.449, + "step": 3331 + }, + { + "epoch": 0.04329782032768991, + "grad_norm": 0.43328332901000977, + "learning_rate": 0.00019138018430184953, + "loss": 1.4585, + "step": 3332 + }, + { + "epoch": 0.043310814871605784, + "grad_norm": 0.3951812982559204, + "learning_rate": 0.00019137758483993815, + "loss": 1.5951, + "step": 3333 + }, + { + "epoch": 0.04332380941552166, + "grad_norm": 0.3889358639717102, + "learning_rate": 0.00019137498537802675, + "loss": 1.3383, + "step": 3334 + }, + { + "epoch": 0.04333680395943753, + "grad_norm": 0.4324074983596802, + "learning_rate": 0.00019137238591611537, + "loss": 1.3453, + "step": 3335 + }, + { + "epoch": 0.0433497985033534, + "grad_norm": 0.28207793831825256, + "learning_rate": 0.00019136978645420397, + "loss": 1.5449, + "step": 3336 + }, + { + "epoch": 0.043362793047269275, + "grad_norm": 0.3594921827316284, + "learning_rate": 0.00019136718699229262, + "loss": 1.442, + "step": 3337 + }, + { + "epoch": 0.04337578759118515, + "grad_norm": 0.40041711926460266, + "learning_rate": 0.00019136458753038122, + "loss": 1.4303, + "step": 3338 + }, + { + "epoch": 0.04338878213510102, + "grad_norm": 0.387516587972641, + "learning_rate": 0.00019136198806846984, + "loss": 1.2913, + "step": 3339 + }, + { + "epoch": 0.043401776679016894, + "grad_norm": 0.34604567289352417, + "learning_rate": 0.00019135938860655844, + "loss": 1.4383, + "step": 3340 + }, + { + "epoch": 0.043414771222932774, + "grad_norm": 0.45010215044021606, + "learning_rate": 0.00019135678914464706, + "loss": 1.424, + "step": 3341 + }, + { + "epoch": 0.04342776576684865, + "grad_norm": 0.37968873977661133, + "learning_rate": 0.0001913541896827357, + "loss": 1.5163, + "step": 3342 + }, + { + "epoch": 0.04344076031076452, + "grad_norm": 0.2532168924808502, + "learning_rate": 0.0001913515902208243, + "loss": 1.2122, + "step": 3343 + }, + { + "epoch": 0.04345375485468039, + "grad_norm": 0.40517109632492065, + "learning_rate": 0.0001913489907589129, + "loss": 1.475, + "step": 3344 + }, + { + "epoch": 0.043466749398596266, + "grad_norm": 0.3487946391105652, + "learning_rate": 0.00019134639129700154, + "loss": 1.3463, + "step": 3345 + }, + { + "epoch": 0.04347974394251214, + "grad_norm": 0.4634891748428345, + "learning_rate": 0.00019134379183509016, + "loss": 1.4547, + "step": 3346 + }, + { + "epoch": 0.04349273848642801, + "grad_norm": 0.6074512004852295, + "learning_rate": 0.00019134119237317876, + "loss": 1.5215, + "step": 3347 + }, + { + "epoch": 0.043505733030343885, + "grad_norm": 0.3533388376235962, + "learning_rate": 0.00019133859291126735, + "loss": 1.1557, + "step": 3348 + }, + { + "epoch": 0.04351872757425976, + "grad_norm": 0.3960515558719635, + "learning_rate": 0.000191335993449356, + "loss": 1.3195, + "step": 3349 + }, + { + "epoch": 0.04353172211817563, + "grad_norm": 0.436777800321579, + "learning_rate": 0.0001913333939874446, + "loss": 1.3511, + "step": 3350 + }, + { + "epoch": 0.043544716662091504, + "grad_norm": 0.36303094029426575, + "learning_rate": 0.00019133079452553323, + "loss": 1.6913, + "step": 3351 + }, + { + "epoch": 0.04355771120600738, + "grad_norm": 0.339926153421402, + "learning_rate": 0.00019132819506362183, + "loss": 1.4129, + "step": 3352 + }, + { + "epoch": 0.04357070574992325, + "grad_norm": 0.37749627232551575, + "learning_rate": 0.00019132559560171045, + "loss": 1.4347, + "step": 3353 + }, + { + "epoch": 0.04358370029383912, + "grad_norm": 0.4598933756351471, + "learning_rate": 0.00019132299613979907, + "loss": 1.507, + "step": 3354 + }, + { + "epoch": 0.043596694837754996, + "grad_norm": 0.34212225675582886, + "learning_rate": 0.00019132039667788767, + "loss": 1.3819, + "step": 3355 + }, + { + "epoch": 0.04360968938167087, + "grad_norm": 0.4093504548072815, + "learning_rate": 0.0001913177972159763, + "loss": 1.5085, + "step": 3356 + }, + { + "epoch": 0.04362268392558674, + "grad_norm": 0.3737906515598297, + "learning_rate": 0.00019131519775406492, + "loss": 1.462, + "step": 3357 + }, + { + "epoch": 0.043635678469502615, + "grad_norm": 0.35606417059898376, + "learning_rate": 0.00019131259829215355, + "loss": 1.3589, + "step": 3358 + }, + { + "epoch": 0.04364867301341849, + "grad_norm": 0.39090147614479065, + "learning_rate": 0.00019130999883024214, + "loss": 1.2332, + "step": 3359 + }, + { + "epoch": 0.04366166755733437, + "grad_norm": 0.3723491430282593, + "learning_rate": 0.00019130739936833077, + "loss": 1.3077, + "step": 3360 + }, + { + "epoch": 0.04367466210125024, + "grad_norm": 0.30465856194496155, + "learning_rate": 0.0001913047999064194, + "loss": 1.4543, + "step": 3361 + }, + { + "epoch": 0.04368765664516611, + "grad_norm": 0.4320400655269623, + "learning_rate": 0.000191302200444508, + "loss": 1.518, + "step": 3362 + }, + { + "epoch": 0.043700651189081986, + "grad_norm": 0.33320820331573486, + "learning_rate": 0.00019129960098259661, + "loss": 1.5668, + "step": 3363 + }, + { + "epoch": 0.04371364573299786, + "grad_norm": 0.6051658391952515, + "learning_rate": 0.00019129700152068524, + "loss": 1.5598, + "step": 3364 + }, + { + "epoch": 0.04372664027691373, + "grad_norm": 0.43451371788978577, + "learning_rate": 0.00019129440205877384, + "loss": 1.5338, + "step": 3365 + }, + { + "epoch": 0.043739634820829605, + "grad_norm": 0.38585007190704346, + "learning_rate": 0.00019129180259686246, + "loss": 1.4364, + "step": 3366 + }, + { + "epoch": 0.04375262936474548, + "grad_norm": 0.345638245344162, + "learning_rate": 0.00019128920313495106, + "loss": 1.3738, + "step": 3367 + }, + { + "epoch": 0.04376562390866135, + "grad_norm": 0.4160021245479584, + "learning_rate": 0.0001912866036730397, + "loss": 1.4794, + "step": 3368 + }, + { + "epoch": 0.043778618452577224, + "grad_norm": 0.4704032838344574, + "learning_rate": 0.0001912840042111283, + "loss": 1.6299, + "step": 3369 + }, + { + "epoch": 0.0437916129964931, + "grad_norm": 0.34709498286247253, + "learning_rate": 0.00019128140474921693, + "loss": 1.3436, + "step": 3370 + }, + { + "epoch": 0.04380460754040897, + "grad_norm": 0.2992079257965088, + "learning_rate": 0.00019127880528730553, + "loss": 1.2318, + "step": 3371 + }, + { + "epoch": 0.04381760208432484, + "grad_norm": 0.37219589948654175, + "learning_rate": 0.00019127620582539415, + "loss": 1.6489, + "step": 3372 + }, + { + "epoch": 0.043830596628240716, + "grad_norm": 0.43788623809814453, + "learning_rate": 0.00019127360636348278, + "loss": 1.4366, + "step": 3373 + }, + { + "epoch": 0.04384359117215659, + "grad_norm": 0.3767496347427368, + "learning_rate": 0.00019127100690157137, + "loss": 1.4562, + "step": 3374 + }, + { + "epoch": 0.04385658571607246, + "grad_norm": 0.30844634771347046, + "learning_rate": 0.00019126840743966, + "loss": 1.2874, + "step": 3375 + }, + { + "epoch": 0.043869580259988335, + "grad_norm": 0.3502744138240814, + "learning_rate": 0.00019126580797774862, + "loss": 1.3586, + "step": 3376 + }, + { + "epoch": 0.04388257480390421, + "grad_norm": 0.36088719964027405, + "learning_rate": 0.00019126320851583722, + "loss": 1.614, + "step": 3377 + }, + { + "epoch": 0.04389556934782008, + "grad_norm": 0.3557823896408081, + "learning_rate": 0.00019126060905392585, + "loss": 1.5289, + "step": 3378 + }, + { + "epoch": 0.04390856389173596, + "grad_norm": 0.3625345826148987, + "learning_rate": 0.00019125800959201444, + "loss": 1.5533, + "step": 3379 + }, + { + "epoch": 0.043921558435651833, + "grad_norm": 0.29409506916999817, + "learning_rate": 0.0001912554101301031, + "loss": 1.4743, + "step": 3380 + }, + { + "epoch": 0.043934552979567706, + "grad_norm": 0.33340227603912354, + "learning_rate": 0.0001912528106681917, + "loss": 1.4126, + "step": 3381 + }, + { + "epoch": 0.04394754752348358, + "grad_norm": 0.43819737434387207, + "learning_rate": 0.00019125021120628032, + "loss": 1.5138, + "step": 3382 + }, + { + "epoch": 0.04396054206739945, + "grad_norm": 0.3450305163860321, + "learning_rate": 0.00019124761174436891, + "loss": 1.3265, + "step": 3383 + }, + { + "epoch": 0.043973536611315325, + "grad_norm": 0.38587579131126404, + "learning_rate": 0.00019124501228245754, + "loss": 1.415, + "step": 3384 + }, + { + "epoch": 0.0439865311552312, + "grad_norm": 0.3960534334182739, + "learning_rate": 0.00019124241282054616, + "loss": 1.6012, + "step": 3385 + }, + { + "epoch": 0.04399952569914707, + "grad_norm": 0.398406058549881, + "learning_rate": 0.00019123981335863476, + "loss": 1.2995, + "step": 3386 + }, + { + "epoch": 0.044012520243062944, + "grad_norm": 0.33824533224105835, + "learning_rate": 0.00019123721389672338, + "loss": 1.4585, + "step": 3387 + }, + { + "epoch": 0.04402551478697882, + "grad_norm": 0.33364346623420715, + "learning_rate": 0.000191234614434812, + "loss": 1.3822, + "step": 3388 + }, + { + "epoch": 0.04403850933089469, + "grad_norm": 0.4895845651626587, + "learning_rate": 0.0001912320149729006, + "loss": 1.4984, + "step": 3389 + }, + { + "epoch": 0.04405150387481056, + "grad_norm": 0.39063429832458496, + "learning_rate": 0.00019122941551098923, + "loss": 1.5546, + "step": 3390 + }, + { + "epoch": 0.044064498418726436, + "grad_norm": 0.3552950322628021, + "learning_rate": 0.00019122681604907783, + "loss": 1.5222, + "step": 3391 + }, + { + "epoch": 0.04407749296264231, + "grad_norm": 0.3311936557292938, + "learning_rate": 0.00019122421658716648, + "loss": 1.4323, + "step": 3392 + }, + { + "epoch": 0.04409048750655818, + "grad_norm": 0.3944788873195648, + "learning_rate": 0.00019122161712525508, + "loss": 1.5376, + "step": 3393 + }, + { + "epoch": 0.044103482050474055, + "grad_norm": 0.529399573802948, + "learning_rate": 0.0001912190176633437, + "loss": 1.7481, + "step": 3394 + }, + { + "epoch": 0.04411647659438993, + "grad_norm": 0.35166001319885254, + "learning_rate": 0.00019121641820143233, + "loss": 1.3484, + "step": 3395 + }, + { + "epoch": 0.0441294711383058, + "grad_norm": 0.3795880079269409, + "learning_rate": 0.00019121381873952092, + "loss": 1.5504, + "step": 3396 + }, + { + "epoch": 0.044142465682221674, + "grad_norm": 0.3294336795806885, + "learning_rate": 0.00019121121927760955, + "loss": 1.3608, + "step": 3397 + }, + { + "epoch": 0.044155460226137554, + "grad_norm": 0.282772421836853, + "learning_rate": 0.00019120861981569815, + "loss": 1.1645, + "step": 3398 + }, + { + "epoch": 0.04416845477005343, + "grad_norm": 0.45824310183525085, + "learning_rate": 0.0001912060203537868, + "loss": 1.4217, + "step": 3399 + }, + { + "epoch": 0.0441814493139693, + "grad_norm": 0.37385669350624084, + "learning_rate": 0.0001912034208918754, + "loss": 1.4098, + "step": 3400 + }, + { + "epoch": 0.04419444385788517, + "grad_norm": 0.3293857276439667, + "learning_rate": 0.00019120082142996402, + "loss": 1.3769, + "step": 3401 + }, + { + "epoch": 0.044207438401801046, + "grad_norm": 0.6088602542877197, + "learning_rate": 0.00019119822196805262, + "loss": 1.3635, + "step": 3402 + }, + { + "epoch": 0.04422043294571692, + "grad_norm": 0.40845146775245667, + "learning_rate": 0.00019119562250614124, + "loss": 1.5731, + "step": 3403 + }, + { + "epoch": 0.04423342748963279, + "grad_norm": 0.30555814504623413, + "learning_rate": 0.00019119302304422987, + "loss": 1.4244, + "step": 3404 + }, + { + "epoch": 0.044246422033548664, + "grad_norm": 0.4098779857158661, + "learning_rate": 0.00019119042358231846, + "loss": 1.4273, + "step": 3405 + }, + { + "epoch": 0.04425941657746454, + "grad_norm": 0.31717047095298767, + "learning_rate": 0.0001911878241204071, + "loss": 1.5747, + "step": 3406 + }, + { + "epoch": 0.04427241112138041, + "grad_norm": 0.4416353702545166, + "learning_rate": 0.0001911852246584957, + "loss": 1.5328, + "step": 3407 + }, + { + "epoch": 0.04428540566529628, + "grad_norm": 0.45297935605049133, + "learning_rate": 0.0001911826251965843, + "loss": 1.5303, + "step": 3408 + }, + { + "epoch": 0.044298400209212156, + "grad_norm": 0.36077067255973816, + "learning_rate": 0.00019118002573467293, + "loss": 1.3611, + "step": 3409 + }, + { + "epoch": 0.04431139475312803, + "grad_norm": 0.311235636472702, + "learning_rate": 0.00019117742627276153, + "loss": 1.3419, + "step": 3410 + }, + { + "epoch": 0.0443243892970439, + "grad_norm": 0.38337618112564087, + "learning_rate": 0.00019117482681085018, + "loss": 1.5022, + "step": 3411 + }, + { + "epoch": 0.044337383840959775, + "grad_norm": 0.3651482164859772, + "learning_rate": 0.00019117222734893878, + "loss": 1.5406, + "step": 3412 + }, + { + "epoch": 0.04435037838487565, + "grad_norm": 0.3558707535266876, + "learning_rate": 0.0001911696278870274, + "loss": 1.5622, + "step": 3413 + }, + { + "epoch": 0.04436337292879152, + "grad_norm": 0.3880421221256256, + "learning_rate": 0.000191167028425116, + "loss": 1.5068, + "step": 3414 + }, + { + "epoch": 0.044376367472707394, + "grad_norm": 0.4164212942123413, + "learning_rate": 0.00019116442896320463, + "loss": 1.3439, + "step": 3415 + }, + { + "epoch": 0.04438936201662327, + "grad_norm": 0.36084792017936707, + "learning_rate": 0.00019116182950129325, + "loss": 1.6476, + "step": 3416 + }, + { + "epoch": 0.04440235656053915, + "grad_norm": 0.38160771131515503, + "learning_rate": 0.00019115923003938185, + "loss": 1.3344, + "step": 3417 + }, + { + "epoch": 0.04441535110445502, + "grad_norm": 0.39801520109176636, + "learning_rate": 0.00019115663057747047, + "loss": 1.4835, + "step": 3418 + }, + { + "epoch": 0.04442834564837089, + "grad_norm": 0.4460076689720154, + "learning_rate": 0.0001911540311155591, + "loss": 1.5934, + "step": 3419 + }, + { + "epoch": 0.044441340192286766, + "grad_norm": 0.3776906430721283, + "learning_rate": 0.0001911514316536477, + "loss": 1.3509, + "step": 3420 + }, + { + "epoch": 0.04445433473620264, + "grad_norm": 0.37701916694641113, + "learning_rate": 0.00019114883219173632, + "loss": 1.4444, + "step": 3421 + }, + { + "epoch": 0.04446732928011851, + "grad_norm": 0.39519202709198, + "learning_rate": 0.00019114623272982492, + "loss": 1.4658, + "step": 3422 + }, + { + "epoch": 0.044480323824034385, + "grad_norm": 0.4301571547985077, + "learning_rate": 0.00019114363326791357, + "loss": 1.5473, + "step": 3423 + }, + { + "epoch": 0.04449331836795026, + "grad_norm": 0.33932965993881226, + "learning_rate": 0.00019114103380600217, + "loss": 1.4612, + "step": 3424 + }, + { + "epoch": 0.04450631291186613, + "grad_norm": 0.4156722128391266, + "learning_rate": 0.0001911384343440908, + "loss": 1.4608, + "step": 3425 + }, + { + "epoch": 0.044519307455782, + "grad_norm": 0.32418495416641235, + "learning_rate": 0.0001911358348821794, + "loss": 1.4705, + "step": 3426 + }, + { + "epoch": 0.044532301999697876, + "grad_norm": 0.3849780559539795, + "learning_rate": 0.000191133235420268, + "loss": 1.2606, + "step": 3427 + }, + { + "epoch": 0.04454529654361375, + "grad_norm": 0.37302151322364807, + "learning_rate": 0.00019113063595835664, + "loss": 1.5298, + "step": 3428 + }, + { + "epoch": 0.04455829108752962, + "grad_norm": 0.43634387850761414, + "learning_rate": 0.00019112803649644523, + "loss": 1.7202, + "step": 3429 + }, + { + "epoch": 0.044571285631445495, + "grad_norm": 0.2565285861492157, + "learning_rate": 0.00019112543703453386, + "loss": 1.4194, + "step": 3430 + }, + { + "epoch": 0.04458428017536137, + "grad_norm": 0.41815894842147827, + "learning_rate": 0.00019112283757262248, + "loss": 1.1767, + "step": 3431 + }, + { + "epoch": 0.04459727471927724, + "grad_norm": 0.42732083797454834, + "learning_rate": 0.00019112023811071108, + "loss": 1.4933, + "step": 3432 + }, + { + "epoch": 0.044610269263193114, + "grad_norm": 0.288154661655426, + "learning_rate": 0.0001911176386487997, + "loss": 1.2661, + "step": 3433 + }, + { + "epoch": 0.04462326380710899, + "grad_norm": 0.43552613258361816, + "learning_rate": 0.00019111503918688833, + "loss": 1.6755, + "step": 3434 + }, + { + "epoch": 0.04463625835102486, + "grad_norm": 0.38512101769447327, + "learning_rate": 0.00019111243972497695, + "loss": 1.4476, + "step": 3435 + }, + { + "epoch": 0.04464925289494074, + "grad_norm": 0.4210730493068695, + "learning_rate": 0.00019110984026306555, + "loss": 1.5755, + "step": 3436 + }, + { + "epoch": 0.04466224743885661, + "grad_norm": 0.3965669870376587, + "learning_rate": 0.00019110724080115417, + "loss": 1.3531, + "step": 3437 + }, + { + "epoch": 0.044675241982772486, + "grad_norm": 0.3597378134727478, + "learning_rate": 0.0001911046413392428, + "loss": 1.1933, + "step": 3438 + }, + { + "epoch": 0.04468823652668836, + "grad_norm": 0.550243616104126, + "learning_rate": 0.0001911020418773314, + "loss": 1.5067, + "step": 3439 + }, + { + "epoch": 0.04470123107060423, + "grad_norm": 0.3286186456680298, + "learning_rate": 0.00019109944241542002, + "loss": 1.4354, + "step": 3440 + }, + { + "epoch": 0.044714225614520105, + "grad_norm": 0.37238383293151855, + "learning_rate": 0.00019109684295350862, + "loss": 1.5725, + "step": 3441 + }, + { + "epoch": 0.04472722015843598, + "grad_norm": 0.43052932620048523, + "learning_rate": 0.00019109424349159727, + "loss": 1.4398, + "step": 3442 + }, + { + "epoch": 0.04474021470235185, + "grad_norm": 0.397022545337677, + "learning_rate": 0.00019109164402968587, + "loss": 1.4153, + "step": 3443 + }, + { + "epoch": 0.044753209246267724, + "grad_norm": 0.4465096890926361, + "learning_rate": 0.00019108904456777447, + "loss": 1.8237, + "step": 3444 + }, + { + "epoch": 0.0447662037901836, + "grad_norm": 0.34367677569389343, + "learning_rate": 0.0001910864451058631, + "loss": 1.2262, + "step": 3445 + }, + { + "epoch": 0.04477919833409947, + "grad_norm": 0.368748277425766, + "learning_rate": 0.00019108384564395171, + "loss": 1.3616, + "step": 3446 + }, + { + "epoch": 0.04479219287801534, + "grad_norm": 0.3148956894874573, + "learning_rate": 0.00019108124618204034, + "loss": 1.4302, + "step": 3447 + }, + { + "epoch": 0.044805187421931215, + "grad_norm": 0.3931005597114563, + "learning_rate": 0.00019107864672012894, + "loss": 1.4085, + "step": 3448 + }, + { + "epoch": 0.04481818196584709, + "grad_norm": 0.40465056896209717, + "learning_rate": 0.00019107604725821756, + "loss": 1.4855, + "step": 3449 + }, + { + "epoch": 0.04483117650976296, + "grad_norm": 0.5275431871414185, + "learning_rate": 0.00019107344779630618, + "loss": 1.5041, + "step": 3450 + }, + { + "epoch": 0.044844171053678834, + "grad_norm": 0.42719629406929016, + "learning_rate": 0.00019107084833439478, + "loss": 1.5919, + "step": 3451 + }, + { + "epoch": 0.04485716559759471, + "grad_norm": 0.46026965975761414, + "learning_rate": 0.0001910682488724834, + "loss": 1.5621, + "step": 3452 + }, + { + "epoch": 0.04487016014151058, + "grad_norm": 0.3713780343532562, + "learning_rate": 0.000191065649410572, + "loss": 1.4741, + "step": 3453 + }, + { + "epoch": 0.04488315468542645, + "grad_norm": 0.4859156608581543, + "learning_rate": 0.00019106304994866066, + "loss": 1.532, + "step": 3454 + }, + { + "epoch": 0.04489614922934233, + "grad_norm": 0.3467864692211151, + "learning_rate": 0.00019106045048674925, + "loss": 1.1647, + "step": 3455 + }, + { + "epoch": 0.044909143773258206, + "grad_norm": 0.3873916566371918, + "learning_rate": 0.00019105785102483785, + "loss": 1.5057, + "step": 3456 + }, + { + "epoch": 0.04492213831717408, + "grad_norm": 0.397622674703598, + "learning_rate": 0.00019105525156292647, + "loss": 1.5431, + "step": 3457 + }, + { + "epoch": 0.04493513286108995, + "grad_norm": 0.36541593074798584, + "learning_rate": 0.0001910526521010151, + "loss": 1.5761, + "step": 3458 + }, + { + "epoch": 0.044948127405005825, + "grad_norm": 0.40878790616989136, + "learning_rate": 0.00019105005263910372, + "loss": 1.3912, + "step": 3459 + }, + { + "epoch": 0.0449611219489217, + "grad_norm": 0.451894074678421, + "learning_rate": 0.00019104745317719232, + "loss": 1.5588, + "step": 3460 + }, + { + "epoch": 0.04497411649283757, + "grad_norm": 0.43258681893348694, + "learning_rate": 0.00019104485371528095, + "loss": 1.5533, + "step": 3461 + }, + { + "epoch": 0.044987111036753444, + "grad_norm": 0.4070456624031067, + "learning_rate": 0.00019104225425336957, + "loss": 1.4362, + "step": 3462 + }, + { + "epoch": 0.04500010558066932, + "grad_norm": 0.4204276502132416, + "learning_rate": 0.00019103965479145817, + "loss": 1.6424, + "step": 3463 + }, + { + "epoch": 0.04501310012458519, + "grad_norm": 0.3918856382369995, + "learning_rate": 0.0001910370553295468, + "loss": 1.427, + "step": 3464 + }, + { + "epoch": 0.04502609466850106, + "grad_norm": 0.4557289481163025, + "learning_rate": 0.0001910344558676354, + "loss": 1.3844, + "step": 3465 + }, + { + "epoch": 0.045039089212416936, + "grad_norm": 0.47764307260513306, + "learning_rate": 0.00019103185640572404, + "loss": 1.4944, + "step": 3466 + }, + { + "epoch": 0.04505208375633281, + "grad_norm": 0.4500553607940674, + "learning_rate": 0.00019102925694381264, + "loss": 1.5151, + "step": 3467 + }, + { + "epoch": 0.04506507830024868, + "grad_norm": 0.36121290922164917, + "learning_rate": 0.00019102665748190126, + "loss": 1.279, + "step": 3468 + }, + { + "epoch": 0.045078072844164555, + "grad_norm": 0.3657602071762085, + "learning_rate": 0.0001910240580199899, + "loss": 1.2671, + "step": 3469 + }, + { + "epoch": 0.04509106738808043, + "grad_norm": 0.3611789047718048, + "learning_rate": 0.00019102145855807848, + "loss": 1.3417, + "step": 3470 + }, + { + "epoch": 0.0451040619319963, + "grad_norm": 0.34144169092178345, + "learning_rate": 0.0001910188590961671, + "loss": 1.1636, + "step": 3471 + }, + { + "epoch": 0.04511705647591217, + "grad_norm": 0.349431574344635, + "learning_rate": 0.0001910162596342557, + "loss": 1.5859, + "step": 3472 + }, + { + "epoch": 0.045130051019828046, + "grad_norm": 0.499302476644516, + "learning_rate": 0.00019101366017234433, + "loss": 1.49, + "step": 3473 + }, + { + "epoch": 0.045143045563743926, + "grad_norm": 0.2950809895992279, + "learning_rate": 0.00019101106071043296, + "loss": 1.3242, + "step": 3474 + }, + { + "epoch": 0.0451560401076598, + "grad_norm": 0.35766127705574036, + "learning_rate": 0.00019100846124852155, + "loss": 1.5298, + "step": 3475 + }, + { + "epoch": 0.04516903465157567, + "grad_norm": 0.30416497588157654, + "learning_rate": 0.00019100586178661018, + "loss": 1.5598, + "step": 3476 + }, + { + "epoch": 0.045182029195491545, + "grad_norm": 0.4109727740287781, + "learning_rate": 0.0001910032623246988, + "loss": 1.224, + "step": 3477 + }, + { + "epoch": 0.04519502373940742, + "grad_norm": 0.3664378523826599, + "learning_rate": 0.00019100066286278743, + "loss": 1.4657, + "step": 3478 + }, + { + "epoch": 0.04520801828332329, + "grad_norm": 0.5163008570671082, + "learning_rate": 0.00019099806340087602, + "loss": 1.4298, + "step": 3479 + }, + { + "epoch": 0.045221012827239164, + "grad_norm": 0.4767324924468994, + "learning_rate": 0.00019099546393896465, + "loss": 1.4132, + "step": 3480 + }, + { + "epoch": 0.04523400737115504, + "grad_norm": 0.35452350974082947, + "learning_rate": 0.00019099286447705327, + "loss": 1.4272, + "step": 3481 + }, + { + "epoch": 0.04524700191507091, + "grad_norm": 0.4030720293521881, + "learning_rate": 0.00019099026501514187, + "loss": 1.6696, + "step": 3482 + }, + { + "epoch": 0.04525999645898678, + "grad_norm": 0.30554822087287903, + "learning_rate": 0.0001909876655532305, + "loss": 1.3461, + "step": 3483 + }, + { + "epoch": 0.045272991002902656, + "grad_norm": 0.4898455739021301, + "learning_rate": 0.0001909850660913191, + "loss": 1.5115, + "step": 3484 + }, + { + "epoch": 0.04528598554681853, + "grad_norm": 0.4397023022174835, + "learning_rate": 0.00019098246662940774, + "loss": 1.3619, + "step": 3485 + }, + { + "epoch": 0.0452989800907344, + "grad_norm": 0.44009312987327576, + "learning_rate": 0.00019097986716749634, + "loss": 1.411, + "step": 3486 + }, + { + "epoch": 0.045311974634650275, + "grad_norm": 0.33126917481422424, + "learning_rate": 0.00019097726770558494, + "loss": 1.3827, + "step": 3487 + }, + { + "epoch": 0.04532496917856615, + "grad_norm": 0.3113521337509155, + "learning_rate": 0.00019097466824367356, + "loss": 1.3504, + "step": 3488 + }, + { + "epoch": 0.04533796372248202, + "grad_norm": 0.4283364713191986, + "learning_rate": 0.0001909720687817622, + "loss": 1.4902, + "step": 3489 + }, + { + "epoch": 0.045350958266397894, + "grad_norm": 0.3907061219215393, + "learning_rate": 0.0001909694693198508, + "loss": 1.4679, + "step": 3490 + }, + { + "epoch": 0.045363952810313767, + "grad_norm": 0.4915041923522949, + "learning_rate": 0.0001909668698579394, + "loss": 1.4044, + "step": 3491 + }, + { + "epoch": 0.04537694735422964, + "grad_norm": 0.3264465630054474, + "learning_rate": 0.00019096427039602803, + "loss": 1.4853, + "step": 3492 + }, + { + "epoch": 0.04538994189814552, + "grad_norm": 0.39820829033851624, + "learning_rate": 0.00019096167093411666, + "loss": 1.4823, + "step": 3493 + }, + { + "epoch": 0.04540293644206139, + "grad_norm": 0.32205459475517273, + "learning_rate": 0.00019095907147220526, + "loss": 1.2417, + "step": 3494 + }, + { + "epoch": 0.045415930985977265, + "grad_norm": 0.38776856660842896, + "learning_rate": 0.00019095647201029388, + "loss": 1.4646, + "step": 3495 + }, + { + "epoch": 0.04542892552989314, + "grad_norm": 0.33155444264411926, + "learning_rate": 0.00019095387254838248, + "loss": 1.1417, + "step": 3496 + }, + { + "epoch": 0.04544192007380901, + "grad_norm": 0.3750079870223999, + "learning_rate": 0.00019095127308647113, + "loss": 1.4263, + "step": 3497 + }, + { + "epoch": 0.045454914617724884, + "grad_norm": 0.3497723937034607, + "learning_rate": 0.00019094867362455973, + "loss": 1.4163, + "step": 3498 + }, + { + "epoch": 0.04546790916164076, + "grad_norm": 0.4343213737010956, + "learning_rate": 0.00019094607416264832, + "loss": 1.4147, + "step": 3499 + }, + { + "epoch": 0.04548090370555663, + "grad_norm": 0.46293163299560547, + "learning_rate": 0.00019094347470073695, + "loss": 1.4799, + "step": 3500 + }, + { + "epoch": 0.0454938982494725, + "grad_norm": 0.43661198019981384, + "learning_rate": 0.00019094087523882557, + "loss": 1.4232, + "step": 3501 + }, + { + "epoch": 0.045506892793388376, + "grad_norm": 0.4032571613788605, + "learning_rate": 0.0001909382757769142, + "loss": 1.322, + "step": 3502 + }, + { + "epoch": 0.04551988733730425, + "grad_norm": 0.3524533212184906, + "learning_rate": 0.0001909356763150028, + "loss": 1.2897, + "step": 3503 + }, + { + "epoch": 0.04553288188122012, + "grad_norm": 0.3699171543121338, + "learning_rate": 0.00019093307685309142, + "loss": 1.5074, + "step": 3504 + }, + { + "epoch": 0.045545876425135995, + "grad_norm": 0.40244174003601074, + "learning_rate": 0.00019093047739118004, + "loss": 1.5089, + "step": 3505 + }, + { + "epoch": 0.04555887096905187, + "grad_norm": 0.37908700108528137, + "learning_rate": 0.00019092787792926864, + "loss": 1.5981, + "step": 3506 + }, + { + "epoch": 0.04557186551296774, + "grad_norm": 0.3592241108417511, + "learning_rate": 0.00019092527846735727, + "loss": 1.38, + "step": 3507 + }, + { + "epoch": 0.045584860056883614, + "grad_norm": 0.45989441871643066, + "learning_rate": 0.0001909226790054459, + "loss": 1.3989, + "step": 3508 + }, + { + "epoch": 0.04559785460079949, + "grad_norm": 0.3594523072242737, + "learning_rate": 0.00019092007954353451, + "loss": 1.4228, + "step": 3509 + }, + { + "epoch": 0.04561084914471536, + "grad_norm": 0.37525588274002075, + "learning_rate": 0.0001909174800816231, + "loss": 1.4879, + "step": 3510 + }, + { + "epoch": 0.04562384368863123, + "grad_norm": 0.4640478193759918, + "learning_rate": 0.0001909148806197117, + "loss": 1.5419, + "step": 3511 + }, + { + "epoch": 0.04563683823254711, + "grad_norm": 0.34540554881095886, + "learning_rate": 0.00019091228115780036, + "loss": 1.596, + "step": 3512 + }, + { + "epoch": 0.045649832776462985, + "grad_norm": 0.35171157121658325, + "learning_rate": 0.00019090968169588896, + "loss": 1.4258, + "step": 3513 + }, + { + "epoch": 0.04566282732037886, + "grad_norm": 0.3607141077518463, + "learning_rate": 0.00019090708223397758, + "loss": 1.4261, + "step": 3514 + }, + { + "epoch": 0.04567582186429473, + "grad_norm": 0.31665804982185364, + "learning_rate": 0.00019090448277206618, + "loss": 1.3936, + "step": 3515 + }, + { + "epoch": 0.045688816408210604, + "grad_norm": 0.3263287842273712, + "learning_rate": 0.0001909018833101548, + "loss": 1.4128, + "step": 3516 + }, + { + "epoch": 0.04570181095212648, + "grad_norm": 0.4578251242637634, + "learning_rate": 0.00019089928384824343, + "loss": 1.4633, + "step": 3517 + }, + { + "epoch": 0.04571480549604235, + "grad_norm": 0.39254575967788696, + "learning_rate": 0.00019089668438633203, + "loss": 1.4463, + "step": 3518 + }, + { + "epoch": 0.04572780003995822, + "grad_norm": 0.4729418456554413, + "learning_rate": 0.00019089408492442065, + "loss": 1.4424, + "step": 3519 + }, + { + "epoch": 0.045740794583874096, + "grad_norm": 0.3395959138870239, + "learning_rate": 0.00019089148546250928, + "loss": 1.5223, + "step": 3520 + }, + { + "epoch": 0.04575378912778997, + "grad_norm": 0.4151439070701599, + "learning_rate": 0.0001908888860005979, + "loss": 1.5108, + "step": 3521 + }, + { + "epoch": 0.04576678367170584, + "grad_norm": 0.31169265508651733, + "learning_rate": 0.0001908862865386865, + "loss": 1.339, + "step": 3522 + }, + { + "epoch": 0.045779778215621715, + "grad_norm": 0.38498082756996155, + "learning_rate": 0.00019088368707677512, + "loss": 1.5744, + "step": 3523 + }, + { + "epoch": 0.04579277275953759, + "grad_norm": 0.46308454871177673, + "learning_rate": 0.00019088108761486375, + "loss": 1.5169, + "step": 3524 + }, + { + "epoch": 0.04580576730345346, + "grad_norm": 0.36932235956192017, + "learning_rate": 0.00019087848815295234, + "loss": 1.5982, + "step": 3525 + }, + { + "epoch": 0.045818761847369334, + "grad_norm": 0.4504307806491852, + "learning_rate": 0.00019087588869104097, + "loss": 1.2685, + "step": 3526 + }, + { + "epoch": 0.04583175639128521, + "grad_norm": 0.4500713646411896, + "learning_rate": 0.00019087328922912957, + "loss": 1.5148, + "step": 3527 + }, + { + "epoch": 0.04584475093520108, + "grad_norm": 0.43191471695899963, + "learning_rate": 0.0001908706897672182, + "loss": 1.3907, + "step": 3528 + }, + { + "epoch": 0.04585774547911695, + "grad_norm": 0.23284229636192322, + "learning_rate": 0.00019086809030530681, + "loss": 1.4453, + "step": 3529 + }, + { + "epoch": 0.045870740023032826, + "grad_norm": 0.39071956276893616, + "learning_rate": 0.0001908654908433954, + "loss": 1.5298, + "step": 3530 + }, + { + "epoch": 0.045883734566948706, + "grad_norm": 0.4016498327255249, + "learning_rate": 0.00019086289138148404, + "loss": 1.3926, + "step": 3531 + }, + { + "epoch": 0.04589672911086458, + "grad_norm": 0.39591899514198303, + "learning_rate": 0.00019086029191957266, + "loss": 1.7326, + "step": 3532 + }, + { + "epoch": 0.04590972365478045, + "grad_norm": 0.5015997290611267, + "learning_rate": 0.00019085769245766129, + "loss": 1.5801, + "step": 3533 + }, + { + "epoch": 0.045922718198696325, + "grad_norm": 0.37236592173576355, + "learning_rate": 0.00019085509299574988, + "loss": 1.3658, + "step": 3534 + }, + { + "epoch": 0.0459357127426122, + "grad_norm": 0.3775648772716522, + "learning_rate": 0.0001908524935338385, + "loss": 1.4203, + "step": 3535 + }, + { + "epoch": 0.04594870728652807, + "grad_norm": 0.41073334217071533, + "learning_rate": 0.00019084989407192713, + "loss": 1.3554, + "step": 3536 + }, + { + "epoch": 0.04596170183044394, + "grad_norm": 0.38080254197120667, + "learning_rate": 0.00019084729461001573, + "loss": 1.3259, + "step": 3537 + }, + { + "epoch": 0.045974696374359816, + "grad_norm": 0.3639844059944153, + "learning_rate": 0.00019084469514810435, + "loss": 1.5634, + "step": 3538 + }, + { + "epoch": 0.04598769091827569, + "grad_norm": 0.4045422673225403, + "learning_rate": 0.00019084209568619295, + "loss": 1.6289, + "step": 3539 + }, + { + "epoch": 0.04600068546219156, + "grad_norm": 0.3872968554496765, + "learning_rate": 0.00019083949622428158, + "loss": 1.3977, + "step": 3540 + }, + { + "epoch": 0.046013680006107435, + "grad_norm": 0.33250367641448975, + "learning_rate": 0.0001908368967623702, + "loss": 1.4352, + "step": 3541 + }, + { + "epoch": 0.04602667455002331, + "grad_norm": 0.6230947971343994, + "learning_rate": 0.0001908342973004588, + "loss": 1.3423, + "step": 3542 + }, + { + "epoch": 0.04603966909393918, + "grad_norm": 0.3451698422431946, + "learning_rate": 0.00019083169783854745, + "loss": 1.4538, + "step": 3543 + }, + { + "epoch": 0.046052663637855054, + "grad_norm": 0.4396614730358124, + "learning_rate": 0.00019082909837663605, + "loss": 1.4726, + "step": 3544 + }, + { + "epoch": 0.04606565818177093, + "grad_norm": 0.4091491997241974, + "learning_rate": 0.00019082649891472467, + "loss": 1.7656, + "step": 3545 + }, + { + "epoch": 0.0460786527256868, + "grad_norm": 0.341570109128952, + "learning_rate": 0.00019082389945281327, + "loss": 1.2968, + "step": 3546 + }, + { + "epoch": 0.04609164726960267, + "grad_norm": 0.3608258366584778, + "learning_rate": 0.0001908212999909019, + "loss": 1.361, + "step": 3547 + }, + { + "epoch": 0.046104641813518546, + "grad_norm": 0.44565585255622864, + "learning_rate": 0.00019081870052899052, + "loss": 1.4502, + "step": 3548 + }, + { + "epoch": 0.04611763635743442, + "grad_norm": 0.4324675500392914, + "learning_rate": 0.00019081610106707911, + "loss": 1.5883, + "step": 3549 + }, + { + "epoch": 0.0461306309013503, + "grad_norm": 0.41858798265457153, + "learning_rate": 0.00019081350160516774, + "loss": 1.5429, + "step": 3550 + }, + { + "epoch": 0.04614362544526617, + "grad_norm": 0.41675299406051636, + "learning_rate": 0.00019081090214325636, + "loss": 1.3907, + "step": 3551 + }, + { + "epoch": 0.046156619989182045, + "grad_norm": 0.45589423179626465, + "learning_rate": 0.000190808302681345, + "loss": 1.4325, + "step": 3552 + }, + { + "epoch": 0.04616961453309792, + "grad_norm": 0.37698423862457275, + "learning_rate": 0.00019080570321943359, + "loss": 1.5058, + "step": 3553 + }, + { + "epoch": 0.04618260907701379, + "grad_norm": 0.29102256894111633, + "learning_rate": 0.00019080310375752218, + "loss": 1.2826, + "step": 3554 + }, + { + "epoch": 0.046195603620929664, + "grad_norm": 0.3150346875190735, + "learning_rate": 0.00019080050429561083, + "loss": 1.4803, + "step": 3555 + }, + { + "epoch": 0.04620859816484554, + "grad_norm": 0.42382198572158813, + "learning_rate": 0.00019079790483369943, + "loss": 1.4414, + "step": 3556 + }, + { + "epoch": 0.04622159270876141, + "grad_norm": 0.48112672567367554, + "learning_rate": 0.00019079530537178806, + "loss": 1.3901, + "step": 3557 + }, + { + "epoch": 0.04623458725267728, + "grad_norm": 0.2613075375556946, + "learning_rate": 0.00019079270590987665, + "loss": 1.334, + "step": 3558 + }, + { + "epoch": 0.046247581796593155, + "grad_norm": 0.5119329690933228, + "learning_rate": 0.00019079010644796528, + "loss": 1.5087, + "step": 3559 + }, + { + "epoch": 0.04626057634050903, + "grad_norm": 0.4582822620868683, + "learning_rate": 0.0001907875069860539, + "loss": 1.3544, + "step": 3560 + }, + { + "epoch": 0.0462735708844249, + "grad_norm": 0.38838452100753784, + "learning_rate": 0.0001907849075241425, + "loss": 1.5, + "step": 3561 + }, + { + "epoch": 0.046286565428340774, + "grad_norm": 0.41043031215667725, + "learning_rate": 0.00019078230806223112, + "loss": 1.4815, + "step": 3562 + }, + { + "epoch": 0.04629955997225665, + "grad_norm": 0.3754003345966339, + "learning_rate": 0.00019077970860031975, + "loss": 1.4818, + "step": 3563 + }, + { + "epoch": 0.04631255451617252, + "grad_norm": 0.30324748158454895, + "learning_rate": 0.00019077710913840837, + "loss": 1.3506, + "step": 3564 + }, + { + "epoch": 0.04632554906008839, + "grad_norm": 0.3835824131965637, + "learning_rate": 0.00019077450967649697, + "loss": 1.5176, + "step": 3565 + }, + { + "epoch": 0.046338543604004266, + "grad_norm": 0.34142348170280457, + "learning_rate": 0.00019077191021458557, + "loss": 1.3284, + "step": 3566 + }, + { + "epoch": 0.04635153814792014, + "grad_norm": 0.4325087368488312, + "learning_rate": 0.00019076931075267422, + "loss": 1.5274, + "step": 3567 + }, + { + "epoch": 0.04636453269183601, + "grad_norm": 0.41978904604911804, + "learning_rate": 0.00019076671129076282, + "loss": 1.3381, + "step": 3568 + }, + { + "epoch": 0.04637752723575189, + "grad_norm": 0.43123379349708557, + "learning_rate": 0.00019076411182885144, + "loss": 1.5037, + "step": 3569 + }, + { + "epoch": 0.046390521779667765, + "grad_norm": 0.3783569037914276, + "learning_rate": 0.00019076151236694004, + "loss": 1.3391, + "step": 3570 + }, + { + "epoch": 0.04640351632358364, + "grad_norm": 0.4903333783149719, + "learning_rate": 0.00019075891290502866, + "loss": 1.5181, + "step": 3571 + }, + { + "epoch": 0.04641651086749951, + "grad_norm": 0.383445143699646, + "learning_rate": 0.0001907563134431173, + "loss": 1.0928, + "step": 3572 + }, + { + "epoch": 0.046429505411415384, + "grad_norm": 0.4517488181591034, + "learning_rate": 0.00019075371398120589, + "loss": 1.6872, + "step": 3573 + }, + { + "epoch": 0.04644249995533126, + "grad_norm": 0.3907638192176819, + "learning_rate": 0.0001907511145192945, + "loss": 1.2761, + "step": 3574 + }, + { + "epoch": 0.04645549449924713, + "grad_norm": 0.4366452991962433, + "learning_rate": 0.00019074851505738313, + "loss": 1.39, + "step": 3575 + }, + { + "epoch": 0.046468489043163, + "grad_norm": 0.46755126118659973, + "learning_rate": 0.00019074591559547176, + "loss": 1.5274, + "step": 3576 + }, + { + "epoch": 0.046481483587078876, + "grad_norm": 0.4617554545402527, + "learning_rate": 0.00019074331613356036, + "loss": 1.5557, + "step": 3577 + }, + { + "epoch": 0.04649447813099475, + "grad_norm": 0.38301703333854675, + "learning_rate": 0.00019074071667164898, + "loss": 1.3252, + "step": 3578 + }, + { + "epoch": 0.04650747267491062, + "grad_norm": 0.4291174113750458, + "learning_rate": 0.0001907381172097376, + "loss": 1.3795, + "step": 3579 + }, + { + "epoch": 0.046520467218826494, + "grad_norm": 0.4066070318222046, + "learning_rate": 0.0001907355177478262, + "loss": 1.4005, + "step": 3580 + }, + { + "epoch": 0.04653346176274237, + "grad_norm": 0.3907028138637543, + "learning_rate": 0.00019073291828591483, + "loss": 1.3814, + "step": 3581 + }, + { + "epoch": 0.04654645630665824, + "grad_norm": 0.4663504660129547, + "learning_rate": 0.00019073031882400345, + "loss": 1.5346, + "step": 3582 + }, + { + "epoch": 0.04655945085057411, + "grad_norm": 0.32876548171043396, + "learning_rate": 0.00019072771936209205, + "loss": 1.5176, + "step": 3583 + }, + { + "epoch": 0.046572445394489986, + "grad_norm": 0.45175355672836304, + "learning_rate": 0.00019072511990018067, + "loss": 1.3537, + "step": 3584 + }, + { + "epoch": 0.04658543993840586, + "grad_norm": 0.35481515526771545, + "learning_rate": 0.00019072252043826927, + "loss": 1.5478, + "step": 3585 + }, + { + "epoch": 0.04659843448232173, + "grad_norm": 0.29223692417144775, + "learning_rate": 0.00019071992097635792, + "loss": 1.6045, + "step": 3586 + }, + { + "epoch": 0.046611429026237605, + "grad_norm": 0.3558688759803772, + "learning_rate": 0.00019071732151444652, + "loss": 1.4885, + "step": 3587 + }, + { + "epoch": 0.046624423570153485, + "grad_norm": 0.4412180781364441, + "learning_rate": 0.00019071472205253514, + "loss": 1.4081, + "step": 3588 + }, + { + "epoch": 0.04663741811406936, + "grad_norm": 0.44046077132225037, + "learning_rate": 0.00019071212259062374, + "loss": 1.4469, + "step": 3589 + }, + { + "epoch": 0.04665041265798523, + "grad_norm": 0.3765508532524109, + "learning_rate": 0.00019070952312871237, + "loss": 1.2601, + "step": 3590 + }, + { + "epoch": 0.046663407201901104, + "grad_norm": 0.4437933564186096, + "learning_rate": 0.000190706923666801, + "loss": 1.6313, + "step": 3591 + }, + { + "epoch": 0.04667640174581698, + "grad_norm": 0.4332149624824524, + "learning_rate": 0.0001907043242048896, + "loss": 1.4368, + "step": 3592 + }, + { + "epoch": 0.04668939628973285, + "grad_norm": 0.5110267996788025, + "learning_rate": 0.0001907017247429782, + "loss": 1.4827, + "step": 3593 + }, + { + "epoch": 0.04670239083364872, + "grad_norm": 0.3366774916648865, + "learning_rate": 0.00019069912528106684, + "loss": 1.4352, + "step": 3594 + }, + { + "epoch": 0.046715385377564596, + "grad_norm": 0.35826683044433594, + "learning_rate": 0.00019069652581915543, + "loss": 1.512, + "step": 3595 + }, + { + "epoch": 0.04672837992148047, + "grad_norm": 0.4029589593410492, + "learning_rate": 0.00019069392635724406, + "loss": 1.4252, + "step": 3596 + }, + { + "epoch": 0.04674137446539634, + "grad_norm": 0.4411650002002716, + "learning_rate": 0.00019069132689533266, + "loss": 1.5004, + "step": 3597 + }, + { + "epoch": 0.046754369009312215, + "grad_norm": 0.4946557581424713, + "learning_rate": 0.0001906887274334213, + "loss": 1.3942, + "step": 3598 + }, + { + "epoch": 0.04676736355322809, + "grad_norm": 0.4434962868690491, + "learning_rate": 0.0001906861279715099, + "loss": 1.4991, + "step": 3599 + }, + { + "epoch": 0.04678035809714396, + "grad_norm": 0.338807076215744, + "learning_rate": 0.00019068352850959853, + "loss": 1.1774, + "step": 3600 + }, + { + "epoch": 0.046793352641059834, + "grad_norm": 0.40168771147727966, + "learning_rate": 0.00019068092904768713, + "loss": 1.5569, + "step": 3601 + }, + { + "epoch": 0.046806347184975707, + "grad_norm": 0.3353491425514221, + "learning_rate": 0.00019067832958577575, + "loss": 1.4, + "step": 3602 + }, + { + "epoch": 0.04681934172889158, + "grad_norm": 0.4418715238571167, + "learning_rate": 0.00019067573012386438, + "loss": 1.4268, + "step": 3603 + }, + { + "epoch": 0.04683233627280745, + "grad_norm": 0.3415584862232208, + "learning_rate": 0.00019067313066195297, + "loss": 1.3771, + "step": 3604 + }, + { + "epoch": 0.046845330816723325, + "grad_norm": 0.48909637331962585, + "learning_rate": 0.0001906705312000416, + "loss": 1.3793, + "step": 3605 + }, + { + "epoch": 0.0468583253606392, + "grad_norm": 0.3500750958919525, + "learning_rate": 0.00019066793173813022, + "loss": 1.4094, + "step": 3606 + }, + { + "epoch": 0.04687131990455508, + "grad_norm": 0.3301793038845062, + "learning_rate": 0.00019066533227621885, + "loss": 1.388, + "step": 3607 + }, + { + "epoch": 0.04688431444847095, + "grad_norm": 0.4042547345161438, + "learning_rate": 0.00019066273281430744, + "loss": 1.4307, + "step": 3608 + }, + { + "epoch": 0.046897308992386824, + "grad_norm": 0.31483331322669983, + "learning_rate": 0.00019066013335239604, + "loss": 1.3205, + "step": 3609 + }, + { + "epoch": 0.0469103035363027, + "grad_norm": 0.40097078680992126, + "learning_rate": 0.0001906575338904847, + "loss": 1.6694, + "step": 3610 + }, + { + "epoch": 0.04692329808021857, + "grad_norm": 0.3837755620479584, + "learning_rate": 0.0001906549344285733, + "loss": 1.5345, + "step": 3611 + }, + { + "epoch": 0.04693629262413444, + "grad_norm": 0.4778191149234772, + "learning_rate": 0.00019065233496666191, + "loss": 1.5407, + "step": 3612 + }, + { + "epoch": 0.046949287168050316, + "grad_norm": 0.415060430765152, + "learning_rate": 0.0001906497355047505, + "loss": 1.5277, + "step": 3613 + }, + { + "epoch": 0.04696228171196619, + "grad_norm": 0.2776385247707367, + "learning_rate": 0.00019064713604283914, + "loss": 1.361, + "step": 3614 + }, + { + "epoch": 0.04697527625588206, + "grad_norm": 0.3970852792263031, + "learning_rate": 0.00019064453658092776, + "loss": 1.2939, + "step": 3615 + }, + { + "epoch": 0.046988270799797935, + "grad_norm": 0.36459505558013916, + "learning_rate": 0.00019064193711901636, + "loss": 1.4153, + "step": 3616 + }, + { + "epoch": 0.04700126534371381, + "grad_norm": 0.45243555307388306, + "learning_rate": 0.000190639337657105, + "loss": 1.5853, + "step": 3617 + }, + { + "epoch": 0.04701425988762968, + "grad_norm": 0.37392571568489075, + "learning_rate": 0.0001906367381951936, + "loss": 1.4884, + "step": 3618 + }, + { + "epoch": 0.047027254431545554, + "grad_norm": 0.3919658660888672, + "learning_rate": 0.00019063413873328223, + "loss": 1.4168, + "step": 3619 + }, + { + "epoch": 0.04704024897546143, + "grad_norm": 0.3406011462211609, + "learning_rate": 0.00019063153927137083, + "loss": 1.4847, + "step": 3620 + }, + { + "epoch": 0.0470532435193773, + "grad_norm": 0.34002724289894104, + "learning_rate": 0.00019062893980945945, + "loss": 1.4379, + "step": 3621 + }, + { + "epoch": 0.04706623806329317, + "grad_norm": 0.3958209753036499, + "learning_rate": 0.00019062634034754808, + "loss": 1.526, + "step": 3622 + }, + { + "epoch": 0.047079232607209046, + "grad_norm": 0.3342420160770416, + "learning_rate": 0.00019062374088563668, + "loss": 1.5268, + "step": 3623 + }, + { + "epoch": 0.04709222715112492, + "grad_norm": 0.45454952120780945, + "learning_rate": 0.0001906211414237253, + "loss": 1.6099, + "step": 3624 + }, + { + "epoch": 0.04710522169504079, + "grad_norm": 0.3253743648529053, + "learning_rate": 0.00019061854196181392, + "loss": 1.6407, + "step": 3625 + }, + { + "epoch": 0.04711821623895667, + "grad_norm": 0.4331594705581665, + "learning_rate": 0.00019061594249990252, + "loss": 1.6284, + "step": 3626 + }, + { + "epoch": 0.047131210782872544, + "grad_norm": 0.41288936138153076, + "learning_rate": 0.00019061334303799115, + "loss": 1.5875, + "step": 3627 + }, + { + "epoch": 0.04714420532678842, + "grad_norm": 0.22082705795764923, + "learning_rate": 0.00019061074357607974, + "loss": 1.2422, + "step": 3628 + }, + { + "epoch": 0.04715719987070429, + "grad_norm": 0.450970321893692, + "learning_rate": 0.0001906081441141684, + "loss": 1.5674, + "step": 3629 + }, + { + "epoch": 0.04717019441462016, + "grad_norm": 0.3594655990600586, + "learning_rate": 0.000190605544652257, + "loss": 1.3607, + "step": 3630 + }, + { + "epoch": 0.047183188958536036, + "grad_norm": 0.4247593581676483, + "learning_rate": 0.00019060294519034562, + "loss": 1.352, + "step": 3631 + }, + { + "epoch": 0.04719618350245191, + "grad_norm": 0.36219149827957153, + "learning_rate": 0.00019060034572843421, + "loss": 1.4978, + "step": 3632 + }, + { + "epoch": 0.04720917804636778, + "grad_norm": 0.36923158168792725, + "learning_rate": 0.00019059774626652284, + "loss": 1.3403, + "step": 3633 + }, + { + "epoch": 0.047222172590283655, + "grad_norm": 0.414307177066803, + "learning_rate": 0.00019059514680461146, + "loss": 1.4527, + "step": 3634 + }, + { + "epoch": 0.04723516713419953, + "grad_norm": 0.330245703458786, + "learning_rate": 0.00019059254734270006, + "loss": 1.2773, + "step": 3635 + }, + { + "epoch": 0.0472481616781154, + "grad_norm": 0.35710257291793823, + "learning_rate": 0.00019058994788078869, + "loss": 1.2525, + "step": 3636 + }, + { + "epoch": 0.047261156222031274, + "grad_norm": 0.34154680371284485, + "learning_rate": 0.0001905873484188773, + "loss": 1.3289, + "step": 3637 + }, + { + "epoch": 0.04727415076594715, + "grad_norm": 0.4402523338794708, + "learning_rate": 0.0001905847489569659, + "loss": 1.5782, + "step": 3638 + }, + { + "epoch": 0.04728714530986302, + "grad_norm": 0.37777984142303467, + "learning_rate": 0.00019058214949505453, + "loss": 1.4294, + "step": 3639 + }, + { + "epoch": 0.04730013985377889, + "grad_norm": 0.43438291549682617, + "learning_rate": 0.00019057955003314313, + "loss": 1.4628, + "step": 3640 + }, + { + "epoch": 0.047313134397694766, + "grad_norm": 0.3909463882446289, + "learning_rate": 0.00019057695057123178, + "loss": 1.4606, + "step": 3641 + }, + { + "epoch": 0.04732612894161064, + "grad_norm": 0.3320329189300537, + "learning_rate": 0.00019057435110932038, + "loss": 1.3508, + "step": 3642 + }, + { + "epoch": 0.04733912348552651, + "grad_norm": 0.32663288712501526, + "learning_rate": 0.000190571751647409, + "loss": 1.4622, + "step": 3643 + }, + { + "epoch": 0.047352118029442385, + "grad_norm": 0.3893803358078003, + "learning_rate": 0.0001905691521854976, + "loss": 1.3823, + "step": 3644 + }, + { + "epoch": 0.047365112573358265, + "grad_norm": 0.3809978663921356, + "learning_rate": 0.00019056655272358622, + "loss": 1.4029, + "step": 3645 + }, + { + "epoch": 0.04737810711727414, + "grad_norm": 0.4102168381214142, + "learning_rate": 0.00019056395326167485, + "loss": 1.3259, + "step": 3646 + }, + { + "epoch": 0.04739110166119001, + "grad_norm": 0.3692684471607208, + "learning_rate": 0.00019056135379976345, + "loss": 1.4453, + "step": 3647 + }, + { + "epoch": 0.04740409620510588, + "grad_norm": 0.3841249942779541, + "learning_rate": 0.00019055875433785207, + "loss": 1.5368, + "step": 3648 + }, + { + "epoch": 0.047417090749021756, + "grad_norm": 0.35412758588790894, + "learning_rate": 0.0001905561548759407, + "loss": 1.6147, + "step": 3649 + }, + { + "epoch": 0.04743008529293763, + "grad_norm": 0.38396957516670227, + "learning_rate": 0.0001905535554140293, + "loss": 1.4088, + "step": 3650 + }, + { + "epoch": 0.0474430798368535, + "grad_norm": 0.45067089796066284, + "learning_rate": 0.00019055095595211792, + "loss": 1.5147, + "step": 3651 + }, + { + "epoch": 0.047456074380769375, + "grad_norm": 0.40251994132995605, + "learning_rate": 0.00019054835649020651, + "loss": 1.3527, + "step": 3652 + }, + { + "epoch": 0.04746906892468525, + "grad_norm": 0.30425411462783813, + "learning_rate": 0.00019054575702829517, + "loss": 1.2581, + "step": 3653 + }, + { + "epoch": 0.04748206346860112, + "grad_norm": 0.3596998155117035, + "learning_rate": 0.00019054315756638376, + "loss": 1.508, + "step": 3654 + }, + { + "epoch": 0.047495058012516994, + "grad_norm": 0.3922152519226074, + "learning_rate": 0.0001905405581044724, + "loss": 1.3969, + "step": 3655 + }, + { + "epoch": 0.04750805255643287, + "grad_norm": 0.3967946767807007, + "learning_rate": 0.000190537958642561, + "loss": 1.3867, + "step": 3656 + }, + { + "epoch": 0.04752104710034874, + "grad_norm": 0.39123407006263733, + "learning_rate": 0.0001905353591806496, + "loss": 1.4557, + "step": 3657 + }, + { + "epoch": 0.04753404164426461, + "grad_norm": 0.33910247683525085, + "learning_rate": 0.00019053275971873823, + "loss": 1.5389, + "step": 3658 + }, + { + "epoch": 0.047547036188180486, + "grad_norm": 0.3255355656147003, + "learning_rate": 0.00019053016025682683, + "loss": 1.4173, + "step": 3659 + }, + { + "epoch": 0.04756003073209636, + "grad_norm": 0.38558611273765564, + "learning_rate": 0.00019052756079491548, + "loss": 1.635, + "step": 3660 + }, + { + "epoch": 0.04757302527601223, + "grad_norm": 0.318462610244751, + "learning_rate": 0.00019052496133300408, + "loss": 1.3435, + "step": 3661 + }, + { + "epoch": 0.047586019819928105, + "grad_norm": 0.4146612584590912, + "learning_rate": 0.00019052236187109268, + "loss": 1.1471, + "step": 3662 + }, + { + "epoch": 0.04759901436384398, + "grad_norm": 0.5100176930427551, + "learning_rate": 0.0001905197624091813, + "loss": 1.5207, + "step": 3663 + }, + { + "epoch": 0.04761200890775986, + "grad_norm": 0.2977929413318634, + "learning_rate": 0.00019051716294726993, + "loss": 1.3735, + "step": 3664 + }, + { + "epoch": 0.04762500345167573, + "grad_norm": 0.38414695858955383, + "learning_rate": 0.00019051456348535855, + "loss": 1.4907, + "step": 3665 + }, + { + "epoch": 0.047637997995591604, + "grad_norm": 0.2535100281238556, + "learning_rate": 0.00019051196402344715, + "loss": 1.2081, + "step": 3666 + }, + { + "epoch": 0.04765099253950748, + "grad_norm": 0.34876739978790283, + "learning_rate": 0.00019050936456153577, + "loss": 1.3364, + "step": 3667 + }, + { + "epoch": 0.04766398708342335, + "grad_norm": 0.3137843906879425, + "learning_rate": 0.0001905067650996244, + "loss": 1.1271, + "step": 3668 + }, + { + "epoch": 0.04767698162733922, + "grad_norm": 0.44447457790374756, + "learning_rate": 0.000190504165637713, + "loss": 1.4365, + "step": 3669 + }, + { + "epoch": 0.047689976171255095, + "grad_norm": 0.3810134828090668, + "learning_rate": 0.00019050156617580162, + "loss": 1.6645, + "step": 3670 + }, + { + "epoch": 0.04770297071517097, + "grad_norm": 0.42089030146598816, + "learning_rate": 0.00019049896671389022, + "loss": 1.4278, + "step": 3671 + }, + { + "epoch": 0.04771596525908684, + "grad_norm": 0.45074015855789185, + "learning_rate": 0.00019049636725197887, + "loss": 1.4899, + "step": 3672 + }, + { + "epoch": 0.047728959803002714, + "grad_norm": 0.41166314482688904, + "learning_rate": 0.00019049376779006747, + "loss": 1.6006, + "step": 3673 + }, + { + "epoch": 0.04774195434691859, + "grad_norm": 0.47895753383636475, + "learning_rate": 0.0001904911683281561, + "loss": 1.5603, + "step": 3674 + }, + { + "epoch": 0.04775494889083446, + "grad_norm": 0.36243027448654175, + "learning_rate": 0.0001904885688662447, + "loss": 1.5536, + "step": 3675 + }, + { + "epoch": 0.04776794343475033, + "grad_norm": 0.4041252136230469, + "learning_rate": 0.0001904859694043333, + "loss": 1.4372, + "step": 3676 + }, + { + "epoch": 0.047780937978666206, + "grad_norm": 0.3591209948062897, + "learning_rate": 0.00019048336994242194, + "loss": 1.3041, + "step": 3677 + }, + { + "epoch": 0.04779393252258208, + "grad_norm": 0.4013978838920593, + "learning_rate": 0.00019048077048051053, + "loss": 1.6379, + "step": 3678 + }, + { + "epoch": 0.04780692706649795, + "grad_norm": 0.39858371019363403, + "learning_rate": 0.00019047817101859916, + "loss": 1.4992, + "step": 3679 + }, + { + "epoch": 0.047819921610413825, + "grad_norm": 0.43754443526268005, + "learning_rate": 0.00019047557155668778, + "loss": 1.3867, + "step": 3680 + }, + { + "epoch": 0.0478329161543297, + "grad_norm": 0.37209850549697876, + "learning_rate": 0.00019047297209477638, + "loss": 1.3733, + "step": 3681 + }, + { + "epoch": 0.04784591069824557, + "grad_norm": 0.38896167278289795, + "learning_rate": 0.000190470372632865, + "loss": 1.5887, + "step": 3682 + }, + { + "epoch": 0.04785890524216145, + "grad_norm": 0.36297520995140076, + "learning_rate": 0.0001904677731709536, + "loss": 1.4573, + "step": 3683 + }, + { + "epoch": 0.047871899786077324, + "grad_norm": 0.2979295253753662, + "learning_rate": 0.00019046517370904225, + "loss": 1.3184, + "step": 3684 + }, + { + "epoch": 0.0478848943299932, + "grad_norm": 0.3434945344924927, + "learning_rate": 0.00019046257424713085, + "loss": 1.3364, + "step": 3685 + }, + { + "epoch": 0.04789788887390907, + "grad_norm": 0.4513181746006012, + "learning_rate": 0.00019045997478521948, + "loss": 1.5704, + "step": 3686 + }, + { + "epoch": 0.04791088341782494, + "grad_norm": 0.43385082483291626, + "learning_rate": 0.00019045737532330807, + "loss": 1.4051, + "step": 3687 + }, + { + "epoch": 0.047923877961740816, + "grad_norm": 0.391757607460022, + "learning_rate": 0.0001904547758613967, + "loss": 1.5644, + "step": 3688 + }, + { + "epoch": 0.04793687250565669, + "grad_norm": 0.3542194068431854, + "learning_rate": 0.00019045217639948532, + "loss": 1.5415, + "step": 3689 + }, + { + "epoch": 0.04794986704957256, + "grad_norm": 0.3492177426815033, + "learning_rate": 0.00019044957693757392, + "loss": 1.3601, + "step": 3690 + }, + { + "epoch": 0.047962861593488434, + "grad_norm": 0.3090565800666809, + "learning_rate": 0.00019044697747566257, + "loss": 1.3153, + "step": 3691 + }, + { + "epoch": 0.04797585613740431, + "grad_norm": 0.4005926549434662, + "learning_rate": 0.00019044437801375117, + "loss": 1.7639, + "step": 3692 + }, + { + "epoch": 0.04798885068132018, + "grad_norm": 0.691261887550354, + "learning_rate": 0.00019044177855183977, + "loss": 1.5533, + "step": 3693 + }, + { + "epoch": 0.04800184522523605, + "grad_norm": 0.4005010724067688, + "learning_rate": 0.0001904391790899284, + "loss": 1.4576, + "step": 3694 + }, + { + "epoch": 0.048014839769151926, + "grad_norm": 0.4017079472541809, + "learning_rate": 0.00019043657962801702, + "loss": 1.4879, + "step": 3695 + }, + { + "epoch": 0.0480278343130678, + "grad_norm": 0.39686429500579834, + "learning_rate": 0.00019043398016610564, + "loss": 1.4527, + "step": 3696 + }, + { + "epoch": 0.04804082885698367, + "grad_norm": 0.40051910281181335, + "learning_rate": 0.00019043138070419424, + "loss": 1.4086, + "step": 3697 + }, + { + "epoch": 0.048053823400899545, + "grad_norm": 0.3536207377910614, + "learning_rate": 0.00019042878124228286, + "loss": 1.2281, + "step": 3698 + }, + { + "epoch": 0.04806681794481542, + "grad_norm": 0.3526003658771515, + "learning_rate": 0.00019042618178037149, + "loss": 1.2182, + "step": 3699 + }, + { + "epoch": 0.04807981248873129, + "grad_norm": 0.37056267261505127, + "learning_rate": 0.00019042358231846008, + "loss": 1.463, + "step": 3700 + }, + { + "epoch": 0.048092807032647164, + "grad_norm": 0.33729904890060425, + "learning_rate": 0.0001904209828565487, + "loss": 1.4765, + "step": 3701 + }, + { + "epoch": 0.048105801576563044, + "grad_norm": 0.31484049558639526, + "learning_rate": 0.0001904183833946373, + "loss": 1.3468, + "step": 3702 + }, + { + "epoch": 0.04811879612047892, + "grad_norm": 0.6188753843307495, + "learning_rate": 0.00019041578393272596, + "loss": 1.4841, + "step": 3703 + }, + { + "epoch": 0.04813179066439479, + "grad_norm": 0.36766317486763, + "learning_rate": 0.00019041318447081455, + "loss": 1.4597, + "step": 3704 + }, + { + "epoch": 0.04814478520831066, + "grad_norm": 0.3956480026245117, + "learning_rate": 0.00019041058500890315, + "loss": 1.5674, + "step": 3705 + }, + { + "epoch": 0.048157779752226536, + "grad_norm": 0.466078519821167, + "learning_rate": 0.00019040798554699178, + "loss": 1.4406, + "step": 3706 + }, + { + "epoch": 0.04817077429614241, + "grad_norm": 0.4860243797302246, + "learning_rate": 0.0001904053860850804, + "loss": 1.4233, + "step": 3707 + }, + { + "epoch": 0.04818376884005828, + "grad_norm": 0.35091403126716614, + "learning_rate": 0.00019040278662316902, + "loss": 1.5212, + "step": 3708 + }, + { + "epoch": 0.048196763383974155, + "grad_norm": 0.4561355412006378, + "learning_rate": 0.00019040018716125762, + "loss": 1.4089, + "step": 3709 + }, + { + "epoch": 0.04820975792789003, + "grad_norm": 0.3972288966178894, + "learning_rate": 0.00019039758769934625, + "loss": 1.4688, + "step": 3710 + }, + { + "epoch": 0.0482227524718059, + "grad_norm": 0.35800743103027344, + "learning_rate": 0.00019039498823743487, + "loss": 1.3091, + "step": 3711 + }, + { + "epoch": 0.048235747015721774, + "grad_norm": 0.3681759834289551, + "learning_rate": 0.00019039238877552347, + "loss": 1.4073, + "step": 3712 + }, + { + "epoch": 0.048248741559637646, + "grad_norm": 0.3054344952106476, + "learning_rate": 0.0001903897893136121, + "loss": 1.3646, + "step": 3713 + }, + { + "epoch": 0.04826173610355352, + "grad_norm": 0.37973904609680176, + "learning_rate": 0.0001903871898517007, + "loss": 1.5879, + "step": 3714 + }, + { + "epoch": 0.04827473064746939, + "grad_norm": 0.46718481183052063, + "learning_rate": 0.00019038459038978934, + "loss": 1.5299, + "step": 3715 + }, + { + "epoch": 0.048287725191385265, + "grad_norm": 0.39083757996559143, + "learning_rate": 0.00019038199092787794, + "loss": 1.295, + "step": 3716 + }, + { + "epoch": 0.04830071973530114, + "grad_norm": 0.2630516588687897, + "learning_rate": 0.00019037939146596654, + "loss": 1.3067, + "step": 3717 + }, + { + "epoch": 0.04831371427921701, + "grad_norm": 0.3986029028892517, + "learning_rate": 0.00019037679200405516, + "loss": 1.4904, + "step": 3718 + }, + { + "epoch": 0.048326708823132884, + "grad_norm": 0.5132040977478027, + "learning_rate": 0.00019037419254214379, + "loss": 1.5321, + "step": 3719 + }, + { + "epoch": 0.04833970336704876, + "grad_norm": 0.37517425417900085, + "learning_rate": 0.0001903715930802324, + "loss": 1.4871, + "step": 3720 + }, + { + "epoch": 0.04835269791096464, + "grad_norm": 0.3649660646915436, + "learning_rate": 0.000190368993618321, + "loss": 1.1234, + "step": 3721 + }, + { + "epoch": 0.04836569245488051, + "grad_norm": 0.4588010907173157, + "learning_rate": 0.00019036639415640963, + "loss": 1.7344, + "step": 3722 + }, + { + "epoch": 0.04837868699879638, + "grad_norm": 0.3188712000846863, + "learning_rate": 0.00019036379469449826, + "loss": 1.519, + "step": 3723 + }, + { + "epoch": 0.048391681542712256, + "grad_norm": 0.451236367225647, + "learning_rate": 0.00019036119523258685, + "loss": 1.5545, + "step": 3724 + }, + { + "epoch": 0.04840467608662813, + "grad_norm": 0.38023120164871216, + "learning_rate": 0.00019035859577067548, + "loss": 1.513, + "step": 3725 + }, + { + "epoch": 0.048417670630544, + "grad_norm": 0.3073883056640625, + "learning_rate": 0.00019035599630876408, + "loss": 1.4846, + "step": 3726 + }, + { + "epoch": 0.048430665174459875, + "grad_norm": 0.32462555170059204, + "learning_rate": 0.00019035339684685273, + "loss": 1.5351, + "step": 3727 + }, + { + "epoch": 0.04844365971837575, + "grad_norm": 0.38581156730651855, + "learning_rate": 0.00019035079738494132, + "loss": 1.5359, + "step": 3728 + }, + { + "epoch": 0.04845665426229162, + "grad_norm": 0.46308737993240356, + "learning_rate": 0.00019034819792302995, + "loss": 1.471, + "step": 3729 + }, + { + "epoch": 0.048469648806207494, + "grad_norm": 0.49333426356315613, + "learning_rate": 0.00019034559846111857, + "loss": 1.4374, + "step": 3730 + }, + { + "epoch": 0.04848264335012337, + "grad_norm": 0.4720149338245392, + "learning_rate": 0.00019034299899920717, + "loss": 1.4151, + "step": 3731 + }, + { + "epoch": 0.04849563789403924, + "grad_norm": 0.43370410799980164, + "learning_rate": 0.0001903403995372958, + "loss": 1.4138, + "step": 3732 + }, + { + "epoch": 0.04850863243795511, + "grad_norm": 0.46720731258392334, + "learning_rate": 0.0001903378000753844, + "loss": 1.5979, + "step": 3733 + }, + { + "epoch": 0.048521626981870986, + "grad_norm": 0.3942004144191742, + "learning_rate": 0.00019033520061347302, + "loss": 1.8305, + "step": 3734 + }, + { + "epoch": 0.04853462152578686, + "grad_norm": 0.44630008935928345, + "learning_rate": 0.00019033260115156164, + "loss": 1.6237, + "step": 3735 + }, + { + "epoch": 0.04854761606970273, + "grad_norm": 0.38367313146591187, + "learning_rate": 0.00019033000168965024, + "loss": 1.3709, + "step": 3736 + }, + { + "epoch": 0.048560610613618604, + "grad_norm": 0.3858979642391205, + "learning_rate": 0.00019032740222773886, + "loss": 1.334, + "step": 3737 + }, + { + "epoch": 0.04857360515753448, + "grad_norm": 0.3400343358516693, + "learning_rate": 0.0001903248027658275, + "loss": 1.3174, + "step": 3738 + }, + { + "epoch": 0.04858659970145035, + "grad_norm": 0.39535218477249146, + "learning_rate": 0.0001903222033039161, + "loss": 1.2091, + "step": 3739 + }, + { + "epoch": 0.04859959424536623, + "grad_norm": 0.32496944069862366, + "learning_rate": 0.0001903196038420047, + "loss": 1.3763, + "step": 3740 + }, + { + "epoch": 0.0486125887892821, + "grad_norm": 0.3461853265762329, + "learning_rate": 0.00019031700438009333, + "loss": 1.4223, + "step": 3741 + }, + { + "epoch": 0.048625583333197976, + "grad_norm": 0.4377201795578003, + "learning_rate": 0.00019031440491818196, + "loss": 1.5245, + "step": 3742 + }, + { + "epoch": 0.04863857787711385, + "grad_norm": 0.4547326862812042, + "learning_rate": 0.00019031180545627056, + "loss": 1.6038, + "step": 3743 + }, + { + "epoch": 0.04865157242102972, + "grad_norm": 0.43616053462028503, + "learning_rate": 0.00019030920599435918, + "loss": 1.6193, + "step": 3744 + }, + { + "epoch": 0.048664566964945595, + "grad_norm": 0.3790490925312042, + "learning_rate": 0.00019030660653244778, + "loss": 1.6556, + "step": 3745 + }, + { + "epoch": 0.04867756150886147, + "grad_norm": 0.4055534303188324, + "learning_rate": 0.0001903040070705364, + "loss": 1.5272, + "step": 3746 + }, + { + "epoch": 0.04869055605277734, + "grad_norm": 0.5028093457221985, + "learning_rate": 0.00019030140760862503, + "loss": 1.4023, + "step": 3747 + }, + { + "epoch": 0.048703550596693214, + "grad_norm": 0.37545719742774963, + "learning_rate": 0.00019029880814671362, + "loss": 1.4415, + "step": 3748 + }, + { + "epoch": 0.04871654514060909, + "grad_norm": 0.41033586859703064, + "learning_rate": 0.00019029620868480225, + "loss": 1.4422, + "step": 3749 + }, + { + "epoch": 0.04872953968452496, + "grad_norm": 0.41760483384132385, + "learning_rate": 0.00019029360922289087, + "loss": 1.4391, + "step": 3750 + }, + { + "epoch": 0.04874253422844083, + "grad_norm": 0.4780189096927643, + "learning_rate": 0.0001902910097609795, + "loss": 1.3392, + "step": 3751 + }, + { + "epoch": 0.048755528772356706, + "grad_norm": 0.33102235198020935, + "learning_rate": 0.0001902884102990681, + "loss": 1.3838, + "step": 3752 + }, + { + "epoch": 0.04876852331627258, + "grad_norm": 0.3358710706233978, + "learning_rate": 0.00019028581083715672, + "loss": 1.5932, + "step": 3753 + }, + { + "epoch": 0.04878151786018845, + "grad_norm": 0.5055046677589417, + "learning_rate": 0.00019028321137524534, + "loss": 1.5045, + "step": 3754 + }, + { + "epoch": 0.048794512404104325, + "grad_norm": 0.34506577253341675, + "learning_rate": 0.00019028061191333394, + "loss": 1.4033, + "step": 3755 + }, + { + "epoch": 0.0488075069480202, + "grad_norm": 0.5171628594398499, + "learning_rate": 0.00019027801245142257, + "loss": 1.3833, + "step": 3756 + }, + { + "epoch": 0.04882050149193607, + "grad_norm": 0.4783114492893219, + "learning_rate": 0.00019027541298951116, + "loss": 1.457, + "step": 3757 + }, + { + "epoch": 0.048833496035851943, + "grad_norm": 0.32838794589042664, + "learning_rate": 0.00019027281352759982, + "loss": 1.2866, + "step": 3758 + }, + { + "epoch": 0.04884649057976782, + "grad_norm": 0.405598521232605, + "learning_rate": 0.0001902702140656884, + "loss": 1.4387, + "step": 3759 + }, + { + "epoch": 0.048859485123683696, + "grad_norm": 0.3459217846393585, + "learning_rate": 0.000190267614603777, + "loss": 1.4465, + "step": 3760 + }, + { + "epoch": 0.04887247966759957, + "grad_norm": 0.42128950357437134, + "learning_rate": 0.00019026501514186563, + "loss": 1.6357, + "step": 3761 + }, + { + "epoch": 0.04888547421151544, + "grad_norm": 0.44771215319633484, + "learning_rate": 0.00019026241567995426, + "loss": 1.5112, + "step": 3762 + }, + { + "epoch": 0.048898468755431315, + "grad_norm": 0.32228025794029236, + "learning_rate": 0.00019025981621804288, + "loss": 1.3816, + "step": 3763 + }, + { + "epoch": 0.04891146329934719, + "grad_norm": 0.38464412093162537, + "learning_rate": 0.00019025721675613148, + "loss": 1.4335, + "step": 3764 + }, + { + "epoch": 0.04892445784326306, + "grad_norm": 0.38852474093437195, + "learning_rate": 0.0001902546172942201, + "loss": 1.3744, + "step": 3765 + }, + { + "epoch": 0.048937452387178934, + "grad_norm": 0.46775123476982117, + "learning_rate": 0.00019025201783230873, + "loss": 1.5269, + "step": 3766 + }, + { + "epoch": 0.04895044693109481, + "grad_norm": 0.3528216779232025, + "learning_rate": 0.00019024941837039733, + "loss": 1.4835, + "step": 3767 + }, + { + "epoch": 0.04896344147501068, + "grad_norm": 0.4044293165206909, + "learning_rate": 0.00019024681890848595, + "loss": 1.322, + "step": 3768 + }, + { + "epoch": 0.04897643601892655, + "grad_norm": 0.3173816204071045, + "learning_rate": 0.00019024421944657458, + "loss": 1.3836, + "step": 3769 + }, + { + "epoch": 0.048989430562842426, + "grad_norm": 0.34366855025291443, + "learning_rate": 0.0001902416199846632, + "loss": 1.3584, + "step": 3770 + }, + { + "epoch": 0.0490024251067583, + "grad_norm": 0.5185137987136841, + "learning_rate": 0.0001902390205227518, + "loss": 1.7088, + "step": 3771 + }, + { + "epoch": 0.04901541965067417, + "grad_norm": 0.41273248195648193, + "learning_rate": 0.0001902364210608404, + "loss": 1.2926, + "step": 3772 + }, + { + "epoch": 0.049028414194590045, + "grad_norm": 0.39725032448768616, + "learning_rate": 0.00019023382159892905, + "loss": 1.6967, + "step": 3773 + }, + { + "epoch": 0.04904140873850592, + "grad_norm": 0.3438766896724701, + "learning_rate": 0.00019023122213701764, + "loss": 1.4632, + "step": 3774 + }, + { + "epoch": 0.04905440328242179, + "grad_norm": 0.48754197359085083, + "learning_rate": 0.00019022862267510627, + "loss": 1.3423, + "step": 3775 + }, + { + "epoch": 0.049067397826337664, + "grad_norm": 0.44422802329063416, + "learning_rate": 0.00019022602321319487, + "loss": 1.2952, + "step": 3776 + }, + { + "epoch": 0.04908039237025354, + "grad_norm": 0.40650635957717896, + "learning_rate": 0.0001902234237512835, + "loss": 1.4863, + "step": 3777 + }, + { + "epoch": 0.049093386914169417, + "grad_norm": 0.3226718604564667, + "learning_rate": 0.00019022082428937212, + "loss": 1.3477, + "step": 3778 + }, + { + "epoch": 0.04910638145808529, + "grad_norm": 0.3460264801979065, + "learning_rate": 0.0001902182248274607, + "loss": 1.3279, + "step": 3779 + }, + { + "epoch": 0.04911937600200116, + "grad_norm": 0.3752950429916382, + "learning_rate": 0.00019021562536554934, + "loss": 1.473, + "step": 3780 + }, + { + "epoch": 0.049132370545917035, + "grad_norm": 0.42022013664245605, + "learning_rate": 0.00019021302590363796, + "loss": 1.5363, + "step": 3781 + }, + { + "epoch": 0.04914536508983291, + "grad_norm": 0.3701004683971405, + "learning_rate": 0.00019021042644172659, + "loss": 1.6591, + "step": 3782 + }, + { + "epoch": 0.04915835963374878, + "grad_norm": 0.4206155836582184, + "learning_rate": 0.00019020782697981518, + "loss": 1.4595, + "step": 3783 + }, + { + "epoch": 0.049171354177664654, + "grad_norm": 0.37528252601623535, + "learning_rate": 0.0001902052275179038, + "loss": 1.5484, + "step": 3784 + }, + { + "epoch": 0.04918434872158053, + "grad_norm": 0.4147874414920807, + "learning_rate": 0.00019020262805599243, + "loss": 1.4868, + "step": 3785 + }, + { + "epoch": 0.0491973432654964, + "grad_norm": 0.30775102972984314, + "learning_rate": 0.00019020002859408103, + "loss": 1.4969, + "step": 3786 + }, + { + "epoch": 0.04921033780941227, + "grad_norm": 0.33989468216896057, + "learning_rate": 0.00019019742913216965, + "loss": 1.3719, + "step": 3787 + }, + { + "epoch": 0.049223332353328146, + "grad_norm": 0.3552214801311493, + "learning_rate": 0.00019019482967025825, + "loss": 1.4006, + "step": 3788 + }, + { + "epoch": 0.04923632689724402, + "grad_norm": 0.3304178714752197, + "learning_rate": 0.00019019223020834688, + "loss": 1.422, + "step": 3789 + }, + { + "epoch": 0.04924932144115989, + "grad_norm": 0.3558228015899658, + "learning_rate": 0.0001901896307464355, + "loss": 1.6192, + "step": 3790 + }, + { + "epoch": 0.049262315985075765, + "grad_norm": 0.4193790555000305, + "learning_rate": 0.0001901870312845241, + "loss": 1.4656, + "step": 3791 + }, + { + "epoch": 0.04927531052899164, + "grad_norm": 0.3567087650299072, + "learning_rate": 0.00019018443182261272, + "loss": 1.2557, + "step": 3792 + }, + { + "epoch": 0.04928830507290751, + "grad_norm": 0.4742031395435333, + "learning_rate": 0.00019018183236070135, + "loss": 1.5403, + "step": 3793 + }, + { + "epoch": 0.049301299616823384, + "grad_norm": 0.4044789969921112, + "learning_rate": 0.00019017923289878997, + "loss": 1.3158, + "step": 3794 + }, + { + "epoch": 0.04931429416073926, + "grad_norm": 0.37397974729537964, + "learning_rate": 0.00019017663343687857, + "loss": 1.3066, + "step": 3795 + }, + { + "epoch": 0.04932728870465513, + "grad_norm": 0.39239782094955444, + "learning_rate": 0.0001901740339749672, + "loss": 1.4822, + "step": 3796 + }, + { + "epoch": 0.04934028324857101, + "grad_norm": 0.2867686152458191, + "learning_rate": 0.00019017143451305582, + "loss": 1.1689, + "step": 3797 + }, + { + "epoch": 0.04935327779248688, + "grad_norm": 0.4478296637535095, + "learning_rate": 0.00019016883505114442, + "loss": 1.4884, + "step": 3798 + }, + { + "epoch": 0.049366272336402756, + "grad_norm": 0.4071597456932068, + "learning_rate": 0.00019016623558923304, + "loss": 1.4253, + "step": 3799 + }, + { + "epoch": 0.04937926688031863, + "grad_norm": 0.3840247392654419, + "learning_rate": 0.00019016363612732164, + "loss": 1.4365, + "step": 3800 + }, + { + "epoch": 0.0493922614242345, + "grad_norm": 0.35399726033210754, + "learning_rate": 0.00019016103666541026, + "loss": 1.5576, + "step": 3801 + }, + { + "epoch": 0.049405255968150374, + "grad_norm": 0.45631012320518494, + "learning_rate": 0.00019015843720349889, + "loss": 1.5579, + "step": 3802 + }, + { + "epoch": 0.04941825051206625, + "grad_norm": 0.3735182583332062, + "learning_rate": 0.00019015583774158748, + "loss": 1.3921, + "step": 3803 + }, + { + "epoch": 0.04943124505598212, + "grad_norm": 0.3003441095352173, + "learning_rate": 0.00019015323827967614, + "loss": 1.4741, + "step": 3804 + }, + { + "epoch": 0.04944423959989799, + "grad_norm": 0.5309226512908936, + "learning_rate": 0.00019015063881776473, + "loss": 1.499, + "step": 3805 + }, + { + "epoch": 0.049457234143813866, + "grad_norm": 0.32520508766174316, + "learning_rate": 0.00019014803935585336, + "loss": 1.5014, + "step": 3806 + }, + { + "epoch": 0.04947022868772974, + "grad_norm": 0.435548335313797, + "learning_rate": 0.00019014543989394195, + "loss": 1.4792, + "step": 3807 + }, + { + "epoch": 0.04948322323164561, + "grad_norm": 0.46274563670158386, + "learning_rate": 0.00019014284043203058, + "loss": 1.6184, + "step": 3808 + }, + { + "epoch": 0.049496217775561485, + "grad_norm": 0.36255529522895813, + "learning_rate": 0.0001901402409701192, + "loss": 1.3842, + "step": 3809 + }, + { + "epoch": 0.04950921231947736, + "grad_norm": 0.3969224989414215, + "learning_rate": 0.0001901376415082078, + "loss": 1.582, + "step": 3810 + }, + { + "epoch": 0.04952220686339323, + "grad_norm": 0.4023350179195404, + "learning_rate": 0.00019013504204629643, + "loss": 1.5543, + "step": 3811 + }, + { + "epoch": 0.049535201407309104, + "grad_norm": 0.3140353560447693, + "learning_rate": 0.00019013244258438505, + "loss": 1.3916, + "step": 3812 + }, + { + "epoch": 0.04954819595122498, + "grad_norm": 0.29110926389694214, + "learning_rate": 0.00019012984312247367, + "loss": 1.471, + "step": 3813 + }, + { + "epoch": 0.04956119049514085, + "grad_norm": 0.41599124670028687, + "learning_rate": 0.00019012724366056227, + "loss": 1.4365, + "step": 3814 + }, + { + "epoch": 0.04957418503905672, + "grad_norm": 0.367975652217865, + "learning_rate": 0.00019012464419865087, + "loss": 1.5376, + "step": 3815 + }, + { + "epoch": 0.0495871795829726, + "grad_norm": 0.3864184021949768, + "learning_rate": 0.00019012204473673952, + "loss": 1.5685, + "step": 3816 + }, + { + "epoch": 0.049600174126888476, + "grad_norm": 0.399983286857605, + "learning_rate": 0.00019011944527482812, + "loss": 1.4027, + "step": 3817 + }, + { + "epoch": 0.04961316867080435, + "grad_norm": 0.3606490194797516, + "learning_rate": 0.00019011684581291674, + "loss": 1.3922, + "step": 3818 + }, + { + "epoch": 0.04962616321472022, + "grad_norm": 0.293867290019989, + "learning_rate": 0.00019011424635100534, + "loss": 1.3493, + "step": 3819 + }, + { + "epoch": 0.049639157758636095, + "grad_norm": 0.3390860855579376, + "learning_rate": 0.00019011164688909396, + "loss": 1.4754, + "step": 3820 + }, + { + "epoch": 0.04965215230255197, + "grad_norm": 0.5028812289237976, + "learning_rate": 0.0001901090474271826, + "loss": 1.4913, + "step": 3821 + }, + { + "epoch": 0.04966514684646784, + "grad_norm": 0.4193522036075592, + "learning_rate": 0.00019010644796527119, + "loss": 1.5433, + "step": 3822 + }, + { + "epoch": 0.049678141390383714, + "grad_norm": 0.4153229594230652, + "learning_rate": 0.0001901038485033598, + "loss": 1.5895, + "step": 3823 + }, + { + "epoch": 0.049691135934299586, + "grad_norm": 0.4533325433731079, + "learning_rate": 0.00019010124904144844, + "loss": 1.4521, + "step": 3824 + }, + { + "epoch": 0.04970413047821546, + "grad_norm": 0.38161057233810425, + "learning_rate": 0.00019009864957953706, + "loss": 1.3993, + "step": 3825 + }, + { + "epoch": 0.04971712502213133, + "grad_norm": 0.3211041986942291, + "learning_rate": 0.00019009605011762566, + "loss": 1.6267, + "step": 3826 + }, + { + "epoch": 0.049730119566047205, + "grad_norm": 0.26964321732521057, + "learning_rate": 0.00019009345065571425, + "loss": 1.2429, + "step": 3827 + }, + { + "epoch": 0.04974311410996308, + "grad_norm": 0.35638460516929626, + "learning_rate": 0.0001900908511938029, + "loss": 1.6416, + "step": 3828 + }, + { + "epoch": 0.04975610865387895, + "grad_norm": 0.3692905008792877, + "learning_rate": 0.0001900882517318915, + "loss": 1.4969, + "step": 3829 + }, + { + "epoch": 0.049769103197794824, + "grad_norm": 0.307699978351593, + "learning_rate": 0.00019008565226998013, + "loss": 1.2532, + "step": 3830 + }, + { + "epoch": 0.0497820977417107, + "grad_norm": 0.3573067784309387, + "learning_rate": 0.00019008305280806873, + "loss": 1.6465, + "step": 3831 + }, + { + "epoch": 0.04979509228562657, + "grad_norm": 0.3379407227039337, + "learning_rate": 0.00019008045334615735, + "loss": 1.28, + "step": 3832 + }, + { + "epoch": 0.04980808682954244, + "grad_norm": 0.37374475598335266, + "learning_rate": 0.00019007785388424597, + "loss": 1.3597, + "step": 3833 + }, + { + "epoch": 0.049821081373458316, + "grad_norm": 0.3123195171356201, + "learning_rate": 0.00019007525442233457, + "loss": 1.3405, + "step": 3834 + }, + { + "epoch": 0.049834075917374196, + "grad_norm": 0.3965378999710083, + "learning_rate": 0.0001900726549604232, + "loss": 1.351, + "step": 3835 + }, + { + "epoch": 0.04984707046129007, + "grad_norm": 0.48493969440460205, + "learning_rate": 0.00019007005549851182, + "loss": 1.742, + "step": 3836 + }, + { + "epoch": 0.04986006500520594, + "grad_norm": 0.4489361643791199, + "learning_rate": 0.00019006745603660044, + "loss": 1.5255, + "step": 3837 + }, + { + "epoch": 0.049873059549121815, + "grad_norm": 0.39542123675346375, + "learning_rate": 0.00019006485657468904, + "loss": 1.6101, + "step": 3838 + }, + { + "epoch": 0.04988605409303769, + "grad_norm": 0.3340208828449249, + "learning_rate": 0.00019006225711277767, + "loss": 1.2449, + "step": 3839 + }, + { + "epoch": 0.04989904863695356, + "grad_norm": 0.3478074371814728, + "learning_rate": 0.0001900596576508663, + "loss": 1.5239, + "step": 3840 + }, + { + "epoch": 0.049912043180869434, + "grad_norm": 0.37966689467430115, + "learning_rate": 0.0001900570581889549, + "loss": 1.4362, + "step": 3841 + }, + { + "epoch": 0.04992503772478531, + "grad_norm": 0.39213261008262634, + "learning_rate": 0.0001900544587270435, + "loss": 1.4932, + "step": 3842 + }, + { + "epoch": 0.04993803226870118, + "grad_norm": 0.4360811412334442, + "learning_rate": 0.00019005185926513214, + "loss": 1.5797, + "step": 3843 + }, + { + "epoch": 0.04995102681261705, + "grad_norm": 0.40591880679130554, + "learning_rate": 0.00019004925980322074, + "loss": 1.3499, + "step": 3844 + }, + { + "epoch": 0.049964021356532926, + "grad_norm": 0.42182737588882446, + "learning_rate": 0.00019004666034130936, + "loss": 1.5688, + "step": 3845 + }, + { + "epoch": 0.0499770159004488, + "grad_norm": 0.44136422872543335, + "learning_rate": 0.00019004406087939796, + "loss": 1.4248, + "step": 3846 + }, + { + "epoch": 0.04999001044436467, + "grad_norm": 0.4252159595489502, + "learning_rate": 0.0001900414614174866, + "loss": 1.5371, + "step": 3847 + }, + { + "epoch": 0.050003004988280544, + "grad_norm": 0.406872421503067, + "learning_rate": 0.0001900388619555752, + "loss": 1.6734, + "step": 3848 + }, + { + "epoch": 0.05001599953219642, + "grad_norm": 0.3588224947452545, + "learning_rate": 0.00019003626249366383, + "loss": 1.3372, + "step": 3849 + }, + { + "epoch": 0.05002899407611229, + "grad_norm": 0.25963282585144043, + "learning_rate": 0.00019003366303175243, + "loss": 1.2794, + "step": 3850 + }, + { + "epoch": 0.05004198862002816, + "grad_norm": 0.4401414394378662, + "learning_rate": 0.00019003106356984105, + "loss": 1.4461, + "step": 3851 + }, + { + "epoch": 0.050054983163944036, + "grad_norm": 0.43792974948883057, + "learning_rate": 0.00019002846410792968, + "loss": 1.581, + "step": 3852 + }, + { + "epoch": 0.05006797770785991, + "grad_norm": 0.36800432205200195, + "learning_rate": 0.00019002586464601827, + "loss": 1.3583, + "step": 3853 + }, + { + "epoch": 0.05008097225177579, + "grad_norm": 0.4142528772354126, + "learning_rate": 0.0001900232651841069, + "loss": 1.3109, + "step": 3854 + }, + { + "epoch": 0.05009396679569166, + "grad_norm": 0.32176101207733154, + "learning_rate": 0.00019002066572219552, + "loss": 1.3163, + "step": 3855 + }, + { + "epoch": 0.050106961339607535, + "grad_norm": 0.41339778900146484, + "learning_rate": 0.00019001806626028412, + "loss": 1.4049, + "step": 3856 + }, + { + "epoch": 0.05011995588352341, + "grad_norm": 0.404988557100296, + "learning_rate": 0.00019001546679837274, + "loss": 1.423, + "step": 3857 + }, + { + "epoch": 0.05013295042743928, + "grad_norm": 0.4195566177368164, + "learning_rate": 0.00019001286733646134, + "loss": 1.4085, + "step": 3858 + }, + { + "epoch": 0.050145944971355154, + "grad_norm": 0.39707931876182556, + "learning_rate": 0.00019001026787455, + "loss": 1.22, + "step": 3859 + }, + { + "epoch": 0.05015893951527103, + "grad_norm": 0.4275943636894226, + "learning_rate": 0.0001900076684126386, + "loss": 1.4051, + "step": 3860 + }, + { + "epoch": 0.0501719340591869, + "grad_norm": 0.31092458963394165, + "learning_rate": 0.00019000506895072722, + "loss": 1.0827, + "step": 3861 + }, + { + "epoch": 0.05018492860310277, + "grad_norm": 0.4529190957546234, + "learning_rate": 0.0001900024694888158, + "loss": 1.442, + "step": 3862 + }, + { + "epoch": 0.050197923147018646, + "grad_norm": 0.49008408188819885, + "learning_rate": 0.00018999987002690444, + "loss": 1.4841, + "step": 3863 + }, + { + "epoch": 0.05021091769093452, + "grad_norm": 0.348882257938385, + "learning_rate": 0.00018999727056499306, + "loss": 1.3866, + "step": 3864 + }, + { + "epoch": 0.05022391223485039, + "grad_norm": 0.44368821382522583, + "learning_rate": 0.00018999467110308166, + "loss": 1.4542, + "step": 3865 + }, + { + "epoch": 0.050236906778766265, + "grad_norm": 0.41098013520240784, + "learning_rate": 0.00018999207164117028, + "loss": 1.4977, + "step": 3866 + }, + { + "epoch": 0.05024990132268214, + "grad_norm": 0.4766836166381836, + "learning_rate": 0.0001899894721792589, + "loss": 1.2225, + "step": 3867 + }, + { + "epoch": 0.05026289586659801, + "grad_norm": 0.4762088656425476, + "learning_rate": 0.0001899868727173475, + "loss": 1.4545, + "step": 3868 + }, + { + "epoch": 0.05027589041051388, + "grad_norm": 0.20205679535865784, + "learning_rate": 0.00018998427325543613, + "loss": 1.4255, + "step": 3869 + }, + { + "epoch": 0.050288884954429756, + "grad_norm": 0.3192203938961029, + "learning_rate": 0.00018998167379352473, + "loss": 1.4206, + "step": 3870 + }, + { + "epoch": 0.05030187949834563, + "grad_norm": 0.3639497756958008, + "learning_rate": 0.00018997907433161338, + "loss": 1.4292, + "step": 3871 + }, + { + "epoch": 0.0503148740422615, + "grad_norm": 0.35752227902412415, + "learning_rate": 0.00018997647486970198, + "loss": 1.3998, + "step": 3872 + }, + { + "epoch": 0.05032786858617738, + "grad_norm": 0.43642255663871765, + "learning_rate": 0.0001899738754077906, + "loss": 1.5194, + "step": 3873 + }, + { + "epoch": 0.050340863130093255, + "grad_norm": 0.4110788106918335, + "learning_rate": 0.0001899712759458792, + "loss": 1.375, + "step": 3874 + }, + { + "epoch": 0.05035385767400913, + "grad_norm": 0.4246370792388916, + "learning_rate": 0.00018996867648396782, + "loss": 1.4195, + "step": 3875 + }, + { + "epoch": 0.050366852217925, + "grad_norm": 0.43559038639068604, + "learning_rate": 0.00018996607702205645, + "loss": 1.3269, + "step": 3876 + }, + { + "epoch": 0.050379846761840874, + "grad_norm": 0.2696673572063446, + "learning_rate": 0.00018996347756014504, + "loss": 1.2111, + "step": 3877 + }, + { + "epoch": 0.05039284130575675, + "grad_norm": 0.34256792068481445, + "learning_rate": 0.0001899608780982337, + "loss": 1.4508, + "step": 3878 + }, + { + "epoch": 0.05040583584967262, + "grad_norm": 0.3418741226196289, + "learning_rate": 0.0001899582786363223, + "loss": 1.2265, + "step": 3879 + }, + { + "epoch": 0.05041883039358849, + "grad_norm": 0.4165349304676056, + "learning_rate": 0.00018995567917441092, + "loss": 1.4036, + "step": 3880 + }, + { + "epoch": 0.050431824937504366, + "grad_norm": 0.3854735791683197, + "learning_rate": 0.00018995307971249952, + "loss": 1.2993, + "step": 3881 + }, + { + "epoch": 0.05044481948142024, + "grad_norm": 0.3382399082183838, + "learning_rate": 0.00018995048025058814, + "loss": 1.5128, + "step": 3882 + }, + { + "epoch": 0.05045781402533611, + "grad_norm": 0.4080575108528137, + "learning_rate": 0.00018994788078867676, + "loss": 1.5723, + "step": 3883 + }, + { + "epoch": 0.050470808569251985, + "grad_norm": 0.3790281116962433, + "learning_rate": 0.00018994528132676536, + "loss": 1.4365, + "step": 3884 + }, + { + "epoch": 0.05048380311316786, + "grad_norm": 0.3492949306964874, + "learning_rate": 0.000189942681864854, + "loss": 1.3486, + "step": 3885 + }, + { + "epoch": 0.05049679765708373, + "grad_norm": 0.40551507472991943, + "learning_rate": 0.0001899400824029426, + "loss": 1.4734, + "step": 3886 + }, + { + "epoch": 0.050509792200999604, + "grad_norm": 0.2826780676841736, + "learning_rate": 0.0001899374829410312, + "loss": 1.4101, + "step": 3887 + }, + { + "epoch": 0.05052278674491548, + "grad_norm": 0.3535975217819214, + "learning_rate": 0.00018993488347911983, + "loss": 1.3429, + "step": 3888 + }, + { + "epoch": 0.05053578128883135, + "grad_norm": 0.34710127115249634, + "learning_rate": 0.00018993228401720843, + "loss": 1.5799, + "step": 3889 + }, + { + "epoch": 0.05054877583274722, + "grad_norm": 0.3437153697013855, + "learning_rate": 0.00018992968455529708, + "loss": 1.3689, + "step": 3890 + }, + { + "epoch": 0.050561770376663095, + "grad_norm": 0.32699424028396606, + "learning_rate": 0.00018992708509338568, + "loss": 1.2133, + "step": 3891 + }, + { + "epoch": 0.05057476492057897, + "grad_norm": 0.2864934206008911, + "learning_rate": 0.0001899244856314743, + "loss": 1.3931, + "step": 3892 + }, + { + "epoch": 0.05058775946449485, + "grad_norm": 0.3372269570827484, + "learning_rate": 0.0001899218861695629, + "loss": 1.5607, + "step": 3893 + }, + { + "epoch": 0.05060075400841072, + "grad_norm": 0.3295058012008667, + "learning_rate": 0.00018991928670765153, + "loss": 1.6334, + "step": 3894 + }, + { + "epoch": 0.050613748552326594, + "grad_norm": 0.45339012145996094, + "learning_rate": 0.00018991668724574015, + "loss": 1.4967, + "step": 3895 + }, + { + "epoch": 0.05062674309624247, + "grad_norm": 0.47987493872642517, + "learning_rate": 0.00018991408778382875, + "loss": 1.6288, + "step": 3896 + }, + { + "epoch": 0.05063973764015834, + "grad_norm": 0.3702436089515686, + "learning_rate": 0.00018991148832191737, + "loss": 1.6368, + "step": 3897 + }, + { + "epoch": 0.05065273218407421, + "grad_norm": 0.3788699209690094, + "learning_rate": 0.000189908888860006, + "loss": 1.2819, + "step": 3898 + }, + { + "epoch": 0.050665726727990086, + "grad_norm": 0.39456719160079956, + "learning_rate": 0.0001899062893980946, + "loss": 1.4102, + "step": 3899 + }, + { + "epoch": 0.05067872127190596, + "grad_norm": 0.49345386028289795, + "learning_rate": 0.00018990368993618322, + "loss": 1.5114, + "step": 3900 + }, + { + "epoch": 0.05069171581582183, + "grad_norm": 0.30331170558929443, + "learning_rate": 0.00018990109047427182, + "loss": 1.419, + "step": 3901 + }, + { + "epoch": 0.050704710359737705, + "grad_norm": 0.3756665885448456, + "learning_rate": 0.00018989849101236047, + "loss": 1.3318, + "step": 3902 + }, + { + "epoch": 0.05071770490365358, + "grad_norm": 0.3780282735824585, + "learning_rate": 0.00018989589155044906, + "loss": 1.2521, + "step": 3903 + }, + { + "epoch": 0.05073069944756945, + "grad_norm": 0.3055044412612915, + "learning_rate": 0.0001898932920885377, + "loss": 1.4419, + "step": 3904 + }, + { + "epoch": 0.050743693991485324, + "grad_norm": 0.3280501365661621, + "learning_rate": 0.0001898906926266263, + "loss": 1.3271, + "step": 3905 + }, + { + "epoch": 0.0507566885354012, + "grad_norm": 0.5151320099830627, + "learning_rate": 0.0001898880931647149, + "loss": 1.6363, + "step": 3906 + }, + { + "epoch": 0.05076968307931707, + "grad_norm": 0.4122999310493469, + "learning_rate": 0.00018988549370280354, + "loss": 1.5251, + "step": 3907 + }, + { + "epoch": 0.05078267762323294, + "grad_norm": 0.3506038188934326, + "learning_rate": 0.00018988289424089213, + "loss": 1.4699, + "step": 3908 + }, + { + "epoch": 0.050795672167148816, + "grad_norm": 0.3838462829589844, + "learning_rate": 0.00018988029477898076, + "loss": 1.4741, + "step": 3909 + }, + { + "epoch": 0.05080866671106469, + "grad_norm": 0.36845508217811584, + "learning_rate": 0.00018987769531706938, + "loss": 1.498, + "step": 3910 + }, + { + "epoch": 0.05082166125498056, + "grad_norm": 0.4354405105113983, + "learning_rate": 0.00018987509585515798, + "loss": 1.2742, + "step": 3911 + }, + { + "epoch": 0.05083465579889644, + "grad_norm": 0.37164774537086487, + "learning_rate": 0.0001898724963932466, + "loss": 1.5356, + "step": 3912 + }, + { + "epoch": 0.050847650342812314, + "grad_norm": 0.40080708265304565, + "learning_rate": 0.0001898698969313352, + "loss": 1.3931, + "step": 3913 + }, + { + "epoch": 0.05086064488672819, + "grad_norm": 0.33808889985084534, + "learning_rate": 0.00018986729746942385, + "loss": 1.6375, + "step": 3914 + }, + { + "epoch": 0.05087363943064406, + "grad_norm": 0.4188748896121979, + "learning_rate": 0.00018986469800751245, + "loss": 1.3959, + "step": 3915 + }, + { + "epoch": 0.05088663397455993, + "grad_norm": 0.39286088943481445, + "learning_rate": 0.00018986209854560107, + "loss": 1.5286, + "step": 3916 + }, + { + "epoch": 0.050899628518475806, + "grad_norm": 0.3928526043891907, + "learning_rate": 0.0001898594990836897, + "loss": 1.4883, + "step": 3917 + }, + { + "epoch": 0.05091262306239168, + "grad_norm": 0.31225576996803284, + "learning_rate": 0.0001898568996217783, + "loss": 1.3666, + "step": 3918 + }, + { + "epoch": 0.05092561760630755, + "grad_norm": 0.35683757066726685, + "learning_rate": 0.00018985430015986692, + "loss": 1.397, + "step": 3919 + }, + { + "epoch": 0.050938612150223425, + "grad_norm": 0.4413927495479584, + "learning_rate": 0.00018985170069795552, + "loss": 1.607, + "step": 3920 + }, + { + "epoch": 0.0509516066941393, + "grad_norm": 0.37743958830833435, + "learning_rate": 0.00018984910123604417, + "loss": 1.3102, + "step": 3921 + }, + { + "epoch": 0.05096460123805517, + "grad_norm": 0.3739142417907715, + "learning_rate": 0.00018984650177413277, + "loss": 1.3585, + "step": 3922 + }, + { + "epoch": 0.050977595781971044, + "grad_norm": 0.397958904504776, + "learning_rate": 0.00018984390231222136, + "loss": 1.3426, + "step": 3923 + }, + { + "epoch": 0.05099059032588692, + "grad_norm": 0.4387616217136383, + "learning_rate": 0.00018984130285031, + "loss": 1.6153, + "step": 3924 + }, + { + "epoch": 0.05100358486980279, + "grad_norm": 0.4138893485069275, + "learning_rate": 0.0001898387033883986, + "loss": 1.4823, + "step": 3925 + }, + { + "epoch": 0.05101657941371866, + "grad_norm": 0.4145199656486511, + "learning_rate": 0.00018983610392648724, + "loss": 1.4494, + "step": 3926 + }, + { + "epoch": 0.051029573957634536, + "grad_norm": 0.39016634225845337, + "learning_rate": 0.00018983350446457584, + "loss": 1.456, + "step": 3927 + }, + { + "epoch": 0.05104256850155041, + "grad_norm": 0.40434640645980835, + "learning_rate": 0.00018983090500266446, + "loss": 1.4514, + "step": 3928 + }, + { + "epoch": 0.05105556304546628, + "grad_norm": 0.2926351726055145, + "learning_rate": 0.00018982830554075308, + "loss": 1.4219, + "step": 3929 + }, + { + "epoch": 0.051068557589382155, + "grad_norm": 0.3985045850276947, + "learning_rate": 0.00018982570607884168, + "loss": 1.5569, + "step": 3930 + }, + { + "epoch": 0.051081552133298035, + "grad_norm": 0.35467809438705444, + "learning_rate": 0.0001898231066169303, + "loss": 1.513, + "step": 3931 + }, + { + "epoch": 0.05109454667721391, + "grad_norm": 0.3070789575576782, + "learning_rate": 0.0001898205071550189, + "loss": 1.5246, + "step": 3932 + }, + { + "epoch": 0.05110754122112978, + "grad_norm": 0.40189459919929504, + "learning_rate": 0.00018981790769310756, + "loss": 1.4707, + "step": 3933 + }, + { + "epoch": 0.051120535765045653, + "grad_norm": 0.41328689455986023, + "learning_rate": 0.00018981530823119615, + "loss": 1.4124, + "step": 3934 + }, + { + "epoch": 0.051133530308961526, + "grad_norm": 0.37467867136001587, + "learning_rate": 0.00018981270876928478, + "loss": 1.475, + "step": 3935 + }, + { + "epoch": 0.0511465248528774, + "grad_norm": 0.39252179861068726, + "learning_rate": 0.00018981010930737337, + "loss": 1.564, + "step": 3936 + }, + { + "epoch": 0.05115951939679327, + "grad_norm": 0.40778642892837524, + "learning_rate": 0.000189807509845462, + "loss": 1.539, + "step": 3937 + }, + { + "epoch": 0.051172513940709145, + "grad_norm": 0.33827200531959534, + "learning_rate": 0.00018980491038355062, + "loss": 1.302, + "step": 3938 + }, + { + "epoch": 0.05118550848462502, + "grad_norm": 0.3841506838798523, + "learning_rate": 0.00018980231092163922, + "loss": 1.5352, + "step": 3939 + }, + { + "epoch": 0.05119850302854089, + "grad_norm": 0.4162614345550537, + "learning_rate": 0.00018979971145972785, + "loss": 1.4781, + "step": 3940 + }, + { + "epoch": 0.051211497572456764, + "grad_norm": 0.38057389855384827, + "learning_rate": 0.00018979711199781647, + "loss": 1.3907, + "step": 3941 + }, + { + "epoch": 0.05122449211637264, + "grad_norm": 0.4606248140335083, + "learning_rate": 0.00018979451253590507, + "loss": 1.7181, + "step": 3942 + }, + { + "epoch": 0.05123748666028851, + "grad_norm": 0.35811400413513184, + "learning_rate": 0.0001897919130739937, + "loss": 1.3935, + "step": 3943 + }, + { + "epoch": 0.05125048120420438, + "grad_norm": 0.34928786754608154, + "learning_rate": 0.0001897893136120823, + "loss": 1.5874, + "step": 3944 + }, + { + "epoch": 0.051263475748120256, + "grad_norm": 0.6878196001052856, + "learning_rate": 0.00018978671415017094, + "loss": 1.2778, + "step": 3945 + }, + { + "epoch": 0.05127647029203613, + "grad_norm": 0.42672884464263916, + "learning_rate": 0.00018978411468825954, + "loss": 1.5405, + "step": 3946 + }, + { + "epoch": 0.051289464835952, + "grad_norm": 0.4493511915206909, + "learning_rate": 0.00018978151522634816, + "loss": 1.4918, + "step": 3947 + }, + { + "epoch": 0.051302459379867875, + "grad_norm": 0.42197975516319275, + "learning_rate": 0.00018977891576443676, + "loss": 1.473, + "step": 3948 + }, + { + "epoch": 0.05131545392378375, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.00018977631630252538, + "loss": 1.4323, + "step": 3949 + }, + { + "epoch": 0.05132844846769963, + "grad_norm": 0.33630824089050293, + "learning_rate": 0.000189773716840614, + "loss": 1.3864, + "step": 3950 + }, + { + "epoch": 0.0513414430116155, + "grad_norm": 0.4269065260887146, + "learning_rate": 0.0001897711173787026, + "loss": 1.4342, + "step": 3951 + }, + { + "epoch": 0.051354437555531374, + "grad_norm": 0.3277847170829773, + "learning_rate": 0.00018976851791679123, + "loss": 1.3853, + "step": 3952 + }, + { + "epoch": 0.05136743209944725, + "grad_norm": 0.4418373703956604, + "learning_rate": 0.00018976591845487986, + "loss": 1.4092, + "step": 3953 + }, + { + "epoch": 0.05138042664336312, + "grad_norm": 0.4023517370223999, + "learning_rate": 0.00018976331899296845, + "loss": 1.5127, + "step": 3954 + }, + { + "epoch": 0.05139342118727899, + "grad_norm": 0.4355026185512543, + "learning_rate": 0.00018976071953105708, + "loss": 1.4716, + "step": 3955 + }, + { + "epoch": 0.051406415731194866, + "grad_norm": 0.3902953863143921, + "learning_rate": 0.0001897581200691457, + "loss": 1.4001, + "step": 3956 + }, + { + "epoch": 0.05141941027511074, + "grad_norm": 0.40391770005226135, + "learning_rate": 0.00018975552060723433, + "loss": 1.5625, + "step": 3957 + }, + { + "epoch": 0.05143240481902661, + "grad_norm": 0.45951566100120544, + "learning_rate": 0.00018975292114532292, + "loss": 1.478, + "step": 3958 + }, + { + "epoch": 0.051445399362942484, + "grad_norm": 0.42912647128105164, + "learning_rate": 0.00018975032168341155, + "loss": 1.6006, + "step": 3959 + }, + { + "epoch": 0.05145839390685836, + "grad_norm": 0.45635467767715454, + "learning_rate": 0.00018974772222150017, + "loss": 1.4409, + "step": 3960 + }, + { + "epoch": 0.05147138845077423, + "grad_norm": 0.35998058319091797, + "learning_rate": 0.00018974512275958877, + "loss": 1.5093, + "step": 3961 + }, + { + "epoch": 0.0514843829946901, + "grad_norm": 0.3792210817337036, + "learning_rate": 0.0001897425232976774, + "loss": 1.4528, + "step": 3962 + }, + { + "epoch": 0.051497377538605976, + "grad_norm": 0.4066675007343292, + "learning_rate": 0.000189739923835766, + "loss": 1.7153, + "step": 3963 + }, + { + "epoch": 0.05151037208252185, + "grad_norm": 0.41340136528015137, + "learning_rate": 0.00018973732437385464, + "loss": 1.4482, + "step": 3964 + }, + { + "epoch": 0.05152336662643772, + "grad_norm": 0.38980579376220703, + "learning_rate": 0.00018973472491194324, + "loss": 1.4207, + "step": 3965 + }, + { + "epoch": 0.051536361170353595, + "grad_norm": 0.36849039793014526, + "learning_rate": 0.00018973212545003184, + "loss": 1.3176, + "step": 3966 + }, + { + "epoch": 0.05154935571426947, + "grad_norm": 0.39238834381103516, + "learning_rate": 0.00018972952598812046, + "loss": 1.5189, + "step": 3967 + }, + { + "epoch": 0.05156235025818534, + "grad_norm": 0.33392074704170227, + "learning_rate": 0.0001897269265262091, + "loss": 1.5565, + "step": 3968 + }, + { + "epoch": 0.05157534480210122, + "grad_norm": 0.44543009996414185, + "learning_rate": 0.0001897243270642977, + "loss": 1.3318, + "step": 3969 + }, + { + "epoch": 0.051588339346017094, + "grad_norm": 0.45427942276000977, + "learning_rate": 0.0001897217276023863, + "loss": 1.3999, + "step": 3970 + }, + { + "epoch": 0.05160133388993297, + "grad_norm": 0.3277691602706909, + "learning_rate": 0.00018971912814047493, + "loss": 1.4067, + "step": 3971 + }, + { + "epoch": 0.05161432843384884, + "grad_norm": 0.2940301299095154, + "learning_rate": 0.00018971652867856356, + "loss": 1.3412, + "step": 3972 + }, + { + "epoch": 0.05162732297776471, + "grad_norm": 0.5000894665718079, + "learning_rate": 0.00018971392921665216, + "loss": 1.541, + "step": 3973 + }, + { + "epoch": 0.051640317521680586, + "grad_norm": 0.36390557885169983, + "learning_rate": 0.00018971132975474078, + "loss": 1.3446, + "step": 3974 + }, + { + "epoch": 0.05165331206559646, + "grad_norm": 0.3943788707256317, + "learning_rate": 0.00018970873029282938, + "loss": 1.4573, + "step": 3975 + }, + { + "epoch": 0.05166630660951233, + "grad_norm": 0.29128462076187134, + "learning_rate": 0.00018970613083091803, + "loss": 1.4002, + "step": 3976 + }, + { + "epoch": 0.051679301153428205, + "grad_norm": 0.39258241653442383, + "learning_rate": 0.00018970353136900663, + "loss": 1.4483, + "step": 3977 + }, + { + "epoch": 0.05169229569734408, + "grad_norm": 0.44540131092071533, + "learning_rate": 0.00018970093190709522, + "loss": 1.3354, + "step": 3978 + }, + { + "epoch": 0.05170529024125995, + "grad_norm": 0.291391521692276, + "learning_rate": 0.00018969833244518385, + "loss": 1.3594, + "step": 3979 + }, + { + "epoch": 0.05171828478517582, + "grad_norm": 0.401706725358963, + "learning_rate": 0.00018969573298327247, + "loss": 1.3292, + "step": 3980 + }, + { + "epoch": 0.051731279329091696, + "grad_norm": 0.3497712314128876, + "learning_rate": 0.0001896931335213611, + "loss": 1.5336, + "step": 3981 + }, + { + "epoch": 0.05174427387300757, + "grad_norm": 0.3641929626464844, + "learning_rate": 0.0001896905340594497, + "loss": 1.3193, + "step": 3982 + }, + { + "epoch": 0.05175726841692344, + "grad_norm": 0.3684643507003784, + "learning_rate": 0.00018968793459753832, + "loss": 1.5871, + "step": 3983 + }, + { + "epoch": 0.051770262960839315, + "grad_norm": 0.3841712772846222, + "learning_rate": 0.00018968533513562694, + "loss": 1.3227, + "step": 3984 + }, + { + "epoch": 0.05178325750475519, + "grad_norm": 0.319903165102005, + "learning_rate": 0.00018968273567371554, + "loss": 1.3485, + "step": 3985 + }, + { + "epoch": 0.05179625204867106, + "grad_norm": 0.428154319524765, + "learning_rate": 0.00018968013621180416, + "loss": 1.2606, + "step": 3986 + }, + { + "epoch": 0.051809246592586934, + "grad_norm": 0.3619686961174011, + "learning_rate": 0.00018967753674989276, + "loss": 1.5088, + "step": 3987 + }, + { + "epoch": 0.051822241136502814, + "grad_norm": 0.2955183982849121, + "learning_rate": 0.00018967493728798141, + "loss": 1.3389, + "step": 3988 + }, + { + "epoch": 0.05183523568041869, + "grad_norm": 0.2890011966228485, + "learning_rate": 0.00018967233782607, + "loss": 1.2731, + "step": 3989 + }, + { + "epoch": 0.05184823022433456, + "grad_norm": 0.4698973000049591, + "learning_rate": 0.00018966973836415864, + "loss": 1.6323, + "step": 3990 + }, + { + "epoch": 0.05186122476825043, + "grad_norm": 0.45719438791275024, + "learning_rate": 0.00018966713890224726, + "loss": 1.5064, + "step": 3991 + }, + { + "epoch": 0.051874219312166306, + "grad_norm": 0.40210840106010437, + "learning_rate": 0.00018966453944033586, + "loss": 1.4794, + "step": 3992 + }, + { + "epoch": 0.05188721385608218, + "grad_norm": 0.42772597074508667, + "learning_rate": 0.00018966193997842448, + "loss": 1.252, + "step": 3993 + }, + { + "epoch": 0.05190020839999805, + "grad_norm": 0.46020588278770447, + "learning_rate": 0.00018965934051651308, + "loss": 1.6665, + "step": 3994 + }, + { + "epoch": 0.051913202943913925, + "grad_norm": 0.400939017534256, + "learning_rate": 0.0001896567410546017, + "loss": 1.6223, + "step": 3995 + }, + { + "epoch": 0.0519261974878298, + "grad_norm": 0.3720465898513794, + "learning_rate": 0.00018965414159269033, + "loss": 1.3402, + "step": 3996 + }, + { + "epoch": 0.05193919203174567, + "grad_norm": 0.26941317319869995, + "learning_rate": 0.00018965154213077893, + "loss": 1.2729, + "step": 3997 + }, + { + "epoch": 0.051952186575661544, + "grad_norm": 0.24724151194095612, + "learning_rate": 0.00018964894266886755, + "loss": 1.2738, + "step": 3998 + }, + { + "epoch": 0.05196518111957742, + "grad_norm": 0.4975112974643707, + "learning_rate": 0.00018964634320695617, + "loss": 1.5769, + "step": 3999 + }, + { + "epoch": 0.05197817566349329, + "grad_norm": 0.45040571689605713, + "learning_rate": 0.0001896437437450448, + "loss": 1.4282, + "step": 4000 + }, + { + "epoch": 0.05199117020740916, + "grad_norm": 0.47491106390953064, + "learning_rate": 0.0001896411442831334, + "loss": 1.3699, + "step": 4001 + }, + { + "epoch": 0.052004164751325035, + "grad_norm": 0.3222854435443878, + "learning_rate": 0.00018963854482122202, + "loss": 1.2931, + "step": 4002 + }, + { + "epoch": 0.05201715929524091, + "grad_norm": 0.3565429747104645, + "learning_rate": 0.00018963594535931065, + "loss": 1.5645, + "step": 4003 + }, + { + "epoch": 0.05203015383915678, + "grad_norm": 0.4117390811443329, + "learning_rate": 0.00018963334589739924, + "loss": 1.4915, + "step": 4004 + }, + { + "epoch": 0.052043148383072654, + "grad_norm": 0.48919737339019775, + "learning_rate": 0.00018963074643548787, + "loss": 1.5253, + "step": 4005 + }, + { + "epoch": 0.05205614292698853, + "grad_norm": 0.2919909358024597, + "learning_rate": 0.00018962814697357646, + "loss": 1.4976, + "step": 4006 + }, + { + "epoch": 0.05206913747090441, + "grad_norm": 0.4114655554294586, + "learning_rate": 0.0001896255475116651, + "loss": 1.6495, + "step": 4007 + }, + { + "epoch": 0.05208213201482028, + "grad_norm": 0.38783255219459534, + "learning_rate": 0.00018962294804975371, + "loss": 1.5461, + "step": 4008 + }, + { + "epoch": 0.05209512655873615, + "grad_norm": 0.36005041003227234, + "learning_rate": 0.0001896203485878423, + "loss": 1.5542, + "step": 4009 + }, + { + "epoch": 0.052108121102652026, + "grad_norm": 0.31175991892814636, + "learning_rate": 0.00018961774912593094, + "loss": 1.531, + "step": 4010 + }, + { + "epoch": 0.0521211156465679, + "grad_norm": 0.336824506521225, + "learning_rate": 0.00018961514966401956, + "loss": 1.5141, + "step": 4011 + }, + { + "epoch": 0.05213411019048377, + "grad_norm": 0.41131842136383057, + "learning_rate": 0.00018961255020210818, + "loss": 1.4053, + "step": 4012 + }, + { + "epoch": 0.052147104734399645, + "grad_norm": 0.45289406180381775, + "learning_rate": 0.00018960995074019678, + "loss": 1.3242, + "step": 4013 + }, + { + "epoch": 0.05216009927831552, + "grad_norm": 0.3515275716781616, + "learning_rate": 0.0001896073512782854, + "loss": 1.4834, + "step": 4014 + }, + { + "epoch": 0.05217309382223139, + "grad_norm": 0.3492477536201477, + "learning_rate": 0.00018960475181637403, + "loss": 1.3464, + "step": 4015 + }, + { + "epoch": 0.052186088366147264, + "grad_norm": 0.3573140799999237, + "learning_rate": 0.00018960215235446263, + "loss": 1.3785, + "step": 4016 + }, + { + "epoch": 0.05219908291006314, + "grad_norm": 0.4475495219230652, + "learning_rate": 0.00018959955289255125, + "loss": 1.6641, + "step": 4017 + }, + { + "epoch": 0.05221207745397901, + "grad_norm": 0.29575610160827637, + "learning_rate": 0.00018959695343063985, + "loss": 1.485, + "step": 4018 + }, + { + "epoch": 0.05222507199789488, + "grad_norm": 0.35003504157066345, + "learning_rate": 0.0001895943539687285, + "loss": 1.3035, + "step": 4019 + }, + { + "epoch": 0.052238066541810756, + "grad_norm": 0.4605973958969116, + "learning_rate": 0.0001895917545068171, + "loss": 1.5717, + "step": 4020 + }, + { + "epoch": 0.05225106108572663, + "grad_norm": 0.41698935627937317, + "learning_rate": 0.0001895891550449057, + "loss": 1.6018, + "step": 4021 + }, + { + "epoch": 0.0522640556296425, + "grad_norm": 0.3836056590080261, + "learning_rate": 0.00018958655558299432, + "loss": 1.5877, + "step": 4022 + }, + { + "epoch": 0.052277050173558375, + "grad_norm": 0.4108414947986603, + "learning_rate": 0.00018958395612108295, + "loss": 1.599, + "step": 4023 + }, + { + "epoch": 0.05229004471747425, + "grad_norm": 0.3855954110622406, + "learning_rate": 0.00018958135665917157, + "loss": 1.4276, + "step": 4024 + }, + { + "epoch": 0.05230303926139012, + "grad_norm": 0.35487011075019836, + "learning_rate": 0.00018957875719726017, + "loss": 1.4927, + "step": 4025 + }, + { + "epoch": 0.052316033805306, + "grad_norm": 0.3558792769908905, + "learning_rate": 0.0001895761577353488, + "loss": 1.2971, + "step": 4026 + }, + { + "epoch": 0.05232902834922187, + "grad_norm": 0.2872849404811859, + "learning_rate": 0.00018957355827343742, + "loss": 1.3233, + "step": 4027 + }, + { + "epoch": 0.052342022893137746, + "grad_norm": 0.3171432614326477, + "learning_rate": 0.00018957095881152601, + "loss": 1.394, + "step": 4028 + }, + { + "epoch": 0.05235501743705362, + "grad_norm": 0.40315118432044983, + "learning_rate": 0.00018956835934961464, + "loss": 1.5473, + "step": 4029 + }, + { + "epoch": 0.05236801198096949, + "grad_norm": 0.44339045882225037, + "learning_rate": 0.00018956575988770326, + "loss": 1.5307, + "step": 4030 + }, + { + "epoch": 0.052381006524885365, + "grad_norm": 0.37378010153770447, + "learning_rate": 0.0001895631604257919, + "loss": 1.4751, + "step": 4031 + }, + { + "epoch": 0.05239400106880124, + "grad_norm": 0.4320535659790039, + "learning_rate": 0.00018956056096388048, + "loss": 1.3061, + "step": 4032 + }, + { + "epoch": 0.05240699561271711, + "grad_norm": 0.2966800332069397, + "learning_rate": 0.00018955796150196908, + "loss": 1.2925, + "step": 4033 + }, + { + "epoch": 0.052419990156632984, + "grad_norm": 0.39737069606781006, + "learning_rate": 0.00018955536204005773, + "loss": 1.5461, + "step": 4034 + }, + { + "epoch": 0.05243298470054886, + "grad_norm": 0.39521902799606323, + "learning_rate": 0.00018955276257814633, + "loss": 1.5108, + "step": 4035 + }, + { + "epoch": 0.05244597924446473, + "grad_norm": 0.438451886177063, + "learning_rate": 0.00018955016311623496, + "loss": 1.2581, + "step": 4036 + }, + { + "epoch": 0.0524589737883806, + "grad_norm": 0.3777187466621399, + "learning_rate": 0.00018954756365432355, + "loss": 1.3956, + "step": 4037 + }, + { + "epoch": 0.052471968332296476, + "grad_norm": 0.5382497310638428, + "learning_rate": 0.00018954496419241218, + "loss": 1.4657, + "step": 4038 + }, + { + "epoch": 0.05248496287621235, + "grad_norm": 0.38454824686050415, + "learning_rate": 0.0001895423647305008, + "loss": 1.3285, + "step": 4039 + }, + { + "epoch": 0.05249795742012822, + "grad_norm": 0.361906498670578, + "learning_rate": 0.0001895397652685894, + "loss": 1.4909, + "step": 4040 + }, + { + "epoch": 0.052510951964044095, + "grad_norm": 0.3523094654083252, + "learning_rate": 0.00018953716580667802, + "loss": 1.4241, + "step": 4041 + }, + { + "epoch": 0.05252394650795997, + "grad_norm": 0.3995682895183563, + "learning_rate": 0.00018953456634476665, + "loss": 1.5484, + "step": 4042 + }, + { + "epoch": 0.05253694105187584, + "grad_norm": 0.4963608980178833, + "learning_rate": 0.00018953196688285527, + "loss": 1.5385, + "step": 4043 + }, + { + "epoch": 0.052549935595791714, + "grad_norm": 0.4471227824687958, + "learning_rate": 0.00018952936742094387, + "loss": 1.519, + "step": 4044 + }, + { + "epoch": 0.052562930139707593, + "grad_norm": 0.3839107155799866, + "learning_rate": 0.00018952676795903247, + "loss": 1.2874, + "step": 4045 + }, + { + "epoch": 0.052575924683623466, + "grad_norm": 0.33835527300834656, + "learning_rate": 0.00018952416849712112, + "loss": 1.5024, + "step": 4046 + }, + { + "epoch": 0.05258891922753934, + "grad_norm": 0.41741132736206055, + "learning_rate": 0.00018952156903520972, + "loss": 1.5582, + "step": 4047 + }, + { + "epoch": 0.05260191377145521, + "grad_norm": 0.47670289874076843, + "learning_rate": 0.00018951896957329834, + "loss": 1.6401, + "step": 4048 + }, + { + "epoch": 0.052614908315371085, + "grad_norm": 0.3288404643535614, + "learning_rate": 0.00018951637011138694, + "loss": 1.3547, + "step": 4049 + }, + { + "epoch": 0.05262790285928696, + "grad_norm": 1.391832709312439, + "learning_rate": 0.00018951377064947556, + "loss": 1.3922, + "step": 4050 + }, + { + "epoch": 0.05264089740320283, + "grad_norm": 0.4483433663845062, + "learning_rate": 0.0001895111711875642, + "loss": 1.2929, + "step": 4051 + }, + { + "epoch": 0.052653891947118704, + "grad_norm": 0.4345998466014862, + "learning_rate": 0.00018950857172565278, + "loss": 1.3383, + "step": 4052 + }, + { + "epoch": 0.05266688649103458, + "grad_norm": 0.39746445417404175, + "learning_rate": 0.0001895059722637414, + "loss": 1.4204, + "step": 4053 + }, + { + "epoch": 0.05267988103495045, + "grad_norm": 0.387657105922699, + "learning_rate": 0.00018950337280183003, + "loss": 1.3751, + "step": 4054 + }, + { + "epoch": 0.05269287557886632, + "grad_norm": 0.38720956444740295, + "learning_rate": 0.00018950077333991866, + "loss": 1.4092, + "step": 4055 + }, + { + "epoch": 0.052705870122782196, + "grad_norm": 0.4390738010406494, + "learning_rate": 0.00018949817387800726, + "loss": 1.5691, + "step": 4056 + }, + { + "epoch": 0.05271886466669807, + "grad_norm": 0.47112616896629333, + "learning_rate": 0.00018949557441609588, + "loss": 1.5116, + "step": 4057 + }, + { + "epoch": 0.05273185921061394, + "grad_norm": 0.2595531642436981, + "learning_rate": 0.0001894929749541845, + "loss": 1.3734, + "step": 4058 + }, + { + "epoch": 0.052744853754529815, + "grad_norm": 0.3311459720134735, + "learning_rate": 0.0001894903754922731, + "loss": 1.4041, + "step": 4059 + }, + { + "epoch": 0.05275784829844569, + "grad_norm": 0.5230029225349426, + "learning_rate": 0.00018948777603036173, + "loss": 1.5171, + "step": 4060 + }, + { + "epoch": 0.05277084284236156, + "grad_norm": 0.3773336112499237, + "learning_rate": 0.00018948517656845032, + "loss": 1.2762, + "step": 4061 + }, + { + "epoch": 0.052783837386277434, + "grad_norm": 0.3544774651527405, + "learning_rate": 0.00018948257710653895, + "loss": 1.3548, + "step": 4062 + }, + { + "epoch": 0.05279683193019331, + "grad_norm": 0.4684385061264038, + "learning_rate": 0.00018947997764462757, + "loss": 1.5289, + "step": 4063 + }, + { + "epoch": 0.05280982647410919, + "grad_norm": 0.5378061532974243, + "learning_rate": 0.00018947737818271617, + "loss": 1.602, + "step": 4064 + }, + { + "epoch": 0.05282282101802506, + "grad_norm": 0.435875803232193, + "learning_rate": 0.00018947477872080482, + "loss": 1.5745, + "step": 4065 + }, + { + "epoch": 0.05283581556194093, + "grad_norm": 0.40728333592414856, + "learning_rate": 0.00018947217925889342, + "loss": 1.3974, + "step": 4066 + }, + { + "epoch": 0.052848810105856805, + "grad_norm": 0.32386717200279236, + "learning_rate": 0.00018946957979698204, + "loss": 1.5727, + "step": 4067 + }, + { + "epoch": 0.05286180464977268, + "grad_norm": 0.3410390615463257, + "learning_rate": 0.00018946698033507064, + "loss": 1.5648, + "step": 4068 + }, + { + "epoch": 0.05287479919368855, + "grad_norm": 0.2897005081176758, + "learning_rate": 0.00018946438087315927, + "loss": 1.3153, + "step": 4069 + }, + { + "epoch": 0.052887793737604424, + "grad_norm": 0.4447380602359772, + "learning_rate": 0.0001894617814112479, + "loss": 1.4752, + "step": 4070 + }, + { + "epoch": 0.0529007882815203, + "grad_norm": 0.4160134494304657, + "learning_rate": 0.0001894591819493365, + "loss": 1.3817, + "step": 4071 + }, + { + "epoch": 0.05291378282543617, + "grad_norm": 0.4001814126968384, + "learning_rate": 0.0001894565824874251, + "loss": 1.4828, + "step": 4072 + }, + { + "epoch": 0.05292677736935204, + "grad_norm": 0.3617863953113556, + "learning_rate": 0.00018945398302551374, + "loss": 1.3612, + "step": 4073 + }, + { + "epoch": 0.052939771913267916, + "grad_norm": 0.3365032374858856, + "learning_rate": 0.00018945138356360233, + "loss": 1.3973, + "step": 4074 + }, + { + "epoch": 0.05295276645718379, + "grad_norm": 0.38645651936531067, + "learning_rate": 0.00018944878410169096, + "loss": 1.2994, + "step": 4075 + }, + { + "epoch": 0.05296576100109966, + "grad_norm": 0.42820626497268677, + "learning_rate": 0.00018944618463977956, + "loss": 1.4455, + "step": 4076 + }, + { + "epoch": 0.052978755545015535, + "grad_norm": 0.3983556926250458, + "learning_rate": 0.0001894435851778682, + "loss": 1.5465, + "step": 4077 + }, + { + "epoch": 0.05299175008893141, + "grad_norm": 0.4455036520957947, + "learning_rate": 0.0001894409857159568, + "loss": 1.5994, + "step": 4078 + }, + { + "epoch": 0.05300474463284728, + "grad_norm": 0.4224860668182373, + "learning_rate": 0.00018943838625404543, + "loss": 1.4456, + "step": 4079 + }, + { + "epoch": 0.053017739176763154, + "grad_norm": 0.38385462760925293, + "learning_rate": 0.00018943578679213403, + "loss": 1.2557, + "step": 4080 + }, + { + "epoch": 0.05303073372067903, + "grad_norm": 0.3887278139591217, + "learning_rate": 0.00018943318733022265, + "loss": 1.4912, + "step": 4081 + }, + { + "epoch": 0.0530437282645949, + "grad_norm": 0.296936959028244, + "learning_rate": 0.00018943058786831128, + "loss": 1.3998, + "step": 4082 + }, + { + "epoch": 0.05305672280851078, + "grad_norm": 0.3960835635662079, + "learning_rate": 0.00018942798840639987, + "loss": 1.3654, + "step": 4083 + }, + { + "epoch": 0.05306971735242665, + "grad_norm": 0.29843807220458984, + "learning_rate": 0.0001894253889444885, + "loss": 1.3795, + "step": 4084 + }, + { + "epoch": 0.053082711896342526, + "grad_norm": 0.4079810380935669, + "learning_rate": 0.00018942278948257712, + "loss": 1.3624, + "step": 4085 + }, + { + "epoch": 0.0530957064402584, + "grad_norm": 0.4206112027168274, + "learning_rate": 0.00018942019002066575, + "loss": 1.3792, + "step": 4086 + }, + { + "epoch": 0.05310870098417427, + "grad_norm": 0.3746127784252167, + "learning_rate": 0.00018941759055875434, + "loss": 1.578, + "step": 4087 + }, + { + "epoch": 0.053121695528090145, + "grad_norm": 0.3708513081073761, + "learning_rate": 0.00018941499109684294, + "loss": 1.5625, + "step": 4088 + }, + { + "epoch": 0.05313469007200602, + "grad_norm": 0.3918353021144867, + "learning_rate": 0.0001894123916349316, + "loss": 1.4684, + "step": 4089 + }, + { + "epoch": 0.05314768461592189, + "grad_norm": 0.3920225203037262, + "learning_rate": 0.0001894097921730202, + "loss": 1.2668, + "step": 4090 + }, + { + "epoch": 0.05316067915983776, + "grad_norm": 0.4823001027107239, + "learning_rate": 0.00018940719271110881, + "loss": 1.5541, + "step": 4091 + }, + { + "epoch": 0.053173673703753636, + "grad_norm": 0.35504278540611267, + "learning_rate": 0.0001894045932491974, + "loss": 1.4308, + "step": 4092 + }, + { + "epoch": 0.05318666824766951, + "grad_norm": 0.4522918462753296, + "learning_rate": 0.00018940199378728604, + "loss": 1.5973, + "step": 4093 + }, + { + "epoch": 0.05319966279158538, + "grad_norm": 0.4165440797805786, + "learning_rate": 0.00018939939432537466, + "loss": 1.4632, + "step": 4094 + }, + { + "epoch": 0.053212657335501255, + "grad_norm": 0.5297533273696899, + "learning_rate": 0.00018939679486346326, + "loss": 1.5272, + "step": 4095 + }, + { + "epoch": 0.05322565187941713, + "grad_norm": 0.38570156693458557, + "learning_rate": 0.00018939419540155188, + "loss": 1.2312, + "step": 4096 + }, + { + "epoch": 0.053238646423333, + "grad_norm": 0.3549193739891052, + "learning_rate": 0.0001893915959396405, + "loss": 1.2436, + "step": 4097 + }, + { + "epoch": 0.053251640967248874, + "grad_norm": 0.4733002483844757, + "learning_rate": 0.00018938899647772913, + "loss": 1.5235, + "step": 4098 + }, + { + "epoch": 0.05326463551116475, + "grad_norm": 0.3299548327922821, + "learning_rate": 0.00018938639701581773, + "loss": 1.4887, + "step": 4099 + }, + { + "epoch": 0.05327763005508062, + "grad_norm": 0.37195706367492676, + "learning_rate": 0.00018938379755390635, + "loss": 1.4369, + "step": 4100 + }, + { + "epoch": 0.05329062459899649, + "grad_norm": 0.4118349254131317, + "learning_rate": 0.00018938119809199498, + "loss": 1.4259, + "step": 4101 + }, + { + "epoch": 0.05330361914291237, + "grad_norm": 0.4443964660167694, + "learning_rate": 0.00018937859863008358, + "loss": 1.5751, + "step": 4102 + }, + { + "epoch": 0.053316613686828246, + "grad_norm": 0.19295981526374817, + "learning_rate": 0.0001893759991681722, + "loss": 1.2777, + "step": 4103 + }, + { + "epoch": 0.05332960823074412, + "grad_norm": 0.3431297540664673, + "learning_rate": 0.00018937339970626082, + "loss": 1.4713, + "step": 4104 + }, + { + "epoch": 0.05334260277465999, + "grad_norm": 0.3754962682723999, + "learning_rate": 0.00018937080024434942, + "loss": 1.5343, + "step": 4105 + }, + { + "epoch": 0.053355597318575865, + "grad_norm": 0.41893696784973145, + "learning_rate": 0.00018936820078243805, + "loss": 1.3261, + "step": 4106 + }, + { + "epoch": 0.05336859186249174, + "grad_norm": 0.3288760781288147, + "learning_rate": 0.00018936560132052664, + "loss": 1.4207, + "step": 4107 + }, + { + "epoch": 0.05338158640640761, + "grad_norm": 0.5319237112998962, + "learning_rate": 0.0001893630018586153, + "loss": 1.5312, + "step": 4108 + }, + { + "epoch": 0.053394580950323484, + "grad_norm": 0.42209652066230774, + "learning_rate": 0.0001893604023967039, + "loss": 1.5887, + "step": 4109 + }, + { + "epoch": 0.05340757549423936, + "grad_norm": 0.4267975389957428, + "learning_rate": 0.00018935780293479252, + "loss": 1.578, + "step": 4110 + }, + { + "epoch": 0.05342057003815523, + "grad_norm": 0.39445337653160095, + "learning_rate": 0.00018935520347288111, + "loss": 1.3847, + "step": 4111 + }, + { + "epoch": 0.0534335645820711, + "grad_norm": 0.41110867261886597, + "learning_rate": 0.00018935260401096974, + "loss": 1.5021, + "step": 4112 + }, + { + "epoch": 0.053446559125986975, + "grad_norm": 0.5017188787460327, + "learning_rate": 0.00018935000454905836, + "loss": 1.5875, + "step": 4113 + }, + { + "epoch": 0.05345955366990285, + "grad_norm": 0.365742951631546, + "learning_rate": 0.00018934740508714696, + "loss": 1.4253, + "step": 4114 + }, + { + "epoch": 0.05347254821381872, + "grad_norm": 0.37313637137413025, + "learning_rate": 0.00018934480562523559, + "loss": 1.5807, + "step": 4115 + }, + { + "epoch": 0.053485542757734594, + "grad_norm": 0.32133063673973083, + "learning_rate": 0.0001893422061633242, + "loss": 1.3378, + "step": 4116 + }, + { + "epoch": 0.05349853730165047, + "grad_norm": 0.36928263306617737, + "learning_rate": 0.0001893396067014128, + "loss": 1.361, + "step": 4117 + }, + { + "epoch": 0.05351153184556634, + "grad_norm": 0.3204761743545532, + "learning_rate": 0.00018933700723950143, + "loss": 1.5236, + "step": 4118 + }, + { + "epoch": 0.05352452638948221, + "grad_norm": 0.34568074345588684, + "learning_rate": 0.00018933440777759003, + "loss": 1.4459, + "step": 4119 + }, + { + "epoch": 0.053537520933398086, + "grad_norm": 0.4119112193584442, + "learning_rate": 0.00018933180831567868, + "loss": 1.426, + "step": 4120 + }, + { + "epoch": 0.053550515477313966, + "grad_norm": 0.32223182916641235, + "learning_rate": 0.00018932920885376728, + "loss": 1.3968, + "step": 4121 + }, + { + "epoch": 0.05356351002122984, + "grad_norm": 0.37836843729019165, + "learning_rate": 0.0001893266093918559, + "loss": 1.4545, + "step": 4122 + }, + { + "epoch": 0.05357650456514571, + "grad_norm": 0.3146507740020752, + "learning_rate": 0.0001893240099299445, + "loss": 1.1039, + "step": 4123 + }, + { + "epoch": 0.053589499109061585, + "grad_norm": 0.39077362418174744, + "learning_rate": 0.00018932141046803312, + "loss": 1.3596, + "step": 4124 + }, + { + "epoch": 0.05360249365297746, + "grad_norm": 0.44414275884628296, + "learning_rate": 0.00018931881100612175, + "loss": 1.3353, + "step": 4125 + }, + { + "epoch": 0.05361548819689333, + "grad_norm": 0.35662269592285156, + "learning_rate": 0.00018931621154421035, + "loss": 1.3744, + "step": 4126 + }, + { + "epoch": 0.053628482740809204, + "grad_norm": 0.3510833978652954, + "learning_rate": 0.00018931361208229897, + "loss": 1.4567, + "step": 4127 + }, + { + "epoch": 0.05364147728472508, + "grad_norm": 0.4733801484107971, + "learning_rate": 0.0001893110126203876, + "loss": 1.571, + "step": 4128 + }, + { + "epoch": 0.05365447182864095, + "grad_norm": 0.46760106086730957, + "learning_rate": 0.0001893084131584762, + "loss": 1.4941, + "step": 4129 + }, + { + "epoch": 0.05366746637255682, + "grad_norm": 0.3219572603702545, + "learning_rate": 0.00018930581369656482, + "loss": 1.4493, + "step": 4130 + }, + { + "epoch": 0.053680460916472696, + "grad_norm": 0.32618248462677, + "learning_rate": 0.00018930321423465341, + "loss": 1.3629, + "step": 4131 + }, + { + "epoch": 0.05369345546038857, + "grad_norm": 0.28745201230049133, + "learning_rate": 0.00018930061477274207, + "loss": 1.1966, + "step": 4132 + }, + { + "epoch": 0.05370645000430444, + "grad_norm": 0.41781213879585266, + "learning_rate": 0.00018929801531083066, + "loss": 1.4458, + "step": 4133 + }, + { + "epoch": 0.053719444548220315, + "grad_norm": 0.42739737033843994, + "learning_rate": 0.0001892954158489193, + "loss": 1.4283, + "step": 4134 + }, + { + "epoch": 0.05373243909213619, + "grad_norm": 0.33842942118644714, + "learning_rate": 0.00018929281638700788, + "loss": 1.5339, + "step": 4135 + }, + { + "epoch": 0.05374543363605206, + "grad_norm": 0.31668564677238464, + "learning_rate": 0.0001892902169250965, + "loss": 1.5259, + "step": 4136 + }, + { + "epoch": 0.05375842817996793, + "grad_norm": 0.44656166434288025, + "learning_rate": 0.00018928761746318513, + "loss": 1.4575, + "step": 4137 + }, + { + "epoch": 0.053771422723883806, + "grad_norm": 0.35060915350914, + "learning_rate": 0.00018928501800127373, + "loss": 1.3341, + "step": 4138 + }, + { + "epoch": 0.05378441726779968, + "grad_norm": 0.42004090547561646, + "learning_rate": 0.00018928241853936238, + "loss": 1.3517, + "step": 4139 + }, + { + "epoch": 0.05379741181171556, + "grad_norm": 0.37880387902259827, + "learning_rate": 0.00018927981907745098, + "loss": 1.3493, + "step": 4140 + }, + { + "epoch": 0.05381040635563143, + "grad_norm": 0.4265282452106476, + "learning_rate": 0.0001892772196155396, + "loss": 1.4873, + "step": 4141 + }, + { + "epoch": 0.053823400899547305, + "grad_norm": 0.24523285031318665, + "learning_rate": 0.0001892746201536282, + "loss": 1.4249, + "step": 4142 + }, + { + "epoch": 0.05383639544346318, + "grad_norm": 0.4270896017551422, + "learning_rate": 0.00018927202069171683, + "loss": 1.4341, + "step": 4143 + }, + { + "epoch": 0.05384938998737905, + "grad_norm": 0.46497875452041626, + "learning_rate": 0.00018926942122980545, + "loss": 1.4216, + "step": 4144 + }, + { + "epoch": 0.053862384531294924, + "grad_norm": 0.3323686122894287, + "learning_rate": 0.00018926682176789405, + "loss": 1.36, + "step": 4145 + }, + { + "epoch": 0.0538753790752108, + "grad_norm": 0.43498021364212036, + "learning_rate": 0.00018926422230598267, + "loss": 1.3576, + "step": 4146 + }, + { + "epoch": 0.05388837361912667, + "grad_norm": 0.4388672411441803, + "learning_rate": 0.0001892616228440713, + "loss": 1.4025, + "step": 4147 + }, + { + "epoch": 0.05390136816304254, + "grad_norm": 0.3581877052783966, + "learning_rate": 0.0001892590233821599, + "loss": 1.5512, + "step": 4148 + }, + { + "epoch": 0.053914362706958416, + "grad_norm": 0.24979346990585327, + "learning_rate": 0.00018925642392024852, + "loss": 1.4426, + "step": 4149 + }, + { + "epoch": 0.05392735725087429, + "grad_norm": 0.2491956204175949, + "learning_rate": 0.00018925382445833712, + "loss": 1.3142, + "step": 4150 + }, + { + "epoch": 0.05394035179479016, + "grad_norm": 0.4160778820514679, + "learning_rate": 0.00018925122499642577, + "loss": 1.4305, + "step": 4151 + }, + { + "epoch": 0.053953346338706035, + "grad_norm": 0.37397119402885437, + "learning_rate": 0.00018924862553451437, + "loss": 1.3388, + "step": 4152 + }, + { + "epoch": 0.05396634088262191, + "grad_norm": 0.21446822583675385, + "learning_rate": 0.000189246026072603, + "loss": 1.0965, + "step": 4153 + }, + { + "epoch": 0.05397933542653778, + "grad_norm": 0.5060781240463257, + "learning_rate": 0.0001892434266106916, + "loss": 1.478, + "step": 4154 + }, + { + "epoch": 0.053992329970453654, + "grad_norm": 0.39640262722969055, + "learning_rate": 0.0001892408271487802, + "loss": 1.3321, + "step": 4155 + }, + { + "epoch": 0.054005324514369527, + "grad_norm": 0.4792666435241699, + "learning_rate": 0.00018923822768686884, + "loss": 1.399, + "step": 4156 + }, + { + "epoch": 0.0540183190582854, + "grad_norm": 0.2827445864677429, + "learning_rate": 0.00018923562822495743, + "loss": 1.4774, + "step": 4157 + }, + { + "epoch": 0.05403131360220127, + "grad_norm": 0.41037750244140625, + "learning_rate": 0.00018923302876304606, + "loss": 1.7294, + "step": 4158 + }, + { + "epoch": 0.05404430814611715, + "grad_norm": 0.3872053623199463, + "learning_rate": 0.00018923042930113468, + "loss": 1.7031, + "step": 4159 + }, + { + "epoch": 0.054057302690033025, + "grad_norm": 0.40729260444641113, + "learning_rate": 0.00018922782983922328, + "loss": 1.5746, + "step": 4160 + }, + { + "epoch": 0.0540702972339489, + "grad_norm": 0.2942042052745819, + "learning_rate": 0.0001892252303773119, + "loss": 1.366, + "step": 4161 + }, + { + "epoch": 0.05408329177786477, + "grad_norm": 0.3768075704574585, + "learning_rate": 0.0001892226309154005, + "loss": 1.394, + "step": 4162 + }, + { + "epoch": 0.054096286321780644, + "grad_norm": 0.33810779452323914, + "learning_rate": 0.00018922003145348915, + "loss": 1.3207, + "step": 4163 + }, + { + "epoch": 0.05410928086569652, + "grad_norm": 0.3797534704208374, + "learning_rate": 0.00018921743199157775, + "loss": 1.3949, + "step": 4164 + }, + { + "epoch": 0.05412227540961239, + "grad_norm": 0.3625997006893158, + "learning_rate": 0.00018921483252966638, + "loss": 1.4761, + "step": 4165 + }, + { + "epoch": 0.05413526995352826, + "grad_norm": 0.31058254837989807, + "learning_rate": 0.00018921223306775497, + "loss": 1.4738, + "step": 4166 + }, + { + "epoch": 0.054148264497444136, + "grad_norm": 0.31756383180618286, + "learning_rate": 0.0001892096336058436, + "loss": 1.4252, + "step": 4167 + }, + { + "epoch": 0.05416125904136001, + "grad_norm": 0.4506288468837738, + "learning_rate": 0.00018920703414393222, + "loss": 1.3989, + "step": 4168 + }, + { + "epoch": 0.05417425358527588, + "grad_norm": 0.30721551179885864, + "learning_rate": 0.00018920443468202082, + "loss": 1.5109, + "step": 4169 + }, + { + "epoch": 0.054187248129191755, + "grad_norm": 0.34977883100509644, + "learning_rate": 0.00018920183522010944, + "loss": 1.4504, + "step": 4170 + }, + { + "epoch": 0.05420024267310763, + "grad_norm": 0.3549085855484009, + "learning_rate": 0.00018919923575819807, + "loss": 1.5236, + "step": 4171 + }, + { + "epoch": 0.0542132372170235, + "grad_norm": 0.45250678062438965, + "learning_rate": 0.00018919663629628667, + "loss": 1.3842, + "step": 4172 + }, + { + "epoch": 0.054226231760939374, + "grad_norm": 0.39982518553733826, + "learning_rate": 0.0001891940368343753, + "loss": 1.4139, + "step": 4173 + }, + { + "epoch": 0.05423922630485525, + "grad_norm": 0.3891928195953369, + "learning_rate": 0.00018919143737246391, + "loss": 1.4246, + "step": 4174 + }, + { + "epoch": 0.05425222084877112, + "grad_norm": 0.417416512966156, + "learning_rate": 0.00018918883791055254, + "loss": 1.7258, + "step": 4175 + }, + { + "epoch": 0.05426521539268699, + "grad_norm": 0.3698502779006958, + "learning_rate": 0.00018918623844864114, + "loss": 1.4572, + "step": 4176 + }, + { + "epoch": 0.054278209936602866, + "grad_norm": 0.32752782106399536, + "learning_rate": 0.00018918363898672976, + "loss": 1.5054, + "step": 4177 + }, + { + "epoch": 0.054291204480518745, + "grad_norm": 0.34790733456611633, + "learning_rate": 0.00018918103952481839, + "loss": 1.5698, + "step": 4178 + }, + { + "epoch": 0.05430419902443462, + "grad_norm": 0.42732593417167664, + "learning_rate": 0.00018917844006290698, + "loss": 1.5238, + "step": 4179 + }, + { + "epoch": 0.05431719356835049, + "grad_norm": 0.3276183009147644, + "learning_rate": 0.0001891758406009956, + "loss": 1.3563, + "step": 4180 + }, + { + "epoch": 0.054330188112266364, + "grad_norm": 0.38544175028800964, + "learning_rate": 0.0001891732411390842, + "loss": 1.613, + "step": 4181 + }, + { + "epoch": 0.05434318265618224, + "grad_norm": 0.4363090693950653, + "learning_rate": 0.00018917064167717286, + "loss": 1.302, + "step": 4182 + }, + { + "epoch": 0.05435617720009811, + "grad_norm": 0.39787256717681885, + "learning_rate": 0.00018916804221526145, + "loss": 1.4739, + "step": 4183 + }, + { + "epoch": 0.05436917174401398, + "grad_norm": 0.4567318856716156, + "learning_rate": 0.00018916544275335005, + "loss": 1.538, + "step": 4184 + }, + { + "epoch": 0.054382166287929856, + "grad_norm": 0.35571911931037903, + "learning_rate": 0.00018916284329143868, + "loss": 1.229, + "step": 4185 + }, + { + "epoch": 0.05439516083184573, + "grad_norm": 0.31598547101020813, + "learning_rate": 0.0001891602438295273, + "loss": 1.3032, + "step": 4186 + }, + { + "epoch": 0.0544081553757616, + "grad_norm": 0.4143746793270111, + "learning_rate": 0.00018915764436761592, + "loss": 1.181, + "step": 4187 + }, + { + "epoch": 0.054421149919677475, + "grad_norm": 0.32978639006614685, + "learning_rate": 0.00018915504490570452, + "loss": 1.4337, + "step": 4188 + }, + { + "epoch": 0.05443414446359335, + "grad_norm": 0.3699577748775482, + "learning_rate": 0.00018915244544379315, + "loss": 1.4999, + "step": 4189 + }, + { + "epoch": 0.05444713900750922, + "grad_norm": 0.43622177839279175, + "learning_rate": 0.00018914984598188177, + "loss": 1.5525, + "step": 4190 + }, + { + "epoch": 0.054460133551425094, + "grad_norm": 0.6816190481185913, + "learning_rate": 0.00018914724651997037, + "loss": 1.5219, + "step": 4191 + }, + { + "epoch": 0.05447312809534097, + "grad_norm": 0.42073604464530945, + "learning_rate": 0.000189144647058059, + "loss": 1.4336, + "step": 4192 + }, + { + "epoch": 0.05448612263925684, + "grad_norm": 0.6639722585678101, + "learning_rate": 0.0001891420475961476, + "loss": 1.4878, + "step": 4193 + }, + { + "epoch": 0.05449911718317271, + "grad_norm": 0.40993350744247437, + "learning_rate": 0.00018913944813423624, + "loss": 1.4694, + "step": 4194 + }, + { + "epoch": 0.054512111727088586, + "grad_norm": 0.39893394708633423, + "learning_rate": 0.00018913684867232484, + "loss": 1.4709, + "step": 4195 + }, + { + "epoch": 0.05452510627100446, + "grad_norm": 0.4202471971511841, + "learning_rate": 0.00018913424921041346, + "loss": 1.5103, + "step": 4196 + }, + { + "epoch": 0.05453810081492034, + "grad_norm": 0.38577938079833984, + "learning_rate": 0.00018913164974850206, + "loss": 1.4811, + "step": 4197 + }, + { + "epoch": 0.05455109535883621, + "grad_norm": 0.40890538692474365, + "learning_rate": 0.00018912905028659069, + "loss": 1.1876, + "step": 4198 + }, + { + "epoch": 0.054564089902752085, + "grad_norm": 0.37090209126472473, + "learning_rate": 0.0001891264508246793, + "loss": 1.505, + "step": 4199 + }, + { + "epoch": 0.05457708444666796, + "grad_norm": 0.36616623401641846, + "learning_rate": 0.0001891238513627679, + "loss": 1.4369, + "step": 4200 + }, + { + "epoch": 0.05459007899058383, + "grad_norm": 0.33773836493492126, + "learning_rate": 0.00018912125190085653, + "loss": 1.4114, + "step": 4201 + }, + { + "epoch": 0.0546030735344997, + "grad_norm": 0.44164177775382996, + "learning_rate": 0.00018911865243894516, + "loss": 1.3541, + "step": 4202 + }, + { + "epoch": 0.054616068078415576, + "grad_norm": 0.3994191586971283, + "learning_rate": 0.00018911605297703375, + "loss": 1.4149, + "step": 4203 + }, + { + "epoch": 0.05462906262233145, + "grad_norm": 0.40110909938812256, + "learning_rate": 0.00018911345351512238, + "loss": 1.5296, + "step": 4204 + }, + { + "epoch": 0.05464205716624732, + "grad_norm": 0.44197770953178406, + "learning_rate": 0.00018911085405321098, + "loss": 1.3985, + "step": 4205 + }, + { + "epoch": 0.054655051710163195, + "grad_norm": 0.333254337310791, + "learning_rate": 0.00018910825459129963, + "loss": 1.2687, + "step": 4206 + }, + { + "epoch": 0.05466804625407907, + "grad_norm": 0.38280072808265686, + "learning_rate": 0.00018910565512938822, + "loss": 1.4165, + "step": 4207 + }, + { + "epoch": 0.05468104079799494, + "grad_norm": 0.43307724595069885, + "learning_rate": 0.00018910305566747685, + "loss": 1.5463, + "step": 4208 + }, + { + "epoch": 0.054694035341910814, + "grad_norm": 0.39417174458503723, + "learning_rate": 0.00018910045620556545, + "loss": 1.4257, + "step": 4209 + }, + { + "epoch": 0.05470702988582669, + "grad_norm": 0.4331287443637848, + "learning_rate": 0.00018909785674365407, + "loss": 1.453, + "step": 4210 + }, + { + "epoch": 0.05472002442974256, + "grad_norm": 0.33602091670036316, + "learning_rate": 0.0001890952572817427, + "loss": 1.2553, + "step": 4211 + }, + { + "epoch": 0.05473301897365843, + "grad_norm": 0.43651923537254333, + "learning_rate": 0.0001890926578198313, + "loss": 1.4727, + "step": 4212 + }, + { + "epoch": 0.054746013517574306, + "grad_norm": 0.47823551297187805, + "learning_rate": 0.00018909005835791992, + "loss": 1.5448, + "step": 4213 + }, + { + "epoch": 0.05475900806149018, + "grad_norm": 0.3251281976699829, + "learning_rate": 0.00018908745889600854, + "loss": 1.3866, + "step": 4214 + }, + { + "epoch": 0.05477200260540605, + "grad_norm": 0.3628029227256775, + "learning_rate": 0.00018908485943409714, + "loss": 1.3968, + "step": 4215 + }, + { + "epoch": 0.05478499714932193, + "grad_norm": 0.35794857144355774, + "learning_rate": 0.00018908225997218576, + "loss": 1.4602, + "step": 4216 + }, + { + "epoch": 0.054797991693237805, + "grad_norm": 0.4379797875881195, + "learning_rate": 0.0001890796605102744, + "loss": 1.5374, + "step": 4217 + }, + { + "epoch": 0.05481098623715368, + "grad_norm": 0.47001224756240845, + "learning_rate": 0.000189077061048363, + "loss": 1.6732, + "step": 4218 + }, + { + "epoch": 0.05482398078106955, + "grad_norm": 0.38813579082489014, + "learning_rate": 0.0001890744615864516, + "loss": 1.3578, + "step": 4219 + }, + { + "epoch": 0.054836975324985424, + "grad_norm": 0.4324837923049927, + "learning_rate": 0.00018907186212454023, + "loss": 1.7067, + "step": 4220 + }, + { + "epoch": 0.0548499698689013, + "grad_norm": 0.4201784133911133, + "learning_rate": 0.00018906926266262886, + "loss": 1.5515, + "step": 4221 + }, + { + "epoch": 0.05486296441281717, + "grad_norm": 0.4128369987010956, + "learning_rate": 0.00018906666320071746, + "loss": 1.4877, + "step": 4222 + }, + { + "epoch": 0.05487595895673304, + "grad_norm": 0.42768174409866333, + "learning_rate": 0.00018906406373880608, + "loss": 1.3831, + "step": 4223 + }, + { + "epoch": 0.054888953500648915, + "grad_norm": 0.3972453773021698, + "learning_rate": 0.00018906146427689468, + "loss": 1.5632, + "step": 4224 + }, + { + "epoch": 0.05490194804456479, + "grad_norm": 0.45946013927459717, + "learning_rate": 0.00018905886481498333, + "loss": 1.5957, + "step": 4225 + }, + { + "epoch": 0.05491494258848066, + "grad_norm": 0.4405013918876648, + "learning_rate": 0.00018905626535307193, + "loss": 1.562, + "step": 4226 + }, + { + "epoch": 0.054927937132396534, + "grad_norm": 0.4791746139526367, + "learning_rate": 0.00018905366589116052, + "loss": 1.5667, + "step": 4227 + }, + { + "epoch": 0.05494093167631241, + "grad_norm": 0.3574109971523285, + "learning_rate": 0.00018905106642924915, + "loss": 1.3823, + "step": 4228 + }, + { + "epoch": 0.05495392622022828, + "grad_norm": 0.41632288694381714, + "learning_rate": 0.00018904846696733777, + "loss": 1.4108, + "step": 4229 + }, + { + "epoch": 0.05496692076414415, + "grad_norm": 0.3771124482154846, + "learning_rate": 0.0001890458675054264, + "loss": 1.3334, + "step": 4230 + }, + { + "epoch": 0.054979915308060026, + "grad_norm": 0.3680572211742401, + "learning_rate": 0.000189043268043515, + "loss": 1.561, + "step": 4231 + }, + { + "epoch": 0.0549929098519759, + "grad_norm": 0.4603727459907532, + "learning_rate": 0.00018904066858160362, + "loss": 1.5374, + "step": 4232 + }, + { + "epoch": 0.05500590439589177, + "grad_norm": 0.3189932107925415, + "learning_rate": 0.00018903806911969224, + "loss": 1.4432, + "step": 4233 + }, + { + "epoch": 0.055018898939807645, + "grad_norm": 0.23667339980602264, + "learning_rate": 0.00018903546965778084, + "loss": 1.3938, + "step": 4234 + }, + { + "epoch": 0.055031893483723525, + "grad_norm": 0.37380659580230713, + "learning_rate": 0.00018903287019586947, + "loss": 1.3736, + "step": 4235 + }, + { + "epoch": 0.0550448880276394, + "grad_norm": 0.45473358035087585, + "learning_rate": 0.00018903027073395806, + "loss": 1.4154, + "step": 4236 + }, + { + "epoch": 0.05505788257155527, + "grad_norm": 0.43753811717033386, + "learning_rate": 0.00018902767127204671, + "loss": 1.5389, + "step": 4237 + }, + { + "epoch": 0.055070877115471144, + "grad_norm": 0.42147397994995117, + "learning_rate": 0.0001890250718101353, + "loss": 1.4252, + "step": 4238 + }, + { + "epoch": 0.05508387165938702, + "grad_norm": 0.396310955286026, + "learning_rate": 0.0001890224723482239, + "loss": 1.3903, + "step": 4239 + }, + { + "epoch": 0.05509686620330289, + "grad_norm": 0.4641113579273224, + "learning_rate": 0.00018901987288631253, + "loss": 1.3335, + "step": 4240 + }, + { + "epoch": 0.05510986074721876, + "grad_norm": 0.32203537225723267, + "learning_rate": 0.00018901727342440116, + "loss": 1.3022, + "step": 4241 + }, + { + "epoch": 0.055122855291134636, + "grad_norm": 0.41443485021591187, + "learning_rate": 0.00018901467396248978, + "loss": 1.4563, + "step": 4242 + }, + { + "epoch": 0.05513584983505051, + "grad_norm": 0.3391130566596985, + "learning_rate": 0.00018901207450057838, + "loss": 1.5188, + "step": 4243 + }, + { + "epoch": 0.05514884437896638, + "grad_norm": 0.41159477829933167, + "learning_rate": 0.000189009475038667, + "loss": 1.5471, + "step": 4244 + }, + { + "epoch": 0.055161838922882254, + "grad_norm": 0.35348883271217346, + "learning_rate": 0.00018900687557675563, + "loss": 1.3703, + "step": 4245 + }, + { + "epoch": 0.05517483346679813, + "grad_norm": 0.5232519507408142, + "learning_rate": 0.00018900427611484423, + "loss": 1.3852, + "step": 4246 + }, + { + "epoch": 0.055187828010714, + "grad_norm": 0.34217482805252075, + "learning_rate": 0.00018900167665293285, + "loss": 1.353, + "step": 4247 + }, + { + "epoch": 0.05520082255462987, + "grad_norm": 0.5088396072387695, + "learning_rate": 0.00018899907719102148, + "loss": 1.4171, + "step": 4248 + }, + { + "epoch": 0.055213817098545746, + "grad_norm": 0.42737191915512085, + "learning_rate": 0.0001889964777291101, + "loss": 1.3982, + "step": 4249 + }, + { + "epoch": 0.05522681164246162, + "grad_norm": 0.34471267461776733, + "learning_rate": 0.0001889938782671987, + "loss": 1.4787, + "step": 4250 + }, + { + "epoch": 0.05523980618637749, + "grad_norm": 0.2757571339607239, + "learning_rate": 0.0001889912788052873, + "loss": 1.4539, + "step": 4251 + }, + { + "epoch": 0.055252800730293365, + "grad_norm": 0.3801814317703247, + "learning_rate": 0.00018898867934337595, + "loss": 1.2948, + "step": 4252 + }, + { + "epoch": 0.05526579527420924, + "grad_norm": 0.4791862666606903, + "learning_rate": 0.00018898607988146454, + "loss": 1.4388, + "step": 4253 + }, + { + "epoch": 0.05527878981812512, + "grad_norm": 0.39182424545288086, + "learning_rate": 0.00018898348041955317, + "loss": 1.3614, + "step": 4254 + }, + { + "epoch": 0.05529178436204099, + "grad_norm": 0.39498671889305115, + "learning_rate": 0.00018898088095764177, + "loss": 1.4235, + "step": 4255 + }, + { + "epoch": 0.055304778905956864, + "grad_norm": 0.3955695331096649, + "learning_rate": 0.0001889782814957304, + "loss": 1.3083, + "step": 4256 + }, + { + "epoch": 0.05531777344987274, + "grad_norm": 0.34899240732192993, + "learning_rate": 0.00018897568203381901, + "loss": 1.4599, + "step": 4257 + }, + { + "epoch": 0.05533076799378861, + "grad_norm": 0.3629237115383148, + "learning_rate": 0.0001889730825719076, + "loss": 1.4181, + "step": 4258 + }, + { + "epoch": 0.05534376253770448, + "grad_norm": 0.46008944511413574, + "learning_rate": 0.00018897048310999624, + "loss": 1.7422, + "step": 4259 + }, + { + "epoch": 0.055356757081620356, + "grad_norm": 0.38336610794067383, + "learning_rate": 0.00018896788364808486, + "loss": 1.3975, + "step": 4260 + }, + { + "epoch": 0.05536975162553623, + "grad_norm": 0.38027679920196533, + "learning_rate": 0.00018896528418617349, + "loss": 1.493, + "step": 4261 + }, + { + "epoch": 0.0553827461694521, + "grad_norm": 0.356699138879776, + "learning_rate": 0.00018896268472426208, + "loss": 1.382, + "step": 4262 + }, + { + "epoch": 0.055395740713367975, + "grad_norm": 0.4037978947162628, + "learning_rate": 0.0001889600852623507, + "loss": 1.2951, + "step": 4263 + }, + { + "epoch": 0.05540873525728385, + "grad_norm": 0.42546093463897705, + "learning_rate": 0.00018895748580043933, + "loss": 1.3919, + "step": 4264 + }, + { + "epoch": 0.05542172980119972, + "grad_norm": 0.4108228087425232, + "learning_rate": 0.00018895488633852793, + "loss": 1.7336, + "step": 4265 + }, + { + "epoch": 0.055434724345115594, + "grad_norm": 0.37069782614707947, + "learning_rate": 0.00018895228687661655, + "loss": 1.3827, + "step": 4266 + }, + { + "epoch": 0.055447718889031467, + "grad_norm": 0.3522305488586426, + "learning_rate": 0.00018894968741470515, + "loss": 1.3179, + "step": 4267 + }, + { + "epoch": 0.05546071343294734, + "grad_norm": 0.3672562837600708, + "learning_rate": 0.00018894708795279378, + "loss": 1.3482, + "step": 4268 + }, + { + "epoch": 0.05547370797686321, + "grad_norm": 0.40353137254714966, + "learning_rate": 0.0001889444884908824, + "loss": 1.3486, + "step": 4269 + }, + { + "epoch": 0.055486702520779085, + "grad_norm": 0.471489280462265, + "learning_rate": 0.000188941889028971, + "loss": 1.5052, + "step": 4270 + }, + { + "epoch": 0.05549969706469496, + "grad_norm": 0.3540598154067993, + "learning_rate": 0.00018893928956705962, + "loss": 1.3245, + "step": 4271 + }, + { + "epoch": 0.05551269160861083, + "grad_norm": 0.3782809376716614, + "learning_rate": 0.00018893669010514825, + "loss": 1.3392, + "step": 4272 + }, + { + "epoch": 0.05552568615252671, + "grad_norm": 0.3543851375579834, + "learning_rate": 0.00018893409064323687, + "loss": 1.3055, + "step": 4273 + }, + { + "epoch": 0.055538680696442584, + "grad_norm": 0.4358716309070587, + "learning_rate": 0.00018893149118132547, + "loss": 1.3783, + "step": 4274 + }, + { + "epoch": 0.05555167524035846, + "grad_norm": 0.3005434572696686, + "learning_rate": 0.0001889288917194141, + "loss": 1.2415, + "step": 4275 + }, + { + "epoch": 0.05556466978427433, + "grad_norm": 0.2737489342689514, + "learning_rate": 0.00018892629225750272, + "loss": 1.3817, + "step": 4276 + }, + { + "epoch": 0.0555776643281902, + "grad_norm": 0.3879261314868927, + "learning_rate": 0.00018892369279559131, + "loss": 1.3333, + "step": 4277 + }, + { + "epoch": 0.055590658872106076, + "grad_norm": 0.3781391382217407, + "learning_rate": 0.00018892109333367994, + "loss": 1.5116, + "step": 4278 + }, + { + "epoch": 0.05560365341602195, + "grad_norm": 0.43596363067626953, + "learning_rate": 0.00018891849387176854, + "loss": 1.3875, + "step": 4279 + }, + { + "epoch": 0.05561664795993782, + "grad_norm": 0.22721873223781586, + "learning_rate": 0.0001889158944098572, + "loss": 1.1289, + "step": 4280 + }, + { + "epoch": 0.055629642503853695, + "grad_norm": 0.3520500659942627, + "learning_rate": 0.00018891329494794579, + "loss": 1.4431, + "step": 4281 + }, + { + "epoch": 0.05564263704776957, + "grad_norm": 0.47316721081733704, + "learning_rate": 0.00018891069548603438, + "loss": 1.651, + "step": 4282 + }, + { + "epoch": 0.05565563159168544, + "grad_norm": 0.4087876081466675, + "learning_rate": 0.000188908096024123, + "loss": 1.4293, + "step": 4283 + }, + { + "epoch": 0.055668626135601314, + "grad_norm": 0.46811285614967346, + "learning_rate": 0.00018890549656221163, + "loss": 1.4035, + "step": 4284 + }, + { + "epoch": 0.05568162067951719, + "grad_norm": 0.31009799242019653, + "learning_rate": 0.00018890289710030026, + "loss": 1.3051, + "step": 4285 + }, + { + "epoch": 0.05569461522343306, + "grad_norm": 0.32416224479675293, + "learning_rate": 0.00018890029763838885, + "loss": 1.3752, + "step": 4286 + }, + { + "epoch": 0.05570760976734893, + "grad_norm": 0.2768734395503998, + "learning_rate": 0.00018889769817647748, + "loss": 1.5295, + "step": 4287 + }, + { + "epoch": 0.055720604311264806, + "grad_norm": 0.3529180586338043, + "learning_rate": 0.0001888950987145661, + "loss": 1.3711, + "step": 4288 + }, + { + "epoch": 0.05573359885518068, + "grad_norm": 0.31314295530319214, + "learning_rate": 0.0001888924992526547, + "loss": 1.1908, + "step": 4289 + }, + { + "epoch": 0.05574659339909655, + "grad_norm": 0.3992103040218353, + "learning_rate": 0.00018888989979074332, + "loss": 1.3531, + "step": 4290 + }, + { + "epoch": 0.055759587943012424, + "grad_norm": 0.3720290958881378, + "learning_rate": 0.00018888730032883195, + "loss": 1.5652, + "step": 4291 + }, + { + "epoch": 0.055772582486928304, + "grad_norm": 0.38190796971321106, + "learning_rate": 0.00018888470086692057, + "loss": 1.4262, + "step": 4292 + }, + { + "epoch": 0.05578557703084418, + "grad_norm": 0.39032599329948425, + "learning_rate": 0.00018888210140500917, + "loss": 1.4588, + "step": 4293 + }, + { + "epoch": 0.05579857157476005, + "grad_norm": 0.4417930245399475, + "learning_rate": 0.00018887950194309777, + "loss": 1.4527, + "step": 4294 + }, + { + "epoch": 0.05581156611867592, + "grad_norm": 0.25574052333831787, + "learning_rate": 0.00018887690248118642, + "loss": 1.4443, + "step": 4295 + }, + { + "epoch": 0.055824560662591796, + "grad_norm": 0.4334050714969635, + "learning_rate": 0.00018887430301927502, + "loss": 1.2305, + "step": 4296 + }, + { + "epoch": 0.05583755520650767, + "grad_norm": 0.4551560878753662, + "learning_rate": 0.00018887170355736364, + "loss": 1.4285, + "step": 4297 + }, + { + "epoch": 0.05585054975042354, + "grad_norm": 0.29558664560317993, + "learning_rate": 0.00018886910409545224, + "loss": 1.2468, + "step": 4298 + }, + { + "epoch": 0.055863544294339415, + "grad_norm": 0.3928694427013397, + "learning_rate": 0.00018886650463354086, + "loss": 1.4435, + "step": 4299 + }, + { + "epoch": 0.05587653883825529, + "grad_norm": 0.3278985917568207, + "learning_rate": 0.0001888639051716295, + "loss": 1.3559, + "step": 4300 + }, + { + "epoch": 0.05588953338217116, + "grad_norm": 0.3517388701438904, + "learning_rate": 0.00018886130570971809, + "loss": 1.2926, + "step": 4301 + }, + { + "epoch": 0.055902527926087034, + "grad_norm": 0.41983991861343384, + "learning_rate": 0.0001888587062478067, + "loss": 1.2926, + "step": 4302 + }, + { + "epoch": 0.05591552247000291, + "grad_norm": 0.35589921474456787, + "learning_rate": 0.00018885610678589533, + "loss": 1.6002, + "step": 4303 + }, + { + "epoch": 0.05592851701391878, + "grad_norm": 0.3442300856113434, + "learning_rate": 0.00018885350732398396, + "loss": 1.3369, + "step": 4304 + }, + { + "epoch": 0.05594151155783465, + "grad_norm": 0.38496631383895874, + "learning_rate": 0.00018885090786207256, + "loss": 1.4899, + "step": 4305 + }, + { + "epoch": 0.055954506101750526, + "grad_norm": 0.3305857479572296, + "learning_rate": 0.00018884830840016115, + "loss": 1.4139, + "step": 4306 + }, + { + "epoch": 0.0559675006456664, + "grad_norm": 0.3925778865814209, + "learning_rate": 0.0001888457089382498, + "loss": 1.4576, + "step": 4307 + }, + { + "epoch": 0.05598049518958227, + "grad_norm": 0.4273711144924164, + "learning_rate": 0.0001888431094763384, + "loss": 1.2212, + "step": 4308 + }, + { + "epoch": 0.055993489733498145, + "grad_norm": 0.3588268756866455, + "learning_rate": 0.00018884051001442703, + "loss": 1.5006, + "step": 4309 + }, + { + "epoch": 0.05600648427741402, + "grad_norm": 0.33522024750709534, + "learning_rate": 0.00018883791055251562, + "loss": 1.2204, + "step": 4310 + }, + { + "epoch": 0.0560194788213299, + "grad_norm": 0.4092654287815094, + "learning_rate": 0.00018883531109060425, + "loss": 1.359, + "step": 4311 + }, + { + "epoch": 0.05603247336524577, + "grad_norm": 0.40513813495635986, + "learning_rate": 0.00018883271162869287, + "loss": 1.2697, + "step": 4312 + }, + { + "epoch": 0.05604546790916164, + "grad_norm": 0.4017346501350403, + "learning_rate": 0.00018883011216678147, + "loss": 1.4096, + "step": 4313 + }, + { + "epoch": 0.056058462453077516, + "grad_norm": 0.39384058117866516, + "learning_rate": 0.0001888275127048701, + "loss": 1.3911, + "step": 4314 + }, + { + "epoch": 0.05607145699699339, + "grad_norm": 0.3191656172275543, + "learning_rate": 0.00018882491324295872, + "loss": 1.3967, + "step": 4315 + }, + { + "epoch": 0.05608445154090926, + "grad_norm": 0.30934426188468933, + "learning_rate": 0.00018882231378104734, + "loss": 1.5199, + "step": 4316 + }, + { + "epoch": 0.056097446084825135, + "grad_norm": 0.3824143707752228, + "learning_rate": 0.00018881971431913594, + "loss": 1.3707, + "step": 4317 + }, + { + "epoch": 0.05611044062874101, + "grad_norm": 0.42951467633247375, + "learning_rate": 0.00018881711485722457, + "loss": 1.4281, + "step": 4318 + }, + { + "epoch": 0.05612343517265688, + "grad_norm": 0.39471718668937683, + "learning_rate": 0.0001888145153953132, + "loss": 1.4594, + "step": 4319 + }, + { + "epoch": 0.056136429716572754, + "grad_norm": 0.4263366758823395, + "learning_rate": 0.0001888119159334018, + "loss": 1.4854, + "step": 4320 + }, + { + "epoch": 0.05614942426048863, + "grad_norm": 0.3760986030101776, + "learning_rate": 0.0001888093164714904, + "loss": 1.5372, + "step": 4321 + }, + { + "epoch": 0.0561624188044045, + "grad_norm": 0.3160140812397003, + "learning_rate": 0.00018880671700957904, + "loss": 1.3305, + "step": 4322 + }, + { + "epoch": 0.05617541334832037, + "grad_norm": 0.32864001393318176, + "learning_rate": 0.00018880411754766763, + "loss": 1.5169, + "step": 4323 + }, + { + "epoch": 0.056188407892236246, + "grad_norm": 0.3781583309173584, + "learning_rate": 0.00018880151808575626, + "loss": 1.5385, + "step": 4324 + }, + { + "epoch": 0.05620140243615212, + "grad_norm": 0.25325852632522583, + "learning_rate": 0.00018879891862384486, + "loss": 1.3887, + "step": 4325 + }, + { + "epoch": 0.05621439698006799, + "grad_norm": 0.3994160592556, + "learning_rate": 0.0001887963191619335, + "loss": 1.3673, + "step": 4326 + }, + { + "epoch": 0.056227391523983865, + "grad_norm": 0.36684292554855347, + "learning_rate": 0.0001887937197000221, + "loss": 1.6225, + "step": 4327 + }, + { + "epoch": 0.05624038606789974, + "grad_norm": 0.36858800053596497, + "learning_rate": 0.00018879112023811073, + "loss": 1.2697, + "step": 4328 + }, + { + "epoch": 0.05625338061181561, + "grad_norm": 0.2822013199329376, + "learning_rate": 0.00018878852077619933, + "loss": 1.3315, + "step": 4329 + }, + { + "epoch": 0.05626637515573149, + "grad_norm": 0.34770163893699646, + "learning_rate": 0.00018878592131428795, + "loss": 1.2821, + "step": 4330 + }, + { + "epoch": 0.056279369699647364, + "grad_norm": 0.3689379096031189, + "learning_rate": 0.00018878332185237658, + "loss": 1.5098, + "step": 4331 + }, + { + "epoch": 0.05629236424356324, + "grad_norm": 0.3937860429286957, + "learning_rate": 0.00018878072239046517, + "loss": 1.3241, + "step": 4332 + }, + { + "epoch": 0.05630535878747911, + "grad_norm": 0.3446391820907593, + "learning_rate": 0.0001887781229285538, + "loss": 1.5134, + "step": 4333 + }, + { + "epoch": 0.05631835333139498, + "grad_norm": 0.32316312193870544, + "learning_rate": 0.00018877552346664242, + "loss": 1.3648, + "step": 4334 + }, + { + "epoch": 0.056331347875310855, + "grad_norm": 0.4080737829208374, + "learning_rate": 0.00018877292400473102, + "loss": 1.4906, + "step": 4335 + }, + { + "epoch": 0.05634434241922673, + "grad_norm": 0.4372391402721405, + "learning_rate": 0.00018877032454281964, + "loss": 1.4577, + "step": 4336 + }, + { + "epoch": 0.0563573369631426, + "grad_norm": 0.3978181481361389, + "learning_rate": 0.00018876772508090824, + "loss": 1.5947, + "step": 4337 + }, + { + "epoch": 0.056370331507058474, + "grad_norm": 0.4092581272125244, + "learning_rate": 0.0001887651256189969, + "loss": 1.5452, + "step": 4338 + }, + { + "epoch": 0.05638332605097435, + "grad_norm": 0.29107651114463806, + "learning_rate": 0.0001887625261570855, + "loss": 1.4862, + "step": 4339 + }, + { + "epoch": 0.05639632059489022, + "grad_norm": 0.36635684967041016, + "learning_rate": 0.00018875992669517412, + "loss": 1.4533, + "step": 4340 + }, + { + "epoch": 0.05640931513880609, + "grad_norm": 0.3783576488494873, + "learning_rate": 0.0001887573272332627, + "loss": 1.4592, + "step": 4341 + }, + { + "epoch": 0.056422309682721966, + "grad_norm": 0.47449570894241333, + "learning_rate": 0.00018875472777135134, + "loss": 1.4946, + "step": 4342 + }, + { + "epoch": 0.05643530422663784, + "grad_norm": 0.3573222756385803, + "learning_rate": 0.00018875212830943996, + "loss": 1.1712, + "step": 4343 + }, + { + "epoch": 0.05644829877055371, + "grad_norm": 0.35999277234077454, + "learning_rate": 0.00018874952884752856, + "loss": 1.3245, + "step": 4344 + }, + { + "epoch": 0.056461293314469585, + "grad_norm": 0.4604869782924652, + "learning_rate": 0.00018874692938561718, + "loss": 1.5313, + "step": 4345 + }, + { + "epoch": 0.05647428785838546, + "grad_norm": 0.40419793128967285, + "learning_rate": 0.0001887443299237058, + "loss": 1.5753, + "step": 4346 + }, + { + "epoch": 0.05648728240230133, + "grad_norm": 0.43127113580703735, + "learning_rate": 0.00018874173046179443, + "loss": 1.3463, + "step": 4347 + }, + { + "epoch": 0.056500276946217204, + "grad_norm": 0.38444408774375916, + "learning_rate": 0.00018873913099988303, + "loss": 1.3402, + "step": 4348 + }, + { + "epoch": 0.056513271490133084, + "grad_norm": 0.3234100937843323, + "learning_rate": 0.00018873653153797163, + "loss": 1.6239, + "step": 4349 + }, + { + "epoch": 0.05652626603404896, + "grad_norm": 0.3555113673210144, + "learning_rate": 0.00018873393207606028, + "loss": 1.3021, + "step": 4350 + }, + { + "epoch": 0.05653926057796483, + "grad_norm": 0.36611834168434143, + "learning_rate": 0.00018873133261414888, + "loss": 1.5364, + "step": 4351 + }, + { + "epoch": 0.0565522551218807, + "grad_norm": 0.4237237572669983, + "learning_rate": 0.0001887287331522375, + "loss": 1.5019, + "step": 4352 + }, + { + "epoch": 0.056565249665796576, + "grad_norm": 0.38082775473594666, + "learning_rate": 0.0001887261336903261, + "loss": 1.5492, + "step": 4353 + }, + { + "epoch": 0.05657824420971245, + "grad_norm": 0.3991524279117584, + "learning_rate": 0.00018872353422841472, + "loss": 1.4544, + "step": 4354 + }, + { + "epoch": 0.05659123875362832, + "grad_norm": 0.4199187457561493, + "learning_rate": 0.00018872093476650335, + "loss": 1.5366, + "step": 4355 + }, + { + "epoch": 0.056604233297544194, + "grad_norm": 0.35470521450042725, + "learning_rate": 0.00018871833530459194, + "loss": 1.5311, + "step": 4356 + }, + { + "epoch": 0.05661722784146007, + "grad_norm": 0.2809489667415619, + "learning_rate": 0.00018871573584268057, + "loss": 1.2625, + "step": 4357 + }, + { + "epoch": 0.05663022238537594, + "grad_norm": 0.3196016550064087, + "learning_rate": 0.0001887131363807692, + "loss": 1.4976, + "step": 4358 + }, + { + "epoch": 0.05664321692929181, + "grad_norm": 0.408279687166214, + "learning_rate": 0.00018871053691885782, + "loss": 1.5503, + "step": 4359 + }, + { + "epoch": 0.056656211473207686, + "grad_norm": 0.36303767561912537, + "learning_rate": 0.00018870793745694642, + "loss": 1.493, + "step": 4360 + }, + { + "epoch": 0.05666920601712356, + "grad_norm": 0.3275662064552307, + "learning_rate": 0.00018870533799503504, + "loss": 1.3344, + "step": 4361 + }, + { + "epoch": 0.05668220056103943, + "grad_norm": 0.38077402114868164, + "learning_rate": 0.00018870273853312366, + "loss": 1.4943, + "step": 4362 + }, + { + "epoch": 0.056695195104955305, + "grad_norm": 0.34123852849006653, + "learning_rate": 0.00018870013907121226, + "loss": 1.3335, + "step": 4363 + }, + { + "epoch": 0.05670818964887118, + "grad_norm": 0.4996756315231323, + "learning_rate": 0.00018869753960930089, + "loss": 1.4703, + "step": 4364 + }, + { + "epoch": 0.05672118419278705, + "grad_norm": 0.3341628313064575, + "learning_rate": 0.0001886949401473895, + "loss": 1.5521, + "step": 4365 + }, + { + "epoch": 0.056734178736702924, + "grad_norm": 0.45765554904937744, + "learning_rate": 0.0001886923406854781, + "loss": 1.5338, + "step": 4366 + }, + { + "epoch": 0.0567471732806188, + "grad_norm": 0.24268069863319397, + "learning_rate": 0.00018868974122356673, + "loss": 1.0957, + "step": 4367 + }, + { + "epoch": 0.05676016782453468, + "grad_norm": 0.36286821961402893, + "learning_rate": 0.00018868714176165533, + "loss": 1.6127, + "step": 4368 + }, + { + "epoch": 0.05677316236845055, + "grad_norm": 0.3851850926876068, + "learning_rate": 0.00018868454229974398, + "loss": 1.4118, + "step": 4369 + }, + { + "epoch": 0.05678615691236642, + "grad_norm": 0.2679467499256134, + "learning_rate": 0.00018868194283783258, + "loss": 1.4785, + "step": 4370 + }, + { + "epoch": 0.056799151456282296, + "grad_norm": 0.26473966240882874, + "learning_rate": 0.0001886793433759212, + "loss": 1.5598, + "step": 4371 + }, + { + "epoch": 0.05681214600019817, + "grad_norm": 0.3831224739551544, + "learning_rate": 0.0001886767439140098, + "loss": 1.4606, + "step": 4372 + }, + { + "epoch": 0.05682514054411404, + "grad_norm": 0.3660658895969391, + "learning_rate": 0.00018867414445209843, + "loss": 1.3516, + "step": 4373 + }, + { + "epoch": 0.056838135088029915, + "grad_norm": 0.40516167879104614, + "learning_rate": 0.00018867154499018705, + "loss": 1.5301, + "step": 4374 + }, + { + "epoch": 0.05685112963194579, + "grad_norm": 0.44480594992637634, + "learning_rate": 0.00018866894552827565, + "loss": 1.4486, + "step": 4375 + }, + { + "epoch": 0.05686412417586166, + "grad_norm": 0.33053305745124817, + "learning_rate": 0.00018866634606636427, + "loss": 1.3866, + "step": 4376 + }, + { + "epoch": 0.056877118719777534, + "grad_norm": 0.37060365080833435, + "learning_rate": 0.0001886637466044529, + "loss": 1.3789, + "step": 4377 + }, + { + "epoch": 0.056890113263693406, + "grad_norm": 0.3704473078250885, + "learning_rate": 0.0001886611471425415, + "loss": 1.3734, + "step": 4378 + }, + { + "epoch": 0.05690310780760928, + "grad_norm": 0.39931410551071167, + "learning_rate": 0.00018865854768063012, + "loss": 1.5498, + "step": 4379 + }, + { + "epoch": 0.05691610235152515, + "grad_norm": 0.5383016467094421, + "learning_rate": 0.00018865594821871872, + "loss": 1.4388, + "step": 4380 + }, + { + "epoch": 0.056929096895441025, + "grad_norm": 0.24925829470157623, + "learning_rate": 0.00018865334875680737, + "loss": 1.2727, + "step": 4381 + }, + { + "epoch": 0.0569420914393569, + "grad_norm": 0.35634124279022217, + "learning_rate": 0.00018865074929489596, + "loss": 1.396, + "step": 4382 + }, + { + "epoch": 0.05695508598327277, + "grad_norm": 0.4414868652820587, + "learning_rate": 0.0001886481498329846, + "loss": 1.558, + "step": 4383 + }, + { + "epoch": 0.056968080527188644, + "grad_norm": 0.32493627071380615, + "learning_rate": 0.00018864555037107319, + "loss": 1.6839, + "step": 4384 + }, + { + "epoch": 0.05698107507110452, + "grad_norm": 0.3880787193775177, + "learning_rate": 0.0001886429509091618, + "loss": 1.4107, + "step": 4385 + }, + { + "epoch": 0.05699406961502039, + "grad_norm": 0.4654039740562439, + "learning_rate": 0.00018864035144725043, + "loss": 1.6062, + "step": 4386 + }, + { + "epoch": 0.05700706415893627, + "grad_norm": 0.38659539818763733, + "learning_rate": 0.00018863775198533903, + "loss": 1.5001, + "step": 4387 + }, + { + "epoch": 0.05702005870285214, + "grad_norm": 0.35698825120925903, + "learning_rate": 0.00018863515252342766, + "loss": 1.3173, + "step": 4388 + }, + { + "epoch": 0.057033053246768016, + "grad_norm": 0.38120898604393005, + "learning_rate": 0.00018863255306151628, + "loss": 1.4852, + "step": 4389 + }, + { + "epoch": 0.05704604779068389, + "grad_norm": 0.37367287278175354, + "learning_rate": 0.00018862995359960488, + "loss": 1.4621, + "step": 4390 + }, + { + "epoch": 0.05705904233459976, + "grad_norm": 0.39866533875465393, + "learning_rate": 0.0001886273541376935, + "loss": 1.2533, + "step": 4391 + }, + { + "epoch": 0.057072036878515635, + "grad_norm": 0.4202077090740204, + "learning_rate": 0.0001886247546757821, + "loss": 1.4478, + "step": 4392 + }, + { + "epoch": 0.05708503142243151, + "grad_norm": 0.2696082592010498, + "learning_rate": 0.00018862215521387075, + "loss": 1.4689, + "step": 4393 + }, + { + "epoch": 0.05709802596634738, + "grad_norm": 0.4467935562133789, + "learning_rate": 0.00018861955575195935, + "loss": 1.3469, + "step": 4394 + }, + { + "epoch": 0.057111020510263254, + "grad_norm": 0.3542415201663971, + "learning_rate": 0.00018861695629004797, + "loss": 1.614, + "step": 4395 + }, + { + "epoch": 0.05712401505417913, + "grad_norm": 0.35639870166778564, + "learning_rate": 0.0001886143568281366, + "loss": 1.5655, + "step": 4396 + }, + { + "epoch": 0.057137009598095, + "grad_norm": 0.40242400765419006, + "learning_rate": 0.0001886117573662252, + "loss": 1.4791, + "step": 4397 + }, + { + "epoch": 0.05715000414201087, + "grad_norm": 0.34428870677948, + "learning_rate": 0.00018860915790431382, + "loss": 1.3569, + "step": 4398 + }, + { + "epoch": 0.057162998685926746, + "grad_norm": 0.391963928937912, + "learning_rate": 0.00018860655844240242, + "loss": 1.4659, + "step": 4399 + }, + { + "epoch": 0.05717599322984262, + "grad_norm": 0.4823959767818451, + "learning_rate": 0.00018860395898049107, + "loss": 1.4824, + "step": 4400 + }, + { + "epoch": 0.05718898777375849, + "grad_norm": 0.49892473220825195, + "learning_rate": 0.00018860135951857967, + "loss": 1.5869, + "step": 4401 + }, + { + "epoch": 0.057201982317674364, + "grad_norm": 0.3645821213722229, + "learning_rate": 0.0001885987600566683, + "loss": 1.607, + "step": 4402 + }, + { + "epoch": 0.05721497686159024, + "grad_norm": 0.384893000125885, + "learning_rate": 0.0001885961605947569, + "loss": 1.5141, + "step": 4403 + }, + { + "epoch": 0.05722797140550611, + "grad_norm": 0.37683653831481934, + "learning_rate": 0.0001885935611328455, + "loss": 1.2531, + "step": 4404 + }, + { + "epoch": 0.05724096594942198, + "grad_norm": 0.3627872169017792, + "learning_rate": 0.00018859096167093414, + "loss": 1.3925, + "step": 4405 + }, + { + "epoch": 0.05725396049333786, + "grad_norm": 0.3693505823612213, + "learning_rate": 0.00018858836220902273, + "loss": 1.6687, + "step": 4406 + }, + { + "epoch": 0.057266955037253736, + "grad_norm": 0.3219207525253296, + "learning_rate": 0.00018858576274711136, + "loss": 1.4408, + "step": 4407 + }, + { + "epoch": 0.05727994958116961, + "grad_norm": 0.2685849070549011, + "learning_rate": 0.00018858316328519998, + "loss": 1.4384, + "step": 4408 + }, + { + "epoch": 0.05729294412508548, + "grad_norm": 0.3409925699234009, + "learning_rate": 0.00018858056382328858, + "loss": 1.3818, + "step": 4409 + }, + { + "epoch": 0.057305938669001355, + "grad_norm": 0.4180390238761902, + "learning_rate": 0.0001885779643613772, + "loss": 1.4226, + "step": 4410 + }, + { + "epoch": 0.05731893321291723, + "grad_norm": 0.48234209418296814, + "learning_rate": 0.0001885753648994658, + "loss": 1.556, + "step": 4411 + }, + { + "epoch": 0.0573319277568331, + "grad_norm": 0.5108433961868286, + "learning_rate": 0.00018857276543755445, + "loss": 1.466, + "step": 4412 + }, + { + "epoch": 0.057344922300748974, + "grad_norm": 0.4192737638950348, + "learning_rate": 0.00018857016597564305, + "loss": 1.4339, + "step": 4413 + }, + { + "epoch": 0.05735791684466485, + "grad_norm": 0.3814343512058258, + "learning_rate": 0.00018856756651373168, + "loss": 1.3235, + "step": 4414 + }, + { + "epoch": 0.05737091138858072, + "grad_norm": 0.22901009023189545, + "learning_rate": 0.00018856496705182027, + "loss": 1.2285, + "step": 4415 + }, + { + "epoch": 0.05738390593249659, + "grad_norm": 0.26425036787986755, + "learning_rate": 0.0001885623675899089, + "loss": 1.3256, + "step": 4416 + }, + { + "epoch": 0.057396900476412466, + "grad_norm": 0.3967977464199066, + "learning_rate": 0.00018855976812799752, + "loss": 1.3776, + "step": 4417 + }, + { + "epoch": 0.05740989502032834, + "grad_norm": 0.4345617890357971, + "learning_rate": 0.00018855716866608612, + "loss": 1.5153, + "step": 4418 + }, + { + "epoch": 0.05742288956424421, + "grad_norm": 0.423673540353775, + "learning_rate": 0.00018855456920417474, + "loss": 1.4166, + "step": 4419 + }, + { + "epoch": 0.057435884108160085, + "grad_norm": 0.374982625246048, + "learning_rate": 0.00018855196974226337, + "loss": 1.314, + "step": 4420 + }, + { + "epoch": 0.05744887865207596, + "grad_norm": 0.4199756979942322, + "learning_rate": 0.00018854937028035197, + "loss": 1.5202, + "step": 4421 + }, + { + "epoch": 0.05746187319599183, + "grad_norm": 0.32804566621780396, + "learning_rate": 0.0001885467708184406, + "loss": 1.5451, + "step": 4422 + }, + { + "epoch": 0.057474867739907703, + "grad_norm": 0.3952144682407379, + "learning_rate": 0.0001885441713565292, + "loss": 1.5651, + "step": 4423 + }, + { + "epoch": 0.057487862283823576, + "grad_norm": 0.5229212045669556, + "learning_rate": 0.00018854157189461784, + "loss": 1.2889, + "step": 4424 + }, + { + "epoch": 0.057500856827739456, + "grad_norm": 0.30185407400131226, + "learning_rate": 0.00018853897243270644, + "loss": 1.4835, + "step": 4425 + }, + { + "epoch": 0.05751385137165533, + "grad_norm": 0.4042168855667114, + "learning_rate": 0.00018853637297079506, + "loss": 1.4768, + "step": 4426 + }, + { + "epoch": 0.0575268459155712, + "grad_norm": 0.4536532759666443, + "learning_rate": 0.00018853377350888366, + "loss": 1.4839, + "step": 4427 + }, + { + "epoch": 0.057539840459487075, + "grad_norm": 0.4703330397605896, + "learning_rate": 0.00018853117404697228, + "loss": 1.4641, + "step": 4428 + }, + { + "epoch": 0.05755283500340295, + "grad_norm": 0.3574981689453125, + "learning_rate": 0.0001885285745850609, + "loss": 1.3675, + "step": 4429 + }, + { + "epoch": 0.05756582954731882, + "grad_norm": 0.2977541387081146, + "learning_rate": 0.0001885259751231495, + "loss": 1.3422, + "step": 4430 + }, + { + "epoch": 0.057578824091234694, + "grad_norm": 0.4809090197086334, + "learning_rate": 0.00018852337566123813, + "loss": 1.5843, + "step": 4431 + }, + { + "epoch": 0.05759181863515057, + "grad_norm": 0.37623995542526245, + "learning_rate": 0.00018852077619932675, + "loss": 1.4277, + "step": 4432 + }, + { + "epoch": 0.05760481317906644, + "grad_norm": 0.2894614040851593, + "learning_rate": 0.00018851817673741535, + "loss": 1.3494, + "step": 4433 + }, + { + "epoch": 0.05761780772298231, + "grad_norm": 0.3871980309486389, + "learning_rate": 0.00018851557727550398, + "loss": 1.426, + "step": 4434 + }, + { + "epoch": 0.057630802266898186, + "grad_norm": 0.3628866970539093, + "learning_rate": 0.0001885129778135926, + "loss": 1.2789, + "step": 4435 + }, + { + "epoch": 0.05764379681081406, + "grad_norm": 0.46236157417297363, + "learning_rate": 0.00018851037835168123, + "loss": 1.604, + "step": 4436 + }, + { + "epoch": 0.05765679135472993, + "grad_norm": 0.41646429896354675, + "learning_rate": 0.00018850777888976982, + "loss": 1.6024, + "step": 4437 + }, + { + "epoch": 0.057669785898645805, + "grad_norm": 0.3667775094509125, + "learning_rate": 0.00018850517942785845, + "loss": 1.4866, + "step": 4438 + }, + { + "epoch": 0.05768278044256168, + "grad_norm": 0.3537268340587616, + "learning_rate": 0.00018850257996594707, + "loss": 1.3893, + "step": 4439 + }, + { + "epoch": 0.05769577498647755, + "grad_norm": 0.4517529606819153, + "learning_rate": 0.00018849998050403567, + "loss": 1.3818, + "step": 4440 + }, + { + "epoch": 0.057708769530393424, + "grad_norm": 0.374284029006958, + "learning_rate": 0.0001884973810421243, + "loss": 1.5122, + "step": 4441 + }, + { + "epoch": 0.0577217640743093, + "grad_norm": 0.49993324279785156, + "learning_rate": 0.0001884947815802129, + "loss": 1.5521, + "step": 4442 + }, + { + "epoch": 0.05773475861822517, + "grad_norm": 0.367410272359848, + "learning_rate": 0.00018849218211830154, + "loss": 1.4932, + "step": 4443 + }, + { + "epoch": 0.05774775316214105, + "grad_norm": 0.42041441798210144, + "learning_rate": 0.00018848958265639014, + "loss": 1.5762, + "step": 4444 + }, + { + "epoch": 0.05776074770605692, + "grad_norm": 0.3849473297595978, + "learning_rate": 0.00018848698319447874, + "loss": 1.4216, + "step": 4445 + }, + { + "epoch": 0.057773742249972795, + "grad_norm": 0.40465423464775085, + "learning_rate": 0.00018848438373256736, + "loss": 1.421, + "step": 4446 + }, + { + "epoch": 0.05778673679388867, + "grad_norm": 0.4473832845687866, + "learning_rate": 0.00018848178427065599, + "loss": 1.4875, + "step": 4447 + }, + { + "epoch": 0.05779973133780454, + "grad_norm": 0.39657723903656006, + "learning_rate": 0.0001884791848087446, + "loss": 1.5944, + "step": 4448 + }, + { + "epoch": 0.057812725881720414, + "grad_norm": 0.40098837018013, + "learning_rate": 0.0001884765853468332, + "loss": 1.3575, + "step": 4449 + }, + { + "epoch": 0.05782572042563629, + "grad_norm": 0.4412459135055542, + "learning_rate": 0.00018847398588492183, + "loss": 1.4385, + "step": 4450 + }, + { + "epoch": 0.05783871496955216, + "grad_norm": 0.3459238111972809, + "learning_rate": 0.00018847138642301046, + "loss": 1.4425, + "step": 4451 + }, + { + "epoch": 0.05785170951346803, + "grad_norm": 0.39290276169776917, + "learning_rate": 0.00018846878696109905, + "loss": 1.6497, + "step": 4452 + }, + { + "epoch": 0.057864704057383906, + "grad_norm": 0.3830406069755554, + "learning_rate": 0.00018846618749918768, + "loss": 1.3845, + "step": 4453 + }, + { + "epoch": 0.05787769860129978, + "grad_norm": 0.4009338617324829, + "learning_rate": 0.00018846358803727628, + "loss": 1.6374, + "step": 4454 + }, + { + "epoch": 0.05789069314521565, + "grad_norm": 0.4065341651439667, + "learning_rate": 0.00018846098857536493, + "loss": 1.2858, + "step": 4455 + }, + { + "epoch": 0.057903687689131525, + "grad_norm": 0.4415000081062317, + "learning_rate": 0.00018845838911345353, + "loss": 1.4675, + "step": 4456 + }, + { + "epoch": 0.0579166822330474, + "grad_norm": 0.42395147681236267, + "learning_rate": 0.00018845578965154212, + "loss": 1.5597, + "step": 4457 + }, + { + "epoch": 0.05792967677696327, + "grad_norm": 0.3737245798110962, + "learning_rate": 0.00018845319018963075, + "loss": 1.2627, + "step": 4458 + }, + { + "epoch": 0.057942671320879144, + "grad_norm": 0.30794718861579895, + "learning_rate": 0.00018845059072771937, + "loss": 1.4222, + "step": 4459 + }, + { + "epoch": 0.05795566586479502, + "grad_norm": 0.2816654145717621, + "learning_rate": 0.000188447991265808, + "loss": 1.353, + "step": 4460 + }, + { + "epoch": 0.05796866040871089, + "grad_norm": 0.3791772425174713, + "learning_rate": 0.0001884453918038966, + "loss": 1.4008, + "step": 4461 + }, + { + "epoch": 0.05798165495262676, + "grad_norm": 0.43883711099624634, + "learning_rate": 0.00018844279234198522, + "loss": 1.5184, + "step": 4462 + }, + { + "epoch": 0.05799464949654264, + "grad_norm": 0.38996151089668274, + "learning_rate": 0.00018844019288007384, + "loss": 1.3953, + "step": 4463 + }, + { + "epoch": 0.058007644040458516, + "grad_norm": 0.4525420665740967, + "learning_rate": 0.00018843759341816244, + "loss": 1.5363, + "step": 4464 + }, + { + "epoch": 0.05802063858437439, + "grad_norm": 0.35998621582984924, + "learning_rate": 0.00018843499395625106, + "loss": 1.542, + "step": 4465 + }, + { + "epoch": 0.05803363312829026, + "grad_norm": 0.322517991065979, + "learning_rate": 0.00018843239449433966, + "loss": 1.4447, + "step": 4466 + }, + { + "epoch": 0.058046627672206134, + "grad_norm": 0.4839661419391632, + "learning_rate": 0.0001884297950324283, + "loss": 1.5355, + "step": 4467 + }, + { + "epoch": 0.05805962221612201, + "grad_norm": 0.48378366231918335, + "learning_rate": 0.0001884271955705169, + "loss": 1.464, + "step": 4468 + }, + { + "epoch": 0.05807261676003788, + "grad_norm": 0.4123730957508087, + "learning_rate": 0.00018842459610860554, + "loss": 1.4833, + "step": 4469 + }, + { + "epoch": 0.05808561130395375, + "grad_norm": 0.38946062326431274, + "learning_rate": 0.00018842199664669416, + "loss": 1.3464, + "step": 4470 + }, + { + "epoch": 0.058098605847869626, + "grad_norm": 0.37049511075019836, + "learning_rate": 0.00018841939718478276, + "loss": 1.4161, + "step": 4471 + }, + { + "epoch": 0.0581116003917855, + "grad_norm": 0.42215830087661743, + "learning_rate": 0.00018841679772287138, + "loss": 1.6647, + "step": 4472 + }, + { + "epoch": 0.05812459493570137, + "grad_norm": 0.45599594712257385, + "learning_rate": 0.00018841419826095998, + "loss": 1.5973, + "step": 4473 + }, + { + "epoch": 0.058137589479617245, + "grad_norm": 0.30034714937210083, + "learning_rate": 0.0001884115987990486, + "loss": 1.2091, + "step": 4474 + }, + { + "epoch": 0.05815058402353312, + "grad_norm": 0.3550974726676941, + "learning_rate": 0.00018840899933713723, + "loss": 1.3227, + "step": 4475 + }, + { + "epoch": 0.05816357856744899, + "grad_norm": 0.4022202789783478, + "learning_rate": 0.00018840639987522583, + "loss": 1.5537, + "step": 4476 + }, + { + "epoch": 0.058176573111364864, + "grad_norm": 0.39612770080566406, + "learning_rate": 0.00018840380041331445, + "loss": 1.6405, + "step": 4477 + }, + { + "epoch": 0.05818956765528074, + "grad_norm": 0.46339184045791626, + "learning_rate": 0.00018840120095140307, + "loss": 1.4874, + "step": 4478 + }, + { + "epoch": 0.05820256219919661, + "grad_norm": 0.3612900972366333, + "learning_rate": 0.0001883986014894917, + "loss": 1.3682, + "step": 4479 + }, + { + "epoch": 0.05821555674311248, + "grad_norm": 0.45932045578956604, + "learning_rate": 0.0001883960020275803, + "loss": 1.4133, + "step": 4480 + }, + { + "epoch": 0.058228551287028356, + "grad_norm": 0.4143752157688141, + "learning_rate": 0.00018839340256566892, + "loss": 1.4237, + "step": 4481 + }, + { + "epoch": 0.05824154583094423, + "grad_norm": 0.4116329848766327, + "learning_rate": 0.00018839080310375755, + "loss": 1.5742, + "step": 4482 + }, + { + "epoch": 0.05825454037486011, + "grad_norm": 0.42525044083595276, + "learning_rate": 0.00018838820364184614, + "loss": 1.233, + "step": 4483 + }, + { + "epoch": 0.05826753491877598, + "grad_norm": 0.33023613691329956, + "learning_rate": 0.00018838560417993477, + "loss": 1.3341, + "step": 4484 + }, + { + "epoch": 0.058280529462691855, + "grad_norm": 0.3632299304008484, + "learning_rate": 0.00018838300471802336, + "loss": 1.4923, + "step": 4485 + }, + { + "epoch": 0.05829352400660773, + "grad_norm": 0.35917314887046814, + "learning_rate": 0.00018838040525611202, + "loss": 1.4307, + "step": 4486 + }, + { + "epoch": 0.0583065185505236, + "grad_norm": 0.3781297206878662, + "learning_rate": 0.0001883778057942006, + "loss": 1.4574, + "step": 4487 + }, + { + "epoch": 0.058319513094439474, + "grad_norm": 0.3325479328632355, + "learning_rate": 0.0001883752063322892, + "loss": 1.4971, + "step": 4488 + }, + { + "epoch": 0.058332507638355346, + "grad_norm": 0.4469044804573059, + "learning_rate": 0.00018837260687037784, + "loss": 1.4701, + "step": 4489 + }, + { + "epoch": 0.05834550218227122, + "grad_norm": 0.4646832346916199, + "learning_rate": 0.00018837000740846646, + "loss": 1.4618, + "step": 4490 + }, + { + "epoch": 0.05835849672618709, + "grad_norm": 0.4738560616970062, + "learning_rate": 0.00018836740794655508, + "loss": 1.3937, + "step": 4491 + }, + { + "epoch": 0.058371491270102965, + "grad_norm": 0.37531280517578125, + "learning_rate": 0.00018836480848464368, + "loss": 1.3389, + "step": 4492 + }, + { + "epoch": 0.05838448581401884, + "grad_norm": 0.3102249205112457, + "learning_rate": 0.0001883622090227323, + "loss": 1.2293, + "step": 4493 + }, + { + "epoch": 0.05839748035793471, + "grad_norm": 0.32316166162490845, + "learning_rate": 0.00018835960956082093, + "loss": 1.5093, + "step": 4494 + }, + { + "epoch": 0.058410474901850584, + "grad_norm": 0.38363924622535706, + "learning_rate": 0.00018835701009890953, + "loss": 1.2756, + "step": 4495 + }, + { + "epoch": 0.05842346944576646, + "grad_norm": 0.3228376507759094, + "learning_rate": 0.00018835441063699815, + "loss": 1.4595, + "step": 4496 + }, + { + "epoch": 0.05843646398968233, + "grad_norm": 0.3308725357055664, + "learning_rate": 0.00018835181117508675, + "loss": 1.3428, + "step": 4497 + }, + { + "epoch": 0.0584494585335982, + "grad_norm": 0.4232613444328308, + "learning_rate": 0.0001883492117131754, + "loss": 1.4279, + "step": 4498 + }, + { + "epoch": 0.058462453077514076, + "grad_norm": 0.38894835114479065, + "learning_rate": 0.000188346612251264, + "loss": 1.4692, + "step": 4499 + }, + { + "epoch": 0.05847544762142995, + "grad_norm": 0.3871758282184601, + "learning_rate": 0.0001883440127893526, + "loss": 1.516, + "step": 4500 + }, + { + "epoch": 0.05848844216534582, + "grad_norm": 0.46854329109191895, + "learning_rate": 0.00018834141332744122, + "loss": 1.5492, + "step": 4501 + }, + { + "epoch": 0.0585014367092617, + "grad_norm": 0.3060224950313568, + "learning_rate": 0.00018833881386552985, + "loss": 1.3563, + "step": 4502 + }, + { + "epoch": 0.058514431253177575, + "grad_norm": 0.46446001529693604, + "learning_rate": 0.00018833621440361847, + "loss": 1.6972, + "step": 4503 + }, + { + "epoch": 0.05852742579709345, + "grad_norm": 0.3680477738380432, + "learning_rate": 0.00018833361494170707, + "loss": 1.6149, + "step": 4504 + }, + { + "epoch": 0.05854042034100932, + "grad_norm": 0.44431188702583313, + "learning_rate": 0.0001883310154797957, + "loss": 1.5737, + "step": 4505 + }, + { + "epoch": 0.058553414884925194, + "grad_norm": 0.44540688395500183, + "learning_rate": 0.00018832841601788432, + "loss": 1.348, + "step": 4506 + }, + { + "epoch": 0.05856640942884107, + "grad_norm": 0.3630962371826172, + "learning_rate": 0.0001883258165559729, + "loss": 1.4643, + "step": 4507 + }, + { + "epoch": 0.05857940397275694, + "grad_norm": 0.4329513609409332, + "learning_rate": 0.00018832321709406154, + "loss": 1.5085, + "step": 4508 + }, + { + "epoch": 0.05859239851667281, + "grad_norm": 0.44908472895622253, + "learning_rate": 0.00018832061763215016, + "loss": 1.3614, + "step": 4509 + }, + { + "epoch": 0.058605393060588686, + "grad_norm": 0.3840656876564026, + "learning_rate": 0.0001883180181702388, + "loss": 1.6086, + "step": 4510 + }, + { + "epoch": 0.05861838760450456, + "grad_norm": 0.4723275601863861, + "learning_rate": 0.00018831541870832738, + "loss": 1.4794, + "step": 4511 + }, + { + "epoch": 0.05863138214842043, + "grad_norm": 0.3561088442802429, + "learning_rate": 0.00018831281924641598, + "loss": 1.5629, + "step": 4512 + }, + { + "epoch": 0.058644376692336304, + "grad_norm": 0.5417634844779968, + "learning_rate": 0.00018831021978450463, + "loss": 1.5771, + "step": 4513 + }, + { + "epoch": 0.05865737123625218, + "grad_norm": 0.4606473445892334, + "learning_rate": 0.00018830762032259323, + "loss": 1.4187, + "step": 4514 + }, + { + "epoch": 0.05867036578016805, + "grad_norm": 0.3861846327781677, + "learning_rate": 0.00018830502086068186, + "loss": 1.3589, + "step": 4515 + }, + { + "epoch": 0.05868336032408392, + "grad_norm": 0.4596537947654724, + "learning_rate": 0.00018830242139877045, + "loss": 1.4268, + "step": 4516 + }, + { + "epoch": 0.058696354867999796, + "grad_norm": 0.43084535002708435, + "learning_rate": 0.00018829982193685908, + "loss": 1.5676, + "step": 4517 + }, + { + "epoch": 0.05870934941191567, + "grad_norm": 0.4324735105037689, + "learning_rate": 0.0001882972224749477, + "loss": 1.3924, + "step": 4518 + }, + { + "epoch": 0.05872234395583154, + "grad_norm": 0.4066638946533203, + "learning_rate": 0.0001882946230130363, + "loss": 1.4511, + "step": 4519 + }, + { + "epoch": 0.058735338499747415, + "grad_norm": 0.2944854199886322, + "learning_rate": 0.00018829202355112492, + "loss": 1.4473, + "step": 4520 + }, + { + "epoch": 0.058748333043663295, + "grad_norm": 0.518361508846283, + "learning_rate": 0.00018828942408921355, + "loss": 1.5047, + "step": 4521 + }, + { + "epoch": 0.05876132758757917, + "grad_norm": 0.5447002649307251, + "learning_rate": 0.00018828682462730217, + "loss": 1.3459, + "step": 4522 + }, + { + "epoch": 0.05877432213149504, + "grad_norm": 0.43478384613990784, + "learning_rate": 0.00018828422516539077, + "loss": 1.3106, + "step": 4523 + }, + { + "epoch": 0.058787316675410914, + "grad_norm": 0.2438018023967743, + "learning_rate": 0.0001882816257034794, + "loss": 1.342, + "step": 4524 + }, + { + "epoch": 0.05880031121932679, + "grad_norm": 0.3428383767604828, + "learning_rate": 0.00018827902624156802, + "loss": 1.5545, + "step": 4525 + }, + { + "epoch": 0.05881330576324266, + "grad_norm": 0.4866192638874054, + "learning_rate": 0.00018827642677965662, + "loss": 1.4466, + "step": 4526 + }, + { + "epoch": 0.05882630030715853, + "grad_norm": 0.43104222416877747, + "learning_rate": 0.00018827382731774524, + "loss": 1.386, + "step": 4527 + }, + { + "epoch": 0.058839294851074406, + "grad_norm": 0.46314769983291626, + "learning_rate": 0.00018827122785583384, + "loss": 1.4909, + "step": 4528 + }, + { + "epoch": 0.05885228939499028, + "grad_norm": 0.3360287845134735, + "learning_rate": 0.00018826862839392246, + "loss": 1.4474, + "step": 4529 + }, + { + "epoch": 0.05886528393890615, + "grad_norm": 0.41563481092453003, + "learning_rate": 0.0001882660289320111, + "loss": 1.4183, + "step": 4530 + }, + { + "epoch": 0.058878278482822025, + "grad_norm": 0.3972434997558594, + "learning_rate": 0.00018826342947009968, + "loss": 1.5192, + "step": 4531 + }, + { + "epoch": 0.0588912730267379, + "grad_norm": 0.4367322027683258, + "learning_rate": 0.0001882608300081883, + "loss": 1.7404, + "step": 4532 + }, + { + "epoch": 0.05890426757065377, + "grad_norm": 0.365024596452713, + "learning_rate": 0.00018825823054627693, + "loss": 1.4461, + "step": 4533 + }, + { + "epoch": 0.05891726211456964, + "grad_norm": 0.329790860414505, + "learning_rate": 0.00018825563108436556, + "loss": 1.2034, + "step": 4534 + }, + { + "epoch": 0.058930256658485516, + "grad_norm": 0.3816514015197754, + "learning_rate": 0.00018825303162245415, + "loss": 1.4533, + "step": 4535 + }, + { + "epoch": 0.05894325120240139, + "grad_norm": 0.3474125266075134, + "learning_rate": 0.00018825043216054278, + "loss": 1.3997, + "step": 4536 + }, + { + "epoch": 0.05895624574631726, + "grad_norm": 0.46969178318977356, + "learning_rate": 0.0001882478326986314, + "loss": 1.5531, + "step": 4537 + }, + { + "epoch": 0.058969240290233135, + "grad_norm": 0.33096548914909363, + "learning_rate": 0.00018824523323672, + "loss": 1.443, + "step": 4538 + }, + { + "epoch": 0.05898223483414901, + "grad_norm": 0.34960368275642395, + "learning_rate": 0.00018824263377480863, + "loss": 1.3415, + "step": 4539 + }, + { + "epoch": 0.05899522937806489, + "grad_norm": 0.4240265190601349, + "learning_rate": 0.00018824003431289722, + "loss": 1.5412, + "step": 4540 + }, + { + "epoch": 0.05900822392198076, + "grad_norm": 0.4648091793060303, + "learning_rate": 0.00018823743485098585, + "loss": 1.4664, + "step": 4541 + }, + { + "epoch": 0.059021218465896634, + "grad_norm": 0.34888583421707153, + "learning_rate": 0.00018823483538907447, + "loss": 1.354, + "step": 4542 + }, + { + "epoch": 0.05903421300981251, + "grad_norm": 0.4759249687194824, + "learning_rate": 0.00018823223592716307, + "loss": 1.4718, + "step": 4543 + }, + { + "epoch": 0.05904720755372838, + "grad_norm": 0.3777037262916565, + "learning_rate": 0.0001882296364652517, + "loss": 1.3386, + "step": 4544 + }, + { + "epoch": 0.05906020209764425, + "grad_norm": 0.4605162739753723, + "learning_rate": 0.00018822703700334032, + "loss": 1.5282, + "step": 4545 + }, + { + "epoch": 0.059073196641560126, + "grad_norm": 0.436426043510437, + "learning_rate": 0.00018822443754142894, + "loss": 1.5363, + "step": 4546 + }, + { + "epoch": 0.059086191185476, + "grad_norm": 0.40732067823410034, + "learning_rate": 0.00018822183807951754, + "loss": 1.6434, + "step": 4547 + }, + { + "epoch": 0.05909918572939187, + "grad_norm": 0.37293821573257446, + "learning_rate": 0.00018821923861760616, + "loss": 1.3742, + "step": 4548 + }, + { + "epoch": 0.059112180273307745, + "grad_norm": 0.42027273774147034, + "learning_rate": 0.0001882166391556948, + "loss": 1.4205, + "step": 4549 + }, + { + "epoch": 0.05912517481722362, + "grad_norm": 0.40116173028945923, + "learning_rate": 0.0001882140396937834, + "loss": 1.6846, + "step": 4550 + }, + { + "epoch": 0.05913816936113949, + "grad_norm": 0.48561277985572815, + "learning_rate": 0.000188211440231872, + "loss": 1.4225, + "step": 4551 + }, + { + "epoch": 0.059151163905055364, + "grad_norm": 0.3712165355682373, + "learning_rate": 0.00018820884076996064, + "loss": 1.2646, + "step": 4552 + }, + { + "epoch": 0.05916415844897124, + "grad_norm": 0.2929782271385193, + "learning_rate": 0.00018820624130804926, + "loss": 1.3504, + "step": 4553 + }, + { + "epoch": 0.05917715299288711, + "grad_norm": 0.4422702491283417, + "learning_rate": 0.00018820364184613786, + "loss": 1.5324, + "step": 4554 + }, + { + "epoch": 0.05919014753680298, + "grad_norm": 0.31200239062309265, + "learning_rate": 0.00018820104238422645, + "loss": 1.5455, + "step": 4555 + }, + { + "epoch": 0.059203142080718855, + "grad_norm": 0.32378989458084106, + "learning_rate": 0.0001881984429223151, + "loss": 1.2866, + "step": 4556 + }, + { + "epoch": 0.05921613662463473, + "grad_norm": 0.4474591016769409, + "learning_rate": 0.0001881958434604037, + "loss": 1.3814, + "step": 4557 + }, + { + "epoch": 0.0592291311685506, + "grad_norm": 0.39132970571517944, + "learning_rate": 0.00018819324399849233, + "loss": 1.3229, + "step": 4558 + }, + { + "epoch": 0.05924212571246648, + "grad_norm": 0.3722696900367737, + "learning_rate": 0.00018819064453658093, + "loss": 1.5352, + "step": 4559 + }, + { + "epoch": 0.059255120256382354, + "grad_norm": 0.36158639192581177, + "learning_rate": 0.00018818804507466955, + "loss": 1.4934, + "step": 4560 + }, + { + "epoch": 0.05926811480029823, + "grad_norm": 0.3869750201702118, + "learning_rate": 0.00018818544561275817, + "loss": 1.4109, + "step": 4561 + }, + { + "epoch": 0.0592811093442141, + "grad_norm": 0.28921765089035034, + "learning_rate": 0.00018818284615084677, + "loss": 1.4839, + "step": 4562 + }, + { + "epoch": 0.05929410388812997, + "grad_norm": 0.2798103988170624, + "learning_rate": 0.0001881802466889354, + "loss": 1.4997, + "step": 4563 + }, + { + "epoch": 0.059307098432045846, + "grad_norm": 0.4336410462856293, + "learning_rate": 0.00018817764722702402, + "loss": 1.4585, + "step": 4564 + }, + { + "epoch": 0.05932009297596172, + "grad_norm": 0.3858420252799988, + "learning_rate": 0.00018817504776511265, + "loss": 1.3882, + "step": 4565 + }, + { + "epoch": 0.05933308751987759, + "grad_norm": 0.346164345741272, + "learning_rate": 0.00018817244830320124, + "loss": 1.428, + "step": 4566 + }, + { + "epoch": 0.059346082063793465, + "grad_norm": 0.36873191595077515, + "learning_rate": 0.00018816984884128984, + "loss": 1.434, + "step": 4567 + }, + { + "epoch": 0.05935907660770934, + "grad_norm": 0.4173000156879425, + "learning_rate": 0.0001881672493793785, + "loss": 1.2551, + "step": 4568 + }, + { + "epoch": 0.05937207115162521, + "grad_norm": 0.47325408458709717, + "learning_rate": 0.0001881646499174671, + "loss": 1.3303, + "step": 4569 + }, + { + "epoch": 0.059385065695541084, + "grad_norm": 0.3723583519458771, + "learning_rate": 0.00018816205045555571, + "loss": 1.5589, + "step": 4570 + }, + { + "epoch": 0.05939806023945696, + "grad_norm": 0.4007658064365387, + "learning_rate": 0.0001881594509936443, + "loss": 1.6914, + "step": 4571 + }, + { + "epoch": 0.05941105478337283, + "grad_norm": 0.30833274126052856, + "learning_rate": 0.00018815685153173294, + "loss": 1.3943, + "step": 4572 + }, + { + "epoch": 0.0594240493272887, + "grad_norm": 0.38788431882858276, + "learning_rate": 0.00018815425206982156, + "loss": 1.4956, + "step": 4573 + }, + { + "epoch": 0.059437043871204576, + "grad_norm": 0.3323206603527069, + "learning_rate": 0.00018815165260791016, + "loss": 1.4467, + "step": 4574 + }, + { + "epoch": 0.05945003841512045, + "grad_norm": 0.32013601064682007, + "learning_rate": 0.00018814905314599878, + "loss": 1.294, + "step": 4575 + }, + { + "epoch": 0.05946303295903632, + "grad_norm": 0.3274105191230774, + "learning_rate": 0.0001881464536840874, + "loss": 1.5338, + "step": 4576 + }, + { + "epoch": 0.059476027502952195, + "grad_norm": 0.3312860429286957, + "learning_rate": 0.00018814385422217603, + "loss": 1.3967, + "step": 4577 + }, + { + "epoch": 0.059489022046868074, + "grad_norm": 0.3890517055988312, + "learning_rate": 0.00018814125476026463, + "loss": 1.3403, + "step": 4578 + }, + { + "epoch": 0.05950201659078395, + "grad_norm": 0.33841392397880554, + "learning_rate": 0.00018813865529835323, + "loss": 1.5476, + "step": 4579 + }, + { + "epoch": 0.05951501113469982, + "grad_norm": 0.3727584183216095, + "learning_rate": 0.00018813605583644188, + "loss": 1.6292, + "step": 4580 + }, + { + "epoch": 0.05952800567861569, + "grad_norm": 0.4647679328918457, + "learning_rate": 0.00018813345637453047, + "loss": 1.4954, + "step": 4581 + }, + { + "epoch": 0.059541000222531566, + "grad_norm": 0.4308343231678009, + "learning_rate": 0.0001881308569126191, + "loss": 1.4879, + "step": 4582 + }, + { + "epoch": 0.05955399476644744, + "grad_norm": 0.32289424538612366, + "learning_rate": 0.00018812825745070772, + "loss": 1.3234, + "step": 4583 + }, + { + "epoch": 0.05956698931036331, + "grad_norm": 0.25068745017051697, + "learning_rate": 0.00018812565798879632, + "loss": 1.3987, + "step": 4584 + }, + { + "epoch": 0.059579983854279185, + "grad_norm": 0.32612547278404236, + "learning_rate": 0.00018812305852688495, + "loss": 1.3596, + "step": 4585 + }, + { + "epoch": 0.05959297839819506, + "grad_norm": 0.3579429090023041, + "learning_rate": 0.00018812045906497354, + "loss": 1.2533, + "step": 4586 + }, + { + "epoch": 0.05960597294211093, + "grad_norm": 0.4372074604034424, + "learning_rate": 0.0001881178596030622, + "loss": 1.4473, + "step": 4587 + }, + { + "epoch": 0.059618967486026804, + "grad_norm": 0.30114609003067017, + "learning_rate": 0.0001881152601411508, + "loss": 1.276, + "step": 4588 + }, + { + "epoch": 0.05963196202994268, + "grad_norm": 0.4335648715496063, + "learning_rate": 0.00018811266067923942, + "loss": 1.2533, + "step": 4589 + }, + { + "epoch": 0.05964495657385855, + "grad_norm": 0.40057137608528137, + "learning_rate": 0.00018811006121732801, + "loss": 1.4782, + "step": 4590 + }, + { + "epoch": 0.05965795111777442, + "grad_norm": 0.43965011835098267, + "learning_rate": 0.00018810746175541664, + "loss": 1.4541, + "step": 4591 + }, + { + "epoch": 0.059670945661690296, + "grad_norm": 0.3410152792930603, + "learning_rate": 0.00018810486229350526, + "loss": 1.2469, + "step": 4592 + }, + { + "epoch": 0.05968394020560617, + "grad_norm": 0.3688991665840149, + "learning_rate": 0.00018810226283159386, + "loss": 1.6052, + "step": 4593 + }, + { + "epoch": 0.05969693474952204, + "grad_norm": 0.3528572916984558, + "learning_rate": 0.00018809966336968248, + "loss": 1.5222, + "step": 4594 + }, + { + "epoch": 0.059709929293437915, + "grad_norm": 0.4465098977088928, + "learning_rate": 0.0001880970639077711, + "loss": 1.4465, + "step": 4595 + }, + { + "epoch": 0.05972292383735379, + "grad_norm": 0.34993624687194824, + "learning_rate": 0.0001880944644458597, + "loss": 1.3766, + "step": 4596 + }, + { + "epoch": 0.05973591838126967, + "grad_norm": 0.4656573534011841, + "learning_rate": 0.00018809186498394833, + "loss": 1.359, + "step": 4597 + }, + { + "epoch": 0.05974891292518554, + "grad_norm": 0.3948744237422943, + "learning_rate": 0.00018808926552203693, + "loss": 1.4361, + "step": 4598 + }, + { + "epoch": 0.059761907469101413, + "grad_norm": 0.5141533613204956, + "learning_rate": 0.00018808666606012558, + "loss": 1.3905, + "step": 4599 + }, + { + "epoch": 0.059774902013017286, + "grad_norm": 0.3613419532775879, + "learning_rate": 0.00018808406659821418, + "loss": 1.2921, + "step": 4600 + }, + { + "epoch": 0.05978789655693316, + "grad_norm": 0.39984235167503357, + "learning_rate": 0.0001880814671363028, + "loss": 1.5183, + "step": 4601 + }, + { + "epoch": 0.05980089110084903, + "grad_norm": 0.3041442632675171, + "learning_rate": 0.0001880788676743914, + "loss": 1.4087, + "step": 4602 + }, + { + "epoch": 0.059813885644764905, + "grad_norm": 0.42544159293174744, + "learning_rate": 0.00018807626821248002, + "loss": 1.4614, + "step": 4603 + }, + { + "epoch": 0.05982688018868078, + "grad_norm": 0.4615595042705536, + "learning_rate": 0.00018807366875056865, + "loss": 1.6792, + "step": 4604 + }, + { + "epoch": 0.05983987473259665, + "grad_norm": 0.35686227679252625, + "learning_rate": 0.00018807106928865725, + "loss": 1.6299, + "step": 4605 + }, + { + "epoch": 0.059852869276512524, + "grad_norm": 0.3619399070739746, + "learning_rate": 0.00018806846982674587, + "loss": 1.2132, + "step": 4606 + }, + { + "epoch": 0.0598658638204284, + "grad_norm": 0.4124472141265869, + "learning_rate": 0.0001880658703648345, + "loss": 1.5429, + "step": 4607 + }, + { + "epoch": 0.05987885836434427, + "grad_norm": 0.22899313271045685, + "learning_rate": 0.00018806327090292312, + "loss": 1.0735, + "step": 4608 + }, + { + "epoch": 0.05989185290826014, + "grad_norm": 0.3973156213760376, + "learning_rate": 0.00018806067144101172, + "loss": 1.2878, + "step": 4609 + }, + { + "epoch": 0.059904847452176016, + "grad_norm": 0.38161152601242065, + "learning_rate": 0.00018805807197910031, + "loss": 1.2518, + "step": 4610 + }, + { + "epoch": 0.05991784199609189, + "grad_norm": 0.37226733565330505, + "learning_rate": 0.00018805547251718897, + "loss": 1.4684, + "step": 4611 + }, + { + "epoch": 0.05993083654000776, + "grad_norm": 0.30384692549705505, + "learning_rate": 0.00018805287305527756, + "loss": 1.4512, + "step": 4612 + }, + { + "epoch": 0.059943831083923635, + "grad_norm": 0.46904847025871277, + "learning_rate": 0.0001880502735933662, + "loss": 1.5138, + "step": 4613 + }, + { + "epoch": 0.05995682562783951, + "grad_norm": 0.41440069675445557, + "learning_rate": 0.00018804767413145478, + "loss": 1.4284, + "step": 4614 + }, + { + "epoch": 0.05996982017175538, + "grad_norm": 0.42137232422828674, + "learning_rate": 0.0001880450746695434, + "loss": 1.5833, + "step": 4615 + }, + { + "epoch": 0.05998281471567126, + "grad_norm": 0.40327513217926025, + "learning_rate": 0.00018804247520763203, + "loss": 1.6324, + "step": 4616 + }, + { + "epoch": 0.059995809259587134, + "grad_norm": 0.3549708425998688, + "learning_rate": 0.00018803987574572063, + "loss": 1.4376, + "step": 4617 + }, + { + "epoch": 0.06000880380350301, + "grad_norm": 0.3755921423435211, + "learning_rate": 0.00018803727628380926, + "loss": 1.4155, + "step": 4618 + }, + { + "epoch": 0.06002179834741888, + "grad_norm": 0.5553861856460571, + "learning_rate": 0.00018803467682189788, + "loss": 1.4298, + "step": 4619 + }, + { + "epoch": 0.06003479289133475, + "grad_norm": 0.40438908338546753, + "learning_rate": 0.0001880320773599865, + "loss": 1.6984, + "step": 4620 + }, + { + "epoch": 0.060047787435250626, + "grad_norm": 0.4536943733692169, + "learning_rate": 0.0001880294778980751, + "loss": 1.5542, + "step": 4621 + }, + { + "epoch": 0.0600607819791665, + "grad_norm": 0.3599386215209961, + "learning_rate": 0.00018802687843616373, + "loss": 1.3511, + "step": 4622 + }, + { + "epoch": 0.06007377652308237, + "grad_norm": 0.42840346693992615, + "learning_rate": 0.00018802427897425235, + "loss": 1.4751, + "step": 4623 + }, + { + "epoch": 0.060086771066998244, + "grad_norm": 0.3442873954772949, + "learning_rate": 0.00018802167951234095, + "loss": 1.2768, + "step": 4624 + }, + { + "epoch": 0.06009976561091412, + "grad_norm": 0.42270854115486145, + "learning_rate": 0.00018801908005042957, + "loss": 1.5792, + "step": 4625 + }, + { + "epoch": 0.06011276015482999, + "grad_norm": 0.4190753400325775, + "learning_rate": 0.0001880164805885182, + "loss": 1.2809, + "step": 4626 + }, + { + "epoch": 0.06012575469874586, + "grad_norm": 0.37923577427864075, + "learning_rate": 0.0001880138811266068, + "loss": 1.4505, + "step": 4627 + }, + { + "epoch": 0.060138749242661736, + "grad_norm": 0.3711230754852295, + "learning_rate": 0.00018801128166469542, + "loss": 1.5396, + "step": 4628 + }, + { + "epoch": 0.06015174378657761, + "grad_norm": 0.41335341334342957, + "learning_rate": 0.00018800868220278402, + "loss": 1.4764, + "step": 4629 + }, + { + "epoch": 0.06016473833049348, + "grad_norm": 0.3963650166988373, + "learning_rate": 0.00018800608274087267, + "loss": 1.4486, + "step": 4630 + }, + { + "epoch": 0.060177732874409355, + "grad_norm": 0.43489956855773926, + "learning_rate": 0.00018800348327896127, + "loss": 1.3373, + "step": 4631 + }, + { + "epoch": 0.06019072741832523, + "grad_norm": 0.3788653612136841, + "learning_rate": 0.0001880008838170499, + "loss": 1.6634, + "step": 4632 + }, + { + "epoch": 0.0602037219622411, + "grad_norm": 0.28406089544296265, + "learning_rate": 0.0001879982843551385, + "loss": 1.2853, + "step": 4633 + }, + { + "epoch": 0.060216716506156974, + "grad_norm": 0.3892311155796051, + "learning_rate": 0.0001879956848932271, + "loss": 1.3714, + "step": 4634 + }, + { + "epoch": 0.060229711050072854, + "grad_norm": 0.3391830623149872, + "learning_rate": 0.00018799308543131574, + "loss": 1.5051, + "step": 4635 + }, + { + "epoch": 0.06024270559398873, + "grad_norm": 0.37689408659935, + "learning_rate": 0.00018799048596940433, + "loss": 1.4626, + "step": 4636 + }, + { + "epoch": 0.0602557001379046, + "grad_norm": 0.4322948455810547, + "learning_rate": 0.00018798788650749296, + "loss": 1.345, + "step": 4637 + }, + { + "epoch": 0.06026869468182047, + "grad_norm": 0.34250393509864807, + "learning_rate": 0.00018798528704558158, + "loss": 1.5787, + "step": 4638 + }, + { + "epoch": 0.060281689225736346, + "grad_norm": 0.35673874616622925, + "learning_rate": 0.00018798268758367018, + "loss": 1.5439, + "step": 4639 + }, + { + "epoch": 0.06029468376965222, + "grad_norm": 0.3739342987537384, + "learning_rate": 0.0001879800881217588, + "loss": 1.3467, + "step": 4640 + }, + { + "epoch": 0.06030767831356809, + "grad_norm": 0.38367798924446106, + "learning_rate": 0.0001879774886598474, + "loss": 1.422, + "step": 4641 + }, + { + "epoch": 0.060320672857483965, + "grad_norm": 0.3680630326271057, + "learning_rate": 0.00018797488919793605, + "loss": 1.4557, + "step": 4642 + }, + { + "epoch": 0.06033366740139984, + "grad_norm": 0.3883708715438843, + "learning_rate": 0.00018797228973602465, + "loss": 1.4711, + "step": 4643 + }, + { + "epoch": 0.06034666194531571, + "grad_norm": 0.4156469404697418, + "learning_rate": 0.00018796969027411328, + "loss": 1.4911, + "step": 4644 + }, + { + "epoch": 0.06035965648923158, + "grad_norm": 0.3638401925563812, + "learning_rate": 0.00018796709081220187, + "loss": 1.3364, + "step": 4645 + }, + { + "epoch": 0.060372651033147456, + "grad_norm": 0.3887464702129364, + "learning_rate": 0.0001879644913502905, + "loss": 1.5257, + "step": 4646 + }, + { + "epoch": 0.06038564557706333, + "grad_norm": 0.3279300034046173, + "learning_rate": 0.00018796189188837912, + "loss": 1.4309, + "step": 4647 + }, + { + "epoch": 0.0603986401209792, + "grad_norm": 0.4504035711288452, + "learning_rate": 0.00018795929242646772, + "loss": 1.236, + "step": 4648 + }, + { + "epoch": 0.060411634664895075, + "grad_norm": 0.3183561861515045, + "learning_rate": 0.00018795669296455634, + "loss": 1.6423, + "step": 4649 + }, + { + "epoch": 0.06042462920881095, + "grad_norm": 0.37958523631095886, + "learning_rate": 0.00018795409350264497, + "loss": 1.3208, + "step": 4650 + }, + { + "epoch": 0.06043762375272682, + "grad_norm": 0.359591007232666, + "learning_rate": 0.00018795149404073357, + "loss": 1.3348, + "step": 4651 + }, + { + "epoch": 0.060450618296642694, + "grad_norm": 0.3713827133178711, + "learning_rate": 0.0001879488945788222, + "loss": 1.5255, + "step": 4652 + }, + { + "epoch": 0.06046361284055857, + "grad_norm": 0.5505270957946777, + "learning_rate": 0.0001879462951169108, + "loss": 1.3536, + "step": 4653 + }, + { + "epoch": 0.06047660738447445, + "grad_norm": 0.43987274169921875, + "learning_rate": 0.00018794369565499944, + "loss": 1.4315, + "step": 4654 + }, + { + "epoch": 0.06048960192839032, + "grad_norm": 0.5327964425086975, + "learning_rate": 0.00018794109619308804, + "loss": 1.4154, + "step": 4655 + }, + { + "epoch": 0.06050259647230619, + "grad_norm": 0.43529361486434937, + "learning_rate": 0.00018793849673117666, + "loss": 1.4928, + "step": 4656 + }, + { + "epoch": 0.060515591016222066, + "grad_norm": 0.36240583658218384, + "learning_rate": 0.00018793589726926528, + "loss": 1.3947, + "step": 4657 + }, + { + "epoch": 0.06052858556013794, + "grad_norm": 0.4684526026248932, + "learning_rate": 0.00018793329780735388, + "loss": 1.5496, + "step": 4658 + }, + { + "epoch": 0.06054158010405381, + "grad_norm": 0.29860028624534607, + "learning_rate": 0.0001879306983454425, + "loss": 1.5375, + "step": 4659 + }, + { + "epoch": 0.060554574647969685, + "grad_norm": 0.501872181892395, + "learning_rate": 0.0001879280988835311, + "loss": 1.5208, + "step": 4660 + }, + { + "epoch": 0.06056756919188556, + "grad_norm": 0.4405615031719208, + "learning_rate": 0.00018792549942161976, + "loss": 1.5717, + "step": 4661 + }, + { + "epoch": 0.06058056373580143, + "grad_norm": 0.3775745928287506, + "learning_rate": 0.00018792289995970835, + "loss": 1.3513, + "step": 4662 + }, + { + "epoch": 0.060593558279717304, + "grad_norm": 0.33774715662002563, + "learning_rate": 0.00018792030049779695, + "loss": 1.2951, + "step": 4663 + }, + { + "epoch": 0.06060655282363318, + "grad_norm": 0.43124672770500183, + "learning_rate": 0.00018791770103588558, + "loss": 1.589, + "step": 4664 + }, + { + "epoch": 0.06061954736754905, + "grad_norm": 0.3844517767429352, + "learning_rate": 0.0001879151015739742, + "loss": 1.3519, + "step": 4665 + }, + { + "epoch": 0.06063254191146492, + "grad_norm": 0.356585294008255, + "learning_rate": 0.00018791250211206282, + "loss": 1.3697, + "step": 4666 + }, + { + "epoch": 0.060645536455380795, + "grad_norm": 0.3608217239379883, + "learning_rate": 0.00018790990265015142, + "loss": 1.5346, + "step": 4667 + }, + { + "epoch": 0.06065853099929667, + "grad_norm": 0.5022678375244141, + "learning_rate": 0.00018790730318824005, + "loss": 1.5213, + "step": 4668 + }, + { + "epoch": 0.06067152554321254, + "grad_norm": 0.39539477229118347, + "learning_rate": 0.00018790470372632867, + "loss": 1.5392, + "step": 4669 + }, + { + "epoch": 0.060684520087128414, + "grad_norm": 0.3768894076347351, + "learning_rate": 0.00018790210426441727, + "loss": 1.5174, + "step": 4670 + }, + { + "epoch": 0.06069751463104429, + "grad_norm": 0.40725386142730713, + "learning_rate": 0.0001878995048025059, + "loss": 1.4684, + "step": 4671 + }, + { + "epoch": 0.06071050917496016, + "grad_norm": 0.5199422240257263, + "learning_rate": 0.0001878969053405945, + "loss": 1.4579, + "step": 4672 + }, + { + "epoch": 0.06072350371887604, + "grad_norm": 0.35900917649269104, + "learning_rate": 0.00018789430587868314, + "loss": 1.6107, + "step": 4673 + }, + { + "epoch": 0.06073649826279191, + "grad_norm": 0.37490200996398926, + "learning_rate": 0.00018789170641677174, + "loss": 1.4037, + "step": 4674 + }, + { + "epoch": 0.060749492806707786, + "grad_norm": 0.3832722008228302, + "learning_rate": 0.00018788910695486036, + "loss": 1.5209, + "step": 4675 + }, + { + "epoch": 0.06076248735062366, + "grad_norm": 0.4584009647369385, + "learning_rate": 0.00018788650749294896, + "loss": 1.4505, + "step": 4676 + }, + { + "epoch": 0.06077548189453953, + "grad_norm": 0.39893820881843567, + "learning_rate": 0.00018788390803103758, + "loss": 1.6227, + "step": 4677 + }, + { + "epoch": 0.060788476438455405, + "grad_norm": 0.4154282212257385, + "learning_rate": 0.0001878813085691262, + "loss": 1.53, + "step": 4678 + }, + { + "epoch": 0.06080147098237128, + "grad_norm": 0.4616670310497284, + "learning_rate": 0.0001878787091072148, + "loss": 1.5009, + "step": 4679 + }, + { + "epoch": 0.06081446552628715, + "grad_norm": 0.39531344175338745, + "learning_rate": 0.00018787610964530343, + "loss": 1.3302, + "step": 4680 + }, + { + "epoch": 0.060827460070203024, + "grad_norm": 0.41708120703697205, + "learning_rate": 0.00018787351018339206, + "loss": 1.4394, + "step": 4681 + }, + { + "epoch": 0.0608404546141189, + "grad_norm": 0.4020169675350189, + "learning_rate": 0.00018787091072148065, + "loss": 1.7117, + "step": 4682 + }, + { + "epoch": 0.06085344915803477, + "grad_norm": 0.42392462491989136, + "learning_rate": 0.00018786831125956928, + "loss": 1.481, + "step": 4683 + }, + { + "epoch": 0.06086644370195064, + "grad_norm": 0.476747989654541, + "learning_rate": 0.00018786571179765787, + "loss": 1.6551, + "step": 4684 + }, + { + "epoch": 0.060879438245866516, + "grad_norm": 0.339716374874115, + "learning_rate": 0.00018786311233574653, + "loss": 1.369, + "step": 4685 + }, + { + "epoch": 0.06089243278978239, + "grad_norm": 0.3359491229057312, + "learning_rate": 0.00018786051287383512, + "loss": 1.4248, + "step": 4686 + }, + { + "epoch": 0.06090542733369826, + "grad_norm": 0.4091237485408783, + "learning_rate": 0.00018785791341192375, + "loss": 1.5515, + "step": 4687 + }, + { + "epoch": 0.060918421877614135, + "grad_norm": 0.3720499277114868, + "learning_rate": 0.00018785531395001235, + "loss": 1.6032, + "step": 4688 + }, + { + "epoch": 0.06093141642153001, + "grad_norm": 0.4408172369003296, + "learning_rate": 0.00018785271448810097, + "loss": 1.4856, + "step": 4689 + }, + { + "epoch": 0.06094441096544588, + "grad_norm": 0.358181893825531, + "learning_rate": 0.0001878501150261896, + "loss": 1.4114, + "step": 4690 + }, + { + "epoch": 0.06095740550936175, + "grad_norm": 0.42284950613975525, + "learning_rate": 0.0001878475155642782, + "loss": 1.4355, + "step": 4691 + }, + { + "epoch": 0.06097040005327763, + "grad_norm": 0.34294018149375916, + "learning_rate": 0.00018784491610236682, + "loss": 1.425, + "step": 4692 + }, + { + "epoch": 0.060983394597193506, + "grad_norm": 0.3853449523448944, + "learning_rate": 0.00018784231664045544, + "loss": 1.553, + "step": 4693 + }, + { + "epoch": 0.06099638914110938, + "grad_norm": 0.44293487071990967, + "learning_rate": 0.00018783971717854404, + "loss": 1.605, + "step": 4694 + }, + { + "epoch": 0.06100938368502525, + "grad_norm": 0.30496087670326233, + "learning_rate": 0.00018783711771663266, + "loss": 1.323, + "step": 4695 + }, + { + "epoch": 0.061022378228941125, + "grad_norm": 0.37509262561798096, + "learning_rate": 0.0001878345182547213, + "loss": 1.3581, + "step": 4696 + }, + { + "epoch": 0.061035372772857, + "grad_norm": 0.3697527050971985, + "learning_rate": 0.0001878319187928099, + "loss": 1.44, + "step": 4697 + }, + { + "epoch": 0.06104836731677287, + "grad_norm": 0.3637860417366028, + "learning_rate": 0.0001878293193308985, + "loss": 1.5373, + "step": 4698 + }, + { + "epoch": 0.061061361860688744, + "grad_norm": 0.4187600612640381, + "learning_rate": 0.00018782671986898713, + "loss": 1.5917, + "step": 4699 + }, + { + "epoch": 0.06107435640460462, + "grad_norm": 0.40024304389953613, + "learning_rate": 0.00018782412040707576, + "loss": 1.4136, + "step": 4700 + }, + { + "epoch": 0.06108735094852049, + "grad_norm": 0.4038996696472168, + "learning_rate": 0.00018782152094516436, + "loss": 1.4541, + "step": 4701 + }, + { + "epoch": 0.06110034549243636, + "grad_norm": 0.42494386434555054, + "learning_rate": 0.00018781892148325298, + "loss": 1.3921, + "step": 4702 + }, + { + "epoch": 0.061113340036352236, + "grad_norm": 0.3595023453235626, + "learning_rate": 0.00018781632202134158, + "loss": 1.5208, + "step": 4703 + }, + { + "epoch": 0.06112633458026811, + "grad_norm": 0.4407199025154114, + "learning_rate": 0.00018781372255943023, + "loss": 1.6047, + "step": 4704 + }, + { + "epoch": 0.06113932912418398, + "grad_norm": 0.3866972029209137, + "learning_rate": 0.00018781112309751883, + "loss": 1.4929, + "step": 4705 + }, + { + "epoch": 0.061152323668099855, + "grad_norm": 0.38850200176239014, + "learning_rate": 0.00018780852363560742, + "loss": 1.5825, + "step": 4706 + }, + { + "epoch": 0.06116531821201573, + "grad_norm": 0.4282163977622986, + "learning_rate": 0.00018780592417369605, + "loss": 1.4532, + "step": 4707 + }, + { + "epoch": 0.0611783127559316, + "grad_norm": 0.4159630835056305, + "learning_rate": 0.00018780332471178467, + "loss": 1.2992, + "step": 4708 + }, + { + "epoch": 0.061191307299847474, + "grad_norm": 0.3920672535896301, + "learning_rate": 0.0001878007252498733, + "loss": 1.5302, + "step": 4709 + }, + { + "epoch": 0.06120430184376335, + "grad_norm": 0.4455764591693878, + "learning_rate": 0.0001877981257879619, + "loss": 1.559, + "step": 4710 + }, + { + "epoch": 0.061217296387679226, + "grad_norm": 0.3193241357803345, + "learning_rate": 0.00018779552632605052, + "loss": 1.1913, + "step": 4711 + }, + { + "epoch": 0.0612302909315951, + "grad_norm": 0.3334447741508484, + "learning_rate": 0.00018779292686413914, + "loss": 1.3724, + "step": 4712 + }, + { + "epoch": 0.06124328547551097, + "grad_norm": 0.31845778226852417, + "learning_rate": 0.00018779032740222774, + "loss": 1.3204, + "step": 4713 + }, + { + "epoch": 0.061256280019426845, + "grad_norm": 0.3759171664714813, + "learning_rate": 0.00018778772794031637, + "loss": 1.5403, + "step": 4714 + }, + { + "epoch": 0.06126927456334272, + "grad_norm": 0.41342827677726746, + "learning_rate": 0.00018778512847840496, + "loss": 1.2863, + "step": 4715 + }, + { + "epoch": 0.06128226910725859, + "grad_norm": 0.3663657009601593, + "learning_rate": 0.00018778252901649361, + "loss": 1.5629, + "step": 4716 + }, + { + "epoch": 0.061295263651174464, + "grad_norm": 0.41584208607673645, + "learning_rate": 0.0001877799295545822, + "loss": 1.4875, + "step": 4717 + }, + { + "epoch": 0.06130825819509034, + "grad_norm": 0.3094605505466461, + "learning_rate": 0.0001877773300926708, + "loss": 1.4665, + "step": 4718 + }, + { + "epoch": 0.06132125273900621, + "grad_norm": 0.29234790802001953, + "learning_rate": 0.00018777473063075943, + "loss": 1.2721, + "step": 4719 + }, + { + "epoch": 0.06133424728292208, + "grad_norm": 0.3660517930984497, + "learning_rate": 0.00018777213116884806, + "loss": 1.4399, + "step": 4720 + }, + { + "epoch": 0.061347241826837956, + "grad_norm": 0.40468522906303406, + "learning_rate": 0.00018776953170693668, + "loss": 1.4982, + "step": 4721 + }, + { + "epoch": 0.06136023637075383, + "grad_norm": 0.34839415550231934, + "learning_rate": 0.00018776693224502528, + "loss": 1.3703, + "step": 4722 + }, + { + "epoch": 0.0613732309146697, + "grad_norm": 0.36361679434776306, + "learning_rate": 0.0001877643327831139, + "loss": 1.4074, + "step": 4723 + }, + { + "epoch": 0.061386225458585575, + "grad_norm": 0.3819040060043335, + "learning_rate": 0.00018776173332120253, + "loss": 1.3619, + "step": 4724 + }, + { + "epoch": 0.06139922000250145, + "grad_norm": 0.4625639319419861, + "learning_rate": 0.00018775913385929113, + "loss": 1.4666, + "step": 4725 + }, + { + "epoch": 0.06141221454641732, + "grad_norm": 0.38273537158966064, + "learning_rate": 0.00018775653439737975, + "loss": 1.4758, + "step": 4726 + }, + { + "epoch": 0.061425209090333194, + "grad_norm": 0.32814735174179077, + "learning_rate": 0.00018775393493546835, + "loss": 1.3391, + "step": 4727 + }, + { + "epoch": 0.06143820363424907, + "grad_norm": 0.4361976087093353, + "learning_rate": 0.000187751335473557, + "loss": 1.386, + "step": 4728 + }, + { + "epoch": 0.06145119817816494, + "grad_norm": 0.470562607049942, + "learning_rate": 0.0001877487360116456, + "loss": 1.3806, + "step": 4729 + }, + { + "epoch": 0.06146419272208082, + "grad_norm": 0.48570799827575684, + "learning_rate": 0.00018774613654973422, + "loss": 1.2079, + "step": 4730 + }, + { + "epoch": 0.06147718726599669, + "grad_norm": 0.5506556630134583, + "learning_rate": 0.00018774353708782285, + "loss": 1.4277, + "step": 4731 + }, + { + "epoch": 0.061490181809912565, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00018774093762591144, + "loss": 1.2928, + "step": 4732 + }, + { + "epoch": 0.06150317635382844, + "grad_norm": 0.3608209788799286, + "learning_rate": 0.00018773833816400007, + "loss": 1.1566, + "step": 4733 + }, + { + "epoch": 0.06151617089774431, + "grad_norm": 0.4654795825481415, + "learning_rate": 0.00018773573870208867, + "loss": 1.3341, + "step": 4734 + }, + { + "epoch": 0.061529165441660184, + "grad_norm": 0.26536867022514343, + "learning_rate": 0.0001877331392401773, + "loss": 1.3326, + "step": 4735 + }, + { + "epoch": 0.06154215998557606, + "grad_norm": 0.44267329573631287, + "learning_rate": 0.00018773053977826591, + "loss": 1.4208, + "step": 4736 + }, + { + "epoch": 0.06155515452949193, + "grad_norm": 0.3890420198440552, + "learning_rate": 0.0001877279403163545, + "loss": 1.4988, + "step": 4737 + }, + { + "epoch": 0.0615681490734078, + "grad_norm": 0.3623405992984772, + "learning_rate": 0.00018772534085444314, + "loss": 1.3181, + "step": 4738 + }, + { + "epoch": 0.061581143617323676, + "grad_norm": 0.46809765696525574, + "learning_rate": 0.00018772274139253176, + "loss": 1.5969, + "step": 4739 + }, + { + "epoch": 0.06159413816123955, + "grad_norm": 0.4159347116947174, + "learning_rate": 0.00018772014193062039, + "loss": 1.4688, + "step": 4740 + }, + { + "epoch": 0.06160713270515542, + "grad_norm": 0.36170199513435364, + "learning_rate": 0.00018771754246870898, + "loss": 1.5292, + "step": 4741 + }, + { + "epoch": 0.061620127249071295, + "grad_norm": 0.4478611350059509, + "learning_rate": 0.0001877149430067976, + "loss": 1.4147, + "step": 4742 + }, + { + "epoch": 0.06163312179298717, + "grad_norm": 0.33543694019317627, + "learning_rate": 0.00018771234354488623, + "loss": 1.3701, + "step": 4743 + }, + { + "epoch": 0.06164611633690304, + "grad_norm": 0.43642449378967285, + "learning_rate": 0.00018770974408297483, + "loss": 1.5713, + "step": 4744 + }, + { + "epoch": 0.061659110880818914, + "grad_norm": 0.4053778350353241, + "learning_rate": 0.00018770714462106345, + "loss": 1.5266, + "step": 4745 + }, + { + "epoch": 0.06167210542473479, + "grad_norm": 0.41506728529930115, + "learning_rate": 0.00018770454515915205, + "loss": 1.2939, + "step": 4746 + }, + { + "epoch": 0.06168509996865066, + "grad_norm": 0.30604878067970276, + "learning_rate": 0.00018770194569724068, + "loss": 1.2528, + "step": 4747 + }, + { + "epoch": 0.06169809451256653, + "grad_norm": 0.48034870624542236, + "learning_rate": 0.0001876993462353293, + "loss": 1.5633, + "step": 4748 + }, + { + "epoch": 0.06171108905648241, + "grad_norm": 0.3947703242301941, + "learning_rate": 0.0001876967467734179, + "loss": 1.4401, + "step": 4749 + }, + { + "epoch": 0.061724083600398286, + "grad_norm": 0.43567919731140137, + "learning_rate": 0.00018769414731150652, + "loss": 1.5715, + "step": 4750 + }, + { + "epoch": 0.06173707814431416, + "grad_norm": 0.37029874324798584, + "learning_rate": 0.00018769154784959515, + "loss": 1.3434, + "step": 4751 + }, + { + "epoch": 0.06175007268823003, + "grad_norm": 0.3966732919216156, + "learning_rate": 0.00018768894838768377, + "loss": 1.3285, + "step": 4752 + }, + { + "epoch": 0.061763067232145905, + "grad_norm": 0.3607752025127411, + "learning_rate": 0.00018768634892577237, + "loss": 1.4862, + "step": 4753 + }, + { + "epoch": 0.06177606177606178, + "grad_norm": 0.37333714962005615, + "learning_rate": 0.000187683749463861, + "loss": 1.493, + "step": 4754 + }, + { + "epoch": 0.06178905631997765, + "grad_norm": 0.38883668184280396, + "learning_rate": 0.00018768115000194962, + "loss": 1.3699, + "step": 4755 + }, + { + "epoch": 0.06180205086389352, + "grad_norm": 0.4387046694755554, + "learning_rate": 0.00018767855054003821, + "loss": 1.7208, + "step": 4756 + }, + { + "epoch": 0.061815045407809396, + "grad_norm": 0.33561962842941284, + "learning_rate": 0.00018767595107812684, + "loss": 1.4431, + "step": 4757 + }, + { + "epoch": 0.06182803995172527, + "grad_norm": 0.40161532163619995, + "learning_rate": 0.00018767335161621544, + "loss": 1.4407, + "step": 4758 + }, + { + "epoch": 0.06184103449564114, + "grad_norm": 0.4498758614063263, + "learning_rate": 0.0001876707521543041, + "loss": 1.4239, + "step": 4759 + }, + { + "epoch": 0.061854029039557015, + "grad_norm": 0.5349181890487671, + "learning_rate": 0.00018766815269239269, + "loss": 1.4826, + "step": 4760 + }, + { + "epoch": 0.06186702358347289, + "grad_norm": 0.44247332215309143, + "learning_rate": 0.00018766555323048128, + "loss": 1.4883, + "step": 4761 + }, + { + "epoch": 0.06188001812738876, + "grad_norm": 0.40534019470214844, + "learning_rate": 0.0001876629537685699, + "loss": 1.5636, + "step": 4762 + }, + { + "epoch": 0.061893012671304634, + "grad_norm": 0.4621119201183319, + "learning_rate": 0.00018766035430665853, + "loss": 1.5419, + "step": 4763 + }, + { + "epoch": 0.06190600721522051, + "grad_norm": 0.3977503478527069, + "learning_rate": 0.00018765775484474716, + "loss": 1.2825, + "step": 4764 + }, + { + "epoch": 0.06191900175913638, + "grad_norm": 0.40935274958610535, + "learning_rate": 0.00018765515538283575, + "loss": 1.423, + "step": 4765 + }, + { + "epoch": 0.06193199630305225, + "grad_norm": 0.35663458704948425, + "learning_rate": 0.00018765255592092438, + "loss": 1.3097, + "step": 4766 + }, + { + "epoch": 0.061944990846968126, + "grad_norm": 0.34516972303390503, + "learning_rate": 0.000187649956459013, + "loss": 1.4726, + "step": 4767 + }, + { + "epoch": 0.061957985390884006, + "grad_norm": 0.3514779806137085, + "learning_rate": 0.0001876473569971016, + "loss": 1.3543, + "step": 4768 + }, + { + "epoch": 0.06197097993479988, + "grad_norm": 0.3431565761566162, + "learning_rate": 0.00018764475753519022, + "loss": 1.4397, + "step": 4769 + }, + { + "epoch": 0.06198397447871575, + "grad_norm": 0.29300588369369507, + "learning_rate": 0.00018764215807327885, + "loss": 1.4131, + "step": 4770 + }, + { + "epoch": 0.061996969022631625, + "grad_norm": 0.42205876111984253, + "learning_rate": 0.00018763955861136747, + "loss": 1.5009, + "step": 4771 + }, + { + "epoch": 0.0620099635665475, + "grad_norm": 0.43114832043647766, + "learning_rate": 0.00018763695914945607, + "loss": 1.4078, + "step": 4772 + }, + { + "epoch": 0.06202295811046337, + "grad_norm": 0.3251851201057434, + "learning_rate": 0.00018763435968754467, + "loss": 1.3527, + "step": 4773 + }, + { + "epoch": 0.062035952654379244, + "grad_norm": 0.4490157663822174, + "learning_rate": 0.00018763176022563332, + "loss": 1.4597, + "step": 4774 + }, + { + "epoch": 0.06204894719829512, + "grad_norm": 0.36808133125305176, + "learning_rate": 0.00018762916076372192, + "loss": 1.4885, + "step": 4775 + }, + { + "epoch": 0.06206194174221099, + "grad_norm": 0.4256531298160553, + "learning_rate": 0.00018762656130181054, + "loss": 1.4016, + "step": 4776 + }, + { + "epoch": 0.06207493628612686, + "grad_norm": 0.3940003216266632, + "learning_rate": 0.00018762396183989914, + "loss": 1.5645, + "step": 4777 + }, + { + "epoch": 0.062087930830042735, + "grad_norm": 0.4098092019557953, + "learning_rate": 0.00018762136237798776, + "loss": 1.3158, + "step": 4778 + }, + { + "epoch": 0.06210092537395861, + "grad_norm": 0.4216630458831787, + "learning_rate": 0.0001876187629160764, + "loss": 1.5037, + "step": 4779 + }, + { + "epoch": 0.06211391991787448, + "grad_norm": 0.2802329659461975, + "learning_rate": 0.00018761616345416499, + "loss": 1.2852, + "step": 4780 + }, + { + "epoch": 0.062126914461790354, + "grad_norm": 0.4184139668941498, + "learning_rate": 0.0001876135639922536, + "loss": 1.4164, + "step": 4781 + }, + { + "epoch": 0.06213990900570623, + "grad_norm": 0.4360269606113434, + "learning_rate": 0.00018761096453034223, + "loss": 1.6717, + "step": 4782 + }, + { + "epoch": 0.0621529035496221, + "grad_norm": 0.37473347783088684, + "learning_rate": 0.00018760836506843086, + "loss": 1.3978, + "step": 4783 + }, + { + "epoch": 0.06216589809353797, + "grad_norm": 0.5111908912658691, + "learning_rate": 0.00018760576560651946, + "loss": 1.4218, + "step": 4784 + }, + { + "epoch": 0.062178892637453846, + "grad_norm": 0.5529409646987915, + "learning_rate": 0.00018760316614460805, + "loss": 1.5151, + "step": 4785 + }, + { + "epoch": 0.06219188718136972, + "grad_norm": 0.42840296030044556, + "learning_rate": 0.0001876005666826967, + "loss": 1.5215, + "step": 4786 + }, + { + "epoch": 0.0622048817252856, + "grad_norm": 0.39297595620155334, + "learning_rate": 0.0001875979672207853, + "loss": 1.5035, + "step": 4787 + }, + { + "epoch": 0.06221787626920147, + "grad_norm": 0.3373584449291229, + "learning_rate": 0.00018759536775887393, + "loss": 1.38, + "step": 4788 + }, + { + "epoch": 0.062230870813117345, + "grad_norm": 0.36476561427116394, + "learning_rate": 0.00018759276829696252, + "loss": 1.363, + "step": 4789 + }, + { + "epoch": 0.06224386535703322, + "grad_norm": 0.3945012092590332, + "learning_rate": 0.00018759016883505115, + "loss": 1.4091, + "step": 4790 + }, + { + "epoch": 0.06225685990094909, + "grad_norm": 0.3533738851547241, + "learning_rate": 0.00018758756937313977, + "loss": 1.1834, + "step": 4791 + }, + { + "epoch": 0.062269854444864964, + "grad_norm": 0.3350159227848053, + "learning_rate": 0.00018758496991122837, + "loss": 1.2534, + "step": 4792 + }, + { + "epoch": 0.06228284898878084, + "grad_norm": 0.38589489459991455, + "learning_rate": 0.000187582370449317, + "loss": 1.3611, + "step": 4793 + }, + { + "epoch": 0.06229584353269671, + "grad_norm": 0.4569569230079651, + "learning_rate": 0.00018757977098740562, + "loss": 1.4845, + "step": 4794 + }, + { + "epoch": 0.06230883807661258, + "grad_norm": 0.38989877700805664, + "learning_rate": 0.00018757717152549424, + "loss": 1.2864, + "step": 4795 + }, + { + "epoch": 0.062321832620528456, + "grad_norm": 0.3836038410663605, + "learning_rate": 0.00018757457206358284, + "loss": 1.4262, + "step": 4796 + }, + { + "epoch": 0.06233482716444433, + "grad_norm": 0.3546046018600464, + "learning_rate": 0.00018757197260167147, + "loss": 1.4234, + "step": 4797 + }, + { + "epoch": 0.0623478217083602, + "grad_norm": 0.3120918273925781, + "learning_rate": 0.0001875693731397601, + "loss": 1.4014, + "step": 4798 + }, + { + "epoch": 0.062360816252276075, + "grad_norm": 0.5139092206954956, + "learning_rate": 0.0001875667736778487, + "loss": 1.6836, + "step": 4799 + }, + { + "epoch": 0.06237381079619195, + "grad_norm": 0.3254159986972809, + "learning_rate": 0.0001875641742159373, + "loss": 1.3511, + "step": 4800 + }, + { + "epoch": 0.06238680534010782, + "grad_norm": 0.3734841048717499, + "learning_rate": 0.0001875615747540259, + "loss": 1.5588, + "step": 4801 + }, + { + "epoch": 0.06239979988402369, + "grad_norm": 0.38808658719062805, + "learning_rate": 0.00018755897529211453, + "loss": 1.5629, + "step": 4802 + }, + { + "epoch": 0.062412794427939566, + "grad_norm": 0.48189741373062134, + "learning_rate": 0.00018755637583020316, + "loss": 1.4998, + "step": 4803 + }, + { + "epoch": 0.06242578897185544, + "grad_norm": 0.4589759409427643, + "learning_rate": 0.00018755377636829176, + "loss": 1.5244, + "step": 4804 + }, + { + "epoch": 0.06243878351577131, + "grad_norm": 0.38827449083328247, + "learning_rate": 0.0001875511769063804, + "loss": 1.5605, + "step": 4805 + }, + { + "epoch": 0.06245177805968719, + "grad_norm": 0.35549396276474, + "learning_rate": 0.000187548577444469, + "loss": 1.4121, + "step": 4806 + }, + { + "epoch": 0.062464772603603065, + "grad_norm": 0.36884772777557373, + "learning_rate": 0.00018754597798255763, + "loss": 1.4399, + "step": 4807 + }, + { + "epoch": 0.06247776714751894, + "grad_norm": 0.41779768466949463, + "learning_rate": 0.00018754337852064623, + "loss": 1.6428, + "step": 4808 + }, + { + "epoch": 0.06249076169143481, + "grad_norm": 0.345687597990036, + "learning_rate": 0.00018754077905873485, + "loss": 1.3752, + "step": 4809 + }, + { + "epoch": 0.06250375623535068, + "grad_norm": 0.29694631695747375, + "learning_rate": 0.00018753817959682348, + "loss": 1.4213, + "step": 4810 + }, + { + "epoch": 0.06251675077926655, + "grad_norm": 0.41125616431236267, + "learning_rate": 0.00018753558013491207, + "loss": 1.3525, + "step": 4811 + }, + { + "epoch": 0.06252974532318242, + "grad_norm": 0.4661087393760681, + "learning_rate": 0.0001875329806730007, + "loss": 1.6321, + "step": 4812 + }, + { + "epoch": 0.0625427398670983, + "grad_norm": 0.40610751509666443, + "learning_rate": 0.00018753038121108932, + "loss": 1.4283, + "step": 4813 + }, + { + "epoch": 0.06255573441101417, + "grad_norm": 0.3044849634170532, + "learning_rate": 0.00018752778174917795, + "loss": 1.3231, + "step": 4814 + }, + { + "epoch": 0.06256872895493004, + "grad_norm": 0.44375497102737427, + "learning_rate": 0.00018752518228726654, + "loss": 1.5793, + "step": 4815 + }, + { + "epoch": 0.06258172349884593, + "grad_norm": 0.463663250207901, + "learning_rate": 0.00018752258282535514, + "loss": 1.5444, + "step": 4816 + }, + { + "epoch": 0.0625947180427618, + "grad_norm": 0.34532085061073303, + "learning_rate": 0.0001875199833634438, + "loss": 1.4096, + "step": 4817 + }, + { + "epoch": 0.06260771258667767, + "grad_norm": 0.4654233753681183, + "learning_rate": 0.0001875173839015324, + "loss": 1.7286, + "step": 4818 + }, + { + "epoch": 0.06262070713059355, + "grad_norm": 0.4802190065383911, + "learning_rate": 0.00018751478443962101, + "loss": 1.4878, + "step": 4819 + }, + { + "epoch": 0.06263370167450942, + "grad_norm": 0.3427649438381195, + "learning_rate": 0.0001875121849777096, + "loss": 1.6239, + "step": 4820 + }, + { + "epoch": 0.0626466962184253, + "grad_norm": 0.31176993250846863, + "learning_rate": 0.00018750958551579824, + "loss": 1.4806, + "step": 4821 + }, + { + "epoch": 0.06265969076234117, + "grad_norm": 0.43891656398773193, + "learning_rate": 0.00018750698605388686, + "loss": 1.4473, + "step": 4822 + }, + { + "epoch": 0.06267268530625704, + "grad_norm": 0.3852303624153137, + "learning_rate": 0.00018750438659197546, + "loss": 1.2812, + "step": 4823 + }, + { + "epoch": 0.06268567985017291, + "grad_norm": 0.3743630349636078, + "learning_rate": 0.00018750178713006408, + "loss": 1.3848, + "step": 4824 + }, + { + "epoch": 0.06269867439408879, + "grad_norm": 0.3716904819011688, + "learning_rate": 0.0001874991876681527, + "loss": 1.3388, + "step": 4825 + }, + { + "epoch": 0.06271166893800466, + "grad_norm": 0.44633615016937256, + "learning_rate": 0.00018749658820624133, + "loss": 1.5431, + "step": 4826 + }, + { + "epoch": 0.06272466348192053, + "grad_norm": 0.36774638295173645, + "learning_rate": 0.00018749398874432993, + "loss": 1.247, + "step": 4827 + }, + { + "epoch": 0.0627376580258364, + "grad_norm": 0.3416709303855896, + "learning_rate": 0.00018749138928241853, + "loss": 1.4417, + "step": 4828 + }, + { + "epoch": 0.06275065256975228, + "grad_norm": 0.39817753434181213, + "learning_rate": 0.00018748878982050718, + "loss": 1.4612, + "step": 4829 + }, + { + "epoch": 0.06276364711366815, + "grad_norm": 0.4371894299983978, + "learning_rate": 0.00018748619035859578, + "loss": 1.3331, + "step": 4830 + }, + { + "epoch": 0.06277664165758402, + "grad_norm": 0.3970278203487396, + "learning_rate": 0.0001874835908966844, + "loss": 1.3768, + "step": 4831 + }, + { + "epoch": 0.0627896362014999, + "grad_norm": 0.4924564063549042, + "learning_rate": 0.000187480991434773, + "loss": 1.4808, + "step": 4832 + }, + { + "epoch": 0.06280263074541577, + "grad_norm": 0.4114846885204315, + "learning_rate": 0.00018747839197286162, + "loss": 1.479, + "step": 4833 + }, + { + "epoch": 0.06281562528933164, + "grad_norm": 0.4020672142505646, + "learning_rate": 0.00018747579251095025, + "loss": 1.6144, + "step": 4834 + }, + { + "epoch": 0.06282861983324751, + "grad_norm": 0.34900376200675964, + "learning_rate": 0.00018747319304903884, + "loss": 1.2328, + "step": 4835 + }, + { + "epoch": 0.06284161437716339, + "grad_norm": 0.3972448706626892, + "learning_rate": 0.00018747059358712747, + "loss": 1.3904, + "step": 4836 + }, + { + "epoch": 0.06285460892107926, + "grad_norm": 0.49019351601600647, + "learning_rate": 0.0001874679941252161, + "loss": 1.7153, + "step": 4837 + }, + { + "epoch": 0.06286760346499513, + "grad_norm": 0.2971283793449402, + "learning_rate": 0.00018746539466330472, + "loss": 1.4029, + "step": 4838 + }, + { + "epoch": 0.062880598008911, + "grad_norm": 0.2929637134075165, + "learning_rate": 0.00018746279520139331, + "loss": 1.4402, + "step": 4839 + }, + { + "epoch": 0.06289359255282688, + "grad_norm": 0.34337177872657776, + "learning_rate": 0.0001874601957394819, + "loss": 1.3277, + "step": 4840 + }, + { + "epoch": 0.06290658709674275, + "grad_norm": 0.49686381220817566, + "learning_rate": 0.00018745759627757056, + "loss": 1.3806, + "step": 4841 + }, + { + "epoch": 0.06291958164065863, + "grad_norm": 0.4317689538002014, + "learning_rate": 0.00018745499681565916, + "loss": 1.4662, + "step": 4842 + }, + { + "epoch": 0.0629325761845745, + "grad_norm": 0.35114434361457825, + "learning_rate": 0.00018745239735374779, + "loss": 1.6053, + "step": 4843 + }, + { + "epoch": 0.06294557072849037, + "grad_norm": 0.46290478110313416, + "learning_rate": 0.0001874497978918364, + "loss": 1.6538, + "step": 4844 + }, + { + "epoch": 0.06295856527240624, + "grad_norm": 0.38073739409446716, + "learning_rate": 0.000187447198429925, + "loss": 1.5796, + "step": 4845 + }, + { + "epoch": 0.06297155981632212, + "grad_norm": 0.3745887875556946, + "learning_rate": 0.00018744459896801363, + "loss": 1.3767, + "step": 4846 + }, + { + "epoch": 0.06298455436023799, + "grad_norm": 0.35628604888916016, + "learning_rate": 0.00018744199950610223, + "loss": 1.5253, + "step": 4847 + }, + { + "epoch": 0.06299754890415386, + "grad_norm": 0.39481082558631897, + "learning_rate": 0.00018743940004419088, + "loss": 1.5285, + "step": 4848 + }, + { + "epoch": 0.06301054344806974, + "grad_norm": 0.40641120076179504, + "learning_rate": 0.00018743680058227948, + "loss": 1.6706, + "step": 4849 + }, + { + "epoch": 0.06302353799198561, + "grad_norm": 0.41250911355018616, + "learning_rate": 0.0001874342011203681, + "loss": 1.4065, + "step": 4850 + }, + { + "epoch": 0.06303653253590148, + "grad_norm": 0.465817928314209, + "learning_rate": 0.0001874316016584567, + "loss": 1.4455, + "step": 4851 + }, + { + "epoch": 0.06304952707981736, + "grad_norm": 0.31983256340026855, + "learning_rate": 0.00018742900219654532, + "loss": 1.4688, + "step": 4852 + }, + { + "epoch": 0.06306252162373323, + "grad_norm": 0.3200644254684448, + "learning_rate": 0.00018742640273463395, + "loss": 1.2597, + "step": 4853 + }, + { + "epoch": 0.06307551616764911, + "grad_norm": 0.38094374537467957, + "learning_rate": 0.00018742380327272255, + "loss": 1.3426, + "step": 4854 + }, + { + "epoch": 0.06308851071156499, + "grad_norm": 0.3403262495994568, + "learning_rate": 0.00018742120381081117, + "loss": 1.5393, + "step": 4855 + }, + { + "epoch": 0.06310150525548086, + "grad_norm": 0.318154901266098, + "learning_rate": 0.0001874186043488998, + "loss": 1.403, + "step": 4856 + }, + { + "epoch": 0.06311449979939673, + "grad_norm": 0.4239601790904999, + "learning_rate": 0.0001874160048869884, + "loss": 1.6201, + "step": 4857 + }, + { + "epoch": 0.0631274943433126, + "grad_norm": 0.3950299322605133, + "learning_rate": 0.00018741340542507702, + "loss": 1.3967, + "step": 4858 + }, + { + "epoch": 0.06314048888722848, + "grad_norm": 0.35630932450294495, + "learning_rate": 0.00018741080596316561, + "loss": 1.4808, + "step": 4859 + }, + { + "epoch": 0.06315348343114435, + "grad_norm": 0.5480958223342896, + "learning_rate": 0.00018740820650125427, + "loss": 1.5293, + "step": 4860 + }, + { + "epoch": 0.06316647797506023, + "grad_norm": 0.4242057204246521, + "learning_rate": 0.00018740560703934286, + "loss": 1.356, + "step": 4861 + }, + { + "epoch": 0.0631794725189761, + "grad_norm": 0.4509803056716919, + "learning_rate": 0.0001874030075774315, + "loss": 1.4957, + "step": 4862 + }, + { + "epoch": 0.06319246706289197, + "grad_norm": 0.3295024037361145, + "learning_rate": 0.00018740040811552009, + "loss": 1.2792, + "step": 4863 + }, + { + "epoch": 0.06320546160680784, + "grad_norm": 0.3918381333351135, + "learning_rate": 0.0001873978086536087, + "loss": 1.4447, + "step": 4864 + }, + { + "epoch": 0.06321845615072372, + "grad_norm": 0.383027046918869, + "learning_rate": 0.00018739520919169733, + "loss": 1.5594, + "step": 4865 + }, + { + "epoch": 0.06323145069463959, + "grad_norm": 0.36572399735450745, + "learning_rate": 0.00018739260972978593, + "loss": 1.3749, + "step": 4866 + }, + { + "epoch": 0.06324444523855546, + "grad_norm": 0.3963125944137573, + "learning_rate": 0.00018739001026787456, + "loss": 1.4538, + "step": 4867 + }, + { + "epoch": 0.06325743978247134, + "grad_norm": 0.3682536482810974, + "learning_rate": 0.00018738741080596318, + "loss": 1.4723, + "step": 4868 + }, + { + "epoch": 0.06327043432638721, + "grad_norm": 0.406673789024353, + "learning_rate": 0.00018738481134405178, + "loss": 1.3904, + "step": 4869 + }, + { + "epoch": 0.06328342887030308, + "grad_norm": 0.41151633858680725, + "learning_rate": 0.0001873822118821404, + "loss": 1.5182, + "step": 4870 + }, + { + "epoch": 0.06329642341421896, + "grad_norm": 0.405517041683197, + "learning_rate": 0.000187379612420229, + "loss": 1.5509, + "step": 4871 + }, + { + "epoch": 0.06330941795813483, + "grad_norm": 0.4210527539253235, + "learning_rate": 0.00018737701295831765, + "loss": 1.5795, + "step": 4872 + }, + { + "epoch": 0.0633224125020507, + "grad_norm": 0.413746178150177, + "learning_rate": 0.00018737441349640625, + "loss": 1.4749, + "step": 4873 + }, + { + "epoch": 0.06333540704596657, + "grad_norm": 0.40400955080986023, + "learning_rate": 0.00018737181403449487, + "loss": 1.3607, + "step": 4874 + }, + { + "epoch": 0.06334840158988245, + "grad_norm": 0.46322938799858093, + "learning_rate": 0.00018736921457258347, + "loss": 1.3933, + "step": 4875 + }, + { + "epoch": 0.06336139613379832, + "grad_norm": 0.4640101194381714, + "learning_rate": 0.0001873666151106721, + "loss": 1.4439, + "step": 4876 + }, + { + "epoch": 0.06337439067771419, + "grad_norm": 0.40370509028434753, + "learning_rate": 0.00018736401564876072, + "loss": 1.474, + "step": 4877 + }, + { + "epoch": 0.06338738522163007, + "grad_norm": 0.4165574908256531, + "learning_rate": 0.00018736141618684932, + "loss": 1.5289, + "step": 4878 + }, + { + "epoch": 0.06340037976554594, + "grad_norm": 0.4392988085746765, + "learning_rate": 0.00018735881672493797, + "loss": 1.3942, + "step": 4879 + }, + { + "epoch": 0.06341337430946181, + "grad_norm": 0.33102694153785706, + "learning_rate": 0.00018735621726302657, + "loss": 1.4154, + "step": 4880 + }, + { + "epoch": 0.06342636885337768, + "grad_norm": 0.3873150646686554, + "learning_rate": 0.0001873536178011152, + "loss": 1.5786, + "step": 4881 + }, + { + "epoch": 0.06343936339729356, + "grad_norm": 0.32687070965766907, + "learning_rate": 0.0001873510183392038, + "loss": 1.3004, + "step": 4882 + }, + { + "epoch": 0.06345235794120943, + "grad_norm": 0.3952428698539734, + "learning_rate": 0.0001873484188772924, + "loss": 1.3218, + "step": 4883 + }, + { + "epoch": 0.0634653524851253, + "grad_norm": 0.2773917019367218, + "learning_rate": 0.00018734581941538104, + "loss": 1.3843, + "step": 4884 + }, + { + "epoch": 0.06347834702904118, + "grad_norm": 0.3833611011505127, + "learning_rate": 0.00018734321995346963, + "loss": 1.5324, + "step": 4885 + }, + { + "epoch": 0.06349134157295705, + "grad_norm": 0.29361867904663086, + "learning_rate": 0.00018734062049155826, + "loss": 1.1574, + "step": 4886 + }, + { + "epoch": 0.06350433611687292, + "grad_norm": 0.38995978236198425, + "learning_rate": 0.00018733802102964688, + "loss": 1.4576, + "step": 4887 + }, + { + "epoch": 0.0635173306607888, + "grad_norm": 0.3644857108592987, + "learning_rate": 0.00018733542156773548, + "loss": 1.5604, + "step": 4888 + }, + { + "epoch": 0.06353032520470467, + "grad_norm": 0.2922137677669525, + "learning_rate": 0.0001873328221058241, + "loss": 1.2234, + "step": 4889 + }, + { + "epoch": 0.06354331974862054, + "grad_norm": 0.3658216595649719, + "learning_rate": 0.0001873302226439127, + "loss": 1.4905, + "step": 4890 + }, + { + "epoch": 0.06355631429253641, + "grad_norm": 0.3737642765045166, + "learning_rate": 0.00018732762318200135, + "loss": 1.6124, + "step": 4891 + }, + { + "epoch": 0.0635693088364523, + "grad_norm": 0.33959582448005676, + "learning_rate": 0.00018732502372008995, + "loss": 1.5312, + "step": 4892 + }, + { + "epoch": 0.06358230338036817, + "grad_norm": 0.3634147346019745, + "learning_rate": 0.00018732242425817858, + "loss": 1.2265, + "step": 4893 + }, + { + "epoch": 0.06359529792428405, + "grad_norm": 0.38192006945610046, + "learning_rate": 0.00018731982479626717, + "loss": 1.158, + "step": 4894 + }, + { + "epoch": 0.06360829246819992, + "grad_norm": 0.384304940700531, + "learning_rate": 0.0001873172253343558, + "loss": 1.4773, + "step": 4895 + }, + { + "epoch": 0.0636212870121158, + "grad_norm": 0.41935840249061584, + "learning_rate": 0.00018731462587244442, + "loss": 1.4632, + "step": 4896 + }, + { + "epoch": 0.06363428155603167, + "grad_norm": 0.4345855116844177, + "learning_rate": 0.00018731202641053302, + "loss": 1.4467, + "step": 4897 + }, + { + "epoch": 0.06364727609994754, + "grad_norm": 0.4804510772228241, + "learning_rate": 0.00018730942694862164, + "loss": 1.6504, + "step": 4898 + }, + { + "epoch": 0.06366027064386341, + "grad_norm": 0.40293917059898376, + "learning_rate": 0.00018730682748671027, + "loss": 1.3154, + "step": 4899 + }, + { + "epoch": 0.06367326518777928, + "grad_norm": 0.35224154591560364, + "learning_rate": 0.00018730422802479887, + "loss": 1.5255, + "step": 4900 + }, + { + "epoch": 0.06368625973169516, + "grad_norm": 0.3401183784008026, + "learning_rate": 0.0001873016285628875, + "loss": 1.399, + "step": 4901 + }, + { + "epoch": 0.06369925427561103, + "grad_norm": 0.3785831928253174, + "learning_rate": 0.0001872990291009761, + "loss": 1.3195, + "step": 4902 + }, + { + "epoch": 0.0637122488195269, + "grad_norm": 0.2865942418575287, + "learning_rate": 0.00018729642963906474, + "loss": 1.4083, + "step": 4903 + }, + { + "epoch": 0.06372524336344278, + "grad_norm": 0.3530195653438568, + "learning_rate": 0.00018729383017715334, + "loss": 1.4825, + "step": 4904 + }, + { + "epoch": 0.06373823790735865, + "grad_norm": 0.2818562984466553, + "learning_rate": 0.00018729123071524196, + "loss": 1.4423, + "step": 4905 + }, + { + "epoch": 0.06375123245127452, + "grad_norm": 0.36763685941696167, + "learning_rate": 0.00018728863125333056, + "loss": 1.3577, + "step": 4906 + }, + { + "epoch": 0.0637642269951904, + "grad_norm": 0.4166032373905182, + "learning_rate": 0.00018728603179141918, + "loss": 1.5294, + "step": 4907 + }, + { + "epoch": 0.06377722153910627, + "grad_norm": 0.32174357771873474, + "learning_rate": 0.0001872834323295078, + "loss": 1.3242, + "step": 4908 + }, + { + "epoch": 0.06379021608302214, + "grad_norm": 0.37954533100128174, + "learning_rate": 0.0001872808328675964, + "loss": 1.525, + "step": 4909 + }, + { + "epoch": 0.06380321062693801, + "grad_norm": 0.41271960735321045, + "learning_rate": 0.00018727823340568503, + "loss": 1.2738, + "step": 4910 + }, + { + "epoch": 0.06381620517085389, + "grad_norm": 0.2798667252063751, + "learning_rate": 0.00018727563394377365, + "loss": 1.4685, + "step": 4911 + }, + { + "epoch": 0.06382919971476976, + "grad_norm": 0.3457575738430023, + "learning_rate": 0.00018727303448186225, + "loss": 1.5371, + "step": 4912 + }, + { + "epoch": 0.06384219425868563, + "grad_norm": 0.32217657566070557, + "learning_rate": 0.00018727043501995088, + "loss": 1.3464, + "step": 4913 + }, + { + "epoch": 0.0638551888026015, + "grad_norm": 0.3394484221935272, + "learning_rate": 0.00018726783555803947, + "loss": 1.4517, + "step": 4914 + }, + { + "epoch": 0.06386818334651738, + "grad_norm": 0.3870427906513214, + "learning_rate": 0.00018726523609612813, + "loss": 1.4706, + "step": 4915 + }, + { + "epoch": 0.06388117789043325, + "grad_norm": 0.4184185564517975, + "learning_rate": 0.00018726263663421672, + "loss": 1.3265, + "step": 4916 + }, + { + "epoch": 0.06389417243434913, + "grad_norm": 0.33759355545043945, + "learning_rate": 0.00018726003717230535, + "loss": 1.3971, + "step": 4917 + }, + { + "epoch": 0.063907166978265, + "grad_norm": 0.42973077297210693, + "learning_rate": 0.00018725743771039397, + "loss": 1.5604, + "step": 4918 + }, + { + "epoch": 0.06392016152218087, + "grad_norm": 0.45261356234550476, + "learning_rate": 0.00018725483824848257, + "loss": 1.5697, + "step": 4919 + }, + { + "epoch": 0.06393315606609674, + "grad_norm": 0.40429526567459106, + "learning_rate": 0.0001872522387865712, + "loss": 1.3049, + "step": 4920 + }, + { + "epoch": 0.06394615061001262, + "grad_norm": 0.3268931210041046, + "learning_rate": 0.0001872496393246598, + "loss": 1.5144, + "step": 4921 + }, + { + "epoch": 0.06395914515392849, + "grad_norm": 0.462516725063324, + "learning_rate": 0.00018724703986274844, + "loss": 1.6261, + "step": 4922 + }, + { + "epoch": 0.06397213969784436, + "grad_norm": 0.40660592913627625, + "learning_rate": 0.00018724444040083704, + "loss": 1.5333, + "step": 4923 + }, + { + "epoch": 0.06398513424176024, + "grad_norm": 0.43708881735801697, + "learning_rate": 0.00018724184093892564, + "loss": 1.5775, + "step": 4924 + }, + { + "epoch": 0.06399812878567611, + "grad_norm": 0.46050506830215454, + "learning_rate": 0.00018723924147701426, + "loss": 1.4223, + "step": 4925 + }, + { + "epoch": 0.06401112332959198, + "grad_norm": 0.3936122953891754, + "learning_rate": 0.00018723664201510289, + "loss": 1.4894, + "step": 4926 + }, + { + "epoch": 0.06402411787350785, + "grad_norm": 0.32210901379585266, + "learning_rate": 0.0001872340425531915, + "loss": 1.3553, + "step": 4927 + }, + { + "epoch": 0.06403711241742373, + "grad_norm": 0.29751095175743103, + "learning_rate": 0.0001872314430912801, + "loss": 1.291, + "step": 4928 + }, + { + "epoch": 0.0640501069613396, + "grad_norm": 0.3196150064468384, + "learning_rate": 0.00018722884362936873, + "loss": 1.3245, + "step": 4929 + }, + { + "epoch": 0.06406310150525549, + "grad_norm": 0.3867841064929962, + "learning_rate": 0.00018722624416745736, + "loss": 1.3225, + "step": 4930 + }, + { + "epoch": 0.06407609604917136, + "grad_norm": 0.5138772130012512, + "learning_rate": 0.00018722364470554595, + "loss": 1.4339, + "step": 4931 + }, + { + "epoch": 0.06408909059308723, + "grad_norm": 0.41582852602005005, + "learning_rate": 0.00018722104524363458, + "loss": 1.4092, + "step": 4932 + }, + { + "epoch": 0.0641020851370031, + "grad_norm": 0.4686858057975769, + "learning_rate": 0.00018721844578172318, + "loss": 1.4387, + "step": 4933 + }, + { + "epoch": 0.06411507968091898, + "grad_norm": 0.44573119282722473, + "learning_rate": 0.00018721584631981183, + "loss": 1.7254, + "step": 4934 + }, + { + "epoch": 0.06412807422483485, + "grad_norm": 0.3969566524028778, + "learning_rate": 0.00018721324685790043, + "loss": 1.4787, + "step": 4935 + }, + { + "epoch": 0.06414106876875073, + "grad_norm": 0.3597899377346039, + "learning_rate": 0.00018721064739598905, + "loss": 1.3418, + "step": 4936 + }, + { + "epoch": 0.0641540633126666, + "grad_norm": 0.3663119673728943, + "learning_rate": 0.00018720804793407765, + "loss": 1.4231, + "step": 4937 + }, + { + "epoch": 0.06416705785658247, + "grad_norm": 0.35539206862449646, + "learning_rate": 0.00018720544847216627, + "loss": 1.4016, + "step": 4938 + }, + { + "epoch": 0.06418005240049834, + "grad_norm": 0.3831118643283844, + "learning_rate": 0.0001872028490102549, + "loss": 1.5359, + "step": 4939 + }, + { + "epoch": 0.06419304694441422, + "grad_norm": 0.3788885772228241, + "learning_rate": 0.0001872002495483435, + "loss": 1.4351, + "step": 4940 + }, + { + "epoch": 0.06420604148833009, + "grad_norm": 0.3635537922382355, + "learning_rate": 0.00018719765008643212, + "loss": 1.5072, + "step": 4941 + }, + { + "epoch": 0.06421903603224596, + "grad_norm": 0.4678174555301666, + "learning_rate": 0.00018719505062452074, + "loss": 1.5083, + "step": 4942 + }, + { + "epoch": 0.06423203057616184, + "grad_norm": 0.3268563747406006, + "learning_rate": 0.00018719245116260934, + "loss": 1.5077, + "step": 4943 + }, + { + "epoch": 0.06424502512007771, + "grad_norm": 0.315374493598938, + "learning_rate": 0.00018718985170069796, + "loss": 1.4007, + "step": 4944 + }, + { + "epoch": 0.06425801966399358, + "grad_norm": 0.4014391601085663, + "learning_rate": 0.00018718725223878656, + "loss": 1.678, + "step": 4945 + }, + { + "epoch": 0.06427101420790945, + "grad_norm": 0.36102932691574097, + "learning_rate": 0.0001871846527768752, + "loss": 1.3457, + "step": 4946 + }, + { + "epoch": 0.06428400875182533, + "grad_norm": 0.4070459008216858, + "learning_rate": 0.0001871820533149638, + "loss": 1.4825, + "step": 4947 + }, + { + "epoch": 0.0642970032957412, + "grad_norm": 0.42442283034324646, + "learning_rate": 0.00018717945385305243, + "loss": 1.5607, + "step": 4948 + }, + { + "epoch": 0.06430999783965707, + "grad_norm": 0.33655229210853577, + "learning_rate": 0.00018717685439114103, + "loss": 1.5214, + "step": 4949 + }, + { + "epoch": 0.06432299238357295, + "grad_norm": 0.40103280544281006, + "learning_rate": 0.00018717425492922966, + "loss": 1.6607, + "step": 4950 + }, + { + "epoch": 0.06433598692748882, + "grad_norm": 0.33195415139198303, + "learning_rate": 0.00018717165546731828, + "loss": 1.5514, + "step": 4951 + }, + { + "epoch": 0.06434898147140469, + "grad_norm": 0.3654930591583252, + "learning_rate": 0.00018716905600540688, + "loss": 1.4176, + "step": 4952 + }, + { + "epoch": 0.06436197601532057, + "grad_norm": 0.3469924032688141, + "learning_rate": 0.0001871664565434955, + "loss": 1.336, + "step": 4953 + }, + { + "epoch": 0.06437497055923644, + "grad_norm": 0.36465591192245483, + "learning_rate": 0.00018716385708158413, + "loss": 1.4397, + "step": 4954 + }, + { + "epoch": 0.06438796510315231, + "grad_norm": 0.5298256874084473, + "learning_rate": 0.00018716125761967272, + "loss": 1.388, + "step": 4955 + }, + { + "epoch": 0.06440095964706818, + "grad_norm": 0.3695477843284607, + "learning_rate": 0.00018715865815776135, + "loss": 1.4277, + "step": 4956 + }, + { + "epoch": 0.06441395419098406, + "grad_norm": 0.39149484038352966, + "learning_rate": 0.00018715605869584997, + "loss": 1.4951, + "step": 4957 + }, + { + "epoch": 0.06442694873489993, + "grad_norm": 0.4806424677371979, + "learning_rate": 0.0001871534592339386, + "loss": 1.3813, + "step": 4958 + }, + { + "epoch": 0.0644399432788158, + "grad_norm": 0.4368738830089569, + "learning_rate": 0.0001871508597720272, + "loss": 1.4217, + "step": 4959 + }, + { + "epoch": 0.06445293782273168, + "grad_norm": 0.4076714813709259, + "learning_rate": 0.00018714826031011582, + "loss": 1.4966, + "step": 4960 + }, + { + "epoch": 0.06446593236664755, + "grad_norm": 0.35776248574256897, + "learning_rate": 0.00018714566084820444, + "loss": 1.3683, + "step": 4961 + }, + { + "epoch": 0.06447892691056342, + "grad_norm": 0.5128650665283203, + "learning_rate": 0.00018714306138629304, + "loss": 1.4594, + "step": 4962 + }, + { + "epoch": 0.0644919214544793, + "grad_norm": 0.47895219922065735, + "learning_rate": 0.00018714046192438167, + "loss": 1.4747, + "step": 4963 + }, + { + "epoch": 0.06450491599839517, + "grad_norm": 0.4154643416404724, + "learning_rate": 0.00018713786246247026, + "loss": 1.421, + "step": 4964 + }, + { + "epoch": 0.06451791054231104, + "grad_norm": 0.40340059995651245, + "learning_rate": 0.00018713526300055892, + "loss": 1.582, + "step": 4965 + }, + { + "epoch": 0.06453090508622691, + "grad_norm": 0.39383891224861145, + "learning_rate": 0.0001871326635386475, + "loss": 1.4579, + "step": 4966 + }, + { + "epoch": 0.06454389963014279, + "grad_norm": 0.3662666082382202, + "learning_rate": 0.0001871300640767361, + "loss": 1.2288, + "step": 4967 + }, + { + "epoch": 0.06455689417405867, + "grad_norm": 0.5457305908203125, + "learning_rate": 0.00018712746461482473, + "loss": 1.4215, + "step": 4968 + }, + { + "epoch": 0.06456988871797455, + "grad_norm": 0.3704676628112793, + "learning_rate": 0.00018712486515291336, + "loss": 1.4879, + "step": 4969 + }, + { + "epoch": 0.06458288326189042, + "grad_norm": 0.40221697092056274, + "learning_rate": 0.00018712226569100198, + "loss": 1.4518, + "step": 4970 + }, + { + "epoch": 0.06459587780580629, + "grad_norm": 0.3336734473705292, + "learning_rate": 0.00018711966622909058, + "loss": 1.2597, + "step": 4971 + }, + { + "epoch": 0.06460887234972217, + "grad_norm": 0.403178334236145, + "learning_rate": 0.0001871170667671792, + "loss": 1.5475, + "step": 4972 + }, + { + "epoch": 0.06462186689363804, + "grad_norm": 0.34967759251594543, + "learning_rate": 0.00018711446730526783, + "loss": 1.3389, + "step": 4973 + }, + { + "epoch": 0.06463486143755391, + "grad_norm": 0.44066518545150757, + "learning_rate": 0.00018711186784335643, + "loss": 1.425, + "step": 4974 + }, + { + "epoch": 0.06464785598146978, + "grad_norm": 0.3541397452354431, + "learning_rate": 0.00018710926838144505, + "loss": 1.3289, + "step": 4975 + }, + { + "epoch": 0.06466085052538566, + "grad_norm": 0.2975107729434967, + "learning_rate": 0.00018710666891953365, + "loss": 1.3044, + "step": 4976 + }, + { + "epoch": 0.06467384506930153, + "grad_norm": 0.4533711373806, + "learning_rate": 0.0001871040694576223, + "loss": 1.612, + "step": 4977 + }, + { + "epoch": 0.0646868396132174, + "grad_norm": 0.34213998913764954, + "learning_rate": 0.0001871014699957109, + "loss": 1.4785, + "step": 4978 + }, + { + "epoch": 0.06469983415713328, + "grad_norm": 0.41275545954704285, + "learning_rate": 0.0001870988705337995, + "loss": 1.4921, + "step": 4979 + }, + { + "epoch": 0.06471282870104915, + "grad_norm": 0.2710539400577545, + "learning_rate": 0.00018709627107188812, + "loss": 1.5003, + "step": 4980 + }, + { + "epoch": 0.06472582324496502, + "grad_norm": 0.31130334734916687, + "learning_rate": 0.00018709367160997674, + "loss": 1.3272, + "step": 4981 + }, + { + "epoch": 0.0647388177888809, + "grad_norm": 0.4436469078063965, + "learning_rate": 0.00018709107214806537, + "loss": 1.5013, + "step": 4982 + }, + { + "epoch": 0.06475181233279677, + "grad_norm": 0.42660483717918396, + "learning_rate": 0.00018708847268615397, + "loss": 1.3936, + "step": 4983 + }, + { + "epoch": 0.06476480687671264, + "grad_norm": 0.397128164768219, + "learning_rate": 0.0001870858732242426, + "loss": 1.5172, + "step": 4984 + }, + { + "epoch": 0.06477780142062851, + "grad_norm": 0.3946099579334259, + "learning_rate": 0.00018708327376233122, + "loss": 1.5568, + "step": 4985 + }, + { + "epoch": 0.06479079596454439, + "grad_norm": 0.43961185216903687, + "learning_rate": 0.0001870806743004198, + "loss": 1.4902, + "step": 4986 + }, + { + "epoch": 0.06480379050846026, + "grad_norm": 0.4488104283809662, + "learning_rate": 0.00018707807483850844, + "loss": 1.5189, + "step": 4987 + }, + { + "epoch": 0.06481678505237613, + "grad_norm": 0.34242379665374756, + "learning_rate": 0.00018707547537659703, + "loss": 1.47, + "step": 4988 + }, + { + "epoch": 0.064829779596292, + "grad_norm": 0.38616856932640076, + "learning_rate": 0.00018707287591468569, + "loss": 1.4936, + "step": 4989 + }, + { + "epoch": 0.06484277414020788, + "grad_norm": 0.4312630891799927, + "learning_rate": 0.00018707027645277428, + "loss": 1.3957, + "step": 4990 + }, + { + "epoch": 0.06485576868412375, + "grad_norm": 0.3538651764392853, + "learning_rate": 0.00018706767699086288, + "loss": 1.3587, + "step": 4991 + }, + { + "epoch": 0.06486876322803962, + "grad_norm": 0.32463616132736206, + "learning_rate": 0.00018706507752895153, + "loss": 1.3117, + "step": 4992 + }, + { + "epoch": 0.0648817577719555, + "grad_norm": 0.38083845376968384, + "learning_rate": 0.00018706247806704013, + "loss": 1.6409, + "step": 4993 + }, + { + "epoch": 0.06489475231587137, + "grad_norm": 0.395946741104126, + "learning_rate": 0.00018705987860512875, + "loss": 1.7357, + "step": 4994 + }, + { + "epoch": 0.06490774685978724, + "grad_norm": 0.4680713415145874, + "learning_rate": 0.00018705727914321735, + "loss": 1.5717, + "step": 4995 + }, + { + "epoch": 0.06492074140370312, + "grad_norm": 0.3958815932273865, + "learning_rate": 0.00018705467968130598, + "loss": 1.4041, + "step": 4996 + }, + { + "epoch": 0.06493373594761899, + "grad_norm": 0.3951125741004944, + "learning_rate": 0.0001870520802193946, + "loss": 1.5398, + "step": 4997 + }, + { + "epoch": 0.06494673049153486, + "grad_norm": 0.3468087613582611, + "learning_rate": 0.0001870494807574832, + "loss": 1.428, + "step": 4998 + }, + { + "epoch": 0.06495972503545074, + "grad_norm": 0.439179927110672, + "learning_rate": 0.00018704688129557182, + "loss": 1.358, + "step": 4999 + }, + { + "epoch": 0.06497271957936661, + "grad_norm": 0.31045451760292053, + "learning_rate": 0.00018704428183366045, + "loss": 1.3327, + "step": 5000 + }, + { + "epoch": 0.06498571412328248, + "grad_norm": 0.4269821345806122, + "learning_rate": 0.00018704168237174907, + "loss": 1.5544, + "step": 5001 + }, + { + "epoch": 0.06499870866719835, + "grad_norm": 0.2961575388908386, + "learning_rate": 0.00018703908290983767, + "loss": 1.4255, + "step": 5002 + }, + { + "epoch": 0.06501170321111423, + "grad_norm": 0.3565014600753784, + "learning_rate": 0.0001870364834479263, + "loss": 1.3879, + "step": 5003 + }, + { + "epoch": 0.0650246977550301, + "grad_norm": 0.41762495040893555, + "learning_rate": 0.00018703388398601492, + "loss": 1.475, + "step": 5004 + }, + { + "epoch": 0.06503769229894597, + "grad_norm": 0.35624971985816956, + "learning_rate": 0.00018703128452410352, + "loss": 1.4048, + "step": 5005 + }, + { + "epoch": 0.06505068684286186, + "grad_norm": 0.43210282921791077, + "learning_rate": 0.00018702868506219214, + "loss": 1.4183, + "step": 5006 + }, + { + "epoch": 0.06506368138677773, + "grad_norm": 0.3515591621398926, + "learning_rate": 0.00018702608560028074, + "loss": 1.3787, + "step": 5007 + }, + { + "epoch": 0.0650766759306936, + "grad_norm": 0.3989673852920532, + "learning_rate": 0.00018702348613836936, + "loss": 1.4394, + "step": 5008 + }, + { + "epoch": 0.06508967047460948, + "grad_norm": 0.3967299163341522, + "learning_rate": 0.00018702088667645799, + "loss": 1.3999, + "step": 5009 + }, + { + "epoch": 0.06510266501852535, + "grad_norm": 0.40923601388931274, + "learning_rate": 0.00018701828721454658, + "loss": 1.4759, + "step": 5010 + }, + { + "epoch": 0.06511565956244122, + "grad_norm": 0.41464051604270935, + "learning_rate": 0.0001870156877526352, + "loss": 1.7134, + "step": 5011 + }, + { + "epoch": 0.0651286541063571, + "grad_norm": 0.3874533772468567, + "learning_rate": 0.00018701308829072383, + "loss": 1.5878, + "step": 5012 + }, + { + "epoch": 0.06514164865027297, + "grad_norm": 0.4026485085487366, + "learning_rate": 0.00018701048882881246, + "loss": 1.3231, + "step": 5013 + }, + { + "epoch": 0.06515464319418884, + "grad_norm": 0.40182870626449585, + "learning_rate": 0.00018700788936690105, + "loss": 1.5892, + "step": 5014 + }, + { + "epoch": 0.06516763773810472, + "grad_norm": 0.369368314743042, + "learning_rate": 0.00018700528990498968, + "loss": 1.2536, + "step": 5015 + }, + { + "epoch": 0.06518063228202059, + "grad_norm": 0.2835204601287842, + "learning_rate": 0.0001870026904430783, + "loss": 1.4752, + "step": 5016 + }, + { + "epoch": 0.06519362682593646, + "grad_norm": 0.4133825898170471, + "learning_rate": 0.0001870000909811669, + "loss": 1.3276, + "step": 5017 + }, + { + "epoch": 0.06520662136985234, + "grad_norm": 0.5406321883201599, + "learning_rate": 0.00018699749151925553, + "loss": 1.3796, + "step": 5018 + }, + { + "epoch": 0.06521961591376821, + "grad_norm": 0.4201764762401581, + "learning_rate": 0.00018699489205734412, + "loss": 1.4892, + "step": 5019 + }, + { + "epoch": 0.06523261045768408, + "grad_norm": 0.43299543857574463, + "learning_rate": 0.00018699229259543277, + "loss": 1.4428, + "step": 5020 + }, + { + "epoch": 0.06524560500159995, + "grad_norm": 0.4272761642932892, + "learning_rate": 0.00018698969313352137, + "loss": 1.6236, + "step": 5021 + }, + { + "epoch": 0.06525859954551583, + "grad_norm": 0.2663511037826538, + "learning_rate": 0.00018698709367160997, + "loss": 1.3557, + "step": 5022 + }, + { + "epoch": 0.0652715940894317, + "grad_norm": 0.45003563165664673, + "learning_rate": 0.0001869844942096986, + "loss": 1.6476, + "step": 5023 + }, + { + "epoch": 0.06528458863334757, + "grad_norm": 0.38335496187210083, + "learning_rate": 0.00018698189474778722, + "loss": 1.4705, + "step": 5024 + }, + { + "epoch": 0.06529758317726345, + "grad_norm": 0.43141335248947144, + "learning_rate": 0.00018697929528587584, + "loss": 1.5704, + "step": 5025 + }, + { + "epoch": 0.06531057772117932, + "grad_norm": 0.4025283455848694, + "learning_rate": 0.00018697669582396444, + "loss": 1.4308, + "step": 5026 + }, + { + "epoch": 0.06532357226509519, + "grad_norm": 0.3948580324649811, + "learning_rate": 0.00018697409636205306, + "loss": 1.6566, + "step": 5027 + }, + { + "epoch": 0.06533656680901107, + "grad_norm": 0.45215436816215515, + "learning_rate": 0.0001869714969001417, + "loss": 1.6829, + "step": 5028 + }, + { + "epoch": 0.06534956135292694, + "grad_norm": 0.38435620069503784, + "learning_rate": 0.00018696889743823029, + "loss": 1.3634, + "step": 5029 + }, + { + "epoch": 0.06536255589684281, + "grad_norm": 0.38024795055389404, + "learning_rate": 0.0001869662979763189, + "loss": 1.5269, + "step": 5030 + }, + { + "epoch": 0.06537555044075868, + "grad_norm": 0.37334343791007996, + "learning_rate": 0.00018696369851440754, + "loss": 1.2901, + "step": 5031 + }, + { + "epoch": 0.06538854498467456, + "grad_norm": 0.3237695097923279, + "learning_rate": 0.00018696109905249616, + "loss": 1.523, + "step": 5032 + }, + { + "epoch": 0.06540153952859043, + "grad_norm": 0.35873982310295105, + "learning_rate": 0.00018695849959058476, + "loss": 1.4954, + "step": 5033 + }, + { + "epoch": 0.0654145340725063, + "grad_norm": 0.37050142884254456, + "learning_rate": 0.00018695590012867335, + "loss": 1.5798, + "step": 5034 + }, + { + "epoch": 0.06542752861642218, + "grad_norm": 0.37431302666664124, + "learning_rate": 0.000186953300666762, + "loss": 1.3788, + "step": 5035 + }, + { + "epoch": 0.06544052316033805, + "grad_norm": 0.37319809198379517, + "learning_rate": 0.0001869507012048506, + "loss": 1.4956, + "step": 5036 + }, + { + "epoch": 0.06545351770425392, + "grad_norm": 0.3907186686992645, + "learning_rate": 0.00018694810174293923, + "loss": 1.3038, + "step": 5037 + }, + { + "epoch": 0.0654665122481698, + "grad_norm": 0.3807026445865631, + "learning_rate": 0.00018694550228102783, + "loss": 1.5015, + "step": 5038 + }, + { + "epoch": 0.06547950679208567, + "grad_norm": 0.37635213136672974, + "learning_rate": 0.00018694290281911645, + "loss": 1.3797, + "step": 5039 + }, + { + "epoch": 0.06549250133600154, + "grad_norm": 0.4360508322715759, + "learning_rate": 0.00018694030335720507, + "loss": 1.5969, + "step": 5040 + }, + { + "epoch": 0.06550549587991741, + "grad_norm": 0.3133851885795593, + "learning_rate": 0.00018693770389529367, + "loss": 1.4129, + "step": 5041 + }, + { + "epoch": 0.06551849042383329, + "grad_norm": 0.32416558265686035, + "learning_rate": 0.0001869351044333823, + "loss": 1.4297, + "step": 5042 + }, + { + "epoch": 0.06553148496774916, + "grad_norm": 0.42722848057746887, + "learning_rate": 0.00018693250497147092, + "loss": 1.6206, + "step": 5043 + }, + { + "epoch": 0.06554447951166505, + "grad_norm": 0.4987455904483795, + "learning_rate": 0.00018692990550955955, + "loss": 1.4992, + "step": 5044 + }, + { + "epoch": 0.06555747405558092, + "grad_norm": 0.43340280652046204, + "learning_rate": 0.00018692730604764814, + "loss": 1.6011, + "step": 5045 + }, + { + "epoch": 0.06557046859949679, + "grad_norm": 0.4113078713417053, + "learning_rate": 0.00018692470658573674, + "loss": 1.5793, + "step": 5046 + }, + { + "epoch": 0.06558346314341267, + "grad_norm": 0.4991094470024109, + "learning_rate": 0.0001869221071238254, + "loss": 1.5646, + "step": 5047 + }, + { + "epoch": 0.06559645768732854, + "grad_norm": 0.48501214385032654, + "learning_rate": 0.000186919507661914, + "loss": 1.3584, + "step": 5048 + }, + { + "epoch": 0.06560945223124441, + "grad_norm": 0.3096579611301422, + "learning_rate": 0.0001869169082000026, + "loss": 1.4875, + "step": 5049 + }, + { + "epoch": 0.06562244677516028, + "grad_norm": 0.3580510914325714, + "learning_rate": 0.0001869143087380912, + "loss": 1.2825, + "step": 5050 + }, + { + "epoch": 0.06563544131907616, + "grad_norm": 0.3211670517921448, + "learning_rate": 0.00018691170927617984, + "loss": 1.2917, + "step": 5051 + }, + { + "epoch": 0.06564843586299203, + "grad_norm": 0.3310427963733673, + "learning_rate": 0.00018690910981426846, + "loss": 1.4332, + "step": 5052 + }, + { + "epoch": 0.0656614304069079, + "grad_norm": 0.3947664797306061, + "learning_rate": 0.00018690651035235706, + "loss": 1.4708, + "step": 5053 + }, + { + "epoch": 0.06567442495082378, + "grad_norm": 0.44028183817863464, + "learning_rate": 0.00018690391089044568, + "loss": 1.5272, + "step": 5054 + }, + { + "epoch": 0.06568741949473965, + "grad_norm": 0.27063998579978943, + "learning_rate": 0.0001869013114285343, + "loss": 1.2619, + "step": 5055 + }, + { + "epoch": 0.06570041403865552, + "grad_norm": 0.3743489384651184, + "learning_rate": 0.00018689871196662293, + "loss": 1.4847, + "step": 5056 + }, + { + "epoch": 0.0657134085825714, + "grad_norm": 0.38224536180496216, + "learning_rate": 0.00018689611250471153, + "loss": 1.3754, + "step": 5057 + }, + { + "epoch": 0.06572640312648727, + "grad_norm": 0.44809606671333313, + "learning_rate": 0.00018689351304280015, + "loss": 1.5811, + "step": 5058 + }, + { + "epoch": 0.06573939767040314, + "grad_norm": 0.38005557656288147, + "learning_rate": 0.00018689091358088878, + "loss": 1.408, + "step": 5059 + }, + { + "epoch": 0.06575239221431901, + "grad_norm": 0.4105590581893921, + "learning_rate": 0.00018688831411897737, + "loss": 1.242, + "step": 5060 + }, + { + "epoch": 0.06576538675823489, + "grad_norm": 0.3711411952972412, + "learning_rate": 0.000186885714657066, + "loss": 1.4263, + "step": 5061 + }, + { + "epoch": 0.06577838130215076, + "grad_norm": 0.4252431094646454, + "learning_rate": 0.0001868831151951546, + "loss": 1.4642, + "step": 5062 + }, + { + "epoch": 0.06579137584606663, + "grad_norm": 0.45010408759117126, + "learning_rate": 0.00018688051573324322, + "loss": 1.5068, + "step": 5063 + }, + { + "epoch": 0.0658043703899825, + "grad_norm": 0.3801364302635193, + "learning_rate": 0.00018687791627133185, + "loss": 1.5146, + "step": 5064 + }, + { + "epoch": 0.06581736493389838, + "grad_norm": 0.36441367864608765, + "learning_rate": 0.00018687531680942044, + "loss": 1.3055, + "step": 5065 + }, + { + "epoch": 0.06583035947781425, + "grad_norm": 0.3163621127605438, + "learning_rate": 0.0001868727173475091, + "loss": 1.3386, + "step": 5066 + }, + { + "epoch": 0.06584335402173012, + "grad_norm": 0.35099443793296814, + "learning_rate": 0.0001868701178855977, + "loss": 1.1686, + "step": 5067 + }, + { + "epoch": 0.065856348565646, + "grad_norm": 0.28214946389198303, + "learning_rate": 0.00018686751842368632, + "loss": 1.5059, + "step": 5068 + }, + { + "epoch": 0.06586934310956187, + "grad_norm": 0.36046352982521057, + "learning_rate": 0.0001868649189617749, + "loss": 1.3996, + "step": 5069 + }, + { + "epoch": 0.06588233765347774, + "grad_norm": 0.331374853849411, + "learning_rate": 0.00018686231949986354, + "loss": 1.3427, + "step": 5070 + }, + { + "epoch": 0.06589533219739362, + "grad_norm": 0.2996813654899597, + "learning_rate": 0.00018685972003795216, + "loss": 1.4744, + "step": 5071 + }, + { + "epoch": 0.06590832674130949, + "grad_norm": 0.4613484740257263, + "learning_rate": 0.00018685712057604076, + "loss": 1.6503, + "step": 5072 + }, + { + "epoch": 0.06592132128522536, + "grad_norm": 0.35255923867225647, + "learning_rate": 0.00018685452111412938, + "loss": 1.4307, + "step": 5073 + }, + { + "epoch": 0.06593431582914124, + "grad_norm": 0.6994350552558899, + "learning_rate": 0.000186851921652218, + "loss": 1.5619, + "step": 5074 + }, + { + "epoch": 0.06594731037305711, + "grad_norm": 0.39828792214393616, + "learning_rate": 0.0001868493221903066, + "loss": 1.4517, + "step": 5075 + }, + { + "epoch": 0.06596030491697298, + "grad_norm": 0.37385642528533936, + "learning_rate": 0.00018684672272839523, + "loss": 1.5417, + "step": 5076 + }, + { + "epoch": 0.06597329946088885, + "grad_norm": 0.33690065145492554, + "learning_rate": 0.00018684412326648383, + "loss": 1.3128, + "step": 5077 + }, + { + "epoch": 0.06598629400480473, + "grad_norm": 0.404055655002594, + "learning_rate": 0.00018684152380457248, + "loss": 1.6508, + "step": 5078 + }, + { + "epoch": 0.0659992885487206, + "grad_norm": 0.41085606813430786, + "learning_rate": 0.00018683892434266108, + "loss": 1.4099, + "step": 5079 + }, + { + "epoch": 0.06601228309263647, + "grad_norm": 0.34779030084609985, + "learning_rate": 0.0001868363248807497, + "loss": 1.2436, + "step": 5080 + }, + { + "epoch": 0.06602527763655235, + "grad_norm": 0.45767742395401, + "learning_rate": 0.0001868337254188383, + "loss": 1.5184, + "step": 5081 + }, + { + "epoch": 0.06603827218046823, + "grad_norm": 0.45148172974586487, + "learning_rate": 0.00018683112595692692, + "loss": 1.496, + "step": 5082 + }, + { + "epoch": 0.0660512667243841, + "grad_norm": 0.45361578464508057, + "learning_rate": 0.00018682852649501555, + "loss": 1.3767, + "step": 5083 + }, + { + "epoch": 0.06606426126829998, + "grad_norm": 0.39419567584991455, + "learning_rate": 0.00018682592703310414, + "loss": 1.4638, + "step": 5084 + }, + { + "epoch": 0.06607725581221585, + "grad_norm": 0.4580013155937195, + "learning_rate": 0.00018682332757119277, + "loss": 1.5722, + "step": 5085 + }, + { + "epoch": 0.06609025035613172, + "grad_norm": 0.45496541261672974, + "learning_rate": 0.0001868207281092814, + "loss": 1.4134, + "step": 5086 + }, + { + "epoch": 0.0661032449000476, + "grad_norm": 0.3905915319919586, + "learning_rate": 0.00018681812864737002, + "loss": 1.467, + "step": 5087 + }, + { + "epoch": 0.06611623944396347, + "grad_norm": 0.359813392162323, + "learning_rate": 0.00018681552918545862, + "loss": 1.2496, + "step": 5088 + }, + { + "epoch": 0.06612923398787934, + "grad_norm": 0.4537199139595032, + "learning_rate": 0.0001868129297235472, + "loss": 1.4641, + "step": 5089 + }, + { + "epoch": 0.06614222853179522, + "grad_norm": 0.39269357919692993, + "learning_rate": 0.00018681033026163586, + "loss": 1.569, + "step": 5090 + }, + { + "epoch": 0.06615522307571109, + "grad_norm": 0.3269616663455963, + "learning_rate": 0.00018680773079972446, + "loss": 1.3089, + "step": 5091 + }, + { + "epoch": 0.06616821761962696, + "grad_norm": 0.4135110676288605, + "learning_rate": 0.0001868051313378131, + "loss": 1.5335, + "step": 5092 + }, + { + "epoch": 0.06618121216354284, + "grad_norm": 0.35924768447875977, + "learning_rate": 0.00018680253187590168, + "loss": 1.4796, + "step": 5093 + }, + { + "epoch": 0.06619420670745871, + "grad_norm": 0.4137924015522003, + "learning_rate": 0.0001867999324139903, + "loss": 1.5997, + "step": 5094 + }, + { + "epoch": 0.06620720125137458, + "grad_norm": 0.4537808299064636, + "learning_rate": 0.00018679733295207893, + "loss": 1.5673, + "step": 5095 + }, + { + "epoch": 0.06622019579529045, + "grad_norm": 0.5089097619056702, + "learning_rate": 0.00018679473349016753, + "loss": 1.4853, + "step": 5096 + }, + { + "epoch": 0.06623319033920633, + "grad_norm": 0.42744994163513184, + "learning_rate": 0.00018679213402825615, + "loss": 1.4846, + "step": 5097 + }, + { + "epoch": 0.0662461848831222, + "grad_norm": 0.4083409905433655, + "learning_rate": 0.00018678953456634478, + "loss": 1.4917, + "step": 5098 + }, + { + "epoch": 0.06625917942703807, + "grad_norm": 0.41509148478507996, + "learning_rate": 0.0001867869351044334, + "loss": 1.51, + "step": 5099 + }, + { + "epoch": 0.06627217397095395, + "grad_norm": 0.404069721698761, + "learning_rate": 0.000186784335642522, + "loss": 1.4931, + "step": 5100 + }, + { + "epoch": 0.06628516851486982, + "grad_norm": 0.4125731289386749, + "learning_rate": 0.00018678173618061063, + "loss": 1.364, + "step": 5101 + }, + { + "epoch": 0.06629816305878569, + "grad_norm": 0.48383399844169617, + "learning_rate": 0.00018677913671869925, + "loss": 1.6206, + "step": 5102 + }, + { + "epoch": 0.06631115760270156, + "grad_norm": 0.44578373432159424, + "learning_rate": 0.00018677653725678785, + "loss": 1.3747, + "step": 5103 + }, + { + "epoch": 0.06632415214661744, + "grad_norm": 0.4086229205131531, + "learning_rate": 0.00018677393779487647, + "loss": 1.4139, + "step": 5104 + }, + { + "epoch": 0.06633714669053331, + "grad_norm": 0.3682081997394562, + "learning_rate": 0.0001867713383329651, + "loss": 1.5932, + "step": 5105 + }, + { + "epoch": 0.06635014123444918, + "grad_norm": 0.37063175439834595, + "learning_rate": 0.0001867687388710537, + "loss": 1.3922, + "step": 5106 + }, + { + "epoch": 0.06636313577836506, + "grad_norm": 0.3529486060142517, + "learning_rate": 0.00018676613940914232, + "loss": 1.3967, + "step": 5107 + }, + { + "epoch": 0.06637613032228093, + "grad_norm": 0.3288261592388153, + "learning_rate": 0.00018676353994723092, + "loss": 1.5192, + "step": 5108 + }, + { + "epoch": 0.0663891248661968, + "grad_norm": 0.41075578331947327, + "learning_rate": 0.00018676094048531957, + "loss": 1.4928, + "step": 5109 + }, + { + "epoch": 0.06640211941011268, + "grad_norm": 0.3678074777126312, + "learning_rate": 0.00018675834102340816, + "loss": 1.3007, + "step": 5110 + }, + { + "epoch": 0.06641511395402855, + "grad_norm": 0.39846938848495483, + "learning_rate": 0.0001867557415614968, + "loss": 1.3693, + "step": 5111 + }, + { + "epoch": 0.06642810849794442, + "grad_norm": 0.3051009178161621, + "learning_rate": 0.0001867531420995854, + "loss": 1.4999, + "step": 5112 + }, + { + "epoch": 0.0664411030418603, + "grad_norm": 0.3826873004436493, + "learning_rate": 0.000186750542637674, + "loss": 1.5415, + "step": 5113 + }, + { + "epoch": 0.06645409758577617, + "grad_norm": 0.3207344710826874, + "learning_rate": 0.00018674794317576264, + "loss": 1.4091, + "step": 5114 + }, + { + "epoch": 0.06646709212969204, + "grad_norm": 0.3725905418395996, + "learning_rate": 0.00018674534371385123, + "loss": 1.5888, + "step": 5115 + }, + { + "epoch": 0.06648008667360791, + "grad_norm": 0.3939463496208191, + "learning_rate": 0.00018674274425193986, + "loss": 1.3029, + "step": 5116 + }, + { + "epoch": 0.06649308121752379, + "grad_norm": 0.3740473687648773, + "learning_rate": 0.00018674014479002848, + "loss": 1.3662, + "step": 5117 + }, + { + "epoch": 0.06650607576143966, + "grad_norm": 0.4016275107860565, + "learning_rate": 0.00018673754532811708, + "loss": 1.42, + "step": 5118 + }, + { + "epoch": 0.06651907030535553, + "grad_norm": 0.368172824382782, + "learning_rate": 0.0001867349458662057, + "loss": 1.6754, + "step": 5119 + }, + { + "epoch": 0.06653206484927142, + "grad_norm": 0.4605575501918793, + "learning_rate": 0.0001867323464042943, + "loss": 1.6073, + "step": 5120 + }, + { + "epoch": 0.06654505939318729, + "grad_norm": 0.3691301643848419, + "learning_rate": 0.00018672974694238295, + "loss": 1.2124, + "step": 5121 + }, + { + "epoch": 0.06655805393710316, + "grad_norm": 0.28400954604148865, + "learning_rate": 0.00018672714748047155, + "loss": 1.3996, + "step": 5122 + }, + { + "epoch": 0.06657104848101904, + "grad_norm": 0.37806907296180725, + "learning_rate": 0.00018672454801856017, + "loss": 1.3488, + "step": 5123 + }, + { + "epoch": 0.06658404302493491, + "grad_norm": 0.4382490813732147, + "learning_rate": 0.00018672194855664877, + "loss": 1.5047, + "step": 5124 + }, + { + "epoch": 0.06659703756885078, + "grad_norm": 0.4405001997947693, + "learning_rate": 0.0001867193490947374, + "loss": 1.4759, + "step": 5125 + }, + { + "epoch": 0.06661003211276666, + "grad_norm": 0.35975533723831177, + "learning_rate": 0.00018671674963282602, + "loss": 1.3524, + "step": 5126 + }, + { + "epoch": 0.06662302665668253, + "grad_norm": 0.2759242057800293, + "learning_rate": 0.00018671415017091462, + "loss": 1.2414, + "step": 5127 + }, + { + "epoch": 0.0666360212005984, + "grad_norm": 0.44217249751091003, + "learning_rate": 0.00018671155070900324, + "loss": 1.5851, + "step": 5128 + }, + { + "epoch": 0.06664901574451428, + "grad_norm": 0.3929182291030884, + "learning_rate": 0.00018670895124709187, + "loss": 1.575, + "step": 5129 + }, + { + "epoch": 0.06666201028843015, + "grad_norm": 0.35938742756843567, + "learning_rate": 0.00018670635178518046, + "loss": 1.4744, + "step": 5130 + }, + { + "epoch": 0.06667500483234602, + "grad_norm": 0.3668377995491028, + "learning_rate": 0.0001867037523232691, + "loss": 1.5975, + "step": 5131 + }, + { + "epoch": 0.0666879993762619, + "grad_norm": 0.3105641305446625, + "learning_rate": 0.0001867011528613577, + "loss": 1.4407, + "step": 5132 + }, + { + "epoch": 0.06670099392017777, + "grad_norm": 0.43602290749549866, + "learning_rate": 0.00018669855339944634, + "loss": 1.4216, + "step": 5133 + }, + { + "epoch": 0.06671398846409364, + "grad_norm": 0.3823474049568176, + "learning_rate": 0.00018669595393753494, + "loss": 1.3277, + "step": 5134 + }, + { + "epoch": 0.06672698300800951, + "grad_norm": 0.3559263050556183, + "learning_rate": 0.00018669335447562356, + "loss": 1.344, + "step": 5135 + }, + { + "epoch": 0.06673997755192539, + "grad_norm": 0.4177055060863495, + "learning_rate": 0.00018669075501371216, + "loss": 1.5246, + "step": 5136 + }, + { + "epoch": 0.06675297209584126, + "grad_norm": 0.4542948603630066, + "learning_rate": 0.00018668815555180078, + "loss": 1.4636, + "step": 5137 + }, + { + "epoch": 0.06676596663975713, + "grad_norm": 0.3753998875617981, + "learning_rate": 0.0001866855560898894, + "loss": 1.4454, + "step": 5138 + }, + { + "epoch": 0.066778961183673, + "grad_norm": 0.3499920070171356, + "learning_rate": 0.000186682956627978, + "loss": 1.4657, + "step": 5139 + }, + { + "epoch": 0.06679195572758888, + "grad_norm": 0.3218679130077362, + "learning_rate": 0.00018668035716606666, + "loss": 1.2491, + "step": 5140 + }, + { + "epoch": 0.06680495027150475, + "grad_norm": 0.43401187658309937, + "learning_rate": 0.00018667775770415525, + "loss": 1.5627, + "step": 5141 + }, + { + "epoch": 0.06681794481542062, + "grad_norm": 0.4200804531574249, + "learning_rate": 0.00018667515824224388, + "loss": 1.282, + "step": 5142 + }, + { + "epoch": 0.0668309393593365, + "grad_norm": 0.41693368554115295, + "learning_rate": 0.00018667255878033247, + "loss": 1.5221, + "step": 5143 + }, + { + "epoch": 0.06684393390325237, + "grad_norm": 0.48830854892730713, + "learning_rate": 0.0001866699593184211, + "loss": 1.6156, + "step": 5144 + }, + { + "epoch": 0.06685692844716824, + "grad_norm": 0.4205325245857239, + "learning_rate": 0.00018666735985650972, + "loss": 1.3908, + "step": 5145 + }, + { + "epoch": 0.06686992299108412, + "grad_norm": 0.4797542989253998, + "learning_rate": 0.00018666476039459832, + "loss": 1.4816, + "step": 5146 + }, + { + "epoch": 0.06688291753499999, + "grad_norm": 0.6492830514907837, + "learning_rate": 0.00018666216093268695, + "loss": 1.5275, + "step": 5147 + }, + { + "epoch": 0.06689591207891586, + "grad_norm": 0.3697177767753601, + "learning_rate": 0.00018665956147077557, + "loss": 1.302, + "step": 5148 + }, + { + "epoch": 0.06690890662283173, + "grad_norm": 0.381051629781723, + "learning_rate": 0.00018665696200886417, + "loss": 1.4579, + "step": 5149 + }, + { + "epoch": 0.06692190116674761, + "grad_norm": 0.4169452488422394, + "learning_rate": 0.0001866543625469528, + "loss": 1.5325, + "step": 5150 + }, + { + "epoch": 0.06693489571066348, + "grad_norm": 0.5180598497390747, + "learning_rate": 0.0001866517630850414, + "loss": 1.5506, + "step": 5151 + }, + { + "epoch": 0.06694789025457935, + "grad_norm": 0.5124133229255676, + "learning_rate": 0.00018664916362313004, + "loss": 1.558, + "step": 5152 + }, + { + "epoch": 0.06696088479849523, + "grad_norm": 0.4223629832267761, + "learning_rate": 0.00018664656416121864, + "loss": 1.4085, + "step": 5153 + }, + { + "epoch": 0.0669738793424111, + "grad_norm": 0.4464322328567505, + "learning_rate": 0.00018664396469930726, + "loss": 1.5744, + "step": 5154 + }, + { + "epoch": 0.06698687388632697, + "grad_norm": 0.39805474877357483, + "learning_rate": 0.00018664136523739586, + "loss": 1.3028, + "step": 5155 + }, + { + "epoch": 0.06699986843024285, + "grad_norm": 0.5118453502655029, + "learning_rate": 0.00018663876577548448, + "loss": 1.5302, + "step": 5156 + }, + { + "epoch": 0.06701286297415872, + "grad_norm": 0.3416501581668854, + "learning_rate": 0.0001866361663135731, + "loss": 1.2731, + "step": 5157 + }, + { + "epoch": 0.0670258575180746, + "grad_norm": 0.3870924115180969, + "learning_rate": 0.0001866335668516617, + "loss": 1.3796, + "step": 5158 + }, + { + "epoch": 0.06703885206199048, + "grad_norm": 0.3704221844673157, + "learning_rate": 0.00018663096738975033, + "loss": 1.3454, + "step": 5159 + }, + { + "epoch": 0.06705184660590635, + "grad_norm": 0.3537854850292206, + "learning_rate": 0.00018662836792783896, + "loss": 1.4649, + "step": 5160 + }, + { + "epoch": 0.06706484114982222, + "grad_norm": 0.4003196358680725, + "learning_rate": 0.00018662576846592755, + "loss": 1.3104, + "step": 5161 + }, + { + "epoch": 0.0670778356937381, + "grad_norm": 0.3782847225666046, + "learning_rate": 0.00018662316900401618, + "loss": 1.407, + "step": 5162 + }, + { + "epoch": 0.06709083023765397, + "grad_norm": 0.30927857756614685, + "learning_rate": 0.00018662056954210477, + "loss": 1.3548, + "step": 5163 + }, + { + "epoch": 0.06710382478156984, + "grad_norm": 0.44752243161201477, + "learning_rate": 0.00018661797008019343, + "loss": 1.3521, + "step": 5164 + }, + { + "epoch": 0.06711681932548572, + "grad_norm": 0.4513384997844696, + "learning_rate": 0.00018661537061828202, + "loss": 1.6778, + "step": 5165 + }, + { + "epoch": 0.06712981386940159, + "grad_norm": 0.43821585178375244, + "learning_rate": 0.00018661277115637065, + "loss": 1.6521, + "step": 5166 + }, + { + "epoch": 0.06714280841331746, + "grad_norm": 0.3929871618747711, + "learning_rate": 0.00018661017169445925, + "loss": 1.2764, + "step": 5167 + }, + { + "epoch": 0.06715580295723333, + "grad_norm": 0.4311833381652832, + "learning_rate": 0.00018660757223254787, + "loss": 1.463, + "step": 5168 + }, + { + "epoch": 0.06716879750114921, + "grad_norm": 0.3531707227230072, + "learning_rate": 0.0001866049727706365, + "loss": 1.4412, + "step": 5169 + }, + { + "epoch": 0.06718179204506508, + "grad_norm": 0.44328662753105164, + "learning_rate": 0.0001866023733087251, + "loss": 1.507, + "step": 5170 + }, + { + "epoch": 0.06719478658898095, + "grad_norm": 0.32282599806785583, + "learning_rate": 0.00018659977384681372, + "loss": 1.3373, + "step": 5171 + }, + { + "epoch": 0.06720778113289683, + "grad_norm": 0.3568762540817261, + "learning_rate": 0.00018659717438490234, + "loss": 1.2362, + "step": 5172 + }, + { + "epoch": 0.0672207756768127, + "grad_norm": 0.32072365283966064, + "learning_rate": 0.00018659457492299094, + "loss": 1.5883, + "step": 5173 + }, + { + "epoch": 0.06723377022072857, + "grad_norm": 0.33857256174087524, + "learning_rate": 0.00018659197546107956, + "loss": 1.5246, + "step": 5174 + }, + { + "epoch": 0.06724676476464445, + "grad_norm": 0.35791605710983276, + "learning_rate": 0.0001865893759991682, + "loss": 1.5009, + "step": 5175 + }, + { + "epoch": 0.06725975930856032, + "grad_norm": 0.31736233830451965, + "learning_rate": 0.0001865867765372568, + "loss": 1.406, + "step": 5176 + }, + { + "epoch": 0.06727275385247619, + "grad_norm": 0.5177508592605591, + "learning_rate": 0.0001865841770753454, + "loss": 1.4898, + "step": 5177 + }, + { + "epoch": 0.06728574839639206, + "grad_norm": 0.27760931849479675, + "learning_rate": 0.00018658157761343403, + "loss": 1.1981, + "step": 5178 + }, + { + "epoch": 0.06729874294030794, + "grad_norm": 0.46979695558547974, + "learning_rate": 0.00018657897815152266, + "loss": 1.5755, + "step": 5179 + }, + { + "epoch": 0.06731173748422381, + "grad_norm": 0.4669855237007141, + "learning_rate": 0.00018657637868961126, + "loss": 1.4488, + "step": 5180 + }, + { + "epoch": 0.06732473202813968, + "grad_norm": 0.19816741347312927, + "learning_rate": 0.00018657377922769988, + "loss": 1.2818, + "step": 5181 + }, + { + "epoch": 0.06733772657205556, + "grad_norm": 0.3971433639526367, + "learning_rate": 0.00018657117976578848, + "loss": 1.4341, + "step": 5182 + }, + { + "epoch": 0.06735072111597143, + "grad_norm": 0.2910483777523041, + "learning_rate": 0.00018656858030387713, + "loss": 1.3133, + "step": 5183 + }, + { + "epoch": 0.0673637156598873, + "grad_norm": 0.3295382261276245, + "learning_rate": 0.00018656598084196573, + "loss": 1.3246, + "step": 5184 + }, + { + "epoch": 0.06737671020380318, + "grad_norm": 0.3677801787853241, + "learning_rate": 0.00018656338138005432, + "loss": 1.1968, + "step": 5185 + }, + { + "epoch": 0.06738970474771905, + "grad_norm": 0.3704491853713989, + "learning_rate": 0.00018656078191814295, + "loss": 1.3783, + "step": 5186 + }, + { + "epoch": 0.06740269929163492, + "grad_norm": 0.30574753880500793, + "learning_rate": 0.00018655818245623157, + "loss": 1.3477, + "step": 5187 + }, + { + "epoch": 0.0674156938355508, + "grad_norm": 0.3611847758293152, + "learning_rate": 0.0001865555829943202, + "loss": 1.4376, + "step": 5188 + }, + { + "epoch": 0.06742868837946667, + "grad_norm": 0.3521232604980469, + "learning_rate": 0.0001865529835324088, + "loss": 1.4912, + "step": 5189 + }, + { + "epoch": 0.06744168292338254, + "grad_norm": 0.33566415309906006, + "learning_rate": 0.00018655038407049742, + "loss": 1.4116, + "step": 5190 + }, + { + "epoch": 0.06745467746729841, + "grad_norm": 0.3886506259441376, + "learning_rate": 0.00018654778460858604, + "loss": 1.4885, + "step": 5191 + }, + { + "epoch": 0.06746767201121429, + "grad_norm": 0.3376752734184265, + "learning_rate": 0.00018654518514667464, + "loss": 1.3143, + "step": 5192 + }, + { + "epoch": 0.06748066655513016, + "grad_norm": 0.45552459359169006, + "learning_rate": 0.00018654258568476327, + "loss": 1.472, + "step": 5193 + }, + { + "epoch": 0.06749366109904603, + "grad_norm": 0.3657516539096832, + "learning_rate": 0.00018653998622285186, + "loss": 1.4285, + "step": 5194 + }, + { + "epoch": 0.0675066556429619, + "grad_norm": 0.4129646420478821, + "learning_rate": 0.00018653738676094051, + "loss": 1.4733, + "step": 5195 + }, + { + "epoch": 0.06751965018687779, + "grad_norm": 0.4120321571826935, + "learning_rate": 0.0001865347872990291, + "loss": 1.5079, + "step": 5196 + }, + { + "epoch": 0.06753264473079366, + "grad_norm": 0.4931769371032715, + "learning_rate": 0.0001865321878371177, + "loss": 1.4502, + "step": 5197 + }, + { + "epoch": 0.06754563927470954, + "grad_norm": 0.31527647376060486, + "learning_rate": 0.00018652958837520633, + "loss": 1.4159, + "step": 5198 + }, + { + "epoch": 0.06755863381862541, + "grad_norm": 0.42977553606033325, + "learning_rate": 0.00018652698891329496, + "loss": 1.277, + "step": 5199 + }, + { + "epoch": 0.06757162836254128, + "grad_norm": 0.3936081528663635, + "learning_rate": 0.00018652438945138358, + "loss": 1.3278, + "step": 5200 + }, + { + "epoch": 0.06758462290645716, + "grad_norm": 0.38466107845306396, + "learning_rate": 0.00018652178998947218, + "loss": 1.3467, + "step": 5201 + }, + { + "epoch": 0.06759761745037303, + "grad_norm": 0.3167068362236023, + "learning_rate": 0.0001865191905275608, + "loss": 1.3721, + "step": 5202 + }, + { + "epoch": 0.0676106119942889, + "grad_norm": 0.4390086233615875, + "learning_rate": 0.00018651659106564943, + "loss": 1.4633, + "step": 5203 + }, + { + "epoch": 0.06762360653820478, + "grad_norm": 0.3464265763759613, + "learning_rate": 0.00018651399160373803, + "loss": 1.5348, + "step": 5204 + }, + { + "epoch": 0.06763660108212065, + "grad_norm": 0.46605727076530457, + "learning_rate": 0.00018651139214182665, + "loss": 1.4266, + "step": 5205 + }, + { + "epoch": 0.06764959562603652, + "grad_norm": 0.31618010997772217, + "learning_rate": 0.00018650879267991525, + "loss": 1.4348, + "step": 5206 + }, + { + "epoch": 0.0676625901699524, + "grad_norm": 0.41530296206474304, + "learning_rate": 0.0001865061932180039, + "loss": 1.366, + "step": 5207 + }, + { + "epoch": 0.06767558471386827, + "grad_norm": 0.5661311149597168, + "learning_rate": 0.0001865035937560925, + "loss": 1.564, + "step": 5208 + }, + { + "epoch": 0.06768857925778414, + "grad_norm": 0.3350794315338135, + "learning_rate": 0.00018650099429418112, + "loss": 1.2329, + "step": 5209 + }, + { + "epoch": 0.06770157380170001, + "grad_norm": 0.3374537527561188, + "learning_rate": 0.00018649839483226972, + "loss": 1.2643, + "step": 5210 + }, + { + "epoch": 0.06771456834561589, + "grad_norm": 0.4399222433567047, + "learning_rate": 0.00018649579537035834, + "loss": 1.3484, + "step": 5211 + }, + { + "epoch": 0.06772756288953176, + "grad_norm": 0.40509864687919617, + "learning_rate": 0.00018649319590844697, + "loss": 1.7185, + "step": 5212 + }, + { + "epoch": 0.06774055743344763, + "grad_norm": 0.3245294392108917, + "learning_rate": 0.00018649059644653557, + "loss": 1.3554, + "step": 5213 + }, + { + "epoch": 0.0677535519773635, + "grad_norm": 0.32566478848457336, + "learning_rate": 0.0001864879969846242, + "loss": 1.4119, + "step": 5214 + }, + { + "epoch": 0.06776654652127938, + "grad_norm": 0.38132572174072266, + "learning_rate": 0.00018648539752271281, + "loss": 1.4767, + "step": 5215 + }, + { + "epoch": 0.06777954106519525, + "grad_norm": 0.3775690495967865, + "learning_rate": 0.0001864827980608014, + "loss": 1.4809, + "step": 5216 + }, + { + "epoch": 0.06779253560911112, + "grad_norm": 0.4051850736141205, + "learning_rate": 0.00018648019859889004, + "loss": 1.5445, + "step": 5217 + }, + { + "epoch": 0.067805530153027, + "grad_norm": 0.44299229979515076, + "learning_rate": 0.00018647759913697866, + "loss": 1.4679, + "step": 5218 + }, + { + "epoch": 0.06781852469694287, + "grad_norm": 0.4380272626876831, + "learning_rate": 0.00018647499967506728, + "loss": 1.2535, + "step": 5219 + }, + { + "epoch": 0.06783151924085874, + "grad_norm": 0.44665494561195374, + "learning_rate": 0.00018647240021315588, + "loss": 1.3511, + "step": 5220 + }, + { + "epoch": 0.06784451378477462, + "grad_norm": 0.33580636978149414, + "learning_rate": 0.0001864698007512445, + "loss": 1.231, + "step": 5221 + }, + { + "epoch": 0.06785750832869049, + "grad_norm": 0.3992922306060791, + "learning_rate": 0.00018646720128933313, + "loss": 1.212, + "step": 5222 + }, + { + "epoch": 0.06787050287260636, + "grad_norm": 0.44707828760147095, + "learning_rate": 0.00018646460182742173, + "loss": 1.294, + "step": 5223 + }, + { + "epoch": 0.06788349741652223, + "grad_norm": 0.3758302330970764, + "learning_rate": 0.00018646200236551035, + "loss": 1.2983, + "step": 5224 + }, + { + "epoch": 0.06789649196043811, + "grad_norm": 0.38771140575408936, + "learning_rate": 0.00018645940290359895, + "loss": 1.4407, + "step": 5225 + }, + { + "epoch": 0.06790948650435398, + "grad_norm": 0.2942237854003906, + "learning_rate": 0.0001864568034416876, + "loss": 1.3648, + "step": 5226 + }, + { + "epoch": 0.06792248104826985, + "grad_norm": 0.3121523857116699, + "learning_rate": 0.0001864542039797762, + "loss": 1.3511, + "step": 5227 + }, + { + "epoch": 0.06793547559218573, + "grad_norm": 0.44604796171188354, + "learning_rate": 0.0001864516045178648, + "loss": 1.4367, + "step": 5228 + }, + { + "epoch": 0.0679484701361016, + "grad_norm": 0.4803444743156433, + "learning_rate": 0.00018644900505595342, + "loss": 1.4321, + "step": 5229 + }, + { + "epoch": 0.06796146468001747, + "grad_norm": 0.4141957759857178, + "learning_rate": 0.00018644640559404205, + "loss": 1.5444, + "step": 5230 + }, + { + "epoch": 0.06797445922393335, + "grad_norm": 0.364609032869339, + "learning_rate": 0.00018644380613213067, + "loss": 1.363, + "step": 5231 + }, + { + "epoch": 0.06798745376784922, + "grad_norm": 0.44600462913513184, + "learning_rate": 0.00018644120667021927, + "loss": 1.5828, + "step": 5232 + }, + { + "epoch": 0.06800044831176509, + "grad_norm": 0.3881787657737732, + "learning_rate": 0.0001864386072083079, + "loss": 1.4084, + "step": 5233 + }, + { + "epoch": 0.06801344285568098, + "grad_norm": 0.5400509238243103, + "learning_rate": 0.00018643600774639652, + "loss": 1.5342, + "step": 5234 + }, + { + "epoch": 0.06802643739959685, + "grad_norm": 0.4942297041416168, + "learning_rate": 0.00018643340828448511, + "loss": 1.6631, + "step": 5235 + }, + { + "epoch": 0.06803943194351272, + "grad_norm": 0.33731839060783386, + "learning_rate": 0.00018643080882257374, + "loss": 1.5178, + "step": 5236 + }, + { + "epoch": 0.0680524264874286, + "grad_norm": 0.3498685956001282, + "learning_rate": 0.00018642820936066234, + "loss": 1.3351, + "step": 5237 + }, + { + "epoch": 0.06806542103134447, + "grad_norm": 0.4106043875217438, + "learning_rate": 0.000186425609898751, + "loss": 1.4977, + "step": 5238 + }, + { + "epoch": 0.06807841557526034, + "grad_norm": 0.36881181597709656, + "learning_rate": 0.00018642301043683958, + "loss": 1.4291, + "step": 5239 + }, + { + "epoch": 0.06809141011917622, + "grad_norm": 0.42651116847991943, + "learning_rate": 0.00018642041097492818, + "loss": 1.4475, + "step": 5240 + }, + { + "epoch": 0.06810440466309209, + "grad_norm": 0.38923752307891846, + "learning_rate": 0.0001864178115130168, + "loss": 1.3683, + "step": 5241 + }, + { + "epoch": 0.06811739920700796, + "grad_norm": 0.3672979772090912, + "learning_rate": 0.00018641521205110543, + "loss": 1.3896, + "step": 5242 + }, + { + "epoch": 0.06813039375092383, + "grad_norm": 0.3511219620704651, + "learning_rate": 0.00018641261258919406, + "loss": 1.291, + "step": 5243 + }, + { + "epoch": 0.06814338829483971, + "grad_norm": 0.3356342017650604, + "learning_rate": 0.00018641001312728265, + "loss": 1.4553, + "step": 5244 + }, + { + "epoch": 0.06815638283875558, + "grad_norm": 0.36938512325286865, + "learning_rate": 0.00018640741366537128, + "loss": 1.282, + "step": 5245 + }, + { + "epoch": 0.06816937738267145, + "grad_norm": 0.37965676188468933, + "learning_rate": 0.0001864048142034599, + "loss": 1.3674, + "step": 5246 + }, + { + "epoch": 0.06818237192658733, + "grad_norm": 0.32190224528312683, + "learning_rate": 0.0001864022147415485, + "loss": 1.475, + "step": 5247 + }, + { + "epoch": 0.0681953664705032, + "grad_norm": 0.377675324678421, + "learning_rate": 0.00018639961527963712, + "loss": 1.6903, + "step": 5248 + }, + { + "epoch": 0.06820836101441907, + "grad_norm": 0.45027559995651245, + "learning_rate": 0.00018639701581772572, + "loss": 1.6195, + "step": 5249 + }, + { + "epoch": 0.06822135555833495, + "grad_norm": 0.38764241337776184, + "learning_rate": 0.00018639441635581437, + "loss": 1.3975, + "step": 5250 + }, + { + "epoch": 0.06823435010225082, + "grad_norm": 0.3362537622451782, + "learning_rate": 0.00018639181689390297, + "loss": 1.2832, + "step": 5251 + }, + { + "epoch": 0.06824734464616669, + "grad_norm": 0.48953282833099365, + "learning_rate": 0.00018638921743199157, + "loss": 1.7233, + "step": 5252 + }, + { + "epoch": 0.06826033919008256, + "grad_norm": 0.3870137631893158, + "learning_rate": 0.00018638661797008022, + "loss": 1.5522, + "step": 5253 + }, + { + "epoch": 0.06827333373399844, + "grad_norm": 0.4046284258365631, + "learning_rate": 0.00018638401850816882, + "loss": 1.3826, + "step": 5254 + }, + { + "epoch": 0.06828632827791431, + "grad_norm": 0.45206111669540405, + "learning_rate": 0.00018638141904625744, + "loss": 1.5993, + "step": 5255 + }, + { + "epoch": 0.06829932282183018, + "grad_norm": 0.35778701305389404, + "learning_rate": 0.00018637881958434604, + "loss": 1.6784, + "step": 5256 + }, + { + "epoch": 0.06831231736574606, + "grad_norm": 0.4111640453338623, + "learning_rate": 0.00018637622012243466, + "loss": 1.3314, + "step": 5257 + }, + { + "epoch": 0.06832531190966193, + "grad_norm": 0.4049829840660095, + "learning_rate": 0.0001863736206605233, + "loss": 1.5166, + "step": 5258 + }, + { + "epoch": 0.0683383064535778, + "grad_norm": 0.4178505837917328, + "learning_rate": 0.00018637102119861188, + "loss": 1.4619, + "step": 5259 + }, + { + "epoch": 0.06835130099749367, + "grad_norm": 0.3455483317375183, + "learning_rate": 0.0001863684217367005, + "loss": 1.5682, + "step": 5260 + }, + { + "epoch": 0.06836429554140955, + "grad_norm": 0.34195348620414734, + "learning_rate": 0.00018636582227478913, + "loss": 1.2803, + "step": 5261 + }, + { + "epoch": 0.06837729008532542, + "grad_norm": 0.464097261428833, + "learning_rate": 0.00018636322281287776, + "loss": 1.4458, + "step": 5262 + }, + { + "epoch": 0.0683902846292413, + "grad_norm": 0.47718513011932373, + "learning_rate": 0.00018636062335096636, + "loss": 1.6724, + "step": 5263 + }, + { + "epoch": 0.06840327917315717, + "grad_norm": 0.4567102789878845, + "learning_rate": 0.00018635802388905498, + "loss": 1.5884, + "step": 5264 + }, + { + "epoch": 0.06841627371707304, + "grad_norm": 0.3462975025177002, + "learning_rate": 0.0001863554244271436, + "loss": 1.297, + "step": 5265 + }, + { + "epoch": 0.06842926826098891, + "grad_norm": 0.3322049677371979, + "learning_rate": 0.0001863528249652322, + "loss": 1.385, + "step": 5266 + }, + { + "epoch": 0.06844226280490479, + "grad_norm": 0.4266486167907715, + "learning_rate": 0.00018635022550332083, + "loss": 1.4837, + "step": 5267 + }, + { + "epoch": 0.06845525734882066, + "grad_norm": 0.4028988778591156, + "learning_rate": 0.00018634762604140942, + "loss": 1.4101, + "step": 5268 + }, + { + "epoch": 0.06846825189273653, + "grad_norm": 0.46043241024017334, + "learning_rate": 0.00018634502657949805, + "loss": 1.3826, + "step": 5269 + }, + { + "epoch": 0.0684812464366524, + "grad_norm": 0.3606872260570526, + "learning_rate": 0.00018634242711758667, + "loss": 1.629, + "step": 5270 + }, + { + "epoch": 0.06849424098056828, + "grad_norm": 0.36665815114974976, + "learning_rate": 0.00018633982765567527, + "loss": 1.4191, + "step": 5271 + }, + { + "epoch": 0.06850723552448416, + "grad_norm": 0.3493126630783081, + "learning_rate": 0.0001863372281937639, + "loss": 1.4933, + "step": 5272 + }, + { + "epoch": 0.06852023006840004, + "grad_norm": 0.35804951190948486, + "learning_rate": 0.00018633462873185252, + "loss": 1.3218, + "step": 5273 + }, + { + "epoch": 0.06853322461231591, + "grad_norm": 0.41491150856018066, + "learning_rate": 0.00018633202926994114, + "loss": 1.3729, + "step": 5274 + }, + { + "epoch": 0.06854621915623178, + "grad_norm": 0.43768370151519775, + "learning_rate": 0.00018632942980802974, + "loss": 1.5797, + "step": 5275 + }, + { + "epoch": 0.06855921370014766, + "grad_norm": 0.308243066072464, + "learning_rate": 0.00018632683034611837, + "loss": 1.4119, + "step": 5276 + }, + { + "epoch": 0.06857220824406353, + "grad_norm": 0.44702255725860596, + "learning_rate": 0.000186324230884207, + "loss": 1.3934, + "step": 5277 + }, + { + "epoch": 0.0685852027879794, + "grad_norm": 0.2788557708263397, + "learning_rate": 0.0001863216314222956, + "loss": 1.2067, + "step": 5278 + }, + { + "epoch": 0.06859819733189527, + "grad_norm": 0.37357908487319946, + "learning_rate": 0.0001863190319603842, + "loss": 1.4587, + "step": 5279 + }, + { + "epoch": 0.06861119187581115, + "grad_norm": 0.42723459005355835, + "learning_rate": 0.0001863164324984728, + "loss": 1.4007, + "step": 5280 + }, + { + "epoch": 0.06862418641972702, + "grad_norm": 0.5236778855323792, + "learning_rate": 0.00018631383303656143, + "loss": 1.4214, + "step": 5281 + }, + { + "epoch": 0.0686371809636429, + "grad_norm": 0.36598870158195496, + "learning_rate": 0.00018631123357465006, + "loss": 1.4405, + "step": 5282 + }, + { + "epoch": 0.06865017550755877, + "grad_norm": 0.3029235601425171, + "learning_rate": 0.00018630863411273866, + "loss": 1.5244, + "step": 5283 + }, + { + "epoch": 0.06866317005147464, + "grad_norm": 0.40026795864105225, + "learning_rate": 0.00018630603465082728, + "loss": 1.5395, + "step": 5284 + }, + { + "epoch": 0.06867616459539051, + "grad_norm": 0.2823532223701477, + "learning_rate": 0.0001863034351889159, + "loss": 1.2154, + "step": 5285 + }, + { + "epoch": 0.06868915913930639, + "grad_norm": 0.3997417390346527, + "learning_rate": 0.00018630083572700453, + "loss": 1.4944, + "step": 5286 + }, + { + "epoch": 0.06870215368322226, + "grad_norm": 0.43358761072158813, + "learning_rate": 0.00018629823626509313, + "loss": 1.5712, + "step": 5287 + }, + { + "epoch": 0.06871514822713813, + "grad_norm": 0.4592825770378113, + "learning_rate": 0.00018629563680318175, + "loss": 1.3432, + "step": 5288 + }, + { + "epoch": 0.068728142771054, + "grad_norm": 0.3673540949821472, + "learning_rate": 0.00018629303734127038, + "loss": 1.3306, + "step": 5289 + }, + { + "epoch": 0.06874113731496988, + "grad_norm": 0.42722219228744507, + "learning_rate": 0.00018629043787935897, + "loss": 1.425, + "step": 5290 + }, + { + "epoch": 0.06875413185888575, + "grad_norm": 0.35838770866394043, + "learning_rate": 0.0001862878384174476, + "loss": 1.4759, + "step": 5291 + }, + { + "epoch": 0.06876712640280162, + "grad_norm": 0.29523012042045593, + "learning_rate": 0.00018628523895553622, + "loss": 1.4983, + "step": 5292 + }, + { + "epoch": 0.0687801209467175, + "grad_norm": 0.4709910750389099, + "learning_rate": 0.00018628263949362485, + "loss": 1.3785, + "step": 5293 + }, + { + "epoch": 0.06879311549063337, + "grad_norm": 0.47797682881355286, + "learning_rate": 0.00018628004003171344, + "loss": 1.526, + "step": 5294 + }, + { + "epoch": 0.06880611003454924, + "grad_norm": 0.28782710433006287, + "learning_rate": 0.00018627744056980204, + "loss": 1.3666, + "step": 5295 + }, + { + "epoch": 0.06881910457846512, + "grad_norm": 0.39402514696121216, + "learning_rate": 0.0001862748411078907, + "loss": 1.3459, + "step": 5296 + }, + { + "epoch": 0.06883209912238099, + "grad_norm": 0.3537048101425171, + "learning_rate": 0.0001862722416459793, + "loss": 1.4959, + "step": 5297 + }, + { + "epoch": 0.06884509366629686, + "grad_norm": 0.32225099205970764, + "learning_rate": 0.00018626964218406791, + "loss": 1.2476, + "step": 5298 + }, + { + "epoch": 0.06885808821021273, + "grad_norm": 0.38983437418937683, + "learning_rate": 0.0001862670427221565, + "loss": 1.5324, + "step": 5299 + }, + { + "epoch": 0.0688710827541286, + "grad_norm": 0.3756939470767975, + "learning_rate": 0.00018626444326024514, + "loss": 1.4344, + "step": 5300 + }, + { + "epoch": 0.06888407729804448, + "grad_norm": 0.34896355867385864, + "learning_rate": 0.00018626184379833376, + "loss": 1.3561, + "step": 5301 + }, + { + "epoch": 0.06889707184196035, + "grad_norm": 0.42439353466033936, + "learning_rate": 0.00018625924433642236, + "loss": 1.3665, + "step": 5302 + }, + { + "epoch": 0.06891006638587623, + "grad_norm": 0.45926398038864136, + "learning_rate": 0.00018625664487451098, + "loss": 1.3234, + "step": 5303 + }, + { + "epoch": 0.0689230609297921, + "grad_norm": 0.34778714179992676, + "learning_rate": 0.0001862540454125996, + "loss": 1.387, + "step": 5304 + }, + { + "epoch": 0.06893605547370797, + "grad_norm": 0.4572735130786896, + "learning_rate": 0.00018625144595068823, + "loss": 1.4087, + "step": 5305 + }, + { + "epoch": 0.06894905001762384, + "grad_norm": 0.3111006021499634, + "learning_rate": 0.00018624884648877683, + "loss": 1.1695, + "step": 5306 + }, + { + "epoch": 0.06896204456153972, + "grad_norm": 0.35482633113861084, + "learning_rate": 0.00018624624702686543, + "loss": 1.4407, + "step": 5307 + }, + { + "epoch": 0.06897503910545559, + "grad_norm": 0.47263920307159424, + "learning_rate": 0.00018624364756495408, + "loss": 1.5393, + "step": 5308 + }, + { + "epoch": 0.06898803364937146, + "grad_norm": 0.4116426408290863, + "learning_rate": 0.00018624104810304268, + "loss": 1.4939, + "step": 5309 + }, + { + "epoch": 0.06900102819328735, + "grad_norm": 0.4379853904247284, + "learning_rate": 0.0001862384486411313, + "loss": 1.4931, + "step": 5310 + }, + { + "epoch": 0.06901402273720322, + "grad_norm": 0.3272196352481842, + "learning_rate": 0.0001862358491792199, + "loss": 1.3136, + "step": 5311 + }, + { + "epoch": 0.0690270172811191, + "grad_norm": 0.3781753182411194, + "learning_rate": 0.00018623324971730852, + "loss": 1.3917, + "step": 5312 + }, + { + "epoch": 0.06904001182503497, + "grad_norm": 0.3424317538738251, + "learning_rate": 0.00018623065025539715, + "loss": 1.4196, + "step": 5313 + }, + { + "epoch": 0.06905300636895084, + "grad_norm": 0.4061320126056671, + "learning_rate": 0.00018622805079348574, + "loss": 1.3784, + "step": 5314 + }, + { + "epoch": 0.06906600091286672, + "grad_norm": 0.2918657660484314, + "learning_rate": 0.00018622545133157437, + "loss": 1.4452, + "step": 5315 + }, + { + "epoch": 0.06907899545678259, + "grad_norm": 0.36582809686660767, + "learning_rate": 0.000186222851869663, + "loss": 1.413, + "step": 5316 + }, + { + "epoch": 0.06909199000069846, + "grad_norm": 0.3369402587413788, + "learning_rate": 0.00018622025240775162, + "loss": 1.5082, + "step": 5317 + }, + { + "epoch": 0.06910498454461433, + "grad_norm": 0.48518049716949463, + "learning_rate": 0.00018621765294584021, + "loss": 1.3053, + "step": 5318 + }, + { + "epoch": 0.06911797908853021, + "grad_norm": 0.42002245783805847, + "learning_rate": 0.0001862150534839288, + "loss": 1.5164, + "step": 5319 + }, + { + "epoch": 0.06913097363244608, + "grad_norm": 0.32677462697029114, + "learning_rate": 0.00018621245402201746, + "loss": 1.2893, + "step": 5320 + }, + { + "epoch": 0.06914396817636195, + "grad_norm": 0.3690161108970642, + "learning_rate": 0.00018620985456010606, + "loss": 1.5781, + "step": 5321 + }, + { + "epoch": 0.06915696272027783, + "grad_norm": 0.4283229410648346, + "learning_rate": 0.00018620725509819469, + "loss": 1.3131, + "step": 5322 + }, + { + "epoch": 0.0691699572641937, + "grad_norm": 0.4192677140235901, + "learning_rate": 0.00018620465563628328, + "loss": 1.6219, + "step": 5323 + }, + { + "epoch": 0.06918295180810957, + "grad_norm": 0.27933207154273987, + "learning_rate": 0.0001862020561743719, + "loss": 1.2215, + "step": 5324 + }, + { + "epoch": 0.06919594635202544, + "grad_norm": 0.37069013714790344, + "learning_rate": 0.00018619945671246053, + "loss": 1.4709, + "step": 5325 + }, + { + "epoch": 0.06920894089594132, + "grad_norm": 0.47909682989120483, + "learning_rate": 0.00018619685725054913, + "loss": 1.6683, + "step": 5326 + }, + { + "epoch": 0.06922193543985719, + "grad_norm": 0.38398486375808716, + "learning_rate": 0.00018619425778863778, + "loss": 1.4308, + "step": 5327 + }, + { + "epoch": 0.06923492998377306, + "grad_norm": 0.5003829002380371, + "learning_rate": 0.00018619165832672638, + "loss": 1.58, + "step": 5328 + }, + { + "epoch": 0.06924792452768894, + "grad_norm": 0.42183107137680054, + "learning_rate": 0.000186189058864815, + "loss": 1.3787, + "step": 5329 + }, + { + "epoch": 0.06926091907160481, + "grad_norm": 0.4104776084423065, + "learning_rate": 0.0001861864594029036, + "loss": 1.3821, + "step": 5330 + }, + { + "epoch": 0.06927391361552068, + "grad_norm": 0.3703576326370239, + "learning_rate": 0.00018618385994099222, + "loss": 1.4411, + "step": 5331 + }, + { + "epoch": 0.06928690815943656, + "grad_norm": 0.4419730007648468, + "learning_rate": 0.00018618126047908085, + "loss": 1.4491, + "step": 5332 + }, + { + "epoch": 0.06929990270335243, + "grad_norm": 0.4102279543876648, + "learning_rate": 0.00018617866101716945, + "loss": 1.3687, + "step": 5333 + }, + { + "epoch": 0.0693128972472683, + "grad_norm": 0.43752139806747437, + "learning_rate": 0.00018617606155525807, + "loss": 1.5106, + "step": 5334 + }, + { + "epoch": 0.06932589179118417, + "grad_norm": 0.3585141897201538, + "learning_rate": 0.0001861734620933467, + "loss": 1.5743, + "step": 5335 + }, + { + "epoch": 0.06933888633510005, + "grad_norm": 0.475147008895874, + "learning_rate": 0.0001861708626314353, + "loss": 1.5227, + "step": 5336 + }, + { + "epoch": 0.06935188087901592, + "grad_norm": 0.40625354647636414, + "learning_rate": 0.00018616826316952392, + "loss": 1.6361, + "step": 5337 + }, + { + "epoch": 0.0693648754229318, + "grad_norm": 0.34236976504325867, + "learning_rate": 0.00018616566370761251, + "loss": 1.4576, + "step": 5338 + }, + { + "epoch": 0.06937786996684767, + "grad_norm": 0.38249075412750244, + "learning_rate": 0.00018616306424570117, + "loss": 1.1979, + "step": 5339 + }, + { + "epoch": 0.06939086451076354, + "grad_norm": 0.3851509690284729, + "learning_rate": 0.00018616046478378976, + "loss": 1.5422, + "step": 5340 + }, + { + "epoch": 0.06940385905467941, + "grad_norm": 0.3492293059825897, + "learning_rate": 0.0001861578653218784, + "loss": 1.4181, + "step": 5341 + }, + { + "epoch": 0.06941685359859529, + "grad_norm": 0.5009096264839172, + "learning_rate": 0.00018615526585996699, + "loss": 1.5556, + "step": 5342 + }, + { + "epoch": 0.06942984814251116, + "grad_norm": 0.3898882269859314, + "learning_rate": 0.0001861526663980556, + "loss": 1.5137, + "step": 5343 + }, + { + "epoch": 0.06944284268642703, + "grad_norm": 0.3906683027744293, + "learning_rate": 0.00018615006693614423, + "loss": 1.3354, + "step": 5344 + }, + { + "epoch": 0.0694558372303429, + "grad_norm": 0.3135431706905365, + "learning_rate": 0.00018614746747423283, + "loss": 1.398, + "step": 5345 + }, + { + "epoch": 0.06946883177425878, + "grad_norm": 0.3445374071598053, + "learning_rate": 0.00018614486801232146, + "loss": 1.3836, + "step": 5346 + }, + { + "epoch": 0.06948182631817465, + "grad_norm": 0.42195379734039307, + "learning_rate": 0.00018614226855041008, + "loss": 1.399, + "step": 5347 + }, + { + "epoch": 0.06949482086209054, + "grad_norm": 0.3718344569206238, + "learning_rate": 0.0001861396690884987, + "loss": 1.3381, + "step": 5348 + }, + { + "epoch": 0.06950781540600641, + "grad_norm": 0.4483731687068939, + "learning_rate": 0.0001861370696265873, + "loss": 1.5374, + "step": 5349 + }, + { + "epoch": 0.06952080994992228, + "grad_norm": 0.41100406646728516, + "learning_rate": 0.0001861344701646759, + "loss": 1.593, + "step": 5350 + }, + { + "epoch": 0.06953380449383816, + "grad_norm": 0.30381742119789124, + "learning_rate": 0.00018613187070276455, + "loss": 1.3517, + "step": 5351 + }, + { + "epoch": 0.06954679903775403, + "grad_norm": 0.35975003242492676, + "learning_rate": 0.00018612927124085315, + "loss": 1.4362, + "step": 5352 + }, + { + "epoch": 0.0695597935816699, + "grad_norm": 0.46031081676483154, + "learning_rate": 0.00018612667177894177, + "loss": 1.6122, + "step": 5353 + }, + { + "epoch": 0.06957278812558577, + "grad_norm": 0.4003441333770752, + "learning_rate": 0.00018612407231703037, + "loss": 1.4936, + "step": 5354 + }, + { + "epoch": 0.06958578266950165, + "grad_norm": 0.310420423746109, + "learning_rate": 0.000186121472855119, + "loss": 1.5211, + "step": 5355 + }, + { + "epoch": 0.06959877721341752, + "grad_norm": 0.46783626079559326, + "learning_rate": 0.00018611887339320762, + "loss": 1.3485, + "step": 5356 + }, + { + "epoch": 0.0696117717573334, + "grad_norm": 0.46449097990989685, + "learning_rate": 0.00018611627393129622, + "loss": 1.6073, + "step": 5357 + }, + { + "epoch": 0.06962476630124927, + "grad_norm": 0.404897540807724, + "learning_rate": 0.00018611367446938484, + "loss": 1.4101, + "step": 5358 + }, + { + "epoch": 0.06963776084516514, + "grad_norm": 0.46192020177841187, + "learning_rate": 0.00018611107500747347, + "loss": 1.4044, + "step": 5359 + }, + { + "epoch": 0.06965075538908101, + "grad_norm": 0.36133280396461487, + "learning_rate": 0.0001861084755455621, + "loss": 1.4238, + "step": 5360 + }, + { + "epoch": 0.06966374993299689, + "grad_norm": 0.24162161350250244, + "learning_rate": 0.0001861058760836507, + "loss": 1.1458, + "step": 5361 + }, + { + "epoch": 0.06967674447691276, + "grad_norm": 0.3562941551208496, + "learning_rate": 0.0001861032766217393, + "loss": 1.3348, + "step": 5362 + }, + { + "epoch": 0.06968973902082863, + "grad_norm": 0.2858433127403259, + "learning_rate": 0.00018610067715982794, + "loss": 1.4673, + "step": 5363 + }, + { + "epoch": 0.0697027335647445, + "grad_norm": 0.33209657669067383, + "learning_rate": 0.00018609807769791653, + "loss": 1.449, + "step": 5364 + }, + { + "epoch": 0.06971572810866038, + "grad_norm": 0.3816041052341461, + "learning_rate": 0.00018609547823600516, + "loss": 1.5225, + "step": 5365 + }, + { + "epoch": 0.06972872265257625, + "grad_norm": 0.4275282025337219, + "learning_rate": 0.00018609287877409378, + "loss": 1.4657, + "step": 5366 + }, + { + "epoch": 0.06974171719649212, + "grad_norm": 0.5272087454795837, + "learning_rate": 0.00018609027931218238, + "loss": 1.58, + "step": 5367 + }, + { + "epoch": 0.069754711740408, + "grad_norm": 0.3843916952610016, + "learning_rate": 0.000186087679850271, + "loss": 1.368, + "step": 5368 + }, + { + "epoch": 0.06976770628432387, + "grad_norm": 0.5260601043701172, + "learning_rate": 0.0001860850803883596, + "loss": 1.3693, + "step": 5369 + }, + { + "epoch": 0.06978070082823974, + "grad_norm": 0.35798612236976624, + "learning_rate": 0.00018608248092644825, + "loss": 1.5617, + "step": 5370 + }, + { + "epoch": 0.06979369537215561, + "grad_norm": 0.37765389680862427, + "learning_rate": 0.00018607988146453685, + "loss": 1.5099, + "step": 5371 + }, + { + "epoch": 0.06980668991607149, + "grad_norm": 0.37394413352012634, + "learning_rate": 0.00018607728200262548, + "loss": 1.261, + "step": 5372 + }, + { + "epoch": 0.06981968445998736, + "grad_norm": 0.339111328125, + "learning_rate": 0.00018607468254071407, + "loss": 1.2864, + "step": 5373 + }, + { + "epoch": 0.06983267900390323, + "grad_norm": 0.38882938027381897, + "learning_rate": 0.0001860720830788027, + "loss": 1.3353, + "step": 5374 + }, + { + "epoch": 0.0698456735478191, + "grad_norm": 0.42675748467445374, + "learning_rate": 0.00018606948361689132, + "loss": 1.4789, + "step": 5375 + }, + { + "epoch": 0.06985866809173498, + "grad_norm": 0.35793524980545044, + "learning_rate": 0.00018606688415497992, + "loss": 1.3242, + "step": 5376 + }, + { + "epoch": 0.06987166263565085, + "grad_norm": 0.4599798321723938, + "learning_rate": 0.00018606428469306854, + "loss": 1.5116, + "step": 5377 + }, + { + "epoch": 0.06988465717956673, + "grad_norm": 0.5006440281867981, + "learning_rate": 0.00018606168523115717, + "loss": 1.4687, + "step": 5378 + }, + { + "epoch": 0.0698976517234826, + "grad_norm": 0.4113091826438904, + "learning_rate": 0.00018605908576924577, + "loss": 1.3881, + "step": 5379 + }, + { + "epoch": 0.06991064626739847, + "grad_norm": 0.4981347322463989, + "learning_rate": 0.0001860564863073344, + "loss": 1.5034, + "step": 5380 + }, + { + "epoch": 0.06992364081131434, + "grad_norm": 0.2606261670589447, + "learning_rate": 0.000186053886845423, + "loss": 1.3155, + "step": 5381 + }, + { + "epoch": 0.06993663535523022, + "grad_norm": 0.43710580468177795, + "learning_rate": 0.00018605128738351164, + "loss": 1.4758, + "step": 5382 + }, + { + "epoch": 0.06994962989914609, + "grad_norm": 0.4450712203979492, + "learning_rate": 0.00018604868792160024, + "loss": 1.3672, + "step": 5383 + }, + { + "epoch": 0.06996262444306196, + "grad_norm": 0.43217140436172485, + "learning_rate": 0.00018604608845968886, + "loss": 1.5189, + "step": 5384 + }, + { + "epoch": 0.06997561898697784, + "grad_norm": 0.4858933687210083, + "learning_rate": 0.00018604348899777746, + "loss": 1.4209, + "step": 5385 + }, + { + "epoch": 0.06998861353089371, + "grad_norm": 0.4423260986804962, + "learning_rate": 0.00018604088953586608, + "loss": 1.4636, + "step": 5386 + }, + { + "epoch": 0.0700016080748096, + "grad_norm": 0.43281981348991394, + "learning_rate": 0.0001860382900739547, + "loss": 1.4459, + "step": 5387 + }, + { + "epoch": 0.07001460261872547, + "grad_norm": 0.5359558463096619, + "learning_rate": 0.0001860356906120433, + "loss": 1.5114, + "step": 5388 + }, + { + "epoch": 0.07002759716264134, + "grad_norm": 0.39268165826797485, + "learning_rate": 0.00018603309115013193, + "loss": 1.3415, + "step": 5389 + }, + { + "epoch": 0.07004059170655721, + "grad_norm": 0.4200558364391327, + "learning_rate": 0.00018603049168822055, + "loss": 1.3982, + "step": 5390 + }, + { + "epoch": 0.07005358625047309, + "grad_norm": 0.5264400243759155, + "learning_rate": 0.00018602789222630915, + "loss": 1.6304, + "step": 5391 + }, + { + "epoch": 0.07006658079438896, + "grad_norm": 0.4133051931858063, + "learning_rate": 0.00018602529276439778, + "loss": 1.5549, + "step": 5392 + }, + { + "epoch": 0.07007957533830483, + "grad_norm": 0.30834338068962097, + "learning_rate": 0.00018602269330248637, + "loss": 1.4716, + "step": 5393 + }, + { + "epoch": 0.0700925698822207, + "grad_norm": 0.45827698707580566, + "learning_rate": 0.00018602009384057502, + "loss": 1.5348, + "step": 5394 + }, + { + "epoch": 0.07010556442613658, + "grad_norm": 0.30906587839126587, + "learning_rate": 0.00018601749437866362, + "loss": 1.3635, + "step": 5395 + }, + { + "epoch": 0.07011855897005245, + "grad_norm": 0.38631874322891235, + "learning_rate": 0.00018601489491675225, + "loss": 1.4959, + "step": 5396 + }, + { + "epoch": 0.07013155351396833, + "grad_norm": 0.3548017144203186, + "learning_rate": 0.00018601229545484084, + "loss": 1.4748, + "step": 5397 + }, + { + "epoch": 0.0701445480578842, + "grad_norm": 0.35564255714416504, + "learning_rate": 0.00018600969599292947, + "loss": 1.4983, + "step": 5398 + }, + { + "epoch": 0.07015754260180007, + "grad_norm": 0.38662031292915344, + "learning_rate": 0.0001860070965310181, + "loss": 1.5811, + "step": 5399 + }, + { + "epoch": 0.07017053714571594, + "grad_norm": 0.40547817945480347, + "learning_rate": 0.0001860044970691067, + "loss": 1.6064, + "step": 5400 + }, + { + "epoch": 0.07018353168963182, + "grad_norm": 0.3626398742198944, + "learning_rate": 0.00018600189760719534, + "loss": 1.0841, + "step": 5401 + }, + { + "epoch": 0.07019652623354769, + "grad_norm": 0.3960350453853607, + "learning_rate": 0.00018599929814528394, + "loss": 1.5816, + "step": 5402 + }, + { + "epoch": 0.07020952077746356, + "grad_norm": 0.3598216772079468, + "learning_rate": 0.00018599669868337254, + "loss": 1.4682, + "step": 5403 + }, + { + "epoch": 0.07022251532137944, + "grad_norm": 0.43670007586479187, + "learning_rate": 0.00018599409922146116, + "loss": 1.503, + "step": 5404 + }, + { + "epoch": 0.07023550986529531, + "grad_norm": 0.4934794306755066, + "learning_rate": 0.00018599149975954979, + "loss": 1.5594, + "step": 5405 + }, + { + "epoch": 0.07024850440921118, + "grad_norm": 0.35070568323135376, + "learning_rate": 0.0001859889002976384, + "loss": 1.4915, + "step": 5406 + }, + { + "epoch": 0.07026149895312706, + "grad_norm": 0.2919926345348358, + "learning_rate": 0.000185986300835727, + "loss": 1.5776, + "step": 5407 + }, + { + "epoch": 0.07027449349704293, + "grad_norm": 0.3852614760398865, + "learning_rate": 0.00018598370137381563, + "loss": 1.3357, + "step": 5408 + }, + { + "epoch": 0.0702874880409588, + "grad_norm": 0.402317076921463, + "learning_rate": 0.00018598110191190426, + "loss": 1.4288, + "step": 5409 + }, + { + "epoch": 0.07030048258487467, + "grad_norm": 0.3355379104614258, + "learning_rate": 0.00018597850244999285, + "loss": 1.5372, + "step": 5410 + }, + { + "epoch": 0.07031347712879055, + "grad_norm": 0.5210515260696411, + "learning_rate": 0.00018597590298808148, + "loss": 1.5411, + "step": 5411 + }, + { + "epoch": 0.07032647167270642, + "grad_norm": 0.3981787860393524, + "learning_rate": 0.00018597330352617008, + "loss": 1.5348, + "step": 5412 + }, + { + "epoch": 0.07033946621662229, + "grad_norm": 0.40836101770401, + "learning_rate": 0.00018597070406425873, + "loss": 1.3734, + "step": 5413 + }, + { + "epoch": 0.07035246076053817, + "grad_norm": 0.5196245312690735, + "learning_rate": 0.00018596810460234732, + "loss": 1.4643, + "step": 5414 + }, + { + "epoch": 0.07036545530445404, + "grad_norm": 0.34003061056137085, + "learning_rate": 0.00018596550514043595, + "loss": 1.2051, + "step": 5415 + }, + { + "epoch": 0.07037844984836991, + "grad_norm": 0.40599769353866577, + "learning_rate": 0.00018596290567852455, + "loss": 1.5264, + "step": 5416 + }, + { + "epoch": 0.07039144439228578, + "grad_norm": 0.379014790058136, + "learning_rate": 0.00018596030621661317, + "loss": 1.3553, + "step": 5417 + }, + { + "epoch": 0.07040443893620166, + "grad_norm": 0.41018831729888916, + "learning_rate": 0.0001859577067547018, + "loss": 1.3308, + "step": 5418 + }, + { + "epoch": 0.07041743348011753, + "grad_norm": 0.40476205945014954, + "learning_rate": 0.0001859551072927904, + "loss": 1.4389, + "step": 5419 + }, + { + "epoch": 0.0704304280240334, + "grad_norm": 0.3424795866012573, + "learning_rate": 0.00018595250783087902, + "loss": 1.5422, + "step": 5420 + }, + { + "epoch": 0.07044342256794928, + "grad_norm": 0.35810622572898865, + "learning_rate": 0.00018594990836896764, + "loss": 1.4295, + "step": 5421 + }, + { + "epoch": 0.07045641711186515, + "grad_norm": 0.4544055759906769, + "learning_rate": 0.00018594730890705624, + "loss": 1.6031, + "step": 5422 + }, + { + "epoch": 0.07046941165578102, + "grad_norm": 0.3985508978366852, + "learning_rate": 0.00018594470944514486, + "loss": 1.4231, + "step": 5423 + }, + { + "epoch": 0.0704824061996969, + "grad_norm": 0.49580472707748413, + "learning_rate": 0.00018594210998323346, + "loss": 1.4039, + "step": 5424 + }, + { + "epoch": 0.07049540074361278, + "grad_norm": 0.391928493976593, + "learning_rate": 0.0001859395105213221, + "loss": 1.5262, + "step": 5425 + }, + { + "epoch": 0.07050839528752866, + "grad_norm": 0.36121219396591187, + "learning_rate": 0.0001859369110594107, + "loss": 1.3354, + "step": 5426 + }, + { + "epoch": 0.07052138983144453, + "grad_norm": 0.27666881680488586, + "learning_rate": 0.00018593431159749933, + "loss": 1.4533, + "step": 5427 + }, + { + "epoch": 0.0705343843753604, + "grad_norm": 0.28745222091674805, + "learning_rate": 0.00018593171213558793, + "loss": 1.3047, + "step": 5428 + }, + { + "epoch": 0.07054737891927627, + "grad_norm": 0.44759461283683777, + "learning_rate": 0.00018592911267367656, + "loss": 1.6501, + "step": 5429 + }, + { + "epoch": 0.07056037346319215, + "grad_norm": 0.49797555804252625, + "learning_rate": 0.00018592651321176518, + "loss": 1.6201, + "step": 5430 + }, + { + "epoch": 0.07057336800710802, + "grad_norm": 0.29849007725715637, + "learning_rate": 0.00018592391374985378, + "loss": 1.4536, + "step": 5431 + }, + { + "epoch": 0.07058636255102389, + "grad_norm": 0.3546353280544281, + "learning_rate": 0.0001859213142879424, + "loss": 1.3535, + "step": 5432 + }, + { + "epoch": 0.07059935709493977, + "grad_norm": 0.4548199474811554, + "learning_rate": 0.00018591871482603103, + "loss": 1.4198, + "step": 5433 + }, + { + "epoch": 0.07061235163885564, + "grad_norm": 0.43979954719543457, + "learning_rate": 0.00018591611536411962, + "loss": 1.3925, + "step": 5434 + }, + { + "epoch": 0.07062534618277151, + "grad_norm": 0.39429983496665955, + "learning_rate": 0.00018591351590220825, + "loss": 1.5399, + "step": 5435 + }, + { + "epoch": 0.07063834072668738, + "grad_norm": 0.24984683096408844, + "learning_rate": 0.00018591091644029687, + "loss": 1.3133, + "step": 5436 + }, + { + "epoch": 0.07065133527060326, + "grad_norm": 0.357465535402298, + "learning_rate": 0.0001859083169783855, + "loss": 1.5052, + "step": 5437 + }, + { + "epoch": 0.07066432981451913, + "grad_norm": 0.40855807065963745, + "learning_rate": 0.0001859057175164741, + "loss": 1.5294, + "step": 5438 + }, + { + "epoch": 0.070677324358435, + "grad_norm": 0.3908853530883789, + "learning_rate": 0.00018590311805456272, + "loss": 1.319, + "step": 5439 + }, + { + "epoch": 0.07069031890235088, + "grad_norm": 0.3614378273487091, + "learning_rate": 0.00018590051859265134, + "loss": 1.4397, + "step": 5440 + }, + { + "epoch": 0.07070331344626675, + "grad_norm": 0.3503163754940033, + "learning_rate": 0.00018589791913073994, + "loss": 1.5543, + "step": 5441 + }, + { + "epoch": 0.07071630799018262, + "grad_norm": 0.4148585796356201, + "learning_rate": 0.00018589531966882857, + "loss": 1.4212, + "step": 5442 + }, + { + "epoch": 0.0707293025340985, + "grad_norm": 0.31987398862838745, + "learning_rate": 0.00018589272020691716, + "loss": 1.3397, + "step": 5443 + }, + { + "epoch": 0.07074229707801437, + "grad_norm": 0.329733282327652, + "learning_rate": 0.00018589012074500582, + "loss": 1.4183, + "step": 5444 + }, + { + "epoch": 0.07075529162193024, + "grad_norm": 0.4021393060684204, + "learning_rate": 0.0001858875212830944, + "loss": 1.3736, + "step": 5445 + }, + { + "epoch": 0.07076828616584611, + "grad_norm": 0.4750361740589142, + "learning_rate": 0.000185884921821183, + "loss": 1.6133, + "step": 5446 + }, + { + "epoch": 0.07078128070976199, + "grad_norm": 0.34743133187294006, + "learning_rate": 0.00018588232235927163, + "loss": 1.4817, + "step": 5447 + }, + { + "epoch": 0.07079427525367786, + "grad_norm": 0.3615758419036865, + "learning_rate": 0.00018587972289736026, + "loss": 1.2433, + "step": 5448 + }, + { + "epoch": 0.07080726979759373, + "grad_norm": 0.427296906709671, + "learning_rate": 0.00018587712343544888, + "loss": 1.4121, + "step": 5449 + }, + { + "epoch": 0.0708202643415096, + "grad_norm": 0.3961687386035919, + "learning_rate": 0.00018587452397353748, + "loss": 1.541, + "step": 5450 + }, + { + "epoch": 0.07083325888542548, + "grad_norm": 0.2941243648529053, + "learning_rate": 0.0001858719245116261, + "loss": 1.2386, + "step": 5451 + }, + { + "epoch": 0.07084625342934135, + "grad_norm": 0.4702562689781189, + "learning_rate": 0.00018586932504971473, + "loss": 1.5811, + "step": 5452 + }, + { + "epoch": 0.07085924797325723, + "grad_norm": 0.44686421751976013, + "learning_rate": 0.00018586672558780333, + "loss": 1.375, + "step": 5453 + }, + { + "epoch": 0.0708722425171731, + "grad_norm": 0.3779712915420532, + "learning_rate": 0.00018586412612589195, + "loss": 1.266, + "step": 5454 + }, + { + "epoch": 0.07088523706108897, + "grad_norm": 0.4184970557689667, + "learning_rate": 0.00018586152666398055, + "loss": 1.4821, + "step": 5455 + }, + { + "epoch": 0.07089823160500484, + "grad_norm": 0.43823352456092834, + "learning_rate": 0.0001858589272020692, + "loss": 1.5553, + "step": 5456 + }, + { + "epoch": 0.07091122614892072, + "grad_norm": 0.37257346510887146, + "learning_rate": 0.0001858563277401578, + "loss": 1.5413, + "step": 5457 + }, + { + "epoch": 0.07092422069283659, + "grad_norm": 0.38129737973213196, + "learning_rate": 0.0001858537282782464, + "loss": 1.3405, + "step": 5458 + }, + { + "epoch": 0.07093721523675246, + "grad_norm": 0.3697986304759979, + "learning_rate": 0.00018585112881633502, + "loss": 1.3921, + "step": 5459 + }, + { + "epoch": 0.07095020978066834, + "grad_norm": 0.3374973237514496, + "learning_rate": 0.00018584852935442364, + "loss": 1.4575, + "step": 5460 + }, + { + "epoch": 0.07096320432458421, + "grad_norm": 0.46403875946998596, + "learning_rate": 0.00018584592989251227, + "loss": 1.6582, + "step": 5461 + }, + { + "epoch": 0.07097619886850008, + "grad_norm": 0.3866918981075287, + "learning_rate": 0.00018584333043060087, + "loss": 1.5336, + "step": 5462 + }, + { + "epoch": 0.07098919341241597, + "grad_norm": 0.3675229251384735, + "learning_rate": 0.0001858407309686895, + "loss": 1.3309, + "step": 5463 + }, + { + "epoch": 0.07100218795633184, + "grad_norm": 0.3723551034927368, + "learning_rate": 0.00018583813150677812, + "loss": 1.5619, + "step": 5464 + }, + { + "epoch": 0.07101518250024771, + "grad_norm": 0.3850755989551544, + "learning_rate": 0.0001858355320448667, + "loss": 1.523, + "step": 5465 + }, + { + "epoch": 0.07102817704416359, + "grad_norm": 0.34363529086112976, + "learning_rate": 0.00018583293258295534, + "loss": 1.2147, + "step": 5466 + }, + { + "epoch": 0.07104117158807946, + "grad_norm": 0.35843032598495483, + "learning_rate": 0.00018583033312104393, + "loss": 1.5313, + "step": 5467 + }, + { + "epoch": 0.07105416613199533, + "grad_norm": 0.3088741600513458, + "learning_rate": 0.00018582773365913259, + "loss": 1.5659, + "step": 5468 + }, + { + "epoch": 0.0710671606759112, + "grad_norm": 0.5088632702827454, + "learning_rate": 0.00018582513419722118, + "loss": 1.5353, + "step": 5469 + }, + { + "epoch": 0.07108015521982708, + "grad_norm": 0.4652015268802643, + "learning_rate": 0.0001858225347353098, + "loss": 1.2798, + "step": 5470 + }, + { + "epoch": 0.07109314976374295, + "grad_norm": 0.316891074180603, + "learning_rate": 0.0001858199352733984, + "loss": 1.2449, + "step": 5471 + }, + { + "epoch": 0.07110614430765883, + "grad_norm": 0.34538426995277405, + "learning_rate": 0.00018581733581148703, + "loss": 1.2856, + "step": 5472 + }, + { + "epoch": 0.0711191388515747, + "grad_norm": 0.4169134795665741, + "learning_rate": 0.00018581473634957565, + "loss": 1.3852, + "step": 5473 + }, + { + "epoch": 0.07113213339549057, + "grad_norm": 0.4016588628292084, + "learning_rate": 0.00018581213688766425, + "loss": 1.5823, + "step": 5474 + }, + { + "epoch": 0.07114512793940644, + "grad_norm": 0.3618713915348053, + "learning_rate": 0.00018580953742575288, + "loss": 1.4659, + "step": 5475 + }, + { + "epoch": 0.07115812248332232, + "grad_norm": 0.4007403552532196, + "learning_rate": 0.0001858069379638415, + "loss": 1.4936, + "step": 5476 + }, + { + "epoch": 0.07117111702723819, + "grad_norm": 0.34492501616477966, + "learning_rate": 0.0001858043385019301, + "loss": 1.5609, + "step": 5477 + }, + { + "epoch": 0.07118411157115406, + "grad_norm": 0.34819379448890686, + "learning_rate": 0.00018580173904001872, + "loss": 1.4435, + "step": 5478 + }, + { + "epoch": 0.07119710611506994, + "grad_norm": 0.4405934810638428, + "learning_rate": 0.00018579913957810735, + "loss": 1.381, + "step": 5479 + }, + { + "epoch": 0.07121010065898581, + "grad_norm": 0.40937164425849915, + "learning_rate": 0.00018579654011619597, + "loss": 1.5527, + "step": 5480 + }, + { + "epoch": 0.07122309520290168, + "grad_norm": 0.3682154715061188, + "learning_rate": 0.00018579394065428457, + "loss": 1.3222, + "step": 5481 + }, + { + "epoch": 0.07123608974681755, + "grad_norm": 0.38335657119750977, + "learning_rate": 0.0001857913411923732, + "loss": 1.5465, + "step": 5482 + }, + { + "epoch": 0.07124908429073343, + "grad_norm": 0.3870220482349396, + "learning_rate": 0.00018578874173046182, + "loss": 1.3894, + "step": 5483 + }, + { + "epoch": 0.0712620788346493, + "grad_norm": 0.47531741857528687, + "learning_rate": 0.00018578614226855042, + "loss": 1.5172, + "step": 5484 + }, + { + "epoch": 0.07127507337856517, + "grad_norm": 0.38174280524253845, + "learning_rate": 0.00018578354280663904, + "loss": 1.4115, + "step": 5485 + }, + { + "epoch": 0.07128806792248105, + "grad_norm": 0.35346317291259766, + "learning_rate": 0.00018578094334472764, + "loss": 1.3289, + "step": 5486 + }, + { + "epoch": 0.07130106246639692, + "grad_norm": 0.31188464164733887, + "learning_rate": 0.00018577834388281626, + "loss": 1.2731, + "step": 5487 + }, + { + "epoch": 0.07131405701031279, + "grad_norm": 0.41615059971809387, + "learning_rate": 0.00018577574442090489, + "loss": 1.5308, + "step": 5488 + }, + { + "epoch": 0.07132705155422867, + "grad_norm": 0.5589435696601868, + "learning_rate": 0.00018577314495899348, + "loss": 1.5682, + "step": 5489 + }, + { + "epoch": 0.07134004609814454, + "grad_norm": 0.36484724283218384, + "learning_rate": 0.0001857705454970821, + "loss": 1.7109, + "step": 5490 + }, + { + "epoch": 0.07135304064206041, + "grad_norm": 0.4378516376018524, + "learning_rate": 0.00018576794603517073, + "loss": 1.5515, + "step": 5491 + }, + { + "epoch": 0.07136603518597628, + "grad_norm": 0.31119734048843384, + "learning_rate": 0.00018576534657325936, + "loss": 1.3169, + "step": 5492 + }, + { + "epoch": 0.07137902972989216, + "grad_norm": 0.49236518144607544, + "learning_rate": 0.00018576274711134795, + "loss": 1.598, + "step": 5493 + }, + { + "epoch": 0.07139202427380803, + "grad_norm": 0.5020756721496582, + "learning_rate": 0.00018576014764943658, + "loss": 1.5953, + "step": 5494 + }, + { + "epoch": 0.0714050188177239, + "grad_norm": 0.3653267025947571, + "learning_rate": 0.0001857575481875252, + "loss": 1.3867, + "step": 5495 + }, + { + "epoch": 0.07141801336163978, + "grad_norm": 0.42779818177223206, + "learning_rate": 0.0001857549487256138, + "loss": 1.4839, + "step": 5496 + }, + { + "epoch": 0.07143100790555565, + "grad_norm": 0.4221138060092926, + "learning_rate": 0.00018575234926370242, + "loss": 1.4289, + "step": 5497 + }, + { + "epoch": 0.07144400244947152, + "grad_norm": 0.4305471181869507, + "learning_rate": 0.00018574974980179102, + "loss": 1.4869, + "step": 5498 + }, + { + "epoch": 0.0714569969933874, + "grad_norm": 0.7394503355026245, + "learning_rate": 0.00018574715033987967, + "loss": 1.4199, + "step": 5499 + }, + { + "epoch": 0.07146999153730327, + "grad_norm": 0.42988282442092896, + "learning_rate": 0.00018574455087796827, + "loss": 1.4626, + "step": 5500 + }, + { + "epoch": 0.07148298608121915, + "grad_norm": 0.4476601183414459, + "learning_rate": 0.00018574195141605687, + "loss": 1.4446, + "step": 5501 + }, + { + "epoch": 0.07149598062513503, + "grad_norm": 0.3432336449623108, + "learning_rate": 0.0001857393519541455, + "loss": 1.3539, + "step": 5502 + }, + { + "epoch": 0.0715089751690509, + "grad_norm": 0.30796200037002563, + "learning_rate": 0.00018573675249223412, + "loss": 1.458, + "step": 5503 + }, + { + "epoch": 0.07152196971296677, + "grad_norm": 0.35000425577163696, + "learning_rate": 0.00018573415303032274, + "loss": 1.5302, + "step": 5504 + }, + { + "epoch": 0.07153496425688265, + "grad_norm": 0.3962832987308502, + "learning_rate": 0.00018573155356841134, + "loss": 1.3661, + "step": 5505 + }, + { + "epoch": 0.07154795880079852, + "grad_norm": 0.4164280891418457, + "learning_rate": 0.00018572895410649996, + "loss": 1.4931, + "step": 5506 + }, + { + "epoch": 0.07156095334471439, + "grad_norm": 0.4541429579257965, + "learning_rate": 0.0001857263546445886, + "loss": 1.2475, + "step": 5507 + }, + { + "epoch": 0.07157394788863027, + "grad_norm": 0.32912299036979675, + "learning_rate": 0.00018572375518267719, + "loss": 1.5443, + "step": 5508 + }, + { + "epoch": 0.07158694243254614, + "grad_norm": 0.661169171333313, + "learning_rate": 0.0001857211557207658, + "loss": 1.6283, + "step": 5509 + }, + { + "epoch": 0.07159993697646201, + "grad_norm": 0.3548290729522705, + "learning_rate": 0.00018571855625885443, + "loss": 1.3872, + "step": 5510 + }, + { + "epoch": 0.07161293152037788, + "grad_norm": 0.36101076006889343, + "learning_rate": 0.00018571595679694306, + "loss": 1.5111, + "step": 5511 + }, + { + "epoch": 0.07162592606429376, + "grad_norm": 0.3813101649284363, + "learning_rate": 0.00018571335733503166, + "loss": 1.3474, + "step": 5512 + }, + { + "epoch": 0.07163892060820963, + "grad_norm": 0.3508604168891907, + "learning_rate": 0.00018571075787312025, + "loss": 1.4458, + "step": 5513 + }, + { + "epoch": 0.0716519151521255, + "grad_norm": 0.3383382558822632, + "learning_rate": 0.0001857081584112089, + "loss": 1.3505, + "step": 5514 + }, + { + "epoch": 0.07166490969604138, + "grad_norm": 0.36480727791786194, + "learning_rate": 0.0001857055589492975, + "loss": 1.4672, + "step": 5515 + }, + { + "epoch": 0.07167790423995725, + "grad_norm": 0.36195939779281616, + "learning_rate": 0.00018570295948738613, + "loss": 1.4437, + "step": 5516 + }, + { + "epoch": 0.07169089878387312, + "grad_norm": 0.3500955402851105, + "learning_rate": 0.00018570036002547472, + "loss": 1.3328, + "step": 5517 + }, + { + "epoch": 0.071703893327789, + "grad_norm": 0.4806077778339386, + "learning_rate": 0.00018569776056356335, + "loss": 1.6179, + "step": 5518 + }, + { + "epoch": 0.07171688787170487, + "grad_norm": 0.38188299536705017, + "learning_rate": 0.00018569516110165197, + "loss": 1.4384, + "step": 5519 + }, + { + "epoch": 0.07172988241562074, + "grad_norm": 0.32269108295440674, + "learning_rate": 0.00018569256163974057, + "loss": 1.2824, + "step": 5520 + }, + { + "epoch": 0.07174287695953661, + "grad_norm": 0.3889369070529938, + "learning_rate": 0.0001856899621778292, + "loss": 1.3971, + "step": 5521 + }, + { + "epoch": 0.07175587150345249, + "grad_norm": 0.38597962260246277, + "learning_rate": 0.00018568736271591782, + "loss": 1.4444, + "step": 5522 + }, + { + "epoch": 0.07176886604736836, + "grad_norm": 0.38747406005859375, + "learning_rate": 0.00018568476325400644, + "loss": 1.5089, + "step": 5523 + }, + { + "epoch": 0.07178186059128423, + "grad_norm": 0.5148333311080933, + "learning_rate": 0.00018568216379209504, + "loss": 1.4134, + "step": 5524 + }, + { + "epoch": 0.0717948551352001, + "grad_norm": 0.3707602322101593, + "learning_rate": 0.00018567956433018364, + "loss": 1.3292, + "step": 5525 + }, + { + "epoch": 0.07180784967911598, + "grad_norm": 0.37553104758262634, + "learning_rate": 0.0001856769648682723, + "loss": 1.3988, + "step": 5526 + }, + { + "epoch": 0.07182084422303185, + "grad_norm": 0.3768678605556488, + "learning_rate": 0.0001856743654063609, + "loss": 1.3855, + "step": 5527 + }, + { + "epoch": 0.07183383876694772, + "grad_norm": 0.4996545612812042, + "learning_rate": 0.0001856717659444495, + "loss": 1.4192, + "step": 5528 + }, + { + "epoch": 0.0718468333108636, + "grad_norm": 0.4021969437599182, + "learning_rate": 0.0001856691664825381, + "loss": 1.4751, + "step": 5529 + }, + { + "epoch": 0.07185982785477947, + "grad_norm": 0.357889324426651, + "learning_rate": 0.00018566656702062673, + "loss": 1.4382, + "step": 5530 + }, + { + "epoch": 0.07187282239869534, + "grad_norm": 0.413897305727005, + "learning_rate": 0.00018566396755871536, + "loss": 1.4548, + "step": 5531 + }, + { + "epoch": 0.07188581694261122, + "grad_norm": 0.3731609284877777, + "learning_rate": 0.00018566136809680396, + "loss": 1.3424, + "step": 5532 + }, + { + "epoch": 0.07189881148652709, + "grad_norm": 0.386899471282959, + "learning_rate": 0.00018565876863489258, + "loss": 1.4701, + "step": 5533 + }, + { + "epoch": 0.07191180603044296, + "grad_norm": 0.5114560127258301, + "learning_rate": 0.0001856561691729812, + "loss": 1.5957, + "step": 5534 + }, + { + "epoch": 0.07192480057435884, + "grad_norm": 0.31541940569877625, + "learning_rate": 0.00018565356971106983, + "loss": 1.3177, + "step": 5535 + }, + { + "epoch": 0.07193779511827471, + "grad_norm": 0.34982484579086304, + "learning_rate": 0.00018565097024915843, + "loss": 1.4722, + "step": 5536 + }, + { + "epoch": 0.07195078966219058, + "grad_norm": 0.3410489857196808, + "learning_rate": 0.00018564837078724705, + "loss": 1.5004, + "step": 5537 + }, + { + "epoch": 0.07196378420610645, + "grad_norm": 0.44715091586112976, + "learning_rate": 0.00018564577132533568, + "loss": 1.3261, + "step": 5538 + }, + { + "epoch": 0.07197677875002234, + "grad_norm": 0.324921190738678, + "learning_rate": 0.00018564317186342427, + "loss": 1.4688, + "step": 5539 + }, + { + "epoch": 0.07198977329393821, + "grad_norm": 0.4115283489227295, + "learning_rate": 0.0001856405724015129, + "loss": 1.5109, + "step": 5540 + }, + { + "epoch": 0.07200276783785409, + "grad_norm": 0.37124302983283997, + "learning_rate": 0.0001856379729396015, + "loss": 1.2954, + "step": 5541 + }, + { + "epoch": 0.07201576238176996, + "grad_norm": 0.5044050216674805, + "learning_rate": 0.00018563537347769012, + "loss": 1.5364, + "step": 5542 + }, + { + "epoch": 0.07202875692568583, + "grad_norm": 0.3772490620613098, + "learning_rate": 0.00018563277401577874, + "loss": 1.4122, + "step": 5543 + }, + { + "epoch": 0.0720417514696017, + "grad_norm": 0.4495158791542053, + "learning_rate": 0.00018563017455386734, + "loss": 1.5468, + "step": 5544 + }, + { + "epoch": 0.07205474601351758, + "grad_norm": 0.4341451823711395, + "learning_rate": 0.00018562757509195597, + "loss": 1.4036, + "step": 5545 + }, + { + "epoch": 0.07206774055743345, + "grad_norm": 0.4424019753932953, + "learning_rate": 0.0001856249756300446, + "loss": 1.4686, + "step": 5546 + }, + { + "epoch": 0.07208073510134932, + "grad_norm": 0.4379132390022278, + "learning_rate": 0.00018562237616813322, + "loss": 1.5167, + "step": 5547 + }, + { + "epoch": 0.0720937296452652, + "grad_norm": 0.2422821968793869, + "learning_rate": 0.0001856197767062218, + "loss": 1.2305, + "step": 5548 + }, + { + "epoch": 0.07210672418918107, + "grad_norm": 0.427615761756897, + "learning_rate": 0.00018561717724431044, + "loss": 1.5418, + "step": 5549 + }, + { + "epoch": 0.07211971873309694, + "grad_norm": 0.34096798300743103, + "learning_rate": 0.00018561457778239906, + "loss": 1.2817, + "step": 5550 + }, + { + "epoch": 0.07213271327701282, + "grad_norm": 0.3905801773071289, + "learning_rate": 0.00018561197832048766, + "loss": 1.3466, + "step": 5551 + }, + { + "epoch": 0.07214570782092869, + "grad_norm": 0.3452693819999695, + "learning_rate": 0.00018560937885857628, + "loss": 1.3596, + "step": 5552 + }, + { + "epoch": 0.07215870236484456, + "grad_norm": 0.4102634787559509, + "learning_rate": 0.0001856067793966649, + "loss": 1.5921, + "step": 5553 + }, + { + "epoch": 0.07217169690876044, + "grad_norm": 0.4027297794818878, + "learning_rate": 0.00018560417993475353, + "loss": 1.5723, + "step": 5554 + }, + { + "epoch": 0.07218469145267631, + "grad_norm": 0.47069665789604187, + "learning_rate": 0.00018560158047284213, + "loss": 1.505, + "step": 5555 + }, + { + "epoch": 0.07219768599659218, + "grad_norm": 0.33498018980026245, + "learning_rate": 0.00018559898101093073, + "loss": 1.2608, + "step": 5556 + }, + { + "epoch": 0.07221068054050805, + "grad_norm": 0.2888638973236084, + "learning_rate": 0.00018559638154901938, + "loss": 1.3408, + "step": 5557 + }, + { + "epoch": 0.07222367508442393, + "grad_norm": 0.3327373266220093, + "learning_rate": 0.00018559378208710798, + "loss": 1.3849, + "step": 5558 + }, + { + "epoch": 0.0722366696283398, + "grad_norm": 0.4662718176841736, + "learning_rate": 0.0001855911826251966, + "loss": 1.4976, + "step": 5559 + }, + { + "epoch": 0.07224966417225567, + "grad_norm": 0.30613040924072266, + "learning_rate": 0.0001855885831632852, + "loss": 1.2854, + "step": 5560 + }, + { + "epoch": 0.07226265871617155, + "grad_norm": 0.35056453943252563, + "learning_rate": 0.00018558598370137382, + "loss": 1.3573, + "step": 5561 + }, + { + "epoch": 0.07227565326008742, + "grad_norm": 0.4638853967189789, + "learning_rate": 0.00018558338423946245, + "loss": 1.6123, + "step": 5562 + }, + { + "epoch": 0.07228864780400329, + "grad_norm": 0.466193825006485, + "learning_rate": 0.00018558078477755104, + "loss": 1.3917, + "step": 5563 + }, + { + "epoch": 0.07230164234791916, + "grad_norm": 0.5195732712745667, + "learning_rate": 0.00018557818531563967, + "loss": 1.5221, + "step": 5564 + }, + { + "epoch": 0.07231463689183504, + "grad_norm": 0.39762115478515625, + "learning_rate": 0.0001855755858537283, + "loss": 1.5463, + "step": 5565 + }, + { + "epoch": 0.07232763143575091, + "grad_norm": 0.44320446252822876, + "learning_rate": 0.00018557298639181692, + "loss": 1.5412, + "step": 5566 + }, + { + "epoch": 0.07234062597966678, + "grad_norm": 0.3181615173816681, + "learning_rate": 0.00018557038692990552, + "loss": 1.2224, + "step": 5567 + }, + { + "epoch": 0.07235362052358266, + "grad_norm": 0.2894691526889801, + "learning_rate": 0.0001855677874679941, + "loss": 1.3671, + "step": 5568 + }, + { + "epoch": 0.07236661506749853, + "grad_norm": 0.4079062044620514, + "learning_rate": 0.00018556518800608276, + "loss": 1.5234, + "step": 5569 + }, + { + "epoch": 0.0723796096114144, + "grad_norm": 0.3911947011947632, + "learning_rate": 0.00018556258854417136, + "loss": 1.6498, + "step": 5570 + }, + { + "epoch": 0.07239260415533028, + "grad_norm": 0.3745634853839874, + "learning_rate": 0.00018555998908225999, + "loss": 1.3886, + "step": 5571 + }, + { + "epoch": 0.07240559869924615, + "grad_norm": 0.39828798174858093, + "learning_rate": 0.00018555738962034858, + "loss": 1.5264, + "step": 5572 + }, + { + "epoch": 0.07241859324316202, + "grad_norm": 0.4185575246810913, + "learning_rate": 0.0001855547901584372, + "loss": 1.4389, + "step": 5573 + }, + { + "epoch": 0.0724315877870779, + "grad_norm": 0.3066312074661255, + "learning_rate": 0.00018555219069652583, + "loss": 1.4756, + "step": 5574 + }, + { + "epoch": 0.07244458233099377, + "grad_norm": 0.3252162039279938, + "learning_rate": 0.00018554959123461443, + "loss": 1.4824, + "step": 5575 + }, + { + "epoch": 0.07245757687490964, + "grad_norm": 0.41581249237060547, + "learning_rate": 0.00018554699177270305, + "loss": 1.4503, + "step": 5576 + }, + { + "epoch": 0.07247057141882553, + "grad_norm": 0.41156283020973206, + "learning_rate": 0.00018554439231079168, + "loss": 1.6927, + "step": 5577 + }, + { + "epoch": 0.0724835659627414, + "grad_norm": 0.4363971948623657, + "learning_rate": 0.0001855417928488803, + "loss": 1.5528, + "step": 5578 + }, + { + "epoch": 0.07249656050665727, + "grad_norm": 0.42701852321624756, + "learning_rate": 0.0001855391933869689, + "loss": 1.4775, + "step": 5579 + }, + { + "epoch": 0.07250955505057315, + "grad_norm": 0.4176211655139923, + "learning_rate": 0.0001855365939250575, + "loss": 1.4082, + "step": 5580 + }, + { + "epoch": 0.07252254959448902, + "grad_norm": 0.39570191502571106, + "learning_rate": 0.00018553399446314615, + "loss": 1.4394, + "step": 5581 + }, + { + "epoch": 0.07253554413840489, + "grad_norm": 0.4649661183357239, + "learning_rate": 0.00018553139500123475, + "loss": 1.5078, + "step": 5582 + }, + { + "epoch": 0.07254853868232077, + "grad_norm": 0.3875357210636139, + "learning_rate": 0.00018552879553932337, + "loss": 1.301, + "step": 5583 + }, + { + "epoch": 0.07256153322623664, + "grad_norm": 0.48106566071510315, + "learning_rate": 0.000185526196077412, + "loss": 1.3939, + "step": 5584 + }, + { + "epoch": 0.07257452777015251, + "grad_norm": 0.3127942979335785, + "learning_rate": 0.0001855235966155006, + "loss": 1.4939, + "step": 5585 + }, + { + "epoch": 0.07258752231406838, + "grad_norm": 0.40679267048835754, + "learning_rate": 0.00018552099715358922, + "loss": 1.349, + "step": 5586 + }, + { + "epoch": 0.07260051685798426, + "grad_norm": 0.4243844151496887, + "learning_rate": 0.00018551839769167782, + "loss": 1.6021, + "step": 5587 + }, + { + "epoch": 0.07261351140190013, + "grad_norm": 0.4676162004470825, + "learning_rate": 0.00018551579822976647, + "loss": 1.5655, + "step": 5588 + }, + { + "epoch": 0.072626505945816, + "grad_norm": 0.4225911796092987, + "learning_rate": 0.00018551319876785506, + "loss": 1.2942, + "step": 5589 + }, + { + "epoch": 0.07263950048973188, + "grad_norm": 0.35572972893714905, + "learning_rate": 0.0001855105993059437, + "loss": 1.3874, + "step": 5590 + }, + { + "epoch": 0.07265249503364775, + "grad_norm": 0.46690237522125244, + "learning_rate": 0.00018550799984403229, + "loss": 1.5002, + "step": 5591 + }, + { + "epoch": 0.07266548957756362, + "grad_norm": 0.38411280512809753, + "learning_rate": 0.0001855054003821209, + "loss": 1.5267, + "step": 5592 + }, + { + "epoch": 0.0726784841214795, + "grad_norm": 0.30819186568260193, + "learning_rate": 0.00018550280092020954, + "loss": 1.3883, + "step": 5593 + }, + { + "epoch": 0.07269147866539537, + "grad_norm": 0.41401219367980957, + "learning_rate": 0.00018550020145829813, + "loss": 1.6515, + "step": 5594 + }, + { + "epoch": 0.07270447320931124, + "grad_norm": 0.4541913866996765, + "learning_rate": 0.00018549760199638676, + "loss": 1.4572, + "step": 5595 + }, + { + "epoch": 0.07271746775322711, + "grad_norm": 0.3997112214565277, + "learning_rate": 0.00018549500253447538, + "loss": 1.4728, + "step": 5596 + }, + { + "epoch": 0.07273046229714299, + "grad_norm": 0.3244662284851074, + "learning_rate": 0.00018549240307256398, + "loss": 1.3256, + "step": 5597 + }, + { + "epoch": 0.07274345684105886, + "grad_norm": 0.4154819846153259, + "learning_rate": 0.0001854898036106526, + "loss": 1.6368, + "step": 5598 + }, + { + "epoch": 0.07275645138497473, + "grad_norm": 0.3281448781490326, + "learning_rate": 0.0001854872041487412, + "loss": 1.3244, + "step": 5599 + }, + { + "epoch": 0.0727694459288906, + "grad_norm": 0.27969348430633545, + "learning_rate": 0.00018548460468682985, + "loss": 1.4892, + "step": 5600 + }, + { + "epoch": 0.07278244047280648, + "grad_norm": 0.45478615164756775, + "learning_rate": 0.00018548200522491845, + "loss": 1.5494, + "step": 5601 + }, + { + "epoch": 0.07279543501672235, + "grad_norm": 0.4144267737865448, + "learning_rate": 0.00018547940576300707, + "loss": 1.4138, + "step": 5602 + }, + { + "epoch": 0.07280842956063822, + "grad_norm": 0.3249765932559967, + "learning_rate": 0.00018547680630109567, + "loss": 1.4079, + "step": 5603 + }, + { + "epoch": 0.0728214241045541, + "grad_norm": 0.49898579716682434, + "learning_rate": 0.0001854742068391843, + "loss": 1.6388, + "step": 5604 + }, + { + "epoch": 0.07283441864846997, + "grad_norm": 0.40876245498657227, + "learning_rate": 0.00018547160737727292, + "loss": 1.2897, + "step": 5605 + }, + { + "epoch": 0.07284741319238584, + "grad_norm": 0.4395081102848053, + "learning_rate": 0.00018546900791536152, + "loss": 1.5956, + "step": 5606 + }, + { + "epoch": 0.07286040773630172, + "grad_norm": 0.6372933387756348, + "learning_rate": 0.00018546640845345014, + "loss": 1.6542, + "step": 5607 + }, + { + "epoch": 0.07287340228021759, + "grad_norm": 0.39420461654663086, + "learning_rate": 0.00018546380899153877, + "loss": 1.4588, + "step": 5608 + }, + { + "epoch": 0.07288639682413346, + "grad_norm": 0.3278575837612152, + "learning_rate": 0.00018546120952962736, + "loss": 1.489, + "step": 5609 + }, + { + "epoch": 0.07289939136804933, + "grad_norm": 0.3301399350166321, + "learning_rate": 0.000185458610067716, + "loss": 1.3192, + "step": 5610 + }, + { + "epoch": 0.07291238591196521, + "grad_norm": 0.4959639310836792, + "learning_rate": 0.00018545601060580459, + "loss": 1.5534, + "step": 5611 + }, + { + "epoch": 0.07292538045588108, + "grad_norm": 0.47583460807800293, + "learning_rate": 0.00018545341114389324, + "loss": 1.4835, + "step": 5612 + }, + { + "epoch": 0.07293837499979695, + "grad_norm": 0.3644411265850067, + "learning_rate": 0.00018545081168198184, + "loss": 1.5143, + "step": 5613 + }, + { + "epoch": 0.07295136954371283, + "grad_norm": 0.42564284801483154, + "learning_rate": 0.00018544821222007046, + "loss": 1.553, + "step": 5614 + }, + { + "epoch": 0.07296436408762871, + "grad_norm": 0.41172221302986145, + "learning_rate": 0.00018544561275815906, + "loss": 1.5168, + "step": 5615 + }, + { + "epoch": 0.07297735863154459, + "grad_norm": 0.4057060480117798, + "learning_rate": 0.00018544301329624768, + "loss": 1.4829, + "step": 5616 + }, + { + "epoch": 0.07299035317546046, + "grad_norm": 0.4562079906463623, + "learning_rate": 0.0001854404138343363, + "loss": 1.4217, + "step": 5617 + }, + { + "epoch": 0.07300334771937633, + "grad_norm": 0.45361238718032837, + "learning_rate": 0.0001854378143724249, + "loss": 1.3939, + "step": 5618 + }, + { + "epoch": 0.0730163422632922, + "grad_norm": 0.3313571512699127, + "learning_rate": 0.00018543521491051353, + "loss": 1.3955, + "step": 5619 + }, + { + "epoch": 0.07302933680720808, + "grad_norm": 0.399566650390625, + "learning_rate": 0.00018543261544860215, + "loss": 1.5584, + "step": 5620 + }, + { + "epoch": 0.07304233135112395, + "grad_norm": 0.46305033564567566, + "learning_rate": 0.00018543001598669078, + "loss": 1.5445, + "step": 5621 + }, + { + "epoch": 0.07305532589503982, + "grad_norm": 0.5216182470321655, + "learning_rate": 0.00018542741652477937, + "loss": 1.4129, + "step": 5622 + }, + { + "epoch": 0.0730683204389557, + "grad_norm": 0.3481959402561188, + "learning_rate": 0.000185424817062868, + "loss": 1.5363, + "step": 5623 + }, + { + "epoch": 0.07308131498287157, + "grad_norm": 0.38653409481048584, + "learning_rate": 0.00018542221760095662, + "loss": 1.4856, + "step": 5624 + }, + { + "epoch": 0.07309430952678744, + "grad_norm": 0.40774691104888916, + "learning_rate": 0.00018541961813904522, + "loss": 1.4834, + "step": 5625 + }, + { + "epoch": 0.07310730407070332, + "grad_norm": 0.46210741996765137, + "learning_rate": 0.00018541701867713384, + "loss": 1.3434, + "step": 5626 + }, + { + "epoch": 0.07312029861461919, + "grad_norm": 0.25466418266296387, + "learning_rate": 0.00018541441921522247, + "loss": 1.3635, + "step": 5627 + }, + { + "epoch": 0.07313329315853506, + "grad_norm": 0.4233796298503876, + "learning_rate": 0.00018541181975331107, + "loss": 1.4875, + "step": 5628 + }, + { + "epoch": 0.07314628770245094, + "grad_norm": 0.3973749577999115, + "learning_rate": 0.0001854092202913997, + "loss": 1.5705, + "step": 5629 + }, + { + "epoch": 0.07315928224636681, + "grad_norm": 0.41903677582740784, + "learning_rate": 0.0001854066208294883, + "loss": 1.6226, + "step": 5630 + }, + { + "epoch": 0.07317227679028268, + "grad_norm": 0.4141426384449005, + "learning_rate": 0.00018540402136757694, + "loss": 1.5467, + "step": 5631 + }, + { + "epoch": 0.07318527133419855, + "grad_norm": 0.4214317202568054, + "learning_rate": 0.00018540142190566554, + "loss": 1.4372, + "step": 5632 + }, + { + "epoch": 0.07319826587811443, + "grad_norm": 0.3303471803665161, + "learning_rate": 0.00018539882244375416, + "loss": 1.5253, + "step": 5633 + }, + { + "epoch": 0.0732112604220303, + "grad_norm": 0.4142480194568634, + "learning_rate": 0.00018539622298184276, + "loss": 1.4846, + "step": 5634 + }, + { + "epoch": 0.07322425496594617, + "grad_norm": 0.4120120108127594, + "learning_rate": 0.00018539362351993138, + "loss": 1.5881, + "step": 5635 + }, + { + "epoch": 0.07323724950986205, + "grad_norm": 0.36254847049713135, + "learning_rate": 0.00018539102405802, + "loss": 1.5508, + "step": 5636 + }, + { + "epoch": 0.07325024405377792, + "grad_norm": 0.34341758489608765, + "learning_rate": 0.0001853884245961086, + "loss": 1.4403, + "step": 5637 + }, + { + "epoch": 0.07326323859769379, + "grad_norm": 0.4617409110069275, + "learning_rate": 0.00018538582513419723, + "loss": 1.3383, + "step": 5638 + }, + { + "epoch": 0.07327623314160966, + "grad_norm": 0.37782910466194153, + "learning_rate": 0.00018538322567228585, + "loss": 1.5037, + "step": 5639 + }, + { + "epoch": 0.07328922768552554, + "grad_norm": 0.3224794566631317, + "learning_rate": 0.00018538062621037445, + "loss": 1.593, + "step": 5640 + }, + { + "epoch": 0.07330222222944141, + "grad_norm": 0.4115370512008667, + "learning_rate": 0.00018537802674846308, + "loss": 1.3537, + "step": 5641 + }, + { + "epoch": 0.07331521677335728, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.00018537542728655167, + "loss": 1.5839, + "step": 5642 + }, + { + "epoch": 0.07332821131727316, + "grad_norm": 0.4861597418785095, + "learning_rate": 0.00018537282782464033, + "loss": 1.4875, + "step": 5643 + }, + { + "epoch": 0.07334120586118903, + "grad_norm": 0.3862830400466919, + "learning_rate": 0.00018537022836272892, + "loss": 1.4328, + "step": 5644 + }, + { + "epoch": 0.0733542004051049, + "grad_norm": 0.38285693526268005, + "learning_rate": 0.00018536762890081755, + "loss": 1.3033, + "step": 5645 + }, + { + "epoch": 0.07336719494902078, + "grad_norm": 0.6086478233337402, + "learning_rate": 0.00018536502943890614, + "loss": 1.3785, + "step": 5646 + }, + { + "epoch": 0.07338018949293665, + "grad_norm": 0.46374431252479553, + "learning_rate": 0.00018536242997699477, + "loss": 1.5682, + "step": 5647 + }, + { + "epoch": 0.07339318403685252, + "grad_norm": 0.5707323551177979, + "learning_rate": 0.0001853598305150834, + "loss": 1.212, + "step": 5648 + }, + { + "epoch": 0.0734061785807684, + "grad_norm": 0.3908240795135498, + "learning_rate": 0.000185357231053172, + "loss": 1.4904, + "step": 5649 + }, + { + "epoch": 0.07341917312468427, + "grad_norm": 0.39183786511421204, + "learning_rate": 0.00018535463159126062, + "loss": 1.4241, + "step": 5650 + }, + { + "epoch": 0.07343216766860014, + "grad_norm": 0.28787100315093994, + "learning_rate": 0.00018535203212934924, + "loss": 1.3047, + "step": 5651 + }, + { + "epoch": 0.07344516221251601, + "grad_norm": 0.4687195420265198, + "learning_rate": 0.00018534943266743784, + "loss": 1.4699, + "step": 5652 + }, + { + "epoch": 0.0734581567564319, + "grad_norm": 0.3689233958721161, + "learning_rate": 0.00018534683320552646, + "loss": 1.3981, + "step": 5653 + }, + { + "epoch": 0.07347115130034777, + "grad_norm": 0.47054871916770935, + "learning_rate": 0.00018534423374361506, + "loss": 1.3714, + "step": 5654 + }, + { + "epoch": 0.07348414584426365, + "grad_norm": 0.40562546253204346, + "learning_rate": 0.0001853416342817037, + "loss": 1.4845, + "step": 5655 + }, + { + "epoch": 0.07349714038817952, + "grad_norm": 0.371001660823822, + "learning_rate": 0.0001853390348197923, + "loss": 1.4136, + "step": 5656 + }, + { + "epoch": 0.07351013493209539, + "grad_norm": 0.41035521030426025, + "learning_rate": 0.00018533643535788093, + "loss": 1.3432, + "step": 5657 + }, + { + "epoch": 0.07352312947601126, + "grad_norm": 0.3619242012500763, + "learning_rate": 0.00018533383589596956, + "loss": 1.3709, + "step": 5658 + }, + { + "epoch": 0.07353612401992714, + "grad_norm": 0.4026356041431427, + "learning_rate": 0.00018533123643405815, + "loss": 1.5246, + "step": 5659 + }, + { + "epoch": 0.07354911856384301, + "grad_norm": 0.3411107063293457, + "learning_rate": 0.00018532863697214678, + "loss": 1.3789, + "step": 5660 + }, + { + "epoch": 0.07356211310775888, + "grad_norm": 0.4111210107803345, + "learning_rate": 0.00018532603751023538, + "loss": 1.6607, + "step": 5661 + }, + { + "epoch": 0.07357510765167476, + "grad_norm": 0.38018786907196045, + "learning_rate": 0.00018532343804832403, + "loss": 1.1654, + "step": 5662 + }, + { + "epoch": 0.07358810219559063, + "grad_norm": 0.4601939022541046, + "learning_rate": 0.00018532083858641263, + "loss": 1.5294, + "step": 5663 + }, + { + "epoch": 0.0736010967395065, + "grad_norm": 0.45119649171829224, + "learning_rate": 0.00018531823912450122, + "loss": 1.5943, + "step": 5664 + }, + { + "epoch": 0.07361409128342238, + "grad_norm": 0.4009825587272644, + "learning_rate": 0.00018531563966258985, + "loss": 1.3158, + "step": 5665 + }, + { + "epoch": 0.07362708582733825, + "grad_norm": 0.3716208338737488, + "learning_rate": 0.00018531304020067847, + "loss": 1.491, + "step": 5666 + }, + { + "epoch": 0.07364008037125412, + "grad_norm": 0.4477281868457794, + "learning_rate": 0.0001853104407387671, + "loss": 1.5405, + "step": 5667 + }, + { + "epoch": 0.07365307491517, + "grad_norm": 0.37739893794059753, + "learning_rate": 0.0001853078412768557, + "loss": 1.3925, + "step": 5668 + }, + { + "epoch": 0.07366606945908587, + "grad_norm": 0.3842066824436188, + "learning_rate": 0.00018530524181494432, + "loss": 1.5007, + "step": 5669 + }, + { + "epoch": 0.07367906400300174, + "grad_norm": 0.33403629064559937, + "learning_rate": 0.00018530264235303294, + "loss": 1.2318, + "step": 5670 + }, + { + "epoch": 0.07369205854691761, + "grad_norm": 0.37254855036735535, + "learning_rate": 0.00018530004289112154, + "loss": 1.4311, + "step": 5671 + }, + { + "epoch": 0.07370505309083349, + "grad_norm": 0.4163953363895416, + "learning_rate": 0.00018529744342921016, + "loss": 1.625, + "step": 5672 + }, + { + "epoch": 0.07371804763474936, + "grad_norm": 0.37085282802581787, + "learning_rate": 0.00018529484396729876, + "loss": 1.3594, + "step": 5673 + }, + { + "epoch": 0.07373104217866523, + "grad_norm": 0.42054829001426697, + "learning_rate": 0.00018529224450538741, + "loss": 1.5976, + "step": 5674 + }, + { + "epoch": 0.0737440367225811, + "grad_norm": 0.3884391188621521, + "learning_rate": 0.000185289645043476, + "loss": 1.4202, + "step": 5675 + }, + { + "epoch": 0.07375703126649698, + "grad_norm": 0.4759387969970703, + "learning_rate": 0.00018528704558156464, + "loss": 1.4894, + "step": 5676 + }, + { + "epoch": 0.07377002581041285, + "grad_norm": 0.5548647046089172, + "learning_rate": 0.00018528444611965323, + "loss": 1.5356, + "step": 5677 + }, + { + "epoch": 0.07378302035432872, + "grad_norm": 0.45505598187446594, + "learning_rate": 0.00018528184665774186, + "loss": 1.488, + "step": 5678 + }, + { + "epoch": 0.0737960148982446, + "grad_norm": 0.39482781291007996, + "learning_rate": 0.00018527924719583048, + "loss": 1.2231, + "step": 5679 + }, + { + "epoch": 0.07380900944216047, + "grad_norm": 0.39929914474487305, + "learning_rate": 0.00018527664773391908, + "loss": 1.2839, + "step": 5680 + }, + { + "epoch": 0.07382200398607634, + "grad_norm": 0.4383854866027832, + "learning_rate": 0.0001852740482720077, + "loss": 1.6429, + "step": 5681 + }, + { + "epoch": 0.07383499852999222, + "grad_norm": 0.3644176721572876, + "learning_rate": 0.00018527144881009633, + "loss": 1.31, + "step": 5682 + }, + { + "epoch": 0.07384799307390809, + "grad_norm": 0.30723321437835693, + "learning_rate": 0.00018526884934818493, + "loss": 1.419, + "step": 5683 + }, + { + "epoch": 0.07386098761782396, + "grad_norm": 0.3782745599746704, + "learning_rate": 0.00018526624988627355, + "loss": 1.4171, + "step": 5684 + }, + { + "epoch": 0.07387398216173983, + "grad_norm": 0.49302423000335693, + "learning_rate": 0.00018526365042436215, + "loss": 1.515, + "step": 5685 + }, + { + "epoch": 0.07388697670565571, + "grad_norm": 0.34835150837898254, + "learning_rate": 0.0001852610509624508, + "loss": 1.3484, + "step": 5686 + }, + { + "epoch": 0.07389997124957158, + "grad_norm": 0.407061368227005, + "learning_rate": 0.0001852584515005394, + "loss": 1.4315, + "step": 5687 + }, + { + "epoch": 0.07391296579348745, + "grad_norm": 0.38006383180618286, + "learning_rate": 0.00018525585203862802, + "loss": 1.5133, + "step": 5688 + }, + { + "epoch": 0.07392596033740333, + "grad_norm": 0.5021685361862183, + "learning_rate": 0.00018525325257671662, + "loss": 1.4815, + "step": 5689 + }, + { + "epoch": 0.0739389548813192, + "grad_norm": 0.4555334150791168, + "learning_rate": 0.00018525065311480524, + "loss": 1.4961, + "step": 5690 + }, + { + "epoch": 0.07395194942523509, + "grad_norm": 0.3869423270225525, + "learning_rate": 0.00018524805365289387, + "loss": 1.3448, + "step": 5691 + }, + { + "epoch": 0.07396494396915096, + "grad_norm": 0.33090898394584656, + "learning_rate": 0.00018524545419098246, + "loss": 1.3476, + "step": 5692 + }, + { + "epoch": 0.07397793851306683, + "grad_norm": 0.46437868475914, + "learning_rate": 0.0001852428547290711, + "loss": 1.4696, + "step": 5693 + }, + { + "epoch": 0.0739909330569827, + "grad_norm": 0.3153212070465088, + "learning_rate": 0.00018524025526715971, + "loss": 1.5102, + "step": 5694 + }, + { + "epoch": 0.07400392760089858, + "grad_norm": 0.31786441802978516, + "learning_rate": 0.0001852376558052483, + "loss": 1.4115, + "step": 5695 + }, + { + "epoch": 0.07401692214481445, + "grad_norm": 0.3887665569782257, + "learning_rate": 0.00018523505634333694, + "loss": 1.4906, + "step": 5696 + }, + { + "epoch": 0.07402991668873032, + "grad_norm": 0.49712473154067993, + "learning_rate": 0.00018523245688142556, + "loss": 1.5448, + "step": 5697 + }, + { + "epoch": 0.0740429112326462, + "grad_norm": 0.4528498947620392, + "learning_rate": 0.00018522985741951418, + "loss": 1.5413, + "step": 5698 + }, + { + "epoch": 0.07405590577656207, + "grad_norm": 0.35879814624786377, + "learning_rate": 0.00018522725795760278, + "loss": 1.3134, + "step": 5699 + }, + { + "epoch": 0.07406890032047794, + "grad_norm": 0.3808993697166443, + "learning_rate": 0.0001852246584956914, + "loss": 1.522, + "step": 5700 + }, + { + "epoch": 0.07408189486439382, + "grad_norm": 0.44484058022499084, + "learning_rate": 0.00018522205903378003, + "loss": 1.5045, + "step": 5701 + }, + { + "epoch": 0.07409488940830969, + "grad_norm": 0.46724170446395874, + "learning_rate": 0.00018521945957186863, + "loss": 1.5769, + "step": 5702 + }, + { + "epoch": 0.07410788395222556, + "grad_norm": 0.25938349962234497, + "learning_rate": 0.00018521686010995725, + "loss": 1.2574, + "step": 5703 + }, + { + "epoch": 0.07412087849614143, + "grad_norm": 0.34519582986831665, + "learning_rate": 0.00018521426064804585, + "loss": 1.5257, + "step": 5704 + }, + { + "epoch": 0.07413387304005731, + "grad_norm": 0.3566371500492096, + "learning_rate": 0.0001852116611861345, + "loss": 1.3797, + "step": 5705 + }, + { + "epoch": 0.07414686758397318, + "grad_norm": 0.330496609210968, + "learning_rate": 0.0001852090617242231, + "loss": 1.3353, + "step": 5706 + }, + { + "epoch": 0.07415986212788905, + "grad_norm": 0.37345677614212036, + "learning_rate": 0.0001852064622623117, + "loss": 1.4017, + "step": 5707 + }, + { + "epoch": 0.07417285667180493, + "grad_norm": 0.36138954758644104, + "learning_rate": 0.00018520386280040032, + "loss": 1.3175, + "step": 5708 + }, + { + "epoch": 0.0741858512157208, + "grad_norm": 0.3797924816608429, + "learning_rate": 0.00018520126333848895, + "loss": 1.3777, + "step": 5709 + }, + { + "epoch": 0.07419884575963667, + "grad_norm": 0.3644832968711853, + "learning_rate": 0.00018519866387657757, + "loss": 1.2839, + "step": 5710 + }, + { + "epoch": 0.07421184030355255, + "grad_norm": 0.3977486491203308, + "learning_rate": 0.00018519606441466617, + "loss": 1.4154, + "step": 5711 + }, + { + "epoch": 0.07422483484746842, + "grad_norm": 0.4330911338329315, + "learning_rate": 0.0001851934649527548, + "loss": 1.304, + "step": 5712 + }, + { + "epoch": 0.07423782939138429, + "grad_norm": 0.44224247336387634, + "learning_rate": 0.00018519086549084342, + "loss": 1.4277, + "step": 5713 + }, + { + "epoch": 0.07425082393530016, + "grad_norm": 0.3843836188316345, + "learning_rate": 0.000185188266028932, + "loss": 1.5312, + "step": 5714 + }, + { + "epoch": 0.07426381847921604, + "grad_norm": 0.4149883985519409, + "learning_rate": 0.00018518566656702064, + "loss": 1.3724, + "step": 5715 + }, + { + "epoch": 0.07427681302313191, + "grad_norm": 0.4066537022590637, + "learning_rate": 0.00018518306710510924, + "loss": 1.3919, + "step": 5716 + }, + { + "epoch": 0.07428980756704778, + "grad_norm": 0.3956160843372345, + "learning_rate": 0.0001851804676431979, + "loss": 1.5682, + "step": 5717 + }, + { + "epoch": 0.07430280211096366, + "grad_norm": 0.414580762386322, + "learning_rate": 0.00018517786818128648, + "loss": 1.4342, + "step": 5718 + }, + { + "epoch": 0.07431579665487953, + "grad_norm": 0.40832704305648804, + "learning_rate": 0.00018517526871937508, + "loss": 1.5079, + "step": 5719 + }, + { + "epoch": 0.0743287911987954, + "grad_norm": 0.33055317401885986, + "learning_rate": 0.0001851726692574637, + "loss": 1.3293, + "step": 5720 + }, + { + "epoch": 0.07434178574271127, + "grad_norm": 0.3359217047691345, + "learning_rate": 0.00018517006979555233, + "loss": 1.445, + "step": 5721 + }, + { + "epoch": 0.07435478028662715, + "grad_norm": 0.4215114116668701, + "learning_rate": 0.00018516747033364096, + "loss": 1.504, + "step": 5722 + }, + { + "epoch": 0.07436777483054302, + "grad_norm": 0.42226025462150574, + "learning_rate": 0.00018516487087172955, + "loss": 1.4326, + "step": 5723 + }, + { + "epoch": 0.0743807693744589, + "grad_norm": 0.3280055820941925, + "learning_rate": 0.00018516227140981818, + "loss": 1.3928, + "step": 5724 + }, + { + "epoch": 0.07439376391837477, + "grad_norm": 0.33235517144203186, + "learning_rate": 0.0001851596719479068, + "loss": 1.4994, + "step": 5725 + }, + { + "epoch": 0.07440675846229064, + "grad_norm": 0.36907488107681274, + "learning_rate": 0.0001851570724859954, + "loss": 1.2933, + "step": 5726 + }, + { + "epoch": 0.07441975300620651, + "grad_norm": 0.3370386064052582, + "learning_rate": 0.00018515447302408402, + "loss": 1.4913, + "step": 5727 + }, + { + "epoch": 0.07443274755012239, + "grad_norm": 0.3899488151073456, + "learning_rate": 0.00018515187356217262, + "loss": 1.4666, + "step": 5728 + }, + { + "epoch": 0.07444574209403827, + "grad_norm": 0.42508041858673096, + "learning_rate": 0.00018514927410026127, + "loss": 1.4094, + "step": 5729 + }, + { + "epoch": 0.07445873663795415, + "grad_norm": 0.37834376096725464, + "learning_rate": 0.00018514667463834987, + "loss": 1.5049, + "step": 5730 + }, + { + "epoch": 0.07447173118187002, + "grad_norm": 0.41443678736686707, + "learning_rate": 0.0001851440751764385, + "loss": 1.6022, + "step": 5731 + }, + { + "epoch": 0.07448472572578589, + "grad_norm": 0.3229401111602783, + "learning_rate": 0.00018514147571452712, + "loss": 1.496, + "step": 5732 + }, + { + "epoch": 0.07449772026970176, + "grad_norm": 0.45629793405532837, + "learning_rate": 0.00018513887625261572, + "loss": 1.3344, + "step": 5733 + }, + { + "epoch": 0.07451071481361764, + "grad_norm": 0.44251513481140137, + "learning_rate": 0.00018513627679070434, + "loss": 1.6878, + "step": 5734 + }, + { + "epoch": 0.07452370935753351, + "grad_norm": 0.41825950145721436, + "learning_rate": 0.00018513367732879294, + "loss": 1.6729, + "step": 5735 + }, + { + "epoch": 0.07453670390144938, + "grad_norm": 0.47430917620658875, + "learning_rate": 0.00018513107786688156, + "loss": 1.5414, + "step": 5736 + }, + { + "epoch": 0.07454969844536526, + "grad_norm": 0.38670629262924194, + "learning_rate": 0.0001851284784049702, + "loss": 1.619, + "step": 5737 + }, + { + "epoch": 0.07456269298928113, + "grad_norm": 0.48887214064598083, + "learning_rate": 0.00018512587894305878, + "loss": 1.428, + "step": 5738 + }, + { + "epoch": 0.074575687533197, + "grad_norm": 0.4115327298641205, + "learning_rate": 0.0001851232794811474, + "loss": 1.4086, + "step": 5739 + }, + { + "epoch": 0.07458868207711288, + "grad_norm": 0.4153934121131897, + "learning_rate": 0.00018512068001923603, + "loss": 1.4183, + "step": 5740 + }, + { + "epoch": 0.07460167662102875, + "grad_norm": 0.37232258915901184, + "learning_rate": 0.00018511808055732466, + "loss": 1.3497, + "step": 5741 + }, + { + "epoch": 0.07461467116494462, + "grad_norm": 0.4177737236022949, + "learning_rate": 0.00018511548109541326, + "loss": 1.4034, + "step": 5742 + }, + { + "epoch": 0.0746276657088605, + "grad_norm": 0.436621755361557, + "learning_rate": 0.00018511288163350188, + "loss": 1.3205, + "step": 5743 + }, + { + "epoch": 0.07464066025277637, + "grad_norm": 0.37911200523376465, + "learning_rate": 0.0001851102821715905, + "loss": 1.451, + "step": 5744 + }, + { + "epoch": 0.07465365479669224, + "grad_norm": 0.4468877911567688, + "learning_rate": 0.0001851076827096791, + "loss": 1.3117, + "step": 5745 + }, + { + "epoch": 0.07466664934060811, + "grad_norm": 0.40068939328193665, + "learning_rate": 0.00018510508324776773, + "loss": 1.4544, + "step": 5746 + }, + { + "epoch": 0.07467964388452399, + "grad_norm": 0.3930702805519104, + "learning_rate": 0.00018510248378585632, + "loss": 1.4813, + "step": 5747 + }, + { + "epoch": 0.07469263842843986, + "grad_norm": 0.3526495099067688, + "learning_rate": 0.00018509988432394495, + "loss": 1.5122, + "step": 5748 + }, + { + "epoch": 0.07470563297235573, + "grad_norm": 0.4161950945854187, + "learning_rate": 0.00018509728486203357, + "loss": 1.3742, + "step": 5749 + }, + { + "epoch": 0.0747186275162716, + "grad_norm": 0.4628022313117981, + "learning_rate": 0.00018509468540012217, + "loss": 1.4864, + "step": 5750 + }, + { + "epoch": 0.07473162206018748, + "grad_norm": 0.46159228682518005, + "learning_rate": 0.0001850920859382108, + "loss": 1.3268, + "step": 5751 + }, + { + "epoch": 0.07474461660410335, + "grad_norm": 0.405312180519104, + "learning_rate": 0.00018508948647629942, + "loss": 1.4594, + "step": 5752 + }, + { + "epoch": 0.07475761114801922, + "grad_norm": 0.3584649860858917, + "learning_rate": 0.00018508688701438804, + "loss": 1.4297, + "step": 5753 + }, + { + "epoch": 0.0747706056919351, + "grad_norm": 0.5019659996032715, + "learning_rate": 0.00018508428755247664, + "loss": 1.6759, + "step": 5754 + }, + { + "epoch": 0.07478360023585097, + "grad_norm": 0.19369813799858093, + "learning_rate": 0.00018508168809056526, + "loss": 1.195, + "step": 5755 + }, + { + "epoch": 0.07479659477976684, + "grad_norm": 0.3313782811164856, + "learning_rate": 0.0001850790886286539, + "loss": 1.4215, + "step": 5756 + }, + { + "epoch": 0.07480958932368272, + "grad_norm": 0.4623451828956604, + "learning_rate": 0.0001850764891667425, + "loss": 1.5191, + "step": 5757 + }, + { + "epoch": 0.07482258386759859, + "grad_norm": 0.5040561556816101, + "learning_rate": 0.0001850738897048311, + "loss": 1.4171, + "step": 5758 + }, + { + "epoch": 0.07483557841151446, + "grad_norm": 0.4187166094779968, + "learning_rate": 0.0001850712902429197, + "loss": 1.4973, + "step": 5759 + }, + { + "epoch": 0.07484857295543033, + "grad_norm": 0.5405794978141785, + "learning_rate": 0.00018506869078100836, + "loss": 1.4428, + "step": 5760 + }, + { + "epoch": 0.07486156749934621, + "grad_norm": 0.3933601379394531, + "learning_rate": 0.00018506609131909696, + "loss": 1.3624, + "step": 5761 + }, + { + "epoch": 0.07487456204326208, + "grad_norm": 0.38803520798683167, + "learning_rate": 0.00018506349185718556, + "loss": 1.3182, + "step": 5762 + }, + { + "epoch": 0.07488755658717795, + "grad_norm": 0.3997972011566162, + "learning_rate": 0.00018506089239527418, + "loss": 1.4482, + "step": 5763 + }, + { + "epoch": 0.07490055113109383, + "grad_norm": 0.36882323026657104, + "learning_rate": 0.0001850582929333628, + "loss": 1.3833, + "step": 5764 + }, + { + "epoch": 0.0749135456750097, + "grad_norm": 0.42174553871154785, + "learning_rate": 0.00018505569347145143, + "loss": 1.584, + "step": 5765 + }, + { + "epoch": 0.07492654021892557, + "grad_norm": 0.33125194907188416, + "learning_rate": 0.00018505309400954003, + "loss": 1.2519, + "step": 5766 + }, + { + "epoch": 0.07493953476284146, + "grad_norm": 0.35263311862945557, + "learning_rate": 0.00018505049454762865, + "loss": 1.4555, + "step": 5767 + }, + { + "epoch": 0.07495252930675733, + "grad_norm": 0.3132159113883972, + "learning_rate": 0.00018504789508571727, + "loss": 1.3918, + "step": 5768 + }, + { + "epoch": 0.0749655238506732, + "grad_norm": 0.3519570529460907, + "learning_rate": 0.00018504529562380587, + "loss": 1.1826, + "step": 5769 + }, + { + "epoch": 0.07497851839458908, + "grad_norm": 0.3637275695800781, + "learning_rate": 0.0001850426961618945, + "loss": 1.4613, + "step": 5770 + }, + { + "epoch": 0.07499151293850495, + "grad_norm": 0.33911579847335815, + "learning_rate": 0.00018504009669998312, + "loss": 1.3359, + "step": 5771 + }, + { + "epoch": 0.07500450748242082, + "grad_norm": 0.4679122567176819, + "learning_rate": 0.00018503749723807175, + "loss": 1.4188, + "step": 5772 + }, + { + "epoch": 0.0750175020263367, + "grad_norm": 0.38590025901794434, + "learning_rate": 0.00018503489777616034, + "loss": 1.4469, + "step": 5773 + }, + { + "epoch": 0.07503049657025257, + "grad_norm": 0.3744296431541443, + "learning_rate": 0.00018503229831424894, + "loss": 1.4446, + "step": 5774 + }, + { + "epoch": 0.07504349111416844, + "grad_norm": 0.4199884831905365, + "learning_rate": 0.0001850296988523376, + "loss": 1.3196, + "step": 5775 + }, + { + "epoch": 0.07505648565808432, + "grad_norm": 0.3628632128238678, + "learning_rate": 0.0001850270993904262, + "loss": 1.415, + "step": 5776 + }, + { + "epoch": 0.07506948020200019, + "grad_norm": 0.3270149230957031, + "learning_rate": 0.00018502449992851481, + "loss": 1.4124, + "step": 5777 + }, + { + "epoch": 0.07508247474591606, + "grad_norm": 0.4369531273841858, + "learning_rate": 0.0001850219004666034, + "loss": 1.4002, + "step": 5778 + }, + { + "epoch": 0.07509546928983193, + "grad_norm": 0.3593451976776123, + "learning_rate": 0.00018501930100469204, + "loss": 1.4635, + "step": 5779 + }, + { + "epoch": 0.07510846383374781, + "grad_norm": 0.43367254734039307, + "learning_rate": 0.00018501670154278066, + "loss": 1.4829, + "step": 5780 + }, + { + "epoch": 0.07512145837766368, + "grad_norm": 0.40663108229637146, + "learning_rate": 0.00018501410208086926, + "loss": 1.4948, + "step": 5781 + }, + { + "epoch": 0.07513445292157955, + "grad_norm": 0.4318493902683258, + "learning_rate": 0.00018501150261895788, + "loss": 1.5239, + "step": 5782 + }, + { + "epoch": 0.07514744746549543, + "grad_norm": 0.40720197558403015, + "learning_rate": 0.0001850089031570465, + "loss": 1.4984, + "step": 5783 + }, + { + "epoch": 0.0751604420094113, + "grad_norm": 0.32058578729629517, + "learning_rate": 0.00018500630369513513, + "loss": 1.1605, + "step": 5784 + }, + { + "epoch": 0.07517343655332717, + "grad_norm": 0.39151209592819214, + "learning_rate": 0.00018500370423322373, + "loss": 1.3383, + "step": 5785 + }, + { + "epoch": 0.07518643109724304, + "grad_norm": 0.42630091309547424, + "learning_rate": 0.00018500110477131233, + "loss": 1.4948, + "step": 5786 + }, + { + "epoch": 0.07519942564115892, + "grad_norm": 0.3706749975681305, + "learning_rate": 0.00018499850530940098, + "loss": 1.2889, + "step": 5787 + }, + { + "epoch": 0.07521242018507479, + "grad_norm": 0.37756261229515076, + "learning_rate": 0.00018499590584748957, + "loss": 1.387, + "step": 5788 + }, + { + "epoch": 0.07522541472899066, + "grad_norm": 0.39900779724121094, + "learning_rate": 0.0001849933063855782, + "loss": 1.509, + "step": 5789 + }, + { + "epoch": 0.07523840927290654, + "grad_norm": 0.34100595116615295, + "learning_rate": 0.0001849907069236668, + "loss": 1.3943, + "step": 5790 + }, + { + "epoch": 0.07525140381682241, + "grad_norm": 0.4117633104324341, + "learning_rate": 0.00018498810746175542, + "loss": 1.6532, + "step": 5791 + }, + { + "epoch": 0.07526439836073828, + "grad_norm": 0.38742175698280334, + "learning_rate": 0.00018498550799984405, + "loss": 1.501, + "step": 5792 + }, + { + "epoch": 0.07527739290465416, + "grad_norm": 0.323934942483902, + "learning_rate": 0.00018498290853793264, + "loss": 1.327, + "step": 5793 + }, + { + "epoch": 0.07529038744857003, + "grad_norm": 0.351481169462204, + "learning_rate": 0.00018498030907602127, + "loss": 1.1411, + "step": 5794 + }, + { + "epoch": 0.0753033819924859, + "grad_norm": 0.3165893256664276, + "learning_rate": 0.0001849777096141099, + "loss": 1.4399, + "step": 5795 + }, + { + "epoch": 0.07531637653640177, + "grad_norm": 0.34249910712242126, + "learning_rate": 0.00018497511015219852, + "loss": 1.397, + "step": 5796 + }, + { + "epoch": 0.07532937108031765, + "grad_norm": 0.44883275032043457, + "learning_rate": 0.00018497251069028711, + "loss": 1.4407, + "step": 5797 + }, + { + "epoch": 0.07534236562423352, + "grad_norm": 0.35347557067871094, + "learning_rate": 0.00018496991122837574, + "loss": 1.2652, + "step": 5798 + }, + { + "epoch": 0.0753553601681494, + "grad_norm": 0.4140808880329132, + "learning_rate": 0.00018496731176646436, + "loss": 1.3446, + "step": 5799 + }, + { + "epoch": 0.07536835471206527, + "grad_norm": 0.4276021718978882, + "learning_rate": 0.00018496471230455296, + "loss": 1.2137, + "step": 5800 + }, + { + "epoch": 0.07538134925598114, + "grad_norm": 0.34881800413131714, + "learning_rate": 0.00018496211284264158, + "loss": 1.3496, + "step": 5801 + }, + { + "epoch": 0.07539434379989701, + "grad_norm": 0.3739721477031708, + "learning_rate": 0.00018495951338073018, + "loss": 1.4815, + "step": 5802 + }, + { + "epoch": 0.07540733834381289, + "grad_norm": 0.41915491223335266, + "learning_rate": 0.0001849569139188188, + "loss": 1.3621, + "step": 5803 + }, + { + "epoch": 0.07542033288772876, + "grad_norm": 0.3745441734790802, + "learning_rate": 0.00018495431445690743, + "loss": 1.4732, + "step": 5804 + }, + { + "epoch": 0.07543332743164465, + "grad_norm": 0.6090517640113831, + "learning_rate": 0.00018495171499499603, + "loss": 1.6198, + "step": 5805 + }, + { + "epoch": 0.07544632197556052, + "grad_norm": 0.34556782245635986, + "learning_rate": 0.00018494911553308468, + "loss": 1.2102, + "step": 5806 + }, + { + "epoch": 0.07545931651947639, + "grad_norm": 0.3727145791053772, + "learning_rate": 0.00018494651607117328, + "loss": 1.3996, + "step": 5807 + }, + { + "epoch": 0.07547231106339226, + "grad_norm": 0.3725121021270752, + "learning_rate": 0.0001849439166092619, + "loss": 1.5234, + "step": 5808 + }, + { + "epoch": 0.07548530560730814, + "grad_norm": 0.39851897954940796, + "learning_rate": 0.0001849413171473505, + "loss": 1.3714, + "step": 5809 + }, + { + "epoch": 0.07549830015122401, + "grad_norm": 0.3840694725513458, + "learning_rate": 0.00018493871768543912, + "loss": 1.4691, + "step": 5810 + }, + { + "epoch": 0.07551129469513988, + "grad_norm": 0.4172472059726715, + "learning_rate": 0.00018493611822352775, + "loss": 1.525, + "step": 5811 + }, + { + "epoch": 0.07552428923905576, + "grad_norm": 0.2960830330848694, + "learning_rate": 0.00018493351876161635, + "loss": 1.4679, + "step": 5812 + }, + { + "epoch": 0.07553728378297163, + "grad_norm": 0.35858914256095886, + "learning_rate": 0.00018493091929970497, + "loss": 1.4521, + "step": 5813 + }, + { + "epoch": 0.0755502783268875, + "grad_norm": 0.34739160537719727, + "learning_rate": 0.0001849283198377936, + "loss": 1.4745, + "step": 5814 + }, + { + "epoch": 0.07556327287080337, + "grad_norm": 0.44756895303726196, + "learning_rate": 0.0001849257203758822, + "loss": 1.5411, + "step": 5815 + }, + { + "epoch": 0.07557626741471925, + "grad_norm": 0.3778274655342102, + "learning_rate": 0.00018492312091397082, + "loss": 1.419, + "step": 5816 + }, + { + "epoch": 0.07558926195863512, + "grad_norm": 0.49816054105758667, + "learning_rate": 0.00018492052145205941, + "loss": 1.5912, + "step": 5817 + }, + { + "epoch": 0.075602256502551, + "grad_norm": 0.42864638566970825, + "learning_rate": 0.00018491792199014807, + "loss": 1.3594, + "step": 5818 + }, + { + "epoch": 0.07561525104646687, + "grad_norm": 0.42342302203178406, + "learning_rate": 0.00018491532252823666, + "loss": 1.5458, + "step": 5819 + }, + { + "epoch": 0.07562824559038274, + "grad_norm": 0.3598827123641968, + "learning_rate": 0.0001849127230663253, + "loss": 1.249, + "step": 5820 + }, + { + "epoch": 0.07564124013429861, + "grad_norm": 0.3846488296985626, + "learning_rate": 0.00018491012360441388, + "loss": 1.4677, + "step": 5821 + }, + { + "epoch": 0.07565423467821449, + "grad_norm": 0.3567892014980316, + "learning_rate": 0.0001849075241425025, + "loss": 1.4233, + "step": 5822 + }, + { + "epoch": 0.07566722922213036, + "grad_norm": 0.3446444272994995, + "learning_rate": 0.00018490492468059113, + "loss": 1.5045, + "step": 5823 + }, + { + "epoch": 0.07568022376604623, + "grad_norm": 0.33591076731681824, + "learning_rate": 0.00018490232521867973, + "loss": 1.4461, + "step": 5824 + }, + { + "epoch": 0.0756932183099621, + "grad_norm": 0.4599846303462982, + "learning_rate": 0.00018489972575676836, + "loss": 1.4785, + "step": 5825 + }, + { + "epoch": 0.07570621285387798, + "grad_norm": 0.3949660062789917, + "learning_rate": 0.00018489712629485698, + "loss": 1.6074, + "step": 5826 + }, + { + "epoch": 0.07571920739779385, + "grad_norm": 0.4107274115085602, + "learning_rate": 0.0001848945268329456, + "loss": 1.4772, + "step": 5827 + }, + { + "epoch": 0.07573220194170972, + "grad_norm": 0.4312751889228821, + "learning_rate": 0.0001848919273710342, + "loss": 1.5427, + "step": 5828 + }, + { + "epoch": 0.0757451964856256, + "grad_norm": 0.38404184579849243, + "learning_rate": 0.0001848893279091228, + "loss": 1.3894, + "step": 5829 + }, + { + "epoch": 0.07575819102954147, + "grad_norm": 0.43782028555870056, + "learning_rate": 0.00018488672844721145, + "loss": 1.4754, + "step": 5830 + }, + { + "epoch": 0.07577118557345734, + "grad_norm": 0.4280025064945221, + "learning_rate": 0.00018488412898530005, + "loss": 1.3574, + "step": 5831 + }, + { + "epoch": 0.07578418011737321, + "grad_norm": 0.4050753712654114, + "learning_rate": 0.00018488152952338867, + "loss": 1.3057, + "step": 5832 + }, + { + "epoch": 0.07579717466128909, + "grad_norm": 0.3534747064113617, + "learning_rate": 0.00018487893006147727, + "loss": 1.3667, + "step": 5833 + }, + { + "epoch": 0.07581016920520496, + "grad_norm": 0.3065432906150818, + "learning_rate": 0.0001848763305995659, + "loss": 1.2162, + "step": 5834 + }, + { + "epoch": 0.07582316374912083, + "grad_norm": 0.2661783993244171, + "learning_rate": 0.00018487373113765452, + "loss": 1.4638, + "step": 5835 + }, + { + "epoch": 0.0758361582930367, + "grad_norm": 0.4889761209487915, + "learning_rate": 0.00018487113167574312, + "loss": 1.5143, + "step": 5836 + }, + { + "epoch": 0.07584915283695258, + "grad_norm": 0.3838978707790375, + "learning_rate": 0.00018486853221383174, + "loss": 1.3842, + "step": 5837 + }, + { + "epoch": 0.07586214738086845, + "grad_norm": 0.4237801432609558, + "learning_rate": 0.00018486593275192037, + "loss": 1.6655, + "step": 5838 + }, + { + "epoch": 0.07587514192478433, + "grad_norm": 0.39514413475990295, + "learning_rate": 0.000184863333290009, + "loss": 1.3252, + "step": 5839 + }, + { + "epoch": 0.0758881364687002, + "grad_norm": 0.36931896209716797, + "learning_rate": 0.0001848607338280976, + "loss": 1.5721, + "step": 5840 + }, + { + "epoch": 0.07590113101261607, + "grad_norm": 0.4811924695968628, + "learning_rate": 0.00018485813436618618, + "loss": 1.4095, + "step": 5841 + }, + { + "epoch": 0.07591412555653194, + "grad_norm": 0.3904770314693451, + "learning_rate": 0.00018485553490427484, + "loss": 1.6637, + "step": 5842 + }, + { + "epoch": 0.07592712010044783, + "grad_norm": 0.3167823255062103, + "learning_rate": 0.00018485293544236343, + "loss": 1.3825, + "step": 5843 + }, + { + "epoch": 0.0759401146443637, + "grad_norm": 0.3343684673309326, + "learning_rate": 0.00018485033598045206, + "loss": 1.1775, + "step": 5844 + }, + { + "epoch": 0.07595310918827958, + "grad_norm": 0.422659695148468, + "learning_rate": 0.00018484773651854068, + "loss": 1.3561, + "step": 5845 + }, + { + "epoch": 0.07596610373219545, + "grad_norm": 0.35066717863082886, + "learning_rate": 0.00018484513705662928, + "loss": 1.1534, + "step": 5846 + }, + { + "epoch": 0.07597909827611132, + "grad_norm": 0.47269636392593384, + "learning_rate": 0.0001848425375947179, + "loss": 1.4737, + "step": 5847 + }, + { + "epoch": 0.0759920928200272, + "grad_norm": 0.3691405653953552, + "learning_rate": 0.0001848399381328065, + "loss": 1.3411, + "step": 5848 + }, + { + "epoch": 0.07600508736394307, + "grad_norm": 0.4624173045158386, + "learning_rate": 0.00018483733867089515, + "loss": 1.5863, + "step": 5849 + }, + { + "epoch": 0.07601808190785894, + "grad_norm": 0.40431129932403564, + "learning_rate": 0.00018483473920898375, + "loss": 1.2269, + "step": 5850 + }, + { + "epoch": 0.07603107645177481, + "grad_norm": 0.46895161271095276, + "learning_rate": 0.00018483213974707238, + "loss": 1.4745, + "step": 5851 + }, + { + "epoch": 0.07604407099569069, + "grad_norm": 0.3865452706813812, + "learning_rate": 0.00018482954028516097, + "loss": 1.2737, + "step": 5852 + }, + { + "epoch": 0.07605706553960656, + "grad_norm": 0.48697805404663086, + "learning_rate": 0.0001848269408232496, + "loss": 1.5581, + "step": 5853 + }, + { + "epoch": 0.07607006008352243, + "grad_norm": 0.45192742347717285, + "learning_rate": 0.00018482434136133822, + "loss": 1.4669, + "step": 5854 + }, + { + "epoch": 0.0760830546274383, + "grad_norm": 0.3490251898765564, + "learning_rate": 0.00018482174189942682, + "loss": 1.2573, + "step": 5855 + }, + { + "epoch": 0.07609604917135418, + "grad_norm": 0.3953474760055542, + "learning_rate": 0.00018481914243751544, + "loss": 1.3058, + "step": 5856 + }, + { + "epoch": 0.07610904371527005, + "grad_norm": 0.397747278213501, + "learning_rate": 0.00018481654297560407, + "loss": 1.37, + "step": 5857 + }, + { + "epoch": 0.07612203825918593, + "grad_norm": 0.46927931904792786, + "learning_rate": 0.00018481394351369267, + "loss": 1.3264, + "step": 5858 + }, + { + "epoch": 0.0761350328031018, + "grad_norm": 0.3955818712711334, + "learning_rate": 0.0001848113440517813, + "loss": 1.5555, + "step": 5859 + }, + { + "epoch": 0.07614802734701767, + "grad_norm": 0.39604973793029785, + "learning_rate": 0.0001848087445898699, + "loss": 1.6205, + "step": 5860 + }, + { + "epoch": 0.07616102189093354, + "grad_norm": 0.37378475069999695, + "learning_rate": 0.00018480614512795854, + "loss": 1.3903, + "step": 5861 + }, + { + "epoch": 0.07617401643484942, + "grad_norm": 0.3710018992424011, + "learning_rate": 0.00018480354566604714, + "loss": 1.4082, + "step": 5862 + }, + { + "epoch": 0.07618701097876529, + "grad_norm": 0.48273733258247375, + "learning_rate": 0.00018480094620413576, + "loss": 1.2398, + "step": 5863 + }, + { + "epoch": 0.07620000552268116, + "grad_norm": 0.4252732992172241, + "learning_rate": 0.00018479834674222436, + "loss": 1.4478, + "step": 5864 + }, + { + "epoch": 0.07621300006659704, + "grad_norm": 0.40777069330215454, + "learning_rate": 0.00018479574728031298, + "loss": 1.5822, + "step": 5865 + }, + { + "epoch": 0.07622599461051291, + "grad_norm": 0.3742409944534302, + "learning_rate": 0.0001847931478184016, + "loss": 1.1418, + "step": 5866 + }, + { + "epoch": 0.07623898915442878, + "grad_norm": 0.4629398286342621, + "learning_rate": 0.0001847905483564902, + "loss": 1.5865, + "step": 5867 + }, + { + "epoch": 0.07625198369834466, + "grad_norm": 0.21505504846572876, + "learning_rate": 0.00018478794889457883, + "loss": 1.1823, + "step": 5868 + }, + { + "epoch": 0.07626497824226053, + "grad_norm": 0.4295143783092499, + "learning_rate": 0.00018478534943266745, + "loss": 1.349, + "step": 5869 + }, + { + "epoch": 0.0762779727861764, + "grad_norm": 0.4053129553794861, + "learning_rate": 0.00018478274997075605, + "loss": 1.4786, + "step": 5870 + }, + { + "epoch": 0.07629096733009227, + "grad_norm": 0.3658570349216461, + "learning_rate": 0.00018478015050884468, + "loss": 1.297, + "step": 5871 + }, + { + "epoch": 0.07630396187400815, + "grad_norm": 0.4330212473869324, + "learning_rate": 0.00018477755104693327, + "loss": 1.378, + "step": 5872 + }, + { + "epoch": 0.07631695641792402, + "grad_norm": 0.45120710134506226, + "learning_rate": 0.00018477495158502192, + "loss": 1.3482, + "step": 5873 + }, + { + "epoch": 0.07632995096183989, + "grad_norm": 0.4001186192035675, + "learning_rate": 0.00018477235212311052, + "loss": 1.5825, + "step": 5874 + }, + { + "epoch": 0.07634294550575577, + "grad_norm": 0.37141090631484985, + "learning_rate": 0.00018476975266119915, + "loss": 1.4463, + "step": 5875 + }, + { + "epoch": 0.07635594004967164, + "grad_norm": 0.36818942427635193, + "learning_rate": 0.00018476715319928774, + "loss": 1.4735, + "step": 5876 + }, + { + "epoch": 0.07636893459358751, + "grad_norm": 0.40000078082084656, + "learning_rate": 0.00018476455373737637, + "loss": 1.102, + "step": 5877 + }, + { + "epoch": 0.07638192913750338, + "grad_norm": 0.428722083568573, + "learning_rate": 0.000184761954275465, + "loss": 1.5632, + "step": 5878 + }, + { + "epoch": 0.07639492368141926, + "grad_norm": 0.373018741607666, + "learning_rate": 0.0001847593548135536, + "loss": 1.1342, + "step": 5879 + }, + { + "epoch": 0.07640791822533513, + "grad_norm": 0.46091723442077637, + "learning_rate": 0.00018475675535164221, + "loss": 1.5319, + "step": 5880 + }, + { + "epoch": 0.07642091276925102, + "grad_norm": 0.5123398900032043, + "learning_rate": 0.00018475415588973084, + "loss": 1.3243, + "step": 5881 + }, + { + "epoch": 0.07643390731316689, + "grad_norm": 0.468747079372406, + "learning_rate": 0.00018475155642781946, + "loss": 1.474, + "step": 5882 + }, + { + "epoch": 0.07644690185708276, + "grad_norm": 0.5109299421310425, + "learning_rate": 0.00018474895696590806, + "loss": 1.6138, + "step": 5883 + }, + { + "epoch": 0.07645989640099864, + "grad_norm": 0.373140424489975, + "learning_rate": 0.00018474635750399669, + "loss": 1.4796, + "step": 5884 + }, + { + "epoch": 0.07647289094491451, + "grad_norm": 0.39965105056762695, + "learning_rate": 0.0001847437580420853, + "loss": 1.5189, + "step": 5885 + }, + { + "epoch": 0.07648588548883038, + "grad_norm": 0.4661354422569275, + "learning_rate": 0.0001847411585801739, + "loss": 1.5229, + "step": 5886 + }, + { + "epoch": 0.07649888003274626, + "grad_norm": 0.33783531188964844, + "learning_rate": 0.00018473855911826253, + "loss": 1.4403, + "step": 5887 + }, + { + "epoch": 0.07651187457666213, + "grad_norm": 0.350143700838089, + "learning_rate": 0.00018473595965635116, + "loss": 1.4552, + "step": 5888 + }, + { + "epoch": 0.076524869120578, + "grad_norm": 0.37084877490997314, + "learning_rate": 0.00018473336019443975, + "loss": 1.1972, + "step": 5889 + }, + { + "epoch": 0.07653786366449387, + "grad_norm": 0.4950472414493561, + "learning_rate": 0.00018473076073252838, + "loss": 1.4101, + "step": 5890 + }, + { + "epoch": 0.07655085820840975, + "grad_norm": 0.3145788609981537, + "learning_rate": 0.00018472816127061698, + "loss": 1.4652, + "step": 5891 + }, + { + "epoch": 0.07656385275232562, + "grad_norm": 0.3568994104862213, + "learning_rate": 0.00018472556180870563, + "loss": 1.517, + "step": 5892 + }, + { + "epoch": 0.0765768472962415, + "grad_norm": 0.37646493315696716, + "learning_rate": 0.00018472296234679422, + "loss": 1.4366, + "step": 5893 + }, + { + "epoch": 0.07658984184015737, + "grad_norm": 0.3793256878852844, + "learning_rate": 0.00018472036288488285, + "loss": 1.4065, + "step": 5894 + }, + { + "epoch": 0.07660283638407324, + "grad_norm": 0.4935625195503235, + "learning_rate": 0.00018471776342297145, + "loss": 1.4447, + "step": 5895 + }, + { + "epoch": 0.07661583092798911, + "grad_norm": 0.4019348919391632, + "learning_rate": 0.00018471516396106007, + "loss": 1.5234, + "step": 5896 + }, + { + "epoch": 0.07662882547190498, + "grad_norm": 0.40772294998168945, + "learning_rate": 0.0001847125644991487, + "loss": 1.3832, + "step": 5897 + }, + { + "epoch": 0.07664182001582086, + "grad_norm": 0.3134511411190033, + "learning_rate": 0.0001847099650372373, + "loss": 1.3266, + "step": 5898 + }, + { + "epoch": 0.07665481455973673, + "grad_norm": 0.3904426693916321, + "learning_rate": 0.00018470736557532592, + "loss": 1.4104, + "step": 5899 + }, + { + "epoch": 0.0766678091036526, + "grad_norm": 0.25366300344467163, + "learning_rate": 0.00018470476611341454, + "loss": 1.3223, + "step": 5900 + }, + { + "epoch": 0.07668080364756848, + "grad_norm": 0.40504834055900574, + "learning_rate": 0.00018470216665150314, + "loss": 1.3474, + "step": 5901 + }, + { + "epoch": 0.07669379819148435, + "grad_norm": 0.43310731649398804, + "learning_rate": 0.00018469956718959176, + "loss": 1.5406, + "step": 5902 + }, + { + "epoch": 0.07670679273540022, + "grad_norm": 0.4102226793766022, + "learning_rate": 0.00018469696772768036, + "loss": 1.3684, + "step": 5903 + }, + { + "epoch": 0.0767197872793161, + "grad_norm": 0.41729775071144104, + "learning_rate": 0.000184694368265769, + "loss": 1.3798, + "step": 5904 + }, + { + "epoch": 0.07673278182323197, + "grad_norm": 0.4303189814090729, + "learning_rate": 0.0001846917688038576, + "loss": 1.4608, + "step": 5905 + }, + { + "epoch": 0.07674577636714784, + "grad_norm": 0.3884187936782837, + "learning_rate": 0.00018468916934194623, + "loss": 1.6917, + "step": 5906 + }, + { + "epoch": 0.07675877091106371, + "grad_norm": 0.38658177852630615, + "learning_rate": 0.00018468656988003483, + "loss": 1.543, + "step": 5907 + }, + { + "epoch": 0.07677176545497959, + "grad_norm": 0.3554634153842926, + "learning_rate": 0.00018468397041812346, + "loss": 1.5053, + "step": 5908 + }, + { + "epoch": 0.07678475999889546, + "grad_norm": 0.445719838142395, + "learning_rate": 0.00018468137095621208, + "loss": 1.4064, + "step": 5909 + }, + { + "epoch": 0.07679775454281133, + "grad_norm": 0.3174709975719452, + "learning_rate": 0.00018467877149430068, + "loss": 1.2606, + "step": 5910 + }, + { + "epoch": 0.0768107490867272, + "grad_norm": 0.4045545756816864, + "learning_rate": 0.0001846761720323893, + "loss": 1.3734, + "step": 5911 + }, + { + "epoch": 0.07682374363064308, + "grad_norm": 0.39586082100868225, + "learning_rate": 0.00018467357257047793, + "loss": 1.4758, + "step": 5912 + }, + { + "epoch": 0.07683673817455895, + "grad_norm": 0.5619513988494873, + "learning_rate": 0.00018467097310856652, + "loss": 1.7004, + "step": 5913 + }, + { + "epoch": 0.07684973271847483, + "grad_norm": 0.37469327449798584, + "learning_rate": 0.00018466837364665515, + "loss": 1.4063, + "step": 5914 + }, + { + "epoch": 0.0768627272623907, + "grad_norm": 0.3134441673755646, + "learning_rate": 0.00018466577418474375, + "loss": 1.2579, + "step": 5915 + }, + { + "epoch": 0.07687572180630657, + "grad_norm": 0.38641074299812317, + "learning_rate": 0.0001846631747228324, + "loss": 1.6522, + "step": 5916 + }, + { + "epoch": 0.07688871635022244, + "grad_norm": 0.3021990656852722, + "learning_rate": 0.000184660575260921, + "loss": 1.3556, + "step": 5917 + }, + { + "epoch": 0.07690171089413832, + "grad_norm": 0.3889233469963074, + "learning_rate": 0.00018465797579900962, + "loss": 1.4543, + "step": 5918 + }, + { + "epoch": 0.0769147054380542, + "grad_norm": 0.3740723133087158, + "learning_rate": 0.00018465537633709824, + "loss": 1.5567, + "step": 5919 + }, + { + "epoch": 0.07692769998197008, + "grad_norm": 0.37010201811790466, + "learning_rate": 0.00018465277687518684, + "loss": 1.4651, + "step": 5920 + }, + { + "epoch": 0.07694069452588595, + "grad_norm": 0.3395724892616272, + "learning_rate": 0.00018465017741327547, + "loss": 1.2977, + "step": 5921 + }, + { + "epoch": 0.07695368906980182, + "grad_norm": 0.4764532446861267, + "learning_rate": 0.00018464757795136406, + "loss": 1.4808, + "step": 5922 + }, + { + "epoch": 0.0769666836137177, + "grad_norm": 0.4063996374607086, + "learning_rate": 0.00018464497848945271, + "loss": 1.5211, + "step": 5923 + }, + { + "epoch": 0.07697967815763357, + "grad_norm": 0.5106183290481567, + "learning_rate": 0.0001846423790275413, + "loss": 1.6894, + "step": 5924 + }, + { + "epoch": 0.07699267270154944, + "grad_norm": 0.2898086607456207, + "learning_rate": 0.0001846397795656299, + "loss": 1.4411, + "step": 5925 + }, + { + "epoch": 0.07700566724546531, + "grad_norm": 0.48833972215652466, + "learning_rate": 0.00018463718010371853, + "loss": 1.6266, + "step": 5926 + }, + { + "epoch": 0.07701866178938119, + "grad_norm": 0.3334541320800781, + "learning_rate": 0.00018463458064180716, + "loss": 1.2624, + "step": 5927 + }, + { + "epoch": 0.07703165633329706, + "grad_norm": 0.4016132652759552, + "learning_rate": 0.00018463198117989578, + "loss": 1.3208, + "step": 5928 + }, + { + "epoch": 0.07704465087721293, + "grad_norm": 0.33149516582489014, + "learning_rate": 0.00018462938171798438, + "loss": 1.4494, + "step": 5929 + }, + { + "epoch": 0.0770576454211288, + "grad_norm": 0.45958682894706726, + "learning_rate": 0.000184626782256073, + "loss": 1.363, + "step": 5930 + }, + { + "epoch": 0.07707063996504468, + "grad_norm": 0.42671599984169006, + "learning_rate": 0.00018462418279416163, + "loss": 1.3863, + "step": 5931 + }, + { + "epoch": 0.07708363450896055, + "grad_norm": 0.4863154888153076, + "learning_rate": 0.00018462158333225023, + "loss": 1.3409, + "step": 5932 + }, + { + "epoch": 0.07709662905287643, + "grad_norm": 0.4399794936180115, + "learning_rate": 0.00018461898387033885, + "loss": 1.3409, + "step": 5933 + }, + { + "epoch": 0.0771096235967923, + "grad_norm": 0.4456668198108673, + "learning_rate": 0.00018461638440842745, + "loss": 1.4864, + "step": 5934 + }, + { + "epoch": 0.07712261814070817, + "grad_norm": 0.45837193727493286, + "learning_rate": 0.0001846137849465161, + "loss": 1.4673, + "step": 5935 + }, + { + "epoch": 0.07713561268462404, + "grad_norm": 0.46039247512817383, + "learning_rate": 0.0001846111854846047, + "loss": 1.2641, + "step": 5936 + }, + { + "epoch": 0.07714860722853992, + "grad_norm": 0.34568727016448975, + "learning_rate": 0.00018460858602269332, + "loss": 1.215, + "step": 5937 + }, + { + "epoch": 0.07716160177245579, + "grad_norm": 0.40586352348327637, + "learning_rate": 0.00018460598656078192, + "loss": 1.3489, + "step": 5938 + }, + { + "epoch": 0.07717459631637166, + "grad_norm": 0.33365893363952637, + "learning_rate": 0.00018460338709887054, + "loss": 1.2618, + "step": 5939 + }, + { + "epoch": 0.07718759086028754, + "grad_norm": 0.35039955377578735, + "learning_rate": 0.00018460078763695917, + "loss": 1.4472, + "step": 5940 + }, + { + "epoch": 0.07720058540420341, + "grad_norm": 0.5006925463676453, + "learning_rate": 0.00018459818817504777, + "loss": 1.4732, + "step": 5941 + }, + { + "epoch": 0.07721357994811928, + "grad_norm": 0.4903702139854431, + "learning_rate": 0.0001845955887131364, + "loss": 1.2832, + "step": 5942 + }, + { + "epoch": 0.07722657449203515, + "grad_norm": 0.4398631155490875, + "learning_rate": 0.00018459298925122501, + "loss": 1.4457, + "step": 5943 + }, + { + "epoch": 0.07723956903595103, + "grad_norm": 0.36402255296707153, + "learning_rate": 0.0001845903897893136, + "loss": 1.567, + "step": 5944 + }, + { + "epoch": 0.0772525635798669, + "grad_norm": 0.3949216306209564, + "learning_rate": 0.00018458779032740224, + "loss": 1.4803, + "step": 5945 + }, + { + "epoch": 0.07726555812378277, + "grad_norm": 0.3183668851852417, + "learning_rate": 0.00018458519086549083, + "loss": 1.2234, + "step": 5946 + }, + { + "epoch": 0.07727855266769865, + "grad_norm": 0.39650097489356995, + "learning_rate": 0.00018458259140357949, + "loss": 1.4524, + "step": 5947 + }, + { + "epoch": 0.07729154721161452, + "grad_norm": 0.38234174251556396, + "learning_rate": 0.00018457999194166808, + "loss": 1.4546, + "step": 5948 + }, + { + "epoch": 0.07730454175553039, + "grad_norm": 0.40995270013809204, + "learning_rate": 0.0001845773924797567, + "loss": 1.5724, + "step": 5949 + }, + { + "epoch": 0.07731753629944627, + "grad_norm": 0.4977561831474304, + "learning_rate": 0.0001845747930178453, + "loss": 1.539, + "step": 5950 + }, + { + "epoch": 0.07733053084336214, + "grad_norm": 0.3710271716117859, + "learning_rate": 0.00018457219355593393, + "loss": 1.4109, + "step": 5951 + }, + { + "epoch": 0.07734352538727801, + "grad_norm": 0.4911384582519531, + "learning_rate": 0.00018456959409402255, + "loss": 1.5608, + "step": 5952 + }, + { + "epoch": 0.07735651993119388, + "grad_norm": 0.3211837112903595, + "learning_rate": 0.00018456699463211115, + "loss": 1.3303, + "step": 5953 + }, + { + "epoch": 0.07736951447510976, + "grad_norm": 0.361427366733551, + "learning_rate": 0.00018456439517019978, + "loss": 1.5518, + "step": 5954 + }, + { + "epoch": 0.07738250901902563, + "grad_norm": 0.3623093366622925, + "learning_rate": 0.0001845617957082884, + "loss": 1.559, + "step": 5955 + }, + { + "epoch": 0.0773955035629415, + "grad_norm": 0.44917863607406616, + "learning_rate": 0.000184559196246377, + "loss": 1.346, + "step": 5956 + }, + { + "epoch": 0.07740849810685739, + "grad_norm": 0.34671348333358765, + "learning_rate": 0.00018455659678446562, + "loss": 1.2791, + "step": 5957 + }, + { + "epoch": 0.07742149265077326, + "grad_norm": 0.39676445722579956, + "learning_rate": 0.00018455399732255425, + "loss": 1.5143, + "step": 5958 + }, + { + "epoch": 0.07743448719468914, + "grad_norm": 0.5092577934265137, + "learning_rate": 0.00018455139786064287, + "loss": 1.5055, + "step": 5959 + }, + { + "epoch": 0.07744748173860501, + "grad_norm": 0.38445591926574707, + "learning_rate": 0.00018454879839873147, + "loss": 1.5349, + "step": 5960 + }, + { + "epoch": 0.07746047628252088, + "grad_norm": 0.3044701814651489, + "learning_rate": 0.0001845461989368201, + "loss": 1.2968, + "step": 5961 + }, + { + "epoch": 0.07747347082643675, + "grad_norm": 0.3887306749820709, + "learning_rate": 0.00018454359947490872, + "loss": 1.4493, + "step": 5962 + }, + { + "epoch": 0.07748646537035263, + "grad_norm": 0.24546630680561066, + "learning_rate": 0.00018454100001299731, + "loss": 1.1963, + "step": 5963 + }, + { + "epoch": 0.0774994599142685, + "grad_norm": 0.4075829088687897, + "learning_rate": 0.00018453840055108594, + "loss": 1.4309, + "step": 5964 + }, + { + "epoch": 0.07751245445818437, + "grad_norm": 0.3502098321914673, + "learning_rate": 0.00018453580108917454, + "loss": 1.2734, + "step": 5965 + }, + { + "epoch": 0.07752544900210025, + "grad_norm": 0.3049415051937103, + "learning_rate": 0.0001845332016272632, + "loss": 1.3011, + "step": 5966 + }, + { + "epoch": 0.07753844354601612, + "grad_norm": 0.5176023244857788, + "learning_rate": 0.00018453060216535179, + "loss": 1.5442, + "step": 5967 + }, + { + "epoch": 0.07755143808993199, + "grad_norm": 0.30824077129364014, + "learning_rate": 0.00018452800270344038, + "loss": 1.2717, + "step": 5968 + }, + { + "epoch": 0.07756443263384787, + "grad_norm": 0.2755676507949829, + "learning_rate": 0.000184525403241529, + "loss": 1.4089, + "step": 5969 + }, + { + "epoch": 0.07757742717776374, + "grad_norm": 0.2947712242603302, + "learning_rate": 0.00018452280377961763, + "loss": 1.3732, + "step": 5970 + }, + { + "epoch": 0.07759042172167961, + "grad_norm": 0.33350735902786255, + "learning_rate": 0.00018452020431770626, + "loss": 1.4573, + "step": 5971 + }, + { + "epoch": 0.07760341626559548, + "grad_norm": 0.390143483877182, + "learning_rate": 0.00018451760485579485, + "loss": 1.6165, + "step": 5972 + }, + { + "epoch": 0.07761641080951136, + "grad_norm": 0.4501892626285553, + "learning_rate": 0.00018451500539388348, + "loss": 1.4443, + "step": 5973 + }, + { + "epoch": 0.07762940535342723, + "grad_norm": 0.37454545497894287, + "learning_rate": 0.0001845124059319721, + "loss": 1.4505, + "step": 5974 + }, + { + "epoch": 0.0776423998973431, + "grad_norm": 0.35362398624420166, + "learning_rate": 0.0001845098064700607, + "loss": 1.3278, + "step": 5975 + }, + { + "epoch": 0.07765539444125898, + "grad_norm": 0.37507548928260803, + "learning_rate": 0.00018450720700814932, + "loss": 1.1848, + "step": 5976 + }, + { + "epoch": 0.07766838898517485, + "grad_norm": 0.4120621085166931, + "learning_rate": 0.00018450460754623792, + "loss": 1.483, + "step": 5977 + }, + { + "epoch": 0.07768138352909072, + "grad_norm": 0.40024903416633606, + "learning_rate": 0.00018450200808432657, + "loss": 1.5498, + "step": 5978 + }, + { + "epoch": 0.0776943780730066, + "grad_norm": 0.3831518888473511, + "learning_rate": 0.00018449940862241517, + "loss": 1.4385, + "step": 5979 + }, + { + "epoch": 0.07770737261692247, + "grad_norm": 0.4055730104446411, + "learning_rate": 0.00018449680916050377, + "loss": 1.6143, + "step": 5980 + }, + { + "epoch": 0.07772036716083834, + "grad_norm": 0.38118797540664673, + "learning_rate": 0.0001844942096985924, + "loss": 1.3368, + "step": 5981 + }, + { + "epoch": 0.07773336170475421, + "grad_norm": 0.4233747124671936, + "learning_rate": 0.00018449161023668102, + "loss": 1.5142, + "step": 5982 + }, + { + "epoch": 0.07774635624867009, + "grad_norm": 0.3702712059020996, + "learning_rate": 0.00018448901077476964, + "loss": 1.3708, + "step": 5983 + }, + { + "epoch": 0.07775935079258596, + "grad_norm": 0.5815667510032654, + "learning_rate": 0.00018448641131285824, + "loss": 1.2719, + "step": 5984 + }, + { + "epoch": 0.07777234533650183, + "grad_norm": 0.428805410861969, + "learning_rate": 0.00018448381185094686, + "loss": 1.433, + "step": 5985 + }, + { + "epoch": 0.0777853398804177, + "grad_norm": 0.4008171558380127, + "learning_rate": 0.0001844812123890355, + "loss": 1.599, + "step": 5986 + }, + { + "epoch": 0.07779833442433358, + "grad_norm": 0.4541243612766266, + "learning_rate": 0.00018447861292712409, + "loss": 1.4879, + "step": 5987 + }, + { + "epoch": 0.07781132896824945, + "grad_norm": 0.28324034810066223, + "learning_rate": 0.0001844760134652127, + "loss": 1.2413, + "step": 5988 + }, + { + "epoch": 0.07782432351216532, + "grad_norm": 0.43518519401550293, + "learning_rate": 0.0001844734140033013, + "loss": 1.4548, + "step": 5989 + }, + { + "epoch": 0.0778373180560812, + "grad_norm": 0.3788944184780121, + "learning_rate": 0.00018447081454138996, + "loss": 1.3762, + "step": 5990 + }, + { + "epoch": 0.07785031259999707, + "grad_norm": 0.41933387517929077, + "learning_rate": 0.00018446821507947856, + "loss": 1.4301, + "step": 5991 + }, + { + "epoch": 0.07786330714391294, + "grad_norm": 0.45617789030075073, + "learning_rate": 0.00018446561561756715, + "loss": 1.4336, + "step": 5992 + }, + { + "epoch": 0.07787630168782882, + "grad_norm": 0.38238054513931274, + "learning_rate": 0.0001844630161556558, + "loss": 1.3785, + "step": 5993 + }, + { + "epoch": 0.07788929623174469, + "grad_norm": 0.3966638743877411, + "learning_rate": 0.0001844604166937444, + "loss": 1.4539, + "step": 5994 + }, + { + "epoch": 0.07790229077566056, + "grad_norm": 0.29679030179977417, + "learning_rate": 0.00018445781723183303, + "loss": 1.1991, + "step": 5995 + }, + { + "epoch": 0.07791528531957645, + "grad_norm": 0.30962076783180237, + "learning_rate": 0.00018445521776992162, + "loss": 1.4212, + "step": 5996 + }, + { + "epoch": 0.07792827986349232, + "grad_norm": 0.33151495456695557, + "learning_rate": 0.00018445261830801025, + "loss": 1.3329, + "step": 5997 + }, + { + "epoch": 0.0779412744074082, + "grad_norm": 0.39591848850250244, + "learning_rate": 0.00018445001884609887, + "loss": 1.4453, + "step": 5998 + }, + { + "epoch": 0.07795426895132407, + "grad_norm": 0.31794121861457825, + "learning_rate": 0.00018444741938418747, + "loss": 1.408, + "step": 5999 + }, + { + "epoch": 0.07796726349523994, + "grad_norm": 0.4208926260471344, + "learning_rate": 0.0001844448199222761, + "loss": 1.3413, + "step": 6000 + }, + { + "epoch": 0.07798025803915581, + "grad_norm": 0.34509938955307007, + "learning_rate": 0.00018444222046036472, + "loss": 1.3611, + "step": 6001 + }, + { + "epoch": 0.07799325258307169, + "grad_norm": 0.5411408543586731, + "learning_rate": 0.00018443962099845334, + "loss": 1.2923, + "step": 6002 + }, + { + "epoch": 0.07800624712698756, + "grad_norm": 0.3810083270072937, + "learning_rate": 0.00018443702153654194, + "loss": 1.5298, + "step": 6003 + }, + { + "epoch": 0.07801924167090343, + "grad_norm": 0.4454694092273712, + "learning_rate": 0.00018443442207463057, + "loss": 1.4519, + "step": 6004 + }, + { + "epoch": 0.0780322362148193, + "grad_norm": 0.48522186279296875, + "learning_rate": 0.0001844318226127192, + "loss": 1.4947, + "step": 6005 + }, + { + "epoch": 0.07804523075873518, + "grad_norm": 0.41381701827049255, + "learning_rate": 0.0001844292231508078, + "loss": 1.5144, + "step": 6006 + }, + { + "epoch": 0.07805822530265105, + "grad_norm": 0.35586321353912354, + "learning_rate": 0.0001844266236888964, + "loss": 1.4649, + "step": 6007 + }, + { + "epoch": 0.07807121984656692, + "grad_norm": 0.3811730742454529, + "learning_rate": 0.000184424024226985, + "loss": 1.3283, + "step": 6008 + }, + { + "epoch": 0.0780842143904828, + "grad_norm": 0.34810492396354675, + "learning_rate": 0.00018442142476507363, + "loss": 1.2199, + "step": 6009 + }, + { + "epoch": 0.07809720893439867, + "grad_norm": 0.4614677429199219, + "learning_rate": 0.00018441882530316226, + "loss": 1.2591, + "step": 6010 + }, + { + "epoch": 0.07811020347831454, + "grad_norm": 0.362655907869339, + "learning_rate": 0.00018441622584125086, + "loss": 1.3368, + "step": 6011 + }, + { + "epoch": 0.07812319802223042, + "grad_norm": 0.42832571268081665, + "learning_rate": 0.00018441362637933948, + "loss": 1.6327, + "step": 6012 + }, + { + "epoch": 0.07813619256614629, + "grad_norm": 0.37692761421203613, + "learning_rate": 0.0001844110269174281, + "loss": 1.5255, + "step": 6013 + }, + { + "epoch": 0.07814918711006216, + "grad_norm": 0.3350902795791626, + "learning_rate": 0.00018440842745551673, + "loss": 1.4163, + "step": 6014 + }, + { + "epoch": 0.07816218165397804, + "grad_norm": 0.3296547830104828, + "learning_rate": 0.00018440582799360533, + "loss": 1.4614, + "step": 6015 + }, + { + "epoch": 0.07817517619789391, + "grad_norm": 0.39048123359680176, + "learning_rate": 0.00018440322853169395, + "loss": 1.4951, + "step": 6016 + }, + { + "epoch": 0.07818817074180978, + "grad_norm": 0.3843384087085724, + "learning_rate": 0.00018440062906978258, + "loss": 1.4064, + "step": 6017 + }, + { + "epoch": 0.07820116528572565, + "grad_norm": 0.41673725843429565, + "learning_rate": 0.00018439802960787117, + "loss": 1.4817, + "step": 6018 + }, + { + "epoch": 0.07821415982964153, + "grad_norm": 0.41697585582733154, + "learning_rate": 0.0001843954301459598, + "loss": 1.5021, + "step": 6019 + }, + { + "epoch": 0.0782271543735574, + "grad_norm": 0.3676753342151642, + "learning_rate": 0.0001843928306840484, + "loss": 1.5594, + "step": 6020 + }, + { + "epoch": 0.07824014891747327, + "grad_norm": 0.41512975096702576, + "learning_rate": 0.00018439023122213702, + "loss": 1.5761, + "step": 6021 + }, + { + "epoch": 0.07825314346138915, + "grad_norm": 0.43105581402778625, + "learning_rate": 0.00018438763176022564, + "loss": 1.4796, + "step": 6022 + }, + { + "epoch": 0.07826613800530502, + "grad_norm": 0.3050001561641693, + "learning_rate": 0.00018438503229831424, + "loss": 1.2084, + "step": 6023 + }, + { + "epoch": 0.07827913254922089, + "grad_norm": 0.43741321563720703, + "learning_rate": 0.00018438243283640287, + "loss": 1.5064, + "step": 6024 + }, + { + "epoch": 0.07829212709313677, + "grad_norm": 0.38264143466949463, + "learning_rate": 0.0001843798333744915, + "loss": 1.5653, + "step": 6025 + }, + { + "epoch": 0.07830512163705264, + "grad_norm": 0.3128499686717987, + "learning_rate": 0.00018437723391258011, + "loss": 1.3427, + "step": 6026 + }, + { + "epoch": 0.07831811618096851, + "grad_norm": 0.49272000789642334, + "learning_rate": 0.0001843746344506687, + "loss": 1.4129, + "step": 6027 + }, + { + "epoch": 0.07833111072488438, + "grad_norm": 0.36527642607688904, + "learning_rate": 0.00018437203498875734, + "loss": 1.6205, + "step": 6028 + }, + { + "epoch": 0.07834410526880026, + "grad_norm": 0.3390181362628937, + "learning_rate": 0.00018436943552684596, + "loss": 1.5177, + "step": 6029 + }, + { + "epoch": 0.07835709981271613, + "grad_norm": 0.4812953770160675, + "learning_rate": 0.00018436683606493456, + "loss": 1.5312, + "step": 6030 + }, + { + "epoch": 0.078370094356632, + "grad_norm": 0.7524536848068237, + "learning_rate": 0.00018436423660302318, + "loss": 1.4617, + "step": 6031 + }, + { + "epoch": 0.07838308890054788, + "grad_norm": 0.2742747962474823, + "learning_rate": 0.0001843616371411118, + "loss": 1.232, + "step": 6032 + }, + { + "epoch": 0.07839608344446375, + "grad_norm": 0.4105358123779297, + "learning_rate": 0.00018435903767920043, + "loss": 1.3249, + "step": 6033 + }, + { + "epoch": 0.07840907798837964, + "grad_norm": 0.38426193594932556, + "learning_rate": 0.00018435643821728903, + "loss": 1.4264, + "step": 6034 + }, + { + "epoch": 0.07842207253229551, + "grad_norm": 0.491949200630188, + "learning_rate": 0.00018435383875537763, + "loss": 1.5094, + "step": 6035 + }, + { + "epoch": 0.07843506707621138, + "grad_norm": 0.4059925675392151, + "learning_rate": 0.00018435123929346628, + "loss": 1.449, + "step": 6036 + }, + { + "epoch": 0.07844806162012725, + "grad_norm": 0.35963520407676697, + "learning_rate": 0.00018434863983155488, + "loss": 1.4089, + "step": 6037 + }, + { + "epoch": 0.07846105616404313, + "grad_norm": 0.46164417266845703, + "learning_rate": 0.0001843460403696435, + "loss": 1.4802, + "step": 6038 + }, + { + "epoch": 0.078474050707959, + "grad_norm": 0.4237184524536133, + "learning_rate": 0.0001843434409077321, + "loss": 1.3275, + "step": 6039 + }, + { + "epoch": 0.07848704525187487, + "grad_norm": 0.38426315784454346, + "learning_rate": 0.00018434084144582072, + "loss": 1.6909, + "step": 6040 + }, + { + "epoch": 0.07850003979579075, + "grad_norm": 0.44933873414993286, + "learning_rate": 0.00018433824198390935, + "loss": 1.6576, + "step": 6041 + }, + { + "epoch": 0.07851303433970662, + "grad_norm": 0.4057685136795044, + "learning_rate": 0.00018433564252199794, + "loss": 1.3317, + "step": 6042 + }, + { + "epoch": 0.07852602888362249, + "grad_norm": 0.31768089532852173, + "learning_rate": 0.00018433304306008657, + "loss": 1.4909, + "step": 6043 + }, + { + "epoch": 0.07853902342753837, + "grad_norm": 0.3535803556442261, + "learning_rate": 0.0001843304435981752, + "loss": 1.4807, + "step": 6044 + }, + { + "epoch": 0.07855201797145424, + "grad_norm": 0.6468108892440796, + "learning_rate": 0.00018432784413626382, + "loss": 1.4055, + "step": 6045 + }, + { + "epoch": 0.07856501251537011, + "grad_norm": 0.3221374452114105, + "learning_rate": 0.00018432524467435241, + "loss": 1.2863, + "step": 6046 + }, + { + "epoch": 0.07857800705928598, + "grad_norm": 0.3894992470741272, + "learning_rate": 0.000184322645212441, + "loss": 1.3439, + "step": 6047 + }, + { + "epoch": 0.07859100160320186, + "grad_norm": 0.3688139319419861, + "learning_rate": 0.00018432004575052966, + "loss": 1.4272, + "step": 6048 + }, + { + "epoch": 0.07860399614711773, + "grad_norm": 0.3546498417854309, + "learning_rate": 0.00018431744628861826, + "loss": 1.2453, + "step": 6049 + }, + { + "epoch": 0.0786169906910336, + "grad_norm": 0.2959712743759155, + "learning_rate": 0.00018431484682670689, + "loss": 1.3423, + "step": 6050 + }, + { + "epoch": 0.07862998523494948, + "grad_norm": 0.36510032415390015, + "learning_rate": 0.00018431224736479548, + "loss": 1.4213, + "step": 6051 + }, + { + "epoch": 0.07864297977886535, + "grad_norm": 0.34032556414604187, + "learning_rate": 0.0001843096479028841, + "loss": 1.2443, + "step": 6052 + }, + { + "epoch": 0.07865597432278122, + "grad_norm": 0.3144850730895996, + "learning_rate": 0.00018430704844097273, + "loss": 1.1622, + "step": 6053 + }, + { + "epoch": 0.0786689688666971, + "grad_norm": 0.32013940811157227, + "learning_rate": 0.00018430444897906133, + "loss": 1.4364, + "step": 6054 + }, + { + "epoch": 0.07868196341061297, + "grad_norm": 0.511988639831543, + "learning_rate": 0.00018430184951714995, + "loss": 1.3648, + "step": 6055 + }, + { + "epoch": 0.07869495795452884, + "grad_norm": 0.5024500489234924, + "learning_rate": 0.00018429925005523858, + "loss": 1.5292, + "step": 6056 + }, + { + "epoch": 0.07870795249844471, + "grad_norm": 0.2716609537601471, + "learning_rate": 0.0001842966505933272, + "loss": 1.4871, + "step": 6057 + }, + { + "epoch": 0.07872094704236059, + "grad_norm": 0.2951996326446533, + "learning_rate": 0.0001842940511314158, + "loss": 1.4362, + "step": 6058 + }, + { + "epoch": 0.07873394158627646, + "grad_norm": 0.3120375871658325, + "learning_rate": 0.00018429145166950442, + "loss": 1.4638, + "step": 6059 + }, + { + "epoch": 0.07874693613019233, + "grad_norm": 0.4412476420402527, + "learning_rate": 0.00018428885220759305, + "loss": 1.3742, + "step": 6060 + }, + { + "epoch": 0.0787599306741082, + "grad_norm": 0.38948512077331543, + "learning_rate": 0.00018428625274568165, + "loss": 1.2452, + "step": 6061 + }, + { + "epoch": 0.07877292521802408, + "grad_norm": 0.3852015435695648, + "learning_rate": 0.00018428365328377027, + "loss": 1.5265, + "step": 6062 + }, + { + "epoch": 0.07878591976193995, + "grad_norm": 0.422014057636261, + "learning_rate": 0.00018428105382185887, + "loss": 1.6678, + "step": 6063 + }, + { + "epoch": 0.07879891430585582, + "grad_norm": 0.3978329598903656, + "learning_rate": 0.0001842784543599475, + "loss": 1.31, + "step": 6064 + }, + { + "epoch": 0.0788119088497717, + "grad_norm": 0.4004749655723572, + "learning_rate": 0.00018427585489803612, + "loss": 1.4686, + "step": 6065 + }, + { + "epoch": 0.07882490339368757, + "grad_norm": 0.3805711567401886, + "learning_rate": 0.00018427325543612471, + "loss": 1.557, + "step": 6066 + }, + { + "epoch": 0.07883789793760344, + "grad_norm": 0.3802225589752197, + "learning_rate": 0.00018427065597421337, + "loss": 1.3069, + "step": 6067 + }, + { + "epoch": 0.07885089248151932, + "grad_norm": 0.3975139558315277, + "learning_rate": 0.00018426805651230196, + "loss": 1.5216, + "step": 6068 + }, + { + "epoch": 0.07886388702543519, + "grad_norm": 0.42604002356529236, + "learning_rate": 0.0001842654570503906, + "loss": 1.4016, + "step": 6069 + }, + { + "epoch": 0.07887688156935106, + "grad_norm": 0.5064747929573059, + "learning_rate": 0.00018426285758847919, + "loss": 1.4061, + "step": 6070 + }, + { + "epoch": 0.07888987611326694, + "grad_norm": 0.4241625666618347, + "learning_rate": 0.0001842602581265678, + "loss": 1.6493, + "step": 6071 + }, + { + "epoch": 0.07890287065718282, + "grad_norm": 0.40913069248199463, + "learning_rate": 0.00018425765866465643, + "loss": 1.4782, + "step": 6072 + }, + { + "epoch": 0.0789158652010987, + "grad_norm": 0.3326561152935028, + "learning_rate": 0.00018425505920274503, + "loss": 1.5368, + "step": 6073 + }, + { + "epoch": 0.07892885974501457, + "grad_norm": 0.43161246180534363, + "learning_rate": 0.00018425245974083366, + "loss": 1.6333, + "step": 6074 + }, + { + "epoch": 0.07894185428893044, + "grad_norm": 0.5879168510437012, + "learning_rate": 0.00018424986027892228, + "loss": 1.5427, + "step": 6075 + }, + { + "epoch": 0.07895484883284631, + "grad_norm": 0.41375404596328735, + "learning_rate": 0.00018424726081701088, + "loss": 1.5386, + "step": 6076 + }, + { + "epoch": 0.07896784337676219, + "grad_norm": 0.4308698773384094, + "learning_rate": 0.0001842446613550995, + "loss": 1.4401, + "step": 6077 + }, + { + "epoch": 0.07898083792067806, + "grad_norm": 0.453360915184021, + "learning_rate": 0.0001842420618931881, + "loss": 1.5339, + "step": 6078 + }, + { + "epoch": 0.07899383246459393, + "grad_norm": 0.4292598366737366, + "learning_rate": 0.00018423946243127675, + "loss": 1.4217, + "step": 6079 + }, + { + "epoch": 0.0790068270085098, + "grad_norm": 0.2839990556240082, + "learning_rate": 0.00018423686296936535, + "loss": 1.5659, + "step": 6080 + }, + { + "epoch": 0.07901982155242568, + "grad_norm": 0.3806931674480438, + "learning_rate": 0.00018423426350745397, + "loss": 1.4281, + "step": 6081 + }, + { + "epoch": 0.07903281609634155, + "grad_norm": 0.38026121258735657, + "learning_rate": 0.00018423166404554257, + "loss": 1.3982, + "step": 6082 + }, + { + "epoch": 0.07904581064025742, + "grad_norm": 0.36227384209632874, + "learning_rate": 0.0001842290645836312, + "loss": 1.3195, + "step": 6083 + }, + { + "epoch": 0.0790588051841733, + "grad_norm": 0.38675713539123535, + "learning_rate": 0.00018422646512171982, + "loss": 1.2942, + "step": 6084 + }, + { + "epoch": 0.07907179972808917, + "grad_norm": 0.3891057074069977, + "learning_rate": 0.00018422386565980842, + "loss": 1.5729, + "step": 6085 + }, + { + "epoch": 0.07908479427200504, + "grad_norm": 0.47500666975975037, + "learning_rate": 0.00018422126619789704, + "loss": 1.5495, + "step": 6086 + }, + { + "epoch": 0.07909778881592092, + "grad_norm": 0.4468681216239929, + "learning_rate": 0.00018421866673598567, + "loss": 1.4148, + "step": 6087 + }, + { + "epoch": 0.07911078335983679, + "grad_norm": 0.46201348304748535, + "learning_rate": 0.0001842160672740743, + "loss": 1.3881, + "step": 6088 + }, + { + "epoch": 0.07912377790375266, + "grad_norm": 0.4509305953979492, + "learning_rate": 0.0001842134678121629, + "loss": 1.4759, + "step": 6089 + }, + { + "epoch": 0.07913677244766854, + "grad_norm": 0.3844403326511383, + "learning_rate": 0.00018421086835025149, + "loss": 1.5496, + "step": 6090 + }, + { + "epoch": 0.07914976699158441, + "grad_norm": 0.3434497117996216, + "learning_rate": 0.00018420826888834014, + "loss": 1.4548, + "step": 6091 + }, + { + "epoch": 0.07916276153550028, + "grad_norm": 0.4298562705516815, + "learning_rate": 0.00018420566942642873, + "loss": 1.2458, + "step": 6092 + }, + { + "epoch": 0.07917575607941615, + "grad_norm": 0.3077503442764282, + "learning_rate": 0.00018420306996451736, + "loss": 1.3318, + "step": 6093 + }, + { + "epoch": 0.07918875062333203, + "grad_norm": 0.46109291911125183, + "learning_rate": 0.00018420047050260596, + "loss": 1.5237, + "step": 6094 + }, + { + "epoch": 0.0792017451672479, + "grad_norm": 0.36462199687957764, + "learning_rate": 0.00018419787104069458, + "loss": 1.3325, + "step": 6095 + }, + { + "epoch": 0.07921473971116377, + "grad_norm": 0.3409494459629059, + "learning_rate": 0.0001841952715787832, + "loss": 1.477, + "step": 6096 + }, + { + "epoch": 0.07922773425507965, + "grad_norm": 0.3770604431629181, + "learning_rate": 0.0001841926721168718, + "loss": 1.4151, + "step": 6097 + }, + { + "epoch": 0.07924072879899552, + "grad_norm": 0.44813933968544006, + "learning_rate": 0.00018419007265496043, + "loss": 1.3529, + "step": 6098 + }, + { + "epoch": 0.07925372334291139, + "grad_norm": 0.42887306213378906, + "learning_rate": 0.00018418747319304905, + "loss": 1.5434, + "step": 6099 + }, + { + "epoch": 0.07926671788682726, + "grad_norm": 0.3355839252471924, + "learning_rate": 0.00018418487373113768, + "loss": 1.5068, + "step": 6100 + }, + { + "epoch": 0.07927971243074314, + "grad_norm": 0.345017671585083, + "learning_rate": 0.00018418227426922627, + "loss": 1.5166, + "step": 6101 + }, + { + "epoch": 0.07929270697465901, + "grad_norm": 0.3053815960884094, + "learning_rate": 0.00018417967480731487, + "loss": 1.3564, + "step": 6102 + }, + { + "epoch": 0.07930570151857488, + "grad_norm": 0.4291446805000305, + "learning_rate": 0.00018417707534540352, + "loss": 1.5859, + "step": 6103 + }, + { + "epoch": 0.07931869606249076, + "grad_norm": 0.2839497923851013, + "learning_rate": 0.00018417447588349212, + "loss": 1.1365, + "step": 6104 + }, + { + "epoch": 0.07933169060640663, + "grad_norm": 0.32945430278778076, + "learning_rate": 0.00018417187642158074, + "loss": 1.3896, + "step": 6105 + }, + { + "epoch": 0.0793446851503225, + "grad_norm": 0.3918827772140503, + "learning_rate": 0.00018416927695966937, + "loss": 1.5631, + "step": 6106 + }, + { + "epoch": 0.07935767969423838, + "grad_norm": 0.3880796730518341, + "learning_rate": 0.00018416667749775797, + "loss": 1.5472, + "step": 6107 + }, + { + "epoch": 0.07937067423815425, + "grad_norm": 0.49858877062797546, + "learning_rate": 0.0001841640780358466, + "loss": 1.448, + "step": 6108 + }, + { + "epoch": 0.07938366878207012, + "grad_norm": 0.3416162431240082, + "learning_rate": 0.0001841614785739352, + "loss": 1.4361, + "step": 6109 + }, + { + "epoch": 0.07939666332598601, + "grad_norm": 0.27496257424354553, + "learning_rate": 0.00018415887911202384, + "loss": 1.4356, + "step": 6110 + }, + { + "epoch": 0.07940965786990188, + "grad_norm": 0.38354769349098206, + "learning_rate": 0.00018415627965011244, + "loss": 1.4506, + "step": 6111 + }, + { + "epoch": 0.07942265241381775, + "grad_norm": 0.37779149413108826, + "learning_rate": 0.00018415368018820106, + "loss": 1.5162, + "step": 6112 + }, + { + "epoch": 0.07943564695773363, + "grad_norm": 0.3469626307487488, + "learning_rate": 0.00018415108072628966, + "loss": 1.3967, + "step": 6113 + }, + { + "epoch": 0.0794486415016495, + "grad_norm": 0.4428900480270386, + "learning_rate": 0.00018414848126437828, + "loss": 1.3716, + "step": 6114 + }, + { + "epoch": 0.07946163604556537, + "grad_norm": 0.36185234785079956, + "learning_rate": 0.0001841458818024669, + "loss": 1.3165, + "step": 6115 + }, + { + "epoch": 0.07947463058948125, + "grad_norm": 0.5801428556442261, + "learning_rate": 0.0001841432823405555, + "loss": 1.5218, + "step": 6116 + }, + { + "epoch": 0.07948762513339712, + "grad_norm": 0.3739064335823059, + "learning_rate": 0.00018414068287864413, + "loss": 1.3396, + "step": 6117 + }, + { + "epoch": 0.07950061967731299, + "grad_norm": 0.4277260899543762, + "learning_rate": 0.00018413808341673275, + "loss": 1.5768, + "step": 6118 + }, + { + "epoch": 0.07951361422122886, + "grad_norm": 0.4436528980731964, + "learning_rate": 0.00018413548395482135, + "loss": 1.4144, + "step": 6119 + }, + { + "epoch": 0.07952660876514474, + "grad_norm": 0.3718056082725525, + "learning_rate": 0.00018413288449290998, + "loss": 1.5007, + "step": 6120 + }, + { + "epoch": 0.07953960330906061, + "grad_norm": 0.4448679983615875, + "learning_rate": 0.00018413028503099857, + "loss": 1.5605, + "step": 6121 + }, + { + "epoch": 0.07955259785297648, + "grad_norm": 0.28929075598716736, + "learning_rate": 0.00018412768556908723, + "loss": 1.3381, + "step": 6122 + }, + { + "epoch": 0.07956559239689236, + "grad_norm": 0.4016954004764557, + "learning_rate": 0.00018412508610717582, + "loss": 1.5228, + "step": 6123 + }, + { + "epoch": 0.07957858694080823, + "grad_norm": 0.3283591866493225, + "learning_rate": 0.00018412248664526445, + "loss": 1.2988, + "step": 6124 + }, + { + "epoch": 0.0795915814847241, + "grad_norm": 0.3484986424446106, + "learning_rate": 0.00018411988718335304, + "loss": 1.4909, + "step": 6125 + }, + { + "epoch": 0.07960457602863998, + "grad_norm": 0.4112135171890259, + "learning_rate": 0.00018411728772144167, + "loss": 1.3978, + "step": 6126 + }, + { + "epoch": 0.07961757057255585, + "grad_norm": 0.39837971329689026, + "learning_rate": 0.0001841146882595303, + "loss": 1.4346, + "step": 6127 + }, + { + "epoch": 0.07963056511647172, + "grad_norm": 0.4017678499221802, + "learning_rate": 0.0001841120887976189, + "loss": 1.5039, + "step": 6128 + }, + { + "epoch": 0.0796435596603876, + "grad_norm": 0.44885116815567017, + "learning_rate": 0.00018410948933570752, + "loss": 1.6208, + "step": 6129 + }, + { + "epoch": 0.07965655420430347, + "grad_norm": 0.4098648726940155, + "learning_rate": 0.00018410688987379614, + "loss": 1.4538, + "step": 6130 + }, + { + "epoch": 0.07966954874821934, + "grad_norm": 0.4489709436893463, + "learning_rate": 0.00018410429041188474, + "loss": 1.4321, + "step": 6131 + }, + { + "epoch": 0.07968254329213521, + "grad_norm": 0.40003854036331177, + "learning_rate": 0.00018410169094997336, + "loss": 1.4442, + "step": 6132 + }, + { + "epoch": 0.07969553783605109, + "grad_norm": 0.37231209874153137, + "learning_rate": 0.00018409909148806196, + "loss": 1.4277, + "step": 6133 + }, + { + "epoch": 0.07970853237996696, + "grad_norm": 0.43695852160453796, + "learning_rate": 0.0001840964920261506, + "loss": 1.4894, + "step": 6134 + }, + { + "epoch": 0.07972152692388283, + "grad_norm": 0.35560116171836853, + "learning_rate": 0.0001840938925642392, + "loss": 1.3351, + "step": 6135 + }, + { + "epoch": 0.0797345214677987, + "grad_norm": 0.26695865392684937, + "learning_rate": 0.00018409129310232783, + "loss": 1.3985, + "step": 6136 + }, + { + "epoch": 0.07974751601171458, + "grad_norm": 0.36609166860580444, + "learning_rate": 0.00018408869364041643, + "loss": 1.4149, + "step": 6137 + }, + { + "epoch": 0.07976051055563045, + "grad_norm": 0.3258199691772461, + "learning_rate": 0.00018408609417850505, + "loss": 1.3311, + "step": 6138 + }, + { + "epoch": 0.07977350509954632, + "grad_norm": 0.5638648867607117, + "learning_rate": 0.00018408349471659368, + "loss": 1.517, + "step": 6139 + }, + { + "epoch": 0.0797864996434622, + "grad_norm": 0.37880295515060425, + "learning_rate": 0.00018408089525468228, + "loss": 1.4107, + "step": 6140 + }, + { + "epoch": 0.07979949418737807, + "grad_norm": 0.43471813201904297, + "learning_rate": 0.00018407829579277093, + "loss": 1.4891, + "step": 6141 + }, + { + "epoch": 0.07981248873129394, + "grad_norm": 0.46257299184799194, + "learning_rate": 0.00018407569633085953, + "loss": 1.5192, + "step": 6142 + }, + { + "epoch": 0.07982548327520982, + "grad_norm": 0.48966357111930847, + "learning_rate": 0.00018407309686894815, + "loss": 1.3936, + "step": 6143 + }, + { + "epoch": 0.07983847781912569, + "grad_norm": 0.4501968324184418, + "learning_rate": 0.00018407049740703675, + "loss": 1.5821, + "step": 6144 + }, + { + "epoch": 0.07985147236304156, + "grad_norm": 0.502448558807373, + "learning_rate": 0.00018406789794512537, + "loss": 1.4745, + "step": 6145 + }, + { + "epoch": 0.07986446690695743, + "grad_norm": 0.37218302488327026, + "learning_rate": 0.000184065298483214, + "loss": 1.4213, + "step": 6146 + }, + { + "epoch": 0.07987746145087331, + "grad_norm": 0.4011833071708679, + "learning_rate": 0.0001840626990213026, + "loss": 1.3675, + "step": 6147 + }, + { + "epoch": 0.0798904559947892, + "grad_norm": 0.4688166081905365, + "learning_rate": 0.00018406009955939122, + "loss": 1.5187, + "step": 6148 + }, + { + "epoch": 0.07990345053870507, + "grad_norm": 0.3566301167011261, + "learning_rate": 0.00018405750009747984, + "loss": 1.3601, + "step": 6149 + }, + { + "epoch": 0.07991644508262094, + "grad_norm": 0.41925936937332153, + "learning_rate": 0.00018405490063556844, + "loss": 1.434, + "step": 6150 + }, + { + "epoch": 0.07992943962653681, + "grad_norm": 0.3408973813056946, + "learning_rate": 0.00018405230117365706, + "loss": 1.4679, + "step": 6151 + }, + { + "epoch": 0.07994243417045269, + "grad_norm": 0.42943698167800903, + "learning_rate": 0.00018404970171174566, + "loss": 1.4241, + "step": 6152 + }, + { + "epoch": 0.07995542871436856, + "grad_norm": 0.33581170439720154, + "learning_rate": 0.0001840471022498343, + "loss": 1.5286, + "step": 6153 + }, + { + "epoch": 0.07996842325828443, + "grad_norm": 0.32352644205093384, + "learning_rate": 0.0001840445027879229, + "loss": 1.2904, + "step": 6154 + }, + { + "epoch": 0.0799814178022003, + "grad_norm": 0.37474170327186584, + "learning_rate": 0.00018404190332601154, + "loss": 1.3927, + "step": 6155 + }, + { + "epoch": 0.07999441234611618, + "grad_norm": 0.4087858200073242, + "learning_rate": 0.00018403930386410013, + "loss": 1.3849, + "step": 6156 + }, + { + "epoch": 0.08000740689003205, + "grad_norm": 0.29384076595306396, + "learning_rate": 0.00018403670440218876, + "loss": 1.6336, + "step": 6157 + }, + { + "epoch": 0.08002040143394792, + "grad_norm": 0.4185171127319336, + "learning_rate": 0.00018403410494027738, + "loss": 1.4786, + "step": 6158 + }, + { + "epoch": 0.0800333959778638, + "grad_norm": 0.3421257436275482, + "learning_rate": 0.00018403150547836598, + "loss": 1.4873, + "step": 6159 + }, + { + "epoch": 0.08004639052177967, + "grad_norm": 0.3407057225704193, + "learning_rate": 0.0001840289060164546, + "loss": 1.3454, + "step": 6160 + }, + { + "epoch": 0.08005938506569554, + "grad_norm": 0.5290220379829407, + "learning_rate": 0.00018402630655454323, + "loss": 1.4196, + "step": 6161 + }, + { + "epoch": 0.08007237960961142, + "grad_norm": 0.5162290930747986, + "learning_rate": 0.00018402370709263183, + "loss": 1.5756, + "step": 6162 + }, + { + "epoch": 0.08008537415352729, + "grad_norm": 0.3592604100704193, + "learning_rate": 0.00018402110763072045, + "loss": 1.5719, + "step": 6163 + }, + { + "epoch": 0.08009836869744316, + "grad_norm": 0.41995272040367126, + "learning_rate": 0.00018401850816880905, + "loss": 1.361, + "step": 6164 + }, + { + "epoch": 0.08011136324135903, + "grad_norm": 0.44744640588760376, + "learning_rate": 0.0001840159087068977, + "loss": 1.4387, + "step": 6165 + }, + { + "epoch": 0.08012435778527491, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0001840133092449863, + "loss": 1.4521, + "step": 6166 + }, + { + "epoch": 0.08013735232919078, + "grad_norm": 0.3519055247306824, + "learning_rate": 0.00018401070978307492, + "loss": 1.4123, + "step": 6167 + }, + { + "epoch": 0.08015034687310665, + "grad_norm": 0.38523778319358826, + "learning_rate": 0.00018400811032116352, + "loss": 1.3916, + "step": 6168 + }, + { + "epoch": 0.08016334141702253, + "grad_norm": 0.38707244396209717, + "learning_rate": 0.00018400551085925214, + "loss": 1.4847, + "step": 6169 + }, + { + "epoch": 0.0801763359609384, + "grad_norm": 0.3970610201358795, + "learning_rate": 0.00018400291139734077, + "loss": 1.4294, + "step": 6170 + }, + { + "epoch": 0.08018933050485427, + "grad_norm": 0.3914804756641388, + "learning_rate": 0.00018400031193542936, + "loss": 1.339, + "step": 6171 + }, + { + "epoch": 0.08020232504877015, + "grad_norm": 0.35249534249305725, + "learning_rate": 0.000183997712473518, + "loss": 1.5041, + "step": 6172 + }, + { + "epoch": 0.08021531959268602, + "grad_norm": 0.3310585916042328, + "learning_rate": 0.0001839951130116066, + "loss": 1.459, + "step": 6173 + }, + { + "epoch": 0.08022831413660189, + "grad_norm": 0.20819032192230225, + "learning_rate": 0.0001839925135496952, + "loss": 1.157, + "step": 6174 + }, + { + "epoch": 0.08024130868051776, + "grad_norm": 0.2881905138492584, + "learning_rate": 0.00018398991408778383, + "loss": 1.1212, + "step": 6175 + }, + { + "epoch": 0.08025430322443364, + "grad_norm": 0.4127061069011688, + "learning_rate": 0.00018398731462587243, + "loss": 1.3076, + "step": 6176 + }, + { + "epoch": 0.08026729776834951, + "grad_norm": 0.34341922402381897, + "learning_rate": 0.00018398471516396108, + "loss": 1.6453, + "step": 6177 + }, + { + "epoch": 0.08028029231226538, + "grad_norm": 0.4085890054702759, + "learning_rate": 0.00018398211570204968, + "loss": 1.452, + "step": 6178 + }, + { + "epoch": 0.08029328685618126, + "grad_norm": 0.3887009024620056, + "learning_rate": 0.0001839795162401383, + "loss": 1.3789, + "step": 6179 + }, + { + "epoch": 0.08030628140009713, + "grad_norm": 0.40998849272727966, + "learning_rate": 0.00018397691677822693, + "loss": 1.4491, + "step": 6180 + }, + { + "epoch": 0.080319275944013, + "grad_norm": 0.3676016330718994, + "learning_rate": 0.00018397431731631553, + "loss": 1.1966, + "step": 6181 + }, + { + "epoch": 0.08033227048792888, + "grad_norm": 0.44718894362449646, + "learning_rate": 0.00018397171785440415, + "loss": 1.624, + "step": 6182 + }, + { + "epoch": 0.08034526503184475, + "grad_norm": 0.4542218744754791, + "learning_rate": 0.00018396911839249275, + "loss": 1.3156, + "step": 6183 + }, + { + "epoch": 0.08035825957576062, + "grad_norm": 0.3227229714393616, + "learning_rate": 0.0001839665189305814, + "loss": 1.3696, + "step": 6184 + }, + { + "epoch": 0.0803712541196765, + "grad_norm": 0.4354618489742279, + "learning_rate": 0.00018396391946867, + "loss": 1.4932, + "step": 6185 + }, + { + "epoch": 0.08038424866359238, + "grad_norm": 0.35693785548210144, + "learning_rate": 0.0001839613200067586, + "loss": 1.3006, + "step": 6186 + }, + { + "epoch": 0.08039724320750825, + "grad_norm": 0.3768044710159302, + "learning_rate": 0.00018395872054484722, + "loss": 1.4651, + "step": 6187 + }, + { + "epoch": 0.08041023775142413, + "grad_norm": 0.31924551725387573, + "learning_rate": 0.00018395612108293584, + "loss": 1.6138, + "step": 6188 + }, + { + "epoch": 0.08042323229534, + "grad_norm": 0.4328700602054596, + "learning_rate": 0.00018395352162102447, + "loss": 1.6016, + "step": 6189 + }, + { + "epoch": 0.08043622683925587, + "grad_norm": 0.358801931142807, + "learning_rate": 0.00018395092215911307, + "loss": 1.3202, + "step": 6190 + }, + { + "epoch": 0.08044922138317175, + "grad_norm": 0.612026035785675, + "learning_rate": 0.0001839483226972017, + "loss": 1.2749, + "step": 6191 + }, + { + "epoch": 0.08046221592708762, + "grad_norm": 0.3530118465423584, + "learning_rate": 0.00018394572323529032, + "loss": 1.4187, + "step": 6192 + }, + { + "epoch": 0.08047521047100349, + "grad_norm": 0.49632585048675537, + "learning_rate": 0.0001839431237733789, + "loss": 1.6824, + "step": 6193 + }, + { + "epoch": 0.08048820501491936, + "grad_norm": 0.3985925018787384, + "learning_rate": 0.00018394052431146754, + "loss": 1.416, + "step": 6194 + }, + { + "epoch": 0.08050119955883524, + "grad_norm": 0.5029274821281433, + "learning_rate": 0.00018393792484955613, + "loss": 1.3748, + "step": 6195 + }, + { + "epoch": 0.08051419410275111, + "grad_norm": 0.3749791085720062, + "learning_rate": 0.0001839353253876448, + "loss": 1.4471, + "step": 6196 + }, + { + "epoch": 0.08052718864666698, + "grad_norm": 0.4623998701572418, + "learning_rate": 0.00018393272592573338, + "loss": 1.3695, + "step": 6197 + }, + { + "epoch": 0.08054018319058286, + "grad_norm": 0.3979378938674927, + "learning_rate": 0.00018393012646382198, + "loss": 1.448, + "step": 6198 + }, + { + "epoch": 0.08055317773449873, + "grad_norm": 0.47189393639564514, + "learning_rate": 0.0001839275270019106, + "loss": 1.6139, + "step": 6199 + }, + { + "epoch": 0.0805661722784146, + "grad_norm": 0.33067888021469116, + "learning_rate": 0.00018392492753999923, + "loss": 1.623, + "step": 6200 + }, + { + "epoch": 0.08057916682233048, + "grad_norm": 0.3573934733867645, + "learning_rate": 0.00018392232807808785, + "loss": 1.3681, + "step": 6201 + }, + { + "epoch": 0.08059216136624635, + "grad_norm": 0.3497483730316162, + "learning_rate": 0.00018391972861617645, + "loss": 1.3895, + "step": 6202 + }, + { + "epoch": 0.08060515591016222, + "grad_norm": 0.4011506140232086, + "learning_rate": 0.00018391712915426508, + "loss": 1.5495, + "step": 6203 + }, + { + "epoch": 0.0806181504540781, + "grad_norm": 0.39159879088401794, + "learning_rate": 0.0001839145296923537, + "loss": 1.4483, + "step": 6204 + }, + { + "epoch": 0.08063114499799397, + "grad_norm": 0.3504270017147064, + "learning_rate": 0.0001839119302304423, + "loss": 1.5802, + "step": 6205 + }, + { + "epoch": 0.08064413954190984, + "grad_norm": 0.41678738594055176, + "learning_rate": 0.00018390933076853092, + "loss": 1.4952, + "step": 6206 + }, + { + "epoch": 0.08065713408582571, + "grad_norm": 0.3560841381549835, + "learning_rate": 0.00018390673130661952, + "loss": 1.4196, + "step": 6207 + }, + { + "epoch": 0.08067012862974159, + "grad_norm": 0.32116806507110596, + "learning_rate": 0.00018390413184470817, + "loss": 1.2481, + "step": 6208 + }, + { + "epoch": 0.08068312317365746, + "grad_norm": 0.39083969593048096, + "learning_rate": 0.00018390153238279677, + "loss": 1.385, + "step": 6209 + }, + { + "epoch": 0.08069611771757333, + "grad_norm": 0.4282075762748718, + "learning_rate": 0.0001838989329208854, + "loss": 1.5725, + "step": 6210 + }, + { + "epoch": 0.0807091122614892, + "grad_norm": 0.3888271152973175, + "learning_rate": 0.000183896333458974, + "loss": 1.475, + "step": 6211 + }, + { + "epoch": 0.08072210680540508, + "grad_norm": 0.4699666202068329, + "learning_rate": 0.00018389373399706262, + "loss": 1.4977, + "step": 6212 + }, + { + "epoch": 0.08073510134932095, + "grad_norm": 0.38986507058143616, + "learning_rate": 0.00018389113453515124, + "loss": 1.2864, + "step": 6213 + }, + { + "epoch": 0.08074809589323682, + "grad_norm": 0.3026287257671356, + "learning_rate": 0.00018388853507323984, + "loss": 1.2347, + "step": 6214 + }, + { + "epoch": 0.0807610904371527, + "grad_norm": 0.40429025888442993, + "learning_rate": 0.00018388593561132846, + "loss": 1.5108, + "step": 6215 + }, + { + "epoch": 0.08077408498106857, + "grad_norm": 0.3776196837425232, + "learning_rate": 0.00018388333614941709, + "loss": 1.7085, + "step": 6216 + }, + { + "epoch": 0.08078707952498444, + "grad_norm": 0.37580227851867676, + "learning_rate": 0.00018388073668750568, + "loss": 1.3545, + "step": 6217 + }, + { + "epoch": 0.08080007406890032, + "grad_norm": 0.27544987201690674, + "learning_rate": 0.0001838781372255943, + "loss": 1.2992, + "step": 6218 + }, + { + "epoch": 0.08081306861281619, + "grad_norm": 0.37173834443092346, + "learning_rate": 0.00018387553776368293, + "loss": 1.4749, + "step": 6219 + }, + { + "epoch": 0.08082606315673206, + "grad_norm": 0.29365089535713196, + "learning_rate": 0.00018387293830177156, + "loss": 1.5172, + "step": 6220 + }, + { + "epoch": 0.08083905770064793, + "grad_norm": 0.39724504947662354, + "learning_rate": 0.00018387033883986015, + "loss": 1.4817, + "step": 6221 + }, + { + "epoch": 0.08085205224456381, + "grad_norm": 0.43359705805778503, + "learning_rate": 0.00018386773937794878, + "loss": 1.643, + "step": 6222 + }, + { + "epoch": 0.08086504678847968, + "grad_norm": 0.32017529010772705, + "learning_rate": 0.0001838651399160374, + "loss": 1.6074, + "step": 6223 + }, + { + "epoch": 0.08087804133239557, + "grad_norm": 0.33276161551475525, + "learning_rate": 0.000183862540454126, + "loss": 1.2605, + "step": 6224 + }, + { + "epoch": 0.08089103587631144, + "grad_norm": 0.36590129137039185, + "learning_rate": 0.00018385994099221463, + "loss": 1.3109, + "step": 6225 + }, + { + "epoch": 0.08090403042022731, + "grad_norm": 0.38914740085601807, + "learning_rate": 0.00018385734153030322, + "loss": 1.6086, + "step": 6226 + }, + { + "epoch": 0.08091702496414319, + "grad_norm": 0.34516239166259766, + "learning_rate": 0.00018385474206839185, + "loss": 1.439, + "step": 6227 + }, + { + "epoch": 0.08093001950805906, + "grad_norm": 0.4240453243255615, + "learning_rate": 0.00018385214260648047, + "loss": 1.4602, + "step": 6228 + }, + { + "epoch": 0.08094301405197493, + "grad_norm": 0.41503041982650757, + "learning_rate": 0.00018384954314456907, + "loss": 1.3663, + "step": 6229 + }, + { + "epoch": 0.0809560085958908, + "grad_norm": 0.35573306679725647, + "learning_rate": 0.0001838469436826577, + "loss": 1.6092, + "step": 6230 + }, + { + "epoch": 0.08096900313980668, + "grad_norm": 0.3753318190574646, + "learning_rate": 0.00018384434422074632, + "loss": 1.4279, + "step": 6231 + }, + { + "epoch": 0.08098199768372255, + "grad_norm": 0.4131934344768524, + "learning_rate": 0.00018384174475883494, + "loss": 1.2911, + "step": 6232 + }, + { + "epoch": 0.08099499222763842, + "grad_norm": 0.38943934440612793, + "learning_rate": 0.00018383914529692354, + "loss": 1.3937, + "step": 6233 + }, + { + "epoch": 0.0810079867715543, + "grad_norm": 0.44479724764823914, + "learning_rate": 0.00018383654583501216, + "loss": 1.5545, + "step": 6234 + }, + { + "epoch": 0.08102098131547017, + "grad_norm": 0.34958744049072266, + "learning_rate": 0.0001838339463731008, + "loss": 1.2487, + "step": 6235 + }, + { + "epoch": 0.08103397585938604, + "grad_norm": 0.2523711919784546, + "learning_rate": 0.00018383134691118939, + "loss": 1.4417, + "step": 6236 + }, + { + "epoch": 0.08104697040330192, + "grad_norm": 0.49556130170822144, + "learning_rate": 0.000183828747449278, + "loss": 1.3459, + "step": 6237 + }, + { + "epoch": 0.08105996494721779, + "grad_norm": 0.43096816539764404, + "learning_rate": 0.0001838261479873666, + "loss": 1.5894, + "step": 6238 + }, + { + "epoch": 0.08107295949113366, + "grad_norm": 0.2899806797504425, + "learning_rate": 0.00018382354852545526, + "loss": 1.4316, + "step": 6239 + }, + { + "epoch": 0.08108595403504953, + "grad_norm": 0.35303351283073425, + "learning_rate": 0.00018382094906354386, + "loss": 1.3656, + "step": 6240 + }, + { + "epoch": 0.08109894857896541, + "grad_norm": 0.3945399224758148, + "learning_rate": 0.00018381834960163245, + "loss": 1.4313, + "step": 6241 + }, + { + "epoch": 0.08111194312288128, + "grad_norm": 0.45361238718032837, + "learning_rate": 0.00018381575013972108, + "loss": 1.2535, + "step": 6242 + }, + { + "epoch": 0.08112493766679715, + "grad_norm": 0.46176430583000183, + "learning_rate": 0.0001838131506778097, + "loss": 1.517, + "step": 6243 + }, + { + "epoch": 0.08113793221071303, + "grad_norm": 0.4771147072315216, + "learning_rate": 0.00018381055121589833, + "loss": 1.3434, + "step": 6244 + }, + { + "epoch": 0.0811509267546289, + "grad_norm": 0.30673474073410034, + "learning_rate": 0.00018380795175398693, + "loss": 1.3649, + "step": 6245 + }, + { + "epoch": 0.08116392129854477, + "grad_norm": 0.3452615439891815, + "learning_rate": 0.00018380535229207555, + "loss": 1.3725, + "step": 6246 + }, + { + "epoch": 0.08117691584246065, + "grad_norm": 0.416964590549469, + "learning_rate": 0.00018380275283016417, + "loss": 1.4851, + "step": 6247 + }, + { + "epoch": 0.08118991038637652, + "grad_norm": 0.38670051097869873, + "learning_rate": 0.00018380015336825277, + "loss": 1.3859, + "step": 6248 + }, + { + "epoch": 0.08120290493029239, + "grad_norm": 0.4432021975517273, + "learning_rate": 0.0001837975539063414, + "loss": 1.4931, + "step": 6249 + }, + { + "epoch": 0.08121589947420826, + "grad_norm": 0.4541003108024597, + "learning_rate": 0.00018379495444443, + "loss": 1.5087, + "step": 6250 + }, + { + "epoch": 0.08122889401812414, + "grad_norm": 0.45787960290908813, + "learning_rate": 0.00018379235498251865, + "loss": 1.5582, + "step": 6251 + }, + { + "epoch": 0.08124188856204001, + "grad_norm": 0.30369269847869873, + "learning_rate": 0.00018378975552060724, + "loss": 1.3281, + "step": 6252 + }, + { + "epoch": 0.08125488310595588, + "grad_norm": 0.3906219005584717, + "learning_rate": 0.00018378715605869584, + "loss": 1.4755, + "step": 6253 + }, + { + "epoch": 0.08126787764987176, + "grad_norm": 0.3645680248737335, + "learning_rate": 0.0001837845565967845, + "loss": 1.66, + "step": 6254 + }, + { + "epoch": 0.08128087219378763, + "grad_norm": 0.3993740975856781, + "learning_rate": 0.0001837819571348731, + "loss": 1.2824, + "step": 6255 + }, + { + "epoch": 0.0812938667377035, + "grad_norm": 0.38091859221458435, + "learning_rate": 0.0001837793576729617, + "loss": 1.556, + "step": 6256 + }, + { + "epoch": 0.08130686128161937, + "grad_norm": 0.4366264045238495, + "learning_rate": 0.0001837767582110503, + "loss": 1.3614, + "step": 6257 + }, + { + "epoch": 0.08131985582553525, + "grad_norm": 0.4241064786911011, + "learning_rate": 0.00018377415874913894, + "loss": 1.5158, + "step": 6258 + }, + { + "epoch": 0.08133285036945112, + "grad_norm": 0.38198429346084595, + "learning_rate": 0.00018377155928722756, + "loss": 1.4414, + "step": 6259 + }, + { + "epoch": 0.081345844913367, + "grad_norm": 0.4117804169654846, + "learning_rate": 0.00018376895982531616, + "loss": 1.4767, + "step": 6260 + }, + { + "epoch": 0.08135883945728287, + "grad_norm": 0.4990704357624054, + "learning_rate": 0.00018376636036340478, + "loss": 1.2187, + "step": 6261 + }, + { + "epoch": 0.08137183400119875, + "grad_norm": 0.3977000117301941, + "learning_rate": 0.0001837637609014934, + "loss": 1.5617, + "step": 6262 + }, + { + "epoch": 0.08138482854511463, + "grad_norm": 0.398823618888855, + "learning_rate": 0.00018376116143958203, + "loss": 1.5426, + "step": 6263 + }, + { + "epoch": 0.0813978230890305, + "grad_norm": 0.4289095103740692, + "learning_rate": 0.00018375856197767063, + "loss": 1.5082, + "step": 6264 + }, + { + "epoch": 0.08141081763294637, + "grad_norm": 0.3886120319366455, + "learning_rate": 0.00018375596251575925, + "loss": 1.7187, + "step": 6265 + }, + { + "epoch": 0.08142381217686225, + "grad_norm": 0.34749701619148254, + "learning_rate": 0.00018375336305384788, + "loss": 1.3236, + "step": 6266 + }, + { + "epoch": 0.08143680672077812, + "grad_norm": 0.367283433675766, + "learning_rate": 0.00018375076359193647, + "loss": 1.4599, + "step": 6267 + }, + { + "epoch": 0.08144980126469399, + "grad_norm": 0.3778984248638153, + "learning_rate": 0.0001837481641300251, + "loss": 1.5789, + "step": 6268 + }, + { + "epoch": 0.08146279580860986, + "grad_norm": 0.6308656930923462, + "learning_rate": 0.0001837455646681137, + "loss": 1.3317, + "step": 6269 + }, + { + "epoch": 0.08147579035252574, + "grad_norm": 0.38500407338142395, + "learning_rate": 0.00018374296520620232, + "loss": 1.437, + "step": 6270 + }, + { + "epoch": 0.08148878489644161, + "grad_norm": 0.415232390165329, + "learning_rate": 0.00018374036574429095, + "loss": 1.4343, + "step": 6271 + }, + { + "epoch": 0.08150177944035748, + "grad_norm": 0.3456633687019348, + "learning_rate": 0.00018373776628237954, + "loss": 1.3905, + "step": 6272 + }, + { + "epoch": 0.08151477398427336, + "grad_norm": 0.40678590536117554, + "learning_rate": 0.00018373516682046817, + "loss": 1.4074, + "step": 6273 + }, + { + "epoch": 0.08152776852818923, + "grad_norm": 0.3896959125995636, + "learning_rate": 0.0001837325673585568, + "loss": 1.3847, + "step": 6274 + }, + { + "epoch": 0.0815407630721051, + "grad_norm": 0.41664302349090576, + "learning_rate": 0.00018372996789664542, + "loss": 1.6356, + "step": 6275 + }, + { + "epoch": 0.08155375761602097, + "grad_norm": 0.3730113208293915, + "learning_rate": 0.000183727368434734, + "loss": 1.5023, + "step": 6276 + }, + { + "epoch": 0.08156675215993685, + "grad_norm": 0.30903345346450806, + "learning_rate": 0.00018372476897282264, + "loss": 1.3195, + "step": 6277 + }, + { + "epoch": 0.08157974670385272, + "grad_norm": 0.4093523323535919, + "learning_rate": 0.00018372216951091126, + "loss": 1.5907, + "step": 6278 + }, + { + "epoch": 0.0815927412477686, + "grad_norm": 0.49078160524368286, + "learning_rate": 0.00018371957004899986, + "loss": 1.6554, + "step": 6279 + }, + { + "epoch": 0.08160573579168447, + "grad_norm": 0.49689167737960815, + "learning_rate": 0.00018371697058708848, + "loss": 1.5196, + "step": 6280 + }, + { + "epoch": 0.08161873033560034, + "grad_norm": 0.43951088190078735, + "learning_rate": 0.00018371437112517708, + "loss": 1.5346, + "step": 6281 + }, + { + "epoch": 0.08163172487951621, + "grad_norm": 0.3622475862503052, + "learning_rate": 0.0001837117716632657, + "loss": 1.609, + "step": 6282 + }, + { + "epoch": 0.08164471942343209, + "grad_norm": 0.5064297318458557, + "learning_rate": 0.00018370917220135433, + "loss": 1.3538, + "step": 6283 + }, + { + "epoch": 0.08165771396734796, + "grad_norm": 0.39422526955604553, + "learning_rate": 0.00018370657273944293, + "loss": 1.542, + "step": 6284 + }, + { + "epoch": 0.08167070851126383, + "grad_norm": 0.4281027615070343, + "learning_rate": 0.00018370397327753155, + "loss": 1.5572, + "step": 6285 + }, + { + "epoch": 0.0816837030551797, + "grad_norm": 0.4165332317352295, + "learning_rate": 0.00018370137381562018, + "loss": 1.4751, + "step": 6286 + }, + { + "epoch": 0.08169669759909558, + "grad_norm": 0.30757102370262146, + "learning_rate": 0.0001836987743537088, + "loss": 1.3309, + "step": 6287 + }, + { + "epoch": 0.08170969214301145, + "grad_norm": 0.4747482240200043, + "learning_rate": 0.0001836961748917974, + "loss": 1.575, + "step": 6288 + }, + { + "epoch": 0.08172268668692732, + "grad_norm": 0.3442952334880829, + "learning_rate": 0.00018369357542988602, + "loss": 1.2855, + "step": 6289 + }, + { + "epoch": 0.0817356812308432, + "grad_norm": 0.27735987305641174, + "learning_rate": 0.00018369097596797465, + "loss": 1.2472, + "step": 6290 + }, + { + "epoch": 0.08174867577475907, + "grad_norm": 0.30388858914375305, + "learning_rate": 0.00018368837650606325, + "loss": 1.4579, + "step": 6291 + }, + { + "epoch": 0.08176167031867494, + "grad_norm": 0.3185122311115265, + "learning_rate": 0.00018368577704415187, + "loss": 1.482, + "step": 6292 + }, + { + "epoch": 0.08177466486259082, + "grad_norm": 0.4130038619041443, + "learning_rate": 0.0001836831775822405, + "loss": 1.403, + "step": 6293 + }, + { + "epoch": 0.08178765940650669, + "grad_norm": 0.4267940819263458, + "learning_rate": 0.00018368057812032912, + "loss": 1.5247, + "step": 6294 + }, + { + "epoch": 0.08180065395042256, + "grad_norm": 0.3583746552467346, + "learning_rate": 0.00018367797865841772, + "loss": 1.4195, + "step": 6295 + }, + { + "epoch": 0.08181364849433843, + "grad_norm": 0.37809985876083374, + "learning_rate": 0.0001836753791965063, + "loss": 1.4028, + "step": 6296 + }, + { + "epoch": 0.08182664303825431, + "grad_norm": 0.4940185248851776, + "learning_rate": 0.00018367277973459496, + "loss": 1.4484, + "step": 6297 + }, + { + "epoch": 0.08183963758217018, + "grad_norm": 0.3952077329158783, + "learning_rate": 0.00018367018027268356, + "loss": 1.4168, + "step": 6298 + }, + { + "epoch": 0.08185263212608605, + "grad_norm": 0.4756681025028229, + "learning_rate": 0.0001836675808107722, + "loss": 1.6063, + "step": 6299 + }, + { + "epoch": 0.08186562667000194, + "grad_norm": 0.31230786442756653, + "learning_rate": 0.00018366498134886078, + "loss": 1.5183, + "step": 6300 + }, + { + "epoch": 0.08187862121391781, + "grad_norm": 0.3104645311832428, + "learning_rate": 0.0001836623818869494, + "loss": 1.275, + "step": 6301 + }, + { + "epoch": 0.08189161575783369, + "grad_norm": 0.36013808846473694, + "learning_rate": 0.00018365978242503803, + "loss": 1.5446, + "step": 6302 + }, + { + "epoch": 0.08190461030174956, + "grad_norm": 0.4131903052330017, + "learning_rate": 0.00018365718296312663, + "loss": 1.3805, + "step": 6303 + }, + { + "epoch": 0.08191760484566543, + "grad_norm": 0.41663122177124023, + "learning_rate": 0.00018365458350121526, + "loss": 1.6242, + "step": 6304 + }, + { + "epoch": 0.0819305993895813, + "grad_norm": 0.379584938287735, + "learning_rate": 0.00018365198403930388, + "loss": 1.4697, + "step": 6305 + }, + { + "epoch": 0.08194359393349718, + "grad_norm": 0.3859310746192932, + "learning_rate": 0.0001836493845773925, + "loss": 1.6415, + "step": 6306 + }, + { + "epoch": 0.08195658847741305, + "grad_norm": 0.40626102685928345, + "learning_rate": 0.0001836467851154811, + "loss": 1.519, + "step": 6307 + }, + { + "epoch": 0.08196958302132892, + "grad_norm": 0.4381747245788574, + "learning_rate": 0.0001836441856535697, + "loss": 1.4524, + "step": 6308 + }, + { + "epoch": 0.0819825775652448, + "grad_norm": 0.42206859588623047, + "learning_rate": 0.00018364158619165835, + "loss": 1.6181, + "step": 6309 + }, + { + "epoch": 0.08199557210916067, + "grad_norm": 0.4496839642524719, + "learning_rate": 0.00018363898672974695, + "loss": 1.4177, + "step": 6310 + }, + { + "epoch": 0.08200856665307654, + "grad_norm": 0.4506014585494995, + "learning_rate": 0.00018363638726783557, + "loss": 1.5302, + "step": 6311 + }, + { + "epoch": 0.08202156119699242, + "grad_norm": 0.4212636947631836, + "learning_rate": 0.00018363378780592417, + "loss": 1.6268, + "step": 6312 + }, + { + "epoch": 0.08203455574090829, + "grad_norm": 0.4072587192058563, + "learning_rate": 0.0001836311883440128, + "loss": 1.3955, + "step": 6313 + }, + { + "epoch": 0.08204755028482416, + "grad_norm": 0.3774198889732361, + "learning_rate": 0.00018362858888210142, + "loss": 1.3902, + "step": 6314 + }, + { + "epoch": 0.08206054482874003, + "grad_norm": 0.40328261256217957, + "learning_rate": 0.00018362598942019002, + "loss": 1.4437, + "step": 6315 + }, + { + "epoch": 0.08207353937265591, + "grad_norm": 0.32378828525543213, + "learning_rate": 0.00018362338995827864, + "loss": 1.5592, + "step": 6316 + }, + { + "epoch": 0.08208653391657178, + "grad_norm": 0.42563384771347046, + "learning_rate": 0.00018362079049636726, + "loss": 1.5266, + "step": 6317 + }, + { + "epoch": 0.08209952846048765, + "grad_norm": 0.4518590569496155, + "learning_rate": 0.0001836181910344559, + "loss": 1.4812, + "step": 6318 + }, + { + "epoch": 0.08211252300440353, + "grad_norm": 0.36745765805244446, + "learning_rate": 0.0001836155915725445, + "loss": 1.2738, + "step": 6319 + }, + { + "epoch": 0.0821255175483194, + "grad_norm": 0.44639402627944946, + "learning_rate": 0.00018361299211063308, + "loss": 1.3873, + "step": 6320 + }, + { + "epoch": 0.08213851209223527, + "grad_norm": 0.36386215686798096, + "learning_rate": 0.00018361039264872174, + "loss": 1.4629, + "step": 6321 + }, + { + "epoch": 0.08215150663615114, + "grad_norm": 0.30434849858283997, + "learning_rate": 0.00018360779318681033, + "loss": 1.2594, + "step": 6322 + }, + { + "epoch": 0.08216450118006702, + "grad_norm": 0.34231579303741455, + "learning_rate": 0.00018360519372489896, + "loss": 1.5994, + "step": 6323 + }, + { + "epoch": 0.08217749572398289, + "grad_norm": 0.43207627534866333, + "learning_rate": 0.00018360259426298755, + "loss": 1.4636, + "step": 6324 + }, + { + "epoch": 0.08219049026789876, + "grad_norm": 0.4585356116294861, + "learning_rate": 0.00018359999480107618, + "loss": 1.6959, + "step": 6325 + }, + { + "epoch": 0.08220348481181464, + "grad_norm": 0.28777074813842773, + "learning_rate": 0.0001835973953391648, + "loss": 1.5394, + "step": 6326 + }, + { + "epoch": 0.08221647935573051, + "grad_norm": 0.4813830554485321, + "learning_rate": 0.0001835947958772534, + "loss": 1.6296, + "step": 6327 + }, + { + "epoch": 0.08222947389964638, + "grad_norm": 0.33295485377311707, + "learning_rate": 0.00018359219641534205, + "loss": 1.2077, + "step": 6328 + }, + { + "epoch": 0.08224246844356226, + "grad_norm": 0.3948964476585388, + "learning_rate": 0.00018358959695343065, + "loss": 1.458, + "step": 6329 + }, + { + "epoch": 0.08225546298747813, + "grad_norm": 0.6034369468688965, + "learning_rate": 0.00018358699749151927, + "loss": 1.5965, + "step": 6330 + }, + { + "epoch": 0.082268457531394, + "grad_norm": 0.39594656229019165, + "learning_rate": 0.00018358439802960787, + "loss": 1.494, + "step": 6331 + }, + { + "epoch": 0.08228145207530987, + "grad_norm": 0.36615267395973206, + "learning_rate": 0.0001835817985676965, + "loss": 1.3831, + "step": 6332 + }, + { + "epoch": 0.08229444661922575, + "grad_norm": 0.4499756693840027, + "learning_rate": 0.00018357919910578512, + "loss": 1.5744, + "step": 6333 + }, + { + "epoch": 0.08230744116314162, + "grad_norm": 0.33017051219940186, + "learning_rate": 0.00018357659964387372, + "loss": 1.6343, + "step": 6334 + }, + { + "epoch": 0.0823204357070575, + "grad_norm": 0.3306119441986084, + "learning_rate": 0.00018357400018196234, + "loss": 1.394, + "step": 6335 + }, + { + "epoch": 0.08233343025097337, + "grad_norm": 0.3868931531906128, + "learning_rate": 0.00018357140072005097, + "loss": 1.5379, + "step": 6336 + }, + { + "epoch": 0.08234642479488924, + "grad_norm": 0.42174673080444336, + "learning_rate": 0.00018356880125813956, + "loss": 1.3303, + "step": 6337 + }, + { + "epoch": 0.08235941933880513, + "grad_norm": 0.4145820438861847, + "learning_rate": 0.0001835662017962282, + "loss": 1.3131, + "step": 6338 + }, + { + "epoch": 0.082372413882721, + "grad_norm": 0.43100664019584656, + "learning_rate": 0.0001835636023343168, + "loss": 1.2742, + "step": 6339 + }, + { + "epoch": 0.08238540842663687, + "grad_norm": 0.33210310339927673, + "learning_rate": 0.00018356100287240544, + "loss": 1.4114, + "step": 6340 + }, + { + "epoch": 0.08239840297055274, + "grad_norm": 0.4400142431259155, + "learning_rate": 0.00018355840341049404, + "loss": 1.5164, + "step": 6341 + }, + { + "epoch": 0.08241139751446862, + "grad_norm": 0.2951825261116028, + "learning_rate": 0.00018355580394858266, + "loss": 1.3462, + "step": 6342 + }, + { + "epoch": 0.08242439205838449, + "grad_norm": 0.35411882400512695, + "learning_rate": 0.00018355320448667126, + "loss": 1.4355, + "step": 6343 + }, + { + "epoch": 0.08243738660230036, + "grad_norm": 0.3669249713420868, + "learning_rate": 0.00018355060502475988, + "loss": 1.5282, + "step": 6344 + }, + { + "epoch": 0.08245038114621624, + "grad_norm": 0.5035409927368164, + "learning_rate": 0.0001835480055628485, + "loss": 1.3305, + "step": 6345 + }, + { + "epoch": 0.08246337569013211, + "grad_norm": 0.3865646421909332, + "learning_rate": 0.0001835454061009371, + "loss": 1.4901, + "step": 6346 + }, + { + "epoch": 0.08247637023404798, + "grad_norm": 0.44800180196762085, + "learning_rate": 0.00018354280663902573, + "loss": 1.31, + "step": 6347 + }, + { + "epoch": 0.08248936477796386, + "grad_norm": 0.3423851728439331, + "learning_rate": 0.00018354020717711435, + "loss": 1.4015, + "step": 6348 + }, + { + "epoch": 0.08250235932187973, + "grad_norm": 0.5188177227973938, + "learning_rate": 0.00018353760771520298, + "loss": 1.6804, + "step": 6349 + }, + { + "epoch": 0.0825153538657956, + "grad_norm": 0.3972238600254059, + "learning_rate": 0.00018353500825329157, + "loss": 1.3281, + "step": 6350 + }, + { + "epoch": 0.08252834840971147, + "grad_norm": 0.40426206588745117, + "learning_rate": 0.00018353240879138017, + "loss": 1.6123, + "step": 6351 + }, + { + "epoch": 0.08254134295362735, + "grad_norm": 0.4127432703971863, + "learning_rate": 0.00018352980932946882, + "loss": 1.4924, + "step": 6352 + }, + { + "epoch": 0.08255433749754322, + "grad_norm": 0.280304878950119, + "learning_rate": 0.00018352720986755742, + "loss": 1.392, + "step": 6353 + }, + { + "epoch": 0.0825673320414591, + "grad_norm": 0.45447084307670593, + "learning_rate": 0.00018352461040564605, + "loss": 1.4239, + "step": 6354 + }, + { + "epoch": 0.08258032658537497, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.00018352201094373464, + "loss": 1.3715, + "step": 6355 + }, + { + "epoch": 0.08259332112929084, + "grad_norm": 0.40163177251815796, + "learning_rate": 0.00018351941148182327, + "loss": 1.2484, + "step": 6356 + }, + { + "epoch": 0.08260631567320671, + "grad_norm": 0.4098997116088867, + "learning_rate": 0.0001835168120199119, + "loss": 1.4671, + "step": 6357 + }, + { + "epoch": 0.08261931021712259, + "grad_norm": 0.3810834288597107, + "learning_rate": 0.0001835142125580005, + "loss": 1.5171, + "step": 6358 + }, + { + "epoch": 0.08263230476103846, + "grad_norm": 0.41706377267837524, + "learning_rate": 0.00018351161309608911, + "loss": 1.4981, + "step": 6359 + }, + { + "epoch": 0.08264529930495433, + "grad_norm": 0.4371885061264038, + "learning_rate": 0.00018350901363417774, + "loss": 1.4537, + "step": 6360 + }, + { + "epoch": 0.0826582938488702, + "grad_norm": 0.3304905295372009, + "learning_rate": 0.00018350641417226636, + "loss": 1.337, + "step": 6361 + }, + { + "epoch": 0.08267128839278608, + "grad_norm": 0.41819995641708374, + "learning_rate": 0.00018350381471035496, + "loss": 1.2867, + "step": 6362 + }, + { + "epoch": 0.08268428293670195, + "grad_norm": 0.4859756827354431, + "learning_rate": 0.00018350121524844358, + "loss": 1.4621, + "step": 6363 + }, + { + "epoch": 0.08269727748061782, + "grad_norm": 0.32665860652923584, + "learning_rate": 0.0001834986157865322, + "loss": 1.3724, + "step": 6364 + }, + { + "epoch": 0.0827102720245337, + "grad_norm": 0.3746100664138794, + "learning_rate": 0.0001834960163246208, + "loss": 1.4845, + "step": 6365 + }, + { + "epoch": 0.08272326656844957, + "grad_norm": 0.3771091103553772, + "learning_rate": 0.00018349341686270943, + "loss": 1.4149, + "step": 6366 + }, + { + "epoch": 0.08273626111236544, + "grad_norm": 0.38788601756095886, + "learning_rate": 0.00018349081740079806, + "loss": 1.3736, + "step": 6367 + }, + { + "epoch": 0.08274925565628131, + "grad_norm": 0.42723456025123596, + "learning_rate": 0.00018348821793888665, + "loss": 1.5531, + "step": 6368 + }, + { + "epoch": 0.08276225020019719, + "grad_norm": 0.3780154287815094, + "learning_rate": 0.00018348561847697528, + "loss": 1.5811, + "step": 6369 + }, + { + "epoch": 0.08277524474411306, + "grad_norm": 0.4152792990207672, + "learning_rate": 0.00018348301901506387, + "loss": 1.4255, + "step": 6370 + }, + { + "epoch": 0.08278823928802893, + "grad_norm": 0.34465155005455017, + "learning_rate": 0.00018348041955315253, + "loss": 1.4392, + "step": 6371 + }, + { + "epoch": 0.0828012338319448, + "grad_norm": 0.379810094833374, + "learning_rate": 0.00018347782009124112, + "loss": 1.5304, + "step": 6372 + }, + { + "epoch": 0.08281422837586068, + "grad_norm": 0.40812456607818604, + "learning_rate": 0.00018347522062932975, + "loss": 1.3356, + "step": 6373 + }, + { + "epoch": 0.08282722291977655, + "grad_norm": 0.4013248383998871, + "learning_rate": 0.00018347262116741835, + "loss": 1.7343, + "step": 6374 + }, + { + "epoch": 0.08284021746369243, + "grad_norm": 0.3170357644557953, + "learning_rate": 0.00018347002170550697, + "loss": 1.4234, + "step": 6375 + }, + { + "epoch": 0.08285321200760831, + "grad_norm": 0.38626012206077576, + "learning_rate": 0.0001834674222435956, + "loss": 1.6129, + "step": 6376 + }, + { + "epoch": 0.08286620655152419, + "grad_norm": 0.44640329480171204, + "learning_rate": 0.0001834648227816842, + "loss": 1.5105, + "step": 6377 + }, + { + "epoch": 0.08287920109544006, + "grad_norm": 0.3134946823120117, + "learning_rate": 0.00018346222331977282, + "loss": 1.413, + "step": 6378 + }, + { + "epoch": 0.08289219563935593, + "grad_norm": 0.3978760838508606, + "learning_rate": 0.00018345962385786144, + "loss": 1.327, + "step": 6379 + }, + { + "epoch": 0.0829051901832718, + "grad_norm": 0.4487209618091583, + "learning_rate": 0.00018345702439595004, + "loss": 1.4105, + "step": 6380 + }, + { + "epoch": 0.08291818472718768, + "grad_norm": 0.3922748565673828, + "learning_rate": 0.00018345442493403866, + "loss": 1.5619, + "step": 6381 + }, + { + "epoch": 0.08293117927110355, + "grad_norm": 0.35467037558555603, + "learning_rate": 0.00018345182547212726, + "loss": 1.5109, + "step": 6382 + }, + { + "epoch": 0.08294417381501942, + "grad_norm": 0.35295793414115906, + "learning_rate": 0.0001834492260102159, + "loss": 1.4991, + "step": 6383 + }, + { + "epoch": 0.0829571683589353, + "grad_norm": 0.408783882856369, + "learning_rate": 0.0001834466265483045, + "loss": 1.4983, + "step": 6384 + }, + { + "epoch": 0.08297016290285117, + "grad_norm": 0.39997902512550354, + "learning_rate": 0.00018344402708639313, + "loss": 1.3185, + "step": 6385 + }, + { + "epoch": 0.08298315744676704, + "grad_norm": 0.36605584621429443, + "learning_rate": 0.00018344142762448173, + "loss": 1.3637, + "step": 6386 + }, + { + "epoch": 0.08299615199068291, + "grad_norm": 0.48475515842437744, + "learning_rate": 0.00018343882816257036, + "loss": 1.4331, + "step": 6387 + }, + { + "epoch": 0.08300914653459879, + "grad_norm": 0.37995797395706177, + "learning_rate": 0.00018343622870065898, + "loss": 1.4912, + "step": 6388 + }, + { + "epoch": 0.08302214107851466, + "grad_norm": 0.35502561926841736, + "learning_rate": 0.00018343362923874758, + "loss": 1.3799, + "step": 6389 + }, + { + "epoch": 0.08303513562243053, + "grad_norm": 0.2844708561897278, + "learning_rate": 0.0001834310297768362, + "loss": 1.6807, + "step": 6390 + }, + { + "epoch": 0.0830481301663464, + "grad_norm": 0.42832931876182556, + "learning_rate": 0.00018342843031492483, + "loss": 1.2539, + "step": 6391 + }, + { + "epoch": 0.08306112471026228, + "grad_norm": 0.4181867241859436, + "learning_rate": 0.00018342583085301342, + "loss": 1.5233, + "step": 6392 + }, + { + "epoch": 0.08307411925417815, + "grad_norm": 0.4286670684814453, + "learning_rate": 0.00018342323139110205, + "loss": 1.381, + "step": 6393 + }, + { + "epoch": 0.08308711379809403, + "grad_norm": 0.37081727385520935, + "learning_rate": 0.00018342063192919065, + "loss": 1.539, + "step": 6394 + }, + { + "epoch": 0.0831001083420099, + "grad_norm": 0.3459774851799011, + "learning_rate": 0.0001834180324672793, + "loss": 1.2504, + "step": 6395 + }, + { + "epoch": 0.08311310288592577, + "grad_norm": 0.4026981294155121, + "learning_rate": 0.0001834154330053679, + "loss": 1.394, + "step": 6396 + }, + { + "epoch": 0.08312609742984164, + "grad_norm": 0.421860009431839, + "learning_rate": 0.00018341283354345652, + "loss": 1.4205, + "step": 6397 + }, + { + "epoch": 0.08313909197375752, + "grad_norm": 0.4410203993320465, + "learning_rate": 0.00018341023408154512, + "loss": 1.4552, + "step": 6398 + }, + { + "epoch": 0.08315208651767339, + "grad_norm": 0.3262028694152832, + "learning_rate": 0.00018340763461963374, + "loss": 1.4587, + "step": 6399 + }, + { + "epoch": 0.08316508106158926, + "grad_norm": 0.39461982250213623, + "learning_rate": 0.00018340503515772237, + "loss": 1.3985, + "step": 6400 + }, + { + "epoch": 0.08317807560550514, + "grad_norm": 0.308521032333374, + "learning_rate": 0.00018340243569581096, + "loss": 1.4266, + "step": 6401 + }, + { + "epoch": 0.08319107014942101, + "grad_norm": 0.39089030027389526, + "learning_rate": 0.00018339983623389961, + "loss": 1.4905, + "step": 6402 + }, + { + "epoch": 0.08320406469333688, + "grad_norm": 0.478932648897171, + "learning_rate": 0.0001833972367719882, + "loss": 1.6396, + "step": 6403 + }, + { + "epoch": 0.08321705923725276, + "grad_norm": 0.4224386215209961, + "learning_rate": 0.0001833946373100768, + "loss": 1.4816, + "step": 6404 + }, + { + "epoch": 0.08323005378116863, + "grad_norm": 0.38532745838165283, + "learning_rate": 0.00018339203784816543, + "loss": 1.4052, + "step": 6405 + }, + { + "epoch": 0.0832430483250845, + "grad_norm": 0.24512577056884766, + "learning_rate": 0.00018338943838625406, + "loss": 1.1233, + "step": 6406 + }, + { + "epoch": 0.08325604286900037, + "grad_norm": 0.3907359540462494, + "learning_rate": 0.00018338683892434268, + "loss": 1.4548, + "step": 6407 + }, + { + "epoch": 0.08326903741291625, + "grad_norm": 0.4116802513599396, + "learning_rate": 0.00018338423946243128, + "loss": 1.5438, + "step": 6408 + }, + { + "epoch": 0.08328203195683212, + "grad_norm": 0.38361355662345886, + "learning_rate": 0.0001833816400005199, + "loss": 1.7092, + "step": 6409 + }, + { + "epoch": 0.08329502650074799, + "grad_norm": 0.4690987169742584, + "learning_rate": 0.00018337904053860853, + "loss": 1.6206, + "step": 6410 + }, + { + "epoch": 0.08330802104466387, + "grad_norm": 0.43358904123306274, + "learning_rate": 0.00018337644107669713, + "loss": 1.5192, + "step": 6411 + }, + { + "epoch": 0.08332101558857974, + "grad_norm": 0.4032171368598938, + "learning_rate": 0.00018337384161478575, + "loss": 1.4371, + "step": 6412 + }, + { + "epoch": 0.08333401013249561, + "grad_norm": 0.483635276556015, + "learning_rate": 0.00018337124215287435, + "loss": 1.4422, + "step": 6413 + }, + { + "epoch": 0.0833470046764115, + "grad_norm": 0.3206835985183716, + "learning_rate": 0.000183368642690963, + "loss": 1.2842, + "step": 6414 + }, + { + "epoch": 0.08335999922032737, + "grad_norm": 0.4473925530910492, + "learning_rate": 0.0001833660432290516, + "loss": 1.5486, + "step": 6415 + }, + { + "epoch": 0.08337299376424324, + "grad_norm": 0.39831212162971497, + "learning_rate": 0.00018336344376714022, + "loss": 1.4663, + "step": 6416 + }, + { + "epoch": 0.08338598830815912, + "grad_norm": 0.3915517032146454, + "learning_rate": 0.00018336084430522882, + "loss": 1.3953, + "step": 6417 + }, + { + "epoch": 0.08339898285207499, + "grad_norm": 0.37744471430778503, + "learning_rate": 0.00018335824484331744, + "loss": 1.3147, + "step": 6418 + }, + { + "epoch": 0.08341197739599086, + "grad_norm": 0.27386778593063354, + "learning_rate": 0.00018335564538140607, + "loss": 1.4143, + "step": 6419 + }, + { + "epoch": 0.08342497193990674, + "grad_norm": 0.3985297679901123, + "learning_rate": 0.00018335304591949467, + "loss": 1.4344, + "step": 6420 + }, + { + "epoch": 0.08343796648382261, + "grad_norm": 0.35788625478744507, + "learning_rate": 0.0001833504464575833, + "loss": 1.4988, + "step": 6421 + }, + { + "epoch": 0.08345096102773848, + "grad_norm": 0.6142004132270813, + "learning_rate": 0.00018334784699567191, + "loss": 1.554, + "step": 6422 + }, + { + "epoch": 0.08346395557165436, + "grad_norm": 0.4944033920764923, + "learning_rate": 0.0001833452475337605, + "loss": 1.6212, + "step": 6423 + }, + { + "epoch": 0.08347695011557023, + "grad_norm": 0.4056105315685272, + "learning_rate": 0.00018334264807184914, + "loss": 1.581, + "step": 6424 + }, + { + "epoch": 0.0834899446594861, + "grad_norm": 0.3387428820133209, + "learning_rate": 0.00018334004860993773, + "loss": 1.3207, + "step": 6425 + }, + { + "epoch": 0.08350293920340197, + "grad_norm": 0.36850714683532715, + "learning_rate": 0.00018333744914802638, + "loss": 1.4192, + "step": 6426 + }, + { + "epoch": 0.08351593374731785, + "grad_norm": 0.40807291865348816, + "learning_rate": 0.00018333484968611498, + "loss": 1.5072, + "step": 6427 + }, + { + "epoch": 0.08352892829123372, + "grad_norm": 0.3865683078765869, + "learning_rate": 0.0001833322502242036, + "loss": 1.7671, + "step": 6428 + }, + { + "epoch": 0.08354192283514959, + "grad_norm": 0.46008211374282837, + "learning_rate": 0.0001833296507622922, + "loss": 1.3162, + "step": 6429 + }, + { + "epoch": 0.08355491737906547, + "grad_norm": 0.3837754428386688, + "learning_rate": 0.00018332705130038083, + "loss": 1.5041, + "step": 6430 + }, + { + "epoch": 0.08356791192298134, + "grad_norm": 0.43874359130859375, + "learning_rate": 0.00018332445183846945, + "loss": 1.6245, + "step": 6431 + }, + { + "epoch": 0.08358090646689721, + "grad_norm": 0.3807770907878876, + "learning_rate": 0.00018332185237655805, + "loss": 1.4719, + "step": 6432 + }, + { + "epoch": 0.08359390101081308, + "grad_norm": 0.3583511710166931, + "learning_rate": 0.00018331925291464668, + "loss": 1.4884, + "step": 6433 + }, + { + "epoch": 0.08360689555472896, + "grad_norm": 0.3377654254436493, + "learning_rate": 0.0001833166534527353, + "loss": 1.4229, + "step": 6434 + }, + { + "epoch": 0.08361989009864483, + "grad_norm": 0.42893752455711365, + "learning_rate": 0.0001833140539908239, + "loss": 1.4813, + "step": 6435 + }, + { + "epoch": 0.0836328846425607, + "grad_norm": 0.3380570113658905, + "learning_rate": 0.00018331145452891252, + "loss": 1.4832, + "step": 6436 + }, + { + "epoch": 0.08364587918647658, + "grad_norm": 0.4045338034629822, + "learning_rate": 0.00018330885506700115, + "loss": 1.5704, + "step": 6437 + }, + { + "epoch": 0.08365887373039245, + "grad_norm": 0.3734631836414337, + "learning_rate": 0.00018330625560508977, + "loss": 1.3702, + "step": 6438 + }, + { + "epoch": 0.08367186827430832, + "grad_norm": 0.4069790244102478, + "learning_rate": 0.00018330365614317837, + "loss": 1.4478, + "step": 6439 + }, + { + "epoch": 0.0836848628182242, + "grad_norm": 0.370533287525177, + "learning_rate": 0.000183301056681267, + "loss": 1.2567, + "step": 6440 + }, + { + "epoch": 0.08369785736214007, + "grad_norm": 0.4504433274269104, + "learning_rate": 0.00018329845721935562, + "loss": 1.38, + "step": 6441 + }, + { + "epoch": 0.08371085190605594, + "grad_norm": 0.44486233592033386, + "learning_rate": 0.00018329585775744421, + "loss": 1.3701, + "step": 6442 + }, + { + "epoch": 0.08372384644997181, + "grad_norm": 0.39275482296943665, + "learning_rate": 0.00018329325829553284, + "loss": 1.4081, + "step": 6443 + }, + { + "epoch": 0.08373684099388769, + "grad_norm": 0.42928028106689453, + "learning_rate": 0.00018329065883362144, + "loss": 1.4704, + "step": 6444 + }, + { + "epoch": 0.08374983553780356, + "grad_norm": 0.3356246054172516, + "learning_rate": 0.0001832880593717101, + "loss": 1.3732, + "step": 6445 + }, + { + "epoch": 0.08376283008171943, + "grad_norm": 0.3775327205657959, + "learning_rate": 0.00018328545990979868, + "loss": 1.4392, + "step": 6446 + }, + { + "epoch": 0.0837758246256353, + "grad_norm": 0.4768427014350891, + "learning_rate": 0.00018328286044788728, + "loss": 1.4577, + "step": 6447 + }, + { + "epoch": 0.08378881916955118, + "grad_norm": 0.3144374489784241, + "learning_rate": 0.0001832802609859759, + "loss": 1.2489, + "step": 6448 + }, + { + "epoch": 0.08380181371346705, + "grad_norm": 0.41254571080207825, + "learning_rate": 0.00018327766152406453, + "loss": 1.2922, + "step": 6449 + }, + { + "epoch": 0.08381480825738293, + "grad_norm": 0.4477587938308716, + "learning_rate": 0.00018327506206215316, + "loss": 1.476, + "step": 6450 + }, + { + "epoch": 0.0838278028012988, + "grad_norm": 0.4534163177013397, + "learning_rate": 0.00018327246260024175, + "loss": 1.5073, + "step": 6451 + }, + { + "epoch": 0.08384079734521468, + "grad_norm": 0.4524908661842346, + "learning_rate": 0.00018326986313833038, + "loss": 1.6396, + "step": 6452 + }, + { + "epoch": 0.08385379188913056, + "grad_norm": 0.4170534014701843, + "learning_rate": 0.000183267263676419, + "loss": 1.3806, + "step": 6453 + }, + { + "epoch": 0.08386678643304643, + "grad_norm": 0.3801972568035126, + "learning_rate": 0.0001832646642145076, + "loss": 1.3386, + "step": 6454 + }, + { + "epoch": 0.0838797809769623, + "grad_norm": 0.43365636467933655, + "learning_rate": 0.00018326206475259622, + "loss": 1.5673, + "step": 6455 + }, + { + "epoch": 0.08389277552087818, + "grad_norm": 0.3590330183506012, + "learning_rate": 0.00018325946529068482, + "loss": 1.4961, + "step": 6456 + }, + { + "epoch": 0.08390577006479405, + "grad_norm": 0.42626532912254333, + "learning_rate": 0.00018325686582877347, + "loss": 1.449, + "step": 6457 + }, + { + "epoch": 0.08391876460870992, + "grad_norm": 0.45199307799339294, + "learning_rate": 0.00018325426636686207, + "loss": 1.348, + "step": 6458 + }, + { + "epoch": 0.0839317591526258, + "grad_norm": 0.3183334767818451, + "learning_rate": 0.00018325166690495067, + "loss": 1.406, + "step": 6459 + }, + { + "epoch": 0.08394475369654167, + "grad_norm": 0.6126296520233154, + "learning_rate": 0.0001832490674430393, + "loss": 1.4263, + "step": 6460 + }, + { + "epoch": 0.08395774824045754, + "grad_norm": 0.3479015827178955, + "learning_rate": 0.00018324646798112792, + "loss": 1.4934, + "step": 6461 + }, + { + "epoch": 0.08397074278437341, + "grad_norm": 0.4620133936405182, + "learning_rate": 0.00018324386851921654, + "loss": 1.2783, + "step": 6462 + }, + { + "epoch": 0.08398373732828929, + "grad_norm": 0.3367346227169037, + "learning_rate": 0.00018324126905730514, + "loss": 1.3929, + "step": 6463 + }, + { + "epoch": 0.08399673187220516, + "grad_norm": 0.4636990427970886, + "learning_rate": 0.00018323866959539376, + "loss": 1.6359, + "step": 6464 + }, + { + "epoch": 0.08400972641612103, + "grad_norm": 0.39021503925323486, + "learning_rate": 0.0001832360701334824, + "loss": 1.3567, + "step": 6465 + }, + { + "epoch": 0.0840227209600369, + "grad_norm": 0.4318285286426544, + "learning_rate": 0.00018323347067157098, + "loss": 1.4567, + "step": 6466 + }, + { + "epoch": 0.08403571550395278, + "grad_norm": 0.3595968782901764, + "learning_rate": 0.0001832308712096596, + "loss": 1.4315, + "step": 6467 + }, + { + "epoch": 0.08404871004786865, + "grad_norm": 0.3814601004123688, + "learning_rate": 0.0001832282717477482, + "loss": 1.4064, + "step": 6468 + }, + { + "epoch": 0.08406170459178453, + "grad_norm": 0.404971182346344, + "learning_rate": 0.00018322567228583686, + "loss": 1.4376, + "step": 6469 + }, + { + "epoch": 0.0840746991357004, + "grad_norm": 0.3815779685974121, + "learning_rate": 0.00018322307282392546, + "loss": 1.3255, + "step": 6470 + }, + { + "epoch": 0.08408769367961627, + "grad_norm": 0.3937915563583374, + "learning_rate": 0.00018322047336201408, + "loss": 1.293, + "step": 6471 + }, + { + "epoch": 0.08410068822353214, + "grad_norm": 0.43878594040870667, + "learning_rate": 0.00018321787390010268, + "loss": 1.5132, + "step": 6472 + }, + { + "epoch": 0.08411368276744802, + "grad_norm": 0.3743495047092438, + "learning_rate": 0.0001832152744381913, + "loss": 1.3853, + "step": 6473 + }, + { + "epoch": 0.08412667731136389, + "grad_norm": 0.43098658323287964, + "learning_rate": 0.00018321267497627993, + "loss": 1.4325, + "step": 6474 + }, + { + "epoch": 0.08413967185527976, + "grad_norm": 0.43854820728302, + "learning_rate": 0.00018321007551436852, + "loss": 1.5207, + "step": 6475 + }, + { + "epoch": 0.08415266639919564, + "grad_norm": 0.3657275140285492, + "learning_rate": 0.00018320747605245715, + "loss": 1.4217, + "step": 6476 + }, + { + "epoch": 0.08416566094311151, + "grad_norm": 0.38142380118370056, + "learning_rate": 0.00018320487659054577, + "loss": 1.3738, + "step": 6477 + }, + { + "epoch": 0.08417865548702738, + "grad_norm": 0.38894620537757874, + "learning_rate": 0.00018320227712863437, + "loss": 1.4504, + "step": 6478 + }, + { + "epoch": 0.08419165003094325, + "grad_norm": 0.3506476879119873, + "learning_rate": 0.000183199677666723, + "loss": 1.1223, + "step": 6479 + }, + { + "epoch": 0.08420464457485913, + "grad_norm": 0.44102421402931213, + "learning_rate": 0.00018319707820481162, + "loss": 1.4269, + "step": 6480 + }, + { + "epoch": 0.084217639118775, + "grad_norm": 0.4569186568260193, + "learning_rate": 0.00018319447874290024, + "loss": 1.596, + "step": 6481 + }, + { + "epoch": 0.08423063366269087, + "grad_norm": 0.38186657428741455, + "learning_rate": 0.00018319187928098884, + "loss": 1.2877, + "step": 6482 + }, + { + "epoch": 0.08424362820660675, + "grad_norm": 0.45444315671920776, + "learning_rate": 0.00018318927981907747, + "loss": 1.4535, + "step": 6483 + }, + { + "epoch": 0.08425662275052262, + "grad_norm": 0.32623738050460815, + "learning_rate": 0.0001831866803571661, + "loss": 1.582, + "step": 6484 + }, + { + "epoch": 0.08426961729443849, + "grad_norm": 0.3647509813308716, + "learning_rate": 0.0001831840808952547, + "loss": 1.3403, + "step": 6485 + }, + { + "epoch": 0.08428261183835437, + "grad_norm": 0.4334845542907715, + "learning_rate": 0.0001831814814333433, + "loss": 1.6941, + "step": 6486 + }, + { + "epoch": 0.08429560638227024, + "grad_norm": 0.2596302926540375, + "learning_rate": 0.0001831788819714319, + "loss": 1.1878, + "step": 6487 + }, + { + "epoch": 0.08430860092618611, + "grad_norm": 0.3581424355506897, + "learning_rate": 0.00018317628250952053, + "loss": 1.5037, + "step": 6488 + }, + { + "epoch": 0.08432159547010198, + "grad_norm": 0.36386409401893616, + "learning_rate": 0.00018317368304760916, + "loss": 1.7406, + "step": 6489 + }, + { + "epoch": 0.08433459001401787, + "grad_norm": 0.5269473791122437, + "learning_rate": 0.00018317108358569776, + "loss": 1.4095, + "step": 6490 + }, + { + "epoch": 0.08434758455793374, + "grad_norm": 0.36427703499794006, + "learning_rate": 0.00018316848412378638, + "loss": 1.5148, + "step": 6491 + }, + { + "epoch": 0.08436057910184962, + "grad_norm": 0.4502542316913605, + "learning_rate": 0.000183165884661875, + "loss": 1.4436, + "step": 6492 + }, + { + "epoch": 0.08437357364576549, + "grad_norm": 0.41653943061828613, + "learning_rate": 0.00018316328519996363, + "loss": 1.3564, + "step": 6493 + }, + { + "epoch": 0.08438656818968136, + "grad_norm": 0.40431392192840576, + "learning_rate": 0.00018316068573805223, + "loss": 1.4104, + "step": 6494 + }, + { + "epoch": 0.08439956273359724, + "grad_norm": 0.37855595350265503, + "learning_rate": 0.00018315808627614085, + "loss": 1.3603, + "step": 6495 + }, + { + "epoch": 0.08441255727751311, + "grad_norm": 0.37523722648620605, + "learning_rate": 0.00018315548681422948, + "loss": 1.4139, + "step": 6496 + }, + { + "epoch": 0.08442555182142898, + "grad_norm": 0.42329201102256775, + "learning_rate": 0.00018315288735231807, + "loss": 1.5167, + "step": 6497 + }, + { + "epoch": 0.08443854636534485, + "grad_norm": 0.47706955671310425, + "learning_rate": 0.0001831502878904067, + "loss": 1.3451, + "step": 6498 + }, + { + "epoch": 0.08445154090926073, + "grad_norm": 0.37929004430770874, + "learning_rate": 0.0001831476884284953, + "loss": 1.4028, + "step": 6499 + }, + { + "epoch": 0.0844645354531766, + "grad_norm": 0.2957339286804199, + "learning_rate": 0.00018314508896658395, + "loss": 1.5917, + "step": 6500 + }, + { + "epoch": 0.08447752999709247, + "grad_norm": 0.3257904350757599, + "learning_rate": 0.00018314248950467254, + "loss": 1.5287, + "step": 6501 + }, + { + "epoch": 0.08449052454100835, + "grad_norm": 0.3705645203590393, + "learning_rate": 0.00018313989004276114, + "loss": 1.2935, + "step": 6502 + }, + { + "epoch": 0.08450351908492422, + "grad_norm": 0.37605494260787964, + "learning_rate": 0.00018313729058084977, + "loss": 1.4802, + "step": 6503 + }, + { + "epoch": 0.08451651362884009, + "grad_norm": 0.3834712505340576, + "learning_rate": 0.0001831346911189384, + "loss": 1.5491, + "step": 6504 + }, + { + "epoch": 0.08452950817275597, + "grad_norm": 0.31999561190605164, + "learning_rate": 0.00018313209165702701, + "loss": 1.4002, + "step": 6505 + }, + { + "epoch": 0.08454250271667184, + "grad_norm": 0.31522393226623535, + "learning_rate": 0.0001831294921951156, + "loss": 1.392, + "step": 6506 + }, + { + "epoch": 0.08455549726058771, + "grad_norm": 0.4409068524837494, + "learning_rate": 0.00018312689273320424, + "loss": 1.6956, + "step": 6507 + }, + { + "epoch": 0.08456849180450358, + "grad_norm": 0.38314947485923767, + "learning_rate": 0.00018312429327129286, + "loss": 1.3847, + "step": 6508 + }, + { + "epoch": 0.08458148634841946, + "grad_norm": 0.39943504333496094, + "learning_rate": 0.00018312169380938146, + "loss": 1.6174, + "step": 6509 + }, + { + "epoch": 0.08459448089233533, + "grad_norm": 0.32379797101020813, + "learning_rate": 0.00018311909434747008, + "loss": 1.2652, + "step": 6510 + }, + { + "epoch": 0.0846074754362512, + "grad_norm": 0.4151378273963928, + "learning_rate": 0.00018311649488555868, + "loss": 1.1655, + "step": 6511 + }, + { + "epoch": 0.08462046998016708, + "grad_norm": 0.4282408654689789, + "learning_rate": 0.00018311389542364733, + "loss": 1.5631, + "step": 6512 + }, + { + "epoch": 0.08463346452408295, + "grad_norm": 0.4001627266407013, + "learning_rate": 0.00018311129596173593, + "loss": 1.2653, + "step": 6513 + }, + { + "epoch": 0.08464645906799882, + "grad_norm": 0.3585038483142853, + "learning_rate": 0.00018310869649982453, + "loss": 1.4528, + "step": 6514 + }, + { + "epoch": 0.0846594536119147, + "grad_norm": 0.379959374666214, + "learning_rate": 0.00018310609703791318, + "loss": 1.5179, + "step": 6515 + }, + { + "epoch": 0.08467244815583057, + "grad_norm": 0.40313899517059326, + "learning_rate": 0.00018310349757600178, + "loss": 1.4173, + "step": 6516 + }, + { + "epoch": 0.08468544269974644, + "grad_norm": 0.26642388105392456, + "learning_rate": 0.0001831008981140904, + "loss": 1.4209, + "step": 6517 + }, + { + "epoch": 0.08469843724366231, + "grad_norm": 0.41755956411361694, + "learning_rate": 0.000183098298652179, + "loss": 1.3722, + "step": 6518 + }, + { + "epoch": 0.08471143178757819, + "grad_norm": 0.3805711567401886, + "learning_rate": 0.00018309569919026762, + "loss": 1.4338, + "step": 6519 + }, + { + "epoch": 0.08472442633149406, + "grad_norm": 0.35494542121887207, + "learning_rate": 0.00018309309972835625, + "loss": 1.5149, + "step": 6520 + }, + { + "epoch": 0.08473742087540993, + "grad_norm": 0.35381510853767395, + "learning_rate": 0.00018309050026644484, + "loss": 1.633, + "step": 6521 + }, + { + "epoch": 0.0847504154193258, + "grad_norm": 0.4472545385360718, + "learning_rate": 0.00018308790080453347, + "loss": 1.5209, + "step": 6522 + }, + { + "epoch": 0.08476340996324168, + "grad_norm": 0.4020017385482788, + "learning_rate": 0.0001830853013426221, + "loss": 1.404, + "step": 6523 + }, + { + "epoch": 0.08477640450715755, + "grad_norm": 0.5307603478431702, + "learning_rate": 0.00018308270188071072, + "loss": 1.4422, + "step": 6524 + }, + { + "epoch": 0.08478939905107342, + "grad_norm": 0.32418203353881836, + "learning_rate": 0.00018308010241879931, + "loss": 1.528, + "step": 6525 + }, + { + "epoch": 0.0848023935949893, + "grad_norm": 0.47254547476768494, + "learning_rate": 0.0001830775029568879, + "loss": 1.5067, + "step": 6526 + }, + { + "epoch": 0.08481538813890517, + "grad_norm": 0.3759053647518158, + "learning_rate": 0.00018307490349497656, + "loss": 1.4258, + "step": 6527 + }, + { + "epoch": 0.08482838268282106, + "grad_norm": 0.35423743724823, + "learning_rate": 0.00018307230403306516, + "loss": 1.5367, + "step": 6528 + }, + { + "epoch": 0.08484137722673693, + "grad_norm": 0.5136507749557495, + "learning_rate": 0.00018306970457115379, + "loss": 1.4426, + "step": 6529 + }, + { + "epoch": 0.0848543717706528, + "grad_norm": 0.3568829596042633, + "learning_rate": 0.00018306710510924238, + "loss": 1.4597, + "step": 6530 + }, + { + "epoch": 0.08486736631456868, + "grad_norm": 0.41523584723472595, + "learning_rate": 0.000183064505647331, + "loss": 1.527, + "step": 6531 + }, + { + "epoch": 0.08488036085848455, + "grad_norm": 0.33089902997016907, + "learning_rate": 0.00018306190618541963, + "loss": 1.149, + "step": 6532 + }, + { + "epoch": 0.08489335540240042, + "grad_norm": 0.38755321502685547, + "learning_rate": 0.00018305930672350823, + "loss": 1.5139, + "step": 6533 + }, + { + "epoch": 0.0849063499463163, + "grad_norm": 0.3957598805427551, + "learning_rate": 0.00018305670726159685, + "loss": 1.3406, + "step": 6534 + }, + { + "epoch": 0.08491934449023217, + "grad_norm": 0.4124167859554291, + "learning_rate": 0.00018305410779968548, + "loss": 1.5872, + "step": 6535 + }, + { + "epoch": 0.08493233903414804, + "grad_norm": 0.51162189245224, + "learning_rate": 0.0001830515083377741, + "loss": 1.386, + "step": 6536 + }, + { + "epoch": 0.08494533357806391, + "grad_norm": 0.34675490856170654, + "learning_rate": 0.0001830489088758627, + "loss": 1.5843, + "step": 6537 + }, + { + "epoch": 0.08495832812197979, + "grad_norm": 0.38475319743156433, + "learning_rate": 0.00018304630941395132, + "loss": 1.3556, + "step": 6538 + }, + { + "epoch": 0.08497132266589566, + "grad_norm": 0.40646085143089294, + "learning_rate": 0.00018304370995203995, + "loss": 1.3462, + "step": 6539 + }, + { + "epoch": 0.08498431720981153, + "grad_norm": 0.40540021657943726, + "learning_rate": 0.00018304111049012855, + "loss": 1.4431, + "step": 6540 + }, + { + "epoch": 0.0849973117537274, + "grad_norm": 0.4556567370891571, + "learning_rate": 0.00018303851102821717, + "loss": 1.5604, + "step": 6541 + }, + { + "epoch": 0.08501030629764328, + "grad_norm": 0.35327455401420593, + "learning_rate": 0.00018303591156630577, + "loss": 1.2263, + "step": 6542 + }, + { + "epoch": 0.08502330084155915, + "grad_norm": 0.4268738329410553, + "learning_rate": 0.0001830333121043944, + "loss": 1.5877, + "step": 6543 + }, + { + "epoch": 0.08503629538547502, + "grad_norm": 0.43034985661506653, + "learning_rate": 0.00018303071264248302, + "loss": 1.4343, + "step": 6544 + }, + { + "epoch": 0.0850492899293909, + "grad_norm": 0.4247622787952423, + "learning_rate": 0.00018302811318057161, + "loss": 1.4409, + "step": 6545 + }, + { + "epoch": 0.08506228447330677, + "grad_norm": 0.3663536012172699, + "learning_rate": 0.00018302551371866024, + "loss": 1.2992, + "step": 6546 + }, + { + "epoch": 0.08507527901722264, + "grad_norm": 0.35559043288230896, + "learning_rate": 0.00018302291425674886, + "loss": 1.4171, + "step": 6547 + }, + { + "epoch": 0.08508827356113852, + "grad_norm": 0.4016110897064209, + "learning_rate": 0.0001830203147948375, + "loss": 1.2979, + "step": 6548 + }, + { + "epoch": 0.08510126810505439, + "grad_norm": 0.36676278710365295, + "learning_rate": 0.00018301771533292609, + "loss": 1.34, + "step": 6549 + }, + { + "epoch": 0.08511426264897026, + "grad_norm": 0.44288933277130127, + "learning_rate": 0.0001830151158710147, + "loss": 1.5184, + "step": 6550 + }, + { + "epoch": 0.08512725719288614, + "grad_norm": 0.4219822287559509, + "learning_rate": 0.00018301251640910333, + "loss": 1.4674, + "step": 6551 + }, + { + "epoch": 0.08514025173680201, + "grad_norm": 0.3047439754009247, + "learning_rate": 0.00018300991694719193, + "loss": 1.4308, + "step": 6552 + }, + { + "epoch": 0.08515324628071788, + "grad_norm": 0.395720899105072, + "learning_rate": 0.00018300731748528056, + "loss": 1.4747, + "step": 6553 + }, + { + "epoch": 0.08516624082463375, + "grad_norm": 0.3163815140724182, + "learning_rate": 0.00018300471802336918, + "loss": 1.3703, + "step": 6554 + }, + { + "epoch": 0.08517923536854963, + "grad_norm": 0.4249371886253357, + "learning_rate": 0.0001830021185614578, + "loss": 1.6033, + "step": 6555 + }, + { + "epoch": 0.0851922299124655, + "grad_norm": 0.3310534954071045, + "learning_rate": 0.0001829995190995464, + "loss": 1.3846, + "step": 6556 + }, + { + "epoch": 0.08520522445638137, + "grad_norm": 0.3335309326648712, + "learning_rate": 0.000182996919637635, + "loss": 1.3639, + "step": 6557 + }, + { + "epoch": 0.08521821900029725, + "grad_norm": 0.3078988492488861, + "learning_rate": 0.00018299432017572365, + "loss": 1.2313, + "step": 6558 + }, + { + "epoch": 0.08523121354421312, + "grad_norm": 0.4341719150543213, + "learning_rate": 0.00018299172071381225, + "loss": 1.5524, + "step": 6559 + }, + { + "epoch": 0.08524420808812899, + "grad_norm": 0.368182897567749, + "learning_rate": 0.00018298912125190087, + "loss": 1.394, + "step": 6560 + }, + { + "epoch": 0.08525720263204487, + "grad_norm": 0.4454631209373474, + "learning_rate": 0.00018298652178998947, + "loss": 1.287, + "step": 6561 + }, + { + "epoch": 0.08527019717596074, + "grad_norm": 0.3346441090106964, + "learning_rate": 0.0001829839223280781, + "loss": 1.3334, + "step": 6562 + }, + { + "epoch": 0.08528319171987661, + "grad_norm": 0.4472450315952301, + "learning_rate": 0.00018298132286616672, + "loss": 1.5126, + "step": 6563 + }, + { + "epoch": 0.08529618626379248, + "grad_norm": 0.42458298802375793, + "learning_rate": 0.00018297872340425532, + "loss": 1.5703, + "step": 6564 + }, + { + "epoch": 0.08530918080770836, + "grad_norm": 0.41712358593940735, + "learning_rate": 0.00018297612394234394, + "loss": 1.2264, + "step": 6565 + }, + { + "epoch": 0.08532217535162424, + "grad_norm": 0.36192449927330017, + "learning_rate": 0.00018297352448043257, + "loss": 1.4235, + "step": 6566 + }, + { + "epoch": 0.08533516989554012, + "grad_norm": 0.5095565319061279, + "learning_rate": 0.0001829709250185212, + "loss": 1.341, + "step": 6567 + }, + { + "epoch": 0.08534816443945599, + "grad_norm": 0.2955765426158905, + "learning_rate": 0.0001829683255566098, + "loss": 1.2465, + "step": 6568 + }, + { + "epoch": 0.08536115898337186, + "grad_norm": 0.4438478648662567, + "learning_rate": 0.00018296572609469839, + "loss": 1.3453, + "step": 6569 + }, + { + "epoch": 0.08537415352728774, + "grad_norm": 0.42083898186683655, + "learning_rate": 0.00018296312663278704, + "loss": 1.3034, + "step": 6570 + }, + { + "epoch": 0.08538714807120361, + "grad_norm": 0.4801914691925049, + "learning_rate": 0.00018296052717087563, + "loss": 1.7428, + "step": 6571 + }, + { + "epoch": 0.08540014261511948, + "grad_norm": 0.37993961572647095, + "learning_rate": 0.00018295792770896426, + "loss": 1.5196, + "step": 6572 + }, + { + "epoch": 0.08541313715903535, + "grad_norm": 0.4432593286037445, + "learning_rate": 0.00018295532824705286, + "loss": 1.5703, + "step": 6573 + }, + { + "epoch": 0.08542613170295123, + "grad_norm": 0.3644528388977051, + "learning_rate": 0.00018295272878514148, + "loss": 1.3511, + "step": 6574 + }, + { + "epoch": 0.0854391262468671, + "grad_norm": 0.4239424765110016, + "learning_rate": 0.0001829501293232301, + "loss": 1.5522, + "step": 6575 + }, + { + "epoch": 0.08545212079078297, + "grad_norm": 0.3994225263595581, + "learning_rate": 0.0001829475298613187, + "loss": 1.4388, + "step": 6576 + }, + { + "epoch": 0.08546511533469885, + "grad_norm": 0.35156315565109253, + "learning_rate": 0.00018294493039940733, + "loss": 1.6481, + "step": 6577 + }, + { + "epoch": 0.08547810987861472, + "grad_norm": 0.4217776358127594, + "learning_rate": 0.00018294233093749595, + "loss": 1.5105, + "step": 6578 + }, + { + "epoch": 0.08549110442253059, + "grad_norm": 0.44565704464912415, + "learning_rate": 0.00018293973147558458, + "loss": 1.3199, + "step": 6579 + }, + { + "epoch": 0.08550409896644647, + "grad_norm": 0.46253079175949097, + "learning_rate": 0.00018293713201367317, + "loss": 1.5615, + "step": 6580 + }, + { + "epoch": 0.08551709351036234, + "grad_norm": 0.44732171297073364, + "learning_rate": 0.00018293453255176177, + "loss": 1.493, + "step": 6581 + }, + { + "epoch": 0.08553008805427821, + "grad_norm": 0.5240910649299622, + "learning_rate": 0.00018293193308985042, + "loss": 1.6053, + "step": 6582 + }, + { + "epoch": 0.08554308259819408, + "grad_norm": 0.3878539204597473, + "learning_rate": 0.00018292933362793902, + "loss": 1.4246, + "step": 6583 + }, + { + "epoch": 0.08555607714210996, + "grad_norm": 0.40849217772483826, + "learning_rate": 0.00018292673416602764, + "loss": 1.4402, + "step": 6584 + }, + { + "epoch": 0.08556907168602583, + "grad_norm": 0.46559247374534607, + "learning_rate": 0.00018292413470411624, + "loss": 1.4489, + "step": 6585 + }, + { + "epoch": 0.0855820662299417, + "grad_norm": 0.35966387391090393, + "learning_rate": 0.00018292153524220487, + "loss": 1.3184, + "step": 6586 + }, + { + "epoch": 0.08559506077385758, + "grad_norm": 0.36290839314460754, + "learning_rate": 0.0001829189357802935, + "loss": 1.4918, + "step": 6587 + }, + { + "epoch": 0.08560805531777345, + "grad_norm": 0.4738025665283203, + "learning_rate": 0.0001829163363183821, + "loss": 1.6859, + "step": 6588 + }, + { + "epoch": 0.08562104986168932, + "grad_norm": 0.3524314761161804, + "learning_rate": 0.00018291373685647074, + "loss": 1.316, + "step": 6589 + }, + { + "epoch": 0.0856340444056052, + "grad_norm": 0.26684197783470154, + "learning_rate": 0.00018291113739455934, + "loss": 1.3911, + "step": 6590 + }, + { + "epoch": 0.08564703894952107, + "grad_norm": 0.5597006678581238, + "learning_rate": 0.00018290853793264796, + "loss": 1.4495, + "step": 6591 + }, + { + "epoch": 0.08566003349343694, + "grad_norm": 0.4325263202190399, + "learning_rate": 0.00018290593847073656, + "loss": 1.4761, + "step": 6592 + }, + { + "epoch": 0.08567302803735281, + "grad_norm": 0.34806618094444275, + "learning_rate": 0.00018290333900882518, + "loss": 1.2459, + "step": 6593 + }, + { + "epoch": 0.08568602258126869, + "grad_norm": 0.3460070490837097, + "learning_rate": 0.0001829007395469138, + "loss": 1.3498, + "step": 6594 + }, + { + "epoch": 0.08569901712518456, + "grad_norm": 0.3247237205505371, + "learning_rate": 0.0001828981400850024, + "loss": 1.2516, + "step": 6595 + }, + { + "epoch": 0.08571201166910043, + "grad_norm": 0.36009061336517334, + "learning_rate": 0.00018289554062309103, + "loss": 1.4388, + "step": 6596 + }, + { + "epoch": 0.0857250062130163, + "grad_norm": 0.41921988129615784, + "learning_rate": 0.00018289294116117965, + "loss": 1.466, + "step": 6597 + }, + { + "epoch": 0.08573800075693218, + "grad_norm": 0.5115149021148682, + "learning_rate": 0.00018289034169926825, + "loss": 1.651, + "step": 6598 + }, + { + "epoch": 0.08575099530084805, + "grad_norm": 0.3894090950489044, + "learning_rate": 0.00018288774223735688, + "loss": 1.531, + "step": 6599 + }, + { + "epoch": 0.08576398984476392, + "grad_norm": 0.32461825013160706, + "learning_rate": 0.00018288514277544547, + "loss": 1.3326, + "step": 6600 + }, + { + "epoch": 0.0857769843886798, + "grad_norm": 0.3585717976093292, + "learning_rate": 0.00018288254331353412, + "loss": 1.4792, + "step": 6601 + }, + { + "epoch": 0.08578997893259567, + "grad_norm": 0.43605196475982666, + "learning_rate": 0.00018287994385162272, + "loss": 1.4011, + "step": 6602 + }, + { + "epoch": 0.08580297347651154, + "grad_norm": 0.288531094789505, + "learning_rate": 0.00018287734438971135, + "loss": 1.5059, + "step": 6603 + }, + { + "epoch": 0.08581596802042742, + "grad_norm": 0.34259143471717834, + "learning_rate": 0.00018287474492779994, + "loss": 1.2856, + "step": 6604 + }, + { + "epoch": 0.0858289625643433, + "grad_norm": 0.3507554531097412, + "learning_rate": 0.00018287214546588857, + "loss": 1.4691, + "step": 6605 + }, + { + "epoch": 0.08584195710825918, + "grad_norm": 0.32275745272636414, + "learning_rate": 0.0001828695460039772, + "loss": 1.6526, + "step": 6606 + }, + { + "epoch": 0.08585495165217505, + "grad_norm": 0.3872429430484772, + "learning_rate": 0.0001828669465420658, + "loss": 1.2231, + "step": 6607 + }, + { + "epoch": 0.08586794619609092, + "grad_norm": 0.4679528474807739, + "learning_rate": 0.00018286434708015441, + "loss": 1.4568, + "step": 6608 + }, + { + "epoch": 0.0858809407400068, + "grad_norm": 0.4792744815349579, + "learning_rate": 0.00018286174761824304, + "loss": 1.4876, + "step": 6609 + }, + { + "epoch": 0.08589393528392267, + "grad_norm": 0.45394331216812134, + "learning_rate": 0.00018285914815633164, + "loss": 1.4684, + "step": 6610 + }, + { + "epoch": 0.08590692982783854, + "grad_norm": 0.31654372811317444, + "learning_rate": 0.00018285654869442026, + "loss": 1.4346, + "step": 6611 + }, + { + "epoch": 0.08591992437175441, + "grad_norm": 0.34356799721717834, + "learning_rate": 0.00018285394923250886, + "loss": 1.4469, + "step": 6612 + }, + { + "epoch": 0.08593291891567029, + "grad_norm": 0.3233030140399933, + "learning_rate": 0.0001828513497705975, + "loss": 1.276, + "step": 6613 + }, + { + "epoch": 0.08594591345958616, + "grad_norm": 0.4390173554420471, + "learning_rate": 0.0001828487503086861, + "loss": 1.5669, + "step": 6614 + }, + { + "epoch": 0.08595890800350203, + "grad_norm": 0.3441517651081085, + "learning_rate": 0.00018284615084677473, + "loss": 1.2979, + "step": 6615 + }, + { + "epoch": 0.0859719025474179, + "grad_norm": 0.32941415905952454, + "learning_rate": 0.00018284355138486333, + "loss": 1.3249, + "step": 6616 + }, + { + "epoch": 0.08598489709133378, + "grad_norm": 0.40751445293426514, + "learning_rate": 0.00018284095192295195, + "loss": 1.1955, + "step": 6617 + }, + { + "epoch": 0.08599789163524965, + "grad_norm": 0.27998894453048706, + "learning_rate": 0.00018283835246104058, + "loss": 1.4467, + "step": 6618 + }, + { + "epoch": 0.08601088617916552, + "grad_norm": 0.5176675319671631, + "learning_rate": 0.00018283575299912918, + "loss": 1.568, + "step": 6619 + }, + { + "epoch": 0.0860238807230814, + "grad_norm": 0.4798547923564911, + "learning_rate": 0.0001828331535372178, + "loss": 1.5515, + "step": 6620 + }, + { + "epoch": 0.08603687526699727, + "grad_norm": 0.40374356508255005, + "learning_rate": 0.00018283055407530642, + "loss": 1.4655, + "step": 6621 + }, + { + "epoch": 0.08604986981091314, + "grad_norm": 0.3490309715270996, + "learning_rate": 0.00018282795461339505, + "loss": 1.4753, + "step": 6622 + }, + { + "epoch": 0.08606286435482902, + "grad_norm": 0.5119740962982178, + "learning_rate": 0.00018282535515148365, + "loss": 1.5082, + "step": 6623 + }, + { + "epoch": 0.08607585889874489, + "grad_norm": 0.459770143032074, + "learning_rate": 0.00018282275568957227, + "loss": 1.3987, + "step": 6624 + }, + { + "epoch": 0.08608885344266076, + "grad_norm": 0.2700651288032532, + "learning_rate": 0.0001828201562276609, + "loss": 1.5007, + "step": 6625 + }, + { + "epoch": 0.08610184798657664, + "grad_norm": 0.2813178598880768, + "learning_rate": 0.0001828175567657495, + "loss": 1.2854, + "step": 6626 + }, + { + "epoch": 0.08611484253049251, + "grad_norm": 0.3808160126209259, + "learning_rate": 0.00018281495730383812, + "loss": 1.4859, + "step": 6627 + }, + { + "epoch": 0.08612783707440838, + "grad_norm": 0.33384811878204346, + "learning_rate": 0.00018281235784192674, + "loss": 1.611, + "step": 6628 + }, + { + "epoch": 0.08614083161832425, + "grad_norm": 0.3770199716091156, + "learning_rate": 0.00018280975838001534, + "loss": 1.5353, + "step": 6629 + }, + { + "epoch": 0.08615382616224013, + "grad_norm": 0.35864925384521484, + "learning_rate": 0.00018280715891810396, + "loss": 1.3598, + "step": 6630 + }, + { + "epoch": 0.086166820706156, + "grad_norm": 0.4076600670814514, + "learning_rate": 0.00018280455945619256, + "loss": 1.6174, + "step": 6631 + }, + { + "epoch": 0.08617981525007187, + "grad_norm": 0.42534613609313965, + "learning_rate": 0.0001828019599942812, + "loss": 1.3148, + "step": 6632 + }, + { + "epoch": 0.08619280979398775, + "grad_norm": 0.4146581292152405, + "learning_rate": 0.0001827993605323698, + "loss": 1.4736, + "step": 6633 + }, + { + "epoch": 0.08620580433790362, + "grad_norm": 0.3903135657310486, + "learning_rate": 0.00018279676107045843, + "loss": 1.3434, + "step": 6634 + }, + { + "epoch": 0.08621879888181949, + "grad_norm": 0.3914014399051666, + "learning_rate": 0.00018279416160854703, + "loss": 1.3335, + "step": 6635 + }, + { + "epoch": 0.08623179342573536, + "grad_norm": 0.3830832839012146, + "learning_rate": 0.00018279156214663566, + "loss": 1.2612, + "step": 6636 + }, + { + "epoch": 0.08624478796965124, + "grad_norm": 0.34652161598205566, + "learning_rate": 0.00018278896268472428, + "loss": 1.3799, + "step": 6637 + }, + { + "epoch": 0.08625778251356711, + "grad_norm": 0.4538786709308624, + "learning_rate": 0.00018278636322281288, + "loss": 1.4622, + "step": 6638 + }, + { + "epoch": 0.08627077705748298, + "grad_norm": 0.341753214597702, + "learning_rate": 0.0001827837637609015, + "loss": 1.3347, + "step": 6639 + }, + { + "epoch": 0.08628377160139886, + "grad_norm": 0.4236353635787964, + "learning_rate": 0.00018278116429899013, + "loss": 1.4453, + "step": 6640 + }, + { + "epoch": 0.08629676614531473, + "grad_norm": 0.3563282787799835, + "learning_rate": 0.00018277856483707872, + "loss": 1.3889, + "step": 6641 + }, + { + "epoch": 0.0863097606892306, + "grad_norm": 0.4225514829158783, + "learning_rate": 0.00018277596537516735, + "loss": 1.312, + "step": 6642 + }, + { + "epoch": 0.08632275523314649, + "grad_norm": 0.5219054222106934, + "learning_rate": 0.00018277336591325595, + "loss": 1.5966, + "step": 6643 + }, + { + "epoch": 0.08633574977706236, + "grad_norm": 0.4198419451713562, + "learning_rate": 0.0001827707664513446, + "loss": 1.5836, + "step": 6644 + }, + { + "epoch": 0.08634874432097824, + "grad_norm": 0.4462706744670868, + "learning_rate": 0.0001827681669894332, + "loss": 1.3672, + "step": 6645 + }, + { + "epoch": 0.08636173886489411, + "grad_norm": 0.46152985095977783, + "learning_rate": 0.00018276556752752182, + "loss": 1.157, + "step": 6646 + }, + { + "epoch": 0.08637473340880998, + "grad_norm": 0.3734128773212433, + "learning_rate": 0.00018276296806561042, + "loss": 1.4892, + "step": 6647 + }, + { + "epoch": 0.08638772795272585, + "grad_norm": 0.3328731656074524, + "learning_rate": 0.00018276036860369904, + "loss": 1.4599, + "step": 6648 + }, + { + "epoch": 0.08640072249664173, + "grad_norm": 0.4166589379310608, + "learning_rate": 0.00018275776914178767, + "loss": 1.3006, + "step": 6649 + }, + { + "epoch": 0.0864137170405576, + "grad_norm": 0.3915296494960785, + "learning_rate": 0.00018275516967987626, + "loss": 1.5149, + "step": 6650 + }, + { + "epoch": 0.08642671158447347, + "grad_norm": 0.4036220610141754, + "learning_rate": 0.0001827525702179649, + "loss": 1.3709, + "step": 6651 + }, + { + "epoch": 0.08643970612838935, + "grad_norm": 0.38509851694107056, + "learning_rate": 0.0001827499707560535, + "loss": 1.2501, + "step": 6652 + }, + { + "epoch": 0.08645270067230522, + "grad_norm": 0.3693358600139618, + "learning_rate": 0.0001827473712941421, + "loss": 1.6177, + "step": 6653 + }, + { + "epoch": 0.08646569521622109, + "grad_norm": 0.4955030083656311, + "learning_rate": 0.00018274477183223073, + "loss": 1.523, + "step": 6654 + }, + { + "epoch": 0.08647868976013696, + "grad_norm": 0.40535542368888855, + "learning_rate": 0.00018274217237031933, + "loss": 1.3908, + "step": 6655 + }, + { + "epoch": 0.08649168430405284, + "grad_norm": 0.3674229681491852, + "learning_rate": 0.00018273957290840798, + "loss": 1.3266, + "step": 6656 + }, + { + "epoch": 0.08650467884796871, + "grad_norm": 0.3828207850456238, + "learning_rate": 0.00018273697344649658, + "loss": 1.5555, + "step": 6657 + }, + { + "epoch": 0.08651767339188458, + "grad_norm": 0.4368997812271118, + "learning_rate": 0.0001827343739845852, + "loss": 1.3704, + "step": 6658 + }, + { + "epoch": 0.08653066793580046, + "grad_norm": 0.3853335380554199, + "learning_rate": 0.0001827317745226738, + "loss": 1.1551, + "step": 6659 + }, + { + "epoch": 0.08654366247971633, + "grad_norm": 0.37162187695503235, + "learning_rate": 0.00018272917506076243, + "loss": 1.3298, + "step": 6660 + }, + { + "epoch": 0.0865566570236322, + "grad_norm": 0.4518080949783325, + "learning_rate": 0.00018272657559885105, + "loss": 1.3398, + "step": 6661 + }, + { + "epoch": 0.08656965156754808, + "grad_norm": 0.3125367760658264, + "learning_rate": 0.00018272397613693965, + "loss": 1.3816, + "step": 6662 + }, + { + "epoch": 0.08658264611146395, + "grad_norm": 0.3811508119106293, + "learning_rate": 0.0001827213766750283, + "loss": 1.321, + "step": 6663 + }, + { + "epoch": 0.08659564065537982, + "grad_norm": 0.47511252760887146, + "learning_rate": 0.0001827187772131169, + "loss": 1.4614, + "step": 6664 + }, + { + "epoch": 0.0866086351992957, + "grad_norm": 0.41604506969451904, + "learning_rate": 0.0001827161777512055, + "loss": 1.5482, + "step": 6665 + }, + { + "epoch": 0.08662162974321157, + "grad_norm": 0.41973546147346497, + "learning_rate": 0.00018271357828929412, + "loss": 1.5492, + "step": 6666 + }, + { + "epoch": 0.08663462428712744, + "grad_norm": 0.43922296166419983, + "learning_rate": 0.00018271097882738274, + "loss": 1.5224, + "step": 6667 + }, + { + "epoch": 0.08664761883104331, + "grad_norm": 0.3093338906764984, + "learning_rate": 0.00018270837936547137, + "loss": 1.3158, + "step": 6668 + }, + { + "epoch": 0.08666061337495919, + "grad_norm": 0.37729695439338684, + "learning_rate": 0.00018270577990355997, + "loss": 1.5179, + "step": 6669 + }, + { + "epoch": 0.08667360791887506, + "grad_norm": 0.40877071022987366, + "learning_rate": 0.0001827031804416486, + "loss": 1.3835, + "step": 6670 + }, + { + "epoch": 0.08668660246279093, + "grad_norm": 0.3912234604358673, + "learning_rate": 0.00018270058097973722, + "loss": 1.4194, + "step": 6671 + }, + { + "epoch": 0.0866995970067068, + "grad_norm": 0.481755793094635, + "learning_rate": 0.0001826979815178258, + "loss": 1.4019, + "step": 6672 + }, + { + "epoch": 0.08671259155062268, + "grad_norm": 0.3227919638156891, + "learning_rate": 0.00018269538205591444, + "loss": 1.3923, + "step": 6673 + }, + { + "epoch": 0.08672558609453855, + "grad_norm": 0.33725088834762573, + "learning_rate": 0.00018269278259400303, + "loss": 1.4424, + "step": 6674 + }, + { + "epoch": 0.08673858063845442, + "grad_norm": 0.36098626255989075, + "learning_rate": 0.00018269018313209169, + "loss": 1.4174, + "step": 6675 + }, + { + "epoch": 0.0867515751823703, + "grad_norm": 0.40109825134277344, + "learning_rate": 0.00018268758367018028, + "loss": 1.5953, + "step": 6676 + }, + { + "epoch": 0.08676456972628617, + "grad_norm": 0.4429379999637604, + "learning_rate": 0.0001826849842082689, + "loss": 1.5633, + "step": 6677 + }, + { + "epoch": 0.08677756427020204, + "grad_norm": 0.3582462668418884, + "learning_rate": 0.0001826823847463575, + "loss": 1.5346, + "step": 6678 + }, + { + "epoch": 0.08679055881411792, + "grad_norm": 0.35185185074806213, + "learning_rate": 0.00018267978528444613, + "loss": 1.2513, + "step": 6679 + }, + { + "epoch": 0.08680355335803379, + "grad_norm": 0.40533891320228577, + "learning_rate": 0.00018267718582253475, + "loss": 1.4454, + "step": 6680 + }, + { + "epoch": 0.08681654790194968, + "grad_norm": 0.3830435574054718, + "learning_rate": 0.00018267458636062335, + "loss": 1.4817, + "step": 6681 + }, + { + "epoch": 0.08682954244586555, + "grad_norm": 0.4009658694267273, + "learning_rate": 0.00018267198689871198, + "loss": 1.4674, + "step": 6682 + }, + { + "epoch": 0.08684253698978142, + "grad_norm": 0.44393613934516907, + "learning_rate": 0.0001826693874368006, + "loss": 1.4079, + "step": 6683 + }, + { + "epoch": 0.0868555315336973, + "grad_norm": 0.4655272662639618, + "learning_rate": 0.0001826667879748892, + "loss": 1.4485, + "step": 6684 + }, + { + "epoch": 0.08686852607761317, + "grad_norm": 0.5212623476982117, + "learning_rate": 0.00018266418851297782, + "loss": 1.3869, + "step": 6685 + }, + { + "epoch": 0.08688152062152904, + "grad_norm": 0.46332216262817383, + "learning_rate": 0.00018266158905106642, + "loss": 1.5046, + "step": 6686 + }, + { + "epoch": 0.08689451516544491, + "grad_norm": 0.45600855350494385, + "learning_rate": 0.00018265898958915507, + "loss": 1.5166, + "step": 6687 + }, + { + "epoch": 0.08690750970936079, + "grad_norm": 0.4539586901664734, + "learning_rate": 0.00018265639012724367, + "loss": 1.5981, + "step": 6688 + }, + { + "epoch": 0.08692050425327666, + "grad_norm": 0.38734015822410583, + "learning_rate": 0.0001826537906653323, + "loss": 1.3194, + "step": 6689 + }, + { + "epoch": 0.08693349879719253, + "grad_norm": 0.38938552141189575, + "learning_rate": 0.0001826511912034209, + "loss": 1.4787, + "step": 6690 + }, + { + "epoch": 0.0869464933411084, + "grad_norm": 0.4348485767841339, + "learning_rate": 0.00018264859174150952, + "loss": 1.3383, + "step": 6691 + }, + { + "epoch": 0.08695948788502428, + "grad_norm": 0.3537873327732086, + "learning_rate": 0.00018264599227959814, + "loss": 1.4022, + "step": 6692 + }, + { + "epoch": 0.08697248242894015, + "grad_norm": 0.4237958490848541, + "learning_rate": 0.00018264339281768674, + "loss": 1.4953, + "step": 6693 + }, + { + "epoch": 0.08698547697285602, + "grad_norm": 0.39175277948379517, + "learning_rate": 0.00018264079335577536, + "loss": 1.402, + "step": 6694 + }, + { + "epoch": 0.0869984715167719, + "grad_norm": 0.40237271785736084, + "learning_rate": 0.00018263819389386399, + "loss": 1.2712, + "step": 6695 + }, + { + "epoch": 0.08701146606068777, + "grad_norm": 0.42883434891700745, + "learning_rate": 0.00018263559443195258, + "loss": 1.5904, + "step": 6696 + }, + { + "epoch": 0.08702446060460364, + "grad_norm": 0.28929761052131653, + "learning_rate": 0.0001826329949700412, + "loss": 1.3678, + "step": 6697 + }, + { + "epoch": 0.08703745514851952, + "grad_norm": 0.3892543911933899, + "learning_rate": 0.00018263039550812983, + "loss": 1.5108, + "step": 6698 + }, + { + "epoch": 0.08705044969243539, + "grad_norm": 0.41149577498435974, + "learning_rate": 0.00018262779604621846, + "loss": 1.4441, + "step": 6699 + }, + { + "epoch": 0.08706344423635126, + "grad_norm": 0.49987760186195374, + "learning_rate": 0.00018262519658430705, + "loss": 1.4826, + "step": 6700 + }, + { + "epoch": 0.08707643878026713, + "grad_norm": 0.3278210163116455, + "learning_rate": 0.00018262259712239568, + "loss": 1.5166, + "step": 6701 + }, + { + "epoch": 0.08708943332418301, + "grad_norm": 0.5012274980545044, + "learning_rate": 0.0001826199976604843, + "loss": 1.4114, + "step": 6702 + }, + { + "epoch": 0.08710242786809888, + "grad_norm": 0.37377458810806274, + "learning_rate": 0.0001826173981985729, + "loss": 1.4066, + "step": 6703 + }, + { + "epoch": 0.08711542241201475, + "grad_norm": 0.3846884071826935, + "learning_rate": 0.00018261479873666153, + "loss": 1.4282, + "step": 6704 + }, + { + "epoch": 0.08712841695593063, + "grad_norm": 0.48730963468551636, + "learning_rate": 0.00018261219927475012, + "loss": 1.4026, + "step": 6705 + }, + { + "epoch": 0.0871414114998465, + "grad_norm": 0.4235099256038666, + "learning_rate": 0.00018260959981283877, + "loss": 1.6324, + "step": 6706 + }, + { + "epoch": 0.08715440604376237, + "grad_norm": 0.42631927132606506, + "learning_rate": 0.00018260700035092737, + "loss": 1.618, + "step": 6707 + }, + { + "epoch": 0.08716740058767825, + "grad_norm": 0.38488584756851196, + "learning_rate": 0.00018260440088901597, + "loss": 1.4508, + "step": 6708 + }, + { + "epoch": 0.08718039513159412, + "grad_norm": 0.3621501326560974, + "learning_rate": 0.0001826018014271046, + "loss": 1.436, + "step": 6709 + }, + { + "epoch": 0.08719338967550999, + "grad_norm": 0.41321900486946106, + "learning_rate": 0.00018259920196519322, + "loss": 1.4415, + "step": 6710 + }, + { + "epoch": 0.08720638421942586, + "grad_norm": 0.366578608751297, + "learning_rate": 0.00018259660250328184, + "loss": 1.4146, + "step": 6711 + }, + { + "epoch": 0.08721937876334174, + "grad_norm": 0.4188486933708191, + "learning_rate": 0.00018259400304137044, + "loss": 1.6776, + "step": 6712 + }, + { + "epoch": 0.08723237330725761, + "grad_norm": 0.3642802834510803, + "learning_rate": 0.00018259140357945906, + "loss": 1.4507, + "step": 6713 + }, + { + "epoch": 0.08724536785117348, + "grad_norm": 0.38941413164138794, + "learning_rate": 0.0001825888041175477, + "loss": 1.301, + "step": 6714 + }, + { + "epoch": 0.08725836239508936, + "grad_norm": 0.47522616386413574, + "learning_rate": 0.00018258620465563629, + "loss": 1.3847, + "step": 6715 + }, + { + "epoch": 0.08727135693900523, + "grad_norm": 0.32577309012413025, + "learning_rate": 0.0001825836051937249, + "loss": 1.4071, + "step": 6716 + }, + { + "epoch": 0.0872843514829211, + "grad_norm": 0.31356143951416016, + "learning_rate": 0.0001825810057318135, + "loss": 1.3019, + "step": 6717 + }, + { + "epoch": 0.08729734602683697, + "grad_norm": 0.38095274567604065, + "learning_rate": 0.00018257840626990216, + "loss": 1.3849, + "step": 6718 + }, + { + "epoch": 0.08731034057075286, + "grad_norm": 0.4623626470565796, + "learning_rate": 0.00018257580680799076, + "loss": 1.6373, + "step": 6719 + }, + { + "epoch": 0.08732333511466873, + "grad_norm": 0.33871403336524963, + "learning_rate": 0.00018257320734607935, + "loss": 1.4132, + "step": 6720 + }, + { + "epoch": 0.08733632965858461, + "grad_norm": 0.35408031940460205, + "learning_rate": 0.00018257060788416798, + "loss": 1.4725, + "step": 6721 + }, + { + "epoch": 0.08734932420250048, + "grad_norm": 0.4097619354724884, + "learning_rate": 0.0001825680084222566, + "loss": 1.5569, + "step": 6722 + }, + { + "epoch": 0.08736231874641635, + "grad_norm": 0.45207032561302185, + "learning_rate": 0.00018256540896034523, + "loss": 1.5888, + "step": 6723 + }, + { + "epoch": 0.08737531329033223, + "grad_norm": 0.4350290894508362, + "learning_rate": 0.00018256280949843382, + "loss": 1.3835, + "step": 6724 + }, + { + "epoch": 0.0873883078342481, + "grad_norm": 0.43973809480667114, + "learning_rate": 0.00018256021003652245, + "loss": 1.3678, + "step": 6725 + }, + { + "epoch": 0.08740130237816397, + "grad_norm": 0.44525089859962463, + "learning_rate": 0.00018255761057461107, + "loss": 1.5277, + "step": 6726 + }, + { + "epoch": 0.08741429692207985, + "grad_norm": 0.3814772069454193, + "learning_rate": 0.00018255501111269967, + "loss": 1.2925, + "step": 6727 + }, + { + "epoch": 0.08742729146599572, + "grad_norm": 0.42106109857559204, + "learning_rate": 0.0001825524116507883, + "loss": 1.2959, + "step": 6728 + }, + { + "epoch": 0.08744028600991159, + "grad_norm": 0.4496477246284485, + "learning_rate": 0.0001825498121888769, + "loss": 1.5256, + "step": 6729 + }, + { + "epoch": 0.08745328055382746, + "grad_norm": 0.3612123429775238, + "learning_rate": 0.00018254721272696554, + "loss": 1.2477, + "step": 6730 + }, + { + "epoch": 0.08746627509774334, + "grad_norm": 0.33892276883125305, + "learning_rate": 0.00018254461326505414, + "loss": 1.5615, + "step": 6731 + }, + { + "epoch": 0.08747926964165921, + "grad_norm": 0.40976324677467346, + "learning_rate": 0.00018254201380314274, + "loss": 1.6113, + "step": 6732 + }, + { + "epoch": 0.08749226418557508, + "grad_norm": 0.4102124273777008, + "learning_rate": 0.00018253941434123136, + "loss": 1.4434, + "step": 6733 + }, + { + "epoch": 0.08750525872949096, + "grad_norm": 0.4042928218841553, + "learning_rate": 0.00018253681487932, + "loss": 1.6459, + "step": 6734 + }, + { + "epoch": 0.08751825327340683, + "grad_norm": 0.32147514820098877, + "learning_rate": 0.0001825342154174086, + "loss": 1.4155, + "step": 6735 + }, + { + "epoch": 0.0875312478173227, + "grad_norm": 0.4061089754104614, + "learning_rate": 0.0001825316159554972, + "loss": 1.2141, + "step": 6736 + }, + { + "epoch": 0.08754424236123858, + "grad_norm": 0.37023404240608215, + "learning_rate": 0.00018252901649358583, + "loss": 1.4745, + "step": 6737 + }, + { + "epoch": 0.08755723690515445, + "grad_norm": 0.38748273253440857, + "learning_rate": 0.00018252641703167446, + "loss": 1.41, + "step": 6738 + }, + { + "epoch": 0.08757023144907032, + "grad_norm": 0.44647854566574097, + "learning_rate": 0.00018252381756976306, + "loss": 1.2981, + "step": 6739 + }, + { + "epoch": 0.0875832259929862, + "grad_norm": 0.3964136838912964, + "learning_rate": 0.00018252121810785168, + "loss": 1.2843, + "step": 6740 + }, + { + "epoch": 0.08759622053690207, + "grad_norm": 0.4006388187408447, + "learning_rate": 0.0001825186186459403, + "loss": 1.6067, + "step": 6741 + }, + { + "epoch": 0.08760921508081794, + "grad_norm": 0.40677353739738464, + "learning_rate": 0.00018251601918402893, + "loss": 1.5652, + "step": 6742 + }, + { + "epoch": 0.08762220962473381, + "grad_norm": 0.36035212874412537, + "learning_rate": 0.00018251341972211753, + "loss": 1.3936, + "step": 6743 + }, + { + "epoch": 0.08763520416864969, + "grad_norm": 0.36891618371009827, + "learning_rate": 0.00018251082026020615, + "loss": 1.3743, + "step": 6744 + }, + { + "epoch": 0.08764819871256556, + "grad_norm": 0.4133085310459137, + "learning_rate": 0.00018250822079829478, + "loss": 1.4251, + "step": 6745 + }, + { + "epoch": 0.08766119325648143, + "grad_norm": 0.3932144343852997, + "learning_rate": 0.00018250562133638337, + "loss": 1.3792, + "step": 6746 + }, + { + "epoch": 0.0876741878003973, + "grad_norm": 0.3970622718334198, + "learning_rate": 0.000182503021874472, + "loss": 1.292, + "step": 6747 + }, + { + "epoch": 0.08768718234431318, + "grad_norm": 0.3572845757007599, + "learning_rate": 0.0001825004224125606, + "loss": 1.3365, + "step": 6748 + }, + { + "epoch": 0.08770017688822905, + "grad_norm": 0.3556772768497467, + "learning_rate": 0.00018249782295064922, + "loss": 1.5918, + "step": 6749 + }, + { + "epoch": 0.08771317143214492, + "grad_norm": 0.363410621881485, + "learning_rate": 0.00018249522348873784, + "loss": 1.3931, + "step": 6750 + }, + { + "epoch": 0.0877261659760608, + "grad_norm": 0.41273099184036255, + "learning_rate": 0.00018249262402682644, + "loss": 1.2968, + "step": 6751 + }, + { + "epoch": 0.08773916051997667, + "grad_norm": 0.3791990876197815, + "learning_rate": 0.00018249002456491507, + "loss": 1.4417, + "step": 6752 + }, + { + "epoch": 0.08775215506389254, + "grad_norm": 0.48342934250831604, + "learning_rate": 0.0001824874251030037, + "loss": 1.5098, + "step": 6753 + }, + { + "epoch": 0.08776514960780842, + "grad_norm": 0.5464855432510376, + "learning_rate": 0.00018248482564109232, + "loss": 1.5592, + "step": 6754 + }, + { + "epoch": 0.08777814415172429, + "grad_norm": 0.35458904504776, + "learning_rate": 0.0001824822261791809, + "loss": 1.5243, + "step": 6755 + }, + { + "epoch": 0.08779113869564016, + "grad_norm": 0.4396544098854065, + "learning_rate": 0.00018247962671726954, + "loss": 1.4359, + "step": 6756 + }, + { + "epoch": 0.08780413323955605, + "grad_norm": 0.39417317509651184, + "learning_rate": 0.00018247702725535816, + "loss": 1.4294, + "step": 6757 + }, + { + "epoch": 0.08781712778347192, + "grad_norm": 0.355384886264801, + "learning_rate": 0.00018247442779344676, + "loss": 1.3781, + "step": 6758 + }, + { + "epoch": 0.0878301223273878, + "grad_norm": 0.35375678539276123, + "learning_rate": 0.00018247182833153538, + "loss": 1.4783, + "step": 6759 + }, + { + "epoch": 0.08784311687130367, + "grad_norm": 0.3800255358219147, + "learning_rate": 0.00018246922886962398, + "loss": 1.5268, + "step": 6760 + }, + { + "epoch": 0.08785611141521954, + "grad_norm": 0.44282281398773193, + "learning_rate": 0.00018246662940771263, + "loss": 1.3767, + "step": 6761 + }, + { + "epoch": 0.08786910595913541, + "grad_norm": 0.3774675726890564, + "learning_rate": 0.00018246402994580123, + "loss": 1.3743, + "step": 6762 + }, + { + "epoch": 0.08788210050305129, + "grad_norm": 0.36646127700805664, + "learning_rate": 0.00018246143048388983, + "loss": 1.5078, + "step": 6763 + }, + { + "epoch": 0.08789509504696716, + "grad_norm": 0.3378525972366333, + "learning_rate": 0.00018245883102197845, + "loss": 1.3718, + "step": 6764 + }, + { + "epoch": 0.08790808959088303, + "grad_norm": 0.38771650195121765, + "learning_rate": 0.00018245623156006708, + "loss": 1.3849, + "step": 6765 + }, + { + "epoch": 0.0879210841347989, + "grad_norm": 0.3553890883922577, + "learning_rate": 0.0001824536320981557, + "loss": 1.4497, + "step": 6766 + }, + { + "epoch": 0.08793407867871478, + "grad_norm": 0.342769980430603, + "learning_rate": 0.0001824510326362443, + "loss": 1.2222, + "step": 6767 + }, + { + "epoch": 0.08794707322263065, + "grad_norm": 0.39237168431282043, + "learning_rate": 0.00018244843317433292, + "loss": 1.1589, + "step": 6768 + }, + { + "epoch": 0.08796006776654652, + "grad_norm": 0.4822896718978882, + "learning_rate": 0.00018244583371242155, + "loss": 1.6161, + "step": 6769 + }, + { + "epoch": 0.0879730623104624, + "grad_norm": 0.40048688650131226, + "learning_rate": 0.00018244323425051014, + "loss": 1.3556, + "step": 6770 + }, + { + "epoch": 0.08798605685437827, + "grad_norm": 0.4437680244445801, + "learning_rate": 0.00018244063478859877, + "loss": 1.5057, + "step": 6771 + }, + { + "epoch": 0.08799905139829414, + "grad_norm": 0.2981235682964325, + "learning_rate": 0.0001824380353266874, + "loss": 1.2565, + "step": 6772 + }, + { + "epoch": 0.08801204594221002, + "grad_norm": 0.33592531085014343, + "learning_rate": 0.00018243543586477602, + "loss": 1.4356, + "step": 6773 + }, + { + "epoch": 0.08802504048612589, + "grad_norm": 0.3112647831439972, + "learning_rate": 0.00018243283640286462, + "loss": 1.4179, + "step": 6774 + }, + { + "epoch": 0.08803803503004176, + "grad_norm": 0.48134511709213257, + "learning_rate": 0.0001824302369409532, + "loss": 1.65, + "step": 6775 + }, + { + "epoch": 0.08805102957395763, + "grad_norm": 0.3852820098400116, + "learning_rate": 0.00018242763747904186, + "loss": 1.5032, + "step": 6776 + }, + { + "epoch": 0.08806402411787351, + "grad_norm": 0.4001849293708801, + "learning_rate": 0.00018242503801713046, + "loss": 1.5401, + "step": 6777 + }, + { + "epoch": 0.08807701866178938, + "grad_norm": 0.3402118980884552, + "learning_rate": 0.00018242243855521909, + "loss": 1.4296, + "step": 6778 + }, + { + "epoch": 0.08809001320570525, + "grad_norm": 0.412910133600235, + "learning_rate": 0.00018241983909330768, + "loss": 1.4326, + "step": 6779 + }, + { + "epoch": 0.08810300774962113, + "grad_norm": 0.4577445983886719, + "learning_rate": 0.0001824172396313963, + "loss": 1.4972, + "step": 6780 + }, + { + "epoch": 0.088116002293537, + "grad_norm": 0.31557148694992065, + "learning_rate": 0.00018241464016948493, + "loss": 1.3197, + "step": 6781 + }, + { + "epoch": 0.08812899683745287, + "grad_norm": 0.3885459899902344, + "learning_rate": 0.00018241204070757353, + "loss": 1.7714, + "step": 6782 + }, + { + "epoch": 0.08814199138136875, + "grad_norm": 0.4835717976093292, + "learning_rate": 0.00018240944124566215, + "loss": 1.4426, + "step": 6783 + }, + { + "epoch": 0.08815498592528462, + "grad_norm": 0.31692183017730713, + "learning_rate": 0.00018240684178375078, + "loss": 1.3472, + "step": 6784 + }, + { + "epoch": 0.08816798046920049, + "grad_norm": 0.31482797861099243, + "learning_rate": 0.0001824042423218394, + "loss": 1.31, + "step": 6785 + }, + { + "epoch": 0.08818097501311636, + "grad_norm": 0.4475362300872803, + "learning_rate": 0.000182401642859928, + "loss": 1.5553, + "step": 6786 + }, + { + "epoch": 0.08819396955703224, + "grad_norm": 0.2428712546825409, + "learning_rate": 0.0001823990433980166, + "loss": 1.1972, + "step": 6787 + }, + { + "epoch": 0.08820696410094811, + "grad_norm": 0.3964831829071045, + "learning_rate": 0.00018239644393610525, + "loss": 1.4751, + "step": 6788 + }, + { + "epoch": 0.08821995864486398, + "grad_norm": 0.4768216609954834, + "learning_rate": 0.00018239384447419385, + "loss": 1.6285, + "step": 6789 + }, + { + "epoch": 0.08823295318877986, + "grad_norm": 0.43892496824264526, + "learning_rate": 0.00018239124501228247, + "loss": 1.5025, + "step": 6790 + }, + { + "epoch": 0.08824594773269573, + "grad_norm": 0.3282887935638428, + "learning_rate": 0.00018238864555037107, + "loss": 1.3379, + "step": 6791 + }, + { + "epoch": 0.0882589422766116, + "grad_norm": 0.4435223937034607, + "learning_rate": 0.0001823860460884597, + "loss": 1.7546, + "step": 6792 + }, + { + "epoch": 0.08827193682052747, + "grad_norm": 0.4093644320964813, + "learning_rate": 0.00018238344662654832, + "loss": 1.3932, + "step": 6793 + }, + { + "epoch": 0.08828493136444335, + "grad_norm": 0.424146831035614, + "learning_rate": 0.00018238084716463692, + "loss": 1.6236, + "step": 6794 + }, + { + "epoch": 0.08829792590835923, + "grad_norm": 0.5087403059005737, + "learning_rate": 0.00018237824770272554, + "loss": 1.4988, + "step": 6795 + }, + { + "epoch": 0.08831092045227511, + "grad_norm": 0.34016329050064087, + "learning_rate": 0.00018237564824081416, + "loss": 1.3984, + "step": 6796 + }, + { + "epoch": 0.08832391499619098, + "grad_norm": 0.307283878326416, + "learning_rate": 0.0001823730487789028, + "loss": 1.5193, + "step": 6797 + }, + { + "epoch": 0.08833690954010685, + "grad_norm": 0.3525094985961914, + "learning_rate": 0.00018237044931699139, + "loss": 1.3198, + "step": 6798 + }, + { + "epoch": 0.08834990408402273, + "grad_norm": 0.3240290582180023, + "learning_rate": 0.00018236784985508, + "loss": 1.533, + "step": 6799 + }, + { + "epoch": 0.0883628986279386, + "grad_norm": 0.30903682112693787, + "learning_rate": 0.00018236525039316864, + "loss": 1.2742, + "step": 6800 + }, + { + "epoch": 0.08837589317185447, + "grad_norm": 0.3859502971172333, + "learning_rate": 0.00018236265093125723, + "loss": 1.3709, + "step": 6801 + }, + { + "epoch": 0.08838888771577035, + "grad_norm": 0.3207455575466156, + "learning_rate": 0.00018236005146934586, + "loss": 1.2321, + "step": 6802 + }, + { + "epoch": 0.08840188225968622, + "grad_norm": 0.4492981433868408, + "learning_rate": 0.00018235745200743445, + "loss": 1.6, + "step": 6803 + }, + { + "epoch": 0.08841487680360209, + "grad_norm": 0.4954877197742462, + "learning_rate": 0.00018235485254552308, + "loss": 1.4338, + "step": 6804 + }, + { + "epoch": 0.08842787134751796, + "grad_norm": 0.4125249981880188, + "learning_rate": 0.0001823522530836117, + "loss": 1.3939, + "step": 6805 + }, + { + "epoch": 0.08844086589143384, + "grad_norm": 0.38339829444885254, + "learning_rate": 0.0001823496536217003, + "loss": 1.649, + "step": 6806 + }, + { + "epoch": 0.08845386043534971, + "grad_norm": 0.4332326054573059, + "learning_rate": 0.00018234705415978893, + "loss": 1.3569, + "step": 6807 + }, + { + "epoch": 0.08846685497926558, + "grad_norm": 0.3514329493045807, + "learning_rate": 0.00018234445469787755, + "loss": 1.4779, + "step": 6808 + }, + { + "epoch": 0.08847984952318146, + "grad_norm": 0.39225804805755615, + "learning_rate": 0.00018234185523596617, + "loss": 1.4586, + "step": 6809 + }, + { + "epoch": 0.08849284406709733, + "grad_norm": 0.4566524028778076, + "learning_rate": 0.00018233925577405477, + "loss": 1.4375, + "step": 6810 + }, + { + "epoch": 0.0885058386110132, + "grad_norm": 0.38308635354042053, + "learning_rate": 0.0001823366563121434, + "loss": 1.4494, + "step": 6811 + }, + { + "epoch": 0.08851883315492907, + "grad_norm": 0.32838448882102966, + "learning_rate": 0.00018233405685023202, + "loss": 1.4249, + "step": 6812 + }, + { + "epoch": 0.08853182769884495, + "grad_norm": 0.36618572473526, + "learning_rate": 0.00018233145738832062, + "loss": 1.3167, + "step": 6813 + }, + { + "epoch": 0.08854482224276082, + "grad_norm": 0.45102056860923767, + "learning_rate": 0.00018232885792640924, + "loss": 1.4872, + "step": 6814 + }, + { + "epoch": 0.0885578167866767, + "grad_norm": 0.3948284089565277, + "learning_rate": 0.00018232625846449787, + "loss": 1.4123, + "step": 6815 + }, + { + "epoch": 0.08857081133059257, + "grad_norm": 0.4315451979637146, + "learning_rate": 0.00018232365900258646, + "loss": 1.3051, + "step": 6816 + }, + { + "epoch": 0.08858380587450844, + "grad_norm": 0.4110200107097626, + "learning_rate": 0.0001823210595406751, + "loss": 1.416, + "step": 6817 + }, + { + "epoch": 0.08859680041842431, + "grad_norm": 0.3555764853954315, + "learning_rate": 0.00018231846007876369, + "loss": 1.2121, + "step": 6818 + }, + { + "epoch": 0.08860979496234019, + "grad_norm": 0.4137984812259674, + "learning_rate": 0.00018231586061685234, + "loss": 1.375, + "step": 6819 + }, + { + "epoch": 0.08862278950625606, + "grad_norm": 0.39686208963394165, + "learning_rate": 0.00018231326115494094, + "loss": 1.4284, + "step": 6820 + }, + { + "epoch": 0.08863578405017193, + "grad_norm": 0.3907967805862427, + "learning_rate": 0.00018231066169302956, + "loss": 1.4825, + "step": 6821 + }, + { + "epoch": 0.0886487785940878, + "grad_norm": 0.3236677348613739, + "learning_rate": 0.00018230806223111816, + "loss": 1.351, + "step": 6822 + }, + { + "epoch": 0.08866177313800368, + "grad_norm": 0.3468323349952698, + "learning_rate": 0.00018230546276920678, + "loss": 1.3833, + "step": 6823 + }, + { + "epoch": 0.08867476768191955, + "grad_norm": 0.3934495449066162, + "learning_rate": 0.0001823028633072954, + "loss": 1.4241, + "step": 6824 + }, + { + "epoch": 0.08868776222583542, + "grad_norm": 0.49817419052124023, + "learning_rate": 0.000182300263845384, + "loss": 1.5475, + "step": 6825 + }, + { + "epoch": 0.0887007567697513, + "grad_norm": 0.43562814593315125, + "learning_rate": 0.00018229766438347263, + "loss": 1.5361, + "step": 6826 + }, + { + "epoch": 0.08871375131366717, + "grad_norm": 0.42393261194229126, + "learning_rate": 0.00018229506492156125, + "loss": 1.4752, + "step": 6827 + }, + { + "epoch": 0.08872674585758304, + "grad_norm": 0.44226598739624023, + "learning_rate": 0.00018229246545964988, + "loss": 1.5798, + "step": 6828 + }, + { + "epoch": 0.08873974040149891, + "grad_norm": 0.7142444849014282, + "learning_rate": 0.00018228986599773847, + "loss": 1.7151, + "step": 6829 + }, + { + "epoch": 0.08875273494541479, + "grad_norm": 0.42160525918006897, + "learning_rate": 0.00018228726653582707, + "loss": 1.4608, + "step": 6830 + }, + { + "epoch": 0.08876572948933066, + "grad_norm": 0.3964773118495941, + "learning_rate": 0.00018228466707391572, + "loss": 1.5238, + "step": 6831 + }, + { + "epoch": 0.08877872403324653, + "grad_norm": 0.4013957679271698, + "learning_rate": 0.00018228206761200432, + "loss": 1.569, + "step": 6832 + }, + { + "epoch": 0.08879171857716242, + "grad_norm": 0.32581770420074463, + "learning_rate": 0.00018227946815009295, + "loss": 1.5966, + "step": 6833 + }, + { + "epoch": 0.0888047131210783, + "grad_norm": 0.39206647872924805, + "learning_rate": 0.00018227686868818154, + "loss": 1.3957, + "step": 6834 + }, + { + "epoch": 0.08881770766499417, + "grad_norm": 0.4365825951099396, + "learning_rate": 0.00018227426922627017, + "loss": 1.4571, + "step": 6835 + }, + { + "epoch": 0.08883070220891004, + "grad_norm": 0.31897208094596863, + "learning_rate": 0.0001822716697643588, + "loss": 1.2062, + "step": 6836 + }, + { + "epoch": 0.08884369675282591, + "grad_norm": 0.3728909492492676, + "learning_rate": 0.0001822690703024474, + "loss": 1.4331, + "step": 6837 + }, + { + "epoch": 0.08885669129674179, + "grad_norm": 0.3852331340312958, + "learning_rate": 0.000182266470840536, + "loss": 1.421, + "step": 6838 + }, + { + "epoch": 0.08886968584065766, + "grad_norm": 0.4345470368862152, + "learning_rate": 0.00018226387137862464, + "loss": 1.6635, + "step": 6839 + }, + { + "epoch": 0.08888268038457353, + "grad_norm": 0.3771885335445404, + "learning_rate": 0.00018226127191671326, + "loss": 1.4296, + "step": 6840 + }, + { + "epoch": 0.0888956749284894, + "grad_norm": 0.31102120876312256, + "learning_rate": 0.00018225867245480186, + "loss": 1.3999, + "step": 6841 + }, + { + "epoch": 0.08890866947240528, + "grad_norm": 0.3644242584705353, + "learning_rate": 0.00018225607299289046, + "loss": 1.5674, + "step": 6842 + }, + { + "epoch": 0.08892166401632115, + "grad_norm": 0.38540053367614746, + "learning_rate": 0.0001822534735309791, + "loss": 1.5188, + "step": 6843 + }, + { + "epoch": 0.08893465856023702, + "grad_norm": 0.43788740038871765, + "learning_rate": 0.0001822508740690677, + "loss": 1.3869, + "step": 6844 + }, + { + "epoch": 0.0889476531041529, + "grad_norm": 0.3545567989349365, + "learning_rate": 0.00018224827460715633, + "loss": 1.3809, + "step": 6845 + }, + { + "epoch": 0.08896064764806877, + "grad_norm": 0.2941083610057831, + "learning_rate": 0.00018224567514524495, + "loss": 1.4417, + "step": 6846 + }, + { + "epoch": 0.08897364219198464, + "grad_norm": 0.3817788064479828, + "learning_rate": 0.00018224307568333355, + "loss": 1.5529, + "step": 6847 + }, + { + "epoch": 0.08898663673590052, + "grad_norm": 0.4662015438079834, + "learning_rate": 0.00018224047622142218, + "loss": 1.328, + "step": 6848 + }, + { + "epoch": 0.08899963127981639, + "grad_norm": 0.37792739272117615, + "learning_rate": 0.00018223787675951077, + "loss": 1.3555, + "step": 6849 + }, + { + "epoch": 0.08901262582373226, + "grad_norm": 0.43304669857025146, + "learning_rate": 0.00018223527729759943, + "loss": 1.4969, + "step": 6850 + }, + { + "epoch": 0.08902562036764813, + "grad_norm": 0.3964652717113495, + "learning_rate": 0.00018223267783568802, + "loss": 1.429, + "step": 6851 + }, + { + "epoch": 0.089038614911564, + "grad_norm": 0.35465994477272034, + "learning_rate": 0.00018223007837377665, + "loss": 1.3504, + "step": 6852 + }, + { + "epoch": 0.08905160945547988, + "grad_norm": 0.4410751163959503, + "learning_rate": 0.00018222747891186525, + "loss": 1.4947, + "step": 6853 + }, + { + "epoch": 0.08906460399939575, + "grad_norm": 0.3674108684062958, + "learning_rate": 0.00018222487944995387, + "loss": 1.3659, + "step": 6854 + }, + { + "epoch": 0.08907759854331163, + "grad_norm": 0.49862414598464966, + "learning_rate": 0.0001822222799880425, + "loss": 1.3169, + "step": 6855 + }, + { + "epoch": 0.0890905930872275, + "grad_norm": 0.4159758985042572, + "learning_rate": 0.0001822196805261311, + "loss": 1.1606, + "step": 6856 + }, + { + "epoch": 0.08910358763114337, + "grad_norm": 0.3795209228992462, + "learning_rate": 0.00018221708106421972, + "loss": 1.3068, + "step": 6857 + }, + { + "epoch": 0.08911658217505924, + "grad_norm": 0.4399988353252411, + "learning_rate": 0.00018221448160230834, + "loss": 1.3575, + "step": 6858 + }, + { + "epoch": 0.08912957671897512, + "grad_norm": 0.3988720774650574, + "learning_rate": 0.00018221188214039694, + "loss": 1.3137, + "step": 6859 + }, + { + "epoch": 0.08914257126289099, + "grad_norm": 0.3311937153339386, + "learning_rate": 0.00018220928267848556, + "loss": 1.3278, + "step": 6860 + }, + { + "epoch": 0.08915556580680686, + "grad_norm": 0.38437026739120483, + "learning_rate": 0.00018220668321657416, + "loss": 1.3375, + "step": 6861 + }, + { + "epoch": 0.08916856035072274, + "grad_norm": 0.4377189874649048, + "learning_rate": 0.0001822040837546628, + "loss": 1.5218, + "step": 6862 + }, + { + "epoch": 0.08918155489463861, + "grad_norm": 0.47967737913131714, + "learning_rate": 0.0001822014842927514, + "loss": 1.4893, + "step": 6863 + }, + { + "epoch": 0.08919454943855448, + "grad_norm": 0.4246910512447357, + "learning_rate": 0.00018219888483084003, + "loss": 1.5762, + "step": 6864 + }, + { + "epoch": 0.08920754398247036, + "grad_norm": 0.4137103259563446, + "learning_rate": 0.00018219628536892863, + "loss": 1.4194, + "step": 6865 + }, + { + "epoch": 0.08922053852638623, + "grad_norm": 0.24333456158638, + "learning_rate": 0.00018219368590701725, + "loss": 1.2215, + "step": 6866 + }, + { + "epoch": 0.0892335330703021, + "grad_norm": 0.3583102524280548, + "learning_rate": 0.00018219108644510588, + "loss": 1.5534, + "step": 6867 + }, + { + "epoch": 0.08924652761421797, + "grad_norm": 0.45948895812034607, + "learning_rate": 0.00018218848698319448, + "loss": 1.3769, + "step": 6868 + }, + { + "epoch": 0.08925952215813385, + "grad_norm": 0.34668073058128357, + "learning_rate": 0.0001821858875212831, + "loss": 1.5531, + "step": 6869 + }, + { + "epoch": 0.08927251670204972, + "grad_norm": 0.4705469608306885, + "learning_rate": 0.00018218328805937173, + "loss": 1.5348, + "step": 6870 + }, + { + "epoch": 0.08928551124596561, + "grad_norm": 0.29723280668258667, + "learning_rate": 0.00018218068859746032, + "loss": 1.3343, + "step": 6871 + }, + { + "epoch": 0.08929850578988148, + "grad_norm": 0.4309476613998413, + "learning_rate": 0.00018217808913554895, + "loss": 1.4278, + "step": 6872 + }, + { + "epoch": 0.08931150033379735, + "grad_norm": 0.4949832856655121, + "learning_rate": 0.00018217548967363754, + "loss": 1.5741, + "step": 6873 + }, + { + "epoch": 0.08932449487771323, + "grad_norm": 0.4408215284347534, + "learning_rate": 0.0001821728902117262, + "loss": 1.4931, + "step": 6874 + }, + { + "epoch": 0.0893374894216291, + "grad_norm": 0.49203070998191833, + "learning_rate": 0.0001821702907498148, + "loss": 1.5534, + "step": 6875 + }, + { + "epoch": 0.08935048396554497, + "grad_norm": 0.5074947476387024, + "learning_rate": 0.00018216769128790342, + "loss": 1.6048, + "step": 6876 + }, + { + "epoch": 0.08936347850946084, + "grad_norm": 0.4955956041812897, + "learning_rate": 0.00018216509182599202, + "loss": 1.3616, + "step": 6877 + }, + { + "epoch": 0.08937647305337672, + "grad_norm": 0.4054872393608093, + "learning_rate": 0.00018216249236408064, + "loss": 1.2805, + "step": 6878 + }, + { + "epoch": 0.08938946759729259, + "grad_norm": 0.3785209059715271, + "learning_rate": 0.00018215989290216926, + "loss": 1.4633, + "step": 6879 + }, + { + "epoch": 0.08940246214120846, + "grad_norm": 0.3802170157432556, + "learning_rate": 0.00018215729344025786, + "loss": 1.4995, + "step": 6880 + }, + { + "epoch": 0.08941545668512434, + "grad_norm": 0.39720088243484497, + "learning_rate": 0.0001821546939783465, + "loss": 1.5033, + "step": 6881 + }, + { + "epoch": 0.08942845122904021, + "grad_norm": 0.42358294129371643, + "learning_rate": 0.0001821520945164351, + "loss": 1.3757, + "step": 6882 + }, + { + "epoch": 0.08944144577295608, + "grad_norm": 0.3463008403778076, + "learning_rate": 0.00018214949505452374, + "loss": 1.1762, + "step": 6883 + }, + { + "epoch": 0.08945444031687196, + "grad_norm": 0.48838555812835693, + "learning_rate": 0.00018214689559261233, + "loss": 1.6427, + "step": 6884 + }, + { + "epoch": 0.08946743486078783, + "grad_norm": 0.3624010980129242, + "learning_rate": 0.00018214429613070096, + "loss": 1.6316, + "step": 6885 + }, + { + "epoch": 0.0894804294047037, + "grad_norm": 0.36835822463035583, + "learning_rate": 0.00018214169666878958, + "loss": 1.3811, + "step": 6886 + }, + { + "epoch": 0.08949342394861957, + "grad_norm": 0.36487331986427307, + "learning_rate": 0.00018213909720687818, + "loss": 1.2815, + "step": 6887 + }, + { + "epoch": 0.08950641849253545, + "grad_norm": 0.3315284848213196, + "learning_rate": 0.0001821364977449668, + "loss": 1.4272, + "step": 6888 + }, + { + "epoch": 0.08951941303645132, + "grad_norm": 0.3720794916152954, + "learning_rate": 0.00018213389828305543, + "loss": 1.5472, + "step": 6889 + }, + { + "epoch": 0.0895324075803672, + "grad_norm": 0.4438225328922272, + "learning_rate": 0.00018213129882114403, + "loss": 1.6675, + "step": 6890 + }, + { + "epoch": 0.08954540212428307, + "grad_norm": 0.43497613072395325, + "learning_rate": 0.00018212869935923265, + "loss": 1.4811, + "step": 6891 + }, + { + "epoch": 0.08955839666819894, + "grad_norm": 0.375722199678421, + "learning_rate": 0.00018212609989732125, + "loss": 1.5981, + "step": 6892 + }, + { + "epoch": 0.08957139121211481, + "grad_norm": 0.35424378514289856, + "learning_rate": 0.0001821235004354099, + "loss": 1.4233, + "step": 6893 + }, + { + "epoch": 0.08958438575603068, + "grad_norm": 0.4285990297794342, + "learning_rate": 0.0001821209009734985, + "loss": 1.4388, + "step": 6894 + }, + { + "epoch": 0.08959738029994656, + "grad_norm": 0.3616275489330292, + "learning_rate": 0.00018211830151158712, + "loss": 1.3027, + "step": 6895 + }, + { + "epoch": 0.08961037484386243, + "grad_norm": 0.42806243896484375, + "learning_rate": 0.00018211570204967572, + "loss": 1.5357, + "step": 6896 + }, + { + "epoch": 0.0896233693877783, + "grad_norm": 0.424429714679718, + "learning_rate": 0.00018211310258776434, + "loss": 1.5203, + "step": 6897 + }, + { + "epoch": 0.08963636393169418, + "grad_norm": 0.45211121439933777, + "learning_rate": 0.00018211050312585297, + "loss": 1.35, + "step": 6898 + }, + { + "epoch": 0.08964935847561005, + "grad_norm": 0.4621683955192566, + "learning_rate": 0.00018210790366394156, + "loss": 1.3788, + "step": 6899 + }, + { + "epoch": 0.08966235301952592, + "grad_norm": 0.34341955184936523, + "learning_rate": 0.0001821053042020302, + "loss": 1.4968, + "step": 6900 + }, + { + "epoch": 0.0896753475634418, + "grad_norm": 0.5100769996643066, + "learning_rate": 0.00018210270474011881, + "loss": 1.5317, + "step": 6901 + }, + { + "epoch": 0.08968834210735767, + "grad_norm": 0.4372366964817047, + "learning_rate": 0.0001821001052782074, + "loss": 1.5806, + "step": 6902 + }, + { + "epoch": 0.08970133665127354, + "grad_norm": 0.4510011672973633, + "learning_rate": 0.00018209750581629604, + "loss": 1.5886, + "step": 6903 + }, + { + "epoch": 0.08971433119518941, + "grad_norm": 0.5962079167366028, + "learning_rate": 0.00018209490635438463, + "loss": 1.5202, + "step": 6904 + }, + { + "epoch": 0.08972732573910529, + "grad_norm": 0.38910624384880066, + "learning_rate": 0.00018209230689247328, + "loss": 1.3992, + "step": 6905 + }, + { + "epoch": 0.08974032028302116, + "grad_norm": 0.3711726665496826, + "learning_rate": 0.00018208970743056188, + "loss": 1.4978, + "step": 6906 + }, + { + "epoch": 0.08975331482693703, + "grad_norm": 0.39889922738075256, + "learning_rate": 0.0001820871079686505, + "loss": 1.4703, + "step": 6907 + }, + { + "epoch": 0.0897663093708529, + "grad_norm": 0.35066187381744385, + "learning_rate": 0.0001820845085067391, + "loss": 1.5049, + "step": 6908 + }, + { + "epoch": 0.0897793039147688, + "grad_norm": 0.33957386016845703, + "learning_rate": 0.00018208190904482773, + "loss": 1.2951, + "step": 6909 + }, + { + "epoch": 0.08979229845868467, + "grad_norm": 0.3135322630405426, + "learning_rate": 0.00018207930958291635, + "loss": 1.6078, + "step": 6910 + }, + { + "epoch": 0.08980529300260054, + "grad_norm": 0.3385043144226074, + "learning_rate": 0.00018207671012100495, + "loss": 1.1911, + "step": 6911 + }, + { + "epoch": 0.08981828754651641, + "grad_norm": 0.4538031816482544, + "learning_rate": 0.00018207411065909357, + "loss": 1.7088, + "step": 6912 + }, + { + "epoch": 0.08983128209043229, + "grad_norm": 0.41059112548828125, + "learning_rate": 0.0001820715111971822, + "loss": 1.3147, + "step": 6913 + }, + { + "epoch": 0.08984427663434816, + "grad_norm": 0.3995553255081177, + "learning_rate": 0.0001820689117352708, + "loss": 1.4009, + "step": 6914 + }, + { + "epoch": 0.08985727117826403, + "grad_norm": 0.34051433205604553, + "learning_rate": 0.00018206631227335942, + "loss": 1.3919, + "step": 6915 + }, + { + "epoch": 0.0898702657221799, + "grad_norm": 0.4123254418373108, + "learning_rate": 0.00018206371281144802, + "loss": 1.5242, + "step": 6916 + }, + { + "epoch": 0.08988326026609578, + "grad_norm": 0.3846631944179535, + "learning_rate": 0.00018206111334953667, + "loss": 1.5889, + "step": 6917 + }, + { + "epoch": 0.08989625481001165, + "grad_norm": 0.4947319030761719, + "learning_rate": 0.00018205851388762527, + "loss": 1.5242, + "step": 6918 + }, + { + "epoch": 0.08990924935392752, + "grad_norm": 0.3631800711154938, + "learning_rate": 0.0001820559144257139, + "loss": 1.2717, + "step": 6919 + }, + { + "epoch": 0.0899222438978434, + "grad_norm": 0.4155273735523224, + "learning_rate": 0.00018205331496380252, + "loss": 1.5665, + "step": 6920 + }, + { + "epoch": 0.08993523844175927, + "grad_norm": 0.3829532265663147, + "learning_rate": 0.00018205071550189111, + "loss": 1.4345, + "step": 6921 + }, + { + "epoch": 0.08994823298567514, + "grad_norm": 0.3948131799697876, + "learning_rate": 0.00018204811603997974, + "loss": 1.448, + "step": 6922 + }, + { + "epoch": 0.08996122752959101, + "grad_norm": 0.3429071307182312, + "learning_rate": 0.00018204551657806834, + "loss": 1.4406, + "step": 6923 + }, + { + "epoch": 0.08997422207350689, + "grad_norm": 0.324846476316452, + "learning_rate": 0.000182042917116157, + "loss": 1.356, + "step": 6924 + }, + { + "epoch": 0.08998721661742276, + "grad_norm": 0.4990135133266449, + "learning_rate": 0.00018204031765424558, + "loss": 1.4544, + "step": 6925 + }, + { + "epoch": 0.09000021116133863, + "grad_norm": 0.39653822779655457, + "learning_rate": 0.00018203771819233418, + "loss": 1.5492, + "step": 6926 + }, + { + "epoch": 0.0900132057052545, + "grad_norm": 0.45440101623535156, + "learning_rate": 0.0001820351187304228, + "loss": 1.4791, + "step": 6927 + }, + { + "epoch": 0.09002620024917038, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.00018203251926851143, + "loss": 1.6066, + "step": 6928 + }, + { + "epoch": 0.09003919479308625, + "grad_norm": 0.32898858189582825, + "learning_rate": 0.00018202991980660006, + "loss": 1.3568, + "step": 6929 + }, + { + "epoch": 0.09005218933700213, + "grad_norm": 0.4969126284122467, + "learning_rate": 0.00018202732034468865, + "loss": 1.6048, + "step": 6930 + }, + { + "epoch": 0.090065183880918, + "grad_norm": 0.43364131450653076, + "learning_rate": 0.00018202472088277728, + "loss": 1.345, + "step": 6931 + }, + { + "epoch": 0.09007817842483387, + "grad_norm": 0.42342495918273926, + "learning_rate": 0.0001820221214208659, + "loss": 1.5351, + "step": 6932 + }, + { + "epoch": 0.09009117296874974, + "grad_norm": 0.33170658349990845, + "learning_rate": 0.0001820195219589545, + "loss": 1.4517, + "step": 6933 + }, + { + "epoch": 0.09010416751266562, + "grad_norm": 0.3703378438949585, + "learning_rate": 0.00018201692249704312, + "loss": 1.5418, + "step": 6934 + }, + { + "epoch": 0.09011716205658149, + "grad_norm": 0.32989075779914856, + "learning_rate": 0.00018201432303513172, + "loss": 1.2628, + "step": 6935 + }, + { + "epoch": 0.09013015660049736, + "grad_norm": 0.3939376473426819, + "learning_rate": 0.00018201172357322037, + "loss": 1.4046, + "step": 6936 + }, + { + "epoch": 0.09014315114441324, + "grad_norm": 0.46229249238967896, + "learning_rate": 0.00018200912411130897, + "loss": 1.5823, + "step": 6937 + }, + { + "epoch": 0.09015614568832911, + "grad_norm": 0.36480844020843506, + "learning_rate": 0.00018200652464939757, + "loss": 1.3736, + "step": 6938 + }, + { + "epoch": 0.09016914023224498, + "grad_norm": 0.4342983663082123, + "learning_rate": 0.0001820039251874862, + "loss": 1.4448, + "step": 6939 + }, + { + "epoch": 0.09018213477616085, + "grad_norm": 0.4324263036251068, + "learning_rate": 0.00018200132572557482, + "loss": 1.5807, + "step": 6940 + }, + { + "epoch": 0.09019512932007673, + "grad_norm": 0.3215867877006531, + "learning_rate": 0.00018199872626366344, + "loss": 1.445, + "step": 6941 + }, + { + "epoch": 0.0902081238639926, + "grad_norm": 0.34165892004966736, + "learning_rate": 0.00018199612680175204, + "loss": 1.4501, + "step": 6942 + }, + { + "epoch": 0.09022111840790847, + "grad_norm": 0.41335347294807434, + "learning_rate": 0.00018199352733984066, + "loss": 1.5691, + "step": 6943 + }, + { + "epoch": 0.09023411295182435, + "grad_norm": 0.4693707525730133, + "learning_rate": 0.0001819909278779293, + "loss": 1.3992, + "step": 6944 + }, + { + "epoch": 0.09024710749574022, + "grad_norm": 0.453662633895874, + "learning_rate": 0.00018198832841601788, + "loss": 1.6101, + "step": 6945 + }, + { + "epoch": 0.09026010203965609, + "grad_norm": 0.3513772189617157, + "learning_rate": 0.0001819857289541065, + "loss": 1.3074, + "step": 6946 + }, + { + "epoch": 0.09027309658357198, + "grad_norm": 0.4163765609264374, + "learning_rate": 0.0001819831294921951, + "loss": 1.5037, + "step": 6947 + }, + { + "epoch": 0.09028609112748785, + "grad_norm": 0.44242116808891296, + "learning_rate": 0.00018198053003028376, + "loss": 1.5549, + "step": 6948 + }, + { + "epoch": 0.09029908567140373, + "grad_norm": 0.43407416343688965, + "learning_rate": 0.00018197793056837236, + "loss": 1.7314, + "step": 6949 + }, + { + "epoch": 0.0903120802153196, + "grad_norm": 0.3851221799850464, + "learning_rate": 0.00018197533110646098, + "loss": 1.4527, + "step": 6950 + }, + { + "epoch": 0.09032507475923547, + "grad_norm": 0.2697647511959076, + "learning_rate": 0.00018197273164454958, + "loss": 1.2771, + "step": 6951 + }, + { + "epoch": 0.09033806930315134, + "grad_norm": 0.36101463437080383, + "learning_rate": 0.0001819701321826382, + "loss": 1.3333, + "step": 6952 + }, + { + "epoch": 0.09035106384706722, + "grad_norm": 0.4358506202697754, + "learning_rate": 0.00018196753272072683, + "loss": 1.5135, + "step": 6953 + }, + { + "epoch": 0.09036405839098309, + "grad_norm": 0.45592454075813293, + "learning_rate": 0.00018196493325881542, + "loss": 1.5169, + "step": 6954 + }, + { + "epoch": 0.09037705293489896, + "grad_norm": 0.4009954631328583, + "learning_rate": 0.00018196233379690405, + "loss": 1.426, + "step": 6955 + }, + { + "epoch": 0.09039004747881484, + "grad_norm": 0.3629184365272522, + "learning_rate": 0.00018195973433499267, + "loss": 1.453, + "step": 6956 + }, + { + "epoch": 0.09040304202273071, + "grad_norm": 0.3912285566329956, + "learning_rate": 0.00018195713487308127, + "loss": 1.3439, + "step": 6957 + }, + { + "epoch": 0.09041603656664658, + "grad_norm": 0.44960349798202515, + "learning_rate": 0.0001819545354111699, + "loss": 1.4849, + "step": 6958 + }, + { + "epoch": 0.09042903111056246, + "grad_norm": 0.41558364033699036, + "learning_rate": 0.00018195193594925852, + "loss": 1.4932, + "step": 6959 + }, + { + "epoch": 0.09044202565447833, + "grad_norm": 0.3597593307495117, + "learning_rate": 0.00018194933648734714, + "loss": 1.3039, + "step": 6960 + }, + { + "epoch": 0.0904550201983942, + "grad_norm": 0.40801116824150085, + "learning_rate": 0.00018194673702543574, + "loss": 1.4394, + "step": 6961 + }, + { + "epoch": 0.09046801474231007, + "grad_norm": 0.45404523611068726, + "learning_rate": 0.00018194413756352437, + "loss": 1.5567, + "step": 6962 + }, + { + "epoch": 0.09048100928622595, + "grad_norm": 0.32136112451553345, + "learning_rate": 0.000181941538101613, + "loss": 1.3397, + "step": 6963 + }, + { + "epoch": 0.09049400383014182, + "grad_norm": 0.44544461369514465, + "learning_rate": 0.0001819389386397016, + "loss": 1.3232, + "step": 6964 + }, + { + "epoch": 0.09050699837405769, + "grad_norm": 0.36751747131347656, + "learning_rate": 0.0001819363391777902, + "loss": 1.2491, + "step": 6965 + }, + { + "epoch": 0.09051999291797357, + "grad_norm": 0.40375760197639465, + "learning_rate": 0.0001819337397158788, + "loss": 1.6012, + "step": 6966 + }, + { + "epoch": 0.09053298746188944, + "grad_norm": 0.3991863429546356, + "learning_rate": 0.00018193114025396746, + "loss": 1.4175, + "step": 6967 + }, + { + "epoch": 0.09054598200580531, + "grad_norm": 0.5024530291557312, + "learning_rate": 0.00018192854079205606, + "loss": 1.5153, + "step": 6968 + }, + { + "epoch": 0.09055897654972118, + "grad_norm": 0.4122013449668884, + "learning_rate": 0.00018192594133014466, + "loss": 1.3864, + "step": 6969 + }, + { + "epoch": 0.09057197109363706, + "grad_norm": 0.39671802520751953, + "learning_rate": 0.00018192334186823328, + "loss": 1.3102, + "step": 6970 + }, + { + "epoch": 0.09058496563755293, + "grad_norm": 0.3554450273513794, + "learning_rate": 0.0001819207424063219, + "loss": 1.4223, + "step": 6971 + }, + { + "epoch": 0.0905979601814688, + "grad_norm": 0.4238445460796356, + "learning_rate": 0.00018191814294441053, + "loss": 1.4315, + "step": 6972 + }, + { + "epoch": 0.09061095472538468, + "grad_norm": 0.36172258853912354, + "learning_rate": 0.00018191554348249913, + "loss": 1.2792, + "step": 6973 + }, + { + "epoch": 0.09062394926930055, + "grad_norm": 0.3543929159641266, + "learning_rate": 0.00018191294402058775, + "loss": 1.3975, + "step": 6974 + }, + { + "epoch": 0.09063694381321642, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.00018191034455867637, + "loss": 1.4346, + "step": 6975 + }, + { + "epoch": 0.0906499383571323, + "grad_norm": 0.455879271030426, + "learning_rate": 0.00018190774509676497, + "loss": 1.5487, + "step": 6976 + }, + { + "epoch": 0.09066293290104817, + "grad_norm": 0.33087995648384094, + "learning_rate": 0.0001819051456348536, + "loss": 1.5044, + "step": 6977 + }, + { + "epoch": 0.09067592744496404, + "grad_norm": 0.4066532850265503, + "learning_rate": 0.0001819025461729422, + "loss": 1.5154, + "step": 6978 + }, + { + "epoch": 0.09068892198887991, + "grad_norm": 0.41942641139030457, + "learning_rate": 0.00018189994671103085, + "loss": 1.5276, + "step": 6979 + }, + { + "epoch": 0.09070191653279579, + "grad_norm": 0.43709370493888855, + "learning_rate": 0.00018189734724911944, + "loss": 1.666, + "step": 6980 + }, + { + "epoch": 0.09071491107671166, + "grad_norm": 0.34738513827323914, + "learning_rate": 0.00018189474778720804, + "loss": 1.415, + "step": 6981 + }, + { + "epoch": 0.09072790562062753, + "grad_norm": 0.3906846046447754, + "learning_rate": 0.00018189214832529667, + "loss": 1.243, + "step": 6982 + }, + { + "epoch": 0.0907409001645434, + "grad_norm": 0.4436666965484619, + "learning_rate": 0.0001818895488633853, + "loss": 1.242, + "step": 6983 + }, + { + "epoch": 0.09075389470845928, + "grad_norm": 0.4382869005203247, + "learning_rate": 0.00018188694940147391, + "loss": 1.3336, + "step": 6984 + }, + { + "epoch": 0.09076688925237517, + "grad_norm": 0.3783816397190094, + "learning_rate": 0.0001818843499395625, + "loss": 1.3196, + "step": 6985 + }, + { + "epoch": 0.09077988379629104, + "grad_norm": 0.44984304904937744, + "learning_rate": 0.00018188175047765114, + "loss": 1.4837, + "step": 6986 + }, + { + "epoch": 0.09079287834020691, + "grad_norm": 0.7355769872665405, + "learning_rate": 0.00018187915101573976, + "loss": 1.496, + "step": 6987 + }, + { + "epoch": 0.09080587288412278, + "grad_norm": 0.340820848941803, + "learning_rate": 0.00018187655155382836, + "loss": 1.2732, + "step": 6988 + }, + { + "epoch": 0.09081886742803866, + "grad_norm": 0.3869789242744446, + "learning_rate": 0.00018187395209191698, + "loss": 1.5891, + "step": 6989 + }, + { + "epoch": 0.09083186197195453, + "grad_norm": 0.3887910544872284, + "learning_rate": 0.00018187135263000558, + "loss": 1.4475, + "step": 6990 + }, + { + "epoch": 0.0908448565158704, + "grad_norm": 0.4561319947242737, + "learning_rate": 0.00018186875316809423, + "loss": 1.3507, + "step": 6991 + }, + { + "epoch": 0.09085785105978628, + "grad_norm": 0.34259194135665894, + "learning_rate": 0.00018186615370618283, + "loss": 1.4105, + "step": 6992 + }, + { + "epoch": 0.09087084560370215, + "grad_norm": 0.41069769859313965, + "learning_rate": 0.00018186355424427143, + "loss": 1.4076, + "step": 6993 + }, + { + "epoch": 0.09088384014761802, + "grad_norm": 0.5440664887428284, + "learning_rate": 0.00018186095478236008, + "loss": 1.4778, + "step": 6994 + }, + { + "epoch": 0.0908968346915339, + "grad_norm": 0.44725194573402405, + "learning_rate": 0.00018185835532044867, + "loss": 1.539, + "step": 6995 + }, + { + "epoch": 0.09090982923544977, + "grad_norm": 0.35201412439346313, + "learning_rate": 0.0001818557558585373, + "loss": 1.5734, + "step": 6996 + }, + { + "epoch": 0.09092282377936564, + "grad_norm": 0.4043336510658264, + "learning_rate": 0.0001818531563966259, + "loss": 1.5282, + "step": 6997 + }, + { + "epoch": 0.09093581832328151, + "grad_norm": 0.43089672923088074, + "learning_rate": 0.00018185055693471452, + "loss": 1.5351, + "step": 6998 + }, + { + "epoch": 0.09094881286719739, + "grad_norm": 0.33127808570861816, + "learning_rate": 0.00018184795747280315, + "loss": 1.3167, + "step": 6999 + }, + { + "epoch": 0.09096180741111326, + "grad_norm": 0.35555338859558105, + "learning_rate": 0.00018184535801089174, + "loss": 1.4014, + "step": 7000 + }, + { + "epoch": 0.09097480195502913, + "grad_norm": 0.47664740681648254, + "learning_rate": 0.00018184275854898037, + "loss": 1.231, + "step": 7001 + }, + { + "epoch": 0.090987796498945, + "grad_norm": 0.37155023217201233, + "learning_rate": 0.000181840159087069, + "loss": 1.3657, + "step": 7002 + }, + { + "epoch": 0.09100079104286088, + "grad_norm": 0.3100747764110565, + "learning_rate": 0.00018183755962515762, + "loss": 1.1602, + "step": 7003 + }, + { + "epoch": 0.09101378558677675, + "grad_norm": 0.3802303075790405, + "learning_rate": 0.00018183496016324621, + "loss": 1.3541, + "step": 7004 + }, + { + "epoch": 0.09102678013069262, + "grad_norm": 0.43504342436790466, + "learning_rate": 0.00018183236070133484, + "loss": 1.3814, + "step": 7005 + }, + { + "epoch": 0.0910397746746085, + "grad_norm": 0.26990699768066406, + "learning_rate": 0.00018182976123942346, + "loss": 1.2038, + "step": 7006 + }, + { + "epoch": 0.09105276921852437, + "grad_norm": 0.38820093870162964, + "learning_rate": 0.00018182716177751206, + "loss": 1.4681, + "step": 7007 + }, + { + "epoch": 0.09106576376244024, + "grad_norm": 0.4903755784034729, + "learning_rate": 0.00018182456231560068, + "loss": 1.5494, + "step": 7008 + }, + { + "epoch": 0.09107875830635612, + "grad_norm": 0.35954418778419495, + "learning_rate": 0.00018182196285368928, + "loss": 1.4889, + "step": 7009 + }, + { + "epoch": 0.09109175285027199, + "grad_norm": 0.29346954822540283, + "learning_rate": 0.0001818193633917779, + "loss": 1.1728, + "step": 7010 + }, + { + "epoch": 0.09110474739418786, + "grad_norm": 0.3912215530872345, + "learning_rate": 0.00018181676392986653, + "loss": 1.5047, + "step": 7011 + }, + { + "epoch": 0.09111774193810374, + "grad_norm": 0.4584551751613617, + "learning_rate": 0.00018181416446795513, + "loss": 1.54, + "step": 7012 + }, + { + "epoch": 0.09113073648201961, + "grad_norm": 0.4383290410041809, + "learning_rate": 0.00018181156500604375, + "loss": 1.3977, + "step": 7013 + }, + { + "epoch": 0.09114373102593548, + "grad_norm": 0.45443469285964966, + "learning_rate": 0.00018180896554413238, + "loss": 1.5307, + "step": 7014 + }, + { + "epoch": 0.09115672556985135, + "grad_norm": 0.34525009989738464, + "learning_rate": 0.000181806366082221, + "loss": 1.5244, + "step": 7015 + }, + { + "epoch": 0.09116972011376723, + "grad_norm": 0.4214661121368408, + "learning_rate": 0.0001818037666203096, + "loss": 1.5493, + "step": 7016 + }, + { + "epoch": 0.0911827146576831, + "grad_norm": 0.35950130224227905, + "learning_rate": 0.00018180116715839822, + "loss": 1.4499, + "step": 7017 + }, + { + "epoch": 0.09119570920159897, + "grad_norm": 0.360706627368927, + "learning_rate": 0.00018179856769648685, + "loss": 1.5785, + "step": 7018 + }, + { + "epoch": 0.09120870374551485, + "grad_norm": 0.37173810601234436, + "learning_rate": 0.00018179596823457545, + "loss": 1.4272, + "step": 7019 + }, + { + "epoch": 0.09122169828943072, + "grad_norm": 0.36263522505760193, + "learning_rate": 0.00018179336877266407, + "loss": 1.3951, + "step": 7020 + }, + { + "epoch": 0.09123469283334659, + "grad_norm": 0.42371153831481934, + "learning_rate": 0.00018179076931075267, + "loss": 1.4595, + "step": 7021 + }, + { + "epoch": 0.09124768737726247, + "grad_norm": 0.3105991780757904, + "learning_rate": 0.0001817881698488413, + "loss": 1.4995, + "step": 7022 + }, + { + "epoch": 0.09126068192117835, + "grad_norm": 0.3982296288013458, + "learning_rate": 0.00018178557038692992, + "loss": 1.5567, + "step": 7023 + }, + { + "epoch": 0.09127367646509423, + "grad_norm": 0.4463663399219513, + "learning_rate": 0.00018178297092501851, + "loss": 1.6307, + "step": 7024 + }, + { + "epoch": 0.0912866710090101, + "grad_norm": 0.4056328535079956, + "learning_rate": 0.00018178037146310714, + "loss": 1.445, + "step": 7025 + }, + { + "epoch": 0.09129966555292597, + "grad_norm": 0.30898308753967285, + "learning_rate": 0.00018177777200119576, + "loss": 1.2924, + "step": 7026 + }, + { + "epoch": 0.09131266009684184, + "grad_norm": 0.4046187996864319, + "learning_rate": 0.0001817751725392844, + "loss": 1.4658, + "step": 7027 + }, + { + "epoch": 0.09132565464075772, + "grad_norm": 0.4392760992050171, + "learning_rate": 0.00018177257307737298, + "loss": 1.4515, + "step": 7028 + }, + { + "epoch": 0.09133864918467359, + "grad_norm": 0.4211624264717102, + "learning_rate": 0.0001817699736154616, + "loss": 1.394, + "step": 7029 + }, + { + "epoch": 0.09135164372858946, + "grad_norm": 0.3351362645626068, + "learning_rate": 0.00018176737415355023, + "loss": 1.2156, + "step": 7030 + }, + { + "epoch": 0.09136463827250534, + "grad_norm": 0.4164751470088959, + "learning_rate": 0.00018176477469163883, + "loss": 1.3609, + "step": 7031 + }, + { + "epoch": 0.09137763281642121, + "grad_norm": 0.492680162191391, + "learning_rate": 0.00018176217522972746, + "loss": 1.2473, + "step": 7032 + }, + { + "epoch": 0.09139062736033708, + "grad_norm": 0.44297999143600464, + "learning_rate": 0.00018175957576781608, + "loss": 1.4983, + "step": 7033 + }, + { + "epoch": 0.09140362190425295, + "grad_norm": 0.3613723814487457, + "learning_rate": 0.0001817569763059047, + "loss": 1.4359, + "step": 7034 + }, + { + "epoch": 0.09141661644816883, + "grad_norm": 0.45229169726371765, + "learning_rate": 0.0001817543768439933, + "loss": 1.5633, + "step": 7035 + }, + { + "epoch": 0.0914296109920847, + "grad_norm": 0.39727187156677246, + "learning_rate": 0.0001817517773820819, + "loss": 1.3757, + "step": 7036 + }, + { + "epoch": 0.09144260553600057, + "grad_norm": 0.39390307664871216, + "learning_rate": 0.00018174917792017055, + "loss": 1.4118, + "step": 7037 + }, + { + "epoch": 0.09145560007991645, + "grad_norm": 0.40995270013809204, + "learning_rate": 0.00018174657845825915, + "loss": 1.529, + "step": 7038 + }, + { + "epoch": 0.09146859462383232, + "grad_norm": 0.5152929425239563, + "learning_rate": 0.00018174397899634777, + "loss": 1.3893, + "step": 7039 + }, + { + "epoch": 0.09148158916774819, + "grad_norm": 0.40617021918296814, + "learning_rate": 0.00018174137953443637, + "loss": 1.4569, + "step": 7040 + }, + { + "epoch": 0.09149458371166407, + "grad_norm": 0.4657137989997864, + "learning_rate": 0.000181738780072525, + "loss": 1.651, + "step": 7041 + }, + { + "epoch": 0.09150757825557994, + "grad_norm": 0.36911675333976746, + "learning_rate": 0.00018173618061061362, + "loss": 1.5006, + "step": 7042 + }, + { + "epoch": 0.09152057279949581, + "grad_norm": 0.4068413972854614, + "learning_rate": 0.00018173358114870222, + "loss": 1.4879, + "step": 7043 + }, + { + "epoch": 0.09153356734341168, + "grad_norm": 0.40428224205970764, + "learning_rate": 0.00018173098168679084, + "loss": 1.269, + "step": 7044 + }, + { + "epoch": 0.09154656188732756, + "grad_norm": 0.48272445797920227, + "learning_rate": 0.00018172838222487947, + "loss": 1.5141, + "step": 7045 + }, + { + "epoch": 0.09155955643124343, + "grad_norm": 0.3999854326248169, + "learning_rate": 0.0001817257827629681, + "loss": 1.3822, + "step": 7046 + }, + { + "epoch": 0.0915725509751593, + "grad_norm": 0.34400972723960876, + "learning_rate": 0.0001817231833010567, + "loss": 1.5411, + "step": 7047 + }, + { + "epoch": 0.09158554551907518, + "grad_norm": 0.4068968594074249, + "learning_rate": 0.00018172058383914528, + "loss": 1.3241, + "step": 7048 + }, + { + "epoch": 0.09159854006299105, + "grad_norm": 0.4398244321346283, + "learning_rate": 0.00018171798437723394, + "loss": 1.513, + "step": 7049 + }, + { + "epoch": 0.09161153460690692, + "grad_norm": 0.48099762201309204, + "learning_rate": 0.00018171538491532253, + "loss": 1.4429, + "step": 7050 + }, + { + "epoch": 0.0916245291508228, + "grad_norm": 0.4356433153152466, + "learning_rate": 0.00018171278545341116, + "loss": 1.342, + "step": 7051 + }, + { + "epoch": 0.09163752369473867, + "grad_norm": 0.3434799313545227, + "learning_rate": 0.00018171018599149976, + "loss": 1.4022, + "step": 7052 + }, + { + "epoch": 0.09165051823865454, + "grad_norm": 0.3334213197231293, + "learning_rate": 0.00018170758652958838, + "loss": 1.4959, + "step": 7053 + }, + { + "epoch": 0.09166351278257041, + "grad_norm": 0.43539541959762573, + "learning_rate": 0.000181704987067677, + "loss": 1.466, + "step": 7054 + }, + { + "epoch": 0.09167650732648629, + "grad_norm": 0.3760571777820587, + "learning_rate": 0.0001817023876057656, + "loss": 1.3674, + "step": 7055 + }, + { + "epoch": 0.09168950187040216, + "grad_norm": 0.4352891743183136, + "learning_rate": 0.00018169978814385423, + "loss": 1.6493, + "step": 7056 + }, + { + "epoch": 0.09170249641431803, + "grad_norm": 0.40841996669769287, + "learning_rate": 0.00018169718868194285, + "loss": 1.3558, + "step": 7057 + }, + { + "epoch": 0.0917154909582339, + "grad_norm": 0.3723299503326416, + "learning_rate": 0.00018169458922003148, + "loss": 1.1837, + "step": 7058 + }, + { + "epoch": 0.09172848550214978, + "grad_norm": 0.3892412483692169, + "learning_rate": 0.00018169198975812007, + "loss": 1.5824, + "step": 7059 + }, + { + "epoch": 0.09174148004606565, + "grad_norm": 0.35799074172973633, + "learning_rate": 0.00018168939029620867, + "loss": 1.5078, + "step": 7060 + }, + { + "epoch": 0.09175447458998154, + "grad_norm": 0.3643108606338501, + "learning_rate": 0.00018168679083429732, + "loss": 1.4008, + "step": 7061 + }, + { + "epoch": 0.09176746913389741, + "grad_norm": 0.44489216804504395, + "learning_rate": 0.00018168419137238592, + "loss": 1.4752, + "step": 7062 + }, + { + "epoch": 0.09178046367781328, + "grad_norm": 0.33809971809387207, + "learning_rate": 0.00018168159191047454, + "loss": 1.52, + "step": 7063 + }, + { + "epoch": 0.09179345822172916, + "grad_norm": 0.30471858382225037, + "learning_rate": 0.00018167899244856314, + "loss": 1.6783, + "step": 7064 + }, + { + "epoch": 0.09180645276564503, + "grad_norm": 0.39556384086608887, + "learning_rate": 0.00018167639298665177, + "loss": 1.3709, + "step": 7065 + }, + { + "epoch": 0.0918194473095609, + "grad_norm": 0.3694120943546295, + "learning_rate": 0.0001816737935247404, + "loss": 1.4105, + "step": 7066 + }, + { + "epoch": 0.09183244185347678, + "grad_norm": 0.4004029929637909, + "learning_rate": 0.000181671194062829, + "loss": 1.4499, + "step": 7067 + }, + { + "epoch": 0.09184543639739265, + "grad_norm": 0.31606435775756836, + "learning_rate": 0.00018166859460091764, + "loss": 1.3162, + "step": 7068 + }, + { + "epoch": 0.09185843094130852, + "grad_norm": 0.31936192512512207, + "learning_rate": 0.00018166599513900624, + "loss": 1.4311, + "step": 7069 + }, + { + "epoch": 0.0918714254852244, + "grad_norm": 0.4151539206504822, + "learning_rate": 0.00018166339567709486, + "loss": 1.3412, + "step": 7070 + }, + { + "epoch": 0.09188442002914027, + "grad_norm": 0.338312566280365, + "learning_rate": 0.00018166079621518346, + "loss": 1.4083, + "step": 7071 + }, + { + "epoch": 0.09189741457305614, + "grad_norm": 0.35674360394477844, + "learning_rate": 0.00018165819675327208, + "loss": 1.3069, + "step": 7072 + }, + { + "epoch": 0.09191040911697201, + "grad_norm": 0.4314648509025574, + "learning_rate": 0.0001816555972913607, + "loss": 1.549, + "step": 7073 + }, + { + "epoch": 0.09192340366088789, + "grad_norm": 0.39260637760162354, + "learning_rate": 0.0001816529978294493, + "loss": 1.4833, + "step": 7074 + }, + { + "epoch": 0.09193639820480376, + "grad_norm": 0.35555902123451233, + "learning_rate": 0.00018165039836753793, + "loss": 1.2901, + "step": 7075 + }, + { + "epoch": 0.09194939274871963, + "grad_norm": 0.3010965883731842, + "learning_rate": 0.00018164779890562655, + "loss": 1.5237, + "step": 7076 + }, + { + "epoch": 0.0919623872926355, + "grad_norm": 0.40476569533348083, + "learning_rate": 0.00018164519944371515, + "loss": 1.3212, + "step": 7077 + }, + { + "epoch": 0.09197538183655138, + "grad_norm": 0.3887787163257599, + "learning_rate": 0.00018164259998180378, + "loss": 1.3762, + "step": 7078 + }, + { + "epoch": 0.09198837638046725, + "grad_norm": 0.42832162976264954, + "learning_rate": 0.00018164000051989237, + "loss": 1.5362, + "step": 7079 + }, + { + "epoch": 0.09200137092438312, + "grad_norm": 0.3696151077747345, + "learning_rate": 0.00018163740105798102, + "loss": 1.2748, + "step": 7080 + }, + { + "epoch": 0.092014365468299, + "grad_norm": 0.3990439474582672, + "learning_rate": 0.00018163480159606962, + "loss": 1.2019, + "step": 7081 + }, + { + "epoch": 0.09202736001221487, + "grad_norm": 0.34416481852531433, + "learning_rate": 0.00018163220213415825, + "loss": 1.496, + "step": 7082 + }, + { + "epoch": 0.09204035455613074, + "grad_norm": 0.3878422975540161, + "learning_rate": 0.00018162960267224684, + "loss": 1.4295, + "step": 7083 + }, + { + "epoch": 0.09205334910004662, + "grad_norm": 0.40328705310821533, + "learning_rate": 0.00018162700321033547, + "loss": 1.4789, + "step": 7084 + }, + { + "epoch": 0.09206634364396249, + "grad_norm": 0.3785564601421356, + "learning_rate": 0.0001816244037484241, + "loss": 1.3298, + "step": 7085 + }, + { + "epoch": 0.09207933818787836, + "grad_norm": 0.42818620800971985, + "learning_rate": 0.0001816218042865127, + "loss": 1.3423, + "step": 7086 + }, + { + "epoch": 0.09209233273179424, + "grad_norm": 0.4125883877277374, + "learning_rate": 0.00018161920482460131, + "loss": 1.3533, + "step": 7087 + }, + { + "epoch": 0.09210532727571011, + "grad_norm": 0.4832891523838043, + "learning_rate": 0.00018161660536268994, + "loss": 1.4342, + "step": 7088 + }, + { + "epoch": 0.09211832181962598, + "grad_norm": 0.37861329317092896, + "learning_rate": 0.00018161400590077856, + "loss": 1.407, + "step": 7089 + }, + { + "epoch": 0.09213131636354185, + "grad_norm": 0.6877596378326416, + "learning_rate": 0.00018161140643886716, + "loss": 1.4898, + "step": 7090 + }, + { + "epoch": 0.09214431090745773, + "grad_norm": 0.3705253601074219, + "learning_rate": 0.00018160880697695576, + "loss": 1.5152, + "step": 7091 + }, + { + "epoch": 0.0921573054513736, + "grad_norm": 0.4205756187438965, + "learning_rate": 0.0001816062075150444, + "loss": 1.4117, + "step": 7092 + }, + { + "epoch": 0.09217029999528947, + "grad_norm": 0.35524222254753113, + "learning_rate": 0.000181603608053133, + "loss": 1.5007, + "step": 7093 + }, + { + "epoch": 0.09218329453920535, + "grad_norm": 0.367727667093277, + "learning_rate": 0.00018160100859122163, + "loss": 1.2419, + "step": 7094 + }, + { + "epoch": 0.09219628908312122, + "grad_norm": 0.4637907147407532, + "learning_rate": 0.00018159840912931023, + "loss": 1.4009, + "step": 7095 + }, + { + "epoch": 0.09220928362703709, + "grad_norm": 0.30720487236976624, + "learning_rate": 0.00018159580966739885, + "loss": 1.4144, + "step": 7096 + }, + { + "epoch": 0.09222227817095296, + "grad_norm": 0.44006332755088806, + "learning_rate": 0.00018159321020548748, + "loss": 1.285, + "step": 7097 + }, + { + "epoch": 0.09223527271486884, + "grad_norm": 0.3205036222934723, + "learning_rate": 0.00018159061074357608, + "loss": 1.4792, + "step": 7098 + }, + { + "epoch": 0.09224826725878472, + "grad_norm": 0.41404226422309875, + "learning_rate": 0.0001815880112816647, + "loss": 1.4057, + "step": 7099 + }, + { + "epoch": 0.0922612618027006, + "grad_norm": 0.3069048523902893, + "learning_rate": 0.00018158541181975332, + "loss": 1.3685, + "step": 7100 + }, + { + "epoch": 0.09227425634661647, + "grad_norm": 0.3949953615665436, + "learning_rate": 0.00018158281235784195, + "loss": 1.4948, + "step": 7101 + }, + { + "epoch": 0.09228725089053234, + "grad_norm": 0.4210084080696106, + "learning_rate": 0.00018158021289593055, + "loss": 1.4039, + "step": 7102 + }, + { + "epoch": 0.09230024543444822, + "grad_norm": 0.31463754177093506, + "learning_rate": 0.00018157761343401914, + "loss": 1.3567, + "step": 7103 + }, + { + "epoch": 0.09231323997836409, + "grad_norm": 0.3610028326511383, + "learning_rate": 0.0001815750139721078, + "loss": 1.4224, + "step": 7104 + }, + { + "epoch": 0.09232623452227996, + "grad_norm": 0.4031345844268799, + "learning_rate": 0.0001815724145101964, + "loss": 1.4431, + "step": 7105 + }, + { + "epoch": 0.09233922906619584, + "grad_norm": 0.39141783118247986, + "learning_rate": 0.00018156981504828502, + "loss": 1.3225, + "step": 7106 + }, + { + "epoch": 0.09235222361011171, + "grad_norm": 0.429343044757843, + "learning_rate": 0.00018156721558637364, + "loss": 1.4191, + "step": 7107 + }, + { + "epoch": 0.09236521815402758, + "grad_norm": 0.4602467119693756, + "learning_rate": 0.00018156461612446224, + "loss": 1.5815, + "step": 7108 + }, + { + "epoch": 0.09237821269794345, + "grad_norm": 0.349427193403244, + "learning_rate": 0.00018156201666255086, + "loss": 1.381, + "step": 7109 + }, + { + "epoch": 0.09239120724185933, + "grad_norm": 0.34029173851013184, + "learning_rate": 0.00018155941720063946, + "loss": 1.5318, + "step": 7110 + }, + { + "epoch": 0.0924042017857752, + "grad_norm": 0.42074456810951233, + "learning_rate": 0.0001815568177387281, + "loss": 1.3697, + "step": 7111 + }, + { + "epoch": 0.09241719632969107, + "grad_norm": 0.3333298861980438, + "learning_rate": 0.0001815542182768167, + "loss": 1.3375, + "step": 7112 + }, + { + "epoch": 0.09243019087360695, + "grad_norm": 0.45479950308799744, + "learning_rate": 0.00018155161881490533, + "loss": 1.4995, + "step": 7113 + }, + { + "epoch": 0.09244318541752282, + "grad_norm": 0.34272897243499756, + "learning_rate": 0.00018154901935299393, + "loss": 1.4244, + "step": 7114 + }, + { + "epoch": 0.09245617996143869, + "grad_norm": 0.5107108950614929, + "learning_rate": 0.00018154641989108256, + "loss": 1.4776, + "step": 7115 + }, + { + "epoch": 0.09246917450535456, + "grad_norm": 0.32694002985954285, + "learning_rate": 0.00018154382042917118, + "loss": 1.3601, + "step": 7116 + }, + { + "epoch": 0.09248216904927044, + "grad_norm": 0.42429476976394653, + "learning_rate": 0.00018154122096725978, + "loss": 1.4091, + "step": 7117 + }, + { + "epoch": 0.09249516359318631, + "grad_norm": 0.48859018087387085, + "learning_rate": 0.0001815386215053484, + "loss": 1.5322, + "step": 7118 + }, + { + "epoch": 0.09250815813710218, + "grad_norm": 0.33436352014541626, + "learning_rate": 0.00018153602204343703, + "loss": 1.1238, + "step": 7119 + }, + { + "epoch": 0.09252115268101806, + "grad_norm": 0.40995797514915466, + "learning_rate": 0.00018153342258152562, + "loss": 1.514, + "step": 7120 + }, + { + "epoch": 0.09253414722493393, + "grad_norm": 0.36481329798698425, + "learning_rate": 0.00018153082311961425, + "loss": 1.5133, + "step": 7121 + }, + { + "epoch": 0.0925471417688498, + "grad_norm": 0.602674126625061, + "learning_rate": 0.00018152822365770285, + "loss": 1.6651, + "step": 7122 + }, + { + "epoch": 0.09256013631276568, + "grad_norm": 0.3543475270271301, + "learning_rate": 0.0001815256241957915, + "loss": 1.2827, + "step": 7123 + }, + { + "epoch": 0.09257313085668155, + "grad_norm": 0.3600478768348694, + "learning_rate": 0.0001815230247338801, + "loss": 1.5326, + "step": 7124 + }, + { + "epoch": 0.09258612540059742, + "grad_norm": 0.3686290979385376, + "learning_rate": 0.00018152042527196872, + "loss": 1.2841, + "step": 7125 + }, + { + "epoch": 0.0925991199445133, + "grad_norm": 0.46782857179641724, + "learning_rate": 0.00018151782581005732, + "loss": 1.5128, + "step": 7126 + }, + { + "epoch": 0.09261211448842917, + "grad_norm": 0.4918583929538727, + "learning_rate": 0.00018151522634814594, + "loss": 1.4921, + "step": 7127 + }, + { + "epoch": 0.09262510903234504, + "grad_norm": 0.37741222977638245, + "learning_rate": 0.00018151262688623457, + "loss": 1.3863, + "step": 7128 + }, + { + "epoch": 0.09263810357626091, + "grad_norm": 0.5578149557113647, + "learning_rate": 0.00018151002742432316, + "loss": 1.5841, + "step": 7129 + }, + { + "epoch": 0.09265109812017679, + "grad_norm": 0.38426831364631653, + "learning_rate": 0.0001815074279624118, + "loss": 1.3376, + "step": 7130 + }, + { + "epoch": 0.09266409266409266, + "grad_norm": 0.36330536007881165, + "learning_rate": 0.0001815048285005004, + "loss": 1.5015, + "step": 7131 + }, + { + "epoch": 0.09267708720800853, + "grad_norm": 0.4214705228805542, + "learning_rate": 0.000181502229038589, + "loss": 1.4123, + "step": 7132 + }, + { + "epoch": 0.0926900817519244, + "grad_norm": 0.3297528326511383, + "learning_rate": 0.00018149962957667763, + "loss": 1.3856, + "step": 7133 + }, + { + "epoch": 0.09270307629584028, + "grad_norm": 0.45045676827430725, + "learning_rate": 0.00018149703011476623, + "loss": 1.3108, + "step": 7134 + }, + { + "epoch": 0.09271607083975615, + "grad_norm": 0.4501149654388428, + "learning_rate": 0.00018149443065285488, + "loss": 1.589, + "step": 7135 + }, + { + "epoch": 0.09272906538367202, + "grad_norm": 0.25426357984542847, + "learning_rate": 0.00018149183119094348, + "loss": 1.3711, + "step": 7136 + }, + { + "epoch": 0.09274205992758791, + "grad_norm": 0.41150906682014465, + "learning_rate": 0.0001814892317290321, + "loss": 1.3483, + "step": 7137 + }, + { + "epoch": 0.09275505447150378, + "grad_norm": 0.40769341588020325, + "learning_rate": 0.0001814866322671207, + "loss": 1.4589, + "step": 7138 + }, + { + "epoch": 0.09276804901541966, + "grad_norm": 0.3981645703315735, + "learning_rate": 0.00018148403280520933, + "loss": 1.6916, + "step": 7139 + }, + { + "epoch": 0.09278104355933553, + "grad_norm": 0.28859761357307434, + "learning_rate": 0.00018148143334329795, + "loss": 1.2536, + "step": 7140 + }, + { + "epoch": 0.0927940381032514, + "grad_norm": 0.3678811490535736, + "learning_rate": 0.00018147883388138655, + "loss": 1.5757, + "step": 7141 + }, + { + "epoch": 0.09280703264716728, + "grad_norm": 0.4214949905872345, + "learning_rate": 0.00018147623441947517, + "loss": 1.5823, + "step": 7142 + }, + { + "epoch": 0.09282002719108315, + "grad_norm": 0.2991885542869568, + "learning_rate": 0.0001814736349575638, + "loss": 1.4296, + "step": 7143 + }, + { + "epoch": 0.09283302173499902, + "grad_norm": 0.45115718245506287, + "learning_rate": 0.0001814710354956524, + "loss": 1.3936, + "step": 7144 + }, + { + "epoch": 0.0928460162789149, + "grad_norm": 0.38903236389160156, + "learning_rate": 0.00018146843603374102, + "loss": 1.3686, + "step": 7145 + }, + { + "epoch": 0.09285901082283077, + "grad_norm": 0.4278908371925354, + "learning_rate": 0.00018146583657182964, + "loss": 1.4786, + "step": 7146 + }, + { + "epoch": 0.09287200536674664, + "grad_norm": 0.42673051357269287, + "learning_rate": 0.00018146323710991827, + "loss": 1.6307, + "step": 7147 + }, + { + "epoch": 0.09288499991066251, + "grad_norm": 0.39841514825820923, + "learning_rate": 0.00018146063764800687, + "loss": 1.6144, + "step": 7148 + }, + { + "epoch": 0.09289799445457839, + "grad_norm": 0.239227294921875, + "learning_rate": 0.0001814580381860955, + "loss": 1.3622, + "step": 7149 + }, + { + "epoch": 0.09291098899849426, + "grad_norm": 0.31507638096809387, + "learning_rate": 0.00018145543872418411, + "loss": 1.3801, + "step": 7150 + }, + { + "epoch": 0.09292398354241013, + "grad_norm": 0.3635648190975189, + "learning_rate": 0.0001814528392622727, + "loss": 1.5107, + "step": 7151 + }, + { + "epoch": 0.092936978086326, + "grad_norm": 0.398466020822525, + "learning_rate": 0.00018145023980036134, + "loss": 1.6659, + "step": 7152 + }, + { + "epoch": 0.09294997263024188, + "grad_norm": 0.4346812665462494, + "learning_rate": 0.00018144764033844993, + "loss": 1.496, + "step": 7153 + }, + { + "epoch": 0.09296296717415775, + "grad_norm": 0.35877594351768494, + "learning_rate": 0.00018144504087653859, + "loss": 1.342, + "step": 7154 + }, + { + "epoch": 0.09297596171807362, + "grad_norm": 0.42430737614631653, + "learning_rate": 0.00018144244141462718, + "loss": 1.2827, + "step": 7155 + }, + { + "epoch": 0.0929889562619895, + "grad_norm": 0.3588751554489136, + "learning_rate": 0.0001814398419527158, + "loss": 1.5225, + "step": 7156 + }, + { + "epoch": 0.09300195080590537, + "grad_norm": 0.3559873402118683, + "learning_rate": 0.0001814372424908044, + "loss": 1.3338, + "step": 7157 + }, + { + "epoch": 0.09301494534982124, + "grad_norm": 0.41714149713516235, + "learning_rate": 0.00018143464302889303, + "loss": 1.4866, + "step": 7158 + }, + { + "epoch": 0.09302793989373712, + "grad_norm": 0.4089769124984741, + "learning_rate": 0.00018143204356698165, + "loss": 1.3828, + "step": 7159 + }, + { + "epoch": 0.09304093443765299, + "grad_norm": 0.3431582450866699, + "learning_rate": 0.00018142944410507025, + "loss": 1.5926, + "step": 7160 + }, + { + "epoch": 0.09305392898156886, + "grad_norm": 0.35248029232025146, + "learning_rate": 0.00018142684464315888, + "loss": 1.5597, + "step": 7161 + }, + { + "epoch": 0.09306692352548473, + "grad_norm": 0.47301214933395386, + "learning_rate": 0.0001814242451812475, + "loss": 1.5375, + "step": 7162 + }, + { + "epoch": 0.09307991806940061, + "grad_norm": 0.4430922567844391, + "learning_rate": 0.0001814216457193361, + "loss": 1.4473, + "step": 7163 + }, + { + "epoch": 0.09309291261331648, + "grad_norm": 0.28790032863616943, + "learning_rate": 0.00018141904625742472, + "loss": 1.1821, + "step": 7164 + }, + { + "epoch": 0.09310590715723235, + "grad_norm": 0.41752859950065613, + "learning_rate": 0.00018141644679551332, + "loss": 1.5598, + "step": 7165 + }, + { + "epoch": 0.09311890170114823, + "grad_norm": 0.4206375181674957, + "learning_rate": 0.00018141384733360197, + "loss": 1.5172, + "step": 7166 + }, + { + "epoch": 0.0931318962450641, + "grad_norm": 0.3069722652435303, + "learning_rate": 0.00018141124787169057, + "loss": 1.4652, + "step": 7167 + }, + { + "epoch": 0.09314489078897997, + "grad_norm": 0.3917238116264343, + "learning_rate": 0.0001814086484097792, + "loss": 1.4294, + "step": 7168 + }, + { + "epoch": 0.09315788533289585, + "grad_norm": 0.2199425846338272, + "learning_rate": 0.0001814060489478678, + "loss": 1.2917, + "step": 7169 + }, + { + "epoch": 0.09317087987681172, + "grad_norm": 0.3223063051700592, + "learning_rate": 0.00018140344948595641, + "loss": 1.3479, + "step": 7170 + }, + { + "epoch": 0.09318387442072759, + "grad_norm": 0.37226495146751404, + "learning_rate": 0.00018140085002404504, + "loss": 1.5153, + "step": 7171 + }, + { + "epoch": 0.09319686896464346, + "grad_norm": 0.351835697889328, + "learning_rate": 0.00018139825056213364, + "loss": 1.3814, + "step": 7172 + }, + { + "epoch": 0.09320986350855934, + "grad_norm": 0.4412347972393036, + "learning_rate": 0.00018139565110022226, + "loss": 1.4869, + "step": 7173 + }, + { + "epoch": 0.09322285805247521, + "grad_norm": 0.4145277142524719, + "learning_rate": 0.00018139305163831089, + "loss": 1.5661, + "step": 7174 + }, + { + "epoch": 0.09323585259639108, + "grad_norm": 0.4118339419364929, + "learning_rate": 0.00018139045217639948, + "loss": 1.2829, + "step": 7175 + }, + { + "epoch": 0.09324884714030697, + "grad_norm": 0.27595028281211853, + "learning_rate": 0.0001813878527144881, + "loss": 1.3253, + "step": 7176 + }, + { + "epoch": 0.09326184168422284, + "grad_norm": 0.2849619686603546, + "learning_rate": 0.0001813852532525767, + "loss": 1.3401, + "step": 7177 + }, + { + "epoch": 0.09327483622813872, + "grad_norm": 0.32255294919013977, + "learning_rate": 0.00018138265379066536, + "loss": 1.3041, + "step": 7178 + }, + { + "epoch": 0.09328783077205459, + "grad_norm": 0.33268553018569946, + "learning_rate": 0.00018138005432875395, + "loss": 1.4912, + "step": 7179 + }, + { + "epoch": 0.09330082531597046, + "grad_norm": 0.41254350543022156, + "learning_rate": 0.00018137745486684258, + "loss": 1.632, + "step": 7180 + }, + { + "epoch": 0.09331381985988633, + "grad_norm": 0.4047669768333435, + "learning_rate": 0.0001813748554049312, + "loss": 1.4541, + "step": 7181 + }, + { + "epoch": 0.09332681440380221, + "grad_norm": 0.4384784698486328, + "learning_rate": 0.0001813722559430198, + "loss": 1.5041, + "step": 7182 + }, + { + "epoch": 0.09333980894771808, + "grad_norm": 0.40634799003601074, + "learning_rate": 0.00018136965648110842, + "loss": 1.5847, + "step": 7183 + }, + { + "epoch": 0.09335280349163395, + "grad_norm": 0.2665691077709198, + "learning_rate": 0.00018136705701919702, + "loss": 1.1773, + "step": 7184 + }, + { + "epoch": 0.09336579803554983, + "grad_norm": 0.40051954984664917, + "learning_rate": 0.00018136445755728567, + "loss": 1.5293, + "step": 7185 + }, + { + "epoch": 0.0933787925794657, + "grad_norm": 0.40561601519584656, + "learning_rate": 0.00018136185809537427, + "loss": 1.2482, + "step": 7186 + }, + { + "epoch": 0.09339178712338157, + "grad_norm": 0.35493436455726624, + "learning_rate": 0.00018135925863346287, + "loss": 1.371, + "step": 7187 + }, + { + "epoch": 0.09340478166729745, + "grad_norm": 0.5505121350288391, + "learning_rate": 0.0001813566591715515, + "loss": 1.6067, + "step": 7188 + }, + { + "epoch": 0.09341777621121332, + "grad_norm": 0.37096771597862244, + "learning_rate": 0.00018135405970964012, + "loss": 1.5045, + "step": 7189 + }, + { + "epoch": 0.09343077075512919, + "grad_norm": 0.3663386106491089, + "learning_rate": 0.00018135146024772874, + "loss": 1.3917, + "step": 7190 + }, + { + "epoch": 0.09344376529904506, + "grad_norm": 0.331105500459671, + "learning_rate": 0.00018134886078581734, + "loss": 1.3407, + "step": 7191 + }, + { + "epoch": 0.09345675984296094, + "grad_norm": 0.3691966235637665, + "learning_rate": 0.00018134626132390596, + "loss": 1.4897, + "step": 7192 + }, + { + "epoch": 0.09346975438687681, + "grad_norm": 0.5245912075042725, + "learning_rate": 0.0001813436618619946, + "loss": 1.4214, + "step": 7193 + }, + { + "epoch": 0.09348274893079268, + "grad_norm": 0.4015735387802124, + "learning_rate": 0.00018134106240008319, + "loss": 1.2476, + "step": 7194 + }, + { + "epoch": 0.09349574347470856, + "grad_norm": 0.40817928314208984, + "learning_rate": 0.0001813384629381718, + "loss": 1.2945, + "step": 7195 + }, + { + "epoch": 0.09350873801862443, + "grad_norm": 0.5424723029136658, + "learning_rate": 0.0001813358634762604, + "loss": 1.508, + "step": 7196 + }, + { + "epoch": 0.0935217325625403, + "grad_norm": 0.4100075960159302, + "learning_rate": 0.00018133326401434906, + "loss": 1.5291, + "step": 7197 + }, + { + "epoch": 0.09353472710645618, + "grad_norm": 0.2901502847671509, + "learning_rate": 0.00018133066455243766, + "loss": 1.6145, + "step": 7198 + }, + { + "epoch": 0.09354772165037205, + "grad_norm": 0.368080198764801, + "learning_rate": 0.00018132806509052625, + "loss": 1.2638, + "step": 7199 + }, + { + "epoch": 0.09356071619428792, + "grad_norm": 0.4281146228313446, + "learning_rate": 0.00018132546562861488, + "loss": 1.4845, + "step": 7200 + }, + { + "epoch": 0.0935737107382038, + "grad_norm": 0.3882058262825012, + "learning_rate": 0.0001813228661667035, + "loss": 1.3361, + "step": 7201 + }, + { + "epoch": 0.09358670528211967, + "grad_norm": 0.3651081323623657, + "learning_rate": 0.00018132026670479213, + "loss": 1.515, + "step": 7202 + }, + { + "epoch": 0.09359969982603554, + "grad_norm": 0.34518468379974365, + "learning_rate": 0.00018131766724288072, + "loss": 1.3506, + "step": 7203 + }, + { + "epoch": 0.09361269436995141, + "grad_norm": 0.4185968041419983, + "learning_rate": 0.00018131506778096935, + "loss": 1.5779, + "step": 7204 + }, + { + "epoch": 0.09362568891386729, + "grad_norm": 0.4731001555919647, + "learning_rate": 0.00018131246831905797, + "loss": 1.5335, + "step": 7205 + }, + { + "epoch": 0.09363868345778316, + "grad_norm": 0.3844606280326843, + "learning_rate": 0.00018130986885714657, + "loss": 1.3826, + "step": 7206 + }, + { + "epoch": 0.09365167800169903, + "grad_norm": 0.3645429313182831, + "learning_rate": 0.0001813072693952352, + "loss": 1.4334, + "step": 7207 + }, + { + "epoch": 0.0936646725456149, + "grad_norm": 0.44046902656555176, + "learning_rate": 0.0001813046699333238, + "loss": 1.2695, + "step": 7208 + }, + { + "epoch": 0.09367766708953078, + "grad_norm": 0.4989940822124481, + "learning_rate": 0.00018130207047141244, + "loss": 1.4647, + "step": 7209 + }, + { + "epoch": 0.09369066163344665, + "grad_norm": 0.3138985335826874, + "learning_rate": 0.00018129947100950104, + "loss": 1.3611, + "step": 7210 + }, + { + "epoch": 0.09370365617736252, + "grad_norm": 0.4873834550380707, + "learning_rate": 0.00018129687154758967, + "loss": 1.4739, + "step": 7211 + }, + { + "epoch": 0.0937166507212784, + "grad_norm": 0.4353213608264923, + "learning_rate": 0.00018129427208567826, + "loss": 1.2071, + "step": 7212 + }, + { + "epoch": 0.09372964526519427, + "grad_norm": 0.43387168645858765, + "learning_rate": 0.0001812916726237669, + "loss": 1.4833, + "step": 7213 + }, + { + "epoch": 0.09374263980911016, + "grad_norm": 0.4404866695404053, + "learning_rate": 0.0001812890731618555, + "loss": 1.3925, + "step": 7214 + }, + { + "epoch": 0.09375563435302603, + "grad_norm": 0.48707398772239685, + "learning_rate": 0.0001812864736999441, + "loss": 1.5795, + "step": 7215 + }, + { + "epoch": 0.0937686288969419, + "grad_norm": 0.3479972183704376, + "learning_rate": 0.00018128387423803273, + "loss": 1.4622, + "step": 7216 + }, + { + "epoch": 0.09378162344085778, + "grad_norm": 0.24355284869670868, + "learning_rate": 0.00018128127477612136, + "loss": 1.2919, + "step": 7217 + }, + { + "epoch": 0.09379461798477365, + "grad_norm": 0.39244765043258667, + "learning_rate": 0.00018127867531420996, + "loss": 1.5117, + "step": 7218 + }, + { + "epoch": 0.09380761252868952, + "grad_norm": 0.32687613368034363, + "learning_rate": 0.00018127607585229858, + "loss": 1.1883, + "step": 7219 + }, + { + "epoch": 0.0938206070726054, + "grad_norm": 0.4393558204174042, + "learning_rate": 0.0001812734763903872, + "loss": 1.4521, + "step": 7220 + }, + { + "epoch": 0.09383360161652127, + "grad_norm": 0.34508129954338074, + "learning_rate": 0.00018127087692847583, + "loss": 1.2808, + "step": 7221 + }, + { + "epoch": 0.09384659616043714, + "grad_norm": 0.4363642930984497, + "learning_rate": 0.00018126827746656443, + "loss": 1.4579, + "step": 7222 + }, + { + "epoch": 0.09385959070435301, + "grad_norm": 0.404473215341568, + "learning_rate": 0.00018126567800465305, + "loss": 1.2808, + "step": 7223 + }, + { + "epoch": 0.09387258524826889, + "grad_norm": 0.4129363000392914, + "learning_rate": 0.00018126307854274168, + "loss": 1.2411, + "step": 7224 + }, + { + "epoch": 0.09388557979218476, + "grad_norm": 0.39193013310432434, + "learning_rate": 0.00018126047908083027, + "loss": 1.4219, + "step": 7225 + }, + { + "epoch": 0.09389857433610063, + "grad_norm": 0.4417882263660431, + "learning_rate": 0.0001812578796189189, + "loss": 1.3255, + "step": 7226 + }, + { + "epoch": 0.0939115688800165, + "grad_norm": 0.31806811690330505, + "learning_rate": 0.0001812552801570075, + "loss": 1.2958, + "step": 7227 + }, + { + "epoch": 0.09392456342393238, + "grad_norm": 0.3990212380886078, + "learning_rate": 0.00018125268069509612, + "loss": 1.4539, + "step": 7228 + }, + { + "epoch": 0.09393755796784825, + "grad_norm": 0.3806932270526886, + "learning_rate": 0.00018125008123318474, + "loss": 1.6653, + "step": 7229 + }, + { + "epoch": 0.09395055251176412, + "grad_norm": 0.3945797383785248, + "learning_rate": 0.00018124748177127334, + "loss": 1.4329, + "step": 7230 + }, + { + "epoch": 0.09396354705568, + "grad_norm": 0.4346785247325897, + "learning_rate": 0.00018124488230936197, + "loss": 1.5304, + "step": 7231 + }, + { + "epoch": 0.09397654159959587, + "grad_norm": 0.32155469059944153, + "learning_rate": 0.0001812422828474506, + "loss": 1.0665, + "step": 7232 + }, + { + "epoch": 0.09398953614351174, + "grad_norm": 0.394033282995224, + "learning_rate": 0.00018123968338553922, + "loss": 1.365, + "step": 7233 + }, + { + "epoch": 0.09400253068742762, + "grad_norm": 0.46820324659347534, + "learning_rate": 0.0001812370839236278, + "loss": 1.4929, + "step": 7234 + }, + { + "epoch": 0.09401552523134349, + "grad_norm": 0.3518800437450409, + "learning_rate": 0.00018123448446171644, + "loss": 1.4794, + "step": 7235 + }, + { + "epoch": 0.09402851977525936, + "grad_norm": 0.5230505466461182, + "learning_rate": 0.00018123188499980506, + "loss": 1.447, + "step": 7236 + }, + { + "epoch": 0.09404151431917523, + "grad_norm": 0.42552244663238525, + "learning_rate": 0.00018122928553789366, + "loss": 1.2292, + "step": 7237 + }, + { + "epoch": 0.09405450886309111, + "grad_norm": 0.49563997983932495, + "learning_rate": 0.00018122668607598228, + "loss": 1.4937, + "step": 7238 + }, + { + "epoch": 0.09406750340700698, + "grad_norm": 0.33242788910865784, + "learning_rate": 0.00018122408661407088, + "loss": 1.4184, + "step": 7239 + }, + { + "epoch": 0.09408049795092285, + "grad_norm": 0.41565611958503723, + "learning_rate": 0.00018122148715215953, + "loss": 1.5235, + "step": 7240 + }, + { + "epoch": 0.09409349249483873, + "grad_norm": 0.38495129346847534, + "learning_rate": 0.00018121888769024813, + "loss": 1.3662, + "step": 7241 + }, + { + "epoch": 0.0941064870387546, + "grad_norm": 0.432624876499176, + "learning_rate": 0.00018121628822833673, + "loss": 1.4292, + "step": 7242 + }, + { + "epoch": 0.09411948158267047, + "grad_norm": 0.3683008849620819, + "learning_rate": 0.00018121368876642535, + "loss": 1.5875, + "step": 7243 + }, + { + "epoch": 0.09413247612658635, + "grad_norm": 0.35654228925704956, + "learning_rate": 0.00018121108930451398, + "loss": 1.5008, + "step": 7244 + }, + { + "epoch": 0.09414547067050222, + "grad_norm": 0.36029231548309326, + "learning_rate": 0.0001812084898426026, + "loss": 1.4172, + "step": 7245 + }, + { + "epoch": 0.09415846521441809, + "grad_norm": 0.414405882358551, + "learning_rate": 0.0001812058903806912, + "loss": 1.5133, + "step": 7246 + }, + { + "epoch": 0.09417145975833396, + "grad_norm": 0.4873195290565491, + "learning_rate": 0.00018120329091877982, + "loss": 1.5078, + "step": 7247 + }, + { + "epoch": 0.09418445430224984, + "grad_norm": 0.5806477665901184, + "learning_rate": 0.00018120069145686845, + "loss": 1.4059, + "step": 7248 + }, + { + "epoch": 0.09419744884616571, + "grad_norm": 0.41904518008232117, + "learning_rate": 0.00018119809199495704, + "loss": 1.5204, + "step": 7249 + }, + { + "epoch": 0.09421044339008158, + "grad_norm": 0.421613872051239, + "learning_rate": 0.00018119549253304567, + "loss": 1.4697, + "step": 7250 + }, + { + "epoch": 0.09422343793399746, + "grad_norm": 0.3459349572658539, + "learning_rate": 0.00018119289307113427, + "loss": 1.3191, + "step": 7251 + }, + { + "epoch": 0.09423643247791334, + "grad_norm": 0.42944464087486267, + "learning_rate": 0.00018119029360922292, + "loss": 1.3137, + "step": 7252 + }, + { + "epoch": 0.09424942702182922, + "grad_norm": 0.3202815651893616, + "learning_rate": 0.00018118769414731152, + "loss": 1.4005, + "step": 7253 + }, + { + "epoch": 0.09426242156574509, + "grad_norm": 0.32814428210258484, + "learning_rate": 0.0001811850946854001, + "loss": 1.5978, + "step": 7254 + }, + { + "epoch": 0.09427541610966096, + "grad_norm": 0.44766125082969666, + "learning_rate": 0.00018118249522348876, + "loss": 1.3655, + "step": 7255 + }, + { + "epoch": 0.09428841065357683, + "grad_norm": 0.2914086878299713, + "learning_rate": 0.00018117989576157736, + "loss": 1.4049, + "step": 7256 + }, + { + "epoch": 0.09430140519749271, + "grad_norm": 0.39988675713539124, + "learning_rate": 0.00018117729629966599, + "loss": 1.4526, + "step": 7257 + }, + { + "epoch": 0.09431439974140858, + "grad_norm": 0.3311614394187927, + "learning_rate": 0.00018117469683775458, + "loss": 1.3424, + "step": 7258 + }, + { + "epoch": 0.09432739428532445, + "grad_norm": 0.4980418384075165, + "learning_rate": 0.0001811720973758432, + "loss": 1.4031, + "step": 7259 + }, + { + "epoch": 0.09434038882924033, + "grad_norm": 0.6409164071083069, + "learning_rate": 0.00018116949791393183, + "loss": 1.5083, + "step": 7260 + }, + { + "epoch": 0.0943533833731562, + "grad_norm": 0.6993975043296814, + "learning_rate": 0.00018116689845202043, + "loss": 1.557, + "step": 7261 + }, + { + "epoch": 0.09436637791707207, + "grad_norm": 0.4055137634277344, + "learning_rate": 0.00018116429899010905, + "loss": 1.2874, + "step": 7262 + }, + { + "epoch": 0.09437937246098795, + "grad_norm": 0.36568740010261536, + "learning_rate": 0.00018116169952819768, + "loss": 1.7407, + "step": 7263 + }, + { + "epoch": 0.09439236700490382, + "grad_norm": 0.2845642566680908, + "learning_rate": 0.0001811591000662863, + "loss": 1.3145, + "step": 7264 + }, + { + "epoch": 0.09440536154881969, + "grad_norm": 0.45284831523895264, + "learning_rate": 0.0001811565006043749, + "loss": 1.4203, + "step": 7265 + }, + { + "epoch": 0.09441835609273556, + "grad_norm": 0.46527132391929626, + "learning_rate": 0.0001811539011424635, + "loss": 1.5508, + "step": 7266 + }, + { + "epoch": 0.09443135063665144, + "grad_norm": 0.3700307309627533, + "learning_rate": 0.00018115130168055215, + "loss": 1.4195, + "step": 7267 + }, + { + "epoch": 0.09444434518056731, + "grad_norm": 0.45265263319015503, + "learning_rate": 0.00018114870221864075, + "loss": 1.3754, + "step": 7268 + }, + { + "epoch": 0.09445733972448318, + "grad_norm": 0.44388577342033386, + "learning_rate": 0.00018114610275672937, + "loss": 1.39, + "step": 7269 + }, + { + "epoch": 0.09447033426839906, + "grad_norm": 0.3856041431427002, + "learning_rate": 0.00018114350329481797, + "loss": 1.3591, + "step": 7270 + }, + { + "epoch": 0.09448332881231493, + "grad_norm": 0.37693923711776733, + "learning_rate": 0.0001811409038329066, + "loss": 1.4226, + "step": 7271 + }, + { + "epoch": 0.0944963233562308, + "grad_norm": 0.368068128824234, + "learning_rate": 0.00018113830437099522, + "loss": 1.4905, + "step": 7272 + }, + { + "epoch": 0.09450931790014667, + "grad_norm": 0.4132508337497711, + "learning_rate": 0.00018113570490908381, + "loss": 1.3154, + "step": 7273 + }, + { + "epoch": 0.09452231244406255, + "grad_norm": 0.39588838815689087, + "learning_rate": 0.00018113310544717244, + "loss": 1.3746, + "step": 7274 + }, + { + "epoch": 0.09453530698797842, + "grad_norm": 0.4163910150527954, + "learning_rate": 0.00018113050598526106, + "loss": 1.2441, + "step": 7275 + }, + { + "epoch": 0.0945483015318943, + "grad_norm": 0.35650166869163513, + "learning_rate": 0.0001811279065233497, + "loss": 1.2851, + "step": 7276 + }, + { + "epoch": 0.09456129607581017, + "grad_norm": 0.4032966196537018, + "learning_rate": 0.00018112530706143829, + "loss": 1.5843, + "step": 7277 + }, + { + "epoch": 0.09457429061972604, + "grad_norm": 0.3548662066459656, + "learning_rate": 0.0001811227075995269, + "loss": 1.5866, + "step": 7278 + }, + { + "epoch": 0.09458728516364191, + "grad_norm": 0.4279075860977173, + "learning_rate": 0.00018112010813761553, + "loss": 1.3916, + "step": 7279 + }, + { + "epoch": 0.09460027970755779, + "grad_norm": 0.31919071078300476, + "learning_rate": 0.00018111750867570413, + "loss": 1.535, + "step": 7280 + }, + { + "epoch": 0.09461327425147366, + "grad_norm": 0.40728500485420227, + "learning_rate": 0.00018111490921379276, + "loss": 1.6466, + "step": 7281 + }, + { + "epoch": 0.09462626879538953, + "grad_norm": 0.25449395179748535, + "learning_rate": 0.00018111230975188135, + "loss": 1.4623, + "step": 7282 + }, + { + "epoch": 0.0946392633393054, + "grad_norm": 0.4637458324432373, + "learning_rate": 0.00018110971028996998, + "loss": 1.4263, + "step": 7283 + }, + { + "epoch": 0.09465225788322128, + "grad_norm": 0.329838365316391, + "learning_rate": 0.0001811071108280586, + "loss": 1.5491, + "step": 7284 + }, + { + "epoch": 0.09466525242713715, + "grad_norm": 0.29775992035865784, + "learning_rate": 0.0001811045113661472, + "loss": 1.5777, + "step": 7285 + }, + { + "epoch": 0.09467824697105302, + "grad_norm": 0.41244420409202576, + "learning_rate": 0.00018110191190423582, + "loss": 1.3067, + "step": 7286 + }, + { + "epoch": 0.0946912415149689, + "grad_norm": 0.318005234003067, + "learning_rate": 0.00018109931244232445, + "loss": 1.3772, + "step": 7287 + }, + { + "epoch": 0.09470423605888477, + "grad_norm": 0.38731497526168823, + "learning_rate": 0.00018109671298041307, + "loss": 1.4829, + "step": 7288 + }, + { + "epoch": 0.09471723060280064, + "grad_norm": 0.35857513546943665, + "learning_rate": 0.00018109411351850167, + "loss": 1.4593, + "step": 7289 + }, + { + "epoch": 0.09473022514671653, + "grad_norm": 0.4021919369697571, + "learning_rate": 0.0001810915140565903, + "loss": 1.4017, + "step": 7290 + }, + { + "epoch": 0.0947432196906324, + "grad_norm": 0.4414508044719696, + "learning_rate": 0.00018108891459467892, + "loss": 1.5618, + "step": 7291 + }, + { + "epoch": 0.09475621423454827, + "grad_norm": 0.4374082088470459, + "learning_rate": 0.00018108631513276752, + "loss": 1.4636, + "step": 7292 + }, + { + "epoch": 0.09476920877846415, + "grad_norm": 0.41006237268447876, + "learning_rate": 0.00018108371567085614, + "loss": 1.4143, + "step": 7293 + }, + { + "epoch": 0.09478220332238002, + "grad_norm": 0.30947592854499817, + "learning_rate": 0.00018108111620894477, + "loss": 1.4089, + "step": 7294 + }, + { + "epoch": 0.0947951978662959, + "grad_norm": 0.4499688148498535, + "learning_rate": 0.0001810785167470334, + "loss": 1.5636, + "step": 7295 + }, + { + "epoch": 0.09480819241021177, + "grad_norm": 0.6653492450714111, + "learning_rate": 0.000181075917285122, + "loss": 1.4635, + "step": 7296 + }, + { + "epoch": 0.09482118695412764, + "grad_norm": 0.3968139588832855, + "learning_rate": 0.00018107331782321059, + "loss": 1.4746, + "step": 7297 + }, + { + "epoch": 0.09483418149804351, + "grad_norm": 0.3583230972290039, + "learning_rate": 0.00018107071836129924, + "loss": 1.4722, + "step": 7298 + }, + { + "epoch": 0.09484717604195939, + "grad_norm": 0.4312600791454315, + "learning_rate": 0.00018106811889938783, + "loss": 1.3817, + "step": 7299 + }, + { + "epoch": 0.09486017058587526, + "grad_norm": 0.406304270029068, + "learning_rate": 0.00018106551943747646, + "loss": 1.5599, + "step": 7300 + }, + { + "epoch": 0.09487316512979113, + "grad_norm": 0.36707812547683716, + "learning_rate": 0.00018106291997556506, + "loss": 1.2522, + "step": 7301 + }, + { + "epoch": 0.094886159673707, + "grad_norm": 0.43151021003723145, + "learning_rate": 0.00018106032051365368, + "loss": 1.4555, + "step": 7302 + }, + { + "epoch": 0.09489915421762288, + "grad_norm": 0.5606217384338379, + "learning_rate": 0.0001810577210517423, + "loss": 1.5608, + "step": 7303 + }, + { + "epoch": 0.09491214876153875, + "grad_norm": 0.297919899225235, + "learning_rate": 0.0001810551215898309, + "loss": 1.2231, + "step": 7304 + }, + { + "epoch": 0.09492514330545462, + "grad_norm": 0.5842865705490112, + "learning_rate": 0.00018105252212791953, + "loss": 1.5461, + "step": 7305 + }, + { + "epoch": 0.0949381378493705, + "grad_norm": 0.40130487084388733, + "learning_rate": 0.00018104992266600815, + "loss": 1.3524, + "step": 7306 + }, + { + "epoch": 0.09495113239328637, + "grad_norm": 0.3985798954963684, + "learning_rate": 0.00018104732320409678, + "loss": 1.2793, + "step": 7307 + }, + { + "epoch": 0.09496412693720224, + "grad_norm": 0.35852622985839844, + "learning_rate": 0.00018104472374218537, + "loss": 1.3091, + "step": 7308 + }, + { + "epoch": 0.09497712148111812, + "grad_norm": 0.39018869400024414, + "learning_rate": 0.00018104212428027397, + "loss": 1.3751, + "step": 7309 + }, + { + "epoch": 0.09499011602503399, + "grad_norm": 0.5982402563095093, + "learning_rate": 0.00018103952481836262, + "loss": 1.4635, + "step": 7310 + }, + { + "epoch": 0.09500311056894986, + "grad_norm": 0.4077194631099701, + "learning_rate": 0.00018103692535645122, + "loss": 1.4362, + "step": 7311 + }, + { + "epoch": 0.09501610511286573, + "grad_norm": 0.3744657039642334, + "learning_rate": 0.00018103432589453984, + "loss": 1.2755, + "step": 7312 + }, + { + "epoch": 0.09502909965678161, + "grad_norm": 0.4277615547180176, + "learning_rate": 0.00018103172643262844, + "loss": 1.4048, + "step": 7313 + }, + { + "epoch": 0.09504209420069748, + "grad_norm": 0.3324318528175354, + "learning_rate": 0.00018102912697071707, + "loss": 1.3802, + "step": 7314 + }, + { + "epoch": 0.09505508874461335, + "grad_norm": 0.3124167025089264, + "learning_rate": 0.0001810265275088057, + "loss": 1.2795, + "step": 7315 + }, + { + "epoch": 0.09506808328852923, + "grad_norm": 0.4548947513103485, + "learning_rate": 0.0001810239280468943, + "loss": 1.6053, + "step": 7316 + }, + { + "epoch": 0.0950810778324451, + "grad_norm": 0.4082011580467224, + "learning_rate": 0.0001810213285849829, + "loss": 1.6244, + "step": 7317 + }, + { + "epoch": 0.09509407237636097, + "grad_norm": 0.3603448271751404, + "learning_rate": 0.00018101872912307154, + "loss": 1.278, + "step": 7318 + }, + { + "epoch": 0.09510706692027684, + "grad_norm": 0.4153482913970947, + "learning_rate": 0.00018101612966116016, + "loss": 1.4195, + "step": 7319 + }, + { + "epoch": 0.09512006146419272, + "grad_norm": 0.4001554548740387, + "learning_rate": 0.00018101353019924876, + "loss": 1.328, + "step": 7320 + }, + { + "epoch": 0.09513305600810859, + "grad_norm": 0.46756473183631897, + "learning_rate": 0.00018101093073733736, + "loss": 1.292, + "step": 7321 + }, + { + "epoch": 0.09514605055202446, + "grad_norm": 0.39417633414268494, + "learning_rate": 0.000181008331275426, + "loss": 1.3832, + "step": 7322 + }, + { + "epoch": 0.09515904509594034, + "grad_norm": 0.45010629296302795, + "learning_rate": 0.0001810057318135146, + "loss": 1.5333, + "step": 7323 + }, + { + "epoch": 0.09517203963985621, + "grad_norm": 0.3506676256656647, + "learning_rate": 0.00018100313235160323, + "loss": 1.5742, + "step": 7324 + }, + { + "epoch": 0.09518503418377208, + "grad_norm": 0.4224172532558441, + "learning_rate": 0.00018100053288969183, + "loss": 1.4842, + "step": 7325 + }, + { + "epoch": 0.09519802872768796, + "grad_norm": 0.34364381432533264, + "learning_rate": 0.00018099793342778045, + "loss": 1.3448, + "step": 7326 + }, + { + "epoch": 0.09521102327160383, + "grad_norm": 0.43910667300224304, + "learning_rate": 0.00018099533396586908, + "loss": 1.3444, + "step": 7327 + }, + { + "epoch": 0.09522401781551972, + "grad_norm": 0.39456990361213684, + "learning_rate": 0.00018099273450395767, + "loss": 1.5078, + "step": 7328 + }, + { + "epoch": 0.09523701235943559, + "grad_norm": 0.3839728832244873, + "learning_rate": 0.00018099013504204633, + "loss": 1.4328, + "step": 7329 + }, + { + "epoch": 0.09525000690335146, + "grad_norm": 0.3956484794616699, + "learning_rate": 0.00018098753558013492, + "loss": 1.4098, + "step": 7330 + }, + { + "epoch": 0.09526300144726733, + "grad_norm": 0.4149603545665741, + "learning_rate": 0.00018098493611822355, + "loss": 1.3573, + "step": 7331 + }, + { + "epoch": 0.09527599599118321, + "grad_norm": 0.3212421238422394, + "learning_rate": 0.00018098233665631214, + "loss": 1.4073, + "step": 7332 + }, + { + "epoch": 0.09528899053509908, + "grad_norm": 0.4270807206630707, + "learning_rate": 0.00018097973719440077, + "loss": 1.4798, + "step": 7333 + }, + { + "epoch": 0.09530198507901495, + "grad_norm": 0.48940223455429077, + "learning_rate": 0.0001809771377324894, + "loss": 1.4807, + "step": 7334 + }, + { + "epoch": 0.09531497962293083, + "grad_norm": 0.39676734805107117, + "learning_rate": 0.000180974538270578, + "loss": 1.5772, + "step": 7335 + }, + { + "epoch": 0.0953279741668467, + "grad_norm": 0.3974304795265198, + "learning_rate": 0.00018097193880866662, + "loss": 1.4628, + "step": 7336 + }, + { + "epoch": 0.09534096871076257, + "grad_norm": 0.3763043284416199, + "learning_rate": 0.00018096933934675524, + "loss": 1.4777, + "step": 7337 + }, + { + "epoch": 0.09535396325467844, + "grad_norm": 0.4016934335231781, + "learning_rate": 0.00018096673988484384, + "loss": 1.4591, + "step": 7338 + }, + { + "epoch": 0.09536695779859432, + "grad_norm": 0.4639422297477722, + "learning_rate": 0.00018096414042293246, + "loss": 1.5875, + "step": 7339 + }, + { + "epoch": 0.09537995234251019, + "grad_norm": 0.47356316447257996, + "learning_rate": 0.00018096154096102106, + "loss": 1.4073, + "step": 7340 + }, + { + "epoch": 0.09539294688642606, + "grad_norm": 0.4202554523944855, + "learning_rate": 0.0001809589414991097, + "loss": 1.1655, + "step": 7341 + }, + { + "epoch": 0.09540594143034194, + "grad_norm": 0.3120211362838745, + "learning_rate": 0.0001809563420371983, + "loss": 1.3712, + "step": 7342 + }, + { + "epoch": 0.09541893597425781, + "grad_norm": 0.30092450976371765, + "learning_rate": 0.00018095374257528693, + "loss": 1.5021, + "step": 7343 + }, + { + "epoch": 0.09543193051817368, + "grad_norm": 0.33890169858932495, + "learning_rate": 0.00018095114311337553, + "loss": 1.4425, + "step": 7344 + }, + { + "epoch": 0.09544492506208956, + "grad_norm": 0.44607123732566833, + "learning_rate": 0.00018094854365146415, + "loss": 1.6243, + "step": 7345 + }, + { + "epoch": 0.09545791960600543, + "grad_norm": 0.41282761096954346, + "learning_rate": 0.00018094594418955278, + "loss": 1.4266, + "step": 7346 + }, + { + "epoch": 0.0954709141499213, + "grad_norm": 0.375723659992218, + "learning_rate": 0.00018094334472764138, + "loss": 1.5753, + "step": 7347 + }, + { + "epoch": 0.09548390869383717, + "grad_norm": 0.3935757279396057, + "learning_rate": 0.00018094074526573, + "loss": 1.4751, + "step": 7348 + }, + { + "epoch": 0.09549690323775305, + "grad_norm": 0.3137151896953583, + "learning_rate": 0.00018093814580381863, + "loss": 1.4364, + "step": 7349 + }, + { + "epoch": 0.09550989778166892, + "grad_norm": 0.44521477818489075, + "learning_rate": 0.00018093554634190722, + "loss": 1.4635, + "step": 7350 + }, + { + "epoch": 0.0955228923255848, + "grad_norm": 0.5662813186645508, + "learning_rate": 0.00018093294687999585, + "loss": 1.5626, + "step": 7351 + }, + { + "epoch": 0.09553588686950067, + "grad_norm": 0.3745267689228058, + "learning_rate": 0.00018093034741808444, + "loss": 1.4033, + "step": 7352 + }, + { + "epoch": 0.09554888141341654, + "grad_norm": 0.402737021446228, + "learning_rate": 0.0001809277479561731, + "loss": 1.5217, + "step": 7353 + }, + { + "epoch": 0.09556187595733241, + "grad_norm": 0.44217994809150696, + "learning_rate": 0.0001809251484942617, + "loss": 1.5257, + "step": 7354 + }, + { + "epoch": 0.09557487050124829, + "grad_norm": 0.33841899037361145, + "learning_rate": 0.00018092254903235032, + "loss": 1.3847, + "step": 7355 + }, + { + "epoch": 0.09558786504516416, + "grad_norm": 0.3889140188694, + "learning_rate": 0.00018091994957043892, + "loss": 1.4086, + "step": 7356 + }, + { + "epoch": 0.09560085958908003, + "grad_norm": 0.42446374893188477, + "learning_rate": 0.00018091735010852754, + "loss": 1.3973, + "step": 7357 + }, + { + "epoch": 0.0956138541329959, + "grad_norm": 0.406424880027771, + "learning_rate": 0.00018091475064661616, + "loss": 1.3708, + "step": 7358 + }, + { + "epoch": 0.09562684867691178, + "grad_norm": 0.3968759775161743, + "learning_rate": 0.00018091215118470476, + "loss": 1.1888, + "step": 7359 + }, + { + "epoch": 0.09563984322082765, + "grad_norm": 0.3304159939289093, + "learning_rate": 0.00018090955172279339, + "loss": 1.196, + "step": 7360 + }, + { + "epoch": 0.09565283776474352, + "grad_norm": 0.34259432554244995, + "learning_rate": 0.000180906952260882, + "loss": 1.3919, + "step": 7361 + }, + { + "epoch": 0.0956658323086594, + "grad_norm": 0.41082262992858887, + "learning_rate": 0.00018090435279897064, + "loss": 1.3382, + "step": 7362 + }, + { + "epoch": 0.09567882685257527, + "grad_norm": 0.46010255813598633, + "learning_rate": 0.00018090175333705923, + "loss": 1.5625, + "step": 7363 + }, + { + "epoch": 0.09569182139649114, + "grad_norm": 0.3312811255455017, + "learning_rate": 0.00018089915387514783, + "loss": 1.3329, + "step": 7364 + }, + { + "epoch": 0.09570481594040701, + "grad_norm": 0.3977181613445282, + "learning_rate": 0.00018089655441323648, + "loss": 1.3733, + "step": 7365 + }, + { + "epoch": 0.0957178104843229, + "grad_norm": 0.398436963558197, + "learning_rate": 0.00018089395495132508, + "loss": 1.4563, + "step": 7366 + }, + { + "epoch": 0.09573080502823877, + "grad_norm": 0.43061181902885437, + "learning_rate": 0.0001808913554894137, + "loss": 1.5571, + "step": 7367 + }, + { + "epoch": 0.09574379957215465, + "grad_norm": 0.35109207034111023, + "learning_rate": 0.00018088875602750233, + "loss": 1.302, + "step": 7368 + }, + { + "epoch": 0.09575679411607052, + "grad_norm": 0.34773433208465576, + "learning_rate": 0.00018088615656559093, + "loss": 1.4575, + "step": 7369 + }, + { + "epoch": 0.0957697886599864, + "grad_norm": 0.4366167485713959, + "learning_rate": 0.00018088355710367955, + "loss": 1.3934, + "step": 7370 + }, + { + "epoch": 0.09578278320390227, + "grad_norm": 0.37397125363349915, + "learning_rate": 0.00018088095764176815, + "loss": 1.5105, + "step": 7371 + }, + { + "epoch": 0.09579577774781814, + "grad_norm": 0.47733554244041443, + "learning_rate": 0.0001808783581798568, + "loss": 1.4562, + "step": 7372 + }, + { + "epoch": 0.09580877229173401, + "grad_norm": 0.4071448743343353, + "learning_rate": 0.0001808757587179454, + "loss": 1.5588, + "step": 7373 + }, + { + "epoch": 0.09582176683564989, + "grad_norm": 0.2880563735961914, + "learning_rate": 0.00018087315925603402, + "loss": 1.3191, + "step": 7374 + }, + { + "epoch": 0.09583476137956576, + "grad_norm": 0.3139370083808899, + "learning_rate": 0.00018087055979412262, + "loss": 1.2754, + "step": 7375 + }, + { + "epoch": 0.09584775592348163, + "grad_norm": 0.4224952161312103, + "learning_rate": 0.00018086796033221124, + "loss": 1.5666, + "step": 7376 + }, + { + "epoch": 0.0958607504673975, + "grad_norm": 0.3922746777534485, + "learning_rate": 0.00018086536087029987, + "loss": 1.671, + "step": 7377 + }, + { + "epoch": 0.09587374501131338, + "grad_norm": 0.41716238856315613, + "learning_rate": 0.00018086276140838846, + "loss": 1.3269, + "step": 7378 + }, + { + "epoch": 0.09588673955522925, + "grad_norm": 0.45813536643981934, + "learning_rate": 0.0001808601619464771, + "loss": 1.6024, + "step": 7379 + }, + { + "epoch": 0.09589973409914512, + "grad_norm": 0.33751386404037476, + "learning_rate": 0.0001808575624845657, + "loss": 1.2527, + "step": 7380 + }, + { + "epoch": 0.095912728643061, + "grad_norm": 0.4570043981075287, + "learning_rate": 0.0001808549630226543, + "loss": 1.5184, + "step": 7381 + }, + { + "epoch": 0.09592572318697687, + "grad_norm": 0.35360071063041687, + "learning_rate": 0.00018085236356074294, + "loss": 1.4118, + "step": 7382 + }, + { + "epoch": 0.09593871773089274, + "grad_norm": 0.2964922785758972, + "learning_rate": 0.00018084976409883153, + "loss": 1.5343, + "step": 7383 + }, + { + "epoch": 0.09595171227480861, + "grad_norm": 0.40963271260261536, + "learning_rate": 0.00018084716463692018, + "loss": 1.4879, + "step": 7384 + }, + { + "epoch": 0.09596470681872449, + "grad_norm": 0.4206002354621887, + "learning_rate": 0.00018084456517500878, + "loss": 1.3153, + "step": 7385 + }, + { + "epoch": 0.09597770136264036, + "grad_norm": 0.3872007727622986, + "learning_rate": 0.0001808419657130974, + "loss": 1.5442, + "step": 7386 + }, + { + "epoch": 0.09599069590655623, + "grad_norm": 0.3645806312561035, + "learning_rate": 0.000180839366251186, + "loss": 1.4758, + "step": 7387 + }, + { + "epoch": 0.0960036904504721, + "grad_norm": 0.46816349029541016, + "learning_rate": 0.00018083676678927463, + "loss": 1.4461, + "step": 7388 + }, + { + "epoch": 0.09601668499438798, + "grad_norm": 0.3602139353752136, + "learning_rate": 0.00018083416732736325, + "loss": 1.4969, + "step": 7389 + }, + { + "epoch": 0.09602967953830385, + "grad_norm": 0.3550172448158264, + "learning_rate": 0.00018083156786545185, + "loss": 1.518, + "step": 7390 + }, + { + "epoch": 0.09604267408221973, + "grad_norm": 0.46977195143699646, + "learning_rate": 0.00018082896840354047, + "loss": 1.6275, + "step": 7391 + }, + { + "epoch": 0.0960556686261356, + "grad_norm": 0.5217018127441406, + "learning_rate": 0.0001808263689416291, + "loss": 1.6169, + "step": 7392 + }, + { + "epoch": 0.09606866317005147, + "grad_norm": 0.36064958572387695, + "learning_rate": 0.0001808237694797177, + "loss": 1.5654, + "step": 7393 + }, + { + "epoch": 0.09608165771396734, + "grad_norm": 0.37483900785446167, + "learning_rate": 0.00018082117001780632, + "loss": 1.3788, + "step": 7394 + }, + { + "epoch": 0.09609465225788322, + "grad_norm": 0.3947065472602844, + "learning_rate": 0.00018081857055589492, + "loss": 1.539, + "step": 7395 + }, + { + "epoch": 0.09610764680179909, + "grad_norm": 0.40937402844429016, + "learning_rate": 0.00018081597109398357, + "loss": 1.33, + "step": 7396 + }, + { + "epoch": 0.09612064134571496, + "grad_norm": 0.375588595867157, + "learning_rate": 0.00018081337163207217, + "loss": 1.3802, + "step": 7397 + }, + { + "epoch": 0.09613363588963084, + "grad_norm": 0.4242369830608368, + "learning_rate": 0.0001808107721701608, + "loss": 1.2283, + "step": 7398 + }, + { + "epoch": 0.09614663043354671, + "grad_norm": 0.31093209981918335, + "learning_rate": 0.0001808081727082494, + "loss": 1.1685, + "step": 7399 + }, + { + "epoch": 0.09615962497746258, + "grad_norm": 0.33873701095581055, + "learning_rate": 0.000180805573246338, + "loss": 1.2617, + "step": 7400 + }, + { + "epoch": 0.09617261952137846, + "grad_norm": 0.4038071632385254, + "learning_rate": 0.00018080297378442664, + "loss": 1.2833, + "step": 7401 + }, + { + "epoch": 0.09618561406529433, + "grad_norm": 0.43444883823394775, + "learning_rate": 0.00018080037432251524, + "loss": 1.4952, + "step": 7402 + }, + { + "epoch": 0.0961986086092102, + "grad_norm": 0.37988609075546265, + "learning_rate": 0.0001807977748606039, + "loss": 1.2989, + "step": 7403 + }, + { + "epoch": 0.09621160315312609, + "grad_norm": 0.39441198110580444, + "learning_rate": 0.00018079517539869248, + "loss": 1.327, + "step": 7404 + }, + { + "epoch": 0.09622459769704196, + "grad_norm": 0.385147362947464, + "learning_rate": 0.00018079257593678108, + "loss": 1.4291, + "step": 7405 + }, + { + "epoch": 0.09623759224095783, + "grad_norm": 0.34766459465026855, + "learning_rate": 0.0001807899764748697, + "loss": 1.3645, + "step": 7406 + }, + { + "epoch": 0.0962505867848737, + "grad_norm": 0.28275880217552185, + "learning_rate": 0.00018078737701295833, + "loss": 1.2419, + "step": 7407 + }, + { + "epoch": 0.09626358132878958, + "grad_norm": 0.4010525047779083, + "learning_rate": 0.00018078477755104695, + "loss": 1.3837, + "step": 7408 + }, + { + "epoch": 0.09627657587270545, + "grad_norm": 0.5371651649475098, + "learning_rate": 0.00018078217808913555, + "loss": 1.3649, + "step": 7409 + }, + { + "epoch": 0.09628957041662133, + "grad_norm": 0.35657212138175964, + "learning_rate": 0.00018077957862722418, + "loss": 1.2928, + "step": 7410 + }, + { + "epoch": 0.0963025649605372, + "grad_norm": 0.33672940731048584, + "learning_rate": 0.0001807769791653128, + "loss": 1.481, + "step": 7411 + }, + { + "epoch": 0.09631555950445307, + "grad_norm": 0.31562361121177673, + "learning_rate": 0.0001807743797034014, + "loss": 1.3423, + "step": 7412 + }, + { + "epoch": 0.09632855404836894, + "grad_norm": 0.5343045592308044, + "learning_rate": 0.00018077178024149002, + "loss": 1.466, + "step": 7413 + }, + { + "epoch": 0.09634154859228482, + "grad_norm": 0.43652138113975525, + "learning_rate": 0.00018076918077957862, + "loss": 1.4483, + "step": 7414 + }, + { + "epoch": 0.09635454313620069, + "grad_norm": 0.4236850142478943, + "learning_rate": 0.00018076658131766727, + "loss": 1.5935, + "step": 7415 + }, + { + "epoch": 0.09636753768011656, + "grad_norm": 0.4643765985965729, + "learning_rate": 0.00018076398185575587, + "loss": 1.4371, + "step": 7416 + }, + { + "epoch": 0.09638053222403244, + "grad_norm": 0.3924958109855652, + "learning_rate": 0.0001807613823938445, + "loss": 1.4879, + "step": 7417 + }, + { + "epoch": 0.09639352676794831, + "grad_norm": 0.3421379327774048, + "learning_rate": 0.0001807587829319331, + "loss": 1.5619, + "step": 7418 + }, + { + "epoch": 0.09640652131186418, + "grad_norm": 0.4625434875488281, + "learning_rate": 0.00018075618347002172, + "loss": 1.3606, + "step": 7419 + }, + { + "epoch": 0.09641951585578006, + "grad_norm": 0.5802971124649048, + "learning_rate": 0.00018075358400811034, + "loss": 1.4982, + "step": 7420 + }, + { + "epoch": 0.09643251039969593, + "grad_norm": 0.3458288609981537, + "learning_rate": 0.00018075098454619894, + "loss": 1.562, + "step": 7421 + }, + { + "epoch": 0.0964455049436118, + "grad_norm": 0.3565884530544281, + "learning_rate": 0.00018074838508428756, + "loss": 1.3796, + "step": 7422 + }, + { + "epoch": 0.09645849948752767, + "grad_norm": 0.37184450030326843, + "learning_rate": 0.0001807457856223762, + "loss": 1.2933, + "step": 7423 + }, + { + "epoch": 0.09647149403144355, + "grad_norm": 0.396337628364563, + "learning_rate": 0.00018074318616046478, + "loss": 1.5114, + "step": 7424 + }, + { + "epoch": 0.09648448857535942, + "grad_norm": 0.4396108090877533, + "learning_rate": 0.0001807405866985534, + "loss": 1.6913, + "step": 7425 + }, + { + "epoch": 0.09649748311927529, + "grad_norm": 0.3083972632884979, + "learning_rate": 0.000180737987236642, + "loss": 1.4721, + "step": 7426 + }, + { + "epoch": 0.09651047766319117, + "grad_norm": 0.29246222972869873, + "learning_rate": 0.00018073538777473066, + "loss": 1.375, + "step": 7427 + }, + { + "epoch": 0.09652347220710704, + "grad_norm": 0.5322912335395813, + "learning_rate": 0.00018073278831281925, + "loss": 1.63, + "step": 7428 + }, + { + "epoch": 0.09653646675102291, + "grad_norm": 0.3530120551586151, + "learning_rate": 0.00018073018885090788, + "loss": 1.3668, + "step": 7429 + }, + { + "epoch": 0.09654946129493878, + "grad_norm": 0.44654831290245056, + "learning_rate": 0.00018072758938899648, + "loss": 1.465, + "step": 7430 + }, + { + "epoch": 0.09656245583885466, + "grad_norm": 0.35736167430877686, + "learning_rate": 0.0001807249899270851, + "loss": 1.2781, + "step": 7431 + }, + { + "epoch": 0.09657545038277053, + "grad_norm": 0.5238146781921387, + "learning_rate": 0.00018072239046517373, + "loss": 1.4013, + "step": 7432 + }, + { + "epoch": 0.0965884449266864, + "grad_norm": 0.35692641139030457, + "learning_rate": 0.00018071979100326232, + "loss": 1.3866, + "step": 7433 + }, + { + "epoch": 0.09660143947060228, + "grad_norm": 0.4682738780975342, + "learning_rate": 0.00018071719154135095, + "loss": 1.3896, + "step": 7434 + }, + { + "epoch": 0.09661443401451815, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.00018071459207943957, + "loss": 1.587, + "step": 7435 + }, + { + "epoch": 0.09662742855843402, + "grad_norm": 0.4439394772052765, + "learning_rate": 0.00018071199261752817, + "loss": 1.5158, + "step": 7436 + }, + { + "epoch": 0.0966404231023499, + "grad_norm": 0.3876210153102875, + "learning_rate": 0.0001807093931556168, + "loss": 1.3619, + "step": 7437 + }, + { + "epoch": 0.09665341764626577, + "grad_norm": 0.389203280210495, + "learning_rate": 0.0001807067936937054, + "loss": 1.4799, + "step": 7438 + }, + { + "epoch": 0.09666641219018164, + "grad_norm": 0.44843536615371704, + "learning_rate": 0.00018070419423179404, + "loss": 1.6466, + "step": 7439 + }, + { + "epoch": 0.09667940673409751, + "grad_norm": 0.5099455714225769, + "learning_rate": 0.00018070159476988264, + "loss": 1.511, + "step": 7440 + }, + { + "epoch": 0.09669240127801339, + "grad_norm": 0.46425628662109375, + "learning_rate": 0.00018069899530797126, + "loss": 1.35, + "step": 7441 + }, + { + "epoch": 0.09670539582192927, + "grad_norm": 0.35282662510871887, + "learning_rate": 0.0001806963958460599, + "loss": 1.3496, + "step": 7442 + }, + { + "epoch": 0.09671839036584515, + "grad_norm": 0.4124460816383362, + "learning_rate": 0.0001806937963841485, + "loss": 1.4794, + "step": 7443 + }, + { + "epoch": 0.09673138490976102, + "grad_norm": 0.3811986446380615, + "learning_rate": 0.0001806911969222371, + "loss": 1.4536, + "step": 7444 + }, + { + "epoch": 0.0967443794536769, + "grad_norm": 0.5014336705207825, + "learning_rate": 0.0001806885974603257, + "loss": 1.5964, + "step": 7445 + }, + { + "epoch": 0.09675737399759277, + "grad_norm": 0.38406094908714294, + "learning_rate": 0.00018068599799841436, + "loss": 1.3432, + "step": 7446 + }, + { + "epoch": 0.09677036854150864, + "grad_norm": 0.50011146068573, + "learning_rate": 0.00018068339853650296, + "loss": 1.6317, + "step": 7447 + }, + { + "epoch": 0.09678336308542451, + "grad_norm": 0.5149485468864441, + "learning_rate": 0.00018068079907459155, + "loss": 1.5929, + "step": 7448 + }, + { + "epoch": 0.09679635762934038, + "grad_norm": 0.29857826232910156, + "learning_rate": 0.00018067819961268018, + "loss": 1.3656, + "step": 7449 + }, + { + "epoch": 0.09680935217325626, + "grad_norm": 0.38642942905426025, + "learning_rate": 0.0001806756001507688, + "loss": 1.4969, + "step": 7450 + }, + { + "epoch": 0.09682234671717213, + "grad_norm": 0.4397829473018646, + "learning_rate": 0.00018067300068885743, + "loss": 1.4019, + "step": 7451 + }, + { + "epoch": 0.096835341261088, + "grad_norm": 0.3764725923538208, + "learning_rate": 0.00018067040122694603, + "loss": 1.297, + "step": 7452 + }, + { + "epoch": 0.09684833580500388, + "grad_norm": 0.41457849740982056, + "learning_rate": 0.00018066780176503465, + "loss": 1.5439, + "step": 7453 + }, + { + "epoch": 0.09686133034891975, + "grad_norm": 0.3851878345012665, + "learning_rate": 0.00018066520230312327, + "loss": 1.6111, + "step": 7454 + }, + { + "epoch": 0.09687432489283562, + "grad_norm": 0.5073195099830627, + "learning_rate": 0.00018066260284121187, + "loss": 1.6866, + "step": 7455 + }, + { + "epoch": 0.0968873194367515, + "grad_norm": 0.3910175561904907, + "learning_rate": 0.0001806600033793005, + "loss": 1.4723, + "step": 7456 + }, + { + "epoch": 0.09690031398066737, + "grad_norm": 0.3264296054840088, + "learning_rate": 0.0001806574039173891, + "loss": 1.2507, + "step": 7457 + }, + { + "epoch": 0.09691330852458324, + "grad_norm": 0.39491769671440125, + "learning_rate": 0.00018065480445547775, + "loss": 1.5712, + "step": 7458 + }, + { + "epoch": 0.09692630306849911, + "grad_norm": 0.3605521023273468, + "learning_rate": 0.00018065220499356634, + "loss": 1.4412, + "step": 7459 + }, + { + "epoch": 0.09693929761241499, + "grad_norm": 0.40367692708969116, + "learning_rate": 0.00018064960553165494, + "loss": 1.4422, + "step": 7460 + }, + { + "epoch": 0.09695229215633086, + "grad_norm": 0.3287511467933655, + "learning_rate": 0.00018064700606974356, + "loss": 1.5895, + "step": 7461 + }, + { + "epoch": 0.09696528670024673, + "grad_norm": 0.49243098497390747, + "learning_rate": 0.0001806444066078322, + "loss": 1.438, + "step": 7462 + }, + { + "epoch": 0.0969782812441626, + "grad_norm": 0.4011189341545105, + "learning_rate": 0.00018064180714592081, + "loss": 1.3119, + "step": 7463 + }, + { + "epoch": 0.09699127578807848, + "grad_norm": 0.3325624465942383, + "learning_rate": 0.0001806392076840094, + "loss": 1.3323, + "step": 7464 + }, + { + "epoch": 0.09700427033199435, + "grad_norm": 0.4215577244758606, + "learning_rate": 0.00018063660822209804, + "loss": 1.3532, + "step": 7465 + }, + { + "epoch": 0.09701726487591023, + "grad_norm": 0.3219201862812042, + "learning_rate": 0.00018063400876018666, + "loss": 1.3114, + "step": 7466 + }, + { + "epoch": 0.0970302594198261, + "grad_norm": 0.481382817029953, + "learning_rate": 0.00018063140929827526, + "loss": 1.5181, + "step": 7467 + }, + { + "epoch": 0.09704325396374197, + "grad_norm": 0.40177300572395325, + "learning_rate": 0.00018062880983636388, + "loss": 1.1998, + "step": 7468 + }, + { + "epoch": 0.09705624850765784, + "grad_norm": 0.41246241331100464, + "learning_rate": 0.00018062621037445248, + "loss": 1.29, + "step": 7469 + }, + { + "epoch": 0.09706924305157372, + "grad_norm": 0.37534165382385254, + "learning_rate": 0.00018062361091254113, + "loss": 1.2805, + "step": 7470 + }, + { + "epoch": 0.09708223759548959, + "grad_norm": 0.3613832890987396, + "learning_rate": 0.00018062101145062973, + "loss": 1.5733, + "step": 7471 + }, + { + "epoch": 0.09709523213940546, + "grad_norm": 0.43854978680610657, + "learning_rate": 0.00018061841198871833, + "loss": 1.3541, + "step": 7472 + }, + { + "epoch": 0.09710822668332134, + "grad_norm": 0.40474042296409607, + "learning_rate": 0.00018061581252680695, + "loss": 1.3872, + "step": 7473 + }, + { + "epoch": 0.09712122122723721, + "grad_norm": 0.37860623002052307, + "learning_rate": 0.00018061321306489557, + "loss": 1.4542, + "step": 7474 + }, + { + "epoch": 0.09713421577115308, + "grad_norm": 0.42968985438346863, + "learning_rate": 0.0001806106136029842, + "loss": 1.4972, + "step": 7475 + }, + { + "epoch": 0.09714721031506895, + "grad_norm": 0.32527509331703186, + "learning_rate": 0.0001806080141410728, + "loss": 1.4433, + "step": 7476 + }, + { + "epoch": 0.09716020485898483, + "grad_norm": 0.49563100934028625, + "learning_rate": 0.00018060541467916142, + "loss": 1.4954, + "step": 7477 + }, + { + "epoch": 0.0971731994029007, + "grad_norm": 0.4498002529144287, + "learning_rate": 0.00018060281521725005, + "loss": 1.415, + "step": 7478 + }, + { + "epoch": 0.09718619394681657, + "grad_norm": 0.41119900345802307, + "learning_rate": 0.00018060021575533864, + "loss": 1.4771, + "step": 7479 + }, + { + "epoch": 0.09719918849073246, + "grad_norm": 0.3612847924232483, + "learning_rate": 0.00018059761629342727, + "loss": 1.4699, + "step": 7480 + }, + { + "epoch": 0.09721218303464833, + "grad_norm": 0.38155508041381836, + "learning_rate": 0.0001805950168315159, + "loss": 1.2713, + "step": 7481 + }, + { + "epoch": 0.0972251775785642, + "grad_norm": 0.40645986795425415, + "learning_rate": 0.00018059241736960452, + "loss": 1.5713, + "step": 7482 + }, + { + "epoch": 0.09723817212248008, + "grad_norm": 0.42810964584350586, + "learning_rate": 0.0001805898179076931, + "loss": 1.458, + "step": 7483 + }, + { + "epoch": 0.09725116666639595, + "grad_norm": 0.4785873293876648, + "learning_rate": 0.00018058721844578174, + "loss": 1.4083, + "step": 7484 + }, + { + "epoch": 0.09726416121031183, + "grad_norm": 0.3351441025733948, + "learning_rate": 0.00018058461898387036, + "loss": 1.4183, + "step": 7485 + }, + { + "epoch": 0.0972771557542277, + "grad_norm": 0.3357161581516266, + "learning_rate": 0.00018058201952195896, + "loss": 1.3946, + "step": 7486 + }, + { + "epoch": 0.09729015029814357, + "grad_norm": 0.3906060755252838, + "learning_rate": 0.00018057942006004758, + "loss": 1.4933, + "step": 7487 + }, + { + "epoch": 0.09730314484205944, + "grad_norm": 0.3968433439731598, + "learning_rate": 0.00018057682059813618, + "loss": 1.4347, + "step": 7488 + }, + { + "epoch": 0.09731613938597532, + "grad_norm": 0.3303782343864441, + "learning_rate": 0.0001805742211362248, + "loss": 1.2685, + "step": 7489 + }, + { + "epoch": 0.09732913392989119, + "grad_norm": 0.3775410056114197, + "learning_rate": 0.00018057162167431343, + "loss": 1.3588, + "step": 7490 + }, + { + "epoch": 0.09734212847380706, + "grad_norm": 0.3607504665851593, + "learning_rate": 0.00018056902221240203, + "loss": 1.4946, + "step": 7491 + }, + { + "epoch": 0.09735512301772294, + "grad_norm": 0.41206371784210205, + "learning_rate": 0.00018056642275049065, + "loss": 1.4833, + "step": 7492 + }, + { + "epoch": 0.09736811756163881, + "grad_norm": 0.37164849042892456, + "learning_rate": 0.00018056382328857928, + "loss": 1.6235, + "step": 7493 + }, + { + "epoch": 0.09738111210555468, + "grad_norm": 0.4014964997768402, + "learning_rate": 0.0001805612238266679, + "loss": 1.3329, + "step": 7494 + }, + { + "epoch": 0.09739410664947055, + "grad_norm": 0.3379157781600952, + "learning_rate": 0.0001805586243647565, + "loss": 1.3853, + "step": 7495 + }, + { + "epoch": 0.09740710119338643, + "grad_norm": 0.37434065341949463, + "learning_rate": 0.00018055602490284512, + "loss": 1.3878, + "step": 7496 + }, + { + "epoch": 0.0974200957373023, + "grad_norm": 0.5373088121414185, + "learning_rate": 0.00018055342544093375, + "loss": 1.5024, + "step": 7497 + }, + { + "epoch": 0.09743309028121817, + "grad_norm": 0.3984873592853546, + "learning_rate": 0.00018055082597902235, + "loss": 1.5179, + "step": 7498 + }, + { + "epoch": 0.09744608482513405, + "grad_norm": 0.4356929659843445, + "learning_rate": 0.00018054822651711097, + "loss": 1.5313, + "step": 7499 + }, + { + "epoch": 0.09745907936904992, + "grad_norm": 0.39235201478004456, + "learning_rate": 0.00018054562705519957, + "loss": 1.4062, + "step": 7500 + }, + { + "epoch": 0.09747207391296579, + "grad_norm": 0.38887616991996765, + "learning_rate": 0.00018054302759328822, + "loss": 1.625, + "step": 7501 + }, + { + "epoch": 0.09748506845688167, + "grad_norm": 0.3356170654296875, + "learning_rate": 0.00018054042813137682, + "loss": 1.4263, + "step": 7502 + }, + { + "epoch": 0.09749806300079754, + "grad_norm": 0.4639907777309418, + "learning_rate": 0.0001805378286694654, + "loss": 1.4609, + "step": 7503 + }, + { + "epoch": 0.09751105754471341, + "grad_norm": 0.39358067512512207, + "learning_rate": 0.00018053522920755404, + "loss": 1.411, + "step": 7504 + }, + { + "epoch": 0.09752405208862928, + "grad_norm": 0.40222474932670593, + "learning_rate": 0.00018053262974564266, + "loss": 1.3899, + "step": 7505 + }, + { + "epoch": 0.09753704663254516, + "grad_norm": 0.4041571319103241, + "learning_rate": 0.0001805300302837313, + "loss": 1.5465, + "step": 7506 + }, + { + "epoch": 0.09755004117646103, + "grad_norm": 0.3240668475627899, + "learning_rate": 0.00018052743082181988, + "loss": 1.1432, + "step": 7507 + }, + { + "epoch": 0.0975630357203769, + "grad_norm": 0.3331120014190674, + "learning_rate": 0.0001805248313599085, + "loss": 1.5669, + "step": 7508 + }, + { + "epoch": 0.09757603026429278, + "grad_norm": 0.5190784931182861, + "learning_rate": 0.00018052223189799713, + "loss": 1.538, + "step": 7509 + }, + { + "epoch": 0.09758902480820865, + "grad_norm": 0.34999462962150574, + "learning_rate": 0.00018051963243608573, + "loss": 1.1415, + "step": 7510 + }, + { + "epoch": 0.09760201935212452, + "grad_norm": 0.4424222409725189, + "learning_rate": 0.00018051703297417436, + "loss": 1.3399, + "step": 7511 + }, + { + "epoch": 0.0976150138960404, + "grad_norm": 0.40078315138816833, + "learning_rate": 0.00018051443351226295, + "loss": 1.5825, + "step": 7512 + }, + { + "epoch": 0.09762800843995627, + "grad_norm": 0.43246933817863464, + "learning_rate": 0.0001805118340503516, + "loss": 1.4266, + "step": 7513 + }, + { + "epoch": 0.09764100298387214, + "grad_norm": 0.3761407136917114, + "learning_rate": 0.0001805092345884402, + "loss": 1.4836, + "step": 7514 + }, + { + "epoch": 0.09765399752778801, + "grad_norm": 0.3609583377838135, + "learning_rate": 0.0001805066351265288, + "loss": 1.3117, + "step": 7515 + }, + { + "epoch": 0.09766699207170389, + "grad_norm": 0.47262704372406006, + "learning_rate": 0.00018050403566461745, + "loss": 1.4304, + "step": 7516 + }, + { + "epoch": 0.09767998661561976, + "grad_norm": 0.3673385977745056, + "learning_rate": 0.00018050143620270605, + "loss": 1.4625, + "step": 7517 + }, + { + "epoch": 0.09769298115953565, + "grad_norm": 0.42160317301750183, + "learning_rate": 0.00018049883674079467, + "loss": 1.3847, + "step": 7518 + }, + { + "epoch": 0.09770597570345152, + "grad_norm": 0.3719385266304016, + "learning_rate": 0.00018049623727888327, + "loss": 1.3846, + "step": 7519 + }, + { + "epoch": 0.09771897024736739, + "grad_norm": 0.39151620864868164, + "learning_rate": 0.0001804936378169719, + "loss": 1.3828, + "step": 7520 + }, + { + "epoch": 0.09773196479128327, + "grad_norm": 0.2977345585823059, + "learning_rate": 0.00018049103835506052, + "loss": 1.2172, + "step": 7521 + }, + { + "epoch": 0.09774495933519914, + "grad_norm": 0.4201314449310303, + "learning_rate": 0.00018048843889314912, + "loss": 1.3374, + "step": 7522 + }, + { + "epoch": 0.09775795387911501, + "grad_norm": 0.38257303833961487, + "learning_rate": 0.00018048583943123774, + "loss": 1.549, + "step": 7523 + }, + { + "epoch": 0.09777094842303088, + "grad_norm": 0.4054087698459625, + "learning_rate": 0.00018048323996932637, + "loss": 1.4241, + "step": 7524 + }, + { + "epoch": 0.09778394296694676, + "grad_norm": 0.360950231552124, + "learning_rate": 0.000180480640507415, + "loss": 1.2819, + "step": 7525 + }, + { + "epoch": 0.09779693751086263, + "grad_norm": 0.35810574889183044, + "learning_rate": 0.0001804780410455036, + "loss": 1.3003, + "step": 7526 + }, + { + "epoch": 0.0978099320547785, + "grad_norm": 0.3271980285644531, + "learning_rate": 0.00018047544158359218, + "loss": 1.4801, + "step": 7527 + }, + { + "epoch": 0.09782292659869438, + "grad_norm": 0.47151824831962585, + "learning_rate": 0.00018047284212168084, + "loss": 1.5772, + "step": 7528 + }, + { + "epoch": 0.09783592114261025, + "grad_norm": 0.28331610560417175, + "learning_rate": 0.00018047024265976943, + "loss": 1.4071, + "step": 7529 + }, + { + "epoch": 0.09784891568652612, + "grad_norm": 0.3555101752281189, + "learning_rate": 0.00018046764319785806, + "loss": 1.4646, + "step": 7530 + }, + { + "epoch": 0.097861910230442, + "grad_norm": 0.3308420181274414, + "learning_rate": 0.00018046504373594666, + "loss": 1.3502, + "step": 7531 + }, + { + "epoch": 0.09787490477435787, + "grad_norm": 0.2997888922691345, + "learning_rate": 0.00018046244427403528, + "loss": 1.2462, + "step": 7532 + }, + { + "epoch": 0.09788789931827374, + "grad_norm": 0.35908517241477966, + "learning_rate": 0.0001804598448121239, + "loss": 1.3868, + "step": 7533 + }, + { + "epoch": 0.09790089386218961, + "grad_norm": 0.35377171635627747, + "learning_rate": 0.0001804572453502125, + "loss": 1.4193, + "step": 7534 + }, + { + "epoch": 0.09791388840610549, + "grad_norm": 0.39672043919563293, + "learning_rate": 0.00018045464588830113, + "loss": 1.2546, + "step": 7535 + }, + { + "epoch": 0.09792688295002136, + "grad_norm": 0.3332906663417816, + "learning_rate": 0.00018045204642638975, + "loss": 1.2704, + "step": 7536 + }, + { + "epoch": 0.09793987749393723, + "grad_norm": 0.3438867926597595, + "learning_rate": 0.00018044944696447837, + "loss": 1.4203, + "step": 7537 + }, + { + "epoch": 0.0979528720378531, + "grad_norm": 0.31091347336769104, + "learning_rate": 0.00018044684750256697, + "loss": 1.2197, + "step": 7538 + }, + { + "epoch": 0.09796586658176898, + "grad_norm": 0.42031732201576233, + "learning_rate": 0.0001804442480406556, + "loss": 1.4666, + "step": 7539 + }, + { + "epoch": 0.09797886112568485, + "grad_norm": 0.3627092242240906, + "learning_rate": 0.00018044164857874422, + "loss": 1.4846, + "step": 7540 + }, + { + "epoch": 0.09799185566960072, + "grad_norm": 0.36253103613853455, + "learning_rate": 0.00018043904911683282, + "loss": 1.5323, + "step": 7541 + }, + { + "epoch": 0.0980048502135166, + "grad_norm": 0.3460425138473511, + "learning_rate": 0.00018043644965492144, + "loss": 1.5784, + "step": 7542 + }, + { + "epoch": 0.09801784475743247, + "grad_norm": 0.4711621105670929, + "learning_rate": 0.00018043385019301004, + "loss": 1.4127, + "step": 7543 + }, + { + "epoch": 0.09803083930134834, + "grad_norm": 0.41815438866615295, + "learning_rate": 0.00018043125073109866, + "loss": 1.4559, + "step": 7544 + }, + { + "epoch": 0.09804383384526422, + "grad_norm": 0.44216975569725037, + "learning_rate": 0.0001804286512691873, + "loss": 1.4986, + "step": 7545 + }, + { + "epoch": 0.09805682838918009, + "grad_norm": 0.3501591980457306, + "learning_rate": 0.0001804260518072759, + "loss": 1.5426, + "step": 7546 + }, + { + "epoch": 0.09806982293309596, + "grad_norm": 0.4122762084007263, + "learning_rate": 0.0001804234523453645, + "loss": 1.5177, + "step": 7547 + }, + { + "epoch": 0.09808281747701184, + "grad_norm": 0.3824808597564697, + "learning_rate": 0.00018042085288345314, + "loss": 1.6754, + "step": 7548 + }, + { + "epoch": 0.09809581202092771, + "grad_norm": 0.4453409016132355, + "learning_rate": 0.00018041825342154176, + "loss": 1.4781, + "step": 7549 + }, + { + "epoch": 0.09810880656484358, + "grad_norm": 0.39347773790359497, + "learning_rate": 0.00018041565395963036, + "loss": 1.433, + "step": 7550 + }, + { + "epoch": 0.09812180110875945, + "grad_norm": 0.3569260537624359, + "learning_rate": 0.00018041305449771898, + "loss": 1.2342, + "step": 7551 + }, + { + "epoch": 0.09813479565267533, + "grad_norm": 0.3456551134586334, + "learning_rate": 0.0001804104550358076, + "loss": 1.5651, + "step": 7552 + }, + { + "epoch": 0.0981477901965912, + "grad_norm": 0.3950441777706146, + "learning_rate": 0.0001804078555738962, + "loss": 1.3125, + "step": 7553 + }, + { + "epoch": 0.09816078474050707, + "grad_norm": 0.5004082918167114, + "learning_rate": 0.00018040525611198483, + "loss": 1.6303, + "step": 7554 + }, + { + "epoch": 0.09817377928442295, + "grad_norm": 0.3228236436843872, + "learning_rate": 0.00018040265665007345, + "loss": 1.2768, + "step": 7555 + }, + { + "epoch": 0.09818677382833883, + "grad_norm": 0.21827465295791626, + "learning_rate": 0.00018040005718816205, + "loss": 1.3292, + "step": 7556 + }, + { + "epoch": 0.0981997683722547, + "grad_norm": 0.32572606205940247, + "learning_rate": 0.00018039745772625067, + "loss": 1.5443, + "step": 7557 + }, + { + "epoch": 0.09821276291617058, + "grad_norm": 0.446177214384079, + "learning_rate": 0.00018039485826433927, + "loss": 1.2286, + "step": 7558 + }, + { + "epoch": 0.09822575746008645, + "grad_norm": 0.35685598850250244, + "learning_rate": 0.00018039225880242792, + "loss": 1.355, + "step": 7559 + }, + { + "epoch": 0.09823875200400232, + "grad_norm": 0.44513094425201416, + "learning_rate": 0.00018038965934051652, + "loss": 1.5031, + "step": 7560 + }, + { + "epoch": 0.0982517465479182, + "grad_norm": 0.47552233934402466, + "learning_rate": 0.00018038705987860515, + "loss": 1.3778, + "step": 7561 + }, + { + "epoch": 0.09826474109183407, + "grad_norm": 0.4690370559692383, + "learning_rate": 0.00018038446041669374, + "loss": 1.5849, + "step": 7562 + }, + { + "epoch": 0.09827773563574994, + "grad_norm": 0.35861924290657043, + "learning_rate": 0.00018038186095478237, + "loss": 1.5092, + "step": 7563 + }, + { + "epoch": 0.09829073017966582, + "grad_norm": 0.35612034797668457, + "learning_rate": 0.000180379261492871, + "loss": 1.2118, + "step": 7564 + }, + { + "epoch": 0.09830372472358169, + "grad_norm": 0.4245584309101105, + "learning_rate": 0.0001803766620309596, + "loss": 1.7199, + "step": 7565 + }, + { + "epoch": 0.09831671926749756, + "grad_norm": 0.2901960015296936, + "learning_rate": 0.00018037406256904821, + "loss": 1.3535, + "step": 7566 + }, + { + "epoch": 0.09832971381141344, + "grad_norm": 0.4192603826522827, + "learning_rate": 0.00018037146310713684, + "loss": 1.4169, + "step": 7567 + }, + { + "epoch": 0.09834270835532931, + "grad_norm": 0.44621896743774414, + "learning_rate": 0.00018036886364522546, + "loss": 1.4539, + "step": 7568 + }, + { + "epoch": 0.09835570289924518, + "grad_norm": 0.34592095017433167, + "learning_rate": 0.00018036626418331406, + "loss": 1.4665, + "step": 7569 + }, + { + "epoch": 0.09836869744316105, + "grad_norm": 0.32394349575042725, + "learning_rate": 0.00018036366472140266, + "loss": 1.4129, + "step": 7570 + }, + { + "epoch": 0.09838169198707693, + "grad_norm": 0.43621963262557983, + "learning_rate": 0.0001803610652594913, + "loss": 1.5761, + "step": 7571 + }, + { + "epoch": 0.0983946865309928, + "grad_norm": 0.3277306854724884, + "learning_rate": 0.0001803584657975799, + "loss": 1.6591, + "step": 7572 + }, + { + "epoch": 0.09840768107490867, + "grad_norm": 0.37760117650032043, + "learning_rate": 0.00018035586633566853, + "loss": 1.5335, + "step": 7573 + }, + { + "epoch": 0.09842067561882455, + "grad_norm": 0.3746792674064636, + "learning_rate": 0.00018035326687375713, + "loss": 1.5167, + "step": 7574 + }, + { + "epoch": 0.09843367016274042, + "grad_norm": 0.38139402866363525, + "learning_rate": 0.00018035066741184575, + "loss": 1.4593, + "step": 7575 + }, + { + "epoch": 0.09844666470665629, + "grad_norm": 0.3655568063259125, + "learning_rate": 0.00018034806794993438, + "loss": 1.4886, + "step": 7576 + }, + { + "epoch": 0.09845965925057217, + "grad_norm": 0.37853625416755676, + "learning_rate": 0.00018034546848802297, + "loss": 1.3427, + "step": 7577 + }, + { + "epoch": 0.09847265379448804, + "grad_norm": 0.390889048576355, + "learning_rate": 0.0001803428690261116, + "loss": 1.6101, + "step": 7578 + }, + { + "epoch": 0.09848564833840391, + "grad_norm": 0.9414922595024109, + "learning_rate": 0.00018034026956420022, + "loss": 1.4874, + "step": 7579 + }, + { + "epoch": 0.09849864288231978, + "grad_norm": 0.42668992280960083, + "learning_rate": 0.00018033767010228885, + "loss": 1.4178, + "step": 7580 + }, + { + "epoch": 0.09851163742623566, + "grad_norm": 0.3904257118701935, + "learning_rate": 0.00018033507064037745, + "loss": 1.371, + "step": 7581 + }, + { + "epoch": 0.09852463197015153, + "grad_norm": 0.4323786497116089, + "learning_rate": 0.00018033247117846604, + "loss": 1.4144, + "step": 7582 + }, + { + "epoch": 0.0985376265140674, + "grad_norm": 0.40542373061180115, + "learning_rate": 0.0001803298717165547, + "loss": 1.4599, + "step": 7583 + }, + { + "epoch": 0.09855062105798328, + "grad_norm": 0.37801429629325867, + "learning_rate": 0.0001803272722546433, + "loss": 1.3522, + "step": 7584 + }, + { + "epoch": 0.09856361560189915, + "grad_norm": 0.35894036293029785, + "learning_rate": 0.00018032467279273192, + "loss": 1.385, + "step": 7585 + }, + { + "epoch": 0.09857661014581502, + "grad_norm": 0.44921109080314636, + "learning_rate": 0.00018032207333082051, + "loss": 1.4985, + "step": 7586 + }, + { + "epoch": 0.0985896046897309, + "grad_norm": 0.4041857421398163, + "learning_rate": 0.00018031947386890914, + "loss": 1.5342, + "step": 7587 + }, + { + "epoch": 0.09860259923364677, + "grad_norm": 0.3700587749481201, + "learning_rate": 0.00018031687440699776, + "loss": 1.3321, + "step": 7588 + }, + { + "epoch": 0.09861559377756264, + "grad_norm": 0.3191107213497162, + "learning_rate": 0.00018031427494508636, + "loss": 1.3033, + "step": 7589 + }, + { + "epoch": 0.09862858832147851, + "grad_norm": 0.2953815162181854, + "learning_rate": 0.000180311675483175, + "loss": 1.2716, + "step": 7590 + }, + { + "epoch": 0.09864158286539439, + "grad_norm": 0.44157710671424866, + "learning_rate": 0.0001803090760212636, + "loss": 1.6003, + "step": 7591 + }, + { + "epoch": 0.09865457740931026, + "grad_norm": 0.35140421986579895, + "learning_rate": 0.00018030647655935223, + "loss": 1.4758, + "step": 7592 + }, + { + "epoch": 0.09866757195322613, + "grad_norm": 0.4715821444988251, + "learning_rate": 0.00018030387709744083, + "loss": 1.4399, + "step": 7593 + }, + { + "epoch": 0.09868056649714202, + "grad_norm": 0.3662639260292053, + "learning_rate": 0.00018030127763552946, + "loss": 1.4089, + "step": 7594 + }, + { + "epoch": 0.09869356104105789, + "grad_norm": 0.4767250418663025, + "learning_rate": 0.00018029867817361808, + "loss": 1.3589, + "step": 7595 + }, + { + "epoch": 0.09870655558497377, + "grad_norm": 0.49728071689605713, + "learning_rate": 0.00018029607871170668, + "loss": 1.4808, + "step": 7596 + }, + { + "epoch": 0.09871955012888964, + "grad_norm": 0.32903313636779785, + "learning_rate": 0.0001802934792497953, + "loss": 1.2998, + "step": 7597 + }, + { + "epoch": 0.09873254467280551, + "grad_norm": 0.34764564037323, + "learning_rate": 0.00018029087978788393, + "loss": 1.4103, + "step": 7598 + }, + { + "epoch": 0.09874553921672138, + "grad_norm": 0.40784770250320435, + "learning_rate": 0.00018028828032597252, + "loss": 1.5499, + "step": 7599 + }, + { + "epoch": 0.09875853376063726, + "grad_norm": 0.3321189284324646, + "learning_rate": 0.00018028568086406115, + "loss": 1.3849, + "step": 7600 + }, + { + "epoch": 0.09877152830455313, + "grad_norm": 0.43755555152893066, + "learning_rate": 0.00018028308140214975, + "loss": 1.2838, + "step": 7601 + }, + { + "epoch": 0.098784522848469, + "grad_norm": 0.3918802738189697, + "learning_rate": 0.0001802804819402384, + "loss": 1.4765, + "step": 7602 + }, + { + "epoch": 0.09879751739238488, + "grad_norm": 0.4191012382507324, + "learning_rate": 0.000180277882478327, + "loss": 1.4286, + "step": 7603 + }, + { + "epoch": 0.09881051193630075, + "grad_norm": 0.5088168978691101, + "learning_rate": 0.00018027528301641562, + "loss": 1.6129, + "step": 7604 + }, + { + "epoch": 0.09882350648021662, + "grad_norm": 0.35823875665664673, + "learning_rate": 0.00018027268355450422, + "loss": 1.5607, + "step": 7605 + }, + { + "epoch": 0.0988365010241325, + "grad_norm": 0.39868295192718506, + "learning_rate": 0.00018027008409259284, + "loss": 1.4932, + "step": 7606 + }, + { + "epoch": 0.09884949556804837, + "grad_norm": 0.35696572065353394, + "learning_rate": 0.00018026748463068147, + "loss": 1.5195, + "step": 7607 + }, + { + "epoch": 0.09886249011196424, + "grad_norm": 0.4137752652168274, + "learning_rate": 0.00018026488516877006, + "loss": 1.6511, + "step": 7608 + }, + { + "epoch": 0.09887548465588011, + "grad_norm": 0.37321752309799194, + "learning_rate": 0.0001802622857068587, + "loss": 1.3618, + "step": 7609 + }, + { + "epoch": 0.09888847919979599, + "grad_norm": 0.49691078066825867, + "learning_rate": 0.0001802596862449473, + "loss": 1.5592, + "step": 7610 + }, + { + "epoch": 0.09890147374371186, + "grad_norm": 0.4516826868057251, + "learning_rate": 0.0001802570867830359, + "loss": 1.4276, + "step": 7611 + }, + { + "epoch": 0.09891446828762773, + "grad_norm": 0.3362126648426056, + "learning_rate": 0.00018025448732112453, + "loss": 1.4064, + "step": 7612 + }, + { + "epoch": 0.0989274628315436, + "grad_norm": 0.34383925795555115, + "learning_rate": 0.00018025188785921313, + "loss": 1.3941, + "step": 7613 + }, + { + "epoch": 0.09894045737545948, + "grad_norm": 0.3464807868003845, + "learning_rate": 0.00018024928839730178, + "loss": 1.1543, + "step": 7614 + }, + { + "epoch": 0.09895345191937535, + "grad_norm": 0.39652219414711, + "learning_rate": 0.00018024668893539038, + "loss": 1.3663, + "step": 7615 + }, + { + "epoch": 0.09896644646329122, + "grad_norm": 0.4803014397621155, + "learning_rate": 0.000180244089473479, + "loss": 1.4493, + "step": 7616 + }, + { + "epoch": 0.0989794410072071, + "grad_norm": 0.37208932638168335, + "learning_rate": 0.0001802414900115676, + "loss": 1.5482, + "step": 7617 + }, + { + "epoch": 0.09899243555112297, + "grad_norm": 0.34949222207069397, + "learning_rate": 0.00018023889054965623, + "loss": 1.5523, + "step": 7618 + }, + { + "epoch": 0.09900543009503884, + "grad_norm": 0.3843694031238556, + "learning_rate": 0.00018023629108774485, + "loss": 1.5452, + "step": 7619 + }, + { + "epoch": 0.09901842463895472, + "grad_norm": 0.4375974237918854, + "learning_rate": 0.00018023369162583345, + "loss": 1.4226, + "step": 7620 + }, + { + "epoch": 0.09903141918287059, + "grad_norm": 0.42773744463920593, + "learning_rate": 0.00018023109216392207, + "loss": 1.4304, + "step": 7621 + }, + { + "epoch": 0.09904441372678646, + "grad_norm": 0.42004647850990295, + "learning_rate": 0.0001802284927020107, + "loss": 1.4345, + "step": 7622 + }, + { + "epoch": 0.09905740827070234, + "grad_norm": 0.6433254480361938, + "learning_rate": 0.00018022589324009932, + "loss": 1.3391, + "step": 7623 + }, + { + "epoch": 0.09907040281461821, + "grad_norm": 0.358011394739151, + "learning_rate": 0.00018022329377818792, + "loss": 1.4138, + "step": 7624 + }, + { + "epoch": 0.09908339735853408, + "grad_norm": 0.4215996265411377, + "learning_rate": 0.00018022069431627654, + "loss": 1.2198, + "step": 7625 + }, + { + "epoch": 0.09909639190244995, + "grad_norm": 0.33352354168891907, + "learning_rate": 0.00018021809485436517, + "loss": 1.3744, + "step": 7626 + }, + { + "epoch": 0.09910938644636583, + "grad_norm": 0.4371497333049774, + "learning_rate": 0.00018021549539245377, + "loss": 1.4866, + "step": 7627 + }, + { + "epoch": 0.0991223809902817, + "grad_norm": 0.2593614459037781, + "learning_rate": 0.0001802128959305424, + "loss": 1.3998, + "step": 7628 + }, + { + "epoch": 0.09913537553419757, + "grad_norm": 0.42470139265060425, + "learning_rate": 0.00018021029646863101, + "loss": 1.5663, + "step": 7629 + }, + { + "epoch": 0.09914837007811345, + "grad_norm": 0.4210742115974426, + "learning_rate": 0.0001802076970067196, + "loss": 1.5433, + "step": 7630 + }, + { + "epoch": 0.09916136462202932, + "grad_norm": 0.27131569385528564, + "learning_rate": 0.00018020509754480824, + "loss": 1.2315, + "step": 7631 + }, + { + "epoch": 0.0991743591659452, + "grad_norm": 0.44708698987960815, + "learning_rate": 0.00018020249808289683, + "loss": 1.3469, + "step": 7632 + }, + { + "epoch": 0.09918735370986108, + "grad_norm": 0.42890846729278564, + "learning_rate": 0.00018019989862098549, + "loss": 1.4295, + "step": 7633 + }, + { + "epoch": 0.09920034825377695, + "grad_norm": 0.45680341124534607, + "learning_rate": 0.00018019729915907408, + "loss": 1.6792, + "step": 7634 + }, + { + "epoch": 0.09921334279769282, + "grad_norm": 0.3280755281448364, + "learning_rate": 0.0001801946996971627, + "loss": 1.3708, + "step": 7635 + }, + { + "epoch": 0.0992263373416087, + "grad_norm": 0.328364759683609, + "learning_rate": 0.0001801921002352513, + "loss": 1.3109, + "step": 7636 + }, + { + "epoch": 0.09923933188552457, + "grad_norm": 0.46605971455574036, + "learning_rate": 0.00018018950077333993, + "loss": 1.4123, + "step": 7637 + }, + { + "epoch": 0.09925232642944044, + "grad_norm": 0.5017272233963013, + "learning_rate": 0.00018018690131142855, + "loss": 1.4994, + "step": 7638 + }, + { + "epoch": 0.09926532097335632, + "grad_norm": 0.43509921431541443, + "learning_rate": 0.00018018430184951715, + "loss": 1.5142, + "step": 7639 + }, + { + "epoch": 0.09927831551727219, + "grad_norm": 0.29439496994018555, + "learning_rate": 0.00018018170238760578, + "loss": 1.5676, + "step": 7640 + }, + { + "epoch": 0.09929131006118806, + "grad_norm": 0.44283726811408997, + "learning_rate": 0.0001801791029256944, + "loss": 1.3522, + "step": 7641 + }, + { + "epoch": 0.09930430460510394, + "grad_norm": 0.3328530192375183, + "learning_rate": 0.000180176503463783, + "loss": 1.5117, + "step": 7642 + }, + { + "epoch": 0.09931729914901981, + "grad_norm": 0.4323212504386902, + "learning_rate": 0.00018017390400187162, + "loss": 1.3786, + "step": 7643 + }, + { + "epoch": 0.09933029369293568, + "grad_norm": 0.3416399657726288, + "learning_rate": 0.00018017130453996022, + "loss": 1.4117, + "step": 7644 + }, + { + "epoch": 0.09934328823685155, + "grad_norm": 0.42340588569641113, + "learning_rate": 0.00018016870507804887, + "loss": 1.3224, + "step": 7645 + }, + { + "epoch": 0.09935628278076743, + "grad_norm": 0.3885568082332611, + "learning_rate": 0.00018016610561613747, + "loss": 1.3456, + "step": 7646 + }, + { + "epoch": 0.0993692773246833, + "grad_norm": 0.41005587577819824, + "learning_rate": 0.0001801635061542261, + "loss": 1.4821, + "step": 7647 + }, + { + "epoch": 0.09938227186859917, + "grad_norm": 0.4532742500305176, + "learning_rate": 0.0001801609066923147, + "loss": 1.4531, + "step": 7648 + }, + { + "epoch": 0.09939526641251505, + "grad_norm": 0.48052161931991577, + "learning_rate": 0.00018015830723040331, + "loss": 1.4078, + "step": 7649 + }, + { + "epoch": 0.09940826095643092, + "grad_norm": 0.31460922956466675, + "learning_rate": 0.00018015570776849194, + "loss": 1.3792, + "step": 7650 + }, + { + "epoch": 0.09942125550034679, + "grad_norm": 0.37324583530426025, + "learning_rate": 0.00018015310830658054, + "loss": 1.2037, + "step": 7651 + }, + { + "epoch": 0.09943425004426266, + "grad_norm": 0.5292695760726929, + "learning_rate": 0.00018015050884466916, + "loss": 1.4458, + "step": 7652 + }, + { + "epoch": 0.09944724458817854, + "grad_norm": 0.3979688286781311, + "learning_rate": 0.00018014790938275779, + "loss": 1.4226, + "step": 7653 + }, + { + "epoch": 0.09946023913209441, + "grad_norm": 0.3596692681312561, + "learning_rate": 0.00018014530992084638, + "loss": 1.266, + "step": 7654 + }, + { + "epoch": 0.09947323367601028, + "grad_norm": 0.35680097341537476, + "learning_rate": 0.000180142710458935, + "loss": 1.561, + "step": 7655 + }, + { + "epoch": 0.09948622821992616, + "grad_norm": 0.366475373506546, + "learning_rate": 0.0001801401109970236, + "loss": 1.4271, + "step": 7656 + }, + { + "epoch": 0.09949922276384203, + "grad_norm": 0.3848341703414917, + "learning_rate": 0.00018013751153511226, + "loss": 1.3237, + "step": 7657 + }, + { + "epoch": 0.0995122173077579, + "grad_norm": 0.3629254698753357, + "learning_rate": 0.00018013491207320085, + "loss": 1.5101, + "step": 7658 + }, + { + "epoch": 0.09952521185167378, + "grad_norm": 0.41521772742271423, + "learning_rate": 0.00018013231261128948, + "loss": 1.3345, + "step": 7659 + }, + { + "epoch": 0.09953820639558965, + "grad_norm": 0.4822794497013092, + "learning_rate": 0.00018012971314937808, + "loss": 1.5778, + "step": 7660 + }, + { + "epoch": 0.09955120093950552, + "grad_norm": 0.3347254693508148, + "learning_rate": 0.0001801271136874667, + "loss": 1.5753, + "step": 7661 + }, + { + "epoch": 0.0995641954834214, + "grad_norm": 0.5010448694229126, + "learning_rate": 0.00018012451422555532, + "loss": 1.5201, + "step": 7662 + }, + { + "epoch": 0.09957719002733727, + "grad_norm": 0.40447068214416504, + "learning_rate": 0.00018012191476364392, + "loss": 1.5344, + "step": 7663 + }, + { + "epoch": 0.09959018457125314, + "grad_norm": 0.4594039022922516, + "learning_rate": 0.00018011931530173257, + "loss": 1.6741, + "step": 7664 + }, + { + "epoch": 0.09960317911516901, + "grad_norm": 0.3824000358581543, + "learning_rate": 0.00018011671583982117, + "loss": 1.3474, + "step": 7665 + }, + { + "epoch": 0.09961617365908489, + "grad_norm": 0.33877086639404297, + "learning_rate": 0.00018011411637790977, + "loss": 1.3428, + "step": 7666 + }, + { + "epoch": 0.09962916820300076, + "grad_norm": 0.3358142673969269, + "learning_rate": 0.0001801115169159984, + "loss": 1.395, + "step": 7667 + }, + { + "epoch": 0.09964216274691663, + "grad_norm": 0.46106573939323425, + "learning_rate": 0.00018010891745408702, + "loss": 1.4206, + "step": 7668 + }, + { + "epoch": 0.0996551572908325, + "grad_norm": 0.3157424330711365, + "learning_rate": 0.00018010631799217564, + "loss": 1.4562, + "step": 7669 + }, + { + "epoch": 0.09966815183474839, + "grad_norm": 0.3862575888633728, + "learning_rate": 0.00018010371853026424, + "loss": 1.4042, + "step": 7670 + }, + { + "epoch": 0.09968114637866426, + "grad_norm": 0.48718324303627014, + "learning_rate": 0.00018010111906835286, + "loss": 1.4858, + "step": 7671 + }, + { + "epoch": 0.09969414092258014, + "grad_norm": 0.37498462200164795, + "learning_rate": 0.0001800985196064415, + "loss": 1.6164, + "step": 7672 + }, + { + "epoch": 0.09970713546649601, + "grad_norm": 0.42233625054359436, + "learning_rate": 0.00018009592014453008, + "loss": 1.5249, + "step": 7673 + }, + { + "epoch": 0.09972013001041188, + "grad_norm": 0.3537615239620209, + "learning_rate": 0.0001800933206826187, + "loss": 1.4671, + "step": 7674 + }, + { + "epoch": 0.09973312455432776, + "grad_norm": 0.34899771213531494, + "learning_rate": 0.0001800907212207073, + "loss": 1.4183, + "step": 7675 + }, + { + "epoch": 0.09974611909824363, + "grad_norm": 0.5116575360298157, + "learning_rate": 0.00018008812175879596, + "loss": 1.313, + "step": 7676 + }, + { + "epoch": 0.0997591136421595, + "grad_norm": 0.3554348051548004, + "learning_rate": 0.00018008552229688456, + "loss": 1.3488, + "step": 7677 + }, + { + "epoch": 0.09977210818607538, + "grad_norm": 0.4460490345954895, + "learning_rate": 0.00018008292283497315, + "loss": 1.3481, + "step": 7678 + }, + { + "epoch": 0.09978510272999125, + "grad_norm": 0.3297146260738373, + "learning_rate": 0.00018008032337306178, + "loss": 1.4292, + "step": 7679 + }, + { + "epoch": 0.09979809727390712, + "grad_norm": 0.3345957100391388, + "learning_rate": 0.0001800777239111504, + "loss": 1.4054, + "step": 7680 + }, + { + "epoch": 0.099811091817823, + "grad_norm": 0.4210030436515808, + "learning_rate": 0.00018007512444923903, + "loss": 1.4803, + "step": 7681 + }, + { + "epoch": 0.09982408636173887, + "grad_norm": 0.46942102909088135, + "learning_rate": 0.00018007252498732762, + "loss": 1.4172, + "step": 7682 + }, + { + "epoch": 0.09983708090565474, + "grad_norm": 0.4845230281352997, + "learning_rate": 0.00018006992552541625, + "loss": 1.497, + "step": 7683 + }, + { + "epoch": 0.09985007544957061, + "grad_norm": 0.35429152846336365, + "learning_rate": 0.00018006732606350487, + "loss": 1.2605, + "step": 7684 + }, + { + "epoch": 0.09986306999348649, + "grad_norm": 0.4821487069129944, + "learning_rate": 0.00018006472660159347, + "loss": 1.4393, + "step": 7685 + }, + { + "epoch": 0.09987606453740236, + "grad_norm": 0.5426856279373169, + "learning_rate": 0.0001800621271396821, + "loss": 1.2874, + "step": 7686 + }, + { + "epoch": 0.09988905908131823, + "grad_norm": 0.35171741247177124, + "learning_rate": 0.0001800595276777707, + "loss": 1.3084, + "step": 7687 + }, + { + "epoch": 0.0999020536252341, + "grad_norm": 0.40299689769744873, + "learning_rate": 0.00018005692821585934, + "loss": 1.3285, + "step": 7688 + }, + { + "epoch": 0.09991504816914998, + "grad_norm": 0.4185236990451813, + "learning_rate": 0.00018005432875394794, + "loss": 1.4843, + "step": 7689 + }, + { + "epoch": 0.09992804271306585, + "grad_norm": 0.3998733162879944, + "learning_rate": 0.00018005172929203657, + "loss": 1.5653, + "step": 7690 + }, + { + "epoch": 0.09994103725698172, + "grad_norm": 0.40453624725341797, + "learning_rate": 0.00018004912983012516, + "loss": 1.1748, + "step": 7691 + }, + { + "epoch": 0.0999540318008976, + "grad_norm": 0.41360390186309814, + "learning_rate": 0.0001800465303682138, + "loss": 1.3347, + "step": 7692 + }, + { + "epoch": 0.09996702634481347, + "grad_norm": 0.4500105082988739, + "learning_rate": 0.0001800439309063024, + "loss": 1.6253, + "step": 7693 + }, + { + "epoch": 0.09998002088872934, + "grad_norm": 0.40081730484962463, + "learning_rate": 0.000180041331444391, + "loss": 1.4126, + "step": 7694 + }, + { + "epoch": 0.09999301543264522, + "grad_norm": 0.5991983413696289, + "learning_rate": 0.00018003873198247963, + "loss": 1.4585, + "step": 7695 + }, + { + "epoch": 0.10000600997656109, + "grad_norm": 0.40953150391578674, + "learning_rate": 0.00018003613252056826, + "loss": 1.4681, + "step": 7696 + }, + { + "epoch": 0.10001900452047696, + "grad_norm": 0.4214688241481781, + "learning_rate": 0.00018003353305865686, + "loss": 1.2906, + "step": 7697 + }, + { + "epoch": 0.10003199906439283, + "grad_norm": 0.38336676359176636, + "learning_rate": 0.00018003093359674548, + "loss": 1.4432, + "step": 7698 + }, + { + "epoch": 0.10004499360830871, + "grad_norm": 0.4267819821834564, + "learning_rate": 0.0001800283341348341, + "loss": 1.4618, + "step": 7699 + }, + { + "epoch": 0.10005798815222458, + "grad_norm": 0.3744755685329437, + "learning_rate": 0.00018002573467292273, + "loss": 1.4235, + "step": 7700 + }, + { + "epoch": 0.10007098269614045, + "grad_norm": 0.4582114517688751, + "learning_rate": 0.00018002313521101133, + "loss": 1.4422, + "step": 7701 + }, + { + "epoch": 0.10008397724005633, + "grad_norm": 0.4534589648246765, + "learning_rate": 0.00018002053574909995, + "loss": 1.4194, + "step": 7702 + }, + { + "epoch": 0.1000969717839722, + "grad_norm": 0.34885936975479126, + "learning_rate": 0.00018001793628718858, + "loss": 1.4276, + "step": 7703 + }, + { + "epoch": 0.10010996632788807, + "grad_norm": 0.3478038012981415, + "learning_rate": 0.00018001533682527717, + "loss": 1.3986, + "step": 7704 + }, + { + "epoch": 0.10012296087180395, + "grad_norm": 0.36455038189888, + "learning_rate": 0.0001800127373633658, + "loss": 1.5459, + "step": 7705 + }, + { + "epoch": 0.10013595541571982, + "grad_norm": 0.4637551009654999, + "learning_rate": 0.0001800101379014544, + "loss": 1.5051, + "step": 7706 + }, + { + "epoch": 0.10014894995963569, + "grad_norm": 0.4914877712726593, + "learning_rate": 0.00018000753843954305, + "loss": 1.333, + "step": 7707 + }, + { + "epoch": 0.10016194450355158, + "grad_norm": 0.39947807788848877, + "learning_rate": 0.00018000493897763164, + "loss": 1.5386, + "step": 7708 + }, + { + "epoch": 0.10017493904746745, + "grad_norm": 0.34416329860687256, + "learning_rate": 0.00018000233951572024, + "loss": 1.32, + "step": 7709 + }, + { + "epoch": 0.10018793359138332, + "grad_norm": 0.43152153491973877, + "learning_rate": 0.00017999974005380887, + "loss": 1.7124, + "step": 7710 + }, + { + "epoch": 0.1002009281352992, + "grad_norm": 0.31289172172546387, + "learning_rate": 0.0001799971405918975, + "loss": 1.4115, + "step": 7711 + }, + { + "epoch": 0.10021392267921507, + "grad_norm": 0.48261481523513794, + "learning_rate": 0.00017999454112998611, + "loss": 1.4367, + "step": 7712 + }, + { + "epoch": 0.10022691722313094, + "grad_norm": 0.3611449599266052, + "learning_rate": 0.0001799919416680747, + "loss": 1.4975, + "step": 7713 + }, + { + "epoch": 0.10023991176704682, + "grad_norm": 0.3715337812900543, + "learning_rate": 0.00017998934220616334, + "loss": 1.3302, + "step": 7714 + }, + { + "epoch": 0.10025290631096269, + "grad_norm": 0.35607150197029114, + "learning_rate": 0.00017998674274425196, + "loss": 1.3665, + "step": 7715 + }, + { + "epoch": 0.10026590085487856, + "grad_norm": 0.5352275371551514, + "learning_rate": 0.00017998414328234056, + "loss": 1.6504, + "step": 7716 + }, + { + "epoch": 0.10027889539879443, + "grad_norm": 0.34869760274887085, + "learning_rate": 0.00017998154382042918, + "loss": 1.411, + "step": 7717 + }, + { + "epoch": 0.10029188994271031, + "grad_norm": 0.3344050645828247, + "learning_rate": 0.00017997894435851778, + "loss": 1.2106, + "step": 7718 + }, + { + "epoch": 0.10030488448662618, + "grad_norm": 0.4052220284938812, + "learning_rate": 0.00017997634489660643, + "loss": 1.4318, + "step": 7719 + }, + { + "epoch": 0.10031787903054205, + "grad_norm": 0.36047059297561646, + "learning_rate": 0.00017997374543469503, + "loss": 1.3013, + "step": 7720 + }, + { + "epoch": 0.10033087357445793, + "grad_norm": 0.4065697491168976, + "learning_rate": 0.00017997114597278363, + "loss": 1.3914, + "step": 7721 + }, + { + "epoch": 0.1003438681183738, + "grad_norm": 0.3502679169178009, + "learning_rate": 0.00017996854651087225, + "loss": 1.3115, + "step": 7722 + }, + { + "epoch": 0.10035686266228967, + "grad_norm": 0.3656959533691406, + "learning_rate": 0.00017996594704896088, + "loss": 1.3683, + "step": 7723 + }, + { + "epoch": 0.10036985720620555, + "grad_norm": 0.41776686906814575, + "learning_rate": 0.0001799633475870495, + "loss": 1.5869, + "step": 7724 + }, + { + "epoch": 0.10038285175012142, + "grad_norm": 0.2601430416107178, + "learning_rate": 0.0001799607481251381, + "loss": 1.4407, + "step": 7725 + }, + { + "epoch": 0.10039584629403729, + "grad_norm": 0.41610637307167053, + "learning_rate": 0.00017995814866322672, + "loss": 1.3239, + "step": 7726 + }, + { + "epoch": 0.10040884083795316, + "grad_norm": 0.401491641998291, + "learning_rate": 0.00017995554920131535, + "loss": 1.5508, + "step": 7727 + }, + { + "epoch": 0.10042183538186904, + "grad_norm": 0.44771575927734375, + "learning_rate": 0.00017995294973940394, + "loss": 1.337, + "step": 7728 + }, + { + "epoch": 0.10043482992578491, + "grad_norm": 0.4742160141468048, + "learning_rate": 0.00017995035027749257, + "loss": 1.407, + "step": 7729 + }, + { + "epoch": 0.10044782446970078, + "grad_norm": 0.3995453417301178, + "learning_rate": 0.00017994775081558117, + "loss": 1.432, + "step": 7730 + }, + { + "epoch": 0.10046081901361666, + "grad_norm": 0.3786431550979614, + "learning_rate": 0.00017994515135366982, + "loss": 1.3565, + "step": 7731 + }, + { + "epoch": 0.10047381355753253, + "grad_norm": 0.3390199840068817, + "learning_rate": 0.00017994255189175841, + "loss": 1.3396, + "step": 7732 + }, + { + "epoch": 0.1004868081014484, + "grad_norm": 0.3517451882362366, + "learning_rate": 0.000179939952429847, + "loss": 1.4114, + "step": 7733 + }, + { + "epoch": 0.10049980264536428, + "grad_norm": 0.32465922832489014, + "learning_rate": 0.00017993735296793564, + "loss": 1.264, + "step": 7734 + }, + { + "epoch": 0.10051279718928015, + "grad_norm": 0.34513556957244873, + "learning_rate": 0.00017993475350602426, + "loss": 1.3132, + "step": 7735 + }, + { + "epoch": 0.10052579173319602, + "grad_norm": 0.3557755947113037, + "learning_rate": 0.00017993215404411289, + "loss": 1.4187, + "step": 7736 + }, + { + "epoch": 0.1005387862771119, + "grad_norm": 0.44158443808555603, + "learning_rate": 0.00017992955458220148, + "loss": 1.451, + "step": 7737 + }, + { + "epoch": 0.10055178082102777, + "grad_norm": 0.4244614243507385, + "learning_rate": 0.0001799269551202901, + "loss": 1.3511, + "step": 7738 + }, + { + "epoch": 0.10056477536494364, + "grad_norm": 0.3721020221710205, + "learning_rate": 0.00017992435565837873, + "loss": 1.3173, + "step": 7739 + }, + { + "epoch": 0.10057776990885951, + "grad_norm": 0.44494977593421936, + "learning_rate": 0.00017992175619646733, + "loss": 1.4004, + "step": 7740 + }, + { + "epoch": 0.10059076445277539, + "grad_norm": 0.41851723194122314, + "learning_rate": 0.00017991915673455595, + "loss": 1.3677, + "step": 7741 + }, + { + "epoch": 0.10060375899669126, + "grad_norm": 0.31408342719078064, + "learning_rate": 0.00017991655727264458, + "loss": 1.3433, + "step": 7742 + }, + { + "epoch": 0.10061675354060713, + "grad_norm": 0.48315465450286865, + "learning_rate": 0.0001799139578107332, + "loss": 1.4698, + "step": 7743 + }, + { + "epoch": 0.100629748084523, + "grad_norm": 0.42054128646850586, + "learning_rate": 0.0001799113583488218, + "loss": 1.5336, + "step": 7744 + }, + { + "epoch": 0.10064274262843888, + "grad_norm": 0.33167505264282227, + "learning_rate": 0.00017990875888691042, + "loss": 1.2733, + "step": 7745 + }, + { + "epoch": 0.10065573717235476, + "grad_norm": 0.46904489398002625, + "learning_rate": 0.00017990615942499905, + "loss": 1.5083, + "step": 7746 + }, + { + "epoch": 0.10066873171627064, + "grad_norm": 0.41800230741500854, + "learning_rate": 0.00017990355996308765, + "loss": 1.3772, + "step": 7747 + }, + { + "epoch": 0.10068172626018651, + "grad_norm": 0.3952338993549347, + "learning_rate": 0.00017990096050117627, + "loss": 1.44, + "step": 7748 + }, + { + "epoch": 0.10069472080410238, + "grad_norm": 0.42936819791793823, + "learning_rate": 0.00017989836103926487, + "loss": 1.3384, + "step": 7749 + }, + { + "epoch": 0.10070771534801826, + "grad_norm": 0.45824122428894043, + "learning_rate": 0.0001798957615773535, + "loss": 1.5126, + "step": 7750 + }, + { + "epoch": 0.10072070989193413, + "grad_norm": 0.36568549275398254, + "learning_rate": 0.00017989316211544212, + "loss": 1.4435, + "step": 7751 + }, + { + "epoch": 0.10073370443585, + "grad_norm": 0.2619154155254364, + "learning_rate": 0.00017989056265353071, + "loss": 1.4428, + "step": 7752 + }, + { + "epoch": 0.10074669897976588, + "grad_norm": 0.30440089106559753, + "learning_rate": 0.00017988796319161934, + "loss": 1.4661, + "step": 7753 + }, + { + "epoch": 0.10075969352368175, + "grad_norm": 0.3850434422492981, + "learning_rate": 0.00017988536372970796, + "loss": 1.5891, + "step": 7754 + }, + { + "epoch": 0.10077268806759762, + "grad_norm": 0.40767812728881836, + "learning_rate": 0.0001798827642677966, + "loss": 1.3217, + "step": 7755 + }, + { + "epoch": 0.1007856826115135, + "grad_norm": 0.3729136288166046, + "learning_rate": 0.00017988016480588519, + "loss": 1.3227, + "step": 7756 + }, + { + "epoch": 0.10079867715542937, + "grad_norm": 0.40594127774238586, + "learning_rate": 0.0001798775653439738, + "loss": 1.5095, + "step": 7757 + }, + { + "epoch": 0.10081167169934524, + "grad_norm": 0.39236366748809814, + "learning_rate": 0.00017987496588206243, + "loss": 1.4879, + "step": 7758 + }, + { + "epoch": 0.10082466624326111, + "grad_norm": 0.40335825085639954, + "learning_rate": 0.00017987236642015103, + "loss": 1.7277, + "step": 7759 + }, + { + "epoch": 0.10083766078717699, + "grad_norm": 0.36751919984817505, + "learning_rate": 0.00017986976695823966, + "loss": 1.4349, + "step": 7760 + }, + { + "epoch": 0.10085065533109286, + "grad_norm": 0.35334184765815735, + "learning_rate": 0.00017986716749632825, + "loss": 1.4215, + "step": 7761 + }, + { + "epoch": 0.10086364987500873, + "grad_norm": 0.3580816388130188, + "learning_rate": 0.00017986456803441688, + "loss": 1.5138, + "step": 7762 + }, + { + "epoch": 0.1008766444189246, + "grad_norm": 0.41510021686553955, + "learning_rate": 0.0001798619685725055, + "loss": 1.3897, + "step": 7763 + }, + { + "epoch": 0.10088963896284048, + "grad_norm": 0.3836803734302521, + "learning_rate": 0.0001798593691105941, + "loss": 1.5846, + "step": 7764 + }, + { + "epoch": 0.10090263350675635, + "grad_norm": 0.3758074939250946, + "learning_rate": 0.00017985676964868272, + "loss": 1.543, + "step": 7765 + }, + { + "epoch": 0.10091562805067222, + "grad_norm": 0.45739293098449707, + "learning_rate": 0.00017985417018677135, + "loss": 1.4393, + "step": 7766 + }, + { + "epoch": 0.1009286225945881, + "grad_norm": 0.3502725660800934, + "learning_rate": 0.00017985157072485997, + "loss": 1.3627, + "step": 7767 + }, + { + "epoch": 0.10094161713850397, + "grad_norm": 0.37774160504341125, + "learning_rate": 0.00017984897126294857, + "loss": 1.3531, + "step": 7768 + }, + { + "epoch": 0.10095461168241984, + "grad_norm": 0.40376293659210205, + "learning_rate": 0.0001798463718010372, + "loss": 1.3984, + "step": 7769 + }, + { + "epoch": 0.10096760622633572, + "grad_norm": 0.3908384442329407, + "learning_rate": 0.00017984377233912582, + "loss": 1.4472, + "step": 7770 + }, + { + "epoch": 0.10098060077025159, + "grad_norm": 0.41029441356658936, + "learning_rate": 0.00017984117287721442, + "loss": 1.4773, + "step": 7771 + }, + { + "epoch": 0.10099359531416746, + "grad_norm": 0.4586721658706665, + "learning_rate": 0.00017983857341530304, + "loss": 1.5646, + "step": 7772 + }, + { + "epoch": 0.10100658985808333, + "grad_norm": 0.39528968930244446, + "learning_rate": 0.00017983597395339167, + "loss": 1.3538, + "step": 7773 + }, + { + "epoch": 0.10101958440199921, + "grad_norm": 0.3632925748825073, + "learning_rate": 0.0001798333744914803, + "loss": 1.6117, + "step": 7774 + }, + { + "epoch": 0.10103257894591508, + "grad_norm": 0.3847543001174927, + "learning_rate": 0.0001798307750295689, + "loss": 1.4012, + "step": 7775 + }, + { + "epoch": 0.10104557348983095, + "grad_norm": 0.43651658296585083, + "learning_rate": 0.00017982817556765749, + "loss": 1.3917, + "step": 7776 + }, + { + "epoch": 0.10105856803374683, + "grad_norm": 0.4280056953430176, + "learning_rate": 0.00017982557610574614, + "loss": 1.4205, + "step": 7777 + }, + { + "epoch": 0.1010715625776627, + "grad_norm": 0.4749160706996918, + "learning_rate": 0.00017982297664383473, + "loss": 1.5303, + "step": 7778 + }, + { + "epoch": 0.10108455712157857, + "grad_norm": 0.3926313519477844, + "learning_rate": 0.00017982037718192336, + "loss": 1.5686, + "step": 7779 + }, + { + "epoch": 0.10109755166549445, + "grad_norm": 0.44535452127456665, + "learning_rate": 0.00017981777772001196, + "loss": 1.5519, + "step": 7780 + }, + { + "epoch": 0.10111054620941032, + "grad_norm": 0.33920952677726746, + "learning_rate": 0.00017981517825810058, + "loss": 1.2957, + "step": 7781 + }, + { + "epoch": 0.10112354075332619, + "grad_norm": 0.365041047334671, + "learning_rate": 0.0001798125787961892, + "loss": 1.5364, + "step": 7782 + }, + { + "epoch": 0.10113653529724206, + "grad_norm": 0.4695407748222351, + "learning_rate": 0.0001798099793342778, + "loss": 1.4655, + "step": 7783 + }, + { + "epoch": 0.10114952984115794, + "grad_norm": 0.3340899646282196, + "learning_rate": 0.00017980737987236643, + "loss": 1.398, + "step": 7784 + }, + { + "epoch": 0.10116252438507382, + "grad_norm": 0.38033172488212585, + "learning_rate": 0.00017980478041045505, + "loss": 1.428, + "step": 7785 + }, + { + "epoch": 0.1011755189289897, + "grad_norm": 0.37100502848625183, + "learning_rate": 0.00017980218094854368, + "loss": 1.4597, + "step": 7786 + }, + { + "epoch": 0.10118851347290557, + "grad_norm": 0.3285274803638458, + "learning_rate": 0.00017979958148663227, + "loss": 1.3332, + "step": 7787 + }, + { + "epoch": 0.10120150801682144, + "grad_norm": 0.43906766176223755, + "learning_rate": 0.00017979698202472087, + "loss": 1.4413, + "step": 7788 + }, + { + "epoch": 0.10121450256073732, + "grad_norm": 0.42923998832702637, + "learning_rate": 0.00017979438256280952, + "loss": 1.455, + "step": 7789 + }, + { + "epoch": 0.10122749710465319, + "grad_norm": 0.4535680413246155, + "learning_rate": 0.00017979178310089812, + "loss": 1.2622, + "step": 7790 + }, + { + "epoch": 0.10124049164856906, + "grad_norm": 0.4653768241405487, + "learning_rate": 0.00017978918363898674, + "loss": 1.5935, + "step": 7791 + }, + { + "epoch": 0.10125348619248493, + "grad_norm": 0.4389393627643585, + "learning_rate": 0.00017978658417707534, + "loss": 1.485, + "step": 7792 + }, + { + "epoch": 0.10126648073640081, + "grad_norm": 0.4844987690448761, + "learning_rate": 0.00017978398471516397, + "loss": 1.4351, + "step": 7793 + }, + { + "epoch": 0.10127947528031668, + "grad_norm": 0.4055352210998535, + "learning_rate": 0.0001797813852532526, + "loss": 1.5297, + "step": 7794 + }, + { + "epoch": 0.10129246982423255, + "grad_norm": 0.40472620725631714, + "learning_rate": 0.0001797787857913412, + "loss": 1.6019, + "step": 7795 + }, + { + "epoch": 0.10130546436814843, + "grad_norm": 0.390473335981369, + "learning_rate": 0.0001797761863294298, + "loss": 1.3117, + "step": 7796 + }, + { + "epoch": 0.1013184589120643, + "grad_norm": 0.3171139657497406, + "learning_rate": 0.00017977358686751844, + "loss": 1.3259, + "step": 7797 + }, + { + "epoch": 0.10133145345598017, + "grad_norm": 0.38119930028915405, + "learning_rate": 0.00017977098740560706, + "loss": 1.5352, + "step": 7798 + }, + { + "epoch": 0.10134444799989605, + "grad_norm": 0.39812228083610535, + "learning_rate": 0.00017976838794369566, + "loss": 1.4322, + "step": 7799 + }, + { + "epoch": 0.10135744254381192, + "grad_norm": 0.34248262643814087, + "learning_rate": 0.00017976578848178428, + "loss": 1.2128, + "step": 7800 + }, + { + "epoch": 0.10137043708772779, + "grad_norm": 0.3865338861942291, + "learning_rate": 0.0001797631890198729, + "loss": 1.4702, + "step": 7801 + }, + { + "epoch": 0.10138343163164366, + "grad_norm": 0.3255064785480499, + "learning_rate": 0.0001797605895579615, + "loss": 1.435, + "step": 7802 + }, + { + "epoch": 0.10139642617555954, + "grad_norm": 0.33372730016708374, + "learning_rate": 0.00017975799009605013, + "loss": 1.2388, + "step": 7803 + }, + { + "epoch": 0.10140942071947541, + "grad_norm": 0.5129263997077942, + "learning_rate": 0.00017975539063413873, + "loss": 1.5348, + "step": 7804 + }, + { + "epoch": 0.10142241526339128, + "grad_norm": 0.38935065269470215, + "learning_rate": 0.00017975279117222735, + "loss": 1.4739, + "step": 7805 + }, + { + "epoch": 0.10143540980730716, + "grad_norm": 0.43530070781707764, + "learning_rate": 0.00017975019171031598, + "loss": 1.4767, + "step": 7806 + }, + { + "epoch": 0.10144840435122303, + "grad_norm": 0.4198201298713684, + "learning_rate": 0.00017974759224840457, + "loss": 1.5228, + "step": 7807 + }, + { + "epoch": 0.1014613988951389, + "grad_norm": 0.3787910044193268, + "learning_rate": 0.0001797449927864932, + "loss": 1.2797, + "step": 7808 + }, + { + "epoch": 0.10147439343905477, + "grad_norm": 0.38960161805152893, + "learning_rate": 0.00017974239332458182, + "loss": 1.3406, + "step": 7809 + }, + { + "epoch": 0.10148738798297065, + "grad_norm": 0.4343084692955017, + "learning_rate": 0.00017973979386267045, + "loss": 1.6734, + "step": 7810 + }, + { + "epoch": 0.10150038252688652, + "grad_norm": 0.4567416310310364, + "learning_rate": 0.00017973719440075904, + "loss": 1.4247, + "step": 7811 + }, + { + "epoch": 0.1015133770708024, + "grad_norm": 0.4708667993545532, + "learning_rate": 0.00017973459493884767, + "loss": 1.5348, + "step": 7812 + }, + { + "epoch": 0.10152637161471827, + "grad_norm": 0.42531076073646545, + "learning_rate": 0.0001797319954769363, + "loss": 1.483, + "step": 7813 + }, + { + "epoch": 0.10153936615863414, + "grad_norm": 0.44487708806991577, + "learning_rate": 0.0001797293960150249, + "loss": 1.4065, + "step": 7814 + }, + { + "epoch": 0.10155236070255001, + "grad_norm": 0.3435768187046051, + "learning_rate": 0.00017972679655311351, + "loss": 1.1367, + "step": 7815 + }, + { + "epoch": 0.10156535524646589, + "grad_norm": 0.4042853116989136, + "learning_rate": 0.00017972419709120214, + "loss": 1.4232, + "step": 7816 + }, + { + "epoch": 0.10157834979038176, + "grad_norm": 0.3916345238685608, + "learning_rate": 0.00017972159762929074, + "loss": 1.3807, + "step": 7817 + }, + { + "epoch": 0.10159134433429763, + "grad_norm": 0.37747761607170105, + "learning_rate": 0.00017971899816737936, + "loss": 1.2671, + "step": 7818 + }, + { + "epoch": 0.1016043388782135, + "grad_norm": 0.45680347084999084, + "learning_rate": 0.00017971639870546796, + "loss": 1.5233, + "step": 7819 + }, + { + "epoch": 0.10161733342212938, + "grad_norm": 0.39336326718330383, + "learning_rate": 0.0001797137992435566, + "loss": 1.4689, + "step": 7820 + }, + { + "epoch": 0.10163032796604525, + "grad_norm": 0.39701905846595764, + "learning_rate": 0.0001797111997816452, + "loss": 1.4432, + "step": 7821 + }, + { + "epoch": 0.10164332250996112, + "grad_norm": 0.5037115216255188, + "learning_rate": 0.00017970860031973383, + "loss": 1.4934, + "step": 7822 + }, + { + "epoch": 0.10165631705387701, + "grad_norm": 0.4388052225112915, + "learning_rate": 0.00017970600085782243, + "loss": 1.4042, + "step": 7823 + }, + { + "epoch": 0.10166931159779288, + "grad_norm": 0.34577223658561707, + "learning_rate": 0.00017970340139591105, + "loss": 1.3769, + "step": 7824 + }, + { + "epoch": 0.10168230614170876, + "grad_norm": 0.39711371064186096, + "learning_rate": 0.00017970080193399968, + "loss": 1.2014, + "step": 7825 + }, + { + "epoch": 0.10169530068562463, + "grad_norm": 0.32872632145881653, + "learning_rate": 0.00017969820247208828, + "loss": 1.1028, + "step": 7826 + }, + { + "epoch": 0.1017082952295405, + "grad_norm": 0.3827008306980133, + "learning_rate": 0.0001796956030101769, + "loss": 1.4979, + "step": 7827 + }, + { + "epoch": 0.10172128977345637, + "grad_norm": 0.30252885818481445, + "learning_rate": 0.00017969300354826552, + "loss": 1.1797, + "step": 7828 + }, + { + "epoch": 0.10173428431737225, + "grad_norm": 0.4236536920070648, + "learning_rate": 0.00017969040408635415, + "loss": 1.4234, + "step": 7829 + }, + { + "epoch": 0.10174727886128812, + "grad_norm": 0.41328996419906616, + "learning_rate": 0.00017968780462444275, + "loss": 1.4877, + "step": 7830 + }, + { + "epoch": 0.101760273405204, + "grad_norm": 0.39690878987312317, + "learning_rate": 0.00017968520516253134, + "loss": 1.3703, + "step": 7831 + }, + { + "epoch": 0.10177326794911987, + "grad_norm": 0.3942885398864746, + "learning_rate": 0.00017968260570062, + "loss": 1.4388, + "step": 7832 + }, + { + "epoch": 0.10178626249303574, + "grad_norm": 0.3404872715473175, + "learning_rate": 0.0001796800062387086, + "loss": 1.4037, + "step": 7833 + }, + { + "epoch": 0.10179925703695161, + "grad_norm": 0.3025733232498169, + "learning_rate": 0.00017967740677679722, + "loss": 1.3139, + "step": 7834 + }, + { + "epoch": 0.10181225158086749, + "grad_norm": 0.38775646686553955, + "learning_rate": 0.00017967480731488581, + "loss": 1.5021, + "step": 7835 + }, + { + "epoch": 0.10182524612478336, + "grad_norm": 0.45014211535453796, + "learning_rate": 0.00017967220785297444, + "loss": 1.4182, + "step": 7836 + }, + { + "epoch": 0.10183824066869923, + "grad_norm": 0.4211207330226898, + "learning_rate": 0.00017966960839106306, + "loss": 1.5032, + "step": 7837 + }, + { + "epoch": 0.1018512352126151, + "grad_norm": 0.35970160365104675, + "learning_rate": 0.00017966700892915166, + "loss": 1.3707, + "step": 7838 + }, + { + "epoch": 0.10186422975653098, + "grad_norm": 0.32968997955322266, + "learning_rate": 0.00017966440946724029, + "loss": 1.3483, + "step": 7839 + }, + { + "epoch": 0.10187722430044685, + "grad_norm": 0.40386343002319336, + "learning_rate": 0.0001796618100053289, + "loss": 1.486, + "step": 7840 + }, + { + "epoch": 0.10189021884436272, + "grad_norm": 0.41209647059440613, + "learning_rate": 0.00017965921054341753, + "loss": 1.4984, + "step": 7841 + }, + { + "epoch": 0.1019032133882786, + "grad_norm": 0.4551544487476349, + "learning_rate": 0.00017965661108150613, + "loss": 1.7353, + "step": 7842 + }, + { + "epoch": 0.10191620793219447, + "grad_norm": 0.28845903277397156, + "learning_rate": 0.00017965401161959473, + "loss": 1.1945, + "step": 7843 + }, + { + "epoch": 0.10192920247611034, + "grad_norm": 0.4547567069530487, + "learning_rate": 0.00017965141215768338, + "loss": 1.4428, + "step": 7844 + }, + { + "epoch": 0.10194219702002622, + "grad_norm": 0.3907301425933838, + "learning_rate": 0.00017964881269577198, + "loss": 1.4306, + "step": 7845 + }, + { + "epoch": 0.10195519156394209, + "grad_norm": 0.3539254367351532, + "learning_rate": 0.0001796462132338606, + "loss": 1.25, + "step": 7846 + }, + { + "epoch": 0.10196818610785796, + "grad_norm": 0.42573782801628113, + "learning_rate": 0.0001796436137719492, + "loss": 1.4194, + "step": 7847 + }, + { + "epoch": 0.10198118065177383, + "grad_norm": 0.5222511887550354, + "learning_rate": 0.00017964101431003782, + "loss": 1.5892, + "step": 7848 + }, + { + "epoch": 0.1019941751956897, + "grad_norm": 0.4325457513332367, + "learning_rate": 0.00017963841484812645, + "loss": 1.4501, + "step": 7849 + }, + { + "epoch": 0.10200716973960558, + "grad_norm": 0.42050960659980774, + "learning_rate": 0.00017963581538621505, + "loss": 1.5406, + "step": 7850 + }, + { + "epoch": 0.10202016428352145, + "grad_norm": 0.34707149863243103, + "learning_rate": 0.0001796332159243037, + "loss": 1.3757, + "step": 7851 + }, + { + "epoch": 0.10203315882743733, + "grad_norm": 0.36952534317970276, + "learning_rate": 0.0001796306164623923, + "loss": 1.6663, + "step": 7852 + }, + { + "epoch": 0.1020461533713532, + "grad_norm": 0.5112026929855347, + "learning_rate": 0.00017962801700048092, + "loss": 1.5723, + "step": 7853 + }, + { + "epoch": 0.10205914791526907, + "grad_norm": 0.4922695457935333, + "learning_rate": 0.00017962541753856952, + "loss": 1.3254, + "step": 7854 + }, + { + "epoch": 0.10207214245918494, + "grad_norm": 0.3786063492298126, + "learning_rate": 0.00017962281807665814, + "loss": 1.5976, + "step": 7855 + }, + { + "epoch": 0.10208513700310082, + "grad_norm": 0.4102255403995514, + "learning_rate": 0.00017962021861474677, + "loss": 1.4643, + "step": 7856 + }, + { + "epoch": 0.10209813154701669, + "grad_norm": 0.4170873165130615, + "learning_rate": 0.00017961761915283536, + "loss": 1.4876, + "step": 7857 + }, + { + "epoch": 0.10211112609093256, + "grad_norm": 0.33146771788597107, + "learning_rate": 0.000179615019690924, + "loss": 1.425, + "step": 7858 + }, + { + "epoch": 0.10212412063484844, + "grad_norm": 0.4506660997867584, + "learning_rate": 0.0001796124202290126, + "loss": 1.5549, + "step": 7859 + }, + { + "epoch": 0.10213711517876431, + "grad_norm": 0.4250401258468628, + "learning_rate": 0.0001796098207671012, + "loss": 1.2596, + "step": 7860 + }, + { + "epoch": 0.1021501097226802, + "grad_norm": 0.3729168176651001, + "learning_rate": 0.00017960722130518983, + "loss": 1.3355, + "step": 7861 + }, + { + "epoch": 0.10216310426659607, + "grad_norm": 0.42832502722740173, + "learning_rate": 0.00017960462184327843, + "loss": 1.4575, + "step": 7862 + }, + { + "epoch": 0.10217609881051194, + "grad_norm": 0.40425217151641846, + "learning_rate": 0.00017960202238136708, + "loss": 1.4841, + "step": 7863 + }, + { + "epoch": 0.10218909335442782, + "grad_norm": 0.5028495788574219, + "learning_rate": 0.00017959942291945568, + "loss": 1.4228, + "step": 7864 + }, + { + "epoch": 0.10220208789834369, + "grad_norm": 0.41667184233665466, + "learning_rate": 0.0001795968234575443, + "loss": 1.4579, + "step": 7865 + }, + { + "epoch": 0.10221508244225956, + "grad_norm": 0.34058699011802673, + "learning_rate": 0.0001795942239956329, + "loss": 1.2667, + "step": 7866 + }, + { + "epoch": 0.10222807698617543, + "grad_norm": 0.4319571554660797, + "learning_rate": 0.00017959162453372153, + "loss": 1.3744, + "step": 7867 + }, + { + "epoch": 0.10224107153009131, + "grad_norm": 0.4323987662792206, + "learning_rate": 0.00017958902507181015, + "loss": 1.4948, + "step": 7868 + }, + { + "epoch": 0.10225406607400718, + "grad_norm": 0.4942198395729065, + "learning_rate": 0.00017958642560989875, + "loss": 1.6714, + "step": 7869 + }, + { + "epoch": 0.10226706061792305, + "grad_norm": 0.4277282655239105, + "learning_rate": 0.00017958382614798737, + "loss": 1.4999, + "step": 7870 + }, + { + "epoch": 0.10228005516183893, + "grad_norm": 0.4215066432952881, + "learning_rate": 0.000179581226686076, + "loss": 1.4917, + "step": 7871 + }, + { + "epoch": 0.1022930497057548, + "grad_norm": 0.4057972729206085, + "learning_rate": 0.0001795786272241646, + "loss": 1.5589, + "step": 7872 + }, + { + "epoch": 0.10230604424967067, + "grad_norm": 0.5839102268218994, + "learning_rate": 0.00017957602776225322, + "loss": 1.5924, + "step": 7873 + }, + { + "epoch": 0.10231903879358654, + "grad_norm": 0.3789006769657135, + "learning_rate": 0.00017957342830034182, + "loss": 1.4465, + "step": 7874 + }, + { + "epoch": 0.10233203333750242, + "grad_norm": 0.3740769326686859, + "learning_rate": 0.00017957082883843047, + "loss": 1.5131, + "step": 7875 + }, + { + "epoch": 0.10234502788141829, + "grad_norm": 0.3642594814300537, + "learning_rate": 0.00017956822937651907, + "loss": 1.5794, + "step": 7876 + }, + { + "epoch": 0.10235802242533416, + "grad_norm": 0.38273754715919495, + "learning_rate": 0.0001795656299146077, + "loss": 1.4333, + "step": 7877 + }, + { + "epoch": 0.10237101696925004, + "grad_norm": 0.45849257707595825, + "learning_rate": 0.0001795630304526963, + "loss": 1.5496, + "step": 7878 + }, + { + "epoch": 0.10238401151316591, + "grad_norm": 0.438909649848938, + "learning_rate": 0.0001795604309907849, + "loss": 1.6207, + "step": 7879 + }, + { + "epoch": 0.10239700605708178, + "grad_norm": 0.468718022108078, + "learning_rate": 0.00017955783152887354, + "loss": 1.3413, + "step": 7880 + }, + { + "epoch": 0.10241000060099766, + "grad_norm": 0.41679978370666504, + "learning_rate": 0.00017955523206696213, + "loss": 1.4155, + "step": 7881 + }, + { + "epoch": 0.10242299514491353, + "grad_norm": 0.41075843572616577, + "learning_rate": 0.00017955263260505076, + "loss": 1.5617, + "step": 7882 + }, + { + "epoch": 0.1024359896888294, + "grad_norm": 0.35567933320999146, + "learning_rate": 0.00017955003314313938, + "loss": 1.4127, + "step": 7883 + }, + { + "epoch": 0.10244898423274527, + "grad_norm": 0.42304596304893494, + "learning_rate": 0.00017954743368122798, + "loss": 1.5871, + "step": 7884 + }, + { + "epoch": 0.10246197877666115, + "grad_norm": 0.5064476132392883, + "learning_rate": 0.0001795448342193166, + "loss": 1.4753, + "step": 7885 + }, + { + "epoch": 0.10247497332057702, + "grad_norm": 0.3652750551700592, + "learning_rate": 0.00017954223475740523, + "loss": 1.4736, + "step": 7886 + }, + { + "epoch": 0.1024879678644929, + "grad_norm": 0.38267767429351807, + "learning_rate": 0.00017953963529549385, + "loss": 1.4946, + "step": 7887 + }, + { + "epoch": 0.10250096240840877, + "grad_norm": 0.42121654748916626, + "learning_rate": 0.00017953703583358245, + "loss": 1.4531, + "step": 7888 + }, + { + "epoch": 0.10251395695232464, + "grad_norm": 0.2655147612094879, + "learning_rate": 0.00017953443637167108, + "loss": 1.5136, + "step": 7889 + }, + { + "epoch": 0.10252695149624051, + "grad_norm": 0.37908679246902466, + "learning_rate": 0.0001795318369097597, + "loss": 1.2953, + "step": 7890 + }, + { + "epoch": 0.10253994604015639, + "grad_norm": 0.37963083386421204, + "learning_rate": 0.0001795292374478483, + "loss": 1.446, + "step": 7891 + }, + { + "epoch": 0.10255294058407226, + "grad_norm": 0.3967644274234772, + "learning_rate": 0.00017952663798593692, + "loss": 1.2688, + "step": 7892 + }, + { + "epoch": 0.10256593512798813, + "grad_norm": 0.3967345356941223, + "learning_rate": 0.00017952403852402552, + "loss": 1.5164, + "step": 7893 + }, + { + "epoch": 0.102578929671904, + "grad_norm": 0.43092790246009827, + "learning_rate": 0.00017952143906211417, + "loss": 1.3236, + "step": 7894 + }, + { + "epoch": 0.10259192421581988, + "grad_norm": 0.4328557848930359, + "learning_rate": 0.00017951883960020277, + "loss": 1.5222, + "step": 7895 + }, + { + "epoch": 0.10260491875973575, + "grad_norm": 0.3354707956314087, + "learning_rate": 0.0001795162401382914, + "loss": 1.3445, + "step": 7896 + }, + { + "epoch": 0.10261791330365162, + "grad_norm": 0.44896838068962097, + "learning_rate": 0.00017951364067638, + "loss": 1.4368, + "step": 7897 + }, + { + "epoch": 0.1026309078475675, + "grad_norm": 0.42886048555374146, + "learning_rate": 0.00017951104121446862, + "loss": 1.4952, + "step": 7898 + }, + { + "epoch": 0.10264390239148338, + "grad_norm": 0.4371505379676819, + "learning_rate": 0.00017950844175255724, + "loss": 1.6611, + "step": 7899 + }, + { + "epoch": 0.10265689693539926, + "grad_norm": 0.4039475917816162, + "learning_rate": 0.00017950584229064584, + "loss": 1.4485, + "step": 7900 + }, + { + "epoch": 0.10266989147931513, + "grad_norm": 0.37279394268989563, + "learning_rate": 0.00017950324282873446, + "loss": 1.541, + "step": 7901 + }, + { + "epoch": 0.102682886023231, + "grad_norm": 0.4172093868255615, + "learning_rate": 0.00017950064336682309, + "loss": 1.558, + "step": 7902 + }, + { + "epoch": 0.10269588056714687, + "grad_norm": 0.4678516387939453, + "learning_rate": 0.00017949804390491168, + "loss": 1.4673, + "step": 7903 + }, + { + "epoch": 0.10270887511106275, + "grad_norm": 0.39488333463668823, + "learning_rate": 0.0001794954444430003, + "loss": 1.5715, + "step": 7904 + }, + { + "epoch": 0.10272186965497862, + "grad_norm": 0.4110446274280548, + "learning_rate": 0.0001794928449810889, + "loss": 1.418, + "step": 7905 + }, + { + "epoch": 0.1027348641988945, + "grad_norm": 0.3216738700866699, + "learning_rate": 0.00017949024551917756, + "loss": 1.4773, + "step": 7906 + }, + { + "epoch": 0.10274785874281037, + "grad_norm": 0.4549819231033325, + "learning_rate": 0.00017948764605726615, + "loss": 1.4852, + "step": 7907 + }, + { + "epoch": 0.10276085328672624, + "grad_norm": 0.5669338703155518, + "learning_rate": 0.00017948504659535478, + "loss": 1.4325, + "step": 7908 + }, + { + "epoch": 0.10277384783064211, + "grad_norm": 0.433988094329834, + "learning_rate": 0.00017948244713344338, + "loss": 1.4724, + "step": 7909 + }, + { + "epoch": 0.10278684237455799, + "grad_norm": 0.40066781640052795, + "learning_rate": 0.000179479847671532, + "loss": 1.5209, + "step": 7910 + }, + { + "epoch": 0.10279983691847386, + "grad_norm": 0.39563825726509094, + "learning_rate": 0.00017947724820962063, + "loss": 1.4775, + "step": 7911 + }, + { + "epoch": 0.10281283146238973, + "grad_norm": 0.45007336139678955, + "learning_rate": 0.00017947464874770922, + "loss": 1.5504, + "step": 7912 + }, + { + "epoch": 0.1028258260063056, + "grad_norm": 0.4451403319835663, + "learning_rate": 0.00017947204928579785, + "loss": 1.4718, + "step": 7913 + }, + { + "epoch": 0.10283882055022148, + "grad_norm": 0.34433048963546753, + "learning_rate": 0.00017946944982388647, + "loss": 1.3367, + "step": 7914 + }, + { + "epoch": 0.10285181509413735, + "grad_norm": 0.3291279971599579, + "learning_rate": 0.00017946685036197507, + "loss": 1.3682, + "step": 7915 + }, + { + "epoch": 0.10286480963805322, + "grad_norm": 0.2266838252544403, + "learning_rate": 0.0001794642509000637, + "loss": 1.2422, + "step": 7916 + }, + { + "epoch": 0.1028778041819691, + "grad_norm": 0.3619321286678314, + "learning_rate": 0.0001794616514381523, + "loss": 1.2012, + "step": 7917 + }, + { + "epoch": 0.10289079872588497, + "grad_norm": 0.3631254732608795, + "learning_rate": 0.00017945905197624094, + "loss": 1.3714, + "step": 7918 + }, + { + "epoch": 0.10290379326980084, + "grad_norm": 0.37563759088516235, + "learning_rate": 0.00017945645251432954, + "loss": 1.4261, + "step": 7919 + }, + { + "epoch": 0.10291678781371671, + "grad_norm": 0.43486857414245605, + "learning_rate": 0.00017945385305241816, + "loss": 1.4363, + "step": 7920 + }, + { + "epoch": 0.10292978235763259, + "grad_norm": 0.42600587010383606, + "learning_rate": 0.00017945125359050676, + "loss": 1.5778, + "step": 7921 + }, + { + "epoch": 0.10294277690154846, + "grad_norm": 0.5117783546447754, + "learning_rate": 0.00017944865412859539, + "loss": 1.4536, + "step": 7922 + }, + { + "epoch": 0.10295577144546433, + "grad_norm": 0.2974124252796173, + "learning_rate": 0.000179446054666684, + "loss": 1.3523, + "step": 7923 + }, + { + "epoch": 0.1029687659893802, + "grad_norm": 0.49731770157814026, + "learning_rate": 0.0001794434552047726, + "loss": 1.3632, + "step": 7924 + }, + { + "epoch": 0.10298176053329608, + "grad_norm": 0.2756224572658539, + "learning_rate": 0.00017944085574286126, + "loss": 1.3703, + "step": 7925 + }, + { + "epoch": 0.10299475507721195, + "grad_norm": 0.4568815529346466, + "learning_rate": 0.00017943825628094986, + "loss": 1.4597, + "step": 7926 + }, + { + "epoch": 0.10300774962112783, + "grad_norm": 0.2808983325958252, + "learning_rate": 0.00017943565681903845, + "loss": 1.2667, + "step": 7927 + }, + { + "epoch": 0.1030207441650437, + "grad_norm": 0.4003196358680725, + "learning_rate": 0.00017943305735712708, + "loss": 1.3522, + "step": 7928 + }, + { + "epoch": 0.10303373870895957, + "grad_norm": 0.3558260202407837, + "learning_rate": 0.0001794304578952157, + "loss": 1.3169, + "step": 7929 + }, + { + "epoch": 0.10304673325287544, + "grad_norm": 0.43108677864074707, + "learning_rate": 0.00017942785843330433, + "loss": 1.4761, + "step": 7930 + }, + { + "epoch": 0.10305972779679132, + "grad_norm": 0.3817146122455597, + "learning_rate": 0.00017942525897139293, + "loss": 1.5432, + "step": 7931 + }, + { + "epoch": 0.10307272234070719, + "grad_norm": 0.39911437034606934, + "learning_rate": 0.00017942265950948155, + "loss": 1.3403, + "step": 7932 + }, + { + "epoch": 0.10308571688462306, + "grad_norm": 0.41367942094802856, + "learning_rate": 0.00017942006004757017, + "loss": 1.5102, + "step": 7933 + }, + { + "epoch": 0.10309871142853894, + "grad_norm": 0.411382794380188, + "learning_rate": 0.00017941746058565877, + "loss": 1.4226, + "step": 7934 + }, + { + "epoch": 0.10311170597245481, + "grad_norm": 0.45562857389450073, + "learning_rate": 0.0001794148611237474, + "loss": 1.5076, + "step": 7935 + }, + { + "epoch": 0.10312470051637068, + "grad_norm": 0.35147354006767273, + "learning_rate": 0.000179412261661836, + "loss": 1.3266, + "step": 7936 + }, + { + "epoch": 0.10313769506028657, + "grad_norm": 0.3584577143192291, + "learning_rate": 0.00017940966219992464, + "loss": 1.4739, + "step": 7937 + }, + { + "epoch": 0.10315068960420244, + "grad_norm": 0.37035125494003296, + "learning_rate": 0.00017940706273801324, + "loss": 1.481, + "step": 7938 + }, + { + "epoch": 0.10316368414811831, + "grad_norm": 0.414473295211792, + "learning_rate": 0.00017940446327610184, + "loss": 1.3736, + "step": 7939 + }, + { + "epoch": 0.10317667869203419, + "grad_norm": 0.4054807424545288, + "learning_rate": 0.00017940186381419046, + "loss": 1.4648, + "step": 7940 + }, + { + "epoch": 0.10318967323595006, + "grad_norm": 0.46387138962745667, + "learning_rate": 0.0001793992643522791, + "loss": 1.4871, + "step": 7941 + }, + { + "epoch": 0.10320266777986593, + "grad_norm": 0.4025227129459381, + "learning_rate": 0.0001793966648903677, + "loss": 1.476, + "step": 7942 + }, + { + "epoch": 0.1032156623237818, + "grad_norm": 0.39503058791160583, + "learning_rate": 0.0001793940654284563, + "loss": 1.5687, + "step": 7943 + }, + { + "epoch": 0.10322865686769768, + "grad_norm": 0.3631810247898102, + "learning_rate": 0.00017939146596654493, + "loss": 1.3722, + "step": 7944 + }, + { + "epoch": 0.10324165141161355, + "grad_norm": 0.37845462560653687, + "learning_rate": 0.00017938886650463356, + "loss": 1.2985, + "step": 7945 + }, + { + "epoch": 0.10325464595552943, + "grad_norm": 0.3191598951816559, + "learning_rate": 0.00017938626704272216, + "loss": 1.4712, + "step": 7946 + }, + { + "epoch": 0.1032676404994453, + "grad_norm": 0.3889971077442169, + "learning_rate": 0.00017938366758081078, + "loss": 1.5827, + "step": 7947 + }, + { + "epoch": 0.10328063504336117, + "grad_norm": 0.4626803398132324, + "learning_rate": 0.00017938106811889938, + "loss": 1.5201, + "step": 7948 + }, + { + "epoch": 0.10329362958727704, + "grad_norm": 0.27911537885665894, + "learning_rate": 0.00017937846865698803, + "loss": 1.3734, + "step": 7949 + }, + { + "epoch": 0.10330662413119292, + "grad_norm": 0.5439074635505676, + "learning_rate": 0.00017937586919507663, + "loss": 1.4754, + "step": 7950 + }, + { + "epoch": 0.10331961867510879, + "grad_norm": 0.45616888999938965, + "learning_rate": 0.00017937326973316525, + "loss": 1.4651, + "step": 7951 + }, + { + "epoch": 0.10333261321902466, + "grad_norm": 0.4382798373699188, + "learning_rate": 0.00017937067027125385, + "loss": 1.2766, + "step": 7952 + }, + { + "epoch": 0.10334560776294054, + "grad_norm": 0.43490156531333923, + "learning_rate": 0.00017936807080934247, + "loss": 1.5375, + "step": 7953 + }, + { + "epoch": 0.10335860230685641, + "grad_norm": 0.469547301530838, + "learning_rate": 0.0001793654713474311, + "loss": 1.6105, + "step": 7954 + }, + { + "epoch": 0.10337159685077228, + "grad_norm": 0.43007007241249084, + "learning_rate": 0.0001793628718855197, + "loss": 1.5242, + "step": 7955 + }, + { + "epoch": 0.10338459139468816, + "grad_norm": 0.3522557318210602, + "learning_rate": 0.00017936027242360832, + "loss": 1.388, + "step": 7956 + }, + { + "epoch": 0.10339758593860403, + "grad_norm": 0.40999171137809753, + "learning_rate": 0.00017935767296169694, + "loss": 1.2348, + "step": 7957 + }, + { + "epoch": 0.1034105804825199, + "grad_norm": 0.36213162541389465, + "learning_rate": 0.00017935507349978554, + "loss": 1.5623, + "step": 7958 + }, + { + "epoch": 0.10342357502643577, + "grad_norm": 0.4497208595275879, + "learning_rate": 0.00017935247403787417, + "loss": 1.5633, + "step": 7959 + }, + { + "epoch": 0.10343656957035165, + "grad_norm": 0.3032718598842621, + "learning_rate": 0.0001793498745759628, + "loss": 1.3471, + "step": 7960 + }, + { + "epoch": 0.10344956411426752, + "grad_norm": 0.40988150238990784, + "learning_rate": 0.00017934727511405142, + "loss": 1.3207, + "step": 7961 + }, + { + "epoch": 0.10346255865818339, + "grad_norm": 0.3369917869567871, + "learning_rate": 0.00017934467565214, + "loss": 1.5725, + "step": 7962 + }, + { + "epoch": 0.10347555320209927, + "grad_norm": 0.48962143063545227, + "learning_rate": 0.00017934207619022864, + "loss": 1.3241, + "step": 7963 + }, + { + "epoch": 0.10348854774601514, + "grad_norm": 0.38944628834724426, + "learning_rate": 0.00017933947672831726, + "loss": 1.4909, + "step": 7964 + }, + { + "epoch": 0.10350154228993101, + "grad_norm": 0.3094650208950043, + "learning_rate": 0.00017933687726640586, + "loss": 1.3422, + "step": 7965 + }, + { + "epoch": 0.10351453683384688, + "grad_norm": 0.3750936985015869, + "learning_rate": 0.00017933427780449448, + "loss": 1.6183, + "step": 7966 + }, + { + "epoch": 0.10352753137776276, + "grad_norm": 0.36818066239356995, + "learning_rate": 0.00017933167834258308, + "loss": 1.3194, + "step": 7967 + }, + { + "epoch": 0.10354052592167863, + "grad_norm": 0.33165282011032104, + "learning_rate": 0.0001793290788806717, + "loss": 1.4134, + "step": 7968 + }, + { + "epoch": 0.1035535204655945, + "grad_norm": 0.2703864872455597, + "learning_rate": 0.00017932647941876033, + "loss": 1.1932, + "step": 7969 + }, + { + "epoch": 0.10356651500951038, + "grad_norm": 0.37373217940330505, + "learning_rate": 0.00017932387995684893, + "loss": 1.5205, + "step": 7970 + }, + { + "epoch": 0.10357950955342625, + "grad_norm": 0.42540067434310913, + "learning_rate": 0.00017932128049493755, + "loss": 1.4115, + "step": 7971 + }, + { + "epoch": 0.10359250409734212, + "grad_norm": 0.39851224422454834, + "learning_rate": 0.00017931868103302618, + "loss": 1.4232, + "step": 7972 + }, + { + "epoch": 0.103605498641258, + "grad_norm": 0.4035250246524811, + "learning_rate": 0.0001793160815711148, + "loss": 1.414, + "step": 7973 + }, + { + "epoch": 0.10361849318517387, + "grad_norm": 0.39069944620132446, + "learning_rate": 0.0001793134821092034, + "loss": 1.4404, + "step": 7974 + }, + { + "epoch": 0.10363148772908976, + "grad_norm": 0.36573952436447144, + "learning_rate": 0.00017931088264729202, + "loss": 1.5088, + "step": 7975 + }, + { + "epoch": 0.10364448227300563, + "grad_norm": 0.347029447555542, + "learning_rate": 0.00017930828318538065, + "loss": 1.6356, + "step": 7976 + }, + { + "epoch": 0.1036574768169215, + "grad_norm": 0.48322170972824097, + "learning_rate": 0.00017930568372346924, + "loss": 1.5193, + "step": 7977 + }, + { + "epoch": 0.10367047136083737, + "grad_norm": 0.4446842670440674, + "learning_rate": 0.00017930308426155787, + "loss": 1.3507, + "step": 7978 + }, + { + "epoch": 0.10368346590475325, + "grad_norm": 0.3830401599407196, + "learning_rate": 0.00017930048479964647, + "loss": 1.5086, + "step": 7979 + }, + { + "epoch": 0.10369646044866912, + "grad_norm": 0.4655143916606903, + "learning_rate": 0.00017929788533773512, + "loss": 1.5273, + "step": 7980 + }, + { + "epoch": 0.10370945499258499, + "grad_norm": 0.38228240609169006, + "learning_rate": 0.00017929528587582372, + "loss": 1.3069, + "step": 7981 + }, + { + "epoch": 0.10372244953650087, + "grad_norm": 0.37955889105796814, + "learning_rate": 0.0001792926864139123, + "loss": 1.3866, + "step": 7982 + }, + { + "epoch": 0.10373544408041674, + "grad_norm": 0.3734840452671051, + "learning_rate": 0.00017929008695200094, + "loss": 1.5102, + "step": 7983 + }, + { + "epoch": 0.10374843862433261, + "grad_norm": 0.3247419595718384, + "learning_rate": 0.00017928748749008956, + "loss": 1.3812, + "step": 7984 + }, + { + "epoch": 0.10376143316824848, + "grad_norm": 0.3345896005630493, + "learning_rate": 0.0001792848880281782, + "loss": 1.4327, + "step": 7985 + }, + { + "epoch": 0.10377442771216436, + "grad_norm": 0.39056822657585144, + "learning_rate": 0.00017928228856626678, + "loss": 1.4673, + "step": 7986 + }, + { + "epoch": 0.10378742225608023, + "grad_norm": 0.4080594778060913, + "learning_rate": 0.0001792796891043554, + "loss": 1.4138, + "step": 7987 + }, + { + "epoch": 0.1038004167999961, + "grad_norm": 0.38304218649864197, + "learning_rate": 0.00017927708964244403, + "loss": 1.4701, + "step": 7988 + }, + { + "epoch": 0.10381341134391198, + "grad_norm": 0.4749680459499359, + "learning_rate": 0.00017927449018053263, + "loss": 1.5491, + "step": 7989 + }, + { + "epoch": 0.10382640588782785, + "grad_norm": 0.4998060464859009, + "learning_rate": 0.00017927189071862125, + "loss": 1.4176, + "step": 7990 + }, + { + "epoch": 0.10383940043174372, + "grad_norm": 0.43049871921539307, + "learning_rate": 0.00017926929125670985, + "loss": 1.3857, + "step": 7991 + }, + { + "epoch": 0.1038523949756596, + "grad_norm": 0.3602466583251953, + "learning_rate": 0.0001792666917947985, + "loss": 1.4252, + "step": 7992 + }, + { + "epoch": 0.10386538951957547, + "grad_norm": 0.31236013770103455, + "learning_rate": 0.0001792640923328871, + "loss": 1.125, + "step": 7993 + }, + { + "epoch": 0.10387838406349134, + "grad_norm": 0.42238807678222656, + "learning_rate": 0.0001792614928709757, + "loss": 1.4078, + "step": 7994 + }, + { + "epoch": 0.10389137860740721, + "grad_norm": 0.4486556649208069, + "learning_rate": 0.00017925889340906432, + "loss": 1.3463, + "step": 7995 + }, + { + "epoch": 0.10390437315132309, + "grad_norm": 0.3729092478752136, + "learning_rate": 0.00017925629394715295, + "loss": 1.366, + "step": 7996 + }, + { + "epoch": 0.10391736769523896, + "grad_norm": 0.4954756498336792, + "learning_rate": 0.00017925369448524157, + "loss": 1.5263, + "step": 7997 + }, + { + "epoch": 0.10393036223915483, + "grad_norm": 0.40927091240882874, + "learning_rate": 0.00017925109502333017, + "loss": 1.5341, + "step": 7998 + }, + { + "epoch": 0.1039433567830707, + "grad_norm": 0.4058709442615509, + "learning_rate": 0.0001792484955614188, + "loss": 1.1035, + "step": 7999 + }, + { + "epoch": 0.10395635132698658, + "grad_norm": 0.4331141412258148, + "learning_rate": 0.00017924589609950742, + "loss": 1.4959, + "step": 8000 + }, + { + "epoch": 0.10396934587090245, + "grad_norm": 0.45258501172065735, + "learning_rate": 0.00017924329663759602, + "loss": 1.5827, + "step": 8001 + }, + { + "epoch": 0.10398234041481833, + "grad_norm": 0.37838178873062134, + "learning_rate": 0.00017924069717568464, + "loss": 1.5544, + "step": 8002 + }, + { + "epoch": 0.1039953349587342, + "grad_norm": 0.3194632828235626, + "learning_rate": 0.00017923809771377326, + "loss": 1.2325, + "step": 8003 + }, + { + "epoch": 0.10400832950265007, + "grad_norm": 0.3741583228111267, + "learning_rate": 0.0001792354982518619, + "loss": 1.5401, + "step": 8004 + }, + { + "epoch": 0.10402132404656594, + "grad_norm": 0.4134669303894043, + "learning_rate": 0.00017923289878995049, + "loss": 1.3613, + "step": 8005 + }, + { + "epoch": 0.10403431859048182, + "grad_norm": 0.4060792922973633, + "learning_rate": 0.0001792302993280391, + "loss": 1.5222, + "step": 8006 + }, + { + "epoch": 0.10404731313439769, + "grad_norm": 0.47166624665260315, + "learning_rate": 0.00017922769986612774, + "loss": 1.6411, + "step": 8007 + }, + { + "epoch": 0.10406030767831356, + "grad_norm": 0.38318881392478943, + "learning_rate": 0.00017922510040421633, + "loss": 1.3628, + "step": 8008 + }, + { + "epoch": 0.10407330222222944, + "grad_norm": 0.31945088505744934, + "learning_rate": 0.00017922250094230496, + "loss": 1.2308, + "step": 8009 + }, + { + "epoch": 0.10408629676614531, + "grad_norm": 0.41478046774864197, + "learning_rate": 0.00017921990148039355, + "loss": 1.3938, + "step": 8010 + }, + { + "epoch": 0.10409929131006118, + "grad_norm": 0.3366563022136688, + "learning_rate": 0.00017921730201848218, + "loss": 1.4673, + "step": 8011 + }, + { + "epoch": 0.10411228585397705, + "grad_norm": 0.40028998255729675, + "learning_rate": 0.0001792147025565708, + "loss": 1.2632, + "step": 8012 + }, + { + "epoch": 0.10412528039789294, + "grad_norm": 0.3477810323238373, + "learning_rate": 0.0001792121030946594, + "loss": 1.3746, + "step": 8013 + }, + { + "epoch": 0.10413827494180881, + "grad_norm": 0.26680251955986023, + "learning_rate": 0.00017920950363274803, + "loss": 1.2252, + "step": 8014 + }, + { + "epoch": 0.10415126948572469, + "grad_norm": 0.4064973294734955, + "learning_rate": 0.00017920690417083665, + "loss": 1.453, + "step": 8015 + }, + { + "epoch": 0.10416426402964056, + "grad_norm": 0.4885926842689514, + "learning_rate": 0.00017920430470892527, + "loss": 1.5816, + "step": 8016 + }, + { + "epoch": 0.10417725857355643, + "grad_norm": 0.3536035418510437, + "learning_rate": 0.00017920170524701387, + "loss": 1.474, + "step": 8017 + }, + { + "epoch": 0.1041902531174723, + "grad_norm": 0.3967699706554413, + "learning_rate": 0.0001791991057851025, + "loss": 1.3436, + "step": 8018 + }, + { + "epoch": 0.10420324766138818, + "grad_norm": 0.4234827160835266, + "learning_rate": 0.00017919650632319112, + "loss": 1.4573, + "step": 8019 + }, + { + "epoch": 0.10421624220530405, + "grad_norm": 0.4700900614261627, + "learning_rate": 0.00017919390686127972, + "loss": 1.5331, + "step": 8020 + }, + { + "epoch": 0.10422923674921993, + "grad_norm": 0.30281639099121094, + "learning_rate": 0.00017919130739936834, + "loss": 1.3903, + "step": 8021 + }, + { + "epoch": 0.1042422312931358, + "grad_norm": 0.34131497144699097, + "learning_rate": 0.00017918870793745694, + "loss": 1.3567, + "step": 8022 + }, + { + "epoch": 0.10425522583705167, + "grad_norm": 0.3436199426651001, + "learning_rate": 0.00017918610847554556, + "loss": 1.3489, + "step": 8023 + }, + { + "epoch": 0.10426822038096754, + "grad_norm": 0.359914094209671, + "learning_rate": 0.0001791835090136342, + "loss": 1.5661, + "step": 8024 + }, + { + "epoch": 0.10428121492488342, + "grad_norm": 0.3884362578392029, + "learning_rate": 0.00017918090955172279, + "loss": 1.4661, + "step": 8025 + }, + { + "epoch": 0.10429420946879929, + "grad_norm": 0.36077505350112915, + "learning_rate": 0.0001791783100898114, + "loss": 1.3918, + "step": 8026 + }, + { + "epoch": 0.10430720401271516, + "grad_norm": 0.3498428463935852, + "learning_rate": 0.00017917571062790004, + "loss": 1.3795, + "step": 8027 + }, + { + "epoch": 0.10432019855663104, + "grad_norm": 0.3328615427017212, + "learning_rate": 0.00017917311116598866, + "loss": 1.4904, + "step": 8028 + }, + { + "epoch": 0.10433319310054691, + "grad_norm": 0.30838197469711304, + "learning_rate": 0.00017917051170407726, + "loss": 1.4022, + "step": 8029 + }, + { + "epoch": 0.10434618764446278, + "grad_norm": 0.37146902084350586, + "learning_rate": 0.00017916791224216588, + "loss": 1.4213, + "step": 8030 + }, + { + "epoch": 0.10435918218837865, + "grad_norm": 0.41162291169166565, + "learning_rate": 0.0001791653127802545, + "loss": 1.4455, + "step": 8031 + }, + { + "epoch": 0.10437217673229453, + "grad_norm": 0.37905123829841614, + "learning_rate": 0.0001791627133183431, + "loss": 1.3856, + "step": 8032 + }, + { + "epoch": 0.1043851712762104, + "grad_norm": 0.3954203128814697, + "learning_rate": 0.00017916011385643173, + "loss": 1.3571, + "step": 8033 + }, + { + "epoch": 0.10439816582012627, + "grad_norm": 0.3242570459842682, + "learning_rate": 0.00017915751439452035, + "loss": 1.4654, + "step": 8034 + }, + { + "epoch": 0.10441116036404215, + "grad_norm": 0.39476215839385986, + "learning_rate": 0.00017915491493260898, + "loss": 1.4966, + "step": 8035 + }, + { + "epoch": 0.10442415490795802, + "grad_norm": 0.4378581941127777, + "learning_rate": 0.00017915231547069757, + "loss": 1.434, + "step": 8036 + }, + { + "epoch": 0.10443714945187389, + "grad_norm": 0.42644456028938293, + "learning_rate": 0.00017914971600878617, + "loss": 1.5247, + "step": 8037 + }, + { + "epoch": 0.10445014399578977, + "grad_norm": 0.2627384066581726, + "learning_rate": 0.00017914711654687482, + "loss": 1.4398, + "step": 8038 + }, + { + "epoch": 0.10446313853970564, + "grad_norm": 0.35542765259742737, + "learning_rate": 0.00017914451708496342, + "loss": 1.3226, + "step": 8039 + }, + { + "epoch": 0.10447613308362151, + "grad_norm": 0.33493709564208984, + "learning_rate": 0.00017914191762305205, + "loss": 1.4565, + "step": 8040 + }, + { + "epoch": 0.10448912762753738, + "grad_norm": 0.3139078617095947, + "learning_rate": 0.00017913931816114064, + "loss": 1.2585, + "step": 8041 + }, + { + "epoch": 0.10450212217145326, + "grad_norm": 0.37238168716430664, + "learning_rate": 0.00017913671869922927, + "loss": 1.4998, + "step": 8042 + }, + { + "epoch": 0.10451511671536913, + "grad_norm": 0.3731575310230255, + "learning_rate": 0.0001791341192373179, + "loss": 1.5094, + "step": 8043 + }, + { + "epoch": 0.104528111259285, + "grad_norm": 0.23553532361984253, + "learning_rate": 0.0001791315197754065, + "loss": 1.3379, + "step": 8044 + }, + { + "epoch": 0.10454110580320088, + "grad_norm": 0.32989755272865295, + "learning_rate": 0.0001791289203134951, + "loss": 1.3699, + "step": 8045 + }, + { + "epoch": 0.10455410034711675, + "grad_norm": 0.3031196594238281, + "learning_rate": 0.00017912632085158374, + "loss": 1.3595, + "step": 8046 + }, + { + "epoch": 0.10456709489103262, + "grad_norm": 0.4284035563468933, + "learning_rate": 0.00017912372138967236, + "loss": 1.7581, + "step": 8047 + }, + { + "epoch": 0.1045800894349485, + "grad_norm": 0.3994840383529663, + "learning_rate": 0.00017912112192776096, + "loss": 1.3112, + "step": 8048 + }, + { + "epoch": 0.10459308397886437, + "grad_norm": 0.4671224057674408, + "learning_rate": 0.00017911852246584956, + "loss": 1.5354, + "step": 8049 + }, + { + "epoch": 0.10460607852278024, + "grad_norm": 0.4269476532936096, + "learning_rate": 0.0001791159230039382, + "loss": 1.6347, + "step": 8050 + }, + { + "epoch": 0.10461907306669613, + "grad_norm": 0.39409470558166504, + "learning_rate": 0.0001791133235420268, + "loss": 1.5037, + "step": 8051 + }, + { + "epoch": 0.104632067610612, + "grad_norm": 0.3732718229293823, + "learning_rate": 0.00017911072408011543, + "loss": 1.3904, + "step": 8052 + }, + { + "epoch": 0.10464506215452787, + "grad_norm": 0.44791218638420105, + "learning_rate": 0.00017910812461820403, + "loss": 1.446, + "step": 8053 + }, + { + "epoch": 0.10465805669844375, + "grad_norm": 0.4334414005279541, + "learning_rate": 0.00017910552515629265, + "loss": 1.4468, + "step": 8054 + }, + { + "epoch": 0.10467105124235962, + "grad_norm": 0.38104677200317383, + "learning_rate": 0.00017910292569438128, + "loss": 1.5195, + "step": 8055 + }, + { + "epoch": 0.10468404578627549, + "grad_norm": 0.3623162508010864, + "learning_rate": 0.00017910032623246987, + "loss": 1.5297, + "step": 8056 + }, + { + "epoch": 0.10469704033019137, + "grad_norm": 0.42105117440223694, + "learning_rate": 0.0001790977267705585, + "loss": 1.4329, + "step": 8057 + }, + { + "epoch": 0.10471003487410724, + "grad_norm": 0.37765073776245117, + "learning_rate": 0.00017909512730864712, + "loss": 1.3613, + "step": 8058 + }, + { + "epoch": 0.10472302941802311, + "grad_norm": 0.4411623179912567, + "learning_rate": 0.00017909252784673575, + "loss": 1.4635, + "step": 8059 + }, + { + "epoch": 0.10473602396193898, + "grad_norm": 0.36713117361068726, + "learning_rate": 0.00017908992838482435, + "loss": 1.3269, + "step": 8060 + }, + { + "epoch": 0.10474901850585486, + "grad_norm": 0.43734514713287354, + "learning_rate": 0.00017908732892291294, + "loss": 1.2507, + "step": 8061 + }, + { + "epoch": 0.10476201304977073, + "grad_norm": 0.45628947019577026, + "learning_rate": 0.0001790847294610016, + "loss": 1.3814, + "step": 8062 + }, + { + "epoch": 0.1047750075936866, + "grad_norm": 0.4164862632751465, + "learning_rate": 0.0001790821299990902, + "loss": 1.3777, + "step": 8063 + }, + { + "epoch": 0.10478800213760248, + "grad_norm": 0.4260309934616089, + "learning_rate": 0.00017907953053717882, + "loss": 1.5667, + "step": 8064 + }, + { + "epoch": 0.10480099668151835, + "grad_norm": 0.4057696461677551, + "learning_rate": 0.0001790769310752674, + "loss": 1.4148, + "step": 8065 + }, + { + "epoch": 0.10481399122543422, + "grad_norm": 0.4715280830860138, + "learning_rate": 0.00017907433161335604, + "loss": 1.6246, + "step": 8066 + }, + { + "epoch": 0.1048269857693501, + "grad_norm": 0.4925602376461029, + "learning_rate": 0.00017907173215144466, + "loss": 1.4226, + "step": 8067 + }, + { + "epoch": 0.10483998031326597, + "grad_norm": 0.32506945729255676, + "learning_rate": 0.00017906913268953326, + "loss": 1.614, + "step": 8068 + }, + { + "epoch": 0.10485297485718184, + "grad_norm": 0.3706367015838623, + "learning_rate": 0.00017906653322762188, + "loss": 1.3106, + "step": 8069 + }, + { + "epoch": 0.10486596940109771, + "grad_norm": 0.31357234716415405, + "learning_rate": 0.0001790639337657105, + "loss": 1.2569, + "step": 8070 + }, + { + "epoch": 0.10487896394501359, + "grad_norm": 0.35913679003715515, + "learning_rate": 0.00017906133430379913, + "loss": 1.6568, + "step": 8071 + }, + { + "epoch": 0.10489195848892946, + "grad_norm": 0.42747074365615845, + "learning_rate": 0.00017905873484188773, + "loss": 1.4414, + "step": 8072 + }, + { + "epoch": 0.10490495303284533, + "grad_norm": 0.34334713220596313, + "learning_rate": 0.00017905613537997636, + "loss": 1.3707, + "step": 8073 + }, + { + "epoch": 0.1049179475767612, + "grad_norm": 0.432465136051178, + "learning_rate": 0.00017905353591806498, + "loss": 1.5078, + "step": 8074 + }, + { + "epoch": 0.10493094212067708, + "grad_norm": 0.490108847618103, + "learning_rate": 0.00017905093645615358, + "loss": 1.5214, + "step": 8075 + }, + { + "epoch": 0.10494393666459295, + "grad_norm": 0.276267409324646, + "learning_rate": 0.0001790483369942422, + "loss": 1.4508, + "step": 8076 + }, + { + "epoch": 0.10495693120850882, + "grad_norm": 0.41033869981765747, + "learning_rate": 0.00017904573753233083, + "loss": 1.4749, + "step": 8077 + }, + { + "epoch": 0.1049699257524247, + "grad_norm": 0.404231458902359, + "learning_rate": 0.00017904313807041942, + "loss": 1.3342, + "step": 8078 + }, + { + "epoch": 0.10498292029634057, + "grad_norm": 0.35010117292404175, + "learning_rate": 0.00017904053860850805, + "loss": 1.3754, + "step": 8079 + }, + { + "epoch": 0.10499591484025644, + "grad_norm": 0.45433515310287476, + "learning_rate": 0.00017903793914659665, + "loss": 1.3815, + "step": 8080 + }, + { + "epoch": 0.10500890938417232, + "grad_norm": 0.3908640444278717, + "learning_rate": 0.0001790353396846853, + "loss": 1.4581, + "step": 8081 + }, + { + "epoch": 0.10502190392808819, + "grad_norm": 0.4251435101032257, + "learning_rate": 0.0001790327402227739, + "loss": 1.3893, + "step": 8082 + }, + { + "epoch": 0.10503489847200406, + "grad_norm": 0.42225953936576843, + "learning_rate": 0.00017903014076086252, + "loss": 1.4656, + "step": 8083 + }, + { + "epoch": 0.10504789301591994, + "grad_norm": 0.29635852575302124, + "learning_rate": 0.00017902754129895112, + "loss": 1.2572, + "step": 8084 + }, + { + "epoch": 0.10506088755983581, + "grad_norm": 0.33479174971580505, + "learning_rate": 0.00017902494183703974, + "loss": 1.2901, + "step": 8085 + }, + { + "epoch": 0.10507388210375168, + "grad_norm": 0.4221325218677521, + "learning_rate": 0.00017902234237512836, + "loss": 1.507, + "step": 8086 + }, + { + "epoch": 0.10508687664766755, + "grad_norm": 0.47237905859947205, + "learning_rate": 0.00017901974291321696, + "loss": 1.5308, + "step": 8087 + }, + { + "epoch": 0.10509987119158343, + "grad_norm": 0.4242340326309204, + "learning_rate": 0.0001790171434513056, + "loss": 1.5421, + "step": 8088 + }, + { + "epoch": 0.10511286573549931, + "grad_norm": 0.3724267780780792, + "learning_rate": 0.0001790145439893942, + "loss": 1.3626, + "step": 8089 + }, + { + "epoch": 0.10512586027941519, + "grad_norm": 0.3293302655220032, + "learning_rate": 0.00017901194452748284, + "loss": 1.4489, + "step": 8090 + }, + { + "epoch": 0.10513885482333106, + "grad_norm": 0.3883960545063019, + "learning_rate": 0.00017900934506557143, + "loss": 1.5761, + "step": 8091 + }, + { + "epoch": 0.10515184936724693, + "grad_norm": 0.30380895733833313, + "learning_rate": 0.00017900674560366003, + "loss": 1.3394, + "step": 8092 + }, + { + "epoch": 0.1051648439111628, + "grad_norm": 0.3718329071998596, + "learning_rate": 0.00017900414614174868, + "loss": 1.4642, + "step": 8093 + }, + { + "epoch": 0.10517783845507868, + "grad_norm": 0.4548642337322235, + "learning_rate": 0.00017900154667983728, + "loss": 1.6218, + "step": 8094 + }, + { + "epoch": 0.10519083299899455, + "grad_norm": 0.3894287347793579, + "learning_rate": 0.0001789989472179259, + "loss": 1.3941, + "step": 8095 + }, + { + "epoch": 0.10520382754291042, + "grad_norm": 0.49693650007247925, + "learning_rate": 0.0001789963477560145, + "loss": 1.3501, + "step": 8096 + }, + { + "epoch": 0.1052168220868263, + "grad_norm": 0.4253354072570801, + "learning_rate": 0.00017899374829410313, + "loss": 1.5231, + "step": 8097 + }, + { + "epoch": 0.10522981663074217, + "grad_norm": 0.4086737036705017, + "learning_rate": 0.00017899114883219175, + "loss": 1.4767, + "step": 8098 + }, + { + "epoch": 0.10524281117465804, + "grad_norm": 0.42117011547088623, + "learning_rate": 0.00017898854937028035, + "loss": 1.5377, + "step": 8099 + }, + { + "epoch": 0.10525580571857392, + "grad_norm": 0.3988659977912903, + "learning_rate": 0.00017898594990836897, + "loss": 1.6711, + "step": 8100 + }, + { + "epoch": 0.10526880026248979, + "grad_norm": 0.4064216911792755, + "learning_rate": 0.0001789833504464576, + "loss": 1.4378, + "step": 8101 + }, + { + "epoch": 0.10528179480640566, + "grad_norm": 0.3536483347415924, + "learning_rate": 0.00017898075098454622, + "loss": 1.3556, + "step": 8102 + }, + { + "epoch": 0.10529478935032154, + "grad_norm": 0.3714431822299957, + "learning_rate": 0.00017897815152263482, + "loss": 1.6121, + "step": 8103 + }, + { + "epoch": 0.10530778389423741, + "grad_norm": 0.35359179973602295, + "learning_rate": 0.00017897555206072342, + "loss": 1.3843, + "step": 8104 + }, + { + "epoch": 0.10532077843815328, + "grad_norm": 0.39783602952957153, + "learning_rate": 0.00017897295259881207, + "loss": 1.3086, + "step": 8105 + }, + { + "epoch": 0.10533377298206915, + "grad_norm": 0.4251030683517456, + "learning_rate": 0.00017897035313690066, + "loss": 1.5838, + "step": 8106 + }, + { + "epoch": 0.10534676752598503, + "grad_norm": 0.4016352593898773, + "learning_rate": 0.0001789677536749893, + "loss": 1.4287, + "step": 8107 + }, + { + "epoch": 0.1053597620699009, + "grad_norm": 0.41159847378730774, + "learning_rate": 0.00017896515421307791, + "loss": 1.3477, + "step": 8108 + }, + { + "epoch": 0.10537275661381677, + "grad_norm": 0.42181316018104553, + "learning_rate": 0.0001789625547511665, + "loss": 1.5682, + "step": 8109 + }, + { + "epoch": 0.10538575115773265, + "grad_norm": 0.4328581690788269, + "learning_rate": 0.00017895995528925514, + "loss": 1.2462, + "step": 8110 + }, + { + "epoch": 0.10539874570164852, + "grad_norm": 0.4267323613166809, + "learning_rate": 0.00017895735582734373, + "loss": 1.3571, + "step": 8111 + }, + { + "epoch": 0.10541174024556439, + "grad_norm": 0.38363146781921387, + "learning_rate": 0.00017895475636543238, + "loss": 1.2698, + "step": 8112 + }, + { + "epoch": 0.10542473478948026, + "grad_norm": 0.35397276282310486, + "learning_rate": 0.00017895215690352098, + "loss": 1.3735, + "step": 8113 + }, + { + "epoch": 0.10543772933339614, + "grad_norm": 0.42066776752471924, + "learning_rate": 0.0001789495574416096, + "loss": 1.4774, + "step": 8114 + }, + { + "epoch": 0.10545072387731201, + "grad_norm": 0.372273325920105, + "learning_rate": 0.0001789469579796982, + "loss": 1.4501, + "step": 8115 + }, + { + "epoch": 0.10546371842122788, + "grad_norm": 0.3861474394798279, + "learning_rate": 0.00017894435851778683, + "loss": 1.5177, + "step": 8116 + }, + { + "epoch": 0.10547671296514376, + "grad_norm": 0.3844112455844879, + "learning_rate": 0.00017894175905587545, + "loss": 1.4317, + "step": 8117 + }, + { + "epoch": 0.10548970750905963, + "grad_norm": 0.3366958498954773, + "learning_rate": 0.00017893915959396405, + "loss": 1.1885, + "step": 8118 + }, + { + "epoch": 0.1055027020529755, + "grad_norm": 0.42144110798835754, + "learning_rate": 0.00017893656013205267, + "loss": 1.5554, + "step": 8119 + }, + { + "epoch": 0.10551569659689138, + "grad_norm": 0.4657565951347351, + "learning_rate": 0.0001789339606701413, + "loss": 1.599, + "step": 8120 + }, + { + "epoch": 0.10552869114080725, + "grad_norm": 0.2501119375228882, + "learning_rate": 0.0001789313612082299, + "loss": 1.2468, + "step": 8121 + }, + { + "epoch": 0.10554168568472312, + "grad_norm": 0.4707627296447754, + "learning_rate": 0.00017892876174631852, + "loss": 1.4058, + "step": 8122 + }, + { + "epoch": 0.105554680228639, + "grad_norm": 0.4245232343673706, + "learning_rate": 0.00017892616228440712, + "loss": 1.4248, + "step": 8123 + }, + { + "epoch": 0.10556767477255487, + "grad_norm": 0.3967687785625458, + "learning_rate": 0.00017892356282249577, + "loss": 1.5913, + "step": 8124 + }, + { + "epoch": 0.10558066931647074, + "grad_norm": 0.37496134638786316, + "learning_rate": 0.00017892096336058437, + "loss": 1.4844, + "step": 8125 + }, + { + "epoch": 0.10559366386038661, + "grad_norm": 0.38550230860710144, + "learning_rate": 0.000178918363898673, + "loss": 1.6091, + "step": 8126 + }, + { + "epoch": 0.1056066584043025, + "grad_norm": 0.39407652616500854, + "learning_rate": 0.0001789157644367616, + "loss": 1.3621, + "step": 8127 + }, + { + "epoch": 0.10561965294821837, + "grad_norm": 0.433112770318985, + "learning_rate": 0.00017891316497485021, + "loss": 1.4456, + "step": 8128 + }, + { + "epoch": 0.10563264749213425, + "grad_norm": 0.44419875741004944, + "learning_rate": 0.00017891056551293884, + "loss": 1.3648, + "step": 8129 + }, + { + "epoch": 0.10564564203605012, + "grad_norm": 0.38489314913749695, + "learning_rate": 0.00017890796605102744, + "loss": 1.2905, + "step": 8130 + }, + { + "epoch": 0.10565863657996599, + "grad_norm": 0.33085474371910095, + "learning_rate": 0.00017890536658911606, + "loss": 1.3691, + "step": 8131 + }, + { + "epoch": 0.10567163112388187, + "grad_norm": 0.4659140408039093, + "learning_rate": 0.00017890276712720468, + "loss": 1.5332, + "step": 8132 + }, + { + "epoch": 0.10568462566779774, + "grad_norm": 0.3782356381416321, + "learning_rate": 0.00017890016766529328, + "loss": 1.5257, + "step": 8133 + }, + { + "epoch": 0.10569762021171361, + "grad_norm": 0.4764174520969391, + "learning_rate": 0.0001788975682033819, + "loss": 1.5432, + "step": 8134 + }, + { + "epoch": 0.10571061475562948, + "grad_norm": 0.30911892652511597, + "learning_rate": 0.0001788949687414705, + "loss": 1.3439, + "step": 8135 + }, + { + "epoch": 0.10572360929954536, + "grad_norm": 0.43696072697639465, + "learning_rate": 0.00017889236927955916, + "loss": 1.2969, + "step": 8136 + }, + { + "epoch": 0.10573660384346123, + "grad_norm": 0.3797033131122589, + "learning_rate": 0.00017888976981764775, + "loss": 1.5047, + "step": 8137 + }, + { + "epoch": 0.1057495983873771, + "grad_norm": 0.3823290169239044, + "learning_rate": 0.00017888717035573638, + "loss": 1.4263, + "step": 8138 + }, + { + "epoch": 0.10576259293129298, + "grad_norm": 0.4489169120788574, + "learning_rate": 0.00017888457089382497, + "loss": 1.5657, + "step": 8139 + }, + { + "epoch": 0.10577558747520885, + "grad_norm": 0.3395143449306488, + "learning_rate": 0.0001788819714319136, + "loss": 1.5834, + "step": 8140 + }, + { + "epoch": 0.10578858201912472, + "grad_norm": 0.34869250655174255, + "learning_rate": 0.00017887937197000222, + "loss": 1.272, + "step": 8141 + }, + { + "epoch": 0.1058015765630406, + "grad_norm": 0.44896647334098816, + "learning_rate": 0.00017887677250809082, + "loss": 1.3581, + "step": 8142 + }, + { + "epoch": 0.10581457110695647, + "grad_norm": 0.7160615921020508, + "learning_rate": 0.00017887417304617945, + "loss": 1.663, + "step": 8143 + }, + { + "epoch": 0.10582756565087234, + "grad_norm": 0.38803112506866455, + "learning_rate": 0.00017887157358426807, + "loss": 1.4343, + "step": 8144 + }, + { + "epoch": 0.10584056019478821, + "grad_norm": 0.31806567311286926, + "learning_rate": 0.00017886897412235667, + "loss": 1.3541, + "step": 8145 + }, + { + "epoch": 0.10585355473870409, + "grad_norm": 0.331630140542984, + "learning_rate": 0.0001788663746604453, + "loss": 1.1125, + "step": 8146 + }, + { + "epoch": 0.10586654928261996, + "grad_norm": 0.4458574950695038, + "learning_rate": 0.00017886377519853392, + "loss": 1.4972, + "step": 8147 + }, + { + "epoch": 0.10587954382653583, + "grad_norm": 0.31424760818481445, + "learning_rate": 0.00017886117573662254, + "loss": 1.159, + "step": 8148 + }, + { + "epoch": 0.1058925383704517, + "grad_norm": 0.43325430154800415, + "learning_rate": 0.00017885857627471114, + "loss": 1.4445, + "step": 8149 + }, + { + "epoch": 0.10590553291436758, + "grad_norm": 0.37639760971069336, + "learning_rate": 0.00017885597681279976, + "loss": 1.5666, + "step": 8150 + }, + { + "epoch": 0.10591852745828345, + "grad_norm": 0.38434675335884094, + "learning_rate": 0.0001788533773508884, + "loss": 1.374, + "step": 8151 + }, + { + "epoch": 0.10593152200219932, + "grad_norm": 0.34173765778541565, + "learning_rate": 0.00017885077788897698, + "loss": 1.2533, + "step": 8152 + }, + { + "epoch": 0.1059445165461152, + "grad_norm": 0.3970118761062622, + "learning_rate": 0.0001788481784270656, + "loss": 1.5193, + "step": 8153 + }, + { + "epoch": 0.10595751109003107, + "grad_norm": 0.43598806858062744, + "learning_rate": 0.0001788455789651542, + "loss": 1.4516, + "step": 8154 + }, + { + "epoch": 0.10597050563394694, + "grad_norm": 0.3351729214191437, + "learning_rate": 0.00017884297950324286, + "loss": 1.2827, + "step": 8155 + }, + { + "epoch": 0.10598350017786282, + "grad_norm": 0.43314504623413086, + "learning_rate": 0.00017884038004133146, + "loss": 1.4287, + "step": 8156 + }, + { + "epoch": 0.10599649472177869, + "grad_norm": 0.506985604763031, + "learning_rate": 0.00017883778057942008, + "loss": 1.467, + "step": 8157 + }, + { + "epoch": 0.10600948926569456, + "grad_norm": 0.4688164293766022, + "learning_rate": 0.00017883518111750868, + "loss": 1.3713, + "step": 8158 + }, + { + "epoch": 0.10602248380961043, + "grad_norm": 0.3287079930305481, + "learning_rate": 0.0001788325816555973, + "loss": 1.2446, + "step": 8159 + }, + { + "epoch": 0.10603547835352631, + "grad_norm": 0.4234907031059265, + "learning_rate": 0.00017882998219368593, + "loss": 1.4933, + "step": 8160 + }, + { + "epoch": 0.10604847289744218, + "grad_norm": 0.44690683484077454, + "learning_rate": 0.00017882738273177452, + "loss": 1.5702, + "step": 8161 + }, + { + "epoch": 0.10606146744135805, + "grad_norm": 0.44422945380210876, + "learning_rate": 0.00017882478326986315, + "loss": 1.6069, + "step": 8162 + }, + { + "epoch": 0.10607446198527393, + "grad_norm": 0.26249459385871887, + "learning_rate": 0.00017882218380795177, + "loss": 1.4724, + "step": 8163 + }, + { + "epoch": 0.1060874565291898, + "grad_norm": 0.4622667729854584, + "learning_rate": 0.00017881958434604037, + "loss": 1.4932, + "step": 8164 + }, + { + "epoch": 0.10610045107310569, + "grad_norm": 0.44710975885391235, + "learning_rate": 0.000178816984884129, + "loss": 1.386, + "step": 8165 + }, + { + "epoch": 0.10611344561702156, + "grad_norm": 0.3701595664024353, + "learning_rate": 0.0001788143854222176, + "loss": 1.4438, + "step": 8166 + }, + { + "epoch": 0.10612644016093743, + "grad_norm": 0.19898907840251923, + "learning_rate": 0.00017881178596030624, + "loss": 1.2453, + "step": 8167 + }, + { + "epoch": 0.1061394347048533, + "grad_norm": 0.3883827030658722, + "learning_rate": 0.00017880918649839484, + "loss": 1.5868, + "step": 8168 + }, + { + "epoch": 0.10615242924876918, + "grad_norm": 0.4100607633590698, + "learning_rate": 0.00017880658703648347, + "loss": 1.5252, + "step": 8169 + }, + { + "epoch": 0.10616542379268505, + "grad_norm": 0.4724958539009094, + "learning_rate": 0.00017880398757457206, + "loss": 1.4307, + "step": 8170 + }, + { + "epoch": 0.10617841833660092, + "grad_norm": 0.5259565114974976, + "learning_rate": 0.0001788013881126607, + "loss": 1.3125, + "step": 8171 + }, + { + "epoch": 0.1061914128805168, + "grad_norm": 0.4943244457244873, + "learning_rate": 0.0001787987886507493, + "loss": 1.5503, + "step": 8172 + }, + { + "epoch": 0.10620440742443267, + "grad_norm": 0.29483816027641296, + "learning_rate": 0.0001787961891888379, + "loss": 1.3411, + "step": 8173 + }, + { + "epoch": 0.10621740196834854, + "grad_norm": 0.32326146960258484, + "learning_rate": 0.00017879358972692653, + "loss": 1.4104, + "step": 8174 + }, + { + "epoch": 0.10623039651226442, + "grad_norm": 0.33578529953956604, + "learning_rate": 0.00017879099026501516, + "loss": 1.5157, + "step": 8175 + }, + { + "epoch": 0.10624339105618029, + "grad_norm": 0.42963072657585144, + "learning_rate": 0.00017878839080310376, + "loss": 1.3957, + "step": 8176 + }, + { + "epoch": 0.10625638560009616, + "grad_norm": 0.434430330991745, + "learning_rate": 0.00017878579134119238, + "loss": 1.5243, + "step": 8177 + }, + { + "epoch": 0.10626938014401204, + "grad_norm": 0.3892306983470917, + "learning_rate": 0.00017878319187928098, + "loss": 1.5756, + "step": 8178 + }, + { + "epoch": 0.10628237468792791, + "grad_norm": 0.400583952665329, + "learning_rate": 0.00017878059241736963, + "loss": 1.4903, + "step": 8179 + }, + { + "epoch": 0.10629536923184378, + "grad_norm": 0.3598881959915161, + "learning_rate": 0.00017877799295545823, + "loss": 1.6074, + "step": 8180 + }, + { + "epoch": 0.10630836377575965, + "grad_norm": 0.4258742332458496, + "learning_rate": 0.00017877539349354685, + "loss": 1.1456, + "step": 8181 + }, + { + "epoch": 0.10632135831967553, + "grad_norm": 0.4188748002052307, + "learning_rate": 0.00017877279403163548, + "loss": 1.5721, + "step": 8182 + }, + { + "epoch": 0.1063343528635914, + "grad_norm": 0.3962958753108978, + "learning_rate": 0.00017877019456972407, + "loss": 1.7283, + "step": 8183 + }, + { + "epoch": 0.10634734740750727, + "grad_norm": 0.3618302345275879, + "learning_rate": 0.0001787675951078127, + "loss": 1.5692, + "step": 8184 + }, + { + "epoch": 0.10636034195142315, + "grad_norm": 0.3765220642089844, + "learning_rate": 0.0001787649956459013, + "loss": 1.4312, + "step": 8185 + }, + { + "epoch": 0.10637333649533902, + "grad_norm": 0.43553921580314636, + "learning_rate": 0.00017876239618398995, + "loss": 1.4953, + "step": 8186 + }, + { + "epoch": 0.10638633103925489, + "grad_norm": 0.3680141270160675, + "learning_rate": 0.00017875979672207854, + "loss": 1.5256, + "step": 8187 + }, + { + "epoch": 0.10639932558317076, + "grad_norm": 0.34125861525535583, + "learning_rate": 0.00017875719726016714, + "loss": 1.3869, + "step": 8188 + }, + { + "epoch": 0.10641232012708664, + "grad_norm": 0.3506383001804352, + "learning_rate": 0.00017875459779825577, + "loss": 1.6094, + "step": 8189 + }, + { + "epoch": 0.10642531467100251, + "grad_norm": 0.31801915168762207, + "learning_rate": 0.0001787519983363444, + "loss": 1.5161, + "step": 8190 + }, + { + "epoch": 0.10643830921491838, + "grad_norm": 0.283669650554657, + "learning_rate": 0.00017874939887443301, + "loss": 1.3582, + "step": 8191 + }, + { + "epoch": 0.10645130375883426, + "grad_norm": 0.3953026235103607, + "learning_rate": 0.0001787467994125216, + "loss": 1.4716, + "step": 8192 + }, + { + "epoch": 0.10646429830275013, + "grad_norm": 0.4820462763309479, + "learning_rate": 0.00017874419995061024, + "loss": 1.5582, + "step": 8193 + }, + { + "epoch": 0.106477292846666, + "grad_norm": 0.500321090221405, + "learning_rate": 0.00017874160048869886, + "loss": 1.5507, + "step": 8194 + }, + { + "epoch": 0.10649028739058188, + "grad_norm": 0.25349944829940796, + "learning_rate": 0.00017873900102678746, + "loss": 1.1298, + "step": 8195 + }, + { + "epoch": 0.10650328193449775, + "grad_norm": 0.32448187470436096, + "learning_rate": 0.00017873640156487608, + "loss": 1.4198, + "step": 8196 + }, + { + "epoch": 0.10651627647841362, + "grad_norm": 0.5821177959442139, + "learning_rate": 0.00017873380210296468, + "loss": 1.3302, + "step": 8197 + }, + { + "epoch": 0.1065292710223295, + "grad_norm": 0.3544238209724426, + "learning_rate": 0.00017873120264105333, + "loss": 1.4634, + "step": 8198 + }, + { + "epoch": 0.10654226556624537, + "grad_norm": 0.3090178966522217, + "learning_rate": 0.00017872860317914193, + "loss": 1.2475, + "step": 8199 + }, + { + "epoch": 0.10655526011016124, + "grad_norm": 0.39294150471687317, + "learning_rate": 0.00017872600371723053, + "loss": 1.5182, + "step": 8200 + }, + { + "epoch": 0.10656825465407711, + "grad_norm": 0.4525615870952606, + "learning_rate": 0.00017872340425531915, + "loss": 1.4657, + "step": 8201 + }, + { + "epoch": 0.10658124919799299, + "grad_norm": 0.3070647120475769, + "learning_rate": 0.00017872080479340778, + "loss": 1.3586, + "step": 8202 + }, + { + "epoch": 0.10659424374190887, + "grad_norm": 0.42253831028938293, + "learning_rate": 0.0001787182053314964, + "loss": 1.7267, + "step": 8203 + }, + { + "epoch": 0.10660723828582475, + "grad_norm": 0.4536573886871338, + "learning_rate": 0.000178715605869585, + "loss": 1.4703, + "step": 8204 + }, + { + "epoch": 0.10662023282974062, + "grad_norm": 0.41535767912864685, + "learning_rate": 0.00017871300640767362, + "loss": 1.4469, + "step": 8205 + }, + { + "epoch": 0.10663322737365649, + "grad_norm": 0.40323224663734436, + "learning_rate": 0.00017871040694576225, + "loss": 1.5483, + "step": 8206 + }, + { + "epoch": 0.10664622191757236, + "grad_norm": 0.39425545930862427, + "learning_rate": 0.00017870780748385084, + "loss": 1.5584, + "step": 8207 + }, + { + "epoch": 0.10665921646148824, + "grad_norm": 0.33285897970199585, + "learning_rate": 0.00017870520802193947, + "loss": 1.5746, + "step": 8208 + }, + { + "epoch": 0.10667221100540411, + "grad_norm": 0.46968698501586914, + "learning_rate": 0.00017870260856002807, + "loss": 1.5728, + "step": 8209 + }, + { + "epoch": 0.10668520554931998, + "grad_norm": 0.4709252119064331, + "learning_rate": 0.00017870000909811672, + "loss": 1.5017, + "step": 8210 + }, + { + "epoch": 0.10669820009323586, + "grad_norm": 0.3548968732357025, + "learning_rate": 0.00017869740963620531, + "loss": 1.194, + "step": 8211 + }, + { + "epoch": 0.10671119463715173, + "grad_norm": 0.2755083441734314, + "learning_rate": 0.00017869481017429394, + "loss": 1.3951, + "step": 8212 + }, + { + "epoch": 0.1067241891810676, + "grad_norm": 0.43299031257629395, + "learning_rate": 0.00017869221071238254, + "loss": 1.7226, + "step": 8213 + }, + { + "epoch": 0.10673718372498348, + "grad_norm": 0.4304381310939789, + "learning_rate": 0.00017868961125047116, + "loss": 1.5715, + "step": 8214 + }, + { + "epoch": 0.10675017826889935, + "grad_norm": 0.4301910996437073, + "learning_rate": 0.00017868701178855978, + "loss": 1.3712, + "step": 8215 + }, + { + "epoch": 0.10676317281281522, + "grad_norm": 0.4068301320075989, + "learning_rate": 0.00017868441232664838, + "loss": 1.2109, + "step": 8216 + }, + { + "epoch": 0.1067761673567311, + "grad_norm": 0.3920714259147644, + "learning_rate": 0.000178681812864737, + "loss": 1.3769, + "step": 8217 + }, + { + "epoch": 0.10678916190064697, + "grad_norm": 0.43683186173439026, + "learning_rate": 0.00017867921340282563, + "loss": 1.5601, + "step": 8218 + }, + { + "epoch": 0.10680215644456284, + "grad_norm": 0.414109468460083, + "learning_rate": 0.00017867661394091423, + "loss": 1.7161, + "step": 8219 + }, + { + "epoch": 0.10681515098847871, + "grad_norm": 0.44408103823661804, + "learning_rate": 0.00017867401447900285, + "loss": 1.4109, + "step": 8220 + }, + { + "epoch": 0.10682814553239459, + "grad_norm": 0.34131303429603577, + "learning_rate": 0.00017867141501709148, + "loss": 1.3086, + "step": 8221 + }, + { + "epoch": 0.10684114007631046, + "grad_norm": 0.35469838976860046, + "learning_rate": 0.0001786688155551801, + "loss": 1.1541, + "step": 8222 + }, + { + "epoch": 0.10685413462022633, + "grad_norm": 0.41554033756256104, + "learning_rate": 0.0001786662160932687, + "loss": 1.47, + "step": 8223 + }, + { + "epoch": 0.1068671291641422, + "grad_norm": 0.4296305179595947, + "learning_rate": 0.00017866361663135732, + "loss": 1.2554, + "step": 8224 + }, + { + "epoch": 0.10688012370805808, + "grad_norm": 0.4148561358451843, + "learning_rate": 0.00017866101716944595, + "loss": 1.5861, + "step": 8225 + }, + { + "epoch": 0.10689311825197395, + "grad_norm": 0.5389063954353333, + "learning_rate": 0.00017865841770753455, + "loss": 1.5225, + "step": 8226 + }, + { + "epoch": 0.10690611279588982, + "grad_norm": 0.4192093312740326, + "learning_rate": 0.00017865581824562317, + "loss": 1.3588, + "step": 8227 + }, + { + "epoch": 0.1069191073398057, + "grad_norm": 0.36082568764686584, + "learning_rate": 0.00017865321878371177, + "loss": 1.5087, + "step": 8228 + }, + { + "epoch": 0.10693210188372157, + "grad_norm": 0.3729625940322876, + "learning_rate": 0.0001786506193218004, + "loss": 1.275, + "step": 8229 + }, + { + "epoch": 0.10694509642763744, + "grad_norm": 0.3628804385662079, + "learning_rate": 0.00017864801985988902, + "loss": 1.3786, + "step": 8230 + }, + { + "epoch": 0.10695809097155332, + "grad_norm": 0.443193644285202, + "learning_rate": 0.00017864542039797761, + "loss": 1.5793, + "step": 8231 + }, + { + "epoch": 0.10697108551546919, + "grad_norm": 0.4084622263908386, + "learning_rate": 0.00017864282093606624, + "loss": 1.6569, + "step": 8232 + }, + { + "epoch": 0.10698408005938506, + "grad_norm": 0.3738953471183777, + "learning_rate": 0.00017864022147415486, + "loss": 1.2883, + "step": 8233 + }, + { + "epoch": 0.10699707460330093, + "grad_norm": 0.3393458127975464, + "learning_rate": 0.0001786376220122435, + "loss": 1.2927, + "step": 8234 + }, + { + "epoch": 0.10701006914721681, + "grad_norm": 0.5116984248161316, + "learning_rate": 0.00017863502255033208, + "loss": 1.6635, + "step": 8235 + }, + { + "epoch": 0.10702306369113268, + "grad_norm": 0.3769250214099884, + "learning_rate": 0.0001786324230884207, + "loss": 1.361, + "step": 8236 + }, + { + "epoch": 0.10703605823504855, + "grad_norm": 0.39683759212493896, + "learning_rate": 0.00017862982362650933, + "loss": 1.4072, + "step": 8237 + }, + { + "epoch": 0.10704905277896443, + "grad_norm": 0.40951600670814514, + "learning_rate": 0.00017862722416459793, + "loss": 1.4581, + "step": 8238 + }, + { + "epoch": 0.1070620473228803, + "grad_norm": 0.4117775559425354, + "learning_rate": 0.00017862462470268656, + "loss": 1.5397, + "step": 8239 + }, + { + "epoch": 0.10707504186679617, + "grad_norm": 0.37748417258262634, + "learning_rate": 0.00017862202524077515, + "loss": 1.3814, + "step": 8240 + }, + { + "epoch": 0.10708803641071206, + "grad_norm": 0.381867378950119, + "learning_rate": 0.0001786194257788638, + "loss": 1.3362, + "step": 8241 + }, + { + "epoch": 0.10710103095462793, + "grad_norm": 0.40221303701400757, + "learning_rate": 0.0001786168263169524, + "loss": 1.4331, + "step": 8242 + }, + { + "epoch": 0.1071140254985438, + "grad_norm": 0.3956340551376343, + "learning_rate": 0.000178614226855041, + "loss": 1.4068, + "step": 8243 + }, + { + "epoch": 0.10712702004245968, + "grad_norm": 0.43356987833976746, + "learning_rate": 0.00017861162739312962, + "loss": 1.411, + "step": 8244 + }, + { + "epoch": 0.10714001458637555, + "grad_norm": 0.3969554901123047, + "learning_rate": 0.00017860902793121825, + "loss": 1.5321, + "step": 8245 + }, + { + "epoch": 0.10715300913029142, + "grad_norm": 0.39121213555336, + "learning_rate": 0.00017860642846930687, + "loss": 1.4016, + "step": 8246 + }, + { + "epoch": 0.1071660036742073, + "grad_norm": 0.3624888062477112, + "learning_rate": 0.00017860382900739547, + "loss": 1.3461, + "step": 8247 + }, + { + "epoch": 0.10717899821812317, + "grad_norm": 0.3919220268726349, + "learning_rate": 0.0001786012295454841, + "loss": 1.4161, + "step": 8248 + }, + { + "epoch": 0.10719199276203904, + "grad_norm": 0.4599630832672119, + "learning_rate": 0.00017859863008357272, + "loss": 1.4957, + "step": 8249 + }, + { + "epoch": 0.10720498730595492, + "grad_norm": 0.3882891833782196, + "learning_rate": 0.00017859603062166132, + "loss": 1.4307, + "step": 8250 + }, + { + "epoch": 0.10721798184987079, + "grad_norm": 0.31193241477012634, + "learning_rate": 0.00017859343115974994, + "loss": 1.2459, + "step": 8251 + }, + { + "epoch": 0.10723097639378666, + "grad_norm": 0.3721153140068054, + "learning_rate": 0.00017859083169783854, + "loss": 1.4672, + "step": 8252 + }, + { + "epoch": 0.10724397093770253, + "grad_norm": 0.43315741419792175, + "learning_rate": 0.0001785882322359272, + "loss": 1.5726, + "step": 8253 + }, + { + "epoch": 0.10725696548161841, + "grad_norm": 0.38219547271728516, + "learning_rate": 0.0001785856327740158, + "loss": 1.5057, + "step": 8254 + }, + { + "epoch": 0.10726996002553428, + "grad_norm": 0.3723348379135132, + "learning_rate": 0.00017858303331210438, + "loss": 1.3849, + "step": 8255 + }, + { + "epoch": 0.10728295456945015, + "grad_norm": 0.3644058406352997, + "learning_rate": 0.00017858043385019304, + "loss": 1.2639, + "step": 8256 + }, + { + "epoch": 0.10729594911336603, + "grad_norm": 0.4135352373123169, + "learning_rate": 0.00017857783438828163, + "loss": 1.624, + "step": 8257 + }, + { + "epoch": 0.1073089436572819, + "grad_norm": 0.39943966269493103, + "learning_rate": 0.00017857523492637026, + "loss": 1.5365, + "step": 8258 + }, + { + "epoch": 0.10732193820119777, + "grad_norm": 0.49921396374702454, + "learning_rate": 0.00017857263546445886, + "loss": 1.601, + "step": 8259 + }, + { + "epoch": 0.10733493274511365, + "grad_norm": 0.34018537402153015, + "learning_rate": 0.00017857003600254748, + "loss": 1.5129, + "step": 8260 + }, + { + "epoch": 0.10734792728902952, + "grad_norm": 0.33740413188934326, + "learning_rate": 0.0001785674365406361, + "loss": 1.1765, + "step": 8261 + }, + { + "epoch": 0.10736092183294539, + "grad_norm": 0.3108792006969452, + "learning_rate": 0.0001785648370787247, + "loss": 1.1707, + "step": 8262 + }, + { + "epoch": 0.10737391637686126, + "grad_norm": 0.39253249764442444, + "learning_rate": 0.00017856223761681333, + "loss": 1.5172, + "step": 8263 + }, + { + "epoch": 0.10738691092077714, + "grad_norm": 0.34953510761260986, + "learning_rate": 0.00017855963815490195, + "loss": 1.3751, + "step": 8264 + }, + { + "epoch": 0.10739990546469301, + "grad_norm": 0.3397866487503052, + "learning_rate": 0.00017855703869299058, + "loss": 1.2223, + "step": 8265 + }, + { + "epoch": 0.10741290000860888, + "grad_norm": 0.34169650077819824, + "learning_rate": 0.00017855443923107917, + "loss": 1.365, + "step": 8266 + }, + { + "epoch": 0.10742589455252476, + "grad_norm": 0.492722749710083, + "learning_rate": 0.00017855183976916777, + "loss": 1.4904, + "step": 8267 + }, + { + "epoch": 0.10743888909644063, + "grad_norm": 0.3122013807296753, + "learning_rate": 0.00017854924030725642, + "loss": 1.241, + "step": 8268 + }, + { + "epoch": 0.1074518836403565, + "grad_norm": 0.36099573969841003, + "learning_rate": 0.00017854664084534502, + "loss": 1.6654, + "step": 8269 + }, + { + "epoch": 0.10746487818427237, + "grad_norm": 0.3124205470085144, + "learning_rate": 0.00017854404138343364, + "loss": 1.3006, + "step": 8270 + }, + { + "epoch": 0.10747787272818825, + "grad_norm": 0.4837087392807007, + "learning_rate": 0.00017854144192152224, + "loss": 1.4524, + "step": 8271 + }, + { + "epoch": 0.10749086727210412, + "grad_norm": 0.38991719484329224, + "learning_rate": 0.00017853884245961087, + "loss": 1.3326, + "step": 8272 + }, + { + "epoch": 0.10750386181602, + "grad_norm": 0.41984185576438904, + "learning_rate": 0.0001785362429976995, + "loss": 1.5051, + "step": 8273 + }, + { + "epoch": 0.10751685635993587, + "grad_norm": 0.34884050488471985, + "learning_rate": 0.0001785336435357881, + "loss": 1.3276, + "step": 8274 + }, + { + "epoch": 0.10752985090385174, + "grad_norm": 0.44545289874076843, + "learning_rate": 0.0001785310440738767, + "loss": 1.2967, + "step": 8275 + }, + { + "epoch": 0.10754284544776761, + "grad_norm": 0.45583096146583557, + "learning_rate": 0.00017852844461196534, + "loss": 1.5243, + "step": 8276 + }, + { + "epoch": 0.10755583999168349, + "grad_norm": 0.3560146391391754, + "learning_rate": 0.00017852584515005396, + "loss": 1.4024, + "step": 8277 + }, + { + "epoch": 0.10756883453559936, + "grad_norm": 0.42076122760772705, + "learning_rate": 0.00017852324568814256, + "loss": 1.36, + "step": 8278 + }, + { + "epoch": 0.10758182907951525, + "grad_norm": 0.39722198247909546, + "learning_rate": 0.00017852064622623118, + "loss": 1.4957, + "step": 8279 + }, + { + "epoch": 0.10759482362343112, + "grad_norm": 0.4036417603492737, + "learning_rate": 0.0001785180467643198, + "loss": 1.5451, + "step": 8280 + }, + { + "epoch": 0.10760781816734699, + "grad_norm": 0.4436379373073578, + "learning_rate": 0.0001785154473024084, + "loss": 1.361, + "step": 8281 + }, + { + "epoch": 0.10762081271126286, + "grad_norm": 0.5275803804397583, + "learning_rate": 0.00017851284784049703, + "loss": 1.4568, + "step": 8282 + }, + { + "epoch": 0.10763380725517874, + "grad_norm": 0.5393362045288086, + "learning_rate": 0.00017851024837858563, + "loss": 1.5041, + "step": 8283 + }, + { + "epoch": 0.10764680179909461, + "grad_norm": 0.3804057538509369, + "learning_rate": 0.00017850764891667425, + "loss": 1.3058, + "step": 8284 + }, + { + "epoch": 0.10765979634301048, + "grad_norm": 0.5642374157905579, + "learning_rate": 0.00017850504945476288, + "loss": 1.4152, + "step": 8285 + }, + { + "epoch": 0.10767279088692636, + "grad_norm": 0.3726355731487274, + "learning_rate": 0.00017850244999285147, + "loss": 1.6072, + "step": 8286 + }, + { + "epoch": 0.10768578543084223, + "grad_norm": 0.4349422752857208, + "learning_rate": 0.0001784998505309401, + "loss": 1.5623, + "step": 8287 + }, + { + "epoch": 0.1076987799747581, + "grad_norm": 0.39895492792129517, + "learning_rate": 0.00017849725106902872, + "loss": 1.4752, + "step": 8288 + }, + { + "epoch": 0.10771177451867398, + "grad_norm": 0.2515937089920044, + "learning_rate": 0.00017849465160711735, + "loss": 1.1081, + "step": 8289 + }, + { + "epoch": 0.10772476906258985, + "grad_norm": 0.4791084825992584, + "learning_rate": 0.00017849205214520594, + "loss": 1.7281, + "step": 8290 + }, + { + "epoch": 0.10773776360650572, + "grad_norm": 0.5024106502532959, + "learning_rate": 0.00017848945268329457, + "loss": 1.3555, + "step": 8291 + }, + { + "epoch": 0.1077507581504216, + "grad_norm": 0.5199039578437805, + "learning_rate": 0.0001784868532213832, + "loss": 1.5712, + "step": 8292 + }, + { + "epoch": 0.10776375269433747, + "grad_norm": 0.34852391481399536, + "learning_rate": 0.0001784842537594718, + "loss": 1.292, + "step": 8293 + }, + { + "epoch": 0.10777674723825334, + "grad_norm": 0.4339490234851837, + "learning_rate": 0.00017848165429756041, + "loss": 1.4883, + "step": 8294 + }, + { + "epoch": 0.10778974178216921, + "grad_norm": 0.5038226246833801, + "learning_rate": 0.00017847905483564904, + "loss": 1.5377, + "step": 8295 + }, + { + "epoch": 0.10780273632608509, + "grad_norm": 0.3878563940525055, + "learning_rate": 0.00017847645537373766, + "loss": 1.4318, + "step": 8296 + }, + { + "epoch": 0.10781573087000096, + "grad_norm": 0.2645588517189026, + "learning_rate": 0.00017847385591182626, + "loss": 1.4389, + "step": 8297 + }, + { + "epoch": 0.10782872541391683, + "grad_norm": 0.44635987281799316, + "learning_rate": 0.00017847125644991486, + "loss": 1.5234, + "step": 8298 + }, + { + "epoch": 0.1078417199578327, + "grad_norm": 0.4055132567882538, + "learning_rate": 0.0001784686569880035, + "loss": 1.352, + "step": 8299 + }, + { + "epoch": 0.10785471450174858, + "grad_norm": 0.3837491571903229, + "learning_rate": 0.0001784660575260921, + "loss": 1.7072, + "step": 8300 + }, + { + "epoch": 0.10786770904566445, + "grad_norm": 0.47413310408592224, + "learning_rate": 0.00017846345806418073, + "loss": 1.5348, + "step": 8301 + }, + { + "epoch": 0.10788070358958032, + "grad_norm": 0.36519861221313477, + "learning_rate": 0.00017846085860226933, + "loss": 1.4536, + "step": 8302 + }, + { + "epoch": 0.1078936981334962, + "grad_norm": 0.3981136679649353, + "learning_rate": 0.00017845825914035795, + "loss": 1.3759, + "step": 8303 + }, + { + "epoch": 0.10790669267741207, + "grad_norm": 0.3445344865322113, + "learning_rate": 0.00017845565967844658, + "loss": 1.556, + "step": 8304 + }, + { + "epoch": 0.10791968722132794, + "grad_norm": 0.40029338002204895, + "learning_rate": 0.00017845306021653518, + "loss": 1.6734, + "step": 8305 + }, + { + "epoch": 0.10793268176524382, + "grad_norm": 0.45845022797584534, + "learning_rate": 0.0001784504607546238, + "loss": 1.3827, + "step": 8306 + }, + { + "epoch": 0.10794567630915969, + "grad_norm": 0.41059330105781555, + "learning_rate": 0.00017844786129271242, + "loss": 1.4094, + "step": 8307 + }, + { + "epoch": 0.10795867085307556, + "grad_norm": 0.45285895466804504, + "learning_rate": 0.00017844526183080105, + "loss": 1.414, + "step": 8308 + }, + { + "epoch": 0.10797166539699143, + "grad_norm": 0.3477665185928345, + "learning_rate": 0.00017844266236888965, + "loss": 1.4796, + "step": 8309 + }, + { + "epoch": 0.10798465994090731, + "grad_norm": 0.2362794131040573, + "learning_rate": 0.00017844006290697824, + "loss": 1.1584, + "step": 8310 + }, + { + "epoch": 0.10799765448482318, + "grad_norm": 0.38085436820983887, + "learning_rate": 0.0001784374634450669, + "loss": 1.186, + "step": 8311 + }, + { + "epoch": 0.10801064902873905, + "grad_norm": 0.38006147742271423, + "learning_rate": 0.0001784348639831555, + "loss": 1.4155, + "step": 8312 + }, + { + "epoch": 0.10802364357265493, + "grad_norm": 0.3679640591144562, + "learning_rate": 0.00017843226452124412, + "loss": 1.4904, + "step": 8313 + }, + { + "epoch": 0.1080366381165708, + "grad_norm": 0.46301308274269104, + "learning_rate": 0.00017842966505933271, + "loss": 1.588, + "step": 8314 + }, + { + "epoch": 0.10804963266048667, + "grad_norm": 0.35534656047821045, + "learning_rate": 0.00017842706559742134, + "loss": 1.2514, + "step": 8315 + }, + { + "epoch": 0.10806262720440254, + "grad_norm": 0.3737853169441223, + "learning_rate": 0.00017842446613550996, + "loss": 1.2733, + "step": 8316 + }, + { + "epoch": 0.10807562174831843, + "grad_norm": 0.4065062701702118, + "learning_rate": 0.00017842186667359856, + "loss": 1.3768, + "step": 8317 + }, + { + "epoch": 0.1080886162922343, + "grad_norm": 0.42382216453552246, + "learning_rate": 0.00017841926721168719, + "loss": 1.5366, + "step": 8318 + }, + { + "epoch": 0.10810161083615018, + "grad_norm": 0.3473309576511383, + "learning_rate": 0.0001784166677497758, + "loss": 1.3417, + "step": 8319 + }, + { + "epoch": 0.10811460538006605, + "grad_norm": 0.388875275850296, + "learning_rate": 0.00017841406828786443, + "loss": 1.5457, + "step": 8320 + }, + { + "epoch": 0.10812759992398192, + "grad_norm": 0.5575242638587952, + "learning_rate": 0.00017841146882595303, + "loss": 1.4715, + "step": 8321 + }, + { + "epoch": 0.1081405944678978, + "grad_norm": 0.43632930517196655, + "learning_rate": 0.00017840886936404163, + "loss": 1.4983, + "step": 8322 + }, + { + "epoch": 0.10815358901181367, + "grad_norm": 0.5354071259498596, + "learning_rate": 0.00017840626990213028, + "loss": 1.444, + "step": 8323 + }, + { + "epoch": 0.10816658355572954, + "grad_norm": 0.4031495749950409, + "learning_rate": 0.00017840367044021888, + "loss": 1.4035, + "step": 8324 + }, + { + "epoch": 0.10817957809964542, + "grad_norm": 0.355292946100235, + "learning_rate": 0.0001784010709783075, + "loss": 1.4213, + "step": 8325 + }, + { + "epoch": 0.10819257264356129, + "grad_norm": 0.44800370931625366, + "learning_rate": 0.0001783984715163961, + "loss": 1.3318, + "step": 8326 + }, + { + "epoch": 0.10820556718747716, + "grad_norm": 0.3072616457939148, + "learning_rate": 0.00017839587205448472, + "loss": 1.3076, + "step": 8327 + }, + { + "epoch": 0.10821856173139303, + "grad_norm": 0.30312517285346985, + "learning_rate": 0.00017839327259257335, + "loss": 1.4326, + "step": 8328 + }, + { + "epoch": 0.10823155627530891, + "grad_norm": 0.4308004081249237, + "learning_rate": 0.00017839067313066195, + "loss": 1.4755, + "step": 8329 + }, + { + "epoch": 0.10824455081922478, + "grad_norm": 0.40817609429359436, + "learning_rate": 0.0001783880736687506, + "loss": 1.3326, + "step": 8330 + }, + { + "epoch": 0.10825754536314065, + "grad_norm": 0.3512544631958008, + "learning_rate": 0.0001783854742068392, + "loss": 1.4777, + "step": 8331 + }, + { + "epoch": 0.10827053990705653, + "grad_norm": 0.4642842710018158, + "learning_rate": 0.00017838287474492782, + "loss": 1.4219, + "step": 8332 + }, + { + "epoch": 0.1082835344509724, + "grad_norm": 0.2809307277202606, + "learning_rate": 0.00017838027528301642, + "loss": 1.0385, + "step": 8333 + }, + { + "epoch": 0.10829652899488827, + "grad_norm": 0.44858652353286743, + "learning_rate": 0.00017837767582110504, + "loss": 1.4492, + "step": 8334 + }, + { + "epoch": 0.10830952353880414, + "grad_norm": 0.3510497808456421, + "learning_rate": 0.00017837507635919367, + "loss": 1.5068, + "step": 8335 + }, + { + "epoch": 0.10832251808272002, + "grad_norm": 0.35914674401283264, + "learning_rate": 0.00017837247689728226, + "loss": 1.3606, + "step": 8336 + }, + { + "epoch": 0.10833551262663589, + "grad_norm": 0.3239414095878601, + "learning_rate": 0.0001783698774353709, + "loss": 1.2075, + "step": 8337 + }, + { + "epoch": 0.10834850717055176, + "grad_norm": 0.45280778408050537, + "learning_rate": 0.0001783672779734595, + "loss": 1.4402, + "step": 8338 + }, + { + "epoch": 0.10836150171446764, + "grad_norm": 0.41107428073883057, + "learning_rate": 0.0001783646785115481, + "loss": 1.4391, + "step": 8339 + }, + { + "epoch": 0.10837449625838351, + "grad_norm": 0.47195643186569214, + "learning_rate": 0.00017836207904963673, + "loss": 1.2798, + "step": 8340 + }, + { + "epoch": 0.10838749080229938, + "grad_norm": 0.3820756673812866, + "learning_rate": 0.00017835947958772533, + "loss": 1.2492, + "step": 8341 + }, + { + "epoch": 0.10840048534621526, + "grad_norm": 0.3791525363922119, + "learning_rate": 0.00017835688012581398, + "loss": 1.3166, + "step": 8342 + }, + { + "epoch": 0.10841347989013113, + "grad_norm": 0.3632073700428009, + "learning_rate": 0.00017835428066390258, + "loss": 1.308, + "step": 8343 + }, + { + "epoch": 0.108426474434047, + "grad_norm": 0.40637534856796265, + "learning_rate": 0.0001783516812019912, + "loss": 1.3872, + "step": 8344 + }, + { + "epoch": 0.10843946897796287, + "grad_norm": 0.40175411105155945, + "learning_rate": 0.0001783490817400798, + "loss": 1.3106, + "step": 8345 + }, + { + "epoch": 0.10845246352187875, + "grad_norm": 0.5319552421569824, + "learning_rate": 0.00017834648227816843, + "loss": 1.3413, + "step": 8346 + }, + { + "epoch": 0.10846545806579462, + "grad_norm": 0.36020201444625854, + "learning_rate": 0.00017834388281625705, + "loss": 1.3822, + "step": 8347 + }, + { + "epoch": 0.1084784526097105, + "grad_norm": 0.3974641263484955, + "learning_rate": 0.00017834128335434565, + "loss": 1.5008, + "step": 8348 + }, + { + "epoch": 0.10849144715362637, + "grad_norm": 0.4384458661079407, + "learning_rate": 0.00017833868389243427, + "loss": 1.3096, + "step": 8349 + }, + { + "epoch": 0.10850444169754224, + "grad_norm": 0.33949577808380127, + "learning_rate": 0.0001783360844305229, + "loss": 1.4741, + "step": 8350 + }, + { + "epoch": 0.10851743624145811, + "grad_norm": 0.47890397906303406, + "learning_rate": 0.0001783334849686115, + "loss": 1.5686, + "step": 8351 + }, + { + "epoch": 0.10853043078537399, + "grad_norm": 0.3752754032611847, + "learning_rate": 0.00017833088550670012, + "loss": 1.2255, + "step": 8352 + }, + { + "epoch": 0.10854342532928986, + "grad_norm": 0.24401240050792694, + "learning_rate": 0.00017832828604478872, + "loss": 1.2197, + "step": 8353 + }, + { + "epoch": 0.10855641987320573, + "grad_norm": 0.3515971899032593, + "learning_rate": 0.00017832568658287737, + "loss": 1.3718, + "step": 8354 + }, + { + "epoch": 0.10856941441712162, + "grad_norm": 0.44225677847862244, + "learning_rate": 0.00017832308712096597, + "loss": 1.5992, + "step": 8355 + }, + { + "epoch": 0.10858240896103749, + "grad_norm": 0.47988206148147583, + "learning_rate": 0.0001783204876590546, + "loss": 1.3909, + "step": 8356 + }, + { + "epoch": 0.10859540350495336, + "grad_norm": 0.4157367944717407, + "learning_rate": 0.0001783178881971432, + "loss": 1.5578, + "step": 8357 + }, + { + "epoch": 0.10860839804886924, + "grad_norm": 0.38189607858657837, + "learning_rate": 0.0001783152887352318, + "loss": 1.5537, + "step": 8358 + }, + { + "epoch": 0.10862139259278511, + "grad_norm": 0.4212566316127777, + "learning_rate": 0.00017831268927332044, + "loss": 1.4137, + "step": 8359 + }, + { + "epoch": 0.10863438713670098, + "grad_norm": 0.31272128224372864, + "learning_rate": 0.00017831008981140903, + "loss": 1.2933, + "step": 8360 + }, + { + "epoch": 0.10864738168061686, + "grad_norm": 0.4786490797996521, + "learning_rate": 0.00017830749034949766, + "loss": 1.5483, + "step": 8361 + }, + { + "epoch": 0.10866037622453273, + "grad_norm": 0.3467690348625183, + "learning_rate": 0.00017830489088758628, + "loss": 1.2842, + "step": 8362 + }, + { + "epoch": 0.1086733707684486, + "grad_norm": 0.4524156451225281, + "learning_rate": 0.0001783022914256749, + "loss": 1.4121, + "step": 8363 + }, + { + "epoch": 0.10868636531236447, + "grad_norm": 0.41800379753112793, + "learning_rate": 0.0001782996919637635, + "loss": 1.5609, + "step": 8364 + }, + { + "epoch": 0.10869935985628035, + "grad_norm": 0.3651736080646515, + "learning_rate": 0.0001782970925018521, + "loss": 1.5597, + "step": 8365 + }, + { + "epoch": 0.10871235440019622, + "grad_norm": 0.3153468668460846, + "learning_rate": 0.00017829449303994075, + "loss": 1.4299, + "step": 8366 + }, + { + "epoch": 0.1087253489441121, + "grad_norm": 0.4016343951225281, + "learning_rate": 0.00017829189357802935, + "loss": 1.421, + "step": 8367 + }, + { + "epoch": 0.10873834348802797, + "grad_norm": 0.37183037400245667, + "learning_rate": 0.00017828929411611798, + "loss": 1.3028, + "step": 8368 + }, + { + "epoch": 0.10875133803194384, + "grad_norm": 0.3810477554798126, + "learning_rate": 0.0001782866946542066, + "loss": 1.5618, + "step": 8369 + }, + { + "epoch": 0.10876433257585971, + "grad_norm": 0.348891943693161, + "learning_rate": 0.0001782840951922952, + "loss": 1.4816, + "step": 8370 + }, + { + "epoch": 0.10877732711977559, + "grad_norm": 0.45606300234794617, + "learning_rate": 0.00017828149573038382, + "loss": 1.3803, + "step": 8371 + }, + { + "epoch": 0.10879032166369146, + "grad_norm": 0.4514254629611969, + "learning_rate": 0.00017827889626847242, + "loss": 1.5694, + "step": 8372 + }, + { + "epoch": 0.10880331620760733, + "grad_norm": 0.332529753446579, + "learning_rate": 0.00017827629680656107, + "loss": 1.3346, + "step": 8373 + }, + { + "epoch": 0.1088163107515232, + "grad_norm": 0.3527812361717224, + "learning_rate": 0.00017827369734464967, + "loss": 1.4949, + "step": 8374 + }, + { + "epoch": 0.10882930529543908, + "grad_norm": 0.43754810094833374, + "learning_rate": 0.0001782710978827383, + "loss": 1.4718, + "step": 8375 + }, + { + "epoch": 0.10884229983935495, + "grad_norm": 0.3545166254043579, + "learning_rate": 0.0001782684984208269, + "loss": 1.5941, + "step": 8376 + }, + { + "epoch": 0.10885529438327082, + "grad_norm": 0.4640529453754425, + "learning_rate": 0.00017826589895891551, + "loss": 1.521, + "step": 8377 + }, + { + "epoch": 0.1088682889271867, + "grad_norm": 0.39585447311401367, + "learning_rate": 0.00017826329949700414, + "loss": 1.4753, + "step": 8378 + }, + { + "epoch": 0.10888128347110257, + "grad_norm": 0.3528577387332916, + "learning_rate": 0.00017826070003509274, + "loss": 1.5648, + "step": 8379 + }, + { + "epoch": 0.10889427801501844, + "grad_norm": 0.30275285243988037, + "learning_rate": 0.00017825810057318136, + "loss": 1.5874, + "step": 8380 + }, + { + "epoch": 0.10890727255893431, + "grad_norm": 0.4410100281238556, + "learning_rate": 0.00017825550111126999, + "loss": 1.7, + "step": 8381 + }, + { + "epoch": 0.10892026710285019, + "grad_norm": 0.40226468443870544, + "learning_rate": 0.00017825290164935858, + "loss": 1.4831, + "step": 8382 + }, + { + "epoch": 0.10893326164676606, + "grad_norm": 0.33901968598365784, + "learning_rate": 0.0001782503021874472, + "loss": 1.4329, + "step": 8383 + }, + { + "epoch": 0.10894625619068193, + "grad_norm": 0.28389546275138855, + "learning_rate": 0.0001782477027255358, + "loss": 1.2968, + "step": 8384 + }, + { + "epoch": 0.1089592507345978, + "grad_norm": 0.3036123216152191, + "learning_rate": 0.00017824510326362446, + "loss": 1.2778, + "step": 8385 + }, + { + "epoch": 0.10897224527851368, + "grad_norm": 0.40709248185157776, + "learning_rate": 0.00017824250380171305, + "loss": 1.6276, + "step": 8386 + }, + { + "epoch": 0.10898523982242955, + "grad_norm": 0.6016970872879028, + "learning_rate": 0.00017823990433980168, + "loss": 1.4721, + "step": 8387 + }, + { + "epoch": 0.10899823436634543, + "grad_norm": 0.4364887475967407, + "learning_rate": 0.00017823730487789028, + "loss": 1.5284, + "step": 8388 + }, + { + "epoch": 0.1090112289102613, + "grad_norm": 0.33313050866127014, + "learning_rate": 0.0001782347054159789, + "loss": 1.493, + "step": 8389 + }, + { + "epoch": 0.10902422345417717, + "grad_norm": 0.4251091182231903, + "learning_rate": 0.00017823210595406752, + "loss": 1.4608, + "step": 8390 + }, + { + "epoch": 0.10903721799809304, + "grad_norm": 0.49666833877563477, + "learning_rate": 0.00017822950649215612, + "loss": 1.5754, + "step": 8391 + }, + { + "epoch": 0.10905021254200892, + "grad_norm": 0.4272218644618988, + "learning_rate": 0.00017822690703024475, + "loss": 1.3725, + "step": 8392 + }, + { + "epoch": 0.10906320708592479, + "grad_norm": 0.322427362203598, + "learning_rate": 0.00017822430756833337, + "loss": 1.3875, + "step": 8393 + }, + { + "epoch": 0.10907620162984068, + "grad_norm": 0.3300779461860657, + "learning_rate": 0.00017822170810642197, + "loss": 1.3297, + "step": 8394 + }, + { + "epoch": 0.10908919617375655, + "grad_norm": 0.341291606426239, + "learning_rate": 0.0001782191086445106, + "loss": 1.3353, + "step": 8395 + }, + { + "epoch": 0.10910219071767242, + "grad_norm": 0.4103115499019623, + "learning_rate": 0.0001782165091825992, + "loss": 1.5571, + "step": 8396 + }, + { + "epoch": 0.1091151852615883, + "grad_norm": 0.4109514653682709, + "learning_rate": 0.00017821390972068784, + "loss": 1.2875, + "step": 8397 + }, + { + "epoch": 0.10912817980550417, + "grad_norm": 0.4343460500240326, + "learning_rate": 0.00017821131025877644, + "loss": 1.2937, + "step": 8398 + }, + { + "epoch": 0.10914117434942004, + "grad_norm": 0.32695797085762024, + "learning_rate": 0.00017820871079686506, + "loss": 1.3604, + "step": 8399 + }, + { + "epoch": 0.10915416889333591, + "grad_norm": 0.31041043996810913, + "learning_rate": 0.00017820611133495366, + "loss": 1.2872, + "step": 8400 + }, + { + "epoch": 0.10916716343725179, + "grad_norm": 0.4419708549976349, + "learning_rate": 0.00017820351187304229, + "loss": 1.5659, + "step": 8401 + }, + { + "epoch": 0.10918015798116766, + "grad_norm": 0.4739778935909271, + "learning_rate": 0.0001782009124111309, + "loss": 1.3573, + "step": 8402 + }, + { + "epoch": 0.10919315252508353, + "grad_norm": 0.41323843598365784, + "learning_rate": 0.0001781983129492195, + "loss": 1.4687, + "step": 8403 + }, + { + "epoch": 0.1092061470689994, + "grad_norm": 0.377329021692276, + "learning_rate": 0.00017819571348730816, + "loss": 1.1409, + "step": 8404 + }, + { + "epoch": 0.10921914161291528, + "grad_norm": 0.324496865272522, + "learning_rate": 0.00017819311402539676, + "loss": 1.4109, + "step": 8405 + }, + { + "epoch": 0.10923213615683115, + "grad_norm": 0.3908027708530426, + "learning_rate": 0.00017819051456348535, + "loss": 1.5341, + "step": 8406 + }, + { + "epoch": 0.10924513070074703, + "grad_norm": 0.41613420844078064, + "learning_rate": 0.00017818791510157398, + "loss": 1.3407, + "step": 8407 + }, + { + "epoch": 0.1092581252446629, + "grad_norm": 0.5755199790000916, + "learning_rate": 0.0001781853156396626, + "loss": 1.6164, + "step": 8408 + }, + { + "epoch": 0.10927111978857877, + "grad_norm": 0.42957574129104614, + "learning_rate": 0.00017818271617775123, + "loss": 1.3906, + "step": 8409 + }, + { + "epoch": 0.10928411433249464, + "grad_norm": 0.31957948207855225, + "learning_rate": 0.00017818011671583982, + "loss": 1.2539, + "step": 8410 + }, + { + "epoch": 0.10929710887641052, + "grad_norm": 0.38507527112960815, + "learning_rate": 0.00017817751725392845, + "loss": 1.488, + "step": 8411 + }, + { + "epoch": 0.10931010342032639, + "grad_norm": 0.47600457072257996, + "learning_rate": 0.00017817491779201707, + "loss": 1.5102, + "step": 8412 + }, + { + "epoch": 0.10932309796424226, + "grad_norm": 0.37596261501312256, + "learning_rate": 0.00017817231833010567, + "loss": 1.4317, + "step": 8413 + }, + { + "epoch": 0.10933609250815814, + "grad_norm": 0.49288859963417053, + "learning_rate": 0.0001781697188681943, + "loss": 1.5681, + "step": 8414 + }, + { + "epoch": 0.10934908705207401, + "grad_norm": 0.3904706835746765, + "learning_rate": 0.0001781671194062829, + "loss": 1.3318, + "step": 8415 + }, + { + "epoch": 0.10936208159598988, + "grad_norm": 0.29792383313179016, + "learning_rate": 0.00017816451994437154, + "loss": 1.2497, + "step": 8416 + }, + { + "epoch": 0.10937507613990576, + "grad_norm": 0.3291919231414795, + "learning_rate": 0.00017816192048246014, + "loss": 1.2146, + "step": 8417 + }, + { + "epoch": 0.10938807068382163, + "grad_norm": 0.37436097860336304, + "learning_rate": 0.00017815932102054877, + "loss": 1.4496, + "step": 8418 + }, + { + "epoch": 0.1094010652277375, + "grad_norm": 0.33629274368286133, + "learning_rate": 0.00017815672155863736, + "loss": 1.4727, + "step": 8419 + }, + { + "epoch": 0.10941405977165337, + "grad_norm": 0.36726537346839905, + "learning_rate": 0.000178154122096726, + "loss": 1.3965, + "step": 8420 + }, + { + "epoch": 0.10942705431556925, + "grad_norm": 0.32399168610572815, + "learning_rate": 0.0001781515226348146, + "loss": 1.2605, + "step": 8421 + }, + { + "epoch": 0.10944004885948512, + "grad_norm": 0.3672553598880768, + "learning_rate": 0.0001781489231729032, + "loss": 1.4345, + "step": 8422 + }, + { + "epoch": 0.10945304340340099, + "grad_norm": 0.378427118062973, + "learning_rate": 0.00017814632371099183, + "loss": 1.3482, + "step": 8423 + }, + { + "epoch": 0.10946603794731687, + "grad_norm": 0.4666999876499176, + "learning_rate": 0.00017814372424908046, + "loss": 1.6318, + "step": 8424 + }, + { + "epoch": 0.10947903249123274, + "grad_norm": 0.3476693630218506, + "learning_rate": 0.00017814112478716906, + "loss": 1.316, + "step": 8425 + }, + { + "epoch": 0.10949202703514861, + "grad_norm": 0.35928329825401306, + "learning_rate": 0.00017813852532525768, + "loss": 1.3788, + "step": 8426 + }, + { + "epoch": 0.10950502157906448, + "grad_norm": 0.4357830286026001, + "learning_rate": 0.00017813592586334628, + "loss": 1.521, + "step": 8427 + }, + { + "epoch": 0.10951801612298036, + "grad_norm": 0.4237259328365326, + "learning_rate": 0.00017813332640143493, + "loss": 1.3429, + "step": 8428 + }, + { + "epoch": 0.10953101066689623, + "grad_norm": 0.3974064886569977, + "learning_rate": 0.00017813072693952353, + "loss": 1.3985, + "step": 8429 + }, + { + "epoch": 0.1095440052108121, + "grad_norm": 0.42458170652389526, + "learning_rate": 0.00017812812747761215, + "loss": 1.535, + "step": 8430 + }, + { + "epoch": 0.10955699975472798, + "grad_norm": 0.404532253742218, + "learning_rate": 0.00017812552801570075, + "loss": 1.3499, + "step": 8431 + }, + { + "epoch": 0.10956999429864386, + "grad_norm": 0.3189915120601654, + "learning_rate": 0.00017812292855378937, + "loss": 1.3588, + "step": 8432 + }, + { + "epoch": 0.10958298884255974, + "grad_norm": 0.33536967635154724, + "learning_rate": 0.000178120329091878, + "loss": 1.4059, + "step": 8433 + }, + { + "epoch": 0.10959598338647561, + "grad_norm": 0.26062992215156555, + "learning_rate": 0.0001781177296299666, + "loss": 1.3249, + "step": 8434 + }, + { + "epoch": 0.10960897793039148, + "grad_norm": 0.35935214161872864, + "learning_rate": 0.00017811513016805522, + "loss": 1.4979, + "step": 8435 + }, + { + "epoch": 0.10962197247430736, + "grad_norm": 0.4060344994068146, + "learning_rate": 0.00017811253070614384, + "loss": 1.2814, + "step": 8436 + }, + { + "epoch": 0.10963496701822323, + "grad_norm": 0.37158462405204773, + "learning_rate": 0.00017810993124423244, + "loss": 1.476, + "step": 8437 + }, + { + "epoch": 0.1096479615621391, + "grad_norm": 0.3771154582500458, + "learning_rate": 0.00017810733178232107, + "loss": 1.5873, + "step": 8438 + }, + { + "epoch": 0.10966095610605497, + "grad_norm": 0.447327584028244, + "learning_rate": 0.00017810473232040966, + "loss": 1.6553, + "step": 8439 + }, + { + "epoch": 0.10967395064997085, + "grad_norm": 0.45415040850639343, + "learning_rate": 0.00017810213285849832, + "loss": 1.4273, + "step": 8440 + }, + { + "epoch": 0.10968694519388672, + "grad_norm": 0.4562950134277344, + "learning_rate": 0.0001780995333965869, + "loss": 1.5406, + "step": 8441 + }, + { + "epoch": 0.1096999397378026, + "grad_norm": 0.2560071051120758, + "learning_rate": 0.00017809693393467554, + "loss": 1.3796, + "step": 8442 + }, + { + "epoch": 0.10971293428171847, + "grad_norm": 0.4355359375476837, + "learning_rate": 0.00017809433447276416, + "loss": 1.5637, + "step": 8443 + }, + { + "epoch": 0.10972592882563434, + "grad_norm": 0.3415091633796692, + "learning_rate": 0.00017809173501085276, + "loss": 1.419, + "step": 8444 + }, + { + "epoch": 0.10973892336955021, + "grad_norm": 0.5199602246284485, + "learning_rate": 0.00017808913554894138, + "loss": 1.679, + "step": 8445 + }, + { + "epoch": 0.10975191791346608, + "grad_norm": 0.3922266364097595, + "learning_rate": 0.00017808653608702998, + "loss": 1.4026, + "step": 8446 + }, + { + "epoch": 0.10976491245738196, + "grad_norm": 0.4205973446369171, + "learning_rate": 0.00017808393662511863, + "loss": 1.6379, + "step": 8447 + }, + { + "epoch": 0.10977790700129783, + "grad_norm": 0.4112226366996765, + "learning_rate": 0.00017808133716320723, + "loss": 1.5078, + "step": 8448 + }, + { + "epoch": 0.1097909015452137, + "grad_norm": 0.5164234638214111, + "learning_rate": 0.00017807873770129583, + "loss": 1.4231, + "step": 8449 + }, + { + "epoch": 0.10980389608912958, + "grad_norm": 0.4740740656852722, + "learning_rate": 0.00017807613823938445, + "loss": 1.5024, + "step": 8450 + }, + { + "epoch": 0.10981689063304545, + "grad_norm": 0.4100686311721802, + "learning_rate": 0.00017807353877747308, + "loss": 1.5633, + "step": 8451 + }, + { + "epoch": 0.10982988517696132, + "grad_norm": 0.38827216625213623, + "learning_rate": 0.0001780709393155617, + "loss": 1.5038, + "step": 8452 + }, + { + "epoch": 0.1098428797208772, + "grad_norm": 0.5411360263824463, + "learning_rate": 0.0001780683398536503, + "loss": 1.4401, + "step": 8453 + }, + { + "epoch": 0.10985587426479307, + "grad_norm": 0.3749694526195526, + "learning_rate": 0.00017806574039173892, + "loss": 1.4677, + "step": 8454 + }, + { + "epoch": 0.10986886880870894, + "grad_norm": 0.35603395104408264, + "learning_rate": 0.00017806314092982755, + "loss": 1.5813, + "step": 8455 + }, + { + "epoch": 0.10988186335262481, + "grad_norm": 0.3705896735191345, + "learning_rate": 0.00017806054146791614, + "loss": 1.4599, + "step": 8456 + }, + { + "epoch": 0.10989485789654069, + "grad_norm": 0.3408558666706085, + "learning_rate": 0.00017805794200600477, + "loss": 1.3976, + "step": 8457 + }, + { + "epoch": 0.10990785244045656, + "grad_norm": 0.40790504217147827, + "learning_rate": 0.00017805534254409337, + "loss": 1.5811, + "step": 8458 + }, + { + "epoch": 0.10992084698437243, + "grad_norm": 0.4712159037590027, + "learning_rate": 0.00017805274308218202, + "loss": 1.3591, + "step": 8459 + }, + { + "epoch": 0.1099338415282883, + "grad_norm": 0.4193955361843109, + "learning_rate": 0.00017805014362027062, + "loss": 1.4916, + "step": 8460 + }, + { + "epoch": 0.10994683607220418, + "grad_norm": 0.3751177489757538, + "learning_rate": 0.0001780475441583592, + "loss": 1.3984, + "step": 8461 + }, + { + "epoch": 0.10995983061612005, + "grad_norm": 0.300302118062973, + "learning_rate": 0.00017804494469644784, + "loss": 1.4287, + "step": 8462 + }, + { + "epoch": 0.10997282516003593, + "grad_norm": 0.3934277594089508, + "learning_rate": 0.00017804234523453646, + "loss": 1.2978, + "step": 8463 + }, + { + "epoch": 0.1099858197039518, + "grad_norm": 0.33747467398643494, + "learning_rate": 0.00017803974577262509, + "loss": 1.2051, + "step": 8464 + }, + { + "epoch": 0.10999881424786767, + "grad_norm": 0.40012240409851074, + "learning_rate": 0.00017803714631071368, + "loss": 1.5453, + "step": 8465 + }, + { + "epoch": 0.11001180879178354, + "grad_norm": 0.4303434491157532, + "learning_rate": 0.0001780345468488023, + "loss": 1.1881, + "step": 8466 + }, + { + "epoch": 0.11002480333569942, + "grad_norm": 0.39507564902305603, + "learning_rate": 0.00017803194738689093, + "loss": 1.4508, + "step": 8467 + }, + { + "epoch": 0.11003779787961529, + "grad_norm": 0.35819846391677856, + "learning_rate": 0.00017802934792497953, + "loss": 1.4652, + "step": 8468 + }, + { + "epoch": 0.11005079242353116, + "grad_norm": 0.3903351426124573, + "learning_rate": 0.00017802674846306815, + "loss": 1.4496, + "step": 8469 + }, + { + "epoch": 0.11006378696744705, + "grad_norm": 0.3091757893562317, + "learning_rate": 0.00017802414900115675, + "loss": 1.3738, + "step": 8470 + }, + { + "epoch": 0.11007678151136292, + "grad_norm": 0.36216795444488525, + "learning_rate": 0.0001780215495392454, + "loss": 1.4149, + "step": 8471 + }, + { + "epoch": 0.1100897760552788, + "grad_norm": 0.34850308299064636, + "learning_rate": 0.000178018950077334, + "loss": 1.4391, + "step": 8472 + }, + { + "epoch": 0.11010277059919467, + "grad_norm": 0.3682047426700592, + "learning_rate": 0.0001780163506154226, + "loss": 1.4511, + "step": 8473 + }, + { + "epoch": 0.11011576514311054, + "grad_norm": 0.3127826750278473, + "learning_rate": 0.00017801375115351122, + "loss": 1.3262, + "step": 8474 + }, + { + "epoch": 0.11012875968702641, + "grad_norm": 0.39212527871131897, + "learning_rate": 0.00017801115169159985, + "loss": 1.4265, + "step": 8475 + }, + { + "epoch": 0.11014175423094229, + "grad_norm": 0.6448709964752197, + "learning_rate": 0.00017800855222968847, + "loss": 1.4576, + "step": 8476 + }, + { + "epoch": 0.11015474877485816, + "grad_norm": 0.48138076066970825, + "learning_rate": 0.00017800595276777707, + "loss": 1.3788, + "step": 8477 + }, + { + "epoch": 0.11016774331877403, + "grad_norm": 0.3340052366256714, + "learning_rate": 0.0001780033533058657, + "loss": 1.2852, + "step": 8478 + }, + { + "epoch": 0.1101807378626899, + "grad_norm": 0.45999959111213684, + "learning_rate": 0.00017800075384395432, + "loss": 1.4337, + "step": 8479 + }, + { + "epoch": 0.11019373240660578, + "grad_norm": 0.4603680670261383, + "learning_rate": 0.00017799815438204292, + "loss": 1.3663, + "step": 8480 + }, + { + "epoch": 0.11020672695052165, + "grad_norm": 0.417120099067688, + "learning_rate": 0.00017799555492013154, + "loss": 1.4079, + "step": 8481 + }, + { + "epoch": 0.11021972149443753, + "grad_norm": 0.37151020765304565, + "learning_rate": 0.00017799295545822016, + "loss": 1.331, + "step": 8482 + }, + { + "epoch": 0.1102327160383534, + "grad_norm": 0.5135869383811951, + "learning_rate": 0.0001779903559963088, + "loss": 1.0712, + "step": 8483 + }, + { + "epoch": 0.11024571058226927, + "grad_norm": 0.3920194208621979, + "learning_rate": 0.00017798775653439739, + "loss": 1.5737, + "step": 8484 + }, + { + "epoch": 0.11025870512618514, + "grad_norm": 0.32528769969940186, + "learning_rate": 0.000177985157072486, + "loss": 1.3838, + "step": 8485 + }, + { + "epoch": 0.11027169967010102, + "grad_norm": 0.4005073606967926, + "learning_rate": 0.00017798255761057463, + "loss": 1.4042, + "step": 8486 + }, + { + "epoch": 0.11028469421401689, + "grad_norm": 0.295510470867157, + "learning_rate": 0.00017797995814866323, + "loss": 1.2493, + "step": 8487 + }, + { + "epoch": 0.11029768875793276, + "grad_norm": 0.46690911054611206, + "learning_rate": 0.00017797735868675186, + "loss": 1.4751, + "step": 8488 + }, + { + "epoch": 0.11031068330184864, + "grad_norm": 0.4221472144126892, + "learning_rate": 0.00017797475922484045, + "loss": 1.3042, + "step": 8489 + }, + { + "epoch": 0.11032367784576451, + "grad_norm": 0.313406378030777, + "learning_rate": 0.00017797215976292908, + "loss": 1.3951, + "step": 8490 + }, + { + "epoch": 0.11033667238968038, + "grad_norm": 0.44129878282546997, + "learning_rate": 0.0001779695603010177, + "loss": 1.4626, + "step": 8491 + }, + { + "epoch": 0.11034966693359625, + "grad_norm": 0.30615466833114624, + "learning_rate": 0.0001779669608391063, + "loss": 1.446, + "step": 8492 + }, + { + "epoch": 0.11036266147751213, + "grad_norm": 0.3778288662433624, + "learning_rate": 0.00017796436137719492, + "loss": 1.6043, + "step": 8493 + }, + { + "epoch": 0.110375656021428, + "grad_norm": 0.30573880672454834, + "learning_rate": 0.00017796176191528355, + "loss": 1.3341, + "step": 8494 + }, + { + "epoch": 0.11038865056534387, + "grad_norm": 0.37661105394363403, + "learning_rate": 0.00017795916245337217, + "loss": 1.5529, + "step": 8495 + }, + { + "epoch": 0.11040164510925975, + "grad_norm": 0.33200299739837646, + "learning_rate": 0.00017795656299146077, + "loss": 1.3194, + "step": 8496 + }, + { + "epoch": 0.11041463965317562, + "grad_norm": 0.4370069205760956, + "learning_rate": 0.0001779539635295494, + "loss": 1.4061, + "step": 8497 + }, + { + "epoch": 0.11042763419709149, + "grad_norm": 0.3966250717639923, + "learning_rate": 0.00017795136406763802, + "loss": 1.5674, + "step": 8498 + }, + { + "epoch": 0.11044062874100737, + "grad_norm": 0.33314865827560425, + "learning_rate": 0.00017794876460572662, + "loss": 1.2829, + "step": 8499 + }, + { + "epoch": 0.11045362328492324, + "grad_norm": 0.32898661494255066, + "learning_rate": 0.00017794616514381524, + "loss": 1.4767, + "step": 8500 + }, + { + "epoch": 0.11046661782883911, + "grad_norm": 0.3853452205657959, + "learning_rate": 0.00017794356568190384, + "loss": 1.3856, + "step": 8501 + }, + { + "epoch": 0.11047961237275498, + "grad_norm": 0.4081169366836548, + "learning_rate": 0.0001779409662199925, + "loss": 1.354, + "step": 8502 + }, + { + "epoch": 0.11049260691667086, + "grad_norm": 0.6198732852935791, + "learning_rate": 0.0001779383667580811, + "loss": 1.5742, + "step": 8503 + }, + { + "epoch": 0.11050560146058673, + "grad_norm": 0.44481897354125977, + "learning_rate": 0.00017793576729616969, + "loss": 1.324, + "step": 8504 + }, + { + "epoch": 0.1105185960045026, + "grad_norm": 0.36927396059036255, + "learning_rate": 0.0001779331678342583, + "loss": 1.3693, + "step": 8505 + }, + { + "epoch": 0.11053159054841848, + "grad_norm": 0.3820902109146118, + "learning_rate": 0.00017793056837234693, + "loss": 1.4511, + "step": 8506 + }, + { + "epoch": 0.11054458509233435, + "grad_norm": 0.37168335914611816, + "learning_rate": 0.00017792796891043556, + "loss": 1.4119, + "step": 8507 + }, + { + "epoch": 0.11055757963625024, + "grad_norm": 0.4066098928451538, + "learning_rate": 0.00017792536944852416, + "loss": 1.3242, + "step": 8508 + }, + { + "epoch": 0.11057057418016611, + "grad_norm": 0.31800830364227295, + "learning_rate": 0.00017792276998661278, + "loss": 1.354, + "step": 8509 + }, + { + "epoch": 0.11058356872408198, + "grad_norm": 0.4043053388595581, + "learning_rate": 0.0001779201705247014, + "loss": 1.2733, + "step": 8510 + }, + { + "epoch": 0.11059656326799785, + "grad_norm": 0.3298899531364441, + "learning_rate": 0.00017791757106279, + "loss": 1.4046, + "step": 8511 + }, + { + "epoch": 0.11060955781191373, + "grad_norm": 0.4717923104763031, + "learning_rate": 0.00017791497160087863, + "loss": 1.3724, + "step": 8512 + }, + { + "epoch": 0.1106225523558296, + "grad_norm": 0.41223472356796265, + "learning_rate": 0.00017791237213896722, + "loss": 1.4516, + "step": 8513 + }, + { + "epoch": 0.11063554689974547, + "grad_norm": 0.3834102153778076, + "learning_rate": 0.00017790977267705588, + "loss": 1.6291, + "step": 8514 + }, + { + "epoch": 0.11064854144366135, + "grad_norm": 0.3822353184223175, + "learning_rate": 0.00017790717321514447, + "loss": 1.2732, + "step": 8515 + }, + { + "epoch": 0.11066153598757722, + "grad_norm": 0.4058038294315338, + "learning_rate": 0.00017790457375323307, + "loss": 1.4128, + "step": 8516 + }, + { + "epoch": 0.11067453053149309, + "grad_norm": 0.29582715034484863, + "learning_rate": 0.00017790197429132172, + "loss": 1.1204, + "step": 8517 + }, + { + "epoch": 0.11068752507540897, + "grad_norm": 0.3588052988052368, + "learning_rate": 0.00017789937482941032, + "loss": 1.2235, + "step": 8518 + }, + { + "epoch": 0.11070051961932484, + "grad_norm": 0.4162876009941101, + "learning_rate": 0.00017789677536749894, + "loss": 1.5949, + "step": 8519 + }, + { + "epoch": 0.11071351416324071, + "grad_norm": 0.3948515057563782, + "learning_rate": 0.00017789417590558754, + "loss": 1.388, + "step": 8520 + }, + { + "epoch": 0.11072650870715658, + "grad_norm": 0.37233200669288635, + "learning_rate": 0.00017789157644367617, + "loss": 1.4276, + "step": 8521 + }, + { + "epoch": 0.11073950325107246, + "grad_norm": 0.4185180962085724, + "learning_rate": 0.0001778889769817648, + "loss": 1.5226, + "step": 8522 + }, + { + "epoch": 0.11075249779498833, + "grad_norm": 0.39894646406173706, + "learning_rate": 0.0001778863775198534, + "loss": 1.5131, + "step": 8523 + }, + { + "epoch": 0.1107654923389042, + "grad_norm": 0.3927561342716217, + "learning_rate": 0.000177883778057942, + "loss": 1.4621, + "step": 8524 + }, + { + "epoch": 0.11077848688282008, + "grad_norm": 0.3699970245361328, + "learning_rate": 0.00017788117859603064, + "loss": 1.3531, + "step": 8525 + }, + { + "epoch": 0.11079148142673595, + "grad_norm": 0.39307940006256104, + "learning_rate": 0.00017787857913411926, + "loss": 1.4239, + "step": 8526 + }, + { + "epoch": 0.11080447597065182, + "grad_norm": 0.39279013872146606, + "learning_rate": 0.00017787597967220786, + "loss": 1.4238, + "step": 8527 + }, + { + "epoch": 0.1108174705145677, + "grad_norm": 0.39066559076309204, + "learning_rate": 0.00017787338021029646, + "loss": 1.4186, + "step": 8528 + }, + { + "epoch": 0.11083046505848357, + "grad_norm": 0.2830652892589569, + "learning_rate": 0.0001778707807483851, + "loss": 1.4811, + "step": 8529 + }, + { + "epoch": 0.11084345960239944, + "grad_norm": 0.3115213215351105, + "learning_rate": 0.0001778681812864737, + "loss": 1.1241, + "step": 8530 + }, + { + "epoch": 0.11085645414631531, + "grad_norm": 0.46800488233566284, + "learning_rate": 0.00017786558182456233, + "loss": 1.6362, + "step": 8531 + }, + { + "epoch": 0.11086944869023119, + "grad_norm": 0.3846072256565094, + "learning_rate": 0.00017786298236265093, + "loss": 1.4724, + "step": 8532 + }, + { + "epoch": 0.11088244323414706, + "grad_norm": 0.30631932616233826, + "learning_rate": 0.00017786038290073955, + "loss": 1.3344, + "step": 8533 + }, + { + "epoch": 0.11089543777806293, + "grad_norm": 0.38891199231147766, + "learning_rate": 0.00017785778343882818, + "loss": 1.4796, + "step": 8534 + }, + { + "epoch": 0.1109084323219788, + "grad_norm": 0.3535824716091156, + "learning_rate": 0.00017785518397691677, + "loss": 1.3722, + "step": 8535 + }, + { + "epoch": 0.11092142686589468, + "grad_norm": 0.489162802696228, + "learning_rate": 0.0001778525845150054, + "loss": 1.5587, + "step": 8536 + }, + { + "epoch": 0.11093442140981055, + "grad_norm": 0.482291042804718, + "learning_rate": 0.00017784998505309402, + "loss": 1.3759, + "step": 8537 + }, + { + "epoch": 0.11094741595372642, + "grad_norm": 0.44002512097358704, + "learning_rate": 0.00017784738559118265, + "loss": 1.5708, + "step": 8538 + }, + { + "epoch": 0.1109604104976423, + "grad_norm": 0.31956562399864197, + "learning_rate": 0.00017784478612927124, + "loss": 1.2889, + "step": 8539 + }, + { + "epoch": 0.11097340504155817, + "grad_norm": 0.34225621819496155, + "learning_rate": 0.00017784218666735987, + "loss": 1.3836, + "step": 8540 + }, + { + "epoch": 0.11098639958547404, + "grad_norm": 0.39811187982559204, + "learning_rate": 0.0001778395872054485, + "loss": 1.5996, + "step": 8541 + }, + { + "epoch": 0.11099939412938992, + "grad_norm": 0.3725215494632721, + "learning_rate": 0.0001778369877435371, + "loss": 1.5099, + "step": 8542 + }, + { + "epoch": 0.11101238867330579, + "grad_norm": 0.3944075107574463, + "learning_rate": 0.00017783438828162572, + "loss": 1.6023, + "step": 8543 + }, + { + "epoch": 0.11102538321722166, + "grad_norm": 0.39231187105178833, + "learning_rate": 0.0001778317888197143, + "loss": 1.5115, + "step": 8544 + }, + { + "epoch": 0.11103837776113754, + "grad_norm": 0.41220220923423767, + "learning_rate": 0.00017782918935780294, + "loss": 1.3902, + "step": 8545 + }, + { + "epoch": 0.11105137230505342, + "grad_norm": 0.4165779948234558, + "learning_rate": 0.00017782658989589156, + "loss": 1.5146, + "step": 8546 + }, + { + "epoch": 0.1110643668489693, + "grad_norm": 0.27829891443252563, + "learning_rate": 0.00017782399043398016, + "loss": 1.2942, + "step": 8547 + }, + { + "epoch": 0.11107736139288517, + "grad_norm": 0.41873669624328613, + "learning_rate": 0.00017782139097206878, + "loss": 1.3266, + "step": 8548 + }, + { + "epoch": 0.11109035593680104, + "grad_norm": 0.4187389314174652, + "learning_rate": 0.0001778187915101574, + "loss": 1.5471, + "step": 8549 + }, + { + "epoch": 0.11110335048071691, + "grad_norm": 0.4204435348510742, + "learning_rate": 0.00017781619204824603, + "loss": 1.3635, + "step": 8550 + }, + { + "epoch": 0.11111634502463279, + "grad_norm": 0.4332634508609772, + "learning_rate": 0.00017781359258633463, + "loss": 1.439, + "step": 8551 + }, + { + "epoch": 0.11112933956854866, + "grad_norm": 0.4367465078830719, + "learning_rate": 0.00017781099312442325, + "loss": 1.4757, + "step": 8552 + }, + { + "epoch": 0.11114233411246453, + "grad_norm": 0.32651135325431824, + "learning_rate": 0.00017780839366251188, + "loss": 1.4655, + "step": 8553 + }, + { + "epoch": 0.1111553286563804, + "grad_norm": 0.33064085245132446, + "learning_rate": 0.00017780579420060048, + "loss": 1.2961, + "step": 8554 + }, + { + "epoch": 0.11116832320029628, + "grad_norm": 0.3711194097995758, + "learning_rate": 0.0001778031947386891, + "loss": 1.1332, + "step": 8555 + }, + { + "epoch": 0.11118131774421215, + "grad_norm": 0.46861642599105835, + "learning_rate": 0.00017780059527677773, + "loss": 1.3883, + "step": 8556 + }, + { + "epoch": 0.11119431228812802, + "grad_norm": 0.4704437255859375, + "learning_rate": 0.00017779799581486632, + "loss": 1.5139, + "step": 8557 + }, + { + "epoch": 0.1112073068320439, + "grad_norm": 0.4395003616809845, + "learning_rate": 0.00017779539635295495, + "loss": 1.4169, + "step": 8558 + }, + { + "epoch": 0.11122030137595977, + "grad_norm": 0.4101468622684479, + "learning_rate": 0.00017779279689104354, + "loss": 1.416, + "step": 8559 + }, + { + "epoch": 0.11123329591987564, + "grad_norm": 0.3807888329029083, + "learning_rate": 0.0001777901974291322, + "loss": 1.5845, + "step": 8560 + }, + { + "epoch": 0.11124629046379152, + "grad_norm": 0.424621045589447, + "learning_rate": 0.0001777875979672208, + "loss": 1.4007, + "step": 8561 + }, + { + "epoch": 0.11125928500770739, + "grad_norm": 0.37832802534103394, + "learning_rate": 0.00017778499850530942, + "loss": 1.5257, + "step": 8562 + }, + { + "epoch": 0.11127227955162326, + "grad_norm": 0.4164350628852844, + "learning_rate": 0.00017778239904339802, + "loss": 1.5499, + "step": 8563 + }, + { + "epoch": 0.11128527409553914, + "grad_norm": 0.4691551923751831, + "learning_rate": 0.00017777979958148664, + "loss": 1.5241, + "step": 8564 + }, + { + "epoch": 0.11129826863945501, + "grad_norm": 0.33752667903900146, + "learning_rate": 0.00017777720011957526, + "loss": 1.374, + "step": 8565 + }, + { + "epoch": 0.11131126318337088, + "grad_norm": 0.44429147243499756, + "learning_rate": 0.00017777460065766386, + "loss": 1.36, + "step": 8566 + }, + { + "epoch": 0.11132425772728675, + "grad_norm": 0.3906398117542267, + "learning_rate": 0.00017777200119575249, + "loss": 1.4907, + "step": 8567 + }, + { + "epoch": 0.11133725227120263, + "grad_norm": 0.4127035439014435, + "learning_rate": 0.0001777694017338411, + "loss": 1.252, + "step": 8568 + }, + { + "epoch": 0.1113502468151185, + "grad_norm": 0.37651756405830383, + "learning_rate": 0.00017776680227192974, + "loss": 1.3173, + "step": 8569 + }, + { + "epoch": 0.11136324135903437, + "grad_norm": 0.4683295786380768, + "learning_rate": 0.00017776420281001833, + "loss": 1.6301, + "step": 8570 + }, + { + "epoch": 0.11137623590295025, + "grad_norm": 0.42249608039855957, + "learning_rate": 0.00017776160334810693, + "loss": 1.5849, + "step": 8571 + }, + { + "epoch": 0.11138923044686612, + "grad_norm": 0.4293670058250427, + "learning_rate": 0.00017775900388619558, + "loss": 1.3854, + "step": 8572 + }, + { + "epoch": 0.11140222499078199, + "grad_norm": 0.4635482430458069, + "learning_rate": 0.00017775640442428418, + "loss": 1.4164, + "step": 8573 + }, + { + "epoch": 0.11141521953469787, + "grad_norm": 0.4318121075630188, + "learning_rate": 0.0001777538049623728, + "loss": 1.4145, + "step": 8574 + }, + { + "epoch": 0.11142821407861374, + "grad_norm": 0.3186655640602112, + "learning_rate": 0.0001777512055004614, + "loss": 1.2911, + "step": 8575 + }, + { + "epoch": 0.11144120862252961, + "grad_norm": 0.3924480974674225, + "learning_rate": 0.00017774860603855003, + "loss": 1.4145, + "step": 8576 + }, + { + "epoch": 0.11145420316644548, + "grad_norm": 0.4285651445388794, + "learning_rate": 0.00017774600657663865, + "loss": 1.386, + "step": 8577 + }, + { + "epoch": 0.11146719771036136, + "grad_norm": 0.3569808900356293, + "learning_rate": 0.00017774340711472725, + "loss": 1.3373, + "step": 8578 + }, + { + "epoch": 0.11148019225427723, + "grad_norm": 0.4820652902126312, + "learning_rate": 0.00017774080765281587, + "loss": 1.5354, + "step": 8579 + }, + { + "epoch": 0.1114931867981931, + "grad_norm": 0.48390087485313416, + "learning_rate": 0.0001777382081909045, + "loss": 1.5638, + "step": 8580 + }, + { + "epoch": 0.11150618134210898, + "grad_norm": 0.3615210950374603, + "learning_rate": 0.00017773560872899312, + "loss": 1.3449, + "step": 8581 + }, + { + "epoch": 0.11151917588602485, + "grad_norm": 0.41461074352264404, + "learning_rate": 0.00017773300926708172, + "loss": 1.4682, + "step": 8582 + }, + { + "epoch": 0.11153217042994072, + "grad_norm": 0.4105028510093689, + "learning_rate": 0.00017773040980517032, + "loss": 1.4419, + "step": 8583 + }, + { + "epoch": 0.11154516497385661, + "grad_norm": 0.3765884339809418, + "learning_rate": 0.00017772781034325897, + "loss": 1.2889, + "step": 8584 + }, + { + "epoch": 0.11155815951777248, + "grad_norm": 0.34632161259651184, + "learning_rate": 0.00017772521088134756, + "loss": 1.41, + "step": 8585 + }, + { + "epoch": 0.11157115406168835, + "grad_norm": 0.4560492932796478, + "learning_rate": 0.0001777226114194362, + "loss": 1.425, + "step": 8586 + }, + { + "epoch": 0.11158414860560423, + "grad_norm": 0.3899783194065094, + "learning_rate": 0.00017772001195752479, + "loss": 1.2785, + "step": 8587 + }, + { + "epoch": 0.1115971431495201, + "grad_norm": 0.4155313968658447, + "learning_rate": 0.0001777174124956134, + "loss": 1.2635, + "step": 8588 + }, + { + "epoch": 0.11161013769343597, + "grad_norm": 0.4057666063308716, + "learning_rate": 0.00017771481303370204, + "loss": 1.4463, + "step": 8589 + }, + { + "epoch": 0.11162313223735185, + "grad_norm": 0.5535851120948792, + "learning_rate": 0.00017771221357179063, + "loss": 1.2467, + "step": 8590 + }, + { + "epoch": 0.11163612678126772, + "grad_norm": 0.5103722810745239, + "learning_rate": 0.00017770961410987928, + "loss": 1.4143, + "step": 8591 + }, + { + "epoch": 0.11164912132518359, + "grad_norm": 0.3232981264591217, + "learning_rate": 0.00017770701464796788, + "loss": 1.5859, + "step": 8592 + }, + { + "epoch": 0.11166211586909947, + "grad_norm": 0.4483446180820465, + "learning_rate": 0.0001777044151860565, + "loss": 1.41, + "step": 8593 + }, + { + "epoch": 0.11167511041301534, + "grad_norm": 0.3665595054626465, + "learning_rate": 0.0001777018157241451, + "loss": 1.4495, + "step": 8594 + }, + { + "epoch": 0.11168810495693121, + "grad_norm": 0.49292799830436707, + "learning_rate": 0.00017769921626223373, + "loss": 1.4515, + "step": 8595 + }, + { + "epoch": 0.11170109950084708, + "grad_norm": 0.40318048000335693, + "learning_rate": 0.00017769661680032235, + "loss": 1.2241, + "step": 8596 + }, + { + "epoch": 0.11171409404476296, + "grad_norm": 0.38092324137687683, + "learning_rate": 0.00017769401733841095, + "loss": 1.271, + "step": 8597 + }, + { + "epoch": 0.11172708858867883, + "grad_norm": 0.34880295395851135, + "learning_rate": 0.00017769141787649957, + "loss": 1.3237, + "step": 8598 + }, + { + "epoch": 0.1117400831325947, + "grad_norm": 0.3966793417930603, + "learning_rate": 0.0001776888184145882, + "loss": 1.3332, + "step": 8599 + }, + { + "epoch": 0.11175307767651058, + "grad_norm": 0.3927079439163208, + "learning_rate": 0.0001776862189526768, + "loss": 1.3222, + "step": 8600 + }, + { + "epoch": 0.11176607222042645, + "grad_norm": 0.4302537441253662, + "learning_rate": 0.00017768361949076542, + "loss": 1.6084, + "step": 8601 + }, + { + "epoch": 0.11177906676434232, + "grad_norm": 0.46645569801330566, + "learning_rate": 0.00017768102002885402, + "loss": 1.4463, + "step": 8602 + }, + { + "epoch": 0.1117920613082582, + "grad_norm": 0.3670461177825928, + "learning_rate": 0.00017767842056694267, + "loss": 1.3618, + "step": 8603 + }, + { + "epoch": 0.11180505585217407, + "grad_norm": 0.37556034326553345, + "learning_rate": 0.00017767582110503127, + "loss": 1.4138, + "step": 8604 + }, + { + "epoch": 0.11181805039608994, + "grad_norm": 0.3809594511985779, + "learning_rate": 0.0001776732216431199, + "loss": 1.3004, + "step": 8605 + }, + { + "epoch": 0.11183104494000581, + "grad_norm": 0.36627018451690674, + "learning_rate": 0.0001776706221812085, + "loss": 1.4118, + "step": 8606 + }, + { + "epoch": 0.11184403948392169, + "grad_norm": 0.42096495628356934, + "learning_rate": 0.0001776680227192971, + "loss": 1.7041, + "step": 8607 + }, + { + "epoch": 0.11185703402783756, + "grad_norm": 0.3480881452560425, + "learning_rate": 0.00017766542325738574, + "loss": 1.3083, + "step": 8608 + }, + { + "epoch": 0.11187002857175343, + "grad_norm": 0.378113329410553, + "learning_rate": 0.00017766282379547434, + "loss": 1.467, + "step": 8609 + }, + { + "epoch": 0.1118830231156693, + "grad_norm": 0.3729805052280426, + "learning_rate": 0.00017766022433356296, + "loss": 1.1988, + "step": 8610 + }, + { + "epoch": 0.11189601765958518, + "grad_norm": 0.4683253765106201, + "learning_rate": 0.00017765762487165158, + "loss": 1.4543, + "step": 8611 + }, + { + "epoch": 0.11190901220350105, + "grad_norm": 0.3739033341407776, + "learning_rate": 0.00017765502540974018, + "loss": 1.5065, + "step": 8612 + }, + { + "epoch": 0.11192200674741692, + "grad_norm": 0.46230778098106384, + "learning_rate": 0.0001776524259478288, + "loss": 1.6942, + "step": 8613 + }, + { + "epoch": 0.1119350012913328, + "grad_norm": 0.3907121419906616, + "learning_rate": 0.0001776498264859174, + "loss": 1.5587, + "step": 8614 + }, + { + "epoch": 0.11194799583524867, + "grad_norm": 0.3419528603553772, + "learning_rate": 0.00017764722702400605, + "loss": 1.4183, + "step": 8615 + }, + { + "epoch": 0.11196099037916454, + "grad_norm": 0.306140273809433, + "learning_rate": 0.00017764462756209465, + "loss": 1.5105, + "step": 8616 + }, + { + "epoch": 0.11197398492308042, + "grad_norm": 0.33101925253868103, + "learning_rate": 0.00017764202810018328, + "loss": 1.4979, + "step": 8617 + }, + { + "epoch": 0.11198697946699629, + "grad_norm": 0.3967652916908264, + "learning_rate": 0.00017763942863827187, + "loss": 1.4418, + "step": 8618 + }, + { + "epoch": 0.11199997401091216, + "grad_norm": 0.36774036288261414, + "learning_rate": 0.0001776368291763605, + "loss": 1.5051, + "step": 8619 + }, + { + "epoch": 0.11201296855482804, + "grad_norm": 0.3067176342010498, + "learning_rate": 0.00017763422971444912, + "loss": 1.108, + "step": 8620 + }, + { + "epoch": 0.11202596309874391, + "grad_norm": 0.39619627594947815, + "learning_rate": 0.00017763163025253772, + "loss": 1.4802, + "step": 8621 + }, + { + "epoch": 0.1120389576426598, + "grad_norm": 0.3856840133666992, + "learning_rate": 0.00017762903079062635, + "loss": 1.4216, + "step": 8622 + }, + { + "epoch": 0.11205195218657567, + "grad_norm": 0.4031469523906708, + "learning_rate": 0.00017762643132871497, + "loss": 1.4308, + "step": 8623 + }, + { + "epoch": 0.11206494673049154, + "grad_norm": 0.3182077705860138, + "learning_rate": 0.0001776238318668036, + "loss": 1.6204, + "step": 8624 + }, + { + "epoch": 0.11207794127440741, + "grad_norm": 0.3879203796386719, + "learning_rate": 0.0001776212324048922, + "loss": 1.4453, + "step": 8625 + }, + { + "epoch": 0.11209093581832329, + "grad_norm": 0.40703073143959045, + "learning_rate": 0.0001776186329429808, + "loss": 1.5185, + "step": 8626 + }, + { + "epoch": 0.11210393036223916, + "grad_norm": 0.40695518255233765, + "learning_rate": 0.00017761603348106944, + "loss": 1.2622, + "step": 8627 + }, + { + "epoch": 0.11211692490615503, + "grad_norm": 0.35042908787727356, + "learning_rate": 0.00017761343401915804, + "loss": 1.4216, + "step": 8628 + }, + { + "epoch": 0.1121299194500709, + "grad_norm": 0.2891005277633667, + "learning_rate": 0.00017761083455724666, + "loss": 1.3575, + "step": 8629 + }, + { + "epoch": 0.11214291399398678, + "grad_norm": 0.4249033033847809, + "learning_rate": 0.0001776082350953353, + "loss": 1.6655, + "step": 8630 + }, + { + "epoch": 0.11215590853790265, + "grad_norm": 0.4816652536392212, + "learning_rate": 0.00017760563563342388, + "loss": 1.5725, + "step": 8631 + }, + { + "epoch": 0.11216890308181852, + "grad_norm": 0.3626181483268738, + "learning_rate": 0.0001776030361715125, + "loss": 1.4344, + "step": 8632 + }, + { + "epoch": 0.1121818976257344, + "grad_norm": 0.39744338393211365, + "learning_rate": 0.0001776004367096011, + "loss": 1.517, + "step": 8633 + }, + { + "epoch": 0.11219489216965027, + "grad_norm": 0.3981624245643616, + "learning_rate": 0.00017759783724768976, + "loss": 1.4557, + "step": 8634 + }, + { + "epoch": 0.11220788671356614, + "grad_norm": 0.4194968640804291, + "learning_rate": 0.00017759523778577835, + "loss": 1.403, + "step": 8635 + }, + { + "epoch": 0.11222088125748202, + "grad_norm": 0.37882140278816223, + "learning_rate": 0.00017759263832386698, + "loss": 1.4211, + "step": 8636 + }, + { + "epoch": 0.11223387580139789, + "grad_norm": 0.44159385561943054, + "learning_rate": 0.00017759003886195558, + "loss": 1.5235, + "step": 8637 + }, + { + "epoch": 0.11224687034531376, + "grad_norm": 0.376777321100235, + "learning_rate": 0.0001775874394000442, + "loss": 1.5527, + "step": 8638 + }, + { + "epoch": 0.11225986488922964, + "grad_norm": 0.3993493914604187, + "learning_rate": 0.00017758483993813283, + "loss": 1.6731, + "step": 8639 + }, + { + "epoch": 0.11227285943314551, + "grad_norm": 0.3251712918281555, + "learning_rate": 0.00017758224047622142, + "loss": 1.4856, + "step": 8640 + }, + { + "epoch": 0.11228585397706138, + "grad_norm": 0.5070064663887024, + "learning_rate": 0.00017757964101431005, + "loss": 1.3719, + "step": 8641 + }, + { + "epoch": 0.11229884852097725, + "grad_norm": 0.39667999744415283, + "learning_rate": 0.00017757704155239867, + "loss": 1.4109, + "step": 8642 + }, + { + "epoch": 0.11231184306489313, + "grad_norm": 0.35416334867477417, + "learning_rate": 0.00017757444209048727, + "loss": 1.3849, + "step": 8643 + }, + { + "epoch": 0.112324837608809, + "grad_norm": 0.4591248631477356, + "learning_rate": 0.0001775718426285759, + "loss": 1.5327, + "step": 8644 + }, + { + "epoch": 0.11233783215272487, + "grad_norm": 0.3754018247127533, + "learning_rate": 0.0001775692431666645, + "loss": 1.3392, + "step": 8645 + }, + { + "epoch": 0.11235082669664075, + "grad_norm": 0.41714969277381897, + "learning_rate": 0.00017756664370475314, + "loss": 1.4186, + "step": 8646 + }, + { + "epoch": 0.11236382124055662, + "grad_norm": 0.4091562330722809, + "learning_rate": 0.00017756404424284174, + "loss": 1.3283, + "step": 8647 + }, + { + "epoch": 0.11237681578447249, + "grad_norm": 0.4639502465724945, + "learning_rate": 0.00017756144478093036, + "loss": 1.5955, + "step": 8648 + }, + { + "epoch": 0.11238981032838836, + "grad_norm": 0.2942112982273102, + "learning_rate": 0.00017755884531901896, + "loss": 1.2763, + "step": 8649 + }, + { + "epoch": 0.11240280487230424, + "grad_norm": 0.4005592465400696, + "learning_rate": 0.0001775562458571076, + "loss": 1.4896, + "step": 8650 + }, + { + "epoch": 0.11241579941622011, + "grad_norm": 0.4401690363883972, + "learning_rate": 0.0001775536463951962, + "loss": 1.4973, + "step": 8651 + }, + { + "epoch": 0.11242879396013598, + "grad_norm": 0.3246303200721741, + "learning_rate": 0.0001775510469332848, + "loss": 1.2462, + "step": 8652 + }, + { + "epoch": 0.11244178850405186, + "grad_norm": 0.4185452163219452, + "learning_rate": 0.00017754844747137343, + "loss": 1.4796, + "step": 8653 + }, + { + "epoch": 0.11245478304796773, + "grad_norm": 0.3429237902164459, + "learning_rate": 0.00017754584800946206, + "loss": 1.6688, + "step": 8654 + }, + { + "epoch": 0.1124677775918836, + "grad_norm": 0.3326869010925293, + "learning_rate": 0.00017754324854755065, + "loss": 1.3756, + "step": 8655 + }, + { + "epoch": 0.11248077213579948, + "grad_norm": 0.4681721329689026, + "learning_rate": 0.00017754064908563928, + "loss": 1.4612, + "step": 8656 + }, + { + "epoch": 0.11249376667971535, + "grad_norm": 0.44035854935646057, + "learning_rate": 0.00017753804962372788, + "loss": 1.4571, + "step": 8657 + }, + { + "epoch": 0.11250676122363122, + "grad_norm": 0.3797861933708191, + "learning_rate": 0.00017753545016181653, + "loss": 1.4258, + "step": 8658 + }, + { + "epoch": 0.1125197557675471, + "grad_norm": 0.5091722011566162, + "learning_rate": 0.00017753285069990513, + "loss": 1.6727, + "step": 8659 + }, + { + "epoch": 0.11253275031146298, + "grad_norm": 0.3938788175582886, + "learning_rate": 0.00017753025123799375, + "loss": 1.4774, + "step": 8660 + }, + { + "epoch": 0.11254574485537885, + "grad_norm": 0.32285359501838684, + "learning_rate": 0.00017752765177608235, + "loss": 1.3694, + "step": 8661 + }, + { + "epoch": 0.11255873939929473, + "grad_norm": 0.3990798890590668, + "learning_rate": 0.00017752505231417097, + "loss": 1.3121, + "step": 8662 + }, + { + "epoch": 0.1125717339432106, + "grad_norm": 0.39969685673713684, + "learning_rate": 0.0001775224528522596, + "loss": 1.5714, + "step": 8663 + }, + { + "epoch": 0.11258472848712647, + "grad_norm": 0.36271074414253235, + "learning_rate": 0.0001775198533903482, + "loss": 1.494, + "step": 8664 + }, + { + "epoch": 0.11259772303104235, + "grad_norm": 0.43183159828186035, + "learning_rate": 0.00017751725392843685, + "loss": 1.401, + "step": 8665 + }, + { + "epoch": 0.11261071757495822, + "grad_norm": 0.3798542022705078, + "learning_rate": 0.00017751465446652544, + "loss": 1.2555, + "step": 8666 + }, + { + "epoch": 0.11262371211887409, + "grad_norm": 0.42847663164138794, + "learning_rate": 0.00017751205500461404, + "loss": 1.5422, + "step": 8667 + }, + { + "epoch": 0.11263670666278996, + "grad_norm": 0.47489461302757263, + "learning_rate": 0.00017750945554270266, + "loss": 1.3911, + "step": 8668 + }, + { + "epoch": 0.11264970120670584, + "grad_norm": 0.4326644241809845, + "learning_rate": 0.0001775068560807913, + "loss": 1.4057, + "step": 8669 + }, + { + "epoch": 0.11266269575062171, + "grad_norm": 0.35288766026496887, + "learning_rate": 0.00017750425661887991, + "loss": 1.2756, + "step": 8670 + }, + { + "epoch": 0.11267569029453758, + "grad_norm": 0.38767990469932556, + "learning_rate": 0.0001775016571569685, + "loss": 1.3891, + "step": 8671 + }, + { + "epoch": 0.11268868483845346, + "grad_norm": 0.4462668299674988, + "learning_rate": 0.00017749905769505714, + "loss": 1.4228, + "step": 8672 + }, + { + "epoch": 0.11270167938236933, + "grad_norm": 0.2589188814163208, + "learning_rate": 0.00017749645823314576, + "loss": 1.2878, + "step": 8673 + }, + { + "epoch": 0.1127146739262852, + "grad_norm": 0.4760963022708893, + "learning_rate": 0.00017749385877123436, + "loss": 1.5268, + "step": 8674 + }, + { + "epoch": 0.11272766847020108, + "grad_norm": 0.4072748124599457, + "learning_rate": 0.00017749125930932298, + "loss": 1.4719, + "step": 8675 + }, + { + "epoch": 0.11274066301411695, + "grad_norm": 0.40276533365249634, + "learning_rate": 0.00017748865984741158, + "loss": 1.3391, + "step": 8676 + }, + { + "epoch": 0.11275365755803282, + "grad_norm": 0.3967902660369873, + "learning_rate": 0.00017748606038550023, + "loss": 1.4678, + "step": 8677 + }, + { + "epoch": 0.1127666521019487, + "grad_norm": 0.40766680240631104, + "learning_rate": 0.00017748346092358883, + "loss": 1.6556, + "step": 8678 + }, + { + "epoch": 0.11277964664586457, + "grad_norm": 0.4481208622455597, + "learning_rate": 0.00017748086146167743, + "loss": 1.4944, + "step": 8679 + }, + { + "epoch": 0.11279264118978044, + "grad_norm": 0.39495933055877686, + "learning_rate": 0.00017747826199976605, + "loss": 1.3384, + "step": 8680 + }, + { + "epoch": 0.11280563573369631, + "grad_norm": 0.41857582330703735, + "learning_rate": 0.00017747566253785467, + "loss": 1.5347, + "step": 8681 + }, + { + "epoch": 0.11281863027761219, + "grad_norm": 0.43404144048690796, + "learning_rate": 0.0001774730630759433, + "loss": 1.5017, + "step": 8682 + }, + { + "epoch": 0.11283162482152806, + "grad_norm": 0.36908257007598877, + "learning_rate": 0.0001774704636140319, + "loss": 1.4221, + "step": 8683 + }, + { + "epoch": 0.11284461936544393, + "grad_norm": 0.3650283217430115, + "learning_rate": 0.00017746786415212052, + "loss": 1.4195, + "step": 8684 + }, + { + "epoch": 0.1128576139093598, + "grad_norm": 0.49003270268440247, + "learning_rate": 0.00017746526469020915, + "loss": 1.524, + "step": 8685 + }, + { + "epoch": 0.11287060845327568, + "grad_norm": 0.38387563824653625, + "learning_rate": 0.00017746266522829774, + "loss": 1.4145, + "step": 8686 + }, + { + "epoch": 0.11288360299719155, + "grad_norm": 0.35055360198020935, + "learning_rate": 0.00017746006576638637, + "loss": 1.4636, + "step": 8687 + }, + { + "epoch": 0.11289659754110742, + "grad_norm": 0.42743873596191406, + "learning_rate": 0.00017745746630447496, + "loss": 1.6415, + "step": 8688 + }, + { + "epoch": 0.1129095920850233, + "grad_norm": 0.45148128271102905, + "learning_rate": 0.00017745486684256362, + "loss": 1.4739, + "step": 8689 + }, + { + "epoch": 0.11292258662893917, + "grad_norm": 0.4883705675601959, + "learning_rate": 0.00017745226738065221, + "loss": 1.5701, + "step": 8690 + }, + { + "epoch": 0.11293558117285504, + "grad_norm": 0.4535878896713257, + "learning_rate": 0.00017744966791874084, + "loss": 1.5729, + "step": 8691 + }, + { + "epoch": 0.11294857571677092, + "grad_norm": 0.31692615151405334, + "learning_rate": 0.00017744706845682944, + "loss": 1.3584, + "step": 8692 + }, + { + "epoch": 0.11296157026068679, + "grad_norm": 0.4068675637245178, + "learning_rate": 0.00017744446899491806, + "loss": 1.3121, + "step": 8693 + }, + { + "epoch": 0.11297456480460266, + "grad_norm": 0.3965752124786377, + "learning_rate": 0.00017744186953300668, + "loss": 1.4902, + "step": 8694 + }, + { + "epoch": 0.11298755934851853, + "grad_norm": 0.3603360950946808, + "learning_rate": 0.00017743927007109528, + "loss": 1.4944, + "step": 8695 + }, + { + "epoch": 0.11300055389243441, + "grad_norm": 0.34367141127586365, + "learning_rate": 0.0001774366706091839, + "loss": 1.4593, + "step": 8696 + }, + { + "epoch": 0.11301354843635028, + "grad_norm": 0.4477933645248413, + "learning_rate": 0.00017743407114727253, + "loss": 1.5064, + "step": 8697 + }, + { + "epoch": 0.11302654298026617, + "grad_norm": 0.48470833897590637, + "learning_rate": 0.00017743147168536113, + "loss": 1.4904, + "step": 8698 + }, + { + "epoch": 0.11303953752418204, + "grad_norm": 0.3278193771839142, + "learning_rate": 0.00017742887222344975, + "loss": 1.2976, + "step": 8699 + }, + { + "epoch": 0.11305253206809791, + "grad_norm": 0.3547235429286957, + "learning_rate": 0.00017742627276153835, + "loss": 1.4448, + "step": 8700 + }, + { + "epoch": 0.11306552661201379, + "grad_norm": 0.4378271996974945, + "learning_rate": 0.000177423673299627, + "loss": 1.3966, + "step": 8701 + }, + { + "epoch": 0.11307852115592966, + "grad_norm": 0.4148450195789337, + "learning_rate": 0.0001774210738377156, + "loss": 1.6076, + "step": 8702 + }, + { + "epoch": 0.11309151569984553, + "grad_norm": 0.2955113649368286, + "learning_rate": 0.00017741847437580422, + "loss": 1.4415, + "step": 8703 + }, + { + "epoch": 0.1131045102437614, + "grad_norm": 0.3896235525608063, + "learning_rate": 0.00017741587491389285, + "loss": 1.4444, + "step": 8704 + }, + { + "epoch": 0.11311750478767728, + "grad_norm": 0.37741780281066895, + "learning_rate": 0.00017741327545198145, + "loss": 1.3442, + "step": 8705 + }, + { + "epoch": 0.11313049933159315, + "grad_norm": 0.4967813491821289, + "learning_rate": 0.00017741067599007007, + "loss": 1.4187, + "step": 8706 + }, + { + "epoch": 0.11314349387550902, + "grad_norm": 0.3762017786502838, + "learning_rate": 0.00017740807652815867, + "loss": 1.4215, + "step": 8707 + }, + { + "epoch": 0.1131564884194249, + "grad_norm": 0.3131179213523865, + "learning_rate": 0.00017740547706624732, + "loss": 1.3398, + "step": 8708 + }, + { + "epoch": 0.11316948296334077, + "grad_norm": 0.3077961504459381, + "learning_rate": 0.00017740287760433592, + "loss": 1.3629, + "step": 8709 + }, + { + "epoch": 0.11318247750725664, + "grad_norm": 0.41269850730895996, + "learning_rate": 0.00017740027814242451, + "loss": 1.3385, + "step": 8710 + }, + { + "epoch": 0.11319547205117252, + "grad_norm": 0.5147266387939453, + "learning_rate": 0.00017739767868051314, + "loss": 1.5745, + "step": 8711 + }, + { + "epoch": 0.11320846659508839, + "grad_norm": 0.3871222734451294, + "learning_rate": 0.00017739507921860176, + "loss": 1.2539, + "step": 8712 + }, + { + "epoch": 0.11322146113900426, + "grad_norm": 0.423583060503006, + "learning_rate": 0.0001773924797566904, + "loss": 1.367, + "step": 8713 + }, + { + "epoch": 0.11323445568292013, + "grad_norm": 0.39959803223609924, + "learning_rate": 0.00017738988029477898, + "loss": 1.57, + "step": 8714 + }, + { + "epoch": 0.11324745022683601, + "grad_norm": 0.4388839900493622, + "learning_rate": 0.0001773872808328676, + "loss": 1.3366, + "step": 8715 + }, + { + "epoch": 0.11326044477075188, + "grad_norm": 0.3355761468410492, + "learning_rate": 0.00017738468137095623, + "loss": 1.5388, + "step": 8716 + }, + { + "epoch": 0.11327343931466775, + "grad_norm": 0.293712317943573, + "learning_rate": 0.00017738208190904483, + "loss": 1.2502, + "step": 8717 + }, + { + "epoch": 0.11328643385858363, + "grad_norm": 0.380664587020874, + "learning_rate": 0.00017737948244713346, + "loss": 1.6161, + "step": 8718 + }, + { + "epoch": 0.1132994284024995, + "grad_norm": 0.4230097234249115, + "learning_rate": 0.00017737688298522205, + "loss": 1.5456, + "step": 8719 + }, + { + "epoch": 0.11331242294641537, + "grad_norm": 0.34559720754623413, + "learning_rate": 0.0001773742835233107, + "loss": 1.3284, + "step": 8720 + }, + { + "epoch": 0.11332541749033125, + "grad_norm": 0.3350925147533417, + "learning_rate": 0.0001773716840613993, + "loss": 1.3247, + "step": 8721 + }, + { + "epoch": 0.11333841203424712, + "grad_norm": 0.2867088317871094, + "learning_rate": 0.0001773690845994879, + "loss": 1.5199, + "step": 8722 + }, + { + "epoch": 0.11335140657816299, + "grad_norm": 0.3675834834575653, + "learning_rate": 0.00017736648513757652, + "loss": 1.4505, + "step": 8723 + }, + { + "epoch": 0.11336440112207886, + "grad_norm": 0.33889830112457275, + "learning_rate": 0.00017736388567566515, + "loss": 1.3594, + "step": 8724 + }, + { + "epoch": 0.11337739566599474, + "grad_norm": 0.4452855587005615, + "learning_rate": 0.00017736128621375377, + "loss": 1.3238, + "step": 8725 + }, + { + "epoch": 0.11339039020991061, + "grad_norm": 0.3662298321723938, + "learning_rate": 0.00017735868675184237, + "loss": 1.1203, + "step": 8726 + }, + { + "epoch": 0.11340338475382648, + "grad_norm": 0.3540753722190857, + "learning_rate": 0.000177356087289931, + "loss": 1.3663, + "step": 8727 + }, + { + "epoch": 0.11341637929774236, + "grad_norm": 0.39487308263778687, + "learning_rate": 0.00017735348782801962, + "loss": 1.4743, + "step": 8728 + }, + { + "epoch": 0.11342937384165823, + "grad_norm": 0.38472577929496765, + "learning_rate": 0.00017735088836610822, + "loss": 1.504, + "step": 8729 + }, + { + "epoch": 0.1134423683855741, + "grad_norm": 0.35093024373054504, + "learning_rate": 0.00017734828890419684, + "loss": 1.317, + "step": 8730 + }, + { + "epoch": 0.11345536292948998, + "grad_norm": 0.44766491651535034, + "learning_rate": 0.00017734568944228544, + "loss": 1.1707, + "step": 8731 + }, + { + "epoch": 0.11346835747340585, + "grad_norm": 0.35561200976371765, + "learning_rate": 0.0001773430899803741, + "loss": 1.429, + "step": 8732 + }, + { + "epoch": 0.11348135201732172, + "grad_norm": 0.3999424874782562, + "learning_rate": 0.0001773404905184627, + "loss": 1.4971, + "step": 8733 + }, + { + "epoch": 0.1134943465612376, + "grad_norm": 0.3015002906322479, + "learning_rate": 0.00017733789105655128, + "loss": 1.317, + "step": 8734 + }, + { + "epoch": 0.11350734110515347, + "grad_norm": 0.3526372015476227, + "learning_rate": 0.0001773352915946399, + "loss": 1.2748, + "step": 8735 + }, + { + "epoch": 0.11352033564906935, + "grad_norm": 0.3884231746196747, + "learning_rate": 0.00017733269213272853, + "loss": 1.3825, + "step": 8736 + }, + { + "epoch": 0.11353333019298523, + "grad_norm": 0.3941156566143036, + "learning_rate": 0.00017733009267081716, + "loss": 1.4156, + "step": 8737 + }, + { + "epoch": 0.1135463247369011, + "grad_norm": 0.37532535195350647, + "learning_rate": 0.00017732749320890576, + "loss": 1.438, + "step": 8738 + }, + { + "epoch": 0.11355931928081697, + "grad_norm": 0.35799306631088257, + "learning_rate": 0.00017732489374699438, + "loss": 1.3502, + "step": 8739 + }, + { + "epoch": 0.11357231382473285, + "grad_norm": 0.33526861667633057, + "learning_rate": 0.000177322294285083, + "loss": 1.5803, + "step": 8740 + }, + { + "epoch": 0.11358530836864872, + "grad_norm": 0.22466392815113068, + "learning_rate": 0.0001773196948231716, + "loss": 1.1175, + "step": 8741 + }, + { + "epoch": 0.11359830291256459, + "grad_norm": 0.45935171842575073, + "learning_rate": 0.00017731709536126023, + "loss": 1.4189, + "step": 8742 + }, + { + "epoch": 0.11361129745648046, + "grad_norm": 0.3255417048931122, + "learning_rate": 0.00017731449589934885, + "loss": 1.2526, + "step": 8743 + }, + { + "epoch": 0.11362429200039634, + "grad_norm": 0.44808411598205566, + "learning_rate": 0.00017731189643743748, + "loss": 1.2929, + "step": 8744 + }, + { + "epoch": 0.11363728654431221, + "grad_norm": 0.48643577098846436, + "learning_rate": 0.00017730929697552607, + "loss": 1.4578, + "step": 8745 + }, + { + "epoch": 0.11365028108822808, + "grad_norm": 0.36762893199920654, + "learning_rate": 0.0001773066975136147, + "loss": 1.5275, + "step": 8746 + }, + { + "epoch": 0.11366327563214396, + "grad_norm": 0.4509170949459076, + "learning_rate": 0.00017730409805170332, + "loss": 1.5922, + "step": 8747 + }, + { + "epoch": 0.11367627017605983, + "grad_norm": 0.43448859453201294, + "learning_rate": 0.00017730149858979192, + "loss": 1.3632, + "step": 8748 + }, + { + "epoch": 0.1136892647199757, + "grad_norm": 0.34186384081840515, + "learning_rate": 0.00017729889912788054, + "loss": 1.4937, + "step": 8749 + }, + { + "epoch": 0.11370225926389158, + "grad_norm": 0.3646673262119293, + "learning_rate": 0.00017729629966596914, + "loss": 1.4673, + "step": 8750 + }, + { + "epoch": 0.11371525380780745, + "grad_norm": 0.23201428353786469, + "learning_rate": 0.00017729370020405777, + "loss": 1.342, + "step": 8751 + }, + { + "epoch": 0.11372824835172332, + "grad_norm": 0.3720986247062683, + "learning_rate": 0.0001772911007421464, + "loss": 1.6353, + "step": 8752 + }, + { + "epoch": 0.1137412428956392, + "grad_norm": 0.3540084958076477, + "learning_rate": 0.000177288501280235, + "loss": 1.3618, + "step": 8753 + }, + { + "epoch": 0.11375423743955507, + "grad_norm": 0.47163715958595276, + "learning_rate": 0.0001772859018183236, + "loss": 1.4355, + "step": 8754 + }, + { + "epoch": 0.11376723198347094, + "grad_norm": 0.3831759989261627, + "learning_rate": 0.00017728330235641224, + "loss": 1.5364, + "step": 8755 + }, + { + "epoch": 0.11378022652738681, + "grad_norm": 0.41464853286743164, + "learning_rate": 0.00017728070289450086, + "loss": 1.4174, + "step": 8756 + }, + { + "epoch": 0.11379322107130269, + "grad_norm": 0.4090052843093872, + "learning_rate": 0.00017727810343258946, + "loss": 1.5311, + "step": 8757 + }, + { + "epoch": 0.11380621561521856, + "grad_norm": 0.37021541595458984, + "learning_rate": 0.00017727550397067808, + "loss": 1.3689, + "step": 8758 + }, + { + "epoch": 0.11381921015913443, + "grad_norm": 0.4000149667263031, + "learning_rate": 0.0001772729045087667, + "loss": 1.4935, + "step": 8759 + }, + { + "epoch": 0.1138322047030503, + "grad_norm": 0.30711108446121216, + "learning_rate": 0.0001772703050468553, + "loss": 1.1174, + "step": 8760 + }, + { + "epoch": 0.11384519924696618, + "grad_norm": 0.49210160970687866, + "learning_rate": 0.00017726770558494393, + "loss": 1.5122, + "step": 8761 + }, + { + "epoch": 0.11385819379088205, + "grad_norm": 0.36736223101615906, + "learning_rate": 0.00017726510612303253, + "loss": 1.3047, + "step": 8762 + }, + { + "epoch": 0.11387118833479792, + "grad_norm": 0.42107829451560974, + "learning_rate": 0.00017726250666112115, + "loss": 1.6142, + "step": 8763 + }, + { + "epoch": 0.1138841828787138, + "grad_norm": 0.30236494541168213, + "learning_rate": 0.00017725990719920977, + "loss": 1.3697, + "step": 8764 + }, + { + "epoch": 0.11389717742262967, + "grad_norm": 0.3880172371864319, + "learning_rate": 0.00017725730773729837, + "loss": 1.3695, + "step": 8765 + }, + { + "epoch": 0.11391017196654554, + "grad_norm": 0.4311392307281494, + "learning_rate": 0.000177254708275387, + "loss": 1.4307, + "step": 8766 + }, + { + "epoch": 0.11392316651046142, + "grad_norm": 0.36553239822387695, + "learning_rate": 0.00017725210881347562, + "loss": 1.2901, + "step": 8767 + }, + { + "epoch": 0.11393616105437729, + "grad_norm": 0.3549526035785675, + "learning_rate": 0.00017724950935156425, + "loss": 1.1591, + "step": 8768 + }, + { + "epoch": 0.11394915559829316, + "grad_norm": 0.40318581461906433, + "learning_rate": 0.00017724690988965284, + "loss": 1.4488, + "step": 8769 + }, + { + "epoch": 0.11396215014220903, + "grad_norm": 0.32147684693336487, + "learning_rate": 0.00017724431042774147, + "loss": 1.4482, + "step": 8770 + }, + { + "epoch": 0.11397514468612491, + "grad_norm": 0.3440645933151245, + "learning_rate": 0.0001772417109658301, + "loss": 1.5292, + "step": 8771 + }, + { + "epoch": 0.11398813923004078, + "grad_norm": 0.40621593594551086, + "learning_rate": 0.0001772391115039187, + "loss": 1.5201, + "step": 8772 + }, + { + "epoch": 0.11400113377395665, + "grad_norm": 0.3528688848018646, + "learning_rate": 0.00017723651204200731, + "loss": 1.3687, + "step": 8773 + }, + { + "epoch": 0.11401412831787254, + "grad_norm": 0.31640294194221497, + "learning_rate": 0.0001772339125800959, + "loss": 1.3212, + "step": 8774 + }, + { + "epoch": 0.11402712286178841, + "grad_norm": 0.3772437274456024, + "learning_rate": 0.00017723131311818456, + "loss": 1.4486, + "step": 8775 + }, + { + "epoch": 0.11404011740570429, + "grad_norm": 0.350006639957428, + "learning_rate": 0.00017722871365627316, + "loss": 1.42, + "step": 8776 + }, + { + "epoch": 0.11405311194962016, + "grad_norm": 0.5429489612579346, + "learning_rate": 0.00017722611419436176, + "loss": 1.4749, + "step": 8777 + }, + { + "epoch": 0.11406610649353603, + "grad_norm": 0.45498207211494446, + "learning_rate": 0.0001772235147324504, + "loss": 1.569, + "step": 8778 + }, + { + "epoch": 0.1140791010374519, + "grad_norm": 0.3631664216518402, + "learning_rate": 0.000177220915270539, + "loss": 1.4255, + "step": 8779 + }, + { + "epoch": 0.11409209558136778, + "grad_norm": 0.4022939205169678, + "learning_rate": 0.00017721831580862763, + "loss": 1.5694, + "step": 8780 + }, + { + "epoch": 0.11410509012528365, + "grad_norm": 0.2507469952106476, + "learning_rate": 0.00017721571634671623, + "loss": 1.4725, + "step": 8781 + }, + { + "epoch": 0.11411808466919952, + "grad_norm": 0.40444067120552063, + "learning_rate": 0.00017721311688480485, + "loss": 1.5507, + "step": 8782 + }, + { + "epoch": 0.1141310792131154, + "grad_norm": 0.32774579524993896, + "learning_rate": 0.00017721051742289348, + "loss": 1.3757, + "step": 8783 + }, + { + "epoch": 0.11414407375703127, + "grad_norm": 0.4960706830024719, + "learning_rate": 0.00017720791796098207, + "loss": 1.3014, + "step": 8784 + }, + { + "epoch": 0.11415706830094714, + "grad_norm": 0.4631768763065338, + "learning_rate": 0.0001772053184990707, + "loss": 1.4516, + "step": 8785 + }, + { + "epoch": 0.11417006284486302, + "grad_norm": 0.5482436418533325, + "learning_rate": 0.00017720271903715932, + "loss": 1.4805, + "step": 8786 + }, + { + "epoch": 0.11418305738877889, + "grad_norm": 0.5119069814682007, + "learning_rate": 0.00017720011957524795, + "loss": 1.5827, + "step": 8787 + }, + { + "epoch": 0.11419605193269476, + "grad_norm": 0.3748325705528259, + "learning_rate": 0.00017719752011333655, + "loss": 1.1814, + "step": 8788 + }, + { + "epoch": 0.11420904647661063, + "grad_norm": 0.36338698863983154, + "learning_rate": 0.00017719492065142514, + "loss": 1.4111, + "step": 8789 + }, + { + "epoch": 0.11422204102052651, + "grad_norm": 0.53028404712677, + "learning_rate": 0.0001771923211895138, + "loss": 1.5906, + "step": 8790 + }, + { + "epoch": 0.11423503556444238, + "grad_norm": 0.38513025641441345, + "learning_rate": 0.0001771897217276024, + "loss": 1.6797, + "step": 8791 + }, + { + "epoch": 0.11424803010835825, + "grad_norm": 0.4013994634151459, + "learning_rate": 0.00017718712226569102, + "loss": 1.3541, + "step": 8792 + }, + { + "epoch": 0.11426102465227413, + "grad_norm": 0.31763675808906555, + "learning_rate": 0.00017718452280377961, + "loss": 1.1484, + "step": 8793 + }, + { + "epoch": 0.11427401919619, + "grad_norm": 0.3755086660385132, + "learning_rate": 0.00017718192334186824, + "loss": 1.5035, + "step": 8794 + }, + { + "epoch": 0.11428701374010587, + "grad_norm": 0.4056829810142517, + "learning_rate": 0.00017717932387995686, + "loss": 1.3224, + "step": 8795 + }, + { + "epoch": 0.11430000828402175, + "grad_norm": 0.45021751523017883, + "learning_rate": 0.00017717672441804546, + "loss": 1.4002, + "step": 8796 + }, + { + "epoch": 0.11431300282793762, + "grad_norm": 0.32861268520355225, + "learning_rate": 0.00017717412495613408, + "loss": 1.2972, + "step": 8797 + }, + { + "epoch": 0.11432599737185349, + "grad_norm": 0.5190981030464172, + "learning_rate": 0.0001771715254942227, + "loss": 1.581, + "step": 8798 + }, + { + "epoch": 0.11433899191576936, + "grad_norm": 0.4130539894104004, + "learning_rate": 0.00017716892603231133, + "loss": 1.385, + "step": 8799 + }, + { + "epoch": 0.11435198645968524, + "grad_norm": 0.39239710569381714, + "learning_rate": 0.00017716632657039993, + "loss": 1.531, + "step": 8800 + }, + { + "epoch": 0.11436498100360111, + "grad_norm": 0.36715611815452576, + "learning_rate": 0.00017716372710848853, + "loss": 1.1537, + "step": 8801 + }, + { + "epoch": 0.11437797554751698, + "grad_norm": 0.42312273383140564, + "learning_rate": 0.00017716112764657718, + "loss": 1.3779, + "step": 8802 + }, + { + "epoch": 0.11439097009143286, + "grad_norm": 0.3798922896385193, + "learning_rate": 0.00017715852818466578, + "loss": 1.6103, + "step": 8803 + }, + { + "epoch": 0.11440396463534873, + "grad_norm": 0.4298017919063568, + "learning_rate": 0.0001771559287227544, + "loss": 1.362, + "step": 8804 + }, + { + "epoch": 0.1144169591792646, + "grad_norm": 0.3507894277572632, + "learning_rate": 0.000177153329260843, + "loss": 1.4814, + "step": 8805 + }, + { + "epoch": 0.11442995372318047, + "grad_norm": 0.46142932772636414, + "learning_rate": 0.00017715072979893162, + "loss": 1.5395, + "step": 8806 + }, + { + "epoch": 0.11444294826709635, + "grad_norm": 0.3149317800998688, + "learning_rate": 0.00017714813033702025, + "loss": 1.4222, + "step": 8807 + }, + { + "epoch": 0.11445594281101222, + "grad_norm": 0.4920963943004608, + "learning_rate": 0.00017714553087510885, + "loss": 1.603, + "step": 8808 + }, + { + "epoch": 0.1144689373549281, + "grad_norm": 0.3889062702655792, + "learning_rate": 0.00017714293141319747, + "loss": 1.3815, + "step": 8809 + }, + { + "epoch": 0.11448193189884397, + "grad_norm": 0.4440886676311493, + "learning_rate": 0.0001771403319512861, + "loss": 1.6133, + "step": 8810 + }, + { + "epoch": 0.11449492644275984, + "grad_norm": 0.4331966042518616, + "learning_rate": 0.00017713773248937472, + "loss": 1.6185, + "step": 8811 + }, + { + "epoch": 0.11450792098667573, + "grad_norm": 0.4049389064311981, + "learning_rate": 0.00017713513302746332, + "loss": 1.3872, + "step": 8812 + }, + { + "epoch": 0.1145209155305916, + "grad_norm": 0.26543596386909485, + "learning_rate": 0.00017713253356555194, + "loss": 1.4367, + "step": 8813 + }, + { + "epoch": 0.11453391007450747, + "grad_norm": 0.2800551652908325, + "learning_rate": 0.00017712993410364057, + "loss": 1.5072, + "step": 8814 + }, + { + "epoch": 0.11454690461842335, + "grad_norm": 0.3064061105251312, + "learning_rate": 0.00017712733464172916, + "loss": 1.2181, + "step": 8815 + }, + { + "epoch": 0.11455989916233922, + "grad_norm": 0.33081939816474915, + "learning_rate": 0.0001771247351798178, + "loss": 1.3777, + "step": 8816 + }, + { + "epoch": 0.11457289370625509, + "grad_norm": 0.42478451132774353, + "learning_rate": 0.0001771221357179064, + "loss": 1.542, + "step": 8817 + }, + { + "epoch": 0.11458588825017096, + "grad_norm": 0.41032856702804565, + "learning_rate": 0.000177119536255995, + "loss": 1.4939, + "step": 8818 + }, + { + "epoch": 0.11459888279408684, + "grad_norm": 0.3695392310619354, + "learning_rate": 0.00017711693679408363, + "loss": 1.5077, + "step": 8819 + }, + { + "epoch": 0.11461187733800271, + "grad_norm": 0.38681721687316895, + "learning_rate": 0.00017711433733217223, + "loss": 1.1793, + "step": 8820 + }, + { + "epoch": 0.11462487188191858, + "grad_norm": 0.4438772201538086, + "learning_rate": 0.00017711173787026088, + "loss": 1.5969, + "step": 8821 + }, + { + "epoch": 0.11463786642583446, + "grad_norm": 0.351814329624176, + "learning_rate": 0.00017710913840834948, + "loss": 1.5354, + "step": 8822 + }, + { + "epoch": 0.11465086096975033, + "grad_norm": 0.4256584346294403, + "learning_rate": 0.0001771065389464381, + "loss": 1.3641, + "step": 8823 + }, + { + "epoch": 0.1146638555136662, + "grad_norm": 0.3491363823413849, + "learning_rate": 0.0001771039394845267, + "loss": 1.2797, + "step": 8824 + }, + { + "epoch": 0.11467685005758207, + "grad_norm": 0.3753140866756439, + "learning_rate": 0.00017710134002261533, + "loss": 1.2732, + "step": 8825 + }, + { + "epoch": 0.11468984460149795, + "grad_norm": 0.35834160447120667, + "learning_rate": 0.00017709874056070395, + "loss": 1.4378, + "step": 8826 + }, + { + "epoch": 0.11470283914541382, + "grad_norm": 0.3439320921897888, + "learning_rate": 0.00017709614109879255, + "loss": 1.606, + "step": 8827 + }, + { + "epoch": 0.1147158336893297, + "grad_norm": 0.3852708041667938, + "learning_rate": 0.00017709354163688117, + "loss": 1.1904, + "step": 8828 + }, + { + "epoch": 0.11472882823324557, + "grad_norm": 0.3906395733356476, + "learning_rate": 0.0001770909421749698, + "loss": 1.6437, + "step": 8829 + }, + { + "epoch": 0.11474182277716144, + "grad_norm": 0.3611818850040436, + "learning_rate": 0.00017708834271305842, + "loss": 1.3814, + "step": 8830 + }, + { + "epoch": 0.11475481732107731, + "grad_norm": 0.38790372014045715, + "learning_rate": 0.00017708574325114702, + "loss": 1.407, + "step": 8831 + }, + { + "epoch": 0.11476781186499319, + "grad_norm": 0.40654081106185913, + "learning_rate": 0.00017708314378923562, + "loss": 1.3125, + "step": 8832 + }, + { + "epoch": 0.11478080640890906, + "grad_norm": 0.5135018825531006, + "learning_rate": 0.00017708054432732427, + "loss": 1.3447, + "step": 8833 + }, + { + "epoch": 0.11479380095282493, + "grad_norm": 0.45891162753105164, + "learning_rate": 0.00017707794486541287, + "loss": 1.382, + "step": 8834 + }, + { + "epoch": 0.1148067954967408, + "grad_norm": 0.3153199255466461, + "learning_rate": 0.0001770753454035015, + "loss": 1.466, + "step": 8835 + }, + { + "epoch": 0.11481979004065668, + "grad_norm": 0.41551390290260315, + "learning_rate": 0.0001770727459415901, + "loss": 1.4179, + "step": 8836 + }, + { + "epoch": 0.11483278458457255, + "grad_norm": 0.44601374864578247, + "learning_rate": 0.0001770701464796787, + "loss": 1.3488, + "step": 8837 + }, + { + "epoch": 0.11484577912848842, + "grad_norm": 0.34145426750183105, + "learning_rate": 0.00017706754701776734, + "loss": 1.4425, + "step": 8838 + }, + { + "epoch": 0.1148587736724043, + "grad_norm": 0.45654135942459106, + "learning_rate": 0.00017706494755585593, + "loss": 1.4304, + "step": 8839 + }, + { + "epoch": 0.11487176821632017, + "grad_norm": 0.4220637083053589, + "learning_rate": 0.00017706234809394456, + "loss": 1.5045, + "step": 8840 + }, + { + "epoch": 0.11488476276023604, + "grad_norm": 0.31946858763694763, + "learning_rate": 0.00017705974863203318, + "loss": 1.4616, + "step": 8841 + }, + { + "epoch": 0.11489775730415192, + "grad_norm": 0.38705259561538696, + "learning_rate": 0.0001770571491701218, + "loss": 1.2423, + "step": 8842 + }, + { + "epoch": 0.11491075184806779, + "grad_norm": 0.40289783477783203, + "learning_rate": 0.0001770545497082104, + "loss": 1.3599, + "step": 8843 + }, + { + "epoch": 0.11492374639198366, + "grad_norm": 0.37104278802871704, + "learning_rate": 0.000177051950246299, + "loss": 1.55, + "step": 8844 + }, + { + "epoch": 0.11493674093589953, + "grad_norm": 0.37584975361824036, + "learning_rate": 0.00017704935078438765, + "loss": 1.3435, + "step": 8845 + }, + { + "epoch": 0.11494973547981541, + "grad_norm": 0.48514819145202637, + "learning_rate": 0.00017704675132247625, + "loss": 1.3335, + "step": 8846 + }, + { + "epoch": 0.11496273002373128, + "grad_norm": 0.36433327198028564, + "learning_rate": 0.00017704415186056488, + "loss": 1.4699, + "step": 8847 + }, + { + "epoch": 0.11497572456764715, + "grad_norm": 0.378147155046463, + "learning_rate": 0.00017704155239865347, + "loss": 1.5004, + "step": 8848 + }, + { + "epoch": 0.11498871911156303, + "grad_norm": 0.44031819701194763, + "learning_rate": 0.0001770389529367421, + "loss": 1.5224, + "step": 8849 + }, + { + "epoch": 0.11500171365547891, + "grad_norm": 0.3756456673145294, + "learning_rate": 0.00017703635347483072, + "loss": 1.4458, + "step": 8850 + }, + { + "epoch": 0.11501470819939479, + "grad_norm": 0.4542354941368103, + "learning_rate": 0.00017703375401291932, + "loss": 1.4985, + "step": 8851 + }, + { + "epoch": 0.11502770274331066, + "grad_norm": 0.3495444357395172, + "learning_rate": 0.00017703115455100797, + "loss": 1.2886, + "step": 8852 + }, + { + "epoch": 0.11504069728722653, + "grad_norm": 0.3535845875740051, + "learning_rate": 0.00017702855508909657, + "loss": 1.2623, + "step": 8853 + }, + { + "epoch": 0.1150536918311424, + "grad_norm": 0.4975256323814392, + "learning_rate": 0.0001770259556271852, + "loss": 1.5397, + "step": 8854 + }, + { + "epoch": 0.11506668637505828, + "grad_norm": 0.3407978415489197, + "learning_rate": 0.0001770233561652738, + "loss": 1.3114, + "step": 8855 + }, + { + "epoch": 0.11507968091897415, + "grad_norm": 0.4966141879558563, + "learning_rate": 0.00017702075670336241, + "loss": 1.6036, + "step": 8856 + }, + { + "epoch": 0.11509267546289002, + "grad_norm": 0.24291011691093445, + "learning_rate": 0.00017701815724145104, + "loss": 1.4086, + "step": 8857 + }, + { + "epoch": 0.1151056700068059, + "grad_norm": 0.4362110197544098, + "learning_rate": 0.00017701555777953964, + "loss": 1.6148, + "step": 8858 + }, + { + "epoch": 0.11511866455072177, + "grad_norm": 0.4057730734348297, + "learning_rate": 0.00017701295831762826, + "loss": 1.456, + "step": 8859 + }, + { + "epoch": 0.11513165909463764, + "grad_norm": 0.40696412324905396, + "learning_rate": 0.00017701035885571689, + "loss": 1.4372, + "step": 8860 + }, + { + "epoch": 0.11514465363855352, + "grad_norm": 0.41950613260269165, + "learning_rate": 0.00017700775939380548, + "loss": 1.6611, + "step": 8861 + }, + { + "epoch": 0.11515764818246939, + "grad_norm": 0.3580048680305481, + "learning_rate": 0.0001770051599318941, + "loss": 1.3704, + "step": 8862 + }, + { + "epoch": 0.11517064272638526, + "grad_norm": 0.434225469827652, + "learning_rate": 0.0001770025604699827, + "loss": 1.4208, + "step": 8863 + }, + { + "epoch": 0.11518363727030113, + "grad_norm": 0.43868011236190796, + "learning_rate": 0.00017699996100807136, + "loss": 1.6859, + "step": 8864 + }, + { + "epoch": 0.11519663181421701, + "grad_norm": 0.4498310685157776, + "learning_rate": 0.00017699736154615995, + "loss": 1.4112, + "step": 8865 + }, + { + "epoch": 0.11520962635813288, + "grad_norm": 0.31805649399757385, + "learning_rate": 0.00017699476208424858, + "loss": 1.6124, + "step": 8866 + }, + { + "epoch": 0.11522262090204875, + "grad_norm": 0.37745043635368347, + "learning_rate": 0.00017699216262233718, + "loss": 1.4802, + "step": 8867 + }, + { + "epoch": 0.11523561544596463, + "grad_norm": 0.40003910660743713, + "learning_rate": 0.0001769895631604258, + "loss": 1.4032, + "step": 8868 + }, + { + "epoch": 0.1152486099898805, + "grad_norm": 0.5220471024513245, + "learning_rate": 0.00017698696369851442, + "loss": 1.5611, + "step": 8869 + }, + { + "epoch": 0.11526160453379637, + "grad_norm": 0.4814424216747284, + "learning_rate": 0.00017698436423660302, + "loss": 1.5152, + "step": 8870 + }, + { + "epoch": 0.11527459907771224, + "grad_norm": 0.4091566205024719, + "learning_rate": 0.00017698176477469165, + "loss": 1.4474, + "step": 8871 + }, + { + "epoch": 0.11528759362162812, + "grad_norm": 0.44456708431243896, + "learning_rate": 0.00017697916531278027, + "loss": 1.4016, + "step": 8872 + }, + { + "epoch": 0.11530058816554399, + "grad_norm": 0.4723033010959625, + "learning_rate": 0.00017697656585086887, + "loss": 1.6079, + "step": 8873 + }, + { + "epoch": 0.11531358270945986, + "grad_norm": 0.4143528938293457, + "learning_rate": 0.0001769739663889575, + "loss": 1.4639, + "step": 8874 + }, + { + "epoch": 0.11532657725337574, + "grad_norm": 0.4051390588283539, + "learning_rate": 0.0001769713669270461, + "loss": 1.4984, + "step": 8875 + }, + { + "epoch": 0.11533957179729161, + "grad_norm": 0.4169769883155823, + "learning_rate": 0.00017696876746513474, + "loss": 1.6132, + "step": 8876 + }, + { + "epoch": 0.11535256634120748, + "grad_norm": 0.42784371972084045, + "learning_rate": 0.00017696616800322334, + "loss": 1.4837, + "step": 8877 + }, + { + "epoch": 0.11536556088512336, + "grad_norm": 0.46404895186424255, + "learning_rate": 0.00017696356854131196, + "loss": 1.4151, + "step": 8878 + }, + { + "epoch": 0.11537855542903923, + "grad_norm": 0.38801050186157227, + "learning_rate": 0.00017696096907940056, + "loss": 1.5737, + "step": 8879 + }, + { + "epoch": 0.1153915499729551, + "grad_norm": 0.3865167796611786, + "learning_rate": 0.00017695836961748919, + "loss": 1.5625, + "step": 8880 + }, + { + "epoch": 0.11540454451687097, + "grad_norm": 0.3784886300563812, + "learning_rate": 0.0001769557701555778, + "loss": 1.5683, + "step": 8881 + }, + { + "epoch": 0.11541753906078685, + "grad_norm": 0.3542916178703308, + "learning_rate": 0.0001769531706936664, + "loss": 1.3851, + "step": 8882 + }, + { + "epoch": 0.11543053360470272, + "grad_norm": 0.34836384654045105, + "learning_rate": 0.00017695057123175503, + "loss": 1.3651, + "step": 8883 + }, + { + "epoch": 0.1154435281486186, + "grad_norm": 0.31087860465049744, + "learning_rate": 0.00017694797176984366, + "loss": 1.4249, + "step": 8884 + }, + { + "epoch": 0.11545652269253447, + "grad_norm": 0.38339918851852417, + "learning_rate": 0.00017694537230793225, + "loss": 1.4929, + "step": 8885 + }, + { + "epoch": 0.11546951723645034, + "grad_norm": 0.37504562735557556, + "learning_rate": 0.00017694277284602088, + "loss": 1.4332, + "step": 8886 + }, + { + "epoch": 0.11548251178036621, + "grad_norm": 0.40253403782844543, + "learning_rate": 0.0001769401733841095, + "loss": 1.6358, + "step": 8887 + }, + { + "epoch": 0.1154955063242821, + "grad_norm": 0.3863559663295746, + "learning_rate": 0.00017693757392219813, + "loss": 1.2662, + "step": 8888 + }, + { + "epoch": 0.11550850086819797, + "grad_norm": 0.33933955430984497, + "learning_rate": 0.00017693497446028672, + "loss": 1.6058, + "step": 8889 + }, + { + "epoch": 0.11552149541211384, + "grad_norm": 0.38857215642929077, + "learning_rate": 0.00017693237499837535, + "loss": 1.4662, + "step": 8890 + }, + { + "epoch": 0.11553448995602972, + "grad_norm": 0.34573081135749817, + "learning_rate": 0.00017692977553646397, + "loss": 1.3575, + "step": 8891 + }, + { + "epoch": 0.11554748449994559, + "grad_norm": 0.4519613981246948, + "learning_rate": 0.00017692717607455257, + "loss": 1.5525, + "step": 8892 + }, + { + "epoch": 0.11556047904386146, + "grad_norm": 0.38523808121681213, + "learning_rate": 0.0001769245766126412, + "loss": 1.5181, + "step": 8893 + }, + { + "epoch": 0.11557347358777734, + "grad_norm": 0.3955749273300171, + "learning_rate": 0.0001769219771507298, + "loss": 1.6206, + "step": 8894 + }, + { + "epoch": 0.11558646813169321, + "grad_norm": 0.319232314825058, + "learning_rate": 0.00017691937768881844, + "loss": 1.3868, + "step": 8895 + }, + { + "epoch": 0.11559946267560908, + "grad_norm": 0.3183676302433014, + "learning_rate": 0.00017691677822690704, + "loss": 1.3049, + "step": 8896 + }, + { + "epoch": 0.11561245721952496, + "grad_norm": 0.3820331394672394, + "learning_rate": 0.00017691417876499567, + "loss": 1.3616, + "step": 8897 + }, + { + "epoch": 0.11562545176344083, + "grad_norm": 0.40897414088249207, + "learning_rate": 0.00017691157930308426, + "loss": 1.4553, + "step": 8898 + }, + { + "epoch": 0.1156384463073567, + "grad_norm": 0.43336331844329834, + "learning_rate": 0.0001769089798411729, + "loss": 1.3499, + "step": 8899 + }, + { + "epoch": 0.11565144085127257, + "grad_norm": 0.35033878684043884, + "learning_rate": 0.0001769063803792615, + "loss": 1.458, + "step": 8900 + }, + { + "epoch": 0.11566443539518845, + "grad_norm": 0.35629796981811523, + "learning_rate": 0.0001769037809173501, + "loss": 1.5141, + "step": 8901 + }, + { + "epoch": 0.11567742993910432, + "grad_norm": 0.34182658791542053, + "learning_rate": 0.00017690118145543873, + "loss": 1.4882, + "step": 8902 + }, + { + "epoch": 0.1156904244830202, + "grad_norm": 0.29156792163848877, + "learning_rate": 0.00017689858199352736, + "loss": 1.3431, + "step": 8903 + }, + { + "epoch": 0.11570341902693607, + "grad_norm": 0.42234691977500916, + "learning_rate": 0.00017689598253161596, + "loss": 1.5054, + "step": 8904 + }, + { + "epoch": 0.11571641357085194, + "grad_norm": 0.3064664304256439, + "learning_rate": 0.00017689338306970458, + "loss": 1.2828, + "step": 8905 + }, + { + "epoch": 0.11572940811476781, + "grad_norm": 0.44407403469085693, + "learning_rate": 0.00017689078360779318, + "loss": 1.5759, + "step": 8906 + }, + { + "epoch": 0.11574240265868369, + "grad_norm": 0.42143547534942627, + "learning_rate": 0.00017688818414588183, + "loss": 1.4991, + "step": 8907 + }, + { + "epoch": 0.11575539720259956, + "grad_norm": 0.4423423111438751, + "learning_rate": 0.00017688558468397043, + "loss": 1.3935, + "step": 8908 + }, + { + "epoch": 0.11576839174651543, + "grad_norm": 0.505020260810852, + "learning_rate": 0.00017688298522205905, + "loss": 1.4539, + "step": 8909 + }, + { + "epoch": 0.1157813862904313, + "grad_norm": 0.4939204156398773, + "learning_rate": 0.00017688038576014765, + "loss": 1.5619, + "step": 8910 + }, + { + "epoch": 0.11579438083434718, + "grad_norm": 0.3910490870475769, + "learning_rate": 0.00017687778629823627, + "loss": 1.3836, + "step": 8911 + }, + { + "epoch": 0.11580737537826305, + "grad_norm": 0.3882463872432709, + "learning_rate": 0.0001768751868363249, + "loss": 1.4598, + "step": 8912 + }, + { + "epoch": 0.11582036992217892, + "grad_norm": 0.33080700039863586, + "learning_rate": 0.0001768725873744135, + "loss": 1.3029, + "step": 8913 + }, + { + "epoch": 0.1158333644660948, + "grad_norm": 0.3902685344219208, + "learning_rate": 0.00017686998791250212, + "loss": 1.1121, + "step": 8914 + }, + { + "epoch": 0.11584635901001067, + "grad_norm": 0.2802811861038208, + "learning_rate": 0.00017686738845059074, + "loss": 1.4127, + "step": 8915 + }, + { + "epoch": 0.11585935355392654, + "grad_norm": 0.38256385922431946, + "learning_rate": 0.00017686478898867934, + "loss": 1.5203, + "step": 8916 + }, + { + "epoch": 0.11587234809784241, + "grad_norm": 0.3764645755290985, + "learning_rate": 0.00017686218952676797, + "loss": 1.4504, + "step": 8917 + }, + { + "epoch": 0.11588534264175829, + "grad_norm": 0.391086220741272, + "learning_rate": 0.00017685959006485656, + "loss": 1.4046, + "step": 8918 + }, + { + "epoch": 0.11589833718567416, + "grad_norm": 0.42881014943122864, + "learning_rate": 0.00017685699060294521, + "loss": 1.4219, + "step": 8919 + }, + { + "epoch": 0.11591133172959003, + "grad_norm": 0.3621368706226349, + "learning_rate": 0.0001768543911410338, + "loss": 1.4589, + "step": 8920 + }, + { + "epoch": 0.1159243262735059, + "grad_norm": 0.40348464250564575, + "learning_rate": 0.00017685179167912244, + "loss": 1.3321, + "step": 8921 + }, + { + "epoch": 0.11593732081742178, + "grad_norm": 0.3442576229572296, + "learning_rate": 0.00017684919221721103, + "loss": 1.5715, + "step": 8922 + }, + { + "epoch": 0.11595031536133765, + "grad_norm": 0.42147359251976013, + "learning_rate": 0.00017684659275529966, + "loss": 1.3348, + "step": 8923 + }, + { + "epoch": 0.11596330990525353, + "grad_norm": 0.44710928201675415, + "learning_rate": 0.00017684399329338828, + "loss": 1.4736, + "step": 8924 + }, + { + "epoch": 0.1159763044491694, + "grad_norm": 0.3510482907295227, + "learning_rate": 0.00017684139383147688, + "loss": 1.4809, + "step": 8925 + }, + { + "epoch": 0.11598929899308529, + "grad_norm": 0.3672427535057068, + "learning_rate": 0.00017683879436956553, + "loss": 1.4735, + "step": 8926 + }, + { + "epoch": 0.11600229353700116, + "grad_norm": 0.3786901533603668, + "learning_rate": 0.00017683619490765413, + "loss": 1.2821, + "step": 8927 + }, + { + "epoch": 0.11601528808091703, + "grad_norm": 0.42218825221061707, + "learning_rate": 0.00017683359544574273, + "loss": 1.1771, + "step": 8928 + }, + { + "epoch": 0.1160282826248329, + "grad_norm": 0.5461128950119019, + "learning_rate": 0.00017683099598383135, + "loss": 1.537, + "step": 8929 + }, + { + "epoch": 0.11604127716874878, + "grad_norm": 0.39218270778656006, + "learning_rate": 0.00017682839652191998, + "loss": 1.6629, + "step": 8930 + }, + { + "epoch": 0.11605427171266465, + "grad_norm": 0.48970291018486023, + "learning_rate": 0.0001768257970600086, + "loss": 1.465, + "step": 8931 + }, + { + "epoch": 0.11606726625658052, + "grad_norm": 0.36469948291778564, + "learning_rate": 0.0001768231975980972, + "loss": 1.1396, + "step": 8932 + }, + { + "epoch": 0.1160802608004964, + "grad_norm": 0.3875502347946167, + "learning_rate": 0.00017682059813618582, + "loss": 1.4774, + "step": 8933 + }, + { + "epoch": 0.11609325534441227, + "grad_norm": 0.35956117510795593, + "learning_rate": 0.00017681799867427445, + "loss": 1.3158, + "step": 8934 + }, + { + "epoch": 0.11610624988832814, + "grad_norm": 0.3810860514640808, + "learning_rate": 0.00017681539921236304, + "loss": 1.3578, + "step": 8935 + }, + { + "epoch": 0.11611924443224401, + "grad_norm": 0.3534294068813324, + "learning_rate": 0.00017681279975045167, + "loss": 1.4307, + "step": 8936 + }, + { + "epoch": 0.11613223897615989, + "grad_norm": 0.3832017779350281, + "learning_rate": 0.00017681020028854027, + "loss": 1.1423, + "step": 8937 + }, + { + "epoch": 0.11614523352007576, + "grad_norm": 0.35043561458587646, + "learning_rate": 0.00017680760082662892, + "loss": 1.3967, + "step": 8938 + }, + { + "epoch": 0.11615822806399163, + "grad_norm": 0.3551879823207855, + "learning_rate": 0.00017680500136471751, + "loss": 1.4977, + "step": 8939 + }, + { + "epoch": 0.1161712226079075, + "grad_norm": 0.34447479248046875, + "learning_rate": 0.0001768024019028061, + "loss": 1.5291, + "step": 8940 + }, + { + "epoch": 0.11618421715182338, + "grad_norm": 0.396355003118515, + "learning_rate": 0.00017679980244089474, + "loss": 1.4142, + "step": 8941 + }, + { + "epoch": 0.11619721169573925, + "grad_norm": 0.5634851455688477, + "learning_rate": 0.00017679720297898336, + "loss": 1.3662, + "step": 8942 + }, + { + "epoch": 0.11621020623965513, + "grad_norm": 0.3756301701068878, + "learning_rate": 0.00017679460351707199, + "loss": 1.1948, + "step": 8943 + }, + { + "epoch": 0.116223200783571, + "grad_norm": 0.33726197481155396, + "learning_rate": 0.00017679200405516058, + "loss": 1.4539, + "step": 8944 + }, + { + "epoch": 0.11623619532748687, + "grad_norm": 0.399055540561676, + "learning_rate": 0.0001767894045932492, + "loss": 1.4658, + "step": 8945 + }, + { + "epoch": 0.11624918987140274, + "grad_norm": 0.3979785144329071, + "learning_rate": 0.00017678680513133783, + "loss": 1.5467, + "step": 8946 + }, + { + "epoch": 0.11626218441531862, + "grad_norm": 0.3564103841781616, + "learning_rate": 0.00017678420566942643, + "loss": 1.5565, + "step": 8947 + }, + { + "epoch": 0.11627517895923449, + "grad_norm": 0.31619182229042053, + "learning_rate": 0.00017678160620751505, + "loss": 1.2381, + "step": 8948 + }, + { + "epoch": 0.11628817350315036, + "grad_norm": 0.3950020670890808, + "learning_rate": 0.00017677900674560365, + "loss": 1.4118, + "step": 8949 + }, + { + "epoch": 0.11630116804706624, + "grad_norm": 0.35931581258773804, + "learning_rate": 0.0001767764072836923, + "loss": 1.3007, + "step": 8950 + }, + { + "epoch": 0.11631416259098211, + "grad_norm": 0.3802351951599121, + "learning_rate": 0.0001767738078217809, + "loss": 1.4365, + "step": 8951 + }, + { + "epoch": 0.11632715713489798, + "grad_norm": 0.354037880897522, + "learning_rate": 0.00017677120835986952, + "loss": 1.4072, + "step": 8952 + }, + { + "epoch": 0.11634015167881386, + "grad_norm": 0.3977590799331665, + "learning_rate": 0.00017676860889795812, + "loss": 1.4536, + "step": 8953 + }, + { + "epoch": 0.11635314622272973, + "grad_norm": 0.4268656373023987, + "learning_rate": 0.00017676600943604675, + "loss": 1.583, + "step": 8954 + }, + { + "epoch": 0.1163661407666456, + "grad_norm": 0.32860028743743896, + "learning_rate": 0.00017676340997413537, + "loss": 1.4924, + "step": 8955 + }, + { + "epoch": 0.11637913531056147, + "grad_norm": 0.3602045178413391, + "learning_rate": 0.00017676081051222397, + "loss": 1.4033, + "step": 8956 + }, + { + "epoch": 0.11639212985447735, + "grad_norm": 0.31551361083984375, + "learning_rate": 0.0001767582110503126, + "loss": 1.4526, + "step": 8957 + }, + { + "epoch": 0.11640512439839322, + "grad_norm": 0.3984453082084656, + "learning_rate": 0.00017675561158840122, + "loss": 1.2386, + "step": 8958 + }, + { + "epoch": 0.11641811894230909, + "grad_norm": 0.39161843061447144, + "learning_rate": 0.00017675301212648981, + "loss": 1.4361, + "step": 8959 + }, + { + "epoch": 0.11643111348622497, + "grad_norm": 0.37419062852859497, + "learning_rate": 0.00017675041266457844, + "loss": 1.6336, + "step": 8960 + }, + { + "epoch": 0.11644410803014084, + "grad_norm": 0.35908767580986023, + "learning_rate": 0.00017674781320266706, + "loss": 1.3653, + "step": 8961 + }, + { + "epoch": 0.11645710257405671, + "grad_norm": 0.40294522047042847, + "learning_rate": 0.0001767452137407557, + "loss": 1.4608, + "step": 8962 + }, + { + "epoch": 0.11647009711797258, + "grad_norm": 0.3234565258026123, + "learning_rate": 0.00017674261427884429, + "loss": 1.5228, + "step": 8963 + }, + { + "epoch": 0.11648309166188846, + "grad_norm": 0.4464538097381592, + "learning_rate": 0.0001767400148169329, + "loss": 1.6113, + "step": 8964 + }, + { + "epoch": 0.11649608620580434, + "grad_norm": 0.36843040585517883, + "learning_rate": 0.00017673741535502153, + "loss": 1.5668, + "step": 8965 + }, + { + "epoch": 0.11650908074972022, + "grad_norm": 0.47493094205856323, + "learning_rate": 0.00017673481589311013, + "loss": 1.5702, + "step": 8966 + }, + { + "epoch": 0.11652207529363609, + "grad_norm": 0.3580007553100586, + "learning_rate": 0.00017673221643119876, + "loss": 1.4124, + "step": 8967 + }, + { + "epoch": 0.11653506983755196, + "grad_norm": 0.4454807937145233, + "learning_rate": 0.00017672961696928735, + "loss": 1.3978, + "step": 8968 + }, + { + "epoch": 0.11654806438146784, + "grad_norm": 0.34001752734184265, + "learning_rate": 0.00017672701750737598, + "loss": 1.3282, + "step": 8969 + }, + { + "epoch": 0.11656105892538371, + "grad_norm": 0.37206077575683594, + "learning_rate": 0.0001767244180454646, + "loss": 1.3462, + "step": 8970 + }, + { + "epoch": 0.11657405346929958, + "grad_norm": 0.3516480624675751, + "learning_rate": 0.0001767218185835532, + "loss": 1.3093, + "step": 8971 + }, + { + "epoch": 0.11658704801321546, + "grad_norm": 0.4320347309112549, + "learning_rate": 0.00017671921912164182, + "loss": 1.4906, + "step": 8972 + }, + { + "epoch": 0.11660004255713133, + "grad_norm": 0.30538275837898254, + "learning_rate": 0.00017671661965973045, + "loss": 1.267, + "step": 8973 + }, + { + "epoch": 0.1166130371010472, + "grad_norm": 0.4605169892311096, + "learning_rate": 0.00017671402019781907, + "loss": 1.3431, + "step": 8974 + }, + { + "epoch": 0.11662603164496307, + "grad_norm": 0.4159318804740906, + "learning_rate": 0.00017671142073590767, + "loss": 1.5798, + "step": 8975 + }, + { + "epoch": 0.11663902618887895, + "grad_norm": 0.40308213233947754, + "learning_rate": 0.0001767088212739963, + "loss": 1.483, + "step": 8976 + }, + { + "epoch": 0.11665202073279482, + "grad_norm": 0.3748931586742401, + "learning_rate": 0.00017670622181208492, + "loss": 1.3179, + "step": 8977 + }, + { + "epoch": 0.11666501527671069, + "grad_norm": 0.5202804803848267, + "learning_rate": 0.00017670362235017352, + "loss": 1.5477, + "step": 8978 + }, + { + "epoch": 0.11667800982062657, + "grad_norm": 0.3610033690929413, + "learning_rate": 0.00017670102288826214, + "loss": 1.2878, + "step": 8979 + }, + { + "epoch": 0.11669100436454244, + "grad_norm": 0.37570205330848694, + "learning_rate": 0.00017669842342635074, + "loss": 1.3448, + "step": 8980 + }, + { + "epoch": 0.11670399890845831, + "grad_norm": 0.3589829206466675, + "learning_rate": 0.0001766958239644394, + "loss": 1.2419, + "step": 8981 + }, + { + "epoch": 0.11671699345237418, + "grad_norm": 0.4157191216945648, + "learning_rate": 0.000176693224502528, + "loss": 1.4887, + "step": 8982 + }, + { + "epoch": 0.11672998799629006, + "grad_norm": 0.488595187664032, + "learning_rate": 0.00017669062504061659, + "loss": 1.6534, + "step": 8983 + }, + { + "epoch": 0.11674298254020593, + "grad_norm": 0.3122594356536865, + "learning_rate": 0.0001766880255787052, + "loss": 1.3417, + "step": 8984 + }, + { + "epoch": 0.1167559770841218, + "grad_norm": 0.3656911253929138, + "learning_rate": 0.00017668542611679383, + "loss": 1.4776, + "step": 8985 + }, + { + "epoch": 0.11676897162803768, + "grad_norm": 0.4155288636684418, + "learning_rate": 0.00017668282665488246, + "loss": 1.5027, + "step": 8986 + }, + { + "epoch": 0.11678196617195355, + "grad_norm": 0.410571813583374, + "learning_rate": 0.00017668022719297106, + "loss": 1.5129, + "step": 8987 + }, + { + "epoch": 0.11679496071586942, + "grad_norm": 0.4012853801250458, + "learning_rate": 0.00017667762773105968, + "loss": 1.2757, + "step": 8988 + }, + { + "epoch": 0.1168079552597853, + "grad_norm": 0.3849453628063202, + "learning_rate": 0.0001766750282691483, + "loss": 1.5095, + "step": 8989 + }, + { + "epoch": 0.11682094980370117, + "grad_norm": 0.45215800404548645, + "learning_rate": 0.0001766724288072369, + "loss": 1.4937, + "step": 8990 + }, + { + "epoch": 0.11683394434761704, + "grad_norm": 0.42277297377586365, + "learning_rate": 0.00017666982934532553, + "loss": 1.6123, + "step": 8991 + }, + { + "epoch": 0.11684693889153291, + "grad_norm": 0.37789762020111084, + "learning_rate": 0.00017666722988341412, + "loss": 1.7941, + "step": 8992 + }, + { + "epoch": 0.11685993343544879, + "grad_norm": 0.34148523211479187, + "learning_rate": 0.00017666463042150278, + "loss": 1.5682, + "step": 8993 + }, + { + "epoch": 0.11687292797936466, + "grad_norm": 0.38256609439849854, + "learning_rate": 0.00017666203095959137, + "loss": 1.435, + "step": 8994 + }, + { + "epoch": 0.11688592252328053, + "grad_norm": 0.4391743838787079, + "learning_rate": 0.00017665943149767997, + "loss": 1.4486, + "step": 8995 + }, + { + "epoch": 0.1168989170671964, + "grad_norm": 0.3742262125015259, + "learning_rate": 0.0001766568320357686, + "loss": 1.5345, + "step": 8996 + }, + { + "epoch": 0.11691191161111228, + "grad_norm": 0.27352893352508545, + "learning_rate": 0.00017665423257385722, + "loss": 1.3189, + "step": 8997 + }, + { + "epoch": 0.11692490615502815, + "grad_norm": 0.41723528504371643, + "learning_rate": 0.00017665163311194584, + "loss": 1.3694, + "step": 8998 + }, + { + "epoch": 0.11693790069894403, + "grad_norm": 0.4188641607761383, + "learning_rate": 0.00017664903365003444, + "loss": 1.4569, + "step": 8999 + }, + { + "epoch": 0.1169508952428599, + "grad_norm": 0.4057868421077728, + "learning_rate": 0.00017664643418812307, + "loss": 1.5233, + "step": 9000 + }, + { + "epoch": 0.11696388978677577, + "grad_norm": 0.3293067514896393, + "learning_rate": 0.0001766438347262117, + "loss": 1.3608, + "step": 9001 + }, + { + "epoch": 0.11697688433069164, + "grad_norm": 0.4646606743335724, + "learning_rate": 0.0001766412352643003, + "loss": 1.6135, + "step": 9002 + }, + { + "epoch": 0.11698987887460753, + "grad_norm": 0.4501727819442749, + "learning_rate": 0.0001766386358023889, + "loss": 1.4578, + "step": 9003 + }, + { + "epoch": 0.1170028734185234, + "grad_norm": 0.389575719833374, + "learning_rate": 0.00017663603634047754, + "loss": 1.3977, + "step": 9004 + }, + { + "epoch": 0.11701586796243928, + "grad_norm": 0.35004299879074097, + "learning_rate": 0.00017663343687856616, + "loss": 1.556, + "step": 9005 + }, + { + "epoch": 0.11702886250635515, + "grad_norm": 0.36122962832450867, + "learning_rate": 0.00017663083741665476, + "loss": 1.3904, + "step": 9006 + }, + { + "epoch": 0.11704185705027102, + "grad_norm": 0.4256083071231842, + "learning_rate": 0.00017662823795474336, + "loss": 1.474, + "step": 9007 + }, + { + "epoch": 0.1170548515941869, + "grad_norm": 0.43045875430107117, + "learning_rate": 0.000176625638492832, + "loss": 1.4376, + "step": 9008 + }, + { + "epoch": 0.11706784613810277, + "grad_norm": 0.3174442946910858, + "learning_rate": 0.0001766230390309206, + "loss": 1.4488, + "step": 9009 + }, + { + "epoch": 0.11708084068201864, + "grad_norm": 0.3972882926464081, + "learning_rate": 0.00017662043956900923, + "loss": 1.4777, + "step": 9010 + }, + { + "epoch": 0.11709383522593451, + "grad_norm": 0.37631741166114807, + "learning_rate": 0.00017661784010709783, + "loss": 1.3369, + "step": 9011 + }, + { + "epoch": 0.11710682976985039, + "grad_norm": 0.33900636434555054, + "learning_rate": 0.00017661524064518645, + "loss": 1.4271, + "step": 9012 + }, + { + "epoch": 0.11711982431376626, + "grad_norm": 0.3936507999897003, + "learning_rate": 0.00017661264118327508, + "loss": 1.4139, + "step": 9013 + }, + { + "epoch": 0.11713281885768213, + "grad_norm": 0.33837345242500305, + "learning_rate": 0.00017661004172136367, + "loss": 1.2523, + "step": 9014 + }, + { + "epoch": 0.117145813401598, + "grad_norm": 0.4173916280269623, + "learning_rate": 0.0001766074422594523, + "loss": 1.3606, + "step": 9015 + }, + { + "epoch": 0.11715880794551388, + "grad_norm": 0.41455206274986267, + "learning_rate": 0.00017660484279754092, + "loss": 1.5347, + "step": 9016 + }, + { + "epoch": 0.11717180248942975, + "grad_norm": 0.4208053648471832, + "learning_rate": 0.00017660224333562955, + "loss": 1.277, + "step": 9017 + }, + { + "epoch": 0.11718479703334563, + "grad_norm": 0.34625452756881714, + "learning_rate": 0.00017659964387371814, + "loss": 1.3009, + "step": 9018 + }, + { + "epoch": 0.1171977915772615, + "grad_norm": 0.3499453067779541, + "learning_rate": 0.00017659704441180677, + "loss": 1.4951, + "step": 9019 + }, + { + "epoch": 0.11721078612117737, + "grad_norm": 0.3977719843387604, + "learning_rate": 0.0001765944449498954, + "loss": 1.3714, + "step": 9020 + }, + { + "epoch": 0.11722378066509324, + "grad_norm": 0.34811314940452576, + "learning_rate": 0.000176591845487984, + "loss": 1.4242, + "step": 9021 + }, + { + "epoch": 0.11723677520900912, + "grad_norm": 0.39048734307289124, + "learning_rate": 0.00017658924602607262, + "loss": 1.5532, + "step": 9022 + }, + { + "epoch": 0.11724976975292499, + "grad_norm": 0.29212865233421326, + "learning_rate": 0.0001765866465641612, + "loss": 1.1341, + "step": 9023 + }, + { + "epoch": 0.11726276429684086, + "grad_norm": 0.3969223201274872, + "learning_rate": 0.00017658404710224984, + "loss": 1.3572, + "step": 9024 + }, + { + "epoch": 0.11727575884075674, + "grad_norm": 0.31001126766204834, + "learning_rate": 0.00017658144764033846, + "loss": 1.3244, + "step": 9025 + }, + { + "epoch": 0.11728875338467261, + "grad_norm": 0.3509131073951721, + "learning_rate": 0.00017657884817842706, + "loss": 1.129, + "step": 9026 + }, + { + "epoch": 0.11730174792858848, + "grad_norm": 0.4359632134437561, + "learning_rate": 0.00017657624871651568, + "loss": 1.4418, + "step": 9027 + }, + { + "epoch": 0.11731474247250435, + "grad_norm": 0.3695792853832245, + "learning_rate": 0.0001765736492546043, + "loss": 1.5434, + "step": 9028 + }, + { + "epoch": 0.11732773701642023, + "grad_norm": 0.36861875653266907, + "learning_rate": 0.00017657104979269293, + "loss": 1.5469, + "step": 9029 + }, + { + "epoch": 0.1173407315603361, + "grad_norm": 0.3502877354621887, + "learning_rate": 0.00017656845033078153, + "loss": 1.3562, + "step": 9030 + }, + { + "epoch": 0.11735372610425197, + "grad_norm": 0.4547896981239319, + "learning_rate": 0.00017656585086887015, + "loss": 1.3683, + "step": 9031 + }, + { + "epoch": 0.11736672064816785, + "grad_norm": 0.4016554355621338, + "learning_rate": 0.00017656325140695878, + "loss": 1.4324, + "step": 9032 + }, + { + "epoch": 0.11737971519208372, + "grad_norm": 0.43591275811195374, + "learning_rate": 0.00017656065194504738, + "loss": 1.5161, + "step": 9033 + }, + { + "epoch": 0.11739270973599959, + "grad_norm": 0.397156685590744, + "learning_rate": 0.000176558052483136, + "loss": 1.2897, + "step": 9034 + }, + { + "epoch": 0.11740570427991547, + "grad_norm": 0.4603898227214813, + "learning_rate": 0.00017655545302122462, + "loss": 1.3899, + "step": 9035 + }, + { + "epoch": 0.11741869882383134, + "grad_norm": 0.3756342828273773, + "learning_rate": 0.00017655285355931325, + "loss": 1.3801, + "step": 9036 + }, + { + "epoch": 0.11743169336774721, + "grad_norm": 0.5173438787460327, + "learning_rate": 0.00017655025409740185, + "loss": 1.3651, + "step": 9037 + }, + { + "epoch": 0.11744468791166308, + "grad_norm": 0.3822080194950104, + "learning_rate": 0.00017654765463549044, + "loss": 1.6543, + "step": 9038 + }, + { + "epoch": 0.11745768245557896, + "grad_norm": 0.35000085830688477, + "learning_rate": 0.0001765450551735791, + "loss": 1.313, + "step": 9039 + }, + { + "epoch": 0.11747067699949483, + "grad_norm": 0.32739773392677307, + "learning_rate": 0.0001765424557116677, + "loss": 1.4679, + "step": 9040 + }, + { + "epoch": 0.11748367154341072, + "grad_norm": 0.4821600317955017, + "learning_rate": 0.00017653985624975632, + "loss": 1.5763, + "step": 9041 + }, + { + "epoch": 0.11749666608732659, + "grad_norm": 0.26479119062423706, + "learning_rate": 0.00017653725678784491, + "loss": 1.3357, + "step": 9042 + }, + { + "epoch": 0.11750966063124246, + "grad_norm": 0.47364068031311035, + "learning_rate": 0.00017653465732593354, + "loss": 1.4487, + "step": 9043 + }, + { + "epoch": 0.11752265517515834, + "grad_norm": 0.34486469626426697, + "learning_rate": 0.00017653205786402216, + "loss": 1.2702, + "step": 9044 + }, + { + "epoch": 0.11753564971907421, + "grad_norm": 0.5029379725456238, + "learning_rate": 0.00017652945840211076, + "loss": 1.4134, + "step": 9045 + }, + { + "epoch": 0.11754864426299008, + "grad_norm": 0.4247707724571228, + "learning_rate": 0.00017652685894019939, + "loss": 1.3643, + "step": 9046 + }, + { + "epoch": 0.11756163880690595, + "grad_norm": 0.4294482469558716, + "learning_rate": 0.000176524259478288, + "loss": 1.4631, + "step": 9047 + }, + { + "epoch": 0.11757463335082183, + "grad_norm": 0.33735018968582153, + "learning_rate": 0.00017652166001637663, + "loss": 1.3189, + "step": 9048 + }, + { + "epoch": 0.1175876278947377, + "grad_norm": 0.46224460005760193, + "learning_rate": 0.00017651906055446523, + "loss": 1.3773, + "step": 9049 + }, + { + "epoch": 0.11760062243865357, + "grad_norm": 0.4145694673061371, + "learning_rate": 0.00017651646109255383, + "loss": 1.5058, + "step": 9050 + }, + { + "epoch": 0.11761361698256945, + "grad_norm": 0.3737334609031677, + "learning_rate": 0.00017651386163064248, + "loss": 1.3586, + "step": 9051 + }, + { + "epoch": 0.11762661152648532, + "grad_norm": 0.38131749629974365, + "learning_rate": 0.00017651126216873108, + "loss": 1.5761, + "step": 9052 + }, + { + "epoch": 0.11763960607040119, + "grad_norm": 0.38808903098106384, + "learning_rate": 0.0001765086627068197, + "loss": 1.3509, + "step": 9053 + }, + { + "epoch": 0.11765260061431707, + "grad_norm": 0.3853827714920044, + "learning_rate": 0.0001765060632449083, + "loss": 1.5033, + "step": 9054 + }, + { + "epoch": 0.11766559515823294, + "grad_norm": 0.34500375390052795, + "learning_rate": 0.00017650346378299692, + "loss": 1.5851, + "step": 9055 + }, + { + "epoch": 0.11767858970214881, + "grad_norm": 0.3166362941265106, + "learning_rate": 0.00017650086432108555, + "loss": 1.173, + "step": 9056 + }, + { + "epoch": 0.11769158424606468, + "grad_norm": 0.387178510427475, + "learning_rate": 0.00017649826485917415, + "loss": 1.712, + "step": 9057 + }, + { + "epoch": 0.11770457878998056, + "grad_norm": 0.425152063369751, + "learning_rate": 0.00017649566539726277, + "loss": 1.3726, + "step": 9058 + }, + { + "epoch": 0.11771757333389643, + "grad_norm": 0.45254212617874146, + "learning_rate": 0.0001764930659353514, + "loss": 1.5894, + "step": 9059 + }, + { + "epoch": 0.1177305678778123, + "grad_norm": 0.37838661670684814, + "learning_rate": 0.00017649046647344002, + "loss": 1.5149, + "step": 9060 + }, + { + "epoch": 0.11774356242172818, + "grad_norm": 0.3081110417842865, + "learning_rate": 0.00017648786701152862, + "loss": 1.3828, + "step": 9061 + }, + { + "epoch": 0.11775655696564405, + "grad_norm": 0.42954570055007935, + "learning_rate": 0.00017648526754961721, + "loss": 1.5931, + "step": 9062 + }, + { + "epoch": 0.11776955150955992, + "grad_norm": 0.399687796831131, + "learning_rate": 0.00017648266808770587, + "loss": 1.4147, + "step": 9063 + }, + { + "epoch": 0.1177825460534758, + "grad_norm": 0.39408156275749207, + "learning_rate": 0.00017648006862579446, + "loss": 1.6926, + "step": 9064 + }, + { + "epoch": 0.11779554059739167, + "grad_norm": 0.442289799451828, + "learning_rate": 0.0001764774691638831, + "loss": 1.3548, + "step": 9065 + }, + { + "epoch": 0.11780853514130754, + "grad_norm": 0.5029153823852539, + "learning_rate": 0.00017647486970197169, + "loss": 1.4643, + "step": 9066 + }, + { + "epoch": 0.11782152968522341, + "grad_norm": 0.4636981785297394, + "learning_rate": 0.0001764722702400603, + "loss": 1.5374, + "step": 9067 + }, + { + "epoch": 0.11783452422913929, + "grad_norm": 0.4107612073421478, + "learning_rate": 0.00017646967077814893, + "loss": 1.5267, + "step": 9068 + }, + { + "epoch": 0.11784751877305516, + "grad_norm": 0.3928777277469635, + "learning_rate": 0.00017646707131623753, + "loss": 1.6081, + "step": 9069 + }, + { + "epoch": 0.11786051331697103, + "grad_norm": 0.3787069618701935, + "learning_rate": 0.00017646447185432616, + "loss": 1.2859, + "step": 9070 + }, + { + "epoch": 0.1178735078608869, + "grad_norm": 0.35391828417778015, + "learning_rate": 0.00017646187239241478, + "loss": 1.4555, + "step": 9071 + }, + { + "epoch": 0.11788650240480278, + "grad_norm": 0.45430445671081543, + "learning_rate": 0.0001764592729305034, + "loss": 1.3373, + "step": 9072 + }, + { + "epoch": 0.11789949694871865, + "grad_norm": 0.45637673139572144, + "learning_rate": 0.000176456673468592, + "loss": 1.5001, + "step": 9073 + }, + { + "epoch": 0.11791249149263452, + "grad_norm": 0.5029851198196411, + "learning_rate": 0.00017645407400668063, + "loss": 1.5598, + "step": 9074 + }, + { + "epoch": 0.1179254860365504, + "grad_norm": 0.32405078411102295, + "learning_rate": 0.00017645147454476925, + "loss": 1.4992, + "step": 9075 + }, + { + "epoch": 0.11793848058046627, + "grad_norm": 0.4317871630191803, + "learning_rate": 0.00017644887508285785, + "loss": 1.1203, + "step": 9076 + }, + { + "epoch": 0.11795147512438214, + "grad_norm": 0.3266223073005676, + "learning_rate": 0.00017644627562094647, + "loss": 1.2223, + "step": 9077 + }, + { + "epoch": 0.11796446966829802, + "grad_norm": 0.38675281405448914, + "learning_rate": 0.0001764436761590351, + "loss": 1.5496, + "step": 9078 + }, + { + "epoch": 0.1179774642122139, + "grad_norm": 0.3666847050189972, + "learning_rate": 0.0001764410766971237, + "loss": 1.2681, + "step": 9079 + }, + { + "epoch": 0.11799045875612978, + "grad_norm": 0.4113713204860687, + "learning_rate": 0.00017643847723521232, + "loss": 1.3278, + "step": 9080 + }, + { + "epoch": 0.11800345330004565, + "grad_norm": 0.3360549211502075, + "learning_rate": 0.00017643587777330092, + "loss": 1.2036, + "step": 9081 + }, + { + "epoch": 0.11801644784396152, + "grad_norm": 0.3180699348449707, + "learning_rate": 0.00017643327831138957, + "loss": 1.4236, + "step": 9082 + }, + { + "epoch": 0.1180294423878774, + "grad_norm": 0.36815357208251953, + "learning_rate": 0.00017643067884947817, + "loss": 1.3284, + "step": 9083 + }, + { + "epoch": 0.11804243693179327, + "grad_norm": 0.4223071336746216, + "learning_rate": 0.0001764280793875668, + "loss": 1.6531, + "step": 9084 + }, + { + "epoch": 0.11805543147570914, + "grad_norm": 0.43429213762283325, + "learning_rate": 0.0001764254799256554, + "loss": 1.5114, + "step": 9085 + }, + { + "epoch": 0.11806842601962501, + "grad_norm": 0.4690849483013153, + "learning_rate": 0.000176422880463744, + "loss": 1.5304, + "step": 9086 + }, + { + "epoch": 0.11808142056354089, + "grad_norm": 0.33998048305511475, + "learning_rate": 0.00017642028100183264, + "loss": 1.1628, + "step": 9087 + }, + { + "epoch": 0.11809441510745676, + "grad_norm": 0.40953803062438965, + "learning_rate": 0.00017641768153992123, + "loss": 1.693, + "step": 9088 + }, + { + "epoch": 0.11810740965137263, + "grad_norm": 0.446580708026886, + "learning_rate": 0.00017641508207800986, + "loss": 1.4581, + "step": 9089 + }, + { + "epoch": 0.1181204041952885, + "grad_norm": 0.32885387539863586, + "learning_rate": 0.00017641248261609848, + "loss": 1.3159, + "step": 9090 + }, + { + "epoch": 0.11813339873920438, + "grad_norm": 0.4315154254436493, + "learning_rate": 0.00017640988315418708, + "loss": 1.4291, + "step": 9091 + }, + { + "epoch": 0.11814639328312025, + "grad_norm": 0.4724661409854889, + "learning_rate": 0.0001764072836922757, + "loss": 1.3778, + "step": 9092 + }, + { + "epoch": 0.11815938782703612, + "grad_norm": 0.36929646134376526, + "learning_rate": 0.0001764046842303643, + "loss": 1.5197, + "step": 9093 + }, + { + "epoch": 0.118172382370952, + "grad_norm": 0.3613959550857544, + "learning_rate": 0.00017640208476845295, + "loss": 1.5014, + "step": 9094 + }, + { + "epoch": 0.11818537691486787, + "grad_norm": 0.4160013496875763, + "learning_rate": 0.00017639948530654155, + "loss": 1.5534, + "step": 9095 + }, + { + "epoch": 0.11819837145878374, + "grad_norm": 0.3813033998012543, + "learning_rate": 0.00017639688584463018, + "loss": 1.3654, + "step": 9096 + }, + { + "epoch": 0.11821136600269962, + "grad_norm": 0.451663613319397, + "learning_rate": 0.00017639428638271877, + "loss": 1.567, + "step": 9097 + }, + { + "epoch": 0.11822436054661549, + "grad_norm": 0.3539445698261261, + "learning_rate": 0.0001763916869208074, + "loss": 1.5162, + "step": 9098 + }, + { + "epoch": 0.11823735509053136, + "grad_norm": 0.4470173418521881, + "learning_rate": 0.00017638908745889602, + "loss": 1.5748, + "step": 9099 + }, + { + "epoch": 0.11825034963444724, + "grad_norm": 0.3758106529712677, + "learning_rate": 0.00017638648799698462, + "loss": 1.4006, + "step": 9100 + }, + { + "epoch": 0.11826334417836311, + "grad_norm": 0.3752334415912628, + "learning_rate": 0.00017638388853507324, + "loss": 1.3266, + "step": 9101 + }, + { + "epoch": 0.11827633872227898, + "grad_norm": 0.3364852964878082, + "learning_rate": 0.00017638128907316187, + "loss": 1.2271, + "step": 9102 + }, + { + "epoch": 0.11828933326619485, + "grad_norm": 0.4314340353012085, + "learning_rate": 0.0001763786896112505, + "loss": 1.4655, + "step": 9103 + }, + { + "epoch": 0.11830232781011073, + "grad_norm": 0.32277750968933105, + "learning_rate": 0.0001763760901493391, + "loss": 1.3103, + "step": 9104 + }, + { + "epoch": 0.1183153223540266, + "grad_norm": 0.403269499540329, + "learning_rate": 0.0001763734906874277, + "loss": 1.5042, + "step": 9105 + }, + { + "epoch": 0.11832831689794247, + "grad_norm": 0.360903263092041, + "learning_rate": 0.00017637089122551634, + "loss": 1.4842, + "step": 9106 + }, + { + "epoch": 0.11834131144185835, + "grad_norm": 0.33754587173461914, + "learning_rate": 0.00017636829176360494, + "loss": 1.3197, + "step": 9107 + }, + { + "epoch": 0.11835430598577422, + "grad_norm": 0.4857274293899536, + "learning_rate": 0.00017636569230169356, + "loss": 1.4843, + "step": 9108 + }, + { + "epoch": 0.11836730052969009, + "grad_norm": 0.3593307137489319, + "learning_rate": 0.00017636309283978216, + "loss": 1.4489, + "step": 9109 + }, + { + "epoch": 0.11838029507360597, + "grad_norm": 0.3252532184123993, + "learning_rate": 0.00017636049337787078, + "loss": 1.398, + "step": 9110 + }, + { + "epoch": 0.11839328961752184, + "grad_norm": 0.5149595737457275, + "learning_rate": 0.0001763578939159594, + "loss": 1.5012, + "step": 9111 + }, + { + "epoch": 0.11840628416143771, + "grad_norm": 0.4347781538963318, + "learning_rate": 0.000176355294454048, + "loss": 1.4523, + "step": 9112 + }, + { + "epoch": 0.11841927870535358, + "grad_norm": 0.4299534559249878, + "learning_rate": 0.00017635269499213666, + "loss": 1.6555, + "step": 9113 + }, + { + "epoch": 0.11843227324926946, + "grad_norm": 0.38847315311431885, + "learning_rate": 0.00017635009553022525, + "loss": 1.2422, + "step": 9114 + }, + { + "epoch": 0.11844526779318533, + "grad_norm": 0.3944111764431, + "learning_rate": 0.00017634749606831388, + "loss": 1.3886, + "step": 9115 + }, + { + "epoch": 0.1184582623371012, + "grad_norm": 0.5636235475540161, + "learning_rate": 0.00017634489660640248, + "loss": 1.5124, + "step": 9116 + }, + { + "epoch": 0.11847125688101709, + "grad_norm": 0.4249912202358246, + "learning_rate": 0.0001763422971444911, + "loss": 1.4151, + "step": 9117 + }, + { + "epoch": 0.11848425142493296, + "grad_norm": 0.45633092522621155, + "learning_rate": 0.00017633969768257973, + "loss": 1.3655, + "step": 9118 + }, + { + "epoch": 0.11849724596884884, + "grad_norm": 0.3120082914829254, + "learning_rate": 0.00017633709822066832, + "loss": 1.4502, + "step": 9119 + }, + { + "epoch": 0.11851024051276471, + "grad_norm": 0.423132985830307, + "learning_rate": 0.00017633449875875695, + "loss": 1.4398, + "step": 9120 + }, + { + "epoch": 0.11852323505668058, + "grad_norm": 0.46242639422416687, + "learning_rate": 0.00017633189929684557, + "loss": 1.6068, + "step": 9121 + }, + { + "epoch": 0.11853622960059645, + "grad_norm": 0.3650546669960022, + "learning_rate": 0.00017632929983493417, + "loss": 1.5062, + "step": 9122 + }, + { + "epoch": 0.11854922414451233, + "grad_norm": 0.3244951069355011, + "learning_rate": 0.0001763267003730228, + "loss": 1.1703, + "step": 9123 + }, + { + "epoch": 0.1185622186884282, + "grad_norm": 0.40561574697494507, + "learning_rate": 0.0001763241009111114, + "loss": 1.5337, + "step": 9124 + }, + { + "epoch": 0.11857521323234407, + "grad_norm": 0.5181355476379395, + "learning_rate": 0.00017632150144920004, + "loss": 1.54, + "step": 9125 + }, + { + "epoch": 0.11858820777625995, + "grad_norm": 0.23585772514343262, + "learning_rate": 0.00017631890198728864, + "loss": 1.2874, + "step": 9126 + }, + { + "epoch": 0.11860120232017582, + "grad_norm": 0.42218998074531555, + "learning_rate": 0.00017631630252537726, + "loss": 1.5099, + "step": 9127 + }, + { + "epoch": 0.11861419686409169, + "grad_norm": 0.4974414110183716, + "learning_rate": 0.00017631370306346586, + "loss": 1.5862, + "step": 9128 + }, + { + "epoch": 0.11862719140800757, + "grad_norm": 0.37885788083076477, + "learning_rate": 0.00017631110360155449, + "loss": 1.4945, + "step": 9129 + }, + { + "epoch": 0.11864018595192344, + "grad_norm": 0.37644726037979126, + "learning_rate": 0.0001763085041396431, + "loss": 1.7116, + "step": 9130 + }, + { + "epoch": 0.11865318049583931, + "grad_norm": 0.42313623428344727, + "learning_rate": 0.0001763059046777317, + "loss": 1.3986, + "step": 9131 + }, + { + "epoch": 0.11866617503975518, + "grad_norm": 0.4575001895427704, + "learning_rate": 0.00017630330521582033, + "loss": 1.514, + "step": 9132 + }, + { + "epoch": 0.11867916958367106, + "grad_norm": 0.38589704036712646, + "learning_rate": 0.00017630070575390896, + "loss": 1.44, + "step": 9133 + }, + { + "epoch": 0.11869216412758693, + "grad_norm": 0.3714844584465027, + "learning_rate": 0.00017629810629199755, + "loss": 1.4082, + "step": 9134 + }, + { + "epoch": 0.1187051586715028, + "grad_norm": 0.46000030636787415, + "learning_rate": 0.00017629550683008618, + "loss": 1.6049, + "step": 9135 + }, + { + "epoch": 0.11871815321541868, + "grad_norm": 0.4333815276622772, + "learning_rate": 0.00017629290736817478, + "loss": 1.352, + "step": 9136 + }, + { + "epoch": 0.11873114775933455, + "grad_norm": 0.36204734444618225, + "learning_rate": 0.00017629030790626343, + "loss": 1.5806, + "step": 9137 + }, + { + "epoch": 0.11874414230325042, + "grad_norm": 0.3939478099346161, + "learning_rate": 0.00017628770844435203, + "loss": 1.5185, + "step": 9138 + }, + { + "epoch": 0.1187571368471663, + "grad_norm": 0.4169251620769501, + "learning_rate": 0.00017628510898244065, + "loss": 1.3396, + "step": 9139 + }, + { + "epoch": 0.11877013139108217, + "grad_norm": 0.4443172216415405, + "learning_rate": 0.00017628250952052925, + "loss": 1.4886, + "step": 9140 + }, + { + "epoch": 0.11878312593499804, + "grad_norm": 0.4923625588417053, + "learning_rate": 0.00017627991005861787, + "loss": 1.4416, + "step": 9141 + }, + { + "epoch": 0.11879612047891391, + "grad_norm": 0.4224836826324463, + "learning_rate": 0.0001762773105967065, + "loss": 1.4631, + "step": 9142 + }, + { + "epoch": 0.11880911502282979, + "grad_norm": 0.348072350025177, + "learning_rate": 0.0001762747111347951, + "loss": 1.4693, + "step": 9143 + }, + { + "epoch": 0.11882210956674566, + "grad_norm": 0.3817839026451111, + "learning_rate": 0.00017627211167288372, + "loss": 1.3953, + "step": 9144 + }, + { + "epoch": 0.11883510411066153, + "grad_norm": 0.4439152777194977, + "learning_rate": 0.00017626951221097234, + "loss": 1.3352, + "step": 9145 + }, + { + "epoch": 0.1188480986545774, + "grad_norm": 0.399453341960907, + "learning_rate": 0.00017626691274906094, + "loss": 1.4849, + "step": 9146 + }, + { + "epoch": 0.11886109319849328, + "grad_norm": 0.2881518602371216, + "learning_rate": 0.00017626431328714956, + "loss": 1.1897, + "step": 9147 + }, + { + "epoch": 0.11887408774240915, + "grad_norm": 0.41193607449531555, + "learning_rate": 0.0001762617138252382, + "loss": 1.3936, + "step": 9148 + }, + { + "epoch": 0.11888708228632502, + "grad_norm": 0.28240767121315, + "learning_rate": 0.0001762591143633268, + "loss": 1.3064, + "step": 9149 + }, + { + "epoch": 0.1189000768302409, + "grad_norm": 0.4817437529563904, + "learning_rate": 0.0001762565149014154, + "loss": 1.4227, + "step": 9150 + }, + { + "epoch": 0.11891307137415677, + "grad_norm": 0.3776431381702423, + "learning_rate": 0.00017625391543950404, + "loss": 1.1917, + "step": 9151 + }, + { + "epoch": 0.11892606591807264, + "grad_norm": 0.48175594210624695, + "learning_rate": 0.00017625131597759266, + "loss": 1.4662, + "step": 9152 + }, + { + "epoch": 0.11893906046198852, + "grad_norm": 0.46940624713897705, + "learning_rate": 0.00017624871651568126, + "loss": 1.2709, + "step": 9153 + }, + { + "epoch": 0.11895205500590439, + "grad_norm": 0.33565062284469604, + "learning_rate": 0.00017624611705376988, + "loss": 1.2616, + "step": 9154 + }, + { + "epoch": 0.11896504954982028, + "grad_norm": 0.4791678190231323, + "learning_rate": 0.00017624351759185848, + "loss": 1.4862, + "step": 9155 + }, + { + "epoch": 0.11897804409373615, + "grad_norm": 0.47324106097221375, + "learning_rate": 0.00017624091812994713, + "loss": 1.5412, + "step": 9156 + }, + { + "epoch": 0.11899103863765202, + "grad_norm": 0.4095821678638458, + "learning_rate": 0.00017623831866803573, + "loss": 1.652, + "step": 9157 + }, + { + "epoch": 0.1190040331815679, + "grad_norm": 0.42597275972366333, + "learning_rate": 0.00017623571920612435, + "loss": 1.5127, + "step": 9158 + }, + { + "epoch": 0.11901702772548377, + "grad_norm": 0.4836615324020386, + "learning_rate": 0.00017623311974421295, + "loss": 1.5348, + "step": 9159 + }, + { + "epoch": 0.11903002226939964, + "grad_norm": 0.34499868750572205, + "learning_rate": 0.00017623052028230157, + "loss": 1.3959, + "step": 9160 + }, + { + "epoch": 0.11904301681331551, + "grad_norm": 0.33290573954582214, + "learning_rate": 0.0001762279208203902, + "loss": 1.1943, + "step": 9161 + }, + { + "epoch": 0.11905601135723139, + "grad_norm": 0.3397541046142578, + "learning_rate": 0.0001762253213584788, + "loss": 1.1803, + "step": 9162 + }, + { + "epoch": 0.11906900590114726, + "grad_norm": 0.408047080039978, + "learning_rate": 0.00017622272189656742, + "loss": 1.5064, + "step": 9163 + }, + { + "epoch": 0.11908200044506313, + "grad_norm": 0.4218473434448242, + "learning_rate": 0.00017622012243465604, + "loss": 1.4539, + "step": 9164 + }, + { + "epoch": 0.119094994988979, + "grad_norm": 0.36197763681411743, + "learning_rate": 0.00017621752297274464, + "loss": 1.5051, + "step": 9165 + }, + { + "epoch": 0.11910798953289488, + "grad_norm": 0.3601924777030945, + "learning_rate": 0.00017621492351083327, + "loss": 1.5871, + "step": 9166 + }, + { + "epoch": 0.11912098407681075, + "grad_norm": 0.4823008179664612, + "learning_rate": 0.00017621232404892186, + "loss": 1.2832, + "step": 9167 + }, + { + "epoch": 0.11913397862072662, + "grad_norm": 0.4834491014480591, + "learning_rate": 0.00017620972458701052, + "loss": 1.4861, + "step": 9168 + }, + { + "epoch": 0.1191469731646425, + "grad_norm": 0.47572794556617737, + "learning_rate": 0.0001762071251250991, + "loss": 1.5033, + "step": 9169 + }, + { + "epoch": 0.11915996770855837, + "grad_norm": 0.38881343603134155, + "learning_rate": 0.00017620452566318774, + "loss": 1.5676, + "step": 9170 + }, + { + "epoch": 0.11917296225247424, + "grad_norm": 0.3057236671447754, + "learning_rate": 0.00017620192620127634, + "loss": 1.3916, + "step": 9171 + }, + { + "epoch": 0.11918595679639012, + "grad_norm": 0.3584456145763397, + "learning_rate": 0.00017619932673936496, + "loss": 1.3828, + "step": 9172 + }, + { + "epoch": 0.11919895134030599, + "grad_norm": 0.3690851032733917, + "learning_rate": 0.00017619672727745358, + "loss": 1.4101, + "step": 9173 + }, + { + "epoch": 0.11921194588422186, + "grad_norm": 0.4371185600757599, + "learning_rate": 0.00017619412781554218, + "loss": 1.5609, + "step": 9174 + }, + { + "epoch": 0.11922494042813774, + "grad_norm": 0.4170822203159332, + "learning_rate": 0.0001761915283536308, + "loss": 1.3929, + "step": 9175 + }, + { + "epoch": 0.11923793497205361, + "grad_norm": 0.24932368099689484, + "learning_rate": 0.00017618892889171943, + "loss": 1.1849, + "step": 9176 + }, + { + "epoch": 0.11925092951596948, + "grad_norm": 0.5735031962394714, + "learning_rate": 0.00017618632942980803, + "loss": 1.6123, + "step": 9177 + }, + { + "epoch": 0.11926392405988535, + "grad_norm": 0.3706362545490265, + "learning_rate": 0.00017618372996789665, + "loss": 1.3372, + "step": 9178 + }, + { + "epoch": 0.11927691860380123, + "grad_norm": 0.3488906919956207, + "learning_rate": 0.00017618113050598525, + "loss": 1.2939, + "step": 9179 + }, + { + "epoch": 0.1192899131477171, + "grad_norm": 0.46113717555999756, + "learning_rate": 0.0001761785310440739, + "loss": 1.5784, + "step": 9180 + }, + { + "epoch": 0.11930290769163297, + "grad_norm": 0.38700899481773376, + "learning_rate": 0.0001761759315821625, + "loss": 1.4882, + "step": 9181 + }, + { + "epoch": 0.11931590223554885, + "grad_norm": 0.40001922845840454, + "learning_rate": 0.00017617333212025112, + "loss": 1.2753, + "step": 9182 + }, + { + "epoch": 0.11932889677946472, + "grad_norm": 0.48631253838539124, + "learning_rate": 0.00017617073265833972, + "loss": 1.6322, + "step": 9183 + }, + { + "epoch": 0.11934189132338059, + "grad_norm": 0.35049542784690857, + "learning_rate": 0.00017616813319642834, + "loss": 1.4875, + "step": 9184 + }, + { + "epoch": 0.11935488586729646, + "grad_norm": 0.3739171624183655, + "learning_rate": 0.00017616553373451697, + "loss": 1.1823, + "step": 9185 + }, + { + "epoch": 0.11936788041121234, + "grad_norm": 0.4527442157268524, + "learning_rate": 0.00017616293427260557, + "loss": 1.5332, + "step": 9186 + }, + { + "epoch": 0.11938087495512821, + "grad_norm": 0.48978838324546814, + "learning_rate": 0.00017616033481069422, + "loss": 1.5907, + "step": 9187 + }, + { + "epoch": 0.11939386949904408, + "grad_norm": 0.312648743391037, + "learning_rate": 0.00017615773534878282, + "loss": 1.3738, + "step": 9188 + }, + { + "epoch": 0.11940686404295996, + "grad_norm": 0.31487321853637695, + "learning_rate": 0.0001761551358868714, + "loss": 1.538, + "step": 9189 + }, + { + "epoch": 0.11941985858687583, + "grad_norm": 0.3423134684562683, + "learning_rate": 0.00017615253642496004, + "loss": 1.463, + "step": 9190 + }, + { + "epoch": 0.1194328531307917, + "grad_norm": 0.27038952708244324, + "learning_rate": 0.00017614993696304866, + "loss": 1.3668, + "step": 9191 + }, + { + "epoch": 0.11944584767470758, + "grad_norm": 0.37435290217399597, + "learning_rate": 0.0001761473375011373, + "loss": 1.2847, + "step": 9192 + }, + { + "epoch": 0.11945884221862346, + "grad_norm": 0.32118090987205505, + "learning_rate": 0.00017614473803922588, + "loss": 1.2147, + "step": 9193 + }, + { + "epoch": 0.11947183676253934, + "grad_norm": 0.38151276111602783, + "learning_rate": 0.0001761421385773145, + "loss": 1.3207, + "step": 9194 + }, + { + "epoch": 0.11948483130645521, + "grad_norm": 0.39752355217933655, + "learning_rate": 0.00017613953911540313, + "loss": 1.5357, + "step": 9195 + }, + { + "epoch": 0.11949782585037108, + "grad_norm": 0.3887764811515808, + "learning_rate": 0.00017613693965349173, + "loss": 1.208, + "step": 9196 + }, + { + "epoch": 0.11951082039428695, + "grad_norm": 0.4786817133426666, + "learning_rate": 0.00017613434019158035, + "loss": 1.599, + "step": 9197 + }, + { + "epoch": 0.11952381493820283, + "grad_norm": 0.38299015164375305, + "learning_rate": 0.00017613174072966895, + "loss": 1.3418, + "step": 9198 + }, + { + "epoch": 0.1195368094821187, + "grad_norm": 0.3579920530319214, + "learning_rate": 0.0001761291412677576, + "loss": 1.5076, + "step": 9199 + }, + { + "epoch": 0.11954980402603457, + "grad_norm": 0.37152478098869324, + "learning_rate": 0.0001761265418058462, + "loss": 1.2421, + "step": 9200 + }, + { + "epoch": 0.11956279856995045, + "grad_norm": 0.28056010603904724, + "learning_rate": 0.0001761239423439348, + "loss": 1.6692, + "step": 9201 + }, + { + "epoch": 0.11957579311386632, + "grad_norm": 0.4593879282474518, + "learning_rate": 0.00017612134288202342, + "loss": 1.393, + "step": 9202 + }, + { + "epoch": 0.11958878765778219, + "grad_norm": 0.36429837346076965, + "learning_rate": 0.00017611874342011205, + "loss": 1.5133, + "step": 9203 + }, + { + "epoch": 0.11960178220169806, + "grad_norm": 0.3880722224712372, + "learning_rate": 0.00017611614395820067, + "loss": 1.4259, + "step": 9204 + }, + { + "epoch": 0.11961477674561394, + "grad_norm": 0.3845650851726532, + "learning_rate": 0.00017611354449628927, + "loss": 1.6412, + "step": 9205 + }, + { + "epoch": 0.11962777128952981, + "grad_norm": 0.34730517864227295, + "learning_rate": 0.0001761109450343779, + "loss": 1.3237, + "step": 9206 + }, + { + "epoch": 0.11964076583344568, + "grad_norm": 0.3938792049884796, + "learning_rate": 0.00017610834557246652, + "loss": 1.534, + "step": 9207 + }, + { + "epoch": 0.11965376037736156, + "grad_norm": 0.3831009566783905, + "learning_rate": 0.00017610574611055512, + "loss": 1.3745, + "step": 9208 + }, + { + "epoch": 0.11966675492127743, + "grad_norm": 0.4241017997264862, + "learning_rate": 0.00017610314664864374, + "loss": 1.417, + "step": 9209 + }, + { + "epoch": 0.1196797494651933, + "grad_norm": 0.4668117165565491, + "learning_rate": 0.00017610054718673234, + "loss": 1.3501, + "step": 9210 + }, + { + "epoch": 0.11969274400910918, + "grad_norm": 0.3994259834289551, + "learning_rate": 0.000176097947724821, + "loss": 1.5354, + "step": 9211 + }, + { + "epoch": 0.11970573855302505, + "grad_norm": 0.3899083137512207, + "learning_rate": 0.0001760953482629096, + "loss": 1.465, + "step": 9212 + }, + { + "epoch": 0.11971873309694092, + "grad_norm": 0.3840634822845459, + "learning_rate": 0.00017609274880099818, + "loss": 1.2357, + "step": 9213 + }, + { + "epoch": 0.1197317276408568, + "grad_norm": 0.34098339080810547, + "learning_rate": 0.0001760901493390868, + "loss": 1.5441, + "step": 9214 + }, + { + "epoch": 0.11974472218477267, + "grad_norm": 0.4204641282558441, + "learning_rate": 0.00017608754987717543, + "loss": 1.3739, + "step": 9215 + }, + { + "epoch": 0.11975771672868854, + "grad_norm": 0.38086310029029846, + "learning_rate": 0.00017608495041526406, + "loss": 1.5106, + "step": 9216 + }, + { + "epoch": 0.11977071127260441, + "grad_norm": 0.4430563151836395, + "learning_rate": 0.00017608235095335265, + "loss": 1.6276, + "step": 9217 + }, + { + "epoch": 0.11978370581652029, + "grad_norm": 0.3624548316001892, + "learning_rate": 0.00017607975149144128, + "loss": 1.3159, + "step": 9218 + }, + { + "epoch": 0.11979670036043616, + "grad_norm": 0.3226706385612488, + "learning_rate": 0.0001760771520295299, + "loss": 1.4212, + "step": 9219 + }, + { + "epoch": 0.11980969490435203, + "grad_norm": 0.3631337285041809, + "learning_rate": 0.0001760745525676185, + "loss": 1.438, + "step": 9220 + }, + { + "epoch": 0.1198226894482679, + "grad_norm": 0.3964703381061554, + "learning_rate": 0.00017607195310570713, + "loss": 1.5525, + "step": 9221 + }, + { + "epoch": 0.11983568399218378, + "grad_norm": 0.4067436754703522, + "learning_rate": 0.00017606935364379575, + "loss": 1.4267, + "step": 9222 + }, + { + "epoch": 0.11984867853609965, + "grad_norm": 0.37554192543029785, + "learning_rate": 0.00017606675418188437, + "loss": 1.5489, + "step": 9223 + }, + { + "epoch": 0.11986167308001552, + "grad_norm": 0.38695022463798523, + "learning_rate": 0.00017606415471997297, + "loss": 1.4888, + "step": 9224 + }, + { + "epoch": 0.1198746676239314, + "grad_norm": 0.27086248993873596, + "learning_rate": 0.0001760615552580616, + "loss": 1.1403, + "step": 9225 + }, + { + "epoch": 0.11988766216784727, + "grad_norm": 0.3320566415786743, + "learning_rate": 0.00017605895579615022, + "loss": 1.3235, + "step": 9226 + }, + { + "epoch": 0.11990065671176314, + "grad_norm": 0.4477056860923767, + "learning_rate": 0.00017605635633423882, + "loss": 1.3832, + "step": 9227 + }, + { + "epoch": 0.11991365125567902, + "grad_norm": 0.42116689682006836, + "learning_rate": 0.00017605375687232744, + "loss": 1.3223, + "step": 9228 + }, + { + "epoch": 0.11992664579959489, + "grad_norm": 0.3749730587005615, + "learning_rate": 0.00017605115741041604, + "loss": 1.4251, + "step": 9229 + }, + { + "epoch": 0.11993964034351076, + "grad_norm": 0.3425008952617645, + "learning_rate": 0.00017604855794850466, + "loss": 1.2577, + "step": 9230 + }, + { + "epoch": 0.11995263488742665, + "grad_norm": 0.3652496039867401, + "learning_rate": 0.0001760459584865933, + "loss": 1.2502, + "step": 9231 + }, + { + "epoch": 0.11996562943134252, + "grad_norm": 0.4033985435962677, + "learning_rate": 0.0001760433590246819, + "loss": 1.407, + "step": 9232 + }, + { + "epoch": 0.1199786239752584, + "grad_norm": 0.38904184103012085, + "learning_rate": 0.0001760407595627705, + "loss": 1.3089, + "step": 9233 + }, + { + "epoch": 0.11999161851917427, + "grad_norm": 0.4373151659965515, + "learning_rate": 0.00017603816010085914, + "loss": 1.4005, + "step": 9234 + }, + { + "epoch": 0.12000461306309014, + "grad_norm": 0.39823117852211, + "learning_rate": 0.00017603556063894776, + "loss": 1.4045, + "step": 9235 + }, + { + "epoch": 0.12001760760700601, + "grad_norm": 0.3663402199745178, + "learning_rate": 0.00017603296117703636, + "loss": 1.3554, + "step": 9236 + }, + { + "epoch": 0.12003060215092189, + "grad_norm": 0.44000574946403503, + "learning_rate": 0.00017603036171512498, + "loss": 1.3884, + "step": 9237 + }, + { + "epoch": 0.12004359669483776, + "grad_norm": 0.38616931438446045, + "learning_rate": 0.0001760277622532136, + "loss": 1.4719, + "step": 9238 + }, + { + "epoch": 0.12005659123875363, + "grad_norm": 0.40794098377227783, + "learning_rate": 0.0001760251627913022, + "loss": 1.3611, + "step": 9239 + }, + { + "epoch": 0.1200695857826695, + "grad_norm": 0.473959743976593, + "learning_rate": 0.00017602256332939083, + "loss": 1.4182, + "step": 9240 + }, + { + "epoch": 0.12008258032658538, + "grad_norm": 0.30298808217048645, + "learning_rate": 0.00017601996386747943, + "loss": 1.3097, + "step": 9241 + }, + { + "epoch": 0.12009557487050125, + "grad_norm": 0.32721996307373047, + "learning_rate": 0.00017601736440556808, + "loss": 1.4447, + "step": 9242 + }, + { + "epoch": 0.12010856941441712, + "grad_norm": 0.40660449862480164, + "learning_rate": 0.00017601476494365667, + "loss": 1.4161, + "step": 9243 + }, + { + "epoch": 0.120121563958333, + "grad_norm": 0.7570306658744812, + "learning_rate": 0.00017601216548174527, + "loss": 1.4576, + "step": 9244 + }, + { + "epoch": 0.12013455850224887, + "grad_norm": 0.4389442801475525, + "learning_rate": 0.0001760095660198339, + "loss": 1.5235, + "step": 9245 + }, + { + "epoch": 0.12014755304616474, + "grad_norm": 0.39088016748428345, + "learning_rate": 0.00017600696655792252, + "loss": 1.3278, + "step": 9246 + }, + { + "epoch": 0.12016054759008062, + "grad_norm": 0.3125208020210266, + "learning_rate": 0.00017600436709601115, + "loss": 1.3421, + "step": 9247 + }, + { + "epoch": 0.12017354213399649, + "grad_norm": 0.3390772342681885, + "learning_rate": 0.00017600176763409974, + "loss": 1.18, + "step": 9248 + }, + { + "epoch": 0.12018653667791236, + "grad_norm": 0.35902246832847595, + "learning_rate": 0.00017599916817218837, + "loss": 1.2974, + "step": 9249 + }, + { + "epoch": 0.12019953122182823, + "grad_norm": 0.41039159893989563, + "learning_rate": 0.000175996568710277, + "loss": 1.552, + "step": 9250 + }, + { + "epoch": 0.12021252576574411, + "grad_norm": 0.7680051922798157, + "learning_rate": 0.0001759939692483656, + "loss": 1.3846, + "step": 9251 + }, + { + "epoch": 0.12022552030965998, + "grad_norm": 0.3319466710090637, + "learning_rate": 0.0001759913697864542, + "loss": 1.3807, + "step": 9252 + }, + { + "epoch": 0.12023851485357585, + "grad_norm": 0.3757462501525879, + "learning_rate": 0.0001759887703245428, + "loss": 1.4435, + "step": 9253 + }, + { + "epoch": 0.12025150939749173, + "grad_norm": 0.32348567247390747, + "learning_rate": 0.00017598617086263146, + "loss": 1.4973, + "step": 9254 + }, + { + "epoch": 0.1202645039414076, + "grad_norm": 0.4984109699726105, + "learning_rate": 0.00017598357140072006, + "loss": 1.5998, + "step": 9255 + }, + { + "epoch": 0.12027749848532347, + "grad_norm": 0.4879307746887207, + "learning_rate": 0.00017598097193880866, + "loss": 1.3849, + "step": 9256 + }, + { + "epoch": 0.12029049302923935, + "grad_norm": 0.46921801567077637, + "learning_rate": 0.00017597837247689728, + "loss": 1.5048, + "step": 9257 + }, + { + "epoch": 0.12030348757315522, + "grad_norm": 0.5488225221633911, + "learning_rate": 0.0001759757730149859, + "loss": 1.5263, + "step": 9258 + }, + { + "epoch": 0.12031648211707109, + "grad_norm": 0.46183568239212036, + "learning_rate": 0.00017597317355307453, + "loss": 1.6255, + "step": 9259 + }, + { + "epoch": 0.12032947666098696, + "grad_norm": 0.35899120569229126, + "learning_rate": 0.00017597057409116313, + "loss": 1.3729, + "step": 9260 + }, + { + "epoch": 0.12034247120490284, + "grad_norm": 0.36731332540512085, + "learning_rate": 0.00017596797462925175, + "loss": 1.2305, + "step": 9261 + }, + { + "epoch": 0.12035546574881871, + "grad_norm": 0.3999217450618744, + "learning_rate": 0.00017596537516734038, + "loss": 1.7356, + "step": 9262 + }, + { + "epoch": 0.12036846029273458, + "grad_norm": 0.4213993549346924, + "learning_rate": 0.00017596277570542897, + "loss": 1.4108, + "step": 9263 + }, + { + "epoch": 0.12038145483665046, + "grad_norm": 0.3461703956127167, + "learning_rate": 0.0001759601762435176, + "loss": 1.5847, + "step": 9264 + }, + { + "epoch": 0.12039444938056633, + "grad_norm": 0.36527061462402344, + "learning_rate": 0.00017595757678160622, + "loss": 1.3304, + "step": 9265 + }, + { + "epoch": 0.1204074439244822, + "grad_norm": 0.4013476073741913, + "learning_rate": 0.00017595497731969485, + "loss": 1.2884, + "step": 9266 + }, + { + "epoch": 0.12042043846839807, + "grad_norm": 0.4246699810028076, + "learning_rate": 0.00017595237785778345, + "loss": 1.5034, + "step": 9267 + }, + { + "epoch": 0.12043343301231395, + "grad_norm": 0.38888460397720337, + "learning_rate": 0.00017594977839587204, + "loss": 1.3504, + "step": 9268 + }, + { + "epoch": 0.12044642755622983, + "grad_norm": 0.3762776553630829, + "learning_rate": 0.0001759471789339607, + "loss": 1.5032, + "step": 9269 + }, + { + "epoch": 0.12045942210014571, + "grad_norm": 0.40783512592315674, + "learning_rate": 0.0001759445794720493, + "loss": 1.4541, + "step": 9270 + }, + { + "epoch": 0.12047241664406158, + "grad_norm": 0.33674609661102295, + "learning_rate": 0.00017594198001013792, + "loss": 1.3576, + "step": 9271 + }, + { + "epoch": 0.12048541118797745, + "grad_norm": 0.3624383807182312, + "learning_rate": 0.0001759393805482265, + "loss": 1.4659, + "step": 9272 + }, + { + "epoch": 0.12049840573189333, + "grad_norm": 0.4997468590736389, + "learning_rate": 0.00017593678108631514, + "loss": 1.568, + "step": 9273 + }, + { + "epoch": 0.1205114002758092, + "grad_norm": 0.49450770020484924, + "learning_rate": 0.00017593418162440376, + "loss": 1.4277, + "step": 9274 + }, + { + "epoch": 0.12052439481972507, + "grad_norm": 0.21475233137607574, + "learning_rate": 0.00017593158216249236, + "loss": 1.2039, + "step": 9275 + }, + { + "epoch": 0.12053738936364095, + "grad_norm": 0.3248988389968872, + "learning_rate": 0.00017592898270058098, + "loss": 1.2936, + "step": 9276 + }, + { + "epoch": 0.12055038390755682, + "grad_norm": 0.41979753971099854, + "learning_rate": 0.0001759263832386696, + "loss": 1.3144, + "step": 9277 + }, + { + "epoch": 0.12056337845147269, + "grad_norm": 0.4150199294090271, + "learning_rate": 0.00017592378377675823, + "loss": 1.3194, + "step": 9278 + }, + { + "epoch": 0.12057637299538856, + "grad_norm": 0.36927545070648193, + "learning_rate": 0.00017592118431484683, + "loss": 1.5334, + "step": 9279 + }, + { + "epoch": 0.12058936753930444, + "grad_norm": 0.38874977827072144, + "learning_rate": 0.00017591858485293546, + "loss": 1.3427, + "step": 9280 + }, + { + "epoch": 0.12060236208322031, + "grad_norm": 0.4251580238342285, + "learning_rate": 0.00017591598539102408, + "loss": 1.2466, + "step": 9281 + }, + { + "epoch": 0.12061535662713618, + "grad_norm": 0.32135188579559326, + "learning_rate": 0.00017591338592911268, + "loss": 1.3673, + "step": 9282 + }, + { + "epoch": 0.12062835117105206, + "grad_norm": 0.4495638608932495, + "learning_rate": 0.0001759107864672013, + "loss": 1.4018, + "step": 9283 + }, + { + "epoch": 0.12064134571496793, + "grad_norm": 0.3772464990615845, + "learning_rate": 0.0001759081870052899, + "loss": 1.5787, + "step": 9284 + }, + { + "epoch": 0.1206543402588838, + "grad_norm": 0.4275606870651245, + "learning_rate": 0.00017590558754337852, + "loss": 1.4332, + "step": 9285 + }, + { + "epoch": 0.12066733480279968, + "grad_norm": 0.3489293158054352, + "learning_rate": 0.00017590298808146715, + "loss": 1.356, + "step": 9286 + }, + { + "epoch": 0.12068032934671555, + "grad_norm": 0.4110635221004486, + "learning_rate": 0.00017590038861955575, + "loss": 1.4342, + "step": 9287 + }, + { + "epoch": 0.12069332389063142, + "grad_norm": 0.44617557525634766, + "learning_rate": 0.00017589778915764437, + "loss": 1.4562, + "step": 9288 + }, + { + "epoch": 0.1207063184345473, + "grad_norm": 0.3555718660354614, + "learning_rate": 0.000175895189695733, + "loss": 1.1468, + "step": 9289 + }, + { + "epoch": 0.12071931297846317, + "grad_norm": 0.4795862138271332, + "learning_rate": 0.00017589259023382162, + "loss": 1.5313, + "step": 9290 + }, + { + "epoch": 0.12073230752237904, + "grad_norm": 0.44497111439704895, + "learning_rate": 0.00017588999077191022, + "loss": 1.5787, + "step": 9291 + }, + { + "epoch": 0.12074530206629491, + "grad_norm": 0.3562009930610657, + "learning_rate": 0.00017588739130999884, + "loss": 1.6434, + "step": 9292 + }, + { + "epoch": 0.12075829661021079, + "grad_norm": 0.43425676226615906, + "learning_rate": 0.00017588479184808747, + "loss": 1.3577, + "step": 9293 + }, + { + "epoch": 0.12077129115412666, + "grad_norm": 0.3656645119190216, + "learning_rate": 0.00017588219238617606, + "loss": 1.3952, + "step": 9294 + }, + { + "epoch": 0.12078428569804253, + "grad_norm": 0.3730233609676361, + "learning_rate": 0.0001758795929242647, + "loss": 1.474, + "step": 9295 + }, + { + "epoch": 0.1207972802419584, + "grad_norm": 0.39313989877700806, + "learning_rate": 0.0001758769934623533, + "loss": 1.3837, + "step": 9296 + }, + { + "epoch": 0.12081027478587428, + "grad_norm": 0.415451318025589, + "learning_rate": 0.0001758743940004419, + "loss": 1.4818, + "step": 9297 + }, + { + "epoch": 0.12082326932979015, + "grad_norm": 0.40186449885368347, + "learning_rate": 0.00017587179453853053, + "loss": 1.412, + "step": 9298 + }, + { + "epoch": 0.12083626387370602, + "grad_norm": 0.32585060596466064, + "learning_rate": 0.00017586919507661913, + "loss": 1.4182, + "step": 9299 + }, + { + "epoch": 0.1208492584176219, + "grad_norm": 0.36978092789649963, + "learning_rate": 0.00017586659561470778, + "loss": 1.3818, + "step": 9300 + }, + { + "epoch": 0.12086225296153777, + "grad_norm": 0.3578946590423584, + "learning_rate": 0.00017586399615279638, + "loss": 1.2783, + "step": 9301 + }, + { + "epoch": 0.12087524750545364, + "grad_norm": 0.2977055311203003, + "learning_rate": 0.000175861396690885, + "loss": 1.5007, + "step": 9302 + }, + { + "epoch": 0.12088824204936952, + "grad_norm": 0.37292179465293884, + "learning_rate": 0.0001758587972289736, + "loss": 1.3219, + "step": 9303 + }, + { + "epoch": 0.12090123659328539, + "grad_norm": 0.39701586961746216, + "learning_rate": 0.00017585619776706223, + "loss": 1.4429, + "step": 9304 + }, + { + "epoch": 0.12091423113720126, + "grad_norm": 0.27553433179855347, + "learning_rate": 0.00017585359830515085, + "loss": 1.3378, + "step": 9305 + }, + { + "epoch": 0.12092722568111713, + "grad_norm": 0.30529335141181946, + "learning_rate": 0.00017585099884323945, + "loss": 1.2722, + "step": 9306 + }, + { + "epoch": 0.12094022022503302, + "grad_norm": 0.3753760755062103, + "learning_rate": 0.00017584839938132807, + "loss": 1.3034, + "step": 9307 + }, + { + "epoch": 0.1209532147689489, + "grad_norm": 0.38662222027778625, + "learning_rate": 0.0001758457999194167, + "loss": 1.3716, + "step": 9308 + }, + { + "epoch": 0.12096620931286477, + "grad_norm": 0.3851815164089203, + "learning_rate": 0.00017584320045750532, + "loss": 1.2685, + "step": 9309 + }, + { + "epoch": 0.12097920385678064, + "grad_norm": 0.3915620744228363, + "learning_rate": 0.00017584060099559392, + "loss": 1.3465, + "step": 9310 + }, + { + "epoch": 0.12099219840069651, + "grad_norm": 0.4091010093688965, + "learning_rate": 0.00017583800153368252, + "loss": 1.1862, + "step": 9311 + }, + { + "epoch": 0.12100519294461239, + "grad_norm": 0.360620379447937, + "learning_rate": 0.00017583540207177117, + "loss": 1.3402, + "step": 9312 + }, + { + "epoch": 0.12101818748852826, + "grad_norm": 0.4678625464439392, + "learning_rate": 0.00017583280260985976, + "loss": 1.6014, + "step": 9313 + }, + { + "epoch": 0.12103118203244413, + "grad_norm": 0.4157090187072754, + "learning_rate": 0.0001758302031479484, + "loss": 1.245, + "step": 9314 + }, + { + "epoch": 0.12104417657636, + "grad_norm": 0.2855381965637207, + "learning_rate": 0.000175827603686037, + "loss": 1.289, + "step": 9315 + }, + { + "epoch": 0.12105717112027588, + "grad_norm": 0.4155445992946625, + "learning_rate": 0.0001758250042241256, + "loss": 1.4041, + "step": 9316 + }, + { + "epoch": 0.12107016566419175, + "grad_norm": 0.4189385175704956, + "learning_rate": 0.00017582240476221424, + "loss": 1.4988, + "step": 9317 + }, + { + "epoch": 0.12108316020810762, + "grad_norm": 0.5436775088310242, + "learning_rate": 0.00017581980530030283, + "loss": 1.5024, + "step": 9318 + }, + { + "epoch": 0.1210961547520235, + "grad_norm": 0.38973748683929443, + "learning_rate": 0.00017581720583839146, + "loss": 1.3585, + "step": 9319 + }, + { + "epoch": 0.12110914929593937, + "grad_norm": 0.4831514358520508, + "learning_rate": 0.00017581460637648008, + "loss": 1.4914, + "step": 9320 + }, + { + "epoch": 0.12112214383985524, + "grad_norm": 0.4586810767650604, + "learning_rate": 0.0001758120069145687, + "loss": 1.4678, + "step": 9321 + }, + { + "epoch": 0.12113513838377112, + "grad_norm": 0.3998999297618866, + "learning_rate": 0.0001758094074526573, + "loss": 1.5773, + "step": 9322 + }, + { + "epoch": 0.12114813292768699, + "grad_norm": 0.41111308336257935, + "learning_rate": 0.0001758068079907459, + "loss": 1.3625, + "step": 9323 + }, + { + "epoch": 0.12116112747160286, + "grad_norm": 0.32451263070106506, + "learning_rate": 0.00017580420852883455, + "loss": 1.2822, + "step": 9324 + }, + { + "epoch": 0.12117412201551873, + "grad_norm": 0.42317408323287964, + "learning_rate": 0.00017580160906692315, + "loss": 1.3132, + "step": 9325 + }, + { + "epoch": 0.12118711655943461, + "grad_norm": 0.33010753989219666, + "learning_rate": 0.00017579900960501177, + "loss": 1.3395, + "step": 9326 + }, + { + "epoch": 0.12120011110335048, + "grad_norm": 0.4237150251865387, + "learning_rate": 0.00017579641014310037, + "loss": 1.5932, + "step": 9327 + }, + { + "epoch": 0.12121310564726635, + "grad_norm": 0.33284929394721985, + "learning_rate": 0.000175793810681189, + "loss": 1.2039, + "step": 9328 + }, + { + "epoch": 0.12122610019118223, + "grad_norm": 0.4798779785633087, + "learning_rate": 0.00017579121121927762, + "loss": 1.5158, + "step": 9329 + }, + { + "epoch": 0.1212390947350981, + "grad_norm": 0.3473212420940399, + "learning_rate": 0.00017578861175736622, + "loss": 1.3428, + "step": 9330 + }, + { + "epoch": 0.12125208927901397, + "grad_norm": 0.2981327176094055, + "learning_rate": 0.00017578601229545484, + "loss": 1.3875, + "step": 9331 + }, + { + "epoch": 0.12126508382292985, + "grad_norm": 0.42622026801109314, + "learning_rate": 0.00017578341283354347, + "loss": 1.4639, + "step": 9332 + }, + { + "epoch": 0.12127807836684572, + "grad_norm": 0.29069605469703674, + "learning_rate": 0.0001757808133716321, + "loss": 1.2565, + "step": 9333 + }, + { + "epoch": 0.12129107291076159, + "grad_norm": 0.3991733193397522, + "learning_rate": 0.0001757782139097207, + "loss": 1.4763, + "step": 9334 + }, + { + "epoch": 0.12130406745467746, + "grad_norm": 0.42441391944885254, + "learning_rate": 0.00017577561444780931, + "loss": 1.5025, + "step": 9335 + }, + { + "epoch": 0.12131706199859334, + "grad_norm": 0.36271902918815613, + "learning_rate": 0.00017577301498589794, + "loss": 1.6598, + "step": 9336 + }, + { + "epoch": 0.12133005654250921, + "grad_norm": 0.4363735318183899, + "learning_rate": 0.00017577041552398654, + "loss": 1.4993, + "step": 9337 + }, + { + "epoch": 0.12134305108642508, + "grad_norm": 0.3632548749446869, + "learning_rate": 0.00017576781606207516, + "loss": 1.2148, + "step": 9338 + }, + { + "epoch": 0.12135604563034096, + "grad_norm": 0.48507094383239746, + "learning_rate": 0.00017576521660016378, + "loss": 1.526, + "step": 9339 + }, + { + "epoch": 0.12136904017425683, + "grad_norm": 0.3822447955608368, + "learning_rate": 0.00017576261713825238, + "loss": 1.3814, + "step": 9340 + }, + { + "epoch": 0.1213820347181727, + "grad_norm": 0.43666285276412964, + "learning_rate": 0.000175760017676341, + "loss": 1.4766, + "step": 9341 + }, + { + "epoch": 0.12139502926208857, + "grad_norm": 0.46178942918777466, + "learning_rate": 0.0001757574182144296, + "loss": 1.4913, + "step": 9342 + }, + { + "epoch": 0.12140802380600445, + "grad_norm": 0.4100555181503296, + "learning_rate": 0.00017575481875251826, + "loss": 1.4562, + "step": 9343 + }, + { + "epoch": 0.12142101834992032, + "grad_norm": 0.3818536102771759, + "learning_rate": 0.00017575221929060685, + "loss": 1.5386, + "step": 9344 + }, + { + "epoch": 0.12143401289383621, + "grad_norm": 0.25779303908348083, + "learning_rate": 0.00017574961982869548, + "loss": 1.1867, + "step": 9345 + }, + { + "epoch": 0.12144700743775208, + "grad_norm": 0.3203864097595215, + "learning_rate": 0.00017574702036678407, + "loss": 1.2648, + "step": 9346 + }, + { + "epoch": 0.12146000198166795, + "grad_norm": 0.487255722284317, + "learning_rate": 0.0001757444209048727, + "loss": 1.5688, + "step": 9347 + }, + { + "epoch": 0.12147299652558383, + "grad_norm": 0.33795788884162903, + "learning_rate": 0.00017574182144296132, + "loss": 1.4224, + "step": 9348 + }, + { + "epoch": 0.1214859910694997, + "grad_norm": 0.4090534448623657, + "learning_rate": 0.00017573922198104992, + "loss": 1.3815, + "step": 9349 + }, + { + "epoch": 0.12149898561341557, + "grad_norm": 0.4001588821411133, + "learning_rate": 0.00017573662251913855, + "loss": 1.4646, + "step": 9350 + }, + { + "epoch": 0.12151198015733145, + "grad_norm": 0.4228440523147583, + "learning_rate": 0.00017573402305722717, + "loss": 1.393, + "step": 9351 + }, + { + "epoch": 0.12152497470124732, + "grad_norm": 0.45497748255729675, + "learning_rate": 0.00017573142359531577, + "loss": 1.4689, + "step": 9352 + }, + { + "epoch": 0.12153796924516319, + "grad_norm": 0.7702919840812683, + "learning_rate": 0.0001757288241334044, + "loss": 1.4536, + "step": 9353 + }, + { + "epoch": 0.12155096378907906, + "grad_norm": 0.4107643663883209, + "learning_rate": 0.000175726224671493, + "loss": 1.5099, + "step": 9354 + }, + { + "epoch": 0.12156395833299494, + "grad_norm": 0.3273683488368988, + "learning_rate": 0.00017572362520958164, + "loss": 1.3992, + "step": 9355 + }, + { + "epoch": 0.12157695287691081, + "grad_norm": 0.26075479388237, + "learning_rate": 0.00017572102574767024, + "loss": 1.3166, + "step": 9356 + }, + { + "epoch": 0.12158994742082668, + "grad_norm": 0.3445676863193512, + "learning_rate": 0.00017571842628575886, + "loss": 1.2702, + "step": 9357 + }, + { + "epoch": 0.12160294196474256, + "grad_norm": 0.3990176320075989, + "learning_rate": 0.00017571582682384746, + "loss": 1.4989, + "step": 9358 + }, + { + "epoch": 0.12161593650865843, + "grad_norm": 0.4130074977874756, + "learning_rate": 0.00017571322736193608, + "loss": 1.4303, + "step": 9359 + }, + { + "epoch": 0.1216289310525743, + "grad_norm": 0.39101868867874146, + "learning_rate": 0.0001757106279000247, + "loss": 1.406, + "step": 9360 + }, + { + "epoch": 0.12164192559649017, + "grad_norm": 0.3175065219402313, + "learning_rate": 0.0001757080284381133, + "loss": 1.4506, + "step": 9361 + }, + { + "epoch": 0.12165492014040605, + "grad_norm": 0.32256609201431274, + "learning_rate": 0.00017570542897620193, + "loss": 1.5279, + "step": 9362 + }, + { + "epoch": 0.12166791468432192, + "grad_norm": 0.4326832890510559, + "learning_rate": 0.00017570282951429056, + "loss": 1.4225, + "step": 9363 + }, + { + "epoch": 0.1216809092282378, + "grad_norm": 0.33629029989242554, + "learning_rate": 0.00017570023005237918, + "loss": 1.374, + "step": 9364 + }, + { + "epoch": 0.12169390377215367, + "grad_norm": 0.414267897605896, + "learning_rate": 0.00017569763059046778, + "loss": 1.4348, + "step": 9365 + }, + { + "epoch": 0.12170689831606954, + "grad_norm": 0.3454011380672455, + "learning_rate": 0.00017569503112855637, + "loss": 1.3953, + "step": 9366 + }, + { + "epoch": 0.12171989285998541, + "grad_norm": 0.523872971534729, + "learning_rate": 0.00017569243166664503, + "loss": 1.5102, + "step": 9367 + }, + { + "epoch": 0.12173288740390129, + "grad_norm": 0.4031912088394165, + "learning_rate": 0.00017568983220473362, + "loss": 1.6535, + "step": 9368 + }, + { + "epoch": 0.12174588194781716, + "grad_norm": 0.3654964864253998, + "learning_rate": 0.00017568723274282225, + "loss": 1.396, + "step": 9369 + }, + { + "epoch": 0.12175887649173303, + "grad_norm": 0.37155187129974365, + "learning_rate": 0.00017568463328091087, + "loss": 1.5124, + "step": 9370 + }, + { + "epoch": 0.1217718710356489, + "grad_norm": 0.31775298714637756, + "learning_rate": 0.00017568203381899947, + "loss": 1.3741, + "step": 9371 + }, + { + "epoch": 0.12178486557956478, + "grad_norm": 0.303800106048584, + "learning_rate": 0.0001756794343570881, + "loss": 1.2745, + "step": 9372 + }, + { + "epoch": 0.12179786012348065, + "grad_norm": 0.3304106593132019, + "learning_rate": 0.0001756768348951767, + "loss": 1.5423, + "step": 9373 + }, + { + "epoch": 0.12181085466739652, + "grad_norm": 0.42666420340538025, + "learning_rate": 0.00017567423543326534, + "loss": 1.4133, + "step": 9374 + }, + { + "epoch": 0.1218238492113124, + "grad_norm": 0.43140193819999695, + "learning_rate": 0.00017567163597135394, + "loss": 1.5115, + "step": 9375 + }, + { + "epoch": 0.12183684375522827, + "grad_norm": 0.3730902373790741, + "learning_rate": 0.00017566903650944257, + "loss": 1.3174, + "step": 9376 + }, + { + "epoch": 0.12184983829914414, + "grad_norm": 0.36808860301971436, + "learning_rate": 0.00017566643704753116, + "loss": 1.4683, + "step": 9377 + }, + { + "epoch": 0.12186283284306001, + "grad_norm": 0.3478783667087555, + "learning_rate": 0.0001756638375856198, + "loss": 1.3244, + "step": 9378 + }, + { + "epoch": 0.12187582738697589, + "grad_norm": 0.42890840768814087, + "learning_rate": 0.0001756612381237084, + "loss": 1.5046, + "step": 9379 + }, + { + "epoch": 0.12188882193089176, + "grad_norm": 0.2624491751194, + "learning_rate": 0.000175658638661797, + "loss": 1.3872, + "step": 9380 + }, + { + "epoch": 0.12190181647480763, + "grad_norm": 0.29321375489234924, + "learning_rate": 0.00017565603919988563, + "loss": 1.4468, + "step": 9381 + }, + { + "epoch": 0.1219148110187235, + "grad_norm": 0.4059242606163025, + "learning_rate": 0.00017565343973797426, + "loss": 1.4384, + "step": 9382 + }, + { + "epoch": 0.1219278055626394, + "grad_norm": 0.48989370465278625, + "learning_rate": 0.00017565084027606286, + "loss": 1.2889, + "step": 9383 + }, + { + "epoch": 0.12194080010655527, + "grad_norm": 0.3938312232494354, + "learning_rate": 0.00017564824081415148, + "loss": 1.4097, + "step": 9384 + }, + { + "epoch": 0.12195379465047114, + "grad_norm": 0.4003935754299164, + "learning_rate": 0.00017564564135224008, + "loss": 1.3756, + "step": 9385 + }, + { + "epoch": 0.12196678919438701, + "grad_norm": 0.4381786584854126, + "learning_rate": 0.00017564304189032873, + "loss": 1.2769, + "step": 9386 + }, + { + "epoch": 0.12197978373830289, + "grad_norm": 0.36698466539382935, + "learning_rate": 0.00017564044242841733, + "loss": 1.5093, + "step": 9387 + }, + { + "epoch": 0.12199277828221876, + "grad_norm": 0.31886789202690125, + "learning_rate": 0.00017563784296650595, + "loss": 1.416, + "step": 9388 + }, + { + "epoch": 0.12200577282613463, + "grad_norm": 0.37381190061569214, + "learning_rate": 0.00017563524350459455, + "loss": 1.4511, + "step": 9389 + }, + { + "epoch": 0.1220187673700505, + "grad_norm": 0.43987271189689636, + "learning_rate": 0.00017563264404268317, + "loss": 1.5223, + "step": 9390 + }, + { + "epoch": 0.12203176191396638, + "grad_norm": 0.37794679403305054, + "learning_rate": 0.0001756300445807718, + "loss": 1.28, + "step": 9391 + }, + { + "epoch": 0.12204475645788225, + "grad_norm": 0.37859833240509033, + "learning_rate": 0.0001756274451188604, + "loss": 1.2429, + "step": 9392 + }, + { + "epoch": 0.12205775100179812, + "grad_norm": 0.4435123801231384, + "learning_rate": 0.00017562484565694902, + "loss": 1.4836, + "step": 9393 + }, + { + "epoch": 0.122070745545714, + "grad_norm": 0.5046146512031555, + "learning_rate": 0.00017562224619503764, + "loss": 1.5646, + "step": 9394 + }, + { + "epoch": 0.12208374008962987, + "grad_norm": 0.4131261706352234, + "learning_rate": 0.00017561964673312624, + "loss": 1.4854, + "step": 9395 + }, + { + "epoch": 0.12209673463354574, + "grad_norm": 0.370067298412323, + "learning_rate": 0.00017561704727121487, + "loss": 1.348, + "step": 9396 + }, + { + "epoch": 0.12210972917746162, + "grad_norm": 0.3547513484954834, + "learning_rate": 0.00017561444780930346, + "loss": 1.3926, + "step": 9397 + }, + { + "epoch": 0.12212272372137749, + "grad_norm": 0.36864474415779114, + "learning_rate": 0.00017561184834739211, + "loss": 1.3426, + "step": 9398 + }, + { + "epoch": 0.12213571826529336, + "grad_norm": 0.39651089906692505, + "learning_rate": 0.0001756092488854807, + "loss": 1.4726, + "step": 9399 + }, + { + "epoch": 0.12214871280920923, + "grad_norm": 0.37016063928604126, + "learning_rate": 0.00017560664942356934, + "loss": 1.2783, + "step": 9400 + }, + { + "epoch": 0.1221617073531251, + "grad_norm": 0.4092501103878021, + "learning_rate": 0.00017560404996165793, + "loss": 1.3685, + "step": 9401 + }, + { + "epoch": 0.12217470189704098, + "grad_norm": 0.3386761546134949, + "learning_rate": 0.00017560145049974656, + "loss": 1.5491, + "step": 9402 + }, + { + "epoch": 0.12218769644095685, + "grad_norm": 0.5029440522193909, + "learning_rate": 0.00017559885103783518, + "loss": 1.5938, + "step": 9403 + }, + { + "epoch": 0.12220069098487273, + "grad_norm": 0.36059367656707764, + "learning_rate": 0.00017559625157592378, + "loss": 1.2096, + "step": 9404 + }, + { + "epoch": 0.1222136855287886, + "grad_norm": 0.49594607949256897, + "learning_rate": 0.0001755936521140124, + "loss": 1.4915, + "step": 9405 + }, + { + "epoch": 0.12222668007270447, + "grad_norm": 0.30236825346946716, + "learning_rate": 0.00017559105265210103, + "loss": 1.2743, + "step": 9406 + }, + { + "epoch": 0.12223967461662034, + "grad_norm": 0.37394291162490845, + "learning_rate": 0.00017558845319018963, + "loss": 1.2983, + "step": 9407 + }, + { + "epoch": 0.12225266916053622, + "grad_norm": 0.42718207836151123, + "learning_rate": 0.00017558585372827825, + "loss": 1.3058, + "step": 9408 + }, + { + "epoch": 0.12226566370445209, + "grad_norm": 0.4009205996990204, + "learning_rate": 0.00017558325426636688, + "loss": 1.4841, + "step": 9409 + }, + { + "epoch": 0.12227865824836796, + "grad_norm": 0.3838242292404175, + "learning_rate": 0.0001755806548044555, + "loss": 1.4362, + "step": 9410 + }, + { + "epoch": 0.12229165279228384, + "grad_norm": 0.4049939811229706, + "learning_rate": 0.0001755780553425441, + "loss": 1.4122, + "step": 9411 + }, + { + "epoch": 0.12230464733619971, + "grad_norm": 0.3356418013572693, + "learning_rate": 0.00017557545588063272, + "loss": 1.3598, + "step": 9412 + }, + { + "epoch": 0.12231764188011558, + "grad_norm": 0.5174401998519897, + "learning_rate": 0.00017557285641872135, + "loss": 1.5235, + "step": 9413 + }, + { + "epoch": 0.12233063642403146, + "grad_norm": 0.5014281868934631, + "learning_rate": 0.00017557025695680994, + "loss": 1.5247, + "step": 9414 + }, + { + "epoch": 0.12234363096794733, + "grad_norm": 0.3203625977039337, + "learning_rate": 0.00017556765749489857, + "loss": 1.2185, + "step": 9415 + }, + { + "epoch": 0.1223566255118632, + "grad_norm": 0.3388095200061798, + "learning_rate": 0.00017556505803298717, + "loss": 1.2516, + "step": 9416 + }, + { + "epoch": 0.12236962005577907, + "grad_norm": 0.3947952091693878, + "learning_rate": 0.00017556245857107582, + "loss": 1.5154, + "step": 9417 + }, + { + "epoch": 0.12238261459969495, + "grad_norm": 0.4261745512485504, + "learning_rate": 0.00017555985910916441, + "loss": 1.5087, + "step": 9418 + }, + { + "epoch": 0.12239560914361082, + "grad_norm": 0.4744755029678345, + "learning_rate": 0.000175557259647253, + "loss": 1.4432, + "step": 9419 + }, + { + "epoch": 0.1224086036875267, + "grad_norm": 0.4018121659755707, + "learning_rate": 0.00017555466018534164, + "loss": 1.466, + "step": 9420 + }, + { + "epoch": 0.12242159823144258, + "grad_norm": 0.39566123485565186, + "learning_rate": 0.00017555206072343026, + "loss": 1.2452, + "step": 9421 + }, + { + "epoch": 0.12243459277535845, + "grad_norm": 0.42603591084480286, + "learning_rate": 0.00017554946126151889, + "loss": 1.4223, + "step": 9422 + }, + { + "epoch": 0.12244758731927433, + "grad_norm": 0.3795925974845886, + "learning_rate": 0.00017554686179960748, + "loss": 1.4818, + "step": 9423 + }, + { + "epoch": 0.1224605818631902, + "grad_norm": 0.4446429908275604, + "learning_rate": 0.0001755442623376961, + "loss": 1.4246, + "step": 9424 + }, + { + "epoch": 0.12247357640710607, + "grad_norm": 0.336588978767395, + "learning_rate": 0.00017554166287578473, + "loss": 1.2808, + "step": 9425 + }, + { + "epoch": 0.12248657095102194, + "grad_norm": 0.39158517122268677, + "learning_rate": 0.00017553906341387333, + "loss": 1.3705, + "step": 9426 + }, + { + "epoch": 0.12249956549493782, + "grad_norm": 0.390872985124588, + "learning_rate": 0.00017553646395196195, + "loss": 1.2758, + "step": 9427 + }, + { + "epoch": 0.12251256003885369, + "grad_norm": 0.455569326877594, + "learning_rate": 0.00017553386449005055, + "loss": 1.5882, + "step": 9428 + }, + { + "epoch": 0.12252555458276956, + "grad_norm": 0.42535704374313354, + "learning_rate": 0.0001755312650281392, + "loss": 1.4453, + "step": 9429 + }, + { + "epoch": 0.12253854912668544, + "grad_norm": 0.3217112421989441, + "learning_rate": 0.0001755286655662278, + "loss": 1.4702, + "step": 9430 + }, + { + "epoch": 0.12255154367060131, + "grad_norm": 0.39075425267219543, + "learning_rate": 0.00017552606610431642, + "loss": 1.4497, + "step": 9431 + }, + { + "epoch": 0.12256453821451718, + "grad_norm": 0.3755894601345062, + "learning_rate": 0.00017552346664240502, + "loss": 1.4311, + "step": 9432 + }, + { + "epoch": 0.12257753275843306, + "grad_norm": 0.4219174385070801, + "learning_rate": 0.00017552086718049365, + "loss": 1.3618, + "step": 9433 + }, + { + "epoch": 0.12259052730234893, + "grad_norm": 0.47242096066474915, + "learning_rate": 0.00017551826771858227, + "loss": 1.3837, + "step": 9434 + }, + { + "epoch": 0.1226035218462648, + "grad_norm": 0.5205214619636536, + "learning_rate": 0.00017551566825667087, + "loss": 1.4809, + "step": 9435 + }, + { + "epoch": 0.12261651639018067, + "grad_norm": 0.4229324162006378, + "learning_rate": 0.0001755130687947595, + "loss": 1.4869, + "step": 9436 + }, + { + "epoch": 0.12262951093409655, + "grad_norm": 0.4389857053756714, + "learning_rate": 0.00017551046933284812, + "loss": 1.4979, + "step": 9437 + }, + { + "epoch": 0.12264250547801242, + "grad_norm": 0.4339556097984314, + "learning_rate": 0.00017550786987093671, + "loss": 1.5938, + "step": 9438 + }, + { + "epoch": 0.1226555000219283, + "grad_norm": 0.2765056788921356, + "learning_rate": 0.00017550527040902534, + "loss": 1.327, + "step": 9439 + }, + { + "epoch": 0.12266849456584417, + "grad_norm": 0.46839138865470886, + "learning_rate": 0.00017550267094711394, + "loss": 1.3228, + "step": 9440 + }, + { + "epoch": 0.12268148910976004, + "grad_norm": 0.4560263454914093, + "learning_rate": 0.0001755000714852026, + "loss": 1.255, + "step": 9441 + }, + { + "epoch": 0.12269448365367591, + "grad_norm": 0.3007943034172058, + "learning_rate": 0.00017549747202329119, + "loss": 1.4302, + "step": 9442 + }, + { + "epoch": 0.12270747819759178, + "grad_norm": 0.31359604001045227, + "learning_rate": 0.0001754948725613798, + "loss": 1.3717, + "step": 9443 + }, + { + "epoch": 0.12272047274150766, + "grad_norm": 0.3923957347869873, + "learning_rate": 0.00017549227309946843, + "loss": 1.5479, + "step": 9444 + }, + { + "epoch": 0.12273346728542353, + "grad_norm": 0.37875622510910034, + "learning_rate": 0.00017548967363755703, + "loss": 1.4428, + "step": 9445 + }, + { + "epoch": 0.1227464618293394, + "grad_norm": 0.2996625006198883, + "learning_rate": 0.00017548707417564566, + "loss": 1.3771, + "step": 9446 + }, + { + "epoch": 0.12275945637325528, + "grad_norm": 0.36629050970077515, + "learning_rate": 0.00017548447471373425, + "loss": 1.2443, + "step": 9447 + }, + { + "epoch": 0.12277245091717115, + "grad_norm": 0.39929649233818054, + "learning_rate": 0.0001754818752518229, + "loss": 1.3785, + "step": 9448 + }, + { + "epoch": 0.12278544546108702, + "grad_norm": 0.43769311904907227, + "learning_rate": 0.0001754792757899115, + "loss": 1.5107, + "step": 9449 + }, + { + "epoch": 0.1227984400050029, + "grad_norm": 0.5155845284461975, + "learning_rate": 0.0001754766763280001, + "loss": 1.5704, + "step": 9450 + }, + { + "epoch": 0.12281143454891877, + "grad_norm": 0.43582573533058167, + "learning_rate": 0.00017547407686608872, + "loss": 1.3126, + "step": 9451 + }, + { + "epoch": 0.12282442909283464, + "grad_norm": 0.3846193850040436, + "learning_rate": 0.00017547147740417735, + "loss": 1.4231, + "step": 9452 + }, + { + "epoch": 0.12283742363675051, + "grad_norm": 0.41088631749153137, + "learning_rate": 0.00017546887794226597, + "loss": 1.4574, + "step": 9453 + }, + { + "epoch": 0.12285041818066639, + "grad_norm": 0.45022234320640564, + "learning_rate": 0.00017546627848035457, + "loss": 1.343, + "step": 9454 + }, + { + "epoch": 0.12286341272458226, + "grad_norm": 0.34805867075920105, + "learning_rate": 0.0001754636790184432, + "loss": 1.4487, + "step": 9455 + }, + { + "epoch": 0.12287640726849813, + "grad_norm": 0.2945215404033661, + "learning_rate": 0.00017546107955653182, + "loss": 1.3538, + "step": 9456 + }, + { + "epoch": 0.122889401812414, + "grad_norm": 0.428950697183609, + "learning_rate": 0.00017545848009462042, + "loss": 1.5344, + "step": 9457 + }, + { + "epoch": 0.12290239635632988, + "grad_norm": 0.39568817615509033, + "learning_rate": 0.00017545588063270904, + "loss": 1.5052, + "step": 9458 + }, + { + "epoch": 0.12291539090024577, + "grad_norm": 0.49282294511795044, + "learning_rate": 0.00017545328117079764, + "loss": 1.3926, + "step": 9459 + }, + { + "epoch": 0.12292838544416164, + "grad_norm": 0.4138358533382416, + "learning_rate": 0.0001754506817088863, + "loss": 1.4615, + "step": 9460 + }, + { + "epoch": 0.12294137998807751, + "grad_norm": 0.4411356747150421, + "learning_rate": 0.0001754480822469749, + "loss": 1.6069, + "step": 9461 + }, + { + "epoch": 0.12295437453199339, + "grad_norm": 0.41140323877334595, + "learning_rate": 0.00017544548278506348, + "loss": 1.4461, + "step": 9462 + }, + { + "epoch": 0.12296736907590926, + "grad_norm": 0.41371530294418335, + "learning_rate": 0.0001754428833231521, + "loss": 1.4174, + "step": 9463 + }, + { + "epoch": 0.12298036361982513, + "grad_norm": 0.3792431950569153, + "learning_rate": 0.00017544028386124073, + "loss": 1.3999, + "step": 9464 + }, + { + "epoch": 0.122993358163741, + "grad_norm": 0.38682985305786133, + "learning_rate": 0.00017543768439932936, + "loss": 1.4607, + "step": 9465 + }, + { + "epoch": 0.12300635270765688, + "grad_norm": 0.37916719913482666, + "learning_rate": 0.00017543508493741796, + "loss": 1.3804, + "step": 9466 + }, + { + "epoch": 0.12301934725157275, + "grad_norm": 0.37214428186416626, + "learning_rate": 0.00017543248547550658, + "loss": 1.5521, + "step": 9467 + }, + { + "epoch": 0.12303234179548862, + "grad_norm": 0.44244682788848877, + "learning_rate": 0.0001754298860135952, + "loss": 1.3643, + "step": 9468 + }, + { + "epoch": 0.1230453363394045, + "grad_norm": 0.3487483263015747, + "learning_rate": 0.0001754272865516838, + "loss": 1.2952, + "step": 9469 + }, + { + "epoch": 0.12305833088332037, + "grad_norm": 0.4238225817680359, + "learning_rate": 0.00017542468708977243, + "loss": 1.3952, + "step": 9470 + }, + { + "epoch": 0.12307132542723624, + "grad_norm": 0.26047229766845703, + "learning_rate": 0.00017542208762786102, + "loss": 1.2802, + "step": 9471 + }, + { + "epoch": 0.12308431997115211, + "grad_norm": 0.34330442547798157, + "learning_rate": 0.00017541948816594968, + "loss": 1.3557, + "step": 9472 + }, + { + "epoch": 0.12309731451506799, + "grad_norm": 0.42483723163604736, + "learning_rate": 0.00017541688870403827, + "loss": 1.5422, + "step": 9473 + }, + { + "epoch": 0.12311030905898386, + "grad_norm": 0.3888275623321533, + "learning_rate": 0.00017541428924212687, + "loss": 1.394, + "step": 9474 + }, + { + "epoch": 0.12312330360289973, + "grad_norm": 0.42202091217041016, + "learning_rate": 0.0001754116897802155, + "loss": 1.4337, + "step": 9475 + }, + { + "epoch": 0.1231362981468156, + "grad_norm": 0.3551664650440216, + "learning_rate": 0.00017540909031830412, + "loss": 1.3497, + "step": 9476 + }, + { + "epoch": 0.12314929269073148, + "grad_norm": 0.38255226612091064, + "learning_rate": 0.00017540649085639274, + "loss": 1.3497, + "step": 9477 + }, + { + "epoch": 0.12316228723464735, + "grad_norm": 0.3706818222999573, + "learning_rate": 0.00017540389139448134, + "loss": 1.5531, + "step": 9478 + }, + { + "epoch": 0.12317528177856323, + "grad_norm": 0.45490768551826477, + "learning_rate": 0.00017540129193256997, + "loss": 1.6672, + "step": 9479 + }, + { + "epoch": 0.1231882763224791, + "grad_norm": 0.2934543490409851, + "learning_rate": 0.0001753986924706586, + "loss": 1.2269, + "step": 9480 + }, + { + "epoch": 0.12320127086639497, + "grad_norm": 0.36701881885528564, + "learning_rate": 0.0001753960930087472, + "loss": 1.414, + "step": 9481 + }, + { + "epoch": 0.12321426541031084, + "grad_norm": 0.37207210063934326, + "learning_rate": 0.0001753934935468358, + "loss": 1.3823, + "step": 9482 + }, + { + "epoch": 0.12322725995422672, + "grad_norm": 0.3229626715183258, + "learning_rate": 0.00017539089408492444, + "loss": 1.2548, + "step": 9483 + }, + { + "epoch": 0.12324025449814259, + "grad_norm": 0.3672535717487335, + "learning_rate": 0.00017538829462301306, + "loss": 1.3426, + "step": 9484 + }, + { + "epoch": 0.12325324904205846, + "grad_norm": 0.25910496711730957, + "learning_rate": 0.00017538569516110166, + "loss": 1.3688, + "step": 9485 + }, + { + "epoch": 0.12326624358597434, + "grad_norm": 0.3056235611438751, + "learning_rate": 0.00017538309569919028, + "loss": 1.22, + "step": 9486 + }, + { + "epoch": 0.12327923812989021, + "grad_norm": 0.482042521238327, + "learning_rate": 0.0001753804962372789, + "loss": 1.669, + "step": 9487 + }, + { + "epoch": 0.12329223267380608, + "grad_norm": 0.45445653796195984, + "learning_rate": 0.0001753778967753675, + "loss": 1.5081, + "step": 9488 + }, + { + "epoch": 0.12330522721772195, + "grad_norm": 0.3391783535480499, + "learning_rate": 0.00017537529731345613, + "loss": 1.2828, + "step": 9489 + }, + { + "epoch": 0.12331822176163783, + "grad_norm": 0.2934155762195587, + "learning_rate": 0.00017537269785154473, + "loss": 1.248, + "step": 9490 + }, + { + "epoch": 0.1233312163055537, + "grad_norm": 0.3356635868549347, + "learning_rate": 0.00017537009838963335, + "loss": 1.4247, + "step": 9491 + }, + { + "epoch": 0.12334421084946957, + "grad_norm": 0.4243333339691162, + "learning_rate": 0.00017536749892772198, + "loss": 1.517, + "step": 9492 + }, + { + "epoch": 0.12335720539338545, + "grad_norm": 0.3923400044441223, + "learning_rate": 0.00017536489946581057, + "loss": 1.5942, + "step": 9493 + }, + { + "epoch": 0.12337019993730132, + "grad_norm": 0.2597026526927948, + "learning_rate": 0.0001753623000038992, + "loss": 1.2606, + "step": 9494 + }, + { + "epoch": 0.12338319448121719, + "grad_norm": 0.4306289553642273, + "learning_rate": 0.00017535970054198782, + "loss": 1.3489, + "step": 9495 + }, + { + "epoch": 0.12339618902513307, + "grad_norm": 0.5493301749229431, + "learning_rate": 0.00017535710108007645, + "loss": 1.5796, + "step": 9496 + }, + { + "epoch": 0.12340918356904895, + "grad_norm": 0.2613542377948761, + "learning_rate": 0.00017535450161816504, + "loss": 1.0414, + "step": 9497 + }, + { + "epoch": 0.12342217811296483, + "grad_norm": 0.40695062279701233, + "learning_rate": 0.00017535190215625367, + "loss": 1.4048, + "step": 9498 + }, + { + "epoch": 0.1234351726568807, + "grad_norm": 0.3995246887207031, + "learning_rate": 0.0001753493026943423, + "loss": 1.4324, + "step": 9499 + }, + { + "epoch": 0.12344816720079657, + "grad_norm": 0.35809874534606934, + "learning_rate": 0.0001753467032324309, + "loss": 1.5628, + "step": 9500 + }, + { + "epoch": 0.12346116174471244, + "grad_norm": 0.4493583142757416, + "learning_rate": 0.00017534410377051951, + "loss": 1.5692, + "step": 9501 + }, + { + "epoch": 0.12347415628862832, + "grad_norm": 0.33677470684051514, + "learning_rate": 0.0001753415043086081, + "loss": 1.4222, + "step": 9502 + }, + { + "epoch": 0.12348715083254419, + "grad_norm": 0.41809993982315063, + "learning_rate": 0.00017533890484669674, + "loss": 1.5264, + "step": 9503 + }, + { + "epoch": 0.12350014537646006, + "grad_norm": 0.32507792115211487, + "learning_rate": 0.00017533630538478536, + "loss": 1.328, + "step": 9504 + }, + { + "epoch": 0.12351313992037594, + "grad_norm": 0.430988073348999, + "learning_rate": 0.00017533370592287396, + "loss": 1.5181, + "step": 9505 + }, + { + "epoch": 0.12352613446429181, + "grad_norm": 0.38258063793182373, + "learning_rate": 0.00017533110646096258, + "loss": 1.3622, + "step": 9506 + }, + { + "epoch": 0.12353912900820768, + "grad_norm": 0.2594582438468933, + "learning_rate": 0.0001753285069990512, + "loss": 1.2754, + "step": 9507 + }, + { + "epoch": 0.12355212355212356, + "grad_norm": 0.4643630385398865, + "learning_rate": 0.00017532590753713983, + "loss": 1.362, + "step": 9508 + }, + { + "epoch": 0.12356511809603943, + "grad_norm": 0.37695011496543884, + "learning_rate": 0.00017532330807522843, + "loss": 1.5886, + "step": 9509 + }, + { + "epoch": 0.1235781126399553, + "grad_norm": 0.388703852891922, + "learning_rate": 0.00017532070861331705, + "loss": 1.4588, + "step": 9510 + }, + { + "epoch": 0.12359110718387117, + "grad_norm": 0.4319486916065216, + "learning_rate": 0.00017531810915140568, + "loss": 1.5042, + "step": 9511 + }, + { + "epoch": 0.12360410172778705, + "grad_norm": 0.41553017497062683, + "learning_rate": 0.00017531550968949428, + "loss": 1.4554, + "step": 9512 + }, + { + "epoch": 0.12361709627170292, + "grad_norm": 0.36376845836639404, + "learning_rate": 0.0001753129102275829, + "loss": 1.6371, + "step": 9513 + }, + { + "epoch": 0.12363009081561879, + "grad_norm": 0.3340856432914734, + "learning_rate": 0.0001753103107656715, + "loss": 1.3669, + "step": 9514 + }, + { + "epoch": 0.12364308535953467, + "grad_norm": 0.3058205544948578, + "learning_rate": 0.00017530771130376015, + "loss": 1.5662, + "step": 9515 + }, + { + "epoch": 0.12365607990345054, + "grad_norm": 0.48043322563171387, + "learning_rate": 0.00017530511184184875, + "loss": 1.6312, + "step": 9516 + }, + { + "epoch": 0.12366907444736641, + "grad_norm": 0.4191220998764038, + "learning_rate": 0.00017530251237993734, + "loss": 1.6763, + "step": 9517 + }, + { + "epoch": 0.12368206899128228, + "grad_norm": 0.3762301802635193, + "learning_rate": 0.000175299912918026, + "loss": 1.2855, + "step": 9518 + }, + { + "epoch": 0.12369506353519816, + "grad_norm": 0.424789696931839, + "learning_rate": 0.0001752973134561146, + "loss": 1.4668, + "step": 9519 + }, + { + "epoch": 0.12370805807911403, + "grad_norm": 0.36871305108070374, + "learning_rate": 0.00017529471399420322, + "loss": 1.3455, + "step": 9520 + }, + { + "epoch": 0.1237210526230299, + "grad_norm": 0.46252450346946716, + "learning_rate": 0.00017529211453229181, + "loss": 1.6386, + "step": 9521 + }, + { + "epoch": 0.12373404716694578, + "grad_norm": 0.3832029104232788, + "learning_rate": 0.00017528951507038044, + "loss": 1.2696, + "step": 9522 + }, + { + "epoch": 0.12374704171086165, + "grad_norm": 0.37716394662857056, + "learning_rate": 0.00017528691560846906, + "loss": 1.389, + "step": 9523 + }, + { + "epoch": 0.12376003625477752, + "grad_norm": 0.3911168575286865, + "learning_rate": 0.00017528431614655766, + "loss": 1.4422, + "step": 9524 + }, + { + "epoch": 0.1237730307986934, + "grad_norm": 0.4176744818687439, + "learning_rate": 0.00017528171668464629, + "loss": 1.5345, + "step": 9525 + }, + { + "epoch": 0.12378602534260927, + "grad_norm": 0.41857510805130005, + "learning_rate": 0.0001752791172227349, + "loss": 1.361, + "step": 9526 + }, + { + "epoch": 0.12379901988652514, + "grad_norm": 0.3802216947078705, + "learning_rate": 0.00017527651776082353, + "loss": 1.5399, + "step": 9527 + }, + { + "epoch": 0.12381201443044101, + "grad_norm": 0.38757428526878357, + "learning_rate": 0.00017527391829891213, + "loss": 1.2467, + "step": 9528 + }, + { + "epoch": 0.12382500897435689, + "grad_norm": 0.49100178480148315, + "learning_rate": 0.00017527131883700073, + "loss": 1.4281, + "step": 9529 + }, + { + "epoch": 0.12383800351827276, + "grad_norm": 0.39563506841659546, + "learning_rate": 0.00017526871937508938, + "loss": 1.4232, + "step": 9530 + }, + { + "epoch": 0.12385099806218863, + "grad_norm": 0.3539801239967346, + "learning_rate": 0.00017526611991317798, + "loss": 1.6213, + "step": 9531 + }, + { + "epoch": 0.1238639926061045, + "grad_norm": 0.33889925479888916, + "learning_rate": 0.0001752635204512666, + "loss": 1.4746, + "step": 9532 + }, + { + "epoch": 0.12387698715002038, + "grad_norm": 0.32110705971717834, + "learning_rate": 0.0001752609209893552, + "loss": 1.2862, + "step": 9533 + }, + { + "epoch": 0.12388998169393625, + "grad_norm": 0.2726416289806366, + "learning_rate": 0.00017525832152744382, + "loss": 1.3376, + "step": 9534 + }, + { + "epoch": 0.12390297623785214, + "grad_norm": 0.40119504928588867, + "learning_rate": 0.00017525572206553245, + "loss": 1.3445, + "step": 9535 + }, + { + "epoch": 0.12391597078176801, + "grad_norm": 0.4073296785354614, + "learning_rate": 0.00017525312260362105, + "loss": 1.4134, + "step": 9536 + }, + { + "epoch": 0.12392896532568388, + "grad_norm": 0.4655570387840271, + "learning_rate": 0.00017525052314170967, + "loss": 1.3833, + "step": 9537 + }, + { + "epoch": 0.12394195986959976, + "grad_norm": 0.44354668259620667, + "learning_rate": 0.0001752479236797983, + "loss": 1.3378, + "step": 9538 + }, + { + "epoch": 0.12395495441351563, + "grad_norm": 0.3735615313053131, + "learning_rate": 0.00017524532421788692, + "loss": 1.1322, + "step": 9539 + }, + { + "epoch": 0.1239679489574315, + "grad_norm": 0.49971988797187805, + "learning_rate": 0.00017524272475597552, + "loss": 1.2174, + "step": 9540 + }, + { + "epoch": 0.12398094350134738, + "grad_norm": 0.3243158161640167, + "learning_rate": 0.00017524012529406414, + "loss": 1.577, + "step": 9541 + }, + { + "epoch": 0.12399393804526325, + "grad_norm": 0.2929551601409912, + "learning_rate": 0.00017523752583215277, + "loss": 1.3187, + "step": 9542 + }, + { + "epoch": 0.12400693258917912, + "grad_norm": 0.37671124935150146, + "learning_rate": 0.00017523492637024136, + "loss": 1.4367, + "step": 9543 + }, + { + "epoch": 0.124019927133095, + "grad_norm": 0.44659480452537537, + "learning_rate": 0.00017523232690833, + "loss": 1.4277, + "step": 9544 + }, + { + "epoch": 0.12403292167701087, + "grad_norm": 0.36298638582229614, + "learning_rate": 0.00017522972744641859, + "loss": 1.6034, + "step": 9545 + }, + { + "epoch": 0.12404591622092674, + "grad_norm": 0.3981684744358063, + "learning_rate": 0.0001752271279845072, + "loss": 1.6356, + "step": 9546 + }, + { + "epoch": 0.12405891076484261, + "grad_norm": 0.39422181248664856, + "learning_rate": 0.00017522452852259583, + "loss": 1.4045, + "step": 9547 + }, + { + "epoch": 0.12407190530875849, + "grad_norm": 0.43765023350715637, + "learning_rate": 0.00017522192906068443, + "loss": 1.4791, + "step": 9548 + }, + { + "epoch": 0.12408489985267436, + "grad_norm": 0.2757258117198944, + "learning_rate": 0.00017521932959877306, + "loss": 1.4103, + "step": 9549 + }, + { + "epoch": 0.12409789439659023, + "grad_norm": 0.34877264499664307, + "learning_rate": 0.00017521673013686168, + "loss": 1.261, + "step": 9550 + }, + { + "epoch": 0.1241108889405061, + "grad_norm": 0.4051900804042816, + "learning_rate": 0.0001752141306749503, + "loss": 1.4377, + "step": 9551 + }, + { + "epoch": 0.12412388348442198, + "grad_norm": 0.4281129539012909, + "learning_rate": 0.0001752115312130389, + "loss": 1.4058, + "step": 9552 + }, + { + "epoch": 0.12413687802833785, + "grad_norm": 0.37326550483703613, + "learning_rate": 0.00017520893175112753, + "loss": 1.4333, + "step": 9553 + }, + { + "epoch": 0.12414987257225372, + "grad_norm": 0.44275426864624023, + "learning_rate": 0.00017520633228921615, + "loss": 1.4008, + "step": 9554 + }, + { + "epoch": 0.1241628671161696, + "grad_norm": 0.30423763394355774, + "learning_rate": 0.00017520373282730475, + "loss": 1.287, + "step": 9555 + }, + { + "epoch": 0.12417586166008547, + "grad_norm": 0.38611316680908203, + "learning_rate": 0.00017520113336539337, + "loss": 1.2952, + "step": 9556 + }, + { + "epoch": 0.12418885620400134, + "grad_norm": 0.47034040093421936, + "learning_rate": 0.000175198533903482, + "loss": 1.5033, + "step": 9557 + }, + { + "epoch": 0.12420185074791722, + "grad_norm": 0.4749831259250641, + "learning_rate": 0.0001751959344415706, + "loss": 1.3976, + "step": 9558 + }, + { + "epoch": 0.12421484529183309, + "grad_norm": 0.41762426495552063, + "learning_rate": 0.00017519333497965922, + "loss": 1.5608, + "step": 9559 + }, + { + "epoch": 0.12422783983574896, + "grad_norm": 0.41292503476142883, + "learning_rate": 0.00017519073551774782, + "loss": 1.6435, + "step": 9560 + }, + { + "epoch": 0.12424083437966484, + "grad_norm": 0.3542183041572571, + "learning_rate": 0.00017518813605583647, + "loss": 1.3538, + "step": 9561 + }, + { + "epoch": 0.12425382892358071, + "grad_norm": 0.44358035922050476, + "learning_rate": 0.00017518553659392507, + "loss": 1.483, + "step": 9562 + }, + { + "epoch": 0.12426682346749658, + "grad_norm": 0.3438158333301544, + "learning_rate": 0.0001751829371320137, + "loss": 1.147, + "step": 9563 + }, + { + "epoch": 0.12427981801141245, + "grad_norm": 0.4154832065105438, + "learning_rate": 0.0001751803376701023, + "loss": 1.6448, + "step": 9564 + }, + { + "epoch": 0.12429281255532833, + "grad_norm": 0.37668576836586, + "learning_rate": 0.0001751777382081909, + "loss": 1.405, + "step": 9565 + }, + { + "epoch": 0.1243058070992442, + "grad_norm": 0.4438360035419464, + "learning_rate": 0.00017517513874627954, + "loss": 1.3371, + "step": 9566 + }, + { + "epoch": 0.12431880164316007, + "grad_norm": 0.3932449221611023, + "learning_rate": 0.00017517253928436813, + "loss": 1.5031, + "step": 9567 + }, + { + "epoch": 0.12433179618707595, + "grad_norm": 0.42104554176330566, + "learning_rate": 0.00017516993982245676, + "loss": 1.4451, + "step": 9568 + }, + { + "epoch": 0.12434479073099182, + "grad_norm": 0.36716482043266296, + "learning_rate": 0.00017516734036054538, + "loss": 1.4091, + "step": 9569 + }, + { + "epoch": 0.12435778527490769, + "grad_norm": 0.3912370204925537, + "learning_rate": 0.000175164740898634, + "loss": 1.4318, + "step": 9570 + }, + { + "epoch": 0.12437077981882357, + "grad_norm": 0.37993699312210083, + "learning_rate": 0.0001751621414367226, + "loss": 1.6242, + "step": 9571 + }, + { + "epoch": 0.12438377436273944, + "grad_norm": 0.4448337256908417, + "learning_rate": 0.0001751595419748112, + "loss": 1.5356, + "step": 9572 + }, + { + "epoch": 0.12439676890665531, + "grad_norm": 0.49726277589797974, + "learning_rate": 0.00017515694251289985, + "loss": 1.5825, + "step": 9573 + }, + { + "epoch": 0.1244097634505712, + "grad_norm": 0.39247578382492065, + "learning_rate": 0.00017515434305098845, + "loss": 1.5825, + "step": 9574 + }, + { + "epoch": 0.12442275799448707, + "grad_norm": 0.4429816007614136, + "learning_rate": 0.00017515174358907708, + "loss": 1.579, + "step": 9575 + }, + { + "epoch": 0.12443575253840294, + "grad_norm": 0.365694135427475, + "learning_rate": 0.00017514914412716567, + "loss": 1.4903, + "step": 9576 + }, + { + "epoch": 0.12444874708231882, + "grad_norm": 0.34383833408355713, + "learning_rate": 0.0001751465446652543, + "loss": 1.417, + "step": 9577 + }, + { + "epoch": 0.12446174162623469, + "grad_norm": 0.3965412676334381, + "learning_rate": 0.00017514394520334292, + "loss": 1.3773, + "step": 9578 + }, + { + "epoch": 0.12447473617015056, + "grad_norm": 0.29918986558914185, + "learning_rate": 0.00017514134574143152, + "loss": 1.2956, + "step": 9579 + }, + { + "epoch": 0.12448773071406644, + "grad_norm": 0.4160357415676117, + "learning_rate": 0.00017513874627952014, + "loss": 1.5154, + "step": 9580 + }, + { + "epoch": 0.12450072525798231, + "grad_norm": 0.42975103855133057, + "learning_rate": 0.00017513614681760877, + "loss": 1.4572, + "step": 9581 + }, + { + "epoch": 0.12451371980189818, + "grad_norm": 0.4243880808353424, + "learning_rate": 0.0001751335473556974, + "loss": 1.6084, + "step": 9582 + }, + { + "epoch": 0.12452671434581405, + "grad_norm": 0.44164714217185974, + "learning_rate": 0.000175130947893786, + "loss": 1.4222, + "step": 9583 + }, + { + "epoch": 0.12453970888972993, + "grad_norm": 0.32200759649276733, + "learning_rate": 0.0001751283484318746, + "loss": 1.3142, + "step": 9584 + }, + { + "epoch": 0.1245527034336458, + "grad_norm": 0.39200395345687866, + "learning_rate": 0.00017512574896996324, + "loss": 1.4469, + "step": 9585 + }, + { + "epoch": 0.12456569797756167, + "grad_norm": 0.4106068015098572, + "learning_rate": 0.00017512314950805184, + "loss": 1.6844, + "step": 9586 + }, + { + "epoch": 0.12457869252147755, + "grad_norm": 0.33713802695274353, + "learning_rate": 0.00017512055004614046, + "loss": 1.4453, + "step": 9587 + }, + { + "epoch": 0.12459168706539342, + "grad_norm": 0.4372261166572571, + "learning_rate": 0.00017511795058422906, + "loss": 1.4759, + "step": 9588 + }, + { + "epoch": 0.12460468160930929, + "grad_norm": 0.4063788652420044, + "learning_rate": 0.00017511535112231768, + "loss": 1.6063, + "step": 9589 + }, + { + "epoch": 0.12461767615322517, + "grad_norm": 0.3728642165660858, + "learning_rate": 0.0001751127516604063, + "loss": 1.4006, + "step": 9590 + }, + { + "epoch": 0.12463067069714104, + "grad_norm": 0.3007648289203644, + "learning_rate": 0.0001751101521984949, + "loss": 1.281, + "step": 9591 + }, + { + "epoch": 0.12464366524105691, + "grad_norm": 0.3721601068973541, + "learning_rate": 0.00017510755273658356, + "loss": 1.5699, + "step": 9592 + }, + { + "epoch": 0.12465665978497278, + "grad_norm": 0.4571717381477356, + "learning_rate": 0.00017510495327467215, + "loss": 1.4366, + "step": 9593 + }, + { + "epoch": 0.12466965432888866, + "grad_norm": 0.4183889329433441, + "learning_rate": 0.00017510235381276078, + "loss": 1.4985, + "step": 9594 + }, + { + "epoch": 0.12468264887280453, + "grad_norm": 0.4092417061328888, + "learning_rate": 0.00017509975435084938, + "loss": 1.3064, + "step": 9595 + }, + { + "epoch": 0.1246956434167204, + "grad_norm": 0.3628922402858734, + "learning_rate": 0.000175097154888938, + "loss": 1.4016, + "step": 9596 + }, + { + "epoch": 0.12470863796063628, + "grad_norm": 0.37899014353752136, + "learning_rate": 0.00017509455542702662, + "loss": 1.2399, + "step": 9597 + }, + { + "epoch": 0.12472163250455215, + "grad_norm": 0.3253507912158966, + "learning_rate": 0.00017509195596511522, + "loss": 1.3236, + "step": 9598 + }, + { + "epoch": 0.12473462704846802, + "grad_norm": 0.43177762627601624, + "learning_rate": 0.00017508935650320385, + "loss": 1.3943, + "step": 9599 + }, + { + "epoch": 0.1247476215923839, + "grad_norm": 0.42290180921554565, + "learning_rate": 0.00017508675704129247, + "loss": 1.5283, + "step": 9600 + }, + { + "epoch": 0.12476061613629977, + "grad_norm": 0.4956355094909668, + "learning_rate": 0.00017508415757938107, + "loss": 1.4605, + "step": 9601 + }, + { + "epoch": 0.12477361068021564, + "grad_norm": 0.40923964977264404, + "learning_rate": 0.0001750815581174697, + "loss": 1.2837, + "step": 9602 + }, + { + "epoch": 0.12478660522413151, + "grad_norm": 0.41952595114707947, + "learning_rate": 0.0001750789586555583, + "loss": 1.428, + "step": 9603 + }, + { + "epoch": 0.12479959976804739, + "grad_norm": 0.40707823634147644, + "learning_rate": 0.00017507635919364694, + "loss": 1.3958, + "step": 9604 + }, + { + "epoch": 0.12481259431196326, + "grad_norm": 0.21762244403362274, + "learning_rate": 0.00017507375973173554, + "loss": 1.3969, + "step": 9605 + }, + { + "epoch": 0.12482558885587913, + "grad_norm": 0.3387683033943176, + "learning_rate": 0.00017507116026982416, + "loss": 1.2537, + "step": 9606 + }, + { + "epoch": 0.124838583399795, + "grad_norm": 0.3670079708099365, + "learning_rate": 0.00017506856080791276, + "loss": 1.376, + "step": 9607 + }, + { + "epoch": 0.12485157794371088, + "grad_norm": 0.39507466554641724, + "learning_rate": 0.00017506596134600139, + "loss": 1.3001, + "step": 9608 + }, + { + "epoch": 0.12486457248762675, + "grad_norm": 0.2887982130050659, + "learning_rate": 0.00017506336188409, + "loss": 1.2367, + "step": 9609 + }, + { + "epoch": 0.12487756703154262, + "grad_norm": 0.3801294267177582, + "learning_rate": 0.0001750607624221786, + "loss": 1.4169, + "step": 9610 + }, + { + "epoch": 0.1248905615754585, + "grad_norm": 0.33027201890945435, + "learning_rate": 0.00017505816296026723, + "loss": 1.534, + "step": 9611 + }, + { + "epoch": 0.12490355611937438, + "grad_norm": 0.4124357998371124, + "learning_rate": 0.00017505556349835586, + "loss": 1.477, + "step": 9612 + }, + { + "epoch": 0.12491655066329026, + "grad_norm": 0.3271085023880005, + "learning_rate": 0.00017505296403644445, + "loss": 1.2346, + "step": 9613 + }, + { + "epoch": 0.12492954520720613, + "grad_norm": 0.36430051922798157, + "learning_rate": 0.00017505036457453308, + "loss": 1.3238, + "step": 9614 + }, + { + "epoch": 0.124942539751122, + "grad_norm": 0.4155019223690033, + "learning_rate": 0.00017504776511262168, + "loss": 1.3999, + "step": 9615 + }, + { + "epoch": 0.12495553429503788, + "grad_norm": 0.31151819229125977, + "learning_rate": 0.00017504516565071033, + "loss": 1.3762, + "step": 9616 + }, + { + "epoch": 0.12496852883895375, + "grad_norm": 0.4495059847831726, + "learning_rate": 0.00017504256618879892, + "loss": 1.4062, + "step": 9617 + }, + { + "epoch": 0.12498152338286962, + "grad_norm": 0.4299633800983429, + "learning_rate": 0.00017503996672688755, + "loss": 1.434, + "step": 9618 + }, + { + "epoch": 0.1249945179267855, + "grad_norm": 0.42572519183158875, + "learning_rate": 0.00017503736726497615, + "loss": 1.2537, + "step": 9619 + }, + { + "epoch": 0.12500751247070135, + "grad_norm": 0.43866828083992004, + "learning_rate": 0.00017503476780306477, + "loss": 1.321, + "step": 9620 + }, + { + "epoch": 0.12502050701461723, + "grad_norm": 0.3476933538913727, + "learning_rate": 0.0001750321683411534, + "loss": 1.3085, + "step": 9621 + }, + { + "epoch": 0.1250335015585331, + "grad_norm": 0.33552300930023193, + "learning_rate": 0.000175029568879242, + "loss": 1.3884, + "step": 9622 + }, + { + "epoch": 0.12504649610244897, + "grad_norm": 0.40971824526786804, + "learning_rate": 0.00017502696941733062, + "loss": 1.299, + "step": 9623 + }, + { + "epoch": 0.12505949064636485, + "grad_norm": 0.42949429154396057, + "learning_rate": 0.00017502436995541924, + "loss": 1.5686, + "step": 9624 + }, + { + "epoch": 0.12507248519028072, + "grad_norm": 0.37826862931251526, + "learning_rate": 0.00017502177049350784, + "loss": 1.4853, + "step": 9625 + }, + { + "epoch": 0.1250854797341966, + "grad_norm": 0.44199806451797485, + "learning_rate": 0.00017501917103159646, + "loss": 1.4439, + "step": 9626 + }, + { + "epoch": 0.12509847427811246, + "grad_norm": 0.4022911787033081, + "learning_rate": 0.00017501657156968506, + "loss": 1.574, + "step": 9627 + }, + { + "epoch": 0.12511146882202834, + "grad_norm": 0.3098083436489105, + "learning_rate": 0.0001750139721077737, + "loss": 1.4625, + "step": 9628 + }, + { + "epoch": 0.1251244633659442, + "grad_norm": 0.4486669898033142, + "learning_rate": 0.0001750113726458623, + "loss": 1.481, + "step": 9629 + }, + { + "epoch": 0.12513745790986008, + "grad_norm": 0.35100775957107544, + "learning_rate": 0.00017500877318395093, + "loss": 1.3312, + "step": 9630 + }, + { + "epoch": 0.12515045245377598, + "grad_norm": 0.34200555086135864, + "learning_rate": 0.00017500617372203956, + "loss": 1.4985, + "step": 9631 + }, + { + "epoch": 0.12516344699769186, + "grad_norm": 0.3374740481376648, + "learning_rate": 0.00017500357426012816, + "loss": 1.3524, + "step": 9632 + }, + { + "epoch": 0.12517644154160773, + "grad_norm": 0.3939988613128662, + "learning_rate": 0.00017500097479821678, + "loss": 1.3465, + "step": 9633 + }, + { + "epoch": 0.1251894360855236, + "grad_norm": 0.345689594745636, + "learning_rate": 0.00017499837533630538, + "loss": 1.3531, + "step": 9634 + }, + { + "epoch": 0.12520243062943948, + "grad_norm": 0.4319462478160858, + "learning_rate": 0.00017499577587439403, + "loss": 1.5274, + "step": 9635 + }, + { + "epoch": 0.12521542517335535, + "grad_norm": 0.4508669078350067, + "learning_rate": 0.00017499317641248263, + "loss": 1.4374, + "step": 9636 + }, + { + "epoch": 0.12522841971727122, + "grad_norm": 0.37562060356140137, + "learning_rate": 0.00017499057695057125, + "loss": 1.5023, + "step": 9637 + }, + { + "epoch": 0.1252414142611871, + "grad_norm": 0.3058767020702362, + "learning_rate": 0.00017498797748865985, + "loss": 1.2453, + "step": 9638 + }, + { + "epoch": 0.12525440880510297, + "grad_norm": 0.38932371139526367, + "learning_rate": 0.00017498537802674847, + "loss": 1.303, + "step": 9639 + }, + { + "epoch": 0.12526740334901884, + "grad_norm": 0.39255431294441223, + "learning_rate": 0.0001749827785648371, + "loss": 1.3559, + "step": 9640 + }, + { + "epoch": 0.12528039789293471, + "grad_norm": 0.3753657042980194, + "learning_rate": 0.0001749801791029257, + "loss": 1.5122, + "step": 9641 + }, + { + "epoch": 0.1252933924368506, + "grad_norm": 0.31538498401641846, + "learning_rate": 0.00017497757964101432, + "loss": 1.2619, + "step": 9642 + }, + { + "epoch": 0.12530638698076646, + "grad_norm": 0.3877624571323395, + "learning_rate": 0.00017497498017910294, + "loss": 1.5394, + "step": 9643 + }, + { + "epoch": 0.12531938152468233, + "grad_norm": 0.24258852005004883, + "learning_rate": 0.00017497238071719154, + "loss": 1.3273, + "step": 9644 + }, + { + "epoch": 0.1253323760685982, + "grad_norm": 0.3393879234790802, + "learning_rate": 0.00017496978125528017, + "loss": 1.3401, + "step": 9645 + }, + { + "epoch": 0.12534537061251408, + "grad_norm": 0.44138243794441223, + "learning_rate": 0.00017496718179336876, + "loss": 1.3758, + "step": 9646 + }, + { + "epoch": 0.12535836515642995, + "grad_norm": 0.3094342052936554, + "learning_rate": 0.00017496458233145742, + "loss": 1.3767, + "step": 9647 + }, + { + "epoch": 0.12537135970034582, + "grad_norm": 0.3506278991699219, + "learning_rate": 0.000174961982869546, + "loss": 1.5701, + "step": 9648 + }, + { + "epoch": 0.1253843542442617, + "grad_norm": 0.4021117687225342, + "learning_rate": 0.00017495938340763464, + "loss": 1.3221, + "step": 9649 + }, + { + "epoch": 0.12539734878817757, + "grad_norm": 0.37280309200286865, + "learning_rate": 0.00017495678394572323, + "loss": 1.2719, + "step": 9650 + }, + { + "epoch": 0.12541034333209344, + "grad_norm": 0.3920639157295227, + "learning_rate": 0.00017495418448381186, + "loss": 1.2147, + "step": 9651 + }, + { + "epoch": 0.12542333787600932, + "grad_norm": 0.39731329679489136, + "learning_rate": 0.00017495158502190048, + "loss": 1.6081, + "step": 9652 + }, + { + "epoch": 0.1254363324199252, + "grad_norm": 0.46653154492378235, + "learning_rate": 0.00017494898555998908, + "loss": 1.4074, + "step": 9653 + }, + { + "epoch": 0.12544932696384106, + "grad_norm": 0.3996526598930359, + "learning_rate": 0.0001749463860980777, + "loss": 1.4357, + "step": 9654 + }, + { + "epoch": 0.12546232150775694, + "grad_norm": 0.413546085357666, + "learning_rate": 0.00017494378663616633, + "loss": 1.5657, + "step": 9655 + }, + { + "epoch": 0.1254753160516728, + "grad_norm": 0.4259064495563507, + "learning_rate": 0.00017494118717425493, + "loss": 1.3984, + "step": 9656 + }, + { + "epoch": 0.12548831059558868, + "grad_norm": 0.3858471214771271, + "learning_rate": 0.00017493858771234355, + "loss": 1.3308, + "step": 9657 + }, + { + "epoch": 0.12550130513950455, + "grad_norm": 0.46405744552612305, + "learning_rate": 0.00017493598825043215, + "loss": 1.5752, + "step": 9658 + }, + { + "epoch": 0.12551429968342043, + "grad_norm": 0.47689926624298096, + "learning_rate": 0.0001749333887885208, + "loss": 1.5239, + "step": 9659 + }, + { + "epoch": 0.1255272942273363, + "grad_norm": 0.42176753282546997, + "learning_rate": 0.0001749307893266094, + "loss": 1.6362, + "step": 9660 + }, + { + "epoch": 0.12554028877125217, + "grad_norm": 0.4492424428462982, + "learning_rate": 0.00017492818986469802, + "loss": 1.3585, + "step": 9661 + }, + { + "epoch": 0.12555328331516805, + "grad_norm": 0.3477698266506195, + "learning_rate": 0.00017492559040278662, + "loss": 1.2526, + "step": 9662 + }, + { + "epoch": 0.12556627785908392, + "grad_norm": 0.39830154180526733, + "learning_rate": 0.00017492299094087524, + "loss": 1.4401, + "step": 9663 + }, + { + "epoch": 0.1255792724029998, + "grad_norm": 0.39245128631591797, + "learning_rate": 0.00017492039147896387, + "loss": 1.6153, + "step": 9664 + }, + { + "epoch": 0.12559226694691566, + "grad_norm": 0.39771386981010437, + "learning_rate": 0.00017491779201705247, + "loss": 1.4228, + "step": 9665 + }, + { + "epoch": 0.12560526149083154, + "grad_norm": 0.24116118252277374, + "learning_rate": 0.00017491519255514112, + "loss": 1.3096, + "step": 9666 + }, + { + "epoch": 0.1256182560347474, + "grad_norm": 0.44289201498031616, + "learning_rate": 0.00017491259309322972, + "loss": 1.4459, + "step": 9667 + }, + { + "epoch": 0.12563125057866328, + "grad_norm": 0.43484336137771606, + "learning_rate": 0.0001749099936313183, + "loss": 1.446, + "step": 9668 + }, + { + "epoch": 0.12564424512257916, + "grad_norm": 0.2405206859111786, + "learning_rate": 0.00017490739416940694, + "loss": 1.3298, + "step": 9669 + }, + { + "epoch": 0.12565723966649503, + "grad_norm": 0.35324826836586, + "learning_rate": 0.00017490479470749556, + "loss": 1.5308, + "step": 9670 + }, + { + "epoch": 0.1256702342104109, + "grad_norm": 0.31959229707717896, + "learning_rate": 0.00017490219524558419, + "loss": 1.3147, + "step": 9671 + }, + { + "epoch": 0.12568322875432678, + "grad_norm": 0.4756709933280945, + "learning_rate": 0.00017489959578367278, + "loss": 1.4761, + "step": 9672 + }, + { + "epoch": 0.12569622329824265, + "grad_norm": 0.3300042152404785, + "learning_rate": 0.0001748969963217614, + "loss": 1.4613, + "step": 9673 + }, + { + "epoch": 0.12570921784215852, + "grad_norm": 0.260833203792572, + "learning_rate": 0.00017489439685985003, + "loss": 1.2715, + "step": 9674 + }, + { + "epoch": 0.1257222123860744, + "grad_norm": 0.4841623306274414, + "learning_rate": 0.00017489179739793863, + "loss": 1.3387, + "step": 9675 + }, + { + "epoch": 0.12573520692999027, + "grad_norm": 0.353515088558197, + "learning_rate": 0.00017488919793602725, + "loss": 1.6013, + "step": 9676 + }, + { + "epoch": 0.12574820147390614, + "grad_norm": 0.45149222016334534, + "learning_rate": 0.00017488659847411585, + "loss": 1.5653, + "step": 9677 + }, + { + "epoch": 0.125761196017822, + "grad_norm": 0.4398069381713867, + "learning_rate": 0.0001748839990122045, + "loss": 1.5515, + "step": 9678 + }, + { + "epoch": 0.1257741905617379, + "grad_norm": 0.37375950813293457, + "learning_rate": 0.0001748813995502931, + "loss": 1.3408, + "step": 9679 + }, + { + "epoch": 0.12578718510565376, + "grad_norm": 0.276103675365448, + "learning_rate": 0.0001748788000883817, + "loss": 1.3096, + "step": 9680 + }, + { + "epoch": 0.12580017964956963, + "grad_norm": 0.442453533411026, + "learning_rate": 0.00017487620062647032, + "loss": 1.5137, + "step": 9681 + }, + { + "epoch": 0.1258131741934855, + "grad_norm": 0.48259907960891724, + "learning_rate": 0.00017487360116455895, + "loss": 1.5208, + "step": 9682 + }, + { + "epoch": 0.12582616873740138, + "grad_norm": 0.37423524260520935, + "learning_rate": 0.00017487100170264757, + "loss": 1.4551, + "step": 9683 + }, + { + "epoch": 0.12583916328131725, + "grad_norm": 0.34281760454177856, + "learning_rate": 0.00017486840224073617, + "loss": 1.5296, + "step": 9684 + }, + { + "epoch": 0.12585215782523312, + "grad_norm": 0.3728395402431488, + "learning_rate": 0.0001748658027788248, + "loss": 1.3543, + "step": 9685 + }, + { + "epoch": 0.125865152369149, + "grad_norm": 0.3292859196662903, + "learning_rate": 0.00017486320331691342, + "loss": 1.2164, + "step": 9686 + }, + { + "epoch": 0.12587814691306487, + "grad_norm": 0.4014461934566498, + "learning_rate": 0.00017486060385500202, + "loss": 1.4251, + "step": 9687 + }, + { + "epoch": 0.12589114145698074, + "grad_norm": 0.4269237518310547, + "learning_rate": 0.00017485800439309064, + "loss": 1.4323, + "step": 9688 + }, + { + "epoch": 0.12590413600089662, + "grad_norm": 0.3459317088127136, + "learning_rate": 0.00017485540493117924, + "loss": 1.3725, + "step": 9689 + }, + { + "epoch": 0.1259171305448125, + "grad_norm": 0.33514484763145447, + "learning_rate": 0.0001748528054692679, + "loss": 1.4692, + "step": 9690 + }, + { + "epoch": 0.12593012508872836, + "grad_norm": 0.3969482481479645, + "learning_rate": 0.00017485020600735649, + "loss": 1.4396, + "step": 9691 + }, + { + "epoch": 0.12594311963264423, + "grad_norm": 0.31107670068740845, + "learning_rate": 0.0001748476065454451, + "loss": 1.2722, + "step": 9692 + }, + { + "epoch": 0.1259561141765601, + "grad_norm": 0.4391982853412628, + "learning_rate": 0.0001748450070835337, + "loss": 1.443, + "step": 9693 + }, + { + "epoch": 0.12596910872047598, + "grad_norm": 0.3729909360408783, + "learning_rate": 0.00017484240762162233, + "loss": 1.424, + "step": 9694 + }, + { + "epoch": 0.12598210326439185, + "grad_norm": 0.5445814728736877, + "learning_rate": 0.00017483980815971096, + "loss": 1.4115, + "step": 9695 + }, + { + "epoch": 0.12599509780830773, + "grad_norm": 0.45242586731910706, + "learning_rate": 0.00017483720869779955, + "loss": 1.3058, + "step": 9696 + }, + { + "epoch": 0.1260080923522236, + "grad_norm": 0.38592544198036194, + "learning_rate": 0.00017483460923588818, + "loss": 1.4362, + "step": 9697 + }, + { + "epoch": 0.12602108689613947, + "grad_norm": 0.5082756280899048, + "learning_rate": 0.0001748320097739768, + "loss": 1.3649, + "step": 9698 + }, + { + "epoch": 0.12603408144005535, + "grad_norm": 0.2916025221347809, + "learning_rate": 0.0001748294103120654, + "loss": 1.3652, + "step": 9699 + }, + { + "epoch": 0.12604707598397122, + "grad_norm": 0.319277286529541, + "learning_rate": 0.00017482681085015403, + "loss": 1.47, + "step": 9700 + }, + { + "epoch": 0.1260600705278871, + "grad_norm": 0.3986632227897644, + "learning_rate": 0.00017482421138824262, + "loss": 1.4361, + "step": 9701 + }, + { + "epoch": 0.12607306507180296, + "grad_norm": 0.41115862131118774, + "learning_rate": 0.00017482161192633127, + "loss": 1.4418, + "step": 9702 + }, + { + "epoch": 0.12608605961571884, + "grad_norm": 0.39329981803894043, + "learning_rate": 0.00017481901246441987, + "loss": 1.554, + "step": 9703 + }, + { + "epoch": 0.1260990541596347, + "grad_norm": 0.3860429525375366, + "learning_rate": 0.0001748164130025085, + "loss": 1.4948, + "step": 9704 + }, + { + "epoch": 0.12611204870355058, + "grad_norm": 0.3884877860546112, + "learning_rate": 0.00017481381354059712, + "loss": 1.4031, + "step": 9705 + }, + { + "epoch": 0.12612504324746646, + "grad_norm": 0.3482835590839386, + "learning_rate": 0.00017481121407868572, + "loss": 1.444, + "step": 9706 + }, + { + "epoch": 0.12613803779138236, + "grad_norm": 0.5124087929725647, + "learning_rate": 0.00017480861461677434, + "loss": 1.5332, + "step": 9707 + }, + { + "epoch": 0.12615103233529823, + "grad_norm": 0.5126358270645142, + "learning_rate": 0.00017480601515486294, + "loss": 1.385, + "step": 9708 + }, + { + "epoch": 0.1261640268792141, + "grad_norm": 0.4038727879524231, + "learning_rate": 0.00017480341569295156, + "loss": 1.4983, + "step": 9709 + }, + { + "epoch": 0.12617702142312998, + "grad_norm": 2.4577696323394775, + "learning_rate": 0.0001748008162310402, + "loss": 1.3601, + "step": 9710 + }, + { + "epoch": 0.12619001596704585, + "grad_norm": 0.3879411816596985, + "learning_rate": 0.00017479821676912879, + "loss": 1.3718, + "step": 9711 + }, + { + "epoch": 0.12620301051096172, + "grad_norm": 0.40619683265686035, + "learning_rate": 0.0001747956173072174, + "loss": 1.5004, + "step": 9712 + }, + { + "epoch": 0.1262160050548776, + "grad_norm": 0.43570661544799805, + "learning_rate": 0.00017479301784530603, + "loss": 1.4814, + "step": 9713 + }, + { + "epoch": 0.12622899959879347, + "grad_norm": 0.3898126184940338, + "learning_rate": 0.00017479041838339466, + "loss": 1.3759, + "step": 9714 + }, + { + "epoch": 0.12624199414270934, + "grad_norm": 0.4663819968700409, + "learning_rate": 0.00017478781892148326, + "loss": 1.5347, + "step": 9715 + }, + { + "epoch": 0.1262549886866252, + "grad_norm": 0.3044021725654602, + "learning_rate": 0.00017478521945957188, + "loss": 1.1682, + "step": 9716 + }, + { + "epoch": 0.1262679832305411, + "grad_norm": 0.43841981887817383, + "learning_rate": 0.0001747826199976605, + "loss": 1.2314, + "step": 9717 + }, + { + "epoch": 0.12628097777445696, + "grad_norm": 0.4041503071784973, + "learning_rate": 0.0001747800205357491, + "loss": 1.5978, + "step": 9718 + }, + { + "epoch": 0.12629397231837283, + "grad_norm": 0.38805675506591797, + "learning_rate": 0.00017477742107383773, + "loss": 1.4442, + "step": 9719 + }, + { + "epoch": 0.1263069668622887, + "grad_norm": 0.37587684392929077, + "learning_rate": 0.00017477482161192633, + "loss": 1.4096, + "step": 9720 + }, + { + "epoch": 0.12631996140620458, + "grad_norm": 0.39248234033584595, + "learning_rate": 0.00017477222215001498, + "loss": 1.4625, + "step": 9721 + }, + { + "epoch": 0.12633295595012045, + "grad_norm": 0.4851655066013336, + "learning_rate": 0.00017476962268810357, + "loss": 1.5584, + "step": 9722 + }, + { + "epoch": 0.12634595049403632, + "grad_norm": 0.30397647619247437, + "learning_rate": 0.00017476702322619217, + "loss": 1.3512, + "step": 9723 + }, + { + "epoch": 0.1263589450379522, + "grad_norm": 0.3748270571231842, + "learning_rate": 0.0001747644237642808, + "loss": 1.4888, + "step": 9724 + }, + { + "epoch": 0.12637193958186807, + "grad_norm": 0.4174167215824127, + "learning_rate": 0.00017476182430236942, + "loss": 1.4032, + "step": 9725 + }, + { + "epoch": 0.12638493412578394, + "grad_norm": 0.3229725658893585, + "learning_rate": 0.00017475922484045804, + "loss": 1.4386, + "step": 9726 + }, + { + "epoch": 0.12639792866969982, + "grad_norm": 0.4116579592227936, + "learning_rate": 0.00017475662537854664, + "loss": 1.3328, + "step": 9727 + }, + { + "epoch": 0.1264109232136157, + "grad_norm": 0.46099111437797546, + "learning_rate": 0.00017475402591663527, + "loss": 1.482, + "step": 9728 + }, + { + "epoch": 0.12642391775753156, + "grad_norm": 0.35294049978256226, + "learning_rate": 0.0001747514264547239, + "loss": 1.396, + "step": 9729 + }, + { + "epoch": 0.12643691230144743, + "grad_norm": 0.37404778599739075, + "learning_rate": 0.0001747488269928125, + "loss": 1.4334, + "step": 9730 + }, + { + "epoch": 0.1264499068453633, + "grad_norm": 0.34945693612098694, + "learning_rate": 0.0001747462275309011, + "loss": 1.5552, + "step": 9731 + }, + { + "epoch": 0.12646290138927918, + "grad_norm": 0.3755609691143036, + "learning_rate": 0.0001747436280689897, + "loss": 1.3793, + "step": 9732 + }, + { + "epoch": 0.12647589593319505, + "grad_norm": 0.4764070212841034, + "learning_rate": 0.00017474102860707836, + "loss": 1.4632, + "step": 9733 + }, + { + "epoch": 0.12648889047711093, + "grad_norm": 0.3914155960083008, + "learning_rate": 0.00017473842914516696, + "loss": 1.4597, + "step": 9734 + }, + { + "epoch": 0.1265018850210268, + "grad_norm": 0.39013493061065674, + "learning_rate": 0.00017473582968325556, + "loss": 1.3596, + "step": 9735 + }, + { + "epoch": 0.12651487956494267, + "grad_norm": 0.30755171179771423, + "learning_rate": 0.00017473323022134418, + "loss": 1.3081, + "step": 9736 + }, + { + "epoch": 0.12652787410885855, + "grad_norm": 0.39651721715927124, + "learning_rate": 0.0001747306307594328, + "loss": 1.3449, + "step": 9737 + }, + { + "epoch": 0.12654086865277442, + "grad_norm": 0.36479347944259644, + "learning_rate": 0.00017472803129752143, + "loss": 1.3956, + "step": 9738 + }, + { + "epoch": 0.1265538631966903, + "grad_norm": 0.3559912145137787, + "learning_rate": 0.00017472543183561003, + "loss": 1.1267, + "step": 9739 + }, + { + "epoch": 0.12656685774060616, + "grad_norm": 0.44560301303863525, + "learning_rate": 0.00017472283237369865, + "loss": 1.4641, + "step": 9740 + }, + { + "epoch": 0.12657985228452204, + "grad_norm": 0.39466798305511475, + "learning_rate": 0.00017472023291178728, + "loss": 1.5841, + "step": 9741 + }, + { + "epoch": 0.1265928468284379, + "grad_norm": 0.4178992807865143, + "learning_rate": 0.00017471763344987587, + "loss": 1.2845, + "step": 9742 + }, + { + "epoch": 0.12660584137235378, + "grad_norm": 0.3627108037471771, + "learning_rate": 0.0001747150339879645, + "loss": 1.4178, + "step": 9743 + }, + { + "epoch": 0.12661883591626966, + "grad_norm": 0.3000160753726959, + "learning_rate": 0.00017471243452605312, + "loss": 1.2315, + "step": 9744 + }, + { + "epoch": 0.12663183046018553, + "grad_norm": 0.38646963238716125, + "learning_rate": 0.00017470983506414175, + "loss": 1.5533, + "step": 9745 + }, + { + "epoch": 0.1266448250041014, + "grad_norm": 0.44327566027641296, + "learning_rate": 0.00017470723560223034, + "loss": 1.477, + "step": 9746 + }, + { + "epoch": 0.12665781954801728, + "grad_norm": 0.46295878291130066, + "learning_rate": 0.00017470463614031897, + "loss": 1.4691, + "step": 9747 + }, + { + "epoch": 0.12667081409193315, + "grad_norm": 0.35043972730636597, + "learning_rate": 0.0001747020366784076, + "loss": 1.2843, + "step": 9748 + }, + { + "epoch": 0.12668380863584902, + "grad_norm": 0.4215966761112213, + "learning_rate": 0.0001746994372164962, + "loss": 1.5864, + "step": 9749 + }, + { + "epoch": 0.1266968031797649, + "grad_norm": 0.3857872188091278, + "learning_rate": 0.00017469683775458482, + "loss": 1.2177, + "step": 9750 + }, + { + "epoch": 0.12670979772368077, + "grad_norm": 0.45712223649024963, + "learning_rate": 0.0001746942382926734, + "loss": 1.4889, + "step": 9751 + }, + { + "epoch": 0.12672279226759664, + "grad_norm": 0.397826224565506, + "learning_rate": 0.00017469163883076204, + "loss": 1.3779, + "step": 9752 + }, + { + "epoch": 0.1267357868115125, + "grad_norm": 0.45743003487586975, + "learning_rate": 0.00017468903936885066, + "loss": 1.6859, + "step": 9753 + }, + { + "epoch": 0.12674878135542839, + "grad_norm": 0.31271249055862427, + "learning_rate": 0.00017468643990693926, + "loss": 1.1679, + "step": 9754 + }, + { + "epoch": 0.12676177589934426, + "grad_norm": 0.3818182051181793, + "learning_rate": 0.00017468384044502788, + "loss": 1.2417, + "step": 9755 + }, + { + "epoch": 0.12677477044326013, + "grad_norm": 0.3628392219543457, + "learning_rate": 0.0001746812409831165, + "loss": 1.7035, + "step": 9756 + }, + { + "epoch": 0.126787764987176, + "grad_norm": 0.3837031424045563, + "learning_rate": 0.00017467864152120513, + "loss": 1.4246, + "step": 9757 + }, + { + "epoch": 0.12680075953109188, + "grad_norm": 0.36988452076911926, + "learning_rate": 0.00017467604205929373, + "loss": 1.5692, + "step": 9758 + }, + { + "epoch": 0.12681375407500775, + "grad_norm": 0.4251534640789032, + "learning_rate": 0.00017467344259738235, + "loss": 1.3156, + "step": 9759 + }, + { + "epoch": 0.12682674861892362, + "grad_norm": 0.3459431231021881, + "learning_rate": 0.00017467084313547098, + "loss": 1.3139, + "step": 9760 + }, + { + "epoch": 0.1268397431628395, + "grad_norm": 0.4471019506454468, + "learning_rate": 0.00017466824367355958, + "loss": 1.3948, + "step": 9761 + }, + { + "epoch": 0.12685273770675537, + "grad_norm": 0.28526002168655396, + "learning_rate": 0.0001746656442116482, + "loss": 1.3949, + "step": 9762 + }, + { + "epoch": 0.12686573225067124, + "grad_norm": 0.40695279836654663, + "learning_rate": 0.0001746630447497368, + "loss": 1.3802, + "step": 9763 + }, + { + "epoch": 0.12687872679458712, + "grad_norm": 0.36840522289276123, + "learning_rate": 0.00017466044528782542, + "loss": 1.6343, + "step": 9764 + }, + { + "epoch": 0.126891721338503, + "grad_norm": 0.48099154233932495, + "learning_rate": 0.00017465784582591405, + "loss": 1.3398, + "step": 9765 + }, + { + "epoch": 0.12690471588241886, + "grad_norm": 0.3887917995452881, + "learning_rate": 0.00017465524636400264, + "loss": 1.3394, + "step": 9766 + }, + { + "epoch": 0.12691771042633473, + "grad_norm": 0.3510952889919281, + "learning_rate": 0.00017465264690209127, + "loss": 1.3629, + "step": 9767 + }, + { + "epoch": 0.1269307049702506, + "grad_norm": 0.46056118607521057, + "learning_rate": 0.0001746500474401799, + "loss": 1.5036, + "step": 9768 + }, + { + "epoch": 0.12694369951416648, + "grad_norm": 0.4226197600364685, + "learning_rate": 0.00017464744797826852, + "loss": 1.4027, + "step": 9769 + }, + { + "epoch": 0.12695669405808235, + "grad_norm": 0.4660278856754303, + "learning_rate": 0.00017464484851635712, + "loss": 1.4996, + "step": 9770 + }, + { + "epoch": 0.12696968860199823, + "grad_norm": 0.39086925983428955, + "learning_rate": 0.00017464224905444574, + "loss": 1.2653, + "step": 9771 + }, + { + "epoch": 0.1269826831459141, + "grad_norm": 0.39842262864112854, + "learning_rate": 0.00017463964959253436, + "loss": 1.3166, + "step": 9772 + }, + { + "epoch": 0.12699567768982997, + "grad_norm": 0.4808734357357025, + "learning_rate": 0.00017463705013062296, + "loss": 1.4062, + "step": 9773 + }, + { + "epoch": 0.12700867223374585, + "grad_norm": 0.36386552453041077, + "learning_rate": 0.00017463445066871159, + "loss": 1.2046, + "step": 9774 + }, + { + "epoch": 0.12702166677766172, + "grad_norm": 0.42672857642173767, + "learning_rate": 0.00017463185120680018, + "loss": 1.7419, + "step": 9775 + }, + { + "epoch": 0.1270346613215776, + "grad_norm": 0.4021878242492676, + "learning_rate": 0.00017462925174488884, + "loss": 1.2812, + "step": 9776 + }, + { + "epoch": 0.12704765586549346, + "grad_norm": 0.33598777651786804, + "learning_rate": 0.00017462665228297743, + "loss": 1.0491, + "step": 9777 + }, + { + "epoch": 0.12706065040940934, + "grad_norm": 0.4053421914577484, + "learning_rate": 0.00017462405282106603, + "loss": 1.419, + "step": 9778 + }, + { + "epoch": 0.1270736449533252, + "grad_norm": 0.4082357585430145, + "learning_rate": 0.00017462145335915468, + "loss": 1.7965, + "step": 9779 + }, + { + "epoch": 0.12708663949724108, + "grad_norm": 0.286214143037796, + "learning_rate": 0.00017461885389724328, + "loss": 1.4666, + "step": 9780 + }, + { + "epoch": 0.12709963404115696, + "grad_norm": 0.5199337601661682, + "learning_rate": 0.0001746162544353319, + "loss": 1.472, + "step": 9781 + }, + { + "epoch": 0.12711262858507283, + "grad_norm": 0.41897672414779663, + "learning_rate": 0.0001746136549734205, + "loss": 1.332, + "step": 9782 + }, + { + "epoch": 0.12712562312898873, + "grad_norm": 0.38286423683166504, + "learning_rate": 0.00017461105551150913, + "loss": 1.4898, + "step": 9783 + }, + { + "epoch": 0.1271386176729046, + "grad_norm": 0.3432929813861847, + "learning_rate": 0.00017460845604959775, + "loss": 1.4053, + "step": 9784 + }, + { + "epoch": 0.12715161221682048, + "grad_norm": 0.34561246633529663, + "learning_rate": 0.00017460585658768635, + "loss": 1.5973, + "step": 9785 + }, + { + "epoch": 0.12716460676073635, + "grad_norm": 0.4448879659175873, + "learning_rate": 0.00017460325712577497, + "loss": 1.5423, + "step": 9786 + }, + { + "epoch": 0.12717760130465222, + "grad_norm": 0.43982186913490295, + "learning_rate": 0.0001746006576638636, + "loss": 1.3203, + "step": 9787 + }, + { + "epoch": 0.1271905958485681, + "grad_norm": 0.3380548357963562, + "learning_rate": 0.00017459805820195222, + "loss": 1.3999, + "step": 9788 + }, + { + "epoch": 0.12720359039248397, + "grad_norm": 0.410491943359375, + "learning_rate": 0.00017459545874004082, + "loss": 1.373, + "step": 9789 + }, + { + "epoch": 0.12721658493639984, + "grad_norm": 0.412641316652298, + "learning_rate": 0.00017459285927812942, + "loss": 1.4604, + "step": 9790 + }, + { + "epoch": 0.1272295794803157, + "grad_norm": 0.34360596537590027, + "learning_rate": 0.00017459025981621807, + "loss": 1.3508, + "step": 9791 + }, + { + "epoch": 0.1272425740242316, + "grad_norm": 0.3051782548427582, + "learning_rate": 0.00017458766035430666, + "loss": 1.5529, + "step": 9792 + }, + { + "epoch": 0.12725556856814746, + "grad_norm": 0.3569876551628113, + "learning_rate": 0.0001745850608923953, + "loss": 1.5069, + "step": 9793 + }, + { + "epoch": 0.12726856311206333, + "grad_norm": 0.4056491255760193, + "learning_rate": 0.00017458246143048389, + "loss": 1.2813, + "step": 9794 + }, + { + "epoch": 0.1272815576559792, + "grad_norm": 0.45286914706230164, + "learning_rate": 0.0001745798619685725, + "loss": 1.5803, + "step": 9795 + }, + { + "epoch": 0.12729455219989508, + "grad_norm": 0.3876975178718567, + "learning_rate": 0.00017457726250666114, + "loss": 1.1811, + "step": 9796 + }, + { + "epoch": 0.12730754674381095, + "grad_norm": 0.3707972466945648, + "learning_rate": 0.00017457466304474973, + "loss": 1.4004, + "step": 9797 + }, + { + "epoch": 0.12732054128772682, + "grad_norm": 0.3774029016494751, + "learning_rate": 0.00017457206358283836, + "loss": 1.3448, + "step": 9798 + }, + { + "epoch": 0.1273335358316427, + "grad_norm": 0.4498341381549835, + "learning_rate": 0.00017456946412092698, + "loss": 1.4729, + "step": 9799 + }, + { + "epoch": 0.12734653037555857, + "grad_norm": 0.6029853224754333, + "learning_rate": 0.0001745668646590156, + "loss": 1.4736, + "step": 9800 + }, + { + "epoch": 0.12735952491947444, + "grad_norm": 0.3127281665802002, + "learning_rate": 0.0001745642651971042, + "loss": 1.2452, + "step": 9801 + }, + { + "epoch": 0.12737251946339032, + "grad_norm": 0.39024844765663147, + "learning_rate": 0.0001745616657351928, + "loss": 1.584, + "step": 9802 + }, + { + "epoch": 0.1273855140073062, + "grad_norm": 0.3598044812679291, + "learning_rate": 0.00017455906627328145, + "loss": 1.5844, + "step": 9803 + }, + { + "epoch": 0.12739850855122206, + "grad_norm": 0.4696773290634155, + "learning_rate": 0.00017455646681137005, + "loss": 1.288, + "step": 9804 + }, + { + "epoch": 0.12741150309513793, + "grad_norm": 0.4604615271091461, + "learning_rate": 0.00017455386734945867, + "loss": 1.5048, + "step": 9805 + }, + { + "epoch": 0.1274244976390538, + "grad_norm": 0.3901355266571045, + "learning_rate": 0.00017455126788754727, + "loss": 1.2994, + "step": 9806 + }, + { + "epoch": 0.12743749218296968, + "grad_norm": 0.39078086614608765, + "learning_rate": 0.0001745486684256359, + "loss": 1.4265, + "step": 9807 + }, + { + "epoch": 0.12745048672688555, + "grad_norm": 0.48265746235847473, + "learning_rate": 0.00017454606896372452, + "loss": 1.7173, + "step": 9808 + }, + { + "epoch": 0.12746348127080143, + "grad_norm": 0.42275094985961914, + "learning_rate": 0.00017454346950181312, + "loss": 1.3499, + "step": 9809 + }, + { + "epoch": 0.1274764758147173, + "grad_norm": 0.354317843914032, + "learning_rate": 0.00017454087003990174, + "loss": 1.3284, + "step": 9810 + }, + { + "epoch": 0.12748947035863317, + "grad_norm": 0.4367891848087311, + "learning_rate": 0.00017453827057799037, + "loss": 1.4068, + "step": 9811 + }, + { + "epoch": 0.12750246490254905, + "grad_norm": 0.3880905210971832, + "learning_rate": 0.000174535671116079, + "loss": 1.4614, + "step": 9812 + }, + { + "epoch": 0.12751545944646492, + "grad_norm": 0.3903173506259918, + "learning_rate": 0.0001745330716541676, + "loss": 1.5189, + "step": 9813 + }, + { + "epoch": 0.1275284539903808, + "grad_norm": 0.3960549831390381, + "learning_rate": 0.0001745304721922562, + "loss": 1.6009, + "step": 9814 + }, + { + "epoch": 0.12754144853429666, + "grad_norm": 0.5237694382667542, + "learning_rate": 0.00017452787273034484, + "loss": 1.5611, + "step": 9815 + }, + { + "epoch": 0.12755444307821254, + "grad_norm": 0.45572733879089355, + "learning_rate": 0.00017452527326843344, + "loss": 1.3569, + "step": 9816 + }, + { + "epoch": 0.1275674376221284, + "grad_norm": 0.25524795055389404, + "learning_rate": 0.00017452267380652206, + "loss": 1.4838, + "step": 9817 + }, + { + "epoch": 0.12758043216604428, + "grad_norm": 0.4669489562511444, + "learning_rate": 0.00017452007434461068, + "loss": 1.4431, + "step": 9818 + }, + { + "epoch": 0.12759342670996016, + "grad_norm": 0.4880148470401764, + "learning_rate": 0.00017451747488269928, + "loss": 1.6159, + "step": 9819 + }, + { + "epoch": 0.12760642125387603, + "grad_norm": 0.3587469756603241, + "learning_rate": 0.0001745148754207879, + "loss": 1.3839, + "step": 9820 + }, + { + "epoch": 0.1276194157977919, + "grad_norm": 0.42655476927757263, + "learning_rate": 0.0001745122759588765, + "loss": 1.4199, + "step": 9821 + }, + { + "epoch": 0.12763241034170777, + "grad_norm": 0.33567482233047485, + "learning_rate": 0.00017450967649696516, + "loss": 1.3931, + "step": 9822 + }, + { + "epoch": 0.12764540488562365, + "grad_norm": 0.3328567147254944, + "learning_rate": 0.00017450707703505375, + "loss": 1.3867, + "step": 9823 + }, + { + "epoch": 0.12765839942953952, + "grad_norm": 0.33665406703948975, + "learning_rate": 0.00017450447757314238, + "loss": 1.2934, + "step": 9824 + }, + { + "epoch": 0.1276713939734554, + "grad_norm": 0.4406411945819855, + "learning_rate": 0.00017450187811123097, + "loss": 1.3838, + "step": 9825 + }, + { + "epoch": 0.12768438851737127, + "grad_norm": 0.38822394609451294, + "learning_rate": 0.0001744992786493196, + "loss": 1.5633, + "step": 9826 + }, + { + "epoch": 0.12769738306128714, + "grad_norm": 0.4578480124473572, + "learning_rate": 0.00017449667918740822, + "loss": 1.4932, + "step": 9827 + }, + { + "epoch": 0.127710377605203, + "grad_norm": 0.4058312773704529, + "learning_rate": 0.00017449407972549682, + "loss": 1.4168, + "step": 9828 + }, + { + "epoch": 0.12772337214911889, + "grad_norm": 0.4173702597618103, + "learning_rate": 0.00017449148026358545, + "loss": 1.3421, + "step": 9829 + }, + { + "epoch": 0.12773636669303476, + "grad_norm": 0.44695279002189636, + "learning_rate": 0.00017448888080167407, + "loss": 1.5197, + "step": 9830 + }, + { + "epoch": 0.12774936123695063, + "grad_norm": 0.36493515968322754, + "learning_rate": 0.00017448628133976267, + "loss": 1.4921, + "step": 9831 + }, + { + "epoch": 0.1277623557808665, + "grad_norm": 0.356090784072876, + "learning_rate": 0.0001744836818778513, + "loss": 1.5286, + "step": 9832 + }, + { + "epoch": 0.12777535032478238, + "grad_norm": 0.34752827882766724, + "learning_rate": 0.0001744810824159399, + "loss": 1.2365, + "step": 9833 + }, + { + "epoch": 0.12778834486869825, + "grad_norm": 0.33421093225479126, + "learning_rate": 0.00017447848295402854, + "loss": 1.417, + "step": 9834 + }, + { + "epoch": 0.12780133941261412, + "grad_norm": 0.4228314757347107, + "learning_rate": 0.00017447588349211714, + "loss": 1.6549, + "step": 9835 + }, + { + "epoch": 0.12781433395653, + "grad_norm": 0.4263027012348175, + "learning_rate": 0.00017447328403020576, + "loss": 1.3583, + "step": 9836 + }, + { + "epoch": 0.12782732850044587, + "grad_norm": 0.45514747500419617, + "learning_rate": 0.00017447068456829436, + "loss": 1.6082, + "step": 9837 + }, + { + "epoch": 0.12784032304436174, + "grad_norm": 0.3952755928039551, + "learning_rate": 0.00017446808510638298, + "loss": 1.5577, + "step": 9838 + }, + { + "epoch": 0.12785331758827762, + "grad_norm": 0.40582403540611267, + "learning_rate": 0.0001744654856444716, + "loss": 1.4233, + "step": 9839 + }, + { + "epoch": 0.1278663121321935, + "grad_norm": 0.44542303681373596, + "learning_rate": 0.0001744628861825602, + "loss": 1.493, + "step": 9840 + }, + { + "epoch": 0.12787930667610936, + "grad_norm": 0.3275494873523712, + "learning_rate": 0.00017446028672064883, + "loss": 1.2484, + "step": 9841 + }, + { + "epoch": 0.12789230122002523, + "grad_norm": 0.39033234119415283, + "learning_rate": 0.00017445768725873746, + "loss": 1.4901, + "step": 9842 + }, + { + "epoch": 0.1279052957639411, + "grad_norm": 0.37724947929382324, + "learning_rate": 0.00017445508779682608, + "loss": 1.4831, + "step": 9843 + }, + { + "epoch": 0.12791829030785698, + "grad_norm": 0.42083632946014404, + "learning_rate": 0.00017445248833491468, + "loss": 1.5955, + "step": 9844 + }, + { + "epoch": 0.12793128485177285, + "grad_norm": 0.3097129762172699, + "learning_rate": 0.00017444988887300327, + "loss": 1.3957, + "step": 9845 + }, + { + "epoch": 0.12794427939568873, + "grad_norm": 0.27840664982795715, + "learning_rate": 0.00017444728941109193, + "loss": 1.2559, + "step": 9846 + }, + { + "epoch": 0.1279572739396046, + "grad_norm": 0.3947417140007019, + "learning_rate": 0.00017444468994918052, + "loss": 1.5292, + "step": 9847 + }, + { + "epoch": 0.12797026848352047, + "grad_norm": 0.37195923924446106, + "learning_rate": 0.00017444209048726915, + "loss": 1.3016, + "step": 9848 + }, + { + "epoch": 0.12798326302743634, + "grad_norm": 0.41136813163757324, + "learning_rate": 0.00017443949102535775, + "loss": 1.4146, + "step": 9849 + }, + { + "epoch": 0.12799625757135222, + "grad_norm": 0.45513203740119934, + "learning_rate": 0.00017443689156344637, + "loss": 1.4977, + "step": 9850 + }, + { + "epoch": 0.1280092521152681, + "grad_norm": 0.46818891167640686, + "learning_rate": 0.000174434292101535, + "loss": 1.6209, + "step": 9851 + }, + { + "epoch": 0.12802224665918396, + "grad_norm": 0.46940889954566956, + "learning_rate": 0.0001744316926396236, + "loss": 1.2997, + "step": 9852 + }, + { + "epoch": 0.12803524120309984, + "grad_norm": 0.4131140410900116, + "learning_rate": 0.00017442909317771224, + "loss": 1.5038, + "step": 9853 + }, + { + "epoch": 0.1280482357470157, + "grad_norm": 0.3882255554199219, + "learning_rate": 0.00017442649371580084, + "loss": 1.3948, + "step": 9854 + }, + { + "epoch": 0.12806123029093158, + "grad_norm": 0.3601725101470947, + "learning_rate": 0.00017442389425388946, + "loss": 1.3917, + "step": 9855 + }, + { + "epoch": 0.12807422483484746, + "grad_norm": 0.375658243894577, + "learning_rate": 0.00017442129479197806, + "loss": 1.2736, + "step": 9856 + }, + { + "epoch": 0.12808721937876333, + "grad_norm": 1.2153809070587158, + "learning_rate": 0.0001744186953300667, + "loss": 1.2592, + "step": 9857 + }, + { + "epoch": 0.1281002139226792, + "grad_norm": 0.424893319606781, + "learning_rate": 0.0001744160958681553, + "loss": 1.4105, + "step": 9858 + }, + { + "epoch": 0.1281132084665951, + "grad_norm": 0.3003944456577301, + "learning_rate": 0.0001744134964062439, + "loss": 1.6327, + "step": 9859 + }, + { + "epoch": 0.12812620301051098, + "grad_norm": 0.4156136214733124, + "learning_rate": 0.00017441089694433253, + "loss": 1.4591, + "step": 9860 + }, + { + "epoch": 0.12813919755442685, + "grad_norm": 0.4319014549255371, + "learning_rate": 0.00017440829748242116, + "loss": 1.3651, + "step": 9861 + }, + { + "epoch": 0.12815219209834272, + "grad_norm": 0.4158675968647003, + "learning_rate": 0.00017440569802050975, + "loss": 1.4095, + "step": 9862 + }, + { + "epoch": 0.1281651866422586, + "grad_norm": 0.42379266023635864, + "learning_rate": 0.00017440309855859838, + "loss": 1.5282, + "step": 9863 + }, + { + "epoch": 0.12817818118617447, + "grad_norm": 0.38992419838905334, + "learning_rate": 0.00017440049909668698, + "loss": 1.492, + "step": 9864 + }, + { + "epoch": 0.12819117573009034, + "grad_norm": 0.43003717064857483, + "learning_rate": 0.00017439789963477563, + "loss": 1.461, + "step": 9865 + }, + { + "epoch": 0.1282041702740062, + "grad_norm": 0.39112550020217896, + "learning_rate": 0.00017439530017286423, + "loss": 1.4819, + "step": 9866 + }, + { + "epoch": 0.12821716481792209, + "grad_norm": 0.46558400988578796, + "learning_rate": 0.00017439270071095285, + "loss": 1.5735, + "step": 9867 + }, + { + "epoch": 0.12823015936183796, + "grad_norm": 0.3939507305622101, + "learning_rate": 0.00017439010124904145, + "loss": 1.457, + "step": 9868 + }, + { + "epoch": 0.12824315390575383, + "grad_norm": 0.38000988960266113, + "learning_rate": 0.00017438750178713007, + "loss": 1.5126, + "step": 9869 + }, + { + "epoch": 0.1282561484496697, + "grad_norm": 0.346591055393219, + "learning_rate": 0.0001743849023252187, + "loss": 1.3331, + "step": 9870 + }, + { + "epoch": 0.12826914299358558, + "grad_norm": 0.3416518568992615, + "learning_rate": 0.0001743823028633073, + "loss": 1.2106, + "step": 9871 + }, + { + "epoch": 0.12828213753750145, + "grad_norm": 0.45469239354133606, + "learning_rate": 0.00017437970340139592, + "loss": 1.4145, + "step": 9872 + }, + { + "epoch": 0.12829513208141732, + "grad_norm": 0.32440483570098877, + "learning_rate": 0.00017437710393948454, + "loss": 1.1556, + "step": 9873 + }, + { + "epoch": 0.1283081266253332, + "grad_norm": 0.40842241048812866, + "learning_rate": 0.00017437450447757314, + "loss": 1.5499, + "step": 9874 + }, + { + "epoch": 0.12832112116924907, + "grad_norm": 0.49925798177719116, + "learning_rate": 0.00017437190501566176, + "loss": 1.5201, + "step": 9875 + }, + { + "epoch": 0.12833411571316494, + "grad_norm": 0.41571882367134094, + "learning_rate": 0.00017436930555375036, + "loss": 1.4696, + "step": 9876 + }, + { + "epoch": 0.12834711025708082, + "grad_norm": 0.410710871219635, + "learning_rate": 0.00017436670609183901, + "loss": 1.3628, + "step": 9877 + }, + { + "epoch": 0.1283601048009967, + "grad_norm": 0.37453174591064453, + "learning_rate": 0.0001743641066299276, + "loss": 1.3728, + "step": 9878 + }, + { + "epoch": 0.12837309934491256, + "grad_norm": 0.4252074062824249, + "learning_rate": 0.00017436150716801624, + "loss": 1.6187, + "step": 9879 + }, + { + "epoch": 0.12838609388882843, + "grad_norm": 0.5142189264297485, + "learning_rate": 0.00017435890770610483, + "loss": 1.6014, + "step": 9880 + }, + { + "epoch": 0.1283990884327443, + "grad_norm": 0.3541359305381775, + "learning_rate": 0.00017435630824419346, + "loss": 1.6009, + "step": 9881 + }, + { + "epoch": 0.12841208297666018, + "grad_norm": 0.3904385566711426, + "learning_rate": 0.00017435370878228208, + "loss": 1.5398, + "step": 9882 + }, + { + "epoch": 0.12842507752057605, + "grad_norm": 0.4350864589214325, + "learning_rate": 0.00017435110932037068, + "loss": 1.4435, + "step": 9883 + }, + { + "epoch": 0.12843807206449193, + "grad_norm": 0.4111993908882141, + "learning_rate": 0.0001743485098584593, + "loss": 1.2768, + "step": 9884 + }, + { + "epoch": 0.1284510666084078, + "grad_norm": 0.37192025780677795, + "learning_rate": 0.00017434591039654793, + "loss": 1.5097, + "step": 9885 + }, + { + "epoch": 0.12846406115232367, + "grad_norm": 0.5264346599578857, + "learning_rate": 0.00017434331093463653, + "loss": 1.5803, + "step": 9886 + }, + { + "epoch": 0.12847705569623954, + "grad_norm": 0.3304254114627838, + "learning_rate": 0.00017434071147272515, + "loss": 1.4239, + "step": 9887 + }, + { + "epoch": 0.12849005024015542, + "grad_norm": 0.5335957407951355, + "learning_rate": 0.00017433811201081375, + "loss": 1.4026, + "step": 9888 + }, + { + "epoch": 0.1285030447840713, + "grad_norm": 0.42111849784851074, + "learning_rate": 0.0001743355125489024, + "loss": 1.5571, + "step": 9889 + }, + { + "epoch": 0.12851603932798716, + "grad_norm": 0.3414677083492279, + "learning_rate": 0.000174332913086991, + "loss": 1.4748, + "step": 9890 + }, + { + "epoch": 0.12852903387190304, + "grad_norm": 0.43178120255470276, + "learning_rate": 0.00017433031362507962, + "loss": 1.4642, + "step": 9891 + }, + { + "epoch": 0.1285420284158189, + "grad_norm": 0.3523310422897339, + "learning_rate": 0.00017432771416316825, + "loss": 1.484, + "step": 9892 + }, + { + "epoch": 0.12855502295973478, + "grad_norm": 0.3393974006175995, + "learning_rate": 0.00017432511470125684, + "loss": 1.3141, + "step": 9893 + }, + { + "epoch": 0.12856801750365066, + "grad_norm": 0.4137031137943268, + "learning_rate": 0.00017432251523934547, + "loss": 1.4515, + "step": 9894 + }, + { + "epoch": 0.12858101204756653, + "grad_norm": 0.43366238474845886, + "learning_rate": 0.00017431991577743406, + "loss": 1.5058, + "step": 9895 + }, + { + "epoch": 0.1285940065914824, + "grad_norm": 0.3363425135612488, + "learning_rate": 0.00017431731631552272, + "loss": 1.4453, + "step": 9896 + }, + { + "epoch": 0.12860700113539827, + "grad_norm": 0.47601714730262756, + "learning_rate": 0.00017431471685361131, + "loss": 1.5211, + "step": 9897 + }, + { + "epoch": 0.12861999567931415, + "grad_norm": 0.2401895374059677, + "learning_rate": 0.00017431211739169994, + "loss": 1.2929, + "step": 9898 + }, + { + "epoch": 0.12863299022323002, + "grad_norm": 0.44325509667396545, + "learning_rate": 0.00017430951792978854, + "loss": 1.4449, + "step": 9899 + }, + { + "epoch": 0.1286459847671459, + "grad_norm": 0.38812899589538574, + "learning_rate": 0.00017430691846787716, + "loss": 1.5306, + "step": 9900 + }, + { + "epoch": 0.12865897931106177, + "grad_norm": 0.5528951287269592, + "learning_rate": 0.00017430431900596578, + "loss": 1.392, + "step": 9901 + }, + { + "epoch": 0.12867197385497764, + "grad_norm": 0.3680340349674225, + "learning_rate": 0.00017430171954405438, + "loss": 1.2677, + "step": 9902 + }, + { + "epoch": 0.1286849683988935, + "grad_norm": 0.31848806142807007, + "learning_rate": 0.000174299120082143, + "loss": 1.2974, + "step": 9903 + }, + { + "epoch": 0.12869796294280939, + "grad_norm": 0.3728020489215851, + "learning_rate": 0.00017429652062023163, + "loss": 1.5088, + "step": 9904 + }, + { + "epoch": 0.12871095748672526, + "grad_norm": 0.41009002923965454, + "learning_rate": 0.00017429392115832023, + "loss": 1.4047, + "step": 9905 + }, + { + "epoch": 0.12872395203064113, + "grad_norm": 0.46786561608314514, + "learning_rate": 0.00017429132169640885, + "loss": 1.547, + "step": 9906 + }, + { + "epoch": 0.128736946574557, + "grad_norm": 0.4904294013977051, + "learning_rate": 0.00017428872223449745, + "loss": 1.5006, + "step": 9907 + }, + { + "epoch": 0.12874994111847288, + "grad_norm": 0.3381388187408447, + "learning_rate": 0.0001742861227725861, + "loss": 1.3418, + "step": 9908 + }, + { + "epoch": 0.12876293566238875, + "grad_norm": 0.3874700665473938, + "learning_rate": 0.0001742835233106747, + "loss": 1.3174, + "step": 9909 + }, + { + "epoch": 0.12877593020630462, + "grad_norm": 0.39168882369995117, + "learning_rate": 0.00017428092384876332, + "loss": 1.4861, + "step": 9910 + }, + { + "epoch": 0.1287889247502205, + "grad_norm": 0.4563981890678406, + "learning_rate": 0.00017427832438685192, + "loss": 1.4282, + "step": 9911 + }, + { + "epoch": 0.12880191929413637, + "grad_norm": 0.41339311003685, + "learning_rate": 0.00017427572492494055, + "loss": 1.3893, + "step": 9912 + }, + { + "epoch": 0.12881491383805224, + "grad_norm": 0.48420026898384094, + "learning_rate": 0.00017427312546302917, + "loss": 1.5723, + "step": 9913 + }, + { + "epoch": 0.12882790838196811, + "grad_norm": 0.29279735684394836, + "learning_rate": 0.00017427052600111777, + "loss": 1.3857, + "step": 9914 + }, + { + "epoch": 0.128840902925884, + "grad_norm": 0.39467430114746094, + "learning_rate": 0.0001742679265392064, + "loss": 1.6277, + "step": 9915 + }, + { + "epoch": 0.12885389746979986, + "grad_norm": 0.41499289870262146, + "learning_rate": 0.00017426532707729502, + "loss": 1.3248, + "step": 9916 + }, + { + "epoch": 0.12886689201371573, + "grad_norm": 0.3549771010875702, + "learning_rate": 0.00017426272761538361, + "loss": 1.5646, + "step": 9917 + }, + { + "epoch": 0.1288798865576316, + "grad_norm": 0.42362797260284424, + "learning_rate": 0.00017426012815347224, + "loss": 1.4338, + "step": 9918 + }, + { + "epoch": 0.12889288110154748, + "grad_norm": 0.406766414642334, + "learning_rate": 0.00017425752869156084, + "loss": 1.6041, + "step": 9919 + }, + { + "epoch": 0.12890587564546335, + "grad_norm": 0.4528195261955261, + "learning_rate": 0.0001742549292296495, + "loss": 1.3579, + "step": 9920 + }, + { + "epoch": 0.12891887018937923, + "grad_norm": 0.4112229645252228, + "learning_rate": 0.00017425232976773808, + "loss": 1.5462, + "step": 9921 + }, + { + "epoch": 0.1289318647332951, + "grad_norm": 0.4516632556915283, + "learning_rate": 0.0001742497303058267, + "loss": 1.6634, + "step": 9922 + }, + { + "epoch": 0.12894485927721097, + "grad_norm": 0.4296059012413025, + "learning_rate": 0.0001742471308439153, + "loss": 1.4795, + "step": 9923 + }, + { + "epoch": 0.12895785382112684, + "grad_norm": 0.43611133098602295, + "learning_rate": 0.00017424453138200393, + "loss": 1.4189, + "step": 9924 + }, + { + "epoch": 0.12897084836504272, + "grad_norm": 0.39404794573783875, + "learning_rate": 0.00017424193192009256, + "loss": 1.2678, + "step": 9925 + }, + { + "epoch": 0.1289838429089586, + "grad_norm": 0.3659047484397888, + "learning_rate": 0.00017423933245818115, + "loss": 1.3423, + "step": 9926 + }, + { + "epoch": 0.12899683745287446, + "grad_norm": 0.32098567485809326, + "learning_rate": 0.0001742367329962698, + "loss": 1.6296, + "step": 9927 + }, + { + "epoch": 0.12900983199679034, + "grad_norm": 0.42100879549980164, + "learning_rate": 0.0001742341335343584, + "loss": 1.4443, + "step": 9928 + }, + { + "epoch": 0.1290228265407062, + "grad_norm": 0.2853875458240509, + "learning_rate": 0.000174231534072447, + "loss": 1.3316, + "step": 9929 + }, + { + "epoch": 0.12903582108462208, + "grad_norm": 0.3158057630062103, + "learning_rate": 0.00017422893461053562, + "loss": 1.5502, + "step": 9930 + }, + { + "epoch": 0.12904881562853796, + "grad_norm": 0.3264307975769043, + "learning_rate": 0.00017422633514862425, + "loss": 1.4234, + "step": 9931 + }, + { + "epoch": 0.12906181017245383, + "grad_norm": 0.4035046696662903, + "learning_rate": 0.00017422373568671287, + "loss": 1.5789, + "step": 9932 + }, + { + "epoch": 0.1290748047163697, + "grad_norm": 0.3322938084602356, + "learning_rate": 0.00017422113622480147, + "loss": 1.2747, + "step": 9933 + }, + { + "epoch": 0.12908779926028557, + "grad_norm": 0.35565537214279175, + "learning_rate": 0.0001742185367628901, + "loss": 1.3475, + "step": 9934 + }, + { + "epoch": 0.12910079380420147, + "grad_norm": 0.44883137941360474, + "learning_rate": 0.00017421593730097872, + "loss": 1.5142, + "step": 9935 + }, + { + "epoch": 0.12911378834811735, + "grad_norm": 0.49093690514564514, + "learning_rate": 0.00017421333783906732, + "loss": 1.5117, + "step": 9936 + }, + { + "epoch": 0.12912678289203322, + "grad_norm": 0.3607786297798157, + "learning_rate": 0.00017421073837715594, + "loss": 1.611, + "step": 9937 + }, + { + "epoch": 0.1291397774359491, + "grad_norm": 0.38029131293296814, + "learning_rate": 0.00017420813891524454, + "loss": 1.5231, + "step": 9938 + }, + { + "epoch": 0.12915277197986497, + "grad_norm": 0.3842584490776062, + "learning_rate": 0.0001742055394533332, + "loss": 1.5305, + "step": 9939 + }, + { + "epoch": 0.12916576652378084, + "grad_norm": 0.37519174814224243, + "learning_rate": 0.0001742029399914218, + "loss": 1.265, + "step": 9940 + }, + { + "epoch": 0.1291787610676967, + "grad_norm": 0.3791878819465637, + "learning_rate": 0.00017420034052951038, + "loss": 1.3648, + "step": 9941 + }, + { + "epoch": 0.12919175561161259, + "grad_norm": 0.31812354922294617, + "learning_rate": 0.000174197741067599, + "loss": 1.3863, + "step": 9942 + }, + { + "epoch": 0.12920475015552846, + "grad_norm": 0.2940219044685364, + "learning_rate": 0.00017419514160568763, + "loss": 1.3034, + "step": 9943 + }, + { + "epoch": 0.12921774469944433, + "grad_norm": 0.38797011971473694, + "learning_rate": 0.00017419254214377626, + "loss": 1.3865, + "step": 9944 + }, + { + "epoch": 0.1292307392433602, + "grad_norm": 0.33331772685050964, + "learning_rate": 0.00017418994268186486, + "loss": 1.4643, + "step": 9945 + }, + { + "epoch": 0.12924373378727608, + "grad_norm": 0.3813553750514984, + "learning_rate": 0.00017418734321995348, + "loss": 1.4999, + "step": 9946 + }, + { + "epoch": 0.12925672833119195, + "grad_norm": 0.4414568245410919, + "learning_rate": 0.0001741847437580421, + "loss": 1.3252, + "step": 9947 + }, + { + "epoch": 0.12926972287510782, + "grad_norm": 0.3919098377227783, + "learning_rate": 0.0001741821442961307, + "loss": 1.3675, + "step": 9948 + }, + { + "epoch": 0.1292827174190237, + "grad_norm": 0.37384888529777527, + "learning_rate": 0.00017417954483421933, + "loss": 1.3339, + "step": 9949 + }, + { + "epoch": 0.12929571196293957, + "grad_norm": 0.4755634367465973, + "learning_rate": 0.00017417694537230792, + "loss": 1.4288, + "step": 9950 + }, + { + "epoch": 0.12930870650685544, + "grad_norm": 0.41291344165802, + "learning_rate": 0.00017417434591039658, + "loss": 1.6188, + "step": 9951 + }, + { + "epoch": 0.12932170105077131, + "grad_norm": 0.30861541628837585, + "learning_rate": 0.00017417174644848517, + "loss": 1.3917, + "step": 9952 + }, + { + "epoch": 0.1293346955946872, + "grad_norm": 0.40257593989372253, + "learning_rate": 0.0001741691469865738, + "loss": 1.3579, + "step": 9953 + }, + { + "epoch": 0.12934769013860306, + "grad_norm": 0.38112974166870117, + "learning_rate": 0.0001741665475246624, + "loss": 1.3009, + "step": 9954 + }, + { + "epoch": 0.12936068468251893, + "grad_norm": 0.41760537028312683, + "learning_rate": 0.00017416394806275102, + "loss": 1.2413, + "step": 9955 + }, + { + "epoch": 0.1293736792264348, + "grad_norm": 0.42220789194107056, + "learning_rate": 0.00017416134860083964, + "loss": 1.3973, + "step": 9956 + }, + { + "epoch": 0.12938667377035068, + "grad_norm": 0.41747868061065674, + "learning_rate": 0.00017415874913892824, + "loss": 1.4522, + "step": 9957 + }, + { + "epoch": 0.12939966831426655, + "grad_norm": 0.44334641098976135, + "learning_rate": 0.00017415614967701687, + "loss": 1.6256, + "step": 9958 + }, + { + "epoch": 0.12941266285818243, + "grad_norm": 0.3481699824333191, + "learning_rate": 0.0001741535502151055, + "loss": 1.4451, + "step": 9959 + }, + { + "epoch": 0.1294256574020983, + "grad_norm": 0.333882212638855, + "learning_rate": 0.0001741509507531941, + "loss": 1.4569, + "step": 9960 + }, + { + "epoch": 0.12943865194601417, + "grad_norm": 0.4541371464729309, + "learning_rate": 0.0001741483512912827, + "loss": 1.279, + "step": 9961 + }, + { + "epoch": 0.12945164648993004, + "grad_norm": 0.3667670786380768, + "learning_rate": 0.0001741457518293713, + "loss": 1.3596, + "step": 9962 + }, + { + "epoch": 0.12946464103384592, + "grad_norm": 0.4397205412387848, + "learning_rate": 0.00017414315236745996, + "loss": 1.5906, + "step": 9963 + }, + { + "epoch": 0.1294776355777618, + "grad_norm": 0.3494838774204254, + "learning_rate": 0.00017414055290554856, + "loss": 1.5216, + "step": 9964 + }, + { + "epoch": 0.12949063012167766, + "grad_norm": 0.41109204292297363, + "learning_rate": 0.00017413795344363718, + "loss": 1.5668, + "step": 9965 + }, + { + "epoch": 0.12950362466559354, + "grad_norm": 0.46567806601524353, + "learning_rate": 0.0001741353539817258, + "loss": 1.3667, + "step": 9966 + }, + { + "epoch": 0.1295166192095094, + "grad_norm": 0.43416059017181396, + "learning_rate": 0.0001741327545198144, + "loss": 1.5459, + "step": 9967 + }, + { + "epoch": 0.12952961375342528, + "grad_norm": 0.2909247875213623, + "learning_rate": 0.00017413015505790303, + "loss": 1.2324, + "step": 9968 + }, + { + "epoch": 0.12954260829734116, + "grad_norm": 0.42831721901893616, + "learning_rate": 0.00017412755559599163, + "loss": 1.3707, + "step": 9969 + }, + { + "epoch": 0.12955560284125703, + "grad_norm": 0.4365481734275818, + "learning_rate": 0.00017412495613408025, + "loss": 1.3582, + "step": 9970 + }, + { + "epoch": 0.1295685973851729, + "grad_norm": 0.351633757352829, + "learning_rate": 0.00017412235667216888, + "loss": 1.4046, + "step": 9971 + }, + { + "epoch": 0.12958159192908877, + "grad_norm": 0.35377952456474304, + "learning_rate": 0.00017411975721025747, + "loss": 1.238, + "step": 9972 + }, + { + "epoch": 0.12959458647300465, + "grad_norm": 0.38907644152641296, + "learning_rate": 0.0001741171577483461, + "loss": 1.4636, + "step": 9973 + }, + { + "epoch": 0.12960758101692052, + "grad_norm": 0.4175402820110321, + "learning_rate": 0.00017411455828643472, + "loss": 1.6342, + "step": 9974 + }, + { + "epoch": 0.1296205755608364, + "grad_norm": 0.2818557918071747, + "learning_rate": 0.00017411195882452335, + "loss": 1.4161, + "step": 9975 + }, + { + "epoch": 0.12963357010475227, + "grad_norm": 0.4198802411556244, + "learning_rate": 0.00017410935936261194, + "loss": 1.4363, + "step": 9976 + }, + { + "epoch": 0.12964656464866814, + "grad_norm": 0.39951491355895996, + "learning_rate": 0.00017410675990070057, + "loss": 1.5615, + "step": 9977 + }, + { + "epoch": 0.129659559192584, + "grad_norm": 0.38862529397010803, + "learning_rate": 0.0001741041604387892, + "loss": 1.4479, + "step": 9978 + }, + { + "epoch": 0.12967255373649988, + "grad_norm": 0.3623588979244232, + "learning_rate": 0.0001741015609768778, + "loss": 1.4353, + "step": 9979 + }, + { + "epoch": 0.12968554828041576, + "grad_norm": 0.35875847935676575, + "learning_rate": 0.00017409896151496641, + "loss": 1.2707, + "step": 9980 + }, + { + "epoch": 0.12969854282433163, + "grad_norm": 0.34811297059059143, + "learning_rate": 0.000174096362053055, + "loss": 1.5755, + "step": 9981 + }, + { + "epoch": 0.1297115373682475, + "grad_norm": 0.40699586272239685, + "learning_rate": 0.00017409376259114366, + "loss": 1.6184, + "step": 9982 + }, + { + "epoch": 0.12972453191216338, + "grad_norm": 0.3738786578178406, + "learning_rate": 0.00017409116312923226, + "loss": 1.2635, + "step": 9983 + }, + { + "epoch": 0.12973752645607925, + "grad_norm": 0.29468458890914917, + "learning_rate": 0.00017408856366732086, + "loss": 1.6569, + "step": 9984 + }, + { + "epoch": 0.12975052099999512, + "grad_norm": 0.43510788679122925, + "learning_rate": 0.00017408596420540948, + "loss": 1.4216, + "step": 9985 + }, + { + "epoch": 0.129763515543911, + "grad_norm": 0.33560457825660706, + "learning_rate": 0.0001740833647434981, + "loss": 1.3781, + "step": 9986 + }, + { + "epoch": 0.12977651008782687, + "grad_norm": 0.3269752562046051, + "learning_rate": 0.00017408076528158673, + "loss": 1.1667, + "step": 9987 + }, + { + "epoch": 0.12978950463174274, + "grad_norm": 0.4033019542694092, + "learning_rate": 0.00017407816581967533, + "loss": 1.3924, + "step": 9988 + }, + { + "epoch": 0.12980249917565861, + "grad_norm": 0.5156260132789612, + "learning_rate": 0.00017407556635776395, + "loss": 1.2991, + "step": 9989 + }, + { + "epoch": 0.1298154937195745, + "grad_norm": 0.4047981798648834, + "learning_rate": 0.00017407296689585258, + "loss": 1.4773, + "step": 9990 + }, + { + "epoch": 0.12982848826349036, + "grad_norm": 0.3604089915752411, + "learning_rate": 0.00017407036743394118, + "loss": 1.5106, + "step": 9991 + }, + { + "epoch": 0.12984148280740623, + "grad_norm": 0.4189310371875763, + "learning_rate": 0.0001740677679720298, + "loss": 1.3032, + "step": 9992 + }, + { + "epoch": 0.1298544773513221, + "grad_norm": 0.38563403487205505, + "learning_rate": 0.0001740651685101184, + "loss": 1.5565, + "step": 9993 + }, + { + "epoch": 0.12986747189523798, + "grad_norm": 0.371124804019928, + "learning_rate": 0.00017406256904820705, + "loss": 1.3489, + "step": 9994 + }, + { + "epoch": 0.12988046643915385, + "grad_norm": 0.40574729442596436, + "learning_rate": 0.00017405996958629565, + "loss": 1.3917, + "step": 9995 + }, + { + "epoch": 0.12989346098306973, + "grad_norm": 0.2749800682067871, + "learning_rate": 0.00017405737012438424, + "loss": 1.3252, + "step": 9996 + }, + { + "epoch": 0.1299064555269856, + "grad_norm": 0.3030362129211426, + "learning_rate": 0.00017405477066247287, + "loss": 1.2645, + "step": 9997 + }, + { + "epoch": 0.12991945007090147, + "grad_norm": 0.4077836871147156, + "learning_rate": 0.0001740521712005615, + "loss": 1.6004, + "step": 9998 + }, + { + "epoch": 0.12993244461481734, + "grad_norm": 0.47647544741630554, + "learning_rate": 0.00017404957173865012, + "loss": 1.4612, + "step": 9999 + }, + { + "epoch": 0.12994543915873322, + "grad_norm": 0.3247867524623871, + "learning_rate": 0.00017404697227673871, + "loss": 1.4825, + "step": 10000 + }, + { + "epoch": 0.1299584337026491, + "grad_norm": 0.3616940379142761, + "learning_rate": 0.00017404437281482734, + "loss": 1.299, + "step": 10001 + }, + { + "epoch": 0.12997142824656496, + "grad_norm": 0.4881681501865387, + "learning_rate": 0.00017404177335291596, + "loss": 1.5746, + "step": 10002 + }, + { + "epoch": 0.12998442279048084, + "grad_norm": 0.4217986762523651, + "learning_rate": 0.00017403917389100456, + "loss": 1.4649, + "step": 10003 + }, + { + "epoch": 0.1299974173343967, + "grad_norm": 0.5113387703895569, + "learning_rate": 0.00017403657442909318, + "loss": 1.7252, + "step": 10004 + }, + { + "epoch": 0.13001041187831258, + "grad_norm": 0.32327333092689514, + "learning_rate": 0.0001740339749671818, + "loss": 1.2406, + "step": 10005 + }, + { + "epoch": 0.13002340642222845, + "grad_norm": 0.3602115213871002, + "learning_rate": 0.00017403137550527043, + "loss": 1.569, + "step": 10006 + }, + { + "epoch": 0.13003640096614433, + "grad_norm": 0.43408840894699097, + "learning_rate": 0.00017402877604335903, + "loss": 1.5304, + "step": 10007 + }, + { + "epoch": 0.1300493955100602, + "grad_norm": 0.47221341729164124, + "learning_rate": 0.00017402617658144763, + "loss": 1.5453, + "step": 10008 + }, + { + "epoch": 0.13006239005397607, + "grad_norm": 0.45013290643692017, + "learning_rate": 0.00017402357711953628, + "loss": 1.4231, + "step": 10009 + }, + { + "epoch": 0.13007538459789195, + "grad_norm": 0.32998397946357727, + "learning_rate": 0.00017402097765762488, + "loss": 1.3098, + "step": 10010 + }, + { + "epoch": 0.13008837914180785, + "grad_norm": 0.29858359694480896, + "learning_rate": 0.0001740183781957135, + "loss": 1.3373, + "step": 10011 + }, + { + "epoch": 0.13010137368572372, + "grad_norm": 0.45778200030326843, + "learning_rate": 0.0001740157787338021, + "loss": 1.4403, + "step": 10012 + }, + { + "epoch": 0.1301143682296396, + "grad_norm": 0.397909939289093, + "learning_rate": 0.00017401317927189072, + "loss": 1.4327, + "step": 10013 + }, + { + "epoch": 0.13012736277355547, + "grad_norm": 0.35794296860694885, + "learning_rate": 0.00017401057980997935, + "loss": 1.2409, + "step": 10014 + }, + { + "epoch": 0.13014035731747134, + "grad_norm": 0.40209275484085083, + "learning_rate": 0.00017400798034806795, + "loss": 1.2208, + "step": 10015 + }, + { + "epoch": 0.1301533518613872, + "grad_norm": 0.39884623885154724, + "learning_rate": 0.00017400538088615657, + "loss": 1.3237, + "step": 10016 + }, + { + "epoch": 0.13016634640530308, + "grad_norm": 0.3505159020423889, + "learning_rate": 0.0001740027814242452, + "loss": 1.4201, + "step": 10017 + }, + { + "epoch": 0.13017934094921896, + "grad_norm": 0.48346638679504395, + "learning_rate": 0.00017400018196233382, + "loss": 1.6036, + "step": 10018 + }, + { + "epoch": 0.13019233549313483, + "grad_norm": 0.41650640964508057, + "learning_rate": 0.00017399758250042242, + "loss": 1.3955, + "step": 10019 + }, + { + "epoch": 0.1302053300370507, + "grad_norm": 0.3803153336048126, + "learning_rate": 0.00017399498303851104, + "loss": 1.333, + "step": 10020 + }, + { + "epoch": 0.13021832458096658, + "grad_norm": 0.4341132938861847, + "learning_rate": 0.00017399238357659967, + "loss": 1.4136, + "step": 10021 + }, + { + "epoch": 0.13023131912488245, + "grad_norm": 0.44876110553741455, + "learning_rate": 0.00017398978411468826, + "loss": 1.4719, + "step": 10022 + }, + { + "epoch": 0.13024431366879832, + "grad_norm": 0.43796607851982117, + "learning_rate": 0.0001739871846527769, + "loss": 1.5937, + "step": 10023 + }, + { + "epoch": 0.1302573082127142, + "grad_norm": 0.3934550881385803, + "learning_rate": 0.00017398458519086548, + "loss": 1.5697, + "step": 10024 + }, + { + "epoch": 0.13027030275663007, + "grad_norm": 0.3820855915546417, + "learning_rate": 0.0001739819857289541, + "loss": 1.476, + "step": 10025 + }, + { + "epoch": 0.13028329730054594, + "grad_norm": 0.405866801738739, + "learning_rate": 0.00017397938626704273, + "loss": 1.3468, + "step": 10026 + }, + { + "epoch": 0.13029629184446181, + "grad_norm": 0.3781905174255371, + "learning_rate": 0.00017397678680513133, + "loss": 1.6149, + "step": 10027 + }, + { + "epoch": 0.1303092863883777, + "grad_norm": 0.40196311473846436, + "learning_rate": 0.00017397418734321996, + "loss": 1.502, + "step": 10028 + }, + { + "epoch": 0.13032228093229356, + "grad_norm": 0.3302616775035858, + "learning_rate": 0.00017397158788130858, + "loss": 1.4482, + "step": 10029 + }, + { + "epoch": 0.13033527547620943, + "grad_norm": 0.3582909107208252, + "learning_rate": 0.0001739689884193972, + "loss": 1.2684, + "step": 10030 + }, + { + "epoch": 0.1303482700201253, + "grad_norm": 0.5488734841346741, + "learning_rate": 0.0001739663889574858, + "loss": 1.348, + "step": 10031 + }, + { + "epoch": 0.13036126456404118, + "grad_norm": 0.42265790700912476, + "learning_rate": 0.00017396378949557443, + "loss": 1.6309, + "step": 10032 + }, + { + "epoch": 0.13037425910795705, + "grad_norm": 0.3424038887023926, + "learning_rate": 0.00017396119003366305, + "loss": 1.6459, + "step": 10033 + }, + { + "epoch": 0.13038725365187293, + "grad_norm": 0.31561824679374695, + "learning_rate": 0.00017395859057175165, + "loss": 1.5422, + "step": 10034 + }, + { + "epoch": 0.1304002481957888, + "grad_norm": 0.3706008195877075, + "learning_rate": 0.00017395599110984027, + "loss": 1.654, + "step": 10035 + }, + { + "epoch": 0.13041324273970467, + "grad_norm": 0.3678598999977112, + "learning_rate": 0.00017395339164792887, + "loss": 1.4911, + "step": 10036 + }, + { + "epoch": 0.13042623728362054, + "grad_norm": 0.4187855124473572, + "learning_rate": 0.0001739507921860175, + "loss": 1.2834, + "step": 10037 + }, + { + "epoch": 0.13043923182753642, + "grad_norm": 0.28725212812423706, + "learning_rate": 0.00017394819272410612, + "loss": 1.4427, + "step": 10038 + }, + { + "epoch": 0.1304522263714523, + "grad_norm": 0.4345038831233978, + "learning_rate": 0.00017394559326219472, + "loss": 1.5042, + "step": 10039 + }, + { + "epoch": 0.13046522091536816, + "grad_norm": 0.4611448645591736, + "learning_rate": 0.00017394299380028337, + "loss": 1.6384, + "step": 10040 + }, + { + "epoch": 0.13047821545928404, + "grad_norm": 0.3396282196044922, + "learning_rate": 0.00017394039433837197, + "loss": 1.2994, + "step": 10041 + }, + { + "epoch": 0.1304912100031999, + "grad_norm": 0.44800370931625366, + "learning_rate": 0.0001739377948764606, + "loss": 1.5327, + "step": 10042 + }, + { + "epoch": 0.13050420454711578, + "grad_norm": 0.36406949162483215, + "learning_rate": 0.0001739351954145492, + "loss": 1.5999, + "step": 10043 + }, + { + "epoch": 0.13051719909103165, + "grad_norm": 0.3297083377838135, + "learning_rate": 0.0001739325959526378, + "loss": 1.302, + "step": 10044 + }, + { + "epoch": 0.13053019363494753, + "grad_norm": 0.3537190854549408, + "learning_rate": 0.00017392999649072644, + "loss": 1.3884, + "step": 10045 + }, + { + "epoch": 0.1305431881788634, + "grad_norm": 0.388339102268219, + "learning_rate": 0.00017392739702881503, + "loss": 1.3715, + "step": 10046 + }, + { + "epoch": 0.13055618272277927, + "grad_norm": 0.2931365370750427, + "learning_rate": 0.00017392479756690366, + "loss": 1.2103, + "step": 10047 + }, + { + "epoch": 0.13056917726669515, + "grad_norm": 0.37645572423934937, + "learning_rate": 0.00017392219810499228, + "loss": 1.3214, + "step": 10048 + }, + { + "epoch": 0.13058217181061102, + "grad_norm": 0.3196539878845215, + "learning_rate": 0.0001739195986430809, + "loss": 1.528, + "step": 10049 + }, + { + "epoch": 0.1305951663545269, + "grad_norm": 0.45146259665489197, + "learning_rate": 0.0001739169991811695, + "loss": 1.4401, + "step": 10050 + }, + { + "epoch": 0.13060816089844277, + "grad_norm": 0.3216227889060974, + "learning_rate": 0.0001739143997192581, + "loss": 1.5061, + "step": 10051 + }, + { + "epoch": 0.13062115544235864, + "grad_norm": 0.3479582667350769, + "learning_rate": 0.00017391180025734675, + "loss": 1.5469, + "step": 10052 + }, + { + "epoch": 0.1306341499862745, + "grad_norm": 0.3404166102409363, + "learning_rate": 0.00017390920079543535, + "loss": 1.4499, + "step": 10053 + }, + { + "epoch": 0.13064714453019038, + "grad_norm": 0.37766242027282715, + "learning_rate": 0.00017390660133352398, + "loss": 1.4437, + "step": 10054 + }, + { + "epoch": 0.13066013907410626, + "grad_norm": 0.2526474893093109, + "learning_rate": 0.00017390400187161257, + "loss": 1.2312, + "step": 10055 + }, + { + "epoch": 0.13067313361802213, + "grad_norm": 0.4539259970188141, + "learning_rate": 0.0001739014024097012, + "loss": 1.5017, + "step": 10056 + }, + { + "epoch": 0.130686128161938, + "grad_norm": 0.35707470774650574, + "learning_rate": 0.00017389880294778982, + "loss": 1.3938, + "step": 10057 + }, + { + "epoch": 0.13069912270585388, + "grad_norm": 0.37519896030426025, + "learning_rate": 0.00017389620348587842, + "loss": 1.3482, + "step": 10058 + }, + { + "epoch": 0.13071211724976975, + "grad_norm": 0.4118571877479553, + "learning_rate": 0.00017389360402396704, + "loss": 1.392, + "step": 10059 + }, + { + "epoch": 0.13072511179368562, + "grad_norm": 0.29952940344810486, + "learning_rate": 0.00017389100456205567, + "loss": 1.1987, + "step": 10060 + }, + { + "epoch": 0.1307381063376015, + "grad_norm": 0.39584389328956604, + "learning_rate": 0.0001738884051001443, + "loss": 1.4993, + "step": 10061 + }, + { + "epoch": 0.13075110088151737, + "grad_norm": 0.30917489528656006, + "learning_rate": 0.0001738858056382329, + "loss": 1.1698, + "step": 10062 + }, + { + "epoch": 0.13076409542543324, + "grad_norm": 0.4434426724910736, + "learning_rate": 0.0001738832061763215, + "loss": 1.3465, + "step": 10063 + }, + { + "epoch": 0.13077708996934911, + "grad_norm": 0.4485422670841217, + "learning_rate": 0.00017388060671441014, + "loss": 1.5882, + "step": 10064 + }, + { + "epoch": 0.130790084513265, + "grad_norm": 0.4047817289829254, + "learning_rate": 0.00017387800725249874, + "loss": 1.3376, + "step": 10065 + }, + { + "epoch": 0.13080307905718086, + "grad_norm": 0.3567257225513458, + "learning_rate": 0.00017387540779058736, + "loss": 1.4853, + "step": 10066 + }, + { + "epoch": 0.13081607360109673, + "grad_norm": 0.3692239820957184, + "learning_rate": 0.00017387280832867596, + "loss": 1.3567, + "step": 10067 + }, + { + "epoch": 0.1308290681450126, + "grad_norm": 0.511049211025238, + "learning_rate": 0.00017387020886676458, + "loss": 1.4765, + "step": 10068 + }, + { + "epoch": 0.13084206268892848, + "grad_norm": 0.40648844838142395, + "learning_rate": 0.0001738676094048532, + "loss": 1.3206, + "step": 10069 + }, + { + "epoch": 0.13085505723284435, + "grad_norm": 0.2965278625488281, + "learning_rate": 0.0001738650099429418, + "loss": 1.4545, + "step": 10070 + }, + { + "epoch": 0.13086805177676022, + "grad_norm": 0.3403579592704773, + "learning_rate": 0.00017386241048103043, + "loss": 1.6177, + "step": 10071 + }, + { + "epoch": 0.1308810463206761, + "grad_norm": 0.3705838620662689, + "learning_rate": 0.00017385981101911905, + "loss": 1.5671, + "step": 10072 + }, + { + "epoch": 0.13089404086459197, + "grad_norm": 0.43603697419166565, + "learning_rate": 0.00017385721155720768, + "loss": 1.4139, + "step": 10073 + }, + { + "epoch": 0.13090703540850784, + "grad_norm": 0.3996585011482239, + "learning_rate": 0.00017385461209529628, + "loss": 1.4051, + "step": 10074 + }, + { + "epoch": 0.13092002995242372, + "grad_norm": 0.44317492842674255, + "learning_rate": 0.0001738520126333849, + "loss": 1.5417, + "step": 10075 + }, + { + "epoch": 0.1309330244963396, + "grad_norm": 0.3266716003417969, + "learning_rate": 0.00017384941317147352, + "loss": 1.719, + "step": 10076 + }, + { + "epoch": 0.13094601904025546, + "grad_norm": 0.45574870705604553, + "learning_rate": 0.00017384681370956212, + "loss": 1.4382, + "step": 10077 + }, + { + "epoch": 0.13095901358417134, + "grad_norm": 0.45127570629119873, + "learning_rate": 0.00017384421424765075, + "loss": 1.5466, + "step": 10078 + }, + { + "epoch": 0.1309720081280872, + "grad_norm": 0.43713030219078064, + "learning_rate": 0.00017384161478573937, + "loss": 1.4455, + "step": 10079 + }, + { + "epoch": 0.13098500267200308, + "grad_norm": 0.42944011092185974, + "learning_rate": 0.00017383901532382797, + "loss": 1.2745, + "step": 10080 + }, + { + "epoch": 0.13099799721591895, + "grad_norm": 0.43258917331695557, + "learning_rate": 0.0001738364158619166, + "loss": 1.5097, + "step": 10081 + }, + { + "epoch": 0.13101099175983483, + "grad_norm": 0.3020515441894531, + "learning_rate": 0.0001738338164000052, + "loss": 1.2863, + "step": 10082 + }, + { + "epoch": 0.1310239863037507, + "grad_norm": 0.4114360213279724, + "learning_rate": 0.00017383121693809384, + "loss": 1.4895, + "step": 10083 + }, + { + "epoch": 0.13103698084766657, + "grad_norm": 0.44200456142425537, + "learning_rate": 0.00017382861747618244, + "loss": 1.4812, + "step": 10084 + }, + { + "epoch": 0.13104997539158245, + "grad_norm": 0.3922804892063141, + "learning_rate": 0.00017382601801427106, + "loss": 1.4677, + "step": 10085 + }, + { + "epoch": 0.13106296993549832, + "grad_norm": 0.4374137818813324, + "learning_rate": 0.00017382341855235966, + "loss": 1.4032, + "step": 10086 + }, + { + "epoch": 0.13107596447941422, + "grad_norm": 0.3786354660987854, + "learning_rate": 0.00017382081909044829, + "loss": 1.2245, + "step": 10087 + }, + { + "epoch": 0.1310889590233301, + "grad_norm": 0.3816983699798584, + "learning_rate": 0.0001738182196285369, + "loss": 1.4987, + "step": 10088 + }, + { + "epoch": 0.13110195356724597, + "grad_norm": 0.4352808892726898, + "learning_rate": 0.0001738156201666255, + "loss": 1.3087, + "step": 10089 + }, + { + "epoch": 0.13111494811116184, + "grad_norm": 0.3937617242336273, + "learning_rate": 0.00017381302070471413, + "loss": 1.297, + "step": 10090 + }, + { + "epoch": 0.1311279426550777, + "grad_norm": 0.3940543830394745, + "learning_rate": 0.00017381042124280276, + "loss": 1.5594, + "step": 10091 + }, + { + "epoch": 0.13114093719899358, + "grad_norm": 0.3190916180610657, + "learning_rate": 0.00017380782178089135, + "loss": 1.4878, + "step": 10092 + }, + { + "epoch": 0.13115393174290946, + "grad_norm": 0.3550054132938385, + "learning_rate": 0.00017380522231897998, + "loss": 1.3759, + "step": 10093 + }, + { + "epoch": 0.13116692628682533, + "grad_norm": 0.3647286593914032, + "learning_rate": 0.00017380262285706858, + "loss": 1.4697, + "step": 10094 + }, + { + "epoch": 0.1311799208307412, + "grad_norm": 0.43294957280158997, + "learning_rate": 0.00017380002339515723, + "loss": 1.2944, + "step": 10095 + }, + { + "epoch": 0.13119291537465708, + "grad_norm": 0.4288536012172699, + "learning_rate": 0.00017379742393324582, + "loss": 1.322, + "step": 10096 + }, + { + "epoch": 0.13120590991857295, + "grad_norm": 0.4165160357952118, + "learning_rate": 0.00017379482447133445, + "loss": 1.4039, + "step": 10097 + }, + { + "epoch": 0.13121890446248882, + "grad_norm": 0.3345651626586914, + "learning_rate": 0.00017379222500942305, + "loss": 1.3406, + "step": 10098 + }, + { + "epoch": 0.1312318990064047, + "grad_norm": 0.45656460523605347, + "learning_rate": 0.00017378962554751167, + "loss": 1.571, + "step": 10099 + }, + { + "epoch": 0.13124489355032057, + "grad_norm": 0.45965951681137085, + "learning_rate": 0.0001737870260856003, + "loss": 1.5504, + "step": 10100 + }, + { + "epoch": 0.13125788809423644, + "grad_norm": 0.3732752799987793, + "learning_rate": 0.0001737844266236889, + "loss": 1.5779, + "step": 10101 + }, + { + "epoch": 0.13127088263815231, + "grad_norm": 0.42539912462234497, + "learning_rate": 0.00017378182716177752, + "loss": 1.4566, + "step": 10102 + }, + { + "epoch": 0.1312838771820682, + "grad_norm": 0.3683663010597229, + "learning_rate": 0.00017377922769986614, + "loss": 1.5273, + "step": 10103 + }, + { + "epoch": 0.13129687172598406, + "grad_norm": 0.4218456447124481, + "learning_rate": 0.00017377662823795477, + "loss": 1.4019, + "step": 10104 + }, + { + "epoch": 0.13130986626989993, + "grad_norm": 0.36343470215797424, + "learning_rate": 0.00017377402877604336, + "loss": 1.4652, + "step": 10105 + }, + { + "epoch": 0.1313228608138158, + "grad_norm": 0.37586092948913574, + "learning_rate": 0.00017377142931413196, + "loss": 1.7567, + "step": 10106 + }, + { + "epoch": 0.13133585535773168, + "grad_norm": 0.3357359766960144, + "learning_rate": 0.0001737688298522206, + "loss": 1.46, + "step": 10107 + }, + { + "epoch": 0.13134884990164755, + "grad_norm": 0.41328558325767517, + "learning_rate": 0.0001737662303903092, + "loss": 1.342, + "step": 10108 + }, + { + "epoch": 0.13136184444556342, + "grad_norm": 0.40142837166786194, + "learning_rate": 0.00017376363092839783, + "loss": 1.4464, + "step": 10109 + }, + { + "epoch": 0.1313748389894793, + "grad_norm": 0.3585902452468872, + "learning_rate": 0.00017376103146648643, + "loss": 1.4277, + "step": 10110 + }, + { + "epoch": 0.13138783353339517, + "grad_norm": 0.4308318495750427, + "learning_rate": 0.00017375843200457506, + "loss": 1.403, + "step": 10111 + }, + { + "epoch": 0.13140082807731104, + "grad_norm": 0.40160951018333435, + "learning_rate": 0.00017375583254266368, + "loss": 1.5691, + "step": 10112 + }, + { + "epoch": 0.13141382262122692, + "grad_norm": 0.3902919590473175, + "learning_rate": 0.00017375323308075228, + "loss": 1.5813, + "step": 10113 + }, + { + "epoch": 0.1314268171651428, + "grad_norm": 0.39474159479141235, + "learning_rate": 0.00017375063361884093, + "loss": 1.4, + "step": 10114 + }, + { + "epoch": 0.13143981170905866, + "grad_norm": 0.43503084778785706, + "learning_rate": 0.00017374803415692953, + "loss": 1.3456, + "step": 10115 + }, + { + "epoch": 0.13145280625297454, + "grad_norm": 0.33803674578666687, + "learning_rate": 0.00017374543469501815, + "loss": 1.3012, + "step": 10116 + }, + { + "epoch": 0.1314658007968904, + "grad_norm": 0.35437917709350586, + "learning_rate": 0.00017374283523310675, + "loss": 1.451, + "step": 10117 + }, + { + "epoch": 0.13147879534080628, + "grad_norm": 0.4315512478351593, + "learning_rate": 0.00017374023577119537, + "loss": 1.6047, + "step": 10118 + }, + { + "epoch": 0.13149178988472215, + "grad_norm": 0.39365872740745544, + "learning_rate": 0.000173737636309284, + "loss": 1.4927, + "step": 10119 + }, + { + "epoch": 0.13150478442863803, + "grad_norm": 0.2758640646934509, + "learning_rate": 0.0001737350368473726, + "loss": 1.3181, + "step": 10120 + }, + { + "epoch": 0.1315177789725539, + "grad_norm": 0.374590665102005, + "learning_rate": 0.00017373243738546122, + "loss": 1.4964, + "step": 10121 + }, + { + "epoch": 0.13153077351646977, + "grad_norm": 0.3375817835330963, + "learning_rate": 0.00017372983792354984, + "loss": 1.3943, + "step": 10122 + }, + { + "epoch": 0.13154376806038565, + "grad_norm": 0.3470326364040375, + "learning_rate": 0.00017372723846163844, + "loss": 1.3671, + "step": 10123 + }, + { + "epoch": 0.13155676260430152, + "grad_norm": 0.39509227871894836, + "learning_rate": 0.00017372463899972707, + "loss": 1.4218, + "step": 10124 + }, + { + "epoch": 0.1315697571482174, + "grad_norm": 0.398009717464447, + "learning_rate": 0.00017372203953781566, + "loss": 1.3567, + "step": 10125 + }, + { + "epoch": 0.13158275169213327, + "grad_norm": 0.39217978715896606, + "learning_rate": 0.00017371944007590431, + "loss": 1.5198, + "step": 10126 + }, + { + "epoch": 0.13159574623604914, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0001737168406139929, + "loss": 1.5381, + "step": 10127 + }, + { + "epoch": 0.131608740779965, + "grad_norm": 0.40015125274658203, + "learning_rate": 0.00017371424115208154, + "loss": 1.1266, + "step": 10128 + }, + { + "epoch": 0.13162173532388088, + "grad_norm": 0.42026111483573914, + "learning_rate": 0.00017371164169017013, + "loss": 1.575, + "step": 10129 + }, + { + "epoch": 0.13163472986779676, + "grad_norm": 0.3741748332977295, + "learning_rate": 0.00017370904222825876, + "loss": 1.4243, + "step": 10130 + }, + { + "epoch": 0.13164772441171263, + "grad_norm": 0.43683138489723206, + "learning_rate": 0.00017370644276634738, + "loss": 1.3241, + "step": 10131 + }, + { + "epoch": 0.1316607189556285, + "grad_norm": 0.4618951976299286, + "learning_rate": 0.00017370384330443598, + "loss": 1.4304, + "step": 10132 + }, + { + "epoch": 0.13167371349954438, + "grad_norm": 0.4014121890068054, + "learning_rate": 0.0001737012438425246, + "loss": 1.5852, + "step": 10133 + }, + { + "epoch": 0.13168670804346025, + "grad_norm": 0.365263968706131, + "learning_rate": 0.00017369864438061323, + "loss": 1.6531, + "step": 10134 + }, + { + "epoch": 0.13169970258737612, + "grad_norm": 0.27088990807533264, + "learning_rate": 0.00017369604491870183, + "loss": 1.2816, + "step": 10135 + }, + { + "epoch": 0.131712697131292, + "grad_norm": 0.41688206791877747, + "learning_rate": 0.00017369344545679045, + "loss": 1.3097, + "step": 10136 + }, + { + "epoch": 0.13172569167520787, + "grad_norm": 0.40518784523010254, + "learning_rate": 0.00017369084599487905, + "loss": 1.3726, + "step": 10137 + }, + { + "epoch": 0.13173868621912374, + "grad_norm": 0.38007235527038574, + "learning_rate": 0.0001736882465329677, + "loss": 1.3544, + "step": 10138 + }, + { + "epoch": 0.1317516807630396, + "grad_norm": 0.40002375841140747, + "learning_rate": 0.0001736856470710563, + "loss": 1.6149, + "step": 10139 + }, + { + "epoch": 0.1317646753069555, + "grad_norm": 0.36495813727378845, + "learning_rate": 0.00017368304760914492, + "loss": 1.4099, + "step": 10140 + }, + { + "epoch": 0.13177766985087136, + "grad_norm": 0.33296138048171997, + "learning_rate": 0.00017368044814723352, + "loss": 1.3445, + "step": 10141 + }, + { + "epoch": 0.13179066439478723, + "grad_norm": 0.5464882254600525, + "learning_rate": 0.00017367784868532214, + "loss": 1.4817, + "step": 10142 + }, + { + "epoch": 0.1318036589387031, + "grad_norm": 0.3261016011238098, + "learning_rate": 0.00017367524922341077, + "loss": 1.3142, + "step": 10143 + }, + { + "epoch": 0.13181665348261898, + "grad_norm": 0.47110143303871155, + "learning_rate": 0.00017367264976149937, + "loss": 1.3247, + "step": 10144 + }, + { + "epoch": 0.13182964802653485, + "grad_norm": 0.3830544054508209, + "learning_rate": 0.000173670050299588, + "loss": 1.5533, + "step": 10145 + }, + { + "epoch": 0.13184264257045072, + "grad_norm": 0.3269497752189636, + "learning_rate": 0.00017366745083767661, + "loss": 1.4121, + "step": 10146 + }, + { + "epoch": 0.1318556371143666, + "grad_norm": 0.35049694776535034, + "learning_rate": 0.0001736648513757652, + "loss": 1.1665, + "step": 10147 + }, + { + "epoch": 0.13186863165828247, + "grad_norm": 0.4158876836299896, + "learning_rate": 0.00017366225191385384, + "loss": 1.4094, + "step": 10148 + }, + { + "epoch": 0.13188162620219834, + "grad_norm": 0.43320849537849426, + "learning_rate": 0.00017365965245194246, + "loss": 1.5473, + "step": 10149 + }, + { + "epoch": 0.13189462074611422, + "grad_norm": 0.35757991671562195, + "learning_rate": 0.00017365705299003109, + "loss": 1.436, + "step": 10150 + }, + { + "epoch": 0.1319076152900301, + "grad_norm": 0.4002746343612671, + "learning_rate": 0.00017365445352811968, + "loss": 1.3635, + "step": 10151 + }, + { + "epoch": 0.13192060983394596, + "grad_norm": 0.29216960072517395, + "learning_rate": 0.0001736518540662083, + "loss": 1.3389, + "step": 10152 + }, + { + "epoch": 0.13193360437786184, + "grad_norm": 0.325194776058197, + "learning_rate": 0.00017364925460429693, + "loss": 1.1925, + "step": 10153 + }, + { + "epoch": 0.1319465989217777, + "grad_norm": 0.5074470043182373, + "learning_rate": 0.00017364665514238553, + "loss": 1.432, + "step": 10154 + }, + { + "epoch": 0.13195959346569358, + "grad_norm": 0.451858788728714, + "learning_rate": 0.00017364405568047415, + "loss": 1.3861, + "step": 10155 + }, + { + "epoch": 0.13197258800960945, + "grad_norm": 0.3618403971195221, + "learning_rate": 0.00017364145621856275, + "loss": 1.6644, + "step": 10156 + }, + { + "epoch": 0.13198558255352533, + "grad_norm": 0.3296149671077728, + "learning_rate": 0.0001736388567566514, + "loss": 1.3162, + "step": 10157 + }, + { + "epoch": 0.1319985770974412, + "grad_norm": 0.34646278619766235, + "learning_rate": 0.00017363625729474, + "loss": 1.5117, + "step": 10158 + }, + { + "epoch": 0.13201157164135707, + "grad_norm": 0.3559836745262146, + "learning_rate": 0.00017363365783282862, + "loss": 1.3612, + "step": 10159 + }, + { + "epoch": 0.13202456618527295, + "grad_norm": 0.4512968957424164, + "learning_rate": 0.00017363105837091722, + "loss": 1.4755, + "step": 10160 + }, + { + "epoch": 0.13203756072918882, + "grad_norm": 0.38232871890068054, + "learning_rate": 0.00017362845890900585, + "loss": 1.4515, + "step": 10161 + }, + { + "epoch": 0.1320505552731047, + "grad_norm": 0.38601744174957275, + "learning_rate": 0.00017362585944709447, + "loss": 1.4488, + "step": 10162 + }, + { + "epoch": 0.13206354981702056, + "grad_norm": 0.5873156189918518, + "learning_rate": 0.00017362325998518307, + "loss": 1.5512, + "step": 10163 + }, + { + "epoch": 0.13207654436093647, + "grad_norm": 0.4278324842453003, + "learning_rate": 0.0001736206605232717, + "loss": 1.517, + "step": 10164 + }, + { + "epoch": 0.13208953890485234, + "grad_norm": 0.3717762231826782, + "learning_rate": 0.00017361806106136032, + "loss": 1.3807, + "step": 10165 + }, + { + "epoch": 0.1321025334487682, + "grad_norm": 0.3839687407016754, + "learning_rate": 0.00017361546159944891, + "loss": 1.5757, + "step": 10166 + }, + { + "epoch": 0.13211552799268408, + "grad_norm": 0.3228435516357422, + "learning_rate": 0.00017361286213753754, + "loss": 1.4229, + "step": 10167 + }, + { + "epoch": 0.13212852253659996, + "grad_norm": 0.312141090631485, + "learning_rate": 0.00017361026267562614, + "loss": 1.2159, + "step": 10168 + }, + { + "epoch": 0.13214151708051583, + "grad_norm": 0.46125009655952454, + "learning_rate": 0.0001736076632137148, + "loss": 1.3299, + "step": 10169 + }, + { + "epoch": 0.1321545116244317, + "grad_norm": 0.2772444486618042, + "learning_rate": 0.00017360506375180339, + "loss": 1.2596, + "step": 10170 + }, + { + "epoch": 0.13216750616834758, + "grad_norm": 0.4322095513343811, + "learning_rate": 0.000173602464289892, + "loss": 1.5936, + "step": 10171 + }, + { + "epoch": 0.13218050071226345, + "grad_norm": 0.31296905875205994, + "learning_rate": 0.0001735998648279806, + "loss": 1.4962, + "step": 10172 + }, + { + "epoch": 0.13219349525617932, + "grad_norm": 0.44079339504241943, + "learning_rate": 0.00017359726536606923, + "loss": 1.3271, + "step": 10173 + }, + { + "epoch": 0.1322064898000952, + "grad_norm": 0.4436073899269104, + "learning_rate": 0.00017359466590415786, + "loss": 1.3907, + "step": 10174 + }, + { + "epoch": 0.13221948434401107, + "grad_norm": 0.3870185315608978, + "learning_rate": 0.00017359206644224645, + "loss": 1.4656, + "step": 10175 + }, + { + "epoch": 0.13223247888792694, + "grad_norm": 0.370878666639328, + "learning_rate": 0.00017358946698033508, + "loss": 1.5911, + "step": 10176 + }, + { + "epoch": 0.1322454734318428, + "grad_norm": 0.43744736909866333, + "learning_rate": 0.0001735868675184237, + "loss": 1.3463, + "step": 10177 + }, + { + "epoch": 0.1322584679757587, + "grad_norm": 0.4157949984073639, + "learning_rate": 0.0001735842680565123, + "loss": 1.3609, + "step": 10178 + }, + { + "epoch": 0.13227146251967456, + "grad_norm": 0.4446072280406952, + "learning_rate": 0.00017358166859460092, + "loss": 1.4509, + "step": 10179 + }, + { + "epoch": 0.13228445706359043, + "grad_norm": 0.8006092309951782, + "learning_rate": 0.00017357906913268952, + "loss": 1.3285, + "step": 10180 + }, + { + "epoch": 0.1322974516075063, + "grad_norm": 0.48992887139320374, + "learning_rate": 0.00017357646967077817, + "loss": 1.3868, + "step": 10181 + }, + { + "epoch": 0.13231044615142218, + "grad_norm": 0.38604074716567993, + "learning_rate": 0.00017357387020886677, + "loss": 1.4719, + "step": 10182 + }, + { + "epoch": 0.13232344069533805, + "grad_norm": 0.4488598704338074, + "learning_rate": 0.0001735712707469554, + "loss": 1.5205, + "step": 10183 + }, + { + "epoch": 0.13233643523925392, + "grad_norm": 0.37880659103393555, + "learning_rate": 0.000173568671285044, + "loss": 1.5112, + "step": 10184 + }, + { + "epoch": 0.1323494297831698, + "grad_norm": 0.43228811025619507, + "learning_rate": 0.00017356607182313262, + "loss": 1.4397, + "step": 10185 + }, + { + "epoch": 0.13236242432708567, + "grad_norm": 0.44876420497894287, + "learning_rate": 0.00017356347236122124, + "loss": 1.4611, + "step": 10186 + }, + { + "epoch": 0.13237541887100154, + "grad_norm": 0.33343929052352905, + "learning_rate": 0.00017356087289930984, + "loss": 1.5622, + "step": 10187 + }, + { + "epoch": 0.13238841341491742, + "grad_norm": 0.3798485994338989, + "learning_rate": 0.0001735582734373985, + "loss": 1.3399, + "step": 10188 + }, + { + "epoch": 0.1324014079588333, + "grad_norm": 0.47137361764907837, + "learning_rate": 0.0001735556739754871, + "loss": 1.564, + "step": 10189 + }, + { + "epoch": 0.13241440250274916, + "grad_norm": 0.3483763635158539, + "learning_rate": 0.00017355307451357569, + "loss": 1.2146, + "step": 10190 + }, + { + "epoch": 0.13242739704666504, + "grad_norm": 0.43212610483169556, + "learning_rate": 0.0001735504750516643, + "loss": 1.5085, + "step": 10191 + }, + { + "epoch": 0.1324403915905809, + "grad_norm": 0.351666659116745, + "learning_rate": 0.00017354787558975293, + "loss": 1.3518, + "step": 10192 + }, + { + "epoch": 0.13245338613449678, + "grad_norm": 0.41554951667785645, + "learning_rate": 0.00017354527612784156, + "loss": 1.467, + "step": 10193 + }, + { + "epoch": 0.13246638067841265, + "grad_norm": 0.2740756571292877, + "learning_rate": 0.00017354267666593016, + "loss": 1.1586, + "step": 10194 + }, + { + "epoch": 0.13247937522232853, + "grad_norm": 0.5781879425048828, + "learning_rate": 0.00017354007720401878, + "loss": 1.4672, + "step": 10195 + }, + { + "epoch": 0.1324923697662444, + "grad_norm": 0.3172120451927185, + "learning_rate": 0.0001735374777421074, + "loss": 1.171, + "step": 10196 + }, + { + "epoch": 0.13250536431016027, + "grad_norm": 0.45059317350387573, + "learning_rate": 0.000173534878280196, + "loss": 1.3814, + "step": 10197 + }, + { + "epoch": 0.13251835885407615, + "grad_norm": 0.5048568248748779, + "learning_rate": 0.00017353227881828463, + "loss": 1.4982, + "step": 10198 + }, + { + "epoch": 0.13253135339799202, + "grad_norm": 0.3370574116706848, + "learning_rate": 0.00017352967935637322, + "loss": 1.3753, + "step": 10199 + }, + { + "epoch": 0.1325443479419079, + "grad_norm": 0.39954623579978943, + "learning_rate": 0.00017352707989446188, + "loss": 1.468, + "step": 10200 + }, + { + "epoch": 0.13255734248582376, + "grad_norm": 0.3422948718070984, + "learning_rate": 0.00017352448043255047, + "loss": 1.4215, + "step": 10201 + }, + { + "epoch": 0.13257033702973964, + "grad_norm": 0.3474291265010834, + "learning_rate": 0.00017352188097063907, + "loss": 1.3762, + "step": 10202 + }, + { + "epoch": 0.1325833315736555, + "grad_norm": 0.34193500876426697, + "learning_rate": 0.0001735192815087277, + "loss": 1.3661, + "step": 10203 + }, + { + "epoch": 0.13259632611757138, + "grad_norm": 0.4035217761993408, + "learning_rate": 0.00017351668204681632, + "loss": 1.4916, + "step": 10204 + }, + { + "epoch": 0.13260932066148726, + "grad_norm": 0.43940484523773193, + "learning_rate": 0.00017351408258490494, + "loss": 1.4685, + "step": 10205 + }, + { + "epoch": 0.13262231520540313, + "grad_norm": 0.3900766670703888, + "learning_rate": 0.00017351148312299354, + "loss": 1.4861, + "step": 10206 + }, + { + "epoch": 0.132635309749319, + "grad_norm": 0.373460590839386, + "learning_rate": 0.00017350888366108217, + "loss": 1.3934, + "step": 10207 + }, + { + "epoch": 0.13264830429323488, + "grad_norm": 0.38645410537719727, + "learning_rate": 0.0001735062841991708, + "loss": 1.3767, + "step": 10208 + }, + { + "epoch": 0.13266129883715075, + "grad_norm": 0.5162149667739868, + "learning_rate": 0.0001735036847372594, + "loss": 1.5425, + "step": 10209 + }, + { + "epoch": 0.13267429338106662, + "grad_norm": 0.46621623635292053, + "learning_rate": 0.000173501085275348, + "loss": 1.4586, + "step": 10210 + }, + { + "epoch": 0.1326872879249825, + "grad_norm": 0.39738011360168457, + "learning_rate": 0.0001734984858134366, + "loss": 1.4309, + "step": 10211 + }, + { + "epoch": 0.13270028246889837, + "grad_norm": 0.45434921979904175, + "learning_rate": 0.00017349588635152526, + "loss": 1.5152, + "step": 10212 + }, + { + "epoch": 0.13271327701281424, + "grad_norm": 0.3409869372844696, + "learning_rate": 0.00017349328688961386, + "loss": 1.3408, + "step": 10213 + }, + { + "epoch": 0.1327262715567301, + "grad_norm": 0.36123141646385193, + "learning_rate": 0.00017349068742770246, + "loss": 1.444, + "step": 10214 + }, + { + "epoch": 0.132739266100646, + "grad_norm": 0.4403824806213379, + "learning_rate": 0.00017348808796579108, + "loss": 1.4582, + "step": 10215 + }, + { + "epoch": 0.13275226064456186, + "grad_norm": 0.532588005065918, + "learning_rate": 0.0001734854885038797, + "loss": 1.6039, + "step": 10216 + }, + { + "epoch": 0.13276525518847773, + "grad_norm": 0.35847392678260803, + "learning_rate": 0.00017348288904196833, + "loss": 1.4615, + "step": 10217 + }, + { + "epoch": 0.1327782497323936, + "grad_norm": 0.3607536852359772, + "learning_rate": 0.00017348028958005693, + "loss": 1.292, + "step": 10218 + }, + { + "epoch": 0.13279124427630948, + "grad_norm": 0.4733213782310486, + "learning_rate": 0.00017347769011814555, + "loss": 1.6282, + "step": 10219 + }, + { + "epoch": 0.13280423882022535, + "grad_norm": 0.4726121127605438, + "learning_rate": 0.00017347509065623418, + "loss": 1.3695, + "step": 10220 + }, + { + "epoch": 0.13281723336414122, + "grad_norm": 0.3787165582180023, + "learning_rate": 0.00017347249119432277, + "loss": 1.4199, + "step": 10221 + }, + { + "epoch": 0.1328302279080571, + "grad_norm": 0.38981640338897705, + "learning_rate": 0.0001734698917324114, + "loss": 1.3939, + "step": 10222 + }, + { + "epoch": 0.13284322245197297, + "grad_norm": 0.3238557279109955, + "learning_rate": 0.00017346729227050002, + "loss": 1.292, + "step": 10223 + }, + { + "epoch": 0.13285621699588884, + "grad_norm": 0.18271692097187042, + "learning_rate": 0.00017346469280858865, + "loss": 1.1872, + "step": 10224 + }, + { + "epoch": 0.13286921153980472, + "grad_norm": 0.3624701499938965, + "learning_rate": 0.00017346209334667724, + "loss": 1.5763, + "step": 10225 + }, + { + "epoch": 0.1328822060837206, + "grad_norm": 0.3814622163772583, + "learning_rate": 0.00017345949388476587, + "loss": 1.4388, + "step": 10226 + }, + { + "epoch": 0.13289520062763646, + "grad_norm": 0.4015159010887146, + "learning_rate": 0.0001734568944228545, + "loss": 1.4904, + "step": 10227 + }, + { + "epoch": 0.13290819517155233, + "grad_norm": 0.4241006076335907, + "learning_rate": 0.0001734542949609431, + "loss": 1.3432, + "step": 10228 + }, + { + "epoch": 0.1329211897154682, + "grad_norm": 0.4571399986743927, + "learning_rate": 0.00017345169549903172, + "loss": 1.4652, + "step": 10229 + }, + { + "epoch": 0.13293418425938408, + "grad_norm": 0.6004480123519897, + "learning_rate": 0.0001734490960371203, + "loss": 1.4914, + "step": 10230 + }, + { + "epoch": 0.13294717880329995, + "grad_norm": 0.3231486976146698, + "learning_rate": 0.00017344649657520894, + "loss": 1.4043, + "step": 10231 + }, + { + "epoch": 0.13296017334721583, + "grad_norm": 0.45555949211120605, + "learning_rate": 0.00017344389711329756, + "loss": 1.5629, + "step": 10232 + }, + { + "epoch": 0.1329731678911317, + "grad_norm": 0.3109978139400482, + "learning_rate": 0.00017344129765138616, + "loss": 1.3871, + "step": 10233 + }, + { + "epoch": 0.13298616243504757, + "grad_norm": 0.4051888883113861, + "learning_rate": 0.00017343869818947478, + "loss": 1.3399, + "step": 10234 + }, + { + "epoch": 0.13299915697896345, + "grad_norm": 0.40826016664505005, + "learning_rate": 0.0001734360987275634, + "loss": 1.5187, + "step": 10235 + }, + { + "epoch": 0.13301215152287932, + "grad_norm": 0.32955506443977356, + "learning_rate": 0.00017343349926565203, + "loss": 1.4244, + "step": 10236 + }, + { + "epoch": 0.1330251460667952, + "grad_norm": 0.3049336075782776, + "learning_rate": 0.00017343089980374063, + "loss": 1.3437, + "step": 10237 + }, + { + "epoch": 0.13303814061071106, + "grad_norm": 0.3286857008934021, + "learning_rate": 0.00017342830034182925, + "loss": 1.2963, + "step": 10238 + }, + { + "epoch": 0.13305113515462694, + "grad_norm": 0.3328917920589447, + "learning_rate": 0.00017342570087991788, + "loss": 1.4539, + "step": 10239 + }, + { + "epoch": 0.13306412969854284, + "grad_norm": 0.4485374093055725, + "learning_rate": 0.00017342310141800648, + "loss": 1.4527, + "step": 10240 + }, + { + "epoch": 0.1330771242424587, + "grad_norm": 0.398308128118515, + "learning_rate": 0.0001734205019560951, + "loss": 1.496, + "step": 10241 + }, + { + "epoch": 0.13309011878637458, + "grad_norm": 0.4124026894569397, + "learning_rate": 0.0001734179024941837, + "loss": 1.4088, + "step": 10242 + }, + { + "epoch": 0.13310311333029046, + "grad_norm": 0.32842618227005005, + "learning_rate": 0.00017341530303227235, + "loss": 1.4355, + "step": 10243 + }, + { + "epoch": 0.13311610787420633, + "grad_norm": 0.372936487197876, + "learning_rate": 0.00017341270357036095, + "loss": 1.4122, + "step": 10244 + }, + { + "epoch": 0.1331291024181222, + "grad_norm": 0.3864259719848633, + "learning_rate": 0.00017341010410844954, + "loss": 1.4042, + "step": 10245 + }, + { + "epoch": 0.13314209696203808, + "grad_norm": 0.415939062833786, + "learning_rate": 0.00017340750464653817, + "loss": 1.2855, + "step": 10246 + }, + { + "epoch": 0.13315509150595395, + "grad_norm": 0.37138351798057556, + "learning_rate": 0.0001734049051846268, + "loss": 1.7111, + "step": 10247 + }, + { + "epoch": 0.13316808604986982, + "grad_norm": 0.41065409779548645, + "learning_rate": 0.00017340230572271542, + "loss": 1.4176, + "step": 10248 + }, + { + "epoch": 0.1331810805937857, + "grad_norm": 0.3087286055088043, + "learning_rate": 0.00017339970626080402, + "loss": 1.465, + "step": 10249 + }, + { + "epoch": 0.13319407513770157, + "grad_norm": 0.38127902150154114, + "learning_rate": 0.00017339710679889264, + "loss": 1.489, + "step": 10250 + }, + { + "epoch": 0.13320706968161744, + "grad_norm": 0.47871479392051697, + "learning_rate": 0.00017339450733698126, + "loss": 1.4475, + "step": 10251 + }, + { + "epoch": 0.1332200642255333, + "grad_norm": 0.39952757954597473, + "learning_rate": 0.00017339190787506986, + "loss": 1.4572, + "step": 10252 + }, + { + "epoch": 0.1332330587694492, + "grad_norm": 0.4555143415927887, + "learning_rate": 0.00017338930841315849, + "loss": 1.2898, + "step": 10253 + }, + { + "epoch": 0.13324605331336506, + "grad_norm": 0.35699325799942017, + "learning_rate": 0.00017338670895124708, + "loss": 1.4545, + "step": 10254 + }, + { + "epoch": 0.13325904785728093, + "grad_norm": 0.4163183569908142, + "learning_rate": 0.00017338410948933573, + "loss": 1.2866, + "step": 10255 + }, + { + "epoch": 0.1332720424011968, + "grad_norm": 0.35340219736099243, + "learning_rate": 0.00017338151002742433, + "loss": 1.4647, + "step": 10256 + }, + { + "epoch": 0.13328503694511268, + "grad_norm": 0.4699966013431549, + "learning_rate": 0.00017337891056551293, + "loss": 1.5466, + "step": 10257 + }, + { + "epoch": 0.13329803148902855, + "grad_norm": 0.5489582419395447, + "learning_rate": 0.00017337631110360155, + "loss": 1.6783, + "step": 10258 + }, + { + "epoch": 0.13331102603294442, + "grad_norm": 0.41623225808143616, + "learning_rate": 0.00017337371164169018, + "loss": 1.3705, + "step": 10259 + }, + { + "epoch": 0.1333240205768603, + "grad_norm": 0.4936605393886566, + "learning_rate": 0.0001733711121797788, + "loss": 1.3937, + "step": 10260 + }, + { + "epoch": 0.13333701512077617, + "grad_norm": 0.3930872082710266, + "learning_rate": 0.0001733685127178674, + "loss": 1.3575, + "step": 10261 + }, + { + "epoch": 0.13335000966469204, + "grad_norm": 0.34105145931243896, + "learning_rate": 0.00017336591325595602, + "loss": 1.2825, + "step": 10262 + }, + { + "epoch": 0.13336300420860792, + "grad_norm": 0.4458957314491272, + "learning_rate": 0.00017336331379404465, + "loss": 1.4232, + "step": 10263 + }, + { + "epoch": 0.1333759987525238, + "grad_norm": 0.45651087164878845, + "learning_rate": 0.00017336071433213325, + "loss": 1.5728, + "step": 10264 + }, + { + "epoch": 0.13338899329643966, + "grad_norm": 0.4242545962333679, + "learning_rate": 0.00017335811487022187, + "loss": 1.476, + "step": 10265 + }, + { + "epoch": 0.13340198784035553, + "grad_norm": 0.4469432532787323, + "learning_rate": 0.0001733555154083105, + "loss": 1.4461, + "step": 10266 + }, + { + "epoch": 0.1334149823842714, + "grad_norm": 0.35513532161712646, + "learning_rate": 0.00017335291594639912, + "loss": 1.3621, + "step": 10267 + }, + { + "epoch": 0.13342797692818728, + "grad_norm": 0.4214397668838501, + "learning_rate": 0.00017335031648448772, + "loss": 1.4888, + "step": 10268 + }, + { + "epoch": 0.13344097147210315, + "grad_norm": 0.4333958625793457, + "learning_rate": 0.00017334771702257632, + "loss": 1.3795, + "step": 10269 + }, + { + "epoch": 0.13345396601601903, + "grad_norm": 0.43227511644363403, + "learning_rate": 0.00017334511756066497, + "loss": 1.3898, + "step": 10270 + }, + { + "epoch": 0.1334669605599349, + "grad_norm": 0.38267451524734497, + "learning_rate": 0.00017334251809875356, + "loss": 1.5822, + "step": 10271 + }, + { + "epoch": 0.13347995510385077, + "grad_norm": 0.36044323444366455, + "learning_rate": 0.0001733399186368422, + "loss": 1.2313, + "step": 10272 + }, + { + "epoch": 0.13349294964776665, + "grad_norm": 0.37287914752960205, + "learning_rate": 0.00017333731917493079, + "loss": 1.4127, + "step": 10273 + }, + { + "epoch": 0.13350594419168252, + "grad_norm": 0.441448837518692, + "learning_rate": 0.0001733347197130194, + "loss": 1.5315, + "step": 10274 + }, + { + "epoch": 0.1335189387355984, + "grad_norm": 0.4397992491722107, + "learning_rate": 0.00017333212025110803, + "loss": 1.3751, + "step": 10275 + }, + { + "epoch": 0.13353193327951426, + "grad_norm": 0.48151400685310364, + "learning_rate": 0.00017332952078919663, + "loss": 1.5896, + "step": 10276 + }, + { + "epoch": 0.13354492782343014, + "grad_norm": 0.3794490098953247, + "learning_rate": 0.00017332692132728526, + "loss": 1.4475, + "step": 10277 + }, + { + "epoch": 0.133557922367346, + "grad_norm": 0.4280606210231781, + "learning_rate": 0.00017332432186537388, + "loss": 1.5373, + "step": 10278 + }, + { + "epoch": 0.13357091691126188, + "grad_norm": 0.4502997100353241, + "learning_rate": 0.0001733217224034625, + "loss": 1.5484, + "step": 10279 + }, + { + "epoch": 0.13358391145517776, + "grad_norm": 0.37605345249176025, + "learning_rate": 0.0001733191229415511, + "loss": 1.5437, + "step": 10280 + }, + { + "epoch": 0.13359690599909363, + "grad_norm": 0.2983585000038147, + "learning_rate": 0.00017331652347963973, + "loss": 1.2876, + "step": 10281 + }, + { + "epoch": 0.1336099005430095, + "grad_norm": 0.5259339213371277, + "learning_rate": 0.00017331392401772835, + "loss": 1.4547, + "step": 10282 + }, + { + "epoch": 0.13362289508692538, + "grad_norm": 0.49637162685394287, + "learning_rate": 0.00017331132455581695, + "loss": 1.6262, + "step": 10283 + }, + { + "epoch": 0.13363588963084125, + "grad_norm": 0.3729323744773865, + "learning_rate": 0.00017330872509390557, + "loss": 1.4826, + "step": 10284 + }, + { + "epoch": 0.13364888417475712, + "grad_norm": 0.34967002272605896, + "learning_rate": 0.00017330612563199417, + "loss": 1.3662, + "step": 10285 + }, + { + "epoch": 0.133661878718673, + "grad_norm": 0.3983106315135956, + "learning_rate": 0.0001733035261700828, + "loss": 1.5116, + "step": 10286 + }, + { + "epoch": 0.13367487326258887, + "grad_norm": 0.403053343296051, + "learning_rate": 0.00017330092670817142, + "loss": 1.4737, + "step": 10287 + }, + { + "epoch": 0.13368786780650474, + "grad_norm": 0.4467945396900177, + "learning_rate": 0.00017329832724626002, + "loss": 1.3548, + "step": 10288 + }, + { + "epoch": 0.1337008623504206, + "grad_norm": 0.4631403684616089, + "learning_rate": 0.00017329572778434864, + "loss": 1.3214, + "step": 10289 + }, + { + "epoch": 0.13371385689433649, + "grad_norm": 0.5224770307540894, + "learning_rate": 0.00017329312832243727, + "loss": 1.3509, + "step": 10290 + }, + { + "epoch": 0.13372685143825236, + "grad_norm": 0.4467334449291229, + "learning_rate": 0.0001732905288605259, + "loss": 1.597, + "step": 10291 + }, + { + "epoch": 0.13373984598216823, + "grad_norm": 0.4227731227874756, + "learning_rate": 0.0001732879293986145, + "loss": 1.5186, + "step": 10292 + }, + { + "epoch": 0.1337528405260841, + "grad_norm": 0.4261920750141144, + "learning_rate": 0.0001732853299367031, + "loss": 1.7286, + "step": 10293 + }, + { + "epoch": 0.13376583506999998, + "grad_norm": 0.41498351097106934, + "learning_rate": 0.00017328273047479174, + "loss": 1.4786, + "step": 10294 + }, + { + "epoch": 0.13377882961391585, + "grad_norm": 0.39950403571128845, + "learning_rate": 0.00017328013101288033, + "loss": 1.6114, + "step": 10295 + }, + { + "epoch": 0.13379182415783172, + "grad_norm": 0.38255569338798523, + "learning_rate": 0.00017327753155096896, + "loss": 1.5391, + "step": 10296 + }, + { + "epoch": 0.1338048187017476, + "grad_norm": 0.3950129449367523, + "learning_rate": 0.00017327493208905758, + "loss": 1.6083, + "step": 10297 + }, + { + "epoch": 0.13381781324566347, + "grad_norm": 0.4778757095336914, + "learning_rate": 0.00017327233262714618, + "loss": 1.3761, + "step": 10298 + }, + { + "epoch": 0.13383080778957934, + "grad_norm": 0.4529207646846771, + "learning_rate": 0.0001732697331652348, + "loss": 1.5515, + "step": 10299 + }, + { + "epoch": 0.13384380233349522, + "grad_norm": 0.38711661100387573, + "learning_rate": 0.0001732671337033234, + "loss": 1.2721, + "step": 10300 + }, + { + "epoch": 0.1338567968774111, + "grad_norm": 0.3175511360168457, + "learning_rate": 0.00017326453424141205, + "loss": 1.4429, + "step": 10301 + }, + { + "epoch": 0.13386979142132696, + "grad_norm": 0.36104458570480347, + "learning_rate": 0.00017326193477950065, + "loss": 1.4208, + "step": 10302 + }, + { + "epoch": 0.13388278596524283, + "grad_norm": 0.3169723451137543, + "learning_rate": 0.00017325933531758928, + "loss": 1.4414, + "step": 10303 + }, + { + "epoch": 0.1338957805091587, + "grad_norm": 0.4668019711971283, + "learning_rate": 0.00017325673585567787, + "loss": 1.3821, + "step": 10304 + }, + { + "epoch": 0.13390877505307458, + "grad_norm": 0.319766640663147, + "learning_rate": 0.0001732541363937665, + "loss": 1.3179, + "step": 10305 + }, + { + "epoch": 0.13392176959699045, + "grad_norm": 0.46580541133880615, + "learning_rate": 0.00017325153693185512, + "loss": 1.4061, + "step": 10306 + }, + { + "epoch": 0.13393476414090633, + "grad_norm": 0.41201043128967285, + "learning_rate": 0.00017324893746994372, + "loss": 1.4647, + "step": 10307 + }, + { + "epoch": 0.1339477586848222, + "grad_norm": 0.4252960681915283, + "learning_rate": 0.00017324633800803234, + "loss": 1.3306, + "step": 10308 + }, + { + "epoch": 0.13396075322873807, + "grad_norm": 0.5184053778648376, + "learning_rate": 0.00017324373854612097, + "loss": 1.3388, + "step": 10309 + }, + { + "epoch": 0.13397374777265394, + "grad_norm": 0.380666047334671, + "learning_rate": 0.0001732411390842096, + "loss": 1.3891, + "step": 10310 + }, + { + "epoch": 0.13398674231656982, + "grad_norm": 0.41391023993492126, + "learning_rate": 0.0001732385396222982, + "loss": 1.7379, + "step": 10311 + }, + { + "epoch": 0.1339997368604857, + "grad_norm": 0.402710884809494, + "learning_rate": 0.0001732359401603868, + "loss": 1.4074, + "step": 10312 + }, + { + "epoch": 0.13401273140440156, + "grad_norm": 0.39145785570144653, + "learning_rate": 0.00017323334069847544, + "loss": 1.3624, + "step": 10313 + }, + { + "epoch": 0.13402572594831744, + "grad_norm": 0.43482527136802673, + "learning_rate": 0.00017323074123656404, + "loss": 1.3909, + "step": 10314 + }, + { + "epoch": 0.1340387204922333, + "grad_norm": 0.38763582706451416, + "learning_rate": 0.00017322814177465266, + "loss": 1.4342, + "step": 10315 + }, + { + "epoch": 0.1340517150361492, + "grad_norm": 0.40869957208633423, + "learning_rate": 0.00017322554231274126, + "loss": 1.5143, + "step": 10316 + }, + { + "epoch": 0.13406470958006508, + "grad_norm": 0.3941170573234558, + "learning_rate": 0.00017322294285082988, + "loss": 1.5095, + "step": 10317 + }, + { + "epoch": 0.13407770412398096, + "grad_norm": 0.4547525942325592, + "learning_rate": 0.0001732203433889185, + "loss": 1.3317, + "step": 10318 + }, + { + "epoch": 0.13409069866789683, + "grad_norm": 0.3826883137226105, + "learning_rate": 0.0001732177439270071, + "loss": 1.5269, + "step": 10319 + }, + { + "epoch": 0.1341036932118127, + "grad_norm": 0.392856240272522, + "learning_rate": 0.00017321514446509573, + "loss": 1.5489, + "step": 10320 + }, + { + "epoch": 0.13411668775572858, + "grad_norm": 0.452817440032959, + "learning_rate": 0.00017321254500318435, + "loss": 1.4722, + "step": 10321 + }, + { + "epoch": 0.13412968229964445, + "grad_norm": 0.3699339032173157, + "learning_rate": 0.00017320994554127298, + "loss": 1.2521, + "step": 10322 + }, + { + "epoch": 0.13414267684356032, + "grad_norm": 0.40437933802604675, + "learning_rate": 0.00017320734607936158, + "loss": 1.4651, + "step": 10323 + }, + { + "epoch": 0.1341556713874762, + "grad_norm": 0.4699774980545044, + "learning_rate": 0.00017320474661745017, + "loss": 1.4492, + "step": 10324 + }, + { + "epoch": 0.13416866593139207, + "grad_norm": 0.3139057159423828, + "learning_rate": 0.00017320214715553883, + "loss": 1.3312, + "step": 10325 + }, + { + "epoch": 0.13418166047530794, + "grad_norm": 0.4362099766731262, + "learning_rate": 0.00017319954769362742, + "loss": 1.4336, + "step": 10326 + }, + { + "epoch": 0.1341946550192238, + "grad_norm": 0.38473936915397644, + "learning_rate": 0.00017319694823171605, + "loss": 1.4587, + "step": 10327 + }, + { + "epoch": 0.13420764956313969, + "grad_norm": 0.4335325062274933, + "learning_rate": 0.00017319434876980464, + "loss": 1.3493, + "step": 10328 + }, + { + "epoch": 0.13422064410705556, + "grad_norm": 0.41719844937324524, + "learning_rate": 0.00017319174930789327, + "loss": 1.434, + "step": 10329 + }, + { + "epoch": 0.13423363865097143, + "grad_norm": 0.46054530143737793, + "learning_rate": 0.0001731891498459819, + "loss": 1.4347, + "step": 10330 + }, + { + "epoch": 0.1342466331948873, + "grad_norm": 0.4843178391456604, + "learning_rate": 0.0001731865503840705, + "loss": 1.4339, + "step": 10331 + }, + { + "epoch": 0.13425962773880318, + "grad_norm": 0.38382700085639954, + "learning_rate": 0.00017318395092215912, + "loss": 1.4265, + "step": 10332 + }, + { + "epoch": 0.13427262228271905, + "grad_norm": 0.43007034063339233, + "learning_rate": 0.00017318135146024774, + "loss": 1.7029, + "step": 10333 + }, + { + "epoch": 0.13428561682663492, + "grad_norm": 0.42118921875953674, + "learning_rate": 0.00017317875199833636, + "loss": 1.4431, + "step": 10334 + }, + { + "epoch": 0.1342986113705508, + "grad_norm": 0.43030717968940735, + "learning_rate": 0.00017317615253642496, + "loss": 1.5413, + "step": 10335 + }, + { + "epoch": 0.13431160591446667, + "grad_norm": 0.46663835644721985, + "learning_rate": 0.00017317355307451359, + "loss": 1.4771, + "step": 10336 + }, + { + "epoch": 0.13432460045838254, + "grad_norm": 0.39839795231819153, + "learning_rate": 0.0001731709536126022, + "loss": 1.3301, + "step": 10337 + }, + { + "epoch": 0.13433759500229842, + "grad_norm": 0.41001981496810913, + "learning_rate": 0.0001731683541506908, + "loss": 1.4955, + "step": 10338 + }, + { + "epoch": 0.1343505895462143, + "grad_norm": 0.41984260082244873, + "learning_rate": 0.00017316575468877943, + "loss": 1.506, + "step": 10339 + }, + { + "epoch": 0.13436358409013016, + "grad_norm": 0.4015372395515442, + "learning_rate": 0.00017316315522686806, + "loss": 1.4426, + "step": 10340 + }, + { + "epoch": 0.13437657863404603, + "grad_norm": 0.4550345838069916, + "learning_rate": 0.00017316055576495665, + "loss": 1.343, + "step": 10341 + }, + { + "epoch": 0.1343895731779619, + "grad_norm": 0.33817529678344727, + "learning_rate": 0.00017315795630304528, + "loss": 1.6399, + "step": 10342 + }, + { + "epoch": 0.13440256772187778, + "grad_norm": 0.3979460895061493, + "learning_rate": 0.00017315535684113388, + "loss": 1.5058, + "step": 10343 + }, + { + "epoch": 0.13441556226579365, + "grad_norm": 0.34761905670166016, + "learning_rate": 0.00017315275737922253, + "loss": 1.4553, + "step": 10344 + }, + { + "epoch": 0.13442855680970953, + "grad_norm": 0.4102572798728943, + "learning_rate": 0.00017315015791731113, + "loss": 1.4372, + "step": 10345 + }, + { + "epoch": 0.1344415513536254, + "grad_norm": 0.4973316788673401, + "learning_rate": 0.00017314755845539975, + "loss": 1.3344, + "step": 10346 + }, + { + "epoch": 0.13445454589754127, + "grad_norm": 0.3183603286743164, + "learning_rate": 0.00017314495899348835, + "loss": 1.1312, + "step": 10347 + }, + { + "epoch": 0.13446754044145715, + "grad_norm": 0.360584020614624, + "learning_rate": 0.00017314235953157697, + "loss": 1.4803, + "step": 10348 + }, + { + "epoch": 0.13448053498537302, + "grad_norm": 0.44296449422836304, + "learning_rate": 0.0001731397600696656, + "loss": 1.5199, + "step": 10349 + }, + { + "epoch": 0.1344935295292889, + "grad_norm": 0.25250524282455444, + "learning_rate": 0.0001731371606077542, + "loss": 1.0543, + "step": 10350 + }, + { + "epoch": 0.13450652407320476, + "grad_norm": 0.3233751058578491, + "learning_rate": 0.00017313456114584282, + "loss": 1.4248, + "step": 10351 + }, + { + "epoch": 0.13451951861712064, + "grad_norm": 0.32315072417259216, + "learning_rate": 0.00017313196168393144, + "loss": 1.4422, + "step": 10352 + }, + { + "epoch": 0.1345325131610365, + "grad_norm": 0.3747360110282898, + "learning_rate": 0.00017312936222202004, + "loss": 1.3942, + "step": 10353 + }, + { + "epoch": 0.13454550770495238, + "grad_norm": 0.33627772331237793, + "learning_rate": 0.00017312676276010866, + "loss": 1.4478, + "step": 10354 + }, + { + "epoch": 0.13455850224886826, + "grad_norm": 0.38291966915130615, + "learning_rate": 0.00017312416329819726, + "loss": 1.3497, + "step": 10355 + }, + { + "epoch": 0.13457149679278413, + "grad_norm": 0.4060259461402893, + "learning_rate": 0.0001731215638362859, + "loss": 1.3532, + "step": 10356 + }, + { + "epoch": 0.1345844913367, + "grad_norm": 0.4041518270969391, + "learning_rate": 0.0001731189643743745, + "loss": 1.4656, + "step": 10357 + }, + { + "epoch": 0.13459748588061587, + "grad_norm": 0.4120069742202759, + "learning_rate": 0.00017311636491246314, + "loss": 1.3757, + "step": 10358 + }, + { + "epoch": 0.13461048042453175, + "grad_norm": 0.3214429020881653, + "learning_rate": 0.00017311376545055173, + "loss": 1.2607, + "step": 10359 + }, + { + "epoch": 0.13462347496844762, + "grad_norm": 0.4030624330043793, + "learning_rate": 0.00017311116598864036, + "loss": 1.5785, + "step": 10360 + }, + { + "epoch": 0.1346364695123635, + "grad_norm": 0.35320013761520386, + "learning_rate": 0.00017310856652672898, + "loss": 1.269, + "step": 10361 + }, + { + "epoch": 0.13464946405627937, + "grad_norm": 0.35841625928878784, + "learning_rate": 0.00017310596706481758, + "loss": 1.548, + "step": 10362 + }, + { + "epoch": 0.13466245860019524, + "grad_norm": 0.3769742250442505, + "learning_rate": 0.0001731033676029062, + "loss": 1.3, + "step": 10363 + }, + { + "epoch": 0.1346754531441111, + "grad_norm": 0.45740842819213867, + "learning_rate": 0.00017310076814099483, + "loss": 1.3847, + "step": 10364 + }, + { + "epoch": 0.13468844768802699, + "grad_norm": 0.335723876953125, + "learning_rate": 0.00017309816867908345, + "loss": 1.4011, + "step": 10365 + }, + { + "epoch": 0.13470144223194286, + "grad_norm": 0.3708224296569824, + "learning_rate": 0.00017309556921717205, + "loss": 1.3776, + "step": 10366 + }, + { + "epoch": 0.13471443677585873, + "grad_norm": 0.4020022451877594, + "learning_rate": 0.00017309296975526065, + "loss": 1.4931, + "step": 10367 + }, + { + "epoch": 0.1347274313197746, + "grad_norm": 0.42780259251594543, + "learning_rate": 0.0001730903702933493, + "loss": 1.5831, + "step": 10368 + }, + { + "epoch": 0.13474042586369048, + "grad_norm": 0.41389259696006775, + "learning_rate": 0.0001730877708314379, + "loss": 1.4241, + "step": 10369 + }, + { + "epoch": 0.13475342040760635, + "grad_norm": 0.3301853835582733, + "learning_rate": 0.00017308517136952652, + "loss": 1.4376, + "step": 10370 + }, + { + "epoch": 0.13476641495152222, + "grad_norm": 0.3607798218727112, + "learning_rate": 0.00017308257190761512, + "loss": 1.2714, + "step": 10371 + }, + { + "epoch": 0.1347794094954381, + "grad_norm": 0.39608651399612427, + "learning_rate": 0.00017307997244570374, + "loss": 1.4355, + "step": 10372 + }, + { + "epoch": 0.13479240403935397, + "grad_norm": 0.4363602101802826, + "learning_rate": 0.00017307737298379237, + "loss": 1.3418, + "step": 10373 + }, + { + "epoch": 0.13480539858326984, + "grad_norm": 0.4446527659893036, + "learning_rate": 0.00017307477352188096, + "loss": 1.4276, + "step": 10374 + }, + { + "epoch": 0.13481839312718571, + "grad_norm": 0.3339526057243347, + "learning_rate": 0.00017307217405996962, + "loss": 1.3378, + "step": 10375 + }, + { + "epoch": 0.1348313876711016, + "grad_norm": 0.33654919266700745, + "learning_rate": 0.0001730695745980582, + "loss": 1.2298, + "step": 10376 + }, + { + "epoch": 0.13484438221501746, + "grad_norm": 0.4624748229980469, + "learning_rate": 0.00017306697513614684, + "loss": 1.4168, + "step": 10377 + }, + { + "epoch": 0.13485737675893333, + "grad_norm": 0.47972872853279114, + "learning_rate": 0.00017306437567423544, + "loss": 1.4545, + "step": 10378 + }, + { + "epoch": 0.1348703713028492, + "grad_norm": 0.49401047825813293, + "learning_rate": 0.00017306177621232406, + "loss": 1.4005, + "step": 10379 + }, + { + "epoch": 0.13488336584676508, + "grad_norm": 0.511713445186615, + "learning_rate": 0.00017305917675041268, + "loss": 1.3228, + "step": 10380 + }, + { + "epoch": 0.13489636039068095, + "grad_norm": 0.458068311214447, + "learning_rate": 0.00017305657728850128, + "loss": 1.3656, + "step": 10381 + }, + { + "epoch": 0.13490935493459683, + "grad_norm": 0.43266889452934265, + "learning_rate": 0.0001730539778265899, + "loss": 1.375, + "step": 10382 + }, + { + "epoch": 0.1349223494785127, + "grad_norm": 0.395559698343277, + "learning_rate": 0.00017305137836467853, + "loss": 1.3088, + "step": 10383 + }, + { + "epoch": 0.13493534402242857, + "grad_norm": 0.4026636779308319, + "learning_rate": 0.00017304877890276713, + "loss": 1.2945, + "step": 10384 + }, + { + "epoch": 0.13494833856634444, + "grad_norm": 0.5190314650535583, + "learning_rate": 0.00017304617944085575, + "loss": 1.4983, + "step": 10385 + }, + { + "epoch": 0.13496133311026032, + "grad_norm": 0.49648547172546387, + "learning_rate": 0.00017304357997894435, + "loss": 1.5121, + "step": 10386 + }, + { + "epoch": 0.1349743276541762, + "grad_norm": 0.46948403120040894, + "learning_rate": 0.000173040980517033, + "loss": 1.449, + "step": 10387 + }, + { + "epoch": 0.13498732219809206, + "grad_norm": 0.40046605467796326, + "learning_rate": 0.0001730383810551216, + "loss": 1.4601, + "step": 10388 + }, + { + "epoch": 0.13500031674200794, + "grad_norm": 0.3224344849586487, + "learning_rate": 0.00017303578159321022, + "loss": 1.4101, + "step": 10389 + }, + { + "epoch": 0.1350133112859238, + "grad_norm": 0.3160516619682312, + "learning_rate": 0.00017303318213129882, + "loss": 1.4129, + "step": 10390 + }, + { + "epoch": 0.13502630582983968, + "grad_norm": 0.46160274744033813, + "learning_rate": 0.00017303058266938745, + "loss": 1.4075, + "step": 10391 + }, + { + "epoch": 0.13503930037375558, + "grad_norm": 0.3166397213935852, + "learning_rate": 0.00017302798320747607, + "loss": 1.0969, + "step": 10392 + }, + { + "epoch": 0.13505229491767146, + "grad_norm": 0.4604710042476654, + "learning_rate": 0.00017302538374556467, + "loss": 1.4373, + "step": 10393 + }, + { + "epoch": 0.13506528946158733, + "grad_norm": 0.452091783285141, + "learning_rate": 0.0001730227842836533, + "loss": 1.5174, + "step": 10394 + }, + { + "epoch": 0.1350782840055032, + "grad_norm": 0.4143034517765045, + "learning_rate": 0.00017302018482174192, + "loss": 1.5032, + "step": 10395 + }, + { + "epoch": 0.13509127854941907, + "grad_norm": 0.30584678053855896, + "learning_rate": 0.0001730175853598305, + "loss": 1.1431, + "step": 10396 + }, + { + "epoch": 0.13510427309333495, + "grad_norm": 0.35098373889923096, + "learning_rate": 0.00017301498589791914, + "loss": 1.3377, + "step": 10397 + }, + { + "epoch": 0.13511726763725082, + "grad_norm": 0.4178418219089508, + "learning_rate": 0.00017301238643600774, + "loss": 1.4858, + "step": 10398 + }, + { + "epoch": 0.1351302621811667, + "grad_norm": 0.45049336552619934, + "learning_rate": 0.0001730097869740964, + "loss": 1.4288, + "step": 10399 + }, + { + "epoch": 0.13514325672508257, + "grad_norm": 0.6616224646568298, + "learning_rate": 0.00017300718751218498, + "loss": 1.5191, + "step": 10400 + }, + { + "epoch": 0.13515625126899844, + "grad_norm": 0.44530558586120605, + "learning_rate": 0.0001730045880502736, + "loss": 1.3849, + "step": 10401 + }, + { + "epoch": 0.1351692458129143, + "grad_norm": 0.22822310030460358, + "learning_rate": 0.0001730019885883622, + "loss": 1.0842, + "step": 10402 + }, + { + "epoch": 0.13518224035683019, + "grad_norm": 0.2947538495063782, + "learning_rate": 0.00017299938912645083, + "loss": 1.4456, + "step": 10403 + }, + { + "epoch": 0.13519523490074606, + "grad_norm": 0.4881010353565216, + "learning_rate": 0.00017299678966453945, + "loss": 1.4939, + "step": 10404 + }, + { + "epoch": 0.13520822944466193, + "grad_norm": 0.40051162242889404, + "learning_rate": 0.00017299419020262805, + "loss": 1.5808, + "step": 10405 + }, + { + "epoch": 0.1352212239885778, + "grad_norm": 0.3905968964099884, + "learning_rate": 0.00017299159074071668, + "loss": 1.1902, + "step": 10406 + }, + { + "epoch": 0.13523421853249368, + "grad_norm": 0.43358638882637024, + "learning_rate": 0.0001729889912788053, + "loss": 1.3914, + "step": 10407 + }, + { + "epoch": 0.13524721307640955, + "grad_norm": 0.2835332453250885, + "learning_rate": 0.0001729863918168939, + "loss": 1.3044, + "step": 10408 + }, + { + "epoch": 0.13526020762032542, + "grad_norm": 0.41867509484291077, + "learning_rate": 0.00017298379235498252, + "loss": 1.483, + "step": 10409 + }, + { + "epoch": 0.1352732021642413, + "grad_norm": 0.4036106467247009, + "learning_rate": 0.00017298119289307115, + "loss": 1.2726, + "step": 10410 + }, + { + "epoch": 0.13528619670815717, + "grad_norm": 0.2868950068950653, + "learning_rate": 0.00017297859343115977, + "loss": 1.3164, + "step": 10411 + }, + { + "epoch": 0.13529919125207304, + "grad_norm": 0.3779778778553009, + "learning_rate": 0.00017297599396924837, + "loss": 1.5453, + "step": 10412 + }, + { + "epoch": 0.13531218579598892, + "grad_norm": 0.5382668972015381, + "learning_rate": 0.000172973394507337, + "loss": 1.5733, + "step": 10413 + }, + { + "epoch": 0.1353251803399048, + "grad_norm": 0.4318283796310425, + "learning_rate": 0.00017297079504542562, + "loss": 1.6126, + "step": 10414 + }, + { + "epoch": 0.13533817488382066, + "grad_norm": 0.3862241208553314, + "learning_rate": 0.00017296819558351422, + "loss": 1.2677, + "step": 10415 + }, + { + "epoch": 0.13535116942773653, + "grad_norm": 0.46455109119415283, + "learning_rate": 0.00017296559612160284, + "loss": 1.351, + "step": 10416 + }, + { + "epoch": 0.1353641639716524, + "grad_norm": 0.4568712115287781, + "learning_rate": 0.00017296299665969144, + "loss": 1.5687, + "step": 10417 + }, + { + "epoch": 0.13537715851556828, + "grad_norm": 0.3928455710411072, + "learning_rate": 0.0001729603971977801, + "loss": 1.3934, + "step": 10418 + }, + { + "epoch": 0.13539015305948415, + "grad_norm": 0.4216494560241699, + "learning_rate": 0.0001729577977358687, + "loss": 1.4433, + "step": 10419 + }, + { + "epoch": 0.13540314760340003, + "grad_norm": 0.4168281555175781, + "learning_rate": 0.00017295519827395728, + "loss": 1.6246, + "step": 10420 + }, + { + "epoch": 0.1354161421473159, + "grad_norm": 0.5439172387123108, + "learning_rate": 0.0001729525988120459, + "loss": 1.6783, + "step": 10421 + }, + { + "epoch": 0.13542913669123177, + "grad_norm": 0.40333667397499084, + "learning_rate": 0.00017294999935013453, + "loss": 1.4696, + "step": 10422 + }, + { + "epoch": 0.13544213123514764, + "grad_norm": 0.3222063183784485, + "learning_rate": 0.00017294739988822316, + "loss": 1.5009, + "step": 10423 + }, + { + "epoch": 0.13545512577906352, + "grad_norm": 0.36979150772094727, + "learning_rate": 0.00017294480042631175, + "loss": 1.3801, + "step": 10424 + }, + { + "epoch": 0.1354681203229794, + "grad_norm": 0.3979223072528839, + "learning_rate": 0.00017294220096440038, + "loss": 1.3626, + "step": 10425 + }, + { + "epoch": 0.13548111486689526, + "grad_norm": 0.4525166451931, + "learning_rate": 0.000172939601502489, + "loss": 1.4386, + "step": 10426 + }, + { + "epoch": 0.13549410941081114, + "grad_norm": 0.24766109883785248, + "learning_rate": 0.0001729370020405776, + "loss": 1.3634, + "step": 10427 + }, + { + "epoch": 0.135507103954727, + "grad_norm": 0.34726038575172424, + "learning_rate": 0.00017293440257866623, + "loss": 1.3956, + "step": 10428 + }, + { + "epoch": 0.13552009849864288, + "grad_norm": 0.3226400315761566, + "learning_rate": 0.00017293180311675482, + "loss": 1.1377, + "step": 10429 + }, + { + "epoch": 0.13553309304255876, + "grad_norm": 0.25845417380332947, + "learning_rate": 0.00017292920365484347, + "loss": 1.185, + "step": 10430 + }, + { + "epoch": 0.13554608758647463, + "grad_norm": 0.41717925667762756, + "learning_rate": 0.00017292660419293207, + "loss": 1.4206, + "step": 10431 + }, + { + "epoch": 0.1355590821303905, + "grad_norm": 0.35647621750831604, + "learning_rate": 0.0001729240047310207, + "loss": 1.5524, + "step": 10432 + }, + { + "epoch": 0.13557207667430637, + "grad_norm": 0.37825509905815125, + "learning_rate": 0.0001729214052691093, + "loss": 1.4484, + "step": 10433 + }, + { + "epoch": 0.13558507121822225, + "grad_norm": 0.35352712869644165, + "learning_rate": 0.00017291880580719792, + "loss": 1.4822, + "step": 10434 + }, + { + "epoch": 0.13559806576213812, + "grad_norm": 0.3956758379936218, + "learning_rate": 0.00017291620634528654, + "loss": 1.475, + "step": 10435 + }, + { + "epoch": 0.135611060306054, + "grad_norm": 0.22947891056537628, + "learning_rate": 0.00017291360688337514, + "loss": 1.3214, + "step": 10436 + }, + { + "epoch": 0.13562405484996987, + "grad_norm": 0.380686491727829, + "learning_rate": 0.00017291100742146376, + "loss": 1.3786, + "step": 10437 + }, + { + "epoch": 0.13563704939388574, + "grad_norm": 0.39232754707336426, + "learning_rate": 0.0001729084079595524, + "loss": 1.3554, + "step": 10438 + }, + { + "epoch": 0.1356500439378016, + "grad_norm": 0.45147377252578735, + "learning_rate": 0.000172905808497641, + "loss": 1.5893, + "step": 10439 + }, + { + "epoch": 0.13566303848171749, + "grad_norm": 0.4316250681877136, + "learning_rate": 0.0001729032090357296, + "loss": 1.6121, + "step": 10440 + }, + { + "epoch": 0.13567603302563336, + "grad_norm": 0.4013034999370575, + "learning_rate": 0.0001729006095738182, + "loss": 1.4084, + "step": 10441 + }, + { + "epoch": 0.13568902756954923, + "grad_norm": 0.3859449326992035, + "learning_rate": 0.00017289801011190686, + "loss": 1.4563, + "step": 10442 + }, + { + "epoch": 0.1357020221134651, + "grad_norm": 0.44385725259780884, + "learning_rate": 0.00017289541064999546, + "loss": 1.5617, + "step": 10443 + }, + { + "epoch": 0.13571501665738098, + "grad_norm": 0.30905887484550476, + "learning_rate": 0.00017289281118808408, + "loss": 1.3442, + "step": 10444 + }, + { + "epoch": 0.13572801120129685, + "grad_norm": 0.3928665518760681, + "learning_rate": 0.00017289021172617268, + "loss": 1.2986, + "step": 10445 + }, + { + "epoch": 0.13574100574521272, + "grad_norm": 0.4879981279373169, + "learning_rate": 0.0001728876122642613, + "loss": 1.6302, + "step": 10446 + }, + { + "epoch": 0.1357540002891286, + "grad_norm": 0.3093537986278534, + "learning_rate": 0.00017288501280234993, + "loss": 1.4506, + "step": 10447 + }, + { + "epoch": 0.13576699483304447, + "grad_norm": 0.3475709557533264, + "learning_rate": 0.00017288241334043853, + "loss": 1.3921, + "step": 10448 + }, + { + "epoch": 0.13577998937696034, + "grad_norm": 0.3754947781562805, + "learning_rate": 0.00017287981387852718, + "loss": 1.3088, + "step": 10449 + }, + { + "epoch": 0.13579298392087621, + "grad_norm": 0.4017201066017151, + "learning_rate": 0.00017287721441661577, + "loss": 1.5099, + "step": 10450 + }, + { + "epoch": 0.1358059784647921, + "grad_norm": 0.42656177282333374, + "learning_rate": 0.00017287461495470437, + "loss": 1.5031, + "step": 10451 + }, + { + "epoch": 0.13581897300870796, + "grad_norm": 0.42006027698516846, + "learning_rate": 0.000172872015492793, + "loss": 1.465, + "step": 10452 + }, + { + "epoch": 0.13583196755262383, + "grad_norm": 0.44549089670181274, + "learning_rate": 0.00017286941603088162, + "loss": 1.3782, + "step": 10453 + }, + { + "epoch": 0.1358449620965397, + "grad_norm": 0.340950608253479, + "learning_rate": 0.00017286681656897025, + "loss": 1.2409, + "step": 10454 + }, + { + "epoch": 0.13585795664045558, + "grad_norm": 0.46853703260421753, + "learning_rate": 0.00017286421710705884, + "loss": 1.4525, + "step": 10455 + }, + { + "epoch": 0.13587095118437145, + "grad_norm": 0.28742673993110657, + "learning_rate": 0.00017286161764514747, + "loss": 1.2775, + "step": 10456 + }, + { + "epoch": 0.13588394572828733, + "grad_norm": 0.7063469886779785, + "learning_rate": 0.0001728590181832361, + "loss": 1.4862, + "step": 10457 + }, + { + "epoch": 0.1358969402722032, + "grad_norm": 0.4160999357700348, + "learning_rate": 0.0001728564187213247, + "loss": 1.5103, + "step": 10458 + }, + { + "epoch": 0.13590993481611907, + "grad_norm": 0.2786518335342407, + "learning_rate": 0.00017285381925941331, + "loss": 1.4151, + "step": 10459 + }, + { + "epoch": 0.13592292936003494, + "grad_norm": 0.37167608737945557, + "learning_rate": 0.0001728512197975019, + "loss": 1.4621, + "step": 10460 + }, + { + "epoch": 0.13593592390395082, + "grad_norm": 0.3835524916648865, + "learning_rate": 0.00017284862033559056, + "loss": 1.4056, + "step": 10461 + }, + { + "epoch": 0.1359489184478667, + "grad_norm": 0.43963563442230225, + "learning_rate": 0.00017284602087367916, + "loss": 1.3055, + "step": 10462 + }, + { + "epoch": 0.13596191299178256, + "grad_norm": 0.3543713092803955, + "learning_rate": 0.00017284342141176776, + "loss": 1.3345, + "step": 10463 + }, + { + "epoch": 0.13597490753569844, + "grad_norm": 0.3504061698913574, + "learning_rate": 0.00017284082194985638, + "loss": 1.6044, + "step": 10464 + }, + { + "epoch": 0.1359879020796143, + "grad_norm": 0.36942625045776367, + "learning_rate": 0.000172838222487945, + "loss": 1.4402, + "step": 10465 + }, + { + "epoch": 0.13600089662353018, + "grad_norm": 0.38377702236175537, + "learning_rate": 0.00017283562302603363, + "loss": 1.444, + "step": 10466 + }, + { + "epoch": 0.13601389116744605, + "grad_norm": 0.3686785101890564, + "learning_rate": 0.00017283302356412223, + "loss": 1.4189, + "step": 10467 + }, + { + "epoch": 0.13602688571136196, + "grad_norm": 0.4503157436847687, + "learning_rate": 0.00017283042410221085, + "loss": 1.4786, + "step": 10468 + }, + { + "epoch": 0.13603988025527783, + "grad_norm": 0.37379541993141174, + "learning_rate": 0.00017282782464029948, + "loss": 1.665, + "step": 10469 + }, + { + "epoch": 0.1360528747991937, + "grad_norm": 0.53169184923172, + "learning_rate": 0.00017282522517838807, + "loss": 1.3902, + "step": 10470 + }, + { + "epoch": 0.13606586934310957, + "grad_norm": 0.34697386622428894, + "learning_rate": 0.0001728226257164767, + "loss": 1.3928, + "step": 10471 + }, + { + "epoch": 0.13607886388702545, + "grad_norm": 0.3834492266178131, + "learning_rate": 0.0001728200262545653, + "loss": 1.3315, + "step": 10472 + }, + { + "epoch": 0.13609185843094132, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.00017281742679265395, + "loss": 1.4145, + "step": 10473 + }, + { + "epoch": 0.1361048529748572, + "grad_norm": 0.38613563776016235, + "learning_rate": 0.00017281482733074255, + "loss": 1.4233, + "step": 10474 + }, + { + "epoch": 0.13611784751877307, + "grad_norm": 0.34394288063049316, + "learning_rate": 0.00017281222786883114, + "loss": 1.3178, + "step": 10475 + }, + { + "epoch": 0.13613084206268894, + "grad_norm": 0.4280282258987427, + "learning_rate": 0.00017280962840691977, + "loss": 1.4976, + "step": 10476 + }, + { + "epoch": 0.1361438366066048, + "grad_norm": 0.38222944736480713, + "learning_rate": 0.0001728070289450084, + "loss": 1.4249, + "step": 10477 + }, + { + "epoch": 0.13615683115052069, + "grad_norm": 0.4223169684410095, + "learning_rate": 0.00017280442948309702, + "loss": 1.4968, + "step": 10478 + }, + { + "epoch": 0.13616982569443656, + "grad_norm": 0.3264272212982178, + "learning_rate": 0.00017280183002118561, + "loss": 1.3399, + "step": 10479 + }, + { + "epoch": 0.13618282023835243, + "grad_norm": 0.41677117347717285, + "learning_rate": 0.00017279923055927424, + "loss": 1.5063, + "step": 10480 + }, + { + "epoch": 0.1361958147822683, + "grad_norm": 0.36260271072387695, + "learning_rate": 0.00017279663109736286, + "loss": 1.3966, + "step": 10481 + }, + { + "epoch": 0.13620880932618418, + "grad_norm": 0.3674204349517822, + "learning_rate": 0.00017279403163545146, + "loss": 1.383, + "step": 10482 + }, + { + "epoch": 0.13622180387010005, + "grad_norm": 0.37665751576423645, + "learning_rate": 0.00017279143217354008, + "loss": 1.4083, + "step": 10483 + }, + { + "epoch": 0.13623479841401592, + "grad_norm": 0.426018089056015, + "learning_rate": 0.0001727888327116287, + "loss": 1.3924, + "step": 10484 + }, + { + "epoch": 0.1362477929579318, + "grad_norm": 0.3398778736591339, + "learning_rate": 0.00017278623324971733, + "loss": 1.5797, + "step": 10485 + }, + { + "epoch": 0.13626078750184767, + "grad_norm": 0.29407134652137756, + "learning_rate": 0.00017278363378780593, + "loss": 1.509, + "step": 10486 + }, + { + "epoch": 0.13627378204576354, + "grad_norm": 0.44475987553596497, + "learning_rate": 0.00017278103432589456, + "loss": 1.2657, + "step": 10487 + }, + { + "epoch": 0.13628677658967941, + "grad_norm": 0.3878095746040344, + "learning_rate": 0.00017277843486398318, + "loss": 1.3901, + "step": 10488 + }, + { + "epoch": 0.1362997711335953, + "grad_norm": 0.4573918879032135, + "learning_rate": 0.00017277583540207178, + "loss": 1.6271, + "step": 10489 + }, + { + "epoch": 0.13631276567751116, + "grad_norm": 0.4283890426158905, + "learning_rate": 0.0001727732359401604, + "loss": 1.4373, + "step": 10490 + }, + { + "epoch": 0.13632576022142703, + "grad_norm": 0.29495999217033386, + "learning_rate": 0.000172770636478249, + "loss": 1.3181, + "step": 10491 + }, + { + "epoch": 0.1363387547653429, + "grad_norm": 0.41229575872421265, + "learning_rate": 0.00017276803701633762, + "loss": 1.313, + "step": 10492 + }, + { + "epoch": 0.13635174930925878, + "grad_norm": 0.35758906602859497, + "learning_rate": 0.00017276543755442625, + "loss": 1.4246, + "step": 10493 + }, + { + "epoch": 0.13636474385317465, + "grad_norm": 0.35581740736961365, + "learning_rate": 0.00017276283809251485, + "loss": 1.2879, + "step": 10494 + }, + { + "epoch": 0.13637773839709053, + "grad_norm": 0.4496549963951111, + "learning_rate": 0.00017276023863060347, + "loss": 1.5218, + "step": 10495 + }, + { + "epoch": 0.1363907329410064, + "grad_norm": 0.4742807447910309, + "learning_rate": 0.0001727576391686921, + "loss": 1.4893, + "step": 10496 + }, + { + "epoch": 0.13640372748492227, + "grad_norm": 0.4038223922252655, + "learning_rate": 0.00017275503970678072, + "loss": 1.4817, + "step": 10497 + }, + { + "epoch": 0.13641672202883814, + "grad_norm": 0.45823538303375244, + "learning_rate": 0.00017275244024486932, + "loss": 1.5909, + "step": 10498 + }, + { + "epoch": 0.13642971657275402, + "grad_norm": 0.397400826215744, + "learning_rate": 0.00017274984078295794, + "loss": 1.3527, + "step": 10499 + }, + { + "epoch": 0.1364427111166699, + "grad_norm": 0.40433722734451294, + "learning_rate": 0.00017274724132104657, + "loss": 1.2691, + "step": 10500 + }, + { + "epoch": 0.13645570566058576, + "grad_norm": 0.2724015712738037, + "learning_rate": 0.00017274464185913516, + "loss": 1.461, + "step": 10501 + }, + { + "epoch": 0.13646870020450164, + "grad_norm": 0.399953693151474, + "learning_rate": 0.0001727420423972238, + "loss": 1.3822, + "step": 10502 + }, + { + "epoch": 0.1364816947484175, + "grad_norm": 0.5578676462173462, + "learning_rate": 0.00017273944293531238, + "loss": 1.335, + "step": 10503 + }, + { + "epoch": 0.13649468929233338, + "grad_norm": 0.3851067125797272, + "learning_rate": 0.000172736843473401, + "loss": 1.488, + "step": 10504 + }, + { + "epoch": 0.13650768383624926, + "grad_norm": 0.2920408248901367, + "learning_rate": 0.00017273424401148963, + "loss": 1.2559, + "step": 10505 + }, + { + "epoch": 0.13652067838016513, + "grad_norm": 0.3744671046733856, + "learning_rate": 0.00017273164454957823, + "loss": 1.3797, + "step": 10506 + }, + { + "epoch": 0.136533672924081, + "grad_norm": 0.24825863540172577, + "learning_rate": 0.00017272904508766686, + "loss": 1.4313, + "step": 10507 + }, + { + "epoch": 0.13654666746799687, + "grad_norm": 0.42083391547203064, + "learning_rate": 0.00017272644562575548, + "loss": 1.5479, + "step": 10508 + }, + { + "epoch": 0.13655966201191275, + "grad_norm": 0.4425850808620453, + "learning_rate": 0.0001727238461638441, + "loss": 1.417, + "step": 10509 + }, + { + "epoch": 0.13657265655582862, + "grad_norm": 0.4810177981853485, + "learning_rate": 0.0001727212467019327, + "loss": 1.4951, + "step": 10510 + }, + { + "epoch": 0.1365856510997445, + "grad_norm": 0.28087952733039856, + "learning_rate": 0.00017271864724002133, + "loss": 1.2765, + "step": 10511 + }, + { + "epoch": 0.13659864564366037, + "grad_norm": 0.47911617159843445, + "learning_rate": 0.00017271604777810995, + "loss": 1.4801, + "step": 10512 + }, + { + "epoch": 0.13661164018757624, + "grad_norm": 0.4173252582550049, + "learning_rate": 0.00017271344831619855, + "loss": 1.3148, + "step": 10513 + }, + { + "epoch": 0.1366246347314921, + "grad_norm": 0.35897526144981384, + "learning_rate": 0.00017271084885428717, + "loss": 1.4882, + "step": 10514 + }, + { + "epoch": 0.13663762927540798, + "grad_norm": 0.43524637818336487, + "learning_rate": 0.00017270824939237577, + "loss": 1.2251, + "step": 10515 + }, + { + "epoch": 0.13665062381932386, + "grad_norm": 0.38038232922554016, + "learning_rate": 0.00017270564993046442, + "loss": 1.4763, + "step": 10516 + }, + { + "epoch": 0.13666361836323973, + "grad_norm": 0.39000430703163147, + "learning_rate": 0.00017270305046855302, + "loss": 1.498, + "step": 10517 + }, + { + "epoch": 0.1366766129071556, + "grad_norm": 0.8693828582763672, + "learning_rate": 0.00017270045100664162, + "loss": 1.4689, + "step": 10518 + }, + { + "epoch": 0.13668960745107148, + "grad_norm": 0.49815839529037476, + "learning_rate": 0.00017269785154473024, + "loss": 1.4892, + "step": 10519 + }, + { + "epoch": 0.13670260199498735, + "grad_norm": 0.3810057044029236, + "learning_rate": 0.00017269525208281887, + "loss": 1.3877, + "step": 10520 + }, + { + "epoch": 0.13671559653890322, + "grad_norm": 0.2651503086090088, + "learning_rate": 0.0001726926526209075, + "loss": 1.2183, + "step": 10521 + }, + { + "epoch": 0.1367285910828191, + "grad_norm": 0.3170636296272278, + "learning_rate": 0.0001726900531589961, + "loss": 1.5981, + "step": 10522 + }, + { + "epoch": 0.13674158562673497, + "grad_norm": 0.3709258437156677, + "learning_rate": 0.0001726874536970847, + "loss": 1.4153, + "step": 10523 + }, + { + "epoch": 0.13675458017065084, + "grad_norm": 0.48535898327827454, + "learning_rate": 0.00017268485423517334, + "loss": 1.2872, + "step": 10524 + }, + { + "epoch": 0.13676757471456671, + "grad_norm": 0.3966870605945587, + "learning_rate": 0.00017268225477326193, + "loss": 1.5348, + "step": 10525 + }, + { + "epoch": 0.1367805692584826, + "grad_norm": 0.304897665977478, + "learning_rate": 0.00017267965531135056, + "loss": 1.2594, + "step": 10526 + }, + { + "epoch": 0.13679356380239846, + "grad_norm": 0.39303287863731384, + "learning_rate": 0.00017267705584943918, + "loss": 1.6777, + "step": 10527 + }, + { + "epoch": 0.13680655834631433, + "grad_norm": 0.4536191523075104, + "learning_rate": 0.0001726744563875278, + "loss": 1.6083, + "step": 10528 + }, + { + "epoch": 0.1368195528902302, + "grad_norm": 0.4273262321949005, + "learning_rate": 0.0001726718569256164, + "loss": 1.4775, + "step": 10529 + }, + { + "epoch": 0.13683254743414608, + "grad_norm": 0.4868556261062622, + "learning_rate": 0.000172669257463705, + "loss": 1.4794, + "step": 10530 + }, + { + "epoch": 0.13684554197806195, + "grad_norm": 0.4230353534221649, + "learning_rate": 0.00017266665800179365, + "loss": 1.3272, + "step": 10531 + }, + { + "epoch": 0.13685853652197782, + "grad_norm": 0.3850875794887543, + "learning_rate": 0.00017266405853988225, + "loss": 1.3766, + "step": 10532 + }, + { + "epoch": 0.1368715310658937, + "grad_norm": 0.38219889998435974, + "learning_rate": 0.00017266145907797087, + "loss": 1.3608, + "step": 10533 + }, + { + "epoch": 0.13688452560980957, + "grad_norm": 0.580220103263855, + "learning_rate": 0.00017265885961605947, + "loss": 1.4533, + "step": 10534 + }, + { + "epoch": 0.13689752015372544, + "grad_norm": 0.4341967701911926, + "learning_rate": 0.0001726562601541481, + "loss": 1.5912, + "step": 10535 + }, + { + "epoch": 0.13691051469764132, + "grad_norm": 0.38223201036453247, + "learning_rate": 0.00017265366069223672, + "loss": 1.422, + "step": 10536 + }, + { + "epoch": 0.1369235092415572, + "grad_norm": 0.38255515694618225, + "learning_rate": 0.00017265106123032532, + "loss": 1.3093, + "step": 10537 + }, + { + "epoch": 0.13693650378547306, + "grad_norm": 0.3335675895214081, + "learning_rate": 0.00017264846176841394, + "loss": 1.3909, + "step": 10538 + }, + { + "epoch": 0.13694949832938894, + "grad_norm": 0.3044624924659729, + "learning_rate": 0.00017264586230650257, + "loss": 1.3799, + "step": 10539 + }, + { + "epoch": 0.1369624928733048, + "grad_norm": 0.31655076146125793, + "learning_rate": 0.0001726432628445912, + "loss": 1.1527, + "step": 10540 + }, + { + "epoch": 0.13697548741722068, + "grad_norm": 0.40791475772857666, + "learning_rate": 0.0001726406633826798, + "loss": 1.5779, + "step": 10541 + }, + { + "epoch": 0.13698848196113655, + "grad_norm": 0.2784699499607086, + "learning_rate": 0.0001726380639207684, + "loss": 1.3844, + "step": 10542 + }, + { + "epoch": 0.13700147650505243, + "grad_norm": 0.3830648958683014, + "learning_rate": 0.00017263546445885704, + "loss": 1.5756, + "step": 10543 + }, + { + "epoch": 0.13701447104896833, + "grad_norm": 0.38863110542297363, + "learning_rate": 0.00017263286499694564, + "loss": 1.3717, + "step": 10544 + }, + { + "epoch": 0.1370274655928842, + "grad_norm": 0.4381507933139801, + "learning_rate": 0.00017263026553503426, + "loss": 1.5142, + "step": 10545 + }, + { + "epoch": 0.13704046013680007, + "grad_norm": 0.36823081970214844, + "learning_rate": 0.00017262766607312286, + "loss": 1.4017, + "step": 10546 + }, + { + "epoch": 0.13705345468071595, + "grad_norm": 0.38603949546813965, + "learning_rate": 0.00017262506661121148, + "loss": 1.4844, + "step": 10547 + }, + { + "epoch": 0.13706644922463182, + "grad_norm": 0.30869975686073303, + "learning_rate": 0.0001726224671493001, + "loss": 1.2908, + "step": 10548 + }, + { + "epoch": 0.1370794437685477, + "grad_norm": 0.4072871506214142, + "learning_rate": 0.0001726198676873887, + "loss": 1.4849, + "step": 10549 + }, + { + "epoch": 0.13709243831246357, + "grad_norm": 0.322182297706604, + "learning_rate": 0.00017261726822547733, + "loss": 1.1986, + "step": 10550 + }, + { + "epoch": 0.13710543285637944, + "grad_norm": 0.43072229623794556, + "learning_rate": 0.00017261466876356595, + "loss": 1.672, + "step": 10551 + }, + { + "epoch": 0.1371184274002953, + "grad_norm": 0.3653903007507324, + "learning_rate": 0.00017261206930165458, + "loss": 1.4151, + "step": 10552 + }, + { + "epoch": 0.13713142194421118, + "grad_norm": 0.42655181884765625, + "learning_rate": 0.00017260946983974317, + "loss": 1.4845, + "step": 10553 + }, + { + "epoch": 0.13714441648812706, + "grad_norm": 0.4901348948478699, + "learning_rate": 0.0001726068703778318, + "loss": 1.2358, + "step": 10554 + }, + { + "epoch": 0.13715741103204293, + "grad_norm": 0.3265390396118164, + "learning_rate": 0.00017260427091592042, + "loss": 1.3265, + "step": 10555 + }, + { + "epoch": 0.1371704055759588, + "grad_norm": 0.35710471868515015, + "learning_rate": 0.00017260167145400902, + "loss": 1.2568, + "step": 10556 + }, + { + "epoch": 0.13718340011987468, + "grad_norm": 0.5008095502853394, + "learning_rate": 0.00017259907199209765, + "loss": 1.5164, + "step": 10557 + }, + { + "epoch": 0.13719639466379055, + "grad_norm": 0.4730646014213562, + "learning_rate": 0.00017259647253018627, + "loss": 1.4292, + "step": 10558 + }, + { + "epoch": 0.13720938920770642, + "grad_norm": 0.34148985147476196, + "learning_rate": 0.00017259387306827487, + "loss": 1.3327, + "step": 10559 + }, + { + "epoch": 0.1372223837516223, + "grad_norm": 0.4202231466770172, + "learning_rate": 0.0001725912736063635, + "loss": 1.5288, + "step": 10560 + }, + { + "epoch": 0.13723537829553817, + "grad_norm": 0.3581228256225586, + "learning_rate": 0.0001725886741444521, + "loss": 1.306, + "step": 10561 + }, + { + "epoch": 0.13724837283945404, + "grad_norm": 0.3379080891609192, + "learning_rate": 0.00017258607468254074, + "loss": 1.2369, + "step": 10562 + }, + { + "epoch": 0.13726136738336991, + "grad_norm": 0.3051481544971466, + "learning_rate": 0.00017258347522062934, + "loss": 1.2441, + "step": 10563 + }, + { + "epoch": 0.1372743619272858, + "grad_norm": 0.413403183221817, + "learning_rate": 0.00017258087575871796, + "loss": 1.3255, + "step": 10564 + }, + { + "epoch": 0.13728735647120166, + "grad_norm": 0.42408865690231323, + "learning_rate": 0.00017257827629680656, + "loss": 1.4321, + "step": 10565 + }, + { + "epoch": 0.13730035101511753, + "grad_norm": 0.3705972135066986, + "learning_rate": 0.00017257567683489518, + "loss": 1.4341, + "step": 10566 + }, + { + "epoch": 0.1373133455590334, + "grad_norm": 0.5651363134384155, + "learning_rate": 0.0001725730773729838, + "loss": 1.4846, + "step": 10567 + }, + { + "epoch": 0.13732634010294928, + "grad_norm": 0.3010171353816986, + "learning_rate": 0.0001725704779110724, + "loss": 1.2273, + "step": 10568 + }, + { + "epoch": 0.13733933464686515, + "grad_norm": 0.5309604406356812, + "learning_rate": 0.00017256787844916103, + "loss": 1.4023, + "step": 10569 + }, + { + "epoch": 0.13735232919078103, + "grad_norm": 0.3206302523612976, + "learning_rate": 0.00017256527898724966, + "loss": 1.3123, + "step": 10570 + }, + { + "epoch": 0.1373653237346969, + "grad_norm": 0.3580516278743744, + "learning_rate": 0.00017256267952533828, + "loss": 1.4181, + "step": 10571 + }, + { + "epoch": 0.13737831827861277, + "grad_norm": 0.38778382539749146, + "learning_rate": 0.00017256008006342688, + "loss": 1.3119, + "step": 10572 + }, + { + "epoch": 0.13739131282252864, + "grad_norm": 0.43884924054145813, + "learning_rate": 0.00017255748060151547, + "loss": 1.4495, + "step": 10573 + }, + { + "epoch": 0.13740430736644452, + "grad_norm": 0.7913076877593994, + "learning_rate": 0.00017255488113960413, + "loss": 1.5133, + "step": 10574 + }, + { + "epoch": 0.1374173019103604, + "grad_norm": 0.4408593773841858, + "learning_rate": 0.00017255228167769272, + "loss": 1.3516, + "step": 10575 + }, + { + "epoch": 0.13743029645427626, + "grad_norm": 0.3535459339618683, + "learning_rate": 0.00017254968221578135, + "loss": 1.6191, + "step": 10576 + }, + { + "epoch": 0.13744329099819214, + "grad_norm": 0.5241795778274536, + "learning_rate": 0.00017254708275386995, + "loss": 1.5931, + "step": 10577 + }, + { + "epoch": 0.137456285542108, + "grad_norm": 0.3066680133342743, + "learning_rate": 0.00017254448329195857, + "loss": 1.3775, + "step": 10578 + }, + { + "epoch": 0.13746928008602388, + "grad_norm": 0.46135973930358887, + "learning_rate": 0.0001725418838300472, + "loss": 1.4092, + "step": 10579 + }, + { + "epoch": 0.13748227462993975, + "grad_norm": 0.3281068205833435, + "learning_rate": 0.0001725392843681358, + "loss": 1.3127, + "step": 10580 + }, + { + "epoch": 0.13749526917385563, + "grad_norm": 0.42797136306762695, + "learning_rate": 0.00017253668490622442, + "loss": 1.6918, + "step": 10581 + }, + { + "epoch": 0.1375082637177715, + "grad_norm": 0.4591136872768402, + "learning_rate": 0.00017253408544431304, + "loss": 1.494, + "step": 10582 + }, + { + "epoch": 0.13752125826168737, + "grad_norm": 0.3453805148601532, + "learning_rate": 0.00017253148598240167, + "loss": 1.3688, + "step": 10583 + }, + { + "epoch": 0.13753425280560325, + "grad_norm": 0.4060210883617401, + "learning_rate": 0.00017252888652049026, + "loss": 1.5277, + "step": 10584 + }, + { + "epoch": 0.13754724734951912, + "grad_norm": 0.6141288876533508, + "learning_rate": 0.00017252628705857886, + "loss": 1.3349, + "step": 10585 + }, + { + "epoch": 0.137560241893435, + "grad_norm": 0.4474853277206421, + "learning_rate": 0.0001725236875966675, + "loss": 1.6698, + "step": 10586 + }, + { + "epoch": 0.13757323643735087, + "grad_norm": 0.3038195073604584, + "learning_rate": 0.0001725210881347561, + "loss": 1.4218, + "step": 10587 + }, + { + "epoch": 0.13758623098126674, + "grad_norm": 0.43007174134254456, + "learning_rate": 0.00017251848867284473, + "loss": 1.5399, + "step": 10588 + }, + { + "epoch": 0.1375992255251826, + "grad_norm": 0.49214497208595276, + "learning_rate": 0.00017251588921093333, + "loss": 1.5708, + "step": 10589 + }, + { + "epoch": 0.13761222006909848, + "grad_norm": 0.3328685164451599, + "learning_rate": 0.00017251328974902196, + "loss": 1.2818, + "step": 10590 + }, + { + "epoch": 0.13762521461301436, + "grad_norm": 0.3670251667499542, + "learning_rate": 0.00017251069028711058, + "loss": 1.2983, + "step": 10591 + }, + { + "epoch": 0.13763820915693023, + "grad_norm": 0.2783401608467102, + "learning_rate": 0.00017250809082519918, + "loss": 1.3688, + "step": 10592 + }, + { + "epoch": 0.1376512037008461, + "grad_norm": 0.3808026909828186, + "learning_rate": 0.0001725054913632878, + "loss": 1.4213, + "step": 10593 + }, + { + "epoch": 0.13766419824476198, + "grad_norm": 0.3565732538700104, + "learning_rate": 0.00017250289190137643, + "loss": 1.4686, + "step": 10594 + }, + { + "epoch": 0.13767719278867785, + "grad_norm": 0.44309598207473755, + "learning_rate": 0.00017250029243946505, + "loss": 1.1618, + "step": 10595 + }, + { + "epoch": 0.13769018733259372, + "grad_norm": 0.3477417528629303, + "learning_rate": 0.00017249769297755365, + "loss": 1.5659, + "step": 10596 + }, + { + "epoch": 0.1377031818765096, + "grad_norm": 0.423973947763443, + "learning_rate": 0.00017249509351564227, + "loss": 1.4957, + "step": 10597 + }, + { + "epoch": 0.13771617642042547, + "grad_norm": 0.3800935745239258, + "learning_rate": 0.0001724924940537309, + "loss": 1.4856, + "step": 10598 + }, + { + "epoch": 0.13772917096434134, + "grad_norm": 0.4738127589225769, + "learning_rate": 0.0001724898945918195, + "loss": 1.5107, + "step": 10599 + }, + { + "epoch": 0.1377421655082572, + "grad_norm": 0.39662086963653564, + "learning_rate": 0.00017248729512990812, + "loss": 1.4037, + "step": 10600 + }, + { + "epoch": 0.1377551600521731, + "grad_norm": 0.47219574451446533, + "learning_rate": 0.00017248469566799674, + "loss": 1.4837, + "step": 10601 + }, + { + "epoch": 0.13776815459608896, + "grad_norm": 0.3509545922279358, + "learning_rate": 0.00017248209620608534, + "loss": 1.551, + "step": 10602 + }, + { + "epoch": 0.13778114914000483, + "grad_norm": 0.4001471698284149, + "learning_rate": 0.00017247949674417397, + "loss": 1.3914, + "step": 10603 + }, + { + "epoch": 0.1377941436839207, + "grad_norm": 0.35533377528190613, + "learning_rate": 0.00017247689728226256, + "loss": 1.6054, + "step": 10604 + }, + { + "epoch": 0.13780713822783658, + "grad_norm": 0.30173033475875854, + "learning_rate": 0.00017247429782035121, + "loss": 1.41, + "step": 10605 + }, + { + "epoch": 0.13782013277175245, + "grad_norm": 0.4919365346431732, + "learning_rate": 0.0001724716983584398, + "loss": 1.3973, + "step": 10606 + }, + { + "epoch": 0.13783312731566832, + "grad_norm": 0.338100790977478, + "learning_rate": 0.00017246909889652844, + "loss": 1.1979, + "step": 10607 + }, + { + "epoch": 0.1378461218595842, + "grad_norm": 0.361561119556427, + "learning_rate": 0.00017246649943461703, + "loss": 1.4515, + "step": 10608 + }, + { + "epoch": 0.13785911640350007, + "grad_norm": 0.4210669994354248, + "learning_rate": 0.00017246389997270566, + "loss": 1.4154, + "step": 10609 + }, + { + "epoch": 0.13787211094741594, + "grad_norm": 0.38256022334098816, + "learning_rate": 0.00017246130051079428, + "loss": 1.3763, + "step": 10610 + }, + { + "epoch": 0.13788510549133182, + "grad_norm": 0.4099956750869751, + "learning_rate": 0.00017245870104888288, + "loss": 1.4154, + "step": 10611 + }, + { + "epoch": 0.1378981000352477, + "grad_norm": 0.42823123931884766, + "learning_rate": 0.0001724561015869715, + "loss": 1.2585, + "step": 10612 + }, + { + "epoch": 0.13791109457916356, + "grad_norm": 0.43820980191230774, + "learning_rate": 0.00017245350212506013, + "loss": 1.58, + "step": 10613 + }, + { + "epoch": 0.13792408912307944, + "grad_norm": 0.3689756393432617, + "learning_rate": 0.00017245090266314873, + "loss": 1.3779, + "step": 10614 + }, + { + "epoch": 0.1379370836669953, + "grad_norm": 0.4533342719078064, + "learning_rate": 0.00017244830320123735, + "loss": 1.3371, + "step": 10615 + }, + { + "epoch": 0.13795007821091118, + "grad_norm": 0.3453167676925659, + "learning_rate": 0.00017244570373932595, + "loss": 1.4459, + "step": 10616 + }, + { + "epoch": 0.13796307275482705, + "grad_norm": 0.42374488711357117, + "learning_rate": 0.0001724431042774146, + "loss": 1.4445, + "step": 10617 + }, + { + "epoch": 0.13797606729874293, + "grad_norm": 0.4540706276893616, + "learning_rate": 0.0001724405048155032, + "loss": 1.4461, + "step": 10618 + }, + { + "epoch": 0.1379890618426588, + "grad_norm": 0.4351465106010437, + "learning_rate": 0.00017243790535359182, + "loss": 1.4642, + "step": 10619 + }, + { + "epoch": 0.1380020563865747, + "grad_norm": 0.4485321640968323, + "learning_rate": 0.00017243530589168042, + "loss": 1.6215, + "step": 10620 + }, + { + "epoch": 0.13801505093049057, + "grad_norm": 0.37194985151290894, + "learning_rate": 0.00017243270642976904, + "loss": 1.4377, + "step": 10621 + }, + { + "epoch": 0.13802804547440645, + "grad_norm": 0.4043198227882385, + "learning_rate": 0.00017243010696785767, + "loss": 1.2744, + "step": 10622 + }, + { + "epoch": 0.13804104001832232, + "grad_norm": 0.4267769157886505, + "learning_rate": 0.00017242750750594627, + "loss": 1.3616, + "step": 10623 + }, + { + "epoch": 0.1380540345622382, + "grad_norm": 0.46974459290504456, + "learning_rate": 0.0001724249080440349, + "loss": 1.5613, + "step": 10624 + }, + { + "epoch": 0.13806702910615407, + "grad_norm": 0.3130131661891937, + "learning_rate": 0.00017242230858212351, + "loss": 1.1999, + "step": 10625 + }, + { + "epoch": 0.13808002365006994, + "grad_norm": 0.4928038716316223, + "learning_rate": 0.0001724197091202121, + "loss": 1.5424, + "step": 10626 + }, + { + "epoch": 0.1380930181939858, + "grad_norm": 0.35475754737854004, + "learning_rate": 0.00017241710965830074, + "loss": 1.4689, + "step": 10627 + }, + { + "epoch": 0.13810601273790168, + "grad_norm": 0.4033093750476837, + "learning_rate": 0.00017241451019638933, + "loss": 1.2252, + "step": 10628 + }, + { + "epoch": 0.13811900728181756, + "grad_norm": 0.34757670760154724, + "learning_rate": 0.00017241191073447799, + "loss": 1.3315, + "step": 10629 + }, + { + "epoch": 0.13813200182573343, + "grad_norm": 0.39496028423309326, + "learning_rate": 0.00017240931127256658, + "loss": 1.4887, + "step": 10630 + }, + { + "epoch": 0.1381449963696493, + "grad_norm": 0.3175010085105896, + "learning_rate": 0.0001724067118106552, + "loss": 1.474, + "step": 10631 + }, + { + "epoch": 0.13815799091356518, + "grad_norm": 0.44320324063301086, + "learning_rate": 0.00017240411234874383, + "loss": 1.4776, + "step": 10632 + }, + { + "epoch": 0.13817098545748105, + "grad_norm": 0.4860862195491791, + "learning_rate": 0.00017240151288683243, + "loss": 1.3037, + "step": 10633 + }, + { + "epoch": 0.13818398000139692, + "grad_norm": 0.4313218891620636, + "learning_rate": 0.00017239891342492105, + "loss": 1.3952, + "step": 10634 + }, + { + "epoch": 0.1381969745453128, + "grad_norm": 0.4191610813140869, + "learning_rate": 0.00017239631396300965, + "loss": 1.3451, + "step": 10635 + }, + { + "epoch": 0.13820996908922867, + "grad_norm": 0.35789498686790466, + "learning_rate": 0.0001723937145010983, + "loss": 1.4343, + "step": 10636 + }, + { + "epoch": 0.13822296363314454, + "grad_norm": 0.3415585458278656, + "learning_rate": 0.0001723911150391869, + "loss": 1.358, + "step": 10637 + }, + { + "epoch": 0.13823595817706041, + "grad_norm": 0.4315042495727539, + "learning_rate": 0.00017238851557727552, + "loss": 1.5833, + "step": 10638 + }, + { + "epoch": 0.1382489527209763, + "grad_norm": 0.44580018520355225, + "learning_rate": 0.00017238591611536412, + "loss": 1.5413, + "step": 10639 + }, + { + "epoch": 0.13826194726489216, + "grad_norm": 0.48365074396133423, + "learning_rate": 0.00017238331665345275, + "loss": 1.4997, + "step": 10640 + }, + { + "epoch": 0.13827494180880803, + "grad_norm": 0.3096107542514801, + "learning_rate": 0.00017238071719154137, + "loss": 1.4661, + "step": 10641 + }, + { + "epoch": 0.1382879363527239, + "grad_norm": 0.39686885476112366, + "learning_rate": 0.00017237811772962997, + "loss": 1.5045, + "step": 10642 + }, + { + "epoch": 0.13830093089663978, + "grad_norm": 0.5136502981185913, + "learning_rate": 0.0001723755182677186, + "loss": 1.4981, + "step": 10643 + }, + { + "epoch": 0.13831392544055565, + "grad_norm": 0.4076172411441803, + "learning_rate": 0.00017237291880580722, + "loss": 1.4703, + "step": 10644 + }, + { + "epoch": 0.13832691998447152, + "grad_norm": 0.33252274990081787, + "learning_rate": 0.00017237031934389581, + "loss": 1.4858, + "step": 10645 + }, + { + "epoch": 0.1383399145283874, + "grad_norm": 0.3941386044025421, + "learning_rate": 0.00017236771988198444, + "loss": 1.5372, + "step": 10646 + }, + { + "epoch": 0.13835290907230327, + "grad_norm": 0.3625786304473877, + "learning_rate": 0.00017236512042007304, + "loss": 1.4991, + "step": 10647 + }, + { + "epoch": 0.13836590361621914, + "grad_norm": 0.3857329189777374, + "learning_rate": 0.0001723625209581617, + "loss": 1.5909, + "step": 10648 + }, + { + "epoch": 0.13837889816013502, + "grad_norm": 0.36059844493865967, + "learning_rate": 0.00017235992149625029, + "loss": 1.6087, + "step": 10649 + }, + { + "epoch": 0.1383918927040509, + "grad_norm": 0.3763032853603363, + "learning_rate": 0.0001723573220343389, + "loss": 1.3809, + "step": 10650 + }, + { + "epoch": 0.13840488724796676, + "grad_norm": 0.37493008375167847, + "learning_rate": 0.0001723547225724275, + "loss": 1.4514, + "step": 10651 + }, + { + "epoch": 0.13841788179188264, + "grad_norm": 0.33912596106529236, + "learning_rate": 0.00017235212311051613, + "loss": 1.3984, + "step": 10652 + }, + { + "epoch": 0.1384308763357985, + "grad_norm": 0.29063013195991516, + "learning_rate": 0.00017234952364860476, + "loss": 1.3259, + "step": 10653 + }, + { + "epoch": 0.13844387087971438, + "grad_norm": 0.3569600284099579, + "learning_rate": 0.00017234692418669335, + "loss": 1.2968, + "step": 10654 + }, + { + "epoch": 0.13845686542363025, + "grad_norm": 0.389163076877594, + "learning_rate": 0.00017234432472478198, + "loss": 1.1315, + "step": 10655 + }, + { + "epoch": 0.13846985996754613, + "grad_norm": 0.4171965718269348, + "learning_rate": 0.0001723417252628706, + "loss": 1.4694, + "step": 10656 + }, + { + "epoch": 0.138482854511462, + "grad_norm": 0.35065358877182007, + "learning_rate": 0.0001723391258009592, + "loss": 1.3226, + "step": 10657 + }, + { + "epoch": 0.13849584905537787, + "grad_norm": 0.2886101305484772, + "learning_rate": 0.00017233652633904782, + "loss": 1.3109, + "step": 10658 + }, + { + "epoch": 0.13850884359929375, + "grad_norm": 0.41169869899749756, + "learning_rate": 0.00017233392687713642, + "loss": 1.572, + "step": 10659 + }, + { + "epoch": 0.13852183814320962, + "grad_norm": 0.31052806973457336, + "learning_rate": 0.00017233132741522507, + "loss": 1.3383, + "step": 10660 + }, + { + "epoch": 0.1385348326871255, + "grad_norm": 0.37719324231147766, + "learning_rate": 0.00017232872795331367, + "loss": 1.5925, + "step": 10661 + }, + { + "epoch": 0.13854782723104136, + "grad_norm": 0.4659688174724579, + "learning_rate": 0.0001723261284914023, + "loss": 1.5769, + "step": 10662 + }, + { + "epoch": 0.13856082177495724, + "grad_norm": 0.3831879794597626, + "learning_rate": 0.0001723235290294909, + "loss": 1.3675, + "step": 10663 + }, + { + "epoch": 0.1385738163188731, + "grad_norm": 0.41098740696907043, + "learning_rate": 0.00017232092956757952, + "loss": 1.5494, + "step": 10664 + }, + { + "epoch": 0.13858681086278898, + "grad_norm": 0.33471986651420593, + "learning_rate": 0.00017231833010566814, + "loss": 1.3501, + "step": 10665 + }, + { + "epoch": 0.13859980540670486, + "grad_norm": 0.455466628074646, + "learning_rate": 0.00017231573064375674, + "loss": 1.7076, + "step": 10666 + }, + { + "epoch": 0.13861279995062073, + "grad_norm": 0.3916175365447998, + "learning_rate": 0.00017231313118184536, + "loss": 1.5835, + "step": 10667 + }, + { + "epoch": 0.1386257944945366, + "grad_norm": 0.3723258972167969, + "learning_rate": 0.000172310531719934, + "loss": 1.406, + "step": 10668 + }, + { + "epoch": 0.13863878903845248, + "grad_norm": 0.4556010067462921, + "learning_rate": 0.00017230793225802259, + "loss": 1.5664, + "step": 10669 + }, + { + "epoch": 0.13865178358236835, + "grad_norm": 0.4294789731502533, + "learning_rate": 0.0001723053327961112, + "loss": 1.5854, + "step": 10670 + }, + { + "epoch": 0.13866477812628422, + "grad_norm": 0.4607834815979004, + "learning_rate": 0.00017230273333419983, + "loss": 1.4694, + "step": 10671 + }, + { + "epoch": 0.1386777726702001, + "grad_norm": 0.4007158875465393, + "learning_rate": 0.00017230013387228846, + "loss": 1.3922, + "step": 10672 + }, + { + "epoch": 0.13869076721411597, + "grad_norm": 0.4190846085548401, + "learning_rate": 0.00017229753441037706, + "loss": 1.5175, + "step": 10673 + }, + { + "epoch": 0.13870376175803184, + "grad_norm": 0.4587985873222351, + "learning_rate": 0.00017229493494846568, + "loss": 1.3897, + "step": 10674 + }, + { + "epoch": 0.1387167563019477, + "grad_norm": 0.37550240755081177, + "learning_rate": 0.0001722923354865543, + "loss": 1.4088, + "step": 10675 + }, + { + "epoch": 0.1387297508458636, + "grad_norm": 0.38696759939193726, + "learning_rate": 0.0001722897360246429, + "loss": 1.3405, + "step": 10676 + }, + { + "epoch": 0.13874274538977946, + "grad_norm": 0.35055679082870483, + "learning_rate": 0.00017228713656273153, + "loss": 1.5386, + "step": 10677 + }, + { + "epoch": 0.13875573993369533, + "grad_norm": 0.34917086362838745, + "learning_rate": 0.00017228453710082012, + "loss": 1.3728, + "step": 10678 + }, + { + "epoch": 0.1387687344776112, + "grad_norm": 0.44074827432632446, + "learning_rate": 0.00017228193763890878, + "loss": 1.4399, + "step": 10679 + }, + { + "epoch": 0.13878172902152708, + "grad_norm": 0.347922146320343, + "learning_rate": 0.00017227933817699737, + "loss": 1.401, + "step": 10680 + }, + { + "epoch": 0.13879472356544295, + "grad_norm": 0.44942864775657654, + "learning_rate": 0.00017227673871508597, + "loss": 1.4835, + "step": 10681 + }, + { + "epoch": 0.13880771810935882, + "grad_norm": 0.44252923130989075, + "learning_rate": 0.0001722741392531746, + "loss": 1.4024, + "step": 10682 + }, + { + "epoch": 0.1388207126532747, + "grad_norm": 0.3438055217266083, + "learning_rate": 0.00017227153979126322, + "loss": 1.5072, + "step": 10683 + }, + { + "epoch": 0.13883370719719057, + "grad_norm": 0.43290746212005615, + "learning_rate": 0.00017226894032935184, + "loss": 1.3606, + "step": 10684 + }, + { + "epoch": 0.13884670174110644, + "grad_norm": 0.4248257279396057, + "learning_rate": 0.00017226634086744044, + "loss": 1.5866, + "step": 10685 + }, + { + "epoch": 0.13885969628502232, + "grad_norm": 0.41684690117836, + "learning_rate": 0.00017226374140552907, + "loss": 1.3661, + "step": 10686 + }, + { + "epoch": 0.1388726908289382, + "grad_norm": 0.4077261984348297, + "learning_rate": 0.0001722611419436177, + "loss": 1.2825, + "step": 10687 + }, + { + "epoch": 0.13888568537285406, + "grad_norm": 0.3705204129219055, + "learning_rate": 0.0001722585424817063, + "loss": 1.4974, + "step": 10688 + }, + { + "epoch": 0.13889867991676993, + "grad_norm": 0.5640507340431213, + "learning_rate": 0.0001722559430197949, + "loss": 1.4143, + "step": 10689 + }, + { + "epoch": 0.1389116744606858, + "grad_norm": 0.42965078353881836, + "learning_rate": 0.0001722533435578835, + "loss": 1.6264, + "step": 10690 + }, + { + "epoch": 0.13892466900460168, + "grad_norm": 0.4460815489292145, + "learning_rate": 0.00017225074409597216, + "loss": 1.7322, + "step": 10691 + }, + { + "epoch": 0.13893766354851755, + "grad_norm": 0.35348957777023315, + "learning_rate": 0.00017224814463406076, + "loss": 1.2189, + "step": 10692 + }, + { + "epoch": 0.13895065809243343, + "grad_norm": 0.42552369832992554, + "learning_rate": 0.00017224554517214938, + "loss": 1.4319, + "step": 10693 + }, + { + "epoch": 0.1389636526363493, + "grad_norm": 0.3813215494155884, + "learning_rate": 0.00017224294571023798, + "loss": 1.5396, + "step": 10694 + }, + { + "epoch": 0.13897664718026517, + "grad_norm": 0.43992581963539124, + "learning_rate": 0.0001722403462483266, + "loss": 1.4858, + "step": 10695 + }, + { + "epoch": 0.13898964172418107, + "grad_norm": 0.4361250102519989, + "learning_rate": 0.00017223774678641523, + "loss": 1.5107, + "step": 10696 + }, + { + "epoch": 0.13900263626809695, + "grad_norm": 0.3931949734687805, + "learning_rate": 0.00017223514732450383, + "loss": 1.3818, + "step": 10697 + }, + { + "epoch": 0.13901563081201282, + "grad_norm": 0.35805174708366394, + "learning_rate": 0.00017223254786259245, + "loss": 1.3802, + "step": 10698 + }, + { + "epoch": 0.1390286253559287, + "grad_norm": 0.378232479095459, + "learning_rate": 0.00017222994840068108, + "loss": 1.5178, + "step": 10699 + }, + { + "epoch": 0.13904161989984457, + "grad_norm": 0.34583643078804016, + "learning_rate": 0.00017222734893876967, + "loss": 1.3225, + "step": 10700 + }, + { + "epoch": 0.13905461444376044, + "grad_norm": 0.3947950303554535, + "learning_rate": 0.0001722247494768583, + "loss": 1.4634, + "step": 10701 + }, + { + "epoch": 0.1390676089876763, + "grad_norm": 0.4488944113254547, + "learning_rate": 0.0001722221500149469, + "loss": 1.5424, + "step": 10702 + }, + { + "epoch": 0.13908060353159218, + "grad_norm": 0.4175129234790802, + "learning_rate": 0.00017221955055303555, + "loss": 1.4955, + "step": 10703 + }, + { + "epoch": 0.13909359807550806, + "grad_norm": 0.4612016975879669, + "learning_rate": 0.00017221695109112414, + "loss": 1.5747, + "step": 10704 + }, + { + "epoch": 0.13910659261942393, + "grad_norm": 0.3313022255897522, + "learning_rate": 0.00017221435162921277, + "loss": 1.3595, + "step": 10705 + }, + { + "epoch": 0.1391195871633398, + "grad_norm": 0.31256556510925293, + "learning_rate": 0.0001722117521673014, + "loss": 1.2647, + "step": 10706 + }, + { + "epoch": 0.13913258170725568, + "grad_norm": 0.3766583800315857, + "learning_rate": 0.00017220915270539, + "loss": 1.5235, + "step": 10707 + }, + { + "epoch": 0.13914557625117155, + "grad_norm": 0.40331658720970154, + "learning_rate": 0.00017220655324347861, + "loss": 1.4741, + "step": 10708 + }, + { + "epoch": 0.13915857079508742, + "grad_norm": 0.4173697233200073, + "learning_rate": 0.0001722039537815672, + "loss": 1.738, + "step": 10709 + }, + { + "epoch": 0.1391715653390033, + "grad_norm": 0.4242304861545563, + "learning_rate": 0.00017220135431965584, + "loss": 1.5726, + "step": 10710 + }, + { + "epoch": 0.13918455988291917, + "grad_norm": 0.39564964175224304, + "learning_rate": 0.00017219875485774446, + "loss": 1.5576, + "step": 10711 + }, + { + "epoch": 0.13919755442683504, + "grad_norm": 0.3382103443145752, + "learning_rate": 0.00017219615539583306, + "loss": 1.2134, + "step": 10712 + }, + { + "epoch": 0.1392105489707509, + "grad_norm": 0.4230686128139496, + "learning_rate": 0.00017219355593392168, + "loss": 1.6145, + "step": 10713 + }, + { + "epoch": 0.1392235435146668, + "grad_norm": 0.34732839465141296, + "learning_rate": 0.0001721909564720103, + "loss": 1.4446, + "step": 10714 + }, + { + "epoch": 0.13923653805858266, + "grad_norm": 0.4478197991847992, + "learning_rate": 0.00017218835701009893, + "loss": 1.4869, + "step": 10715 + }, + { + "epoch": 0.13924953260249853, + "grad_norm": 0.35791486501693726, + "learning_rate": 0.00017218575754818753, + "loss": 1.256, + "step": 10716 + }, + { + "epoch": 0.1392625271464144, + "grad_norm": 0.3732423782348633, + "learning_rate": 0.00017218315808627615, + "loss": 1.4686, + "step": 10717 + }, + { + "epoch": 0.13927552169033028, + "grad_norm": 0.3404478132724762, + "learning_rate": 0.00017218055862436478, + "loss": 1.479, + "step": 10718 + }, + { + "epoch": 0.13928851623424615, + "grad_norm": 0.4723266065120697, + "learning_rate": 0.00017217795916245338, + "loss": 1.6898, + "step": 10719 + }, + { + "epoch": 0.13930151077816202, + "grad_norm": 0.3965282440185547, + "learning_rate": 0.000172175359700542, + "loss": 1.4205, + "step": 10720 + }, + { + "epoch": 0.1393145053220779, + "grad_norm": 0.5039243102073669, + "learning_rate": 0.0001721727602386306, + "loss": 1.4879, + "step": 10721 + }, + { + "epoch": 0.13932749986599377, + "grad_norm": 0.4013881981372833, + "learning_rate": 0.00017217016077671925, + "loss": 1.4122, + "step": 10722 + }, + { + "epoch": 0.13934049440990964, + "grad_norm": 0.33086255192756653, + "learning_rate": 0.00017216756131480785, + "loss": 1.2714, + "step": 10723 + }, + { + "epoch": 0.13935348895382552, + "grad_norm": 0.34979256987571716, + "learning_rate": 0.00017216496185289644, + "loss": 1.3973, + "step": 10724 + }, + { + "epoch": 0.1393664834977414, + "grad_norm": 0.8638617992401123, + "learning_rate": 0.00017216236239098507, + "loss": 1.5524, + "step": 10725 + }, + { + "epoch": 0.13937947804165726, + "grad_norm": 0.41470256447792053, + "learning_rate": 0.0001721597629290737, + "loss": 1.4213, + "step": 10726 + }, + { + "epoch": 0.13939247258557314, + "grad_norm": 0.4871446490287781, + "learning_rate": 0.00017215716346716232, + "loss": 1.4645, + "step": 10727 + }, + { + "epoch": 0.139405467129489, + "grad_norm": 0.35828638076782227, + "learning_rate": 0.00017215456400525091, + "loss": 1.4868, + "step": 10728 + }, + { + "epoch": 0.13941846167340488, + "grad_norm": 0.39944687485694885, + "learning_rate": 0.00017215196454333954, + "loss": 1.4677, + "step": 10729 + }, + { + "epoch": 0.13943145621732075, + "grad_norm": 0.3520928919315338, + "learning_rate": 0.00017214936508142816, + "loss": 1.2735, + "step": 10730 + }, + { + "epoch": 0.13944445076123663, + "grad_norm": 0.39557158946990967, + "learning_rate": 0.00017214676561951676, + "loss": 1.2845, + "step": 10731 + }, + { + "epoch": 0.1394574453051525, + "grad_norm": 0.3717704713344574, + "learning_rate": 0.00017214416615760539, + "loss": 1.5166, + "step": 10732 + }, + { + "epoch": 0.13947043984906837, + "grad_norm": 0.4318528175354004, + "learning_rate": 0.00017214156669569398, + "loss": 1.5827, + "step": 10733 + }, + { + "epoch": 0.13948343439298425, + "grad_norm": 0.3730836510658264, + "learning_rate": 0.00017213896723378263, + "loss": 1.3915, + "step": 10734 + }, + { + "epoch": 0.13949642893690012, + "grad_norm": 0.47108346223831177, + "learning_rate": 0.00017213636777187123, + "loss": 1.2788, + "step": 10735 + }, + { + "epoch": 0.139509423480816, + "grad_norm": 0.4092840850353241, + "learning_rate": 0.00017213376830995983, + "loss": 1.4124, + "step": 10736 + }, + { + "epoch": 0.13952241802473186, + "grad_norm": 0.46748873591423035, + "learning_rate": 0.00017213116884804845, + "loss": 1.5797, + "step": 10737 + }, + { + "epoch": 0.13953541256864774, + "grad_norm": 0.4005483090877533, + "learning_rate": 0.00017212856938613708, + "loss": 1.5289, + "step": 10738 + }, + { + "epoch": 0.1395484071125636, + "grad_norm": 0.41816186904907227, + "learning_rate": 0.0001721259699242257, + "loss": 1.3852, + "step": 10739 + }, + { + "epoch": 0.13956140165647948, + "grad_norm": 0.2817828357219696, + "learning_rate": 0.0001721233704623143, + "loss": 1.4576, + "step": 10740 + }, + { + "epoch": 0.13957439620039536, + "grad_norm": 0.398346483707428, + "learning_rate": 0.00017212077100040292, + "loss": 1.2757, + "step": 10741 + }, + { + "epoch": 0.13958739074431123, + "grad_norm": 0.4326547682285309, + "learning_rate": 0.00017211817153849155, + "loss": 1.3765, + "step": 10742 + }, + { + "epoch": 0.1396003852882271, + "grad_norm": 0.4393986463546753, + "learning_rate": 0.00017211557207658015, + "loss": 1.6248, + "step": 10743 + }, + { + "epoch": 0.13961337983214298, + "grad_norm": 0.38088586926460266, + "learning_rate": 0.00017211297261466877, + "loss": 1.2487, + "step": 10744 + }, + { + "epoch": 0.13962637437605885, + "grad_norm": 0.4307241439819336, + "learning_rate": 0.0001721103731527574, + "loss": 1.4015, + "step": 10745 + }, + { + "epoch": 0.13963936891997472, + "grad_norm": 0.43914666771888733, + "learning_rate": 0.00017210777369084602, + "loss": 1.4995, + "step": 10746 + }, + { + "epoch": 0.1396523634638906, + "grad_norm": 0.4074976444244385, + "learning_rate": 0.00017210517422893462, + "loss": 1.5153, + "step": 10747 + }, + { + "epoch": 0.13966535800780647, + "grad_norm": 0.40763652324676514, + "learning_rate": 0.00017210257476702321, + "loss": 1.4273, + "step": 10748 + }, + { + "epoch": 0.13967835255172234, + "grad_norm": 0.4152686297893524, + "learning_rate": 0.00017209997530511187, + "loss": 1.4594, + "step": 10749 + }, + { + "epoch": 0.1396913470956382, + "grad_norm": 0.2346455156803131, + "learning_rate": 0.00017209737584320046, + "loss": 1.4827, + "step": 10750 + }, + { + "epoch": 0.13970434163955409, + "grad_norm": 0.39084431529045105, + "learning_rate": 0.0001720947763812891, + "loss": 1.6069, + "step": 10751 + }, + { + "epoch": 0.13971733618346996, + "grad_norm": 0.4696912169456482, + "learning_rate": 0.00017209217691937769, + "loss": 1.4265, + "step": 10752 + }, + { + "epoch": 0.13973033072738583, + "grad_norm": 0.357042133808136, + "learning_rate": 0.0001720895774574663, + "loss": 1.3044, + "step": 10753 + }, + { + "epoch": 0.1397433252713017, + "grad_norm": 0.4950782358646393, + "learning_rate": 0.00017208697799555493, + "loss": 1.4042, + "step": 10754 + }, + { + "epoch": 0.13975631981521758, + "grad_norm": 0.4539402425289154, + "learning_rate": 0.00017208437853364353, + "loss": 1.5494, + "step": 10755 + }, + { + "epoch": 0.13976931435913345, + "grad_norm": 0.39369234442710876, + "learning_rate": 0.00017208177907173216, + "loss": 1.2591, + "step": 10756 + }, + { + "epoch": 0.13978230890304932, + "grad_norm": 0.3353974223136902, + "learning_rate": 0.00017207917960982078, + "loss": 1.3499, + "step": 10757 + }, + { + "epoch": 0.1397953034469652, + "grad_norm": 0.39832958579063416, + "learning_rate": 0.0001720765801479094, + "loss": 1.4912, + "step": 10758 + }, + { + "epoch": 0.13980829799088107, + "grad_norm": 0.33993783593177795, + "learning_rate": 0.000172073980685998, + "loss": 1.2443, + "step": 10759 + }, + { + "epoch": 0.13982129253479694, + "grad_norm": 0.30756813287734985, + "learning_rate": 0.00017207138122408663, + "loss": 1.2784, + "step": 10760 + }, + { + "epoch": 0.13983428707871282, + "grad_norm": 0.4411168694496155, + "learning_rate": 0.00017206878176217525, + "loss": 1.4152, + "step": 10761 + }, + { + "epoch": 0.1398472816226287, + "grad_norm": 0.40573054552078247, + "learning_rate": 0.00017206618230026385, + "loss": 1.4497, + "step": 10762 + }, + { + "epoch": 0.13986027616654456, + "grad_norm": 0.352725625038147, + "learning_rate": 0.00017206358283835247, + "loss": 1.4119, + "step": 10763 + }, + { + "epoch": 0.13987327071046043, + "grad_norm": 0.3691973090171814, + "learning_rate": 0.00017206098337644107, + "loss": 1.4265, + "step": 10764 + }, + { + "epoch": 0.1398862652543763, + "grad_norm": 0.36644649505615234, + "learning_rate": 0.0001720583839145297, + "loss": 1.276, + "step": 10765 + }, + { + "epoch": 0.13989925979829218, + "grad_norm": 0.40595853328704834, + "learning_rate": 0.00017205578445261832, + "loss": 1.5091, + "step": 10766 + }, + { + "epoch": 0.13991225434220805, + "grad_norm": 0.39276009798049927, + "learning_rate": 0.00017205318499070692, + "loss": 1.3175, + "step": 10767 + }, + { + "epoch": 0.13992524888612393, + "grad_norm": 0.5398718118667603, + "learning_rate": 0.00017205058552879554, + "loss": 1.3301, + "step": 10768 + }, + { + "epoch": 0.1399382434300398, + "grad_norm": 0.38218799233436584, + "learning_rate": 0.00017204798606688417, + "loss": 1.3951, + "step": 10769 + }, + { + "epoch": 0.13995123797395567, + "grad_norm": 0.428398460149765, + "learning_rate": 0.0001720453866049728, + "loss": 1.553, + "step": 10770 + }, + { + "epoch": 0.13996423251787155, + "grad_norm": 0.4070543944835663, + "learning_rate": 0.0001720427871430614, + "loss": 1.4543, + "step": 10771 + }, + { + "epoch": 0.13997722706178742, + "grad_norm": 0.465413361787796, + "learning_rate": 0.00017204018768115, + "loss": 1.2245, + "step": 10772 + }, + { + "epoch": 0.13999022160570332, + "grad_norm": 0.3781396746635437, + "learning_rate": 0.00017203758821923864, + "loss": 1.488, + "step": 10773 + }, + { + "epoch": 0.1400032161496192, + "grad_norm": 0.3066791892051697, + "learning_rate": 0.00017203498875732723, + "loss": 1.6712, + "step": 10774 + }, + { + "epoch": 0.14001621069353506, + "grad_norm": 0.3620041310787201, + "learning_rate": 0.00017203238929541586, + "loss": 1.4716, + "step": 10775 + }, + { + "epoch": 0.14002920523745094, + "grad_norm": 0.4115479290485382, + "learning_rate": 0.00017202978983350446, + "loss": 1.3786, + "step": 10776 + }, + { + "epoch": 0.1400421997813668, + "grad_norm": 0.3244473934173584, + "learning_rate": 0.0001720271903715931, + "loss": 1.3423, + "step": 10777 + }, + { + "epoch": 0.14005519432528268, + "grad_norm": 0.4513790011405945, + "learning_rate": 0.0001720245909096817, + "loss": 1.3319, + "step": 10778 + }, + { + "epoch": 0.14006818886919856, + "grad_norm": 0.35393035411834717, + "learning_rate": 0.0001720219914477703, + "loss": 1.3531, + "step": 10779 + }, + { + "epoch": 0.14008118341311443, + "grad_norm": 0.36377277970314026, + "learning_rate": 0.00017201939198585895, + "loss": 1.3916, + "step": 10780 + }, + { + "epoch": 0.1400941779570303, + "grad_norm": 0.4200163185596466, + "learning_rate": 0.00017201679252394755, + "loss": 1.3245, + "step": 10781 + }, + { + "epoch": 0.14010717250094618, + "grad_norm": 0.3571647107601166, + "learning_rate": 0.00017201419306203618, + "loss": 1.3705, + "step": 10782 + }, + { + "epoch": 0.14012016704486205, + "grad_norm": 0.4349687695503235, + "learning_rate": 0.00017201159360012477, + "loss": 1.4128, + "step": 10783 + }, + { + "epoch": 0.14013316158877792, + "grad_norm": 0.41650599241256714, + "learning_rate": 0.0001720089941382134, + "loss": 1.3617, + "step": 10784 + }, + { + "epoch": 0.1401461561326938, + "grad_norm": 0.4022287428379059, + "learning_rate": 0.00017200639467630202, + "loss": 1.4494, + "step": 10785 + }, + { + "epoch": 0.14015915067660967, + "grad_norm": 0.33680790662765503, + "learning_rate": 0.00017200379521439062, + "loss": 1.5274, + "step": 10786 + }, + { + "epoch": 0.14017214522052554, + "grad_norm": 0.46293941140174866, + "learning_rate": 0.00017200119575247924, + "loss": 1.2085, + "step": 10787 + }, + { + "epoch": 0.1401851397644414, + "grad_norm": 0.3471514582633972, + "learning_rate": 0.00017199859629056787, + "loss": 1.5912, + "step": 10788 + }, + { + "epoch": 0.1401981343083573, + "grad_norm": 0.32524195313453674, + "learning_rate": 0.0001719959968286565, + "loss": 1.4603, + "step": 10789 + }, + { + "epoch": 0.14021112885227316, + "grad_norm": 0.360619455575943, + "learning_rate": 0.0001719933973667451, + "loss": 1.3013, + "step": 10790 + }, + { + "epoch": 0.14022412339618903, + "grad_norm": 0.43202275037765503, + "learning_rate": 0.0001719907979048337, + "loss": 1.4518, + "step": 10791 + }, + { + "epoch": 0.1402371179401049, + "grad_norm": 0.42665886878967285, + "learning_rate": 0.00017198819844292234, + "loss": 1.4011, + "step": 10792 + }, + { + "epoch": 0.14025011248402078, + "grad_norm": 0.3868856728076935, + "learning_rate": 0.00017198559898101094, + "loss": 1.3267, + "step": 10793 + }, + { + "epoch": 0.14026310702793665, + "grad_norm": 0.3203757107257843, + "learning_rate": 0.00017198299951909956, + "loss": 1.4568, + "step": 10794 + }, + { + "epoch": 0.14027610157185252, + "grad_norm": 0.35420846939086914, + "learning_rate": 0.00017198040005718816, + "loss": 1.5337, + "step": 10795 + }, + { + "epoch": 0.1402890961157684, + "grad_norm": 0.4244528114795685, + "learning_rate": 0.00017197780059527678, + "loss": 1.3677, + "step": 10796 + }, + { + "epoch": 0.14030209065968427, + "grad_norm": 0.35529932379722595, + "learning_rate": 0.0001719752011333654, + "loss": 1.3323, + "step": 10797 + }, + { + "epoch": 0.14031508520360014, + "grad_norm": 0.37506797909736633, + "learning_rate": 0.000171972601671454, + "loss": 1.4979, + "step": 10798 + }, + { + "epoch": 0.14032807974751602, + "grad_norm": 0.32244589924812317, + "learning_rate": 0.00017197000220954263, + "loss": 1.5285, + "step": 10799 + }, + { + "epoch": 0.1403410742914319, + "grad_norm": 0.3708314597606659, + "learning_rate": 0.00017196740274763125, + "loss": 1.2533, + "step": 10800 + }, + { + "epoch": 0.14035406883534776, + "grad_norm": 0.3478354513645172, + "learning_rate": 0.00017196480328571988, + "loss": 1.4054, + "step": 10801 + }, + { + "epoch": 0.14036706337926363, + "grad_norm": 0.3617178201675415, + "learning_rate": 0.00017196220382380848, + "loss": 1.2874, + "step": 10802 + }, + { + "epoch": 0.1403800579231795, + "grad_norm": 0.3668794631958008, + "learning_rate": 0.00017195960436189707, + "loss": 1.4735, + "step": 10803 + }, + { + "epoch": 0.14039305246709538, + "grad_norm": 0.39561164379119873, + "learning_rate": 0.00017195700489998572, + "loss": 1.1695, + "step": 10804 + }, + { + "epoch": 0.14040604701101125, + "grad_norm": 0.45432788133621216, + "learning_rate": 0.00017195440543807432, + "loss": 1.4439, + "step": 10805 + }, + { + "epoch": 0.14041904155492713, + "grad_norm": 0.28599485754966736, + "learning_rate": 0.00017195180597616295, + "loss": 1.4012, + "step": 10806 + }, + { + "epoch": 0.140432036098843, + "grad_norm": 0.375226229429245, + "learning_rate": 0.00017194920651425154, + "loss": 1.2792, + "step": 10807 + }, + { + "epoch": 0.14044503064275887, + "grad_norm": 0.3512086272239685, + "learning_rate": 0.00017194660705234017, + "loss": 1.505, + "step": 10808 + }, + { + "epoch": 0.14045802518667475, + "grad_norm": 0.5066471695899963, + "learning_rate": 0.0001719440075904288, + "loss": 1.5636, + "step": 10809 + }, + { + "epoch": 0.14047101973059062, + "grad_norm": 0.42726922035217285, + "learning_rate": 0.0001719414081285174, + "loss": 1.5015, + "step": 10810 + }, + { + "epoch": 0.1404840142745065, + "grad_norm": 0.3772279620170593, + "learning_rate": 0.00017193880866660602, + "loss": 1.3784, + "step": 10811 + }, + { + "epoch": 0.14049700881842236, + "grad_norm": 0.38701918721199036, + "learning_rate": 0.00017193620920469464, + "loss": 1.3881, + "step": 10812 + }, + { + "epoch": 0.14051000336233824, + "grad_norm": 0.29993683099746704, + "learning_rate": 0.00017193360974278326, + "loss": 1.4011, + "step": 10813 + }, + { + "epoch": 0.1405229979062541, + "grad_norm": 0.45598283410072327, + "learning_rate": 0.00017193101028087186, + "loss": 1.4538, + "step": 10814 + }, + { + "epoch": 0.14053599245016998, + "grad_norm": 0.6004972457885742, + "learning_rate": 0.00017192841081896049, + "loss": 1.2832, + "step": 10815 + }, + { + "epoch": 0.14054898699408586, + "grad_norm": 0.2675066888332367, + "learning_rate": 0.0001719258113570491, + "loss": 1.3657, + "step": 10816 + }, + { + "epoch": 0.14056198153800173, + "grad_norm": 0.3949751853942871, + "learning_rate": 0.0001719232118951377, + "loss": 1.4446, + "step": 10817 + }, + { + "epoch": 0.1405749760819176, + "grad_norm": 0.41369563341140747, + "learning_rate": 0.00017192061243322633, + "loss": 1.3497, + "step": 10818 + }, + { + "epoch": 0.14058797062583347, + "grad_norm": 0.4654668867588043, + "learning_rate": 0.00017191801297131496, + "loss": 1.6589, + "step": 10819 + }, + { + "epoch": 0.14060096516974935, + "grad_norm": 0.39784547686576843, + "learning_rate": 0.00017191541350940355, + "loss": 1.4265, + "step": 10820 + }, + { + "epoch": 0.14061395971366522, + "grad_norm": 0.421874076128006, + "learning_rate": 0.00017191281404749218, + "loss": 1.6356, + "step": 10821 + }, + { + "epoch": 0.1406269542575811, + "grad_norm": 0.3859943449497223, + "learning_rate": 0.00017191021458558078, + "loss": 1.2623, + "step": 10822 + }, + { + "epoch": 0.14063994880149697, + "grad_norm": 0.3922678828239441, + "learning_rate": 0.00017190761512366943, + "loss": 1.3465, + "step": 10823 + }, + { + "epoch": 0.14065294334541284, + "grad_norm": 0.39367225766181946, + "learning_rate": 0.00017190501566175802, + "loss": 1.5162, + "step": 10824 + }, + { + "epoch": 0.1406659378893287, + "grad_norm": 0.3528103530406952, + "learning_rate": 0.00017190241619984665, + "loss": 1.2572, + "step": 10825 + }, + { + "epoch": 0.14067893243324459, + "grad_norm": 0.38575342297554016, + "learning_rate": 0.00017189981673793525, + "loss": 1.5632, + "step": 10826 + }, + { + "epoch": 0.14069192697716046, + "grad_norm": 0.44456741213798523, + "learning_rate": 0.00017189721727602387, + "loss": 1.3928, + "step": 10827 + }, + { + "epoch": 0.14070492152107633, + "grad_norm": 0.4356706142425537, + "learning_rate": 0.0001718946178141125, + "loss": 1.5353, + "step": 10828 + }, + { + "epoch": 0.1407179160649922, + "grad_norm": 0.5134885311126709, + "learning_rate": 0.0001718920183522011, + "loss": 1.6023, + "step": 10829 + }, + { + "epoch": 0.14073091060890808, + "grad_norm": 0.4360247254371643, + "learning_rate": 0.00017188941889028972, + "loss": 1.4886, + "step": 10830 + }, + { + "epoch": 0.14074390515282395, + "grad_norm": 0.3940284550189972, + "learning_rate": 0.00017188681942837834, + "loss": 1.2592, + "step": 10831 + }, + { + "epoch": 0.14075689969673982, + "grad_norm": 0.4224902093410492, + "learning_rate": 0.00017188421996646694, + "loss": 1.4618, + "step": 10832 + }, + { + "epoch": 0.1407698942406557, + "grad_norm": 0.3279314935207367, + "learning_rate": 0.00017188162050455556, + "loss": 1.4254, + "step": 10833 + }, + { + "epoch": 0.14078288878457157, + "grad_norm": 0.36889636516571045, + "learning_rate": 0.00017187902104264416, + "loss": 1.3752, + "step": 10834 + }, + { + "epoch": 0.14079588332848744, + "grad_norm": 0.38372352719306946, + "learning_rate": 0.0001718764215807328, + "loss": 1.5421, + "step": 10835 + }, + { + "epoch": 0.14080887787240332, + "grad_norm": 0.4109812080860138, + "learning_rate": 0.0001718738221188214, + "loss": 1.2519, + "step": 10836 + }, + { + "epoch": 0.1408218724163192, + "grad_norm": 0.3611900210380554, + "learning_rate": 0.00017187122265691003, + "loss": 1.5434, + "step": 10837 + }, + { + "epoch": 0.14083486696023506, + "grad_norm": 0.3826359212398529, + "learning_rate": 0.00017186862319499863, + "loss": 1.2481, + "step": 10838 + }, + { + "epoch": 0.14084786150415093, + "grad_norm": 0.5015074610710144, + "learning_rate": 0.00017186602373308726, + "loss": 1.3544, + "step": 10839 + }, + { + "epoch": 0.1408608560480668, + "grad_norm": 0.39184704422950745, + "learning_rate": 0.00017186342427117588, + "loss": 1.3709, + "step": 10840 + }, + { + "epoch": 0.14087385059198268, + "grad_norm": 0.46706393361091614, + "learning_rate": 0.00017186082480926448, + "loss": 1.45, + "step": 10841 + }, + { + "epoch": 0.14088684513589855, + "grad_norm": 0.43448421359062195, + "learning_rate": 0.0001718582253473531, + "loss": 1.5921, + "step": 10842 + }, + { + "epoch": 0.14089983967981443, + "grad_norm": 0.4248597025871277, + "learning_rate": 0.00017185562588544173, + "loss": 1.5616, + "step": 10843 + }, + { + "epoch": 0.1409128342237303, + "grad_norm": 0.39774712920188904, + "learning_rate": 0.00017185302642353035, + "loss": 1.4397, + "step": 10844 + }, + { + "epoch": 0.14092582876764617, + "grad_norm": 0.3106837868690491, + "learning_rate": 0.00017185042696161895, + "loss": 1.4959, + "step": 10845 + }, + { + "epoch": 0.14093882331156204, + "grad_norm": 0.45596882700920105, + "learning_rate": 0.00017184782749970755, + "loss": 1.3741, + "step": 10846 + }, + { + "epoch": 0.14095181785547792, + "grad_norm": 0.4356490969657898, + "learning_rate": 0.0001718452280377962, + "loss": 1.3784, + "step": 10847 + }, + { + "epoch": 0.1409648123993938, + "grad_norm": 0.3726494014263153, + "learning_rate": 0.0001718426285758848, + "loss": 1.5095, + "step": 10848 + }, + { + "epoch": 0.1409778069433097, + "grad_norm": 0.4151771366596222, + "learning_rate": 0.00017184002911397342, + "loss": 1.2665, + "step": 10849 + }, + { + "epoch": 0.14099080148722556, + "grad_norm": 0.32824963331222534, + "learning_rate": 0.00017183742965206202, + "loss": 1.3579, + "step": 10850 + }, + { + "epoch": 0.14100379603114144, + "grad_norm": 0.3854600191116333, + "learning_rate": 0.00017183483019015064, + "loss": 1.5345, + "step": 10851 + }, + { + "epoch": 0.1410167905750573, + "grad_norm": 0.32627803087234497, + "learning_rate": 0.00017183223072823927, + "loss": 1.3422, + "step": 10852 + }, + { + "epoch": 0.14102978511897318, + "grad_norm": 0.4469495117664337, + "learning_rate": 0.00017182963126632786, + "loss": 1.4616, + "step": 10853 + }, + { + "epoch": 0.14104277966288906, + "grad_norm": 0.29169851541519165, + "learning_rate": 0.00017182703180441652, + "loss": 1.3603, + "step": 10854 + }, + { + "epoch": 0.14105577420680493, + "grad_norm": 0.29822713136672974, + "learning_rate": 0.0001718244323425051, + "loss": 1.4908, + "step": 10855 + }, + { + "epoch": 0.1410687687507208, + "grad_norm": 0.3460671901702881, + "learning_rate": 0.00017182183288059374, + "loss": 1.1356, + "step": 10856 + }, + { + "epoch": 0.14108176329463668, + "grad_norm": 0.4160926342010498, + "learning_rate": 0.00017181923341868233, + "loss": 1.403, + "step": 10857 + }, + { + "epoch": 0.14109475783855255, + "grad_norm": 0.42342308163642883, + "learning_rate": 0.00017181663395677096, + "loss": 1.5801, + "step": 10858 + }, + { + "epoch": 0.14110775238246842, + "grad_norm": 0.42777422070503235, + "learning_rate": 0.00017181403449485958, + "loss": 1.4779, + "step": 10859 + }, + { + "epoch": 0.1411207469263843, + "grad_norm": 0.4467618763446808, + "learning_rate": 0.00017181143503294818, + "loss": 1.5562, + "step": 10860 + }, + { + "epoch": 0.14113374147030017, + "grad_norm": 0.3858185112476349, + "learning_rate": 0.0001718088355710368, + "loss": 1.5842, + "step": 10861 + }, + { + "epoch": 0.14114673601421604, + "grad_norm": 0.39428919553756714, + "learning_rate": 0.00017180623610912543, + "loss": 1.5314, + "step": 10862 + }, + { + "epoch": 0.1411597305581319, + "grad_norm": 0.335600882768631, + "learning_rate": 0.00017180363664721403, + "loss": 1.6467, + "step": 10863 + }, + { + "epoch": 0.14117272510204779, + "grad_norm": 0.5017620921134949, + "learning_rate": 0.00017180103718530265, + "loss": 1.414, + "step": 10864 + }, + { + "epoch": 0.14118571964596366, + "grad_norm": 0.4224272668361664, + "learning_rate": 0.00017179843772339125, + "loss": 1.5467, + "step": 10865 + }, + { + "epoch": 0.14119871418987953, + "grad_norm": 0.39809706807136536, + "learning_rate": 0.0001717958382614799, + "loss": 1.4449, + "step": 10866 + }, + { + "epoch": 0.1412117087337954, + "grad_norm": 0.2664085030555725, + "learning_rate": 0.0001717932387995685, + "loss": 1.2187, + "step": 10867 + }, + { + "epoch": 0.14122470327771128, + "grad_norm": 0.36554619669914246, + "learning_rate": 0.00017179063933765712, + "loss": 1.3219, + "step": 10868 + }, + { + "epoch": 0.14123769782162715, + "grad_norm": 0.4317478835582733, + "learning_rate": 0.00017178803987574572, + "loss": 1.4364, + "step": 10869 + }, + { + "epoch": 0.14125069236554302, + "grad_norm": 0.46758297085762024, + "learning_rate": 0.00017178544041383434, + "loss": 1.5511, + "step": 10870 + }, + { + "epoch": 0.1412636869094589, + "grad_norm": 0.3566475212574005, + "learning_rate": 0.00017178284095192297, + "loss": 1.048, + "step": 10871 + }, + { + "epoch": 0.14127668145337477, + "grad_norm": 0.42115387320518494, + "learning_rate": 0.00017178024149001157, + "loss": 1.4822, + "step": 10872 + }, + { + "epoch": 0.14128967599729064, + "grad_norm": 0.4404464662075043, + "learning_rate": 0.0001717776420281002, + "loss": 1.5487, + "step": 10873 + }, + { + "epoch": 0.14130267054120652, + "grad_norm": 0.3825858235359192, + "learning_rate": 0.00017177504256618882, + "loss": 1.3018, + "step": 10874 + }, + { + "epoch": 0.1413156650851224, + "grad_norm": 0.3493032455444336, + "learning_rate": 0.0001717724431042774, + "loss": 1.4301, + "step": 10875 + }, + { + "epoch": 0.14132865962903826, + "grad_norm": 0.44915756583213806, + "learning_rate": 0.00017176984364236604, + "loss": 1.6921, + "step": 10876 + }, + { + "epoch": 0.14134165417295413, + "grad_norm": 0.3446107506752014, + "learning_rate": 0.00017176724418045463, + "loss": 1.3504, + "step": 10877 + }, + { + "epoch": 0.14135464871687, + "grad_norm": 0.31326380372047424, + "learning_rate": 0.00017176464471854329, + "loss": 1.5635, + "step": 10878 + }, + { + "epoch": 0.14136764326078588, + "grad_norm": 0.46126633882522583, + "learning_rate": 0.00017176204525663188, + "loss": 1.4412, + "step": 10879 + }, + { + "epoch": 0.14138063780470175, + "grad_norm": 0.40074828267097473, + "learning_rate": 0.0001717594457947205, + "loss": 1.3619, + "step": 10880 + }, + { + "epoch": 0.14139363234861763, + "grad_norm": 0.3470706045627594, + "learning_rate": 0.0001717568463328091, + "loss": 1.4544, + "step": 10881 + }, + { + "epoch": 0.1414066268925335, + "grad_norm": 0.3700906038284302, + "learning_rate": 0.00017175424687089773, + "loss": 1.2816, + "step": 10882 + }, + { + "epoch": 0.14141962143644937, + "grad_norm": 0.41927361488342285, + "learning_rate": 0.00017175164740898635, + "loss": 1.4843, + "step": 10883 + }, + { + "epoch": 0.14143261598036524, + "grad_norm": 0.38305652141571045, + "learning_rate": 0.00017174904794707495, + "loss": 1.3507, + "step": 10884 + }, + { + "epoch": 0.14144561052428112, + "grad_norm": 0.4511977732181549, + "learning_rate": 0.00017174644848516358, + "loss": 1.3806, + "step": 10885 + }, + { + "epoch": 0.141458605068197, + "grad_norm": 0.3517056107521057, + "learning_rate": 0.0001717438490232522, + "loss": 1.3782, + "step": 10886 + }, + { + "epoch": 0.14147159961211286, + "grad_norm": 0.35727646946907043, + "learning_rate": 0.0001717412495613408, + "loss": 1.1464, + "step": 10887 + }, + { + "epoch": 0.14148459415602874, + "grad_norm": 0.3452603816986084, + "learning_rate": 0.00017173865009942942, + "loss": 1.3215, + "step": 10888 + }, + { + "epoch": 0.1414975886999446, + "grad_norm": 0.4418522119522095, + "learning_rate": 0.00017173605063751802, + "loss": 1.4409, + "step": 10889 + }, + { + "epoch": 0.14151058324386048, + "grad_norm": 0.47208675742149353, + "learning_rate": 0.00017173345117560667, + "loss": 1.4753, + "step": 10890 + }, + { + "epoch": 0.14152357778777636, + "grad_norm": 0.4535081684589386, + "learning_rate": 0.00017173085171369527, + "loss": 1.3373, + "step": 10891 + }, + { + "epoch": 0.14153657233169223, + "grad_norm": 0.323885977268219, + "learning_rate": 0.0001717282522517839, + "loss": 1.2169, + "step": 10892 + }, + { + "epoch": 0.1415495668756081, + "grad_norm": 0.4445568919181824, + "learning_rate": 0.00017172565278987252, + "loss": 1.4632, + "step": 10893 + }, + { + "epoch": 0.14156256141952397, + "grad_norm": 0.4691617488861084, + "learning_rate": 0.00017172305332796112, + "loss": 1.5746, + "step": 10894 + }, + { + "epoch": 0.14157555596343985, + "grad_norm": 0.33492690324783325, + "learning_rate": 0.00017172045386604974, + "loss": 1.2249, + "step": 10895 + }, + { + "epoch": 0.14158855050735572, + "grad_norm": 0.4575447738170624, + "learning_rate": 0.00017171785440413834, + "loss": 1.44, + "step": 10896 + }, + { + "epoch": 0.1416015450512716, + "grad_norm": 0.4821552038192749, + "learning_rate": 0.000171715254942227, + "loss": 1.5711, + "step": 10897 + }, + { + "epoch": 0.14161453959518747, + "grad_norm": 0.4753732681274414, + "learning_rate": 0.00017171265548031559, + "loss": 1.5572, + "step": 10898 + }, + { + "epoch": 0.14162753413910334, + "grad_norm": 0.3616781532764435, + "learning_rate": 0.0001717100560184042, + "loss": 1.5594, + "step": 10899 + }, + { + "epoch": 0.1416405286830192, + "grad_norm": 0.42938709259033203, + "learning_rate": 0.0001717074565564928, + "loss": 1.3992, + "step": 10900 + }, + { + "epoch": 0.14165352322693509, + "grad_norm": 0.44628751277923584, + "learning_rate": 0.00017170485709458143, + "loss": 1.5046, + "step": 10901 + }, + { + "epoch": 0.14166651777085096, + "grad_norm": 0.4351179003715515, + "learning_rate": 0.00017170225763267006, + "loss": 1.6107, + "step": 10902 + }, + { + "epoch": 0.14167951231476683, + "grad_norm": 0.4173390865325928, + "learning_rate": 0.00017169965817075865, + "loss": 1.377, + "step": 10903 + }, + { + "epoch": 0.1416925068586827, + "grad_norm": 0.39643236994743347, + "learning_rate": 0.00017169705870884728, + "loss": 1.5887, + "step": 10904 + }, + { + "epoch": 0.14170550140259858, + "grad_norm": 0.38675668835639954, + "learning_rate": 0.0001716944592469359, + "loss": 1.6578, + "step": 10905 + }, + { + "epoch": 0.14171849594651445, + "grad_norm": 0.4231336712837219, + "learning_rate": 0.0001716918597850245, + "loss": 1.5683, + "step": 10906 + }, + { + "epoch": 0.14173149049043032, + "grad_norm": 0.3367353677749634, + "learning_rate": 0.00017168926032311313, + "loss": 1.4348, + "step": 10907 + }, + { + "epoch": 0.1417444850343462, + "grad_norm": 0.3846409320831299, + "learning_rate": 0.00017168666086120172, + "loss": 1.5294, + "step": 10908 + }, + { + "epoch": 0.14175747957826207, + "grad_norm": 0.46416211128234863, + "learning_rate": 0.00017168406139929037, + "loss": 1.3925, + "step": 10909 + }, + { + "epoch": 0.14177047412217794, + "grad_norm": 0.28698790073394775, + "learning_rate": 0.00017168146193737897, + "loss": 1.373, + "step": 10910 + }, + { + "epoch": 0.14178346866609381, + "grad_norm": 0.4230034649372101, + "learning_rate": 0.0001716788624754676, + "loss": 1.632, + "step": 10911 + }, + { + "epoch": 0.1417964632100097, + "grad_norm": 0.4080103039741516, + "learning_rate": 0.0001716762630135562, + "loss": 1.5997, + "step": 10912 + }, + { + "epoch": 0.14180945775392556, + "grad_norm": 0.3559096157550812, + "learning_rate": 0.00017167366355164482, + "loss": 1.3581, + "step": 10913 + }, + { + "epoch": 0.14182245229784143, + "grad_norm": 0.4752035140991211, + "learning_rate": 0.00017167106408973344, + "loss": 1.6462, + "step": 10914 + }, + { + "epoch": 0.1418354468417573, + "grad_norm": 0.45039257407188416, + "learning_rate": 0.00017166846462782204, + "loss": 1.3962, + "step": 10915 + }, + { + "epoch": 0.14184844138567318, + "grad_norm": 0.46960341930389404, + "learning_rate": 0.00017166586516591066, + "loss": 1.5447, + "step": 10916 + }, + { + "epoch": 0.14186143592958905, + "grad_norm": 0.5272099375724792, + "learning_rate": 0.0001716632657039993, + "loss": 1.2417, + "step": 10917 + }, + { + "epoch": 0.14187443047350493, + "grad_norm": 0.3738939166069031, + "learning_rate": 0.00017166066624208789, + "loss": 1.5318, + "step": 10918 + }, + { + "epoch": 0.1418874250174208, + "grad_norm": 0.3941914737224579, + "learning_rate": 0.0001716580667801765, + "loss": 1.4321, + "step": 10919 + }, + { + "epoch": 0.14190041956133667, + "grad_norm": 0.39023685455322266, + "learning_rate": 0.0001716554673182651, + "loss": 1.3207, + "step": 10920 + }, + { + "epoch": 0.14191341410525254, + "grad_norm": 0.40618133544921875, + "learning_rate": 0.00017165286785635376, + "loss": 1.5365, + "step": 10921 + }, + { + "epoch": 0.14192640864916842, + "grad_norm": 0.4446902573108673, + "learning_rate": 0.00017165026839444236, + "loss": 1.5207, + "step": 10922 + }, + { + "epoch": 0.1419394031930843, + "grad_norm": 0.35658472776412964, + "learning_rate": 0.00017164766893253098, + "loss": 1.4366, + "step": 10923 + }, + { + "epoch": 0.14195239773700016, + "grad_norm": 0.4167204201221466, + "learning_rate": 0.00017164506947061958, + "loss": 1.4769, + "step": 10924 + }, + { + "epoch": 0.14196539228091606, + "grad_norm": 0.3677447438240051, + "learning_rate": 0.0001716424700087082, + "loss": 1.5478, + "step": 10925 + }, + { + "epoch": 0.14197838682483194, + "grad_norm": 0.39147135615348816, + "learning_rate": 0.00017163987054679683, + "loss": 1.3728, + "step": 10926 + }, + { + "epoch": 0.1419913813687478, + "grad_norm": 0.5226168632507324, + "learning_rate": 0.00017163727108488543, + "loss": 1.3327, + "step": 10927 + }, + { + "epoch": 0.14200437591266368, + "grad_norm": 0.5036211609840393, + "learning_rate": 0.00017163467162297408, + "loss": 1.3846, + "step": 10928 + }, + { + "epoch": 0.14201737045657956, + "grad_norm": 0.41347211599349976, + "learning_rate": 0.00017163207216106267, + "loss": 1.5415, + "step": 10929 + }, + { + "epoch": 0.14203036500049543, + "grad_norm": 0.4115000367164612, + "learning_rate": 0.00017162947269915127, + "loss": 1.4409, + "step": 10930 + }, + { + "epoch": 0.1420433595444113, + "grad_norm": 0.27023792266845703, + "learning_rate": 0.0001716268732372399, + "loss": 1.2019, + "step": 10931 + }, + { + "epoch": 0.14205635408832717, + "grad_norm": 0.42166751623153687, + "learning_rate": 0.00017162427377532852, + "loss": 1.4693, + "step": 10932 + }, + { + "epoch": 0.14206934863224305, + "grad_norm": 0.33947116136550903, + "learning_rate": 0.00017162167431341714, + "loss": 1.3955, + "step": 10933 + }, + { + "epoch": 0.14208234317615892, + "grad_norm": 0.4271692633628845, + "learning_rate": 0.00017161907485150574, + "loss": 1.4593, + "step": 10934 + }, + { + "epoch": 0.1420953377200748, + "grad_norm": 0.4756326973438263, + "learning_rate": 0.00017161647538959437, + "loss": 1.3656, + "step": 10935 + }, + { + "epoch": 0.14210833226399067, + "grad_norm": 0.3282777667045593, + "learning_rate": 0.000171613875927683, + "loss": 1.2668, + "step": 10936 + }, + { + "epoch": 0.14212132680790654, + "grad_norm": 0.2732298672199249, + "learning_rate": 0.0001716112764657716, + "loss": 1.3854, + "step": 10937 + }, + { + "epoch": 0.1421343213518224, + "grad_norm": 0.34180983901023865, + "learning_rate": 0.0001716086770038602, + "loss": 1.3432, + "step": 10938 + }, + { + "epoch": 0.14214731589573829, + "grad_norm": 0.2745610177516937, + "learning_rate": 0.0001716060775419488, + "loss": 1.2864, + "step": 10939 + }, + { + "epoch": 0.14216031043965416, + "grad_norm": 0.561955988407135, + "learning_rate": 0.00017160347808003746, + "loss": 1.6208, + "step": 10940 + }, + { + "epoch": 0.14217330498357003, + "grad_norm": 0.3340148329734802, + "learning_rate": 0.00017160087861812606, + "loss": 1.4183, + "step": 10941 + }, + { + "epoch": 0.1421862995274859, + "grad_norm": 0.4886804521083832, + "learning_rate": 0.00017159827915621466, + "loss": 1.4969, + "step": 10942 + }, + { + "epoch": 0.14219929407140178, + "grad_norm": 0.3795613646507263, + "learning_rate": 0.00017159567969430328, + "loss": 1.4586, + "step": 10943 + }, + { + "epoch": 0.14221228861531765, + "grad_norm": 0.4117739200592041, + "learning_rate": 0.0001715930802323919, + "loss": 1.4792, + "step": 10944 + }, + { + "epoch": 0.14222528315923352, + "grad_norm": 0.4263666570186615, + "learning_rate": 0.00017159048077048053, + "loss": 1.4526, + "step": 10945 + }, + { + "epoch": 0.1422382777031494, + "grad_norm": 0.5333686470985413, + "learning_rate": 0.00017158788130856913, + "loss": 1.5833, + "step": 10946 + }, + { + "epoch": 0.14225127224706527, + "grad_norm": 0.3140769898891449, + "learning_rate": 0.00017158528184665775, + "loss": 1.4418, + "step": 10947 + }, + { + "epoch": 0.14226426679098114, + "grad_norm": 0.3371421992778778, + "learning_rate": 0.00017158268238474638, + "loss": 1.385, + "step": 10948 + }, + { + "epoch": 0.14227726133489701, + "grad_norm": 0.4432927370071411, + "learning_rate": 0.00017158008292283497, + "loss": 1.2335, + "step": 10949 + }, + { + "epoch": 0.1422902558788129, + "grad_norm": 0.4457622170448303, + "learning_rate": 0.0001715774834609236, + "loss": 1.3749, + "step": 10950 + }, + { + "epoch": 0.14230325042272876, + "grad_norm": 0.43068206310272217, + "learning_rate": 0.0001715748839990122, + "loss": 1.3275, + "step": 10951 + }, + { + "epoch": 0.14231624496664463, + "grad_norm": 0.49113088846206665, + "learning_rate": 0.00017157228453710085, + "loss": 1.3536, + "step": 10952 + }, + { + "epoch": 0.1423292395105605, + "grad_norm": 0.312968909740448, + "learning_rate": 0.00017156968507518944, + "loss": 1.2682, + "step": 10953 + }, + { + "epoch": 0.14234223405447638, + "grad_norm": 0.3179893493652344, + "learning_rate": 0.00017156708561327804, + "loss": 1.5962, + "step": 10954 + }, + { + "epoch": 0.14235522859839225, + "grad_norm": 0.4181991517543793, + "learning_rate": 0.00017156448615136667, + "loss": 1.4341, + "step": 10955 + }, + { + "epoch": 0.14236822314230813, + "grad_norm": 0.34897053241729736, + "learning_rate": 0.0001715618866894553, + "loss": 1.2624, + "step": 10956 + }, + { + "epoch": 0.142381217686224, + "grad_norm": 0.4222790598869324, + "learning_rate": 0.00017155928722754392, + "loss": 1.3553, + "step": 10957 + }, + { + "epoch": 0.14239421223013987, + "grad_norm": 0.4330940544605255, + "learning_rate": 0.0001715566877656325, + "loss": 1.2607, + "step": 10958 + }, + { + "epoch": 0.14240720677405574, + "grad_norm": 0.44673553109169006, + "learning_rate": 0.00017155408830372114, + "loss": 1.4648, + "step": 10959 + }, + { + "epoch": 0.14242020131797162, + "grad_norm": 0.38661807775497437, + "learning_rate": 0.00017155148884180976, + "loss": 1.445, + "step": 10960 + }, + { + "epoch": 0.1424331958618875, + "grad_norm": 0.3997431993484497, + "learning_rate": 0.00017154888937989836, + "loss": 1.2605, + "step": 10961 + }, + { + "epoch": 0.14244619040580336, + "grad_norm": 0.362303227186203, + "learning_rate": 0.00017154628991798698, + "loss": 1.2934, + "step": 10962 + }, + { + "epoch": 0.14245918494971924, + "grad_norm": 0.39247629046440125, + "learning_rate": 0.00017154369045607558, + "loss": 1.5652, + "step": 10963 + }, + { + "epoch": 0.1424721794936351, + "grad_norm": 0.5087887644767761, + "learning_rate": 0.00017154109099416423, + "loss": 1.6123, + "step": 10964 + }, + { + "epoch": 0.14248517403755098, + "grad_norm": 0.43665438890457153, + "learning_rate": 0.00017153849153225283, + "loss": 1.5289, + "step": 10965 + }, + { + "epoch": 0.14249816858146686, + "grad_norm": 0.27714627981185913, + "learning_rate": 0.00017153589207034145, + "loss": 1.2534, + "step": 10966 + }, + { + "epoch": 0.14251116312538273, + "grad_norm": 0.46377792954444885, + "learning_rate": 0.00017153329260843008, + "loss": 1.5276, + "step": 10967 + }, + { + "epoch": 0.1425241576692986, + "grad_norm": 0.46302077174186707, + "learning_rate": 0.00017153069314651868, + "loss": 1.5346, + "step": 10968 + }, + { + "epoch": 0.14253715221321447, + "grad_norm": 0.44581708312034607, + "learning_rate": 0.0001715280936846073, + "loss": 1.4782, + "step": 10969 + }, + { + "epoch": 0.14255014675713035, + "grad_norm": 0.497243195772171, + "learning_rate": 0.0001715254942226959, + "loss": 1.5687, + "step": 10970 + }, + { + "epoch": 0.14256314130104622, + "grad_norm": 0.3940199315547943, + "learning_rate": 0.00017152289476078452, + "loss": 1.3456, + "step": 10971 + }, + { + "epoch": 0.1425761358449621, + "grad_norm": 0.48827141523361206, + "learning_rate": 0.00017152029529887315, + "loss": 1.4532, + "step": 10972 + }, + { + "epoch": 0.14258913038887797, + "grad_norm": 0.32716068625450134, + "learning_rate": 0.00017151769583696174, + "loss": 1.4463, + "step": 10973 + }, + { + "epoch": 0.14260212493279384, + "grad_norm": 0.35811272263526917, + "learning_rate": 0.00017151509637505037, + "loss": 1.3541, + "step": 10974 + }, + { + "epoch": 0.1426151194767097, + "grad_norm": 0.37943723797798157, + "learning_rate": 0.000171512496913139, + "loss": 1.5692, + "step": 10975 + }, + { + "epoch": 0.14262811402062558, + "grad_norm": 0.3421405255794525, + "learning_rate": 0.00017150989745122762, + "loss": 1.5162, + "step": 10976 + }, + { + "epoch": 0.14264110856454146, + "grad_norm": 0.4516383111476898, + "learning_rate": 0.00017150729798931622, + "loss": 1.4494, + "step": 10977 + }, + { + "epoch": 0.14265410310845733, + "grad_norm": 0.493931382894516, + "learning_rate": 0.00017150469852740484, + "loss": 1.3791, + "step": 10978 + }, + { + "epoch": 0.1426670976523732, + "grad_norm": 0.26790735125541687, + "learning_rate": 0.00017150209906549346, + "loss": 1.2092, + "step": 10979 + }, + { + "epoch": 0.14268009219628908, + "grad_norm": 0.3421579599380493, + "learning_rate": 0.00017149949960358206, + "loss": 1.481, + "step": 10980 + }, + { + "epoch": 0.14269308674020495, + "grad_norm": 0.25323083996772766, + "learning_rate": 0.0001714969001416707, + "loss": 1.3159, + "step": 10981 + }, + { + "epoch": 0.14270608128412082, + "grad_norm": 0.42508524656295776, + "learning_rate": 0.00017149430067975928, + "loss": 1.5003, + "step": 10982 + }, + { + "epoch": 0.1427190758280367, + "grad_norm": 0.34492215514183044, + "learning_rate": 0.00017149170121784794, + "loss": 1.5328, + "step": 10983 + }, + { + "epoch": 0.14273207037195257, + "grad_norm": 0.35682275891304016, + "learning_rate": 0.00017148910175593653, + "loss": 1.3526, + "step": 10984 + }, + { + "epoch": 0.14274506491586844, + "grad_norm": 0.5120967030525208, + "learning_rate": 0.00017148650229402513, + "loss": 1.5462, + "step": 10985 + }, + { + "epoch": 0.14275805945978431, + "grad_norm": 0.30757835507392883, + "learning_rate": 0.00017148390283211375, + "loss": 1.3035, + "step": 10986 + }, + { + "epoch": 0.1427710540037002, + "grad_norm": 0.5287626385688782, + "learning_rate": 0.00017148130337020238, + "loss": 1.475, + "step": 10987 + }, + { + "epoch": 0.14278404854761606, + "grad_norm": 0.3142072260379791, + "learning_rate": 0.000171478703908291, + "loss": 1.2456, + "step": 10988 + }, + { + "epoch": 0.14279704309153193, + "grad_norm": 0.38019683957099915, + "learning_rate": 0.0001714761044463796, + "loss": 1.3963, + "step": 10989 + }, + { + "epoch": 0.1428100376354478, + "grad_norm": 0.39742204546928406, + "learning_rate": 0.00017147350498446823, + "loss": 1.5119, + "step": 10990 + }, + { + "epoch": 0.14282303217936368, + "grad_norm": 0.3889755606651306, + "learning_rate": 0.00017147090552255685, + "loss": 1.1885, + "step": 10991 + }, + { + "epoch": 0.14283602672327955, + "grad_norm": 0.4565885663032532, + "learning_rate": 0.00017146830606064545, + "loss": 1.2739, + "step": 10992 + }, + { + "epoch": 0.14284902126719543, + "grad_norm": 0.44561681151390076, + "learning_rate": 0.00017146570659873407, + "loss": 1.484, + "step": 10993 + }, + { + "epoch": 0.1428620158111113, + "grad_norm": 0.4350430369377136, + "learning_rate": 0.00017146310713682267, + "loss": 1.4873, + "step": 10994 + }, + { + "epoch": 0.14287501035502717, + "grad_norm": 0.33578014373779297, + "learning_rate": 0.00017146050767491132, + "loss": 1.5052, + "step": 10995 + }, + { + "epoch": 0.14288800489894304, + "grad_norm": 0.44567304849624634, + "learning_rate": 0.00017145790821299992, + "loss": 1.5442, + "step": 10996 + }, + { + "epoch": 0.14290099944285892, + "grad_norm": 0.4245055317878723, + "learning_rate": 0.00017145530875108852, + "loss": 1.3197, + "step": 10997 + }, + { + "epoch": 0.1429139939867748, + "grad_norm": 0.3837421238422394, + "learning_rate": 0.00017145270928917714, + "loss": 1.5555, + "step": 10998 + }, + { + "epoch": 0.14292698853069066, + "grad_norm": 0.3903602063655853, + "learning_rate": 0.00017145010982726576, + "loss": 1.2573, + "step": 10999 + }, + { + "epoch": 0.14293998307460654, + "grad_norm": 0.4907805025577545, + "learning_rate": 0.0001714475103653544, + "loss": 1.5547, + "step": 11000 + }, + { + "epoch": 0.14295297761852244, + "grad_norm": 0.44125404953956604, + "learning_rate": 0.000171444910903443, + "loss": 1.6019, + "step": 11001 + }, + { + "epoch": 0.1429659721624383, + "grad_norm": 0.3582591712474823, + "learning_rate": 0.0001714423114415316, + "loss": 1.3749, + "step": 11002 + }, + { + "epoch": 0.14297896670635418, + "grad_norm": 0.42408159375190735, + "learning_rate": 0.00017143971197962024, + "loss": 1.4933, + "step": 11003 + }, + { + "epoch": 0.14299196125027006, + "grad_norm": 0.3925914764404297, + "learning_rate": 0.00017143711251770883, + "loss": 1.3263, + "step": 11004 + }, + { + "epoch": 0.14300495579418593, + "grad_norm": 0.40110456943511963, + "learning_rate": 0.00017143451305579746, + "loss": 1.4584, + "step": 11005 + }, + { + "epoch": 0.1430179503381018, + "grad_norm": 0.3904884159564972, + "learning_rate": 0.00017143191359388608, + "loss": 1.4286, + "step": 11006 + }, + { + "epoch": 0.14303094488201767, + "grad_norm": 0.4744924306869507, + "learning_rate": 0.0001714293141319747, + "loss": 1.5802, + "step": 11007 + }, + { + "epoch": 0.14304393942593355, + "grad_norm": 0.3206443786621094, + "learning_rate": 0.0001714267146700633, + "loss": 1.3874, + "step": 11008 + }, + { + "epoch": 0.14305693396984942, + "grad_norm": 0.4607909321784973, + "learning_rate": 0.0001714241152081519, + "loss": 1.4407, + "step": 11009 + }, + { + "epoch": 0.1430699285137653, + "grad_norm": 0.43808385729789734, + "learning_rate": 0.00017142151574624055, + "loss": 1.4162, + "step": 11010 + }, + { + "epoch": 0.14308292305768117, + "grad_norm": 0.41574355959892273, + "learning_rate": 0.00017141891628432915, + "loss": 1.4881, + "step": 11011 + }, + { + "epoch": 0.14309591760159704, + "grad_norm": 0.38654017448425293, + "learning_rate": 0.00017141631682241777, + "loss": 1.4732, + "step": 11012 + }, + { + "epoch": 0.1431089121455129, + "grad_norm": 0.4168238639831543, + "learning_rate": 0.00017141371736050637, + "loss": 1.4363, + "step": 11013 + }, + { + "epoch": 0.14312190668942879, + "grad_norm": 0.4135061800479889, + "learning_rate": 0.000171411117898595, + "loss": 1.4598, + "step": 11014 + }, + { + "epoch": 0.14313490123334466, + "grad_norm": 0.3929010331630707, + "learning_rate": 0.00017140851843668362, + "loss": 1.3049, + "step": 11015 + }, + { + "epoch": 0.14314789577726053, + "grad_norm": 0.47263097763061523, + "learning_rate": 0.00017140591897477222, + "loss": 1.4306, + "step": 11016 + }, + { + "epoch": 0.1431608903211764, + "grad_norm": 0.36611026525497437, + "learning_rate": 0.00017140331951286084, + "loss": 1.2967, + "step": 11017 + }, + { + "epoch": 0.14317388486509228, + "grad_norm": 0.3584540784358978, + "learning_rate": 0.00017140072005094947, + "loss": 1.208, + "step": 11018 + }, + { + "epoch": 0.14318687940900815, + "grad_norm": 0.34560224413871765, + "learning_rate": 0.0001713981205890381, + "loss": 1.4214, + "step": 11019 + }, + { + "epoch": 0.14319987395292402, + "grad_norm": 0.46460917592048645, + "learning_rate": 0.0001713955211271267, + "loss": 1.5404, + "step": 11020 + }, + { + "epoch": 0.1432128684968399, + "grad_norm": 0.44852039217948914, + "learning_rate": 0.0001713929216652153, + "loss": 1.4811, + "step": 11021 + }, + { + "epoch": 0.14322586304075577, + "grad_norm": 0.3241423964500427, + "learning_rate": 0.00017139032220330394, + "loss": 1.4997, + "step": 11022 + }, + { + "epoch": 0.14323885758467164, + "grad_norm": 0.3574792444705963, + "learning_rate": 0.00017138772274139254, + "loss": 1.3769, + "step": 11023 + }, + { + "epoch": 0.14325185212858751, + "grad_norm": 0.4007195830345154, + "learning_rate": 0.00017138512327948116, + "loss": 1.4272, + "step": 11024 + }, + { + "epoch": 0.1432648466725034, + "grad_norm": 0.41601213812828064, + "learning_rate": 0.00017138252381756976, + "loss": 1.4171, + "step": 11025 + }, + { + "epoch": 0.14327784121641926, + "grad_norm": 0.34559139609336853, + "learning_rate": 0.00017137992435565838, + "loss": 1.1943, + "step": 11026 + }, + { + "epoch": 0.14329083576033513, + "grad_norm": 0.4047466814517975, + "learning_rate": 0.000171377324893747, + "loss": 1.4327, + "step": 11027 + }, + { + "epoch": 0.143303830304251, + "grad_norm": 0.3817596137523651, + "learning_rate": 0.0001713747254318356, + "loss": 1.4415, + "step": 11028 + }, + { + "epoch": 0.14331682484816688, + "grad_norm": 0.45390865206718445, + "learning_rate": 0.00017137212596992423, + "loss": 1.3426, + "step": 11029 + }, + { + "epoch": 0.14332981939208275, + "grad_norm": 0.4162684381008148, + "learning_rate": 0.00017136952650801285, + "loss": 1.3738, + "step": 11030 + }, + { + "epoch": 0.14334281393599863, + "grad_norm": 0.4135703444480896, + "learning_rate": 0.00017136692704610148, + "loss": 1.5929, + "step": 11031 + }, + { + "epoch": 0.1433558084799145, + "grad_norm": 0.39735686779022217, + "learning_rate": 0.00017136432758419007, + "loss": 1.5254, + "step": 11032 + }, + { + "epoch": 0.14336880302383037, + "grad_norm": 0.39109891653060913, + "learning_rate": 0.0001713617281222787, + "loss": 1.3437, + "step": 11033 + }, + { + "epoch": 0.14338179756774624, + "grad_norm": 0.37252798676490784, + "learning_rate": 0.00017135912866036732, + "loss": 1.3626, + "step": 11034 + }, + { + "epoch": 0.14339479211166212, + "grad_norm": 0.33316025137901306, + "learning_rate": 0.00017135652919845592, + "loss": 1.4341, + "step": 11035 + }, + { + "epoch": 0.143407786655578, + "grad_norm": 0.561901330947876, + "learning_rate": 0.00017135392973654455, + "loss": 1.5039, + "step": 11036 + }, + { + "epoch": 0.14342078119949386, + "grad_norm": 0.48901546001434326, + "learning_rate": 0.00017135133027463314, + "loss": 1.4492, + "step": 11037 + }, + { + "epoch": 0.14343377574340974, + "grad_norm": 0.3802975118160248, + "learning_rate": 0.00017134873081272177, + "loss": 1.3288, + "step": 11038 + }, + { + "epoch": 0.1434467702873256, + "grad_norm": 0.43023818731307983, + "learning_rate": 0.0001713461313508104, + "loss": 1.3944, + "step": 11039 + }, + { + "epoch": 0.14345976483124148, + "grad_norm": 0.40096986293792725, + "learning_rate": 0.000171343531888899, + "loss": 1.5239, + "step": 11040 + }, + { + "epoch": 0.14347275937515735, + "grad_norm": 0.281429260969162, + "learning_rate": 0.00017134093242698764, + "loss": 1.4416, + "step": 11041 + }, + { + "epoch": 0.14348575391907323, + "grad_norm": 0.4465419352054596, + "learning_rate": 0.00017133833296507624, + "loss": 1.5091, + "step": 11042 + }, + { + "epoch": 0.1434987484629891, + "grad_norm": 0.36565929651260376, + "learning_rate": 0.00017133573350316486, + "loss": 1.3416, + "step": 11043 + }, + { + "epoch": 0.14351174300690497, + "grad_norm": 0.38336122035980225, + "learning_rate": 0.00017133313404125346, + "loss": 1.477, + "step": 11044 + }, + { + "epoch": 0.14352473755082085, + "grad_norm": 0.39667901396751404, + "learning_rate": 0.00017133053457934208, + "loss": 1.5861, + "step": 11045 + }, + { + "epoch": 0.14353773209473672, + "grad_norm": 0.37267830967903137, + "learning_rate": 0.0001713279351174307, + "loss": 1.5611, + "step": 11046 + }, + { + "epoch": 0.1435507266386526, + "grad_norm": 0.4420000910758972, + "learning_rate": 0.0001713253356555193, + "loss": 1.4206, + "step": 11047 + }, + { + "epoch": 0.14356372118256847, + "grad_norm": 0.3165684640407562, + "learning_rate": 0.00017132273619360793, + "loss": 1.3244, + "step": 11048 + }, + { + "epoch": 0.14357671572648434, + "grad_norm": 0.35696810483932495, + "learning_rate": 0.00017132013673169656, + "loss": 1.3325, + "step": 11049 + }, + { + "epoch": 0.1435897102704002, + "grad_norm": 0.3536531329154968, + "learning_rate": 0.00017131753726978518, + "loss": 1.4568, + "step": 11050 + }, + { + "epoch": 0.14360270481431608, + "grad_norm": 0.33488917350769043, + "learning_rate": 0.00017131493780787378, + "loss": 1.4413, + "step": 11051 + }, + { + "epoch": 0.14361569935823196, + "grad_norm": 0.42920148372650146, + "learning_rate": 0.00017131233834596237, + "loss": 1.528, + "step": 11052 + }, + { + "epoch": 0.14362869390214783, + "grad_norm": 0.3805094063282013, + "learning_rate": 0.00017130973888405103, + "loss": 1.3961, + "step": 11053 + }, + { + "epoch": 0.1436416884460637, + "grad_norm": 0.5175474882125854, + "learning_rate": 0.00017130713942213962, + "loss": 1.3858, + "step": 11054 + }, + { + "epoch": 0.14365468298997958, + "grad_norm": 0.3624133765697479, + "learning_rate": 0.00017130453996022825, + "loss": 1.3044, + "step": 11055 + }, + { + "epoch": 0.14366767753389545, + "grad_norm": 0.4375150203704834, + "learning_rate": 0.00017130194049831685, + "loss": 1.4163, + "step": 11056 + }, + { + "epoch": 0.14368067207781132, + "grad_norm": 0.4642685651779175, + "learning_rate": 0.00017129934103640547, + "loss": 1.5782, + "step": 11057 + }, + { + "epoch": 0.1436936666217272, + "grad_norm": 0.3251955211162567, + "learning_rate": 0.0001712967415744941, + "loss": 1.5498, + "step": 11058 + }, + { + "epoch": 0.14370666116564307, + "grad_norm": 0.5314301252365112, + "learning_rate": 0.0001712941421125827, + "loss": 1.4687, + "step": 11059 + }, + { + "epoch": 0.14371965570955894, + "grad_norm": 0.2936308681964874, + "learning_rate": 0.00017129154265067132, + "loss": 1.435, + "step": 11060 + }, + { + "epoch": 0.14373265025347481, + "grad_norm": 0.380831241607666, + "learning_rate": 0.00017128894318875994, + "loss": 1.3785, + "step": 11061 + }, + { + "epoch": 0.1437456447973907, + "grad_norm": 0.35102686285972595, + "learning_rate": 0.00017128634372684857, + "loss": 1.3924, + "step": 11062 + }, + { + "epoch": 0.14375863934130656, + "grad_norm": 0.373811274766922, + "learning_rate": 0.00017128374426493716, + "loss": 1.3483, + "step": 11063 + }, + { + "epoch": 0.14377163388522243, + "grad_norm": 0.42298251390457153, + "learning_rate": 0.00017128114480302576, + "loss": 1.3907, + "step": 11064 + }, + { + "epoch": 0.1437846284291383, + "grad_norm": 0.41398540139198303, + "learning_rate": 0.0001712785453411144, + "loss": 1.3628, + "step": 11065 + }, + { + "epoch": 0.14379762297305418, + "grad_norm": 0.411158949136734, + "learning_rate": 0.000171275945879203, + "loss": 1.4593, + "step": 11066 + }, + { + "epoch": 0.14381061751697005, + "grad_norm": 0.4771862328052521, + "learning_rate": 0.00017127334641729163, + "loss": 1.5053, + "step": 11067 + }, + { + "epoch": 0.14382361206088592, + "grad_norm": 0.38249608874320984, + "learning_rate": 0.00017127074695538023, + "loss": 1.312, + "step": 11068 + }, + { + "epoch": 0.1438366066048018, + "grad_norm": 0.38712644577026367, + "learning_rate": 0.00017126814749346886, + "loss": 1.2148, + "step": 11069 + }, + { + "epoch": 0.14384960114871767, + "grad_norm": 0.36538711190223694, + "learning_rate": 0.00017126554803155748, + "loss": 1.3879, + "step": 11070 + }, + { + "epoch": 0.14386259569263354, + "grad_norm": 0.23660413920879364, + "learning_rate": 0.00017126294856964608, + "loss": 1.3647, + "step": 11071 + }, + { + "epoch": 0.14387559023654942, + "grad_norm": 0.45454829931259155, + "learning_rate": 0.0001712603491077347, + "loss": 1.5353, + "step": 11072 + }, + { + "epoch": 0.1438885847804653, + "grad_norm": 0.3014482855796814, + "learning_rate": 0.00017125774964582333, + "loss": 1.2862, + "step": 11073 + }, + { + "epoch": 0.14390157932438116, + "grad_norm": 0.36239662766456604, + "learning_rate": 0.00017125515018391195, + "loss": 1.5245, + "step": 11074 + }, + { + "epoch": 0.14391457386829704, + "grad_norm": 0.41044533252716064, + "learning_rate": 0.00017125255072200055, + "loss": 1.4635, + "step": 11075 + }, + { + "epoch": 0.1439275684122129, + "grad_norm": 0.2653467059135437, + "learning_rate": 0.00017124995126008915, + "loss": 1.0017, + "step": 11076 + }, + { + "epoch": 0.1439405629561288, + "grad_norm": 0.3259505033493042, + "learning_rate": 0.0001712473517981778, + "loss": 1.4711, + "step": 11077 + }, + { + "epoch": 0.14395355750004468, + "grad_norm": 0.3671286404132843, + "learning_rate": 0.0001712447523362664, + "loss": 1.1098, + "step": 11078 + }, + { + "epoch": 0.14396655204396056, + "grad_norm": 0.3763706088066101, + "learning_rate": 0.00017124215287435502, + "loss": 1.3169, + "step": 11079 + }, + { + "epoch": 0.14397954658787643, + "grad_norm": 0.31940358877182007, + "learning_rate": 0.00017123955341244364, + "loss": 1.202, + "step": 11080 + }, + { + "epoch": 0.1439925411317923, + "grad_norm": 0.3961261808872223, + "learning_rate": 0.00017123695395053224, + "loss": 1.5025, + "step": 11081 + }, + { + "epoch": 0.14400553567570817, + "grad_norm": 0.5282770991325378, + "learning_rate": 0.00017123435448862086, + "loss": 1.5227, + "step": 11082 + }, + { + "epoch": 0.14401853021962405, + "grad_norm": 0.36336007714271545, + "learning_rate": 0.00017123175502670946, + "loss": 1.5521, + "step": 11083 + }, + { + "epoch": 0.14403152476353992, + "grad_norm": 0.27255767583847046, + "learning_rate": 0.00017122915556479811, + "loss": 1.2776, + "step": 11084 + }, + { + "epoch": 0.1440445193074558, + "grad_norm": 0.30391019582748413, + "learning_rate": 0.0001712265561028867, + "loss": 1.3658, + "step": 11085 + }, + { + "epoch": 0.14405751385137167, + "grad_norm": 0.36819812655448914, + "learning_rate": 0.00017122395664097534, + "loss": 1.6366, + "step": 11086 + }, + { + "epoch": 0.14407050839528754, + "grad_norm": 0.3538173735141754, + "learning_rate": 0.00017122135717906393, + "loss": 1.4214, + "step": 11087 + }, + { + "epoch": 0.1440835029392034, + "grad_norm": 0.4467506408691406, + "learning_rate": 0.00017121875771715256, + "loss": 1.6023, + "step": 11088 + }, + { + "epoch": 0.14409649748311928, + "grad_norm": 0.3107706606388092, + "learning_rate": 0.00017121615825524118, + "loss": 1.495, + "step": 11089 + }, + { + "epoch": 0.14410949202703516, + "grad_norm": 0.44247889518737793, + "learning_rate": 0.00017121355879332978, + "loss": 1.4899, + "step": 11090 + }, + { + "epoch": 0.14412248657095103, + "grad_norm": 0.41364026069641113, + "learning_rate": 0.0001712109593314184, + "loss": 1.4425, + "step": 11091 + }, + { + "epoch": 0.1441354811148669, + "grad_norm": 0.5177464485168457, + "learning_rate": 0.00017120835986950703, + "loss": 1.47, + "step": 11092 + }, + { + "epoch": 0.14414847565878278, + "grad_norm": 0.325764924287796, + "learning_rate": 0.00017120576040759563, + "loss": 1.4266, + "step": 11093 + }, + { + "epoch": 0.14416147020269865, + "grad_norm": 0.3556436002254486, + "learning_rate": 0.00017120316094568425, + "loss": 1.2437, + "step": 11094 + }, + { + "epoch": 0.14417446474661452, + "grad_norm": 0.4278867244720459, + "learning_rate": 0.00017120056148377285, + "loss": 1.29, + "step": 11095 + }, + { + "epoch": 0.1441874592905304, + "grad_norm": 0.376253217458725, + "learning_rate": 0.0001711979620218615, + "loss": 1.3918, + "step": 11096 + }, + { + "epoch": 0.14420045383444627, + "grad_norm": 0.3262886106967926, + "learning_rate": 0.0001711953625599501, + "loss": 1.6173, + "step": 11097 + }, + { + "epoch": 0.14421344837836214, + "grad_norm": 0.4144711196422577, + "learning_rate": 0.00017119276309803872, + "loss": 1.5227, + "step": 11098 + }, + { + "epoch": 0.14422644292227801, + "grad_norm": 0.39721980690956116, + "learning_rate": 0.00017119016363612732, + "loss": 1.3542, + "step": 11099 + }, + { + "epoch": 0.1442394374661939, + "grad_norm": 0.34060022234916687, + "learning_rate": 0.00017118756417421594, + "loss": 1.3545, + "step": 11100 + }, + { + "epoch": 0.14425243201010976, + "grad_norm": 0.4440550208091736, + "learning_rate": 0.00017118496471230457, + "loss": 1.549, + "step": 11101 + }, + { + "epoch": 0.14426542655402563, + "grad_norm": 0.3309880197048187, + "learning_rate": 0.00017118236525039316, + "loss": 1.3861, + "step": 11102 + }, + { + "epoch": 0.1442784210979415, + "grad_norm": 0.40864211320877075, + "learning_rate": 0.0001711797657884818, + "loss": 1.3038, + "step": 11103 + }, + { + "epoch": 0.14429141564185738, + "grad_norm": 0.38104912638664246, + "learning_rate": 0.00017117716632657041, + "loss": 1.3511, + "step": 11104 + }, + { + "epoch": 0.14430441018577325, + "grad_norm": 0.4476778507232666, + "learning_rate": 0.00017117456686465904, + "loss": 1.4264, + "step": 11105 + }, + { + "epoch": 0.14431740472968912, + "grad_norm": 0.42206504940986633, + "learning_rate": 0.00017117196740274764, + "loss": 1.6553, + "step": 11106 + }, + { + "epoch": 0.144330399273605, + "grad_norm": 0.3561924397945404, + "learning_rate": 0.00017116936794083623, + "loss": 1.38, + "step": 11107 + }, + { + "epoch": 0.14434339381752087, + "grad_norm": 0.37079834938049316, + "learning_rate": 0.00017116676847892488, + "loss": 1.2549, + "step": 11108 + }, + { + "epoch": 0.14435638836143674, + "grad_norm": 0.4306679368019104, + "learning_rate": 0.00017116416901701348, + "loss": 1.5187, + "step": 11109 + }, + { + "epoch": 0.14436938290535262, + "grad_norm": 0.38881582021713257, + "learning_rate": 0.0001711615695551021, + "loss": 1.3777, + "step": 11110 + }, + { + "epoch": 0.1443823774492685, + "grad_norm": 0.5320708155632019, + "learning_rate": 0.0001711589700931907, + "loss": 1.6033, + "step": 11111 + }, + { + "epoch": 0.14439537199318436, + "grad_norm": 0.3984955847263336, + "learning_rate": 0.00017115637063127933, + "loss": 1.3257, + "step": 11112 + }, + { + "epoch": 0.14440836653710024, + "grad_norm": 0.39208194613456726, + "learning_rate": 0.00017115377116936795, + "loss": 1.3624, + "step": 11113 + }, + { + "epoch": 0.1444213610810161, + "grad_norm": 0.3975045084953308, + "learning_rate": 0.00017115117170745655, + "loss": 1.5256, + "step": 11114 + }, + { + "epoch": 0.14443435562493198, + "grad_norm": 0.38531166315078735, + "learning_rate": 0.0001711485722455452, + "loss": 1.3806, + "step": 11115 + }, + { + "epoch": 0.14444735016884785, + "grad_norm": 0.40180933475494385, + "learning_rate": 0.0001711459727836338, + "loss": 1.6383, + "step": 11116 + }, + { + "epoch": 0.14446034471276373, + "grad_norm": 0.4208340048789978, + "learning_rate": 0.00017114337332172242, + "loss": 1.3653, + "step": 11117 + }, + { + "epoch": 0.1444733392566796, + "grad_norm": 0.4378970265388489, + "learning_rate": 0.00017114077385981102, + "loss": 1.4139, + "step": 11118 + }, + { + "epoch": 0.14448633380059547, + "grad_norm": 0.37437704205513, + "learning_rate": 0.00017113817439789965, + "loss": 1.4935, + "step": 11119 + }, + { + "epoch": 0.14449932834451135, + "grad_norm": 0.4415818154811859, + "learning_rate": 0.00017113557493598827, + "loss": 1.3587, + "step": 11120 + }, + { + "epoch": 0.14451232288842722, + "grad_norm": 0.4054763615131378, + "learning_rate": 0.00017113297547407687, + "loss": 1.4784, + "step": 11121 + }, + { + "epoch": 0.1445253174323431, + "grad_norm": 0.39209458231925964, + "learning_rate": 0.0001711303760121655, + "loss": 1.25, + "step": 11122 + }, + { + "epoch": 0.14453831197625897, + "grad_norm": 0.40410366654396057, + "learning_rate": 0.00017112777655025412, + "loss": 1.3344, + "step": 11123 + }, + { + "epoch": 0.14455130652017484, + "grad_norm": 0.3767906129360199, + "learning_rate": 0.00017112517708834271, + "loss": 1.4141, + "step": 11124 + }, + { + "epoch": 0.1445643010640907, + "grad_norm": 0.36057430505752563, + "learning_rate": 0.00017112257762643134, + "loss": 1.3687, + "step": 11125 + }, + { + "epoch": 0.14457729560800658, + "grad_norm": 0.4281517267227173, + "learning_rate": 0.00017111997816451994, + "loss": 1.516, + "step": 11126 + }, + { + "epoch": 0.14459029015192246, + "grad_norm": 0.4196946620941162, + "learning_rate": 0.0001711173787026086, + "loss": 1.6903, + "step": 11127 + }, + { + "epoch": 0.14460328469583833, + "grad_norm": 0.3593243360519409, + "learning_rate": 0.00017111477924069718, + "loss": 1.4337, + "step": 11128 + }, + { + "epoch": 0.1446162792397542, + "grad_norm": 0.4050748348236084, + "learning_rate": 0.0001711121797787858, + "loss": 1.4971, + "step": 11129 + }, + { + "epoch": 0.14462927378367008, + "grad_norm": 0.39851024746894836, + "learning_rate": 0.0001711095803168744, + "loss": 1.5997, + "step": 11130 + }, + { + "epoch": 0.14464226832758595, + "grad_norm": 0.414989173412323, + "learning_rate": 0.00017110698085496303, + "loss": 1.3923, + "step": 11131 + }, + { + "epoch": 0.14465526287150182, + "grad_norm": 0.40924960374832153, + "learning_rate": 0.00017110438139305166, + "loss": 1.3856, + "step": 11132 + }, + { + "epoch": 0.1446682574154177, + "grad_norm": 0.41865432262420654, + "learning_rate": 0.00017110178193114025, + "loss": 1.4682, + "step": 11133 + }, + { + "epoch": 0.14468125195933357, + "grad_norm": 0.34652113914489746, + "learning_rate": 0.00017109918246922888, + "loss": 1.4289, + "step": 11134 + }, + { + "epoch": 0.14469424650324944, + "grad_norm": 0.3899216055870056, + "learning_rate": 0.0001710965830073175, + "loss": 1.2774, + "step": 11135 + }, + { + "epoch": 0.1447072410471653, + "grad_norm": 0.47964945435523987, + "learning_rate": 0.0001710939835454061, + "loss": 1.4399, + "step": 11136 + }, + { + "epoch": 0.1447202355910812, + "grad_norm": 0.4223995506763458, + "learning_rate": 0.00017109138408349472, + "loss": 1.4325, + "step": 11137 + }, + { + "epoch": 0.14473323013499706, + "grad_norm": 0.36780813336372375, + "learning_rate": 0.00017108878462158332, + "loss": 1.4635, + "step": 11138 + }, + { + "epoch": 0.14474622467891293, + "grad_norm": 0.30381128191947937, + "learning_rate": 0.00017108618515967197, + "loss": 1.3566, + "step": 11139 + }, + { + "epoch": 0.1447592192228288, + "grad_norm": 0.34869423508644104, + "learning_rate": 0.00017108358569776057, + "loss": 1.3611, + "step": 11140 + }, + { + "epoch": 0.14477221376674468, + "grad_norm": 0.3324102461338043, + "learning_rate": 0.0001710809862358492, + "loss": 1.4183, + "step": 11141 + }, + { + "epoch": 0.14478520831066055, + "grad_norm": 0.4056366980075836, + "learning_rate": 0.0001710783867739378, + "loss": 1.4025, + "step": 11142 + }, + { + "epoch": 0.14479820285457642, + "grad_norm": 0.377490371465683, + "learning_rate": 0.00017107578731202642, + "loss": 1.4025, + "step": 11143 + }, + { + "epoch": 0.1448111973984923, + "grad_norm": 0.3825136721134186, + "learning_rate": 0.00017107318785011504, + "loss": 1.3839, + "step": 11144 + }, + { + "epoch": 0.14482419194240817, + "grad_norm": 0.32290422916412354, + "learning_rate": 0.00017107058838820364, + "loss": 1.477, + "step": 11145 + }, + { + "epoch": 0.14483718648632404, + "grad_norm": 0.422802597284317, + "learning_rate": 0.00017106798892629226, + "loss": 1.3897, + "step": 11146 + }, + { + "epoch": 0.14485018103023992, + "grad_norm": 0.3886449337005615, + "learning_rate": 0.0001710653894643809, + "loss": 1.4643, + "step": 11147 + }, + { + "epoch": 0.1448631755741558, + "grad_norm": 0.349416583776474, + "learning_rate": 0.00017106279000246948, + "loss": 1.4922, + "step": 11148 + }, + { + "epoch": 0.14487617011807166, + "grad_norm": 0.32664382457733154, + "learning_rate": 0.0001710601905405581, + "loss": 1.3626, + "step": 11149 + }, + { + "epoch": 0.14488916466198754, + "grad_norm": 0.40521445870399475, + "learning_rate": 0.0001710575910786467, + "loss": 1.3644, + "step": 11150 + }, + { + "epoch": 0.1449021592059034, + "grad_norm": 0.3790925443172455, + "learning_rate": 0.00017105499161673536, + "loss": 1.4366, + "step": 11151 + }, + { + "epoch": 0.14491515374981928, + "grad_norm": 0.3130298852920532, + "learning_rate": 0.00017105239215482396, + "loss": 1.6264, + "step": 11152 + }, + { + "epoch": 0.14492814829373518, + "grad_norm": 0.34523412585258484, + "learning_rate": 0.00017104979269291258, + "loss": 1.1371, + "step": 11153 + }, + { + "epoch": 0.14494114283765105, + "grad_norm": 0.34358522295951843, + "learning_rate": 0.0001710471932310012, + "loss": 1.2934, + "step": 11154 + }, + { + "epoch": 0.14495413738156693, + "grad_norm": 0.3753041923046112, + "learning_rate": 0.0001710445937690898, + "loss": 1.5024, + "step": 11155 + }, + { + "epoch": 0.1449671319254828, + "grad_norm": 0.5765540599822998, + "learning_rate": 0.00017104199430717843, + "loss": 1.6194, + "step": 11156 + }, + { + "epoch": 0.14498012646939867, + "grad_norm": 0.3787371814250946, + "learning_rate": 0.00017103939484526702, + "loss": 1.4714, + "step": 11157 + }, + { + "epoch": 0.14499312101331455, + "grad_norm": 0.36760982871055603, + "learning_rate": 0.00017103679538335568, + "loss": 1.484, + "step": 11158 + }, + { + "epoch": 0.14500611555723042, + "grad_norm": 0.4599936306476593, + "learning_rate": 0.00017103419592144427, + "loss": 1.3481, + "step": 11159 + }, + { + "epoch": 0.1450191101011463, + "grad_norm": 0.40311580896377563, + "learning_rate": 0.00017103159645953287, + "loss": 1.3385, + "step": 11160 + }, + { + "epoch": 0.14503210464506217, + "grad_norm": 0.4979571998119354, + "learning_rate": 0.0001710289969976215, + "loss": 1.4405, + "step": 11161 + }, + { + "epoch": 0.14504509918897804, + "grad_norm": 0.37653255462646484, + "learning_rate": 0.00017102639753571012, + "loss": 1.4526, + "step": 11162 + }, + { + "epoch": 0.1450580937328939, + "grad_norm": 0.41032832860946655, + "learning_rate": 0.00017102379807379874, + "loss": 1.4929, + "step": 11163 + }, + { + "epoch": 0.14507108827680978, + "grad_norm": 0.4043903946876526, + "learning_rate": 0.00017102119861188734, + "loss": 1.4022, + "step": 11164 + }, + { + "epoch": 0.14508408282072566, + "grad_norm": 0.27943894267082214, + "learning_rate": 0.00017101859914997597, + "loss": 1.42, + "step": 11165 + }, + { + "epoch": 0.14509707736464153, + "grad_norm": 0.5411479473114014, + "learning_rate": 0.0001710159996880646, + "loss": 1.4382, + "step": 11166 + }, + { + "epoch": 0.1451100719085574, + "grad_norm": 0.4582824110984802, + "learning_rate": 0.0001710134002261532, + "loss": 1.5043, + "step": 11167 + }, + { + "epoch": 0.14512306645247328, + "grad_norm": 0.46927210688591003, + "learning_rate": 0.0001710108007642418, + "loss": 1.4868, + "step": 11168 + }, + { + "epoch": 0.14513606099638915, + "grad_norm": 0.3008632957935333, + "learning_rate": 0.0001710082013023304, + "loss": 1.293, + "step": 11169 + }, + { + "epoch": 0.14514905554030502, + "grad_norm": 0.36784055829048157, + "learning_rate": 0.00017100560184041906, + "loss": 1.5808, + "step": 11170 + }, + { + "epoch": 0.1451620500842209, + "grad_norm": 0.41808003187179565, + "learning_rate": 0.00017100300237850766, + "loss": 1.3989, + "step": 11171 + }, + { + "epoch": 0.14517504462813677, + "grad_norm": 0.42032837867736816, + "learning_rate": 0.00017100040291659628, + "loss": 1.4308, + "step": 11172 + }, + { + "epoch": 0.14518803917205264, + "grad_norm": 0.5430816411972046, + "learning_rate": 0.00017099780345468488, + "loss": 1.4639, + "step": 11173 + }, + { + "epoch": 0.1452010337159685, + "grad_norm": 0.3855245113372803, + "learning_rate": 0.0001709952039927735, + "loss": 1.5902, + "step": 11174 + }, + { + "epoch": 0.1452140282598844, + "grad_norm": 0.3944976031780243, + "learning_rate": 0.00017099260453086213, + "loss": 1.3476, + "step": 11175 + }, + { + "epoch": 0.14522702280380026, + "grad_norm": 0.4476650357246399, + "learning_rate": 0.00017099000506895073, + "loss": 1.4947, + "step": 11176 + }, + { + "epoch": 0.14524001734771613, + "grad_norm": 0.411156564950943, + "learning_rate": 0.00017098740560703935, + "loss": 1.5132, + "step": 11177 + }, + { + "epoch": 0.145253011891632, + "grad_norm": 0.36436235904693604, + "learning_rate": 0.00017098480614512798, + "loss": 1.2626, + "step": 11178 + }, + { + "epoch": 0.14526600643554788, + "grad_norm": 0.42791301012039185, + "learning_rate": 0.00017098220668321657, + "loss": 1.4867, + "step": 11179 + }, + { + "epoch": 0.14527900097946375, + "grad_norm": 0.4045431911945343, + "learning_rate": 0.0001709796072213052, + "loss": 1.4253, + "step": 11180 + }, + { + "epoch": 0.14529199552337962, + "grad_norm": 0.37130504846572876, + "learning_rate": 0.0001709770077593938, + "loss": 1.358, + "step": 11181 + }, + { + "epoch": 0.1453049900672955, + "grad_norm": 0.3352845311164856, + "learning_rate": 0.00017097440829748245, + "loss": 1.4942, + "step": 11182 + }, + { + "epoch": 0.14531798461121137, + "grad_norm": 0.37348663806915283, + "learning_rate": 0.00017097180883557104, + "loss": 1.4177, + "step": 11183 + }, + { + "epoch": 0.14533097915512724, + "grad_norm": 0.37484636902809143, + "learning_rate": 0.00017096920937365967, + "loss": 1.4759, + "step": 11184 + }, + { + "epoch": 0.14534397369904312, + "grad_norm": 0.3262447416782379, + "learning_rate": 0.00017096660991174827, + "loss": 1.575, + "step": 11185 + }, + { + "epoch": 0.145356968242959, + "grad_norm": 0.47388604283332825, + "learning_rate": 0.0001709640104498369, + "loss": 1.5107, + "step": 11186 + }, + { + "epoch": 0.14536996278687486, + "grad_norm": 0.5634368658065796, + "learning_rate": 0.00017096141098792551, + "loss": 1.6389, + "step": 11187 + }, + { + "epoch": 0.14538295733079074, + "grad_norm": 0.2917027771472931, + "learning_rate": 0.0001709588115260141, + "loss": 1.3179, + "step": 11188 + }, + { + "epoch": 0.1453959518747066, + "grad_norm": 0.38248276710510254, + "learning_rate": 0.00017095621206410276, + "loss": 1.4292, + "step": 11189 + }, + { + "epoch": 0.14540894641862248, + "grad_norm": 0.3398478329181671, + "learning_rate": 0.00017095361260219136, + "loss": 1.5298, + "step": 11190 + }, + { + "epoch": 0.14542194096253835, + "grad_norm": 0.4128149151802063, + "learning_rate": 0.00017095101314027996, + "loss": 1.3958, + "step": 11191 + }, + { + "epoch": 0.14543493550645423, + "grad_norm": 0.4019889235496521, + "learning_rate": 0.00017094841367836858, + "loss": 1.3419, + "step": 11192 + }, + { + "epoch": 0.1454479300503701, + "grad_norm": 0.3496367931365967, + "learning_rate": 0.0001709458142164572, + "loss": 1.5676, + "step": 11193 + }, + { + "epoch": 0.14546092459428597, + "grad_norm": 0.38252392411231995, + "learning_rate": 0.00017094321475454583, + "loss": 1.4532, + "step": 11194 + }, + { + "epoch": 0.14547391913820185, + "grad_norm": 0.684743344783783, + "learning_rate": 0.00017094061529263443, + "loss": 1.587, + "step": 11195 + }, + { + "epoch": 0.14548691368211772, + "grad_norm": 0.4842517077922821, + "learning_rate": 0.00017093801583072305, + "loss": 1.4501, + "step": 11196 + }, + { + "epoch": 0.1454999082260336, + "grad_norm": 0.43303027749061584, + "learning_rate": 0.00017093541636881168, + "loss": 1.4915, + "step": 11197 + }, + { + "epoch": 0.14551290276994946, + "grad_norm": 0.4664687216281891, + "learning_rate": 0.00017093281690690028, + "loss": 1.4023, + "step": 11198 + }, + { + "epoch": 0.14552589731386534, + "grad_norm": 0.4656442701816559, + "learning_rate": 0.0001709302174449889, + "loss": 1.5368, + "step": 11199 + }, + { + "epoch": 0.1455388918577812, + "grad_norm": 0.4284462034702301, + "learning_rate": 0.0001709276179830775, + "loss": 1.4093, + "step": 11200 + }, + { + "epoch": 0.14555188640169708, + "grad_norm": 0.42092394828796387, + "learning_rate": 0.00017092501852116615, + "loss": 1.3105, + "step": 11201 + }, + { + "epoch": 0.14556488094561296, + "grad_norm": 0.4908880293369293, + "learning_rate": 0.00017092241905925475, + "loss": 1.5216, + "step": 11202 + }, + { + "epoch": 0.14557787548952883, + "grad_norm": 0.4073388874530792, + "learning_rate": 0.00017091981959734334, + "loss": 1.5007, + "step": 11203 + }, + { + "epoch": 0.1455908700334447, + "grad_norm": 0.32807299494743347, + "learning_rate": 0.00017091722013543197, + "loss": 1.3313, + "step": 11204 + }, + { + "epoch": 0.14560386457736058, + "grad_norm": 0.5015289187431335, + "learning_rate": 0.0001709146206735206, + "loss": 1.4676, + "step": 11205 + }, + { + "epoch": 0.14561685912127645, + "grad_norm": 0.304504930973053, + "learning_rate": 0.00017091202121160922, + "loss": 1.3186, + "step": 11206 + }, + { + "epoch": 0.14562985366519232, + "grad_norm": 0.3211117386817932, + "learning_rate": 0.00017090942174969781, + "loss": 1.4544, + "step": 11207 + }, + { + "epoch": 0.1456428482091082, + "grad_norm": 0.3818785548210144, + "learning_rate": 0.00017090682228778644, + "loss": 1.4742, + "step": 11208 + }, + { + "epoch": 0.14565584275302407, + "grad_norm": 0.40314486622810364, + "learning_rate": 0.00017090422282587506, + "loss": 1.4474, + "step": 11209 + }, + { + "epoch": 0.14566883729693994, + "grad_norm": 0.4286314845085144, + "learning_rate": 0.00017090162336396366, + "loss": 1.6516, + "step": 11210 + }, + { + "epoch": 0.1456818318408558, + "grad_norm": 0.5021548867225647, + "learning_rate": 0.00017089902390205229, + "loss": 1.413, + "step": 11211 + }, + { + "epoch": 0.1456948263847717, + "grad_norm": 0.3518587648868561, + "learning_rate": 0.00017089642444014088, + "loss": 1.3294, + "step": 11212 + }, + { + "epoch": 0.14570782092868756, + "grad_norm": 0.37559816241264343, + "learning_rate": 0.00017089382497822953, + "loss": 1.5423, + "step": 11213 + }, + { + "epoch": 0.14572081547260343, + "grad_norm": 0.4267975091934204, + "learning_rate": 0.00017089122551631813, + "loss": 1.4666, + "step": 11214 + }, + { + "epoch": 0.1457338100165193, + "grad_norm": 0.3794823884963989, + "learning_rate": 0.00017088862605440673, + "loss": 1.4704, + "step": 11215 + }, + { + "epoch": 0.14574680456043518, + "grad_norm": 0.40062078833580017, + "learning_rate": 0.00017088602659249535, + "loss": 1.3516, + "step": 11216 + }, + { + "epoch": 0.14575979910435105, + "grad_norm": 0.4705944061279297, + "learning_rate": 0.00017088342713058398, + "loss": 1.583, + "step": 11217 + }, + { + "epoch": 0.14577279364826692, + "grad_norm": 0.3194669485092163, + "learning_rate": 0.0001708808276686726, + "loss": 1.4838, + "step": 11218 + }, + { + "epoch": 0.1457857881921828, + "grad_norm": 0.2934216856956482, + "learning_rate": 0.0001708782282067612, + "loss": 1.3715, + "step": 11219 + }, + { + "epoch": 0.14579878273609867, + "grad_norm": 0.45715153217315674, + "learning_rate": 0.00017087562874484982, + "loss": 1.6046, + "step": 11220 + }, + { + "epoch": 0.14581177728001454, + "grad_norm": 0.32778844237327576, + "learning_rate": 0.00017087302928293845, + "loss": 1.4126, + "step": 11221 + }, + { + "epoch": 0.14582477182393042, + "grad_norm": 0.38202613592147827, + "learning_rate": 0.00017087042982102705, + "loss": 1.322, + "step": 11222 + }, + { + "epoch": 0.1458377663678463, + "grad_norm": 0.40567055344581604, + "learning_rate": 0.00017086783035911567, + "loss": 1.4551, + "step": 11223 + }, + { + "epoch": 0.14585076091176216, + "grad_norm": 0.23350529372692108, + "learning_rate": 0.00017086523089720427, + "loss": 1.3017, + "step": 11224 + }, + { + "epoch": 0.14586375545567803, + "grad_norm": 0.3749062418937683, + "learning_rate": 0.00017086263143529292, + "loss": 1.3058, + "step": 11225 + }, + { + "epoch": 0.1458767499995939, + "grad_norm": 0.2677316963672638, + "learning_rate": 0.00017086003197338152, + "loss": 1.3903, + "step": 11226 + }, + { + "epoch": 0.14588974454350978, + "grad_norm": 0.39584994316101074, + "learning_rate": 0.00017085743251147014, + "loss": 1.5382, + "step": 11227 + }, + { + "epoch": 0.14590273908742565, + "grad_norm": 0.34881478548049927, + "learning_rate": 0.00017085483304955877, + "loss": 1.4239, + "step": 11228 + }, + { + "epoch": 0.14591573363134155, + "grad_norm": 0.4115470051765442, + "learning_rate": 0.00017085223358764736, + "loss": 1.4182, + "step": 11229 + }, + { + "epoch": 0.14592872817525743, + "grad_norm": 0.3866683840751648, + "learning_rate": 0.000170849634125736, + "loss": 1.4995, + "step": 11230 + }, + { + "epoch": 0.1459417227191733, + "grad_norm": 0.4981044828891754, + "learning_rate": 0.00017084703466382458, + "loss": 1.4853, + "step": 11231 + }, + { + "epoch": 0.14595471726308917, + "grad_norm": 0.408642441034317, + "learning_rate": 0.0001708444352019132, + "loss": 1.41, + "step": 11232 + }, + { + "epoch": 0.14596771180700505, + "grad_norm": 0.4571438729763031, + "learning_rate": 0.00017084183574000183, + "loss": 1.623, + "step": 11233 + }, + { + "epoch": 0.14598070635092092, + "grad_norm": 0.27386486530303955, + "learning_rate": 0.00017083923627809043, + "loss": 1.1693, + "step": 11234 + }, + { + "epoch": 0.1459937008948368, + "grad_norm": 0.44032835960388184, + "learning_rate": 0.00017083663681617906, + "loss": 1.5072, + "step": 11235 + }, + { + "epoch": 0.14600669543875266, + "grad_norm": 0.4462278485298157, + "learning_rate": 0.00017083403735426768, + "loss": 1.4152, + "step": 11236 + }, + { + "epoch": 0.14601968998266854, + "grad_norm": 0.2907092273235321, + "learning_rate": 0.0001708314378923563, + "loss": 1.6447, + "step": 11237 + }, + { + "epoch": 0.1460326845265844, + "grad_norm": 0.39260178804397583, + "learning_rate": 0.0001708288384304449, + "loss": 1.5782, + "step": 11238 + }, + { + "epoch": 0.14604567907050028, + "grad_norm": 0.33923542499542236, + "learning_rate": 0.00017082623896853353, + "loss": 1.49, + "step": 11239 + }, + { + "epoch": 0.14605867361441616, + "grad_norm": 0.3640190660953522, + "learning_rate": 0.00017082363950662215, + "loss": 1.5123, + "step": 11240 + }, + { + "epoch": 0.14607166815833203, + "grad_norm": 0.4283309578895569, + "learning_rate": 0.00017082104004471075, + "loss": 1.4022, + "step": 11241 + }, + { + "epoch": 0.1460846627022479, + "grad_norm": 0.4036342203617096, + "learning_rate": 0.00017081844058279937, + "loss": 1.457, + "step": 11242 + }, + { + "epoch": 0.14609765724616378, + "grad_norm": 0.38892436027526855, + "learning_rate": 0.00017081584112088797, + "loss": 1.4649, + "step": 11243 + }, + { + "epoch": 0.14611065179007965, + "grad_norm": 0.4600119888782501, + "learning_rate": 0.0001708132416589766, + "loss": 1.3541, + "step": 11244 + }, + { + "epoch": 0.14612364633399552, + "grad_norm": 0.37838712334632874, + "learning_rate": 0.00017081064219706522, + "loss": 1.5102, + "step": 11245 + }, + { + "epoch": 0.1461366408779114, + "grad_norm": 0.4538290798664093, + "learning_rate": 0.00017080804273515382, + "loss": 1.4415, + "step": 11246 + }, + { + "epoch": 0.14614963542182727, + "grad_norm": 0.4385407567024231, + "learning_rate": 0.00017080544327324244, + "loss": 1.388, + "step": 11247 + }, + { + "epoch": 0.14616262996574314, + "grad_norm": 0.4056902825832367, + "learning_rate": 0.00017080284381133107, + "loss": 1.3527, + "step": 11248 + }, + { + "epoch": 0.146175624509659, + "grad_norm": 0.5407351851463318, + "learning_rate": 0.0001708002443494197, + "loss": 1.4754, + "step": 11249 + }, + { + "epoch": 0.1461886190535749, + "grad_norm": 0.3910459280014038, + "learning_rate": 0.0001707976448875083, + "loss": 1.2567, + "step": 11250 + }, + { + "epoch": 0.14620161359749076, + "grad_norm": 0.37198469042778015, + "learning_rate": 0.0001707950454255969, + "loss": 1.3948, + "step": 11251 + }, + { + "epoch": 0.14621460814140663, + "grad_norm": 0.39405572414398193, + "learning_rate": 0.00017079244596368554, + "loss": 1.4035, + "step": 11252 + }, + { + "epoch": 0.1462276026853225, + "grad_norm": 0.49109235405921936, + "learning_rate": 0.00017078984650177413, + "loss": 1.5257, + "step": 11253 + }, + { + "epoch": 0.14624059722923838, + "grad_norm": 0.4589422345161438, + "learning_rate": 0.00017078724703986276, + "loss": 1.367, + "step": 11254 + }, + { + "epoch": 0.14625359177315425, + "grad_norm": 0.47225743532180786, + "learning_rate": 0.00017078464757795136, + "loss": 1.3805, + "step": 11255 + }, + { + "epoch": 0.14626658631707012, + "grad_norm": 0.44636714458465576, + "learning_rate": 0.00017078204811604, + "loss": 1.4947, + "step": 11256 + }, + { + "epoch": 0.146279580860986, + "grad_norm": 0.4095315933227539, + "learning_rate": 0.0001707794486541286, + "loss": 1.4525, + "step": 11257 + }, + { + "epoch": 0.14629257540490187, + "grad_norm": 0.4779440462589264, + "learning_rate": 0.0001707768491922172, + "loss": 1.3759, + "step": 11258 + }, + { + "epoch": 0.14630556994881774, + "grad_norm": 0.40051358938217163, + "learning_rate": 0.00017077424973030583, + "loss": 1.4801, + "step": 11259 + }, + { + "epoch": 0.14631856449273362, + "grad_norm": 0.35656315088272095, + "learning_rate": 0.00017077165026839445, + "loss": 1.2252, + "step": 11260 + }, + { + "epoch": 0.1463315590366495, + "grad_norm": 0.4320249557495117, + "learning_rate": 0.00017076905080648308, + "loss": 1.5085, + "step": 11261 + }, + { + "epoch": 0.14634455358056536, + "grad_norm": 0.4325583279132843, + "learning_rate": 0.00017076645134457167, + "loss": 1.5621, + "step": 11262 + }, + { + "epoch": 0.14635754812448123, + "grad_norm": 0.3113324046134949, + "learning_rate": 0.0001707638518826603, + "loss": 1.4537, + "step": 11263 + }, + { + "epoch": 0.1463705426683971, + "grad_norm": 0.32814615964889526, + "learning_rate": 0.00017076125242074892, + "loss": 1.3648, + "step": 11264 + }, + { + "epoch": 0.14638353721231298, + "grad_norm": 0.42125335335731506, + "learning_rate": 0.00017075865295883752, + "loss": 1.3706, + "step": 11265 + }, + { + "epoch": 0.14639653175622885, + "grad_norm": 0.4062483012676239, + "learning_rate": 0.00017075605349692614, + "loss": 1.5214, + "step": 11266 + }, + { + "epoch": 0.14640952630014473, + "grad_norm": 0.3844647705554962, + "learning_rate": 0.00017075345403501477, + "loss": 1.4872, + "step": 11267 + }, + { + "epoch": 0.1464225208440606, + "grad_norm": 0.3286508619785309, + "learning_rate": 0.0001707508545731034, + "loss": 1.3626, + "step": 11268 + }, + { + "epoch": 0.14643551538797647, + "grad_norm": 0.3928113877773285, + "learning_rate": 0.000170748255111192, + "loss": 1.4427, + "step": 11269 + }, + { + "epoch": 0.14644850993189235, + "grad_norm": 0.45838379859924316, + "learning_rate": 0.0001707456556492806, + "loss": 1.372, + "step": 11270 + }, + { + "epoch": 0.14646150447580822, + "grad_norm": 0.4003114700317383, + "learning_rate": 0.00017074305618736924, + "loss": 1.2152, + "step": 11271 + }, + { + "epoch": 0.1464744990197241, + "grad_norm": 0.45826438069343567, + "learning_rate": 0.00017074045672545784, + "loss": 1.3034, + "step": 11272 + }, + { + "epoch": 0.14648749356363996, + "grad_norm": 0.4394860863685608, + "learning_rate": 0.00017073785726354646, + "loss": 1.4534, + "step": 11273 + }, + { + "epoch": 0.14650048810755584, + "grad_norm": 0.26345837116241455, + "learning_rate": 0.00017073525780163506, + "loss": 1.4064, + "step": 11274 + }, + { + "epoch": 0.1465134826514717, + "grad_norm": 0.37141886353492737, + "learning_rate": 0.00017073265833972368, + "loss": 1.3198, + "step": 11275 + }, + { + "epoch": 0.14652647719538758, + "grad_norm": 0.42236199975013733, + "learning_rate": 0.0001707300588778123, + "loss": 1.4317, + "step": 11276 + }, + { + "epoch": 0.14653947173930346, + "grad_norm": 0.38771164417266846, + "learning_rate": 0.0001707274594159009, + "loss": 1.3816, + "step": 11277 + }, + { + "epoch": 0.14655246628321933, + "grad_norm": 0.383962482213974, + "learning_rate": 0.00017072485995398953, + "loss": 1.4785, + "step": 11278 + }, + { + "epoch": 0.1465654608271352, + "grad_norm": 0.3914499580860138, + "learning_rate": 0.00017072226049207815, + "loss": 1.4628, + "step": 11279 + }, + { + "epoch": 0.14657845537105108, + "grad_norm": 0.36763709783554077, + "learning_rate": 0.00017071966103016678, + "loss": 1.5323, + "step": 11280 + }, + { + "epoch": 0.14659144991496695, + "grad_norm": 0.5018381476402283, + "learning_rate": 0.00017071706156825538, + "loss": 1.5501, + "step": 11281 + }, + { + "epoch": 0.14660444445888282, + "grad_norm": 0.39596062898635864, + "learning_rate": 0.00017071446210634397, + "loss": 1.259, + "step": 11282 + }, + { + "epoch": 0.1466174390027987, + "grad_norm": 0.4241613745689392, + "learning_rate": 0.00017071186264443262, + "loss": 1.3641, + "step": 11283 + }, + { + "epoch": 0.14663043354671457, + "grad_norm": 0.3441568613052368, + "learning_rate": 0.00017070926318252122, + "loss": 1.3624, + "step": 11284 + }, + { + "epoch": 0.14664342809063044, + "grad_norm": 0.43853938579559326, + "learning_rate": 0.00017070666372060985, + "loss": 1.3387, + "step": 11285 + }, + { + "epoch": 0.1466564226345463, + "grad_norm": 0.3544641435146332, + "learning_rate": 0.00017070406425869844, + "loss": 1.4138, + "step": 11286 + }, + { + "epoch": 0.14666941717846219, + "grad_norm": 0.3245483934879303, + "learning_rate": 0.00017070146479678707, + "loss": 1.2798, + "step": 11287 + }, + { + "epoch": 0.14668241172237806, + "grad_norm": 0.44017425179481506, + "learning_rate": 0.0001706988653348757, + "loss": 1.4239, + "step": 11288 + }, + { + "epoch": 0.14669540626629393, + "grad_norm": 0.40762537717819214, + "learning_rate": 0.0001706962658729643, + "loss": 1.4932, + "step": 11289 + }, + { + "epoch": 0.1467084008102098, + "grad_norm": 0.38577723503112793, + "learning_rate": 0.00017069366641105291, + "loss": 1.294, + "step": 11290 + }, + { + "epoch": 0.14672139535412568, + "grad_norm": 0.31441450119018555, + "learning_rate": 0.00017069106694914154, + "loss": 1.3226, + "step": 11291 + }, + { + "epoch": 0.14673438989804155, + "grad_norm": 0.3920429050922394, + "learning_rate": 0.00017068846748723016, + "loss": 1.5302, + "step": 11292 + }, + { + "epoch": 0.14674738444195742, + "grad_norm": 0.36099323630332947, + "learning_rate": 0.00017068586802531876, + "loss": 1.389, + "step": 11293 + }, + { + "epoch": 0.1467603789858733, + "grad_norm": 0.4814140200614929, + "learning_rate": 0.00017068326856340739, + "loss": 1.4357, + "step": 11294 + }, + { + "epoch": 0.14677337352978917, + "grad_norm": 0.38020220398902893, + "learning_rate": 0.000170680669101496, + "loss": 1.3838, + "step": 11295 + }, + { + "epoch": 0.14678636807370504, + "grad_norm": 0.38883206248283386, + "learning_rate": 0.0001706780696395846, + "loss": 1.4297, + "step": 11296 + }, + { + "epoch": 0.14679936261762092, + "grad_norm": 0.5336456894874573, + "learning_rate": 0.00017067547017767323, + "loss": 1.2553, + "step": 11297 + }, + { + "epoch": 0.1468123571615368, + "grad_norm": 0.4061763882637024, + "learning_rate": 0.00017067287071576183, + "loss": 1.587, + "step": 11298 + }, + { + "epoch": 0.14682535170545266, + "grad_norm": 0.3841736316680908, + "learning_rate": 0.00017067027125385045, + "loss": 1.3514, + "step": 11299 + }, + { + "epoch": 0.14683834624936853, + "grad_norm": 0.4954317510128021, + "learning_rate": 0.00017066767179193908, + "loss": 1.4853, + "step": 11300 + }, + { + "epoch": 0.1468513407932844, + "grad_norm": 0.3520929217338562, + "learning_rate": 0.00017066507233002768, + "loss": 1.4049, + "step": 11301 + }, + { + "epoch": 0.14686433533720028, + "grad_norm": 0.3793904185295105, + "learning_rate": 0.00017066247286811633, + "loss": 1.4555, + "step": 11302 + }, + { + "epoch": 0.14687732988111615, + "grad_norm": 0.4387750029563904, + "learning_rate": 0.00017065987340620492, + "loss": 1.3121, + "step": 11303 + }, + { + "epoch": 0.14689032442503203, + "grad_norm": 0.466605007648468, + "learning_rate": 0.00017065727394429355, + "loss": 1.2954, + "step": 11304 + }, + { + "epoch": 0.14690331896894793, + "grad_norm": 0.4261796474456787, + "learning_rate": 0.00017065467448238215, + "loss": 1.5242, + "step": 11305 + }, + { + "epoch": 0.1469163135128638, + "grad_norm": 0.3150370717048645, + "learning_rate": 0.00017065207502047077, + "loss": 1.3216, + "step": 11306 + }, + { + "epoch": 0.14692930805677967, + "grad_norm": 0.47326594591140747, + "learning_rate": 0.0001706494755585594, + "loss": 1.5901, + "step": 11307 + }, + { + "epoch": 0.14694230260069555, + "grad_norm": 0.35577261447906494, + "learning_rate": 0.000170646876096648, + "loss": 1.1514, + "step": 11308 + }, + { + "epoch": 0.14695529714461142, + "grad_norm": 0.4099962115287781, + "learning_rate": 0.00017064427663473662, + "loss": 1.4694, + "step": 11309 + }, + { + "epoch": 0.1469682916885273, + "grad_norm": 0.38423559069633484, + "learning_rate": 0.00017064167717282524, + "loss": 1.4282, + "step": 11310 + }, + { + "epoch": 0.14698128623244316, + "grad_norm": 0.38542187213897705, + "learning_rate": 0.00017063907771091387, + "loss": 1.2436, + "step": 11311 + }, + { + "epoch": 0.14699428077635904, + "grad_norm": 0.39691177010536194, + "learning_rate": 0.00017063647824900246, + "loss": 1.4153, + "step": 11312 + }, + { + "epoch": 0.1470072753202749, + "grad_norm": 0.37217992544174194, + "learning_rate": 0.00017063387878709106, + "loss": 1.3109, + "step": 11313 + }, + { + "epoch": 0.14702026986419078, + "grad_norm": 0.3418477773666382, + "learning_rate": 0.0001706312793251797, + "loss": 1.48, + "step": 11314 + }, + { + "epoch": 0.14703326440810666, + "grad_norm": 0.382400780916214, + "learning_rate": 0.0001706286798632683, + "loss": 1.4198, + "step": 11315 + }, + { + "epoch": 0.14704625895202253, + "grad_norm": 0.4471965432167053, + "learning_rate": 0.00017062608040135693, + "loss": 1.4009, + "step": 11316 + }, + { + "epoch": 0.1470592534959384, + "grad_norm": 0.7015073299407959, + "learning_rate": 0.00017062348093944553, + "loss": 1.4773, + "step": 11317 + }, + { + "epoch": 0.14707224803985428, + "grad_norm": 0.29717767238616943, + "learning_rate": 0.00017062088147753416, + "loss": 1.381, + "step": 11318 + }, + { + "epoch": 0.14708524258377015, + "grad_norm": 0.4319836497306824, + "learning_rate": 0.00017061828201562278, + "loss": 1.4696, + "step": 11319 + }, + { + "epoch": 0.14709823712768602, + "grad_norm": 0.2944984436035156, + "learning_rate": 0.00017061568255371138, + "loss": 1.3719, + "step": 11320 + }, + { + "epoch": 0.1471112316716019, + "grad_norm": 0.41287854313850403, + "learning_rate": 0.0001706130830918, + "loss": 1.466, + "step": 11321 + }, + { + "epoch": 0.14712422621551777, + "grad_norm": 0.48095208406448364, + "learning_rate": 0.00017061048362988863, + "loss": 1.4417, + "step": 11322 + }, + { + "epoch": 0.14713722075943364, + "grad_norm": 0.3037012219429016, + "learning_rate": 0.00017060788416797725, + "loss": 1.4666, + "step": 11323 + }, + { + "epoch": 0.1471502153033495, + "grad_norm": 0.34090596437454224, + "learning_rate": 0.00017060528470606585, + "loss": 1.3608, + "step": 11324 + }, + { + "epoch": 0.14716320984726539, + "grad_norm": 0.3609218895435333, + "learning_rate": 0.00017060268524415445, + "loss": 1.2406, + "step": 11325 + }, + { + "epoch": 0.14717620439118126, + "grad_norm": 0.42388495802879333, + "learning_rate": 0.0001706000857822431, + "loss": 1.3757, + "step": 11326 + }, + { + "epoch": 0.14718919893509713, + "grad_norm": 0.41894280910491943, + "learning_rate": 0.0001705974863203317, + "loss": 1.34, + "step": 11327 + }, + { + "epoch": 0.147202193479013, + "grad_norm": 0.4209027886390686, + "learning_rate": 0.00017059488685842032, + "loss": 1.4093, + "step": 11328 + }, + { + "epoch": 0.14721518802292888, + "grad_norm": 0.4063383638858795, + "learning_rate": 0.00017059228739650892, + "loss": 1.595, + "step": 11329 + }, + { + "epoch": 0.14722818256684475, + "grad_norm": 0.4827539920806885, + "learning_rate": 0.00017058968793459754, + "loss": 1.6648, + "step": 11330 + }, + { + "epoch": 0.14724117711076062, + "grad_norm": 0.4543558657169342, + "learning_rate": 0.00017058708847268617, + "loss": 1.5291, + "step": 11331 + }, + { + "epoch": 0.1472541716546765, + "grad_norm": 0.3768217861652374, + "learning_rate": 0.00017058448901077476, + "loss": 1.5283, + "step": 11332 + }, + { + "epoch": 0.14726716619859237, + "grad_norm": 0.8364927768707275, + "learning_rate": 0.0001705818895488634, + "loss": 1.2543, + "step": 11333 + }, + { + "epoch": 0.14728016074250824, + "grad_norm": 0.32101795077323914, + "learning_rate": 0.000170579290086952, + "loss": 1.2807, + "step": 11334 + }, + { + "epoch": 0.14729315528642412, + "grad_norm": 0.31307604908943176, + "learning_rate": 0.00017057669062504064, + "loss": 1.4749, + "step": 11335 + }, + { + "epoch": 0.14730614983034, + "grad_norm": 0.39819425344467163, + "learning_rate": 0.00017057409116312923, + "loss": 1.5478, + "step": 11336 + }, + { + "epoch": 0.14731914437425586, + "grad_norm": 0.4148833453655243, + "learning_rate": 0.00017057149170121786, + "loss": 1.5194, + "step": 11337 + }, + { + "epoch": 0.14733213891817173, + "grad_norm": 0.42625489830970764, + "learning_rate": 0.00017056889223930648, + "loss": 1.3707, + "step": 11338 + }, + { + "epoch": 0.1473451334620876, + "grad_norm": 0.34176045656204224, + "learning_rate": 0.00017056629277739508, + "loss": 1.2678, + "step": 11339 + }, + { + "epoch": 0.14735812800600348, + "grad_norm": 0.3996594250202179, + "learning_rate": 0.0001705636933154837, + "loss": 1.5279, + "step": 11340 + }, + { + "epoch": 0.14737112254991935, + "grad_norm": 0.40667155385017395, + "learning_rate": 0.00017056109385357233, + "loss": 1.4916, + "step": 11341 + }, + { + "epoch": 0.14738411709383523, + "grad_norm": 0.2968926429748535, + "learning_rate": 0.00017055849439166093, + "loss": 1.4355, + "step": 11342 + }, + { + "epoch": 0.1473971116377511, + "grad_norm": 0.3608457148075104, + "learning_rate": 0.00017055589492974955, + "loss": 1.4887, + "step": 11343 + }, + { + "epoch": 0.14741010618166697, + "grad_norm": 0.37557971477508545, + "learning_rate": 0.00017055329546783815, + "loss": 1.241, + "step": 11344 + }, + { + "epoch": 0.14742310072558285, + "grad_norm": 0.3155742287635803, + "learning_rate": 0.0001705506960059268, + "loss": 1.3422, + "step": 11345 + }, + { + "epoch": 0.14743609526949872, + "grad_norm": 0.48092982172966003, + "learning_rate": 0.0001705480965440154, + "loss": 1.641, + "step": 11346 + }, + { + "epoch": 0.1474490898134146, + "grad_norm": 0.39166221022605896, + "learning_rate": 0.00017054549708210402, + "loss": 1.4743, + "step": 11347 + }, + { + "epoch": 0.14746208435733046, + "grad_norm": 0.3319631814956665, + "learning_rate": 0.00017054289762019262, + "loss": 1.5296, + "step": 11348 + }, + { + "epoch": 0.14747507890124634, + "grad_norm": 0.37180498242378235, + "learning_rate": 0.00017054029815828124, + "loss": 1.5253, + "step": 11349 + }, + { + "epoch": 0.1474880734451622, + "grad_norm": 0.5395805835723877, + "learning_rate": 0.00017053769869636987, + "loss": 1.5573, + "step": 11350 + }, + { + "epoch": 0.14750106798907808, + "grad_norm": 0.3219076097011566, + "learning_rate": 0.00017053509923445847, + "loss": 1.3363, + "step": 11351 + }, + { + "epoch": 0.14751406253299396, + "grad_norm": 0.4966753423213959, + "learning_rate": 0.0001705324997725471, + "loss": 1.4895, + "step": 11352 + }, + { + "epoch": 0.14752705707690983, + "grad_norm": 0.3192373514175415, + "learning_rate": 0.00017052990031063571, + "loss": 1.0323, + "step": 11353 + }, + { + "epoch": 0.1475400516208257, + "grad_norm": 0.3798332214355469, + "learning_rate": 0.0001705273008487243, + "loss": 1.4693, + "step": 11354 + }, + { + "epoch": 0.14755304616474157, + "grad_norm": 0.4342315196990967, + "learning_rate": 0.00017052470138681294, + "loss": 1.4513, + "step": 11355 + }, + { + "epoch": 0.14756604070865745, + "grad_norm": 0.4443801939487457, + "learning_rate": 0.00017052210192490153, + "loss": 1.3978, + "step": 11356 + }, + { + "epoch": 0.14757903525257332, + "grad_norm": 0.4159848988056183, + "learning_rate": 0.00017051950246299019, + "loss": 1.3956, + "step": 11357 + }, + { + "epoch": 0.1475920297964892, + "grad_norm": 0.39567968249320984, + "learning_rate": 0.00017051690300107878, + "loss": 1.4222, + "step": 11358 + }, + { + "epoch": 0.14760502434040507, + "grad_norm": 0.49021586775779724, + "learning_rate": 0.0001705143035391674, + "loss": 1.5596, + "step": 11359 + }, + { + "epoch": 0.14761801888432094, + "grad_norm": 0.41064900159835815, + "learning_rate": 0.000170511704077256, + "loss": 1.3305, + "step": 11360 + }, + { + "epoch": 0.1476310134282368, + "grad_norm": 0.31437233090400696, + "learning_rate": 0.00017050910461534463, + "loss": 1.4809, + "step": 11361 + }, + { + "epoch": 0.14764400797215269, + "grad_norm": 0.5019233822822571, + "learning_rate": 0.00017050650515343325, + "loss": 1.3856, + "step": 11362 + }, + { + "epoch": 0.14765700251606856, + "grad_norm": 0.3403305411338806, + "learning_rate": 0.00017050390569152185, + "loss": 1.2366, + "step": 11363 + }, + { + "epoch": 0.14766999705998443, + "grad_norm": 0.47358494997024536, + "learning_rate": 0.00017050130622961048, + "loss": 1.4457, + "step": 11364 + }, + { + "epoch": 0.1476829916039003, + "grad_norm": 0.45909085869789124, + "learning_rate": 0.0001704987067676991, + "loss": 1.3399, + "step": 11365 + }, + { + "epoch": 0.14769598614781618, + "grad_norm": 0.2728167176246643, + "learning_rate": 0.0001704961073057877, + "loss": 1.1756, + "step": 11366 + }, + { + "epoch": 0.14770898069173205, + "grad_norm": 0.3456907272338867, + "learning_rate": 0.00017049350784387632, + "loss": 1.1714, + "step": 11367 + }, + { + "epoch": 0.14772197523564792, + "grad_norm": 0.29394423961639404, + "learning_rate": 0.00017049090838196492, + "loss": 1.4336, + "step": 11368 + }, + { + "epoch": 0.1477349697795638, + "grad_norm": 0.3596062958240509, + "learning_rate": 0.00017048830892005357, + "loss": 1.3993, + "step": 11369 + }, + { + "epoch": 0.14774796432347967, + "grad_norm": 0.430662602186203, + "learning_rate": 0.00017048570945814217, + "loss": 1.3741, + "step": 11370 + }, + { + "epoch": 0.14776095886739554, + "grad_norm": 0.6005839109420776, + "learning_rate": 0.0001704831099962308, + "loss": 1.631, + "step": 11371 + }, + { + "epoch": 0.14777395341131142, + "grad_norm": 0.3450881242752075, + "learning_rate": 0.0001704805105343194, + "loss": 1.2928, + "step": 11372 + }, + { + "epoch": 0.1477869479552273, + "grad_norm": 0.3537757396697998, + "learning_rate": 0.00017047791107240801, + "loss": 1.3556, + "step": 11373 + }, + { + "epoch": 0.14779994249914316, + "grad_norm": 0.380252480506897, + "learning_rate": 0.00017047531161049664, + "loss": 1.5213, + "step": 11374 + }, + { + "epoch": 0.14781293704305903, + "grad_norm": 0.30626946687698364, + "learning_rate": 0.00017047271214858524, + "loss": 1.1617, + "step": 11375 + }, + { + "epoch": 0.1478259315869749, + "grad_norm": 0.37865880131721497, + "learning_rate": 0.0001704701126866739, + "loss": 1.6296, + "step": 11376 + }, + { + "epoch": 0.14783892613089078, + "grad_norm": 0.40330734848976135, + "learning_rate": 0.00017046751322476249, + "loss": 1.5046, + "step": 11377 + }, + { + "epoch": 0.14785192067480665, + "grad_norm": 0.33202293515205383, + "learning_rate": 0.0001704649137628511, + "loss": 1.496, + "step": 11378 + }, + { + "epoch": 0.14786491521872253, + "grad_norm": 0.4565335214138031, + "learning_rate": 0.0001704623143009397, + "loss": 1.4885, + "step": 11379 + }, + { + "epoch": 0.1478779097626384, + "grad_norm": 0.4210394322872162, + "learning_rate": 0.00017045971483902833, + "loss": 1.4602, + "step": 11380 + }, + { + "epoch": 0.14789090430655427, + "grad_norm": 0.29736971855163574, + "learning_rate": 0.00017045711537711696, + "loss": 1.4655, + "step": 11381 + }, + { + "epoch": 0.14790389885047017, + "grad_norm": 0.45637187361717224, + "learning_rate": 0.00017045451591520555, + "loss": 1.4731, + "step": 11382 + }, + { + "epoch": 0.14791689339438605, + "grad_norm": 0.47508472204208374, + "learning_rate": 0.00017045191645329418, + "loss": 1.4168, + "step": 11383 + }, + { + "epoch": 0.14792988793830192, + "grad_norm": 0.3757266402244568, + "learning_rate": 0.0001704493169913828, + "loss": 1.484, + "step": 11384 + }, + { + "epoch": 0.1479428824822178, + "grad_norm": 0.45095619559288025, + "learning_rate": 0.0001704467175294714, + "loss": 1.4376, + "step": 11385 + }, + { + "epoch": 0.14795587702613366, + "grad_norm": 0.3567884564399719, + "learning_rate": 0.00017044411806756002, + "loss": 1.253, + "step": 11386 + }, + { + "epoch": 0.14796887157004954, + "grad_norm": 0.4230720102787018, + "learning_rate": 0.00017044151860564862, + "loss": 1.4911, + "step": 11387 + }, + { + "epoch": 0.1479818661139654, + "grad_norm": 0.3521578907966614, + "learning_rate": 0.00017043891914373727, + "loss": 1.3722, + "step": 11388 + }, + { + "epoch": 0.14799486065788128, + "grad_norm": 0.3804466426372528, + "learning_rate": 0.00017043631968182587, + "loss": 1.5331, + "step": 11389 + }, + { + "epoch": 0.14800785520179716, + "grad_norm": 0.3658037781715393, + "learning_rate": 0.0001704337202199145, + "loss": 1.3457, + "step": 11390 + }, + { + "epoch": 0.14802084974571303, + "grad_norm": 0.3560846447944641, + "learning_rate": 0.0001704311207580031, + "loss": 1.2726, + "step": 11391 + }, + { + "epoch": 0.1480338442896289, + "grad_norm": 0.3416954576969147, + "learning_rate": 0.00017042852129609172, + "loss": 1.4962, + "step": 11392 + }, + { + "epoch": 0.14804683883354477, + "grad_norm": 0.38784071803092957, + "learning_rate": 0.00017042592183418034, + "loss": 1.3989, + "step": 11393 + }, + { + "epoch": 0.14805983337746065, + "grad_norm": 0.4625445306301117, + "learning_rate": 0.00017042332237226894, + "loss": 1.4303, + "step": 11394 + }, + { + "epoch": 0.14807282792137652, + "grad_norm": 0.7778862714767456, + "learning_rate": 0.00017042072291035756, + "loss": 1.5541, + "step": 11395 + }, + { + "epoch": 0.1480858224652924, + "grad_norm": 0.3546242415904999, + "learning_rate": 0.0001704181234484462, + "loss": 1.5026, + "step": 11396 + }, + { + "epoch": 0.14809881700920827, + "grad_norm": 0.3760068714618683, + "learning_rate": 0.00017041552398653479, + "loss": 1.4071, + "step": 11397 + }, + { + "epoch": 0.14811181155312414, + "grad_norm": 0.48493143916130066, + "learning_rate": 0.0001704129245246234, + "loss": 1.4072, + "step": 11398 + }, + { + "epoch": 0.14812480609704, + "grad_norm": 0.40202566981315613, + "learning_rate": 0.000170410325062712, + "loss": 1.4443, + "step": 11399 + }, + { + "epoch": 0.14813780064095589, + "grad_norm": 0.3808056712150574, + "learning_rate": 0.00017040772560080066, + "loss": 1.4696, + "step": 11400 + }, + { + "epoch": 0.14815079518487176, + "grad_norm": 0.3787548243999481, + "learning_rate": 0.00017040512613888926, + "loss": 1.5949, + "step": 11401 + }, + { + "epoch": 0.14816378972878763, + "grad_norm": 0.3699875771999359, + "learning_rate": 0.00017040252667697788, + "loss": 1.6121, + "step": 11402 + }, + { + "epoch": 0.1481767842727035, + "grad_norm": 0.4473171830177307, + "learning_rate": 0.00017039992721506648, + "loss": 1.3451, + "step": 11403 + }, + { + "epoch": 0.14818977881661938, + "grad_norm": 0.3499412536621094, + "learning_rate": 0.0001703973277531551, + "loss": 1.2668, + "step": 11404 + }, + { + "epoch": 0.14820277336053525, + "grad_norm": 0.4555814862251282, + "learning_rate": 0.00017039472829124373, + "loss": 1.55, + "step": 11405 + }, + { + "epoch": 0.14821576790445112, + "grad_norm": 0.459335595369339, + "learning_rate": 0.00017039212882933232, + "loss": 1.4368, + "step": 11406 + }, + { + "epoch": 0.148228762448367, + "grad_norm": 0.3566705286502838, + "learning_rate": 0.00017038952936742095, + "loss": 1.5282, + "step": 11407 + }, + { + "epoch": 0.14824175699228287, + "grad_norm": 0.38004159927368164, + "learning_rate": 0.00017038692990550957, + "loss": 1.5673, + "step": 11408 + }, + { + "epoch": 0.14825475153619874, + "grad_norm": 0.37175101041793823, + "learning_rate": 0.00017038433044359817, + "loss": 1.5419, + "step": 11409 + }, + { + "epoch": 0.14826774608011462, + "grad_norm": 0.4951498806476593, + "learning_rate": 0.0001703817309816868, + "loss": 1.2901, + "step": 11410 + }, + { + "epoch": 0.1482807406240305, + "grad_norm": 0.3955337107181549, + "learning_rate": 0.00017037913151977542, + "loss": 1.3176, + "step": 11411 + }, + { + "epoch": 0.14829373516794636, + "grad_norm": 0.42036792635917664, + "learning_rate": 0.00017037653205786404, + "loss": 1.6788, + "step": 11412 + }, + { + "epoch": 0.14830672971186223, + "grad_norm": 0.3640468418598175, + "learning_rate": 0.00017037393259595264, + "loss": 1.4525, + "step": 11413 + }, + { + "epoch": 0.1483197242557781, + "grad_norm": 0.408597469329834, + "learning_rate": 0.00017037133313404127, + "loss": 1.4319, + "step": 11414 + }, + { + "epoch": 0.14833271879969398, + "grad_norm": 0.3696690499782562, + "learning_rate": 0.0001703687336721299, + "loss": 1.7039, + "step": 11415 + }, + { + "epoch": 0.14834571334360985, + "grad_norm": 0.35438260436058044, + "learning_rate": 0.0001703661342102185, + "loss": 1.3458, + "step": 11416 + }, + { + "epoch": 0.14835870788752573, + "grad_norm": 0.37571418285369873, + "learning_rate": 0.0001703635347483071, + "loss": 1.299, + "step": 11417 + }, + { + "epoch": 0.1483717024314416, + "grad_norm": 0.4100554585456848, + "learning_rate": 0.0001703609352863957, + "loss": 1.544, + "step": 11418 + }, + { + "epoch": 0.14838469697535747, + "grad_norm": 0.3780863583087921, + "learning_rate": 0.00017035833582448436, + "loss": 1.2972, + "step": 11419 + }, + { + "epoch": 0.14839769151927334, + "grad_norm": 0.5090693235397339, + "learning_rate": 0.00017035573636257296, + "loss": 1.3581, + "step": 11420 + }, + { + "epoch": 0.14841068606318922, + "grad_norm": 0.3582127094268799, + "learning_rate": 0.00017035313690066156, + "loss": 1.5535, + "step": 11421 + }, + { + "epoch": 0.1484236806071051, + "grad_norm": 0.4482356309890747, + "learning_rate": 0.00017035053743875018, + "loss": 1.2912, + "step": 11422 + }, + { + "epoch": 0.14843667515102096, + "grad_norm": 0.4512385129928589, + "learning_rate": 0.0001703479379768388, + "loss": 1.5941, + "step": 11423 + }, + { + "epoch": 0.14844966969493684, + "grad_norm": 0.263071745634079, + "learning_rate": 0.00017034533851492743, + "loss": 1.3899, + "step": 11424 + }, + { + "epoch": 0.1484626642388527, + "grad_norm": 0.4690077304840088, + "learning_rate": 0.00017034273905301603, + "loss": 1.4529, + "step": 11425 + }, + { + "epoch": 0.14847565878276858, + "grad_norm": 0.4605686664581299, + "learning_rate": 0.00017034013959110465, + "loss": 1.4893, + "step": 11426 + }, + { + "epoch": 0.14848865332668446, + "grad_norm": 0.42092588543891907, + "learning_rate": 0.00017033754012919328, + "loss": 1.5955, + "step": 11427 + }, + { + "epoch": 0.14850164787060033, + "grad_norm": 0.4872777760028839, + "learning_rate": 0.00017033494066728187, + "loss": 1.3893, + "step": 11428 + }, + { + "epoch": 0.1485146424145162, + "grad_norm": 0.4220643639564514, + "learning_rate": 0.0001703323412053705, + "loss": 1.4826, + "step": 11429 + }, + { + "epoch": 0.14852763695843207, + "grad_norm": 0.41432252526283264, + "learning_rate": 0.0001703297417434591, + "loss": 1.6453, + "step": 11430 + }, + { + "epoch": 0.14854063150234795, + "grad_norm": 0.3603918254375458, + "learning_rate": 0.00017032714228154775, + "loss": 1.5115, + "step": 11431 + }, + { + "epoch": 0.14855362604626382, + "grad_norm": 0.4360947608947754, + "learning_rate": 0.00017032454281963634, + "loss": 1.306, + "step": 11432 + }, + { + "epoch": 0.1485666205901797, + "grad_norm": 0.40506333112716675, + "learning_rate": 0.00017032194335772497, + "loss": 1.529, + "step": 11433 + }, + { + "epoch": 0.14857961513409557, + "grad_norm": 0.32672175765037537, + "learning_rate": 0.00017031934389581357, + "loss": 1.3138, + "step": 11434 + }, + { + "epoch": 0.14859260967801144, + "grad_norm": 0.31174424290657043, + "learning_rate": 0.0001703167444339022, + "loss": 1.4927, + "step": 11435 + }, + { + "epoch": 0.1486056042219273, + "grad_norm": 0.4484836757183075, + "learning_rate": 0.00017031414497199082, + "loss": 1.5058, + "step": 11436 + }, + { + "epoch": 0.14861859876584319, + "grad_norm": 0.38390782475471497, + "learning_rate": 0.0001703115455100794, + "loss": 1.2884, + "step": 11437 + }, + { + "epoch": 0.14863159330975906, + "grad_norm": 0.39949488639831543, + "learning_rate": 0.00017030894604816804, + "loss": 1.5338, + "step": 11438 + }, + { + "epoch": 0.14864458785367493, + "grad_norm": 0.2830457389354706, + "learning_rate": 0.00017030634658625666, + "loss": 1.2745, + "step": 11439 + }, + { + "epoch": 0.1486575823975908, + "grad_norm": 0.48456454277038574, + "learning_rate": 0.00017030374712434526, + "loss": 1.51, + "step": 11440 + }, + { + "epoch": 0.14867057694150668, + "grad_norm": 0.39934781193733215, + "learning_rate": 0.00017030114766243388, + "loss": 1.4383, + "step": 11441 + }, + { + "epoch": 0.14868357148542255, + "grad_norm": 0.34529444575309753, + "learning_rate": 0.00017029854820052248, + "loss": 1.3632, + "step": 11442 + }, + { + "epoch": 0.14869656602933842, + "grad_norm": 0.36620810627937317, + "learning_rate": 0.00017029594873861113, + "loss": 1.359, + "step": 11443 + }, + { + "epoch": 0.1487095605732543, + "grad_norm": 0.3614499568939209, + "learning_rate": 0.00017029334927669973, + "loss": 1.3205, + "step": 11444 + }, + { + "epoch": 0.14872255511717017, + "grad_norm": 0.4184872508049011, + "learning_rate": 0.00017029074981478835, + "loss": 1.7229, + "step": 11445 + }, + { + "epoch": 0.14873554966108604, + "grad_norm": 0.39357790350914, + "learning_rate": 0.00017028815035287695, + "loss": 1.6542, + "step": 11446 + }, + { + "epoch": 0.14874854420500191, + "grad_norm": 0.456364244222641, + "learning_rate": 0.00017028555089096558, + "loss": 1.4659, + "step": 11447 + }, + { + "epoch": 0.1487615387489178, + "grad_norm": 0.450084388256073, + "learning_rate": 0.0001702829514290542, + "loss": 1.4164, + "step": 11448 + }, + { + "epoch": 0.14877453329283366, + "grad_norm": 0.4823257625102997, + "learning_rate": 0.0001702803519671428, + "loss": 1.4727, + "step": 11449 + }, + { + "epoch": 0.14878752783674953, + "grad_norm": 0.4477674961090088, + "learning_rate": 0.00017027775250523142, + "loss": 1.4606, + "step": 11450 + }, + { + "epoch": 0.1488005223806654, + "grad_norm": 0.3501686751842499, + "learning_rate": 0.00017027515304332005, + "loss": 1.5221, + "step": 11451 + }, + { + "epoch": 0.14881351692458128, + "grad_norm": 0.4303988516330719, + "learning_rate": 0.00017027255358140864, + "loss": 1.4635, + "step": 11452 + }, + { + "epoch": 0.14882651146849715, + "grad_norm": 0.42449280619621277, + "learning_rate": 0.00017026995411949727, + "loss": 1.6963, + "step": 11453 + }, + { + "epoch": 0.14883950601241303, + "grad_norm": 0.34983667731285095, + "learning_rate": 0.0001702673546575859, + "loss": 1.3887, + "step": 11454 + }, + { + "epoch": 0.1488525005563289, + "grad_norm": 0.3292420506477356, + "learning_rate": 0.00017026475519567452, + "loss": 1.2677, + "step": 11455 + }, + { + "epoch": 0.14886549510024477, + "grad_norm": 0.36140671372413635, + "learning_rate": 0.00017026215573376312, + "loss": 1.3379, + "step": 11456 + }, + { + "epoch": 0.14887848964416064, + "grad_norm": 0.2870061993598938, + "learning_rate": 0.00017025955627185174, + "loss": 1.4775, + "step": 11457 + }, + { + "epoch": 0.14889148418807654, + "grad_norm": 0.5135142207145691, + "learning_rate": 0.00017025695680994036, + "loss": 1.5304, + "step": 11458 + }, + { + "epoch": 0.14890447873199242, + "grad_norm": 0.3882032036781311, + "learning_rate": 0.00017025435734802896, + "loss": 1.3712, + "step": 11459 + }, + { + "epoch": 0.1489174732759083, + "grad_norm": 0.5394908785820007, + "learning_rate": 0.00017025175788611759, + "loss": 1.5066, + "step": 11460 + }, + { + "epoch": 0.14893046781982416, + "grad_norm": 0.4915997087955475, + "learning_rate": 0.00017024915842420618, + "loss": 1.4292, + "step": 11461 + }, + { + "epoch": 0.14894346236374004, + "grad_norm": 0.4425760507583618, + "learning_rate": 0.00017024655896229484, + "loss": 1.4188, + "step": 11462 + }, + { + "epoch": 0.1489564569076559, + "grad_norm": 0.358690470457077, + "learning_rate": 0.00017024395950038343, + "loss": 1.4782, + "step": 11463 + }, + { + "epoch": 0.14896945145157178, + "grad_norm": 0.29866454005241394, + "learning_rate": 0.00017024136003847203, + "loss": 1.2896, + "step": 11464 + }, + { + "epoch": 0.14898244599548766, + "grad_norm": 0.3622207045555115, + "learning_rate": 0.00017023876057656065, + "loss": 1.4822, + "step": 11465 + }, + { + "epoch": 0.14899544053940353, + "grad_norm": 0.441483736038208, + "learning_rate": 0.00017023616111464928, + "loss": 1.6407, + "step": 11466 + }, + { + "epoch": 0.1490084350833194, + "grad_norm": 0.39719903469085693, + "learning_rate": 0.0001702335616527379, + "loss": 1.6008, + "step": 11467 + }, + { + "epoch": 0.14902142962723527, + "grad_norm": 0.43006718158721924, + "learning_rate": 0.0001702309621908265, + "loss": 1.4292, + "step": 11468 + }, + { + "epoch": 0.14903442417115115, + "grad_norm": 0.4161652624607086, + "learning_rate": 0.00017022836272891513, + "loss": 1.3961, + "step": 11469 + }, + { + "epoch": 0.14904741871506702, + "grad_norm": 0.30316856503486633, + "learning_rate": 0.00017022576326700375, + "loss": 1.2608, + "step": 11470 + }, + { + "epoch": 0.1490604132589829, + "grad_norm": 0.3793087303638458, + "learning_rate": 0.00017022316380509235, + "loss": 1.3831, + "step": 11471 + }, + { + "epoch": 0.14907340780289877, + "grad_norm": 0.4868999123573303, + "learning_rate": 0.00017022056434318097, + "loss": 1.4315, + "step": 11472 + }, + { + "epoch": 0.14908640234681464, + "grad_norm": 0.34300515055656433, + "learning_rate": 0.00017021796488126957, + "loss": 1.2452, + "step": 11473 + }, + { + "epoch": 0.1490993968907305, + "grad_norm": 0.3368987441062927, + "learning_rate": 0.00017021536541935822, + "loss": 1.3229, + "step": 11474 + }, + { + "epoch": 0.14911239143464639, + "grad_norm": 0.37760478258132935, + "learning_rate": 0.00017021276595744682, + "loss": 1.2469, + "step": 11475 + }, + { + "epoch": 0.14912538597856226, + "grad_norm": 0.3617461025714874, + "learning_rate": 0.00017021016649553542, + "loss": 1.5085, + "step": 11476 + }, + { + "epoch": 0.14913838052247813, + "grad_norm": 0.35681548714637756, + "learning_rate": 0.00017020756703362404, + "loss": 1.3706, + "step": 11477 + }, + { + "epoch": 0.149151375066394, + "grad_norm": 0.4595588445663452, + "learning_rate": 0.00017020496757171266, + "loss": 1.4449, + "step": 11478 + }, + { + "epoch": 0.14916436961030988, + "grad_norm": 0.5224841833114624, + "learning_rate": 0.0001702023681098013, + "loss": 1.4452, + "step": 11479 + }, + { + "epoch": 0.14917736415422575, + "grad_norm": 0.41214656829833984, + "learning_rate": 0.00017019976864788989, + "loss": 1.5993, + "step": 11480 + }, + { + "epoch": 0.14919035869814162, + "grad_norm": 0.3594188392162323, + "learning_rate": 0.0001701971691859785, + "loss": 1.4421, + "step": 11481 + }, + { + "epoch": 0.1492033532420575, + "grad_norm": 0.5174275040626526, + "learning_rate": 0.00017019456972406714, + "loss": 1.3417, + "step": 11482 + }, + { + "epoch": 0.14921634778597337, + "grad_norm": 0.23060114681720734, + "learning_rate": 0.00017019197026215573, + "loss": 1.1345, + "step": 11483 + }, + { + "epoch": 0.14922934232988924, + "grad_norm": 0.30914217233657837, + "learning_rate": 0.00017018937080024436, + "loss": 1.3373, + "step": 11484 + }, + { + "epoch": 0.14924233687380511, + "grad_norm": 0.3719383776187897, + "learning_rate": 0.00017018677133833298, + "loss": 1.527, + "step": 11485 + }, + { + "epoch": 0.149255331417721, + "grad_norm": 0.4321482479572296, + "learning_rate": 0.0001701841718764216, + "loss": 1.321, + "step": 11486 + }, + { + "epoch": 0.14926832596163686, + "grad_norm": 0.2888380289077759, + "learning_rate": 0.0001701815724145102, + "loss": 1.3297, + "step": 11487 + }, + { + "epoch": 0.14928132050555273, + "grad_norm": 0.4421655535697937, + "learning_rate": 0.0001701789729525988, + "loss": 1.5391, + "step": 11488 + }, + { + "epoch": 0.1492943150494686, + "grad_norm": 0.3577654957771301, + "learning_rate": 0.00017017637349068745, + "loss": 1.368, + "step": 11489 + }, + { + "epoch": 0.14930730959338448, + "grad_norm": 0.3633783161640167, + "learning_rate": 0.00017017377402877605, + "loss": 1.5692, + "step": 11490 + }, + { + "epoch": 0.14932030413730035, + "grad_norm": 0.3944384753704071, + "learning_rate": 0.00017017117456686467, + "loss": 1.3401, + "step": 11491 + }, + { + "epoch": 0.14933329868121623, + "grad_norm": 0.34175336360931396, + "learning_rate": 0.00017016857510495327, + "loss": 1.3792, + "step": 11492 + }, + { + "epoch": 0.1493462932251321, + "grad_norm": 0.3473910689353943, + "learning_rate": 0.0001701659756430419, + "loss": 1.3927, + "step": 11493 + }, + { + "epoch": 0.14935928776904797, + "grad_norm": 0.412923663854599, + "learning_rate": 0.00017016337618113052, + "loss": 1.462, + "step": 11494 + }, + { + "epoch": 0.14937228231296384, + "grad_norm": 0.46928200125694275, + "learning_rate": 0.00017016077671921912, + "loss": 1.5559, + "step": 11495 + }, + { + "epoch": 0.14938527685687972, + "grad_norm": 0.36658424139022827, + "learning_rate": 0.00017015817725730774, + "loss": 1.4706, + "step": 11496 + }, + { + "epoch": 0.1493982714007956, + "grad_norm": 0.44214513897895813, + "learning_rate": 0.00017015557779539637, + "loss": 1.5378, + "step": 11497 + }, + { + "epoch": 0.14941126594471146, + "grad_norm": 0.40049588680267334, + "learning_rate": 0.000170152978333485, + "loss": 1.3949, + "step": 11498 + }, + { + "epoch": 0.14942426048862734, + "grad_norm": 0.4464159607887268, + "learning_rate": 0.0001701503788715736, + "loss": 1.4155, + "step": 11499 + }, + { + "epoch": 0.1494372550325432, + "grad_norm": 0.4204886257648468, + "learning_rate": 0.0001701477794096622, + "loss": 1.5775, + "step": 11500 + }, + { + "epoch": 0.14945024957645908, + "grad_norm": 0.41453877091407776, + "learning_rate": 0.00017014517994775084, + "loss": 1.4874, + "step": 11501 + }, + { + "epoch": 0.14946324412037496, + "grad_norm": 0.43641197681427, + "learning_rate": 0.00017014258048583943, + "loss": 1.3632, + "step": 11502 + }, + { + "epoch": 0.14947623866429083, + "grad_norm": 0.4206138551235199, + "learning_rate": 0.00017013998102392806, + "loss": 1.5258, + "step": 11503 + }, + { + "epoch": 0.1494892332082067, + "grad_norm": 0.4245132803916931, + "learning_rate": 0.00017013738156201666, + "loss": 1.3837, + "step": 11504 + }, + { + "epoch": 0.14950222775212257, + "grad_norm": 0.4638732969760895, + "learning_rate": 0.00017013478210010528, + "loss": 1.4121, + "step": 11505 + }, + { + "epoch": 0.14951522229603845, + "grad_norm": 0.36251822113990784, + "learning_rate": 0.0001701321826381939, + "loss": 1.4755, + "step": 11506 + }, + { + "epoch": 0.14952821683995432, + "grad_norm": 0.3813481330871582, + "learning_rate": 0.0001701295831762825, + "loss": 1.5329, + "step": 11507 + }, + { + "epoch": 0.1495412113838702, + "grad_norm": 0.5225658416748047, + "learning_rate": 0.00017012698371437113, + "loss": 1.5728, + "step": 11508 + }, + { + "epoch": 0.14955420592778607, + "grad_norm": 0.35174351930618286, + "learning_rate": 0.00017012438425245975, + "loss": 1.1789, + "step": 11509 + }, + { + "epoch": 0.14956720047170194, + "grad_norm": 0.3173539340496063, + "learning_rate": 0.00017012178479054838, + "loss": 1.3163, + "step": 11510 + }, + { + "epoch": 0.1495801950156178, + "grad_norm": 0.395747572183609, + "learning_rate": 0.00017011918532863697, + "loss": 1.4944, + "step": 11511 + }, + { + "epoch": 0.14959318955953368, + "grad_norm": 0.3002728521823883, + "learning_rate": 0.0001701165858667256, + "loss": 1.2828, + "step": 11512 + }, + { + "epoch": 0.14960618410344956, + "grad_norm": 0.43162137269973755, + "learning_rate": 0.00017011398640481422, + "loss": 1.4107, + "step": 11513 + }, + { + "epoch": 0.14961917864736543, + "grad_norm": 0.41468486189842224, + "learning_rate": 0.00017011138694290282, + "loss": 1.5094, + "step": 11514 + }, + { + "epoch": 0.1496321731912813, + "grad_norm": 0.44490954279899597, + "learning_rate": 0.00017010878748099144, + "loss": 1.3995, + "step": 11515 + }, + { + "epoch": 0.14964516773519718, + "grad_norm": 0.399789959192276, + "learning_rate": 0.00017010618801908004, + "loss": 1.3026, + "step": 11516 + }, + { + "epoch": 0.14965816227911305, + "grad_norm": 0.5348469614982605, + "learning_rate": 0.0001701035885571687, + "loss": 1.3991, + "step": 11517 + }, + { + "epoch": 0.14967115682302892, + "grad_norm": 0.38565200567245483, + "learning_rate": 0.0001701009890952573, + "loss": 1.3996, + "step": 11518 + }, + { + "epoch": 0.1496841513669448, + "grad_norm": 0.38135018944740295, + "learning_rate": 0.0001700983896333459, + "loss": 1.3403, + "step": 11519 + }, + { + "epoch": 0.14969714591086067, + "grad_norm": 0.3521561622619629, + "learning_rate": 0.0001700957901714345, + "loss": 1.0993, + "step": 11520 + }, + { + "epoch": 0.14971014045477654, + "grad_norm": 0.4569131135940552, + "learning_rate": 0.00017009319070952314, + "loss": 1.3737, + "step": 11521 + }, + { + "epoch": 0.14972313499869241, + "grad_norm": 0.6189255118370056, + "learning_rate": 0.00017009059124761176, + "loss": 1.5228, + "step": 11522 + }, + { + "epoch": 0.1497361295426083, + "grad_norm": 0.4383171498775482, + "learning_rate": 0.00017008799178570036, + "loss": 1.3263, + "step": 11523 + }, + { + "epoch": 0.14974912408652416, + "grad_norm": 0.5443047285079956, + "learning_rate": 0.00017008539232378898, + "loss": 1.4963, + "step": 11524 + }, + { + "epoch": 0.14976211863044003, + "grad_norm": 0.32509666681289673, + "learning_rate": 0.0001700827928618776, + "loss": 1.4382, + "step": 11525 + }, + { + "epoch": 0.1497751131743559, + "grad_norm": 0.35972630977630615, + "learning_rate": 0.0001700801933999662, + "loss": 1.34, + "step": 11526 + }, + { + "epoch": 0.14978810771827178, + "grad_norm": 0.3555624783039093, + "learning_rate": 0.00017007759393805483, + "loss": 1.2037, + "step": 11527 + }, + { + "epoch": 0.14980110226218765, + "grad_norm": 0.4087228775024414, + "learning_rate": 0.00017007499447614345, + "loss": 1.4519, + "step": 11528 + }, + { + "epoch": 0.14981409680610352, + "grad_norm": 0.34961187839508057, + "learning_rate": 0.00017007239501423208, + "loss": 1.4393, + "step": 11529 + }, + { + "epoch": 0.1498270913500194, + "grad_norm": 0.3724414110183716, + "learning_rate": 0.00017006979555232068, + "loss": 1.3521, + "step": 11530 + }, + { + "epoch": 0.14984008589393527, + "grad_norm": 0.4106716513633728, + "learning_rate": 0.00017006719609040927, + "loss": 1.4609, + "step": 11531 + }, + { + "epoch": 0.14985308043785114, + "grad_norm": 0.4787338972091675, + "learning_rate": 0.00017006459662849793, + "loss": 1.473, + "step": 11532 + }, + { + "epoch": 0.14986607498176702, + "grad_norm": 0.3432345390319824, + "learning_rate": 0.00017006199716658652, + "loss": 1.2589, + "step": 11533 + }, + { + "epoch": 0.14987906952568292, + "grad_norm": 0.29949235916137695, + "learning_rate": 0.00017005939770467515, + "loss": 1.4328, + "step": 11534 + }, + { + "epoch": 0.1498920640695988, + "grad_norm": 0.383391410112381, + "learning_rate": 0.00017005679824276374, + "loss": 1.4559, + "step": 11535 + }, + { + "epoch": 0.14990505861351466, + "grad_norm": 0.36394205689430237, + "learning_rate": 0.00017005419878085237, + "loss": 1.572, + "step": 11536 + }, + { + "epoch": 0.14991805315743054, + "grad_norm": 0.4608973562717438, + "learning_rate": 0.000170051599318941, + "loss": 1.581, + "step": 11537 + }, + { + "epoch": 0.1499310477013464, + "grad_norm": 0.5038651823997498, + "learning_rate": 0.0001700489998570296, + "loss": 1.4742, + "step": 11538 + }, + { + "epoch": 0.14994404224526228, + "grad_norm": 0.32855066657066345, + "learning_rate": 0.00017004640039511822, + "loss": 1.4412, + "step": 11539 + }, + { + "epoch": 0.14995703678917816, + "grad_norm": 0.36024582386016846, + "learning_rate": 0.00017004380093320684, + "loss": 1.2248, + "step": 11540 + }, + { + "epoch": 0.14997003133309403, + "grad_norm": 0.46043330430984497, + "learning_rate": 0.00017004120147129546, + "loss": 1.5752, + "step": 11541 + }, + { + "epoch": 0.1499830258770099, + "grad_norm": 0.41294220089912415, + "learning_rate": 0.00017003860200938406, + "loss": 1.2996, + "step": 11542 + }, + { + "epoch": 0.14999602042092577, + "grad_norm": 0.32781121134757996, + "learning_rate": 0.00017003600254747266, + "loss": 1.3951, + "step": 11543 + }, + { + "epoch": 0.15000901496484165, + "grad_norm": 0.4537646472454071, + "learning_rate": 0.0001700334030855613, + "loss": 1.4994, + "step": 11544 + }, + { + "epoch": 0.15002200950875752, + "grad_norm": 0.4044649004936218, + "learning_rate": 0.0001700308036236499, + "loss": 1.4393, + "step": 11545 + }, + { + "epoch": 0.1500350040526734, + "grad_norm": 0.4367256760597229, + "learning_rate": 0.00017002820416173853, + "loss": 1.2856, + "step": 11546 + }, + { + "epoch": 0.15004799859658927, + "grad_norm": 0.3509795665740967, + "learning_rate": 0.00017002560469982713, + "loss": 1.3965, + "step": 11547 + }, + { + "epoch": 0.15006099314050514, + "grad_norm": 0.3995439112186432, + "learning_rate": 0.00017002300523791575, + "loss": 1.4932, + "step": 11548 + }, + { + "epoch": 0.150073987684421, + "grad_norm": 0.5210452079772949, + "learning_rate": 0.00017002040577600438, + "loss": 1.3718, + "step": 11549 + }, + { + "epoch": 0.15008698222833688, + "grad_norm": 0.3905256390571594, + "learning_rate": 0.00017001780631409298, + "loss": 1.5177, + "step": 11550 + }, + { + "epoch": 0.15009997677225276, + "grad_norm": 0.4281039237976074, + "learning_rate": 0.0001700152068521816, + "loss": 1.5604, + "step": 11551 + }, + { + "epoch": 0.15011297131616863, + "grad_norm": 0.34358009696006775, + "learning_rate": 0.00017001260739027023, + "loss": 1.5522, + "step": 11552 + }, + { + "epoch": 0.1501259658600845, + "grad_norm": 0.4277726411819458, + "learning_rate": 0.00017001000792835885, + "loss": 1.3577, + "step": 11553 + }, + { + "epoch": 0.15013896040400038, + "grad_norm": 0.3907712399959564, + "learning_rate": 0.00017000740846644745, + "loss": 1.4417, + "step": 11554 + }, + { + "epoch": 0.15015195494791625, + "grad_norm": 0.29075127840042114, + "learning_rate": 0.00017000480900453607, + "loss": 1.4091, + "step": 11555 + }, + { + "epoch": 0.15016494949183212, + "grad_norm": 0.34866446256637573, + "learning_rate": 0.0001700022095426247, + "loss": 1.369, + "step": 11556 + }, + { + "epoch": 0.150177944035748, + "grad_norm": 0.41150814294815063, + "learning_rate": 0.0001699996100807133, + "loss": 1.6419, + "step": 11557 + }, + { + "epoch": 0.15019093857966387, + "grad_norm": 0.3709924817085266, + "learning_rate": 0.00016999701061880192, + "loss": 1.2633, + "step": 11558 + }, + { + "epoch": 0.15020393312357974, + "grad_norm": 0.43236422538757324, + "learning_rate": 0.00016999441115689054, + "loss": 1.527, + "step": 11559 + }, + { + "epoch": 0.15021692766749561, + "grad_norm": 0.39735615253448486, + "learning_rate": 0.00016999181169497914, + "loss": 1.4665, + "step": 11560 + }, + { + "epoch": 0.1502299222114115, + "grad_norm": 0.3811829686164856, + "learning_rate": 0.00016998921223306776, + "loss": 1.472, + "step": 11561 + }, + { + "epoch": 0.15024291675532736, + "grad_norm": 0.447645902633667, + "learning_rate": 0.00016998661277115636, + "loss": 1.4971, + "step": 11562 + }, + { + "epoch": 0.15025591129924323, + "grad_norm": 0.4006495773792267, + "learning_rate": 0.000169984013309245, + "loss": 1.3629, + "step": 11563 + }, + { + "epoch": 0.1502689058431591, + "grad_norm": 0.44452422857284546, + "learning_rate": 0.0001699814138473336, + "loss": 1.409, + "step": 11564 + }, + { + "epoch": 0.15028190038707498, + "grad_norm": 0.33562028408050537, + "learning_rate": 0.00016997881438542224, + "loss": 1.4957, + "step": 11565 + }, + { + "epoch": 0.15029489493099085, + "grad_norm": 0.3849475383758545, + "learning_rate": 0.00016997621492351083, + "loss": 1.3658, + "step": 11566 + }, + { + "epoch": 0.15030788947490673, + "grad_norm": 0.5096610188484192, + "learning_rate": 0.00016997361546159946, + "loss": 1.4549, + "step": 11567 + }, + { + "epoch": 0.1503208840188226, + "grad_norm": 0.5005442500114441, + "learning_rate": 0.00016997101599968808, + "loss": 1.5121, + "step": 11568 + }, + { + "epoch": 0.15033387856273847, + "grad_norm": 0.4770018458366394, + "learning_rate": 0.00016996841653777668, + "loss": 1.4404, + "step": 11569 + }, + { + "epoch": 0.15034687310665434, + "grad_norm": 0.3391443192958832, + "learning_rate": 0.0001699658170758653, + "loss": 1.4948, + "step": 11570 + }, + { + "epoch": 0.15035986765057022, + "grad_norm": 0.31590428948402405, + "learning_rate": 0.00016996321761395393, + "loss": 1.2762, + "step": 11571 + }, + { + "epoch": 0.1503728621944861, + "grad_norm": 0.5151616930961609, + "learning_rate": 0.00016996061815204253, + "loss": 1.4638, + "step": 11572 + }, + { + "epoch": 0.15038585673840196, + "grad_norm": 0.37689313292503357, + "learning_rate": 0.00016995801869013115, + "loss": 1.3971, + "step": 11573 + }, + { + "epoch": 0.15039885128231784, + "grad_norm": 0.34364745020866394, + "learning_rate": 0.00016995541922821975, + "loss": 1.3721, + "step": 11574 + }, + { + "epoch": 0.1504118458262337, + "grad_norm": 0.3163674473762512, + "learning_rate": 0.0001699528197663084, + "loss": 1.4001, + "step": 11575 + }, + { + "epoch": 0.15042484037014958, + "grad_norm": 0.29825130105018616, + "learning_rate": 0.000169950220304397, + "loss": 1.3976, + "step": 11576 + }, + { + "epoch": 0.15043783491406545, + "grad_norm": 0.36027073860168457, + "learning_rate": 0.00016994762084248562, + "loss": 1.4552, + "step": 11577 + }, + { + "epoch": 0.15045082945798133, + "grad_norm": 0.44277259707450867, + "learning_rate": 0.00016994502138057422, + "loss": 1.4799, + "step": 11578 + }, + { + "epoch": 0.1504638240018972, + "grad_norm": 0.3719746470451355, + "learning_rate": 0.00016994242191866284, + "loss": 1.3541, + "step": 11579 + }, + { + "epoch": 0.15047681854581307, + "grad_norm": 0.288047730922699, + "learning_rate": 0.00016993982245675147, + "loss": 1.4721, + "step": 11580 + }, + { + "epoch": 0.15048981308972895, + "grad_norm": 0.45356979966163635, + "learning_rate": 0.00016993722299484006, + "loss": 1.3371, + "step": 11581 + }, + { + "epoch": 0.15050280763364482, + "grad_norm": 0.3906473219394684, + "learning_rate": 0.0001699346235329287, + "loss": 1.2715, + "step": 11582 + }, + { + "epoch": 0.1505158021775607, + "grad_norm": 0.36312851309776306, + "learning_rate": 0.0001699320240710173, + "loss": 1.3731, + "step": 11583 + }, + { + "epoch": 0.15052879672147657, + "grad_norm": 0.4284430146217346, + "learning_rate": 0.00016992942460910594, + "loss": 1.4051, + "step": 11584 + }, + { + "epoch": 0.15054179126539244, + "grad_norm": 0.35854005813598633, + "learning_rate": 0.00016992682514719454, + "loss": 1.4289, + "step": 11585 + }, + { + "epoch": 0.1505547858093083, + "grad_norm": 0.4714095890522003, + "learning_rate": 0.00016992422568528313, + "loss": 1.4351, + "step": 11586 + }, + { + "epoch": 0.15056778035322418, + "grad_norm": 0.4110665023326874, + "learning_rate": 0.00016992162622337178, + "loss": 1.3518, + "step": 11587 + }, + { + "epoch": 0.15058077489714006, + "grad_norm": 0.3543650209903717, + "learning_rate": 0.00016991902676146038, + "loss": 1.2744, + "step": 11588 + }, + { + "epoch": 0.15059376944105593, + "grad_norm": 0.3380261957645416, + "learning_rate": 0.000169916427299549, + "loss": 1.3543, + "step": 11589 + }, + { + "epoch": 0.1506067639849718, + "grad_norm": 0.3972093462944031, + "learning_rate": 0.0001699138278376376, + "loss": 1.3247, + "step": 11590 + }, + { + "epoch": 0.15061975852888768, + "grad_norm": 0.39904487133026123, + "learning_rate": 0.00016991122837572623, + "loss": 1.4365, + "step": 11591 + }, + { + "epoch": 0.15063275307280355, + "grad_norm": 0.4320739805698395, + "learning_rate": 0.00016990862891381485, + "loss": 1.4437, + "step": 11592 + }, + { + "epoch": 0.15064574761671942, + "grad_norm": 0.3298642039299011, + "learning_rate": 0.00016990602945190345, + "loss": 1.4883, + "step": 11593 + }, + { + "epoch": 0.1506587421606353, + "grad_norm": 0.3774215281009674, + "learning_rate": 0.00016990342998999207, + "loss": 1.2819, + "step": 11594 + }, + { + "epoch": 0.15067173670455117, + "grad_norm": 0.42426952719688416, + "learning_rate": 0.0001699008305280807, + "loss": 1.4879, + "step": 11595 + }, + { + "epoch": 0.15068473124846704, + "grad_norm": 0.28732702136039734, + "learning_rate": 0.00016989823106616932, + "loss": 1.3825, + "step": 11596 + }, + { + "epoch": 0.15069772579238291, + "grad_norm": 0.34764164686203003, + "learning_rate": 0.00016989563160425792, + "loss": 1.2436, + "step": 11597 + }, + { + "epoch": 0.1507107203362988, + "grad_norm": 0.29732435941696167, + "learning_rate": 0.00016989303214234655, + "loss": 1.3263, + "step": 11598 + }, + { + "epoch": 0.15072371488021466, + "grad_norm": 0.4686795473098755, + "learning_rate": 0.00016989043268043517, + "loss": 1.4554, + "step": 11599 + }, + { + "epoch": 0.15073670942413053, + "grad_norm": 0.3847302496433258, + "learning_rate": 0.00016988783321852377, + "loss": 1.3348, + "step": 11600 + }, + { + "epoch": 0.1507497039680464, + "grad_norm": 0.37850356101989746, + "learning_rate": 0.0001698852337566124, + "loss": 1.5803, + "step": 11601 + }, + { + "epoch": 0.15076269851196228, + "grad_norm": 0.49775758385658264, + "learning_rate": 0.00016988263429470102, + "loss": 1.4388, + "step": 11602 + }, + { + "epoch": 0.15077569305587815, + "grad_norm": 0.39869606494903564, + "learning_rate": 0.0001698800348327896, + "loss": 1.2424, + "step": 11603 + }, + { + "epoch": 0.15078868759979402, + "grad_norm": 0.4403208792209625, + "learning_rate": 0.00016987743537087824, + "loss": 1.485, + "step": 11604 + }, + { + "epoch": 0.1508016821437099, + "grad_norm": 0.42019885778427124, + "learning_rate": 0.00016987483590896684, + "loss": 1.2227, + "step": 11605 + }, + { + "epoch": 0.15081467668762577, + "grad_norm": 0.4300461411476135, + "learning_rate": 0.0001698722364470555, + "loss": 1.4772, + "step": 11606 + }, + { + "epoch": 0.15082767123154164, + "grad_norm": 0.31564974784851074, + "learning_rate": 0.00016986963698514408, + "loss": 1.3623, + "step": 11607 + }, + { + "epoch": 0.15084066577545752, + "grad_norm": 0.4545326232910156, + "learning_rate": 0.0001698670375232327, + "loss": 1.5086, + "step": 11608 + }, + { + "epoch": 0.1508536603193734, + "grad_norm": 0.447229266166687, + "learning_rate": 0.0001698644380613213, + "loss": 1.3773, + "step": 11609 + }, + { + "epoch": 0.1508666548632893, + "grad_norm": 0.593318521976471, + "learning_rate": 0.00016986183859940993, + "loss": 1.6173, + "step": 11610 + }, + { + "epoch": 0.15087964940720516, + "grad_norm": 0.3710833787918091, + "learning_rate": 0.00016985923913749856, + "loss": 1.4388, + "step": 11611 + }, + { + "epoch": 0.15089264395112104, + "grad_norm": 0.32597455382347107, + "learning_rate": 0.00016985663967558715, + "loss": 1.2032, + "step": 11612 + }, + { + "epoch": 0.1509056384950369, + "grad_norm": 0.5803954005241394, + "learning_rate": 0.00016985404021367578, + "loss": 1.526, + "step": 11613 + }, + { + "epoch": 0.15091863303895278, + "grad_norm": 0.33099353313446045, + "learning_rate": 0.0001698514407517644, + "loss": 1.4271, + "step": 11614 + }, + { + "epoch": 0.15093162758286865, + "grad_norm": 0.40249207615852356, + "learning_rate": 0.000169848841289853, + "loss": 1.5788, + "step": 11615 + }, + { + "epoch": 0.15094462212678453, + "grad_norm": 0.3555307686328888, + "learning_rate": 0.00016984624182794162, + "loss": 1.3229, + "step": 11616 + }, + { + "epoch": 0.1509576166707004, + "grad_norm": 0.3716249465942383, + "learning_rate": 0.00016984364236603022, + "loss": 1.4209, + "step": 11617 + }, + { + "epoch": 0.15097061121461627, + "grad_norm": 0.4033333957195282, + "learning_rate": 0.00016984104290411887, + "loss": 1.368, + "step": 11618 + }, + { + "epoch": 0.15098360575853215, + "grad_norm": 0.4291049540042877, + "learning_rate": 0.00016983844344220747, + "loss": 1.583, + "step": 11619 + }, + { + "epoch": 0.15099660030244802, + "grad_norm": 0.21523873507976532, + "learning_rate": 0.0001698358439802961, + "loss": 1.3157, + "step": 11620 + }, + { + "epoch": 0.1510095948463639, + "grad_norm": 0.6846172213554382, + "learning_rate": 0.0001698332445183847, + "loss": 1.4644, + "step": 11621 + }, + { + "epoch": 0.15102258939027977, + "grad_norm": 0.4366627335548401, + "learning_rate": 0.00016983064505647332, + "loss": 1.3652, + "step": 11622 + }, + { + "epoch": 0.15103558393419564, + "grad_norm": 0.3246694505214691, + "learning_rate": 0.00016982804559456194, + "loss": 1.4393, + "step": 11623 + }, + { + "epoch": 0.1510485784781115, + "grad_norm": 0.3185274302959442, + "learning_rate": 0.00016982544613265054, + "loss": 1.3833, + "step": 11624 + }, + { + "epoch": 0.15106157302202738, + "grad_norm": 0.428157240152359, + "learning_rate": 0.00016982284667073916, + "loss": 1.439, + "step": 11625 + }, + { + "epoch": 0.15107456756594326, + "grad_norm": 0.41474050283432007, + "learning_rate": 0.0001698202472088278, + "loss": 1.4438, + "step": 11626 + }, + { + "epoch": 0.15108756210985913, + "grad_norm": 0.4155484139919281, + "learning_rate": 0.00016981764774691638, + "loss": 1.6136, + "step": 11627 + }, + { + "epoch": 0.151100556653775, + "grad_norm": 0.36345165967941284, + "learning_rate": 0.000169815048285005, + "loss": 1.3957, + "step": 11628 + }, + { + "epoch": 0.15111355119769088, + "grad_norm": 0.3928201198577881, + "learning_rate": 0.0001698124488230936, + "loss": 1.4091, + "step": 11629 + }, + { + "epoch": 0.15112654574160675, + "grad_norm": 0.38821154832839966, + "learning_rate": 0.00016980984936118226, + "loss": 1.3059, + "step": 11630 + }, + { + "epoch": 0.15113954028552262, + "grad_norm": 0.4513520300388336, + "learning_rate": 0.00016980724989927085, + "loss": 1.5232, + "step": 11631 + }, + { + "epoch": 0.1511525348294385, + "grad_norm": 0.38104453682899475, + "learning_rate": 0.00016980465043735948, + "loss": 1.4161, + "step": 11632 + }, + { + "epoch": 0.15116552937335437, + "grad_norm": 0.32530683279037476, + "learning_rate": 0.0001698020509754481, + "loss": 1.6381, + "step": 11633 + }, + { + "epoch": 0.15117852391727024, + "grad_norm": 0.4283129870891571, + "learning_rate": 0.0001697994515135367, + "loss": 1.4463, + "step": 11634 + }, + { + "epoch": 0.15119151846118611, + "grad_norm": 0.36654505133628845, + "learning_rate": 0.00016979685205162533, + "loss": 1.3663, + "step": 11635 + }, + { + "epoch": 0.151204513005102, + "grad_norm": 0.3840312957763672, + "learning_rate": 0.00016979425258971392, + "loss": 1.4194, + "step": 11636 + }, + { + "epoch": 0.15121750754901786, + "grad_norm": 0.4775658845901489, + "learning_rate": 0.00016979165312780257, + "loss": 1.326, + "step": 11637 + }, + { + "epoch": 0.15123050209293373, + "grad_norm": 0.4977417588233948, + "learning_rate": 0.00016978905366589117, + "loss": 1.4097, + "step": 11638 + }, + { + "epoch": 0.1512434966368496, + "grad_norm": 0.45721298456192017, + "learning_rate": 0.0001697864542039798, + "loss": 1.6385, + "step": 11639 + }, + { + "epoch": 0.15125649118076548, + "grad_norm": 0.38940730690956116, + "learning_rate": 0.0001697838547420684, + "loss": 1.527, + "step": 11640 + }, + { + "epoch": 0.15126948572468135, + "grad_norm": 0.29324549436569214, + "learning_rate": 0.00016978125528015702, + "loss": 1.347, + "step": 11641 + }, + { + "epoch": 0.15128248026859722, + "grad_norm": 0.4306432604789734, + "learning_rate": 0.00016977865581824564, + "loss": 1.4768, + "step": 11642 + }, + { + "epoch": 0.1512954748125131, + "grad_norm": 0.33141615986824036, + "learning_rate": 0.00016977605635633424, + "loss": 1.5374, + "step": 11643 + }, + { + "epoch": 0.15130846935642897, + "grad_norm": 0.4426579475402832, + "learning_rate": 0.00016977345689442286, + "loss": 1.5092, + "step": 11644 + }, + { + "epoch": 0.15132146390034484, + "grad_norm": 0.40388157963752747, + "learning_rate": 0.0001697708574325115, + "loss": 1.4579, + "step": 11645 + }, + { + "epoch": 0.15133445844426072, + "grad_norm": 0.37738269567489624, + "learning_rate": 0.0001697682579706001, + "loss": 1.2104, + "step": 11646 + }, + { + "epoch": 0.1513474529881766, + "grad_norm": 0.3412838578224182, + "learning_rate": 0.0001697656585086887, + "loss": 1.4118, + "step": 11647 + }, + { + "epoch": 0.15136044753209246, + "grad_norm": 0.315868616104126, + "learning_rate": 0.0001697630590467773, + "loss": 1.189, + "step": 11648 + }, + { + "epoch": 0.15137344207600834, + "grad_norm": 0.4698161780834198, + "learning_rate": 0.00016976045958486596, + "loss": 1.3925, + "step": 11649 + }, + { + "epoch": 0.1513864366199242, + "grad_norm": 0.4214162528514862, + "learning_rate": 0.00016975786012295456, + "loss": 1.3597, + "step": 11650 + }, + { + "epoch": 0.15139943116384008, + "grad_norm": 0.35944873094558716, + "learning_rate": 0.00016975526066104318, + "loss": 1.3452, + "step": 11651 + }, + { + "epoch": 0.15141242570775595, + "grad_norm": 0.42097073793411255, + "learning_rate": 0.00016975266119913178, + "loss": 1.5193, + "step": 11652 + }, + { + "epoch": 0.15142542025167183, + "grad_norm": 0.36438876390457153, + "learning_rate": 0.0001697500617372204, + "loss": 1.508, + "step": 11653 + }, + { + "epoch": 0.1514384147955877, + "grad_norm": 0.3361772894859314, + "learning_rate": 0.00016974746227530903, + "loss": 1.4607, + "step": 11654 + }, + { + "epoch": 0.15145140933950357, + "grad_norm": 0.34069326519966125, + "learning_rate": 0.00016974486281339763, + "loss": 1.4599, + "step": 11655 + }, + { + "epoch": 0.15146440388341945, + "grad_norm": 0.3773195445537567, + "learning_rate": 0.00016974226335148625, + "loss": 1.365, + "step": 11656 + }, + { + "epoch": 0.15147739842733532, + "grad_norm": 0.40503740310668945, + "learning_rate": 0.00016973966388957487, + "loss": 1.37, + "step": 11657 + }, + { + "epoch": 0.1514903929712512, + "grad_norm": 0.4238918721675873, + "learning_rate": 0.00016973706442766347, + "loss": 1.3574, + "step": 11658 + }, + { + "epoch": 0.15150338751516707, + "grad_norm": 0.3853955864906311, + "learning_rate": 0.0001697344649657521, + "loss": 1.485, + "step": 11659 + }, + { + "epoch": 0.15151638205908294, + "grad_norm": 0.32171550393104553, + "learning_rate": 0.0001697318655038407, + "loss": 1.4155, + "step": 11660 + }, + { + "epoch": 0.1515293766029988, + "grad_norm": 0.361360639333725, + "learning_rate": 0.00016972926604192935, + "loss": 1.45, + "step": 11661 + }, + { + "epoch": 0.15154237114691468, + "grad_norm": 0.3709021508693695, + "learning_rate": 0.00016972666658001794, + "loss": 1.4208, + "step": 11662 + }, + { + "epoch": 0.15155536569083056, + "grad_norm": 0.3821188807487488, + "learning_rate": 0.00016972406711810657, + "loss": 1.3785, + "step": 11663 + }, + { + "epoch": 0.15156836023474643, + "grad_norm": 0.45584356784820557, + "learning_rate": 0.00016972146765619516, + "loss": 1.4151, + "step": 11664 + }, + { + "epoch": 0.1515813547786623, + "grad_norm": 0.427600234746933, + "learning_rate": 0.0001697188681942838, + "loss": 1.4295, + "step": 11665 + }, + { + "epoch": 0.15159434932257818, + "grad_norm": 0.4677857458591461, + "learning_rate": 0.00016971626873237241, + "loss": 1.5649, + "step": 11666 + }, + { + "epoch": 0.15160734386649405, + "grad_norm": 0.44161033630371094, + "learning_rate": 0.000169713669270461, + "loss": 1.6056, + "step": 11667 + }, + { + "epoch": 0.15162033841040992, + "grad_norm": 0.40123143792152405, + "learning_rate": 0.00016971106980854964, + "loss": 1.571, + "step": 11668 + }, + { + "epoch": 0.1516333329543258, + "grad_norm": 0.35370585322380066, + "learning_rate": 0.00016970847034663826, + "loss": 1.4027, + "step": 11669 + }, + { + "epoch": 0.15164632749824167, + "grad_norm": 0.41544097661972046, + "learning_rate": 0.00016970587088472686, + "loss": 1.3507, + "step": 11670 + }, + { + "epoch": 0.15165932204215754, + "grad_norm": 0.3244834542274475, + "learning_rate": 0.00016970327142281548, + "loss": 1.2671, + "step": 11671 + }, + { + "epoch": 0.1516723165860734, + "grad_norm": 0.4362773597240448, + "learning_rate": 0.0001697006719609041, + "loss": 1.4427, + "step": 11672 + }, + { + "epoch": 0.1516853111299893, + "grad_norm": 0.3408445119857788, + "learning_rate": 0.00016969807249899273, + "loss": 1.4695, + "step": 11673 + }, + { + "epoch": 0.15169830567390516, + "grad_norm": 0.40236592292785645, + "learning_rate": 0.00016969547303708133, + "loss": 1.3771, + "step": 11674 + }, + { + "epoch": 0.15171130021782103, + "grad_norm": 0.4245149791240692, + "learning_rate": 0.00016969287357516995, + "loss": 1.4197, + "step": 11675 + }, + { + "epoch": 0.1517242947617369, + "grad_norm": 0.333930641412735, + "learning_rate": 0.00016969027411325858, + "loss": 1.3159, + "step": 11676 + }, + { + "epoch": 0.15173728930565278, + "grad_norm": 0.4928986728191376, + "learning_rate": 0.00016968767465134717, + "loss": 1.5399, + "step": 11677 + }, + { + "epoch": 0.15175028384956865, + "grad_norm": 0.4258807599544525, + "learning_rate": 0.0001696850751894358, + "loss": 1.4626, + "step": 11678 + }, + { + "epoch": 0.15176327839348452, + "grad_norm": 0.31232404708862305, + "learning_rate": 0.0001696824757275244, + "loss": 1.3532, + "step": 11679 + }, + { + "epoch": 0.1517762729374004, + "grad_norm": 0.3483731746673584, + "learning_rate": 0.00016967987626561305, + "loss": 1.3621, + "step": 11680 + }, + { + "epoch": 0.15178926748131627, + "grad_norm": 0.37102842330932617, + "learning_rate": 0.00016967727680370165, + "loss": 1.461, + "step": 11681 + }, + { + "epoch": 0.15180226202523214, + "grad_norm": 0.2596631646156311, + "learning_rate": 0.00016967467734179024, + "loss": 1.3379, + "step": 11682 + }, + { + "epoch": 0.15181525656914802, + "grad_norm": 0.3717079162597656, + "learning_rate": 0.00016967207787987887, + "loss": 1.5471, + "step": 11683 + }, + { + "epoch": 0.1518282511130639, + "grad_norm": 0.43462130427360535, + "learning_rate": 0.0001696694784179675, + "loss": 1.2458, + "step": 11684 + }, + { + "epoch": 0.15184124565697976, + "grad_norm": 0.3457672595977783, + "learning_rate": 0.00016966687895605612, + "loss": 1.5248, + "step": 11685 + }, + { + "epoch": 0.15185424020089566, + "grad_norm": 0.4820672273635864, + "learning_rate": 0.00016966427949414471, + "loss": 1.4057, + "step": 11686 + }, + { + "epoch": 0.15186723474481154, + "grad_norm": 0.43865731358528137, + "learning_rate": 0.00016966168003223334, + "loss": 1.5902, + "step": 11687 + }, + { + "epoch": 0.1518802292887274, + "grad_norm": 0.4584430456161499, + "learning_rate": 0.00016965908057032196, + "loss": 1.4073, + "step": 11688 + }, + { + "epoch": 0.15189322383264328, + "grad_norm": 0.42312484979629517, + "learning_rate": 0.00016965648110841056, + "loss": 1.5019, + "step": 11689 + }, + { + "epoch": 0.15190621837655915, + "grad_norm": 0.37992262840270996, + "learning_rate": 0.00016965388164649918, + "loss": 1.7851, + "step": 11690 + }, + { + "epoch": 0.15191921292047503, + "grad_norm": 0.424396812915802, + "learning_rate": 0.00016965128218458778, + "loss": 1.3634, + "step": 11691 + }, + { + "epoch": 0.1519322074643909, + "grad_norm": 0.48533403873443604, + "learning_rate": 0.00016964868272267643, + "loss": 1.4662, + "step": 11692 + }, + { + "epoch": 0.15194520200830677, + "grad_norm": 0.48497989773750305, + "learning_rate": 0.00016964608326076503, + "loss": 1.3883, + "step": 11693 + }, + { + "epoch": 0.15195819655222265, + "grad_norm": 0.412862628698349, + "learning_rate": 0.00016964348379885363, + "loss": 1.4506, + "step": 11694 + }, + { + "epoch": 0.15197119109613852, + "grad_norm": 0.3911252021789551, + "learning_rate": 0.00016964088433694225, + "loss": 1.3657, + "step": 11695 + }, + { + "epoch": 0.1519841856400544, + "grad_norm": 0.4373502731323242, + "learning_rate": 0.00016963828487503088, + "loss": 1.4065, + "step": 11696 + }, + { + "epoch": 0.15199718018397027, + "grad_norm": 0.33631205558776855, + "learning_rate": 0.0001696356854131195, + "loss": 1.3583, + "step": 11697 + }, + { + "epoch": 0.15201017472788614, + "grad_norm": 0.2947554290294647, + "learning_rate": 0.0001696330859512081, + "loss": 1.3599, + "step": 11698 + }, + { + "epoch": 0.152023169271802, + "grad_norm": 0.3604918122291565, + "learning_rate": 0.00016963048648929672, + "loss": 1.3281, + "step": 11699 + }, + { + "epoch": 0.15203616381571788, + "grad_norm": 0.47640731930732727, + "learning_rate": 0.00016962788702738535, + "loss": 1.5404, + "step": 11700 + }, + { + "epoch": 0.15204915835963376, + "grad_norm": 0.43649137020111084, + "learning_rate": 0.00016962528756547395, + "loss": 1.6743, + "step": 11701 + }, + { + "epoch": 0.15206215290354963, + "grad_norm": 0.3978079557418823, + "learning_rate": 0.00016962268810356257, + "loss": 1.4434, + "step": 11702 + }, + { + "epoch": 0.1520751474474655, + "grad_norm": 0.3781042993068695, + "learning_rate": 0.00016962008864165117, + "loss": 1.6099, + "step": 11703 + }, + { + "epoch": 0.15208814199138138, + "grad_norm": 0.3335297405719757, + "learning_rate": 0.00016961748917973982, + "loss": 1.3613, + "step": 11704 + }, + { + "epoch": 0.15210113653529725, + "grad_norm": 0.3304100036621094, + "learning_rate": 0.00016961488971782842, + "loss": 1.2748, + "step": 11705 + }, + { + "epoch": 0.15211413107921312, + "grad_norm": 0.34727466106414795, + "learning_rate": 0.00016961229025591704, + "loss": 1.333, + "step": 11706 + }, + { + "epoch": 0.152127125623129, + "grad_norm": 0.3454049229621887, + "learning_rate": 0.00016960969079400564, + "loss": 1.2993, + "step": 11707 + }, + { + "epoch": 0.15214012016704487, + "grad_norm": 0.35378775000572205, + "learning_rate": 0.00016960709133209426, + "loss": 1.2406, + "step": 11708 + }, + { + "epoch": 0.15215311471096074, + "grad_norm": 0.5171632170677185, + "learning_rate": 0.0001696044918701829, + "loss": 1.404, + "step": 11709 + }, + { + "epoch": 0.1521661092548766, + "grad_norm": 0.37615811824798584, + "learning_rate": 0.00016960189240827148, + "loss": 1.3631, + "step": 11710 + }, + { + "epoch": 0.1521791037987925, + "grad_norm": 0.3766386806964874, + "learning_rate": 0.0001695992929463601, + "loss": 1.2087, + "step": 11711 + }, + { + "epoch": 0.15219209834270836, + "grad_norm": 0.37699365615844727, + "learning_rate": 0.00016959669348444873, + "loss": 1.3586, + "step": 11712 + }, + { + "epoch": 0.15220509288662423, + "grad_norm": 0.35546258091926575, + "learning_rate": 0.00016959409402253733, + "loss": 1.3371, + "step": 11713 + }, + { + "epoch": 0.1522180874305401, + "grad_norm": 0.4114397466182709, + "learning_rate": 0.00016959149456062596, + "loss": 1.4927, + "step": 11714 + }, + { + "epoch": 0.15223108197445598, + "grad_norm": 0.38973891735076904, + "learning_rate": 0.00016958889509871458, + "loss": 1.4476, + "step": 11715 + }, + { + "epoch": 0.15224407651837185, + "grad_norm": 0.496116042137146, + "learning_rate": 0.0001695862956368032, + "loss": 1.3409, + "step": 11716 + }, + { + "epoch": 0.15225707106228772, + "grad_norm": 0.4139866232872009, + "learning_rate": 0.0001695836961748918, + "loss": 1.361, + "step": 11717 + }, + { + "epoch": 0.1522700656062036, + "grad_norm": 0.4198775887489319, + "learning_rate": 0.00016958109671298043, + "loss": 1.5599, + "step": 11718 + }, + { + "epoch": 0.15228306015011947, + "grad_norm": 0.49194204807281494, + "learning_rate": 0.00016957849725106905, + "loss": 1.5704, + "step": 11719 + }, + { + "epoch": 0.15229605469403534, + "grad_norm": 0.3449385166168213, + "learning_rate": 0.00016957589778915765, + "loss": 1.3414, + "step": 11720 + }, + { + "epoch": 0.15230904923795122, + "grad_norm": 0.47164613008499146, + "learning_rate": 0.00016957329832724627, + "loss": 1.6382, + "step": 11721 + }, + { + "epoch": 0.1523220437818671, + "grad_norm": 0.4473242163658142, + "learning_rate": 0.00016957069886533487, + "loss": 1.3647, + "step": 11722 + }, + { + "epoch": 0.15233503832578296, + "grad_norm": 0.4058029055595398, + "learning_rate": 0.00016956809940342352, + "loss": 1.5811, + "step": 11723 + }, + { + "epoch": 0.15234803286969884, + "grad_norm": 0.4079439043998718, + "learning_rate": 0.00016956549994151212, + "loss": 1.4383, + "step": 11724 + }, + { + "epoch": 0.1523610274136147, + "grad_norm": 0.41152986884117126, + "learning_rate": 0.00016956290047960072, + "loss": 1.5773, + "step": 11725 + }, + { + "epoch": 0.15237402195753058, + "grad_norm": 0.3442077040672302, + "learning_rate": 0.00016956030101768934, + "loss": 1.4294, + "step": 11726 + }, + { + "epoch": 0.15238701650144645, + "grad_norm": 0.3956660330295563, + "learning_rate": 0.00016955770155577797, + "loss": 1.4581, + "step": 11727 + }, + { + "epoch": 0.15240001104536233, + "grad_norm": 0.42144888639450073, + "learning_rate": 0.0001695551020938666, + "loss": 1.4309, + "step": 11728 + }, + { + "epoch": 0.1524130055892782, + "grad_norm": 0.3697633147239685, + "learning_rate": 0.0001695525026319552, + "loss": 1.515, + "step": 11729 + }, + { + "epoch": 0.15242600013319407, + "grad_norm": 0.40641388297080994, + "learning_rate": 0.0001695499031700438, + "loss": 1.3713, + "step": 11730 + }, + { + "epoch": 0.15243899467710995, + "grad_norm": 0.2823372781276703, + "learning_rate": 0.00016954730370813244, + "loss": 1.2594, + "step": 11731 + }, + { + "epoch": 0.15245198922102582, + "grad_norm": 0.47480398416519165, + "learning_rate": 0.00016954470424622103, + "loss": 1.6418, + "step": 11732 + }, + { + "epoch": 0.1524649837649417, + "grad_norm": 0.504581868648529, + "learning_rate": 0.00016954210478430966, + "loss": 1.4622, + "step": 11733 + }, + { + "epoch": 0.15247797830885756, + "grad_norm": 0.4075296223163605, + "learning_rate": 0.00016953950532239826, + "loss": 1.4376, + "step": 11734 + }, + { + "epoch": 0.15249097285277344, + "grad_norm": 0.2864323854446411, + "learning_rate": 0.0001695369058604869, + "loss": 1.601, + "step": 11735 + }, + { + "epoch": 0.1525039673966893, + "grad_norm": 0.42500028014183044, + "learning_rate": 0.0001695343063985755, + "loss": 1.6395, + "step": 11736 + }, + { + "epoch": 0.15251696194060518, + "grad_norm": 0.36792105436325073, + "learning_rate": 0.0001695317069366641, + "loss": 1.3489, + "step": 11737 + }, + { + "epoch": 0.15252995648452106, + "grad_norm": 0.27140694856643677, + "learning_rate": 0.00016952910747475273, + "loss": 1.3624, + "step": 11738 + }, + { + "epoch": 0.15254295102843693, + "grad_norm": 0.3548365831375122, + "learning_rate": 0.00016952650801284135, + "loss": 1.4093, + "step": 11739 + }, + { + "epoch": 0.1525559455723528, + "grad_norm": 0.3475998640060425, + "learning_rate": 0.00016952390855092998, + "loss": 1.3448, + "step": 11740 + }, + { + "epoch": 0.15256894011626868, + "grad_norm": 0.43726444244384766, + "learning_rate": 0.00016952130908901857, + "loss": 1.4867, + "step": 11741 + }, + { + "epoch": 0.15258193466018455, + "grad_norm": 0.4378892779350281, + "learning_rate": 0.0001695187096271072, + "loss": 1.6379, + "step": 11742 + }, + { + "epoch": 0.15259492920410042, + "grad_norm": 0.45598649978637695, + "learning_rate": 0.00016951611016519582, + "loss": 1.3628, + "step": 11743 + }, + { + "epoch": 0.1526079237480163, + "grad_norm": 0.3495231866836548, + "learning_rate": 0.00016951351070328442, + "loss": 1.6197, + "step": 11744 + }, + { + "epoch": 0.15262091829193217, + "grad_norm": 0.5096492767333984, + "learning_rate": 0.00016951091124137304, + "loss": 1.4921, + "step": 11745 + }, + { + "epoch": 0.15263391283584804, + "grad_norm": 0.41441166400909424, + "learning_rate": 0.00016950831177946167, + "loss": 1.5289, + "step": 11746 + }, + { + "epoch": 0.1526469073797639, + "grad_norm": 0.48872193694114685, + "learning_rate": 0.0001695057123175503, + "loss": 1.4264, + "step": 11747 + }, + { + "epoch": 0.15265990192367979, + "grad_norm": 0.3623164892196655, + "learning_rate": 0.0001695031128556389, + "loss": 1.4951, + "step": 11748 + }, + { + "epoch": 0.15267289646759566, + "grad_norm": 0.43154725432395935, + "learning_rate": 0.0001695005133937275, + "loss": 1.4624, + "step": 11749 + }, + { + "epoch": 0.15268589101151153, + "grad_norm": 0.34462079405784607, + "learning_rate": 0.00016949791393181614, + "loss": 1.339, + "step": 11750 + }, + { + "epoch": 0.1526988855554274, + "grad_norm": 0.37993985414505005, + "learning_rate": 0.00016949531446990474, + "loss": 1.3516, + "step": 11751 + }, + { + "epoch": 0.15271188009934328, + "grad_norm": 0.4231403172016144, + "learning_rate": 0.00016949271500799336, + "loss": 1.4528, + "step": 11752 + }, + { + "epoch": 0.15272487464325915, + "grad_norm": 0.3255899250507355, + "learning_rate": 0.00016949011554608196, + "loss": 1.3643, + "step": 11753 + }, + { + "epoch": 0.15273786918717502, + "grad_norm": 0.45418569445610046, + "learning_rate": 0.00016948751608417058, + "loss": 1.476, + "step": 11754 + }, + { + "epoch": 0.1527508637310909, + "grad_norm": 0.37670525908470154, + "learning_rate": 0.0001694849166222592, + "loss": 1.5341, + "step": 11755 + }, + { + "epoch": 0.15276385827500677, + "grad_norm": 0.35125041007995605, + "learning_rate": 0.0001694823171603478, + "loss": 1.3668, + "step": 11756 + }, + { + "epoch": 0.15277685281892264, + "grad_norm": 0.498047411441803, + "learning_rate": 0.00016947971769843643, + "loss": 1.52, + "step": 11757 + }, + { + "epoch": 0.15278984736283852, + "grad_norm": 0.4123964309692383, + "learning_rate": 0.00016947711823652505, + "loss": 1.4486, + "step": 11758 + }, + { + "epoch": 0.1528028419067544, + "grad_norm": 0.49128657579421997, + "learning_rate": 0.00016947451877461368, + "loss": 1.5993, + "step": 11759 + }, + { + "epoch": 0.15281583645067026, + "grad_norm": 0.32412177324295044, + "learning_rate": 0.00016947191931270228, + "loss": 1.2347, + "step": 11760 + }, + { + "epoch": 0.15282883099458613, + "grad_norm": 0.2931101322174072, + "learning_rate": 0.0001694693198507909, + "loss": 1.4559, + "step": 11761 + }, + { + "epoch": 0.15284182553850204, + "grad_norm": 0.43628764152526855, + "learning_rate": 0.00016946672038887952, + "loss": 1.4967, + "step": 11762 + }, + { + "epoch": 0.1528548200824179, + "grad_norm": 0.339720219373703, + "learning_rate": 0.00016946412092696812, + "loss": 1.2241, + "step": 11763 + }, + { + "epoch": 0.15286781462633378, + "grad_norm": 0.39957964420318604, + "learning_rate": 0.00016946152146505675, + "loss": 1.7905, + "step": 11764 + }, + { + "epoch": 0.15288080917024965, + "grad_norm": 0.48330721259117126, + "learning_rate": 0.00016945892200314534, + "loss": 1.3863, + "step": 11765 + }, + { + "epoch": 0.15289380371416553, + "grad_norm": 0.31594640016555786, + "learning_rate": 0.00016945632254123397, + "loss": 1.2484, + "step": 11766 + }, + { + "epoch": 0.1529067982580814, + "grad_norm": 0.35658761858940125, + "learning_rate": 0.0001694537230793226, + "loss": 1.5249, + "step": 11767 + }, + { + "epoch": 0.15291979280199727, + "grad_norm": 0.3486155867576599, + "learning_rate": 0.0001694511236174112, + "loss": 1.4423, + "step": 11768 + }, + { + "epoch": 0.15293278734591315, + "grad_norm": 0.36469170451164246, + "learning_rate": 0.00016944852415549981, + "loss": 1.3934, + "step": 11769 + }, + { + "epoch": 0.15294578188982902, + "grad_norm": 0.44081711769104004, + "learning_rate": 0.00016944592469358844, + "loss": 1.4846, + "step": 11770 + }, + { + "epoch": 0.1529587764337449, + "grad_norm": 0.35378196835517883, + "learning_rate": 0.00016944332523167706, + "loss": 1.2729, + "step": 11771 + }, + { + "epoch": 0.15297177097766076, + "grad_norm": 0.42618003487586975, + "learning_rate": 0.00016944072576976566, + "loss": 1.4347, + "step": 11772 + }, + { + "epoch": 0.15298476552157664, + "grad_norm": 0.2815198600292206, + "learning_rate": 0.00016943812630785428, + "loss": 1.1926, + "step": 11773 + }, + { + "epoch": 0.1529977600654925, + "grad_norm": 0.3783093988895416, + "learning_rate": 0.0001694355268459429, + "loss": 1.4634, + "step": 11774 + }, + { + "epoch": 0.15301075460940838, + "grad_norm": 0.41654062271118164, + "learning_rate": 0.0001694329273840315, + "loss": 1.2944, + "step": 11775 + }, + { + "epoch": 0.15302374915332426, + "grad_norm": 0.4205237329006195, + "learning_rate": 0.00016943032792212013, + "loss": 1.5138, + "step": 11776 + }, + { + "epoch": 0.15303674369724013, + "grad_norm": 0.3374442458152771, + "learning_rate": 0.00016942772846020873, + "loss": 1.3233, + "step": 11777 + }, + { + "epoch": 0.153049738241156, + "grad_norm": 0.36444100737571716, + "learning_rate": 0.00016942512899829735, + "loss": 1.3894, + "step": 11778 + }, + { + "epoch": 0.15306273278507188, + "grad_norm": 0.37599268555641174, + "learning_rate": 0.00016942252953638598, + "loss": 1.3029, + "step": 11779 + }, + { + "epoch": 0.15307572732898775, + "grad_norm": 0.38979172706604004, + "learning_rate": 0.00016941993007447457, + "loss": 1.382, + "step": 11780 + }, + { + "epoch": 0.15308872187290362, + "grad_norm": 0.48710331320762634, + "learning_rate": 0.0001694173306125632, + "loss": 1.4296, + "step": 11781 + }, + { + "epoch": 0.1531017164168195, + "grad_norm": 0.45828455686569214, + "learning_rate": 0.00016941473115065182, + "loss": 1.281, + "step": 11782 + }, + { + "epoch": 0.15311471096073537, + "grad_norm": 0.36745476722717285, + "learning_rate": 0.00016941213168874045, + "loss": 1.548, + "step": 11783 + }, + { + "epoch": 0.15312770550465124, + "grad_norm": 0.3311762809753418, + "learning_rate": 0.00016940953222682905, + "loss": 1.2836, + "step": 11784 + }, + { + "epoch": 0.1531407000485671, + "grad_norm": 0.3976030647754669, + "learning_rate": 0.00016940693276491767, + "loss": 1.3015, + "step": 11785 + }, + { + "epoch": 0.153153694592483, + "grad_norm": 0.43746116757392883, + "learning_rate": 0.0001694043333030063, + "loss": 1.4303, + "step": 11786 + }, + { + "epoch": 0.15316668913639886, + "grad_norm": 0.4561688303947449, + "learning_rate": 0.0001694017338410949, + "loss": 1.5099, + "step": 11787 + }, + { + "epoch": 0.15317968368031473, + "grad_norm": 0.41558125615119934, + "learning_rate": 0.00016939913437918352, + "loss": 1.3331, + "step": 11788 + }, + { + "epoch": 0.1531926782242306, + "grad_norm": 0.40876078605651855, + "learning_rate": 0.00016939653491727214, + "loss": 1.6399, + "step": 11789 + }, + { + "epoch": 0.15320567276814648, + "grad_norm": 0.45276597142219543, + "learning_rate": 0.00016939393545536077, + "loss": 1.5795, + "step": 11790 + }, + { + "epoch": 0.15321866731206235, + "grad_norm": 0.3830238878726959, + "learning_rate": 0.00016939133599344936, + "loss": 1.4626, + "step": 11791 + }, + { + "epoch": 0.15323166185597822, + "grad_norm": 0.3617945909500122, + "learning_rate": 0.00016938873653153796, + "loss": 1.4513, + "step": 11792 + }, + { + "epoch": 0.1532446563998941, + "grad_norm": 0.31999918818473816, + "learning_rate": 0.0001693861370696266, + "loss": 1.448, + "step": 11793 + }, + { + "epoch": 0.15325765094380997, + "grad_norm": 0.36458662152290344, + "learning_rate": 0.0001693835376077152, + "loss": 1.5335, + "step": 11794 + }, + { + "epoch": 0.15327064548772584, + "grad_norm": 0.3933316767215729, + "learning_rate": 0.00016938093814580383, + "loss": 1.2247, + "step": 11795 + }, + { + "epoch": 0.15328364003164172, + "grad_norm": 0.40105941891670227, + "learning_rate": 0.00016937833868389243, + "loss": 1.4213, + "step": 11796 + }, + { + "epoch": 0.1532966345755576, + "grad_norm": 0.35096871852874756, + "learning_rate": 0.00016937573922198106, + "loss": 1.4141, + "step": 11797 + }, + { + "epoch": 0.15330962911947346, + "grad_norm": 0.40356430411338806, + "learning_rate": 0.00016937313976006968, + "loss": 1.4835, + "step": 11798 + }, + { + "epoch": 0.15332262366338933, + "grad_norm": 0.3647235333919525, + "learning_rate": 0.00016937054029815828, + "loss": 1.3013, + "step": 11799 + }, + { + "epoch": 0.1533356182073052, + "grad_norm": 0.43235769867897034, + "learning_rate": 0.0001693679408362469, + "loss": 1.4495, + "step": 11800 + }, + { + "epoch": 0.15334861275122108, + "grad_norm": 0.3538236916065216, + "learning_rate": 0.00016936534137433553, + "loss": 1.3958, + "step": 11801 + }, + { + "epoch": 0.15336160729513695, + "grad_norm": 0.42808666825294495, + "learning_rate": 0.00016936274191242415, + "loss": 1.4402, + "step": 11802 + }, + { + "epoch": 0.15337460183905283, + "grad_norm": 0.3915475606918335, + "learning_rate": 0.00016936014245051275, + "loss": 1.2853, + "step": 11803 + }, + { + "epoch": 0.1533875963829687, + "grad_norm": 0.27652060985565186, + "learning_rate": 0.00016935754298860135, + "loss": 1.4558, + "step": 11804 + }, + { + "epoch": 0.15340059092688457, + "grad_norm": 0.4507613182067871, + "learning_rate": 0.00016935494352669, + "loss": 1.4878, + "step": 11805 + }, + { + "epoch": 0.15341358547080045, + "grad_norm": 0.40186434984207153, + "learning_rate": 0.0001693523440647786, + "loss": 1.75, + "step": 11806 + }, + { + "epoch": 0.15342658001471632, + "grad_norm": 0.35875099897384644, + "learning_rate": 0.00016934974460286722, + "loss": 1.2244, + "step": 11807 + }, + { + "epoch": 0.1534395745586322, + "grad_norm": 0.435232013463974, + "learning_rate": 0.00016934714514095582, + "loss": 1.5072, + "step": 11808 + }, + { + "epoch": 0.15345256910254806, + "grad_norm": 0.38161787390708923, + "learning_rate": 0.00016934454567904444, + "loss": 1.5401, + "step": 11809 + }, + { + "epoch": 0.15346556364646394, + "grad_norm": 0.37121862173080444, + "learning_rate": 0.00016934194621713307, + "loss": 1.4201, + "step": 11810 + }, + { + "epoch": 0.1534785581903798, + "grad_norm": 0.35048747062683105, + "learning_rate": 0.00016933934675522166, + "loss": 1.4342, + "step": 11811 + }, + { + "epoch": 0.15349155273429568, + "grad_norm": 0.3794581890106201, + "learning_rate": 0.0001693367472933103, + "loss": 1.2232, + "step": 11812 + }, + { + "epoch": 0.15350454727821156, + "grad_norm": 0.39692410826683044, + "learning_rate": 0.0001693341478313989, + "loss": 1.5388, + "step": 11813 + }, + { + "epoch": 0.15351754182212743, + "grad_norm": 0.4707062244415283, + "learning_rate": 0.00016933154836948754, + "loss": 1.4907, + "step": 11814 + }, + { + "epoch": 0.1535305363660433, + "grad_norm": 0.3456328511238098, + "learning_rate": 0.00016932894890757613, + "loss": 1.5022, + "step": 11815 + }, + { + "epoch": 0.15354353090995917, + "grad_norm": 0.32065534591674805, + "learning_rate": 0.00016932634944566476, + "loss": 1.3006, + "step": 11816 + }, + { + "epoch": 0.15355652545387505, + "grad_norm": 0.38260048627853394, + "learning_rate": 0.00016932374998375338, + "loss": 1.2331, + "step": 11817 + }, + { + "epoch": 0.15356951999779092, + "grad_norm": 0.38718950748443604, + "learning_rate": 0.00016932115052184198, + "loss": 1.526, + "step": 11818 + }, + { + "epoch": 0.1535825145417068, + "grad_norm": 0.3998222053050995, + "learning_rate": 0.0001693185510599306, + "loss": 1.6439, + "step": 11819 + }, + { + "epoch": 0.15359550908562267, + "grad_norm": 0.5104019045829773, + "learning_rate": 0.00016931595159801923, + "loss": 1.3729, + "step": 11820 + }, + { + "epoch": 0.15360850362953854, + "grad_norm": 0.351725310087204, + "learning_rate": 0.00016931335213610783, + "loss": 1.374, + "step": 11821 + }, + { + "epoch": 0.1536214981734544, + "grad_norm": 0.4281952679157257, + "learning_rate": 0.00016931075267419645, + "loss": 1.4901, + "step": 11822 + }, + { + "epoch": 0.15363449271737029, + "grad_norm": 0.3969263434410095, + "learning_rate": 0.00016930815321228505, + "loss": 1.1343, + "step": 11823 + }, + { + "epoch": 0.15364748726128616, + "grad_norm": 0.26854419708251953, + "learning_rate": 0.0001693055537503737, + "loss": 1.3934, + "step": 11824 + }, + { + "epoch": 0.15366048180520203, + "grad_norm": 0.35883522033691406, + "learning_rate": 0.0001693029542884623, + "loss": 1.5546, + "step": 11825 + }, + { + "epoch": 0.1536734763491179, + "grad_norm": 0.3865770399570465, + "learning_rate": 0.00016930035482655092, + "loss": 1.4676, + "step": 11826 + }, + { + "epoch": 0.15368647089303378, + "grad_norm": 0.382995069026947, + "learning_rate": 0.00016929775536463952, + "loss": 1.4358, + "step": 11827 + }, + { + "epoch": 0.15369946543694965, + "grad_norm": 0.4226953387260437, + "learning_rate": 0.00016929515590272814, + "loss": 1.4437, + "step": 11828 + }, + { + "epoch": 0.15371245998086552, + "grad_norm": 0.3526670038700104, + "learning_rate": 0.00016929255644081677, + "loss": 1.5229, + "step": 11829 + }, + { + "epoch": 0.1537254545247814, + "grad_norm": 0.34893375635147095, + "learning_rate": 0.00016928995697890537, + "loss": 1.0607, + "step": 11830 + }, + { + "epoch": 0.15373844906869727, + "grad_norm": 0.44681602716445923, + "learning_rate": 0.000169287357516994, + "loss": 1.2742, + "step": 11831 + }, + { + "epoch": 0.15375144361261314, + "grad_norm": 0.3585251271724701, + "learning_rate": 0.00016928475805508261, + "loss": 1.3792, + "step": 11832 + }, + { + "epoch": 0.15376443815652902, + "grad_norm": 0.2582624554634094, + "learning_rate": 0.0001692821585931712, + "loss": 1.4141, + "step": 11833 + }, + { + "epoch": 0.1537774327004449, + "grad_norm": 0.4682348966598511, + "learning_rate": 0.00016927955913125984, + "loss": 1.4077, + "step": 11834 + }, + { + "epoch": 0.15379042724436076, + "grad_norm": 0.27334079146385193, + "learning_rate": 0.00016927695966934843, + "loss": 1.2248, + "step": 11835 + }, + { + "epoch": 0.15380342178827663, + "grad_norm": 0.4253043830394745, + "learning_rate": 0.00016927436020743709, + "loss": 1.4339, + "step": 11836 + }, + { + "epoch": 0.1538164163321925, + "grad_norm": 0.4004599153995514, + "learning_rate": 0.00016927176074552568, + "loss": 1.2909, + "step": 11837 + }, + { + "epoch": 0.1538294108761084, + "grad_norm": 0.4284515380859375, + "learning_rate": 0.0001692691612836143, + "loss": 1.5672, + "step": 11838 + }, + { + "epoch": 0.15384240542002428, + "grad_norm": 0.3077544867992401, + "learning_rate": 0.0001692665618217029, + "loss": 1.2494, + "step": 11839 + }, + { + "epoch": 0.15385539996394015, + "grad_norm": 0.39761441946029663, + "learning_rate": 0.00016926396235979153, + "loss": 1.4684, + "step": 11840 + }, + { + "epoch": 0.15386839450785603, + "grad_norm": 0.4121223986148834, + "learning_rate": 0.00016926136289788015, + "loss": 1.4011, + "step": 11841 + }, + { + "epoch": 0.1538813890517719, + "grad_norm": 0.4146476089954376, + "learning_rate": 0.00016925876343596875, + "loss": 1.505, + "step": 11842 + }, + { + "epoch": 0.15389438359568777, + "grad_norm": 0.38500046730041504, + "learning_rate": 0.00016925616397405738, + "loss": 1.5509, + "step": 11843 + }, + { + "epoch": 0.15390737813960365, + "grad_norm": 0.4074002802371979, + "learning_rate": 0.000169253564512146, + "loss": 1.2974, + "step": 11844 + }, + { + "epoch": 0.15392037268351952, + "grad_norm": 0.3999421298503876, + "learning_rate": 0.00016925096505023462, + "loss": 1.5522, + "step": 11845 + }, + { + "epoch": 0.1539333672274354, + "grad_norm": 0.34143367409706116, + "learning_rate": 0.00016924836558832322, + "loss": 1.6049, + "step": 11846 + }, + { + "epoch": 0.15394636177135126, + "grad_norm": 0.35129424929618835, + "learning_rate": 0.00016924576612641182, + "loss": 1.1661, + "step": 11847 + }, + { + "epoch": 0.15395935631526714, + "grad_norm": 0.3280280828475952, + "learning_rate": 0.00016924316666450047, + "loss": 1.3978, + "step": 11848 + }, + { + "epoch": 0.153972350859183, + "grad_norm": 0.3519212603569031, + "learning_rate": 0.00016924056720258907, + "loss": 1.3831, + "step": 11849 + }, + { + "epoch": 0.15398534540309888, + "grad_norm": 0.38983094692230225, + "learning_rate": 0.0001692379677406777, + "loss": 1.4393, + "step": 11850 + }, + { + "epoch": 0.15399833994701476, + "grad_norm": 0.42494019865989685, + "learning_rate": 0.0001692353682787663, + "loss": 1.4972, + "step": 11851 + }, + { + "epoch": 0.15401133449093063, + "grad_norm": 0.4433857798576355, + "learning_rate": 0.00016923276881685491, + "loss": 1.5799, + "step": 11852 + }, + { + "epoch": 0.1540243290348465, + "grad_norm": 0.40632128715515137, + "learning_rate": 0.00016923016935494354, + "loss": 1.551, + "step": 11853 + }, + { + "epoch": 0.15403732357876238, + "grad_norm": 0.4582209885120392, + "learning_rate": 0.00016922756989303214, + "loss": 1.3856, + "step": 11854 + }, + { + "epoch": 0.15405031812267825, + "grad_norm": 0.4091321527957916, + "learning_rate": 0.00016922497043112076, + "loss": 1.4386, + "step": 11855 + }, + { + "epoch": 0.15406331266659412, + "grad_norm": 0.3921829164028168, + "learning_rate": 0.00016922237096920939, + "loss": 1.4022, + "step": 11856 + }, + { + "epoch": 0.15407630721051, + "grad_norm": 0.44247809052467346, + "learning_rate": 0.000169219771507298, + "loss": 1.4853, + "step": 11857 + }, + { + "epoch": 0.15408930175442587, + "grad_norm": 0.41530054807662964, + "learning_rate": 0.0001692171720453866, + "loss": 1.4713, + "step": 11858 + }, + { + "epoch": 0.15410229629834174, + "grad_norm": 0.4238353371620178, + "learning_rate": 0.00016921457258347523, + "loss": 1.5081, + "step": 11859 + }, + { + "epoch": 0.1541152908422576, + "grad_norm": 0.44257503747940063, + "learning_rate": 0.00016921197312156386, + "loss": 1.3601, + "step": 11860 + }, + { + "epoch": 0.15412828538617349, + "grad_norm": 0.28186023235321045, + "learning_rate": 0.00016920937365965245, + "loss": 1.3482, + "step": 11861 + }, + { + "epoch": 0.15414127993008936, + "grad_norm": 0.36280325055122375, + "learning_rate": 0.00016920677419774108, + "loss": 1.4553, + "step": 11862 + }, + { + "epoch": 0.15415427447400523, + "grad_norm": 0.3822360038757324, + "learning_rate": 0.0001692041747358297, + "loss": 1.4787, + "step": 11863 + }, + { + "epoch": 0.1541672690179211, + "grad_norm": 0.3648161292076111, + "learning_rate": 0.0001692015752739183, + "loss": 1.4321, + "step": 11864 + }, + { + "epoch": 0.15418026356183698, + "grad_norm": 0.3299514651298523, + "learning_rate": 0.00016919897581200692, + "loss": 1.107, + "step": 11865 + }, + { + "epoch": 0.15419325810575285, + "grad_norm": 0.401964396238327, + "learning_rate": 0.00016919637635009552, + "loss": 1.4733, + "step": 11866 + }, + { + "epoch": 0.15420625264966872, + "grad_norm": 0.36504676938056946, + "learning_rate": 0.00016919377688818417, + "loss": 1.4544, + "step": 11867 + }, + { + "epoch": 0.1542192471935846, + "grad_norm": 0.44176867604255676, + "learning_rate": 0.00016919117742627277, + "loss": 1.4806, + "step": 11868 + }, + { + "epoch": 0.15423224173750047, + "grad_norm": 0.33480894565582275, + "learning_rate": 0.0001691885779643614, + "loss": 1.2861, + "step": 11869 + }, + { + "epoch": 0.15424523628141634, + "grad_norm": 0.4212414622306824, + "learning_rate": 0.00016918597850245, + "loss": 1.2678, + "step": 11870 + }, + { + "epoch": 0.15425823082533222, + "grad_norm": 0.31657376885414124, + "learning_rate": 0.00016918337904053862, + "loss": 1.6445, + "step": 11871 + }, + { + "epoch": 0.1542712253692481, + "grad_norm": 0.33360037207603455, + "learning_rate": 0.00016918077957862724, + "loss": 1.3473, + "step": 11872 + }, + { + "epoch": 0.15428421991316396, + "grad_norm": 0.3970799148082733, + "learning_rate": 0.00016917818011671584, + "loss": 1.344, + "step": 11873 + }, + { + "epoch": 0.15429721445707983, + "grad_norm": 0.42508774995803833, + "learning_rate": 0.00016917558065480446, + "loss": 1.3699, + "step": 11874 + }, + { + "epoch": 0.1543102090009957, + "grad_norm": 0.43957820534706116, + "learning_rate": 0.0001691729811928931, + "loss": 1.4833, + "step": 11875 + }, + { + "epoch": 0.15432320354491158, + "grad_norm": 0.34737902879714966, + "learning_rate": 0.00016917038173098169, + "loss": 1.4269, + "step": 11876 + }, + { + "epoch": 0.15433619808882745, + "grad_norm": 0.484232634305954, + "learning_rate": 0.0001691677822690703, + "loss": 1.2752, + "step": 11877 + }, + { + "epoch": 0.15434919263274333, + "grad_norm": 0.44650790095329285, + "learning_rate": 0.0001691651828071589, + "loss": 1.5006, + "step": 11878 + }, + { + "epoch": 0.1543621871766592, + "grad_norm": 0.40729624032974243, + "learning_rate": 0.00016916258334524756, + "loss": 1.3514, + "step": 11879 + }, + { + "epoch": 0.15437518172057507, + "grad_norm": 0.3570656180381775, + "learning_rate": 0.00016915998388333616, + "loss": 1.3146, + "step": 11880 + }, + { + "epoch": 0.15438817626449094, + "grad_norm": 0.3242524266242981, + "learning_rate": 0.00016915738442142478, + "loss": 1.2045, + "step": 11881 + }, + { + "epoch": 0.15440117080840682, + "grad_norm": 0.2881937623023987, + "learning_rate": 0.00016915478495951338, + "loss": 1.6563, + "step": 11882 + }, + { + "epoch": 0.1544141653523227, + "grad_norm": 0.39352545142173767, + "learning_rate": 0.000169152185497602, + "loss": 1.5157, + "step": 11883 + }, + { + "epoch": 0.15442715989623856, + "grad_norm": 0.4529942274093628, + "learning_rate": 0.00016914958603569063, + "loss": 1.4842, + "step": 11884 + }, + { + "epoch": 0.15444015444015444, + "grad_norm": 0.43497994542121887, + "learning_rate": 0.00016914698657377922, + "loss": 1.4339, + "step": 11885 + }, + { + "epoch": 0.1544531489840703, + "grad_norm": 0.3621883690357208, + "learning_rate": 0.00016914438711186785, + "loss": 1.3941, + "step": 11886 + }, + { + "epoch": 0.15446614352798618, + "grad_norm": 0.4022464156150818, + "learning_rate": 0.00016914178764995647, + "loss": 1.3937, + "step": 11887 + }, + { + "epoch": 0.15447913807190206, + "grad_norm": 0.4090622663497925, + "learning_rate": 0.00016913918818804507, + "loss": 1.4717, + "step": 11888 + }, + { + "epoch": 0.15449213261581793, + "grad_norm": 0.3842346966266632, + "learning_rate": 0.0001691365887261337, + "loss": 1.3914, + "step": 11889 + }, + { + "epoch": 0.1545051271597338, + "grad_norm": 0.39419880509376526, + "learning_rate": 0.0001691339892642223, + "loss": 1.2498, + "step": 11890 + }, + { + "epoch": 0.15451812170364967, + "grad_norm": 0.3775467276573181, + "learning_rate": 0.00016913138980231094, + "loss": 1.3555, + "step": 11891 + }, + { + "epoch": 0.15453111624756555, + "grad_norm": 0.3680625259876251, + "learning_rate": 0.00016912879034039954, + "loss": 1.3571, + "step": 11892 + }, + { + "epoch": 0.15454411079148142, + "grad_norm": 0.42519471049308777, + "learning_rate": 0.00016912619087848817, + "loss": 1.3928, + "step": 11893 + }, + { + "epoch": 0.1545571053353973, + "grad_norm": 0.41862019896507263, + "learning_rate": 0.0001691235914165768, + "loss": 1.4091, + "step": 11894 + }, + { + "epoch": 0.15457009987931317, + "grad_norm": 0.343043714761734, + "learning_rate": 0.0001691209919546654, + "loss": 1.5125, + "step": 11895 + }, + { + "epoch": 0.15458309442322904, + "grad_norm": 0.4209970533847809, + "learning_rate": 0.000169118392492754, + "loss": 1.4047, + "step": 11896 + }, + { + "epoch": 0.1545960889671449, + "grad_norm": 0.39638662338256836, + "learning_rate": 0.0001691157930308426, + "loss": 1.6015, + "step": 11897 + }, + { + "epoch": 0.15460908351106079, + "grad_norm": 0.412302166223526, + "learning_rate": 0.00016911319356893126, + "loss": 1.5197, + "step": 11898 + }, + { + "epoch": 0.15462207805497666, + "grad_norm": 0.3367950916290283, + "learning_rate": 0.00016911059410701986, + "loss": 1.4427, + "step": 11899 + }, + { + "epoch": 0.15463507259889253, + "grad_norm": 0.33590689301490784, + "learning_rate": 0.00016910799464510848, + "loss": 1.2768, + "step": 11900 + }, + { + "epoch": 0.1546480671428084, + "grad_norm": 0.312641978263855, + "learning_rate": 0.00016910539518319708, + "loss": 1.3458, + "step": 11901 + }, + { + "epoch": 0.15466106168672428, + "grad_norm": 0.39236322045326233, + "learning_rate": 0.0001691027957212857, + "loss": 1.5649, + "step": 11902 + }, + { + "epoch": 0.15467405623064015, + "grad_norm": 0.29848867654800415, + "learning_rate": 0.00016910019625937433, + "loss": 1.3152, + "step": 11903 + }, + { + "epoch": 0.15468705077455602, + "grad_norm": 0.4709525406360626, + "learning_rate": 0.00016909759679746293, + "loss": 1.6501, + "step": 11904 + }, + { + "epoch": 0.1547000453184719, + "grad_norm": 0.27534300088882446, + "learning_rate": 0.00016909499733555155, + "loss": 1.3424, + "step": 11905 + }, + { + "epoch": 0.15471303986238777, + "grad_norm": 0.39742717146873474, + "learning_rate": 0.00016909239787364018, + "loss": 1.4036, + "step": 11906 + }, + { + "epoch": 0.15472603440630364, + "grad_norm": 0.36763429641723633, + "learning_rate": 0.00016908979841172877, + "loss": 1.5504, + "step": 11907 + }, + { + "epoch": 0.15473902895021951, + "grad_norm": 0.489873468875885, + "learning_rate": 0.0001690871989498174, + "loss": 1.5053, + "step": 11908 + }, + { + "epoch": 0.1547520234941354, + "grad_norm": 0.3825457990169525, + "learning_rate": 0.000169084599487906, + "loss": 1.4927, + "step": 11909 + }, + { + "epoch": 0.15476501803805126, + "grad_norm": 0.384034126996994, + "learning_rate": 0.00016908200002599465, + "loss": 1.1581, + "step": 11910 + }, + { + "epoch": 0.15477801258196713, + "grad_norm": 0.44588908553123474, + "learning_rate": 0.00016907940056408324, + "loss": 1.5723, + "step": 11911 + }, + { + "epoch": 0.154791007125883, + "grad_norm": 0.3585994839668274, + "learning_rate": 0.00016907680110217187, + "loss": 1.5926, + "step": 11912 + }, + { + "epoch": 0.15480400166979888, + "grad_norm": 0.4727434515953064, + "learning_rate": 0.00016907420164026047, + "loss": 1.643, + "step": 11913 + }, + { + "epoch": 0.15481699621371478, + "grad_norm": 0.321334570646286, + "learning_rate": 0.0001690716021783491, + "loss": 1.315, + "step": 11914 + }, + { + "epoch": 0.15482999075763065, + "grad_norm": 0.45254164934158325, + "learning_rate": 0.00016906900271643771, + "loss": 1.458, + "step": 11915 + }, + { + "epoch": 0.15484298530154653, + "grad_norm": 0.28681501746177673, + "learning_rate": 0.0001690664032545263, + "loss": 1.5302, + "step": 11916 + }, + { + "epoch": 0.1548559798454624, + "grad_norm": 0.28579723834991455, + "learning_rate": 0.00016906380379261494, + "loss": 1.2712, + "step": 11917 + }, + { + "epoch": 0.15486897438937827, + "grad_norm": 0.35989296436309814, + "learning_rate": 0.00016906120433070356, + "loss": 1.5095, + "step": 11918 + }, + { + "epoch": 0.15488196893329415, + "grad_norm": 0.3587045967578888, + "learning_rate": 0.00016905860486879216, + "loss": 1.1496, + "step": 11919 + }, + { + "epoch": 0.15489496347721002, + "grad_norm": 0.4513356685638428, + "learning_rate": 0.00016905600540688078, + "loss": 1.5327, + "step": 11920 + }, + { + "epoch": 0.1549079580211259, + "grad_norm": 0.36633196473121643, + "learning_rate": 0.00016905340594496938, + "loss": 1.3146, + "step": 11921 + }, + { + "epoch": 0.15492095256504176, + "grad_norm": 0.37696900963783264, + "learning_rate": 0.00016905080648305803, + "loss": 1.5891, + "step": 11922 + }, + { + "epoch": 0.15493394710895764, + "grad_norm": 0.41157111525535583, + "learning_rate": 0.00016904820702114663, + "loss": 1.3362, + "step": 11923 + }, + { + "epoch": 0.1549469416528735, + "grad_norm": 0.4482050836086273, + "learning_rate": 0.00016904560755923525, + "loss": 1.3105, + "step": 11924 + }, + { + "epoch": 0.15495993619678938, + "grad_norm": 0.42405182123184204, + "learning_rate": 0.00016904300809732385, + "loss": 1.6842, + "step": 11925 + }, + { + "epoch": 0.15497293074070526, + "grad_norm": 0.4213549494743347, + "learning_rate": 0.00016904040863541248, + "loss": 1.4985, + "step": 11926 + }, + { + "epoch": 0.15498592528462113, + "grad_norm": 0.38105490803718567, + "learning_rate": 0.0001690378091735011, + "loss": 1.504, + "step": 11927 + }, + { + "epoch": 0.154998919828537, + "grad_norm": 0.32744768261909485, + "learning_rate": 0.0001690352097115897, + "loss": 1.5141, + "step": 11928 + }, + { + "epoch": 0.15501191437245287, + "grad_norm": 0.38247156143188477, + "learning_rate": 0.00016903261024967832, + "loss": 1.4567, + "step": 11929 + }, + { + "epoch": 0.15502490891636875, + "grad_norm": 0.297108918428421, + "learning_rate": 0.00016903001078776695, + "loss": 1.1905, + "step": 11930 + }, + { + "epoch": 0.15503790346028462, + "grad_norm": 0.39979055523872375, + "learning_rate": 0.00016902741132585554, + "loss": 1.3984, + "step": 11931 + }, + { + "epoch": 0.1550508980042005, + "grad_norm": 0.3583003580570221, + "learning_rate": 0.00016902481186394417, + "loss": 1.6012, + "step": 11932 + }, + { + "epoch": 0.15506389254811637, + "grad_norm": 0.4437173008918762, + "learning_rate": 0.0001690222124020328, + "loss": 1.2219, + "step": 11933 + }, + { + "epoch": 0.15507688709203224, + "grad_norm": 0.3262316882610321, + "learning_rate": 0.00016901961294012142, + "loss": 1.3229, + "step": 11934 + }, + { + "epoch": 0.1550898816359481, + "grad_norm": 0.3768066167831421, + "learning_rate": 0.00016901701347821001, + "loss": 1.3877, + "step": 11935 + }, + { + "epoch": 0.15510287617986399, + "grad_norm": 0.4145079255104065, + "learning_rate": 0.00016901441401629864, + "loss": 1.332, + "step": 11936 + }, + { + "epoch": 0.15511587072377986, + "grad_norm": 0.4187820851802826, + "learning_rate": 0.00016901181455438726, + "loss": 1.4219, + "step": 11937 + }, + { + "epoch": 0.15512886526769573, + "grad_norm": 0.3847416043281555, + "learning_rate": 0.00016900921509247586, + "loss": 1.2818, + "step": 11938 + }, + { + "epoch": 0.1551418598116116, + "grad_norm": 0.41393569111824036, + "learning_rate": 0.00016900661563056449, + "loss": 1.4979, + "step": 11939 + }, + { + "epoch": 0.15515485435552748, + "grad_norm": 0.4220622181892395, + "learning_rate": 0.00016900401616865308, + "loss": 1.4021, + "step": 11940 + }, + { + "epoch": 0.15516784889944335, + "grad_norm": 0.4015654921531677, + "learning_rate": 0.00016900141670674173, + "loss": 1.6743, + "step": 11941 + }, + { + "epoch": 0.15518084344335922, + "grad_norm": 0.40798476338386536, + "learning_rate": 0.00016899881724483033, + "loss": 1.5833, + "step": 11942 + }, + { + "epoch": 0.1551938379872751, + "grad_norm": 0.3616056740283966, + "learning_rate": 0.00016899621778291893, + "loss": 1.1696, + "step": 11943 + }, + { + "epoch": 0.15520683253119097, + "grad_norm": 0.39458411931991577, + "learning_rate": 0.00016899361832100755, + "loss": 1.4008, + "step": 11944 + }, + { + "epoch": 0.15521982707510684, + "grad_norm": 0.51478111743927, + "learning_rate": 0.00016899101885909618, + "loss": 1.3113, + "step": 11945 + }, + { + "epoch": 0.15523282161902272, + "grad_norm": 0.45048725605010986, + "learning_rate": 0.0001689884193971848, + "loss": 1.4865, + "step": 11946 + }, + { + "epoch": 0.1552458161629386, + "grad_norm": 0.44991689920425415, + "learning_rate": 0.0001689858199352734, + "loss": 1.4505, + "step": 11947 + }, + { + "epoch": 0.15525881070685446, + "grad_norm": 0.34293219447135925, + "learning_rate": 0.00016898322047336202, + "loss": 1.2344, + "step": 11948 + }, + { + "epoch": 0.15527180525077033, + "grad_norm": 0.32282909750938416, + "learning_rate": 0.00016898062101145065, + "loss": 1.4802, + "step": 11949 + }, + { + "epoch": 0.1552847997946862, + "grad_norm": 0.3538459241390228, + "learning_rate": 0.00016897802154953925, + "loss": 1.2857, + "step": 11950 + }, + { + "epoch": 0.15529779433860208, + "grad_norm": 0.5327444076538086, + "learning_rate": 0.00016897542208762787, + "loss": 1.5016, + "step": 11951 + }, + { + "epoch": 0.15531078888251795, + "grad_norm": 0.48369458317756653, + "learning_rate": 0.00016897282262571647, + "loss": 1.5615, + "step": 11952 + }, + { + "epoch": 0.15532378342643383, + "grad_norm": 0.33527451753616333, + "learning_rate": 0.00016897022316380512, + "loss": 1.4315, + "step": 11953 + }, + { + "epoch": 0.1553367779703497, + "grad_norm": 0.503851592540741, + "learning_rate": 0.00016896762370189372, + "loss": 1.5008, + "step": 11954 + }, + { + "epoch": 0.15534977251426557, + "grad_norm": 0.4223160147666931, + "learning_rate": 0.00016896502423998231, + "loss": 1.5847, + "step": 11955 + }, + { + "epoch": 0.15536276705818144, + "grad_norm": 0.4810388684272766, + "learning_rate": 0.00016896242477807094, + "loss": 1.3925, + "step": 11956 + }, + { + "epoch": 0.15537576160209732, + "grad_norm": 0.36106744408607483, + "learning_rate": 0.00016895982531615956, + "loss": 1.1159, + "step": 11957 + }, + { + "epoch": 0.1553887561460132, + "grad_norm": 0.3481575548648834, + "learning_rate": 0.0001689572258542482, + "loss": 1.3377, + "step": 11958 + }, + { + "epoch": 0.15540175068992906, + "grad_norm": 0.41145047545433044, + "learning_rate": 0.00016895462639233679, + "loss": 1.4227, + "step": 11959 + }, + { + "epoch": 0.15541474523384494, + "grad_norm": 0.3502742350101471, + "learning_rate": 0.0001689520269304254, + "loss": 1.4869, + "step": 11960 + }, + { + "epoch": 0.1554277397777608, + "grad_norm": 0.4538623094558716, + "learning_rate": 0.00016894942746851403, + "loss": 1.3256, + "step": 11961 + }, + { + "epoch": 0.15544073432167668, + "grad_norm": 0.43536028265953064, + "learning_rate": 0.00016894682800660263, + "loss": 1.5321, + "step": 11962 + }, + { + "epoch": 0.15545372886559256, + "grad_norm": 0.41437843441963196, + "learning_rate": 0.00016894422854469126, + "loss": 1.5899, + "step": 11963 + }, + { + "epoch": 0.15546672340950843, + "grad_norm": 0.4876006245613098, + "learning_rate": 0.00016894162908277985, + "loss": 1.5311, + "step": 11964 + }, + { + "epoch": 0.1554797179534243, + "grad_norm": 0.45469412207603455, + "learning_rate": 0.0001689390296208685, + "loss": 1.6302, + "step": 11965 + }, + { + "epoch": 0.15549271249734017, + "grad_norm": 0.3620186746120453, + "learning_rate": 0.0001689364301589571, + "loss": 1.5112, + "step": 11966 + }, + { + "epoch": 0.15550570704125605, + "grad_norm": 0.44205158948898315, + "learning_rate": 0.00016893383069704573, + "loss": 1.2681, + "step": 11967 + }, + { + "epoch": 0.15551870158517192, + "grad_norm": 0.5003493428230286, + "learning_rate": 0.00016893123123513435, + "loss": 1.4616, + "step": 11968 + }, + { + "epoch": 0.1555316961290878, + "grad_norm": 0.3435812294483185, + "learning_rate": 0.00016892863177322295, + "loss": 1.3523, + "step": 11969 + }, + { + "epoch": 0.15554469067300367, + "grad_norm": 0.35033366084098816, + "learning_rate": 0.00016892603231131157, + "loss": 1.471, + "step": 11970 + }, + { + "epoch": 0.15555768521691954, + "grad_norm": 0.3777720332145691, + "learning_rate": 0.00016892343284940017, + "loss": 1.3449, + "step": 11971 + }, + { + "epoch": 0.1555706797608354, + "grad_norm": 0.44455766677856445, + "learning_rate": 0.0001689208333874888, + "loss": 1.7027, + "step": 11972 + }, + { + "epoch": 0.15558367430475128, + "grad_norm": 0.3895317316055298, + "learning_rate": 0.00016891823392557742, + "loss": 1.4081, + "step": 11973 + }, + { + "epoch": 0.15559666884866716, + "grad_norm": 0.3123410642147064, + "learning_rate": 0.00016891563446366602, + "loss": 1.3425, + "step": 11974 + }, + { + "epoch": 0.15560966339258303, + "grad_norm": 0.337747722864151, + "learning_rate": 0.00016891303500175464, + "loss": 1.4477, + "step": 11975 + }, + { + "epoch": 0.1556226579364989, + "grad_norm": 0.4136996269226074, + "learning_rate": 0.00016891043553984327, + "loss": 1.4475, + "step": 11976 + }, + { + "epoch": 0.15563565248041478, + "grad_norm": 0.39037227630615234, + "learning_rate": 0.0001689078360779319, + "loss": 1.3774, + "step": 11977 + }, + { + "epoch": 0.15564864702433065, + "grad_norm": 0.3910379111766815, + "learning_rate": 0.0001689052366160205, + "loss": 1.4258, + "step": 11978 + }, + { + "epoch": 0.15566164156824652, + "grad_norm": 0.40478605031967163, + "learning_rate": 0.0001689026371541091, + "loss": 1.4454, + "step": 11979 + }, + { + "epoch": 0.1556746361121624, + "grad_norm": 0.4217974841594696, + "learning_rate": 0.00016890003769219774, + "loss": 1.4353, + "step": 11980 + }, + { + "epoch": 0.15568763065607827, + "grad_norm": 0.28577250242233276, + "learning_rate": 0.00016889743823028633, + "loss": 1.502, + "step": 11981 + }, + { + "epoch": 0.15570062519999414, + "grad_norm": 0.5484791994094849, + "learning_rate": 0.00016889483876837496, + "loss": 1.6585, + "step": 11982 + }, + { + "epoch": 0.15571361974391001, + "grad_norm": 0.33722198009490967, + "learning_rate": 0.00016889223930646356, + "loss": 1.4272, + "step": 11983 + }, + { + "epoch": 0.1557266142878259, + "grad_norm": 0.47989392280578613, + "learning_rate": 0.00016888963984455218, + "loss": 1.4959, + "step": 11984 + }, + { + "epoch": 0.15573960883174176, + "grad_norm": 0.5010594725608826, + "learning_rate": 0.0001688870403826408, + "loss": 1.4862, + "step": 11985 + }, + { + "epoch": 0.15575260337565763, + "grad_norm": 0.44075724482536316, + "learning_rate": 0.0001688844409207294, + "loss": 1.4108, + "step": 11986 + }, + { + "epoch": 0.1557655979195735, + "grad_norm": 0.3535269498825073, + "learning_rate": 0.00016888184145881803, + "loss": 1.3329, + "step": 11987 + }, + { + "epoch": 0.15577859246348938, + "grad_norm": 0.31797629594802856, + "learning_rate": 0.00016887924199690665, + "loss": 1.4949, + "step": 11988 + }, + { + "epoch": 0.15579158700740525, + "grad_norm": 0.5116481184959412, + "learning_rate": 0.00016887664253499528, + "loss": 1.6497, + "step": 11989 + }, + { + "epoch": 0.15580458155132113, + "grad_norm": 0.35807478427886963, + "learning_rate": 0.00016887404307308387, + "loss": 1.2568, + "step": 11990 + }, + { + "epoch": 0.15581757609523703, + "grad_norm": 0.43894821405410767, + "learning_rate": 0.0001688714436111725, + "loss": 1.6529, + "step": 11991 + }, + { + "epoch": 0.1558305706391529, + "grad_norm": 0.3731173276901245, + "learning_rate": 0.00016886884414926112, + "loss": 1.4194, + "step": 11992 + }, + { + "epoch": 0.15584356518306877, + "grad_norm": 0.44934573769569397, + "learning_rate": 0.00016886624468734972, + "loss": 1.4915, + "step": 11993 + }, + { + "epoch": 0.15585655972698464, + "grad_norm": 0.3778323531150818, + "learning_rate": 0.00016886364522543834, + "loss": 1.4198, + "step": 11994 + }, + { + "epoch": 0.15586955427090052, + "grad_norm": 0.3148915767669678, + "learning_rate": 0.00016886104576352694, + "loss": 1.2636, + "step": 11995 + }, + { + "epoch": 0.1558825488148164, + "grad_norm": 0.4303494691848755, + "learning_rate": 0.0001688584463016156, + "loss": 1.4403, + "step": 11996 + }, + { + "epoch": 0.15589554335873226, + "grad_norm": 0.45718246698379517, + "learning_rate": 0.0001688558468397042, + "loss": 1.557, + "step": 11997 + }, + { + "epoch": 0.15590853790264814, + "grad_norm": 0.40785664319992065, + "learning_rate": 0.0001688532473777928, + "loss": 1.5745, + "step": 11998 + }, + { + "epoch": 0.155921532446564, + "grad_norm": 0.3330020606517792, + "learning_rate": 0.0001688506479158814, + "loss": 1.4788, + "step": 11999 + }, + { + "epoch": 0.15593452699047988, + "grad_norm": 0.4892558455467224, + "learning_rate": 0.00016884804845397004, + "loss": 1.6078, + "step": 12000 + }, + { + "epoch": 0.15594752153439576, + "grad_norm": 0.40915924310684204, + "learning_rate": 0.00016884544899205866, + "loss": 1.2677, + "step": 12001 + }, + { + "epoch": 0.15596051607831163, + "grad_norm": 0.3810061812400818, + "learning_rate": 0.00016884284953014726, + "loss": 1.5153, + "step": 12002 + }, + { + "epoch": 0.1559735106222275, + "grad_norm": 0.42884185910224915, + "learning_rate": 0.00016884025006823588, + "loss": 1.4682, + "step": 12003 + }, + { + "epoch": 0.15598650516614337, + "grad_norm": 0.4271387457847595, + "learning_rate": 0.0001688376506063245, + "loss": 1.457, + "step": 12004 + }, + { + "epoch": 0.15599949971005925, + "grad_norm": 0.45230433344841003, + "learning_rate": 0.0001688350511444131, + "loss": 1.3025, + "step": 12005 + }, + { + "epoch": 0.15601249425397512, + "grad_norm": 0.35235241055488586, + "learning_rate": 0.00016883245168250173, + "loss": 1.4811, + "step": 12006 + }, + { + "epoch": 0.156025488797891, + "grad_norm": 0.4068051874637604, + "learning_rate": 0.00016882985222059035, + "loss": 1.5421, + "step": 12007 + }, + { + "epoch": 0.15603848334180687, + "grad_norm": 0.35475414991378784, + "learning_rate": 0.00016882725275867898, + "loss": 1.3846, + "step": 12008 + }, + { + "epoch": 0.15605147788572274, + "grad_norm": 0.3560280203819275, + "learning_rate": 0.00016882465329676758, + "loss": 1.3331, + "step": 12009 + }, + { + "epoch": 0.1560644724296386, + "grad_norm": 0.3426785171031952, + "learning_rate": 0.00016882205383485617, + "loss": 1.3675, + "step": 12010 + }, + { + "epoch": 0.15607746697355449, + "grad_norm": 0.45811954140663147, + "learning_rate": 0.00016881945437294483, + "loss": 1.5147, + "step": 12011 + }, + { + "epoch": 0.15609046151747036, + "grad_norm": 0.31036385893821716, + "learning_rate": 0.00016881685491103342, + "loss": 1.4167, + "step": 12012 + }, + { + "epoch": 0.15610345606138623, + "grad_norm": 0.44240742921829224, + "learning_rate": 0.00016881425544912205, + "loss": 1.5844, + "step": 12013 + }, + { + "epoch": 0.1561164506053021, + "grad_norm": 0.45457398891448975, + "learning_rate": 0.00016881165598721064, + "loss": 1.4403, + "step": 12014 + }, + { + "epoch": 0.15612944514921798, + "grad_norm": 0.3350835144519806, + "learning_rate": 0.00016880905652529927, + "loss": 1.6916, + "step": 12015 + }, + { + "epoch": 0.15614243969313385, + "grad_norm": 0.37067195773124695, + "learning_rate": 0.0001688064570633879, + "loss": 1.2549, + "step": 12016 + }, + { + "epoch": 0.15615543423704972, + "grad_norm": 0.3972967565059662, + "learning_rate": 0.0001688038576014765, + "loss": 1.3549, + "step": 12017 + }, + { + "epoch": 0.1561684287809656, + "grad_norm": 0.35073786973953247, + "learning_rate": 0.00016880125813956512, + "loss": 1.3421, + "step": 12018 + }, + { + "epoch": 0.15618142332488147, + "grad_norm": 0.35622137784957886, + "learning_rate": 0.00016879865867765374, + "loss": 1.5753, + "step": 12019 + }, + { + "epoch": 0.15619441786879734, + "grad_norm": 0.4132542908191681, + "learning_rate": 0.00016879605921574236, + "loss": 1.5363, + "step": 12020 + }, + { + "epoch": 0.15620741241271321, + "grad_norm": 0.4252627491950989, + "learning_rate": 0.00016879345975383096, + "loss": 1.34, + "step": 12021 + }, + { + "epoch": 0.1562204069566291, + "grad_norm": 0.41447702050209045, + "learning_rate": 0.00016879086029191959, + "loss": 1.3011, + "step": 12022 + }, + { + "epoch": 0.15623340150054496, + "grad_norm": 0.44659537076950073, + "learning_rate": 0.0001687882608300082, + "loss": 1.5662, + "step": 12023 + }, + { + "epoch": 0.15624639604446083, + "grad_norm": 0.3066611886024475, + "learning_rate": 0.0001687856613680968, + "loss": 1.3217, + "step": 12024 + }, + { + "epoch": 0.1562593905883767, + "grad_norm": 0.46207112073898315, + "learning_rate": 0.00016878306190618543, + "loss": 1.5429, + "step": 12025 + }, + { + "epoch": 0.15627238513229258, + "grad_norm": 0.4428323805332184, + "learning_rate": 0.00016878046244427403, + "loss": 1.3047, + "step": 12026 + }, + { + "epoch": 0.15628537967620845, + "grad_norm": 0.36988434195518494, + "learning_rate": 0.00016877786298236265, + "loss": 1.2062, + "step": 12027 + }, + { + "epoch": 0.15629837422012433, + "grad_norm": 0.3586268424987793, + "learning_rate": 0.00016877526352045128, + "loss": 1.3487, + "step": 12028 + }, + { + "epoch": 0.1563113687640402, + "grad_norm": 0.26521074771881104, + "learning_rate": 0.00016877266405853988, + "loss": 1.3636, + "step": 12029 + }, + { + "epoch": 0.15632436330795607, + "grad_norm": 0.5100510120391846, + "learning_rate": 0.0001687700645966285, + "loss": 1.5946, + "step": 12030 + }, + { + "epoch": 0.15633735785187194, + "grad_norm": 0.4092614948749542, + "learning_rate": 0.00016876746513471713, + "loss": 1.2236, + "step": 12031 + }, + { + "epoch": 0.15635035239578782, + "grad_norm": 0.3568085730075836, + "learning_rate": 0.00016876486567280575, + "loss": 1.4141, + "step": 12032 + }, + { + "epoch": 0.1563633469397037, + "grad_norm": 0.36383116245269775, + "learning_rate": 0.00016876226621089435, + "loss": 1.7182, + "step": 12033 + }, + { + "epoch": 0.15637634148361956, + "grad_norm": 0.43455275893211365, + "learning_rate": 0.00016875966674898297, + "loss": 1.2831, + "step": 12034 + }, + { + "epoch": 0.15638933602753544, + "grad_norm": 0.38674166798591614, + "learning_rate": 0.0001687570672870716, + "loss": 1.5929, + "step": 12035 + }, + { + "epoch": 0.1564023305714513, + "grad_norm": 0.44080469012260437, + "learning_rate": 0.0001687544678251602, + "loss": 1.4638, + "step": 12036 + }, + { + "epoch": 0.15641532511536718, + "grad_norm": 0.4807296097278595, + "learning_rate": 0.00016875186836324882, + "loss": 1.5531, + "step": 12037 + }, + { + "epoch": 0.15642831965928305, + "grad_norm": 0.3581717014312744, + "learning_rate": 0.00016874926890133742, + "loss": 1.2694, + "step": 12038 + }, + { + "epoch": 0.15644131420319893, + "grad_norm": 0.3941170573234558, + "learning_rate": 0.00016874666943942604, + "loss": 1.3648, + "step": 12039 + }, + { + "epoch": 0.1564543087471148, + "grad_norm": 0.3562326431274414, + "learning_rate": 0.00016874406997751466, + "loss": 1.3535, + "step": 12040 + }, + { + "epoch": 0.15646730329103067, + "grad_norm": 0.3954648971557617, + "learning_rate": 0.00016874147051560326, + "loss": 1.5573, + "step": 12041 + }, + { + "epoch": 0.15648029783494655, + "grad_norm": 0.26697438955307007, + "learning_rate": 0.0001687388710536919, + "loss": 1.434, + "step": 12042 + }, + { + "epoch": 0.15649329237886242, + "grad_norm": 0.42756736278533936, + "learning_rate": 0.0001687362715917805, + "loss": 1.3733, + "step": 12043 + }, + { + "epoch": 0.1565062869227783, + "grad_norm": 0.4607290029525757, + "learning_rate": 0.00016873367212986913, + "loss": 1.5074, + "step": 12044 + }, + { + "epoch": 0.15651928146669417, + "grad_norm": 0.3384189009666443, + "learning_rate": 0.00016873107266795773, + "loss": 1.3675, + "step": 12045 + }, + { + "epoch": 0.15653227601061004, + "grad_norm": 0.21360808610916138, + "learning_rate": 0.00016872847320604636, + "loss": 1.1779, + "step": 12046 + }, + { + "epoch": 0.1565452705545259, + "grad_norm": 0.37245047092437744, + "learning_rate": 0.00016872587374413498, + "loss": 1.3669, + "step": 12047 + }, + { + "epoch": 0.15655826509844178, + "grad_norm": 0.5019447803497314, + "learning_rate": 0.00016872327428222358, + "loss": 1.668, + "step": 12048 + }, + { + "epoch": 0.15657125964235766, + "grad_norm": 0.4975603222846985, + "learning_rate": 0.0001687206748203122, + "loss": 1.6643, + "step": 12049 + }, + { + "epoch": 0.15658425418627353, + "grad_norm": 0.41646575927734375, + "learning_rate": 0.00016871807535840083, + "loss": 1.4798, + "step": 12050 + }, + { + "epoch": 0.1565972487301894, + "grad_norm": 0.3129939138889313, + "learning_rate": 0.00016871547589648945, + "loss": 1.2571, + "step": 12051 + }, + { + "epoch": 0.15661024327410528, + "grad_norm": 0.46845167875289917, + "learning_rate": 0.00016871287643457805, + "loss": 1.4047, + "step": 12052 + }, + { + "epoch": 0.15662323781802115, + "grad_norm": 0.34697386622428894, + "learning_rate": 0.00016871027697266665, + "loss": 1.5065, + "step": 12053 + }, + { + "epoch": 0.15663623236193702, + "grad_norm": 0.3365623652935028, + "learning_rate": 0.0001687076775107553, + "loss": 1.3844, + "step": 12054 + }, + { + "epoch": 0.1566492269058529, + "grad_norm": 0.33361899852752686, + "learning_rate": 0.0001687050780488439, + "loss": 1.3084, + "step": 12055 + }, + { + "epoch": 0.15666222144976877, + "grad_norm": 0.42262473702430725, + "learning_rate": 0.00016870247858693252, + "loss": 1.3859, + "step": 12056 + }, + { + "epoch": 0.15667521599368464, + "grad_norm": 0.44393181800842285, + "learning_rate": 0.00016869987912502112, + "loss": 1.5549, + "step": 12057 + }, + { + "epoch": 0.15668821053760051, + "grad_norm": 0.36272329092025757, + "learning_rate": 0.00016869727966310974, + "loss": 1.4526, + "step": 12058 + }, + { + "epoch": 0.1567012050815164, + "grad_norm": 0.34070709347724915, + "learning_rate": 0.00016869468020119837, + "loss": 1.1817, + "step": 12059 + }, + { + "epoch": 0.15671419962543226, + "grad_norm": 0.3324626386165619, + "learning_rate": 0.00016869208073928696, + "loss": 1.4151, + "step": 12060 + }, + { + "epoch": 0.15672719416934813, + "grad_norm": 0.43929848074913025, + "learning_rate": 0.0001686894812773756, + "loss": 1.405, + "step": 12061 + }, + { + "epoch": 0.156740188713264, + "grad_norm": 0.37687528133392334, + "learning_rate": 0.0001686868818154642, + "loss": 1.5245, + "step": 12062 + }, + { + "epoch": 0.15675318325717988, + "grad_norm": 0.34735623002052307, + "learning_rate": 0.00016868428235355284, + "loss": 1.433, + "step": 12063 + }, + { + "epoch": 0.15676617780109575, + "grad_norm": 0.28883078694343567, + "learning_rate": 0.00016868168289164143, + "loss": 1.2274, + "step": 12064 + }, + { + "epoch": 0.15677917234501162, + "grad_norm": 0.48190370202064514, + "learning_rate": 0.00016867908342973003, + "loss": 1.3238, + "step": 12065 + }, + { + "epoch": 0.1567921668889275, + "grad_norm": 0.4140412211418152, + "learning_rate": 0.00016867648396781868, + "loss": 1.3874, + "step": 12066 + }, + { + "epoch": 0.1568051614328434, + "grad_norm": 0.39700308442115784, + "learning_rate": 0.00016867388450590728, + "loss": 1.3749, + "step": 12067 + }, + { + "epoch": 0.15681815597675927, + "grad_norm": 0.37823575735092163, + "learning_rate": 0.0001686712850439959, + "loss": 1.5489, + "step": 12068 + }, + { + "epoch": 0.15683115052067514, + "grad_norm": 0.33525389432907104, + "learning_rate": 0.0001686686855820845, + "loss": 1.4614, + "step": 12069 + }, + { + "epoch": 0.15684414506459102, + "grad_norm": 0.3624768555164337, + "learning_rate": 0.00016866608612017313, + "loss": 1.2793, + "step": 12070 + }, + { + "epoch": 0.1568571396085069, + "grad_norm": 0.42806684970855713, + "learning_rate": 0.00016866348665826175, + "loss": 1.7052, + "step": 12071 + }, + { + "epoch": 0.15687013415242276, + "grad_norm": 0.3201879560947418, + "learning_rate": 0.00016866088719635035, + "loss": 1.2095, + "step": 12072 + }, + { + "epoch": 0.15688312869633864, + "grad_norm": 0.3719903528690338, + "learning_rate": 0.00016865828773443897, + "loss": 1.6028, + "step": 12073 + }, + { + "epoch": 0.1568961232402545, + "grad_norm": 0.38908639550209045, + "learning_rate": 0.0001686556882725276, + "loss": 1.4535, + "step": 12074 + }, + { + "epoch": 0.15690911778417038, + "grad_norm": 0.41018176078796387, + "learning_rate": 0.00016865308881061622, + "loss": 1.2228, + "step": 12075 + }, + { + "epoch": 0.15692211232808626, + "grad_norm": 0.4546106457710266, + "learning_rate": 0.00016865048934870482, + "loss": 1.5348, + "step": 12076 + }, + { + "epoch": 0.15693510687200213, + "grad_norm": 0.41519367694854736, + "learning_rate": 0.00016864788988679342, + "loss": 1.3985, + "step": 12077 + }, + { + "epoch": 0.156948101415918, + "grad_norm": 0.43420615792274475, + "learning_rate": 0.00016864529042488207, + "loss": 1.4531, + "step": 12078 + }, + { + "epoch": 0.15696109595983387, + "grad_norm": 0.2989868223667145, + "learning_rate": 0.00016864269096297067, + "loss": 1.4021, + "step": 12079 + }, + { + "epoch": 0.15697409050374975, + "grad_norm": 0.421214759349823, + "learning_rate": 0.0001686400915010593, + "loss": 1.5715, + "step": 12080 + }, + { + "epoch": 0.15698708504766562, + "grad_norm": 0.3859822750091553, + "learning_rate": 0.00016863749203914792, + "loss": 1.4295, + "step": 12081 + }, + { + "epoch": 0.1570000795915815, + "grad_norm": 0.38460224866867065, + "learning_rate": 0.0001686348925772365, + "loss": 1.3309, + "step": 12082 + }, + { + "epoch": 0.15701307413549737, + "grad_norm": 0.37776118516921997, + "learning_rate": 0.00016863229311532514, + "loss": 1.524, + "step": 12083 + }, + { + "epoch": 0.15702606867941324, + "grad_norm": 0.35567814111709595, + "learning_rate": 0.00016862969365341373, + "loss": 1.5778, + "step": 12084 + }, + { + "epoch": 0.1570390632233291, + "grad_norm": 0.4244820773601532, + "learning_rate": 0.00016862709419150239, + "loss": 1.6133, + "step": 12085 + }, + { + "epoch": 0.15705205776724498, + "grad_norm": 0.42508867383003235, + "learning_rate": 0.00016862449472959098, + "loss": 1.4931, + "step": 12086 + }, + { + "epoch": 0.15706505231116086, + "grad_norm": 0.4721623659133911, + "learning_rate": 0.0001686218952676796, + "loss": 1.5827, + "step": 12087 + }, + { + "epoch": 0.15707804685507673, + "grad_norm": 0.36248722672462463, + "learning_rate": 0.0001686192958057682, + "loss": 1.3802, + "step": 12088 + }, + { + "epoch": 0.1570910413989926, + "grad_norm": 0.39904311299324036, + "learning_rate": 0.00016861669634385683, + "loss": 1.3497, + "step": 12089 + }, + { + "epoch": 0.15710403594290848, + "grad_norm": 0.4013567566871643, + "learning_rate": 0.00016861409688194545, + "loss": 1.5756, + "step": 12090 + }, + { + "epoch": 0.15711703048682435, + "grad_norm": 0.3746693730354309, + "learning_rate": 0.00016861149742003405, + "loss": 1.5049, + "step": 12091 + }, + { + "epoch": 0.15713002503074022, + "grad_norm": 0.3480197489261627, + "learning_rate": 0.00016860889795812268, + "loss": 1.3034, + "step": 12092 + }, + { + "epoch": 0.1571430195746561, + "grad_norm": 0.4265461564064026, + "learning_rate": 0.0001686062984962113, + "loss": 1.5288, + "step": 12093 + }, + { + "epoch": 0.15715601411857197, + "grad_norm": 0.4015458822250366, + "learning_rate": 0.0001686036990342999, + "loss": 1.3763, + "step": 12094 + }, + { + "epoch": 0.15716900866248784, + "grad_norm": 0.3355420231819153, + "learning_rate": 0.00016860109957238852, + "loss": 1.4359, + "step": 12095 + }, + { + "epoch": 0.15718200320640371, + "grad_norm": 0.5164967179298401, + "learning_rate": 0.00016859850011047712, + "loss": 1.5925, + "step": 12096 + }, + { + "epoch": 0.1571949977503196, + "grad_norm": 0.39553114771842957, + "learning_rate": 0.00016859590064856577, + "loss": 1.5647, + "step": 12097 + }, + { + "epoch": 0.15720799229423546, + "grad_norm": 0.4172547459602356, + "learning_rate": 0.00016859330118665437, + "loss": 1.4438, + "step": 12098 + }, + { + "epoch": 0.15722098683815133, + "grad_norm": 0.4288550615310669, + "learning_rate": 0.000168590701724743, + "loss": 1.4382, + "step": 12099 + }, + { + "epoch": 0.1572339813820672, + "grad_norm": 0.39426884055137634, + "learning_rate": 0.0001685881022628316, + "loss": 1.4978, + "step": 12100 + }, + { + "epoch": 0.15724697592598308, + "grad_norm": 0.4198729991912842, + "learning_rate": 0.00016858550280092022, + "loss": 1.5107, + "step": 12101 + }, + { + "epoch": 0.15725997046989895, + "grad_norm": 0.37695348262786865, + "learning_rate": 0.00016858290333900884, + "loss": 1.3203, + "step": 12102 + }, + { + "epoch": 0.15727296501381482, + "grad_norm": 0.4280935823917389, + "learning_rate": 0.00016858030387709744, + "loss": 1.2111, + "step": 12103 + }, + { + "epoch": 0.1572859595577307, + "grad_norm": 0.4634476900100708, + "learning_rate": 0.00016857770441518606, + "loss": 1.5782, + "step": 12104 + }, + { + "epoch": 0.15729895410164657, + "grad_norm": 0.4004434049129486, + "learning_rate": 0.00016857510495327469, + "loss": 1.3733, + "step": 12105 + }, + { + "epoch": 0.15731194864556244, + "grad_norm": 0.43846994638442993, + "learning_rate": 0.0001685725054913633, + "loss": 1.6742, + "step": 12106 + }, + { + "epoch": 0.15732494318947832, + "grad_norm": 0.4106936752796173, + "learning_rate": 0.0001685699060294519, + "loss": 1.3449, + "step": 12107 + }, + { + "epoch": 0.1573379377333942, + "grad_norm": 0.40755710005760193, + "learning_rate": 0.0001685673065675405, + "loss": 1.4562, + "step": 12108 + }, + { + "epoch": 0.15735093227731006, + "grad_norm": 0.37867283821105957, + "learning_rate": 0.00016856470710562916, + "loss": 1.3525, + "step": 12109 + }, + { + "epoch": 0.15736392682122594, + "grad_norm": 0.38126394152641296, + "learning_rate": 0.00016856210764371775, + "loss": 1.253, + "step": 12110 + }, + { + "epoch": 0.1573769213651418, + "grad_norm": 0.4599338471889496, + "learning_rate": 0.00016855950818180638, + "loss": 1.4677, + "step": 12111 + }, + { + "epoch": 0.15738991590905768, + "grad_norm": 0.42347508668899536, + "learning_rate": 0.00016855690871989498, + "loss": 1.3983, + "step": 12112 + }, + { + "epoch": 0.15740291045297355, + "grad_norm": 0.3543795645236969, + "learning_rate": 0.0001685543092579836, + "loss": 1.5169, + "step": 12113 + }, + { + "epoch": 0.15741590499688943, + "grad_norm": 0.4439345598220825, + "learning_rate": 0.00016855170979607223, + "loss": 1.4053, + "step": 12114 + }, + { + "epoch": 0.1574288995408053, + "grad_norm": 0.4256225526332855, + "learning_rate": 0.00016854911033416082, + "loss": 1.6394, + "step": 12115 + }, + { + "epoch": 0.15744189408472117, + "grad_norm": 0.3350345492362976, + "learning_rate": 0.00016854651087224947, + "loss": 1.348, + "step": 12116 + }, + { + "epoch": 0.15745488862863705, + "grad_norm": 0.3530762791633606, + "learning_rate": 0.00016854391141033807, + "loss": 1.3662, + "step": 12117 + }, + { + "epoch": 0.15746788317255292, + "grad_norm": 0.43185654282569885, + "learning_rate": 0.0001685413119484267, + "loss": 1.4478, + "step": 12118 + }, + { + "epoch": 0.1574808777164688, + "grad_norm": 0.3207387626171112, + "learning_rate": 0.0001685387124865153, + "loss": 1.332, + "step": 12119 + }, + { + "epoch": 0.15749387226038467, + "grad_norm": 0.45395734906196594, + "learning_rate": 0.00016853611302460392, + "loss": 1.5576, + "step": 12120 + }, + { + "epoch": 0.15750686680430054, + "grad_norm": 0.5471612215042114, + "learning_rate": 0.00016853351356269254, + "loss": 1.3549, + "step": 12121 + }, + { + "epoch": 0.1575198613482164, + "grad_norm": 0.24745294451713562, + "learning_rate": 0.00016853091410078114, + "loss": 1.3708, + "step": 12122 + }, + { + "epoch": 0.15753285589213228, + "grad_norm": 0.3814707100391388, + "learning_rate": 0.00016852831463886976, + "loss": 1.513, + "step": 12123 + }, + { + "epoch": 0.15754585043604816, + "grad_norm": 0.3231234550476074, + "learning_rate": 0.0001685257151769584, + "loss": 1.3583, + "step": 12124 + }, + { + "epoch": 0.15755884497996403, + "grad_norm": 0.30630233883857727, + "learning_rate": 0.00016852311571504699, + "loss": 1.4103, + "step": 12125 + }, + { + "epoch": 0.1575718395238799, + "grad_norm": 0.39077651500701904, + "learning_rate": 0.0001685205162531356, + "loss": 1.3963, + "step": 12126 + }, + { + "epoch": 0.15758483406779578, + "grad_norm": 0.2572426497936249, + "learning_rate": 0.0001685179167912242, + "loss": 1.3737, + "step": 12127 + }, + { + "epoch": 0.15759782861171165, + "grad_norm": 0.35180747509002686, + "learning_rate": 0.00016851531732931286, + "loss": 1.5199, + "step": 12128 + }, + { + "epoch": 0.15761082315562752, + "grad_norm": 0.37713390588760376, + "learning_rate": 0.00016851271786740146, + "loss": 1.4161, + "step": 12129 + }, + { + "epoch": 0.1576238176995434, + "grad_norm": 0.30682334303855896, + "learning_rate": 0.00016851011840549008, + "loss": 1.4216, + "step": 12130 + }, + { + "epoch": 0.15763681224345927, + "grad_norm": 0.42932432889938354, + "learning_rate": 0.00016850751894357868, + "loss": 1.3388, + "step": 12131 + }, + { + "epoch": 0.15764980678737514, + "grad_norm": 0.40550339221954346, + "learning_rate": 0.0001685049194816673, + "loss": 1.4025, + "step": 12132 + }, + { + "epoch": 0.157662801331291, + "grad_norm": 0.27826327085494995, + "learning_rate": 0.00016850232001975593, + "loss": 1.3198, + "step": 12133 + }, + { + "epoch": 0.1576757958752069, + "grad_norm": 0.422445684671402, + "learning_rate": 0.00016849972055784453, + "loss": 1.4361, + "step": 12134 + }, + { + "epoch": 0.15768879041912276, + "grad_norm": 0.3424423336982727, + "learning_rate": 0.00016849712109593315, + "loss": 1.2521, + "step": 12135 + }, + { + "epoch": 0.15770178496303863, + "grad_norm": 0.40800026059150696, + "learning_rate": 0.00016849452163402177, + "loss": 1.3826, + "step": 12136 + }, + { + "epoch": 0.1577147795069545, + "grad_norm": 0.23556606471538544, + "learning_rate": 0.00016849192217211037, + "loss": 1.3703, + "step": 12137 + }, + { + "epoch": 0.15772777405087038, + "grad_norm": 0.3475390374660492, + "learning_rate": 0.000168489322710199, + "loss": 1.4566, + "step": 12138 + }, + { + "epoch": 0.15774076859478625, + "grad_norm": 0.3246475160121918, + "learning_rate": 0.0001684867232482876, + "loss": 1.2551, + "step": 12139 + }, + { + "epoch": 0.15775376313870212, + "grad_norm": 0.35173729062080383, + "learning_rate": 0.00016848412378637625, + "loss": 1.401, + "step": 12140 + }, + { + "epoch": 0.157766757682618, + "grad_norm": 0.3947131931781769, + "learning_rate": 0.00016848152432446484, + "loss": 1.4912, + "step": 12141 + }, + { + "epoch": 0.15777975222653387, + "grad_norm": 0.43315646052360535, + "learning_rate": 0.00016847892486255347, + "loss": 1.4778, + "step": 12142 + }, + { + "epoch": 0.15779274677044977, + "grad_norm": 0.413069486618042, + "learning_rate": 0.00016847632540064206, + "loss": 1.5419, + "step": 12143 + }, + { + "epoch": 0.15780574131436564, + "grad_norm": 0.3021068274974823, + "learning_rate": 0.0001684737259387307, + "loss": 1.368, + "step": 12144 + }, + { + "epoch": 0.15781873585828152, + "grad_norm": 0.3128720819950104, + "learning_rate": 0.0001684711264768193, + "loss": 1.3739, + "step": 12145 + }, + { + "epoch": 0.1578317304021974, + "grad_norm": 0.3862049877643585, + "learning_rate": 0.0001684685270149079, + "loss": 1.4451, + "step": 12146 + }, + { + "epoch": 0.15784472494611326, + "grad_norm": 0.3226591944694519, + "learning_rate": 0.00016846592755299654, + "loss": 1.3165, + "step": 12147 + }, + { + "epoch": 0.15785771949002914, + "grad_norm": 0.3609507381916046, + "learning_rate": 0.00016846332809108516, + "loss": 1.3225, + "step": 12148 + }, + { + "epoch": 0.157870714033945, + "grad_norm": 0.38621002435684204, + "learning_rate": 0.00016846072862917376, + "loss": 1.5426, + "step": 12149 + }, + { + "epoch": 0.15788370857786088, + "grad_norm": 0.4232640564441681, + "learning_rate": 0.00016845812916726238, + "loss": 1.5035, + "step": 12150 + }, + { + "epoch": 0.15789670312177675, + "grad_norm": 0.390117883682251, + "learning_rate": 0.00016845552970535098, + "loss": 1.5398, + "step": 12151 + }, + { + "epoch": 0.15790969766569263, + "grad_norm": 0.35571977496147156, + "learning_rate": 0.00016845293024343963, + "loss": 1.5468, + "step": 12152 + }, + { + "epoch": 0.1579226922096085, + "grad_norm": 0.3840009272098541, + "learning_rate": 0.00016845033078152823, + "loss": 1.301, + "step": 12153 + }, + { + "epoch": 0.15793568675352437, + "grad_norm": 0.3309866786003113, + "learning_rate": 0.00016844773131961685, + "loss": 1.4471, + "step": 12154 + }, + { + "epoch": 0.15794868129744025, + "grad_norm": 0.3633089065551758, + "learning_rate": 0.00016844513185770548, + "loss": 1.3995, + "step": 12155 + }, + { + "epoch": 0.15796167584135612, + "grad_norm": 0.4083336293697357, + "learning_rate": 0.00016844253239579407, + "loss": 1.4613, + "step": 12156 + }, + { + "epoch": 0.157974670385272, + "grad_norm": 0.3916897475719452, + "learning_rate": 0.0001684399329338827, + "loss": 1.454, + "step": 12157 + }, + { + "epoch": 0.15798766492918787, + "grad_norm": 0.3418882489204407, + "learning_rate": 0.0001684373334719713, + "loss": 1.2817, + "step": 12158 + }, + { + "epoch": 0.15800065947310374, + "grad_norm": 0.35763347148895264, + "learning_rate": 0.00016843473401005995, + "loss": 1.3088, + "step": 12159 + }, + { + "epoch": 0.1580136540170196, + "grad_norm": 0.41593673825263977, + "learning_rate": 0.00016843213454814855, + "loss": 1.4513, + "step": 12160 + }, + { + "epoch": 0.15802664856093548, + "grad_norm": 0.3025377094745636, + "learning_rate": 0.00016842953508623714, + "loss": 1.3647, + "step": 12161 + }, + { + "epoch": 0.15803964310485136, + "grad_norm": 0.37623780965805054, + "learning_rate": 0.00016842693562432577, + "loss": 1.4612, + "step": 12162 + }, + { + "epoch": 0.15805263764876723, + "grad_norm": 0.3976060450077057, + "learning_rate": 0.0001684243361624144, + "loss": 1.3837, + "step": 12163 + }, + { + "epoch": 0.1580656321926831, + "grad_norm": 0.4235958755016327, + "learning_rate": 0.00016842173670050302, + "loss": 1.514, + "step": 12164 + }, + { + "epoch": 0.15807862673659898, + "grad_norm": 0.4469101130962372, + "learning_rate": 0.0001684191372385916, + "loss": 1.4825, + "step": 12165 + }, + { + "epoch": 0.15809162128051485, + "grad_norm": 0.4407886266708374, + "learning_rate": 0.00016841653777668024, + "loss": 1.5262, + "step": 12166 + }, + { + "epoch": 0.15810461582443072, + "grad_norm": 0.468839168548584, + "learning_rate": 0.00016841393831476886, + "loss": 1.4787, + "step": 12167 + }, + { + "epoch": 0.1581176103683466, + "grad_norm": 0.3731576204299927, + "learning_rate": 0.00016841133885285746, + "loss": 1.3271, + "step": 12168 + }, + { + "epoch": 0.15813060491226247, + "grad_norm": 0.31835997104644775, + "learning_rate": 0.00016840873939094608, + "loss": 1.342, + "step": 12169 + }, + { + "epoch": 0.15814359945617834, + "grad_norm": 0.43828311562538147, + "learning_rate": 0.00016840613992903468, + "loss": 1.4252, + "step": 12170 + }, + { + "epoch": 0.1581565940000942, + "grad_norm": 0.3683306872844696, + "learning_rate": 0.00016840354046712333, + "loss": 1.5245, + "step": 12171 + }, + { + "epoch": 0.1581695885440101, + "grad_norm": 0.4457702934741974, + "learning_rate": 0.00016840094100521193, + "loss": 1.4573, + "step": 12172 + }, + { + "epoch": 0.15818258308792596, + "grad_norm": 0.3749345541000366, + "learning_rate": 0.00016839834154330055, + "loss": 1.503, + "step": 12173 + }, + { + "epoch": 0.15819557763184183, + "grad_norm": 0.43286803364753723, + "learning_rate": 0.00016839574208138915, + "loss": 1.3979, + "step": 12174 + }, + { + "epoch": 0.1582085721757577, + "grad_norm": 0.43005579710006714, + "learning_rate": 0.00016839314261947778, + "loss": 1.4188, + "step": 12175 + }, + { + "epoch": 0.15822156671967358, + "grad_norm": 0.5076369643211365, + "learning_rate": 0.0001683905431575664, + "loss": 1.4817, + "step": 12176 + }, + { + "epoch": 0.15823456126358945, + "grad_norm": 0.4112192988395691, + "learning_rate": 0.000168387943695655, + "loss": 1.6797, + "step": 12177 + }, + { + "epoch": 0.15824755580750532, + "grad_norm": 0.38447749614715576, + "learning_rate": 0.00016838534423374362, + "loss": 1.3085, + "step": 12178 + }, + { + "epoch": 0.1582605503514212, + "grad_norm": 0.34840282797813416, + "learning_rate": 0.00016838274477183225, + "loss": 1.1225, + "step": 12179 + }, + { + "epoch": 0.15827354489533707, + "grad_norm": 0.3794090747833252, + "learning_rate": 0.00016838014530992085, + "loss": 1.554, + "step": 12180 + }, + { + "epoch": 0.15828653943925294, + "grad_norm": 0.3945227265357971, + "learning_rate": 0.00016837754584800947, + "loss": 1.4316, + "step": 12181 + }, + { + "epoch": 0.15829953398316882, + "grad_norm": 0.4256705343723297, + "learning_rate": 0.00016837494638609807, + "loss": 1.386, + "step": 12182 + }, + { + "epoch": 0.1583125285270847, + "grad_norm": 0.3695443570613861, + "learning_rate": 0.00016837234692418672, + "loss": 1.3649, + "step": 12183 + }, + { + "epoch": 0.15832552307100056, + "grad_norm": 0.39672568440437317, + "learning_rate": 0.00016836974746227532, + "loss": 1.277, + "step": 12184 + }, + { + "epoch": 0.15833851761491644, + "grad_norm": 0.31588834524154663, + "learning_rate": 0.00016836714800036394, + "loss": 1.3848, + "step": 12185 + }, + { + "epoch": 0.1583515121588323, + "grad_norm": 0.45509183406829834, + "learning_rate": 0.00016836454853845254, + "loss": 1.2637, + "step": 12186 + }, + { + "epoch": 0.15836450670274818, + "grad_norm": 0.38124537467956543, + "learning_rate": 0.00016836194907654116, + "loss": 1.2465, + "step": 12187 + }, + { + "epoch": 0.15837750124666405, + "grad_norm": 0.46467000246047974, + "learning_rate": 0.0001683593496146298, + "loss": 1.6668, + "step": 12188 + }, + { + "epoch": 0.15839049579057993, + "grad_norm": 0.29997479915618896, + "learning_rate": 0.00016835675015271838, + "loss": 1.1691, + "step": 12189 + }, + { + "epoch": 0.1584034903344958, + "grad_norm": 0.4041958153247833, + "learning_rate": 0.000168354150690807, + "loss": 1.3437, + "step": 12190 + }, + { + "epoch": 0.15841648487841167, + "grad_norm": 0.4309135377407074, + "learning_rate": 0.00016835155122889563, + "loss": 1.5585, + "step": 12191 + }, + { + "epoch": 0.15842947942232755, + "grad_norm": 0.4675934314727783, + "learning_rate": 0.00016834895176698423, + "loss": 1.4627, + "step": 12192 + }, + { + "epoch": 0.15844247396624342, + "grad_norm": 0.4504415690898895, + "learning_rate": 0.00016834635230507285, + "loss": 1.501, + "step": 12193 + }, + { + "epoch": 0.1584554685101593, + "grad_norm": 0.3013511896133423, + "learning_rate": 0.00016834375284316148, + "loss": 1.3931, + "step": 12194 + }, + { + "epoch": 0.15846846305407516, + "grad_norm": 0.33756259083747864, + "learning_rate": 0.0001683411533812501, + "loss": 1.5259, + "step": 12195 + }, + { + "epoch": 0.15848145759799104, + "grad_norm": 0.30408650636672974, + "learning_rate": 0.0001683385539193387, + "loss": 1.3791, + "step": 12196 + }, + { + "epoch": 0.1584944521419069, + "grad_norm": 0.5211697816848755, + "learning_rate": 0.00016833595445742733, + "loss": 1.6588, + "step": 12197 + }, + { + "epoch": 0.15850744668582278, + "grad_norm": 0.41396358609199524, + "learning_rate": 0.00016833335499551595, + "loss": 1.1439, + "step": 12198 + }, + { + "epoch": 0.15852044122973866, + "grad_norm": 0.4607744812965393, + "learning_rate": 0.00016833075553360455, + "loss": 1.5323, + "step": 12199 + }, + { + "epoch": 0.15853343577365453, + "grad_norm": 0.4046708047389984, + "learning_rate": 0.00016832815607169317, + "loss": 1.215, + "step": 12200 + }, + { + "epoch": 0.1585464303175704, + "grad_norm": 0.4478952884674072, + "learning_rate": 0.00016832555660978177, + "loss": 1.4066, + "step": 12201 + }, + { + "epoch": 0.15855942486148628, + "grad_norm": 0.3196367621421814, + "learning_rate": 0.00016832295714787042, + "loss": 1.3782, + "step": 12202 + }, + { + "epoch": 0.15857241940540215, + "grad_norm": 0.32422423362731934, + "learning_rate": 0.00016832035768595902, + "loss": 1.3825, + "step": 12203 + }, + { + "epoch": 0.15858541394931802, + "grad_norm": 0.4274030327796936, + "learning_rate": 0.00016831775822404762, + "loss": 1.2455, + "step": 12204 + }, + { + "epoch": 0.1585984084932339, + "grad_norm": 0.39410191774368286, + "learning_rate": 0.00016831515876213624, + "loss": 1.3206, + "step": 12205 + }, + { + "epoch": 0.15861140303714977, + "grad_norm": 0.3155837059020996, + "learning_rate": 0.00016831255930022486, + "loss": 1.3546, + "step": 12206 + }, + { + "epoch": 0.15862439758106564, + "grad_norm": 0.3832182288169861, + "learning_rate": 0.0001683099598383135, + "loss": 1.4002, + "step": 12207 + }, + { + "epoch": 0.1586373921249815, + "grad_norm": 0.44513365626335144, + "learning_rate": 0.0001683073603764021, + "loss": 1.4265, + "step": 12208 + }, + { + "epoch": 0.1586503866688974, + "grad_norm": 0.344574511051178, + "learning_rate": 0.0001683047609144907, + "loss": 1.1895, + "step": 12209 + }, + { + "epoch": 0.15866338121281326, + "grad_norm": 0.37313321232795715, + "learning_rate": 0.00016830216145257934, + "loss": 1.2722, + "step": 12210 + }, + { + "epoch": 0.15867637575672913, + "grad_norm": 0.4319547414779663, + "learning_rate": 0.00016829956199066793, + "loss": 1.4179, + "step": 12211 + }, + { + "epoch": 0.158689370300645, + "grad_norm": 0.36353787779808044, + "learning_rate": 0.00016829696252875656, + "loss": 1.4473, + "step": 12212 + }, + { + "epoch": 0.15870236484456088, + "grad_norm": 0.38511598110198975, + "learning_rate": 0.00016829436306684515, + "loss": 1.1778, + "step": 12213 + }, + { + "epoch": 0.15871535938847675, + "grad_norm": 0.4999373257160187, + "learning_rate": 0.0001682917636049338, + "loss": 1.4818, + "step": 12214 + }, + { + "epoch": 0.15872835393239262, + "grad_norm": 0.39421847462654114, + "learning_rate": 0.0001682891641430224, + "loss": 1.4275, + "step": 12215 + }, + { + "epoch": 0.1587413484763085, + "grad_norm": 0.849568247795105, + "learning_rate": 0.000168286564681111, + "loss": 1.3637, + "step": 12216 + }, + { + "epoch": 0.15875434302022437, + "grad_norm": 0.42851266264915466, + "learning_rate": 0.00016828396521919963, + "loss": 1.5218, + "step": 12217 + }, + { + "epoch": 0.15876733756414024, + "grad_norm": 0.38274115324020386, + "learning_rate": 0.00016828136575728825, + "loss": 1.3916, + "step": 12218 + }, + { + "epoch": 0.15878033210805614, + "grad_norm": 0.41079720854759216, + "learning_rate": 0.00016827876629537687, + "loss": 1.4406, + "step": 12219 + }, + { + "epoch": 0.15879332665197202, + "grad_norm": 0.3841322660446167, + "learning_rate": 0.00016827616683346547, + "loss": 1.2904, + "step": 12220 + }, + { + "epoch": 0.1588063211958879, + "grad_norm": 0.4471125304698944, + "learning_rate": 0.0001682735673715541, + "loss": 1.4745, + "step": 12221 + }, + { + "epoch": 0.15881931573980376, + "grad_norm": 0.42396268248558044, + "learning_rate": 0.00016827096790964272, + "loss": 1.4187, + "step": 12222 + }, + { + "epoch": 0.15883231028371964, + "grad_norm": 0.34756559133529663, + "learning_rate": 0.00016826836844773132, + "loss": 1.423, + "step": 12223 + }, + { + "epoch": 0.1588453048276355, + "grad_norm": 0.3663460910320282, + "learning_rate": 0.00016826576898581994, + "loss": 1.4851, + "step": 12224 + }, + { + "epoch": 0.15885829937155138, + "grad_norm": 0.4245162904262543, + "learning_rate": 0.00016826316952390854, + "loss": 1.4524, + "step": 12225 + }, + { + "epoch": 0.15887129391546725, + "grad_norm": 0.3648967146873474, + "learning_rate": 0.0001682605700619972, + "loss": 1.452, + "step": 12226 + }, + { + "epoch": 0.15888428845938313, + "grad_norm": 0.4318394362926483, + "learning_rate": 0.0001682579706000858, + "loss": 1.36, + "step": 12227 + }, + { + "epoch": 0.158897283003299, + "grad_norm": 0.4152277708053589, + "learning_rate": 0.00016825537113817441, + "loss": 1.5309, + "step": 12228 + }, + { + "epoch": 0.15891027754721487, + "grad_norm": 0.3483196198940277, + "learning_rate": 0.00016825277167626304, + "loss": 1.3308, + "step": 12229 + }, + { + "epoch": 0.15892327209113075, + "grad_norm": 0.3724815547466278, + "learning_rate": 0.00016825017221435164, + "loss": 1.3413, + "step": 12230 + }, + { + "epoch": 0.15893626663504662, + "grad_norm": 0.3755175769329071, + "learning_rate": 0.00016824757275244026, + "loss": 1.6108, + "step": 12231 + }, + { + "epoch": 0.1589492611789625, + "grad_norm": 0.37909120321273804, + "learning_rate": 0.00016824497329052886, + "loss": 1.4606, + "step": 12232 + }, + { + "epoch": 0.15896225572287837, + "grad_norm": 0.3077991306781769, + "learning_rate": 0.00016824237382861748, + "loss": 1.3839, + "step": 12233 + }, + { + "epoch": 0.15897525026679424, + "grad_norm": 0.4607841372489929, + "learning_rate": 0.0001682397743667061, + "loss": 1.6681, + "step": 12234 + }, + { + "epoch": 0.1589882448107101, + "grad_norm": 0.47513526678085327, + "learning_rate": 0.0001682371749047947, + "loss": 1.4253, + "step": 12235 + }, + { + "epoch": 0.15900123935462598, + "grad_norm": 0.4572325348854065, + "learning_rate": 0.00016823457544288333, + "loss": 1.3791, + "step": 12236 + }, + { + "epoch": 0.15901423389854186, + "grad_norm": 0.32650768756866455, + "learning_rate": 0.00016823197598097195, + "loss": 1.5368, + "step": 12237 + }, + { + "epoch": 0.15902722844245773, + "grad_norm": 0.3811675012111664, + "learning_rate": 0.00016822937651906058, + "loss": 1.4804, + "step": 12238 + }, + { + "epoch": 0.1590402229863736, + "grad_norm": 0.33600297570228577, + "learning_rate": 0.00016822677705714917, + "loss": 1.34, + "step": 12239 + }, + { + "epoch": 0.15905321753028948, + "grad_norm": 0.4331541955471039, + "learning_rate": 0.0001682241775952378, + "loss": 1.3042, + "step": 12240 + }, + { + "epoch": 0.15906621207420535, + "grad_norm": 0.4234504997730255, + "learning_rate": 0.00016822157813332642, + "loss": 1.4482, + "step": 12241 + }, + { + "epoch": 0.15907920661812122, + "grad_norm": 0.36658522486686707, + "learning_rate": 0.00016821897867141502, + "loss": 1.3592, + "step": 12242 + }, + { + "epoch": 0.1590922011620371, + "grad_norm": 0.4279352128505707, + "learning_rate": 0.00016821637920950365, + "loss": 1.3467, + "step": 12243 + }, + { + "epoch": 0.15910519570595297, + "grad_norm": 0.36761078238487244, + "learning_rate": 0.00016821377974759224, + "loss": 1.317, + "step": 12244 + }, + { + "epoch": 0.15911819024986884, + "grad_norm": 0.4478946626186371, + "learning_rate": 0.00016821118028568087, + "loss": 1.5632, + "step": 12245 + }, + { + "epoch": 0.1591311847937847, + "grad_norm": 0.3182785212993622, + "learning_rate": 0.0001682085808237695, + "loss": 1.543, + "step": 12246 + }, + { + "epoch": 0.1591441793377006, + "grad_norm": 0.4061199128627777, + "learning_rate": 0.0001682059813618581, + "loss": 1.4911, + "step": 12247 + }, + { + "epoch": 0.15915717388161646, + "grad_norm": 0.43322935700416565, + "learning_rate": 0.00016820338189994671, + "loss": 1.4278, + "step": 12248 + }, + { + "epoch": 0.15917016842553233, + "grad_norm": 0.30969804525375366, + "learning_rate": 0.00016820078243803534, + "loss": 1.3471, + "step": 12249 + }, + { + "epoch": 0.1591831629694482, + "grad_norm": 0.36357054114341736, + "learning_rate": 0.00016819818297612396, + "loss": 1.5466, + "step": 12250 + }, + { + "epoch": 0.15919615751336408, + "grad_norm": 0.44624578952789307, + "learning_rate": 0.00016819558351421256, + "loss": 1.4011, + "step": 12251 + }, + { + "epoch": 0.15920915205727995, + "grad_norm": 0.35145607590675354, + "learning_rate": 0.00016819298405230118, + "loss": 1.369, + "step": 12252 + }, + { + "epoch": 0.15922214660119582, + "grad_norm": 0.2584977447986603, + "learning_rate": 0.0001681903845903898, + "loss": 1.4514, + "step": 12253 + }, + { + "epoch": 0.1592351411451117, + "grad_norm": 0.4205785393714905, + "learning_rate": 0.0001681877851284784, + "loss": 1.3796, + "step": 12254 + }, + { + "epoch": 0.15924813568902757, + "grad_norm": 0.32457953691482544, + "learning_rate": 0.00016818518566656703, + "loss": 1.3525, + "step": 12255 + }, + { + "epoch": 0.15926113023294344, + "grad_norm": 0.4039461016654968, + "learning_rate": 0.00016818258620465563, + "loss": 1.3716, + "step": 12256 + }, + { + "epoch": 0.15927412477685932, + "grad_norm": 0.3441017270088196, + "learning_rate": 0.00016817998674274428, + "loss": 1.4265, + "step": 12257 + }, + { + "epoch": 0.1592871193207752, + "grad_norm": 0.3918721079826355, + "learning_rate": 0.00016817738728083288, + "loss": 1.4946, + "step": 12258 + }, + { + "epoch": 0.15930011386469106, + "grad_norm": 0.45541828870773315, + "learning_rate": 0.00016817478781892147, + "loss": 1.3818, + "step": 12259 + }, + { + "epoch": 0.15931310840860693, + "grad_norm": 0.3422129452228546, + "learning_rate": 0.0001681721883570101, + "loss": 1.373, + "step": 12260 + }, + { + "epoch": 0.1593261029525228, + "grad_norm": 0.43476754426956177, + "learning_rate": 0.00016816958889509872, + "loss": 1.5534, + "step": 12261 + }, + { + "epoch": 0.15933909749643868, + "grad_norm": 0.3889281153678894, + "learning_rate": 0.00016816698943318735, + "loss": 1.4018, + "step": 12262 + }, + { + "epoch": 0.15935209204035455, + "grad_norm": 0.360443651676178, + "learning_rate": 0.00016816438997127595, + "loss": 1.3878, + "step": 12263 + }, + { + "epoch": 0.15936508658427043, + "grad_norm": 0.4079136848449707, + "learning_rate": 0.00016816179050936457, + "loss": 1.433, + "step": 12264 + }, + { + "epoch": 0.1593780811281863, + "grad_norm": 0.4729766249656677, + "learning_rate": 0.0001681591910474532, + "loss": 1.5027, + "step": 12265 + }, + { + "epoch": 0.15939107567210217, + "grad_norm": 0.3890770971775055, + "learning_rate": 0.0001681565915855418, + "loss": 1.4746, + "step": 12266 + }, + { + "epoch": 0.15940407021601805, + "grad_norm": 0.4352772533893585, + "learning_rate": 0.00016815399212363042, + "loss": 1.3511, + "step": 12267 + }, + { + "epoch": 0.15941706475993392, + "grad_norm": 0.4645844101905823, + "learning_rate": 0.00016815139266171904, + "loss": 1.5011, + "step": 12268 + }, + { + "epoch": 0.1594300593038498, + "grad_norm": 0.4356909692287445, + "learning_rate": 0.00016814879319980767, + "loss": 1.3281, + "step": 12269 + }, + { + "epoch": 0.15944305384776566, + "grad_norm": 0.45108431577682495, + "learning_rate": 0.00016814619373789626, + "loss": 1.4642, + "step": 12270 + }, + { + "epoch": 0.15945604839168154, + "grad_norm": 0.4070507287979126, + "learning_rate": 0.00016814359427598486, + "loss": 1.4571, + "step": 12271 + }, + { + "epoch": 0.1594690429355974, + "grad_norm": 0.4008220434188843, + "learning_rate": 0.0001681409948140735, + "loss": 1.5671, + "step": 12272 + }, + { + "epoch": 0.15948203747951328, + "grad_norm": 0.44126230478286743, + "learning_rate": 0.0001681383953521621, + "loss": 1.3447, + "step": 12273 + }, + { + "epoch": 0.15949503202342916, + "grad_norm": 0.4537971317768097, + "learning_rate": 0.00016813579589025073, + "loss": 1.4687, + "step": 12274 + }, + { + "epoch": 0.15950802656734503, + "grad_norm": 0.4635203182697296, + "learning_rate": 0.00016813319642833933, + "loss": 1.3791, + "step": 12275 + }, + { + "epoch": 0.1595210211112609, + "grad_norm": 0.4206159710884094, + "learning_rate": 0.00016813059696642796, + "loss": 1.3304, + "step": 12276 + }, + { + "epoch": 0.15953401565517678, + "grad_norm": 0.40822598338127136, + "learning_rate": 0.00016812799750451658, + "loss": 1.3977, + "step": 12277 + }, + { + "epoch": 0.15954701019909265, + "grad_norm": 0.3308940827846527, + "learning_rate": 0.00016812539804260518, + "loss": 1.3779, + "step": 12278 + }, + { + "epoch": 0.15956000474300852, + "grad_norm": 0.38944390416145325, + "learning_rate": 0.0001681227985806938, + "loss": 1.5612, + "step": 12279 + }, + { + "epoch": 0.1595729992869244, + "grad_norm": 0.38299763202667236, + "learning_rate": 0.00016812019911878243, + "loss": 1.4782, + "step": 12280 + }, + { + "epoch": 0.15958599383084027, + "grad_norm": 0.3599013686180115, + "learning_rate": 0.00016811759965687105, + "loss": 1.2701, + "step": 12281 + }, + { + "epoch": 0.15959898837475614, + "grad_norm": 0.30971989035606384, + "learning_rate": 0.00016811500019495965, + "loss": 1.1869, + "step": 12282 + }, + { + "epoch": 0.159611982918672, + "grad_norm": 0.2928314208984375, + "learning_rate": 0.00016811240073304825, + "loss": 0.9674, + "step": 12283 + }, + { + "epoch": 0.15962497746258789, + "grad_norm": 0.36406126618385315, + "learning_rate": 0.0001681098012711369, + "loss": 1.3816, + "step": 12284 + }, + { + "epoch": 0.15963797200650376, + "grad_norm": 0.4082266092300415, + "learning_rate": 0.0001681072018092255, + "loss": 1.3682, + "step": 12285 + }, + { + "epoch": 0.15965096655041963, + "grad_norm": 0.35065507888793945, + "learning_rate": 0.00016810460234731412, + "loss": 1.3165, + "step": 12286 + }, + { + "epoch": 0.1596639610943355, + "grad_norm": 0.47152179479599, + "learning_rate": 0.00016810200288540272, + "loss": 1.594, + "step": 12287 + }, + { + "epoch": 0.15967695563825138, + "grad_norm": 0.42153266072273254, + "learning_rate": 0.00016809940342349134, + "loss": 1.5297, + "step": 12288 + }, + { + "epoch": 0.15968995018216725, + "grad_norm": 0.4511811435222626, + "learning_rate": 0.00016809680396157997, + "loss": 1.2811, + "step": 12289 + }, + { + "epoch": 0.15970294472608312, + "grad_norm": 0.487887978553772, + "learning_rate": 0.00016809420449966856, + "loss": 1.3768, + "step": 12290 + }, + { + "epoch": 0.159715939269999, + "grad_norm": 0.3068332076072693, + "learning_rate": 0.0001680916050377572, + "loss": 1.2984, + "step": 12291 + }, + { + "epoch": 0.15972893381391487, + "grad_norm": 0.3803914785385132, + "learning_rate": 0.0001680890055758458, + "loss": 1.4536, + "step": 12292 + }, + { + "epoch": 0.15974192835783074, + "grad_norm": 0.2833569645881653, + "learning_rate": 0.00016808640611393444, + "loss": 1.4077, + "step": 12293 + }, + { + "epoch": 0.15975492290174662, + "grad_norm": 0.43343907594680786, + "learning_rate": 0.00016808380665202303, + "loss": 1.4809, + "step": 12294 + }, + { + "epoch": 0.15976791744566252, + "grad_norm": 0.5081380605697632, + "learning_rate": 0.00016808120719011166, + "loss": 1.4445, + "step": 12295 + }, + { + "epoch": 0.1597809119895784, + "grad_norm": 0.341413676738739, + "learning_rate": 0.00016807860772820028, + "loss": 1.5356, + "step": 12296 + }, + { + "epoch": 0.15979390653349426, + "grad_norm": 0.3639095425605774, + "learning_rate": 0.00016807600826628888, + "loss": 1.2932, + "step": 12297 + }, + { + "epoch": 0.15980690107741014, + "grad_norm": 0.41499632596969604, + "learning_rate": 0.0001680734088043775, + "loss": 1.3549, + "step": 12298 + }, + { + "epoch": 0.159819895621326, + "grad_norm": 0.390863835811615, + "learning_rate": 0.0001680708093424661, + "loss": 1.3866, + "step": 12299 + }, + { + "epoch": 0.15983289016524188, + "grad_norm": 0.48037195205688477, + "learning_rate": 0.00016806820988055473, + "loss": 1.582, + "step": 12300 + }, + { + "epoch": 0.15984588470915775, + "grad_norm": 0.337342232465744, + "learning_rate": 0.00016806561041864335, + "loss": 1.4569, + "step": 12301 + }, + { + "epoch": 0.15985887925307363, + "grad_norm": 0.3409208059310913, + "learning_rate": 0.00016806301095673195, + "loss": 1.4153, + "step": 12302 + }, + { + "epoch": 0.1598718737969895, + "grad_norm": 0.9956346154212952, + "learning_rate": 0.0001680604114948206, + "loss": 1.3828, + "step": 12303 + }, + { + "epoch": 0.15988486834090537, + "grad_norm": 0.29615992307662964, + "learning_rate": 0.0001680578120329092, + "loss": 1.4366, + "step": 12304 + }, + { + "epoch": 0.15989786288482125, + "grad_norm": 0.347982794046402, + "learning_rate": 0.00016805521257099782, + "loss": 1.3476, + "step": 12305 + }, + { + "epoch": 0.15991085742873712, + "grad_norm": 0.4028204083442688, + "learning_rate": 0.00016805261310908642, + "loss": 1.3924, + "step": 12306 + }, + { + "epoch": 0.159923851972653, + "grad_norm": 0.38938236236572266, + "learning_rate": 0.00016805001364717504, + "loss": 1.5419, + "step": 12307 + }, + { + "epoch": 0.15993684651656886, + "grad_norm": 0.5025116801261902, + "learning_rate": 0.00016804741418526367, + "loss": 1.5345, + "step": 12308 + }, + { + "epoch": 0.15994984106048474, + "grad_norm": 0.5307797789573669, + "learning_rate": 0.00016804481472335227, + "loss": 1.3458, + "step": 12309 + }, + { + "epoch": 0.1599628356044006, + "grad_norm": 0.422865092754364, + "learning_rate": 0.0001680422152614409, + "loss": 1.3635, + "step": 12310 + }, + { + "epoch": 0.15997583014831648, + "grad_norm": 0.4449157416820526, + "learning_rate": 0.00016803961579952951, + "loss": 1.5415, + "step": 12311 + }, + { + "epoch": 0.15998882469223236, + "grad_norm": 0.3656712770462036, + "learning_rate": 0.00016803701633761814, + "loss": 1.375, + "step": 12312 + }, + { + "epoch": 0.16000181923614823, + "grad_norm": 0.3500148355960846, + "learning_rate": 0.00016803441687570674, + "loss": 1.3688, + "step": 12313 + }, + { + "epoch": 0.1600148137800641, + "grad_norm": 0.5006784200668335, + "learning_rate": 0.00016803181741379533, + "loss": 1.4751, + "step": 12314 + }, + { + "epoch": 0.16002780832397998, + "grad_norm": 0.4967231750488281, + "learning_rate": 0.00016802921795188398, + "loss": 1.454, + "step": 12315 + }, + { + "epoch": 0.16004080286789585, + "grad_norm": 0.4023984372615814, + "learning_rate": 0.00016802661848997258, + "loss": 1.4578, + "step": 12316 + }, + { + "epoch": 0.16005379741181172, + "grad_norm": 0.4796692728996277, + "learning_rate": 0.0001680240190280612, + "loss": 1.4378, + "step": 12317 + }, + { + "epoch": 0.1600667919557276, + "grad_norm": 0.39401862025260925, + "learning_rate": 0.0001680214195661498, + "loss": 1.363, + "step": 12318 + }, + { + "epoch": 0.16007978649964347, + "grad_norm": 0.41930028796195984, + "learning_rate": 0.00016801882010423843, + "loss": 1.4318, + "step": 12319 + }, + { + "epoch": 0.16009278104355934, + "grad_norm": 0.3851275146007538, + "learning_rate": 0.00016801622064232705, + "loss": 1.3894, + "step": 12320 + }, + { + "epoch": 0.1601057755874752, + "grad_norm": 0.3232443034648895, + "learning_rate": 0.00016801362118041565, + "loss": 1.3706, + "step": 12321 + }, + { + "epoch": 0.16011877013139109, + "grad_norm": 0.39164260029792786, + "learning_rate": 0.00016801102171850427, + "loss": 1.5531, + "step": 12322 + }, + { + "epoch": 0.16013176467530696, + "grad_norm": 0.4041592478752136, + "learning_rate": 0.0001680084222565929, + "loss": 1.5064, + "step": 12323 + }, + { + "epoch": 0.16014475921922283, + "grad_norm": 0.4000818431377411, + "learning_rate": 0.00016800582279468152, + "loss": 1.5579, + "step": 12324 + }, + { + "epoch": 0.1601577537631387, + "grad_norm": 0.40931230783462524, + "learning_rate": 0.00016800322333277012, + "loss": 1.2782, + "step": 12325 + }, + { + "epoch": 0.16017074830705458, + "grad_norm": 0.3512624204158783, + "learning_rate": 0.00016800062387085872, + "loss": 1.3011, + "step": 12326 + }, + { + "epoch": 0.16018374285097045, + "grad_norm": 0.3761516511440277, + "learning_rate": 0.00016799802440894737, + "loss": 1.4862, + "step": 12327 + }, + { + "epoch": 0.16019673739488632, + "grad_norm": 0.32938069105148315, + "learning_rate": 0.00016799542494703597, + "loss": 1.37, + "step": 12328 + }, + { + "epoch": 0.1602097319388022, + "grad_norm": 0.4327149987220764, + "learning_rate": 0.0001679928254851246, + "loss": 1.3948, + "step": 12329 + }, + { + "epoch": 0.16022272648271807, + "grad_norm": 0.3925078511238098, + "learning_rate": 0.0001679902260232132, + "loss": 1.4822, + "step": 12330 + }, + { + "epoch": 0.16023572102663394, + "grad_norm": 0.36957135796546936, + "learning_rate": 0.00016798762656130181, + "loss": 1.3715, + "step": 12331 + }, + { + "epoch": 0.16024871557054982, + "grad_norm": 0.47471681237220764, + "learning_rate": 0.00016798502709939044, + "loss": 1.4818, + "step": 12332 + }, + { + "epoch": 0.1602617101144657, + "grad_norm": 0.4172595739364624, + "learning_rate": 0.00016798242763747904, + "loss": 1.6152, + "step": 12333 + }, + { + "epoch": 0.16027470465838156, + "grad_norm": 0.43155524134635925, + "learning_rate": 0.00016797982817556766, + "loss": 1.5537, + "step": 12334 + }, + { + "epoch": 0.16028769920229743, + "grad_norm": 0.319709450006485, + "learning_rate": 0.00016797722871365628, + "loss": 1.3403, + "step": 12335 + }, + { + "epoch": 0.1603006937462133, + "grad_norm": 0.37101930379867554, + "learning_rate": 0.0001679746292517449, + "loss": 1.3522, + "step": 12336 + }, + { + "epoch": 0.16031368829012918, + "grad_norm": 0.3466882109642029, + "learning_rate": 0.0001679720297898335, + "loss": 1.2713, + "step": 12337 + }, + { + "epoch": 0.16032668283404505, + "grad_norm": 0.4527888000011444, + "learning_rate": 0.0001679694303279221, + "loss": 1.5937, + "step": 12338 + }, + { + "epoch": 0.16033967737796093, + "grad_norm": 0.3342328369617462, + "learning_rate": 0.00016796683086601076, + "loss": 1.2712, + "step": 12339 + }, + { + "epoch": 0.1603526719218768, + "grad_norm": 0.448095440864563, + "learning_rate": 0.00016796423140409935, + "loss": 1.3891, + "step": 12340 + }, + { + "epoch": 0.16036566646579267, + "grad_norm": 0.4362563192844391, + "learning_rate": 0.00016796163194218798, + "loss": 1.2109, + "step": 12341 + }, + { + "epoch": 0.16037866100970855, + "grad_norm": 0.3978665769100189, + "learning_rate": 0.0001679590324802766, + "loss": 1.2336, + "step": 12342 + }, + { + "epoch": 0.16039165555362442, + "grad_norm": 0.5934082865715027, + "learning_rate": 0.0001679564330183652, + "loss": 1.5638, + "step": 12343 + }, + { + "epoch": 0.1604046500975403, + "grad_norm": 0.3339918851852417, + "learning_rate": 0.00016795383355645382, + "loss": 1.3778, + "step": 12344 + }, + { + "epoch": 0.16041764464145616, + "grad_norm": 0.4134880304336548, + "learning_rate": 0.00016795123409454242, + "loss": 1.5087, + "step": 12345 + }, + { + "epoch": 0.16043063918537204, + "grad_norm": 0.5018779039382935, + "learning_rate": 0.00016794863463263107, + "loss": 1.2578, + "step": 12346 + }, + { + "epoch": 0.1604436337292879, + "grad_norm": 0.3423566222190857, + "learning_rate": 0.00016794603517071967, + "loss": 1.3727, + "step": 12347 + }, + { + "epoch": 0.16045662827320378, + "grad_norm": 0.442096471786499, + "learning_rate": 0.0001679434357088083, + "loss": 1.6549, + "step": 12348 + }, + { + "epoch": 0.16046962281711966, + "grad_norm": 0.42707157135009766, + "learning_rate": 0.0001679408362468969, + "loss": 1.3967, + "step": 12349 + }, + { + "epoch": 0.16048261736103553, + "grad_norm": 0.34618017077445984, + "learning_rate": 0.00016793823678498552, + "loss": 1.3041, + "step": 12350 + }, + { + "epoch": 0.1604956119049514, + "grad_norm": 0.4824414551258087, + "learning_rate": 0.00016793563732307414, + "loss": 1.4628, + "step": 12351 + }, + { + "epoch": 0.16050860644886727, + "grad_norm": 0.41753262281417847, + "learning_rate": 0.00016793303786116274, + "loss": 1.3927, + "step": 12352 + }, + { + "epoch": 0.16052160099278315, + "grad_norm": 0.49709153175354004, + "learning_rate": 0.00016793043839925136, + "loss": 1.4807, + "step": 12353 + }, + { + "epoch": 0.16053459553669902, + "grad_norm": 0.4568771719932556, + "learning_rate": 0.00016792783893734, + "loss": 1.4595, + "step": 12354 + }, + { + "epoch": 0.1605475900806149, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00016792523947542858, + "loss": 1.5625, + "step": 12355 + }, + { + "epoch": 0.16056058462453077, + "grad_norm": 0.38529759645462036, + "learning_rate": 0.0001679226400135172, + "loss": 1.439, + "step": 12356 + }, + { + "epoch": 0.16057357916844664, + "grad_norm": 0.37635865807533264, + "learning_rate": 0.0001679200405516058, + "loss": 1.3404, + "step": 12357 + }, + { + "epoch": 0.1605865737123625, + "grad_norm": 0.42272937297821045, + "learning_rate": 0.00016791744108969446, + "loss": 1.5292, + "step": 12358 + }, + { + "epoch": 0.16059956825627839, + "grad_norm": 0.40684419870376587, + "learning_rate": 0.00016791484162778306, + "loss": 1.2902, + "step": 12359 + }, + { + "epoch": 0.16061256280019426, + "grad_norm": 0.4250040650367737, + "learning_rate": 0.00016791224216587168, + "loss": 1.4581, + "step": 12360 + }, + { + "epoch": 0.16062555734411013, + "grad_norm": 0.46793752908706665, + "learning_rate": 0.00016790964270396028, + "loss": 1.4947, + "step": 12361 + }, + { + "epoch": 0.160638551888026, + "grad_norm": 0.3696674406528473, + "learning_rate": 0.0001679070432420489, + "loss": 1.3373, + "step": 12362 + }, + { + "epoch": 0.16065154643194188, + "grad_norm": 0.3296555280685425, + "learning_rate": 0.00016790444378013753, + "loss": 1.47, + "step": 12363 + }, + { + "epoch": 0.16066454097585775, + "grad_norm": 0.37473469972610474, + "learning_rate": 0.00016790184431822612, + "loss": 1.4424, + "step": 12364 + }, + { + "epoch": 0.16067753551977362, + "grad_norm": 0.31683987379074097, + "learning_rate": 0.00016789924485631475, + "loss": 1.2346, + "step": 12365 + }, + { + "epoch": 0.1606905300636895, + "grad_norm": 0.32565101981163025, + "learning_rate": 0.00016789664539440337, + "loss": 1.3305, + "step": 12366 + }, + { + "epoch": 0.16070352460760537, + "grad_norm": 0.37582066655158997, + "learning_rate": 0.00016789404593249197, + "loss": 1.414, + "step": 12367 + }, + { + "epoch": 0.16071651915152124, + "grad_norm": 0.3694254159927368, + "learning_rate": 0.0001678914464705806, + "loss": 1.1283, + "step": 12368 + }, + { + "epoch": 0.16072951369543712, + "grad_norm": 0.38893142342567444, + "learning_rate": 0.0001678888470086692, + "loss": 1.2784, + "step": 12369 + }, + { + "epoch": 0.160742508239353, + "grad_norm": 0.3975680470466614, + "learning_rate": 0.00016788624754675784, + "loss": 1.475, + "step": 12370 + }, + { + "epoch": 0.1607555027832689, + "grad_norm": 0.4705055356025696, + "learning_rate": 0.00016788364808484644, + "loss": 1.5295, + "step": 12371 + }, + { + "epoch": 0.16076849732718476, + "grad_norm": 0.38787126541137695, + "learning_rate": 0.00016788104862293507, + "loss": 1.5269, + "step": 12372 + }, + { + "epoch": 0.16078149187110063, + "grad_norm": 0.44869711995124817, + "learning_rate": 0.00016787844916102366, + "loss": 1.5004, + "step": 12373 + }, + { + "epoch": 0.1607944864150165, + "grad_norm": 0.27974042296409607, + "learning_rate": 0.0001678758496991123, + "loss": 1.2975, + "step": 12374 + }, + { + "epoch": 0.16080748095893238, + "grad_norm": 0.38531556725502014, + "learning_rate": 0.0001678732502372009, + "loss": 1.5095, + "step": 12375 + }, + { + "epoch": 0.16082047550284825, + "grad_norm": 0.4017702341079712, + "learning_rate": 0.0001678706507752895, + "loss": 1.4914, + "step": 12376 + }, + { + "epoch": 0.16083347004676413, + "grad_norm": 0.4168252646923065, + "learning_rate": 0.00016786805131337816, + "loss": 1.6041, + "step": 12377 + }, + { + "epoch": 0.16084646459068, + "grad_norm": 0.35655486583709717, + "learning_rate": 0.00016786545185146676, + "loss": 1.6019, + "step": 12378 + }, + { + "epoch": 0.16085945913459587, + "grad_norm": 0.4495946764945984, + "learning_rate": 0.00016786285238955538, + "loss": 1.3708, + "step": 12379 + }, + { + "epoch": 0.16087245367851175, + "grad_norm": 0.4912163317203522, + "learning_rate": 0.00016786025292764398, + "loss": 1.4714, + "step": 12380 + }, + { + "epoch": 0.16088544822242762, + "grad_norm": 0.4345300495624542, + "learning_rate": 0.0001678576534657326, + "loss": 1.5672, + "step": 12381 + }, + { + "epoch": 0.1608984427663435, + "grad_norm": 0.38896751403808594, + "learning_rate": 0.00016785505400382123, + "loss": 1.6449, + "step": 12382 + }, + { + "epoch": 0.16091143731025936, + "grad_norm": 0.3724277913570404, + "learning_rate": 0.00016785245454190983, + "loss": 1.3915, + "step": 12383 + }, + { + "epoch": 0.16092443185417524, + "grad_norm": 0.3206885755062103, + "learning_rate": 0.00016784985507999845, + "loss": 1.2872, + "step": 12384 + }, + { + "epoch": 0.1609374263980911, + "grad_norm": 0.4560156762599945, + "learning_rate": 0.00016784725561808708, + "loss": 1.535, + "step": 12385 + }, + { + "epoch": 0.16095042094200698, + "grad_norm": 0.4259679615497589, + "learning_rate": 0.00016784465615617567, + "loss": 1.4153, + "step": 12386 + }, + { + "epoch": 0.16096341548592286, + "grad_norm": 0.4947696626186371, + "learning_rate": 0.0001678420566942643, + "loss": 1.3949, + "step": 12387 + }, + { + "epoch": 0.16097641002983873, + "grad_norm": 0.41922178864479065, + "learning_rate": 0.0001678394572323529, + "loss": 1.3765, + "step": 12388 + }, + { + "epoch": 0.1609894045737546, + "grad_norm": 0.41384056210517883, + "learning_rate": 0.00016783685777044155, + "loss": 1.6216, + "step": 12389 + }, + { + "epoch": 0.16100239911767047, + "grad_norm": 0.43948566913604736, + "learning_rate": 0.00016783425830853014, + "loss": 1.6283, + "step": 12390 + }, + { + "epoch": 0.16101539366158635, + "grad_norm": 0.47521379590034485, + "learning_rate": 0.00016783165884661877, + "loss": 1.5599, + "step": 12391 + }, + { + "epoch": 0.16102838820550222, + "grad_norm": 0.3941165804862976, + "learning_rate": 0.00016782905938470737, + "loss": 1.4446, + "step": 12392 + }, + { + "epoch": 0.1610413827494181, + "grad_norm": 0.33074066042900085, + "learning_rate": 0.000167826459922796, + "loss": 1.2219, + "step": 12393 + }, + { + "epoch": 0.16105437729333397, + "grad_norm": 0.38846495747566223, + "learning_rate": 0.00016782386046088461, + "loss": 1.3052, + "step": 12394 + }, + { + "epoch": 0.16106737183724984, + "grad_norm": 0.3925749957561493, + "learning_rate": 0.0001678212609989732, + "loss": 1.6176, + "step": 12395 + }, + { + "epoch": 0.1610803663811657, + "grad_norm": 0.3784163296222687, + "learning_rate": 0.00016781866153706184, + "loss": 1.391, + "step": 12396 + }, + { + "epoch": 0.16109336092508159, + "grad_norm": 0.44950613379478455, + "learning_rate": 0.00016781606207515046, + "loss": 1.3998, + "step": 12397 + }, + { + "epoch": 0.16110635546899746, + "grad_norm": 0.16205565631389618, + "learning_rate": 0.00016781346261323906, + "loss": 1.2534, + "step": 12398 + }, + { + "epoch": 0.16111935001291333, + "grad_norm": 0.361299991607666, + "learning_rate": 0.00016781086315132768, + "loss": 1.5207, + "step": 12399 + }, + { + "epoch": 0.1611323445568292, + "grad_norm": 0.3345767855644226, + "learning_rate": 0.00016780826368941628, + "loss": 1.5089, + "step": 12400 + }, + { + "epoch": 0.16114533910074508, + "grad_norm": 0.41074296832084656, + "learning_rate": 0.00016780566422750493, + "loss": 1.5026, + "step": 12401 + }, + { + "epoch": 0.16115833364466095, + "grad_norm": 0.4379768967628479, + "learning_rate": 0.00016780306476559353, + "loss": 1.3023, + "step": 12402 + }, + { + "epoch": 0.16117132818857682, + "grad_norm": 0.4849367141723633, + "learning_rate": 0.00016780046530368215, + "loss": 1.3242, + "step": 12403 + }, + { + "epoch": 0.1611843227324927, + "grad_norm": 0.3181443214416504, + "learning_rate": 0.00016779786584177075, + "loss": 1.4434, + "step": 12404 + }, + { + "epoch": 0.16119731727640857, + "grad_norm": 0.47958919405937195, + "learning_rate": 0.00016779526637985938, + "loss": 1.4558, + "step": 12405 + }, + { + "epoch": 0.16121031182032444, + "grad_norm": 0.3560980558395386, + "learning_rate": 0.000167792666917948, + "loss": 1.4342, + "step": 12406 + }, + { + "epoch": 0.16122330636424032, + "grad_norm": 0.4329281449317932, + "learning_rate": 0.0001677900674560366, + "loss": 1.3874, + "step": 12407 + }, + { + "epoch": 0.1612363009081562, + "grad_norm": 0.4536091983318329, + "learning_rate": 0.00016778746799412522, + "loss": 1.5581, + "step": 12408 + }, + { + "epoch": 0.16124929545207206, + "grad_norm": 0.3778620958328247, + "learning_rate": 0.00016778486853221385, + "loss": 1.3726, + "step": 12409 + }, + { + "epoch": 0.16126228999598793, + "grad_norm": 0.3825274109840393, + "learning_rate": 0.00016778226907030244, + "loss": 1.2303, + "step": 12410 + }, + { + "epoch": 0.1612752845399038, + "grad_norm": 0.5703203678131104, + "learning_rate": 0.00016777966960839107, + "loss": 1.5041, + "step": 12411 + }, + { + "epoch": 0.16128827908381968, + "grad_norm": 0.41571110486984253, + "learning_rate": 0.00016777707014647967, + "loss": 1.2696, + "step": 12412 + }, + { + "epoch": 0.16130127362773555, + "grad_norm": 0.4587453305721283, + "learning_rate": 0.00016777447068456832, + "loss": 1.5076, + "step": 12413 + }, + { + "epoch": 0.16131426817165143, + "grad_norm": 0.39475521445274353, + "learning_rate": 0.00016777187122265691, + "loss": 1.5531, + "step": 12414 + }, + { + "epoch": 0.1613272627155673, + "grad_norm": 0.44247695803642273, + "learning_rate": 0.00016776927176074554, + "loss": 1.4369, + "step": 12415 + }, + { + "epoch": 0.16134025725948317, + "grad_norm": 0.5244262218475342, + "learning_rate": 0.00016776667229883416, + "loss": 1.456, + "step": 12416 + }, + { + "epoch": 0.16135325180339904, + "grad_norm": 0.39449018239974976, + "learning_rate": 0.00016776407283692276, + "loss": 1.3718, + "step": 12417 + }, + { + "epoch": 0.16136624634731492, + "grad_norm": 0.3736203908920288, + "learning_rate": 0.00016776147337501139, + "loss": 1.5875, + "step": 12418 + }, + { + "epoch": 0.1613792408912308, + "grad_norm": 0.3177570402622223, + "learning_rate": 0.00016775887391309998, + "loss": 1.2373, + "step": 12419 + }, + { + "epoch": 0.16139223543514666, + "grad_norm": 0.3923036456108093, + "learning_rate": 0.00016775627445118863, + "loss": 1.4983, + "step": 12420 + }, + { + "epoch": 0.16140522997906254, + "grad_norm": 0.31973084807395935, + "learning_rate": 0.00016775367498927723, + "loss": 1.274, + "step": 12421 + }, + { + "epoch": 0.1614182245229784, + "grad_norm": 0.5027869939804077, + "learning_rate": 0.00016775107552736583, + "loss": 1.376, + "step": 12422 + }, + { + "epoch": 0.16143121906689428, + "grad_norm": 0.3654315173625946, + "learning_rate": 0.00016774847606545445, + "loss": 1.2902, + "step": 12423 + }, + { + "epoch": 0.16144421361081016, + "grad_norm": 0.42000365257263184, + "learning_rate": 0.00016774587660354308, + "loss": 1.3067, + "step": 12424 + }, + { + "epoch": 0.16145720815472603, + "grad_norm": 0.37522441148757935, + "learning_rate": 0.0001677432771416317, + "loss": 1.4023, + "step": 12425 + }, + { + "epoch": 0.1614702026986419, + "grad_norm": 0.4151870012283325, + "learning_rate": 0.0001677406776797203, + "loss": 1.4056, + "step": 12426 + }, + { + "epoch": 0.16148319724255777, + "grad_norm": 0.32252201437950134, + "learning_rate": 0.00016773807821780892, + "loss": 1.5134, + "step": 12427 + }, + { + "epoch": 0.16149619178647365, + "grad_norm": 0.3820458650588989, + "learning_rate": 0.00016773547875589755, + "loss": 1.4787, + "step": 12428 + }, + { + "epoch": 0.16150918633038952, + "grad_norm": 0.41924530267715454, + "learning_rate": 0.00016773287929398615, + "loss": 1.1841, + "step": 12429 + }, + { + "epoch": 0.1615221808743054, + "grad_norm": 0.35878345370292664, + "learning_rate": 0.00016773027983207477, + "loss": 1.3852, + "step": 12430 + }, + { + "epoch": 0.16153517541822127, + "grad_norm": 0.36791571974754333, + "learning_rate": 0.00016772768037016337, + "loss": 1.4378, + "step": 12431 + }, + { + "epoch": 0.16154816996213714, + "grad_norm": 0.3435977101325989, + "learning_rate": 0.00016772508090825202, + "loss": 1.2819, + "step": 12432 + }, + { + "epoch": 0.161561164506053, + "grad_norm": 0.42321762442588806, + "learning_rate": 0.00016772248144634062, + "loss": 1.3297, + "step": 12433 + }, + { + "epoch": 0.16157415904996889, + "grad_norm": 0.37435030937194824, + "learning_rate": 0.00016771988198442924, + "loss": 1.3103, + "step": 12434 + }, + { + "epoch": 0.16158715359388476, + "grad_norm": 0.38132244348526, + "learning_rate": 0.00016771728252251784, + "loss": 1.4204, + "step": 12435 + }, + { + "epoch": 0.16160014813780063, + "grad_norm": 0.3555836081504822, + "learning_rate": 0.00016771468306060646, + "loss": 1.3865, + "step": 12436 + }, + { + "epoch": 0.1616131426817165, + "grad_norm": 0.3584928512573242, + "learning_rate": 0.0001677120835986951, + "loss": 1.583, + "step": 12437 + }, + { + "epoch": 0.16162613722563238, + "grad_norm": 0.5064800977706909, + "learning_rate": 0.00016770948413678369, + "loss": 1.4038, + "step": 12438 + }, + { + "epoch": 0.16163913176954825, + "grad_norm": 0.44835808873176575, + "learning_rate": 0.0001677068846748723, + "loss": 1.3392, + "step": 12439 + }, + { + "epoch": 0.16165212631346412, + "grad_norm": 0.49193501472473145, + "learning_rate": 0.00016770428521296093, + "loss": 1.5003, + "step": 12440 + }, + { + "epoch": 0.16166512085738, + "grad_norm": 0.4471583664417267, + "learning_rate": 0.00016770168575104953, + "loss": 1.4338, + "step": 12441 + }, + { + "epoch": 0.16167811540129587, + "grad_norm": 0.3750327527523041, + "learning_rate": 0.00016769908628913816, + "loss": 1.322, + "step": 12442 + }, + { + "epoch": 0.16169110994521174, + "grad_norm": 0.4402744472026825, + "learning_rate": 0.00016769648682722675, + "loss": 1.2689, + "step": 12443 + }, + { + "epoch": 0.16170410448912761, + "grad_norm": 0.3910582363605499, + "learning_rate": 0.0001676938873653154, + "loss": 1.4286, + "step": 12444 + }, + { + "epoch": 0.1617170990330435, + "grad_norm": 0.40360188484191895, + "learning_rate": 0.000167691287903404, + "loss": 1.4026, + "step": 12445 + }, + { + "epoch": 0.16173009357695936, + "grad_norm": 0.39934396743774414, + "learning_rate": 0.00016768868844149263, + "loss": 1.3837, + "step": 12446 + }, + { + "epoch": 0.16174308812087526, + "grad_norm": 0.38112446665763855, + "learning_rate": 0.00016768608897958122, + "loss": 1.3256, + "step": 12447 + }, + { + "epoch": 0.16175608266479113, + "grad_norm": 0.3562523126602173, + "learning_rate": 0.00016768348951766985, + "loss": 1.4838, + "step": 12448 + }, + { + "epoch": 0.161769077208707, + "grad_norm": 0.31360092759132385, + "learning_rate": 0.00016768089005575847, + "loss": 1.405, + "step": 12449 + }, + { + "epoch": 0.16178207175262288, + "grad_norm": 0.3318835198879242, + "learning_rate": 0.00016767829059384707, + "loss": 1.341, + "step": 12450 + }, + { + "epoch": 0.16179506629653875, + "grad_norm": 0.3418152630329132, + "learning_rate": 0.0001676756911319357, + "loss": 1.6521, + "step": 12451 + }, + { + "epoch": 0.16180806084045463, + "grad_norm": 0.5165520310401917, + "learning_rate": 0.00016767309167002432, + "loss": 1.6486, + "step": 12452 + }, + { + "epoch": 0.1618210553843705, + "grad_norm": 0.4152046740055084, + "learning_rate": 0.00016767049220811292, + "loss": 1.5561, + "step": 12453 + }, + { + "epoch": 0.16183404992828637, + "grad_norm": 0.27496403455734253, + "learning_rate": 0.00016766789274620154, + "loss": 1.2952, + "step": 12454 + }, + { + "epoch": 0.16184704447220224, + "grad_norm": 0.3715158700942993, + "learning_rate": 0.00016766529328429017, + "loss": 1.344, + "step": 12455 + }, + { + "epoch": 0.16186003901611812, + "grad_norm": 0.2977321445941925, + "learning_rate": 0.0001676626938223788, + "loss": 1.2016, + "step": 12456 + }, + { + "epoch": 0.161873033560034, + "grad_norm": 0.3309260904788971, + "learning_rate": 0.0001676600943604674, + "loss": 1.2894, + "step": 12457 + }, + { + "epoch": 0.16188602810394986, + "grad_norm": 0.3265513777732849, + "learning_rate": 0.000167657494898556, + "loss": 1.5233, + "step": 12458 + }, + { + "epoch": 0.16189902264786574, + "grad_norm": 0.4204879403114319, + "learning_rate": 0.00016765489543664464, + "loss": 1.3116, + "step": 12459 + }, + { + "epoch": 0.1619120171917816, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.00016765229597473323, + "loss": 1.6288, + "step": 12460 + }, + { + "epoch": 0.16192501173569748, + "grad_norm": 0.4077003598213196, + "learning_rate": 0.00016764969651282186, + "loss": 1.2719, + "step": 12461 + }, + { + "epoch": 0.16193800627961336, + "grad_norm": 0.35277050733566284, + "learning_rate": 0.00016764709705091046, + "loss": 1.5044, + "step": 12462 + }, + { + "epoch": 0.16195100082352923, + "grad_norm": 0.35302749276161194, + "learning_rate": 0.0001676444975889991, + "loss": 1.1694, + "step": 12463 + }, + { + "epoch": 0.1619639953674451, + "grad_norm": 0.39460182189941406, + "learning_rate": 0.0001676418981270877, + "loss": 1.2709, + "step": 12464 + }, + { + "epoch": 0.16197698991136097, + "grad_norm": 0.3833228349685669, + "learning_rate": 0.0001676392986651763, + "loss": 1.4144, + "step": 12465 + }, + { + "epoch": 0.16198998445527685, + "grad_norm": 0.42872926592826843, + "learning_rate": 0.00016763669920326493, + "loss": 1.4237, + "step": 12466 + }, + { + "epoch": 0.16200297899919272, + "grad_norm": 0.387791246175766, + "learning_rate": 0.00016763409974135355, + "loss": 1.3357, + "step": 12467 + }, + { + "epoch": 0.1620159735431086, + "grad_norm": 0.34608373045921326, + "learning_rate": 0.00016763150027944218, + "loss": 1.3956, + "step": 12468 + }, + { + "epoch": 0.16202896808702447, + "grad_norm": 0.3558301031589508, + "learning_rate": 0.00016762890081753077, + "loss": 1.4846, + "step": 12469 + }, + { + "epoch": 0.16204196263094034, + "grad_norm": 0.4031662046909332, + "learning_rate": 0.0001676263013556194, + "loss": 1.4586, + "step": 12470 + }, + { + "epoch": 0.1620549571748562, + "grad_norm": 0.2614748775959015, + "learning_rate": 0.00016762370189370802, + "loss": 1.3889, + "step": 12471 + }, + { + "epoch": 0.16206795171877209, + "grad_norm": 0.4343404471874237, + "learning_rate": 0.00016762110243179662, + "loss": 1.5688, + "step": 12472 + }, + { + "epoch": 0.16208094626268796, + "grad_norm": 0.38676416873931885, + "learning_rate": 0.00016761850296988524, + "loss": 1.3723, + "step": 12473 + }, + { + "epoch": 0.16209394080660383, + "grad_norm": 0.34105879068374634, + "learning_rate": 0.00016761590350797384, + "loss": 1.4464, + "step": 12474 + }, + { + "epoch": 0.1621069353505197, + "grad_norm": 0.4008832573890686, + "learning_rate": 0.0001676133040460625, + "loss": 1.4875, + "step": 12475 + }, + { + "epoch": 0.16211992989443558, + "grad_norm": 0.3638954162597656, + "learning_rate": 0.0001676107045841511, + "loss": 1.3394, + "step": 12476 + }, + { + "epoch": 0.16213292443835145, + "grad_norm": 0.2658711373806, + "learning_rate": 0.0001676081051222397, + "loss": 1.2147, + "step": 12477 + }, + { + "epoch": 0.16214591898226732, + "grad_norm": 0.33120301365852356, + "learning_rate": 0.0001676055056603283, + "loss": 1.44, + "step": 12478 + }, + { + "epoch": 0.1621589135261832, + "grad_norm": 0.42474862933158875, + "learning_rate": 0.00016760290619841694, + "loss": 1.4726, + "step": 12479 + }, + { + "epoch": 0.16217190807009907, + "grad_norm": 0.43191882967948914, + "learning_rate": 0.00016760030673650556, + "loss": 1.3718, + "step": 12480 + }, + { + "epoch": 0.16218490261401494, + "grad_norm": 0.27691856026649475, + "learning_rate": 0.00016759770727459416, + "loss": 1.3691, + "step": 12481 + }, + { + "epoch": 0.16219789715793081, + "grad_norm": 0.4043032228946686, + "learning_rate": 0.00016759510781268278, + "loss": 1.524, + "step": 12482 + }, + { + "epoch": 0.1622108917018467, + "grad_norm": 0.36862054467201233, + "learning_rate": 0.0001675925083507714, + "loss": 1.361, + "step": 12483 + }, + { + "epoch": 0.16222388624576256, + "grad_norm": 0.4421246647834778, + "learning_rate": 0.00016758990888886, + "loss": 1.6674, + "step": 12484 + }, + { + "epoch": 0.16223688078967843, + "grad_norm": 0.38296762108802795, + "learning_rate": 0.00016758730942694863, + "loss": 1.3305, + "step": 12485 + }, + { + "epoch": 0.1622498753335943, + "grad_norm": 0.45953384041786194, + "learning_rate": 0.00016758470996503723, + "loss": 1.4367, + "step": 12486 + }, + { + "epoch": 0.16226286987751018, + "grad_norm": 0.2831425368785858, + "learning_rate": 0.00016758211050312588, + "loss": 1.2269, + "step": 12487 + }, + { + "epoch": 0.16227586442142605, + "grad_norm": 0.4589655101299286, + "learning_rate": 0.00016757951104121448, + "loss": 1.5357, + "step": 12488 + }, + { + "epoch": 0.16228885896534193, + "grad_norm": 0.42549723386764526, + "learning_rate": 0.00016757691157930307, + "loss": 1.4394, + "step": 12489 + }, + { + "epoch": 0.1623018535092578, + "grad_norm": 0.37913697957992554, + "learning_rate": 0.00016757431211739172, + "loss": 1.3221, + "step": 12490 + }, + { + "epoch": 0.16231484805317367, + "grad_norm": 0.39310240745544434, + "learning_rate": 0.00016757171265548032, + "loss": 1.4348, + "step": 12491 + }, + { + "epoch": 0.16232784259708954, + "grad_norm": 0.3992055058479309, + "learning_rate": 0.00016756911319356895, + "loss": 1.4737, + "step": 12492 + }, + { + "epoch": 0.16234083714100542, + "grad_norm": 0.40475642681121826, + "learning_rate": 0.00016756651373165754, + "loss": 1.4655, + "step": 12493 + }, + { + "epoch": 0.1623538316849213, + "grad_norm": 0.5873568058013916, + "learning_rate": 0.00016756391426974617, + "loss": 1.6717, + "step": 12494 + }, + { + "epoch": 0.16236682622883716, + "grad_norm": 0.41350436210632324, + "learning_rate": 0.0001675613148078348, + "loss": 1.5462, + "step": 12495 + }, + { + "epoch": 0.16237982077275304, + "grad_norm": 0.34503740072250366, + "learning_rate": 0.0001675587153459234, + "loss": 1.4019, + "step": 12496 + }, + { + "epoch": 0.1623928153166689, + "grad_norm": 0.6243388056755066, + "learning_rate": 0.00016755611588401201, + "loss": 1.3853, + "step": 12497 + }, + { + "epoch": 0.16240580986058478, + "grad_norm": 0.6286265254020691, + "learning_rate": 0.00016755351642210064, + "loss": 1.5795, + "step": 12498 + }, + { + "epoch": 0.16241880440450066, + "grad_norm": 0.43260112404823303, + "learning_rate": 0.00016755091696018926, + "loss": 1.4648, + "step": 12499 + }, + { + "epoch": 0.16243179894841653, + "grad_norm": 0.4709896147251129, + "learning_rate": 0.00016754831749827786, + "loss": 1.5809, + "step": 12500 + }, + { + "epoch": 0.1624447934923324, + "grad_norm": 0.3622850775718689, + "learning_rate": 0.00016754571803636649, + "loss": 1.4312, + "step": 12501 + }, + { + "epoch": 0.16245778803624827, + "grad_norm": 0.373507022857666, + "learning_rate": 0.0001675431185744551, + "loss": 1.4647, + "step": 12502 + }, + { + "epoch": 0.16247078258016415, + "grad_norm": 0.37186363339424133, + "learning_rate": 0.0001675405191125437, + "loss": 1.4544, + "step": 12503 + }, + { + "epoch": 0.16248377712408002, + "grad_norm": 0.4374506175518036, + "learning_rate": 0.00016753791965063233, + "loss": 1.5011, + "step": 12504 + }, + { + "epoch": 0.1624967716679959, + "grad_norm": 0.3648233413696289, + "learning_rate": 0.00016753532018872093, + "loss": 1.4016, + "step": 12505 + }, + { + "epoch": 0.16250976621191177, + "grad_norm": 0.40256452560424805, + "learning_rate": 0.00016753272072680955, + "loss": 1.4461, + "step": 12506 + }, + { + "epoch": 0.16252276075582764, + "grad_norm": 0.3151097595691681, + "learning_rate": 0.00016753012126489818, + "loss": 1.4472, + "step": 12507 + }, + { + "epoch": 0.1625357552997435, + "grad_norm": 0.5215485095977783, + "learning_rate": 0.00016752752180298678, + "loss": 1.5156, + "step": 12508 + }, + { + "epoch": 0.16254874984365938, + "grad_norm": 0.3705575466156006, + "learning_rate": 0.0001675249223410754, + "loss": 1.5263, + "step": 12509 + }, + { + "epoch": 0.16256174438757526, + "grad_norm": 0.3705456852912903, + "learning_rate": 0.00016752232287916402, + "loss": 1.5039, + "step": 12510 + }, + { + "epoch": 0.16257473893149113, + "grad_norm": 0.3548305034637451, + "learning_rate": 0.00016751972341725265, + "loss": 1.4415, + "step": 12511 + }, + { + "epoch": 0.162587733475407, + "grad_norm": 0.4036993086338043, + "learning_rate": 0.00016751712395534125, + "loss": 1.3671, + "step": 12512 + }, + { + "epoch": 0.16260072801932288, + "grad_norm": 0.4407697021961212, + "learning_rate": 0.00016751452449342987, + "loss": 1.4945, + "step": 12513 + }, + { + "epoch": 0.16261372256323875, + "grad_norm": 0.4536076784133911, + "learning_rate": 0.0001675119250315185, + "loss": 1.6528, + "step": 12514 + }, + { + "epoch": 0.16262671710715462, + "grad_norm": 0.34685930609703064, + "learning_rate": 0.0001675093255696071, + "loss": 1.2501, + "step": 12515 + }, + { + "epoch": 0.1626397116510705, + "grad_norm": 0.3214898407459259, + "learning_rate": 0.00016750672610769572, + "loss": 1.3126, + "step": 12516 + }, + { + "epoch": 0.16265270619498637, + "grad_norm": 0.4461318552494049, + "learning_rate": 0.00016750412664578431, + "loss": 1.6137, + "step": 12517 + }, + { + "epoch": 0.16266570073890224, + "grad_norm": 0.38026106357574463, + "learning_rate": 0.00016750152718387297, + "loss": 1.2828, + "step": 12518 + }, + { + "epoch": 0.16267869528281811, + "grad_norm": 0.42775630950927734, + "learning_rate": 0.00016749892772196156, + "loss": 1.3382, + "step": 12519 + }, + { + "epoch": 0.162691689826734, + "grad_norm": 0.31045591831207275, + "learning_rate": 0.00016749632826005016, + "loss": 1.2946, + "step": 12520 + }, + { + "epoch": 0.16270468437064986, + "grad_norm": 0.4147631525993347, + "learning_rate": 0.00016749372879813879, + "loss": 1.5143, + "step": 12521 + }, + { + "epoch": 0.16271767891456573, + "grad_norm": 0.4874553680419922, + "learning_rate": 0.0001674911293362274, + "loss": 1.5505, + "step": 12522 + }, + { + "epoch": 0.16273067345848163, + "grad_norm": 0.5340845584869385, + "learning_rate": 0.00016748852987431603, + "loss": 1.5953, + "step": 12523 + }, + { + "epoch": 0.1627436680023975, + "grad_norm": 0.4391802251338959, + "learning_rate": 0.00016748593041240463, + "loss": 1.2792, + "step": 12524 + }, + { + "epoch": 0.16275666254631338, + "grad_norm": 0.48946189880371094, + "learning_rate": 0.00016748333095049326, + "loss": 1.5752, + "step": 12525 + }, + { + "epoch": 0.16276965709022925, + "grad_norm": 0.4446520209312439, + "learning_rate": 0.00016748073148858188, + "loss": 1.3333, + "step": 12526 + }, + { + "epoch": 0.16278265163414513, + "grad_norm": 0.31156834959983826, + "learning_rate": 0.00016747813202667048, + "loss": 1.4307, + "step": 12527 + }, + { + "epoch": 0.162795646178061, + "grad_norm": 0.42377573251724243, + "learning_rate": 0.0001674755325647591, + "loss": 1.5881, + "step": 12528 + }, + { + "epoch": 0.16280864072197687, + "grad_norm": 0.3825845420360565, + "learning_rate": 0.00016747293310284773, + "loss": 1.4812, + "step": 12529 + }, + { + "epoch": 0.16282163526589274, + "grad_norm": 0.3442310094833374, + "learning_rate": 0.00016747033364093635, + "loss": 1.4224, + "step": 12530 + }, + { + "epoch": 0.16283462980980862, + "grad_norm": 0.24969570338726044, + "learning_rate": 0.00016746773417902495, + "loss": 1.2607, + "step": 12531 + }, + { + "epoch": 0.1628476243537245, + "grad_norm": 0.4235866963863373, + "learning_rate": 0.00016746513471711355, + "loss": 1.4143, + "step": 12532 + }, + { + "epoch": 0.16286061889764036, + "grad_norm": 0.3431743085384369, + "learning_rate": 0.0001674625352552022, + "loss": 1.2748, + "step": 12533 + }, + { + "epoch": 0.16287361344155624, + "grad_norm": 0.3926869034767151, + "learning_rate": 0.0001674599357932908, + "loss": 1.2887, + "step": 12534 + }, + { + "epoch": 0.1628866079854721, + "grad_norm": 0.2974802553653717, + "learning_rate": 0.00016745733633137942, + "loss": 1.2394, + "step": 12535 + }, + { + "epoch": 0.16289960252938798, + "grad_norm": 0.4136240482330322, + "learning_rate": 0.00016745473686946802, + "loss": 1.3044, + "step": 12536 + }, + { + "epoch": 0.16291259707330386, + "grad_norm": 0.3392886519432068, + "learning_rate": 0.00016745213740755664, + "loss": 1.5434, + "step": 12537 + }, + { + "epoch": 0.16292559161721973, + "grad_norm": 0.35140377283096313, + "learning_rate": 0.00016744953794564527, + "loss": 1.4271, + "step": 12538 + }, + { + "epoch": 0.1629385861611356, + "grad_norm": 0.4330267012119293, + "learning_rate": 0.00016744693848373386, + "loss": 1.2443, + "step": 12539 + }, + { + "epoch": 0.16295158070505147, + "grad_norm": 0.4170609712600708, + "learning_rate": 0.0001674443390218225, + "loss": 1.4508, + "step": 12540 + }, + { + "epoch": 0.16296457524896735, + "grad_norm": 0.4131290316581726, + "learning_rate": 0.0001674417395599111, + "loss": 1.3705, + "step": 12541 + }, + { + "epoch": 0.16297756979288322, + "grad_norm": 0.3768766224384308, + "learning_rate": 0.00016743914009799974, + "loss": 1.3165, + "step": 12542 + }, + { + "epoch": 0.1629905643367991, + "grad_norm": 0.41880467534065247, + "learning_rate": 0.00016743654063608833, + "loss": 1.2335, + "step": 12543 + }, + { + "epoch": 0.16300355888071497, + "grad_norm": 0.4210159480571747, + "learning_rate": 0.00016743394117417693, + "loss": 1.6584, + "step": 12544 + }, + { + "epoch": 0.16301655342463084, + "grad_norm": 0.464741051197052, + "learning_rate": 0.00016743134171226558, + "loss": 1.4828, + "step": 12545 + }, + { + "epoch": 0.1630295479685467, + "grad_norm": 0.24336490035057068, + "learning_rate": 0.00016742874225035418, + "loss": 1.5262, + "step": 12546 + }, + { + "epoch": 0.16304254251246258, + "grad_norm": 0.406823992729187, + "learning_rate": 0.0001674261427884428, + "loss": 1.4437, + "step": 12547 + }, + { + "epoch": 0.16305553705637846, + "grad_norm": 0.30817320942878723, + "learning_rate": 0.0001674235433265314, + "loss": 1.3974, + "step": 12548 + }, + { + "epoch": 0.16306853160029433, + "grad_norm": 0.39377352595329285, + "learning_rate": 0.00016742094386462003, + "loss": 1.2797, + "step": 12549 + }, + { + "epoch": 0.1630815261442102, + "grad_norm": 0.38271719217300415, + "learning_rate": 0.00016741834440270865, + "loss": 1.505, + "step": 12550 + }, + { + "epoch": 0.16309452068812608, + "grad_norm": 0.48419830203056335, + "learning_rate": 0.00016741574494079725, + "loss": 1.4236, + "step": 12551 + }, + { + "epoch": 0.16310751523204195, + "grad_norm": 0.37128719687461853, + "learning_rate": 0.00016741314547888587, + "loss": 1.3847, + "step": 12552 + }, + { + "epoch": 0.16312050977595782, + "grad_norm": 0.339509516954422, + "learning_rate": 0.0001674105460169745, + "loss": 1.2947, + "step": 12553 + }, + { + "epoch": 0.1631335043198737, + "grad_norm": 0.3616383969783783, + "learning_rate": 0.00016740794655506312, + "loss": 1.4312, + "step": 12554 + }, + { + "epoch": 0.16314649886378957, + "grad_norm": 0.48535311222076416, + "learning_rate": 0.00016740534709315172, + "loss": 1.4295, + "step": 12555 + }, + { + "epoch": 0.16315949340770544, + "grad_norm": 0.540007472038269, + "learning_rate": 0.00016740274763124034, + "loss": 1.5298, + "step": 12556 + }, + { + "epoch": 0.16317248795162131, + "grad_norm": 0.36977851390838623, + "learning_rate": 0.00016740014816932897, + "loss": 1.3399, + "step": 12557 + }, + { + "epoch": 0.1631854824955372, + "grad_norm": 0.4357253313064575, + "learning_rate": 0.00016739754870741757, + "loss": 1.3493, + "step": 12558 + }, + { + "epoch": 0.16319847703945306, + "grad_norm": 0.3888311982154846, + "learning_rate": 0.0001673949492455062, + "loss": 1.414, + "step": 12559 + }, + { + "epoch": 0.16321147158336893, + "grad_norm": 0.42950886487960815, + "learning_rate": 0.0001673923497835948, + "loss": 1.5973, + "step": 12560 + }, + { + "epoch": 0.1632244661272848, + "grad_norm": 0.36851391196250916, + "learning_rate": 0.0001673897503216834, + "loss": 1.4451, + "step": 12561 + }, + { + "epoch": 0.16323746067120068, + "grad_norm": 0.4680193066596985, + "learning_rate": 0.00016738715085977204, + "loss": 1.398, + "step": 12562 + }, + { + "epoch": 0.16325045521511655, + "grad_norm": 0.3904152512550354, + "learning_rate": 0.00016738455139786063, + "loss": 1.2553, + "step": 12563 + }, + { + "epoch": 0.16326344975903243, + "grad_norm": 0.4095899164676666, + "learning_rate": 0.00016738195193594929, + "loss": 1.5629, + "step": 12564 + }, + { + "epoch": 0.1632764443029483, + "grad_norm": 0.3752861022949219, + "learning_rate": 0.00016737935247403788, + "loss": 1.3462, + "step": 12565 + }, + { + "epoch": 0.16328943884686417, + "grad_norm": 0.46612271666526794, + "learning_rate": 0.0001673767530121265, + "loss": 1.4625, + "step": 12566 + }, + { + "epoch": 0.16330243339078004, + "grad_norm": 0.38140806555747986, + "learning_rate": 0.0001673741535502151, + "loss": 1.1929, + "step": 12567 + }, + { + "epoch": 0.16331542793469592, + "grad_norm": 0.3754453957080841, + "learning_rate": 0.00016737155408830373, + "loss": 1.5937, + "step": 12568 + }, + { + "epoch": 0.1633284224786118, + "grad_norm": 0.2939801514148712, + "learning_rate": 0.00016736895462639235, + "loss": 1.1274, + "step": 12569 + }, + { + "epoch": 0.16334141702252766, + "grad_norm": 0.3077393174171448, + "learning_rate": 0.00016736635516448095, + "loss": 1.3211, + "step": 12570 + }, + { + "epoch": 0.16335441156644354, + "grad_norm": 0.30053406953811646, + "learning_rate": 0.00016736375570256958, + "loss": 1.4289, + "step": 12571 + }, + { + "epoch": 0.1633674061103594, + "grad_norm": 0.4110419750213623, + "learning_rate": 0.0001673611562406582, + "loss": 1.2429, + "step": 12572 + }, + { + "epoch": 0.16338040065427528, + "grad_norm": 0.4024268090724945, + "learning_rate": 0.0001673585567787468, + "loss": 1.4757, + "step": 12573 + }, + { + "epoch": 0.16339339519819115, + "grad_norm": 0.42453181743621826, + "learning_rate": 0.00016735595731683542, + "loss": 1.5472, + "step": 12574 + }, + { + "epoch": 0.16340638974210703, + "grad_norm": 0.41031795740127563, + "learning_rate": 0.00016735335785492402, + "loss": 1.5385, + "step": 12575 + }, + { + "epoch": 0.1634193842860229, + "grad_norm": 0.6229802966117859, + "learning_rate": 0.00016735075839301267, + "loss": 1.5193, + "step": 12576 + }, + { + "epoch": 0.16343237882993877, + "grad_norm": 0.45157402753829956, + "learning_rate": 0.00016734815893110127, + "loss": 1.4181, + "step": 12577 + }, + { + "epoch": 0.16344537337385465, + "grad_norm": 0.32237404584884644, + "learning_rate": 0.0001673455594691899, + "loss": 1.4816, + "step": 12578 + }, + { + "epoch": 0.16345836791777052, + "grad_norm": 0.4542950391769409, + "learning_rate": 0.0001673429600072785, + "loss": 1.7294, + "step": 12579 + }, + { + "epoch": 0.1634713624616864, + "grad_norm": 0.462037056684494, + "learning_rate": 0.00016734036054536712, + "loss": 1.4178, + "step": 12580 + }, + { + "epoch": 0.16348435700560227, + "grad_norm": 0.37257617712020874, + "learning_rate": 0.00016733776108345574, + "loss": 1.4599, + "step": 12581 + }, + { + "epoch": 0.16349735154951814, + "grad_norm": 0.4069055914878845, + "learning_rate": 0.00016733516162154434, + "loss": 1.2239, + "step": 12582 + }, + { + "epoch": 0.163510346093434, + "grad_norm": 0.435127854347229, + "learning_rate": 0.00016733256215963296, + "loss": 1.4567, + "step": 12583 + }, + { + "epoch": 0.16352334063734988, + "grad_norm": 0.4554899036884308, + "learning_rate": 0.00016732996269772159, + "loss": 1.4514, + "step": 12584 + }, + { + "epoch": 0.16353633518126576, + "grad_norm": 0.4384957551956177, + "learning_rate": 0.0001673273632358102, + "loss": 1.348, + "step": 12585 + }, + { + "epoch": 0.16354932972518163, + "grad_norm": 0.42155084013938904, + "learning_rate": 0.0001673247637738988, + "loss": 1.545, + "step": 12586 + }, + { + "epoch": 0.1635623242690975, + "grad_norm": 0.5434795618057251, + "learning_rate": 0.0001673221643119874, + "loss": 1.4157, + "step": 12587 + }, + { + "epoch": 0.16357531881301338, + "grad_norm": 0.46022364497184753, + "learning_rate": 0.00016731956485007606, + "loss": 1.4784, + "step": 12588 + }, + { + "epoch": 0.16358831335692925, + "grad_norm": 0.4144461452960968, + "learning_rate": 0.00016731696538816465, + "loss": 1.6507, + "step": 12589 + }, + { + "epoch": 0.16360130790084512, + "grad_norm": 0.4117562472820282, + "learning_rate": 0.00016731436592625328, + "loss": 1.4959, + "step": 12590 + }, + { + "epoch": 0.163614302444761, + "grad_norm": 0.30628302693367004, + "learning_rate": 0.00016731176646434188, + "loss": 1.2468, + "step": 12591 + }, + { + "epoch": 0.16362729698867687, + "grad_norm": 0.4234026372432709, + "learning_rate": 0.0001673091670024305, + "loss": 1.4373, + "step": 12592 + }, + { + "epoch": 0.16364029153259274, + "grad_norm": 0.43162137269973755, + "learning_rate": 0.00016730656754051912, + "loss": 1.4762, + "step": 12593 + }, + { + "epoch": 0.16365328607650861, + "grad_norm": 0.39841943979263306, + "learning_rate": 0.00016730396807860772, + "loss": 1.3299, + "step": 12594 + }, + { + "epoch": 0.1636662806204245, + "grad_norm": 0.4639006555080414, + "learning_rate": 0.00016730136861669635, + "loss": 1.4119, + "step": 12595 + }, + { + "epoch": 0.16367927516434036, + "grad_norm": 0.34936848282814026, + "learning_rate": 0.00016729876915478497, + "loss": 1.4184, + "step": 12596 + }, + { + "epoch": 0.16369226970825623, + "grad_norm": 0.43725940585136414, + "learning_rate": 0.0001672961696928736, + "loss": 1.4743, + "step": 12597 + }, + { + "epoch": 0.1637052642521721, + "grad_norm": 0.43568554520606995, + "learning_rate": 0.0001672935702309622, + "loss": 1.4848, + "step": 12598 + }, + { + "epoch": 0.16371825879608798, + "grad_norm": 0.31008097529411316, + "learning_rate": 0.00016729097076905082, + "loss": 1.1339, + "step": 12599 + }, + { + "epoch": 0.16373125334000388, + "grad_norm": 0.4076475203037262, + "learning_rate": 0.00016728837130713944, + "loss": 1.5483, + "step": 12600 + }, + { + "epoch": 0.16374424788391975, + "grad_norm": 0.3901926279067993, + "learning_rate": 0.00016728577184522804, + "loss": 1.4576, + "step": 12601 + }, + { + "epoch": 0.16375724242783563, + "grad_norm": 0.41686126589775085, + "learning_rate": 0.00016728317238331666, + "loss": 1.2471, + "step": 12602 + }, + { + "epoch": 0.1637702369717515, + "grad_norm": 0.36490100622177124, + "learning_rate": 0.0001672805729214053, + "loss": 1.3633, + "step": 12603 + }, + { + "epoch": 0.16378323151566737, + "grad_norm": 0.4355268180370331, + "learning_rate": 0.00016727797345949389, + "loss": 1.5152, + "step": 12604 + }, + { + "epoch": 0.16379622605958324, + "grad_norm": 0.49196478724479675, + "learning_rate": 0.0001672753739975825, + "loss": 1.3193, + "step": 12605 + }, + { + "epoch": 0.16380922060349912, + "grad_norm": 0.4712654948234558, + "learning_rate": 0.0001672727745356711, + "loss": 1.4477, + "step": 12606 + }, + { + "epoch": 0.163822215147415, + "grad_norm": 0.4347473680973053, + "learning_rate": 0.00016727017507375976, + "loss": 1.4417, + "step": 12607 + }, + { + "epoch": 0.16383520969133086, + "grad_norm": 0.41889631748199463, + "learning_rate": 0.00016726757561184836, + "loss": 1.5078, + "step": 12608 + }, + { + "epoch": 0.16384820423524674, + "grad_norm": 0.3379688858985901, + "learning_rate": 0.00016726497614993698, + "loss": 1.4814, + "step": 12609 + }, + { + "epoch": 0.1638611987791626, + "grad_norm": 0.28536832332611084, + "learning_rate": 0.00016726237668802558, + "loss": 1.193, + "step": 12610 + }, + { + "epoch": 0.16387419332307848, + "grad_norm": 0.4006774127483368, + "learning_rate": 0.0001672597772261142, + "loss": 1.5469, + "step": 12611 + }, + { + "epoch": 0.16388718786699435, + "grad_norm": 0.4076038897037506, + "learning_rate": 0.00016725717776420283, + "loss": 1.3567, + "step": 12612 + }, + { + "epoch": 0.16390018241091023, + "grad_norm": 0.2930568754673004, + "learning_rate": 0.00016725457830229142, + "loss": 1.3811, + "step": 12613 + }, + { + "epoch": 0.1639131769548261, + "grad_norm": 0.35494542121887207, + "learning_rate": 0.00016725197884038005, + "loss": 1.4878, + "step": 12614 + }, + { + "epoch": 0.16392617149874197, + "grad_norm": 0.33522146940231323, + "learning_rate": 0.00016724937937846867, + "loss": 1.4318, + "step": 12615 + }, + { + "epoch": 0.16393916604265785, + "grad_norm": 0.3044874370098114, + "learning_rate": 0.00016724677991655727, + "loss": 1.3189, + "step": 12616 + }, + { + "epoch": 0.16395216058657372, + "grad_norm": 0.3664427101612091, + "learning_rate": 0.0001672441804546459, + "loss": 1.4674, + "step": 12617 + }, + { + "epoch": 0.1639651551304896, + "grad_norm": 0.3620952367782593, + "learning_rate": 0.0001672415809927345, + "loss": 1.4301, + "step": 12618 + }, + { + "epoch": 0.16397814967440547, + "grad_norm": 0.3472168445587158, + "learning_rate": 0.00016723898153082314, + "loss": 1.2693, + "step": 12619 + }, + { + "epoch": 0.16399114421832134, + "grad_norm": 0.402240514755249, + "learning_rate": 0.00016723638206891174, + "loss": 1.4768, + "step": 12620 + }, + { + "epoch": 0.1640041387622372, + "grad_norm": 0.38466158509254456, + "learning_rate": 0.00016723378260700037, + "loss": 1.4592, + "step": 12621 + }, + { + "epoch": 0.16401713330615308, + "grad_norm": 0.49159517884254456, + "learning_rate": 0.00016723118314508896, + "loss": 1.5502, + "step": 12622 + }, + { + "epoch": 0.16403012785006896, + "grad_norm": 0.43623292446136475, + "learning_rate": 0.0001672285836831776, + "loss": 1.2735, + "step": 12623 + }, + { + "epoch": 0.16404312239398483, + "grad_norm": 0.35589292645454407, + "learning_rate": 0.0001672259842212662, + "loss": 1.2686, + "step": 12624 + }, + { + "epoch": 0.1640561169379007, + "grad_norm": 0.43091779947280884, + "learning_rate": 0.0001672233847593548, + "loss": 1.264, + "step": 12625 + }, + { + "epoch": 0.16406911148181658, + "grad_norm": 0.4590790569782257, + "learning_rate": 0.00016722078529744343, + "loss": 1.4688, + "step": 12626 + }, + { + "epoch": 0.16408210602573245, + "grad_norm": 0.31129980087280273, + "learning_rate": 0.00016721818583553206, + "loss": 1.3618, + "step": 12627 + }, + { + "epoch": 0.16409510056964832, + "grad_norm": 0.4001850485801697, + "learning_rate": 0.00016721558637362066, + "loss": 1.4362, + "step": 12628 + }, + { + "epoch": 0.1641080951135642, + "grad_norm": 0.4880580008029938, + "learning_rate": 0.00016721298691170928, + "loss": 1.6125, + "step": 12629 + }, + { + "epoch": 0.16412108965748007, + "grad_norm": 0.35294270515441895, + "learning_rate": 0.00016721038744979788, + "loss": 1.4813, + "step": 12630 + }, + { + "epoch": 0.16413408420139594, + "grad_norm": 0.3448569178581238, + "learning_rate": 0.00016720778798788653, + "loss": 1.3985, + "step": 12631 + }, + { + "epoch": 0.16414707874531181, + "grad_norm": 0.32524916529655457, + "learning_rate": 0.00016720518852597513, + "loss": 1.2129, + "step": 12632 + }, + { + "epoch": 0.1641600732892277, + "grad_norm": 0.45836636424064636, + "learning_rate": 0.00016720258906406375, + "loss": 1.5342, + "step": 12633 + }, + { + "epoch": 0.16417306783314356, + "grad_norm": 0.4628974199295044, + "learning_rate": 0.00016719998960215235, + "loss": 1.435, + "step": 12634 + }, + { + "epoch": 0.16418606237705943, + "grad_norm": 0.30716392397880554, + "learning_rate": 0.00016719739014024097, + "loss": 1.1494, + "step": 12635 + }, + { + "epoch": 0.1641990569209753, + "grad_norm": 0.39553263783454895, + "learning_rate": 0.0001671947906783296, + "loss": 1.5279, + "step": 12636 + }, + { + "epoch": 0.16421205146489118, + "grad_norm": 0.3989872932434082, + "learning_rate": 0.0001671921912164182, + "loss": 1.551, + "step": 12637 + }, + { + "epoch": 0.16422504600880705, + "grad_norm": 0.32105305790901184, + "learning_rate": 0.00016718959175450685, + "loss": 1.4091, + "step": 12638 + }, + { + "epoch": 0.16423804055272292, + "grad_norm": 0.45716384053230286, + "learning_rate": 0.00016718699229259544, + "loss": 1.4811, + "step": 12639 + }, + { + "epoch": 0.1642510350966388, + "grad_norm": 0.4184134006500244, + "learning_rate": 0.00016718439283068407, + "loss": 1.3924, + "step": 12640 + }, + { + "epoch": 0.16426402964055467, + "grad_norm": 0.44609779119491577, + "learning_rate": 0.00016718179336877267, + "loss": 1.334, + "step": 12641 + }, + { + "epoch": 0.16427702418447054, + "grad_norm": 0.37901467084884644, + "learning_rate": 0.0001671791939068613, + "loss": 1.4078, + "step": 12642 + }, + { + "epoch": 0.16429001872838642, + "grad_norm": 0.3853563964366913, + "learning_rate": 0.00016717659444494992, + "loss": 1.5297, + "step": 12643 + }, + { + "epoch": 0.1643030132723023, + "grad_norm": 0.4908764958381653, + "learning_rate": 0.0001671739949830385, + "loss": 1.505, + "step": 12644 + }, + { + "epoch": 0.16431600781621816, + "grad_norm": 0.32138046622276306, + "learning_rate": 0.00016717139552112714, + "loss": 1.3888, + "step": 12645 + }, + { + "epoch": 0.16432900236013404, + "grad_norm": 0.37270838022232056, + "learning_rate": 0.00016716879605921576, + "loss": 1.4012, + "step": 12646 + }, + { + "epoch": 0.1643419969040499, + "grad_norm": 0.5202423334121704, + "learning_rate": 0.00016716619659730436, + "loss": 1.2435, + "step": 12647 + }, + { + "epoch": 0.16435499144796578, + "grad_norm": 0.41264837980270386, + "learning_rate": 0.00016716359713539298, + "loss": 1.701, + "step": 12648 + }, + { + "epoch": 0.16436798599188165, + "grad_norm": 0.338055819272995, + "learning_rate": 0.00016716099767348158, + "loss": 1.4998, + "step": 12649 + }, + { + "epoch": 0.16438098053579753, + "grad_norm": 0.32998618483543396, + "learning_rate": 0.00016715839821157023, + "loss": 1.5474, + "step": 12650 + }, + { + "epoch": 0.1643939750797134, + "grad_norm": 0.39601635932922363, + "learning_rate": 0.00016715579874965883, + "loss": 1.6777, + "step": 12651 + }, + { + "epoch": 0.16440696962362927, + "grad_norm": 0.39117392897605896, + "learning_rate": 0.00016715319928774745, + "loss": 1.3039, + "step": 12652 + }, + { + "epoch": 0.16441996416754515, + "grad_norm": 0.41321861743927, + "learning_rate": 0.00016715059982583605, + "loss": 1.2456, + "step": 12653 + }, + { + "epoch": 0.16443295871146102, + "grad_norm": 0.36340978741645813, + "learning_rate": 0.00016714800036392468, + "loss": 1.6684, + "step": 12654 + }, + { + "epoch": 0.1644459532553769, + "grad_norm": 0.4573553800582886, + "learning_rate": 0.0001671454009020133, + "loss": 1.4145, + "step": 12655 + }, + { + "epoch": 0.16445894779929277, + "grad_norm": 0.35159313678741455, + "learning_rate": 0.0001671428014401019, + "loss": 1.2721, + "step": 12656 + }, + { + "epoch": 0.16447194234320864, + "grad_norm": 0.3576256334781647, + "learning_rate": 0.00016714020197819052, + "loss": 1.3786, + "step": 12657 + }, + { + "epoch": 0.1644849368871245, + "grad_norm": 0.4518357813358307, + "learning_rate": 0.00016713760251627915, + "loss": 1.3445, + "step": 12658 + }, + { + "epoch": 0.16449793143104038, + "grad_norm": 0.3353070318698883, + "learning_rate": 0.00016713500305436774, + "loss": 1.4189, + "step": 12659 + }, + { + "epoch": 0.16451092597495626, + "grad_norm": 0.5224003195762634, + "learning_rate": 0.00016713240359245637, + "loss": 1.3229, + "step": 12660 + }, + { + "epoch": 0.16452392051887213, + "grad_norm": 0.45401838421821594, + "learning_rate": 0.00016712980413054497, + "loss": 1.6328, + "step": 12661 + }, + { + "epoch": 0.164536915062788, + "grad_norm": 0.48674678802490234, + "learning_rate": 0.00016712720466863362, + "loss": 1.5087, + "step": 12662 + }, + { + "epoch": 0.16454990960670388, + "grad_norm": 0.4273641109466553, + "learning_rate": 0.00016712460520672222, + "loss": 1.3423, + "step": 12663 + }, + { + "epoch": 0.16456290415061975, + "grad_norm": 0.3729701638221741, + "learning_rate": 0.00016712200574481084, + "loss": 1.2581, + "step": 12664 + }, + { + "epoch": 0.16457589869453562, + "grad_norm": 0.36620792746543884, + "learning_rate": 0.00016711940628289944, + "loss": 1.5081, + "step": 12665 + }, + { + "epoch": 0.1645888932384515, + "grad_norm": 0.38684314489364624, + "learning_rate": 0.00016711680682098806, + "loss": 1.5505, + "step": 12666 + }, + { + "epoch": 0.16460188778236737, + "grad_norm": 0.43507975339889526, + "learning_rate": 0.00016711420735907669, + "loss": 1.4525, + "step": 12667 + }, + { + "epoch": 0.16461488232628324, + "grad_norm": 0.44767484068870544, + "learning_rate": 0.00016711160789716528, + "loss": 1.4059, + "step": 12668 + }, + { + "epoch": 0.1646278768701991, + "grad_norm": 0.43577510118484497, + "learning_rate": 0.0001671090084352539, + "loss": 1.5334, + "step": 12669 + }, + { + "epoch": 0.164640871414115, + "grad_norm": 0.4192003905773163, + "learning_rate": 0.00016710640897334253, + "loss": 1.4228, + "step": 12670 + }, + { + "epoch": 0.16465386595803086, + "grad_norm": 0.3818548321723938, + "learning_rate": 0.00016710380951143113, + "loss": 1.4217, + "step": 12671 + }, + { + "epoch": 0.16466686050194673, + "grad_norm": 0.45438137650489807, + "learning_rate": 0.00016710121004951975, + "loss": 1.4976, + "step": 12672 + }, + { + "epoch": 0.1646798550458626, + "grad_norm": 0.4225194752216339, + "learning_rate": 0.00016709861058760838, + "loss": 1.4947, + "step": 12673 + }, + { + "epoch": 0.16469284958977848, + "grad_norm": 0.299696147441864, + "learning_rate": 0.000167096011125697, + "loss": 1.2598, + "step": 12674 + }, + { + "epoch": 0.16470584413369435, + "grad_norm": 0.41225069761276245, + "learning_rate": 0.0001670934116637856, + "loss": 1.4895, + "step": 12675 + }, + { + "epoch": 0.16471883867761025, + "grad_norm": 0.38099080324172974, + "learning_rate": 0.00016709081220187423, + "loss": 1.4745, + "step": 12676 + }, + { + "epoch": 0.16473183322152612, + "grad_norm": 0.28796982765197754, + "learning_rate": 0.00016708821273996285, + "loss": 1.2963, + "step": 12677 + }, + { + "epoch": 0.164744827765442, + "grad_norm": 0.35307663679122925, + "learning_rate": 0.00016708561327805145, + "loss": 1.3488, + "step": 12678 + }, + { + "epoch": 0.16475782230935787, + "grad_norm": 0.39195382595062256, + "learning_rate": 0.00016708301381614007, + "loss": 1.226, + "step": 12679 + }, + { + "epoch": 0.16477081685327374, + "grad_norm": 0.38366764783859253, + "learning_rate": 0.00016708041435422867, + "loss": 1.2977, + "step": 12680 + }, + { + "epoch": 0.16478381139718962, + "grad_norm": 0.4170806109905243, + "learning_rate": 0.00016707781489231732, + "loss": 1.6368, + "step": 12681 + }, + { + "epoch": 0.1647968059411055, + "grad_norm": 0.41293367743492126, + "learning_rate": 0.00016707521543040592, + "loss": 1.2814, + "step": 12682 + }, + { + "epoch": 0.16480980048502136, + "grad_norm": 0.48112955689430237, + "learning_rate": 0.00016707261596849452, + "loss": 1.3622, + "step": 12683 + }, + { + "epoch": 0.16482279502893724, + "grad_norm": 0.3877650201320648, + "learning_rate": 0.00016707001650658314, + "loss": 1.5362, + "step": 12684 + }, + { + "epoch": 0.1648357895728531, + "grad_norm": 0.4014728367328644, + "learning_rate": 0.00016706741704467176, + "loss": 1.56, + "step": 12685 + }, + { + "epoch": 0.16484878411676898, + "grad_norm": 0.49827906489372253, + "learning_rate": 0.0001670648175827604, + "loss": 1.5652, + "step": 12686 + }, + { + "epoch": 0.16486177866068485, + "grad_norm": 0.4770209491252899, + "learning_rate": 0.00016706221812084899, + "loss": 1.506, + "step": 12687 + }, + { + "epoch": 0.16487477320460073, + "grad_norm": 0.39580288529396057, + "learning_rate": 0.0001670596186589376, + "loss": 1.2318, + "step": 12688 + }, + { + "epoch": 0.1648877677485166, + "grad_norm": 0.34847956895828247, + "learning_rate": 0.00016705701919702624, + "loss": 1.4048, + "step": 12689 + }, + { + "epoch": 0.16490076229243247, + "grad_norm": 0.3450503349304199, + "learning_rate": 0.00016705441973511483, + "loss": 1.2685, + "step": 12690 + }, + { + "epoch": 0.16491375683634835, + "grad_norm": 0.38976818323135376, + "learning_rate": 0.00016705182027320346, + "loss": 1.5662, + "step": 12691 + }, + { + "epoch": 0.16492675138026422, + "grad_norm": 0.4271141290664673, + "learning_rate": 0.00016704922081129205, + "loss": 1.5289, + "step": 12692 + }, + { + "epoch": 0.1649397459241801, + "grad_norm": 0.39912524819374084, + "learning_rate": 0.0001670466213493807, + "loss": 1.3907, + "step": 12693 + }, + { + "epoch": 0.16495274046809597, + "grad_norm": 0.4122074246406555, + "learning_rate": 0.0001670440218874693, + "loss": 1.4457, + "step": 12694 + }, + { + "epoch": 0.16496573501201184, + "grad_norm": 0.36283794045448303, + "learning_rate": 0.0001670414224255579, + "loss": 1.4269, + "step": 12695 + }, + { + "epoch": 0.1649787295559277, + "grad_norm": 0.34611138701438904, + "learning_rate": 0.00016703882296364653, + "loss": 1.2887, + "step": 12696 + }, + { + "epoch": 0.16499172409984358, + "grad_norm": 0.4332440197467804, + "learning_rate": 0.00016703622350173515, + "loss": 1.5546, + "step": 12697 + }, + { + "epoch": 0.16500471864375946, + "grad_norm": 0.4708455502986908, + "learning_rate": 0.00016703362403982377, + "loss": 1.4198, + "step": 12698 + }, + { + "epoch": 0.16501771318767533, + "grad_norm": 0.404793918132782, + "learning_rate": 0.00016703102457791237, + "loss": 1.342, + "step": 12699 + }, + { + "epoch": 0.1650307077315912, + "grad_norm": 0.4312038719654083, + "learning_rate": 0.000167028425116001, + "loss": 1.3826, + "step": 12700 + }, + { + "epoch": 0.16504370227550708, + "grad_norm": 0.35647881031036377, + "learning_rate": 0.00016702582565408962, + "loss": 1.2897, + "step": 12701 + }, + { + "epoch": 0.16505669681942295, + "grad_norm": 0.3905855417251587, + "learning_rate": 0.00016702322619217822, + "loss": 1.3434, + "step": 12702 + }, + { + "epoch": 0.16506969136333882, + "grad_norm": 0.33579957485198975, + "learning_rate": 0.00016702062673026684, + "loss": 1.4348, + "step": 12703 + }, + { + "epoch": 0.1650826859072547, + "grad_norm": 0.3677779734134674, + "learning_rate": 0.00016701802726835544, + "loss": 1.4049, + "step": 12704 + }, + { + "epoch": 0.16509568045117057, + "grad_norm": 0.3726382255554199, + "learning_rate": 0.0001670154278064441, + "loss": 1.4293, + "step": 12705 + }, + { + "epoch": 0.16510867499508644, + "grad_norm": 0.5283243656158447, + "learning_rate": 0.0001670128283445327, + "loss": 1.4101, + "step": 12706 + }, + { + "epoch": 0.1651216695390023, + "grad_norm": 0.27916380763053894, + "learning_rate": 0.0001670102288826213, + "loss": 1.3427, + "step": 12707 + }, + { + "epoch": 0.1651346640829182, + "grad_norm": 0.41635891795158386, + "learning_rate": 0.0001670076294207099, + "loss": 1.3672, + "step": 12708 + }, + { + "epoch": 0.16514765862683406, + "grad_norm": 0.3994525671005249, + "learning_rate": 0.00016700502995879854, + "loss": 1.5141, + "step": 12709 + }, + { + "epoch": 0.16516065317074993, + "grad_norm": 0.3423674404621124, + "learning_rate": 0.00016700243049688716, + "loss": 1.4823, + "step": 12710 + }, + { + "epoch": 0.1651736477146658, + "grad_norm": 0.3354313373565674, + "learning_rate": 0.00016699983103497576, + "loss": 1.3609, + "step": 12711 + }, + { + "epoch": 0.16518664225858168, + "grad_norm": 0.3780425786972046, + "learning_rate": 0.00016699723157306438, + "loss": 1.3445, + "step": 12712 + }, + { + "epoch": 0.16519963680249755, + "grad_norm": 0.42627015709877014, + "learning_rate": 0.000166994632111153, + "loss": 1.4527, + "step": 12713 + }, + { + "epoch": 0.16521263134641342, + "grad_norm": 0.33906233310699463, + "learning_rate": 0.0001669920326492416, + "loss": 1.7223, + "step": 12714 + }, + { + "epoch": 0.1652256258903293, + "grad_norm": 0.43348586559295654, + "learning_rate": 0.00016698943318733023, + "loss": 1.2718, + "step": 12715 + }, + { + "epoch": 0.16523862043424517, + "grad_norm": 0.38032829761505127, + "learning_rate": 0.00016698683372541885, + "loss": 1.4151, + "step": 12716 + }, + { + "epoch": 0.16525161497816104, + "grad_norm": 0.43958452343940735, + "learning_rate": 0.00016698423426350748, + "loss": 1.5789, + "step": 12717 + }, + { + "epoch": 0.16526460952207692, + "grad_norm": 0.4106976091861725, + "learning_rate": 0.00016698163480159607, + "loss": 1.3721, + "step": 12718 + }, + { + "epoch": 0.1652776040659928, + "grad_norm": 0.3732120990753174, + "learning_rate": 0.0001669790353396847, + "loss": 1.4823, + "step": 12719 + }, + { + "epoch": 0.16529059860990866, + "grad_norm": 0.3490251898765564, + "learning_rate": 0.00016697643587777332, + "loss": 1.5651, + "step": 12720 + }, + { + "epoch": 0.16530359315382454, + "grad_norm": 0.31975293159484863, + "learning_rate": 0.00016697383641586192, + "loss": 1.4319, + "step": 12721 + }, + { + "epoch": 0.1653165876977404, + "grad_norm": 0.29429569840431213, + "learning_rate": 0.00016697123695395054, + "loss": 1.4278, + "step": 12722 + }, + { + "epoch": 0.16532958224165628, + "grad_norm": 0.40375661849975586, + "learning_rate": 0.00016696863749203914, + "loss": 1.5753, + "step": 12723 + }, + { + "epoch": 0.16534257678557215, + "grad_norm": 0.3255603313446045, + "learning_rate": 0.0001669660380301278, + "loss": 1.3613, + "step": 12724 + }, + { + "epoch": 0.16535557132948803, + "grad_norm": 0.6354274153709412, + "learning_rate": 0.0001669634385682164, + "loss": 1.4492, + "step": 12725 + }, + { + "epoch": 0.1653685658734039, + "grad_norm": 0.35847610235214233, + "learning_rate": 0.000166960839106305, + "loss": 1.3535, + "step": 12726 + }, + { + "epoch": 0.16538156041731977, + "grad_norm": 0.2997526228427887, + "learning_rate": 0.0001669582396443936, + "loss": 1.5066, + "step": 12727 + }, + { + "epoch": 0.16539455496123565, + "grad_norm": 0.25073716044425964, + "learning_rate": 0.00016695564018248224, + "loss": 1.4377, + "step": 12728 + }, + { + "epoch": 0.16540754950515152, + "grad_norm": 0.4339577853679657, + "learning_rate": 0.00016695304072057086, + "loss": 1.4269, + "step": 12729 + }, + { + "epoch": 0.1654205440490674, + "grad_norm": 0.3620936870574951, + "learning_rate": 0.00016695044125865946, + "loss": 1.4394, + "step": 12730 + }, + { + "epoch": 0.16543353859298326, + "grad_norm": 0.3854120969772339, + "learning_rate": 0.00016694784179674808, + "loss": 1.5163, + "step": 12731 + }, + { + "epoch": 0.16544653313689914, + "grad_norm": 0.3299930989742279, + "learning_rate": 0.0001669452423348367, + "loss": 1.5508, + "step": 12732 + }, + { + "epoch": 0.165459527680815, + "grad_norm": 0.39410945773124695, + "learning_rate": 0.0001669426428729253, + "loss": 1.4698, + "step": 12733 + }, + { + "epoch": 0.16547252222473088, + "grad_norm": 0.41791895031929016, + "learning_rate": 0.00016694004341101393, + "loss": 1.3089, + "step": 12734 + }, + { + "epoch": 0.16548551676864676, + "grad_norm": 0.3998549282550812, + "learning_rate": 0.00016693744394910253, + "loss": 1.352, + "step": 12735 + }, + { + "epoch": 0.16549851131256263, + "grad_norm": 0.43996545672416687, + "learning_rate": 0.00016693484448719118, + "loss": 1.1485, + "step": 12736 + }, + { + "epoch": 0.1655115058564785, + "grad_norm": 0.4087182879447937, + "learning_rate": 0.00016693224502527978, + "loss": 1.4929, + "step": 12737 + }, + { + "epoch": 0.16552450040039438, + "grad_norm": 0.38097959756851196, + "learning_rate": 0.00016692964556336837, + "loss": 1.4011, + "step": 12738 + }, + { + "epoch": 0.16553749494431025, + "grad_norm": 0.2987470328807831, + "learning_rate": 0.000166927046101457, + "loss": 1.436, + "step": 12739 + }, + { + "epoch": 0.16555048948822612, + "grad_norm": 0.3647497296333313, + "learning_rate": 0.00016692444663954562, + "loss": 1.4595, + "step": 12740 + }, + { + "epoch": 0.165563484032142, + "grad_norm": 0.4313162565231323, + "learning_rate": 0.00016692184717763425, + "loss": 1.4956, + "step": 12741 + }, + { + "epoch": 0.16557647857605787, + "grad_norm": 0.40411344170570374, + "learning_rate": 0.00016691924771572284, + "loss": 1.4129, + "step": 12742 + }, + { + "epoch": 0.16558947311997374, + "grad_norm": 0.38567715883255005, + "learning_rate": 0.00016691664825381147, + "loss": 1.5529, + "step": 12743 + }, + { + "epoch": 0.1656024676638896, + "grad_norm": 0.36858832836151123, + "learning_rate": 0.0001669140487919001, + "loss": 1.4827, + "step": 12744 + }, + { + "epoch": 0.16561546220780549, + "grad_norm": 0.5153440237045288, + "learning_rate": 0.0001669114493299887, + "loss": 1.4709, + "step": 12745 + }, + { + "epoch": 0.16562845675172136, + "grad_norm": 0.36753207445144653, + "learning_rate": 0.00016690884986807732, + "loss": 1.4611, + "step": 12746 + }, + { + "epoch": 0.16564145129563723, + "grad_norm": 0.3692333996295929, + "learning_rate": 0.00016690625040616594, + "loss": 1.4302, + "step": 12747 + }, + { + "epoch": 0.1656544458395531, + "grad_norm": 0.42418134212493896, + "learning_rate": 0.00016690365094425456, + "loss": 1.5146, + "step": 12748 + }, + { + "epoch": 0.16566744038346898, + "grad_norm": 0.5124931335449219, + "learning_rate": 0.00016690105148234316, + "loss": 1.7057, + "step": 12749 + }, + { + "epoch": 0.16568043492738485, + "grad_norm": 0.4228874146938324, + "learning_rate": 0.00016689845202043176, + "loss": 1.4672, + "step": 12750 + }, + { + "epoch": 0.16569342947130072, + "grad_norm": 0.4007064402103424, + "learning_rate": 0.0001668958525585204, + "loss": 1.3748, + "step": 12751 + }, + { + "epoch": 0.16570642401521662, + "grad_norm": 0.42129477858543396, + "learning_rate": 0.000166893253096609, + "loss": 1.511, + "step": 12752 + }, + { + "epoch": 0.1657194185591325, + "grad_norm": 0.5369883179664612, + "learning_rate": 0.00016689065363469763, + "loss": 1.55, + "step": 12753 + }, + { + "epoch": 0.16573241310304837, + "grad_norm": 0.37164899706840515, + "learning_rate": 0.00016688805417278623, + "loss": 1.429, + "step": 12754 + }, + { + "epoch": 0.16574540764696424, + "grad_norm": 0.35931336879730225, + "learning_rate": 0.00016688545471087485, + "loss": 1.2623, + "step": 12755 + }, + { + "epoch": 0.16575840219088012, + "grad_norm": 0.3817209005355835, + "learning_rate": 0.00016688285524896348, + "loss": 1.0998, + "step": 12756 + }, + { + "epoch": 0.165771396734796, + "grad_norm": 0.3386791944503784, + "learning_rate": 0.00016688025578705208, + "loss": 1.4165, + "step": 12757 + }, + { + "epoch": 0.16578439127871186, + "grad_norm": 0.4538019895553589, + "learning_rate": 0.0001668776563251407, + "loss": 1.5655, + "step": 12758 + }, + { + "epoch": 0.16579738582262774, + "grad_norm": 0.45190227031707764, + "learning_rate": 0.00016687505686322933, + "loss": 1.4953, + "step": 12759 + }, + { + "epoch": 0.1658103803665436, + "grad_norm": 0.3689805865287781, + "learning_rate": 0.00016687245740131795, + "loss": 1.5334, + "step": 12760 + }, + { + "epoch": 0.16582337491045948, + "grad_norm": 0.5716325044631958, + "learning_rate": 0.00016686985793940655, + "loss": 1.4974, + "step": 12761 + }, + { + "epoch": 0.16583636945437535, + "grad_norm": 0.3748626708984375, + "learning_rate": 0.00016686725847749517, + "loss": 1.4128, + "step": 12762 + }, + { + "epoch": 0.16584936399829123, + "grad_norm": 0.4868242144584656, + "learning_rate": 0.0001668646590155838, + "loss": 1.5548, + "step": 12763 + }, + { + "epoch": 0.1658623585422071, + "grad_norm": 0.31340986490249634, + "learning_rate": 0.0001668620595536724, + "loss": 1.3049, + "step": 12764 + }, + { + "epoch": 0.16587535308612297, + "grad_norm": 0.3670234680175781, + "learning_rate": 0.00016685946009176102, + "loss": 1.5032, + "step": 12765 + }, + { + "epoch": 0.16588834763003885, + "grad_norm": 0.34331753849983215, + "learning_rate": 0.00016685686062984962, + "loss": 1.3376, + "step": 12766 + }, + { + "epoch": 0.16590134217395472, + "grad_norm": 0.2816208600997925, + "learning_rate": 0.00016685426116793824, + "loss": 1.3386, + "step": 12767 + }, + { + "epoch": 0.1659143367178706, + "grad_norm": 0.43149030208587646, + "learning_rate": 0.00016685166170602686, + "loss": 1.5037, + "step": 12768 + }, + { + "epoch": 0.16592733126178646, + "grad_norm": 0.4204499125480652, + "learning_rate": 0.00016684906224411546, + "loss": 1.3985, + "step": 12769 + }, + { + "epoch": 0.16594032580570234, + "grad_norm": 0.4354959726333618, + "learning_rate": 0.0001668464627822041, + "loss": 1.3395, + "step": 12770 + }, + { + "epoch": 0.1659533203496182, + "grad_norm": 0.38286638259887695, + "learning_rate": 0.0001668438633202927, + "loss": 1.3766, + "step": 12771 + }, + { + "epoch": 0.16596631489353408, + "grad_norm": 0.31627151370048523, + "learning_rate": 0.00016684126385838134, + "loss": 1.3678, + "step": 12772 + }, + { + "epoch": 0.16597930943744996, + "grad_norm": 0.4957989454269409, + "learning_rate": 0.00016683866439646993, + "loss": 1.5672, + "step": 12773 + }, + { + "epoch": 0.16599230398136583, + "grad_norm": 0.3770240843296051, + "learning_rate": 0.00016683606493455856, + "loss": 1.3653, + "step": 12774 + }, + { + "epoch": 0.1660052985252817, + "grad_norm": 0.3548699617385864, + "learning_rate": 0.00016683346547264718, + "loss": 1.2602, + "step": 12775 + }, + { + "epoch": 0.16601829306919758, + "grad_norm": 0.4624626934528351, + "learning_rate": 0.00016683086601073578, + "loss": 1.3246, + "step": 12776 + }, + { + "epoch": 0.16603128761311345, + "grad_norm": 0.5028133392333984, + "learning_rate": 0.0001668282665488244, + "loss": 1.4082, + "step": 12777 + }, + { + "epoch": 0.16604428215702932, + "grad_norm": 0.4477551579475403, + "learning_rate": 0.000166825667086913, + "loss": 1.6132, + "step": 12778 + }, + { + "epoch": 0.1660572767009452, + "grad_norm": 0.3564830422401428, + "learning_rate": 0.00016682306762500163, + "loss": 1.2895, + "step": 12779 + }, + { + "epoch": 0.16607027124486107, + "grad_norm": 0.37649983167648315, + "learning_rate": 0.00016682046816309025, + "loss": 1.6371, + "step": 12780 + }, + { + "epoch": 0.16608326578877694, + "grad_norm": 0.3895941972732544, + "learning_rate": 0.00016681786870117885, + "loss": 1.3311, + "step": 12781 + }, + { + "epoch": 0.1660962603326928, + "grad_norm": 0.3195134997367859, + "learning_rate": 0.00016681526923926747, + "loss": 1.3647, + "step": 12782 + }, + { + "epoch": 0.1661092548766087, + "grad_norm": 0.42182788252830505, + "learning_rate": 0.0001668126697773561, + "loss": 1.4729, + "step": 12783 + }, + { + "epoch": 0.16612224942052456, + "grad_norm": 0.4348873198032379, + "learning_rate": 0.00016681007031544472, + "loss": 1.3943, + "step": 12784 + }, + { + "epoch": 0.16613524396444043, + "grad_norm": 0.3574744760990143, + "learning_rate": 0.00016680747085353332, + "loss": 1.2052, + "step": 12785 + }, + { + "epoch": 0.1661482385083563, + "grad_norm": 0.4844158887863159, + "learning_rate": 0.00016680487139162194, + "loss": 1.57, + "step": 12786 + }, + { + "epoch": 0.16616123305227218, + "grad_norm": 0.5127004384994507, + "learning_rate": 0.00016680227192971057, + "loss": 1.6192, + "step": 12787 + }, + { + "epoch": 0.16617422759618805, + "grad_norm": 0.43348947167396545, + "learning_rate": 0.00016679967246779916, + "loss": 1.6037, + "step": 12788 + }, + { + "epoch": 0.16618722214010392, + "grad_norm": 0.3308321237564087, + "learning_rate": 0.0001667970730058878, + "loss": 1.5887, + "step": 12789 + }, + { + "epoch": 0.1662002166840198, + "grad_norm": 0.3559459447860718, + "learning_rate": 0.00016679447354397641, + "loss": 1.3745, + "step": 12790 + }, + { + "epoch": 0.16621321122793567, + "grad_norm": 0.3071127235889435, + "learning_rate": 0.00016679187408206504, + "loss": 1.4145, + "step": 12791 + }, + { + "epoch": 0.16622620577185154, + "grad_norm": 0.3767787218093872, + "learning_rate": 0.00016678927462015364, + "loss": 1.5357, + "step": 12792 + }, + { + "epoch": 0.16623920031576742, + "grad_norm": 0.38860946893692017, + "learning_rate": 0.00016678667515824223, + "loss": 1.5691, + "step": 12793 + }, + { + "epoch": 0.1662521948596833, + "grad_norm": 0.39276090264320374, + "learning_rate": 0.00016678407569633088, + "loss": 1.551, + "step": 12794 + }, + { + "epoch": 0.16626518940359916, + "grad_norm": 0.34387215971946716, + "learning_rate": 0.00016678147623441948, + "loss": 1.3966, + "step": 12795 + }, + { + "epoch": 0.16627818394751503, + "grad_norm": 0.24353717267513275, + "learning_rate": 0.0001667788767725081, + "loss": 1.1984, + "step": 12796 + }, + { + "epoch": 0.1662911784914309, + "grad_norm": 0.374902606010437, + "learning_rate": 0.0001667762773105967, + "loss": 1.6019, + "step": 12797 + }, + { + "epoch": 0.16630417303534678, + "grad_norm": 0.3707432746887207, + "learning_rate": 0.00016677367784868533, + "loss": 1.4587, + "step": 12798 + }, + { + "epoch": 0.16631716757926265, + "grad_norm": 0.42449063062667847, + "learning_rate": 0.00016677107838677395, + "loss": 1.6654, + "step": 12799 + }, + { + "epoch": 0.16633016212317853, + "grad_norm": 0.37490546703338623, + "learning_rate": 0.00016676847892486255, + "loss": 1.3744, + "step": 12800 + }, + { + "epoch": 0.1663431566670944, + "grad_norm": 0.4316680133342743, + "learning_rate": 0.00016676587946295117, + "loss": 1.4633, + "step": 12801 + }, + { + "epoch": 0.16635615121101027, + "grad_norm": 0.42527732253074646, + "learning_rate": 0.0001667632800010398, + "loss": 1.4212, + "step": 12802 + }, + { + "epoch": 0.16636914575492615, + "grad_norm": 0.34765082597732544, + "learning_rate": 0.00016676068053912842, + "loss": 1.4254, + "step": 12803 + }, + { + "epoch": 0.16638214029884202, + "grad_norm": 0.44545778632164, + "learning_rate": 0.00016675808107721702, + "loss": 1.5502, + "step": 12804 + }, + { + "epoch": 0.1663951348427579, + "grad_norm": 0.42165297269821167, + "learning_rate": 0.00016675548161530562, + "loss": 1.4112, + "step": 12805 + }, + { + "epoch": 0.16640812938667376, + "grad_norm": 0.42091259360313416, + "learning_rate": 0.00016675288215339427, + "loss": 1.4712, + "step": 12806 + }, + { + "epoch": 0.16642112393058964, + "grad_norm": 0.4956926703453064, + "learning_rate": 0.00016675028269148287, + "loss": 1.3942, + "step": 12807 + }, + { + "epoch": 0.1664341184745055, + "grad_norm": 0.4420771598815918, + "learning_rate": 0.0001667476832295715, + "loss": 1.4173, + "step": 12808 + }, + { + "epoch": 0.16644711301842138, + "grad_norm": 0.21815244853496552, + "learning_rate": 0.0001667450837676601, + "loss": 1.2988, + "step": 12809 + }, + { + "epoch": 0.16646010756233726, + "grad_norm": 0.3586496412754059, + "learning_rate": 0.0001667424843057487, + "loss": 1.4763, + "step": 12810 + }, + { + "epoch": 0.16647310210625313, + "grad_norm": 0.3256484270095825, + "learning_rate": 0.00016673988484383734, + "loss": 1.461, + "step": 12811 + }, + { + "epoch": 0.166486096650169, + "grad_norm": 0.36909791827201843, + "learning_rate": 0.00016673728538192594, + "loss": 1.4461, + "step": 12812 + }, + { + "epoch": 0.16649909119408488, + "grad_norm": 0.396945059299469, + "learning_rate": 0.00016673468592001456, + "loss": 1.4764, + "step": 12813 + }, + { + "epoch": 0.16651208573800075, + "grad_norm": 0.47493255138397217, + "learning_rate": 0.00016673208645810318, + "loss": 1.2814, + "step": 12814 + }, + { + "epoch": 0.16652508028191662, + "grad_norm": 0.41407179832458496, + "learning_rate": 0.0001667294869961918, + "loss": 1.2914, + "step": 12815 + }, + { + "epoch": 0.1665380748258325, + "grad_norm": 0.41274240612983704, + "learning_rate": 0.0001667268875342804, + "loss": 1.4875, + "step": 12816 + }, + { + "epoch": 0.16655106936974837, + "grad_norm": 0.2747964560985565, + "learning_rate": 0.000166724288072369, + "loss": 1.3357, + "step": 12817 + }, + { + "epoch": 0.16656406391366424, + "grad_norm": 0.3709779679775238, + "learning_rate": 0.00016672168861045766, + "loss": 1.4881, + "step": 12818 + }, + { + "epoch": 0.1665770584575801, + "grad_norm": 0.3713243305683136, + "learning_rate": 0.00016671908914854625, + "loss": 1.4259, + "step": 12819 + }, + { + "epoch": 0.16659005300149599, + "grad_norm": 0.5128543972969055, + "learning_rate": 0.00016671648968663488, + "loss": 1.5151, + "step": 12820 + }, + { + "epoch": 0.16660304754541186, + "grad_norm": 0.42065709829330444, + "learning_rate": 0.0001667138902247235, + "loss": 1.3927, + "step": 12821 + }, + { + "epoch": 0.16661604208932773, + "grad_norm": 0.453928142786026, + "learning_rate": 0.0001667112907628121, + "loss": 1.3186, + "step": 12822 + }, + { + "epoch": 0.1666290366332436, + "grad_norm": 0.4526253342628479, + "learning_rate": 0.00016670869130090072, + "loss": 1.3925, + "step": 12823 + }, + { + "epoch": 0.16664203117715948, + "grad_norm": 0.540834367275238, + "learning_rate": 0.00016670609183898932, + "loss": 1.4528, + "step": 12824 + }, + { + "epoch": 0.16665502572107535, + "grad_norm": 0.3812747299671173, + "learning_rate": 0.00016670349237707797, + "loss": 1.52, + "step": 12825 + }, + { + "epoch": 0.16666802026499122, + "grad_norm": 0.2953941226005554, + "learning_rate": 0.00016670089291516657, + "loss": 1.3892, + "step": 12826 + }, + { + "epoch": 0.1666810148089071, + "grad_norm": 0.47402068972587585, + "learning_rate": 0.0001666982934532552, + "loss": 1.4324, + "step": 12827 + }, + { + "epoch": 0.166694009352823, + "grad_norm": 0.42593613266944885, + "learning_rate": 0.0001666956939913438, + "loss": 1.5167, + "step": 12828 + }, + { + "epoch": 0.16670700389673887, + "grad_norm": 0.3507402241230011, + "learning_rate": 0.00016669309452943242, + "loss": 1.2295, + "step": 12829 + }, + { + "epoch": 0.16671999844065474, + "grad_norm": 0.369003564119339, + "learning_rate": 0.00016669049506752104, + "loss": 1.381, + "step": 12830 + }, + { + "epoch": 0.16673299298457062, + "grad_norm": 0.43361109495162964, + "learning_rate": 0.00016668789560560964, + "loss": 1.4831, + "step": 12831 + }, + { + "epoch": 0.1667459875284865, + "grad_norm": 0.33255043625831604, + "learning_rate": 0.00016668529614369826, + "loss": 1.3915, + "step": 12832 + }, + { + "epoch": 0.16675898207240236, + "grad_norm": 0.34869447350502014, + "learning_rate": 0.0001666826966817869, + "loss": 1.2207, + "step": 12833 + }, + { + "epoch": 0.16677197661631823, + "grad_norm": 0.4056282639503479, + "learning_rate": 0.00016668009721987548, + "loss": 1.5521, + "step": 12834 + }, + { + "epoch": 0.1667849711602341, + "grad_norm": 0.4887652099132538, + "learning_rate": 0.0001666774977579641, + "loss": 1.543, + "step": 12835 + }, + { + "epoch": 0.16679796570414998, + "grad_norm": 0.34442567825317383, + "learning_rate": 0.0001666748982960527, + "loss": 1.2209, + "step": 12836 + }, + { + "epoch": 0.16681096024806585, + "grad_norm": 0.41982781887054443, + "learning_rate": 0.00016667229883414136, + "loss": 1.6639, + "step": 12837 + }, + { + "epoch": 0.16682395479198173, + "grad_norm": 0.4775190055370331, + "learning_rate": 0.00016666969937222996, + "loss": 1.4735, + "step": 12838 + }, + { + "epoch": 0.1668369493358976, + "grad_norm": 0.35715991258621216, + "learning_rate": 0.00016666709991031858, + "loss": 1.6269, + "step": 12839 + }, + { + "epoch": 0.16684994387981347, + "grad_norm": 0.43140268325805664, + "learning_rate": 0.00016666450044840718, + "loss": 1.3078, + "step": 12840 + }, + { + "epoch": 0.16686293842372935, + "grad_norm": 0.3623979985713959, + "learning_rate": 0.0001666619009864958, + "loss": 1.1823, + "step": 12841 + }, + { + "epoch": 0.16687593296764522, + "grad_norm": 0.3775346875190735, + "learning_rate": 0.00016665930152458443, + "loss": 1.5372, + "step": 12842 + }, + { + "epoch": 0.1668889275115611, + "grad_norm": 0.3679216206073761, + "learning_rate": 0.00016665670206267302, + "loss": 1.6282, + "step": 12843 + }, + { + "epoch": 0.16690192205547696, + "grad_norm": 0.40435758233070374, + "learning_rate": 0.00016665410260076165, + "loss": 1.1387, + "step": 12844 + }, + { + "epoch": 0.16691491659939284, + "grad_norm": 0.38754045963287354, + "learning_rate": 0.00016665150313885027, + "loss": 1.4503, + "step": 12845 + }, + { + "epoch": 0.1669279111433087, + "grad_norm": 0.47942426800727844, + "learning_rate": 0.0001666489036769389, + "loss": 1.4406, + "step": 12846 + }, + { + "epoch": 0.16694090568722458, + "grad_norm": 0.49686306715011597, + "learning_rate": 0.0001666463042150275, + "loss": 1.5014, + "step": 12847 + }, + { + "epoch": 0.16695390023114046, + "grad_norm": 0.33438125252723694, + "learning_rate": 0.0001666437047531161, + "loss": 1.6046, + "step": 12848 + }, + { + "epoch": 0.16696689477505633, + "grad_norm": 0.36495068669319153, + "learning_rate": 0.00016664110529120474, + "loss": 1.4154, + "step": 12849 + }, + { + "epoch": 0.1669798893189722, + "grad_norm": 0.41140130162239075, + "learning_rate": 0.00016663850582929334, + "loss": 1.2698, + "step": 12850 + }, + { + "epoch": 0.16699288386288808, + "grad_norm": 0.38917186856269836, + "learning_rate": 0.00016663590636738196, + "loss": 1.526, + "step": 12851 + }, + { + "epoch": 0.16700587840680395, + "grad_norm": 0.40571409463882446, + "learning_rate": 0.00016663330690547056, + "loss": 1.307, + "step": 12852 + }, + { + "epoch": 0.16701887295071982, + "grad_norm": 0.33642643690109253, + "learning_rate": 0.0001666307074435592, + "loss": 1.4184, + "step": 12853 + }, + { + "epoch": 0.1670318674946357, + "grad_norm": 0.4248434007167816, + "learning_rate": 0.0001666281079816478, + "loss": 1.4063, + "step": 12854 + }, + { + "epoch": 0.16704486203855157, + "grad_norm": 0.36782601475715637, + "learning_rate": 0.0001666255085197364, + "loss": 1.5157, + "step": 12855 + }, + { + "epoch": 0.16705785658246744, + "grad_norm": 0.3769363760948181, + "learning_rate": 0.00016662290905782503, + "loss": 1.3521, + "step": 12856 + }, + { + "epoch": 0.1670708511263833, + "grad_norm": 0.4285936951637268, + "learning_rate": 0.00016662030959591366, + "loss": 1.4896, + "step": 12857 + }, + { + "epoch": 0.16708384567029919, + "grad_norm": 0.4447292983531952, + "learning_rate": 0.00016661771013400228, + "loss": 1.5812, + "step": 12858 + }, + { + "epoch": 0.16709684021421506, + "grad_norm": 0.3940449059009552, + "learning_rate": 0.00016661511067209088, + "loss": 1.5996, + "step": 12859 + }, + { + "epoch": 0.16710983475813093, + "grad_norm": 0.37623709440231323, + "learning_rate": 0.0001666125112101795, + "loss": 1.5044, + "step": 12860 + }, + { + "epoch": 0.1671228293020468, + "grad_norm": 0.37312057614326477, + "learning_rate": 0.00016660991174826813, + "loss": 1.5332, + "step": 12861 + }, + { + "epoch": 0.16713582384596268, + "grad_norm": 0.3506205081939697, + "learning_rate": 0.00016660731228635673, + "loss": 1.3821, + "step": 12862 + }, + { + "epoch": 0.16714881838987855, + "grad_norm": 0.3562324345111847, + "learning_rate": 0.00016660471282444535, + "loss": 1.4178, + "step": 12863 + }, + { + "epoch": 0.16716181293379442, + "grad_norm": 0.4145570695400238, + "learning_rate": 0.00016660211336253397, + "loss": 1.4428, + "step": 12864 + }, + { + "epoch": 0.1671748074777103, + "grad_norm": 0.3577467203140259, + "learning_rate": 0.00016659951390062257, + "loss": 1.4778, + "step": 12865 + }, + { + "epoch": 0.16718780202162617, + "grad_norm": 0.37337398529052734, + "learning_rate": 0.0001665969144387112, + "loss": 1.4985, + "step": 12866 + }, + { + "epoch": 0.16720079656554204, + "grad_norm": 0.3283328711986542, + "learning_rate": 0.0001665943149767998, + "loss": 1.5841, + "step": 12867 + }, + { + "epoch": 0.16721379110945792, + "grad_norm": 0.40434518456459045, + "learning_rate": 0.00016659171551488845, + "loss": 1.3934, + "step": 12868 + }, + { + "epoch": 0.1672267856533738, + "grad_norm": 0.4591914713382721, + "learning_rate": 0.00016658911605297704, + "loss": 1.3366, + "step": 12869 + }, + { + "epoch": 0.16723978019728966, + "grad_norm": 0.33408480882644653, + "learning_rate": 0.00016658651659106567, + "loss": 1.3697, + "step": 12870 + }, + { + "epoch": 0.16725277474120553, + "grad_norm": 0.3955046832561493, + "learning_rate": 0.00016658391712915426, + "loss": 1.4126, + "step": 12871 + }, + { + "epoch": 0.1672657692851214, + "grad_norm": 0.4294576644897461, + "learning_rate": 0.0001665813176672429, + "loss": 1.3959, + "step": 12872 + }, + { + "epoch": 0.16727876382903728, + "grad_norm": 0.30360329151153564, + "learning_rate": 0.00016657871820533151, + "loss": 1.2906, + "step": 12873 + }, + { + "epoch": 0.16729175837295315, + "grad_norm": 0.42848408222198486, + "learning_rate": 0.0001665761187434201, + "loss": 1.2307, + "step": 12874 + }, + { + "epoch": 0.16730475291686903, + "grad_norm": 0.45242586731910706, + "learning_rate": 0.00016657351928150874, + "loss": 1.3869, + "step": 12875 + }, + { + "epoch": 0.1673177474607849, + "grad_norm": 0.47310593724250793, + "learning_rate": 0.00016657091981959736, + "loss": 1.3105, + "step": 12876 + }, + { + "epoch": 0.16733074200470077, + "grad_norm": 0.3324052393436432, + "learning_rate": 0.00016656832035768596, + "loss": 1.3804, + "step": 12877 + }, + { + "epoch": 0.16734373654861665, + "grad_norm": 0.4319552481174469, + "learning_rate": 0.00016656572089577458, + "loss": 1.3998, + "step": 12878 + }, + { + "epoch": 0.16735673109253252, + "grad_norm": 0.31943750381469727, + "learning_rate": 0.00016656312143386318, + "loss": 1.2418, + "step": 12879 + }, + { + "epoch": 0.1673697256364484, + "grad_norm": 0.40766334533691406, + "learning_rate": 0.00016656052197195183, + "loss": 1.5104, + "step": 12880 + }, + { + "epoch": 0.16738272018036426, + "grad_norm": 0.46422079205513, + "learning_rate": 0.00016655792251004043, + "loss": 1.5967, + "step": 12881 + }, + { + "epoch": 0.16739571472428014, + "grad_norm": 0.3993177115917206, + "learning_rate": 0.00016655532304812905, + "loss": 1.3782, + "step": 12882 + }, + { + "epoch": 0.167408709268196, + "grad_norm": 0.4317792057991028, + "learning_rate": 0.00016655272358621765, + "loss": 1.3474, + "step": 12883 + }, + { + "epoch": 0.16742170381211188, + "grad_norm": 0.45836618542671204, + "learning_rate": 0.00016655012412430627, + "loss": 1.3878, + "step": 12884 + }, + { + "epoch": 0.16743469835602776, + "grad_norm": 0.44449880719184875, + "learning_rate": 0.0001665475246623949, + "loss": 1.4617, + "step": 12885 + }, + { + "epoch": 0.16744769289994363, + "grad_norm": 0.4375962018966675, + "learning_rate": 0.0001665449252004835, + "loss": 1.392, + "step": 12886 + }, + { + "epoch": 0.1674606874438595, + "grad_norm": 0.4703550934791565, + "learning_rate": 0.00016654232573857212, + "loss": 1.5668, + "step": 12887 + }, + { + "epoch": 0.16747368198777537, + "grad_norm": 0.31866562366485596, + "learning_rate": 0.00016653972627666075, + "loss": 1.5196, + "step": 12888 + }, + { + "epoch": 0.16748667653169125, + "grad_norm": 0.312232106924057, + "learning_rate": 0.00016653712681474934, + "loss": 1.4614, + "step": 12889 + }, + { + "epoch": 0.16749967107560712, + "grad_norm": 0.4223799407482147, + "learning_rate": 0.00016653452735283797, + "loss": 1.5073, + "step": 12890 + }, + { + "epoch": 0.167512665619523, + "grad_norm": 0.41408222913742065, + "learning_rate": 0.00016653192789092656, + "loss": 1.5173, + "step": 12891 + }, + { + "epoch": 0.16752566016343887, + "grad_norm": 0.40249311923980713, + "learning_rate": 0.00016652932842901522, + "loss": 1.4593, + "step": 12892 + }, + { + "epoch": 0.16753865470735474, + "grad_norm": 0.39199817180633545, + "learning_rate": 0.00016652672896710381, + "loss": 1.544, + "step": 12893 + }, + { + "epoch": 0.1675516492512706, + "grad_norm": 0.41567808389663696, + "learning_rate": 0.00016652412950519244, + "loss": 1.3974, + "step": 12894 + }, + { + "epoch": 0.16756464379518649, + "grad_norm": 0.40071341395378113, + "learning_rate": 0.00016652153004328106, + "loss": 1.3259, + "step": 12895 + }, + { + "epoch": 0.16757763833910236, + "grad_norm": 0.3968471586704254, + "learning_rate": 0.00016651893058136966, + "loss": 1.4359, + "step": 12896 + }, + { + "epoch": 0.16759063288301823, + "grad_norm": 0.28312477469444275, + "learning_rate": 0.00016651633111945828, + "loss": 1.4723, + "step": 12897 + }, + { + "epoch": 0.1676036274269341, + "grad_norm": 0.3450145423412323, + "learning_rate": 0.00016651373165754688, + "loss": 1.6804, + "step": 12898 + }, + { + "epoch": 0.16761662197084998, + "grad_norm": 0.3740321099758148, + "learning_rate": 0.00016651113219563553, + "loss": 1.3322, + "step": 12899 + }, + { + "epoch": 0.16762961651476585, + "grad_norm": 0.37756139039993286, + "learning_rate": 0.00016650853273372413, + "loss": 1.2638, + "step": 12900 + }, + { + "epoch": 0.16764261105868172, + "grad_norm": 0.3208197057247162, + "learning_rate": 0.00016650593327181273, + "loss": 1.3465, + "step": 12901 + }, + { + "epoch": 0.1676556056025976, + "grad_norm": 0.2987249791622162, + "learning_rate": 0.00016650333380990135, + "loss": 1.1555, + "step": 12902 + }, + { + "epoch": 0.16766860014651347, + "grad_norm": 0.28416958451271057, + "learning_rate": 0.00016650073434798998, + "loss": 1.4346, + "step": 12903 + }, + { + "epoch": 0.16768159469042937, + "grad_norm": 0.36585891246795654, + "learning_rate": 0.0001664981348860786, + "loss": 1.4997, + "step": 12904 + }, + { + "epoch": 0.16769458923434524, + "grad_norm": 0.33395785093307495, + "learning_rate": 0.0001664955354241672, + "loss": 1.2841, + "step": 12905 + }, + { + "epoch": 0.16770758377826112, + "grad_norm": 0.3879500925540924, + "learning_rate": 0.00016649293596225582, + "loss": 1.3641, + "step": 12906 + }, + { + "epoch": 0.167720578322177, + "grad_norm": 0.27617931365966797, + "learning_rate": 0.00016649033650034445, + "loss": 1.4174, + "step": 12907 + }, + { + "epoch": 0.16773357286609286, + "grad_norm": 0.35746923089027405, + "learning_rate": 0.00016648773703843305, + "loss": 1.3993, + "step": 12908 + }, + { + "epoch": 0.16774656741000873, + "grad_norm": 0.39186882972717285, + "learning_rate": 0.00016648513757652167, + "loss": 1.2611, + "step": 12909 + }, + { + "epoch": 0.1677595619539246, + "grad_norm": 0.5087866187095642, + "learning_rate": 0.00016648253811461027, + "loss": 1.4859, + "step": 12910 + }, + { + "epoch": 0.16777255649784048, + "grad_norm": 0.4065325856208801, + "learning_rate": 0.00016647993865269892, + "loss": 1.4694, + "step": 12911 + }, + { + "epoch": 0.16778555104175635, + "grad_norm": 0.4485664367675781, + "learning_rate": 0.00016647733919078752, + "loss": 1.47, + "step": 12912 + }, + { + "epoch": 0.16779854558567223, + "grad_norm": 0.42203405499458313, + "learning_rate": 0.00016647473972887614, + "loss": 1.3472, + "step": 12913 + }, + { + "epoch": 0.1678115401295881, + "grad_norm": 0.3816760778427124, + "learning_rate": 0.00016647214026696474, + "loss": 1.3832, + "step": 12914 + }, + { + "epoch": 0.16782453467350397, + "grad_norm": 0.36738601326942444, + "learning_rate": 0.00016646954080505336, + "loss": 1.5662, + "step": 12915 + }, + { + "epoch": 0.16783752921741985, + "grad_norm": 0.32766804099082947, + "learning_rate": 0.000166466941343142, + "loss": 1.4689, + "step": 12916 + }, + { + "epoch": 0.16785052376133572, + "grad_norm": 0.4251675009727478, + "learning_rate": 0.00016646434188123058, + "loss": 1.5494, + "step": 12917 + }, + { + "epoch": 0.1678635183052516, + "grad_norm": 0.31626254320144653, + "learning_rate": 0.0001664617424193192, + "loss": 1.3572, + "step": 12918 + }, + { + "epoch": 0.16787651284916746, + "grad_norm": 0.40903937816619873, + "learning_rate": 0.00016645914295740783, + "loss": 1.4695, + "step": 12919 + }, + { + "epoch": 0.16788950739308334, + "grad_norm": 0.352841317653656, + "learning_rate": 0.00016645654349549643, + "loss": 1.3018, + "step": 12920 + }, + { + "epoch": 0.1679025019369992, + "grad_norm": 0.6975099444389343, + "learning_rate": 0.00016645394403358506, + "loss": 1.4839, + "step": 12921 + }, + { + "epoch": 0.16791549648091508, + "grad_norm": 0.3246796429157257, + "learning_rate": 0.00016645134457167365, + "loss": 1.4653, + "step": 12922 + }, + { + "epoch": 0.16792849102483096, + "grad_norm": 0.4402077794075012, + "learning_rate": 0.0001664487451097623, + "loss": 1.4932, + "step": 12923 + }, + { + "epoch": 0.16794148556874683, + "grad_norm": 0.3910205662250519, + "learning_rate": 0.0001664461456478509, + "loss": 1.6219, + "step": 12924 + }, + { + "epoch": 0.1679544801126627, + "grad_norm": 0.38421013951301575, + "learning_rate": 0.00016644354618593953, + "loss": 1.3568, + "step": 12925 + }, + { + "epoch": 0.16796747465657857, + "grad_norm": 0.36789801716804504, + "learning_rate": 0.00016644094672402812, + "loss": 1.3636, + "step": 12926 + }, + { + "epoch": 0.16798046920049445, + "grad_norm": 0.4123135507106781, + "learning_rate": 0.00016643834726211675, + "loss": 1.5264, + "step": 12927 + }, + { + "epoch": 0.16799346374441032, + "grad_norm": 0.4110749661922455, + "learning_rate": 0.00016643574780020537, + "loss": 1.4197, + "step": 12928 + }, + { + "epoch": 0.1680064582883262, + "grad_norm": 0.4219306409358978, + "learning_rate": 0.00016643314833829397, + "loss": 1.4166, + "step": 12929 + }, + { + "epoch": 0.16801945283224207, + "grad_norm": 0.4811237156391144, + "learning_rate": 0.0001664305488763826, + "loss": 1.5247, + "step": 12930 + }, + { + "epoch": 0.16803244737615794, + "grad_norm": 0.5712220072746277, + "learning_rate": 0.00016642794941447122, + "loss": 1.4353, + "step": 12931 + }, + { + "epoch": 0.1680454419200738, + "grad_norm": 0.3985210955142975, + "learning_rate": 0.00016642534995255982, + "loss": 1.3716, + "step": 12932 + }, + { + "epoch": 0.16805843646398969, + "grad_norm": 0.4543681740760803, + "learning_rate": 0.00016642275049064844, + "loss": 1.4218, + "step": 12933 + }, + { + "epoch": 0.16807143100790556, + "grad_norm": 0.40007272362709045, + "learning_rate": 0.00016642015102873707, + "loss": 1.3386, + "step": 12934 + }, + { + "epoch": 0.16808442555182143, + "grad_norm": 0.33320870995521545, + "learning_rate": 0.0001664175515668257, + "loss": 1.4486, + "step": 12935 + }, + { + "epoch": 0.1680974200957373, + "grad_norm": 0.33012160658836365, + "learning_rate": 0.0001664149521049143, + "loss": 1.4514, + "step": 12936 + }, + { + "epoch": 0.16811041463965318, + "grad_norm": 0.31427663564682007, + "learning_rate": 0.0001664123526430029, + "loss": 1.2062, + "step": 12937 + }, + { + "epoch": 0.16812340918356905, + "grad_norm": 0.2677564322948456, + "learning_rate": 0.00016640975318109154, + "loss": 1.4618, + "step": 12938 + }, + { + "epoch": 0.16813640372748492, + "grad_norm": 0.3974771201610565, + "learning_rate": 0.00016640715371918013, + "loss": 1.3538, + "step": 12939 + }, + { + "epoch": 0.1681493982714008, + "grad_norm": 0.4523717164993286, + "learning_rate": 0.00016640455425726876, + "loss": 1.4396, + "step": 12940 + }, + { + "epoch": 0.16816239281531667, + "grad_norm": 0.39971163868904114, + "learning_rate": 0.00016640195479535736, + "loss": 1.3753, + "step": 12941 + }, + { + "epoch": 0.16817538735923254, + "grad_norm": 0.34252262115478516, + "learning_rate": 0.000166399355333446, + "loss": 1.4618, + "step": 12942 + }, + { + "epoch": 0.16818838190314842, + "grad_norm": 0.39884689450263977, + "learning_rate": 0.0001663967558715346, + "loss": 1.3216, + "step": 12943 + }, + { + "epoch": 0.1682013764470643, + "grad_norm": 0.3307528495788574, + "learning_rate": 0.0001663941564096232, + "loss": 1.4386, + "step": 12944 + }, + { + "epoch": 0.16821437099098016, + "grad_norm": 0.39804431796073914, + "learning_rate": 0.00016639155694771183, + "loss": 1.3465, + "step": 12945 + }, + { + "epoch": 0.16822736553489603, + "grad_norm": 0.38360700011253357, + "learning_rate": 0.00016638895748580045, + "loss": 1.4103, + "step": 12946 + }, + { + "epoch": 0.1682403600788119, + "grad_norm": 0.35822516679763794, + "learning_rate": 0.00016638635802388908, + "loss": 1.3075, + "step": 12947 + }, + { + "epoch": 0.16825335462272778, + "grad_norm": 0.3822590708732605, + "learning_rate": 0.00016638375856197767, + "loss": 1.3297, + "step": 12948 + }, + { + "epoch": 0.16826634916664365, + "grad_norm": 0.37314456701278687, + "learning_rate": 0.0001663811591000663, + "loss": 1.4815, + "step": 12949 + }, + { + "epoch": 0.16827934371055953, + "grad_norm": 0.2879674732685089, + "learning_rate": 0.00016637855963815492, + "loss": 1.2061, + "step": 12950 + }, + { + "epoch": 0.1682923382544754, + "grad_norm": 0.39561727643013, + "learning_rate": 0.00016637596017624352, + "loss": 1.3816, + "step": 12951 + }, + { + "epoch": 0.16830533279839127, + "grad_norm": 0.3500477373600006, + "learning_rate": 0.00016637336071433214, + "loss": 1.3735, + "step": 12952 + }, + { + "epoch": 0.16831832734230714, + "grad_norm": 0.4114043414592743, + "learning_rate": 0.00016637076125242074, + "loss": 1.4901, + "step": 12953 + }, + { + "epoch": 0.16833132188622302, + "grad_norm": 0.3913855254650116, + "learning_rate": 0.0001663681617905094, + "loss": 1.4516, + "step": 12954 + }, + { + "epoch": 0.1683443164301389, + "grad_norm": 0.375450074672699, + "learning_rate": 0.000166365562328598, + "loss": 1.382, + "step": 12955 + }, + { + "epoch": 0.16835731097405476, + "grad_norm": 0.3838854730129242, + "learning_rate": 0.0001663629628666866, + "loss": 1.4039, + "step": 12956 + }, + { + "epoch": 0.16837030551797064, + "grad_norm": 0.3887314796447754, + "learning_rate": 0.0001663603634047752, + "loss": 1.2659, + "step": 12957 + }, + { + "epoch": 0.1683833000618865, + "grad_norm": 0.34468498826026917, + "learning_rate": 0.00016635776394286384, + "loss": 1.29, + "step": 12958 + }, + { + "epoch": 0.16839629460580238, + "grad_norm": 0.3886233866214752, + "learning_rate": 0.00016635516448095246, + "loss": 1.2587, + "step": 12959 + }, + { + "epoch": 0.16840928914971826, + "grad_norm": 0.4780377745628357, + "learning_rate": 0.00016635256501904106, + "loss": 1.3597, + "step": 12960 + }, + { + "epoch": 0.16842228369363413, + "grad_norm": 0.45188426971435547, + "learning_rate": 0.00016634996555712968, + "loss": 1.4089, + "step": 12961 + }, + { + "epoch": 0.16843527823755, + "grad_norm": 0.39710554480552673, + "learning_rate": 0.0001663473660952183, + "loss": 1.4722, + "step": 12962 + }, + { + "epoch": 0.16844827278146587, + "grad_norm": 0.3980942368507385, + "learning_rate": 0.0001663447666333069, + "loss": 1.4086, + "step": 12963 + }, + { + "epoch": 0.16846126732538175, + "grad_norm": 0.31052473187446594, + "learning_rate": 0.00016634216717139553, + "loss": 1.3962, + "step": 12964 + }, + { + "epoch": 0.16847426186929762, + "grad_norm": 0.3566180169582367, + "learning_rate": 0.00016633956770948413, + "loss": 1.444, + "step": 12965 + }, + { + "epoch": 0.1684872564132135, + "grad_norm": 0.3781813085079193, + "learning_rate": 0.00016633696824757278, + "loss": 1.5075, + "step": 12966 + }, + { + "epoch": 0.16850025095712937, + "grad_norm": 0.358777791261673, + "learning_rate": 0.00016633436878566138, + "loss": 1.343, + "step": 12967 + }, + { + "epoch": 0.16851324550104524, + "grad_norm": 0.4308653175830841, + "learning_rate": 0.00016633176932375, + "loss": 1.4658, + "step": 12968 + }, + { + "epoch": 0.1685262400449611, + "grad_norm": 0.4362732470035553, + "learning_rate": 0.0001663291698618386, + "loss": 1.4236, + "step": 12969 + }, + { + "epoch": 0.16853923458887698, + "grad_norm": 0.3547877371311188, + "learning_rate": 0.00016632657039992722, + "loss": 1.5566, + "step": 12970 + }, + { + "epoch": 0.16855222913279286, + "grad_norm": 0.4114231467247009, + "learning_rate": 0.00016632397093801585, + "loss": 1.3791, + "step": 12971 + }, + { + "epoch": 0.16856522367670873, + "grad_norm": 0.5442303419113159, + "learning_rate": 0.00016632137147610444, + "loss": 1.249, + "step": 12972 + }, + { + "epoch": 0.1685782182206246, + "grad_norm": 0.3428789973258972, + "learning_rate": 0.00016631877201419307, + "loss": 1.367, + "step": 12973 + }, + { + "epoch": 0.16859121276454048, + "grad_norm": 0.4587998390197754, + "learning_rate": 0.0001663161725522817, + "loss": 1.7032, + "step": 12974 + }, + { + "epoch": 0.16860420730845635, + "grad_norm": 0.47045060992240906, + "learning_rate": 0.0001663135730903703, + "loss": 1.3904, + "step": 12975 + }, + { + "epoch": 0.16861720185237222, + "grad_norm": 0.4513276517391205, + "learning_rate": 0.00016631097362845891, + "loss": 1.5989, + "step": 12976 + }, + { + "epoch": 0.1686301963962881, + "grad_norm": 0.3916197419166565, + "learning_rate": 0.00016630837416654754, + "loss": 1.3805, + "step": 12977 + }, + { + "epoch": 0.16864319094020397, + "grad_norm": 0.3675021827220917, + "learning_rate": 0.00016630577470463616, + "loss": 1.4053, + "step": 12978 + }, + { + "epoch": 0.16865618548411984, + "grad_norm": 0.41621658205986023, + "learning_rate": 0.00016630317524272476, + "loss": 1.4082, + "step": 12979 + }, + { + "epoch": 0.16866918002803574, + "grad_norm": 0.40411582589149475, + "learning_rate": 0.00016630057578081339, + "loss": 1.576, + "step": 12980 + }, + { + "epoch": 0.16868217457195162, + "grad_norm": 0.44377803802490234, + "learning_rate": 0.000166297976318902, + "loss": 1.4596, + "step": 12981 + }, + { + "epoch": 0.1686951691158675, + "grad_norm": 0.3872390389442444, + "learning_rate": 0.0001662953768569906, + "loss": 1.4144, + "step": 12982 + }, + { + "epoch": 0.16870816365978336, + "grad_norm": 0.37950897216796875, + "learning_rate": 0.00016629277739507923, + "loss": 1.4474, + "step": 12983 + }, + { + "epoch": 0.16872115820369923, + "grad_norm": 0.3403869867324829, + "learning_rate": 0.00016629017793316783, + "loss": 1.316, + "step": 12984 + }, + { + "epoch": 0.1687341527476151, + "grad_norm": 0.31518515944480896, + "learning_rate": 0.00016628757847125645, + "loss": 1.1081, + "step": 12985 + }, + { + "epoch": 0.16874714729153098, + "grad_norm": 0.44658586382865906, + "learning_rate": 0.00016628497900934508, + "loss": 1.453, + "step": 12986 + }, + { + "epoch": 0.16876014183544685, + "grad_norm": 0.4312993288040161, + "learning_rate": 0.00016628237954743368, + "loss": 1.5359, + "step": 12987 + }, + { + "epoch": 0.16877313637936273, + "grad_norm": 0.3020199239253998, + "learning_rate": 0.0001662797800855223, + "loss": 1.4486, + "step": 12988 + }, + { + "epoch": 0.1687861309232786, + "grad_norm": 0.36508649587631226, + "learning_rate": 0.00016627718062361092, + "loss": 1.4935, + "step": 12989 + }, + { + "epoch": 0.16879912546719447, + "grad_norm": 0.38302719593048096, + "learning_rate": 0.00016627458116169955, + "loss": 1.3812, + "step": 12990 + }, + { + "epoch": 0.16881212001111034, + "grad_norm": 0.4328066110610962, + "learning_rate": 0.00016627198169978815, + "loss": 1.3152, + "step": 12991 + }, + { + "epoch": 0.16882511455502622, + "grad_norm": 0.3908267021179199, + "learning_rate": 0.00016626938223787677, + "loss": 1.4277, + "step": 12992 + }, + { + "epoch": 0.1688381090989421, + "grad_norm": 0.37785065174102783, + "learning_rate": 0.0001662667827759654, + "loss": 1.3782, + "step": 12993 + }, + { + "epoch": 0.16885110364285796, + "grad_norm": 0.37397968769073486, + "learning_rate": 0.000166264183314054, + "loss": 1.3038, + "step": 12994 + }, + { + "epoch": 0.16886409818677384, + "grad_norm": 0.46899425983428955, + "learning_rate": 0.00016626158385214262, + "loss": 1.6436, + "step": 12995 + }, + { + "epoch": 0.1688770927306897, + "grad_norm": 0.4795193374156952, + "learning_rate": 0.00016625898439023121, + "loss": 1.5185, + "step": 12996 + }, + { + "epoch": 0.16889008727460558, + "grad_norm": 0.4219282567501068, + "learning_rate": 0.00016625638492831987, + "loss": 1.7168, + "step": 12997 + }, + { + "epoch": 0.16890308181852146, + "grad_norm": 0.4655729830265045, + "learning_rate": 0.00016625378546640846, + "loss": 1.3464, + "step": 12998 + }, + { + "epoch": 0.16891607636243733, + "grad_norm": 0.4344814121723175, + "learning_rate": 0.00016625118600449706, + "loss": 1.3381, + "step": 12999 + }, + { + "epoch": 0.1689290709063532, + "grad_norm": 0.45287421345710754, + "learning_rate": 0.00016624858654258568, + "loss": 1.6687, + "step": 13000 + }, + { + "epoch": 0.16894206545026907, + "grad_norm": 0.35197800397872925, + "learning_rate": 0.0001662459870806743, + "loss": 1.2894, + "step": 13001 + }, + { + "epoch": 0.16895505999418495, + "grad_norm": 0.3689841330051422, + "learning_rate": 0.00016624338761876293, + "loss": 1.4353, + "step": 13002 + }, + { + "epoch": 0.16896805453810082, + "grad_norm": 0.40450453758239746, + "learning_rate": 0.00016624078815685153, + "loss": 1.6761, + "step": 13003 + }, + { + "epoch": 0.1689810490820167, + "grad_norm": 0.4451766610145569, + "learning_rate": 0.00016623818869494016, + "loss": 1.5902, + "step": 13004 + }, + { + "epoch": 0.16899404362593257, + "grad_norm": 0.4453631639480591, + "learning_rate": 0.00016623558923302878, + "loss": 1.4594, + "step": 13005 + }, + { + "epoch": 0.16900703816984844, + "grad_norm": 0.4766538143157959, + "learning_rate": 0.00016623298977111738, + "loss": 1.3703, + "step": 13006 + }, + { + "epoch": 0.1690200327137643, + "grad_norm": 0.3757951557636261, + "learning_rate": 0.000166230390309206, + "loss": 1.2751, + "step": 13007 + }, + { + "epoch": 0.16903302725768019, + "grad_norm": 0.48573431372642517, + "learning_rate": 0.00016622779084729463, + "loss": 1.6351, + "step": 13008 + }, + { + "epoch": 0.16904602180159606, + "grad_norm": 0.4519944489002228, + "learning_rate": 0.00016622519138538325, + "loss": 1.4012, + "step": 13009 + }, + { + "epoch": 0.16905901634551193, + "grad_norm": 0.35359400510787964, + "learning_rate": 0.00016622259192347185, + "loss": 1.2539, + "step": 13010 + }, + { + "epoch": 0.1690720108894278, + "grad_norm": 0.4511498212814331, + "learning_rate": 0.00016621999246156045, + "loss": 1.5149, + "step": 13011 + }, + { + "epoch": 0.16908500543334368, + "grad_norm": 0.46017739176750183, + "learning_rate": 0.0001662173929996491, + "loss": 1.5946, + "step": 13012 + }, + { + "epoch": 0.16909799997725955, + "grad_norm": 0.3584032654762268, + "learning_rate": 0.0001662147935377377, + "loss": 1.4686, + "step": 13013 + }, + { + "epoch": 0.16911099452117542, + "grad_norm": 0.42846420407295227, + "learning_rate": 0.00016621219407582632, + "loss": 1.3919, + "step": 13014 + }, + { + "epoch": 0.1691239890650913, + "grad_norm": 0.43575266003608704, + "learning_rate": 0.00016620959461391492, + "loss": 1.3496, + "step": 13015 + }, + { + "epoch": 0.16913698360900717, + "grad_norm": 0.3435360789299011, + "learning_rate": 0.00016620699515200354, + "loss": 1.2168, + "step": 13016 + }, + { + "epoch": 0.16914997815292304, + "grad_norm": 0.4018901288509369, + "learning_rate": 0.00016620439569009217, + "loss": 1.5358, + "step": 13017 + }, + { + "epoch": 0.16916297269683891, + "grad_norm": 0.4270346164703369, + "learning_rate": 0.00016620179622818076, + "loss": 1.3092, + "step": 13018 + }, + { + "epoch": 0.1691759672407548, + "grad_norm": 0.35556191205978394, + "learning_rate": 0.0001661991967662694, + "loss": 1.5059, + "step": 13019 + }, + { + "epoch": 0.16918896178467066, + "grad_norm": 0.3976903557777405, + "learning_rate": 0.000166196597304358, + "loss": 1.5113, + "step": 13020 + }, + { + "epoch": 0.16920195632858653, + "grad_norm": 0.38232460618019104, + "learning_rate": 0.00016619399784244664, + "loss": 1.4498, + "step": 13021 + }, + { + "epoch": 0.1692149508725024, + "grad_norm": 0.33682799339294434, + "learning_rate": 0.00016619139838053523, + "loss": 1.4111, + "step": 13022 + }, + { + "epoch": 0.16922794541641828, + "grad_norm": 0.4582914412021637, + "learning_rate": 0.00016618879891862383, + "loss": 1.478, + "step": 13023 + }, + { + "epoch": 0.16924093996033415, + "grad_norm": 0.40171658992767334, + "learning_rate": 0.00016618619945671248, + "loss": 1.3487, + "step": 13024 + }, + { + "epoch": 0.16925393450425003, + "grad_norm": 0.4668387472629547, + "learning_rate": 0.00016618359999480108, + "loss": 1.4507, + "step": 13025 + }, + { + "epoch": 0.1692669290481659, + "grad_norm": 0.34170088171958923, + "learning_rate": 0.0001661810005328897, + "loss": 1.3844, + "step": 13026 + }, + { + "epoch": 0.16927992359208177, + "grad_norm": 0.4102608561515808, + "learning_rate": 0.0001661784010709783, + "loss": 1.2803, + "step": 13027 + }, + { + "epoch": 0.16929291813599764, + "grad_norm": 0.36769479513168335, + "learning_rate": 0.00016617580160906693, + "loss": 1.4923, + "step": 13028 + }, + { + "epoch": 0.16930591267991352, + "grad_norm": 0.42638328671455383, + "learning_rate": 0.00016617320214715555, + "loss": 1.4586, + "step": 13029 + }, + { + "epoch": 0.1693189072238294, + "grad_norm": 0.5077546238899231, + "learning_rate": 0.00016617060268524415, + "loss": 1.5017, + "step": 13030 + }, + { + "epoch": 0.16933190176774526, + "grad_norm": 0.5049060583114624, + "learning_rate": 0.00016616800322333277, + "loss": 1.4212, + "step": 13031 + }, + { + "epoch": 0.16934489631166114, + "grad_norm": 0.37264689803123474, + "learning_rate": 0.0001661654037614214, + "loss": 1.2791, + "step": 13032 + }, + { + "epoch": 0.169357890855577, + "grad_norm": 0.37991753220558167, + "learning_rate": 0.00016616280429951002, + "loss": 1.4667, + "step": 13033 + }, + { + "epoch": 0.16937088539949288, + "grad_norm": 0.42088231444358826, + "learning_rate": 0.00016616020483759862, + "loss": 1.4358, + "step": 13034 + }, + { + "epoch": 0.16938387994340875, + "grad_norm": 0.37237268686294556, + "learning_rate": 0.00016615760537568724, + "loss": 1.2656, + "step": 13035 + }, + { + "epoch": 0.16939687448732463, + "grad_norm": 0.3532378375530243, + "learning_rate": 0.00016615500591377587, + "loss": 1.1733, + "step": 13036 + }, + { + "epoch": 0.1694098690312405, + "grad_norm": 0.38380929827690125, + "learning_rate": 0.00016615240645186447, + "loss": 1.4433, + "step": 13037 + }, + { + "epoch": 0.16942286357515637, + "grad_norm": 0.4379131495952606, + "learning_rate": 0.0001661498069899531, + "loss": 1.3661, + "step": 13038 + }, + { + "epoch": 0.16943585811907225, + "grad_norm": 0.2939518690109253, + "learning_rate": 0.0001661472075280417, + "loss": 1.3203, + "step": 13039 + }, + { + "epoch": 0.16944885266298812, + "grad_norm": 0.34767088294029236, + "learning_rate": 0.0001661446080661303, + "loss": 1.3178, + "step": 13040 + }, + { + "epoch": 0.169461847206904, + "grad_norm": 0.34375861287117004, + "learning_rate": 0.00016614200860421894, + "loss": 1.1991, + "step": 13041 + }, + { + "epoch": 0.16947484175081987, + "grad_norm": 0.36901646852493286, + "learning_rate": 0.00016613940914230753, + "loss": 1.3708, + "step": 13042 + }, + { + "epoch": 0.16948783629473574, + "grad_norm": 0.4402235150337219, + "learning_rate": 0.00016613680968039616, + "loss": 1.4395, + "step": 13043 + }, + { + "epoch": 0.1695008308386516, + "grad_norm": 0.47073665261268616, + "learning_rate": 0.00016613421021848478, + "loss": 1.5272, + "step": 13044 + }, + { + "epoch": 0.16951382538256748, + "grad_norm": 0.4363754391670227, + "learning_rate": 0.0001661316107565734, + "loss": 1.4797, + "step": 13045 + }, + { + "epoch": 0.16952681992648336, + "grad_norm": 0.3937673568725586, + "learning_rate": 0.000166129011294662, + "loss": 1.2299, + "step": 13046 + }, + { + "epoch": 0.16953981447039923, + "grad_norm": 0.4259859323501587, + "learning_rate": 0.00016612641183275063, + "loss": 1.5098, + "step": 13047 + }, + { + "epoch": 0.1695528090143151, + "grad_norm": 0.4275612235069275, + "learning_rate": 0.00016612381237083925, + "loss": 1.5159, + "step": 13048 + }, + { + "epoch": 0.16956580355823098, + "grad_norm": 0.4734569489955902, + "learning_rate": 0.00016612121290892785, + "loss": 1.4805, + "step": 13049 + }, + { + "epoch": 0.16957879810214685, + "grad_norm": 0.3092706799507141, + "learning_rate": 0.00016611861344701648, + "loss": 1.1996, + "step": 13050 + }, + { + "epoch": 0.16959179264606272, + "grad_norm": 0.35904547572135925, + "learning_rate": 0.0001661160139851051, + "loss": 1.4469, + "step": 13051 + }, + { + "epoch": 0.1696047871899786, + "grad_norm": 0.4766669273376465, + "learning_rate": 0.00016611341452319372, + "loss": 1.4596, + "step": 13052 + }, + { + "epoch": 0.16961778173389447, + "grad_norm": 0.36199504137039185, + "learning_rate": 0.00016611081506128232, + "loss": 1.6569, + "step": 13053 + }, + { + "epoch": 0.16963077627781034, + "grad_norm": 0.43424689769744873, + "learning_rate": 0.00016610821559937092, + "loss": 1.6603, + "step": 13054 + }, + { + "epoch": 0.16964377082172621, + "grad_norm": 0.4443923234939575, + "learning_rate": 0.00016610561613745957, + "loss": 1.4116, + "step": 13055 + }, + { + "epoch": 0.16965676536564211, + "grad_norm": 0.40876296162605286, + "learning_rate": 0.00016610301667554817, + "loss": 1.5402, + "step": 13056 + }, + { + "epoch": 0.169669759909558, + "grad_norm": 0.4384091794490814, + "learning_rate": 0.0001661004172136368, + "loss": 1.4127, + "step": 13057 + }, + { + "epoch": 0.16968275445347386, + "grad_norm": 0.5392494201660156, + "learning_rate": 0.0001660978177517254, + "loss": 1.5602, + "step": 13058 + }, + { + "epoch": 0.16969574899738973, + "grad_norm": 0.41336789727211, + "learning_rate": 0.00016609521828981401, + "loss": 1.5778, + "step": 13059 + }, + { + "epoch": 0.1697087435413056, + "grad_norm": 0.5546835660934448, + "learning_rate": 0.00016609261882790264, + "loss": 1.6286, + "step": 13060 + }, + { + "epoch": 0.16972173808522148, + "grad_norm": 0.43889087438583374, + "learning_rate": 0.00016609001936599124, + "loss": 1.3447, + "step": 13061 + }, + { + "epoch": 0.16973473262913735, + "grad_norm": 0.4774346947669983, + "learning_rate": 0.00016608741990407986, + "loss": 1.5451, + "step": 13062 + }, + { + "epoch": 0.16974772717305323, + "grad_norm": 0.3462209105491638, + "learning_rate": 0.00016608482044216849, + "loss": 1.5146, + "step": 13063 + }, + { + "epoch": 0.1697607217169691, + "grad_norm": 0.4561622440814972, + "learning_rate": 0.0001660822209802571, + "loss": 1.5189, + "step": 13064 + }, + { + "epoch": 0.16977371626088497, + "grad_norm": 0.4071427285671234, + "learning_rate": 0.0001660796215183457, + "loss": 1.3883, + "step": 13065 + }, + { + "epoch": 0.16978671080480084, + "grad_norm": 0.3243250548839569, + "learning_rate": 0.0001660770220564343, + "loss": 1.4454, + "step": 13066 + }, + { + "epoch": 0.16979970534871672, + "grad_norm": 0.4391162097454071, + "learning_rate": 0.00016607442259452296, + "loss": 1.4878, + "step": 13067 + }, + { + "epoch": 0.1698126998926326, + "grad_norm": 0.4476023316383362, + "learning_rate": 0.00016607182313261155, + "loss": 1.5835, + "step": 13068 + }, + { + "epoch": 0.16982569443654846, + "grad_norm": 0.384269118309021, + "learning_rate": 0.00016606922367070018, + "loss": 1.4852, + "step": 13069 + }, + { + "epoch": 0.16983868898046434, + "grad_norm": 0.41587504744529724, + "learning_rate": 0.00016606662420878878, + "loss": 1.371, + "step": 13070 + }, + { + "epoch": 0.1698516835243802, + "grad_norm": 0.3882206380367279, + "learning_rate": 0.0001660640247468774, + "loss": 1.4205, + "step": 13071 + }, + { + "epoch": 0.16986467806829608, + "grad_norm": 0.4553848206996918, + "learning_rate": 0.00016606142528496602, + "loss": 1.4247, + "step": 13072 + }, + { + "epoch": 0.16987767261221196, + "grad_norm": 0.4057481586933136, + "learning_rate": 0.00016605882582305462, + "loss": 1.3769, + "step": 13073 + }, + { + "epoch": 0.16989066715612783, + "grad_norm": 0.5231629014015198, + "learning_rate": 0.00016605622636114325, + "loss": 1.5658, + "step": 13074 + }, + { + "epoch": 0.1699036617000437, + "grad_norm": 0.4660027325153351, + "learning_rate": 0.00016605362689923187, + "loss": 1.6079, + "step": 13075 + }, + { + "epoch": 0.16991665624395957, + "grad_norm": 0.40947598218917847, + "learning_rate": 0.0001660510274373205, + "loss": 1.4387, + "step": 13076 + }, + { + "epoch": 0.16992965078787545, + "grad_norm": 0.430973082780838, + "learning_rate": 0.0001660484279754091, + "loss": 1.5157, + "step": 13077 + }, + { + "epoch": 0.16994264533179132, + "grad_norm": 0.4244171679019928, + "learning_rate": 0.0001660458285134977, + "loss": 1.3798, + "step": 13078 + }, + { + "epoch": 0.1699556398757072, + "grad_norm": 0.4093726575374603, + "learning_rate": 0.00016604322905158634, + "loss": 1.3113, + "step": 13079 + }, + { + "epoch": 0.16996863441962307, + "grad_norm": 0.4701089859008789, + "learning_rate": 0.00016604062958967494, + "loss": 1.4074, + "step": 13080 + }, + { + "epoch": 0.16998162896353894, + "grad_norm": 0.394441694021225, + "learning_rate": 0.00016603803012776356, + "loss": 1.6285, + "step": 13081 + }, + { + "epoch": 0.1699946235074548, + "grad_norm": 0.3873908519744873, + "learning_rate": 0.0001660354306658522, + "loss": 1.5151, + "step": 13082 + }, + { + "epoch": 0.17000761805137068, + "grad_norm": 0.4067518413066864, + "learning_rate": 0.00016603283120394079, + "loss": 1.3673, + "step": 13083 + }, + { + "epoch": 0.17002061259528656, + "grad_norm": 0.41336995363235474, + "learning_rate": 0.0001660302317420294, + "loss": 1.5925, + "step": 13084 + }, + { + "epoch": 0.17003360713920243, + "grad_norm": 0.3967403471469879, + "learning_rate": 0.000166027632280118, + "loss": 1.4412, + "step": 13085 + }, + { + "epoch": 0.1700466016831183, + "grad_norm": 0.4271251857280731, + "learning_rate": 0.00016602503281820666, + "loss": 1.4962, + "step": 13086 + }, + { + "epoch": 0.17005959622703418, + "grad_norm": 0.3277058005332947, + "learning_rate": 0.00016602243335629526, + "loss": 1.1996, + "step": 13087 + }, + { + "epoch": 0.17007259077095005, + "grad_norm": 0.32344329357147217, + "learning_rate": 0.00016601983389438388, + "loss": 1.3144, + "step": 13088 + }, + { + "epoch": 0.17008558531486592, + "grad_norm": 0.30760663747787476, + "learning_rate": 0.00016601723443247248, + "loss": 1.3494, + "step": 13089 + }, + { + "epoch": 0.1700985798587818, + "grad_norm": 0.39171960949897766, + "learning_rate": 0.0001660146349705611, + "loss": 1.3846, + "step": 13090 + }, + { + "epoch": 0.17011157440269767, + "grad_norm": 0.37440428137779236, + "learning_rate": 0.00016601203550864973, + "loss": 1.5354, + "step": 13091 + }, + { + "epoch": 0.17012456894661354, + "grad_norm": 0.2898044288158417, + "learning_rate": 0.00016600943604673832, + "loss": 1.2046, + "step": 13092 + }, + { + "epoch": 0.17013756349052941, + "grad_norm": 0.3481447696685791, + "learning_rate": 0.00016600683658482695, + "loss": 1.4129, + "step": 13093 + }, + { + "epoch": 0.1701505580344453, + "grad_norm": 0.24416305124759674, + "learning_rate": 0.00016600423712291557, + "loss": 1.4474, + "step": 13094 + }, + { + "epoch": 0.17016355257836116, + "grad_norm": 0.3698457181453705, + "learning_rate": 0.00016600163766100417, + "loss": 1.4244, + "step": 13095 + }, + { + "epoch": 0.17017654712227703, + "grad_norm": 0.4061986207962036, + "learning_rate": 0.0001659990381990928, + "loss": 1.5318, + "step": 13096 + }, + { + "epoch": 0.1701895416661929, + "grad_norm": 0.3631649613380432, + "learning_rate": 0.0001659964387371814, + "loss": 1.3014, + "step": 13097 + }, + { + "epoch": 0.17020253621010878, + "grad_norm": 0.40322378277778625, + "learning_rate": 0.00016599383927527004, + "loss": 1.3754, + "step": 13098 + }, + { + "epoch": 0.17021553075402465, + "grad_norm": 0.3894997537136078, + "learning_rate": 0.00016599123981335864, + "loss": 1.5042, + "step": 13099 + }, + { + "epoch": 0.17022852529794053, + "grad_norm": 0.34744152426719666, + "learning_rate": 0.00016598864035144727, + "loss": 1.4292, + "step": 13100 + }, + { + "epoch": 0.1702415198418564, + "grad_norm": 0.34252381324768066, + "learning_rate": 0.00016598604088953586, + "loss": 1.5083, + "step": 13101 + }, + { + "epoch": 0.17025451438577227, + "grad_norm": 0.4321986138820648, + "learning_rate": 0.0001659834414276245, + "loss": 1.2633, + "step": 13102 + }, + { + "epoch": 0.17026750892968814, + "grad_norm": 0.5124055743217468, + "learning_rate": 0.0001659808419657131, + "loss": 1.6757, + "step": 13103 + }, + { + "epoch": 0.17028050347360402, + "grad_norm": 0.31914374232292175, + "learning_rate": 0.0001659782425038017, + "loss": 1.5143, + "step": 13104 + }, + { + "epoch": 0.1702934980175199, + "grad_norm": 0.3535726070404053, + "learning_rate": 0.00016597564304189033, + "loss": 1.5525, + "step": 13105 + }, + { + "epoch": 0.17030649256143576, + "grad_norm": 0.42167311906814575, + "learning_rate": 0.00016597304357997896, + "loss": 1.4386, + "step": 13106 + }, + { + "epoch": 0.17031948710535164, + "grad_norm": 0.41847625374794006, + "learning_rate": 0.00016597044411806756, + "loss": 1.2622, + "step": 13107 + }, + { + "epoch": 0.1703324816492675, + "grad_norm": 0.41937389969825745, + "learning_rate": 0.00016596784465615618, + "loss": 1.4325, + "step": 13108 + }, + { + "epoch": 0.17034547619318338, + "grad_norm": 0.3507000505924225, + "learning_rate": 0.00016596524519424478, + "loss": 1.4062, + "step": 13109 + }, + { + "epoch": 0.17035847073709925, + "grad_norm": 0.40511760115623474, + "learning_rate": 0.00016596264573233343, + "loss": 1.517, + "step": 13110 + }, + { + "epoch": 0.17037146528101513, + "grad_norm": 0.3216192424297333, + "learning_rate": 0.00016596004627042203, + "loss": 1.3859, + "step": 13111 + }, + { + "epoch": 0.170384459824931, + "grad_norm": 0.5736033320426941, + "learning_rate": 0.00016595744680851065, + "loss": 1.2398, + "step": 13112 + }, + { + "epoch": 0.17039745436884687, + "grad_norm": 0.3535250425338745, + "learning_rate": 0.00016595484734659925, + "loss": 1.646, + "step": 13113 + }, + { + "epoch": 0.17041044891276275, + "grad_norm": 0.41037517786026, + "learning_rate": 0.00016595224788468787, + "loss": 1.3661, + "step": 13114 + }, + { + "epoch": 0.17042344345667862, + "grad_norm": 0.35530975461006165, + "learning_rate": 0.0001659496484227765, + "loss": 1.2946, + "step": 13115 + }, + { + "epoch": 0.1704364380005945, + "grad_norm": 0.4560537338256836, + "learning_rate": 0.0001659470489608651, + "loss": 1.4758, + "step": 13116 + }, + { + "epoch": 0.17044943254451037, + "grad_norm": 0.45485448837280273, + "learning_rate": 0.00016594444949895372, + "loss": 1.4339, + "step": 13117 + }, + { + "epoch": 0.17046242708842624, + "grad_norm": 0.40731996297836304, + "learning_rate": 0.00016594185003704234, + "loss": 1.4631, + "step": 13118 + }, + { + "epoch": 0.1704754216323421, + "grad_norm": 0.40122923254966736, + "learning_rate": 0.00016593925057513097, + "loss": 1.5834, + "step": 13119 + }, + { + "epoch": 0.17048841617625798, + "grad_norm": 0.46182429790496826, + "learning_rate": 0.00016593665111321957, + "loss": 1.3853, + "step": 13120 + }, + { + "epoch": 0.17050141072017386, + "grad_norm": 0.27336835861206055, + "learning_rate": 0.0001659340516513082, + "loss": 1.3728, + "step": 13121 + }, + { + "epoch": 0.17051440526408973, + "grad_norm": 0.38571515679359436, + "learning_rate": 0.00016593145218939681, + "loss": 1.3127, + "step": 13122 + }, + { + "epoch": 0.1705273998080056, + "grad_norm": 0.36559775471687317, + "learning_rate": 0.0001659288527274854, + "loss": 1.2105, + "step": 13123 + }, + { + "epoch": 0.17054039435192148, + "grad_norm": 0.35533106327056885, + "learning_rate": 0.00016592625326557404, + "loss": 1.3879, + "step": 13124 + }, + { + "epoch": 0.17055338889583735, + "grad_norm": 0.3331126272678375, + "learning_rate": 0.00016592365380366266, + "loss": 1.5676, + "step": 13125 + }, + { + "epoch": 0.17056638343975322, + "grad_norm": 0.38350364565849304, + "learning_rate": 0.00016592105434175126, + "loss": 1.5012, + "step": 13126 + }, + { + "epoch": 0.1705793779836691, + "grad_norm": 0.29600998759269714, + "learning_rate": 0.00016591845487983988, + "loss": 1.3653, + "step": 13127 + }, + { + "epoch": 0.17059237252758497, + "grad_norm": 0.35822761058807373, + "learning_rate": 0.00016591585541792848, + "loss": 1.5574, + "step": 13128 + }, + { + "epoch": 0.17060536707150084, + "grad_norm": 0.26348254084587097, + "learning_rate": 0.00016591325595601713, + "loss": 1.376, + "step": 13129 + }, + { + "epoch": 0.1706183616154167, + "grad_norm": 0.4692091643810272, + "learning_rate": 0.00016591065649410573, + "loss": 1.4453, + "step": 13130 + }, + { + "epoch": 0.1706313561593326, + "grad_norm": 0.3583778738975525, + "learning_rate": 0.00016590805703219435, + "loss": 1.4775, + "step": 13131 + }, + { + "epoch": 0.1706443507032485, + "grad_norm": 0.40448296070098877, + "learning_rate": 0.00016590545757028295, + "loss": 1.4801, + "step": 13132 + }, + { + "epoch": 0.17065734524716436, + "grad_norm": 0.37129494547843933, + "learning_rate": 0.00016590285810837158, + "loss": 1.3942, + "step": 13133 + }, + { + "epoch": 0.17067033979108023, + "grad_norm": 0.38922998309135437, + "learning_rate": 0.0001659002586464602, + "loss": 1.4409, + "step": 13134 + }, + { + "epoch": 0.1706833343349961, + "grad_norm": 0.4431775212287903, + "learning_rate": 0.0001658976591845488, + "loss": 1.4489, + "step": 13135 + }, + { + "epoch": 0.17069632887891198, + "grad_norm": 0.3151502311229706, + "learning_rate": 0.00016589505972263742, + "loss": 1.1253, + "step": 13136 + }, + { + "epoch": 0.17070932342282785, + "grad_norm": 0.4834553301334381, + "learning_rate": 0.00016589246026072605, + "loss": 1.4064, + "step": 13137 + }, + { + "epoch": 0.17072231796674373, + "grad_norm": 0.4206945300102234, + "learning_rate": 0.00016588986079881464, + "loss": 1.601, + "step": 13138 + }, + { + "epoch": 0.1707353125106596, + "grad_norm": 0.3786860406398773, + "learning_rate": 0.00016588726133690327, + "loss": 1.3215, + "step": 13139 + }, + { + "epoch": 0.17074830705457547, + "grad_norm": 0.44053781032562256, + "learning_rate": 0.00016588466187499187, + "loss": 1.3995, + "step": 13140 + }, + { + "epoch": 0.17076130159849134, + "grad_norm": 0.4013817310333252, + "learning_rate": 0.00016588206241308052, + "loss": 1.6734, + "step": 13141 + }, + { + "epoch": 0.17077429614240722, + "grad_norm": 0.30979812145233154, + "learning_rate": 0.00016587946295116911, + "loss": 1.3133, + "step": 13142 + }, + { + "epoch": 0.1707872906863231, + "grad_norm": 0.399179071187973, + "learning_rate": 0.00016587686348925774, + "loss": 1.5437, + "step": 13143 + }, + { + "epoch": 0.17080028523023896, + "grad_norm": 0.45547887682914734, + "learning_rate": 0.00016587426402734634, + "loss": 1.5763, + "step": 13144 + }, + { + "epoch": 0.17081327977415484, + "grad_norm": 0.33724868297576904, + "learning_rate": 0.00016587166456543496, + "loss": 1.575, + "step": 13145 + }, + { + "epoch": 0.1708262743180707, + "grad_norm": 0.281260222196579, + "learning_rate": 0.00016586906510352359, + "loss": 1.3803, + "step": 13146 + }, + { + "epoch": 0.17083926886198658, + "grad_norm": 0.46044909954071045, + "learning_rate": 0.00016586646564161218, + "loss": 1.5038, + "step": 13147 + }, + { + "epoch": 0.17085226340590245, + "grad_norm": 0.44409069418907166, + "learning_rate": 0.0001658638661797008, + "loss": 1.4848, + "step": 13148 + }, + { + "epoch": 0.17086525794981833, + "grad_norm": 0.41668617725372314, + "learning_rate": 0.00016586126671778943, + "loss": 1.4433, + "step": 13149 + }, + { + "epoch": 0.1708782524937342, + "grad_norm": 0.36708563566207886, + "learning_rate": 0.00016585866725587803, + "loss": 1.517, + "step": 13150 + }, + { + "epoch": 0.17089124703765007, + "grad_norm": 0.49574822187423706, + "learning_rate": 0.00016585606779396665, + "loss": 1.51, + "step": 13151 + }, + { + "epoch": 0.17090424158156595, + "grad_norm": 0.4861152470111847, + "learning_rate": 0.00016585346833205525, + "loss": 1.4301, + "step": 13152 + }, + { + "epoch": 0.17091723612548182, + "grad_norm": 0.3129928708076477, + "learning_rate": 0.0001658508688701439, + "loss": 1.4281, + "step": 13153 + }, + { + "epoch": 0.1709302306693977, + "grad_norm": 0.4674815237522125, + "learning_rate": 0.0001658482694082325, + "loss": 1.3359, + "step": 13154 + }, + { + "epoch": 0.17094322521331357, + "grad_norm": 0.43875250220298767, + "learning_rate": 0.00016584566994632112, + "loss": 1.3378, + "step": 13155 + }, + { + "epoch": 0.17095621975722944, + "grad_norm": 0.32091060280799866, + "learning_rate": 0.00016584307048440975, + "loss": 1.4266, + "step": 13156 + }, + { + "epoch": 0.1709692143011453, + "grad_norm": 0.33862367272377014, + "learning_rate": 0.00016584047102249835, + "loss": 1.2224, + "step": 13157 + }, + { + "epoch": 0.17098220884506118, + "grad_norm": 0.42534947395324707, + "learning_rate": 0.00016583787156058697, + "loss": 1.4238, + "step": 13158 + }, + { + "epoch": 0.17099520338897706, + "grad_norm": 0.3321935534477234, + "learning_rate": 0.00016583527209867557, + "loss": 1.34, + "step": 13159 + }, + { + "epoch": 0.17100819793289293, + "grad_norm": 0.45860472321510315, + "learning_rate": 0.00016583267263676422, + "loss": 1.5475, + "step": 13160 + }, + { + "epoch": 0.1710211924768088, + "grad_norm": 0.42171356081962585, + "learning_rate": 0.00016583007317485282, + "loss": 1.2746, + "step": 13161 + }, + { + "epoch": 0.17103418702072468, + "grad_norm": 0.3615332543849945, + "learning_rate": 0.00016582747371294141, + "loss": 1.5291, + "step": 13162 + }, + { + "epoch": 0.17104718156464055, + "grad_norm": 0.3345520794391632, + "learning_rate": 0.00016582487425103004, + "loss": 1.2914, + "step": 13163 + }, + { + "epoch": 0.17106017610855642, + "grad_norm": 0.36885762214660645, + "learning_rate": 0.00016582227478911866, + "loss": 1.4094, + "step": 13164 + }, + { + "epoch": 0.1710731706524723, + "grad_norm": 0.40950146317481995, + "learning_rate": 0.0001658196753272073, + "loss": 1.3136, + "step": 13165 + }, + { + "epoch": 0.17108616519638817, + "grad_norm": 0.4001840054988861, + "learning_rate": 0.00016581707586529589, + "loss": 1.4957, + "step": 13166 + }, + { + "epoch": 0.17109915974030404, + "grad_norm": 0.361457884311676, + "learning_rate": 0.0001658144764033845, + "loss": 1.6166, + "step": 13167 + }, + { + "epoch": 0.17111215428421991, + "grad_norm": 0.3711101710796356, + "learning_rate": 0.00016581187694147313, + "loss": 1.6462, + "step": 13168 + }, + { + "epoch": 0.1711251488281358, + "grad_norm": 0.4070250391960144, + "learning_rate": 0.00016580927747956173, + "loss": 1.3827, + "step": 13169 + }, + { + "epoch": 0.17113814337205166, + "grad_norm": 0.35707810521125793, + "learning_rate": 0.00016580667801765036, + "loss": 1.4192, + "step": 13170 + }, + { + "epoch": 0.17115113791596753, + "grad_norm": 0.5058854818344116, + "learning_rate": 0.00016580407855573895, + "loss": 1.5332, + "step": 13171 + }, + { + "epoch": 0.1711641324598834, + "grad_norm": 0.40400612354278564, + "learning_rate": 0.0001658014790938276, + "loss": 1.5948, + "step": 13172 + }, + { + "epoch": 0.17117712700379928, + "grad_norm": 0.4434622526168823, + "learning_rate": 0.0001657988796319162, + "loss": 1.4775, + "step": 13173 + }, + { + "epoch": 0.17119012154771515, + "grad_norm": 0.3293512165546417, + "learning_rate": 0.00016579628017000483, + "loss": 1.4274, + "step": 13174 + }, + { + "epoch": 0.17120311609163102, + "grad_norm": 0.3103547692298889, + "learning_rate": 0.00016579368070809342, + "loss": 1.4477, + "step": 13175 + }, + { + "epoch": 0.1712161106355469, + "grad_norm": 0.4046972692012787, + "learning_rate": 0.00016579108124618205, + "loss": 1.3526, + "step": 13176 + }, + { + "epoch": 0.17122910517946277, + "grad_norm": 0.35560137033462524, + "learning_rate": 0.00016578848178427067, + "loss": 1.5781, + "step": 13177 + }, + { + "epoch": 0.17124209972337864, + "grad_norm": 0.44884198904037476, + "learning_rate": 0.00016578588232235927, + "loss": 1.4761, + "step": 13178 + }, + { + "epoch": 0.17125509426729452, + "grad_norm": 0.42357537150382996, + "learning_rate": 0.0001657832828604479, + "loss": 1.5649, + "step": 13179 + }, + { + "epoch": 0.1712680888112104, + "grad_norm": 0.35290294885635376, + "learning_rate": 0.00016578068339853652, + "loss": 1.2905, + "step": 13180 + }, + { + "epoch": 0.17128108335512626, + "grad_norm": 0.37031111121177673, + "learning_rate": 0.00016577808393662512, + "loss": 1.5097, + "step": 13181 + }, + { + "epoch": 0.17129407789904214, + "grad_norm": 0.5222713947296143, + "learning_rate": 0.00016577548447471374, + "loss": 1.4259, + "step": 13182 + }, + { + "epoch": 0.171307072442958, + "grad_norm": 0.3635243773460388, + "learning_rate": 0.00016577288501280234, + "loss": 1.3436, + "step": 13183 + }, + { + "epoch": 0.17132006698687388, + "grad_norm": 0.4176403284072876, + "learning_rate": 0.000165770285550891, + "loss": 1.5, + "step": 13184 + }, + { + "epoch": 0.17133306153078975, + "grad_norm": 0.4752853214740753, + "learning_rate": 0.0001657676860889796, + "loss": 1.3882, + "step": 13185 + }, + { + "epoch": 0.17134605607470563, + "grad_norm": 0.40398016571998596, + "learning_rate": 0.0001657650866270682, + "loss": 1.403, + "step": 13186 + }, + { + "epoch": 0.1713590506186215, + "grad_norm": 0.3192632496356964, + "learning_rate": 0.0001657624871651568, + "loss": 1.2473, + "step": 13187 + }, + { + "epoch": 0.17137204516253737, + "grad_norm": 0.40304452180862427, + "learning_rate": 0.00016575988770324543, + "loss": 1.2444, + "step": 13188 + }, + { + "epoch": 0.17138503970645325, + "grad_norm": 0.4249211847782135, + "learning_rate": 0.00016575728824133406, + "loss": 1.4169, + "step": 13189 + }, + { + "epoch": 0.17139803425036912, + "grad_norm": 0.309519499540329, + "learning_rate": 0.00016575468877942266, + "loss": 1.2616, + "step": 13190 + }, + { + "epoch": 0.171411028794285, + "grad_norm": 0.41727471351623535, + "learning_rate": 0.00016575208931751128, + "loss": 1.3114, + "step": 13191 + }, + { + "epoch": 0.17142402333820086, + "grad_norm": 0.4680091142654419, + "learning_rate": 0.0001657494898555999, + "loss": 1.3036, + "step": 13192 + }, + { + "epoch": 0.17143701788211674, + "grad_norm": 0.5041329264640808, + "learning_rate": 0.0001657468903936885, + "loss": 1.3392, + "step": 13193 + }, + { + "epoch": 0.1714500124260326, + "grad_norm": 0.36246755719184875, + "learning_rate": 0.00016574429093177713, + "loss": 1.5403, + "step": 13194 + }, + { + "epoch": 0.17146300696994848, + "grad_norm": 0.36042118072509766, + "learning_rate": 0.00016574169146986575, + "loss": 1.4594, + "step": 13195 + }, + { + "epoch": 0.17147600151386436, + "grad_norm": 0.4207163453102112, + "learning_rate": 0.00016573909200795438, + "loss": 1.556, + "step": 13196 + }, + { + "epoch": 0.17148899605778023, + "grad_norm": 0.48187360167503357, + "learning_rate": 0.00016573649254604297, + "loss": 1.6193, + "step": 13197 + }, + { + "epoch": 0.1715019906016961, + "grad_norm": 0.28130674362182617, + "learning_rate": 0.0001657338930841316, + "loss": 1.4549, + "step": 13198 + }, + { + "epoch": 0.17151498514561198, + "grad_norm": 0.32879185676574707, + "learning_rate": 0.00016573129362222022, + "loss": 1.3952, + "step": 13199 + }, + { + "epoch": 0.17152797968952785, + "grad_norm": 0.43828660249710083, + "learning_rate": 0.00016572869416030882, + "loss": 1.7164, + "step": 13200 + }, + { + "epoch": 0.17154097423344372, + "grad_norm": 0.2906350791454315, + "learning_rate": 0.00016572609469839744, + "loss": 1.4835, + "step": 13201 + }, + { + "epoch": 0.1715539687773596, + "grad_norm": 0.354322612285614, + "learning_rate": 0.00016572349523648604, + "loss": 1.4717, + "step": 13202 + }, + { + "epoch": 0.17156696332127547, + "grad_norm": 0.4343889057636261, + "learning_rate": 0.0001657208957745747, + "loss": 1.5283, + "step": 13203 + }, + { + "epoch": 0.17157995786519134, + "grad_norm": 0.40784355998039246, + "learning_rate": 0.0001657182963126633, + "loss": 1.2878, + "step": 13204 + }, + { + "epoch": 0.1715929524091072, + "grad_norm": 0.4645921289920807, + "learning_rate": 0.0001657156968507519, + "loss": 1.3383, + "step": 13205 + }, + { + "epoch": 0.1716059469530231, + "grad_norm": 0.24410507082939148, + "learning_rate": 0.0001657130973888405, + "loss": 1.3735, + "step": 13206 + }, + { + "epoch": 0.17161894149693896, + "grad_norm": 0.25429943203926086, + "learning_rate": 0.00016571049792692914, + "loss": 1.2981, + "step": 13207 + }, + { + "epoch": 0.17163193604085483, + "grad_norm": 0.4111470878124237, + "learning_rate": 0.00016570789846501776, + "loss": 1.5104, + "step": 13208 + }, + { + "epoch": 0.17164493058477073, + "grad_norm": 0.43327146768569946, + "learning_rate": 0.00016570529900310636, + "loss": 1.5432, + "step": 13209 + }, + { + "epoch": 0.1716579251286866, + "grad_norm": 0.39463886618614197, + "learning_rate": 0.00016570269954119498, + "loss": 1.3528, + "step": 13210 + }, + { + "epoch": 0.17167091967260248, + "grad_norm": 0.35991427302360535, + "learning_rate": 0.0001657001000792836, + "loss": 1.441, + "step": 13211 + }, + { + "epoch": 0.17168391421651835, + "grad_norm": 0.232219859957695, + "learning_rate": 0.0001656975006173722, + "loss": 1.2687, + "step": 13212 + }, + { + "epoch": 0.17169690876043422, + "grad_norm": 0.43521979451179504, + "learning_rate": 0.00016569490115546083, + "loss": 1.4167, + "step": 13213 + }, + { + "epoch": 0.1717099033043501, + "grad_norm": 0.33573973178863525, + "learning_rate": 0.00016569230169354943, + "loss": 1.3387, + "step": 13214 + }, + { + "epoch": 0.17172289784826597, + "grad_norm": 0.3633989989757538, + "learning_rate": 0.00016568970223163808, + "loss": 1.3057, + "step": 13215 + }, + { + "epoch": 0.17173589239218184, + "grad_norm": 0.39607352018356323, + "learning_rate": 0.00016568710276972668, + "loss": 1.3566, + "step": 13216 + }, + { + "epoch": 0.17174888693609772, + "grad_norm": 0.5490531325340271, + "learning_rate": 0.00016568450330781527, + "loss": 1.4677, + "step": 13217 + }, + { + "epoch": 0.1717618814800136, + "grad_norm": 0.4028678238391876, + "learning_rate": 0.0001656819038459039, + "loss": 1.4905, + "step": 13218 + }, + { + "epoch": 0.17177487602392946, + "grad_norm": 0.3589508533477783, + "learning_rate": 0.00016567930438399252, + "loss": 1.5017, + "step": 13219 + }, + { + "epoch": 0.17178787056784534, + "grad_norm": 0.32853490114212036, + "learning_rate": 0.00016567670492208115, + "loss": 1.3615, + "step": 13220 + }, + { + "epoch": 0.1718008651117612, + "grad_norm": 0.38775429129600525, + "learning_rate": 0.00016567410546016974, + "loss": 1.3167, + "step": 13221 + }, + { + "epoch": 0.17181385965567708, + "grad_norm": 0.3832327723503113, + "learning_rate": 0.00016567150599825837, + "loss": 1.518, + "step": 13222 + }, + { + "epoch": 0.17182685419959295, + "grad_norm": 0.47670498490333557, + "learning_rate": 0.000165668906536347, + "loss": 1.3783, + "step": 13223 + }, + { + "epoch": 0.17183984874350883, + "grad_norm": 0.46496906876564026, + "learning_rate": 0.0001656663070744356, + "loss": 1.2044, + "step": 13224 + }, + { + "epoch": 0.1718528432874247, + "grad_norm": 0.434190034866333, + "learning_rate": 0.00016566370761252422, + "loss": 1.4648, + "step": 13225 + }, + { + "epoch": 0.17186583783134057, + "grad_norm": 0.4518909156322479, + "learning_rate": 0.0001656611081506128, + "loss": 1.4685, + "step": 13226 + }, + { + "epoch": 0.17187883237525645, + "grad_norm": 0.4945921003818512, + "learning_rate": 0.00016565850868870146, + "loss": 1.5399, + "step": 13227 + }, + { + "epoch": 0.17189182691917232, + "grad_norm": 0.3644948899745941, + "learning_rate": 0.00016565590922679006, + "loss": 1.3921, + "step": 13228 + }, + { + "epoch": 0.1719048214630882, + "grad_norm": 0.49629640579223633, + "learning_rate": 0.00016565330976487866, + "loss": 1.3186, + "step": 13229 + }, + { + "epoch": 0.17191781600700407, + "grad_norm": 0.24035559594631195, + "learning_rate": 0.0001656507103029673, + "loss": 1.1132, + "step": 13230 + }, + { + "epoch": 0.17193081055091994, + "grad_norm": 0.4147299826145172, + "learning_rate": 0.0001656481108410559, + "loss": 1.4242, + "step": 13231 + }, + { + "epoch": 0.1719438050948358, + "grad_norm": 0.45291200280189514, + "learning_rate": 0.00016564551137914453, + "loss": 1.5819, + "step": 13232 + }, + { + "epoch": 0.17195679963875168, + "grad_norm": 0.42972901463508606, + "learning_rate": 0.00016564291191723313, + "loss": 1.446, + "step": 13233 + }, + { + "epoch": 0.17196979418266756, + "grad_norm": 0.47062039375305176, + "learning_rate": 0.00016564031245532175, + "loss": 1.6251, + "step": 13234 + }, + { + "epoch": 0.17198278872658343, + "grad_norm": 0.33339330554008484, + "learning_rate": 0.00016563771299341038, + "loss": 1.3773, + "step": 13235 + }, + { + "epoch": 0.1719957832704993, + "grad_norm": 0.3569968640804291, + "learning_rate": 0.00016563511353149898, + "loss": 1.5002, + "step": 13236 + }, + { + "epoch": 0.17200877781441518, + "grad_norm": 0.3116925060749054, + "learning_rate": 0.0001656325140695876, + "loss": 1.4727, + "step": 13237 + }, + { + "epoch": 0.17202177235833105, + "grad_norm": 0.40232232213020325, + "learning_rate": 0.00016562991460767623, + "loss": 1.5291, + "step": 13238 + }, + { + "epoch": 0.17203476690224692, + "grad_norm": 0.3872472941875458, + "learning_rate": 0.00016562731514576485, + "loss": 1.5817, + "step": 13239 + }, + { + "epoch": 0.1720477614461628, + "grad_norm": 0.4164724051952362, + "learning_rate": 0.00016562471568385345, + "loss": 1.4418, + "step": 13240 + }, + { + "epoch": 0.17206075599007867, + "grad_norm": 0.3428027331829071, + "learning_rate": 0.00016562211622194207, + "loss": 1.3743, + "step": 13241 + }, + { + "epoch": 0.17207375053399454, + "grad_norm": 0.3754356801509857, + "learning_rate": 0.0001656195167600307, + "loss": 1.2897, + "step": 13242 + }, + { + "epoch": 0.1720867450779104, + "grad_norm": 0.43637245893478394, + "learning_rate": 0.0001656169172981193, + "loss": 1.374, + "step": 13243 + }, + { + "epoch": 0.1720997396218263, + "grad_norm": 0.4943617284297943, + "learning_rate": 0.00016561431783620792, + "loss": 1.3587, + "step": 13244 + }, + { + "epoch": 0.17211273416574216, + "grad_norm": 0.3823917508125305, + "learning_rate": 0.00016561171837429652, + "loss": 1.3661, + "step": 13245 + }, + { + "epoch": 0.17212572870965803, + "grad_norm": 0.3014433681964874, + "learning_rate": 0.00016560911891238514, + "loss": 1.4207, + "step": 13246 + }, + { + "epoch": 0.1721387232535739, + "grad_norm": 0.43272411823272705, + "learning_rate": 0.00016560651945047376, + "loss": 1.3471, + "step": 13247 + }, + { + "epoch": 0.17215171779748978, + "grad_norm": 0.4171760082244873, + "learning_rate": 0.00016560391998856236, + "loss": 1.4447, + "step": 13248 + }, + { + "epoch": 0.17216471234140565, + "grad_norm": 0.44503775238990784, + "learning_rate": 0.00016560132052665099, + "loss": 1.3682, + "step": 13249 + }, + { + "epoch": 0.17217770688532152, + "grad_norm": 0.392220139503479, + "learning_rate": 0.0001655987210647396, + "loss": 1.3014, + "step": 13250 + }, + { + "epoch": 0.1721907014292374, + "grad_norm": 0.44181498885154724, + "learning_rate": 0.00016559612160282824, + "loss": 1.4563, + "step": 13251 + }, + { + "epoch": 0.17220369597315327, + "grad_norm": 0.3580576181411743, + "learning_rate": 0.00016559352214091683, + "loss": 1.4462, + "step": 13252 + }, + { + "epoch": 0.17221669051706914, + "grad_norm": 0.44580137729644775, + "learning_rate": 0.00016559092267900546, + "loss": 1.316, + "step": 13253 + }, + { + "epoch": 0.17222968506098502, + "grad_norm": 0.40787971019744873, + "learning_rate": 0.00016558832321709408, + "loss": 1.3787, + "step": 13254 + }, + { + "epoch": 0.1722426796049009, + "grad_norm": 0.2945028245449066, + "learning_rate": 0.00016558572375518268, + "loss": 1.1279, + "step": 13255 + }, + { + "epoch": 0.17225567414881676, + "grad_norm": 0.4573705196380615, + "learning_rate": 0.0001655831242932713, + "loss": 1.5413, + "step": 13256 + }, + { + "epoch": 0.17226866869273263, + "grad_norm": 0.35718896985054016, + "learning_rate": 0.0001655805248313599, + "loss": 1.4945, + "step": 13257 + }, + { + "epoch": 0.1722816632366485, + "grad_norm": 0.35502755641937256, + "learning_rate": 0.00016557792536944855, + "loss": 1.3139, + "step": 13258 + }, + { + "epoch": 0.17229465778056438, + "grad_norm": 0.4150936007499695, + "learning_rate": 0.00016557532590753715, + "loss": 1.3093, + "step": 13259 + }, + { + "epoch": 0.17230765232448025, + "grad_norm": 0.4627138078212738, + "learning_rate": 0.00016557272644562575, + "loss": 1.4308, + "step": 13260 + }, + { + "epoch": 0.17232064686839613, + "grad_norm": 0.4367901086807251, + "learning_rate": 0.00016557012698371437, + "loss": 1.6893, + "step": 13261 + }, + { + "epoch": 0.172333641412312, + "grad_norm": 0.36571958661079407, + "learning_rate": 0.000165567527521803, + "loss": 1.1728, + "step": 13262 + }, + { + "epoch": 0.17234663595622787, + "grad_norm": 0.4008215069770813, + "learning_rate": 0.00016556492805989162, + "loss": 1.2705, + "step": 13263 + }, + { + "epoch": 0.17235963050014375, + "grad_norm": 0.2850292921066284, + "learning_rate": 0.00016556232859798022, + "loss": 1.3028, + "step": 13264 + }, + { + "epoch": 0.17237262504405962, + "grad_norm": 0.29150980710983276, + "learning_rate": 0.00016555972913606884, + "loss": 1.4276, + "step": 13265 + }, + { + "epoch": 0.1723856195879755, + "grad_norm": 0.4610038995742798, + "learning_rate": 0.00016555712967415747, + "loss": 1.2558, + "step": 13266 + }, + { + "epoch": 0.17239861413189136, + "grad_norm": 0.37384361028671265, + "learning_rate": 0.00016555453021224606, + "loss": 1.6667, + "step": 13267 + }, + { + "epoch": 0.17241160867580724, + "grad_norm": 0.30299925804138184, + "learning_rate": 0.0001655519307503347, + "loss": 1.3843, + "step": 13268 + }, + { + "epoch": 0.1724246032197231, + "grad_norm": 0.4450954496860504, + "learning_rate": 0.0001655493312884233, + "loss": 1.3142, + "step": 13269 + }, + { + "epoch": 0.17243759776363898, + "grad_norm": 0.4767892062664032, + "learning_rate": 0.00016554673182651194, + "loss": 1.5149, + "step": 13270 + }, + { + "epoch": 0.17245059230755486, + "grad_norm": 0.3988877236843109, + "learning_rate": 0.00016554413236460053, + "loss": 1.3396, + "step": 13271 + }, + { + "epoch": 0.17246358685147073, + "grad_norm": 0.45012447237968445, + "learning_rate": 0.00016554153290268913, + "loss": 1.4525, + "step": 13272 + }, + { + "epoch": 0.1724765813953866, + "grad_norm": 0.4664531648159027, + "learning_rate": 0.00016553893344077778, + "loss": 1.4385, + "step": 13273 + }, + { + "epoch": 0.17248957593930248, + "grad_norm": 0.45344552397727966, + "learning_rate": 0.00016553633397886638, + "loss": 1.4004, + "step": 13274 + }, + { + "epoch": 0.17250257048321835, + "grad_norm": 0.4349400997161865, + "learning_rate": 0.000165533734516955, + "loss": 1.3669, + "step": 13275 + }, + { + "epoch": 0.17251556502713422, + "grad_norm": 0.4356987774372101, + "learning_rate": 0.0001655311350550436, + "loss": 1.4662, + "step": 13276 + }, + { + "epoch": 0.1725285595710501, + "grad_norm": 0.4695001542568207, + "learning_rate": 0.00016552853559313223, + "loss": 1.4871, + "step": 13277 + }, + { + "epoch": 0.17254155411496597, + "grad_norm": 0.5414415001869202, + "learning_rate": 0.00016552593613122085, + "loss": 1.5076, + "step": 13278 + }, + { + "epoch": 0.17255454865888184, + "grad_norm": 0.39165061712265015, + "learning_rate": 0.00016552333666930945, + "loss": 1.6107, + "step": 13279 + }, + { + "epoch": 0.1725675432027977, + "grad_norm": 0.5397108197212219, + "learning_rate": 0.00016552073720739807, + "loss": 1.3159, + "step": 13280 + }, + { + "epoch": 0.17258053774671359, + "grad_norm": 0.5476521253585815, + "learning_rate": 0.0001655181377454867, + "loss": 1.3692, + "step": 13281 + }, + { + "epoch": 0.17259353229062946, + "grad_norm": 0.40746042132377625, + "learning_rate": 0.00016551553828357532, + "loss": 1.3033, + "step": 13282 + }, + { + "epoch": 0.17260652683454533, + "grad_norm": 0.39313873648643494, + "learning_rate": 0.00016551293882166392, + "loss": 1.4005, + "step": 13283 + }, + { + "epoch": 0.1726195213784612, + "grad_norm": 0.3358429968357086, + "learning_rate": 0.00016551033935975252, + "loss": 1.4321, + "step": 13284 + }, + { + "epoch": 0.1726325159223771, + "grad_norm": 0.322401225566864, + "learning_rate": 0.00016550773989784117, + "loss": 1.5043, + "step": 13285 + }, + { + "epoch": 0.17264551046629298, + "grad_norm": 0.4391809403896332, + "learning_rate": 0.00016550514043592977, + "loss": 1.4891, + "step": 13286 + }, + { + "epoch": 0.17265850501020885, + "grad_norm": 0.44552457332611084, + "learning_rate": 0.0001655025409740184, + "loss": 1.4174, + "step": 13287 + }, + { + "epoch": 0.17267149955412472, + "grad_norm": 0.43498408794403076, + "learning_rate": 0.000165499941512107, + "loss": 1.4961, + "step": 13288 + }, + { + "epoch": 0.1726844940980406, + "grad_norm": 0.3670695126056671, + "learning_rate": 0.0001654973420501956, + "loss": 1.3749, + "step": 13289 + }, + { + "epoch": 0.17269748864195647, + "grad_norm": 0.38664329051971436, + "learning_rate": 0.00016549474258828424, + "loss": 1.4293, + "step": 13290 + }, + { + "epoch": 0.17271048318587234, + "grad_norm": 0.41639307141304016, + "learning_rate": 0.00016549214312637283, + "loss": 1.3655, + "step": 13291 + }, + { + "epoch": 0.17272347772978822, + "grad_norm": 0.27676376700401306, + "learning_rate": 0.00016548954366446146, + "loss": 1.2425, + "step": 13292 + }, + { + "epoch": 0.1727364722737041, + "grad_norm": 0.34870126843452454, + "learning_rate": 0.00016548694420255008, + "loss": 1.3445, + "step": 13293 + }, + { + "epoch": 0.17274946681761996, + "grad_norm": 0.40599924325942993, + "learning_rate": 0.0001654843447406387, + "loss": 1.422, + "step": 13294 + }, + { + "epoch": 0.17276246136153584, + "grad_norm": 0.3603878915309906, + "learning_rate": 0.0001654817452787273, + "loss": 1.5414, + "step": 13295 + }, + { + "epoch": 0.1727754559054517, + "grad_norm": 0.37636372447013855, + "learning_rate": 0.00016547914581681593, + "loss": 1.4838, + "step": 13296 + }, + { + "epoch": 0.17278845044936758, + "grad_norm": 0.3460656702518463, + "learning_rate": 0.00016547654635490455, + "loss": 1.3337, + "step": 13297 + }, + { + "epoch": 0.17280144499328345, + "grad_norm": 0.45613473653793335, + "learning_rate": 0.00016547394689299315, + "loss": 1.5599, + "step": 13298 + }, + { + "epoch": 0.17281443953719933, + "grad_norm": 0.36926427483558655, + "learning_rate": 0.00016547134743108178, + "loss": 1.4375, + "step": 13299 + }, + { + "epoch": 0.1728274340811152, + "grad_norm": 0.37286102771759033, + "learning_rate": 0.00016546874796917037, + "loss": 1.1608, + "step": 13300 + }, + { + "epoch": 0.17284042862503107, + "grad_norm": 0.3439924120903015, + "learning_rate": 0.000165466148507259, + "loss": 1.2277, + "step": 13301 + }, + { + "epoch": 0.17285342316894695, + "grad_norm": 0.4017575681209564, + "learning_rate": 0.00016546354904534762, + "loss": 1.5146, + "step": 13302 + }, + { + "epoch": 0.17286641771286282, + "grad_norm": 0.3311365842819214, + "learning_rate": 0.00016546094958343622, + "loss": 1.626, + "step": 13303 + }, + { + "epoch": 0.1728794122567787, + "grad_norm": 0.4362022578716278, + "learning_rate": 0.00016545835012152487, + "loss": 1.4609, + "step": 13304 + }, + { + "epoch": 0.17289240680069456, + "grad_norm": 0.4214492738246918, + "learning_rate": 0.00016545575065961347, + "loss": 1.4682, + "step": 13305 + }, + { + "epoch": 0.17290540134461044, + "grad_norm": 0.4576879143714905, + "learning_rate": 0.0001654531511977021, + "loss": 1.3129, + "step": 13306 + }, + { + "epoch": 0.1729183958885263, + "grad_norm": 0.422690212726593, + "learning_rate": 0.0001654505517357907, + "loss": 1.5334, + "step": 13307 + }, + { + "epoch": 0.17293139043244218, + "grad_norm": 0.32679685950279236, + "learning_rate": 0.00016544795227387932, + "loss": 1.2894, + "step": 13308 + }, + { + "epoch": 0.17294438497635806, + "grad_norm": 0.32620567083358765, + "learning_rate": 0.00016544535281196794, + "loss": 1.2126, + "step": 13309 + }, + { + "epoch": 0.17295737952027393, + "grad_norm": 0.46092817187309265, + "learning_rate": 0.00016544275335005654, + "loss": 1.4209, + "step": 13310 + }, + { + "epoch": 0.1729703740641898, + "grad_norm": 0.3583175241947174, + "learning_rate": 0.00016544015388814516, + "loss": 1.3962, + "step": 13311 + }, + { + "epoch": 0.17298336860810568, + "grad_norm": 0.31701523065567017, + "learning_rate": 0.0001654375544262338, + "loss": 1.1837, + "step": 13312 + }, + { + "epoch": 0.17299636315202155, + "grad_norm": 0.4170938730239868, + "learning_rate": 0.00016543495496432238, + "loss": 1.3438, + "step": 13313 + }, + { + "epoch": 0.17300935769593742, + "grad_norm": 0.3459280729293823, + "learning_rate": 0.000165432355502411, + "loss": 1.4976, + "step": 13314 + }, + { + "epoch": 0.1730223522398533, + "grad_norm": 0.4367409944534302, + "learning_rate": 0.0001654297560404996, + "loss": 1.4108, + "step": 13315 + }, + { + "epoch": 0.17303534678376917, + "grad_norm": 0.399282306432724, + "learning_rate": 0.00016542715657858826, + "loss": 1.6142, + "step": 13316 + }, + { + "epoch": 0.17304834132768504, + "grad_norm": 0.3932700455188751, + "learning_rate": 0.00016542455711667685, + "loss": 1.3773, + "step": 13317 + }, + { + "epoch": 0.1730613358716009, + "grad_norm": 0.41390523314476013, + "learning_rate": 0.00016542195765476548, + "loss": 1.3757, + "step": 13318 + }, + { + "epoch": 0.17307433041551679, + "grad_norm": 0.48950520157814026, + "learning_rate": 0.00016541935819285408, + "loss": 1.5238, + "step": 13319 + }, + { + "epoch": 0.17308732495943266, + "grad_norm": 0.3633231520652771, + "learning_rate": 0.0001654167587309427, + "loss": 1.4904, + "step": 13320 + }, + { + "epoch": 0.17310031950334853, + "grad_norm": 0.3302246034145355, + "learning_rate": 0.00016541415926903133, + "loss": 1.3426, + "step": 13321 + }, + { + "epoch": 0.1731133140472644, + "grad_norm": 0.36626505851745605, + "learning_rate": 0.00016541155980711992, + "loss": 1.3724, + "step": 13322 + }, + { + "epoch": 0.17312630859118028, + "grad_norm": 0.3938538134098053, + "learning_rate": 0.00016540896034520855, + "loss": 1.3163, + "step": 13323 + }, + { + "epoch": 0.17313930313509615, + "grad_norm": 0.3940238952636719, + "learning_rate": 0.00016540636088329717, + "loss": 1.7296, + "step": 13324 + }, + { + "epoch": 0.17315229767901202, + "grad_norm": 0.38923120498657227, + "learning_rate": 0.0001654037614213858, + "loss": 1.5316, + "step": 13325 + }, + { + "epoch": 0.1731652922229279, + "grad_norm": 0.4201051890850067, + "learning_rate": 0.0001654011619594744, + "loss": 1.542, + "step": 13326 + }, + { + "epoch": 0.17317828676684377, + "grad_norm": 0.4189986288547516, + "learning_rate": 0.000165398562497563, + "loss": 1.4463, + "step": 13327 + }, + { + "epoch": 0.17319128131075964, + "grad_norm": 0.4199727177619934, + "learning_rate": 0.00016539596303565164, + "loss": 1.392, + "step": 13328 + }, + { + "epoch": 0.17320427585467552, + "grad_norm": 0.4013659954071045, + "learning_rate": 0.00016539336357374024, + "loss": 1.4156, + "step": 13329 + }, + { + "epoch": 0.1732172703985914, + "grad_norm": 0.44378623366355896, + "learning_rate": 0.00016539076411182886, + "loss": 1.3054, + "step": 13330 + }, + { + "epoch": 0.17323026494250726, + "grad_norm": 0.43498170375823975, + "learning_rate": 0.00016538816464991746, + "loss": 1.3975, + "step": 13331 + }, + { + "epoch": 0.17324325948642313, + "grad_norm": 0.4229772388935089, + "learning_rate": 0.00016538556518800609, + "loss": 1.5115, + "step": 13332 + }, + { + "epoch": 0.173256254030339, + "grad_norm": 0.4326575994491577, + "learning_rate": 0.0001653829657260947, + "loss": 1.6879, + "step": 13333 + }, + { + "epoch": 0.17326924857425488, + "grad_norm": 0.41954946517944336, + "learning_rate": 0.0001653803662641833, + "loss": 1.3056, + "step": 13334 + }, + { + "epoch": 0.17328224311817075, + "grad_norm": 0.47150903940200806, + "learning_rate": 0.00016537776680227193, + "loss": 1.598, + "step": 13335 + }, + { + "epoch": 0.17329523766208663, + "grad_norm": 0.45423462986946106, + "learning_rate": 0.00016537516734036056, + "loss": 1.5666, + "step": 13336 + }, + { + "epoch": 0.1733082322060025, + "grad_norm": 0.46473002433776855, + "learning_rate": 0.00016537256787844918, + "loss": 1.4679, + "step": 13337 + }, + { + "epoch": 0.17332122674991837, + "grad_norm": 0.44201385974884033, + "learning_rate": 0.00016536996841653778, + "loss": 1.4588, + "step": 13338 + }, + { + "epoch": 0.17333422129383425, + "grad_norm": 0.6297826170921326, + "learning_rate": 0.00016536736895462638, + "loss": 1.4134, + "step": 13339 + }, + { + "epoch": 0.17334721583775012, + "grad_norm": 0.4003432095050812, + "learning_rate": 0.00016536476949271503, + "loss": 1.4075, + "step": 13340 + }, + { + "epoch": 0.173360210381666, + "grad_norm": 0.40075111389160156, + "learning_rate": 0.00016536217003080363, + "loss": 1.3696, + "step": 13341 + }, + { + "epoch": 0.17337320492558186, + "grad_norm": 0.4209158718585968, + "learning_rate": 0.00016535957056889225, + "loss": 1.4707, + "step": 13342 + }, + { + "epoch": 0.17338619946949774, + "grad_norm": 0.4144658148288727, + "learning_rate": 0.00016535697110698087, + "loss": 1.3246, + "step": 13343 + }, + { + "epoch": 0.1733991940134136, + "grad_norm": 0.3935602903366089, + "learning_rate": 0.00016535437164506947, + "loss": 1.3723, + "step": 13344 + }, + { + "epoch": 0.17341218855732948, + "grad_norm": 0.3846154510974884, + "learning_rate": 0.0001653517721831581, + "loss": 1.7502, + "step": 13345 + }, + { + "epoch": 0.17342518310124536, + "grad_norm": 0.41360217332839966, + "learning_rate": 0.0001653491727212467, + "loss": 1.3585, + "step": 13346 + }, + { + "epoch": 0.17343817764516123, + "grad_norm": 0.4347723722457886, + "learning_rate": 0.00016534657325933535, + "loss": 1.3222, + "step": 13347 + }, + { + "epoch": 0.1734511721890771, + "grad_norm": 0.5524615049362183, + "learning_rate": 0.00016534397379742394, + "loss": 1.5136, + "step": 13348 + }, + { + "epoch": 0.17346416673299297, + "grad_norm": 0.3776842951774597, + "learning_rate": 0.00016534137433551257, + "loss": 1.2762, + "step": 13349 + }, + { + "epoch": 0.17347716127690885, + "grad_norm": 0.39902234077453613, + "learning_rate": 0.00016533877487360116, + "loss": 1.5468, + "step": 13350 + }, + { + "epoch": 0.17349015582082472, + "grad_norm": 0.3801536560058594, + "learning_rate": 0.0001653361754116898, + "loss": 1.4054, + "step": 13351 + }, + { + "epoch": 0.1735031503647406, + "grad_norm": 0.34295907616615295, + "learning_rate": 0.0001653335759497784, + "loss": 1.3759, + "step": 13352 + }, + { + "epoch": 0.17351614490865647, + "grad_norm": 0.49465489387512207, + "learning_rate": 0.000165330976487867, + "loss": 1.3806, + "step": 13353 + }, + { + "epoch": 0.17352913945257234, + "grad_norm": 0.33638831973075867, + "learning_rate": 0.00016532837702595564, + "loss": 1.6792, + "step": 13354 + }, + { + "epoch": 0.1735421339964882, + "grad_norm": 0.36213600635528564, + "learning_rate": 0.00016532577756404426, + "loss": 1.5455, + "step": 13355 + }, + { + "epoch": 0.17355512854040409, + "grad_norm": 0.37444978952407837, + "learning_rate": 0.00016532317810213286, + "loss": 1.4258, + "step": 13356 + }, + { + "epoch": 0.17356812308431996, + "grad_norm": 0.371562659740448, + "learning_rate": 0.00016532057864022148, + "loss": 1.4555, + "step": 13357 + }, + { + "epoch": 0.17358111762823583, + "grad_norm": 0.4598786234855652, + "learning_rate": 0.00016531797917831008, + "loss": 1.4655, + "step": 13358 + }, + { + "epoch": 0.1735941121721517, + "grad_norm": 0.3862901031970978, + "learning_rate": 0.00016531537971639873, + "loss": 1.5347, + "step": 13359 + }, + { + "epoch": 0.17360710671606758, + "grad_norm": 0.3729879856109619, + "learning_rate": 0.00016531278025448733, + "loss": 1.4025, + "step": 13360 + }, + { + "epoch": 0.17362010125998348, + "grad_norm": 0.30724814534187317, + "learning_rate": 0.00016531018079257595, + "loss": 1.3901, + "step": 13361 + }, + { + "epoch": 0.17363309580389935, + "grad_norm": 0.3591684401035309, + "learning_rate": 0.00016530758133066455, + "loss": 1.2564, + "step": 13362 + }, + { + "epoch": 0.17364609034781522, + "grad_norm": 0.37339314818382263, + "learning_rate": 0.00016530498186875317, + "loss": 1.4111, + "step": 13363 + }, + { + "epoch": 0.1736590848917311, + "grad_norm": 0.34402546286582947, + "learning_rate": 0.0001653023824068418, + "loss": 1.4524, + "step": 13364 + }, + { + "epoch": 0.17367207943564697, + "grad_norm": 0.45641106367111206, + "learning_rate": 0.0001652997829449304, + "loss": 1.4755, + "step": 13365 + }, + { + "epoch": 0.17368507397956284, + "grad_norm": 0.34375521540641785, + "learning_rate": 0.00016529718348301902, + "loss": 1.393, + "step": 13366 + }, + { + "epoch": 0.17369806852347872, + "grad_norm": 0.3787298798561096, + "learning_rate": 0.00016529458402110765, + "loss": 1.2405, + "step": 13367 + }, + { + "epoch": 0.1737110630673946, + "grad_norm": 0.3178604245185852, + "learning_rate": 0.00016529198455919624, + "loss": 1.289, + "step": 13368 + }, + { + "epoch": 0.17372405761131046, + "grad_norm": 0.46018078923225403, + "learning_rate": 0.00016528938509728487, + "loss": 1.5789, + "step": 13369 + }, + { + "epoch": 0.17373705215522633, + "grad_norm": 0.43831494450569153, + "learning_rate": 0.00016528678563537346, + "loss": 1.4416, + "step": 13370 + }, + { + "epoch": 0.1737500466991422, + "grad_norm": 0.30505529046058655, + "learning_rate": 0.00016528418617346212, + "loss": 1.2265, + "step": 13371 + }, + { + "epoch": 0.17376304124305808, + "grad_norm": 0.4703518748283386, + "learning_rate": 0.0001652815867115507, + "loss": 1.2355, + "step": 13372 + }, + { + "epoch": 0.17377603578697395, + "grad_norm": 0.40107858180999756, + "learning_rate": 0.00016527898724963934, + "loss": 1.4418, + "step": 13373 + }, + { + "epoch": 0.17378903033088983, + "grad_norm": 0.3823404014110565, + "learning_rate": 0.00016527638778772794, + "loss": 1.3932, + "step": 13374 + }, + { + "epoch": 0.1738020248748057, + "grad_norm": 0.4739859700202942, + "learning_rate": 0.00016527378832581656, + "loss": 1.2348, + "step": 13375 + }, + { + "epoch": 0.17381501941872157, + "grad_norm": 0.34157299995422363, + "learning_rate": 0.00016527118886390518, + "loss": 1.3998, + "step": 13376 + }, + { + "epoch": 0.17382801396263745, + "grad_norm": 0.3854232132434845, + "learning_rate": 0.00016526858940199378, + "loss": 1.5666, + "step": 13377 + }, + { + "epoch": 0.17384100850655332, + "grad_norm": 0.3532569706439972, + "learning_rate": 0.00016526598994008243, + "loss": 1.3913, + "step": 13378 + }, + { + "epoch": 0.1738540030504692, + "grad_norm": 0.4467248022556305, + "learning_rate": 0.00016526339047817103, + "loss": 1.4405, + "step": 13379 + }, + { + "epoch": 0.17386699759438506, + "grad_norm": 0.3107064366340637, + "learning_rate": 0.00016526079101625966, + "loss": 1.3317, + "step": 13380 + }, + { + "epoch": 0.17387999213830094, + "grad_norm": 0.43914172053337097, + "learning_rate": 0.00016525819155434825, + "loss": 1.493, + "step": 13381 + }, + { + "epoch": 0.1738929866822168, + "grad_norm": 0.39585360884666443, + "learning_rate": 0.00016525559209243688, + "loss": 1.2851, + "step": 13382 + }, + { + "epoch": 0.17390598122613268, + "grad_norm": 0.3452441990375519, + "learning_rate": 0.0001652529926305255, + "loss": 1.4804, + "step": 13383 + }, + { + "epoch": 0.17391897577004856, + "grad_norm": 0.37664923071861267, + "learning_rate": 0.0001652503931686141, + "loss": 1.3659, + "step": 13384 + }, + { + "epoch": 0.17393197031396443, + "grad_norm": 0.36215031147003174, + "learning_rate": 0.00016524779370670272, + "loss": 1.542, + "step": 13385 + }, + { + "epoch": 0.1739449648578803, + "grad_norm": 0.3891845643520355, + "learning_rate": 0.00016524519424479135, + "loss": 1.3174, + "step": 13386 + }, + { + "epoch": 0.17395795940179618, + "grad_norm": 0.3435726761817932, + "learning_rate": 0.00016524259478287995, + "loss": 1.4056, + "step": 13387 + }, + { + "epoch": 0.17397095394571205, + "grad_norm": 0.36685875058174133, + "learning_rate": 0.00016523999532096857, + "loss": 1.2921, + "step": 13388 + }, + { + "epoch": 0.17398394848962792, + "grad_norm": 0.4003696143627167, + "learning_rate": 0.00016523739585905717, + "loss": 1.4938, + "step": 13389 + }, + { + "epoch": 0.1739969430335438, + "grad_norm": 0.48055654764175415, + "learning_rate": 0.00016523479639714582, + "loss": 1.5115, + "step": 13390 + }, + { + "epoch": 0.17400993757745967, + "grad_norm": 0.41080594062805176, + "learning_rate": 0.00016523219693523442, + "loss": 1.3036, + "step": 13391 + }, + { + "epoch": 0.17402293212137554, + "grad_norm": 0.4288446605205536, + "learning_rate": 0.00016522959747332304, + "loss": 1.3061, + "step": 13392 + }, + { + "epoch": 0.1740359266652914, + "grad_norm": 0.34696251153945923, + "learning_rate": 0.00016522699801141164, + "loss": 1.2724, + "step": 13393 + }, + { + "epoch": 0.17404892120920729, + "grad_norm": 0.4945676028728485, + "learning_rate": 0.00016522439854950026, + "loss": 1.4038, + "step": 13394 + }, + { + "epoch": 0.17406191575312316, + "grad_norm": 0.36798331141471863, + "learning_rate": 0.0001652217990875889, + "loss": 1.3993, + "step": 13395 + }, + { + "epoch": 0.17407491029703903, + "grad_norm": 0.3245902955532074, + "learning_rate": 0.00016521919962567748, + "loss": 1.2825, + "step": 13396 + }, + { + "epoch": 0.1740879048409549, + "grad_norm": 0.42146414518356323, + "learning_rate": 0.0001652166001637661, + "loss": 1.3793, + "step": 13397 + }, + { + "epoch": 0.17410089938487078, + "grad_norm": 0.547269344329834, + "learning_rate": 0.00016521400070185473, + "loss": 1.3652, + "step": 13398 + }, + { + "epoch": 0.17411389392878665, + "grad_norm": 0.3122113049030304, + "learning_rate": 0.00016521140123994333, + "loss": 1.501, + "step": 13399 + }, + { + "epoch": 0.17412688847270252, + "grad_norm": 0.3969956338405609, + "learning_rate": 0.00016520880177803196, + "loss": 1.4332, + "step": 13400 + }, + { + "epoch": 0.1741398830166184, + "grad_norm": 0.4108419120311737, + "learning_rate": 0.00016520620231612055, + "loss": 1.303, + "step": 13401 + }, + { + "epoch": 0.17415287756053427, + "grad_norm": 0.33740755915641785, + "learning_rate": 0.0001652036028542092, + "loss": 1.4279, + "step": 13402 + }, + { + "epoch": 0.17416587210445014, + "grad_norm": 0.5096627473831177, + "learning_rate": 0.0001652010033922978, + "loss": 1.4009, + "step": 13403 + }, + { + "epoch": 0.17417886664836602, + "grad_norm": 0.380125492811203, + "learning_rate": 0.00016519840393038643, + "loss": 1.4589, + "step": 13404 + }, + { + "epoch": 0.1741918611922819, + "grad_norm": 0.412034273147583, + "learning_rate": 0.00016519580446847502, + "loss": 1.4604, + "step": 13405 + }, + { + "epoch": 0.17420485573619776, + "grad_norm": 0.3821055293083191, + "learning_rate": 0.00016519320500656365, + "loss": 1.4195, + "step": 13406 + }, + { + "epoch": 0.17421785028011363, + "grad_norm": 0.4046683609485626, + "learning_rate": 0.00016519060554465227, + "loss": 1.2702, + "step": 13407 + }, + { + "epoch": 0.1742308448240295, + "grad_norm": 0.35326266288757324, + "learning_rate": 0.00016518800608274087, + "loss": 1.304, + "step": 13408 + }, + { + "epoch": 0.17424383936794538, + "grad_norm": 0.35064899921417236, + "learning_rate": 0.0001651854066208295, + "loss": 1.4338, + "step": 13409 + }, + { + "epoch": 0.17425683391186125, + "grad_norm": 0.38213178515434265, + "learning_rate": 0.00016518280715891812, + "loss": 1.359, + "step": 13410 + }, + { + "epoch": 0.17426982845577713, + "grad_norm": 0.28947770595550537, + "learning_rate": 0.00016518020769700672, + "loss": 1.5043, + "step": 13411 + }, + { + "epoch": 0.174282822999693, + "grad_norm": 0.28834593296051025, + "learning_rate": 0.00016517760823509534, + "loss": 1.3545, + "step": 13412 + }, + { + "epoch": 0.17429581754360887, + "grad_norm": 0.4509648382663727, + "learning_rate": 0.00016517500877318394, + "loss": 1.6549, + "step": 13413 + }, + { + "epoch": 0.17430881208752474, + "grad_norm": 0.480600506067276, + "learning_rate": 0.0001651724093112726, + "loss": 1.4065, + "step": 13414 + }, + { + "epoch": 0.17432180663144062, + "grad_norm": 0.37305060029029846, + "learning_rate": 0.0001651698098493612, + "loss": 1.4353, + "step": 13415 + }, + { + "epoch": 0.1743348011753565, + "grad_norm": 0.4292123019695282, + "learning_rate": 0.0001651672103874498, + "loss": 1.3883, + "step": 13416 + }, + { + "epoch": 0.17434779571927236, + "grad_norm": 0.2961142361164093, + "learning_rate": 0.00016516461092553844, + "loss": 1.3846, + "step": 13417 + }, + { + "epoch": 0.17436079026318824, + "grad_norm": 0.45750001072883606, + "learning_rate": 0.00016516201146362703, + "loss": 1.5323, + "step": 13418 + }, + { + "epoch": 0.1743737848071041, + "grad_norm": 0.43683168292045593, + "learning_rate": 0.00016515941200171566, + "loss": 1.4173, + "step": 13419 + }, + { + "epoch": 0.17438677935101998, + "grad_norm": 0.46560683846473694, + "learning_rate": 0.00016515681253980425, + "loss": 1.5242, + "step": 13420 + }, + { + "epoch": 0.17439977389493586, + "grad_norm": 0.41565653681755066, + "learning_rate": 0.0001651542130778929, + "loss": 1.4636, + "step": 13421 + }, + { + "epoch": 0.17441276843885173, + "grad_norm": 0.3468264043331146, + "learning_rate": 0.0001651516136159815, + "loss": 1.2744, + "step": 13422 + }, + { + "epoch": 0.1744257629827676, + "grad_norm": 0.35076987743377686, + "learning_rate": 0.0001651490141540701, + "loss": 1.4505, + "step": 13423 + }, + { + "epoch": 0.17443875752668347, + "grad_norm": 0.38492074608802795, + "learning_rate": 0.00016514641469215873, + "loss": 1.4471, + "step": 13424 + }, + { + "epoch": 0.17445175207059935, + "grad_norm": 0.44133260846138, + "learning_rate": 0.00016514381523024735, + "loss": 1.5224, + "step": 13425 + }, + { + "epoch": 0.17446474661451522, + "grad_norm": 0.41571658849716187, + "learning_rate": 0.00016514121576833597, + "loss": 1.5849, + "step": 13426 + }, + { + "epoch": 0.1744777411584311, + "grad_norm": 0.35711586475372314, + "learning_rate": 0.00016513861630642457, + "loss": 1.3589, + "step": 13427 + }, + { + "epoch": 0.17449073570234697, + "grad_norm": 0.27254778146743774, + "learning_rate": 0.0001651360168445132, + "loss": 1.2383, + "step": 13428 + }, + { + "epoch": 0.17450373024626284, + "grad_norm": 0.2841547131538391, + "learning_rate": 0.00016513341738260182, + "loss": 1.295, + "step": 13429 + }, + { + "epoch": 0.1745167247901787, + "grad_norm": 0.24505028128623962, + "learning_rate": 0.00016513081792069042, + "loss": 1.3533, + "step": 13430 + }, + { + "epoch": 0.17452971933409459, + "grad_norm": 0.46519798040390015, + "learning_rate": 0.00016512821845877904, + "loss": 1.5446, + "step": 13431 + }, + { + "epoch": 0.17454271387801046, + "grad_norm": 0.3529621660709381, + "learning_rate": 0.00016512561899686764, + "loss": 1.4733, + "step": 13432 + }, + { + "epoch": 0.17455570842192633, + "grad_norm": 0.4122268855571747, + "learning_rate": 0.0001651230195349563, + "loss": 1.5913, + "step": 13433 + }, + { + "epoch": 0.1745687029658422, + "grad_norm": 0.4126298725605011, + "learning_rate": 0.0001651204200730449, + "loss": 1.3624, + "step": 13434 + }, + { + "epoch": 0.17458169750975808, + "grad_norm": 0.4127906858921051, + "learning_rate": 0.0001651178206111335, + "loss": 1.5192, + "step": 13435 + }, + { + "epoch": 0.17459469205367395, + "grad_norm": 0.6212459802627563, + "learning_rate": 0.0001651152211492221, + "loss": 1.5355, + "step": 13436 + }, + { + "epoch": 0.17460768659758985, + "grad_norm": 0.5126368403434753, + "learning_rate": 0.00016511262168731074, + "loss": 1.4107, + "step": 13437 + }, + { + "epoch": 0.17462068114150572, + "grad_norm": 0.4047090709209442, + "learning_rate": 0.00016511002222539936, + "loss": 1.1911, + "step": 13438 + }, + { + "epoch": 0.1746336756854216, + "grad_norm": 0.343588650226593, + "learning_rate": 0.00016510742276348796, + "loss": 1.3364, + "step": 13439 + }, + { + "epoch": 0.17464667022933747, + "grad_norm": 0.38149598240852356, + "learning_rate": 0.00016510482330157658, + "loss": 1.3004, + "step": 13440 + }, + { + "epoch": 0.17465966477325334, + "grad_norm": 0.3995436131954193, + "learning_rate": 0.0001651022238396652, + "loss": 1.2263, + "step": 13441 + }, + { + "epoch": 0.17467265931716922, + "grad_norm": 0.330473929643631, + "learning_rate": 0.0001650996243777538, + "loss": 1.2429, + "step": 13442 + }, + { + "epoch": 0.1746856538610851, + "grad_norm": 0.44789496064186096, + "learning_rate": 0.00016509702491584243, + "loss": 1.3022, + "step": 13443 + }, + { + "epoch": 0.17469864840500096, + "grad_norm": 0.2974104583263397, + "learning_rate": 0.00016509442545393103, + "loss": 1.2184, + "step": 13444 + }, + { + "epoch": 0.17471164294891683, + "grad_norm": 0.38194137811660767, + "learning_rate": 0.00016509182599201968, + "loss": 1.446, + "step": 13445 + }, + { + "epoch": 0.1747246374928327, + "grad_norm": 0.2928636372089386, + "learning_rate": 0.00016508922653010827, + "loss": 1.4253, + "step": 13446 + }, + { + "epoch": 0.17473763203674858, + "grad_norm": 0.36509057879447937, + "learning_rate": 0.0001650866270681969, + "loss": 1.4523, + "step": 13447 + }, + { + "epoch": 0.17475062658066445, + "grad_norm": 0.36516791582107544, + "learning_rate": 0.0001650840276062855, + "loss": 1.5678, + "step": 13448 + }, + { + "epoch": 0.17476362112458033, + "grad_norm": 0.3799486756324768, + "learning_rate": 0.00016508142814437412, + "loss": 1.4619, + "step": 13449 + }, + { + "epoch": 0.1747766156684962, + "grad_norm": 0.45014092326164246, + "learning_rate": 0.00016507882868246275, + "loss": 1.5263, + "step": 13450 + }, + { + "epoch": 0.17478961021241207, + "grad_norm": 0.40081489086151123, + "learning_rate": 0.00016507622922055134, + "loss": 1.4657, + "step": 13451 + }, + { + "epoch": 0.17480260475632795, + "grad_norm": 0.46514081954956055, + "learning_rate": 0.00016507362975863997, + "loss": 1.6489, + "step": 13452 + }, + { + "epoch": 0.17481559930024382, + "grad_norm": 0.358997106552124, + "learning_rate": 0.0001650710302967286, + "loss": 1.6649, + "step": 13453 + }, + { + "epoch": 0.1748285938441597, + "grad_norm": 0.4227268695831299, + "learning_rate": 0.0001650684308348172, + "loss": 1.2737, + "step": 13454 + }, + { + "epoch": 0.17484158838807556, + "grad_norm": 0.3060193955898285, + "learning_rate": 0.00016506583137290581, + "loss": 1.3838, + "step": 13455 + }, + { + "epoch": 0.17485458293199144, + "grad_norm": 0.42281782627105713, + "learning_rate": 0.00016506323191099444, + "loss": 1.5166, + "step": 13456 + }, + { + "epoch": 0.1748675774759073, + "grad_norm": 0.439098060131073, + "learning_rate": 0.00016506063244908306, + "loss": 1.5466, + "step": 13457 + }, + { + "epoch": 0.17488057201982318, + "grad_norm": 0.4470057487487793, + "learning_rate": 0.00016505803298717166, + "loss": 1.3987, + "step": 13458 + }, + { + "epoch": 0.17489356656373906, + "grad_norm": 0.4449363052845001, + "learning_rate": 0.00016505543352526028, + "loss": 1.4259, + "step": 13459 + }, + { + "epoch": 0.17490656110765493, + "grad_norm": 0.4303593635559082, + "learning_rate": 0.0001650528340633489, + "loss": 1.4017, + "step": 13460 + }, + { + "epoch": 0.1749195556515708, + "grad_norm": 0.37148261070251465, + "learning_rate": 0.0001650502346014375, + "loss": 1.3952, + "step": 13461 + }, + { + "epoch": 0.17493255019548667, + "grad_norm": 0.361826628446579, + "learning_rate": 0.00016504763513952613, + "loss": 1.4783, + "step": 13462 + }, + { + "epoch": 0.17494554473940255, + "grad_norm": 0.3926715850830078, + "learning_rate": 0.00016504503567761473, + "loss": 1.4838, + "step": 13463 + }, + { + "epoch": 0.17495853928331842, + "grad_norm": 0.3533923327922821, + "learning_rate": 0.00016504243621570338, + "loss": 1.4042, + "step": 13464 + }, + { + "epoch": 0.1749715338272343, + "grad_norm": 0.4848467707633972, + "learning_rate": 0.00016503983675379198, + "loss": 1.3995, + "step": 13465 + }, + { + "epoch": 0.17498452837115017, + "grad_norm": 0.35159027576446533, + "learning_rate": 0.00016503723729188057, + "loss": 1.5477, + "step": 13466 + }, + { + "epoch": 0.17499752291506604, + "grad_norm": 0.34370023012161255, + "learning_rate": 0.0001650346378299692, + "loss": 1.3882, + "step": 13467 + }, + { + "epoch": 0.1750105174589819, + "grad_norm": 0.49847400188446045, + "learning_rate": 0.00016503203836805782, + "loss": 1.4393, + "step": 13468 + }, + { + "epoch": 0.17502351200289779, + "grad_norm": 0.48788490891456604, + "learning_rate": 0.00016502943890614645, + "loss": 1.3966, + "step": 13469 + }, + { + "epoch": 0.17503650654681366, + "grad_norm": 0.3483833372592926, + "learning_rate": 0.00016502683944423505, + "loss": 1.3532, + "step": 13470 + }, + { + "epoch": 0.17504950109072953, + "grad_norm": 0.4486258924007416, + "learning_rate": 0.00016502423998232367, + "loss": 1.4069, + "step": 13471 + }, + { + "epoch": 0.1750624956346454, + "grad_norm": 0.40895354747772217, + "learning_rate": 0.0001650216405204123, + "loss": 1.4384, + "step": 13472 + }, + { + "epoch": 0.17507549017856128, + "grad_norm": 0.35748904943466187, + "learning_rate": 0.0001650190410585009, + "loss": 1.2782, + "step": 13473 + }, + { + "epoch": 0.17508848472247715, + "grad_norm": 0.3825523853302002, + "learning_rate": 0.00016501644159658952, + "loss": 1.687, + "step": 13474 + }, + { + "epoch": 0.17510147926639302, + "grad_norm": 0.4871703088283539, + "learning_rate": 0.00016501384213467811, + "loss": 1.4416, + "step": 13475 + }, + { + "epoch": 0.1751144738103089, + "grad_norm": 0.4077610373497009, + "learning_rate": 0.00016501124267276677, + "loss": 1.486, + "step": 13476 + }, + { + "epoch": 0.17512746835422477, + "grad_norm": 0.43831121921539307, + "learning_rate": 0.00016500864321085536, + "loss": 1.338, + "step": 13477 + }, + { + "epoch": 0.17514046289814064, + "grad_norm": 0.42199379205703735, + "learning_rate": 0.00016500604374894396, + "loss": 1.3239, + "step": 13478 + }, + { + "epoch": 0.17515345744205651, + "grad_norm": 0.3701782524585724, + "learning_rate": 0.00016500344428703258, + "loss": 1.3909, + "step": 13479 + }, + { + "epoch": 0.1751664519859724, + "grad_norm": 0.7420125603675842, + "learning_rate": 0.0001650008448251212, + "loss": 1.5017, + "step": 13480 + }, + { + "epoch": 0.17517944652988826, + "grad_norm": 0.37119871377944946, + "learning_rate": 0.00016499824536320983, + "loss": 1.4445, + "step": 13481 + }, + { + "epoch": 0.17519244107380413, + "grad_norm": 0.32587212324142456, + "learning_rate": 0.00016499564590129843, + "loss": 1.3808, + "step": 13482 + }, + { + "epoch": 0.17520543561772, + "grad_norm": 0.34763771295547485, + "learning_rate": 0.00016499304643938706, + "loss": 1.2203, + "step": 13483 + }, + { + "epoch": 0.17521843016163588, + "grad_norm": 0.4741244912147522, + "learning_rate": 0.00016499044697747568, + "loss": 1.379, + "step": 13484 + }, + { + "epoch": 0.17523142470555175, + "grad_norm": 0.6077156662940979, + "learning_rate": 0.00016498784751556428, + "loss": 1.6124, + "step": 13485 + }, + { + "epoch": 0.17524441924946763, + "grad_norm": 0.3056173324584961, + "learning_rate": 0.0001649852480536529, + "loss": 1.1708, + "step": 13486 + }, + { + "epoch": 0.1752574137933835, + "grad_norm": 0.38942110538482666, + "learning_rate": 0.0001649826485917415, + "loss": 1.4529, + "step": 13487 + }, + { + "epoch": 0.17527040833729937, + "grad_norm": 0.397161066532135, + "learning_rate": 0.00016498004912983015, + "loss": 1.4906, + "step": 13488 + }, + { + "epoch": 0.17528340288121524, + "grad_norm": 0.38064852356910706, + "learning_rate": 0.00016497744966791875, + "loss": 1.3094, + "step": 13489 + }, + { + "epoch": 0.17529639742513112, + "grad_norm": 0.37729570269584656, + "learning_rate": 0.00016497485020600735, + "loss": 1.5922, + "step": 13490 + }, + { + "epoch": 0.175309391969047, + "grad_norm": 0.49073395133018494, + "learning_rate": 0.000164972250744096, + "loss": 1.5308, + "step": 13491 + }, + { + "epoch": 0.17532238651296286, + "grad_norm": 0.4614732563495636, + "learning_rate": 0.0001649696512821846, + "loss": 1.483, + "step": 13492 + }, + { + "epoch": 0.17533538105687874, + "grad_norm": 0.3456222712993622, + "learning_rate": 0.00016496705182027322, + "loss": 1.2901, + "step": 13493 + }, + { + "epoch": 0.1753483756007946, + "grad_norm": 0.3475855886936188, + "learning_rate": 0.00016496445235836182, + "loss": 1.4711, + "step": 13494 + }, + { + "epoch": 0.17536137014471048, + "grad_norm": 0.44296783208847046, + "learning_rate": 0.00016496185289645044, + "loss": 1.3944, + "step": 13495 + }, + { + "epoch": 0.17537436468862636, + "grad_norm": 0.4115827679634094, + "learning_rate": 0.00016495925343453907, + "loss": 1.4954, + "step": 13496 + }, + { + "epoch": 0.17538735923254223, + "grad_norm": 0.33640995621681213, + "learning_rate": 0.00016495665397262766, + "loss": 1.3453, + "step": 13497 + }, + { + "epoch": 0.1754003537764581, + "grad_norm": 0.5466011166572571, + "learning_rate": 0.0001649540545107163, + "loss": 1.5637, + "step": 13498 + }, + { + "epoch": 0.17541334832037397, + "grad_norm": 0.3716542422771454, + "learning_rate": 0.0001649514550488049, + "loss": 1.4289, + "step": 13499 + }, + { + "epoch": 0.17542634286428985, + "grad_norm": 0.349242627620697, + "learning_rate": 0.00016494885558689354, + "loss": 1.4617, + "step": 13500 + }, + { + "epoch": 0.17543933740820572, + "grad_norm": 0.36021289229393005, + "learning_rate": 0.00016494625612498213, + "loss": 1.1995, + "step": 13501 + }, + { + "epoch": 0.1754523319521216, + "grad_norm": 0.29966408014297485, + "learning_rate": 0.00016494365666307076, + "loss": 1.3276, + "step": 13502 + }, + { + "epoch": 0.17546532649603747, + "grad_norm": 0.4373058080673218, + "learning_rate": 0.00016494105720115938, + "loss": 1.3889, + "step": 13503 + }, + { + "epoch": 0.17547832103995334, + "grad_norm": 0.3564509451389313, + "learning_rate": 0.00016493845773924798, + "loss": 1.5267, + "step": 13504 + }, + { + "epoch": 0.1754913155838692, + "grad_norm": 0.3977997899055481, + "learning_rate": 0.0001649358582773366, + "loss": 1.4256, + "step": 13505 + }, + { + "epoch": 0.17550431012778508, + "grad_norm": 0.32826700806617737, + "learning_rate": 0.0001649332588154252, + "loss": 1.3825, + "step": 13506 + }, + { + "epoch": 0.17551730467170096, + "grad_norm": 0.4034384787082672, + "learning_rate": 0.00016493065935351383, + "loss": 1.2995, + "step": 13507 + }, + { + "epoch": 0.17553029921561683, + "grad_norm": 0.37431657314300537, + "learning_rate": 0.00016492805989160245, + "loss": 1.4742, + "step": 13508 + }, + { + "epoch": 0.1755432937595327, + "grad_norm": 0.40061405301094055, + "learning_rate": 0.00016492546042969105, + "loss": 1.49, + "step": 13509 + }, + { + "epoch": 0.17555628830344858, + "grad_norm": 0.36853882670402527, + "learning_rate": 0.00016492286096777967, + "loss": 1.3964, + "step": 13510 + }, + { + "epoch": 0.17556928284736445, + "grad_norm": 0.4105042517185211, + "learning_rate": 0.0001649202615058683, + "loss": 1.2653, + "step": 13511 + }, + { + "epoch": 0.17558227739128032, + "grad_norm": 0.42176076769828796, + "learning_rate": 0.00016491766204395692, + "loss": 1.4333, + "step": 13512 + }, + { + "epoch": 0.17559527193519622, + "grad_norm": 0.42144495248794556, + "learning_rate": 0.00016491506258204552, + "loss": 1.5783, + "step": 13513 + }, + { + "epoch": 0.1756082664791121, + "grad_norm": 0.40591567754745483, + "learning_rate": 0.00016491246312013414, + "loss": 1.5444, + "step": 13514 + }, + { + "epoch": 0.17562126102302797, + "grad_norm": 0.29232579469680786, + "learning_rate": 0.00016490986365822277, + "loss": 1.3192, + "step": 13515 + }, + { + "epoch": 0.17563425556694384, + "grad_norm": 0.48433569073677063, + "learning_rate": 0.00016490726419631137, + "loss": 1.3967, + "step": 13516 + }, + { + "epoch": 0.17564725011085972, + "grad_norm": 0.3832167685031891, + "learning_rate": 0.0001649046647344, + "loss": 1.6544, + "step": 13517 + }, + { + "epoch": 0.1756602446547756, + "grad_norm": 0.48258134722709656, + "learning_rate": 0.0001649020652724886, + "loss": 1.3863, + "step": 13518 + }, + { + "epoch": 0.17567323919869146, + "grad_norm": 0.4282076954841614, + "learning_rate": 0.0001648994658105772, + "loss": 1.3261, + "step": 13519 + }, + { + "epoch": 0.17568623374260733, + "grad_norm": 0.3869810104370117, + "learning_rate": 0.00016489686634866584, + "loss": 1.3552, + "step": 13520 + }, + { + "epoch": 0.1756992282865232, + "grad_norm": 0.4020592272281647, + "learning_rate": 0.00016489426688675443, + "loss": 1.5436, + "step": 13521 + }, + { + "epoch": 0.17571222283043908, + "grad_norm": 0.347240686416626, + "learning_rate": 0.00016489166742484306, + "loss": 1.4601, + "step": 13522 + }, + { + "epoch": 0.17572521737435495, + "grad_norm": 0.5202689170837402, + "learning_rate": 0.00016488906796293168, + "loss": 1.6156, + "step": 13523 + }, + { + "epoch": 0.17573821191827083, + "grad_norm": 0.3712344765663147, + "learning_rate": 0.0001648864685010203, + "loss": 1.3057, + "step": 13524 + }, + { + "epoch": 0.1757512064621867, + "grad_norm": 0.37039464712142944, + "learning_rate": 0.0001648838690391089, + "loss": 1.3453, + "step": 13525 + }, + { + "epoch": 0.17576420100610257, + "grad_norm": 0.3911130726337433, + "learning_rate": 0.00016488126957719753, + "loss": 1.4548, + "step": 13526 + }, + { + "epoch": 0.17577719555001844, + "grad_norm": 0.397586464881897, + "learning_rate": 0.00016487867011528615, + "loss": 1.3052, + "step": 13527 + }, + { + "epoch": 0.17579019009393432, + "grad_norm": 0.39860421419143677, + "learning_rate": 0.00016487607065337475, + "loss": 1.3872, + "step": 13528 + }, + { + "epoch": 0.1758031846378502, + "grad_norm": 0.47928884625434875, + "learning_rate": 0.00016487347119146338, + "loss": 1.3522, + "step": 13529 + }, + { + "epoch": 0.17581617918176606, + "grad_norm": 0.34829577803611755, + "learning_rate": 0.000164870871729552, + "loss": 1.471, + "step": 13530 + }, + { + "epoch": 0.17582917372568194, + "grad_norm": 0.4381202459335327, + "learning_rate": 0.00016486827226764062, + "loss": 1.4197, + "step": 13531 + }, + { + "epoch": 0.1758421682695978, + "grad_norm": 0.408735990524292, + "learning_rate": 0.00016486567280572922, + "loss": 1.4568, + "step": 13532 + }, + { + "epoch": 0.17585516281351368, + "grad_norm": 0.3802216053009033, + "learning_rate": 0.00016486307334381782, + "loss": 1.7062, + "step": 13533 + }, + { + "epoch": 0.17586815735742956, + "grad_norm": 0.39596137404441833, + "learning_rate": 0.00016486047388190647, + "loss": 1.5423, + "step": 13534 + }, + { + "epoch": 0.17588115190134543, + "grad_norm": 0.33654001355171204, + "learning_rate": 0.00016485787441999507, + "loss": 1.4071, + "step": 13535 + }, + { + "epoch": 0.1758941464452613, + "grad_norm": 0.3452851176261902, + "learning_rate": 0.0001648552749580837, + "loss": 1.265, + "step": 13536 + }, + { + "epoch": 0.17590714098917717, + "grad_norm": 0.30356869101524353, + "learning_rate": 0.0001648526754961723, + "loss": 1.2295, + "step": 13537 + }, + { + "epoch": 0.17592013553309305, + "grad_norm": 0.31200847029685974, + "learning_rate": 0.00016485007603426091, + "loss": 1.3852, + "step": 13538 + }, + { + "epoch": 0.17593313007700892, + "grad_norm": 0.49106618762016296, + "learning_rate": 0.00016484747657234954, + "loss": 1.4197, + "step": 13539 + }, + { + "epoch": 0.1759461246209248, + "grad_norm": 0.412903368473053, + "learning_rate": 0.00016484487711043814, + "loss": 1.4273, + "step": 13540 + }, + { + "epoch": 0.17595911916484067, + "grad_norm": 0.32957229018211365, + "learning_rate": 0.00016484227764852676, + "loss": 1.4691, + "step": 13541 + }, + { + "epoch": 0.17597211370875654, + "grad_norm": 0.4434698224067688, + "learning_rate": 0.00016483967818661538, + "loss": 1.5286, + "step": 13542 + }, + { + "epoch": 0.1759851082526724, + "grad_norm": 0.3510220944881439, + "learning_rate": 0.000164837078724704, + "loss": 1.4435, + "step": 13543 + }, + { + "epoch": 0.17599810279658828, + "grad_norm": 0.42501917481422424, + "learning_rate": 0.0001648344792627926, + "loss": 1.427, + "step": 13544 + }, + { + "epoch": 0.17601109734050416, + "grad_norm": 0.41684237122535706, + "learning_rate": 0.0001648318798008812, + "loss": 1.2161, + "step": 13545 + }, + { + "epoch": 0.17602409188442003, + "grad_norm": 0.5242071151733398, + "learning_rate": 0.00016482928033896986, + "loss": 1.4906, + "step": 13546 + }, + { + "epoch": 0.1760370864283359, + "grad_norm": 0.4684368073940277, + "learning_rate": 0.00016482668087705845, + "loss": 1.423, + "step": 13547 + }, + { + "epoch": 0.17605008097225178, + "grad_norm": 0.5643107891082764, + "learning_rate": 0.00016482408141514708, + "loss": 1.6422, + "step": 13548 + }, + { + "epoch": 0.17606307551616765, + "grad_norm": 0.43667447566986084, + "learning_rate": 0.00016482148195323568, + "loss": 1.3703, + "step": 13549 + }, + { + "epoch": 0.17607607006008352, + "grad_norm": 0.42366620898246765, + "learning_rate": 0.0001648188824913243, + "loss": 1.3363, + "step": 13550 + }, + { + "epoch": 0.1760890646039994, + "grad_norm": 0.3786791265010834, + "learning_rate": 0.00016481628302941292, + "loss": 1.4411, + "step": 13551 + }, + { + "epoch": 0.17610205914791527, + "grad_norm": 0.434643030166626, + "learning_rate": 0.00016481368356750152, + "loss": 1.4437, + "step": 13552 + }, + { + "epoch": 0.17611505369183114, + "grad_norm": 0.34708961844444275, + "learning_rate": 0.00016481108410559015, + "loss": 1.4127, + "step": 13553 + }, + { + "epoch": 0.17612804823574701, + "grad_norm": 0.3480059802532196, + "learning_rate": 0.00016480848464367877, + "loss": 1.4018, + "step": 13554 + }, + { + "epoch": 0.1761410427796629, + "grad_norm": 0.472269743680954, + "learning_rate": 0.0001648058851817674, + "loss": 1.4841, + "step": 13555 + }, + { + "epoch": 0.17615403732357876, + "grad_norm": 0.3097432255744934, + "learning_rate": 0.000164803285719856, + "loss": 1.1991, + "step": 13556 + }, + { + "epoch": 0.17616703186749463, + "grad_norm": 0.4113951623439789, + "learning_rate": 0.00016480068625794462, + "loss": 1.6214, + "step": 13557 + }, + { + "epoch": 0.1761800264114105, + "grad_norm": 0.4583827257156372, + "learning_rate": 0.00016479808679603324, + "loss": 1.5132, + "step": 13558 + }, + { + "epoch": 0.17619302095532638, + "grad_norm": 0.49137359857559204, + "learning_rate": 0.00016479548733412184, + "loss": 1.422, + "step": 13559 + }, + { + "epoch": 0.17620601549924225, + "grad_norm": 0.5129138231277466, + "learning_rate": 0.00016479288787221046, + "loss": 1.3676, + "step": 13560 + }, + { + "epoch": 0.17621901004315813, + "grad_norm": 0.4639744758605957, + "learning_rate": 0.00016479028841029906, + "loss": 1.4898, + "step": 13561 + }, + { + "epoch": 0.176232004587074, + "grad_norm": 0.39463192224502563, + "learning_rate": 0.00016478768894838768, + "loss": 1.5868, + "step": 13562 + }, + { + "epoch": 0.17624499913098987, + "grad_norm": 0.40299704670906067, + "learning_rate": 0.0001647850894864763, + "loss": 1.3985, + "step": 13563 + }, + { + "epoch": 0.17625799367490574, + "grad_norm": 0.3154836893081665, + "learning_rate": 0.0001647824900245649, + "loss": 1.4936, + "step": 13564 + }, + { + "epoch": 0.17627098821882162, + "grad_norm": 0.535276472568512, + "learning_rate": 0.00016477989056265356, + "loss": 1.2896, + "step": 13565 + }, + { + "epoch": 0.1762839827627375, + "grad_norm": 0.4607764482498169, + "learning_rate": 0.00016477729110074216, + "loss": 1.5687, + "step": 13566 + }, + { + "epoch": 0.17629697730665336, + "grad_norm": 0.3937928378582001, + "learning_rate": 0.00016477469163883078, + "loss": 1.3595, + "step": 13567 + }, + { + "epoch": 0.17630997185056924, + "grad_norm": 0.433514267206192, + "learning_rate": 0.00016477209217691938, + "loss": 1.6069, + "step": 13568 + }, + { + "epoch": 0.1763229663944851, + "grad_norm": 0.4924296736717224, + "learning_rate": 0.000164769492715008, + "loss": 1.4961, + "step": 13569 + }, + { + "epoch": 0.17633596093840098, + "grad_norm": 0.43467968702316284, + "learning_rate": 0.00016476689325309663, + "loss": 1.2885, + "step": 13570 + }, + { + "epoch": 0.17634895548231685, + "grad_norm": 0.29190871119499207, + "learning_rate": 0.00016476429379118522, + "loss": 1.067, + "step": 13571 + }, + { + "epoch": 0.17636195002623273, + "grad_norm": 0.39473944902420044, + "learning_rate": 0.00016476169432927385, + "loss": 1.4965, + "step": 13572 + }, + { + "epoch": 0.1763749445701486, + "grad_norm": 0.5320652723312378, + "learning_rate": 0.00016475909486736247, + "loss": 1.3424, + "step": 13573 + }, + { + "epoch": 0.17638793911406447, + "grad_norm": 0.32814016938209534, + "learning_rate": 0.00016475649540545107, + "loss": 1.1422, + "step": 13574 + }, + { + "epoch": 0.17640093365798035, + "grad_norm": 0.33688774704933167, + "learning_rate": 0.0001647538959435397, + "loss": 1.3045, + "step": 13575 + }, + { + "epoch": 0.17641392820189622, + "grad_norm": 0.31415408849716187, + "learning_rate": 0.0001647512964816283, + "loss": 1.4122, + "step": 13576 + }, + { + "epoch": 0.1764269227458121, + "grad_norm": 0.42619383335113525, + "learning_rate": 0.00016474869701971694, + "loss": 1.2647, + "step": 13577 + }, + { + "epoch": 0.17643991728972797, + "grad_norm": 0.36367809772491455, + "learning_rate": 0.00016474609755780554, + "loss": 1.4734, + "step": 13578 + }, + { + "epoch": 0.17645291183364384, + "grad_norm": 0.4452381432056427, + "learning_rate": 0.00016474349809589417, + "loss": 1.6356, + "step": 13579 + }, + { + "epoch": 0.1764659063775597, + "grad_norm": 0.4153081178665161, + "learning_rate": 0.00016474089863398276, + "loss": 1.2722, + "step": 13580 + }, + { + "epoch": 0.17647890092147558, + "grad_norm": 0.4036629796028137, + "learning_rate": 0.0001647382991720714, + "loss": 1.1678, + "step": 13581 + }, + { + "epoch": 0.17649189546539146, + "grad_norm": 0.4922664761543274, + "learning_rate": 0.00016473569971016, + "loss": 1.5953, + "step": 13582 + }, + { + "epoch": 0.17650489000930733, + "grad_norm": 0.4043973982334137, + "learning_rate": 0.0001647331002482486, + "loss": 1.3069, + "step": 13583 + }, + { + "epoch": 0.1765178845532232, + "grad_norm": 0.38580867648124695, + "learning_rate": 0.00016473050078633723, + "loss": 1.4947, + "step": 13584 + }, + { + "epoch": 0.17653087909713908, + "grad_norm": 0.352580189704895, + "learning_rate": 0.00016472790132442586, + "loss": 1.4729, + "step": 13585 + }, + { + "epoch": 0.17654387364105495, + "grad_norm": 0.44040560722351074, + "learning_rate": 0.00016472530186251448, + "loss": 1.2951, + "step": 13586 + }, + { + "epoch": 0.17655686818497082, + "grad_norm": 0.4771287739276886, + "learning_rate": 0.00016472270240060308, + "loss": 1.4535, + "step": 13587 + }, + { + "epoch": 0.1765698627288867, + "grad_norm": 0.42403465509414673, + "learning_rate": 0.00016472010293869168, + "loss": 1.2877, + "step": 13588 + }, + { + "epoch": 0.1765828572728026, + "grad_norm": 0.3243350684642792, + "learning_rate": 0.00016471750347678033, + "loss": 1.2671, + "step": 13589 + }, + { + "epoch": 0.17659585181671847, + "grad_norm": 0.3546353280544281, + "learning_rate": 0.00016471490401486893, + "loss": 1.4155, + "step": 13590 + }, + { + "epoch": 0.17660884636063434, + "grad_norm": 0.4194067716598511, + "learning_rate": 0.00016471230455295755, + "loss": 1.54, + "step": 13591 + }, + { + "epoch": 0.17662184090455021, + "grad_norm": 0.33934760093688965, + "learning_rate": 0.00016470970509104615, + "loss": 1.5222, + "step": 13592 + }, + { + "epoch": 0.1766348354484661, + "grad_norm": 0.39700618386268616, + "learning_rate": 0.00016470710562913477, + "loss": 1.4634, + "step": 13593 + }, + { + "epoch": 0.17664782999238196, + "grad_norm": 0.37450742721557617, + "learning_rate": 0.0001647045061672234, + "loss": 1.647, + "step": 13594 + }, + { + "epoch": 0.17666082453629783, + "grad_norm": 0.42002931237220764, + "learning_rate": 0.000164701906705312, + "loss": 1.5675, + "step": 13595 + }, + { + "epoch": 0.1766738190802137, + "grad_norm": 0.34220069646835327, + "learning_rate": 0.00016469930724340062, + "loss": 1.3423, + "step": 13596 + }, + { + "epoch": 0.17668681362412958, + "grad_norm": 0.6377004981040955, + "learning_rate": 0.00016469670778148924, + "loss": 1.5027, + "step": 13597 + }, + { + "epoch": 0.17669980816804545, + "grad_norm": 0.3749501407146454, + "learning_rate": 0.00016469410831957787, + "loss": 1.5834, + "step": 13598 + }, + { + "epoch": 0.17671280271196133, + "grad_norm": 0.4093885123729706, + "learning_rate": 0.00016469150885766647, + "loss": 1.3875, + "step": 13599 + }, + { + "epoch": 0.1767257972558772, + "grad_norm": 0.24530300498008728, + "learning_rate": 0.00016468890939575506, + "loss": 1.2604, + "step": 13600 + }, + { + "epoch": 0.17673879179979307, + "grad_norm": 0.41008853912353516, + "learning_rate": 0.00016468630993384371, + "loss": 1.4578, + "step": 13601 + }, + { + "epoch": 0.17675178634370894, + "grad_norm": 0.4438818097114563, + "learning_rate": 0.0001646837104719323, + "loss": 1.3622, + "step": 13602 + }, + { + "epoch": 0.17676478088762482, + "grad_norm": 0.43295395374298096, + "learning_rate": 0.00016468111101002094, + "loss": 1.5506, + "step": 13603 + }, + { + "epoch": 0.1767777754315407, + "grad_norm": 0.32715073227882385, + "learning_rate": 0.00016467851154810956, + "loss": 1.3477, + "step": 13604 + }, + { + "epoch": 0.17679076997545656, + "grad_norm": 0.40638142824172974, + "learning_rate": 0.00016467591208619816, + "loss": 1.1918, + "step": 13605 + }, + { + "epoch": 0.17680376451937244, + "grad_norm": 0.31783148646354675, + "learning_rate": 0.00016467331262428678, + "loss": 1.3725, + "step": 13606 + }, + { + "epoch": 0.1768167590632883, + "grad_norm": 0.6016378402709961, + "learning_rate": 0.00016467071316237538, + "loss": 1.6826, + "step": 13607 + }, + { + "epoch": 0.17682975360720418, + "grad_norm": 0.403513640165329, + "learning_rate": 0.00016466811370046403, + "loss": 1.3186, + "step": 13608 + }, + { + "epoch": 0.17684274815112005, + "grad_norm": 0.4337422847747803, + "learning_rate": 0.00016466551423855263, + "loss": 1.3861, + "step": 13609 + }, + { + "epoch": 0.17685574269503593, + "grad_norm": 0.4710150957107544, + "learning_rate": 0.00016466291477664125, + "loss": 1.4649, + "step": 13610 + }, + { + "epoch": 0.1768687372389518, + "grad_norm": 0.3504270315170288, + "learning_rate": 0.00016466031531472985, + "loss": 1.1975, + "step": 13611 + }, + { + "epoch": 0.17688173178286767, + "grad_norm": 0.482486367225647, + "learning_rate": 0.00016465771585281848, + "loss": 1.5034, + "step": 13612 + }, + { + "epoch": 0.17689472632678355, + "grad_norm": 0.40145424008369446, + "learning_rate": 0.0001646551163909071, + "loss": 1.4967, + "step": 13613 + }, + { + "epoch": 0.17690772087069942, + "grad_norm": 0.4193040132522583, + "learning_rate": 0.0001646525169289957, + "loss": 1.4751, + "step": 13614 + }, + { + "epoch": 0.1769207154146153, + "grad_norm": 0.40405330061912537, + "learning_rate": 0.00016464991746708432, + "loss": 1.2807, + "step": 13615 + }, + { + "epoch": 0.17693370995853117, + "grad_norm": 0.40608423948287964, + "learning_rate": 0.00016464731800517295, + "loss": 1.6081, + "step": 13616 + }, + { + "epoch": 0.17694670450244704, + "grad_norm": 0.3507035970687866, + "learning_rate": 0.00016464471854326154, + "loss": 1.1923, + "step": 13617 + }, + { + "epoch": 0.1769596990463629, + "grad_norm": 0.30400189757347107, + "learning_rate": 0.00016464211908135017, + "loss": 1.2624, + "step": 13618 + }, + { + "epoch": 0.17697269359027878, + "grad_norm": 0.3268266022205353, + "learning_rate": 0.00016463951961943877, + "loss": 1.0787, + "step": 13619 + }, + { + "epoch": 0.17698568813419466, + "grad_norm": 0.464998722076416, + "learning_rate": 0.00016463692015752742, + "loss": 1.4647, + "step": 13620 + }, + { + "epoch": 0.17699868267811053, + "grad_norm": 0.2747749090194702, + "learning_rate": 0.00016463432069561601, + "loss": 1.3683, + "step": 13621 + }, + { + "epoch": 0.1770116772220264, + "grad_norm": 0.45030391216278076, + "learning_rate": 0.00016463172123370464, + "loss": 1.4602, + "step": 13622 + }, + { + "epoch": 0.17702467176594228, + "grad_norm": 0.42780452966690063, + "learning_rate": 0.00016462912177179324, + "loss": 1.4929, + "step": 13623 + }, + { + "epoch": 0.17703766630985815, + "grad_norm": 0.27499017119407654, + "learning_rate": 0.00016462652230988186, + "loss": 1.398, + "step": 13624 + }, + { + "epoch": 0.17705066085377402, + "grad_norm": 0.40039119124412537, + "learning_rate": 0.00016462392284797049, + "loss": 1.5109, + "step": 13625 + }, + { + "epoch": 0.1770636553976899, + "grad_norm": 0.41366317868232727, + "learning_rate": 0.00016462132338605908, + "loss": 1.4753, + "step": 13626 + }, + { + "epoch": 0.17707664994160577, + "grad_norm": 0.3705471158027649, + "learning_rate": 0.0001646187239241477, + "loss": 1.4735, + "step": 13627 + }, + { + "epoch": 0.17708964448552164, + "grad_norm": 0.4617614448070526, + "learning_rate": 0.00016461612446223633, + "loss": 1.6215, + "step": 13628 + }, + { + "epoch": 0.17710263902943751, + "grad_norm": 0.3661966025829315, + "learning_rate": 0.00016461352500032493, + "loss": 1.228, + "step": 13629 + }, + { + "epoch": 0.1771156335733534, + "grad_norm": 0.33977580070495605, + "learning_rate": 0.00016461092553841355, + "loss": 1.3822, + "step": 13630 + }, + { + "epoch": 0.17712862811726926, + "grad_norm": 0.3308474123477936, + "learning_rate": 0.00016460832607650215, + "loss": 1.32, + "step": 13631 + }, + { + "epoch": 0.17714162266118513, + "grad_norm": 0.40739473700523376, + "learning_rate": 0.0001646057266145908, + "loss": 1.5853, + "step": 13632 + }, + { + "epoch": 0.177154617205101, + "grad_norm": 0.31952229142189026, + "learning_rate": 0.0001646031271526794, + "loss": 1.3924, + "step": 13633 + }, + { + "epoch": 0.17716761174901688, + "grad_norm": 0.3937205374240875, + "learning_rate": 0.00016460052769076802, + "loss": 1.1896, + "step": 13634 + }, + { + "epoch": 0.17718060629293275, + "grad_norm": 0.3613029718399048, + "learning_rate": 0.00016459792822885662, + "loss": 1.2623, + "step": 13635 + }, + { + "epoch": 0.17719360083684862, + "grad_norm": 0.34531939029693604, + "learning_rate": 0.00016459532876694525, + "loss": 1.4017, + "step": 13636 + }, + { + "epoch": 0.1772065953807645, + "grad_norm": 0.37922030687332153, + "learning_rate": 0.00016459272930503387, + "loss": 1.5051, + "step": 13637 + }, + { + "epoch": 0.17721958992468037, + "grad_norm": 0.3644823431968689, + "learning_rate": 0.00016459012984312247, + "loss": 1.4082, + "step": 13638 + }, + { + "epoch": 0.17723258446859624, + "grad_norm": 0.3796321153640747, + "learning_rate": 0.00016458753038121112, + "loss": 1.4114, + "step": 13639 + }, + { + "epoch": 0.17724557901251212, + "grad_norm": 0.5171111226081848, + "learning_rate": 0.00016458493091929972, + "loss": 1.3827, + "step": 13640 + }, + { + "epoch": 0.177258573556428, + "grad_norm": 0.3387519121170044, + "learning_rate": 0.00016458233145738831, + "loss": 1.3418, + "step": 13641 + }, + { + "epoch": 0.17727156810034386, + "grad_norm": 0.3412686288356781, + "learning_rate": 0.00016457973199547694, + "loss": 1.471, + "step": 13642 + }, + { + "epoch": 0.17728456264425974, + "grad_norm": 0.3255894184112549, + "learning_rate": 0.00016457713253356556, + "loss": 1.231, + "step": 13643 + }, + { + "epoch": 0.1772975571881756, + "grad_norm": 0.4041486382484436, + "learning_rate": 0.0001645745330716542, + "loss": 1.3806, + "step": 13644 + }, + { + "epoch": 0.17731055173209148, + "grad_norm": 0.3939409852027893, + "learning_rate": 0.00016457193360974279, + "loss": 1.4591, + "step": 13645 + }, + { + "epoch": 0.17732354627600735, + "grad_norm": 0.35969486832618713, + "learning_rate": 0.0001645693341478314, + "loss": 1.5352, + "step": 13646 + }, + { + "epoch": 0.17733654081992323, + "grad_norm": 0.4259096384048462, + "learning_rate": 0.00016456673468592003, + "loss": 1.4844, + "step": 13647 + }, + { + "epoch": 0.1773495353638391, + "grad_norm": 0.35051289200782776, + "learning_rate": 0.00016456413522400863, + "loss": 1.2712, + "step": 13648 + }, + { + "epoch": 0.17736252990775497, + "grad_norm": 0.4646059572696686, + "learning_rate": 0.00016456153576209726, + "loss": 1.5192, + "step": 13649 + }, + { + "epoch": 0.17737552445167085, + "grad_norm": 0.44001391530036926, + "learning_rate": 0.00016455893630018585, + "loss": 1.4213, + "step": 13650 + }, + { + "epoch": 0.17738851899558672, + "grad_norm": 0.37329018115997314, + "learning_rate": 0.0001645563368382745, + "loss": 1.5207, + "step": 13651 + }, + { + "epoch": 0.1774015135395026, + "grad_norm": 0.4457497000694275, + "learning_rate": 0.0001645537373763631, + "loss": 1.4993, + "step": 13652 + }, + { + "epoch": 0.17741450808341847, + "grad_norm": 0.40998464822769165, + "learning_rate": 0.00016455113791445173, + "loss": 1.2091, + "step": 13653 + }, + { + "epoch": 0.17742750262733434, + "grad_norm": 0.3715917468070984, + "learning_rate": 0.00016454853845254032, + "loss": 1.3589, + "step": 13654 + }, + { + "epoch": 0.1774404971712502, + "grad_norm": 0.46409469842910767, + "learning_rate": 0.00016454593899062895, + "loss": 1.5047, + "step": 13655 + }, + { + "epoch": 0.17745349171516608, + "grad_norm": 0.40728530287742615, + "learning_rate": 0.00016454333952871757, + "loss": 1.4768, + "step": 13656 + }, + { + "epoch": 0.17746648625908196, + "grad_norm": 0.43321725726127625, + "learning_rate": 0.00016454074006680617, + "loss": 1.6089, + "step": 13657 + }, + { + "epoch": 0.17747948080299783, + "grad_norm": 0.438004732131958, + "learning_rate": 0.0001645381406048948, + "loss": 1.4013, + "step": 13658 + }, + { + "epoch": 0.1774924753469137, + "grad_norm": 0.4390339255332947, + "learning_rate": 0.00016453554114298342, + "loss": 1.3902, + "step": 13659 + }, + { + "epoch": 0.17750546989082958, + "grad_norm": 0.4157491624355316, + "learning_rate": 0.00016453294168107202, + "loss": 1.5744, + "step": 13660 + }, + { + "epoch": 0.17751846443474545, + "grad_norm": 0.3626139760017395, + "learning_rate": 0.00016453034221916064, + "loss": 1.3843, + "step": 13661 + }, + { + "epoch": 0.17753145897866132, + "grad_norm": 0.49568676948547363, + "learning_rate": 0.00016452774275724924, + "loss": 1.4975, + "step": 13662 + }, + { + "epoch": 0.1775444535225772, + "grad_norm": 0.4177223742008209, + "learning_rate": 0.0001645251432953379, + "loss": 1.5525, + "step": 13663 + }, + { + "epoch": 0.17755744806649307, + "grad_norm": 0.3607726991176605, + "learning_rate": 0.0001645225438334265, + "loss": 1.278, + "step": 13664 + }, + { + "epoch": 0.17757044261040897, + "grad_norm": 0.3979356288909912, + "learning_rate": 0.0001645199443715151, + "loss": 1.3086, + "step": 13665 + }, + { + "epoch": 0.17758343715432484, + "grad_norm": 0.42499083280563354, + "learning_rate": 0.0001645173449096037, + "loss": 1.5858, + "step": 13666 + }, + { + "epoch": 0.17759643169824071, + "grad_norm": 0.3166114389896393, + "learning_rate": 0.00016451474544769233, + "loss": 1.4167, + "step": 13667 + }, + { + "epoch": 0.1776094262421566, + "grad_norm": 0.440518856048584, + "learning_rate": 0.00016451214598578096, + "loss": 1.5791, + "step": 13668 + }, + { + "epoch": 0.17762242078607246, + "grad_norm": 0.3920338451862335, + "learning_rate": 0.00016450954652386956, + "loss": 1.3097, + "step": 13669 + }, + { + "epoch": 0.17763541532998833, + "grad_norm": 0.41914042830467224, + "learning_rate": 0.00016450694706195818, + "loss": 1.4959, + "step": 13670 + }, + { + "epoch": 0.1776484098739042, + "grad_norm": 0.38990136981010437, + "learning_rate": 0.0001645043476000468, + "loss": 1.2791, + "step": 13671 + }, + { + "epoch": 0.17766140441782008, + "grad_norm": 0.3316994309425354, + "learning_rate": 0.0001645017481381354, + "loss": 1.3783, + "step": 13672 + }, + { + "epoch": 0.17767439896173595, + "grad_norm": 0.4271516501903534, + "learning_rate": 0.00016449914867622403, + "loss": 1.3465, + "step": 13673 + }, + { + "epoch": 0.17768739350565182, + "grad_norm": 0.3148038685321808, + "learning_rate": 0.00016449654921431262, + "loss": 1.3453, + "step": 13674 + }, + { + "epoch": 0.1777003880495677, + "grad_norm": 0.25042101740837097, + "learning_rate": 0.00016449394975240128, + "loss": 1.228, + "step": 13675 + }, + { + "epoch": 0.17771338259348357, + "grad_norm": 0.28322187066078186, + "learning_rate": 0.00016449135029048987, + "loss": 1.5356, + "step": 13676 + }, + { + "epoch": 0.17772637713739944, + "grad_norm": 0.33902743458747864, + "learning_rate": 0.0001644887508285785, + "loss": 1.5147, + "step": 13677 + }, + { + "epoch": 0.17773937168131532, + "grad_norm": 0.3827899396419525, + "learning_rate": 0.00016448615136666712, + "loss": 1.5787, + "step": 13678 + }, + { + "epoch": 0.1777523662252312, + "grad_norm": 0.43593862652778625, + "learning_rate": 0.00016448355190475572, + "loss": 1.4679, + "step": 13679 + }, + { + "epoch": 0.17776536076914706, + "grad_norm": 0.40292251110076904, + "learning_rate": 0.00016448095244284434, + "loss": 1.2723, + "step": 13680 + }, + { + "epoch": 0.17777835531306294, + "grad_norm": 0.38633421063423157, + "learning_rate": 0.00016447835298093294, + "loss": 1.5552, + "step": 13681 + }, + { + "epoch": 0.1777913498569788, + "grad_norm": 0.23915137350559235, + "learning_rate": 0.0001644757535190216, + "loss": 1.1348, + "step": 13682 + }, + { + "epoch": 0.17780434440089468, + "grad_norm": 0.4916902482509613, + "learning_rate": 0.0001644731540571102, + "loss": 1.4298, + "step": 13683 + }, + { + "epoch": 0.17781733894481055, + "grad_norm": 0.3672642409801483, + "learning_rate": 0.0001644705545951988, + "loss": 1.3906, + "step": 13684 + }, + { + "epoch": 0.17783033348872643, + "grad_norm": 0.3869721293449402, + "learning_rate": 0.0001644679551332874, + "loss": 1.4877, + "step": 13685 + }, + { + "epoch": 0.1778433280326423, + "grad_norm": 0.32864007353782654, + "learning_rate": 0.00016446535567137604, + "loss": 1.392, + "step": 13686 + }, + { + "epoch": 0.17785632257655817, + "grad_norm": 0.5088270902633667, + "learning_rate": 0.00016446275620946466, + "loss": 1.3626, + "step": 13687 + }, + { + "epoch": 0.17786931712047405, + "grad_norm": 0.45448508858680725, + "learning_rate": 0.00016446015674755326, + "loss": 1.3931, + "step": 13688 + }, + { + "epoch": 0.17788231166438992, + "grad_norm": 0.4488750696182251, + "learning_rate": 0.00016445755728564188, + "loss": 1.3493, + "step": 13689 + }, + { + "epoch": 0.1778953062083058, + "grad_norm": 0.3026556074619293, + "learning_rate": 0.0001644549578237305, + "loss": 1.2031, + "step": 13690 + }, + { + "epoch": 0.17790830075222167, + "grad_norm": 0.3764384984970093, + "learning_rate": 0.0001644523583618191, + "loss": 1.4912, + "step": 13691 + }, + { + "epoch": 0.17792129529613754, + "grad_norm": 0.47016799449920654, + "learning_rate": 0.00016444975889990773, + "loss": 1.5177, + "step": 13692 + }, + { + "epoch": 0.1779342898400534, + "grad_norm": 0.4489028751850128, + "learning_rate": 0.00016444715943799633, + "loss": 1.3652, + "step": 13693 + }, + { + "epoch": 0.17794728438396928, + "grad_norm": 0.40877869725227356, + "learning_rate": 0.00016444455997608498, + "loss": 1.4849, + "step": 13694 + }, + { + "epoch": 0.17796027892788516, + "grad_norm": 0.3565262258052826, + "learning_rate": 0.00016444196051417358, + "loss": 1.4428, + "step": 13695 + }, + { + "epoch": 0.17797327347180103, + "grad_norm": 0.43148869276046753, + "learning_rate": 0.00016443936105226217, + "loss": 1.3438, + "step": 13696 + }, + { + "epoch": 0.1779862680157169, + "grad_norm": 0.30440086126327515, + "learning_rate": 0.0001644367615903508, + "loss": 1.3172, + "step": 13697 + }, + { + "epoch": 0.17799926255963278, + "grad_norm": 0.46799495816230774, + "learning_rate": 0.00016443416212843942, + "loss": 1.6404, + "step": 13698 + }, + { + "epoch": 0.17801225710354865, + "grad_norm": 0.4725026786327362, + "learning_rate": 0.00016443156266652805, + "loss": 1.4119, + "step": 13699 + }, + { + "epoch": 0.17802525164746452, + "grad_norm": 0.3942779302597046, + "learning_rate": 0.00016442896320461664, + "loss": 1.4096, + "step": 13700 + }, + { + "epoch": 0.1780382461913804, + "grad_norm": 0.4225706160068512, + "learning_rate": 0.00016442636374270527, + "loss": 1.4275, + "step": 13701 + }, + { + "epoch": 0.17805124073529627, + "grad_norm": 0.3985162675380707, + "learning_rate": 0.0001644237642807939, + "loss": 1.3276, + "step": 13702 + }, + { + "epoch": 0.17806423527921214, + "grad_norm": 0.3767097294330597, + "learning_rate": 0.0001644211648188825, + "loss": 1.3346, + "step": 13703 + }, + { + "epoch": 0.178077229823128, + "grad_norm": 0.42983728647232056, + "learning_rate": 0.00016441856535697111, + "loss": 1.3921, + "step": 13704 + }, + { + "epoch": 0.1780902243670439, + "grad_norm": 0.35475462675094604, + "learning_rate": 0.0001644159658950597, + "loss": 1.375, + "step": 13705 + }, + { + "epoch": 0.17810321891095976, + "grad_norm": 0.3303016424179077, + "learning_rate": 0.00016441336643314836, + "loss": 1.3996, + "step": 13706 + }, + { + "epoch": 0.17811621345487563, + "grad_norm": 0.3781614601612091, + "learning_rate": 0.00016441076697123696, + "loss": 1.5475, + "step": 13707 + }, + { + "epoch": 0.1781292079987915, + "grad_norm": 0.6083277463912964, + "learning_rate": 0.00016440816750932559, + "loss": 1.5927, + "step": 13708 + }, + { + "epoch": 0.17814220254270738, + "grad_norm": 0.3208395838737488, + "learning_rate": 0.00016440556804741418, + "loss": 1.1576, + "step": 13709 + }, + { + "epoch": 0.17815519708662325, + "grad_norm": 0.43532049655914307, + "learning_rate": 0.0001644029685855028, + "loss": 1.3186, + "step": 13710 + }, + { + "epoch": 0.17816819163053912, + "grad_norm": 0.3627203106880188, + "learning_rate": 0.00016440036912359143, + "loss": 1.3851, + "step": 13711 + }, + { + "epoch": 0.178181186174455, + "grad_norm": 0.36681053042411804, + "learning_rate": 0.00016439776966168003, + "loss": 1.5454, + "step": 13712 + }, + { + "epoch": 0.17819418071837087, + "grad_norm": 0.4069535732269287, + "learning_rate": 0.00016439517019976865, + "loss": 1.6454, + "step": 13713 + }, + { + "epoch": 0.17820717526228674, + "grad_norm": 0.4214186370372772, + "learning_rate": 0.00016439257073785728, + "loss": 1.5185, + "step": 13714 + }, + { + "epoch": 0.17822016980620262, + "grad_norm": 0.45307931303977966, + "learning_rate": 0.00016438997127594588, + "loss": 1.418, + "step": 13715 + }, + { + "epoch": 0.1782331643501185, + "grad_norm": 0.3571094274520874, + "learning_rate": 0.0001643873718140345, + "loss": 1.4635, + "step": 13716 + }, + { + "epoch": 0.17824615889403436, + "grad_norm": 0.35167446732521057, + "learning_rate": 0.00016438477235212312, + "loss": 1.2989, + "step": 13717 + }, + { + "epoch": 0.17825915343795024, + "grad_norm": 0.34684211015701294, + "learning_rate": 0.00016438217289021175, + "loss": 1.4367, + "step": 13718 + }, + { + "epoch": 0.1782721479818661, + "grad_norm": 0.4268544912338257, + "learning_rate": 0.00016437957342830035, + "loss": 1.4932, + "step": 13719 + }, + { + "epoch": 0.17828514252578198, + "grad_norm": 0.4498889148235321, + "learning_rate": 0.00016437697396638897, + "loss": 1.5777, + "step": 13720 + }, + { + "epoch": 0.17829813706969785, + "grad_norm": 0.37742704153060913, + "learning_rate": 0.0001643743745044776, + "loss": 1.5398, + "step": 13721 + }, + { + "epoch": 0.17831113161361373, + "grad_norm": 0.3591799736022949, + "learning_rate": 0.0001643717750425662, + "loss": 1.3604, + "step": 13722 + }, + { + "epoch": 0.1783241261575296, + "grad_norm": 0.36309027671813965, + "learning_rate": 0.00016436917558065482, + "loss": 1.3675, + "step": 13723 + }, + { + "epoch": 0.17833712070144547, + "grad_norm": 0.374479740858078, + "learning_rate": 0.00016436657611874341, + "loss": 1.3342, + "step": 13724 + }, + { + "epoch": 0.17835011524536135, + "grad_norm": 0.42510488629341125, + "learning_rate": 0.00016436397665683204, + "loss": 1.4327, + "step": 13725 + }, + { + "epoch": 0.17836310978927722, + "grad_norm": 0.44269806146621704, + "learning_rate": 0.00016436137719492066, + "loss": 1.4045, + "step": 13726 + }, + { + "epoch": 0.1783761043331931, + "grad_norm": 0.4799162447452545, + "learning_rate": 0.00016435877773300926, + "loss": 1.4628, + "step": 13727 + }, + { + "epoch": 0.17838909887710896, + "grad_norm": 0.4027549624443054, + "learning_rate": 0.00016435617827109789, + "loss": 1.373, + "step": 13728 + }, + { + "epoch": 0.17840209342102484, + "grad_norm": 0.35438939929008484, + "learning_rate": 0.0001643535788091865, + "loss": 1.4518, + "step": 13729 + }, + { + "epoch": 0.1784150879649407, + "grad_norm": 0.4149875342845917, + "learning_rate": 0.00016435097934727513, + "loss": 1.4657, + "step": 13730 + }, + { + "epoch": 0.17842808250885658, + "grad_norm": 0.4245990216732025, + "learning_rate": 0.00016434837988536373, + "loss": 1.4014, + "step": 13731 + }, + { + "epoch": 0.17844107705277246, + "grad_norm": 0.3770747780799866, + "learning_rate": 0.00016434578042345236, + "loss": 1.5124, + "step": 13732 + }, + { + "epoch": 0.17845407159668833, + "grad_norm": 0.36359724402427673, + "learning_rate": 0.00016434318096154098, + "loss": 1.3344, + "step": 13733 + }, + { + "epoch": 0.1784670661406042, + "grad_norm": 0.3413032591342926, + "learning_rate": 0.00016434058149962958, + "loss": 1.3387, + "step": 13734 + }, + { + "epoch": 0.17848006068452008, + "grad_norm": 0.43239447474479675, + "learning_rate": 0.0001643379820377182, + "loss": 1.6183, + "step": 13735 + }, + { + "epoch": 0.17849305522843595, + "grad_norm": 0.4184035062789917, + "learning_rate": 0.0001643353825758068, + "loss": 1.4185, + "step": 13736 + }, + { + "epoch": 0.17850604977235182, + "grad_norm": 0.4724022150039673, + "learning_rate": 0.00016433278311389545, + "loss": 1.3884, + "step": 13737 + }, + { + "epoch": 0.1785190443162677, + "grad_norm": 0.35345447063446045, + "learning_rate": 0.00016433018365198405, + "loss": 1.3651, + "step": 13738 + }, + { + "epoch": 0.17853203886018357, + "grad_norm": 0.36239373683929443, + "learning_rate": 0.00016432758419007265, + "loss": 1.4274, + "step": 13739 + }, + { + "epoch": 0.17854503340409944, + "grad_norm": 0.45474785566329956, + "learning_rate": 0.00016432498472816127, + "loss": 1.6063, + "step": 13740 + }, + { + "epoch": 0.1785580279480153, + "grad_norm": 0.4569450914859772, + "learning_rate": 0.0001643223852662499, + "loss": 1.2744, + "step": 13741 + }, + { + "epoch": 0.17857102249193121, + "grad_norm": 0.38433629274368286, + "learning_rate": 0.00016431978580433852, + "loss": 1.5087, + "step": 13742 + }, + { + "epoch": 0.1785840170358471, + "grad_norm": 0.39262667298316956, + "learning_rate": 0.00016431718634242712, + "loss": 1.5212, + "step": 13743 + }, + { + "epoch": 0.17859701157976296, + "grad_norm": 0.4967557489871979, + "learning_rate": 0.00016431458688051574, + "loss": 1.5266, + "step": 13744 + }, + { + "epoch": 0.17861000612367883, + "grad_norm": 0.42679092288017273, + "learning_rate": 0.00016431198741860437, + "loss": 1.3931, + "step": 13745 + }, + { + "epoch": 0.1786230006675947, + "grad_norm": 0.45991623401641846, + "learning_rate": 0.00016430938795669296, + "loss": 1.3164, + "step": 13746 + }, + { + "epoch": 0.17863599521151058, + "grad_norm": 0.3419206440448761, + "learning_rate": 0.0001643067884947816, + "loss": 1.5314, + "step": 13747 + }, + { + "epoch": 0.17864898975542645, + "grad_norm": 0.3307262361049652, + "learning_rate": 0.00016430418903287019, + "loss": 1.2903, + "step": 13748 + }, + { + "epoch": 0.17866198429934232, + "grad_norm": 0.4099598824977875, + "learning_rate": 0.00016430158957095884, + "loss": 1.4847, + "step": 13749 + }, + { + "epoch": 0.1786749788432582, + "grad_norm": 0.516158401966095, + "learning_rate": 0.00016429899010904743, + "loss": 1.4336, + "step": 13750 + }, + { + "epoch": 0.17868797338717407, + "grad_norm": 0.4255308210849762, + "learning_rate": 0.00016429639064713603, + "loss": 1.4437, + "step": 13751 + }, + { + "epoch": 0.17870096793108994, + "grad_norm": 0.430316686630249, + "learning_rate": 0.00016429379118522468, + "loss": 1.4715, + "step": 13752 + }, + { + "epoch": 0.17871396247500582, + "grad_norm": 0.4875301420688629, + "learning_rate": 0.00016429119172331328, + "loss": 1.4921, + "step": 13753 + }, + { + "epoch": 0.1787269570189217, + "grad_norm": 0.3707745671272278, + "learning_rate": 0.0001642885922614019, + "loss": 1.5666, + "step": 13754 + }, + { + "epoch": 0.17873995156283756, + "grad_norm": 0.43877026438713074, + "learning_rate": 0.0001642859927994905, + "loss": 1.3701, + "step": 13755 + }, + { + "epoch": 0.17875294610675344, + "grad_norm": 0.43068212270736694, + "learning_rate": 0.00016428339333757913, + "loss": 1.5801, + "step": 13756 + }, + { + "epoch": 0.1787659406506693, + "grad_norm": 0.3238990902900696, + "learning_rate": 0.00016428079387566775, + "loss": 1.4267, + "step": 13757 + }, + { + "epoch": 0.17877893519458518, + "grad_norm": 0.38150307536125183, + "learning_rate": 0.00016427819441375635, + "loss": 1.4572, + "step": 13758 + }, + { + "epoch": 0.17879192973850105, + "grad_norm": 0.45798560976982117, + "learning_rate": 0.00016427559495184497, + "loss": 1.504, + "step": 13759 + }, + { + "epoch": 0.17880492428241693, + "grad_norm": 0.36258891224861145, + "learning_rate": 0.0001642729954899336, + "loss": 1.5949, + "step": 13760 + }, + { + "epoch": 0.1788179188263328, + "grad_norm": 0.40525761246681213, + "learning_rate": 0.00016427039602802222, + "loss": 1.4535, + "step": 13761 + }, + { + "epoch": 0.17883091337024867, + "grad_norm": 0.43064939975738525, + "learning_rate": 0.00016426779656611082, + "loss": 1.4622, + "step": 13762 + }, + { + "epoch": 0.17884390791416455, + "grad_norm": 0.43405669927597046, + "learning_rate": 0.00016426519710419944, + "loss": 1.4771, + "step": 13763 + }, + { + "epoch": 0.17885690245808042, + "grad_norm": 0.39857834577560425, + "learning_rate": 0.00016426259764228807, + "loss": 1.5152, + "step": 13764 + }, + { + "epoch": 0.1788698970019963, + "grad_norm": 0.547569215297699, + "learning_rate": 0.00016425999818037667, + "loss": 1.5585, + "step": 13765 + }, + { + "epoch": 0.17888289154591216, + "grad_norm": 0.45613643527030945, + "learning_rate": 0.0001642573987184653, + "loss": 1.4797, + "step": 13766 + }, + { + "epoch": 0.17889588608982804, + "grad_norm": 0.3804607093334198, + "learning_rate": 0.0001642547992565539, + "loss": 1.6104, + "step": 13767 + }, + { + "epoch": 0.1789088806337439, + "grad_norm": 0.4312661588191986, + "learning_rate": 0.0001642521997946425, + "loss": 1.3192, + "step": 13768 + }, + { + "epoch": 0.17892187517765978, + "grad_norm": 0.4098782539367676, + "learning_rate": 0.00016424960033273114, + "loss": 1.3174, + "step": 13769 + }, + { + "epoch": 0.17893486972157566, + "grad_norm": 0.38290780782699585, + "learning_rate": 0.00016424700087081973, + "loss": 1.4901, + "step": 13770 + }, + { + "epoch": 0.17894786426549153, + "grad_norm": 0.4679395258426666, + "learning_rate": 0.00016424440140890836, + "loss": 1.4962, + "step": 13771 + }, + { + "epoch": 0.1789608588094074, + "grad_norm": 0.3485274910926819, + "learning_rate": 0.00016424180194699698, + "loss": 1.4394, + "step": 13772 + }, + { + "epoch": 0.17897385335332328, + "grad_norm": 0.32977503538131714, + "learning_rate": 0.0001642392024850856, + "loss": 1.3033, + "step": 13773 + }, + { + "epoch": 0.17898684789723915, + "grad_norm": 0.373714417219162, + "learning_rate": 0.0001642366030231742, + "loss": 1.3787, + "step": 13774 + }, + { + "epoch": 0.17899984244115502, + "grad_norm": 0.2925184965133667, + "learning_rate": 0.00016423400356126283, + "loss": 1.0962, + "step": 13775 + }, + { + "epoch": 0.1790128369850709, + "grad_norm": 0.3441365659236908, + "learning_rate": 0.00016423140409935145, + "loss": 1.4411, + "step": 13776 + }, + { + "epoch": 0.17902583152898677, + "grad_norm": 0.35370850563049316, + "learning_rate": 0.00016422880463744005, + "loss": 1.589, + "step": 13777 + }, + { + "epoch": 0.17903882607290264, + "grad_norm": 0.43551695346832275, + "learning_rate": 0.00016422620517552868, + "loss": 1.3024, + "step": 13778 + }, + { + "epoch": 0.1790518206168185, + "grad_norm": 0.5173757076263428, + "learning_rate": 0.00016422360571361727, + "loss": 1.446, + "step": 13779 + }, + { + "epoch": 0.1790648151607344, + "grad_norm": 0.3859349191188812, + "learning_rate": 0.0001642210062517059, + "loss": 1.3633, + "step": 13780 + }, + { + "epoch": 0.17907780970465026, + "grad_norm": 0.3870016932487488, + "learning_rate": 0.00016421840678979452, + "loss": 1.3529, + "step": 13781 + }, + { + "epoch": 0.17909080424856613, + "grad_norm": 0.48903608322143555, + "learning_rate": 0.00016421580732788312, + "loss": 1.5851, + "step": 13782 + }, + { + "epoch": 0.179103798792482, + "grad_norm": 0.4143184423446655, + "learning_rate": 0.00016421320786597174, + "loss": 1.4583, + "step": 13783 + }, + { + "epoch": 0.17911679333639788, + "grad_norm": 0.40734627842903137, + "learning_rate": 0.00016421060840406037, + "loss": 1.4948, + "step": 13784 + }, + { + "epoch": 0.17912978788031375, + "grad_norm": 0.402630478143692, + "learning_rate": 0.000164208008942149, + "loss": 1.3483, + "step": 13785 + }, + { + "epoch": 0.17914278242422962, + "grad_norm": 0.3838624656200409, + "learning_rate": 0.0001642054094802376, + "loss": 1.4212, + "step": 13786 + }, + { + "epoch": 0.1791557769681455, + "grad_norm": 0.44989466667175293, + "learning_rate": 0.00016420281001832622, + "loss": 1.5962, + "step": 13787 + }, + { + "epoch": 0.17916877151206137, + "grad_norm": 0.3460371792316437, + "learning_rate": 0.00016420021055641484, + "loss": 1.3895, + "step": 13788 + }, + { + "epoch": 0.17918176605597724, + "grad_norm": 0.39832690358161926, + "learning_rate": 0.00016419761109450344, + "loss": 1.269, + "step": 13789 + }, + { + "epoch": 0.17919476059989312, + "grad_norm": 0.47547417879104614, + "learning_rate": 0.00016419501163259206, + "loss": 1.5116, + "step": 13790 + }, + { + "epoch": 0.179207755143809, + "grad_norm": 0.4040995240211487, + "learning_rate": 0.00016419241217068069, + "loss": 1.3665, + "step": 13791 + }, + { + "epoch": 0.17922074968772486, + "grad_norm": 0.310497909784317, + "learning_rate": 0.0001641898127087693, + "loss": 1.4201, + "step": 13792 + }, + { + "epoch": 0.17923374423164073, + "grad_norm": 0.36639413237571716, + "learning_rate": 0.0001641872132468579, + "loss": 1.3808, + "step": 13793 + }, + { + "epoch": 0.1792467387755566, + "grad_norm": 0.407367467880249, + "learning_rate": 0.0001641846137849465, + "loss": 1.5367, + "step": 13794 + }, + { + "epoch": 0.17925973331947248, + "grad_norm": 0.3933084011077881, + "learning_rate": 0.00016418201432303516, + "loss": 1.2892, + "step": 13795 + }, + { + "epoch": 0.17927272786338835, + "grad_norm": 0.28527238965034485, + "learning_rate": 0.00016417941486112375, + "loss": 1.2736, + "step": 13796 + }, + { + "epoch": 0.17928572240730423, + "grad_norm": 0.46660131216049194, + "learning_rate": 0.00016417681539921238, + "loss": 1.4808, + "step": 13797 + }, + { + "epoch": 0.1792987169512201, + "grad_norm": 0.3512675166130066, + "learning_rate": 0.00016417421593730098, + "loss": 1.4368, + "step": 13798 + }, + { + "epoch": 0.17931171149513597, + "grad_norm": 0.3426361680030823, + "learning_rate": 0.0001641716164753896, + "loss": 1.3443, + "step": 13799 + }, + { + "epoch": 0.17932470603905185, + "grad_norm": 0.48628151416778564, + "learning_rate": 0.00016416901701347823, + "loss": 1.6091, + "step": 13800 + }, + { + "epoch": 0.17933770058296772, + "grad_norm": 0.40451952815055847, + "learning_rate": 0.00016416641755156682, + "loss": 1.4953, + "step": 13801 + }, + { + "epoch": 0.1793506951268836, + "grad_norm": 0.3639897108078003, + "learning_rate": 0.00016416381808965545, + "loss": 1.3484, + "step": 13802 + }, + { + "epoch": 0.17936368967079946, + "grad_norm": 0.3489830195903778, + "learning_rate": 0.00016416121862774407, + "loss": 1.2712, + "step": 13803 + }, + { + "epoch": 0.17937668421471534, + "grad_norm": 0.4058685004711151, + "learning_rate": 0.0001641586191658327, + "loss": 1.4822, + "step": 13804 + }, + { + "epoch": 0.1793896787586312, + "grad_norm": 0.4718252420425415, + "learning_rate": 0.0001641560197039213, + "loss": 1.551, + "step": 13805 + }, + { + "epoch": 0.17940267330254708, + "grad_norm": 0.4722570776939392, + "learning_rate": 0.0001641534202420099, + "loss": 1.4756, + "step": 13806 + }, + { + "epoch": 0.17941566784646296, + "grad_norm": 0.43057000637054443, + "learning_rate": 0.00016415082078009854, + "loss": 1.5222, + "step": 13807 + }, + { + "epoch": 0.17942866239037883, + "grad_norm": 0.4507659375667572, + "learning_rate": 0.00016414822131818714, + "loss": 1.5635, + "step": 13808 + }, + { + "epoch": 0.1794416569342947, + "grad_norm": 0.39975231885910034, + "learning_rate": 0.00016414562185627576, + "loss": 1.3116, + "step": 13809 + }, + { + "epoch": 0.17945465147821058, + "grad_norm": 0.38917291164398193, + "learning_rate": 0.00016414302239436436, + "loss": 1.3201, + "step": 13810 + }, + { + "epoch": 0.17946764602212645, + "grad_norm": 0.2852829694747925, + "learning_rate": 0.00016414042293245299, + "loss": 1.332, + "step": 13811 + }, + { + "epoch": 0.17948064056604232, + "grad_norm": 0.3799082338809967, + "learning_rate": 0.0001641378234705416, + "loss": 1.4331, + "step": 13812 + }, + { + "epoch": 0.1794936351099582, + "grad_norm": 0.32822662591934204, + "learning_rate": 0.0001641352240086302, + "loss": 1.2213, + "step": 13813 + }, + { + "epoch": 0.17950662965387407, + "grad_norm": 0.44622769951820374, + "learning_rate": 0.00016413262454671883, + "loss": 1.5485, + "step": 13814 + }, + { + "epoch": 0.17951962419778994, + "grad_norm": 0.44788530468940735, + "learning_rate": 0.00016413002508480746, + "loss": 1.5579, + "step": 13815 + }, + { + "epoch": 0.1795326187417058, + "grad_norm": 0.4718187153339386, + "learning_rate": 0.00016412742562289608, + "loss": 1.3605, + "step": 13816 + }, + { + "epoch": 0.17954561328562169, + "grad_norm": 0.4571569859981537, + "learning_rate": 0.00016412482616098468, + "loss": 1.375, + "step": 13817 + }, + { + "epoch": 0.1795586078295376, + "grad_norm": 0.36007070541381836, + "learning_rate": 0.00016412222669907328, + "loss": 1.2141, + "step": 13818 + }, + { + "epoch": 0.17957160237345346, + "grad_norm": 0.40235635638237, + "learning_rate": 0.00016411962723716193, + "loss": 1.2654, + "step": 13819 + }, + { + "epoch": 0.17958459691736933, + "grad_norm": 0.45574745535850525, + "learning_rate": 0.00016411702777525052, + "loss": 1.4978, + "step": 13820 + }, + { + "epoch": 0.1795975914612852, + "grad_norm": 0.4910139739513397, + "learning_rate": 0.00016411442831333915, + "loss": 1.3389, + "step": 13821 + }, + { + "epoch": 0.17961058600520108, + "grad_norm": 0.41516998410224915, + "learning_rate": 0.00016411182885142775, + "loss": 1.4916, + "step": 13822 + }, + { + "epoch": 0.17962358054911695, + "grad_norm": 0.41109657287597656, + "learning_rate": 0.00016410922938951637, + "loss": 1.5574, + "step": 13823 + }, + { + "epoch": 0.17963657509303282, + "grad_norm": 0.39612483978271484, + "learning_rate": 0.000164106629927605, + "loss": 1.2068, + "step": 13824 + }, + { + "epoch": 0.1796495696369487, + "grad_norm": 0.4362066090106964, + "learning_rate": 0.0001641040304656936, + "loss": 1.6603, + "step": 13825 + }, + { + "epoch": 0.17966256418086457, + "grad_norm": 0.4348241984844208, + "learning_rate": 0.00016410143100378224, + "loss": 1.6296, + "step": 13826 + }, + { + "epoch": 0.17967555872478044, + "grad_norm": 0.48206841945648193, + "learning_rate": 0.00016409883154187084, + "loss": 1.54, + "step": 13827 + }, + { + "epoch": 0.17968855326869632, + "grad_norm": 0.3649912178516388, + "learning_rate": 0.00016409623207995947, + "loss": 1.2774, + "step": 13828 + }, + { + "epoch": 0.1797015478126122, + "grad_norm": 0.3826022446155548, + "learning_rate": 0.00016409363261804806, + "loss": 1.562, + "step": 13829 + }, + { + "epoch": 0.17971454235652806, + "grad_norm": 0.5306301712989807, + "learning_rate": 0.0001640910331561367, + "loss": 1.465, + "step": 13830 + }, + { + "epoch": 0.17972753690044393, + "grad_norm": 0.38976162672042847, + "learning_rate": 0.0001640884336942253, + "loss": 1.4673, + "step": 13831 + }, + { + "epoch": 0.1797405314443598, + "grad_norm": 0.49456754326820374, + "learning_rate": 0.0001640858342323139, + "loss": 1.5426, + "step": 13832 + }, + { + "epoch": 0.17975352598827568, + "grad_norm": 0.34266799688339233, + "learning_rate": 0.00016408323477040253, + "loss": 1.4015, + "step": 13833 + }, + { + "epoch": 0.17976652053219155, + "grad_norm": 0.35109132528305054, + "learning_rate": 0.00016408063530849116, + "loss": 1.4474, + "step": 13834 + }, + { + "epoch": 0.17977951507610743, + "grad_norm": 0.4558723270893097, + "learning_rate": 0.00016407803584657976, + "loss": 1.4873, + "step": 13835 + }, + { + "epoch": 0.1797925096200233, + "grad_norm": 0.4390784204006195, + "learning_rate": 0.00016407543638466838, + "loss": 1.4802, + "step": 13836 + }, + { + "epoch": 0.17980550416393917, + "grad_norm": 0.41165438294410706, + "learning_rate": 0.00016407283692275698, + "loss": 1.4237, + "step": 13837 + }, + { + "epoch": 0.17981849870785505, + "grad_norm": 0.3708517253398895, + "learning_rate": 0.00016407023746084563, + "loss": 1.4113, + "step": 13838 + }, + { + "epoch": 0.17983149325177092, + "grad_norm": 0.30848339200019836, + "learning_rate": 0.00016406763799893423, + "loss": 1.4877, + "step": 13839 + }, + { + "epoch": 0.1798444877956868, + "grad_norm": 0.3068132698535919, + "learning_rate": 0.00016406503853702285, + "loss": 1.5416, + "step": 13840 + }, + { + "epoch": 0.17985748233960266, + "grad_norm": 0.32952257990837097, + "learning_rate": 0.00016406243907511145, + "loss": 1.4833, + "step": 13841 + }, + { + "epoch": 0.17987047688351854, + "grad_norm": 0.44472792744636536, + "learning_rate": 0.00016405983961320007, + "loss": 1.427, + "step": 13842 + }, + { + "epoch": 0.1798834714274344, + "grad_norm": 0.3594261109828949, + "learning_rate": 0.0001640572401512887, + "loss": 1.4627, + "step": 13843 + }, + { + "epoch": 0.17989646597135028, + "grad_norm": 0.3843877613544464, + "learning_rate": 0.0001640546406893773, + "loss": 1.3977, + "step": 13844 + }, + { + "epoch": 0.17990946051526616, + "grad_norm": 0.38599681854248047, + "learning_rate": 0.00016405204122746592, + "loss": 1.4046, + "step": 13845 + }, + { + "epoch": 0.17992245505918203, + "grad_norm": 0.4580393433570862, + "learning_rate": 0.00016404944176555454, + "loss": 1.4546, + "step": 13846 + }, + { + "epoch": 0.1799354496030979, + "grad_norm": 0.3166486918926239, + "learning_rate": 0.00016404684230364314, + "loss": 1.3271, + "step": 13847 + }, + { + "epoch": 0.17994844414701378, + "grad_norm": 0.3559587895870209, + "learning_rate": 0.00016404424284173177, + "loss": 1.413, + "step": 13848 + }, + { + "epoch": 0.17996143869092965, + "grad_norm": 0.48100772500038147, + "learning_rate": 0.00016404164337982036, + "loss": 1.4306, + "step": 13849 + }, + { + "epoch": 0.17997443323484552, + "grad_norm": 0.45428118109703064, + "learning_rate": 0.00016403904391790902, + "loss": 1.6888, + "step": 13850 + }, + { + "epoch": 0.1799874277787614, + "grad_norm": 0.433809369802475, + "learning_rate": 0.0001640364444559976, + "loss": 1.5382, + "step": 13851 + }, + { + "epoch": 0.18000042232267727, + "grad_norm": 0.4594508111476898, + "learning_rate": 0.00016403384499408624, + "loss": 1.5995, + "step": 13852 + }, + { + "epoch": 0.18001341686659314, + "grad_norm": 0.4555775821208954, + "learning_rate": 0.00016403124553217483, + "loss": 1.439, + "step": 13853 + }, + { + "epoch": 0.180026411410509, + "grad_norm": 0.32143816351890564, + "learning_rate": 0.00016402864607026346, + "loss": 1.375, + "step": 13854 + }, + { + "epoch": 0.18003940595442489, + "grad_norm": 0.3113643229007721, + "learning_rate": 0.00016402604660835208, + "loss": 1.5854, + "step": 13855 + }, + { + "epoch": 0.18005240049834076, + "grad_norm": 0.4634411036968231, + "learning_rate": 0.00016402344714644068, + "loss": 1.5706, + "step": 13856 + }, + { + "epoch": 0.18006539504225663, + "grad_norm": 0.32296934723854065, + "learning_rate": 0.0001640208476845293, + "loss": 1.3125, + "step": 13857 + }, + { + "epoch": 0.1800783895861725, + "grad_norm": 0.3722306489944458, + "learning_rate": 0.00016401824822261793, + "loss": 1.4272, + "step": 13858 + }, + { + "epoch": 0.18009138413008838, + "grad_norm": 0.4071848690509796, + "learning_rate": 0.00016401564876070655, + "loss": 1.5021, + "step": 13859 + }, + { + "epoch": 0.18010437867400425, + "grad_norm": 0.36169856786727905, + "learning_rate": 0.00016401304929879515, + "loss": 1.4682, + "step": 13860 + }, + { + "epoch": 0.18011737321792012, + "grad_norm": 0.44762492179870605, + "learning_rate": 0.00016401044983688378, + "loss": 1.5594, + "step": 13861 + }, + { + "epoch": 0.180130367761836, + "grad_norm": 0.46865421533584595, + "learning_rate": 0.0001640078503749724, + "loss": 1.5174, + "step": 13862 + }, + { + "epoch": 0.18014336230575187, + "grad_norm": 0.33863139152526855, + "learning_rate": 0.000164005250913061, + "loss": 1.3611, + "step": 13863 + }, + { + "epoch": 0.18015635684966774, + "grad_norm": 0.29973089694976807, + "learning_rate": 0.00016400265145114962, + "loss": 1.3293, + "step": 13864 + }, + { + "epoch": 0.18016935139358362, + "grad_norm": 0.4617586135864258, + "learning_rate": 0.00016400005198923825, + "loss": 1.4859, + "step": 13865 + }, + { + "epoch": 0.1801823459374995, + "grad_norm": 0.3091930150985718, + "learning_rate": 0.00016399745252732684, + "loss": 1.489, + "step": 13866 + }, + { + "epoch": 0.18019534048141536, + "grad_norm": 0.3450116813182831, + "learning_rate": 0.00016399485306541547, + "loss": 1.4479, + "step": 13867 + }, + { + "epoch": 0.18020833502533123, + "grad_norm": 0.3327886760234833, + "learning_rate": 0.00016399225360350407, + "loss": 1.3031, + "step": 13868 + }, + { + "epoch": 0.1802213295692471, + "grad_norm": 0.36577045917510986, + "learning_rate": 0.00016398965414159272, + "loss": 1.3405, + "step": 13869 + }, + { + "epoch": 0.18023432411316298, + "grad_norm": 0.39328888058662415, + "learning_rate": 0.00016398705467968132, + "loss": 1.4669, + "step": 13870 + }, + { + "epoch": 0.18024731865707885, + "grad_norm": 0.3182171583175659, + "learning_rate": 0.00016398445521776994, + "loss": 1.4693, + "step": 13871 + }, + { + "epoch": 0.18026031320099473, + "grad_norm": 0.3632968068122864, + "learning_rate": 0.00016398185575585854, + "loss": 1.2892, + "step": 13872 + }, + { + "epoch": 0.1802733077449106, + "grad_norm": 0.45355263352394104, + "learning_rate": 0.00016397925629394716, + "loss": 1.3309, + "step": 13873 + }, + { + "epoch": 0.18028630228882647, + "grad_norm": 0.3434823453426361, + "learning_rate": 0.00016397665683203579, + "loss": 1.4543, + "step": 13874 + }, + { + "epoch": 0.18029929683274235, + "grad_norm": 0.39346957206726074, + "learning_rate": 0.00016397405737012438, + "loss": 1.5127, + "step": 13875 + }, + { + "epoch": 0.18031229137665822, + "grad_norm": 0.3791983425617218, + "learning_rate": 0.000163971457908213, + "loss": 1.5049, + "step": 13876 + }, + { + "epoch": 0.1803252859205741, + "grad_norm": 0.40748652815818787, + "learning_rate": 0.00016396885844630163, + "loss": 1.3163, + "step": 13877 + }, + { + "epoch": 0.18033828046448996, + "grad_norm": 0.365323007106781, + "learning_rate": 0.00016396625898439023, + "loss": 1.6092, + "step": 13878 + }, + { + "epoch": 0.18035127500840584, + "grad_norm": 0.30958792567253113, + "learning_rate": 0.00016396365952247885, + "loss": 1.5301, + "step": 13879 + }, + { + "epoch": 0.1803642695523217, + "grad_norm": 0.35711079835891724, + "learning_rate": 0.00016396106006056745, + "loss": 1.4603, + "step": 13880 + }, + { + "epoch": 0.18037726409623758, + "grad_norm": 0.34600594639778137, + "learning_rate": 0.0001639584605986561, + "loss": 1.5782, + "step": 13881 + }, + { + "epoch": 0.18039025864015346, + "grad_norm": 0.31940600275993347, + "learning_rate": 0.0001639558611367447, + "loss": 1.2671, + "step": 13882 + }, + { + "epoch": 0.18040325318406933, + "grad_norm": 0.428899347782135, + "learning_rate": 0.00016395326167483333, + "loss": 1.3482, + "step": 13883 + }, + { + "epoch": 0.1804162477279852, + "grad_norm": 0.3702729642391205, + "learning_rate": 0.00016395066221292192, + "loss": 1.4955, + "step": 13884 + }, + { + "epoch": 0.18042924227190107, + "grad_norm": 0.42614781856536865, + "learning_rate": 0.00016394806275101055, + "loss": 1.469, + "step": 13885 + }, + { + "epoch": 0.18044223681581695, + "grad_norm": 0.3947365880012512, + "learning_rate": 0.00016394546328909917, + "loss": 1.345, + "step": 13886 + }, + { + "epoch": 0.18045523135973282, + "grad_norm": 0.38064542412757874, + "learning_rate": 0.00016394286382718777, + "loss": 1.4187, + "step": 13887 + }, + { + "epoch": 0.1804682259036487, + "grad_norm": 0.36633992195129395, + "learning_rate": 0.0001639402643652764, + "loss": 1.4348, + "step": 13888 + }, + { + "epoch": 0.18048122044756457, + "grad_norm": 0.43246686458587646, + "learning_rate": 0.00016393766490336502, + "loss": 1.3842, + "step": 13889 + }, + { + "epoch": 0.18049421499148044, + "grad_norm": 0.32606256008148193, + "learning_rate": 0.00016393506544145362, + "loss": 1.5689, + "step": 13890 + }, + { + "epoch": 0.1805072095353963, + "grad_norm": 0.3458372950553894, + "learning_rate": 0.00016393246597954224, + "loss": 1.5251, + "step": 13891 + }, + { + "epoch": 0.18052020407931219, + "grad_norm": 0.32178136706352234, + "learning_rate": 0.00016392986651763084, + "loss": 1.3493, + "step": 13892 + }, + { + "epoch": 0.18053319862322806, + "grad_norm": 0.45217379927635193, + "learning_rate": 0.0001639272670557195, + "loss": 1.3539, + "step": 13893 + }, + { + "epoch": 0.18054619316714396, + "grad_norm": 0.30768677592277527, + "learning_rate": 0.00016392466759380809, + "loss": 1.5169, + "step": 13894 + }, + { + "epoch": 0.18055918771105983, + "grad_norm": 0.4948274791240692, + "learning_rate": 0.0001639220681318967, + "loss": 1.2032, + "step": 13895 + }, + { + "epoch": 0.1805721822549757, + "grad_norm": 0.3379811942577362, + "learning_rate": 0.0001639194686699853, + "loss": 1.44, + "step": 13896 + }, + { + "epoch": 0.18058517679889158, + "grad_norm": 0.34871724247932434, + "learning_rate": 0.00016391686920807393, + "loss": 1.2331, + "step": 13897 + }, + { + "epoch": 0.18059817134280745, + "grad_norm": 0.41146326065063477, + "learning_rate": 0.00016391426974616256, + "loss": 1.3666, + "step": 13898 + }, + { + "epoch": 0.18061116588672332, + "grad_norm": 0.3320792019367218, + "learning_rate": 0.00016391167028425115, + "loss": 1.341, + "step": 13899 + }, + { + "epoch": 0.1806241604306392, + "grad_norm": 0.37427496910095215, + "learning_rate": 0.0001639090708223398, + "loss": 1.5761, + "step": 13900 + }, + { + "epoch": 0.18063715497455507, + "grad_norm": 0.33989691734313965, + "learning_rate": 0.0001639064713604284, + "loss": 1.3395, + "step": 13901 + }, + { + "epoch": 0.18065014951847094, + "grad_norm": 0.44891902804374695, + "learning_rate": 0.000163903871898517, + "loss": 1.4473, + "step": 13902 + }, + { + "epoch": 0.18066314406238682, + "grad_norm": 0.36803674697875977, + "learning_rate": 0.00016390127243660563, + "loss": 1.4268, + "step": 13903 + }, + { + "epoch": 0.1806761386063027, + "grad_norm": 0.3450038731098175, + "learning_rate": 0.00016389867297469425, + "loss": 1.5073, + "step": 13904 + }, + { + "epoch": 0.18068913315021856, + "grad_norm": 0.462792307138443, + "learning_rate": 0.00016389607351278287, + "loss": 1.3914, + "step": 13905 + }, + { + "epoch": 0.18070212769413443, + "grad_norm": 0.4687727689743042, + "learning_rate": 0.00016389347405087147, + "loss": 1.5746, + "step": 13906 + }, + { + "epoch": 0.1807151222380503, + "grad_norm": 0.36433643102645874, + "learning_rate": 0.0001638908745889601, + "loss": 1.3829, + "step": 13907 + }, + { + "epoch": 0.18072811678196618, + "grad_norm": 0.41371214389801025, + "learning_rate": 0.00016388827512704872, + "loss": 1.3827, + "step": 13908 + }, + { + "epoch": 0.18074111132588205, + "grad_norm": 0.33636152744293213, + "learning_rate": 0.00016388567566513732, + "loss": 1.37, + "step": 13909 + }, + { + "epoch": 0.18075410586979793, + "grad_norm": 0.3626081645488739, + "learning_rate": 0.00016388307620322594, + "loss": 1.2576, + "step": 13910 + }, + { + "epoch": 0.1807671004137138, + "grad_norm": 0.4206397831439972, + "learning_rate": 0.00016388047674131454, + "loss": 1.3836, + "step": 13911 + }, + { + "epoch": 0.18078009495762967, + "grad_norm": 0.49211814999580383, + "learning_rate": 0.0001638778772794032, + "loss": 1.5328, + "step": 13912 + }, + { + "epoch": 0.18079308950154555, + "grad_norm": 0.3886708915233612, + "learning_rate": 0.0001638752778174918, + "loss": 1.3879, + "step": 13913 + }, + { + "epoch": 0.18080608404546142, + "grad_norm": 0.40076062083244324, + "learning_rate": 0.0001638726783555804, + "loss": 1.4102, + "step": 13914 + }, + { + "epoch": 0.1808190785893773, + "grad_norm": 0.42153453826904297, + "learning_rate": 0.000163870078893669, + "loss": 1.5717, + "step": 13915 + }, + { + "epoch": 0.18083207313329316, + "grad_norm": 0.403816282749176, + "learning_rate": 0.00016386747943175764, + "loss": 1.3718, + "step": 13916 + }, + { + "epoch": 0.18084506767720904, + "grad_norm": 0.36507534980773926, + "learning_rate": 0.00016386487996984626, + "loss": 1.2626, + "step": 13917 + }, + { + "epoch": 0.1808580622211249, + "grad_norm": 0.3647579550743103, + "learning_rate": 0.00016386228050793486, + "loss": 1.2863, + "step": 13918 + }, + { + "epoch": 0.18087105676504078, + "grad_norm": 0.3325686752796173, + "learning_rate": 0.00016385968104602348, + "loss": 1.4135, + "step": 13919 + }, + { + "epoch": 0.18088405130895666, + "grad_norm": 0.47475892305374146, + "learning_rate": 0.0001638570815841121, + "loss": 1.574, + "step": 13920 + }, + { + "epoch": 0.18089704585287253, + "grad_norm": 0.5276363492012024, + "learning_rate": 0.0001638544821222007, + "loss": 1.4119, + "step": 13921 + }, + { + "epoch": 0.1809100403967884, + "grad_norm": 0.4015517830848694, + "learning_rate": 0.00016385188266028933, + "loss": 1.3681, + "step": 13922 + }, + { + "epoch": 0.18092303494070427, + "grad_norm": 0.3477279841899872, + "learning_rate": 0.00016384928319837793, + "loss": 1.3906, + "step": 13923 + }, + { + "epoch": 0.18093602948462015, + "grad_norm": 0.4474170506000519, + "learning_rate": 0.00016384668373646658, + "loss": 1.352, + "step": 13924 + }, + { + "epoch": 0.18094902402853602, + "grad_norm": 0.44052085280418396, + "learning_rate": 0.00016384408427455517, + "loss": 1.3615, + "step": 13925 + }, + { + "epoch": 0.1809620185724519, + "grad_norm": 0.3612934350967407, + "learning_rate": 0.0001638414848126438, + "loss": 1.2988, + "step": 13926 + }, + { + "epoch": 0.18097501311636777, + "grad_norm": 0.4232841432094574, + "learning_rate": 0.0001638388853507324, + "loss": 1.4153, + "step": 13927 + }, + { + "epoch": 0.18098800766028364, + "grad_norm": 0.2938156723976135, + "learning_rate": 0.00016383628588882102, + "loss": 1.1903, + "step": 13928 + }, + { + "epoch": 0.1810010022041995, + "grad_norm": 0.45036935806274414, + "learning_rate": 0.00016383368642690965, + "loss": 1.6003, + "step": 13929 + }, + { + "epoch": 0.18101399674811539, + "grad_norm": 0.41746649146080017, + "learning_rate": 0.00016383108696499824, + "loss": 1.3764, + "step": 13930 + }, + { + "epoch": 0.18102699129203126, + "grad_norm": 0.35695648193359375, + "learning_rate": 0.00016382848750308687, + "loss": 1.4215, + "step": 13931 + }, + { + "epoch": 0.18103998583594713, + "grad_norm": 0.4255262613296509, + "learning_rate": 0.0001638258880411755, + "loss": 1.4189, + "step": 13932 + }, + { + "epoch": 0.181052980379863, + "grad_norm": 0.36589908599853516, + "learning_rate": 0.0001638232885792641, + "loss": 1.3418, + "step": 13933 + }, + { + "epoch": 0.18106597492377888, + "grad_norm": 0.39140254259109497, + "learning_rate": 0.0001638206891173527, + "loss": 1.3956, + "step": 13934 + }, + { + "epoch": 0.18107896946769475, + "grad_norm": 0.39601925015449524, + "learning_rate": 0.00016381808965544134, + "loss": 1.4962, + "step": 13935 + }, + { + "epoch": 0.18109196401161062, + "grad_norm": 0.3855540156364441, + "learning_rate": 0.00016381549019352996, + "loss": 1.5114, + "step": 13936 + }, + { + "epoch": 0.1811049585555265, + "grad_norm": 0.44705599546432495, + "learning_rate": 0.00016381289073161856, + "loss": 1.4459, + "step": 13937 + }, + { + "epoch": 0.18111795309944237, + "grad_norm": 0.3450058102607727, + "learning_rate": 0.00016381029126970718, + "loss": 1.3676, + "step": 13938 + }, + { + "epoch": 0.18113094764335824, + "grad_norm": 0.3786931037902832, + "learning_rate": 0.0001638076918077958, + "loss": 1.4215, + "step": 13939 + }, + { + "epoch": 0.18114394218727412, + "grad_norm": 0.3350026309490204, + "learning_rate": 0.0001638050923458844, + "loss": 1.2822, + "step": 13940 + }, + { + "epoch": 0.18115693673119, + "grad_norm": 0.5217697620391846, + "learning_rate": 0.00016380249288397303, + "loss": 1.3663, + "step": 13941 + }, + { + "epoch": 0.18116993127510586, + "grad_norm": 0.41333919763565063, + "learning_rate": 0.00016379989342206163, + "loss": 1.5799, + "step": 13942 + }, + { + "epoch": 0.18118292581902173, + "grad_norm": 0.38928717374801636, + "learning_rate": 0.00016379729396015028, + "loss": 1.3081, + "step": 13943 + }, + { + "epoch": 0.1811959203629376, + "grad_norm": 0.38982003927230835, + "learning_rate": 0.00016379469449823888, + "loss": 1.3577, + "step": 13944 + }, + { + "epoch": 0.18120891490685348, + "grad_norm": 0.25227731466293335, + "learning_rate": 0.00016379209503632747, + "loss": 1.4509, + "step": 13945 + }, + { + "epoch": 0.18122190945076935, + "grad_norm": 0.39276865124702454, + "learning_rate": 0.0001637894955744161, + "loss": 1.4878, + "step": 13946 + }, + { + "epoch": 0.18123490399468523, + "grad_norm": 0.45176300406455994, + "learning_rate": 0.00016378689611250472, + "loss": 1.2566, + "step": 13947 + }, + { + "epoch": 0.1812478985386011, + "grad_norm": 0.38843151926994324, + "learning_rate": 0.00016378429665059335, + "loss": 1.4452, + "step": 13948 + }, + { + "epoch": 0.18126089308251697, + "grad_norm": 0.31233280897140503, + "learning_rate": 0.00016378169718868195, + "loss": 1.4721, + "step": 13949 + }, + { + "epoch": 0.18127388762643284, + "grad_norm": 0.4567350447177887, + "learning_rate": 0.00016377909772677057, + "loss": 1.4581, + "step": 13950 + }, + { + "epoch": 0.18128688217034872, + "grad_norm": 0.3333583474159241, + "learning_rate": 0.0001637764982648592, + "loss": 1.3596, + "step": 13951 + }, + { + "epoch": 0.1812998767142646, + "grad_norm": 0.3850037753582001, + "learning_rate": 0.0001637738988029478, + "loss": 1.4526, + "step": 13952 + }, + { + "epoch": 0.18131287125818046, + "grad_norm": 0.39848461747169495, + "learning_rate": 0.00016377129934103642, + "loss": 1.4833, + "step": 13953 + }, + { + "epoch": 0.18132586580209634, + "grad_norm": 0.4641290605068207, + "learning_rate": 0.000163768699879125, + "loss": 1.3582, + "step": 13954 + }, + { + "epoch": 0.1813388603460122, + "grad_norm": 0.43027442693710327, + "learning_rate": 0.00016376610041721366, + "loss": 1.5244, + "step": 13955 + }, + { + "epoch": 0.18135185488992808, + "grad_norm": 0.38939008116722107, + "learning_rate": 0.00016376350095530226, + "loss": 1.5802, + "step": 13956 + }, + { + "epoch": 0.18136484943384396, + "grad_norm": 0.3373205363750458, + "learning_rate": 0.00016376090149339086, + "loss": 1.2878, + "step": 13957 + }, + { + "epoch": 0.18137784397775983, + "grad_norm": 0.3444799780845642, + "learning_rate": 0.00016375830203147948, + "loss": 1.4647, + "step": 13958 + }, + { + "epoch": 0.1813908385216757, + "grad_norm": 0.34580856561660767, + "learning_rate": 0.0001637557025695681, + "loss": 1.4113, + "step": 13959 + }, + { + "epoch": 0.18140383306559157, + "grad_norm": 0.3667464852333069, + "learning_rate": 0.00016375310310765673, + "loss": 1.2927, + "step": 13960 + }, + { + "epoch": 0.18141682760950745, + "grad_norm": 0.5043931603431702, + "learning_rate": 0.00016375050364574533, + "loss": 1.1624, + "step": 13961 + }, + { + "epoch": 0.18142982215342332, + "grad_norm": 0.42321163415908813, + "learning_rate": 0.00016374790418383395, + "loss": 1.2044, + "step": 13962 + }, + { + "epoch": 0.1814428166973392, + "grad_norm": 0.3954523503780365, + "learning_rate": 0.00016374530472192258, + "loss": 1.5502, + "step": 13963 + }, + { + "epoch": 0.18145581124125507, + "grad_norm": 0.4727545380592346, + "learning_rate": 0.00016374270526001118, + "loss": 1.4951, + "step": 13964 + }, + { + "epoch": 0.18146880578517094, + "grad_norm": 0.45332348346710205, + "learning_rate": 0.0001637401057980998, + "loss": 1.3232, + "step": 13965 + }, + { + "epoch": 0.1814818003290868, + "grad_norm": 0.4639752209186554, + "learning_rate": 0.0001637375063361884, + "loss": 1.4182, + "step": 13966 + }, + { + "epoch": 0.18149479487300268, + "grad_norm": 0.2845968008041382, + "learning_rate": 0.00016373490687427705, + "loss": 1.4525, + "step": 13967 + }, + { + "epoch": 0.18150778941691856, + "grad_norm": 0.42732781171798706, + "learning_rate": 0.00016373230741236565, + "loss": 1.3563, + "step": 13968 + }, + { + "epoch": 0.18152078396083443, + "grad_norm": 0.33735087513923645, + "learning_rate": 0.00016372970795045427, + "loss": 1.2831, + "step": 13969 + }, + { + "epoch": 0.18153377850475033, + "grad_norm": 0.4555298984050751, + "learning_rate": 0.00016372710848854287, + "loss": 1.4099, + "step": 13970 + }, + { + "epoch": 0.1815467730486662, + "grad_norm": 0.4065535366535187, + "learning_rate": 0.0001637245090266315, + "loss": 1.4597, + "step": 13971 + }, + { + "epoch": 0.18155976759258208, + "grad_norm": 0.367956280708313, + "learning_rate": 0.00016372190956472012, + "loss": 1.3735, + "step": 13972 + }, + { + "epoch": 0.18157276213649795, + "grad_norm": 0.40117332339286804, + "learning_rate": 0.00016371931010280872, + "loss": 1.401, + "step": 13973 + }, + { + "epoch": 0.18158575668041382, + "grad_norm": 0.31792551279067993, + "learning_rate": 0.00016371671064089734, + "loss": 1.323, + "step": 13974 + }, + { + "epoch": 0.1815987512243297, + "grad_norm": 0.42645546793937683, + "learning_rate": 0.00016371411117898596, + "loss": 1.3871, + "step": 13975 + }, + { + "epoch": 0.18161174576824557, + "grad_norm": 0.40832582116127014, + "learning_rate": 0.00016371151171707456, + "loss": 1.6539, + "step": 13976 + }, + { + "epoch": 0.18162474031216144, + "grad_norm": 0.4843306839466095, + "learning_rate": 0.0001637089122551632, + "loss": 1.5694, + "step": 13977 + }, + { + "epoch": 0.18163773485607732, + "grad_norm": 0.34088650345802307, + "learning_rate": 0.0001637063127932518, + "loss": 1.4222, + "step": 13978 + }, + { + "epoch": 0.1816507293999932, + "grad_norm": 0.42761799693107605, + "learning_rate": 0.00016370371333134044, + "loss": 1.3876, + "step": 13979 + }, + { + "epoch": 0.18166372394390906, + "grad_norm": 0.3573712408542633, + "learning_rate": 0.00016370111386942903, + "loss": 1.5206, + "step": 13980 + }, + { + "epoch": 0.18167671848782493, + "grad_norm": 0.3561965227127075, + "learning_rate": 0.00016369851440751766, + "loss": 1.5019, + "step": 13981 + }, + { + "epoch": 0.1816897130317408, + "grad_norm": 0.33115676045417786, + "learning_rate": 0.00016369591494560628, + "loss": 1.246, + "step": 13982 + }, + { + "epoch": 0.18170270757565668, + "grad_norm": 0.3669005334377289, + "learning_rate": 0.00016369331548369488, + "loss": 1.3176, + "step": 13983 + }, + { + "epoch": 0.18171570211957255, + "grad_norm": 0.3958088755607605, + "learning_rate": 0.0001636907160217835, + "loss": 1.6497, + "step": 13984 + }, + { + "epoch": 0.18172869666348843, + "grad_norm": 0.38397201895713806, + "learning_rate": 0.0001636881165598721, + "loss": 1.3685, + "step": 13985 + }, + { + "epoch": 0.1817416912074043, + "grad_norm": 0.3442166745662689, + "learning_rate": 0.00016368551709796073, + "loss": 1.5329, + "step": 13986 + }, + { + "epoch": 0.18175468575132017, + "grad_norm": 0.36517348885536194, + "learning_rate": 0.00016368291763604935, + "loss": 1.3943, + "step": 13987 + }, + { + "epoch": 0.18176768029523604, + "grad_norm": 0.4144994616508484, + "learning_rate": 0.00016368031817413795, + "loss": 1.6596, + "step": 13988 + }, + { + "epoch": 0.18178067483915192, + "grad_norm": 0.4141557216644287, + "learning_rate": 0.00016367771871222657, + "loss": 1.4506, + "step": 13989 + }, + { + "epoch": 0.1817936693830678, + "grad_norm": 0.34273284673690796, + "learning_rate": 0.0001636751192503152, + "loss": 1.5336, + "step": 13990 + }, + { + "epoch": 0.18180666392698366, + "grad_norm": 0.3870570957660675, + "learning_rate": 0.00016367251978840382, + "loss": 1.4961, + "step": 13991 + }, + { + "epoch": 0.18181965847089954, + "grad_norm": 0.36208274960517883, + "learning_rate": 0.00016366992032649242, + "loss": 1.2687, + "step": 13992 + }, + { + "epoch": 0.1818326530148154, + "grad_norm": 0.4293138384819031, + "learning_rate": 0.00016366732086458104, + "loss": 1.3902, + "step": 13993 + }, + { + "epoch": 0.18184564755873128, + "grad_norm": 0.38129618763923645, + "learning_rate": 0.00016366472140266967, + "loss": 1.4509, + "step": 13994 + }, + { + "epoch": 0.18185864210264716, + "grad_norm": 0.39229297637939453, + "learning_rate": 0.00016366212194075826, + "loss": 1.5316, + "step": 13995 + }, + { + "epoch": 0.18187163664656303, + "grad_norm": 0.4997608959674835, + "learning_rate": 0.0001636595224788469, + "loss": 1.6033, + "step": 13996 + }, + { + "epoch": 0.1818846311904789, + "grad_norm": 0.36065179109573364, + "learning_rate": 0.0001636569230169355, + "loss": 1.2498, + "step": 13997 + }, + { + "epoch": 0.18189762573439477, + "grad_norm": 0.38008543848991394, + "learning_rate": 0.00016365432355502414, + "loss": 1.3301, + "step": 13998 + }, + { + "epoch": 0.18191062027831065, + "grad_norm": 0.44220849871635437, + "learning_rate": 0.00016365172409311274, + "loss": 1.5326, + "step": 13999 + }, + { + "epoch": 0.18192361482222652, + "grad_norm": 0.431658536195755, + "learning_rate": 0.00016364912463120133, + "loss": 1.5058, + "step": 14000 + }, + { + "epoch": 0.1819366093661424, + "grad_norm": 0.43918317556381226, + "learning_rate": 0.00016364652516928996, + "loss": 1.4801, + "step": 14001 + }, + { + "epoch": 0.18194960391005827, + "grad_norm": 0.47162097692489624, + "learning_rate": 0.00016364392570737858, + "loss": 1.3416, + "step": 14002 + }, + { + "epoch": 0.18196259845397414, + "grad_norm": 0.38275420665740967, + "learning_rate": 0.0001636413262454672, + "loss": 1.4922, + "step": 14003 + }, + { + "epoch": 0.18197559299789, + "grad_norm": 0.385733962059021, + "learning_rate": 0.0001636387267835558, + "loss": 1.529, + "step": 14004 + }, + { + "epoch": 0.18198858754180589, + "grad_norm": 0.2653253376483917, + "learning_rate": 0.00016363612732164443, + "loss": 1.3918, + "step": 14005 + }, + { + "epoch": 0.18200158208572176, + "grad_norm": 0.3419744372367859, + "learning_rate": 0.00016363352785973305, + "loss": 1.4181, + "step": 14006 + }, + { + "epoch": 0.18201457662963763, + "grad_norm": 0.41318562626838684, + "learning_rate": 0.00016363092839782165, + "loss": 1.4796, + "step": 14007 + }, + { + "epoch": 0.1820275711735535, + "grad_norm": 0.389669805765152, + "learning_rate": 0.00016362832893591027, + "loss": 1.396, + "step": 14008 + }, + { + "epoch": 0.18204056571746938, + "grad_norm": 0.3908182382583618, + "learning_rate": 0.0001636257294739989, + "loss": 1.4028, + "step": 14009 + }, + { + "epoch": 0.18205356026138525, + "grad_norm": 0.401261568069458, + "learning_rate": 0.00016362313001208752, + "loss": 1.632, + "step": 14010 + }, + { + "epoch": 0.18206655480530112, + "grad_norm": 0.4321623146533966, + "learning_rate": 0.00016362053055017612, + "loss": 1.5315, + "step": 14011 + }, + { + "epoch": 0.182079549349217, + "grad_norm": 0.40462014079093933, + "learning_rate": 0.00016361793108826472, + "loss": 1.4829, + "step": 14012 + }, + { + "epoch": 0.18209254389313287, + "grad_norm": 0.37876957654953003, + "learning_rate": 0.00016361533162635337, + "loss": 1.288, + "step": 14013 + }, + { + "epoch": 0.18210553843704874, + "grad_norm": 0.38370659947395325, + "learning_rate": 0.00016361273216444197, + "loss": 1.3761, + "step": 14014 + }, + { + "epoch": 0.18211853298096461, + "grad_norm": 0.3439337909221649, + "learning_rate": 0.0001636101327025306, + "loss": 1.4422, + "step": 14015 + }, + { + "epoch": 0.1821315275248805, + "grad_norm": 0.42591163516044617, + "learning_rate": 0.0001636075332406192, + "loss": 1.4578, + "step": 14016 + }, + { + "epoch": 0.18214452206879636, + "grad_norm": 0.41921764612197876, + "learning_rate": 0.00016360493377870781, + "loss": 1.3393, + "step": 14017 + }, + { + "epoch": 0.18215751661271223, + "grad_norm": 0.3973502218723297, + "learning_rate": 0.00016360233431679644, + "loss": 1.2588, + "step": 14018 + }, + { + "epoch": 0.1821705111566281, + "grad_norm": 0.4169273376464844, + "learning_rate": 0.00016359973485488504, + "loss": 1.4287, + "step": 14019 + }, + { + "epoch": 0.18218350570054398, + "grad_norm": 0.3799017667770386, + "learning_rate": 0.00016359713539297366, + "loss": 1.5446, + "step": 14020 + }, + { + "epoch": 0.18219650024445985, + "grad_norm": 0.43941307067871094, + "learning_rate": 0.00016359453593106228, + "loss": 1.4363, + "step": 14021 + }, + { + "epoch": 0.18220949478837573, + "grad_norm": 0.3740796148777008, + "learning_rate": 0.0001635919364691509, + "loss": 1.3879, + "step": 14022 + }, + { + "epoch": 0.1822224893322916, + "grad_norm": 0.44395968317985535, + "learning_rate": 0.0001635893370072395, + "loss": 1.526, + "step": 14023 + }, + { + "epoch": 0.18223548387620747, + "grad_norm": 0.3495246171951294, + "learning_rate": 0.0001635867375453281, + "loss": 1.3491, + "step": 14024 + }, + { + "epoch": 0.18224847842012334, + "grad_norm": 0.3668956756591797, + "learning_rate": 0.00016358413808341676, + "loss": 1.4034, + "step": 14025 + }, + { + "epoch": 0.18226147296403922, + "grad_norm": 0.44895651936531067, + "learning_rate": 0.00016358153862150535, + "loss": 1.4097, + "step": 14026 + }, + { + "epoch": 0.1822744675079551, + "grad_norm": 0.36002567410469055, + "learning_rate": 0.00016357893915959398, + "loss": 1.157, + "step": 14027 + }, + { + "epoch": 0.18228746205187096, + "grad_norm": 0.31939542293548584, + "learning_rate": 0.00016357633969768257, + "loss": 1.4068, + "step": 14028 + }, + { + "epoch": 0.18230045659578684, + "grad_norm": 0.45844006538391113, + "learning_rate": 0.0001635737402357712, + "loss": 1.4876, + "step": 14029 + }, + { + "epoch": 0.1823134511397027, + "grad_norm": 0.3604048490524292, + "learning_rate": 0.00016357114077385982, + "loss": 1.5303, + "step": 14030 + }, + { + "epoch": 0.18232644568361858, + "grad_norm": 0.46258118748664856, + "learning_rate": 0.00016356854131194842, + "loss": 1.4353, + "step": 14031 + }, + { + "epoch": 0.18233944022753446, + "grad_norm": 0.3510545492172241, + "learning_rate": 0.00016356594185003705, + "loss": 1.444, + "step": 14032 + }, + { + "epoch": 0.18235243477145033, + "grad_norm": 0.3985535204410553, + "learning_rate": 0.00016356334238812567, + "loss": 1.4398, + "step": 14033 + }, + { + "epoch": 0.1823654293153662, + "grad_norm": 0.45214900374412537, + "learning_rate": 0.0001635607429262143, + "loss": 1.5935, + "step": 14034 + }, + { + "epoch": 0.18237842385928207, + "grad_norm": 0.43656328320503235, + "learning_rate": 0.0001635581434643029, + "loss": 1.6284, + "step": 14035 + }, + { + "epoch": 0.18239141840319795, + "grad_norm": 0.3589610755443573, + "learning_rate": 0.00016355554400239152, + "loss": 1.3415, + "step": 14036 + }, + { + "epoch": 0.18240441294711382, + "grad_norm": 0.4076949656009674, + "learning_rate": 0.00016355294454048014, + "loss": 1.3789, + "step": 14037 + }, + { + "epoch": 0.1824174074910297, + "grad_norm": 0.40643566846847534, + "learning_rate": 0.00016355034507856874, + "loss": 1.4208, + "step": 14038 + }, + { + "epoch": 0.18243040203494557, + "grad_norm": 0.376617431640625, + "learning_rate": 0.00016354774561665736, + "loss": 1.6287, + "step": 14039 + }, + { + "epoch": 0.18244339657886144, + "grad_norm": 0.4039861857891083, + "learning_rate": 0.00016354514615474596, + "loss": 1.4394, + "step": 14040 + }, + { + "epoch": 0.1824563911227773, + "grad_norm": 0.2979445159435272, + "learning_rate": 0.00016354254669283458, + "loss": 1.0381, + "step": 14041 + }, + { + "epoch": 0.18246938566669318, + "grad_norm": 0.33211493492126465, + "learning_rate": 0.0001635399472309232, + "loss": 1.484, + "step": 14042 + }, + { + "epoch": 0.18248238021060906, + "grad_norm": 0.3744337558746338, + "learning_rate": 0.0001635373477690118, + "loss": 1.5327, + "step": 14043 + }, + { + "epoch": 0.18249537475452493, + "grad_norm": 0.3544542193412781, + "learning_rate": 0.00016353474830710043, + "loss": 1.5058, + "step": 14044 + }, + { + "epoch": 0.1825083692984408, + "grad_norm": 0.29757222533226013, + "learning_rate": 0.00016353214884518906, + "loss": 1.3475, + "step": 14045 + }, + { + "epoch": 0.1825213638423567, + "grad_norm": 0.41322824358940125, + "learning_rate": 0.00016352954938327768, + "loss": 1.5791, + "step": 14046 + }, + { + "epoch": 0.18253435838627258, + "grad_norm": 0.35056132078170776, + "learning_rate": 0.00016352694992136628, + "loss": 1.4366, + "step": 14047 + }, + { + "epoch": 0.18254735293018845, + "grad_norm": 0.42155611515045166, + "learning_rate": 0.0001635243504594549, + "loss": 1.5535, + "step": 14048 + }, + { + "epoch": 0.18256034747410432, + "grad_norm": 0.34348419308662415, + "learning_rate": 0.00016352175099754353, + "loss": 1.4098, + "step": 14049 + }, + { + "epoch": 0.1825733420180202, + "grad_norm": 0.32147687673568726, + "learning_rate": 0.00016351915153563212, + "loss": 1.4183, + "step": 14050 + }, + { + "epoch": 0.18258633656193607, + "grad_norm": 0.504754900932312, + "learning_rate": 0.00016351655207372075, + "loss": 1.5397, + "step": 14051 + }, + { + "epoch": 0.18259933110585194, + "grad_norm": 0.3047609031200409, + "learning_rate": 0.00016351395261180937, + "loss": 1.5062, + "step": 14052 + }, + { + "epoch": 0.18261232564976781, + "grad_norm": 0.29748672246932983, + "learning_rate": 0.000163511353149898, + "loss": 1.3646, + "step": 14053 + }, + { + "epoch": 0.1826253201936837, + "grad_norm": 0.4541914761066437, + "learning_rate": 0.0001635087536879866, + "loss": 1.5072, + "step": 14054 + }, + { + "epoch": 0.18263831473759956, + "grad_norm": 0.46341708302497864, + "learning_rate": 0.0001635061542260752, + "loss": 1.4969, + "step": 14055 + }, + { + "epoch": 0.18265130928151543, + "grad_norm": 0.4568157494068146, + "learning_rate": 0.00016350355476416384, + "loss": 1.4673, + "step": 14056 + }, + { + "epoch": 0.1826643038254313, + "grad_norm": 0.3995761573314667, + "learning_rate": 0.00016350095530225244, + "loss": 1.3008, + "step": 14057 + }, + { + "epoch": 0.18267729836934718, + "grad_norm": 0.39155545830726624, + "learning_rate": 0.00016349835584034107, + "loss": 1.5998, + "step": 14058 + }, + { + "epoch": 0.18269029291326305, + "grad_norm": 0.34831470251083374, + "learning_rate": 0.00016349575637842966, + "loss": 1.3119, + "step": 14059 + }, + { + "epoch": 0.18270328745717893, + "grad_norm": 0.37073448300361633, + "learning_rate": 0.0001634931569165183, + "loss": 1.3918, + "step": 14060 + }, + { + "epoch": 0.1827162820010948, + "grad_norm": 0.32926833629608154, + "learning_rate": 0.0001634905574546069, + "loss": 1.568, + "step": 14061 + }, + { + "epoch": 0.18272927654501067, + "grad_norm": 0.3862396776676178, + "learning_rate": 0.0001634879579926955, + "loss": 1.4659, + "step": 14062 + }, + { + "epoch": 0.18274227108892654, + "grad_norm": 0.44544902443885803, + "learning_rate": 0.00016348535853078413, + "loss": 1.4969, + "step": 14063 + }, + { + "epoch": 0.18275526563284242, + "grad_norm": 0.28099730610847473, + "learning_rate": 0.00016348275906887276, + "loss": 1.3014, + "step": 14064 + }, + { + "epoch": 0.1827682601767583, + "grad_norm": 0.41786128282546997, + "learning_rate": 0.00016348015960696138, + "loss": 1.5926, + "step": 14065 + }, + { + "epoch": 0.18278125472067416, + "grad_norm": 0.3417384624481201, + "learning_rate": 0.00016347756014504998, + "loss": 1.2703, + "step": 14066 + }, + { + "epoch": 0.18279424926459004, + "grad_norm": 0.45354530215263367, + "learning_rate": 0.00016347496068313858, + "loss": 1.1899, + "step": 14067 + }, + { + "epoch": 0.1828072438085059, + "grad_norm": 0.3581564724445343, + "learning_rate": 0.00016347236122122723, + "loss": 1.4701, + "step": 14068 + }, + { + "epoch": 0.18282023835242178, + "grad_norm": 0.4057212471961975, + "learning_rate": 0.00016346976175931583, + "loss": 1.6823, + "step": 14069 + }, + { + "epoch": 0.18283323289633766, + "grad_norm": 0.40115946531295776, + "learning_rate": 0.00016346716229740445, + "loss": 1.4272, + "step": 14070 + }, + { + "epoch": 0.18284622744025353, + "grad_norm": 0.38686510920524597, + "learning_rate": 0.00016346456283549305, + "loss": 1.4186, + "step": 14071 + }, + { + "epoch": 0.1828592219841694, + "grad_norm": 0.40734225511550903, + "learning_rate": 0.00016346196337358167, + "loss": 1.2761, + "step": 14072 + }, + { + "epoch": 0.18287221652808527, + "grad_norm": 0.5120412111282349, + "learning_rate": 0.0001634593639116703, + "loss": 1.4514, + "step": 14073 + }, + { + "epoch": 0.18288521107200115, + "grad_norm": 0.3747871220111847, + "learning_rate": 0.0001634567644497589, + "loss": 1.432, + "step": 14074 + }, + { + "epoch": 0.18289820561591702, + "grad_norm": 0.3794386386871338, + "learning_rate": 0.00016345416498784752, + "loss": 1.402, + "step": 14075 + }, + { + "epoch": 0.1829112001598329, + "grad_norm": 0.42419809103012085, + "learning_rate": 0.00016345156552593614, + "loss": 1.5567, + "step": 14076 + }, + { + "epoch": 0.18292419470374877, + "grad_norm": 0.38697177171707153, + "learning_rate": 0.00016344896606402477, + "loss": 1.4489, + "step": 14077 + }, + { + "epoch": 0.18293718924766464, + "grad_norm": 0.3752872049808502, + "learning_rate": 0.00016344636660211337, + "loss": 1.3389, + "step": 14078 + }, + { + "epoch": 0.1829501837915805, + "grad_norm": 0.44811201095581055, + "learning_rate": 0.00016344376714020196, + "loss": 1.4964, + "step": 14079 + }, + { + "epoch": 0.18296317833549638, + "grad_norm": 0.36299264430999756, + "learning_rate": 0.00016344116767829061, + "loss": 1.2521, + "step": 14080 + }, + { + "epoch": 0.18297617287941226, + "grad_norm": 0.4067133069038391, + "learning_rate": 0.0001634385682163792, + "loss": 1.3476, + "step": 14081 + }, + { + "epoch": 0.18298916742332813, + "grad_norm": 0.3378688395023346, + "learning_rate": 0.00016343596875446784, + "loss": 1.5133, + "step": 14082 + }, + { + "epoch": 0.183002161967244, + "grad_norm": 0.3995343744754791, + "learning_rate": 0.00016343336929255646, + "loss": 1.5854, + "step": 14083 + }, + { + "epoch": 0.18301515651115988, + "grad_norm": 0.4022303521633148, + "learning_rate": 0.00016343076983064506, + "loss": 1.4406, + "step": 14084 + }, + { + "epoch": 0.18302815105507575, + "grad_norm": 0.4465762972831726, + "learning_rate": 0.00016342817036873368, + "loss": 1.3251, + "step": 14085 + }, + { + "epoch": 0.18304114559899162, + "grad_norm": 0.4093140959739685, + "learning_rate": 0.00016342557090682228, + "loss": 1.4383, + "step": 14086 + }, + { + "epoch": 0.1830541401429075, + "grad_norm": 0.29103437066078186, + "learning_rate": 0.00016342297144491093, + "loss": 1.4526, + "step": 14087 + }, + { + "epoch": 0.18306713468682337, + "grad_norm": 0.3808046281337738, + "learning_rate": 0.00016342037198299953, + "loss": 1.5579, + "step": 14088 + }, + { + "epoch": 0.18308012923073924, + "grad_norm": 0.2769884467124939, + "learning_rate": 0.00016341777252108815, + "loss": 1.3528, + "step": 14089 + }, + { + "epoch": 0.18309312377465511, + "grad_norm": 0.42562422156333923, + "learning_rate": 0.00016341517305917675, + "loss": 1.51, + "step": 14090 + }, + { + "epoch": 0.183106118318571, + "grad_norm": 0.3662552535533905, + "learning_rate": 0.00016341257359726537, + "loss": 1.2242, + "step": 14091 + }, + { + "epoch": 0.18311911286248686, + "grad_norm": 0.37842535972595215, + "learning_rate": 0.000163409974135354, + "loss": 1.3649, + "step": 14092 + }, + { + "epoch": 0.18313210740640273, + "grad_norm": 0.29212892055511475, + "learning_rate": 0.0001634073746734426, + "loss": 1.2562, + "step": 14093 + }, + { + "epoch": 0.1831451019503186, + "grad_norm": 0.3692339360713959, + "learning_rate": 0.00016340477521153122, + "loss": 1.5351, + "step": 14094 + }, + { + "epoch": 0.18315809649423448, + "grad_norm": 0.36979204416275024, + "learning_rate": 0.00016340217574961985, + "loss": 1.2865, + "step": 14095 + }, + { + "epoch": 0.18317109103815035, + "grad_norm": 0.4215281307697296, + "learning_rate": 0.00016339957628770844, + "loss": 1.3955, + "step": 14096 + }, + { + "epoch": 0.18318408558206623, + "grad_norm": 0.3555452823638916, + "learning_rate": 0.00016339697682579707, + "loss": 1.5199, + "step": 14097 + }, + { + "epoch": 0.1831970801259821, + "grad_norm": 0.5135360956192017, + "learning_rate": 0.00016339437736388567, + "loss": 1.3644, + "step": 14098 + }, + { + "epoch": 0.18321007466989797, + "grad_norm": 0.385479211807251, + "learning_rate": 0.00016339177790197432, + "loss": 1.4385, + "step": 14099 + }, + { + "epoch": 0.18322306921381384, + "grad_norm": 0.3867081105709076, + "learning_rate": 0.00016338917844006291, + "loss": 1.5387, + "step": 14100 + }, + { + "epoch": 0.18323606375772972, + "grad_norm": 0.43120014667510986, + "learning_rate": 0.00016338657897815154, + "loss": 1.3749, + "step": 14101 + }, + { + "epoch": 0.1832490583016456, + "grad_norm": 0.3811238408088684, + "learning_rate": 0.00016338397951624014, + "loss": 1.3773, + "step": 14102 + }, + { + "epoch": 0.18326205284556146, + "grad_norm": 0.3594764173030853, + "learning_rate": 0.00016338138005432876, + "loss": 1.5016, + "step": 14103 + }, + { + "epoch": 0.18327504738947734, + "grad_norm": 0.29420915246009827, + "learning_rate": 0.00016337878059241738, + "loss": 1.2895, + "step": 14104 + }, + { + "epoch": 0.1832880419333932, + "grad_norm": 0.4095514714717865, + "learning_rate": 0.00016337618113050598, + "loss": 1.5777, + "step": 14105 + }, + { + "epoch": 0.18330103647730908, + "grad_norm": 0.2775671184062958, + "learning_rate": 0.0001633735816685946, + "loss": 1.3844, + "step": 14106 + }, + { + "epoch": 0.18331403102122495, + "grad_norm": 0.4624216556549072, + "learning_rate": 0.00016337098220668323, + "loss": 1.4317, + "step": 14107 + }, + { + "epoch": 0.18332702556514083, + "grad_norm": 0.46793991327285767, + "learning_rate": 0.00016336838274477183, + "loss": 1.3629, + "step": 14108 + }, + { + "epoch": 0.1833400201090567, + "grad_norm": 0.37692970037460327, + "learning_rate": 0.00016336578328286045, + "loss": 1.4633, + "step": 14109 + }, + { + "epoch": 0.18335301465297257, + "grad_norm": 0.4139084219932556, + "learning_rate": 0.00016336318382094905, + "loss": 1.4245, + "step": 14110 + }, + { + "epoch": 0.18336600919688845, + "grad_norm": 0.39490747451782227, + "learning_rate": 0.0001633605843590377, + "loss": 1.4193, + "step": 14111 + }, + { + "epoch": 0.18337900374080432, + "grad_norm": 0.48330605030059814, + "learning_rate": 0.0001633579848971263, + "loss": 1.3912, + "step": 14112 + }, + { + "epoch": 0.1833919982847202, + "grad_norm": 0.345071017742157, + "learning_rate": 0.00016335538543521492, + "loss": 1.5614, + "step": 14113 + }, + { + "epoch": 0.18340499282863607, + "grad_norm": 0.37234464287757874, + "learning_rate": 0.00016335278597330352, + "loss": 1.4033, + "step": 14114 + }, + { + "epoch": 0.18341798737255194, + "grad_norm": 0.38158395886421204, + "learning_rate": 0.00016335018651139215, + "loss": 1.4478, + "step": 14115 + }, + { + "epoch": 0.1834309819164678, + "grad_norm": 0.423510879278183, + "learning_rate": 0.00016334758704948077, + "loss": 1.2816, + "step": 14116 + }, + { + "epoch": 0.18344397646038368, + "grad_norm": 0.2975612282752991, + "learning_rate": 0.00016334498758756937, + "loss": 1.3716, + "step": 14117 + }, + { + "epoch": 0.18345697100429956, + "grad_norm": 0.38476189970970154, + "learning_rate": 0.000163342388125658, + "loss": 1.4032, + "step": 14118 + }, + { + "epoch": 0.18346996554821543, + "grad_norm": 0.3861877918243408, + "learning_rate": 0.00016333978866374662, + "loss": 1.4978, + "step": 14119 + }, + { + "epoch": 0.1834829600921313, + "grad_norm": 0.31106966733932495, + "learning_rate": 0.00016333718920183524, + "loss": 1.3745, + "step": 14120 + }, + { + "epoch": 0.18349595463604718, + "grad_norm": 0.3159928619861603, + "learning_rate": 0.00016333458973992384, + "loss": 1.5574, + "step": 14121 + }, + { + "epoch": 0.18350894917996308, + "grad_norm": 0.4266694486141205, + "learning_rate": 0.00016333199027801246, + "loss": 1.6152, + "step": 14122 + }, + { + "epoch": 0.18352194372387895, + "grad_norm": 0.4439978301525116, + "learning_rate": 0.0001633293908161011, + "loss": 1.5469, + "step": 14123 + }, + { + "epoch": 0.18353493826779482, + "grad_norm": 0.5029610395431519, + "learning_rate": 0.00016332679135418968, + "loss": 1.5436, + "step": 14124 + }, + { + "epoch": 0.1835479328117107, + "grad_norm": 0.45036670565605164, + "learning_rate": 0.0001633241918922783, + "loss": 1.328, + "step": 14125 + }, + { + "epoch": 0.18356092735562657, + "grad_norm": 0.29177844524383545, + "learning_rate": 0.00016332159243036693, + "loss": 1.5317, + "step": 14126 + }, + { + "epoch": 0.18357392189954244, + "grad_norm": 0.4240396320819855, + "learning_rate": 0.00016331899296845553, + "loss": 1.5785, + "step": 14127 + }, + { + "epoch": 0.18358691644345831, + "grad_norm": 0.42567187547683716, + "learning_rate": 0.00016331639350654416, + "loss": 1.6303, + "step": 14128 + }, + { + "epoch": 0.1835999109873742, + "grad_norm": 0.3158164620399475, + "learning_rate": 0.00016331379404463275, + "loss": 1.2832, + "step": 14129 + }, + { + "epoch": 0.18361290553129006, + "grad_norm": 0.3543689250946045, + "learning_rate": 0.0001633111945827214, + "loss": 1.3565, + "step": 14130 + }, + { + "epoch": 0.18362590007520593, + "grad_norm": 0.442054808139801, + "learning_rate": 0.00016330859512081, + "loss": 1.3854, + "step": 14131 + }, + { + "epoch": 0.1836388946191218, + "grad_norm": 0.40369337797164917, + "learning_rate": 0.00016330599565889863, + "loss": 1.4277, + "step": 14132 + }, + { + "epoch": 0.18365188916303768, + "grad_norm": 0.4353345036506653, + "learning_rate": 0.00016330339619698722, + "loss": 1.603, + "step": 14133 + }, + { + "epoch": 0.18366488370695355, + "grad_norm": 0.3719423711299896, + "learning_rate": 0.00016330079673507585, + "loss": 1.419, + "step": 14134 + }, + { + "epoch": 0.18367787825086943, + "grad_norm": 0.3628728985786438, + "learning_rate": 0.00016329819727316447, + "loss": 1.3113, + "step": 14135 + }, + { + "epoch": 0.1836908727947853, + "grad_norm": 0.47335267066955566, + "learning_rate": 0.00016329559781125307, + "loss": 1.5012, + "step": 14136 + }, + { + "epoch": 0.18370386733870117, + "grad_norm": 0.4623568654060364, + "learning_rate": 0.0001632929983493417, + "loss": 1.5038, + "step": 14137 + }, + { + "epoch": 0.18371686188261704, + "grad_norm": 0.42240414023399353, + "learning_rate": 0.00016329039888743032, + "loss": 1.4782, + "step": 14138 + }, + { + "epoch": 0.18372985642653292, + "grad_norm": 0.41479647159576416, + "learning_rate": 0.00016328779942551892, + "loss": 1.3718, + "step": 14139 + }, + { + "epoch": 0.1837428509704488, + "grad_norm": 0.42063257098197937, + "learning_rate": 0.00016328519996360754, + "loss": 1.3496, + "step": 14140 + }, + { + "epoch": 0.18375584551436466, + "grad_norm": 0.3319729268550873, + "learning_rate": 0.00016328260050169614, + "loss": 1.5304, + "step": 14141 + }, + { + "epoch": 0.18376884005828054, + "grad_norm": 0.39472097158432007, + "learning_rate": 0.0001632800010397848, + "loss": 1.4674, + "step": 14142 + }, + { + "epoch": 0.1837818346021964, + "grad_norm": 0.35716456174850464, + "learning_rate": 0.0001632774015778734, + "loss": 1.2327, + "step": 14143 + }, + { + "epoch": 0.18379482914611228, + "grad_norm": 0.36894872784614563, + "learning_rate": 0.000163274802115962, + "loss": 1.5294, + "step": 14144 + }, + { + "epoch": 0.18380782369002815, + "grad_norm": 0.3244621455669403, + "learning_rate": 0.0001632722026540506, + "loss": 1.3939, + "step": 14145 + }, + { + "epoch": 0.18382081823394403, + "grad_norm": 0.3625744879245758, + "learning_rate": 0.00016326960319213923, + "loss": 1.4218, + "step": 14146 + }, + { + "epoch": 0.1838338127778599, + "grad_norm": 0.4209868907928467, + "learning_rate": 0.00016326700373022786, + "loss": 1.5122, + "step": 14147 + }, + { + "epoch": 0.18384680732177577, + "grad_norm": 0.45798903703689575, + "learning_rate": 0.00016326440426831646, + "loss": 1.528, + "step": 14148 + }, + { + "epoch": 0.18385980186569165, + "grad_norm": 0.37147215008735657, + "learning_rate": 0.00016326180480640508, + "loss": 1.3215, + "step": 14149 + }, + { + "epoch": 0.18387279640960752, + "grad_norm": 0.4124443531036377, + "learning_rate": 0.0001632592053444937, + "loss": 1.4528, + "step": 14150 + }, + { + "epoch": 0.1838857909535234, + "grad_norm": 0.4228821396827698, + "learning_rate": 0.0001632566058825823, + "loss": 1.5896, + "step": 14151 + }, + { + "epoch": 0.18389878549743927, + "grad_norm": 0.4723481237888336, + "learning_rate": 0.00016325400642067093, + "loss": 1.4238, + "step": 14152 + }, + { + "epoch": 0.18391178004135514, + "grad_norm": 0.4504651427268982, + "learning_rate": 0.00016325140695875952, + "loss": 1.5265, + "step": 14153 + }, + { + "epoch": 0.183924774585271, + "grad_norm": 0.34285762906074524, + "learning_rate": 0.00016324880749684818, + "loss": 1.297, + "step": 14154 + }, + { + "epoch": 0.18393776912918688, + "grad_norm": 0.422423392534256, + "learning_rate": 0.00016324620803493677, + "loss": 1.3698, + "step": 14155 + }, + { + "epoch": 0.18395076367310276, + "grad_norm": 0.41746264696121216, + "learning_rate": 0.0001632436085730254, + "loss": 1.4437, + "step": 14156 + }, + { + "epoch": 0.18396375821701863, + "grad_norm": 0.3864283263683319, + "learning_rate": 0.00016324100911111402, + "loss": 1.3427, + "step": 14157 + }, + { + "epoch": 0.1839767527609345, + "grad_norm": 0.37371590733528137, + "learning_rate": 0.00016323840964920262, + "loss": 1.4361, + "step": 14158 + }, + { + "epoch": 0.18398974730485038, + "grad_norm": 0.3760530352592468, + "learning_rate": 0.00016323581018729124, + "loss": 1.5752, + "step": 14159 + }, + { + "epoch": 0.18400274184876625, + "grad_norm": 0.3577599823474884, + "learning_rate": 0.00016323321072537984, + "loss": 1.2996, + "step": 14160 + }, + { + "epoch": 0.18401573639268212, + "grad_norm": 0.2551228404045105, + "learning_rate": 0.0001632306112634685, + "loss": 1.2791, + "step": 14161 + }, + { + "epoch": 0.184028730936598, + "grad_norm": 0.2513895332813263, + "learning_rate": 0.0001632280118015571, + "loss": 1.1956, + "step": 14162 + }, + { + "epoch": 0.18404172548051387, + "grad_norm": 0.4167480170726776, + "learning_rate": 0.0001632254123396457, + "loss": 1.5575, + "step": 14163 + }, + { + "epoch": 0.18405472002442974, + "grad_norm": 0.3538648784160614, + "learning_rate": 0.0001632228128777343, + "loss": 1.5567, + "step": 14164 + }, + { + "epoch": 0.18406771456834561, + "grad_norm": 0.3859049677848816, + "learning_rate": 0.00016322021341582294, + "loss": 1.3137, + "step": 14165 + }, + { + "epoch": 0.1840807091122615, + "grad_norm": 0.361909955739975, + "learning_rate": 0.00016321761395391156, + "loss": 1.2688, + "step": 14166 + }, + { + "epoch": 0.18409370365617736, + "grad_norm": 0.469959557056427, + "learning_rate": 0.00016321501449200016, + "loss": 1.5363, + "step": 14167 + }, + { + "epoch": 0.18410669820009323, + "grad_norm": 0.339082807302475, + "learning_rate": 0.00016321241503008878, + "loss": 1.3991, + "step": 14168 + }, + { + "epoch": 0.1841196927440091, + "grad_norm": 0.38139861822128296, + "learning_rate": 0.0001632098155681774, + "loss": 1.4373, + "step": 14169 + }, + { + "epoch": 0.18413268728792498, + "grad_norm": 0.33445730805397034, + "learning_rate": 0.000163207216106266, + "loss": 1.4306, + "step": 14170 + }, + { + "epoch": 0.18414568183184085, + "grad_norm": 0.3137668967247009, + "learning_rate": 0.00016320461664435463, + "loss": 1.3268, + "step": 14171 + }, + { + "epoch": 0.18415867637575672, + "grad_norm": 0.3523469567298889, + "learning_rate": 0.00016320201718244323, + "loss": 1.2153, + "step": 14172 + }, + { + "epoch": 0.1841716709196726, + "grad_norm": 0.35693836212158203, + "learning_rate": 0.00016319941772053188, + "loss": 1.3563, + "step": 14173 + }, + { + "epoch": 0.18418466546358847, + "grad_norm": 0.3365146517753601, + "learning_rate": 0.00016319681825862048, + "loss": 1.313, + "step": 14174 + }, + { + "epoch": 0.18419766000750434, + "grad_norm": 0.25826820731163025, + "learning_rate": 0.0001631942187967091, + "loss": 1.2878, + "step": 14175 + }, + { + "epoch": 0.18421065455142022, + "grad_norm": 0.5472778081893921, + "learning_rate": 0.0001631916193347977, + "loss": 1.6011, + "step": 14176 + }, + { + "epoch": 0.1842236490953361, + "grad_norm": 0.33301112055778503, + "learning_rate": 0.00016318901987288632, + "loss": 1.3935, + "step": 14177 + }, + { + "epoch": 0.18423664363925196, + "grad_norm": 0.3766372501850128, + "learning_rate": 0.00016318642041097495, + "loss": 1.4649, + "step": 14178 + }, + { + "epoch": 0.18424963818316784, + "grad_norm": 0.37197381258010864, + "learning_rate": 0.00016318382094906354, + "loss": 1.5793, + "step": 14179 + }, + { + "epoch": 0.1842626327270837, + "grad_norm": 0.38353264331817627, + "learning_rate": 0.00016318122148715217, + "loss": 1.3998, + "step": 14180 + }, + { + "epoch": 0.18427562727099958, + "grad_norm": 0.4202147424221039, + "learning_rate": 0.0001631786220252408, + "loss": 1.4258, + "step": 14181 + }, + { + "epoch": 0.18428862181491545, + "grad_norm": 0.43637004494667053, + "learning_rate": 0.0001631760225633294, + "loss": 1.4895, + "step": 14182 + }, + { + "epoch": 0.18430161635883133, + "grad_norm": 0.4439176917076111, + "learning_rate": 0.00016317342310141801, + "loss": 1.4651, + "step": 14183 + }, + { + "epoch": 0.1843146109027472, + "grad_norm": 0.33773234486579895, + "learning_rate": 0.0001631708236395066, + "loss": 1.3833, + "step": 14184 + }, + { + "epoch": 0.18432760544666307, + "grad_norm": 0.373038649559021, + "learning_rate": 0.00016316822417759526, + "loss": 1.2611, + "step": 14185 + }, + { + "epoch": 0.18434059999057895, + "grad_norm": 0.3922519385814667, + "learning_rate": 0.00016316562471568386, + "loss": 1.4736, + "step": 14186 + }, + { + "epoch": 0.18435359453449482, + "grad_norm": 0.33782196044921875, + "learning_rate": 0.00016316302525377249, + "loss": 1.3659, + "step": 14187 + }, + { + "epoch": 0.1843665890784107, + "grad_norm": 0.3193321228027344, + "learning_rate": 0.00016316042579186108, + "loss": 1.1122, + "step": 14188 + }, + { + "epoch": 0.18437958362232656, + "grad_norm": 0.391689270734787, + "learning_rate": 0.0001631578263299497, + "loss": 1.3567, + "step": 14189 + }, + { + "epoch": 0.18439257816624244, + "grad_norm": 0.23014414310455322, + "learning_rate": 0.00016315522686803833, + "loss": 1.2416, + "step": 14190 + }, + { + "epoch": 0.1844055727101583, + "grad_norm": 0.3476603925228119, + "learning_rate": 0.00016315262740612693, + "loss": 1.1089, + "step": 14191 + }, + { + "epoch": 0.18441856725407418, + "grad_norm": 0.5365970134735107, + "learning_rate": 0.00016315002794421555, + "loss": 1.3553, + "step": 14192 + }, + { + "epoch": 0.18443156179799006, + "grad_norm": 0.315402626991272, + "learning_rate": 0.00016314742848230418, + "loss": 1.5172, + "step": 14193 + }, + { + "epoch": 0.18444455634190593, + "grad_norm": 0.3669466972351074, + "learning_rate": 0.00016314482902039278, + "loss": 1.5686, + "step": 14194 + }, + { + "epoch": 0.1844575508858218, + "grad_norm": 0.4669295847415924, + "learning_rate": 0.0001631422295584814, + "loss": 1.4138, + "step": 14195 + }, + { + "epoch": 0.18447054542973768, + "grad_norm": 0.36518460512161255, + "learning_rate": 0.00016313963009657002, + "loss": 1.3373, + "step": 14196 + }, + { + "epoch": 0.18448353997365355, + "grad_norm": 0.437038391828537, + "learning_rate": 0.00016313703063465865, + "loss": 1.4447, + "step": 14197 + }, + { + "epoch": 0.18449653451756945, + "grad_norm": 0.2697826325893402, + "learning_rate": 0.00016313443117274725, + "loss": 1.4632, + "step": 14198 + }, + { + "epoch": 0.18450952906148532, + "grad_norm": 0.4508165419101715, + "learning_rate": 0.00016313183171083587, + "loss": 1.5388, + "step": 14199 + }, + { + "epoch": 0.1845225236054012, + "grad_norm": 0.33104583621025085, + "learning_rate": 0.0001631292322489245, + "loss": 1.429, + "step": 14200 + }, + { + "epoch": 0.18453551814931707, + "grad_norm": 0.3331906199455261, + "learning_rate": 0.0001631266327870131, + "loss": 1.4997, + "step": 14201 + }, + { + "epoch": 0.18454851269323294, + "grad_norm": 0.4687112271785736, + "learning_rate": 0.00016312403332510172, + "loss": 1.4175, + "step": 14202 + }, + { + "epoch": 0.18456150723714881, + "grad_norm": 0.37302765250205994, + "learning_rate": 0.00016312143386319031, + "loss": 1.404, + "step": 14203 + }, + { + "epoch": 0.1845745017810647, + "grad_norm": 0.3988381326198578, + "learning_rate": 0.00016311883440127897, + "loss": 1.6268, + "step": 14204 + }, + { + "epoch": 0.18458749632498056, + "grad_norm": 0.4780881702899933, + "learning_rate": 0.00016311623493936756, + "loss": 1.6107, + "step": 14205 + }, + { + "epoch": 0.18460049086889643, + "grad_norm": 0.2883290648460388, + "learning_rate": 0.00016311363547745616, + "loss": 1.2355, + "step": 14206 + }, + { + "epoch": 0.1846134854128123, + "grad_norm": 0.4382941424846649, + "learning_rate": 0.00016311103601554479, + "loss": 1.3941, + "step": 14207 + }, + { + "epoch": 0.18462647995672818, + "grad_norm": 0.3439268171787262, + "learning_rate": 0.0001631084365536334, + "loss": 1.142, + "step": 14208 + }, + { + "epoch": 0.18463947450064405, + "grad_norm": 0.2982230484485626, + "learning_rate": 0.00016310583709172203, + "loss": 1.3042, + "step": 14209 + }, + { + "epoch": 0.18465246904455992, + "grad_norm": 0.4891691207885742, + "learning_rate": 0.00016310323762981063, + "loss": 1.4856, + "step": 14210 + }, + { + "epoch": 0.1846654635884758, + "grad_norm": 0.4565635919570923, + "learning_rate": 0.00016310063816789926, + "loss": 1.4687, + "step": 14211 + }, + { + "epoch": 0.18467845813239167, + "grad_norm": 0.32642844319343567, + "learning_rate": 0.00016309803870598788, + "loss": 1.4441, + "step": 14212 + }, + { + "epoch": 0.18469145267630754, + "grad_norm": 0.3676673173904419, + "learning_rate": 0.00016309543924407648, + "loss": 1.2649, + "step": 14213 + }, + { + "epoch": 0.18470444722022342, + "grad_norm": 0.3801582157611847, + "learning_rate": 0.0001630928397821651, + "loss": 1.3451, + "step": 14214 + }, + { + "epoch": 0.1847174417641393, + "grad_norm": 0.3406851887702942, + "learning_rate": 0.0001630902403202537, + "loss": 1.4178, + "step": 14215 + }, + { + "epoch": 0.18473043630805516, + "grad_norm": 0.43474966287612915, + "learning_rate": 0.00016308764085834235, + "loss": 1.465, + "step": 14216 + }, + { + "epoch": 0.18474343085197104, + "grad_norm": 0.38164186477661133, + "learning_rate": 0.00016308504139643095, + "loss": 1.6319, + "step": 14217 + }, + { + "epoch": 0.1847564253958869, + "grad_norm": 0.34267258644104004, + "learning_rate": 0.00016308244193451955, + "loss": 1.2316, + "step": 14218 + }, + { + "epoch": 0.18476941993980278, + "grad_norm": 0.31737038493156433, + "learning_rate": 0.00016307984247260817, + "loss": 1.3975, + "step": 14219 + }, + { + "epoch": 0.18478241448371865, + "grad_norm": 0.398255854845047, + "learning_rate": 0.0001630772430106968, + "loss": 1.4754, + "step": 14220 + }, + { + "epoch": 0.18479540902763453, + "grad_norm": 0.3205759823322296, + "learning_rate": 0.00016307464354878542, + "loss": 1.2871, + "step": 14221 + }, + { + "epoch": 0.1848084035715504, + "grad_norm": 0.4778819680213928, + "learning_rate": 0.00016307204408687402, + "loss": 1.5783, + "step": 14222 + }, + { + "epoch": 0.18482139811546627, + "grad_norm": 0.40417802333831787, + "learning_rate": 0.00016306944462496264, + "loss": 1.4569, + "step": 14223 + }, + { + "epoch": 0.18483439265938215, + "grad_norm": 0.321976900100708, + "learning_rate": 0.00016306684516305127, + "loss": 1.4991, + "step": 14224 + }, + { + "epoch": 0.18484738720329802, + "grad_norm": 0.27175503969192505, + "learning_rate": 0.00016306424570113986, + "loss": 1.276, + "step": 14225 + }, + { + "epoch": 0.1848603817472139, + "grad_norm": 0.49025505781173706, + "learning_rate": 0.0001630616462392285, + "loss": 1.5754, + "step": 14226 + }, + { + "epoch": 0.18487337629112977, + "grad_norm": 0.37089693546295166, + "learning_rate": 0.00016305904677731709, + "loss": 1.3411, + "step": 14227 + }, + { + "epoch": 0.18488637083504564, + "grad_norm": 0.3145904839038849, + "learning_rate": 0.00016305644731540574, + "loss": 1.4623, + "step": 14228 + }, + { + "epoch": 0.1848993653789615, + "grad_norm": 0.4463021755218506, + "learning_rate": 0.00016305384785349433, + "loss": 1.4372, + "step": 14229 + }, + { + "epoch": 0.18491235992287738, + "grad_norm": 0.3998275101184845, + "learning_rate": 0.00016305124839158293, + "loss": 1.4106, + "step": 14230 + }, + { + "epoch": 0.18492535446679326, + "grad_norm": 0.33859017491340637, + "learning_rate": 0.00016304864892967158, + "loss": 1.4446, + "step": 14231 + }, + { + "epoch": 0.18493834901070913, + "grad_norm": 0.33995580673217773, + "learning_rate": 0.00016304604946776018, + "loss": 1.4584, + "step": 14232 + }, + { + "epoch": 0.184951343554625, + "grad_norm": 0.36465945839881897, + "learning_rate": 0.0001630434500058488, + "loss": 1.4677, + "step": 14233 + }, + { + "epoch": 0.18496433809854088, + "grad_norm": 0.3799170255661011, + "learning_rate": 0.0001630408505439374, + "loss": 1.395, + "step": 14234 + }, + { + "epoch": 0.18497733264245675, + "grad_norm": 0.4696342945098877, + "learning_rate": 0.00016303825108202603, + "loss": 1.2541, + "step": 14235 + }, + { + "epoch": 0.18499032718637262, + "grad_norm": 0.31894004344940186, + "learning_rate": 0.00016303565162011465, + "loss": 1.4096, + "step": 14236 + }, + { + "epoch": 0.1850033217302885, + "grad_norm": 0.441837340593338, + "learning_rate": 0.00016303305215820325, + "loss": 1.4345, + "step": 14237 + }, + { + "epoch": 0.18501631627420437, + "grad_norm": 0.4292527735233307, + "learning_rate": 0.00016303045269629187, + "loss": 1.3966, + "step": 14238 + }, + { + "epoch": 0.18502931081812024, + "grad_norm": 0.34820497035980225, + "learning_rate": 0.0001630278532343805, + "loss": 1.4185, + "step": 14239 + }, + { + "epoch": 0.1850423053620361, + "grad_norm": 0.4338149130344391, + "learning_rate": 0.00016302525377246912, + "loss": 1.5342, + "step": 14240 + }, + { + "epoch": 0.185055299905952, + "grad_norm": 0.45493677258491516, + "learning_rate": 0.00016302265431055772, + "loss": 1.4273, + "step": 14241 + }, + { + "epoch": 0.18506829444986786, + "grad_norm": 0.39177313446998596, + "learning_rate": 0.00016302005484864634, + "loss": 1.3888, + "step": 14242 + }, + { + "epoch": 0.18508128899378373, + "grad_norm": 0.4118193984031677, + "learning_rate": 0.00016301745538673497, + "loss": 1.537, + "step": 14243 + }, + { + "epoch": 0.1850942835376996, + "grad_norm": 0.34462904930114746, + "learning_rate": 0.00016301485592482357, + "loss": 1.2559, + "step": 14244 + }, + { + "epoch": 0.18510727808161548, + "grad_norm": 0.3598201870918274, + "learning_rate": 0.0001630122564629122, + "loss": 1.4949, + "step": 14245 + }, + { + "epoch": 0.18512027262553135, + "grad_norm": 0.39716100692749023, + "learning_rate": 0.0001630096570010008, + "loss": 1.345, + "step": 14246 + }, + { + "epoch": 0.18513326716944722, + "grad_norm": 0.35244104266166687, + "learning_rate": 0.0001630070575390894, + "loss": 1.294, + "step": 14247 + }, + { + "epoch": 0.1851462617133631, + "grad_norm": 0.4715386629104614, + "learning_rate": 0.00016300445807717804, + "loss": 1.6238, + "step": 14248 + }, + { + "epoch": 0.18515925625727897, + "grad_norm": 0.346696138381958, + "learning_rate": 0.00016300185861526663, + "loss": 1.5361, + "step": 14249 + }, + { + "epoch": 0.18517225080119484, + "grad_norm": 0.3919036090373993, + "learning_rate": 0.00016299925915335526, + "loss": 1.4763, + "step": 14250 + }, + { + "epoch": 0.18518524534511072, + "grad_norm": 0.560883104801178, + "learning_rate": 0.00016299665969144388, + "loss": 1.6108, + "step": 14251 + }, + { + "epoch": 0.1851982398890266, + "grad_norm": 0.3743640184402466, + "learning_rate": 0.0001629940602295325, + "loss": 1.4283, + "step": 14252 + }, + { + "epoch": 0.18521123443294246, + "grad_norm": 0.41247740387916565, + "learning_rate": 0.0001629914607676211, + "loss": 1.6239, + "step": 14253 + }, + { + "epoch": 0.18522422897685833, + "grad_norm": 0.4529300332069397, + "learning_rate": 0.00016298886130570973, + "loss": 1.3986, + "step": 14254 + }, + { + "epoch": 0.1852372235207742, + "grad_norm": 0.38263243436813354, + "learning_rate": 0.00016298626184379835, + "loss": 1.4298, + "step": 14255 + }, + { + "epoch": 0.18525021806469008, + "grad_norm": 0.43897202610969543, + "learning_rate": 0.00016298366238188695, + "loss": 1.5402, + "step": 14256 + }, + { + "epoch": 0.18526321260860595, + "grad_norm": 0.35338664054870605, + "learning_rate": 0.00016298106291997558, + "loss": 1.4421, + "step": 14257 + }, + { + "epoch": 0.18527620715252183, + "grad_norm": 0.32810527086257935, + "learning_rate": 0.00016297846345806417, + "loss": 1.2929, + "step": 14258 + }, + { + "epoch": 0.1852892016964377, + "grad_norm": 0.36444562673568726, + "learning_rate": 0.00016297586399615282, + "loss": 1.5201, + "step": 14259 + }, + { + "epoch": 0.18530219624035357, + "grad_norm": 0.3933938443660736, + "learning_rate": 0.00016297326453424142, + "loss": 1.5183, + "step": 14260 + }, + { + "epoch": 0.18531519078426945, + "grad_norm": 0.37150898575782776, + "learning_rate": 0.00016297066507233002, + "loss": 1.3755, + "step": 14261 + }, + { + "epoch": 0.18532818532818532, + "grad_norm": 0.5009927153587341, + "learning_rate": 0.00016296806561041864, + "loss": 1.5118, + "step": 14262 + }, + { + "epoch": 0.1853411798721012, + "grad_norm": 0.3658714294433594, + "learning_rate": 0.00016296546614850727, + "loss": 1.643, + "step": 14263 + }, + { + "epoch": 0.18535417441601706, + "grad_norm": 0.4005360007286072, + "learning_rate": 0.0001629628666865959, + "loss": 1.4701, + "step": 14264 + }, + { + "epoch": 0.18536716895993294, + "grad_norm": 0.4908580780029297, + "learning_rate": 0.0001629602672246845, + "loss": 1.5584, + "step": 14265 + }, + { + "epoch": 0.1853801635038488, + "grad_norm": 0.3978935480117798, + "learning_rate": 0.00016295766776277311, + "loss": 1.3195, + "step": 14266 + }, + { + "epoch": 0.18539315804776468, + "grad_norm": 0.5323774218559265, + "learning_rate": 0.00016295506830086174, + "loss": 1.378, + "step": 14267 + }, + { + "epoch": 0.18540615259168056, + "grad_norm": 0.35051625967025757, + "learning_rate": 0.00016295246883895034, + "loss": 1.4612, + "step": 14268 + }, + { + "epoch": 0.18541914713559643, + "grad_norm": 0.4646984338760376, + "learning_rate": 0.00016294986937703896, + "loss": 1.5986, + "step": 14269 + }, + { + "epoch": 0.1854321416795123, + "grad_norm": 0.45621147751808167, + "learning_rate": 0.00016294726991512759, + "loss": 1.4939, + "step": 14270 + }, + { + "epoch": 0.18544513622342818, + "grad_norm": 0.39475017786026, + "learning_rate": 0.0001629446704532162, + "loss": 1.4582, + "step": 14271 + }, + { + "epoch": 0.18545813076734405, + "grad_norm": 0.3017983138561249, + "learning_rate": 0.0001629420709913048, + "loss": 1.252, + "step": 14272 + }, + { + "epoch": 0.18547112531125992, + "grad_norm": 0.3345814049243927, + "learning_rate": 0.0001629394715293934, + "loss": 1.3758, + "step": 14273 + }, + { + "epoch": 0.18548411985517582, + "grad_norm": 0.3642207086086273, + "learning_rate": 0.00016293687206748206, + "loss": 1.4862, + "step": 14274 + }, + { + "epoch": 0.1854971143990917, + "grad_norm": 0.45335185527801514, + "learning_rate": 0.00016293427260557065, + "loss": 1.3258, + "step": 14275 + }, + { + "epoch": 0.18551010894300757, + "grad_norm": 0.32101088762283325, + "learning_rate": 0.00016293167314365928, + "loss": 1.4992, + "step": 14276 + }, + { + "epoch": 0.18552310348692344, + "grad_norm": 0.4141268730163574, + "learning_rate": 0.00016292907368174788, + "loss": 1.6381, + "step": 14277 + }, + { + "epoch": 0.1855360980308393, + "grad_norm": 0.46873944997787476, + "learning_rate": 0.0001629264742198365, + "loss": 1.3337, + "step": 14278 + }, + { + "epoch": 0.1855490925747552, + "grad_norm": 0.38198965787887573, + "learning_rate": 0.00016292387475792512, + "loss": 1.4683, + "step": 14279 + }, + { + "epoch": 0.18556208711867106, + "grad_norm": 0.3092433214187622, + "learning_rate": 0.00016292127529601372, + "loss": 1.3181, + "step": 14280 + }, + { + "epoch": 0.18557508166258693, + "grad_norm": 0.421966016292572, + "learning_rate": 0.00016291867583410235, + "loss": 1.4173, + "step": 14281 + }, + { + "epoch": 0.1855880762065028, + "grad_norm": 0.38572409749031067, + "learning_rate": 0.00016291607637219097, + "loss": 1.3703, + "step": 14282 + }, + { + "epoch": 0.18560107075041868, + "grad_norm": 0.39017271995544434, + "learning_rate": 0.0001629134769102796, + "loss": 1.2168, + "step": 14283 + }, + { + "epoch": 0.18561406529433455, + "grad_norm": 0.35076531767845154, + "learning_rate": 0.0001629108774483682, + "loss": 1.4448, + "step": 14284 + }, + { + "epoch": 0.18562705983825042, + "grad_norm": 0.4101915657520294, + "learning_rate": 0.0001629082779864568, + "loss": 1.5174, + "step": 14285 + }, + { + "epoch": 0.1856400543821663, + "grad_norm": 0.40193435549736023, + "learning_rate": 0.00016290567852454544, + "loss": 1.3288, + "step": 14286 + }, + { + "epoch": 0.18565304892608217, + "grad_norm": 0.4339751601219177, + "learning_rate": 0.00016290307906263404, + "loss": 1.3335, + "step": 14287 + }, + { + "epoch": 0.18566604346999804, + "grad_norm": 0.432715505361557, + "learning_rate": 0.00016290047960072266, + "loss": 1.2862, + "step": 14288 + }, + { + "epoch": 0.18567903801391392, + "grad_norm": 0.34752845764160156, + "learning_rate": 0.00016289788013881126, + "loss": 1.4881, + "step": 14289 + }, + { + "epoch": 0.1856920325578298, + "grad_norm": 0.330517053604126, + "learning_rate": 0.00016289528067689989, + "loss": 1.3736, + "step": 14290 + }, + { + "epoch": 0.18570502710174566, + "grad_norm": 0.3264653980731964, + "learning_rate": 0.0001628926812149885, + "loss": 1.473, + "step": 14291 + }, + { + "epoch": 0.18571802164566154, + "grad_norm": 0.2794967293739319, + "learning_rate": 0.0001628900817530771, + "loss": 1.5245, + "step": 14292 + }, + { + "epoch": 0.1857310161895774, + "grad_norm": 0.3297751843929291, + "learning_rate": 0.00016288748229116573, + "loss": 1.4719, + "step": 14293 + }, + { + "epoch": 0.18574401073349328, + "grad_norm": 0.36630377173423767, + "learning_rate": 0.00016288488282925436, + "loss": 1.3589, + "step": 14294 + }, + { + "epoch": 0.18575700527740915, + "grad_norm": 0.3053281009197235, + "learning_rate": 0.00016288228336734298, + "loss": 1.4403, + "step": 14295 + }, + { + "epoch": 0.18576999982132503, + "grad_norm": 0.3138176202774048, + "learning_rate": 0.00016287968390543158, + "loss": 1.2616, + "step": 14296 + }, + { + "epoch": 0.1857829943652409, + "grad_norm": 0.422550767660141, + "learning_rate": 0.0001628770844435202, + "loss": 1.4104, + "step": 14297 + }, + { + "epoch": 0.18579598890915677, + "grad_norm": 0.33258068561553955, + "learning_rate": 0.00016287448498160883, + "loss": 1.5382, + "step": 14298 + }, + { + "epoch": 0.18580898345307265, + "grad_norm": 0.4508451223373413, + "learning_rate": 0.00016287188551969742, + "loss": 1.3465, + "step": 14299 + }, + { + "epoch": 0.18582197799698852, + "grad_norm": 0.47415196895599365, + "learning_rate": 0.00016286928605778605, + "loss": 1.4864, + "step": 14300 + }, + { + "epoch": 0.1858349725409044, + "grad_norm": 0.4498703181743622, + "learning_rate": 0.00016286668659587465, + "loss": 1.5972, + "step": 14301 + }, + { + "epoch": 0.18584796708482026, + "grad_norm": 0.3581358790397644, + "learning_rate": 0.00016286408713396327, + "loss": 1.4269, + "step": 14302 + }, + { + "epoch": 0.18586096162873614, + "grad_norm": 0.2862187922000885, + "learning_rate": 0.0001628614876720519, + "loss": 1.316, + "step": 14303 + }, + { + "epoch": 0.185873956172652, + "grad_norm": 0.32734933495521545, + "learning_rate": 0.0001628588882101405, + "loss": 1.3223, + "step": 14304 + }, + { + "epoch": 0.18588695071656788, + "grad_norm": 0.3670196831226349, + "learning_rate": 0.00016285628874822912, + "loss": 1.3676, + "step": 14305 + }, + { + "epoch": 0.18589994526048376, + "grad_norm": 0.43077024817466736, + "learning_rate": 0.00016285368928631774, + "loss": 1.4851, + "step": 14306 + }, + { + "epoch": 0.18591293980439963, + "grad_norm": 0.3802175223827362, + "learning_rate": 0.00016285108982440637, + "loss": 1.4537, + "step": 14307 + }, + { + "epoch": 0.1859259343483155, + "grad_norm": 0.39573338627815247, + "learning_rate": 0.00016284849036249496, + "loss": 1.3061, + "step": 14308 + }, + { + "epoch": 0.18593892889223138, + "grad_norm": 0.3352431356906891, + "learning_rate": 0.0001628458909005836, + "loss": 1.4291, + "step": 14309 + }, + { + "epoch": 0.18595192343614725, + "grad_norm": 0.44189995527267456, + "learning_rate": 0.0001628432914386722, + "loss": 1.4303, + "step": 14310 + }, + { + "epoch": 0.18596491798006312, + "grad_norm": 0.3592013716697693, + "learning_rate": 0.0001628406919767608, + "loss": 1.4524, + "step": 14311 + }, + { + "epoch": 0.185977912523979, + "grad_norm": 0.34093964099884033, + "learning_rate": 0.00016283809251484943, + "loss": 1.3018, + "step": 14312 + }, + { + "epoch": 0.18599090706789487, + "grad_norm": 0.41372033953666687, + "learning_rate": 0.00016283549305293806, + "loss": 1.4411, + "step": 14313 + }, + { + "epoch": 0.18600390161181074, + "grad_norm": 0.43197929859161377, + "learning_rate": 0.00016283289359102666, + "loss": 1.29, + "step": 14314 + }, + { + "epoch": 0.1860168961557266, + "grad_norm": 0.3922059237957001, + "learning_rate": 0.00016283029412911528, + "loss": 1.3814, + "step": 14315 + }, + { + "epoch": 0.1860298906996425, + "grad_norm": 0.4541341960430145, + "learning_rate": 0.00016282769466720388, + "loss": 1.1392, + "step": 14316 + }, + { + "epoch": 0.18604288524355836, + "grad_norm": 0.3847472667694092, + "learning_rate": 0.00016282509520529253, + "loss": 1.3816, + "step": 14317 + }, + { + "epoch": 0.18605587978747423, + "grad_norm": 0.38595613837242126, + "learning_rate": 0.00016282249574338113, + "loss": 1.4065, + "step": 14318 + }, + { + "epoch": 0.1860688743313901, + "grad_norm": 0.46247953176498413, + "learning_rate": 0.00016281989628146975, + "loss": 1.5003, + "step": 14319 + }, + { + "epoch": 0.18608186887530598, + "grad_norm": 0.4265228807926178, + "learning_rate": 0.00016281729681955835, + "loss": 1.4465, + "step": 14320 + }, + { + "epoch": 0.18609486341922185, + "grad_norm": 0.4624263048171997, + "learning_rate": 0.00016281469735764697, + "loss": 1.4938, + "step": 14321 + }, + { + "epoch": 0.18610785796313772, + "grad_norm": 0.43953514099121094, + "learning_rate": 0.0001628120978957356, + "loss": 1.5869, + "step": 14322 + }, + { + "epoch": 0.1861208525070536, + "grad_norm": 0.49323806166648865, + "learning_rate": 0.0001628094984338242, + "loss": 1.3974, + "step": 14323 + }, + { + "epoch": 0.18613384705096947, + "grad_norm": 0.3058515191078186, + "learning_rate": 0.00016280689897191282, + "loss": 1.4803, + "step": 14324 + }, + { + "epoch": 0.18614684159488534, + "grad_norm": 0.3950141668319702, + "learning_rate": 0.00016280429951000144, + "loss": 1.5046, + "step": 14325 + }, + { + "epoch": 0.18615983613880122, + "grad_norm": 0.39663898944854736, + "learning_rate": 0.00016280170004809007, + "loss": 1.3636, + "step": 14326 + }, + { + "epoch": 0.1861728306827171, + "grad_norm": 0.30346032977104187, + "learning_rate": 0.00016279910058617867, + "loss": 1.4355, + "step": 14327 + }, + { + "epoch": 0.18618582522663296, + "grad_norm": 0.3703216314315796, + "learning_rate": 0.00016279650112426726, + "loss": 1.3973, + "step": 14328 + }, + { + "epoch": 0.18619881977054883, + "grad_norm": 0.4186863601207733, + "learning_rate": 0.00016279390166235592, + "loss": 1.3924, + "step": 14329 + }, + { + "epoch": 0.1862118143144647, + "grad_norm": 0.36253488063812256, + "learning_rate": 0.0001627913022004445, + "loss": 1.2816, + "step": 14330 + }, + { + "epoch": 0.18622480885838058, + "grad_norm": 0.4027514159679413, + "learning_rate": 0.00016278870273853314, + "loss": 1.5649, + "step": 14331 + }, + { + "epoch": 0.18623780340229645, + "grad_norm": 0.4339659810066223, + "learning_rate": 0.00016278610327662173, + "loss": 1.425, + "step": 14332 + }, + { + "epoch": 0.18625079794621233, + "grad_norm": 0.3830181658267975, + "learning_rate": 0.00016278350381471036, + "loss": 1.2091, + "step": 14333 + }, + { + "epoch": 0.1862637924901282, + "grad_norm": 0.42852750420570374, + "learning_rate": 0.00016278090435279898, + "loss": 1.4521, + "step": 14334 + }, + { + "epoch": 0.18627678703404407, + "grad_norm": 0.3467976450920105, + "learning_rate": 0.00016277830489088758, + "loss": 1.5321, + "step": 14335 + }, + { + "epoch": 0.18628978157795995, + "grad_norm": 0.4158039093017578, + "learning_rate": 0.0001627757054289762, + "loss": 1.3732, + "step": 14336 + }, + { + "epoch": 0.18630277612187582, + "grad_norm": 0.4933732748031616, + "learning_rate": 0.00016277310596706483, + "loss": 1.5187, + "step": 14337 + }, + { + "epoch": 0.1863157706657917, + "grad_norm": 0.37910646200180054, + "learning_rate": 0.00016277050650515345, + "loss": 1.3129, + "step": 14338 + }, + { + "epoch": 0.18632876520970756, + "grad_norm": 0.42034706473350525, + "learning_rate": 0.00016276790704324205, + "loss": 1.5287, + "step": 14339 + }, + { + "epoch": 0.18634175975362344, + "grad_norm": 0.3714899718761444, + "learning_rate": 0.00016276530758133065, + "loss": 1.1593, + "step": 14340 + }, + { + "epoch": 0.1863547542975393, + "grad_norm": 0.4030725359916687, + "learning_rate": 0.0001627627081194193, + "loss": 1.3238, + "step": 14341 + }, + { + "epoch": 0.18636774884145518, + "grad_norm": 0.3666721284389496, + "learning_rate": 0.0001627601086575079, + "loss": 1.399, + "step": 14342 + }, + { + "epoch": 0.18638074338537106, + "grad_norm": 0.3854379653930664, + "learning_rate": 0.00016275750919559652, + "loss": 1.3854, + "step": 14343 + }, + { + "epoch": 0.18639373792928693, + "grad_norm": 0.4046310484409332, + "learning_rate": 0.00016275490973368515, + "loss": 1.4014, + "step": 14344 + }, + { + "epoch": 0.1864067324732028, + "grad_norm": 0.4487934708595276, + "learning_rate": 0.00016275231027177374, + "loss": 1.6155, + "step": 14345 + }, + { + "epoch": 0.18641972701711867, + "grad_norm": 0.46088072657585144, + "learning_rate": 0.00016274971080986237, + "loss": 1.3763, + "step": 14346 + }, + { + "epoch": 0.18643272156103455, + "grad_norm": 0.45239901542663574, + "learning_rate": 0.00016274711134795097, + "loss": 1.5041, + "step": 14347 + }, + { + "epoch": 0.18644571610495042, + "grad_norm": 0.37555330991744995, + "learning_rate": 0.00016274451188603962, + "loss": 1.5502, + "step": 14348 + }, + { + "epoch": 0.1864587106488663, + "grad_norm": 0.4384852647781372, + "learning_rate": 0.00016274191242412822, + "loss": 1.5333, + "step": 14349 + }, + { + "epoch": 0.18647170519278217, + "grad_norm": 0.334956556558609, + "learning_rate": 0.00016273931296221684, + "loss": 1.3302, + "step": 14350 + }, + { + "epoch": 0.18648469973669807, + "grad_norm": 0.35770031809806824, + "learning_rate": 0.00016273671350030544, + "loss": 1.4343, + "step": 14351 + }, + { + "epoch": 0.18649769428061394, + "grad_norm": 0.29289156198501587, + "learning_rate": 0.00016273411403839406, + "loss": 0.9746, + "step": 14352 + }, + { + "epoch": 0.1865106888245298, + "grad_norm": 0.40831005573272705, + "learning_rate": 0.00016273151457648269, + "loss": 1.6699, + "step": 14353 + }, + { + "epoch": 0.1865236833684457, + "grad_norm": 0.3534592390060425, + "learning_rate": 0.00016272891511457128, + "loss": 1.4727, + "step": 14354 + }, + { + "epoch": 0.18653667791236156, + "grad_norm": 0.3372885584831238, + "learning_rate": 0.0001627263156526599, + "loss": 1.3443, + "step": 14355 + }, + { + "epoch": 0.18654967245627743, + "grad_norm": 0.43740254640579224, + "learning_rate": 0.00016272371619074853, + "loss": 1.4162, + "step": 14356 + }, + { + "epoch": 0.1865626670001933, + "grad_norm": 0.3833792209625244, + "learning_rate": 0.00016272111672883713, + "loss": 1.2537, + "step": 14357 + }, + { + "epoch": 0.18657566154410918, + "grad_norm": 0.3932992219924927, + "learning_rate": 0.00016271851726692575, + "loss": 1.4399, + "step": 14358 + }, + { + "epoch": 0.18658865608802505, + "grad_norm": 0.3103752136230469, + "learning_rate": 0.00016271591780501435, + "loss": 1.2438, + "step": 14359 + }, + { + "epoch": 0.18660165063194092, + "grad_norm": 0.4436561167240143, + "learning_rate": 0.000162713318343103, + "loss": 1.5726, + "step": 14360 + }, + { + "epoch": 0.1866146451758568, + "grad_norm": 0.3723568320274353, + "learning_rate": 0.0001627107188811916, + "loss": 1.5834, + "step": 14361 + }, + { + "epoch": 0.18662763971977267, + "grad_norm": 0.3735470175743103, + "learning_rate": 0.00016270811941928022, + "loss": 1.4049, + "step": 14362 + }, + { + "epoch": 0.18664063426368854, + "grad_norm": 0.4717046916484833, + "learning_rate": 0.00016270551995736882, + "loss": 1.4987, + "step": 14363 + }, + { + "epoch": 0.18665362880760442, + "grad_norm": 0.3478568494319916, + "learning_rate": 0.00016270292049545745, + "loss": 1.45, + "step": 14364 + }, + { + "epoch": 0.1866666233515203, + "grad_norm": 0.3395802676677704, + "learning_rate": 0.00016270032103354607, + "loss": 1.5079, + "step": 14365 + }, + { + "epoch": 0.18667961789543616, + "grad_norm": 0.3740818500518799, + "learning_rate": 0.00016269772157163467, + "loss": 1.552, + "step": 14366 + }, + { + "epoch": 0.18669261243935203, + "grad_norm": 0.4298551380634308, + "learning_rate": 0.0001626951221097233, + "loss": 1.4899, + "step": 14367 + }, + { + "epoch": 0.1867056069832679, + "grad_norm": 0.28909531235694885, + "learning_rate": 0.00016269252264781192, + "loss": 1.5072, + "step": 14368 + }, + { + "epoch": 0.18671860152718378, + "grad_norm": 0.3859047591686249, + "learning_rate": 0.00016268992318590051, + "loss": 1.4512, + "step": 14369 + }, + { + "epoch": 0.18673159607109965, + "grad_norm": 0.40439948439598083, + "learning_rate": 0.00016268732372398914, + "loss": 1.4448, + "step": 14370 + }, + { + "epoch": 0.18674459061501553, + "grad_norm": 0.41528624296188354, + "learning_rate": 0.00016268472426207774, + "loss": 1.4992, + "step": 14371 + }, + { + "epoch": 0.1867575851589314, + "grad_norm": 0.4132675528526306, + "learning_rate": 0.0001626821248001664, + "loss": 1.5386, + "step": 14372 + }, + { + "epoch": 0.18677057970284727, + "grad_norm": 0.4855875074863434, + "learning_rate": 0.00016267952533825499, + "loss": 1.2689, + "step": 14373 + }, + { + "epoch": 0.18678357424676315, + "grad_norm": 0.36735275387763977, + "learning_rate": 0.0001626769258763436, + "loss": 1.3126, + "step": 14374 + }, + { + "epoch": 0.18679656879067902, + "grad_norm": 0.44390103220939636, + "learning_rate": 0.0001626743264144322, + "loss": 1.5056, + "step": 14375 + }, + { + "epoch": 0.1868095633345949, + "grad_norm": 0.29486754536628723, + "learning_rate": 0.00016267172695252083, + "loss": 1.2375, + "step": 14376 + }, + { + "epoch": 0.18682255787851076, + "grad_norm": 0.3379875719547272, + "learning_rate": 0.00016266912749060946, + "loss": 1.4989, + "step": 14377 + }, + { + "epoch": 0.18683555242242664, + "grad_norm": 0.3537369668483734, + "learning_rate": 0.00016266652802869805, + "loss": 1.4326, + "step": 14378 + }, + { + "epoch": 0.1868485469663425, + "grad_norm": 0.2884156405925751, + "learning_rate": 0.00016266392856678668, + "loss": 1.4428, + "step": 14379 + }, + { + "epoch": 0.18686154151025838, + "grad_norm": 0.35906651616096497, + "learning_rate": 0.0001626613291048753, + "loss": 1.2599, + "step": 14380 + }, + { + "epoch": 0.18687453605417426, + "grad_norm": 0.44593536853790283, + "learning_rate": 0.00016265872964296393, + "loss": 1.4354, + "step": 14381 + }, + { + "epoch": 0.18688753059809013, + "grad_norm": 0.3976133167743683, + "learning_rate": 0.00016265613018105252, + "loss": 1.4556, + "step": 14382 + }, + { + "epoch": 0.186900525142006, + "grad_norm": 0.38843801617622375, + "learning_rate": 0.00016265353071914115, + "loss": 1.5049, + "step": 14383 + }, + { + "epoch": 0.18691351968592188, + "grad_norm": 0.30734995007514954, + "learning_rate": 0.00016265093125722977, + "loss": 1.4126, + "step": 14384 + }, + { + "epoch": 0.18692651422983775, + "grad_norm": 0.42527928948402405, + "learning_rate": 0.00016264833179531837, + "loss": 1.6251, + "step": 14385 + }, + { + "epoch": 0.18693950877375362, + "grad_norm": 0.3487423062324524, + "learning_rate": 0.000162645732333407, + "loss": 1.3567, + "step": 14386 + }, + { + "epoch": 0.1869525033176695, + "grad_norm": 0.45996689796447754, + "learning_rate": 0.00016264313287149562, + "loss": 1.3291, + "step": 14387 + }, + { + "epoch": 0.18696549786158537, + "grad_norm": 0.40402936935424805, + "learning_rate": 0.00016264053340958422, + "loss": 1.5707, + "step": 14388 + }, + { + "epoch": 0.18697849240550124, + "grad_norm": 0.38168686628341675, + "learning_rate": 0.00016263793394767284, + "loss": 1.3254, + "step": 14389 + }, + { + "epoch": 0.1869914869494171, + "grad_norm": 0.4388217628002167, + "learning_rate": 0.00016263533448576144, + "loss": 1.3776, + "step": 14390 + }, + { + "epoch": 0.18700448149333299, + "grad_norm": 0.4481607973575592, + "learning_rate": 0.0001626327350238501, + "loss": 1.346, + "step": 14391 + }, + { + "epoch": 0.18701747603724886, + "grad_norm": 0.30641359090805054, + "learning_rate": 0.0001626301355619387, + "loss": 1.3329, + "step": 14392 + }, + { + "epoch": 0.18703047058116473, + "grad_norm": 0.37614983320236206, + "learning_rate": 0.0001626275361000273, + "loss": 1.5578, + "step": 14393 + }, + { + "epoch": 0.1870434651250806, + "grad_norm": 0.4834055006504059, + "learning_rate": 0.0001626249366381159, + "loss": 1.4689, + "step": 14394 + }, + { + "epoch": 0.18705645966899648, + "grad_norm": 0.3917204439640045, + "learning_rate": 0.00016262233717620453, + "loss": 1.584, + "step": 14395 + }, + { + "epoch": 0.18706945421291235, + "grad_norm": 0.44108593463897705, + "learning_rate": 0.00016261973771429316, + "loss": 1.5352, + "step": 14396 + }, + { + "epoch": 0.18708244875682822, + "grad_norm": 0.2887263000011444, + "learning_rate": 0.00016261713825238176, + "loss": 1.3455, + "step": 14397 + }, + { + "epoch": 0.1870954433007441, + "grad_norm": 0.38707253336906433, + "learning_rate": 0.00016261453879047038, + "loss": 1.3652, + "step": 14398 + }, + { + "epoch": 0.18710843784465997, + "grad_norm": 0.4041129946708679, + "learning_rate": 0.000162611939328559, + "loss": 1.1025, + "step": 14399 + }, + { + "epoch": 0.18712143238857584, + "grad_norm": 0.35989147424697876, + "learning_rate": 0.0001626093398666476, + "loss": 1.3621, + "step": 14400 + }, + { + "epoch": 0.18713442693249172, + "grad_norm": 0.3689303994178772, + "learning_rate": 0.00016260674040473623, + "loss": 1.5493, + "step": 14401 + }, + { + "epoch": 0.1871474214764076, + "grad_norm": 0.4389260411262512, + "learning_rate": 0.00016260414094282482, + "loss": 1.541, + "step": 14402 + }, + { + "epoch": 0.18716041602032346, + "grad_norm": 0.3757264316082001, + "learning_rate": 0.00016260154148091348, + "loss": 1.3747, + "step": 14403 + }, + { + "epoch": 0.18717341056423933, + "grad_norm": 0.3165930509567261, + "learning_rate": 0.00016259894201900207, + "loss": 1.3815, + "step": 14404 + }, + { + "epoch": 0.1871864051081552, + "grad_norm": 0.5496413707733154, + "learning_rate": 0.0001625963425570907, + "loss": 1.545, + "step": 14405 + }, + { + "epoch": 0.18719939965207108, + "grad_norm": 0.3831750750541687, + "learning_rate": 0.0001625937430951793, + "loss": 1.4059, + "step": 14406 + }, + { + "epoch": 0.18721239419598695, + "grad_norm": 0.3455681800842285, + "learning_rate": 0.00016259114363326792, + "loss": 1.3575, + "step": 14407 + }, + { + "epoch": 0.18722538873990283, + "grad_norm": 0.3597778081893921, + "learning_rate": 0.00016258854417135654, + "loss": 1.2048, + "step": 14408 + }, + { + "epoch": 0.1872383832838187, + "grad_norm": 0.3858657479286194, + "learning_rate": 0.00016258594470944514, + "loss": 1.4789, + "step": 14409 + }, + { + "epoch": 0.18725137782773457, + "grad_norm": 0.420121431350708, + "learning_rate": 0.00016258334524753377, + "loss": 1.6188, + "step": 14410 + }, + { + "epoch": 0.18726437237165044, + "grad_norm": 0.39170965552330017, + "learning_rate": 0.0001625807457856224, + "loss": 1.4187, + "step": 14411 + }, + { + "epoch": 0.18727736691556632, + "grad_norm": 0.4454212188720703, + "learning_rate": 0.000162578146323711, + "loss": 1.4111, + "step": 14412 + }, + { + "epoch": 0.1872903614594822, + "grad_norm": 0.3327889144420624, + "learning_rate": 0.0001625755468617996, + "loss": 1.5136, + "step": 14413 + }, + { + "epoch": 0.18730335600339806, + "grad_norm": 0.5569693446159363, + "learning_rate": 0.0001625729473998882, + "loss": 1.4477, + "step": 14414 + }, + { + "epoch": 0.18731635054731394, + "grad_norm": 0.4914078116416931, + "learning_rate": 0.00016257034793797686, + "loss": 1.4584, + "step": 14415 + }, + { + "epoch": 0.1873293450912298, + "grad_norm": 0.3696451485157013, + "learning_rate": 0.00016256774847606546, + "loss": 1.4937, + "step": 14416 + }, + { + "epoch": 0.18734233963514568, + "grad_norm": 0.295354425907135, + "learning_rate": 0.00016256514901415408, + "loss": 1.2159, + "step": 14417 + }, + { + "epoch": 0.18735533417906156, + "grad_norm": 0.4005788266658783, + "learning_rate": 0.0001625625495522427, + "loss": 1.4283, + "step": 14418 + }, + { + "epoch": 0.18736832872297743, + "grad_norm": 0.4608438014984131, + "learning_rate": 0.0001625599500903313, + "loss": 1.4152, + "step": 14419 + }, + { + "epoch": 0.1873813232668933, + "grad_norm": 0.5253211259841919, + "learning_rate": 0.00016255735062841993, + "loss": 1.5659, + "step": 14420 + }, + { + "epoch": 0.18739431781080917, + "grad_norm": 0.4151586890220642, + "learning_rate": 0.00016255475116650853, + "loss": 1.359, + "step": 14421 + }, + { + "epoch": 0.18740731235472505, + "grad_norm": 0.43396228551864624, + "learning_rate": 0.00016255215170459718, + "loss": 1.3933, + "step": 14422 + }, + { + "epoch": 0.18742030689864092, + "grad_norm": 0.4491978883743286, + "learning_rate": 0.00016254955224268578, + "loss": 1.5948, + "step": 14423 + }, + { + "epoch": 0.1874333014425568, + "grad_norm": 0.4526844620704651, + "learning_rate": 0.00016254695278077437, + "loss": 1.371, + "step": 14424 + }, + { + "epoch": 0.18744629598647267, + "grad_norm": 0.3974749445915222, + "learning_rate": 0.000162544353318863, + "loss": 1.2999, + "step": 14425 + }, + { + "epoch": 0.18745929053038854, + "grad_norm": 0.43171507120132446, + "learning_rate": 0.00016254175385695162, + "loss": 1.4081, + "step": 14426 + }, + { + "epoch": 0.18747228507430444, + "grad_norm": 0.40456482768058777, + "learning_rate": 0.00016253915439504025, + "loss": 1.3136, + "step": 14427 + }, + { + "epoch": 0.1874852796182203, + "grad_norm": 0.45029914379119873, + "learning_rate": 0.00016253655493312884, + "loss": 1.4957, + "step": 14428 + }, + { + "epoch": 0.18749827416213619, + "grad_norm": 0.5150418281555176, + "learning_rate": 0.00016253395547121747, + "loss": 1.4739, + "step": 14429 + }, + { + "epoch": 0.18751126870605206, + "grad_norm": 0.42873334884643555, + "learning_rate": 0.0001625313560093061, + "loss": 1.5303, + "step": 14430 + }, + { + "epoch": 0.18752426324996793, + "grad_norm": 0.3903109133243561, + "learning_rate": 0.0001625287565473947, + "loss": 1.3689, + "step": 14431 + }, + { + "epoch": 0.1875372577938838, + "grad_norm": 0.3819105327129364, + "learning_rate": 0.00016252615708548332, + "loss": 1.4961, + "step": 14432 + }, + { + "epoch": 0.18755025233779968, + "grad_norm": 0.36253443360328674, + "learning_rate": 0.0001625235576235719, + "loss": 1.5181, + "step": 14433 + }, + { + "epoch": 0.18756324688171555, + "grad_norm": 0.4301897883415222, + "learning_rate": 0.00016252095816166056, + "loss": 1.3342, + "step": 14434 + }, + { + "epoch": 0.18757624142563142, + "grad_norm": 0.37700337171554565, + "learning_rate": 0.00016251835869974916, + "loss": 1.3786, + "step": 14435 + }, + { + "epoch": 0.1875892359695473, + "grad_norm": 0.3972987234592438, + "learning_rate": 0.00016251575923783776, + "loss": 1.618, + "step": 14436 + }, + { + "epoch": 0.18760223051346317, + "grad_norm": 0.42468562722206116, + "learning_rate": 0.00016251315977592638, + "loss": 1.4217, + "step": 14437 + }, + { + "epoch": 0.18761522505737904, + "grad_norm": 0.3062169849872589, + "learning_rate": 0.000162510560314015, + "loss": 1.298, + "step": 14438 + }, + { + "epoch": 0.18762821960129492, + "grad_norm": 0.4621412456035614, + "learning_rate": 0.00016250796085210363, + "loss": 1.3647, + "step": 14439 + }, + { + "epoch": 0.1876412141452108, + "grad_norm": 0.40942925214767456, + "learning_rate": 0.00016250536139019223, + "loss": 1.3456, + "step": 14440 + }, + { + "epoch": 0.18765420868912666, + "grad_norm": 0.4475214183330536, + "learning_rate": 0.00016250276192828085, + "loss": 1.6426, + "step": 14441 + }, + { + "epoch": 0.18766720323304253, + "grad_norm": 0.3709256649017334, + "learning_rate": 0.00016250016246636948, + "loss": 1.3788, + "step": 14442 + }, + { + "epoch": 0.1876801977769584, + "grad_norm": 0.3314778208732605, + "learning_rate": 0.00016249756300445808, + "loss": 1.2837, + "step": 14443 + }, + { + "epoch": 0.18769319232087428, + "grad_norm": 0.47179046273231506, + "learning_rate": 0.0001624949635425467, + "loss": 1.5521, + "step": 14444 + }, + { + "epoch": 0.18770618686479015, + "grad_norm": 0.43415403366088867, + "learning_rate": 0.0001624923640806353, + "loss": 1.4386, + "step": 14445 + }, + { + "epoch": 0.18771918140870603, + "grad_norm": 0.35949957370758057, + "learning_rate": 0.00016248976461872395, + "loss": 1.4333, + "step": 14446 + }, + { + "epoch": 0.1877321759526219, + "grad_norm": 0.4087499976158142, + "learning_rate": 0.00016248716515681255, + "loss": 1.4038, + "step": 14447 + }, + { + "epoch": 0.18774517049653777, + "grad_norm": 0.4740716814994812, + "learning_rate": 0.00016248456569490117, + "loss": 1.6589, + "step": 14448 + }, + { + "epoch": 0.18775816504045365, + "grad_norm": 0.522384524345398, + "learning_rate": 0.00016248196623298977, + "loss": 1.3671, + "step": 14449 + }, + { + "epoch": 0.18777115958436952, + "grad_norm": 0.41290026903152466, + "learning_rate": 0.0001624793667710784, + "loss": 1.5605, + "step": 14450 + }, + { + "epoch": 0.1877841541282854, + "grad_norm": 0.3898678123950958, + "learning_rate": 0.00016247676730916702, + "loss": 1.4625, + "step": 14451 + }, + { + "epoch": 0.18779714867220126, + "grad_norm": 0.3673611879348755, + "learning_rate": 0.00016247416784725562, + "loss": 1.5046, + "step": 14452 + }, + { + "epoch": 0.18781014321611714, + "grad_norm": 0.36359888315200806, + "learning_rate": 0.00016247156838534424, + "loss": 1.3245, + "step": 14453 + }, + { + "epoch": 0.187823137760033, + "grad_norm": 0.394386351108551, + "learning_rate": 0.00016246896892343286, + "loss": 1.4362, + "step": 14454 + }, + { + "epoch": 0.18783613230394888, + "grad_norm": 0.38717418909072876, + "learning_rate": 0.00016246636946152146, + "loss": 1.389, + "step": 14455 + }, + { + "epoch": 0.18784912684786476, + "grad_norm": 0.3920443058013916, + "learning_rate": 0.00016246376999961009, + "loss": 1.4478, + "step": 14456 + }, + { + "epoch": 0.18786212139178063, + "grad_norm": 0.3487685024738312, + "learning_rate": 0.0001624611705376987, + "loss": 1.4958, + "step": 14457 + }, + { + "epoch": 0.1878751159356965, + "grad_norm": 0.2473049908876419, + "learning_rate": 0.00016245857107578734, + "loss": 1.2694, + "step": 14458 + }, + { + "epoch": 0.18788811047961237, + "grad_norm": 0.3660510778427124, + "learning_rate": 0.00016245597161387593, + "loss": 1.4073, + "step": 14459 + }, + { + "epoch": 0.18790110502352825, + "grad_norm": 0.4749138057231903, + "learning_rate": 0.00016245337215196456, + "loss": 1.3103, + "step": 14460 + }, + { + "epoch": 0.18791409956744412, + "grad_norm": 0.3831072151660919, + "learning_rate": 0.00016245077269005318, + "loss": 1.5522, + "step": 14461 + }, + { + "epoch": 0.18792709411136, + "grad_norm": 0.43080803751945496, + "learning_rate": 0.00016244817322814178, + "loss": 1.535, + "step": 14462 + }, + { + "epoch": 0.18794008865527587, + "grad_norm": 0.3907145857810974, + "learning_rate": 0.0001624455737662304, + "loss": 1.4529, + "step": 14463 + }, + { + "epoch": 0.18795308319919174, + "grad_norm": 0.47572651505470276, + "learning_rate": 0.000162442974304319, + "loss": 1.3927, + "step": 14464 + }, + { + "epoch": 0.1879660777431076, + "grad_norm": 0.4210858643054962, + "learning_rate": 0.00016244037484240765, + "loss": 1.321, + "step": 14465 + }, + { + "epoch": 0.18797907228702349, + "grad_norm": 0.36122605204582214, + "learning_rate": 0.00016243777538049625, + "loss": 1.522, + "step": 14466 + }, + { + "epoch": 0.18799206683093936, + "grad_norm": 0.4624292850494385, + "learning_rate": 0.00016243517591858485, + "loss": 1.4885, + "step": 14467 + }, + { + "epoch": 0.18800506137485523, + "grad_norm": 0.46274593472480774, + "learning_rate": 0.00016243257645667347, + "loss": 1.3888, + "step": 14468 + }, + { + "epoch": 0.1880180559187711, + "grad_norm": 0.4571024179458618, + "learning_rate": 0.0001624299769947621, + "loss": 1.3338, + "step": 14469 + }, + { + "epoch": 0.18803105046268698, + "grad_norm": 0.22090663015842438, + "learning_rate": 0.00016242737753285072, + "loss": 1.437, + "step": 14470 + }, + { + "epoch": 0.18804404500660285, + "grad_norm": 0.5126134753227234, + "learning_rate": 0.00016242477807093932, + "loss": 1.5309, + "step": 14471 + }, + { + "epoch": 0.18805703955051872, + "grad_norm": 0.3658894896507263, + "learning_rate": 0.00016242217860902794, + "loss": 1.3647, + "step": 14472 + }, + { + "epoch": 0.1880700340944346, + "grad_norm": 0.36310434341430664, + "learning_rate": 0.00016241957914711657, + "loss": 1.2909, + "step": 14473 + }, + { + "epoch": 0.18808302863835047, + "grad_norm": 0.39679476618766785, + "learning_rate": 0.00016241697968520516, + "loss": 1.4251, + "step": 14474 + }, + { + "epoch": 0.18809602318226634, + "grad_norm": 0.294733464717865, + "learning_rate": 0.0001624143802232938, + "loss": 1.5591, + "step": 14475 + }, + { + "epoch": 0.18810901772618221, + "grad_norm": 0.39871710538864136, + "learning_rate": 0.00016241178076138239, + "loss": 1.2823, + "step": 14476 + }, + { + "epoch": 0.1881220122700981, + "grad_norm": 0.4935499429702759, + "learning_rate": 0.00016240918129947104, + "loss": 1.3357, + "step": 14477 + }, + { + "epoch": 0.18813500681401396, + "grad_norm": 0.48948514461517334, + "learning_rate": 0.00016240658183755964, + "loss": 1.4179, + "step": 14478 + }, + { + "epoch": 0.18814800135792983, + "grad_norm": 0.38988742232322693, + "learning_rate": 0.00016240398237564823, + "loss": 1.4717, + "step": 14479 + }, + { + "epoch": 0.1881609959018457, + "grad_norm": 0.3298640847206116, + "learning_rate": 0.00016240138291373686, + "loss": 1.2856, + "step": 14480 + }, + { + "epoch": 0.18817399044576158, + "grad_norm": 0.3414093554019928, + "learning_rate": 0.00016239878345182548, + "loss": 1.3592, + "step": 14481 + }, + { + "epoch": 0.18818698498967745, + "grad_norm": 0.3263503611087799, + "learning_rate": 0.0001623961839899141, + "loss": 1.6188, + "step": 14482 + }, + { + "epoch": 0.18819997953359333, + "grad_norm": 0.40886253118515015, + "learning_rate": 0.0001623935845280027, + "loss": 1.4467, + "step": 14483 + }, + { + "epoch": 0.1882129740775092, + "grad_norm": 0.47259044647216797, + "learning_rate": 0.00016239098506609133, + "loss": 1.2118, + "step": 14484 + }, + { + "epoch": 0.18822596862142507, + "grad_norm": 0.36327096819877625, + "learning_rate": 0.00016238838560417995, + "loss": 1.4744, + "step": 14485 + }, + { + "epoch": 0.18823896316534094, + "grad_norm": 0.44845834374427795, + "learning_rate": 0.00016238578614226855, + "loss": 1.1621, + "step": 14486 + }, + { + "epoch": 0.18825195770925682, + "grad_norm": 0.3794812262058258, + "learning_rate": 0.00016238318668035717, + "loss": 1.4718, + "step": 14487 + }, + { + "epoch": 0.1882649522531727, + "grad_norm": 0.48565933108329773, + "learning_rate": 0.00016238058721844577, + "loss": 1.5504, + "step": 14488 + }, + { + "epoch": 0.18827794679708856, + "grad_norm": 0.44529977440834045, + "learning_rate": 0.00016237798775653442, + "loss": 1.5978, + "step": 14489 + }, + { + "epoch": 0.18829094134100444, + "grad_norm": 0.42387479543685913, + "learning_rate": 0.00016237538829462302, + "loss": 1.2925, + "step": 14490 + }, + { + "epoch": 0.1883039358849203, + "grad_norm": 0.3885887861251831, + "learning_rate": 0.00016237278883271162, + "loss": 1.6111, + "step": 14491 + }, + { + "epoch": 0.18831693042883618, + "grad_norm": 0.42726266384124756, + "learning_rate": 0.00016237018937080027, + "loss": 1.4592, + "step": 14492 + }, + { + "epoch": 0.18832992497275206, + "grad_norm": 0.414634644985199, + "learning_rate": 0.00016236758990888887, + "loss": 1.566, + "step": 14493 + }, + { + "epoch": 0.18834291951666793, + "grad_norm": 0.4234277009963989, + "learning_rate": 0.0001623649904469775, + "loss": 1.3832, + "step": 14494 + }, + { + "epoch": 0.1883559140605838, + "grad_norm": 0.3759523332118988, + "learning_rate": 0.0001623623909850661, + "loss": 1.5865, + "step": 14495 + }, + { + "epoch": 0.18836890860449967, + "grad_norm": 0.2468254268169403, + "learning_rate": 0.0001623597915231547, + "loss": 1.2233, + "step": 14496 + }, + { + "epoch": 0.18838190314841555, + "grad_norm": 0.4683910608291626, + "learning_rate": 0.00016235719206124334, + "loss": 1.3723, + "step": 14497 + }, + { + "epoch": 0.18839489769233142, + "grad_norm": 0.4523530602455139, + "learning_rate": 0.00016235459259933194, + "loss": 1.4103, + "step": 14498 + }, + { + "epoch": 0.1884078922362473, + "grad_norm": 0.4392837584018707, + "learning_rate": 0.00016235199313742056, + "loss": 1.4252, + "step": 14499 + }, + { + "epoch": 0.18842088678016317, + "grad_norm": 0.386633038520813, + "learning_rate": 0.00016234939367550918, + "loss": 1.4198, + "step": 14500 + }, + { + "epoch": 0.18843388132407904, + "grad_norm": 0.3381975591182709, + "learning_rate": 0.0001623467942135978, + "loss": 1.2635, + "step": 14501 + }, + { + "epoch": 0.1884468758679949, + "grad_norm": 0.42978349328041077, + "learning_rate": 0.0001623441947516864, + "loss": 1.7845, + "step": 14502 + }, + { + "epoch": 0.1884598704119108, + "grad_norm": 0.3976183235645294, + "learning_rate": 0.00016234159528977503, + "loss": 1.4765, + "step": 14503 + }, + { + "epoch": 0.18847286495582669, + "grad_norm": 0.44078388810157776, + "learning_rate": 0.00016233899582786365, + "loss": 1.2836, + "step": 14504 + }, + { + "epoch": 0.18848585949974256, + "grad_norm": 0.390920490026474, + "learning_rate": 0.00016233639636595225, + "loss": 1.4417, + "step": 14505 + }, + { + "epoch": 0.18849885404365843, + "grad_norm": 0.40321066975593567, + "learning_rate": 0.00016233379690404088, + "loss": 1.5436, + "step": 14506 + }, + { + "epoch": 0.1885118485875743, + "grad_norm": 0.4346846044063568, + "learning_rate": 0.00016233119744212947, + "loss": 1.4673, + "step": 14507 + }, + { + "epoch": 0.18852484313149018, + "grad_norm": 0.3635194003582001, + "learning_rate": 0.0001623285979802181, + "loss": 1.5856, + "step": 14508 + }, + { + "epoch": 0.18853783767540605, + "grad_norm": 0.39587870240211487, + "learning_rate": 0.00016232599851830672, + "loss": 1.4979, + "step": 14509 + }, + { + "epoch": 0.18855083221932192, + "grad_norm": 0.327092707157135, + "learning_rate": 0.00016232339905639532, + "loss": 1.3366, + "step": 14510 + }, + { + "epoch": 0.1885638267632378, + "grad_norm": 0.4528246223926544, + "learning_rate": 0.00016232079959448394, + "loss": 1.592, + "step": 14511 + }, + { + "epoch": 0.18857682130715367, + "grad_norm": 0.4175698459148407, + "learning_rate": 0.00016231820013257257, + "loss": 1.5478, + "step": 14512 + }, + { + "epoch": 0.18858981585106954, + "grad_norm": 0.38876259326934814, + "learning_rate": 0.0001623156006706612, + "loss": 1.36, + "step": 14513 + }, + { + "epoch": 0.18860281039498542, + "grad_norm": 0.3338746726512909, + "learning_rate": 0.0001623130012087498, + "loss": 1.3279, + "step": 14514 + }, + { + "epoch": 0.1886158049389013, + "grad_norm": 0.33175089955329895, + "learning_rate": 0.00016231040174683842, + "loss": 1.5069, + "step": 14515 + }, + { + "epoch": 0.18862879948281716, + "grad_norm": 0.434696227312088, + "learning_rate": 0.00016230780228492704, + "loss": 1.5451, + "step": 14516 + }, + { + "epoch": 0.18864179402673303, + "grad_norm": 0.3537701666355133, + "learning_rate": 0.00016230520282301564, + "loss": 1.3026, + "step": 14517 + }, + { + "epoch": 0.1886547885706489, + "grad_norm": 0.3466985821723938, + "learning_rate": 0.00016230260336110426, + "loss": 1.5071, + "step": 14518 + }, + { + "epoch": 0.18866778311456478, + "grad_norm": 0.4111585021018982, + "learning_rate": 0.00016230000389919286, + "loss": 1.6204, + "step": 14519 + }, + { + "epoch": 0.18868077765848065, + "grad_norm": 0.36198440194129944, + "learning_rate": 0.00016229740443728148, + "loss": 1.2028, + "step": 14520 + }, + { + "epoch": 0.18869377220239653, + "grad_norm": 0.44787272810935974, + "learning_rate": 0.0001622948049753701, + "loss": 1.5003, + "step": 14521 + }, + { + "epoch": 0.1887067667463124, + "grad_norm": 0.37772712111473083, + "learning_rate": 0.0001622922055134587, + "loss": 1.4645, + "step": 14522 + }, + { + "epoch": 0.18871976129022827, + "grad_norm": 0.4513583183288574, + "learning_rate": 0.00016228960605154733, + "loss": 1.409, + "step": 14523 + }, + { + "epoch": 0.18873275583414414, + "grad_norm": 0.40195319056510925, + "learning_rate": 0.00016228700658963595, + "loss": 1.4865, + "step": 14524 + }, + { + "epoch": 0.18874575037806002, + "grad_norm": 0.3088810443878174, + "learning_rate": 0.00016228440712772458, + "loss": 1.5114, + "step": 14525 + }, + { + "epoch": 0.1887587449219759, + "grad_norm": 0.36387673020362854, + "learning_rate": 0.00016228180766581318, + "loss": 1.2078, + "step": 14526 + }, + { + "epoch": 0.18877173946589176, + "grad_norm": 0.397029846906662, + "learning_rate": 0.0001622792082039018, + "loss": 1.4947, + "step": 14527 + }, + { + "epoch": 0.18878473400980764, + "grad_norm": 0.3769640326499939, + "learning_rate": 0.00016227660874199043, + "loss": 1.4792, + "step": 14528 + }, + { + "epoch": 0.1887977285537235, + "grad_norm": 0.3839905858039856, + "learning_rate": 0.00016227400928007902, + "loss": 1.227, + "step": 14529 + }, + { + "epoch": 0.18881072309763938, + "grad_norm": 0.3760770261287689, + "learning_rate": 0.00016227140981816765, + "loss": 1.5946, + "step": 14530 + }, + { + "epoch": 0.18882371764155526, + "grad_norm": 0.3543286919593811, + "learning_rate": 0.00016226881035625627, + "loss": 1.3848, + "step": 14531 + }, + { + "epoch": 0.18883671218547113, + "grad_norm": 0.47472327947616577, + "learning_rate": 0.0001622662108943449, + "loss": 1.3272, + "step": 14532 + }, + { + "epoch": 0.188849706729387, + "grad_norm": 0.33231085538864136, + "learning_rate": 0.0001622636114324335, + "loss": 1.2445, + "step": 14533 + }, + { + "epoch": 0.18886270127330287, + "grad_norm": 0.3936046361923218, + "learning_rate": 0.0001622610119705221, + "loss": 1.5161, + "step": 14534 + }, + { + "epoch": 0.18887569581721875, + "grad_norm": 0.46012234687805176, + "learning_rate": 0.00016225841250861074, + "loss": 1.5214, + "step": 14535 + }, + { + "epoch": 0.18888869036113462, + "grad_norm": 0.3569498658180237, + "learning_rate": 0.00016225581304669934, + "loss": 1.2607, + "step": 14536 + }, + { + "epoch": 0.1889016849050505, + "grad_norm": 0.4469507038593292, + "learning_rate": 0.00016225321358478796, + "loss": 1.5127, + "step": 14537 + }, + { + "epoch": 0.18891467944896637, + "grad_norm": 0.32847365736961365, + "learning_rate": 0.00016225061412287656, + "loss": 1.4589, + "step": 14538 + }, + { + "epoch": 0.18892767399288224, + "grad_norm": 0.3133804202079773, + "learning_rate": 0.0001622480146609652, + "loss": 1.3264, + "step": 14539 + }, + { + "epoch": 0.1889406685367981, + "grad_norm": 0.38544103503227234, + "learning_rate": 0.0001622454151990538, + "loss": 1.2701, + "step": 14540 + }, + { + "epoch": 0.18895366308071398, + "grad_norm": 0.29341474175453186, + "learning_rate": 0.0001622428157371424, + "loss": 1.3892, + "step": 14541 + }, + { + "epoch": 0.18896665762462986, + "grad_norm": 0.3488509953022003, + "learning_rate": 0.00016224021627523103, + "loss": 1.3131, + "step": 14542 + }, + { + "epoch": 0.18897965216854573, + "grad_norm": 0.39652740955352783, + "learning_rate": 0.00016223761681331966, + "loss": 1.3547, + "step": 14543 + }, + { + "epoch": 0.1889926467124616, + "grad_norm": 0.3201472759246826, + "learning_rate": 0.00016223501735140828, + "loss": 1.4861, + "step": 14544 + }, + { + "epoch": 0.18900564125637748, + "grad_norm": 0.47403889894485474, + "learning_rate": 0.00016223241788949688, + "loss": 1.6844, + "step": 14545 + }, + { + "epoch": 0.18901863580029335, + "grad_norm": 0.3672645390033722, + "learning_rate": 0.00016222981842758548, + "loss": 1.5306, + "step": 14546 + }, + { + "epoch": 0.18903163034420922, + "grad_norm": 0.3776554763317108, + "learning_rate": 0.00016222721896567413, + "loss": 1.3687, + "step": 14547 + }, + { + "epoch": 0.1890446248881251, + "grad_norm": 0.36863601207733154, + "learning_rate": 0.00016222461950376273, + "loss": 1.5099, + "step": 14548 + }, + { + "epoch": 0.18905761943204097, + "grad_norm": 0.3465145230293274, + "learning_rate": 0.00016222202004185135, + "loss": 1.3275, + "step": 14549 + }, + { + "epoch": 0.18907061397595684, + "grad_norm": 0.45081740617752075, + "learning_rate": 0.00016221942057993995, + "loss": 1.3354, + "step": 14550 + }, + { + "epoch": 0.18908360851987271, + "grad_norm": 0.4660468101501465, + "learning_rate": 0.00016221682111802857, + "loss": 1.5158, + "step": 14551 + }, + { + "epoch": 0.1890966030637886, + "grad_norm": 0.41518107056617737, + "learning_rate": 0.0001622142216561172, + "loss": 1.3131, + "step": 14552 + }, + { + "epoch": 0.18910959760770446, + "grad_norm": 0.5014123320579529, + "learning_rate": 0.0001622116221942058, + "loss": 1.5265, + "step": 14553 + }, + { + "epoch": 0.18912259215162033, + "grad_norm": 0.4716885983943939, + "learning_rate": 0.00016220902273229442, + "loss": 1.4843, + "step": 14554 + }, + { + "epoch": 0.1891355866955362, + "grad_norm": 0.3320855498313904, + "learning_rate": 0.00016220642327038304, + "loss": 1.3151, + "step": 14555 + }, + { + "epoch": 0.18914858123945208, + "grad_norm": 0.491390198469162, + "learning_rate": 0.00016220382380847167, + "loss": 1.4699, + "step": 14556 + }, + { + "epoch": 0.18916157578336795, + "grad_norm": 0.38898661732673645, + "learning_rate": 0.00016220122434656026, + "loss": 1.1798, + "step": 14557 + }, + { + "epoch": 0.18917457032728383, + "grad_norm": 0.40661144256591797, + "learning_rate": 0.00016219862488464886, + "loss": 1.4118, + "step": 14558 + }, + { + "epoch": 0.1891875648711997, + "grad_norm": 0.4319323003292084, + "learning_rate": 0.00016219602542273751, + "loss": 1.511, + "step": 14559 + }, + { + "epoch": 0.18920055941511557, + "grad_norm": 0.38511526584625244, + "learning_rate": 0.0001621934259608261, + "loss": 1.3516, + "step": 14560 + }, + { + "epoch": 0.18921355395903144, + "grad_norm": 0.32243314385414124, + "learning_rate": 0.00016219082649891474, + "loss": 1.3032, + "step": 14561 + }, + { + "epoch": 0.18922654850294732, + "grad_norm": 0.47862011194229126, + "learning_rate": 0.00016218822703700333, + "loss": 1.4037, + "step": 14562 + }, + { + "epoch": 0.1892395430468632, + "grad_norm": 0.39550742506980896, + "learning_rate": 0.00016218562757509196, + "loss": 1.2888, + "step": 14563 + }, + { + "epoch": 0.18925253759077906, + "grad_norm": 0.442732572555542, + "learning_rate": 0.00016218302811318058, + "loss": 1.5183, + "step": 14564 + }, + { + "epoch": 0.18926553213469494, + "grad_norm": 0.361819863319397, + "learning_rate": 0.00016218042865126918, + "loss": 1.4079, + "step": 14565 + }, + { + "epoch": 0.1892785266786108, + "grad_norm": 0.3531530201435089, + "learning_rate": 0.00016217782918935783, + "loss": 1.2965, + "step": 14566 + }, + { + "epoch": 0.18929152122252668, + "grad_norm": 0.38613361120224, + "learning_rate": 0.00016217522972744643, + "loss": 1.2648, + "step": 14567 + }, + { + "epoch": 0.18930451576644255, + "grad_norm": 0.3453353941440582, + "learning_rate": 0.00016217263026553505, + "loss": 1.409, + "step": 14568 + }, + { + "epoch": 0.18931751031035843, + "grad_norm": 0.5640254020690918, + "learning_rate": 0.00016217003080362365, + "loss": 1.532, + "step": 14569 + }, + { + "epoch": 0.1893305048542743, + "grad_norm": 0.33535224199295044, + "learning_rate": 0.00016216743134171227, + "loss": 1.2854, + "step": 14570 + }, + { + "epoch": 0.18934349939819017, + "grad_norm": 0.32907330989837646, + "learning_rate": 0.0001621648318798009, + "loss": 1.4261, + "step": 14571 + }, + { + "epoch": 0.18935649394210605, + "grad_norm": 0.36334043741226196, + "learning_rate": 0.0001621622324178895, + "loss": 1.4106, + "step": 14572 + }, + { + "epoch": 0.18936948848602192, + "grad_norm": 0.4157494008541107, + "learning_rate": 0.00016215963295597812, + "loss": 1.4241, + "step": 14573 + }, + { + "epoch": 0.1893824830299378, + "grad_norm": 0.39280542731285095, + "learning_rate": 0.00016215703349406675, + "loss": 1.3342, + "step": 14574 + }, + { + "epoch": 0.18939547757385367, + "grad_norm": 0.32324910163879395, + "learning_rate": 0.00016215443403215534, + "loss": 1.5257, + "step": 14575 + }, + { + "epoch": 0.18940847211776954, + "grad_norm": 0.4282236695289612, + "learning_rate": 0.00016215183457024397, + "loss": 1.4957, + "step": 14576 + }, + { + "epoch": 0.1894214666616854, + "grad_norm": 0.37148046493530273, + "learning_rate": 0.00016214923510833256, + "loss": 1.394, + "step": 14577 + }, + { + "epoch": 0.18943446120560128, + "grad_norm": 0.41627007722854614, + "learning_rate": 0.00016214663564642122, + "loss": 1.4561, + "step": 14578 + }, + { + "epoch": 0.18944745574951719, + "grad_norm": 0.44311755895614624, + "learning_rate": 0.0001621440361845098, + "loss": 1.5579, + "step": 14579 + }, + { + "epoch": 0.18946045029343306, + "grad_norm": 0.35448744893074036, + "learning_rate": 0.00016214143672259844, + "loss": 1.2271, + "step": 14580 + }, + { + "epoch": 0.18947344483734893, + "grad_norm": 0.4766080677509308, + "learning_rate": 0.00016213883726068704, + "loss": 1.4918, + "step": 14581 + }, + { + "epoch": 0.1894864393812648, + "grad_norm": 0.3092266917228699, + "learning_rate": 0.00016213623779877566, + "loss": 1.3528, + "step": 14582 + }, + { + "epoch": 0.18949943392518068, + "grad_norm": 0.38101866841316223, + "learning_rate": 0.00016213363833686428, + "loss": 1.5183, + "step": 14583 + }, + { + "epoch": 0.18951242846909655, + "grad_norm": 0.49301955103874207, + "learning_rate": 0.00016213103887495288, + "loss": 1.5361, + "step": 14584 + }, + { + "epoch": 0.18952542301301242, + "grad_norm": 0.4606168568134308, + "learning_rate": 0.0001621284394130415, + "loss": 1.4337, + "step": 14585 + }, + { + "epoch": 0.1895384175569283, + "grad_norm": 0.3919869661331177, + "learning_rate": 0.00016212583995113013, + "loss": 1.4166, + "step": 14586 + }, + { + "epoch": 0.18955141210084417, + "grad_norm": 0.4393739700317383, + "learning_rate": 0.00016212324048921876, + "loss": 1.4197, + "step": 14587 + }, + { + "epoch": 0.18956440664476004, + "grad_norm": 0.4152737855911255, + "learning_rate": 0.00016212064102730735, + "loss": 1.5014, + "step": 14588 + }, + { + "epoch": 0.18957740118867591, + "grad_norm": 0.29390281438827515, + "learning_rate": 0.00016211804156539595, + "loss": 1.3837, + "step": 14589 + }, + { + "epoch": 0.1895903957325918, + "grad_norm": 0.409537136554718, + "learning_rate": 0.0001621154421034846, + "loss": 1.3489, + "step": 14590 + }, + { + "epoch": 0.18960339027650766, + "grad_norm": 0.2716923654079437, + "learning_rate": 0.0001621128426415732, + "loss": 1.5005, + "step": 14591 + }, + { + "epoch": 0.18961638482042353, + "grad_norm": 0.356402188539505, + "learning_rate": 0.00016211024317966182, + "loss": 1.386, + "step": 14592 + }, + { + "epoch": 0.1896293793643394, + "grad_norm": 0.5671102404594421, + "learning_rate": 0.00016210764371775042, + "loss": 1.4691, + "step": 14593 + }, + { + "epoch": 0.18964237390825528, + "grad_norm": 0.41866335272789, + "learning_rate": 0.00016210504425583905, + "loss": 1.5245, + "step": 14594 + }, + { + "epoch": 0.18965536845217115, + "grad_norm": 0.39142248034477234, + "learning_rate": 0.00016210244479392767, + "loss": 1.3377, + "step": 14595 + }, + { + "epoch": 0.18966836299608703, + "grad_norm": 0.33140134811401367, + "learning_rate": 0.00016209984533201627, + "loss": 1.4644, + "step": 14596 + }, + { + "epoch": 0.1896813575400029, + "grad_norm": 0.4844195544719696, + "learning_rate": 0.0001620972458701049, + "loss": 1.5659, + "step": 14597 + }, + { + "epoch": 0.18969435208391877, + "grad_norm": 0.471505731344223, + "learning_rate": 0.00016209464640819352, + "loss": 1.4393, + "step": 14598 + }, + { + "epoch": 0.18970734662783464, + "grad_norm": 0.46963560581207275, + "learning_rate": 0.00016209204694628214, + "loss": 1.3687, + "step": 14599 + }, + { + "epoch": 0.18972034117175052, + "grad_norm": 0.3357981741428375, + "learning_rate": 0.00016208944748437074, + "loss": 1.392, + "step": 14600 + }, + { + "epoch": 0.1897333357156664, + "grad_norm": 0.27924999594688416, + "learning_rate": 0.00016208684802245934, + "loss": 1.3199, + "step": 14601 + }, + { + "epoch": 0.18974633025958226, + "grad_norm": 0.4085221588611603, + "learning_rate": 0.000162084248560548, + "loss": 1.2778, + "step": 14602 + }, + { + "epoch": 0.18975932480349814, + "grad_norm": 0.3391517102718353, + "learning_rate": 0.00016208164909863658, + "loss": 1.3443, + "step": 14603 + }, + { + "epoch": 0.189772319347414, + "grad_norm": 0.3961552083492279, + "learning_rate": 0.0001620790496367252, + "loss": 1.2885, + "step": 14604 + }, + { + "epoch": 0.18978531389132988, + "grad_norm": 0.34779369831085205, + "learning_rate": 0.00016207645017481383, + "loss": 1.5441, + "step": 14605 + }, + { + "epoch": 0.18979830843524576, + "grad_norm": 0.35380983352661133, + "learning_rate": 0.00016207385071290243, + "loss": 1.3273, + "step": 14606 + }, + { + "epoch": 0.18981130297916163, + "grad_norm": 0.35235559940338135, + "learning_rate": 0.00016207125125099106, + "loss": 1.4866, + "step": 14607 + }, + { + "epoch": 0.1898242975230775, + "grad_norm": 0.38930296897888184, + "learning_rate": 0.00016206865178907965, + "loss": 1.3539, + "step": 14608 + }, + { + "epoch": 0.18983729206699337, + "grad_norm": 0.42879459261894226, + "learning_rate": 0.0001620660523271683, + "loss": 1.4561, + "step": 14609 + }, + { + "epoch": 0.18985028661090925, + "grad_norm": 0.37104061245918274, + "learning_rate": 0.0001620634528652569, + "loss": 1.447, + "step": 14610 + }, + { + "epoch": 0.18986328115482512, + "grad_norm": 0.44099271297454834, + "learning_rate": 0.00016206085340334553, + "loss": 1.5425, + "step": 14611 + }, + { + "epoch": 0.189876275698741, + "grad_norm": 0.4174787104129791, + "learning_rate": 0.00016205825394143412, + "loss": 1.3412, + "step": 14612 + }, + { + "epoch": 0.18988927024265687, + "grad_norm": 0.33044126629829407, + "learning_rate": 0.00016205565447952275, + "loss": 1.4188, + "step": 14613 + }, + { + "epoch": 0.18990226478657274, + "grad_norm": 0.430999755859375, + "learning_rate": 0.00016205305501761137, + "loss": 1.4897, + "step": 14614 + }, + { + "epoch": 0.1899152593304886, + "grad_norm": 0.45416224002838135, + "learning_rate": 0.00016205045555569997, + "loss": 1.4092, + "step": 14615 + }, + { + "epoch": 0.18992825387440448, + "grad_norm": 0.3304848372936249, + "learning_rate": 0.0001620478560937886, + "loss": 1.1832, + "step": 14616 + }, + { + "epoch": 0.18994124841832036, + "grad_norm": 0.28313982486724854, + "learning_rate": 0.00016204525663187722, + "loss": 1.2612, + "step": 14617 + }, + { + "epoch": 0.18995424296223623, + "grad_norm": 0.41987302899360657, + "learning_rate": 0.00016204265716996582, + "loss": 1.5434, + "step": 14618 + }, + { + "epoch": 0.1899672375061521, + "grad_norm": 0.3710688352584839, + "learning_rate": 0.00016204005770805444, + "loss": 1.5838, + "step": 14619 + }, + { + "epoch": 0.18998023205006798, + "grad_norm": 0.4317571818828583, + "learning_rate": 0.00016203745824614304, + "loss": 1.4844, + "step": 14620 + }, + { + "epoch": 0.18999322659398385, + "grad_norm": 0.4230884313583374, + "learning_rate": 0.0001620348587842317, + "loss": 1.3868, + "step": 14621 + }, + { + "epoch": 0.19000622113789972, + "grad_norm": 0.3468683063983917, + "learning_rate": 0.0001620322593223203, + "loss": 1.3814, + "step": 14622 + }, + { + "epoch": 0.1900192156818156, + "grad_norm": 0.42843106389045715, + "learning_rate": 0.0001620296598604089, + "loss": 1.4123, + "step": 14623 + }, + { + "epoch": 0.19003221022573147, + "grad_norm": 0.36498942971229553, + "learning_rate": 0.0001620270603984975, + "loss": 1.3291, + "step": 14624 + }, + { + "epoch": 0.19004520476964734, + "grad_norm": 0.3694465458393097, + "learning_rate": 0.00016202446093658613, + "loss": 1.4638, + "step": 14625 + }, + { + "epoch": 0.19005819931356321, + "grad_norm": 0.5853530168533325, + "learning_rate": 0.00016202186147467476, + "loss": 1.4738, + "step": 14626 + }, + { + "epoch": 0.1900711938574791, + "grad_norm": 0.40372177958488464, + "learning_rate": 0.00016201926201276336, + "loss": 1.4716, + "step": 14627 + }, + { + "epoch": 0.19008418840139496, + "grad_norm": 0.39939644932746887, + "learning_rate": 0.00016201666255085198, + "loss": 1.3647, + "step": 14628 + }, + { + "epoch": 0.19009718294531083, + "grad_norm": 0.7032272815704346, + "learning_rate": 0.0001620140630889406, + "loss": 1.2941, + "step": 14629 + }, + { + "epoch": 0.1901101774892267, + "grad_norm": 0.4274415373802185, + "learning_rate": 0.0001620114636270292, + "loss": 1.2683, + "step": 14630 + }, + { + "epoch": 0.19012317203314258, + "grad_norm": 0.3828834593296051, + "learning_rate": 0.00016200886416511783, + "loss": 1.4966, + "step": 14631 + }, + { + "epoch": 0.19013616657705845, + "grad_norm": 0.41696980595588684, + "learning_rate": 0.00016200626470320642, + "loss": 1.57, + "step": 14632 + }, + { + "epoch": 0.19014916112097432, + "grad_norm": 0.3982534110546112, + "learning_rate": 0.00016200366524129507, + "loss": 1.5723, + "step": 14633 + }, + { + "epoch": 0.1901621556648902, + "grad_norm": 0.40588676929473877, + "learning_rate": 0.00016200106577938367, + "loss": 1.3448, + "step": 14634 + }, + { + "epoch": 0.19017515020880607, + "grad_norm": 0.3777240514755249, + "learning_rate": 0.0001619984663174723, + "loss": 1.2801, + "step": 14635 + }, + { + "epoch": 0.19018814475272194, + "grad_norm": 0.4189203381538391, + "learning_rate": 0.0001619958668555609, + "loss": 1.3555, + "step": 14636 + }, + { + "epoch": 0.19020113929663782, + "grad_norm": 0.35768625140190125, + "learning_rate": 0.00016199326739364952, + "loss": 1.407, + "step": 14637 + }, + { + "epoch": 0.1902141338405537, + "grad_norm": 0.4081965982913971, + "learning_rate": 0.00016199066793173814, + "loss": 1.4959, + "step": 14638 + }, + { + "epoch": 0.19022712838446956, + "grad_norm": 0.37342196702957153, + "learning_rate": 0.00016198806846982674, + "loss": 1.5566, + "step": 14639 + }, + { + "epoch": 0.19024012292838544, + "grad_norm": 0.3264179527759552, + "learning_rate": 0.0001619854690079154, + "loss": 1.4074, + "step": 14640 + }, + { + "epoch": 0.1902531174723013, + "grad_norm": 0.3984972834587097, + "learning_rate": 0.000161982869546004, + "loss": 1.4893, + "step": 14641 + }, + { + "epoch": 0.19026611201621718, + "grad_norm": 0.4176251292228699, + "learning_rate": 0.0001619802700840926, + "loss": 1.4782, + "step": 14642 + }, + { + "epoch": 0.19027910656013305, + "grad_norm": 0.38510435819625854, + "learning_rate": 0.0001619776706221812, + "loss": 1.4315, + "step": 14643 + }, + { + "epoch": 0.19029210110404893, + "grad_norm": 0.3662984371185303, + "learning_rate": 0.00016197507116026984, + "loss": 1.3816, + "step": 14644 + }, + { + "epoch": 0.1903050956479648, + "grad_norm": 0.34030601382255554, + "learning_rate": 0.00016197247169835846, + "loss": 1.2881, + "step": 14645 + }, + { + "epoch": 0.19031809019188067, + "grad_norm": 0.40683484077453613, + "learning_rate": 0.00016196987223644706, + "loss": 1.4184, + "step": 14646 + }, + { + "epoch": 0.19033108473579655, + "grad_norm": 0.4211297929286957, + "learning_rate": 0.00016196727277453568, + "loss": 1.4294, + "step": 14647 + }, + { + "epoch": 0.19034407927971242, + "grad_norm": 0.5057207942008972, + "learning_rate": 0.0001619646733126243, + "loss": 1.4789, + "step": 14648 + }, + { + "epoch": 0.1903570738236283, + "grad_norm": 0.4111502170562744, + "learning_rate": 0.0001619620738507129, + "loss": 1.3301, + "step": 14649 + }, + { + "epoch": 0.19037006836754417, + "grad_norm": 0.31516367197036743, + "learning_rate": 0.00016195947438880153, + "loss": 1.4038, + "step": 14650 + }, + { + "epoch": 0.19038306291146004, + "grad_norm": 0.38898375630378723, + "learning_rate": 0.00016195687492689013, + "loss": 1.7325, + "step": 14651 + }, + { + "epoch": 0.1903960574553759, + "grad_norm": 0.3711279332637787, + "learning_rate": 0.00016195427546497878, + "loss": 1.3553, + "step": 14652 + }, + { + "epoch": 0.19040905199929178, + "grad_norm": 0.4377843141555786, + "learning_rate": 0.00016195167600306737, + "loss": 1.5221, + "step": 14653 + }, + { + "epoch": 0.19042204654320766, + "grad_norm": 0.4867298901081085, + "learning_rate": 0.000161949076541156, + "loss": 1.4911, + "step": 14654 + }, + { + "epoch": 0.19043504108712356, + "grad_norm": 0.41919898986816406, + "learning_rate": 0.0001619464770792446, + "loss": 1.2737, + "step": 14655 + }, + { + "epoch": 0.19044803563103943, + "grad_norm": 0.46002867817878723, + "learning_rate": 0.00016194387761733322, + "loss": 1.4875, + "step": 14656 + }, + { + "epoch": 0.1904610301749553, + "grad_norm": 0.3393423557281494, + "learning_rate": 0.00016194127815542185, + "loss": 1.4528, + "step": 14657 + }, + { + "epoch": 0.19047402471887118, + "grad_norm": 0.37846317887306213, + "learning_rate": 0.00016193867869351044, + "loss": 1.2983, + "step": 14658 + }, + { + "epoch": 0.19048701926278705, + "grad_norm": 0.38445013761520386, + "learning_rate": 0.00016193607923159907, + "loss": 1.4298, + "step": 14659 + }, + { + "epoch": 0.19050001380670292, + "grad_norm": 0.3944438099861145, + "learning_rate": 0.0001619334797696877, + "loss": 1.4748, + "step": 14660 + }, + { + "epoch": 0.1905130083506188, + "grad_norm": 0.45238086581230164, + "learning_rate": 0.0001619308803077763, + "loss": 1.3561, + "step": 14661 + }, + { + "epoch": 0.19052600289453467, + "grad_norm": 0.3778780996799469, + "learning_rate": 0.00016192828084586491, + "loss": 1.3976, + "step": 14662 + }, + { + "epoch": 0.19053899743845054, + "grad_norm": 0.436320424079895, + "learning_rate": 0.0001619256813839535, + "loss": 1.4859, + "step": 14663 + }, + { + "epoch": 0.19055199198236641, + "grad_norm": 0.5251297354698181, + "learning_rate": 0.00016192308192204216, + "loss": 1.6171, + "step": 14664 + }, + { + "epoch": 0.1905649865262823, + "grad_norm": 0.4117533564567566, + "learning_rate": 0.00016192048246013076, + "loss": 1.3532, + "step": 14665 + }, + { + "epoch": 0.19057798107019816, + "grad_norm": 0.36731234192848206, + "learning_rate": 0.00016191788299821938, + "loss": 1.3374, + "step": 14666 + }, + { + "epoch": 0.19059097561411403, + "grad_norm": 0.3928210437297821, + "learning_rate": 0.00016191528353630798, + "loss": 1.5527, + "step": 14667 + }, + { + "epoch": 0.1906039701580299, + "grad_norm": 0.40122100710868835, + "learning_rate": 0.0001619126840743966, + "loss": 1.559, + "step": 14668 + }, + { + "epoch": 0.19061696470194578, + "grad_norm": 0.4511236846446991, + "learning_rate": 0.00016191008461248523, + "loss": 1.6414, + "step": 14669 + }, + { + "epoch": 0.19062995924586165, + "grad_norm": 0.46413132548332214, + "learning_rate": 0.00016190748515057383, + "loss": 1.3942, + "step": 14670 + }, + { + "epoch": 0.19064295378977753, + "grad_norm": 0.41753649711608887, + "learning_rate": 0.00016190488568866245, + "loss": 1.4066, + "step": 14671 + }, + { + "epoch": 0.1906559483336934, + "grad_norm": 0.43236780166625977, + "learning_rate": 0.00016190228622675108, + "loss": 1.3647, + "step": 14672 + }, + { + "epoch": 0.19066894287760927, + "grad_norm": 0.41736575961112976, + "learning_rate": 0.00016189968676483967, + "loss": 1.514, + "step": 14673 + }, + { + "epoch": 0.19068193742152514, + "grad_norm": 0.4440431296825409, + "learning_rate": 0.0001618970873029283, + "loss": 1.5375, + "step": 14674 + }, + { + "epoch": 0.19069493196544102, + "grad_norm": 0.4027050733566284, + "learning_rate": 0.0001618944878410169, + "loss": 1.2582, + "step": 14675 + }, + { + "epoch": 0.1907079265093569, + "grad_norm": 0.4415140151977539, + "learning_rate": 0.00016189188837910555, + "loss": 1.3887, + "step": 14676 + }, + { + "epoch": 0.19072092105327276, + "grad_norm": 0.4337286651134491, + "learning_rate": 0.00016188928891719415, + "loss": 1.3946, + "step": 14677 + }, + { + "epoch": 0.19073391559718864, + "grad_norm": 0.4080226719379425, + "learning_rate": 0.00016188668945528277, + "loss": 1.4649, + "step": 14678 + }, + { + "epoch": 0.1907469101411045, + "grad_norm": 0.31970280408859253, + "learning_rate": 0.0001618840899933714, + "loss": 1.2141, + "step": 14679 + }, + { + "epoch": 0.19075990468502038, + "grad_norm": 0.41169095039367676, + "learning_rate": 0.00016188149053146, + "loss": 1.4313, + "step": 14680 + }, + { + "epoch": 0.19077289922893625, + "grad_norm": 0.3573702871799469, + "learning_rate": 0.00016187889106954862, + "loss": 1.4711, + "step": 14681 + }, + { + "epoch": 0.19078589377285213, + "grad_norm": 0.3207574784755707, + "learning_rate": 0.00016187629160763721, + "loss": 1.2906, + "step": 14682 + }, + { + "epoch": 0.190798888316768, + "grad_norm": 0.392945796251297, + "learning_rate": 0.00016187369214572587, + "loss": 1.3037, + "step": 14683 + }, + { + "epoch": 0.19081188286068387, + "grad_norm": 0.3822785019874573, + "learning_rate": 0.00016187109268381446, + "loss": 1.3718, + "step": 14684 + }, + { + "epoch": 0.19082487740459975, + "grad_norm": 0.35723164677619934, + "learning_rate": 0.00016186849322190306, + "loss": 1.3592, + "step": 14685 + }, + { + "epoch": 0.19083787194851562, + "grad_norm": 0.40905579924583435, + "learning_rate": 0.00016186589375999168, + "loss": 1.5063, + "step": 14686 + }, + { + "epoch": 0.1908508664924315, + "grad_norm": 0.2638438045978546, + "learning_rate": 0.0001618632942980803, + "loss": 1.2266, + "step": 14687 + }, + { + "epoch": 0.19086386103634737, + "grad_norm": 0.32551392912864685, + "learning_rate": 0.00016186069483616893, + "loss": 1.3832, + "step": 14688 + }, + { + "epoch": 0.19087685558026324, + "grad_norm": 0.3668440580368042, + "learning_rate": 0.00016185809537425753, + "loss": 1.3449, + "step": 14689 + }, + { + "epoch": 0.1908898501241791, + "grad_norm": 0.3846147954463959, + "learning_rate": 0.00016185549591234616, + "loss": 1.4486, + "step": 14690 + }, + { + "epoch": 0.19090284466809498, + "grad_norm": 0.48661091923713684, + "learning_rate": 0.00016185289645043478, + "loss": 1.4397, + "step": 14691 + }, + { + "epoch": 0.19091583921201086, + "grad_norm": 0.42199310660362244, + "learning_rate": 0.00016185029698852338, + "loss": 1.5245, + "step": 14692 + }, + { + "epoch": 0.19092883375592673, + "grad_norm": 0.40835314989089966, + "learning_rate": 0.000161847697526612, + "loss": 1.4548, + "step": 14693 + }, + { + "epoch": 0.1909418282998426, + "grad_norm": 0.3708430230617523, + "learning_rate": 0.0001618450980647006, + "loss": 1.3529, + "step": 14694 + }, + { + "epoch": 0.19095482284375848, + "grad_norm": 0.4395867586135864, + "learning_rate": 0.00016184249860278925, + "loss": 1.5337, + "step": 14695 + }, + { + "epoch": 0.19096781738767435, + "grad_norm": 0.34248197078704834, + "learning_rate": 0.00016183989914087785, + "loss": 1.3031, + "step": 14696 + }, + { + "epoch": 0.19098081193159022, + "grad_norm": 0.4046511948108673, + "learning_rate": 0.00016183729967896645, + "loss": 1.3646, + "step": 14697 + }, + { + "epoch": 0.1909938064755061, + "grad_norm": 0.40073636174201965, + "learning_rate": 0.00016183470021705507, + "loss": 1.5307, + "step": 14698 + }, + { + "epoch": 0.19100680101942197, + "grad_norm": 0.4188086688518524, + "learning_rate": 0.0001618321007551437, + "loss": 1.3921, + "step": 14699 + }, + { + "epoch": 0.19101979556333784, + "grad_norm": 0.42957553267478943, + "learning_rate": 0.00016182950129323232, + "loss": 1.2958, + "step": 14700 + }, + { + "epoch": 0.1910327901072537, + "grad_norm": 0.40109172463417053, + "learning_rate": 0.00016182690183132092, + "loss": 1.5406, + "step": 14701 + }, + { + "epoch": 0.1910457846511696, + "grad_norm": 0.46192339062690735, + "learning_rate": 0.00016182430236940954, + "loss": 1.4243, + "step": 14702 + }, + { + "epoch": 0.19105877919508546, + "grad_norm": 0.4547237455844879, + "learning_rate": 0.00016182170290749817, + "loss": 1.5191, + "step": 14703 + }, + { + "epoch": 0.19107177373900133, + "grad_norm": 0.37635189294815063, + "learning_rate": 0.00016181910344558676, + "loss": 1.3985, + "step": 14704 + }, + { + "epoch": 0.1910847682829172, + "grad_norm": 0.41376179456710815, + "learning_rate": 0.0001618165039836754, + "loss": 1.5099, + "step": 14705 + }, + { + "epoch": 0.19109776282683308, + "grad_norm": 0.4531638026237488, + "learning_rate": 0.00016181390452176398, + "loss": 1.4845, + "step": 14706 + }, + { + "epoch": 0.19111075737074895, + "grad_norm": 0.37797045707702637, + "learning_rate": 0.00016181130505985264, + "loss": 1.5374, + "step": 14707 + }, + { + "epoch": 0.19112375191466482, + "grad_norm": 0.3327268064022064, + "learning_rate": 0.00016180870559794123, + "loss": 1.4572, + "step": 14708 + }, + { + "epoch": 0.1911367464585807, + "grad_norm": 0.3709682524204254, + "learning_rate": 0.00016180610613602986, + "loss": 1.493, + "step": 14709 + }, + { + "epoch": 0.19114974100249657, + "grad_norm": 0.41870760917663574, + "learning_rate": 0.00016180350667411846, + "loss": 1.6566, + "step": 14710 + }, + { + "epoch": 0.19116273554641244, + "grad_norm": 0.43770626187324524, + "learning_rate": 0.00016180090721220708, + "loss": 1.2873, + "step": 14711 + }, + { + "epoch": 0.19117573009032832, + "grad_norm": 0.3818780481815338, + "learning_rate": 0.0001617983077502957, + "loss": 1.2588, + "step": 14712 + }, + { + "epoch": 0.1911887246342442, + "grad_norm": 0.3426341414451599, + "learning_rate": 0.0001617957082883843, + "loss": 1.4366, + "step": 14713 + }, + { + "epoch": 0.19120171917816006, + "grad_norm": 0.43491262197494507, + "learning_rate": 0.00016179310882647293, + "loss": 1.5368, + "step": 14714 + }, + { + "epoch": 0.19121471372207594, + "grad_norm": 0.4278816878795624, + "learning_rate": 0.00016179050936456155, + "loss": 1.3731, + "step": 14715 + }, + { + "epoch": 0.1912277082659918, + "grad_norm": 0.46045053005218506, + "learning_rate": 0.00016178790990265015, + "loss": 1.5695, + "step": 14716 + }, + { + "epoch": 0.19124070280990768, + "grad_norm": 0.7254002094268799, + "learning_rate": 0.00016178531044073877, + "loss": 1.4771, + "step": 14717 + }, + { + "epoch": 0.19125369735382355, + "grad_norm": 0.3770655393600464, + "learning_rate": 0.0001617827109788274, + "loss": 1.2624, + "step": 14718 + }, + { + "epoch": 0.19126669189773943, + "grad_norm": 0.3826221227645874, + "learning_rate": 0.00016178011151691602, + "loss": 1.1633, + "step": 14719 + }, + { + "epoch": 0.1912796864416553, + "grad_norm": 0.44069963693618774, + "learning_rate": 0.00016177751205500462, + "loss": 1.5034, + "step": 14720 + }, + { + "epoch": 0.19129268098557117, + "grad_norm": 0.399186372756958, + "learning_rate": 0.00016177491259309324, + "loss": 1.3803, + "step": 14721 + }, + { + "epoch": 0.19130567552948705, + "grad_norm": 0.3506208658218384, + "learning_rate": 0.00016177231313118187, + "loss": 1.2111, + "step": 14722 + }, + { + "epoch": 0.19131867007340292, + "grad_norm": 0.3993605077266693, + "learning_rate": 0.00016176971366927047, + "loss": 1.5176, + "step": 14723 + }, + { + "epoch": 0.1913316646173188, + "grad_norm": 0.376873254776001, + "learning_rate": 0.0001617671142073591, + "loss": 1.3856, + "step": 14724 + }, + { + "epoch": 0.19134465916123466, + "grad_norm": 0.3938893973827362, + "learning_rate": 0.0001617645147454477, + "loss": 1.483, + "step": 14725 + }, + { + "epoch": 0.19135765370515054, + "grad_norm": 0.4488093852996826, + "learning_rate": 0.0001617619152835363, + "loss": 1.4998, + "step": 14726 + }, + { + "epoch": 0.1913706482490664, + "grad_norm": 0.4942302703857422, + "learning_rate": 0.00016175931582162494, + "loss": 1.4478, + "step": 14727 + }, + { + "epoch": 0.19138364279298228, + "grad_norm": 0.47813063859939575, + "learning_rate": 0.00016175671635971353, + "loss": 1.3778, + "step": 14728 + }, + { + "epoch": 0.19139663733689816, + "grad_norm": 0.35897913575172424, + "learning_rate": 0.00016175411689780216, + "loss": 1.4086, + "step": 14729 + }, + { + "epoch": 0.19140963188081403, + "grad_norm": 0.42157989740371704, + "learning_rate": 0.00016175151743589078, + "loss": 1.4952, + "step": 14730 + }, + { + "epoch": 0.19142262642472993, + "grad_norm": 0.38916924595832825, + "learning_rate": 0.0001617489179739794, + "loss": 1.3164, + "step": 14731 + }, + { + "epoch": 0.1914356209686458, + "grad_norm": 0.4732559323310852, + "learning_rate": 0.000161746318512068, + "loss": 1.4816, + "step": 14732 + }, + { + "epoch": 0.19144861551256168, + "grad_norm": 0.3626325726509094, + "learning_rate": 0.00016174371905015663, + "loss": 1.3921, + "step": 14733 + }, + { + "epoch": 0.19146161005647755, + "grad_norm": 0.4242331385612488, + "learning_rate": 0.00016174111958824525, + "loss": 1.2925, + "step": 14734 + }, + { + "epoch": 0.19147460460039342, + "grad_norm": 0.3638611137866974, + "learning_rate": 0.00016173852012633385, + "loss": 1.4722, + "step": 14735 + }, + { + "epoch": 0.1914875991443093, + "grad_norm": 0.4010542333126068, + "learning_rate": 0.00016173592066442248, + "loss": 1.4748, + "step": 14736 + }, + { + "epoch": 0.19150059368822517, + "grad_norm": 0.41190001368522644, + "learning_rate": 0.00016173332120251107, + "loss": 1.3564, + "step": 14737 + }, + { + "epoch": 0.19151358823214104, + "grad_norm": 0.38537663221359253, + "learning_rate": 0.00016173072174059972, + "loss": 1.3698, + "step": 14738 + }, + { + "epoch": 0.19152658277605691, + "grad_norm": 0.3804991841316223, + "learning_rate": 0.00016172812227868832, + "loss": 1.2688, + "step": 14739 + }, + { + "epoch": 0.1915395773199728, + "grad_norm": 0.37376806139945984, + "learning_rate": 0.00016172552281677692, + "loss": 1.5053, + "step": 14740 + }, + { + "epoch": 0.19155257186388866, + "grad_norm": 0.4185049831867218, + "learning_rate": 0.00016172292335486554, + "loss": 1.4177, + "step": 14741 + }, + { + "epoch": 0.19156556640780453, + "grad_norm": 0.43594786524772644, + "learning_rate": 0.00016172032389295417, + "loss": 1.4985, + "step": 14742 + }, + { + "epoch": 0.1915785609517204, + "grad_norm": 0.3320576250553131, + "learning_rate": 0.0001617177244310428, + "loss": 1.3372, + "step": 14743 + }, + { + "epoch": 0.19159155549563628, + "grad_norm": 0.40826481580734253, + "learning_rate": 0.0001617151249691314, + "loss": 1.4166, + "step": 14744 + }, + { + "epoch": 0.19160455003955215, + "grad_norm": 0.4043372869491577, + "learning_rate": 0.00016171252550722001, + "loss": 1.5113, + "step": 14745 + }, + { + "epoch": 0.19161754458346802, + "grad_norm": 0.567789614200592, + "learning_rate": 0.00016170992604530864, + "loss": 1.4549, + "step": 14746 + }, + { + "epoch": 0.1916305391273839, + "grad_norm": 0.444301038980484, + "learning_rate": 0.00016170732658339724, + "loss": 1.3388, + "step": 14747 + }, + { + "epoch": 0.19164353367129977, + "grad_norm": 0.3947491943836212, + "learning_rate": 0.00016170472712148586, + "loss": 1.3371, + "step": 14748 + }, + { + "epoch": 0.19165652821521564, + "grad_norm": 0.3155144155025482, + "learning_rate": 0.00016170212765957446, + "loss": 1.312, + "step": 14749 + }, + { + "epoch": 0.19166952275913152, + "grad_norm": 0.30175110697746277, + "learning_rate": 0.0001616995281976631, + "loss": 1.3784, + "step": 14750 + }, + { + "epoch": 0.1916825173030474, + "grad_norm": 0.37675991654396057, + "learning_rate": 0.0001616969287357517, + "loss": 1.5213, + "step": 14751 + }, + { + "epoch": 0.19169551184696326, + "grad_norm": 0.4649588167667389, + "learning_rate": 0.0001616943292738403, + "loss": 1.3588, + "step": 14752 + }, + { + "epoch": 0.19170850639087914, + "grad_norm": 0.38730716705322266, + "learning_rate": 0.00016169172981192896, + "loss": 1.4931, + "step": 14753 + }, + { + "epoch": 0.191721500934795, + "grad_norm": 0.3740478456020355, + "learning_rate": 0.00016168913035001755, + "loss": 1.4747, + "step": 14754 + }, + { + "epoch": 0.19173449547871088, + "grad_norm": 0.3958975672721863, + "learning_rate": 0.00016168653088810618, + "loss": 1.4363, + "step": 14755 + }, + { + "epoch": 0.19174749002262675, + "grad_norm": 0.44136694073677063, + "learning_rate": 0.00016168393142619478, + "loss": 1.4861, + "step": 14756 + }, + { + "epoch": 0.19176048456654263, + "grad_norm": 0.34408873319625854, + "learning_rate": 0.0001616813319642834, + "loss": 1.3286, + "step": 14757 + }, + { + "epoch": 0.1917734791104585, + "grad_norm": 0.3613181412220001, + "learning_rate": 0.00016167873250237202, + "loss": 1.348, + "step": 14758 + }, + { + "epoch": 0.19178647365437437, + "grad_norm": 0.39268746972084045, + "learning_rate": 0.00016167613304046062, + "loss": 1.5056, + "step": 14759 + }, + { + "epoch": 0.19179946819829025, + "grad_norm": 0.43165770173072815, + "learning_rate": 0.00016167353357854925, + "loss": 1.5264, + "step": 14760 + }, + { + "epoch": 0.19181246274220612, + "grad_norm": 0.44122982025146484, + "learning_rate": 0.00016167093411663787, + "loss": 1.4459, + "step": 14761 + }, + { + "epoch": 0.191825457286122, + "grad_norm": 0.406219482421875, + "learning_rate": 0.0001616683346547265, + "loss": 1.4387, + "step": 14762 + }, + { + "epoch": 0.19183845183003786, + "grad_norm": 0.575989305973053, + "learning_rate": 0.0001616657351928151, + "loss": 1.3879, + "step": 14763 + }, + { + "epoch": 0.19185144637395374, + "grad_norm": 0.3489566445350647, + "learning_rate": 0.0001616631357309037, + "loss": 1.4151, + "step": 14764 + }, + { + "epoch": 0.1918644409178696, + "grad_norm": 0.36041444540023804, + "learning_rate": 0.00016166053626899234, + "loss": 1.3354, + "step": 14765 + }, + { + "epoch": 0.19187743546178548, + "grad_norm": 0.49323713779449463, + "learning_rate": 0.00016165793680708094, + "loss": 1.4864, + "step": 14766 + }, + { + "epoch": 0.19189043000570136, + "grad_norm": 0.3746762275695801, + "learning_rate": 0.00016165533734516956, + "loss": 1.425, + "step": 14767 + }, + { + "epoch": 0.19190342454961723, + "grad_norm": 0.30793729424476624, + "learning_rate": 0.00016165273788325816, + "loss": 1.359, + "step": 14768 + }, + { + "epoch": 0.1919164190935331, + "grad_norm": 0.46501293778419495, + "learning_rate": 0.00016165013842134679, + "loss": 1.304, + "step": 14769 + }, + { + "epoch": 0.19192941363744898, + "grad_norm": 0.39953964948654175, + "learning_rate": 0.0001616475389594354, + "loss": 1.4446, + "step": 14770 + }, + { + "epoch": 0.19194240818136485, + "grad_norm": 0.39432424306869507, + "learning_rate": 0.000161644939497524, + "loss": 1.3085, + "step": 14771 + }, + { + "epoch": 0.19195540272528072, + "grad_norm": 0.3681644797325134, + "learning_rate": 0.00016164234003561263, + "loss": 1.4651, + "step": 14772 + }, + { + "epoch": 0.1919683972691966, + "grad_norm": 0.43860921263694763, + "learning_rate": 0.00016163974057370126, + "loss": 1.3085, + "step": 14773 + }, + { + "epoch": 0.19198139181311247, + "grad_norm": 0.43673309683799744, + "learning_rate": 0.00016163714111178988, + "loss": 1.3206, + "step": 14774 + }, + { + "epoch": 0.19199438635702834, + "grad_norm": 0.35295191407203674, + "learning_rate": 0.00016163454164987848, + "loss": 1.2792, + "step": 14775 + }, + { + "epoch": 0.1920073809009442, + "grad_norm": 0.46073010563850403, + "learning_rate": 0.0001616319421879671, + "loss": 1.4596, + "step": 14776 + }, + { + "epoch": 0.1920203754448601, + "grad_norm": 0.47358232736587524, + "learning_rate": 0.00016162934272605573, + "loss": 1.4292, + "step": 14777 + }, + { + "epoch": 0.19203336998877596, + "grad_norm": 0.46355482935905457, + "learning_rate": 0.00016162674326414432, + "loss": 1.3921, + "step": 14778 + }, + { + "epoch": 0.19204636453269183, + "grad_norm": 0.3571394681930542, + "learning_rate": 0.00016162414380223295, + "loss": 1.3749, + "step": 14779 + }, + { + "epoch": 0.1920593590766077, + "grad_norm": 0.44956809282302856, + "learning_rate": 0.00016162154434032155, + "loss": 1.3928, + "step": 14780 + }, + { + "epoch": 0.19207235362052358, + "grad_norm": 0.385164350271225, + "learning_rate": 0.00016161894487841017, + "loss": 1.254, + "step": 14781 + }, + { + "epoch": 0.19208534816443945, + "grad_norm": 0.3551305830478668, + "learning_rate": 0.0001616163454164988, + "loss": 1.5076, + "step": 14782 + }, + { + "epoch": 0.19209834270835532, + "grad_norm": 0.40480339527130127, + "learning_rate": 0.0001616137459545874, + "loss": 1.2248, + "step": 14783 + }, + { + "epoch": 0.1921113372522712, + "grad_norm": 0.3807767927646637, + "learning_rate": 0.00016161114649267602, + "loss": 1.4591, + "step": 14784 + }, + { + "epoch": 0.19212433179618707, + "grad_norm": 0.4305060803890228, + "learning_rate": 0.00016160854703076464, + "loss": 1.4321, + "step": 14785 + }, + { + "epoch": 0.19213732634010294, + "grad_norm": 0.4142296016216278, + "learning_rate": 0.00016160594756885327, + "loss": 1.4799, + "step": 14786 + }, + { + "epoch": 0.19215032088401882, + "grad_norm": 0.3961220979690552, + "learning_rate": 0.00016160334810694186, + "loss": 1.5275, + "step": 14787 + }, + { + "epoch": 0.1921633154279347, + "grad_norm": 0.37783369421958923, + "learning_rate": 0.0001616007486450305, + "loss": 1.5385, + "step": 14788 + }, + { + "epoch": 0.19217630997185056, + "grad_norm": 0.3969023525714874, + "learning_rate": 0.0001615981491831191, + "loss": 1.6432, + "step": 14789 + }, + { + "epoch": 0.19218930451576643, + "grad_norm": 0.37032586336135864, + "learning_rate": 0.0001615955497212077, + "loss": 1.3727, + "step": 14790 + }, + { + "epoch": 0.1922022990596823, + "grad_norm": 0.4205678105354309, + "learning_rate": 0.00016159295025929633, + "loss": 1.3817, + "step": 14791 + }, + { + "epoch": 0.19221529360359818, + "grad_norm": 0.3681456744670868, + "learning_rate": 0.00016159035079738496, + "loss": 1.3027, + "step": 14792 + }, + { + "epoch": 0.19222828814751405, + "grad_norm": 0.36517757177352905, + "learning_rate": 0.00016158775133547358, + "loss": 1.3506, + "step": 14793 + }, + { + "epoch": 0.19224128269142993, + "grad_norm": 0.4429134428501129, + "learning_rate": 0.00016158515187356218, + "loss": 1.4592, + "step": 14794 + }, + { + "epoch": 0.1922542772353458, + "grad_norm": 0.4614850580692291, + "learning_rate": 0.00016158255241165078, + "loss": 1.604, + "step": 14795 + }, + { + "epoch": 0.19226727177926167, + "grad_norm": 0.4379009008407593, + "learning_rate": 0.00016157995294973943, + "loss": 1.5071, + "step": 14796 + }, + { + "epoch": 0.19228026632317755, + "grad_norm": 0.28308600187301636, + "learning_rate": 0.00016157735348782803, + "loss": 1.3241, + "step": 14797 + }, + { + "epoch": 0.19229326086709342, + "grad_norm": 0.42078229784965515, + "learning_rate": 0.00016157475402591665, + "loss": 1.4385, + "step": 14798 + }, + { + "epoch": 0.1923062554110093, + "grad_norm": 0.3954154849052429, + "learning_rate": 0.00016157215456400525, + "loss": 1.3316, + "step": 14799 + }, + { + "epoch": 0.19231924995492516, + "grad_norm": 0.29797929525375366, + "learning_rate": 0.00016156955510209387, + "loss": 1.418, + "step": 14800 + }, + { + "epoch": 0.19233224449884104, + "grad_norm": 0.3662402033805847, + "learning_rate": 0.0001615669556401825, + "loss": 1.6391, + "step": 14801 + }, + { + "epoch": 0.1923452390427569, + "grad_norm": 0.41779810190200806, + "learning_rate": 0.0001615643561782711, + "loss": 1.4004, + "step": 14802 + }, + { + "epoch": 0.19235823358667278, + "grad_norm": 0.34955495595932007, + "learning_rate": 0.00016156175671635972, + "loss": 1.4351, + "step": 14803 + }, + { + "epoch": 0.19237122813058866, + "grad_norm": 0.35122936964035034, + "learning_rate": 0.00016155915725444834, + "loss": 1.3991, + "step": 14804 + }, + { + "epoch": 0.19238422267450453, + "grad_norm": 0.4929906129837036, + "learning_rate": 0.00016155655779253697, + "loss": 1.4813, + "step": 14805 + }, + { + "epoch": 0.1923972172184204, + "grad_norm": 0.3611014485359192, + "learning_rate": 0.00016155395833062557, + "loss": 1.3937, + "step": 14806 + }, + { + "epoch": 0.1924102117623363, + "grad_norm": 0.2804672122001648, + "learning_rate": 0.00016155135886871416, + "loss": 1.273, + "step": 14807 + }, + { + "epoch": 0.19242320630625218, + "grad_norm": 0.3807078003883362, + "learning_rate": 0.00016154875940680281, + "loss": 1.2595, + "step": 14808 + }, + { + "epoch": 0.19243620085016805, + "grad_norm": 0.3427078425884247, + "learning_rate": 0.0001615461599448914, + "loss": 1.2713, + "step": 14809 + }, + { + "epoch": 0.19244919539408392, + "grad_norm": 0.44836607575416565, + "learning_rate": 0.00016154356048298004, + "loss": 1.2625, + "step": 14810 + }, + { + "epoch": 0.1924621899379998, + "grad_norm": 0.4267934560775757, + "learning_rate": 0.00016154096102106863, + "loss": 1.2648, + "step": 14811 + }, + { + "epoch": 0.19247518448191567, + "grad_norm": 0.461042195558548, + "learning_rate": 0.00016153836155915726, + "loss": 1.4129, + "step": 14812 + }, + { + "epoch": 0.19248817902583154, + "grad_norm": 0.430828720331192, + "learning_rate": 0.00016153576209724588, + "loss": 1.3693, + "step": 14813 + }, + { + "epoch": 0.1925011735697474, + "grad_norm": 0.3735158145427704, + "learning_rate": 0.00016153316263533448, + "loss": 1.4946, + "step": 14814 + }, + { + "epoch": 0.1925141681136633, + "grad_norm": 0.2744368016719818, + "learning_rate": 0.0001615305631734231, + "loss": 1.3076, + "step": 14815 + }, + { + "epoch": 0.19252716265757916, + "grad_norm": 0.4095487594604492, + "learning_rate": 0.00016152796371151173, + "loss": 1.3416, + "step": 14816 + }, + { + "epoch": 0.19254015720149503, + "grad_norm": 0.35569536685943604, + "learning_rate": 0.00016152536424960035, + "loss": 1.3008, + "step": 14817 + }, + { + "epoch": 0.1925531517454109, + "grad_norm": 0.3805360496044159, + "learning_rate": 0.00016152276478768895, + "loss": 1.3214, + "step": 14818 + }, + { + "epoch": 0.19256614628932678, + "grad_norm": 0.3798256814479828, + "learning_rate": 0.00016152016532577755, + "loss": 1.4774, + "step": 14819 + }, + { + "epoch": 0.19257914083324265, + "grad_norm": 0.3908930718898773, + "learning_rate": 0.0001615175658638662, + "loss": 1.5745, + "step": 14820 + }, + { + "epoch": 0.19259213537715852, + "grad_norm": 0.37838214635849, + "learning_rate": 0.0001615149664019548, + "loss": 1.5448, + "step": 14821 + }, + { + "epoch": 0.1926051299210744, + "grad_norm": 0.4039660096168518, + "learning_rate": 0.00016151236694004342, + "loss": 1.6515, + "step": 14822 + }, + { + "epoch": 0.19261812446499027, + "grad_norm": 0.44893568754196167, + "learning_rate": 0.00016150976747813202, + "loss": 1.484, + "step": 14823 + }, + { + "epoch": 0.19263111900890614, + "grad_norm": 0.36752158403396606, + "learning_rate": 0.00016150716801622064, + "loss": 1.367, + "step": 14824 + }, + { + "epoch": 0.19264411355282202, + "grad_norm": 0.4139344096183777, + "learning_rate": 0.00016150456855430927, + "loss": 1.4476, + "step": 14825 + }, + { + "epoch": 0.1926571080967379, + "grad_norm": 0.4148998558521271, + "learning_rate": 0.00016150196909239787, + "loss": 1.3479, + "step": 14826 + }, + { + "epoch": 0.19267010264065376, + "grad_norm": 0.41702941060066223, + "learning_rate": 0.00016149936963048652, + "loss": 1.46, + "step": 14827 + }, + { + "epoch": 0.19268309718456963, + "grad_norm": 0.3881840407848358, + "learning_rate": 0.00016149677016857511, + "loss": 1.4713, + "step": 14828 + }, + { + "epoch": 0.1926960917284855, + "grad_norm": 0.3611997067928314, + "learning_rate": 0.00016149417070666374, + "loss": 1.3885, + "step": 14829 + }, + { + "epoch": 0.19270908627240138, + "grad_norm": 0.36825549602508545, + "learning_rate": 0.00016149157124475234, + "loss": 1.363, + "step": 14830 + }, + { + "epoch": 0.19272208081631725, + "grad_norm": 0.4498448967933655, + "learning_rate": 0.00016148897178284096, + "loss": 1.3699, + "step": 14831 + }, + { + "epoch": 0.19273507536023313, + "grad_norm": 0.40507617592811584, + "learning_rate": 0.00016148637232092959, + "loss": 1.4294, + "step": 14832 + }, + { + "epoch": 0.192748069904149, + "grad_norm": 0.3195624053478241, + "learning_rate": 0.00016148377285901818, + "loss": 1.4273, + "step": 14833 + }, + { + "epoch": 0.19276106444806487, + "grad_norm": 0.24283470213413239, + "learning_rate": 0.0001614811733971068, + "loss": 1.1657, + "step": 14834 + }, + { + "epoch": 0.19277405899198075, + "grad_norm": 0.29763153195381165, + "learning_rate": 0.00016147857393519543, + "loss": 1.3572, + "step": 14835 + }, + { + "epoch": 0.19278705353589662, + "grad_norm": 0.3514959514141083, + "learning_rate": 0.00016147597447328403, + "loss": 1.2956, + "step": 14836 + }, + { + "epoch": 0.1928000480798125, + "grad_norm": 0.44645237922668457, + "learning_rate": 0.00016147337501137265, + "loss": 1.6183, + "step": 14837 + }, + { + "epoch": 0.19281304262372836, + "grad_norm": 0.4316481947898865, + "learning_rate": 0.00016147077554946125, + "loss": 1.4404, + "step": 14838 + }, + { + "epoch": 0.19282603716764424, + "grad_norm": 0.3804508149623871, + "learning_rate": 0.0001614681760875499, + "loss": 1.4531, + "step": 14839 + }, + { + "epoch": 0.1928390317115601, + "grad_norm": 0.29477280378341675, + "learning_rate": 0.0001614655766256385, + "loss": 1.2613, + "step": 14840 + }, + { + "epoch": 0.19285202625547598, + "grad_norm": 0.4052661061286926, + "learning_rate": 0.00016146297716372712, + "loss": 1.2303, + "step": 14841 + }, + { + "epoch": 0.19286502079939186, + "grad_norm": 0.38528206944465637, + "learning_rate": 0.00016146037770181572, + "loss": 1.4822, + "step": 14842 + }, + { + "epoch": 0.19287801534330773, + "grad_norm": 0.4656522274017334, + "learning_rate": 0.00016145777823990435, + "loss": 1.4365, + "step": 14843 + }, + { + "epoch": 0.1928910098872236, + "grad_norm": 0.38580530881881714, + "learning_rate": 0.00016145517877799297, + "loss": 1.3121, + "step": 14844 + }, + { + "epoch": 0.19290400443113948, + "grad_norm": 0.4122304320335388, + "learning_rate": 0.00016145257931608157, + "loss": 1.5367, + "step": 14845 + }, + { + "epoch": 0.19291699897505535, + "grad_norm": 0.5036138892173767, + "learning_rate": 0.0001614499798541702, + "loss": 1.5542, + "step": 14846 + }, + { + "epoch": 0.19292999351897122, + "grad_norm": 0.4620932638645172, + "learning_rate": 0.00016144738039225882, + "loss": 1.3894, + "step": 14847 + }, + { + "epoch": 0.1929429880628871, + "grad_norm": 0.395526260137558, + "learning_rate": 0.00016144478093034741, + "loss": 1.2675, + "step": 14848 + }, + { + "epoch": 0.19295598260680297, + "grad_norm": 0.42580515146255493, + "learning_rate": 0.00016144218146843604, + "loss": 1.4859, + "step": 14849 + }, + { + "epoch": 0.19296897715071884, + "grad_norm": 0.3319178819656372, + "learning_rate": 0.00016143958200652464, + "loss": 1.0941, + "step": 14850 + }, + { + "epoch": 0.1929819716946347, + "grad_norm": 0.41081032156944275, + "learning_rate": 0.0001614369825446133, + "loss": 1.2713, + "step": 14851 + }, + { + "epoch": 0.19299496623855059, + "grad_norm": 0.42496830224990845, + "learning_rate": 0.00016143438308270189, + "loss": 1.5343, + "step": 14852 + }, + { + "epoch": 0.19300796078246646, + "grad_norm": 0.4384685158729553, + "learning_rate": 0.0001614317836207905, + "loss": 1.5021, + "step": 14853 + }, + { + "epoch": 0.19302095532638233, + "grad_norm": 0.4159129858016968, + "learning_rate": 0.0001614291841588791, + "loss": 1.5032, + "step": 14854 + }, + { + "epoch": 0.1930339498702982, + "grad_norm": 0.3636203408241272, + "learning_rate": 0.00016142658469696773, + "loss": 1.4775, + "step": 14855 + }, + { + "epoch": 0.19304694441421408, + "grad_norm": 0.3794700503349304, + "learning_rate": 0.00016142398523505636, + "loss": 1.4778, + "step": 14856 + }, + { + "epoch": 0.19305993895812995, + "grad_norm": 0.41769009828567505, + "learning_rate": 0.00016142138577314495, + "loss": 1.4356, + "step": 14857 + }, + { + "epoch": 0.19307293350204582, + "grad_norm": 0.41391444206237793, + "learning_rate": 0.00016141878631123358, + "loss": 1.5713, + "step": 14858 + }, + { + "epoch": 0.1930859280459617, + "grad_norm": 0.34555503726005554, + "learning_rate": 0.0001614161868493222, + "loss": 1.3823, + "step": 14859 + }, + { + "epoch": 0.19309892258987757, + "grad_norm": 0.48385536670684814, + "learning_rate": 0.00016141358738741083, + "loss": 1.3848, + "step": 14860 + }, + { + "epoch": 0.19311191713379344, + "grad_norm": 0.3248080909252167, + "learning_rate": 0.00016141098792549942, + "loss": 1.1726, + "step": 14861 + }, + { + "epoch": 0.19312491167770932, + "grad_norm": 0.3720283508300781, + "learning_rate": 0.00016140838846358805, + "loss": 1.2739, + "step": 14862 + }, + { + "epoch": 0.1931379062216252, + "grad_norm": 0.36990800499916077, + "learning_rate": 0.00016140578900167667, + "loss": 1.4363, + "step": 14863 + }, + { + "epoch": 0.19315090076554106, + "grad_norm": 0.46047475934028625, + "learning_rate": 0.00016140318953976527, + "loss": 1.3494, + "step": 14864 + }, + { + "epoch": 0.19316389530945693, + "grad_norm": 0.40693244338035583, + "learning_rate": 0.0001614005900778539, + "loss": 1.3564, + "step": 14865 + }, + { + "epoch": 0.1931768898533728, + "grad_norm": 0.40175721049308777, + "learning_rate": 0.00016139799061594252, + "loss": 1.6216, + "step": 14866 + }, + { + "epoch": 0.19318988439728868, + "grad_norm": 0.49909675121307373, + "learning_rate": 0.00016139539115403112, + "loss": 1.4139, + "step": 14867 + }, + { + "epoch": 0.19320287894120455, + "grad_norm": 0.45593997836112976, + "learning_rate": 0.00016139279169211974, + "loss": 1.3887, + "step": 14868 + }, + { + "epoch": 0.19321587348512043, + "grad_norm": 0.3343084752559662, + "learning_rate": 0.00016139019223020834, + "loss": 1.2647, + "step": 14869 + }, + { + "epoch": 0.1932288680290363, + "grad_norm": 0.34654197096824646, + "learning_rate": 0.000161387592768297, + "loss": 1.3814, + "step": 14870 + }, + { + "epoch": 0.19324186257295217, + "grad_norm": 0.3975386619567871, + "learning_rate": 0.0001613849933063856, + "loss": 1.5305, + "step": 14871 + }, + { + "epoch": 0.19325485711686805, + "grad_norm": 0.3944903314113617, + "learning_rate": 0.0001613823938444742, + "loss": 1.3117, + "step": 14872 + }, + { + "epoch": 0.19326785166078392, + "grad_norm": 0.3762023150920868, + "learning_rate": 0.0001613797943825628, + "loss": 1.3454, + "step": 14873 + }, + { + "epoch": 0.1932808462046998, + "grad_norm": 0.451759397983551, + "learning_rate": 0.00016137719492065143, + "loss": 1.5639, + "step": 14874 + }, + { + "epoch": 0.19329384074861566, + "grad_norm": 0.375691682100296, + "learning_rate": 0.00016137459545874006, + "loss": 1.3765, + "step": 14875 + }, + { + "epoch": 0.19330683529253154, + "grad_norm": 0.4128933250904083, + "learning_rate": 0.00016137199599682866, + "loss": 1.3414, + "step": 14876 + }, + { + "epoch": 0.1933198298364474, + "grad_norm": 0.3383465111255646, + "learning_rate": 0.00016136939653491728, + "loss": 1.3813, + "step": 14877 + }, + { + "epoch": 0.19333282438036328, + "grad_norm": 0.2938116788864136, + "learning_rate": 0.0001613667970730059, + "loss": 1.2532, + "step": 14878 + }, + { + "epoch": 0.19334581892427916, + "grad_norm": 0.4449491798877716, + "learning_rate": 0.0001613641976110945, + "loss": 1.4268, + "step": 14879 + }, + { + "epoch": 0.19335881346819503, + "grad_norm": 0.4733844995498657, + "learning_rate": 0.00016136159814918313, + "loss": 1.4647, + "step": 14880 + }, + { + "epoch": 0.1933718080121109, + "grad_norm": 0.34632447361946106, + "learning_rate": 0.00016135899868727172, + "loss": 1.1606, + "step": 14881 + }, + { + "epoch": 0.19338480255602677, + "grad_norm": 0.3805374801158905, + "learning_rate": 0.00016135639922536038, + "loss": 1.3975, + "step": 14882 + }, + { + "epoch": 0.19339779709994268, + "grad_norm": 0.407209187746048, + "learning_rate": 0.00016135379976344897, + "loss": 1.5714, + "step": 14883 + }, + { + "epoch": 0.19341079164385855, + "grad_norm": 0.4528035819530487, + "learning_rate": 0.0001613512003015376, + "loss": 1.3497, + "step": 14884 + }, + { + "epoch": 0.19342378618777442, + "grad_norm": 0.5349000096321106, + "learning_rate": 0.0001613486008396262, + "loss": 1.5769, + "step": 14885 + }, + { + "epoch": 0.1934367807316903, + "grad_norm": 0.3710574209690094, + "learning_rate": 0.00016134600137771482, + "loss": 1.4136, + "step": 14886 + }, + { + "epoch": 0.19344977527560617, + "grad_norm": 0.3422017991542816, + "learning_rate": 0.00016134340191580344, + "loss": 1.2903, + "step": 14887 + }, + { + "epoch": 0.19346276981952204, + "grad_norm": 0.37780115008354187, + "learning_rate": 0.00016134080245389204, + "loss": 1.2997, + "step": 14888 + }, + { + "epoch": 0.1934757643634379, + "grad_norm": 0.41516053676605225, + "learning_rate": 0.00016133820299198067, + "loss": 1.3218, + "step": 14889 + }, + { + "epoch": 0.1934887589073538, + "grad_norm": 0.28825294971466064, + "learning_rate": 0.0001613356035300693, + "loss": 1.45, + "step": 14890 + }, + { + "epoch": 0.19350175345126966, + "grad_norm": 0.34397125244140625, + "learning_rate": 0.0001613330040681579, + "loss": 1.4276, + "step": 14891 + }, + { + "epoch": 0.19351474799518553, + "grad_norm": 0.47288596630096436, + "learning_rate": 0.0001613304046062465, + "loss": 1.3474, + "step": 14892 + }, + { + "epoch": 0.1935277425391014, + "grad_norm": 0.43025270104408264, + "learning_rate": 0.0001613278051443351, + "loss": 1.2521, + "step": 14893 + }, + { + "epoch": 0.19354073708301728, + "grad_norm": 0.4127647876739502, + "learning_rate": 0.00016132520568242376, + "loss": 1.3128, + "step": 14894 + }, + { + "epoch": 0.19355373162693315, + "grad_norm": 0.42666390538215637, + "learning_rate": 0.00016132260622051236, + "loss": 1.4336, + "step": 14895 + }, + { + "epoch": 0.19356672617084902, + "grad_norm": 0.36437827348709106, + "learning_rate": 0.00016132000675860098, + "loss": 1.3463, + "step": 14896 + }, + { + "epoch": 0.1935797207147649, + "grad_norm": 0.5230821371078491, + "learning_rate": 0.00016131740729668958, + "loss": 1.4383, + "step": 14897 + }, + { + "epoch": 0.19359271525868077, + "grad_norm": 0.38183239102363586, + "learning_rate": 0.0001613148078347782, + "loss": 1.458, + "step": 14898 + }, + { + "epoch": 0.19360570980259664, + "grad_norm": 0.3007412552833557, + "learning_rate": 0.00016131220837286683, + "loss": 1.4062, + "step": 14899 + }, + { + "epoch": 0.19361870434651252, + "grad_norm": 0.41883182525634766, + "learning_rate": 0.00016130960891095543, + "loss": 1.4188, + "step": 14900 + }, + { + "epoch": 0.1936316988904284, + "grad_norm": 0.39784562587738037, + "learning_rate": 0.00016130700944904408, + "loss": 1.3255, + "step": 14901 + }, + { + "epoch": 0.19364469343434426, + "grad_norm": 0.4048933684825897, + "learning_rate": 0.00016130440998713268, + "loss": 1.4297, + "step": 14902 + }, + { + "epoch": 0.19365768797826013, + "grad_norm": 0.47147148847579956, + "learning_rate": 0.00016130181052522127, + "loss": 1.4944, + "step": 14903 + }, + { + "epoch": 0.193670682522176, + "grad_norm": 0.42690977454185486, + "learning_rate": 0.0001612992110633099, + "loss": 1.4924, + "step": 14904 + }, + { + "epoch": 0.19368367706609188, + "grad_norm": 0.43294331431388855, + "learning_rate": 0.00016129661160139852, + "loss": 1.3688, + "step": 14905 + }, + { + "epoch": 0.19369667161000775, + "grad_norm": 0.4685990810394287, + "learning_rate": 0.00016129401213948715, + "loss": 1.6216, + "step": 14906 + }, + { + "epoch": 0.19370966615392363, + "grad_norm": 0.48875540494918823, + "learning_rate": 0.00016129141267757574, + "loss": 1.5112, + "step": 14907 + }, + { + "epoch": 0.1937226606978395, + "grad_norm": 0.3637704849243164, + "learning_rate": 0.00016128881321566437, + "loss": 1.3556, + "step": 14908 + }, + { + "epoch": 0.19373565524175537, + "grad_norm": 0.3244069516658783, + "learning_rate": 0.000161286213753753, + "loss": 1.323, + "step": 14909 + }, + { + "epoch": 0.19374864978567125, + "grad_norm": 0.4190044105052948, + "learning_rate": 0.0001612836142918416, + "loss": 1.2397, + "step": 14910 + }, + { + "epoch": 0.19376164432958712, + "grad_norm": 0.40611886978149414, + "learning_rate": 0.00016128101482993021, + "loss": 1.5755, + "step": 14911 + }, + { + "epoch": 0.193774638873503, + "grad_norm": 0.47749456763267517, + "learning_rate": 0.0001612784153680188, + "loss": 1.595, + "step": 14912 + }, + { + "epoch": 0.19378763341741886, + "grad_norm": 0.45079129934310913, + "learning_rate": 0.00016127581590610746, + "loss": 1.2838, + "step": 14913 + }, + { + "epoch": 0.19380062796133474, + "grad_norm": 0.415567547082901, + "learning_rate": 0.00016127321644419606, + "loss": 1.4747, + "step": 14914 + }, + { + "epoch": 0.1938136225052506, + "grad_norm": 0.3816990554332733, + "learning_rate": 0.00016127061698228469, + "loss": 1.397, + "step": 14915 + }, + { + "epoch": 0.19382661704916648, + "grad_norm": 0.46385645866394043, + "learning_rate": 0.00016126801752037328, + "loss": 1.3613, + "step": 14916 + }, + { + "epoch": 0.19383961159308236, + "grad_norm": 0.5078310370445251, + "learning_rate": 0.0001612654180584619, + "loss": 1.4508, + "step": 14917 + }, + { + "epoch": 0.19385260613699823, + "grad_norm": 0.4516267776489258, + "learning_rate": 0.00016126281859655053, + "loss": 1.5428, + "step": 14918 + }, + { + "epoch": 0.1938656006809141, + "grad_norm": 0.5169050097465515, + "learning_rate": 0.00016126021913463913, + "loss": 1.5714, + "step": 14919 + }, + { + "epoch": 0.19387859522482997, + "grad_norm": 0.31570351123809814, + "learning_rate": 0.00016125761967272775, + "loss": 1.3927, + "step": 14920 + }, + { + "epoch": 0.19389158976874585, + "grad_norm": 0.43506646156311035, + "learning_rate": 0.00016125502021081638, + "loss": 1.2868, + "step": 14921 + }, + { + "epoch": 0.19390458431266172, + "grad_norm": 0.4190494120121002, + "learning_rate": 0.00016125242074890498, + "loss": 1.3835, + "step": 14922 + }, + { + "epoch": 0.1939175788565776, + "grad_norm": 0.40744659304618835, + "learning_rate": 0.0001612498212869936, + "loss": 1.3033, + "step": 14923 + }, + { + "epoch": 0.19393057340049347, + "grad_norm": 0.5022830963134766, + "learning_rate": 0.0001612472218250822, + "loss": 1.6206, + "step": 14924 + }, + { + "epoch": 0.19394356794440934, + "grad_norm": 0.43683817982673645, + "learning_rate": 0.00016124462236317085, + "loss": 1.4002, + "step": 14925 + }, + { + "epoch": 0.1939565624883252, + "grad_norm": 0.31244176626205444, + "learning_rate": 0.00016124202290125945, + "loss": 1.255, + "step": 14926 + }, + { + "epoch": 0.19396955703224109, + "grad_norm": 0.44326674938201904, + "learning_rate": 0.00016123942343934807, + "loss": 1.6125, + "step": 14927 + }, + { + "epoch": 0.19398255157615696, + "grad_norm": 0.3563145101070404, + "learning_rate": 0.00016123682397743667, + "loss": 1.3766, + "step": 14928 + }, + { + "epoch": 0.19399554612007283, + "grad_norm": 0.395197331905365, + "learning_rate": 0.0001612342245155253, + "loss": 1.3243, + "step": 14929 + }, + { + "epoch": 0.1940085406639887, + "grad_norm": 0.40622276067733765, + "learning_rate": 0.00016123162505361392, + "loss": 1.3819, + "step": 14930 + }, + { + "epoch": 0.19402153520790458, + "grad_norm": 0.4030214846134186, + "learning_rate": 0.00016122902559170251, + "loss": 1.3286, + "step": 14931 + }, + { + "epoch": 0.19403452975182045, + "grad_norm": 0.38832899928092957, + "learning_rate": 0.00016122642612979114, + "loss": 1.4969, + "step": 14932 + }, + { + "epoch": 0.19404752429573632, + "grad_norm": 0.4044993817806244, + "learning_rate": 0.00016122382666787976, + "loss": 1.3746, + "step": 14933 + }, + { + "epoch": 0.1940605188396522, + "grad_norm": 0.47294238209724426, + "learning_rate": 0.00016122122720596836, + "loss": 1.4719, + "step": 14934 + }, + { + "epoch": 0.19407351338356807, + "grad_norm": 0.4514651894569397, + "learning_rate": 0.00016121862774405699, + "loss": 1.542, + "step": 14935 + }, + { + "epoch": 0.19408650792748394, + "grad_norm": 0.41973602771759033, + "learning_rate": 0.00016121602828214558, + "loss": 1.4591, + "step": 14936 + }, + { + "epoch": 0.19409950247139982, + "grad_norm": 0.42308345437049866, + "learning_rate": 0.00016121342882023423, + "loss": 1.3356, + "step": 14937 + }, + { + "epoch": 0.1941124970153157, + "grad_norm": 0.4389118254184723, + "learning_rate": 0.00016121082935832283, + "loss": 1.3293, + "step": 14938 + }, + { + "epoch": 0.19412549155923156, + "grad_norm": 0.3561875522136688, + "learning_rate": 0.00016120822989641146, + "loss": 1.6497, + "step": 14939 + }, + { + "epoch": 0.19413848610314743, + "grad_norm": 0.4700963497161865, + "learning_rate": 0.00016120563043450008, + "loss": 1.5215, + "step": 14940 + }, + { + "epoch": 0.1941514806470633, + "grad_norm": 0.2432563751935959, + "learning_rate": 0.00016120303097258868, + "loss": 1.241, + "step": 14941 + }, + { + "epoch": 0.19416447519097918, + "grad_norm": 0.4091291129589081, + "learning_rate": 0.0001612004315106773, + "loss": 1.3623, + "step": 14942 + }, + { + "epoch": 0.19417746973489505, + "grad_norm": 0.4197945296764374, + "learning_rate": 0.0001611978320487659, + "loss": 1.3261, + "step": 14943 + }, + { + "epoch": 0.19419046427881093, + "grad_norm": 0.4064938724040985, + "learning_rate": 0.00016119523258685455, + "loss": 1.3506, + "step": 14944 + }, + { + "epoch": 0.1942034588227268, + "grad_norm": 0.5298076868057251, + "learning_rate": 0.00016119263312494315, + "loss": 1.5875, + "step": 14945 + }, + { + "epoch": 0.19421645336664267, + "grad_norm": 0.39641743898391724, + "learning_rate": 0.00016119003366303175, + "loss": 1.3535, + "step": 14946 + }, + { + "epoch": 0.19422944791055854, + "grad_norm": 0.3830353319644928, + "learning_rate": 0.00016118743420112037, + "loss": 1.3586, + "step": 14947 + }, + { + "epoch": 0.19424244245447442, + "grad_norm": 0.3914673328399658, + "learning_rate": 0.000161184834739209, + "loss": 1.4979, + "step": 14948 + }, + { + "epoch": 0.1942554369983903, + "grad_norm": 0.36403918266296387, + "learning_rate": 0.00016118223527729762, + "loss": 1.5723, + "step": 14949 + }, + { + "epoch": 0.19426843154230616, + "grad_norm": 0.3627685010433197, + "learning_rate": 0.00016117963581538622, + "loss": 1.5653, + "step": 14950 + }, + { + "epoch": 0.19428142608622204, + "grad_norm": 0.3853529095649719, + "learning_rate": 0.00016117703635347484, + "loss": 1.4284, + "step": 14951 + }, + { + "epoch": 0.1942944206301379, + "grad_norm": 0.38471460342407227, + "learning_rate": 0.00016117443689156347, + "loss": 1.4382, + "step": 14952 + }, + { + "epoch": 0.19430741517405378, + "grad_norm": 0.4330098330974579, + "learning_rate": 0.00016117183742965206, + "loss": 1.404, + "step": 14953 + }, + { + "epoch": 0.19432040971796966, + "grad_norm": 0.49237698316574097, + "learning_rate": 0.0001611692379677407, + "loss": 1.5298, + "step": 14954 + }, + { + "epoch": 0.19433340426188553, + "grad_norm": 0.42953816056251526, + "learning_rate": 0.00016116663850582929, + "loss": 1.4421, + "step": 14955 + }, + { + "epoch": 0.1943463988058014, + "grad_norm": 0.31219932436943054, + "learning_rate": 0.00016116403904391794, + "loss": 1.3428, + "step": 14956 + }, + { + "epoch": 0.19435939334971727, + "grad_norm": 0.43414148688316345, + "learning_rate": 0.00016116143958200653, + "loss": 1.3496, + "step": 14957 + }, + { + "epoch": 0.19437238789363315, + "grad_norm": 0.3261825740337372, + "learning_rate": 0.00016115884012009513, + "loss": 1.1845, + "step": 14958 + }, + { + "epoch": 0.19438538243754902, + "grad_norm": 0.4357633590698242, + "learning_rate": 0.00016115624065818376, + "loss": 1.4187, + "step": 14959 + }, + { + "epoch": 0.19439837698146492, + "grad_norm": 0.34273025393486023, + "learning_rate": 0.00016115364119627238, + "loss": 1.4127, + "step": 14960 + }, + { + "epoch": 0.1944113715253808, + "grad_norm": 0.45404061675071716, + "learning_rate": 0.000161151041734361, + "loss": 1.5838, + "step": 14961 + }, + { + "epoch": 0.19442436606929667, + "grad_norm": 0.4121553897857666, + "learning_rate": 0.0001611484422724496, + "loss": 1.4094, + "step": 14962 + }, + { + "epoch": 0.19443736061321254, + "grad_norm": 0.5067926645278931, + "learning_rate": 0.00016114584281053823, + "loss": 1.4486, + "step": 14963 + }, + { + "epoch": 0.1944503551571284, + "grad_norm": 0.4152904450893402, + "learning_rate": 0.00016114324334862685, + "loss": 1.3304, + "step": 14964 + }, + { + "epoch": 0.19446334970104429, + "grad_norm": 0.4469233751296997, + "learning_rate": 0.00016114064388671545, + "loss": 1.4429, + "step": 14965 + }, + { + "epoch": 0.19447634424496016, + "grad_norm": 0.38391441106796265, + "learning_rate": 0.00016113804442480407, + "loss": 1.2534, + "step": 14966 + }, + { + "epoch": 0.19448933878887603, + "grad_norm": 0.39998140931129456, + "learning_rate": 0.00016113544496289267, + "loss": 1.3039, + "step": 14967 + }, + { + "epoch": 0.1945023333327919, + "grad_norm": 0.38620835542678833, + "learning_rate": 0.00016113284550098132, + "loss": 1.4754, + "step": 14968 + }, + { + "epoch": 0.19451532787670778, + "grad_norm": 0.39291083812713623, + "learning_rate": 0.00016113024603906992, + "loss": 1.4612, + "step": 14969 + }, + { + "epoch": 0.19452832242062365, + "grad_norm": 0.4708918035030365, + "learning_rate": 0.00016112764657715852, + "loss": 1.358, + "step": 14970 + }, + { + "epoch": 0.19454131696453952, + "grad_norm": 0.42590638995170593, + "learning_rate": 0.00016112504711524714, + "loss": 1.2989, + "step": 14971 + }, + { + "epoch": 0.1945543115084554, + "grad_norm": 0.37809309363365173, + "learning_rate": 0.00016112244765333577, + "loss": 1.8283, + "step": 14972 + }, + { + "epoch": 0.19456730605237127, + "grad_norm": 0.3768952488899231, + "learning_rate": 0.0001611198481914244, + "loss": 1.4062, + "step": 14973 + }, + { + "epoch": 0.19458030059628714, + "grad_norm": 0.3749127984046936, + "learning_rate": 0.000161117248729513, + "loss": 1.4083, + "step": 14974 + }, + { + "epoch": 0.19459329514020302, + "grad_norm": 0.4495859146118164, + "learning_rate": 0.0001611146492676016, + "loss": 1.5011, + "step": 14975 + }, + { + "epoch": 0.1946062896841189, + "grad_norm": 0.31008484959602356, + "learning_rate": 0.00016111204980569024, + "loss": 1.3625, + "step": 14976 + }, + { + "epoch": 0.19461928422803476, + "grad_norm": 0.3724196255207062, + "learning_rate": 0.00016110945034377883, + "loss": 1.4536, + "step": 14977 + }, + { + "epoch": 0.19463227877195063, + "grad_norm": 0.41101840138435364, + "learning_rate": 0.00016110685088186746, + "loss": 1.4428, + "step": 14978 + }, + { + "epoch": 0.1946452733158665, + "grad_norm": 0.41465938091278076, + "learning_rate": 0.00016110425141995608, + "loss": 1.3214, + "step": 14979 + }, + { + "epoch": 0.19465826785978238, + "grad_norm": 0.3552533686161041, + "learning_rate": 0.0001611016519580447, + "loss": 1.3875, + "step": 14980 + }, + { + "epoch": 0.19467126240369825, + "grad_norm": 0.3392327129840851, + "learning_rate": 0.0001610990524961333, + "loss": 1.503, + "step": 14981 + }, + { + "epoch": 0.19468425694761413, + "grad_norm": 0.4082401990890503, + "learning_rate": 0.00016109645303422193, + "loss": 1.4145, + "step": 14982 + }, + { + "epoch": 0.19469725149153, + "grad_norm": 0.29725563526153564, + "learning_rate": 0.00016109385357231055, + "loss": 1.3869, + "step": 14983 + }, + { + "epoch": 0.19471024603544587, + "grad_norm": 0.4168282151222229, + "learning_rate": 0.00016109125411039915, + "loss": 1.4667, + "step": 14984 + }, + { + "epoch": 0.19472324057936174, + "grad_norm": 0.3943009078502655, + "learning_rate": 0.00016108865464848778, + "loss": 1.4355, + "step": 14985 + }, + { + "epoch": 0.19473623512327762, + "grad_norm": 0.4274914264678955, + "learning_rate": 0.00016108605518657637, + "loss": 1.5612, + "step": 14986 + }, + { + "epoch": 0.1947492296671935, + "grad_norm": 0.44223588705062866, + "learning_rate": 0.000161083455724665, + "loss": 1.4419, + "step": 14987 + }, + { + "epoch": 0.19476222421110936, + "grad_norm": 0.41913723945617676, + "learning_rate": 0.00016108085626275362, + "loss": 1.464, + "step": 14988 + }, + { + "epoch": 0.19477521875502524, + "grad_norm": 0.4552180767059326, + "learning_rate": 0.00016107825680084222, + "loss": 1.4621, + "step": 14989 + }, + { + "epoch": 0.1947882132989411, + "grad_norm": 0.43758639693260193, + "learning_rate": 0.00016107565733893084, + "loss": 1.5732, + "step": 14990 + }, + { + "epoch": 0.19480120784285698, + "grad_norm": 0.41083887219429016, + "learning_rate": 0.00016107305787701947, + "loss": 1.5109, + "step": 14991 + }, + { + "epoch": 0.19481420238677286, + "grad_norm": 0.4736224412918091, + "learning_rate": 0.0001610704584151081, + "loss": 1.6214, + "step": 14992 + }, + { + "epoch": 0.19482719693068873, + "grad_norm": 0.38998010754585266, + "learning_rate": 0.0001610678589531967, + "loss": 1.4728, + "step": 14993 + }, + { + "epoch": 0.1948401914746046, + "grad_norm": 0.4466601312160492, + "learning_rate": 0.00016106525949128532, + "loss": 1.4204, + "step": 14994 + }, + { + "epoch": 0.19485318601852047, + "grad_norm": 0.369575172662735, + "learning_rate": 0.00016106266002937394, + "loss": 1.2967, + "step": 14995 + }, + { + "epoch": 0.19486618056243635, + "grad_norm": 0.3789312243461609, + "learning_rate": 0.00016106006056746254, + "loss": 1.3714, + "step": 14996 + }, + { + "epoch": 0.19487917510635222, + "grad_norm": 0.4120161831378937, + "learning_rate": 0.00016105746110555116, + "loss": 1.3069, + "step": 14997 + }, + { + "epoch": 0.1948921696502681, + "grad_norm": 0.44996580481529236, + "learning_rate": 0.00016105486164363976, + "loss": 1.3407, + "step": 14998 + }, + { + "epoch": 0.19490516419418397, + "grad_norm": 0.367512583732605, + "learning_rate": 0.0001610522621817284, + "loss": 1.4125, + "step": 14999 + }, + { + "epoch": 0.19491815873809984, + "grad_norm": 0.4112405776977539, + "learning_rate": 0.000161049662719817, + "loss": 1.3201, + "step": 15000 + }, + { + "epoch": 0.1949311532820157, + "grad_norm": 0.29522496461868286, + "learning_rate": 0.0001610470632579056, + "loss": 1.453, + "step": 15001 + }, + { + "epoch": 0.19494414782593159, + "grad_norm": 0.3313886225223541, + "learning_rate": 0.00016104446379599423, + "loss": 1.2701, + "step": 15002 + }, + { + "epoch": 0.19495714236984746, + "grad_norm": 0.4674217998981476, + "learning_rate": 0.00016104186433408285, + "loss": 1.4279, + "step": 15003 + }, + { + "epoch": 0.19497013691376333, + "grad_norm": 0.3128993809223175, + "learning_rate": 0.00016103926487217148, + "loss": 1.4752, + "step": 15004 + }, + { + "epoch": 0.1949831314576792, + "grad_norm": 0.37602829933166504, + "learning_rate": 0.00016103666541026008, + "loss": 1.5826, + "step": 15005 + }, + { + "epoch": 0.19499612600159508, + "grad_norm": 0.4832821488380432, + "learning_rate": 0.0001610340659483487, + "loss": 1.4449, + "step": 15006 + }, + { + "epoch": 0.19500912054551095, + "grad_norm": 0.4294029772281647, + "learning_rate": 0.00016103146648643733, + "loss": 1.586, + "step": 15007 + }, + { + "epoch": 0.19502211508942682, + "grad_norm": 0.31111422181129456, + "learning_rate": 0.00016102886702452592, + "loss": 1.319, + "step": 15008 + }, + { + "epoch": 0.1950351096333427, + "grad_norm": 0.46707379817962646, + "learning_rate": 0.00016102626756261455, + "loss": 1.3575, + "step": 15009 + }, + { + "epoch": 0.19504810417725857, + "grad_norm": 0.35692548751831055, + "learning_rate": 0.00016102366810070314, + "loss": 1.3841, + "step": 15010 + }, + { + "epoch": 0.19506109872117444, + "grad_norm": 0.4133400022983551, + "learning_rate": 0.0001610210686387918, + "loss": 1.5599, + "step": 15011 + }, + { + "epoch": 0.19507409326509031, + "grad_norm": 0.3109988272190094, + "learning_rate": 0.0001610184691768804, + "loss": 1.3967, + "step": 15012 + }, + { + "epoch": 0.1950870878090062, + "grad_norm": 0.49542033672332764, + "learning_rate": 0.000161015869714969, + "loss": 1.5919, + "step": 15013 + }, + { + "epoch": 0.19510008235292206, + "grad_norm": 0.40462857484817505, + "learning_rate": 0.00016101327025305764, + "loss": 1.2506, + "step": 15014 + }, + { + "epoch": 0.19511307689683793, + "grad_norm": 0.4302003085613251, + "learning_rate": 0.00016101067079114624, + "loss": 1.4859, + "step": 15015 + }, + { + "epoch": 0.1951260714407538, + "grad_norm": 0.38717302680015564, + "learning_rate": 0.00016100807132923486, + "loss": 1.2318, + "step": 15016 + }, + { + "epoch": 0.19513906598466968, + "grad_norm": 0.3128765821456909, + "learning_rate": 0.00016100547186732346, + "loss": 1.2826, + "step": 15017 + }, + { + "epoch": 0.19515206052858555, + "grad_norm": 0.45582106709480286, + "learning_rate": 0.00016100287240541209, + "loss": 1.3975, + "step": 15018 + }, + { + "epoch": 0.19516505507250143, + "grad_norm": 0.37630948424339294, + "learning_rate": 0.0001610002729435007, + "loss": 1.3727, + "step": 15019 + }, + { + "epoch": 0.1951780496164173, + "grad_norm": 0.4631521701812744, + "learning_rate": 0.0001609976734815893, + "loss": 1.5121, + "step": 15020 + }, + { + "epoch": 0.19519104416033317, + "grad_norm": 0.4156006872653961, + "learning_rate": 0.00016099507401967793, + "loss": 1.3354, + "step": 15021 + }, + { + "epoch": 0.19520403870424904, + "grad_norm": 0.4105610251426697, + "learning_rate": 0.00016099247455776656, + "loss": 1.7009, + "step": 15022 + }, + { + "epoch": 0.19521703324816492, + "grad_norm": 0.46963226795196533, + "learning_rate": 0.00016098987509585518, + "loss": 1.528, + "step": 15023 + }, + { + "epoch": 0.1952300277920808, + "grad_norm": 0.4761582612991333, + "learning_rate": 0.00016098727563394378, + "loss": 1.3929, + "step": 15024 + }, + { + "epoch": 0.19524302233599666, + "grad_norm": 0.41948556900024414, + "learning_rate": 0.00016098467617203238, + "loss": 1.2504, + "step": 15025 + }, + { + "epoch": 0.19525601687991254, + "grad_norm": 0.3190854787826538, + "learning_rate": 0.00016098207671012103, + "loss": 1.3906, + "step": 15026 + }, + { + "epoch": 0.1952690114238284, + "grad_norm": 0.46148914098739624, + "learning_rate": 0.00016097947724820963, + "loss": 1.4814, + "step": 15027 + }, + { + "epoch": 0.19528200596774428, + "grad_norm": 0.33796706795692444, + "learning_rate": 0.00016097687778629825, + "loss": 1.2624, + "step": 15028 + }, + { + "epoch": 0.19529500051166016, + "grad_norm": 0.37835144996643066, + "learning_rate": 0.00016097427832438685, + "loss": 1.419, + "step": 15029 + }, + { + "epoch": 0.19530799505557603, + "grad_norm": 0.3936806619167328, + "learning_rate": 0.00016097167886247547, + "loss": 1.423, + "step": 15030 + }, + { + "epoch": 0.1953209895994919, + "grad_norm": 0.48103076219558716, + "learning_rate": 0.0001609690794005641, + "loss": 1.555, + "step": 15031 + }, + { + "epoch": 0.19533398414340777, + "grad_norm": 0.4297008216381073, + "learning_rate": 0.0001609664799386527, + "loss": 1.5039, + "step": 15032 + }, + { + "epoch": 0.19534697868732365, + "grad_norm": 0.5032608509063721, + "learning_rate": 0.00016096388047674132, + "loss": 1.6337, + "step": 15033 + }, + { + "epoch": 0.19535997323123952, + "grad_norm": 0.3664565086364746, + "learning_rate": 0.00016096128101482994, + "loss": 1.5047, + "step": 15034 + }, + { + "epoch": 0.1953729677751554, + "grad_norm": 0.32763874530792236, + "learning_rate": 0.00016095868155291857, + "loss": 1.2648, + "step": 15035 + }, + { + "epoch": 0.1953859623190713, + "grad_norm": 0.39580076932907104, + "learning_rate": 0.00016095608209100716, + "loss": 1.385, + "step": 15036 + }, + { + "epoch": 0.19539895686298717, + "grad_norm": 0.3973984122276306, + "learning_rate": 0.0001609534826290958, + "loss": 1.3607, + "step": 15037 + }, + { + "epoch": 0.19541195140690304, + "grad_norm": 0.3765105903148651, + "learning_rate": 0.0001609508831671844, + "loss": 1.4883, + "step": 15038 + }, + { + "epoch": 0.1954249459508189, + "grad_norm": 0.40490561723709106, + "learning_rate": 0.000160948283705273, + "loss": 1.4128, + "step": 15039 + }, + { + "epoch": 0.19543794049473479, + "grad_norm": 0.37630191445350647, + "learning_rate": 0.00016094568424336163, + "loss": 1.3293, + "step": 15040 + }, + { + "epoch": 0.19545093503865066, + "grad_norm": 0.36157554388046265, + "learning_rate": 0.00016094308478145023, + "loss": 1.2252, + "step": 15041 + }, + { + "epoch": 0.19546392958256653, + "grad_norm": 0.4231138825416565, + "learning_rate": 0.00016094048531953886, + "loss": 1.5418, + "step": 15042 + }, + { + "epoch": 0.1954769241264824, + "grad_norm": 0.38538050651550293, + "learning_rate": 0.00016093788585762748, + "loss": 1.5232, + "step": 15043 + }, + { + "epoch": 0.19548991867039828, + "grad_norm": 0.47394877672195435, + "learning_rate": 0.00016093528639571608, + "loss": 1.3685, + "step": 15044 + }, + { + "epoch": 0.19550291321431415, + "grad_norm": 0.4126327931880951, + "learning_rate": 0.0001609326869338047, + "loss": 1.575, + "step": 15045 + }, + { + "epoch": 0.19551590775823002, + "grad_norm": 0.5006478428840637, + "learning_rate": 0.00016093008747189333, + "loss": 1.6771, + "step": 15046 + }, + { + "epoch": 0.1955289023021459, + "grad_norm": 0.3928852081298828, + "learning_rate": 0.00016092748800998195, + "loss": 1.314, + "step": 15047 + }, + { + "epoch": 0.19554189684606177, + "grad_norm": 0.4020337462425232, + "learning_rate": 0.00016092488854807055, + "loss": 1.4295, + "step": 15048 + }, + { + "epoch": 0.19555489138997764, + "grad_norm": 0.3356950581073761, + "learning_rate": 0.00016092228908615917, + "loss": 1.5733, + "step": 15049 + }, + { + "epoch": 0.19556788593389351, + "grad_norm": 0.4210571348667145, + "learning_rate": 0.0001609196896242478, + "loss": 1.4687, + "step": 15050 + }, + { + "epoch": 0.1955808804778094, + "grad_norm": 0.41777604818344116, + "learning_rate": 0.0001609170901623364, + "loss": 1.2899, + "step": 15051 + }, + { + "epoch": 0.19559387502172526, + "grad_norm": 0.32231879234313965, + "learning_rate": 0.00016091449070042502, + "loss": 1.1813, + "step": 15052 + }, + { + "epoch": 0.19560686956564113, + "grad_norm": 0.4138682186603546, + "learning_rate": 0.00016091189123851364, + "loss": 1.5524, + "step": 15053 + }, + { + "epoch": 0.195619864109557, + "grad_norm": 0.37750229239463806, + "learning_rate": 0.00016090929177660224, + "loss": 1.3417, + "step": 15054 + }, + { + "epoch": 0.19563285865347288, + "grad_norm": 0.40623587369918823, + "learning_rate": 0.00016090669231469087, + "loss": 1.391, + "step": 15055 + }, + { + "epoch": 0.19564585319738875, + "grad_norm": 0.5168881416320801, + "learning_rate": 0.00016090409285277946, + "loss": 1.4582, + "step": 15056 + }, + { + "epoch": 0.19565884774130463, + "grad_norm": 0.49380379915237427, + "learning_rate": 0.00016090149339086812, + "loss": 1.5014, + "step": 15057 + }, + { + "epoch": 0.1956718422852205, + "grad_norm": 0.4175034761428833, + "learning_rate": 0.0001608988939289567, + "loss": 1.3414, + "step": 15058 + }, + { + "epoch": 0.19568483682913637, + "grad_norm": 0.32605743408203125, + "learning_rate": 0.00016089629446704534, + "loss": 1.2939, + "step": 15059 + }, + { + "epoch": 0.19569783137305224, + "grad_norm": 0.39813730120658875, + "learning_rate": 0.00016089369500513393, + "loss": 1.6173, + "step": 15060 + }, + { + "epoch": 0.19571082591696812, + "grad_norm": 0.41415518522262573, + "learning_rate": 0.00016089109554322256, + "loss": 1.4739, + "step": 15061 + }, + { + "epoch": 0.195723820460884, + "grad_norm": 0.5410938262939453, + "learning_rate": 0.00016088849608131118, + "loss": 1.4342, + "step": 15062 + }, + { + "epoch": 0.19573681500479986, + "grad_norm": 0.4245575964450836, + "learning_rate": 0.00016088589661939978, + "loss": 1.4338, + "step": 15063 + }, + { + "epoch": 0.19574980954871574, + "grad_norm": 0.4145648777484894, + "learning_rate": 0.0001608832971574884, + "loss": 1.5126, + "step": 15064 + }, + { + "epoch": 0.1957628040926316, + "grad_norm": 0.4183512330055237, + "learning_rate": 0.00016088069769557703, + "loss": 1.4174, + "step": 15065 + }, + { + "epoch": 0.19577579863654748, + "grad_norm": 0.30564042925834656, + "learning_rate": 0.00016087809823366565, + "loss": 1.4225, + "step": 15066 + }, + { + "epoch": 0.19578879318046336, + "grad_norm": 0.3914017677307129, + "learning_rate": 0.00016087549877175425, + "loss": 1.4402, + "step": 15067 + }, + { + "epoch": 0.19580178772437923, + "grad_norm": 0.4162934422492981, + "learning_rate": 0.00016087289930984285, + "loss": 1.3736, + "step": 15068 + }, + { + "epoch": 0.1958147822682951, + "grad_norm": 0.40347203612327576, + "learning_rate": 0.0001608702998479315, + "loss": 1.3863, + "step": 15069 + }, + { + "epoch": 0.19582777681221097, + "grad_norm": 0.30318590998649597, + "learning_rate": 0.0001608677003860201, + "loss": 1.2831, + "step": 15070 + }, + { + "epoch": 0.19584077135612685, + "grad_norm": 0.2869959771633148, + "learning_rate": 0.00016086510092410872, + "loss": 1.3392, + "step": 15071 + }, + { + "epoch": 0.19585376590004272, + "grad_norm": 0.3996869921684265, + "learning_rate": 0.00016086250146219732, + "loss": 1.3383, + "step": 15072 + }, + { + "epoch": 0.1958667604439586, + "grad_norm": 0.43461596965789795, + "learning_rate": 0.00016085990200028594, + "loss": 1.436, + "step": 15073 + }, + { + "epoch": 0.19587975498787447, + "grad_norm": 0.4679774343967438, + "learning_rate": 0.00016085730253837457, + "loss": 1.5102, + "step": 15074 + }, + { + "epoch": 0.19589274953179034, + "grad_norm": 0.4502407908439636, + "learning_rate": 0.00016085470307646317, + "loss": 1.5869, + "step": 15075 + }, + { + "epoch": 0.1959057440757062, + "grad_norm": 0.40007972717285156, + "learning_rate": 0.0001608521036145518, + "loss": 1.4142, + "step": 15076 + }, + { + "epoch": 0.19591873861962208, + "grad_norm": 0.4622056186199188, + "learning_rate": 0.00016084950415264042, + "loss": 1.4487, + "step": 15077 + }, + { + "epoch": 0.19593173316353796, + "grad_norm": 0.40014514327049255, + "learning_rate": 0.00016084690469072904, + "loss": 1.4344, + "step": 15078 + }, + { + "epoch": 0.19594472770745383, + "grad_norm": 0.4946998655796051, + "learning_rate": 0.00016084430522881764, + "loss": 1.4279, + "step": 15079 + }, + { + "epoch": 0.1959577222513697, + "grad_norm": 0.40331342816352844, + "learning_rate": 0.00016084170576690623, + "loss": 1.2849, + "step": 15080 + }, + { + "epoch": 0.19597071679528558, + "grad_norm": 0.37406259775161743, + "learning_rate": 0.0001608391063049949, + "loss": 1.3799, + "step": 15081 + }, + { + "epoch": 0.19598371133920145, + "grad_norm": 0.40779539942741394, + "learning_rate": 0.00016083650684308348, + "loss": 1.4894, + "step": 15082 + }, + { + "epoch": 0.19599670588311732, + "grad_norm": 0.43439584970474243, + "learning_rate": 0.0001608339073811721, + "loss": 1.3144, + "step": 15083 + }, + { + "epoch": 0.1960097004270332, + "grad_norm": 0.371105432510376, + "learning_rate": 0.0001608313079192607, + "loss": 1.3048, + "step": 15084 + }, + { + "epoch": 0.19602269497094907, + "grad_norm": 0.4025285840034485, + "learning_rate": 0.00016082870845734933, + "loss": 1.3729, + "step": 15085 + }, + { + "epoch": 0.19603568951486494, + "grad_norm": 0.28473252058029175, + "learning_rate": 0.00016082610899543795, + "loss": 1.4099, + "step": 15086 + }, + { + "epoch": 0.19604868405878081, + "grad_norm": 0.5011013150215149, + "learning_rate": 0.00016082350953352655, + "loss": 1.6652, + "step": 15087 + }, + { + "epoch": 0.1960616786026967, + "grad_norm": 0.4537227749824524, + "learning_rate": 0.0001608209100716152, + "loss": 1.6578, + "step": 15088 + }, + { + "epoch": 0.19607467314661256, + "grad_norm": 0.3511347472667694, + "learning_rate": 0.0001608183106097038, + "loss": 1.3642, + "step": 15089 + }, + { + "epoch": 0.19608766769052843, + "grad_norm": 0.26621341705322266, + "learning_rate": 0.00016081571114779243, + "loss": 1.1196, + "step": 15090 + }, + { + "epoch": 0.1961006622344443, + "grad_norm": 0.41732555627822876, + "learning_rate": 0.00016081311168588102, + "loss": 1.4199, + "step": 15091 + }, + { + "epoch": 0.19611365677836018, + "grad_norm": 0.36707526445388794, + "learning_rate": 0.00016081051222396965, + "loss": 1.3991, + "step": 15092 + }, + { + "epoch": 0.19612665132227605, + "grad_norm": 0.4805380403995514, + "learning_rate": 0.00016080791276205827, + "loss": 1.4135, + "step": 15093 + }, + { + "epoch": 0.19613964586619193, + "grad_norm": 0.4907483160495758, + "learning_rate": 0.00016080531330014687, + "loss": 1.4601, + "step": 15094 + }, + { + "epoch": 0.1961526404101078, + "grad_norm": 0.419883131980896, + "learning_rate": 0.0001608027138382355, + "loss": 1.5845, + "step": 15095 + }, + { + "epoch": 0.19616563495402367, + "grad_norm": 0.4388166666030884, + "learning_rate": 0.00016080011437632412, + "loss": 1.3623, + "step": 15096 + }, + { + "epoch": 0.19617862949793954, + "grad_norm": 0.3660356402397156, + "learning_rate": 0.00016079751491441272, + "loss": 1.5341, + "step": 15097 + }, + { + "epoch": 0.19619162404185542, + "grad_norm": 0.39928776025772095, + "learning_rate": 0.00016079491545250134, + "loss": 1.5847, + "step": 15098 + }, + { + "epoch": 0.1962046185857713, + "grad_norm": 0.3672029674053192, + "learning_rate": 0.00016079231599058994, + "loss": 1.3771, + "step": 15099 + }, + { + "epoch": 0.19621761312968716, + "grad_norm": 0.3349177837371826, + "learning_rate": 0.0001607897165286786, + "loss": 1.6002, + "step": 15100 + }, + { + "epoch": 0.19623060767360304, + "grad_norm": 0.4589473009109497, + "learning_rate": 0.00016078711706676719, + "loss": 1.4055, + "step": 15101 + }, + { + "epoch": 0.1962436022175189, + "grad_norm": 0.4271509051322937, + "learning_rate": 0.0001607845176048558, + "loss": 1.3003, + "step": 15102 + }, + { + "epoch": 0.19625659676143478, + "grad_norm": 0.3963555693626404, + "learning_rate": 0.0001607819181429444, + "loss": 1.5422, + "step": 15103 + }, + { + "epoch": 0.19626959130535065, + "grad_norm": 0.4377681016921997, + "learning_rate": 0.00016077931868103303, + "loss": 1.3185, + "step": 15104 + }, + { + "epoch": 0.19628258584926653, + "grad_norm": 0.3426659405231476, + "learning_rate": 0.00016077671921912166, + "loss": 1.3945, + "step": 15105 + }, + { + "epoch": 0.1962955803931824, + "grad_norm": 0.3157619833946228, + "learning_rate": 0.00016077411975721025, + "loss": 1.2127, + "step": 15106 + }, + { + "epoch": 0.19630857493709827, + "grad_norm": 0.40778079628944397, + "learning_rate": 0.00016077152029529888, + "loss": 1.5935, + "step": 15107 + }, + { + "epoch": 0.19632156948101415, + "grad_norm": 0.2817988097667694, + "learning_rate": 0.0001607689208333875, + "loss": 1.173, + "step": 15108 + }, + { + "epoch": 0.19633456402493002, + "grad_norm": 0.44415712356567383, + "learning_rate": 0.0001607663213714761, + "loss": 1.5046, + "step": 15109 + }, + { + "epoch": 0.1963475585688459, + "grad_norm": 0.36379292607307434, + "learning_rate": 0.00016076372190956473, + "loss": 1.4882, + "step": 15110 + }, + { + "epoch": 0.19636055311276177, + "grad_norm": 0.35104623436927795, + "learning_rate": 0.00016076112244765332, + "loss": 1.4546, + "step": 15111 + }, + { + "epoch": 0.19637354765667767, + "grad_norm": 0.31093838810920715, + "learning_rate": 0.00016075852298574197, + "loss": 1.2872, + "step": 15112 + }, + { + "epoch": 0.19638654220059354, + "grad_norm": 0.38891029357910156, + "learning_rate": 0.00016075592352383057, + "loss": 1.5188, + "step": 15113 + }, + { + "epoch": 0.1963995367445094, + "grad_norm": 0.40772297978401184, + "learning_rate": 0.0001607533240619192, + "loss": 1.4282, + "step": 15114 + }, + { + "epoch": 0.19641253128842528, + "grad_norm": 0.3571866750717163, + "learning_rate": 0.0001607507246000078, + "loss": 1.206, + "step": 15115 + }, + { + "epoch": 0.19642552583234116, + "grad_norm": 0.535038948059082, + "learning_rate": 0.00016074812513809642, + "loss": 1.4726, + "step": 15116 + }, + { + "epoch": 0.19643852037625703, + "grad_norm": 0.4346778094768524, + "learning_rate": 0.00016074552567618504, + "loss": 1.3508, + "step": 15117 + }, + { + "epoch": 0.1964515149201729, + "grad_norm": 0.4804379940032959, + "learning_rate": 0.00016074292621427364, + "loss": 1.3864, + "step": 15118 + }, + { + "epoch": 0.19646450946408878, + "grad_norm": 0.4321335256099701, + "learning_rate": 0.00016074032675236226, + "loss": 1.2403, + "step": 15119 + }, + { + "epoch": 0.19647750400800465, + "grad_norm": 0.49203550815582275, + "learning_rate": 0.0001607377272904509, + "loss": 1.5687, + "step": 15120 + }, + { + "epoch": 0.19649049855192052, + "grad_norm": 0.39814361929893494, + "learning_rate": 0.0001607351278285395, + "loss": 1.4176, + "step": 15121 + }, + { + "epoch": 0.1965034930958364, + "grad_norm": 0.4023113548755646, + "learning_rate": 0.0001607325283666281, + "loss": 1.4427, + "step": 15122 + }, + { + "epoch": 0.19651648763975227, + "grad_norm": 0.3334875702857971, + "learning_rate": 0.00016072992890471674, + "loss": 1.4648, + "step": 15123 + }, + { + "epoch": 0.19652948218366814, + "grad_norm": 0.4837316572666168, + "learning_rate": 0.00016072732944280536, + "loss": 1.4269, + "step": 15124 + }, + { + "epoch": 0.19654247672758401, + "grad_norm": 0.33407703042030334, + "learning_rate": 0.00016072472998089396, + "loss": 1.2473, + "step": 15125 + }, + { + "epoch": 0.1965554712714999, + "grad_norm": 0.4090714454650879, + "learning_rate": 0.00016072213051898258, + "loss": 1.4877, + "step": 15126 + }, + { + "epoch": 0.19656846581541576, + "grad_norm": 0.2927691340446472, + "learning_rate": 0.0001607195310570712, + "loss": 1.2998, + "step": 15127 + }, + { + "epoch": 0.19658146035933163, + "grad_norm": 0.465712308883667, + "learning_rate": 0.0001607169315951598, + "loss": 1.3434, + "step": 15128 + }, + { + "epoch": 0.1965944549032475, + "grad_norm": 0.3964065611362457, + "learning_rate": 0.00016071433213324843, + "loss": 1.5113, + "step": 15129 + }, + { + "epoch": 0.19660744944716338, + "grad_norm": 0.37474626302719116, + "learning_rate": 0.00016071173267133703, + "loss": 1.3561, + "step": 15130 + }, + { + "epoch": 0.19662044399107925, + "grad_norm": 0.31888917088508606, + "learning_rate": 0.00016070913320942568, + "loss": 1.2511, + "step": 15131 + }, + { + "epoch": 0.19663343853499513, + "grad_norm": 0.37481632828712463, + "learning_rate": 0.00016070653374751427, + "loss": 1.2369, + "step": 15132 + }, + { + "epoch": 0.196646433078911, + "grad_norm": 0.35175007581710815, + "learning_rate": 0.0001607039342856029, + "loss": 1.2775, + "step": 15133 + }, + { + "epoch": 0.19665942762282687, + "grad_norm": 0.4542943239212036, + "learning_rate": 0.0001607013348236915, + "loss": 1.2889, + "step": 15134 + }, + { + "epoch": 0.19667242216674274, + "grad_norm": 0.36963728070259094, + "learning_rate": 0.00016069873536178012, + "loss": 1.3964, + "step": 15135 + }, + { + "epoch": 0.19668541671065862, + "grad_norm": 0.417575478553772, + "learning_rate": 0.00016069613589986875, + "loss": 1.6066, + "step": 15136 + }, + { + "epoch": 0.1966984112545745, + "grad_norm": 0.3544687032699585, + "learning_rate": 0.00016069353643795734, + "loss": 1.3855, + "step": 15137 + }, + { + "epoch": 0.19671140579849036, + "grad_norm": 0.3952132761478424, + "learning_rate": 0.00016069093697604597, + "loss": 1.5143, + "step": 15138 + }, + { + "epoch": 0.19672440034240624, + "grad_norm": 0.3200976848602295, + "learning_rate": 0.0001606883375141346, + "loss": 1.2969, + "step": 15139 + }, + { + "epoch": 0.1967373948863221, + "grad_norm": 0.34209194779396057, + "learning_rate": 0.0001606857380522232, + "loss": 1.1786, + "step": 15140 + }, + { + "epoch": 0.19675038943023798, + "grad_norm": 0.3913453221321106, + "learning_rate": 0.0001606831385903118, + "loss": 1.3601, + "step": 15141 + }, + { + "epoch": 0.19676338397415385, + "grad_norm": 0.342191606760025, + "learning_rate": 0.0001606805391284004, + "loss": 1.3319, + "step": 15142 + }, + { + "epoch": 0.19677637851806973, + "grad_norm": 0.43631264567375183, + "learning_rate": 0.00016067793966648906, + "loss": 1.4964, + "step": 15143 + }, + { + "epoch": 0.1967893730619856, + "grad_norm": 0.37610581517219543, + "learning_rate": 0.00016067534020457766, + "loss": 1.4363, + "step": 15144 + }, + { + "epoch": 0.19680236760590147, + "grad_norm": 0.33718085289001465, + "learning_rate": 0.00016067274074266628, + "loss": 1.5437, + "step": 15145 + }, + { + "epoch": 0.19681536214981735, + "grad_norm": 0.3893381655216217, + "learning_rate": 0.00016067014128075488, + "loss": 1.331, + "step": 15146 + }, + { + "epoch": 0.19682835669373322, + "grad_norm": 0.40297672152519226, + "learning_rate": 0.0001606675418188435, + "loss": 1.3131, + "step": 15147 + }, + { + "epoch": 0.1968413512376491, + "grad_norm": 0.4165307879447937, + "learning_rate": 0.00016066494235693213, + "loss": 1.3983, + "step": 15148 + }, + { + "epoch": 0.19685434578156497, + "grad_norm": 0.38147854804992676, + "learning_rate": 0.00016066234289502073, + "loss": 1.3775, + "step": 15149 + }, + { + "epoch": 0.19686734032548084, + "grad_norm": 0.3278824985027313, + "learning_rate": 0.00016065974343310935, + "loss": 1.4299, + "step": 15150 + }, + { + "epoch": 0.1968803348693967, + "grad_norm": 0.38771364092826843, + "learning_rate": 0.00016065714397119798, + "loss": 1.3929, + "step": 15151 + }, + { + "epoch": 0.19689332941331258, + "grad_norm": 0.4045740067958832, + "learning_rate": 0.00016065454450928657, + "loss": 1.2267, + "step": 15152 + }, + { + "epoch": 0.19690632395722846, + "grad_norm": 0.3722262978553772, + "learning_rate": 0.0001606519450473752, + "loss": 1.2616, + "step": 15153 + }, + { + "epoch": 0.19691931850114433, + "grad_norm": 0.35703858733177185, + "learning_rate": 0.0001606493455854638, + "loss": 1.3488, + "step": 15154 + }, + { + "epoch": 0.1969323130450602, + "grad_norm": 0.3989024758338928, + "learning_rate": 0.00016064674612355245, + "loss": 1.5359, + "step": 15155 + }, + { + "epoch": 0.19694530758897608, + "grad_norm": 0.4781392216682434, + "learning_rate": 0.00016064414666164105, + "loss": 1.5791, + "step": 15156 + }, + { + "epoch": 0.19695830213289195, + "grad_norm": 0.4096522331237793, + "learning_rate": 0.00016064154719972967, + "loss": 1.4269, + "step": 15157 + }, + { + "epoch": 0.19697129667680782, + "grad_norm": 0.351851224899292, + "learning_rate": 0.00016063894773781827, + "loss": 1.3586, + "step": 15158 + }, + { + "epoch": 0.1969842912207237, + "grad_norm": 0.35385170578956604, + "learning_rate": 0.0001606363482759069, + "loss": 1.2723, + "step": 15159 + }, + { + "epoch": 0.19699728576463957, + "grad_norm": 0.38648566603660583, + "learning_rate": 0.00016063374881399552, + "loss": 1.4522, + "step": 15160 + }, + { + "epoch": 0.19701028030855544, + "grad_norm": 0.32445815205574036, + "learning_rate": 0.0001606311493520841, + "loss": 1.4577, + "step": 15161 + }, + { + "epoch": 0.19702327485247131, + "grad_norm": 0.327451229095459, + "learning_rate": 0.00016062854989017276, + "loss": 1.192, + "step": 15162 + }, + { + "epoch": 0.1970362693963872, + "grad_norm": 0.4033142924308777, + "learning_rate": 0.00016062595042826136, + "loss": 1.2914, + "step": 15163 + }, + { + "epoch": 0.19704926394030306, + "grad_norm": 0.45929038524627686, + "learning_rate": 0.00016062335096634996, + "loss": 1.4588, + "step": 15164 + }, + { + "epoch": 0.19706225848421893, + "grad_norm": 0.3005059063434601, + "learning_rate": 0.00016062075150443858, + "loss": 1.4982, + "step": 15165 + }, + { + "epoch": 0.1970752530281348, + "grad_norm": 0.38960590958595276, + "learning_rate": 0.0001606181520425272, + "loss": 1.2371, + "step": 15166 + }, + { + "epoch": 0.19708824757205068, + "grad_norm": 0.4554326832294464, + "learning_rate": 0.00016061555258061583, + "loss": 1.5176, + "step": 15167 + }, + { + "epoch": 0.19710124211596655, + "grad_norm": 0.4252159595489502, + "learning_rate": 0.00016061295311870443, + "loss": 1.2863, + "step": 15168 + }, + { + "epoch": 0.19711423665988242, + "grad_norm": 0.43140292167663574, + "learning_rate": 0.00016061035365679306, + "loss": 1.4859, + "step": 15169 + }, + { + "epoch": 0.1971272312037983, + "grad_norm": 0.3556015193462372, + "learning_rate": 0.00016060775419488168, + "loss": 1.2549, + "step": 15170 + }, + { + "epoch": 0.19714022574771417, + "grad_norm": 0.4013342559337616, + "learning_rate": 0.00016060515473297028, + "loss": 1.5748, + "step": 15171 + }, + { + "epoch": 0.19715322029163004, + "grad_norm": 0.517819881439209, + "learning_rate": 0.0001606025552710589, + "loss": 1.4189, + "step": 15172 + }, + { + "epoch": 0.19716621483554592, + "grad_norm": 0.49728506803512573, + "learning_rate": 0.0001605999558091475, + "loss": 1.5233, + "step": 15173 + }, + { + "epoch": 0.1971792093794618, + "grad_norm": 0.4505499303340912, + "learning_rate": 0.00016059735634723615, + "loss": 1.5079, + "step": 15174 + }, + { + "epoch": 0.19719220392337766, + "grad_norm": 0.4149855673313141, + "learning_rate": 0.00016059475688532475, + "loss": 1.364, + "step": 15175 + }, + { + "epoch": 0.19720519846729354, + "grad_norm": 0.3838134706020355, + "learning_rate": 0.00016059215742341335, + "loss": 1.4092, + "step": 15176 + }, + { + "epoch": 0.1972181930112094, + "grad_norm": 0.3868907690048218, + "learning_rate": 0.00016058955796150197, + "loss": 1.3183, + "step": 15177 + }, + { + "epoch": 0.19723118755512528, + "grad_norm": 0.469340980052948, + "learning_rate": 0.0001605869584995906, + "loss": 1.4049, + "step": 15178 + }, + { + "epoch": 0.19724418209904115, + "grad_norm": 0.4031793475151062, + "learning_rate": 0.00016058435903767922, + "loss": 1.5555, + "step": 15179 + }, + { + "epoch": 0.19725717664295703, + "grad_norm": 0.3342891335487366, + "learning_rate": 0.00016058175957576782, + "loss": 1.4964, + "step": 15180 + }, + { + "epoch": 0.1972701711868729, + "grad_norm": 0.40358588099479675, + "learning_rate": 0.00016057916011385644, + "loss": 1.375, + "step": 15181 + }, + { + "epoch": 0.19728316573078877, + "grad_norm": 0.39205893874168396, + "learning_rate": 0.00016057656065194506, + "loss": 1.5246, + "step": 15182 + }, + { + "epoch": 0.19729616027470465, + "grad_norm": 0.36472901701927185, + "learning_rate": 0.00016057396119003366, + "loss": 1.3433, + "step": 15183 + }, + { + "epoch": 0.19730915481862052, + "grad_norm": 0.45858439803123474, + "learning_rate": 0.0001605713617281223, + "loss": 1.5803, + "step": 15184 + }, + { + "epoch": 0.1973221493625364, + "grad_norm": 0.47431516647338867, + "learning_rate": 0.00016056876226621088, + "loss": 1.4762, + "step": 15185 + }, + { + "epoch": 0.19733514390645226, + "grad_norm": 0.4197148382663727, + "learning_rate": 0.00016056616280429954, + "loss": 1.441, + "step": 15186 + }, + { + "epoch": 0.19734813845036814, + "grad_norm": 0.3786357343196869, + "learning_rate": 0.00016056356334238813, + "loss": 1.2299, + "step": 15187 + }, + { + "epoch": 0.19736113299428404, + "grad_norm": 0.43684062361717224, + "learning_rate": 0.00016056096388047676, + "loss": 1.5556, + "step": 15188 + }, + { + "epoch": 0.1973741275381999, + "grad_norm": 0.37042975425720215, + "learning_rate": 0.00016055836441856535, + "loss": 1.5341, + "step": 15189 + }, + { + "epoch": 0.19738712208211578, + "grad_norm": 0.4426417648792267, + "learning_rate": 0.00016055576495665398, + "loss": 1.4656, + "step": 15190 + }, + { + "epoch": 0.19740011662603166, + "grad_norm": 0.38509050011634827, + "learning_rate": 0.0001605531654947426, + "loss": 1.588, + "step": 15191 + }, + { + "epoch": 0.19741311116994753, + "grad_norm": 0.3662339150905609, + "learning_rate": 0.0001605505660328312, + "loss": 1.4387, + "step": 15192 + }, + { + "epoch": 0.1974261057138634, + "grad_norm": 0.42003586888313293, + "learning_rate": 0.00016054796657091983, + "loss": 1.3125, + "step": 15193 + }, + { + "epoch": 0.19743910025777928, + "grad_norm": 0.3900770843029022, + "learning_rate": 0.00016054536710900845, + "loss": 1.3786, + "step": 15194 + }, + { + "epoch": 0.19745209480169515, + "grad_norm": 0.37473246455192566, + "learning_rate": 0.00016054276764709705, + "loss": 1.3634, + "step": 15195 + }, + { + "epoch": 0.19746508934561102, + "grad_norm": 0.43817541003227234, + "learning_rate": 0.00016054016818518567, + "loss": 1.3275, + "step": 15196 + }, + { + "epoch": 0.1974780838895269, + "grad_norm": 0.4224016070365906, + "learning_rate": 0.0001605375687232743, + "loss": 1.5399, + "step": 15197 + }, + { + "epoch": 0.19749107843344277, + "grad_norm": 0.40805330872535706, + "learning_rate": 0.00016053496926136292, + "loss": 1.3911, + "step": 15198 + }, + { + "epoch": 0.19750407297735864, + "grad_norm": 0.42046448588371277, + "learning_rate": 0.00016053236979945152, + "loss": 1.4295, + "step": 15199 + }, + { + "epoch": 0.19751706752127451, + "grad_norm": 0.37618348002433777, + "learning_rate": 0.00016052977033754014, + "loss": 1.3308, + "step": 15200 + }, + { + "epoch": 0.1975300620651904, + "grad_norm": 0.3812530040740967, + "learning_rate": 0.00016052717087562877, + "loss": 1.5153, + "step": 15201 + }, + { + "epoch": 0.19754305660910626, + "grad_norm": 0.3528536558151245, + "learning_rate": 0.00016052457141371736, + "loss": 1.4887, + "step": 15202 + }, + { + "epoch": 0.19755605115302213, + "grad_norm": 0.4155598282814026, + "learning_rate": 0.000160521971951806, + "loss": 1.5151, + "step": 15203 + }, + { + "epoch": 0.197569045696938, + "grad_norm": 0.4857502579689026, + "learning_rate": 0.0001605193724898946, + "loss": 1.6726, + "step": 15204 + }, + { + "epoch": 0.19758204024085388, + "grad_norm": 0.4043503999710083, + "learning_rate": 0.00016051677302798324, + "loss": 1.4527, + "step": 15205 + }, + { + "epoch": 0.19759503478476975, + "grad_norm": 0.36850136518478394, + "learning_rate": 0.00016051417356607184, + "loss": 1.478, + "step": 15206 + }, + { + "epoch": 0.19760802932868562, + "grad_norm": 0.44881299138069153, + "learning_rate": 0.00016051157410416043, + "loss": 1.3344, + "step": 15207 + }, + { + "epoch": 0.1976210238726015, + "grad_norm": 0.45037394762039185, + "learning_rate": 0.00016050897464224906, + "loss": 1.4538, + "step": 15208 + }, + { + "epoch": 0.19763401841651737, + "grad_norm": 0.3963901698589325, + "learning_rate": 0.00016050637518033768, + "loss": 1.4962, + "step": 15209 + }, + { + "epoch": 0.19764701296043324, + "grad_norm": 0.3214680850505829, + "learning_rate": 0.0001605037757184263, + "loss": 1.2978, + "step": 15210 + }, + { + "epoch": 0.19766000750434912, + "grad_norm": 0.38941630721092224, + "learning_rate": 0.0001605011762565149, + "loss": 1.4117, + "step": 15211 + }, + { + "epoch": 0.197673002048265, + "grad_norm": 0.31193453073501587, + "learning_rate": 0.00016049857679460353, + "loss": 1.5375, + "step": 15212 + }, + { + "epoch": 0.19768599659218086, + "grad_norm": 0.4008481502532959, + "learning_rate": 0.00016049597733269215, + "loss": 1.2097, + "step": 15213 + }, + { + "epoch": 0.19769899113609674, + "grad_norm": 0.4754476845264435, + "learning_rate": 0.00016049337787078075, + "loss": 1.4644, + "step": 15214 + }, + { + "epoch": 0.1977119856800126, + "grad_norm": 0.4401509165763855, + "learning_rate": 0.00016049077840886937, + "loss": 1.3521, + "step": 15215 + }, + { + "epoch": 0.19772498022392848, + "grad_norm": 0.40085768699645996, + "learning_rate": 0.00016048817894695797, + "loss": 1.4934, + "step": 15216 + }, + { + "epoch": 0.19773797476784435, + "grad_norm": 0.4902122914791107, + "learning_rate": 0.00016048557948504662, + "loss": 1.4409, + "step": 15217 + }, + { + "epoch": 0.19775096931176023, + "grad_norm": 0.4231351613998413, + "learning_rate": 0.00016048298002313522, + "loss": 1.4791, + "step": 15218 + }, + { + "epoch": 0.1977639638556761, + "grad_norm": 0.3941724896430969, + "learning_rate": 0.00016048038056122382, + "loss": 1.4948, + "step": 15219 + }, + { + "epoch": 0.19777695839959197, + "grad_norm": 0.47850003838539124, + "learning_rate": 0.00016047778109931244, + "loss": 1.3973, + "step": 15220 + }, + { + "epoch": 0.19778995294350785, + "grad_norm": 0.4435640871524811, + "learning_rate": 0.00016047518163740107, + "loss": 1.4406, + "step": 15221 + }, + { + "epoch": 0.19780294748742372, + "grad_norm": 0.3991568684577942, + "learning_rate": 0.0001604725821754897, + "loss": 1.5071, + "step": 15222 + }, + { + "epoch": 0.1978159420313396, + "grad_norm": 0.4084078371524811, + "learning_rate": 0.0001604699827135783, + "loss": 1.4639, + "step": 15223 + }, + { + "epoch": 0.19782893657525547, + "grad_norm": 0.5762602090835571, + "learning_rate": 0.00016046738325166691, + "loss": 1.6933, + "step": 15224 + }, + { + "epoch": 0.19784193111917134, + "grad_norm": 0.45471665263175964, + "learning_rate": 0.00016046478378975554, + "loss": 1.4782, + "step": 15225 + }, + { + "epoch": 0.1978549256630872, + "grad_norm": 0.4198942184448242, + "learning_rate": 0.00016046218432784414, + "loss": 1.4061, + "step": 15226 + }, + { + "epoch": 0.19786792020700308, + "grad_norm": 0.4814091920852661, + "learning_rate": 0.00016045958486593276, + "loss": 1.5305, + "step": 15227 + }, + { + "epoch": 0.19788091475091896, + "grad_norm": 0.4235839247703552, + "learning_rate": 0.00016045698540402136, + "loss": 1.4895, + "step": 15228 + }, + { + "epoch": 0.19789390929483483, + "grad_norm": 0.5342458486557007, + "learning_rate": 0.00016045438594211, + "loss": 1.458, + "step": 15229 + }, + { + "epoch": 0.1979069038387507, + "grad_norm": 0.3506269156932831, + "learning_rate": 0.0001604517864801986, + "loss": 1.5893, + "step": 15230 + }, + { + "epoch": 0.19791989838266658, + "grad_norm": 0.49221667647361755, + "learning_rate": 0.0001604491870182872, + "loss": 1.4676, + "step": 15231 + }, + { + "epoch": 0.19793289292658245, + "grad_norm": 0.44165927171707153, + "learning_rate": 0.00016044658755637583, + "loss": 1.3752, + "step": 15232 + }, + { + "epoch": 0.19794588747049832, + "grad_norm": 0.36550450325012207, + "learning_rate": 0.00016044398809446445, + "loss": 1.4414, + "step": 15233 + }, + { + "epoch": 0.1979588820144142, + "grad_norm": 0.377628892660141, + "learning_rate": 0.00016044138863255308, + "loss": 1.3878, + "step": 15234 + }, + { + "epoch": 0.19797187655833007, + "grad_norm": 0.47497355937957764, + "learning_rate": 0.00016043878917064167, + "loss": 1.4643, + "step": 15235 + }, + { + "epoch": 0.19798487110224594, + "grad_norm": 0.46771544218063354, + "learning_rate": 0.0001604361897087303, + "loss": 1.4994, + "step": 15236 + }, + { + "epoch": 0.1979978656461618, + "grad_norm": 0.45935195684432983, + "learning_rate": 0.00016043359024681892, + "loss": 1.4672, + "step": 15237 + }, + { + "epoch": 0.1980108601900777, + "grad_norm": 0.43917736411094666, + "learning_rate": 0.00016043099078490752, + "loss": 1.5373, + "step": 15238 + }, + { + "epoch": 0.19802385473399356, + "grad_norm": 0.2885652482509613, + "learning_rate": 0.00016042839132299615, + "loss": 1.2032, + "step": 15239 + }, + { + "epoch": 0.19803684927790943, + "grad_norm": 0.39125555753707886, + "learning_rate": 0.00016042579186108477, + "loss": 1.5456, + "step": 15240 + }, + { + "epoch": 0.1980498438218253, + "grad_norm": 0.3073153793811798, + "learning_rate": 0.0001604231923991734, + "loss": 1.4896, + "step": 15241 + }, + { + "epoch": 0.19806283836574118, + "grad_norm": 0.37376752495765686, + "learning_rate": 0.000160420592937262, + "loss": 1.4096, + "step": 15242 + }, + { + "epoch": 0.19807583290965705, + "grad_norm": 0.37771785259246826, + "learning_rate": 0.00016041799347535062, + "loss": 1.4587, + "step": 15243 + }, + { + "epoch": 0.19808882745357292, + "grad_norm": 0.5241137742996216, + "learning_rate": 0.00016041539401343924, + "loss": 1.4467, + "step": 15244 + }, + { + "epoch": 0.1981018219974888, + "grad_norm": 0.44248658418655396, + "learning_rate": 0.00016041279455152784, + "loss": 1.3923, + "step": 15245 + }, + { + "epoch": 0.19811481654140467, + "grad_norm": 0.4415137469768524, + "learning_rate": 0.00016041019508961646, + "loss": 1.4749, + "step": 15246 + }, + { + "epoch": 0.19812781108532054, + "grad_norm": 0.41170668601989746, + "learning_rate": 0.00016040759562770506, + "loss": 1.6811, + "step": 15247 + }, + { + "epoch": 0.19814080562923642, + "grad_norm": 0.3179848790168762, + "learning_rate": 0.00016040499616579368, + "loss": 1.2587, + "step": 15248 + }, + { + "epoch": 0.1981538001731523, + "grad_norm": 0.3288537859916687, + "learning_rate": 0.0001604023967038823, + "loss": 1.4788, + "step": 15249 + }, + { + "epoch": 0.19816679471706816, + "grad_norm": 0.4839995801448822, + "learning_rate": 0.0001603997972419709, + "loss": 1.3582, + "step": 15250 + }, + { + "epoch": 0.19817978926098404, + "grad_norm": 0.3626045882701874, + "learning_rate": 0.00016039719778005953, + "loss": 1.4185, + "step": 15251 + }, + { + "epoch": 0.1981927838048999, + "grad_norm": 0.31482669711112976, + "learning_rate": 0.00016039459831814816, + "loss": 1.3867, + "step": 15252 + }, + { + "epoch": 0.19820577834881578, + "grad_norm": 0.5015284419059753, + "learning_rate": 0.00016039199885623678, + "loss": 1.3923, + "step": 15253 + }, + { + "epoch": 0.19821877289273165, + "grad_norm": 0.43372347950935364, + "learning_rate": 0.00016038939939432538, + "loss": 1.6296, + "step": 15254 + }, + { + "epoch": 0.19823176743664753, + "grad_norm": 0.38777047395706177, + "learning_rate": 0.000160386799932414, + "loss": 1.4918, + "step": 15255 + }, + { + "epoch": 0.1982447619805634, + "grad_norm": 0.3947080075740814, + "learning_rate": 0.00016038420047050263, + "loss": 1.4615, + "step": 15256 + }, + { + "epoch": 0.19825775652447927, + "grad_norm": 0.44883888959884644, + "learning_rate": 0.00016038160100859122, + "loss": 1.3895, + "step": 15257 + }, + { + "epoch": 0.19827075106839515, + "grad_norm": 0.3320392370223999, + "learning_rate": 0.00016037900154667985, + "loss": 1.1723, + "step": 15258 + }, + { + "epoch": 0.19828374561231102, + "grad_norm": 0.47458067536354065, + "learning_rate": 0.00016037640208476845, + "loss": 1.3871, + "step": 15259 + }, + { + "epoch": 0.1982967401562269, + "grad_norm": 0.3882908821105957, + "learning_rate": 0.00016037380262285707, + "loss": 1.5049, + "step": 15260 + }, + { + "epoch": 0.19830973470014276, + "grad_norm": 0.46307921409606934, + "learning_rate": 0.0001603712031609457, + "loss": 1.4342, + "step": 15261 + }, + { + "epoch": 0.19832272924405864, + "grad_norm": 0.3385463058948517, + "learning_rate": 0.0001603686036990343, + "loss": 1.4359, + "step": 15262 + }, + { + "epoch": 0.1983357237879745, + "grad_norm": 0.4020019769668579, + "learning_rate": 0.00016036600423712292, + "loss": 1.4977, + "step": 15263 + }, + { + "epoch": 0.1983487183318904, + "grad_norm": 0.3457539677619934, + "learning_rate": 0.00016036340477521154, + "loss": 1.4751, + "step": 15264 + }, + { + "epoch": 0.19836171287580628, + "grad_norm": 0.3555501103401184, + "learning_rate": 0.00016036080531330017, + "loss": 1.4812, + "step": 15265 + }, + { + "epoch": 0.19837470741972216, + "grad_norm": 0.3446584939956665, + "learning_rate": 0.00016035820585138876, + "loss": 1.2154, + "step": 15266 + }, + { + "epoch": 0.19838770196363803, + "grad_norm": 0.39608034491539, + "learning_rate": 0.0001603556063894774, + "loss": 1.588, + "step": 15267 + }, + { + "epoch": 0.1984006965075539, + "grad_norm": 0.30151817202568054, + "learning_rate": 0.000160353006927566, + "loss": 1.5172, + "step": 15268 + }, + { + "epoch": 0.19841369105146978, + "grad_norm": 0.40922778844833374, + "learning_rate": 0.0001603504074656546, + "loss": 1.3745, + "step": 15269 + }, + { + "epoch": 0.19842668559538565, + "grad_norm": 0.45608213543891907, + "learning_rate": 0.00016034780800374323, + "loss": 1.5089, + "step": 15270 + }, + { + "epoch": 0.19843968013930152, + "grad_norm": 0.4785734713077545, + "learning_rate": 0.00016034520854183186, + "loss": 1.4504, + "step": 15271 + }, + { + "epoch": 0.1984526746832174, + "grad_norm": 0.4256673753261566, + "learning_rate": 0.00016034260907992048, + "loss": 1.2851, + "step": 15272 + }, + { + "epoch": 0.19846566922713327, + "grad_norm": 0.328163743019104, + "learning_rate": 0.00016034000961800908, + "loss": 1.4876, + "step": 15273 + }, + { + "epoch": 0.19847866377104914, + "grad_norm": 0.4179215133190155, + "learning_rate": 0.00016033741015609768, + "loss": 1.4193, + "step": 15274 + }, + { + "epoch": 0.198491658314965, + "grad_norm": 0.44137489795684814, + "learning_rate": 0.00016033481069418633, + "loss": 1.5518, + "step": 15275 + }, + { + "epoch": 0.1985046528588809, + "grad_norm": 0.4182802140712738, + "learning_rate": 0.00016033221123227493, + "loss": 1.4004, + "step": 15276 + }, + { + "epoch": 0.19851764740279676, + "grad_norm": 0.3562569320201874, + "learning_rate": 0.00016032961177036355, + "loss": 1.2554, + "step": 15277 + }, + { + "epoch": 0.19853064194671263, + "grad_norm": 0.35423633456230164, + "learning_rate": 0.00016032701230845215, + "loss": 1.2387, + "step": 15278 + }, + { + "epoch": 0.1985436364906285, + "grad_norm": 0.4391341209411621, + "learning_rate": 0.00016032441284654077, + "loss": 1.4935, + "step": 15279 + }, + { + "epoch": 0.19855663103454438, + "grad_norm": 0.36307451128959656, + "learning_rate": 0.0001603218133846294, + "loss": 1.236, + "step": 15280 + }, + { + "epoch": 0.19856962557846025, + "grad_norm": 0.3874627351760864, + "learning_rate": 0.000160319213922718, + "loss": 1.3927, + "step": 15281 + }, + { + "epoch": 0.19858262012237612, + "grad_norm": 0.39018988609313965, + "learning_rate": 0.00016031661446080662, + "loss": 1.3894, + "step": 15282 + }, + { + "epoch": 0.198595614666292, + "grad_norm": 0.33459872007369995, + "learning_rate": 0.00016031401499889524, + "loss": 1.2518, + "step": 15283 + }, + { + "epoch": 0.19860860921020787, + "grad_norm": 0.49354103207588196, + "learning_rate": 0.00016031141553698387, + "loss": 1.4767, + "step": 15284 + }, + { + "epoch": 0.19862160375412374, + "grad_norm": 0.35415980219841003, + "learning_rate": 0.00016030881607507247, + "loss": 1.3861, + "step": 15285 + }, + { + "epoch": 0.19863459829803962, + "grad_norm": 0.2480810284614563, + "learning_rate": 0.00016030621661316106, + "loss": 1.3568, + "step": 15286 + }, + { + "epoch": 0.1986475928419555, + "grad_norm": 0.3108460009098053, + "learning_rate": 0.00016030361715124971, + "loss": 1.3442, + "step": 15287 + }, + { + "epoch": 0.19866058738587136, + "grad_norm": 0.2840315103530884, + "learning_rate": 0.0001603010176893383, + "loss": 1.4583, + "step": 15288 + }, + { + "epoch": 0.19867358192978724, + "grad_norm": 0.37063631415367126, + "learning_rate": 0.00016029841822742694, + "loss": 1.4405, + "step": 15289 + }, + { + "epoch": 0.1986865764737031, + "grad_norm": 0.4510897099971771, + "learning_rate": 0.00016029581876551553, + "loss": 1.3659, + "step": 15290 + }, + { + "epoch": 0.19869957101761898, + "grad_norm": 0.3901059031486511, + "learning_rate": 0.00016029321930360416, + "loss": 1.6081, + "step": 15291 + }, + { + "epoch": 0.19871256556153485, + "grad_norm": 0.5823162198066711, + "learning_rate": 0.00016029061984169278, + "loss": 1.4775, + "step": 15292 + }, + { + "epoch": 0.19872556010545073, + "grad_norm": 0.37442460656166077, + "learning_rate": 0.00016028802037978138, + "loss": 1.368, + "step": 15293 + }, + { + "epoch": 0.1987385546493666, + "grad_norm": 0.3791543245315552, + "learning_rate": 0.00016028542091787, + "loss": 1.2698, + "step": 15294 + }, + { + "epoch": 0.19875154919328247, + "grad_norm": 0.4248009920120239, + "learning_rate": 0.00016028282145595863, + "loss": 1.4038, + "step": 15295 + }, + { + "epoch": 0.19876454373719835, + "grad_norm": 0.41493403911590576, + "learning_rate": 0.00016028022199404725, + "loss": 1.6313, + "step": 15296 + }, + { + "epoch": 0.19877753828111422, + "grad_norm": 0.42783305048942566, + "learning_rate": 0.00016027762253213585, + "loss": 1.4309, + "step": 15297 + }, + { + "epoch": 0.1987905328250301, + "grad_norm": 0.32946762442588806, + "learning_rate": 0.00016027502307022445, + "loss": 1.1649, + "step": 15298 + }, + { + "epoch": 0.19880352736894596, + "grad_norm": 0.4396952688694, + "learning_rate": 0.0001602724236083131, + "loss": 1.4214, + "step": 15299 + }, + { + "epoch": 0.19881652191286184, + "grad_norm": 0.5071067810058594, + "learning_rate": 0.0001602698241464017, + "loss": 1.405, + "step": 15300 + }, + { + "epoch": 0.1988295164567777, + "grad_norm": 0.4332043528556824, + "learning_rate": 0.00016026722468449032, + "loss": 1.4329, + "step": 15301 + }, + { + "epoch": 0.19884251100069358, + "grad_norm": 0.42541804909706116, + "learning_rate": 0.00016026462522257892, + "loss": 1.4254, + "step": 15302 + }, + { + "epoch": 0.19885550554460946, + "grad_norm": 0.4576720893383026, + "learning_rate": 0.00016026202576066754, + "loss": 1.3789, + "step": 15303 + }, + { + "epoch": 0.19886850008852533, + "grad_norm": 0.39556246995925903, + "learning_rate": 0.00016025942629875617, + "loss": 1.3404, + "step": 15304 + }, + { + "epoch": 0.1988814946324412, + "grad_norm": 0.41974642872810364, + "learning_rate": 0.00016025682683684477, + "loss": 1.4725, + "step": 15305 + }, + { + "epoch": 0.19889448917635708, + "grad_norm": 0.437761515378952, + "learning_rate": 0.0001602542273749334, + "loss": 1.5507, + "step": 15306 + }, + { + "epoch": 0.19890748372027295, + "grad_norm": 0.42709267139434814, + "learning_rate": 0.00016025162791302201, + "loss": 1.5817, + "step": 15307 + }, + { + "epoch": 0.19892047826418882, + "grad_norm": 0.41304272413253784, + "learning_rate": 0.00016024902845111064, + "loss": 1.5118, + "step": 15308 + }, + { + "epoch": 0.1989334728081047, + "grad_norm": 0.32319214940071106, + "learning_rate": 0.00016024642898919924, + "loss": 1.4065, + "step": 15309 + }, + { + "epoch": 0.19894646735202057, + "grad_norm": 0.680975615978241, + "learning_rate": 0.00016024382952728786, + "loss": 1.3247, + "step": 15310 + }, + { + "epoch": 0.19895946189593644, + "grad_norm": 0.4191313683986664, + "learning_rate": 0.00016024123006537648, + "loss": 1.4534, + "step": 15311 + }, + { + "epoch": 0.1989724564398523, + "grad_norm": 0.31099042296409607, + "learning_rate": 0.00016023863060346508, + "loss": 1.3029, + "step": 15312 + }, + { + "epoch": 0.1989854509837682, + "grad_norm": 0.3521687984466553, + "learning_rate": 0.0001602360311415537, + "loss": 1.2817, + "step": 15313 + }, + { + "epoch": 0.19899844552768406, + "grad_norm": 0.3875702917575836, + "learning_rate": 0.00016023343167964233, + "loss": 1.2578, + "step": 15314 + }, + { + "epoch": 0.19901144007159993, + "grad_norm": 0.3726058304309845, + "learning_rate": 0.00016023083221773093, + "loss": 1.4163, + "step": 15315 + }, + { + "epoch": 0.1990244346155158, + "grad_norm": 0.4327508211135864, + "learning_rate": 0.00016022823275581955, + "loss": 1.633, + "step": 15316 + }, + { + "epoch": 0.19903742915943168, + "grad_norm": 0.3837486505508423, + "learning_rate": 0.00016022563329390815, + "loss": 1.4178, + "step": 15317 + }, + { + "epoch": 0.19905042370334755, + "grad_norm": 0.32990044355392456, + "learning_rate": 0.0001602230338319968, + "loss": 1.4929, + "step": 15318 + }, + { + "epoch": 0.19906341824726342, + "grad_norm": 0.36149898171424866, + "learning_rate": 0.0001602204343700854, + "loss": 1.3582, + "step": 15319 + }, + { + "epoch": 0.1990764127911793, + "grad_norm": 0.41414421796798706, + "learning_rate": 0.00016021783490817402, + "loss": 1.4847, + "step": 15320 + }, + { + "epoch": 0.19908940733509517, + "grad_norm": 0.42448264360427856, + "learning_rate": 0.00016021523544626262, + "loss": 1.584, + "step": 15321 + }, + { + "epoch": 0.19910240187901104, + "grad_norm": 0.3937159776687622, + "learning_rate": 0.00016021263598435125, + "loss": 1.4891, + "step": 15322 + }, + { + "epoch": 0.19911539642292692, + "grad_norm": 0.396188348531723, + "learning_rate": 0.00016021003652243987, + "loss": 1.4622, + "step": 15323 + }, + { + "epoch": 0.1991283909668428, + "grad_norm": 0.40531226992607117, + "learning_rate": 0.00016020743706052847, + "loss": 1.5891, + "step": 15324 + }, + { + "epoch": 0.19914138551075866, + "grad_norm": 0.43410012125968933, + "learning_rate": 0.0001602048375986171, + "loss": 1.5348, + "step": 15325 + }, + { + "epoch": 0.19915438005467453, + "grad_norm": 0.288671612739563, + "learning_rate": 0.00016020223813670572, + "loss": 1.3558, + "step": 15326 + }, + { + "epoch": 0.1991673745985904, + "grad_norm": 0.413058340549469, + "learning_rate": 0.00016019963867479434, + "loss": 1.1742, + "step": 15327 + }, + { + "epoch": 0.19918036914250628, + "grad_norm": 0.42460256814956665, + "learning_rate": 0.00016019703921288294, + "loss": 1.5886, + "step": 15328 + }, + { + "epoch": 0.19919336368642215, + "grad_norm": 0.4351479411125183, + "learning_rate": 0.00016019443975097154, + "loss": 1.429, + "step": 15329 + }, + { + "epoch": 0.19920635823033803, + "grad_norm": 0.41776150465011597, + "learning_rate": 0.0001601918402890602, + "loss": 1.4502, + "step": 15330 + }, + { + "epoch": 0.1992193527742539, + "grad_norm": 0.31935542821884155, + "learning_rate": 0.00016018924082714878, + "loss": 1.3763, + "step": 15331 + }, + { + "epoch": 0.19923234731816977, + "grad_norm": 0.4103107750415802, + "learning_rate": 0.0001601866413652374, + "loss": 1.3706, + "step": 15332 + }, + { + "epoch": 0.19924534186208565, + "grad_norm": 0.4254096746444702, + "learning_rate": 0.000160184041903326, + "loss": 1.341, + "step": 15333 + }, + { + "epoch": 0.19925833640600152, + "grad_norm": 0.3568406403064728, + "learning_rate": 0.00016018144244141463, + "loss": 1.4953, + "step": 15334 + }, + { + "epoch": 0.1992713309499174, + "grad_norm": 0.36509087681770325, + "learning_rate": 0.00016017884297950326, + "loss": 1.2538, + "step": 15335 + }, + { + "epoch": 0.19928432549383326, + "grad_norm": 0.3267819881439209, + "learning_rate": 0.00016017624351759185, + "loss": 1.4212, + "step": 15336 + }, + { + "epoch": 0.19929732003774914, + "grad_norm": 0.44438377022743225, + "learning_rate": 0.00016017364405568048, + "loss": 1.3985, + "step": 15337 + }, + { + "epoch": 0.199310314581665, + "grad_norm": 0.4287395775318146, + "learning_rate": 0.0001601710445937691, + "loss": 1.4115, + "step": 15338 + }, + { + "epoch": 0.19932330912558088, + "grad_norm": 0.4248035252094269, + "learning_rate": 0.00016016844513185773, + "loss": 1.5455, + "step": 15339 + }, + { + "epoch": 0.19933630366949678, + "grad_norm": 0.44686010479927063, + "learning_rate": 0.00016016584566994632, + "loss": 1.42, + "step": 15340 + }, + { + "epoch": 0.19934929821341266, + "grad_norm": 0.2797262966632843, + "learning_rate": 0.00016016324620803492, + "loss": 1.3089, + "step": 15341 + }, + { + "epoch": 0.19936229275732853, + "grad_norm": 0.37204888463020325, + "learning_rate": 0.00016016064674612357, + "loss": 1.4282, + "step": 15342 + }, + { + "epoch": 0.1993752873012444, + "grad_norm": 0.3677528202533722, + "learning_rate": 0.00016015804728421217, + "loss": 1.4471, + "step": 15343 + }, + { + "epoch": 0.19938828184516028, + "grad_norm": 0.3339022397994995, + "learning_rate": 0.0001601554478223008, + "loss": 1.4126, + "step": 15344 + }, + { + "epoch": 0.19940127638907615, + "grad_norm": 0.36243629455566406, + "learning_rate": 0.00016015284836038942, + "loss": 1.3091, + "step": 15345 + }, + { + "epoch": 0.19941427093299202, + "grad_norm": 0.3422911763191223, + "learning_rate": 0.00016015024889847802, + "loss": 1.1941, + "step": 15346 + }, + { + "epoch": 0.1994272654769079, + "grad_norm": 0.42427539825439453, + "learning_rate": 0.00016014764943656664, + "loss": 1.4216, + "step": 15347 + }, + { + "epoch": 0.19944026002082377, + "grad_norm": 0.3341255784034729, + "learning_rate": 0.00016014504997465524, + "loss": 1.3904, + "step": 15348 + }, + { + "epoch": 0.19945325456473964, + "grad_norm": 0.3369588553905487, + "learning_rate": 0.0001601424505127439, + "loss": 1.2856, + "step": 15349 + }, + { + "epoch": 0.1994662491086555, + "grad_norm": 0.2476450800895691, + "learning_rate": 0.0001601398510508325, + "loss": 1.1781, + "step": 15350 + }, + { + "epoch": 0.1994792436525714, + "grad_norm": 0.4013558030128479, + "learning_rate": 0.0001601372515889211, + "loss": 1.3791, + "step": 15351 + }, + { + "epoch": 0.19949223819648726, + "grad_norm": 0.39862382411956787, + "learning_rate": 0.0001601346521270097, + "loss": 1.3654, + "step": 15352 + }, + { + "epoch": 0.19950523274040313, + "grad_norm": 0.4367014169692993, + "learning_rate": 0.00016013205266509833, + "loss": 1.3027, + "step": 15353 + }, + { + "epoch": 0.199518227284319, + "grad_norm": 0.33646225929260254, + "learning_rate": 0.00016012945320318696, + "loss": 1.4137, + "step": 15354 + }, + { + "epoch": 0.19953122182823488, + "grad_norm": 0.3747285008430481, + "learning_rate": 0.00016012685374127556, + "loss": 1.5891, + "step": 15355 + }, + { + "epoch": 0.19954421637215075, + "grad_norm": 0.3984653353691101, + "learning_rate": 0.00016012425427936418, + "loss": 1.5199, + "step": 15356 + }, + { + "epoch": 0.19955721091606662, + "grad_norm": 0.4099576473236084, + "learning_rate": 0.0001601216548174528, + "loss": 1.4076, + "step": 15357 + }, + { + "epoch": 0.1995702054599825, + "grad_norm": 0.2693162262439728, + "learning_rate": 0.0001601190553555414, + "loss": 1.406, + "step": 15358 + }, + { + "epoch": 0.19958320000389837, + "grad_norm": 0.3406940996646881, + "learning_rate": 0.00016011645589363003, + "loss": 1.5985, + "step": 15359 + }, + { + "epoch": 0.19959619454781424, + "grad_norm": 0.4689192771911621, + "learning_rate": 0.00016011385643171862, + "loss": 1.4187, + "step": 15360 + }, + { + "epoch": 0.19960918909173012, + "grad_norm": 0.34262481331825256, + "learning_rate": 0.00016011125696980728, + "loss": 1.3294, + "step": 15361 + }, + { + "epoch": 0.199622183635646, + "grad_norm": 0.41267237067222595, + "learning_rate": 0.00016010865750789587, + "loss": 1.3925, + "step": 15362 + }, + { + "epoch": 0.19963517817956186, + "grad_norm": 0.41196075081825256, + "learning_rate": 0.0001601060580459845, + "loss": 1.2361, + "step": 15363 + }, + { + "epoch": 0.19964817272347773, + "grad_norm": 0.405814528465271, + "learning_rate": 0.0001601034585840731, + "loss": 1.4084, + "step": 15364 + }, + { + "epoch": 0.1996611672673936, + "grad_norm": 0.44442397356033325, + "learning_rate": 0.00016010085912216172, + "loss": 1.5637, + "step": 15365 + }, + { + "epoch": 0.19967416181130948, + "grad_norm": 0.35996049642562866, + "learning_rate": 0.00016009825966025034, + "loss": 1.5295, + "step": 15366 + }, + { + "epoch": 0.19968715635522535, + "grad_norm": 0.3599914014339447, + "learning_rate": 0.00016009566019833894, + "loss": 1.2685, + "step": 15367 + }, + { + "epoch": 0.19970015089914123, + "grad_norm": 0.3731040358543396, + "learning_rate": 0.00016009306073642757, + "loss": 1.3927, + "step": 15368 + }, + { + "epoch": 0.1997131454430571, + "grad_norm": 0.37993544340133667, + "learning_rate": 0.0001600904612745162, + "loss": 1.6071, + "step": 15369 + }, + { + "epoch": 0.19972613998697297, + "grad_norm": 0.41570788621902466, + "learning_rate": 0.0001600878618126048, + "loss": 1.3826, + "step": 15370 + }, + { + "epoch": 0.19973913453088885, + "grad_norm": 0.43232133984565735, + "learning_rate": 0.0001600852623506934, + "loss": 1.3585, + "step": 15371 + }, + { + "epoch": 0.19975212907480472, + "grad_norm": 0.42845484614372253, + "learning_rate": 0.000160082662888782, + "loss": 1.4164, + "step": 15372 + }, + { + "epoch": 0.1997651236187206, + "grad_norm": 0.4609348177909851, + "learning_rate": 0.00016008006342687066, + "loss": 1.4316, + "step": 15373 + }, + { + "epoch": 0.19977811816263646, + "grad_norm": 0.34090402722358704, + "learning_rate": 0.00016007746396495926, + "loss": 1.3717, + "step": 15374 + }, + { + "epoch": 0.19979111270655234, + "grad_norm": 0.38243624567985535, + "learning_rate": 0.00016007486450304788, + "loss": 1.4322, + "step": 15375 + }, + { + "epoch": 0.1998041072504682, + "grad_norm": 0.3448341488838196, + "learning_rate": 0.00016007226504113648, + "loss": 1.0993, + "step": 15376 + }, + { + "epoch": 0.19981710179438408, + "grad_norm": 0.374994158744812, + "learning_rate": 0.0001600696655792251, + "loss": 1.5967, + "step": 15377 + }, + { + "epoch": 0.19983009633829996, + "grad_norm": 0.4522508978843689, + "learning_rate": 0.00016006706611731373, + "loss": 1.4916, + "step": 15378 + }, + { + "epoch": 0.19984309088221583, + "grad_norm": 0.4896794855594635, + "learning_rate": 0.00016006446665540233, + "loss": 1.584, + "step": 15379 + }, + { + "epoch": 0.1998560854261317, + "grad_norm": 0.3767539858818054, + "learning_rate": 0.00016006186719349095, + "loss": 1.4583, + "step": 15380 + }, + { + "epoch": 0.19986907997004758, + "grad_norm": 0.40603840351104736, + "learning_rate": 0.00016005926773157958, + "loss": 1.3325, + "step": 15381 + }, + { + "epoch": 0.19988207451396345, + "grad_norm": 0.4316093921661377, + "learning_rate": 0.00016005666826966817, + "loss": 1.3926, + "step": 15382 + }, + { + "epoch": 0.19989506905787932, + "grad_norm": 0.4680945575237274, + "learning_rate": 0.0001600540688077568, + "loss": 1.4563, + "step": 15383 + }, + { + "epoch": 0.1999080636017952, + "grad_norm": 0.46094948053359985, + "learning_rate": 0.00016005146934584542, + "loss": 1.3507, + "step": 15384 + }, + { + "epoch": 0.19992105814571107, + "grad_norm": 0.38625067472457886, + "learning_rate": 0.00016004886988393405, + "loss": 1.7112, + "step": 15385 + }, + { + "epoch": 0.19993405268962694, + "grad_norm": 0.391434907913208, + "learning_rate": 0.00016004627042202264, + "loss": 1.4305, + "step": 15386 + }, + { + "epoch": 0.1999470472335428, + "grad_norm": 0.32393285632133484, + "learning_rate": 0.00016004367096011127, + "loss": 1.2506, + "step": 15387 + }, + { + "epoch": 0.19996004177745869, + "grad_norm": 0.4099315106868744, + "learning_rate": 0.0001600410714981999, + "loss": 1.6296, + "step": 15388 + }, + { + "epoch": 0.19997303632137456, + "grad_norm": 0.4354977011680603, + "learning_rate": 0.0001600384720362885, + "loss": 1.4602, + "step": 15389 + }, + { + "epoch": 0.19998603086529043, + "grad_norm": 0.2789519131183624, + "learning_rate": 0.00016003587257437711, + "loss": 1.2519, + "step": 15390 + }, + { + "epoch": 0.1999990254092063, + "grad_norm": 0.3328530192375183, + "learning_rate": 0.0001600332731124657, + "loss": 1.316, + "step": 15391 + }, + { + "epoch": 0.20001201995312218, + "grad_norm": 0.3718877136707306, + "learning_rate": 0.00016003067365055436, + "loss": 1.4504, + "step": 15392 + }, + { + "epoch": 0.20002501449703805, + "grad_norm": 0.34923285245895386, + "learning_rate": 0.00016002807418864296, + "loss": 1.4455, + "step": 15393 + }, + { + "epoch": 0.20003800904095392, + "grad_norm": 0.4272012710571289, + "learning_rate": 0.00016002547472673159, + "loss": 1.4843, + "step": 15394 + }, + { + "epoch": 0.2000510035848698, + "grad_norm": 0.44626718759536743, + "learning_rate": 0.00016002287526482018, + "loss": 1.4089, + "step": 15395 + }, + { + "epoch": 0.20006399812878567, + "grad_norm": 0.4409841299057007, + "learning_rate": 0.0001600202758029088, + "loss": 1.4024, + "step": 15396 + }, + { + "epoch": 0.20007699267270154, + "grad_norm": 0.31712839007377625, + "learning_rate": 0.00016001767634099743, + "loss": 1.401, + "step": 15397 + }, + { + "epoch": 0.20008998721661742, + "grad_norm": 0.330126017332077, + "learning_rate": 0.00016001507687908603, + "loss": 1.4193, + "step": 15398 + }, + { + "epoch": 0.2001029817605333, + "grad_norm": 0.4245782792568207, + "learning_rate": 0.00016001247741717465, + "loss": 1.3154, + "step": 15399 + }, + { + "epoch": 0.20011597630444916, + "grad_norm": 0.3618156909942627, + "learning_rate": 0.00016000987795526328, + "loss": 1.4052, + "step": 15400 + }, + { + "epoch": 0.20012897084836503, + "grad_norm": 0.4344066381454468, + "learning_rate": 0.00016000727849335188, + "loss": 1.5139, + "step": 15401 + }, + { + "epoch": 0.2001419653922809, + "grad_norm": 0.40536969900131226, + "learning_rate": 0.0001600046790314405, + "loss": 1.5962, + "step": 15402 + }, + { + "epoch": 0.20015495993619678, + "grad_norm": 0.358058899641037, + "learning_rate": 0.0001600020795695291, + "loss": 1.4791, + "step": 15403 + }, + { + "epoch": 0.20016795448011265, + "grad_norm": 0.29033300280570984, + "learning_rate": 0.00015999948010761775, + "loss": 1.3785, + "step": 15404 + }, + { + "epoch": 0.20018094902402853, + "grad_norm": 0.4340335428714752, + "learning_rate": 0.00015999688064570635, + "loss": 1.4549, + "step": 15405 + }, + { + "epoch": 0.2001939435679444, + "grad_norm": 0.473200261592865, + "learning_rate": 0.00015999428118379497, + "loss": 1.459, + "step": 15406 + }, + { + "epoch": 0.20020693811186027, + "grad_norm": 0.3932296633720398, + "learning_rate": 0.00015999168172188357, + "loss": 1.3656, + "step": 15407 + }, + { + "epoch": 0.20021993265577614, + "grad_norm": 0.42950358986854553, + "learning_rate": 0.0001599890822599722, + "loss": 1.5815, + "step": 15408 + }, + { + "epoch": 0.20023292719969202, + "grad_norm": 0.40682581067085266, + "learning_rate": 0.00015998648279806082, + "loss": 1.416, + "step": 15409 + }, + { + "epoch": 0.2002459217436079, + "grad_norm": 0.4486320912837982, + "learning_rate": 0.00015998388333614941, + "loss": 1.2984, + "step": 15410 + }, + { + "epoch": 0.20025891628752376, + "grad_norm": 0.6088532209396362, + "learning_rate": 0.00015998128387423804, + "loss": 1.6367, + "step": 15411 + }, + { + "epoch": 0.20027191083143964, + "grad_norm": 0.4755866527557373, + "learning_rate": 0.00015997868441232666, + "loss": 1.5699, + "step": 15412 + }, + { + "epoch": 0.2002849053753555, + "grad_norm": 0.4551866054534912, + "learning_rate": 0.00015997608495041526, + "loss": 1.467, + "step": 15413 + }, + { + "epoch": 0.20029789991927138, + "grad_norm": 0.5039704442024231, + "learning_rate": 0.00015997348548850389, + "loss": 1.4165, + "step": 15414 + }, + { + "epoch": 0.20031089446318726, + "grad_norm": 0.3217463493347168, + "learning_rate": 0.00015997088602659248, + "loss": 1.315, + "step": 15415 + }, + { + "epoch": 0.20032388900710316, + "grad_norm": 0.44191160798072815, + "learning_rate": 0.00015996828656468113, + "loss": 1.4067, + "step": 15416 + }, + { + "epoch": 0.20033688355101903, + "grad_norm": 0.31735658645629883, + "learning_rate": 0.00015996568710276973, + "loss": 1.3719, + "step": 15417 + }, + { + "epoch": 0.2003498780949349, + "grad_norm": 0.3791908621788025, + "learning_rate": 0.00015996308764085836, + "loss": 1.3287, + "step": 15418 + }, + { + "epoch": 0.20036287263885078, + "grad_norm": 0.32334086298942566, + "learning_rate": 0.00015996048817894698, + "loss": 1.6866, + "step": 15419 + }, + { + "epoch": 0.20037586718276665, + "grad_norm": 0.460700660943985, + "learning_rate": 0.00015995788871703558, + "loss": 1.4302, + "step": 15420 + }, + { + "epoch": 0.20038886172668252, + "grad_norm": 0.5116479396820068, + "learning_rate": 0.0001599552892551242, + "loss": 1.3797, + "step": 15421 + }, + { + "epoch": 0.2004018562705984, + "grad_norm": 0.360805481672287, + "learning_rate": 0.0001599526897932128, + "loss": 1.33, + "step": 15422 + }, + { + "epoch": 0.20041485081451427, + "grad_norm": 0.40143775939941406, + "learning_rate": 0.00015995009033130145, + "loss": 1.5545, + "step": 15423 + }, + { + "epoch": 0.20042784535843014, + "grad_norm": 0.345742404460907, + "learning_rate": 0.00015994749086939005, + "loss": 1.3964, + "step": 15424 + }, + { + "epoch": 0.200440839902346, + "grad_norm": 0.40881749987602234, + "learning_rate": 0.00015994489140747865, + "loss": 1.7012, + "step": 15425 + }, + { + "epoch": 0.20045383444626189, + "grad_norm": 0.46395188570022583, + "learning_rate": 0.00015994229194556727, + "loss": 1.3633, + "step": 15426 + }, + { + "epoch": 0.20046682899017776, + "grad_norm": 0.4470542371273041, + "learning_rate": 0.0001599396924836559, + "loss": 1.4938, + "step": 15427 + }, + { + "epoch": 0.20047982353409363, + "grad_norm": 0.43475550413131714, + "learning_rate": 0.00015993709302174452, + "loss": 1.4408, + "step": 15428 + }, + { + "epoch": 0.2004928180780095, + "grad_norm": 0.41542017459869385, + "learning_rate": 0.00015993449355983312, + "loss": 1.305, + "step": 15429 + }, + { + "epoch": 0.20050581262192538, + "grad_norm": 0.3461635410785675, + "learning_rate": 0.00015993189409792174, + "loss": 1.3386, + "step": 15430 + }, + { + "epoch": 0.20051880716584125, + "grad_norm": 0.39723336696624756, + "learning_rate": 0.00015992929463601037, + "loss": 1.3459, + "step": 15431 + }, + { + "epoch": 0.20053180170975712, + "grad_norm": 0.41502058506011963, + "learning_rate": 0.00015992669517409896, + "loss": 1.4575, + "step": 15432 + }, + { + "epoch": 0.200544796253673, + "grad_norm": 0.455619752407074, + "learning_rate": 0.0001599240957121876, + "loss": 1.561, + "step": 15433 + }, + { + "epoch": 0.20055779079758887, + "grad_norm": 0.48920756578445435, + "learning_rate": 0.00015992149625027619, + "loss": 1.5112, + "step": 15434 + }, + { + "epoch": 0.20057078534150474, + "grad_norm": 0.4552110433578491, + "learning_rate": 0.00015991889678836484, + "loss": 1.3981, + "step": 15435 + }, + { + "epoch": 0.20058377988542062, + "grad_norm": 0.38848674297332764, + "learning_rate": 0.00015991629732645343, + "loss": 1.4132, + "step": 15436 + }, + { + "epoch": 0.2005967744293365, + "grad_norm": 0.38771361112594604, + "learning_rate": 0.00015991369786454203, + "loss": 1.157, + "step": 15437 + }, + { + "epoch": 0.20060976897325236, + "grad_norm": 0.3963020145893097, + "learning_rate": 0.00015991109840263066, + "loss": 1.3774, + "step": 15438 + }, + { + "epoch": 0.20062276351716823, + "grad_norm": 0.4398963153362274, + "learning_rate": 0.00015990849894071928, + "loss": 1.4806, + "step": 15439 + }, + { + "epoch": 0.2006357580610841, + "grad_norm": 0.42457035183906555, + "learning_rate": 0.0001599058994788079, + "loss": 1.3053, + "step": 15440 + }, + { + "epoch": 0.20064875260499998, + "grad_norm": 0.34232112765312195, + "learning_rate": 0.0001599033000168965, + "loss": 1.4357, + "step": 15441 + }, + { + "epoch": 0.20066174714891585, + "grad_norm": 0.4253518283367157, + "learning_rate": 0.00015990070055498513, + "loss": 1.4779, + "step": 15442 + }, + { + "epoch": 0.20067474169283173, + "grad_norm": 0.4288158118724823, + "learning_rate": 0.00015989810109307375, + "loss": 1.4467, + "step": 15443 + }, + { + "epoch": 0.2006877362367476, + "grad_norm": 0.3745271861553192, + "learning_rate": 0.00015989550163116235, + "loss": 1.4147, + "step": 15444 + }, + { + "epoch": 0.20070073078066347, + "grad_norm": 0.4292162358760834, + "learning_rate": 0.00015989290216925097, + "loss": 1.5221, + "step": 15445 + }, + { + "epoch": 0.20071372532457935, + "grad_norm": 0.4208000600337982, + "learning_rate": 0.00015989030270733957, + "loss": 1.5414, + "step": 15446 + }, + { + "epoch": 0.20072671986849522, + "grad_norm": 0.41436871886253357, + "learning_rate": 0.00015988770324542822, + "loss": 1.4846, + "step": 15447 + }, + { + "epoch": 0.2007397144124111, + "grad_norm": 0.42770326137542725, + "learning_rate": 0.00015988510378351682, + "loss": 1.4128, + "step": 15448 + }, + { + "epoch": 0.20075270895632696, + "grad_norm": 0.3122018575668335, + "learning_rate": 0.00015988250432160544, + "loss": 1.0307, + "step": 15449 + }, + { + "epoch": 0.20076570350024284, + "grad_norm": 0.3554200232028961, + "learning_rate": 0.00015987990485969404, + "loss": 1.5785, + "step": 15450 + }, + { + "epoch": 0.2007786980441587, + "grad_norm": 0.35506123304367065, + "learning_rate": 0.00015987730539778267, + "loss": 1.3464, + "step": 15451 + }, + { + "epoch": 0.20079169258807458, + "grad_norm": 0.3930074870586395, + "learning_rate": 0.0001598747059358713, + "loss": 1.393, + "step": 15452 + }, + { + "epoch": 0.20080468713199046, + "grad_norm": 0.3954486548900604, + "learning_rate": 0.0001598721064739599, + "loss": 1.3006, + "step": 15453 + }, + { + "epoch": 0.20081768167590633, + "grad_norm": 0.4204241931438446, + "learning_rate": 0.0001598695070120485, + "loss": 1.373, + "step": 15454 + }, + { + "epoch": 0.2008306762198222, + "grad_norm": 0.3705895245075226, + "learning_rate": 0.00015986690755013714, + "loss": 1.348, + "step": 15455 + }, + { + "epoch": 0.20084367076373807, + "grad_norm": 0.37882500886917114, + "learning_rate": 0.00015986430808822573, + "loss": 1.4137, + "step": 15456 + }, + { + "epoch": 0.20085666530765395, + "grad_norm": 0.3344724476337433, + "learning_rate": 0.00015986170862631436, + "loss": 1.5115, + "step": 15457 + }, + { + "epoch": 0.20086965985156982, + "grad_norm": 0.3381964862346649, + "learning_rate": 0.00015985910916440298, + "loss": 1.4018, + "step": 15458 + }, + { + "epoch": 0.2008826543954857, + "grad_norm": 0.4564279019832611, + "learning_rate": 0.0001598565097024916, + "loss": 1.4675, + "step": 15459 + }, + { + "epoch": 0.20089564893940157, + "grad_norm": 0.38492849469184875, + "learning_rate": 0.0001598539102405802, + "loss": 1.4442, + "step": 15460 + }, + { + "epoch": 0.20090864348331744, + "grad_norm": 0.46018293499946594, + "learning_rate": 0.00015985131077866883, + "loss": 1.3129, + "step": 15461 + }, + { + "epoch": 0.2009216380272333, + "grad_norm": 0.31527480483055115, + "learning_rate": 0.00015984871131675745, + "loss": 1.5662, + "step": 15462 + }, + { + "epoch": 0.20093463257114919, + "grad_norm": 0.4076431095600128, + "learning_rate": 0.00015984611185484605, + "loss": 1.5504, + "step": 15463 + }, + { + "epoch": 0.20094762711506506, + "grad_norm": 0.342613160610199, + "learning_rate": 0.00015984351239293468, + "loss": 1.3724, + "step": 15464 + }, + { + "epoch": 0.20096062165898093, + "grad_norm": 0.3431416451931, + "learning_rate": 0.00015984091293102327, + "loss": 1.1886, + "step": 15465 + }, + { + "epoch": 0.2009736162028968, + "grad_norm": 0.3727257251739502, + "learning_rate": 0.0001598383134691119, + "loss": 1.3544, + "step": 15466 + }, + { + "epoch": 0.20098661074681268, + "grad_norm": 0.5540410876274109, + "learning_rate": 0.00015983571400720052, + "loss": 1.2226, + "step": 15467 + }, + { + "epoch": 0.20099960529072855, + "grad_norm": 0.5565593242645264, + "learning_rate": 0.00015983311454528912, + "loss": 1.4975, + "step": 15468 + }, + { + "epoch": 0.20101259983464442, + "grad_norm": 0.3361845910549164, + "learning_rate": 0.00015983051508337774, + "loss": 1.4783, + "step": 15469 + }, + { + "epoch": 0.2010255943785603, + "grad_norm": 0.3931155502796173, + "learning_rate": 0.00015982791562146637, + "loss": 1.3817, + "step": 15470 + }, + { + "epoch": 0.20103858892247617, + "grad_norm": 0.4010556638240814, + "learning_rate": 0.000159825316159555, + "loss": 1.3986, + "step": 15471 + }, + { + "epoch": 0.20105158346639204, + "grad_norm": 0.3854156732559204, + "learning_rate": 0.0001598227166976436, + "loss": 1.2552, + "step": 15472 + }, + { + "epoch": 0.20106457801030791, + "grad_norm": 0.3804440200328827, + "learning_rate": 0.00015982011723573221, + "loss": 1.3033, + "step": 15473 + }, + { + "epoch": 0.2010775725542238, + "grad_norm": 0.36511868238449097, + "learning_rate": 0.00015981751777382084, + "loss": 1.5625, + "step": 15474 + }, + { + "epoch": 0.20109056709813966, + "grad_norm": 0.3993107080459595, + "learning_rate": 0.00015981491831190944, + "loss": 1.4384, + "step": 15475 + }, + { + "epoch": 0.20110356164205553, + "grad_norm": 0.3993953764438629, + "learning_rate": 0.00015981231884999806, + "loss": 1.3731, + "step": 15476 + }, + { + "epoch": 0.2011165561859714, + "grad_norm": 0.3434096574783325, + "learning_rate": 0.00015980971938808666, + "loss": 1.1904, + "step": 15477 + }, + { + "epoch": 0.20112955072988728, + "grad_norm": 0.3749016523361206, + "learning_rate": 0.0001598071199261753, + "loss": 1.4604, + "step": 15478 + }, + { + "epoch": 0.20114254527380315, + "grad_norm": 0.37235578894615173, + "learning_rate": 0.0001598045204642639, + "loss": 1.2544, + "step": 15479 + }, + { + "epoch": 0.20115553981771903, + "grad_norm": 0.45200005173683167, + "learning_rate": 0.0001598019210023525, + "loss": 1.2509, + "step": 15480 + }, + { + "epoch": 0.2011685343616349, + "grad_norm": 0.3638080358505249, + "learning_rate": 0.00015979932154044113, + "loss": 1.4136, + "step": 15481 + }, + { + "epoch": 0.20118152890555077, + "grad_norm": 0.3104375898838043, + "learning_rate": 0.00015979672207852975, + "loss": 1.2958, + "step": 15482 + }, + { + "epoch": 0.20119452344946664, + "grad_norm": 0.3581565320491791, + "learning_rate": 0.00015979412261661838, + "loss": 1.0592, + "step": 15483 + }, + { + "epoch": 0.20120751799338252, + "grad_norm": 0.34259718656539917, + "learning_rate": 0.00015979152315470698, + "loss": 1.3443, + "step": 15484 + }, + { + "epoch": 0.2012205125372984, + "grad_norm": 0.5038226246833801, + "learning_rate": 0.0001597889236927956, + "loss": 1.4838, + "step": 15485 + }, + { + "epoch": 0.20123350708121426, + "grad_norm": 0.3625810444355011, + "learning_rate": 0.00015978632423088422, + "loss": 1.26, + "step": 15486 + }, + { + "epoch": 0.20124650162513014, + "grad_norm": 0.4064147472381592, + "learning_rate": 0.00015978372476897282, + "loss": 1.3402, + "step": 15487 + }, + { + "epoch": 0.201259496169046, + "grad_norm": 0.3508455455303192, + "learning_rate": 0.00015978112530706145, + "loss": 1.3568, + "step": 15488 + }, + { + "epoch": 0.20127249071296188, + "grad_norm": 0.32590991258621216, + "learning_rate": 0.00015977852584515004, + "loss": 1.4675, + "step": 15489 + }, + { + "epoch": 0.20128548525687776, + "grad_norm": 0.3837073743343353, + "learning_rate": 0.0001597759263832387, + "loss": 1.5279, + "step": 15490 + }, + { + "epoch": 0.20129847980079363, + "grad_norm": 0.35764941573143005, + "learning_rate": 0.0001597733269213273, + "loss": 1.52, + "step": 15491 + }, + { + "epoch": 0.20131147434470953, + "grad_norm": 0.38586580753326416, + "learning_rate": 0.0001597707274594159, + "loss": 1.4397, + "step": 15492 + }, + { + "epoch": 0.2013244688886254, + "grad_norm": 0.42878884077072144, + "learning_rate": 0.00015976812799750454, + "loss": 1.3246, + "step": 15493 + }, + { + "epoch": 0.20133746343254127, + "grad_norm": 0.31376323103904724, + "learning_rate": 0.00015976552853559314, + "loss": 1.3979, + "step": 15494 + }, + { + "epoch": 0.20135045797645715, + "grad_norm": 0.42834824323654175, + "learning_rate": 0.00015976292907368176, + "loss": 1.4761, + "step": 15495 + }, + { + "epoch": 0.20136345252037302, + "grad_norm": 0.36774513125419617, + "learning_rate": 0.00015976032961177036, + "loss": 1.2995, + "step": 15496 + }, + { + "epoch": 0.2013764470642889, + "grad_norm": 0.38691818714141846, + "learning_rate": 0.00015975773014985899, + "loss": 1.4661, + "step": 15497 + }, + { + "epoch": 0.20138944160820477, + "grad_norm": 0.3923111855983734, + "learning_rate": 0.0001597551306879476, + "loss": 1.4596, + "step": 15498 + }, + { + "epoch": 0.20140243615212064, + "grad_norm": 0.3583773672580719, + "learning_rate": 0.0001597525312260362, + "loss": 1.382, + "step": 15499 + }, + { + "epoch": 0.2014154306960365, + "grad_norm": 0.30078837275505066, + "learning_rate": 0.00015974993176412483, + "loss": 1.4106, + "step": 15500 + }, + { + "epoch": 0.20142842523995239, + "grad_norm": 0.36736586689949036, + "learning_rate": 0.00015974733230221346, + "loss": 1.4908, + "step": 15501 + }, + { + "epoch": 0.20144141978386826, + "grad_norm": 0.35619914531707764, + "learning_rate": 0.00015974473284030208, + "loss": 1.4938, + "step": 15502 + }, + { + "epoch": 0.20145441432778413, + "grad_norm": 0.37158042192459106, + "learning_rate": 0.00015974213337839068, + "loss": 1.3052, + "step": 15503 + }, + { + "epoch": 0.2014674088717, + "grad_norm": 0.5701084733009338, + "learning_rate": 0.0001597395339164793, + "loss": 1.3253, + "step": 15504 + }, + { + "epoch": 0.20148040341561588, + "grad_norm": 0.458329439163208, + "learning_rate": 0.00015973693445456793, + "loss": 1.4172, + "step": 15505 + }, + { + "epoch": 0.20149339795953175, + "grad_norm": 0.40700650215148926, + "learning_rate": 0.00015973433499265652, + "loss": 1.5024, + "step": 15506 + }, + { + "epoch": 0.20150639250344762, + "grad_norm": 0.46246078610420227, + "learning_rate": 0.00015973173553074515, + "loss": 1.4115, + "step": 15507 + }, + { + "epoch": 0.2015193870473635, + "grad_norm": 0.3385193645954132, + "learning_rate": 0.00015972913606883375, + "loss": 1.5335, + "step": 15508 + }, + { + "epoch": 0.20153238159127937, + "grad_norm": 0.42362180352211, + "learning_rate": 0.00015972653660692237, + "loss": 1.3271, + "step": 15509 + }, + { + "epoch": 0.20154537613519524, + "grad_norm": 0.37288254499435425, + "learning_rate": 0.000159723937145011, + "loss": 1.2622, + "step": 15510 + }, + { + "epoch": 0.20155837067911112, + "grad_norm": 0.43298378586769104, + "learning_rate": 0.0001597213376830996, + "loss": 1.5571, + "step": 15511 + }, + { + "epoch": 0.201571365223027, + "grad_norm": 0.3844226002693176, + "learning_rate": 0.00015971873822118822, + "loss": 1.4003, + "step": 15512 + }, + { + "epoch": 0.20158435976694286, + "grad_norm": 0.3851691484451294, + "learning_rate": 0.00015971613875927684, + "loss": 1.3184, + "step": 15513 + }, + { + "epoch": 0.20159735431085873, + "grad_norm": 0.4106464982032776, + "learning_rate": 0.00015971353929736547, + "loss": 1.3351, + "step": 15514 + }, + { + "epoch": 0.2016103488547746, + "grad_norm": 0.41100969910621643, + "learning_rate": 0.00015971093983545406, + "loss": 1.4826, + "step": 15515 + }, + { + "epoch": 0.20162334339869048, + "grad_norm": 0.34024062752723694, + "learning_rate": 0.0001597083403735427, + "loss": 1.5004, + "step": 15516 + }, + { + "epoch": 0.20163633794260635, + "grad_norm": 0.4197511672973633, + "learning_rate": 0.0001597057409116313, + "loss": 1.4641, + "step": 15517 + }, + { + "epoch": 0.20164933248652223, + "grad_norm": 0.45267030596733093, + "learning_rate": 0.0001597031414497199, + "loss": 1.3783, + "step": 15518 + }, + { + "epoch": 0.2016623270304381, + "grad_norm": 0.446397989988327, + "learning_rate": 0.00015970054198780853, + "loss": 1.5595, + "step": 15519 + }, + { + "epoch": 0.20167532157435397, + "grad_norm": 0.37602293491363525, + "learning_rate": 0.00015969794252589713, + "loss": 1.3057, + "step": 15520 + }, + { + "epoch": 0.20168831611826984, + "grad_norm": 0.3605562746524811, + "learning_rate": 0.00015969534306398576, + "loss": 1.428, + "step": 15521 + }, + { + "epoch": 0.20170131066218572, + "grad_norm": 0.4639723300933838, + "learning_rate": 0.00015969274360207438, + "loss": 1.418, + "step": 15522 + }, + { + "epoch": 0.2017143052061016, + "grad_norm": 0.4100245237350464, + "learning_rate": 0.00015969014414016298, + "loss": 1.6012, + "step": 15523 + }, + { + "epoch": 0.20172729975001746, + "grad_norm": 0.38920027017593384, + "learning_rate": 0.0001596875446782516, + "loss": 1.4594, + "step": 15524 + }, + { + "epoch": 0.20174029429393334, + "grad_norm": 0.47984549403190613, + "learning_rate": 0.00015968494521634023, + "loss": 1.572, + "step": 15525 + }, + { + "epoch": 0.2017532888378492, + "grad_norm": 0.4243656098842621, + "learning_rate": 0.00015968234575442885, + "loss": 1.4721, + "step": 15526 + }, + { + "epoch": 0.20176628338176508, + "grad_norm": 0.38825806975364685, + "learning_rate": 0.00015967974629251745, + "loss": 1.3754, + "step": 15527 + }, + { + "epoch": 0.20177927792568096, + "grad_norm": 0.3386104106903076, + "learning_rate": 0.00015967714683060607, + "loss": 1.2337, + "step": 15528 + }, + { + "epoch": 0.20179227246959683, + "grad_norm": 0.3906554877758026, + "learning_rate": 0.0001596745473686947, + "loss": 1.433, + "step": 15529 + }, + { + "epoch": 0.2018052670135127, + "grad_norm": 0.3716062009334564, + "learning_rate": 0.0001596719479067833, + "loss": 1.6036, + "step": 15530 + }, + { + "epoch": 0.20181826155742857, + "grad_norm": 0.3616575598716736, + "learning_rate": 0.00015966934844487192, + "loss": 1.6153, + "step": 15531 + }, + { + "epoch": 0.20183125610134445, + "grad_norm": 0.426098108291626, + "learning_rate": 0.00015966674898296054, + "loss": 1.3482, + "step": 15532 + }, + { + "epoch": 0.20184425064526032, + "grad_norm": 0.3066435754299164, + "learning_rate": 0.00015966414952104917, + "loss": 1.1552, + "step": 15533 + }, + { + "epoch": 0.2018572451891762, + "grad_norm": 0.3735445439815521, + "learning_rate": 0.00015966155005913777, + "loss": 1.4611, + "step": 15534 + }, + { + "epoch": 0.20187023973309207, + "grad_norm": 0.3736024796962738, + "learning_rate": 0.00015965895059722636, + "loss": 1.4677, + "step": 15535 + }, + { + "epoch": 0.20188323427700794, + "grad_norm": 0.43967676162719727, + "learning_rate": 0.00015965635113531502, + "loss": 1.5224, + "step": 15536 + }, + { + "epoch": 0.2018962288209238, + "grad_norm": 0.3615036904811859, + "learning_rate": 0.0001596537516734036, + "loss": 1.2323, + "step": 15537 + }, + { + "epoch": 0.20190922336483969, + "grad_norm": 0.18317799270153046, + "learning_rate": 0.00015965115221149224, + "loss": 1.2085, + "step": 15538 + }, + { + "epoch": 0.20192221790875556, + "grad_norm": 0.3674650490283966, + "learning_rate": 0.00015964855274958083, + "loss": 1.5367, + "step": 15539 + }, + { + "epoch": 0.20193521245267143, + "grad_norm": 0.37121474742889404, + "learning_rate": 0.00015964595328766946, + "loss": 1.3428, + "step": 15540 + }, + { + "epoch": 0.2019482069965873, + "grad_norm": 0.39096876978874207, + "learning_rate": 0.00015964335382575808, + "loss": 1.5221, + "step": 15541 + }, + { + "epoch": 0.20196120154050318, + "grad_norm": 0.48480314016342163, + "learning_rate": 0.00015964075436384668, + "loss": 1.6317, + "step": 15542 + }, + { + "epoch": 0.20197419608441905, + "grad_norm": 0.44641631841659546, + "learning_rate": 0.0001596381549019353, + "loss": 1.3509, + "step": 15543 + }, + { + "epoch": 0.20198719062833492, + "grad_norm": 0.39969927072525024, + "learning_rate": 0.00015963555544002393, + "loss": 1.4268, + "step": 15544 + }, + { + "epoch": 0.2020001851722508, + "grad_norm": 0.38742053508758545, + "learning_rate": 0.00015963295597811255, + "loss": 1.2812, + "step": 15545 + }, + { + "epoch": 0.20201317971616667, + "grad_norm": 0.4840254783630371, + "learning_rate": 0.00015963035651620115, + "loss": 1.4762, + "step": 15546 + }, + { + "epoch": 0.20202617426008254, + "grad_norm": 0.33920347690582275, + "learning_rate": 0.00015962775705428975, + "loss": 1.4521, + "step": 15547 + }, + { + "epoch": 0.20203916880399841, + "grad_norm": 0.4188118278980255, + "learning_rate": 0.0001596251575923784, + "loss": 1.5022, + "step": 15548 + }, + { + "epoch": 0.2020521633479143, + "grad_norm": 0.32119986414909363, + "learning_rate": 0.000159622558130467, + "loss": 1.3524, + "step": 15549 + }, + { + "epoch": 0.20206515789183016, + "grad_norm": 0.32455724477767944, + "learning_rate": 0.00015961995866855562, + "loss": 1.5207, + "step": 15550 + }, + { + "epoch": 0.20207815243574603, + "grad_norm": 0.3758303225040436, + "learning_rate": 0.00015961735920664422, + "loss": 1.3958, + "step": 15551 + }, + { + "epoch": 0.2020911469796619, + "grad_norm": 0.44503000378608704, + "learning_rate": 0.00015961475974473284, + "loss": 1.5735, + "step": 15552 + }, + { + "epoch": 0.20210414152357778, + "grad_norm": 0.39568275213241577, + "learning_rate": 0.00015961216028282147, + "loss": 1.4604, + "step": 15553 + }, + { + "epoch": 0.20211713606749365, + "grad_norm": 0.28650161623954773, + "learning_rate": 0.00015960956082091007, + "loss": 1.3105, + "step": 15554 + }, + { + "epoch": 0.20213013061140953, + "grad_norm": 0.39524441957473755, + "learning_rate": 0.0001596069613589987, + "loss": 1.4664, + "step": 15555 + }, + { + "epoch": 0.2021431251553254, + "grad_norm": 0.48658257722854614, + "learning_rate": 0.00015960436189708732, + "loss": 1.3023, + "step": 15556 + }, + { + "epoch": 0.20215611969924127, + "grad_norm": 0.2924402356147766, + "learning_rate": 0.00015960176243517594, + "loss": 1.2134, + "step": 15557 + }, + { + "epoch": 0.20216911424315714, + "grad_norm": 0.39157775044441223, + "learning_rate": 0.00015959916297326454, + "loss": 1.3529, + "step": 15558 + }, + { + "epoch": 0.20218210878707302, + "grad_norm": 0.31740477681159973, + "learning_rate": 0.00015959656351135313, + "loss": 1.4197, + "step": 15559 + }, + { + "epoch": 0.2021951033309889, + "grad_norm": 0.37640008330345154, + "learning_rate": 0.00015959396404944179, + "loss": 1.3638, + "step": 15560 + }, + { + "epoch": 0.20220809787490476, + "grad_norm": 0.23595954477787018, + "learning_rate": 0.00015959136458753038, + "loss": 1.3842, + "step": 15561 + }, + { + "epoch": 0.20222109241882064, + "grad_norm": 0.5004596710205078, + "learning_rate": 0.000159588765125619, + "loss": 1.5304, + "step": 15562 + }, + { + "epoch": 0.2022340869627365, + "grad_norm": 0.3433200716972351, + "learning_rate": 0.0001595861656637076, + "loss": 1.3388, + "step": 15563 + }, + { + "epoch": 0.20224708150665238, + "grad_norm": 0.2922031283378601, + "learning_rate": 0.00015958356620179623, + "loss": 1.1781, + "step": 15564 + }, + { + "epoch": 0.20226007605056825, + "grad_norm": 0.37811943888664246, + "learning_rate": 0.00015958096673988485, + "loss": 1.4627, + "step": 15565 + }, + { + "epoch": 0.20227307059448413, + "grad_norm": 0.48024192452430725, + "learning_rate": 0.00015957836727797345, + "loss": 1.5468, + "step": 15566 + }, + { + "epoch": 0.2022860651384, + "grad_norm": 0.3539566695690155, + "learning_rate": 0.00015957576781606208, + "loss": 1.4408, + "step": 15567 + }, + { + "epoch": 0.20229905968231587, + "grad_norm": 0.3906601071357727, + "learning_rate": 0.0001595731683541507, + "loss": 1.5796, + "step": 15568 + }, + { + "epoch": 0.20231205422623177, + "grad_norm": 0.44790250062942505, + "learning_rate": 0.00015957056889223933, + "loss": 1.3717, + "step": 15569 + }, + { + "epoch": 0.20232504877014765, + "grad_norm": 0.39754799008369446, + "learning_rate": 0.00015956796943032792, + "loss": 1.2721, + "step": 15570 + }, + { + "epoch": 0.20233804331406352, + "grad_norm": 0.4742945730686188, + "learning_rate": 0.00015956536996841655, + "loss": 1.4732, + "step": 15571 + }, + { + "epoch": 0.2023510378579794, + "grad_norm": 0.35886383056640625, + "learning_rate": 0.00015956277050650517, + "loss": 1.4878, + "step": 15572 + }, + { + "epoch": 0.20236403240189527, + "grad_norm": 0.3766234219074249, + "learning_rate": 0.00015956017104459377, + "loss": 1.5001, + "step": 15573 + }, + { + "epoch": 0.20237702694581114, + "grad_norm": 0.4675527811050415, + "learning_rate": 0.0001595575715826824, + "loss": 1.3493, + "step": 15574 + }, + { + "epoch": 0.202390021489727, + "grad_norm": 0.3647250831127167, + "learning_rate": 0.00015955497212077102, + "loss": 1.5048, + "step": 15575 + }, + { + "epoch": 0.20240301603364289, + "grad_norm": 0.4306110143661499, + "learning_rate": 0.00015955237265885962, + "loss": 1.4666, + "step": 15576 + }, + { + "epoch": 0.20241601057755876, + "grad_norm": 0.34354308247566223, + "learning_rate": 0.00015954977319694824, + "loss": 1.5806, + "step": 15577 + }, + { + "epoch": 0.20242900512147463, + "grad_norm": 0.4518747627735138, + "learning_rate": 0.00015954717373503684, + "loss": 1.4257, + "step": 15578 + }, + { + "epoch": 0.2024419996653905, + "grad_norm": 0.2816008925437927, + "learning_rate": 0.0001595445742731255, + "loss": 1.1589, + "step": 15579 + }, + { + "epoch": 0.20245499420930638, + "grad_norm": 0.43849489092826843, + "learning_rate": 0.00015954197481121409, + "loss": 1.4997, + "step": 15580 + }, + { + "epoch": 0.20246798875322225, + "grad_norm": 0.31130221486091614, + "learning_rate": 0.0001595393753493027, + "loss": 1.2816, + "step": 15581 + }, + { + "epoch": 0.20248098329713812, + "grad_norm": 0.3955734372138977, + "learning_rate": 0.0001595367758873913, + "loss": 1.4013, + "step": 15582 + }, + { + "epoch": 0.202493977841054, + "grad_norm": 0.39053240418434143, + "learning_rate": 0.00015953417642547993, + "loss": 1.4422, + "step": 15583 + }, + { + "epoch": 0.20250697238496987, + "grad_norm": 0.3094363808631897, + "learning_rate": 0.00015953157696356856, + "loss": 1.2012, + "step": 15584 + }, + { + "epoch": 0.20251996692888574, + "grad_norm": 0.4734448492527008, + "learning_rate": 0.00015952897750165715, + "loss": 1.4293, + "step": 15585 + }, + { + "epoch": 0.20253296147280161, + "grad_norm": 0.5012496113777161, + "learning_rate": 0.00015952637803974578, + "loss": 1.4869, + "step": 15586 + }, + { + "epoch": 0.2025459560167175, + "grad_norm": 0.42707526683807373, + "learning_rate": 0.0001595237785778344, + "loss": 1.4535, + "step": 15587 + }, + { + "epoch": 0.20255895056063336, + "grad_norm": 0.42233243584632874, + "learning_rate": 0.000159521179115923, + "loss": 1.5284, + "step": 15588 + }, + { + "epoch": 0.20257194510454923, + "grad_norm": 0.29213064908981323, + "learning_rate": 0.00015951857965401162, + "loss": 1.3369, + "step": 15589 + }, + { + "epoch": 0.2025849396484651, + "grad_norm": 0.5354099273681641, + "learning_rate": 0.00015951598019210022, + "loss": 1.6117, + "step": 15590 + }, + { + "epoch": 0.20259793419238098, + "grad_norm": 0.4830285906791687, + "learning_rate": 0.00015951338073018887, + "loss": 1.52, + "step": 15591 + }, + { + "epoch": 0.20261092873629685, + "grad_norm": 0.3432127833366394, + "learning_rate": 0.00015951078126827747, + "loss": 1.2603, + "step": 15592 + }, + { + "epoch": 0.20262392328021273, + "grad_norm": 0.4127909541130066, + "learning_rate": 0.0001595081818063661, + "loss": 1.4846, + "step": 15593 + }, + { + "epoch": 0.2026369178241286, + "grad_norm": 0.3981674611568451, + "learning_rate": 0.0001595055823444547, + "loss": 1.3194, + "step": 15594 + }, + { + "epoch": 0.20264991236804447, + "grad_norm": 0.3570355772972107, + "learning_rate": 0.00015950298288254332, + "loss": 1.6007, + "step": 15595 + }, + { + "epoch": 0.20266290691196034, + "grad_norm": 0.48730647563934326, + "learning_rate": 0.00015950038342063194, + "loss": 1.3962, + "step": 15596 + }, + { + "epoch": 0.20267590145587622, + "grad_norm": 0.4497148096561432, + "learning_rate": 0.00015949778395872054, + "loss": 1.5353, + "step": 15597 + }, + { + "epoch": 0.2026888959997921, + "grad_norm": 0.31450557708740234, + "learning_rate": 0.00015949518449680916, + "loss": 1.3312, + "step": 15598 + }, + { + "epoch": 0.20270189054370796, + "grad_norm": 0.3757617771625519, + "learning_rate": 0.0001594925850348978, + "loss": 1.3624, + "step": 15599 + }, + { + "epoch": 0.20271488508762384, + "grad_norm": 0.45655694603919983, + "learning_rate": 0.0001594899855729864, + "loss": 1.334, + "step": 15600 + }, + { + "epoch": 0.2027278796315397, + "grad_norm": 0.44738686084747314, + "learning_rate": 0.000159487386111075, + "loss": 1.4487, + "step": 15601 + }, + { + "epoch": 0.20274087417545558, + "grad_norm": 0.39667317271232605, + "learning_rate": 0.0001594847866491636, + "loss": 1.5499, + "step": 15602 + }, + { + "epoch": 0.20275386871937146, + "grad_norm": 0.37171974778175354, + "learning_rate": 0.00015948218718725226, + "loss": 1.3191, + "step": 15603 + }, + { + "epoch": 0.20276686326328733, + "grad_norm": 0.4180026948451996, + "learning_rate": 0.00015947958772534086, + "loss": 1.3679, + "step": 15604 + }, + { + "epoch": 0.2027798578072032, + "grad_norm": 0.26227790117263794, + "learning_rate": 0.00015947698826342948, + "loss": 1.2342, + "step": 15605 + }, + { + "epoch": 0.20279285235111907, + "grad_norm": 0.3400615155696869, + "learning_rate": 0.0001594743888015181, + "loss": 1.5662, + "step": 15606 + }, + { + "epoch": 0.20280584689503495, + "grad_norm": 0.38658204674720764, + "learning_rate": 0.0001594717893396067, + "loss": 1.2616, + "step": 15607 + }, + { + "epoch": 0.20281884143895082, + "grad_norm": 0.4290766716003418, + "learning_rate": 0.00015946918987769533, + "loss": 1.3712, + "step": 15608 + }, + { + "epoch": 0.2028318359828667, + "grad_norm": 0.4338560700416565, + "learning_rate": 0.00015946659041578392, + "loss": 1.5662, + "step": 15609 + }, + { + "epoch": 0.20284483052678257, + "grad_norm": 0.3897329568862915, + "learning_rate": 0.00015946399095387258, + "loss": 1.3752, + "step": 15610 + }, + { + "epoch": 0.20285782507069844, + "grad_norm": 0.3282984793186188, + "learning_rate": 0.00015946139149196117, + "loss": 1.2311, + "step": 15611 + }, + { + "epoch": 0.2028708196146143, + "grad_norm": 0.43870893120765686, + "learning_rate": 0.0001594587920300498, + "loss": 1.516, + "step": 15612 + }, + { + "epoch": 0.20288381415853018, + "grad_norm": 0.4248964190483093, + "learning_rate": 0.0001594561925681384, + "loss": 1.5338, + "step": 15613 + }, + { + "epoch": 0.20289680870244606, + "grad_norm": 0.3369921147823334, + "learning_rate": 0.00015945359310622702, + "loss": 1.3801, + "step": 15614 + }, + { + "epoch": 0.20290980324636193, + "grad_norm": 0.43938079476356506, + "learning_rate": 0.00015945099364431564, + "loss": 1.583, + "step": 15615 + }, + { + "epoch": 0.2029227977902778, + "grad_norm": 0.3859417140483856, + "learning_rate": 0.00015944839418240424, + "loss": 1.6263, + "step": 15616 + }, + { + "epoch": 0.20293579233419368, + "grad_norm": 0.4081466495990753, + "learning_rate": 0.00015944579472049287, + "loss": 1.3804, + "step": 15617 + }, + { + "epoch": 0.20294878687810955, + "grad_norm": 0.5416001081466675, + "learning_rate": 0.0001594431952585815, + "loss": 1.4011, + "step": 15618 + }, + { + "epoch": 0.20296178142202542, + "grad_norm": 0.4448879361152649, + "learning_rate": 0.0001594405957966701, + "loss": 1.4706, + "step": 15619 + }, + { + "epoch": 0.2029747759659413, + "grad_norm": 0.3218129277229309, + "learning_rate": 0.0001594379963347587, + "loss": 1.1609, + "step": 15620 + }, + { + "epoch": 0.20298777050985717, + "grad_norm": 0.25834921002388, + "learning_rate": 0.0001594353968728473, + "loss": 1.2834, + "step": 15621 + }, + { + "epoch": 0.20300076505377304, + "grad_norm": 0.3170090913772583, + "learning_rate": 0.00015943279741093596, + "loss": 1.1739, + "step": 15622 + }, + { + "epoch": 0.20301375959768891, + "grad_norm": 0.34562620520591736, + "learning_rate": 0.00015943019794902456, + "loss": 1.5567, + "step": 15623 + }, + { + "epoch": 0.2030267541416048, + "grad_norm": 0.31423917412757874, + "learning_rate": 0.00015942759848711318, + "loss": 1.2251, + "step": 15624 + }, + { + "epoch": 0.20303974868552066, + "grad_norm": 0.4045373797416687, + "learning_rate": 0.00015942499902520178, + "loss": 1.6074, + "step": 15625 + }, + { + "epoch": 0.20305274322943653, + "grad_norm": 0.4055655002593994, + "learning_rate": 0.0001594223995632904, + "loss": 1.4157, + "step": 15626 + }, + { + "epoch": 0.2030657377733524, + "grad_norm": 0.4327966868877411, + "learning_rate": 0.00015941980010137903, + "loss": 1.454, + "step": 15627 + }, + { + "epoch": 0.20307873231726828, + "grad_norm": 0.40168339014053345, + "learning_rate": 0.00015941720063946763, + "loss": 1.595, + "step": 15628 + }, + { + "epoch": 0.20309172686118415, + "grad_norm": 0.3811909258365631, + "learning_rate": 0.00015941460117755625, + "loss": 1.3004, + "step": 15629 + }, + { + "epoch": 0.20310472140510002, + "grad_norm": 0.4503130614757538, + "learning_rate": 0.00015941200171564488, + "loss": 1.4717, + "step": 15630 + }, + { + "epoch": 0.2031177159490159, + "grad_norm": 0.4343057870864868, + "learning_rate": 0.00015940940225373347, + "loss": 1.3414, + "step": 15631 + }, + { + "epoch": 0.20313071049293177, + "grad_norm": 0.4421515166759491, + "learning_rate": 0.0001594068027918221, + "loss": 1.4232, + "step": 15632 + }, + { + "epoch": 0.20314370503684764, + "grad_norm": 0.3260214626789093, + "learning_rate": 0.0001594042033299107, + "loss": 1.3136, + "step": 15633 + }, + { + "epoch": 0.20315669958076352, + "grad_norm": 0.31766918301582336, + "learning_rate": 0.00015940160386799935, + "loss": 1.293, + "step": 15634 + }, + { + "epoch": 0.2031696941246794, + "grad_norm": 0.4145980179309845, + "learning_rate": 0.00015939900440608794, + "loss": 1.473, + "step": 15635 + }, + { + "epoch": 0.20318268866859526, + "grad_norm": 0.4416239261627197, + "learning_rate": 0.00015939640494417657, + "loss": 1.5645, + "step": 15636 + }, + { + "epoch": 0.20319568321251114, + "grad_norm": 0.3710215985774994, + "learning_rate": 0.00015939380548226517, + "loss": 1.3829, + "step": 15637 + }, + { + "epoch": 0.203208677756427, + "grad_norm": 0.4692445993423462, + "learning_rate": 0.0001593912060203538, + "loss": 1.4375, + "step": 15638 + }, + { + "epoch": 0.20322167230034288, + "grad_norm": 0.4876609146595001, + "learning_rate": 0.00015938860655844242, + "loss": 1.3926, + "step": 15639 + }, + { + "epoch": 0.20323466684425875, + "grad_norm": 0.4070512354373932, + "learning_rate": 0.000159386007096531, + "loss": 1.3986, + "step": 15640 + }, + { + "epoch": 0.20324766138817463, + "grad_norm": 0.4974505603313446, + "learning_rate": 0.00015938340763461964, + "loss": 1.5225, + "step": 15641 + }, + { + "epoch": 0.2032606559320905, + "grad_norm": 0.33264318108558655, + "learning_rate": 0.00015938080817270826, + "loss": 1.3339, + "step": 15642 + }, + { + "epoch": 0.20327365047600637, + "grad_norm": 0.4418434500694275, + "learning_rate": 0.00015937820871079686, + "loss": 1.5256, + "step": 15643 + }, + { + "epoch": 0.20328664501992225, + "grad_norm": 0.38963791728019714, + "learning_rate": 0.00015937560924888548, + "loss": 1.4744, + "step": 15644 + }, + { + "epoch": 0.20329963956383815, + "grad_norm": 0.3890082538127899, + "learning_rate": 0.0001593730097869741, + "loss": 1.6533, + "step": 15645 + }, + { + "epoch": 0.20331263410775402, + "grad_norm": 0.3245887756347656, + "learning_rate": 0.00015937041032506273, + "loss": 1.4314, + "step": 15646 + }, + { + "epoch": 0.2033256286516699, + "grad_norm": 0.3747952878475189, + "learning_rate": 0.00015936781086315133, + "loss": 1.24, + "step": 15647 + }, + { + "epoch": 0.20333862319558577, + "grad_norm": 0.3432357907295227, + "learning_rate": 0.00015936521140123995, + "loss": 1.4552, + "step": 15648 + }, + { + "epoch": 0.20335161773950164, + "grad_norm": 0.4468333423137665, + "learning_rate": 0.00015936261193932858, + "loss": 1.5789, + "step": 15649 + }, + { + "epoch": 0.2033646122834175, + "grad_norm": 0.42616572976112366, + "learning_rate": 0.00015936001247741718, + "loss": 1.5048, + "step": 15650 + }, + { + "epoch": 0.20337760682733338, + "grad_norm": 0.4014796316623688, + "learning_rate": 0.0001593574130155058, + "loss": 1.5122, + "step": 15651 + }, + { + "epoch": 0.20339060137124926, + "grad_norm": 0.36720192432403564, + "learning_rate": 0.0001593548135535944, + "loss": 1.3338, + "step": 15652 + }, + { + "epoch": 0.20340359591516513, + "grad_norm": 0.4661295711994171, + "learning_rate": 0.00015935221409168305, + "loss": 1.5724, + "step": 15653 + }, + { + "epoch": 0.203416590459081, + "grad_norm": 0.44531241059303284, + "learning_rate": 0.00015934961462977165, + "loss": 1.4317, + "step": 15654 + }, + { + "epoch": 0.20342958500299688, + "grad_norm": 0.37221646308898926, + "learning_rate": 0.00015934701516786027, + "loss": 1.3614, + "step": 15655 + }, + { + "epoch": 0.20344257954691275, + "grad_norm": 0.4490053653717041, + "learning_rate": 0.00015934441570594887, + "loss": 1.385, + "step": 15656 + }, + { + "epoch": 0.20345557409082862, + "grad_norm": 0.27513256669044495, + "learning_rate": 0.0001593418162440375, + "loss": 1.4627, + "step": 15657 + }, + { + "epoch": 0.2034685686347445, + "grad_norm": 0.40304383635520935, + "learning_rate": 0.00015933921678212612, + "loss": 1.3386, + "step": 15658 + }, + { + "epoch": 0.20348156317866037, + "grad_norm": 0.30084583163261414, + "learning_rate": 0.00015933661732021472, + "loss": 1.2755, + "step": 15659 + }, + { + "epoch": 0.20349455772257624, + "grad_norm": 0.5392645001411438, + "learning_rate": 0.00015933401785830334, + "loss": 1.5687, + "step": 15660 + }, + { + "epoch": 0.20350755226649211, + "grad_norm": 0.2891553044319153, + "learning_rate": 0.00015933141839639196, + "loss": 1.2629, + "step": 15661 + }, + { + "epoch": 0.203520546810408, + "grad_norm": 0.41944870352745056, + "learning_rate": 0.00015932881893448056, + "loss": 1.2568, + "step": 15662 + }, + { + "epoch": 0.20353354135432386, + "grad_norm": 0.40333545207977295, + "learning_rate": 0.00015932621947256919, + "loss": 1.3555, + "step": 15663 + }, + { + "epoch": 0.20354653589823973, + "grad_norm": 0.42014238238334656, + "learning_rate": 0.00015932362001065778, + "loss": 1.4136, + "step": 15664 + }, + { + "epoch": 0.2035595304421556, + "grad_norm": 0.36154791712760925, + "learning_rate": 0.00015932102054874644, + "loss": 1.489, + "step": 15665 + }, + { + "epoch": 0.20357252498607148, + "grad_norm": 0.47213754057884216, + "learning_rate": 0.00015931842108683503, + "loss": 1.3316, + "step": 15666 + }, + { + "epoch": 0.20358551952998735, + "grad_norm": 0.3318568170070648, + "learning_rate": 0.00015931582162492366, + "loss": 1.5078, + "step": 15667 + }, + { + "epoch": 0.20359851407390323, + "grad_norm": 0.36425235867500305, + "learning_rate": 0.00015931322216301225, + "loss": 1.4387, + "step": 15668 + }, + { + "epoch": 0.2036115086178191, + "grad_norm": 0.37598493695259094, + "learning_rate": 0.00015931062270110088, + "loss": 1.2973, + "step": 15669 + }, + { + "epoch": 0.20362450316173497, + "grad_norm": 0.4518984258174896, + "learning_rate": 0.0001593080232391895, + "loss": 1.3447, + "step": 15670 + }, + { + "epoch": 0.20363749770565084, + "grad_norm": 0.39861059188842773, + "learning_rate": 0.0001593054237772781, + "loss": 1.411, + "step": 15671 + }, + { + "epoch": 0.20365049224956672, + "grad_norm": 0.34336212277412415, + "learning_rate": 0.00015930282431536673, + "loss": 1.3022, + "step": 15672 + }, + { + "epoch": 0.2036634867934826, + "grad_norm": 0.4216004014015198, + "learning_rate": 0.00015930022485345535, + "loss": 1.3542, + "step": 15673 + }, + { + "epoch": 0.20367648133739846, + "grad_norm": 0.39204496145248413, + "learning_rate": 0.00015929762539154395, + "loss": 1.6154, + "step": 15674 + }, + { + "epoch": 0.20368947588131434, + "grad_norm": 0.4893943965435028, + "learning_rate": 0.00015929502592963257, + "loss": 1.4186, + "step": 15675 + }, + { + "epoch": 0.2037024704252302, + "grad_norm": 0.4786006808280945, + "learning_rate": 0.00015929242646772117, + "loss": 1.5974, + "step": 15676 + }, + { + "epoch": 0.20371546496914608, + "grad_norm": 0.35081303119659424, + "learning_rate": 0.00015928982700580982, + "loss": 1.3947, + "step": 15677 + }, + { + "epoch": 0.20372845951306195, + "grad_norm": 0.30897438526153564, + "learning_rate": 0.00015928722754389842, + "loss": 1.1852, + "step": 15678 + }, + { + "epoch": 0.20374145405697783, + "grad_norm": 0.4839898347854614, + "learning_rate": 0.00015928462808198704, + "loss": 1.4347, + "step": 15679 + }, + { + "epoch": 0.2037544486008937, + "grad_norm": 0.4220789074897766, + "learning_rate": 0.00015928202862007567, + "loss": 1.4932, + "step": 15680 + }, + { + "epoch": 0.20376744314480957, + "grad_norm": 0.3967871367931366, + "learning_rate": 0.00015927942915816426, + "loss": 1.661, + "step": 15681 + }, + { + "epoch": 0.20378043768872545, + "grad_norm": 0.4438914954662323, + "learning_rate": 0.0001592768296962529, + "loss": 1.3474, + "step": 15682 + }, + { + "epoch": 0.20379343223264132, + "grad_norm": 0.35931092500686646, + "learning_rate": 0.00015927423023434149, + "loss": 1.4284, + "step": 15683 + }, + { + "epoch": 0.2038064267765572, + "grad_norm": 0.45528286695480347, + "learning_rate": 0.00015927163077243014, + "loss": 1.6341, + "step": 15684 + }, + { + "epoch": 0.20381942132047307, + "grad_norm": 0.39101269841194153, + "learning_rate": 0.00015926903131051874, + "loss": 1.2844, + "step": 15685 + }, + { + "epoch": 0.20383241586438894, + "grad_norm": 0.4158966839313507, + "learning_rate": 0.00015926643184860733, + "loss": 1.4993, + "step": 15686 + }, + { + "epoch": 0.2038454104083048, + "grad_norm": 0.3594337999820709, + "learning_rate": 0.00015926383238669596, + "loss": 1.3699, + "step": 15687 + }, + { + "epoch": 0.20385840495222068, + "grad_norm": 0.39116764068603516, + "learning_rate": 0.00015926123292478458, + "loss": 1.236, + "step": 15688 + }, + { + "epoch": 0.20387139949613656, + "grad_norm": 0.3533034026622772, + "learning_rate": 0.0001592586334628732, + "loss": 1.4514, + "step": 15689 + }, + { + "epoch": 0.20388439404005243, + "grad_norm": 0.24810311198234558, + "learning_rate": 0.0001592560340009618, + "loss": 1.1727, + "step": 15690 + }, + { + "epoch": 0.2038973885839683, + "grad_norm": 0.29851341247558594, + "learning_rate": 0.00015925343453905043, + "loss": 1.233, + "step": 15691 + }, + { + "epoch": 0.20391038312788418, + "grad_norm": 0.36616063117980957, + "learning_rate": 0.00015925083507713905, + "loss": 1.5684, + "step": 15692 + }, + { + "epoch": 0.20392337767180005, + "grad_norm": 0.33179977536201477, + "learning_rate": 0.00015924823561522765, + "loss": 1.4019, + "step": 15693 + }, + { + "epoch": 0.20393637221571592, + "grad_norm": 0.394796758890152, + "learning_rate": 0.00015924563615331627, + "loss": 1.272, + "step": 15694 + }, + { + "epoch": 0.2039493667596318, + "grad_norm": 0.27692022919654846, + "learning_rate": 0.00015924303669140487, + "loss": 1.3884, + "step": 15695 + }, + { + "epoch": 0.20396236130354767, + "grad_norm": 0.40785714983940125, + "learning_rate": 0.00015924043722949352, + "loss": 1.3902, + "step": 15696 + }, + { + "epoch": 0.20397535584746354, + "grad_norm": 0.37398409843444824, + "learning_rate": 0.00015923783776758212, + "loss": 1.3423, + "step": 15697 + }, + { + "epoch": 0.2039883503913794, + "grad_norm": 0.35514935851097107, + "learning_rate": 0.00015923523830567072, + "loss": 1.3234, + "step": 15698 + }, + { + "epoch": 0.2040013449352953, + "grad_norm": 0.3922223746776581, + "learning_rate": 0.00015923263884375934, + "loss": 1.5144, + "step": 15699 + }, + { + "epoch": 0.20401433947921116, + "grad_norm": 0.4430294632911682, + "learning_rate": 0.00015923003938184797, + "loss": 1.5148, + "step": 15700 + }, + { + "epoch": 0.20402733402312703, + "grad_norm": 0.4479601979255676, + "learning_rate": 0.0001592274399199366, + "loss": 1.49, + "step": 15701 + }, + { + "epoch": 0.2040403285670429, + "grad_norm": 0.3259592056274414, + "learning_rate": 0.0001592248404580252, + "loss": 1.3565, + "step": 15702 + }, + { + "epoch": 0.20405332311095878, + "grad_norm": 0.44806450605392456, + "learning_rate": 0.0001592222409961138, + "loss": 1.4721, + "step": 15703 + }, + { + "epoch": 0.20406631765487465, + "grad_norm": 0.4268580973148346, + "learning_rate": 0.00015921964153420244, + "loss": 1.5731, + "step": 15704 + }, + { + "epoch": 0.20407931219879052, + "grad_norm": 0.43186575174331665, + "learning_rate": 0.00015921704207229104, + "loss": 1.5134, + "step": 15705 + }, + { + "epoch": 0.2040923067427064, + "grad_norm": 0.40926510095596313, + "learning_rate": 0.00015921444261037966, + "loss": 1.3174, + "step": 15706 + }, + { + "epoch": 0.20410530128662227, + "grad_norm": 0.4243534207344055, + "learning_rate": 0.00015921184314846826, + "loss": 1.3368, + "step": 15707 + }, + { + "epoch": 0.20411829583053814, + "grad_norm": 0.4110370874404907, + "learning_rate": 0.0001592092436865569, + "loss": 1.4053, + "step": 15708 + }, + { + "epoch": 0.20413129037445402, + "grad_norm": 0.4195098578929901, + "learning_rate": 0.0001592066442246455, + "loss": 1.4566, + "step": 15709 + }, + { + "epoch": 0.2041442849183699, + "grad_norm": 0.29919928312301636, + "learning_rate": 0.00015920404476273413, + "loss": 1.4793, + "step": 15710 + }, + { + "epoch": 0.20415727946228576, + "grad_norm": 0.45262691378593445, + "learning_rate": 0.00015920144530082273, + "loss": 1.3782, + "step": 15711 + }, + { + "epoch": 0.20417027400620164, + "grad_norm": 0.5835135579109192, + "learning_rate": 0.00015919884583891135, + "loss": 1.5884, + "step": 15712 + }, + { + "epoch": 0.2041832685501175, + "grad_norm": 0.4178208112716675, + "learning_rate": 0.00015919624637699998, + "loss": 1.4329, + "step": 15713 + }, + { + "epoch": 0.20419626309403338, + "grad_norm": 0.2904089689254761, + "learning_rate": 0.00015919364691508857, + "loss": 1.2274, + "step": 15714 + }, + { + "epoch": 0.20420925763794925, + "grad_norm": 0.4081857204437256, + "learning_rate": 0.0001591910474531772, + "loss": 1.4441, + "step": 15715 + }, + { + "epoch": 0.20422225218186513, + "grad_norm": 0.36956870555877686, + "learning_rate": 0.00015918844799126582, + "loss": 1.3105, + "step": 15716 + }, + { + "epoch": 0.204235246725781, + "grad_norm": 0.35003021359443665, + "learning_rate": 0.00015918584852935442, + "loss": 1.4961, + "step": 15717 + }, + { + "epoch": 0.20424824126969687, + "grad_norm": 0.3857676088809967, + "learning_rate": 0.00015918324906744305, + "loss": 1.2679, + "step": 15718 + }, + { + "epoch": 0.20426123581361275, + "grad_norm": 0.3591368496417999, + "learning_rate": 0.00015918064960553167, + "loss": 1.4774, + "step": 15719 + }, + { + "epoch": 0.20427423035752862, + "grad_norm": 0.4455430209636688, + "learning_rate": 0.0001591780501436203, + "loss": 1.5148, + "step": 15720 + }, + { + "epoch": 0.20428722490144452, + "grad_norm": 0.32895219326019287, + "learning_rate": 0.0001591754506817089, + "loss": 1.2892, + "step": 15721 + }, + { + "epoch": 0.2043002194453604, + "grad_norm": 0.35661959648132324, + "learning_rate": 0.00015917285121979752, + "loss": 1.4415, + "step": 15722 + }, + { + "epoch": 0.20431321398927627, + "grad_norm": 0.42002347111701965, + "learning_rate": 0.00015917025175788614, + "loss": 1.6175, + "step": 15723 + }, + { + "epoch": 0.20432620853319214, + "grad_norm": 0.4138714373111725, + "learning_rate": 0.00015916765229597474, + "loss": 1.586, + "step": 15724 + }, + { + "epoch": 0.204339203077108, + "grad_norm": 0.36805209517478943, + "learning_rate": 0.00015916505283406336, + "loss": 1.2892, + "step": 15725 + }, + { + "epoch": 0.20435219762102388, + "grad_norm": 0.41735419631004333, + "learning_rate": 0.00015916245337215196, + "loss": 1.4875, + "step": 15726 + }, + { + "epoch": 0.20436519216493976, + "grad_norm": 0.28439757227897644, + "learning_rate": 0.00015915985391024058, + "loss": 1.5228, + "step": 15727 + }, + { + "epoch": 0.20437818670885563, + "grad_norm": 0.400066077709198, + "learning_rate": 0.0001591572544483292, + "loss": 1.4091, + "step": 15728 + }, + { + "epoch": 0.2043911812527715, + "grad_norm": 0.41826218366622925, + "learning_rate": 0.0001591546549864178, + "loss": 1.3983, + "step": 15729 + }, + { + "epoch": 0.20440417579668738, + "grad_norm": 0.3373241126537323, + "learning_rate": 0.00015915205552450643, + "loss": 1.4874, + "step": 15730 + }, + { + "epoch": 0.20441717034060325, + "grad_norm": 0.4516903758049011, + "learning_rate": 0.00015914945606259505, + "loss": 1.4143, + "step": 15731 + }, + { + "epoch": 0.20443016488451912, + "grad_norm": 0.456332802772522, + "learning_rate": 0.00015914685660068368, + "loss": 1.5287, + "step": 15732 + }, + { + "epoch": 0.204443159428435, + "grad_norm": 0.5830400586128235, + "learning_rate": 0.00015914425713877228, + "loss": 1.7161, + "step": 15733 + }, + { + "epoch": 0.20445615397235087, + "grad_norm": 0.4263465404510498, + "learning_rate": 0.0001591416576768609, + "loss": 1.5081, + "step": 15734 + }, + { + "epoch": 0.20446914851626674, + "grad_norm": 0.4236100912094116, + "learning_rate": 0.00015913905821494953, + "loss": 1.2357, + "step": 15735 + }, + { + "epoch": 0.20448214306018261, + "grad_norm": 0.36477360129356384, + "learning_rate": 0.00015913645875303812, + "loss": 1.3883, + "step": 15736 + }, + { + "epoch": 0.2044951376040985, + "grad_norm": 0.44050678610801697, + "learning_rate": 0.00015913385929112675, + "loss": 1.4581, + "step": 15737 + }, + { + "epoch": 0.20450813214801436, + "grad_norm": 0.3738628029823303, + "learning_rate": 0.00015913125982921534, + "loss": 1.4019, + "step": 15738 + }, + { + "epoch": 0.20452112669193023, + "grad_norm": 0.4606102406978607, + "learning_rate": 0.000159128660367304, + "loss": 1.5841, + "step": 15739 + }, + { + "epoch": 0.2045341212358461, + "grad_norm": 0.3182002902030945, + "learning_rate": 0.0001591260609053926, + "loss": 1.65, + "step": 15740 + }, + { + "epoch": 0.20454711577976198, + "grad_norm": 0.3573318123817444, + "learning_rate": 0.0001591234614434812, + "loss": 1.2803, + "step": 15741 + }, + { + "epoch": 0.20456011032367785, + "grad_norm": 0.30597618222236633, + "learning_rate": 0.00015912086198156982, + "loss": 1.2344, + "step": 15742 + }, + { + "epoch": 0.20457310486759372, + "grad_norm": 0.41995641589164734, + "learning_rate": 0.00015911826251965844, + "loss": 1.2867, + "step": 15743 + }, + { + "epoch": 0.2045860994115096, + "grad_norm": 0.41299331188201904, + "learning_rate": 0.00015911566305774706, + "loss": 1.403, + "step": 15744 + }, + { + "epoch": 0.20459909395542547, + "grad_norm": 0.4078940451145172, + "learning_rate": 0.00015911306359583566, + "loss": 1.3966, + "step": 15745 + }, + { + "epoch": 0.20461208849934134, + "grad_norm": 0.46846461296081543, + "learning_rate": 0.0001591104641339243, + "loss": 1.2318, + "step": 15746 + }, + { + "epoch": 0.20462508304325722, + "grad_norm": 0.46171319484710693, + "learning_rate": 0.0001591078646720129, + "loss": 1.6357, + "step": 15747 + }, + { + "epoch": 0.2046380775871731, + "grad_norm": 0.3639085590839386, + "learning_rate": 0.0001591052652101015, + "loss": 1.5535, + "step": 15748 + }, + { + "epoch": 0.20465107213108896, + "grad_norm": 0.4720836579799652, + "learning_rate": 0.00015910266574819013, + "loss": 1.4096, + "step": 15749 + }, + { + "epoch": 0.20466406667500484, + "grad_norm": 0.45323488116264343, + "learning_rate": 0.00015910006628627873, + "loss": 1.3315, + "step": 15750 + }, + { + "epoch": 0.2046770612189207, + "grad_norm": 0.46483924984931946, + "learning_rate": 0.00015909746682436738, + "loss": 1.5884, + "step": 15751 + }, + { + "epoch": 0.20469005576283658, + "grad_norm": 0.42523857951164246, + "learning_rate": 0.00015909486736245598, + "loss": 1.4434, + "step": 15752 + }, + { + "epoch": 0.20470305030675245, + "grad_norm": 0.3048466742038727, + "learning_rate": 0.00015909226790054458, + "loss": 1.4924, + "step": 15753 + }, + { + "epoch": 0.20471604485066833, + "grad_norm": 0.5041714310646057, + "learning_rate": 0.00015908966843863323, + "loss": 1.5424, + "step": 15754 + }, + { + "epoch": 0.2047290393945842, + "grad_norm": 0.4051111936569214, + "learning_rate": 0.00015908706897672183, + "loss": 1.3879, + "step": 15755 + }, + { + "epoch": 0.20474203393850007, + "grad_norm": 0.44219812750816345, + "learning_rate": 0.00015908446951481045, + "loss": 1.3498, + "step": 15756 + }, + { + "epoch": 0.20475502848241595, + "grad_norm": 0.28013938665390015, + "learning_rate": 0.00015908187005289905, + "loss": 1.1812, + "step": 15757 + }, + { + "epoch": 0.20476802302633182, + "grad_norm": 0.3454078733921051, + "learning_rate": 0.00015907927059098767, + "loss": 1.4764, + "step": 15758 + }, + { + "epoch": 0.2047810175702477, + "grad_norm": 0.362848699092865, + "learning_rate": 0.0001590766711290763, + "loss": 1.3763, + "step": 15759 + }, + { + "epoch": 0.20479401211416356, + "grad_norm": 0.40795400738716125, + "learning_rate": 0.0001590740716671649, + "loss": 1.4743, + "step": 15760 + }, + { + "epoch": 0.20480700665807944, + "grad_norm": 0.34853318333625793, + "learning_rate": 0.00015907147220525352, + "loss": 1.2877, + "step": 15761 + }, + { + "epoch": 0.2048200012019953, + "grad_norm": 0.4030263125896454, + "learning_rate": 0.00015906887274334214, + "loss": 1.46, + "step": 15762 + }, + { + "epoch": 0.20483299574591118, + "grad_norm": 0.5251719355583191, + "learning_rate": 0.00015906627328143077, + "loss": 1.4268, + "step": 15763 + }, + { + "epoch": 0.20484599028982706, + "grad_norm": 0.3947938084602356, + "learning_rate": 0.00015906367381951936, + "loss": 1.4655, + "step": 15764 + }, + { + "epoch": 0.20485898483374293, + "grad_norm": 0.447459876537323, + "learning_rate": 0.00015906107435760796, + "loss": 1.321, + "step": 15765 + }, + { + "epoch": 0.2048719793776588, + "grad_norm": 0.5051454901695251, + "learning_rate": 0.00015905847489569661, + "loss": 1.5749, + "step": 15766 + }, + { + "epoch": 0.20488497392157468, + "grad_norm": 0.4089381992816925, + "learning_rate": 0.0001590558754337852, + "loss": 1.4098, + "step": 15767 + }, + { + "epoch": 0.20489796846549055, + "grad_norm": 0.36150145530700684, + "learning_rate": 0.00015905327597187384, + "loss": 1.3307, + "step": 15768 + }, + { + "epoch": 0.20491096300940642, + "grad_norm": 0.45095813274383545, + "learning_rate": 0.00015905067650996243, + "loss": 1.3518, + "step": 15769 + }, + { + "epoch": 0.2049239575533223, + "grad_norm": 0.28538087010383606, + "learning_rate": 0.00015904807704805106, + "loss": 1.2048, + "step": 15770 + }, + { + "epoch": 0.20493695209723817, + "grad_norm": 0.3877955675125122, + "learning_rate": 0.00015904547758613968, + "loss": 1.3066, + "step": 15771 + }, + { + "epoch": 0.20494994664115404, + "grad_norm": 0.36222320795059204, + "learning_rate": 0.00015904287812422828, + "loss": 1.3091, + "step": 15772 + }, + { + "epoch": 0.2049629411850699, + "grad_norm": 0.3057667315006256, + "learning_rate": 0.0001590402786623169, + "loss": 1.6036, + "step": 15773 + }, + { + "epoch": 0.2049759357289858, + "grad_norm": 0.3966691792011261, + "learning_rate": 0.00015903767920040553, + "loss": 1.6149, + "step": 15774 + }, + { + "epoch": 0.20498893027290166, + "grad_norm": 0.6498134136199951, + "learning_rate": 0.00015903507973849415, + "loss": 1.3092, + "step": 15775 + }, + { + "epoch": 0.20500192481681753, + "grad_norm": 0.32731446623802185, + "learning_rate": 0.00015903248027658275, + "loss": 1.2598, + "step": 15776 + }, + { + "epoch": 0.2050149193607334, + "grad_norm": 0.3012624979019165, + "learning_rate": 0.00015902988081467137, + "loss": 1.3868, + "step": 15777 + }, + { + "epoch": 0.20502791390464928, + "grad_norm": 0.5168237686157227, + "learning_rate": 0.00015902728135276, + "loss": 1.3713, + "step": 15778 + }, + { + "epoch": 0.20504090844856515, + "grad_norm": 0.4670848250389099, + "learning_rate": 0.0001590246818908486, + "loss": 1.5733, + "step": 15779 + }, + { + "epoch": 0.20505390299248102, + "grad_norm": 0.3513728678226471, + "learning_rate": 0.00015902208242893722, + "loss": 1.3412, + "step": 15780 + }, + { + "epoch": 0.2050668975363969, + "grad_norm": 0.473827600479126, + "learning_rate": 0.00015901948296702582, + "loss": 1.3986, + "step": 15781 + }, + { + "epoch": 0.20507989208031277, + "grad_norm": 0.40078842639923096, + "learning_rate": 0.00015901688350511444, + "loss": 1.2068, + "step": 15782 + }, + { + "epoch": 0.20509288662422864, + "grad_norm": 0.36712297797203064, + "learning_rate": 0.00015901428404320307, + "loss": 1.3649, + "step": 15783 + }, + { + "epoch": 0.20510588116814452, + "grad_norm": 0.44345995783805847, + "learning_rate": 0.00015901168458129166, + "loss": 1.6381, + "step": 15784 + }, + { + "epoch": 0.2051188757120604, + "grad_norm": 0.391267865896225, + "learning_rate": 0.0001590090851193803, + "loss": 1.4177, + "step": 15785 + }, + { + "epoch": 0.20513187025597626, + "grad_norm": 0.2519415020942688, + "learning_rate": 0.00015900648565746891, + "loss": 1.4768, + "step": 15786 + }, + { + "epoch": 0.20514486479989213, + "grad_norm": 0.37938761711120605, + "learning_rate": 0.00015900388619555754, + "loss": 1.4536, + "step": 15787 + }, + { + "epoch": 0.205157859343808, + "grad_norm": 0.3375428020954132, + "learning_rate": 0.00015900128673364614, + "loss": 1.4751, + "step": 15788 + }, + { + "epoch": 0.20517085388772388, + "grad_norm": 0.34928786754608154, + "learning_rate": 0.00015899868727173476, + "loss": 1.1602, + "step": 15789 + }, + { + "epoch": 0.20518384843163975, + "grad_norm": 0.3935679495334625, + "learning_rate": 0.00015899608780982338, + "loss": 1.7161, + "step": 15790 + }, + { + "epoch": 0.20519684297555563, + "grad_norm": 0.2962068021297455, + "learning_rate": 0.00015899348834791198, + "loss": 1.4203, + "step": 15791 + }, + { + "epoch": 0.2052098375194715, + "grad_norm": 0.3858349621295929, + "learning_rate": 0.0001589908888860006, + "loss": 1.364, + "step": 15792 + }, + { + "epoch": 0.20522283206338737, + "grad_norm": 0.39436033368110657, + "learning_rate": 0.00015898828942408923, + "loss": 1.5025, + "step": 15793 + }, + { + "epoch": 0.20523582660730325, + "grad_norm": 0.3853914737701416, + "learning_rate": 0.00015898568996217783, + "loss": 1.5118, + "step": 15794 + }, + { + "epoch": 0.20524882115121912, + "grad_norm": 0.48946279287338257, + "learning_rate": 0.00015898309050026645, + "loss": 1.4366, + "step": 15795 + }, + { + "epoch": 0.205261815695135, + "grad_norm": 0.4551539719104767, + "learning_rate": 0.00015898049103835505, + "loss": 1.4611, + "step": 15796 + }, + { + "epoch": 0.2052748102390509, + "grad_norm": 0.43925783038139343, + "learning_rate": 0.0001589778915764437, + "loss": 1.496, + "step": 15797 + }, + { + "epoch": 0.20528780478296677, + "grad_norm": 0.37782543897628784, + "learning_rate": 0.0001589752921145323, + "loss": 1.2591, + "step": 15798 + }, + { + "epoch": 0.20530079932688264, + "grad_norm": 0.45286956429481506, + "learning_rate": 0.00015897269265262092, + "loss": 1.469, + "step": 15799 + }, + { + "epoch": 0.2053137938707985, + "grad_norm": 0.3681240379810333, + "learning_rate": 0.00015897009319070952, + "loss": 1.3431, + "step": 15800 + }, + { + "epoch": 0.20532678841471438, + "grad_norm": 0.3992835283279419, + "learning_rate": 0.00015896749372879815, + "loss": 1.5222, + "step": 15801 + }, + { + "epoch": 0.20533978295863026, + "grad_norm": 0.34749746322631836, + "learning_rate": 0.00015896489426688677, + "loss": 1.3768, + "step": 15802 + }, + { + "epoch": 0.20535277750254613, + "grad_norm": 0.49276745319366455, + "learning_rate": 0.00015896229480497537, + "loss": 1.5404, + "step": 15803 + }, + { + "epoch": 0.205365772046462, + "grad_norm": 0.4035680592060089, + "learning_rate": 0.000158959695343064, + "loss": 1.3819, + "step": 15804 + }, + { + "epoch": 0.20537876659037788, + "grad_norm": 0.38701581954956055, + "learning_rate": 0.00015895709588115262, + "loss": 1.4325, + "step": 15805 + }, + { + "epoch": 0.20539176113429375, + "grad_norm": 0.3915572464466095, + "learning_rate": 0.00015895449641924124, + "loss": 1.2669, + "step": 15806 + }, + { + "epoch": 0.20540475567820962, + "grad_norm": 0.5313198566436768, + "learning_rate": 0.00015895189695732984, + "loss": 1.442, + "step": 15807 + }, + { + "epoch": 0.2054177502221255, + "grad_norm": 0.3440777659416199, + "learning_rate": 0.00015894929749541844, + "loss": 1.2321, + "step": 15808 + }, + { + "epoch": 0.20543074476604137, + "grad_norm": 0.42358773946762085, + "learning_rate": 0.0001589466980335071, + "loss": 1.4623, + "step": 15809 + }, + { + "epoch": 0.20544373930995724, + "grad_norm": 0.3406578600406647, + "learning_rate": 0.00015894409857159568, + "loss": 1.3457, + "step": 15810 + }, + { + "epoch": 0.2054567338538731, + "grad_norm": 0.32047003507614136, + "learning_rate": 0.0001589414991096843, + "loss": 1.2605, + "step": 15811 + }, + { + "epoch": 0.205469728397789, + "grad_norm": 0.44259655475616455, + "learning_rate": 0.0001589388996477729, + "loss": 1.2508, + "step": 15812 + }, + { + "epoch": 0.20548272294170486, + "grad_norm": 0.41553908586502075, + "learning_rate": 0.00015893630018586153, + "loss": 1.5704, + "step": 15813 + }, + { + "epoch": 0.20549571748562073, + "grad_norm": 0.3841545879840851, + "learning_rate": 0.00015893370072395016, + "loss": 1.4083, + "step": 15814 + }, + { + "epoch": 0.2055087120295366, + "grad_norm": 0.42117422819137573, + "learning_rate": 0.00015893110126203875, + "loss": 1.3001, + "step": 15815 + }, + { + "epoch": 0.20552170657345248, + "grad_norm": 0.4432094395160675, + "learning_rate": 0.00015892850180012738, + "loss": 1.4535, + "step": 15816 + }, + { + "epoch": 0.20553470111736835, + "grad_norm": 0.36133062839508057, + "learning_rate": 0.000158925902338216, + "loss": 1.36, + "step": 15817 + }, + { + "epoch": 0.20554769566128422, + "grad_norm": 0.3729369044303894, + "learning_rate": 0.00015892330287630463, + "loss": 1.4188, + "step": 15818 + }, + { + "epoch": 0.2055606902052001, + "grad_norm": 0.3679851293563843, + "learning_rate": 0.00015892070341439322, + "loss": 1.4607, + "step": 15819 + }, + { + "epoch": 0.20557368474911597, + "grad_norm": 0.4136151969432831, + "learning_rate": 0.00015891810395248182, + "loss": 1.459, + "step": 15820 + }, + { + "epoch": 0.20558667929303184, + "grad_norm": 0.31538987159729004, + "learning_rate": 0.00015891550449057047, + "loss": 1.3662, + "step": 15821 + }, + { + "epoch": 0.20559967383694772, + "grad_norm": 0.37139299511909485, + "learning_rate": 0.00015891290502865907, + "loss": 1.3113, + "step": 15822 + }, + { + "epoch": 0.2056126683808636, + "grad_norm": 0.39008355140686035, + "learning_rate": 0.0001589103055667477, + "loss": 1.5512, + "step": 15823 + }, + { + "epoch": 0.20562566292477946, + "grad_norm": 0.3811801075935364, + "learning_rate": 0.0001589077061048363, + "loss": 1.5279, + "step": 15824 + }, + { + "epoch": 0.20563865746869534, + "grad_norm": 0.42425671219825745, + "learning_rate": 0.00015890510664292492, + "loss": 1.223, + "step": 15825 + }, + { + "epoch": 0.2056516520126112, + "grad_norm": 0.3898901641368866, + "learning_rate": 0.00015890250718101354, + "loss": 1.3233, + "step": 15826 + }, + { + "epoch": 0.20566464655652708, + "grad_norm": 0.47206342220306396, + "learning_rate": 0.00015889990771910214, + "loss": 1.375, + "step": 15827 + }, + { + "epoch": 0.20567764110044295, + "grad_norm": 0.383807897567749, + "learning_rate": 0.0001588973082571908, + "loss": 1.6269, + "step": 15828 + }, + { + "epoch": 0.20569063564435883, + "grad_norm": 0.4191513955593109, + "learning_rate": 0.0001588947087952794, + "loss": 1.4365, + "step": 15829 + }, + { + "epoch": 0.2057036301882747, + "grad_norm": 0.3272859454154968, + "learning_rate": 0.000158892109333368, + "loss": 1.3244, + "step": 15830 + }, + { + "epoch": 0.20571662473219057, + "grad_norm": 0.32198211550712585, + "learning_rate": 0.0001588895098714566, + "loss": 1.2497, + "step": 15831 + }, + { + "epoch": 0.20572961927610645, + "grad_norm": 0.2837758958339691, + "learning_rate": 0.00015888691040954523, + "loss": 1.2955, + "step": 15832 + }, + { + "epoch": 0.20574261382002232, + "grad_norm": 0.40911129117012024, + "learning_rate": 0.00015888431094763386, + "loss": 1.6714, + "step": 15833 + }, + { + "epoch": 0.2057556083639382, + "grad_norm": 0.4043314754962921, + "learning_rate": 0.00015888171148572246, + "loss": 1.2847, + "step": 15834 + }, + { + "epoch": 0.20576860290785406, + "grad_norm": 0.4200332462787628, + "learning_rate": 0.00015887911202381108, + "loss": 1.3008, + "step": 15835 + }, + { + "epoch": 0.20578159745176994, + "grad_norm": 0.3216518759727478, + "learning_rate": 0.0001588765125618997, + "loss": 1.2906, + "step": 15836 + }, + { + "epoch": 0.2057945919956858, + "grad_norm": 0.2856453061103821, + "learning_rate": 0.0001588739130999883, + "loss": 1.4698, + "step": 15837 + }, + { + "epoch": 0.20580758653960168, + "grad_norm": 0.35720422863960266, + "learning_rate": 0.00015887131363807693, + "loss": 1.1851, + "step": 15838 + }, + { + "epoch": 0.20582058108351756, + "grad_norm": 0.44327622652053833, + "learning_rate": 0.00015886871417616552, + "loss": 1.6635, + "step": 15839 + }, + { + "epoch": 0.20583357562743343, + "grad_norm": 0.4015038013458252, + "learning_rate": 0.00015886611471425418, + "loss": 1.4849, + "step": 15840 + }, + { + "epoch": 0.2058465701713493, + "grad_norm": 0.29389163851737976, + "learning_rate": 0.00015886351525234277, + "loss": 1.3409, + "step": 15841 + }, + { + "epoch": 0.20585956471526518, + "grad_norm": 0.39749184250831604, + "learning_rate": 0.0001588609157904314, + "loss": 1.4076, + "step": 15842 + }, + { + "epoch": 0.20587255925918105, + "grad_norm": 0.7407920360565186, + "learning_rate": 0.00015885831632852, + "loss": 1.5349, + "step": 15843 + }, + { + "epoch": 0.20588555380309692, + "grad_norm": 0.3987780511379242, + "learning_rate": 0.00015885571686660862, + "loss": 1.5439, + "step": 15844 + }, + { + "epoch": 0.2058985483470128, + "grad_norm": 0.46152356266975403, + "learning_rate": 0.00015885311740469724, + "loss": 1.5296, + "step": 15845 + }, + { + "epoch": 0.20591154289092867, + "grad_norm": 0.38446956872940063, + "learning_rate": 0.00015885051794278584, + "loss": 1.4374, + "step": 15846 + }, + { + "epoch": 0.20592453743484454, + "grad_norm": 0.5016198754310608, + "learning_rate": 0.00015884791848087447, + "loss": 1.4926, + "step": 15847 + }, + { + "epoch": 0.2059375319787604, + "grad_norm": 0.48141491413116455, + "learning_rate": 0.0001588453190189631, + "loss": 1.3809, + "step": 15848 + }, + { + "epoch": 0.20595052652267629, + "grad_norm": 0.3519539535045624, + "learning_rate": 0.0001588427195570517, + "loss": 1.348, + "step": 15849 + }, + { + "epoch": 0.20596352106659216, + "grad_norm": 0.37036508321762085, + "learning_rate": 0.0001588401200951403, + "loss": 1.3443, + "step": 15850 + }, + { + "epoch": 0.20597651561050803, + "grad_norm": 0.47400668263435364, + "learning_rate": 0.0001588375206332289, + "loss": 1.6047, + "step": 15851 + }, + { + "epoch": 0.2059895101544239, + "grad_norm": 0.360757052898407, + "learning_rate": 0.00015883492117131756, + "loss": 1.4429, + "step": 15852 + }, + { + "epoch": 0.20600250469833978, + "grad_norm": 0.3367481827735901, + "learning_rate": 0.00015883232170940616, + "loss": 1.3385, + "step": 15853 + }, + { + "epoch": 0.20601549924225565, + "grad_norm": 0.3506537675857544, + "learning_rate": 0.00015882972224749478, + "loss": 1.3692, + "step": 15854 + }, + { + "epoch": 0.20602849378617152, + "grad_norm": 0.40682289004325867, + "learning_rate": 0.00015882712278558338, + "loss": 1.4108, + "step": 15855 + }, + { + "epoch": 0.2060414883300874, + "grad_norm": 0.4694855213165283, + "learning_rate": 0.000158824523323672, + "loss": 1.5517, + "step": 15856 + }, + { + "epoch": 0.20605448287400327, + "grad_norm": 0.346362441778183, + "learning_rate": 0.00015882192386176063, + "loss": 1.3311, + "step": 15857 + }, + { + "epoch": 0.20606747741791914, + "grad_norm": 0.26887205243110657, + "learning_rate": 0.00015881932439984923, + "loss": 1.438, + "step": 15858 + }, + { + "epoch": 0.20608047196183502, + "grad_norm": 0.3908768892288208, + "learning_rate": 0.00015881672493793785, + "loss": 1.3651, + "step": 15859 + }, + { + "epoch": 0.2060934665057509, + "grad_norm": 0.506881833076477, + "learning_rate": 0.00015881412547602647, + "loss": 1.4457, + "step": 15860 + }, + { + "epoch": 0.20610646104966676, + "grad_norm": 0.41534358263015747, + "learning_rate": 0.0001588115260141151, + "loss": 1.4301, + "step": 15861 + }, + { + "epoch": 0.20611945559358263, + "grad_norm": 0.3881515860557556, + "learning_rate": 0.0001588089265522037, + "loss": 1.3328, + "step": 15862 + }, + { + "epoch": 0.2061324501374985, + "grad_norm": 0.4791701138019562, + "learning_rate": 0.0001588063270902923, + "loss": 1.4697, + "step": 15863 + }, + { + "epoch": 0.20614544468141438, + "grad_norm": 0.4101627767086029, + "learning_rate": 0.00015880372762838095, + "loss": 1.3185, + "step": 15864 + }, + { + "epoch": 0.20615843922533025, + "grad_norm": 0.4083032011985779, + "learning_rate": 0.00015880112816646954, + "loss": 1.5074, + "step": 15865 + }, + { + "epoch": 0.20617143376924613, + "grad_norm": 0.39344367384910583, + "learning_rate": 0.00015879852870455817, + "loss": 1.2071, + "step": 15866 + }, + { + "epoch": 0.206184428313162, + "grad_norm": 0.35956549644470215, + "learning_rate": 0.0001587959292426468, + "loss": 1.4655, + "step": 15867 + }, + { + "epoch": 0.20619742285707787, + "grad_norm": 0.3508695662021637, + "learning_rate": 0.0001587933297807354, + "loss": 1.4405, + "step": 15868 + }, + { + "epoch": 0.20621041740099375, + "grad_norm": 0.4173499345779419, + "learning_rate": 0.00015879073031882401, + "loss": 1.3928, + "step": 15869 + }, + { + "epoch": 0.20622341194490962, + "grad_norm": 0.3480115532875061, + "learning_rate": 0.0001587881308569126, + "loss": 1.31, + "step": 15870 + }, + { + "epoch": 0.2062364064888255, + "grad_norm": 0.42981192469596863, + "learning_rate": 0.00015878553139500126, + "loss": 1.4739, + "step": 15871 + }, + { + "epoch": 0.20624940103274136, + "grad_norm": 0.34881457686424255, + "learning_rate": 0.00015878293193308986, + "loss": 1.3708, + "step": 15872 + }, + { + "epoch": 0.20626239557665726, + "grad_norm": 0.35697826743125916, + "learning_rate": 0.00015878033247117848, + "loss": 1.5492, + "step": 15873 + }, + { + "epoch": 0.20627539012057314, + "grad_norm": 0.315758615732193, + "learning_rate": 0.00015877773300926708, + "loss": 1.4323, + "step": 15874 + }, + { + "epoch": 0.206288384664489, + "grad_norm": 0.4722059667110443, + "learning_rate": 0.0001587751335473557, + "loss": 1.5743, + "step": 15875 + }, + { + "epoch": 0.20630137920840488, + "grad_norm": 0.42276933789253235, + "learning_rate": 0.00015877253408544433, + "loss": 1.3927, + "step": 15876 + }, + { + "epoch": 0.20631437375232076, + "grad_norm": 0.2741607129573822, + "learning_rate": 0.00015876993462353293, + "loss": 1.2414, + "step": 15877 + }, + { + "epoch": 0.20632736829623663, + "grad_norm": 0.3732401430606842, + "learning_rate": 0.00015876733516162155, + "loss": 1.3106, + "step": 15878 + }, + { + "epoch": 0.2063403628401525, + "grad_norm": 0.35604941844940186, + "learning_rate": 0.00015876473569971018, + "loss": 1.4573, + "step": 15879 + }, + { + "epoch": 0.20635335738406838, + "grad_norm": 0.42389288544654846, + "learning_rate": 0.00015876213623779877, + "loss": 1.4561, + "step": 15880 + }, + { + "epoch": 0.20636635192798425, + "grad_norm": 0.36977311968803406, + "learning_rate": 0.0001587595367758874, + "loss": 1.4669, + "step": 15881 + }, + { + "epoch": 0.20637934647190012, + "grad_norm": 0.3273666203022003, + "learning_rate": 0.000158756937313976, + "loss": 1.2684, + "step": 15882 + }, + { + "epoch": 0.206392341015816, + "grad_norm": 0.49288409948349, + "learning_rate": 0.00015875433785206465, + "loss": 1.4825, + "step": 15883 + }, + { + "epoch": 0.20640533555973187, + "grad_norm": 0.4416276514530182, + "learning_rate": 0.00015875173839015325, + "loss": 1.5172, + "step": 15884 + }, + { + "epoch": 0.20641833010364774, + "grad_norm": 0.3795471787452698, + "learning_rate": 0.00015874913892824187, + "loss": 1.5336, + "step": 15885 + }, + { + "epoch": 0.2064313246475636, + "grad_norm": 0.3744145929813385, + "learning_rate": 0.00015874653946633047, + "loss": 1.4966, + "step": 15886 + }, + { + "epoch": 0.2064443191914795, + "grad_norm": 0.41878148913383484, + "learning_rate": 0.0001587439400044191, + "loss": 1.4001, + "step": 15887 + }, + { + "epoch": 0.20645731373539536, + "grad_norm": 0.38130518794059753, + "learning_rate": 0.00015874134054250772, + "loss": 1.4254, + "step": 15888 + }, + { + "epoch": 0.20647030827931123, + "grad_norm": 0.36912328004837036, + "learning_rate": 0.00015873874108059631, + "loss": 1.3318, + "step": 15889 + }, + { + "epoch": 0.2064833028232271, + "grad_norm": 0.34443825483322144, + "learning_rate": 0.00015873614161868494, + "loss": 1.368, + "step": 15890 + }, + { + "epoch": 0.20649629736714298, + "grad_norm": 0.4547821283340454, + "learning_rate": 0.00015873354215677356, + "loss": 1.2034, + "step": 15891 + }, + { + "epoch": 0.20650929191105885, + "grad_norm": 0.36991581320762634, + "learning_rate": 0.00015873094269486216, + "loss": 1.3629, + "step": 15892 + }, + { + "epoch": 0.20652228645497472, + "grad_norm": 0.4537360668182373, + "learning_rate": 0.00015872834323295078, + "loss": 1.3154, + "step": 15893 + }, + { + "epoch": 0.2065352809988906, + "grad_norm": 0.37049147486686707, + "learning_rate": 0.00015872574377103938, + "loss": 1.2739, + "step": 15894 + }, + { + "epoch": 0.20654827554280647, + "grad_norm": 0.39780426025390625, + "learning_rate": 0.00015872314430912803, + "loss": 1.5707, + "step": 15895 + }, + { + "epoch": 0.20656127008672234, + "grad_norm": 0.34157872200012207, + "learning_rate": 0.00015872054484721663, + "loss": 1.2745, + "step": 15896 + }, + { + "epoch": 0.20657426463063822, + "grad_norm": 0.40147995948791504, + "learning_rate": 0.00015871794538530526, + "loss": 1.1553, + "step": 15897 + }, + { + "epoch": 0.2065872591745541, + "grad_norm": 0.6175597310066223, + "learning_rate": 0.00015871534592339385, + "loss": 1.4381, + "step": 15898 + }, + { + "epoch": 0.20660025371846996, + "grad_norm": 0.3995334208011627, + "learning_rate": 0.00015871274646148248, + "loss": 1.535, + "step": 15899 + }, + { + "epoch": 0.20661324826238583, + "grad_norm": 0.4335523247718811, + "learning_rate": 0.0001587101469995711, + "loss": 1.6232, + "step": 15900 + }, + { + "epoch": 0.2066262428063017, + "grad_norm": 0.41150805354118347, + "learning_rate": 0.0001587075475376597, + "loss": 1.2491, + "step": 15901 + }, + { + "epoch": 0.20663923735021758, + "grad_norm": 0.4173851013183594, + "learning_rate": 0.00015870494807574835, + "loss": 1.5438, + "step": 15902 + }, + { + "epoch": 0.20665223189413345, + "grad_norm": 0.38663020730018616, + "learning_rate": 0.00015870234861383695, + "loss": 1.2649, + "step": 15903 + }, + { + "epoch": 0.20666522643804933, + "grad_norm": 0.28352129459381104, + "learning_rate": 0.00015869974915192555, + "loss": 1.323, + "step": 15904 + }, + { + "epoch": 0.2066782209819652, + "grad_norm": 0.4399521052837372, + "learning_rate": 0.00015869714969001417, + "loss": 1.3059, + "step": 15905 + }, + { + "epoch": 0.20669121552588107, + "grad_norm": 0.3807695806026459, + "learning_rate": 0.0001586945502281028, + "loss": 1.382, + "step": 15906 + }, + { + "epoch": 0.20670421006979695, + "grad_norm": 0.36949726939201355, + "learning_rate": 0.00015869195076619142, + "loss": 1.5614, + "step": 15907 + }, + { + "epoch": 0.20671720461371282, + "grad_norm": 0.3936013877391815, + "learning_rate": 0.00015868935130428002, + "loss": 1.4939, + "step": 15908 + }, + { + "epoch": 0.2067301991576287, + "grad_norm": 0.4262370765209198, + "learning_rate": 0.00015868675184236864, + "loss": 1.3336, + "step": 15909 + }, + { + "epoch": 0.20674319370154456, + "grad_norm": 0.3740140497684479, + "learning_rate": 0.00015868415238045727, + "loss": 1.2651, + "step": 15910 + }, + { + "epoch": 0.20675618824546044, + "grad_norm": 0.3990071713924408, + "learning_rate": 0.00015868155291854586, + "loss": 1.2441, + "step": 15911 + }, + { + "epoch": 0.2067691827893763, + "grad_norm": 0.30727601051330566, + "learning_rate": 0.0001586789534566345, + "loss": 1.1888, + "step": 15912 + }, + { + "epoch": 0.20678217733329218, + "grad_norm": 0.38075730204582214, + "learning_rate": 0.00015867635399472308, + "loss": 1.4345, + "step": 15913 + }, + { + "epoch": 0.20679517187720806, + "grad_norm": 0.43977880477905273, + "learning_rate": 0.00015867375453281174, + "loss": 1.4487, + "step": 15914 + }, + { + "epoch": 0.20680816642112393, + "grad_norm": 0.45685312151908875, + "learning_rate": 0.00015867115507090033, + "loss": 1.4278, + "step": 15915 + }, + { + "epoch": 0.2068211609650398, + "grad_norm": 0.42141231894493103, + "learning_rate": 0.00015866855560898896, + "loss": 1.3266, + "step": 15916 + }, + { + "epoch": 0.20683415550895567, + "grad_norm": 0.4219370484352112, + "learning_rate": 0.00015866595614707756, + "loss": 1.5276, + "step": 15917 + }, + { + "epoch": 0.20684715005287155, + "grad_norm": 0.3282308876514435, + "learning_rate": 0.00015866335668516618, + "loss": 1.315, + "step": 15918 + }, + { + "epoch": 0.20686014459678742, + "grad_norm": 0.3735176920890808, + "learning_rate": 0.0001586607572232548, + "loss": 1.3812, + "step": 15919 + }, + { + "epoch": 0.2068731391407033, + "grad_norm": 0.49324479699134827, + "learning_rate": 0.0001586581577613434, + "loss": 1.313, + "step": 15920 + }, + { + "epoch": 0.20688613368461917, + "grad_norm": 0.4210513234138489, + "learning_rate": 0.00015865555829943203, + "loss": 1.3932, + "step": 15921 + }, + { + "epoch": 0.20689912822853504, + "grad_norm": 0.4304329752922058, + "learning_rate": 0.00015865295883752065, + "loss": 1.5712, + "step": 15922 + }, + { + "epoch": 0.2069121227724509, + "grad_norm": 0.41186749935150146, + "learning_rate": 0.00015865035937560925, + "loss": 1.4385, + "step": 15923 + }, + { + "epoch": 0.20692511731636679, + "grad_norm": 0.38800108432769775, + "learning_rate": 0.00015864775991369787, + "loss": 1.5389, + "step": 15924 + }, + { + "epoch": 0.20693811186028266, + "grad_norm": 0.3718966841697693, + "learning_rate": 0.00015864516045178647, + "loss": 1.2611, + "step": 15925 + }, + { + "epoch": 0.20695110640419853, + "grad_norm": 0.41373559832572937, + "learning_rate": 0.00015864256098987512, + "loss": 1.503, + "step": 15926 + }, + { + "epoch": 0.2069641009481144, + "grad_norm": 0.47427499294281006, + "learning_rate": 0.00015863996152796372, + "loss": 1.5411, + "step": 15927 + }, + { + "epoch": 0.20697709549203028, + "grad_norm": 0.42541879415512085, + "learning_rate": 0.00015863736206605234, + "loss": 1.4241, + "step": 15928 + }, + { + "epoch": 0.20699009003594615, + "grad_norm": 0.4966329336166382, + "learning_rate": 0.00015863476260414094, + "loss": 1.5141, + "step": 15929 + }, + { + "epoch": 0.20700308457986202, + "grad_norm": 0.5133181810379028, + "learning_rate": 0.00015863216314222957, + "loss": 1.4732, + "step": 15930 + }, + { + "epoch": 0.2070160791237779, + "grad_norm": 0.42081692814826965, + "learning_rate": 0.0001586295636803182, + "loss": 1.6751, + "step": 15931 + }, + { + "epoch": 0.20702907366769377, + "grad_norm": 0.43273743987083435, + "learning_rate": 0.0001586269642184068, + "loss": 1.3903, + "step": 15932 + }, + { + "epoch": 0.20704206821160964, + "grad_norm": 0.43079596757888794, + "learning_rate": 0.0001586243647564954, + "loss": 1.4664, + "step": 15933 + }, + { + "epoch": 0.20705506275552552, + "grad_norm": 0.4602683186531067, + "learning_rate": 0.00015862176529458404, + "loss": 1.3738, + "step": 15934 + }, + { + "epoch": 0.2070680572994414, + "grad_norm": 0.2862103283405304, + "learning_rate": 0.00015861916583267263, + "loss": 1.6209, + "step": 15935 + }, + { + "epoch": 0.20708105184335726, + "grad_norm": 0.4206918776035309, + "learning_rate": 0.00015861656637076126, + "loss": 1.6406, + "step": 15936 + }, + { + "epoch": 0.20709404638727313, + "grad_norm": 0.3876684308052063, + "learning_rate": 0.00015861396690884986, + "loss": 1.3282, + "step": 15937 + }, + { + "epoch": 0.207107040931189, + "grad_norm": 0.38095077872276306, + "learning_rate": 0.0001586113674469385, + "loss": 1.3787, + "step": 15938 + }, + { + "epoch": 0.20712003547510488, + "grad_norm": 0.45426690578460693, + "learning_rate": 0.0001586087679850271, + "loss": 1.3792, + "step": 15939 + }, + { + "epoch": 0.20713303001902075, + "grad_norm": 0.3864433765411377, + "learning_rate": 0.00015860616852311573, + "loss": 1.4522, + "step": 15940 + }, + { + "epoch": 0.20714602456293663, + "grad_norm": 0.42905518412590027, + "learning_rate": 0.00015860356906120435, + "loss": 1.3308, + "step": 15941 + }, + { + "epoch": 0.2071590191068525, + "grad_norm": 0.3632519543170929, + "learning_rate": 0.00015860096959929295, + "loss": 1.404, + "step": 15942 + }, + { + "epoch": 0.20717201365076837, + "grad_norm": 0.33392533659935, + "learning_rate": 0.00015859837013738158, + "loss": 1.3958, + "step": 15943 + }, + { + "epoch": 0.20718500819468424, + "grad_norm": 0.39345288276672363, + "learning_rate": 0.00015859577067547017, + "loss": 1.4429, + "step": 15944 + }, + { + "epoch": 0.20719800273860012, + "grad_norm": 0.5228486061096191, + "learning_rate": 0.00015859317121355882, + "loss": 1.4701, + "step": 15945 + }, + { + "epoch": 0.207210997282516, + "grad_norm": 0.38974788784980774, + "learning_rate": 0.00015859057175164742, + "loss": 1.4143, + "step": 15946 + }, + { + "epoch": 0.20722399182643186, + "grad_norm": 0.33740234375, + "learning_rate": 0.00015858797228973602, + "loss": 1.5008, + "step": 15947 + }, + { + "epoch": 0.20723698637034774, + "grad_norm": 0.47325247526168823, + "learning_rate": 0.00015858537282782464, + "loss": 1.4823, + "step": 15948 + }, + { + "epoch": 0.20724998091426364, + "grad_norm": 0.4428384602069855, + "learning_rate": 0.00015858277336591327, + "loss": 1.3386, + "step": 15949 + }, + { + "epoch": 0.2072629754581795, + "grad_norm": 0.4490267038345337, + "learning_rate": 0.0001585801739040019, + "loss": 1.4046, + "step": 15950 + }, + { + "epoch": 0.20727597000209538, + "grad_norm": 0.3699183464050293, + "learning_rate": 0.0001585775744420905, + "loss": 1.2574, + "step": 15951 + }, + { + "epoch": 0.20728896454601126, + "grad_norm": 0.4242355525493622, + "learning_rate": 0.00015857497498017911, + "loss": 1.3812, + "step": 15952 + }, + { + "epoch": 0.20730195908992713, + "grad_norm": 0.4202848970890045, + "learning_rate": 0.00015857237551826774, + "loss": 1.2855, + "step": 15953 + }, + { + "epoch": 0.207314953633843, + "grad_norm": 0.32425323128700256, + "learning_rate": 0.00015856977605635634, + "loss": 1.34, + "step": 15954 + }, + { + "epoch": 0.20732794817775888, + "grad_norm": 0.4508119523525238, + "learning_rate": 0.00015856717659444496, + "loss": 1.4274, + "step": 15955 + }, + { + "epoch": 0.20734094272167475, + "grad_norm": 0.3245319426059723, + "learning_rate": 0.00015856457713253356, + "loss": 1.0483, + "step": 15956 + }, + { + "epoch": 0.20735393726559062, + "grad_norm": 0.41097545623779297, + "learning_rate": 0.0001585619776706222, + "loss": 1.3924, + "step": 15957 + }, + { + "epoch": 0.2073669318095065, + "grad_norm": 0.4216390550136566, + "learning_rate": 0.0001585593782087108, + "loss": 1.4084, + "step": 15958 + }, + { + "epoch": 0.20737992635342237, + "grad_norm": 0.5487633943557739, + "learning_rate": 0.0001585567787467994, + "loss": 1.5324, + "step": 15959 + }, + { + "epoch": 0.20739292089733824, + "grad_norm": 0.4324282109737396, + "learning_rate": 0.00015855417928488803, + "loss": 1.5385, + "step": 15960 + }, + { + "epoch": 0.2074059154412541, + "grad_norm": 0.4272029399871826, + "learning_rate": 0.00015855157982297665, + "loss": 1.5907, + "step": 15961 + }, + { + "epoch": 0.20741890998516999, + "grad_norm": 0.43973782658576965, + "learning_rate": 0.00015854898036106528, + "loss": 1.6421, + "step": 15962 + }, + { + "epoch": 0.20743190452908586, + "grad_norm": 0.5025953054428101, + "learning_rate": 0.00015854638089915388, + "loss": 1.5181, + "step": 15963 + }, + { + "epoch": 0.20744489907300173, + "grad_norm": 0.43513232469558716, + "learning_rate": 0.0001585437814372425, + "loss": 1.4248, + "step": 15964 + }, + { + "epoch": 0.2074578936169176, + "grad_norm": 0.4654023349285126, + "learning_rate": 0.00015854118197533112, + "loss": 1.4621, + "step": 15965 + }, + { + "epoch": 0.20747088816083348, + "grad_norm": 0.34579113125801086, + "learning_rate": 0.00015853858251341972, + "loss": 1.5525, + "step": 15966 + }, + { + "epoch": 0.20748388270474935, + "grad_norm": 0.38553041219711304, + "learning_rate": 0.00015853598305150835, + "loss": 1.3002, + "step": 15967 + }, + { + "epoch": 0.20749687724866522, + "grad_norm": 0.3341819941997528, + "learning_rate": 0.00015853338358959694, + "loss": 1.5916, + "step": 15968 + }, + { + "epoch": 0.2075098717925811, + "grad_norm": 0.4365738332271576, + "learning_rate": 0.0001585307841276856, + "loss": 1.4613, + "step": 15969 + }, + { + "epoch": 0.20752286633649697, + "grad_norm": 0.30693507194519043, + "learning_rate": 0.0001585281846657742, + "loss": 1.3327, + "step": 15970 + }, + { + "epoch": 0.20753586088041284, + "grad_norm": 0.6149924397468567, + "learning_rate": 0.0001585255852038628, + "loss": 1.3987, + "step": 15971 + }, + { + "epoch": 0.20754885542432872, + "grad_norm": 0.3154333233833313, + "learning_rate": 0.00015852298574195141, + "loss": 1.3273, + "step": 15972 + }, + { + "epoch": 0.2075618499682446, + "grad_norm": 0.3717316687107086, + "learning_rate": 0.00015852038628004004, + "loss": 1.3175, + "step": 15973 + }, + { + "epoch": 0.20757484451216046, + "grad_norm": 0.3503427505493164, + "learning_rate": 0.00015851778681812866, + "loss": 1.6451, + "step": 15974 + }, + { + "epoch": 0.20758783905607633, + "grad_norm": 0.4445754289627075, + "learning_rate": 0.00015851518735621726, + "loss": 1.4261, + "step": 15975 + }, + { + "epoch": 0.2076008335999922, + "grad_norm": 0.48500898480415344, + "learning_rate": 0.00015851258789430589, + "loss": 1.3555, + "step": 15976 + }, + { + "epoch": 0.20761382814390808, + "grad_norm": 0.4229230284690857, + "learning_rate": 0.0001585099884323945, + "loss": 1.4876, + "step": 15977 + }, + { + "epoch": 0.20762682268782395, + "grad_norm": 0.3553781509399414, + "learning_rate": 0.0001585073889704831, + "loss": 1.2741, + "step": 15978 + }, + { + "epoch": 0.20763981723173983, + "grad_norm": 0.35326969623565674, + "learning_rate": 0.00015850478950857173, + "loss": 1.6283, + "step": 15979 + }, + { + "epoch": 0.2076528117756557, + "grad_norm": 0.27169209718704224, + "learning_rate": 0.00015850219004666036, + "loss": 1.5028, + "step": 15980 + }, + { + "epoch": 0.20766580631957157, + "grad_norm": 0.38618168234825134, + "learning_rate": 0.00015849959058474898, + "loss": 1.5922, + "step": 15981 + }, + { + "epoch": 0.20767880086348744, + "grad_norm": 0.35482385754585266, + "learning_rate": 0.00015849699112283758, + "loss": 1.4253, + "step": 15982 + }, + { + "epoch": 0.20769179540740332, + "grad_norm": 0.43640565872192383, + "learning_rate": 0.0001584943916609262, + "loss": 1.4547, + "step": 15983 + }, + { + "epoch": 0.2077047899513192, + "grad_norm": 0.2577308416366577, + "learning_rate": 0.00015849179219901483, + "loss": 1.3898, + "step": 15984 + }, + { + "epoch": 0.20771778449523506, + "grad_norm": 0.3763541877269745, + "learning_rate": 0.00015848919273710342, + "loss": 1.6398, + "step": 15985 + }, + { + "epoch": 0.20773077903915094, + "grad_norm": 0.38937291502952576, + "learning_rate": 0.00015848659327519205, + "loss": 1.6141, + "step": 15986 + }, + { + "epoch": 0.2077437735830668, + "grad_norm": 0.35316237807273865, + "learning_rate": 0.00015848399381328065, + "loss": 1.462, + "step": 15987 + }, + { + "epoch": 0.20775676812698268, + "grad_norm": 0.5038059949874878, + "learning_rate": 0.00015848139435136927, + "loss": 1.5502, + "step": 15988 + }, + { + "epoch": 0.20776976267089856, + "grad_norm": 0.2985912561416626, + "learning_rate": 0.0001584787948894579, + "loss": 1.3208, + "step": 15989 + }, + { + "epoch": 0.20778275721481443, + "grad_norm": 0.42337754368782043, + "learning_rate": 0.0001584761954275465, + "loss": 1.5212, + "step": 15990 + }, + { + "epoch": 0.2077957517587303, + "grad_norm": 0.38767901062965393, + "learning_rate": 0.00015847359596563512, + "loss": 1.5582, + "step": 15991 + }, + { + "epoch": 0.20780874630264617, + "grad_norm": 0.4252013862133026, + "learning_rate": 0.00015847099650372374, + "loss": 1.4976, + "step": 15992 + }, + { + "epoch": 0.20782174084656205, + "grad_norm": 0.5667216181755066, + "learning_rate": 0.00015846839704181237, + "loss": 1.3592, + "step": 15993 + }, + { + "epoch": 0.20783473539047792, + "grad_norm": 0.39167776703834534, + "learning_rate": 0.00015846579757990096, + "loss": 1.4129, + "step": 15994 + }, + { + "epoch": 0.2078477299343938, + "grad_norm": 0.3853174149990082, + "learning_rate": 0.0001584631981179896, + "loss": 1.4487, + "step": 15995 + }, + { + "epoch": 0.20786072447830967, + "grad_norm": 0.3639722764492035, + "learning_rate": 0.0001584605986560782, + "loss": 1.4497, + "step": 15996 + }, + { + "epoch": 0.20787371902222554, + "grad_norm": 0.44041547179222107, + "learning_rate": 0.0001584579991941668, + "loss": 1.4591, + "step": 15997 + }, + { + "epoch": 0.2078867135661414, + "grad_norm": 0.34470033645629883, + "learning_rate": 0.00015845539973225543, + "loss": 1.2498, + "step": 15998 + }, + { + "epoch": 0.20789970811005729, + "grad_norm": 0.5876336693763733, + "learning_rate": 0.00015845280027034403, + "loss": 1.3802, + "step": 15999 + }, + { + "epoch": 0.20791270265397316, + "grad_norm": 0.3623909652233124, + "learning_rate": 0.00015845020080843266, + "loss": 1.299, + "step": 16000 + }, + { + "epoch": 0.20792569719788903, + "grad_norm": 0.5420477390289307, + "learning_rate": 0.00015844760134652128, + "loss": 1.4149, + "step": 16001 + }, + { + "epoch": 0.2079386917418049, + "grad_norm": 0.34653428196907043, + "learning_rate": 0.00015844500188460988, + "loss": 1.4183, + "step": 16002 + }, + { + "epoch": 0.20795168628572078, + "grad_norm": 1.1590534448623657, + "learning_rate": 0.0001584424024226985, + "loss": 1.4341, + "step": 16003 + }, + { + "epoch": 0.20796468082963665, + "grad_norm": 0.43542009592056274, + "learning_rate": 0.00015843980296078713, + "loss": 1.5338, + "step": 16004 + }, + { + "epoch": 0.20797767537355252, + "grad_norm": 0.4745631515979767, + "learning_rate": 0.00015843720349887575, + "loss": 1.451, + "step": 16005 + }, + { + "epoch": 0.2079906699174684, + "grad_norm": 0.30314990878105164, + "learning_rate": 0.00015843460403696435, + "loss": 1.2994, + "step": 16006 + }, + { + "epoch": 0.20800366446138427, + "grad_norm": 0.38124462962150574, + "learning_rate": 0.00015843200457505297, + "loss": 1.5497, + "step": 16007 + }, + { + "epoch": 0.20801665900530014, + "grad_norm": 0.44628238677978516, + "learning_rate": 0.0001584294051131416, + "loss": 1.4621, + "step": 16008 + }, + { + "epoch": 0.20802965354921601, + "grad_norm": 0.3599419593811035, + "learning_rate": 0.0001584268056512302, + "loss": 1.3801, + "step": 16009 + }, + { + "epoch": 0.2080426480931319, + "grad_norm": 0.3340938091278076, + "learning_rate": 0.00015842420618931882, + "loss": 1.3018, + "step": 16010 + }, + { + "epoch": 0.20805564263704776, + "grad_norm": 0.4244717061519623, + "learning_rate": 0.00015842160672740742, + "loss": 1.4893, + "step": 16011 + }, + { + "epoch": 0.20806863718096363, + "grad_norm": 0.37749183177948, + "learning_rate": 0.00015841900726549607, + "loss": 1.4971, + "step": 16012 + }, + { + "epoch": 0.2080816317248795, + "grad_norm": 0.38569486141204834, + "learning_rate": 0.00015841640780358467, + "loss": 1.5133, + "step": 16013 + }, + { + "epoch": 0.20809462626879538, + "grad_norm": 0.4032568633556366, + "learning_rate": 0.00015841380834167326, + "loss": 1.5366, + "step": 16014 + }, + { + "epoch": 0.20810762081271125, + "grad_norm": 0.3759194016456604, + "learning_rate": 0.00015841120887976191, + "loss": 1.497, + "step": 16015 + }, + { + "epoch": 0.20812061535662713, + "grad_norm": 0.3568772077560425, + "learning_rate": 0.0001584086094178505, + "loss": 1.4852, + "step": 16016 + }, + { + "epoch": 0.208133609900543, + "grad_norm": 0.38058537244796753, + "learning_rate": 0.00015840600995593914, + "loss": 1.3279, + "step": 16017 + }, + { + "epoch": 0.20814660444445887, + "grad_norm": 0.4988006353378296, + "learning_rate": 0.00015840341049402773, + "loss": 1.4481, + "step": 16018 + }, + { + "epoch": 0.20815959898837474, + "grad_norm": 0.33993902802467346, + "learning_rate": 0.00015840081103211636, + "loss": 1.4293, + "step": 16019 + }, + { + "epoch": 0.20817259353229062, + "grad_norm": 0.3664860725402832, + "learning_rate": 0.00015839821157020498, + "loss": 1.3491, + "step": 16020 + }, + { + "epoch": 0.2081855880762065, + "grad_norm": 0.34311312437057495, + "learning_rate": 0.00015839561210829358, + "loss": 1.2992, + "step": 16021 + }, + { + "epoch": 0.20819858262012236, + "grad_norm": 0.37073156237602234, + "learning_rate": 0.0001583930126463822, + "loss": 1.5062, + "step": 16022 + }, + { + "epoch": 0.20821157716403824, + "grad_norm": 0.44511500000953674, + "learning_rate": 0.00015839041318447083, + "loss": 1.3081, + "step": 16023 + }, + { + "epoch": 0.2082245717079541, + "grad_norm": 0.3527889549732208, + "learning_rate": 0.00015838781372255945, + "loss": 1.224, + "step": 16024 + }, + { + "epoch": 0.20823756625187, + "grad_norm": 0.4156639575958252, + "learning_rate": 0.00015838521426064805, + "loss": 1.3504, + "step": 16025 + }, + { + "epoch": 0.20825056079578588, + "grad_norm": 0.43120813369750977, + "learning_rate": 0.00015838261479873665, + "loss": 1.5303, + "step": 16026 + }, + { + "epoch": 0.20826355533970176, + "grad_norm": 0.37040260434150696, + "learning_rate": 0.0001583800153368253, + "loss": 1.3712, + "step": 16027 + }, + { + "epoch": 0.20827654988361763, + "grad_norm": 0.38942664861679077, + "learning_rate": 0.0001583774158749139, + "loss": 1.4237, + "step": 16028 + }, + { + "epoch": 0.2082895444275335, + "grad_norm": 0.36321499943733215, + "learning_rate": 0.00015837481641300252, + "loss": 1.4585, + "step": 16029 + }, + { + "epoch": 0.20830253897144937, + "grad_norm": 0.4463197588920593, + "learning_rate": 0.00015837221695109112, + "loss": 1.4577, + "step": 16030 + }, + { + "epoch": 0.20831553351536525, + "grad_norm": 0.443386971950531, + "learning_rate": 0.00015836961748917974, + "loss": 1.5799, + "step": 16031 + }, + { + "epoch": 0.20832852805928112, + "grad_norm": 0.3910764753818512, + "learning_rate": 0.00015836701802726837, + "loss": 1.5709, + "step": 16032 + }, + { + "epoch": 0.208341522603197, + "grad_norm": 0.4198080003261566, + "learning_rate": 0.00015836441856535697, + "loss": 1.3891, + "step": 16033 + }, + { + "epoch": 0.20835451714711287, + "grad_norm": 0.24765633046627045, + "learning_rate": 0.0001583618191034456, + "loss": 1.3661, + "step": 16034 + }, + { + "epoch": 0.20836751169102874, + "grad_norm": 0.43960055708885193, + "learning_rate": 0.00015835921964153421, + "loss": 1.6753, + "step": 16035 + }, + { + "epoch": 0.2083805062349446, + "grad_norm": 0.6276617050170898, + "learning_rate": 0.00015835662017962284, + "loss": 1.3542, + "step": 16036 + }, + { + "epoch": 0.20839350077886049, + "grad_norm": 0.40269678831100464, + "learning_rate": 0.00015835402071771144, + "loss": 1.4307, + "step": 16037 + }, + { + "epoch": 0.20840649532277636, + "grad_norm": 0.34608420729637146, + "learning_rate": 0.00015835142125580006, + "loss": 1.2931, + "step": 16038 + }, + { + "epoch": 0.20841948986669223, + "grad_norm": 0.3153967559337616, + "learning_rate": 0.00015834882179388869, + "loss": 1.3352, + "step": 16039 + }, + { + "epoch": 0.2084324844106081, + "grad_norm": 0.3433372974395752, + "learning_rate": 0.00015834622233197728, + "loss": 1.3119, + "step": 16040 + }, + { + "epoch": 0.20844547895452398, + "grad_norm": 0.2960616648197174, + "learning_rate": 0.0001583436228700659, + "loss": 1.3269, + "step": 16041 + }, + { + "epoch": 0.20845847349843985, + "grad_norm": 0.3713153898715973, + "learning_rate": 0.0001583410234081545, + "loss": 1.2676, + "step": 16042 + }, + { + "epoch": 0.20847146804235572, + "grad_norm": 0.4613809585571289, + "learning_rate": 0.00015833842394624313, + "loss": 1.6384, + "step": 16043 + }, + { + "epoch": 0.2084844625862716, + "grad_norm": 0.3246658742427826, + "learning_rate": 0.00015833582448433175, + "loss": 1.3396, + "step": 16044 + }, + { + "epoch": 0.20849745713018747, + "grad_norm": 0.34756481647491455, + "learning_rate": 0.00015833322502242035, + "loss": 1.2093, + "step": 16045 + }, + { + "epoch": 0.20851045167410334, + "grad_norm": 0.48932504653930664, + "learning_rate": 0.00015833062556050898, + "loss": 1.4613, + "step": 16046 + }, + { + "epoch": 0.20852344621801921, + "grad_norm": 0.4023049771785736, + "learning_rate": 0.0001583280260985976, + "loss": 1.27, + "step": 16047 + }, + { + "epoch": 0.2085364407619351, + "grad_norm": 0.4189609885215759, + "learning_rate": 0.00015832542663668622, + "loss": 1.311, + "step": 16048 + }, + { + "epoch": 0.20854943530585096, + "grad_norm": 0.36577218770980835, + "learning_rate": 0.00015832282717477482, + "loss": 1.3969, + "step": 16049 + }, + { + "epoch": 0.20856242984976683, + "grad_norm": 0.35754260420799255, + "learning_rate": 0.00015832022771286345, + "loss": 1.4532, + "step": 16050 + }, + { + "epoch": 0.2085754243936827, + "grad_norm": 0.392670601606369, + "learning_rate": 0.00015831762825095207, + "loss": 1.3658, + "step": 16051 + }, + { + "epoch": 0.20858841893759858, + "grad_norm": 0.3320735692977905, + "learning_rate": 0.00015831502878904067, + "loss": 1.1229, + "step": 16052 + }, + { + "epoch": 0.20860141348151445, + "grad_norm": 0.44898515939712524, + "learning_rate": 0.0001583124293271293, + "loss": 1.4091, + "step": 16053 + }, + { + "epoch": 0.20861440802543033, + "grad_norm": 0.35932672023773193, + "learning_rate": 0.00015830982986521792, + "loss": 1.3058, + "step": 16054 + }, + { + "epoch": 0.2086274025693462, + "grad_norm": 0.42297473549842834, + "learning_rate": 0.00015830723040330651, + "loss": 1.4127, + "step": 16055 + }, + { + "epoch": 0.20864039711326207, + "grad_norm": 0.38213151693344116, + "learning_rate": 0.00015830463094139514, + "loss": 1.4545, + "step": 16056 + }, + { + "epoch": 0.20865339165717794, + "grad_norm": 0.36874356865882874, + "learning_rate": 0.00015830203147948374, + "loss": 1.3049, + "step": 16057 + }, + { + "epoch": 0.20866638620109382, + "grad_norm": 0.37892380356788635, + "learning_rate": 0.0001582994320175724, + "loss": 1.3975, + "step": 16058 + }, + { + "epoch": 0.2086793807450097, + "grad_norm": 0.48240986466407776, + "learning_rate": 0.00015829683255566099, + "loss": 1.5236, + "step": 16059 + }, + { + "epoch": 0.20869237528892556, + "grad_norm": 0.35460755228996277, + "learning_rate": 0.0001582942330937496, + "loss": 1.3819, + "step": 16060 + }, + { + "epoch": 0.20870536983284144, + "grad_norm": 0.3681202232837677, + "learning_rate": 0.0001582916336318382, + "loss": 1.2837, + "step": 16061 + }, + { + "epoch": 0.2087183643767573, + "grad_norm": 0.4416978359222412, + "learning_rate": 0.00015828903416992683, + "loss": 1.5869, + "step": 16062 + }, + { + "epoch": 0.20873135892067318, + "grad_norm": 0.4337884485721588, + "learning_rate": 0.00015828643470801546, + "loss": 1.5075, + "step": 16063 + }, + { + "epoch": 0.20874435346458906, + "grad_norm": 0.41698238253593445, + "learning_rate": 0.00015828383524610405, + "loss": 1.5484, + "step": 16064 + }, + { + "epoch": 0.20875734800850493, + "grad_norm": 0.40398716926574707, + "learning_rate": 0.00015828123578419268, + "loss": 1.3974, + "step": 16065 + }, + { + "epoch": 0.2087703425524208, + "grad_norm": 0.4739348292350769, + "learning_rate": 0.0001582786363222813, + "loss": 1.2976, + "step": 16066 + }, + { + "epoch": 0.20878333709633667, + "grad_norm": 0.4163246750831604, + "learning_rate": 0.00015827603686036993, + "loss": 1.2366, + "step": 16067 + }, + { + "epoch": 0.20879633164025255, + "grad_norm": 0.3740679919719696, + "learning_rate": 0.00015827343739845852, + "loss": 1.3101, + "step": 16068 + }, + { + "epoch": 0.20880932618416842, + "grad_norm": 0.3800796866416931, + "learning_rate": 0.00015827083793654712, + "loss": 1.5359, + "step": 16069 + }, + { + "epoch": 0.2088223207280843, + "grad_norm": 0.4328382909297943, + "learning_rate": 0.00015826823847463577, + "loss": 1.3673, + "step": 16070 + }, + { + "epoch": 0.20883531527200017, + "grad_norm": 0.3781449794769287, + "learning_rate": 0.00015826563901272437, + "loss": 1.4101, + "step": 16071 + }, + { + "epoch": 0.20884830981591604, + "grad_norm": 0.3667421042919159, + "learning_rate": 0.000158263039550813, + "loss": 1.4905, + "step": 16072 + }, + { + "epoch": 0.2088613043598319, + "grad_norm": 0.47123926877975464, + "learning_rate": 0.0001582604400889016, + "loss": 1.4843, + "step": 16073 + }, + { + "epoch": 0.20887429890374778, + "grad_norm": 0.3746497631072998, + "learning_rate": 0.00015825784062699022, + "loss": 1.4777, + "step": 16074 + }, + { + "epoch": 0.20888729344766366, + "grad_norm": 0.3833242654800415, + "learning_rate": 0.00015825524116507884, + "loss": 1.2077, + "step": 16075 + }, + { + "epoch": 0.20890028799157953, + "grad_norm": 0.3413725197315216, + "learning_rate": 0.00015825264170316744, + "loss": 1.3196, + "step": 16076 + }, + { + "epoch": 0.2089132825354954, + "grad_norm": 0.3846360445022583, + "learning_rate": 0.00015825004224125606, + "loss": 1.5497, + "step": 16077 + }, + { + "epoch": 0.20892627707941128, + "grad_norm": 0.3661326467990875, + "learning_rate": 0.0001582474427793447, + "loss": 1.4268, + "step": 16078 + }, + { + "epoch": 0.20893927162332715, + "grad_norm": 0.36104902625083923, + "learning_rate": 0.0001582448433174333, + "loss": 1.4486, + "step": 16079 + }, + { + "epoch": 0.20895226616724302, + "grad_norm": 0.34755033254623413, + "learning_rate": 0.0001582422438555219, + "loss": 1.5399, + "step": 16080 + }, + { + "epoch": 0.2089652607111589, + "grad_norm": 0.43129974603652954, + "learning_rate": 0.0001582396443936105, + "loss": 1.585, + "step": 16081 + }, + { + "epoch": 0.20897825525507477, + "grad_norm": 0.35307538509368896, + "learning_rate": 0.00015823704493169916, + "loss": 1.5001, + "step": 16082 + }, + { + "epoch": 0.20899124979899064, + "grad_norm": 0.35758987069129944, + "learning_rate": 0.00015823444546978776, + "loss": 1.3192, + "step": 16083 + }, + { + "epoch": 0.20900424434290651, + "grad_norm": 0.3619868755340576, + "learning_rate": 0.00015823184600787638, + "loss": 1.5394, + "step": 16084 + }, + { + "epoch": 0.2090172388868224, + "grad_norm": 0.4265950918197632, + "learning_rate": 0.00015822924654596498, + "loss": 1.5089, + "step": 16085 + }, + { + "epoch": 0.20903023343073826, + "grad_norm": 0.3617009222507477, + "learning_rate": 0.0001582266470840536, + "loss": 1.1267, + "step": 16086 + }, + { + "epoch": 0.20904322797465413, + "grad_norm": 0.37554478645324707, + "learning_rate": 0.00015822404762214223, + "loss": 1.4605, + "step": 16087 + }, + { + "epoch": 0.20905622251857, + "grad_norm": 0.3618440330028534, + "learning_rate": 0.00015822144816023082, + "loss": 1.2524, + "step": 16088 + }, + { + "epoch": 0.20906921706248588, + "grad_norm": 0.3825511932373047, + "learning_rate": 0.00015821884869831948, + "loss": 1.3665, + "step": 16089 + }, + { + "epoch": 0.20908221160640175, + "grad_norm": 0.3011224865913391, + "learning_rate": 0.00015821624923640807, + "loss": 1.4324, + "step": 16090 + }, + { + "epoch": 0.20909520615031763, + "grad_norm": 0.33794263005256653, + "learning_rate": 0.0001582136497744967, + "loss": 1.4026, + "step": 16091 + }, + { + "epoch": 0.2091082006942335, + "grad_norm": 0.3835112452507019, + "learning_rate": 0.0001582110503125853, + "loss": 1.4223, + "step": 16092 + }, + { + "epoch": 0.20912119523814937, + "grad_norm": 0.39376211166381836, + "learning_rate": 0.00015820845085067392, + "loss": 1.4813, + "step": 16093 + }, + { + "epoch": 0.20913418978206524, + "grad_norm": 0.3421460688114166, + "learning_rate": 0.00015820585138876254, + "loss": 1.3319, + "step": 16094 + }, + { + "epoch": 0.20914718432598112, + "grad_norm": 0.37054017186164856, + "learning_rate": 0.00015820325192685114, + "loss": 1.2539, + "step": 16095 + }, + { + "epoch": 0.209160178869897, + "grad_norm": 0.3377704620361328, + "learning_rate": 0.00015820065246493977, + "loss": 1.3083, + "step": 16096 + }, + { + "epoch": 0.20917317341381286, + "grad_norm": 0.3811125159263611, + "learning_rate": 0.0001581980530030284, + "loss": 1.5962, + "step": 16097 + }, + { + "epoch": 0.20918616795772874, + "grad_norm": 0.42365556955337524, + "learning_rate": 0.000158195453541117, + "loss": 1.3844, + "step": 16098 + }, + { + "epoch": 0.2091991625016446, + "grad_norm": 0.34604668617248535, + "learning_rate": 0.0001581928540792056, + "loss": 1.3526, + "step": 16099 + }, + { + "epoch": 0.20921215704556048, + "grad_norm": 0.534712016582489, + "learning_rate": 0.0001581902546172942, + "loss": 1.5126, + "step": 16100 + }, + { + "epoch": 0.20922515158947638, + "grad_norm": 0.4300841689109802, + "learning_rate": 0.00015818765515538286, + "loss": 1.4797, + "step": 16101 + }, + { + "epoch": 0.20923814613339226, + "grad_norm": 0.36704885959625244, + "learning_rate": 0.00015818505569347146, + "loss": 1.3589, + "step": 16102 + }, + { + "epoch": 0.20925114067730813, + "grad_norm": 0.31938013434410095, + "learning_rate": 0.00015818245623156008, + "loss": 1.4357, + "step": 16103 + }, + { + "epoch": 0.209264135221224, + "grad_norm": 0.43198609352111816, + "learning_rate": 0.00015817985676964868, + "loss": 1.3706, + "step": 16104 + }, + { + "epoch": 0.20927712976513987, + "grad_norm": 0.36224010586738586, + "learning_rate": 0.0001581772573077373, + "loss": 1.6895, + "step": 16105 + }, + { + "epoch": 0.20929012430905575, + "grad_norm": 0.3959023356437683, + "learning_rate": 0.00015817465784582593, + "loss": 1.4057, + "step": 16106 + }, + { + "epoch": 0.20930311885297162, + "grad_norm": 0.32151564955711365, + "learning_rate": 0.00015817205838391453, + "loss": 1.2412, + "step": 16107 + }, + { + "epoch": 0.2093161133968875, + "grad_norm": 0.3085571825504303, + "learning_rate": 0.00015816945892200315, + "loss": 1.5794, + "step": 16108 + }, + { + "epoch": 0.20932910794080337, + "grad_norm": 0.4798315465450287, + "learning_rate": 0.00015816685946009178, + "loss": 1.4049, + "step": 16109 + }, + { + "epoch": 0.20934210248471924, + "grad_norm": 0.29774633049964905, + "learning_rate": 0.00015816425999818037, + "loss": 1.3269, + "step": 16110 + }, + { + "epoch": 0.2093550970286351, + "grad_norm": 0.2533249258995056, + "learning_rate": 0.000158161660536269, + "loss": 1.2082, + "step": 16111 + }, + { + "epoch": 0.20936809157255099, + "grad_norm": 0.4961012601852417, + "learning_rate": 0.0001581590610743576, + "loss": 1.4736, + "step": 16112 + }, + { + "epoch": 0.20938108611646686, + "grad_norm": 0.4445107877254486, + "learning_rate": 0.00015815646161244625, + "loss": 1.3557, + "step": 16113 + }, + { + "epoch": 0.20939408066038273, + "grad_norm": 0.41197922825813293, + "learning_rate": 0.00015815386215053484, + "loss": 1.4149, + "step": 16114 + }, + { + "epoch": 0.2094070752042986, + "grad_norm": 0.444100022315979, + "learning_rate": 0.00015815126268862347, + "loss": 1.3203, + "step": 16115 + }, + { + "epoch": 0.20942006974821448, + "grad_norm": 0.3689761459827423, + "learning_rate": 0.00015814866322671207, + "loss": 1.5023, + "step": 16116 + }, + { + "epoch": 0.20943306429213035, + "grad_norm": 0.33834308385849, + "learning_rate": 0.0001581460637648007, + "loss": 1.3092, + "step": 16117 + }, + { + "epoch": 0.20944605883604622, + "grad_norm": 0.4474155306816101, + "learning_rate": 0.00015814346430288932, + "loss": 1.5661, + "step": 16118 + }, + { + "epoch": 0.2094590533799621, + "grad_norm": 0.507619321346283, + "learning_rate": 0.0001581408648409779, + "loss": 1.5395, + "step": 16119 + }, + { + "epoch": 0.20947204792387797, + "grad_norm": 0.3917638063430786, + "learning_rate": 0.00015813826537906654, + "loss": 1.481, + "step": 16120 + }, + { + "epoch": 0.20948504246779384, + "grad_norm": 0.40150707960128784, + "learning_rate": 0.00015813566591715516, + "loss": 1.3217, + "step": 16121 + }, + { + "epoch": 0.20949803701170971, + "grad_norm": 0.4090917706489563, + "learning_rate": 0.00015813306645524379, + "loss": 1.3591, + "step": 16122 + }, + { + "epoch": 0.2095110315556256, + "grad_norm": 0.4967215359210968, + "learning_rate": 0.00015813046699333238, + "loss": 1.393, + "step": 16123 + }, + { + "epoch": 0.20952402609954146, + "grad_norm": 0.4418075680732727, + "learning_rate": 0.000158127867531421, + "loss": 1.4935, + "step": 16124 + }, + { + "epoch": 0.20953702064345733, + "grad_norm": 0.37651708722114563, + "learning_rate": 0.00015812526806950963, + "loss": 1.4646, + "step": 16125 + }, + { + "epoch": 0.2095500151873732, + "grad_norm": 0.4067695140838623, + "learning_rate": 0.00015812266860759823, + "loss": 1.3542, + "step": 16126 + }, + { + "epoch": 0.20956300973128908, + "grad_norm": 0.33154159784317017, + "learning_rate": 0.00015812006914568685, + "loss": 1.5, + "step": 16127 + }, + { + "epoch": 0.20957600427520495, + "grad_norm": 0.3993632197380066, + "learning_rate": 0.00015811746968377548, + "loss": 1.4224, + "step": 16128 + }, + { + "epoch": 0.20958899881912083, + "grad_norm": 0.45638805627822876, + "learning_rate": 0.00015811487022186408, + "loss": 1.5868, + "step": 16129 + }, + { + "epoch": 0.2096019933630367, + "grad_norm": 0.4839898943901062, + "learning_rate": 0.0001581122707599527, + "loss": 1.6074, + "step": 16130 + }, + { + "epoch": 0.20961498790695257, + "grad_norm": 0.4412135183811188, + "learning_rate": 0.0001581096712980413, + "loss": 1.3248, + "step": 16131 + }, + { + "epoch": 0.20962798245086844, + "grad_norm": 0.30279359221458435, + "learning_rate": 0.00015810707183612995, + "loss": 1.168, + "step": 16132 + }, + { + "epoch": 0.20964097699478432, + "grad_norm": 0.5604149699211121, + "learning_rate": 0.00015810447237421855, + "loss": 1.5378, + "step": 16133 + }, + { + "epoch": 0.2096539715387002, + "grad_norm": 0.34835976362228394, + "learning_rate": 0.00015810187291230717, + "loss": 1.3418, + "step": 16134 + }, + { + "epoch": 0.20966696608261606, + "grad_norm": 0.3554418087005615, + "learning_rate": 0.00015809927345039577, + "loss": 1.3782, + "step": 16135 + }, + { + "epoch": 0.20967996062653194, + "grad_norm": 0.3810730278491974, + "learning_rate": 0.0001580966739884844, + "loss": 1.6233, + "step": 16136 + }, + { + "epoch": 0.2096929551704478, + "grad_norm": 0.8057996034622192, + "learning_rate": 0.00015809407452657302, + "loss": 1.4464, + "step": 16137 + }, + { + "epoch": 0.20970594971436368, + "grad_norm": 0.35921770334243774, + "learning_rate": 0.00015809147506466162, + "loss": 1.4362, + "step": 16138 + }, + { + "epoch": 0.20971894425827955, + "grad_norm": 0.33673444390296936, + "learning_rate": 0.00015808887560275024, + "loss": 1.2865, + "step": 16139 + }, + { + "epoch": 0.20973193880219543, + "grad_norm": 0.32769522070884705, + "learning_rate": 0.00015808627614083886, + "loss": 1.4023, + "step": 16140 + }, + { + "epoch": 0.2097449333461113, + "grad_norm": 0.40654343366622925, + "learning_rate": 0.00015808367667892746, + "loss": 1.3868, + "step": 16141 + }, + { + "epoch": 0.20975792789002717, + "grad_norm": 0.5318293571472168, + "learning_rate": 0.00015808107721701609, + "loss": 1.5092, + "step": 16142 + }, + { + "epoch": 0.20977092243394305, + "grad_norm": 0.37608638405799866, + "learning_rate": 0.00015807847775510468, + "loss": 1.4289, + "step": 16143 + }, + { + "epoch": 0.20978391697785892, + "grad_norm": 0.30139660835266113, + "learning_rate": 0.00015807587829319333, + "loss": 1.1744, + "step": 16144 + }, + { + "epoch": 0.2097969115217748, + "grad_norm": 0.33328428864479065, + "learning_rate": 0.00015807327883128193, + "loss": 1.4049, + "step": 16145 + }, + { + "epoch": 0.20980990606569067, + "grad_norm": 0.3661699593067169, + "learning_rate": 0.00015807067936937056, + "loss": 1.3411, + "step": 16146 + }, + { + "epoch": 0.20982290060960654, + "grad_norm": 0.33218416571617126, + "learning_rate": 0.00015806807990745915, + "loss": 1.3809, + "step": 16147 + }, + { + "epoch": 0.2098358951535224, + "grad_norm": 0.5098658800125122, + "learning_rate": 0.00015806548044554778, + "loss": 1.2054, + "step": 16148 + }, + { + "epoch": 0.20984888969743828, + "grad_norm": 0.4795396029949188, + "learning_rate": 0.0001580628809836364, + "loss": 1.5008, + "step": 16149 + }, + { + "epoch": 0.20986188424135416, + "grad_norm": 0.4537249207496643, + "learning_rate": 0.000158060281521725, + "loss": 1.4523, + "step": 16150 + }, + { + "epoch": 0.20987487878527003, + "grad_norm": 0.4230712354183197, + "learning_rate": 0.00015805768205981362, + "loss": 1.4202, + "step": 16151 + }, + { + "epoch": 0.2098878733291859, + "grad_norm": 0.419634073972702, + "learning_rate": 0.00015805508259790225, + "loss": 1.4281, + "step": 16152 + }, + { + "epoch": 0.20990086787310178, + "grad_norm": 0.5495619773864746, + "learning_rate": 0.00015805248313599085, + "loss": 1.4266, + "step": 16153 + }, + { + "epoch": 0.20991386241701765, + "grad_norm": 0.43823331594467163, + "learning_rate": 0.00015804988367407947, + "loss": 1.4767, + "step": 16154 + }, + { + "epoch": 0.20992685696093352, + "grad_norm": 0.3293590247631073, + "learning_rate": 0.00015804728421216807, + "loss": 1.2809, + "step": 16155 + }, + { + "epoch": 0.2099398515048494, + "grad_norm": 0.4741308391094208, + "learning_rate": 0.00015804468475025672, + "loss": 1.41, + "step": 16156 + }, + { + "epoch": 0.20995284604876527, + "grad_norm": 0.5016207098960876, + "learning_rate": 0.00015804208528834532, + "loss": 1.4976, + "step": 16157 + }, + { + "epoch": 0.20996584059268114, + "grad_norm": 0.2731064260005951, + "learning_rate": 0.00015803948582643394, + "loss": 1.416, + "step": 16158 + }, + { + "epoch": 0.20997883513659701, + "grad_norm": 0.30615097284317017, + "learning_rate": 0.00015803688636452254, + "loss": 1.3727, + "step": 16159 + }, + { + "epoch": 0.2099918296805129, + "grad_norm": 0.3715973198413849, + "learning_rate": 0.00015803428690261116, + "loss": 1.5319, + "step": 16160 + }, + { + "epoch": 0.21000482422442876, + "grad_norm": 0.39406460523605347, + "learning_rate": 0.0001580316874406998, + "loss": 1.3951, + "step": 16161 + }, + { + "epoch": 0.21001781876834463, + "grad_norm": 0.4085550606250763, + "learning_rate": 0.00015802908797878839, + "loss": 1.3726, + "step": 16162 + }, + { + "epoch": 0.2100308133122605, + "grad_norm": 0.4924324154853821, + "learning_rate": 0.00015802648851687704, + "loss": 1.5622, + "step": 16163 + }, + { + "epoch": 0.21004380785617638, + "grad_norm": 0.3017157316207886, + "learning_rate": 0.00015802388905496563, + "loss": 1.327, + "step": 16164 + }, + { + "epoch": 0.21005680240009225, + "grad_norm": 0.35309597849845886, + "learning_rate": 0.00015802128959305423, + "loss": 1.354, + "step": 16165 + }, + { + "epoch": 0.21006979694400812, + "grad_norm": 0.34255531430244446, + "learning_rate": 0.00015801869013114286, + "loss": 1.2859, + "step": 16166 + }, + { + "epoch": 0.210082791487924, + "grad_norm": 0.4562980532646179, + "learning_rate": 0.00015801609066923148, + "loss": 1.3558, + "step": 16167 + }, + { + "epoch": 0.21009578603183987, + "grad_norm": 0.3897833526134491, + "learning_rate": 0.0001580134912073201, + "loss": 1.256, + "step": 16168 + }, + { + "epoch": 0.21010878057575574, + "grad_norm": 0.526243269443512, + "learning_rate": 0.0001580108917454087, + "loss": 1.4002, + "step": 16169 + }, + { + "epoch": 0.21012177511967162, + "grad_norm": 0.41489970684051514, + "learning_rate": 0.00015800829228349733, + "loss": 1.3833, + "step": 16170 + }, + { + "epoch": 0.2101347696635875, + "grad_norm": 0.4673166871070862, + "learning_rate": 0.00015800569282158595, + "loss": 1.5615, + "step": 16171 + }, + { + "epoch": 0.21014776420750336, + "grad_norm": 0.5549204349517822, + "learning_rate": 0.00015800309335967455, + "loss": 1.442, + "step": 16172 + }, + { + "epoch": 0.21016075875141924, + "grad_norm": 0.279826819896698, + "learning_rate": 0.00015800049389776317, + "loss": 1.3215, + "step": 16173 + }, + { + "epoch": 0.2101737532953351, + "grad_norm": 0.4905039966106415, + "learning_rate": 0.00015799789443585177, + "loss": 1.4875, + "step": 16174 + }, + { + "epoch": 0.21018674783925098, + "grad_norm": 0.40426307916641235, + "learning_rate": 0.00015799529497394042, + "loss": 1.4631, + "step": 16175 + }, + { + "epoch": 0.21019974238316685, + "grad_norm": 0.42143338918685913, + "learning_rate": 0.00015799269551202902, + "loss": 1.4643, + "step": 16176 + }, + { + "epoch": 0.21021273692708273, + "grad_norm": 0.485944539308548, + "learning_rate": 0.00015799009605011762, + "loss": 1.4182, + "step": 16177 + }, + { + "epoch": 0.21022573147099863, + "grad_norm": 0.3476139307022095, + "learning_rate": 0.00015798749658820624, + "loss": 1.3245, + "step": 16178 + }, + { + "epoch": 0.2102387260149145, + "grad_norm": 0.35941123962402344, + "learning_rate": 0.00015798489712629487, + "loss": 1.3055, + "step": 16179 + }, + { + "epoch": 0.21025172055883037, + "grad_norm": 0.4703357517719269, + "learning_rate": 0.0001579822976643835, + "loss": 1.4774, + "step": 16180 + }, + { + "epoch": 0.21026471510274625, + "grad_norm": 0.41082531213760376, + "learning_rate": 0.0001579796982024721, + "loss": 1.2505, + "step": 16181 + }, + { + "epoch": 0.21027770964666212, + "grad_norm": 0.42048898339271545, + "learning_rate": 0.0001579770987405607, + "loss": 1.4652, + "step": 16182 + }, + { + "epoch": 0.210290704190578, + "grad_norm": 0.42284929752349854, + "learning_rate": 0.00015797449927864934, + "loss": 1.4355, + "step": 16183 + }, + { + "epoch": 0.21030369873449387, + "grad_norm": 0.30913910269737244, + "learning_rate": 0.00015797189981673793, + "loss": 1.3227, + "step": 16184 + }, + { + "epoch": 0.21031669327840974, + "grad_norm": 0.338111013174057, + "learning_rate": 0.00015796930035482656, + "loss": 1.2936, + "step": 16185 + }, + { + "epoch": 0.2103296878223256, + "grad_norm": 0.3680938184261322, + "learning_rate": 0.00015796670089291516, + "loss": 1.4755, + "step": 16186 + }, + { + "epoch": 0.21034268236624148, + "grad_norm": 0.3610798120498657, + "learning_rate": 0.0001579641014310038, + "loss": 1.3156, + "step": 16187 + }, + { + "epoch": 0.21035567691015736, + "grad_norm": 0.3957568109035492, + "learning_rate": 0.0001579615019690924, + "loss": 1.5311, + "step": 16188 + }, + { + "epoch": 0.21036867145407323, + "grad_norm": 0.39040637016296387, + "learning_rate": 0.00015795890250718103, + "loss": 1.3733, + "step": 16189 + }, + { + "epoch": 0.2103816659979891, + "grad_norm": 0.3577485680580139, + "learning_rate": 0.00015795630304526963, + "loss": 1.5836, + "step": 16190 + }, + { + "epoch": 0.21039466054190498, + "grad_norm": 0.33625850081443787, + "learning_rate": 0.00015795370358335825, + "loss": 1.4179, + "step": 16191 + }, + { + "epoch": 0.21040765508582085, + "grad_norm": 0.46243566274642944, + "learning_rate": 0.00015795110412144688, + "loss": 1.4302, + "step": 16192 + }, + { + "epoch": 0.21042064962973672, + "grad_norm": 0.391183078289032, + "learning_rate": 0.00015794850465953547, + "loss": 1.309, + "step": 16193 + }, + { + "epoch": 0.2104336441736526, + "grad_norm": 0.3739601671695709, + "learning_rate": 0.0001579459051976241, + "loss": 1.3358, + "step": 16194 + }, + { + "epoch": 0.21044663871756847, + "grad_norm": 0.4088958203792572, + "learning_rate": 0.00015794330573571272, + "loss": 1.5672, + "step": 16195 + }, + { + "epoch": 0.21045963326148434, + "grad_norm": 0.4685206115245819, + "learning_rate": 0.00015794070627380132, + "loss": 1.4968, + "step": 16196 + }, + { + "epoch": 0.21047262780540021, + "grad_norm": 0.410035640001297, + "learning_rate": 0.00015793810681188994, + "loss": 1.3905, + "step": 16197 + }, + { + "epoch": 0.2104856223493161, + "grad_norm": 0.3784518539905548, + "learning_rate": 0.00015793550734997854, + "loss": 1.4736, + "step": 16198 + }, + { + "epoch": 0.21049861689323196, + "grad_norm": 0.33516770601272583, + "learning_rate": 0.0001579329078880672, + "loss": 1.4937, + "step": 16199 + }, + { + "epoch": 0.21051161143714783, + "grad_norm": 0.3485300540924072, + "learning_rate": 0.0001579303084261558, + "loss": 1.4367, + "step": 16200 + }, + { + "epoch": 0.2105246059810637, + "grad_norm": 0.40552788972854614, + "learning_rate": 0.00015792770896424442, + "loss": 1.5334, + "step": 16201 + }, + { + "epoch": 0.21053760052497958, + "grad_norm": 0.3778834342956543, + "learning_rate": 0.00015792510950233304, + "loss": 1.3598, + "step": 16202 + }, + { + "epoch": 0.21055059506889545, + "grad_norm": 0.33078300952911377, + "learning_rate": 0.00015792251004042164, + "loss": 1.4294, + "step": 16203 + }, + { + "epoch": 0.21056358961281132, + "grad_norm": 0.38528957962989807, + "learning_rate": 0.00015791991057851026, + "loss": 1.524, + "step": 16204 + }, + { + "epoch": 0.2105765841567272, + "grad_norm": 0.4119674563407898, + "learning_rate": 0.00015791731111659886, + "loss": 1.4431, + "step": 16205 + }, + { + "epoch": 0.21058957870064307, + "grad_norm": 0.31666994094848633, + "learning_rate": 0.00015791471165468748, + "loss": 1.4804, + "step": 16206 + }, + { + "epoch": 0.21060257324455894, + "grad_norm": 0.38198497891426086, + "learning_rate": 0.0001579121121927761, + "loss": 1.2563, + "step": 16207 + }, + { + "epoch": 0.21061556778847482, + "grad_norm": 0.46017175912857056, + "learning_rate": 0.0001579095127308647, + "loss": 1.4845, + "step": 16208 + }, + { + "epoch": 0.2106285623323907, + "grad_norm": 0.41139546036720276, + "learning_rate": 0.00015790691326895333, + "loss": 1.5126, + "step": 16209 + }, + { + "epoch": 0.21064155687630656, + "grad_norm": 0.3713287115097046, + "learning_rate": 0.00015790431380704195, + "loss": 1.388, + "step": 16210 + }, + { + "epoch": 0.21065455142022244, + "grad_norm": 0.47881636023521423, + "learning_rate": 0.00015790171434513058, + "loss": 1.387, + "step": 16211 + }, + { + "epoch": 0.2106675459641383, + "grad_norm": 0.37192192673683167, + "learning_rate": 0.00015789911488321918, + "loss": 1.3125, + "step": 16212 + }, + { + "epoch": 0.21068054050805418, + "grad_norm": 0.43556323647499084, + "learning_rate": 0.0001578965154213078, + "loss": 1.5504, + "step": 16213 + }, + { + "epoch": 0.21069353505197005, + "grad_norm": 0.3167757987976074, + "learning_rate": 0.00015789391595939643, + "loss": 1.3135, + "step": 16214 + }, + { + "epoch": 0.21070652959588593, + "grad_norm": 0.4473654627799988, + "learning_rate": 0.00015789131649748502, + "loss": 1.574, + "step": 16215 + }, + { + "epoch": 0.2107195241398018, + "grad_norm": 0.4516356885433197, + "learning_rate": 0.00015788871703557365, + "loss": 1.4953, + "step": 16216 + }, + { + "epoch": 0.21073251868371767, + "grad_norm": 0.4738386869430542, + "learning_rate": 0.00015788611757366224, + "loss": 1.4981, + "step": 16217 + }, + { + "epoch": 0.21074551322763355, + "grad_norm": 0.4137638807296753, + "learning_rate": 0.0001578835181117509, + "loss": 1.4266, + "step": 16218 + }, + { + "epoch": 0.21075850777154942, + "grad_norm": 0.3552573621273041, + "learning_rate": 0.0001578809186498395, + "loss": 1.2781, + "step": 16219 + }, + { + "epoch": 0.2107715023154653, + "grad_norm": 0.4965246617794037, + "learning_rate": 0.0001578783191879281, + "loss": 1.2732, + "step": 16220 + }, + { + "epoch": 0.21078449685938117, + "grad_norm": 0.4246561527252197, + "learning_rate": 0.00015787571972601672, + "loss": 1.4232, + "step": 16221 + }, + { + "epoch": 0.21079749140329704, + "grad_norm": 0.3535086214542389, + "learning_rate": 0.00015787312026410534, + "loss": 1.5869, + "step": 16222 + }, + { + "epoch": 0.2108104859472129, + "grad_norm": 0.3383576273918152, + "learning_rate": 0.00015787052080219396, + "loss": 1.5025, + "step": 16223 + }, + { + "epoch": 0.21082348049112878, + "grad_norm": 0.35446271300315857, + "learning_rate": 0.00015786792134028256, + "loss": 1.4016, + "step": 16224 + }, + { + "epoch": 0.21083647503504466, + "grad_norm": 0.5007792711257935, + "learning_rate": 0.00015786532187837119, + "loss": 1.7624, + "step": 16225 + }, + { + "epoch": 0.21084946957896053, + "grad_norm": 0.41847583651542664, + "learning_rate": 0.0001578627224164598, + "loss": 1.4083, + "step": 16226 + }, + { + "epoch": 0.2108624641228764, + "grad_norm": 0.40579652786254883, + "learning_rate": 0.0001578601229545484, + "loss": 1.4386, + "step": 16227 + }, + { + "epoch": 0.21087545866679228, + "grad_norm": 0.45208218693733215, + "learning_rate": 0.00015785752349263703, + "loss": 1.3425, + "step": 16228 + }, + { + "epoch": 0.21088845321070815, + "grad_norm": 0.4796958267688751, + "learning_rate": 0.00015785492403072563, + "loss": 1.4284, + "step": 16229 + }, + { + "epoch": 0.21090144775462402, + "grad_norm": 0.4104662239551544, + "learning_rate": 0.00015785232456881428, + "loss": 1.2648, + "step": 16230 + }, + { + "epoch": 0.2109144422985399, + "grad_norm": 0.454857736825943, + "learning_rate": 0.00015784972510690288, + "loss": 1.534, + "step": 16231 + }, + { + "epoch": 0.21092743684245577, + "grad_norm": 0.31274157762527466, + "learning_rate": 0.00015784712564499148, + "loss": 1.2987, + "step": 16232 + }, + { + "epoch": 0.21094043138637164, + "grad_norm": 0.4466742277145386, + "learning_rate": 0.0001578445261830801, + "loss": 1.3356, + "step": 16233 + }, + { + "epoch": 0.2109534259302875, + "grad_norm": 0.47671762108802795, + "learning_rate": 0.00015784192672116873, + "loss": 1.4144, + "step": 16234 + }, + { + "epoch": 0.2109664204742034, + "grad_norm": 0.4282459616661072, + "learning_rate": 0.00015783932725925735, + "loss": 1.2414, + "step": 16235 + }, + { + "epoch": 0.21097941501811926, + "grad_norm": 0.4464629590511322, + "learning_rate": 0.00015783672779734595, + "loss": 1.3075, + "step": 16236 + }, + { + "epoch": 0.21099240956203513, + "grad_norm": 0.40190696716308594, + "learning_rate": 0.00015783412833543457, + "loss": 1.227, + "step": 16237 + }, + { + "epoch": 0.211005404105951, + "grad_norm": 0.36484453082084656, + "learning_rate": 0.0001578315288735232, + "loss": 1.3304, + "step": 16238 + }, + { + "epoch": 0.21101839864986688, + "grad_norm": 0.2900242209434509, + "learning_rate": 0.0001578289294116118, + "loss": 1.27, + "step": 16239 + }, + { + "epoch": 0.21103139319378275, + "grad_norm": 0.4291497766971588, + "learning_rate": 0.00015782632994970042, + "loss": 1.5278, + "step": 16240 + }, + { + "epoch": 0.21104438773769862, + "grad_norm": 0.43599385023117065, + "learning_rate": 0.00015782373048778904, + "loss": 1.4757, + "step": 16241 + }, + { + "epoch": 0.2110573822816145, + "grad_norm": 0.43510109186172485, + "learning_rate": 0.00015782113102587767, + "loss": 1.4869, + "step": 16242 + }, + { + "epoch": 0.21107037682553037, + "grad_norm": 0.3670237064361572, + "learning_rate": 0.00015781853156396626, + "loss": 1.4084, + "step": 16243 + }, + { + "epoch": 0.21108337136944624, + "grad_norm": 0.41854774951934814, + "learning_rate": 0.0001578159321020549, + "loss": 1.4313, + "step": 16244 + }, + { + "epoch": 0.21109636591336212, + "grad_norm": 0.37367674708366394, + "learning_rate": 0.0001578133326401435, + "loss": 1.5754, + "step": 16245 + }, + { + "epoch": 0.211109360457278, + "grad_norm": 0.35526806116104126, + "learning_rate": 0.0001578107331782321, + "loss": 1.347, + "step": 16246 + }, + { + "epoch": 0.21112235500119386, + "grad_norm": 0.46190324425697327, + "learning_rate": 0.00015780813371632074, + "loss": 1.3326, + "step": 16247 + }, + { + "epoch": 0.21113534954510974, + "grad_norm": 0.3207860589027405, + "learning_rate": 0.00015780553425440933, + "loss": 1.5087, + "step": 16248 + }, + { + "epoch": 0.2111483440890256, + "grad_norm": 0.4525337219238281, + "learning_rate": 0.00015780293479249796, + "loss": 1.4193, + "step": 16249 + }, + { + "epoch": 0.21116133863294148, + "grad_norm": 0.40851640701293945, + "learning_rate": 0.00015780033533058658, + "loss": 1.2945, + "step": 16250 + }, + { + "epoch": 0.21117433317685735, + "grad_norm": 0.35073214769363403, + "learning_rate": 0.00015779773586867518, + "loss": 1.4833, + "step": 16251 + }, + { + "epoch": 0.21118732772077323, + "grad_norm": 0.2889728844165802, + "learning_rate": 0.0001577951364067638, + "loss": 1.2882, + "step": 16252 + }, + { + "epoch": 0.2112003222646891, + "grad_norm": 0.39264413714408875, + "learning_rate": 0.00015779253694485243, + "loss": 1.4889, + "step": 16253 + }, + { + "epoch": 0.211213316808605, + "grad_norm": 0.40936630964279175, + "learning_rate": 0.00015778993748294105, + "loss": 1.3504, + "step": 16254 + }, + { + "epoch": 0.21122631135252087, + "grad_norm": 0.4741725027561188, + "learning_rate": 0.00015778733802102965, + "loss": 1.5754, + "step": 16255 + }, + { + "epoch": 0.21123930589643675, + "grad_norm": 0.358914315700531, + "learning_rate": 0.00015778473855911827, + "loss": 1.398, + "step": 16256 + }, + { + "epoch": 0.21125230044035262, + "grad_norm": 0.35838326811790466, + "learning_rate": 0.0001577821390972069, + "loss": 1.6176, + "step": 16257 + }, + { + "epoch": 0.2112652949842685, + "grad_norm": 0.4114440679550171, + "learning_rate": 0.0001577795396352955, + "loss": 1.4288, + "step": 16258 + }, + { + "epoch": 0.21127828952818437, + "grad_norm": 0.3763172924518585, + "learning_rate": 0.00015777694017338412, + "loss": 1.3329, + "step": 16259 + }, + { + "epoch": 0.21129128407210024, + "grad_norm": 0.422807902097702, + "learning_rate": 0.00015777434071147272, + "loss": 1.2846, + "step": 16260 + }, + { + "epoch": 0.2113042786160161, + "grad_norm": 0.39940619468688965, + "learning_rate": 0.00015777174124956134, + "loss": 1.4688, + "step": 16261 + }, + { + "epoch": 0.21131727315993198, + "grad_norm": 0.43360424041748047, + "learning_rate": 0.00015776914178764997, + "loss": 1.5483, + "step": 16262 + }, + { + "epoch": 0.21133026770384786, + "grad_norm": 0.4620785117149353, + "learning_rate": 0.00015776654232573856, + "loss": 1.5158, + "step": 16263 + }, + { + "epoch": 0.21134326224776373, + "grad_norm": 0.4285954535007477, + "learning_rate": 0.0001577639428638272, + "loss": 1.5753, + "step": 16264 + }, + { + "epoch": 0.2113562567916796, + "grad_norm": 0.5011884570121765, + "learning_rate": 0.0001577613434019158, + "loss": 1.3573, + "step": 16265 + }, + { + "epoch": 0.21136925133559548, + "grad_norm": 0.442808598279953, + "learning_rate": 0.00015775874394000444, + "loss": 1.478, + "step": 16266 + }, + { + "epoch": 0.21138224587951135, + "grad_norm": 0.4511898159980774, + "learning_rate": 0.00015775614447809304, + "loss": 1.4197, + "step": 16267 + }, + { + "epoch": 0.21139524042342722, + "grad_norm": 0.34370219707489014, + "learning_rate": 0.00015775354501618166, + "loss": 1.5622, + "step": 16268 + }, + { + "epoch": 0.2114082349673431, + "grad_norm": 0.44472193717956543, + "learning_rate": 0.00015775094555427028, + "loss": 1.5086, + "step": 16269 + }, + { + "epoch": 0.21142122951125897, + "grad_norm": 0.3992282748222351, + "learning_rate": 0.00015774834609235888, + "loss": 1.3097, + "step": 16270 + }, + { + "epoch": 0.21143422405517484, + "grad_norm": 0.4432145059108734, + "learning_rate": 0.0001577457466304475, + "loss": 1.4042, + "step": 16271 + }, + { + "epoch": 0.2114472185990907, + "grad_norm": 0.40112441778182983, + "learning_rate": 0.0001577431471685361, + "loss": 1.2503, + "step": 16272 + }, + { + "epoch": 0.2114602131430066, + "grad_norm": 0.30481457710266113, + "learning_rate": 0.00015774054770662475, + "loss": 1.3486, + "step": 16273 + }, + { + "epoch": 0.21147320768692246, + "grad_norm": 0.3991979658603668, + "learning_rate": 0.00015773794824471335, + "loss": 1.4356, + "step": 16274 + }, + { + "epoch": 0.21148620223083833, + "grad_norm": 0.3961305618286133, + "learning_rate": 0.00015773534878280195, + "loss": 1.5381, + "step": 16275 + }, + { + "epoch": 0.2114991967747542, + "grad_norm": 0.44020363688468933, + "learning_rate": 0.0001577327493208906, + "loss": 1.488, + "step": 16276 + }, + { + "epoch": 0.21151219131867008, + "grad_norm": 0.4920528829097748, + "learning_rate": 0.0001577301498589792, + "loss": 1.5657, + "step": 16277 + }, + { + "epoch": 0.21152518586258595, + "grad_norm": 0.5440728664398193, + "learning_rate": 0.00015772755039706782, + "loss": 1.5565, + "step": 16278 + }, + { + "epoch": 0.21153818040650182, + "grad_norm": 0.2542400062084198, + "learning_rate": 0.00015772495093515642, + "loss": 1.3051, + "step": 16279 + }, + { + "epoch": 0.2115511749504177, + "grad_norm": 0.3778766989707947, + "learning_rate": 0.00015772235147324504, + "loss": 1.4693, + "step": 16280 + }, + { + "epoch": 0.21156416949433357, + "grad_norm": 0.49587738513946533, + "learning_rate": 0.00015771975201133367, + "loss": 1.2817, + "step": 16281 + }, + { + "epoch": 0.21157716403824944, + "grad_norm": 0.36572280526161194, + "learning_rate": 0.00015771715254942227, + "loss": 1.3194, + "step": 16282 + }, + { + "epoch": 0.21159015858216532, + "grad_norm": 0.491885244846344, + "learning_rate": 0.0001577145530875109, + "loss": 1.529, + "step": 16283 + }, + { + "epoch": 0.2116031531260812, + "grad_norm": 0.40221449732780457, + "learning_rate": 0.00015771195362559952, + "loss": 1.312, + "step": 16284 + }, + { + "epoch": 0.21161614766999706, + "grad_norm": 0.4051157534122467, + "learning_rate": 0.00015770935416368814, + "loss": 1.3307, + "step": 16285 + }, + { + "epoch": 0.21162914221391294, + "grad_norm": 0.3877584636211395, + "learning_rate": 0.00015770675470177674, + "loss": 1.3674, + "step": 16286 + }, + { + "epoch": 0.2116421367578288, + "grad_norm": 0.39504584670066833, + "learning_rate": 0.00015770415523986534, + "loss": 1.5278, + "step": 16287 + }, + { + "epoch": 0.21165513130174468, + "grad_norm": 0.2848084568977356, + "learning_rate": 0.000157701555777954, + "loss": 1.2895, + "step": 16288 + }, + { + "epoch": 0.21166812584566055, + "grad_norm": 0.4326888620853424, + "learning_rate": 0.00015769895631604258, + "loss": 1.541, + "step": 16289 + }, + { + "epoch": 0.21168112038957643, + "grad_norm": 0.3197568356990814, + "learning_rate": 0.0001576963568541312, + "loss": 1.3245, + "step": 16290 + }, + { + "epoch": 0.2116941149334923, + "grad_norm": 0.33372920751571655, + "learning_rate": 0.0001576937573922198, + "loss": 1.3271, + "step": 16291 + }, + { + "epoch": 0.21170710947740817, + "grad_norm": 0.3435698449611664, + "learning_rate": 0.00015769115793030843, + "loss": 1.4158, + "step": 16292 + }, + { + "epoch": 0.21172010402132405, + "grad_norm": 0.369258850812912, + "learning_rate": 0.00015768855846839705, + "loss": 1.5082, + "step": 16293 + }, + { + "epoch": 0.21173309856523992, + "grad_norm": 0.35419028997421265, + "learning_rate": 0.00015768595900648565, + "loss": 1.3562, + "step": 16294 + }, + { + "epoch": 0.2117460931091558, + "grad_norm": 0.44479209184646606, + "learning_rate": 0.00015768335954457428, + "loss": 1.3886, + "step": 16295 + }, + { + "epoch": 0.21175908765307166, + "grad_norm": 0.4464130103588104, + "learning_rate": 0.0001576807600826629, + "loss": 1.5582, + "step": 16296 + }, + { + "epoch": 0.21177208219698754, + "grad_norm": 0.2587941586971283, + "learning_rate": 0.00015767816062075153, + "loss": 1.235, + "step": 16297 + }, + { + "epoch": 0.2117850767409034, + "grad_norm": 0.39664021134376526, + "learning_rate": 0.00015767556115884012, + "loss": 1.3493, + "step": 16298 + }, + { + "epoch": 0.21179807128481928, + "grad_norm": 0.3935302495956421, + "learning_rate": 0.00015767296169692872, + "loss": 1.4107, + "step": 16299 + }, + { + "epoch": 0.21181106582873516, + "grad_norm": 0.42637020349502563, + "learning_rate": 0.00015767036223501737, + "loss": 1.3466, + "step": 16300 + }, + { + "epoch": 0.21182406037265103, + "grad_norm": 0.36727002263069153, + "learning_rate": 0.00015766776277310597, + "loss": 1.3438, + "step": 16301 + }, + { + "epoch": 0.2118370549165669, + "grad_norm": 0.3233186602592468, + "learning_rate": 0.0001576651633111946, + "loss": 1.3481, + "step": 16302 + }, + { + "epoch": 0.21185004946048278, + "grad_norm": 0.4604240953922272, + "learning_rate": 0.0001576625638492832, + "loss": 1.3995, + "step": 16303 + }, + { + "epoch": 0.21186304400439865, + "grad_norm": 0.4501984715461731, + "learning_rate": 0.00015765996438737182, + "loss": 1.5679, + "step": 16304 + }, + { + "epoch": 0.21187603854831452, + "grad_norm": 0.47554242610931396, + "learning_rate": 0.00015765736492546044, + "loss": 1.3234, + "step": 16305 + }, + { + "epoch": 0.2118890330922304, + "grad_norm": 0.3562142550945282, + "learning_rate": 0.00015765476546354904, + "loss": 1.362, + "step": 16306 + }, + { + "epoch": 0.21190202763614627, + "grad_norm": 0.37659701704978943, + "learning_rate": 0.00015765216600163766, + "loss": 1.3895, + "step": 16307 + }, + { + "epoch": 0.21191502218006214, + "grad_norm": 0.3696676194667816, + "learning_rate": 0.0001576495665397263, + "loss": 1.4501, + "step": 16308 + }, + { + "epoch": 0.211928016723978, + "grad_norm": 0.3817282021045685, + "learning_rate": 0.0001576469670778149, + "loss": 1.3747, + "step": 16309 + }, + { + "epoch": 0.2119410112678939, + "grad_norm": 0.26398220658302307, + "learning_rate": 0.0001576443676159035, + "loss": 1.3465, + "step": 16310 + }, + { + "epoch": 0.21195400581180976, + "grad_norm": 0.5268006324768066, + "learning_rate": 0.00015764176815399213, + "loss": 1.6416, + "step": 16311 + }, + { + "epoch": 0.21196700035572563, + "grad_norm": 0.41523152589797974, + "learning_rate": 0.00015763916869208076, + "loss": 1.6528, + "step": 16312 + }, + { + "epoch": 0.2119799948996415, + "grad_norm": 0.511795699596405, + "learning_rate": 0.00015763656923016935, + "loss": 1.4691, + "step": 16313 + }, + { + "epoch": 0.21199298944355738, + "grad_norm": 0.38476619124412537, + "learning_rate": 0.00015763396976825798, + "loss": 1.4436, + "step": 16314 + }, + { + "epoch": 0.21200598398747325, + "grad_norm": 0.3326032757759094, + "learning_rate": 0.0001576313703063466, + "loss": 1.5026, + "step": 16315 + }, + { + "epoch": 0.21201897853138912, + "grad_norm": 0.2643774151802063, + "learning_rate": 0.0001576287708444352, + "loss": 1.177, + "step": 16316 + }, + { + "epoch": 0.212031973075305, + "grad_norm": 0.3378181755542755, + "learning_rate": 0.00015762617138252383, + "loss": 1.4634, + "step": 16317 + }, + { + "epoch": 0.21204496761922087, + "grad_norm": 0.328880250453949, + "learning_rate": 0.00015762357192061242, + "loss": 1.5544, + "step": 16318 + }, + { + "epoch": 0.21205796216313674, + "grad_norm": 0.4815271198749542, + "learning_rate": 0.00015762097245870107, + "loss": 1.6177, + "step": 16319 + }, + { + "epoch": 0.21207095670705262, + "grad_norm": 0.4236106872558594, + "learning_rate": 0.00015761837299678967, + "loss": 1.6295, + "step": 16320 + }, + { + "epoch": 0.2120839512509685, + "grad_norm": 0.2801547050476074, + "learning_rate": 0.0001576157735348783, + "loss": 1.4897, + "step": 16321 + }, + { + "epoch": 0.21209694579488436, + "grad_norm": 0.5137124061584473, + "learning_rate": 0.0001576131740729669, + "loss": 1.4729, + "step": 16322 + }, + { + "epoch": 0.21210994033880023, + "grad_norm": 0.368581086397171, + "learning_rate": 0.00015761057461105552, + "loss": 1.2429, + "step": 16323 + }, + { + "epoch": 0.2121229348827161, + "grad_norm": 0.3731137216091156, + "learning_rate": 0.00015760797514914414, + "loss": 1.472, + "step": 16324 + }, + { + "epoch": 0.21213592942663198, + "grad_norm": 0.34975066781044006, + "learning_rate": 0.00015760537568723274, + "loss": 1.4161, + "step": 16325 + }, + { + "epoch": 0.21214892397054785, + "grad_norm": 0.3392781913280487, + "learning_rate": 0.00015760277622532136, + "loss": 1.4149, + "step": 16326 + }, + { + "epoch": 0.21216191851446373, + "grad_norm": 0.4145644009113312, + "learning_rate": 0.00015760017676341, + "loss": 1.3132, + "step": 16327 + }, + { + "epoch": 0.2121749130583796, + "grad_norm": 0.4203912615776062, + "learning_rate": 0.00015759757730149861, + "loss": 1.5654, + "step": 16328 + }, + { + "epoch": 0.21218790760229547, + "grad_norm": 0.3994449973106384, + "learning_rate": 0.0001575949778395872, + "loss": 1.4233, + "step": 16329 + }, + { + "epoch": 0.21220090214621137, + "grad_norm": 0.36465057730674744, + "learning_rate": 0.0001575923783776758, + "loss": 1.35, + "step": 16330 + }, + { + "epoch": 0.21221389669012725, + "grad_norm": 0.37644141912460327, + "learning_rate": 0.00015758977891576446, + "loss": 1.5822, + "step": 16331 + }, + { + "epoch": 0.21222689123404312, + "grad_norm": 0.3889392614364624, + "learning_rate": 0.00015758717945385306, + "loss": 1.528, + "step": 16332 + }, + { + "epoch": 0.212239885777959, + "grad_norm": 0.30650168657302856, + "learning_rate": 0.00015758457999194168, + "loss": 1.3817, + "step": 16333 + }, + { + "epoch": 0.21225288032187486, + "grad_norm": 0.33140769600868225, + "learning_rate": 0.00015758198053003028, + "loss": 1.3821, + "step": 16334 + }, + { + "epoch": 0.21226587486579074, + "grad_norm": 0.3856028616428375, + "learning_rate": 0.0001575793810681189, + "loss": 1.2606, + "step": 16335 + }, + { + "epoch": 0.2122788694097066, + "grad_norm": 0.4393945634365082, + "learning_rate": 0.00015757678160620753, + "loss": 1.5902, + "step": 16336 + }, + { + "epoch": 0.21229186395362248, + "grad_norm": 0.3755260109901428, + "learning_rate": 0.00015757418214429613, + "loss": 1.3909, + "step": 16337 + }, + { + "epoch": 0.21230485849753836, + "grad_norm": 0.4153316617012024, + "learning_rate": 0.00015757158268238475, + "loss": 1.554, + "step": 16338 + }, + { + "epoch": 0.21231785304145423, + "grad_norm": 0.4598907232284546, + "learning_rate": 0.00015756898322047337, + "loss": 1.51, + "step": 16339 + }, + { + "epoch": 0.2123308475853701, + "grad_norm": 0.3993397057056427, + "learning_rate": 0.000157566383758562, + "loss": 1.3916, + "step": 16340 + }, + { + "epoch": 0.21234384212928598, + "grad_norm": 0.4127173125743866, + "learning_rate": 0.0001575637842966506, + "loss": 1.2945, + "step": 16341 + }, + { + "epoch": 0.21235683667320185, + "grad_norm": 0.4294263422489166, + "learning_rate": 0.0001575611848347392, + "loss": 1.596, + "step": 16342 + }, + { + "epoch": 0.21236983121711772, + "grad_norm": 0.365667462348938, + "learning_rate": 0.00015755858537282785, + "loss": 1.4254, + "step": 16343 + }, + { + "epoch": 0.2123828257610336, + "grad_norm": 0.362148642539978, + "learning_rate": 0.00015755598591091644, + "loss": 1.6175, + "step": 16344 + }, + { + "epoch": 0.21239582030494947, + "grad_norm": 0.30672407150268555, + "learning_rate": 0.00015755338644900507, + "loss": 1.3325, + "step": 16345 + }, + { + "epoch": 0.21240881484886534, + "grad_norm": 0.30809929966926575, + "learning_rate": 0.00015755078698709366, + "loss": 1.2251, + "step": 16346 + }, + { + "epoch": 0.2124218093927812, + "grad_norm": 0.444663405418396, + "learning_rate": 0.0001575481875251823, + "loss": 1.4597, + "step": 16347 + }, + { + "epoch": 0.2124348039366971, + "grad_norm": 0.3519798815250397, + "learning_rate": 0.0001575455880632709, + "loss": 1.4149, + "step": 16348 + }, + { + "epoch": 0.21244779848061296, + "grad_norm": 0.33298400044441223, + "learning_rate": 0.0001575429886013595, + "loss": 1.3919, + "step": 16349 + }, + { + "epoch": 0.21246079302452883, + "grad_norm": 0.45460131764411926, + "learning_rate": 0.00015754038913944816, + "loss": 1.3684, + "step": 16350 + }, + { + "epoch": 0.2124737875684447, + "grad_norm": 0.39912545680999756, + "learning_rate": 0.00015753778967753676, + "loss": 1.3542, + "step": 16351 + }, + { + "epoch": 0.21248678211236058, + "grad_norm": 0.38210734724998474, + "learning_rate": 0.00015753519021562538, + "loss": 1.6695, + "step": 16352 + }, + { + "epoch": 0.21249977665627645, + "grad_norm": 0.38878941535949707, + "learning_rate": 0.00015753259075371398, + "loss": 1.4372, + "step": 16353 + }, + { + "epoch": 0.21251277120019232, + "grad_norm": 0.42028963565826416, + "learning_rate": 0.0001575299912918026, + "loss": 1.3376, + "step": 16354 + }, + { + "epoch": 0.2125257657441082, + "grad_norm": 0.4192335307598114, + "learning_rate": 0.00015752739182989123, + "loss": 1.5857, + "step": 16355 + }, + { + "epoch": 0.21253876028802407, + "grad_norm": 0.34347543120384216, + "learning_rate": 0.00015752479236797983, + "loss": 1.5225, + "step": 16356 + }, + { + "epoch": 0.21255175483193994, + "grad_norm": 0.37485194206237793, + "learning_rate": 0.00015752219290606845, + "loss": 1.5033, + "step": 16357 + }, + { + "epoch": 0.21256474937585582, + "grad_norm": 0.4230476915836334, + "learning_rate": 0.00015751959344415708, + "loss": 1.2886, + "step": 16358 + }, + { + "epoch": 0.2125777439197717, + "grad_norm": 0.3253383934497833, + "learning_rate": 0.00015751699398224567, + "loss": 1.2778, + "step": 16359 + }, + { + "epoch": 0.21259073846368756, + "grad_norm": 0.4557027518749237, + "learning_rate": 0.0001575143945203343, + "loss": 1.4234, + "step": 16360 + }, + { + "epoch": 0.21260373300760343, + "grad_norm": 0.44025859236717224, + "learning_rate": 0.0001575117950584229, + "loss": 1.5407, + "step": 16361 + }, + { + "epoch": 0.2126167275515193, + "grad_norm": 0.4361938238143921, + "learning_rate": 0.00015750919559651155, + "loss": 1.4776, + "step": 16362 + }, + { + "epoch": 0.21262972209543518, + "grad_norm": 0.4802328050136566, + "learning_rate": 0.00015750659613460015, + "loss": 1.4684, + "step": 16363 + }, + { + "epoch": 0.21264271663935105, + "grad_norm": 0.25335893034935, + "learning_rate": 0.00015750399667268877, + "loss": 1.2355, + "step": 16364 + }, + { + "epoch": 0.21265571118326693, + "grad_norm": 0.4080866873264313, + "learning_rate": 0.00015750139721077737, + "loss": 1.2535, + "step": 16365 + }, + { + "epoch": 0.2126687057271828, + "grad_norm": 0.4209708273410797, + "learning_rate": 0.000157498797748866, + "loss": 1.5189, + "step": 16366 + }, + { + "epoch": 0.21268170027109867, + "grad_norm": 0.3630412817001343, + "learning_rate": 0.00015749619828695462, + "loss": 1.4, + "step": 16367 + }, + { + "epoch": 0.21269469481501455, + "grad_norm": 0.3301694095134735, + "learning_rate": 0.0001574935988250432, + "loss": 1.1731, + "step": 16368 + }, + { + "epoch": 0.21270768935893042, + "grad_norm": 0.3677677512168884, + "learning_rate": 0.00015749099936313184, + "loss": 1.2785, + "step": 16369 + }, + { + "epoch": 0.2127206839028463, + "grad_norm": 0.3904007375240326, + "learning_rate": 0.00015748839990122046, + "loss": 1.3091, + "step": 16370 + }, + { + "epoch": 0.21273367844676216, + "grad_norm": 0.4355577826499939, + "learning_rate": 0.00015748580043930906, + "loss": 1.6459, + "step": 16371 + }, + { + "epoch": 0.21274667299067804, + "grad_norm": 0.29074719548225403, + "learning_rate": 0.00015748320097739768, + "loss": 1.1778, + "step": 16372 + }, + { + "epoch": 0.2127596675345939, + "grad_norm": 0.36832883954048157, + "learning_rate": 0.00015748060151548628, + "loss": 1.3407, + "step": 16373 + }, + { + "epoch": 0.21277266207850978, + "grad_norm": 0.41720035672187805, + "learning_rate": 0.00015747800205357493, + "loss": 1.3455, + "step": 16374 + }, + { + "epoch": 0.21278565662242566, + "grad_norm": 0.38714170455932617, + "learning_rate": 0.00015747540259166353, + "loss": 1.3966, + "step": 16375 + }, + { + "epoch": 0.21279865116634153, + "grad_norm": 0.418650358915329, + "learning_rate": 0.00015747280312975216, + "loss": 1.536, + "step": 16376 + }, + { + "epoch": 0.2128116457102574, + "grad_norm": 0.4572198987007141, + "learning_rate": 0.00015747020366784075, + "loss": 1.3927, + "step": 16377 + }, + { + "epoch": 0.21282464025417328, + "grad_norm": 0.3800646960735321, + "learning_rate": 0.00015746760420592938, + "loss": 1.4264, + "step": 16378 + }, + { + "epoch": 0.21283763479808915, + "grad_norm": 0.40469101071357727, + "learning_rate": 0.000157465004744018, + "loss": 1.5828, + "step": 16379 + }, + { + "epoch": 0.21285062934200502, + "grad_norm": 0.43588578701019287, + "learning_rate": 0.0001574624052821066, + "loss": 1.3067, + "step": 16380 + }, + { + "epoch": 0.2128636238859209, + "grad_norm": 0.38335245847702026, + "learning_rate": 0.00015745980582019522, + "loss": 1.3734, + "step": 16381 + }, + { + "epoch": 0.21287661842983677, + "grad_norm": 0.41501665115356445, + "learning_rate": 0.00015745720635828385, + "loss": 1.4202, + "step": 16382 + }, + { + "epoch": 0.21288961297375264, + "grad_norm": 0.3989786207675934, + "learning_rate": 0.00015745460689637245, + "loss": 1.5109, + "step": 16383 + }, + { + "epoch": 0.2129026075176685, + "grad_norm": 0.4099924862384796, + "learning_rate": 0.00015745200743446107, + "loss": 1.4466, + "step": 16384 + }, + { + "epoch": 0.21291560206158439, + "grad_norm": 0.41755086183547974, + "learning_rate": 0.0001574494079725497, + "loss": 1.4014, + "step": 16385 + }, + { + "epoch": 0.21292859660550026, + "grad_norm": 0.3694026470184326, + "learning_rate": 0.00015744680851063832, + "loss": 1.2747, + "step": 16386 + }, + { + "epoch": 0.21294159114941613, + "grad_norm": 0.4344460368156433, + "learning_rate": 0.00015744420904872692, + "loss": 1.4092, + "step": 16387 + }, + { + "epoch": 0.212954585693332, + "grad_norm": 0.30986517667770386, + "learning_rate": 0.00015744160958681554, + "loss": 1.3623, + "step": 16388 + }, + { + "epoch": 0.21296758023724788, + "grad_norm": 0.42683228850364685, + "learning_rate": 0.00015743901012490417, + "loss": 1.4106, + "step": 16389 + }, + { + "epoch": 0.21298057478116375, + "grad_norm": 0.3688706159591675, + "learning_rate": 0.00015743641066299276, + "loss": 1.3762, + "step": 16390 + }, + { + "epoch": 0.21299356932507962, + "grad_norm": 0.4859285354614258, + "learning_rate": 0.0001574338112010814, + "loss": 1.4464, + "step": 16391 + }, + { + "epoch": 0.2130065638689955, + "grad_norm": 0.4936436414718628, + "learning_rate": 0.00015743121173916998, + "loss": 1.4556, + "step": 16392 + }, + { + "epoch": 0.21301955841291137, + "grad_norm": 0.38862279057502747, + "learning_rate": 0.00015742861227725864, + "loss": 1.3634, + "step": 16393 + }, + { + "epoch": 0.21303255295682724, + "grad_norm": 0.45797985792160034, + "learning_rate": 0.00015742601281534723, + "loss": 1.425, + "step": 16394 + }, + { + "epoch": 0.21304554750074312, + "grad_norm": 0.37307730317115784, + "learning_rate": 0.00015742341335343586, + "loss": 1.3996, + "step": 16395 + }, + { + "epoch": 0.213058542044659, + "grad_norm": 0.43095463514328003, + "learning_rate": 0.00015742081389152446, + "loss": 1.3158, + "step": 16396 + }, + { + "epoch": 0.21307153658857486, + "grad_norm": 0.47890886664390564, + "learning_rate": 0.00015741821442961308, + "loss": 1.355, + "step": 16397 + }, + { + "epoch": 0.21308453113249073, + "grad_norm": 0.3687115013599396, + "learning_rate": 0.0001574156149677017, + "loss": 1.2198, + "step": 16398 + }, + { + "epoch": 0.2130975256764066, + "grad_norm": 0.39474666118621826, + "learning_rate": 0.0001574130155057903, + "loss": 1.5039, + "step": 16399 + }, + { + "epoch": 0.21311052022032248, + "grad_norm": 0.3476211726665497, + "learning_rate": 0.00015741041604387893, + "loss": 1.3581, + "step": 16400 + }, + { + "epoch": 0.21312351476423835, + "grad_norm": 0.4627285897731781, + "learning_rate": 0.00015740781658196755, + "loss": 1.5028, + "step": 16401 + }, + { + "epoch": 0.21313650930815423, + "grad_norm": 0.37881433963775635, + "learning_rate": 0.00015740521712005615, + "loss": 1.4737, + "step": 16402 + }, + { + "epoch": 0.2131495038520701, + "grad_norm": 0.4564220607280731, + "learning_rate": 0.00015740261765814477, + "loss": 1.3839, + "step": 16403 + }, + { + "epoch": 0.21316249839598597, + "grad_norm": 0.3127775192260742, + "learning_rate": 0.00015740001819623337, + "loss": 1.5259, + "step": 16404 + }, + { + "epoch": 0.21317549293990185, + "grad_norm": 0.448333740234375, + "learning_rate": 0.00015739741873432202, + "loss": 1.2491, + "step": 16405 + }, + { + "epoch": 0.21318848748381775, + "grad_norm": 0.4341827630996704, + "learning_rate": 0.00015739481927241062, + "loss": 1.5643, + "step": 16406 + }, + { + "epoch": 0.21320148202773362, + "grad_norm": 0.4399312734603882, + "learning_rate": 0.00015739221981049924, + "loss": 1.4382, + "step": 16407 + }, + { + "epoch": 0.2132144765716495, + "grad_norm": 0.35957273840904236, + "learning_rate": 0.00015738962034858784, + "loss": 1.5121, + "step": 16408 + }, + { + "epoch": 0.21322747111556536, + "grad_norm": 0.25651615858078003, + "learning_rate": 0.00015738702088667646, + "loss": 1.3806, + "step": 16409 + }, + { + "epoch": 0.21324046565948124, + "grad_norm": 0.3757201135158539, + "learning_rate": 0.0001573844214247651, + "loss": 1.4592, + "step": 16410 + }, + { + "epoch": 0.2132534602033971, + "grad_norm": 0.4285160303115845, + "learning_rate": 0.0001573818219628537, + "loss": 1.5571, + "step": 16411 + }, + { + "epoch": 0.21326645474731298, + "grad_norm": 0.37521782517433167, + "learning_rate": 0.0001573792225009423, + "loss": 1.3879, + "step": 16412 + }, + { + "epoch": 0.21327944929122886, + "grad_norm": 0.42501676082611084, + "learning_rate": 0.00015737662303903094, + "loss": 1.4827, + "step": 16413 + }, + { + "epoch": 0.21329244383514473, + "grad_norm": 0.3862791061401367, + "learning_rate": 0.00015737402357711953, + "loss": 1.2277, + "step": 16414 + }, + { + "epoch": 0.2133054383790606, + "grad_norm": 0.34886133670806885, + "learning_rate": 0.00015737142411520816, + "loss": 1.2409, + "step": 16415 + }, + { + "epoch": 0.21331843292297648, + "grad_norm": 0.4193170368671417, + "learning_rate": 0.00015736882465329676, + "loss": 1.4239, + "step": 16416 + }, + { + "epoch": 0.21333142746689235, + "grad_norm": 0.3871462941169739, + "learning_rate": 0.0001573662251913854, + "loss": 1.5403, + "step": 16417 + }, + { + "epoch": 0.21334442201080822, + "grad_norm": 0.451837956905365, + "learning_rate": 0.000157363625729474, + "loss": 1.451, + "step": 16418 + }, + { + "epoch": 0.2133574165547241, + "grad_norm": 0.33466705679893494, + "learning_rate": 0.00015736102626756263, + "loss": 1.3357, + "step": 16419 + }, + { + "epoch": 0.21337041109863997, + "grad_norm": 0.4441995322704315, + "learning_rate": 0.00015735842680565123, + "loss": 1.2815, + "step": 16420 + }, + { + "epoch": 0.21338340564255584, + "grad_norm": 0.24887260794639587, + "learning_rate": 0.00015735582734373985, + "loss": 1.2635, + "step": 16421 + }, + { + "epoch": 0.2133964001864717, + "grad_norm": 0.2998470664024353, + "learning_rate": 0.00015735322788182847, + "loss": 1.4651, + "step": 16422 + }, + { + "epoch": 0.21340939473038759, + "grad_norm": 0.3927188515663147, + "learning_rate": 0.00015735062841991707, + "loss": 1.4348, + "step": 16423 + }, + { + "epoch": 0.21342238927430346, + "grad_norm": 0.4490024745464325, + "learning_rate": 0.00015734802895800572, + "loss": 1.2902, + "step": 16424 + }, + { + "epoch": 0.21343538381821933, + "grad_norm": 0.4487338960170746, + "learning_rate": 0.00015734542949609432, + "loss": 1.3611, + "step": 16425 + }, + { + "epoch": 0.2134483783621352, + "grad_norm": 0.3307473659515381, + "learning_rate": 0.00015734283003418292, + "loss": 1.2051, + "step": 16426 + }, + { + "epoch": 0.21346137290605108, + "grad_norm": 0.3719083368778229, + "learning_rate": 0.00015734023057227154, + "loss": 1.2721, + "step": 16427 + }, + { + "epoch": 0.21347436744996695, + "grad_norm": 0.38454705476760864, + "learning_rate": 0.00015733763111036017, + "loss": 1.5339, + "step": 16428 + }, + { + "epoch": 0.21348736199388282, + "grad_norm": 0.44238048791885376, + "learning_rate": 0.0001573350316484488, + "loss": 1.2716, + "step": 16429 + }, + { + "epoch": 0.2135003565377987, + "grad_norm": 0.38776183128356934, + "learning_rate": 0.0001573324321865374, + "loss": 1.5555, + "step": 16430 + }, + { + "epoch": 0.21351335108171457, + "grad_norm": 0.38936471939086914, + "learning_rate": 0.00015732983272462601, + "loss": 1.202, + "step": 16431 + }, + { + "epoch": 0.21352634562563044, + "grad_norm": 0.4147506654262543, + "learning_rate": 0.00015732723326271464, + "loss": 1.4464, + "step": 16432 + }, + { + "epoch": 0.21353934016954632, + "grad_norm": 0.4918941557407379, + "learning_rate": 0.00015732463380080324, + "loss": 1.5148, + "step": 16433 + }, + { + "epoch": 0.2135523347134622, + "grad_norm": 0.35338094830513, + "learning_rate": 0.00015732203433889186, + "loss": 1.1859, + "step": 16434 + }, + { + "epoch": 0.21356532925737806, + "grad_norm": 0.449516236782074, + "learning_rate": 0.00015731943487698046, + "loss": 1.3216, + "step": 16435 + }, + { + "epoch": 0.21357832380129393, + "grad_norm": 0.36722397804260254, + "learning_rate": 0.0001573168354150691, + "loss": 1.2822, + "step": 16436 + }, + { + "epoch": 0.2135913183452098, + "grad_norm": 0.3264809548854828, + "learning_rate": 0.0001573142359531577, + "loss": 1.3596, + "step": 16437 + }, + { + "epoch": 0.21360431288912568, + "grad_norm": 0.42582595348358154, + "learning_rate": 0.0001573116364912463, + "loss": 1.5805, + "step": 16438 + }, + { + "epoch": 0.21361730743304155, + "grad_norm": 0.4740326404571533, + "learning_rate": 0.00015730903702933493, + "loss": 1.4249, + "step": 16439 + }, + { + "epoch": 0.21363030197695743, + "grad_norm": 0.3801330029964447, + "learning_rate": 0.00015730643756742355, + "loss": 1.4191, + "step": 16440 + }, + { + "epoch": 0.2136432965208733, + "grad_norm": 0.3570318818092346, + "learning_rate": 0.00015730383810551218, + "loss": 1.5236, + "step": 16441 + }, + { + "epoch": 0.21365629106478917, + "grad_norm": 0.3715328574180603, + "learning_rate": 0.00015730123864360077, + "loss": 1.4106, + "step": 16442 + }, + { + "epoch": 0.21366928560870505, + "grad_norm": 0.46679291129112244, + "learning_rate": 0.0001572986391816894, + "loss": 1.4655, + "step": 16443 + }, + { + "epoch": 0.21368228015262092, + "grad_norm": 0.3911263346672058, + "learning_rate": 0.00015729603971977802, + "loss": 1.2921, + "step": 16444 + }, + { + "epoch": 0.2136952746965368, + "grad_norm": 0.45583686232566833, + "learning_rate": 0.00015729344025786662, + "loss": 1.3771, + "step": 16445 + }, + { + "epoch": 0.21370826924045266, + "grad_norm": 0.44621261954307556, + "learning_rate": 0.00015729084079595525, + "loss": 1.4974, + "step": 16446 + }, + { + "epoch": 0.21372126378436854, + "grad_norm": 0.24736717343330383, + "learning_rate": 0.00015728824133404384, + "loss": 1.3437, + "step": 16447 + }, + { + "epoch": 0.2137342583282844, + "grad_norm": 0.47360867261886597, + "learning_rate": 0.0001572856418721325, + "loss": 1.3008, + "step": 16448 + }, + { + "epoch": 0.21374725287220028, + "grad_norm": 0.4432130753993988, + "learning_rate": 0.0001572830424102211, + "loss": 1.292, + "step": 16449 + }, + { + "epoch": 0.21376024741611616, + "grad_norm": 0.4720573425292969, + "learning_rate": 0.00015728044294830972, + "loss": 1.2591, + "step": 16450 + }, + { + "epoch": 0.21377324196003203, + "grad_norm": 0.32347947359085083, + "learning_rate": 0.00015727784348639831, + "loss": 1.2989, + "step": 16451 + }, + { + "epoch": 0.2137862365039479, + "grad_norm": 0.39931702613830566, + "learning_rate": 0.00015727524402448694, + "loss": 1.3725, + "step": 16452 + }, + { + "epoch": 0.21379923104786377, + "grad_norm": 0.36974623799324036, + "learning_rate": 0.00015727264456257556, + "loss": 1.5771, + "step": 16453 + }, + { + "epoch": 0.21381222559177965, + "grad_norm": 0.3857141137123108, + "learning_rate": 0.00015727004510066416, + "loss": 1.4767, + "step": 16454 + }, + { + "epoch": 0.21382522013569552, + "grad_norm": 0.37301379442214966, + "learning_rate": 0.00015726744563875278, + "loss": 1.2362, + "step": 16455 + }, + { + "epoch": 0.2138382146796114, + "grad_norm": 0.384650319814682, + "learning_rate": 0.0001572648461768414, + "loss": 1.3956, + "step": 16456 + }, + { + "epoch": 0.21385120922352727, + "grad_norm": 0.35109105706214905, + "learning_rate": 0.00015726224671493, + "loss": 1.1436, + "step": 16457 + }, + { + "epoch": 0.21386420376744314, + "grad_norm": 0.4444829225540161, + "learning_rate": 0.00015725964725301863, + "loss": 1.4092, + "step": 16458 + }, + { + "epoch": 0.213877198311359, + "grad_norm": 0.38663068413734436, + "learning_rate": 0.00015725704779110726, + "loss": 1.3128, + "step": 16459 + }, + { + "epoch": 0.21389019285527489, + "grad_norm": 0.46210744976997375, + "learning_rate": 0.00015725444832919588, + "loss": 1.507, + "step": 16460 + }, + { + "epoch": 0.21390318739919076, + "grad_norm": 0.3999937176704407, + "learning_rate": 0.00015725184886728448, + "loss": 1.3442, + "step": 16461 + }, + { + "epoch": 0.21391618194310663, + "grad_norm": 0.4437315762042999, + "learning_rate": 0.0001572492494053731, + "loss": 1.4218, + "step": 16462 + }, + { + "epoch": 0.2139291764870225, + "grad_norm": 0.422163188457489, + "learning_rate": 0.00015724664994346173, + "loss": 1.318, + "step": 16463 + }, + { + "epoch": 0.21394217103093838, + "grad_norm": 0.4266189932823181, + "learning_rate": 0.00015724405048155032, + "loss": 1.5237, + "step": 16464 + }, + { + "epoch": 0.21395516557485425, + "grad_norm": 0.45921310782432556, + "learning_rate": 0.00015724145101963895, + "loss": 1.5496, + "step": 16465 + }, + { + "epoch": 0.21396816011877012, + "grad_norm": 0.37793177366256714, + "learning_rate": 0.00015723885155772755, + "loss": 1.3886, + "step": 16466 + }, + { + "epoch": 0.213981154662686, + "grad_norm": 0.37618908286094666, + "learning_rate": 0.00015723625209581617, + "loss": 1.4356, + "step": 16467 + }, + { + "epoch": 0.21399414920660187, + "grad_norm": 0.3298528790473938, + "learning_rate": 0.0001572336526339048, + "loss": 1.4025, + "step": 16468 + }, + { + "epoch": 0.21400714375051774, + "grad_norm": 0.3641802966594696, + "learning_rate": 0.0001572310531719934, + "loss": 1.3217, + "step": 16469 + }, + { + "epoch": 0.21402013829443362, + "grad_norm": 0.398813933134079, + "learning_rate": 0.00015722845371008202, + "loss": 1.223, + "step": 16470 + }, + { + "epoch": 0.2140331328383495, + "grad_norm": 0.45190438628196716, + "learning_rate": 0.00015722585424817064, + "loss": 1.6055, + "step": 16471 + }, + { + "epoch": 0.21404612738226536, + "grad_norm": 0.38312652707099915, + "learning_rate": 0.00015722325478625927, + "loss": 1.3531, + "step": 16472 + }, + { + "epoch": 0.21405912192618123, + "grad_norm": 0.3588237166404724, + "learning_rate": 0.00015722065532434786, + "loss": 1.2749, + "step": 16473 + }, + { + "epoch": 0.2140721164700971, + "grad_norm": 0.3974151313304901, + "learning_rate": 0.0001572180558624365, + "loss": 1.4132, + "step": 16474 + }, + { + "epoch": 0.21408511101401298, + "grad_norm": 0.36366257071495056, + "learning_rate": 0.0001572154564005251, + "loss": 1.4313, + "step": 16475 + }, + { + "epoch": 0.21409810555792885, + "grad_norm": 0.4387637972831726, + "learning_rate": 0.0001572128569386137, + "loss": 1.4985, + "step": 16476 + }, + { + "epoch": 0.21411110010184473, + "grad_norm": 0.31119364500045776, + "learning_rate": 0.00015721025747670233, + "loss": 1.3743, + "step": 16477 + }, + { + "epoch": 0.2141240946457606, + "grad_norm": 0.21330766379833221, + "learning_rate": 0.00015720765801479093, + "loss": 1.4404, + "step": 16478 + }, + { + "epoch": 0.21413708918967647, + "grad_norm": 0.3332308530807495, + "learning_rate": 0.00015720505855287958, + "loss": 1.3968, + "step": 16479 + }, + { + "epoch": 0.21415008373359234, + "grad_norm": 0.3787296414375305, + "learning_rate": 0.00015720245909096818, + "loss": 1.3836, + "step": 16480 + }, + { + "epoch": 0.21416307827750822, + "grad_norm": 0.44584155082702637, + "learning_rate": 0.00015719985962905678, + "loss": 1.2373, + "step": 16481 + }, + { + "epoch": 0.21417607282142412, + "grad_norm": 0.31913888454437256, + "learning_rate": 0.0001571972601671454, + "loss": 1.4198, + "step": 16482 + }, + { + "epoch": 0.21418906736534, + "grad_norm": 0.5201233625411987, + "learning_rate": 0.00015719466070523403, + "loss": 1.4421, + "step": 16483 + }, + { + "epoch": 0.21420206190925586, + "grad_norm": 0.4701615571975708, + "learning_rate": 0.00015719206124332265, + "loss": 1.4973, + "step": 16484 + }, + { + "epoch": 0.21421505645317174, + "grad_norm": 0.5191045999526978, + "learning_rate": 0.00015718946178141125, + "loss": 1.4815, + "step": 16485 + }, + { + "epoch": 0.2142280509970876, + "grad_norm": 0.352174311876297, + "learning_rate": 0.00015718686231949987, + "loss": 1.5913, + "step": 16486 + }, + { + "epoch": 0.21424104554100348, + "grad_norm": 0.45393285155296326, + "learning_rate": 0.0001571842628575885, + "loss": 1.4239, + "step": 16487 + }, + { + "epoch": 0.21425404008491936, + "grad_norm": 0.3982965052127838, + "learning_rate": 0.0001571816633956771, + "loss": 1.5383, + "step": 16488 + }, + { + "epoch": 0.21426703462883523, + "grad_norm": 0.37316644191741943, + "learning_rate": 0.00015717906393376572, + "loss": 1.4464, + "step": 16489 + }, + { + "epoch": 0.2142800291727511, + "grad_norm": 0.46091148257255554, + "learning_rate": 0.00015717646447185432, + "loss": 1.5204, + "step": 16490 + }, + { + "epoch": 0.21429302371666697, + "grad_norm": 0.462181031703949, + "learning_rate": 0.00015717386500994297, + "loss": 1.5215, + "step": 16491 + }, + { + "epoch": 0.21430601826058285, + "grad_norm": 0.4024527966976166, + "learning_rate": 0.00015717126554803157, + "loss": 1.3385, + "step": 16492 + }, + { + "epoch": 0.21431901280449872, + "grad_norm": 0.3024749457836151, + "learning_rate": 0.00015716866608612016, + "loss": 1.4666, + "step": 16493 + }, + { + "epoch": 0.2143320073484146, + "grad_norm": 0.9460294246673584, + "learning_rate": 0.0001571660666242088, + "loss": 1.5663, + "step": 16494 + }, + { + "epoch": 0.21434500189233047, + "grad_norm": 0.3830714523792267, + "learning_rate": 0.0001571634671622974, + "loss": 1.3336, + "step": 16495 + }, + { + "epoch": 0.21435799643624634, + "grad_norm": 0.3980732858181, + "learning_rate": 0.00015716086770038604, + "loss": 1.4113, + "step": 16496 + }, + { + "epoch": 0.2143709909801622, + "grad_norm": 0.33247390389442444, + "learning_rate": 0.00015715826823847463, + "loss": 1.3418, + "step": 16497 + }, + { + "epoch": 0.21438398552407809, + "grad_norm": 0.3369801640510559, + "learning_rate": 0.00015715566877656326, + "loss": 1.2834, + "step": 16498 + }, + { + "epoch": 0.21439698006799396, + "grad_norm": 0.413433313369751, + "learning_rate": 0.00015715306931465188, + "loss": 1.4559, + "step": 16499 + }, + { + "epoch": 0.21440997461190983, + "grad_norm": 0.5788470506668091, + "learning_rate": 0.00015715046985274048, + "loss": 1.4304, + "step": 16500 + }, + { + "epoch": 0.2144229691558257, + "grad_norm": 0.44265472888946533, + "learning_rate": 0.0001571478703908291, + "loss": 1.4603, + "step": 16501 + }, + { + "epoch": 0.21443596369974158, + "grad_norm": 0.42819735407829285, + "learning_rate": 0.00015714527092891773, + "loss": 1.4628, + "step": 16502 + }, + { + "epoch": 0.21444895824365745, + "grad_norm": 0.5448420643806458, + "learning_rate": 0.00015714267146700635, + "loss": 1.5795, + "step": 16503 + }, + { + "epoch": 0.21446195278757332, + "grad_norm": 0.37777411937713623, + "learning_rate": 0.00015714007200509495, + "loss": 1.3947, + "step": 16504 + }, + { + "epoch": 0.2144749473314892, + "grad_norm": 0.3798162341117859, + "learning_rate": 0.00015713747254318355, + "loss": 1.3604, + "step": 16505 + }, + { + "epoch": 0.21448794187540507, + "grad_norm": 0.3861330449581146, + "learning_rate": 0.0001571348730812722, + "loss": 1.1878, + "step": 16506 + }, + { + "epoch": 0.21450093641932094, + "grad_norm": 0.2823701500892639, + "learning_rate": 0.0001571322736193608, + "loss": 1.1425, + "step": 16507 + }, + { + "epoch": 0.21451393096323682, + "grad_norm": 0.398682177066803, + "learning_rate": 0.00015712967415744942, + "loss": 1.3511, + "step": 16508 + }, + { + "epoch": 0.2145269255071527, + "grad_norm": 0.39075520634651184, + "learning_rate": 0.00015712707469553802, + "loss": 1.2274, + "step": 16509 + }, + { + "epoch": 0.21453992005106856, + "grad_norm": 0.4440814256668091, + "learning_rate": 0.00015712447523362664, + "loss": 1.4322, + "step": 16510 + }, + { + "epoch": 0.21455291459498443, + "grad_norm": 0.4309839606285095, + "learning_rate": 0.00015712187577171527, + "loss": 1.6187, + "step": 16511 + }, + { + "epoch": 0.2145659091389003, + "grad_norm": 0.2919430434703827, + "learning_rate": 0.00015711927630980387, + "loss": 1.3373, + "step": 16512 + }, + { + "epoch": 0.21457890368281618, + "grad_norm": 0.351412832736969, + "learning_rate": 0.0001571166768478925, + "loss": 1.2871, + "step": 16513 + }, + { + "epoch": 0.21459189822673205, + "grad_norm": 0.36734259128570557, + "learning_rate": 0.00015711407738598111, + "loss": 1.3329, + "step": 16514 + }, + { + "epoch": 0.21460489277064793, + "grad_norm": 0.3979712128639221, + "learning_rate": 0.00015711147792406974, + "loss": 1.374, + "step": 16515 + }, + { + "epoch": 0.2146178873145638, + "grad_norm": 0.4031108021736145, + "learning_rate": 0.00015710887846215834, + "loss": 1.4755, + "step": 16516 + }, + { + "epoch": 0.21463088185847967, + "grad_norm": 0.3169833719730377, + "learning_rate": 0.00015710627900024696, + "loss": 1.3009, + "step": 16517 + }, + { + "epoch": 0.21464387640239554, + "grad_norm": 0.33369866013526917, + "learning_rate": 0.00015710367953833559, + "loss": 1.177, + "step": 16518 + }, + { + "epoch": 0.21465687094631142, + "grad_norm": 0.5357704162597656, + "learning_rate": 0.00015710108007642418, + "loss": 1.368, + "step": 16519 + }, + { + "epoch": 0.2146698654902273, + "grad_norm": 0.4652195870876312, + "learning_rate": 0.0001570984806145128, + "loss": 1.3483, + "step": 16520 + }, + { + "epoch": 0.21468286003414316, + "grad_norm": 0.4243440330028534, + "learning_rate": 0.0001570958811526014, + "loss": 1.4274, + "step": 16521 + }, + { + "epoch": 0.21469585457805904, + "grad_norm": 0.48390108346939087, + "learning_rate": 0.00015709328169069003, + "loss": 1.5251, + "step": 16522 + }, + { + "epoch": 0.2147088491219749, + "grad_norm": 0.391955703496933, + "learning_rate": 0.00015709068222877865, + "loss": 1.426, + "step": 16523 + }, + { + "epoch": 0.21472184366589078, + "grad_norm": 0.38584983348846436, + "learning_rate": 0.00015708808276686725, + "loss": 1.4606, + "step": 16524 + }, + { + "epoch": 0.21473483820980666, + "grad_norm": 0.43249809741973877, + "learning_rate": 0.00015708548330495588, + "loss": 1.4831, + "step": 16525 + }, + { + "epoch": 0.21474783275372253, + "grad_norm": 0.3600912094116211, + "learning_rate": 0.0001570828838430445, + "loss": 1.4734, + "step": 16526 + }, + { + "epoch": 0.2147608272976384, + "grad_norm": 0.47654473781585693, + "learning_rate": 0.00015708028438113312, + "loss": 1.4104, + "step": 16527 + }, + { + "epoch": 0.21477382184155427, + "grad_norm": 0.420943021774292, + "learning_rate": 0.00015707768491922172, + "loss": 1.4508, + "step": 16528 + }, + { + "epoch": 0.21478681638547015, + "grad_norm": 0.3651467263698578, + "learning_rate": 0.00015707508545731035, + "loss": 1.4071, + "step": 16529 + }, + { + "epoch": 0.21479981092938602, + "grad_norm": 0.3509159982204437, + "learning_rate": 0.00015707248599539897, + "loss": 1.3449, + "step": 16530 + }, + { + "epoch": 0.2148128054733019, + "grad_norm": 0.40557292103767395, + "learning_rate": 0.00015706988653348757, + "loss": 1.2499, + "step": 16531 + }, + { + "epoch": 0.21482580001721777, + "grad_norm": 0.5104345679283142, + "learning_rate": 0.0001570672870715762, + "loss": 1.3207, + "step": 16532 + }, + { + "epoch": 0.21483879456113364, + "grad_norm": 0.35232189297676086, + "learning_rate": 0.00015706468760966482, + "loss": 1.4028, + "step": 16533 + }, + { + "epoch": 0.2148517891050495, + "grad_norm": 0.3544084131717682, + "learning_rate": 0.00015706208814775344, + "loss": 1.442, + "step": 16534 + }, + { + "epoch": 0.21486478364896539, + "grad_norm": 0.3969372808933258, + "learning_rate": 0.00015705948868584204, + "loss": 1.3704, + "step": 16535 + }, + { + "epoch": 0.21487777819288126, + "grad_norm": 0.28448039293289185, + "learning_rate": 0.00015705688922393064, + "loss": 1.4701, + "step": 16536 + }, + { + "epoch": 0.21489077273679713, + "grad_norm": 0.4381963014602661, + "learning_rate": 0.0001570542897620193, + "loss": 1.4371, + "step": 16537 + }, + { + "epoch": 0.214903767280713, + "grad_norm": 0.45803946256637573, + "learning_rate": 0.00015705169030010789, + "loss": 1.3939, + "step": 16538 + }, + { + "epoch": 0.21491676182462888, + "grad_norm": 0.47935450077056885, + "learning_rate": 0.0001570490908381965, + "loss": 1.1933, + "step": 16539 + }, + { + "epoch": 0.21492975636854475, + "grad_norm": 0.40125685930252075, + "learning_rate": 0.0001570464913762851, + "loss": 1.367, + "step": 16540 + }, + { + "epoch": 0.21494275091246062, + "grad_norm": 0.5328443050384521, + "learning_rate": 0.00015704389191437373, + "loss": 1.5222, + "step": 16541 + }, + { + "epoch": 0.2149557454563765, + "grad_norm": 0.46879997849464417, + "learning_rate": 0.00015704129245246236, + "loss": 1.4179, + "step": 16542 + }, + { + "epoch": 0.21496874000029237, + "grad_norm": 0.35379940271377563, + "learning_rate": 0.00015703869299055095, + "loss": 1.4787, + "step": 16543 + }, + { + "epoch": 0.21498173454420824, + "grad_norm": 0.4055086076259613, + "learning_rate": 0.00015703609352863958, + "loss": 1.2983, + "step": 16544 + }, + { + "epoch": 0.21499472908812411, + "grad_norm": 0.33105772733688354, + "learning_rate": 0.0001570334940667282, + "loss": 1.316, + "step": 16545 + }, + { + "epoch": 0.21500772363204, + "grad_norm": 0.35414037108421326, + "learning_rate": 0.00015703089460481683, + "loss": 1.225, + "step": 16546 + }, + { + "epoch": 0.21502071817595586, + "grad_norm": 0.41440072655677795, + "learning_rate": 0.00015702829514290542, + "loss": 1.5852, + "step": 16547 + }, + { + "epoch": 0.21503371271987173, + "grad_norm": 0.3771766722202301, + "learning_rate": 0.00015702569568099402, + "loss": 1.302, + "step": 16548 + }, + { + "epoch": 0.2150467072637876, + "grad_norm": 0.4226526618003845, + "learning_rate": 0.00015702309621908267, + "loss": 1.3977, + "step": 16549 + }, + { + "epoch": 0.21505970180770348, + "grad_norm": 0.4723445177078247, + "learning_rate": 0.00015702049675717127, + "loss": 1.4628, + "step": 16550 + }, + { + "epoch": 0.21507269635161935, + "grad_norm": 0.4863370954990387, + "learning_rate": 0.0001570178972952599, + "loss": 1.6494, + "step": 16551 + }, + { + "epoch": 0.21508569089553523, + "grad_norm": 0.3522881865501404, + "learning_rate": 0.0001570152978333485, + "loss": 1.4251, + "step": 16552 + }, + { + "epoch": 0.2150986854394511, + "grad_norm": 0.3577641248703003, + "learning_rate": 0.00015701269837143712, + "loss": 1.1627, + "step": 16553 + }, + { + "epoch": 0.21511167998336697, + "grad_norm": 0.3905261158943176, + "learning_rate": 0.00015701009890952574, + "loss": 1.2126, + "step": 16554 + }, + { + "epoch": 0.21512467452728284, + "grad_norm": 0.30502554774284363, + "learning_rate": 0.00015700749944761434, + "loss": 1.4291, + "step": 16555 + }, + { + "epoch": 0.21513766907119872, + "grad_norm": 0.3698728680610657, + "learning_rate": 0.00015700489998570296, + "loss": 1.328, + "step": 16556 + }, + { + "epoch": 0.2151506636151146, + "grad_norm": 0.3145277202129364, + "learning_rate": 0.0001570023005237916, + "loss": 1.4469, + "step": 16557 + }, + { + "epoch": 0.2151636581590305, + "grad_norm": 0.40194594860076904, + "learning_rate": 0.0001569997010618802, + "loss": 1.5475, + "step": 16558 + }, + { + "epoch": 0.21517665270294636, + "grad_norm": 0.5104473233222961, + "learning_rate": 0.0001569971015999688, + "loss": 1.4858, + "step": 16559 + }, + { + "epoch": 0.21518964724686224, + "grad_norm": 0.38238149881362915, + "learning_rate": 0.0001569945021380574, + "loss": 1.2252, + "step": 16560 + }, + { + "epoch": 0.2152026417907781, + "grad_norm": 0.39461711049079895, + "learning_rate": 0.00015699190267614606, + "loss": 1.3934, + "step": 16561 + }, + { + "epoch": 0.21521563633469398, + "grad_norm": 0.38339686393737793, + "learning_rate": 0.00015698930321423466, + "loss": 1.5358, + "step": 16562 + }, + { + "epoch": 0.21522863087860986, + "grad_norm": 0.38039103150367737, + "learning_rate": 0.00015698670375232328, + "loss": 1.2871, + "step": 16563 + }, + { + "epoch": 0.21524162542252573, + "grad_norm": 0.43147265911102295, + "learning_rate": 0.00015698410429041188, + "loss": 1.4518, + "step": 16564 + }, + { + "epoch": 0.2152546199664416, + "grad_norm": 0.3983471095561981, + "learning_rate": 0.0001569815048285005, + "loss": 1.4717, + "step": 16565 + }, + { + "epoch": 0.21526761451035747, + "grad_norm": 0.3225042223930359, + "learning_rate": 0.00015697890536658913, + "loss": 1.3561, + "step": 16566 + }, + { + "epoch": 0.21528060905427335, + "grad_norm": 0.3922005593776703, + "learning_rate": 0.00015697630590467772, + "loss": 1.4307, + "step": 16567 + }, + { + "epoch": 0.21529360359818922, + "grad_norm": 0.3118862807750702, + "learning_rate": 0.00015697370644276635, + "loss": 1.3035, + "step": 16568 + }, + { + "epoch": 0.2153065981421051, + "grad_norm": 0.39475566148757935, + "learning_rate": 0.00015697110698085497, + "loss": 1.2448, + "step": 16569 + }, + { + "epoch": 0.21531959268602097, + "grad_norm": 0.3987126350402832, + "learning_rate": 0.0001569685075189436, + "loss": 1.3782, + "step": 16570 + }, + { + "epoch": 0.21533258722993684, + "grad_norm": 0.47500649094581604, + "learning_rate": 0.0001569659080570322, + "loss": 1.4427, + "step": 16571 + }, + { + "epoch": 0.2153455817738527, + "grad_norm": 0.33055704832077026, + "learning_rate": 0.00015696330859512082, + "loss": 1.187, + "step": 16572 + }, + { + "epoch": 0.21535857631776859, + "grad_norm": 0.4292328953742981, + "learning_rate": 0.00015696070913320944, + "loss": 1.3624, + "step": 16573 + }, + { + "epoch": 0.21537157086168446, + "grad_norm": 0.30911752581596375, + "learning_rate": 0.00015695810967129804, + "loss": 1.2905, + "step": 16574 + }, + { + "epoch": 0.21538456540560033, + "grad_norm": 0.4092995226383209, + "learning_rate": 0.00015695551020938667, + "loss": 1.4696, + "step": 16575 + }, + { + "epoch": 0.2153975599495162, + "grad_norm": 0.45963016152381897, + "learning_rate": 0.0001569529107474753, + "loss": 1.4939, + "step": 16576 + }, + { + "epoch": 0.21541055449343208, + "grad_norm": 0.42844027280807495, + "learning_rate": 0.0001569503112855639, + "loss": 1.4469, + "step": 16577 + }, + { + "epoch": 0.21542354903734795, + "grad_norm": 0.4033646583557129, + "learning_rate": 0.0001569477118236525, + "loss": 1.2084, + "step": 16578 + }, + { + "epoch": 0.21543654358126382, + "grad_norm": 0.3368788957595825, + "learning_rate": 0.0001569451123617411, + "loss": 1.2441, + "step": 16579 + }, + { + "epoch": 0.2154495381251797, + "grad_norm": 0.3674704432487488, + "learning_rate": 0.00015694251289982976, + "loss": 1.558, + "step": 16580 + }, + { + "epoch": 0.21546253266909557, + "grad_norm": 0.45151060819625854, + "learning_rate": 0.00015693991343791836, + "loss": 1.4704, + "step": 16581 + }, + { + "epoch": 0.21547552721301144, + "grad_norm": 0.471706360578537, + "learning_rate": 0.00015693731397600698, + "loss": 1.4053, + "step": 16582 + }, + { + "epoch": 0.21548852175692731, + "grad_norm": 0.37198522686958313, + "learning_rate": 0.00015693471451409558, + "loss": 1.5183, + "step": 16583 + }, + { + "epoch": 0.2155015163008432, + "grad_norm": 0.4410859942436218, + "learning_rate": 0.0001569321150521842, + "loss": 1.4267, + "step": 16584 + }, + { + "epoch": 0.21551451084475906, + "grad_norm": 0.4573918879032135, + "learning_rate": 0.00015692951559027283, + "loss": 1.5274, + "step": 16585 + }, + { + "epoch": 0.21552750538867493, + "grad_norm": 0.29926973581314087, + "learning_rate": 0.00015692691612836143, + "loss": 1.2994, + "step": 16586 + }, + { + "epoch": 0.2155404999325908, + "grad_norm": 0.31536000967025757, + "learning_rate": 0.00015692431666645005, + "loss": 1.4826, + "step": 16587 + }, + { + "epoch": 0.21555349447650668, + "grad_norm": 0.39215388894081116, + "learning_rate": 0.00015692171720453868, + "loss": 1.5273, + "step": 16588 + }, + { + "epoch": 0.21556648902042255, + "grad_norm": 0.3824387192726135, + "learning_rate": 0.00015691911774262727, + "loss": 1.2518, + "step": 16589 + }, + { + "epoch": 0.21557948356433843, + "grad_norm": 0.4292064309120178, + "learning_rate": 0.0001569165182807159, + "loss": 1.2965, + "step": 16590 + }, + { + "epoch": 0.2155924781082543, + "grad_norm": 0.4297655522823334, + "learning_rate": 0.0001569139188188045, + "loss": 1.3606, + "step": 16591 + }, + { + "epoch": 0.21560547265217017, + "grad_norm": 0.44488075375556946, + "learning_rate": 0.00015691131935689315, + "loss": 1.4546, + "step": 16592 + }, + { + "epoch": 0.21561846719608604, + "grad_norm": 0.351406455039978, + "learning_rate": 0.00015690871989498174, + "loss": 1.4916, + "step": 16593 + }, + { + "epoch": 0.21563146174000192, + "grad_norm": 0.34770748019218445, + "learning_rate": 0.00015690612043307037, + "loss": 1.291, + "step": 16594 + }, + { + "epoch": 0.2156444562839178, + "grad_norm": 0.47346386313438416, + "learning_rate": 0.00015690352097115897, + "loss": 1.3954, + "step": 16595 + }, + { + "epoch": 0.21565745082783366, + "grad_norm": 0.48725366592407227, + "learning_rate": 0.0001569009215092476, + "loss": 1.3502, + "step": 16596 + }, + { + "epoch": 0.21567044537174954, + "grad_norm": 0.29200097918510437, + "learning_rate": 0.00015689832204733621, + "loss": 1.2541, + "step": 16597 + }, + { + "epoch": 0.2156834399156654, + "grad_norm": 0.437290757894516, + "learning_rate": 0.0001568957225854248, + "loss": 1.3627, + "step": 16598 + }, + { + "epoch": 0.21569643445958128, + "grad_norm": 0.42533838748931885, + "learning_rate": 0.00015689312312351344, + "loss": 1.5972, + "step": 16599 + }, + { + "epoch": 0.21570942900349716, + "grad_norm": 0.36109253764152527, + "learning_rate": 0.00015689052366160206, + "loss": 1.4141, + "step": 16600 + }, + { + "epoch": 0.21572242354741303, + "grad_norm": 0.37121185660362244, + "learning_rate": 0.00015688792419969069, + "loss": 1.523, + "step": 16601 + }, + { + "epoch": 0.2157354180913289, + "grad_norm": 0.40480029582977295, + "learning_rate": 0.00015688532473777928, + "loss": 1.4019, + "step": 16602 + }, + { + "epoch": 0.21574841263524477, + "grad_norm": 0.5045374631881714, + "learning_rate": 0.00015688272527586788, + "loss": 1.4244, + "step": 16603 + }, + { + "epoch": 0.21576140717916065, + "grad_norm": 0.3900858163833618, + "learning_rate": 0.00015688012581395653, + "loss": 1.5065, + "step": 16604 + }, + { + "epoch": 0.21577440172307652, + "grad_norm": 0.3240197002887726, + "learning_rate": 0.00015687752635204513, + "loss": 1.2758, + "step": 16605 + }, + { + "epoch": 0.2157873962669924, + "grad_norm": 0.38154393434524536, + "learning_rate": 0.00015687492689013375, + "loss": 1.4389, + "step": 16606 + }, + { + "epoch": 0.21580039081090827, + "grad_norm": 0.33986303210258484, + "learning_rate": 0.00015687232742822238, + "loss": 1.4756, + "step": 16607 + }, + { + "epoch": 0.21581338535482414, + "grad_norm": 0.8153408765792847, + "learning_rate": 0.00015686972796631098, + "loss": 1.5496, + "step": 16608 + }, + { + "epoch": 0.21582637989874, + "grad_norm": 0.39027640223503113, + "learning_rate": 0.0001568671285043996, + "loss": 1.7707, + "step": 16609 + }, + { + "epoch": 0.21583937444265588, + "grad_norm": 0.3908083736896515, + "learning_rate": 0.0001568645290424882, + "loss": 1.5284, + "step": 16610 + }, + { + "epoch": 0.21585236898657176, + "grad_norm": 0.3799417316913605, + "learning_rate": 0.00015686192958057685, + "loss": 1.582, + "step": 16611 + }, + { + "epoch": 0.21586536353048763, + "grad_norm": 0.3183356523513794, + "learning_rate": 0.00015685933011866545, + "loss": 1.6138, + "step": 16612 + }, + { + "epoch": 0.2158783580744035, + "grad_norm": 0.4347495138645172, + "learning_rate": 0.00015685673065675407, + "loss": 1.5235, + "step": 16613 + }, + { + "epoch": 0.21589135261831938, + "grad_norm": 0.308858722448349, + "learning_rate": 0.00015685413119484267, + "loss": 1.3521, + "step": 16614 + }, + { + "epoch": 0.21590434716223525, + "grad_norm": 0.5220159292221069, + "learning_rate": 0.0001568515317329313, + "loss": 1.4335, + "step": 16615 + }, + { + "epoch": 0.21591734170615112, + "grad_norm": 0.39267468452453613, + "learning_rate": 0.00015684893227101992, + "loss": 1.2497, + "step": 16616 + }, + { + "epoch": 0.215930336250067, + "grad_norm": 0.4098016321659088, + "learning_rate": 0.00015684633280910851, + "loss": 1.5089, + "step": 16617 + }, + { + "epoch": 0.21594333079398287, + "grad_norm": 0.40980812907218933, + "learning_rate": 0.00015684373334719714, + "loss": 1.3314, + "step": 16618 + }, + { + "epoch": 0.21595632533789874, + "grad_norm": 0.39346665143966675, + "learning_rate": 0.00015684113388528576, + "loss": 1.4366, + "step": 16619 + }, + { + "epoch": 0.21596931988181461, + "grad_norm": 0.3957061469554901, + "learning_rate": 0.00015683853442337436, + "loss": 1.411, + "step": 16620 + }, + { + "epoch": 0.2159823144257305, + "grad_norm": 0.44264093041419983, + "learning_rate": 0.00015683593496146299, + "loss": 1.261, + "step": 16621 + }, + { + "epoch": 0.21599530896964636, + "grad_norm": 0.3062681257724762, + "learning_rate": 0.00015683333549955158, + "loss": 1.2393, + "step": 16622 + }, + { + "epoch": 0.21600830351356223, + "grad_norm": 0.44686415791511536, + "learning_rate": 0.00015683073603764023, + "loss": 1.4037, + "step": 16623 + }, + { + "epoch": 0.2160212980574781, + "grad_norm": 0.40485095977783203, + "learning_rate": 0.00015682813657572883, + "loss": 1.3506, + "step": 16624 + }, + { + "epoch": 0.21603429260139398, + "grad_norm": 0.4370565414428711, + "learning_rate": 0.00015682553711381746, + "loss": 1.5838, + "step": 16625 + }, + { + "epoch": 0.21604728714530985, + "grad_norm": 0.4163168668746948, + "learning_rate": 0.00015682293765190605, + "loss": 1.4353, + "step": 16626 + }, + { + "epoch": 0.21606028168922572, + "grad_norm": 0.37149685621261597, + "learning_rate": 0.00015682033818999468, + "loss": 1.454, + "step": 16627 + }, + { + "epoch": 0.2160732762331416, + "grad_norm": 0.4331968426704407, + "learning_rate": 0.0001568177387280833, + "loss": 1.442, + "step": 16628 + }, + { + "epoch": 0.21608627077705747, + "grad_norm": 0.3904690444469452, + "learning_rate": 0.0001568151392661719, + "loss": 1.4272, + "step": 16629 + }, + { + "epoch": 0.21609926532097334, + "grad_norm": 0.2630819082260132, + "learning_rate": 0.00015681253980426052, + "loss": 1.2998, + "step": 16630 + }, + { + "epoch": 0.21611225986488922, + "grad_norm": 0.41717612743377686, + "learning_rate": 0.00015680994034234915, + "loss": 1.4902, + "step": 16631 + }, + { + "epoch": 0.2161252544088051, + "grad_norm": 0.3159341514110565, + "learning_rate": 0.00015680734088043775, + "loss": 1.4452, + "step": 16632 + }, + { + "epoch": 0.21613824895272096, + "grad_norm": 0.39778199791908264, + "learning_rate": 0.00015680474141852637, + "loss": 1.4816, + "step": 16633 + }, + { + "epoch": 0.21615124349663686, + "grad_norm": 0.4765211343765259, + "learning_rate": 0.00015680214195661497, + "loss": 1.4979, + "step": 16634 + }, + { + "epoch": 0.21616423804055274, + "grad_norm": 0.39823561906814575, + "learning_rate": 0.00015679954249470362, + "loss": 1.3544, + "step": 16635 + }, + { + "epoch": 0.2161772325844686, + "grad_norm": 0.4006238579750061, + "learning_rate": 0.00015679694303279222, + "loss": 1.3446, + "step": 16636 + }, + { + "epoch": 0.21619022712838448, + "grad_norm": 0.5970996618270874, + "learning_rate": 0.00015679434357088084, + "loss": 1.347, + "step": 16637 + }, + { + "epoch": 0.21620322167230036, + "grad_norm": 0.39650824666023254, + "learning_rate": 0.00015679174410896944, + "loss": 1.4437, + "step": 16638 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.3815503716468811, + "learning_rate": 0.00015678914464705806, + "loss": 1.3184, + "step": 16639 + }, + { + "epoch": 0.2162292107601321, + "grad_norm": 0.37390029430389404, + "learning_rate": 0.0001567865451851467, + "loss": 1.4119, + "step": 16640 + }, + { + "epoch": 0.21624220530404797, + "grad_norm": 0.42822691798210144, + "learning_rate": 0.00015678394572323529, + "loss": 1.5361, + "step": 16641 + }, + { + "epoch": 0.21625519984796385, + "grad_norm": 0.39275333285331726, + "learning_rate": 0.0001567813462613239, + "loss": 1.3879, + "step": 16642 + }, + { + "epoch": 0.21626819439187972, + "grad_norm": 0.5643912553787231, + "learning_rate": 0.00015677874679941253, + "loss": 1.5656, + "step": 16643 + }, + { + "epoch": 0.2162811889357956, + "grad_norm": 0.4708293378353119, + "learning_rate": 0.00015677614733750113, + "loss": 1.4699, + "step": 16644 + }, + { + "epoch": 0.21629418347971147, + "grad_norm": 0.3414262533187866, + "learning_rate": 0.00015677354787558976, + "loss": 1.4886, + "step": 16645 + }, + { + "epoch": 0.21630717802362734, + "grad_norm": 0.4258613586425781, + "learning_rate": 0.00015677094841367838, + "loss": 1.4512, + "step": 16646 + }, + { + "epoch": 0.2163201725675432, + "grad_norm": 0.3839971125125885, + "learning_rate": 0.000156768348951767, + "loss": 1.4354, + "step": 16647 + }, + { + "epoch": 0.21633316711145908, + "grad_norm": 0.46988213062286377, + "learning_rate": 0.0001567657494898556, + "loss": 1.4923, + "step": 16648 + }, + { + "epoch": 0.21634616165537496, + "grad_norm": 0.4564168155193329, + "learning_rate": 0.00015676315002794423, + "loss": 1.3735, + "step": 16649 + }, + { + "epoch": 0.21635915619929083, + "grad_norm": 0.4323045611381531, + "learning_rate": 0.00015676055056603285, + "loss": 1.4812, + "step": 16650 + }, + { + "epoch": 0.2163721507432067, + "grad_norm": 0.3471057116985321, + "learning_rate": 0.00015675795110412145, + "loss": 1.3863, + "step": 16651 + }, + { + "epoch": 0.21638514528712258, + "grad_norm": 0.39946672320365906, + "learning_rate": 0.00015675535164221007, + "loss": 1.2958, + "step": 16652 + }, + { + "epoch": 0.21639813983103845, + "grad_norm": 0.4242045283317566, + "learning_rate": 0.00015675275218029867, + "loss": 1.521, + "step": 16653 + }, + { + "epoch": 0.21641113437495432, + "grad_norm": 0.4094468653202057, + "learning_rate": 0.00015675015271838732, + "loss": 1.3779, + "step": 16654 + }, + { + "epoch": 0.2164241289188702, + "grad_norm": 0.34646421670913696, + "learning_rate": 0.00015674755325647592, + "loss": 1.5316, + "step": 16655 + }, + { + "epoch": 0.21643712346278607, + "grad_norm": 0.42703479528427124, + "learning_rate": 0.00015674495379456454, + "loss": 1.5985, + "step": 16656 + }, + { + "epoch": 0.21645011800670194, + "grad_norm": 0.4245736598968506, + "learning_rate": 0.00015674235433265314, + "loss": 1.3731, + "step": 16657 + }, + { + "epoch": 0.21646311255061781, + "grad_norm": 0.3305426836013794, + "learning_rate": 0.00015673975487074177, + "loss": 1.4723, + "step": 16658 + }, + { + "epoch": 0.2164761070945337, + "grad_norm": 0.5194588303565979, + "learning_rate": 0.0001567371554088304, + "loss": 1.5208, + "step": 16659 + }, + { + "epoch": 0.21648910163844956, + "grad_norm": 0.35528671741485596, + "learning_rate": 0.000156734555946919, + "loss": 1.3596, + "step": 16660 + }, + { + "epoch": 0.21650209618236543, + "grad_norm": 0.4904539883136749, + "learning_rate": 0.0001567319564850076, + "loss": 1.4714, + "step": 16661 + }, + { + "epoch": 0.2165150907262813, + "grad_norm": 0.31905707716941833, + "learning_rate": 0.00015672935702309624, + "loss": 1.522, + "step": 16662 + }, + { + "epoch": 0.21652808527019718, + "grad_norm": 0.42712539434432983, + "learning_rate": 0.00015672675756118483, + "loss": 1.4163, + "step": 16663 + }, + { + "epoch": 0.21654107981411305, + "grad_norm": 0.4735722839832306, + "learning_rate": 0.00015672415809927346, + "loss": 1.5672, + "step": 16664 + }, + { + "epoch": 0.21655407435802893, + "grad_norm": 0.35667502880096436, + "learning_rate": 0.00015672155863736206, + "loss": 1.2409, + "step": 16665 + }, + { + "epoch": 0.2165670689019448, + "grad_norm": 0.3629409968852997, + "learning_rate": 0.0001567189591754507, + "loss": 1.4795, + "step": 16666 + }, + { + "epoch": 0.21658006344586067, + "grad_norm": 0.3938175439834595, + "learning_rate": 0.0001567163597135393, + "loss": 1.5103, + "step": 16667 + }, + { + "epoch": 0.21659305798977654, + "grad_norm": 0.424789696931839, + "learning_rate": 0.00015671376025162793, + "loss": 1.3422, + "step": 16668 + }, + { + "epoch": 0.21660605253369242, + "grad_norm": 0.340334951877594, + "learning_rate": 0.00015671116078971653, + "loss": 1.4515, + "step": 16669 + }, + { + "epoch": 0.2166190470776083, + "grad_norm": 0.4597562849521637, + "learning_rate": 0.00015670856132780515, + "loss": 1.4839, + "step": 16670 + }, + { + "epoch": 0.21663204162152416, + "grad_norm": 0.4084775745868683, + "learning_rate": 0.00015670596186589378, + "loss": 1.3904, + "step": 16671 + }, + { + "epoch": 0.21664503616544004, + "grad_norm": 0.3597360849380493, + "learning_rate": 0.00015670336240398237, + "loss": 1.4012, + "step": 16672 + }, + { + "epoch": 0.2166580307093559, + "grad_norm": 0.44616204500198364, + "learning_rate": 0.000156700762942071, + "loss": 1.5331, + "step": 16673 + }, + { + "epoch": 0.21667102525327178, + "grad_norm": 0.3745647072792053, + "learning_rate": 0.00015669816348015962, + "loss": 1.438, + "step": 16674 + }, + { + "epoch": 0.21668401979718765, + "grad_norm": 0.3706674575805664, + "learning_rate": 0.00015669556401824822, + "loss": 1.3303, + "step": 16675 + }, + { + "epoch": 0.21669701434110353, + "grad_norm": 0.4070736765861511, + "learning_rate": 0.00015669296455633684, + "loss": 1.3363, + "step": 16676 + }, + { + "epoch": 0.2167100088850194, + "grad_norm": 0.3106875419616699, + "learning_rate": 0.00015669036509442544, + "loss": 1.3661, + "step": 16677 + }, + { + "epoch": 0.21672300342893527, + "grad_norm": 0.3898864686489105, + "learning_rate": 0.0001566877656325141, + "loss": 1.4775, + "step": 16678 + }, + { + "epoch": 0.21673599797285115, + "grad_norm": 0.37862181663513184, + "learning_rate": 0.0001566851661706027, + "loss": 1.2082, + "step": 16679 + }, + { + "epoch": 0.21674899251676702, + "grad_norm": 0.39564165472984314, + "learning_rate": 0.00015668256670869131, + "loss": 1.4668, + "step": 16680 + }, + { + "epoch": 0.2167619870606829, + "grad_norm": 0.36793726682662964, + "learning_rate": 0.00015667996724677994, + "loss": 1.5403, + "step": 16681 + }, + { + "epoch": 0.21677498160459877, + "grad_norm": 0.3965376317501068, + "learning_rate": 0.00015667736778486854, + "loss": 1.2782, + "step": 16682 + }, + { + "epoch": 0.21678797614851464, + "grad_norm": 0.366913765668869, + "learning_rate": 0.00015667476832295716, + "loss": 1.5441, + "step": 16683 + }, + { + "epoch": 0.2168009706924305, + "grad_norm": 0.425129234790802, + "learning_rate": 0.00015667216886104576, + "loss": 1.3375, + "step": 16684 + }, + { + "epoch": 0.21681396523634638, + "grad_norm": 0.38893622159957886, + "learning_rate": 0.0001566695693991344, + "loss": 1.4341, + "step": 16685 + }, + { + "epoch": 0.21682695978026226, + "grad_norm": 0.4136925935745239, + "learning_rate": 0.000156666969937223, + "loss": 1.4315, + "step": 16686 + }, + { + "epoch": 0.21683995432417813, + "grad_norm": 0.47807273268699646, + "learning_rate": 0.0001566643704753116, + "loss": 1.5499, + "step": 16687 + }, + { + "epoch": 0.216852948868094, + "grad_norm": 0.2983015179634094, + "learning_rate": 0.00015666177101340023, + "loss": 1.6045, + "step": 16688 + }, + { + "epoch": 0.21686594341200988, + "grad_norm": 0.39483004808425903, + "learning_rate": 0.00015665917155148885, + "loss": 1.3651, + "step": 16689 + }, + { + "epoch": 0.21687893795592575, + "grad_norm": 0.3885948359966278, + "learning_rate": 0.00015665657208957748, + "loss": 1.4324, + "step": 16690 + }, + { + "epoch": 0.21689193249984162, + "grad_norm": 0.3961101472377777, + "learning_rate": 0.00015665397262766608, + "loss": 1.3949, + "step": 16691 + }, + { + "epoch": 0.2169049270437575, + "grad_norm": 0.34981784224510193, + "learning_rate": 0.0001566513731657547, + "loss": 1.4664, + "step": 16692 + }, + { + "epoch": 0.21691792158767337, + "grad_norm": 0.3440290689468384, + "learning_rate": 0.00015664877370384332, + "loss": 1.357, + "step": 16693 + }, + { + "epoch": 0.21693091613158924, + "grad_norm": 0.35698163509368896, + "learning_rate": 0.00015664617424193192, + "loss": 1.5693, + "step": 16694 + }, + { + "epoch": 0.2169439106755051, + "grad_norm": 0.3388995826244354, + "learning_rate": 0.00015664357478002055, + "loss": 1.5293, + "step": 16695 + }, + { + "epoch": 0.216956905219421, + "grad_norm": 0.3523191511631012, + "learning_rate": 0.00015664097531810914, + "loss": 1.457, + "step": 16696 + }, + { + "epoch": 0.21696989976333686, + "grad_norm": 0.5185662508010864, + "learning_rate": 0.0001566383758561978, + "loss": 1.3138, + "step": 16697 + }, + { + "epoch": 0.21698289430725273, + "grad_norm": 0.38048869371414185, + "learning_rate": 0.0001566357763942864, + "loss": 1.2495, + "step": 16698 + }, + { + "epoch": 0.2169958888511686, + "grad_norm": 0.4592948257923126, + "learning_rate": 0.000156633176932375, + "loss": 1.3806, + "step": 16699 + }, + { + "epoch": 0.21700888339508448, + "grad_norm": 0.46434304118156433, + "learning_rate": 0.00015663057747046361, + "loss": 1.5363, + "step": 16700 + }, + { + "epoch": 0.21702187793900035, + "grad_norm": 0.39502277970314026, + "learning_rate": 0.00015662797800855224, + "loss": 1.2855, + "step": 16701 + }, + { + "epoch": 0.21703487248291622, + "grad_norm": 0.29518890380859375, + "learning_rate": 0.00015662537854664086, + "loss": 1.2189, + "step": 16702 + }, + { + "epoch": 0.2170478670268321, + "grad_norm": 0.4711300730705261, + "learning_rate": 0.00015662277908472946, + "loss": 1.4416, + "step": 16703 + }, + { + "epoch": 0.21706086157074797, + "grad_norm": 0.41619038581848145, + "learning_rate": 0.00015662017962281809, + "loss": 1.4412, + "step": 16704 + }, + { + "epoch": 0.21707385611466384, + "grad_norm": 0.36433371901512146, + "learning_rate": 0.0001566175801609067, + "loss": 1.2662, + "step": 16705 + }, + { + "epoch": 0.21708685065857972, + "grad_norm": 0.45653778314590454, + "learning_rate": 0.0001566149806989953, + "loss": 1.4103, + "step": 16706 + }, + { + "epoch": 0.2170998452024956, + "grad_norm": 0.4312211573123932, + "learning_rate": 0.00015661238123708393, + "loss": 1.534, + "step": 16707 + }, + { + "epoch": 0.21711283974641146, + "grad_norm": 0.4086803197860718, + "learning_rate": 0.00015660978177517253, + "loss": 1.3577, + "step": 16708 + }, + { + "epoch": 0.21712583429032734, + "grad_norm": 0.39179056882858276, + "learning_rate": 0.00015660718231326118, + "loss": 1.485, + "step": 16709 + }, + { + "epoch": 0.21713882883424324, + "grad_norm": 0.2953057587146759, + "learning_rate": 0.00015660458285134978, + "loss": 1.3184, + "step": 16710 + }, + { + "epoch": 0.2171518233781591, + "grad_norm": 0.3751826584339142, + "learning_rate": 0.00015660198338943838, + "loss": 1.4064, + "step": 16711 + }, + { + "epoch": 0.21716481792207498, + "grad_norm": 0.38460439443588257, + "learning_rate": 0.000156599383927527, + "loss": 1.2642, + "step": 16712 + }, + { + "epoch": 0.21717781246599085, + "grad_norm": 0.39809390902519226, + "learning_rate": 0.00015659678446561562, + "loss": 1.469, + "step": 16713 + }, + { + "epoch": 0.21719080700990673, + "grad_norm": 0.47651973366737366, + "learning_rate": 0.00015659418500370425, + "loss": 1.5223, + "step": 16714 + }, + { + "epoch": 0.2172038015538226, + "grad_norm": 0.40227261185646057, + "learning_rate": 0.00015659158554179285, + "loss": 1.4262, + "step": 16715 + }, + { + "epoch": 0.21721679609773847, + "grad_norm": 0.4465072453022003, + "learning_rate": 0.00015658898607988147, + "loss": 1.4563, + "step": 16716 + }, + { + "epoch": 0.21722979064165435, + "grad_norm": 0.44313499331474304, + "learning_rate": 0.0001565863866179701, + "loss": 1.5551, + "step": 16717 + }, + { + "epoch": 0.21724278518557022, + "grad_norm": 0.34312674403190613, + "learning_rate": 0.0001565837871560587, + "loss": 1.336, + "step": 16718 + }, + { + "epoch": 0.2172557797294861, + "grad_norm": 0.33247441053390503, + "learning_rate": 0.00015658118769414732, + "loss": 1.4549, + "step": 16719 + }, + { + "epoch": 0.21726877427340197, + "grad_norm": 0.4092087745666504, + "learning_rate": 0.00015657858823223594, + "loss": 1.5188, + "step": 16720 + }, + { + "epoch": 0.21728176881731784, + "grad_norm": 0.43918511271476746, + "learning_rate": 0.00015657598877032457, + "loss": 1.593, + "step": 16721 + }, + { + "epoch": 0.2172947633612337, + "grad_norm": 0.47822877764701843, + "learning_rate": 0.00015657338930841316, + "loss": 1.4232, + "step": 16722 + }, + { + "epoch": 0.21730775790514958, + "grad_norm": 0.3324108421802521, + "learning_rate": 0.0001565707898465018, + "loss": 1.592, + "step": 16723 + }, + { + "epoch": 0.21732075244906546, + "grad_norm": 0.37148159742355347, + "learning_rate": 0.0001565681903845904, + "loss": 1.7572, + "step": 16724 + }, + { + "epoch": 0.21733374699298133, + "grad_norm": 0.36410802602767944, + "learning_rate": 0.000156565590922679, + "loss": 1.2273, + "step": 16725 + }, + { + "epoch": 0.2173467415368972, + "grad_norm": 0.42501553893089294, + "learning_rate": 0.00015656299146076763, + "loss": 1.443, + "step": 16726 + }, + { + "epoch": 0.21735973608081308, + "grad_norm": 0.3321506977081299, + "learning_rate": 0.00015656039199885623, + "loss": 1.2448, + "step": 16727 + }, + { + "epoch": 0.21737273062472895, + "grad_norm": 0.4748922288417816, + "learning_rate": 0.00015655779253694486, + "loss": 1.5254, + "step": 16728 + }, + { + "epoch": 0.21738572516864482, + "grad_norm": 0.3702036440372467, + "learning_rate": 0.00015655519307503348, + "loss": 1.1718, + "step": 16729 + }, + { + "epoch": 0.2173987197125607, + "grad_norm": 0.34009918570518494, + "learning_rate": 0.00015655259361312208, + "loss": 1.5975, + "step": 16730 + }, + { + "epoch": 0.21741171425647657, + "grad_norm": 0.46154966950416565, + "learning_rate": 0.0001565499941512107, + "loss": 1.4618, + "step": 16731 + }, + { + "epoch": 0.21742470880039244, + "grad_norm": 0.3811264932155609, + "learning_rate": 0.00015654739468929933, + "loss": 1.3974, + "step": 16732 + }, + { + "epoch": 0.21743770334430831, + "grad_norm": 0.2730337381362915, + "learning_rate": 0.00015654479522738795, + "loss": 1.3609, + "step": 16733 + }, + { + "epoch": 0.2174506978882242, + "grad_norm": 0.3844816982746124, + "learning_rate": 0.00015654219576547655, + "loss": 1.3632, + "step": 16734 + }, + { + "epoch": 0.21746369243214006, + "grad_norm": 0.37815961241722107, + "learning_rate": 0.00015653959630356517, + "loss": 1.4671, + "step": 16735 + }, + { + "epoch": 0.21747668697605593, + "grad_norm": 0.34678205847740173, + "learning_rate": 0.0001565369968416538, + "loss": 1.3536, + "step": 16736 + }, + { + "epoch": 0.2174896815199718, + "grad_norm": 0.40051963925361633, + "learning_rate": 0.0001565343973797424, + "loss": 1.5007, + "step": 16737 + }, + { + "epoch": 0.21750267606388768, + "grad_norm": 0.3613760769367218, + "learning_rate": 0.00015653179791783102, + "loss": 1.4845, + "step": 16738 + }, + { + "epoch": 0.21751567060780355, + "grad_norm": 0.43381285667419434, + "learning_rate": 0.00015652919845591962, + "loss": 1.3215, + "step": 16739 + }, + { + "epoch": 0.21752866515171942, + "grad_norm": 0.34630247950553894, + "learning_rate": 0.00015652659899400827, + "loss": 1.5543, + "step": 16740 + }, + { + "epoch": 0.2175416596956353, + "grad_norm": 0.42135071754455566, + "learning_rate": 0.00015652399953209687, + "loss": 1.5052, + "step": 16741 + }, + { + "epoch": 0.21755465423955117, + "grad_norm": 0.33038827776908875, + "learning_rate": 0.00015652140007018546, + "loss": 1.1484, + "step": 16742 + }, + { + "epoch": 0.21756764878346704, + "grad_norm": 0.4731179475784302, + "learning_rate": 0.0001565188006082741, + "loss": 1.5578, + "step": 16743 + }, + { + "epoch": 0.21758064332738292, + "grad_norm": 0.3731505870819092, + "learning_rate": 0.0001565162011463627, + "loss": 1.5184, + "step": 16744 + }, + { + "epoch": 0.2175936378712988, + "grad_norm": 0.449459046125412, + "learning_rate": 0.00015651360168445134, + "loss": 1.4476, + "step": 16745 + }, + { + "epoch": 0.21760663241521466, + "grad_norm": 0.37653404474258423, + "learning_rate": 0.00015651100222253993, + "loss": 1.308, + "step": 16746 + }, + { + "epoch": 0.21761962695913054, + "grad_norm": 0.40483370423316956, + "learning_rate": 0.00015650840276062856, + "loss": 1.515, + "step": 16747 + }, + { + "epoch": 0.2176326215030464, + "grad_norm": 0.4058394432067871, + "learning_rate": 0.00015650580329871718, + "loss": 1.4304, + "step": 16748 + }, + { + "epoch": 0.21764561604696228, + "grad_norm": 0.3681657612323761, + "learning_rate": 0.00015650320383680578, + "loss": 1.3604, + "step": 16749 + }, + { + "epoch": 0.21765861059087815, + "grad_norm": 0.4467221200466156, + "learning_rate": 0.0001565006043748944, + "loss": 1.6015, + "step": 16750 + }, + { + "epoch": 0.21767160513479403, + "grad_norm": 0.4380754232406616, + "learning_rate": 0.000156498004912983, + "loss": 1.371, + "step": 16751 + }, + { + "epoch": 0.2176845996787099, + "grad_norm": 0.3701735734939575, + "learning_rate": 0.00015649540545107165, + "loss": 1.3654, + "step": 16752 + }, + { + "epoch": 0.21769759422262577, + "grad_norm": 0.37976616621017456, + "learning_rate": 0.00015649280598916025, + "loss": 1.6451, + "step": 16753 + }, + { + "epoch": 0.21771058876654165, + "grad_norm": 0.38921859860420227, + "learning_rate": 0.00015649020652724885, + "loss": 1.2862, + "step": 16754 + }, + { + "epoch": 0.21772358331045752, + "grad_norm": 0.3717900514602661, + "learning_rate": 0.0001564876070653375, + "loss": 1.4568, + "step": 16755 + }, + { + "epoch": 0.2177365778543734, + "grad_norm": 0.35280272364616394, + "learning_rate": 0.0001564850076034261, + "loss": 1.3572, + "step": 16756 + }, + { + "epoch": 0.21774957239828927, + "grad_norm": 0.5717735886573792, + "learning_rate": 0.00015648240814151472, + "loss": 1.2634, + "step": 16757 + }, + { + "epoch": 0.21776256694220514, + "grad_norm": 0.4082580506801605, + "learning_rate": 0.00015647980867960332, + "loss": 1.3163, + "step": 16758 + }, + { + "epoch": 0.217775561486121, + "grad_norm": 0.5412265658378601, + "learning_rate": 0.00015647720921769194, + "loss": 1.4772, + "step": 16759 + }, + { + "epoch": 0.21778855603003688, + "grad_norm": 0.41908109188079834, + "learning_rate": 0.00015647460975578057, + "loss": 1.2015, + "step": 16760 + }, + { + "epoch": 0.21780155057395276, + "grad_norm": 0.6466304063796997, + "learning_rate": 0.00015647201029386917, + "loss": 1.4759, + "step": 16761 + }, + { + "epoch": 0.21781454511786863, + "grad_norm": 0.2947784662246704, + "learning_rate": 0.0001564694108319578, + "loss": 1.4606, + "step": 16762 + }, + { + "epoch": 0.2178275396617845, + "grad_norm": 0.38844993710517883, + "learning_rate": 0.00015646681137004642, + "loss": 1.3304, + "step": 16763 + }, + { + "epoch": 0.21784053420570038, + "grad_norm": 0.31608712673187256, + "learning_rate": 0.00015646421190813504, + "loss": 1.352, + "step": 16764 + }, + { + "epoch": 0.21785352874961625, + "grad_norm": 0.33786827325820923, + "learning_rate": 0.00015646161244622364, + "loss": 1.5429, + "step": 16765 + }, + { + "epoch": 0.21786652329353212, + "grad_norm": 0.41640591621398926, + "learning_rate": 0.00015645901298431223, + "loss": 1.4957, + "step": 16766 + }, + { + "epoch": 0.217879517837448, + "grad_norm": 0.25733786821365356, + "learning_rate": 0.00015645641352240089, + "loss": 1.2774, + "step": 16767 + }, + { + "epoch": 0.21789251238136387, + "grad_norm": 0.384190171957016, + "learning_rate": 0.00015645381406048948, + "loss": 1.2511, + "step": 16768 + }, + { + "epoch": 0.21790550692527974, + "grad_norm": 0.4897196590900421, + "learning_rate": 0.0001564512145985781, + "loss": 1.5158, + "step": 16769 + }, + { + "epoch": 0.2179185014691956, + "grad_norm": 0.38684624433517456, + "learning_rate": 0.0001564486151366667, + "loss": 1.3112, + "step": 16770 + }, + { + "epoch": 0.2179314960131115, + "grad_norm": 0.3542638123035431, + "learning_rate": 0.00015644601567475533, + "loss": 1.4792, + "step": 16771 + }, + { + "epoch": 0.21794449055702736, + "grad_norm": 0.366250604391098, + "learning_rate": 0.00015644341621284395, + "loss": 1.3279, + "step": 16772 + }, + { + "epoch": 0.21795748510094323, + "grad_norm": 0.4124433398246765, + "learning_rate": 0.00015644081675093255, + "loss": 1.5081, + "step": 16773 + }, + { + "epoch": 0.2179704796448591, + "grad_norm": 0.3338433802127838, + "learning_rate": 0.00015643821728902118, + "loss": 1.3079, + "step": 16774 + }, + { + "epoch": 0.21798347418877498, + "grad_norm": 0.4465111196041107, + "learning_rate": 0.0001564356178271098, + "loss": 1.5815, + "step": 16775 + }, + { + "epoch": 0.21799646873269085, + "grad_norm": 0.48583635687828064, + "learning_rate": 0.00015643301836519843, + "loss": 1.4563, + "step": 16776 + }, + { + "epoch": 0.21800946327660672, + "grad_norm": 0.3784427046775818, + "learning_rate": 0.00015643041890328702, + "loss": 1.4843, + "step": 16777 + }, + { + "epoch": 0.2180224578205226, + "grad_norm": 0.4067921042442322, + "learning_rate": 0.00015642781944137565, + "loss": 1.3139, + "step": 16778 + }, + { + "epoch": 0.21803545236443847, + "grad_norm": 0.4644142687320709, + "learning_rate": 0.00015642521997946427, + "loss": 1.6929, + "step": 16779 + }, + { + "epoch": 0.21804844690835434, + "grad_norm": 0.3555678427219391, + "learning_rate": 0.00015642262051755287, + "loss": 1.2989, + "step": 16780 + }, + { + "epoch": 0.21806144145227022, + "grad_norm": 0.355695903301239, + "learning_rate": 0.0001564200210556415, + "loss": 1.3526, + "step": 16781 + }, + { + "epoch": 0.2180744359961861, + "grad_norm": 0.35750722885131836, + "learning_rate": 0.0001564174215937301, + "loss": 1.4054, + "step": 16782 + }, + { + "epoch": 0.21808743054010196, + "grad_norm": 0.33844780921936035, + "learning_rate": 0.00015641482213181872, + "loss": 1.3747, + "step": 16783 + }, + { + "epoch": 0.21810042508401783, + "grad_norm": 0.5258835554122925, + "learning_rate": 0.00015641222266990734, + "loss": 1.4915, + "step": 16784 + }, + { + "epoch": 0.2181134196279337, + "grad_norm": 0.43022042512893677, + "learning_rate": 0.00015640962320799594, + "loss": 1.5072, + "step": 16785 + }, + { + "epoch": 0.21812641417184958, + "grad_norm": 0.41696757078170776, + "learning_rate": 0.00015640702374608456, + "loss": 1.4393, + "step": 16786 + }, + { + "epoch": 0.21813940871576548, + "grad_norm": 0.37719014286994934, + "learning_rate": 0.00015640442428417319, + "loss": 1.6216, + "step": 16787 + }, + { + "epoch": 0.21815240325968135, + "grad_norm": 0.24868805706501007, + "learning_rate": 0.0001564018248222618, + "loss": 1.2143, + "step": 16788 + }, + { + "epoch": 0.21816539780359723, + "grad_norm": 0.3956727683544159, + "learning_rate": 0.0001563992253603504, + "loss": 1.5182, + "step": 16789 + }, + { + "epoch": 0.2181783923475131, + "grad_norm": 0.4267377555370331, + "learning_rate": 0.00015639662589843903, + "loss": 1.3347, + "step": 16790 + }, + { + "epoch": 0.21819138689142897, + "grad_norm": 0.4038475453853607, + "learning_rate": 0.00015639402643652766, + "loss": 1.3509, + "step": 16791 + }, + { + "epoch": 0.21820438143534485, + "grad_norm": 0.3086733818054199, + "learning_rate": 0.00015639142697461625, + "loss": 1.3589, + "step": 16792 + }, + { + "epoch": 0.21821737597926072, + "grad_norm": 0.4115789532661438, + "learning_rate": 0.00015638882751270488, + "loss": 1.3748, + "step": 16793 + }, + { + "epoch": 0.2182303705231766, + "grad_norm": 0.3457920253276825, + "learning_rate": 0.0001563862280507935, + "loss": 1.2035, + "step": 16794 + }, + { + "epoch": 0.21824336506709247, + "grad_norm": 0.3764108717441559, + "learning_rate": 0.0001563836285888821, + "loss": 1.3565, + "step": 16795 + }, + { + "epoch": 0.21825635961100834, + "grad_norm": 0.4859064221382141, + "learning_rate": 0.00015638102912697073, + "loss": 1.4211, + "step": 16796 + }, + { + "epoch": 0.2182693541549242, + "grad_norm": 0.33105313777923584, + "learning_rate": 0.00015637842966505932, + "loss": 1.3378, + "step": 16797 + }, + { + "epoch": 0.21828234869884008, + "grad_norm": 0.37387049198150635, + "learning_rate": 0.00015637583020314797, + "loss": 1.3626, + "step": 16798 + }, + { + "epoch": 0.21829534324275596, + "grad_norm": 0.38293424248695374, + "learning_rate": 0.00015637323074123657, + "loss": 1.4374, + "step": 16799 + }, + { + "epoch": 0.21830833778667183, + "grad_norm": 0.47173646092414856, + "learning_rate": 0.0001563706312793252, + "loss": 1.3875, + "step": 16800 + }, + { + "epoch": 0.2183213323305877, + "grad_norm": 0.41163915395736694, + "learning_rate": 0.0001563680318174138, + "loss": 1.4015, + "step": 16801 + }, + { + "epoch": 0.21833432687450358, + "grad_norm": 0.4057028293609619, + "learning_rate": 0.00015636543235550242, + "loss": 1.5235, + "step": 16802 + }, + { + "epoch": 0.21834732141841945, + "grad_norm": 0.3744058609008789, + "learning_rate": 0.00015636283289359104, + "loss": 1.343, + "step": 16803 + }, + { + "epoch": 0.21836031596233532, + "grad_norm": 0.3776036500930786, + "learning_rate": 0.00015636023343167964, + "loss": 1.4716, + "step": 16804 + }, + { + "epoch": 0.2183733105062512, + "grad_norm": 0.3524835407733917, + "learning_rate": 0.00015635763396976826, + "loss": 1.4402, + "step": 16805 + }, + { + "epoch": 0.21838630505016707, + "grad_norm": 0.3152594566345215, + "learning_rate": 0.0001563550345078569, + "loss": 1.4213, + "step": 16806 + }, + { + "epoch": 0.21839929959408294, + "grad_norm": 0.3834557831287384, + "learning_rate": 0.0001563524350459455, + "loss": 1.4371, + "step": 16807 + }, + { + "epoch": 0.2184122941379988, + "grad_norm": 0.38085585832595825, + "learning_rate": 0.0001563498355840341, + "loss": 1.1847, + "step": 16808 + }, + { + "epoch": 0.2184252886819147, + "grad_norm": 0.32922738790512085, + "learning_rate": 0.0001563472361221227, + "loss": 1.4526, + "step": 16809 + }, + { + "epoch": 0.21843828322583056, + "grad_norm": 0.4722871780395508, + "learning_rate": 0.00015634463666021136, + "loss": 1.4853, + "step": 16810 + }, + { + "epoch": 0.21845127776974643, + "grad_norm": 0.43322816491127014, + "learning_rate": 0.00015634203719829996, + "loss": 1.3518, + "step": 16811 + }, + { + "epoch": 0.2184642723136623, + "grad_norm": 0.4004061818122864, + "learning_rate": 0.00015633943773638858, + "loss": 1.3632, + "step": 16812 + }, + { + "epoch": 0.21847726685757818, + "grad_norm": 0.425752192735672, + "learning_rate": 0.00015633683827447718, + "loss": 1.4789, + "step": 16813 + }, + { + "epoch": 0.21849026140149405, + "grad_norm": 0.4251552224159241, + "learning_rate": 0.0001563342388125658, + "loss": 1.4618, + "step": 16814 + }, + { + "epoch": 0.21850325594540992, + "grad_norm": 0.4280528128147125, + "learning_rate": 0.00015633163935065443, + "loss": 1.2739, + "step": 16815 + }, + { + "epoch": 0.2185162504893258, + "grad_norm": 0.33704516291618347, + "learning_rate": 0.00015632903988874303, + "loss": 1.3042, + "step": 16816 + }, + { + "epoch": 0.21852924503324167, + "grad_norm": 0.40909144282341003, + "learning_rate": 0.00015632644042683165, + "loss": 1.6533, + "step": 16817 + }, + { + "epoch": 0.21854223957715754, + "grad_norm": 0.3977036774158478, + "learning_rate": 0.00015632384096492027, + "loss": 1.5252, + "step": 16818 + }, + { + "epoch": 0.21855523412107342, + "grad_norm": 0.3146308660507202, + "learning_rate": 0.0001563212415030089, + "loss": 1.1663, + "step": 16819 + }, + { + "epoch": 0.2185682286649893, + "grad_norm": 0.4449825882911682, + "learning_rate": 0.0001563186420410975, + "loss": 1.4451, + "step": 16820 + }, + { + "epoch": 0.21858122320890516, + "grad_norm": 0.5218036770820618, + "learning_rate": 0.0001563160425791861, + "loss": 1.6039, + "step": 16821 + }, + { + "epoch": 0.21859421775282104, + "grad_norm": 0.3879724144935608, + "learning_rate": 0.00015631344311727474, + "loss": 1.5384, + "step": 16822 + }, + { + "epoch": 0.2186072122967369, + "grad_norm": 0.2828150689601898, + "learning_rate": 0.00015631084365536334, + "loss": 1.5118, + "step": 16823 + }, + { + "epoch": 0.21862020684065278, + "grad_norm": 0.39576396346092224, + "learning_rate": 0.00015630824419345197, + "loss": 1.3704, + "step": 16824 + }, + { + "epoch": 0.21863320138456865, + "grad_norm": 0.39629244804382324, + "learning_rate": 0.00015630564473154056, + "loss": 1.4324, + "step": 16825 + }, + { + "epoch": 0.21864619592848453, + "grad_norm": 0.4551648795604706, + "learning_rate": 0.0001563030452696292, + "loss": 1.4113, + "step": 16826 + }, + { + "epoch": 0.2186591904724004, + "grad_norm": 0.37779414653778076, + "learning_rate": 0.0001563004458077178, + "loss": 1.3153, + "step": 16827 + }, + { + "epoch": 0.21867218501631627, + "grad_norm": 0.40455910563468933, + "learning_rate": 0.0001562978463458064, + "loss": 1.4747, + "step": 16828 + }, + { + "epoch": 0.21868517956023215, + "grad_norm": 0.29166582226753235, + "learning_rate": 0.00015629524688389503, + "loss": 1.5594, + "step": 16829 + }, + { + "epoch": 0.21869817410414802, + "grad_norm": 0.3363259434700012, + "learning_rate": 0.00015629264742198366, + "loss": 1.2258, + "step": 16830 + }, + { + "epoch": 0.2187111686480639, + "grad_norm": 0.4475618302822113, + "learning_rate": 0.00015629004796007228, + "loss": 1.4639, + "step": 16831 + }, + { + "epoch": 0.21872416319197976, + "grad_norm": 0.40197041630744934, + "learning_rate": 0.00015628744849816088, + "loss": 1.3159, + "step": 16832 + }, + { + "epoch": 0.21873715773589564, + "grad_norm": 0.43685510754585266, + "learning_rate": 0.0001562848490362495, + "loss": 1.6726, + "step": 16833 + }, + { + "epoch": 0.2187501522798115, + "grad_norm": 0.3603578209877014, + "learning_rate": 0.00015628224957433813, + "loss": 1.5215, + "step": 16834 + }, + { + "epoch": 0.21876314682372738, + "grad_norm": 0.36410731077194214, + "learning_rate": 0.00015627965011242673, + "loss": 1.2982, + "step": 16835 + }, + { + "epoch": 0.21877614136764326, + "grad_norm": 0.4339752197265625, + "learning_rate": 0.00015627705065051535, + "loss": 1.4808, + "step": 16836 + }, + { + "epoch": 0.21878913591155913, + "grad_norm": 0.44504326581954956, + "learning_rate": 0.00015627445118860398, + "loss": 1.6463, + "step": 16837 + }, + { + "epoch": 0.218802130455475, + "grad_norm": 0.44338786602020264, + "learning_rate": 0.00015627185172669257, + "loss": 1.399, + "step": 16838 + }, + { + "epoch": 0.21881512499939088, + "grad_norm": 0.35459303855895996, + "learning_rate": 0.0001562692522647812, + "loss": 1.6303, + "step": 16839 + }, + { + "epoch": 0.21882811954330675, + "grad_norm": 0.42481687664985657, + "learning_rate": 0.0001562666528028698, + "loss": 1.391, + "step": 16840 + }, + { + "epoch": 0.21884111408722262, + "grad_norm": 0.339913934469223, + "learning_rate": 0.00015626405334095845, + "loss": 1.1546, + "step": 16841 + }, + { + "epoch": 0.2188541086311385, + "grad_norm": 0.4006161689758301, + "learning_rate": 0.00015626145387904704, + "loss": 1.4251, + "step": 16842 + }, + { + "epoch": 0.21886710317505437, + "grad_norm": 0.41801369190216064, + "learning_rate": 0.00015625885441713567, + "loss": 1.5567, + "step": 16843 + }, + { + "epoch": 0.21888009771897024, + "grad_norm": 0.37432003021240234, + "learning_rate": 0.00015625625495522427, + "loss": 1.4939, + "step": 16844 + }, + { + "epoch": 0.2188930922628861, + "grad_norm": 0.42471933364868164, + "learning_rate": 0.0001562536554933129, + "loss": 1.4387, + "step": 16845 + }, + { + "epoch": 0.21890608680680199, + "grad_norm": 0.4127656817436218, + "learning_rate": 0.00015625105603140152, + "loss": 1.4283, + "step": 16846 + }, + { + "epoch": 0.21891908135071786, + "grad_norm": 0.4068490266799927, + "learning_rate": 0.0001562484565694901, + "loss": 1.4211, + "step": 16847 + }, + { + "epoch": 0.21893207589463373, + "grad_norm": 0.4155423045158386, + "learning_rate": 0.00015624585710757874, + "loss": 1.4509, + "step": 16848 + }, + { + "epoch": 0.2189450704385496, + "grad_norm": 0.39099904894828796, + "learning_rate": 0.00015624325764566736, + "loss": 1.5133, + "step": 16849 + }, + { + "epoch": 0.21895806498246548, + "grad_norm": 0.43010252714157104, + "learning_rate": 0.00015624065818375596, + "loss": 1.481, + "step": 16850 + }, + { + "epoch": 0.21897105952638135, + "grad_norm": 0.36012136936187744, + "learning_rate": 0.00015623805872184458, + "loss": 1.5329, + "step": 16851 + }, + { + "epoch": 0.21898405407029722, + "grad_norm": 0.28695186972618103, + "learning_rate": 0.00015623545925993318, + "loss": 1.3573, + "step": 16852 + }, + { + "epoch": 0.2189970486142131, + "grad_norm": 0.32815098762512207, + "learning_rate": 0.00015623285979802183, + "loss": 1.5567, + "step": 16853 + }, + { + "epoch": 0.21901004315812897, + "grad_norm": 0.44502657651901245, + "learning_rate": 0.00015623026033611043, + "loss": 1.6575, + "step": 16854 + }, + { + "epoch": 0.21902303770204484, + "grad_norm": 0.3865372836589813, + "learning_rate": 0.00015622766087419905, + "loss": 1.3475, + "step": 16855 + }, + { + "epoch": 0.21903603224596072, + "grad_norm": 0.31753936409950256, + "learning_rate": 0.00015622506141228765, + "loss": 1.3279, + "step": 16856 + }, + { + "epoch": 0.2190490267898766, + "grad_norm": 0.7306997776031494, + "learning_rate": 0.00015622246195037628, + "loss": 1.4626, + "step": 16857 + }, + { + "epoch": 0.21906202133379246, + "grad_norm": 0.4163583517074585, + "learning_rate": 0.0001562198624884649, + "loss": 1.3209, + "step": 16858 + }, + { + "epoch": 0.21907501587770833, + "grad_norm": 0.41101527214050293, + "learning_rate": 0.0001562172630265535, + "loss": 1.436, + "step": 16859 + }, + { + "epoch": 0.2190880104216242, + "grad_norm": 0.38744163513183594, + "learning_rate": 0.00015621466356464212, + "loss": 1.4152, + "step": 16860 + }, + { + "epoch": 0.21910100496554008, + "grad_norm": 0.47433170676231384, + "learning_rate": 0.00015621206410273075, + "loss": 1.486, + "step": 16861 + }, + { + "epoch": 0.21911399950945595, + "grad_norm": 0.39724916219711304, + "learning_rate": 0.00015620946464081937, + "loss": 1.4638, + "step": 16862 + }, + { + "epoch": 0.21912699405337185, + "grad_norm": 0.3638724386692047, + "learning_rate": 0.00015620686517890797, + "loss": 1.457, + "step": 16863 + }, + { + "epoch": 0.21913998859728773, + "grad_norm": 0.4112611413002014, + "learning_rate": 0.00015620426571699657, + "loss": 1.4775, + "step": 16864 + }, + { + "epoch": 0.2191529831412036, + "grad_norm": 0.4751654863357544, + "learning_rate": 0.00015620166625508522, + "loss": 1.4363, + "step": 16865 + }, + { + "epoch": 0.21916597768511947, + "grad_norm": 0.38216903805732727, + "learning_rate": 0.00015619906679317382, + "loss": 1.5605, + "step": 16866 + }, + { + "epoch": 0.21917897222903535, + "grad_norm": 0.3676697909832001, + "learning_rate": 0.00015619646733126244, + "loss": 1.3506, + "step": 16867 + }, + { + "epoch": 0.21919196677295122, + "grad_norm": 0.4396227300167084, + "learning_rate": 0.00015619386786935106, + "loss": 1.4454, + "step": 16868 + }, + { + "epoch": 0.2192049613168671, + "grad_norm": 0.384050577878952, + "learning_rate": 0.00015619126840743966, + "loss": 1.4436, + "step": 16869 + }, + { + "epoch": 0.21921795586078296, + "grad_norm": 0.2954740524291992, + "learning_rate": 0.00015618866894552829, + "loss": 1.3362, + "step": 16870 + }, + { + "epoch": 0.21923095040469884, + "grad_norm": 0.4695274233818054, + "learning_rate": 0.00015618606948361688, + "loss": 1.5226, + "step": 16871 + }, + { + "epoch": 0.2192439449486147, + "grad_norm": 0.3787025511264801, + "learning_rate": 0.00015618347002170554, + "loss": 1.3989, + "step": 16872 + }, + { + "epoch": 0.21925693949253058, + "grad_norm": 0.319148451089859, + "learning_rate": 0.00015618087055979413, + "loss": 1.6091, + "step": 16873 + }, + { + "epoch": 0.21926993403644646, + "grad_norm": 0.4071063697338104, + "learning_rate": 0.00015617827109788276, + "loss": 1.2532, + "step": 16874 + }, + { + "epoch": 0.21928292858036233, + "grad_norm": 0.3327900171279907, + "learning_rate": 0.00015617567163597135, + "loss": 1.2584, + "step": 16875 + }, + { + "epoch": 0.2192959231242782, + "grad_norm": 0.38058164715766907, + "learning_rate": 0.00015617307217405998, + "loss": 1.6633, + "step": 16876 + }, + { + "epoch": 0.21930891766819408, + "grad_norm": 0.32354289293289185, + "learning_rate": 0.0001561704727121486, + "loss": 1.1628, + "step": 16877 + }, + { + "epoch": 0.21932191221210995, + "grad_norm": 0.34749579429626465, + "learning_rate": 0.0001561678732502372, + "loss": 1.4499, + "step": 16878 + }, + { + "epoch": 0.21933490675602582, + "grad_norm": 0.3439461290836334, + "learning_rate": 0.00015616527378832583, + "loss": 1.3724, + "step": 16879 + }, + { + "epoch": 0.2193479012999417, + "grad_norm": 0.4042541980743408, + "learning_rate": 0.00015616267432641445, + "loss": 1.5251, + "step": 16880 + }, + { + "epoch": 0.21936089584385757, + "grad_norm": 0.3595815896987915, + "learning_rate": 0.00015616007486450305, + "loss": 1.3439, + "step": 16881 + }, + { + "epoch": 0.21937389038777344, + "grad_norm": 0.41376960277557373, + "learning_rate": 0.00015615747540259167, + "loss": 1.3891, + "step": 16882 + }, + { + "epoch": 0.2193868849316893, + "grad_norm": 0.2848331928253174, + "learning_rate": 0.00015615487594068027, + "loss": 1.2759, + "step": 16883 + }, + { + "epoch": 0.2193998794756052, + "grad_norm": 0.40645015239715576, + "learning_rate": 0.00015615227647876892, + "loss": 1.3187, + "step": 16884 + }, + { + "epoch": 0.21941287401952106, + "grad_norm": 0.4796353280544281, + "learning_rate": 0.00015614967701685752, + "loss": 1.6828, + "step": 16885 + }, + { + "epoch": 0.21942586856343693, + "grad_norm": 0.37315917015075684, + "learning_rate": 0.00015614707755494614, + "loss": 1.3014, + "step": 16886 + }, + { + "epoch": 0.2194388631073528, + "grad_norm": 0.3138767182826996, + "learning_rate": 0.00015614447809303474, + "loss": 1.2505, + "step": 16887 + }, + { + "epoch": 0.21945185765126868, + "grad_norm": 0.4015048146247864, + "learning_rate": 0.00015614187863112336, + "loss": 1.4245, + "step": 16888 + }, + { + "epoch": 0.21946485219518455, + "grad_norm": 0.36808067560195923, + "learning_rate": 0.000156139279169212, + "loss": 1.2575, + "step": 16889 + }, + { + "epoch": 0.21947784673910042, + "grad_norm": 0.4535841643810272, + "learning_rate": 0.00015613667970730059, + "loss": 1.3205, + "step": 16890 + }, + { + "epoch": 0.2194908412830163, + "grad_norm": 0.32976144552230835, + "learning_rate": 0.0001561340802453892, + "loss": 1.1836, + "step": 16891 + }, + { + "epoch": 0.21950383582693217, + "grad_norm": 0.41894757747650146, + "learning_rate": 0.00015613148078347784, + "loss": 1.5443, + "step": 16892 + }, + { + "epoch": 0.21951683037084804, + "grad_norm": 0.35990431904792786, + "learning_rate": 0.00015612888132156643, + "loss": 1.1722, + "step": 16893 + }, + { + "epoch": 0.21952982491476392, + "grad_norm": 0.40163126587867737, + "learning_rate": 0.00015612628185965506, + "loss": 1.2236, + "step": 16894 + }, + { + "epoch": 0.2195428194586798, + "grad_norm": 0.34681564569473267, + "learning_rate": 0.00015612368239774365, + "loss": 1.3468, + "step": 16895 + }, + { + "epoch": 0.21955581400259566, + "grad_norm": 0.36382806301116943, + "learning_rate": 0.0001561210829358323, + "loss": 1.3494, + "step": 16896 + }, + { + "epoch": 0.21956880854651153, + "grad_norm": 0.513202965259552, + "learning_rate": 0.0001561184834739209, + "loss": 1.4313, + "step": 16897 + }, + { + "epoch": 0.2195818030904274, + "grad_norm": 0.32461661100387573, + "learning_rate": 0.00015611588401200953, + "loss": 1.4219, + "step": 16898 + }, + { + "epoch": 0.21959479763434328, + "grad_norm": 0.44427090883255005, + "learning_rate": 0.00015611328455009813, + "loss": 1.6281, + "step": 16899 + }, + { + "epoch": 0.21960779217825915, + "grad_norm": 0.40824612975120544, + "learning_rate": 0.00015611068508818675, + "loss": 1.4497, + "step": 16900 + }, + { + "epoch": 0.21962078672217503, + "grad_norm": 0.4134128987789154, + "learning_rate": 0.00015610808562627537, + "loss": 1.5011, + "step": 16901 + }, + { + "epoch": 0.2196337812660909, + "grad_norm": 0.4088248312473297, + "learning_rate": 0.00015610548616436397, + "loss": 1.4041, + "step": 16902 + }, + { + "epoch": 0.21964677581000677, + "grad_norm": 0.3561285734176636, + "learning_rate": 0.0001561028867024526, + "loss": 1.4716, + "step": 16903 + }, + { + "epoch": 0.21965977035392265, + "grad_norm": 0.4280087649822235, + "learning_rate": 0.00015610028724054122, + "loss": 1.3912, + "step": 16904 + }, + { + "epoch": 0.21967276489783852, + "grad_norm": 0.37301552295684814, + "learning_rate": 0.00015609768777862982, + "loss": 1.4228, + "step": 16905 + }, + { + "epoch": 0.2196857594417544, + "grad_norm": 0.3958079516887665, + "learning_rate": 0.00015609508831671844, + "loss": 1.5259, + "step": 16906 + }, + { + "epoch": 0.21969875398567026, + "grad_norm": 0.421053946018219, + "learning_rate": 0.00015609248885480707, + "loss": 1.3295, + "step": 16907 + }, + { + "epoch": 0.21971174852958614, + "grad_norm": 0.3922063112258911, + "learning_rate": 0.0001560898893928957, + "loss": 1.4187, + "step": 16908 + }, + { + "epoch": 0.219724743073502, + "grad_norm": 0.4066793918609619, + "learning_rate": 0.0001560872899309843, + "loss": 1.3811, + "step": 16909 + }, + { + "epoch": 0.21973773761741788, + "grad_norm": 0.41032347083091736, + "learning_rate": 0.0001560846904690729, + "loss": 1.5123, + "step": 16910 + }, + { + "epoch": 0.21975073216133376, + "grad_norm": 0.335581511259079, + "learning_rate": 0.00015608209100716154, + "loss": 1.2008, + "step": 16911 + }, + { + "epoch": 0.21976372670524963, + "grad_norm": 0.3605802059173584, + "learning_rate": 0.00015607949154525014, + "loss": 1.3167, + "step": 16912 + }, + { + "epoch": 0.2197767212491655, + "grad_norm": 0.43856149911880493, + "learning_rate": 0.00015607689208333876, + "loss": 1.5308, + "step": 16913 + }, + { + "epoch": 0.21978971579308137, + "grad_norm": 0.3836953341960907, + "learning_rate": 0.00015607429262142736, + "loss": 1.496, + "step": 16914 + }, + { + "epoch": 0.21980271033699725, + "grad_norm": 0.5015119314193726, + "learning_rate": 0.000156071693159516, + "loss": 1.4847, + "step": 16915 + }, + { + "epoch": 0.21981570488091312, + "grad_norm": 0.5078940391540527, + "learning_rate": 0.0001560690936976046, + "loss": 1.4667, + "step": 16916 + }, + { + "epoch": 0.219828699424829, + "grad_norm": 0.42061948776245117, + "learning_rate": 0.0001560664942356932, + "loss": 1.4439, + "step": 16917 + }, + { + "epoch": 0.21984169396874487, + "grad_norm": 0.32008206844329834, + "learning_rate": 0.00015606389477378183, + "loss": 1.5181, + "step": 16918 + }, + { + "epoch": 0.21985468851266074, + "grad_norm": 0.48307475447654724, + "learning_rate": 0.00015606129531187045, + "loss": 1.2098, + "step": 16919 + }, + { + "epoch": 0.2198676830565766, + "grad_norm": 0.3718625009059906, + "learning_rate": 0.00015605869584995908, + "loss": 1.3122, + "step": 16920 + }, + { + "epoch": 0.21988067760049249, + "grad_norm": 0.45591822266578674, + "learning_rate": 0.00015605609638804767, + "loss": 1.3838, + "step": 16921 + }, + { + "epoch": 0.21989367214440836, + "grad_norm": 0.3006881773471832, + "learning_rate": 0.0001560534969261363, + "loss": 1.5228, + "step": 16922 + }, + { + "epoch": 0.21990666668832423, + "grad_norm": 0.38272807002067566, + "learning_rate": 0.00015605089746422492, + "loss": 1.4197, + "step": 16923 + }, + { + "epoch": 0.2199196612322401, + "grad_norm": 0.36027419567108154, + "learning_rate": 0.00015604829800231352, + "loss": 1.3656, + "step": 16924 + }, + { + "epoch": 0.21993265577615598, + "grad_norm": 0.3465765118598938, + "learning_rate": 0.00015604569854040215, + "loss": 1.4092, + "step": 16925 + }, + { + "epoch": 0.21994565032007185, + "grad_norm": 0.40953442454338074, + "learning_rate": 0.00015604309907849074, + "loss": 1.4936, + "step": 16926 + }, + { + "epoch": 0.21995864486398772, + "grad_norm": 0.5084898471832275, + "learning_rate": 0.0001560404996165794, + "loss": 1.4619, + "step": 16927 + }, + { + "epoch": 0.2199716394079036, + "grad_norm": 0.4237470030784607, + "learning_rate": 0.000156037900154668, + "loss": 1.3977, + "step": 16928 + }, + { + "epoch": 0.21998463395181947, + "grad_norm": 0.4161651134490967, + "learning_rate": 0.00015603530069275662, + "loss": 1.2603, + "step": 16929 + }, + { + "epoch": 0.21999762849573534, + "grad_norm": 0.38115230202674866, + "learning_rate": 0.0001560327012308452, + "loss": 1.2718, + "step": 16930 + }, + { + "epoch": 0.22001062303965122, + "grad_norm": 0.4662272334098816, + "learning_rate": 0.00015603010176893384, + "loss": 1.5266, + "step": 16931 + }, + { + "epoch": 0.2200236175835671, + "grad_norm": 0.2777803838253021, + "learning_rate": 0.00015602750230702246, + "loss": 1.1361, + "step": 16932 + }, + { + "epoch": 0.22003661212748296, + "grad_norm": 0.4139927625656128, + "learning_rate": 0.00015602490284511106, + "loss": 1.4081, + "step": 16933 + }, + { + "epoch": 0.22004960667139883, + "grad_norm": 0.3474874794483185, + "learning_rate": 0.00015602230338319968, + "loss": 1.3327, + "step": 16934 + }, + { + "epoch": 0.2200626012153147, + "grad_norm": 0.5078178644180298, + "learning_rate": 0.0001560197039212883, + "loss": 1.5693, + "step": 16935 + }, + { + "epoch": 0.22007559575923058, + "grad_norm": 0.42416632175445557, + "learning_rate": 0.0001560171044593769, + "loss": 1.4047, + "step": 16936 + }, + { + "epoch": 0.22008859030314645, + "grad_norm": 0.35124078392982483, + "learning_rate": 0.00015601450499746553, + "loss": 1.499, + "step": 16937 + }, + { + "epoch": 0.22010158484706233, + "grad_norm": 0.44358327984809875, + "learning_rate": 0.00015601190553555413, + "loss": 1.4055, + "step": 16938 + }, + { + "epoch": 0.22011457939097823, + "grad_norm": 0.38890859484672546, + "learning_rate": 0.00015600930607364278, + "loss": 1.4529, + "step": 16939 + }, + { + "epoch": 0.2201275739348941, + "grad_norm": 0.3731576204299927, + "learning_rate": 0.00015600670661173138, + "loss": 1.4017, + "step": 16940 + }, + { + "epoch": 0.22014056847880997, + "grad_norm": 0.431218683719635, + "learning_rate": 0.00015600410714982, + "loss": 1.3973, + "step": 16941 + }, + { + "epoch": 0.22015356302272585, + "grad_norm": 0.3969866931438446, + "learning_rate": 0.00015600150768790863, + "loss": 1.1836, + "step": 16942 + }, + { + "epoch": 0.22016655756664172, + "grad_norm": 0.2890317142009735, + "learning_rate": 0.00015599890822599722, + "loss": 1.3694, + "step": 16943 + }, + { + "epoch": 0.2201795521105576, + "grad_norm": 0.5005160570144653, + "learning_rate": 0.00015599630876408585, + "loss": 1.4754, + "step": 16944 + }, + { + "epoch": 0.22019254665447346, + "grad_norm": 0.47472092509269714, + "learning_rate": 0.00015599370930217445, + "loss": 1.3355, + "step": 16945 + }, + { + "epoch": 0.22020554119838934, + "grad_norm": 0.39735203981399536, + "learning_rate": 0.0001559911098402631, + "loss": 1.5688, + "step": 16946 + }, + { + "epoch": 0.2202185357423052, + "grad_norm": 0.4081897735595703, + "learning_rate": 0.0001559885103783517, + "loss": 1.5895, + "step": 16947 + }, + { + "epoch": 0.22023153028622108, + "grad_norm": 0.412993848323822, + "learning_rate": 0.0001559859109164403, + "loss": 1.3628, + "step": 16948 + }, + { + "epoch": 0.22024452483013696, + "grad_norm": 0.3694773316383362, + "learning_rate": 0.00015598331145452892, + "loss": 1.5862, + "step": 16949 + }, + { + "epoch": 0.22025751937405283, + "grad_norm": 0.36286282539367676, + "learning_rate": 0.00015598071199261754, + "loss": 1.4797, + "step": 16950 + }, + { + "epoch": 0.2202705139179687, + "grad_norm": 0.36835727095603943, + "learning_rate": 0.00015597811253070616, + "loss": 1.4781, + "step": 16951 + }, + { + "epoch": 0.22028350846188458, + "grad_norm": 0.44325903058052063, + "learning_rate": 0.00015597551306879476, + "loss": 1.4121, + "step": 16952 + }, + { + "epoch": 0.22029650300580045, + "grad_norm": 0.4392807185649872, + "learning_rate": 0.0001559729136068834, + "loss": 1.5093, + "step": 16953 + }, + { + "epoch": 0.22030949754971632, + "grad_norm": 0.3409249782562256, + "learning_rate": 0.000155970314144972, + "loss": 1.5036, + "step": 16954 + }, + { + "epoch": 0.2203224920936322, + "grad_norm": 0.4693164527416229, + "learning_rate": 0.0001559677146830606, + "loss": 1.3713, + "step": 16955 + }, + { + "epoch": 0.22033548663754807, + "grad_norm": 0.3211345970630646, + "learning_rate": 0.00015596511522114923, + "loss": 1.3309, + "step": 16956 + }, + { + "epoch": 0.22034848118146394, + "grad_norm": 0.3904562294483185, + "learning_rate": 0.00015596251575923783, + "loss": 1.4153, + "step": 16957 + }, + { + "epoch": 0.2203614757253798, + "grad_norm": 0.42940059304237366, + "learning_rate": 0.00015595991629732648, + "loss": 1.3425, + "step": 16958 + }, + { + "epoch": 0.22037447026929569, + "grad_norm": 0.46401265263557434, + "learning_rate": 0.00015595731683541508, + "loss": 1.2938, + "step": 16959 + }, + { + "epoch": 0.22038746481321156, + "grad_norm": 0.35324928164482117, + "learning_rate": 0.00015595471737350368, + "loss": 1.5334, + "step": 16960 + }, + { + "epoch": 0.22040045935712743, + "grad_norm": 0.28420963883399963, + "learning_rate": 0.0001559521179115923, + "loss": 1.1194, + "step": 16961 + }, + { + "epoch": 0.2204134539010433, + "grad_norm": 0.3774755895137787, + "learning_rate": 0.00015594951844968093, + "loss": 1.3872, + "step": 16962 + }, + { + "epoch": 0.22042644844495918, + "grad_norm": 0.304919958114624, + "learning_rate": 0.00015594691898776955, + "loss": 1.2284, + "step": 16963 + }, + { + "epoch": 0.22043944298887505, + "grad_norm": 0.5344669222831726, + "learning_rate": 0.00015594431952585815, + "loss": 1.4851, + "step": 16964 + }, + { + "epoch": 0.22045243753279092, + "grad_norm": 0.33331140875816345, + "learning_rate": 0.00015594172006394677, + "loss": 1.4512, + "step": 16965 + }, + { + "epoch": 0.2204654320767068, + "grad_norm": 0.38237372040748596, + "learning_rate": 0.0001559391206020354, + "loss": 1.1646, + "step": 16966 + }, + { + "epoch": 0.22047842662062267, + "grad_norm": 0.378888338804245, + "learning_rate": 0.000155936521140124, + "loss": 1.3772, + "step": 16967 + }, + { + "epoch": 0.22049142116453854, + "grad_norm": 0.40520039200782776, + "learning_rate": 0.00015593392167821262, + "loss": 1.63, + "step": 16968 + }, + { + "epoch": 0.22050441570845442, + "grad_norm": 0.3504319190979004, + "learning_rate": 0.00015593132221630122, + "loss": 1.2721, + "step": 16969 + }, + { + "epoch": 0.2205174102523703, + "grad_norm": 0.36314040422439575, + "learning_rate": 0.00015592872275438987, + "loss": 1.5577, + "step": 16970 + }, + { + "epoch": 0.22053040479628616, + "grad_norm": 0.37017959356307983, + "learning_rate": 0.00015592612329247846, + "loss": 1.511, + "step": 16971 + }, + { + "epoch": 0.22054339934020203, + "grad_norm": 0.5501101016998291, + "learning_rate": 0.00015592352383056706, + "loss": 1.5643, + "step": 16972 + }, + { + "epoch": 0.2205563938841179, + "grad_norm": 0.45284131169319153, + "learning_rate": 0.0001559209243686557, + "loss": 1.4666, + "step": 16973 + }, + { + "epoch": 0.22056938842803378, + "grad_norm": 0.4814234972000122, + "learning_rate": 0.0001559183249067443, + "loss": 1.4575, + "step": 16974 + }, + { + "epoch": 0.22058238297194965, + "grad_norm": 0.3367791175842285, + "learning_rate": 0.00015591572544483294, + "loss": 1.3896, + "step": 16975 + }, + { + "epoch": 0.22059537751586553, + "grad_norm": 0.31046754121780396, + "learning_rate": 0.00015591312598292153, + "loss": 1.4478, + "step": 16976 + }, + { + "epoch": 0.2206083720597814, + "grad_norm": 0.5310745239257812, + "learning_rate": 0.00015591052652101016, + "loss": 1.3571, + "step": 16977 + }, + { + "epoch": 0.22062136660369727, + "grad_norm": 0.23618222773075104, + "learning_rate": 0.00015590792705909878, + "loss": 1.5222, + "step": 16978 + }, + { + "epoch": 0.22063436114761314, + "grad_norm": 0.442163348197937, + "learning_rate": 0.00015590532759718738, + "loss": 1.4654, + "step": 16979 + }, + { + "epoch": 0.22064735569152902, + "grad_norm": 0.4986705482006073, + "learning_rate": 0.000155902728135276, + "loss": 1.363, + "step": 16980 + }, + { + "epoch": 0.2206603502354449, + "grad_norm": 0.44632959365844727, + "learning_rate": 0.00015590012867336463, + "loss": 1.5249, + "step": 16981 + }, + { + "epoch": 0.22067334477936076, + "grad_norm": 0.44567590951919556, + "learning_rate": 0.00015589752921145325, + "loss": 1.3507, + "step": 16982 + }, + { + "epoch": 0.22068633932327664, + "grad_norm": 0.32191595435142517, + "learning_rate": 0.00015589492974954185, + "loss": 1.4514, + "step": 16983 + }, + { + "epoch": 0.2206993338671925, + "grad_norm": 0.4375464618206024, + "learning_rate": 0.00015589233028763047, + "loss": 1.7349, + "step": 16984 + }, + { + "epoch": 0.22071232841110838, + "grad_norm": 0.3427964448928833, + "learning_rate": 0.0001558897308257191, + "loss": 1.1675, + "step": 16985 + }, + { + "epoch": 0.22072532295502426, + "grad_norm": 0.4100309908390045, + "learning_rate": 0.0001558871313638077, + "loss": 1.3829, + "step": 16986 + }, + { + "epoch": 0.22073831749894013, + "grad_norm": 0.35462266206741333, + "learning_rate": 0.00015588453190189632, + "loss": 1.3021, + "step": 16987 + }, + { + "epoch": 0.220751312042856, + "grad_norm": 0.47487521171569824, + "learning_rate": 0.00015588193243998492, + "loss": 1.4399, + "step": 16988 + }, + { + "epoch": 0.22076430658677187, + "grad_norm": 0.4301503896713257, + "learning_rate": 0.00015587933297807354, + "loss": 1.4897, + "step": 16989 + }, + { + "epoch": 0.22077730113068775, + "grad_norm": 0.4237578511238098, + "learning_rate": 0.00015587673351616217, + "loss": 1.52, + "step": 16990 + }, + { + "epoch": 0.22079029567460362, + "grad_norm": 0.4684312641620636, + "learning_rate": 0.00015587413405425076, + "loss": 1.4168, + "step": 16991 + }, + { + "epoch": 0.2208032902185195, + "grad_norm": 0.4203507602214813, + "learning_rate": 0.0001558715345923394, + "loss": 1.5013, + "step": 16992 + }, + { + "epoch": 0.22081628476243537, + "grad_norm": 0.4595431685447693, + "learning_rate": 0.00015586893513042801, + "loss": 1.3043, + "step": 16993 + }, + { + "epoch": 0.22082927930635124, + "grad_norm": 0.3742528557777405, + "learning_rate": 0.00015586633566851664, + "loss": 1.2729, + "step": 16994 + }, + { + "epoch": 0.2208422738502671, + "grad_norm": 0.35850024223327637, + "learning_rate": 0.00015586373620660524, + "loss": 1.4276, + "step": 16995 + }, + { + "epoch": 0.22085526839418299, + "grad_norm": 0.41013646125793457, + "learning_rate": 0.00015586113674469386, + "loss": 1.5027, + "step": 16996 + }, + { + "epoch": 0.22086826293809886, + "grad_norm": 0.4253622591495514, + "learning_rate": 0.00015585853728278248, + "loss": 1.5784, + "step": 16997 + }, + { + "epoch": 0.22088125748201473, + "grad_norm": 0.5466195344924927, + "learning_rate": 0.00015585593782087108, + "loss": 1.3935, + "step": 16998 + }, + { + "epoch": 0.2208942520259306, + "grad_norm": 0.2679307162761688, + "learning_rate": 0.0001558533383589597, + "loss": 1.3094, + "step": 16999 + }, + { + "epoch": 0.22090724656984648, + "grad_norm": 0.35882630944252014, + "learning_rate": 0.0001558507388970483, + "loss": 1.2618, + "step": 17000 + }, + { + "epoch": 0.22092024111376235, + "grad_norm": 0.49659794569015503, + "learning_rate": 0.00015584813943513693, + "loss": 1.3546, + "step": 17001 + }, + { + "epoch": 0.22093323565767822, + "grad_norm": 0.48639124631881714, + "learning_rate": 0.00015584553997322555, + "loss": 1.4672, + "step": 17002 + }, + { + "epoch": 0.2209462302015941, + "grad_norm": 0.4292655289173126, + "learning_rate": 0.00015584294051131415, + "loss": 1.2426, + "step": 17003 + }, + { + "epoch": 0.22095922474550997, + "grad_norm": 0.40612438321113586, + "learning_rate": 0.00015584034104940277, + "loss": 1.6765, + "step": 17004 + }, + { + "epoch": 0.22097221928942584, + "grad_norm": 0.3818831741809845, + "learning_rate": 0.0001558377415874914, + "loss": 1.4683, + "step": 17005 + }, + { + "epoch": 0.22098521383334171, + "grad_norm": 0.36524754762649536, + "learning_rate": 0.00015583514212558002, + "loss": 1.4492, + "step": 17006 + }, + { + "epoch": 0.2209982083772576, + "grad_norm": 0.4679274559020996, + "learning_rate": 0.00015583254266366862, + "loss": 1.4814, + "step": 17007 + }, + { + "epoch": 0.22101120292117346, + "grad_norm": 0.43185847997665405, + "learning_rate": 0.00015582994320175725, + "loss": 1.29, + "step": 17008 + }, + { + "epoch": 0.22102419746508933, + "grad_norm": 0.45009997487068176, + "learning_rate": 0.00015582734373984587, + "loss": 1.7194, + "step": 17009 + }, + { + "epoch": 0.2210371920090052, + "grad_norm": 0.44202664494514465, + "learning_rate": 0.00015582474427793447, + "loss": 1.4675, + "step": 17010 + }, + { + "epoch": 0.22105018655292108, + "grad_norm": 0.4849221706390381, + "learning_rate": 0.0001558221448160231, + "loss": 1.416, + "step": 17011 + }, + { + "epoch": 0.22106318109683695, + "grad_norm": 0.3867471516132355, + "learning_rate": 0.0001558195453541117, + "loss": 1.3669, + "step": 17012 + }, + { + "epoch": 0.22107617564075283, + "grad_norm": 0.368221640586853, + "learning_rate": 0.00015581694589220034, + "loss": 1.3793, + "step": 17013 + }, + { + "epoch": 0.2210891701846687, + "grad_norm": 0.36737099289894104, + "learning_rate": 0.00015581434643028894, + "loss": 1.4401, + "step": 17014 + }, + { + "epoch": 0.2211021647285846, + "grad_norm": 0.38588178157806396, + "learning_rate": 0.00015581174696837754, + "loss": 1.3863, + "step": 17015 + }, + { + "epoch": 0.22111515927250047, + "grad_norm": 0.43531864881515503, + "learning_rate": 0.0001558091475064662, + "loss": 1.46, + "step": 17016 + }, + { + "epoch": 0.22112815381641635, + "grad_norm": 0.4761612117290497, + "learning_rate": 0.00015580654804455478, + "loss": 1.4948, + "step": 17017 + }, + { + "epoch": 0.22114114836033222, + "grad_norm": 0.40776824951171875, + "learning_rate": 0.0001558039485826434, + "loss": 1.6796, + "step": 17018 + }, + { + "epoch": 0.2211541429042481, + "grad_norm": 0.45228666067123413, + "learning_rate": 0.000155801349120732, + "loss": 1.4522, + "step": 17019 + }, + { + "epoch": 0.22116713744816396, + "grad_norm": 0.45015138387680054, + "learning_rate": 0.00015579874965882063, + "loss": 1.5762, + "step": 17020 + }, + { + "epoch": 0.22118013199207984, + "grad_norm": 0.4756341874599457, + "learning_rate": 0.00015579615019690926, + "loss": 1.4287, + "step": 17021 + }, + { + "epoch": 0.2211931265359957, + "grad_norm": 0.347330778837204, + "learning_rate": 0.00015579355073499785, + "loss": 1.305, + "step": 17022 + }, + { + "epoch": 0.22120612107991158, + "grad_norm": 0.41575974225997925, + "learning_rate": 0.00015579095127308648, + "loss": 1.4143, + "step": 17023 + }, + { + "epoch": 0.22121911562382746, + "grad_norm": 0.35551080107688904, + "learning_rate": 0.0001557883518111751, + "loss": 1.36, + "step": 17024 + }, + { + "epoch": 0.22123211016774333, + "grad_norm": 0.43945980072021484, + "learning_rate": 0.00015578575234926373, + "loss": 1.3988, + "step": 17025 + }, + { + "epoch": 0.2212451047116592, + "grad_norm": 0.45011472702026367, + "learning_rate": 0.00015578315288735232, + "loss": 1.6539, + "step": 17026 + }, + { + "epoch": 0.22125809925557507, + "grad_norm": 0.419046550989151, + "learning_rate": 0.00015578055342544092, + "loss": 1.5442, + "step": 17027 + }, + { + "epoch": 0.22127109379949095, + "grad_norm": 0.3983384668827057, + "learning_rate": 0.00015577795396352957, + "loss": 1.38, + "step": 17028 + }, + { + "epoch": 0.22128408834340682, + "grad_norm": 0.3543729782104492, + "learning_rate": 0.00015577535450161817, + "loss": 1.3682, + "step": 17029 + }, + { + "epoch": 0.2212970828873227, + "grad_norm": 0.510632336139679, + "learning_rate": 0.0001557727550397068, + "loss": 1.7461, + "step": 17030 + }, + { + "epoch": 0.22131007743123857, + "grad_norm": 0.440660685300827, + "learning_rate": 0.0001557701555777954, + "loss": 1.4275, + "step": 17031 + }, + { + "epoch": 0.22132307197515444, + "grad_norm": 0.34685972332954407, + "learning_rate": 0.00015576755611588402, + "loss": 1.2296, + "step": 17032 + }, + { + "epoch": 0.2213360665190703, + "grad_norm": 0.2996695339679718, + "learning_rate": 0.00015576495665397264, + "loss": 1.1563, + "step": 17033 + }, + { + "epoch": 0.22134906106298619, + "grad_norm": 0.3733215928077698, + "learning_rate": 0.00015576235719206124, + "loss": 1.4296, + "step": 17034 + }, + { + "epoch": 0.22136205560690206, + "grad_norm": 0.38955050706863403, + "learning_rate": 0.00015575975773014986, + "loss": 1.3116, + "step": 17035 + }, + { + "epoch": 0.22137505015081793, + "grad_norm": 0.4073977470397949, + "learning_rate": 0.0001557571582682385, + "loss": 1.3835, + "step": 17036 + }, + { + "epoch": 0.2213880446947338, + "grad_norm": 0.41414129734039307, + "learning_rate": 0.0001557545588063271, + "loss": 1.4798, + "step": 17037 + }, + { + "epoch": 0.22140103923864968, + "grad_norm": 0.46962955594062805, + "learning_rate": 0.0001557519593444157, + "loss": 1.3848, + "step": 17038 + }, + { + "epoch": 0.22141403378256555, + "grad_norm": 0.48201245069503784, + "learning_rate": 0.0001557493598825043, + "loss": 1.6492, + "step": 17039 + }, + { + "epoch": 0.22142702832648142, + "grad_norm": 0.38728567957878113, + "learning_rate": 0.00015574676042059296, + "loss": 1.418, + "step": 17040 + }, + { + "epoch": 0.2214400228703973, + "grad_norm": 0.40893441438674927, + "learning_rate": 0.00015574416095868156, + "loss": 1.5834, + "step": 17041 + }, + { + "epoch": 0.22145301741431317, + "grad_norm": 0.49246731400489807, + "learning_rate": 0.00015574156149677018, + "loss": 1.6009, + "step": 17042 + }, + { + "epoch": 0.22146601195822904, + "grad_norm": 0.397651344537735, + "learning_rate": 0.00015573896203485878, + "loss": 1.4665, + "step": 17043 + }, + { + "epoch": 0.22147900650214492, + "grad_norm": 0.39051553606987, + "learning_rate": 0.0001557363625729474, + "loss": 1.4885, + "step": 17044 + }, + { + "epoch": 0.2214920010460608, + "grad_norm": 0.45621123909950256, + "learning_rate": 0.00015573376311103603, + "loss": 1.4777, + "step": 17045 + }, + { + "epoch": 0.22150499558997666, + "grad_norm": 0.4330211579799652, + "learning_rate": 0.00015573116364912462, + "loss": 1.1337, + "step": 17046 + }, + { + "epoch": 0.22151799013389253, + "grad_norm": 0.3897799849510193, + "learning_rate": 0.00015572856418721325, + "loss": 1.2749, + "step": 17047 + }, + { + "epoch": 0.2215309846778084, + "grad_norm": 0.3862776458263397, + "learning_rate": 0.00015572596472530187, + "loss": 1.4044, + "step": 17048 + }, + { + "epoch": 0.22154397922172428, + "grad_norm": 0.3762578070163727, + "learning_rate": 0.0001557233652633905, + "loss": 1.4842, + "step": 17049 + }, + { + "epoch": 0.22155697376564015, + "grad_norm": 0.3673527240753174, + "learning_rate": 0.0001557207658014791, + "loss": 1.2782, + "step": 17050 + }, + { + "epoch": 0.22156996830955603, + "grad_norm": 0.3858078122138977, + "learning_rate": 0.00015571816633956772, + "loss": 1.5894, + "step": 17051 + }, + { + "epoch": 0.2215829628534719, + "grad_norm": 0.40626755356788635, + "learning_rate": 0.00015571556687765634, + "loss": 1.4181, + "step": 17052 + }, + { + "epoch": 0.22159595739738777, + "grad_norm": 0.3597128093242645, + "learning_rate": 0.00015571296741574494, + "loss": 1.4549, + "step": 17053 + }, + { + "epoch": 0.22160895194130364, + "grad_norm": 0.4017217755317688, + "learning_rate": 0.00015571036795383357, + "loss": 1.3754, + "step": 17054 + }, + { + "epoch": 0.22162194648521952, + "grad_norm": 0.3090975284576416, + "learning_rate": 0.0001557077684919222, + "loss": 1.3076, + "step": 17055 + }, + { + "epoch": 0.2216349410291354, + "grad_norm": 0.3845195174217224, + "learning_rate": 0.0001557051690300108, + "loss": 1.3728, + "step": 17056 + }, + { + "epoch": 0.22164793557305126, + "grad_norm": 0.4403516948223114, + "learning_rate": 0.0001557025695680994, + "loss": 1.491, + "step": 17057 + }, + { + "epoch": 0.22166093011696714, + "grad_norm": 0.3647840917110443, + "learning_rate": 0.000155699970106188, + "loss": 1.4119, + "step": 17058 + }, + { + "epoch": 0.221673924660883, + "grad_norm": 0.40061089396476746, + "learning_rate": 0.00015569737064427666, + "loss": 1.4018, + "step": 17059 + }, + { + "epoch": 0.22168691920479888, + "grad_norm": 0.33236902952194214, + "learning_rate": 0.00015569477118236526, + "loss": 1.2903, + "step": 17060 + }, + { + "epoch": 0.22169991374871476, + "grad_norm": 0.44524461030960083, + "learning_rate": 0.00015569217172045388, + "loss": 1.4229, + "step": 17061 + }, + { + "epoch": 0.22171290829263063, + "grad_norm": 0.5494827032089233, + "learning_rate": 0.00015568957225854248, + "loss": 1.5232, + "step": 17062 + }, + { + "epoch": 0.2217259028365465, + "grad_norm": 0.4711943566799164, + "learning_rate": 0.0001556869727966311, + "loss": 1.4898, + "step": 17063 + }, + { + "epoch": 0.22173889738046237, + "grad_norm": 0.308763712644577, + "learning_rate": 0.00015568437333471973, + "loss": 1.1964, + "step": 17064 + }, + { + "epoch": 0.22175189192437825, + "grad_norm": 0.4602581560611725, + "learning_rate": 0.00015568177387280833, + "loss": 1.3666, + "step": 17065 + }, + { + "epoch": 0.22176488646829412, + "grad_norm": 0.43294858932495117, + "learning_rate": 0.00015567917441089695, + "loss": 1.3974, + "step": 17066 + }, + { + "epoch": 0.22177788101221, + "grad_norm": 0.3961070477962494, + "learning_rate": 0.00015567657494898558, + "loss": 1.3042, + "step": 17067 + }, + { + "epoch": 0.22179087555612587, + "grad_norm": 0.4131130278110504, + "learning_rate": 0.0001556739754870742, + "loss": 1.4559, + "step": 17068 + }, + { + "epoch": 0.22180387010004174, + "grad_norm": 0.3964472711086273, + "learning_rate": 0.0001556713760251628, + "loss": 1.1308, + "step": 17069 + }, + { + "epoch": 0.2218168646439576, + "grad_norm": 0.3841591775417328, + "learning_rate": 0.0001556687765632514, + "loss": 1.4378, + "step": 17070 + }, + { + "epoch": 0.22182985918787348, + "grad_norm": 0.3324319124221802, + "learning_rate": 0.00015566617710134005, + "loss": 1.5845, + "step": 17071 + }, + { + "epoch": 0.22184285373178936, + "grad_norm": 0.2964618504047394, + "learning_rate": 0.00015566357763942864, + "loss": 1.291, + "step": 17072 + }, + { + "epoch": 0.22185584827570523, + "grad_norm": 0.42336753010749817, + "learning_rate": 0.00015566097817751727, + "loss": 1.4161, + "step": 17073 + }, + { + "epoch": 0.2218688428196211, + "grad_norm": 0.4712112247943878, + "learning_rate": 0.00015565837871560587, + "loss": 1.2435, + "step": 17074 + }, + { + "epoch": 0.22188183736353698, + "grad_norm": 0.46201184391975403, + "learning_rate": 0.0001556557792536945, + "loss": 1.305, + "step": 17075 + }, + { + "epoch": 0.22189483190745285, + "grad_norm": 0.34623244404792786, + "learning_rate": 0.00015565317979178311, + "loss": 1.2764, + "step": 17076 + }, + { + "epoch": 0.22190782645136872, + "grad_norm": 0.3202507495880127, + "learning_rate": 0.0001556505803298717, + "loss": 1.2393, + "step": 17077 + }, + { + "epoch": 0.2219208209952846, + "grad_norm": 0.3208293318748474, + "learning_rate": 0.00015564798086796034, + "loss": 1.634, + "step": 17078 + }, + { + "epoch": 0.22193381553920047, + "grad_norm": 0.44903144240379333, + "learning_rate": 0.00015564538140604896, + "loss": 1.4049, + "step": 17079 + }, + { + "epoch": 0.22194681008311634, + "grad_norm": 0.41508638858795166, + "learning_rate": 0.00015564278194413758, + "loss": 1.4669, + "step": 17080 + }, + { + "epoch": 0.22195980462703221, + "grad_norm": 0.3519446551799774, + "learning_rate": 0.00015564018248222618, + "loss": 1.3563, + "step": 17081 + }, + { + "epoch": 0.2219727991709481, + "grad_norm": 0.46139299869537354, + "learning_rate": 0.00015563758302031478, + "loss": 1.3372, + "step": 17082 + }, + { + "epoch": 0.22198579371486396, + "grad_norm": 0.3968223035335541, + "learning_rate": 0.00015563498355840343, + "loss": 1.3699, + "step": 17083 + }, + { + "epoch": 0.22199878825877983, + "grad_norm": 0.4089559018611908, + "learning_rate": 0.00015563238409649203, + "loss": 1.2311, + "step": 17084 + }, + { + "epoch": 0.2220117828026957, + "grad_norm": 0.4803493320941925, + "learning_rate": 0.00015562978463458065, + "loss": 1.1683, + "step": 17085 + }, + { + "epoch": 0.22202477734661158, + "grad_norm": 0.45740577578544617, + "learning_rate": 0.00015562718517266925, + "loss": 1.6397, + "step": 17086 + }, + { + "epoch": 0.22203777189052745, + "grad_norm": 0.3872956335544586, + "learning_rate": 0.00015562458571075788, + "loss": 1.5075, + "step": 17087 + }, + { + "epoch": 0.22205076643444333, + "grad_norm": 0.40057888627052307, + "learning_rate": 0.0001556219862488465, + "loss": 1.359, + "step": 17088 + }, + { + "epoch": 0.2220637609783592, + "grad_norm": 0.4077836275100708, + "learning_rate": 0.0001556193867869351, + "loss": 1.3742, + "step": 17089 + }, + { + "epoch": 0.22207675552227507, + "grad_norm": 0.34613364934921265, + "learning_rate": 0.00015561678732502375, + "loss": 1.4589, + "step": 17090 + }, + { + "epoch": 0.22208975006619097, + "grad_norm": 0.42743000388145447, + "learning_rate": 0.00015561418786311235, + "loss": 1.2644, + "step": 17091 + }, + { + "epoch": 0.22210274461010684, + "grad_norm": 0.6018355488777161, + "learning_rate": 0.00015561158840120097, + "loss": 1.4335, + "step": 17092 + }, + { + "epoch": 0.22211573915402272, + "grad_norm": 0.36277633905410767, + "learning_rate": 0.00015560898893928957, + "loss": 1.6177, + "step": 17093 + }, + { + "epoch": 0.2221287336979386, + "grad_norm": 0.4203292727470398, + "learning_rate": 0.0001556063894773782, + "loss": 1.3813, + "step": 17094 + }, + { + "epoch": 0.22214172824185446, + "grad_norm": 0.3625844120979309, + "learning_rate": 0.00015560379001546682, + "loss": 1.4356, + "step": 17095 + }, + { + "epoch": 0.22215472278577034, + "grad_norm": 0.5396364331245422, + "learning_rate": 0.00015560119055355541, + "loss": 1.4676, + "step": 17096 + }, + { + "epoch": 0.2221677173296862, + "grad_norm": 0.3877023160457611, + "learning_rate": 0.00015559859109164404, + "loss": 1.5454, + "step": 17097 + }, + { + "epoch": 0.22218071187360208, + "grad_norm": 0.514180064201355, + "learning_rate": 0.00015559599162973266, + "loss": 1.5093, + "step": 17098 + }, + { + "epoch": 0.22219370641751796, + "grad_norm": 0.39768078923225403, + "learning_rate": 0.00015559339216782126, + "loss": 1.3124, + "step": 17099 + }, + { + "epoch": 0.22220670096143383, + "grad_norm": 0.3751824200153351, + "learning_rate": 0.00015559079270590988, + "loss": 1.2822, + "step": 17100 + }, + { + "epoch": 0.2222196955053497, + "grad_norm": 0.5395187735557556, + "learning_rate": 0.00015558819324399848, + "loss": 1.3567, + "step": 17101 + }, + { + "epoch": 0.22223269004926557, + "grad_norm": 0.24038265645503998, + "learning_rate": 0.00015558559378208713, + "loss": 1.4351, + "step": 17102 + }, + { + "epoch": 0.22224568459318145, + "grad_norm": 0.43682458996772766, + "learning_rate": 0.00015558299432017573, + "loss": 1.5713, + "step": 17103 + }, + { + "epoch": 0.22225867913709732, + "grad_norm": 0.3363043963909149, + "learning_rate": 0.00015558039485826436, + "loss": 1.4052, + "step": 17104 + }, + { + "epoch": 0.2222716736810132, + "grad_norm": 0.471466064453125, + "learning_rate": 0.00015557779539635295, + "loss": 1.1848, + "step": 17105 + }, + { + "epoch": 0.22228466822492907, + "grad_norm": 0.41675013303756714, + "learning_rate": 0.00015557519593444158, + "loss": 1.3606, + "step": 17106 + }, + { + "epoch": 0.22229766276884494, + "grad_norm": 0.49515584111213684, + "learning_rate": 0.0001555725964725302, + "loss": 1.5995, + "step": 17107 + }, + { + "epoch": 0.2223106573127608, + "grad_norm": 0.2888803482055664, + "learning_rate": 0.0001555699970106188, + "loss": 1.4539, + "step": 17108 + }, + { + "epoch": 0.22232365185667669, + "grad_norm": 0.41267451643943787, + "learning_rate": 0.00015556739754870742, + "loss": 1.5359, + "step": 17109 + }, + { + "epoch": 0.22233664640059256, + "grad_norm": 0.46718311309814453, + "learning_rate": 0.00015556479808679605, + "loss": 1.3763, + "step": 17110 + }, + { + "epoch": 0.22234964094450843, + "grad_norm": 0.3803378641605377, + "learning_rate": 0.00015556219862488465, + "loss": 1.4768, + "step": 17111 + }, + { + "epoch": 0.2223626354884243, + "grad_norm": 0.42829012870788574, + "learning_rate": 0.00015555959916297327, + "loss": 1.3366, + "step": 17112 + }, + { + "epoch": 0.22237563003234018, + "grad_norm": 0.3077649772167206, + "learning_rate": 0.00015555699970106187, + "loss": 1.4243, + "step": 17113 + }, + { + "epoch": 0.22238862457625605, + "grad_norm": 0.3625745177268982, + "learning_rate": 0.00015555440023915052, + "loss": 1.3656, + "step": 17114 + }, + { + "epoch": 0.22240161912017192, + "grad_norm": 0.3440830409526825, + "learning_rate": 0.00015555180077723912, + "loss": 1.3793, + "step": 17115 + }, + { + "epoch": 0.2224146136640878, + "grad_norm": 0.36143049597740173, + "learning_rate": 0.00015554920131532774, + "loss": 1.0977, + "step": 17116 + }, + { + "epoch": 0.22242760820800367, + "grad_norm": 0.3851677179336548, + "learning_rate": 0.00015554660185341634, + "loss": 1.2104, + "step": 17117 + }, + { + "epoch": 0.22244060275191954, + "grad_norm": 0.4465150535106659, + "learning_rate": 0.00015554400239150496, + "loss": 1.3397, + "step": 17118 + }, + { + "epoch": 0.22245359729583541, + "grad_norm": 0.35547319054603577, + "learning_rate": 0.0001555414029295936, + "loss": 1.447, + "step": 17119 + }, + { + "epoch": 0.2224665918397513, + "grad_norm": 0.39653661847114563, + "learning_rate": 0.00015553880346768218, + "loss": 1.5618, + "step": 17120 + }, + { + "epoch": 0.22247958638366716, + "grad_norm": 0.338181734085083, + "learning_rate": 0.0001555362040057708, + "loss": 1.4234, + "step": 17121 + }, + { + "epoch": 0.22249258092758303, + "grad_norm": 0.4291459321975708, + "learning_rate": 0.00015553360454385943, + "loss": 1.4441, + "step": 17122 + }, + { + "epoch": 0.2225055754714989, + "grad_norm": 0.3681119978427887, + "learning_rate": 0.00015553100508194803, + "loss": 1.6754, + "step": 17123 + }, + { + "epoch": 0.22251857001541478, + "grad_norm": 0.44944924116134644, + "learning_rate": 0.00015552840562003666, + "loss": 1.4699, + "step": 17124 + }, + { + "epoch": 0.22253156455933065, + "grad_norm": 0.35042116045951843, + "learning_rate": 0.00015552580615812525, + "loss": 1.1873, + "step": 17125 + }, + { + "epoch": 0.22254455910324653, + "grad_norm": 0.31520354747772217, + "learning_rate": 0.0001555232066962139, + "loss": 1.2255, + "step": 17126 + }, + { + "epoch": 0.2225575536471624, + "grad_norm": 0.3630147874355316, + "learning_rate": 0.0001555206072343025, + "loss": 1.4813, + "step": 17127 + }, + { + "epoch": 0.22257054819107827, + "grad_norm": 0.42200225591659546, + "learning_rate": 0.00015551800777239113, + "loss": 1.4492, + "step": 17128 + }, + { + "epoch": 0.22258354273499414, + "grad_norm": 0.42526718974113464, + "learning_rate": 0.00015551540831047975, + "loss": 1.4772, + "step": 17129 + }, + { + "epoch": 0.22259653727891002, + "grad_norm": 0.6211418509483337, + "learning_rate": 0.00015551280884856835, + "loss": 1.6208, + "step": 17130 + }, + { + "epoch": 0.2226095318228259, + "grad_norm": 0.3544312119483948, + "learning_rate": 0.00015551020938665697, + "loss": 1.3873, + "step": 17131 + }, + { + "epoch": 0.22262252636674176, + "grad_norm": 0.450359582901001, + "learning_rate": 0.00015550760992474557, + "loss": 1.3735, + "step": 17132 + }, + { + "epoch": 0.22263552091065764, + "grad_norm": 0.4444940388202667, + "learning_rate": 0.00015550501046283422, + "loss": 1.4011, + "step": 17133 + }, + { + "epoch": 0.2226485154545735, + "grad_norm": 0.4288652837276459, + "learning_rate": 0.00015550241100092282, + "loss": 1.5391, + "step": 17134 + }, + { + "epoch": 0.22266150999848938, + "grad_norm": 0.4947999119758606, + "learning_rate": 0.00015549981153901144, + "loss": 1.4797, + "step": 17135 + }, + { + "epoch": 0.22267450454240525, + "grad_norm": 0.35433104634284973, + "learning_rate": 0.00015549721207710004, + "loss": 1.2502, + "step": 17136 + }, + { + "epoch": 0.22268749908632113, + "grad_norm": 0.4129192531108856, + "learning_rate": 0.00015549461261518867, + "loss": 1.4185, + "step": 17137 + }, + { + "epoch": 0.222700493630237, + "grad_norm": 0.4013659358024597, + "learning_rate": 0.0001554920131532773, + "loss": 1.296, + "step": 17138 + }, + { + "epoch": 0.22271348817415287, + "grad_norm": 0.37711530923843384, + "learning_rate": 0.0001554894136913659, + "loss": 1.3832, + "step": 17139 + }, + { + "epoch": 0.22272648271806875, + "grad_norm": 0.403781920671463, + "learning_rate": 0.0001554868142294545, + "loss": 1.4053, + "step": 17140 + }, + { + "epoch": 0.22273947726198462, + "grad_norm": 0.38190335035324097, + "learning_rate": 0.00015548421476754314, + "loss": 1.3746, + "step": 17141 + }, + { + "epoch": 0.2227524718059005, + "grad_norm": 0.3156324625015259, + "learning_rate": 0.00015548161530563173, + "loss": 1.1838, + "step": 17142 + }, + { + "epoch": 0.22276546634981637, + "grad_norm": 0.45251744985580444, + "learning_rate": 0.00015547901584372036, + "loss": 1.2926, + "step": 17143 + }, + { + "epoch": 0.22277846089373224, + "grad_norm": 0.41896748542785645, + "learning_rate": 0.00015547641638180896, + "loss": 1.4728, + "step": 17144 + }, + { + "epoch": 0.2227914554376481, + "grad_norm": 0.31802403926849365, + "learning_rate": 0.0001554738169198976, + "loss": 1.2417, + "step": 17145 + }, + { + "epoch": 0.22280444998156398, + "grad_norm": 0.3450278639793396, + "learning_rate": 0.0001554712174579862, + "loss": 1.4251, + "step": 17146 + }, + { + "epoch": 0.22281744452547986, + "grad_norm": 0.4255521595478058, + "learning_rate": 0.00015546861799607483, + "loss": 1.4164, + "step": 17147 + }, + { + "epoch": 0.22283043906939573, + "grad_norm": 0.4435980021953583, + "learning_rate": 0.00015546601853416343, + "loss": 1.4434, + "step": 17148 + }, + { + "epoch": 0.2228434336133116, + "grad_norm": 0.4812254011631012, + "learning_rate": 0.00015546341907225205, + "loss": 1.52, + "step": 17149 + }, + { + "epoch": 0.22285642815722748, + "grad_norm": 0.33715853095054626, + "learning_rate": 0.00015546081961034068, + "loss": 1.1849, + "step": 17150 + }, + { + "epoch": 0.22286942270114335, + "grad_norm": 0.40500110387802124, + "learning_rate": 0.00015545822014842927, + "loss": 1.3702, + "step": 17151 + }, + { + "epoch": 0.22288241724505922, + "grad_norm": 0.4157017469406128, + "learning_rate": 0.0001554556206865179, + "loss": 1.3938, + "step": 17152 + }, + { + "epoch": 0.2228954117889751, + "grad_norm": 0.2947644889354706, + "learning_rate": 0.00015545302122460652, + "loss": 1.3843, + "step": 17153 + }, + { + "epoch": 0.22290840633289097, + "grad_norm": 0.3716994822025299, + "learning_rate": 0.00015545042176269512, + "loss": 1.3801, + "step": 17154 + }, + { + "epoch": 0.22292140087680684, + "grad_norm": 0.45993781089782715, + "learning_rate": 0.00015544782230078374, + "loss": 1.6304, + "step": 17155 + }, + { + "epoch": 0.22293439542072271, + "grad_norm": 0.3995344340801239, + "learning_rate": 0.00015544522283887234, + "loss": 1.4776, + "step": 17156 + }, + { + "epoch": 0.2229473899646386, + "grad_norm": 0.42422494292259216, + "learning_rate": 0.000155442623376961, + "loss": 1.3458, + "step": 17157 + }, + { + "epoch": 0.22296038450855446, + "grad_norm": 0.3867335915565491, + "learning_rate": 0.0001554400239150496, + "loss": 1.3409, + "step": 17158 + }, + { + "epoch": 0.22297337905247033, + "grad_norm": 0.4358266294002533, + "learning_rate": 0.00015543742445313821, + "loss": 1.6161, + "step": 17159 + }, + { + "epoch": 0.2229863735963862, + "grad_norm": 0.40526410937309265, + "learning_rate": 0.0001554348249912268, + "loss": 1.3947, + "step": 17160 + }, + { + "epoch": 0.22299936814030208, + "grad_norm": 0.423902690410614, + "learning_rate": 0.00015543222552931544, + "loss": 1.4279, + "step": 17161 + }, + { + "epoch": 0.22301236268421795, + "grad_norm": 0.34187567234039307, + "learning_rate": 0.00015542962606740406, + "loss": 1.4244, + "step": 17162 + }, + { + "epoch": 0.22302535722813382, + "grad_norm": 0.39992156624794006, + "learning_rate": 0.00015542702660549266, + "loss": 1.3955, + "step": 17163 + }, + { + "epoch": 0.2230383517720497, + "grad_norm": 0.4114689230918884, + "learning_rate": 0.0001554244271435813, + "loss": 1.3662, + "step": 17164 + }, + { + "epoch": 0.22305134631596557, + "grad_norm": 0.43203005194664, + "learning_rate": 0.0001554218276816699, + "loss": 1.3727, + "step": 17165 + }, + { + "epoch": 0.22306434085988144, + "grad_norm": 0.3677416443824768, + "learning_rate": 0.0001554192282197585, + "loss": 1.3425, + "step": 17166 + }, + { + "epoch": 0.22307733540379734, + "grad_norm": 0.4099891781806946, + "learning_rate": 0.00015541662875784713, + "loss": 1.5158, + "step": 17167 + }, + { + "epoch": 0.22309032994771322, + "grad_norm": 0.39120379090309143, + "learning_rate": 0.00015541402929593575, + "loss": 1.3764, + "step": 17168 + }, + { + "epoch": 0.2231033244916291, + "grad_norm": 0.47283923625946045, + "learning_rate": 0.00015541142983402438, + "loss": 1.3689, + "step": 17169 + }, + { + "epoch": 0.22311631903554496, + "grad_norm": 0.47374963760375977, + "learning_rate": 0.00015540883037211298, + "loss": 1.5228, + "step": 17170 + }, + { + "epoch": 0.22312931357946084, + "grad_norm": 0.41419610381126404, + "learning_rate": 0.0001554062309102016, + "loss": 1.324, + "step": 17171 + }, + { + "epoch": 0.2231423081233767, + "grad_norm": 0.34329530596733093, + "learning_rate": 0.00015540363144829022, + "loss": 1.4898, + "step": 17172 + }, + { + "epoch": 0.22315530266729258, + "grad_norm": 0.4290868639945984, + "learning_rate": 0.00015540103198637882, + "loss": 1.3483, + "step": 17173 + }, + { + "epoch": 0.22316829721120846, + "grad_norm": 0.4576382339000702, + "learning_rate": 0.00015539843252446745, + "loss": 1.3437, + "step": 17174 + }, + { + "epoch": 0.22318129175512433, + "grad_norm": 0.3294677138328552, + "learning_rate": 0.00015539583306255604, + "loss": 1.1422, + "step": 17175 + }, + { + "epoch": 0.2231942862990402, + "grad_norm": 0.4289793372154236, + "learning_rate": 0.0001553932336006447, + "loss": 1.3833, + "step": 17176 + }, + { + "epoch": 0.22320728084295607, + "grad_norm": 0.3473701775074005, + "learning_rate": 0.0001553906341387333, + "loss": 1.4183, + "step": 17177 + }, + { + "epoch": 0.22322027538687195, + "grad_norm": 0.26277607679367065, + "learning_rate": 0.0001553880346768219, + "loss": 1.1681, + "step": 17178 + }, + { + "epoch": 0.22323326993078782, + "grad_norm": 0.3259830176830292, + "learning_rate": 0.00015538543521491051, + "loss": 1.3871, + "step": 17179 + }, + { + "epoch": 0.2232462644747037, + "grad_norm": 0.3959862291812897, + "learning_rate": 0.00015538283575299914, + "loss": 1.2493, + "step": 17180 + }, + { + "epoch": 0.22325925901861957, + "grad_norm": 0.32133084535598755, + "learning_rate": 0.00015538023629108776, + "loss": 1.4416, + "step": 17181 + }, + { + "epoch": 0.22327225356253544, + "grad_norm": 0.4233432710170746, + "learning_rate": 0.00015537763682917636, + "loss": 1.3732, + "step": 17182 + }, + { + "epoch": 0.2232852481064513, + "grad_norm": 0.42476895451545715, + "learning_rate": 0.00015537503736726499, + "loss": 1.4966, + "step": 17183 + }, + { + "epoch": 0.22329824265036718, + "grad_norm": 0.39380794763565063, + "learning_rate": 0.0001553724379053536, + "loss": 1.3662, + "step": 17184 + }, + { + "epoch": 0.22331123719428306, + "grad_norm": 0.48878365755081177, + "learning_rate": 0.0001553698384434422, + "loss": 1.5692, + "step": 17185 + }, + { + "epoch": 0.22332423173819893, + "grad_norm": 0.35221439599990845, + "learning_rate": 0.00015536723898153083, + "loss": 1.4879, + "step": 17186 + }, + { + "epoch": 0.2233372262821148, + "grad_norm": 0.46363532543182373, + "learning_rate": 0.00015536463951961943, + "loss": 1.6407, + "step": 17187 + }, + { + "epoch": 0.22335022082603068, + "grad_norm": 0.39026781916618347, + "learning_rate": 0.00015536204005770808, + "loss": 1.5055, + "step": 17188 + }, + { + "epoch": 0.22336321536994655, + "grad_norm": 0.352279931306839, + "learning_rate": 0.00015535944059579668, + "loss": 1.4515, + "step": 17189 + }, + { + "epoch": 0.22337620991386242, + "grad_norm": 0.3973168134689331, + "learning_rate": 0.0001553568411338853, + "loss": 1.5286, + "step": 17190 + }, + { + "epoch": 0.2233892044577783, + "grad_norm": 0.3869595527648926, + "learning_rate": 0.0001553542416719739, + "loss": 1.4942, + "step": 17191 + }, + { + "epoch": 0.22340219900169417, + "grad_norm": 0.3624345660209656, + "learning_rate": 0.00015535164221006252, + "loss": 1.4501, + "step": 17192 + }, + { + "epoch": 0.22341519354561004, + "grad_norm": 0.2924130856990814, + "learning_rate": 0.00015534904274815115, + "loss": 1.1925, + "step": 17193 + }, + { + "epoch": 0.22342818808952591, + "grad_norm": 0.3422030806541443, + "learning_rate": 0.00015534644328623975, + "loss": 1.4121, + "step": 17194 + }, + { + "epoch": 0.2234411826334418, + "grad_norm": 0.4656819701194763, + "learning_rate": 0.00015534384382432837, + "loss": 1.3132, + "step": 17195 + }, + { + "epoch": 0.22345417717735766, + "grad_norm": 0.41130486130714417, + "learning_rate": 0.000155341244362417, + "loss": 1.4668, + "step": 17196 + }, + { + "epoch": 0.22346717172127353, + "grad_norm": 0.3827744722366333, + "learning_rate": 0.0001553386449005056, + "loss": 1.2714, + "step": 17197 + }, + { + "epoch": 0.2234801662651894, + "grad_norm": 0.3936939239501953, + "learning_rate": 0.00015533604543859422, + "loss": 1.3407, + "step": 17198 + }, + { + "epoch": 0.22349316080910528, + "grad_norm": 0.31146296858787537, + "learning_rate": 0.00015533344597668281, + "loss": 1.5786, + "step": 17199 + }, + { + "epoch": 0.22350615535302115, + "grad_norm": 0.32435494661331177, + "learning_rate": 0.00015533084651477147, + "loss": 1.3857, + "step": 17200 + }, + { + "epoch": 0.22351914989693702, + "grad_norm": 0.40252891182899475, + "learning_rate": 0.00015532824705286006, + "loss": 1.4384, + "step": 17201 + }, + { + "epoch": 0.2235321444408529, + "grad_norm": 0.4690648317337036, + "learning_rate": 0.0001553256475909487, + "loss": 1.5365, + "step": 17202 + }, + { + "epoch": 0.22354513898476877, + "grad_norm": 0.3450070023536682, + "learning_rate": 0.0001553230481290373, + "loss": 1.478, + "step": 17203 + }, + { + "epoch": 0.22355813352868464, + "grad_norm": 0.47981390357017517, + "learning_rate": 0.0001553204486671259, + "loss": 1.4841, + "step": 17204 + }, + { + "epoch": 0.22357112807260052, + "grad_norm": 0.3812151551246643, + "learning_rate": 0.00015531784920521453, + "loss": 1.4281, + "step": 17205 + }, + { + "epoch": 0.2235841226165164, + "grad_norm": 0.43522951006889343, + "learning_rate": 0.00015531524974330313, + "loss": 1.4512, + "step": 17206 + }, + { + "epoch": 0.22359711716043226, + "grad_norm": 0.4600542485713959, + "learning_rate": 0.00015531265028139176, + "loss": 1.3144, + "step": 17207 + }, + { + "epoch": 0.22361011170434814, + "grad_norm": 0.36629295349121094, + "learning_rate": 0.00015531005081948038, + "loss": 1.4347, + "step": 17208 + }, + { + "epoch": 0.223623106248264, + "grad_norm": 0.36778852343559265, + "learning_rate": 0.00015530745135756898, + "loss": 1.4107, + "step": 17209 + }, + { + "epoch": 0.22363610079217988, + "grad_norm": 0.40101122856140137, + "learning_rate": 0.0001553048518956576, + "loss": 1.2283, + "step": 17210 + }, + { + "epoch": 0.22364909533609575, + "grad_norm": 0.4092688262462616, + "learning_rate": 0.00015530225243374623, + "loss": 1.4168, + "step": 17211 + }, + { + "epoch": 0.22366208988001163, + "grad_norm": 0.2870953679084778, + "learning_rate": 0.00015529965297183485, + "loss": 1.2667, + "step": 17212 + }, + { + "epoch": 0.2236750844239275, + "grad_norm": 0.3662099540233612, + "learning_rate": 0.00015529705350992345, + "loss": 1.4113, + "step": 17213 + }, + { + "epoch": 0.22368807896784337, + "grad_norm": 0.416275292634964, + "learning_rate": 0.00015529445404801207, + "loss": 1.4527, + "step": 17214 + }, + { + "epoch": 0.22370107351175925, + "grad_norm": 0.37154337763786316, + "learning_rate": 0.0001552918545861007, + "loss": 1.3959, + "step": 17215 + }, + { + "epoch": 0.22371406805567512, + "grad_norm": 0.35895952582359314, + "learning_rate": 0.0001552892551241893, + "loss": 1.3032, + "step": 17216 + }, + { + "epoch": 0.223727062599591, + "grad_norm": 0.3686949908733368, + "learning_rate": 0.00015528665566227792, + "loss": 1.2793, + "step": 17217 + }, + { + "epoch": 0.22374005714350687, + "grad_norm": 0.4785311222076416, + "learning_rate": 0.00015528405620036652, + "loss": 1.4317, + "step": 17218 + }, + { + "epoch": 0.22375305168742274, + "grad_norm": 0.3777119815349579, + "learning_rate": 0.00015528145673845517, + "loss": 1.4399, + "step": 17219 + }, + { + "epoch": 0.2237660462313386, + "grad_norm": 0.3776816427707672, + "learning_rate": 0.00015527885727654377, + "loss": 1.5438, + "step": 17220 + }, + { + "epoch": 0.22377904077525448, + "grad_norm": 0.43836694955825806, + "learning_rate": 0.00015527625781463236, + "loss": 1.5241, + "step": 17221 + }, + { + "epoch": 0.22379203531917036, + "grad_norm": 0.47422000765800476, + "learning_rate": 0.000155273658352721, + "loss": 1.5175, + "step": 17222 + }, + { + "epoch": 0.22380502986308623, + "grad_norm": 0.4537494480609894, + "learning_rate": 0.0001552710588908096, + "loss": 1.5111, + "step": 17223 + }, + { + "epoch": 0.2238180244070021, + "grad_norm": 0.37764039635658264, + "learning_rate": 0.00015526845942889824, + "loss": 1.3176, + "step": 17224 + }, + { + "epoch": 0.22383101895091798, + "grad_norm": 0.4033384919166565, + "learning_rate": 0.00015526585996698683, + "loss": 1.3036, + "step": 17225 + }, + { + "epoch": 0.22384401349483385, + "grad_norm": 0.4137011766433716, + "learning_rate": 0.00015526326050507546, + "loss": 1.4036, + "step": 17226 + }, + { + "epoch": 0.22385700803874972, + "grad_norm": 0.36519598960876465, + "learning_rate": 0.00015526066104316408, + "loss": 1.0906, + "step": 17227 + }, + { + "epoch": 0.2238700025826656, + "grad_norm": 0.34954115748405457, + "learning_rate": 0.00015525806158125268, + "loss": 1.3285, + "step": 17228 + }, + { + "epoch": 0.22388299712658147, + "grad_norm": 0.392419695854187, + "learning_rate": 0.0001552554621193413, + "loss": 1.475, + "step": 17229 + }, + { + "epoch": 0.22389599167049734, + "grad_norm": 0.4142645001411438, + "learning_rate": 0.0001552528626574299, + "loss": 1.3739, + "step": 17230 + }, + { + "epoch": 0.2239089862144132, + "grad_norm": 0.32194235920906067, + "learning_rate": 0.00015525026319551855, + "loss": 1.4845, + "step": 17231 + }, + { + "epoch": 0.2239219807583291, + "grad_norm": 0.3668786585330963, + "learning_rate": 0.00015524766373360715, + "loss": 1.3907, + "step": 17232 + }, + { + "epoch": 0.22393497530224496, + "grad_norm": 0.40856194496154785, + "learning_rate": 0.00015524506427169575, + "loss": 1.3633, + "step": 17233 + }, + { + "epoch": 0.22394796984616083, + "grad_norm": 0.38559821248054504, + "learning_rate": 0.00015524246480978437, + "loss": 1.3355, + "step": 17234 + }, + { + "epoch": 0.2239609643900767, + "grad_norm": 0.44653359055519104, + "learning_rate": 0.000155239865347873, + "loss": 1.4567, + "step": 17235 + }, + { + "epoch": 0.22397395893399258, + "grad_norm": 0.3987303376197815, + "learning_rate": 0.00015523726588596162, + "loss": 1.4833, + "step": 17236 + }, + { + "epoch": 0.22398695347790845, + "grad_norm": 0.43988803029060364, + "learning_rate": 0.00015523466642405022, + "loss": 1.4035, + "step": 17237 + }, + { + "epoch": 0.22399994802182432, + "grad_norm": 0.40534988045692444, + "learning_rate": 0.00015523206696213884, + "loss": 1.3272, + "step": 17238 + }, + { + "epoch": 0.2240129425657402, + "grad_norm": 0.3910040855407715, + "learning_rate": 0.00015522946750022747, + "loss": 1.4318, + "step": 17239 + }, + { + "epoch": 0.22402593710965607, + "grad_norm": 0.3395826816558838, + "learning_rate": 0.00015522686803831607, + "loss": 1.5333, + "step": 17240 + }, + { + "epoch": 0.22403893165357194, + "grad_norm": 0.3173440396785736, + "learning_rate": 0.0001552242685764047, + "loss": 1.4444, + "step": 17241 + }, + { + "epoch": 0.22405192619748782, + "grad_norm": 0.3964594304561615, + "learning_rate": 0.00015522166911449331, + "loss": 1.2549, + "step": 17242 + }, + { + "epoch": 0.22406492074140372, + "grad_norm": 0.3530697822570801, + "learning_rate": 0.00015521906965258194, + "loss": 1.4242, + "step": 17243 + }, + { + "epoch": 0.2240779152853196, + "grad_norm": 0.4321229159832001, + "learning_rate": 0.00015521647019067054, + "loss": 1.604, + "step": 17244 + }, + { + "epoch": 0.22409090982923546, + "grad_norm": 0.4241045415401459, + "learning_rate": 0.00015521387072875913, + "loss": 1.3933, + "step": 17245 + }, + { + "epoch": 0.22410390437315134, + "grad_norm": 0.4098696708679199, + "learning_rate": 0.00015521127126684779, + "loss": 1.6097, + "step": 17246 + }, + { + "epoch": 0.2241168989170672, + "grad_norm": 0.33041030168533325, + "learning_rate": 0.00015520867180493638, + "loss": 1.4669, + "step": 17247 + }, + { + "epoch": 0.22412989346098308, + "grad_norm": 0.4452991187572479, + "learning_rate": 0.000155206072343025, + "loss": 1.3393, + "step": 17248 + }, + { + "epoch": 0.22414288800489895, + "grad_norm": 0.3844544291496277, + "learning_rate": 0.0001552034728811136, + "loss": 1.3795, + "step": 17249 + }, + { + "epoch": 0.22415588254881483, + "grad_norm": 0.43733423948287964, + "learning_rate": 0.00015520087341920223, + "loss": 1.4092, + "step": 17250 + }, + { + "epoch": 0.2241688770927307, + "grad_norm": 0.41477397084236145, + "learning_rate": 0.00015519827395729085, + "loss": 1.376, + "step": 17251 + }, + { + "epoch": 0.22418187163664657, + "grad_norm": 0.4083769619464874, + "learning_rate": 0.00015519567449537945, + "loss": 1.4334, + "step": 17252 + }, + { + "epoch": 0.22419486618056245, + "grad_norm": 0.4906991720199585, + "learning_rate": 0.00015519307503346808, + "loss": 1.2715, + "step": 17253 + }, + { + "epoch": 0.22420786072447832, + "grad_norm": 0.40970197319984436, + "learning_rate": 0.0001551904755715567, + "loss": 1.4685, + "step": 17254 + }, + { + "epoch": 0.2242208552683942, + "grad_norm": 0.3219669461250305, + "learning_rate": 0.00015518787610964532, + "loss": 1.2984, + "step": 17255 + }, + { + "epoch": 0.22423384981231007, + "grad_norm": 0.4801519513130188, + "learning_rate": 0.00015518527664773392, + "loss": 1.4003, + "step": 17256 + }, + { + "epoch": 0.22424684435622594, + "grad_norm": 0.42577916383743286, + "learning_rate": 0.00015518267718582255, + "loss": 1.4536, + "step": 17257 + }, + { + "epoch": 0.2242598389001418, + "grad_norm": 0.34381628036499023, + "learning_rate": 0.00015518007772391117, + "loss": 1.4981, + "step": 17258 + }, + { + "epoch": 0.22427283344405768, + "grad_norm": 0.4634552001953125, + "learning_rate": 0.00015517747826199977, + "loss": 1.3198, + "step": 17259 + }, + { + "epoch": 0.22428582798797356, + "grad_norm": 0.3238271176815033, + "learning_rate": 0.0001551748788000884, + "loss": 1.3033, + "step": 17260 + }, + { + "epoch": 0.22429882253188943, + "grad_norm": 0.4381254017353058, + "learning_rate": 0.000155172279338177, + "loss": 1.3442, + "step": 17261 + }, + { + "epoch": 0.2243118170758053, + "grad_norm": 0.3971615433692932, + "learning_rate": 0.00015516967987626561, + "loss": 1.453, + "step": 17262 + }, + { + "epoch": 0.22432481161972118, + "grad_norm": 0.4603961110115051, + "learning_rate": 0.00015516708041435424, + "loss": 1.5408, + "step": 17263 + }, + { + "epoch": 0.22433780616363705, + "grad_norm": 0.3853374123573303, + "learning_rate": 0.00015516448095244284, + "loss": 1.4496, + "step": 17264 + }, + { + "epoch": 0.22435080070755292, + "grad_norm": 0.4377438724040985, + "learning_rate": 0.00015516188149053146, + "loss": 1.3491, + "step": 17265 + }, + { + "epoch": 0.2243637952514688, + "grad_norm": 0.3321067690849304, + "learning_rate": 0.00015515928202862009, + "loss": 1.3057, + "step": 17266 + }, + { + "epoch": 0.22437678979538467, + "grad_norm": 0.4278276860713959, + "learning_rate": 0.0001551566825667087, + "loss": 1.3219, + "step": 17267 + }, + { + "epoch": 0.22438978433930054, + "grad_norm": 0.42417049407958984, + "learning_rate": 0.0001551540831047973, + "loss": 1.5954, + "step": 17268 + }, + { + "epoch": 0.2244027788832164, + "grad_norm": 0.30943602323532104, + "learning_rate": 0.00015515148364288593, + "loss": 1.2925, + "step": 17269 + }, + { + "epoch": 0.2244157734271323, + "grad_norm": 0.42065128684043884, + "learning_rate": 0.00015514888418097456, + "loss": 1.1031, + "step": 17270 + }, + { + "epoch": 0.22442876797104816, + "grad_norm": 0.27368807792663574, + "learning_rate": 0.00015514628471906315, + "loss": 1.0893, + "step": 17271 + }, + { + "epoch": 0.22444176251496403, + "grad_norm": 0.39994674921035767, + "learning_rate": 0.00015514368525715178, + "loss": 1.2647, + "step": 17272 + }, + { + "epoch": 0.2244547570588799, + "grad_norm": 0.39597436785697937, + "learning_rate": 0.00015514108579524038, + "loss": 1.4117, + "step": 17273 + }, + { + "epoch": 0.22446775160279578, + "grad_norm": 0.34959545731544495, + "learning_rate": 0.00015513848633332903, + "loss": 1.5133, + "step": 17274 + }, + { + "epoch": 0.22448074614671165, + "grad_norm": 0.40860414505004883, + "learning_rate": 0.00015513588687141762, + "loss": 1.4046, + "step": 17275 + }, + { + "epoch": 0.22449374069062752, + "grad_norm": 0.5460940599441528, + "learning_rate": 0.00015513328740950622, + "loss": 1.5928, + "step": 17276 + }, + { + "epoch": 0.2245067352345434, + "grad_norm": 0.4546952545642853, + "learning_rate": 0.00015513068794759487, + "loss": 1.5112, + "step": 17277 + }, + { + "epoch": 0.22451972977845927, + "grad_norm": 0.4209045171737671, + "learning_rate": 0.00015512808848568347, + "loss": 1.3803, + "step": 17278 + }, + { + "epoch": 0.22453272432237514, + "grad_norm": 0.36467427015304565, + "learning_rate": 0.0001551254890237721, + "loss": 1.3589, + "step": 17279 + }, + { + "epoch": 0.22454571886629102, + "grad_norm": 0.28918224573135376, + "learning_rate": 0.0001551228895618607, + "loss": 1.1951, + "step": 17280 + }, + { + "epoch": 0.2245587134102069, + "grad_norm": 0.2822990417480469, + "learning_rate": 0.00015512029009994932, + "loss": 1.2312, + "step": 17281 + }, + { + "epoch": 0.22457170795412276, + "grad_norm": 0.38115745782852173, + "learning_rate": 0.00015511769063803794, + "loss": 1.4074, + "step": 17282 + }, + { + "epoch": 0.22458470249803864, + "grad_norm": 0.31620487570762634, + "learning_rate": 0.00015511509117612654, + "loss": 1.4553, + "step": 17283 + }, + { + "epoch": 0.2245976970419545, + "grad_norm": 0.5735006332397461, + "learning_rate": 0.00015511249171421516, + "loss": 1.4612, + "step": 17284 + }, + { + "epoch": 0.22461069158587038, + "grad_norm": 0.41778677701950073, + "learning_rate": 0.0001551098922523038, + "loss": 1.48, + "step": 17285 + }, + { + "epoch": 0.22462368612978625, + "grad_norm": 0.39154380559921265, + "learning_rate": 0.0001551072927903924, + "loss": 1.1666, + "step": 17286 + }, + { + "epoch": 0.22463668067370213, + "grad_norm": 0.5108563303947449, + "learning_rate": 0.000155104693328481, + "loss": 1.4689, + "step": 17287 + }, + { + "epoch": 0.224649675217618, + "grad_norm": 0.34346339106559753, + "learning_rate": 0.0001551020938665696, + "loss": 1.2787, + "step": 17288 + }, + { + "epoch": 0.22466266976153387, + "grad_norm": 0.406199187040329, + "learning_rate": 0.00015509949440465826, + "loss": 1.4693, + "step": 17289 + }, + { + "epoch": 0.22467566430544975, + "grad_norm": 0.46047765016555786, + "learning_rate": 0.00015509689494274686, + "loss": 1.6481, + "step": 17290 + }, + { + "epoch": 0.22468865884936562, + "grad_norm": 0.44907137751579285, + "learning_rate": 0.00015509429548083548, + "loss": 1.3559, + "step": 17291 + }, + { + "epoch": 0.2247016533932815, + "grad_norm": 0.45858585834503174, + "learning_rate": 0.00015509169601892408, + "loss": 1.4427, + "step": 17292 + }, + { + "epoch": 0.22471464793719736, + "grad_norm": 0.3390357792377472, + "learning_rate": 0.0001550890965570127, + "loss": 1.3068, + "step": 17293 + }, + { + "epoch": 0.22472764248111324, + "grad_norm": 0.39388394355773926, + "learning_rate": 0.00015508649709510133, + "loss": 1.5405, + "step": 17294 + }, + { + "epoch": 0.2247406370250291, + "grad_norm": 0.3702054023742676, + "learning_rate": 0.00015508389763318992, + "loss": 1.2752, + "step": 17295 + }, + { + "epoch": 0.22475363156894498, + "grad_norm": 0.383247435092926, + "learning_rate": 0.00015508129817127855, + "loss": 1.4986, + "step": 17296 + }, + { + "epoch": 0.22476662611286086, + "grad_norm": 0.42690354585647583, + "learning_rate": 0.00015507869870936717, + "loss": 1.4135, + "step": 17297 + }, + { + "epoch": 0.22477962065677673, + "grad_norm": 0.39397284388542175, + "learning_rate": 0.0001550760992474558, + "loss": 1.3382, + "step": 17298 + }, + { + "epoch": 0.2247926152006926, + "grad_norm": 0.460771381855011, + "learning_rate": 0.0001550734997855444, + "loss": 1.3601, + "step": 17299 + }, + { + "epoch": 0.22480560974460848, + "grad_norm": 0.3621922433376312, + "learning_rate": 0.000155070900323633, + "loss": 1.2357, + "step": 17300 + }, + { + "epoch": 0.22481860428852435, + "grad_norm": 0.43352898955345154, + "learning_rate": 0.00015506830086172164, + "loss": 1.4183, + "step": 17301 + }, + { + "epoch": 0.22483159883244022, + "grad_norm": 0.45269906520843506, + "learning_rate": 0.00015506570139981024, + "loss": 1.3568, + "step": 17302 + }, + { + "epoch": 0.2248445933763561, + "grad_norm": 0.508004903793335, + "learning_rate": 0.00015506310193789887, + "loss": 1.3575, + "step": 17303 + }, + { + "epoch": 0.22485758792027197, + "grad_norm": 0.42237740755081177, + "learning_rate": 0.00015506050247598746, + "loss": 1.4219, + "step": 17304 + }, + { + "epoch": 0.22487058246418784, + "grad_norm": 0.3603668510913849, + "learning_rate": 0.0001550579030140761, + "loss": 1.4793, + "step": 17305 + }, + { + "epoch": 0.2248835770081037, + "grad_norm": 0.43718016147613525, + "learning_rate": 0.0001550553035521647, + "loss": 1.4276, + "step": 17306 + }, + { + "epoch": 0.2248965715520196, + "grad_norm": 0.36309847235679626, + "learning_rate": 0.0001550527040902533, + "loss": 1.4366, + "step": 17307 + }, + { + "epoch": 0.22490956609593546, + "grad_norm": 0.37834012508392334, + "learning_rate": 0.00015505010462834193, + "loss": 1.393, + "step": 17308 + }, + { + "epoch": 0.22492256063985133, + "grad_norm": 0.3192480802536011, + "learning_rate": 0.00015504750516643056, + "loss": 1.2919, + "step": 17309 + }, + { + "epoch": 0.2249355551837672, + "grad_norm": 0.4102786183357239, + "learning_rate": 0.00015504490570451918, + "loss": 1.5319, + "step": 17310 + }, + { + "epoch": 0.22494854972768308, + "grad_norm": 0.42493653297424316, + "learning_rate": 0.00015504230624260778, + "loss": 1.2968, + "step": 17311 + }, + { + "epoch": 0.22496154427159895, + "grad_norm": 0.41623684763908386, + "learning_rate": 0.0001550397067806964, + "loss": 1.5455, + "step": 17312 + }, + { + "epoch": 0.22497453881551482, + "grad_norm": 0.295663058757782, + "learning_rate": 0.00015503710731878503, + "loss": 1.2794, + "step": 17313 + }, + { + "epoch": 0.2249875333594307, + "grad_norm": 0.4800005257129669, + "learning_rate": 0.00015503450785687363, + "loss": 1.4411, + "step": 17314 + }, + { + "epoch": 0.22500052790334657, + "grad_norm": 0.4075014293193817, + "learning_rate": 0.00015503190839496225, + "loss": 1.1826, + "step": 17315 + }, + { + "epoch": 0.22501352244726244, + "grad_norm": 0.40996208786964417, + "learning_rate": 0.00015502930893305088, + "loss": 1.3666, + "step": 17316 + }, + { + "epoch": 0.22502651699117832, + "grad_norm": 0.385638028383255, + "learning_rate": 0.00015502670947113947, + "loss": 1.4855, + "step": 17317 + }, + { + "epoch": 0.2250395115350942, + "grad_norm": 0.35620033740997314, + "learning_rate": 0.0001550241100092281, + "loss": 1.4412, + "step": 17318 + }, + { + "epoch": 0.22505250607901006, + "grad_norm": 0.40918657183647156, + "learning_rate": 0.0001550215105473167, + "loss": 1.5307, + "step": 17319 + }, + { + "epoch": 0.22506550062292596, + "grad_norm": 0.36366400122642517, + "learning_rate": 0.00015501891108540535, + "loss": 1.4665, + "step": 17320 + }, + { + "epoch": 0.22507849516684184, + "grad_norm": 0.3885035812854767, + "learning_rate": 0.00015501631162349394, + "loss": 1.3861, + "step": 17321 + }, + { + "epoch": 0.2250914897107577, + "grad_norm": 0.4904419183731079, + "learning_rate": 0.00015501371216158257, + "loss": 1.4437, + "step": 17322 + }, + { + "epoch": 0.22510448425467358, + "grad_norm": 0.34057196974754333, + "learning_rate": 0.00015501111269967117, + "loss": 1.2526, + "step": 17323 + }, + { + "epoch": 0.22511747879858945, + "grad_norm": 0.4373798072338104, + "learning_rate": 0.0001550085132377598, + "loss": 1.395, + "step": 17324 + }, + { + "epoch": 0.22513047334250533, + "grad_norm": 0.45585834980010986, + "learning_rate": 0.00015500591377584842, + "loss": 1.5725, + "step": 17325 + }, + { + "epoch": 0.2251434678864212, + "grad_norm": 0.3861900866031647, + "learning_rate": 0.000155003314313937, + "loss": 1.4385, + "step": 17326 + }, + { + "epoch": 0.22515646243033707, + "grad_norm": 0.47984573245048523, + "learning_rate": 0.00015500071485202564, + "loss": 1.5295, + "step": 17327 + }, + { + "epoch": 0.22516945697425295, + "grad_norm": 0.43926528096199036, + "learning_rate": 0.00015499811539011426, + "loss": 1.6313, + "step": 17328 + }, + { + "epoch": 0.22518245151816882, + "grad_norm": 0.44269147515296936, + "learning_rate": 0.00015499551592820286, + "loss": 1.6328, + "step": 17329 + }, + { + "epoch": 0.2251954460620847, + "grad_norm": 0.410132497549057, + "learning_rate": 0.00015499291646629148, + "loss": 1.428, + "step": 17330 + }, + { + "epoch": 0.22520844060600057, + "grad_norm": 0.38601186871528625, + "learning_rate": 0.00015499031700438008, + "loss": 1.3893, + "step": 17331 + }, + { + "epoch": 0.22522143514991644, + "grad_norm": 0.32521939277648926, + "learning_rate": 0.00015498771754246873, + "loss": 1.2513, + "step": 17332 + }, + { + "epoch": 0.2252344296938323, + "grad_norm": 0.5738088488578796, + "learning_rate": 0.00015498511808055733, + "loss": 1.5438, + "step": 17333 + }, + { + "epoch": 0.22524742423774818, + "grad_norm": 0.3105775713920593, + "learning_rate": 0.00015498251861864595, + "loss": 1.234, + "step": 17334 + }, + { + "epoch": 0.22526041878166406, + "grad_norm": 0.41933226585388184, + "learning_rate": 0.00015497991915673455, + "loss": 1.6983, + "step": 17335 + }, + { + "epoch": 0.22527341332557993, + "grad_norm": 0.37573865056037903, + "learning_rate": 0.00015497731969482318, + "loss": 1.2763, + "step": 17336 + }, + { + "epoch": 0.2252864078694958, + "grad_norm": 0.4303555488586426, + "learning_rate": 0.0001549747202329118, + "loss": 1.429, + "step": 17337 + }, + { + "epoch": 0.22529940241341168, + "grad_norm": 0.43803417682647705, + "learning_rate": 0.0001549721207710004, + "loss": 1.4104, + "step": 17338 + }, + { + "epoch": 0.22531239695732755, + "grad_norm": 0.33352023363113403, + "learning_rate": 0.00015496952130908902, + "loss": 1.406, + "step": 17339 + }, + { + "epoch": 0.22532539150124342, + "grad_norm": 0.4034109115600586, + "learning_rate": 0.00015496692184717765, + "loss": 1.5581, + "step": 17340 + }, + { + "epoch": 0.2253383860451593, + "grad_norm": 0.2970642149448395, + "learning_rate": 0.00015496432238526627, + "loss": 1.3252, + "step": 17341 + }, + { + "epoch": 0.22535138058907517, + "grad_norm": 0.3742406964302063, + "learning_rate": 0.00015496172292335487, + "loss": 1.4519, + "step": 17342 + }, + { + "epoch": 0.22536437513299104, + "grad_norm": 0.38583990931510925, + "learning_rate": 0.00015495912346144347, + "loss": 1.2261, + "step": 17343 + }, + { + "epoch": 0.2253773696769069, + "grad_norm": 0.3115047812461853, + "learning_rate": 0.00015495652399953212, + "loss": 1.3512, + "step": 17344 + }, + { + "epoch": 0.2253903642208228, + "grad_norm": 0.3753218352794647, + "learning_rate": 0.00015495392453762072, + "loss": 1.4042, + "step": 17345 + }, + { + "epoch": 0.22540335876473866, + "grad_norm": 0.4485861361026764, + "learning_rate": 0.00015495132507570934, + "loss": 1.4968, + "step": 17346 + }, + { + "epoch": 0.22541635330865453, + "grad_norm": 0.35710158944129944, + "learning_rate": 0.00015494872561379794, + "loss": 1.2975, + "step": 17347 + }, + { + "epoch": 0.2254293478525704, + "grad_norm": 0.3344789743423462, + "learning_rate": 0.00015494612615188656, + "loss": 1.4141, + "step": 17348 + }, + { + "epoch": 0.22544234239648628, + "grad_norm": 0.44867363572120667, + "learning_rate": 0.00015494352668997519, + "loss": 1.5023, + "step": 17349 + }, + { + "epoch": 0.22545533694040215, + "grad_norm": 0.4031669795513153, + "learning_rate": 0.00015494092722806378, + "loss": 1.2803, + "step": 17350 + }, + { + "epoch": 0.22546833148431802, + "grad_norm": 0.4371286928653717, + "learning_rate": 0.00015493832776615243, + "loss": 1.3404, + "step": 17351 + }, + { + "epoch": 0.2254813260282339, + "grad_norm": 0.4148988723754883, + "learning_rate": 0.00015493572830424103, + "loss": 1.5122, + "step": 17352 + }, + { + "epoch": 0.22549432057214977, + "grad_norm": 0.3802322447299957, + "learning_rate": 0.00015493312884232966, + "loss": 1.4035, + "step": 17353 + }, + { + "epoch": 0.22550731511606564, + "grad_norm": 0.4195025563240051, + "learning_rate": 0.00015493052938041825, + "loss": 1.4206, + "step": 17354 + }, + { + "epoch": 0.22552030965998152, + "grad_norm": 0.40878820419311523, + "learning_rate": 0.00015492792991850688, + "loss": 1.4791, + "step": 17355 + }, + { + "epoch": 0.2255333042038974, + "grad_norm": 0.41589826345443726, + "learning_rate": 0.0001549253304565955, + "loss": 1.3459, + "step": 17356 + }, + { + "epoch": 0.22554629874781326, + "grad_norm": 0.44983240962028503, + "learning_rate": 0.0001549227309946841, + "loss": 1.5836, + "step": 17357 + }, + { + "epoch": 0.22555929329172913, + "grad_norm": 0.43190452456474304, + "learning_rate": 0.00015492013153277273, + "loss": 1.4565, + "step": 17358 + }, + { + "epoch": 0.225572287835645, + "grad_norm": 0.46866104006767273, + "learning_rate": 0.00015491753207086135, + "loss": 1.4345, + "step": 17359 + }, + { + "epoch": 0.22558528237956088, + "grad_norm": 0.38252729177474976, + "learning_rate": 0.00015491493260894995, + "loss": 1.4523, + "step": 17360 + }, + { + "epoch": 0.22559827692347675, + "grad_norm": 0.4475078880786896, + "learning_rate": 0.00015491233314703857, + "loss": 1.5079, + "step": 17361 + }, + { + "epoch": 0.22561127146739263, + "grad_norm": 0.3892700672149658, + "learning_rate": 0.00015490973368512717, + "loss": 1.4031, + "step": 17362 + }, + { + "epoch": 0.2256242660113085, + "grad_norm": 0.34709739685058594, + "learning_rate": 0.00015490713422321582, + "loss": 1.3772, + "step": 17363 + }, + { + "epoch": 0.22563726055522437, + "grad_norm": 0.47149357199668884, + "learning_rate": 0.00015490453476130442, + "loss": 1.5625, + "step": 17364 + }, + { + "epoch": 0.22565025509914025, + "grad_norm": 0.2849879860877991, + "learning_rate": 0.00015490193529939304, + "loss": 1.4355, + "step": 17365 + }, + { + "epoch": 0.22566324964305612, + "grad_norm": 0.3187899887561798, + "learning_rate": 0.00015489933583748164, + "loss": 1.2053, + "step": 17366 + }, + { + "epoch": 0.225676244186972, + "grad_norm": 0.43930551409721375, + "learning_rate": 0.00015489673637557026, + "loss": 1.4115, + "step": 17367 + }, + { + "epoch": 0.22568923873088786, + "grad_norm": 0.2609654664993286, + "learning_rate": 0.0001548941369136589, + "loss": 1.2889, + "step": 17368 + }, + { + "epoch": 0.22570223327480374, + "grad_norm": 0.29785192012786865, + "learning_rate": 0.00015489153745174749, + "loss": 1.2689, + "step": 17369 + }, + { + "epoch": 0.2257152278187196, + "grad_norm": 0.424932062625885, + "learning_rate": 0.0001548889379898361, + "loss": 1.3087, + "step": 17370 + }, + { + "epoch": 0.22572822236263548, + "grad_norm": 0.4638703763484955, + "learning_rate": 0.00015488633852792473, + "loss": 1.5136, + "step": 17371 + }, + { + "epoch": 0.22574121690655136, + "grad_norm": 0.4499339461326599, + "learning_rate": 0.00015488373906601333, + "loss": 1.4435, + "step": 17372 + }, + { + "epoch": 0.22575421145046723, + "grad_norm": 0.3429933190345764, + "learning_rate": 0.00015488113960410196, + "loss": 1.4743, + "step": 17373 + }, + { + "epoch": 0.2257672059943831, + "grad_norm": 0.41518691182136536, + "learning_rate": 0.00015487854014219055, + "loss": 1.5183, + "step": 17374 + }, + { + "epoch": 0.22578020053829898, + "grad_norm": 0.39178723096847534, + "learning_rate": 0.0001548759406802792, + "loss": 1.4559, + "step": 17375 + }, + { + "epoch": 0.22579319508221485, + "grad_norm": 0.47018399834632874, + "learning_rate": 0.0001548733412183678, + "loss": 1.512, + "step": 17376 + }, + { + "epoch": 0.22580618962613072, + "grad_norm": 0.33574777841567993, + "learning_rate": 0.00015487074175645643, + "loss": 1.4443, + "step": 17377 + }, + { + "epoch": 0.2258191841700466, + "grad_norm": 0.32084521651268005, + "learning_rate": 0.00015486814229454502, + "loss": 1.2799, + "step": 17378 + }, + { + "epoch": 0.22583217871396247, + "grad_norm": 0.38468995690345764, + "learning_rate": 0.00015486554283263365, + "loss": 1.3267, + "step": 17379 + }, + { + "epoch": 0.22584517325787834, + "grad_norm": 0.43420401215553284, + "learning_rate": 0.00015486294337072227, + "loss": 1.3912, + "step": 17380 + }, + { + "epoch": 0.2258581678017942, + "grad_norm": 0.4336622953414917, + "learning_rate": 0.00015486034390881087, + "loss": 1.3389, + "step": 17381 + }, + { + "epoch": 0.22587116234571009, + "grad_norm": 0.4239758253097534, + "learning_rate": 0.0001548577444468995, + "loss": 1.3689, + "step": 17382 + }, + { + "epoch": 0.22588415688962596, + "grad_norm": 0.31912335753440857, + "learning_rate": 0.00015485514498498812, + "loss": 1.1682, + "step": 17383 + }, + { + "epoch": 0.22589715143354183, + "grad_norm": 0.342845618724823, + "learning_rate": 0.00015485254552307672, + "loss": 1.4299, + "step": 17384 + }, + { + "epoch": 0.2259101459774577, + "grad_norm": 0.2734147608280182, + "learning_rate": 0.00015484994606116534, + "loss": 1.1489, + "step": 17385 + }, + { + "epoch": 0.22592314052137358, + "grad_norm": 0.4070376753807068, + "learning_rate": 0.00015484734659925397, + "loss": 1.491, + "step": 17386 + }, + { + "epoch": 0.22593613506528945, + "grad_norm": 0.4500395953655243, + "learning_rate": 0.0001548447471373426, + "loss": 1.2911, + "step": 17387 + }, + { + "epoch": 0.22594912960920532, + "grad_norm": 0.3489089906215668, + "learning_rate": 0.0001548421476754312, + "loss": 1.4734, + "step": 17388 + }, + { + "epoch": 0.2259621241531212, + "grad_norm": 0.3487343192100525, + "learning_rate": 0.0001548395482135198, + "loss": 1.5562, + "step": 17389 + }, + { + "epoch": 0.22597511869703707, + "grad_norm": 0.4250304698944092, + "learning_rate": 0.00015483694875160844, + "loss": 1.2573, + "step": 17390 + }, + { + "epoch": 0.22598811324095294, + "grad_norm": 0.43247562646865845, + "learning_rate": 0.00015483434928969703, + "loss": 1.4344, + "step": 17391 + }, + { + "epoch": 0.22600110778486882, + "grad_norm": 0.47394558787345886, + "learning_rate": 0.00015483174982778566, + "loss": 1.3823, + "step": 17392 + }, + { + "epoch": 0.2260141023287847, + "grad_norm": 0.3615734875202179, + "learning_rate": 0.00015482915036587426, + "loss": 1.4962, + "step": 17393 + }, + { + "epoch": 0.22602709687270056, + "grad_norm": 0.37008240818977356, + "learning_rate": 0.0001548265509039629, + "loss": 1.3689, + "step": 17394 + }, + { + "epoch": 0.22604009141661643, + "grad_norm": 0.47572454810142517, + "learning_rate": 0.0001548239514420515, + "loss": 1.4971, + "step": 17395 + }, + { + "epoch": 0.22605308596053234, + "grad_norm": 0.47684207558631897, + "learning_rate": 0.00015482135198014013, + "loss": 1.4241, + "step": 17396 + }, + { + "epoch": 0.2260660805044482, + "grad_norm": 0.42600762844085693, + "learning_rate": 0.00015481875251822873, + "loss": 1.4819, + "step": 17397 + }, + { + "epoch": 0.22607907504836408, + "grad_norm": 0.44199568033218384, + "learning_rate": 0.00015481615305631735, + "loss": 1.359, + "step": 17398 + }, + { + "epoch": 0.22609206959227995, + "grad_norm": 0.4367537200450897, + "learning_rate": 0.00015481355359440598, + "loss": 1.4672, + "step": 17399 + }, + { + "epoch": 0.22610506413619583, + "grad_norm": 0.38594746589660645, + "learning_rate": 0.00015481095413249457, + "loss": 1.3857, + "step": 17400 + }, + { + "epoch": 0.2261180586801117, + "grad_norm": 0.47841691970825195, + "learning_rate": 0.0001548083546705832, + "loss": 1.5825, + "step": 17401 + }, + { + "epoch": 0.22613105322402757, + "grad_norm": 0.42805933952331543, + "learning_rate": 0.00015480575520867182, + "loss": 1.565, + "step": 17402 + }, + { + "epoch": 0.22614404776794345, + "grad_norm": 0.4200296700000763, + "learning_rate": 0.00015480315574676042, + "loss": 1.4444, + "step": 17403 + }, + { + "epoch": 0.22615704231185932, + "grad_norm": 0.39058685302734375, + "learning_rate": 0.00015480055628484904, + "loss": 1.4195, + "step": 17404 + }, + { + "epoch": 0.2261700368557752, + "grad_norm": 0.40262678265571594, + "learning_rate": 0.00015479795682293764, + "loss": 1.3986, + "step": 17405 + }, + { + "epoch": 0.22618303139969106, + "grad_norm": 0.3661644756793976, + "learning_rate": 0.0001547953573610263, + "loss": 1.4036, + "step": 17406 + }, + { + "epoch": 0.22619602594360694, + "grad_norm": 0.34374576807022095, + "learning_rate": 0.0001547927578991149, + "loss": 1.3049, + "step": 17407 + }, + { + "epoch": 0.2262090204875228, + "grad_norm": 0.48040637373924255, + "learning_rate": 0.00015479015843720352, + "loss": 1.406, + "step": 17408 + }, + { + "epoch": 0.22622201503143868, + "grad_norm": 0.3728158175945282, + "learning_rate": 0.0001547875589752921, + "loss": 1.535, + "step": 17409 + }, + { + "epoch": 0.22623500957535456, + "grad_norm": 0.3528313636779785, + "learning_rate": 0.00015478495951338074, + "loss": 1.2917, + "step": 17410 + }, + { + "epoch": 0.22624800411927043, + "grad_norm": 0.39543795585632324, + "learning_rate": 0.00015478236005146936, + "loss": 1.4908, + "step": 17411 + }, + { + "epoch": 0.2262609986631863, + "grad_norm": 0.37769341468811035, + "learning_rate": 0.00015477976058955796, + "loss": 1.3956, + "step": 17412 + }, + { + "epoch": 0.22627399320710218, + "grad_norm": 0.441013365983963, + "learning_rate": 0.00015477716112764658, + "loss": 1.3879, + "step": 17413 + }, + { + "epoch": 0.22628698775101805, + "grad_norm": 0.5277686715126038, + "learning_rate": 0.0001547745616657352, + "loss": 1.4989, + "step": 17414 + }, + { + "epoch": 0.22629998229493392, + "grad_norm": 0.42173245549201965, + "learning_rate": 0.0001547719622038238, + "loss": 1.3407, + "step": 17415 + }, + { + "epoch": 0.2263129768388498, + "grad_norm": 0.33099690079689026, + "learning_rate": 0.00015476936274191243, + "loss": 1.1165, + "step": 17416 + }, + { + "epoch": 0.22632597138276567, + "grad_norm": 0.47431480884552, + "learning_rate": 0.00015476676328000103, + "loss": 1.4413, + "step": 17417 + }, + { + "epoch": 0.22633896592668154, + "grad_norm": 0.41168826818466187, + "learning_rate": 0.00015476416381808968, + "loss": 1.4841, + "step": 17418 + }, + { + "epoch": 0.2263519604705974, + "grad_norm": 0.2497444450855255, + "learning_rate": 0.00015476156435617828, + "loss": 1.1803, + "step": 17419 + }, + { + "epoch": 0.22636495501451329, + "grad_norm": 0.38342034816741943, + "learning_rate": 0.0001547589648942669, + "loss": 1.5575, + "step": 17420 + }, + { + "epoch": 0.22637794955842916, + "grad_norm": 0.45156100392341614, + "learning_rate": 0.0001547563654323555, + "loss": 1.3754, + "step": 17421 + }, + { + "epoch": 0.22639094410234503, + "grad_norm": 0.418491005897522, + "learning_rate": 0.00015475376597044412, + "loss": 1.3967, + "step": 17422 + }, + { + "epoch": 0.2264039386462609, + "grad_norm": 0.34283629059791565, + "learning_rate": 0.00015475116650853275, + "loss": 1.6174, + "step": 17423 + }, + { + "epoch": 0.22641693319017678, + "grad_norm": 0.38933736085891724, + "learning_rate": 0.00015474856704662134, + "loss": 1.3978, + "step": 17424 + }, + { + "epoch": 0.22642992773409265, + "grad_norm": 0.46384942531585693, + "learning_rate": 0.00015474596758471, + "loss": 1.3459, + "step": 17425 + }, + { + "epoch": 0.22644292227800852, + "grad_norm": 0.3376656472682953, + "learning_rate": 0.0001547433681227986, + "loss": 1.3114, + "step": 17426 + }, + { + "epoch": 0.2264559168219244, + "grad_norm": 0.3847888112068176, + "learning_rate": 0.0001547407686608872, + "loss": 1.3687, + "step": 17427 + }, + { + "epoch": 0.22646891136584027, + "grad_norm": 0.34971368312835693, + "learning_rate": 0.00015473816919897582, + "loss": 1.6343, + "step": 17428 + }, + { + "epoch": 0.22648190590975614, + "grad_norm": 0.4429413378238678, + "learning_rate": 0.00015473556973706444, + "loss": 1.5323, + "step": 17429 + }, + { + "epoch": 0.22649490045367202, + "grad_norm": 0.38012903928756714, + "learning_rate": 0.00015473297027515306, + "loss": 1.3206, + "step": 17430 + }, + { + "epoch": 0.2265078949975879, + "grad_norm": 0.44928067922592163, + "learning_rate": 0.00015473037081324166, + "loss": 1.6097, + "step": 17431 + }, + { + "epoch": 0.22652088954150376, + "grad_norm": 0.34846264123916626, + "learning_rate": 0.00015472777135133029, + "loss": 1.3545, + "step": 17432 + }, + { + "epoch": 0.22653388408541963, + "grad_norm": 0.4594947099685669, + "learning_rate": 0.0001547251718894189, + "loss": 1.5832, + "step": 17433 + }, + { + "epoch": 0.2265468786293355, + "grad_norm": 0.36072802543640137, + "learning_rate": 0.0001547225724275075, + "loss": 1.5538, + "step": 17434 + }, + { + "epoch": 0.22655987317325138, + "grad_norm": 0.40906375646591187, + "learning_rate": 0.00015471997296559613, + "loss": 1.5471, + "step": 17435 + }, + { + "epoch": 0.22657286771716725, + "grad_norm": 0.3710382878780365, + "learning_rate": 0.00015471737350368473, + "loss": 1.4535, + "step": 17436 + }, + { + "epoch": 0.22658586226108313, + "grad_norm": 0.3967141807079315, + "learning_rate": 0.00015471477404177338, + "loss": 1.3244, + "step": 17437 + }, + { + "epoch": 0.226598856804999, + "grad_norm": 0.45705223083496094, + "learning_rate": 0.00015471217457986198, + "loss": 1.5352, + "step": 17438 + }, + { + "epoch": 0.22661185134891487, + "grad_norm": 0.5221951603889465, + "learning_rate": 0.00015470957511795058, + "loss": 1.4655, + "step": 17439 + }, + { + "epoch": 0.22662484589283075, + "grad_norm": 0.38480231165885925, + "learning_rate": 0.0001547069756560392, + "loss": 1.3329, + "step": 17440 + }, + { + "epoch": 0.22663784043674662, + "grad_norm": 0.45903274416923523, + "learning_rate": 0.00015470437619412783, + "loss": 1.3116, + "step": 17441 + }, + { + "epoch": 0.2266508349806625, + "grad_norm": 0.40764319896698, + "learning_rate": 0.00015470177673221645, + "loss": 1.3762, + "step": 17442 + }, + { + "epoch": 0.22666382952457836, + "grad_norm": 0.3567750155925751, + "learning_rate": 0.00015469917727030505, + "loss": 1.2855, + "step": 17443 + }, + { + "epoch": 0.22667682406849424, + "grad_norm": 0.36940595507621765, + "learning_rate": 0.00015469657780839367, + "loss": 1.2901, + "step": 17444 + }, + { + "epoch": 0.2266898186124101, + "grad_norm": 0.31514015793800354, + "learning_rate": 0.0001546939783464823, + "loss": 1.3736, + "step": 17445 + }, + { + "epoch": 0.22670281315632598, + "grad_norm": 0.40365323424339294, + "learning_rate": 0.0001546913788845709, + "loss": 1.5605, + "step": 17446 + }, + { + "epoch": 0.22671580770024186, + "grad_norm": 0.3547961115837097, + "learning_rate": 0.00015468877942265952, + "loss": 1.2786, + "step": 17447 + }, + { + "epoch": 0.22672880224415773, + "grad_norm": 0.31444740295410156, + "learning_rate": 0.00015468617996074812, + "loss": 1.3187, + "step": 17448 + }, + { + "epoch": 0.2267417967880736, + "grad_norm": 0.31617364287376404, + "learning_rate": 0.00015468358049883677, + "loss": 1.1888, + "step": 17449 + }, + { + "epoch": 0.22675479133198947, + "grad_norm": 0.44486579298973083, + "learning_rate": 0.00015468098103692536, + "loss": 1.5871, + "step": 17450 + }, + { + "epoch": 0.22676778587590535, + "grad_norm": 0.4003638029098511, + "learning_rate": 0.00015467838157501396, + "loss": 1.2373, + "step": 17451 + }, + { + "epoch": 0.22678078041982122, + "grad_norm": 0.36715012788772583, + "learning_rate": 0.00015467578211310259, + "loss": 1.4776, + "step": 17452 + }, + { + "epoch": 0.2267937749637371, + "grad_norm": 0.3751664161682129, + "learning_rate": 0.0001546731826511912, + "loss": 1.4395, + "step": 17453 + }, + { + "epoch": 0.22680676950765297, + "grad_norm": 0.3138881027698517, + "learning_rate": 0.00015467058318927984, + "loss": 1.3384, + "step": 17454 + }, + { + "epoch": 0.22681976405156884, + "grad_norm": 0.45714783668518066, + "learning_rate": 0.00015466798372736843, + "loss": 1.2495, + "step": 17455 + }, + { + "epoch": 0.2268327585954847, + "grad_norm": 0.39608290791511536, + "learning_rate": 0.00015466538426545706, + "loss": 1.5277, + "step": 17456 + }, + { + "epoch": 0.22684575313940059, + "grad_norm": 0.26731643080711365, + "learning_rate": 0.00015466278480354568, + "loss": 1.2756, + "step": 17457 + }, + { + "epoch": 0.22685874768331646, + "grad_norm": 0.4738873839378357, + "learning_rate": 0.00015466018534163428, + "loss": 1.4794, + "step": 17458 + }, + { + "epoch": 0.22687174222723233, + "grad_norm": 0.4773971736431122, + "learning_rate": 0.0001546575858797229, + "loss": 1.6204, + "step": 17459 + }, + { + "epoch": 0.2268847367711482, + "grad_norm": 0.4836757779121399, + "learning_rate": 0.00015465498641781153, + "loss": 1.6049, + "step": 17460 + }, + { + "epoch": 0.22689773131506408, + "grad_norm": 0.30799826979637146, + "learning_rate": 0.00015465238695590015, + "loss": 1.3359, + "step": 17461 + }, + { + "epoch": 0.22691072585897995, + "grad_norm": 0.3993365168571472, + "learning_rate": 0.00015464978749398875, + "loss": 1.344, + "step": 17462 + }, + { + "epoch": 0.22692372040289582, + "grad_norm": 0.38170790672302246, + "learning_rate": 0.00015464718803207737, + "loss": 1.5201, + "step": 17463 + }, + { + "epoch": 0.2269367149468117, + "grad_norm": 0.36720049381256104, + "learning_rate": 0.000154644588570166, + "loss": 1.4012, + "step": 17464 + }, + { + "epoch": 0.22694970949072757, + "grad_norm": 0.355109304189682, + "learning_rate": 0.0001546419891082546, + "loss": 1.4716, + "step": 17465 + }, + { + "epoch": 0.22696270403464344, + "grad_norm": 0.38913825154304504, + "learning_rate": 0.00015463938964634322, + "loss": 1.4926, + "step": 17466 + }, + { + "epoch": 0.22697569857855932, + "grad_norm": 0.32351431250572205, + "learning_rate": 0.00015463679018443182, + "loss": 1.4396, + "step": 17467 + }, + { + "epoch": 0.2269886931224752, + "grad_norm": 0.33774125576019287, + "learning_rate": 0.00015463419072252044, + "loss": 1.3058, + "step": 17468 + }, + { + "epoch": 0.22700168766639106, + "grad_norm": 0.35611337423324585, + "learning_rate": 0.00015463159126060907, + "loss": 1.3914, + "step": 17469 + }, + { + "epoch": 0.22701468221030693, + "grad_norm": 0.32777202129364014, + "learning_rate": 0.00015462899179869766, + "loss": 1.489, + "step": 17470 + }, + { + "epoch": 0.2270276767542228, + "grad_norm": 0.39417633414268494, + "learning_rate": 0.0001546263923367863, + "loss": 1.5456, + "step": 17471 + }, + { + "epoch": 0.2270406712981387, + "grad_norm": 0.38823428750038147, + "learning_rate": 0.0001546237928748749, + "loss": 1.38, + "step": 17472 + }, + { + "epoch": 0.22705366584205458, + "grad_norm": 0.41720306873321533, + "learning_rate": 0.00015462119341296354, + "loss": 1.4998, + "step": 17473 + }, + { + "epoch": 0.22706666038597045, + "grad_norm": 0.260313481092453, + "learning_rate": 0.00015461859395105214, + "loss": 1.2702, + "step": 17474 + }, + { + "epoch": 0.22707965492988633, + "grad_norm": 0.36505457758903503, + "learning_rate": 0.00015461599448914076, + "loss": 1.5749, + "step": 17475 + }, + { + "epoch": 0.2270926494738022, + "grad_norm": 0.35256657004356384, + "learning_rate": 0.00015461339502722938, + "loss": 1.3065, + "step": 17476 + }, + { + "epoch": 0.22710564401771807, + "grad_norm": 0.30015385150909424, + "learning_rate": 0.00015461079556531798, + "loss": 1.394, + "step": 17477 + }, + { + "epoch": 0.22711863856163395, + "grad_norm": 0.4065621495246887, + "learning_rate": 0.0001546081961034066, + "loss": 1.3026, + "step": 17478 + }, + { + "epoch": 0.22713163310554982, + "grad_norm": 0.3028584122657776, + "learning_rate": 0.0001546055966414952, + "loss": 1.2323, + "step": 17479 + }, + { + "epoch": 0.2271446276494657, + "grad_norm": 0.37900277972221375, + "learning_rate": 0.00015460299717958385, + "loss": 1.1991, + "step": 17480 + }, + { + "epoch": 0.22715762219338156, + "grad_norm": 0.3935201168060303, + "learning_rate": 0.00015460039771767245, + "loss": 1.3666, + "step": 17481 + }, + { + "epoch": 0.22717061673729744, + "grad_norm": 0.450606107711792, + "learning_rate": 0.00015459779825576105, + "loss": 1.627, + "step": 17482 + }, + { + "epoch": 0.2271836112812133, + "grad_norm": 0.4243600368499756, + "learning_rate": 0.00015459519879384967, + "loss": 1.4304, + "step": 17483 + }, + { + "epoch": 0.22719660582512918, + "grad_norm": 0.40043914318084717, + "learning_rate": 0.0001545925993319383, + "loss": 1.5418, + "step": 17484 + }, + { + "epoch": 0.22720960036904506, + "grad_norm": 0.4547288119792938, + "learning_rate": 0.00015458999987002692, + "loss": 1.4344, + "step": 17485 + }, + { + "epoch": 0.22722259491296093, + "grad_norm": 0.3056755065917969, + "learning_rate": 0.00015458740040811552, + "loss": 1.2878, + "step": 17486 + }, + { + "epoch": 0.2272355894568768, + "grad_norm": 0.4275365173816681, + "learning_rate": 0.00015458480094620415, + "loss": 1.3026, + "step": 17487 + }, + { + "epoch": 0.22724858400079267, + "grad_norm": 0.4318055212497711, + "learning_rate": 0.00015458220148429277, + "loss": 1.3762, + "step": 17488 + }, + { + "epoch": 0.22726157854470855, + "grad_norm": 0.36770716309547424, + "learning_rate": 0.00015457960202238137, + "loss": 1.4377, + "step": 17489 + }, + { + "epoch": 0.22727457308862442, + "grad_norm": 0.4921204447746277, + "learning_rate": 0.00015457700256047, + "loss": 1.658, + "step": 17490 + }, + { + "epoch": 0.2272875676325403, + "grad_norm": 0.36330509185791016, + "learning_rate": 0.0001545744030985586, + "loss": 1.3917, + "step": 17491 + }, + { + "epoch": 0.22730056217645617, + "grad_norm": 0.48243260383605957, + "learning_rate": 0.00015457180363664724, + "loss": 1.5888, + "step": 17492 + }, + { + "epoch": 0.22731355672037204, + "grad_norm": 0.4554900825023651, + "learning_rate": 0.00015456920417473584, + "loss": 1.4751, + "step": 17493 + }, + { + "epoch": 0.2273265512642879, + "grad_norm": 0.3958800137042999, + "learning_rate": 0.00015456660471282444, + "loss": 1.4439, + "step": 17494 + }, + { + "epoch": 0.22733954580820379, + "grad_norm": 0.3475806415081024, + "learning_rate": 0.00015456400525091306, + "loss": 1.3836, + "step": 17495 + }, + { + "epoch": 0.22735254035211966, + "grad_norm": 0.3271598219871521, + "learning_rate": 0.00015456140578900168, + "loss": 1.3591, + "step": 17496 + }, + { + "epoch": 0.22736553489603553, + "grad_norm": 0.4348144233226776, + "learning_rate": 0.0001545588063270903, + "loss": 1.4427, + "step": 17497 + }, + { + "epoch": 0.2273785294399514, + "grad_norm": 0.4207206666469574, + "learning_rate": 0.0001545562068651789, + "loss": 1.3819, + "step": 17498 + }, + { + "epoch": 0.22739152398386728, + "grad_norm": 0.4827212393283844, + "learning_rate": 0.00015455360740326753, + "loss": 1.5058, + "step": 17499 + }, + { + "epoch": 0.22740451852778315, + "grad_norm": 0.4095722436904907, + "learning_rate": 0.00015455100794135615, + "loss": 1.3503, + "step": 17500 + }, + { + "epoch": 0.22741751307169902, + "grad_norm": 0.4699098765850067, + "learning_rate": 0.00015454840847944475, + "loss": 1.3316, + "step": 17501 + }, + { + "epoch": 0.2274305076156149, + "grad_norm": 0.3005622923374176, + "learning_rate": 0.00015454580901753338, + "loss": 1.5259, + "step": 17502 + }, + { + "epoch": 0.22744350215953077, + "grad_norm": 0.33248934149742126, + "learning_rate": 0.000154543209555622, + "loss": 1.3527, + "step": 17503 + }, + { + "epoch": 0.22745649670344664, + "grad_norm": 0.4927341341972351, + "learning_rate": 0.00015454061009371063, + "loss": 1.5616, + "step": 17504 + }, + { + "epoch": 0.22746949124736252, + "grad_norm": 0.29577523469924927, + "learning_rate": 0.00015453801063179922, + "loss": 1.4328, + "step": 17505 + }, + { + "epoch": 0.2274824857912784, + "grad_norm": 0.48556411266326904, + "learning_rate": 0.00015453541116988782, + "loss": 1.5196, + "step": 17506 + }, + { + "epoch": 0.22749548033519426, + "grad_norm": 0.4814305603504181, + "learning_rate": 0.00015453281170797647, + "loss": 1.5006, + "step": 17507 + }, + { + "epoch": 0.22750847487911013, + "grad_norm": 0.37607795000076294, + "learning_rate": 0.00015453021224606507, + "loss": 1.2757, + "step": 17508 + }, + { + "epoch": 0.227521469423026, + "grad_norm": 0.448186993598938, + "learning_rate": 0.0001545276127841537, + "loss": 1.2001, + "step": 17509 + }, + { + "epoch": 0.22753446396694188, + "grad_norm": 0.33712536096572876, + "learning_rate": 0.0001545250133222423, + "loss": 1.6128, + "step": 17510 + }, + { + "epoch": 0.22754745851085775, + "grad_norm": 0.3859885632991791, + "learning_rate": 0.00015452241386033092, + "loss": 1.5175, + "step": 17511 + }, + { + "epoch": 0.22756045305477363, + "grad_norm": 0.440922349691391, + "learning_rate": 0.00015451981439841954, + "loss": 1.5203, + "step": 17512 + }, + { + "epoch": 0.2275734475986895, + "grad_norm": 0.4625594913959503, + "learning_rate": 0.00015451721493650814, + "loss": 1.651, + "step": 17513 + }, + { + "epoch": 0.22758644214260537, + "grad_norm": 0.3439292311668396, + "learning_rate": 0.00015451461547459676, + "loss": 1.3172, + "step": 17514 + }, + { + "epoch": 0.22759943668652124, + "grad_norm": 0.47682440280914307, + "learning_rate": 0.0001545120160126854, + "loss": 1.4806, + "step": 17515 + }, + { + "epoch": 0.22761243123043712, + "grad_norm": 0.30367887020111084, + "learning_rate": 0.000154509416550774, + "loss": 1.4158, + "step": 17516 + }, + { + "epoch": 0.227625425774353, + "grad_norm": 0.491248220205307, + "learning_rate": 0.0001545068170888626, + "loss": 1.3398, + "step": 17517 + }, + { + "epoch": 0.22763842031826886, + "grad_norm": 0.3286205530166626, + "learning_rate": 0.00015450421762695123, + "loss": 1.2543, + "step": 17518 + }, + { + "epoch": 0.22765141486218474, + "grad_norm": 0.4323371648788452, + "learning_rate": 0.00015450161816503986, + "loss": 1.6399, + "step": 17519 + }, + { + "epoch": 0.2276644094061006, + "grad_norm": 0.35500413179397583, + "learning_rate": 0.00015449901870312845, + "loss": 1.3036, + "step": 17520 + }, + { + "epoch": 0.22767740395001648, + "grad_norm": 0.2753719091415405, + "learning_rate": 0.00015449641924121708, + "loss": 1.2633, + "step": 17521 + }, + { + "epoch": 0.22769039849393236, + "grad_norm": 0.41145145893096924, + "learning_rate": 0.00015449381977930568, + "loss": 1.4671, + "step": 17522 + }, + { + "epoch": 0.22770339303784823, + "grad_norm": 0.35795196890830994, + "learning_rate": 0.0001544912203173943, + "loss": 1.2313, + "step": 17523 + }, + { + "epoch": 0.2277163875817641, + "grad_norm": 0.35109421610832214, + "learning_rate": 0.00015448862085548293, + "loss": 1.4308, + "step": 17524 + }, + { + "epoch": 0.22772938212567997, + "grad_norm": 0.3335050344467163, + "learning_rate": 0.00015448602139357152, + "loss": 1.2711, + "step": 17525 + }, + { + "epoch": 0.22774237666959585, + "grad_norm": 0.304598867893219, + "learning_rate": 0.00015448342193166015, + "loss": 1.3404, + "step": 17526 + }, + { + "epoch": 0.22775537121351172, + "grad_norm": 0.355780690908432, + "learning_rate": 0.00015448082246974877, + "loss": 1.2636, + "step": 17527 + }, + { + "epoch": 0.2277683657574276, + "grad_norm": 0.45066171884536743, + "learning_rate": 0.0001544782230078374, + "loss": 1.2542, + "step": 17528 + }, + { + "epoch": 0.22778136030134347, + "grad_norm": 0.3187810778617859, + "learning_rate": 0.000154475623545926, + "loss": 1.2667, + "step": 17529 + }, + { + "epoch": 0.22779435484525934, + "grad_norm": 0.4394470453262329, + "learning_rate": 0.00015447302408401462, + "loss": 1.5081, + "step": 17530 + }, + { + "epoch": 0.2278073493891752, + "grad_norm": 0.3630068004131317, + "learning_rate": 0.00015447042462210324, + "loss": 1.3614, + "step": 17531 + }, + { + "epoch": 0.22782034393309109, + "grad_norm": 0.3895515203475952, + "learning_rate": 0.00015446782516019184, + "loss": 1.2763, + "step": 17532 + }, + { + "epoch": 0.22783333847700696, + "grad_norm": 0.34488222002983093, + "learning_rate": 0.00015446522569828046, + "loss": 1.58, + "step": 17533 + }, + { + "epoch": 0.22784633302092283, + "grad_norm": 0.5027956962585449, + "learning_rate": 0.00015446262623636906, + "loss": 1.6167, + "step": 17534 + }, + { + "epoch": 0.2278593275648387, + "grad_norm": 0.4847308099269867, + "learning_rate": 0.0001544600267744577, + "loss": 1.7772, + "step": 17535 + }, + { + "epoch": 0.22787232210875458, + "grad_norm": 0.3336997628211975, + "learning_rate": 0.0001544574273125463, + "loss": 1.5796, + "step": 17536 + }, + { + "epoch": 0.22788531665267045, + "grad_norm": 0.4108007848262787, + "learning_rate": 0.0001544548278506349, + "loss": 1.3802, + "step": 17537 + }, + { + "epoch": 0.22789831119658632, + "grad_norm": 0.3541216254234314, + "learning_rate": 0.00015445222838872356, + "loss": 1.3754, + "step": 17538 + }, + { + "epoch": 0.2279113057405022, + "grad_norm": 0.3413882553577423, + "learning_rate": 0.00015444962892681216, + "loss": 1.278, + "step": 17539 + }, + { + "epoch": 0.22792430028441807, + "grad_norm": 0.26236289739608765, + "learning_rate": 0.00015444702946490078, + "loss": 1.3884, + "step": 17540 + }, + { + "epoch": 0.22793729482833394, + "grad_norm": 0.3783803880214691, + "learning_rate": 0.00015444443000298938, + "loss": 1.3518, + "step": 17541 + }, + { + "epoch": 0.22795028937224981, + "grad_norm": 0.3493862748146057, + "learning_rate": 0.000154441830541078, + "loss": 1.4121, + "step": 17542 + }, + { + "epoch": 0.2279632839161657, + "grad_norm": 0.30744510889053345, + "learning_rate": 0.00015443923107916663, + "loss": 1.3294, + "step": 17543 + }, + { + "epoch": 0.22797627846008156, + "grad_norm": 0.37657248973846436, + "learning_rate": 0.00015443663161725523, + "loss": 1.4234, + "step": 17544 + }, + { + "epoch": 0.22798927300399743, + "grad_norm": 0.33738717436790466, + "learning_rate": 0.00015443403215534385, + "loss": 1.4368, + "step": 17545 + }, + { + "epoch": 0.2280022675479133, + "grad_norm": 0.3121657073497772, + "learning_rate": 0.00015443143269343247, + "loss": 1.1387, + "step": 17546 + }, + { + "epoch": 0.22801526209182918, + "grad_norm": 0.3690843880176544, + "learning_rate": 0.0001544288332315211, + "loss": 1.3377, + "step": 17547 + }, + { + "epoch": 0.22802825663574508, + "grad_norm": 0.617122232913971, + "learning_rate": 0.0001544262337696097, + "loss": 1.4112, + "step": 17548 + }, + { + "epoch": 0.22804125117966095, + "grad_norm": 0.39011451601982117, + "learning_rate": 0.0001544236343076983, + "loss": 1.4522, + "step": 17549 + }, + { + "epoch": 0.22805424572357683, + "grad_norm": 0.3467435836791992, + "learning_rate": 0.00015442103484578695, + "loss": 1.4362, + "step": 17550 + }, + { + "epoch": 0.2280672402674927, + "grad_norm": 0.38670575618743896, + "learning_rate": 0.00015441843538387554, + "loss": 1.3359, + "step": 17551 + }, + { + "epoch": 0.22808023481140857, + "grad_norm": 0.36283016204833984, + "learning_rate": 0.00015441583592196417, + "loss": 1.4344, + "step": 17552 + }, + { + "epoch": 0.22809322935532444, + "grad_norm": 0.441263884305954, + "learning_rate": 0.00015441323646005276, + "loss": 1.4465, + "step": 17553 + }, + { + "epoch": 0.22810622389924032, + "grad_norm": 0.5017417073249817, + "learning_rate": 0.0001544106369981414, + "loss": 1.3819, + "step": 17554 + }, + { + "epoch": 0.2281192184431562, + "grad_norm": 0.43855154514312744, + "learning_rate": 0.00015440803753623001, + "loss": 1.5213, + "step": 17555 + }, + { + "epoch": 0.22813221298707206, + "grad_norm": 0.4814172685146332, + "learning_rate": 0.0001544054380743186, + "loss": 1.5137, + "step": 17556 + }, + { + "epoch": 0.22814520753098794, + "grad_norm": 0.3849272131919861, + "learning_rate": 0.00015440283861240724, + "loss": 1.3589, + "step": 17557 + }, + { + "epoch": 0.2281582020749038, + "grad_norm": 0.3414883017539978, + "learning_rate": 0.00015440023915049586, + "loss": 1.2675, + "step": 17558 + }, + { + "epoch": 0.22817119661881968, + "grad_norm": 0.27242282032966614, + "learning_rate": 0.00015439763968858448, + "loss": 1.337, + "step": 17559 + }, + { + "epoch": 0.22818419116273556, + "grad_norm": 0.4883895814418793, + "learning_rate": 0.00015439504022667308, + "loss": 1.518, + "step": 17560 + }, + { + "epoch": 0.22819718570665143, + "grad_norm": 0.3641911745071411, + "learning_rate": 0.00015439244076476168, + "loss": 1.4129, + "step": 17561 + }, + { + "epoch": 0.2282101802505673, + "grad_norm": 0.30756595730781555, + "learning_rate": 0.00015438984130285033, + "loss": 1.3691, + "step": 17562 + }, + { + "epoch": 0.22822317479448317, + "grad_norm": 0.4031060039997101, + "learning_rate": 0.00015438724184093893, + "loss": 1.4588, + "step": 17563 + }, + { + "epoch": 0.22823616933839905, + "grad_norm": 0.4151732325553894, + "learning_rate": 0.00015438464237902755, + "loss": 1.496, + "step": 17564 + }, + { + "epoch": 0.22824916388231492, + "grad_norm": 0.346788614988327, + "learning_rate": 0.00015438204291711615, + "loss": 1.2248, + "step": 17565 + }, + { + "epoch": 0.2282621584262308, + "grad_norm": 0.45701804757118225, + "learning_rate": 0.00015437944345520477, + "loss": 1.5695, + "step": 17566 + }, + { + "epoch": 0.22827515297014667, + "grad_norm": 0.3944202959537506, + "learning_rate": 0.0001543768439932934, + "loss": 1.2604, + "step": 17567 + }, + { + "epoch": 0.22828814751406254, + "grad_norm": 0.400157630443573, + "learning_rate": 0.000154374244531382, + "loss": 1.5668, + "step": 17568 + }, + { + "epoch": 0.2283011420579784, + "grad_norm": 0.4012301564216614, + "learning_rate": 0.00015437164506947062, + "loss": 1.4282, + "step": 17569 + }, + { + "epoch": 0.22831413660189429, + "grad_norm": 0.3055853843688965, + "learning_rate": 0.00015436904560755925, + "loss": 1.2053, + "step": 17570 + }, + { + "epoch": 0.22832713114581016, + "grad_norm": 0.47218552231788635, + "learning_rate": 0.00015436644614564787, + "loss": 1.6631, + "step": 17571 + }, + { + "epoch": 0.22834012568972603, + "grad_norm": 0.4413902759552002, + "learning_rate": 0.00015436384668373647, + "loss": 1.4876, + "step": 17572 + }, + { + "epoch": 0.2283531202336419, + "grad_norm": 0.3570714592933655, + "learning_rate": 0.0001543612472218251, + "loss": 1.3518, + "step": 17573 + }, + { + "epoch": 0.22836611477755778, + "grad_norm": 0.4896111488342285, + "learning_rate": 0.00015435864775991372, + "loss": 1.4229, + "step": 17574 + }, + { + "epoch": 0.22837910932147365, + "grad_norm": 0.3856905400753021, + "learning_rate": 0.00015435604829800231, + "loss": 1.5016, + "step": 17575 + }, + { + "epoch": 0.22839210386538952, + "grad_norm": 0.4265894293785095, + "learning_rate": 0.00015435344883609094, + "loss": 1.6509, + "step": 17576 + }, + { + "epoch": 0.2284050984093054, + "grad_norm": 0.3925989866256714, + "learning_rate": 0.00015435084937417956, + "loss": 1.5953, + "step": 17577 + }, + { + "epoch": 0.22841809295322127, + "grad_norm": 0.31342899799346924, + "learning_rate": 0.00015434824991226816, + "loss": 1.2474, + "step": 17578 + }, + { + "epoch": 0.22843108749713714, + "grad_norm": 0.5138340592384338, + "learning_rate": 0.00015434565045035678, + "loss": 1.4155, + "step": 17579 + }, + { + "epoch": 0.22844408204105301, + "grad_norm": 0.4364220201969147, + "learning_rate": 0.00015434305098844538, + "loss": 1.5345, + "step": 17580 + }, + { + "epoch": 0.2284570765849689, + "grad_norm": 0.4405708909034729, + "learning_rate": 0.00015434045152653403, + "loss": 1.4461, + "step": 17581 + }, + { + "epoch": 0.22847007112888476, + "grad_norm": 0.4068009555339813, + "learning_rate": 0.00015433785206462263, + "loss": 1.4358, + "step": 17582 + }, + { + "epoch": 0.22848306567280063, + "grad_norm": 0.4266466200351715, + "learning_rate": 0.00015433525260271126, + "loss": 1.2945, + "step": 17583 + }, + { + "epoch": 0.2284960602167165, + "grad_norm": 0.4239807426929474, + "learning_rate": 0.00015433265314079985, + "loss": 1.2412, + "step": 17584 + }, + { + "epoch": 0.22850905476063238, + "grad_norm": 0.3937571346759796, + "learning_rate": 0.00015433005367888848, + "loss": 1.2955, + "step": 17585 + }, + { + "epoch": 0.22852204930454825, + "grad_norm": 0.3565618097782135, + "learning_rate": 0.0001543274542169771, + "loss": 1.2166, + "step": 17586 + }, + { + "epoch": 0.22853504384846413, + "grad_norm": 0.3238370418548584, + "learning_rate": 0.0001543248547550657, + "loss": 1.3791, + "step": 17587 + }, + { + "epoch": 0.22854803839238, + "grad_norm": 0.3457615375518799, + "learning_rate": 0.00015432225529315432, + "loss": 1.3926, + "step": 17588 + }, + { + "epoch": 0.22856103293629587, + "grad_norm": 0.2828628420829773, + "learning_rate": 0.00015431965583124295, + "loss": 1.2077, + "step": 17589 + }, + { + "epoch": 0.22857402748021174, + "grad_norm": 0.4756614863872528, + "learning_rate": 0.00015431705636933155, + "loss": 1.6235, + "step": 17590 + }, + { + "epoch": 0.22858702202412762, + "grad_norm": 0.3619145154953003, + "learning_rate": 0.00015431445690742017, + "loss": 1.4554, + "step": 17591 + }, + { + "epoch": 0.2286000165680435, + "grad_norm": 0.4134174883365631, + "learning_rate": 0.00015431185744550877, + "loss": 1.5222, + "step": 17592 + }, + { + "epoch": 0.22861301111195936, + "grad_norm": 0.4153233468532562, + "learning_rate": 0.00015430925798359742, + "loss": 1.433, + "step": 17593 + }, + { + "epoch": 0.22862600565587524, + "grad_norm": 0.3607367277145386, + "learning_rate": 0.00015430665852168602, + "loss": 1.3922, + "step": 17594 + }, + { + "epoch": 0.2286390001997911, + "grad_norm": 0.4759838581085205, + "learning_rate": 0.00015430405905977464, + "loss": 1.4376, + "step": 17595 + }, + { + "epoch": 0.22865199474370698, + "grad_norm": 0.33668971061706543, + "learning_rate": 0.00015430145959786324, + "loss": 1.2891, + "step": 17596 + }, + { + "epoch": 0.22866498928762286, + "grad_norm": 0.3650651276111603, + "learning_rate": 0.00015429886013595186, + "loss": 1.4798, + "step": 17597 + }, + { + "epoch": 0.22867798383153873, + "grad_norm": 0.3216116428375244, + "learning_rate": 0.0001542962606740405, + "loss": 1.4721, + "step": 17598 + }, + { + "epoch": 0.2286909783754546, + "grad_norm": 0.4008892774581909, + "learning_rate": 0.00015429366121212908, + "loss": 1.3132, + "step": 17599 + }, + { + "epoch": 0.22870397291937047, + "grad_norm": 0.3910900354385376, + "learning_rate": 0.0001542910617502177, + "loss": 1.4189, + "step": 17600 + }, + { + "epoch": 0.22871696746328635, + "grad_norm": 0.31416893005371094, + "learning_rate": 0.00015428846228830633, + "loss": 1.5268, + "step": 17601 + }, + { + "epoch": 0.22872996200720222, + "grad_norm": 0.27825412154197693, + "learning_rate": 0.00015428586282639496, + "loss": 1.463, + "step": 17602 + }, + { + "epoch": 0.2287429565511181, + "grad_norm": 0.5107660889625549, + "learning_rate": 0.00015428326336448356, + "loss": 1.6413, + "step": 17603 + }, + { + "epoch": 0.22875595109503397, + "grad_norm": 0.45120808482170105, + "learning_rate": 0.00015428066390257215, + "loss": 1.4809, + "step": 17604 + }, + { + "epoch": 0.22876894563894984, + "grad_norm": 0.4583079218864441, + "learning_rate": 0.0001542780644406608, + "loss": 1.5901, + "step": 17605 + }, + { + "epoch": 0.2287819401828657, + "grad_norm": 0.5126442909240723, + "learning_rate": 0.0001542754649787494, + "loss": 1.2798, + "step": 17606 + }, + { + "epoch": 0.22879493472678158, + "grad_norm": 0.32243844866752625, + "learning_rate": 0.00015427286551683803, + "loss": 1.2415, + "step": 17607 + }, + { + "epoch": 0.22880792927069746, + "grad_norm": 0.2602277100086212, + "learning_rate": 0.00015427026605492662, + "loss": 1.3129, + "step": 17608 + }, + { + "epoch": 0.22882092381461333, + "grad_norm": 0.28018492460250854, + "learning_rate": 0.00015426766659301525, + "loss": 1.287, + "step": 17609 + }, + { + "epoch": 0.2288339183585292, + "grad_norm": 0.3126949965953827, + "learning_rate": 0.00015426506713110387, + "loss": 1.4174, + "step": 17610 + }, + { + "epoch": 0.22884691290244508, + "grad_norm": 0.462068647146225, + "learning_rate": 0.00015426246766919247, + "loss": 1.5323, + "step": 17611 + }, + { + "epoch": 0.22885990744636095, + "grad_norm": 0.3884279131889343, + "learning_rate": 0.00015425986820728112, + "loss": 1.3909, + "step": 17612 + }, + { + "epoch": 0.22887290199027682, + "grad_norm": 0.42448994517326355, + "learning_rate": 0.00015425726874536972, + "loss": 1.4556, + "step": 17613 + }, + { + "epoch": 0.2288858965341927, + "grad_norm": 0.4207201302051544, + "learning_rate": 0.00015425466928345834, + "loss": 1.4939, + "step": 17614 + }, + { + "epoch": 0.22889889107810857, + "grad_norm": 0.40967270731925964, + "learning_rate": 0.00015425206982154694, + "loss": 1.471, + "step": 17615 + }, + { + "epoch": 0.22891188562202444, + "grad_norm": 0.3311474323272705, + "learning_rate": 0.00015424947035963557, + "loss": 1.2696, + "step": 17616 + }, + { + "epoch": 0.22892488016594031, + "grad_norm": 0.3830753564834595, + "learning_rate": 0.0001542468708977242, + "loss": 1.3144, + "step": 17617 + }, + { + "epoch": 0.2289378747098562, + "grad_norm": 0.4001374840736389, + "learning_rate": 0.0001542442714358128, + "loss": 1.3508, + "step": 17618 + }, + { + "epoch": 0.22895086925377206, + "grad_norm": 0.42457136511802673, + "learning_rate": 0.0001542416719739014, + "loss": 1.6802, + "step": 17619 + }, + { + "epoch": 0.22896386379768793, + "grad_norm": 0.35581204295158386, + "learning_rate": 0.00015423907251199004, + "loss": 1.4076, + "step": 17620 + }, + { + "epoch": 0.2289768583416038, + "grad_norm": 0.40405532717704773, + "learning_rate": 0.00015423647305007863, + "loss": 1.5242, + "step": 17621 + }, + { + "epoch": 0.22898985288551968, + "grad_norm": 0.36132997274398804, + "learning_rate": 0.00015423387358816726, + "loss": 1.3457, + "step": 17622 + }, + { + "epoch": 0.22900284742943555, + "grad_norm": 0.37850359082221985, + "learning_rate": 0.00015423127412625586, + "loss": 1.3197, + "step": 17623 + }, + { + "epoch": 0.22901584197335145, + "grad_norm": 0.3285813629627228, + "learning_rate": 0.0001542286746643445, + "loss": 1.2982, + "step": 17624 + }, + { + "epoch": 0.22902883651726733, + "grad_norm": 0.3952697813510895, + "learning_rate": 0.0001542260752024331, + "loss": 1.4559, + "step": 17625 + }, + { + "epoch": 0.2290418310611832, + "grad_norm": 0.3677605390548706, + "learning_rate": 0.00015422347574052173, + "loss": 1.4088, + "step": 17626 + }, + { + "epoch": 0.22905482560509907, + "grad_norm": 0.43694770336151123, + "learning_rate": 0.00015422087627861033, + "loss": 1.4517, + "step": 17627 + }, + { + "epoch": 0.22906782014901494, + "grad_norm": 0.3547777235507965, + "learning_rate": 0.00015421827681669895, + "loss": 1.3605, + "step": 17628 + }, + { + "epoch": 0.22908081469293082, + "grad_norm": 0.4288869798183441, + "learning_rate": 0.00015421567735478757, + "loss": 1.4047, + "step": 17629 + }, + { + "epoch": 0.2290938092368467, + "grad_norm": 0.37506112456321716, + "learning_rate": 0.00015421307789287617, + "loss": 1.438, + "step": 17630 + }, + { + "epoch": 0.22910680378076256, + "grad_norm": 0.39819619059562683, + "learning_rate": 0.0001542104784309648, + "loss": 1.4956, + "step": 17631 + }, + { + "epoch": 0.22911979832467844, + "grad_norm": 0.44526052474975586, + "learning_rate": 0.00015420787896905342, + "loss": 1.5042, + "step": 17632 + }, + { + "epoch": 0.2291327928685943, + "grad_norm": 0.39161333441734314, + "learning_rate": 0.00015420527950714202, + "loss": 1.3279, + "step": 17633 + }, + { + "epoch": 0.22914578741251018, + "grad_norm": 0.42912355065345764, + "learning_rate": 0.00015420268004523064, + "loss": 1.4148, + "step": 17634 + }, + { + "epoch": 0.22915878195642606, + "grad_norm": 0.38548797369003296, + "learning_rate": 0.00015420008058331924, + "loss": 1.5169, + "step": 17635 + }, + { + "epoch": 0.22917177650034193, + "grad_norm": 0.41569754481315613, + "learning_rate": 0.0001541974811214079, + "loss": 1.3007, + "step": 17636 + }, + { + "epoch": 0.2291847710442578, + "grad_norm": 0.3122117817401886, + "learning_rate": 0.0001541948816594965, + "loss": 1.2777, + "step": 17637 + }, + { + "epoch": 0.22919776558817367, + "grad_norm": 0.344384104013443, + "learning_rate": 0.00015419228219758511, + "loss": 1.3958, + "step": 17638 + }, + { + "epoch": 0.22921076013208955, + "grad_norm": 0.3293127715587616, + "learning_rate": 0.0001541896827356737, + "loss": 1.4674, + "step": 17639 + }, + { + "epoch": 0.22922375467600542, + "grad_norm": 0.29614976048469543, + "learning_rate": 0.00015418708327376234, + "loss": 1.4696, + "step": 17640 + }, + { + "epoch": 0.2292367492199213, + "grad_norm": 0.3996814787387848, + "learning_rate": 0.00015418448381185096, + "loss": 1.5301, + "step": 17641 + }, + { + "epoch": 0.22924974376383717, + "grad_norm": 0.47054117918014526, + "learning_rate": 0.00015418188434993956, + "loss": 1.4324, + "step": 17642 + }, + { + "epoch": 0.22926273830775304, + "grad_norm": 0.4537562429904938, + "learning_rate": 0.00015417928488802818, + "loss": 1.4053, + "step": 17643 + }, + { + "epoch": 0.2292757328516689, + "grad_norm": 0.4789636433124542, + "learning_rate": 0.0001541766854261168, + "loss": 1.4235, + "step": 17644 + }, + { + "epoch": 0.22928872739558478, + "grad_norm": 0.382784903049469, + "learning_rate": 0.0001541740859642054, + "loss": 1.5688, + "step": 17645 + }, + { + "epoch": 0.22930172193950066, + "grad_norm": 0.363221138715744, + "learning_rate": 0.00015417148650229403, + "loss": 1.2391, + "step": 17646 + }, + { + "epoch": 0.22931471648341653, + "grad_norm": 0.4165670871734619, + "learning_rate": 0.00015416888704038265, + "loss": 1.3637, + "step": 17647 + }, + { + "epoch": 0.2293277110273324, + "grad_norm": 0.3411755859851837, + "learning_rate": 0.00015416628757847128, + "loss": 1.4937, + "step": 17648 + }, + { + "epoch": 0.22934070557124828, + "grad_norm": 0.38665154576301575, + "learning_rate": 0.00015416368811655987, + "loss": 1.3832, + "step": 17649 + }, + { + "epoch": 0.22935370011516415, + "grad_norm": 0.44277629256248474, + "learning_rate": 0.0001541610886546485, + "loss": 1.4414, + "step": 17650 + }, + { + "epoch": 0.22936669465908002, + "grad_norm": 0.36975422501564026, + "learning_rate": 0.00015415848919273712, + "loss": 1.4096, + "step": 17651 + }, + { + "epoch": 0.2293796892029959, + "grad_norm": 0.3800652325153351, + "learning_rate": 0.00015415588973082572, + "loss": 1.3322, + "step": 17652 + }, + { + "epoch": 0.22939268374691177, + "grad_norm": 0.4465552866458893, + "learning_rate": 0.00015415329026891435, + "loss": 1.4889, + "step": 17653 + }, + { + "epoch": 0.22940567829082764, + "grad_norm": 0.41041141748428345, + "learning_rate": 0.00015415069080700294, + "loss": 1.4372, + "step": 17654 + }, + { + "epoch": 0.22941867283474351, + "grad_norm": 0.39620378613471985, + "learning_rate": 0.0001541480913450916, + "loss": 1.4789, + "step": 17655 + }, + { + "epoch": 0.2294316673786594, + "grad_norm": 0.446790486574173, + "learning_rate": 0.0001541454918831802, + "loss": 1.2575, + "step": 17656 + }, + { + "epoch": 0.22944466192257526, + "grad_norm": 0.46105313301086426, + "learning_rate": 0.0001541428924212688, + "loss": 1.5288, + "step": 17657 + }, + { + "epoch": 0.22945765646649113, + "grad_norm": 0.34282946586608887, + "learning_rate": 0.00015414029295935741, + "loss": 1.4353, + "step": 17658 + }, + { + "epoch": 0.229470651010407, + "grad_norm": 0.4200456440448761, + "learning_rate": 0.00015413769349744604, + "loss": 1.3926, + "step": 17659 + }, + { + "epoch": 0.22948364555432288, + "grad_norm": 0.45076262950897217, + "learning_rate": 0.00015413509403553466, + "loss": 1.4815, + "step": 17660 + }, + { + "epoch": 0.22949664009823875, + "grad_norm": 0.4466899633407593, + "learning_rate": 0.00015413249457362326, + "loss": 1.4514, + "step": 17661 + }, + { + "epoch": 0.22950963464215463, + "grad_norm": 0.35786283016204834, + "learning_rate": 0.00015412989511171188, + "loss": 1.4671, + "step": 17662 + }, + { + "epoch": 0.2295226291860705, + "grad_norm": 0.38174551725387573, + "learning_rate": 0.0001541272956498005, + "loss": 1.5623, + "step": 17663 + }, + { + "epoch": 0.22953562372998637, + "grad_norm": 0.37272652983665466, + "learning_rate": 0.0001541246961878891, + "loss": 1.4073, + "step": 17664 + }, + { + "epoch": 0.22954861827390224, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.00015412209672597773, + "loss": 1.5061, + "step": 17665 + }, + { + "epoch": 0.22956161281781812, + "grad_norm": 0.40937644243240356, + "learning_rate": 0.00015411949726406633, + "loss": 1.4206, + "step": 17666 + }, + { + "epoch": 0.229574607361734, + "grad_norm": 0.4650891125202179, + "learning_rate": 0.00015411689780215498, + "loss": 1.4561, + "step": 17667 + }, + { + "epoch": 0.22958760190564986, + "grad_norm": 0.4773378372192383, + "learning_rate": 0.00015411429834024358, + "loss": 1.4002, + "step": 17668 + }, + { + "epoch": 0.22960059644956574, + "grad_norm": 0.4365747272968292, + "learning_rate": 0.0001541116988783322, + "loss": 1.2941, + "step": 17669 + }, + { + "epoch": 0.2296135909934816, + "grad_norm": 0.4797625243663788, + "learning_rate": 0.0001541090994164208, + "loss": 1.549, + "step": 17670 + }, + { + "epoch": 0.22962658553739748, + "grad_norm": 0.4469527006149292, + "learning_rate": 0.00015410649995450942, + "loss": 1.5086, + "step": 17671 + }, + { + "epoch": 0.22963958008131335, + "grad_norm": 0.374441921710968, + "learning_rate": 0.00015410390049259805, + "loss": 1.4955, + "step": 17672 + }, + { + "epoch": 0.22965257462522923, + "grad_norm": 0.3637180030345917, + "learning_rate": 0.00015410130103068665, + "loss": 1.448, + "step": 17673 + }, + { + "epoch": 0.2296655691691451, + "grad_norm": 0.3801933228969574, + "learning_rate": 0.00015409870156877527, + "loss": 1.3988, + "step": 17674 + }, + { + "epoch": 0.22967856371306097, + "grad_norm": 0.3234044909477234, + "learning_rate": 0.0001540961021068639, + "loss": 1.444, + "step": 17675 + }, + { + "epoch": 0.22969155825697685, + "grad_norm": 0.33238109946250916, + "learning_rate": 0.0001540935026449525, + "loss": 1.4295, + "step": 17676 + }, + { + "epoch": 0.22970455280089272, + "grad_norm": 0.369138240814209, + "learning_rate": 0.00015409090318304112, + "loss": 1.5526, + "step": 17677 + }, + { + "epoch": 0.2297175473448086, + "grad_norm": 0.32458943128585815, + "learning_rate": 0.00015408830372112971, + "loss": 1.2806, + "step": 17678 + }, + { + "epoch": 0.22973054188872447, + "grad_norm": 0.3393263816833496, + "learning_rate": 0.00015408570425921837, + "loss": 1.5087, + "step": 17679 + }, + { + "epoch": 0.22974353643264034, + "grad_norm": 0.40407025814056396, + "learning_rate": 0.00015408310479730696, + "loss": 1.4372, + "step": 17680 + }, + { + "epoch": 0.2297565309765562, + "grad_norm": 0.37247687578201294, + "learning_rate": 0.0001540805053353956, + "loss": 1.4607, + "step": 17681 + }, + { + "epoch": 0.22976952552047208, + "grad_norm": 0.5529502034187317, + "learning_rate": 0.00015407790587348418, + "loss": 1.3671, + "step": 17682 + }, + { + "epoch": 0.22978252006438796, + "grad_norm": 0.4205712378025055, + "learning_rate": 0.0001540753064115728, + "loss": 1.4083, + "step": 17683 + }, + { + "epoch": 0.22979551460830383, + "grad_norm": 0.3608722686767578, + "learning_rate": 0.00015407270694966143, + "loss": 1.2233, + "step": 17684 + }, + { + "epoch": 0.2298085091522197, + "grad_norm": 0.47460004687309265, + "learning_rate": 0.00015407010748775003, + "loss": 1.2779, + "step": 17685 + }, + { + "epoch": 0.22982150369613558, + "grad_norm": 0.3191768229007721, + "learning_rate": 0.00015406750802583868, + "loss": 1.2741, + "step": 17686 + }, + { + "epoch": 0.22983449824005145, + "grad_norm": 0.42855381965637207, + "learning_rate": 0.00015406490856392728, + "loss": 1.378, + "step": 17687 + }, + { + "epoch": 0.22984749278396732, + "grad_norm": 0.354781836271286, + "learning_rate": 0.00015406230910201588, + "loss": 1.5386, + "step": 17688 + }, + { + "epoch": 0.2298604873278832, + "grad_norm": 0.39844000339508057, + "learning_rate": 0.0001540597096401045, + "loss": 1.3719, + "step": 17689 + }, + { + "epoch": 0.22987348187179907, + "grad_norm": 0.42703622579574585, + "learning_rate": 0.00015405711017819313, + "loss": 1.4384, + "step": 17690 + }, + { + "epoch": 0.22988647641571494, + "grad_norm": 0.38441357016563416, + "learning_rate": 0.00015405451071628175, + "loss": 1.2483, + "step": 17691 + }, + { + "epoch": 0.22989947095963081, + "grad_norm": 0.31359657645225525, + "learning_rate": 0.00015405191125437035, + "loss": 1.4259, + "step": 17692 + }, + { + "epoch": 0.2299124655035467, + "grad_norm": 0.3714214861392975, + "learning_rate": 0.00015404931179245897, + "loss": 1.3477, + "step": 17693 + }, + { + "epoch": 0.22992546004746256, + "grad_norm": 0.477580726146698, + "learning_rate": 0.0001540467123305476, + "loss": 1.4587, + "step": 17694 + }, + { + "epoch": 0.22993845459137843, + "grad_norm": 0.5337256789207458, + "learning_rate": 0.0001540441128686362, + "loss": 1.5064, + "step": 17695 + }, + { + "epoch": 0.2299514491352943, + "grad_norm": 0.41329288482666016, + "learning_rate": 0.00015404151340672482, + "loss": 1.2234, + "step": 17696 + }, + { + "epoch": 0.22996444367921018, + "grad_norm": 0.4516674280166626, + "learning_rate": 0.00015403891394481342, + "loss": 1.2709, + "step": 17697 + }, + { + "epoch": 0.22997743822312605, + "grad_norm": 0.4006953537464142, + "learning_rate": 0.00015403631448290207, + "loss": 1.4849, + "step": 17698 + }, + { + "epoch": 0.22999043276704192, + "grad_norm": 0.4751393496990204, + "learning_rate": 0.00015403371502099067, + "loss": 1.3091, + "step": 17699 + }, + { + "epoch": 0.23000342731095783, + "grad_norm": 0.3570221960544586, + "learning_rate": 0.00015403111555907926, + "loss": 1.4614, + "step": 17700 + }, + { + "epoch": 0.2300164218548737, + "grad_norm": 0.5387532114982605, + "learning_rate": 0.0001540285160971679, + "loss": 1.5955, + "step": 17701 + }, + { + "epoch": 0.23002941639878957, + "grad_norm": 0.32554253935813904, + "learning_rate": 0.0001540259166352565, + "loss": 1.3071, + "step": 17702 + }, + { + "epoch": 0.23004241094270544, + "grad_norm": 0.42064937949180603, + "learning_rate": 0.00015402331717334514, + "loss": 1.5228, + "step": 17703 + }, + { + "epoch": 0.23005540548662132, + "grad_norm": 0.3914891481399536, + "learning_rate": 0.00015402071771143373, + "loss": 1.4626, + "step": 17704 + }, + { + "epoch": 0.2300684000305372, + "grad_norm": 0.4852510094642639, + "learning_rate": 0.00015401811824952236, + "loss": 1.5935, + "step": 17705 + }, + { + "epoch": 0.23008139457445306, + "grad_norm": 0.4209222197532654, + "learning_rate": 0.00015401551878761098, + "loss": 1.3421, + "step": 17706 + }, + { + "epoch": 0.23009438911836894, + "grad_norm": 0.2824123501777649, + "learning_rate": 0.00015401291932569958, + "loss": 1.2957, + "step": 17707 + }, + { + "epoch": 0.2301073836622848, + "grad_norm": 0.30800673365592957, + "learning_rate": 0.0001540103198637882, + "loss": 1.0889, + "step": 17708 + }, + { + "epoch": 0.23012037820620068, + "grad_norm": 0.551593542098999, + "learning_rate": 0.0001540077204018768, + "loss": 1.4372, + "step": 17709 + }, + { + "epoch": 0.23013337275011655, + "grad_norm": 0.47093117237091064, + "learning_rate": 0.00015400512093996545, + "loss": 1.4836, + "step": 17710 + }, + { + "epoch": 0.23014636729403243, + "grad_norm": 0.34184256196022034, + "learning_rate": 0.00015400252147805405, + "loss": 1.389, + "step": 17711 + }, + { + "epoch": 0.2301593618379483, + "grad_norm": 0.41638702154159546, + "learning_rate": 0.00015399992201614265, + "loss": 1.3359, + "step": 17712 + }, + { + "epoch": 0.23017235638186417, + "grad_norm": 0.35902711749076843, + "learning_rate": 0.00015399732255423127, + "loss": 1.6393, + "step": 17713 + }, + { + "epoch": 0.23018535092578005, + "grad_norm": 0.4319157898426056, + "learning_rate": 0.0001539947230923199, + "loss": 1.4951, + "step": 17714 + }, + { + "epoch": 0.23019834546969592, + "grad_norm": 0.3503279387950897, + "learning_rate": 0.00015399212363040852, + "loss": 1.554, + "step": 17715 + }, + { + "epoch": 0.2302113400136118, + "grad_norm": 0.47127795219421387, + "learning_rate": 0.00015398952416849712, + "loss": 1.5118, + "step": 17716 + }, + { + "epoch": 0.23022433455752767, + "grad_norm": 0.4257420599460602, + "learning_rate": 0.00015398692470658574, + "loss": 1.3695, + "step": 17717 + }, + { + "epoch": 0.23023732910144354, + "grad_norm": 0.4545004963874817, + "learning_rate": 0.00015398432524467437, + "loss": 1.459, + "step": 17718 + }, + { + "epoch": 0.2302503236453594, + "grad_norm": 0.34622564911842346, + "learning_rate": 0.00015398172578276297, + "loss": 1.2835, + "step": 17719 + }, + { + "epoch": 0.23026331818927528, + "grad_norm": 0.3949187695980072, + "learning_rate": 0.0001539791263208516, + "loss": 1.3423, + "step": 17720 + }, + { + "epoch": 0.23027631273319116, + "grad_norm": 0.4023889899253845, + "learning_rate": 0.00015397652685894021, + "loss": 1.399, + "step": 17721 + }, + { + "epoch": 0.23028930727710703, + "grad_norm": 0.34029367566108704, + "learning_rate": 0.00015397392739702884, + "loss": 1.2726, + "step": 17722 + }, + { + "epoch": 0.2303023018210229, + "grad_norm": 0.386578232049942, + "learning_rate": 0.00015397132793511744, + "loss": 1.3364, + "step": 17723 + }, + { + "epoch": 0.23031529636493878, + "grad_norm": 0.34251466393470764, + "learning_rate": 0.00015396872847320606, + "loss": 1.2247, + "step": 17724 + }, + { + "epoch": 0.23032829090885465, + "grad_norm": 0.4175904393196106, + "learning_rate": 0.00015396612901129469, + "loss": 1.567, + "step": 17725 + }, + { + "epoch": 0.23034128545277052, + "grad_norm": 0.448779433965683, + "learning_rate": 0.00015396352954938328, + "loss": 1.4457, + "step": 17726 + }, + { + "epoch": 0.2303542799966864, + "grad_norm": 0.40709659457206726, + "learning_rate": 0.0001539609300874719, + "loss": 1.4649, + "step": 17727 + }, + { + "epoch": 0.23036727454060227, + "grad_norm": 0.3895239531993866, + "learning_rate": 0.0001539583306255605, + "loss": 1.3916, + "step": 17728 + }, + { + "epoch": 0.23038026908451814, + "grad_norm": 0.41733086109161377, + "learning_rate": 0.00015395573116364913, + "loss": 1.5186, + "step": 17729 + }, + { + "epoch": 0.23039326362843401, + "grad_norm": 0.5264485478401184, + "learning_rate": 0.00015395313170173775, + "loss": 1.5917, + "step": 17730 + }, + { + "epoch": 0.2304062581723499, + "grad_norm": 0.3452605605125427, + "learning_rate": 0.00015395053223982635, + "loss": 1.4038, + "step": 17731 + }, + { + "epoch": 0.23041925271626576, + "grad_norm": 0.48457321524620056, + "learning_rate": 0.00015394793277791498, + "loss": 1.4106, + "step": 17732 + }, + { + "epoch": 0.23043224726018163, + "grad_norm": 0.4452970027923584, + "learning_rate": 0.0001539453333160036, + "loss": 1.5444, + "step": 17733 + }, + { + "epoch": 0.2304452418040975, + "grad_norm": 0.33489036560058594, + "learning_rate": 0.00015394273385409222, + "loss": 1.4237, + "step": 17734 + }, + { + "epoch": 0.23045823634801338, + "grad_norm": 0.4045279324054718, + "learning_rate": 0.00015394013439218082, + "loss": 1.4664, + "step": 17735 + }, + { + "epoch": 0.23047123089192925, + "grad_norm": 0.48704978823661804, + "learning_rate": 0.00015393753493026945, + "loss": 1.5603, + "step": 17736 + }, + { + "epoch": 0.23048422543584512, + "grad_norm": 0.4264164865016937, + "learning_rate": 0.00015393493546835807, + "loss": 1.4007, + "step": 17737 + }, + { + "epoch": 0.230497219979761, + "grad_norm": 0.40201908349990845, + "learning_rate": 0.00015393233600644667, + "loss": 1.4843, + "step": 17738 + }, + { + "epoch": 0.23051021452367687, + "grad_norm": 0.4726935923099518, + "learning_rate": 0.0001539297365445353, + "loss": 1.6548, + "step": 17739 + }, + { + "epoch": 0.23052320906759274, + "grad_norm": 0.3787649869918823, + "learning_rate": 0.0001539271370826239, + "loss": 1.2578, + "step": 17740 + }, + { + "epoch": 0.23053620361150862, + "grad_norm": 0.5269920825958252, + "learning_rate": 0.00015392453762071251, + "loss": 1.3792, + "step": 17741 + }, + { + "epoch": 0.2305491981554245, + "grad_norm": 0.2908004820346832, + "learning_rate": 0.00015392193815880114, + "loss": 1.1771, + "step": 17742 + }, + { + "epoch": 0.23056219269934036, + "grad_norm": 0.45666834712028503, + "learning_rate": 0.00015391933869688974, + "loss": 1.3636, + "step": 17743 + }, + { + "epoch": 0.23057518724325624, + "grad_norm": 0.32404395937919617, + "learning_rate": 0.00015391673923497836, + "loss": 1.3183, + "step": 17744 + }, + { + "epoch": 0.2305881817871721, + "grad_norm": 0.3777819871902466, + "learning_rate": 0.00015391413977306699, + "loss": 1.6317, + "step": 17745 + }, + { + "epoch": 0.23060117633108798, + "grad_norm": 0.39990320801734924, + "learning_rate": 0.0001539115403111556, + "loss": 1.5315, + "step": 17746 + }, + { + "epoch": 0.23061417087500385, + "grad_norm": 0.4379133880138397, + "learning_rate": 0.0001539089408492442, + "loss": 1.329, + "step": 17747 + }, + { + "epoch": 0.23062716541891973, + "grad_norm": 0.2936893105506897, + "learning_rate": 0.00015390634138733283, + "loss": 1.2154, + "step": 17748 + }, + { + "epoch": 0.2306401599628356, + "grad_norm": 0.29721030592918396, + "learning_rate": 0.00015390374192542146, + "loss": 1.3616, + "step": 17749 + }, + { + "epoch": 0.23065315450675147, + "grad_norm": 0.43537285923957825, + "learning_rate": 0.00015390114246351005, + "loss": 1.407, + "step": 17750 + }, + { + "epoch": 0.23066614905066735, + "grad_norm": 0.3409595489501953, + "learning_rate": 0.00015389854300159868, + "loss": 1.3873, + "step": 17751 + }, + { + "epoch": 0.23067914359458322, + "grad_norm": 0.5316119194030762, + "learning_rate": 0.00015389594353968728, + "loss": 1.6667, + "step": 17752 + }, + { + "epoch": 0.2306921381384991, + "grad_norm": 0.3834903836250305, + "learning_rate": 0.00015389334407777593, + "loss": 1.5414, + "step": 17753 + }, + { + "epoch": 0.23070513268241497, + "grad_norm": 0.41867783665657043, + "learning_rate": 0.00015389074461586452, + "loss": 1.4372, + "step": 17754 + }, + { + "epoch": 0.23071812722633084, + "grad_norm": 0.3815166652202606, + "learning_rate": 0.00015388814515395312, + "loss": 1.2776, + "step": 17755 + }, + { + "epoch": 0.2307311217702467, + "grad_norm": 0.38519179821014404, + "learning_rate": 0.00015388554569204175, + "loss": 1.4389, + "step": 17756 + }, + { + "epoch": 0.23074411631416258, + "grad_norm": 0.3920106887817383, + "learning_rate": 0.00015388294623013037, + "loss": 1.5267, + "step": 17757 + }, + { + "epoch": 0.23075711085807846, + "grad_norm": 0.36666139960289, + "learning_rate": 0.000153880346768219, + "loss": 1.5599, + "step": 17758 + }, + { + "epoch": 0.23077010540199433, + "grad_norm": 0.38227421045303345, + "learning_rate": 0.0001538777473063076, + "loss": 1.4345, + "step": 17759 + }, + { + "epoch": 0.2307830999459102, + "grad_norm": 0.44606131315231323, + "learning_rate": 0.00015387514784439622, + "loss": 1.4597, + "step": 17760 + }, + { + "epoch": 0.23079609448982608, + "grad_norm": 0.39926958084106445, + "learning_rate": 0.00015387254838248484, + "loss": 1.3754, + "step": 17761 + }, + { + "epoch": 0.23080908903374195, + "grad_norm": 0.39020079374313354, + "learning_rate": 0.00015386994892057344, + "loss": 1.4708, + "step": 17762 + }, + { + "epoch": 0.23082208357765782, + "grad_norm": 0.23276223242282867, + "learning_rate": 0.00015386734945866206, + "loss": 1.377, + "step": 17763 + }, + { + "epoch": 0.2308350781215737, + "grad_norm": 0.44530966877937317, + "learning_rate": 0.0001538647499967507, + "loss": 1.4262, + "step": 17764 + }, + { + "epoch": 0.23084807266548957, + "grad_norm": 0.3683391213417053, + "learning_rate": 0.0001538621505348393, + "loss": 1.5455, + "step": 17765 + }, + { + "epoch": 0.23086106720940544, + "grad_norm": 0.42391595244407654, + "learning_rate": 0.0001538595510729279, + "loss": 1.3392, + "step": 17766 + }, + { + "epoch": 0.2308740617533213, + "grad_norm": 0.48458898067474365, + "learning_rate": 0.0001538569516110165, + "loss": 1.3953, + "step": 17767 + }, + { + "epoch": 0.2308870562972372, + "grad_norm": 0.31297212839126587, + "learning_rate": 0.00015385435214910516, + "loss": 1.397, + "step": 17768 + }, + { + "epoch": 0.23090005084115306, + "grad_norm": 0.3093230128288269, + "learning_rate": 0.00015385175268719376, + "loss": 1.3708, + "step": 17769 + }, + { + "epoch": 0.23091304538506893, + "grad_norm": 0.5038252472877502, + "learning_rate": 0.00015384915322528238, + "loss": 1.4602, + "step": 17770 + }, + { + "epoch": 0.2309260399289848, + "grad_norm": 0.3663513958454132, + "learning_rate": 0.00015384655376337098, + "loss": 1.2451, + "step": 17771 + }, + { + "epoch": 0.23093903447290068, + "grad_norm": 0.3613506555557251, + "learning_rate": 0.0001538439543014596, + "loss": 1.4721, + "step": 17772 + }, + { + "epoch": 0.23095202901681655, + "grad_norm": 0.4875245690345764, + "learning_rate": 0.00015384135483954823, + "loss": 1.4681, + "step": 17773 + }, + { + "epoch": 0.23096502356073242, + "grad_norm": 0.3016203045845032, + "learning_rate": 0.00015383875537763682, + "loss": 1.3883, + "step": 17774 + }, + { + "epoch": 0.2309780181046483, + "grad_norm": 0.39854696393013, + "learning_rate": 0.00015383615591572545, + "loss": 1.379, + "step": 17775 + }, + { + "epoch": 0.2309910126485642, + "grad_norm": 0.43419399857521057, + "learning_rate": 0.00015383355645381407, + "loss": 1.3976, + "step": 17776 + }, + { + "epoch": 0.23100400719248007, + "grad_norm": 0.31253641843795776, + "learning_rate": 0.0001538309569919027, + "loss": 1.2721, + "step": 17777 + }, + { + "epoch": 0.23101700173639594, + "grad_norm": 0.3926675319671631, + "learning_rate": 0.0001538283575299913, + "loss": 1.2644, + "step": 17778 + }, + { + "epoch": 0.23102999628031182, + "grad_norm": 0.3172212839126587, + "learning_rate": 0.00015382575806807992, + "loss": 1.385, + "step": 17779 + }, + { + "epoch": 0.2310429908242277, + "grad_norm": 0.5464864373207092, + "learning_rate": 0.00015382315860616854, + "loss": 1.4979, + "step": 17780 + }, + { + "epoch": 0.23105598536814356, + "grad_norm": 0.39636465907096863, + "learning_rate": 0.00015382055914425714, + "loss": 1.3413, + "step": 17781 + }, + { + "epoch": 0.23106897991205944, + "grad_norm": 0.31318336725234985, + "learning_rate": 0.00015381795968234577, + "loss": 1.3443, + "step": 17782 + }, + { + "epoch": 0.2310819744559753, + "grad_norm": 0.4878489375114441, + "learning_rate": 0.00015381536022043436, + "loss": 1.4212, + "step": 17783 + }, + { + "epoch": 0.23109496899989118, + "grad_norm": 0.43249329924583435, + "learning_rate": 0.000153812760758523, + "loss": 1.2095, + "step": 17784 + }, + { + "epoch": 0.23110796354380705, + "grad_norm": 0.3974393904209137, + "learning_rate": 0.0001538101612966116, + "loss": 1.4415, + "step": 17785 + }, + { + "epoch": 0.23112095808772293, + "grad_norm": 0.41968485713005066, + "learning_rate": 0.0001538075618347002, + "loss": 1.3609, + "step": 17786 + }, + { + "epoch": 0.2311339526316388, + "grad_norm": 0.3287057876586914, + "learning_rate": 0.00015380496237278883, + "loss": 1.3519, + "step": 17787 + }, + { + "epoch": 0.23114694717555467, + "grad_norm": 0.35635054111480713, + "learning_rate": 0.00015380236291087746, + "loss": 1.4472, + "step": 17788 + }, + { + "epoch": 0.23115994171947055, + "grad_norm": 0.4138796031475067, + "learning_rate": 0.00015379976344896608, + "loss": 1.3219, + "step": 17789 + }, + { + "epoch": 0.23117293626338642, + "grad_norm": 0.7689157128334045, + "learning_rate": 0.00015379716398705468, + "loss": 1.4017, + "step": 17790 + }, + { + "epoch": 0.2311859308073023, + "grad_norm": 0.5234813094139099, + "learning_rate": 0.0001537945645251433, + "loss": 1.5614, + "step": 17791 + }, + { + "epoch": 0.23119892535121817, + "grad_norm": 0.3792448937892914, + "learning_rate": 0.00015379196506323193, + "loss": 1.3483, + "step": 17792 + }, + { + "epoch": 0.23121191989513404, + "grad_norm": 0.39990630745887756, + "learning_rate": 0.00015378936560132053, + "loss": 1.3536, + "step": 17793 + }, + { + "epoch": 0.2312249144390499, + "grad_norm": 0.3721769452095032, + "learning_rate": 0.00015378676613940915, + "loss": 1.2774, + "step": 17794 + }, + { + "epoch": 0.23123790898296578, + "grad_norm": 0.33774635195732117, + "learning_rate": 0.00015378416667749778, + "loss": 1.3486, + "step": 17795 + }, + { + "epoch": 0.23125090352688166, + "grad_norm": 0.3447568416595459, + "learning_rate": 0.00015378156721558637, + "loss": 1.5181, + "step": 17796 + }, + { + "epoch": 0.23126389807079753, + "grad_norm": 0.34187808632850647, + "learning_rate": 0.000153778967753675, + "loss": 1.2728, + "step": 17797 + }, + { + "epoch": 0.2312768926147134, + "grad_norm": 0.37070977687835693, + "learning_rate": 0.0001537763682917636, + "loss": 1.5407, + "step": 17798 + }, + { + "epoch": 0.23128988715862928, + "grad_norm": 0.3658391237258911, + "learning_rate": 0.00015377376882985225, + "loss": 1.339, + "step": 17799 + }, + { + "epoch": 0.23130288170254515, + "grad_norm": 0.37719231843948364, + "learning_rate": 0.00015377116936794084, + "loss": 1.349, + "step": 17800 + }, + { + "epoch": 0.23131587624646102, + "grad_norm": 0.45412755012512207, + "learning_rate": 0.00015376856990602947, + "loss": 1.4136, + "step": 17801 + }, + { + "epoch": 0.2313288707903769, + "grad_norm": 0.31934070587158203, + "learning_rate": 0.00015376597044411807, + "loss": 1.2899, + "step": 17802 + }, + { + "epoch": 0.23134186533429277, + "grad_norm": 0.35606926679611206, + "learning_rate": 0.0001537633709822067, + "loss": 1.4105, + "step": 17803 + }, + { + "epoch": 0.23135485987820864, + "grad_norm": 0.5676253437995911, + "learning_rate": 0.00015376077152029531, + "loss": 1.5117, + "step": 17804 + }, + { + "epoch": 0.2313678544221245, + "grad_norm": 0.3563275635242462, + "learning_rate": 0.0001537581720583839, + "loss": 1.6511, + "step": 17805 + }, + { + "epoch": 0.2313808489660404, + "grad_norm": 0.3710525929927826, + "learning_rate": 0.00015375557259647254, + "loss": 1.4105, + "step": 17806 + }, + { + "epoch": 0.23139384350995626, + "grad_norm": 0.43689870834350586, + "learning_rate": 0.00015375297313456116, + "loss": 1.4626, + "step": 17807 + }, + { + "epoch": 0.23140683805387213, + "grad_norm": 0.42645755410194397, + "learning_rate": 0.00015375037367264979, + "loss": 1.4595, + "step": 17808 + }, + { + "epoch": 0.231419832597788, + "grad_norm": 0.41030409932136536, + "learning_rate": 0.00015374777421073838, + "loss": 1.3959, + "step": 17809 + }, + { + "epoch": 0.23143282714170388, + "grad_norm": 0.39593061804771423, + "learning_rate": 0.00015374517474882698, + "loss": 1.4249, + "step": 17810 + }, + { + "epoch": 0.23144582168561975, + "grad_norm": 0.3445868492126465, + "learning_rate": 0.00015374257528691563, + "loss": 1.211, + "step": 17811 + }, + { + "epoch": 0.23145881622953562, + "grad_norm": 0.386158287525177, + "learning_rate": 0.00015373997582500423, + "loss": 1.4471, + "step": 17812 + }, + { + "epoch": 0.2314718107734515, + "grad_norm": 0.3330208361148834, + "learning_rate": 0.00015373737636309285, + "loss": 1.1957, + "step": 17813 + }, + { + "epoch": 0.23148480531736737, + "grad_norm": 0.4697929918766022, + "learning_rate": 0.00015373477690118145, + "loss": 1.3807, + "step": 17814 + }, + { + "epoch": 0.23149779986128324, + "grad_norm": 0.4216265082359314, + "learning_rate": 0.00015373217743927008, + "loss": 1.4217, + "step": 17815 + }, + { + "epoch": 0.23151079440519912, + "grad_norm": 0.4688165485858917, + "learning_rate": 0.0001537295779773587, + "loss": 1.2106, + "step": 17816 + }, + { + "epoch": 0.231523788949115, + "grad_norm": 0.4179801344871521, + "learning_rate": 0.0001537269785154473, + "loss": 1.4094, + "step": 17817 + }, + { + "epoch": 0.23153678349303086, + "grad_norm": 0.3999471664428711, + "learning_rate": 0.00015372437905353592, + "loss": 1.3621, + "step": 17818 + }, + { + "epoch": 0.23154977803694674, + "grad_norm": 0.34358906745910645, + "learning_rate": 0.00015372177959162455, + "loss": 1.5047, + "step": 17819 + }, + { + "epoch": 0.2315627725808626, + "grad_norm": 0.3317813277244568, + "learning_rate": 0.00015371918012971317, + "loss": 1.2093, + "step": 17820 + }, + { + "epoch": 0.23157576712477848, + "grad_norm": 0.49148693680763245, + "learning_rate": 0.00015371658066780177, + "loss": 1.4874, + "step": 17821 + }, + { + "epoch": 0.23158876166869435, + "grad_norm": 0.40034812688827515, + "learning_rate": 0.00015371398120589037, + "loss": 1.6476, + "step": 17822 + }, + { + "epoch": 0.23160175621261023, + "grad_norm": 0.2973600924015045, + "learning_rate": 0.00015371138174397902, + "loss": 1.102, + "step": 17823 + }, + { + "epoch": 0.2316147507565261, + "grad_norm": 0.46445170044898987, + "learning_rate": 0.00015370878228206761, + "loss": 1.4162, + "step": 17824 + }, + { + "epoch": 0.23162774530044197, + "grad_norm": 0.5100994110107422, + "learning_rate": 0.00015370618282015624, + "loss": 1.6362, + "step": 17825 + }, + { + "epoch": 0.23164073984435785, + "grad_norm": 0.36702030897140503, + "learning_rate": 0.00015370358335824484, + "loss": 1.3632, + "step": 17826 + }, + { + "epoch": 0.23165373438827372, + "grad_norm": 0.3670720160007477, + "learning_rate": 0.00015370098389633346, + "loss": 1.3551, + "step": 17827 + }, + { + "epoch": 0.2316667289321896, + "grad_norm": 0.37104499340057373, + "learning_rate": 0.00015369838443442209, + "loss": 1.3752, + "step": 17828 + }, + { + "epoch": 0.23167972347610546, + "grad_norm": 0.443123459815979, + "learning_rate": 0.00015369578497251068, + "loss": 1.4786, + "step": 17829 + }, + { + "epoch": 0.23169271802002134, + "grad_norm": 0.337756872177124, + "learning_rate": 0.0001536931855105993, + "loss": 1.3264, + "step": 17830 + }, + { + "epoch": 0.2317057125639372, + "grad_norm": 0.3932908773422241, + "learning_rate": 0.00015369058604868793, + "loss": 1.2971, + "step": 17831 + }, + { + "epoch": 0.23171870710785308, + "grad_norm": 0.425310879945755, + "learning_rate": 0.00015368798658677656, + "loss": 1.2991, + "step": 17832 + }, + { + "epoch": 0.23173170165176896, + "grad_norm": 0.3532455861568451, + "learning_rate": 0.00015368538712486515, + "loss": 1.4057, + "step": 17833 + }, + { + "epoch": 0.23174469619568483, + "grad_norm": 0.3757370114326477, + "learning_rate": 0.00015368278766295378, + "loss": 1.2325, + "step": 17834 + }, + { + "epoch": 0.2317576907396007, + "grad_norm": 0.4334326982498169, + "learning_rate": 0.0001536801882010424, + "loss": 1.5564, + "step": 17835 + }, + { + "epoch": 0.23177068528351658, + "grad_norm": 0.34158578515052795, + "learning_rate": 0.000153677588739131, + "loss": 1.3843, + "step": 17836 + }, + { + "epoch": 0.23178367982743245, + "grad_norm": 0.35654598474502563, + "learning_rate": 0.00015367498927721962, + "loss": 1.3741, + "step": 17837 + }, + { + "epoch": 0.23179667437134832, + "grad_norm": 0.453436017036438, + "learning_rate": 0.00015367238981530825, + "loss": 1.2056, + "step": 17838 + }, + { + "epoch": 0.2318096689152642, + "grad_norm": 0.4400738775730133, + "learning_rate": 0.00015366979035339685, + "loss": 1.5811, + "step": 17839 + }, + { + "epoch": 0.23182266345918007, + "grad_norm": 0.4141646921634674, + "learning_rate": 0.00015366719089148547, + "loss": 1.4859, + "step": 17840 + }, + { + "epoch": 0.23183565800309594, + "grad_norm": 0.39764395356178284, + "learning_rate": 0.00015366459142957407, + "loss": 1.4994, + "step": 17841 + }, + { + "epoch": 0.2318486525470118, + "grad_norm": 0.45021700859069824, + "learning_rate": 0.00015366199196766272, + "loss": 1.7234, + "step": 17842 + }, + { + "epoch": 0.23186164709092769, + "grad_norm": 0.40634238719940186, + "learning_rate": 0.00015365939250575132, + "loss": 1.2821, + "step": 17843 + }, + { + "epoch": 0.23187464163484356, + "grad_norm": 0.40197062492370605, + "learning_rate": 0.00015365679304383994, + "loss": 1.3954, + "step": 17844 + }, + { + "epoch": 0.23188763617875943, + "grad_norm": 0.4001365303993225, + "learning_rate": 0.00015365419358192854, + "loss": 1.2962, + "step": 17845 + }, + { + "epoch": 0.2319006307226753, + "grad_norm": 0.3134896457195282, + "learning_rate": 0.00015365159412001716, + "loss": 1.4198, + "step": 17846 + }, + { + "epoch": 0.23191362526659118, + "grad_norm": 0.6061667203903198, + "learning_rate": 0.0001536489946581058, + "loss": 1.5099, + "step": 17847 + }, + { + "epoch": 0.23192661981050705, + "grad_norm": 0.3054920732975006, + "learning_rate": 0.00015364639519619439, + "loss": 1.4205, + "step": 17848 + }, + { + "epoch": 0.23193961435442292, + "grad_norm": 0.3513713777065277, + "learning_rate": 0.000153643795734283, + "loss": 1.5263, + "step": 17849 + }, + { + "epoch": 0.2319526088983388, + "grad_norm": 0.42762911319732666, + "learning_rate": 0.00015364119627237163, + "loss": 1.3243, + "step": 17850 + }, + { + "epoch": 0.23196560344225467, + "grad_norm": 0.3481937348842621, + "learning_rate": 0.00015363859681046023, + "loss": 1.2704, + "step": 17851 + }, + { + "epoch": 0.23197859798617057, + "grad_norm": 0.3869515359401703, + "learning_rate": 0.00015363599734854886, + "loss": 1.6046, + "step": 17852 + }, + { + "epoch": 0.23199159253008644, + "grad_norm": 0.4191778004169464, + "learning_rate": 0.00015363339788663745, + "loss": 1.517, + "step": 17853 + }, + { + "epoch": 0.23200458707400232, + "grad_norm": 0.3250775635242462, + "learning_rate": 0.0001536307984247261, + "loss": 1.3247, + "step": 17854 + }, + { + "epoch": 0.2320175816179182, + "grad_norm": 0.41848769783973694, + "learning_rate": 0.0001536281989628147, + "loss": 1.2927, + "step": 17855 + }, + { + "epoch": 0.23203057616183406, + "grad_norm": 0.3696177899837494, + "learning_rate": 0.00015362559950090333, + "loss": 1.4659, + "step": 17856 + }, + { + "epoch": 0.23204357070574994, + "grad_norm": 0.33975252509117126, + "learning_rate": 0.00015362300003899192, + "loss": 1.2339, + "step": 17857 + }, + { + "epoch": 0.2320565652496658, + "grad_norm": 0.4539058804512024, + "learning_rate": 0.00015362040057708055, + "loss": 1.5023, + "step": 17858 + }, + { + "epoch": 0.23206955979358168, + "grad_norm": 0.44186097383499146, + "learning_rate": 0.00015361780111516917, + "loss": 1.4831, + "step": 17859 + }, + { + "epoch": 0.23208255433749755, + "grad_norm": 0.38855695724487305, + "learning_rate": 0.00015361520165325777, + "loss": 1.4965, + "step": 17860 + }, + { + "epoch": 0.23209554888141343, + "grad_norm": 0.4631621539592743, + "learning_rate": 0.0001536126021913464, + "loss": 1.4102, + "step": 17861 + }, + { + "epoch": 0.2321085434253293, + "grad_norm": 0.39363187551498413, + "learning_rate": 0.00015361000272943502, + "loss": 1.4716, + "step": 17862 + }, + { + "epoch": 0.23212153796924517, + "grad_norm": 0.24282239377498627, + "learning_rate": 0.00015360740326752364, + "loss": 1.2962, + "step": 17863 + }, + { + "epoch": 0.23213453251316105, + "grad_norm": 0.3153238594532013, + "learning_rate": 0.00015360480380561224, + "loss": 1.4539, + "step": 17864 + }, + { + "epoch": 0.23214752705707692, + "grad_norm": 0.3699541687965393, + "learning_rate": 0.00015360220434370084, + "loss": 1.4895, + "step": 17865 + }, + { + "epoch": 0.2321605216009928, + "grad_norm": 0.3706502914428711, + "learning_rate": 0.0001535996048817895, + "loss": 1.371, + "step": 17866 + }, + { + "epoch": 0.23217351614490866, + "grad_norm": 0.3946920335292816, + "learning_rate": 0.0001535970054198781, + "loss": 1.4146, + "step": 17867 + }, + { + "epoch": 0.23218651068882454, + "grad_norm": 0.2880970537662506, + "learning_rate": 0.0001535944059579667, + "loss": 1.1588, + "step": 17868 + }, + { + "epoch": 0.2321995052327404, + "grad_norm": 0.3923853635787964, + "learning_rate": 0.00015359180649605534, + "loss": 1.197, + "step": 17869 + }, + { + "epoch": 0.23221249977665628, + "grad_norm": 0.4441952705383301, + "learning_rate": 0.00015358920703414393, + "loss": 1.5486, + "step": 17870 + }, + { + "epoch": 0.23222549432057216, + "grad_norm": 0.38266226649284363, + "learning_rate": 0.00015358660757223256, + "loss": 1.4285, + "step": 17871 + }, + { + "epoch": 0.23223848886448803, + "grad_norm": 0.38642239570617676, + "learning_rate": 0.00015358400811032116, + "loss": 1.39, + "step": 17872 + }, + { + "epoch": 0.2322514834084039, + "grad_norm": 0.3625488579273224, + "learning_rate": 0.0001535814086484098, + "loss": 1.2688, + "step": 17873 + }, + { + "epoch": 0.23226447795231978, + "grad_norm": 0.39898431301116943, + "learning_rate": 0.0001535788091864984, + "loss": 1.6059, + "step": 17874 + }, + { + "epoch": 0.23227747249623565, + "grad_norm": 0.4230211675167084, + "learning_rate": 0.00015357620972458703, + "loss": 1.3784, + "step": 17875 + }, + { + "epoch": 0.23229046704015152, + "grad_norm": 0.4505596458911896, + "learning_rate": 0.00015357361026267563, + "loss": 1.3411, + "step": 17876 + }, + { + "epoch": 0.2323034615840674, + "grad_norm": 0.3540389835834503, + "learning_rate": 0.00015357101080076425, + "loss": 1.5181, + "step": 17877 + }, + { + "epoch": 0.23231645612798327, + "grad_norm": 0.43678078055381775, + "learning_rate": 0.00015356841133885288, + "loss": 1.5223, + "step": 17878 + }, + { + "epoch": 0.23232945067189914, + "grad_norm": 0.47371378540992737, + "learning_rate": 0.00015356581187694147, + "loss": 1.5503, + "step": 17879 + }, + { + "epoch": 0.232342445215815, + "grad_norm": 0.4325070381164551, + "learning_rate": 0.0001535632124150301, + "loss": 1.2801, + "step": 17880 + }, + { + "epoch": 0.2323554397597309, + "grad_norm": 0.4451591968536377, + "learning_rate": 0.00015356061295311872, + "loss": 1.4762, + "step": 17881 + }, + { + "epoch": 0.23236843430364676, + "grad_norm": 0.4021493196487427, + "learning_rate": 0.00015355801349120732, + "loss": 1.4245, + "step": 17882 + }, + { + "epoch": 0.23238142884756263, + "grad_norm": 0.3905608057975769, + "learning_rate": 0.00015355541402929594, + "loss": 1.4166, + "step": 17883 + }, + { + "epoch": 0.2323944233914785, + "grad_norm": 0.42811280488967896, + "learning_rate": 0.00015355281456738454, + "loss": 1.4641, + "step": 17884 + }, + { + "epoch": 0.23240741793539438, + "grad_norm": 0.4082183539867401, + "learning_rate": 0.0001535502151054732, + "loss": 1.4691, + "step": 17885 + }, + { + "epoch": 0.23242041247931025, + "grad_norm": 0.303067684173584, + "learning_rate": 0.0001535476156435618, + "loss": 1.3537, + "step": 17886 + }, + { + "epoch": 0.23243340702322612, + "grad_norm": 0.38688594102859497, + "learning_rate": 0.00015354501618165042, + "loss": 1.48, + "step": 17887 + }, + { + "epoch": 0.232446401567142, + "grad_norm": 0.341637521982193, + "learning_rate": 0.000153542416719739, + "loss": 1.3024, + "step": 17888 + }, + { + "epoch": 0.23245939611105787, + "grad_norm": 0.31752997636795044, + "learning_rate": 0.00015353981725782764, + "loss": 1.2497, + "step": 17889 + }, + { + "epoch": 0.23247239065497374, + "grad_norm": 0.471500426530838, + "learning_rate": 0.00015353721779591626, + "loss": 1.5345, + "step": 17890 + }, + { + "epoch": 0.23248538519888962, + "grad_norm": 0.28118324279785156, + "learning_rate": 0.00015353461833400486, + "loss": 1.46, + "step": 17891 + }, + { + "epoch": 0.2324983797428055, + "grad_norm": 0.39630448818206787, + "learning_rate": 0.00015353201887209348, + "loss": 1.5243, + "step": 17892 + }, + { + "epoch": 0.23251137428672136, + "grad_norm": 0.4049227833747864, + "learning_rate": 0.0001535294194101821, + "loss": 1.4443, + "step": 17893 + }, + { + "epoch": 0.23252436883063723, + "grad_norm": 0.42329803109169006, + "learning_rate": 0.0001535268199482707, + "loss": 1.5795, + "step": 17894 + }, + { + "epoch": 0.2325373633745531, + "grad_norm": 0.5804991722106934, + "learning_rate": 0.00015352422048635933, + "loss": 1.3983, + "step": 17895 + }, + { + "epoch": 0.23255035791846898, + "grad_norm": 0.436396062374115, + "learning_rate": 0.00015352162102444793, + "loss": 1.4881, + "step": 17896 + }, + { + "epoch": 0.23256335246238485, + "grad_norm": 0.4165444076061249, + "learning_rate": 0.00015351902156253658, + "loss": 1.2902, + "step": 17897 + }, + { + "epoch": 0.23257634700630073, + "grad_norm": 0.36505982279777527, + "learning_rate": 0.00015351642210062518, + "loss": 1.43, + "step": 17898 + }, + { + "epoch": 0.2325893415502166, + "grad_norm": 0.4033183157444, + "learning_rate": 0.0001535138226387138, + "loss": 1.267, + "step": 17899 + }, + { + "epoch": 0.23260233609413247, + "grad_norm": 0.37419673800468445, + "learning_rate": 0.0001535112231768024, + "loss": 1.3773, + "step": 17900 + }, + { + "epoch": 0.23261533063804835, + "grad_norm": 0.39067843556404114, + "learning_rate": 0.00015350862371489102, + "loss": 1.3313, + "step": 17901 + }, + { + "epoch": 0.23262832518196422, + "grad_norm": 0.3143513798713684, + "learning_rate": 0.00015350602425297965, + "loss": 1.2085, + "step": 17902 + }, + { + "epoch": 0.2326413197258801, + "grad_norm": 0.30038923025131226, + "learning_rate": 0.00015350342479106824, + "loss": 1.4399, + "step": 17903 + }, + { + "epoch": 0.23265431426979596, + "grad_norm": 0.42759689688682556, + "learning_rate": 0.00015350082532915687, + "loss": 1.4828, + "step": 17904 + }, + { + "epoch": 0.23266730881371184, + "grad_norm": 0.3189608156681061, + "learning_rate": 0.0001534982258672455, + "loss": 1.3064, + "step": 17905 + }, + { + "epoch": 0.2326803033576277, + "grad_norm": 0.43306416273117065, + "learning_rate": 0.0001534956264053341, + "loss": 1.4608, + "step": 17906 + }, + { + "epoch": 0.23269329790154358, + "grad_norm": 0.3596252202987671, + "learning_rate": 0.00015349302694342272, + "loss": 1.5733, + "step": 17907 + }, + { + "epoch": 0.23270629244545946, + "grad_norm": 0.39456039667129517, + "learning_rate": 0.00015349042748151134, + "loss": 1.5159, + "step": 17908 + }, + { + "epoch": 0.23271928698937533, + "grad_norm": 0.4954872131347656, + "learning_rate": 0.00015348782801959996, + "loss": 1.3347, + "step": 17909 + }, + { + "epoch": 0.2327322815332912, + "grad_norm": 0.36924368143081665, + "learning_rate": 0.00015348522855768856, + "loss": 1.6782, + "step": 17910 + }, + { + "epoch": 0.23274527607720708, + "grad_norm": 0.4221276044845581, + "learning_rate": 0.00015348262909577719, + "loss": 1.4704, + "step": 17911 + }, + { + "epoch": 0.23275827062112295, + "grad_norm": 0.3313046395778656, + "learning_rate": 0.0001534800296338658, + "loss": 1.1531, + "step": 17912 + }, + { + "epoch": 0.23277126516503882, + "grad_norm": 0.3749324381351471, + "learning_rate": 0.0001534774301719544, + "loss": 1.3079, + "step": 17913 + }, + { + "epoch": 0.2327842597089547, + "grad_norm": 0.3325458765029907, + "learning_rate": 0.00015347483071004303, + "loss": 1.3079, + "step": 17914 + }, + { + "epoch": 0.23279725425287057, + "grad_norm": 0.3753267228603363, + "learning_rate": 0.00015347223124813163, + "loss": 1.5559, + "step": 17915 + }, + { + "epoch": 0.23281024879678644, + "grad_norm": 0.3746005892753601, + "learning_rate": 0.00015346963178622028, + "loss": 1.4584, + "step": 17916 + }, + { + "epoch": 0.2328232433407023, + "grad_norm": 0.4311268627643585, + "learning_rate": 0.00015346703232430888, + "loss": 1.5771, + "step": 17917 + }, + { + "epoch": 0.23283623788461819, + "grad_norm": 0.40249913930892944, + "learning_rate": 0.00015346443286239748, + "loss": 1.5888, + "step": 17918 + }, + { + "epoch": 0.23284923242853406, + "grad_norm": 0.4716746509075165, + "learning_rate": 0.0001534618334004861, + "loss": 1.4667, + "step": 17919 + }, + { + "epoch": 0.23286222697244993, + "grad_norm": 0.375625342130661, + "learning_rate": 0.00015345923393857472, + "loss": 1.2394, + "step": 17920 + }, + { + "epoch": 0.2328752215163658, + "grad_norm": 0.37376120686531067, + "learning_rate": 0.00015345663447666335, + "loss": 1.5129, + "step": 17921 + }, + { + "epoch": 0.23288821606028168, + "grad_norm": 0.37111878395080566, + "learning_rate": 0.00015345403501475195, + "loss": 1.3276, + "step": 17922 + }, + { + "epoch": 0.23290121060419755, + "grad_norm": 0.35338693857192993, + "learning_rate": 0.00015345143555284057, + "loss": 1.4121, + "step": 17923 + }, + { + "epoch": 0.23291420514811342, + "grad_norm": 0.4400428831577301, + "learning_rate": 0.0001534488360909292, + "loss": 1.4532, + "step": 17924 + }, + { + "epoch": 0.2329271996920293, + "grad_norm": 0.37598493695259094, + "learning_rate": 0.0001534462366290178, + "loss": 1.3349, + "step": 17925 + }, + { + "epoch": 0.23294019423594517, + "grad_norm": 0.3390032947063446, + "learning_rate": 0.00015344363716710642, + "loss": 1.6361, + "step": 17926 + }, + { + "epoch": 0.23295318877986104, + "grad_norm": 0.44953683018684387, + "learning_rate": 0.00015344103770519501, + "loss": 1.5049, + "step": 17927 + }, + { + "epoch": 0.23296618332377692, + "grad_norm": 0.5058104991912842, + "learning_rate": 0.00015343843824328367, + "loss": 1.3093, + "step": 17928 + }, + { + "epoch": 0.23297917786769282, + "grad_norm": 0.38602083921432495, + "learning_rate": 0.00015343583878137226, + "loss": 1.3341, + "step": 17929 + }, + { + "epoch": 0.2329921724116087, + "grad_norm": 0.38048407435417175, + "learning_rate": 0.0001534332393194609, + "loss": 1.3511, + "step": 17930 + }, + { + "epoch": 0.23300516695552456, + "grad_norm": 0.34763115644454956, + "learning_rate": 0.00015343063985754949, + "loss": 1.4245, + "step": 17931 + }, + { + "epoch": 0.23301816149944043, + "grad_norm": 0.41458916664123535, + "learning_rate": 0.0001534280403956381, + "loss": 1.4254, + "step": 17932 + }, + { + "epoch": 0.2330311560433563, + "grad_norm": 0.39238467812538147, + "learning_rate": 0.00015342544093372673, + "loss": 1.4751, + "step": 17933 + }, + { + "epoch": 0.23304415058727218, + "grad_norm": 0.4726831912994385, + "learning_rate": 0.00015342284147181533, + "loss": 1.5654, + "step": 17934 + }, + { + "epoch": 0.23305714513118805, + "grad_norm": 0.44743531942367554, + "learning_rate": 0.00015342024200990396, + "loss": 1.2952, + "step": 17935 + }, + { + "epoch": 0.23307013967510393, + "grad_norm": 0.49141213297843933, + "learning_rate": 0.00015341764254799258, + "loss": 1.5189, + "step": 17936 + }, + { + "epoch": 0.2330831342190198, + "grad_norm": 0.31094464659690857, + "learning_rate": 0.00015341504308608118, + "loss": 1.2981, + "step": 17937 + }, + { + "epoch": 0.23309612876293567, + "grad_norm": 0.3795747756958008, + "learning_rate": 0.0001534124436241698, + "loss": 1.2674, + "step": 17938 + }, + { + "epoch": 0.23310912330685155, + "grad_norm": 0.39798861742019653, + "learning_rate": 0.0001534098441622584, + "loss": 1.333, + "step": 17939 + }, + { + "epoch": 0.23312211785076742, + "grad_norm": 0.35945409536361694, + "learning_rate": 0.00015340724470034705, + "loss": 1.421, + "step": 17940 + }, + { + "epoch": 0.2331351123946833, + "grad_norm": 0.471318781375885, + "learning_rate": 0.00015340464523843565, + "loss": 1.5116, + "step": 17941 + }, + { + "epoch": 0.23314810693859916, + "grad_norm": 0.32836174964904785, + "learning_rate": 0.00015340204577652427, + "loss": 1.4417, + "step": 17942 + }, + { + "epoch": 0.23316110148251504, + "grad_norm": 0.34531545639038086, + "learning_rate": 0.0001533994463146129, + "loss": 1.3299, + "step": 17943 + }, + { + "epoch": 0.2331740960264309, + "grad_norm": 0.3809370994567871, + "learning_rate": 0.0001533968468527015, + "loss": 1.2998, + "step": 17944 + }, + { + "epoch": 0.23318709057034678, + "grad_norm": 0.46564140915870667, + "learning_rate": 0.00015339424739079012, + "loss": 1.6028, + "step": 17945 + }, + { + "epoch": 0.23320008511426266, + "grad_norm": 0.2764657735824585, + "learning_rate": 0.00015339164792887872, + "loss": 1.2578, + "step": 17946 + }, + { + "epoch": 0.23321307965817853, + "grad_norm": 0.4108177721500397, + "learning_rate": 0.00015338904846696734, + "loss": 1.4521, + "step": 17947 + }, + { + "epoch": 0.2332260742020944, + "grad_norm": 0.44228336215019226, + "learning_rate": 0.00015338644900505597, + "loss": 1.4632, + "step": 17948 + }, + { + "epoch": 0.23323906874601028, + "grad_norm": 0.40309804677963257, + "learning_rate": 0.00015338384954314456, + "loss": 1.514, + "step": 17949 + }, + { + "epoch": 0.23325206328992615, + "grad_norm": 0.3982924818992615, + "learning_rate": 0.0001533812500812332, + "loss": 1.4437, + "step": 17950 + }, + { + "epoch": 0.23326505783384202, + "grad_norm": 0.5114452838897705, + "learning_rate": 0.0001533786506193218, + "loss": 1.585, + "step": 17951 + }, + { + "epoch": 0.2332780523777579, + "grad_norm": 0.42855343222618103, + "learning_rate": 0.00015337605115741044, + "loss": 1.4228, + "step": 17952 + }, + { + "epoch": 0.23329104692167377, + "grad_norm": 0.2871570289134979, + "learning_rate": 0.00015337345169549903, + "loss": 1.2664, + "step": 17953 + }, + { + "epoch": 0.23330404146558964, + "grad_norm": 0.3602854311466217, + "learning_rate": 0.00015337085223358766, + "loss": 1.4117, + "step": 17954 + }, + { + "epoch": 0.2333170360095055, + "grad_norm": 0.3426908850669861, + "learning_rate": 0.00015336825277167628, + "loss": 1.4184, + "step": 17955 + }, + { + "epoch": 0.23333003055342139, + "grad_norm": 0.38484346866607666, + "learning_rate": 0.00015336565330976488, + "loss": 1.4802, + "step": 17956 + }, + { + "epoch": 0.23334302509733726, + "grad_norm": 0.3409084379673004, + "learning_rate": 0.0001533630538478535, + "loss": 1.4181, + "step": 17957 + }, + { + "epoch": 0.23335601964125313, + "grad_norm": 0.3370325565338135, + "learning_rate": 0.0001533604543859421, + "loss": 1.3947, + "step": 17958 + }, + { + "epoch": 0.233369014185169, + "grad_norm": 0.4113130271434784, + "learning_rate": 0.00015335785492403075, + "loss": 1.4426, + "step": 17959 + }, + { + "epoch": 0.23338200872908488, + "grad_norm": 0.4372863471508026, + "learning_rate": 0.00015335525546211935, + "loss": 1.5363, + "step": 17960 + }, + { + "epoch": 0.23339500327300075, + "grad_norm": 0.39516639709472656, + "learning_rate": 0.00015335265600020795, + "loss": 1.4497, + "step": 17961 + }, + { + "epoch": 0.23340799781691662, + "grad_norm": 0.5111863613128662, + "learning_rate": 0.00015335005653829657, + "loss": 1.4849, + "step": 17962 + }, + { + "epoch": 0.2334209923608325, + "grad_norm": 0.4778505861759186, + "learning_rate": 0.0001533474570763852, + "loss": 1.3552, + "step": 17963 + }, + { + "epoch": 0.23343398690474837, + "grad_norm": 0.44522005319595337, + "learning_rate": 0.00015334485761447382, + "loss": 1.309, + "step": 17964 + }, + { + "epoch": 0.23344698144866424, + "grad_norm": 0.37539568543434143, + "learning_rate": 0.00015334225815256242, + "loss": 1.4853, + "step": 17965 + }, + { + "epoch": 0.23345997599258012, + "grad_norm": 0.35811930894851685, + "learning_rate": 0.00015333965869065104, + "loss": 1.5613, + "step": 17966 + }, + { + "epoch": 0.233472970536496, + "grad_norm": 0.40717077255249023, + "learning_rate": 0.00015333705922873967, + "loss": 1.617, + "step": 17967 + }, + { + "epoch": 0.23348596508041186, + "grad_norm": 0.423179030418396, + "learning_rate": 0.00015333445976682827, + "loss": 1.4066, + "step": 17968 + }, + { + "epoch": 0.23349895962432773, + "grad_norm": 0.3956672251224518, + "learning_rate": 0.0001533318603049169, + "loss": 1.2867, + "step": 17969 + }, + { + "epoch": 0.2335119541682436, + "grad_norm": 0.4276334345340729, + "learning_rate": 0.0001533292608430055, + "loss": 1.3287, + "step": 17970 + }, + { + "epoch": 0.23352494871215948, + "grad_norm": 0.36300331354141235, + "learning_rate": 0.00015332666138109414, + "loss": 1.3658, + "step": 17971 + }, + { + "epoch": 0.23353794325607535, + "grad_norm": 0.4081609547138214, + "learning_rate": 0.00015332406191918274, + "loss": 1.4147, + "step": 17972 + }, + { + "epoch": 0.23355093779999123, + "grad_norm": 0.3212282657623291, + "learning_rate": 0.00015332146245727133, + "loss": 1.3882, + "step": 17973 + }, + { + "epoch": 0.2335639323439071, + "grad_norm": 0.45553895831108093, + "learning_rate": 0.00015331886299535996, + "loss": 1.6364, + "step": 17974 + }, + { + "epoch": 0.23357692688782297, + "grad_norm": 0.3248056471347809, + "learning_rate": 0.00015331626353344858, + "loss": 1.3687, + "step": 17975 + }, + { + "epoch": 0.23358992143173885, + "grad_norm": 0.46040576696395874, + "learning_rate": 0.0001533136640715372, + "loss": 1.6193, + "step": 17976 + }, + { + "epoch": 0.23360291597565472, + "grad_norm": 0.2937815487384796, + "learning_rate": 0.0001533110646096258, + "loss": 1.3843, + "step": 17977 + }, + { + "epoch": 0.2336159105195706, + "grad_norm": 0.34830930829048157, + "learning_rate": 0.00015330846514771443, + "loss": 1.3201, + "step": 17978 + }, + { + "epoch": 0.23362890506348646, + "grad_norm": 0.3946056365966797, + "learning_rate": 0.00015330586568580305, + "loss": 1.1903, + "step": 17979 + }, + { + "epoch": 0.23364189960740234, + "grad_norm": 0.37340110540390015, + "learning_rate": 0.00015330326622389165, + "loss": 1.3991, + "step": 17980 + }, + { + "epoch": 0.2336548941513182, + "grad_norm": 0.4689640402793884, + "learning_rate": 0.00015330066676198028, + "loss": 1.5045, + "step": 17981 + }, + { + "epoch": 0.23366788869523408, + "grad_norm": 0.40675726532936096, + "learning_rate": 0.0001532980673000689, + "loss": 1.3037, + "step": 17982 + }, + { + "epoch": 0.23368088323914996, + "grad_norm": 0.45732080936431885, + "learning_rate": 0.00015329546783815753, + "loss": 1.6636, + "step": 17983 + }, + { + "epoch": 0.23369387778306583, + "grad_norm": 0.4354645013809204, + "learning_rate": 0.00015329286837624612, + "loss": 1.459, + "step": 17984 + }, + { + "epoch": 0.2337068723269817, + "grad_norm": 0.38458874821662903, + "learning_rate": 0.00015329026891433475, + "loss": 1.3473, + "step": 17985 + }, + { + "epoch": 0.23371986687089757, + "grad_norm": 0.4326435625553131, + "learning_rate": 0.00015328766945242337, + "loss": 1.5688, + "step": 17986 + }, + { + "epoch": 0.23373286141481345, + "grad_norm": 0.39050623774528503, + "learning_rate": 0.00015328506999051197, + "loss": 1.4113, + "step": 17987 + }, + { + "epoch": 0.23374585595872932, + "grad_norm": 0.3956562578678131, + "learning_rate": 0.0001532824705286006, + "loss": 1.471, + "step": 17988 + }, + { + "epoch": 0.2337588505026452, + "grad_norm": 0.37935376167297363, + "learning_rate": 0.0001532798710666892, + "loss": 1.4289, + "step": 17989 + }, + { + "epoch": 0.23377184504656107, + "grad_norm": 0.39070311188697815, + "learning_rate": 0.00015327727160477782, + "loss": 1.2734, + "step": 17990 + }, + { + "epoch": 0.23378483959047694, + "grad_norm": 0.3776244819164276, + "learning_rate": 0.00015327467214286644, + "loss": 1.308, + "step": 17991 + }, + { + "epoch": 0.2337978341343928, + "grad_norm": 0.40933501720428467, + "learning_rate": 0.00015327207268095504, + "loss": 1.5812, + "step": 17992 + }, + { + "epoch": 0.23381082867830869, + "grad_norm": 0.3220255374908447, + "learning_rate": 0.00015326947321904366, + "loss": 1.3327, + "step": 17993 + }, + { + "epoch": 0.23382382322222456, + "grad_norm": 0.35636258125305176, + "learning_rate": 0.00015326687375713229, + "loss": 1.3992, + "step": 17994 + }, + { + "epoch": 0.23383681776614043, + "grad_norm": 0.3924807608127594, + "learning_rate": 0.0001532642742952209, + "loss": 1.4874, + "step": 17995 + }, + { + "epoch": 0.2338498123100563, + "grad_norm": 0.3293445408344269, + "learning_rate": 0.0001532616748333095, + "loss": 1.3592, + "step": 17996 + }, + { + "epoch": 0.23386280685397218, + "grad_norm": 0.40680330991744995, + "learning_rate": 0.00015325907537139813, + "loss": 1.5609, + "step": 17997 + }, + { + "epoch": 0.23387580139788805, + "grad_norm": 0.4301876425743103, + "learning_rate": 0.00015325647590948676, + "loss": 1.516, + "step": 17998 + }, + { + "epoch": 0.23388879594180392, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.00015325387644757535, + "loss": 1.3165, + "step": 17999 + }, + { + "epoch": 0.2339017904857198, + "grad_norm": 0.41786855459213257, + "learning_rate": 0.00015325127698566398, + "loss": 1.3622, + "step": 18000 + }, + { + "epoch": 0.23391478502963567, + "grad_norm": 0.32593899965286255, + "learning_rate": 0.00015324867752375258, + "loss": 1.1442, + "step": 18001 + }, + { + "epoch": 0.23392777957355154, + "grad_norm": 0.47393733263015747, + "learning_rate": 0.0001532460780618412, + "loss": 1.6408, + "step": 18002 + }, + { + "epoch": 0.23394077411746741, + "grad_norm": 0.36780720949172974, + "learning_rate": 0.00015324347859992983, + "loss": 1.4241, + "step": 18003 + }, + { + "epoch": 0.2339537686613833, + "grad_norm": 0.3852415978908539, + "learning_rate": 0.00015324087913801842, + "loss": 1.6291, + "step": 18004 + }, + { + "epoch": 0.2339667632052992, + "grad_norm": 0.43215811252593994, + "learning_rate": 0.00015323827967610705, + "loss": 1.5221, + "step": 18005 + }, + { + "epoch": 0.23397975774921506, + "grad_norm": 0.33725112676620483, + "learning_rate": 0.00015323568021419567, + "loss": 1.5029, + "step": 18006 + }, + { + "epoch": 0.23399275229313093, + "grad_norm": 0.39836397767066956, + "learning_rate": 0.0001532330807522843, + "loss": 1.4686, + "step": 18007 + }, + { + "epoch": 0.2340057468370468, + "grad_norm": 0.2320425808429718, + "learning_rate": 0.0001532304812903729, + "loss": 1.4181, + "step": 18008 + }, + { + "epoch": 0.23401874138096268, + "grad_norm": 0.41020092368125916, + "learning_rate": 0.00015322788182846152, + "loss": 1.2584, + "step": 18009 + }, + { + "epoch": 0.23403173592487855, + "grad_norm": 0.3727324306964874, + "learning_rate": 0.00015322528236655014, + "loss": 1.3973, + "step": 18010 + }, + { + "epoch": 0.23404473046879443, + "grad_norm": 0.2463207244873047, + "learning_rate": 0.00015322268290463874, + "loss": 1.3373, + "step": 18011 + }, + { + "epoch": 0.2340577250127103, + "grad_norm": 0.32143476605415344, + "learning_rate": 0.00015322008344272736, + "loss": 1.4462, + "step": 18012 + }, + { + "epoch": 0.23407071955662617, + "grad_norm": 0.4097207486629486, + "learning_rate": 0.00015321748398081596, + "loss": 1.5051, + "step": 18013 + }, + { + "epoch": 0.23408371410054205, + "grad_norm": 0.33502137660980225, + "learning_rate": 0.0001532148845189046, + "loss": 1.3054, + "step": 18014 + }, + { + "epoch": 0.23409670864445792, + "grad_norm": 0.3422829508781433, + "learning_rate": 0.0001532122850569932, + "loss": 1.3622, + "step": 18015 + }, + { + "epoch": 0.2341097031883738, + "grad_norm": 0.3967907726764679, + "learning_rate": 0.0001532096855950818, + "loss": 1.5501, + "step": 18016 + }, + { + "epoch": 0.23412269773228966, + "grad_norm": 0.41888493299484253, + "learning_rate": 0.00015320708613317046, + "loss": 1.365, + "step": 18017 + }, + { + "epoch": 0.23413569227620554, + "grad_norm": 0.5179020762443542, + "learning_rate": 0.00015320448667125906, + "loss": 1.5825, + "step": 18018 + }, + { + "epoch": 0.2341486868201214, + "grad_norm": 0.5036913156509399, + "learning_rate": 0.00015320188720934768, + "loss": 1.5466, + "step": 18019 + }, + { + "epoch": 0.23416168136403728, + "grad_norm": 0.30901363492012024, + "learning_rate": 0.00015319928774743628, + "loss": 1.3966, + "step": 18020 + }, + { + "epoch": 0.23417467590795316, + "grad_norm": 0.4030865728855133, + "learning_rate": 0.0001531966882855249, + "loss": 1.3442, + "step": 18021 + }, + { + "epoch": 0.23418767045186903, + "grad_norm": 0.360787570476532, + "learning_rate": 0.00015319408882361353, + "loss": 1.4201, + "step": 18022 + }, + { + "epoch": 0.2342006649957849, + "grad_norm": 0.37237903475761414, + "learning_rate": 0.00015319148936170213, + "loss": 1.3517, + "step": 18023 + }, + { + "epoch": 0.23421365953970077, + "grad_norm": 0.4541695713996887, + "learning_rate": 0.00015318888989979075, + "loss": 1.4075, + "step": 18024 + }, + { + "epoch": 0.23422665408361665, + "grad_norm": 0.47659939527511597, + "learning_rate": 0.00015318629043787937, + "loss": 1.4612, + "step": 18025 + }, + { + "epoch": 0.23423964862753252, + "grad_norm": 0.422150194644928, + "learning_rate": 0.000153183690975968, + "loss": 1.2813, + "step": 18026 + }, + { + "epoch": 0.2342526431714484, + "grad_norm": 0.3242901563644409, + "learning_rate": 0.0001531810915140566, + "loss": 1.3177, + "step": 18027 + }, + { + "epoch": 0.23426563771536427, + "grad_norm": 0.3061369061470032, + "learning_rate": 0.0001531784920521452, + "loss": 1.2185, + "step": 18028 + }, + { + "epoch": 0.23427863225928014, + "grad_norm": 0.31887730956077576, + "learning_rate": 0.00015317589259023384, + "loss": 1.3763, + "step": 18029 + }, + { + "epoch": 0.234291626803196, + "grad_norm": 0.39775481820106506, + "learning_rate": 0.00015317329312832244, + "loss": 1.4063, + "step": 18030 + }, + { + "epoch": 0.23430462134711189, + "grad_norm": 0.2797357738018036, + "learning_rate": 0.00015317069366641107, + "loss": 1.4146, + "step": 18031 + }, + { + "epoch": 0.23431761589102776, + "grad_norm": 0.43941476941108704, + "learning_rate": 0.00015316809420449966, + "loss": 1.2305, + "step": 18032 + }, + { + "epoch": 0.23433061043494363, + "grad_norm": 0.9685315489768982, + "learning_rate": 0.0001531654947425883, + "loss": 1.479, + "step": 18033 + }, + { + "epoch": 0.2343436049788595, + "grad_norm": 0.39319655299186707, + "learning_rate": 0.0001531628952806769, + "loss": 1.4624, + "step": 18034 + }, + { + "epoch": 0.23435659952277538, + "grad_norm": 0.4018360674381256, + "learning_rate": 0.0001531602958187655, + "loss": 1.4521, + "step": 18035 + }, + { + "epoch": 0.23436959406669125, + "grad_norm": 0.32737624645233154, + "learning_rate": 0.00015315769635685414, + "loss": 1.3044, + "step": 18036 + }, + { + "epoch": 0.23438258861060712, + "grad_norm": 0.3900229334831238, + "learning_rate": 0.00015315509689494276, + "loss": 1.4614, + "step": 18037 + }, + { + "epoch": 0.234395583154523, + "grad_norm": 0.4148141145706177, + "learning_rate": 0.00015315249743303138, + "loss": 1.3859, + "step": 18038 + }, + { + "epoch": 0.23440857769843887, + "grad_norm": 0.40642696619033813, + "learning_rate": 0.00015314989797111998, + "loss": 1.411, + "step": 18039 + }, + { + "epoch": 0.23442157224235474, + "grad_norm": 0.47283414006233215, + "learning_rate": 0.00015314729850920858, + "loss": 1.5534, + "step": 18040 + }, + { + "epoch": 0.23443456678627062, + "grad_norm": 0.41187629103660583, + "learning_rate": 0.00015314469904729723, + "loss": 1.5191, + "step": 18041 + }, + { + "epoch": 0.2344475613301865, + "grad_norm": 0.3797478973865509, + "learning_rate": 0.00015314209958538583, + "loss": 1.4271, + "step": 18042 + }, + { + "epoch": 0.23446055587410236, + "grad_norm": 0.402601957321167, + "learning_rate": 0.00015313950012347445, + "loss": 1.4774, + "step": 18043 + }, + { + "epoch": 0.23447355041801823, + "grad_norm": 0.4224216938018799, + "learning_rate": 0.00015313690066156305, + "loss": 1.5172, + "step": 18044 + }, + { + "epoch": 0.2344865449619341, + "grad_norm": 0.39689821004867554, + "learning_rate": 0.00015313430119965167, + "loss": 1.2662, + "step": 18045 + }, + { + "epoch": 0.23449953950584998, + "grad_norm": 0.5979065895080566, + "learning_rate": 0.0001531317017377403, + "loss": 1.4197, + "step": 18046 + }, + { + "epoch": 0.23451253404976585, + "grad_norm": 0.35663479566574097, + "learning_rate": 0.0001531291022758289, + "loss": 1.5123, + "step": 18047 + }, + { + "epoch": 0.23452552859368173, + "grad_norm": 0.33819663524627686, + "learning_rate": 0.00015312650281391752, + "loss": 1.4088, + "step": 18048 + }, + { + "epoch": 0.2345385231375976, + "grad_norm": 0.3434363603591919, + "learning_rate": 0.00015312390335200614, + "loss": 1.2682, + "step": 18049 + }, + { + "epoch": 0.23455151768151347, + "grad_norm": 0.3235607445240021, + "learning_rate": 0.00015312130389009477, + "loss": 1.4834, + "step": 18050 + }, + { + "epoch": 0.23456451222542934, + "grad_norm": 0.4239378571510315, + "learning_rate": 0.00015311870442818337, + "loss": 1.515, + "step": 18051 + }, + { + "epoch": 0.23457750676934522, + "grad_norm": 0.4866640567779541, + "learning_rate": 0.000153116104966272, + "loss": 1.524, + "step": 18052 + }, + { + "epoch": 0.2345905013132611, + "grad_norm": 0.40287330746650696, + "learning_rate": 0.00015311350550436062, + "loss": 1.3636, + "step": 18053 + }, + { + "epoch": 0.23460349585717696, + "grad_norm": 0.3488408029079437, + "learning_rate": 0.0001531109060424492, + "loss": 1.239, + "step": 18054 + }, + { + "epoch": 0.23461649040109284, + "grad_norm": 0.4170578718185425, + "learning_rate": 0.00015310830658053784, + "loss": 1.6802, + "step": 18055 + }, + { + "epoch": 0.2346294849450087, + "grad_norm": 0.3394927382469177, + "learning_rate": 0.00015310570711862646, + "loss": 1.4157, + "step": 18056 + }, + { + "epoch": 0.23464247948892458, + "grad_norm": 0.4658181071281433, + "learning_rate": 0.00015310310765671506, + "loss": 1.3043, + "step": 18057 + }, + { + "epoch": 0.23465547403284046, + "grad_norm": 0.35588932037353516, + "learning_rate": 0.00015310050819480368, + "loss": 1.4884, + "step": 18058 + }, + { + "epoch": 0.23466846857675633, + "grad_norm": 0.3373064696788788, + "learning_rate": 0.00015309790873289228, + "loss": 1.4317, + "step": 18059 + }, + { + "epoch": 0.2346814631206722, + "grad_norm": 0.40685445070266724, + "learning_rate": 0.00015309530927098093, + "loss": 1.3801, + "step": 18060 + }, + { + "epoch": 0.23469445766458807, + "grad_norm": 0.5385668873786926, + "learning_rate": 0.00015309270980906953, + "loss": 1.398, + "step": 18061 + }, + { + "epoch": 0.23470745220850395, + "grad_norm": 0.40858256816864014, + "learning_rate": 0.00015309011034715815, + "loss": 1.3456, + "step": 18062 + }, + { + "epoch": 0.23472044675241982, + "grad_norm": 0.31402289867401123, + "learning_rate": 0.00015308751088524675, + "loss": 1.3194, + "step": 18063 + }, + { + "epoch": 0.2347334412963357, + "grad_norm": 0.381795197725296, + "learning_rate": 0.00015308491142333538, + "loss": 1.4004, + "step": 18064 + }, + { + "epoch": 0.23474643584025157, + "grad_norm": 0.4572577476501465, + "learning_rate": 0.000153082311961424, + "loss": 1.4242, + "step": 18065 + }, + { + "epoch": 0.23475943038416744, + "grad_norm": 0.37868568301200867, + "learning_rate": 0.0001530797124995126, + "loss": 1.4439, + "step": 18066 + }, + { + "epoch": 0.2347724249280833, + "grad_norm": 0.36918380856513977, + "learning_rate": 0.00015307711303760122, + "loss": 1.2055, + "step": 18067 + }, + { + "epoch": 0.23478541947199918, + "grad_norm": 0.30491912364959717, + "learning_rate": 0.00015307451357568985, + "loss": 1.2899, + "step": 18068 + }, + { + "epoch": 0.23479841401591506, + "grad_norm": 0.34524771571159363, + "learning_rate": 0.00015307191411377847, + "loss": 1.4236, + "step": 18069 + }, + { + "epoch": 0.23481140855983093, + "grad_norm": 0.3788608908653259, + "learning_rate": 0.00015306931465186707, + "loss": 1.4725, + "step": 18070 + }, + { + "epoch": 0.2348244031037468, + "grad_norm": 0.38002508878707886, + "learning_rate": 0.00015306671518995567, + "loss": 1.2142, + "step": 18071 + }, + { + "epoch": 0.23483739764766268, + "grad_norm": 0.36435046792030334, + "learning_rate": 0.00015306411572804432, + "loss": 1.4132, + "step": 18072 + }, + { + "epoch": 0.23485039219157855, + "grad_norm": 0.37841206789016724, + "learning_rate": 0.00015306151626613292, + "loss": 1.372, + "step": 18073 + }, + { + "epoch": 0.23486338673549442, + "grad_norm": 0.3763261139392853, + "learning_rate": 0.00015305891680422154, + "loss": 1.3355, + "step": 18074 + }, + { + "epoch": 0.2348763812794103, + "grad_norm": 0.4358110725879669, + "learning_rate": 0.00015305631734231014, + "loss": 1.3339, + "step": 18075 + }, + { + "epoch": 0.23488937582332617, + "grad_norm": 0.49096590280532837, + "learning_rate": 0.00015305371788039876, + "loss": 1.5661, + "step": 18076 + }, + { + "epoch": 0.23490237036724204, + "grad_norm": 0.4548513889312744, + "learning_rate": 0.0001530511184184874, + "loss": 1.5205, + "step": 18077 + }, + { + "epoch": 0.23491536491115791, + "grad_norm": 0.45075923204421997, + "learning_rate": 0.00015304851895657598, + "loss": 1.3969, + "step": 18078 + }, + { + "epoch": 0.2349283594550738, + "grad_norm": 0.3555245101451874, + "learning_rate": 0.0001530459194946646, + "loss": 1.447, + "step": 18079 + }, + { + "epoch": 0.23494135399898966, + "grad_norm": 0.34753239154815674, + "learning_rate": 0.00015304332003275323, + "loss": 1.512, + "step": 18080 + }, + { + "epoch": 0.23495434854290556, + "grad_norm": 0.37124064564704895, + "learning_rate": 0.00015304072057084186, + "loss": 1.3088, + "step": 18081 + }, + { + "epoch": 0.23496734308682143, + "grad_norm": 0.4854525029659271, + "learning_rate": 0.00015303812110893045, + "loss": 1.4232, + "step": 18082 + }, + { + "epoch": 0.2349803376307373, + "grad_norm": 0.5026360154151917, + "learning_rate": 0.00015303552164701905, + "loss": 1.5394, + "step": 18083 + }, + { + "epoch": 0.23499333217465318, + "grad_norm": 0.3670816421508789, + "learning_rate": 0.0001530329221851077, + "loss": 1.2812, + "step": 18084 + }, + { + "epoch": 0.23500632671856905, + "grad_norm": 0.39980369806289673, + "learning_rate": 0.0001530303227231963, + "loss": 1.5397, + "step": 18085 + }, + { + "epoch": 0.23501932126248493, + "grad_norm": 0.35663512349128723, + "learning_rate": 0.00015302772326128493, + "loss": 1.431, + "step": 18086 + }, + { + "epoch": 0.2350323158064008, + "grad_norm": 0.37008699774742126, + "learning_rate": 0.00015302512379937352, + "loss": 1.5248, + "step": 18087 + }, + { + "epoch": 0.23504531035031667, + "grad_norm": 0.4221617877483368, + "learning_rate": 0.00015302252433746215, + "loss": 1.4101, + "step": 18088 + }, + { + "epoch": 0.23505830489423254, + "grad_norm": 0.4563870429992676, + "learning_rate": 0.00015301992487555077, + "loss": 1.3585, + "step": 18089 + }, + { + "epoch": 0.23507129943814842, + "grad_norm": 0.3603486716747284, + "learning_rate": 0.00015301732541363937, + "loss": 1.2289, + "step": 18090 + }, + { + "epoch": 0.2350842939820643, + "grad_norm": 0.40538015961647034, + "learning_rate": 0.00015301472595172802, + "loss": 1.5078, + "step": 18091 + }, + { + "epoch": 0.23509728852598016, + "grad_norm": 0.28498125076293945, + "learning_rate": 0.00015301212648981662, + "loss": 1.2793, + "step": 18092 + }, + { + "epoch": 0.23511028306989604, + "grad_norm": 0.44566699862480164, + "learning_rate": 0.00015300952702790524, + "loss": 1.4666, + "step": 18093 + }, + { + "epoch": 0.2351232776138119, + "grad_norm": 0.4075450897216797, + "learning_rate": 0.00015300692756599384, + "loss": 1.3643, + "step": 18094 + }, + { + "epoch": 0.23513627215772778, + "grad_norm": 0.35127031803131104, + "learning_rate": 0.00015300432810408246, + "loss": 1.246, + "step": 18095 + }, + { + "epoch": 0.23514926670164366, + "grad_norm": 0.4481108486652374, + "learning_rate": 0.0001530017286421711, + "loss": 1.3677, + "step": 18096 + }, + { + "epoch": 0.23516226124555953, + "grad_norm": 0.36416977643966675, + "learning_rate": 0.0001529991291802597, + "loss": 1.1969, + "step": 18097 + }, + { + "epoch": 0.2351752557894754, + "grad_norm": 0.4587027132511139, + "learning_rate": 0.0001529965297183483, + "loss": 1.4056, + "step": 18098 + }, + { + "epoch": 0.23518825033339127, + "grad_norm": 0.3319268822669983, + "learning_rate": 0.00015299393025643694, + "loss": 1.3583, + "step": 18099 + }, + { + "epoch": 0.23520124487730715, + "grad_norm": 0.36130058765411377, + "learning_rate": 0.00015299133079452553, + "loss": 1.2937, + "step": 18100 + }, + { + "epoch": 0.23521423942122302, + "grad_norm": 0.48869195580482483, + "learning_rate": 0.00015298873133261416, + "loss": 1.3722, + "step": 18101 + }, + { + "epoch": 0.2352272339651389, + "grad_norm": 0.29547956585884094, + "learning_rate": 0.00015298613187070275, + "loss": 1.2454, + "step": 18102 + }, + { + "epoch": 0.23524022850905477, + "grad_norm": 0.3617570996284485, + "learning_rate": 0.0001529835324087914, + "loss": 1.4587, + "step": 18103 + }, + { + "epoch": 0.23525322305297064, + "grad_norm": 0.3721075654029846, + "learning_rate": 0.00015298093294688, + "loss": 1.2906, + "step": 18104 + }, + { + "epoch": 0.2352662175968865, + "grad_norm": 0.45762911438941956, + "learning_rate": 0.00015297833348496863, + "loss": 1.1893, + "step": 18105 + }, + { + "epoch": 0.23527921214080239, + "grad_norm": 0.34379497170448303, + "learning_rate": 0.00015297573402305723, + "loss": 1.4006, + "step": 18106 + }, + { + "epoch": 0.23529220668471826, + "grad_norm": 0.33811071515083313, + "learning_rate": 0.00015297313456114585, + "loss": 1.3711, + "step": 18107 + }, + { + "epoch": 0.23530520122863413, + "grad_norm": 0.40029510855674744, + "learning_rate": 0.00015297053509923447, + "loss": 1.4282, + "step": 18108 + }, + { + "epoch": 0.23531819577255, + "grad_norm": 0.41995954513549805, + "learning_rate": 0.00015296793563732307, + "loss": 1.3516, + "step": 18109 + }, + { + "epoch": 0.23533119031646588, + "grad_norm": 0.483599990606308, + "learning_rate": 0.0001529653361754117, + "loss": 1.4617, + "step": 18110 + }, + { + "epoch": 0.23534418486038175, + "grad_norm": 0.3936520218849182, + "learning_rate": 0.00015296273671350032, + "loss": 1.4211, + "step": 18111 + }, + { + "epoch": 0.23535717940429762, + "grad_norm": 0.44347572326660156, + "learning_rate": 0.00015296013725158892, + "loss": 1.4651, + "step": 18112 + }, + { + "epoch": 0.2353701739482135, + "grad_norm": 0.45416751503944397, + "learning_rate": 0.00015295753778967754, + "loss": 1.5017, + "step": 18113 + }, + { + "epoch": 0.23538316849212937, + "grad_norm": 0.3818269670009613, + "learning_rate": 0.00015295493832776614, + "loss": 1.3149, + "step": 18114 + }, + { + "epoch": 0.23539616303604524, + "grad_norm": 0.2547667622566223, + "learning_rate": 0.0001529523388658548, + "loss": 1.4288, + "step": 18115 + }, + { + "epoch": 0.23540915757996111, + "grad_norm": 0.43630170822143555, + "learning_rate": 0.0001529497394039434, + "loss": 1.4479, + "step": 18116 + }, + { + "epoch": 0.235422152123877, + "grad_norm": 0.4467422068119049, + "learning_rate": 0.00015294713994203201, + "loss": 1.2504, + "step": 18117 + }, + { + "epoch": 0.23543514666779286, + "grad_norm": 0.4496765434741974, + "learning_rate": 0.0001529445404801206, + "loss": 1.5589, + "step": 18118 + }, + { + "epoch": 0.23544814121170873, + "grad_norm": 0.4187292754650116, + "learning_rate": 0.00015294194101820924, + "loss": 1.2564, + "step": 18119 + }, + { + "epoch": 0.2354611357556246, + "grad_norm": 0.4326245188713074, + "learning_rate": 0.00015293934155629786, + "loss": 1.3895, + "step": 18120 + }, + { + "epoch": 0.23547413029954048, + "grad_norm": 0.3660198748111725, + "learning_rate": 0.00015293674209438646, + "loss": 1.4438, + "step": 18121 + }, + { + "epoch": 0.23548712484345635, + "grad_norm": 0.34446752071380615, + "learning_rate": 0.00015293414263247508, + "loss": 1.3791, + "step": 18122 + }, + { + "epoch": 0.23550011938737223, + "grad_norm": 0.31882914900779724, + "learning_rate": 0.0001529315431705637, + "loss": 1.2146, + "step": 18123 + }, + { + "epoch": 0.2355131139312881, + "grad_norm": 0.3847213685512543, + "learning_rate": 0.0001529289437086523, + "loss": 1.3098, + "step": 18124 + }, + { + "epoch": 0.23552610847520397, + "grad_norm": 0.32543617486953735, + "learning_rate": 0.00015292634424674093, + "loss": 1.2413, + "step": 18125 + }, + { + "epoch": 0.23553910301911984, + "grad_norm": 0.3916413187980652, + "learning_rate": 0.00015292374478482953, + "loss": 1.5142, + "step": 18126 + }, + { + "epoch": 0.23555209756303572, + "grad_norm": 0.34882864356040955, + "learning_rate": 0.00015292114532291818, + "loss": 1.4549, + "step": 18127 + }, + { + "epoch": 0.2355650921069516, + "grad_norm": 0.4583916664123535, + "learning_rate": 0.00015291854586100677, + "loss": 1.4873, + "step": 18128 + }, + { + "epoch": 0.23557808665086746, + "grad_norm": 0.42106905579566956, + "learning_rate": 0.0001529159463990954, + "loss": 1.3655, + "step": 18129 + }, + { + "epoch": 0.23559108119478334, + "grad_norm": 0.3971590995788574, + "learning_rate": 0.00015291334693718402, + "loss": 1.7991, + "step": 18130 + }, + { + "epoch": 0.2356040757386992, + "grad_norm": 0.4282810389995575, + "learning_rate": 0.00015291074747527262, + "loss": 1.4536, + "step": 18131 + }, + { + "epoch": 0.23561707028261508, + "grad_norm": 0.3702646791934967, + "learning_rate": 0.00015290814801336125, + "loss": 1.3402, + "step": 18132 + }, + { + "epoch": 0.23563006482653095, + "grad_norm": 0.35790005326271057, + "learning_rate": 0.00015290554855144984, + "loss": 1.2635, + "step": 18133 + }, + { + "epoch": 0.23564305937044683, + "grad_norm": 0.4585866928100586, + "learning_rate": 0.0001529029490895385, + "loss": 1.5563, + "step": 18134 + }, + { + "epoch": 0.2356560539143627, + "grad_norm": 0.3552546501159668, + "learning_rate": 0.0001529003496276271, + "loss": 1.4165, + "step": 18135 + }, + { + "epoch": 0.23566904845827857, + "grad_norm": 0.479915589094162, + "learning_rate": 0.00015289775016571572, + "loss": 1.4247, + "step": 18136 + }, + { + "epoch": 0.23568204300219445, + "grad_norm": 0.37789207696914673, + "learning_rate": 0.0001528951507038043, + "loss": 1.4213, + "step": 18137 + }, + { + "epoch": 0.23569503754611032, + "grad_norm": 0.3848220407962799, + "learning_rate": 0.00015289255124189294, + "loss": 1.3665, + "step": 18138 + }, + { + "epoch": 0.2357080320900262, + "grad_norm": 0.3396890461444855, + "learning_rate": 0.00015288995177998156, + "loss": 1.3181, + "step": 18139 + }, + { + "epoch": 0.23572102663394207, + "grad_norm": 0.42892786860466003, + "learning_rate": 0.00015288735231807016, + "loss": 1.4398, + "step": 18140 + }, + { + "epoch": 0.23573402117785794, + "grad_norm": 0.4097321927547455, + "learning_rate": 0.00015288475285615878, + "loss": 1.4484, + "step": 18141 + }, + { + "epoch": 0.2357470157217738, + "grad_norm": 0.42936643958091736, + "learning_rate": 0.0001528821533942474, + "loss": 1.2688, + "step": 18142 + }, + { + "epoch": 0.23576001026568968, + "grad_norm": 0.4336777925491333, + "learning_rate": 0.000152879553932336, + "loss": 1.3498, + "step": 18143 + }, + { + "epoch": 0.23577300480960556, + "grad_norm": 0.4130055010318756, + "learning_rate": 0.00015287695447042463, + "loss": 1.3498, + "step": 18144 + }, + { + "epoch": 0.23578599935352143, + "grad_norm": 0.34807726740837097, + "learning_rate": 0.00015287435500851323, + "loss": 1.4249, + "step": 18145 + }, + { + "epoch": 0.2357989938974373, + "grad_norm": 0.3700559139251709, + "learning_rate": 0.00015287175554660188, + "loss": 1.526, + "step": 18146 + }, + { + "epoch": 0.23581198844135318, + "grad_norm": 0.46488526463508606, + "learning_rate": 0.00015286915608469048, + "loss": 1.5499, + "step": 18147 + }, + { + "epoch": 0.23582498298526905, + "grad_norm": 0.3845004737377167, + "learning_rate": 0.0001528665566227791, + "loss": 1.4191, + "step": 18148 + }, + { + "epoch": 0.23583797752918492, + "grad_norm": 0.3803398013114929, + "learning_rate": 0.0001528639571608677, + "loss": 1.5062, + "step": 18149 + }, + { + "epoch": 0.2358509720731008, + "grad_norm": 0.48074230551719666, + "learning_rate": 0.00015286135769895632, + "loss": 1.4756, + "step": 18150 + }, + { + "epoch": 0.23586396661701667, + "grad_norm": 0.47667598724365234, + "learning_rate": 0.00015285875823704495, + "loss": 1.4859, + "step": 18151 + }, + { + "epoch": 0.23587696116093254, + "grad_norm": 0.5368838906288147, + "learning_rate": 0.00015285615877513355, + "loss": 1.4254, + "step": 18152 + }, + { + "epoch": 0.23588995570484841, + "grad_norm": 0.4175216257572174, + "learning_rate": 0.00015285355931322217, + "loss": 1.69, + "step": 18153 + }, + { + "epoch": 0.2359029502487643, + "grad_norm": 0.47914665937423706, + "learning_rate": 0.0001528509598513108, + "loss": 1.8011, + "step": 18154 + }, + { + "epoch": 0.23591594479268016, + "grad_norm": 0.4637472331523895, + "learning_rate": 0.0001528483603893994, + "loss": 1.2943, + "step": 18155 + }, + { + "epoch": 0.23592893933659603, + "grad_norm": 0.39966654777526855, + "learning_rate": 0.00015284576092748802, + "loss": 1.5315, + "step": 18156 + }, + { + "epoch": 0.23594193388051193, + "grad_norm": 0.27806514501571655, + "learning_rate": 0.0001528431614655766, + "loss": 1.3058, + "step": 18157 + }, + { + "epoch": 0.2359549284244278, + "grad_norm": 0.4609842598438263, + "learning_rate": 0.00015284056200366527, + "loss": 1.421, + "step": 18158 + }, + { + "epoch": 0.23596792296834368, + "grad_norm": 0.3547143042087555, + "learning_rate": 0.00015283796254175386, + "loss": 1.1357, + "step": 18159 + }, + { + "epoch": 0.23598091751225955, + "grad_norm": 0.355663001537323, + "learning_rate": 0.0001528353630798425, + "loss": 1.334, + "step": 18160 + }, + { + "epoch": 0.23599391205617543, + "grad_norm": 0.5052456259727478, + "learning_rate": 0.00015283276361793108, + "loss": 1.6809, + "step": 18161 + }, + { + "epoch": 0.2360069066000913, + "grad_norm": 0.4380223751068115, + "learning_rate": 0.0001528301641560197, + "loss": 1.2919, + "step": 18162 + }, + { + "epoch": 0.23601990114400717, + "grad_norm": 0.3678244948387146, + "learning_rate": 0.00015282756469410833, + "loss": 1.3556, + "step": 18163 + }, + { + "epoch": 0.23603289568792304, + "grad_norm": 0.3166038691997528, + "learning_rate": 0.00015282496523219693, + "loss": 1.3418, + "step": 18164 + }, + { + "epoch": 0.23604589023183892, + "grad_norm": 0.420303076505661, + "learning_rate": 0.00015282236577028556, + "loss": 1.3637, + "step": 18165 + }, + { + "epoch": 0.2360588847757548, + "grad_norm": 0.35121268033981323, + "learning_rate": 0.00015281976630837418, + "loss": 1.4739, + "step": 18166 + }, + { + "epoch": 0.23607187931967066, + "grad_norm": 0.2775443494319916, + "learning_rate": 0.00015281716684646278, + "loss": 1.2545, + "step": 18167 + }, + { + "epoch": 0.23608487386358654, + "grad_norm": 0.45077207684516907, + "learning_rate": 0.0001528145673845514, + "loss": 1.3224, + "step": 18168 + }, + { + "epoch": 0.2360978684075024, + "grad_norm": 0.377049058675766, + "learning_rate": 0.00015281196792264003, + "loss": 1.5797, + "step": 18169 + }, + { + "epoch": 0.23611086295141828, + "grad_norm": 0.4229763448238373, + "learning_rate": 0.00015280936846072865, + "loss": 1.6762, + "step": 18170 + }, + { + "epoch": 0.23612385749533416, + "grad_norm": 0.5033701062202454, + "learning_rate": 0.00015280676899881725, + "loss": 1.4265, + "step": 18171 + }, + { + "epoch": 0.23613685203925003, + "grad_norm": 0.3585719168186188, + "learning_rate": 0.00015280416953690587, + "loss": 1.3259, + "step": 18172 + }, + { + "epoch": 0.2361498465831659, + "grad_norm": 0.3322538733482361, + "learning_rate": 0.0001528015700749945, + "loss": 1.5847, + "step": 18173 + }, + { + "epoch": 0.23616284112708177, + "grad_norm": 0.4248094856739044, + "learning_rate": 0.0001527989706130831, + "loss": 1.3984, + "step": 18174 + }, + { + "epoch": 0.23617583567099765, + "grad_norm": 0.30544593930244446, + "learning_rate": 0.00015279637115117172, + "loss": 1.3828, + "step": 18175 + }, + { + "epoch": 0.23618883021491352, + "grad_norm": 0.37814876437187195, + "learning_rate": 0.00015279377168926032, + "loss": 1.4739, + "step": 18176 + }, + { + "epoch": 0.2362018247588294, + "grad_norm": 0.3748113512992859, + "learning_rate": 0.00015279117222734897, + "loss": 1.4219, + "step": 18177 + }, + { + "epoch": 0.23621481930274527, + "grad_norm": 0.4318793714046478, + "learning_rate": 0.00015278857276543756, + "loss": 1.4285, + "step": 18178 + }, + { + "epoch": 0.23622781384666114, + "grad_norm": 0.574853241443634, + "learning_rate": 0.00015278597330352616, + "loss": 1.3646, + "step": 18179 + }, + { + "epoch": 0.236240808390577, + "grad_norm": 0.3363092839717865, + "learning_rate": 0.0001527833738416148, + "loss": 1.2765, + "step": 18180 + }, + { + "epoch": 0.23625380293449288, + "grad_norm": 0.5701215267181396, + "learning_rate": 0.0001527807743797034, + "loss": 1.516, + "step": 18181 + }, + { + "epoch": 0.23626679747840876, + "grad_norm": 0.41120481491088867, + "learning_rate": 0.00015277817491779204, + "loss": 1.348, + "step": 18182 + }, + { + "epoch": 0.23627979202232463, + "grad_norm": 0.4616166353225708, + "learning_rate": 0.00015277557545588063, + "loss": 1.674, + "step": 18183 + }, + { + "epoch": 0.2362927865662405, + "grad_norm": 0.3635489344596863, + "learning_rate": 0.00015277297599396926, + "loss": 1.3864, + "step": 18184 + }, + { + "epoch": 0.23630578111015638, + "grad_norm": 0.37966054677963257, + "learning_rate": 0.00015277037653205788, + "loss": 1.2714, + "step": 18185 + }, + { + "epoch": 0.23631877565407225, + "grad_norm": 0.37608158588409424, + "learning_rate": 0.00015276777707014648, + "loss": 1.3796, + "step": 18186 + }, + { + "epoch": 0.23633177019798812, + "grad_norm": 0.37520378828048706, + "learning_rate": 0.0001527651776082351, + "loss": 1.3243, + "step": 18187 + }, + { + "epoch": 0.236344764741904, + "grad_norm": 0.4705045819282532, + "learning_rate": 0.0001527625781463237, + "loss": 1.4067, + "step": 18188 + }, + { + "epoch": 0.23635775928581987, + "grad_norm": 0.38363322615623474, + "learning_rate": 0.00015275997868441235, + "loss": 1.34, + "step": 18189 + }, + { + "epoch": 0.23637075382973574, + "grad_norm": 0.415509968996048, + "learning_rate": 0.00015275737922250095, + "loss": 1.4282, + "step": 18190 + }, + { + "epoch": 0.23638374837365161, + "grad_norm": 0.4897744953632355, + "learning_rate": 0.00015275477976058957, + "loss": 1.4026, + "step": 18191 + }, + { + "epoch": 0.2363967429175675, + "grad_norm": 0.3456737995147705, + "learning_rate": 0.00015275218029867817, + "loss": 1.3776, + "step": 18192 + }, + { + "epoch": 0.23640973746148336, + "grad_norm": 0.2571519911289215, + "learning_rate": 0.0001527495808367668, + "loss": 1.2256, + "step": 18193 + }, + { + "epoch": 0.23642273200539923, + "grad_norm": 0.36385491490364075, + "learning_rate": 0.00015274698137485542, + "loss": 1.4353, + "step": 18194 + }, + { + "epoch": 0.2364357265493151, + "grad_norm": 0.39608821272850037, + "learning_rate": 0.00015274438191294402, + "loss": 1.4698, + "step": 18195 + }, + { + "epoch": 0.23644872109323098, + "grad_norm": 0.35390350222587585, + "learning_rate": 0.00015274178245103264, + "loss": 1.3155, + "step": 18196 + }, + { + "epoch": 0.23646171563714685, + "grad_norm": 0.334207147359848, + "learning_rate": 0.00015273918298912127, + "loss": 1.6021, + "step": 18197 + }, + { + "epoch": 0.23647471018106273, + "grad_norm": 0.3848886489868164, + "learning_rate": 0.00015273658352720986, + "loss": 1.5115, + "step": 18198 + }, + { + "epoch": 0.2364877047249786, + "grad_norm": 0.38187968730926514, + "learning_rate": 0.0001527339840652985, + "loss": 1.2702, + "step": 18199 + }, + { + "epoch": 0.23650069926889447, + "grad_norm": 0.2943970263004303, + "learning_rate": 0.0001527313846033871, + "loss": 1.2392, + "step": 18200 + }, + { + "epoch": 0.23651369381281034, + "grad_norm": 0.5097970366477966, + "learning_rate": 0.00015272878514147574, + "loss": 1.5173, + "step": 18201 + }, + { + "epoch": 0.23652668835672622, + "grad_norm": 0.3958309292793274, + "learning_rate": 0.00015272618567956434, + "loss": 1.6729, + "step": 18202 + }, + { + "epoch": 0.2365396829006421, + "grad_norm": 0.39633217453956604, + "learning_rate": 0.00015272358621765296, + "loss": 1.4933, + "step": 18203 + }, + { + "epoch": 0.23655267744455796, + "grad_norm": 0.42337802052497864, + "learning_rate": 0.00015272098675574158, + "loss": 1.6307, + "step": 18204 + }, + { + "epoch": 0.23656567198847384, + "grad_norm": 0.37842652201652527, + "learning_rate": 0.00015271838729383018, + "loss": 1.5237, + "step": 18205 + }, + { + "epoch": 0.2365786665323897, + "grad_norm": 0.410157173871994, + "learning_rate": 0.0001527157878319188, + "loss": 1.3664, + "step": 18206 + }, + { + "epoch": 0.23659166107630558, + "grad_norm": 0.2758738100528717, + "learning_rate": 0.0001527131883700074, + "loss": 1.1969, + "step": 18207 + }, + { + "epoch": 0.23660465562022145, + "grad_norm": 0.32757776975631714, + "learning_rate": 0.00015271058890809603, + "loss": 1.2699, + "step": 18208 + }, + { + "epoch": 0.23661765016413733, + "grad_norm": 0.5655000805854797, + "learning_rate": 0.00015270798944618465, + "loss": 1.4511, + "step": 18209 + }, + { + "epoch": 0.2366306447080532, + "grad_norm": 0.4192947745323181, + "learning_rate": 0.00015270538998427325, + "loss": 1.5025, + "step": 18210 + }, + { + "epoch": 0.23664363925196907, + "grad_norm": 0.43690788745880127, + "learning_rate": 0.00015270279052236187, + "loss": 1.5444, + "step": 18211 + }, + { + "epoch": 0.23665663379588495, + "grad_norm": 0.4251912534236908, + "learning_rate": 0.0001527001910604505, + "loss": 1.4209, + "step": 18212 + }, + { + "epoch": 0.23666962833980082, + "grad_norm": 0.38708940148353577, + "learning_rate": 0.00015269759159853912, + "loss": 1.3019, + "step": 18213 + }, + { + "epoch": 0.2366826228837167, + "grad_norm": 0.34309008717536926, + "learning_rate": 0.00015269499213662772, + "loss": 1.3288, + "step": 18214 + }, + { + "epoch": 0.23669561742763257, + "grad_norm": 0.39441609382629395, + "learning_rate": 0.00015269239267471635, + "loss": 1.4494, + "step": 18215 + }, + { + "epoch": 0.23670861197154844, + "grad_norm": 0.4112738072872162, + "learning_rate": 0.00015268979321280497, + "loss": 1.4111, + "step": 18216 + }, + { + "epoch": 0.2367216065154643, + "grad_norm": 0.43523266911506653, + "learning_rate": 0.00015268719375089357, + "loss": 1.2338, + "step": 18217 + }, + { + "epoch": 0.23673460105938018, + "grad_norm": 0.4263109266757965, + "learning_rate": 0.0001526845942889822, + "loss": 1.218, + "step": 18218 + }, + { + "epoch": 0.23674759560329606, + "grad_norm": 0.3401554226875305, + "learning_rate": 0.0001526819948270708, + "loss": 1.3865, + "step": 18219 + }, + { + "epoch": 0.23676059014721193, + "grad_norm": 0.42171597480773926, + "learning_rate": 0.00015267939536515944, + "loss": 1.3249, + "step": 18220 + }, + { + "epoch": 0.2367735846911278, + "grad_norm": 0.4004462659358978, + "learning_rate": 0.00015267679590324804, + "loss": 1.4731, + "step": 18221 + }, + { + "epoch": 0.23678657923504368, + "grad_norm": 0.3541699945926666, + "learning_rate": 0.00015267419644133664, + "loss": 1.3885, + "step": 18222 + }, + { + "epoch": 0.23679957377895955, + "grad_norm": 0.3860013782978058, + "learning_rate": 0.00015267159697942526, + "loss": 1.5273, + "step": 18223 + }, + { + "epoch": 0.23681256832287542, + "grad_norm": 0.4005851149559021, + "learning_rate": 0.00015266899751751388, + "loss": 1.5126, + "step": 18224 + }, + { + "epoch": 0.2368255628667913, + "grad_norm": 0.3217337131500244, + "learning_rate": 0.0001526663980556025, + "loss": 1.2865, + "step": 18225 + }, + { + "epoch": 0.23683855741070717, + "grad_norm": 0.2983606457710266, + "learning_rate": 0.0001526637985936911, + "loss": 1.1833, + "step": 18226 + }, + { + "epoch": 0.23685155195462304, + "grad_norm": 0.40271472930908203, + "learning_rate": 0.00015266119913177973, + "loss": 1.4144, + "step": 18227 + }, + { + "epoch": 0.2368645464985389, + "grad_norm": 0.455280601978302, + "learning_rate": 0.00015265859966986836, + "loss": 1.4037, + "step": 18228 + }, + { + "epoch": 0.2368775410424548, + "grad_norm": 0.47944316267967224, + "learning_rate": 0.00015265600020795695, + "loss": 1.5018, + "step": 18229 + }, + { + "epoch": 0.23689053558637066, + "grad_norm": 0.39556461572647095, + "learning_rate": 0.00015265340074604558, + "loss": 1.5045, + "step": 18230 + }, + { + "epoch": 0.23690353013028653, + "grad_norm": 0.3195951282978058, + "learning_rate": 0.00015265080128413417, + "loss": 1.1856, + "step": 18231 + }, + { + "epoch": 0.2369165246742024, + "grad_norm": 0.33018478751182556, + "learning_rate": 0.00015264820182222283, + "loss": 1.3831, + "step": 18232 + }, + { + "epoch": 0.2369295192181183, + "grad_norm": 0.2715296149253845, + "learning_rate": 0.00015264560236031142, + "loss": 1.3369, + "step": 18233 + }, + { + "epoch": 0.23694251376203418, + "grad_norm": 0.40880000591278076, + "learning_rate": 0.00015264300289840002, + "loss": 1.5024, + "step": 18234 + }, + { + "epoch": 0.23695550830595005, + "grad_norm": 0.39754101634025574, + "learning_rate": 0.00015264040343648865, + "loss": 1.431, + "step": 18235 + }, + { + "epoch": 0.23696850284986593, + "grad_norm": 0.34071943163871765, + "learning_rate": 0.00015263780397457727, + "loss": 1.3663, + "step": 18236 + }, + { + "epoch": 0.2369814973937818, + "grad_norm": 0.4209735691547394, + "learning_rate": 0.0001526352045126659, + "loss": 1.4971, + "step": 18237 + }, + { + "epoch": 0.23699449193769767, + "grad_norm": 0.4041473865509033, + "learning_rate": 0.0001526326050507545, + "loss": 1.3777, + "step": 18238 + }, + { + "epoch": 0.23700748648161354, + "grad_norm": 0.3437062203884125, + "learning_rate": 0.00015263000558884312, + "loss": 1.5667, + "step": 18239 + }, + { + "epoch": 0.23702048102552942, + "grad_norm": 0.4070585072040558, + "learning_rate": 0.00015262740612693174, + "loss": 1.3817, + "step": 18240 + }, + { + "epoch": 0.2370334755694453, + "grad_norm": 0.3852291405200958, + "learning_rate": 0.00015262480666502034, + "loss": 1.3261, + "step": 18241 + }, + { + "epoch": 0.23704647011336116, + "grad_norm": 0.35380423069000244, + "learning_rate": 0.00015262220720310896, + "loss": 1.4057, + "step": 18242 + }, + { + "epoch": 0.23705946465727704, + "grad_norm": 0.4682086706161499, + "learning_rate": 0.0001526196077411976, + "loss": 1.5476, + "step": 18243 + }, + { + "epoch": 0.2370724592011929, + "grad_norm": 0.49074384570121765, + "learning_rate": 0.0001526170082792862, + "loss": 1.3677, + "step": 18244 + }, + { + "epoch": 0.23708545374510878, + "grad_norm": 0.37268275022506714, + "learning_rate": 0.0001526144088173748, + "loss": 1.1758, + "step": 18245 + }, + { + "epoch": 0.23709844828902465, + "grad_norm": 0.35650113224983215, + "learning_rate": 0.0001526118093554634, + "loss": 1.3764, + "step": 18246 + }, + { + "epoch": 0.23711144283294053, + "grad_norm": 0.35127323865890503, + "learning_rate": 0.00015260920989355206, + "loss": 1.2237, + "step": 18247 + }, + { + "epoch": 0.2371244373768564, + "grad_norm": 0.40921294689178467, + "learning_rate": 0.00015260661043164066, + "loss": 1.4648, + "step": 18248 + }, + { + "epoch": 0.23713743192077227, + "grad_norm": 0.3901052176952362, + "learning_rate": 0.00015260401096972928, + "loss": 1.3654, + "step": 18249 + }, + { + "epoch": 0.23715042646468815, + "grad_norm": 0.35409045219421387, + "learning_rate": 0.00015260141150781788, + "loss": 1.3752, + "step": 18250 + }, + { + "epoch": 0.23716342100860402, + "grad_norm": 0.438001424074173, + "learning_rate": 0.0001525988120459065, + "loss": 1.3889, + "step": 18251 + }, + { + "epoch": 0.2371764155525199, + "grad_norm": 0.38508597016334534, + "learning_rate": 0.00015259621258399513, + "loss": 1.4102, + "step": 18252 + }, + { + "epoch": 0.23718941009643577, + "grad_norm": 0.4647999703884125, + "learning_rate": 0.00015259361312208372, + "loss": 1.3686, + "step": 18253 + }, + { + "epoch": 0.23720240464035164, + "grad_norm": 0.4336076080799103, + "learning_rate": 0.00015259101366017235, + "loss": 1.4542, + "step": 18254 + }, + { + "epoch": 0.2372153991842675, + "grad_norm": 0.6248377561569214, + "learning_rate": 0.00015258841419826097, + "loss": 1.6398, + "step": 18255 + }, + { + "epoch": 0.23722839372818338, + "grad_norm": 0.4756849408149719, + "learning_rate": 0.0001525858147363496, + "loss": 1.4641, + "step": 18256 + }, + { + "epoch": 0.23724138827209926, + "grad_norm": 0.44121065735816956, + "learning_rate": 0.0001525832152744382, + "loss": 1.4335, + "step": 18257 + }, + { + "epoch": 0.23725438281601513, + "grad_norm": 0.43411433696746826, + "learning_rate": 0.00015258061581252682, + "loss": 1.5405, + "step": 18258 + }, + { + "epoch": 0.237267377359931, + "grad_norm": 0.42824992537498474, + "learning_rate": 0.00015257801635061544, + "loss": 1.309, + "step": 18259 + }, + { + "epoch": 0.23728037190384688, + "grad_norm": 0.3974263072013855, + "learning_rate": 0.00015257541688870404, + "loss": 1.3186, + "step": 18260 + }, + { + "epoch": 0.23729336644776275, + "grad_norm": 0.391261488199234, + "learning_rate": 0.00015257281742679267, + "loss": 1.3448, + "step": 18261 + }, + { + "epoch": 0.23730636099167862, + "grad_norm": 0.37573957443237305, + "learning_rate": 0.00015257021796488126, + "loss": 1.441, + "step": 18262 + }, + { + "epoch": 0.2373193555355945, + "grad_norm": 0.38714393973350525, + "learning_rate": 0.0001525676185029699, + "loss": 1.3944, + "step": 18263 + }, + { + "epoch": 0.23733235007951037, + "grad_norm": 0.3637382686138153, + "learning_rate": 0.0001525650190410585, + "loss": 1.2501, + "step": 18264 + }, + { + "epoch": 0.23734534462342624, + "grad_norm": 0.43220192193984985, + "learning_rate": 0.0001525624195791471, + "loss": 1.372, + "step": 18265 + }, + { + "epoch": 0.23735833916734211, + "grad_norm": 0.38607120513916016, + "learning_rate": 0.00015255982011723573, + "loss": 1.3484, + "step": 18266 + }, + { + "epoch": 0.237371333711258, + "grad_norm": 0.43534281849861145, + "learning_rate": 0.00015255722065532436, + "loss": 1.5282, + "step": 18267 + }, + { + "epoch": 0.23738432825517386, + "grad_norm": 0.4245777726173401, + "learning_rate": 0.00015255462119341298, + "loss": 1.4235, + "step": 18268 + }, + { + "epoch": 0.23739732279908973, + "grad_norm": 0.4120676517486572, + "learning_rate": 0.00015255202173150158, + "loss": 1.4388, + "step": 18269 + }, + { + "epoch": 0.2374103173430056, + "grad_norm": 0.275453120470047, + "learning_rate": 0.0001525494222695902, + "loss": 1.3444, + "step": 18270 + }, + { + "epoch": 0.23742331188692148, + "grad_norm": 0.34658634662628174, + "learning_rate": 0.00015254682280767883, + "loss": 1.2568, + "step": 18271 + }, + { + "epoch": 0.23743630643083735, + "grad_norm": 0.5128961801528931, + "learning_rate": 0.00015254422334576743, + "loss": 1.4373, + "step": 18272 + }, + { + "epoch": 0.23744930097475322, + "grad_norm": 0.3994189202785492, + "learning_rate": 0.00015254162388385605, + "loss": 1.2695, + "step": 18273 + }, + { + "epoch": 0.2374622955186691, + "grad_norm": 0.3061051666736603, + "learning_rate": 0.00015253902442194465, + "loss": 1.4212, + "step": 18274 + }, + { + "epoch": 0.23747529006258497, + "grad_norm": 0.3993454873561859, + "learning_rate": 0.0001525364249600333, + "loss": 1.516, + "step": 18275 + }, + { + "epoch": 0.23748828460650084, + "grad_norm": 0.4022742211818695, + "learning_rate": 0.0001525338254981219, + "loss": 1.3363, + "step": 18276 + }, + { + "epoch": 0.23750127915041672, + "grad_norm": 0.42900004982948303, + "learning_rate": 0.0001525312260362105, + "loss": 1.3001, + "step": 18277 + }, + { + "epoch": 0.2375142736943326, + "grad_norm": 0.40140804648399353, + "learning_rate": 0.00015252862657429915, + "loss": 1.4237, + "step": 18278 + }, + { + "epoch": 0.23752726823824846, + "grad_norm": 0.36197608709335327, + "learning_rate": 0.00015252602711238774, + "loss": 1.49, + "step": 18279 + }, + { + "epoch": 0.23754026278216434, + "grad_norm": 0.4287504553794861, + "learning_rate": 0.00015252342765047637, + "loss": 1.5939, + "step": 18280 + }, + { + "epoch": 0.2375532573260802, + "grad_norm": 0.4833097457885742, + "learning_rate": 0.00015252082818856497, + "loss": 1.3941, + "step": 18281 + }, + { + "epoch": 0.23756625186999608, + "grad_norm": 0.41275930404663086, + "learning_rate": 0.0001525182287266536, + "loss": 1.5896, + "step": 18282 + }, + { + "epoch": 0.23757924641391195, + "grad_norm": 0.4740753769874573, + "learning_rate": 0.00015251562926474221, + "loss": 1.4735, + "step": 18283 + }, + { + "epoch": 0.23759224095782783, + "grad_norm": 0.36063116788864136, + "learning_rate": 0.0001525130298028308, + "loss": 1.475, + "step": 18284 + }, + { + "epoch": 0.2376052355017437, + "grad_norm": 0.34388068318367004, + "learning_rate": 0.00015251043034091944, + "loss": 1.3601, + "step": 18285 + }, + { + "epoch": 0.23761823004565957, + "grad_norm": 0.37087368965148926, + "learning_rate": 0.00015250783087900806, + "loss": 1.387, + "step": 18286 + }, + { + "epoch": 0.23763122458957545, + "grad_norm": 0.37153133749961853, + "learning_rate": 0.00015250523141709669, + "loss": 1.3265, + "step": 18287 + }, + { + "epoch": 0.23764421913349132, + "grad_norm": 0.37546223402023315, + "learning_rate": 0.00015250263195518528, + "loss": 1.4727, + "step": 18288 + }, + { + "epoch": 0.2376572136774072, + "grad_norm": 0.41121768951416016, + "learning_rate": 0.00015250003249327388, + "loss": 1.5391, + "step": 18289 + }, + { + "epoch": 0.23767020822132306, + "grad_norm": 0.35365819931030273, + "learning_rate": 0.00015249743303136253, + "loss": 1.4179, + "step": 18290 + }, + { + "epoch": 0.23768320276523894, + "grad_norm": 0.4539684057235718, + "learning_rate": 0.00015249483356945113, + "loss": 1.4743, + "step": 18291 + }, + { + "epoch": 0.2376961973091548, + "grad_norm": 0.361674964427948, + "learning_rate": 0.00015249223410753975, + "loss": 1.4407, + "step": 18292 + }, + { + "epoch": 0.23770919185307068, + "grad_norm": 0.269663006067276, + "learning_rate": 0.00015248963464562835, + "loss": 1.2026, + "step": 18293 + }, + { + "epoch": 0.23772218639698656, + "grad_norm": 0.4076891541481018, + "learning_rate": 0.00015248703518371698, + "loss": 1.3339, + "step": 18294 + }, + { + "epoch": 0.23773518094090243, + "grad_norm": 0.3721511662006378, + "learning_rate": 0.0001524844357218056, + "loss": 1.311, + "step": 18295 + }, + { + "epoch": 0.2377481754848183, + "grad_norm": 0.3775753974914551, + "learning_rate": 0.0001524818362598942, + "loss": 1.4413, + "step": 18296 + }, + { + "epoch": 0.23776117002873418, + "grad_norm": 0.3262495696544647, + "learning_rate": 0.00015247923679798282, + "loss": 1.2478, + "step": 18297 + }, + { + "epoch": 0.23777416457265005, + "grad_norm": 0.4048340916633606, + "learning_rate": 0.00015247663733607145, + "loss": 1.3213, + "step": 18298 + }, + { + "epoch": 0.23778715911656592, + "grad_norm": 0.5252066850662231, + "learning_rate": 0.00015247403787416007, + "loss": 1.3677, + "step": 18299 + }, + { + "epoch": 0.2378001536604818, + "grad_norm": 0.41798362135887146, + "learning_rate": 0.00015247143841224867, + "loss": 1.4824, + "step": 18300 + }, + { + "epoch": 0.23781314820439767, + "grad_norm": 0.34314557909965515, + "learning_rate": 0.00015246883895033727, + "loss": 1.3602, + "step": 18301 + }, + { + "epoch": 0.23782614274831354, + "grad_norm": 0.3864387571811676, + "learning_rate": 0.00015246623948842592, + "loss": 1.3383, + "step": 18302 + }, + { + "epoch": 0.2378391372922294, + "grad_norm": 0.40235865116119385, + "learning_rate": 0.00015246364002651451, + "loss": 1.3264, + "step": 18303 + }, + { + "epoch": 0.2378521318361453, + "grad_norm": 0.4304735064506531, + "learning_rate": 0.00015246104056460314, + "loss": 1.5047, + "step": 18304 + }, + { + "epoch": 0.23786512638006116, + "grad_norm": 0.31614983081817627, + "learning_rate": 0.00015245844110269174, + "loss": 1.3543, + "step": 18305 + }, + { + "epoch": 0.23787812092397703, + "grad_norm": 0.27633342146873474, + "learning_rate": 0.00015245584164078036, + "loss": 1.3811, + "step": 18306 + }, + { + "epoch": 0.2378911154678929, + "grad_norm": 0.3414054214954376, + "learning_rate": 0.00015245324217886899, + "loss": 1.2454, + "step": 18307 + }, + { + "epoch": 0.23790411001180878, + "grad_norm": 0.36108309030532837, + "learning_rate": 0.00015245064271695758, + "loss": 1.4144, + "step": 18308 + }, + { + "epoch": 0.23791710455572468, + "grad_norm": 0.43713364005088806, + "learning_rate": 0.0001524480432550462, + "loss": 1.503, + "step": 18309 + }, + { + "epoch": 0.23793009909964055, + "grad_norm": 0.46419891715049744, + "learning_rate": 0.00015244544379313483, + "loss": 1.4029, + "step": 18310 + }, + { + "epoch": 0.23794309364355642, + "grad_norm": 0.39231863617897034, + "learning_rate": 0.00015244284433122346, + "loss": 1.4094, + "step": 18311 + }, + { + "epoch": 0.2379560881874723, + "grad_norm": 0.3756033778190613, + "learning_rate": 0.00015244024486931205, + "loss": 1.3828, + "step": 18312 + }, + { + "epoch": 0.23796908273138817, + "grad_norm": 0.5546630620956421, + "learning_rate": 0.00015243764540740068, + "loss": 1.5738, + "step": 18313 + }, + { + "epoch": 0.23798207727530404, + "grad_norm": 0.4147547483444214, + "learning_rate": 0.0001524350459454893, + "loss": 1.2955, + "step": 18314 + }, + { + "epoch": 0.23799507181921992, + "grad_norm": 0.43519827723503113, + "learning_rate": 0.0001524324464835779, + "loss": 1.6257, + "step": 18315 + }, + { + "epoch": 0.2380080663631358, + "grad_norm": 0.3643302917480469, + "learning_rate": 0.00015242984702166652, + "loss": 1.0968, + "step": 18316 + }, + { + "epoch": 0.23802106090705166, + "grad_norm": 0.3854471743106842, + "learning_rate": 0.00015242724755975515, + "loss": 1.5254, + "step": 18317 + }, + { + "epoch": 0.23803405545096754, + "grad_norm": 0.48158252239227295, + "learning_rate": 0.00015242464809784375, + "loss": 1.5382, + "step": 18318 + }, + { + "epoch": 0.2380470499948834, + "grad_norm": 0.356721967458725, + "learning_rate": 0.00015242204863593237, + "loss": 1.5433, + "step": 18319 + }, + { + "epoch": 0.23806004453879928, + "grad_norm": 0.3497735261917114, + "learning_rate": 0.00015241944917402097, + "loss": 1.4411, + "step": 18320 + }, + { + "epoch": 0.23807303908271515, + "grad_norm": 0.4345513582229614, + "learning_rate": 0.00015241684971210962, + "loss": 1.4692, + "step": 18321 + }, + { + "epoch": 0.23808603362663103, + "grad_norm": 0.49815332889556885, + "learning_rate": 0.00015241425025019822, + "loss": 1.3836, + "step": 18322 + }, + { + "epoch": 0.2380990281705469, + "grad_norm": 0.3473820090293884, + "learning_rate": 0.00015241165078828684, + "loss": 1.361, + "step": 18323 + }, + { + "epoch": 0.23811202271446277, + "grad_norm": 0.3871738612651825, + "learning_rate": 0.00015240905132637544, + "loss": 1.4123, + "step": 18324 + }, + { + "epoch": 0.23812501725837865, + "grad_norm": 0.3766672611236572, + "learning_rate": 0.00015240645186446406, + "loss": 1.5922, + "step": 18325 + }, + { + "epoch": 0.23813801180229452, + "grad_norm": 0.33464813232421875, + "learning_rate": 0.0001524038524025527, + "loss": 1.4179, + "step": 18326 + }, + { + "epoch": 0.2381510063462104, + "grad_norm": 0.5216385126113892, + "learning_rate": 0.00015240125294064128, + "loss": 1.4162, + "step": 18327 + }, + { + "epoch": 0.23816400089012627, + "grad_norm": 0.43903517723083496, + "learning_rate": 0.0001523986534787299, + "loss": 1.5467, + "step": 18328 + }, + { + "epoch": 0.23817699543404214, + "grad_norm": 0.40008556842803955, + "learning_rate": 0.00015239605401681853, + "loss": 1.4518, + "step": 18329 + }, + { + "epoch": 0.238189989977958, + "grad_norm": 0.36536145210266113, + "learning_rate": 0.00015239345455490713, + "loss": 1.3205, + "step": 18330 + }, + { + "epoch": 0.23820298452187388, + "grad_norm": 0.38722220063209534, + "learning_rate": 0.00015239085509299576, + "loss": 1.4093, + "step": 18331 + }, + { + "epoch": 0.23821597906578976, + "grad_norm": 0.4167955219745636, + "learning_rate": 0.00015238825563108435, + "loss": 1.5592, + "step": 18332 + }, + { + "epoch": 0.23822897360970563, + "grad_norm": 0.4013184607028961, + "learning_rate": 0.000152385656169173, + "loss": 1.4779, + "step": 18333 + }, + { + "epoch": 0.2382419681536215, + "grad_norm": 0.293133020401001, + "learning_rate": 0.0001523830567072616, + "loss": 1.3147, + "step": 18334 + }, + { + "epoch": 0.23825496269753738, + "grad_norm": 0.448988676071167, + "learning_rate": 0.00015238045724535023, + "loss": 1.6063, + "step": 18335 + }, + { + "epoch": 0.23826795724145325, + "grad_norm": 0.38091573119163513, + "learning_rate": 0.00015237785778343882, + "loss": 1.3969, + "step": 18336 + }, + { + "epoch": 0.23828095178536912, + "grad_norm": 0.38666629791259766, + "learning_rate": 0.00015237525832152745, + "loss": 1.4154, + "step": 18337 + }, + { + "epoch": 0.238293946329285, + "grad_norm": 0.3075655698776245, + "learning_rate": 0.00015237265885961607, + "loss": 1.4303, + "step": 18338 + }, + { + "epoch": 0.23830694087320087, + "grad_norm": 0.3782234191894531, + "learning_rate": 0.00015237005939770467, + "loss": 1.3338, + "step": 18339 + }, + { + "epoch": 0.23831993541711674, + "grad_norm": 0.4326476752758026, + "learning_rate": 0.0001523674599357933, + "loss": 1.5497, + "step": 18340 + }, + { + "epoch": 0.2383329299610326, + "grad_norm": 0.44837895035743713, + "learning_rate": 0.00015236486047388192, + "loss": 1.6385, + "step": 18341 + }, + { + "epoch": 0.2383459245049485, + "grad_norm": 0.41195419430732727, + "learning_rate": 0.00015236226101197054, + "loss": 1.5162, + "step": 18342 + }, + { + "epoch": 0.23835891904886436, + "grad_norm": 0.4462265074253082, + "learning_rate": 0.00015235966155005914, + "loss": 1.3663, + "step": 18343 + }, + { + "epoch": 0.23837191359278023, + "grad_norm": 0.4363690912723541, + "learning_rate": 0.00015235706208814774, + "loss": 1.4924, + "step": 18344 + }, + { + "epoch": 0.2383849081366961, + "grad_norm": 0.3536207675933838, + "learning_rate": 0.0001523544626262364, + "loss": 1.3365, + "step": 18345 + }, + { + "epoch": 0.23839790268061198, + "grad_norm": 0.37310531735420227, + "learning_rate": 0.000152351863164325, + "loss": 1.4971, + "step": 18346 + }, + { + "epoch": 0.23841089722452785, + "grad_norm": 0.39613497257232666, + "learning_rate": 0.0001523492637024136, + "loss": 1.3801, + "step": 18347 + }, + { + "epoch": 0.23842389176844372, + "grad_norm": 0.38797375559806824, + "learning_rate": 0.0001523466642405022, + "loss": 1.2277, + "step": 18348 + }, + { + "epoch": 0.2384368863123596, + "grad_norm": 0.401252418756485, + "learning_rate": 0.00015234406477859083, + "loss": 1.3874, + "step": 18349 + }, + { + "epoch": 0.23844988085627547, + "grad_norm": 0.3404322862625122, + "learning_rate": 0.00015234146531667946, + "loss": 1.335, + "step": 18350 + }, + { + "epoch": 0.23846287540019134, + "grad_norm": 0.4653375446796417, + "learning_rate": 0.00015233886585476806, + "loss": 1.4304, + "step": 18351 + }, + { + "epoch": 0.23847586994410722, + "grad_norm": 0.42086586356163025, + "learning_rate": 0.0001523362663928567, + "loss": 1.4604, + "step": 18352 + }, + { + "epoch": 0.2384888644880231, + "grad_norm": 0.4242717921733856, + "learning_rate": 0.0001523336669309453, + "loss": 1.5315, + "step": 18353 + }, + { + "epoch": 0.23850185903193896, + "grad_norm": 0.3968888223171234, + "learning_rate": 0.00015233106746903393, + "loss": 1.2455, + "step": 18354 + }, + { + "epoch": 0.23851485357585483, + "grad_norm": 0.44738245010375977, + "learning_rate": 0.00015232846800712253, + "loss": 1.504, + "step": 18355 + }, + { + "epoch": 0.2385278481197707, + "grad_norm": 0.4098779857158661, + "learning_rate": 0.00015232586854521115, + "loss": 1.652, + "step": 18356 + }, + { + "epoch": 0.23854084266368658, + "grad_norm": 0.42097532749176025, + "learning_rate": 0.00015232326908329978, + "loss": 1.3497, + "step": 18357 + }, + { + "epoch": 0.23855383720760245, + "grad_norm": 0.4069109857082367, + "learning_rate": 0.00015232066962138837, + "loss": 1.6071, + "step": 18358 + }, + { + "epoch": 0.23856683175151833, + "grad_norm": 0.31164857745170593, + "learning_rate": 0.000152318070159477, + "loss": 1.4385, + "step": 18359 + }, + { + "epoch": 0.2385798262954342, + "grad_norm": 0.37497246265411377, + "learning_rate": 0.00015231547069756562, + "loss": 1.2898, + "step": 18360 + }, + { + "epoch": 0.23859282083935007, + "grad_norm": 0.4040505588054657, + "learning_rate": 0.00015231287123565422, + "loss": 1.4444, + "step": 18361 + }, + { + "epoch": 0.23860581538326595, + "grad_norm": 0.34165674448013306, + "learning_rate": 0.00015231027177374284, + "loss": 1.3848, + "step": 18362 + }, + { + "epoch": 0.23861880992718182, + "grad_norm": 0.4454728662967682, + "learning_rate": 0.00015230767231183144, + "loss": 1.4778, + "step": 18363 + }, + { + "epoch": 0.2386318044710977, + "grad_norm": 0.4510919451713562, + "learning_rate": 0.0001523050728499201, + "loss": 1.563, + "step": 18364 + }, + { + "epoch": 0.23864479901501356, + "grad_norm": 0.3385016620159149, + "learning_rate": 0.0001523024733880087, + "loss": 1.5803, + "step": 18365 + }, + { + "epoch": 0.23865779355892944, + "grad_norm": 0.3359794020652771, + "learning_rate": 0.00015229987392609731, + "loss": 1.4566, + "step": 18366 + }, + { + "epoch": 0.2386707881028453, + "grad_norm": 0.3456694483757019, + "learning_rate": 0.0001522972744641859, + "loss": 1.3548, + "step": 18367 + }, + { + "epoch": 0.23868378264676118, + "grad_norm": 0.3981991112232208, + "learning_rate": 0.00015229467500227454, + "loss": 1.3751, + "step": 18368 + }, + { + "epoch": 0.23869677719067706, + "grad_norm": 0.4316973090171814, + "learning_rate": 0.00015229207554036316, + "loss": 1.4836, + "step": 18369 + }, + { + "epoch": 0.23870977173459293, + "grad_norm": 0.3886548578739166, + "learning_rate": 0.00015228947607845176, + "loss": 1.2173, + "step": 18370 + }, + { + "epoch": 0.2387227662785088, + "grad_norm": 0.4843144118785858, + "learning_rate": 0.00015228687661654038, + "loss": 1.4104, + "step": 18371 + }, + { + "epoch": 0.23873576082242468, + "grad_norm": 0.4144282341003418, + "learning_rate": 0.000152284277154629, + "loss": 1.5156, + "step": 18372 + }, + { + "epoch": 0.23874875536634055, + "grad_norm": 0.33613187074661255, + "learning_rate": 0.0001522816776927176, + "loss": 1.2932, + "step": 18373 + }, + { + "epoch": 0.23876174991025642, + "grad_norm": 0.41678911447525024, + "learning_rate": 0.00015227907823080623, + "loss": 1.365, + "step": 18374 + }, + { + "epoch": 0.2387747444541723, + "grad_norm": 0.41785669326782227, + "learning_rate": 0.00015227647876889483, + "loss": 1.5861, + "step": 18375 + }, + { + "epoch": 0.23878773899808817, + "grad_norm": 0.44068747758865356, + "learning_rate": 0.00015227387930698348, + "loss": 1.3881, + "step": 18376 + }, + { + "epoch": 0.23880073354200404, + "grad_norm": 0.39700567722320557, + "learning_rate": 0.00015227127984507208, + "loss": 1.2586, + "step": 18377 + }, + { + "epoch": 0.2388137280859199, + "grad_norm": 0.3955710530281067, + "learning_rate": 0.0001522686803831607, + "loss": 1.3585, + "step": 18378 + }, + { + "epoch": 0.23882672262983579, + "grad_norm": 0.32789579033851624, + "learning_rate": 0.0001522660809212493, + "loss": 1.3655, + "step": 18379 + }, + { + "epoch": 0.23883971717375166, + "grad_norm": 0.3593163788318634, + "learning_rate": 0.00015226348145933792, + "loss": 1.3141, + "step": 18380 + }, + { + "epoch": 0.23885271171766753, + "grad_norm": 0.425920307636261, + "learning_rate": 0.00015226088199742655, + "loss": 1.3062, + "step": 18381 + }, + { + "epoch": 0.2388657062615834, + "grad_norm": 0.41736099123954773, + "learning_rate": 0.00015225828253551514, + "loss": 1.338, + "step": 18382 + }, + { + "epoch": 0.23887870080549928, + "grad_norm": 0.4712754786014557, + "learning_rate": 0.00015225568307360377, + "loss": 1.4469, + "step": 18383 + }, + { + "epoch": 0.23889169534941515, + "grad_norm": 0.44545063376426697, + "learning_rate": 0.0001522530836116924, + "loss": 1.5173, + "step": 18384 + }, + { + "epoch": 0.23890468989333105, + "grad_norm": 0.417752742767334, + "learning_rate": 0.000152250484149781, + "loss": 1.3929, + "step": 18385 + }, + { + "epoch": 0.23891768443724692, + "grad_norm": 0.4690520763397217, + "learning_rate": 0.00015224788468786961, + "loss": 1.4693, + "step": 18386 + }, + { + "epoch": 0.2389306789811628, + "grad_norm": 0.45876073837280273, + "learning_rate": 0.0001522452852259582, + "loss": 1.2926, + "step": 18387 + }, + { + "epoch": 0.23894367352507867, + "grad_norm": 0.3711584806442261, + "learning_rate": 0.00015224268576404686, + "loss": 1.4161, + "step": 18388 + }, + { + "epoch": 0.23895666806899454, + "grad_norm": 0.4448450207710266, + "learning_rate": 0.00015224008630213546, + "loss": 1.337, + "step": 18389 + }, + { + "epoch": 0.23896966261291042, + "grad_norm": 0.4528861939907074, + "learning_rate": 0.00015223748684022409, + "loss": 1.4324, + "step": 18390 + }, + { + "epoch": 0.2389826571568263, + "grad_norm": 0.4680466055870056, + "learning_rate": 0.0001522348873783127, + "loss": 1.4565, + "step": 18391 + }, + { + "epoch": 0.23899565170074216, + "grad_norm": 0.24746529757976532, + "learning_rate": 0.0001522322879164013, + "loss": 1.3496, + "step": 18392 + }, + { + "epoch": 0.23900864624465804, + "grad_norm": 0.35205063223838806, + "learning_rate": 0.00015222968845448993, + "loss": 1.4021, + "step": 18393 + }, + { + "epoch": 0.2390216407885739, + "grad_norm": 0.42321908473968506, + "learning_rate": 0.00015222708899257853, + "loss": 1.4395, + "step": 18394 + }, + { + "epoch": 0.23903463533248978, + "grad_norm": 0.37585991621017456, + "learning_rate": 0.00015222448953066718, + "loss": 1.3448, + "step": 18395 + }, + { + "epoch": 0.23904762987640565, + "grad_norm": 0.3511659502983093, + "learning_rate": 0.00015222189006875578, + "loss": 1.3364, + "step": 18396 + }, + { + "epoch": 0.23906062442032153, + "grad_norm": 0.365261435508728, + "learning_rate": 0.0001522192906068444, + "loss": 1.4228, + "step": 18397 + }, + { + "epoch": 0.2390736189642374, + "grad_norm": 0.375203937292099, + "learning_rate": 0.000152216691144933, + "loss": 1.4221, + "step": 18398 + }, + { + "epoch": 0.23908661350815327, + "grad_norm": 0.4070294201374054, + "learning_rate": 0.00015221409168302162, + "loss": 1.3619, + "step": 18399 + }, + { + "epoch": 0.23909960805206915, + "grad_norm": 0.43807366490364075, + "learning_rate": 0.00015221149222111025, + "loss": 1.465, + "step": 18400 + }, + { + "epoch": 0.23911260259598502, + "grad_norm": 0.47241339087486267, + "learning_rate": 0.00015220889275919885, + "loss": 1.4921, + "step": 18401 + }, + { + "epoch": 0.2391255971399009, + "grad_norm": 0.44848671555519104, + "learning_rate": 0.00015220629329728747, + "loss": 1.5368, + "step": 18402 + }, + { + "epoch": 0.23913859168381676, + "grad_norm": 0.39667055010795593, + "learning_rate": 0.0001522036938353761, + "loss": 1.318, + "step": 18403 + }, + { + "epoch": 0.23915158622773264, + "grad_norm": 0.37080222368240356, + "learning_rate": 0.0001522010943734647, + "loss": 1.3532, + "step": 18404 + }, + { + "epoch": 0.2391645807716485, + "grad_norm": 0.38848569989204407, + "learning_rate": 0.00015219849491155332, + "loss": 1.4087, + "step": 18405 + }, + { + "epoch": 0.23917757531556438, + "grad_norm": 0.3809947073459625, + "learning_rate": 0.00015219589544964191, + "loss": 1.3293, + "step": 18406 + }, + { + "epoch": 0.23919056985948026, + "grad_norm": 0.4530285596847534, + "learning_rate": 0.00015219329598773057, + "loss": 1.4698, + "step": 18407 + }, + { + "epoch": 0.23920356440339613, + "grad_norm": 0.3524971902370453, + "learning_rate": 0.00015219069652581916, + "loss": 1.7743, + "step": 18408 + }, + { + "epoch": 0.239216558947312, + "grad_norm": 0.438679963350296, + "learning_rate": 0.0001521880970639078, + "loss": 1.4769, + "step": 18409 + }, + { + "epoch": 0.23922955349122788, + "grad_norm": 0.38857272267341614, + "learning_rate": 0.00015218549760199639, + "loss": 1.3954, + "step": 18410 + }, + { + "epoch": 0.23924254803514375, + "grad_norm": 0.41575130820274353, + "learning_rate": 0.000152182898140085, + "loss": 1.4375, + "step": 18411 + }, + { + "epoch": 0.23925554257905962, + "grad_norm": 0.4694860577583313, + "learning_rate": 0.00015218029867817363, + "loss": 1.3106, + "step": 18412 + }, + { + "epoch": 0.2392685371229755, + "grad_norm": 0.4033541679382324, + "learning_rate": 0.00015217769921626223, + "loss": 1.5521, + "step": 18413 + }, + { + "epoch": 0.23928153166689137, + "grad_norm": 0.4256977140903473, + "learning_rate": 0.00015217509975435086, + "loss": 1.3459, + "step": 18414 + }, + { + "epoch": 0.23929452621080724, + "grad_norm": 0.39320868253707886, + "learning_rate": 0.00015217250029243948, + "loss": 1.4408, + "step": 18415 + }, + { + "epoch": 0.2393075207547231, + "grad_norm": 0.4771225154399872, + "learning_rate": 0.00015216990083052808, + "loss": 1.472, + "step": 18416 + }, + { + "epoch": 0.23932051529863899, + "grad_norm": 0.37927496433258057, + "learning_rate": 0.0001521673013686167, + "loss": 1.4159, + "step": 18417 + }, + { + "epoch": 0.23933350984255486, + "grad_norm": 0.5706114172935486, + "learning_rate": 0.0001521647019067053, + "loss": 1.4365, + "step": 18418 + }, + { + "epoch": 0.23934650438647073, + "grad_norm": 0.4115237295627594, + "learning_rate": 0.00015216210244479395, + "loss": 1.3363, + "step": 18419 + }, + { + "epoch": 0.2393594989303866, + "grad_norm": 0.37234610319137573, + "learning_rate": 0.00015215950298288255, + "loss": 1.4372, + "step": 18420 + }, + { + "epoch": 0.23937249347430248, + "grad_norm": 0.37164726853370667, + "learning_rate": 0.00015215690352097117, + "loss": 1.382, + "step": 18421 + }, + { + "epoch": 0.23938548801821835, + "grad_norm": 0.444560706615448, + "learning_rate": 0.00015215430405905977, + "loss": 1.402, + "step": 18422 + }, + { + "epoch": 0.23939848256213422, + "grad_norm": 0.48399990797042847, + "learning_rate": 0.0001521517045971484, + "loss": 1.3937, + "step": 18423 + }, + { + "epoch": 0.2394114771060501, + "grad_norm": 0.4545885920524597, + "learning_rate": 0.00015214910513523702, + "loss": 1.4436, + "step": 18424 + }, + { + "epoch": 0.23942447164996597, + "grad_norm": 0.4635612964630127, + "learning_rate": 0.00015214650567332562, + "loss": 1.3423, + "step": 18425 + }, + { + "epoch": 0.23943746619388184, + "grad_norm": 0.46208158135414124, + "learning_rate": 0.00015214390621141427, + "loss": 1.3664, + "step": 18426 + }, + { + "epoch": 0.23945046073779772, + "grad_norm": 0.4331016540527344, + "learning_rate": 0.00015214130674950287, + "loss": 1.3717, + "step": 18427 + }, + { + "epoch": 0.2394634552817136, + "grad_norm": 0.4727119207382202, + "learning_rate": 0.00015213870728759146, + "loss": 1.5932, + "step": 18428 + }, + { + "epoch": 0.23947644982562946, + "grad_norm": 0.3666777014732361, + "learning_rate": 0.0001521361078256801, + "loss": 1.427, + "step": 18429 + }, + { + "epoch": 0.23948944436954533, + "grad_norm": 0.3947739899158478, + "learning_rate": 0.0001521335083637687, + "loss": 1.5911, + "step": 18430 + }, + { + "epoch": 0.2395024389134612, + "grad_norm": 0.47298958897590637, + "learning_rate": 0.00015213090890185734, + "loss": 1.2793, + "step": 18431 + }, + { + "epoch": 0.23951543345737708, + "grad_norm": 0.4279867112636566, + "learning_rate": 0.00015212830943994593, + "loss": 1.299, + "step": 18432 + }, + { + "epoch": 0.23952842800129295, + "grad_norm": 0.4041167199611664, + "learning_rate": 0.00015212570997803456, + "loss": 1.3863, + "step": 18433 + }, + { + "epoch": 0.23954142254520883, + "grad_norm": 0.42256590723991394, + "learning_rate": 0.00015212311051612318, + "loss": 1.4091, + "step": 18434 + }, + { + "epoch": 0.2395544170891247, + "grad_norm": 0.4084274172782898, + "learning_rate": 0.00015212051105421178, + "loss": 1.2583, + "step": 18435 + }, + { + "epoch": 0.23956741163304057, + "grad_norm": 0.4347551167011261, + "learning_rate": 0.0001521179115923004, + "loss": 1.5064, + "step": 18436 + }, + { + "epoch": 0.23958040617695645, + "grad_norm": 0.40250375866889954, + "learning_rate": 0.000152115312130389, + "loss": 1.3499, + "step": 18437 + }, + { + "epoch": 0.23959340072087232, + "grad_norm": 0.36157944798469543, + "learning_rate": 0.00015211271266847765, + "loss": 1.365, + "step": 18438 + }, + { + "epoch": 0.2396063952647882, + "grad_norm": 0.46978047490119934, + "learning_rate": 0.00015211011320656625, + "loss": 1.515, + "step": 18439 + }, + { + "epoch": 0.23961938980870406, + "grad_norm": 0.41418614983558655, + "learning_rate": 0.00015210751374465485, + "loss": 1.5187, + "step": 18440 + }, + { + "epoch": 0.23963238435261994, + "grad_norm": 0.33228474855422974, + "learning_rate": 0.00015210491428274347, + "loss": 1.4437, + "step": 18441 + }, + { + "epoch": 0.2396453788965358, + "grad_norm": 0.36606141924858093, + "learning_rate": 0.0001521023148208321, + "loss": 1.3836, + "step": 18442 + }, + { + "epoch": 0.23965837344045168, + "grad_norm": 0.3457067310810089, + "learning_rate": 0.00015209971535892072, + "loss": 1.4185, + "step": 18443 + }, + { + "epoch": 0.23967136798436756, + "grad_norm": 0.37555575370788574, + "learning_rate": 0.00015209711589700932, + "loss": 1.2671, + "step": 18444 + }, + { + "epoch": 0.23968436252828343, + "grad_norm": 0.34376436471939087, + "learning_rate": 0.00015209451643509794, + "loss": 1.2509, + "step": 18445 + }, + { + "epoch": 0.2396973570721993, + "grad_norm": 0.3246956169605255, + "learning_rate": 0.00015209191697318657, + "loss": 1.344, + "step": 18446 + }, + { + "epoch": 0.23971035161611517, + "grad_norm": 0.43418511748313904, + "learning_rate": 0.00015208931751127517, + "loss": 1.465, + "step": 18447 + }, + { + "epoch": 0.23972334616003105, + "grad_norm": 0.35498565435409546, + "learning_rate": 0.0001520867180493638, + "loss": 1.4512, + "step": 18448 + }, + { + "epoch": 0.23973634070394692, + "grad_norm": 0.45842888951301575, + "learning_rate": 0.0001520841185874524, + "loss": 1.5731, + "step": 18449 + }, + { + "epoch": 0.2397493352478628, + "grad_norm": 0.38945823907852173, + "learning_rate": 0.00015208151912554104, + "loss": 1.4053, + "step": 18450 + }, + { + "epoch": 0.23976232979177867, + "grad_norm": 0.3220931887626648, + "learning_rate": 0.00015207891966362964, + "loss": 1.3988, + "step": 18451 + }, + { + "epoch": 0.23977532433569454, + "grad_norm": 0.3733195960521698, + "learning_rate": 0.00015207632020171823, + "loss": 1.4427, + "step": 18452 + }, + { + "epoch": 0.2397883188796104, + "grad_norm": 0.41567760705947876, + "learning_rate": 0.00015207372073980686, + "loss": 1.4139, + "step": 18453 + }, + { + "epoch": 0.23980131342352629, + "grad_norm": 0.4601441025733948, + "learning_rate": 0.00015207112127789548, + "loss": 1.3478, + "step": 18454 + }, + { + "epoch": 0.23981430796744216, + "grad_norm": 0.41283464431762695, + "learning_rate": 0.0001520685218159841, + "loss": 1.4726, + "step": 18455 + }, + { + "epoch": 0.23982730251135803, + "grad_norm": 0.3775501847267151, + "learning_rate": 0.0001520659223540727, + "loss": 1.6285, + "step": 18456 + }, + { + "epoch": 0.2398402970552739, + "grad_norm": 0.3454986810684204, + "learning_rate": 0.00015206332289216133, + "loss": 1.2349, + "step": 18457 + }, + { + "epoch": 0.23985329159918978, + "grad_norm": 0.33893612027168274, + "learning_rate": 0.00015206072343024995, + "loss": 1.1989, + "step": 18458 + }, + { + "epoch": 0.23986628614310565, + "grad_norm": 0.3938547372817993, + "learning_rate": 0.00015205812396833855, + "loss": 1.3242, + "step": 18459 + }, + { + "epoch": 0.23987928068702152, + "grad_norm": 0.39689940214157104, + "learning_rate": 0.00015205552450642718, + "loss": 1.4818, + "step": 18460 + }, + { + "epoch": 0.23989227523093742, + "grad_norm": 0.39818379282951355, + "learning_rate": 0.00015205292504451577, + "loss": 1.2801, + "step": 18461 + }, + { + "epoch": 0.2399052697748533, + "grad_norm": 0.48824170231819153, + "learning_rate": 0.00015205032558260442, + "loss": 1.4916, + "step": 18462 + }, + { + "epoch": 0.23991826431876917, + "grad_norm": 0.3895243704319, + "learning_rate": 0.00015204772612069302, + "loss": 1.3978, + "step": 18463 + }, + { + "epoch": 0.23993125886268504, + "grad_norm": 0.40258723497390747, + "learning_rate": 0.00015204512665878165, + "loss": 1.4048, + "step": 18464 + }, + { + "epoch": 0.23994425340660092, + "grad_norm": 0.42507022619247437, + "learning_rate": 0.00015204252719687027, + "loss": 1.2852, + "step": 18465 + }, + { + "epoch": 0.2399572479505168, + "grad_norm": 0.34923067688941956, + "learning_rate": 0.00015203992773495887, + "loss": 1.3582, + "step": 18466 + }, + { + "epoch": 0.23997024249443266, + "grad_norm": 0.49082961678504944, + "learning_rate": 0.0001520373282730475, + "loss": 1.4326, + "step": 18467 + }, + { + "epoch": 0.23998323703834853, + "grad_norm": 0.5216532349586487, + "learning_rate": 0.0001520347288111361, + "loss": 1.4784, + "step": 18468 + }, + { + "epoch": 0.2399962315822644, + "grad_norm": 0.42487505078315735, + "learning_rate": 0.00015203212934922471, + "loss": 1.379, + "step": 18469 + }, + { + "epoch": 0.24000922612618028, + "grad_norm": 0.4226248264312744, + "learning_rate": 0.00015202952988731334, + "loss": 1.2864, + "step": 18470 + }, + { + "epoch": 0.24002222067009615, + "grad_norm": 0.48685169219970703, + "learning_rate": 0.00015202693042540194, + "loss": 1.3841, + "step": 18471 + }, + { + "epoch": 0.24003521521401203, + "grad_norm": 0.47948235273361206, + "learning_rate": 0.00015202433096349056, + "loss": 1.4886, + "step": 18472 + }, + { + "epoch": 0.2400482097579279, + "grad_norm": 0.4333220422267914, + "learning_rate": 0.00015202173150157919, + "loss": 1.406, + "step": 18473 + }, + { + "epoch": 0.24006120430184377, + "grad_norm": 0.4021318852901459, + "learning_rate": 0.0001520191320396678, + "loss": 1.4615, + "step": 18474 + }, + { + "epoch": 0.24007419884575965, + "grad_norm": 0.433322012424469, + "learning_rate": 0.0001520165325777564, + "loss": 1.3888, + "step": 18475 + }, + { + "epoch": 0.24008719338967552, + "grad_norm": 0.4169984757900238, + "learning_rate": 0.00015201393311584503, + "loss": 1.4407, + "step": 18476 + }, + { + "epoch": 0.2401001879335914, + "grad_norm": 0.39146688580513, + "learning_rate": 0.00015201133365393366, + "loss": 1.0661, + "step": 18477 + }, + { + "epoch": 0.24011318247750726, + "grad_norm": 0.46941983699798584, + "learning_rate": 0.00015200873419202225, + "loss": 1.5075, + "step": 18478 + }, + { + "epoch": 0.24012617702142314, + "grad_norm": 0.3752908408641815, + "learning_rate": 0.00015200613473011088, + "loss": 1.5302, + "step": 18479 + }, + { + "epoch": 0.240139171565339, + "grad_norm": 0.3781212866306305, + "learning_rate": 0.00015200353526819948, + "loss": 1.3203, + "step": 18480 + }, + { + "epoch": 0.24015216610925488, + "grad_norm": 0.3345184922218323, + "learning_rate": 0.00015200093580628813, + "loss": 1.4135, + "step": 18481 + }, + { + "epoch": 0.24016516065317076, + "grad_norm": 0.41585254669189453, + "learning_rate": 0.00015199833634437672, + "loss": 1.5058, + "step": 18482 + }, + { + "epoch": 0.24017815519708663, + "grad_norm": 0.3971256613731384, + "learning_rate": 0.00015199573688246532, + "loss": 1.3248, + "step": 18483 + }, + { + "epoch": 0.2401911497410025, + "grad_norm": 0.375776082277298, + "learning_rate": 0.00015199313742055395, + "loss": 1.406, + "step": 18484 + }, + { + "epoch": 0.24020414428491837, + "grad_norm": 0.49672961235046387, + "learning_rate": 0.00015199053795864257, + "loss": 1.3617, + "step": 18485 + }, + { + "epoch": 0.24021713882883425, + "grad_norm": 0.427846759557724, + "learning_rate": 0.0001519879384967312, + "loss": 1.3318, + "step": 18486 + }, + { + "epoch": 0.24023013337275012, + "grad_norm": 0.4171332120895386, + "learning_rate": 0.0001519853390348198, + "loss": 1.4284, + "step": 18487 + }, + { + "epoch": 0.240243127916666, + "grad_norm": 0.520584225654602, + "learning_rate": 0.00015198273957290842, + "loss": 1.2623, + "step": 18488 + }, + { + "epoch": 0.24025612246058187, + "grad_norm": 0.38603925704956055, + "learning_rate": 0.00015198014011099704, + "loss": 1.6043, + "step": 18489 + }, + { + "epoch": 0.24026911700449774, + "grad_norm": 0.36628198623657227, + "learning_rate": 0.00015197754064908564, + "loss": 1.4751, + "step": 18490 + }, + { + "epoch": 0.2402821115484136, + "grad_norm": 0.38636234402656555, + "learning_rate": 0.00015197494118717426, + "loss": 1.3396, + "step": 18491 + }, + { + "epoch": 0.24029510609232949, + "grad_norm": 0.37485790252685547, + "learning_rate": 0.00015197234172526286, + "loss": 1.3657, + "step": 18492 + }, + { + "epoch": 0.24030810063624536, + "grad_norm": 0.40534693002700806, + "learning_rate": 0.0001519697422633515, + "loss": 1.6734, + "step": 18493 + }, + { + "epoch": 0.24032109518016123, + "grad_norm": 0.38207098841667175, + "learning_rate": 0.0001519671428014401, + "loss": 1.3262, + "step": 18494 + }, + { + "epoch": 0.2403340897240771, + "grad_norm": 0.39534109830856323, + "learning_rate": 0.0001519645433395287, + "loss": 1.2607, + "step": 18495 + }, + { + "epoch": 0.24034708426799298, + "grad_norm": 0.4871385395526886, + "learning_rate": 0.00015196194387761733, + "loss": 1.4781, + "step": 18496 + }, + { + "epoch": 0.24036007881190885, + "grad_norm": 0.4130176901817322, + "learning_rate": 0.00015195934441570596, + "loss": 1.4118, + "step": 18497 + }, + { + "epoch": 0.24037307335582472, + "grad_norm": 0.4164632260799408, + "learning_rate": 0.00015195674495379458, + "loss": 1.6275, + "step": 18498 + }, + { + "epoch": 0.2403860678997406, + "grad_norm": 0.4478120505809784, + "learning_rate": 0.00015195414549188318, + "loss": 1.3451, + "step": 18499 + }, + { + "epoch": 0.24039906244365647, + "grad_norm": 0.5194246768951416, + "learning_rate": 0.0001519515460299718, + "loss": 1.502, + "step": 18500 + }, + { + "epoch": 0.24041205698757234, + "grad_norm": 0.2550109028816223, + "learning_rate": 0.00015194894656806043, + "loss": 1.3911, + "step": 18501 + }, + { + "epoch": 0.24042505153148822, + "grad_norm": 0.43518000841140747, + "learning_rate": 0.00015194634710614902, + "loss": 1.3353, + "step": 18502 + }, + { + "epoch": 0.2404380460754041, + "grad_norm": 0.32888466119766235, + "learning_rate": 0.00015194374764423765, + "loss": 1.4003, + "step": 18503 + }, + { + "epoch": 0.24045104061931996, + "grad_norm": 0.5493488311767578, + "learning_rate": 0.00015194114818232627, + "loss": 1.5907, + "step": 18504 + }, + { + "epoch": 0.24046403516323583, + "grad_norm": 0.371028333902359, + "learning_rate": 0.0001519385487204149, + "loss": 1.229, + "step": 18505 + }, + { + "epoch": 0.2404770297071517, + "grad_norm": 0.30533456802368164, + "learning_rate": 0.0001519359492585035, + "loss": 1.4314, + "step": 18506 + }, + { + "epoch": 0.24049002425106758, + "grad_norm": 0.4756530225276947, + "learning_rate": 0.0001519333497965921, + "loss": 1.3445, + "step": 18507 + }, + { + "epoch": 0.24050301879498345, + "grad_norm": 0.3461696207523346, + "learning_rate": 0.00015193075033468074, + "loss": 1.4566, + "step": 18508 + }, + { + "epoch": 0.24051601333889933, + "grad_norm": 0.3828386664390564, + "learning_rate": 0.00015192815087276934, + "loss": 1.3277, + "step": 18509 + }, + { + "epoch": 0.2405290078828152, + "grad_norm": 0.46872884035110474, + "learning_rate": 0.00015192555141085797, + "loss": 1.4433, + "step": 18510 + }, + { + "epoch": 0.24054200242673107, + "grad_norm": 0.3163316547870636, + "learning_rate": 0.00015192295194894656, + "loss": 1.4528, + "step": 18511 + }, + { + "epoch": 0.24055499697064694, + "grad_norm": 0.41251397132873535, + "learning_rate": 0.0001519203524870352, + "loss": 1.4331, + "step": 18512 + }, + { + "epoch": 0.24056799151456282, + "grad_norm": 0.40884074568748474, + "learning_rate": 0.0001519177530251238, + "loss": 1.477, + "step": 18513 + }, + { + "epoch": 0.2405809860584787, + "grad_norm": 0.5166527032852173, + "learning_rate": 0.0001519151535632124, + "loss": 1.3682, + "step": 18514 + }, + { + "epoch": 0.24059398060239456, + "grad_norm": 0.37735888361930847, + "learning_rate": 0.00015191255410130103, + "loss": 1.5552, + "step": 18515 + }, + { + "epoch": 0.24060697514631044, + "grad_norm": 0.3381499946117401, + "learning_rate": 0.00015190995463938966, + "loss": 1.2124, + "step": 18516 + }, + { + "epoch": 0.2406199696902263, + "grad_norm": 0.3511542081832886, + "learning_rate": 0.00015190735517747828, + "loss": 1.23, + "step": 18517 + }, + { + "epoch": 0.24063296423414218, + "grad_norm": 0.32661309838294983, + "learning_rate": 0.00015190475571556688, + "loss": 1.3646, + "step": 18518 + }, + { + "epoch": 0.24064595877805806, + "grad_norm": 0.43127313256263733, + "learning_rate": 0.0001519021562536555, + "loss": 1.3926, + "step": 18519 + }, + { + "epoch": 0.24065895332197393, + "grad_norm": 0.41396626830101013, + "learning_rate": 0.00015189955679174413, + "loss": 1.4587, + "step": 18520 + }, + { + "epoch": 0.2406719478658898, + "grad_norm": 0.5347922444343567, + "learning_rate": 0.00015189695732983273, + "loss": 1.2497, + "step": 18521 + }, + { + "epoch": 0.24068494240980567, + "grad_norm": 0.5608534812927246, + "learning_rate": 0.00015189435786792135, + "loss": 1.4887, + "step": 18522 + }, + { + "epoch": 0.24069793695372155, + "grad_norm": 0.3457473814487457, + "learning_rate": 0.00015189175840600995, + "loss": 1.3865, + "step": 18523 + }, + { + "epoch": 0.24071093149763742, + "grad_norm": 0.5131239295005798, + "learning_rate": 0.00015188915894409857, + "loss": 1.4279, + "step": 18524 + }, + { + "epoch": 0.2407239260415533, + "grad_norm": 0.42662137746810913, + "learning_rate": 0.0001518865594821872, + "loss": 1.3431, + "step": 18525 + }, + { + "epoch": 0.24073692058546917, + "grad_norm": 0.4022914171218872, + "learning_rate": 0.0001518839600202758, + "loss": 1.4766, + "step": 18526 + }, + { + "epoch": 0.24074991512938504, + "grad_norm": 0.39197391271591187, + "learning_rate": 0.00015188136055836442, + "loss": 1.4444, + "step": 18527 + }, + { + "epoch": 0.2407629096733009, + "grad_norm": 0.3474847674369812, + "learning_rate": 0.00015187876109645304, + "loss": 1.5234, + "step": 18528 + }, + { + "epoch": 0.24077590421721679, + "grad_norm": 0.349973201751709, + "learning_rate": 0.00015187616163454167, + "loss": 1.2971, + "step": 18529 + }, + { + "epoch": 0.24078889876113266, + "grad_norm": 0.4139890968799591, + "learning_rate": 0.00015187356217263027, + "loss": 1.4601, + "step": 18530 + }, + { + "epoch": 0.24080189330504853, + "grad_norm": 0.43503978848457336, + "learning_rate": 0.0001518709627107189, + "loss": 1.3643, + "step": 18531 + }, + { + "epoch": 0.2408148878489644, + "grad_norm": 0.3425271213054657, + "learning_rate": 0.00015186836324880752, + "loss": 1.398, + "step": 18532 + }, + { + "epoch": 0.24082788239288028, + "grad_norm": 0.41756126284599304, + "learning_rate": 0.0001518657637868961, + "loss": 1.346, + "step": 18533 + }, + { + "epoch": 0.24084087693679615, + "grad_norm": 0.4261583089828491, + "learning_rate": 0.00015186316432498474, + "loss": 1.444, + "step": 18534 + }, + { + "epoch": 0.24085387148071202, + "grad_norm": 0.4392475187778473, + "learning_rate": 0.00015186056486307333, + "loss": 1.4935, + "step": 18535 + }, + { + "epoch": 0.2408668660246279, + "grad_norm": 0.3863852322101593, + "learning_rate": 0.00015185796540116196, + "loss": 1.3656, + "step": 18536 + }, + { + "epoch": 0.24087986056854377, + "grad_norm": 0.3822007179260254, + "learning_rate": 0.00015185536593925058, + "loss": 1.3856, + "step": 18537 + }, + { + "epoch": 0.24089285511245967, + "grad_norm": 0.4949081242084503, + "learning_rate": 0.00015185276647733918, + "loss": 1.4339, + "step": 18538 + }, + { + "epoch": 0.24090584965637554, + "grad_norm": 0.3679749071598053, + "learning_rate": 0.00015185016701542783, + "loss": 1.4916, + "step": 18539 + }, + { + "epoch": 0.24091884420029142, + "grad_norm": 0.3821481168270111, + "learning_rate": 0.00015184756755351643, + "loss": 1.4219, + "step": 18540 + }, + { + "epoch": 0.2409318387442073, + "grad_norm": 0.30985528230667114, + "learning_rate": 0.00015184496809160505, + "loss": 1.2672, + "step": 18541 + }, + { + "epoch": 0.24094483328812316, + "grad_norm": 0.45522674918174744, + "learning_rate": 0.00015184236862969365, + "loss": 1.4787, + "step": 18542 + }, + { + "epoch": 0.24095782783203903, + "grad_norm": 0.3590070307254791, + "learning_rate": 0.00015183976916778228, + "loss": 1.246, + "step": 18543 + }, + { + "epoch": 0.2409708223759549, + "grad_norm": 0.26879581809043884, + "learning_rate": 0.0001518371697058709, + "loss": 1.3358, + "step": 18544 + }, + { + "epoch": 0.24098381691987078, + "grad_norm": 0.37274497747421265, + "learning_rate": 0.0001518345702439595, + "loss": 1.4597, + "step": 18545 + }, + { + "epoch": 0.24099681146378665, + "grad_norm": 0.293200820684433, + "learning_rate": 0.00015183197078204812, + "loss": 1.3228, + "step": 18546 + }, + { + "epoch": 0.24100980600770253, + "grad_norm": 0.3799766004085541, + "learning_rate": 0.00015182937132013675, + "loss": 1.3871, + "step": 18547 + }, + { + "epoch": 0.2410228005516184, + "grad_norm": 0.4362042546272278, + "learning_rate": 0.00015182677185822537, + "loss": 1.4107, + "step": 18548 + }, + { + "epoch": 0.24103579509553427, + "grad_norm": 0.44697362184524536, + "learning_rate": 0.00015182417239631397, + "loss": 1.4029, + "step": 18549 + }, + { + "epoch": 0.24104878963945015, + "grad_norm": 0.2885587215423584, + "learning_rate": 0.00015182157293440257, + "loss": 1.4777, + "step": 18550 + }, + { + "epoch": 0.24106178418336602, + "grad_norm": 0.43840134143829346, + "learning_rate": 0.00015181897347249122, + "loss": 1.3983, + "step": 18551 + }, + { + "epoch": 0.2410747787272819, + "grad_norm": 0.37484657764434814, + "learning_rate": 0.00015181637401057982, + "loss": 1.4156, + "step": 18552 + }, + { + "epoch": 0.24108777327119776, + "grad_norm": 0.3867681324481964, + "learning_rate": 0.00015181377454866844, + "loss": 1.4694, + "step": 18553 + }, + { + "epoch": 0.24110076781511364, + "grad_norm": 0.488303005695343, + "learning_rate": 0.00015181117508675704, + "loss": 1.5646, + "step": 18554 + }, + { + "epoch": 0.2411137623590295, + "grad_norm": 0.3741527199745178, + "learning_rate": 0.00015180857562484566, + "loss": 1.324, + "step": 18555 + }, + { + "epoch": 0.24112675690294538, + "grad_norm": 0.34053561091423035, + "learning_rate": 0.00015180597616293429, + "loss": 1.5557, + "step": 18556 + }, + { + "epoch": 0.24113975144686126, + "grad_norm": 0.31561756134033203, + "learning_rate": 0.00015180337670102288, + "loss": 1.3786, + "step": 18557 + }, + { + "epoch": 0.24115274599077713, + "grad_norm": 0.45263171195983887, + "learning_rate": 0.0001518007772391115, + "loss": 1.3521, + "step": 18558 + }, + { + "epoch": 0.241165740534693, + "grad_norm": 0.39372923970222473, + "learning_rate": 0.00015179817777720013, + "loss": 1.4225, + "step": 18559 + }, + { + "epoch": 0.24117873507860887, + "grad_norm": 0.33100995421409607, + "learning_rate": 0.00015179557831528876, + "loss": 1.2954, + "step": 18560 + }, + { + "epoch": 0.24119172962252475, + "grad_norm": 0.3820338845252991, + "learning_rate": 0.00015179297885337735, + "loss": 1.4251, + "step": 18561 + }, + { + "epoch": 0.24120472416644062, + "grad_norm": 0.39095795154571533, + "learning_rate": 0.00015179037939146595, + "loss": 1.5057, + "step": 18562 + }, + { + "epoch": 0.2412177187103565, + "grad_norm": 0.36307084560394287, + "learning_rate": 0.0001517877799295546, + "loss": 1.4401, + "step": 18563 + }, + { + "epoch": 0.24123071325427237, + "grad_norm": 0.3987591564655304, + "learning_rate": 0.0001517851804676432, + "loss": 1.3163, + "step": 18564 + }, + { + "epoch": 0.24124370779818824, + "grad_norm": 0.4528557062149048, + "learning_rate": 0.00015178258100573183, + "loss": 1.4546, + "step": 18565 + }, + { + "epoch": 0.2412567023421041, + "grad_norm": 0.41332393884658813, + "learning_rate": 0.00015177998154382042, + "loss": 1.3587, + "step": 18566 + }, + { + "epoch": 0.24126969688601999, + "grad_norm": 0.36671555042266846, + "learning_rate": 0.00015177738208190905, + "loss": 1.4171, + "step": 18567 + }, + { + "epoch": 0.24128269142993586, + "grad_norm": 0.3533616364002228, + "learning_rate": 0.00015177478261999767, + "loss": 1.5284, + "step": 18568 + }, + { + "epoch": 0.24129568597385173, + "grad_norm": 0.3955092132091522, + "learning_rate": 0.00015177218315808627, + "loss": 1.3413, + "step": 18569 + }, + { + "epoch": 0.2413086805177676, + "grad_norm": 0.5402181148529053, + "learning_rate": 0.0001517695836961749, + "loss": 1.5172, + "step": 18570 + }, + { + "epoch": 0.24132167506168348, + "grad_norm": 0.42114314436912537, + "learning_rate": 0.00015176698423426352, + "loss": 1.1807, + "step": 18571 + }, + { + "epoch": 0.24133466960559935, + "grad_norm": 0.4067343473434448, + "learning_rate": 0.00015176438477235214, + "loss": 1.4942, + "step": 18572 + }, + { + "epoch": 0.24134766414951522, + "grad_norm": 0.3690411448478699, + "learning_rate": 0.00015176178531044074, + "loss": 1.447, + "step": 18573 + }, + { + "epoch": 0.2413606586934311, + "grad_norm": 0.44165557622909546, + "learning_rate": 0.00015175918584852936, + "loss": 1.3786, + "step": 18574 + }, + { + "epoch": 0.24137365323734697, + "grad_norm": 0.474723219871521, + "learning_rate": 0.000151756586386618, + "loss": 1.4934, + "step": 18575 + }, + { + "epoch": 0.24138664778126284, + "grad_norm": 0.44561460614204407, + "learning_rate": 0.00015175398692470659, + "loss": 1.4458, + "step": 18576 + }, + { + "epoch": 0.24139964232517871, + "grad_norm": 0.45253416895866394, + "learning_rate": 0.0001517513874627952, + "loss": 1.4211, + "step": 18577 + }, + { + "epoch": 0.2414126368690946, + "grad_norm": 0.36146828532218933, + "learning_rate": 0.00015174878800088384, + "loss": 1.3264, + "step": 18578 + }, + { + "epoch": 0.24142563141301046, + "grad_norm": 0.3475085496902466, + "learning_rate": 0.00015174618853897243, + "loss": 1.5759, + "step": 18579 + }, + { + "epoch": 0.24143862595692633, + "grad_norm": 0.38433751463890076, + "learning_rate": 0.00015174358907706106, + "loss": 1.3702, + "step": 18580 + }, + { + "epoch": 0.2414516205008422, + "grad_norm": 0.36033758521080017, + "learning_rate": 0.00015174098961514965, + "loss": 1.5495, + "step": 18581 + }, + { + "epoch": 0.24146461504475808, + "grad_norm": 0.3818190395832062, + "learning_rate": 0.0001517383901532383, + "loss": 1.3455, + "step": 18582 + }, + { + "epoch": 0.24147760958867395, + "grad_norm": 0.5108311176300049, + "learning_rate": 0.0001517357906913269, + "loss": 1.3746, + "step": 18583 + }, + { + "epoch": 0.24149060413258983, + "grad_norm": 0.3417815864086151, + "learning_rate": 0.00015173319122941553, + "loss": 1.2899, + "step": 18584 + }, + { + "epoch": 0.2415035986765057, + "grad_norm": 0.40832841396331787, + "learning_rate": 0.00015173059176750413, + "loss": 1.5907, + "step": 18585 + }, + { + "epoch": 0.24151659322042157, + "grad_norm": 0.32985880970954895, + "learning_rate": 0.00015172799230559275, + "loss": 1.3568, + "step": 18586 + }, + { + "epoch": 0.24152958776433744, + "grad_norm": 0.3042459785938263, + "learning_rate": 0.00015172539284368137, + "loss": 1.3126, + "step": 18587 + }, + { + "epoch": 0.24154258230825332, + "grad_norm": 0.39141055941581726, + "learning_rate": 0.00015172279338176997, + "loss": 1.3054, + "step": 18588 + }, + { + "epoch": 0.2415555768521692, + "grad_norm": 0.4066309332847595, + "learning_rate": 0.0001517201939198586, + "loss": 1.4629, + "step": 18589 + }, + { + "epoch": 0.24156857139608506, + "grad_norm": 0.3061867356300354, + "learning_rate": 0.00015171759445794722, + "loss": 1.383, + "step": 18590 + }, + { + "epoch": 0.24158156594000094, + "grad_norm": 0.3496739864349365, + "learning_rate": 0.00015171499499603582, + "loss": 1.446, + "step": 18591 + }, + { + "epoch": 0.2415945604839168, + "grad_norm": 0.4225510060787201, + "learning_rate": 0.00015171239553412444, + "loss": 1.4763, + "step": 18592 + }, + { + "epoch": 0.24160755502783268, + "grad_norm": 0.39365333318710327, + "learning_rate": 0.00015170979607221304, + "loss": 1.0949, + "step": 18593 + }, + { + "epoch": 0.24162054957174856, + "grad_norm": 0.4351522624492645, + "learning_rate": 0.0001517071966103017, + "loss": 1.3895, + "step": 18594 + }, + { + "epoch": 0.24163354411566443, + "grad_norm": 0.26343002915382385, + "learning_rate": 0.0001517045971483903, + "loss": 1.14, + "step": 18595 + }, + { + "epoch": 0.2416465386595803, + "grad_norm": 0.2849823534488678, + "learning_rate": 0.0001517019976864789, + "loss": 1.2416, + "step": 18596 + }, + { + "epoch": 0.24165953320349617, + "grad_norm": 0.3246159255504608, + "learning_rate": 0.0001516993982245675, + "loss": 1.2887, + "step": 18597 + }, + { + "epoch": 0.24167252774741205, + "grad_norm": 0.4150823652744293, + "learning_rate": 0.00015169679876265613, + "loss": 1.703, + "step": 18598 + }, + { + "epoch": 0.24168552229132792, + "grad_norm": 0.3307783007621765, + "learning_rate": 0.00015169419930074476, + "loss": 1.5364, + "step": 18599 + }, + { + "epoch": 0.2416985168352438, + "grad_norm": 0.38430479168891907, + "learning_rate": 0.00015169159983883336, + "loss": 1.3438, + "step": 18600 + }, + { + "epoch": 0.24171151137915967, + "grad_norm": 0.5762456655502319, + "learning_rate": 0.00015168900037692198, + "loss": 1.5096, + "step": 18601 + }, + { + "epoch": 0.24172450592307554, + "grad_norm": 0.4061860144138336, + "learning_rate": 0.0001516864009150106, + "loss": 1.6329, + "step": 18602 + }, + { + "epoch": 0.2417375004669914, + "grad_norm": 0.4096372425556183, + "learning_rate": 0.00015168380145309923, + "loss": 1.3402, + "step": 18603 + }, + { + "epoch": 0.24175049501090728, + "grad_norm": 0.4047014117240906, + "learning_rate": 0.00015168120199118783, + "loss": 1.3931, + "step": 18604 + }, + { + "epoch": 0.24176348955482316, + "grad_norm": 0.38806089758872986, + "learning_rate": 0.00015167860252927643, + "loss": 1.5532, + "step": 18605 + }, + { + "epoch": 0.24177648409873903, + "grad_norm": 0.4528484046459198, + "learning_rate": 0.00015167600306736508, + "loss": 1.4809, + "step": 18606 + }, + { + "epoch": 0.2417894786426549, + "grad_norm": 0.3606020510196686, + "learning_rate": 0.00015167340360545367, + "loss": 1.4749, + "step": 18607 + }, + { + "epoch": 0.24180247318657078, + "grad_norm": 0.4066159725189209, + "learning_rate": 0.0001516708041435423, + "loss": 1.3681, + "step": 18608 + }, + { + "epoch": 0.24181546773048665, + "grad_norm": 0.4755224883556366, + "learning_rate": 0.0001516682046816309, + "loss": 1.3789, + "step": 18609 + }, + { + "epoch": 0.24182846227440252, + "grad_norm": 0.43885236978530884, + "learning_rate": 0.00015166560521971952, + "loss": 1.238, + "step": 18610 + }, + { + "epoch": 0.2418414568183184, + "grad_norm": 0.366558700799942, + "learning_rate": 0.00015166300575780814, + "loss": 1.2719, + "step": 18611 + }, + { + "epoch": 0.24185445136223427, + "grad_norm": 0.35997137427330017, + "learning_rate": 0.00015166040629589674, + "loss": 1.2549, + "step": 18612 + }, + { + "epoch": 0.24186744590615014, + "grad_norm": 0.4199138581752777, + "learning_rate": 0.0001516578068339854, + "loss": 1.4904, + "step": 18613 + }, + { + "epoch": 0.24188044045006604, + "grad_norm": 0.3938407599925995, + "learning_rate": 0.000151655207372074, + "loss": 1.3414, + "step": 18614 + }, + { + "epoch": 0.24189343499398192, + "grad_norm": 0.3760218918323517, + "learning_rate": 0.00015165260791016262, + "loss": 1.3576, + "step": 18615 + }, + { + "epoch": 0.2419064295378978, + "grad_norm": 0.3322869539260864, + "learning_rate": 0.0001516500084482512, + "loss": 1.4801, + "step": 18616 + }, + { + "epoch": 0.24191942408181366, + "grad_norm": 0.33314236998558044, + "learning_rate": 0.00015164740898633984, + "loss": 1.2436, + "step": 18617 + }, + { + "epoch": 0.24193241862572953, + "grad_norm": 0.39032691717147827, + "learning_rate": 0.00015164480952442846, + "loss": 1.3134, + "step": 18618 + }, + { + "epoch": 0.2419454131696454, + "grad_norm": 0.4915000796318054, + "learning_rate": 0.00015164221006251706, + "loss": 1.5002, + "step": 18619 + }, + { + "epoch": 0.24195840771356128, + "grad_norm": 0.2642613351345062, + "learning_rate": 0.00015163961060060568, + "loss": 1.3446, + "step": 18620 + }, + { + "epoch": 0.24197140225747715, + "grad_norm": 0.4540685713291168, + "learning_rate": 0.0001516370111386943, + "loss": 1.419, + "step": 18621 + }, + { + "epoch": 0.24198439680139303, + "grad_norm": 0.3748992681503296, + "learning_rate": 0.0001516344116767829, + "loss": 1.4248, + "step": 18622 + }, + { + "epoch": 0.2419973913453089, + "grad_norm": 0.3597988784313202, + "learning_rate": 0.00015163181221487153, + "loss": 1.2639, + "step": 18623 + }, + { + "epoch": 0.24201038588922477, + "grad_norm": 0.4828532040119171, + "learning_rate": 0.00015162921275296013, + "loss": 1.4896, + "step": 18624 + }, + { + "epoch": 0.24202338043314064, + "grad_norm": 0.489960640668869, + "learning_rate": 0.00015162661329104878, + "loss": 1.3722, + "step": 18625 + }, + { + "epoch": 0.24203637497705652, + "grad_norm": 0.4617827534675598, + "learning_rate": 0.00015162401382913738, + "loss": 1.3188, + "step": 18626 + }, + { + "epoch": 0.2420493695209724, + "grad_norm": 0.46666109561920166, + "learning_rate": 0.000151621414367226, + "loss": 1.5559, + "step": 18627 + }, + { + "epoch": 0.24206236406488826, + "grad_norm": 0.4043882191181183, + "learning_rate": 0.0001516188149053146, + "loss": 1.3671, + "step": 18628 + }, + { + "epoch": 0.24207535860880414, + "grad_norm": 0.341335266828537, + "learning_rate": 0.00015161621544340322, + "loss": 1.1951, + "step": 18629 + }, + { + "epoch": 0.24208835315272, + "grad_norm": 0.3979286551475525, + "learning_rate": 0.00015161361598149185, + "loss": 1.4869, + "step": 18630 + }, + { + "epoch": 0.24210134769663588, + "grad_norm": 0.5013980269432068, + "learning_rate": 0.00015161101651958044, + "loss": 1.3594, + "step": 18631 + }, + { + "epoch": 0.24211434224055176, + "grad_norm": 0.47453683614730835, + "learning_rate": 0.00015160841705766907, + "loss": 1.5178, + "step": 18632 + }, + { + "epoch": 0.24212733678446763, + "grad_norm": 0.37517401576042175, + "learning_rate": 0.0001516058175957577, + "loss": 1.4119, + "step": 18633 + }, + { + "epoch": 0.2421403313283835, + "grad_norm": 0.5154349207878113, + "learning_rate": 0.0001516032181338463, + "loss": 1.3683, + "step": 18634 + }, + { + "epoch": 0.24215332587229937, + "grad_norm": 0.40972840785980225, + "learning_rate": 0.00015160061867193492, + "loss": 1.5584, + "step": 18635 + }, + { + "epoch": 0.24216632041621525, + "grad_norm": 0.4575447142124176, + "learning_rate": 0.0001515980192100235, + "loss": 1.3647, + "step": 18636 + }, + { + "epoch": 0.24217931496013112, + "grad_norm": 0.5445562601089478, + "learning_rate": 0.00015159541974811216, + "loss": 1.4829, + "step": 18637 + }, + { + "epoch": 0.242192309504047, + "grad_norm": 0.4747721254825592, + "learning_rate": 0.00015159282028620076, + "loss": 1.5206, + "step": 18638 + }, + { + "epoch": 0.24220530404796287, + "grad_norm": 0.4366362392902374, + "learning_rate": 0.00015159022082428939, + "loss": 1.3796, + "step": 18639 + }, + { + "epoch": 0.24221829859187874, + "grad_norm": 0.40907254815101624, + "learning_rate": 0.00015158762136237798, + "loss": 1.5788, + "step": 18640 + }, + { + "epoch": 0.2422312931357946, + "grad_norm": 0.42435750365257263, + "learning_rate": 0.0001515850219004666, + "loss": 1.4762, + "step": 18641 + }, + { + "epoch": 0.24224428767971048, + "grad_norm": 0.3240799307823181, + "learning_rate": 0.00015158242243855523, + "loss": 1.4178, + "step": 18642 + }, + { + "epoch": 0.24225728222362636, + "grad_norm": 0.41263800859451294, + "learning_rate": 0.00015157982297664383, + "loss": 1.2983, + "step": 18643 + }, + { + "epoch": 0.24227027676754223, + "grad_norm": 0.3566671907901764, + "learning_rate": 0.00015157722351473245, + "loss": 1.4416, + "step": 18644 + }, + { + "epoch": 0.2422832713114581, + "grad_norm": 0.47105324268341064, + "learning_rate": 0.00015157462405282108, + "loss": 1.4999, + "step": 18645 + }, + { + "epoch": 0.24229626585537398, + "grad_norm": 0.32150542736053467, + "learning_rate": 0.00015157202459090968, + "loss": 1.6419, + "step": 18646 + }, + { + "epoch": 0.24230926039928985, + "grad_norm": 0.530704915523529, + "learning_rate": 0.0001515694251289983, + "loss": 1.464, + "step": 18647 + }, + { + "epoch": 0.24232225494320572, + "grad_norm": 0.3804032504558563, + "learning_rate": 0.00015156682566708693, + "loss": 1.749, + "step": 18648 + }, + { + "epoch": 0.2423352494871216, + "grad_norm": 0.2966215908527374, + "learning_rate": 0.00015156422620517555, + "loss": 1.2693, + "step": 18649 + }, + { + "epoch": 0.24234824403103747, + "grad_norm": 0.36431822180747986, + "learning_rate": 0.00015156162674326415, + "loss": 1.2924, + "step": 18650 + }, + { + "epoch": 0.24236123857495334, + "grad_norm": 0.43764594197273254, + "learning_rate": 0.00015155902728135277, + "loss": 1.352, + "step": 18651 + }, + { + "epoch": 0.24237423311886921, + "grad_norm": 0.3338315486907959, + "learning_rate": 0.0001515564278194414, + "loss": 1.4392, + "step": 18652 + }, + { + "epoch": 0.2423872276627851, + "grad_norm": 0.6378545165061951, + "learning_rate": 0.00015155382835753, + "loss": 1.4756, + "step": 18653 + }, + { + "epoch": 0.24240022220670096, + "grad_norm": 0.3244880139827728, + "learning_rate": 0.00015155122889561862, + "loss": 1.3912, + "step": 18654 + }, + { + "epoch": 0.24241321675061683, + "grad_norm": 0.4499629735946655, + "learning_rate": 0.00015154862943370722, + "loss": 1.5699, + "step": 18655 + }, + { + "epoch": 0.2424262112945327, + "grad_norm": 0.5940762758255005, + "learning_rate": 0.00015154602997179587, + "loss": 1.454, + "step": 18656 + }, + { + "epoch": 0.24243920583844858, + "grad_norm": 0.5180310010910034, + "learning_rate": 0.00015154343050988446, + "loss": 1.4349, + "step": 18657 + }, + { + "epoch": 0.24245220038236445, + "grad_norm": 0.5157365798950195, + "learning_rate": 0.00015154083104797306, + "loss": 1.4465, + "step": 18658 + }, + { + "epoch": 0.24246519492628033, + "grad_norm": 0.29018861055374146, + "learning_rate": 0.00015153823158606169, + "loss": 1.2229, + "step": 18659 + }, + { + "epoch": 0.2424781894701962, + "grad_norm": 0.4880530536174774, + "learning_rate": 0.0001515356321241503, + "loss": 1.6192, + "step": 18660 + }, + { + "epoch": 0.24249118401411207, + "grad_norm": 0.4546888470649719, + "learning_rate": 0.00015153303266223894, + "loss": 1.2466, + "step": 18661 + }, + { + "epoch": 0.24250417855802794, + "grad_norm": 0.41736820340156555, + "learning_rate": 0.00015153043320032753, + "loss": 1.3336, + "step": 18662 + }, + { + "epoch": 0.24251717310194382, + "grad_norm": 0.34993085265159607, + "learning_rate": 0.00015152783373841616, + "loss": 1.4632, + "step": 18663 + }, + { + "epoch": 0.2425301676458597, + "grad_norm": 0.4183323383331299, + "learning_rate": 0.00015152523427650478, + "loss": 1.4106, + "step": 18664 + }, + { + "epoch": 0.24254316218977556, + "grad_norm": 0.34649717807769775, + "learning_rate": 0.00015152263481459338, + "loss": 1.2787, + "step": 18665 + }, + { + "epoch": 0.24255615673369144, + "grad_norm": 0.34201788902282715, + "learning_rate": 0.000151520035352682, + "loss": 1.2444, + "step": 18666 + }, + { + "epoch": 0.2425691512776073, + "grad_norm": 0.4168111979961395, + "learning_rate": 0.0001515174358907706, + "loss": 1.4562, + "step": 18667 + }, + { + "epoch": 0.24258214582152318, + "grad_norm": 0.24262452125549316, + "learning_rate": 0.00015151483642885925, + "loss": 1.2438, + "step": 18668 + }, + { + "epoch": 0.24259514036543905, + "grad_norm": 0.35859549045562744, + "learning_rate": 0.00015151223696694785, + "loss": 1.2294, + "step": 18669 + }, + { + "epoch": 0.24260813490935493, + "grad_norm": 0.42102178931236267, + "learning_rate": 0.00015150963750503647, + "loss": 1.3983, + "step": 18670 + }, + { + "epoch": 0.2426211294532708, + "grad_norm": 0.4524250626564026, + "learning_rate": 0.00015150703804312507, + "loss": 1.4813, + "step": 18671 + }, + { + "epoch": 0.24263412399718667, + "grad_norm": 0.4526432454586029, + "learning_rate": 0.0001515044385812137, + "loss": 1.3536, + "step": 18672 + }, + { + "epoch": 0.24264711854110255, + "grad_norm": 0.42873483896255493, + "learning_rate": 0.00015150183911930232, + "loss": 1.3566, + "step": 18673 + }, + { + "epoch": 0.24266011308501842, + "grad_norm": 0.31473997235298157, + "learning_rate": 0.00015149923965739092, + "loss": 1.3312, + "step": 18674 + }, + { + "epoch": 0.2426731076289343, + "grad_norm": 0.4092557430267334, + "learning_rate": 0.00015149664019547954, + "loss": 1.4292, + "step": 18675 + }, + { + "epoch": 0.24268610217285017, + "grad_norm": 0.4255172312259674, + "learning_rate": 0.00015149404073356817, + "loss": 1.4521, + "step": 18676 + }, + { + "epoch": 0.24269909671676604, + "grad_norm": 0.42825016379356384, + "learning_rate": 0.00015149144127165676, + "loss": 1.4087, + "step": 18677 + }, + { + "epoch": 0.2427120912606819, + "grad_norm": 0.35758161544799805, + "learning_rate": 0.0001514888418097454, + "loss": 1.2621, + "step": 18678 + }, + { + "epoch": 0.24272508580459778, + "grad_norm": 0.29613304138183594, + "learning_rate": 0.00015148624234783399, + "loss": 1.244, + "step": 18679 + }, + { + "epoch": 0.24273808034851366, + "grad_norm": 0.27445435523986816, + "learning_rate": 0.00015148364288592264, + "loss": 1.2144, + "step": 18680 + }, + { + "epoch": 0.24275107489242953, + "grad_norm": 0.4573116898536682, + "learning_rate": 0.00015148104342401124, + "loss": 1.4982, + "step": 18681 + }, + { + "epoch": 0.2427640694363454, + "grad_norm": 0.37999045848846436, + "learning_rate": 0.00015147844396209986, + "loss": 1.4073, + "step": 18682 + }, + { + "epoch": 0.24277706398026128, + "grad_norm": 0.42922714352607727, + "learning_rate": 0.00015147584450018846, + "loss": 1.551, + "step": 18683 + }, + { + "epoch": 0.24279005852417715, + "grad_norm": 0.47659096121788025, + "learning_rate": 0.00015147324503827708, + "loss": 1.5652, + "step": 18684 + }, + { + "epoch": 0.24280305306809302, + "grad_norm": 0.4912133514881134, + "learning_rate": 0.0001514706455763657, + "loss": 1.5049, + "step": 18685 + }, + { + "epoch": 0.2428160476120089, + "grad_norm": 0.33328741788864136, + "learning_rate": 0.0001514680461144543, + "loss": 1.3937, + "step": 18686 + }, + { + "epoch": 0.24282904215592477, + "grad_norm": 0.3814634680747986, + "learning_rate": 0.00015146544665254296, + "loss": 1.5056, + "step": 18687 + }, + { + "epoch": 0.24284203669984064, + "grad_norm": 0.32378554344177246, + "learning_rate": 0.00015146284719063155, + "loss": 1.5988, + "step": 18688 + }, + { + "epoch": 0.24285503124375651, + "grad_norm": 0.3705339729785919, + "learning_rate": 0.00015146024772872015, + "loss": 1.2346, + "step": 18689 + }, + { + "epoch": 0.24286802578767241, + "grad_norm": 0.43569323420524597, + "learning_rate": 0.00015145764826680877, + "loss": 1.429, + "step": 18690 + }, + { + "epoch": 0.2428810203315883, + "grad_norm": 0.3635070025920868, + "learning_rate": 0.0001514550488048974, + "loss": 1.4371, + "step": 18691 + }, + { + "epoch": 0.24289401487550416, + "grad_norm": 0.45927491784095764, + "learning_rate": 0.00015145244934298602, + "loss": 1.5511, + "step": 18692 + }, + { + "epoch": 0.24290700941942003, + "grad_norm": 0.3554874360561371, + "learning_rate": 0.00015144984988107462, + "loss": 1.431, + "step": 18693 + }, + { + "epoch": 0.2429200039633359, + "grad_norm": 0.4174509644508362, + "learning_rate": 0.00015144725041916325, + "loss": 1.4388, + "step": 18694 + }, + { + "epoch": 0.24293299850725178, + "grad_norm": 0.4513232111930847, + "learning_rate": 0.00015144465095725187, + "loss": 1.3627, + "step": 18695 + }, + { + "epoch": 0.24294599305116765, + "grad_norm": 0.2647462487220764, + "learning_rate": 0.00015144205149534047, + "loss": 1.1804, + "step": 18696 + }, + { + "epoch": 0.24295898759508353, + "grad_norm": 0.4600968062877655, + "learning_rate": 0.0001514394520334291, + "loss": 1.3239, + "step": 18697 + }, + { + "epoch": 0.2429719821389994, + "grad_norm": 0.37200671434402466, + "learning_rate": 0.0001514368525715177, + "loss": 1.2162, + "step": 18698 + }, + { + "epoch": 0.24298497668291527, + "grad_norm": 0.3960192799568176, + "learning_rate": 0.00015143425310960634, + "loss": 1.3289, + "step": 18699 + }, + { + "epoch": 0.24299797122683114, + "grad_norm": 0.34126052260398865, + "learning_rate": 0.00015143165364769494, + "loss": 1.3954, + "step": 18700 + }, + { + "epoch": 0.24301096577074702, + "grad_norm": 0.5253952741622925, + "learning_rate": 0.00015142905418578354, + "loss": 1.4714, + "step": 18701 + }, + { + "epoch": 0.2430239603146629, + "grad_norm": 0.4674523174762726, + "learning_rate": 0.00015142645472387216, + "loss": 1.3456, + "step": 18702 + }, + { + "epoch": 0.24303695485857876, + "grad_norm": 0.30023127794265747, + "learning_rate": 0.00015142385526196078, + "loss": 1.5672, + "step": 18703 + }, + { + "epoch": 0.24304994940249464, + "grad_norm": 0.39188772439956665, + "learning_rate": 0.0001514212558000494, + "loss": 1.4785, + "step": 18704 + }, + { + "epoch": 0.2430629439464105, + "grad_norm": 0.4098399877548218, + "learning_rate": 0.000151418656338138, + "loss": 1.3529, + "step": 18705 + }, + { + "epoch": 0.24307593849032638, + "grad_norm": 0.40387290716171265, + "learning_rate": 0.00015141605687622663, + "loss": 1.3914, + "step": 18706 + }, + { + "epoch": 0.24308893303424225, + "grad_norm": 0.35220301151275635, + "learning_rate": 0.00015141345741431526, + "loss": 1.3964, + "step": 18707 + }, + { + "epoch": 0.24310192757815813, + "grad_norm": 0.36789292097091675, + "learning_rate": 0.00015141085795240385, + "loss": 1.4885, + "step": 18708 + }, + { + "epoch": 0.243114922122074, + "grad_norm": 0.379399836063385, + "learning_rate": 0.00015140825849049248, + "loss": 1.4816, + "step": 18709 + }, + { + "epoch": 0.24312791666598987, + "grad_norm": 0.32574591040611267, + "learning_rate": 0.00015140565902858107, + "loss": 1.4377, + "step": 18710 + }, + { + "epoch": 0.24314091120990575, + "grad_norm": 0.4179539084434509, + "learning_rate": 0.00015140305956666973, + "loss": 1.5756, + "step": 18711 + }, + { + "epoch": 0.24315390575382162, + "grad_norm": 0.391205370426178, + "learning_rate": 0.00015140046010475832, + "loss": 1.5877, + "step": 18712 + }, + { + "epoch": 0.2431669002977375, + "grad_norm": 0.3907163739204407, + "learning_rate": 0.00015139786064284692, + "loss": 1.4361, + "step": 18713 + }, + { + "epoch": 0.24317989484165337, + "grad_norm": 0.33866459131240845, + "learning_rate": 0.00015139526118093555, + "loss": 1.0723, + "step": 18714 + }, + { + "epoch": 0.24319288938556924, + "grad_norm": 0.34207606315612793, + "learning_rate": 0.00015139266171902417, + "loss": 1.4648, + "step": 18715 + }, + { + "epoch": 0.2432058839294851, + "grad_norm": 0.3380109667778015, + "learning_rate": 0.0001513900622571128, + "loss": 1.3762, + "step": 18716 + }, + { + "epoch": 0.24321887847340098, + "grad_norm": 0.47035568952560425, + "learning_rate": 0.0001513874627952014, + "loss": 1.5755, + "step": 18717 + }, + { + "epoch": 0.24323187301731686, + "grad_norm": 0.4450298845767975, + "learning_rate": 0.00015138486333329002, + "loss": 1.2981, + "step": 18718 + }, + { + "epoch": 0.24324486756123273, + "grad_norm": 0.5092129111289978, + "learning_rate": 0.00015138226387137864, + "loss": 1.3505, + "step": 18719 + }, + { + "epoch": 0.2432578621051486, + "grad_norm": 0.3212827444076538, + "learning_rate": 0.00015137966440946724, + "loss": 1.4542, + "step": 18720 + }, + { + "epoch": 0.24327085664906448, + "grad_norm": 0.44399669766426086, + "learning_rate": 0.00015137706494755586, + "loss": 1.5591, + "step": 18721 + }, + { + "epoch": 0.24328385119298035, + "grad_norm": 0.33565202355384827, + "learning_rate": 0.0001513744654856445, + "loss": 1.4235, + "step": 18722 + }, + { + "epoch": 0.24329684573689622, + "grad_norm": 0.4154299199581146, + "learning_rate": 0.0001513718660237331, + "loss": 1.3861, + "step": 18723 + }, + { + "epoch": 0.2433098402808121, + "grad_norm": 0.4201943278312683, + "learning_rate": 0.0001513692665618217, + "loss": 1.4103, + "step": 18724 + }, + { + "epoch": 0.24332283482472797, + "grad_norm": 0.4369250237941742, + "learning_rate": 0.00015136666709991033, + "loss": 1.3311, + "step": 18725 + }, + { + "epoch": 0.24333582936864384, + "grad_norm": 0.33135008811950684, + "learning_rate": 0.00015136406763799896, + "loss": 1.4669, + "step": 18726 + }, + { + "epoch": 0.24334882391255971, + "grad_norm": 0.4350792467594147, + "learning_rate": 0.00015136146817608756, + "loss": 1.3068, + "step": 18727 + }, + { + "epoch": 0.2433618184564756, + "grad_norm": 0.37491574883461, + "learning_rate": 0.00015135886871417618, + "loss": 1.3763, + "step": 18728 + }, + { + "epoch": 0.24337481300039146, + "grad_norm": 0.3830513060092926, + "learning_rate": 0.00015135626925226478, + "loss": 1.5058, + "step": 18729 + }, + { + "epoch": 0.24338780754430733, + "grad_norm": 0.43277010321617126, + "learning_rate": 0.0001513536697903534, + "loss": 1.4395, + "step": 18730 + }, + { + "epoch": 0.2434008020882232, + "grad_norm": 0.28896117210388184, + "learning_rate": 0.00015135107032844203, + "loss": 1.3533, + "step": 18731 + }, + { + "epoch": 0.24341379663213908, + "grad_norm": 0.3370968997478485, + "learning_rate": 0.00015134847086653062, + "loss": 1.5178, + "step": 18732 + }, + { + "epoch": 0.24342679117605495, + "grad_norm": 0.31554126739501953, + "learning_rate": 0.00015134587140461925, + "loss": 1.2208, + "step": 18733 + }, + { + "epoch": 0.24343978571997082, + "grad_norm": 0.3740726411342621, + "learning_rate": 0.00015134327194270787, + "loss": 1.3376, + "step": 18734 + }, + { + "epoch": 0.2434527802638867, + "grad_norm": 0.37324458360671997, + "learning_rate": 0.0001513406724807965, + "loss": 1.2837, + "step": 18735 + }, + { + "epoch": 0.24346577480780257, + "grad_norm": 0.37935417890548706, + "learning_rate": 0.0001513380730188851, + "loss": 1.3733, + "step": 18736 + }, + { + "epoch": 0.24347876935171844, + "grad_norm": 0.3903962969779968, + "learning_rate": 0.00015133547355697372, + "loss": 1.4875, + "step": 18737 + }, + { + "epoch": 0.24349176389563432, + "grad_norm": 0.4248451590538025, + "learning_rate": 0.00015133287409506234, + "loss": 1.4921, + "step": 18738 + }, + { + "epoch": 0.2435047584395502, + "grad_norm": 0.4164523482322693, + "learning_rate": 0.00015133027463315094, + "loss": 1.3684, + "step": 18739 + }, + { + "epoch": 0.24351775298346606, + "grad_norm": 0.4053780436515808, + "learning_rate": 0.00015132767517123956, + "loss": 1.4135, + "step": 18740 + }, + { + "epoch": 0.24353074752738194, + "grad_norm": 0.44323110580444336, + "learning_rate": 0.00015132507570932816, + "loss": 1.5026, + "step": 18741 + }, + { + "epoch": 0.2435437420712978, + "grad_norm": 0.4161290228366852, + "learning_rate": 0.0001513224762474168, + "loss": 1.4473, + "step": 18742 + }, + { + "epoch": 0.24355673661521368, + "grad_norm": 0.37046870589256287, + "learning_rate": 0.0001513198767855054, + "loss": 1.5759, + "step": 18743 + }, + { + "epoch": 0.24356973115912955, + "grad_norm": 0.3534325659275055, + "learning_rate": 0.000151317277323594, + "loss": 1.415, + "step": 18744 + }, + { + "epoch": 0.24358272570304543, + "grad_norm": 0.3760634958744049, + "learning_rate": 0.00015131467786168263, + "loss": 1.4554, + "step": 18745 + }, + { + "epoch": 0.2435957202469613, + "grad_norm": 0.38557153940200806, + "learning_rate": 0.00015131207839977126, + "loss": 1.408, + "step": 18746 + }, + { + "epoch": 0.24360871479087717, + "grad_norm": 0.29940006136894226, + "learning_rate": 0.00015130947893785988, + "loss": 1.2208, + "step": 18747 + }, + { + "epoch": 0.24362170933479305, + "grad_norm": 0.385562926530838, + "learning_rate": 0.00015130687947594848, + "loss": 1.2387, + "step": 18748 + }, + { + "epoch": 0.24363470387870892, + "grad_norm": 0.3814559280872345, + "learning_rate": 0.0001513042800140371, + "loss": 1.4198, + "step": 18749 + }, + { + "epoch": 0.2436476984226248, + "grad_norm": 0.36405354738235474, + "learning_rate": 0.00015130168055212573, + "loss": 1.5686, + "step": 18750 + }, + { + "epoch": 0.24366069296654067, + "grad_norm": 0.3762778639793396, + "learning_rate": 0.00015129908109021433, + "loss": 1.3829, + "step": 18751 + }, + { + "epoch": 0.24367368751045654, + "grad_norm": 0.43630701303482056, + "learning_rate": 0.00015129648162830295, + "loss": 1.4168, + "step": 18752 + }, + { + "epoch": 0.2436866820543724, + "grad_norm": 0.4406484365463257, + "learning_rate": 0.00015129388216639155, + "loss": 1.4702, + "step": 18753 + }, + { + "epoch": 0.24369967659828828, + "grad_norm": 0.379935085773468, + "learning_rate": 0.0001512912827044802, + "loss": 1.4309, + "step": 18754 + }, + { + "epoch": 0.24371267114220416, + "grad_norm": 0.412319153547287, + "learning_rate": 0.0001512886832425688, + "loss": 1.4179, + "step": 18755 + }, + { + "epoch": 0.24372566568612003, + "grad_norm": 0.42225927114486694, + "learning_rate": 0.0001512860837806574, + "loss": 1.345, + "step": 18756 + }, + { + "epoch": 0.2437386602300359, + "grad_norm": 0.43439263105392456, + "learning_rate": 0.00015128348431874602, + "loss": 1.4821, + "step": 18757 + }, + { + "epoch": 0.24375165477395178, + "grad_norm": 0.48218613862991333, + "learning_rate": 0.00015128088485683464, + "loss": 1.4024, + "step": 18758 + }, + { + "epoch": 0.24376464931786765, + "grad_norm": 0.3680278956890106, + "learning_rate": 0.00015127828539492327, + "loss": 1.5368, + "step": 18759 + }, + { + "epoch": 0.24377764386178352, + "grad_norm": 0.39215633273124695, + "learning_rate": 0.00015127568593301186, + "loss": 1.2514, + "step": 18760 + }, + { + "epoch": 0.2437906384056994, + "grad_norm": 0.32422924041748047, + "learning_rate": 0.0001512730864711005, + "loss": 1.3348, + "step": 18761 + }, + { + "epoch": 0.24380363294961527, + "grad_norm": 0.4098733365535736, + "learning_rate": 0.00015127048700918911, + "loss": 1.4497, + "step": 18762 + }, + { + "epoch": 0.24381662749353114, + "grad_norm": 0.405290812253952, + "learning_rate": 0.0001512678875472777, + "loss": 1.4056, + "step": 18763 + }, + { + "epoch": 0.243829622037447, + "grad_norm": 0.3286243975162506, + "learning_rate": 0.00015126528808536634, + "loss": 1.4373, + "step": 18764 + }, + { + "epoch": 0.2438426165813629, + "grad_norm": 0.4578157663345337, + "learning_rate": 0.00015126268862345496, + "loss": 1.4788, + "step": 18765 + }, + { + "epoch": 0.2438556111252788, + "grad_norm": 0.29505306482315063, + "learning_rate": 0.00015126008916154358, + "loss": 1.3035, + "step": 18766 + }, + { + "epoch": 0.24386860566919466, + "grad_norm": 0.36847490072250366, + "learning_rate": 0.00015125748969963218, + "loss": 1.5574, + "step": 18767 + }, + { + "epoch": 0.24388160021311053, + "grad_norm": 0.4174138903617859, + "learning_rate": 0.00015125489023772078, + "loss": 1.6381, + "step": 18768 + }, + { + "epoch": 0.2438945947570264, + "grad_norm": 0.3846185803413391, + "learning_rate": 0.00015125229077580943, + "loss": 1.4377, + "step": 18769 + }, + { + "epoch": 0.24390758930094228, + "grad_norm": 0.4426399767398834, + "learning_rate": 0.00015124969131389803, + "loss": 1.18, + "step": 18770 + }, + { + "epoch": 0.24392058384485815, + "grad_norm": 0.35817059874534607, + "learning_rate": 0.00015124709185198665, + "loss": 1.3283, + "step": 18771 + }, + { + "epoch": 0.24393357838877402, + "grad_norm": 0.3810522258281708, + "learning_rate": 0.00015124449239007525, + "loss": 1.2969, + "step": 18772 + }, + { + "epoch": 0.2439465729326899, + "grad_norm": 0.43992850184440613, + "learning_rate": 0.00015124189292816387, + "loss": 1.2024, + "step": 18773 + }, + { + "epoch": 0.24395956747660577, + "grad_norm": 0.4206596612930298, + "learning_rate": 0.0001512392934662525, + "loss": 1.4586, + "step": 18774 + }, + { + "epoch": 0.24397256202052164, + "grad_norm": 0.36326298117637634, + "learning_rate": 0.0001512366940043411, + "loss": 1.2922, + "step": 18775 + }, + { + "epoch": 0.24398555656443752, + "grad_norm": 0.32933279871940613, + "learning_rate": 0.00015123409454242972, + "loss": 1.2991, + "step": 18776 + }, + { + "epoch": 0.2439985511083534, + "grad_norm": 0.3619011640548706, + "learning_rate": 0.00015123149508051835, + "loss": 1.3923, + "step": 18777 + }, + { + "epoch": 0.24401154565226926, + "grad_norm": 0.359198659658432, + "learning_rate": 0.00015122889561860697, + "loss": 1.3943, + "step": 18778 + }, + { + "epoch": 0.24402454019618514, + "grad_norm": 0.30627089738845825, + "learning_rate": 0.00015122629615669557, + "loss": 1.4395, + "step": 18779 + }, + { + "epoch": 0.244037534740101, + "grad_norm": 0.31865280866622925, + "learning_rate": 0.00015122369669478416, + "loss": 1.3081, + "step": 18780 + }, + { + "epoch": 0.24405052928401688, + "grad_norm": 0.41274237632751465, + "learning_rate": 0.00015122109723287282, + "loss": 1.5199, + "step": 18781 + }, + { + "epoch": 0.24406352382793275, + "grad_norm": 0.34779736399650574, + "learning_rate": 0.00015121849777096141, + "loss": 1.4976, + "step": 18782 + }, + { + "epoch": 0.24407651837184863, + "grad_norm": 0.28953367471694946, + "learning_rate": 0.00015121589830905004, + "loss": 1.3315, + "step": 18783 + }, + { + "epoch": 0.2440895129157645, + "grad_norm": 0.3361044228076935, + "learning_rate": 0.00015121329884713864, + "loss": 1.1861, + "step": 18784 + }, + { + "epoch": 0.24410250745968037, + "grad_norm": 0.49731504917144775, + "learning_rate": 0.00015121069938522726, + "loss": 1.4633, + "step": 18785 + }, + { + "epoch": 0.24411550200359625, + "grad_norm": 0.38585013151168823, + "learning_rate": 0.00015120809992331588, + "loss": 1.4163, + "step": 18786 + }, + { + "epoch": 0.24412849654751212, + "grad_norm": 0.5276221036911011, + "learning_rate": 0.00015120550046140448, + "loss": 1.594, + "step": 18787 + }, + { + "epoch": 0.244141491091428, + "grad_norm": 0.4159316122531891, + "learning_rate": 0.0001512029009994931, + "loss": 1.3507, + "step": 18788 + }, + { + "epoch": 0.24415448563534387, + "grad_norm": 0.5151793956756592, + "learning_rate": 0.00015120030153758173, + "loss": 1.445, + "step": 18789 + }, + { + "epoch": 0.24416748017925974, + "grad_norm": 0.4093502461910248, + "learning_rate": 0.00015119770207567036, + "loss": 1.4257, + "step": 18790 + }, + { + "epoch": 0.2441804747231756, + "grad_norm": 0.30513325333595276, + "learning_rate": 0.00015119510261375895, + "loss": 1.5628, + "step": 18791 + }, + { + "epoch": 0.24419346926709148, + "grad_norm": 0.3895033895969391, + "learning_rate": 0.00015119250315184758, + "loss": 1.2502, + "step": 18792 + }, + { + "epoch": 0.24420646381100736, + "grad_norm": 0.29489392042160034, + "learning_rate": 0.0001511899036899362, + "loss": 1.2116, + "step": 18793 + }, + { + "epoch": 0.24421945835492323, + "grad_norm": 0.4238291382789612, + "learning_rate": 0.0001511873042280248, + "loss": 1.5503, + "step": 18794 + }, + { + "epoch": 0.2442324528988391, + "grad_norm": 0.45867016911506653, + "learning_rate": 0.00015118470476611342, + "loss": 1.4501, + "step": 18795 + }, + { + "epoch": 0.24424544744275498, + "grad_norm": 0.33472180366516113, + "learning_rate": 0.00015118210530420202, + "loss": 1.4123, + "step": 18796 + }, + { + "epoch": 0.24425844198667085, + "grad_norm": 0.49916476011276245, + "learning_rate": 0.00015117950584229065, + "loss": 1.4447, + "step": 18797 + }, + { + "epoch": 0.24427143653058672, + "grad_norm": 0.4661047160625458, + "learning_rate": 0.00015117690638037927, + "loss": 1.5426, + "step": 18798 + }, + { + "epoch": 0.2442844310745026, + "grad_norm": 0.30168893933296204, + "learning_rate": 0.00015117430691846787, + "loss": 1.518, + "step": 18799 + }, + { + "epoch": 0.24429742561841847, + "grad_norm": 0.4043814241886139, + "learning_rate": 0.00015117170745655652, + "loss": 1.2824, + "step": 18800 + }, + { + "epoch": 0.24431042016233434, + "grad_norm": 0.40717563033103943, + "learning_rate": 0.00015116910799464512, + "loss": 1.55, + "step": 18801 + }, + { + "epoch": 0.2443234147062502, + "grad_norm": 0.3813866376876831, + "learning_rate": 0.00015116650853273374, + "loss": 1.3725, + "step": 18802 + }, + { + "epoch": 0.2443364092501661, + "grad_norm": 0.37645336985588074, + "learning_rate": 0.00015116390907082234, + "loss": 1.3089, + "step": 18803 + }, + { + "epoch": 0.24434940379408196, + "grad_norm": 0.32562968134880066, + "learning_rate": 0.00015116130960891096, + "loss": 1.1854, + "step": 18804 + }, + { + "epoch": 0.24436239833799783, + "grad_norm": 0.3957166075706482, + "learning_rate": 0.0001511587101469996, + "loss": 1.2336, + "step": 18805 + }, + { + "epoch": 0.2443753928819137, + "grad_norm": 0.5393717885017395, + "learning_rate": 0.00015115611068508818, + "loss": 1.3798, + "step": 18806 + }, + { + "epoch": 0.24438838742582958, + "grad_norm": 0.40587881207466125, + "learning_rate": 0.0001511535112231768, + "loss": 1.398, + "step": 18807 + }, + { + "epoch": 0.24440138196974545, + "grad_norm": 0.2878035008907318, + "learning_rate": 0.00015115091176126543, + "loss": 1.2865, + "step": 18808 + }, + { + "epoch": 0.24441437651366132, + "grad_norm": 0.40324002504348755, + "learning_rate": 0.00015114831229935406, + "loss": 1.5572, + "step": 18809 + }, + { + "epoch": 0.2444273710575772, + "grad_norm": 0.3589307963848114, + "learning_rate": 0.00015114571283744266, + "loss": 1.3056, + "step": 18810 + }, + { + "epoch": 0.24444036560149307, + "grad_norm": 0.49471569061279297, + "learning_rate": 0.00015114311337553125, + "loss": 1.541, + "step": 18811 + }, + { + "epoch": 0.24445336014540894, + "grad_norm": 0.3284972906112671, + "learning_rate": 0.0001511405139136199, + "loss": 1.3574, + "step": 18812 + }, + { + "epoch": 0.24446635468932482, + "grad_norm": 0.37585967779159546, + "learning_rate": 0.0001511379144517085, + "loss": 1.4213, + "step": 18813 + }, + { + "epoch": 0.2444793492332407, + "grad_norm": 0.3559940457344055, + "learning_rate": 0.00015113531498979713, + "loss": 1.3872, + "step": 18814 + }, + { + "epoch": 0.24449234377715656, + "grad_norm": 0.42702940106391907, + "learning_rate": 0.00015113271552788572, + "loss": 1.5979, + "step": 18815 + }, + { + "epoch": 0.24450533832107244, + "grad_norm": 0.4022574722766876, + "learning_rate": 0.00015113011606597435, + "loss": 1.3873, + "step": 18816 + }, + { + "epoch": 0.2445183328649883, + "grad_norm": 0.3252454996109009, + "learning_rate": 0.00015112751660406297, + "loss": 1.2717, + "step": 18817 + }, + { + "epoch": 0.24453132740890418, + "grad_norm": 0.3761330544948578, + "learning_rate": 0.00015112491714215157, + "loss": 1.2447, + "step": 18818 + }, + { + "epoch": 0.24454432195282005, + "grad_norm": 0.3562001585960388, + "learning_rate": 0.0001511223176802402, + "loss": 1.3029, + "step": 18819 + }, + { + "epoch": 0.24455731649673593, + "grad_norm": 0.3484468162059784, + "learning_rate": 0.00015111971821832882, + "loss": 1.3335, + "step": 18820 + }, + { + "epoch": 0.2445703110406518, + "grad_norm": 0.4047453701496124, + "learning_rate": 0.00015111711875641744, + "loss": 1.3216, + "step": 18821 + }, + { + "epoch": 0.24458330558456767, + "grad_norm": 0.44757184386253357, + "learning_rate": 0.00015111451929450604, + "loss": 1.4453, + "step": 18822 + }, + { + "epoch": 0.24459630012848355, + "grad_norm": 0.3730284571647644, + "learning_rate": 0.00015111191983259464, + "loss": 1.2855, + "step": 18823 + }, + { + "epoch": 0.24460929467239942, + "grad_norm": 0.4215877056121826, + "learning_rate": 0.0001511093203706833, + "loss": 1.6168, + "step": 18824 + }, + { + "epoch": 0.2446222892163153, + "grad_norm": 0.4406856596469879, + "learning_rate": 0.0001511067209087719, + "loss": 1.5112, + "step": 18825 + }, + { + "epoch": 0.24463528376023116, + "grad_norm": 0.39661383628845215, + "learning_rate": 0.0001511041214468605, + "loss": 1.6071, + "step": 18826 + }, + { + "epoch": 0.24464827830414704, + "grad_norm": 0.4332396388053894, + "learning_rate": 0.0001511015219849491, + "loss": 1.4226, + "step": 18827 + }, + { + "epoch": 0.2446612728480629, + "grad_norm": 0.42816656827926636, + "learning_rate": 0.00015109892252303773, + "loss": 1.2745, + "step": 18828 + }, + { + "epoch": 0.24467426739197878, + "grad_norm": 0.35928821563720703, + "learning_rate": 0.00015109632306112636, + "loss": 1.3723, + "step": 18829 + }, + { + "epoch": 0.24468726193589466, + "grad_norm": 0.3581092357635498, + "learning_rate": 0.00015109372359921496, + "loss": 1.3478, + "step": 18830 + }, + { + "epoch": 0.24470025647981053, + "grad_norm": 0.6039936542510986, + "learning_rate": 0.00015109112413730358, + "loss": 1.4855, + "step": 18831 + }, + { + "epoch": 0.2447132510237264, + "grad_norm": 0.41694405674934387, + "learning_rate": 0.0001510885246753922, + "loss": 1.2934, + "step": 18832 + }, + { + "epoch": 0.24472624556764228, + "grad_norm": 0.38265755772590637, + "learning_rate": 0.00015108592521348083, + "loss": 1.4342, + "step": 18833 + }, + { + "epoch": 0.24473924011155815, + "grad_norm": 0.3158224821090698, + "learning_rate": 0.00015108332575156943, + "loss": 1.2717, + "step": 18834 + }, + { + "epoch": 0.24475223465547402, + "grad_norm": 0.34161150455474854, + "learning_rate": 0.00015108072628965805, + "loss": 1.5496, + "step": 18835 + }, + { + "epoch": 0.2447652291993899, + "grad_norm": 0.3635046184062958, + "learning_rate": 0.00015107812682774668, + "loss": 1.1539, + "step": 18836 + }, + { + "epoch": 0.24477822374330577, + "grad_norm": 0.3595418632030487, + "learning_rate": 0.00015107552736583527, + "loss": 1.5846, + "step": 18837 + }, + { + "epoch": 0.24479121828722164, + "grad_norm": 0.36239418387413025, + "learning_rate": 0.0001510729279039239, + "loss": 1.504, + "step": 18838 + }, + { + "epoch": 0.2448042128311375, + "grad_norm": 0.35877153277397156, + "learning_rate": 0.00015107032844201252, + "loss": 1.2728, + "step": 18839 + }, + { + "epoch": 0.2448172073750534, + "grad_norm": 0.4540707767009735, + "learning_rate": 0.00015106772898010112, + "loss": 1.4395, + "step": 18840 + }, + { + "epoch": 0.24483020191896926, + "grad_norm": 0.4120355248451233, + "learning_rate": 0.00015106512951818974, + "loss": 1.3899, + "step": 18841 + }, + { + "epoch": 0.24484319646288516, + "grad_norm": 0.37712806463241577, + "learning_rate": 0.00015106253005627834, + "loss": 1.5772, + "step": 18842 + }, + { + "epoch": 0.24485619100680103, + "grad_norm": 0.3947502374649048, + "learning_rate": 0.000151059930594367, + "loss": 1.2986, + "step": 18843 + }, + { + "epoch": 0.2448691855507169, + "grad_norm": 0.36569494009017944, + "learning_rate": 0.0001510573311324556, + "loss": 1.4456, + "step": 18844 + }, + { + "epoch": 0.24488218009463278, + "grad_norm": 0.3824523985385895, + "learning_rate": 0.00015105473167054421, + "loss": 1.5606, + "step": 18845 + }, + { + "epoch": 0.24489517463854865, + "grad_norm": 0.4289792478084564, + "learning_rate": 0.0001510521322086328, + "loss": 1.3368, + "step": 18846 + }, + { + "epoch": 0.24490816918246452, + "grad_norm": 0.44024258852005005, + "learning_rate": 0.00015104953274672144, + "loss": 1.3706, + "step": 18847 + }, + { + "epoch": 0.2449211637263804, + "grad_norm": 0.352137953042984, + "learning_rate": 0.00015104693328481006, + "loss": 1.4208, + "step": 18848 + }, + { + "epoch": 0.24493415827029627, + "grad_norm": 0.44328558444976807, + "learning_rate": 0.00015104433382289866, + "loss": 1.4415, + "step": 18849 + }, + { + "epoch": 0.24494715281421214, + "grad_norm": 0.43280497193336487, + "learning_rate": 0.00015104173436098728, + "loss": 1.3949, + "step": 18850 + }, + { + "epoch": 0.24496014735812802, + "grad_norm": 0.4635848104953766, + "learning_rate": 0.0001510391348990759, + "loss": 1.4373, + "step": 18851 + }, + { + "epoch": 0.2449731419020439, + "grad_norm": 0.37055668234825134, + "learning_rate": 0.0001510365354371645, + "loss": 1.4796, + "step": 18852 + }, + { + "epoch": 0.24498613644595976, + "grad_norm": 0.5201985239982605, + "learning_rate": 0.00015103393597525313, + "loss": 1.404, + "step": 18853 + }, + { + "epoch": 0.24499913098987564, + "grad_norm": 0.3533934950828552, + "learning_rate": 0.00015103133651334173, + "loss": 1.4053, + "step": 18854 + }, + { + "epoch": 0.2450121255337915, + "grad_norm": 0.39543235301971436, + "learning_rate": 0.00015102873705143038, + "loss": 1.4608, + "step": 18855 + }, + { + "epoch": 0.24502512007770738, + "grad_norm": 0.39458879828453064, + "learning_rate": 0.00015102613758951898, + "loss": 1.4994, + "step": 18856 + }, + { + "epoch": 0.24503811462162325, + "grad_norm": 0.4454036056995392, + "learning_rate": 0.0001510235381276076, + "loss": 1.4487, + "step": 18857 + }, + { + "epoch": 0.24505110916553913, + "grad_norm": 0.3871466815471649, + "learning_rate": 0.0001510209386656962, + "loss": 1.4809, + "step": 18858 + }, + { + "epoch": 0.245064103709455, + "grad_norm": 0.3446694314479828, + "learning_rate": 0.00015101833920378482, + "loss": 1.5561, + "step": 18859 + }, + { + "epoch": 0.24507709825337087, + "grad_norm": 0.41022297739982605, + "learning_rate": 0.00015101573974187345, + "loss": 1.3301, + "step": 18860 + }, + { + "epoch": 0.24509009279728675, + "grad_norm": 0.4372495412826538, + "learning_rate": 0.00015101314027996204, + "loss": 1.3225, + "step": 18861 + }, + { + "epoch": 0.24510308734120262, + "grad_norm": 0.3996409773826599, + "learning_rate": 0.00015101054081805067, + "loss": 1.5621, + "step": 18862 + }, + { + "epoch": 0.2451160818851185, + "grad_norm": 0.3698645532131195, + "learning_rate": 0.0001510079413561393, + "loss": 1.396, + "step": 18863 + }, + { + "epoch": 0.24512907642903436, + "grad_norm": 0.3147052526473999, + "learning_rate": 0.0001510053418942279, + "loss": 1.3551, + "step": 18864 + }, + { + "epoch": 0.24514207097295024, + "grad_norm": 0.4044569134712219, + "learning_rate": 0.00015100274243231651, + "loss": 1.1914, + "step": 18865 + }, + { + "epoch": 0.2451550655168661, + "grad_norm": 0.42564070224761963, + "learning_rate": 0.0001510001429704051, + "loss": 1.6862, + "step": 18866 + }, + { + "epoch": 0.24516806006078198, + "grad_norm": 0.5052086710929871, + "learning_rate": 0.00015099754350849376, + "loss": 1.5415, + "step": 18867 + }, + { + "epoch": 0.24518105460469786, + "grad_norm": 0.42564037442207336, + "learning_rate": 0.00015099494404658236, + "loss": 1.4262, + "step": 18868 + }, + { + "epoch": 0.24519404914861373, + "grad_norm": 0.4483758807182312, + "learning_rate": 0.00015099234458467098, + "loss": 1.5262, + "step": 18869 + }, + { + "epoch": 0.2452070436925296, + "grad_norm": 0.3586370646953583, + "learning_rate": 0.00015098974512275958, + "loss": 1.3809, + "step": 18870 + }, + { + "epoch": 0.24522003823644548, + "grad_norm": 0.3960714340209961, + "learning_rate": 0.0001509871456608482, + "loss": 1.4154, + "step": 18871 + }, + { + "epoch": 0.24523303278036135, + "grad_norm": 0.4796430766582489, + "learning_rate": 0.00015098454619893683, + "loss": 1.6041, + "step": 18872 + }, + { + "epoch": 0.24524602732427722, + "grad_norm": 0.29470136761665344, + "learning_rate": 0.00015098194673702543, + "loss": 1.1743, + "step": 18873 + }, + { + "epoch": 0.2452590218681931, + "grad_norm": 0.44004929065704346, + "learning_rate": 0.00015097934727511408, + "loss": 1.4328, + "step": 18874 + }, + { + "epoch": 0.24527201641210897, + "grad_norm": 0.3609558641910553, + "learning_rate": 0.00015097674781320268, + "loss": 1.3158, + "step": 18875 + }, + { + "epoch": 0.24528501095602484, + "grad_norm": 0.4033472537994385, + "learning_rate": 0.0001509741483512913, + "loss": 1.4027, + "step": 18876 + }, + { + "epoch": 0.2452980054999407, + "grad_norm": 0.400058776140213, + "learning_rate": 0.0001509715488893799, + "loss": 1.5013, + "step": 18877 + }, + { + "epoch": 0.2453110000438566, + "grad_norm": 0.5335497260093689, + "learning_rate": 0.00015096894942746852, + "loss": 1.3279, + "step": 18878 + }, + { + "epoch": 0.24532399458777246, + "grad_norm": 0.326107382774353, + "learning_rate": 0.00015096634996555715, + "loss": 1.5635, + "step": 18879 + }, + { + "epoch": 0.24533698913168833, + "grad_norm": 0.34297630190849304, + "learning_rate": 0.00015096375050364575, + "loss": 1.5086, + "step": 18880 + }, + { + "epoch": 0.2453499836756042, + "grad_norm": 0.3952433168888092, + "learning_rate": 0.00015096115104173437, + "loss": 1.5876, + "step": 18881 + }, + { + "epoch": 0.24536297821952008, + "grad_norm": 0.4266887903213501, + "learning_rate": 0.000150958551579823, + "loss": 1.4145, + "step": 18882 + }, + { + "epoch": 0.24537597276343595, + "grad_norm": 0.31486156582832336, + "learning_rate": 0.0001509559521179116, + "loss": 1.3145, + "step": 18883 + }, + { + "epoch": 0.24538896730735182, + "grad_norm": 0.2696535587310791, + "learning_rate": 0.00015095335265600022, + "loss": 1.4498, + "step": 18884 + }, + { + "epoch": 0.2454019618512677, + "grad_norm": 0.4172575771808624, + "learning_rate": 0.00015095075319408881, + "loss": 1.5064, + "step": 18885 + }, + { + "epoch": 0.24541495639518357, + "grad_norm": 0.34061190485954285, + "learning_rate": 0.00015094815373217747, + "loss": 1.6777, + "step": 18886 + }, + { + "epoch": 0.24542795093909944, + "grad_norm": 0.43162450194358826, + "learning_rate": 0.00015094555427026606, + "loss": 1.3103, + "step": 18887 + }, + { + "epoch": 0.24544094548301532, + "grad_norm": 0.7491441965103149, + "learning_rate": 0.0001509429548083547, + "loss": 1.3838, + "step": 18888 + }, + { + "epoch": 0.2454539400269312, + "grad_norm": 0.4656504690647125, + "learning_rate": 0.00015094035534644328, + "loss": 1.3809, + "step": 18889 + }, + { + "epoch": 0.24546693457084706, + "grad_norm": 0.29248175024986267, + "learning_rate": 0.0001509377558845319, + "loss": 1.5842, + "step": 18890 + }, + { + "epoch": 0.24547992911476293, + "grad_norm": 0.48640045523643494, + "learning_rate": 0.00015093515642262053, + "loss": 1.5551, + "step": 18891 + }, + { + "epoch": 0.2454929236586788, + "grad_norm": 0.3092939555644989, + "learning_rate": 0.00015093255696070913, + "loss": 1.3025, + "step": 18892 + }, + { + "epoch": 0.24550591820259468, + "grad_norm": 0.41665032505989075, + "learning_rate": 0.00015092995749879776, + "loss": 1.3641, + "step": 18893 + }, + { + "epoch": 0.24551891274651055, + "grad_norm": 0.3051072657108307, + "learning_rate": 0.00015092735803688638, + "loss": 1.3634, + "step": 18894 + }, + { + "epoch": 0.24553190729042643, + "grad_norm": 0.4392319917678833, + "learning_rate": 0.00015092475857497498, + "loss": 1.4081, + "step": 18895 + }, + { + "epoch": 0.2455449018343423, + "grad_norm": 0.3552829921245575, + "learning_rate": 0.0001509221591130636, + "loss": 1.3307, + "step": 18896 + }, + { + "epoch": 0.24555789637825817, + "grad_norm": 0.4525161385536194, + "learning_rate": 0.0001509195596511522, + "loss": 1.5111, + "step": 18897 + }, + { + "epoch": 0.24557089092217405, + "grad_norm": 0.30844396352767944, + "learning_rate": 0.00015091696018924085, + "loss": 1.2365, + "step": 18898 + }, + { + "epoch": 0.24558388546608992, + "grad_norm": 0.41089165210723877, + "learning_rate": 0.00015091436072732945, + "loss": 1.4627, + "step": 18899 + }, + { + "epoch": 0.2455968800100058, + "grad_norm": 0.2998374104499817, + "learning_rate": 0.00015091176126541807, + "loss": 1.4468, + "step": 18900 + }, + { + "epoch": 0.24560987455392166, + "grad_norm": 0.3028707504272461, + "learning_rate": 0.00015090916180350667, + "loss": 1.3342, + "step": 18901 + }, + { + "epoch": 0.24562286909783754, + "grad_norm": 0.4073849618434906, + "learning_rate": 0.0001509065623415953, + "loss": 1.4842, + "step": 18902 + }, + { + "epoch": 0.2456358636417534, + "grad_norm": 0.43446093797683716, + "learning_rate": 0.00015090396287968392, + "loss": 1.445, + "step": 18903 + }, + { + "epoch": 0.24564885818566928, + "grad_norm": 0.44141966104507446, + "learning_rate": 0.00015090136341777252, + "loss": 1.5915, + "step": 18904 + }, + { + "epoch": 0.24566185272958516, + "grad_norm": 0.3364699184894562, + "learning_rate": 0.00015089876395586114, + "loss": 1.4876, + "step": 18905 + }, + { + "epoch": 0.24567484727350103, + "grad_norm": 0.2789861559867859, + "learning_rate": 0.00015089616449394977, + "loss": 1.3709, + "step": 18906 + }, + { + "epoch": 0.2456878418174169, + "grad_norm": 0.40969333052635193, + "learning_rate": 0.00015089356503203836, + "loss": 1.3745, + "step": 18907 + }, + { + "epoch": 0.24570083636133278, + "grad_norm": 0.2771616280078888, + "learning_rate": 0.000150890965570127, + "loss": 1.3688, + "step": 18908 + }, + { + "epoch": 0.24571383090524865, + "grad_norm": 0.3067249357700348, + "learning_rate": 0.0001508883661082156, + "loss": 1.3455, + "step": 18909 + }, + { + "epoch": 0.24572682544916452, + "grad_norm": 0.46570900082588196, + "learning_rate": 0.00015088576664630424, + "loss": 1.4555, + "step": 18910 + }, + { + "epoch": 0.2457398199930804, + "grad_norm": 0.4184342324733734, + "learning_rate": 0.00015088316718439283, + "loss": 1.3756, + "step": 18911 + }, + { + "epoch": 0.24575281453699627, + "grad_norm": 0.378604531288147, + "learning_rate": 0.00015088056772248146, + "loss": 1.3979, + "step": 18912 + }, + { + "epoch": 0.24576580908091214, + "grad_norm": 0.4666525721549988, + "learning_rate": 0.00015087796826057008, + "loss": 1.3965, + "step": 18913 + }, + { + "epoch": 0.245778803624828, + "grad_norm": 0.39189279079437256, + "learning_rate": 0.00015087536879865868, + "loss": 1.4006, + "step": 18914 + }, + { + "epoch": 0.24579179816874389, + "grad_norm": 0.514519989490509, + "learning_rate": 0.0001508727693367473, + "loss": 1.5277, + "step": 18915 + }, + { + "epoch": 0.24580479271265976, + "grad_norm": 0.3929111063480377, + "learning_rate": 0.0001508701698748359, + "loss": 1.3883, + "step": 18916 + }, + { + "epoch": 0.24581778725657563, + "grad_norm": 0.42645853757858276, + "learning_rate": 0.00015086757041292455, + "loss": 1.4955, + "step": 18917 + }, + { + "epoch": 0.24583078180049153, + "grad_norm": 0.38905468583106995, + "learning_rate": 0.00015086497095101315, + "loss": 1.2094, + "step": 18918 + }, + { + "epoch": 0.2458437763444074, + "grad_norm": 0.3995038866996765, + "learning_rate": 0.00015086237148910175, + "loss": 1.5347, + "step": 18919 + }, + { + "epoch": 0.24585677088832328, + "grad_norm": 0.4171901047229767, + "learning_rate": 0.00015085977202719037, + "loss": 1.4877, + "step": 18920 + }, + { + "epoch": 0.24586976543223915, + "grad_norm": 0.38386470079421997, + "learning_rate": 0.000150857172565279, + "loss": 1.2076, + "step": 18921 + }, + { + "epoch": 0.24588275997615502, + "grad_norm": 0.30258697271347046, + "learning_rate": 0.00015085457310336762, + "loss": 1.3197, + "step": 18922 + }, + { + "epoch": 0.2458957545200709, + "grad_norm": 0.40699124336242676, + "learning_rate": 0.00015085197364145622, + "loss": 1.5121, + "step": 18923 + }, + { + "epoch": 0.24590874906398677, + "grad_norm": 0.3099406659603119, + "learning_rate": 0.00015084937417954484, + "loss": 1.2779, + "step": 18924 + }, + { + "epoch": 0.24592174360790264, + "grad_norm": 0.34832271933555603, + "learning_rate": 0.00015084677471763347, + "loss": 1.1913, + "step": 18925 + }, + { + "epoch": 0.24593473815181852, + "grad_norm": 0.45675963163375854, + "learning_rate": 0.00015084417525572207, + "loss": 1.3428, + "step": 18926 + }, + { + "epoch": 0.2459477326957344, + "grad_norm": 0.3640705645084381, + "learning_rate": 0.0001508415757938107, + "loss": 1.4172, + "step": 18927 + }, + { + "epoch": 0.24596072723965026, + "grad_norm": 0.49615612626075745, + "learning_rate": 0.0001508389763318993, + "loss": 1.4138, + "step": 18928 + }, + { + "epoch": 0.24597372178356613, + "grad_norm": 0.4574418067932129, + "learning_rate": 0.00015083637686998794, + "loss": 1.4386, + "step": 18929 + }, + { + "epoch": 0.245986716327482, + "grad_norm": 0.4878838360309601, + "learning_rate": 0.00015083377740807654, + "loss": 1.4701, + "step": 18930 + }, + { + "epoch": 0.24599971087139788, + "grad_norm": 0.4412200450897217, + "learning_rate": 0.00015083117794616516, + "loss": 1.5322, + "step": 18931 + }, + { + "epoch": 0.24601270541531375, + "grad_norm": 0.3475750684738159, + "learning_rate": 0.00015082857848425376, + "loss": 1.3837, + "step": 18932 + }, + { + "epoch": 0.24602569995922963, + "grad_norm": 0.3626914322376251, + "learning_rate": 0.00015082597902234238, + "loss": 1.2061, + "step": 18933 + }, + { + "epoch": 0.2460386945031455, + "grad_norm": 0.40176922082901, + "learning_rate": 0.000150823379560431, + "loss": 1.3461, + "step": 18934 + }, + { + "epoch": 0.24605168904706137, + "grad_norm": 0.4041104316711426, + "learning_rate": 0.0001508207800985196, + "loss": 1.3661, + "step": 18935 + }, + { + "epoch": 0.24606468359097725, + "grad_norm": 0.3509327471256256, + "learning_rate": 0.00015081818063660823, + "loss": 1.272, + "step": 18936 + }, + { + "epoch": 0.24607767813489312, + "grad_norm": 0.3932877480983734, + "learning_rate": 0.00015081558117469685, + "loss": 1.5167, + "step": 18937 + }, + { + "epoch": 0.246090672678809, + "grad_norm": 0.40667298436164856, + "learning_rate": 0.00015081298171278545, + "loss": 1.6011, + "step": 18938 + }, + { + "epoch": 0.24610366722272486, + "grad_norm": 0.49350541830062866, + "learning_rate": 0.00015081038225087408, + "loss": 1.4247, + "step": 18939 + }, + { + "epoch": 0.24611666176664074, + "grad_norm": 0.36479461193084717, + "learning_rate": 0.00015080778278896267, + "loss": 1.3265, + "step": 18940 + }, + { + "epoch": 0.2461296563105566, + "grad_norm": 0.45613476634025574, + "learning_rate": 0.00015080518332705132, + "loss": 1.5385, + "step": 18941 + }, + { + "epoch": 0.24614265085447248, + "grad_norm": 0.4249322712421417, + "learning_rate": 0.00015080258386513992, + "loss": 1.3963, + "step": 18942 + }, + { + "epoch": 0.24615564539838836, + "grad_norm": 0.3252331018447876, + "learning_rate": 0.00015079998440322855, + "loss": 1.2948, + "step": 18943 + }, + { + "epoch": 0.24616863994230423, + "grad_norm": 0.3997461497783661, + "learning_rate": 0.00015079738494131714, + "loss": 1.4025, + "step": 18944 + }, + { + "epoch": 0.2461816344862201, + "grad_norm": 0.5006688237190247, + "learning_rate": 0.00015079478547940577, + "loss": 1.3893, + "step": 18945 + }, + { + "epoch": 0.24619462903013598, + "grad_norm": 0.46695682406425476, + "learning_rate": 0.0001507921860174944, + "loss": 1.4693, + "step": 18946 + }, + { + "epoch": 0.24620762357405185, + "grad_norm": 0.33112865686416626, + "learning_rate": 0.000150789586555583, + "loss": 1.3808, + "step": 18947 + }, + { + "epoch": 0.24622061811796772, + "grad_norm": 0.5466291308403015, + "learning_rate": 0.00015078698709367161, + "loss": 1.4927, + "step": 18948 + }, + { + "epoch": 0.2462336126618836, + "grad_norm": 0.38930803537368774, + "learning_rate": 0.00015078438763176024, + "loss": 1.4061, + "step": 18949 + }, + { + "epoch": 0.24624660720579947, + "grad_norm": 0.4526674449443817, + "learning_rate": 0.00015078178816984884, + "loss": 1.396, + "step": 18950 + }, + { + "epoch": 0.24625960174971534, + "grad_norm": 0.31736335158348083, + "learning_rate": 0.00015077918870793746, + "loss": 1.284, + "step": 18951 + }, + { + "epoch": 0.2462725962936312, + "grad_norm": 0.46952641010284424, + "learning_rate": 0.00015077658924602609, + "loss": 1.4592, + "step": 18952 + }, + { + "epoch": 0.24628559083754709, + "grad_norm": 0.4860699772834778, + "learning_rate": 0.0001507739897841147, + "loss": 1.4297, + "step": 18953 + }, + { + "epoch": 0.24629858538146296, + "grad_norm": 0.43200260400772095, + "learning_rate": 0.0001507713903222033, + "loss": 1.4825, + "step": 18954 + }, + { + "epoch": 0.24631157992537883, + "grad_norm": 0.4215549826622009, + "learning_rate": 0.00015076879086029193, + "loss": 1.342, + "step": 18955 + }, + { + "epoch": 0.2463245744692947, + "grad_norm": 0.36756816506385803, + "learning_rate": 0.00015076619139838056, + "loss": 1.2427, + "step": 18956 + }, + { + "epoch": 0.24633756901321058, + "grad_norm": 0.4120791256427765, + "learning_rate": 0.00015076359193646915, + "loss": 1.3275, + "step": 18957 + }, + { + "epoch": 0.24635056355712645, + "grad_norm": 0.3671320676803589, + "learning_rate": 0.00015076099247455778, + "loss": 1.4187, + "step": 18958 + }, + { + "epoch": 0.24636355810104232, + "grad_norm": 0.4896242320537567, + "learning_rate": 0.00015075839301264638, + "loss": 1.4208, + "step": 18959 + }, + { + "epoch": 0.2463765526449582, + "grad_norm": 0.3865586519241333, + "learning_rate": 0.00015075579355073503, + "loss": 1.4248, + "step": 18960 + }, + { + "epoch": 0.24638954718887407, + "grad_norm": 0.3354617953300476, + "learning_rate": 0.00015075319408882362, + "loss": 1.32, + "step": 18961 + }, + { + "epoch": 0.24640254173278994, + "grad_norm": 0.419919490814209, + "learning_rate": 0.00015075059462691222, + "loss": 1.4812, + "step": 18962 + }, + { + "epoch": 0.24641553627670582, + "grad_norm": 0.4438920021057129, + "learning_rate": 0.00015074799516500085, + "loss": 1.4412, + "step": 18963 + }, + { + "epoch": 0.2464285308206217, + "grad_norm": 0.39383605122566223, + "learning_rate": 0.00015074539570308947, + "loss": 1.4579, + "step": 18964 + }, + { + "epoch": 0.24644152536453756, + "grad_norm": 0.44826945662498474, + "learning_rate": 0.0001507427962411781, + "loss": 1.3422, + "step": 18965 + }, + { + "epoch": 0.24645451990845343, + "grad_norm": 0.3373607397079468, + "learning_rate": 0.0001507401967792667, + "loss": 1.2381, + "step": 18966 + }, + { + "epoch": 0.2464675144523693, + "grad_norm": 0.4834904968738556, + "learning_rate": 0.00015073759731735532, + "loss": 1.5965, + "step": 18967 + }, + { + "epoch": 0.24648050899628518, + "grad_norm": 0.4047437310218811, + "learning_rate": 0.00015073499785544394, + "loss": 1.4364, + "step": 18968 + }, + { + "epoch": 0.24649350354020105, + "grad_norm": 0.3365999162197113, + "learning_rate": 0.00015073239839353254, + "loss": 1.3354, + "step": 18969 + }, + { + "epoch": 0.24650649808411693, + "grad_norm": 0.31903040409088135, + "learning_rate": 0.00015072979893162116, + "loss": 1.0632, + "step": 18970 + }, + { + "epoch": 0.2465194926280328, + "grad_norm": 0.43094542622566223, + "learning_rate": 0.00015072719946970976, + "loss": 1.4729, + "step": 18971 + }, + { + "epoch": 0.24653248717194867, + "grad_norm": 0.3633486032485962, + "learning_rate": 0.0001507246000077984, + "loss": 1.4649, + "step": 18972 + }, + { + "epoch": 0.24654548171586455, + "grad_norm": 0.5617942810058594, + "learning_rate": 0.000150722000545887, + "loss": 1.495, + "step": 18973 + }, + { + "epoch": 0.24655847625978042, + "grad_norm": 0.37191879749298096, + "learning_rate": 0.0001507194010839756, + "loss": 1.4604, + "step": 18974 + }, + { + "epoch": 0.2465714708036963, + "grad_norm": 0.4197375774383545, + "learning_rate": 0.00015071680162206423, + "loss": 1.253, + "step": 18975 + }, + { + "epoch": 0.24658446534761216, + "grad_norm": 0.395475834608078, + "learning_rate": 0.00015071420216015286, + "loss": 1.5585, + "step": 18976 + }, + { + "epoch": 0.24659745989152804, + "grad_norm": 0.41954919695854187, + "learning_rate": 0.00015071160269824148, + "loss": 1.4427, + "step": 18977 + }, + { + "epoch": 0.2466104544354439, + "grad_norm": 0.42125093936920166, + "learning_rate": 0.00015070900323633008, + "loss": 1.6147, + "step": 18978 + }, + { + "epoch": 0.24662344897935978, + "grad_norm": 0.5504149198532104, + "learning_rate": 0.0001507064037744187, + "loss": 1.4808, + "step": 18979 + }, + { + "epoch": 0.24663644352327566, + "grad_norm": 0.42679038643836975, + "learning_rate": 0.00015070380431250733, + "loss": 1.4419, + "step": 18980 + }, + { + "epoch": 0.24664943806719153, + "grad_norm": 0.47874870896339417, + "learning_rate": 0.00015070120485059592, + "loss": 1.4607, + "step": 18981 + }, + { + "epoch": 0.2466624326111074, + "grad_norm": 0.42051711678504944, + "learning_rate": 0.00015069860538868455, + "loss": 1.4576, + "step": 18982 + }, + { + "epoch": 0.24667542715502327, + "grad_norm": 0.42147690057754517, + "learning_rate": 0.00015069600592677317, + "loss": 1.3982, + "step": 18983 + }, + { + "epoch": 0.24668842169893915, + "grad_norm": 0.36267000436782837, + "learning_rate": 0.0001506934064648618, + "loss": 1.2403, + "step": 18984 + }, + { + "epoch": 0.24670141624285502, + "grad_norm": 0.38945502042770386, + "learning_rate": 0.0001506908070029504, + "loss": 1.3817, + "step": 18985 + }, + { + "epoch": 0.2467144107867709, + "grad_norm": 0.42240071296691895, + "learning_rate": 0.000150688207541039, + "loss": 1.4019, + "step": 18986 + }, + { + "epoch": 0.24672740533068677, + "grad_norm": 0.3628593981266022, + "learning_rate": 0.00015068560807912764, + "loss": 1.602, + "step": 18987 + }, + { + "epoch": 0.24674039987460264, + "grad_norm": 0.43468666076660156, + "learning_rate": 0.00015068300861721624, + "loss": 1.3571, + "step": 18988 + }, + { + "epoch": 0.2467533944185185, + "grad_norm": 0.42950963973999023, + "learning_rate": 0.00015068040915530487, + "loss": 1.2265, + "step": 18989 + }, + { + "epoch": 0.24676638896243439, + "grad_norm": 0.3333166539669037, + "learning_rate": 0.00015067780969339346, + "loss": 1.4119, + "step": 18990 + }, + { + "epoch": 0.24677938350635026, + "grad_norm": 0.2706016004085541, + "learning_rate": 0.0001506752102314821, + "loss": 1.1275, + "step": 18991 + }, + { + "epoch": 0.24679237805026613, + "grad_norm": 0.3455129563808441, + "learning_rate": 0.0001506726107695707, + "loss": 1.403, + "step": 18992 + }, + { + "epoch": 0.246805372594182, + "grad_norm": 0.48403802514076233, + "learning_rate": 0.0001506700113076593, + "loss": 1.3928, + "step": 18993 + }, + { + "epoch": 0.2468183671380979, + "grad_norm": 0.5159617066383362, + "learning_rate": 0.00015066741184574793, + "loss": 1.4067, + "step": 18994 + }, + { + "epoch": 0.24683136168201378, + "grad_norm": 0.34165164828300476, + "learning_rate": 0.00015066481238383656, + "loss": 1.3847, + "step": 18995 + }, + { + "epoch": 0.24684435622592965, + "grad_norm": 0.44306039810180664, + "learning_rate": 0.00015066221292192518, + "loss": 1.5585, + "step": 18996 + }, + { + "epoch": 0.24685735076984552, + "grad_norm": 0.4142409861087799, + "learning_rate": 0.00015065961346001378, + "loss": 1.4677, + "step": 18997 + }, + { + "epoch": 0.2468703453137614, + "grad_norm": 0.5357551574707031, + "learning_rate": 0.0001506570139981024, + "loss": 1.504, + "step": 18998 + }, + { + "epoch": 0.24688333985767727, + "grad_norm": 0.42358142137527466, + "learning_rate": 0.00015065441453619103, + "loss": 1.5028, + "step": 18999 + }, + { + "epoch": 0.24689633440159314, + "grad_norm": 0.34169360995292664, + "learning_rate": 0.00015065181507427963, + "loss": 1.1883, + "step": 19000 + }, + { + "epoch": 0.24690932894550902, + "grad_norm": 0.38855984807014465, + "learning_rate": 0.00015064921561236825, + "loss": 1.592, + "step": 19001 + }, + { + "epoch": 0.2469223234894249, + "grad_norm": 0.4562557339668274, + "learning_rate": 0.00015064661615045685, + "loss": 1.4843, + "step": 19002 + }, + { + "epoch": 0.24693531803334076, + "grad_norm": 0.4345974028110504, + "learning_rate": 0.00015064401668854547, + "loss": 1.3692, + "step": 19003 + }, + { + "epoch": 0.24694831257725663, + "grad_norm": 0.43085306882858276, + "learning_rate": 0.0001506414172266341, + "loss": 1.4547, + "step": 19004 + }, + { + "epoch": 0.2469613071211725, + "grad_norm": 0.4232499897480011, + "learning_rate": 0.0001506388177647227, + "loss": 1.632, + "step": 19005 + }, + { + "epoch": 0.24697430166508838, + "grad_norm": 0.45389315485954285, + "learning_rate": 0.00015063621830281132, + "loss": 1.5862, + "step": 19006 + }, + { + "epoch": 0.24698729620900425, + "grad_norm": 0.34181898832321167, + "learning_rate": 0.00015063361884089994, + "loss": 1.2575, + "step": 19007 + }, + { + "epoch": 0.24700029075292013, + "grad_norm": 0.3823287785053253, + "learning_rate": 0.00015063101937898857, + "loss": 1.3946, + "step": 19008 + }, + { + "epoch": 0.247013285296836, + "grad_norm": 0.31054508686065674, + "learning_rate": 0.00015062841991707717, + "loss": 1.2438, + "step": 19009 + }, + { + "epoch": 0.24702627984075187, + "grad_norm": 0.43709537386894226, + "learning_rate": 0.0001506258204551658, + "loss": 1.6329, + "step": 19010 + }, + { + "epoch": 0.24703927438466775, + "grad_norm": 0.3412104547023773, + "learning_rate": 0.00015062322099325441, + "loss": 1.2695, + "step": 19011 + }, + { + "epoch": 0.24705226892858362, + "grad_norm": 0.3026045858860016, + "learning_rate": 0.000150620621531343, + "loss": 1.2772, + "step": 19012 + }, + { + "epoch": 0.2470652634724995, + "grad_norm": 0.42059242725372314, + "learning_rate": 0.00015061802206943164, + "loss": 1.4135, + "step": 19013 + }, + { + "epoch": 0.24707825801641536, + "grad_norm": 0.3934084177017212, + "learning_rate": 0.00015061542260752023, + "loss": 1.3873, + "step": 19014 + }, + { + "epoch": 0.24709125256033124, + "grad_norm": 0.3865605592727661, + "learning_rate": 0.00015061282314560889, + "loss": 1.3047, + "step": 19015 + }, + { + "epoch": 0.2471042471042471, + "grad_norm": 0.4346744418144226, + "learning_rate": 0.00015061022368369748, + "loss": 1.5871, + "step": 19016 + }, + { + "epoch": 0.24711724164816298, + "grad_norm": 0.3997902572154999, + "learning_rate": 0.00015060762422178608, + "loss": 1.5877, + "step": 19017 + }, + { + "epoch": 0.24713023619207886, + "grad_norm": 0.3850584924221039, + "learning_rate": 0.0001506050247598747, + "loss": 1.3246, + "step": 19018 + }, + { + "epoch": 0.24714323073599473, + "grad_norm": 0.3420499861240387, + "learning_rate": 0.00015060242529796333, + "loss": 1.3257, + "step": 19019 + }, + { + "epoch": 0.2471562252799106, + "grad_norm": 0.40653449296951294, + "learning_rate": 0.00015059982583605195, + "loss": 1.4343, + "step": 19020 + }, + { + "epoch": 0.24716921982382647, + "grad_norm": 0.36838942766189575, + "learning_rate": 0.00015059722637414055, + "loss": 1.3241, + "step": 19021 + }, + { + "epoch": 0.24718221436774235, + "grad_norm": 0.37009817361831665, + "learning_rate": 0.00015059462691222918, + "loss": 1.3256, + "step": 19022 + }, + { + "epoch": 0.24719520891165822, + "grad_norm": 0.4786401093006134, + "learning_rate": 0.0001505920274503178, + "loss": 1.3751, + "step": 19023 + }, + { + "epoch": 0.2472082034555741, + "grad_norm": 0.48386844992637634, + "learning_rate": 0.0001505894279884064, + "loss": 1.5617, + "step": 19024 + }, + { + "epoch": 0.24722119799948997, + "grad_norm": 0.40708616375923157, + "learning_rate": 0.00015058682852649502, + "loss": 1.3024, + "step": 19025 + }, + { + "epoch": 0.24723419254340584, + "grad_norm": 0.4326227903366089, + "learning_rate": 0.00015058422906458365, + "loss": 1.4966, + "step": 19026 + }, + { + "epoch": 0.2472471870873217, + "grad_norm": 0.4003925919532776, + "learning_rate": 0.00015058162960267227, + "loss": 1.4613, + "step": 19027 + }, + { + "epoch": 0.24726018163123759, + "grad_norm": 0.3473844826221466, + "learning_rate": 0.00015057903014076087, + "loss": 1.1871, + "step": 19028 + }, + { + "epoch": 0.24727317617515346, + "grad_norm": 0.36359885334968567, + "learning_rate": 0.00015057643067884947, + "loss": 1.6373, + "step": 19029 + }, + { + "epoch": 0.24728617071906933, + "grad_norm": 0.41476327180862427, + "learning_rate": 0.00015057383121693812, + "loss": 1.4524, + "step": 19030 + }, + { + "epoch": 0.2472991652629852, + "grad_norm": 0.4065360724925995, + "learning_rate": 0.00015057123175502671, + "loss": 1.4698, + "step": 19031 + }, + { + "epoch": 0.24731215980690108, + "grad_norm": 0.47410744428634644, + "learning_rate": 0.00015056863229311534, + "loss": 1.5398, + "step": 19032 + }, + { + "epoch": 0.24732515435081695, + "grad_norm": 0.4218134880065918, + "learning_rate": 0.00015056603283120394, + "loss": 1.45, + "step": 19033 + }, + { + "epoch": 0.24733814889473282, + "grad_norm": 0.535853385925293, + "learning_rate": 0.00015056343336929256, + "loss": 1.3941, + "step": 19034 + }, + { + "epoch": 0.2473511434386487, + "grad_norm": 0.37583762407302856, + "learning_rate": 0.00015056083390738119, + "loss": 1.5249, + "step": 19035 + }, + { + "epoch": 0.24736413798256457, + "grad_norm": 0.4235329329967499, + "learning_rate": 0.00015055823444546978, + "loss": 1.4013, + "step": 19036 + }, + { + "epoch": 0.24737713252648044, + "grad_norm": 0.37479016184806824, + "learning_rate": 0.0001505556349835584, + "loss": 1.2676, + "step": 19037 + }, + { + "epoch": 0.24739012707039632, + "grad_norm": 0.329023152589798, + "learning_rate": 0.00015055303552164703, + "loss": 1.1844, + "step": 19038 + }, + { + "epoch": 0.2474031216143122, + "grad_norm": 0.2642345428466797, + "learning_rate": 0.00015055043605973566, + "loss": 1.2542, + "step": 19039 + }, + { + "epoch": 0.24741611615822806, + "grad_norm": 0.45293453335762024, + "learning_rate": 0.00015054783659782425, + "loss": 1.4203, + "step": 19040 + }, + { + "epoch": 0.24742911070214393, + "grad_norm": 0.39274486899375916, + "learning_rate": 0.00015054523713591285, + "loss": 1.3899, + "step": 19041 + }, + { + "epoch": 0.2474421052460598, + "grad_norm": 0.4734686315059662, + "learning_rate": 0.0001505426376740015, + "loss": 1.4268, + "step": 19042 + }, + { + "epoch": 0.24745509978997568, + "grad_norm": 0.4422254264354706, + "learning_rate": 0.0001505400382120901, + "loss": 1.4182, + "step": 19043 + }, + { + "epoch": 0.24746809433389155, + "grad_norm": 0.32406046986579895, + "learning_rate": 0.00015053743875017872, + "loss": 1.5838, + "step": 19044 + }, + { + "epoch": 0.24748108887780743, + "grad_norm": 0.5539398789405823, + "learning_rate": 0.00015053483928826732, + "loss": 1.4353, + "step": 19045 + }, + { + "epoch": 0.2474940834217233, + "grad_norm": 0.41394415497779846, + "learning_rate": 0.00015053223982635595, + "loss": 1.552, + "step": 19046 + }, + { + "epoch": 0.24750707796563917, + "grad_norm": 0.40400639176368713, + "learning_rate": 0.00015052964036444457, + "loss": 1.4701, + "step": 19047 + }, + { + "epoch": 0.24752007250955504, + "grad_norm": 0.4305315315723419, + "learning_rate": 0.00015052704090253317, + "loss": 1.5165, + "step": 19048 + }, + { + "epoch": 0.24753306705347092, + "grad_norm": 0.36676716804504395, + "learning_rate": 0.0001505244414406218, + "loss": 1.6704, + "step": 19049 + }, + { + "epoch": 0.2475460615973868, + "grad_norm": 0.6173962950706482, + "learning_rate": 0.00015052184197871042, + "loss": 1.619, + "step": 19050 + }, + { + "epoch": 0.24755905614130266, + "grad_norm": 0.45166561007499695, + "learning_rate": 0.00015051924251679904, + "loss": 1.5228, + "step": 19051 + }, + { + "epoch": 0.24757205068521854, + "grad_norm": 0.41376087069511414, + "learning_rate": 0.00015051664305488764, + "loss": 1.5808, + "step": 19052 + }, + { + "epoch": 0.2475850452291344, + "grad_norm": 0.42920196056365967, + "learning_rate": 0.00015051404359297626, + "loss": 1.6603, + "step": 19053 + }, + { + "epoch": 0.24759803977305028, + "grad_norm": 0.5915201902389526, + "learning_rate": 0.0001505114441310649, + "loss": 1.484, + "step": 19054 + }, + { + "epoch": 0.24761103431696616, + "grad_norm": 0.41033339500427246, + "learning_rate": 0.00015050884466915349, + "loss": 1.354, + "step": 19055 + }, + { + "epoch": 0.24762402886088203, + "grad_norm": 0.35668492317199707, + "learning_rate": 0.0001505062452072421, + "loss": 1.2864, + "step": 19056 + }, + { + "epoch": 0.2476370234047979, + "grad_norm": 0.36982670426368713, + "learning_rate": 0.00015050364574533073, + "loss": 1.4106, + "step": 19057 + }, + { + "epoch": 0.24765001794871377, + "grad_norm": 0.3040981888771057, + "learning_rate": 0.00015050104628341933, + "loss": 1.3076, + "step": 19058 + }, + { + "epoch": 0.24766301249262965, + "grad_norm": 0.3890395164489746, + "learning_rate": 0.00015049844682150796, + "loss": 1.5605, + "step": 19059 + }, + { + "epoch": 0.24767600703654552, + "grad_norm": 0.4530028700828552, + "learning_rate": 0.00015049584735959655, + "loss": 1.3765, + "step": 19060 + }, + { + "epoch": 0.2476890015804614, + "grad_norm": 0.405753493309021, + "learning_rate": 0.0001504932478976852, + "loss": 1.2287, + "step": 19061 + }, + { + "epoch": 0.24770199612437727, + "grad_norm": 0.38713157176971436, + "learning_rate": 0.0001504906484357738, + "loss": 1.2443, + "step": 19062 + }, + { + "epoch": 0.24771499066829314, + "grad_norm": 0.3774830102920532, + "learning_rate": 0.00015048804897386243, + "loss": 1.2687, + "step": 19063 + }, + { + "epoch": 0.247727985212209, + "grad_norm": 0.39333006739616394, + "learning_rate": 0.00015048544951195102, + "loss": 1.4222, + "step": 19064 + }, + { + "epoch": 0.24774097975612488, + "grad_norm": 0.4434764087200165, + "learning_rate": 0.00015048285005003965, + "loss": 1.3346, + "step": 19065 + }, + { + "epoch": 0.24775397430004076, + "grad_norm": 0.3143687844276428, + "learning_rate": 0.00015048025058812827, + "loss": 1.2926, + "step": 19066 + }, + { + "epoch": 0.24776696884395663, + "grad_norm": 0.5469130277633667, + "learning_rate": 0.00015047765112621687, + "loss": 1.4938, + "step": 19067 + }, + { + "epoch": 0.2477799633878725, + "grad_norm": 0.29207664728164673, + "learning_rate": 0.0001504750516643055, + "loss": 1.4543, + "step": 19068 + }, + { + "epoch": 0.24779295793178838, + "grad_norm": 0.45945000648498535, + "learning_rate": 0.00015047245220239412, + "loss": 1.517, + "step": 19069 + }, + { + "epoch": 0.24780595247570428, + "grad_norm": 0.4479677975177765, + "learning_rate": 0.00015046985274048272, + "loss": 1.4524, + "step": 19070 + }, + { + "epoch": 0.24781894701962015, + "grad_norm": 0.4423813819885254, + "learning_rate": 0.00015046725327857134, + "loss": 1.4043, + "step": 19071 + }, + { + "epoch": 0.24783194156353602, + "grad_norm": 0.37931036949157715, + "learning_rate": 0.00015046465381665994, + "loss": 1.2993, + "step": 19072 + }, + { + "epoch": 0.2478449361074519, + "grad_norm": 0.36628457903862, + "learning_rate": 0.0001504620543547486, + "loss": 1.4395, + "step": 19073 + }, + { + "epoch": 0.24785793065136777, + "grad_norm": 0.41657090187072754, + "learning_rate": 0.0001504594548928372, + "loss": 1.3384, + "step": 19074 + }, + { + "epoch": 0.24787092519528364, + "grad_norm": 0.3791036903858185, + "learning_rate": 0.0001504568554309258, + "loss": 1.4581, + "step": 19075 + }, + { + "epoch": 0.24788391973919952, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0001504542559690144, + "loss": 1.4532, + "step": 19076 + }, + { + "epoch": 0.2478969142831154, + "grad_norm": 0.4019109904766083, + "learning_rate": 0.00015045165650710303, + "loss": 1.4926, + "step": 19077 + }, + { + "epoch": 0.24790990882703126, + "grad_norm": 0.4186308979988098, + "learning_rate": 0.00015044905704519166, + "loss": 1.4957, + "step": 19078 + }, + { + "epoch": 0.24792290337094713, + "grad_norm": 0.42851924896240234, + "learning_rate": 0.00015044645758328026, + "loss": 1.4571, + "step": 19079 + }, + { + "epoch": 0.247935897914863, + "grad_norm": 0.4043085277080536, + "learning_rate": 0.00015044385812136888, + "loss": 1.3662, + "step": 19080 + }, + { + "epoch": 0.24794889245877888, + "grad_norm": 0.3865445554256439, + "learning_rate": 0.0001504412586594575, + "loss": 1.3245, + "step": 19081 + }, + { + "epoch": 0.24796188700269475, + "grad_norm": 0.4346553683280945, + "learning_rate": 0.00015043865919754613, + "loss": 1.2947, + "step": 19082 + }, + { + "epoch": 0.24797488154661063, + "grad_norm": 0.4635961353778839, + "learning_rate": 0.00015043605973563473, + "loss": 1.4776, + "step": 19083 + }, + { + "epoch": 0.2479878760905265, + "grad_norm": 0.33484140038490295, + "learning_rate": 0.00015043346027372332, + "loss": 1.1439, + "step": 19084 + }, + { + "epoch": 0.24800087063444237, + "grad_norm": 0.34960395097732544, + "learning_rate": 0.00015043086081181198, + "loss": 1.3343, + "step": 19085 + }, + { + "epoch": 0.24801386517835824, + "grad_norm": 0.3153984844684601, + "learning_rate": 0.00015042826134990057, + "loss": 1.4118, + "step": 19086 + }, + { + "epoch": 0.24802685972227412, + "grad_norm": 0.36336901783943176, + "learning_rate": 0.0001504256618879892, + "loss": 1.5549, + "step": 19087 + }, + { + "epoch": 0.24803985426619, + "grad_norm": 0.3608517348766327, + "learning_rate": 0.0001504230624260778, + "loss": 1.3839, + "step": 19088 + }, + { + "epoch": 0.24805284881010586, + "grad_norm": 0.3816474676132202, + "learning_rate": 0.00015042046296416642, + "loss": 1.182, + "step": 19089 + }, + { + "epoch": 0.24806584335402174, + "grad_norm": 0.5169097185134888, + "learning_rate": 0.00015041786350225504, + "loss": 1.3752, + "step": 19090 + }, + { + "epoch": 0.2480788378979376, + "grad_norm": 0.5496385097503662, + "learning_rate": 0.00015041526404034364, + "loss": 1.4404, + "step": 19091 + }, + { + "epoch": 0.24809183244185348, + "grad_norm": 0.4175279140472412, + "learning_rate": 0.00015041266457843227, + "loss": 1.3126, + "step": 19092 + }, + { + "epoch": 0.24810482698576936, + "grad_norm": 0.4991360604763031, + "learning_rate": 0.0001504100651165209, + "loss": 1.5526, + "step": 19093 + }, + { + "epoch": 0.24811782152968523, + "grad_norm": 0.3966275751590729, + "learning_rate": 0.00015040746565460952, + "loss": 1.3804, + "step": 19094 + }, + { + "epoch": 0.2481308160736011, + "grad_norm": 0.36171290278434753, + "learning_rate": 0.0001504048661926981, + "loss": 1.398, + "step": 19095 + }, + { + "epoch": 0.24814381061751697, + "grad_norm": 1.1638695001602173, + "learning_rate": 0.00015040226673078674, + "loss": 1.4331, + "step": 19096 + }, + { + "epoch": 0.24815680516143285, + "grad_norm": 0.4792670011520386, + "learning_rate": 0.00015039966726887536, + "loss": 1.376, + "step": 19097 + }, + { + "epoch": 0.24816979970534872, + "grad_norm": 0.44714289903640747, + "learning_rate": 0.00015039706780696396, + "loss": 1.2529, + "step": 19098 + }, + { + "epoch": 0.2481827942492646, + "grad_norm": 0.4257495701313019, + "learning_rate": 0.00015039446834505258, + "loss": 1.3908, + "step": 19099 + }, + { + "epoch": 0.24819578879318047, + "grad_norm": 0.4333556592464447, + "learning_rate": 0.0001503918688831412, + "loss": 1.1902, + "step": 19100 + }, + { + "epoch": 0.24820878333709634, + "grad_norm": 0.29756325483322144, + "learning_rate": 0.0001503892694212298, + "loss": 1.3437, + "step": 19101 + }, + { + "epoch": 0.2482217778810122, + "grad_norm": 0.491445392370224, + "learning_rate": 0.00015038666995931843, + "loss": 1.4148, + "step": 19102 + }, + { + "epoch": 0.24823477242492809, + "grad_norm": 0.47462043166160583, + "learning_rate": 0.00015038407049740703, + "loss": 1.5003, + "step": 19103 + }, + { + "epoch": 0.24824776696884396, + "grad_norm": 0.42689380049705505, + "learning_rate": 0.00015038147103549568, + "loss": 1.5778, + "step": 19104 + }, + { + "epoch": 0.24826076151275983, + "grad_norm": 0.28072085976600647, + "learning_rate": 0.00015037887157358428, + "loss": 1.2322, + "step": 19105 + }, + { + "epoch": 0.2482737560566757, + "grad_norm": 0.46227651834487915, + "learning_rate": 0.0001503762721116729, + "loss": 1.4735, + "step": 19106 + }, + { + "epoch": 0.24828675060059158, + "grad_norm": 0.42137861251831055, + "learning_rate": 0.0001503736726497615, + "loss": 1.5118, + "step": 19107 + }, + { + "epoch": 0.24829974514450745, + "grad_norm": 0.4753071963787079, + "learning_rate": 0.00015037107318785012, + "loss": 1.5744, + "step": 19108 + }, + { + "epoch": 0.24831273968842332, + "grad_norm": 0.38856756687164307, + "learning_rate": 0.00015036847372593875, + "loss": 1.6, + "step": 19109 + }, + { + "epoch": 0.2483257342323392, + "grad_norm": 0.4565708041191101, + "learning_rate": 0.00015036587426402734, + "loss": 1.3489, + "step": 19110 + }, + { + "epoch": 0.24833872877625507, + "grad_norm": 0.421584814786911, + "learning_rate": 0.00015036327480211597, + "loss": 1.3473, + "step": 19111 + }, + { + "epoch": 0.24835172332017094, + "grad_norm": 0.38016435503959656, + "learning_rate": 0.0001503606753402046, + "loss": 1.4225, + "step": 19112 + }, + { + "epoch": 0.24836471786408681, + "grad_norm": 0.4585200250148773, + "learning_rate": 0.0001503580758782932, + "loss": 1.3235, + "step": 19113 + }, + { + "epoch": 0.2483777124080027, + "grad_norm": 0.41468092799186707, + "learning_rate": 0.00015035547641638182, + "loss": 1.498, + "step": 19114 + }, + { + "epoch": 0.24839070695191856, + "grad_norm": 0.2867843210697174, + "learning_rate": 0.0001503528769544704, + "loss": 1.2306, + "step": 19115 + }, + { + "epoch": 0.24840370149583443, + "grad_norm": 0.4398726224899292, + "learning_rate": 0.00015035027749255906, + "loss": 1.5214, + "step": 19116 + }, + { + "epoch": 0.2484166960397503, + "grad_norm": 0.4244706928730011, + "learning_rate": 0.00015034767803064766, + "loss": 1.3045, + "step": 19117 + }, + { + "epoch": 0.24842969058366618, + "grad_norm": 0.3680809736251831, + "learning_rate": 0.00015034507856873629, + "loss": 1.4953, + "step": 19118 + }, + { + "epoch": 0.24844268512758205, + "grad_norm": 0.3799309730529785, + "learning_rate": 0.00015034247910682488, + "loss": 1.3721, + "step": 19119 + }, + { + "epoch": 0.24845567967149793, + "grad_norm": 0.319286972284317, + "learning_rate": 0.0001503398796449135, + "loss": 1.324, + "step": 19120 + }, + { + "epoch": 0.2484686742154138, + "grad_norm": 0.3913177251815796, + "learning_rate": 0.00015033728018300213, + "loss": 1.3181, + "step": 19121 + }, + { + "epoch": 0.24848166875932967, + "grad_norm": 0.4648880958557129, + "learning_rate": 0.00015033468072109073, + "loss": 1.544, + "step": 19122 + }, + { + "epoch": 0.24849466330324554, + "grad_norm": 0.35948699712753296, + "learning_rate": 0.00015033208125917935, + "loss": 1.3323, + "step": 19123 + }, + { + "epoch": 0.24850765784716142, + "grad_norm": 0.43552032113075256, + "learning_rate": 0.00015032948179726798, + "loss": 1.319, + "step": 19124 + }, + { + "epoch": 0.2485206523910773, + "grad_norm": 0.41005679965019226, + "learning_rate": 0.00015032688233535658, + "loss": 1.3827, + "step": 19125 + }, + { + "epoch": 0.24853364693499316, + "grad_norm": 0.3432054817676544, + "learning_rate": 0.0001503242828734452, + "loss": 1.5231, + "step": 19126 + }, + { + "epoch": 0.24854664147890904, + "grad_norm": 0.3863566219806671, + "learning_rate": 0.0001503216834115338, + "loss": 1.3088, + "step": 19127 + }, + { + "epoch": 0.2485596360228249, + "grad_norm": 0.49043983221054077, + "learning_rate": 0.00015031908394962245, + "loss": 1.6035, + "step": 19128 + }, + { + "epoch": 0.24857263056674078, + "grad_norm": 0.42359763383865356, + "learning_rate": 0.00015031648448771105, + "loss": 1.4198, + "step": 19129 + }, + { + "epoch": 0.24858562511065666, + "grad_norm": 0.429117888212204, + "learning_rate": 0.00015031388502579967, + "loss": 1.3661, + "step": 19130 + }, + { + "epoch": 0.24859861965457253, + "grad_norm": 0.3501416742801666, + "learning_rate": 0.0001503112855638883, + "loss": 1.4501, + "step": 19131 + }, + { + "epoch": 0.2486116141984884, + "grad_norm": 0.3616982698440552, + "learning_rate": 0.0001503086861019769, + "loss": 1.3478, + "step": 19132 + }, + { + "epoch": 0.24862460874240427, + "grad_norm": 0.3698638677597046, + "learning_rate": 0.00015030608664006552, + "loss": 1.4481, + "step": 19133 + }, + { + "epoch": 0.24863760328632015, + "grad_norm": 0.34999844431877136, + "learning_rate": 0.00015030348717815412, + "loss": 1.4285, + "step": 19134 + }, + { + "epoch": 0.24865059783023602, + "grad_norm": 0.4181182086467743, + "learning_rate": 0.00015030088771624277, + "loss": 1.4721, + "step": 19135 + }, + { + "epoch": 0.2486635923741519, + "grad_norm": 0.38231828808784485, + "learning_rate": 0.00015029828825433136, + "loss": 1.4484, + "step": 19136 + }, + { + "epoch": 0.24867658691806777, + "grad_norm": 0.3951537609100342, + "learning_rate": 0.00015029568879242, + "loss": 1.4103, + "step": 19137 + }, + { + "epoch": 0.24868958146198364, + "grad_norm": 0.4068044424057007, + "learning_rate": 0.00015029308933050859, + "loss": 1.3895, + "step": 19138 + }, + { + "epoch": 0.2487025760058995, + "grad_norm": 0.3836880326271057, + "learning_rate": 0.0001502904898685972, + "loss": 1.3739, + "step": 19139 + }, + { + "epoch": 0.24871557054981538, + "grad_norm": 0.3770371079444885, + "learning_rate": 0.00015028789040668583, + "loss": 1.4968, + "step": 19140 + }, + { + "epoch": 0.24872856509373126, + "grad_norm": 0.442480206489563, + "learning_rate": 0.00015028529094477443, + "loss": 1.4166, + "step": 19141 + }, + { + "epoch": 0.24874155963764713, + "grad_norm": 0.38389837741851807, + "learning_rate": 0.00015028269148286306, + "loss": 1.3675, + "step": 19142 + }, + { + "epoch": 0.248754554181563, + "grad_norm": 0.3987143039703369, + "learning_rate": 0.00015028009202095168, + "loss": 1.3711, + "step": 19143 + }, + { + "epoch": 0.24876754872547888, + "grad_norm": 0.4099433124065399, + "learning_rate": 0.00015027749255904028, + "loss": 1.3545, + "step": 19144 + }, + { + "epoch": 0.24878054326939475, + "grad_norm": 0.49564334750175476, + "learning_rate": 0.0001502748930971289, + "loss": 1.4243, + "step": 19145 + }, + { + "epoch": 0.24879353781331062, + "grad_norm": 0.45382511615753174, + "learning_rate": 0.0001502722936352175, + "loss": 1.3203, + "step": 19146 + }, + { + "epoch": 0.24880653235722652, + "grad_norm": 0.4894731044769287, + "learning_rate": 0.00015026969417330615, + "loss": 1.6907, + "step": 19147 + }, + { + "epoch": 0.2488195269011424, + "grad_norm": 0.40606409311294556, + "learning_rate": 0.00015026709471139475, + "loss": 1.4863, + "step": 19148 + }, + { + "epoch": 0.24883252144505827, + "grad_norm": 0.3097348213195801, + "learning_rate": 0.00015026449524948337, + "loss": 1.2026, + "step": 19149 + }, + { + "epoch": 0.24884551598897414, + "grad_norm": 0.30959758162498474, + "learning_rate": 0.00015026189578757197, + "loss": 1.3094, + "step": 19150 + }, + { + "epoch": 0.24885851053289001, + "grad_norm": 0.4124270975589752, + "learning_rate": 0.0001502592963256606, + "loss": 1.2226, + "step": 19151 + }, + { + "epoch": 0.2488715050768059, + "grad_norm": 0.29642346501350403, + "learning_rate": 0.00015025669686374922, + "loss": 1.4754, + "step": 19152 + }, + { + "epoch": 0.24888449962072176, + "grad_norm": 0.3061631917953491, + "learning_rate": 0.00015025409740183782, + "loss": 1.4331, + "step": 19153 + }, + { + "epoch": 0.24889749416463763, + "grad_norm": 0.3950229585170746, + "learning_rate": 0.00015025149793992644, + "loss": 1.3908, + "step": 19154 + }, + { + "epoch": 0.2489104887085535, + "grad_norm": 0.3768841326236725, + "learning_rate": 0.00015024889847801507, + "loss": 1.7169, + "step": 19155 + }, + { + "epoch": 0.24892348325246938, + "grad_norm": 0.33753305673599243, + "learning_rate": 0.00015024629901610366, + "loss": 1.4618, + "step": 19156 + }, + { + "epoch": 0.24893647779638525, + "grad_norm": 0.3248215615749359, + "learning_rate": 0.0001502436995541923, + "loss": 1.4749, + "step": 19157 + }, + { + "epoch": 0.24894947234030113, + "grad_norm": 0.4011741280555725, + "learning_rate": 0.00015024110009228089, + "loss": 1.5527, + "step": 19158 + }, + { + "epoch": 0.248962466884217, + "grad_norm": 0.38974371552467346, + "learning_rate": 0.00015023850063036954, + "loss": 1.2757, + "step": 19159 + }, + { + "epoch": 0.24897546142813287, + "grad_norm": 0.4189406931400299, + "learning_rate": 0.00015023590116845813, + "loss": 1.2964, + "step": 19160 + }, + { + "epoch": 0.24898845597204874, + "grad_norm": 0.47264423966407776, + "learning_rate": 0.00015023330170654676, + "loss": 1.3981, + "step": 19161 + }, + { + "epoch": 0.24900145051596462, + "grad_norm": 0.41537484526634216, + "learning_rate": 0.00015023070224463536, + "loss": 1.4059, + "step": 19162 + }, + { + "epoch": 0.2490144450598805, + "grad_norm": 0.40705251693725586, + "learning_rate": 0.00015022810278272398, + "loss": 1.4244, + "step": 19163 + }, + { + "epoch": 0.24902743960379636, + "grad_norm": 0.36592376232147217, + "learning_rate": 0.0001502255033208126, + "loss": 1.6813, + "step": 19164 + }, + { + "epoch": 0.24904043414771224, + "grad_norm": 0.3731706142425537, + "learning_rate": 0.0001502229038589012, + "loss": 1.2138, + "step": 19165 + }, + { + "epoch": 0.2490534286916281, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.00015022030439698983, + "loss": 1.3899, + "step": 19166 + }, + { + "epoch": 0.24906642323554398, + "grad_norm": 0.3592146039009094, + "learning_rate": 0.00015021770493507845, + "loss": 1.196, + "step": 19167 + }, + { + "epoch": 0.24907941777945986, + "grad_norm": 0.44842615723609924, + "learning_rate": 0.00015021510547316705, + "loss": 1.4077, + "step": 19168 + }, + { + "epoch": 0.24909241232337573, + "grad_norm": 0.3890256881713867, + "learning_rate": 0.00015021250601125567, + "loss": 1.4916, + "step": 19169 + }, + { + "epoch": 0.2491054068672916, + "grad_norm": 0.3488661050796509, + "learning_rate": 0.0001502099065493443, + "loss": 1.4662, + "step": 19170 + }, + { + "epoch": 0.24911840141120747, + "grad_norm": 0.4618266224861145, + "learning_rate": 0.00015020730708743292, + "loss": 1.2772, + "step": 19171 + }, + { + "epoch": 0.24913139595512335, + "grad_norm": 0.3948304057121277, + "learning_rate": 0.00015020470762552152, + "loss": 1.4734, + "step": 19172 + }, + { + "epoch": 0.24914439049903922, + "grad_norm": 0.5780940651893616, + "learning_rate": 0.00015020210816361014, + "loss": 1.5365, + "step": 19173 + }, + { + "epoch": 0.2491573850429551, + "grad_norm": 0.40671610832214355, + "learning_rate": 0.00015019950870169877, + "loss": 1.4395, + "step": 19174 + }, + { + "epoch": 0.24917037958687097, + "grad_norm": 0.4146711528301239, + "learning_rate": 0.00015019690923978737, + "loss": 1.3896, + "step": 19175 + }, + { + "epoch": 0.24918337413078684, + "grad_norm": 0.3773079812526703, + "learning_rate": 0.000150194309777876, + "loss": 1.3601, + "step": 19176 + }, + { + "epoch": 0.2491963686747027, + "grad_norm": 0.2879582941532135, + "learning_rate": 0.0001501917103159646, + "loss": 1.275, + "step": 19177 + }, + { + "epoch": 0.24920936321861858, + "grad_norm": 0.38658273220062256, + "learning_rate": 0.00015018911085405324, + "loss": 1.4351, + "step": 19178 + }, + { + "epoch": 0.24922235776253446, + "grad_norm": 0.38487473130226135, + "learning_rate": 0.00015018651139214184, + "loss": 1.405, + "step": 19179 + }, + { + "epoch": 0.24923535230645033, + "grad_norm": 0.38571348786354065, + "learning_rate": 0.00015018391193023043, + "loss": 1.3919, + "step": 19180 + }, + { + "epoch": 0.2492483468503662, + "grad_norm": 0.33694788813591003, + "learning_rate": 0.00015018131246831906, + "loss": 1.2644, + "step": 19181 + }, + { + "epoch": 0.24926134139428208, + "grad_norm": 0.3174707889556885, + "learning_rate": 0.00015017871300640768, + "loss": 1.4046, + "step": 19182 + }, + { + "epoch": 0.24927433593819795, + "grad_norm": 0.39126893877983093, + "learning_rate": 0.0001501761135444963, + "loss": 1.5128, + "step": 19183 + }, + { + "epoch": 0.24928733048211382, + "grad_norm": 0.36360475420951843, + "learning_rate": 0.0001501735140825849, + "loss": 1.3412, + "step": 19184 + }, + { + "epoch": 0.2493003250260297, + "grad_norm": 0.35371512174606323, + "learning_rate": 0.00015017091462067353, + "loss": 1.1731, + "step": 19185 + }, + { + "epoch": 0.24931331956994557, + "grad_norm": 0.429453581571579, + "learning_rate": 0.00015016831515876215, + "loss": 1.4355, + "step": 19186 + }, + { + "epoch": 0.24932631411386144, + "grad_norm": 0.33113598823547363, + "learning_rate": 0.00015016571569685075, + "loss": 1.3623, + "step": 19187 + }, + { + "epoch": 0.24933930865777731, + "grad_norm": 0.6194899082183838, + "learning_rate": 0.00015016311623493938, + "loss": 1.4792, + "step": 19188 + }, + { + "epoch": 0.2493523032016932, + "grad_norm": 0.3941209614276886, + "learning_rate": 0.00015016051677302797, + "loss": 1.279, + "step": 19189 + }, + { + "epoch": 0.24936529774560906, + "grad_norm": 0.38978275656700134, + "learning_rate": 0.00015015791731111663, + "loss": 1.4462, + "step": 19190 + }, + { + "epoch": 0.24937829228952493, + "grad_norm": 0.420627623796463, + "learning_rate": 0.00015015531784920522, + "loss": 1.3809, + "step": 19191 + }, + { + "epoch": 0.2493912868334408, + "grad_norm": 0.32875826954841614, + "learning_rate": 0.00015015271838729382, + "loss": 1.6137, + "step": 19192 + }, + { + "epoch": 0.24940428137735668, + "grad_norm": 0.3822869658470154, + "learning_rate": 0.00015015011892538244, + "loss": 1.4486, + "step": 19193 + }, + { + "epoch": 0.24941727592127255, + "grad_norm": 0.36073216795921326, + "learning_rate": 0.00015014751946347107, + "loss": 1.3713, + "step": 19194 + }, + { + "epoch": 0.24943027046518843, + "grad_norm": 0.4094196856021881, + "learning_rate": 0.0001501449200015597, + "loss": 1.4062, + "step": 19195 + }, + { + "epoch": 0.2494432650091043, + "grad_norm": 0.398723840713501, + "learning_rate": 0.0001501423205396483, + "loss": 1.5017, + "step": 19196 + }, + { + "epoch": 0.24945625955302017, + "grad_norm": 0.4532181918621063, + "learning_rate": 0.00015013972107773692, + "loss": 1.2433, + "step": 19197 + }, + { + "epoch": 0.24946925409693604, + "grad_norm": 0.458676815032959, + "learning_rate": 0.00015013712161582554, + "loss": 1.4285, + "step": 19198 + }, + { + "epoch": 0.24948224864085192, + "grad_norm": 0.4670829772949219, + "learning_rate": 0.00015013452215391414, + "loss": 1.4901, + "step": 19199 + }, + { + "epoch": 0.2494952431847678, + "grad_norm": 0.4188287854194641, + "learning_rate": 0.00015013192269200276, + "loss": 1.4784, + "step": 19200 + }, + { + "epoch": 0.24950823772868366, + "grad_norm": 0.38152992725372314, + "learning_rate": 0.00015012932323009136, + "loss": 1.4712, + "step": 19201 + }, + { + "epoch": 0.24952123227259954, + "grad_norm": 0.42134958505630493, + "learning_rate": 0.00015012672376818, + "loss": 1.4233, + "step": 19202 + }, + { + "epoch": 0.2495342268165154, + "grad_norm": 0.4274691045284271, + "learning_rate": 0.0001501241243062686, + "loss": 1.4307, + "step": 19203 + }, + { + "epoch": 0.24954722136043128, + "grad_norm": 0.39370617270469666, + "learning_rate": 0.00015012152484435723, + "loss": 1.2499, + "step": 19204 + }, + { + "epoch": 0.24956021590434715, + "grad_norm": 0.48284730315208435, + "learning_rate": 0.00015011892538244586, + "loss": 1.3994, + "step": 19205 + }, + { + "epoch": 0.24957321044826303, + "grad_norm": 0.45212820172309875, + "learning_rate": 0.00015011632592053445, + "loss": 1.2625, + "step": 19206 + }, + { + "epoch": 0.2495862049921789, + "grad_norm": 0.4460216462612152, + "learning_rate": 0.00015011372645862308, + "loss": 1.1583, + "step": 19207 + }, + { + "epoch": 0.24959919953609477, + "grad_norm": 0.29853329062461853, + "learning_rate": 0.00015011112699671168, + "loss": 1.5585, + "step": 19208 + }, + { + "epoch": 0.24961219408001065, + "grad_norm": 0.32156065106391907, + "learning_rate": 0.0001501085275348003, + "loss": 1.345, + "step": 19209 + }, + { + "epoch": 0.24962518862392652, + "grad_norm": 0.40135571360588074, + "learning_rate": 0.00015010592807288893, + "loss": 1.396, + "step": 19210 + }, + { + "epoch": 0.2496381831678424, + "grad_norm": 0.4073614478111267, + "learning_rate": 0.00015010332861097752, + "loss": 1.4523, + "step": 19211 + }, + { + "epoch": 0.24965117771175827, + "grad_norm": 0.40700840950012207, + "learning_rate": 0.00015010072914906615, + "loss": 1.4825, + "step": 19212 + }, + { + "epoch": 0.24966417225567414, + "grad_norm": 0.34999483823776245, + "learning_rate": 0.00015009812968715477, + "loss": 1.3906, + "step": 19213 + }, + { + "epoch": 0.24967716679959, + "grad_norm": 0.31180256605148315, + "learning_rate": 0.0001500955302252434, + "loss": 1.2635, + "step": 19214 + }, + { + "epoch": 0.24969016134350588, + "grad_norm": 0.48124930262565613, + "learning_rate": 0.000150092930763332, + "loss": 1.5987, + "step": 19215 + }, + { + "epoch": 0.24970315588742176, + "grad_norm": 0.2858378291130066, + "learning_rate": 0.00015009033130142062, + "loss": 1.0638, + "step": 19216 + }, + { + "epoch": 0.24971615043133763, + "grad_norm": 0.43331724405288696, + "learning_rate": 0.00015008773183950924, + "loss": 1.4054, + "step": 19217 + }, + { + "epoch": 0.2497291449752535, + "grad_norm": 0.40219780802726746, + "learning_rate": 0.00015008513237759784, + "loss": 1.3669, + "step": 19218 + }, + { + "epoch": 0.24974213951916938, + "grad_norm": 0.35651901364326477, + "learning_rate": 0.00015008253291568646, + "loss": 1.4312, + "step": 19219 + }, + { + "epoch": 0.24975513406308525, + "grad_norm": 0.4978581964969635, + "learning_rate": 0.00015007993345377506, + "loss": 1.4445, + "step": 19220 + }, + { + "epoch": 0.24976812860700112, + "grad_norm": 0.31322818994522095, + "learning_rate": 0.0001500773339918637, + "loss": 1.2865, + "step": 19221 + }, + { + "epoch": 0.249781123150917, + "grad_norm": 0.30895453691482544, + "learning_rate": 0.0001500747345299523, + "loss": 1.3561, + "step": 19222 + }, + { + "epoch": 0.2497941176948329, + "grad_norm": 0.49428462982177734, + "learning_rate": 0.0001500721350680409, + "loss": 1.3864, + "step": 19223 + }, + { + "epoch": 0.24980711223874877, + "grad_norm": 0.3733403980731964, + "learning_rate": 0.00015006953560612953, + "loss": 1.483, + "step": 19224 + }, + { + "epoch": 0.24982010678266464, + "grad_norm": 0.39760130643844604, + "learning_rate": 0.00015006693614421816, + "loss": 1.2236, + "step": 19225 + }, + { + "epoch": 0.24983310132658051, + "grad_norm": 0.4495384991168976, + "learning_rate": 0.00015006433668230678, + "loss": 1.666, + "step": 19226 + }, + { + "epoch": 0.2498460958704964, + "grad_norm": 0.3407870829105377, + "learning_rate": 0.00015006173722039538, + "loss": 1.3406, + "step": 19227 + }, + { + "epoch": 0.24985909041441226, + "grad_norm": 0.33544498682022095, + "learning_rate": 0.000150059137758484, + "loss": 1.458, + "step": 19228 + }, + { + "epoch": 0.24987208495832813, + "grad_norm": 0.40889278054237366, + "learning_rate": 0.00015005653829657263, + "loss": 1.4447, + "step": 19229 + }, + { + "epoch": 0.249885079502244, + "grad_norm": 0.3434186577796936, + "learning_rate": 0.00015005393883466123, + "loss": 1.4921, + "step": 19230 + }, + { + "epoch": 0.24989807404615988, + "grad_norm": 0.43655088543891907, + "learning_rate": 0.00015005133937274985, + "loss": 1.4809, + "step": 19231 + }, + { + "epoch": 0.24991106859007575, + "grad_norm": 0.38057807087898254, + "learning_rate": 0.00015004873991083845, + "loss": 1.5081, + "step": 19232 + }, + { + "epoch": 0.24992406313399163, + "grad_norm": 0.4958529472351074, + "learning_rate": 0.0001500461404489271, + "loss": 1.502, + "step": 19233 + }, + { + "epoch": 0.2499370576779075, + "grad_norm": 0.40056657791137695, + "learning_rate": 0.0001500435409870157, + "loss": 1.1288, + "step": 19234 + }, + { + "epoch": 0.24995005222182337, + "grad_norm": 0.40495529770851135, + "learning_rate": 0.0001500409415251043, + "loss": 1.3488, + "step": 19235 + }, + { + "epoch": 0.24996304676573924, + "grad_norm": 0.3337843716144562, + "learning_rate": 0.00015003834206319292, + "loss": 1.4262, + "step": 19236 + }, + { + "epoch": 0.24997604130965512, + "grad_norm": 0.3919743597507477, + "learning_rate": 0.00015003574260128154, + "loss": 1.4275, + "step": 19237 + }, + { + "epoch": 0.249989035853571, + "grad_norm": 0.42955249547958374, + "learning_rate": 0.00015003314313937017, + "loss": 1.3145, + "step": 19238 + }, + { + "epoch": 0.25000203039748686, + "grad_norm": 0.41069361567497253, + "learning_rate": 0.00015003054367745876, + "loss": 1.5642, + "step": 19239 + }, + { + "epoch": 0.2500150249414027, + "grad_norm": 0.4328746199607849, + "learning_rate": 0.0001500279442155474, + "loss": 1.4593, + "step": 19240 + }, + { + "epoch": 0.2500280194853186, + "grad_norm": 0.3221152126789093, + "learning_rate": 0.000150025344753636, + "loss": 1.2975, + "step": 19241 + }, + { + "epoch": 0.25004101402923445, + "grad_norm": 0.4952162504196167, + "learning_rate": 0.0001500227452917246, + "loss": 1.4493, + "step": 19242 + }, + { + "epoch": 0.25005400857315035, + "grad_norm": 0.27219510078430176, + "learning_rate": 0.00015002014582981324, + "loss": 1.1862, + "step": 19243 + }, + { + "epoch": 0.2500670031170662, + "grad_norm": 0.40036454796791077, + "learning_rate": 0.00015001754636790186, + "loss": 1.2394, + "step": 19244 + }, + { + "epoch": 0.2500799976609821, + "grad_norm": 0.3157905340194702, + "learning_rate": 0.00015001494690599048, + "loss": 1.3851, + "step": 19245 + }, + { + "epoch": 0.25009299220489795, + "grad_norm": 0.3738485276699066, + "learning_rate": 0.00015001234744407908, + "loss": 1.7611, + "step": 19246 + }, + { + "epoch": 0.25010598674881385, + "grad_norm": 0.4408351182937622, + "learning_rate": 0.00015000974798216768, + "loss": 1.229, + "step": 19247 + }, + { + "epoch": 0.2501189812927297, + "grad_norm": 0.3717687726020813, + "learning_rate": 0.00015000714852025633, + "loss": 1.5592, + "step": 19248 + }, + { + "epoch": 0.2501319758366456, + "grad_norm": 0.32244768738746643, + "learning_rate": 0.00015000454905834493, + "loss": 1.4232, + "step": 19249 + }, + { + "epoch": 0.25014497038056144, + "grad_norm": 0.39478233456611633, + "learning_rate": 0.00015000194959643355, + "loss": 1.2541, + "step": 19250 + }, + { + "epoch": 0.25015796492447734, + "grad_norm": 0.3973926901817322, + "learning_rate": 0.00014999935013452215, + "loss": 1.4973, + "step": 19251 + }, + { + "epoch": 0.2501709594683932, + "grad_norm": 0.41790857911109924, + "learning_rate": 0.00014999675067261077, + "loss": 1.5505, + "step": 19252 + }, + { + "epoch": 0.2501839540123091, + "grad_norm": 0.3618704676628113, + "learning_rate": 0.0001499941512106994, + "loss": 1.3576, + "step": 19253 + }, + { + "epoch": 0.25019694855622493, + "grad_norm": 0.38586872816085815, + "learning_rate": 0.000149991551748788, + "loss": 1.3541, + "step": 19254 + }, + { + "epoch": 0.25020994310014083, + "grad_norm": 0.4389822781085968, + "learning_rate": 0.00014998895228687662, + "loss": 1.4139, + "step": 19255 + }, + { + "epoch": 0.2502229376440567, + "grad_norm": 0.43892979621887207, + "learning_rate": 0.00014998635282496525, + "loss": 1.3491, + "step": 19256 + }, + { + "epoch": 0.2502359321879726, + "grad_norm": 0.45594754815101624, + "learning_rate": 0.00014998375336305387, + "loss": 1.4041, + "step": 19257 + }, + { + "epoch": 0.2502489267318884, + "grad_norm": 0.37851089239120483, + "learning_rate": 0.00014998115390114247, + "loss": 1.3846, + "step": 19258 + }, + { + "epoch": 0.2502619212758043, + "grad_norm": 0.3150915503501892, + "learning_rate": 0.0001499785544392311, + "loss": 1.2744, + "step": 19259 + }, + { + "epoch": 0.25027491581972017, + "grad_norm": 0.43895670771598816, + "learning_rate": 0.00014997595497731972, + "loss": 1.3074, + "step": 19260 + }, + { + "epoch": 0.25028791036363607, + "grad_norm": 0.32327836751937866, + "learning_rate": 0.0001499733555154083, + "loss": 1.5824, + "step": 19261 + }, + { + "epoch": 0.25030090490755197, + "grad_norm": 0.38560882210731506, + "learning_rate": 0.00014997075605349694, + "loss": 1.3508, + "step": 19262 + }, + { + "epoch": 0.2503138994514678, + "grad_norm": 0.3774488866329193, + "learning_rate": 0.00014996815659158554, + "loss": 1.2967, + "step": 19263 + }, + { + "epoch": 0.2503268939953837, + "grad_norm": 0.3617823123931885, + "learning_rate": 0.00014996555712967416, + "loss": 1.4806, + "step": 19264 + }, + { + "epoch": 0.25033988853929956, + "grad_norm": 0.3435172438621521, + "learning_rate": 0.00014996295766776278, + "loss": 1.3939, + "step": 19265 + }, + { + "epoch": 0.25035288308321546, + "grad_norm": 0.4564226567745209, + "learning_rate": 0.00014996035820585138, + "loss": 1.4135, + "step": 19266 + }, + { + "epoch": 0.2503658776271313, + "grad_norm": 0.3977218568325043, + "learning_rate": 0.00014995775874394, + "loss": 1.4134, + "step": 19267 + }, + { + "epoch": 0.2503788721710472, + "grad_norm": 0.3221655786037445, + "learning_rate": 0.00014995515928202863, + "loss": 1.3027, + "step": 19268 + }, + { + "epoch": 0.25039186671496305, + "grad_norm": 0.41427087783813477, + "learning_rate": 0.00014995255982011725, + "loss": 1.3775, + "step": 19269 + }, + { + "epoch": 0.25040486125887895, + "grad_norm": 0.31644406914711, + "learning_rate": 0.00014994996035820585, + "loss": 1.2903, + "step": 19270 + }, + { + "epoch": 0.2504178558027948, + "grad_norm": 0.4224737286567688, + "learning_rate": 0.00014994736089629448, + "loss": 1.3279, + "step": 19271 + }, + { + "epoch": 0.2504308503467107, + "grad_norm": 0.4087494909763336, + "learning_rate": 0.0001499447614343831, + "loss": 1.484, + "step": 19272 + }, + { + "epoch": 0.25044384489062654, + "grad_norm": 0.48848119378089905, + "learning_rate": 0.0001499421619724717, + "loss": 1.299, + "step": 19273 + }, + { + "epoch": 0.25045683943454244, + "grad_norm": 0.5020224452018738, + "learning_rate": 0.00014993956251056032, + "loss": 1.481, + "step": 19274 + }, + { + "epoch": 0.2504698339784583, + "grad_norm": 0.48772576451301575, + "learning_rate": 0.00014993696304864892, + "loss": 1.3352, + "step": 19275 + }, + { + "epoch": 0.2504828285223742, + "grad_norm": 0.3205771744251251, + "learning_rate": 0.00014993436358673755, + "loss": 1.2712, + "step": 19276 + }, + { + "epoch": 0.25049582306629004, + "grad_norm": 0.43504101037979126, + "learning_rate": 0.00014993176412482617, + "loss": 1.4005, + "step": 19277 + }, + { + "epoch": 0.25050881761020594, + "grad_norm": 0.41573214530944824, + "learning_rate": 0.00014992916466291477, + "loss": 1.5298, + "step": 19278 + }, + { + "epoch": 0.2505218121541218, + "grad_norm": 0.38092154264450073, + "learning_rate": 0.00014992656520100342, + "loss": 1.3994, + "step": 19279 + }, + { + "epoch": 0.2505348066980377, + "grad_norm": 0.2937425374984741, + "learning_rate": 0.00014992396573909202, + "loss": 1.4855, + "step": 19280 + }, + { + "epoch": 0.2505478012419535, + "grad_norm": 0.3984566032886505, + "learning_rate": 0.00014992136627718064, + "loss": 1.3065, + "step": 19281 + }, + { + "epoch": 0.25056079578586943, + "grad_norm": 0.4287436306476593, + "learning_rate": 0.00014991876681526924, + "loss": 1.3654, + "step": 19282 + }, + { + "epoch": 0.2505737903297853, + "grad_norm": 0.41433101892471313, + "learning_rate": 0.00014991616735335786, + "loss": 1.3354, + "step": 19283 + }, + { + "epoch": 0.2505867848737012, + "grad_norm": 0.343309223651886, + "learning_rate": 0.0001499135678914465, + "loss": 1.4141, + "step": 19284 + }, + { + "epoch": 0.250599779417617, + "grad_norm": 0.36281776428222656, + "learning_rate": 0.00014991096842953508, + "loss": 1.2881, + "step": 19285 + }, + { + "epoch": 0.2506127739615329, + "grad_norm": 0.4217841327190399, + "learning_rate": 0.0001499083689676237, + "loss": 1.5818, + "step": 19286 + }, + { + "epoch": 0.25062576850544876, + "grad_norm": 0.42087310552597046, + "learning_rate": 0.00014990576950571233, + "loss": 1.5077, + "step": 19287 + }, + { + "epoch": 0.25063876304936467, + "grad_norm": 0.4089794158935547, + "learning_rate": 0.00014990317004380096, + "loss": 1.4815, + "step": 19288 + }, + { + "epoch": 0.2506517575932805, + "grad_norm": 0.34926196932792664, + "learning_rate": 0.00014990057058188955, + "loss": 1.3468, + "step": 19289 + }, + { + "epoch": 0.2506647521371964, + "grad_norm": 0.40168845653533936, + "learning_rate": 0.00014989797111997815, + "loss": 1.575, + "step": 19290 + }, + { + "epoch": 0.25067774668111226, + "grad_norm": 0.4461529850959778, + "learning_rate": 0.0001498953716580668, + "loss": 1.599, + "step": 19291 + }, + { + "epoch": 0.25069074122502816, + "grad_norm": 0.5150933861732483, + "learning_rate": 0.0001498927721961554, + "loss": 1.6999, + "step": 19292 + }, + { + "epoch": 0.250703735768944, + "grad_norm": 0.4014335572719574, + "learning_rate": 0.00014989017273424403, + "loss": 1.3744, + "step": 19293 + }, + { + "epoch": 0.2507167303128599, + "grad_norm": 0.44097307324409485, + "learning_rate": 0.00014988757327233262, + "loss": 1.3409, + "step": 19294 + }, + { + "epoch": 0.25072972485677575, + "grad_norm": 0.4230285584926605, + "learning_rate": 0.00014988497381042125, + "loss": 1.5175, + "step": 19295 + }, + { + "epoch": 0.25074271940069165, + "grad_norm": 0.35233306884765625, + "learning_rate": 0.00014988237434850987, + "loss": 1.5311, + "step": 19296 + }, + { + "epoch": 0.2507557139446075, + "grad_norm": 0.2650070786476135, + "learning_rate": 0.00014987977488659847, + "loss": 1.2741, + "step": 19297 + }, + { + "epoch": 0.2507687084885234, + "grad_norm": 0.39668363332748413, + "learning_rate": 0.0001498771754246871, + "loss": 1.4199, + "step": 19298 + }, + { + "epoch": 0.25078170303243924, + "grad_norm": 0.3839765787124634, + "learning_rate": 0.00014987457596277572, + "loss": 1.5785, + "step": 19299 + }, + { + "epoch": 0.25079469757635514, + "grad_norm": 0.4144365191459656, + "learning_rate": 0.00014987197650086434, + "loss": 1.3323, + "step": 19300 + }, + { + "epoch": 0.250807692120271, + "grad_norm": 0.4161668121814728, + "learning_rate": 0.00014986937703895294, + "loss": 1.4305, + "step": 19301 + }, + { + "epoch": 0.2508206866641869, + "grad_norm": 0.4364469349384308, + "learning_rate": 0.00014986677757704154, + "loss": 1.5838, + "step": 19302 + }, + { + "epoch": 0.25083368120810273, + "grad_norm": 0.4104681611061096, + "learning_rate": 0.0001498641781151302, + "loss": 1.4805, + "step": 19303 + }, + { + "epoch": 0.25084667575201863, + "grad_norm": 0.3963468372821808, + "learning_rate": 0.0001498615786532188, + "loss": 1.45, + "step": 19304 + }, + { + "epoch": 0.2508596702959345, + "grad_norm": 0.3825284242630005, + "learning_rate": 0.0001498589791913074, + "loss": 1.418, + "step": 19305 + }, + { + "epoch": 0.2508726648398504, + "grad_norm": 0.3770049512386322, + "learning_rate": 0.000149856379729396, + "loss": 1.3113, + "step": 19306 + }, + { + "epoch": 0.2508856593837662, + "grad_norm": 0.41633182764053345, + "learning_rate": 0.00014985378026748463, + "loss": 1.364, + "step": 19307 + }, + { + "epoch": 0.2508986539276821, + "grad_norm": 0.44034066796302795, + "learning_rate": 0.00014985118080557326, + "loss": 1.2433, + "step": 19308 + }, + { + "epoch": 0.25091164847159797, + "grad_norm": 0.2963191568851471, + "learning_rate": 0.00014984858134366185, + "loss": 1.4263, + "step": 19309 + }, + { + "epoch": 0.25092464301551387, + "grad_norm": 0.3092297315597534, + "learning_rate": 0.00014984598188175048, + "loss": 1.0592, + "step": 19310 + }, + { + "epoch": 0.2509376375594297, + "grad_norm": 0.4465179443359375, + "learning_rate": 0.0001498433824198391, + "loss": 1.6107, + "step": 19311 + }, + { + "epoch": 0.2509506321033456, + "grad_norm": 0.4339119493961334, + "learning_rate": 0.00014984078295792773, + "loss": 1.4658, + "step": 19312 + }, + { + "epoch": 0.25096362664726146, + "grad_norm": 0.46140626072883606, + "learning_rate": 0.00014983818349601633, + "loss": 1.4549, + "step": 19313 + }, + { + "epoch": 0.25097662119117736, + "grad_norm": 0.4929795265197754, + "learning_rate": 0.00014983558403410495, + "loss": 1.4189, + "step": 19314 + }, + { + "epoch": 0.2509896157350932, + "grad_norm": 0.45960184931755066, + "learning_rate": 0.00014983298457219357, + "loss": 1.3732, + "step": 19315 + }, + { + "epoch": 0.2510026102790091, + "grad_norm": 0.42808791995048523, + "learning_rate": 0.00014983038511028217, + "loss": 1.4864, + "step": 19316 + }, + { + "epoch": 0.25101560482292495, + "grad_norm": 0.30764973163604736, + "learning_rate": 0.0001498277856483708, + "loss": 1.5246, + "step": 19317 + }, + { + "epoch": 0.25102859936684085, + "grad_norm": 0.3849032521247864, + "learning_rate": 0.00014982518618645942, + "loss": 1.4392, + "step": 19318 + }, + { + "epoch": 0.2510415939107567, + "grad_norm": 0.3228871524333954, + "learning_rate": 0.00014982258672454802, + "loss": 1.2357, + "step": 19319 + }, + { + "epoch": 0.2510545884546726, + "grad_norm": 0.42633679509162903, + "learning_rate": 0.00014981998726263664, + "loss": 1.4902, + "step": 19320 + }, + { + "epoch": 0.25106758299858845, + "grad_norm": 0.38729336857795715, + "learning_rate": 0.00014981738780072524, + "loss": 1.3342, + "step": 19321 + }, + { + "epoch": 0.25108057754250435, + "grad_norm": 0.318507581949234, + "learning_rate": 0.0001498147883388139, + "loss": 1.3358, + "step": 19322 + }, + { + "epoch": 0.2510935720864202, + "grad_norm": 0.47785621881484985, + "learning_rate": 0.0001498121888769025, + "loss": 1.4492, + "step": 19323 + }, + { + "epoch": 0.2511065666303361, + "grad_norm": 0.3477509617805481, + "learning_rate": 0.00014980958941499111, + "loss": 1.4552, + "step": 19324 + }, + { + "epoch": 0.25111956117425194, + "grad_norm": 0.5051265954971313, + "learning_rate": 0.0001498069899530797, + "loss": 1.5248, + "step": 19325 + }, + { + "epoch": 0.25113255571816784, + "grad_norm": 0.422441691160202, + "learning_rate": 0.00014980439049116834, + "loss": 1.3613, + "step": 19326 + }, + { + "epoch": 0.2511455502620837, + "grad_norm": 0.4308181703090668, + "learning_rate": 0.00014980179102925696, + "loss": 1.5678, + "step": 19327 + }, + { + "epoch": 0.2511585448059996, + "grad_norm": 0.4285312294960022, + "learning_rate": 0.00014979919156734556, + "loss": 1.4648, + "step": 19328 + }, + { + "epoch": 0.25117153934991543, + "grad_norm": 0.3926316797733307, + "learning_rate": 0.00014979659210543418, + "loss": 1.3372, + "step": 19329 + }, + { + "epoch": 0.25118453389383133, + "grad_norm": 0.26758047938346863, + "learning_rate": 0.0001497939926435228, + "loss": 1.4504, + "step": 19330 + }, + { + "epoch": 0.2511975284377472, + "grad_norm": 0.4403420388698578, + "learning_rate": 0.0001497913931816114, + "loss": 1.4039, + "step": 19331 + }, + { + "epoch": 0.2512105229816631, + "grad_norm": 0.35894283652305603, + "learning_rate": 0.00014978879371970003, + "loss": 1.3014, + "step": 19332 + }, + { + "epoch": 0.2512235175255789, + "grad_norm": 0.4046807289123535, + "learning_rate": 0.00014978619425778863, + "loss": 1.4148, + "step": 19333 + }, + { + "epoch": 0.2512365120694948, + "grad_norm": 0.35515445470809937, + "learning_rate": 0.00014978359479587728, + "loss": 1.2439, + "step": 19334 + }, + { + "epoch": 0.25124950661341067, + "grad_norm": 0.36843305826187134, + "learning_rate": 0.00014978099533396587, + "loss": 1.5725, + "step": 19335 + }, + { + "epoch": 0.25126250115732657, + "grad_norm": 0.3644530177116394, + "learning_rate": 0.0001497783958720545, + "loss": 1.3158, + "step": 19336 + }, + { + "epoch": 0.25127549570124247, + "grad_norm": 0.3628610372543335, + "learning_rate": 0.0001497757964101431, + "loss": 1.3984, + "step": 19337 + }, + { + "epoch": 0.2512884902451583, + "grad_norm": 0.4411577582359314, + "learning_rate": 0.00014977319694823172, + "loss": 1.3599, + "step": 19338 + }, + { + "epoch": 0.2513014847890742, + "grad_norm": 0.4517787992954254, + "learning_rate": 0.00014977059748632035, + "loss": 1.4633, + "step": 19339 + }, + { + "epoch": 0.25131447933299006, + "grad_norm": 0.4036463499069214, + "learning_rate": 0.00014976799802440894, + "loss": 1.4428, + "step": 19340 + }, + { + "epoch": 0.25132747387690596, + "grad_norm": 0.3981083333492279, + "learning_rate": 0.00014976539856249757, + "loss": 1.3533, + "step": 19341 + }, + { + "epoch": 0.2513404684208218, + "grad_norm": 0.4366310238838196, + "learning_rate": 0.0001497627991005862, + "loss": 1.3081, + "step": 19342 + }, + { + "epoch": 0.2513534629647377, + "grad_norm": 0.4536551237106323, + "learning_rate": 0.00014976019963867482, + "loss": 1.3329, + "step": 19343 + }, + { + "epoch": 0.25136645750865355, + "grad_norm": 0.4184654951095581, + "learning_rate": 0.00014975760017676341, + "loss": 1.3681, + "step": 19344 + }, + { + "epoch": 0.25137945205256945, + "grad_norm": 0.46820053458213806, + "learning_rate": 0.000149755000714852, + "loss": 1.5395, + "step": 19345 + }, + { + "epoch": 0.2513924465964853, + "grad_norm": 0.4538785517215729, + "learning_rate": 0.00014975240125294066, + "loss": 1.4008, + "step": 19346 + }, + { + "epoch": 0.2514054411404012, + "grad_norm": 0.3344264626502991, + "learning_rate": 0.00014974980179102926, + "loss": 1.5737, + "step": 19347 + }, + { + "epoch": 0.25141843568431704, + "grad_norm": 0.38885772228240967, + "learning_rate": 0.00014974720232911788, + "loss": 1.3549, + "step": 19348 + }, + { + "epoch": 0.25143143022823294, + "grad_norm": 0.3296425938606262, + "learning_rate": 0.00014974460286720648, + "loss": 1.337, + "step": 19349 + }, + { + "epoch": 0.2514444247721488, + "grad_norm": 0.5911374092102051, + "learning_rate": 0.0001497420034052951, + "loss": 1.4441, + "step": 19350 + }, + { + "epoch": 0.2514574193160647, + "grad_norm": 0.4292764663696289, + "learning_rate": 0.00014973940394338373, + "loss": 1.3529, + "step": 19351 + }, + { + "epoch": 0.25147041385998053, + "grad_norm": 0.3773444890975952, + "learning_rate": 0.00014973680448147233, + "loss": 1.3774, + "step": 19352 + }, + { + "epoch": 0.25148340840389644, + "grad_norm": 0.4887596368789673, + "learning_rate": 0.00014973420501956098, + "loss": 1.5905, + "step": 19353 + }, + { + "epoch": 0.2514964029478123, + "grad_norm": 0.4610773026943207, + "learning_rate": 0.00014973160555764958, + "loss": 1.5496, + "step": 19354 + }, + { + "epoch": 0.2515093974917282, + "grad_norm": 0.3434821367263794, + "learning_rate": 0.0001497290060957382, + "loss": 1.5048, + "step": 19355 + }, + { + "epoch": 0.251522392035644, + "grad_norm": 0.4377850890159607, + "learning_rate": 0.0001497264066338268, + "loss": 1.5321, + "step": 19356 + }, + { + "epoch": 0.2515353865795599, + "grad_norm": 0.4377135634422302, + "learning_rate": 0.00014972380717191542, + "loss": 1.4132, + "step": 19357 + }, + { + "epoch": 0.2515483811234758, + "grad_norm": 0.42370685935020447, + "learning_rate": 0.00014972120771000405, + "loss": 1.5159, + "step": 19358 + }, + { + "epoch": 0.2515613756673917, + "grad_norm": 0.36708515882492065, + "learning_rate": 0.00014971860824809265, + "loss": 1.3687, + "step": 19359 + }, + { + "epoch": 0.2515743702113075, + "grad_norm": 0.3527795076370239, + "learning_rate": 0.00014971600878618127, + "loss": 1.4833, + "step": 19360 + }, + { + "epoch": 0.2515873647552234, + "grad_norm": 0.5197107791900635, + "learning_rate": 0.0001497134093242699, + "loss": 1.3474, + "step": 19361 + }, + { + "epoch": 0.25160035929913926, + "grad_norm": 0.2973027229309082, + "learning_rate": 0.0001497108098623585, + "loss": 1.3054, + "step": 19362 + }, + { + "epoch": 0.25161335384305517, + "grad_norm": 0.31174319982528687, + "learning_rate": 0.00014970821040044712, + "loss": 1.1624, + "step": 19363 + }, + { + "epoch": 0.251626348386971, + "grad_norm": 0.33600637316703796, + "learning_rate": 0.00014970561093853571, + "loss": 1.3334, + "step": 19364 + }, + { + "epoch": 0.2516393429308869, + "grad_norm": 0.45112213492393494, + "learning_rate": 0.00014970301147662437, + "loss": 1.4194, + "step": 19365 + }, + { + "epoch": 0.25165233747480276, + "grad_norm": 0.3276121914386749, + "learning_rate": 0.00014970041201471296, + "loss": 1.2727, + "step": 19366 + }, + { + "epoch": 0.25166533201871866, + "grad_norm": 0.32909461855888367, + "learning_rate": 0.0001496978125528016, + "loss": 1.227, + "step": 19367 + }, + { + "epoch": 0.2516783265626345, + "grad_norm": 0.38036054372787476, + "learning_rate": 0.00014969521309089018, + "loss": 1.4674, + "step": 19368 + }, + { + "epoch": 0.2516913211065504, + "grad_norm": 0.425784707069397, + "learning_rate": 0.0001496926136289788, + "loss": 1.5073, + "step": 19369 + }, + { + "epoch": 0.25170431565046625, + "grad_norm": 0.3376021981239319, + "learning_rate": 0.00014969001416706743, + "loss": 1.1589, + "step": 19370 + }, + { + "epoch": 0.25171731019438215, + "grad_norm": 0.43341004848480225, + "learning_rate": 0.00014968741470515603, + "loss": 1.5024, + "step": 19371 + }, + { + "epoch": 0.251730304738298, + "grad_norm": 0.35913532972335815, + "learning_rate": 0.00014968481524324466, + "loss": 1.3625, + "step": 19372 + }, + { + "epoch": 0.2517432992822139, + "grad_norm": 0.3747897446155548, + "learning_rate": 0.00014968221578133328, + "loss": 1.2266, + "step": 19373 + }, + { + "epoch": 0.25175629382612974, + "grad_norm": 0.47092750668525696, + "learning_rate": 0.00014967961631942188, + "loss": 1.4456, + "step": 19374 + }, + { + "epoch": 0.25176928837004564, + "grad_norm": 0.3811807632446289, + "learning_rate": 0.0001496770168575105, + "loss": 1.4112, + "step": 19375 + }, + { + "epoch": 0.2517822829139615, + "grad_norm": 0.3384043574333191, + "learning_rate": 0.0001496744173955991, + "loss": 1.5773, + "step": 19376 + }, + { + "epoch": 0.2517952774578774, + "grad_norm": 0.3639392554759979, + "learning_rate": 0.00014967181793368775, + "loss": 1.3745, + "step": 19377 + }, + { + "epoch": 0.25180827200179323, + "grad_norm": 0.3538079857826233, + "learning_rate": 0.00014966921847177635, + "loss": 1.3261, + "step": 19378 + }, + { + "epoch": 0.25182126654570913, + "grad_norm": 0.27463003993034363, + "learning_rate": 0.00014966661900986497, + "loss": 1.3753, + "step": 19379 + }, + { + "epoch": 0.251834261089625, + "grad_norm": 0.33416783809661865, + "learning_rate": 0.00014966401954795357, + "loss": 1.3011, + "step": 19380 + }, + { + "epoch": 0.2518472556335409, + "grad_norm": 0.42259055376052856, + "learning_rate": 0.0001496614200860422, + "loss": 1.5886, + "step": 19381 + }, + { + "epoch": 0.2518602501774567, + "grad_norm": 0.43376848101615906, + "learning_rate": 0.00014965882062413082, + "loss": 1.5122, + "step": 19382 + }, + { + "epoch": 0.2518732447213726, + "grad_norm": 0.4062389135360718, + "learning_rate": 0.00014965622116221942, + "loss": 1.4191, + "step": 19383 + }, + { + "epoch": 0.25188623926528847, + "grad_norm": 0.2730928361415863, + "learning_rate": 0.00014965362170030804, + "loss": 1.2733, + "step": 19384 + }, + { + "epoch": 0.25189923380920437, + "grad_norm": 0.4791623055934906, + "learning_rate": 0.00014965102223839667, + "loss": 1.39, + "step": 19385 + }, + { + "epoch": 0.2519122283531202, + "grad_norm": 0.3642703592777252, + "learning_rate": 0.00014964842277648526, + "loss": 1.5112, + "step": 19386 + }, + { + "epoch": 0.2519252228970361, + "grad_norm": 0.45902013778686523, + "learning_rate": 0.0001496458233145739, + "loss": 1.4592, + "step": 19387 + }, + { + "epoch": 0.25193821744095196, + "grad_norm": 0.3657056391239166, + "learning_rate": 0.00014964322385266248, + "loss": 1.3107, + "step": 19388 + }, + { + "epoch": 0.25195121198486786, + "grad_norm": 0.3789004385471344, + "learning_rate": 0.00014964062439075114, + "loss": 1.6029, + "step": 19389 + }, + { + "epoch": 0.2519642065287837, + "grad_norm": 0.38772717118263245, + "learning_rate": 0.00014963802492883973, + "loss": 1.5438, + "step": 19390 + }, + { + "epoch": 0.2519772010726996, + "grad_norm": 0.45567506551742554, + "learning_rate": 0.00014963542546692836, + "loss": 1.5508, + "step": 19391 + }, + { + "epoch": 0.25199019561661545, + "grad_norm": 0.3882790207862854, + "learning_rate": 0.00014963282600501698, + "loss": 1.5101, + "step": 19392 + }, + { + "epoch": 0.25200319016053135, + "grad_norm": 0.446857213973999, + "learning_rate": 0.00014963022654310558, + "loss": 1.4696, + "step": 19393 + }, + { + "epoch": 0.2520161847044472, + "grad_norm": 0.3953308165073395, + "learning_rate": 0.0001496276270811942, + "loss": 1.4145, + "step": 19394 + }, + { + "epoch": 0.2520291792483631, + "grad_norm": 0.29922378063201904, + "learning_rate": 0.0001496250276192828, + "loss": 1.3245, + "step": 19395 + }, + { + "epoch": 0.25204217379227895, + "grad_norm": 0.3518284857273102, + "learning_rate": 0.00014962242815737145, + "loss": 1.2183, + "step": 19396 + }, + { + "epoch": 0.25205516833619485, + "grad_norm": 0.4177667796611786, + "learning_rate": 0.00014961982869546005, + "loss": 1.412, + "step": 19397 + }, + { + "epoch": 0.2520681628801107, + "grad_norm": 0.3514609932899475, + "learning_rate": 0.00014961722923354865, + "loss": 1.4467, + "step": 19398 + }, + { + "epoch": 0.2520811574240266, + "grad_norm": 0.3810456097126007, + "learning_rate": 0.00014961462977163727, + "loss": 1.3019, + "step": 19399 + }, + { + "epoch": 0.25209415196794244, + "grad_norm": 0.3124588429927826, + "learning_rate": 0.0001496120303097259, + "loss": 1.4037, + "step": 19400 + }, + { + "epoch": 0.25210714651185834, + "grad_norm": 0.5496308207511902, + "learning_rate": 0.00014960943084781452, + "loss": 1.4659, + "step": 19401 + }, + { + "epoch": 0.2521201410557742, + "grad_norm": 0.27893659472465515, + "learning_rate": 0.00014960683138590312, + "loss": 1.4998, + "step": 19402 + }, + { + "epoch": 0.2521331355996901, + "grad_norm": 0.4640393555164337, + "learning_rate": 0.00014960423192399174, + "loss": 1.5136, + "step": 19403 + }, + { + "epoch": 0.25214613014360593, + "grad_norm": 0.37795260548591614, + "learning_rate": 0.00014960163246208037, + "loss": 1.4603, + "step": 19404 + }, + { + "epoch": 0.25215912468752183, + "grad_norm": 0.3591223955154419, + "learning_rate": 0.00014959903300016897, + "loss": 1.4772, + "step": 19405 + }, + { + "epoch": 0.2521721192314377, + "grad_norm": 0.35912078619003296, + "learning_rate": 0.0001495964335382576, + "loss": 1.5752, + "step": 19406 + }, + { + "epoch": 0.2521851137753536, + "grad_norm": 0.3744770288467407, + "learning_rate": 0.0001495938340763462, + "loss": 1.2821, + "step": 19407 + }, + { + "epoch": 0.2521981083192694, + "grad_norm": 0.41908347606658936, + "learning_rate": 0.00014959123461443484, + "loss": 1.372, + "step": 19408 + }, + { + "epoch": 0.2522111028631853, + "grad_norm": 0.37979528307914734, + "learning_rate": 0.00014958863515252344, + "loss": 1.1564, + "step": 19409 + }, + { + "epoch": 0.25222409740710117, + "grad_norm": 0.38841861486434937, + "learning_rate": 0.00014958603569061206, + "loss": 1.3559, + "step": 19410 + }, + { + "epoch": 0.25223709195101707, + "grad_norm": 0.4274185597896576, + "learning_rate": 0.00014958343622870066, + "loss": 1.2185, + "step": 19411 + }, + { + "epoch": 0.2522500864949329, + "grad_norm": 0.39575764536857605, + "learning_rate": 0.00014958083676678928, + "loss": 1.5424, + "step": 19412 + }, + { + "epoch": 0.2522630810388488, + "grad_norm": 0.46195104718208313, + "learning_rate": 0.0001495782373048779, + "loss": 1.4648, + "step": 19413 + }, + { + "epoch": 0.2522760755827647, + "grad_norm": 0.36915773153305054, + "learning_rate": 0.0001495756378429665, + "loss": 1.3907, + "step": 19414 + }, + { + "epoch": 0.25228907012668056, + "grad_norm": 0.45009738206863403, + "learning_rate": 0.00014957303838105513, + "loss": 1.5096, + "step": 19415 + }, + { + "epoch": 0.25230206467059646, + "grad_norm": 0.31375381350517273, + "learning_rate": 0.00014957043891914375, + "loss": 1.3629, + "step": 19416 + }, + { + "epoch": 0.2523150592145123, + "grad_norm": 0.4297015368938446, + "learning_rate": 0.00014956783945723235, + "loss": 1.388, + "step": 19417 + }, + { + "epoch": 0.2523280537584282, + "grad_norm": 0.4091581702232361, + "learning_rate": 0.00014956523999532097, + "loss": 1.419, + "step": 19418 + }, + { + "epoch": 0.25234104830234405, + "grad_norm": 0.4369363486766815, + "learning_rate": 0.00014956264053340957, + "loss": 1.5665, + "step": 19419 + }, + { + "epoch": 0.25235404284625995, + "grad_norm": 0.32671064138412476, + "learning_rate": 0.00014956004107149822, + "loss": 1.4437, + "step": 19420 + }, + { + "epoch": 0.2523670373901758, + "grad_norm": 0.37904953956604004, + "learning_rate": 0.00014955744160958682, + "loss": 1.6114, + "step": 19421 + }, + { + "epoch": 0.2523800319340917, + "grad_norm": 0.4265599250793457, + "learning_rate": 0.00014955484214767545, + "loss": 1.5065, + "step": 19422 + }, + { + "epoch": 0.25239302647800754, + "grad_norm": 0.43036991357803345, + "learning_rate": 0.00014955224268576404, + "loss": 1.3427, + "step": 19423 + }, + { + "epoch": 0.25240602102192344, + "grad_norm": 0.44632551074028015, + "learning_rate": 0.00014954964322385267, + "loss": 1.3998, + "step": 19424 + }, + { + "epoch": 0.2524190155658393, + "grad_norm": 0.46608179807662964, + "learning_rate": 0.0001495470437619413, + "loss": 1.3282, + "step": 19425 + }, + { + "epoch": 0.2524320101097552, + "grad_norm": 0.4105066657066345, + "learning_rate": 0.0001495444443000299, + "loss": 1.4524, + "step": 19426 + }, + { + "epoch": 0.25244500465367103, + "grad_norm": 0.472917377948761, + "learning_rate": 0.00014954184483811851, + "loss": 1.3913, + "step": 19427 + }, + { + "epoch": 0.25245799919758694, + "grad_norm": 0.40210890769958496, + "learning_rate": 0.00014953924537620714, + "loss": 1.4328, + "step": 19428 + }, + { + "epoch": 0.2524709937415028, + "grad_norm": 0.3711254298686981, + "learning_rate": 0.00014953664591429574, + "loss": 1.3584, + "step": 19429 + }, + { + "epoch": 0.2524839882854187, + "grad_norm": 0.45770329236984253, + "learning_rate": 0.00014953404645238436, + "loss": 1.4096, + "step": 19430 + }, + { + "epoch": 0.2524969828293345, + "grad_norm": 0.4337862432003021, + "learning_rate": 0.00014953144699047298, + "loss": 1.3453, + "step": 19431 + }, + { + "epoch": 0.2525099773732504, + "grad_norm": 0.33598023653030396, + "learning_rate": 0.0001495288475285616, + "loss": 1.527, + "step": 19432 + }, + { + "epoch": 0.2525229719171663, + "grad_norm": 0.3237660229206085, + "learning_rate": 0.0001495262480666502, + "loss": 1.3763, + "step": 19433 + }, + { + "epoch": 0.2525359664610822, + "grad_norm": 0.43121376633644104, + "learning_rate": 0.00014952364860473883, + "loss": 1.5971, + "step": 19434 + }, + { + "epoch": 0.252548961004998, + "grad_norm": 0.40737298130989075, + "learning_rate": 0.00014952104914282746, + "loss": 1.5253, + "step": 19435 + }, + { + "epoch": 0.2525619555489139, + "grad_norm": 0.4197191298007965, + "learning_rate": 0.00014951844968091605, + "loss": 1.4081, + "step": 19436 + }, + { + "epoch": 0.25257495009282976, + "grad_norm": 0.4256860911846161, + "learning_rate": 0.00014951585021900468, + "loss": 1.8356, + "step": 19437 + }, + { + "epoch": 0.25258794463674566, + "grad_norm": 0.4347538948059082, + "learning_rate": 0.00014951325075709327, + "loss": 1.4042, + "step": 19438 + }, + { + "epoch": 0.2526009391806615, + "grad_norm": 0.4465582072734833, + "learning_rate": 0.00014951065129518193, + "loss": 1.4165, + "step": 19439 + }, + { + "epoch": 0.2526139337245774, + "grad_norm": 0.37985777854919434, + "learning_rate": 0.00014950805183327052, + "loss": 1.2847, + "step": 19440 + }, + { + "epoch": 0.25262692826849326, + "grad_norm": 0.3992941081523895, + "learning_rate": 0.00014950545237135912, + "loss": 1.2284, + "step": 19441 + }, + { + "epoch": 0.25263992281240916, + "grad_norm": 0.4348006844520569, + "learning_rate": 0.00014950285290944775, + "loss": 1.4642, + "step": 19442 + }, + { + "epoch": 0.252652917356325, + "grad_norm": 0.3953380882740021, + "learning_rate": 0.00014950025344753637, + "loss": 1.4533, + "step": 19443 + }, + { + "epoch": 0.2526659119002409, + "grad_norm": 0.3284112811088562, + "learning_rate": 0.000149497653985625, + "loss": 1.2262, + "step": 19444 + }, + { + "epoch": 0.25267890644415675, + "grad_norm": 0.42996883392333984, + "learning_rate": 0.0001494950545237136, + "loss": 1.4904, + "step": 19445 + }, + { + "epoch": 0.25269190098807265, + "grad_norm": 0.3848508298397064, + "learning_rate": 0.00014949245506180222, + "loss": 1.5273, + "step": 19446 + }, + { + "epoch": 0.2527048955319885, + "grad_norm": 0.3397538363933563, + "learning_rate": 0.00014948985559989084, + "loss": 1.4848, + "step": 19447 + }, + { + "epoch": 0.2527178900759044, + "grad_norm": 0.4893542528152466, + "learning_rate": 0.00014948725613797944, + "loss": 1.4846, + "step": 19448 + }, + { + "epoch": 0.25273088461982024, + "grad_norm": 0.47302690148353577, + "learning_rate": 0.00014948465667606806, + "loss": 1.5966, + "step": 19449 + }, + { + "epoch": 0.25274387916373614, + "grad_norm": 0.4177716374397278, + "learning_rate": 0.00014948205721415666, + "loss": 1.5485, + "step": 19450 + }, + { + "epoch": 0.252756873707652, + "grad_norm": 0.36448538303375244, + "learning_rate": 0.0001494794577522453, + "loss": 1.4388, + "step": 19451 + }, + { + "epoch": 0.2527698682515679, + "grad_norm": 0.44344890117645264, + "learning_rate": 0.0001494768582903339, + "loss": 1.3458, + "step": 19452 + }, + { + "epoch": 0.25278286279548373, + "grad_norm": 0.34140545129776, + "learning_rate": 0.0001494742588284225, + "loss": 1.418, + "step": 19453 + }, + { + "epoch": 0.25279585733939963, + "grad_norm": 0.5025710463523865, + "learning_rate": 0.00014947165936651113, + "loss": 1.4794, + "step": 19454 + }, + { + "epoch": 0.2528088518833155, + "grad_norm": 0.34338274598121643, + "learning_rate": 0.00014946905990459976, + "loss": 1.41, + "step": 19455 + }, + { + "epoch": 0.2528218464272314, + "grad_norm": 0.4069278836250305, + "learning_rate": 0.00014946646044268838, + "loss": 1.291, + "step": 19456 + }, + { + "epoch": 0.2528348409711472, + "grad_norm": 0.3977620601654053, + "learning_rate": 0.00014946386098077698, + "loss": 1.4239, + "step": 19457 + }, + { + "epoch": 0.2528478355150631, + "grad_norm": 0.36042532324790955, + "learning_rate": 0.0001494612615188656, + "loss": 1.3277, + "step": 19458 + }, + { + "epoch": 0.25286083005897897, + "grad_norm": 0.3523425757884979, + "learning_rate": 0.00014945866205695423, + "loss": 1.3304, + "step": 19459 + }, + { + "epoch": 0.25287382460289487, + "grad_norm": 0.4886690676212311, + "learning_rate": 0.00014945606259504282, + "loss": 1.2939, + "step": 19460 + }, + { + "epoch": 0.2528868191468107, + "grad_norm": 0.40894386172294617, + "learning_rate": 0.00014945346313313145, + "loss": 1.4308, + "step": 19461 + }, + { + "epoch": 0.2528998136907266, + "grad_norm": 0.3787551820278168, + "learning_rate": 0.00014945086367122005, + "loss": 1.563, + "step": 19462 + }, + { + "epoch": 0.25291280823464246, + "grad_norm": 0.4399498999118805, + "learning_rate": 0.0001494482642093087, + "loss": 1.4975, + "step": 19463 + }, + { + "epoch": 0.25292580277855836, + "grad_norm": 0.35064423084259033, + "learning_rate": 0.0001494456647473973, + "loss": 1.2873, + "step": 19464 + }, + { + "epoch": 0.2529387973224742, + "grad_norm": 0.3743061125278473, + "learning_rate": 0.00014944306528548592, + "loss": 1.4569, + "step": 19465 + }, + { + "epoch": 0.2529517918663901, + "grad_norm": 0.4102039635181427, + "learning_rate": 0.00014944046582357454, + "loss": 1.5506, + "step": 19466 + }, + { + "epoch": 0.25296478641030595, + "grad_norm": 0.34494784474372864, + "learning_rate": 0.00014943786636166314, + "loss": 1.452, + "step": 19467 + }, + { + "epoch": 0.25297778095422185, + "grad_norm": 0.41487744450569153, + "learning_rate": 0.00014943526689975177, + "loss": 1.5787, + "step": 19468 + }, + { + "epoch": 0.2529907754981377, + "grad_norm": 0.3826618194580078, + "learning_rate": 0.00014943266743784036, + "loss": 1.5631, + "step": 19469 + }, + { + "epoch": 0.2530037700420536, + "grad_norm": 0.4021928012371063, + "learning_rate": 0.000149430067975929, + "loss": 1.4804, + "step": 19470 + }, + { + "epoch": 0.25301676458596944, + "grad_norm": 0.3860153257846832, + "learning_rate": 0.0001494274685140176, + "loss": 1.2641, + "step": 19471 + }, + { + "epoch": 0.25302975912988535, + "grad_norm": 0.3456415832042694, + "learning_rate": 0.0001494248690521062, + "loss": 1.3048, + "step": 19472 + }, + { + "epoch": 0.2530427536738012, + "grad_norm": 0.27741387486457825, + "learning_rate": 0.00014942226959019483, + "loss": 1.3909, + "step": 19473 + }, + { + "epoch": 0.2530557482177171, + "grad_norm": 0.36109939217567444, + "learning_rate": 0.00014941967012828346, + "loss": 1.493, + "step": 19474 + }, + { + "epoch": 0.25306874276163294, + "grad_norm": 0.4939716160297394, + "learning_rate": 0.00014941707066637208, + "loss": 1.3413, + "step": 19475 + }, + { + "epoch": 0.25308173730554884, + "grad_norm": 0.34035468101501465, + "learning_rate": 0.00014941447120446068, + "loss": 1.6573, + "step": 19476 + }, + { + "epoch": 0.2530947318494647, + "grad_norm": 0.3671989440917969, + "learning_rate": 0.0001494118717425493, + "loss": 1.1807, + "step": 19477 + }, + { + "epoch": 0.2531077263933806, + "grad_norm": 0.4366193413734436, + "learning_rate": 0.00014940927228063793, + "loss": 1.5688, + "step": 19478 + }, + { + "epoch": 0.25312072093729643, + "grad_norm": 0.38654372096061707, + "learning_rate": 0.00014940667281872653, + "loss": 1.4302, + "step": 19479 + }, + { + "epoch": 0.25313371548121233, + "grad_norm": 0.37955477833747864, + "learning_rate": 0.00014940407335681515, + "loss": 1.523, + "step": 19480 + }, + { + "epoch": 0.2531467100251282, + "grad_norm": 0.3090011179447174, + "learning_rate": 0.00014940147389490375, + "loss": 1.3223, + "step": 19481 + }, + { + "epoch": 0.2531597045690441, + "grad_norm": 0.3520176112651825, + "learning_rate": 0.00014939887443299237, + "loss": 1.4801, + "step": 19482 + }, + { + "epoch": 0.2531726991129599, + "grad_norm": 0.358134001493454, + "learning_rate": 0.000149396274971081, + "loss": 1.2845, + "step": 19483 + }, + { + "epoch": 0.2531856936568758, + "grad_norm": 0.4331236779689789, + "learning_rate": 0.0001493936755091696, + "loss": 1.4309, + "step": 19484 + }, + { + "epoch": 0.25319868820079167, + "grad_norm": 0.426949679851532, + "learning_rate": 0.00014939107604725822, + "loss": 1.4705, + "step": 19485 + }, + { + "epoch": 0.25321168274470757, + "grad_norm": 0.41182342171669006, + "learning_rate": 0.00014938847658534684, + "loss": 1.4218, + "step": 19486 + }, + { + "epoch": 0.2532246772886234, + "grad_norm": 0.36608171463012695, + "learning_rate": 0.00014938587712343547, + "loss": 1.4997, + "step": 19487 + }, + { + "epoch": 0.2532376718325393, + "grad_norm": 0.40239009261131287, + "learning_rate": 0.00014938327766152407, + "loss": 1.4691, + "step": 19488 + }, + { + "epoch": 0.2532506663764552, + "grad_norm": 0.4432760179042816, + "learning_rate": 0.0001493806781996127, + "loss": 1.3926, + "step": 19489 + }, + { + "epoch": 0.25326366092037106, + "grad_norm": 0.3851640820503235, + "learning_rate": 0.00014937807873770131, + "loss": 1.4094, + "step": 19490 + }, + { + "epoch": 0.25327665546428696, + "grad_norm": 0.4153343141078949, + "learning_rate": 0.0001493754792757899, + "loss": 1.4461, + "step": 19491 + }, + { + "epoch": 0.2532896500082028, + "grad_norm": 0.3610401153564453, + "learning_rate": 0.00014937287981387854, + "loss": 1.2303, + "step": 19492 + }, + { + "epoch": 0.2533026445521187, + "grad_norm": 0.40506166219711304, + "learning_rate": 0.00014937028035196713, + "loss": 1.6006, + "step": 19493 + }, + { + "epoch": 0.25331563909603455, + "grad_norm": 0.4519224762916565, + "learning_rate": 0.00014936768089005579, + "loss": 1.3561, + "step": 19494 + }, + { + "epoch": 0.25332863363995045, + "grad_norm": 0.41228964924812317, + "learning_rate": 0.00014936508142814438, + "loss": 1.3722, + "step": 19495 + }, + { + "epoch": 0.2533416281838663, + "grad_norm": 0.6391938924789429, + "learning_rate": 0.00014936248196623298, + "loss": 1.331, + "step": 19496 + }, + { + "epoch": 0.2533546227277822, + "grad_norm": 0.41272252798080444, + "learning_rate": 0.0001493598825043216, + "loss": 1.43, + "step": 19497 + }, + { + "epoch": 0.25336761727169804, + "grad_norm": 0.3163849115371704, + "learning_rate": 0.00014935728304241023, + "loss": 1.2974, + "step": 19498 + }, + { + "epoch": 0.25338061181561394, + "grad_norm": 0.3924661874771118, + "learning_rate": 0.00014935468358049885, + "loss": 1.3742, + "step": 19499 + }, + { + "epoch": 0.2533936063595298, + "grad_norm": 0.3626254200935364, + "learning_rate": 0.00014935208411858745, + "loss": 1.3065, + "step": 19500 + }, + { + "epoch": 0.2534066009034457, + "grad_norm": 0.4591801166534424, + "learning_rate": 0.00014934948465667608, + "loss": 1.4867, + "step": 19501 + }, + { + "epoch": 0.25341959544736153, + "grad_norm": 0.3122835159301758, + "learning_rate": 0.0001493468851947647, + "loss": 1.2819, + "step": 19502 + }, + { + "epoch": 0.25343258999127743, + "grad_norm": 0.4542468190193176, + "learning_rate": 0.0001493442857328533, + "loss": 1.5839, + "step": 19503 + }, + { + "epoch": 0.2534455845351933, + "grad_norm": 0.4456194043159485, + "learning_rate": 0.00014934168627094192, + "loss": 1.5638, + "step": 19504 + }, + { + "epoch": 0.2534585790791092, + "grad_norm": 0.22348594665527344, + "learning_rate": 0.00014933908680903055, + "loss": 1.111, + "step": 19505 + }, + { + "epoch": 0.253471573623025, + "grad_norm": 0.38837847113609314, + "learning_rate": 0.00014933648734711917, + "loss": 1.6684, + "step": 19506 + }, + { + "epoch": 0.2534845681669409, + "grad_norm": 0.3763684630393982, + "learning_rate": 0.00014933388788520777, + "loss": 1.336, + "step": 19507 + }, + { + "epoch": 0.25349756271085677, + "grad_norm": 0.3438556492328644, + "learning_rate": 0.00014933128842329637, + "loss": 1.1228, + "step": 19508 + }, + { + "epoch": 0.2535105572547727, + "grad_norm": 0.38098448514938354, + "learning_rate": 0.00014932868896138502, + "loss": 1.5339, + "step": 19509 + }, + { + "epoch": 0.2535235517986885, + "grad_norm": 0.44352248311042786, + "learning_rate": 0.00014932608949947361, + "loss": 1.3948, + "step": 19510 + }, + { + "epoch": 0.2535365463426044, + "grad_norm": 0.47841498255729675, + "learning_rate": 0.00014932349003756224, + "loss": 1.4295, + "step": 19511 + }, + { + "epoch": 0.25354954088652026, + "grad_norm": 0.43203219771385193, + "learning_rate": 0.00014932089057565084, + "loss": 1.6745, + "step": 19512 + }, + { + "epoch": 0.25356253543043616, + "grad_norm": 0.5679474472999573, + "learning_rate": 0.00014931829111373946, + "loss": 1.5579, + "step": 19513 + }, + { + "epoch": 0.253575529974352, + "grad_norm": 0.4489889442920685, + "learning_rate": 0.00014931569165182809, + "loss": 1.3245, + "step": 19514 + }, + { + "epoch": 0.2535885245182679, + "grad_norm": 0.5117473006248474, + "learning_rate": 0.00014931309218991668, + "loss": 1.5214, + "step": 19515 + }, + { + "epoch": 0.25360151906218376, + "grad_norm": 0.371930867433548, + "learning_rate": 0.0001493104927280053, + "loss": 1.452, + "step": 19516 + }, + { + "epoch": 0.25361451360609966, + "grad_norm": 0.47861620783805847, + "learning_rate": 0.00014930789326609393, + "loss": 1.6071, + "step": 19517 + }, + { + "epoch": 0.2536275081500155, + "grad_norm": 0.3440735638141632, + "learning_rate": 0.00014930529380418256, + "loss": 1.4441, + "step": 19518 + }, + { + "epoch": 0.2536405026939314, + "grad_norm": 0.34542837738990784, + "learning_rate": 0.00014930269434227115, + "loss": 1.4439, + "step": 19519 + }, + { + "epoch": 0.25365349723784725, + "grad_norm": 0.413703978061676, + "learning_rate": 0.00014930009488035978, + "loss": 1.4168, + "step": 19520 + }, + { + "epoch": 0.25366649178176315, + "grad_norm": 0.38638782501220703, + "learning_rate": 0.0001492974954184484, + "loss": 1.3942, + "step": 19521 + }, + { + "epoch": 0.253679486325679, + "grad_norm": 0.42799457907676697, + "learning_rate": 0.000149294895956537, + "loss": 1.3651, + "step": 19522 + }, + { + "epoch": 0.2536924808695949, + "grad_norm": 0.3785470128059387, + "learning_rate": 0.00014929229649462562, + "loss": 1.411, + "step": 19523 + }, + { + "epoch": 0.25370547541351074, + "grad_norm": 0.4188239276409149, + "learning_rate": 0.00014928969703271422, + "loss": 1.477, + "step": 19524 + }, + { + "epoch": 0.25371846995742664, + "grad_norm": 0.5385071039199829, + "learning_rate": 0.00014928709757080285, + "loss": 1.5267, + "step": 19525 + }, + { + "epoch": 0.2537314645013425, + "grad_norm": 0.40900567173957825, + "learning_rate": 0.00014928449810889147, + "loss": 1.3916, + "step": 19526 + }, + { + "epoch": 0.2537444590452584, + "grad_norm": 0.4177548885345459, + "learning_rate": 0.00014928189864698007, + "loss": 1.4331, + "step": 19527 + }, + { + "epoch": 0.25375745358917423, + "grad_norm": 0.4325701594352722, + "learning_rate": 0.0001492792991850687, + "loss": 1.6716, + "step": 19528 + }, + { + "epoch": 0.25377044813309013, + "grad_norm": 0.326905220746994, + "learning_rate": 0.00014927669972315732, + "loss": 1.2966, + "step": 19529 + }, + { + "epoch": 0.253783442677006, + "grad_norm": 0.4979146122932434, + "learning_rate": 0.00014927410026124594, + "loss": 1.5173, + "step": 19530 + }, + { + "epoch": 0.2537964372209219, + "grad_norm": 0.35215646028518677, + "learning_rate": 0.00014927150079933454, + "loss": 1.3729, + "step": 19531 + }, + { + "epoch": 0.2538094317648377, + "grad_norm": 0.47046953439712524, + "learning_rate": 0.00014926890133742316, + "loss": 1.4639, + "step": 19532 + }, + { + "epoch": 0.2538224263087536, + "grad_norm": 0.44784340262413025, + "learning_rate": 0.0001492663018755118, + "loss": 1.353, + "step": 19533 + }, + { + "epoch": 0.25383542085266947, + "grad_norm": 0.4741322994232178, + "learning_rate": 0.00014926370241360039, + "loss": 1.4941, + "step": 19534 + }, + { + "epoch": 0.25384841539658537, + "grad_norm": 0.2875347435474396, + "learning_rate": 0.000149261102951689, + "loss": 1.2758, + "step": 19535 + }, + { + "epoch": 0.2538614099405012, + "grad_norm": 0.3891442120075226, + "learning_rate": 0.0001492585034897776, + "loss": 1.4887, + "step": 19536 + }, + { + "epoch": 0.2538744044844171, + "grad_norm": 0.41554713249206543, + "learning_rate": 0.00014925590402786623, + "loss": 1.4239, + "step": 19537 + }, + { + "epoch": 0.25388739902833296, + "grad_norm": 0.35057684779167175, + "learning_rate": 0.00014925330456595486, + "loss": 1.3987, + "step": 19538 + }, + { + "epoch": 0.25390039357224886, + "grad_norm": 0.4619067311286926, + "learning_rate": 0.00014925070510404345, + "loss": 1.4433, + "step": 19539 + }, + { + "epoch": 0.2539133881161647, + "grad_norm": 0.3216840624809265, + "learning_rate": 0.0001492481056421321, + "loss": 1.5148, + "step": 19540 + }, + { + "epoch": 0.2539263826600806, + "grad_norm": 0.4319811165332794, + "learning_rate": 0.0001492455061802207, + "loss": 1.4827, + "step": 19541 + }, + { + "epoch": 0.25393937720399645, + "grad_norm": 0.46609023213386536, + "learning_rate": 0.00014924290671830933, + "loss": 1.4998, + "step": 19542 + }, + { + "epoch": 0.25395237174791235, + "grad_norm": 0.3596515357494354, + "learning_rate": 0.00014924030725639792, + "loss": 1.5007, + "step": 19543 + }, + { + "epoch": 0.2539653662918282, + "grad_norm": 0.4820636510848999, + "learning_rate": 0.00014923770779448655, + "loss": 1.4179, + "step": 19544 + }, + { + "epoch": 0.2539783608357441, + "grad_norm": 0.3542421758174896, + "learning_rate": 0.00014923510833257517, + "loss": 1.5935, + "step": 19545 + }, + { + "epoch": 0.25399135537965994, + "grad_norm": 0.35289159417152405, + "learning_rate": 0.00014923250887066377, + "loss": 1.5804, + "step": 19546 + }, + { + "epoch": 0.25400434992357585, + "grad_norm": 0.33503708243370056, + "learning_rate": 0.0001492299094087524, + "loss": 1.2877, + "step": 19547 + }, + { + "epoch": 0.2540173444674917, + "grad_norm": 0.328752726316452, + "learning_rate": 0.00014922730994684102, + "loss": 1.5439, + "step": 19548 + }, + { + "epoch": 0.2540303390114076, + "grad_norm": 0.44852712750434875, + "learning_rate": 0.00014922471048492964, + "loss": 1.4289, + "step": 19549 + }, + { + "epoch": 0.25404333355532344, + "grad_norm": 0.39202719926834106, + "learning_rate": 0.00014922211102301824, + "loss": 1.4395, + "step": 19550 + }, + { + "epoch": 0.25405632809923934, + "grad_norm": 0.42687827348709106, + "learning_rate": 0.00014921951156110684, + "loss": 1.213, + "step": 19551 + }, + { + "epoch": 0.2540693226431552, + "grad_norm": 0.4032503664493561, + "learning_rate": 0.0001492169120991955, + "loss": 1.3344, + "step": 19552 + }, + { + "epoch": 0.2540823171870711, + "grad_norm": 0.28723934292793274, + "learning_rate": 0.0001492143126372841, + "loss": 1.2125, + "step": 19553 + }, + { + "epoch": 0.25409531173098693, + "grad_norm": 0.3656887114048004, + "learning_rate": 0.0001492117131753727, + "loss": 1.2898, + "step": 19554 + }, + { + "epoch": 0.25410830627490283, + "grad_norm": 0.35861334204673767, + "learning_rate": 0.0001492091137134613, + "loss": 1.2993, + "step": 19555 + }, + { + "epoch": 0.2541213008188187, + "grad_norm": 0.41531193256378174, + "learning_rate": 0.00014920651425154993, + "loss": 1.5702, + "step": 19556 + }, + { + "epoch": 0.2541342953627346, + "grad_norm": 0.46724745631217957, + "learning_rate": 0.00014920391478963856, + "loss": 1.5282, + "step": 19557 + }, + { + "epoch": 0.2541472899066504, + "grad_norm": 0.40336310863494873, + "learning_rate": 0.00014920131532772716, + "loss": 1.4613, + "step": 19558 + }, + { + "epoch": 0.2541602844505663, + "grad_norm": 0.4190009832382202, + "learning_rate": 0.00014919871586581578, + "loss": 1.4086, + "step": 19559 + }, + { + "epoch": 0.25417327899448217, + "grad_norm": 0.2918795943260193, + "learning_rate": 0.0001491961164039044, + "loss": 1.606, + "step": 19560 + }, + { + "epoch": 0.25418627353839807, + "grad_norm": 0.36437034606933594, + "learning_rate": 0.00014919351694199303, + "loss": 1.3699, + "step": 19561 + }, + { + "epoch": 0.2541992680823139, + "grad_norm": 0.33752548694610596, + "learning_rate": 0.00014919091748008163, + "loss": 1.1943, + "step": 19562 + }, + { + "epoch": 0.2542122626262298, + "grad_norm": 0.5191263556480408, + "learning_rate": 0.00014918831801817022, + "loss": 1.5589, + "step": 19563 + }, + { + "epoch": 0.25422525717014566, + "grad_norm": 0.3960602581501007, + "learning_rate": 0.00014918571855625888, + "loss": 1.4288, + "step": 19564 + }, + { + "epoch": 0.25423825171406156, + "grad_norm": 0.4055216908454895, + "learning_rate": 0.00014918311909434747, + "loss": 1.3877, + "step": 19565 + }, + { + "epoch": 0.25425124625797746, + "grad_norm": 0.4590810239315033, + "learning_rate": 0.0001491805196324361, + "loss": 1.459, + "step": 19566 + }, + { + "epoch": 0.2542642408018933, + "grad_norm": 0.4493219554424286, + "learning_rate": 0.0001491779201705247, + "loss": 1.4646, + "step": 19567 + }, + { + "epoch": 0.2542772353458092, + "grad_norm": 0.42967626452445984, + "learning_rate": 0.00014917532070861332, + "loss": 1.3442, + "step": 19568 + }, + { + "epoch": 0.25429022988972505, + "grad_norm": 0.40593600273132324, + "learning_rate": 0.00014917272124670194, + "loss": 1.4179, + "step": 19569 + }, + { + "epoch": 0.25430322443364095, + "grad_norm": 0.3814839720726013, + "learning_rate": 0.00014917012178479054, + "loss": 1.4409, + "step": 19570 + }, + { + "epoch": 0.2543162189775568, + "grad_norm": 0.333474338054657, + "learning_rate": 0.00014916752232287917, + "loss": 1.4659, + "step": 19571 + }, + { + "epoch": 0.2543292135214727, + "grad_norm": 0.3968014121055603, + "learning_rate": 0.0001491649228609678, + "loss": 1.5165, + "step": 19572 + }, + { + "epoch": 0.25434220806538854, + "grad_norm": 0.5033664107322693, + "learning_rate": 0.00014916232339905641, + "loss": 1.4188, + "step": 19573 + }, + { + "epoch": 0.25435520260930444, + "grad_norm": 0.43290650844573975, + "learning_rate": 0.000149159723937145, + "loss": 1.3538, + "step": 19574 + }, + { + "epoch": 0.2543681971532203, + "grad_norm": 0.42177537083625793, + "learning_rate": 0.0001491571244752336, + "loss": 1.3532, + "step": 19575 + }, + { + "epoch": 0.2543811916971362, + "grad_norm": 0.41929543018341064, + "learning_rate": 0.00014915452501332226, + "loss": 1.6026, + "step": 19576 + }, + { + "epoch": 0.25439418624105203, + "grad_norm": 0.33637744188308716, + "learning_rate": 0.00014915192555141086, + "loss": 1.4604, + "step": 19577 + }, + { + "epoch": 0.25440718078496793, + "grad_norm": 0.406148225069046, + "learning_rate": 0.00014914932608949948, + "loss": 1.325, + "step": 19578 + }, + { + "epoch": 0.2544201753288838, + "grad_norm": 0.41331106424331665, + "learning_rate": 0.0001491467266275881, + "loss": 1.4351, + "step": 19579 + }, + { + "epoch": 0.2544331698727997, + "grad_norm": 0.3987766206264496, + "learning_rate": 0.0001491441271656767, + "loss": 1.4019, + "step": 19580 + }, + { + "epoch": 0.2544461644167155, + "grad_norm": 0.5104241967201233, + "learning_rate": 0.00014914152770376533, + "loss": 1.5371, + "step": 19581 + }, + { + "epoch": 0.2544591589606314, + "grad_norm": 0.34166020154953003, + "learning_rate": 0.00014913892824185393, + "loss": 1.4705, + "step": 19582 + }, + { + "epoch": 0.25447215350454727, + "grad_norm": 0.3657446503639221, + "learning_rate": 0.00014913632877994258, + "loss": 1.4575, + "step": 19583 + }, + { + "epoch": 0.2544851480484632, + "grad_norm": 0.45171964168548584, + "learning_rate": 0.00014913372931803118, + "loss": 1.3922, + "step": 19584 + }, + { + "epoch": 0.254498142592379, + "grad_norm": 0.30195876955986023, + "learning_rate": 0.0001491311298561198, + "loss": 1.2429, + "step": 19585 + }, + { + "epoch": 0.2545111371362949, + "grad_norm": 0.44035932421684265, + "learning_rate": 0.0001491285303942084, + "loss": 1.4496, + "step": 19586 + }, + { + "epoch": 0.25452413168021076, + "grad_norm": 0.4288991689682007, + "learning_rate": 0.00014912593093229702, + "loss": 1.5179, + "step": 19587 + }, + { + "epoch": 0.25453712622412666, + "grad_norm": 0.4843510389328003, + "learning_rate": 0.00014912333147038565, + "loss": 1.4315, + "step": 19588 + }, + { + "epoch": 0.2545501207680425, + "grad_norm": 0.4102731943130493, + "learning_rate": 0.00014912073200847424, + "loss": 1.424, + "step": 19589 + }, + { + "epoch": 0.2545631153119584, + "grad_norm": 0.33283668756484985, + "learning_rate": 0.00014911813254656287, + "loss": 1.2421, + "step": 19590 + }, + { + "epoch": 0.25457610985587426, + "grad_norm": 0.4266267716884613, + "learning_rate": 0.0001491155330846515, + "loss": 1.3228, + "step": 19591 + }, + { + "epoch": 0.25458910439979016, + "grad_norm": 0.4488687813282013, + "learning_rate": 0.0001491129336227401, + "loss": 1.4948, + "step": 19592 + }, + { + "epoch": 0.254602098943706, + "grad_norm": 0.3215477764606476, + "learning_rate": 0.00014911033416082871, + "loss": 1.3819, + "step": 19593 + }, + { + "epoch": 0.2546150934876219, + "grad_norm": 0.2730790972709656, + "learning_rate": 0.0001491077346989173, + "loss": 1.3785, + "step": 19594 + }, + { + "epoch": 0.25462808803153775, + "grad_norm": 0.4222654402256012, + "learning_rate": 0.00014910513523700596, + "loss": 1.4154, + "step": 19595 + }, + { + "epoch": 0.25464108257545365, + "grad_norm": 0.4084135591983795, + "learning_rate": 0.00014910253577509456, + "loss": 1.3022, + "step": 19596 + }, + { + "epoch": 0.2546540771193695, + "grad_norm": 0.31401485204696655, + "learning_rate": 0.00014909993631318319, + "loss": 1.4603, + "step": 19597 + }, + { + "epoch": 0.2546670716632854, + "grad_norm": 0.47276976704597473, + "learning_rate": 0.00014909733685127178, + "loss": 1.494, + "step": 19598 + }, + { + "epoch": 0.25468006620720124, + "grad_norm": 0.5564035773277283, + "learning_rate": 0.0001490947373893604, + "loss": 1.4751, + "step": 19599 + }, + { + "epoch": 0.25469306075111714, + "grad_norm": 0.48834386467933655, + "learning_rate": 0.00014909213792744903, + "loss": 1.3476, + "step": 19600 + }, + { + "epoch": 0.254706055295033, + "grad_norm": 0.3741549551486969, + "learning_rate": 0.00014908953846553763, + "loss": 1.3873, + "step": 19601 + }, + { + "epoch": 0.2547190498389489, + "grad_norm": 0.4064483940601349, + "learning_rate": 0.00014908693900362625, + "loss": 1.4014, + "step": 19602 + }, + { + "epoch": 0.25473204438286473, + "grad_norm": 0.3980434238910675, + "learning_rate": 0.00014908433954171488, + "loss": 1.3172, + "step": 19603 + }, + { + "epoch": 0.25474503892678063, + "grad_norm": 0.36086952686309814, + "learning_rate": 0.00014908174007980348, + "loss": 1.2456, + "step": 19604 + }, + { + "epoch": 0.2547580334706965, + "grad_norm": 0.4407871961593628, + "learning_rate": 0.0001490791406178921, + "loss": 1.494, + "step": 19605 + }, + { + "epoch": 0.2547710280146124, + "grad_norm": 0.36557450890541077, + "learning_rate": 0.0001490765411559807, + "loss": 1.3946, + "step": 19606 + }, + { + "epoch": 0.2547840225585282, + "grad_norm": 0.38791608810424805, + "learning_rate": 0.00014907394169406935, + "loss": 1.2061, + "step": 19607 + }, + { + "epoch": 0.2547970171024441, + "grad_norm": 0.41294312477111816, + "learning_rate": 0.00014907134223215795, + "loss": 1.3249, + "step": 19608 + }, + { + "epoch": 0.25481001164635997, + "grad_norm": 0.38132742047309875, + "learning_rate": 0.00014906874277024657, + "loss": 1.369, + "step": 19609 + }, + { + "epoch": 0.25482300619027587, + "grad_norm": 0.4066297113895416, + "learning_rate": 0.00014906614330833517, + "loss": 1.4157, + "step": 19610 + }, + { + "epoch": 0.2548360007341917, + "grad_norm": 0.4042745530605316, + "learning_rate": 0.0001490635438464238, + "loss": 1.4776, + "step": 19611 + }, + { + "epoch": 0.2548489952781076, + "grad_norm": 0.4441920518875122, + "learning_rate": 0.00014906094438451242, + "loss": 1.4029, + "step": 19612 + }, + { + "epoch": 0.25486198982202346, + "grad_norm": 0.4884093105792999, + "learning_rate": 0.00014905834492260101, + "loss": 1.4921, + "step": 19613 + }, + { + "epoch": 0.25487498436593936, + "grad_norm": 0.40551596879959106, + "learning_rate": 0.00014905574546068967, + "loss": 1.4383, + "step": 19614 + }, + { + "epoch": 0.2548879789098552, + "grad_norm": 0.40478554368019104, + "learning_rate": 0.00014905314599877826, + "loss": 1.4793, + "step": 19615 + }, + { + "epoch": 0.2549009734537711, + "grad_norm": 0.4637174606323242, + "learning_rate": 0.0001490505465368669, + "loss": 1.5266, + "step": 19616 + }, + { + "epoch": 0.25491396799768695, + "grad_norm": 0.42517709732055664, + "learning_rate": 0.00014904794707495549, + "loss": 1.4772, + "step": 19617 + }, + { + "epoch": 0.25492696254160285, + "grad_norm": 0.35457098484039307, + "learning_rate": 0.0001490453476130441, + "loss": 1.4463, + "step": 19618 + }, + { + "epoch": 0.2549399570855187, + "grad_norm": 0.39123281836509705, + "learning_rate": 0.00014904274815113273, + "loss": 1.4359, + "step": 19619 + }, + { + "epoch": 0.2549529516294346, + "grad_norm": 0.5061010718345642, + "learning_rate": 0.00014904014868922133, + "loss": 1.642, + "step": 19620 + }, + { + "epoch": 0.25496594617335044, + "grad_norm": 0.4411238431930542, + "learning_rate": 0.00014903754922730996, + "loss": 1.3891, + "step": 19621 + }, + { + "epoch": 0.25497894071726634, + "grad_norm": 0.39414554834365845, + "learning_rate": 0.00014903494976539858, + "loss": 1.4517, + "step": 19622 + }, + { + "epoch": 0.2549919352611822, + "grad_norm": 0.43490347266197205, + "learning_rate": 0.00014903235030348718, + "loss": 1.4189, + "step": 19623 + }, + { + "epoch": 0.2550049298050981, + "grad_norm": 0.4816271662712097, + "learning_rate": 0.0001490297508415758, + "loss": 1.2582, + "step": 19624 + }, + { + "epoch": 0.25501792434901394, + "grad_norm": 0.46682465076446533, + "learning_rate": 0.0001490271513796644, + "loss": 1.5558, + "step": 19625 + }, + { + "epoch": 0.25503091889292984, + "grad_norm": 0.35740694403648376, + "learning_rate": 0.00014902455191775305, + "loss": 1.3755, + "step": 19626 + }, + { + "epoch": 0.2550439134368457, + "grad_norm": 0.2937428951263428, + "learning_rate": 0.00014902195245584165, + "loss": 1.4038, + "step": 19627 + }, + { + "epoch": 0.2550569079807616, + "grad_norm": 0.41690313816070557, + "learning_rate": 0.00014901935299393027, + "loss": 1.4042, + "step": 19628 + }, + { + "epoch": 0.2550699025246774, + "grad_norm": 0.4506669044494629, + "learning_rate": 0.00014901675353201887, + "loss": 1.3506, + "step": 19629 + }, + { + "epoch": 0.25508289706859333, + "grad_norm": 0.42641496658325195, + "learning_rate": 0.0001490141540701075, + "loss": 1.4267, + "step": 19630 + }, + { + "epoch": 0.2550958916125092, + "grad_norm": 0.46458303928375244, + "learning_rate": 0.00014901155460819612, + "loss": 1.4515, + "step": 19631 + }, + { + "epoch": 0.2551088861564251, + "grad_norm": 0.5402584671974182, + "learning_rate": 0.00014900895514628472, + "loss": 1.5567, + "step": 19632 + }, + { + "epoch": 0.2551218807003409, + "grad_norm": 0.48536399006843567, + "learning_rate": 0.00014900635568437334, + "loss": 1.5556, + "step": 19633 + }, + { + "epoch": 0.2551348752442568, + "grad_norm": 0.3085572421550751, + "learning_rate": 0.00014900375622246197, + "loss": 1.5257, + "step": 19634 + }, + { + "epoch": 0.25514786978817267, + "grad_norm": 0.3858374059200287, + "learning_rate": 0.00014900115676055056, + "loss": 1.5151, + "step": 19635 + }, + { + "epoch": 0.25516086433208857, + "grad_norm": 0.32978740334510803, + "learning_rate": 0.0001489985572986392, + "loss": 1.3794, + "step": 19636 + }, + { + "epoch": 0.2551738588760044, + "grad_norm": 0.5962116718292236, + "learning_rate": 0.00014899595783672779, + "loss": 1.5962, + "step": 19637 + }, + { + "epoch": 0.2551868534199203, + "grad_norm": 0.48759955167770386, + "learning_rate": 0.00014899335837481644, + "loss": 1.4439, + "step": 19638 + }, + { + "epoch": 0.25519984796383616, + "grad_norm": 0.45516636967658997, + "learning_rate": 0.00014899075891290503, + "loss": 1.4525, + "step": 19639 + }, + { + "epoch": 0.25521284250775206, + "grad_norm": 0.488765150308609, + "learning_rate": 0.00014898815945099366, + "loss": 1.4983, + "step": 19640 + }, + { + "epoch": 0.25522583705166796, + "grad_norm": 0.4281080961227417, + "learning_rate": 0.00014898555998908226, + "loss": 1.2567, + "step": 19641 + }, + { + "epoch": 0.2552388315955838, + "grad_norm": 0.4397064745426178, + "learning_rate": 0.00014898296052717088, + "loss": 1.4347, + "step": 19642 + }, + { + "epoch": 0.2552518261394997, + "grad_norm": 0.4339059591293335, + "learning_rate": 0.0001489803610652595, + "loss": 1.2534, + "step": 19643 + }, + { + "epoch": 0.25526482068341555, + "grad_norm": 0.4328799843788147, + "learning_rate": 0.0001489777616033481, + "loss": 1.6832, + "step": 19644 + }, + { + "epoch": 0.25527781522733145, + "grad_norm": 0.36828485131263733, + "learning_rate": 0.00014897516214143673, + "loss": 1.4683, + "step": 19645 + }, + { + "epoch": 0.2552908097712473, + "grad_norm": 0.4733307957649231, + "learning_rate": 0.00014897256267952535, + "loss": 1.3166, + "step": 19646 + }, + { + "epoch": 0.2553038043151632, + "grad_norm": 0.4817982614040375, + "learning_rate": 0.00014896996321761395, + "loss": 1.459, + "step": 19647 + }, + { + "epoch": 0.25531679885907904, + "grad_norm": 0.39325281977653503, + "learning_rate": 0.00014896736375570257, + "loss": 1.3309, + "step": 19648 + }, + { + "epoch": 0.25532979340299494, + "grad_norm": 0.37920570373535156, + "learning_rate": 0.00014896476429379117, + "loss": 1.3799, + "step": 19649 + }, + { + "epoch": 0.2553427879469108, + "grad_norm": 0.4038579761981964, + "learning_rate": 0.00014896216483187982, + "loss": 1.5325, + "step": 19650 + }, + { + "epoch": 0.2553557824908267, + "grad_norm": 0.5017217993736267, + "learning_rate": 0.00014895956536996842, + "loss": 1.4294, + "step": 19651 + }, + { + "epoch": 0.25536877703474253, + "grad_norm": 0.40085774660110474, + "learning_rate": 0.00014895696590805704, + "loss": 1.36, + "step": 19652 + }, + { + "epoch": 0.25538177157865843, + "grad_norm": 0.42601174116134644, + "learning_rate": 0.00014895436644614567, + "loss": 1.3577, + "step": 19653 + }, + { + "epoch": 0.2553947661225743, + "grad_norm": 0.4202224016189575, + "learning_rate": 0.00014895176698423427, + "loss": 1.4851, + "step": 19654 + }, + { + "epoch": 0.2554077606664902, + "grad_norm": 0.44097375869750977, + "learning_rate": 0.0001489491675223229, + "loss": 1.4616, + "step": 19655 + }, + { + "epoch": 0.255420755210406, + "grad_norm": 0.3401409089565277, + "learning_rate": 0.0001489465680604115, + "loss": 1.5165, + "step": 19656 + }, + { + "epoch": 0.2554337497543219, + "grad_norm": 0.563421368598938, + "learning_rate": 0.00014894396859850014, + "loss": 1.5301, + "step": 19657 + }, + { + "epoch": 0.25544674429823777, + "grad_norm": 0.5105387568473816, + "learning_rate": 0.00014894136913658874, + "loss": 1.4498, + "step": 19658 + }, + { + "epoch": 0.25545973884215367, + "grad_norm": 0.39737823605537415, + "learning_rate": 0.00014893876967467733, + "loss": 1.4409, + "step": 19659 + }, + { + "epoch": 0.2554727333860695, + "grad_norm": 0.38277554512023926, + "learning_rate": 0.00014893617021276596, + "loss": 1.5231, + "step": 19660 + }, + { + "epoch": 0.2554857279299854, + "grad_norm": 0.40716618299484253, + "learning_rate": 0.00014893357075085458, + "loss": 1.3287, + "step": 19661 + }, + { + "epoch": 0.25549872247390126, + "grad_norm": 0.3670077919960022, + "learning_rate": 0.0001489309712889432, + "loss": 1.3276, + "step": 19662 + }, + { + "epoch": 0.25551171701781716, + "grad_norm": 0.43973225355148315, + "learning_rate": 0.0001489283718270318, + "loss": 1.3053, + "step": 19663 + }, + { + "epoch": 0.255524711561733, + "grad_norm": 0.38072311878204346, + "learning_rate": 0.00014892577236512043, + "loss": 1.5744, + "step": 19664 + }, + { + "epoch": 0.2555377061056489, + "grad_norm": 0.4879327118396759, + "learning_rate": 0.00014892317290320905, + "loss": 1.4884, + "step": 19665 + }, + { + "epoch": 0.25555070064956475, + "grad_norm": 0.33255159854888916, + "learning_rate": 0.00014892057344129765, + "loss": 1.303, + "step": 19666 + }, + { + "epoch": 0.25556369519348066, + "grad_norm": 0.3579411208629608, + "learning_rate": 0.00014891797397938628, + "loss": 1.2984, + "step": 19667 + }, + { + "epoch": 0.2555766897373965, + "grad_norm": 0.4280683398246765, + "learning_rate": 0.00014891537451747487, + "loss": 1.486, + "step": 19668 + }, + { + "epoch": 0.2555896842813124, + "grad_norm": 0.33373525738716125, + "learning_rate": 0.00014891277505556352, + "loss": 1.5716, + "step": 19669 + }, + { + "epoch": 0.25560267882522825, + "grad_norm": 0.3290935456752777, + "learning_rate": 0.00014891017559365212, + "loss": 1.2735, + "step": 19670 + }, + { + "epoch": 0.25561567336914415, + "grad_norm": 0.39359936118125916, + "learning_rate": 0.00014890757613174075, + "loss": 1.3702, + "step": 19671 + }, + { + "epoch": 0.25562866791306, + "grad_norm": 0.3568744659423828, + "learning_rate": 0.00014890497666982934, + "loss": 1.2655, + "step": 19672 + }, + { + "epoch": 0.2556416624569759, + "grad_norm": 0.42902329564094543, + "learning_rate": 0.00014890237720791797, + "loss": 1.4409, + "step": 19673 + }, + { + "epoch": 0.25565465700089174, + "grad_norm": 0.47060903906822205, + "learning_rate": 0.0001488997777460066, + "loss": 1.4231, + "step": 19674 + }, + { + "epoch": 0.25566765154480764, + "grad_norm": 0.383518785238266, + "learning_rate": 0.0001488971782840952, + "loss": 1.4637, + "step": 19675 + }, + { + "epoch": 0.2556806460887235, + "grad_norm": 0.35775506496429443, + "learning_rate": 0.00014889457882218382, + "loss": 1.4128, + "step": 19676 + }, + { + "epoch": 0.2556936406326394, + "grad_norm": 0.34001848101615906, + "learning_rate": 0.00014889197936027244, + "loss": 1.2971, + "step": 19677 + }, + { + "epoch": 0.25570663517655523, + "grad_norm": 0.37495020031929016, + "learning_rate": 0.00014888937989836104, + "loss": 1.3279, + "step": 19678 + }, + { + "epoch": 0.25571962972047113, + "grad_norm": 0.43208250403404236, + "learning_rate": 0.00014888678043644966, + "loss": 1.5267, + "step": 19679 + }, + { + "epoch": 0.255732624264387, + "grad_norm": 0.4606996476650238, + "learning_rate": 0.00014888418097453826, + "loss": 1.4772, + "step": 19680 + }, + { + "epoch": 0.2557456188083029, + "grad_norm": 0.32310324907302856, + "learning_rate": 0.0001488815815126269, + "loss": 1.3419, + "step": 19681 + }, + { + "epoch": 0.2557586133522187, + "grad_norm": 0.46317756175994873, + "learning_rate": 0.0001488789820507155, + "loss": 1.3172, + "step": 19682 + }, + { + "epoch": 0.2557716078961346, + "grad_norm": 0.4242875277996063, + "learning_rate": 0.00014887638258880413, + "loss": 1.3944, + "step": 19683 + }, + { + "epoch": 0.25578460244005047, + "grad_norm": 0.4088890254497528, + "learning_rate": 0.00014887378312689273, + "loss": 1.462, + "step": 19684 + }, + { + "epoch": 0.25579759698396637, + "grad_norm": 0.474467009305954, + "learning_rate": 0.00014887118366498135, + "loss": 1.5759, + "step": 19685 + }, + { + "epoch": 0.2558105915278822, + "grad_norm": 0.4188452363014221, + "learning_rate": 0.00014886858420306998, + "loss": 1.39, + "step": 19686 + }, + { + "epoch": 0.2558235860717981, + "grad_norm": 0.3549194633960724, + "learning_rate": 0.00014886598474115858, + "loss": 1.4429, + "step": 19687 + }, + { + "epoch": 0.25583658061571396, + "grad_norm": 0.3320971727371216, + "learning_rate": 0.0001488633852792472, + "loss": 1.2474, + "step": 19688 + }, + { + "epoch": 0.25584957515962986, + "grad_norm": 0.38851800560951233, + "learning_rate": 0.00014886078581733582, + "loss": 1.2246, + "step": 19689 + }, + { + "epoch": 0.2558625697035457, + "grad_norm": 0.3691411018371582, + "learning_rate": 0.00014885818635542442, + "loss": 1.3217, + "step": 19690 + }, + { + "epoch": 0.2558755642474616, + "grad_norm": 0.420337051153183, + "learning_rate": 0.00014885558689351305, + "loss": 1.4315, + "step": 19691 + }, + { + "epoch": 0.25588855879137745, + "grad_norm": 0.4311297833919525, + "learning_rate": 0.00014885298743160167, + "loss": 1.4508, + "step": 19692 + }, + { + "epoch": 0.25590155333529335, + "grad_norm": 0.3846625089645386, + "learning_rate": 0.0001488503879696903, + "loss": 1.275, + "step": 19693 + }, + { + "epoch": 0.2559145478792092, + "grad_norm": 0.2988477647304535, + "learning_rate": 0.0001488477885077789, + "loss": 1.4861, + "step": 19694 + }, + { + "epoch": 0.2559275424231251, + "grad_norm": 0.45092031359672546, + "learning_rate": 0.00014884518904586752, + "loss": 1.4046, + "step": 19695 + }, + { + "epoch": 0.25594053696704094, + "grad_norm": 0.2920062839984894, + "learning_rate": 0.00014884258958395614, + "loss": 1.444, + "step": 19696 + }, + { + "epoch": 0.25595353151095684, + "grad_norm": 0.40450745820999146, + "learning_rate": 0.00014883999012204474, + "loss": 1.3732, + "step": 19697 + }, + { + "epoch": 0.2559665260548727, + "grad_norm": 0.353371262550354, + "learning_rate": 0.00014883739066013336, + "loss": 1.4188, + "step": 19698 + }, + { + "epoch": 0.2559795205987886, + "grad_norm": 0.3861452639102936, + "learning_rate": 0.00014883479119822196, + "loss": 1.3263, + "step": 19699 + }, + { + "epoch": 0.25599251514270444, + "grad_norm": 0.4692375063896179, + "learning_rate": 0.0001488321917363106, + "loss": 1.2456, + "step": 19700 + }, + { + "epoch": 0.25600550968662034, + "grad_norm": 0.4575304687023163, + "learning_rate": 0.0001488295922743992, + "loss": 1.3955, + "step": 19701 + }, + { + "epoch": 0.2560185042305362, + "grad_norm": 0.412777304649353, + "learning_rate": 0.0001488269928124878, + "loss": 1.4081, + "step": 19702 + }, + { + "epoch": 0.2560314987744521, + "grad_norm": 0.45569878816604614, + "learning_rate": 0.00014882439335057643, + "loss": 1.5099, + "step": 19703 + }, + { + "epoch": 0.2560444933183679, + "grad_norm": 0.4113287031650543, + "learning_rate": 0.00014882179388866506, + "loss": 1.4169, + "step": 19704 + }, + { + "epoch": 0.25605748786228383, + "grad_norm": 0.4119247496128082, + "learning_rate": 0.00014881919442675368, + "loss": 1.3847, + "step": 19705 + }, + { + "epoch": 0.2560704824061997, + "grad_norm": 0.4179016649723053, + "learning_rate": 0.00014881659496484228, + "loss": 1.4449, + "step": 19706 + }, + { + "epoch": 0.2560834769501156, + "grad_norm": 0.5122266411781311, + "learning_rate": 0.0001488139955029309, + "loss": 1.453, + "step": 19707 + }, + { + "epoch": 0.2560964714940314, + "grad_norm": 0.3612705171108246, + "learning_rate": 0.00014881139604101953, + "loss": 1.4874, + "step": 19708 + }, + { + "epoch": 0.2561094660379473, + "grad_norm": 0.32217028737068176, + "learning_rate": 0.00014880879657910812, + "loss": 1.3092, + "step": 19709 + }, + { + "epoch": 0.25612246058186316, + "grad_norm": 0.5130923390388489, + "learning_rate": 0.00014880619711719675, + "loss": 1.564, + "step": 19710 + }, + { + "epoch": 0.25613545512577907, + "grad_norm": 0.4182952344417572, + "learning_rate": 0.00014880359765528535, + "loss": 1.2655, + "step": 19711 + }, + { + "epoch": 0.2561484496696949, + "grad_norm": 0.35896793007850647, + "learning_rate": 0.000148800998193374, + "loss": 1.5231, + "step": 19712 + }, + { + "epoch": 0.2561614442136108, + "grad_norm": 0.4089457094669342, + "learning_rate": 0.0001487983987314626, + "loss": 1.6013, + "step": 19713 + }, + { + "epoch": 0.25617443875752666, + "grad_norm": 0.5242648124694824, + "learning_rate": 0.0001487957992695512, + "loss": 1.4914, + "step": 19714 + }, + { + "epoch": 0.25618743330144256, + "grad_norm": 0.42909014225006104, + "learning_rate": 0.00014879319980763982, + "loss": 1.5397, + "step": 19715 + }, + { + "epoch": 0.2562004278453584, + "grad_norm": 0.4052775800228119, + "learning_rate": 0.00014879060034572844, + "loss": 1.7156, + "step": 19716 + }, + { + "epoch": 0.2562134223892743, + "grad_norm": 0.39925897121429443, + "learning_rate": 0.00014878800088381707, + "loss": 1.5665, + "step": 19717 + }, + { + "epoch": 0.2562264169331902, + "grad_norm": 0.4118512272834778, + "learning_rate": 0.00014878540142190566, + "loss": 1.3955, + "step": 19718 + }, + { + "epoch": 0.25623941147710605, + "grad_norm": 0.40068572759628296, + "learning_rate": 0.0001487828019599943, + "loss": 1.4006, + "step": 19719 + }, + { + "epoch": 0.25625240602102195, + "grad_norm": 0.3560904562473297, + "learning_rate": 0.0001487802024980829, + "loss": 1.2326, + "step": 19720 + }, + { + "epoch": 0.2562654005649378, + "grad_norm": 0.35685473680496216, + "learning_rate": 0.0001487776030361715, + "loss": 1.3766, + "step": 19721 + }, + { + "epoch": 0.2562783951088537, + "grad_norm": 0.3998667895793915, + "learning_rate": 0.00014877500357426013, + "loss": 1.511, + "step": 19722 + }, + { + "epoch": 0.25629138965276954, + "grad_norm": 0.33287513256073, + "learning_rate": 0.00014877240411234873, + "loss": 1.3826, + "step": 19723 + }, + { + "epoch": 0.25630438419668544, + "grad_norm": 0.5380807518959045, + "learning_rate": 0.00014876980465043738, + "loss": 1.283, + "step": 19724 + }, + { + "epoch": 0.2563173787406013, + "grad_norm": 0.3904716372489929, + "learning_rate": 0.00014876720518852598, + "loss": 1.2334, + "step": 19725 + }, + { + "epoch": 0.2563303732845172, + "grad_norm": 0.36384886503219604, + "learning_rate": 0.0001487646057266146, + "loss": 1.4242, + "step": 19726 + }, + { + "epoch": 0.25634336782843303, + "grad_norm": 0.40154126286506653, + "learning_rate": 0.00014876200626470323, + "loss": 1.4757, + "step": 19727 + }, + { + "epoch": 0.25635636237234893, + "grad_norm": 0.3741087317466736, + "learning_rate": 0.00014875940680279183, + "loss": 1.4073, + "step": 19728 + }, + { + "epoch": 0.2563693569162648, + "grad_norm": 0.3878123164176941, + "learning_rate": 0.00014875680734088045, + "loss": 1.4382, + "step": 19729 + }, + { + "epoch": 0.2563823514601807, + "grad_norm": 0.5249066948890686, + "learning_rate": 0.00014875420787896905, + "loss": 1.4512, + "step": 19730 + }, + { + "epoch": 0.2563953460040965, + "grad_norm": 0.26608163118362427, + "learning_rate": 0.00014875160841705767, + "loss": 1.3231, + "step": 19731 + }, + { + "epoch": 0.2564083405480124, + "grad_norm": 0.2851769030094147, + "learning_rate": 0.0001487490089551463, + "loss": 1.3595, + "step": 19732 + }, + { + "epoch": 0.25642133509192827, + "grad_norm": 0.4020291268825531, + "learning_rate": 0.0001487464094932349, + "loss": 1.4306, + "step": 19733 + }, + { + "epoch": 0.25643432963584417, + "grad_norm": 0.31380996108055115, + "learning_rate": 0.00014874381003132352, + "loss": 1.3461, + "step": 19734 + }, + { + "epoch": 0.25644732417976, + "grad_norm": 0.361942321062088, + "learning_rate": 0.00014874121056941214, + "loss": 1.5764, + "step": 19735 + }, + { + "epoch": 0.2564603187236759, + "grad_norm": 0.3764529228210449, + "learning_rate": 0.00014873861110750077, + "loss": 1.4292, + "step": 19736 + }, + { + "epoch": 0.25647331326759176, + "grad_norm": 0.36036935448646545, + "learning_rate": 0.00014873601164558937, + "loss": 1.4322, + "step": 19737 + }, + { + "epoch": 0.25648630781150766, + "grad_norm": 0.4236028790473938, + "learning_rate": 0.000148733412183678, + "loss": 1.3712, + "step": 19738 + }, + { + "epoch": 0.2564993023554235, + "grad_norm": 0.3441009223461151, + "learning_rate": 0.00014873081272176662, + "loss": 1.2628, + "step": 19739 + }, + { + "epoch": 0.2565122968993394, + "grad_norm": 0.3482554852962494, + "learning_rate": 0.0001487282132598552, + "loss": 1.4261, + "step": 19740 + }, + { + "epoch": 0.25652529144325525, + "grad_norm": 0.32627928256988525, + "learning_rate": 0.00014872561379794384, + "loss": 1.546, + "step": 19741 + }, + { + "epoch": 0.25653828598717116, + "grad_norm": 0.45293572545051575, + "learning_rate": 0.00014872301433603243, + "loss": 1.2624, + "step": 19742 + }, + { + "epoch": 0.256551280531087, + "grad_norm": 0.3377649784088135, + "learning_rate": 0.00014872041487412106, + "loss": 1.3445, + "step": 19743 + }, + { + "epoch": 0.2565642750750029, + "grad_norm": 0.3097911477088928, + "learning_rate": 0.00014871781541220968, + "loss": 1.399, + "step": 19744 + }, + { + "epoch": 0.25657726961891875, + "grad_norm": 0.5099747180938721, + "learning_rate": 0.00014871521595029828, + "loss": 1.4224, + "step": 19745 + }, + { + "epoch": 0.25659026416283465, + "grad_norm": 0.42007213830947876, + "learning_rate": 0.0001487126164883869, + "loss": 1.4047, + "step": 19746 + }, + { + "epoch": 0.2566032587067505, + "grad_norm": 0.4278099536895752, + "learning_rate": 0.00014871001702647553, + "loss": 1.3473, + "step": 19747 + }, + { + "epoch": 0.2566162532506664, + "grad_norm": 0.3885868191719055, + "learning_rate": 0.00014870741756456415, + "loss": 1.3933, + "step": 19748 + }, + { + "epoch": 0.25662924779458224, + "grad_norm": 0.3276718258857727, + "learning_rate": 0.00014870481810265275, + "loss": 1.4491, + "step": 19749 + }, + { + "epoch": 0.25664224233849814, + "grad_norm": 0.5262018442153931, + "learning_rate": 0.00014870221864074138, + "loss": 1.4612, + "step": 19750 + }, + { + "epoch": 0.256655236882414, + "grad_norm": 0.413718044757843, + "learning_rate": 0.00014869961917883, + "loss": 1.189, + "step": 19751 + }, + { + "epoch": 0.2566682314263299, + "grad_norm": 0.405471533536911, + "learning_rate": 0.0001486970197169186, + "loss": 1.5588, + "step": 19752 + }, + { + "epoch": 0.25668122597024573, + "grad_norm": 0.4070466458797455, + "learning_rate": 0.00014869442025500722, + "loss": 1.5992, + "step": 19753 + }, + { + "epoch": 0.25669422051416163, + "grad_norm": 0.4018714427947998, + "learning_rate": 0.00014869182079309582, + "loss": 1.4072, + "step": 19754 + }, + { + "epoch": 0.2567072150580775, + "grad_norm": 0.45151740312576294, + "learning_rate": 0.00014868922133118447, + "loss": 1.3729, + "step": 19755 + }, + { + "epoch": 0.2567202096019934, + "grad_norm": 0.4852622449398041, + "learning_rate": 0.00014868662186927307, + "loss": 1.4663, + "step": 19756 + }, + { + "epoch": 0.2567332041459092, + "grad_norm": 0.3701883554458618, + "learning_rate": 0.00014868402240736167, + "loss": 1.4152, + "step": 19757 + }, + { + "epoch": 0.2567461986898251, + "grad_norm": 0.31140780448913574, + "learning_rate": 0.0001486814229454503, + "loss": 1.3034, + "step": 19758 + }, + { + "epoch": 0.25675919323374097, + "grad_norm": 0.3135972023010254, + "learning_rate": 0.00014867882348353892, + "loss": 1.3016, + "step": 19759 + }, + { + "epoch": 0.25677218777765687, + "grad_norm": 0.29702073335647583, + "learning_rate": 0.00014867622402162754, + "loss": 1.3853, + "step": 19760 + }, + { + "epoch": 0.2567851823215727, + "grad_norm": 0.31660234928131104, + "learning_rate": 0.00014867362455971614, + "loss": 1.4071, + "step": 19761 + }, + { + "epoch": 0.2567981768654886, + "grad_norm": 0.4049093723297119, + "learning_rate": 0.00014867102509780476, + "loss": 1.1623, + "step": 19762 + }, + { + "epoch": 0.25681117140940446, + "grad_norm": 0.4344634711742401, + "learning_rate": 0.00014866842563589339, + "loss": 1.5088, + "step": 19763 + }, + { + "epoch": 0.25682416595332036, + "grad_norm": 0.36794304847717285, + "learning_rate": 0.00014866582617398198, + "loss": 1.2857, + "step": 19764 + }, + { + "epoch": 0.2568371604972362, + "grad_norm": 0.39373883605003357, + "learning_rate": 0.0001486632267120706, + "loss": 1.36, + "step": 19765 + }, + { + "epoch": 0.2568501550411521, + "grad_norm": 0.34790873527526855, + "learning_rate": 0.00014866062725015923, + "loss": 1.1421, + "step": 19766 + }, + { + "epoch": 0.25686314958506795, + "grad_norm": 0.4097789227962494, + "learning_rate": 0.00014865802778824786, + "loss": 1.4323, + "step": 19767 + }, + { + "epoch": 0.25687614412898385, + "grad_norm": 0.2604452073574066, + "learning_rate": 0.00014865542832633645, + "loss": 1.448, + "step": 19768 + }, + { + "epoch": 0.2568891386728997, + "grad_norm": 0.346098393201828, + "learning_rate": 0.00014865282886442505, + "loss": 1.3863, + "step": 19769 + }, + { + "epoch": 0.2569021332168156, + "grad_norm": 0.40293800830841064, + "learning_rate": 0.0001486502294025137, + "loss": 1.4521, + "step": 19770 + }, + { + "epoch": 0.25691512776073144, + "grad_norm": 0.4309069812297821, + "learning_rate": 0.0001486476299406023, + "loss": 1.3924, + "step": 19771 + }, + { + "epoch": 0.25692812230464734, + "grad_norm": 0.33858928084373474, + "learning_rate": 0.00014864503047869093, + "loss": 1.4213, + "step": 19772 + }, + { + "epoch": 0.2569411168485632, + "grad_norm": 0.407690167427063, + "learning_rate": 0.00014864243101677952, + "loss": 1.4275, + "step": 19773 + }, + { + "epoch": 0.2569541113924791, + "grad_norm": 0.3626769483089447, + "learning_rate": 0.00014863983155486815, + "loss": 1.3991, + "step": 19774 + }, + { + "epoch": 0.25696710593639494, + "grad_norm": 0.4655991196632385, + "learning_rate": 0.00014863723209295677, + "loss": 1.4511, + "step": 19775 + }, + { + "epoch": 0.25698010048031084, + "grad_norm": 0.39907339215278625, + "learning_rate": 0.00014863463263104537, + "loss": 1.6319, + "step": 19776 + }, + { + "epoch": 0.2569930950242267, + "grad_norm": 0.40345531702041626, + "learning_rate": 0.000148632033169134, + "loss": 1.5083, + "step": 19777 + }, + { + "epoch": 0.2570060895681426, + "grad_norm": 0.4085056781768799, + "learning_rate": 0.00014862943370722262, + "loss": 1.6034, + "step": 19778 + }, + { + "epoch": 0.2570190841120584, + "grad_norm": 0.3583461046218872, + "learning_rate": 0.00014862683424531124, + "loss": 1.2385, + "step": 19779 + }, + { + "epoch": 0.2570320786559743, + "grad_norm": 0.39973992109298706, + "learning_rate": 0.00014862423478339984, + "loss": 1.318, + "step": 19780 + }, + { + "epoch": 0.2570450731998902, + "grad_norm": 0.41369712352752686, + "learning_rate": 0.00014862163532148844, + "loss": 1.5932, + "step": 19781 + }, + { + "epoch": 0.2570580677438061, + "grad_norm": 0.4177628755569458, + "learning_rate": 0.0001486190358595771, + "loss": 1.5605, + "step": 19782 + }, + { + "epoch": 0.2570710622877219, + "grad_norm": 0.40126335620880127, + "learning_rate": 0.00014861643639766569, + "loss": 1.5701, + "step": 19783 + }, + { + "epoch": 0.2570840568316378, + "grad_norm": 0.4328796863555908, + "learning_rate": 0.0001486138369357543, + "loss": 1.4917, + "step": 19784 + }, + { + "epoch": 0.25709705137555366, + "grad_norm": 0.40425291657447815, + "learning_rate": 0.0001486112374738429, + "loss": 1.3878, + "step": 19785 + }, + { + "epoch": 0.25711004591946957, + "grad_norm": 0.3638814687728882, + "learning_rate": 0.00014860863801193153, + "loss": 1.2323, + "step": 19786 + }, + { + "epoch": 0.2571230404633854, + "grad_norm": 0.42932456731796265, + "learning_rate": 0.00014860603855002016, + "loss": 1.3454, + "step": 19787 + }, + { + "epoch": 0.2571360350073013, + "grad_norm": 0.36770328879356384, + "learning_rate": 0.00014860343908810875, + "loss": 1.252, + "step": 19788 + }, + { + "epoch": 0.25714902955121716, + "grad_norm": 0.2767905592918396, + "learning_rate": 0.00014860083962619738, + "loss": 1.4381, + "step": 19789 + }, + { + "epoch": 0.25716202409513306, + "grad_norm": 0.4000886082649231, + "learning_rate": 0.000148598240164286, + "loss": 1.592, + "step": 19790 + }, + { + "epoch": 0.2571750186390489, + "grad_norm": 0.47979533672332764, + "learning_rate": 0.00014859564070237463, + "loss": 1.474, + "step": 19791 + }, + { + "epoch": 0.2571880131829648, + "grad_norm": 0.3197050988674164, + "learning_rate": 0.00014859304124046323, + "loss": 1.326, + "step": 19792 + }, + { + "epoch": 0.25720100772688065, + "grad_norm": 0.3906276226043701, + "learning_rate": 0.00014859044177855185, + "loss": 1.3318, + "step": 19793 + }, + { + "epoch": 0.25721400227079655, + "grad_norm": 0.4051985740661621, + "learning_rate": 0.00014858784231664047, + "loss": 1.4334, + "step": 19794 + }, + { + "epoch": 0.25722699681471245, + "grad_norm": 0.4129682779312134, + "learning_rate": 0.00014858524285472907, + "loss": 1.3189, + "step": 19795 + }, + { + "epoch": 0.2572399913586283, + "grad_norm": 0.3920605480670929, + "learning_rate": 0.0001485826433928177, + "loss": 1.2422, + "step": 19796 + }, + { + "epoch": 0.2572529859025442, + "grad_norm": 0.26474153995513916, + "learning_rate": 0.0001485800439309063, + "loss": 1.3911, + "step": 19797 + }, + { + "epoch": 0.25726598044646004, + "grad_norm": 0.46777454018592834, + "learning_rate": 0.00014857744446899492, + "loss": 1.381, + "step": 19798 + }, + { + "epoch": 0.25727897499037594, + "grad_norm": 0.36751025915145874, + "learning_rate": 0.00014857484500708354, + "loss": 1.4543, + "step": 19799 + }, + { + "epoch": 0.2572919695342918, + "grad_norm": 0.3731157183647156, + "learning_rate": 0.00014857224554517214, + "loss": 1.4548, + "step": 19800 + }, + { + "epoch": 0.2573049640782077, + "grad_norm": 0.3724712133407593, + "learning_rate": 0.0001485696460832608, + "loss": 1.2211, + "step": 19801 + }, + { + "epoch": 0.25731795862212353, + "grad_norm": 0.43481382727622986, + "learning_rate": 0.0001485670466213494, + "loss": 1.4179, + "step": 19802 + }, + { + "epoch": 0.25733095316603943, + "grad_norm": 0.4335424602031708, + "learning_rate": 0.000148564447159438, + "loss": 1.3901, + "step": 19803 + }, + { + "epoch": 0.2573439477099553, + "grad_norm": 0.4344167709350586, + "learning_rate": 0.0001485618476975266, + "loss": 1.5662, + "step": 19804 + }, + { + "epoch": 0.2573569422538712, + "grad_norm": 0.36935222148895264, + "learning_rate": 0.00014855924823561524, + "loss": 1.3493, + "step": 19805 + }, + { + "epoch": 0.257369936797787, + "grad_norm": 0.2943502962589264, + "learning_rate": 0.00014855664877370386, + "loss": 1.2453, + "step": 19806 + }, + { + "epoch": 0.2573829313417029, + "grad_norm": 0.4090955853462219, + "learning_rate": 0.00014855404931179246, + "loss": 1.2359, + "step": 19807 + }, + { + "epoch": 0.25739592588561877, + "grad_norm": 0.36399662494659424, + "learning_rate": 0.00014855144984988108, + "loss": 1.4555, + "step": 19808 + }, + { + "epoch": 0.25740892042953467, + "grad_norm": 0.4545719027519226, + "learning_rate": 0.0001485488503879697, + "loss": 1.4715, + "step": 19809 + }, + { + "epoch": 0.2574219149734505, + "grad_norm": 0.37993332743644714, + "learning_rate": 0.0001485462509260583, + "loss": 1.5364, + "step": 19810 + }, + { + "epoch": 0.2574349095173664, + "grad_norm": 0.37425246834754944, + "learning_rate": 0.00014854365146414693, + "loss": 1.2401, + "step": 19811 + }, + { + "epoch": 0.25744790406128226, + "grad_norm": 0.4478759467601776, + "learning_rate": 0.00014854105200223553, + "loss": 1.5776, + "step": 19812 + }, + { + "epoch": 0.25746089860519816, + "grad_norm": 0.37398451566696167, + "learning_rate": 0.00014853845254032418, + "loss": 1.3112, + "step": 19813 + }, + { + "epoch": 0.257473893149114, + "grad_norm": 0.4248572587966919, + "learning_rate": 0.00014853585307841277, + "loss": 1.3554, + "step": 19814 + }, + { + "epoch": 0.2574868876930299, + "grad_norm": 0.41222965717315674, + "learning_rate": 0.0001485332536165014, + "loss": 1.3743, + "step": 19815 + }, + { + "epoch": 0.25749988223694575, + "grad_norm": 0.29611456394195557, + "learning_rate": 0.00014853065415459, + "loss": 1.4479, + "step": 19816 + }, + { + "epoch": 0.25751287678086165, + "grad_norm": 0.38925328850746155, + "learning_rate": 0.00014852805469267862, + "loss": 1.1967, + "step": 19817 + }, + { + "epoch": 0.2575258713247775, + "grad_norm": 0.3814679980278015, + "learning_rate": 0.00014852545523076724, + "loss": 1.652, + "step": 19818 + }, + { + "epoch": 0.2575388658686934, + "grad_norm": 0.45444709062576294, + "learning_rate": 0.00014852285576885584, + "loss": 1.4584, + "step": 19819 + }, + { + "epoch": 0.25755186041260925, + "grad_norm": 0.44860517978668213, + "learning_rate": 0.00014852025630694447, + "loss": 1.5225, + "step": 19820 + }, + { + "epoch": 0.25756485495652515, + "grad_norm": 0.3990727961063385, + "learning_rate": 0.0001485176568450331, + "loss": 1.3546, + "step": 19821 + }, + { + "epoch": 0.257577849500441, + "grad_norm": 0.4320986866950989, + "learning_rate": 0.00014851505738312172, + "loss": 1.4597, + "step": 19822 + }, + { + "epoch": 0.2575908440443569, + "grad_norm": 0.46973755955696106, + "learning_rate": 0.0001485124579212103, + "loss": 1.3869, + "step": 19823 + }, + { + "epoch": 0.25760383858827274, + "grad_norm": 0.34555894136428833, + "learning_rate": 0.0001485098584592989, + "loss": 1.2505, + "step": 19824 + }, + { + "epoch": 0.25761683313218864, + "grad_norm": 0.43511444330215454, + "learning_rate": 0.00014850725899738756, + "loss": 1.4422, + "step": 19825 + }, + { + "epoch": 0.2576298276761045, + "grad_norm": 0.35832250118255615, + "learning_rate": 0.00014850465953547616, + "loss": 1.2538, + "step": 19826 + }, + { + "epoch": 0.2576428222200204, + "grad_norm": 0.3831486403942108, + "learning_rate": 0.00014850206007356478, + "loss": 1.3276, + "step": 19827 + }, + { + "epoch": 0.25765581676393623, + "grad_norm": 0.38330164551734924, + "learning_rate": 0.00014849946061165338, + "loss": 1.3, + "step": 19828 + }, + { + "epoch": 0.25766881130785213, + "grad_norm": 0.36269333958625793, + "learning_rate": 0.000148496861149742, + "loss": 1.5511, + "step": 19829 + }, + { + "epoch": 0.257681805851768, + "grad_norm": 0.47035592794418335, + "learning_rate": 0.00014849426168783063, + "loss": 1.6231, + "step": 19830 + }, + { + "epoch": 0.2576948003956839, + "grad_norm": 0.32775580883026123, + "learning_rate": 0.00014849166222591923, + "loss": 1.4256, + "step": 19831 + }, + { + "epoch": 0.2577077949395997, + "grad_norm": 0.3980858325958252, + "learning_rate": 0.00014848906276400785, + "loss": 1.3851, + "step": 19832 + }, + { + "epoch": 0.2577207894835156, + "grad_norm": 0.31603139638900757, + "learning_rate": 0.00014848646330209648, + "loss": 1.3909, + "step": 19833 + }, + { + "epoch": 0.25773378402743147, + "grad_norm": 0.5892931222915649, + "learning_rate": 0.0001484838638401851, + "loss": 1.462, + "step": 19834 + }, + { + "epoch": 0.25774677857134737, + "grad_norm": 0.4033832848072052, + "learning_rate": 0.0001484812643782737, + "loss": 1.603, + "step": 19835 + }, + { + "epoch": 0.2577597731152632, + "grad_norm": 0.4719257950782776, + "learning_rate": 0.00014847866491636232, + "loss": 1.3801, + "step": 19836 + }, + { + "epoch": 0.2577727676591791, + "grad_norm": 0.2986607253551483, + "learning_rate": 0.00014847606545445095, + "loss": 1.3932, + "step": 19837 + }, + { + "epoch": 0.25778576220309496, + "grad_norm": 0.38879287242889404, + "learning_rate": 0.00014847346599253954, + "loss": 1.4757, + "step": 19838 + }, + { + "epoch": 0.25779875674701086, + "grad_norm": 0.36992597579956055, + "learning_rate": 0.00014847086653062817, + "loss": 1.6562, + "step": 19839 + }, + { + "epoch": 0.2578117512909267, + "grad_norm": 0.31835880875587463, + "learning_rate": 0.0001484682670687168, + "loss": 1.4312, + "step": 19840 + }, + { + "epoch": 0.2578247458348426, + "grad_norm": 0.3240465521812439, + "learning_rate": 0.0001484656676068054, + "loss": 1.3491, + "step": 19841 + }, + { + "epoch": 0.25783774037875845, + "grad_norm": 0.5762081146240234, + "learning_rate": 0.00014846306814489402, + "loss": 1.2901, + "step": 19842 + }, + { + "epoch": 0.25785073492267435, + "grad_norm": 0.4779595136642456, + "learning_rate": 0.0001484604686829826, + "loss": 1.3934, + "step": 19843 + }, + { + "epoch": 0.2578637294665902, + "grad_norm": 0.3311115503311157, + "learning_rate": 0.00014845786922107126, + "loss": 1.4032, + "step": 19844 + }, + { + "epoch": 0.2578767240105061, + "grad_norm": 0.3485088646411896, + "learning_rate": 0.00014845526975915986, + "loss": 1.6217, + "step": 19845 + }, + { + "epoch": 0.25788971855442194, + "grad_norm": 0.43883562088012695, + "learning_rate": 0.0001484526702972485, + "loss": 1.5096, + "step": 19846 + }, + { + "epoch": 0.25790271309833784, + "grad_norm": 0.41075778007507324, + "learning_rate": 0.00014845007083533708, + "loss": 1.4138, + "step": 19847 + }, + { + "epoch": 0.2579157076422537, + "grad_norm": 0.39212271571159363, + "learning_rate": 0.0001484474713734257, + "loss": 1.4253, + "step": 19848 + }, + { + "epoch": 0.2579287021861696, + "grad_norm": 0.45918703079223633, + "learning_rate": 0.00014844487191151433, + "loss": 1.4387, + "step": 19849 + }, + { + "epoch": 0.25794169673008543, + "grad_norm": 0.36436140537261963, + "learning_rate": 0.00014844227244960293, + "loss": 1.4624, + "step": 19850 + }, + { + "epoch": 0.25795469127400134, + "grad_norm": 0.36109301447868347, + "learning_rate": 0.00014843967298769155, + "loss": 1.2941, + "step": 19851 + }, + { + "epoch": 0.2579676858179172, + "grad_norm": 0.26176226139068604, + "learning_rate": 0.00014843707352578018, + "loss": 1.3627, + "step": 19852 + }, + { + "epoch": 0.2579806803618331, + "grad_norm": 0.33842501044273376, + "learning_rate": 0.00014843447406386878, + "loss": 1.192, + "step": 19853 + }, + { + "epoch": 0.2579936749057489, + "grad_norm": 0.4383045732975006, + "learning_rate": 0.0001484318746019574, + "loss": 1.4028, + "step": 19854 + }, + { + "epoch": 0.2580066694496648, + "grad_norm": 0.36733677983283997, + "learning_rate": 0.000148429275140046, + "loss": 1.4541, + "step": 19855 + }, + { + "epoch": 0.2580196639935807, + "grad_norm": 0.2952321767807007, + "learning_rate": 0.00014842667567813465, + "loss": 1.35, + "step": 19856 + }, + { + "epoch": 0.2580326585374966, + "grad_norm": 0.4473154842853546, + "learning_rate": 0.00014842407621622325, + "loss": 1.4635, + "step": 19857 + }, + { + "epoch": 0.2580456530814124, + "grad_norm": 0.7557210326194763, + "learning_rate": 0.00014842147675431187, + "loss": 1.3226, + "step": 19858 + }, + { + "epoch": 0.2580586476253283, + "grad_norm": 0.34873929619789124, + "learning_rate": 0.00014841887729240047, + "loss": 1.4924, + "step": 19859 + }, + { + "epoch": 0.25807164216924416, + "grad_norm": 0.3867649734020233, + "learning_rate": 0.0001484162778304891, + "loss": 1.4229, + "step": 19860 + }, + { + "epoch": 0.25808463671316006, + "grad_norm": 0.3477613031864166, + "learning_rate": 0.00014841367836857772, + "loss": 1.3157, + "step": 19861 + }, + { + "epoch": 0.2580976312570759, + "grad_norm": 0.3585951328277588, + "learning_rate": 0.00014841107890666632, + "loss": 1.3972, + "step": 19862 + }, + { + "epoch": 0.2581106258009918, + "grad_norm": 0.40776291489601135, + "learning_rate": 0.00014840847944475494, + "loss": 1.3709, + "step": 19863 + }, + { + "epoch": 0.25812362034490766, + "grad_norm": 0.3480892777442932, + "learning_rate": 0.00014840587998284356, + "loss": 1.1016, + "step": 19864 + }, + { + "epoch": 0.25813661488882356, + "grad_norm": 0.3900054097175598, + "learning_rate": 0.00014840328052093216, + "loss": 1.5474, + "step": 19865 + }, + { + "epoch": 0.2581496094327394, + "grad_norm": 0.42429476976394653, + "learning_rate": 0.0001484006810590208, + "loss": 1.504, + "step": 19866 + }, + { + "epoch": 0.2581626039766553, + "grad_norm": 0.30723729729652405, + "learning_rate": 0.00014839808159710938, + "loss": 1.3978, + "step": 19867 + }, + { + "epoch": 0.25817559852057115, + "grad_norm": 0.4251982569694519, + "learning_rate": 0.00014839548213519804, + "loss": 1.5004, + "step": 19868 + }, + { + "epoch": 0.25818859306448705, + "grad_norm": 0.3205110728740692, + "learning_rate": 0.00014839288267328663, + "loss": 1.6192, + "step": 19869 + }, + { + "epoch": 0.25820158760840295, + "grad_norm": 0.34095466136932373, + "learning_rate": 0.00014839028321137526, + "loss": 1.387, + "step": 19870 + }, + { + "epoch": 0.2582145821523188, + "grad_norm": 0.22871388494968414, + "learning_rate": 0.00014838768374946385, + "loss": 1.2076, + "step": 19871 + }, + { + "epoch": 0.2582275766962347, + "grad_norm": 0.4135155975818634, + "learning_rate": 0.00014838508428755248, + "loss": 1.31, + "step": 19872 + }, + { + "epoch": 0.25824057124015054, + "grad_norm": 0.40966808795928955, + "learning_rate": 0.0001483824848256411, + "loss": 1.3069, + "step": 19873 + }, + { + "epoch": 0.25825356578406644, + "grad_norm": 0.4008606970310211, + "learning_rate": 0.0001483798853637297, + "loss": 1.5713, + "step": 19874 + }, + { + "epoch": 0.2582665603279823, + "grad_norm": 0.3630692958831787, + "learning_rate": 0.00014837728590181835, + "loss": 1.4669, + "step": 19875 + }, + { + "epoch": 0.2582795548718982, + "grad_norm": 0.4633021056652069, + "learning_rate": 0.00014837468643990695, + "loss": 1.297, + "step": 19876 + }, + { + "epoch": 0.25829254941581403, + "grad_norm": 0.36767396330833435, + "learning_rate": 0.00014837208697799557, + "loss": 1.2831, + "step": 19877 + }, + { + "epoch": 0.25830554395972993, + "grad_norm": 0.49484503269195557, + "learning_rate": 0.00014836948751608417, + "loss": 1.3405, + "step": 19878 + }, + { + "epoch": 0.2583185385036458, + "grad_norm": 0.38587021827697754, + "learning_rate": 0.0001483668880541728, + "loss": 1.536, + "step": 19879 + }, + { + "epoch": 0.2583315330475617, + "grad_norm": 0.5180341005325317, + "learning_rate": 0.00014836428859226142, + "loss": 1.4147, + "step": 19880 + }, + { + "epoch": 0.2583445275914775, + "grad_norm": 0.47908255457878113, + "learning_rate": 0.00014836168913035002, + "loss": 1.5092, + "step": 19881 + }, + { + "epoch": 0.2583575221353934, + "grad_norm": 0.4228743314743042, + "learning_rate": 0.00014835908966843864, + "loss": 1.3845, + "step": 19882 + }, + { + "epoch": 0.25837051667930927, + "grad_norm": 0.36705100536346436, + "learning_rate": 0.00014835649020652727, + "loss": 1.6856, + "step": 19883 + }, + { + "epoch": 0.25838351122322517, + "grad_norm": 0.4564898610115051, + "learning_rate": 0.00014835389074461586, + "loss": 1.5801, + "step": 19884 + }, + { + "epoch": 0.258396505767141, + "grad_norm": 0.3571053743362427, + "learning_rate": 0.0001483512912827045, + "loss": 1.4008, + "step": 19885 + }, + { + "epoch": 0.2584095003110569, + "grad_norm": 0.270420104265213, + "learning_rate": 0.0001483486918207931, + "loss": 1.2654, + "step": 19886 + }, + { + "epoch": 0.25842249485497276, + "grad_norm": 0.3955066204071045, + "learning_rate": 0.00014834609235888174, + "loss": 1.5701, + "step": 19887 + }, + { + "epoch": 0.25843548939888866, + "grad_norm": 0.4124511182308197, + "learning_rate": 0.00014834349289697034, + "loss": 1.3096, + "step": 19888 + }, + { + "epoch": 0.2584484839428045, + "grad_norm": 0.3961043655872345, + "learning_rate": 0.00014834089343505896, + "loss": 1.3106, + "step": 19889 + }, + { + "epoch": 0.2584614784867204, + "grad_norm": 0.3272870182991028, + "learning_rate": 0.00014833829397314756, + "loss": 1.2618, + "step": 19890 + }, + { + "epoch": 0.25847447303063625, + "grad_norm": 0.31888192892074585, + "learning_rate": 0.00014833569451123618, + "loss": 1.3127, + "step": 19891 + }, + { + "epoch": 0.25848746757455215, + "grad_norm": 0.43939170241355896, + "learning_rate": 0.0001483330950493248, + "loss": 1.5248, + "step": 19892 + }, + { + "epoch": 0.258500462118468, + "grad_norm": 0.4425123631954193, + "learning_rate": 0.0001483304955874134, + "loss": 1.3877, + "step": 19893 + }, + { + "epoch": 0.2585134566623839, + "grad_norm": 0.363163024187088, + "learning_rate": 0.00014832789612550203, + "loss": 1.4747, + "step": 19894 + }, + { + "epoch": 0.25852645120629975, + "grad_norm": 0.347953736782074, + "learning_rate": 0.00014832529666359065, + "loss": 1.3415, + "step": 19895 + }, + { + "epoch": 0.25853944575021565, + "grad_norm": 0.47498077154159546, + "learning_rate": 0.00014832269720167925, + "loss": 1.5627, + "step": 19896 + }, + { + "epoch": 0.2585524402941315, + "grad_norm": 0.392143577337265, + "learning_rate": 0.00014832009773976787, + "loss": 1.2656, + "step": 19897 + }, + { + "epoch": 0.2585654348380474, + "grad_norm": 0.5318813323974609, + "learning_rate": 0.00014831749827785647, + "loss": 1.5933, + "step": 19898 + }, + { + "epoch": 0.25857842938196324, + "grad_norm": 0.4562159776687622, + "learning_rate": 0.00014831489881594512, + "loss": 1.3945, + "step": 19899 + }, + { + "epoch": 0.25859142392587914, + "grad_norm": 0.3080541491508484, + "learning_rate": 0.00014831229935403372, + "loss": 1.2794, + "step": 19900 + }, + { + "epoch": 0.258604418469795, + "grad_norm": 0.38244175910949707, + "learning_rate": 0.00014830969989212235, + "loss": 1.3745, + "step": 19901 + }, + { + "epoch": 0.2586174130137109, + "grad_norm": 0.36817625164985657, + "learning_rate": 0.00014830710043021094, + "loss": 1.3873, + "step": 19902 + }, + { + "epoch": 0.25863040755762673, + "grad_norm": 0.3452102243900299, + "learning_rate": 0.00014830450096829957, + "loss": 1.4399, + "step": 19903 + }, + { + "epoch": 0.25864340210154263, + "grad_norm": 0.41950735449790955, + "learning_rate": 0.0001483019015063882, + "loss": 1.4612, + "step": 19904 + }, + { + "epoch": 0.2586563966454585, + "grad_norm": 0.3153369426727295, + "learning_rate": 0.0001482993020444768, + "loss": 1.2716, + "step": 19905 + }, + { + "epoch": 0.2586693911893744, + "grad_norm": 0.4001500606536865, + "learning_rate": 0.0001482967025825654, + "loss": 1.3932, + "step": 19906 + }, + { + "epoch": 0.2586823857332902, + "grad_norm": 0.4432854652404785, + "learning_rate": 0.00014829410312065404, + "loss": 1.473, + "step": 19907 + }, + { + "epoch": 0.2586953802772061, + "grad_norm": 0.3478623330593109, + "learning_rate": 0.00014829150365874264, + "loss": 1.2928, + "step": 19908 + }, + { + "epoch": 0.25870837482112197, + "grad_norm": 0.4289468824863434, + "learning_rate": 0.00014828890419683126, + "loss": 1.3406, + "step": 19909 + }, + { + "epoch": 0.25872136936503787, + "grad_norm": 0.41697534918785095, + "learning_rate": 0.00014828630473491988, + "loss": 1.3225, + "step": 19910 + }, + { + "epoch": 0.2587343639089537, + "grad_norm": 0.40126150846481323, + "learning_rate": 0.0001482837052730085, + "loss": 1.4203, + "step": 19911 + }, + { + "epoch": 0.2587473584528696, + "grad_norm": 0.4429757595062256, + "learning_rate": 0.0001482811058110971, + "loss": 1.2334, + "step": 19912 + }, + { + "epoch": 0.25876035299678546, + "grad_norm": 0.3996281921863556, + "learning_rate": 0.00014827850634918573, + "loss": 1.3844, + "step": 19913 + }, + { + "epoch": 0.25877334754070136, + "grad_norm": 0.46142658591270447, + "learning_rate": 0.00014827590688727436, + "loss": 1.5765, + "step": 19914 + }, + { + "epoch": 0.2587863420846172, + "grad_norm": 0.43726444244384766, + "learning_rate": 0.00014827330742536295, + "loss": 1.4275, + "step": 19915 + }, + { + "epoch": 0.2587993366285331, + "grad_norm": 0.3690805435180664, + "learning_rate": 0.00014827070796345158, + "loss": 1.4337, + "step": 19916 + }, + { + "epoch": 0.25881233117244895, + "grad_norm": 0.35200124979019165, + "learning_rate": 0.00014826810850154017, + "loss": 1.2507, + "step": 19917 + }, + { + "epoch": 0.25882532571636485, + "grad_norm": 0.48526984453201294, + "learning_rate": 0.00014826550903962883, + "loss": 1.4562, + "step": 19918 + }, + { + "epoch": 0.2588383202602807, + "grad_norm": 0.28235483169555664, + "learning_rate": 0.00014826290957771742, + "loss": 1.1081, + "step": 19919 + }, + { + "epoch": 0.2588513148041966, + "grad_norm": 0.46671703457832336, + "learning_rate": 0.00014826031011580602, + "loss": 1.3082, + "step": 19920 + }, + { + "epoch": 0.25886430934811244, + "grad_norm": 0.42913997173309326, + "learning_rate": 0.00014825771065389465, + "loss": 1.3711, + "step": 19921 + }, + { + "epoch": 0.25887730389202834, + "grad_norm": 0.43390780687332153, + "learning_rate": 0.00014825511119198327, + "loss": 1.136, + "step": 19922 + }, + { + "epoch": 0.2588902984359442, + "grad_norm": 0.4601288437843323, + "learning_rate": 0.0001482525117300719, + "loss": 1.4765, + "step": 19923 + }, + { + "epoch": 0.2589032929798601, + "grad_norm": 0.3857990801334381, + "learning_rate": 0.0001482499122681605, + "loss": 1.1873, + "step": 19924 + }, + { + "epoch": 0.25891628752377593, + "grad_norm": 0.4280492663383484, + "learning_rate": 0.00014824731280624912, + "loss": 1.3045, + "step": 19925 + }, + { + "epoch": 0.25892928206769183, + "grad_norm": 0.36360520124435425, + "learning_rate": 0.00014824471334433774, + "loss": 1.2402, + "step": 19926 + }, + { + "epoch": 0.2589422766116077, + "grad_norm": 0.32240432500839233, + "learning_rate": 0.00014824211388242634, + "loss": 1.2962, + "step": 19927 + }, + { + "epoch": 0.2589552711555236, + "grad_norm": 0.3881559371948242, + "learning_rate": 0.00014823951442051496, + "loss": 1.4118, + "step": 19928 + }, + { + "epoch": 0.2589682656994394, + "grad_norm": 0.40604355931282043, + "learning_rate": 0.00014823691495860356, + "loss": 1.5554, + "step": 19929 + }, + { + "epoch": 0.2589812602433553, + "grad_norm": 0.3437977135181427, + "learning_rate": 0.0001482343154966922, + "loss": 1.2844, + "step": 19930 + }, + { + "epoch": 0.25899425478727117, + "grad_norm": 0.3995177149772644, + "learning_rate": 0.0001482317160347808, + "loss": 1.6499, + "step": 19931 + }, + { + "epoch": 0.2590072493311871, + "grad_norm": 0.4572320878505707, + "learning_rate": 0.00014822911657286943, + "loss": 1.3128, + "step": 19932 + }, + { + "epoch": 0.2590202438751029, + "grad_norm": 0.37138083577156067, + "learning_rate": 0.00014822651711095803, + "loss": 1.323, + "step": 19933 + }, + { + "epoch": 0.2590332384190188, + "grad_norm": 0.543493390083313, + "learning_rate": 0.00014822391764904666, + "loss": 1.4226, + "step": 19934 + }, + { + "epoch": 0.25904623296293466, + "grad_norm": 0.42756617069244385, + "learning_rate": 0.00014822131818713528, + "loss": 1.5851, + "step": 19935 + }, + { + "epoch": 0.25905922750685056, + "grad_norm": 0.34705233573913574, + "learning_rate": 0.00014821871872522388, + "loss": 1.4637, + "step": 19936 + }, + { + "epoch": 0.2590722220507664, + "grad_norm": 0.4670591652393341, + "learning_rate": 0.0001482161192633125, + "loss": 1.4106, + "step": 19937 + }, + { + "epoch": 0.2590852165946823, + "grad_norm": 0.3798062205314636, + "learning_rate": 0.00014821351980140113, + "loss": 1.6454, + "step": 19938 + }, + { + "epoch": 0.25909821113859816, + "grad_norm": 0.40061119198799133, + "learning_rate": 0.00014821092033948972, + "loss": 1.4775, + "step": 19939 + }, + { + "epoch": 0.25911120568251406, + "grad_norm": 0.4153488576412201, + "learning_rate": 0.00014820832087757835, + "loss": 1.3671, + "step": 19940 + }, + { + "epoch": 0.2591242002264299, + "grad_norm": 0.3565073311328888, + "learning_rate": 0.00014820572141566695, + "loss": 1.6061, + "step": 19941 + }, + { + "epoch": 0.2591371947703458, + "grad_norm": 0.4369582533836365, + "learning_rate": 0.0001482031219537556, + "loss": 1.3608, + "step": 19942 + }, + { + "epoch": 0.25915018931426165, + "grad_norm": 0.5014320611953735, + "learning_rate": 0.0001482005224918442, + "loss": 1.3902, + "step": 19943 + }, + { + "epoch": 0.25916318385817755, + "grad_norm": 0.48925432562828064, + "learning_rate": 0.00014819792302993282, + "loss": 1.4684, + "step": 19944 + }, + { + "epoch": 0.2591761784020934, + "grad_norm": 0.33795610070228577, + "learning_rate": 0.00014819532356802142, + "loss": 1.323, + "step": 19945 + }, + { + "epoch": 0.2591891729460093, + "grad_norm": 0.331469863653183, + "learning_rate": 0.00014819272410611004, + "loss": 1.3025, + "step": 19946 + }, + { + "epoch": 0.2592021674899252, + "grad_norm": 0.35915523767471313, + "learning_rate": 0.00014819012464419867, + "loss": 1.3517, + "step": 19947 + }, + { + "epoch": 0.25921516203384104, + "grad_norm": 0.49079039692878723, + "learning_rate": 0.00014818752518228726, + "loss": 1.5311, + "step": 19948 + }, + { + "epoch": 0.25922815657775694, + "grad_norm": 0.48849114775657654, + "learning_rate": 0.0001481849257203759, + "loss": 1.568, + "step": 19949 + }, + { + "epoch": 0.2592411511216728, + "grad_norm": 0.3905053436756134, + "learning_rate": 0.0001481823262584645, + "loss": 1.3059, + "step": 19950 + }, + { + "epoch": 0.2592541456655887, + "grad_norm": 0.43965134024620056, + "learning_rate": 0.0001481797267965531, + "loss": 1.4275, + "step": 19951 + }, + { + "epoch": 0.25926714020950453, + "grad_norm": 0.3594221770763397, + "learning_rate": 0.00014817712733464173, + "loss": 1.5361, + "step": 19952 + }, + { + "epoch": 0.25928013475342043, + "grad_norm": 0.386545866727829, + "learning_rate": 0.00014817452787273036, + "loss": 1.3875, + "step": 19953 + }, + { + "epoch": 0.2592931292973363, + "grad_norm": 0.33274510502815247, + "learning_rate": 0.00014817192841081898, + "loss": 1.2103, + "step": 19954 + }, + { + "epoch": 0.2593061238412522, + "grad_norm": 0.47073882818222046, + "learning_rate": 0.00014816932894890758, + "loss": 1.4757, + "step": 19955 + }, + { + "epoch": 0.259319118385168, + "grad_norm": 0.394117146730423, + "learning_rate": 0.0001481667294869962, + "loss": 1.4047, + "step": 19956 + }, + { + "epoch": 0.2593321129290839, + "grad_norm": 0.3804217576980591, + "learning_rate": 0.00014816413002508483, + "loss": 1.6032, + "step": 19957 + }, + { + "epoch": 0.25934510747299977, + "grad_norm": 0.3410324454307556, + "learning_rate": 0.00014816153056317343, + "loss": 1.4341, + "step": 19958 + }, + { + "epoch": 0.25935810201691567, + "grad_norm": 0.4197107255458832, + "learning_rate": 0.00014815893110126205, + "loss": 1.3531, + "step": 19959 + }, + { + "epoch": 0.2593710965608315, + "grad_norm": 0.4598172605037689, + "learning_rate": 0.00014815633163935065, + "loss": 1.5031, + "step": 19960 + }, + { + "epoch": 0.2593840911047474, + "grad_norm": 0.33939510583877563, + "learning_rate": 0.0001481537321774393, + "loss": 1.4511, + "step": 19961 + }, + { + "epoch": 0.25939708564866326, + "grad_norm": 0.44218817353248596, + "learning_rate": 0.0001481511327155279, + "loss": 1.4836, + "step": 19962 + }, + { + "epoch": 0.25941008019257916, + "grad_norm": 0.3660185933113098, + "learning_rate": 0.0001481485332536165, + "loss": 1.4618, + "step": 19963 + }, + { + "epoch": 0.259423074736495, + "grad_norm": 0.38963234424591064, + "learning_rate": 0.00014814593379170512, + "loss": 1.5467, + "step": 19964 + }, + { + "epoch": 0.2594360692804109, + "grad_norm": 0.31272071599960327, + "learning_rate": 0.00014814333432979374, + "loss": 1.3584, + "step": 19965 + }, + { + "epoch": 0.25944906382432675, + "grad_norm": 0.41172417998313904, + "learning_rate": 0.00014814073486788237, + "loss": 1.2183, + "step": 19966 + }, + { + "epoch": 0.25946205836824265, + "grad_norm": 0.3378946781158447, + "learning_rate": 0.00014813813540597096, + "loss": 1.2264, + "step": 19967 + }, + { + "epoch": 0.2594750529121585, + "grad_norm": 0.4257126748561859, + "learning_rate": 0.0001481355359440596, + "loss": 1.5005, + "step": 19968 + }, + { + "epoch": 0.2594880474560744, + "grad_norm": 0.3144511580467224, + "learning_rate": 0.00014813293648214821, + "loss": 1.4333, + "step": 19969 + }, + { + "epoch": 0.25950104199999025, + "grad_norm": 0.30821138620376587, + "learning_rate": 0.0001481303370202368, + "loss": 1.2791, + "step": 19970 + }, + { + "epoch": 0.25951403654390615, + "grad_norm": 0.3085781931877136, + "learning_rate": 0.00014812773755832544, + "loss": 1.3864, + "step": 19971 + }, + { + "epoch": 0.259527031087822, + "grad_norm": 0.412639856338501, + "learning_rate": 0.00014812513809641403, + "loss": 1.3098, + "step": 19972 + }, + { + "epoch": 0.2595400256317379, + "grad_norm": 0.33945998549461365, + "learning_rate": 0.00014812253863450268, + "loss": 1.4027, + "step": 19973 + }, + { + "epoch": 0.25955302017565374, + "grad_norm": 0.38900020718574524, + "learning_rate": 0.00014811993917259128, + "loss": 1.323, + "step": 19974 + }, + { + "epoch": 0.25956601471956964, + "grad_norm": 0.3130693733692169, + "learning_rate": 0.00014811733971067988, + "loss": 1.5209, + "step": 19975 + }, + { + "epoch": 0.2595790092634855, + "grad_norm": 0.5098480582237244, + "learning_rate": 0.0001481147402487685, + "loss": 1.3622, + "step": 19976 + }, + { + "epoch": 0.2595920038074014, + "grad_norm": 0.3957809507846832, + "learning_rate": 0.00014811214078685713, + "loss": 1.1881, + "step": 19977 + }, + { + "epoch": 0.25960499835131723, + "grad_norm": 0.36697855591773987, + "learning_rate": 0.00014810954132494575, + "loss": 1.2473, + "step": 19978 + }, + { + "epoch": 0.25961799289523313, + "grad_norm": 0.5590858459472656, + "learning_rate": 0.00014810694186303435, + "loss": 1.3385, + "step": 19979 + }, + { + "epoch": 0.259630987439149, + "grad_norm": 0.4463638365268707, + "learning_rate": 0.00014810434240112297, + "loss": 1.4433, + "step": 19980 + }, + { + "epoch": 0.2596439819830649, + "grad_norm": 0.35455814003944397, + "learning_rate": 0.0001481017429392116, + "loss": 1.3965, + "step": 19981 + }, + { + "epoch": 0.2596569765269807, + "grad_norm": 0.3963063657283783, + "learning_rate": 0.0001480991434773002, + "loss": 1.5104, + "step": 19982 + }, + { + "epoch": 0.2596699710708966, + "grad_norm": 0.33794882893562317, + "learning_rate": 0.00014809654401538882, + "loss": 1.3446, + "step": 19983 + }, + { + "epoch": 0.25968296561481247, + "grad_norm": 0.3056904375553131, + "learning_rate": 0.00014809394455347745, + "loss": 1.1081, + "step": 19984 + }, + { + "epoch": 0.25969596015872837, + "grad_norm": 0.3665129840373993, + "learning_rate": 0.00014809134509156607, + "loss": 1.5664, + "step": 19985 + }, + { + "epoch": 0.2597089547026442, + "grad_norm": 0.6793163418769836, + "learning_rate": 0.00014808874562965467, + "loss": 1.4859, + "step": 19986 + }, + { + "epoch": 0.2597219492465601, + "grad_norm": 0.3136841952800751, + "learning_rate": 0.00014808614616774326, + "loss": 1.2335, + "step": 19987 + }, + { + "epoch": 0.25973494379047596, + "grad_norm": 0.3115212619304657, + "learning_rate": 0.00014808354670583192, + "loss": 1.3646, + "step": 19988 + }, + { + "epoch": 0.25974793833439186, + "grad_norm": 0.3132984936237335, + "learning_rate": 0.00014808094724392051, + "loss": 1.4162, + "step": 19989 + }, + { + "epoch": 0.2597609328783077, + "grad_norm": 0.5420405864715576, + "learning_rate": 0.00014807834778200914, + "loss": 1.3965, + "step": 19990 + }, + { + "epoch": 0.2597739274222236, + "grad_norm": 0.3387608528137207, + "learning_rate": 0.00014807574832009774, + "loss": 1.471, + "step": 19991 + }, + { + "epoch": 0.25978692196613945, + "grad_norm": 0.45799100399017334, + "learning_rate": 0.00014807314885818636, + "loss": 1.4963, + "step": 19992 + }, + { + "epoch": 0.25979991651005535, + "grad_norm": 0.37677329778671265, + "learning_rate": 0.00014807054939627498, + "loss": 1.396, + "step": 19993 + }, + { + "epoch": 0.2598129110539712, + "grad_norm": 0.436212956905365, + "learning_rate": 0.00014806794993436358, + "loss": 1.5221, + "step": 19994 + }, + { + "epoch": 0.2598259055978871, + "grad_norm": 0.3971352279186249, + "learning_rate": 0.0001480653504724522, + "loss": 1.3441, + "step": 19995 + }, + { + "epoch": 0.25983890014180294, + "grad_norm": 0.4042627513408661, + "learning_rate": 0.00014806275101054083, + "loss": 1.3809, + "step": 19996 + }, + { + "epoch": 0.25985189468571884, + "grad_norm": 0.37473365664482117, + "learning_rate": 0.00014806015154862946, + "loss": 1.4999, + "step": 19997 + }, + { + "epoch": 0.2598648892296347, + "grad_norm": 0.42937174439430237, + "learning_rate": 0.00014805755208671805, + "loss": 1.5074, + "step": 19998 + }, + { + "epoch": 0.2598778837735506, + "grad_norm": 0.4152386486530304, + "learning_rate": 0.00014805495262480668, + "loss": 1.3692, + "step": 19999 + }, + { + "epoch": 0.25989087831746643, + "grad_norm": 0.3838444948196411, + "learning_rate": 0.0001480523531628953, + "loss": 1.2714, + "step": 20000 + }, + { + "epoch": 0.25990387286138233, + "grad_norm": 0.3570510447025299, + "learning_rate": 0.0001480497537009839, + "loss": 1.2544, + "step": 20001 + }, + { + "epoch": 0.2599168674052982, + "grad_norm": 0.3934921324253082, + "learning_rate": 0.00014804715423907252, + "loss": 1.4345, + "step": 20002 + }, + { + "epoch": 0.2599298619492141, + "grad_norm": 0.3592262268066406, + "learning_rate": 0.00014804455477716112, + "loss": 1.3459, + "step": 20003 + }, + { + "epoch": 0.2599428564931299, + "grad_norm": 0.37760719656944275, + "learning_rate": 0.00014804195531524975, + "loss": 1.4581, + "step": 20004 + }, + { + "epoch": 0.2599558510370458, + "grad_norm": 0.431679368019104, + "learning_rate": 0.00014803935585333837, + "loss": 1.4971, + "step": 20005 + }, + { + "epoch": 0.25996884558096167, + "grad_norm": 0.35606569051742554, + "learning_rate": 0.00014803675639142697, + "loss": 1.3987, + "step": 20006 + }, + { + "epoch": 0.2599818401248776, + "grad_norm": 0.39354535937309265, + "learning_rate": 0.0001480341569295156, + "loss": 1.4373, + "step": 20007 + }, + { + "epoch": 0.2599948346687934, + "grad_norm": 0.41023895144462585, + "learning_rate": 0.00014803155746760422, + "loss": 1.6971, + "step": 20008 + }, + { + "epoch": 0.2600078292127093, + "grad_norm": 0.4600581228733063, + "learning_rate": 0.00014802895800569284, + "loss": 1.6419, + "step": 20009 + }, + { + "epoch": 0.26002082375662516, + "grad_norm": 0.4285810589790344, + "learning_rate": 0.00014802635854378144, + "loss": 1.5576, + "step": 20010 + }, + { + "epoch": 0.26003381830054106, + "grad_norm": 0.4394996464252472, + "learning_rate": 0.00014802375908187006, + "loss": 1.5056, + "step": 20011 + }, + { + "epoch": 0.2600468128444569, + "grad_norm": 0.3718104660511017, + "learning_rate": 0.0001480211596199587, + "loss": 1.4, + "step": 20012 + }, + { + "epoch": 0.2600598073883728, + "grad_norm": 0.4083999991416931, + "learning_rate": 0.00014801856015804728, + "loss": 1.3433, + "step": 20013 + }, + { + "epoch": 0.26007280193228866, + "grad_norm": 0.6538832783699036, + "learning_rate": 0.0001480159606961359, + "loss": 1.4879, + "step": 20014 + }, + { + "epoch": 0.26008579647620456, + "grad_norm": 0.5382021069526672, + "learning_rate": 0.0001480133612342245, + "loss": 1.4073, + "step": 20015 + }, + { + "epoch": 0.2600987910201204, + "grad_norm": 0.41146913170814514, + "learning_rate": 0.00014801076177231316, + "loss": 1.3624, + "step": 20016 + }, + { + "epoch": 0.2601117855640363, + "grad_norm": 0.3455566167831421, + "learning_rate": 0.00014800816231040176, + "loss": 1.4985, + "step": 20017 + }, + { + "epoch": 0.26012478010795215, + "grad_norm": 0.4496353566646576, + "learning_rate": 0.00014800556284849035, + "loss": 1.4535, + "step": 20018 + }, + { + "epoch": 0.26013777465186805, + "grad_norm": 0.4191818833351135, + "learning_rate": 0.00014800296338657898, + "loss": 1.5009, + "step": 20019 + }, + { + "epoch": 0.2601507691957839, + "grad_norm": 0.35791489481925964, + "learning_rate": 0.0001480003639246676, + "loss": 1.4575, + "step": 20020 + }, + { + "epoch": 0.2601637637396998, + "grad_norm": 0.43893197178840637, + "learning_rate": 0.00014799776446275623, + "loss": 1.309, + "step": 20021 + }, + { + "epoch": 0.2601767582836157, + "grad_norm": 0.49037161469459534, + "learning_rate": 0.00014799516500084482, + "loss": 1.3291, + "step": 20022 + }, + { + "epoch": 0.26018975282753154, + "grad_norm": 0.38623711466789246, + "learning_rate": 0.00014799256553893345, + "loss": 1.2752, + "step": 20023 + }, + { + "epoch": 0.26020274737144744, + "grad_norm": 0.42128702998161316, + "learning_rate": 0.00014798996607702207, + "loss": 1.3228, + "step": 20024 + }, + { + "epoch": 0.2602157419153633, + "grad_norm": 0.36215609312057495, + "learning_rate": 0.00014798736661511067, + "loss": 1.5077, + "step": 20025 + }, + { + "epoch": 0.2602287364592792, + "grad_norm": 0.5817545652389526, + "learning_rate": 0.0001479847671531993, + "loss": 1.5628, + "step": 20026 + }, + { + "epoch": 0.26024173100319503, + "grad_norm": 0.41502395272254944, + "learning_rate": 0.00014798216769128792, + "loss": 1.465, + "step": 20027 + }, + { + "epoch": 0.26025472554711093, + "grad_norm": 0.43040773272514343, + "learning_rate": 0.00014797956822937654, + "loss": 1.4436, + "step": 20028 + }, + { + "epoch": 0.2602677200910268, + "grad_norm": 0.3085106313228607, + "learning_rate": 0.00014797696876746514, + "loss": 1.3371, + "step": 20029 + }, + { + "epoch": 0.2602807146349427, + "grad_norm": 0.3936407268047333, + "learning_rate": 0.00014797436930555374, + "loss": 1.3838, + "step": 20030 + }, + { + "epoch": 0.2602937091788585, + "grad_norm": 0.35208559036254883, + "learning_rate": 0.0001479717698436424, + "loss": 1.4223, + "step": 20031 + }, + { + "epoch": 0.2603067037227744, + "grad_norm": 0.465328574180603, + "learning_rate": 0.000147969170381731, + "loss": 1.287, + "step": 20032 + }, + { + "epoch": 0.26031969826669027, + "grad_norm": 0.37656769156455994, + "learning_rate": 0.0001479665709198196, + "loss": 1.295, + "step": 20033 + }, + { + "epoch": 0.26033269281060617, + "grad_norm": 0.34409090876579285, + "learning_rate": 0.0001479639714579082, + "loss": 1.2714, + "step": 20034 + }, + { + "epoch": 0.260345687354522, + "grad_norm": 0.3560160994529724, + "learning_rate": 0.00014796137199599683, + "loss": 1.439, + "step": 20035 + }, + { + "epoch": 0.2603586818984379, + "grad_norm": 0.3348861634731293, + "learning_rate": 0.00014795877253408546, + "loss": 1.3228, + "step": 20036 + }, + { + "epoch": 0.26037167644235376, + "grad_norm": 0.37119099497795105, + "learning_rate": 0.00014795617307217406, + "loss": 1.1904, + "step": 20037 + }, + { + "epoch": 0.26038467098626966, + "grad_norm": 0.4423973858356476, + "learning_rate": 0.00014795357361026268, + "loss": 1.5895, + "step": 20038 + }, + { + "epoch": 0.2603976655301855, + "grad_norm": 0.44404786825180054, + "learning_rate": 0.0001479509741483513, + "loss": 1.4094, + "step": 20039 + }, + { + "epoch": 0.2604106600741014, + "grad_norm": 0.4142516553401947, + "learning_rate": 0.00014794837468643993, + "loss": 1.3849, + "step": 20040 + }, + { + "epoch": 0.26042365461801725, + "grad_norm": 0.4309588372707367, + "learning_rate": 0.00014794577522452853, + "loss": 1.5008, + "step": 20041 + }, + { + "epoch": 0.26043664916193315, + "grad_norm": 0.45189225673675537, + "learning_rate": 0.00014794317576261712, + "loss": 1.3865, + "step": 20042 + }, + { + "epoch": 0.260449643705849, + "grad_norm": 0.41932713985443115, + "learning_rate": 0.00014794057630070578, + "loss": 1.4993, + "step": 20043 + }, + { + "epoch": 0.2604626382497649, + "grad_norm": 0.7539716958999634, + "learning_rate": 0.00014793797683879437, + "loss": 1.4877, + "step": 20044 + }, + { + "epoch": 0.26047563279368074, + "grad_norm": 0.4239988327026367, + "learning_rate": 0.000147935377376883, + "loss": 1.5216, + "step": 20045 + }, + { + "epoch": 0.26048862733759665, + "grad_norm": 0.4229147136211395, + "learning_rate": 0.0001479327779149716, + "loss": 1.3387, + "step": 20046 + }, + { + "epoch": 0.2605016218815125, + "grad_norm": 0.3592168390750885, + "learning_rate": 0.00014793017845306022, + "loss": 1.2966, + "step": 20047 + }, + { + "epoch": 0.2605146164254284, + "grad_norm": 0.3860403299331665, + "learning_rate": 0.00014792757899114884, + "loss": 1.6332, + "step": 20048 + }, + { + "epoch": 0.26052761096934424, + "grad_norm": 0.4260939359664917, + "learning_rate": 0.00014792497952923744, + "loss": 1.579, + "step": 20049 + }, + { + "epoch": 0.26054060551326014, + "grad_norm": 0.41257718205451965, + "learning_rate": 0.00014792238006732607, + "loss": 1.3625, + "step": 20050 + }, + { + "epoch": 0.260553600057176, + "grad_norm": 0.3556547462940216, + "learning_rate": 0.0001479197806054147, + "loss": 1.3959, + "step": 20051 + }, + { + "epoch": 0.2605665946010919, + "grad_norm": 0.3657892346382141, + "learning_rate": 0.00014791718114350331, + "loss": 1.5483, + "step": 20052 + }, + { + "epoch": 0.26057958914500773, + "grad_norm": 0.3173982501029968, + "learning_rate": 0.0001479145816815919, + "loss": 1.5487, + "step": 20053 + }, + { + "epoch": 0.26059258368892363, + "grad_norm": 0.4258553385734558, + "learning_rate": 0.00014791198221968054, + "loss": 1.6506, + "step": 20054 + }, + { + "epoch": 0.2606055782328395, + "grad_norm": 0.46200472116470337, + "learning_rate": 0.00014790938275776916, + "loss": 1.2226, + "step": 20055 + }, + { + "epoch": 0.2606185727767554, + "grad_norm": 0.4142913818359375, + "learning_rate": 0.00014790678329585776, + "loss": 1.37, + "step": 20056 + }, + { + "epoch": 0.2606315673206712, + "grad_norm": 0.4041662812232971, + "learning_rate": 0.00014790418383394638, + "loss": 1.5098, + "step": 20057 + }, + { + "epoch": 0.2606445618645871, + "grad_norm": 0.29881346225738525, + "learning_rate": 0.000147901584372035, + "loss": 1.2866, + "step": 20058 + }, + { + "epoch": 0.26065755640850297, + "grad_norm": 0.37747687101364136, + "learning_rate": 0.0001478989849101236, + "loss": 1.4203, + "step": 20059 + }, + { + "epoch": 0.26067055095241887, + "grad_norm": 0.4450613558292389, + "learning_rate": 0.00014789638544821223, + "loss": 1.6244, + "step": 20060 + }, + { + "epoch": 0.2606835454963347, + "grad_norm": 0.5059861540794373, + "learning_rate": 0.00014789378598630083, + "loss": 1.5656, + "step": 20061 + }, + { + "epoch": 0.2606965400402506, + "grad_norm": 0.3857816755771637, + "learning_rate": 0.00014789118652438948, + "loss": 1.5341, + "step": 20062 + }, + { + "epoch": 0.26070953458416646, + "grad_norm": 0.505658745765686, + "learning_rate": 0.00014788858706247808, + "loss": 1.4786, + "step": 20063 + }, + { + "epoch": 0.26072252912808236, + "grad_norm": 0.3507513701915741, + "learning_rate": 0.0001478859876005667, + "loss": 1.3486, + "step": 20064 + }, + { + "epoch": 0.2607355236719982, + "grad_norm": 0.3892856240272522, + "learning_rate": 0.0001478833881386553, + "loss": 1.3781, + "step": 20065 + }, + { + "epoch": 0.2607485182159141, + "grad_norm": 0.44830596446990967, + "learning_rate": 0.00014788078867674392, + "loss": 1.3686, + "step": 20066 + }, + { + "epoch": 0.26076151275982995, + "grad_norm": 0.3782263994216919, + "learning_rate": 0.00014787818921483255, + "loss": 1.4312, + "step": 20067 + }, + { + "epoch": 0.26077450730374585, + "grad_norm": 0.5403381586074829, + "learning_rate": 0.00014787558975292114, + "loss": 1.3026, + "step": 20068 + }, + { + "epoch": 0.2607875018476617, + "grad_norm": 0.6702861189842224, + "learning_rate": 0.00014787299029100977, + "loss": 1.5058, + "step": 20069 + }, + { + "epoch": 0.2608004963915776, + "grad_norm": 0.34010711312294006, + "learning_rate": 0.0001478703908290984, + "loss": 1.5275, + "step": 20070 + }, + { + "epoch": 0.26081349093549344, + "grad_norm": 0.4390709400177002, + "learning_rate": 0.000147867791367187, + "loss": 1.3433, + "step": 20071 + }, + { + "epoch": 0.26082648547940934, + "grad_norm": 0.4154433012008667, + "learning_rate": 0.00014786519190527561, + "loss": 1.5328, + "step": 20072 + }, + { + "epoch": 0.2608394800233252, + "grad_norm": 0.543128252029419, + "learning_rate": 0.0001478625924433642, + "loss": 1.5487, + "step": 20073 + }, + { + "epoch": 0.2608524745672411, + "grad_norm": 0.43565601110458374, + "learning_rate": 0.00014785999298145286, + "loss": 1.5526, + "step": 20074 + }, + { + "epoch": 0.26086546911115693, + "grad_norm": 0.4094981849193573, + "learning_rate": 0.00014785739351954146, + "loss": 1.2936, + "step": 20075 + }, + { + "epoch": 0.26087846365507283, + "grad_norm": 0.415495365858078, + "learning_rate": 0.00014785479405763009, + "loss": 1.3533, + "step": 20076 + }, + { + "epoch": 0.2608914581989887, + "grad_norm": 0.38112905621528625, + "learning_rate": 0.00014785219459571868, + "loss": 1.4268, + "step": 20077 + }, + { + "epoch": 0.2609044527429046, + "grad_norm": 0.34138205647468567, + "learning_rate": 0.0001478495951338073, + "loss": 1.2208, + "step": 20078 + }, + { + "epoch": 0.2609174472868204, + "grad_norm": 0.43928828835487366, + "learning_rate": 0.00014784699567189593, + "loss": 1.4356, + "step": 20079 + }, + { + "epoch": 0.2609304418307363, + "grad_norm": 0.33731186389923096, + "learning_rate": 0.00014784439620998453, + "loss": 1.2892, + "step": 20080 + }, + { + "epoch": 0.26094343637465217, + "grad_norm": 0.44879165291786194, + "learning_rate": 0.00014784179674807315, + "loss": 1.5041, + "step": 20081 + }, + { + "epoch": 0.26095643091856807, + "grad_norm": 0.4550076723098755, + "learning_rate": 0.00014783919728616178, + "loss": 1.4732, + "step": 20082 + }, + { + "epoch": 0.2609694254624839, + "grad_norm": 0.4142514765262604, + "learning_rate": 0.0001478365978242504, + "loss": 1.5805, + "step": 20083 + }, + { + "epoch": 0.2609824200063998, + "grad_norm": 0.5202316045761108, + "learning_rate": 0.000147833998362339, + "loss": 1.2799, + "step": 20084 + }, + { + "epoch": 0.26099541455031566, + "grad_norm": 0.36545681953430176, + "learning_rate": 0.0001478313989004276, + "loss": 1.4073, + "step": 20085 + }, + { + "epoch": 0.26100840909423156, + "grad_norm": 0.42346805334091187, + "learning_rate": 0.00014782879943851625, + "loss": 1.5625, + "step": 20086 + }, + { + "epoch": 0.2610214036381474, + "grad_norm": 0.41787809133529663, + "learning_rate": 0.00014782619997660485, + "loss": 1.3216, + "step": 20087 + }, + { + "epoch": 0.2610343981820633, + "grad_norm": 0.3913102447986603, + "learning_rate": 0.00014782360051469347, + "loss": 1.3677, + "step": 20088 + }, + { + "epoch": 0.26104739272597915, + "grad_norm": 0.46991586685180664, + "learning_rate": 0.00014782100105278207, + "loss": 1.4767, + "step": 20089 + }, + { + "epoch": 0.26106038726989506, + "grad_norm": 0.4853183925151825, + "learning_rate": 0.0001478184015908707, + "loss": 1.5355, + "step": 20090 + }, + { + "epoch": 0.2610733818138109, + "grad_norm": 0.36936235427856445, + "learning_rate": 0.00014781580212895932, + "loss": 1.5055, + "step": 20091 + }, + { + "epoch": 0.2610863763577268, + "grad_norm": 0.38284045457839966, + "learning_rate": 0.00014781320266704791, + "loss": 1.4527, + "step": 20092 + }, + { + "epoch": 0.26109937090164265, + "grad_norm": 0.42419642210006714, + "learning_rate": 0.00014781060320513654, + "loss": 1.4127, + "step": 20093 + }, + { + "epoch": 0.26111236544555855, + "grad_norm": 0.42717838287353516, + "learning_rate": 0.00014780800374322516, + "loss": 1.3308, + "step": 20094 + }, + { + "epoch": 0.2611253599894744, + "grad_norm": 0.463407427072525, + "learning_rate": 0.0001478054042813138, + "loss": 1.4209, + "step": 20095 + }, + { + "epoch": 0.2611383545333903, + "grad_norm": 0.401685506105423, + "learning_rate": 0.00014780280481940239, + "loss": 1.5169, + "step": 20096 + }, + { + "epoch": 0.26115134907730614, + "grad_norm": 0.37016400694847107, + "learning_rate": 0.000147800205357491, + "loss": 1.5556, + "step": 20097 + }, + { + "epoch": 0.26116434362122204, + "grad_norm": 0.37862780690193176, + "learning_rate": 0.00014779760589557963, + "loss": 1.3738, + "step": 20098 + }, + { + "epoch": 0.26117733816513794, + "grad_norm": 0.48829784989356995, + "learning_rate": 0.00014779500643366823, + "loss": 1.602, + "step": 20099 + }, + { + "epoch": 0.2611903327090538, + "grad_norm": 0.3443527817726135, + "learning_rate": 0.00014779240697175686, + "loss": 1.414, + "step": 20100 + }, + { + "epoch": 0.2612033272529697, + "grad_norm": 0.36224499344825745, + "learning_rate": 0.00014778980750984548, + "loss": 1.4077, + "step": 20101 + }, + { + "epoch": 0.26121632179688553, + "grad_norm": 0.37000930309295654, + "learning_rate": 0.00014778720804793408, + "loss": 1.488, + "step": 20102 + }, + { + "epoch": 0.26122931634080143, + "grad_norm": 0.45107078552246094, + "learning_rate": 0.0001477846085860227, + "loss": 1.5106, + "step": 20103 + }, + { + "epoch": 0.2612423108847173, + "grad_norm": 0.31218990683555603, + "learning_rate": 0.0001477820091241113, + "loss": 1.4386, + "step": 20104 + }, + { + "epoch": 0.2612553054286332, + "grad_norm": 0.41558870673179626, + "learning_rate": 0.00014777940966219995, + "loss": 1.3468, + "step": 20105 + }, + { + "epoch": 0.261268299972549, + "grad_norm": 0.4044245779514313, + "learning_rate": 0.00014777681020028855, + "loss": 1.4517, + "step": 20106 + }, + { + "epoch": 0.2612812945164649, + "grad_norm": 0.3682188093662262, + "learning_rate": 0.00014777421073837717, + "loss": 1.4658, + "step": 20107 + }, + { + "epoch": 0.26129428906038077, + "grad_norm": 0.3466509282588959, + "learning_rate": 0.00014777161127646577, + "loss": 1.2332, + "step": 20108 + }, + { + "epoch": 0.26130728360429667, + "grad_norm": 0.4850592613220215, + "learning_rate": 0.0001477690118145544, + "loss": 1.4182, + "step": 20109 + }, + { + "epoch": 0.2613202781482125, + "grad_norm": 0.3551687002182007, + "learning_rate": 0.00014776641235264302, + "loss": 1.3957, + "step": 20110 + }, + { + "epoch": 0.2613332726921284, + "grad_norm": 0.4125869572162628, + "learning_rate": 0.00014776381289073162, + "loss": 1.3972, + "step": 20111 + }, + { + "epoch": 0.26134626723604426, + "grad_norm": 0.3912777602672577, + "learning_rate": 0.00014776121342882024, + "loss": 1.4743, + "step": 20112 + }, + { + "epoch": 0.26135926177996016, + "grad_norm": 0.26863548159599304, + "learning_rate": 0.00014775861396690887, + "loss": 1.4372, + "step": 20113 + }, + { + "epoch": 0.261372256323876, + "grad_norm": 0.4019676446914673, + "learning_rate": 0.00014775601450499746, + "loss": 1.2715, + "step": 20114 + }, + { + "epoch": 0.2613852508677919, + "grad_norm": 0.4471684396266937, + "learning_rate": 0.0001477534150430861, + "loss": 1.46, + "step": 20115 + }, + { + "epoch": 0.26139824541170775, + "grad_norm": 0.4223073720932007, + "learning_rate": 0.00014775081558117468, + "loss": 1.3325, + "step": 20116 + }, + { + "epoch": 0.26141123995562365, + "grad_norm": 0.36810413002967834, + "learning_rate": 0.00014774821611926334, + "loss": 1.3726, + "step": 20117 + }, + { + "epoch": 0.2614242344995395, + "grad_norm": 0.5159755945205688, + "learning_rate": 0.00014774561665735193, + "loss": 1.546, + "step": 20118 + }, + { + "epoch": 0.2614372290434554, + "grad_norm": 0.4350285530090332, + "learning_rate": 0.00014774301719544056, + "loss": 1.5419, + "step": 20119 + }, + { + "epoch": 0.26145022358737124, + "grad_norm": 0.3918091058731079, + "learning_rate": 0.00014774041773352916, + "loss": 1.332, + "step": 20120 + }, + { + "epoch": 0.26146321813128715, + "grad_norm": 0.42419981956481934, + "learning_rate": 0.00014773781827161778, + "loss": 1.5422, + "step": 20121 + }, + { + "epoch": 0.261476212675203, + "grad_norm": 0.5650913119316101, + "learning_rate": 0.0001477352188097064, + "loss": 1.435, + "step": 20122 + }, + { + "epoch": 0.2614892072191189, + "grad_norm": 0.3097410798072815, + "learning_rate": 0.000147732619347795, + "loss": 1.2515, + "step": 20123 + }, + { + "epoch": 0.26150220176303474, + "grad_norm": 0.42348599433898926, + "learning_rate": 0.00014773001988588363, + "loss": 1.3592, + "step": 20124 + }, + { + "epoch": 0.26151519630695064, + "grad_norm": 0.3523194193840027, + "learning_rate": 0.00014772742042397225, + "loss": 1.5306, + "step": 20125 + }, + { + "epoch": 0.2615281908508665, + "grad_norm": 0.4342210292816162, + "learning_rate": 0.00014772482096206085, + "loss": 1.435, + "step": 20126 + }, + { + "epoch": 0.2615411853947824, + "grad_norm": 0.3992953598499298, + "learning_rate": 0.00014772222150014947, + "loss": 1.515, + "step": 20127 + }, + { + "epoch": 0.26155417993869823, + "grad_norm": 0.4150521457195282, + "learning_rate": 0.00014771962203823807, + "loss": 1.4629, + "step": 20128 + }, + { + "epoch": 0.26156717448261413, + "grad_norm": 0.37494733929634094, + "learning_rate": 0.00014771702257632672, + "loss": 1.5302, + "step": 20129 + }, + { + "epoch": 0.26158016902653, + "grad_norm": 0.33371207118034363, + "learning_rate": 0.00014771442311441532, + "loss": 1.4883, + "step": 20130 + }, + { + "epoch": 0.2615931635704459, + "grad_norm": 0.416979044675827, + "learning_rate": 0.00014771182365250394, + "loss": 1.2736, + "step": 20131 + }, + { + "epoch": 0.2616061581143617, + "grad_norm": 0.4367213845252991, + "learning_rate": 0.00014770922419059254, + "loss": 1.4821, + "step": 20132 + }, + { + "epoch": 0.2616191526582776, + "grad_norm": 0.4128398597240448, + "learning_rate": 0.00014770662472868117, + "loss": 1.6699, + "step": 20133 + }, + { + "epoch": 0.26163214720219347, + "grad_norm": 0.3529261648654938, + "learning_rate": 0.0001477040252667698, + "loss": 1.3901, + "step": 20134 + }, + { + "epoch": 0.26164514174610937, + "grad_norm": 0.4329010844230652, + "learning_rate": 0.0001477014258048584, + "loss": 1.4838, + "step": 20135 + }, + { + "epoch": 0.2616581362900252, + "grad_norm": 0.46771135926246643, + "learning_rate": 0.00014769882634294704, + "loss": 1.4736, + "step": 20136 + }, + { + "epoch": 0.2616711308339411, + "grad_norm": 0.30786871910095215, + "learning_rate": 0.00014769622688103564, + "loss": 1.2045, + "step": 20137 + }, + { + "epoch": 0.26168412537785696, + "grad_norm": 0.3756197988986969, + "learning_rate": 0.00014769362741912426, + "loss": 1.3288, + "step": 20138 + }, + { + "epoch": 0.26169711992177286, + "grad_norm": 0.4581568241119385, + "learning_rate": 0.00014769102795721286, + "loss": 1.4416, + "step": 20139 + }, + { + "epoch": 0.2617101144656887, + "grad_norm": 0.37867748737335205, + "learning_rate": 0.00014768842849530148, + "loss": 1.4914, + "step": 20140 + }, + { + "epoch": 0.2617231090096046, + "grad_norm": 0.36453431844711304, + "learning_rate": 0.0001476858290333901, + "loss": 1.3458, + "step": 20141 + }, + { + "epoch": 0.26173610355352045, + "grad_norm": 0.4035639762878418, + "learning_rate": 0.0001476832295714787, + "loss": 1.4142, + "step": 20142 + }, + { + "epoch": 0.26174909809743635, + "grad_norm": 0.34680575132369995, + "learning_rate": 0.00014768063010956733, + "loss": 1.4168, + "step": 20143 + }, + { + "epoch": 0.2617620926413522, + "grad_norm": 0.28573501110076904, + "learning_rate": 0.00014767803064765595, + "loss": 1.2719, + "step": 20144 + }, + { + "epoch": 0.2617750871852681, + "grad_norm": 0.42015549540519714, + "learning_rate": 0.00014767543118574455, + "loss": 1.5674, + "step": 20145 + }, + { + "epoch": 0.26178808172918394, + "grad_norm": 0.42318591475486755, + "learning_rate": 0.00014767283172383318, + "loss": 1.3624, + "step": 20146 + }, + { + "epoch": 0.26180107627309984, + "grad_norm": 0.40999001264572144, + "learning_rate": 0.00014767023226192177, + "loss": 1.3244, + "step": 20147 + }, + { + "epoch": 0.2618140708170157, + "grad_norm": 0.42387962341308594, + "learning_rate": 0.00014766763280001042, + "loss": 1.3569, + "step": 20148 + }, + { + "epoch": 0.2618270653609316, + "grad_norm": 0.4213663339614868, + "learning_rate": 0.00014766503333809902, + "loss": 1.2088, + "step": 20149 + }, + { + "epoch": 0.26184005990484743, + "grad_norm": 0.36051782965660095, + "learning_rate": 0.00014766243387618765, + "loss": 1.3062, + "step": 20150 + }, + { + "epoch": 0.26185305444876333, + "grad_norm": 0.37084564566612244, + "learning_rate": 0.00014765983441427624, + "loss": 1.3655, + "step": 20151 + }, + { + "epoch": 0.2618660489926792, + "grad_norm": 0.47213733196258545, + "learning_rate": 0.00014765723495236487, + "loss": 1.6196, + "step": 20152 + }, + { + "epoch": 0.2618790435365951, + "grad_norm": 0.4904009699821472, + "learning_rate": 0.0001476546354904535, + "loss": 1.5828, + "step": 20153 + }, + { + "epoch": 0.2618920380805109, + "grad_norm": 0.3814501464366913, + "learning_rate": 0.0001476520360285421, + "loss": 1.3349, + "step": 20154 + }, + { + "epoch": 0.2619050326244268, + "grad_norm": 0.5415825843811035, + "learning_rate": 0.00014764943656663071, + "loss": 1.4677, + "step": 20155 + }, + { + "epoch": 0.26191802716834267, + "grad_norm": 0.4704650342464447, + "learning_rate": 0.00014764683710471934, + "loss": 1.5759, + "step": 20156 + }, + { + "epoch": 0.26193102171225857, + "grad_norm": 0.3297520875930786, + "learning_rate": 0.00014764423764280794, + "loss": 1.3964, + "step": 20157 + }, + { + "epoch": 0.2619440162561744, + "grad_norm": 0.4667484164237976, + "learning_rate": 0.00014764163818089656, + "loss": 1.4234, + "step": 20158 + }, + { + "epoch": 0.2619570108000903, + "grad_norm": 0.3856322467327118, + "learning_rate": 0.00014763903871898516, + "loss": 1.6446, + "step": 20159 + }, + { + "epoch": 0.26197000534400616, + "grad_norm": 0.3428819179534912, + "learning_rate": 0.0001476364392570738, + "loss": 1.2616, + "step": 20160 + }, + { + "epoch": 0.26198299988792206, + "grad_norm": 0.2953803539276123, + "learning_rate": 0.0001476338397951624, + "loss": 1.0612, + "step": 20161 + }, + { + "epoch": 0.2619959944318379, + "grad_norm": 0.36012712121009827, + "learning_rate": 0.00014763124033325103, + "loss": 1.2913, + "step": 20162 + }, + { + "epoch": 0.2620089889757538, + "grad_norm": 0.3688061833381653, + "learning_rate": 0.00014762864087133963, + "loss": 1.3258, + "step": 20163 + }, + { + "epoch": 0.26202198351966965, + "grad_norm": 0.4010842740535736, + "learning_rate": 0.00014762604140942825, + "loss": 1.4237, + "step": 20164 + }, + { + "epoch": 0.26203497806358556, + "grad_norm": 0.33622780442237854, + "learning_rate": 0.00014762344194751688, + "loss": 1.3618, + "step": 20165 + }, + { + "epoch": 0.2620479726075014, + "grad_norm": 0.43539294600486755, + "learning_rate": 0.00014762084248560548, + "loss": 1.4673, + "step": 20166 + }, + { + "epoch": 0.2620609671514173, + "grad_norm": 0.366283118724823, + "learning_rate": 0.0001476182430236941, + "loss": 1.2359, + "step": 20167 + }, + { + "epoch": 0.26207396169533315, + "grad_norm": 0.4514642655849457, + "learning_rate": 0.00014761564356178272, + "loss": 1.4804, + "step": 20168 + }, + { + "epoch": 0.26208695623924905, + "grad_norm": 0.4827955663204193, + "learning_rate": 0.00014761304409987132, + "loss": 1.5538, + "step": 20169 + }, + { + "epoch": 0.2620999507831649, + "grad_norm": 0.4313634932041168, + "learning_rate": 0.00014761044463795995, + "loss": 1.4015, + "step": 20170 + }, + { + "epoch": 0.2621129453270808, + "grad_norm": 0.36991316080093384, + "learning_rate": 0.00014760784517604857, + "loss": 1.2878, + "step": 20171 + }, + { + "epoch": 0.26212593987099664, + "grad_norm": 0.3427586555480957, + "learning_rate": 0.0001476052457141372, + "loss": 1.3078, + "step": 20172 + }, + { + "epoch": 0.26213893441491254, + "grad_norm": 0.33271706104278564, + "learning_rate": 0.0001476026462522258, + "loss": 1.26, + "step": 20173 + }, + { + "epoch": 0.26215192895882844, + "grad_norm": 0.4862552583217621, + "learning_rate": 0.00014760004679031442, + "loss": 1.4289, + "step": 20174 + }, + { + "epoch": 0.2621649235027443, + "grad_norm": 0.35112544894218445, + "learning_rate": 0.00014759744732840304, + "loss": 1.2879, + "step": 20175 + }, + { + "epoch": 0.2621779180466602, + "grad_norm": 0.41724705696105957, + "learning_rate": 0.00014759484786649164, + "loss": 1.4041, + "step": 20176 + }, + { + "epoch": 0.26219091259057603, + "grad_norm": 0.43246138095855713, + "learning_rate": 0.00014759224840458026, + "loss": 1.3086, + "step": 20177 + }, + { + "epoch": 0.26220390713449193, + "grad_norm": 0.3994060456752777, + "learning_rate": 0.00014758964894266886, + "loss": 1.5876, + "step": 20178 + }, + { + "epoch": 0.2622169016784078, + "grad_norm": 0.37874072790145874, + "learning_rate": 0.0001475870494807575, + "loss": 1.2841, + "step": 20179 + }, + { + "epoch": 0.2622298962223237, + "grad_norm": 0.41393381357192993, + "learning_rate": 0.0001475844500188461, + "loss": 1.4817, + "step": 20180 + }, + { + "epoch": 0.2622428907662395, + "grad_norm": 0.4610188603401184, + "learning_rate": 0.0001475818505569347, + "loss": 1.5493, + "step": 20181 + }, + { + "epoch": 0.2622558853101554, + "grad_norm": 0.45889797806739807, + "learning_rate": 0.00014757925109502333, + "loss": 1.5041, + "step": 20182 + }, + { + "epoch": 0.26226887985407127, + "grad_norm": 0.32614219188690186, + "learning_rate": 0.00014757665163311196, + "loss": 1.2483, + "step": 20183 + }, + { + "epoch": 0.26228187439798717, + "grad_norm": 0.3387735188007355, + "learning_rate": 0.00014757405217120058, + "loss": 1.4668, + "step": 20184 + }, + { + "epoch": 0.262294868941903, + "grad_norm": 0.3283379375934601, + "learning_rate": 0.00014757145270928918, + "loss": 1.2671, + "step": 20185 + }, + { + "epoch": 0.2623078634858189, + "grad_norm": 0.3363078236579895, + "learning_rate": 0.0001475688532473778, + "loss": 1.4534, + "step": 20186 + }, + { + "epoch": 0.26232085802973476, + "grad_norm": 0.4706505835056305, + "learning_rate": 0.00014756625378546643, + "loss": 1.4774, + "step": 20187 + }, + { + "epoch": 0.26233385257365066, + "grad_norm": 0.3641832172870636, + "learning_rate": 0.00014756365432355502, + "loss": 1.3404, + "step": 20188 + }, + { + "epoch": 0.2623468471175665, + "grad_norm": 0.38552185893058777, + "learning_rate": 0.00014756105486164365, + "loss": 1.4825, + "step": 20189 + }, + { + "epoch": 0.2623598416614824, + "grad_norm": 0.4629080295562744, + "learning_rate": 0.00014755845539973225, + "loss": 1.3297, + "step": 20190 + }, + { + "epoch": 0.26237283620539825, + "grad_norm": 0.449511855840683, + "learning_rate": 0.0001475558559378209, + "loss": 1.388, + "step": 20191 + }, + { + "epoch": 0.26238583074931415, + "grad_norm": 0.4649542272090912, + "learning_rate": 0.0001475532564759095, + "loss": 1.3407, + "step": 20192 + }, + { + "epoch": 0.26239882529323, + "grad_norm": 0.4479967951774597, + "learning_rate": 0.0001475506570139981, + "loss": 1.3285, + "step": 20193 + }, + { + "epoch": 0.2624118198371459, + "grad_norm": 0.4342278242111206, + "learning_rate": 0.00014754805755208672, + "loss": 1.2499, + "step": 20194 + }, + { + "epoch": 0.26242481438106174, + "grad_norm": 0.4888266921043396, + "learning_rate": 0.00014754545809017534, + "loss": 1.4193, + "step": 20195 + }, + { + "epoch": 0.26243780892497764, + "grad_norm": 0.44978824257850647, + "learning_rate": 0.00014754285862826397, + "loss": 1.2825, + "step": 20196 + }, + { + "epoch": 0.2624508034688935, + "grad_norm": 0.4346938729286194, + "learning_rate": 0.00014754025916635256, + "loss": 1.5181, + "step": 20197 + }, + { + "epoch": 0.2624637980128094, + "grad_norm": 0.390828013420105, + "learning_rate": 0.0001475376597044412, + "loss": 1.4133, + "step": 20198 + }, + { + "epoch": 0.26247679255672524, + "grad_norm": 0.3145589828491211, + "learning_rate": 0.0001475350602425298, + "loss": 1.3667, + "step": 20199 + }, + { + "epoch": 0.26248978710064114, + "grad_norm": 0.48016080260276794, + "learning_rate": 0.0001475324607806184, + "loss": 1.5029, + "step": 20200 + }, + { + "epoch": 0.262502781644557, + "grad_norm": 0.3888293504714966, + "learning_rate": 0.00014752986131870703, + "loss": 1.5375, + "step": 20201 + }, + { + "epoch": 0.2625157761884729, + "grad_norm": 0.3966054618358612, + "learning_rate": 0.00014752726185679563, + "loss": 1.3739, + "step": 20202 + }, + { + "epoch": 0.2625287707323887, + "grad_norm": 0.44394451379776, + "learning_rate": 0.00014752466239488428, + "loss": 1.3229, + "step": 20203 + }, + { + "epoch": 0.26254176527630463, + "grad_norm": 0.45547330379486084, + "learning_rate": 0.00014752206293297288, + "loss": 1.5428, + "step": 20204 + }, + { + "epoch": 0.2625547598202205, + "grad_norm": 0.38327598571777344, + "learning_rate": 0.0001475194634710615, + "loss": 1.3516, + "step": 20205 + }, + { + "epoch": 0.2625677543641364, + "grad_norm": 0.4890802204608917, + "learning_rate": 0.0001475168640091501, + "loss": 1.5009, + "step": 20206 + }, + { + "epoch": 0.2625807489080522, + "grad_norm": 0.388841837644577, + "learning_rate": 0.00014751426454723873, + "loss": 1.5056, + "step": 20207 + }, + { + "epoch": 0.2625937434519681, + "grad_norm": 0.3530994951725006, + "learning_rate": 0.00014751166508532735, + "loss": 1.4598, + "step": 20208 + }, + { + "epoch": 0.26260673799588397, + "grad_norm": 0.39420780539512634, + "learning_rate": 0.00014750906562341595, + "loss": 1.3406, + "step": 20209 + }, + { + "epoch": 0.26261973253979987, + "grad_norm": 0.4644569158554077, + "learning_rate": 0.00014750646616150457, + "loss": 1.6051, + "step": 20210 + }, + { + "epoch": 0.2626327270837157, + "grad_norm": 0.38759228587150574, + "learning_rate": 0.0001475038666995932, + "loss": 1.3615, + "step": 20211 + }, + { + "epoch": 0.2626457216276316, + "grad_norm": 0.4613949656486511, + "learning_rate": 0.0001475012672376818, + "loss": 1.4428, + "step": 20212 + }, + { + "epoch": 0.26265871617154746, + "grad_norm": 0.35100141167640686, + "learning_rate": 0.00014749866777577042, + "loss": 1.3847, + "step": 20213 + }, + { + "epoch": 0.26267171071546336, + "grad_norm": 0.3811013698577881, + "learning_rate": 0.00014749606831385904, + "loss": 1.3349, + "step": 20214 + }, + { + "epoch": 0.2626847052593792, + "grad_norm": 0.45175135135650635, + "learning_rate": 0.00014749346885194767, + "loss": 1.5312, + "step": 20215 + }, + { + "epoch": 0.2626976998032951, + "grad_norm": 0.41111159324645996, + "learning_rate": 0.00014749086939003627, + "loss": 1.4716, + "step": 20216 + }, + { + "epoch": 0.26271069434721095, + "grad_norm": 0.37609606981277466, + "learning_rate": 0.0001474882699281249, + "loss": 1.3412, + "step": 20217 + }, + { + "epoch": 0.26272368889112685, + "grad_norm": 0.46760720014572144, + "learning_rate": 0.00014748567046621351, + "loss": 1.4216, + "step": 20218 + }, + { + "epoch": 0.2627366834350427, + "grad_norm": 0.42550939321517944, + "learning_rate": 0.0001474830710043021, + "loss": 1.2957, + "step": 20219 + }, + { + "epoch": 0.2627496779789586, + "grad_norm": 0.4149263799190521, + "learning_rate": 0.00014748047154239074, + "loss": 1.5043, + "step": 20220 + }, + { + "epoch": 0.26276267252287444, + "grad_norm": 0.36996227502822876, + "learning_rate": 0.00014747787208047933, + "loss": 1.398, + "step": 20221 + }, + { + "epoch": 0.26277566706679034, + "grad_norm": 0.46956267952919006, + "learning_rate": 0.00014747527261856799, + "loss": 1.4203, + "step": 20222 + }, + { + "epoch": 0.2627886616107062, + "grad_norm": 0.45764175057411194, + "learning_rate": 0.00014747267315665658, + "loss": 1.4969, + "step": 20223 + }, + { + "epoch": 0.2628016561546221, + "grad_norm": 0.4426584541797638, + "learning_rate": 0.00014747007369474518, + "loss": 1.4054, + "step": 20224 + }, + { + "epoch": 0.26281465069853793, + "grad_norm": 0.32432153820991516, + "learning_rate": 0.0001474674742328338, + "loss": 1.395, + "step": 20225 + }, + { + "epoch": 0.26282764524245383, + "grad_norm": 0.3240564465522766, + "learning_rate": 0.00014746487477092243, + "loss": 1.3175, + "step": 20226 + }, + { + "epoch": 0.2628406397863697, + "grad_norm": 0.4357198476791382, + "learning_rate": 0.00014746227530901105, + "loss": 1.4547, + "step": 20227 + }, + { + "epoch": 0.2628536343302856, + "grad_norm": 0.33011049032211304, + "learning_rate": 0.00014745967584709965, + "loss": 1.5637, + "step": 20228 + }, + { + "epoch": 0.2628666288742014, + "grad_norm": 0.5391556024551392, + "learning_rate": 0.00014745707638518828, + "loss": 1.7016, + "step": 20229 + }, + { + "epoch": 0.2628796234181173, + "grad_norm": 0.49176377058029175, + "learning_rate": 0.0001474544769232769, + "loss": 1.5119, + "step": 20230 + }, + { + "epoch": 0.26289261796203317, + "grad_norm": 0.35366764664649963, + "learning_rate": 0.0001474518774613655, + "loss": 1.343, + "step": 20231 + }, + { + "epoch": 0.26290561250594907, + "grad_norm": 0.34848880767822266, + "learning_rate": 0.00014744927799945412, + "loss": 1.2315, + "step": 20232 + }, + { + "epoch": 0.2629186070498649, + "grad_norm": 0.30918940901756287, + "learning_rate": 0.00014744667853754272, + "loss": 1.2837, + "step": 20233 + }, + { + "epoch": 0.2629316015937808, + "grad_norm": 0.3803653419017792, + "learning_rate": 0.00014744407907563137, + "loss": 1.401, + "step": 20234 + }, + { + "epoch": 0.26294459613769666, + "grad_norm": 0.39797085523605347, + "learning_rate": 0.00014744147961371997, + "loss": 1.3792, + "step": 20235 + }, + { + "epoch": 0.26295759068161256, + "grad_norm": 0.48326244950294495, + "learning_rate": 0.00014743888015180857, + "loss": 1.4719, + "step": 20236 + }, + { + "epoch": 0.2629705852255284, + "grad_norm": 0.38805291056632996, + "learning_rate": 0.0001474362806898972, + "loss": 1.4317, + "step": 20237 + }, + { + "epoch": 0.2629835797694443, + "grad_norm": 0.40799880027770996, + "learning_rate": 0.00014743368122798581, + "loss": 1.5228, + "step": 20238 + }, + { + "epoch": 0.26299657431336015, + "grad_norm": 0.44866764545440674, + "learning_rate": 0.00014743108176607444, + "loss": 1.554, + "step": 20239 + }, + { + "epoch": 0.26300956885727605, + "grad_norm": 0.4495713710784912, + "learning_rate": 0.00014742848230416304, + "loss": 1.4282, + "step": 20240 + }, + { + "epoch": 0.2630225634011919, + "grad_norm": 0.42010048031806946, + "learning_rate": 0.00014742588284225166, + "loss": 1.5962, + "step": 20241 + }, + { + "epoch": 0.2630355579451078, + "grad_norm": 0.5566220283508301, + "learning_rate": 0.00014742328338034029, + "loss": 1.5394, + "step": 20242 + }, + { + "epoch": 0.26304855248902365, + "grad_norm": 0.4417506754398346, + "learning_rate": 0.00014742068391842888, + "loss": 1.6033, + "step": 20243 + }, + { + "epoch": 0.26306154703293955, + "grad_norm": 0.481902152299881, + "learning_rate": 0.0001474180844565175, + "loss": 1.4638, + "step": 20244 + }, + { + "epoch": 0.2630745415768554, + "grad_norm": 0.48447972536087036, + "learning_rate": 0.00014741548499460613, + "loss": 1.51, + "step": 20245 + }, + { + "epoch": 0.2630875361207713, + "grad_norm": 0.4679826498031616, + "learning_rate": 0.00014741288553269476, + "loss": 1.4538, + "step": 20246 + }, + { + "epoch": 0.26310053066468714, + "grad_norm": 0.4882189631462097, + "learning_rate": 0.00014741028607078335, + "loss": 1.4387, + "step": 20247 + }, + { + "epoch": 0.26311352520860304, + "grad_norm": 0.43295976519584656, + "learning_rate": 0.00014740768660887195, + "loss": 1.3978, + "step": 20248 + }, + { + "epoch": 0.2631265197525189, + "grad_norm": 0.34668004512786865, + "learning_rate": 0.0001474050871469606, + "loss": 1.3928, + "step": 20249 + }, + { + "epoch": 0.2631395142964348, + "grad_norm": 0.410810649394989, + "learning_rate": 0.0001474024876850492, + "loss": 1.4836, + "step": 20250 + }, + { + "epoch": 0.2631525088403507, + "grad_norm": 0.4092983305454254, + "learning_rate": 0.00014739988822313782, + "loss": 1.421, + "step": 20251 + }, + { + "epoch": 0.26316550338426653, + "grad_norm": 0.4536350667476654, + "learning_rate": 0.00014739728876122642, + "loss": 1.3137, + "step": 20252 + }, + { + "epoch": 0.26317849792818243, + "grad_norm": 0.4732278883457184, + "learning_rate": 0.00014739468929931505, + "loss": 1.3703, + "step": 20253 + }, + { + "epoch": 0.2631914924720983, + "grad_norm": 0.36400094628334045, + "learning_rate": 0.00014739208983740367, + "loss": 1.4922, + "step": 20254 + }, + { + "epoch": 0.2632044870160142, + "grad_norm": 0.4370715320110321, + "learning_rate": 0.00014738949037549227, + "loss": 1.4546, + "step": 20255 + }, + { + "epoch": 0.26321748155993, + "grad_norm": 0.406096488237381, + "learning_rate": 0.0001473868909135809, + "loss": 1.4973, + "step": 20256 + }, + { + "epoch": 0.2632304761038459, + "grad_norm": 0.33333998918533325, + "learning_rate": 0.00014738429145166952, + "loss": 1.5353, + "step": 20257 + }, + { + "epoch": 0.26324347064776177, + "grad_norm": 0.39879703521728516, + "learning_rate": 0.00014738169198975814, + "loss": 1.5031, + "step": 20258 + }, + { + "epoch": 0.26325646519167767, + "grad_norm": 0.41050270199775696, + "learning_rate": 0.00014737909252784674, + "loss": 1.3392, + "step": 20259 + }, + { + "epoch": 0.2632694597355935, + "grad_norm": 0.4123345613479614, + "learning_rate": 0.00014737649306593536, + "loss": 1.3392, + "step": 20260 + }, + { + "epoch": 0.2632824542795094, + "grad_norm": 0.3925626873970032, + "learning_rate": 0.000147373893604024, + "loss": 1.5845, + "step": 20261 + }, + { + "epoch": 0.26329544882342526, + "grad_norm": 0.40224093198776245, + "learning_rate": 0.00014737129414211259, + "loss": 1.4397, + "step": 20262 + }, + { + "epoch": 0.26330844336734116, + "grad_norm": 0.5149607062339783, + "learning_rate": 0.0001473686946802012, + "loss": 1.2604, + "step": 20263 + }, + { + "epoch": 0.263321437911257, + "grad_norm": 0.47094354033470154, + "learning_rate": 0.0001473660952182898, + "loss": 1.5489, + "step": 20264 + }, + { + "epoch": 0.2633344324551729, + "grad_norm": 0.434526264667511, + "learning_rate": 0.00014736349575637843, + "loss": 1.383, + "step": 20265 + }, + { + "epoch": 0.26334742699908875, + "grad_norm": 0.35797902941703796, + "learning_rate": 0.00014736089629446706, + "loss": 1.2575, + "step": 20266 + }, + { + "epoch": 0.26336042154300465, + "grad_norm": 0.4485943913459778, + "learning_rate": 0.00014735829683255565, + "loss": 1.3066, + "step": 20267 + }, + { + "epoch": 0.2633734160869205, + "grad_norm": 0.3717271387577057, + "learning_rate": 0.00014735569737064428, + "loss": 1.4803, + "step": 20268 + }, + { + "epoch": 0.2633864106308364, + "grad_norm": 0.46604233980178833, + "learning_rate": 0.0001473530979087329, + "loss": 1.4642, + "step": 20269 + }, + { + "epoch": 0.26339940517475224, + "grad_norm": 0.430742472410202, + "learning_rate": 0.00014735049844682153, + "loss": 1.4613, + "step": 20270 + }, + { + "epoch": 0.26341239971866814, + "grad_norm": 0.4645368456840515, + "learning_rate": 0.00014734789898491012, + "loss": 1.4686, + "step": 20271 + }, + { + "epoch": 0.263425394262584, + "grad_norm": 0.3445025086402893, + "learning_rate": 0.00014734529952299875, + "loss": 1.2804, + "step": 20272 + }, + { + "epoch": 0.2634383888064999, + "grad_norm": 0.3552330434322357, + "learning_rate": 0.00014734270006108737, + "loss": 1.3502, + "step": 20273 + }, + { + "epoch": 0.26345138335041574, + "grad_norm": 0.4548403322696686, + "learning_rate": 0.00014734010059917597, + "loss": 1.5107, + "step": 20274 + }, + { + "epoch": 0.26346437789433164, + "grad_norm": 0.4379022717475891, + "learning_rate": 0.0001473375011372646, + "loss": 1.5556, + "step": 20275 + }, + { + "epoch": 0.2634773724382475, + "grad_norm": 0.38250961899757385, + "learning_rate": 0.0001473349016753532, + "loss": 1.5204, + "step": 20276 + }, + { + "epoch": 0.2634903669821634, + "grad_norm": 0.38495001196861267, + "learning_rate": 0.00014733230221344182, + "loss": 1.404, + "step": 20277 + }, + { + "epoch": 0.2635033615260792, + "grad_norm": 0.41897985339164734, + "learning_rate": 0.00014732970275153044, + "loss": 1.5264, + "step": 20278 + }, + { + "epoch": 0.26351635606999513, + "grad_norm": 0.3831501305103302, + "learning_rate": 0.00014732710328961904, + "loss": 1.4505, + "step": 20279 + }, + { + "epoch": 0.263529350613911, + "grad_norm": 0.33679720759391785, + "learning_rate": 0.00014732450382770766, + "loss": 1.415, + "step": 20280 + }, + { + "epoch": 0.2635423451578269, + "grad_norm": 0.4497471749782562, + "learning_rate": 0.0001473219043657963, + "loss": 1.4677, + "step": 20281 + }, + { + "epoch": 0.2635553397017427, + "grad_norm": 0.45883193612098694, + "learning_rate": 0.0001473193049038849, + "loss": 1.4142, + "step": 20282 + }, + { + "epoch": 0.2635683342456586, + "grad_norm": 0.35476186871528625, + "learning_rate": 0.0001473167054419735, + "loss": 1.2586, + "step": 20283 + }, + { + "epoch": 0.26358132878957446, + "grad_norm": 0.47246623039245605, + "learning_rate": 0.00014731410598006213, + "loss": 1.2574, + "step": 20284 + }, + { + "epoch": 0.26359432333349037, + "grad_norm": 0.3894566297531128, + "learning_rate": 0.00014731150651815076, + "loss": 1.452, + "step": 20285 + }, + { + "epoch": 0.2636073178774062, + "grad_norm": 0.4148367643356323, + "learning_rate": 0.00014730890705623936, + "loss": 1.4957, + "step": 20286 + }, + { + "epoch": 0.2636203124213221, + "grad_norm": 0.42245736718177795, + "learning_rate": 0.00014730630759432798, + "loss": 1.5777, + "step": 20287 + }, + { + "epoch": 0.26363330696523796, + "grad_norm": 0.4743061363697052, + "learning_rate": 0.0001473037081324166, + "loss": 1.4748, + "step": 20288 + }, + { + "epoch": 0.26364630150915386, + "grad_norm": 0.39008021354675293, + "learning_rate": 0.00014730110867050523, + "loss": 1.5946, + "step": 20289 + }, + { + "epoch": 0.2636592960530697, + "grad_norm": 0.4633859395980835, + "learning_rate": 0.00014729850920859383, + "loss": 1.4356, + "step": 20290 + }, + { + "epoch": 0.2636722905969856, + "grad_norm": 0.4043808579444885, + "learning_rate": 0.00014729590974668242, + "loss": 1.3425, + "step": 20291 + }, + { + "epoch": 0.26368528514090145, + "grad_norm": 0.4194539785385132, + "learning_rate": 0.00014729331028477108, + "loss": 1.2348, + "step": 20292 + }, + { + "epoch": 0.26369827968481735, + "grad_norm": 0.3785662353038788, + "learning_rate": 0.00014729071082285967, + "loss": 1.402, + "step": 20293 + }, + { + "epoch": 0.2637112742287332, + "grad_norm": 0.45317962765693665, + "learning_rate": 0.0001472881113609483, + "loss": 1.4497, + "step": 20294 + }, + { + "epoch": 0.2637242687726491, + "grad_norm": 0.40188026428222656, + "learning_rate": 0.0001472855118990369, + "loss": 1.3854, + "step": 20295 + }, + { + "epoch": 0.26373726331656494, + "grad_norm": 0.39926695823669434, + "learning_rate": 0.00014728291243712552, + "loss": 1.4611, + "step": 20296 + }, + { + "epoch": 0.26375025786048084, + "grad_norm": 0.4087277054786682, + "learning_rate": 0.00014728031297521414, + "loss": 1.5526, + "step": 20297 + }, + { + "epoch": 0.2637632524043967, + "grad_norm": 0.3697447180747986, + "learning_rate": 0.00014727771351330274, + "loss": 1.3993, + "step": 20298 + }, + { + "epoch": 0.2637762469483126, + "grad_norm": 0.5567469000816345, + "learning_rate": 0.00014727511405139137, + "loss": 1.5175, + "step": 20299 + }, + { + "epoch": 0.26378924149222843, + "grad_norm": 0.397339403629303, + "learning_rate": 0.00014727251458948, + "loss": 1.4167, + "step": 20300 + }, + { + "epoch": 0.26380223603614433, + "grad_norm": 0.4239199161529541, + "learning_rate": 0.00014726991512756862, + "loss": 1.611, + "step": 20301 + }, + { + "epoch": 0.2638152305800602, + "grad_norm": 0.3901256322860718, + "learning_rate": 0.0001472673156656572, + "loss": 1.4279, + "step": 20302 + }, + { + "epoch": 0.2638282251239761, + "grad_norm": 0.4490596055984497, + "learning_rate": 0.0001472647162037458, + "loss": 1.4353, + "step": 20303 + }, + { + "epoch": 0.2638412196678919, + "grad_norm": 0.4311522841453552, + "learning_rate": 0.00014726211674183446, + "loss": 1.4894, + "step": 20304 + }, + { + "epoch": 0.2638542142118078, + "grad_norm": 0.376436710357666, + "learning_rate": 0.00014725951727992306, + "loss": 1.4257, + "step": 20305 + }, + { + "epoch": 0.26386720875572367, + "grad_norm": 0.35798490047454834, + "learning_rate": 0.00014725691781801168, + "loss": 1.5572, + "step": 20306 + }, + { + "epoch": 0.26388020329963957, + "grad_norm": 0.4611857533454895, + "learning_rate": 0.00014725431835610028, + "loss": 1.5312, + "step": 20307 + }, + { + "epoch": 0.2638931978435554, + "grad_norm": 0.3669566512107849, + "learning_rate": 0.0001472517188941889, + "loss": 1.2343, + "step": 20308 + }, + { + "epoch": 0.2639061923874713, + "grad_norm": 0.45506301522254944, + "learning_rate": 0.00014724911943227753, + "loss": 1.5609, + "step": 20309 + }, + { + "epoch": 0.26391918693138716, + "grad_norm": 0.4446612596511841, + "learning_rate": 0.00014724651997036613, + "loss": 1.455, + "step": 20310 + }, + { + "epoch": 0.26393218147530306, + "grad_norm": 0.3697642683982849, + "learning_rate": 0.00014724392050845475, + "loss": 1.2827, + "step": 20311 + }, + { + "epoch": 0.2639451760192189, + "grad_norm": 0.3904421329498291, + "learning_rate": 0.00014724132104654338, + "loss": 1.5397, + "step": 20312 + }, + { + "epoch": 0.2639581705631348, + "grad_norm": 0.4419945478439331, + "learning_rate": 0.000147238721584632, + "loss": 1.5727, + "step": 20313 + }, + { + "epoch": 0.26397116510705065, + "grad_norm": 0.35768091678619385, + "learning_rate": 0.0001472361221227206, + "loss": 1.4451, + "step": 20314 + }, + { + "epoch": 0.26398415965096655, + "grad_norm": 0.34864622354507446, + "learning_rate": 0.0001472335226608092, + "loss": 1.4107, + "step": 20315 + }, + { + "epoch": 0.2639971541948824, + "grad_norm": 0.36060312390327454, + "learning_rate": 0.00014723092319889785, + "loss": 1.3151, + "step": 20316 + }, + { + "epoch": 0.2640101487387983, + "grad_norm": 0.39931216835975647, + "learning_rate": 0.00014722832373698644, + "loss": 1.4274, + "step": 20317 + }, + { + "epoch": 0.26402314328271415, + "grad_norm": 0.31003302335739136, + "learning_rate": 0.00014722572427507507, + "loss": 1.2604, + "step": 20318 + }, + { + "epoch": 0.26403613782663005, + "grad_norm": 0.3718045651912689, + "learning_rate": 0.0001472231248131637, + "loss": 1.6029, + "step": 20319 + }, + { + "epoch": 0.2640491323705459, + "grad_norm": 0.41842931509017944, + "learning_rate": 0.0001472205253512523, + "loss": 1.3846, + "step": 20320 + }, + { + "epoch": 0.2640621269144618, + "grad_norm": 0.33567166328430176, + "learning_rate": 0.00014721792588934092, + "loss": 1.2919, + "step": 20321 + }, + { + "epoch": 0.26407512145837764, + "grad_norm": 0.3347376883029938, + "learning_rate": 0.0001472153264274295, + "loss": 1.3849, + "step": 20322 + }, + { + "epoch": 0.26408811600229354, + "grad_norm": 0.516158938407898, + "learning_rate": 0.00014721272696551816, + "loss": 1.4817, + "step": 20323 + }, + { + "epoch": 0.2641011105462094, + "grad_norm": 0.4297718405723572, + "learning_rate": 0.00014721012750360676, + "loss": 1.5191, + "step": 20324 + }, + { + "epoch": 0.2641141050901253, + "grad_norm": 0.4083881676197052, + "learning_rate": 0.00014720752804169539, + "loss": 1.4355, + "step": 20325 + }, + { + "epoch": 0.26412709963404113, + "grad_norm": 0.4268642067909241, + "learning_rate": 0.00014720492857978398, + "loss": 1.4211, + "step": 20326 + }, + { + "epoch": 0.26414009417795703, + "grad_norm": 0.4757986068725586, + "learning_rate": 0.0001472023291178726, + "loss": 1.4262, + "step": 20327 + }, + { + "epoch": 0.26415308872187293, + "grad_norm": 0.3787767291069031, + "learning_rate": 0.00014719972965596123, + "loss": 1.5379, + "step": 20328 + }, + { + "epoch": 0.2641660832657888, + "grad_norm": 0.37189981341362, + "learning_rate": 0.00014719713019404983, + "loss": 1.3065, + "step": 20329 + }, + { + "epoch": 0.2641790778097047, + "grad_norm": 0.32237106561660767, + "learning_rate": 0.00014719453073213845, + "loss": 1.1731, + "step": 20330 + }, + { + "epoch": 0.2641920723536205, + "grad_norm": 0.5207783579826355, + "learning_rate": 0.00014719193127022708, + "loss": 1.2849, + "step": 20331 + }, + { + "epoch": 0.2642050668975364, + "grad_norm": 0.41006970405578613, + "learning_rate": 0.00014718933180831568, + "loss": 1.4631, + "step": 20332 + }, + { + "epoch": 0.26421806144145227, + "grad_norm": 0.44240331649780273, + "learning_rate": 0.0001471867323464043, + "loss": 1.2657, + "step": 20333 + }, + { + "epoch": 0.26423105598536817, + "grad_norm": 0.41540876030921936, + "learning_rate": 0.0001471841328844929, + "loss": 1.5526, + "step": 20334 + }, + { + "epoch": 0.264244050529284, + "grad_norm": 0.37062546610832214, + "learning_rate": 0.00014718153342258155, + "loss": 1.3855, + "step": 20335 + }, + { + "epoch": 0.2642570450731999, + "grad_norm": 0.3392045497894287, + "learning_rate": 0.00014717893396067015, + "loss": 1.5928, + "step": 20336 + }, + { + "epoch": 0.26427003961711576, + "grad_norm": 0.3722420036792755, + "learning_rate": 0.00014717633449875877, + "loss": 1.4956, + "step": 20337 + }, + { + "epoch": 0.26428303416103166, + "grad_norm": 0.3512881398200989, + "learning_rate": 0.00014717373503684737, + "loss": 1.3604, + "step": 20338 + }, + { + "epoch": 0.2642960287049475, + "grad_norm": 0.38898351788520813, + "learning_rate": 0.000147171135574936, + "loss": 1.3339, + "step": 20339 + }, + { + "epoch": 0.2643090232488634, + "grad_norm": 0.4500400424003601, + "learning_rate": 0.00014716853611302462, + "loss": 1.5171, + "step": 20340 + }, + { + "epoch": 0.26432201779277925, + "grad_norm": 0.4502471387386322, + "learning_rate": 0.00014716593665111322, + "loss": 1.452, + "step": 20341 + }, + { + "epoch": 0.26433501233669515, + "grad_norm": 0.30990028381347656, + "learning_rate": 0.00014716333718920184, + "loss": 1.39, + "step": 20342 + }, + { + "epoch": 0.264348006880611, + "grad_norm": 0.40244022011756897, + "learning_rate": 0.00014716073772729046, + "loss": 1.5825, + "step": 20343 + }, + { + "epoch": 0.2643610014245269, + "grad_norm": 0.45240139961242676, + "learning_rate": 0.0001471581382653791, + "loss": 1.4573, + "step": 20344 + }, + { + "epoch": 0.26437399596844274, + "grad_norm": 0.4707461893558502, + "learning_rate": 0.00014715553880346769, + "loss": 1.4836, + "step": 20345 + }, + { + "epoch": 0.26438699051235864, + "grad_norm": 0.4370970129966736, + "learning_rate": 0.00014715293934155628, + "loss": 1.4546, + "step": 20346 + }, + { + "epoch": 0.2643999850562745, + "grad_norm": 0.4711625874042511, + "learning_rate": 0.00014715033987964494, + "loss": 1.4243, + "step": 20347 + }, + { + "epoch": 0.2644129796001904, + "grad_norm": 0.4341016411781311, + "learning_rate": 0.00014714774041773353, + "loss": 1.5527, + "step": 20348 + }, + { + "epoch": 0.26442597414410624, + "grad_norm": 0.3351496160030365, + "learning_rate": 0.00014714514095582216, + "loss": 1.1485, + "step": 20349 + }, + { + "epoch": 0.26443896868802214, + "grad_norm": 0.4886619746685028, + "learning_rate": 0.00014714254149391075, + "loss": 1.5007, + "step": 20350 + }, + { + "epoch": 0.264451963231938, + "grad_norm": 0.47134706377983093, + "learning_rate": 0.00014713994203199938, + "loss": 1.4468, + "step": 20351 + }, + { + "epoch": 0.2644649577758539, + "grad_norm": 0.5228691101074219, + "learning_rate": 0.000147137342570088, + "loss": 1.4265, + "step": 20352 + }, + { + "epoch": 0.2644779523197697, + "grad_norm": 0.46689245104789734, + "learning_rate": 0.0001471347431081766, + "loss": 1.312, + "step": 20353 + }, + { + "epoch": 0.2644909468636856, + "grad_norm": 0.41836151480674744, + "learning_rate": 0.00014713214364626523, + "loss": 1.4118, + "step": 20354 + }, + { + "epoch": 0.2645039414076015, + "grad_norm": 0.49114224314689636, + "learning_rate": 0.00014712954418435385, + "loss": 1.4517, + "step": 20355 + }, + { + "epoch": 0.2645169359515174, + "grad_norm": 0.49041232466697693, + "learning_rate": 0.00014712694472244247, + "loss": 1.4667, + "step": 20356 + }, + { + "epoch": 0.2645299304954332, + "grad_norm": 0.39503321051597595, + "learning_rate": 0.00014712434526053107, + "loss": 1.4059, + "step": 20357 + }, + { + "epoch": 0.2645429250393491, + "grad_norm": 0.4683149755001068, + "learning_rate": 0.0001471217457986197, + "loss": 1.4779, + "step": 20358 + }, + { + "epoch": 0.26455591958326496, + "grad_norm": 0.402805358171463, + "learning_rate": 0.00014711914633670832, + "loss": 1.4736, + "step": 20359 + }, + { + "epoch": 0.26456891412718087, + "grad_norm": 0.369107723236084, + "learning_rate": 0.00014711654687479692, + "loss": 1.4433, + "step": 20360 + }, + { + "epoch": 0.2645819086710967, + "grad_norm": 0.4391769766807556, + "learning_rate": 0.00014711394741288554, + "loss": 1.4178, + "step": 20361 + }, + { + "epoch": 0.2645949032150126, + "grad_norm": 0.30711233615875244, + "learning_rate": 0.00014711134795097417, + "loss": 1.4596, + "step": 20362 + }, + { + "epoch": 0.26460789775892846, + "grad_norm": 0.44645780324935913, + "learning_rate": 0.00014710874848906276, + "loss": 1.4392, + "step": 20363 + }, + { + "epoch": 0.26462089230284436, + "grad_norm": 0.38735485076904297, + "learning_rate": 0.0001471061490271514, + "loss": 1.6671, + "step": 20364 + }, + { + "epoch": 0.2646338868467602, + "grad_norm": 0.41639265418052673, + "learning_rate": 0.00014710354956523999, + "loss": 1.3628, + "step": 20365 + }, + { + "epoch": 0.2646468813906761, + "grad_norm": 0.3761052191257477, + "learning_rate": 0.00014710095010332864, + "loss": 1.4702, + "step": 20366 + }, + { + "epoch": 0.26465987593459195, + "grad_norm": 0.388702929019928, + "learning_rate": 0.00014709835064141723, + "loss": 1.4651, + "step": 20367 + }, + { + "epoch": 0.26467287047850785, + "grad_norm": 0.3984396457672119, + "learning_rate": 0.00014709575117950586, + "loss": 1.3737, + "step": 20368 + }, + { + "epoch": 0.2646858650224237, + "grad_norm": 0.3564522862434387, + "learning_rate": 0.00014709315171759446, + "loss": 1.4, + "step": 20369 + }, + { + "epoch": 0.2646988595663396, + "grad_norm": 0.394505113363266, + "learning_rate": 0.00014709055225568308, + "loss": 1.2504, + "step": 20370 + }, + { + "epoch": 0.26471185411025544, + "grad_norm": 0.43178120255470276, + "learning_rate": 0.0001470879527937717, + "loss": 1.3292, + "step": 20371 + }, + { + "epoch": 0.26472484865417134, + "grad_norm": 0.40451210737228394, + "learning_rate": 0.0001470853533318603, + "loss": 1.3094, + "step": 20372 + }, + { + "epoch": 0.2647378431980872, + "grad_norm": 0.5249249339103699, + "learning_rate": 0.00014708275386994893, + "loss": 1.3884, + "step": 20373 + }, + { + "epoch": 0.2647508377420031, + "grad_norm": 0.3677501976490021, + "learning_rate": 0.00014708015440803755, + "loss": 1.5359, + "step": 20374 + }, + { + "epoch": 0.26476383228591893, + "grad_norm": 0.412564218044281, + "learning_rate": 0.00014707755494612615, + "loss": 1.5126, + "step": 20375 + }, + { + "epoch": 0.26477682682983483, + "grad_norm": 0.36126774549484253, + "learning_rate": 0.00014707495548421477, + "loss": 1.4725, + "step": 20376 + }, + { + "epoch": 0.2647898213737507, + "grad_norm": 0.3468768298625946, + "learning_rate": 0.00014707235602230337, + "loss": 1.428, + "step": 20377 + }, + { + "epoch": 0.2648028159176666, + "grad_norm": 0.4005556106567383, + "learning_rate": 0.00014706975656039202, + "loss": 1.4203, + "step": 20378 + }, + { + "epoch": 0.2648158104615824, + "grad_norm": 0.41145312786102295, + "learning_rate": 0.00014706715709848062, + "loss": 1.3495, + "step": 20379 + }, + { + "epoch": 0.2648288050054983, + "grad_norm": 0.40121734142303467, + "learning_rate": 0.00014706455763656924, + "loss": 1.4096, + "step": 20380 + }, + { + "epoch": 0.26484179954941417, + "grad_norm": 0.3555337190628052, + "learning_rate": 0.00014706195817465784, + "loss": 1.519, + "step": 20381 + }, + { + "epoch": 0.26485479409333007, + "grad_norm": 0.42210060358047485, + "learning_rate": 0.00014705935871274647, + "loss": 1.6939, + "step": 20382 + }, + { + "epoch": 0.2648677886372459, + "grad_norm": 0.40848425030708313, + "learning_rate": 0.0001470567592508351, + "loss": 1.5978, + "step": 20383 + }, + { + "epoch": 0.2648807831811618, + "grad_norm": 0.4455949068069458, + "learning_rate": 0.0001470541597889237, + "loss": 1.4914, + "step": 20384 + }, + { + "epoch": 0.26489377772507766, + "grad_norm": 0.3725610077381134, + "learning_rate": 0.0001470515603270123, + "loss": 1.4107, + "step": 20385 + }, + { + "epoch": 0.26490677226899356, + "grad_norm": 0.4815816581249237, + "learning_rate": 0.00014704896086510094, + "loss": 1.555, + "step": 20386 + }, + { + "epoch": 0.2649197668129094, + "grad_norm": 0.3977145850658417, + "learning_rate": 0.00014704636140318953, + "loss": 1.53, + "step": 20387 + }, + { + "epoch": 0.2649327613568253, + "grad_norm": 0.4359447956085205, + "learning_rate": 0.00014704376194127816, + "loss": 1.4012, + "step": 20388 + }, + { + "epoch": 0.26494575590074115, + "grad_norm": 0.40985336899757385, + "learning_rate": 0.00014704116247936676, + "loss": 1.4377, + "step": 20389 + }, + { + "epoch": 0.26495875044465705, + "grad_norm": 0.4714990556240082, + "learning_rate": 0.0001470385630174554, + "loss": 1.5874, + "step": 20390 + }, + { + "epoch": 0.2649717449885729, + "grad_norm": 0.4139081537723541, + "learning_rate": 0.000147035963555544, + "loss": 1.4625, + "step": 20391 + }, + { + "epoch": 0.2649847395324888, + "grad_norm": 0.37376633286476135, + "learning_rate": 0.00014703336409363263, + "loss": 1.3662, + "step": 20392 + }, + { + "epoch": 0.26499773407640465, + "grad_norm": 0.4207388758659363, + "learning_rate": 0.00014703076463172125, + "loss": 1.506, + "step": 20393 + }, + { + "epoch": 0.26501072862032055, + "grad_norm": 0.49392813444137573, + "learning_rate": 0.00014702816516980985, + "loss": 1.3898, + "step": 20394 + }, + { + "epoch": 0.2650237231642364, + "grad_norm": 0.36611130833625793, + "learning_rate": 0.00014702556570789848, + "loss": 1.35, + "step": 20395 + }, + { + "epoch": 0.2650367177081523, + "grad_norm": 0.42950478196144104, + "learning_rate": 0.00014702296624598707, + "loss": 1.2796, + "step": 20396 + }, + { + "epoch": 0.26504971225206814, + "grad_norm": 0.21927288174629211, + "learning_rate": 0.00014702036678407573, + "loss": 1.1613, + "step": 20397 + }, + { + "epoch": 0.26506270679598404, + "grad_norm": 0.35888615250587463, + "learning_rate": 0.00014701776732216432, + "loss": 1.7731, + "step": 20398 + }, + { + "epoch": 0.2650757013398999, + "grad_norm": 0.3721853196620941, + "learning_rate": 0.00014701516786025292, + "loss": 1.321, + "step": 20399 + }, + { + "epoch": 0.2650886958838158, + "grad_norm": 0.38958534598350525, + "learning_rate": 0.00014701256839834154, + "loss": 1.5856, + "step": 20400 + }, + { + "epoch": 0.26510169042773163, + "grad_norm": 0.36468446254730225, + "learning_rate": 0.00014700996893643017, + "loss": 1.3031, + "step": 20401 + }, + { + "epoch": 0.26511468497164753, + "grad_norm": 0.437661349773407, + "learning_rate": 0.0001470073694745188, + "loss": 1.4715, + "step": 20402 + }, + { + "epoch": 0.26512767951556343, + "grad_norm": 0.31876251101493835, + "learning_rate": 0.0001470047700126074, + "loss": 1.1394, + "step": 20403 + }, + { + "epoch": 0.2651406740594793, + "grad_norm": 0.38678017258644104, + "learning_rate": 0.00014700217055069602, + "loss": 1.4323, + "step": 20404 + }, + { + "epoch": 0.2651536686033952, + "grad_norm": 0.3743257522583008, + "learning_rate": 0.00014699957108878464, + "loss": 1.7135, + "step": 20405 + }, + { + "epoch": 0.265166663147311, + "grad_norm": 0.433645635843277, + "learning_rate": 0.00014699697162687324, + "loss": 1.5107, + "step": 20406 + }, + { + "epoch": 0.2651796576912269, + "grad_norm": 0.4228347837924957, + "learning_rate": 0.00014699437216496186, + "loss": 1.609, + "step": 20407 + }, + { + "epoch": 0.26519265223514277, + "grad_norm": 0.3998931646347046, + "learning_rate": 0.00014699177270305046, + "loss": 1.4465, + "step": 20408 + }, + { + "epoch": 0.26520564677905867, + "grad_norm": 0.4201618731021881, + "learning_rate": 0.0001469891732411391, + "loss": 1.4981, + "step": 20409 + }, + { + "epoch": 0.2652186413229745, + "grad_norm": 0.36882930994033813, + "learning_rate": 0.0001469865737792277, + "loss": 1.1966, + "step": 20410 + }, + { + "epoch": 0.2652316358668904, + "grad_norm": 0.23093043267726898, + "learning_rate": 0.00014698397431731633, + "loss": 1.2175, + "step": 20411 + }, + { + "epoch": 0.26524463041080626, + "grad_norm": 0.4628934860229492, + "learning_rate": 0.00014698137485540493, + "loss": 1.4904, + "step": 20412 + }, + { + "epoch": 0.26525762495472216, + "grad_norm": 0.4709840714931488, + "learning_rate": 0.00014697877539349355, + "loss": 1.5158, + "step": 20413 + }, + { + "epoch": 0.265270619498638, + "grad_norm": 0.4536694884300232, + "learning_rate": 0.00014697617593158218, + "loss": 1.3396, + "step": 20414 + }, + { + "epoch": 0.2652836140425539, + "grad_norm": 0.4625324308872223, + "learning_rate": 0.00014697357646967078, + "loss": 1.4676, + "step": 20415 + }, + { + "epoch": 0.26529660858646975, + "grad_norm": 0.3581303358078003, + "learning_rate": 0.0001469709770077594, + "loss": 1.4109, + "step": 20416 + }, + { + "epoch": 0.26530960313038565, + "grad_norm": 0.4025501012802124, + "learning_rate": 0.00014696837754584803, + "loss": 1.3674, + "step": 20417 + }, + { + "epoch": 0.2653225976743015, + "grad_norm": 0.513404369354248, + "learning_rate": 0.00014696577808393662, + "loss": 1.4814, + "step": 20418 + }, + { + "epoch": 0.2653355922182174, + "grad_norm": 0.4174301028251648, + "learning_rate": 0.00014696317862202525, + "loss": 1.4742, + "step": 20419 + }, + { + "epoch": 0.26534858676213324, + "grad_norm": 0.5026924014091492, + "learning_rate": 0.00014696057916011384, + "loss": 1.4351, + "step": 20420 + }, + { + "epoch": 0.26536158130604914, + "grad_norm": 0.4656890034675598, + "learning_rate": 0.0001469579796982025, + "loss": 1.4062, + "step": 20421 + }, + { + "epoch": 0.265374575849965, + "grad_norm": 0.37233859300613403, + "learning_rate": 0.0001469553802362911, + "loss": 1.3051, + "step": 20422 + }, + { + "epoch": 0.2653875703938809, + "grad_norm": 0.380974143743515, + "learning_rate": 0.00014695278077437972, + "loss": 1.4349, + "step": 20423 + }, + { + "epoch": 0.26540056493779673, + "grad_norm": 0.4362393617630005, + "learning_rate": 0.00014695018131246832, + "loss": 1.3495, + "step": 20424 + }, + { + "epoch": 0.26541355948171264, + "grad_norm": 0.3156451880931854, + "learning_rate": 0.00014694758185055694, + "loss": 1.3011, + "step": 20425 + }, + { + "epoch": 0.2654265540256285, + "grad_norm": 0.4349451959133148, + "learning_rate": 0.00014694498238864556, + "loss": 1.4841, + "step": 20426 + }, + { + "epoch": 0.2654395485695444, + "grad_norm": 0.3691495954990387, + "learning_rate": 0.00014694238292673416, + "loss": 1.351, + "step": 20427 + }, + { + "epoch": 0.2654525431134602, + "grad_norm": 0.35394132137298584, + "learning_rate": 0.00014693978346482279, + "loss": 1.4216, + "step": 20428 + }, + { + "epoch": 0.2654655376573761, + "grad_norm": 0.4816385507583618, + "learning_rate": 0.0001469371840029114, + "loss": 1.3449, + "step": 20429 + }, + { + "epoch": 0.265478532201292, + "grad_norm": 0.38406801223754883, + "learning_rate": 0.000146934584541, + "loss": 1.4388, + "step": 20430 + }, + { + "epoch": 0.2654915267452079, + "grad_norm": 0.37115398049354553, + "learning_rate": 0.00014693198507908863, + "loss": 1.2303, + "step": 20431 + }, + { + "epoch": 0.2655045212891237, + "grad_norm": 0.4292278587818146, + "learning_rate": 0.00014692938561717726, + "loss": 1.3919, + "step": 20432 + }, + { + "epoch": 0.2655175158330396, + "grad_norm": 0.4069332778453827, + "learning_rate": 0.00014692678615526588, + "loss": 1.5804, + "step": 20433 + }, + { + "epoch": 0.26553051037695546, + "grad_norm": 0.3018932342529297, + "learning_rate": 0.00014692418669335448, + "loss": 1.3315, + "step": 20434 + }, + { + "epoch": 0.26554350492087136, + "grad_norm": 0.3571458160877228, + "learning_rate": 0.0001469215872314431, + "loss": 1.3644, + "step": 20435 + }, + { + "epoch": 0.2655564994647872, + "grad_norm": 0.4125556945800781, + "learning_rate": 0.00014691898776953173, + "loss": 1.5809, + "step": 20436 + }, + { + "epoch": 0.2655694940087031, + "grad_norm": 0.4639059901237488, + "learning_rate": 0.00014691638830762033, + "loss": 1.4488, + "step": 20437 + }, + { + "epoch": 0.26558248855261896, + "grad_norm": 0.5177050232887268, + "learning_rate": 0.00014691378884570895, + "loss": 1.5209, + "step": 20438 + }, + { + "epoch": 0.26559548309653486, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.00014691118938379755, + "loss": 1.2416, + "step": 20439 + }, + { + "epoch": 0.2656084776404507, + "grad_norm": 0.2878570556640625, + "learning_rate": 0.0001469085899218862, + "loss": 1.3895, + "step": 20440 + }, + { + "epoch": 0.2656214721843666, + "grad_norm": 0.40805357694625854, + "learning_rate": 0.0001469059904599748, + "loss": 1.3319, + "step": 20441 + }, + { + "epoch": 0.26563446672828245, + "grad_norm": 0.28271886706352234, + "learning_rate": 0.0001469033909980634, + "loss": 1.366, + "step": 20442 + }, + { + "epoch": 0.26564746127219835, + "grad_norm": 0.3878540098667145, + "learning_rate": 0.00014690079153615202, + "loss": 1.4347, + "step": 20443 + }, + { + "epoch": 0.2656604558161142, + "grad_norm": 0.3423001170158386, + "learning_rate": 0.00014689819207424064, + "loss": 1.5316, + "step": 20444 + }, + { + "epoch": 0.2656734503600301, + "grad_norm": 0.3997270166873932, + "learning_rate": 0.00014689559261232927, + "loss": 1.391, + "step": 20445 + }, + { + "epoch": 0.26568644490394594, + "grad_norm": 0.4809684455394745, + "learning_rate": 0.00014689299315041786, + "loss": 1.4126, + "step": 20446 + }, + { + "epoch": 0.26569943944786184, + "grad_norm": 0.4122914969921112, + "learning_rate": 0.0001468903936885065, + "loss": 1.3593, + "step": 20447 + }, + { + "epoch": 0.2657124339917777, + "grad_norm": 0.4307091236114502, + "learning_rate": 0.0001468877942265951, + "loss": 1.4189, + "step": 20448 + }, + { + "epoch": 0.2657254285356936, + "grad_norm": 0.39407795667648315, + "learning_rate": 0.0001468851947646837, + "loss": 1.3267, + "step": 20449 + }, + { + "epoch": 0.26573842307960943, + "grad_norm": 0.3779304623603821, + "learning_rate": 0.00014688259530277234, + "loss": 1.2678, + "step": 20450 + }, + { + "epoch": 0.26575141762352533, + "grad_norm": 0.47362807393074036, + "learning_rate": 0.00014687999584086093, + "loss": 1.4144, + "step": 20451 + }, + { + "epoch": 0.2657644121674412, + "grad_norm": 0.36937618255615234, + "learning_rate": 0.00014687739637894958, + "loss": 1.4565, + "step": 20452 + }, + { + "epoch": 0.2657774067113571, + "grad_norm": 0.42991894483566284, + "learning_rate": 0.00014687479691703818, + "loss": 1.403, + "step": 20453 + }, + { + "epoch": 0.2657904012552729, + "grad_norm": 0.5301804542541504, + "learning_rate": 0.00014687219745512678, + "loss": 1.3587, + "step": 20454 + }, + { + "epoch": 0.2658033957991888, + "grad_norm": 0.306618332862854, + "learning_rate": 0.0001468695979932154, + "loss": 1.3203, + "step": 20455 + }, + { + "epoch": 0.26581639034310467, + "grad_norm": 0.41566163301467896, + "learning_rate": 0.00014686699853130403, + "loss": 1.2821, + "step": 20456 + }, + { + "epoch": 0.26582938488702057, + "grad_norm": 0.36690419912338257, + "learning_rate": 0.00014686439906939265, + "loss": 1.4334, + "step": 20457 + }, + { + "epoch": 0.2658423794309364, + "grad_norm": 0.49747782945632935, + "learning_rate": 0.00014686179960748125, + "loss": 1.4742, + "step": 20458 + }, + { + "epoch": 0.2658553739748523, + "grad_norm": 0.40697649121284485, + "learning_rate": 0.00014685920014556987, + "loss": 1.281, + "step": 20459 + }, + { + "epoch": 0.26586836851876816, + "grad_norm": 0.36211541295051575, + "learning_rate": 0.0001468566006836585, + "loss": 1.4503, + "step": 20460 + }, + { + "epoch": 0.26588136306268406, + "grad_norm": 0.3339312672615051, + "learning_rate": 0.0001468540012217471, + "loss": 1.2378, + "step": 20461 + }, + { + "epoch": 0.2658943576065999, + "grad_norm": 0.31429940462112427, + "learning_rate": 0.00014685140175983572, + "loss": 1.4091, + "step": 20462 + }, + { + "epoch": 0.2659073521505158, + "grad_norm": 0.4537610411643982, + "learning_rate": 0.00014684880229792432, + "loss": 1.5026, + "step": 20463 + }, + { + "epoch": 0.26592034669443165, + "grad_norm": 0.47489133477211, + "learning_rate": 0.00014684620283601297, + "loss": 1.3688, + "step": 20464 + }, + { + "epoch": 0.26593334123834755, + "grad_norm": 0.4213125705718994, + "learning_rate": 0.00014684360337410157, + "loss": 1.3503, + "step": 20465 + }, + { + "epoch": 0.2659463357822634, + "grad_norm": 0.46165016293525696, + "learning_rate": 0.0001468410039121902, + "loss": 1.418, + "step": 20466 + }, + { + "epoch": 0.2659593303261793, + "grad_norm": 0.3794310688972473, + "learning_rate": 0.00014683840445027882, + "loss": 1.4213, + "step": 20467 + }, + { + "epoch": 0.26597232487009514, + "grad_norm": 0.3580165505409241, + "learning_rate": 0.0001468358049883674, + "loss": 1.4313, + "step": 20468 + }, + { + "epoch": 0.26598531941401105, + "grad_norm": 0.46021631360054016, + "learning_rate": 0.00014683320552645604, + "loss": 1.5658, + "step": 20469 + }, + { + "epoch": 0.2659983139579269, + "grad_norm": 0.40520593523979187, + "learning_rate": 0.00014683060606454464, + "loss": 1.3848, + "step": 20470 + }, + { + "epoch": 0.2660113085018428, + "grad_norm": 0.3961764872074127, + "learning_rate": 0.00014682800660263326, + "loss": 1.5738, + "step": 20471 + }, + { + "epoch": 0.26602430304575864, + "grad_norm": 0.4557397961616516, + "learning_rate": 0.00014682540714072188, + "loss": 1.3206, + "step": 20472 + }, + { + "epoch": 0.26603729758967454, + "grad_norm": 0.4219590425491333, + "learning_rate": 0.00014682280767881048, + "loss": 1.501, + "step": 20473 + }, + { + "epoch": 0.2660502921335904, + "grad_norm": 0.42460018396377563, + "learning_rate": 0.0001468202082168991, + "loss": 1.296, + "step": 20474 + }, + { + "epoch": 0.2660632866775063, + "grad_norm": 0.35993239283561707, + "learning_rate": 0.00014681760875498773, + "loss": 1.3469, + "step": 20475 + }, + { + "epoch": 0.26607628122142213, + "grad_norm": 0.3286518454551697, + "learning_rate": 0.00014681500929307636, + "loss": 1.3153, + "step": 20476 + }, + { + "epoch": 0.26608927576533803, + "grad_norm": 0.30503278970718384, + "learning_rate": 0.00014681240983116495, + "loss": 1.4186, + "step": 20477 + }, + { + "epoch": 0.2661022703092539, + "grad_norm": 0.4529259204864502, + "learning_rate": 0.00014680981036925358, + "loss": 1.3746, + "step": 20478 + }, + { + "epoch": 0.2661152648531698, + "grad_norm": 0.38732701539993286, + "learning_rate": 0.0001468072109073422, + "loss": 1.3698, + "step": 20479 + }, + { + "epoch": 0.2661282593970857, + "grad_norm": 0.40228399634361267, + "learning_rate": 0.0001468046114454308, + "loss": 1.1749, + "step": 20480 + }, + { + "epoch": 0.2661412539410015, + "grad_norm": 0.4193732440471649, + "learning_rate": 0.00014680201198351942, + "loss": 1.3749, + "step": 20481 + }, + { + "epoch": 0.2661542484849174, + "grad_norm": 0.46410825848579407, + "learning_rate": 0.00014679941252160802, + "loss": 1.3536, + "step": 20482 + }, + { + "epoch": 0.26616724302883327, + "grad_norm": 0.3774057626724243, + "learning_rate": 0.00014679681305969665, + "loss": 1.3258, + "step": 20483 + }, + { + "epoch": 0.26618023757274917, + "grad_norm": 0.3716343343257904, + "learning_rate": 0.00014679421359778527, + "loss": 1.4639, + "step": 20484 + }, + { + "epoch": 0.266193232116665, + "grad_norm": 0.3497524857521057, + "learning_rate": 0.00014679161413587387, + "loss": 1.3988, + "step": 20485 + }, + { + "epoch": 0.2662062266605809, + "grad_norm": 0.37190037965774536, + "learning_rate": 0.0001467890146739625, + "loss": 1.415, + "step": 20486 + }, + { + "epoch": 0.26621922120449676, + "grad_norm": 0.42416632175445557, + "learning_rate": 0.00014678641521205112, + "loss": 1.4715, + "step": 20487 + }, + { + "epoch": 0.26623221574841266, + "grad_norm": 0.43514132499694824, + "learning_rate": 0.00014678381575013974, + "loss": 1.3162, + "step": 20488 + }, + { + "epoch": 0.2662452102923285, + "grad_norm": 0.43945401906967163, + "learning_rate": 0.00014678121628822834, + "loss": 1.4666, + "step": 20489 + }, + { + "epoch": 0.2662582048362444, + "grad_norm": 0.4091470241546631, + "learning_rate": 0.00014677861682631696, + "loss": 1.5216, + "step": 20490 + }, + { + "epoch": 0.26627119938016025, + "grad_norm": 0.3419691324234009, + "learning_rate": 0.0001467760173644056, + "loss": 1.3405, + "step": 20491 + }, + { + "epoch": 0.26628419392407615, + "grad_norm": 0.378904789686203, + "learning_rate": 0.00014677341790249418, + "loss": 1.4309, + "step": 20492 + }, + { + "epoch": 0.266297188467992, + "grad_norm": 0.39966049790382385, + "learning_rate": 0.0001467708184405828, + "loss": 1.5508, + "step": 20493 + }, + { + "epoch": 0.2663101830119079, + "grad_norm": 0.3456076979637146, + "learning_rate": 0.0001467682189786714, + "loss": 1.2997, + "step": 20494 + }, + { + "epoch": 0.26632317755582374, + "grad_norm": 0.5551380515098572, + "learning_rate": 0.00014676561951676006, + "loss": 1.6379, + "step": 20495 + }, + { + "epoch": 0.26633617209973964, + "grad_norm": 0.4086301922798157, + "learning_rate": 0.00014676302005484866, + "loss": 1.3887, + "step": 20496 + }, + { + "epoch": 0.2663491666436555, + "grad_norm": 0.5633037686347961, + "learning_rate": 0.00014676042059293725, + "loss": 1.482, + "step": 20497 + }, + { + "epoch": 0.2663621611875714, + "grad_norm": 0.4641495645046234, + "learning_rate": 0.00014675782113102588, + "loss": 1.4203, + "step": 20498 + }, + { + "epoch": 0.26637515573148723, + "grad_norm": 0.367080956697464, + "learning_rate": 0.0001467552216691145, + "loss": 1.5495, + "step": 20499 + }, + { + "epoch": 0.26638815027540313, + "grad_norm": 0.4504353404045105, + "learning_rate": 0.00014675262220720313, + "loss": 1.3292, + "step": 20500 + }, + { + "epoch": 0.266401144819319, + "grad_norm": 0.4139452576637268, + "learning_rate": 0.00014675002274529172, + "loss": 1.4385, + "step": 20501 + }, + { + "epoch": 0.2664141393632349, + "grad_norm": 0.3960649073123932, + "learning_rate": 0.00014674742328338035, + "loss": 1.4685, + "step": 20502 + }, + { + "epoch": 0.2664271339071507, + "grad_norm": 0.4675760269165039, + "learning_rate": 0.00014674482382146897, + "loss": 1.4818, + "step": 20503 + }, + { + "epoch": 0.2664401284510666, + "grad_norm": 0.46072453260421753, + "learning_rate": 0.00014674222435955757, + "loss": 1.3335, + "step": 20504 + }, + { + "epoch": 0.26645312299498247, + "grad_norm": 0.39284059405326843, + "learning_rate": 0.0001467396248976462, + "loss": 1.5761, + "step": 20505 + }, + { + "epoch": 0.2664661175388984, + "grad_norm": 0.2924778163433075, + "learning_rate": 0.00014673702543573482, + "loss": 1.5178, + "step": 20506 + }, + { + "epoch": 0.2664791120828142, + "grad_norm": 0.40896594524383545, + "learning_rate": 0.00014673442597382344, + "loss": 1.4503, + "step": 20507 + }, + { + "epoch": 0.2664921066267301, + "grad_norm": 0.3824611008167267, + "learning_rate": 0.00014673182651191204, + "loss": 1.4965, + "step": 20508 + }, + { + "epoch": 0.26650510117064596, + "grad_norm": 0.4336054027080536, + "learning_rate": 0.00014672922705000064, + "loss": 1.4405, + "step": 20509 + }, + { + "epoch": 0.26651809571456186, + "grad_norm": 0.3573705852031708, + "learning_rate": 0.0001467266275880893, + "loss": 1.3256, + "step": 20510 + }, + { + "epoch": 0.2665310902584777, + "grad_norm": 0.2696384787559509, + "learning_rate": 0.0001467240281261779, + "loss": 1.1086, + "step": 20511 + }, + { + "epoch": 0.2665440848023936, + "grad_norm": 0.4157159924507141, + "learning_rate": 0.0001467214286642665, + "loss": 1.4728, + "step": 20512 + }, + { + "epoch": 0.26655707934630946, + "grad_norm": 0.3976568579673767, + "learning_rate": 0.0001467188292023551, + "loss": 1.3135, + "step": 20513 + }, + { + "epoch": 0.26657007389022536, + "grad_norm": 0.42020806670188904, + "learning_rate": 0.00014671622974044373, + "loss": 1.4906, + "step": 20514 + }, + { + "epoch": 0.2665830684341412, + "grad_norm": 0.48886722326278687, + "learning_rate": 0.00014671363027853236, + "loss": 1.5235, + "step": 20515 + }, + { + "epoch": 0.2665960629780571, + "grad_norm": 0.3221413791179657, + "learning_rate": 0.00014671103081662095, + "loss": 1.2387, + "step": 20516 + }, + { + "epoch": 0.26660905752197295, + "grad_norm": 0.4440333843231201, + "learning_rate": 0.00014670843135470958, + "loss": 1.3829, + "step": 20517 + }, + { + "epoch": 0.26662205206588885, + "grad_norm": 0.2754184901714325, + "learning_rate": 0.0001467058318927982, + "loss": 1.4253, + "step": 20518 + }, + { + "epoch": 0.2666350466098047, + "grad_norm": 0.3203902840614319, + "learning_rate": 0.00014670323243088683, + "loss": 1.2764, + "step": 20519 + }, + { + "epoch": 0.2666480411537206, + "grad_norm": 0.3556414246559143, + "learning_rate": 0.00014670063296897543, + "loss": 1.4171, + "step": 20520 + }, + { + "epoch": 0.26666103569763644, + "grad_norm": 0.4025947153568268, + "learning_rate": 0.00014669803350706402, + "loss": 1.3654, + "step": 20521 + }, + { + "epoch": 0.26667403024155234, + "grad_norm": 0.3941269516944885, + "learning_rate": 0.00014669543404515267, + "loss": 1.5005, + "step": 20522 + }, + { + "epoch": 0.2666870247854682, + "grad_norm": 0.45404061675071716, + "learning_rate": 0.00014669283458324127, + "loss": 1.4261, + "step": 20523 + }, + { + "epoch": 0.2667000193293841, + "grad_norm": 0.3397692143917084, + "learning_rate": 0.0001466902351213299, + "loss": 1.3216, + "step": 20524 + }, + { + "epoch": 0.26671301387329993, + "grad_norm": 0.43843889236450195, + "learning_rate": 0.0001466876356594185, + "loss": 1.4268, + "step": 20525 + }, + { + "epoch": 0.26672600841721583, + "grad_norm": 0.48928913474082947, + "learning_rate": 0.00014668503619750712, + "loss": 1.5124, + "step": 20526 + }, + { + "epoch": 0.2667390029611317, + "grad_norm": 0.3399357199668884, + "learning_rate": 0.00014668243673559574, + "loss": 1.3637, + "step": 20527 + }, + { + "epoch": 0.2667519975050476, + "grad_norm": 0.421852171421051, + "learning_rate": 0.00014667983727368434, + "loss": 1.4828, + "step": 20528 + }, + { + "epoch": 0.2667649920489634, + "grad_norm": 0.4316619634628296, + "learning_rate": 0.00014667723781177296, + "loss": 1.3984, + "step": 20529 + }, + { + "epoch": 0.2667779865928793, + "grad_norm": 0.4099975526332855, + "learning_rate": 0.0001466746383498616, + "loss": 1.5689, + "step": 20530 + }, + { + "epoch": 0.26679098113679517, + "grad_norm": 0.337446004152298, + "learning_rate": 0.00014667203888795021, + "loss": 1.2349, + "step": 20531 + }, + { + "epoch": 0.26680397568071107, + "grad_norm": 0.4579349458217621, + "learning_rate": 0.0001466694394260388, + "loss": 1.3794, + "step": 20532 + }, + { + "epoch": 0.2668169702246269, + "grad_norm": 0.5047162175178528, + "learning_rate": 0.00014666683996412744, + "loss": 1.5072, + "step": 20533 + }, + { + "epoch": 0.2668299647685428, + "grad_norm": 0.3554355800151825, + "learning_rate": 0.00014666424050221606, + "loss": 1.5544, + "step": 20534 + }, + { + "epoch": 0.26684295931245866, + "grad_norm": 0.3911932110786438, + "learning_rate": 0.00014666164104030466, + "loss": 1.4133, + "step": 20535 + }, + { + "epoch": 0.26685595385637456, + "grad_norm": 0.43749839067459106, + "learning_rate": 0.00014665904157839328, + "loss": 1.2183, + "step": 20536 + }, + { + "epoch": 0.2668689484002904, + "grad_norm": 0.40340352058410645, + "learning_rate": 0.00014665644211648188, + "loss": 1.3961, + "step": 20537 + }, + { + "epoch": 0.2668819429442063, + "grad_norm": 0.44622188806533813, + "learning_rate": 0.0001466538426545705, + "loss": 1.5031, + "step": 20538 + }, + { + "epoch": 0.26689493748812215, + "grad_norm": 0.3573788106441498, + "learning_rate": 0.00014665124319265913, + "loss": 1.4739, + "step": 20539 + }, + { + "epoch": 0.26690793203203805, + "grad_norm": 0.3120102286338806, + "learning_rate": 0.00014664864373074773, + "loss": 1.5174, + "step": 20540 + }, + { + "epoch": 0.2669209265759539, + "grad_norm": 0.4040786325931549, + "learning_rate": 0.00014664604426883638, + "loss": 1.2888, + "step": 20541 + }, + { + "epoch": 0.2669339211198698, + "grad_norm": 0.3949944078922272, + "learning_rate": 0.00014664344480692497, + "loss": 1.3514, + "step": 20542 + }, + { + "epoch": 0.26694691566378564, + "grad_norm": 0.4573860466480255, + "learning_rate": 0.0001466408453450136, + "loss": 1.4011, + "step": 20543 + }, + { + "epoch": 0.26695991020770155, + "grad_norm": 0.4439416527748108, + "learning_rate": 0.0001466382458831022, + "loss": 1.4129, + "step": 20544 + }, + { + "epoch": 0.2669729047516174, + "grad_norm": 0.34848085045814514, + "learning_rate": 0.00014663564642119082, + "loss": 1.515, + "step": 20545 + }, + { + "epoch": 0.2669858992955333, + "grad_norm": 0.39392855763435364, + "learning_rate": 0.00014663304695927945, + "loss": 1.2974, + "step": 20546 + }, + { + "epoch": 0.26699889383944914, + "grad_norm": 0.3618329167366028, + "learning_rate": 0.00014663044749736804, + "loss": 1.5627, + "step": 20547 + }, + { + "epoch": 0.26701188838336504, + "grad_norm": 0.37951064109802246, + "learning_rate": 0.00014662784803545667, + "loss": 1.4563, + "step": 20548 + }, + { + "epoch": 0.2670248829272809, + "grad_norm": 0.3968099057674408, + "learning_rate": 0.0001466252485735453, + "loss": 1.5636, + "step": 20549 + }, + { + "epoch": 0.2670378774711968, + "grad_norm": 0.47305813431739807, + "learning_rate": 0.00014662264911163392, + "loss": 1.3466, + "step": 20550 + }, + { + "epoch": 0.26705087201511263, + "grad_norm": 0.3422969877719879, + "learning_rate": 0.00014662004964972251, + "loss": 1.2357, + "step": 20551 + }, + { + "epoch": 0.26706386655902853, + "grad_norm": 0.26564934849739075, + "learning_rate": 0.0001466174501878111, + "loss": 1.3822, + "step": 20552 + }, + { + "epoch": 0.2670768611029444, + "grad_norm": 0.38466688990592957, + "learning_rate": 0.00014661485072589976, + "loss": 1.6854, + "step": 20553 + }, + { + "epoch": 0.2670898556468603, + "grad_norm": 0.42451873421669006, + "learning_rate": 0.00014661225126398836, + "loss": 1.3781, + "step": 20554 + }, + { + "epoch": 0.2671028501907762, + "grad_norm": 0.3984309136867523, + "learning_rate": 0.00014660965180207698, + "loss": 1.5276, + "step": 20555 + }, + { + "epoch": 0.267115844734692, + "grad_norm": 0.32337599992752075, + "learning_rate": 0.00014660705234016558, + "loss": 1.5079, + "step": 20556 + }, + { + "epoch": 0.2671288392786079, + "grad_norm": 0.34577375650405884, + "learning_rate": 0.0001466044528782542, + "loss": 1.3852, + "step": 20557 + }, + { + "epoch": 0.26714183382252377, + "grad_norm": 0.324277400970459, + "learning_rate": 0.00014660185341634283, + "loss": 1.5501, + "step": 20558 + }, + { + "epoch": 0.26715482836643967, + "grad_norm": 0.3926406502723694, + "learning_rate": 0.00014659925395443143, + "loss": 1.4244, + "step": 20559 + }, + { + "epoch": 0.2671678229103555, + "grad_norm": 0.3978142738342285, + "learning_rate": 0.00014659665449252005, + "loss": 1.3282, + "step": 20560 + }, + { + "epoch": 0.2671808174542714, + "grad_norm": 0.48755988478660583, + "learning_rate": 0.00014659405503060868, + "loss": 1.4851, + "step": 20561 + }, + { + "epoch": 0.26719381199818726, + "grad_norm": 0.39426976442337036, + "learning_rate": 0.0001465914555686973, + "loss": 1.4414, + "step": 20562 + }, + { + "epoch": 0.26720680654210316, + "grad_norm": 0.5472205281257629, + "learning_rate": 0.0001465888561067859, + "loss": 1.4487, + "step": 20563 + }, + { + "epoch": 0.267219801086019, + "grad_norm": 0.3972024619579315, + "learning_rate": 0.0001465862566448745, + "loss": 1.5969, + "step": 20564 + }, + { + "epoch": 0.2672327956299349, + "grad_norm": 0.4485118091106415, + "learning_rate": 0.00014658365718296315, + "loss": 1.3231, + "step": 20565 + }, + { + "epoch": 0.26724579017385075, + "grad_norm": 0.38982245326042175, + "learning_rate": 0.00014658105772105175, + "loss": 1.3931, + "step": 20566 + }, + { + "epoch": 0.26725878471776665, + "grad_norm": 0.397883802652359, + "learning_rate": 0.00014657845825914037, + "loss": 1.5746, + "step": 20567 + }, + { + "epoch": 0.2672717792616825, + "grad_norm": 0.38800114393234253, + "learning_rate": 0.00014657585879722897, + "loss": 1.3589, + "step": 20568 + }, + { + "epoch": 0.2672847738055984, + "grad_norm": 0.39322930574417114, + "learning_rate": 0.0001465732593353176, + "loss": 1.5264, + "step": 20569 + }, + { + "epoch": 0.26729776834951424, + "grad_norm": 0.4672949016094208, + "learning_rate": 0.00014657065987340622, + "loss": 1.5386, + "step": 20570 + }, + { + "epoch": 0.26731076289343014, + "grad_norm": 0.36268308758735657, + "learning_rate": 0.00014656806041149481, + "loss": 1.2384, + "step": 20571 + }, + { + "epoch": 0.267323757437346, + "grad_norm": 0.3658696413040161, + "learning_rate": 0.00014656546094958344, + "loss": 1.3069, + "step": 20572 + }, + { + "epoch": 0.2673367519812619, + "grad_norm": 0.36909234523773193, + "learning_rate": 0.00014656286148767206, + "loss": 1.3643, + "step": 20573 + }, + { + "epoch": 0.26734974652517773, + "grad_norm": 0.3710574209690094, + "learning_rate": 0.0001465602620257607, + "loss": 1.448, + "step": 20574 + }, + { + "epoch": 0.26736274106909363, + "grad_norm": 0.4195187985897064, + "learning_rate": 0.00014655766256384928, + "loss": 1.1507, + "step": 20575 + }, + { + "epoch": 0.2673757356130095, + "grad_norm": 0.30392158031463623, + "learning_rate": 0.00014655506310193788, + "loss": 1.2132, + "step": 20576 + }, + { + "epoch": 0.2673887301569254, + "grad_norm": 0.37641626596450806, + "learning_rate": 0.00014655246364002653, + "loss": 1.425, + "step": 20577 + }, + { + "epoch": 0.2674017247008412, + "grad_norm": 0.39774182438850403, + "learning_rate": 0.00014654986417811513, + "loss": 1.3475, + "step": 20578 + }, + { + "epoch": 0.2674147192447571, + "grad_norm": 0.45070627331733704, + "learning_rate": 0.00014654726471620376, + "loss": 1.3981, + "step": 20579 + }, + { + "epoch": 0.26742771378867297, + "grad_norm": 0.429548978805542, + "learning_rate": 0.00014654466525429238, + "loss": 1.4826, + "step": 20580 + }, + { + "epoch": 0.2674407083325889, + "grad_norm": 0.4077950119972229, + "learning_rate": 0.00014654206579238098, + "loss": 1.5791, + "step": 20581 + }, + { + "epoch": 0.2674537028765047, + "grad_norm": 0.4084833562374115, + "learning_rate": 0.0001465394663304696, + "loss": 1.3112, + "step": 20582 + }, + { + "epoch": 0.2674666974204206, + "grad_norm": 0.43050774931907654, + "learning_rate": 0.0001465368668685582, + "loss": 1.4577, + "step": 20583 + }, + { + "epoch": 0.26747969196433646, + "grad_norm": 0.408719539642334, + "learning_rate": 0.00014653426740664685, + "loss": 1.4389, + "step": 20584 + }, + { + "epoch": 0.26749268650825236, + "grad_norm": 0.3926236629486084, + "learning_rate": 0.00014653166794473545, + "loss": 1.3572, + "step": 20585 + }, + { + "epoch": 0.2675056810521682, + "grad_norm": 0.3172488808631897, + "learning_rate": 0.00014652906848282407, + "loss": 1.3145, + "step": 20586 + }, + { + "epoch": 0.2675186755960841, + "grad_norm": 0.3986409604549408, + "learning_rate": 0.00014652646902091267, + "loss": 1.43, + "step": 20587 + }, + { + "epoch": 0.26753167013999996, + "grad_norm": 0.3207913339138031, + "learning_rate": 0.0001465238695590013, + "loss": 1.5232, + "step": 20588 + }, + { + "epoch": 0.26754466468391586, + "grad_norm": 0.3617097735404968, + "learning_rate": 0.00014652127009708992, + "loss": 1.5102, + "step": 20589 + }, + { + "epoch": 0.2675576592278317, + "grad_norm": 0.4643590748310089, + "learning_rate": 0.00014651867063517852, + "loss": 1.5157, + "step": 20590 + }, + { + "epoch": 0.2675706537717476, + "grad_norm": 0.43895310163497925, + "learning_rate": 0.00014651607117326714, + "loss": 1.5303, + "step": 20591 + }, + { + "epoch": 0.26758364831566345, + "grad_norm": 0.3952551782131195, + "learning_rate": 0.00014651347171135577, + "loss": 1.393, + "step": 20592 + }, + { + "epoch": 0.26759664285957935, + "grad_norm": 0.37808409333229065, + "learning_rate": 0.00014651087224944436, + "loss": 1.287, + "step": 20593 + }, + { + "epoch": 0.2676096374034952, + "grad_norm": 0.4574804902076721, + "learning_rate": 0.000146508272787533, + "loss": 1.3884, + "step": 20594 + }, + { + "epoch": 0.2676226319474111, + "grad_norm": 0.4842263162136078, + "learning_rate": 0.00014650567332562158, + "loss": 1.4964, + "step": 20595 + }, + { + "epoch": 0.26763562649132694, + "grad_norm": 0.3509844243526459, + "learning_rate": 0.00014650307386371024, + "loss": 1.6198, + "step": 20596 + }, + { + "epoch": 0.26764862103524284, + "grad_norm": 0.42597100138664246, + "learning_rate": 0.00014650047440179883, + "loss": 1.4549, + "step": 20597 + }, + { + "epoch": 0.2676616155791587, + "grad_norm": 0.45165514945983887, + "learning_rate": 0.00014649787493988746, + "loss": 1.425, + "step": 20598 + }, + { + "epoch": 0.2676746101230746, + "grad_norm": 0.38156551122665405, + "learning_rate": 0.00014649527547797606, + "loss": 1.6365, + "step": 20599 + }, + { + "epoch": 0.26768760466699043, + "grad_norm": 0.3806611895561218, + "learning_rate": 0.00014649267601606468, + "loss": 1.4636, + "step": 20600 + }, + { + "epoch": 0.26770059921090633, + "grad_norm": 0.3957795798778534, + "learning_rate": 0.0001464900765541533, + "loss": 1.4103, + "step": 20601 + }, + { + "epoch": 0.2677135937548222, + "grad_norm": 0.29333290457725525, + "learning_rate": 0.0001464874770922419, + "loss": 1.2337, + "step": 20602 + }, + { + "epoch": 0.2677265882987381, + "grad_norm": 0.3904321491718292, + "learning_rate": 0.00014648487763033053, + "loss": 1.5009, + "step": 20603 + }, + { + "epoch": 0.2677395828426539, + "grad_norm": 0.40922266244888306, + "learning_rate": 0.00014648227816841915, + "loss": 1.7033, + "step": 20604 + }, + { + "epoch": 0.2677525773865698, + "grad_norm": 0.35216403007507324, + "learning_rate": 0.00014647967870650775, + "loss": 1.3186, + "step": 20605 + }, + { + "epoch": 0.26776557193048567, + "grad_norm": 0.35933560132980347, + "learning_rate": 0.00014647707924459637, + "loss": 1.3302, + "step": 20606 + }, + { + "epoch": 0.26777856647440157, + "grad_norm": 0.28483909368515015, + "learning_rate": 0.00014647447978268497, + "loss": 1.3749, + "step": 20607 + }, + { + "epoch": 0.2677915610183174, + "grad_norm": 0.423801451921463, + "learning_rate": 0.00014647188032077362, + "loss": 1.4734, + "step": 20608 + }, + { + "epoch": 0.2678045555622333, + "grad_norm": 0.27995800971984863, + "learning_rate": 0.00014646928085886222, + "loss": 1.3285, + "step": 20609 + }, + { + "epoch": 0.26781755010614916, + "grad_norm": 0.5021472573280334, + "learning_rate": 0.00014646668139695084, + "loss": 1.4605, + "step": 20610 + }, + { + "epoch": 0.26783054465006506, + "grad_norm": 0.39849868416786194, + "learning_rate": 0.00014646408193503944, + "loss": 1.3853, + "step": 20611 + }, + { + "epoch": 0.2678435391939809, + "grad_norm": 0.30901312828063965, + "learning_rate": 0.00014646148247312807, + "loss": 1.1987, + "step": 20612 + }, + { + "epoch": 0.2678565337378968, + "grad_norm": 0.3241737186908722, + "learning_rate": 0.0001464588830112167, + "loss": 1.4156, + "step": 20613 + }, + { + "epoch": 0.26786952828181265, + "grad_norm": 0.43412768840789795, + "learning_rate": 0.0001464562835493053, + "loss": 1.465, + "step": 20614 + }, + { + "epoch": 0.26788252282572855, + "grad_norm": 0.3654631972312927, + "learning_rate": 0.00014645368408739394, + "loss": 1.2729, + "step": 20615 + }, + { + "epoch": 0.2678955173696444, + "grad_norm": 0.3572291135787964, + "learning_rate": 0.00014645108462548254, + "loss": 1.5252, + "step": 20616 + }, + { + "epoch": 0.2679085119135603, + "grad_norm": 0.4149107336997986, + "learning_rate": 0.00014644848516357116, + "loss": 1.4198, + "step": 20617 + }, + { + "epoch": 0.26792150645747614, + "grad_norm": 0.286456435918808, + "learning_rate": 0.00014644588570165976, + "loss": 1.2797, + "step": 20618 + }, + { + "epoch": 0.26793450100139204, + "grad_norm": 0.40724557638168335, + "learning_rate": 0.00014644328623974838, + "loss": 1.4218, + "step": 20619 + }, + { + "epoch": 0.2679474955453079, + "grad_norm": 0.3967374265193939, + "learning_rate": 0.000146440686777837, + "loss": 1.5093, + "step": 20620 + }, + { + "epoch": 0.2679604900892238, + "grad_norm": 0.4456663131713867, + "learning_rate": 0.0001464380873159256, + "loss": 1.6117, + "step": 20621 + }, + { + "epoch": 0.26797348463313964, + "grad_norm": 0.35312244296073914, + "learning_rate": 0.00014643548785401423, + "loss": 1.3902, + "step": 20622 + }, + { + "epoch": 0.26798647917705554, + "grad_norm": 0.3175770938396454, + "learning_rate": 0.00014643288839210285, + "loss": 1.3318, + "step": 20623 + }, + { + "epoch": 0.2679994737209714, + "grad_norm": 0.384489506483078, + "learning_rate": 0.00014643028893019145, + "loss": 1.4892, + "step": 20624 + }, + { + "epoch": 0.2680124682648873, + "grad_norm": 0.3831591010093689, + "learning_rate": 0.00014642768946828008, + "loss": 1.3784, + "step": 20625 + }, + { + "epoch": 0.2680254628088031, + "grad_norm": 0.38771161437034607, + "learning_rate": 0.00014642509000636867, + "loss": 1.4784, + "step": 20626 + }, + { + "epoch": 0.26803845735271903, + "grad_norm": 0.4399231970310211, + "learning_rate": 0.00014642249054445732, + "loss": 1.398, + "step": 20627 + }, + { + "epoch": 0.2680514518966349, + "grad_norm": 0.4146258533000946, + "learning_rate": 0.00014641989108254592, + "loss": 1.4837, + "step": 20628 + }, + { + "epoch": 0.2680644464405508, + "grad_norm": 0.4017559587955475, + "learning_rate": 0.00014641729162063455, + "loss": 1.3852, + "step": 20629 + }, + { + "epoch": 0.2680774409844666, + "grad_norm": 0.43311452865600586, + "learning_rate": 0.00014641469215872314, + "loss": 1.5727, + "step": 20630 + }, + { + "epoch": 0.2680904355283825, + "grad_norm": 0.461364209651947, + "learning_rate": 0.00014641209269681177, + "loss": 1.2905, + "step": 20631 + }, + { + "epoch": 0.2681034300722984, + "grad_norm": 0.37731269001960754, + "learning_rate": 0.0001464094932349004, + "loss": 1.2865, + "step": 20632 + }, + { + "epoch": 0.26811642461621427, + "grad_norm": 0.4365038275718689, + "learning_rate": 0.000146406893772989, + "loss": 1.3654, + "step": 20633 + }, + { + "epoch": 0.26812941916013017, + "grad_norm": 0.4046669602394104, + "learning_rate": 0.00014640429431107761, + "loss": 1.5534, + "step": 20634 + }, + { + "epoch": 0.268142413704046, + "grad_norm": 0.42849433422088623, + "learning_rate": 0.00014640169484916624, + "loss": 1.4641, + "step": 20635 + }, + { + "epoch": 0.2681554082479619, + "grad_norm": 0.49273115396499634, + "learning_rate": 0.00014639909538725484, + "loss": 1.4751, + "step": 20636 + }, + { + "epoch": 0.26816840279187776, + "grad_norm": 0.3816077411174774, + "learning_rate": 0.00014639649592534346, + "loss": 1.3567, + "step": 20637 + }, + { + "epoch": 0.26818139733579366, + "grad_norm": 0.3444799780845642, + "learning_rate": 0.00014639389646343206, + "loss": 1.3324, + "step": 20638 + }, + { + "epoch": 0.2681943918797095, + "grad_norm": 0.31481218338012695, + "learning_rate": 0.0001463912970015207, + "loss": 1.3754, + "step": 20639 + }, + { + "epoch": 0.2682073864236254, + "grad_norm": 0.43880695104599, + "learning_rate": 0.0001463886975396093, + "loss": 1.3357, + "step": 20640 + }, + { + "epoch": 0.26822038096754125, + "grad_norm": 0.4160803556442261, + "learning_rate": 0.00014638609807769793, + "loss": 1.3076, + "step": 20641 + }, + { + "epoch": 0.26823337551145715, + "grad_norm": 0.3695676624774933, + "learning_rate": 0.00014638349861578653, + "loss": 1.5626, + "step": 20642 + }, + { + "epoch": 0.268246370055373, + "grad_norm": 0.39395737648010254, + "learning_rate": 0.00014638089915387515, + "loss": 1.3592, + "step": 20643 + }, + { + "epoch": 0.2682593645992889, + "grad_norm": 0.4336263835430145, + "learning_rate": 0.00014637829969196378, + "loss": 1.3741, + "step": 20644 + }, + { + "epoch": 0.26827235914320474, + "grad_norm": 0.44991445541381836, + "learning_rate": 0.00014637570023005238, + "loss": 1.4544, + "step": 20645 + }, + { + "epoch": 0.26828535368712064, + "grad_norm": 0.4383201003074646, + "learning_rate": 0.000146373100768141, + "loss": 1.6389, + "step": 20646 + }, + { + "epoch": 0.2682983482310365, + "grad_norm": 0.4155541658401489, + "learning_rate": 0.00014637050130622962, + "loss": 1.5005, + "step": 20647 + }, + { + "epoch": 0.2683113427749524, + "grad_norm": 0.36898618936538696, + "learning_rate": 0.00014636790184431822, + "loss": 1.383, + "step": 20648 + }, + { + "epoch": 0.26832433731886823, + "grad_norm": 0.3449082374572754, + "learning_rate": 0.00014636530238240685, + "loss": 1.5954, + "step": 20649 + }, + { + "epoch": 0.26833733186278413, + "grad_norm": 0.4273110628128052, + "learning_rate": 0.00014636270292049544, + "loss": 1.3348, + "step": 20650 + }, + { + "epoch": 0.2683503264067, + "grad_norm": 0.3989933133125305, + "learning_rate": 0.0001463601034585841, + "loss": 1.3298, + "step": 20651 + }, + { + "epoch": 0.2683633209506159, + "grad_norm": 0.33908236026763916, + "learning_rate": 0.0001463575039966727, + "loss": 1.4126, + "step": 20652 + }, + { + "epoch": 0.2683763154945317, + "grad_norm": 0.3597542941570282, + "learning_rate": 0.00014635490453476132, + "loss": 1.4677, + "step": 20653 + }, + { + "epoch": 0.2683893100384476, + "grad_norm": 0.4397994875907898, + "learning_rate": 0.00014635230507284994, + "loss": 1.3313, + "step": 20654 + }, + { + "epoch": 0.26840230458236347, + "grad_norm": 0.3488227427005768, + "learning_rate": 0.00014634970561093854, + "loss": 1.4348, + "step": 20655 + }, + { + "epoch": 0.26841529912627937, + "grad_norm": 0.3622024655342102, + "learning_rate": 0.00014634710614902716, + "loss": 1.5753, + "step": 20656 + }, + { + "epoch": 0.2684282936701952, + "grad_norm": 0.3150954842567444, + "learning_rate": 0.00014634450668711576, + "loss": 1.3288, + "step": 20657 + }, + { + "epoch": 0.2684412882141111, + "grad_norm": 0.43790921568870544, + "learning_rate": 0.0001463419072252044, + "loss": 1.5527, + "step": 20658 + }, + { + "epoch": 0.26845428275802696, + "grad_norm": 0.3803635239601135, + "learning_rate": 0.000146339307763293, + "loss": 1.3104, + "step": 20659 + }, + { + "epoch": 0.26846727730194286, + "grad_norm": 0.5523461103439331, + "learning_rate": 0.0001463367083013816, + "loss": 1.4069, + "step": 20660 + }, + { + "epoch": 0.2684802718458587, + "grad_norm": 0.3028532862663269, + "learning_rate": 0.00014633410883947023, + "loss": 1.4229, + "step": 20661 + }, + { + "epoch": 0.2684932663897746, + "grad_norm": 0.4048062264919281, + "learning_rate": 0.00014633150937755886, + "loss": 1.395, + "step": 20662 + }, + { + "epoch": 0.26850626093369045, + "grad_norm": 0.39473825693130493, + "learning_rate": 0.00014632890991564748, + "loss": 1.4592, + "step": 20663 + }, + { + "epoch": 0.26851925547760636, + "grad_norm": 0.3994807004928589, + "learning_rate": 0.00014632631045373608, + "loss": 1.2424, + "step": 20664 + }, + { + "epoch": 0.2685322500215222, + "grad_norm": 0.43029409646987915, + "learning_rate": 0.0001463237109918247, + "loss": 1.3107, + "step": 20665 + }, + { + "epoch": 0.2685452445654381, + "grad_norm": 0.2833326458930969, + "learning_rate": 0.00014632111152991333, + "loss": 1.3468, + "step": 20666 + }, + { + "epoch": 0.26855823910935395, + "grad_norm": 0.32771480083465576, + "learning_rate": 0.00014631851206800192, + "loss": 1.3958, + "step": 20667 + }, + { + "epoch": 0.26857123365326985, + "grad_norm": 0.41782641410827637, + "learning_rate": 0.00014631591260609055, + "loss": 1.455, + "step": 20668 + }, + { + "epoch": 0.2685842281971857, + "grad_norm": 0.4168553352355957, + "learning_rate": 0.00014631331314417915, + "loss": 1.2892, + "step": 20669 + }, + { + "epoch": 0.2685972227411016, + "grad_norm": 0.3225835859775543, + "learning_rate": 0.0001463107136822678, + "loss": 1.3053, + "step": 20670 + }, + { + "epoch": 0.26861021728501744, + "grad_norm": 0.4053954780101776, + "learning_rate": 0.0001463081142203564, + "loss": 1.3354, + "step": 20671 + }, + { + "epoch": 0.26862321182893334, + "grad_norm": 0.3914523720741272, + "learning_rate": 0.00014630551475844502, + "loss": 1.3413, + "step": 20672 + }, + { + "epoch": 0.2686362063728492, + "grad_norm": 0.3967929780483246, + "learning_rate": 0.00014630291529653362, + "loss": 1.5612, + "step": 20673 + }, + { + "epoch": 0.2686492009167651, + "grad_norm": 0.2634059488773346, + "learning_rate": 0.00014630031583462224, + "loss": 1.0879, + "step": 20674 + }, + { + "epoch": 0.26866219546068093, + "grad_norm": 0.38141852617263794, + "learning_rate": 0.00014629771637271087, + "loss": 1.3308, + "step": 20675 + }, + { + "epoch": 0.26867519000459683, + "grad_norm": 0.2449585646390915, + "learning_rate": 0.00014629511691079946, + "loss": 1.4104, + "step": 20676 + }, + { + "epoch": 0.2686881845485127, + "grad_norm": 0.4229148328304291, + "learning_rate": 0.0001462925174488881, + "loss": 1.3328, + "step": 20677 + }, + { + "epoch": 0.2687011790924286, + "grad_norm": 0.4751756191253662, + "learning_rate": 0.0001462899179869767, + "loss": 1.2427, + "step": 20678 + }, + { + "epoch": 0.2687141736363444, + "grad_norm": 0.3833039402961731, + "learning_rate": 0.0001462873185250653, + "loss": 1.3409, + "step": 20679 + }, + { + "epoch": 0.2687271681802603, + "grad_norm": 0.3475169837474823, + "learning_rate": 0.00014628471906315393, + "loss": 1.3617, + "step": 20680 + }, + { + "epoch": 0.26874016272417617, + "grad_norm": 0.2804672122001648, + "learning_rate": 0.00014628211960124253, + "loss": 1.196, + "step": 20681 + }, + { + "epoch": 0.26875315726809207, + "grad_norm": 0.41921156644821167, + "learning_rate": 0.00014627952013933118, + "loss": 1.3957, + "step": 20682 + }, + { + "epoch": 0.2687661518120079, + "grad_norm": 0.4306753873825073, + "learning_rate": 0.00014627692067741978, + "loss": 1.407, + "step": 20683 + }, + { + "epoch": 0.2687791463559238, + "grad_norm": 0.33495616912841797, + "learning_rate": 0.0001462743212155084, + "loss": 1.3731, + "step": 20684 + }, + { + "epoch": 0.26879214089983966, + "grad_norm": 0.47762545943260193, + "learning_rate": 0.000146271721753597, + "loss": 1.4385, + "step": 20685 + }, + { + "epoch": 0.26880513544375556, + "grad_norm": 0.44264090061187744, + "learning_rate": 0.00014626912229168563, + "loss": 1.421, + "step": 20686 + }, + { + "epoch": 0.2688181299876714, + "grad_norm": 0.43737825751304626, + "learning_rate": 0.00014626652282977425, + "loss": 1.3618, + "step": 20687 + }, + { + "epoch": 0.2688311245315873, + "grad_norm": 0.4088273346424103, + "learning_rate": 0.00014626392336786285, + "loss": 1.5765, + "step": 20688 + }, + { + "epoch": 0.26884411907550315, + "grad_norm": 0.40672099590301514, + "learning_rate": 0.00014626132390595147, + "loss": 1.1195, + "step": 20689 + }, + { + "epoch": 0.26885711361941905, + "grad_norm": 0.48126015067100525, + "learning_rate": 0.0001462587244440401, + "loss": 1.6144, + "step": 20690 + }, + { + "epoch": 0.2688701081633349, + "grad_norm": 0.36170291900634766, + "learning_rate": 0.0001462561249821287, + "loss": 1.2703, + "step": 20691 + }, + { + "epoch": 0.2688831027072508, + "grad_norm": 0.5130376815795898, + "learning_rate": 0.00014625352552021732, + "loss": 1.4901, + "step": 20692 + }, + { + "epoch": 0.26889609725116664, + "grad_norm": 0.35150396823883057, + "learning_rate": 0.00014625092605830594, + "loss": 1.1439, + "step": 20693 + }, + { + "epoch": 0.26890909179508254, + "grad_norm": 0.4448801279067993, + "learning_rate": 0.00014624832659639457, + "loss": 1.4869, + "step": 20694 + }, + { + "epoch": 0.2689220863389984, + "grad_norm": 0.380319207906723, + "learning_rate": 0.00014624572713448317, + "loss": 1.3173, + "step": 20695 + }, + { + "epoch": 0.2689350808829143, + "grad_norm": 0.3700210452079773, + "learning_rate": 0.0001462431276725718, + "loss": 1.3, + "step": 20696 + }, + { + "epoch": 0.26894807542683014, + "grad_norm": 0.3236282765865326, + "learning_rate": 0.00014624052821066041, + "loss": 1.1146, + "step": 20697 + }, + { + "epoch": 0.26896106997074604, + "grad_norm": 0.3484293520450592, + "learning_rate": 0.000146237928748749, + "loss": 1.3232, + "step": 20698 + }, + { + "epoch": 0.2689740645146619, + "grad_norm": 0.2931276559829712, + "learning_rate": 0.00014623532928683764, + "loss": 1.4476, + "step": 20699 + }, + { + "epoch": 0.2689870590585778, + "grad_norm": 0.43924808502197266, + "learning_rate": 0.00014623272982492623, + "loss": 1.2916, + "step": 20700 + }, + { + "epoch": 0.2690000536024936, + "grad_norm": 0.3774886131286621, + "learning_rate": 0.00014623013036301489, + "loss": 1.386, + "step": 20701 + }, + { + "epoch": 0.26901304814640953, + "grad_norm": 0.4164135158061981, + "learning_rate": 0.00014622753090110348, + "loss": 1.3177, + "step": 20702 + }, + { + "epoch": 0.2690260426903254, + "grad_norm": 0.5559182167053223, + "learning_rate": 0.00014622493143919208, + "loss": 1.4402, + "step": 20703 + }, + { + "epoch": 0.2690390372342413, + "grad_norm": 0.45743390917778015, + "learning_rate": 0.0001462223319772807, + "loss": 1.4531, + "step": 20704 + }, + { + "epoch": 0.2690520317781571, + "grad_norm": 0.40325310826301575, + "learning_rate": 0.00014621973251536933, + "loss": 1.3158, + "step": 20705 + }, + { + "epoch": 0.269065026322073, + "grad_norm": 0.46727636456489563, + "learning_rate": 0.00014621713305345795, + "loss": 1.4722, + "step": 20706 + }, + { + "epoch": 0.2690780208659889, + "grad_norm": 0.30945438146591187, + "learning_rate": 0.00014621453359154655, + "loss": 1.5292, + "step": 20707 + }, + { + "epoch": 0.26909101540990477, + "grad_norm": 0.5098206996917725, + "learning_rate": 0.00014621193412963518, + "loss": 1.4808, + "step": 20708 + }, + { + "epoch": 0.26910400995382067, + "grad_norm": 0.45369401574134827, + "learning_rate": 0.0001462093346677238, + "loss": 1.6057, + "step": 20709 + }, + { + "epoch": 0.2691170044977365, + "grad_norm": 0.36789557337760925, + "learning_rate": 0.0001462067352058124, + "loss": 1.1677, + "step": 20710 + }, + { + "epoch": 0.2691299990416524, + "grad_norm": 0.3800722360610962, + "learning_rate": 0.00014620413574390102, + "loss": 1.2215, + "step": 20711 + }, + { + "epoch": 0.26914299358556826, + "grad_norm": 0.42654749751091003, + "learning_rate": 0.00014620153628198962, + "loss": 1.3985, + "step": 20712 + }, + { + "epoch": 0.26915598812948416, + "grad_norm": 0.5000320672988892, + "learning_rate": 0.00014619893682007827, + "loss": 1.5014, + "step": 20713 + }, + { + "epoch": 0.2691689826734, + "grad_norm": 0.37402960658073425, + "learning_rate": 0.00014619633735816687, + "loss": 1.3482, + "step": 20714 + }, + { + "epoch": 0.2691819772173159, + "grad_norm": 0.3382261097431183, + "learning_rate": 0.00014619373789625547, + "loss": 1.3532, + "step": 20715 + }, + { + "epoch": 0.26919497176123175, + "grad_norm": 0.3761296272277832, + "learning_rate": 0.0001461911384343441, + "loss": 1.4991, + "step": 20716 + }, + { + "epoch": 0.26920796630514765, + "grad_norm": 0.3428308367729187, + "learning_rate": 0.00014618853897243271, + "loss": 1.6513, + "step": 20717 + }, + { + "epoch": 0.2692209608490635, + "grad_norm": 0.3999347686767578, + "learning_rate": 0.00014618593951052134, + "loss": 1.3187, + "step": 20718 + }, + { + "epoch": 0.2692339553929794, + "grad_norm": 0.3850482702255249, + "learning_rate": 0.00014618334004860994, + "loss": 1.5427, + "step": 20719 + }, + { + "epoch": 0.26924694993689524, + "grad_norm": 0.5011225938796997, + "learning_rate": 0.00014618074058669856, + "loss": 1.3105, + "step": 20720 + }, + { + "epoch": 0.26925994448081114, + "grad_norm": 0.4166141152381897, + "learning_rate": 0.00014617814112478719, + "loss": 1.3921, + "step": 20721 + }, + { + "epoch": 0.269272939024727, + "grad_norm": 0.32082441449165344, + "learning_rate": 0.00014617554166287578, + "loss": 1.5407, + "step": 20722 + }, + { + "epoch": 0.2692859335686429, + "grad_norm": 0.41183531284332275, + "learning_rate": 0.0001461729422009644, + "loss": 1.3955, + "step": 20723 + }, + { + "epoch": 0.26929892811255873, + "grad_norm": 0.2709164321422577, + "learning_rate": 0.000146170342739053, + "loss": 1.4238, + "step": 20724 + }, + { + "epoch": 0.26931192265647463, + "grad_norm": 0.373794287443161, + "learning_rate": 0.00014616774327714166, + "loss": 1.3195, + "step": 20725 + }, + { + "epoch": 0.2693249172003905, + "grad_norm": 0.4608331322669983, + "learning_rate": 0.00014616514381523025, + "loss": 1.371, + "step": 20726 + }, + { + "epoch": 0.2693379117443064, + "grad_norm": 0.4786517322063446, + "learning_rate": 0.00014616254435331885, + "loss": 1.4855, + "step": 20727 + }, + { + "epoch": 0.2693509062882222, + "grad_norm": 0.3594572842121124, + "learning_rate": 0.0001461599448914075, + "loss": 1.4858, + "step": 20728 + }, + { + "epoch": 0.2693639008321381, + "grad_norm": 0.36895909905433655, + "learning_rate": 0.0001461573454294961, + "loss": 1.5196, + "step": 20729 + }, + { + "epoch": 0.26937689537605397, + "grad_norm": 0.38295185565948486, + "learning_rate": 0.00014615474596758472, + "loss": 1.5151, + "step": 20730 + }, + { + "epoch": 0.26938988991996987, + "grad_norm": 0.4619479477405548, + "learning_rate": 0.00014615214650567332, + "loss": 1.4108, + "step": 20731 + }, + { + "epoch": 0.2694028844638857, + "grad_norm": 0.3786890506744385, + "learning_rate": 0.00014614954704376195, + "loss": 1.4663, + "step": 20732 + }, + { + "epoch": 0.2694158790078016, + "grad_norm": 0.34674233198165894, + "learning_rate": 0.00014614694758185057, + "loss": 1.3268, + "step": 20733 + }, + { + "epoch": 0.26942887355171746, + "grad_norm": 0.35739365220069885, + "learning_rate": 0.00014614434811993917, + "loss": 1.4983, + "step": 20734 + }, + { + "epoch": 0.26944186809563336, + "grad_norm": 0.3962860107421875, + "learning_rate": 0.0001461417486580278, + "loss": 1.4861, + "step": 20735 + }, + { + "epoch": 0.2694548626395492, + "grad_norm": 0.4904305040836334, + "learning_rate": 0.00014613914919611642, + "loss": 1.3847, + "step": 20736 + }, + { + "epoch": 0.2694678571834651, + "grad_norm": 0.4515432119369507, + "learning_rate": 0.00014613654973420504, + "loss": 1.4656, + "step": 20737 + }, + { + "epoch": 0.26948085172738095, + "grad_norm": 0.36145856976509094, + "learning_rate": 0.00014613395027229364, + "loss": 1.2893, + "step": 20738 + }, + { + "epoch": 0.26949384627129686, + "grad_norm": 0.4274309277534485, + "learning_rate": 0.00014613135081038226, + "loss": 1.4493, + "step": 20739 + }, + { + "epoch": 0.2695068408152127, + "grad_norm": 0.5018572807312012, + "learning_rate": 0.0001461287513484709, + "loss": 1.3617, + "step": 20740 + }, + { + "epoch": 0.2695198353591286, + "grad_norm": 0.44044139981269836, + "learning_rate": 0.00014612615188655949, + "loss": 1.442, + "step": 20741 + }, + { + "epoch": 0.26953282990304445, + "grad_norm": 0.449899286031723, + "learning_rate": 0.0001461235524246481, + "loss": 1.4371, + "step": 20742 + }, + { + "epoch": 0.26954582444696035, + "grad_norm": 0.4401423931121826, + "learning_rate": 0.0001461209529627367, + "loss": 1.1401, + "step": 20743 + }, + { + "epoch": 0.2695588189908762, + "grad_norm": 0.41809260845184326, + "learning_rate": 0.00014611835350082533, + "loss": 1.3633, + "step": 20744 + }, + { + "epoch": 0.2695718135347921, + "grad_norm": 0.4464724361896515, + "learning_rate": 0.00014611575403891396, + "loss": 1.5006, + "step": 20745 + }, + { + "epoch": 0.26958480807870794, + "grad_norm": 0.4274747371673584, + "learning_rate": 0.00014611315457700255, + "loss": 1.2148, + "step": 20746 + }, + { + "epoch": 0.26959780262262384, + "grad_norm": 0.39786624908447266, + "learning_rate": 0.00014611055511509118, + "loss": 1.4833, + "step": 20747 + }, + { + "epoch": 0.2696107971665397, + "grad_norm": 0.4112197458744049, + "learning_rate": 0.0001461079556531798, + "loss": 1.4131, + "step": 20748 + }, + { + "epoch": 0.2696237917104556, + "grad_norm": 0.4289264380931854, + "learning_rate": 0.00014610535619126843, + "loss": 1.4593, + "step": 20749 + }, + { + "epoch": 0.26963678625437143, + "grad_norm": 0.3172912895679474, + "learning_rate": 0.00014610275672935702, + "loss": 1.4076, + "step": 20750 + }, + { + "epoch": 0.26964978079828733, + "grad_norm": 0.3598501980304718, + "learning_rate": 0.00014610015726744565, + "loss": 1.4618, + "step": 20751 + }, + { + "epoch": 0.2696627753422032, + "grad_norm": 0.42353907227516174, + "learning_rate": 0.00014609755780553427, + "loss": 1.4181, + "step": 20752 + }, + { + "epoch": 0.2696757698861191, + "grad_norm": 0.37242230772972107, + "learning_rate": 0.00014609495834362287, + "loss": 1.5244, + "step": 20753 + }, + { + "epoch": 0.2696887644300349, + "grad_norm": 0.4530049264431, + "learning_rate": 0.0001460923588817115, + "loss": 1.4631, + "step": 20754 + }, + { + "epoch": 0.2697017589739508, + "grad_norm": 0.41440507769584656, + "learning_rate": 0.0001460897594198001, + "loss": 1.3935, + "step": 20755 + }, + { + "epoch": 0.26971475351786667, + "grad_norm": 0.3290250301361084, + "learning_rate": 0.00014608715995788874, + "loss": 1.3011, + "step": 20756 + }, + { + "epoch": 0.26972774806178257, + "grad_norm": 0.5125744342803955, + "learning_rate": 0.00014608456049597734, + "loss": 1.5043, + "step": 20757 + }, + { + "epoch": 0.2697407426056984, + "grad_norm": 0.35893887281417847, + "learning_rate": 0.00014608196103406594, + "loss": 1.2907, + "step": 20758 + }, + { + "epoch": 0.2697537371496143, + "grad_norm": 0.28977641463279724, + "learning_rate": 0.00014607936157215456, + "loss": 1.2827, + "step": 20759 + }, + { + "epoch": 0.26976673169353016, + "grad_norm": 0.41102495789527893, + "learning_rate": 0.0001460767621102432, + "loss": 1.4831, + "step": 20760 + }, + { + "epoch": 0.26977972623744606, + "grad_norm": 0.40334898233413696, + "learning_rate": 0.0001460741626483318, + "loss": 1.4153, + "step": 20761 + }, + { + "epoch": 0.2697927207813619, + "grad_norm": 0.3420429527759552, + "learning_rate": 0.0001460715631864204, + "loss": 1.498, + "step": 20762 + }, + { + "epoch": 0.2698057153252778, + "grad_norm": 0.4257211983203888, + "learning_rate": 0.00014606896372450903, + "loss": 1.3442, + "step": 20763 + }, + { + "epoch": 0.26981870986919365, + "grad_norm": 0.4188475012779236, + "learning_rate": 0.00014606636426259766, + "loss": 1.3155, + "step": 20764 + }, + { + "epoch": 0.26983170441310955, + "grad_norm": 0.3321782946586609, + "learning_rate": 0.00014606376480068626, + "loss": 1.4222, + "step": 20765 + }, + { + "epoch": 0.2698446989570254, + "grad_norm": 0.43151503801345825, + "learning_rate": 0.00014606116533877488, + "loss": 1.5216, + "step": 20766 + }, + { + "epoch": 0.2698576935009413, + "grad_norm": 0.3855516016483307, + "learning_rate": 0.0001460585658768635, + "loss": 1.4338, + "step": 20767 + }, + { + "epoch": 0.26987068804485714, + "grad_norm": 0.4201805591583252, + "learning_rate": 0.00014605596641495213, + "loss": 1.3732, + "step": 20768 + }, + { + "epoch": 0.26988368258877304, + "grad_norm": 0.43783777952194214, + "learning_rate": 0.00014605336695304073, + "loss": 1.253, + "step": 20769 + }, + { + "epoch": 0.2698966771326889, + "grad_norm": 0.42294326424598694, + "learning_rate": 0.00014605076749112932, + "loss": 1.463, + "step": 20770 + }, + { + "epoch": 0.2699096716766048, + "grad_norm": 0.40143537521362305, + "learning_rate": 0.00014604816802921798, + "loss": 1.352, + "step": 20771 + }, + { + "epoch": 0.26992266622052064, + "grad_norm": 0.4824879467487335, + "learning_rate": 0.00014604556856730657, + "loss": 1.4916, + "step": 20772 + }, + { + "epoch": 0.26993566076443654, + "grad_norm": 0.41266053915023804, + "learning_rate": 0.0001460429691053952, + "loss": 1.4579, + "step": 20773 + }, + { + "epoch": 0.2699486553083524, + "grad_norm": 0.48308441042900085, + "learning_rate": 0.0001460403696434838, + "loss": 1.6107, + "step": 20774 + }, + { + "epoch": 0.2699616498522683, + "grad_norm": 0.39897236227989197, + "learning_rate": 0.00014603777018157242, + "loss": 1.4627, + "step": 20775 + }, + { + "epoch": 0.2699746443961841, + "grad_norm": 0.3590661585330963, + "learning_rate": 0.00014603517071966104, + "loss": 1.4829, + "step": 20776 + }, + { + "epoch": 0.2699876389401, + "grad_norm": 0.38587871193885803, + "learning_rate": 0.00014603257125774964, + "loss": 1.3482, + "step": 20777 + }, + { + "epoch": 0.2700006334840159, + "grad_norm": 0.3748542070388794, + "learning_rate": 0.00014602997179583827, + "loss": 1.3458, + "step": 20778 + }, + { + "epoch": 0.2700136280279318, + "grad_norm": 0.4357249438762665, + "learning_rate": 0.0001460273723339269, + "loss": 1.3145, + "step": 20779 + }, + { + "epoch": 0.2700266225718476, + "grad_norm": 0.3993145227432251, + "learning_rate": 0.00014602477287201551, + "loss": 1.4894, + "step": 20780 + }, + { + "epoch": 0.2700396171157635, + "grad_norm": 0.49461695551872253, + "learning_rate": 0.0001460221734101041, + "loss": 1.518, + "step": 20781 + }, + { + "epoch": 0.27005261165967936, + "grad_norm": 0.42455703020095825, + "learning_rate": 0.0001460195739481927, + "loss": 1.5025, + "step": 20782 + }, + { + "epoch": 0.27006560620359527, + "grad_norm": 0.306119441986084, + "learning_rate": 0.00014601697448628136, + "loss": 1.322, + "step": 20783 + }, + { + "epoch": 0.27007860074751117, + "grad_norm": 0.39377444982528687, + "learning_rate": 0.00014601437502436996, + "loss": 1.4059, + "step": 20784 + }, + { + "epoch": 0.270091595291427, + "grad_norm": 0.36525020003318787, + "learning_rate": 0.00014601177556245858, + "loss": 1.2923, + "step": 20785 + }, + { + "epoch": 0.2701045898353429, + "grad_norm": 0.3976100981235504, + "learning_rate": 0.00014600917610054718, + "loss": 1.3119, + "step": 20786 + }, + { + "epoch": 0.27011758437925876, + "grad_norm": 0.4202565550804138, + "learning_rate": 0.0001460065766386358, + "loss": 1.2831, + "step": 20787 + }, + { + "epoch": 0.27013057892317466, + "grad_norm": 0.33152076601982117, + "learning_rate": 0.00014600397717672443, + "loss": 1.4323, + "step": 20788 + }, + { + "epoch": 0.2701435734670905, + "grad_norm": 0.39816585183143616, + "learning_rate": 0.00014600137771481303, + "loss": 1.5205, + "step": 20789 + }, + { + "epoch": 0.2701565680110064, + "grad_norm": 0.4054388999938965, + "learning_rate": 0.00014599877825290165, + "loss": 1.2961, + "step": 20790 + }, + { + "epoch": 0.27016956255492225, + "grad_norm": 0.424545556306839, + "learning_rate": 0.00014599617879099028, + "loss": 1.2815, + "step": 20791 + }, + { + "epoch": 0.27018255709883815, + "grad_norm": 0.35893192887306213, + "learning_rate": 0.0001459935793290789, + "loss": 1.5249, + "step": 20792 + }, + { + "epoch": 0.270195551642754, + "grad_norm": 0.3958604037761688, + "learning_rate": 0.0001459909798671675, + "loss": 1.326, + "step": 20793 + }, + { + "epoch": 0.2702085461866699, + "grad_norm": 0.40877991914749146, + "learning_rate": 0.00014598838040525612, + "loss": 1.4322, + "step": 20794 + }, + { + "epoch": 0.27022154073058574, + "grad_norm": 0.3238425850868225, + "learning_rate": 0.00014598578094334475, + "loss": 1.2572, + "step": 20795 + }, + { + "epoch": 0.27023453527450164, + "grad_norm": 0.4755823016166687, + "learning_rate": 0.00014598318148143334, + "loss": 1.4959, + "step": 20796 + }, + { + "epoch": 0.2702475298184175, + "grad_norm": 0.3692767918109894, + "learning_rate": 0.00014598058201952197, + "loss": 1.4663, + "step": 20797 + }, + { + "epoch": 0.2702605243623334, + "grad_norm": 0.33734601736068726, + "learning_rate": 0.00014597798255761057, + "loss": 1.4525, + "step": 20798 + }, + { + "epoch": 0.27027351890624923, + "grad_norm": 0.4396667182445526, + "learning_rate": 0.0001459753830956992, + "loss": 1.4602, + "step": 20799 + }, + { + "epoch": 0.27028651345016513, + "grad_norm": 0.4326832890510559, + "learning_rate": 0.00014597278363378781, + "loss": 1.5171, + "step": 20800 + }, + { + "epoch": 0.270299507994081, + "grad_norm": 0.45728784799575806, + "learning_rate": 0.0001459701841718764, + "loss": 1.4585, + "step": 20801 + }, + { + "epoch": 0.2703125025379969, + "grad_norm": 0.3867819607257843, + "learning_rate": 0.00014596758470996506, + "loss": 1.5228, + "step": 20802 + }, + { + "epoch": 0.2703254970819127, + "grad_norm": 0.4609912931919098, + "learning_rate": 0.00014596498524805366, + "loss": 1.3221, + "step": 20803 + }, + { + "epoch": 0.2703384916258286, + "grad_norm": 0.44288545846939087, + "learning_rate": 0.00014596238578614229, + "loss": 1.6226, + "step": 20804 + }, + { + "epoch": 0.27035148616974447, + "grad_norm": 0.3676133155822754, + "learning_rate": 0.00014595978632423088, + "loss": 1.3888, + "step": 20805 + }, + { + "epoch": 0.27036448071366037, + "grad_norm": 0.3704010248184204, + "learning_rate": 0.0001459571868623195, + "loss": 1.3269, + "step": 20806 + }, + { + "epoch": 0.2703774752575762, + "grad_norm": 0.4500162899494171, + "learning_rate": 0.00014595458740040813, + "loss": 1.4127, + "step": 20807 + }, + { + "epoch": 0.2703904698014921, + "grad_norm": 0.3112788498401642, + "learning_rate": 0.00014595198793849673, + "loss": 1.3928, + "step": 20808 + }, + { + "epoch": 0.27040346434540796, + "grad_norm": 0.4380784034729004, + "learning_rate": 0.00014594938847658535, + "loss": 1.4733, + "step": 20809 + }, + { + "epoch": 0.27041645888932386, + "grad_norm": 0.40678146481513977, + "learning_rate": 0.00014594678901467398, + "loss": 1.4062, + "step": 20810 + }, + { + "epoch": 0.2704294534332397, + "grad_norm": 0.3612669110298157, + "learning_rate": 0.00014594418955276258, + "loss": 1.4338, + "step": 20811 + }, + { + "epoch": 0.2704424479771556, + "grad_norm": 0.4091089367866516, + "learning_rate": 0.0001459415900908512, + "loss": 1.6577, + "step": 20812 + }, + { + "epoch": 0.27045544252107145, + "grad_norm": 0.3786531686782837, + "learning_rate": 0.0001459389906289398, + "loss": 1.3989, + "step": 20813 + }, + { + "epoch": 0.27046843706498735, + "grad_norm": 0.46411582827568054, + "learning_rate": 0.00014593639116702845, + "loss": 1.3317, + "step": 20814 + }, + { + "epoch": 0.2704814316089032, + "grad_norm": 0.4212292730808258, + "learning_rate": 0.00014593379170511705, + "loss": 1.3794, + "step": 20815 + }, + { + "epoch": 0.2704944261528191, + "grad_norm": 0.30869749188423157, + "learning_rate": 0.00014593119224320567, + "loss": 1.49, + "step": 20816 + }, + { + "epoch": 0.27050742069673495, + "grad_norm": 0.4733096659183502, + "learning_rate": 0.00014592859278129427, + "loss": 1.4353, + "step": 20817 + }, + { + "epoch": 0.27052041524065085, + "grad_norm": 0.39669325947761536, + "learning_rate": 0.0001459259933193829, + "loss": 1.4102, + "step": 20818 + }, + { + "epoch": 0.2705334097845667, + "grad_norm": 0.4561370313167572, + "learning_rate": 0.00014592339385747152, + "loss": 1.402, + "step": 20819 + }, + { + "epoch": 0.2705464043284826, + "grad_norm": 0.44948163628578186, + "learning_rate": 0.00014592079439556011, + "loss": 1.5506, + "step": 20820 + }, + { + "epoch": 0.27055939887239844, + "grad_norm": 0.366617351770401, + "learning_rate": 0.00014591819493364874, + "loss": 1.5229, + "step": 20821 + }, + { + "epoch": 0.27057239341631434, + "grad_norm": 0.37989145517349243, + "learning_rate": 0.00014591559547173736, + "loss": 1.4713, + "step": 20822 + }, + { + "epoch": 0.2705853879602302, + "grad_norm": 0.3669915497303009, + "learning_rate": 0.000145912996009826, + "loss": 1.4177, + "step": 20823 + }, + { + "epoch": 0.2705983825041461, + "grad_norm": 0.40401995182037354, + "learning_rate": 0.00014591039654791459, + "loss": 1.4135, + "step": 20824 + }, + { + "epoch": 0.27061137704806193, + "grad_norm": 0.3447856307029724, + "learning_rate": 0.00014590779708600318, + "loss": 1.2905, + "step": 20825 + }, + { + "epoch": 0.27062437159197783, + "grad_norm": 0.3797377347946167, + "learning_rate": 0.00014590519762409183, + "loss": 1.207, + "step": 20826 + }, + { + "epoch": 0.2706373661358937, + "grad_norm": 0.37477850914001465, + "learning_rate": 0.00014590259816218043, + "loss": 1.4285, + "step": 20827 + }, + { + "epoch": 0.2706503606798096, + "grad_norm": 0.38122615218162537, + "learning_rate": 0.00014589999870026906, + "loss": 1.2644, + "step": 20828 + }, + { + "epoch": 0.2706633552237254, + "grad_norm": 0.40307557582855225, + "learning_rate": 0.00014589739923835765, + "loss": 1.4889, + "step": 20829 + }, + { + "epoch": 0.2706763497676413, + "grad_norm": 0.34643661975860596, + "learning_rate": 0.00014589479977644628, + "loss": 1.4476, + "step": 20830 + }, + { + "epoch": 0.27068934431155717, + "grad_norm": 0.36796635389328003, + "learning_rate": 0.0001458922003145349, + "loss": 1.3461, + "step": 20831 + }, + { + "epoch": 0.27070233885547307, + "grad_norm": 0.402668833732605, + "learning_rate": 0.0001458896008526235, + "loss": 1.5257, + "step": 20832 + }, + { + "epoch": 0.2707153333993889, + "grad_norm": 0.4068823456764221, + "learning_rate": 0.00014588700139071212, + "loss": 1.3416, + "step": 20833 + }, + { + "epoch": 0.2707283279433048, + "grad_norm": 0.4461059868335724, + "learning_rate": 0.00014588440192880075, + "loss": 1.4412, + "step": 20834 + }, + { + "epoch": 0.27074132248722066, + "grad_norm": 0.5113810300827026, + "learning_rate": 0.00014588180246688937, + "loss": 1.5175, + "step": 20835 + }, + { + "epoch": 0.27075431703113656, + "grad_norm": 0.34720370173454285, + "learning_rate": 0.00014587920300497797, + "loss": 1.3346, + "step": 20836 + }, + { + "epoch": 0.2707673115750524, + "grad_norm": 0.429147332906723, + "learning_rate": 0.00014587660354306657, + "loss": 1.4084, + "step": 20837 + }, + { + "epoch": 0.2707803061189683, + "grad_norm": 0.3017660975456238, + "learning_rate": 0.00014587400408115522, + "loss": 1.0667, + "step": 20838 + }, + { + "epoch": 0.27079330066288415, + "grad_norm": 0.4261918067932129, + "learning_rate": 0.00014587140461924382, + "loss": 1.4971, + "step": 20839 + }, + { + "epoch": 0.27080629520680005, + "grad_norm": 0.3704416751861572, + "learning_rate": 0.00014586880515733244, + "loss": 1.3831, + "step": 20840 + }, + { + "epoch": 0.2708192897507159, + "grad_norm": 0.4343677759170532, + "learning_rate": 0.00014586620569542107, + "loss": 1.6067, + "step": 20841 + }, + { + "epoch": 0.2708322842946318, + "grad_norm": 0.42772969603538513, + "learning_rate": 0.00014586360623350966, + "loss": 1.4594, + "step": 20842 + }, + { + "epoch": 0.27084527883854764, + "grad_norm": 0.4471254050731659, + "learning_rate": 0.0001458610067715983, + "loss": 1.4875, + "step": 20843 + }, + { + "epoch": 0.27085827338246354, + "grad_norm": 0.4613000154495239, + "learning_rate": 0.00014585840730968689, + "loss": 1.4553, + "step": 20844 + }, + { + "epoch": 0.2708712679263794, + "grad_norm": 0.3984544277191162, + "learning_rate": 0.00014585580784777554, + "loss": 1.3063, + "step": 20845 + }, + { + "epoch": 0.2708842624702953, + "grad_norm": 0.4328620135784149, + "learning_rate": 0.00014585320838586413, + "loss": 1.4295, + "step": 20846 + }, + { + "epoch": 0.27089725701421113, + "grad_norm": 0.38093486428260803, + "learning_rate": 0.00014585060892395276, + "loss": 1.4901, + "step": 20847 + }, + { + "epoch": 0.27091025155812704, + "grad_norm": 0.3480699360370636, + "learning_rate": 0.00014584800946204136, + "loss": 1.642, + "step": 20848 + }, + { + "epoch": 0.2709232461020429, + "grad_norm": 0.3781997859477997, + "learning_rate": 0.00014584541000012998, + "loss": 1.2357, + "step": 20849 + }, + { + "epoch": 0.2709362406459588, + "grad_norm": 0.4760379195213318, + "learning_rate": 0.0001458428105382186, + "loss": 1.5356, + "step": 20850 + }, + { + "epoch": 0.2709492351898746, + "grad_norm": 0.32137757539749146, + "learning_rate": 0.0001458402110763072, + "loss": 1.4307, + "step": 20851 + }, + { + "epoch": 0.2709622297337905, + "grad_norm": 0.4688911736011505, + "learning_rate": 0.00014583761161439583, + "loss": 1.6149, + "step": 20852 + }, + { + "epoch": 0.2709752242777064, + "grad_norm": 0.3692459762096405, + "learning_rate": 0.00014583501215248445, + "loss": 1.3311, + "step": 20853 + }, + { + "epoch": 0.2709882188216223, + "grad_norm": 0.37827131152153015, + "learning_rate": 0.00014583241269057305, + "loss": 1.3687, + "step": 20854 + }, + { + "epoch": 0.2710012133655381, + "grad_norm": 0.45231932401657104, + "learning_rate": 0.00014582981322866167, + "loss": 1.4479, + "step": 20855 + }, + { + "epoch": 0.271014207909454, + "grad_norm": 0.3784984052181244, + "learning_rate": 0.00014582721376675027, + "loss": 1.4308, + "step": 20856 + }, + { + "epoch": 0.27102720245336986, + "grad_norm": 0.4157150685787201, + "learning_rate": 0.00014582461430483892, + "loss": 1.5173, + "step": 20857 + }, + { + "epoch": 0.27104019699728576, + "grad_norm": 0.4382265508174896, + "learning_rate": 0.00014582201484292752, + "loss": 1.6408, + "step": 20858 + }, + { + "epoch": 0.27105319154120167, + "grad_norm": 0.43809014558792114, + "learning_rate": 0.00014581941538101614, + "loss": 1.5878, + "step": 20859 + }, + { + "epoch": 0.2710661860851175, + "grad_norm": 0.4447806477546692, + "learning_rate": 0.00014581681591910474, + "loss": 1.3213, + "step": 20860 + }, + { + "epoch": 0.2710791806290334, + "grad_norm": 0.3964853286743164, + "learning_rate": 0.00014581421645719337, + "loss": 1.4591, + "step": 20861 + }, + { + "epoch": 0.27109217517294926, + "grad_norm": 0.34540191292762756, + "learning_rate": 0.000145811616995282, + "loss": 1.2487, + "step": 20862 + }, + { + "epoch": 0.27110516971686516, + "grad_norm": 0.4400683343410492, + "learning_rate": 0.0001458090175333706, + "loss": 1.4948, + "step": 20863 + }, + { + "epoch": 0.271118164260781, + "grad_norm": 0.41917356848716736, + "learning_rate": 0.0001458064180714592, + "loss": 1.4493, + "step": 20864 + }, + { + "epoch": 0.2711311588046969, + "grad_norm": 0.3956057131290436, + "learning_rate": 0.00014580381860954784, + "loss": 1.3558, + "step": 20865 + }, + { + "epoch": 0.27114415334861275, + "grad_norm": 0.4873267710208893, + "learning_rate": 0.00014580121914763643, + "loss": 1.4294, + "step": 20866 + }, + { + "epoch": 0.27115714789252865, + "grad_norm": 0.4087563157081604, + "learning_rate": 0.00014579861968572506, + "loss": 1.3979, + "step": 20867 + }, + { + "epoch": 0.2711701424364445, + "grad_norm": 0.32891717553138733, + "learning_rate": 0.00014579602022381366, + "loss": 1.4041, + "step": 20868 + }, + { + "epoch": 0.2711831369803604, + "grad_norm": 0.4600984752178192, + "learning_rate": 0.0001457934207619023, + "loss": 1.3648, + "step": 20869 + }, + { + "epoch": 0.27119613152427624, + "grad_norm": 0.4293263852596283, + "learning_rate": 0.0001457908212999909, + "loss": 1.3921, + "step": 20870 + }, + { + "epoch": 0.27120912606819214, + "grad_norm": 0.40912577509880066, + "learning_rate": 0.00014578822183807953, + "loss": 1.4292, + "step": 20871 + }, + { + "epoch": 0.271222120612108, + "grad_norm": 0.34529200196266174, + "learning_rate": 0.00014578562237616813, + "loss": 1.4514, + "step": 20872 + }, + { + "epoch": 0.2712351151560239, + "grad_norm": 0.2624732255935669, + "learning_rate": 0.00014578302291425675, + "loss": 1.1451, + "step": 20873 + }, + { + "epoch": 0.27124810969993973, + "grad_norm": 0.36293190717697144, + "learning_rate": 0.00014578042345234538, + "loss": 1.4315, + "step": 20874 + }, + { + "epoch": 0.27126110424385563, + "grad_norm": 0.4560997188091278, + "learning_rate": 0.00014577782399043397, + "loss": 1.418, + "step": 20875 + }, + { + "epoch": 0.2712740987877715, + "grad_norm": 0.4754064679145813, + "learning_rate": 0.00014577522452852263, + "loss": 1.5889, + "step": 20876 + }, + { + "epoch": 0.2712870933316874, + "grad_norm": 0.3319530487060547, + "learning_rate": 0.00014577262506661122, + "loss": 1.416, + "step": 20877 + }, + { + "epoch": 0.2713000878756032, + "grad_norm": 0.4696401357650757, + "learning_rate": 0.00014577002560469985, + "loss": 1.4437, + "step": 20878 + }, + { + "epoch": 0.2713130824195191, + "grad_norm": 0.34043440222740173, + "learning_rate": 0.00014576742614278844, + "loss": 1.2634, + "step": 20879 + }, + { + "epoch": 0.27132607696343497, + "grad_norm": 0.38766780495643616, + "learning_rate": 0.00014576482668087707, + "loss": 1.4191, + "step": 20880 + }, + { + "epoch": 0.27133907150735087, + "grad_norm": 0.4224969446659088, + "learning_rate": 0.0001457622272189657, + "loss": 1.3619, + "step": 20881 + }, + { + "epoch": 0.2713520660512667, + "grad_norm": 0.5193988084793091, + "learning_rate": 0.0001457596277570543, + "loss": 1.3226, + "step": 20882 + }, + { + "epoch": 0.2713650605951826, + "grad_norm": 0.3767814040184021, + "learning_rate": 0.00014575702829514292, + "loss": 1.5669, + "step": 20883 + }, + { + "epoch": 0.27137805513909846, + "grad_norm": 0.33606457710266113, + "learning_rate": 0.00014575442883323154, + "loss": 1.2803, + "step": 20884 + }, + { + "epoch": 0.27139104968301436, + "grad_norm": 0.46754106879234314, + "learning_rate": 0.00014575182937132014, + "loss": 1.4925, + "step": 20885 + }, + { + "epoch": 0.2714040442269302, + "grad_norm": 0.36631861329078674, + "learning_rate": 0.00014574922990940876, + "loss": 1.3568, + "step": 20886 + }, + { + "epoch": 0.2714170387708461, + "grad_norm": 0.5096385478973389, + "learning_rate": 0.00014574663044749736, + "loss": 1.3471, + "step": 20887 + }, + { + "epoch": 0.27143003331476195, + "grad_norm": 0.40501829981803894, + "learning_rate": 0.000145744030985586, + "loss": 1.4683, + "step": 20888 + }, + { + "epoch": 0.27144302785867785, + "grad_norm": 0.4708895981311798, + "learning_rate": 0.0001457414315236746, + "loss": 1.4784, + "step": 20889 + }, + { + "epoch": 0.2714560224025937, + "grad_norm": 0.4071749746799469, + "learning_rate": 0.00014573883206176323, + "loss": 1.4794, + "step": 20890 + }, + { + "epoch": 0.2714690169465096, + "grad_norm": 0.3962570130825043, + "learning_rate": 0.00014573623259985183, + "loss": 1.3485, + "step": 20891 + }, + { + "epoch": 0.27148201149042545, + "grad_norm": 0.3588683307170868, + "learning_rate": 0.00014573363313794045, + "loss": 1.3494, + "step": 20892 + }, + { + "epoch": 0.27149500603434135, + "grad_norm": 0.4205160439014435, + "learning_rate": 0.00014573103367602908, + "loss": 1.3711, + "step": 20893 + }, + { + "epoch": 0.2715080005782572, + "grad_norm": 0.40023550391197205, + "learning_rate": 0.00014572843421411768, + "loss": 1.237, + "step": 20894 + }, + { + "epoch": 0.2715209951221731, + "grad_norm": 0.40140536427497864, + "learning_rate": 0.0001457258347522063, + "loss": 1.5044, + "step": 20895 + }, + { + "epoch": 0.27153398966608894, + "grad_norm": 0.31563496589660645, + "learning_rate": 0.00014572323529029493, + "loss": 1.2847, + "step": 20896 + }, + { + "epoch": 0.27154698421000484, + "grad_norm": 0.479702889919281, + "learning_rate": 0.00014572063582838352, + "loss": 1.4794, + "step": 20897 + }, + { + "epoch": 0.2715599787539207, + "grad_norm": 0.2976384162902832, + "learning_rate": 0.00014571803636647215, + "loss": 1.1839, + "step": 20898 + }, + { + "epoch": 0.2715729732978366, + "grad_norm": 0.3080192804336548, + "learning_rate": 0.00014571543690456074, + "loss": 1.1892, + "step": 20899 + }, + { + "epoch": 0.27158596784175243, + "grad_norm": 0.4232228994369507, + "learning_rate": 0.0001457128374426494, + "loss": 1.5328, + "step": 20900 + }, + { + "epoch": 0.27159896238566833, + "grad_norm": 0.4344131648540497, + "learning_rate": 0.000145710237980738, + "loss": 1.5033, + "step": 20901 + }, + { + "epoch": 0.2716119569295842, + "grad_norm": 0.5643211007118225, + "learning_rate": 0.00014570763851882662, + "loss": 1.3747, + "step": 20902 + }, + { + "epoch": 0.2716249514735001, + "grad_norm": 0.3988206088542938, + "learning_rate": 0.00014570503905691522, + "loss": 1.5398, + "step": 20903 + }, + { + "epoch": 0.2716379460174159, + "grad_norm": 0.4586490988731384, + "learning_rate": 0.00014570243959500384, + "loss": 1.5673, + "step": 20904 + }, + { + "epoch": 0.2716509405613318, + "grad_norm": 0.39344796538352966, + "learning_rate": 0.00014569984013309246, + "loss": 1.393, + "step": 20905 + }, + { + "epoch": 0.27166393510524767, + "grad_norm": 0.41531339287757874, + "learning_rate": 0.00014569724067118106, + "loss": 1.2759, + "step": 20906 + }, + { + "epoch": 0.27167692964916357, + "grad_norm": 0.4514845013618469, + "learning_rate": 0.00014569464120926969, + "loss": 1.4866, + "step": 20907 + }, + { + "epoch": 0.2716899241930794, + "grad_norm": 0.4289674162864685, + "learning_rate": 0.0001456920417473583, + "loss": 1.3813, + "step": 20908 + }, + { + "epoch": 0.2717029187369953, + "grad_norm": 0.4469279646873474, + "learning_rate": 0.0001456894422854469, + "loss": 1.445, + "step": 20909 + }, + { + "epoch": 0.27171591328091116, + "grad_norm": 0.3672669231891632, + "learning_rate": 0.00014568684282353553, + "loss": 1.6184, + "step": 20910 + }, + { + "epoch": 0.27172890782482706, + "grad_norm": 0.3048780858516693, + "learning_rate": 0.00014568424336162413, + "loss": 1.2093, + "step": 20911 + }, + { + "epoch": 0.2717419023687429, + "grad_norm": 0.3432950973510742, + "learning_rate": 0.00014568164389971278, + "loss": 1.4253, + "step": 20912 + }, + { + "epoch": 0.2717548969126588, + "grad_norm": 0.44160789251327515, + "learning_rate": 0.00014567904443780138, + "loss": 1.6193, + "step": 20913 + }, + { + "epoch": 0.27176789145657465, + "grad_norm": 0.43504777550697327, + "learning_rate": 0.00014567644497589, + "loss": 1.5853, + "step": 20914 + }, + { + "epoch": 0.27178088600049055, + "grad_norm": 0.42595210671424866, + "learning_rate": 0.00014567384551397863, + "loss": 1.3747, + "step": 20915 + }, + { + "epoch": 0.2717938805444064, + "grad_norm": 0.3934386074542999, + "learning_rate": 0.00014567124605206722, + "loss": 1.2053, + "step": 20916 + }, + { + "epoch": 0.2718068750883223, + "grad_norm": 0.3724689781665802, + "learning_rate": 0.00014566864659015585, + "loss": 1.3004, + "step": 20917 + }, + { + "epoch": 0.27181986963223814, + "grad_norm": 0.5848191976547241, + "learning_rate": 0.00014566604712824445, + "loss": 1.5933, + "step": 20918 + }, + { + "epoch": 0.27183286417615404, + "grad_norm": 0.4291781187057495, + "learning_rate": 0.0001456634476663331, + "loss": 1.3255, + "step": 20919 + }, + { + "epoch": 0.2718458587200699, + "grad_norm": 0.5061736106872559, + "learning_rate": 0.0001456608482044217, + "loss": 1.5369, + "step": 20920 + }, + { + "epoch": 0.2718588532639858, + "grad_norm": 0.445131778717041, + "learning_rate": 0.0001456582487425103, + "loss": 1.4927, + "step": 20921 + }, + { + "epoch": 0.27187184780790163, + "grad_norm": 0.3543862998485565, + "learning_rate": 0.00014565564928059892, + "loss": 1.447, + "step": 20922 + }, + { + "epoch": 0.27188484235181754, + "grad_norm": 0.4228907525539398, + "learning_rate": 0.00014565304981868754, + "loss": 1.3768, + "step": 20923 + }, + { + "epoch": 0.2718978368957334, + "grad_norm": 0.4196886420249939, + "learning_rate": 0.00014565045035677617, + "loss": 1.4874, + "step": 20924 + }, + { + "epoch": 0.2719108314396493, + "grad_norm": 0.4059169888496399, + "learning_rate": 0.00014564785089486476, + "loss": 1.411, + "step": 20925 + }, + { + "epoch": 0.2719238259835651, + "grad_norm": 0.34867146611213684, + "learning_rate": 0.0001456452514329534, + "loss": 1.5161, + "step": 20926 + }, + { + "epoch": 0.271936820527481, + "grad_norm": 0.4182260036468506, + "learning_rate": 0.000145642651971042, + "loss": 1.4437, + "step": 20927 + }, + { + "epoch": 0.27194981507139687, + "grad_norm": 0.2763451039791107, + "learning_rate": 0.0001456400525091306, + "loss": 1.39, + "step": 20928 + }, + { + "epoch": 0.2719628096153128, + "grad_norm": 0.2923557162284851, + "learning_rate": 0.00014563745304721923, + "loss": 1.3705, + "step": 20929 + }, + { + "epoch": 0.2719758041592286, + "grad_norm": 0.4142714738845825, + "learning_rate": 0.00014563485358530783, + "loss": 1.4015, + "step": 20930 + }, + { + "epoch": 0.2719887987031445, + "grad_norm": 0.3314935564994812, + "learning_rate": 0.00014563225412339648, + "loss": 1.2912, + "step": 20931 + }, + { + "epoch": 0.27200179324706036, + "grad_norm": 0.3004406690597534, + "learning_rate": 0.00014562965466148508, + "loss": 1.1659, + "step": 20932 + }, + { + "epoch": 0.27201478779097626, + "grad_norm": 0.4019714891910553, + "learning_rate": 0.00014562705519957368, + "loss": 1.4722, + "step": 20933 + }, + { + "epoch": 0.2720277823348921, + "grad_norm": 0.39341098070144653, + "learning_rate": 0.0001456244557376623, + "loss": 1.307, + "step": 20934 + }, + { + "epoch": 0.272040776878808, + "grad_norm": 0.37396857142448425, + "learning_rate": 0.00014562185627575093, + "loss": 1.3251, + "step": 20935 + }, + { + "epoch": 0.2720537714227239, + "grad_norm": 0.4872579276561737, + "learning_rate": 0.00014561925681383955, + "loss": 1.3513, + "step": 20936 + }, + { + "epoch": 0.27206676596663976, + "grad_norm": 0.3539903461933136, + "learning_rate": 0.00014561665735192815, + "loss": 1.4946, + "step": 20937 + }, + { + "epoch": 0.27207976051055566, + "grad_norm": 0.3839171528816223, + "learning_rate": 0.00014561405789001677, + "loss": 1.258, + "step": 20938 + }, + { + "epoch": 0.2720927550544715, + "grad_norm": 0.3958101272583008, + "learning_rate": 0.0001456114584281054, + "loss": 1.4121, + "step": 20939 + }, + { + "epoch": 0.2721057495983874, + "grad_norm": 0.46334654092788696, + "learning_rate": 0.000145608858966194, + "loss": 1.4719, + "step": 20940 + }, + { + "epoch": 0.27211874414230325, + "grad_norm": 0.43549561500549316, + "learning_rate": 0.00014560625950428262, + "loss": 1.5402, + "step": 20941 + }, + { + "epoch": 0.27213173868621915, + "grad_norm": 0.4384608566761017, + "learning_rate": 0.00014560366004237122, + "loss": 1.4569, + "step": 20942 + }, + { + "epoch": 0.272144733230135, + "grad_norm": 0.43945741653442383, + "learning_rate": 0.00014560106058045987, + "loss": 1.2072, + "step": 20943 + }, + { + "epoch": 0.2721577277740509, + "grad_norm": 0.3684191405773163, + "learning_rate": 0.00014559846111854847, + "loss": 1.5432, + "step": 20944 + }, + { + "epoch": 0.27217072231796674, + "grad_norm": 0.2938414514064789, + "learning_rate": 0.0001455958616566371, + "loss": 1.3299, + "step": 20945 + }, + { + "epoch": 0.27218371686188264, + "grad_norm": 0.3631507456302643, + "learning_rate": 0.0001455932621947257, + "loss": 1.2465, + "step": 20946 + }, + { + "epoch": 0.2721967114057985, + "grad_norm": 0.3917112946510315, + "learning_rate": 0.0001455906627328143, + "loss": 1.4785, + "step": 20947 + }, + { + "epoch": 0.2722097059497144, + "grad_norm": 0.32186800241470337, + "learning_rate": 0.00014558806327090294, + "loss": 1.3332, + "step": 20948 + }, + { + "epoch": 0.27222270049363023, + "grad_norm": 0.396060973405838, + "learning_rate": 0.00014558546380899153, + "loss": 1.4922, + "step": 20949 + }, + { + "epoch": 0.27223569503754613, + "grad_norm": 0.3950470983982086, + "learning_rate": 0.00014558286434708016, + "loss": 1.4796, + "step": 20950 + }, + { + "epoch": 0.272248689581462, + "grad_norm": 0.4561919569969177, + "learning_rate": 0.00014558026488516878, + "loss": 1.4229, + "step": 20951 + }, + { + "epoch": 0.2722616841253779, + "grad_norm": 0.3467792868614197, + "learning_rate": 0.00014557766542325738, + "loss": 1.3409, + "step": 20952 + }, + { + "epoch": 0.2722746786692937, + "grad_norm": 0.4270062744617462, + "learning_rate": 0.000145575065961346, + "loss": 1.4152, + "step": 20953 + }, + { + "epoch": 0.2722876732132096, + "grad_norm": 0.35324808955192566, + "learning_rate": 0.00014557246649943463, + "loss": 1.5561, + "step": 20954 + }, + { + "epoch": 0.27230066775712547, + "grad_norm": 0.43271857500076294, + "learning_rate": 0.00014556986703752325, + "loss": 1.4855, + "step": 20955 + }, + { + "epoch": 0.27231366230104137, + "grad_norm": 0.3370274603366852, + "learning_rate": 0.00014556726757561185, + "loss": 1.291, + "step": 20956 + }, + { + "epoch": 0.2723266568449572, + "grad_norm": 0.36171454191207886, + "learning_rate": 0.00014556466811370048, + "loss": 1.399, + "step": 20957 + }, + { + "epoch": 0.2723396513888731, + "grad_norm": 0.411704421043396, + "learning_rate": 0.0001455620686517891, + "loss": 1.3516, + "step": 20958 + }, + { + "epoch": 0.27235264593278896, + "grad_norm": 0.4227503538131714, + "learning_rate": 0.0001455594691898777, + "loss": 1.4469, + "step": 20959 + }, + { + "epoch": 0.27236564047670486, + "grad_norm": 0.3900274634361267, + "learning_rate": 0.00014555686972796632, + "loss": 1.3782, + "step": 20960 + }, + { + "epoch": 0.2723786350206207, + "grad_norm": 0.4124211370944977, + "learning_rate": 0.00014555427026605492, + "loss": 1.514, + "step": 20961 + }, + { + "epoch": 0.2723916295645366, + "grad_norm": 0.3952930271625519, + "learning_rate": 0.00014555167080414357, + "loss": 1.3357, + "step": 20962 + }, + { + "epoch": 0.27240462410845245, + "grad_norm": 0.4210646450519562, + "learning_rate": 0.00014554907134223217, + "loss": 1.6704, + "step": 20963 + }, + { + "epoch": 0.27241761865236835, + "grad_norm": 0.4260576665401459, + "learning_rate": 0.00014554647188032077, + "loss": 1.4169, + "step": 20964 + }, + { + "epoch": 0.2724306131962842, + "grad_norm": 0.39298146963119507, + "learning_rate": 0.0001455438724184094, + "loss": 1.3177, + "step": 20965 + }, + { + "epoch": 0.2724436077402001, + "grad_norm": 0.4447885751724243, + "learning_rate": 0.00014554127295649802, + "loss": 1.5267, + "step": 20966 + }, + { + "epoch": 0.27245660228411595, + "grad_norm": 0.33108648657798767, + "learning_rate": 0.00014553867349458664, + "loss": 1.3462, + "step": 20967 + }, + { + "epoch": 0.27246959682803185, + "grad_norm": 0.3959689140319824, + "learning_rate": 0.00014553607403267524, + "loss": 1.1595, + "step": 20968 + }, + { + "epoch": 0.2724825913719477, + "grad_norm": 0.4592994749546051, + "learning_rate": 0.00014553347457076386, + "loss": 1.4471, + "step": 20969 + }, + { + "epoch": 0.2724955859158636, + "grad_norm": 0.3343043923377991, + "learning_rate": 0.00014553087510885249, + "loss": 1.3166, + "step": 20970 + }, + { + "epoch": 0.27250858045977944, + "grad_norm": 0.45657259225845337, + "learning_rate": 0.00014552827564694108, + "loss": 1.4627, + "step": 20971 + }, + { + "epoch": 0.27252157500369534, + "grad_norm": 0.48273783922195435, + "learning_rate": 0.0001455256761850297, + "loss": 1.3553, + "step": 20972 + }, + { + "epoch": 0.2725345695476112, + "grad_norm": 0.3224756717681885, + "learning_rate": 0.0001455230767231183, + "loss": 1.3283, + "step": 20973 + }, + { + "epoch": 0.2725475640915271, + "grad_norm": 0.5678260922431946, + "learning_rate": 0.00014552047726120696, + "loss": 1.3435, + "step": 20974 + }, + { + "epoch": 0.27256055863544293, + "grad_norm": 0.3586970865726471, + "learning_rate": 0.00014551787779929555, + "loss": 1.2358, + "step": 20975 + }, + { + "epoch": 0.27257355317935883, + "grad_norm": 0.4826592803001404, + "learning_rate": 0.00014551527833738415, + "loss": 1.4185, + "step": 20976 + }, + { + "epoch": 0.2725865477232747, + "grad_norm": 0.4475829005241394, + "learning_rate": 0.00014551267887547278, + "loss": 1.6022, + "step": 20977 + }, + { + "epoch": 0.2725995422671906, + "grad_norm": 0.3895430266857147, + "learning_rate": 0.0001455100794135614, + "loss": 1.4663, + "step": 20978 + }, + { + "epoch": 0.2726125368111064, + "grad_norm": 0.3294030427932739, + "learning_rate": 0.00014550747995165003, + "loss": 1.263, + "step": 20979 + }, + { + "epoch": 0.2726255313550223, + "grad_norm": 0.41625484824180603, + "learning_rate": 0.00014550488048973862, + "loss": 1.48, + "step": 20980 + }, + { + "epoch": 0.27263852589893817, + "grad_norm": 0.39338308572769165, + "learning_rate": 0.00014550228102782725, + "loss": 1.557, + "step": 20981 + }, + { + "epoch": 0.27265152044285407, + "grad_norm": 0.34008949995040894, + "learning_rate": 0.00014549968156591587, + "loss": 1.5955, + "step": 20982 + }, + { + "epoch": 0.2726645149867699, + "grad_norm": 0.4699447751045227, + "learning_rate": 0.00014549708210400447, + "loss": 1.6548, + "step": 20983 + }, + { + "epoch": 0.2726775095306858, + "grad_norm": 0.44465935230255127, + "learning_rate": 0.0001454944826420931, + "loss": 1.3683, + "step": 20984 + }, + { + "epoch": 0.27269050407460166, + "grad_norm": 0.3938220739364624, + "learning_rate": 0.0001454918831801817, + "loss": 1.5514, + "step": 20985 + }, + { + "epoch": 0.27270349861851756, + "grad_norm": 0.3734889328479767, + "learning_rate": 0.00014548928371827034, + "loss": 1.4019, + "step": 20986 + }, + { + "epoch": 0.2727164931624334, + "grad_norm": 0.36078962683677673, + "learning_rate": 0.00014548668425635894, + "loss": 1.524, + "step": 20987 + }, + { + "epoch": 0.2727294877063493, + "grad_norm": 1.1044508218765259, + "learning_rate": 0.00014548408479444754, + "loss": 1.4458, + "step": 20988 + }, + { + "epoch": 0.27274248225026515, + "grad_norm": 0.38395217061042786, + "learning_rate": 0.0001454814853325362, + "loss": 1.4384, + "step": 20989 + }, + { + "epoch": 0.27275547679418105, + "grad_norm": 0.3708850145339966, + "learning_rate": 0.00014547888587062479, + "loss": 1.2456, + "step": 20990 + }, + { + "epoch": 0.2727684713380969, + "grad_norm": 0.4532736539840698, + "learning_rate": 0.0001454762864087134, + "loss": 1.3178, + "step": 20991 + }, + { + "epoch": 0.2727814658820128, + "grad_norm": 0.47734859585762024, + "learning_rate": 0.000145473686946802, + "loss": 1.4773, + "step": 20992 + }, + { + "epoch": 0.27279446042592864, + "grad_norm": 0.4454466998577118, + "learning_rate": 0.00014547108748489063, + "loss": 1.3575, + "step": 20993 + }, + { + "epoch": 0.27280745496984454, + "grad_norm": 0.42839354276657104, + "learning_rate": 0.00014546848802297926, + "loss": 1.3424, + "step": 20994 + }, + { + "epoch": 0.2728204495137604, + "grad_norm": 0.37258681654930115, + "learning_rate": 0.00014546588856106785, + "loss": 1.4759, + "step": 20995 + }, + { + "epoch": 0.2728334440576763, + "grad_norm": 0.35203108191490173, + "learning_rate": 0.00014546328909915648, + "loss": 1.4133, + "step": 20996 + }, + { + "epoch": 0.27284643860159213, + "grad_norm": 0.3980732858181, + "learning_rate": 0.0001454606896372451, + "loss": 1.4573, + "step": 20997 + }, + { + "epoch": 0.27285943314550803, + "grad_norm": 0.2713392972946167, + "learning_rate": 0.00014545809017533373, + "loss": 1.3958, + "step": 20998 + }, + { + "epoch": 0.2728724276894239, + "grad_norm": 0.49511057138442993, + "learning_rate": 0.00014545549071342233, + "loss": 1.5556, + "step": 20999 + }, + { + "epoch": 0.2728854222333398, + "grad_norm": 0.3600739538669586, + "learning_rate": 0.00014545289125151095, + "loss": 1.4051, + "step": 21000 + }, + { + "epoch": 0.2728984167772556, + "grad_norm": 0.45966795086860657, + "learning_rate": 0.00014545029178959957, + "loss": 1.499, + "step": 21001 + }, + { + "epoch": 0.2729114113211715, + "grad_norm": 0.36374330520629883, + "learning_rate": 0.00014544769232768817, + "loss": 1.4556, + "step": 21002 + }, + { + "epoch": 0.27292440586508737, + "grad_norm": 0.3642531931400299, + "learning_rate": 0.0001454450928657768, + "loss": 1.4275, + "step": 21003 + }, + { + "epoch": 0.2729374004090033, + "grad_norm": 0.3998773396015167, + "learning_rate": 0.0001454424934038654, + "loss": 1.4732, + "step": 21004 + }, + { + "epoch": 0.2729503949529191, + "grad_norm": 0.34008800983428955, + "learning_rate": 0.00014543989394195402, + "loss": 1.4137, + "step": 21005 + }, + { + "epoch": 0.272963389496835, + "grad_norm": 0.4003192186355591, + "learning_rate": 0.00014543729448004264, + "loss": 1.3219, + "step": 21006 + }, + { + "epoch": 0.27297638404075086, + "grad_norm": 0.4651870131492615, + "learning_rate": 0.00014543469501813124, + "loss": 1.3768, + "step": 21007 + }, + { + "epoch": 0.27298937858466676, + "grad_norm": 0.431985080242157, + "learning_rate": 0.00014543209555621986, + "loss": 1.4117, + "step": 21008 + }, + { + "epoch": 0.2730023731285826, + "grad_norm": 0.32566460967063904, + "learning_rate": 0.0001454294960943085, + "loss": 1.3819, + "step": 21009 + }, + { + "epoch": 0.2730153676724985, + "grad_norm": 0.41906920075416565, + "learning_rate": 0.0001454268966323971, + "loss": 1.3556, + "step": 21010 + }, + { + "epoch": 0.27302836221641436, + "grad_norm": 0.337770015001297, + "learning_rate": 0.0001454242971704857, + "loss": 1.4031, + "step": 21011 + }, + { + "epoch": 0.27304135676033026, + "grad_norm": 0.4519765079021454, + "learning_rate": 0.00014542169770857434, + "loss": 1.5323, + "step": 21012 + }, + { + "epoch": 0.27305435130424616, + "grad_norm": 0.3738921284675598, + "learning_rate": 0.00014541909824666296, + "loss": 1.5589, + "step": 21013 + }, + { + "epoch": 0.273067345848162, + "grad_norm": 0.4027923047542572, + "learning_rate": 0.00014541649878475156, + "loss": 1.3714, + "step": 21014 + }, + { + "epoch": 0.2730803403920779, + "grad_norm": 0.39273229241371155, + "learning_rate": 0.00014541389932284018, + "loss": 1.3112, + "step": 21015 + }, + { + "epoch": 0.27309333493599375, + "grad_norm": 0.36209815740585327, + "learning_rate": 0.00014541129986092878, + "loss": 1.3559, + "step": 21016 + }, + { + "epoch": 0.27310632947990965, + "grad_norm": 0.4092465341091156, + "learning_rate": 0.0001454087003990174, + "loss": 1.4444, + "step": 21017 + }, + { + "epoch": 0.2731193240238255, + "grad_norm": 0.3870810866355896, + "learning_rate": 0.00014540610093710603, + "loss": 1.39, + "step": 21018 + }, + { + "epoch": 0.2731323185677414, + "grad_norm": 0.42070648074150085, + "learning_rate": 0.00014540350147519463, + "loss": 1.3565, + "step": 21019 + }, + { + "epoch": 0.27314531311165724, + "grad_norm": 0.41532325744628906, + "learning_rate": 0.00014540090201328325, + "loss": 1.3182, + "step": 21020 + }, + { + "epoch": 0.27315830765557314, + "grad_norm": 0.4815254509449005, + "learning_rate": 0.00014539830255137187, + "loss": 1.3652, + "step": 21021 + }, + { + "epoch": 0.273171302199489, + "grad_norm": 0.33928966522216797, + "learning_rate": 0.0001453957030894605, + "loss": 1.2993, + "step": 21022 + }, + { + "epoch": 0.2731842967434049, + "grad_norm": 0.4513469338417053, + "learning_rate": 0.0001453931036275491, + "loss": 1.5187, + "step": 21023 + }, + { + "epoch": 0.27319729128732073, + "grad_norm": 0.36666297912597656, + "learning_rate": 0.00014539050416563772, + "loss": 1.3535, + "step": 21024 + }, + { + "epoch": 0.27321028583123663, + "grad_norm": 0.42291414737701416, + "learning_rate": 0.00014538790470372635, + "loss": 1.3596, + "step": 21025 + }, + { + "epoch": 0.2732232803751525, + "grad_norm": 0.2663572132587433, + "learning_rate": 0.00014538530524181494, + "loss": 1.387, + "step": 21026 + }, + { + "epoch": 0.2732362749190684, + "grad_norm": 0.36366698145866394, + "learning_rate": 0.00014538270577990357, + "loss": 1.4335, + "step": 21027 + }, + { + "epoch": 0.2732492694629842, + "grad_norm": 0.4077562391757965, + "learning_rate": 0.0001453801063179922, + "loss": 1.4028, + "step": 21028 + }, + { + "epoch": 0.2732622640069001, + "grad_norm": 0.2685930132865906, + "learning_rate": 0.00014537750685608082, + "loss": 1.0368, + "step": 21029 + }, + { + "epoch": 0.27327525855081597, + "grad_norm": 0.45041173696517944, + "learning_rate": 0.0001453749073941694, + "loss": 1.5317, + "step": 21030 + }, + { + "epoch": 0.27328825309473187, + "grad_norm": 0.35876336693763733, + "learning_rate": 0.000145372307932258, + "loss": 1.3686, + "step": 21031 + }, + { + "epoch": 0.2733012476386477, + "grad_norm": 0.4320288896560669, + "learning_rate": 0.00014536970847034666, + "loss": 1.4189, + "step": 21032 + }, + { + "epoch": 0.2733142421825636, + "grad_norm": 0.4032602608203888, + "learning_rate": 0.00014536710900843526, + "loss": 1.3094, + "step": 21033 + }, + { + "epoch": 0.27332723672647946, + "grad_norm": 0.4131283164024353, + "learning_rate": 0.00014536450954652388, + "loss": 1.3437, + "step": 21034 + }, + { + "epoch": 0.27334023127039536, + "grad_norm": 0.4300435185432434, + "learning_rate": 0.00014536191008461248, + "loss": 1.4145, + "step": 21035 + }, + { + "epoch": 0.2733532258143112, + "grad_norm": 0.3635316491127014, + "learning_rate": 0.0001453593106227011, + "loss": 1.3144, + "step": 21036 + }, + { + "epoch": 0.2733662203582271, + "grad_norm": 0.3523062765598297, + "learning_rate": 0.00014535671116078973, + "loss": 1.3391, + "step": 21037 + }, + { + "epoch": 0.27337921490214295, + "grad_norm": 0.3737955689430237, + "learning_rate": 0.00014535411169887833, + "loss": 1.6278, + "step": 21038 + }, + { + "epoch": 0.27339220944605885, + "grad_norm": 0.39869099855422974, + "learning_rate": 0.00014535151223696695, + "loss": 1.3535, + "step": 21039 + }, + { + "epoch": 0.2734052039899747, + "grad_norm": 0.46052759885787964, + "learning_rate": 0.00014534891277505558, + "loss": 1.4842, + "step": 21040 + }, + { + "epoch": 0.2734181985338906, + "grad_norm": 0.5159851908683777, + "learning_rate": 0.0001453463133131442, + "loss": 1.631, + "step": 21041 + }, + { + "epoch": 0.27343119307780644, + "grad_norm": 0.37330055236816406, + "learning_rate": 0.0001453437138512328, + "loss": 1.3921, + "step": 21042 + }, + { + "epoch": 0.27344418762172235, + "grad_norm": 0.348766565322876, + "learning_rate": 0.0001453411143893214, + "loss": 1.2829, + "step": 21043 + }, + { + "epoch": 0.2734571821656382, + "grad_norm": 0.33741191029548645, + "learning_rate": 0.00014533851492741005, + "loss": 1.518, + "step": 21044 + }, + { + "epoch": 0.2734701767095541, + "grad_norm": 0.3587682843208313, + "learning_rate": 0.00014533591546549865, + "loss": 1.4188, + "step": 21045 + }, + { + "epoch": 0.27348317125346994, + "grad_norm": 0.34680241346359253, + "learning_rate": 0.00014533331600358727, + "loss": 1.3415, + "step": 21046 + }, + { + "epoch": 0.27349616579738584, + "grad_norm": 0.41146257519721985, + "learning_rate": 0.00014533071654167587, + "loss": 1.4233, + "step": 21047 + }, + { + "epoch": 0.2735091603413017, + "grad_norm": 0.44580915570259094, + "learning_rate": 0.0001453281170797645, + "loss": 1.4175, + "step": 21048 + }, + { + "epoch": 0.2735221548852176, + "grad_norm": 0.3958224952220917, + "learning_rate": 0.00014532551761785312, + "loss": 1.525, + "step": 21049 + }, + { + "epoch": 0.27353514942913343, + "grad_norm": 0.45374515652656555, + "learning_rate": 0.0001453229181559417, + "loss": 1.4871, + "step": 21050 + }, + { + "epoch": 0.27354814397304933, + "grad_norm": 0.37961292266845703, + "learning_rate": 0.00014532031869403034, + "loss": 1.6329, + "step": 21051 + }, + { + "epoch": 0.2735611385169652, + "grad_norm": 0.30875957012176514, + "learning_rate": 0.00014531771923211896, + "loss": 1.4655, + "step": 21052 + }, + { + "epoch": 0.2735741330608811, + "grad_norm": 0.3240174651145935, + "learning_rate": 0.0001453151197702076, + "loss": 1.1792, + "step": 21053 + }, + { + "epoch": 0.2735871276047969, + "grad_norm": 0.476652055978775, + "learning_rate": 0.00014531252030829618, + "loss": 1.4837, + "step": 21054 + }, + { + "epoch": 0.2736001221487128, + "grad_norm": 0.4962819814682007, + "learning_rate": 0.00014530992084638478, + "loss": 1.2407, + "step": 21055 + }, + { + "epoch": 0.27361311669262867, + "grad_norm": 0.7336745262145996, + "learning_rate": 0.00014530732138447343, + "loss": 1.4192, + "step": 21056 + }, + { + "epoch": 0.27362611123654457, + "grad_norm": 0.41748496890068054, + "learning_rate": 0.00014530472192256203, + "loss": 1.493, + "step": 21057 + }, + { + "epoch": 0.2736391057804604, + "grad_norm": 0.4320474863052368, + "learning_rate": 0.00014530212246065065, + "loss": 1.4288, + "step": 21058 + }, + { + "epoch": 0.2736521003243763, + "grad_norm": 0.4384480118751526, + "learning_rate": 0.00014529952299873925, + "loss": 1.4565, + "step": 21059 + }, + { + "epoch": 0.27366509486829216, + "grad_norm": 0.3575893044471741, + "learning_rate": 0.00014529692353682788, + "loss": 1.585, + "step": 21060 + }, + { + "epoch": 0.27367808941220806, + "grad_norm": 0.37584006786346436, + "learning_rate": 0.0001452943240749165, + "loss": 1.4706, + "step": 21061 + }, + { + "epoch": 0.2736910839561239, + "grad_norm": 0.4650433659553528, + "learning_rate": 0.0001452917246130051, + "loss": 1.471, + "step": 21062 + }, + { + "epoch": 0.2737040785000398, + "grad_norm": 0.4942532181739807, + "learning_rate": 0.00014528912515109375, + "loss": 1.5046, + "step": 21063 + }, + { + "epoch": 0.27371707304395565, + "grad_norm": 0.3835715353488922, + "learning_rate": 0.00014528652568918235, + "loss": 1.4204, + "step": 21064 + }, + { + "epoch": 0.27373006758787155, + "grad_norm": 0.4147658944129944, + "learning_rate": 0.00014528392622727097, + "loss": 1.674, + "step": 21065 + }, + { + "epoch": 0.2737430621317874, + "grad_norm": 0.4385238587856293, + "learning_rate": 0.00014528132676535957, + "loss": 1.4338, + "step": 21066 + }, + { + "epoch": 0.2737560566757033, + "grad_norm": 0.45123305916786194, + "learning_rate": 0.0001452787273034482, + "loss": 1.4056, + "step": 21067 + }, + { + "epoch": 0.27376905121961914, + "grad_norm": 0.37931713461875916, + "learning_rate": 0.00014527612784153682, + "loss": 1.5346, + "step": 21068 + }, + { + "epoch": 0.27378204576353504, + "grad_norm": 0.3663512170314789, + "learning_rate": 0.00014527352837962542, + "loss": 1.3788, + "step": 21069 + }, + { + "epoch": 0.2737950403074509, + "grad_norm": 0.4405010938644409, + "learning_rate": 0.00014527092891771404, + "loss": 1.298, + "step": 21070 + }, + { + "epoch": 0.2738080348513668, + "grad_norm": 0.3729875087738037, + "learning_rate": 0.00014526832945580266, + "loss": 1.5995, + "step": 21071 + }, + { + "epoch": 0.27382102939528263, + "grad_norm": 0.42570021748542786, + "learning_rate": 0.00014526572999389126, + "loss": 1.3873, + "step": 21072 + }, + { + "epoch": 0.27383402393919853, + "grad_norm": 0.3910354673862457, + "learning_rate": 0.0001452631305319799, + "loss": 1.4487, + "step": 21073 + }, + { + "epoch": 0.2738470184831144, + "grad_norm": 0.2987804412841797, + "learning_rate": 0.00014526053107006848, + "loss": 1.1991, + "step": 21074 + }, + { + "epoch": 0.2738600130270303, + "grad_norm": 0.3597513437271118, + "learning_rate": 0.00014525793160815714, + "loss": 1.4366, + "step": 21075 + }, + { + "epoch": 0.2738730075709461, + "grad_norm": 0.48137202858924866, + "learning_rate": 0.00014525533214624573, + "loss": 1.42, + "step": 21076 + }, + { + "epoch": 0.273886002114862, + "grad_norm": 0.42970511317253113, + "learning_rate": 0.00014525273268433436, + "loss": 1.3713, + "step": 21077 + }, + { + "epoch": 0.27389899665877787, + "grad_norm": 0.3753858506679535, + "learning_rate": 0.00014525013322242295, + "loss": 1.2101, + "step": 21078 + }, + { + "epoch": 0.27391199120269377, + "grad_norm": 0.4589577317237854, + "learning_rate": 0.00014524753376051158, + "loss": 1.4407, + "step": 21079 + }, + { + "epoch": 0.2739249857466096, + "grad_norm": 0.3946545422077179, + "learning_rate": 0.0001452449342986002, + "loss": 1.3384, + "step": 21080 + }, + { + "epoch": 0.2739379802905255, + "grad_norm": 0.415936142206192, + "learning_rate": 0.0001452423348366888, + "loss": 1.4975, + "step": 21081 + }, + { + "epoch": 0.27395097483444136, + "grad_norm": 0.48644179105758667, + "learning_rate": 0.00014523973537477743, + "loss": 1.4348, + "step": 21082 + }, + { + "epoch": 0.27396396937835726, + "grad_norm": 0.38244932889938354, + "learning_rate": 0.00014523713591286605, + "loss": 1.4551, + "step": 21083 + }, + { + "epoch": 0.2739769639222731, + "grad_norm": 0.35513171553611755, + "learning_rate": 0.00014523453645095467, + "loss": 1.3898, + "step": 21084 + }, + { + "epoch": 0.273989958466189, + "grad_norm": 0.5417866110801697, + "learning_rate": 0.00014523193698904327, + "loss": 1.4588, + "step": 21085 + }, + { + "epoch": 0.27400295301010485, + "grad_norm": 0.41893693804740906, + "learning_rate": 0.00014522933752713187, + "loss": 1.4227, + "step": 21086 + }, + { + "epoch": 0.27401594755402076, + "grad_norm": 0.3841964602470398, + "learning_rate": 0.00014522673806522052, + "loss": 1.6915, + "step": 21087 + }, + { + "epoch": 0.27402894209793666, + "grad_norm": 0.33003556728363037, + "learning_rate": 0.00014522413860330912, + "loss": 1.395, + "step": 21088 + }, + { + "epoch": 0.2740419366418525, + "grad_norm": 0.28712520003318787, + "learning_rate": 0.00014522153914139774, + "loss": 1.4047, + "step": 21089 + }, + { + "epoch": 0.2740549311857684, + "grad_norm": 0.40768882632255554, + "learning_rate": 0.00014521893967948634, + "loss": 1.4435, + "step": 21090 + }, + { + "epoch": 0.27406792572968425, + "grad_norm": 0.509784460067749, + "learning_rate": 0.00014521634021757496, + "loss": 1.4047, + "step": 21091 + }, + { + "epoch": 0.27408092027360015, + "grad_norm": 0.35930219292640686, + "learning_rate": 0.0001452137407556636, + "loss": 1.4614, + "step": 21092 + }, + { + "epoch": 0.274093914817516, + "grad_norm": 0.4207635521888733, + "learning_rate": 0.0001452111412937522, + "loss": 1.1942, + "step": 21093 + }, + { + "epoch": 0.2741069093614319, + "grad_norm": 0.30240803956985474, + "learning_rate": 0.0001452085418318408, + "loss": 1.3059, + "step": 21094 + }, + { + "epoch": 0.27411990390534774, + "grad_norm": 0.5255926847457886, + "learning_rate": 0.00014520594236992944, + "loss": 1.3044, + "step": 21095 + }, + { + "epoch": 0.27413289844926364, + "grad_norm": 0.3513743281364441, + "learning_rate": 0.00014520334290801806, + "loss": 1.3226, + "step": 21096 + }, + { + "epoch": 0.2741458929931795, + "grad_norm": 0.4706592857837677, + "learning_rate": 0.00014520074344610666, + "loss": 1.4042, + "step": 21097 + }, + { + "epoch": 0.2741588875370954, + "grad_norm": 0.4073595404624939, + "learning_rate": 0.00014519814398419528, + "loss": 1.3979, + "step": 21098 + }, + { + "epoch": 0.27417188208101123, + "grad_norm": 0.4529314339160919, + "learning_rate": 0.0001451955445222839, + "loss": 1.2585, + "step": 21099 + }, + { + "epoch": 0.27418487662492713, + "grad_norm": 0.4774497151374817, + "learning_rate": 0.0001451929450603725, + "loss": 1.3144, + "step": 21100 + }, + { + "epoch": 0.274197871168843, + "grad_norm": 0.39242956042289734, + "learning_rate": 0.00014519034559846113, + "loss": 1.4923, + "step": 21101 + }, + { + "epoch": 0.2742108657127589, + "grad_norm": 0.3895746171474457, + "learning_rate": 0.00014518774613654975, + "loss": 1.2079, + "step": 21102 + }, + { + "epoch": 0.2742238602566747, + "grad_norm": 0.4333137571811676, + "learning_rate": 0.00014518514667463835, + "loss": 1.2861, + "step": 21103 + }, + { + "epoch": 0.2742368548005906, + "grad_norm": 0.26529887318611145, + "learning_rate": 0.00014518254721272697, + "loss": 1.1751, + "step": 21104 + }, + { + "epoch": 0.27424984934450647, + "grad_norm": 0.475535124540329, + "learning_rate": 0.00014517994775081557, + "loss": 1.5904, + "step": 21105 + }, + { + "epoch": 0.27426284388842237, + "grad_norm": 0.40255001187324524, + "learning_rate": 0.00014517734828890422, + "loss": 1.63, + "step": 21106 + }, + { + "epoch": 0.2742758384323382, + "grad_norm": 0.3675733506679535, + "learning_rate": 0.00014517474882699282, + "loss": 1.2531, + "step": 21107 + }, + { + "epoch": 0.2742888329762541, + "grad_norm": 0.4259125888347626, + "learning_rate": 0.00014517214936508145, + "loss": 1.4274, + "step": 21108 + }, + { + "epoch": 0.27430182752016996, + "grad_norm": 0.516318142414093, + "learning_rate": 0.00014516954990317004, + "loss": 1.5148, + "step": 21109 + }, + { + "epoch": 0.27431482206408586, + "grad_norm": 0.46355438232421875, + "learning_rate": 0.00014516695044125867, + "loss": 1.256, + "step": 21110 + }, + { + "epoch": 0.2743278166080017, + "grad_norm": 0.3909146189689636, + "learning_rate": 0.0001451643509793473, + "loss": 1.5112, + "step": 21111 + }, + { + "epoch": 0.2743408111519176, + "grad_norm": 0.40489462018013, + "learning_rate": 0.0001451617515174359, + "loss": 1.4785, + "step": 21112 + }, + { + "epoch": 0.27435380569583345, + "grad_norm": 0.4154320955276489, + "learning_rate": 0.00014515915205552451, + "loss": 1.4824, + "step": 21113 + }, + { + "epoch": 0.27436680023974935, + "grad_norm": 0.33446720242500305, + "learning_rate": 0.00014515655259361314, + "loss": 1.165, + "step": 21114 + }, + { + "epoch": 0.2743797947836652, + "grad_norm": 0.31943079829216003, + "learning_rate": 0.00014515395313170174, + "loss": 1.237, + "step": 21115 + }, + { + "epoch": 0.2743927893275811, + "grad_norm": 0.4016754925251007, + "learning_rate": 0.00014515135366979036, + "loss": 1.3974, + "step": 21116 + }, + { + "epoch": 0.27440578387149694, + "grad_norm": 0.4799249768257141, + "learning_rate": 0.00014514875420787896, + "loss": 1.4643, + "step": 21117 + }, + { + "epoch": 0.27441877841541285, + "grad_norm": 0.4338774085044861, + "learning_rate": 0.0001451461547459676, + "loss": 1.1566, + "step": 21118 + }, + { + "epoch": 0.2744317729593287, + "grad_norm": 0.37024784088134766, + "learning_rate": 0.0001451435552840562, + "loss": 1.5777, + "step": 21119 + }, + { + "epoch": 0.2744447675032446, + "grad_norm": 0.42051899433135986, + "learning_rate": 0.00014514095582214483, + "loss": 1.4166, + "step": 21120 + }, + { + "epoch": 0.27445776204716044, + "grad_norm": 0.40367645025253296, + "learning_rate": 0.00014513835636023343, + "loss": 1.2711, + "step": 21121 + }, + { + "epoch": 0.27447075659107634, + "grad_norm": 0.3834116458892822, + "learning_rate": 0.00014513575689832205, + "loss": 1.5209, + "step": 21122 + }, + { + "epoch": 0.2744837511349922, + "grad_norm": 0.45230185985565186, + "learning_rate": 0.00014513315743641068, + "loss": 1.4885, + "step": 21123 + }, + { + "epoch": 0.2744967456789081, + "grad_norm": 0.44072362780570984, + "learning_rate": 0.00014513055797449927, + "loss": 1.4712, + "step": 21124 + }, + { + "epoch": 0.27450974022282393, + "grad_norm": 0.5290454626083374, + "learning_rate": 0.0001451279585125879, + "loss": 1.6872, + "step": 21125 + }, + { + "epoch": 0.27452273476673983, + "grad_norm": 0.4498007893562317, + "learning_rate": 0.00014512535905067652, + "loss": 1.4448, + "step": 21126 + }, + { + "epoch": 0.2745357293106557, + "grad_norm": 0.48479050397872925, + "learning_rate": 0.00014512275958876512, + "loss": 1.4481, + "step": 21127 + }, + { + "epoch": 0.2745487238545716, + "grad_norm": 0.46207278966903687, + "learning_rate": 0.00014512016012685375, + "loss": 1.594, + "step": 21128 + }, + { + "epoch": 0.2745617183984874, + "grad_norm": 0.4562664330005646, + "learning_rate": 0.00014511756066494234, + "loss": 1.4636, + "step": 21129 + }, + { + "epoch": 0.2745747129424033, + "grad_norm": 0.4015084207057953, + "learning_rate": 0.000145114961203031, + "loss": 1.4764, + "step": 21130 + }, + { + "epoch": 0.27458770748631917, + "grad_norm": 0.4773150682449341, + "learning_rate": 0.0001451123617411196, + "loss": 1.612, + "step": 21131 + }, + { + "epoch": 0.27460070203023507, + "grad_norm": 0.47321027517318726, + "learning_rate": 0.00014510976227920822, + "loss": 1.4557, + "step": 21132 + }, + { + "epoch": 0.2746136965741509, + "grad_norm": 0.34213805198669434, + "learning_rate": 0.00014510716281729681, + "loss": 1.3234, + "step": 21133 + }, + { + "epoch": 0.2746266911180668, + "grad_norm": 0.491956502199173, + "learning_rate": 0.00014510456335538544, + "loss": 1.6393, + "step": 21134 + }, + { + "epoch": 0.27463968566198266, + "grad_norm": 0.24043916165828705, + "learning_rate": 0.00014510196389347406, + "loss": 1.4292, + "step": 21135 + }, + { + "epoch": 0.27465268020589856, + "grad_norm": 0.49715086817741394, + "learning_rate": 0.00014509936443156266, + "loss": 1.3918, + "step": 21136 + }, + { + "epoch": 0.2746656747498144, + "grad_norm": 0.4062346816062927, + "learning_rate": 0.0001450967649696513, + "loss": 1.3848, + "step": 21137 + }, + { + "epoch": 0.2746786692937303, + "grad_norm": 0.4312981367111206, + "learning_rate": 0.0001450941655077399, + "loss": 1.3298, + "step": 21138 + }, + { + "epoch": 0.27469166383764615, + "grad_norm": 0.3923072814941406, + "learning_rate": 0.0001450915660458285, + "loss": 1.3491, + "step": 21139 + }, + { + "epoch": 0.27470465838156205, + "grad_norm": 0.4729924499988556, + "learning_rate": 0.00014508896658391713, + "loss": 1.425, + "step": 21140 + }, + { + "epoch": 0.2747176529254779, + "grad_norm": 0.48524609208106995, + "learning_rate": 0.00014508636712200576, + "loss": 1.4278, + "step": 21141 + }, + { + "epoch": 0.2747306474693938, + "grad_norm": 0.3876422643661499, + "learning_rate": 0.00014508376766009438, + "loss": 1.4143, + "step": 21142 + }, + { + "epoch": 0.27474364201330964, + "grad_norm": 0.3328193426132202, + "learning_rate": 0.00014508116819818298, + "loss": 1.3708, + "step": 21143 + }, + { + "epoch": 0.27475663655722554, + "grad_norm": 0.3669082224369049, + "learning_rate": 0.0001450785687362716, + "loss": 1.404, + "step": 21144 + }, + { + "epoch": 0.2747696311011414, + "grad_norm": 0.6811167001724243, + "learning_rate": 0.00014507596927436023, + "loss": 1.56, + "step": 21145 + }, + { + "epoch": 0.2747826256450573, + "grad_norm": 0.3828492760658264, + "learning_rate": 0.00014507336981244882, + "loss": 1.1858, + "step": 21146 + }, + { + "epoch": 0.27479562018897313, + "grad_norm": 0.31917595863342285, + "learning_rate": 0.00014507077035053745, + "loss": 1.3122, + "step": 21147 + }, + { + "epoch": 0.27480861473288903, + "grad_norm": 0.4403409957885742, + "learning_rate": 0.00014506817088862605, + "loss": 1.4218, + "step": 21148 + }, + { + "epoch": 0.2748216092768049, + "grad_norm": 0.32866010069847107, + "learning_rate": 0.0001450655714267147, + "loss": 1.2003, + "step": 21149 + }, + { + "epoch": 0.2748346038207208, + "grad_norm": 0.3999956548213959, + "learning_rate": 0.0001450629719648033, + "loss": 1.4845, + "step": 21150 + }, + { + "epoch": 0.2748475983646366, + "grad_norm": 0.4942968785762787, + "learning_rate": 0.00014506037250289192, + "loss": 1.4445, + "step": 21151 + }, + { + "epoch": 0.2748605929085525, + "grad_norm": 0.3724045753479004, + "learning_rate": 0.00014505777304098052, + "loss": 1.3934, + "step": 21152 + }, + { + "epoch": 0.27487358745246837, + "grad_norm": 0.47794023156166077, + "learning_rate": 0.00014505517357906914, + "loss": 1.3485, + "step": 21153 + }, + { + "epoch": 0.27488658199638427, + "grad_norm": 0.37596553564071655, + "learning_rate": 0.00014505257411715777, + "loss": 1.4601, + "step": 21154 + }, + { + "epoch": 0.2748995765403001, + "grad_norm": 0.41127005219459534, + "learning_rate": 0.00014504997465524636, + "loss": 1.3331, + "step": 21155 + }, + { + "epoch": 0.274912571084216, + "grad_norm": 0.3894975483417511, + "learning_rate": 0.000145047375193335, + "loss": 1.1967, + "step": 21156 + }, + { + "epoch": 0.27492556562813186, + "grad_norm": 0.4334724247455597, + "learning_rate": 0.0001450447757314236, + "loss": 1.5161, + "step": 21157 + }, + { + "epoch": 0.27493856017204776, + "grad_norm": 0.6829808950424194, + "learning_rate": 0.0001450421762695122, + "loss": 1.4384, + "step": 21158 + }, + { + "epoch": 0.2749515547159636, + "grad_norm": 0.43273308873176575, + "learning_rate": 0.00014503957680760083, + "loss": 1.417, + "step": 21159 + }, + { + "epoch": 0.2749645492598795, + "grad_norm": 0.36936119198799133, + "learning_rate": 0.00014503697734568943, + "loss": 1.4714, + "step": 21160 + }, + { + "epoch": 0.27497754380379535, + "grad_norm": 0.4327675998210907, + "learning_rate": 0.00014503437788377808, + "loss": 1.6612, + "step": 21161 + }, + { + "epoch": 0.27499053834771126, + "grad_norm": 0.4079569876194, + "learning_rate": 0.00014503177842186668, + "loss": 1.4983, + "step": 21162 + }, + { + "epoch": 0.2750035328916271, + "grad_norm": 0.36818698048591614, + "learning_rate": 0.0001450291789599553, + "loss": 1.2719, + "step": 21163 + }, + { + "epoch": 0.275016527435543, + "grad_norm": 0.33294445276260376, + "learning_rate": 0.0001450265794980439, + "loss": 1.4431, + "step": 21164 + }, + { + "epoch": 0.2750295219794589, + "grad_norm": 0.43079033493995667, + "learning_rate": 0.00014502398003613253, + "loss": 1.4679, + "step": 21165 + }, + { + "epoch": 0.27504251652337475, + "grad_norm": 0.3781503736972809, + "learning_rate": 0.00014502138057422115, + "loss": 1.7069, + "step": 21166 + }, + { + "epoch": 0.27505551106729065, + "grad_norm": 0.31240200996398926, + "learning_rate": 0.00014501878111230975, + "loss": 1.3101, + "step": 21167 + }, + { + "epoch": 0.2750685056112065, + "grad_norm": 0.4006272256374359, + "learning_rate": 0.00014501618165039837, + "loss": 1.3933, + "step": 21168 + }, + { + "epoch": 0.2750815001551224, + "grad_norm": 0.38519835472106934, + "learning_rate": 0.000145013582188487, + "loss": 1.487, + "step": 21169 + }, + { + "epoch": 0.27509449469903824, + "grad_norm": 0.3168710768222809, + "learning_rate": 0.0001450109827265756, + "loss": 1.1667, + "step": 21170 + }, + { + "epoch": 0.27510748924295414, + "grad_norm": 0.444307804107666, + "learning_rate": 0.00014500838326466422, + "loss": 1.5404, + "step": 21171 + }, + { + "epoch": 0.27512048378687, + "grad_norm": 0.40774983167648315, + "learning_rate": 0.00014500578380275284, + "loss": 1.4113, + "step": 21172 + }, + { + "epoch": 0.2751334783307859, + "grad_norm": 0.33713987469673157, + "learning_rate": 0.00014500318434084147, + "loss": 1.4219, + "step": 21173 + }, + { + "epoch": 0.27514647287470173, + "grad_norm": 0.43668732047080994, + "learning_rate": 0.00014500058487893007, + "loss": 1.2493, + "step": 21174 + }, + { + "epoch": 0.27515946741861763, + "grad_norm": 0.508034348487854, + "learning_rate": 0.0001449979854170187, + "loss": 1.4009, + "step": 21175 + }, + { + "epoch": 0.2751724619625335, + "grad_norm": 0.44830527901649475, + "learning_rate": 0.00014499538595510731, + "loss": 1.281, + "step": 21176 + }, + { + "epoch": 0.2751854565064494, + "grad_norm": 0.4372822940349579, + "learning_rate": 0.0001449927864931959, + "loss": 1.334, + "step": 21177 + }, + { + "epoch": 0.2751984510503652, + "grad_norm": 0.3612796664237976, + "learning_rate": 0.00014499018703128454, + "loss": 1.546, + "step": 21178 + }, + { + "epoch": 0.2752114455942811, + "grad_norm": 0.40134289860725403, + "learning_rate": 0.00014498758756937313, + "loss": 1.3757, + "step": 21179 + }, + { + "epoch": 0.27522444013819697, + "grad_norm": 0.38240689039230347, + "learning_rate": 0.00014498498810746178, + "loss": 1.6334, + "step": 21180 + }, + { + "epoch": 0.27523743468211287, + "grad_norm": 0.4360727071762085, + "learning_rate": 0.00014498238864555038, + "loss": 1.3282, + "step": 21181 + }, + { + "epoch": 0.2752504292260287, + "grad_norm": 0.3220953643321991, + "learning_rate": 0.00014497978918363898, + "loss": 1.145, + "step": 21182 + }, + { + "epoch": 0.2752634237699446, + "grad_norm": 0.39528268575668335, + "learning_rate": 0.0001449771897217276, + "loss": 1.4409, + "step": 21183 + }, + { + "epoch": 0.27527641831386046, + "grad_norm": 0.4668637812137604, + "learning_rate": 0.00014497459025981623, + "loss": 1.3587, + "step": 21184 + }, + { + "epoch": 0.27528941285777636, + "grad_norm": 0.3947980999946594, + "learning_rate": 0.00014497199079790485, + "loss": 1.7426, + "step": 21185 + }, + { + "epoch": 0.2753024074016922, + "grad_norm": 0.4025053381919861, + "learning_rate": 0.00014496939133599345, + "loss": 1.4422, + "step": 21186 + }, + { + "epoch": 0.2753154019456081, + "grad_norm": 0.4357895851135254, + "learning_rate": 0.00014496679187408207, + "loss": 1.288, + "step": 21187 + }, + { + "epoch": 0.27532839648952395, + "grad_norm": 0.36953502893447876, + "learning_rate": 0.0001449641924121707, + "loss": 1.4374, + "step": 21188 + }, + { + "epoch": 0.27534139103343985, + "grad_norm": 0.38954612612724304, + "learning_rate": 0.0001449615929502593, + "loss": 1.3362, + "step": 21189 + }, + { + "epoch": 0.2753543855773557, + "grad_norm": 0.3733152747154236, + "learning_rate": 0.00014495899348834792, + "loss": 1.5403, + "step": 21190 + }, + { + "epoch": 0.2753673801212716, + "grad_norm": 0.36010414361953735, + "learning_rate": 0.00014495639402643652, + "loss": 1.3824, + "step": 21191 + }, + { + "epoch": 0.27538037466518744, + "grad_norm": 0.3495846092700958, + "learning_rate": 0.00014495379456452517, + "loss": 1.3414, + "step": 21192 + }, + { + "epoch": 0.27539336920910334, + "grad_norm": 0.4866452217102051, + "learning_rate": 0.00014495119510261377, + "loss": 1.4483, + "step": 21193 + }, + { + "epoch": 0.2754063637530192, + "grad_norm": 0.38002511858940125, + "learning_rate": 0.00014494859564070237, + "loss": 1.5761, + "step": 21194 + }, + { + "epoch": 0.2754193582969351, + "grad_norm": 0.4484215974807739, + "learning_rate": 0.000144945996178791, + "loss": 1.492, + "step": 21195 + }, + { + "epoch": 0.27543235284085094, + "grad_norm": 0.36568936705589294, + "learning_rate": 0.00014494339671687961, + "loss": 1.6437, + "step": 21196 + }, + { + "epoch": 0.27544534738476684, + "grad_norm": 0.4071952700614929, + "learning_rate": 0.00014494079725496824, + "loss": 1.6131, + "step": 21197 + }, + { + "epoch": 0.2754583419286827, + "grad_norm": 0.3388287425041199, + "learning_rate": 0.00014493819779305684, + "loss": 1.3583, + "step": 21198 + }, + { + "epoch": 0.2754713364725986, + "grad_norm": 0.44066786766052246, + "learning_rate": 0.00014493559833114546, + "loss": 1.4328, + "step": 21199 + }, + { + "epoch": 0.2754843310165144, + "grad_norm": 0.4633805751800537, + "learning_rate": 0.00014493299886923408, + "loss": 1.4578, + "step": 21200 + }, + { + "epoch": 0.27549732556043033, + "grad_norm": 0.5341418981552124, + "learning_rate": 0.00014493039940732268, + "loss": 1.4543, + "step": 21201 + }, + { + "epoch": 0.2755103201043462, + "grad_norm": 0.3690308630466461, + "learning_rate": 0.0001449277999454113, + "loss": 1.3158, + "step": 21202 + }, + { + "epoch": 0.2755233146482621, + "grad_norm": 0.4203195571899414, + "learning_rate": 0.0001449252004834999, + "loss": 1.368, + "step": 21203 + }, + { + "epoch": 0.2755363091921779, + "grad_norm": 0.3265753388404846, + "learning_rate": 0.00014492260102158856, + "loss": 1.1581, + "step": 21204 + }, + { + "epoch": 0.2755493037360938, + "grad_norm": 0.419819176197052, + "learning_rate": 0.00014492000155967715, + "loss": 1.4314, + "step": 21205 + }, + { + "epoch": 0.27556229828000967, + "grad_norm": 0.4059402346611023, + "learning_rate": 0.00014491740209776578, + "loss": 1.4519, + "step": 21206 + }, + { + "epoch": 0.27557529282392557, + "grad_norm": 0.3671705424785614, + "learning_rate": 0.00014491480263585437, + "loss": 1.4712, + "step": 21207 + }, + { + "epoch": 0.2755882873678414, + "grad_norm": 0.4350607991218567, + "learning_rate": 0.000144912203173943, + "loss": 1.4063, + "step": 21208 + }, + { + "epoch": 0.2756012819117573, + "grad_norm": 0.40174663066864014, + "learning_rate": 0.00014490960371203162, + "loss": 1.5422, + "step": 21209 + }, + { + "epoch": 0.27561427645567316, + "grad_norm": 0.5050309896469116, + "learning_rate": 0.00014490700425012022, + "loss": 1.3792, + "step": 21210 + }, + { + "epoch": 0.27562727099958906, + "grad_norm": 0.4664018154144287, + "learning_rate": 0.00014490440478820885, + "loss": 1.3682, + "step": 21211 + }, + { + "epoch": 0.2756402655435049, + "grad_norm": 0.3265899121761322, + "learning_rate": 0.00014490180532629747, + "loss": 1.5398, + "step": 21212 + }, + { + "epoch": 0.2756532600874208, + "grad_norm": 0.31491950154304504, + "learning_rate": 0.00014489920586438607, + "loss": 1.2475, + "step": 21213 + }, + { + "epoch": 0.27566625463133665, + "grad_norm": 0.42566415667533875, + "learning_rate": 0.0001448966064024747, + "loss": 1.3364, + "step": 21214 + }, + { + "epoch": 0.27567924917525255, + "grad_norm": 0.49124494194984436, + "learning_rate": 0.00014489400694056332, + "loss": 1.4401, + "step": 21215 + }, + { + "epoch": 0.2756922437191684, + "grad_norm": 0.3966924250125885, + "learning_rate": 0.00014489140747865194, + "loss": 1.4349, + "step": 21216 + }, + { + "epoch": 0.2757052382630843, + "grad_norm": 0.41968196630477905, + "learning_rate": 0.00014488880801674054, + "loss": 1.5184, + "step": 21217 + }, + { + "epoch": 0.27571823280700014, + "grad_norm": 0.3788725435733795, + "learning_rate": 0.00014488620855482916, + "loss": 1.4666, + "step": 21218 + }, + { + "epoch": 0.27573122735091604, + "grad_norm": 0.39863821864128113, + "learning_rate": 0.0001448836090929178, + "loss": 1.3248, + "step": 21219 + }, + { + "epoch": 0.2757442218948319, + "grad_norm": 0.3196360766887665, + "learning_rate": 0.00014488100963100638, + "loss": 1.3715, + "step": 21220 + }, + { + "epoch": 0.2757572164387478, + "grad_norm": 0.41795575618743896, + "learning_rate": 0.000144878410169095, + "loss": 1.4114, + "step": 21221 + }, + { + "epoch": 0.27577021098266363, + "grad_norm": 0.25024887919425964, + "learning_rate": 0.0001448758107071836, + "loss": 1.2667, + "step": 21222 + }, + { + "epoch": 0.27578320552657953, + "grad_norm": 0.5350170731544495, + "learning_rate": 0.00014487321124527223, + "loss": 1.6328, + "step": 21223 + }, + { + "epoch": 0.2757962000704954, + "grad_norm": 0.35319074988365173, + "learning_rate": 0.00014487061178336086, + "loss": 1.3237, + "step": 21224 + }, + { + "epoch": 0.2758091946144113, + "grad_norm": 0.38834911584854126, + "learning_rate": 0.00014486801232144945, + "loss": 1.4453, + "step": 21225 + }, + { + "epoch": 0.2758221891583271, + "grad_norm": 0.4736301898956299, + "learning_rate": 0.00014486541285953808, + "loss": 1.4106, + "step": 21226 + }, + { + "epoch": 0.275835183702243, + "grad_norm": 0.383249431848526, + "learning_rate": 0.0001448628133976267, + "loss": 1.3068, + "step": 21227 + }, + { + "epoch": 0.27584817824615887, + "grad_norm": 0.3536500334739685, + "learning_rate": 0.00014486021393571533, + "loss": 1.3202, + "step": 21228 + }, + { + "epoch": 0.27586117279007477, + "grad_norm": 0.42011725902557373, + "learning_rate": 0.00014485761447380392, + "loss": 1.3821, + "step": 21229 + }, + { + "epoch": 0.2758741673339906, + "grad_norm": 0.36506393551826477, + "learning_rate": 0.00014485501501189255, + "loss": 1.2793, + "step": 21230 + }, + { + "epoch": 0.2758871618779065, + "grad_norm": 0.34045228362083435, + "learning_rate": 0.00014485241554998117, + "loss": 1.4418, + "step": 21231 + }, + { + "epoch": 0.27590015642182236, + "grad_norm": 0.45484286546707153, + "learning_rate": 0.00014484981608806977, + "loss": 1.5592, + "step": 21232 + }, + { + "epoch": 0.27591315096573826, + "grad_norm": 0.3172948956489563, + "learning_rate": 0.0001448472166261584, + "loss": 1.3045, + "step": 21233 + }, + { + "epoch": 0.2759261455096541, + "grad_norm": 0.4191264808177948, + "learning_rate": 0.000144844617164247, + "loss": 1.3347, + "step": 21234 + }, + { + "epoch": 0.27593914005357, + "grad_norm": 0.41809552907943726, + "learning_rate": 0.00014484201770233564, + "loss": 1.3487, + "step": 21235 + }, + { + "epoch": 0.27595213459748585, + "grad_norm": 0.4197249114513397, + "learning_rate": 0.00014483941824042424, + "loss": 1.5669, + "step": 21236 + }, + { + "epoch": 0.27596512914140175, + "grad_norm": 0.41611534357070923, + "learning_rate": 0.00014483681877851284, + "loss": 1.3036, + "step": 21237 + }, + { + "epoch": 0.2759781236853176, + "grad_norm": 0.3917938470840454, + "learning_rate": 0.00014483421931660146, + "loss": 1.3676, + "step": 21238 + }, + { + "epoch": 0.2759911182292335, + "grad_norm": 0.49736669659614563, + "learning_rate": 0.0001448316198546901, + "loss": 1.5584, + "step": 21239 + }, + { + "epoch": 0.2760041127731494, + "grad_norm": 0.3698282837867737, + "learning_rate": 0.0001448290203927787, + "loss": 1.3504, + "step": 21240 + }, + { + "epoch": 0.27601710731706525, + "grad_norm": 0.4146203398704529, + "learning_rate": 0.0001448264209308673, + "loss": 1.4932, + "step": 21241 + }, + { + "epoch": 0.27603010186098115, + "grad_norm": 0.4756197929382324, + "learning_rate": 0.00014482382146895593, + "loss": 1.5046, + "step": 21242 + }, + { + "epoch": 0.276043096404897, + "grad_norm": 0.3861199617385864, + "learning_rate": 0.00014482122200704456, + "loss": 1.5186, + "step": 21243 + }, + { + "epoch": 0.2760560909488129, + "grad_norm": 0.39208948612213135, + "learning_rate": 0.00014481862254513316, + "loss": 1.3228, + "step": 21244 + }, + { + "epoch": 0.27606908549272874, + "grad_norm": 0.3501081168651581, + "learning_rate": 0.00014481602308322178, + "loss": 1.3014, + "step": 21245 + }, + { + "epoch": 0.27608208003664464, + "grad_norm": 0.5069080591201782, + "learning_rate": 0.0001448134236213104, + "loss": 1.5984, + "step": 21246 + }, + { + "epoch": 0.2760950745805605, + "grad_norm": 0.40057137608528137, + "learning_rate": 0.00014481082415939903, + "loss": 1.3721, + "step": 21247 + }, + { + "epoch": 0.2761080691244764, + "grad_norm": 0.29378098249435425, + "learning_rate": 0.00014480822469748763, + "loss": 1.377, + "step": 21248 + }, + { + "epoch": 0.27612106366839223, + "grad_norm": 0.37494799494743347, + "learning_rate": 0.00014480562523557622, + "loss": 1.4259, + "step": 21249 + }, + { + "epoch": 0.27613405821230813, + "grad_norm": 0.4854723811149597, + "learning_rate": 0.00014480302577366488, + "loss": 1.6243, + "step": 21250 + }, + { + "epoch": 0.276147052756224, + "grad_norm": 0.3956761658191681, + "learning_rate": 0.00014480042631175347, + "loss": 1.5744, + "step": 21251 + }, + { + "epoch": 0.2761600473001399, + "grad_norm": 0.4590616226196289, + "learning_rate": 0.0001447978268498421, + "loss": 1.3578, + "step": 21252 + }, + { + "epoch": 0.2761730418440557, + "grad_norm": 0.40178245306015015, + "learning_rate": 0.0001447952273879307, + "loss": 1.4293, + "step": 21253 + }, + { + "epoch": 0.2761860363879716, + "grad_norm": 0.4865666627883911, + "learning_rate": 0.00014479262792601932, + "loss": 1.5323, + "step": 21254 + }, + { + "epoch": 0.27619903093188747, + "grad_norm": 0.4642694890499115, + "learning_rate": 0.00014479002846410794, + "loss": 1.446, + "step": 21255 + }, + { + "epoch": 0.27621202547580337, + "grad_norm": 0.4002801775932312, + "learning_rate": 0.00014478742900219654, + "loss": 1.5125, + "step": 21256 + }, + { + "epoch": 0.2762250200197192, + "grad_norm": 0.3465811014175415, + "learning_rate": 0.00014478482954028517, + "loss": 1.3498, + "step": 21257 + }, + { + "epoch": 0.2762380145636351, + "grad_norm": 0.4012953042984009, + "learning_rate": 0.0001447822300783738, + "loss": 1.4418, + "step": 21258 + }, + { + "epoch": 0.27625100910755096, + "grad_norm": 0.4169375002384186, + "learning_rate": 0.00014477963061646241, + "loss": 1.343, + "step": 21259 + }, + { + "epoch": 0.27626400365146686, + "grad_norm": 0.4746510088443756, + "learning_rate": 0.000144777031154551, + "loss": 1.6866, + "step": 21260 + }, + { + "epoch": 0.2762769981953827, + "grad_norm": 0.44590604305267334, + "learning_rate": 0.0001447744316926396, + "loss": 1.5745, + "step": 21261 + }, + { + "epoch": 0.2762899927392986, + "grad_norm": 0.323533296585083, + "learning_rate": 0.00014477183223072826, + "loss": 1.5348, + "step": 21262 + }, + { + "epoch": 0.27630298728321445, + "grad_norm": 0.4145243465900421, + "learning_rate": 0.00014476923276881686, + "loss": 1.5405, + "step": 21263 + }, + { + "epoch": 0.27631598182713035, + "grad_norm": 0.38724735379219055, + "learning_rate": 0.00014476663330690548, + "loss": 1.5302, + "step": 21264 + }, + { + "epoch": 0.2763289763710462, + "grad_norm": 0.4373025596141815, + "learning_rate": 0.00014476403384499408, + "loss": 1.4764, + "step": 21265 + }, + { + "epoch": 0.2763419709149621, + "grad_norm": 0.37312254309654236, + "learning_rate": 0.0001447614343830827, + "loss": 1.3138, + "step": 21266 + }, + { + "epoch": 0.27635496545887794, + "grad_norm": 0.4806848168373108, + "learning_rate": 0.00014475883492117133, + "loss": 1.6074, + "step": 21267 + }, + { + "epoch": 0.27636796000279384, + "grad_norm": 0.41567668318748474, + "learning_rate": 0.00014475623545925993, + "loss": 1.2749, + "step": 21268 + }, + { + "epoch": 0.2763809545467097, + "grad_norm": 0.46890705823898315, + "learning_rate": 0.00014475363599734855, + "loss": 1.325, + "step": 21269 + }, + { + "epoch": 0.2763939490906256, + "grad_norm": 0.35188305377960205, + "learning_rate": 0.00014475103653543718, + "loss": 1.5179, + "step": 21270 + }, + { + "epoch": 0.27640694363454144, + "grad_norm": 0.42655396461486816, + "learning_rate": 0.0001447484370735258, + "loss": 1.4931, + "step": 21271 + }, + { + "epoch": 0.27641993817845734, + "grad_norm": 0.4551280736923218, + "learning_rate": 0.0001447458376116144, + "loss": 1.3336, + "step": 21272 + }, + { + "epoch": 0.2764329327223732, + "grad_norm": 0.4306657612323761, + "learning_rate": 0.00014474323814970302, + "loss": 1.4428, + "step": 21273 + }, + { + "epoch": 0.2764459272662891, + "grad_norm": 0.4020560085773468, + "learning_rate": 0.00014474063868779165, + "loss": 1.4625, + "step": 21274 + }, + { + "epoch": 0.2764589218102049, + "grad_norm": 0.3207226097583771, + "learning_rate": 0.00014473803922588024, + "loss": 1.3209, + "step": 21275 + }, + { + "epoch": 0.27647191635412083, + "grad_norm": 0.39836278557777405, + "learning_rate": 0.00014473543976396887, + "loss": 1.3315, + "step": 21276 + }, + { + "epoch": 0.2764849108980367, + "grad_norm": 0.4348875880241394, + "learning_rate": 0.00014473284030205747, + "loss": 1.4045, + "step": 21277 + }, + { + "epoch": 0.2764979054419526, + "grad_norm": 0.382883757352829, + "learning_rate": 0.0001447302408401461, + "loss": 1.2114, + "step": 21278 + }, + { + "epoch": 0.2765108999858684, + "grad_norm": 0.4889993965625763, + "learning_rate": 0.00014472764137823471, + "loss": 1.4111, + "step": 21279 + }, + { + "epoch": 0.2765238945297843, + "grad_norm": 0.36627429723739624, + "learning_rate": 0.0001447250419163233, + "loss": 1.2897, + "step": 21280 + }, + { + "epoch": 0.27653688907370017, + "grad_norm": 0.41897332668304443, + "learning_rate": 0.00014472244245441194, + "loss": 1.5972, + "step": 21281 + }, + { + "epoch": 0.27654988361761607, + "grad_norm": 0.3551464080810547, + "learning_rate": 0.00014471984299250056, + "loss": 1.3026, + "step": 21282 + }, + { + "epoch": 0.2765628781615319, + "grad_norm": 0.4172768294811249, + "learning_rate": 0.00014471724353058919, + "loss": 1.4142, + "step": 21283 + }, + { + "epoch": 0.2765758727054478, + "grad_norm": 0.3352348804473877, + "learning_rate": 0.00014471464406867778, + "loss": 1.3868, + "step": 21284 + }, + { + "epoch": 0.27658886724936366, + "grad_norm": 0.36794567108154297, + "learning_rate": 0.0001447120446067664, + "loss": 1.3668, + "step": 21285 + }, + { + "epoch": 0.27660186179327956, + "grad_norm": 0.42755863070487976, + "learning_rate": 0.00014470944514485503, + "loss": 1.5327, + "step": 21286 + }, + { + "epoch": 0.2766148563371954, + "grad_norm": 0.39130860567092896, + "learning_rate": 0.00014470684568294363, + "loss": 1.2986, + "step": 21287 + }, + { + "epoch": 0.2766278508811113, + "grad_norm": 0.3780863583087921, + "learning_rate": 0.00014470424622103225, + "loss": 1.3391, + "step": 21288 + }, + { + "epoch": 0.27664084542502715, + "grad_norm": 0.3552754521369934, + "learning_rate": 0.00014470164675912088, + "loss": 1.1574, + "step": 21289 + }, + { + "epoch": 0.27665383996894305, + "grad_norm": 0.41215065121650696, + "learning_rate": 0.0001446990472972095, + "loss": 1.5628, + "step": 21290 + }, + { + "epoch": 0.2766668345128589, + "grad_norm": 0.30129313468933105, + "learning_rate": 0.0001446964478352981, + "loss": 1.3785, + "step": 21291 + }, + { + "epoch": 0.2766798290567748, + "grad_norm": 0.43935802578926086, + "learning_rate": 0.0001446938483733867, + "loss": 1.4064, + "step": 21292 + }, + { + "epoch": 0.27669282360069064, + "grad_norm": 0.5928434729576111, + "learning_rate": 0.00014469124891147535, + "loss": 1.4197, + "step": 21293 + }, + { + "epoch": 0.27670581814460654, + "grad_norm": 0.3758867084980011, + "learning_rate": 0.00014468864944956395, + "loss": 1.3138, + "step": 21294 + }, + { + "epoch": 0.2767188126885224, + "grad_norm": 0.5056210160255432, + "learning_rate": 0.00014468604998765257, + "loss": 1.4802, + "step": 21295 + }, + { + "epoch": 0.2767318072324383, + "grad_norm": 0.3933154344558716, + "learning_rate": 0.00014468345052574117, + "loss": 1.3173, + "step": 21296 + }, + { + "epoch": 0.27674480177635413, + "grad_norm": 0.4187923073768616, + "learning_rate": 0.0001446808510638298, + "loss": 1.271, + "step": 21297 + }, + { + "epoch": 0.27675779632027003, + "grad_norm": 0.3093286156654358, + "learning_rate": 0.00014467825160191842, + "loss": 1.3094, + "step": 21298 + }, + { + "epoch": 0.2767707908641859, + "grad_norm": 0.41404083371162415, + "learning_rate": 0.00014467565214000701, + "loss": 1.5085, + "step": 21299 + }, + { + "epoch": 0.2767837854081018, + "grad_norm": 0.49591660499572754, + "learning_rate": 0.00014467305267809564, + "loss": 1.2991, + "step": 21300 + }, + { + "epoch": 0.2767967799520176, + "grad_norm": 0.4666811227798462, + "learning_rate": 0.00014467045321618426, + "loss": 1.5373, + "step": 21301 + }, + { + "epoch": 0.2768097744959335, + "grad_norm": 0.32571935653686523, + "learning_rate": 0.0001446678537542729, + "loss": 1.356, + "step": 21302 + }, + { + "epoch": 0.27682276903984937, + "grad_norm": 0.4207615256309509, + "learning_rate": 0.00014466525429236149, + "loss": 1.3703, + "step": 21303 + }, + { + "epoch": 0.27683576358376527, + "grad_norm": 0.36844712495803833, + "learning_rate": 0.00014466265483045008, + "loss": 1.1713, + "step": 21304 + }, + { + "epoch": 0.2768487581276811, + "grad_norm": 0.41885948181152344, + "learning_rate": 0.00014466005536853873, + "loss": 1.2881, + "step": 21305 + }, + { + "epoch": 0.276861752671597, + "grad_norm": 0.3904375433921814, + "learning_rate": 0.00014465745590662733, + "loss": 1.4986, + "step": 21306 + }, + { + "epoch": 0.27687474721551286, + "grad_norm": 0.45266616344451904, + "learning_rate": 0.00014465485644471596, + "loss": 1.3446, + "step": 21307 + }, + { + "epoch": 0.27688774175942876, + "grad_norm": 0.32533466815948486, + "learning_rate": 0.00014465225698280455, + "loss": 1.5466, + "step": 21308 + }, + { + "epoch": 0.2769007363033446, + "grad_norm": 0.3485800325870514, + "learning_rate": 0.00014464965752089318, + "loss": 1.3886, + "step": 21309 + }, + { + "epoch": 0.2769137308472605, + "grad_norm": 0.46611183881759644, + "learning_rate": 0.0001446470580589818, + "loss": 1.5632, + "step": 21310 + }, + { + "epoch": 0.27692672539117635, + "grad_norm": 0.42077791690826416, + "learning_rate": 0.0001446444585970704, + "loss": 1.4926, + "step": 21311 + }, + { + "epoch": 0.27693971993509225, + "grad_norm": 0.33902305364608765, + "learning_rate": 0.00014464185913515902, + "loss": 1.2663, + "step": 21312 + }, + { + "epoch": 0.2769527144790081, + "grad_norm": 0.3745257556438446, + "learning_rate": 0.00014463925967324765, + "loss": 1.4413, + "step": 21313 + }, + { + "epoch": 0.276965709022924, + "grad_norm": 0.44815024733543396, + "learning_rate": 0.00014463666021133627, + "loss": 1.4831, + "step": 21314 + }, + { + "epoch": 0.27697870356683985, + "grad_norm": 0.3737947940826416, + "learning_rate": 0.00014463406074942487, + "loss": 1.4351, + "step": 21315 + }, + { + "epoch": 0.27699169811075575, + "grad_norm": 0.38703247904777527, + "learning_rate": 0.00014463146128751347, + "loss": 1.3067, + "step": 21316 + }, + { + "epoch": 0.27700469265467165, + "grad_norm": 0.4382397532463074, + "learning_rate": 0.00014462886182560212, + "loss": 1.4665, + "step": 21317 + }, + { + "epoch": 0.2770176871985875, + "grad_norm": 0.381588876247406, + "learning_rate": 0.00014462626236369072, + "loss": 1.5344, + "step": 21318 + }, + { + "epoch": 0.2770306817425034, + "grad_norm": 0.4803544282913208, + "learning_rate": 0.00014462366290177934, + "loss": 1.4596, + "step": 21319 + }, + { + "epoch": 0.27704367628641924, + "grad_norm": 0.3316510021686554, + "learning_rate": 0.00014462106343986797, + "loss": 1.348, + "step": 21320 + }, + { + "epoch": 0.27705667083033514, + "grad_norm": 0.3640795648097992, + "learning_rate": 0.00014461846397795656, + "loss": 1.5092, + "step": 21321 + }, + { + "epoch": 0.277069665374251, + "grad_norm": 0.40990835428237915, + "learning_rate": 0.0001446158645160452, + "loss": 1.5255, + "step": 21322 + }, + { + "epoch": 0.2770826599181669, + "grad_norm": 0.33405521512031555, + "learning_rate": 0.00014461326505413379, + "loss": 1.4087, + "step": 21323 + }, + { + "epoch": 0.27709565446208273, + "grad_norm": 0.4905710220336914, + "learning_rate": 0.00014461066559222244, + "loss": 1.4012, + "step": 21324 + }, + { + "epoch": 0.27710864900599863, + "grad_norm": 0.5047950744628906, + "learning_rate": 0.00014460806613031103, + "loss": 1.4779, + "step": 21325 + }, + { + "epoch": 0.2771216435499145, + "grad_norm": 0.3084236979484558, + "learning_rate": 0.00014460546666839966, + "loss": 1.3, + "step": 21326 + }, + { + "epoch": 0.2771346380938304, + "grad_norm": 0.4295514225959778, + "learning_rate": 0.00014460286720648826, + "loss": 1.4081, + "step": 21327 + }, + { + "epoch": 0.2771476326377462, + "grad_norm": 0.3383934497833252, + "learning_rate": 0.00014460026774457688, + "loss": 1.4153, + "step": 21328 + }, + { + "epoch": 0.2771606271816621, + "grad_norm": 0.297966867685318, + "learning_rate": 0.0001445976682826655, + "loss": 1.402, + "step": 21329 + }, + { + "epoch": 0.27717362172557797, + "grad_norm": 0.4542658030986786, + "learning_rate": 0.0001445950688207541, + "loss": 1.5341, + "step": 21330 + }, + { + "epoch": 0.27718661626949387, + "grad_norm": 0.361622154712677, + "learning_rate": 0.00014459246935884273, + "loss": 1.2048, + "step": 21331 + }, + { + "epoch": 0.2771996108134097, + "grad_norm": 0.39191052317619324, + "learning_rate": 0.00014458986989693135, + "loss": 1.3645, + "step": 21332 + }, + { + "epoch": 0.2772126053573256, + "grad_norm": 0.34734442830085754, + "learning_rate": 0.00014458727043501995, + "loss": 1.3434, + "step": 21333 + }, + { + "epoch": 0.27722559990124146, + "grad_norm": 0.4504390358924866, + "learning_rate": 0.00014458467097310857, + "loss": 1.3666, + "step": 21334 + }, + { + "epoch": 0.27723859444515736, + "grad_norm": 0.3920741379261017, + "learning_rate": 0.00014458207151119717, + "loss": 1.5056, + "step": 21335 + }, + { + "epoch": 0.2772515889890732, + "grad_norm": 0.3147026598453522, + "learning_rate": 0.00014457947204928582, + "loss": 1.3457, + "step": 21336 + }, + { + "epoch": 0.2772645835329891, + "grad_norm": 0.38812771439552307, + "learning_rate": 0.00014457687258737442, + "loss": 1.3301, + "step": 21337 + }, + { + "epoch": 0.27727757807690495, + "grad_norm": 0.34487760066986084, + "learning_rate": 0.00014457427312546304, + "loss": 1.5975, + "step": 21338 + }, + { + "epoch": 0.27729057262082085, + "grad_norm": 0.4248666763305664, + "learning_rate": 0.00014457167366355164, + "loss": 1.3343, + "step": 21339 + }, + { + "epoch": 0.2773035671647367, + "grad_norm": 0.37901660799980164, + "learning_rate": 0.00014456907420164027, + "loss": 1.5059, + "step": 21340 + }, + { + "epoch": 0.2773165617086526, + "grad_norm": 0.41012606024742126, + "learning_rate": 0.0001445664747397289, + "loss": 1.4179, + "step": 21341 + }, + { + "epoch": 0.27732955625256844, + "grad_norm": 0.3304790258407593, + "learning_rate": 0.0001445638752778175, + "loss": 1.4316, + "step": 21342 + }, + { + "epoch": 0.27734255079648434, + "grad_norm": 0.3572097718715668, + "learning_rate": 0.0001445612758159061, + "loss": 1.3586, + "step": 21343 + }, + { + "epoch": 0.2773555453404002, + "grad_norm": 0.33713775873184204, + "learning_rate": 0.00014455867635399474, + "loss": 1.4721, + "step": 21344 + }, + { + "epoch": 0.2773685398843161, + "grad_norm": 0.40653857588768005, + "learning_rate": 0.00014455607689208333, + "loss": 1.3655, + "step": 21345 + }, + { + "epoch": 0.27738153442823194, + "grad_norm": 0.41890978813171387, + "learning_rate": 0.00014455347743017196, + "loss": 1.3493, + "step": 21346 + }, + { + "epoch": 0.27739452897214784, + "grad_norm": 0.3786761462688446, + "learning_rate": 0.00014455087796826056, + "loss": 1.1891, + "step": 21347 + }, + { + "epoch": 0.2774075235160637, + "grad_norm": 0.4871302545070648, + "learning_rate": 0.0001445482785063492, + "loss": 1.4943, + "step": 21348 + }, + { + "epoch": 0.2774205180599796, + "grad_norm": 0.40408241748809814, + "learning_rate": 0.0001445456790444378, + "loss": 1.5567, + "step": 21349 + }, + { + "epoch": 0.2774335126038954, + "grad_norm": 0.4766120910644531, + "learning_rate": 0.00014454307958252643, + "loss": 1.853, + "step": 21350 + }, + { + "epoch": 0.2774465071478113, + "grad_norm": 0.3546866476535797, + "learning_rate": 0.00014454048012061503, + "loss": 1.4671, + "step": 21351 + }, + { + "epoch": 0.2774595016917272, + "grad_norm": 0.3619031012058258, + "learning_rate": 0.00014453788065870365, + "loss": 1.3083, + "step": 21352 + }, + { + "epoch": 0.2774724962356431, + "grad_norm": 0.4772902727127075, + "learning_rate": 0.00014453528119679228, + "loss": 1.5611, + "step": 21353 + }, + { + "epoch": 0.2774854907795589, + "grad_norm": 0.3812859058380127, + "learning_rate": 0.00014453268173488087, + "loss": 1.3227, + "step": 21354 + }, + { + "epoch": 0.2774984853234748, + "grad_norm": 0.46992361545562744, + "learning_rate": 0.0001445300822729695, + "loss": 1.6204, + "step": 21355 + }, + { + "epoch": 0.27751147986739066, + "grad_norm": 0.4284652769565582, + "learning_rate": 0.00014452748281105812, + "loss": 1.3658, + "step": 21356 + }, + { + "epoch": 0.27752447441130657, + "grad_norm": 0.41886651515960693, + "learning_rate": 0.00014452488334914675, + "loss": 1.4634, + "step": 21357 + }, + { + "epoch": 0.2775374689552224, + "grad_norm": 0.41843563318252563, + "learning_rate": 0.00014452228388723534, + "loss": 1.4561, + "step": 21358 + }, + { + "epoch": 0.2775504634991383, + "grad_norm": 0.38557982444763184, + "learning_rate": 0.00014451968442532397, + "loss": 1.5675, + "step": 21359 + }, + { + "epoch": 0.27756345804305416, + "grad_norm": 0.42279666662216187, + "learning_rate": 0.0001445170849634126, + "loss": 1.456, + "step": 21360 + }, + { + "epoch": 0.27757645258697006, + "grad_norm": 0.2636811137199402, + "learning_rate": 0.0001445144855015012, + "loss": 1.4615, + "step": 21361 + }, + { + "epoch": 0.2775894471308859, + "grad_norm": 0.32072141766548157, + "learning_rate": 0.00014451188603958981, + "loss": 1.2438, + "step": 21362 + }, + { + "epoch": 0.2776024416748018, + "grad_norm": 0.43834319710731506, + "learning_rate": 0.00014450928657767844, + "loss": 1.395, + "step": 21363 + }, + { + "epoch": 0.27761543621871765, + "grad_norm": 0.40107449889183044, + "learning_rate": 0.00014450668711576704, + "loss": 1.4272, + "step": 21364 + }, + { + "epoch": 0.27762843076263355, + "grad_norm": 0.5505580306053162, + "learning_rate": 0.00014450408765385566, + "loss": 1.4463, + "step": 21365 + }, + { + "epoch": 0.2776414253065494, + "grad_norm": 0.416530966758728, + "learning_rate": 0.00014450148819194426, + "loss": 1.4453, + "step": 21366 + }, + { + "epoch": 0.2776544198504653, + "grad_norm": 0.4096967875957489, + "learning_rate": 0.0001444988887300329, + "loss": 1.6375, + "step": 21367 + }, + { + "epoch": 0.27766741439438114, + "grad_norm": 0.4949805438518524, + "learning_rate": 0.0001444962892681215, + "loss": 1.4885, + "step": 21368 + }, + { + "epoch": 0.27768040893829704, + "grad_norm": 0.4033469557762146, + "learning_rate": 0.00014449368980621013, + "loss": 1.3952, + "step": 21369 + }, + { + "epoch": 0.2776934034822129, + "grad_norm": 0.48273131251335144, + "learning_rate": 0.00014449109034429873, + "loss": 1.6422, + "step": 21370 + }, + { + "epoch": 0.2777063980261288, + "grad_norm": 0.4689628481864929, + "learning_rate": 0.00014448849088238735, + "loss": 1.5062, + "step": 21371 + }, + { + "epoch": 0.27771939257004463, + "grad_norm": 0.5330589413642883, + "learning_rate": 0.00014448589142047598, + "loss": 1.4142, + "step": 21372 + }, + { + "epoch": 0.27773238711396053, + "grad_norm": 0.41915202140808105, + "learning_rate": 0.00014448329195856458, + "loss": 1.6137, + "step": 21373 + }, + { + "epoch": 0.2777453816578764, + "grad_norm": 0.3730369806289673, + "learning_rate": 0.0001444806924966532, + "loss": 1.1628, + "step": 21374 + }, + { + "epoch": 0.2777583762017923, + "grad_norm": 0.3279553949832916, + "learning_rate": 0.00014447809303474182, + "loss": 1.2664, + "step": 21375 + }, + { + "epoch": 0.2777713707457081, + "grad_norm": 0.45137903094291687, + "learning_rate": 0.00014447549357283042, + "loss": 1.4193, + "step": 21376 + }, + { + "epoch": 0.277784365289624, + "grad_norm": 0.3433552384376526, + "learning_rate": 0.00014447289411091905, + "loss": 1.4443, + "step": 21377 + }, + { + "epoch": 0.27779735983353987, + "grad_norm": 0.3838362395763397, + "learning_rate": 0.00014447029464900764, + "loss": 1.3011, + "step": 21378 + }, + { + "epoch": 0.27781035437745577, + "grad_norm": 0.3855244517326355, + "learning_rate": 0.0001444676951870963, + "loss": 1.413, + "step": 21379 + }, + { + "epoch": 0.2778233489213716, + "grad_norm": 0.40140780806541443, + "learning_rate": 0.0001444650957251849, + "loss": 1.4696, + "step": 21380 + }, + { + "epoch": 0.2778363434652875, + "grad_norm": 0.4046088755130768, + "learning_rate": 0.00014446249626327352, + "loss": 1.5458, + "step": 21381 + }, + { + "epoch": 0.27784933800920336, + "grad_norm": 0.37515345215797424, + "learning_rate": 0.00014445989680136211, + "loss": 1.2169, + "step": 21382 + }, + { + "epoch": 0.27786233255311926, + "grad_norm": 0.43619510531425476, + "learning_rate": 0.00014445729733945074, + "loss": 1.3219, + "step": 21383 + }, + { + "epoch": 0.2778753270970351, + "grad_norm": 0.3891080915927887, + "learning_rate": 0.00014445469787753936, + "loss": 1.4813, + "step": 21384 + }, + { + "epoch": 0.277888321640951, + "grad_norm": 0.2551535665988922, + "learning_rate": 0.00014445209841562796, + "loss": 1.3341, + "step": 21385 + }, + { + "epoch": 0.27790131618486685, + "grad_norm": 0.507038414478302, + "learning_rate": 0.00014444949895371659, + "loss": 1.3524, + "step": 21386 + }, + { + "epoch": 0.27791431072878275, + "grad_norm": 0.3655197024345398, + "learning_rate": 0.0001444468994918052, + "loss": 1.388, + "step": 21387 + }, + { + "epoch": 0.2779273052726986, + "grad_norm": 0.43492814898490906, + "learning_rate": 0.0001444443000298938, + "loss": 1.273, + "step": 21388 + }, + { + "epoch": 0.2779402998166145, + "grad_norm": 0.4352020025253296, + "learning_rate": 0.00014444170056798243, + "loss": 1.33, + "step": 21389 + }, + { + "epoch": 0.27795329436053035, + "grad_norm": 0.494403600692749, + "learning_rate": 0.00014443910110607103, + "loss": 1.4167, + "step": 21390 + }, + { + "epoch": 0.27796628890444625, + "grad_norm": 0.32938453555107117, + "learning_rate": 0.00014443650164415968, + "loss": 1.4163, + "step": 21391 + }, + { + "epoch": 0.27797928344836215, + "grad_norm": 0.4741702377796173, + "learning_rate": 0.00014443390218224828, + "loss": 1.4508, + "step": 21392 + }, + { + "epoch": 0.277992277992278, + "grad_norm": 0.46993911266326904, + "learning_rate": 0.0001444313027203369, + "loss": 1.5122, + "step": 21393 + }, + { + "epoch": 0.2780052725361939, + "grad_norm": 0.3995822072029114, + "learning_rate": 0.0001444287032584255, + "loss": 1.5962, + "step": 21394 + }, + { + "epoch": 0.27801826708010974, + "grad_norm": 0.38425421714782715, + "learning_rate": 0.00014442610379651412, + "loss": 1.3499, + "step": 21395 + }, + { + "epoch": 0.27803126162402564, + "grad_norm": 0.3769197165966034, + "learning_rate": 0.00014442350433460275, + "loss": 1.3154, + "step": 21396 + }, + { + "epoch": 0.2780442561679415, + "grad_norm": 0.48159024119377136, + "learning_rate": 0.00014442090487269135, + "loss": 1.5015, + "step": 21397 + }, + { + "epoch": 0.2780572507118574, + "grad_norm": 0.3942597210407257, + "learning_rate": 0.00014441830541078, + "loss": 1.247, + "step": 21398 + }, + { + "epoch": 0.27807024525577323, + "grad_norm": 0.4537292718887329, + "learning_rate": 0.0001444157059488686, + "loss": 1.4541, + "step": 21399 + }, + { + "epoch": 0.27808323979968913, + "grad_norm": 0.4368032217025757, + "learning_rate": 0.0001444131064869572, + "loss": 1.3595, + "step": 21400 + }, + { + "epoch": 0.278096234343605, + "grad_norm": 0.37032121419906616, + "learning_rate": 0.00014441050702504582, + "loss": 1.3364, + "step": 21401 + }, + { + "epoch": 0.2781092288875209, + "grad_norm": 0.4966641068458557, + "learning_rate": 0.00014440790756313444, + "loss": 1.4888, + "step": 21402 + }, + { + "epoch": 0.2781222234314367, + "grad_norm": 0.3264123499393463, + "learning_rate": 0.00014440530810122307, + "loss": 1.4118, + "step": 21403 + }, + { + "epoch": 0.2781352179753526, + "grad_norm": 0.33183181285858154, + "learning_rate": 0.00014440270863931166, + "loss": 1.4755, + "step": 21404 + }, + { + "epoch": 0.27814821251926847, + "grad_norm": 0.4152382016181946, + "learning_rate": 0.0001444001091774003, + "loss": 1.4971, + "step": 21405 + }, + { + "epoch": 0.27816120706318437, + "grad_norm": 0.33708927035331726, + "learning_rate": 0.0001443975097154889, + "loss": 1.3765, + "step": 21406 + }, + { + "epoch": 0.2781742016071002, + "grad_norm": 0.43458428978919983, + "learning_rate": 0.0001443949102535775, + "loss": 1.3403, + "step": 21407 + }, + { + "epoch": 0.2781871961510161, + "grad_norm": 0.38059690594673157, + "learning_rate": 0.00014439231079166613, + "loss": 1.4151, + "step": 21408 + }, + { + "epoch": 0.27820019069493196, + "grad_norm": 0.3961121737957001, + "learning_rate": 0.00014438971132975473, + "loss": 1.3189, + "step": 21409 + }, + { + "epoch": 0.27821318523884786, + "grad_norm": 0.3854229152202606, + "learning_rate": 0.00014438711186784338, + "loss": 1.2997, + "step": 21410 + }, + { + "epoch": 0.2782261797827637, + "grad_norm": 0.3332661986351013, + "learning_rate": 0.00014438451240593198, + "loss": 1.221, + "step": 21411 + }, + { + "epoch": 0.2782391743266796, + "grad_norm": 0.5238799452781677, + "learning_rate": 0.0001443819129440206, + "loss": 1.4279, + "step": 21412 + }, + { + "epoch": 0.27825216887059545, + "grad_norm": 0.4394322633743286, + "learning_rate": 0.0001443793134821092, + "loss": 1.3998, + "step": 21413 + }, + { + "epoch": 0.27826516341451135, + "grad_norm": 0.44008713960647583, + "learning_rate": 0.00014437671402019783, + "loss": 1.6128, + "step": 21414 + }, + { + "epoch": 0.2782781579584272, + "grad_norm": 0.39046064019203186, + "learning_rate": 0.00014437411455828645, + "loss": 1.5418, + "step": 21415 + }, + { + "epoch": 0.2782911525023431, + "grad_norm": 0.4449542760848999, + "learning_rate": 0.00014437151509637505, + "loss": 1.5318, + "step": 21416 + }, + { + "epoch": 0.27830414704625894, + "grad_norm": 0.5111715197563171, + "learning_rate": 0.00014436891563446367, + "loss": 1.5035, + "step": 21417 + }, + { + "epoch": 0.27831714159017484, + "grad_norm": 0.4284152388572693, + "learning_rate": 0.0001443663161725523, + "loss": 1.3336, + "step": 21418 + }, + { + "epoch": 0.2783301361340907, + "grad_norm": 0.384360134601593, + "learning_rate": 0.0001443637167106409, + "loss": 1.2077, + "step": 21419 + }, + { + "epoch": 0.2783431306780066, + "grad_norm": 0.3995853662490845, + "learning_rate": 0.00014436111724872952, + "loss": 1.452, + "step": 21420 + }, + { + "epoch": 0.27835612522192243, + "grad_norm": 0.4239235520362854, + "learning_rate": 0.00014435851778681812, + "loss": 1.5758, + "step": 21421 + }, + { + "epoch": 0.27836911976583834, + "grad_norm": 0.3954913914203644, + "learning_rate": 0.00014435591832490677, + "loss": 1.6059, + "step": 21422 + }, + { + "epoch": 0.2783821143097542, + "grad_norm": 0.42043933272361755, + "learning_rate": 0.00014435331886299537, + "loss": 1.5903, + "step": 21423 + }, + { + "epoch": 0.2783951088536701, + "grad_norm": 0.4186195433139801, + "learning_rate": 0.000144350719401084, + "loss": 1.3663, + "step": 21424 + }, + { + "epoch": 0.2784081033975859, + "grad_norm": 0.469321608543396, + "learning_rate": 0.0001443481199391726, + "loss": 1.4055, + "step": 21425 + }, + { + "epoch": 0.2784210979415018, + "grad_norm": 0.4266015887260437, + "learning_rate": 0.0001443455204772612, + "loss": 1.4899, + "step": 21426 + }, + { + "epoch": 0.2784340924854177, + "grad_norm": 0.48333337903022766, + "learning_rate": 0.00014434292101534984, + "loss": 1.4089, + "step": 21427 + }, + { + "epoch": 0.2784470870293336, + "grad_norm": 0.3953308165073395, + "learning_rate": 0.00014434032155343843, + "loss": 1.5657, + "step": 21428 + }, + { + "epoch": 0.2784600815732494, + "grad_norm": 0.33619922399520874, + "learning_rate": 0.00014433772209152706, + "loss": 1.2991, + "step": 21429 + }, + { + "epoch": 0.2784730761171653, + "grad_norm": 0.36046817898750305, + "learning_rate": 0.00014433512262961568, + "loss": 1.192, + "step": 21430 + }, + { + "epoch": 0.27848607066108116, + "grad_norm": 0.456960529088974, + "learning_rate": 0.00014433252316770428, + "loss": 1.3118, + "step": 21431 + }, + { + "epoch": 0.27849906520499706, + "grad_norm": 0.49748027324676514, + "learning_rate": 0.0001443299237057929, + "loss": 1.4176, + "step": 21432 + }, + { + "epoch": 0.2785120597489129, + "grad_norm": 0.43698936700820923, + "learning_rate": 0.00014432732424388153, + "loss": 1.2421, + "step": 21433 + }, + { + "epoch": 0.2785250542928288, + "grad_norm": 0.4454978406429291, + "learning_rate": 0.00014432472478197015, + "loss": 1.3019, + "step": 21434 + }, + { + "epoch": 0.27853804883674466, + "grad_norm": 0.44687652587890625, + "learning_rate": 0.00014432212532005875, + "loss": 1.5089, + "step": 21435 + }, + { + "epoch": 0.27855104338066056, + "grad_norm": 0.5089172124862671, + "learning_rate": 0.00014431952585814738, + "loss": 1.2937, + "step": 21436 + }, + { + "epoch": 0.2785640379245764, + "grad_norm": 0.4406977891921997, + "learning_rate": 0.000144316926396236, + "loss": 1.3908, + "step": 21437 + }, + { + "epoch": 0.2785770324684923, + "grad_norm": 0.48522382974624634, + "learning_rate": 0.0001443143269343246, + "loss": 1.4225, + "step": 21438 + }, + { + "epoch": 0.27859002701240815, + "grad_norm": 0.33807873725891113, + "learning_rate": 0.00014431172747241322, + "loss": 1.406, + "step": 21439 + }, + { + "epoch": 0.27860302155632405, + "grad_norm": 0.5246123671531677, + "learning_rate": 0.00014430912801050182, + "loss": 1.5528, + "step": 21440 + }, + { + "epoch": 0.2786160161002399, + "grad_norm": 0.590869128704071, + "learning_rate": 0.00014430652854859047, + "loss": 1.5029, + "step": 21441 + }, + { + "epoch": 0.2786290106441558, + "grad_norm": 0.3561863899230957, + "learning_rate": 0.00014430392908667907, + "loss": 1.4338, + "step": 21442 + }, + { + "epoch": 0.27864200518807164, + "grad_norm": 0.4269315302371979, + "learning_rate": 0.00014430132962476767, + "loss": 1.3754, + "step": 21443 + }, + { + "epoch": 0.27865499973198754, + "grad_norm": 0.4255005717277527, + "learning_rate": 0.0001442987301628563, + "loss": 1.2201, + "step": 21444 + }, + { + "epoch": 0.2786679942759034, + "grad_norm": 0.4089432954788208, + "learning_rate": 0.00014429613070094492, + "loss": 1.4634, + "step": 21445 + }, + { + "epoch": 0.2786809888198193, + "grad_norm": 0.3521561920642853, + "learning_rate": 0.00014429353123903354, + "loss": 1.1944, + "step": 21446 + }, + { + "epoch": 0.27869398336373513, + "grad_norm": 0.33145517110824585, + "learning_rate": 0.00014429093177712214, + "loss": 1.2196, + "step": 21447 + }, + { + "epoch": 0.27870697790765103, + "grad_norm": 0.3241947293281555, + "learning_rate": 0.00014428833231521076, + "loss": 1.3463, + "step": 21448 + }, + { + "epoch": 0.2787199724515669, + "grad_norm": 0.46330612897872925, + "learning_rate": 0.00014428573285329939, + "loss": 1.3855, + "step": 21449 + }, + { + "epoch": 0.2787329669954828, + "grad_norm": 0.43466782569885254, + "learning_rate": 0.00014428313339138798, + "loss": 1.6249, + "step": 21450 + }, + { + "epoch": 0.2787459615393986, + "grad_norm": 0.41936221718788147, + "learning_rate": 0.0001442805339294766, + "loss": 1.3844, + "step": 21451 + }, + { + "epoch": 0.2787589560833145, + "grad_norm": 0.5308575630187988, + "learning_rate": 0.0001442779344675652, + "loss": 1.4612, + "step": 21452 + }, + { + "epoch": 0.27877195062723037, + "grad_norm": 0.3371976315975189, + "learning_rate": 0.00014427533500565386, + "loss": 1.5368, + "step": 21453 + }, + { + "epoch": 0.27878494517114627, + "grad_norm": 0.4401633143424988, + "learning_rate": 0.00014427273554374245, + "loss": 1.5234, + "step": 21454 + }, + { + "epoch": 0.2787979397150621, + "grad_norm": 0.356117844581604, + "learning_rate": 0.00014427013608183105, + "loss": 1.5649, + "step": 21455 + }, + { + "epoch": 0.278810934258978, + "grad_norm": 0.40017110109329224, + "learning_rate": 0.00014426753661991968, + "loss": 1.3235, + "step": 21456 + }, + { + "epoch": 0.27882392880289386, + "grad_norm": 0.4322781562805176, + "learning_rate": 0.0001442649371580083, + "loss": 1.3192, + "step": 21457 + }, + { + "epoch": 0.27883692334680976, + "grad_norm": 0.3859092593193054, + "learning_rate": 0.00014426233769609692, + "loss": 1.3756, + "step": 21458 + }, + { + "epoch": 0.2788499178907256, + "grad_norm": 0.39356598258018494, + "learning_rate": 0.00014425973823418552, + "loss": 1.3102, + "step": 21459 + }, + { + "epoch": 0.2788629124346415, + "grad_norm": 0.48195910453796387, + "learning_rate": 0.00014425713877227415, + "loss": 1.4914, + "step": 21460 + }, + { + "epoch": 0.27887590697855735, + "grad_norm": 0.37488916516304016, + "learning_rate": 0.00014425453931036277, + "loss": 1.4812, + "step": 21461 + }, + { + "epoch": 0.27888890152247325, + "grad_norm": 0.2964920103549957, + "learning_rate": 0.00014425193984845137, + "loss": 1.4049, + "step": 21462 + }, + { + "epoch": 0.2789018960663891, + "grad_norm": 0.34389278292655945, + "learning_rate": 0.00014424934038654, + "loss": 1.3698, + "step": 21463 + }, + { + "epoch": 0.278914890610305, + "grad_norm": 0.4370003640651703, + "learning_rate": 0.0001442467409246286, + "loss": 1.4366, + "step": 21464 + }, + { + "epoch": 0.27892788515422084, + "grad_norm": 0.4175202250480652, + "learning_rate": 0.00014424414146271724, + "loss": 1.3644, + "step": 21465 + }, + { + "epoch": 0.27894087969813675, + "grad_norm": 0.3859788477420807, + "learning_rate": 0.00014424154200080584, + "loss": 1.5084, + "step": 21466 + }, + { + "epoch": 0.2789538742420526, + "grad_norm": 0.2370905876159668, + "learning_rate": 0.00014423894253889444, + "loss": 1.4443, + "step": 21467 + }, + { + "epoch": 0.2789668687859685, + "grad_norm": 0.32241150736808777, + "learning_rate": 0.00014423634307698306, + "loss": 1.4162, + "step": 21468 + }, + { + "epoch": 0.2789798633298844, + "grad_norm": 0.35621413588523865, + "learning_rate": 0.00014423374361507169, + "loss": 1.1357, + "step": 21469 + }, + { + "epoch": 0.27899285787380024, + "grad_norm": 0.40914133191108704, + "learning_rate": 0.0001442311441531603, + "loss": 1.3368, + "step": 21470 + }, + { + "epoch": 0.27900585241771614, + "grad_norm": 0.45546963810920715, + "learning_rate": 0.0001442285446912489, + "loss": 1.4786, + "step": 21471 + }, + { + "epoch": 0.279018846961632, + "grad_norm": 0.4365185499191284, + "learning_rate": 0.00014422594522933753, + "loss": 1.5744, + "step": 21472 + }, + { + "epoch": 0.2790318415055479, + "grad_norm": 0.43023788928985596, + "learning_rate": 0.00014422334576742616, + "loss": 1.5829, + "step": 21473 + }, + { + "epoch": 0.27904483604946373, + "grad_norm": 0.45840564370155334, + "learning_rate": 0.00014422074630551475, + "loss": 1.3198, + "step": 21474 + }, + { + "epoch": 0.27905783059337963, + "grad_norm": 0.5318276882171631, + "learning_rate": 0.00014421814684360338, + "loss": 1.4954, + "step": 21475 + }, + { + "epoch": 0.2790708251372955, + "grad_norm": 0.37725311517715454, + "learning_rate": 0.000144215547381692, + "loss": 1.4431, + "step": 21476 + }, + { + "epoch": 0.2790838196812114, + "grad_norm": 0.35171905159950256, + "learning_rate": 0.00014421294791978063, + "loss": 1.2492, + "step": 21477 + }, + { + "epoch": 0.2790968142251272, + "grad_norm": 0.5016863346099854, + "learning_rate": 0.00014421034845786922, + "loss": 1.3518, + "step": 21478 + }, + { + "epoch": 0.2791098087690431, + "grad_norm": 0.3473861515522003, + "learning_rate": 0.00014420774899595785, + "loss": 1.5334, + "step": 21479 + }, + { + "epoch": 0.27912280331295897, + "grad_norm": 0.39675071835517883, + "learning_rate": 0.00014420514953404647, + "loss": 1.423, + "step": 21480 + }, + { + "epoch": 0.27913579785687487, + "grad_norm": 0.3445226550102234, + "learning_rate": 0.00014420255007213507, + "loss": 1.5077, + "step": 21481 + }, + { + "epoch": 0.2791487924007907, + "grad_norm": 0.38240480422973633, + "learning_rate": 0.0001441999506102237, + "loss": 1.3547, + "step": 21482 + }, + { + "epoch": 0.2791617869447066, + "grad_norm": 0.42996129393577576, + "learning_rate": 0.0001441973511483123, + "loss": 1.6199, + "step": 21483 + }, + { + "epoch": 0.27917478148862246, + "grad_norm": 0.3516026437282562, + "learning_rate": 0.00014419475168640092, + "loss": 1.3657, + "step": 21484 + }, + { + "epoch": 0.27918777603253836, + "grad_norm": 0.39256930351257324, + "learning_rate": 0.00014419215222448954, + "loss": 1.2887, + "step": 21485 + }, + { + "epoch": 0.2792007705764542, + "grad_norm": 0.3857741951942444, + "learning_rate": 0.00014418955276257814, + "loss": 1.3836, + "step": 21486 + }, + { + "epoch": 0.2792137651203701, + "grad_norm": 0.4462112784385681, + "learning_rate": 0.00014418695330066676, + "loss": 1.3532, + "step": 21487 + }, + { + "epoch": 0.27922675966428595, + "grad_norm": 0.42959532141685486, + "learning_rate": 0.0001441843538387554, + "loss": 1.4041, + "step": 21488 + }, + { + "epoch": 0.27923975420820185, + "grad_norm": 0.36009982228279114, + "learning_rate": 0.000144181754376844, + "loss": 1.4038, + "step": 21489 + }, + { + "epoch": 0.2792527487521177, + "grad_norm": 0.35454922914505005, + "learning_rate": 0.0001441791549149326, + "loss": 1.3783, + "step": 21490 + }, + { + "epoch": 0.2792657432960336, + "grad_norm": 0.4293123781681061, + "learning_rate": 0.00014417655545302123, + "loss": 1.464, + "step": 21491 + }, + { + "epoch": 0.27927873783994944, + "grad_norm": 0.44025054574012756, + "learning_rate": 0.00014417395599110986, + "loss": 1.32, + "step": 21492 + }, + { + "epoch": 0.27929173238386534, + "grad_norm": 0.3445517122745514, + "learning_rate": 0.00014417135652919846, + "loss": 1.4833, + "step": 21493 + }, + { + "epoch": 0.2793047269277812, + "grad_norm": 0.43521058559417725, + "learning_rate": 0.00014416875706728708, + "loss": 1.3956, + "step": 21494 + }, + { + "epoch": 0.2793177214716971, + "grad_norm": 0.45968952775001526, + "learning_rate": 0.00014416615760537568, + "loss": 1.3773, + "step": 21495 + }, + { + "epoch": 0.27933071601561293, + "grad_norm": 0.3523813486099243, + "learning_rate": 0.00014416355814346433, + "loss": 1.3612, + "step": 21496 + }, + { + "epoch": 0.27934371055952884, + "grad_norm": 0.3348783254623413, + "learning_rate": 0.00014416095868155293, + "loss": 1.3667, + "step": 21497 + }, + { + "epoch": 0.2793567051034447, + "grad_norm": 0.4334190785884857, + "learning_rate": 0.00014415835921964152, + "loss": 1.4579, + "step": 21498 + }, + { + "epoch": 0.2793696996473606, + "grad_norm": 0.34385475516319275, + "learning_rate": 0.00014415575975773015, + "loss": 1.2887, + "step": 21499 + }, + { + "epoch": 0.2793826941912764, + "grad_norm": 0.2839030921459198, + "learning_rate": 0.00014415316029581877, + "loss": 1.1337, + "step": 21500 + }, + { + "epoch": 0.2793956887351923, + "grad_norm": 0.383198082447052, + "learning_rate": 0.0001441505608339074, + "loss": 1.2243, + "step": 21501 + }, + { + "epoch": 0.27940868327910817, + "grad_norm": 0.39565548300743103, + "learning_rate": 0.000144147961371996, + "loss": 1.194, + "step": 21502 + }, + { + "epoch": 0.2794216778230241, + "grad_norm": 0.4241720139980316, + "learning_rate": 0.00014414536191008462, + "loss": 1.2956, + "step": 21503 + }, + { + "epoch": 0.2794346723669399, + "grad_norm": 0.35848429799079895, + "learning_rate": 0.00014414276244817324, + "loss": 1.5074, + "step": 21504 + }, + { + "epoch": 0.2794476669108558, + "grad_norm": 0.5230528712272644, + "learning_rate": 0.00014414016298626184, + "loss": 1.5408, + "step": 21505 + }, + { + "epoch": 0.27946066145477166, + "grad_norm": 0.34090229868888855, + "learning_rate": 0.00014413756352435047, + "loss": 1.4492, + "step": 21506 + }, + { + "epoch": 0.27947365599868756, + "grad_norm": 0.4034985899925232, + "learning_rate": 0.0001441349640624391, + "loss": 1.427, + "step": 21507 + }, + { + "epoch": 0.2794866505426034, + "grad_norm": 0.35721829533576965, + "learning_rate": 0.00014413236460052772, + "loss": 1.3985, + "step": 21508 + }, + { + "epoch": 0.2794996450865193, + "grad_norm": 0.44309142231941223, + "learning_rate": 0.0001441297651386163, + "loss": 1.2786, + "step": 21509 + }, + { + "epoch": 0.27951263963043516, + "grad_norm": 0.44928547739982605, + "learning_rate": 0.0001441271656767049, + "loss": 1.5405, + "step": 21510 + }, + { + "epoch": 0.27952563417435106, + "grad_norm": 0.38292816281318665, + "learning_rate": 0.00014412456621479356, + "loss": 1.3209, + "step": 21511 + }, + { + "epoch": 0.2795386287182669, + "grad_norm": 0.28443366289138794, + "learning_rate": 0.00014412196675288216, + "loss": 1.3041, + "step": 21512 + }, + { + "epoch": 0.2795516232621828, + "grad_norm": 0.4442163109779358, + "learning_rate": 0.00014411936729097078, + "loss": 1.4933, + "step": 21513 + }, + { + "epoch": 0.27956461780609865, + "grad_norm": 0.4082050025463104, + "learning_rate": 0.00014411676782905938, + "loss": 1.4507, + "step": 21514 + }, + { + "epoch": 0.27957761235001455, + "grad_norm": 0.3807099163532257, + "learning_rate": 0.000144114168367148, + "loss": 1.4412, + "step": 21515 + }, + { + "epoch": 0.2795906068939304, + "grad_norm": 0.3784346878528595, + "learning_rate": 0.00014411156890523663, + "loss": 1.3442, + "step": 21516 + }, + { + "epoch": 0.2796036014378463, + "grad_norm": 0.43854638934135437, + "learning_rate": 0.00014410896944332523, + "loss": 1.468, + "step": 21517 + }, + { + "epoch": 0.27961659598176214, + "grad_norm": 0.5125160813331604, + "learning_rate": 0.00014410636998141385, + "loss": 1.2961, + "step": 21518 + }, + { + "epoch": 0.27962959052567804, + "grad_norm": 0.3407018184661865, + "learning_rate": 0.00014410377051950248, + "loss": 1.463, + "step": 21519 + }, + { + "epoch": 0.2796425850695939, + "grad_norm": 0.46935951709747314, + "learning_rate": 0.0001441011710575911, + "loss": 1.4124, + "step": 21520 + }, + { + "epoch": 0.2796555796135098, + "grad_norm": 0.41228967905044556, + "learning_rate": 0.0001440985715956797, + "loss": 1.2428, + "step": 21521 + }, + { + "epoch": 0.27966857415742563, + "grad_norm": 0.3404655158519745, + "learning_rate": 0.0001440959721337683, + "loss": 1.4957, + "step": 21522 + }, + { + "epoch": 0.27968156870134153, + "grad_norm": 0.4039502739906311, + "learning_rate": 0.00014409337267185695, + "loss": 1.408, + "step": 21523 + }, + { + "epoch": 0.2796945632452574, + "grad_norm": 0.4064100980758667, + "learning_rate": 0.00014409077320994554, + "loss": 1.5325, + "step": 21524 + }, + { + "epoch": 0.2797075577891733, + "grad_norm": 0.41681787371635437, + "learning_rate": 0.00014408817374803417, + "loss": 1.4673, + "step": 21525 + }, + { + "epoch": 0.2797205523330891, + "grad_norm": 0.4816579520702362, + "learning_rate": 0.00014408557428612277, + "loss": 1.4488, + "step": 21526 + }, + { + "epoch": 0.279733546877005, + "grad_norm": 0.5031188130378723, + "learning_rate": 0.0001440829748242114, + "loss": 1.5229, + "step": 21527 + }, + { + "epoch": 0.27974654142092087, + "grad_norm": 0.36926984786987305, + "learning_rate": 0.00014408037536230002, + "loss": 1.5312, + "step": 21528 + }, + { + "epoch": 0.27975953596483677, + "grad_norm": 0.43403682112693787, + "learning_rate": 0.0001440777759003886, + "loss": 1.4893, + "step": 21529 + }, + { + "epoch": 0.2797725305087526, + "grad_norm": 0.4647897779941559, + "learning_rate": 0.00014407517643847724, + "loss": 1.3366, + "step": 21530 + }, + { + "epoch": 0.2797855250526685, + "grad_norm": 0.38596251606941223, + "learning_rate": 0.00014407257697656586, + "loss": 1.1498, + "step": 21531 + }, + { + "epoch": 0.27979851959658436, + "grad_norm": 0.2629759609699249, + "learning_rate": 0.00014406997751465449, + "loss": 1.3319, + "step": 21532 + }, + { + "epoch": 0.27981151414050026, + "grad_norm": 0.39126014709472656, + "learning_rate": 0.00014406737805274308, + "loss": 1.2858, + "step": 21533 + }, + { + "epoch": 0.2798245086844161, + "grad_norm": 0.3864976167678833, + "learning_rate": 0.0001440647785908317, + "loss": 1.349, + "step": 21534 + }, + { + "epoch": 0.279837503228332, + "grad_norm": 0.42169812321662903, + "learning_rate": 0.00014406217912892033, + "loss": 1.4357, + "step": 21535 + }, + { + "epoch": 0.27985049777224785, + "grad_norm": 0.41296932101249695, + "learning_rate": 0.00014405957966700893, + "loss": 1.536, + "step": 21536 + }, + { + "epoch": 0.27986349231616375, + "grad_norm": 0.40680354833602905, + "learning_rate": 0.00014405698020509755, + "loss": 1.2904, + "step": 21537 + }, + { + "epoch": 0.2798764868600796, + "grad_norm": 0.5047711730003357, + "learning_rate": 0.00014405438074318615, + "loss": 1.49, + "step": 21538 + }, + { + "epoch": 0.2798894814039955, + "grad_norm": 0.46680948138237, + "learning_rate": 0.00014405178128127478, + "loss": 1.4756, + "step": 21539 + }, + { + "epoch": 0.27990247594791134, + "grad_norm": 0.3409785032272339, + "learning_rate": 0.0001440491818193634, + "loss": 1.4024, + "step": 21540 + }, + { + "epoch": 0.27991547049182725, + "grad_norm": 0.42886868119239807, + "learning_rate": 0.000144046582357452, + "loss": 1.5185, + "step": 21541 + }, + { + "epoch": 0.2799284650357431, + "grad_norm": 0.4366057217121124, + "learning_rate": 0.00014404398289554062, + "loss": 1.4912, + "step": 21542 + }, + { + "epoch": 0.279941459579659, + "grad_norm": 0.3587425947189331, + "learning_rate": 0.00014404138343362925, + "loss": 1.4523, + "step": 21543 + }, + { + "epoch": 0.27995445412357484, + "grad_norm": 0.3492141664028168, + "learning_rate": 0.00014403878397171787, + "loss": 1.2518, + "step": 21544 + }, + { + "epoch": 0.27996744866749074, + "grad_norm": 0.298484206199646, + "learning_rate": 0.00014403618450980647, + "loss": 1.3221, + "step": 21545 + }, + { + "epoch": 0.27998044321140664, + "grad_norm": 0.3970220386981964, + "learning_rate": 0.0001440335850478951, + "loss": 1.4704, + "step": 21546 + }, + { + "epoch": 0.2799934377553225, + "grad_norm": 0.40950071811676025, + "learning_rate": 0.00014403098558598372, + "loss": 1.5809, + "step": 21547 + }, + { + "epoch": 0.2800064322992384, + "grad_norm": 0.46853384375572205, + "learning_rate": 0.00014402838612407232, + "loss": 1.4397, + "step": 21548 + }, + { + "epoch": 0.28001942684315423, + "grad_norm": 0.4517922103404999, + "learning_rate": 0.00014402578666216094, + "loss": 1.3388, + "step": 21549 + }, + { + "epoch": 0.28003242138707013, + "grad_norm": 0.36487752199172974, + "learning_rate": 0.00014402318720024956, + "loss": 1.3091, + "step": 21550 + }, + { + "epoch": 0.280045415930986, + "grad_norm": 0.4536622166633606, + "learning_rate": 0.00014402058773833816, + "loss": 1.4841, + "step": 21551 + }, + { + "epoch": 0.2800584104749019, + "grad_norm": 0.4552665650844574, + "learning_rate": 0.00014401798827642679, + "loss": 1.4781, + "step": 21552 + }, + { + "epoch": 0.2800714050188177, + "grad_norm": 0.4001098871231079, + "learning_rate": 0.00014401538881451538, + "loss": 1.3086, + "step": 21553 + }, + { + "epoch": 0.2800843995627336, + "grad_norm": 0.4407338500022888, + "learning_rate": 0.00014401278935260404, + "loss": 1.4137, + "step": 21554 + }, + { + "epoch": 0.28009739410664947, + "grad_norm": 0.4241715967655182, + "learning_rate": 0.00014401018989069263, + "loss": 1.5933, + "step": 21555 + }, + { + "epoch": 0.28011038865056537, + "grad_norm": 0.45190882682800293, + "learning_rate": 0.00014400759042878126, + "loss": 1.3828, + "step": 21556 + }, + { + "epoch": 0.2801233831944812, + "grad_norm": 0.42788925766944885, + "learning_rate": 0.00014400499096686985, + "loss": 1.3792, + "step": 21557 + }, + { + "epoch": 0.2801363777383971, + "grad_norm": 0.347611665725708, + "learning_rate": 0.00014400239150495848, + "loss": 1.614, + "step": 21558 + }, + { + "epoch": 0.28014937228231296, + "grad_norm": 0.5511483550071716, + "learning_rate": 0.0001439997920430471, + "loss": 1.3305, + "step": 21559 + }, + { + "epoch": 0.28016236682622886, + "grad_norm": 0.4634404480457306, + "learning_rate": 0.0001439971925811357, + "loss": 1.4258, + "step": 21560 + }, + { + "epoch": 0.2801753613701447, + "grad_norm": 0.42640987038612366, + "learning_rate": 0.00014399459311922433, + "loss": 1.3595, + "step": 21561 + }, + { + "epoch": 0.2801883559140606, + "grad_norm": 0.35330328345298767, + "learning_rate": 0.00014399199365731295, + "loss": 1.2516, + "step": 21562 + }, + { + "epoch": 0.28020135045797645, + "grad_norm": 0.4085612893104553, + "learning_rate": 0.00014398939419540157, + "loss": 1.4904, + "step": 21563 + }, + { + "epoch": 0.28021434500189235, + "grad_norm": 0.40327513217926025, + "learning_rate": 0.00014398679473349017, + "loss": 1.3826, + "step": 21564 + }, + { + "epoch": 0.2802273395458082, + "grad_norm": 0.4792408049106598, + "learning_rate": 0.00014398419527157877, + "loss": 1.5326, + "step": 21565 + }, + { + "epoch": 0.2802403340897241, + "grad_norm": 0.3872530162334442, + "learning_rate": 0.00014398159580966742, + "loss": 1.2945, + "step": 21566 + }, + { + "epoch": 0.28025332863363994, + "grad_norm": 0.3294652998447418, + "learning_rate": 0.00014397899634775602, + "loss": 1.3299, + "step": 21567 + }, + { + "epoch": 0.28026632317755584, + "grad_norm": 0.3749648630619049, + "learning_rate": 0.00014397639688584464, + "loss": 1.3798, + "step": 21568 + }, + { + "epoch": 0.2802793177214717, + "grad_norm": 0.39000073075294495, + "learning_rate": 0.00014397379742393324, + "loss": 1.5696, + "step": 21569 + }, + { + "epoch": 0.2802923122653876, + "grad_norm": 0.31261932849884033, + "learning_rate": 0.00014397119796202186, + "loss": 1.3209, + "step": 21570 + }, + { + "epoch": 0.28030530680930343, + "grad_norm": 0.3025438189506531, + "learning_rate": 0.0001439685985001105, + "loss": 1.2002, + "step": 21571 + }, + { + "epoch": 0.28031830135321933, + "grad_norm": 0.3718826472759247, + "learning_rate": 0.00014396599903819909, + "loss": 1.4044, + "step": 21572 + }, + { + "epoch": 0.2803312958971352, + "grad_norm": 0.3508923649787903, + "learning_rate": 0.0001439633995762877, + "loss": 1.341, + "step": 21573 + }, + { + "epoch": 0.2803442904410511, + "grad_norm": 0.3179311454296112, + "learning_rate": 0.00014396080011437634, + "loss": 1.369, + "step": 21574 + }, + { + "epoch": 0.2803572849849669, + "grad_norm": 0.3752168118953705, + "learning_rate": 0.00014395820065246496, + "loss": 1.3316, + "step": 21575 + }, + { + "epoch": 0.2803702795288828, + "grad_norm": 0.4096318781375885, + "learning_rate": 0.00014395560119055356, + "loss": 1.3735, + "step": 21576 + }, + { + "epoch": 0.28038327407279867, + "grad_norm": 0.3741842210292816, + "learning_rate": 0.00014395300172864215, + "loss": 1.4975, + "step": 21577 + }, + { + "epoch": 0.2803962686167146, + "grad_norm": 0.3936016857624054, + "learning_rate": 0.0001439504022667308, + "loss": 1.378, + "step": 21578 + }, + { + "epoch": 0.2804092631606304, + "grad_norm": 0.48750174045562744, + "learning_rate": 0.0001439478028048194, + "loss": 1.6248, + "step": 21579 + }, + { + "epoch": 0.2804222577045463, + "grad_norm": 0.37635859847068787, + "learning_rate": 0.00014394520334290803, + "loss": 1.1349, + "step": 21580 + }, + { + "epoch": 0.28043525224846216, + "grad_norm": 0.33232641220092773, + "learning_rate": 0.00014394260388099665, + "loss": 1.3849, + "step": 21581 + }, + { + "epoch": 0.28044824679237806, + "grad_norm": 0.39684557914733887, + "learning_rate": 0.00014394000441908525, + "loss": 1.3981, + "step": 21582 + }, + { + "epoch": 0.2804612413362939, + "grad_norm": 0.4057500958442688, + "learning_rate": 0.00014393740495717387, + "loss": 1.5069, + "step": 21583 + }, + { + "epoch": 0.2804742358802098, + "grad_norm": 0.41019001603126526, + "learning_rate": 0.00014393480549526247, + "loss": 1.5021, + "step": 21584 + }, + { + "epoch": 0.28048723042412566, + "grad_norm": 0.48081421852111816, + "learning_rate": 0.00014393220603335112, + "loss": 1.5573, + "step": 21585 + }, + { + "epoch": 0.28050022496804156, + "grad_norm": 0.38263487815856934, + "learning_rate": 0.00014392960657143972, + "loss": 1.3049, + "step": 21586 + }, + { + "epoch": 0.2805132195119574, + "grad_norm": 0.46623101830482483, + "learning_rate": 0.00014392700710952834, + "loss": 1.3493, + "step": 21587 + }, + { + "epoch": 0.2805262140558733, + "grad_norm": 0.4086287021636963, + "learning_rate": 0.00014392440764761694, + "loss": 1.3976, + "step": 21588 + }, + { + "epoch": 0.28053920859978915, + "grad_norm": 0.3839130401611328, + "learning_rate": 0.00014392180818570557, + "loss": 1.2677, + "step": 21589 + }, + { + "epoch": 0.28055220314370505, + "grad_norm": 0.5059996843338013, + "learning_rate": 0.0001439192087237942, + "loss": 1.5403, + "step": 21590 + }, + { + "epoch": 0.2805651976876209, + "grad_norm": 0.35892558097839355, + "learning_rate": 0.0001439166092618828, + "loss": 1.4145, + "step": 21591 + }, + { + "epoch": 0.2805781922315368, + "grad_norm": 0.4133143723011017, + "learning_rate": 0.0001439140097999714, + "loss": 1.5327, + "step": 21592 + }, + { + "epoch": 0.28059118677545264, + "grad_norm": 0.3344416916370392, + "learning_rate": 0.00014391141033806004, + "loss": 1.4399, + "step": 21593 + }, + { + "epoch": 0.28060418131936854, + "grad_norm": 0.2513716518878937, + "learning_rate": 0.00014390881087614864, + "loss": 1.3628, + "step": 21594 + }, + { + "epoch": 0.2806171758632844, + "grad_norm": 0.4003777801990509, + "learning_rate": 0.00014390621141423726, + "loss": 1.2635, + "step": 21595 + }, + { + "epoch": 0.2806301704072003, + "grad_norm": 0.3850751221179962, + "learning_rate": 0.00014390361195232586, + "loss": 1.3411, + "step": 21596 + }, + { + "epoch": 0.28064316495111613, + "grad_norm": 0.35250312089920044, + "learning_rate": 0.0001439010124904145, + "loss": 1.4228, + "step": 21597 + }, + { + "epoch": 0.28065615949503203, + "grad_norm": 0.39210233092308044, + "learning_rate": 0.0001438984130285031, + "loss": 1.4644, + "step": 21598 + }, + { + "epoch": 0.2806691540389479, + "grad_norm": 0.39643925428390503, + "learning_rate": 0.00014389581356659173, + "loss": 1.2163, + "step": 21599 + }, + { + "epoch": 0.2806821485828638, + "grad_norm": 0.3537564277648926, + "learning_rate": 0.00014389321410468033, + "loss": 1.1641, + "step": 21600 + }, + { + "epoch": 0.2806951431267796, + "grad_norm": 0.5541505217552185, + "learning_rate": 0.00014389061464276895, + "loss": 1.5356, + "step": 21601 + }, + { + "epoch": 0.2807081376706955, + "grad_norm": 0.3661820590496063, + "learning_rate": 0.00014388801518085758, + "loss": 1.3512, + "step": 21602 + }, + { + "epoch": 0.28072113221461137, + "grad_norm": 0.4304031729698181, + "learning_rate": 0.00014388541571894617, + "loss": 1.6132, + "step": 21603 + }, + { + "epoch": 0.28073412675852727, + "grad_norm": 0.44018834829330444, + "learning_rate": 0.0001438828162570348, + "loss": 1.3435, + "step": 21604 + }, + { + "epoch": 0.2807471213024431, + "grad_norm": 0.42459535598754883, + "learning_rate": 0.00014388021679512342, + "loss": 1.4118, + "step": 21605 + }, + { + "epoch": 0.280760115846359, + "grad_norm": 0.5122298002243042, + "learning_rate": 0.00014387761733321202, + "loss": 1.5491, + "step": 21606 + }, + { + "epoch": 0.28077311039027486, + "grad_norm": 0.3131127655506134, + "learning_rate": 0.00014387501787130064, + "loss": 1.2474, + "step": 21607 + }, + { + "epoch": 0.28078610493419076, + "grad_norm": 0.4709749221801758, + "learning_rate": 0.00014387241840938924, + "loss": 1.5598, + "step": 21608 + }, + { + "epoch": 0.2807990994781066, + "grad_norm": 0.4325936734676361, + "learning_rate": 0.0001438698189474779, + "loss": 1.4556, + "step": 21609 + }, + { + "epoch": 0.2808120940220225, + "grad_norm": 0.42554768919944763, + "learning_rate": 0.0001438672194855665, + "loss": 1.4737, + "step": 21610 + }, + { + "epoch": 0.28082508856593835, + "grad_norm": 0.4179743528366089, + "learning_rate": 0.00014386462002365512, + "loss": 1.4633, + "step": 21611 + }, + { + "epoch": 0.28083808310985425, + "grad_norm": 0.37180235981941223, + "learning_rate": 0.0001438620205617437, + "loss": 1.2854, + "step": 21612 + }, + { + "epoch": 0.2808510776537701, + "grad_norm": 0.4042123556137085, + "learning_rate": 0.00014385942109983234, + "loss": 1.6696, + "step": 21613 + }, + { + "epoch": 0.280864072197686, + "grad_norm": 0.3201903700828552, + "learning_rate": 0.00014385682163792096, + "loss": 1.1982, + "step": 21614 + }, + { + "epoch": 0.28087706674160184, + "grad_norm": 0.4126211702823639, + "learning_rate": 0.00014385422217600956, + "loss": 1.3844, + "step": 21615 + }, + { + "epoch": 0.28089006128551774, + "grad_norm": 0.3589378297328949, + "learning_rate": 0.00014385162271409818, + "loss": 1.4956, + "step": 21616 + }, + { + "epoch": 0.2809030558294336, + "grad_norm": 0.3412325382232666, + "learning_rate": 0.0001438490232521868, + "loss": 1.2256, + "step": 21617 + }, + { + "epoch": 0.2809160503733495, + "grad_norm": 0.4087387025356293, + "learning_rate": 0.00014384642379027543, + "loss": 1.2813, + "step": 21618 + }, + { + "epoch": 0.28092904491726534, + "grad_norm": 0.39880043268203735, + "learning_rate": 0.00014384382432836403, + "loss": 1.3026, + "step": 21619 + }, + { + "epoch": 0.28094203946118124, + "grad_norm": 0.4509781301021576, + "learning_rate": 0.00014384122486645265, + "loss": 1.5823, + "step": 21620 + }, + { + "epoch": 0.28095503400509714, + "grad_norm": 0.4069022536277771, + "learning_rate": 0.00014383862540454128, + "loss": 1.3807, + "step": 21621 + }, + { + "epoch": 0.280968028549013, + "grad_norm": 0.39547228813171387, + "learning_rate": 0.00014383602594262988, + "loss": 1.1127, + "step": 21622 + }, + { + "epoch": 0.2809810230929289, + "grad_norm": 0.38375696539878845, + "learning_rate": 0.0001438334264807185, + "loss": 1.2796, + "step": 21623 + }, + { + "epoch": 0.28099401763684473, + "grad_norm": 0.36782675981521606, + "learning_rate": 0.00014383082701880713, + "loss": 1.2502, + "step": 21624 + }, + { + "epoch": 0.28100701218076063, + "grad_norm": 0.43092501163482666, + "learning_rate": 0.00014382822755689572, + "loss": 1.4962, + "step": 21625 + }, + { + "epoch": 0.2810200067246765, + "grad_norm": 0.41716548800468445, + "learning_rate": 0.00014382562809498435, + "loss": 1.5437, + "step": 21626 + }, + { + "epoch": 0.2810330012685924, + "grad_norm": 0.4281606376171112, + "learning_rate": 0.00014382302863307294, + "loss": 1.4638, + "step": 21627 + }, + { + "epoch": 0.2810459958125082, + "grad_norm": 0.42682525515556335, + "learning_rate": 0.0001438204291711616, + "loss": 1.365, + "step": 21628 + }, + { + "epoch": 0.2810589903564241, + "grad_norm": 0.44276565313339233, + "learning_rate": 0.0001438178297092502, + "loss": 1.4691, + "step": 21629 + }, + { + "epoch": 0.28107198490033997, + "grad_norm": 0.38427865505218506, + "learning_rate": 0.00014381523024733882, + "loss": 1.331, + "step": 21630 + }, + { + "epoch": 0.28108497944425587, + "grad_norm": 0.46011996269226074, + "learning_rate": 0.00014381263078542742, + "loss": 1.6645, + "step": 21631 + }, + { + "epoch": 0.2810979739881717, + "grad_norm": 0.4322268068790436, + "learning_rate": 0.00014381003132351604, + "loss": 1.3499, + "step": 21632 + }, + { + "epoch": 0.2811109685320876, + "grad_norm": 0.38161829113960266, + "learning_rate": 0.00014380743186160466, + "loss": 1.2661, + "step": 21633 + }, + { + "epoch": 0.28112396307600346, + "grad_norm": 0.4218149185180664, + "learning_rate": 0.00014380483239969326, + "loss": 1.4326, + "step": 21634 + }, + { + "epoch": 0.28113695761991936, + "grad_norm": 0.3905273377895355, + "learning_rate": 0.0001438022329377819, + "loss": 1.4713, + "step": 21635 + }, + { + "epoch": 0.2811499521638352, + "grad_norm": 0.4071963429450989, + "learning_rate": 0.0001437996334758705, + "loss": 1.4191, + "step": 21636 + }, + { + "epoch": 0.2811629467077511, + "grad_norm": 0.5114985704421997, + "learning_rate": 0.0001437970340139591, + "loss": 1.6028, + "step": 21637 + }, + { + "epoch": 0.28117594125166695, + "grad_norm": 0.5102190971374512, + "learning_rate": 0.00014379443455204773, + "loss": 1.3921, + "step": 21638 + }, + { + "epoch": 0.28118893579558285, + "grad_norm": 0.4546314477920532, + "learning_rate": 0.00014379183509013633, + "loss": 1.5311, + "step": 21639 + }, + { + "epoch": 0.2812019303394987, + "grad_norm": 0.43757355213165283, + "learning_rate": 0.00014378923562822498, + "loss": 1.4271, + "step": 21640 + }, + { + "epoch": 0.2812149248834146, + "grad_norm": 0.4324467182159424, + "learning_rate": 0.00014378663616631358, + "loss": 1.4025, + "step": 21641 + }, + { + "epoch": 0.28122791942733044, + "grad_norm": 0.4437076449394226, + "learning_rate": 0.0001437840367044022, + "loss": 1.508, + "step": 21642 + }, + { + "epoch": 0.28124091397124634, + "grad_norm": 0.42971280217170715, + "learning_rate": 0.0001437814372424908, + "loss": 1.477, + "step": 21643 + }, + { + "epoch": 0.2812539085151622, + "grad_norm": 0.4508056342601776, + "learning_rate": 0.00014377883778057943, + "loss": 1.5793, + "step": 21644 + }, + { + "epoch": 0.2812669030590781, + "grad_norm": 0.3884173631668091, + "learning_rate": 0.00014377623831866805, + "loss": 1.5439, + "step": 21645 + }, + { + "epoch": 0.28127989760299393, + "grad_norm": 0.4531201422214508, + "learning_rate": 0.00014377363885675665, + "loss": 1.4845, + "step": 21646 + }, + { + "epoch": 0.28129289214690983, + "grad_norm": 0.42577460408210754, + "learning_rate": 0.00014377103939484527, + "loss": 1.3472, + "step": 21647 + }, + { + "epoch": 0.2813058866908257, + "grad_norm": 0.4146845042705536, + "learning_rate": 0.0001437684399329339, + "loss": 1.4833, + "step": 21648 + }, + { + "epoch": 0.2813188812347416, + "grad_norm": 0.37905383110046387, + "learning_rate": 0.0001437658404710225, + "loss": 1.4207, + "step": 21649 + }, + { + "epoch": 0.2813318757786574, + "grad_norm": 0.46708089113235474, + "learning_rate": 0.00014376324100911112, + "loss": 1.3243, + "step": 21650 + }, + { + "epoch": 0.2813448703225733, + "grad_norm": 0.5826940536499023, + "learning_rate": 0.00014376064154719972, + "loss": 1.4155, + "step": 21651 + }, + { + "epoch": 0.28135786486648917, + "grad_norm": 0.4636281430721283, + "learning_rate": 0.00014375804208528837, + "loss": 1.4661, + "step": 21652 + }, + { + "epoch": 0.28137085941040507, + "grad_norm": 0.38280513882637024, + "learning_rate": 0.00014375544262337696, + "loss": 1.4645, + "step": 21653 + }, + { + "epoch": 0.2813838539543209, + "grad_norm": 0.41344279050827026, + "learning_rate": 0.0001437528431614656, + "loss": 1.4461, + "step": 21654 + }, + { + "epoch": 0.2813968484982368, + "grad_norm": 0.46122950315475464, + "learning_rate": 0.00014375024369955421, + "loss": 1.4312, + "step": 21655 + }, + { + "epoch": 0.28140984304215266, + "grad_norm": 0.39554929733276367, + "learning_rate": 0.0001437476442376428, + "loss": 1.4736, + "step": 21656 + }, + { + "epoch": 0.28142283758606856, + "grad_norm": 0.4467918872833252, + "learning_rate": 0.00014374504477573144, + "loss": 1.5203, + "step": 21657 + }, + { + "epoch": 0.2814358321299844, + "grad_norm": 0.32374107837677, + "learning_rate": 0.00014374244531382003, + "loss": 1.3642, + "step": 21658 + }, + { + "epoch": 0.2814488266739003, + "grad_norm": 0.4517781734466553, + "learning_rate": 0.00014373984585190868, + "loss": 1.413, + "step": 21659 + }, + { + "epoch": 0.28146182121781615, + "grad_norm": 0.5237303376197815, + "learning_rate": 0.00014373724638999728, + "loss": 1.4423, + "step": 21660 + }, + { + "epoch": 0.28147481576173206, + "grad_norm": 0.3754831850528717, + "learning_rate": 0.00014373464692808588, + "loss": 1.3623, + "step": 21661 + }, + { + "epoch": 0.2814878103056479, + "grad_norm": 0.42003101110458374, + "learning_rate": 0.0001437320474661745, + "loss": 1.4949, + "step": 21662 + }, + { + "epoch": 0.2815008048495638, + "grad_norm": 0.4061402976512909, + "learning_rate": 0.00014372944800426313, + "loss": 1.5131, + "step": 21663 + }, + { + "epoch": 0.28151379939347965, + "grad_norm": 0.4422401487827301, + "learning_rate": 0.00014372684854235175, + "loss": 1.4298, + "step": 21664 + }, + { + "epoch": 0.28152679393739555, + "grad_norm": 0.45768260955810547, + "learning_rate": 0.00014372424908044035, + "loss": 1.3497, + "step": 21665 + }, + { + "epoch": 0.2815397884813114, + "grad_norm": 0.40046998858451843, + "learning_rate": 0.00014372164961852897, + "loss": 1.2383, + "step": 21666 + }, + { + "epoch": 0.2815527830252273, + "grad_norm": 0.4369175434112549, + "learning_rate": 0.0001437190501566176, + "loss": 1.5959, + "step": 21667 + }, + { + "epoch": 0.28156577756914314, + "grad_norm": 0.3852870762348175, + "learning_rate": 0.0001437164506947062, + "loss": 1.3339, + "step": 21668 + }, + { + "epoch": 0.28157877211305904, + "grad_norm": 0.3945727050304413, + "learning_rate": 0.00014371385123279482, + "loss": 1.219, + "step": 21669 + }, + { + "epoch": 0.2815917666569749, + "grad_norm": 0.4296533763408661, + "learning_rate": 0.00014371125177088342, + "loss": 1.4618, + "step": 21670 + }, + { + "epoch": 0.2816047612008908, + "grad_norm": 0.35974952578544617, + "learning_rate": 0.00014370865230897207, + "loss": 1.4644, + "step": 21671 + }, + { + "epoch": 0.28161775574480663, + "grad_norm": 0.46538421511650085, + "learning_rate": 0.00014370605284706067, + "loss": 1.3791, + "step": 21672 + }, + { + "epoch": 0.28163075028872253, + "grad_norm": 0.33413568139076233, + "learning_rate": 0.0001437034533851493, + "loss": 1.4626, + "step": 21673 + }, + { + "epoch": 0.2816437448326384, + "grad_norm": 0.4075777232646942, + "learning_rate": 0.0001437008539232379, + "loss": 1.3576, + "step": 21674 + }, + { + "epoch": 0.2816567393765543, + "grad_norm": 0.35055723786354065, + "learning_rate": 0.0001436982544613265, + "loss": 1.3586, + "step": 21675 + }, + { + "epoch": 0.2816697339204701, + "grad_norm": 0.5690978169441223, + "learning_rate": 0.00014369565499941514, + "loss": 1.3709, + "step": 21676 + }, + { + "epoch": 0.281682728464386, + "grad_norm": 0.3441561460494995, + "learning_rate": 0.00014369305553750374, + "loss": 1.2507, + "step": 21677 + }, + { + "epoch": 0.28169572300830187, + "grad_norm": 0.36188071966171265, + "learning_rate": 0.00014369045607559236, + "loss": 1.4651, + "step": 21678 + }, + { + "epoch": 0.28170871755221777, + "grad_norm": 0.47652989625930786, + "learning_rate": 0.00014368785661368098, + "loss": 1.4811, + "step": 21679 + }, + { + "epoch": 0.2817217120961336, + "grad_norm": 0.48744603991508484, + "learning_rate": 0.00014368525715176958, + "loss": 1.4591, + "step": 21680 + }, + { + "epoch": 0.2817347066400495, + "grad_norm": 0.4956151247024536, + "learning_rate": 0.0001436826576898582, + "loss": 1.4191, + "step": 21681 + }, + { + "epoch": 0.28174770118396536, + "grad_norm": 0.41328027844429016, + "learning_rate": 0.0001436800582279468, + "loss": 1.3678, + "step": 21682 + }, + { + "epoch": 0.28176069572788126, + "grad_norm": 0.34563249349594116, + "learning_rate": 0.00014367745876603546, + "loss": 1.3641, + "step": 21683 + }, + { + "epoch": 0.2817736902717971, + "grad_norm": 0.3507574200630188, + "learning_rate": 0.00014367485930412405, + "loss": 1.2525, + "step": 21684 + }, + { + "epoch": 0.281786684815713, + "grad_norm": 0.4258630573749542, + "learning_rate": 0.00014367225984221268, + "loss": 1.4797, + "step": 21685 + }, + { + "epoch": 0.28179967935962885, + "grad_norm": 0.44550222158432007, + "learning_rate": 0.00014366966038030127, + "loss": 1.4483, + "step": 21686 + }, + { + "epoch": 0.28181267390354475, + "grad_norm": 0.38555362820625305, + "learning_rate": 0.0001436670609183899, + "loss": 1.3341, + "step": 21687 + }, + { + "epoch": 0.2818256684474606, + "grad_norm": 0.3425605297088623, + "learning_rate": 0.00014366446145647852, + "loss": 1.3425, + "step": 21688 + }, + { + "epoch": 0.2818386629913765, + "grad_norm": 0.4604874551296234, + "learning_rate": 0.00014366186199456712, + "loss": 1.4784, + "step": 21689 + }, + { + "epoch": 0.28185165753529234, + "grad_norm": 0.3135378062725067, + "learning_rate": 0.00014365926253265575, + "loss": 1.2873, + "step": 21690 + }, + { + "epoch": 0.28186465207920824, + "grad_norm": 0.46984222531318665, + "learning_rate": 0.00014365666307074437, + "loss": 1.3769, + "step": 21691 + }, + { + "epoch": 0.2818776466231241, + "grad_norm": 0.32434436678886414, + "learning_rate": 0.00014365406360883297, + "loss": 1.3276, + "step": 21692 + }, + { + "epoch": 0.28189064116704, + "grad_norm": 0.30501803755760193, + "learning_rate": 0.0001436514641469216, + "loss": 1.4448, + "step": 21693 + }, + { + "epoch": 0.28190363571095584, + "grad_norm": 0.3443998098373413, + "learning_rate": 0.00014364886468501022, + "loss": 1.5251, + "step": 21694 + }, + { + "epoch": 0.28191663025487174, + "grad_norm": 0.4193110466003418, + "learning_rate": 0.00014364626522309884, + "loss": 1.5764, + "step": 21695 + }, + { + "epoch": 0.2819296247987876, + "grad_norm": 0.38531726598739624, + "learning_rate": 0.00014364366576118744, + "loss": 1.2305, + "step": 21696 + }, + { + "epoch": 0.2819426193427035, + "grad_norm": 0.4568133056163788, + "learning_rate": 0.00014364106629927606, + "loss": 1.368, + "step": 21697 + }, + { + "epoch": 0.2819556138866194, + "grad_norm": 0.447349488735199, + "learning_rate": 0.0001436384668373647, + "loss": 1.3013, + "step": 21698 + }, + { + "epoch": 0.28196860843053523, + "grad_norm": 0.4292466342449188, + "learning_rate": 0.00014363586737545328, + "loss": 1.527, + "step": 21699 + }, + { + "epoch": 0.28198160297445113, + "grad_norm": 0.29827407002449036, + "learning_rate": 0.0001436332679135419, + "loss": 1.0936, + "step": 21700 + }, + { + "epoch": 0.281994597518367, + "grad_norm": 0.5215676426887512, + "learning_rate": 0.0001436306684516305, + "loss": 1.463, + "step": 21701 + }, + { + "epoch": 0.2820075920622829, + "grad_norm": 0.4000895321369171, + "learning_rate": 0.00014362806898971916, + "loss": 1.4644, + "step": 21702 + }, + { + "epoch": 0.2820205866061987, + "grad_norm": 0.48442643880844116, + "learning_rate": 0.00014362546952780776, + "loss": 1.5668, + "step": 21703 + }, + { + "epoch": 0.2820335811501146, + "grad_norm": 0.3367200791835785, + "learning_rate": 0.00014362287006589635, + "loss": 1.405, + "step": 21704 + }, + { + "epoch": 0.28204657569403047, + "grad_norm": 0.30173593759536743, + "learning_rate": 0.00014362027060398498, + "loss": 1.3217, + "step": 21705 + }, + { + "epoch": 0.28205957023794637, + "grad_norm": 0.4231071174144745, + "learning_rate": 0.0001436176711420736, + "loss": 1.3678, + "step": 21706 + }, + { + "epoch": 0.2820725647818622, + "grad_norm": 0.40426525473594666, + "learning_rate": 0.00014361507168016223, + "loss": 1.4909, + "step": 21707 + }, + { + "epoch": 0.2820855593257781, + "grad_norm": 0.3810115158557892, + "learning_rate": 0.00014361247221825082, + "loss": 1.2473, + "step": 21708 + }, + { + "epoch": 0.28209855386969396, + "grad_norm": 0.42240890860557556, + "learning_rate": 0.00014360987275633945, + "loss": 1.3171, + "step": 21709 + }, + { + "epoch": 0.28211154841360986, + "grad_norm": 0.31563258171081543, + "learning_rate": 0.00014360727329442807, + "loss": 1.3224, + "step": 21710 + }, + { + "epoch": 0.2821245429575257, + "grad_norm": 0.4698585867881775, + "learning_rate": 0.00014360467383251667, + "loss": 1.4441, + "step": 21711 + }, + { + "epoch": 0.2821375375014416, + "grad_norm": 0.3946951627731323, + "learning_rate": 0.0001436020743706053, + "loss": 1.4574, + "step": 21712 + }, + { + "epoch": 0.28215053204535745, + "grad_norm": 0.3217959702014923, + "learning_rate": 0.0001435994749086939, + "loss": 1.5407, + "step": 21713 + }, + { + "epoch": 0.28216352658927335, + "grad_norm": 0.45142993330955505, + "learning_rate": 0.00014359687544678254, + "loss": 1.3828, + "step": 21714 + }, + { + "epoch": 0.2821765211331892, + "grad_norm": 0.4140826165676117, + "learning_rate": 0.00014359427598487114, + "loss": 1.3128, + "step": 21715 + }, + { + "epoch": 0.2821895156771051, + "grad_norm": 0.23171290755271912, + "learning_rate": 0.00014359167652295974, + "loss": 1.299, + "step": 21716 + }, + { + "epoch": 0.28220251022102094, + "grad_norm": 0.46328428387641907, + "learning_rate": 0.00014358907706104836, + "loss": 1.544, + "step": 21717 + }, + { + "epoch": 0.28221550476493684, + "grad_norm": 0.3866714537143707, + "learning_rate": 0.000143586477599137, + "loss": 1.3213, + "step": 21718 + }, + { + "epoch": 0.2822284993088527, + "grad_norm": 0.5351392030715942, + "learning_rate": 0.0001435838781372256, + "loss": 1.5189, + "step": 21719 + }, + { + "epoch": 0.2822414938527686, + "grad_norm": 0.39193418622016907, + "learning_rate": 0.0001435812786753142, + "loss": 1.4766, + "step": 21720 + }, + { + "epoch": 0.28225448839668443, + "grad_norm": 0.4878564178943634, + "learning_rate": 0.00014357867921340283, + "loss": 1.4514, + "step": 21721 + }, + { + "epoch": 0.28226748294060033, + "grad_norm": 0.5170662999153137, + "learning_rate": 0.00014357607975149146, + "loss": 1.5403, + "step": 21722 + }, + { + "epoch": 0.2822804774845162, + "grad_norm": 0.6031299829483032, + "learning_rate": 0.00014357348028958006, + "loss": 1.3689, + "step": 21723 + }, + { + "epoch": 0.2822934720284321, + "grad_norm": 0.32543259859085083, + "learning_rate": 0.00014357088082766868, + "loss": 1.4631, + "step": 21724 + }, + { + "epoch": 0.2823064665723479, + "grad_norm": 0.4092324674129486, + "learning_rate": 0.00014356828136575728, + "loss": 1.5122, + "step": 21725 + }, + { + "epoch": 0.2823194611162638, + "grad_norm": 0.47104087471961975, + "learning_rate": 0.00014356568190384593, + "loss": 1.4428, + "step": 21726 + }, + { + "epoch": 0.28233245566017967, + "grad_norm": 0.431558221578598, + "learning_rate": 0.00014356308244193453, + "loss": 1.5317, + "step": 21727 + }, + { + "epoch": 0.28234545020409557, + "grad_norm": 0.45123809576034546, + "learning_rate": 0.00014356048298002312, + "loss": 1.4795, + "step": 21728 + }, + { + "epoch": 0.2823584447480114, + "grad_norm": 0.3939613997936249, + "learning_rate": 0.00014355788351811177, + "loss": 1.3566, + "step": 21729 + }, + { + "epoch": 0.2823714392919273, + "grad_norm": 0.3815322816371918, + "learning_rate": 0.00014355528405620037, + "loss": 1.7065, + "step": 21730 + }, + { + "epoch": 0.28238443383584316, + "grad_norm": 0.44508302211761475, + "learning_rate": 0.000143552684594289, + "loss": 1.4391, + "step": 21731 + }, + { + "epoch": 0.28239742837975906, + "grad_norm": 0.369547039270401, + "learning_rate": 0.0001435500851323776, + "loss": 1.6936, + "step": 21732 + }, + { + "epoch": 0.2824104229236749, + "grad_norm": 0.4604247212409973, + "learning_rate": 0.00014354748567046622, + "loss": 1.4554, + "step": 21733 + }, + { + "epoch": 0.2824234174675908, + "grad_norm": 0.41318246722221375, + "learning_rate": 0.00014354488620855484, + "loss": 1.4091, + "step": 21734 + }, + { + "epoch": 0.28243641201150665, + "grad_norm": 0.4227082133293152, + "learning_rate": 0.00014354228674664344, + "loss": 1.4725, + "step": 21735 + }, + { + "epoch": 0.28244940655542256, + "grad_norm": 0.4632466435432434, + "learning_rate": 0.00014353968728473206, + "loss": 1.3923, + "step": 21736 + }, + { + "epoch": 0.2824624010993384, + "grad_norm": 0.33906590938568115, + "learning_rate": 0.0001435370878228207, + "loss": 1.375, + "step": 21737 + }, + { + "epoch": 0.2824753956432543, + "grad_norm": 0.3812054395675659, + "learning_rate": 0.00014353448836090931, + "loss": 1.5133, + "step": 21738 + }, + { + "epoch": 0.28248839018717015, + "grad_norm": 0.4794810116291046, + "learning_rate": 0.0001435318888989979, + "loss": 1.3677, + "step": 21739 + }, + { + "epoch": 0.28250138473108605, + "grad_norm": 0.369107186794281, + "learning_rate": 0.00014352928943708654, + "loss": 1.332, + "step": 21740 + }, + { + "epoch": 0.2825143792750019, + "grad_norm": 0.5090543031692505, + "learning_rate": 0.00014352668997517516, + "loss": 1.3189, + "step": 21741 + }, + { + "epoch": 0.2825273738189178, + "grad_norm": 0.3605302572250366, + "learning_rate": 0.00014352409051326376, + "loss": 1.4531, + "step": 21742 + }, + { + "epoch": 0.28254036836283364, + "grad_norm": 0.48940834403038025, + "learning_rate": 0.00014352149105135238, + "loss": 1.4939, + "step": 21743 + }, + { + "epoch": 0.28255336290674954, + "grad_norm": 0.3710671067237854, + "learning_rate": 0.00014351889158944098, + "loss": 1.5996, + "step": 21744 + }, + { + "epoch": 0.2825663574506654, + "grad_norm": 0.3644084334373474, + "learning_rate": 0.0001435162921275296, + "loss": 1.4049, + "step": 21745 + }, + { + "epoch": 0.2825793519945813, + "grad_norm": 0.38626033067703247, + "learning_rate": 0.00014351369266561823, + "loss": 1.2626, + "step": 21746 + }, + { + "epoch": 0.28259234653849713, + "grad_norm": 0.42499861121177673, + "learning_rate": 0.00014351109320370683, + "loss": 1.4287, + "step": 21747 + }, + { + "epoch": 0.28260534108241303, + "grad_norm": 0.4534740149974823, + "learning_rate": 0.00014350849374179545, + "loss": 1.3844, + "step": 21748 + }, + { + "epoch": 0.2826183356263289, + "grad_norm": 0.46116897463798523, + "learning_rate": 0.00014350589427988407, + "loss": 1.5236, + "step": 21749 + }, + { + "epoch": 0.2826313301702448, + "grad_norm": 0.4565775990486145, + "learning_rate": 0.0001435032948179727, + "loss": 1.4928, + "step": 21750 + }, + { + "epoch": 0.2826443247141606, + "grad_norm": 0.42477887868881226, + "learning_rate": 0.0001435006953560613, + "loss": 1.548, + "step": 21751 + }, + { + "epoch": 0.2826573192580765, + "grad_norm": 0.4219886362552643, + "learning_rate": 0.00014349809589414992, + "loss": 1.4348, + "step": 21752 + }, + { + "epoch": 0.28267031380199237, + "grad_norm": 0.39015692472457886, + "learning_rate": 0.00014349549643223855, + "loss": 1.4617, + "step": 21753 + }, + { + "epoch": 0.28268330834590827, + "grad_norm": 0.3318279981613159, + "learning_rate": 0.00014349289697032714, + "loss": 1.2249, + "step": 21754 + }, + { + "epoch": 0.2826963028898241, + "grad_norm": 0.32213252782821655, + "learning_rate": 0.00014349029750841577, + "loss": 1.4706, + "step": 21755 + }, + { + "epoch": 0.28270929743374, + "grad_norm": 0.467014878988266, + "learning_rate": 0.00014348769804650436, + "loss": 1.4635, + "step": 21756 + }, + { + "epoch": 0.28272229197765586, + "grad_norm": 0.4662379324436188, + "learning_rate": 0.000143485098584593, + "loss": 1.3668, + "step": 21757 + }, + { + "epoch": 0.28273528652157176, + "grad_norm": 0.4565616250038147, + "learning_rate": 0.00014348249912268161, + "loss": 1.5082, + "step": 21758 + }, + { + "epoch": 0.2827482810654876, + "grad_norm": 0.3721349537372589, + "learning_rate": 0.0001434798996607702, + "loss": 1.4445, + "step": 21759 + }, + { + "epoch": 0.2827612756094035, + "grad_norm": 0.42279231548309326, + "learning_rate": 0.00014347730019885884, + "loss": 1.5313, + "step": 21760 + }, + { + "epoch": 0.28277427015331935, + "grad_norm": 0.37611299753189087, + "learning_rate": 0.00014347470073694746, + "loss": 1.3632, + "step": 21761 + }, + { + "epoch": 0.28278726469723525, + "grad_norm": 0.40103912353515625, + "learning_rate": 0.00014347210127503608, + "loss": 1.33, + "step": 21762 + }, + { + "epoch": 0.2828002592411511, + "grad_norm": 0.4093247056007385, + "learning_rate": 0.00014346950181312468, + "loss": 1.4338, + "step": 21763 + }, + { + "epoch": 0.282813253785067, + "grad_norm": 0.5236377120018005, + "learning_rate": 0.0001434669023512133, + "loss": 1.2763, + "step": 21764 + }, + { + "epoch": 0.28282624832898284, + "grad_norm": 0.3952511250972748, + "learning_rate": 0.00014346430288930193, + "loss": 1.4751, + "step": 21765 + }, + { + "epoch": 0.28283924287289874, + "grad_norm": 0.43484461307525635, + "learning_rate": 0.00014346170342739053, + "loss": 1.3466, + "step": 21766 + }, + { + "epoch": 0.2828522374168146, + "grad_norm": 0.4340057969093323, + "learning_rate": 0.00014345910396547915, + "loss": 1.5659, + "step": 21767 + }, + { + "epoch": 0.2828652319607305, + "grad_norm": 0.39850088953971863, + "learning_rate": 0.00014345650450356778, + "loss": 1.4022, + "step": 21768 + }, + { + "epoch": 0.28287822650464634, + "grad_norm": 0.41238662600517273, + "learning_rate": 0.0001434539050416564, + "loss": 1.416, + "step": 21769 + }, + { + "epoch": 0.28289122104856224, + "grad_norm": 0.37408211827278137, + "learning_rate": 0.000143451305579745, + "loss": 1.1999, + "step": 21770 + }, + { + "epoch": 0.2829042155924781, + "grad_norm": 0.45001137256622314, + "learning_rate": 0.0001434487061178336, + "loss": 1.4894, + "step": 21771 + }, + { + "epoch": 0.282917210136394, + "grad_norm": 0.342305988073349, + "learning_rate": 0.00014344610665592225, + "loss": 1.4074, + "step": 21772 + }, + { + "epoch": 0.2829302046803099, + "grad_norm": 0.4604296088218689, + "learning_rate": 0.00014344350719401085, + "loss": 1.4135, + "step": 21773 + }, + { + "epoch": 0.2829431992242257, + "grad_norm": 0.30350837111473083, + "learning_rate": 0.00014344090773209947, + "loss": 1.3574, + "step": 21774 + }, + { + "epoch": 0.28295619376814163, + "grad_norm": 0.37793588638305664, + "learning_rate": 0.00014343830827018807, + "loss": 1.4875, + "step": 21775 + }, + { + "epoch": 0.2829691883120575, + "grad_norm": 0.3413355350494385, + "learning_rate": 0.0001434357088082767, + "loss": 1.3764, + "step": 21776 + }, + { + "epoch": 0.2829821828559734, + "grad_norm": 0.4076234698295593, + "learning_rate": 0.00014343310934636532, + "loss": 1.36, + "step": 21777 + }, + { + "epoch": 0.2829951773998892, + "grad_norm": 0.41776394844055176, + "learning_rate": 0.00014343050988445391, + "loss": 1.3279, + "step": 21778 + }, + { + "epoch": 0.2830081719438051, + "grad_norm": 0.37759140133857727, + "learning_rate": 0.00014342791042254254, + "loss": 1.5197, + "step": 21779 + }, + { + "epoch": 0.28302116648772097, + "grad_norm": 0.5261659026145935, + "learning_rate": 0.00014342531096063116, + "loss": 1.4205, + "step": 21780 + }, + { + "epoch": 0.28303416103163687, + "grad_norm": 0.38115450739860535, + "learning_rate": 0.0001434227114987198, + "loss": 1.5498, + "step": 21781 + }, + { + "epoch": 0.2830471555755527, + "grad_norm": 0.42801326513290405, + "learning_rate": 0.00014342011203680838, + "loss": 1.3893, + "step": 21782 + }, + { + "epoch": 0.2830601501194686, + "grad_norm": 0.35237276554107666, + "learning_rate": 0.00014341751257489698, + "loss": 1.424, + "step": 21783 + }, + { + "epoch": 0.28307314466338446, + "grad_norm": 0.4044218063354492, + "learning_rate": 0.00014341491311298563, + "loss": 1.6333, + "step": 21784 + }, + { + "epoch": 0.28308613920730036, + "grad_norm": 0.3681308925151825, + "learning_rate": 0.00014341231365107423, + "loss": 1.3307, + "step": 21785 + }, + { + "epoch": 0.2830991337512162, + "grad_norm": 0.37480831146240234, + "learning_rate": 0.00014340971418916286, + "loss": 1.3504, + "step": 21786 + }, + { + "epoch": 0.2831121282951321, + "grad_norm": 0.37955859303474426, + "learning_rate": 0.00014340711472725145, + "loss": 1.4032, + "step": 21787 + }, + { + "epoch": 0.28312512283904795, + "grad_norm": 0.445197731256485, + "learning_rate": 0.00014340451526534008, + "loss": 1.4659, + "step": 21788 + }, + { + "epoch": 0.28313811738296385, + "grad_norm": 0.43626245856285095, + "learning_rate": 0.0001434019158034287, + "loss": 1.3216, + "step": 21789 + }, + { + "epoch": 0.2831511119268797, + "grad_norm": 0.4422862231731415, + "learning_rate": 0.0001433993163415173, + "loss": 1.3629, + "step": 21790 + }, + { + "epoch": 0.2831641064707956, + "grad_norm": 0.3592076897621155, + "learning_rate": 0.00014339671687960592, + "loss": 1.2363, + "step": 21791 + }, + { + "epoch": 0.28317710101471144, + "grad_norm": 0.37475383281707764, + "learning_rate": 0.00014339411741769455, + "loss": 1.2707, + "step": 21792 + }, + { + "epoch": 0.28319009555862734, + "grad_norm": 0.35880836844444275, + "learning_rate": 0.00014339151795578317, + "loss": 1.2994, + "step": 21793 + }, + { + "epoch": 0.2832030901025432, + "grad_norm": 0.3758575916290283, + "learning_rate": 0.00014338891849387177, + "loss": 1.502, + "step": 21794 + }, + { + "epoch": 0.2832160846464591, + "grad_norm": 0.39884600043296814, + "learning_rate": 0.0001433863190319604, + "loss": 1.5207, + "step": 21795 + }, + { + "epoch": 0.28322907919037493, + "grad_norm": 0.35125336050987244, + "learning_rate": 0.00014338371957004902, + "loss": 1.1604, + "step": 21796 + }, + { + "epoch": 0.28324207373429083, + "grad_norm": 0.4417129158973694, + "learning_rate": 0.00014338112010813762, + "loss": 1.4518, + "step": 21797 + }, + { + "epoch": 0.2832550682782067, + "grad_norm": 0.36367693543434143, + "learning_rate": 0.00014337852064622624, + "loss": 1.5242, + "step": 21798 + }, + { + "epoch": 0.2832680628221226, + "grad_norm": 0.39889103174209595, + "learning_rate": 0.00014337592118431484, + "loss": 1.4116, + "step": 21799 + }, + { + "epoch": 0.2832810573660384, + "grad_norm": 0.3439438045024872, + "learning_rate": 0.00014337332172240346, + "loss": 1.3042, + "step": 21800 + }, + { + "epoch": 0.2832940519099543, + "grad_norm": 0.31915032863616943, + "learning_rate": 0.0001433707222604921, + "loss": 1.5826, + "step": 21801 + }, + { + "epoch": 0.28330704645387017, + "grad_norm": 0.42528292536735535, + "learning_rate": 0.00014336812279858068, + "loss": 1.4076, + "step": 21802 + }, + { + "epoch": 0.28332004099778607, + "grad_norm": 0.3702611029148102, + "learning_rate": 0.00014336552333666934, + "loss": 1.2547, + "step": 21803 + }, + { + "epoch": 0.2833330355417019, + "grad_norm": 0.40039560198783875, + "learning_rate": 0.00014336292387475793, + "loss": 1.4127, + "step": 21804 + }, + { + "epoch": 0.2833460300856178, + "grad_norm": 0.4360812306404114, + "learning_rate": 0.00014336032441284656, + "loss": 1.494, + "step": 21805 + }, + { + "epoch": 0.28335902462953366, + "grad_norm": 0.3242342174053192, + "learning_rate": 0.00014335772495093516, + "loss": 1.2982, + "step": 21806 + }, + { + "epoch": 0.28337201917344956, + "grad_norm": 0.4012332260608673, + "learning_rate": 0.00014335512548902378, + "loss": 1.4413, + "step": 21807 + }, + { + "epoch": 0.2833850137173654, + "grad_norm": 0.4329386055469513, + "learning_rate": 0.0001433525260271124, + "loss": 1.5064, + "step": 21808 + }, + { + "epoch": 0.2833980082612813, + "grad_norm": 0.3515535593032837, + "learning_rate": 0.000143349926565201, + "loss": 1.5046, + "step": 21809 + }, + { + "epoch": 0.28341100280519715, + "grad_norm": 0.3048829436302185, + "learning_rate": 0.00014334732710328963, + "loss": 1.2416, + "step": 21810 + }, + { + "epoch": 0.28342399734911305, + "grad_norm": 0.4261169135570526, + "learning_rate": 0.00014334472764137825, + "loss": 1.6335, + "step": 21811 + }, + { + "epoch": 0.2834369918930289, + "grad_norm": 0.3847494125366211, + "learning_rate": 0.00014334212817946685, + "loss": 1.3573, + "step": 21812 + }, + { + "epoch": 0.2834499864369448, + "grad_norm": 0.34324175119400024, + "learning_rate": 0.00014333952871755547, + "loss": 1.2693, + "step": 21813 + }, + { + "epoch": 0.28346298098086065, + "grad_norm": 0.32652267813682556, + "learning_rate": 0.00014333692925564407, + "loss": 1.686, + "step": 21814 + }, + { + "epoch": 0.28347597552477655, + "grad_norm": 0.4678068161010742, + "learning_rate": 0.00014333432979373272, + "loss": 1.328, + "step": 21815 + }, + { + "epoch": 0.2834889700686924, + "grad_norm": 0.38122639060020447, + "learning_rate": 0.00014333173033182132, + "loss": 1.2032, + "step": 21816 + }, + { + "epoch": 0.2835019646126083, + "grad_norm": 0.3848731517791748, + "learning_rate": 0.00014332913086990994, + "loss": 1.2498, + "step": 21817 + }, + { + "epoch": 0.28351495915652414, + "grad_norm": 0.3523218035697937, + "learning_rate": 0.00014332653140799854, + "loss": 1.2221, + "step": 21818 + }, + { + "epoch": 0.28352795370044004, + "grad_norm": 0.44534292817115784, + "learning_rate": 0.00014332393194608717, + "loss": 1.2493, + "step": 21819 + }, + { + "epoch": 0.2835409482443559, + "grad_norm": 0.4481651484966278, + "learning_rate": 0.0001433213324841758, + "loss": 1.441, + "step": 21820 + }, + { + "epoch": 0.2835539427882718, + "grad_norm": 0.39966532588005066, + "learning_rate": 0.0001433187330222644, + "loss": 1.3248, + "step": 21821 + }, + { + "epoch": 0.28356693733218763, + "grad_norm": 0.43956258893013, + "learning_rate": 0.000143316133560353, + "loss": 1.4088, + "step": 21822 + }, + { + "epoch": 0.28357993187610353, + "grad_norm": 0.406645804643631, + "learning_rate": 0.00014331353409844164, + "loss": 1.7489, + "step": 21823 + }, + { + "epoch": 0.2835929264200194, + "grad_norm": 0.41724923253059387, + "learning_rate": 0.00014331093463653026, + "loss": 1.3259, + "step": 21824 + }, + { + "epoch": 0.2836059209639353, + "grad_norm": 0.40026265382766724, + "learning_rate": 0.00014330833517461886, + "loss": 1.4538, + "step": 21825 + }, + { + "epoch": 0.2836189155078511, + "grad_norm": 0.523308277130127, + "learning_rate": 0.00014330573571270746, + "loss": 1.4485, + "step": 21826 + }, + { + "epoch": 0.283631910051767, + "grad_norm": 0.38762956857681274, + "learning_rate": 0.0001433031362507961, + "loss": 1.4196, + "step": 21827 + }, + { + "epoch": 0.28364490459568287, + "grad_norm": 0.35311782360076904, + "learning_rate": 0.0001433005367888847, + "loss": 1.4274, + "step": 21828 + }, + { + "epoch": 0.28365789913959877, + "grad_norm": 0.45122388005256653, + "learning_rate": 0.00014329793732697333, + "loss": 1.4438, + "step": 21829 + }, + { + "epoch": 0.2836708936835146, + "grad_norm": 0.333778440952301, + "learning_rate": 0.00014329533786506193, + "loss": 1.2265, + "step": 21830 + }, + { + "epoch": 0.2836838882274305, + "grad_norm": 0.435007244348526, + "learning_rate": 0.00014329273840315055, + "loss": 1.4427, + "step": 21831 + }, + { + "epoch": 0.28369688277134636, + "grad_norm": 0.5377725958824158, + "learning_rate": 0.00014329013894123918, + "loss": 1.3354, + "step": 21832 + }, + { + "epoch": 0.28370987731526226, + "grad_norm": 0.3487373888492584, + "learning_rate": 0.00014328753947932777, + "loss": 1.3189, + "step": 21833 + }, + { + "epoch": 0.2837228718591781, + "grad_norm": 0.4342208206653595, + "learning_rate": 0.0001432849400174164, + "loss": 1.2371, + "step": 21834 + }, + { + "epoch": 0.283735866403094, + "grad_norm": 0.395367294549942, + "learning_rate": 0.00014328234055550502, + "loss": 1.4169, + "step": 21835 + }, + { + "epoch": 0.28374886094700985, + "grad_norm": 0.3446871042251587, + "learning_rate": 0.00014327974109359365, + "loss": 1.5585, + "step": 21836 + }, + { + "epoch": 0.28376185549092575, + "grad_norm": 0.36191636323928833, + "learning_rate": 0.00014327714163168224, + "loss": 1.2578, + "step": 21837 + }, + { + "epoch": 0.2837748500348416, + "grad_norm": 0.5307068824768066, + "learning_rate": 0.00014327454216977084, + "loss": 1.416, + "step": 21838 + }, + { + "epoch": 0.2837878445787575, + "grad_norm": 0.36990657448768616, + "learning_rate": 0.0001432719427078595, + "loss": 1.5035, + "step": 21839 + }, + { + "epoch": 0.28380083912267334, + "grad_norm": 0.3635912835597992, + "learning_rate": 0.0001432693432459481, + "loss": 1.476, + "step": 21840 + }, + { + "epoch": 0.28381383366658924, + "grad_norm": 0.49250268936157227, + "learning_rate": 0.00014326674378403671, + "loss": 1.6581, + "step": 21841 + }, + { + "epoch": 0.2838268282105051, + "grad_norm": 0.39382851123809814, + "learning_rate": 0.00014326414432212534, + "loss": 1.3253, + "step": 21842 + }, + { + "epoch": 0.283839822754421, + "grad_norm": 0.5141470432281494, + "learning_rate": 0.00014326154486021394, + "loss": 1.2545, + "step": 21843 + }, + { + "epoch": 0.28385281729833683, + "grad_norm": 0.3891243636608124, + "learning_rate": 0.00014325894539830256, + "loss": 1.351, + "step": 21844 + }, + { + "epoch": 0.28386581184225274, + "grad_norm": 0.40513208508491516, + "learning_rate": 0.00014325634593639116, + "loss": 1.622, + "step": 21845 + }, + { + "epoch": 0.2838788063861686, + "grad_norm": 0.4180625081062317, + "learning_rate": 0.0001432537464744798, + "loss": 1.6297, + "step": 21846 + }, + { + "epoch": 0.2838918009300845, + "grad_norm": 0.40011677145957947, + "learning_rate": 0.0001432511470125684, + "loss": 1.3979, + "step": 21847 + }, + { + "epoch": 0.2839047954740003, + "grad_norm": 0.40850383043289185, + "learning_rate": 0.00014324854755065703, + "loss": 1.5533, + "step": 21848 + }, + { + "epoch": 0.2839177900179162, + "grad_norm": 0.3992455005645752, + "learning_rate": 0.00014324594808874563, + "loss": 1.3892, + "step": 21849 + }, + { + "epoch": 0.28393078456183213, + "grad_norm": 0.33947646617889404, + "learning_rate": 0.00014324334862683425, + "loss": 1.2538, + "step": 21850 + }, + { + "epoch": 0.283943779105748, + "grad_norm": 0.43455737829208374, + "learning_rate": 0.00014324074916492288, + "loss": 1.5294, + "step": 21851 + }, + { + "epoch": 0.2839567736496639, + "grad_norm": 0.34516170620918274, + "learning_rate": 0.00014323814970301148, + "loss": 1.3957, + "step": 21852 + }, + { + "epoch": 0.2839697681935797, + "grad_norm": 0.3848903477191925, + "learning_rate": 0.0001432355502411001, + "loss": 1.1959, + "step": 21853 + }, + { + "epoch": 0.2839827627374956, + "grad_norm": 0.36252716183662415, + "learning_rate": 0.00014323295077918872, + "loss": 1.4499, + "step": 21854 + }, + { + "epoch": 0.28399575728141147, + "grad_norm": 0.49076372385025024, + "learning_rate": 0.00014323035131727732, + "loss": 1.7547, + "step": 21855 + }, + { + "epoch": 0.28400875182532737, + "grad_norm": 0.4048703610897064, + "learning_rate": 0.00014322775185536595, + "loss": 1.4122, + "step": 21856 + }, + { + "epoch": 0.2840217463692432, + "grad_norm": 0.4614471197128296, + "learning_rate": 0.00014322515239345454, + "loss": 1.4806, + "step": 21857 + }, + { + "epoch": 0.2840347409131591, + "grad_norm": 0.5605059862136841, + "learning_rate": 0.0001432225529315432, + "loss": 1.4019, + "step": 21858 + }, + { + "epoch": 0.28404773545707496, + "grad_norm": 0.5168873071670532, + "learning_rate": 0.0001432199534696318, + "loss": 1.265, + "step": 21859 + }, + { + "epoch": 0.28406073000099086, + "grad_norm": 0.494658499956131, + "learning_rate": 0.00014321735400772042, + "loss": 1.5678, + "step": 21860 + }, + { + "epoch": 0.2840737245449067, + "grad_norm": 0.39822033047676086, + "learning_rate": 0.00014321475454580901, + "loss": 1.2623, + "step": 21861 + }, + { + "epoch": 0.2840867190888226, + "grad_norm": 0.4155806601047516, + "learning_rate": 0.00014321215508389764, + "loss": 1.6313, + "step": 21862 + }, + { + "epoch": 0.28409971363273845, + "grad_norm": 0.40179985761642456, + "learning_rate": 0.00014320955562198626, + "loss": 1.1571, + "step": 21863 + }, + { + "epoch": 0.28411270817665435, + "grad_norm": 0.3448730707168579, + "learning_rate": 0.00014320695616007486, + "loss": 1.1556, + "step": 21864 + }, + { + "epoch": 0.2841257027205702, + "grad_norm": 0.3519286811351776, + "learning_rate": 0.00014320435669816349, + "loss": 1.1557, + "step": 21865 + }, + { + "epoch": 0.2841386972644861, + "grad_norm": 0.41627949476242065, + "learning_rate": 0.0001432017572362521, + "loss": 1.4976, + "step": 21866 + }, + { + "epoch": 0.28415169180840194, + "grad_norm": 0.28488898277282715, + "learning_rate": 0.0001431991577743407, + "loss": 1.4623, + "step": 21867 + }, + { + "epoch": 0.28416468635231784, + "grad_norm": 0.2909516394138336, + "learning_rate": 0.00014319655831242933, + "loss": 1.4012, + "step": 21868 + }, + { + "epoch": 0.2841776808962337, + "grad_norm": 0.43699222803115845, + "learning_rate": 0.00014319395885051793, + "loss": 1.6935, + "step": 21869 + }, + { + "epoch": 0.2841906754401496, + "grad_norm": 0.47480183839797974, + "learning_rate": 0.00014319135938860658, + "loss": 1.2782, + "step": 21870 + }, + { + "epoch": 0.28420366998406543, + "grad_norm": 0.4402593672275543, + "learning_rate": 0.00014318875992669518, + "loss": 1.5391, + "step": 21871 + }, + { + "epoch": 0.28421666452798133, + "grad_norm": 0.31280797719955444, + "learning_rate": 0.0001431861604647838, + "loss": 1.3815, + "step": 21872 + }, + { + "epoch": 0.2842296590718972, + "grad_norm": 0.2559697926044464, + "learning_rate": 0.0001431835610028724, + "loss": 1.3309, + "step": 21873 + }, + { + "epoch": 0.2842426536158131, + "grad_norm": 0.3198792040348053, + "learning_rate": 0.00014318096154096102, + "loss": 1.4866, + "step": 21874 + }, + { + "epoch": 0.2842556481597289, + "grad_norm": 0.4694131314754486, + "learning_rate": 0.00014317836207904965, + "loss": 1.5446, + "step": 21875 + }, + { + "epoch": 0.2842686427036448, + "grad_norm": 0.343665212392807, + "learning_rate": 0.00014317576261713825, + "loss": 1.5644, + "step": 21876 + }, + { + "epoch": 0.28428163724756067, + "grad_norm": 0.3482874631881714, + "learning_rate": 0.0001431731631552269, + "loss": 1.4043, + "step": 21877 + }, + { + "epoch": 0.28429463179147657, + "grad_norm": 0.25677087903022766, + "learning_rate": 0.0001431705636933155, + "loss": 1.4226, + "step": 21878 + }, + { + "epoch": 0.2843076263353924, + "grad_norm": 0.40079620480537415, + "learning_rate": 0.00014316796423140412, + "loss": 1.384, + "step": 21879 + }, + { + "epoch": 0.2843206208793083, + "grad_norm": 0.3208344578742981, + "learning_rate": 0.00014316536476949272, + "loss": 1.3486, + "step": 21880 + }, + { + "epoch": 0.28433361542322416, + "grad_norm": 0.4193015694618225, + "learning_rate": 0.00014316276530758134, + "loss": 1.3767, + "step": 21881 + }, + { + "epoch": 0.28434660996714006, + "grad_norm": 0.33976733684539795, + "learning_rate": 0.00014316016584566997, + "loss": 1.4011, + "step": 21882 + }, + { + "epoch": 0.2843596045110559, + "grad_norm": 0.41534972190856934, + "learning_rate": 0.00014315756638375856, + "loss": 1.3568, + "step": 21883 + }, + { + "epoch": 0.2843725990549718, + "grad_norm": 0.4091799557209015, + "learning_rate": 0.0001431549669218472, + "loss": 1.6021, + "step": 21884 + }, + { + "epoch": 0.28438559359888765, + "grad_norm": 0.28058111667633057, + "learning_rate": 0.0001431523674599358, + "loss": 1.3497, + "step": 21885 + }, + { + "epoch": 0.28439858814280355, + "grad_norm": 0.3813077211380005, + "learning_rate": 0.0001431497679980244, + "loss": 1.435, + "step": 21886 + }, + { + "epoch": 0.2844115826867194, + "grad_norm": 0.40862488746643066, + "learning_rate": 0.00014314716853611303, + "loss": 1.6265, + "step": 21887 + }, + { + "epoch": 0.2844245772306353, + "grad_norm": 0.39700421690940857, + "learning_rate": 0.00014314456907420163, + "loss": 1.3329, + "step": 21888 + }, + { + "epoch": 0.28443757177455115, + "grad_norm": 0.39813247323036194, + "learning_rate": 0.00014314196961229028, + "loss": 1.41, + "step": 21889 + }, + { + "epoch": 0.28445056631846705, + "grad_norm": 0.3587433099746704, + "learning_rate": 0.00014313937015037888, + "loss": 1.3962, + "step": 21890 + }, + { + "epoch": 0.2844635608623829, + "grad_norm": 0.39923006296157837, + "learning_rate": 0.0001431367706884675, + "loss": 1.328, + "step": 21891 + }, + { + "epoch": 0.2844765554062988, + "grad_norm": 0.3837178349494934, + "learning_rate": 0.0001431341712265561, + "loss": 1.322, + "step": 21892 + }, + { + "epoch": 0.28448954995021464, + "grad_norm": 0.25855177640914917, + "learning_rate": 0.00014313157176464473, + "loss": 1.2647, + "step": 21893 + }, + { + "epoch": 0.28450254449413054, + "grad_norm": 0.35650014877319336, + "learning_rate": 0.00014312897230273335, + "loss": 1.402, + "step": 21894 + }, + { + "epoch": 0.2845155390380464, + "grad_norm": 0.4028565585613251, + "learning_rate": 0.00014312637284082195, + "loss": 1.4435, + "step": 21895 + }, + { + "epoch": 0.2845285335819623, + "grad_norm": 0.3979090452194214, + "learning_rate": 0.00014312377337891057, + "loss": 1.566, + "step": 21896 + }, + { + "epoch": 0.28454152812587813, + "grad_norm": 0.24705497920513153, + "learning_rate": 0.0001431211739169992, + "loss": 1.4079, + "step": 21897 + }, + { + "epoch": 0.28455452266979403, + "grad_norm": 0.4081833064556122, + "learning_rate": 0.0001431185744550878, + "loss": 1.3421, + "step": 21898 + }, + { + "epoch": 0.2845675172137099, + "grad_norm": 0.46858513355255127, + "learning_rate": 0.00014311597499317642, + "loss": 1.4073, + "step": 21899 + }, + { + "epoch": 0.2845805117576258, + "grad_norm": 0.34121426939964294, + "learning_rate": 0.00014311337553126502, + "loss": 1.2371, + "step": 21900 + }, + { + "epoch": 0.2845935063015416, + "grad_norm": 0.43017086386680603, + "learning_rate": 0.00014311077606935367, + "loss": 1.5585, + "step": 21901 + }, + { + "epoch": 0.2846065008454575, + "grad_norm": 0.3084973692893982, + "learning_rate": 0.00014310817660744227, + "loss": 1.4033, + "step": 21902 + }, + { + "epoch": 0.28461949538937337, + "grad_norm": 0.4116935133934021, + "learning_rate": 0.0001431055771455309, + "loss": 1.4335, + "step": 21903 + }, + { + "epoch": 0.28463248993328927, + "grad_norm": 0.35372108221054077, + "learning_rate": 0.0001431029776836195, + "loss": 1.3184, + "step": 21904 + }, + { + "epoch": 0.2846454844772051, + "grad_norm": 0.38982659578323364, + "learning_rate": 0.0001431003782217081, + "loss": 1.4081, + "step": 21905 + }, + { + "epoch": 0.284658479021121, + "grad_norm": 0.4333013892173767, + "learning_rate": 0.00014309777875979674, + "loss": 1.4882, + "step": 21906 + }, + { + "epoch": 0.28467147356503686, + "grad_norm": 0.4317880868911743, + "learning_rate": 0.00014309517929788533, + "loss": 1.4465, + "step": 21907 + }, + { + "epoch": 0.28468446810895276, + "grad_norm": 0.3072398602962494, + "learning_rate": 0.00014309257983597396, + "loss": 1.4629, + "step": 21908 + }, + { + "epoch": 0.2846974626528686, + "grad_norm": 0.3618871867656708, + "learning_rate": 0.00014308998037406258, + "loss": 1.4559, + "step": 21909 + }, + { + "epoch": 0.2847104571967845, + "grad_norm": 0.4164143204689026, + "learning_rate": 0.00014308738091215118, + "loss": 1.3443, + "step": 21910 + }, + { + "epoch": 0.28472345174070035, + "grad_norm": 0.45673471689224243, + "learning_rate": 0.0001430847814502398, + "loss": 1.4438, + "step": 21911 + }, + { + "epoch": 0.28473644628461625, + "grad_norm": 0.33497557044029236, + "learning_rate": 0.0001430821819883284, + "loss": 1.3079, + "step": 21912 + }, + { + "epoch": 0.2847494408285321, + "grad_norm": 0.42376720905303955, + "learning_rate": 0.00014307958252641705, + "loss": 1.4289, + "step": 21913 + }, + { + "epoch": 0.284762435372448, + "grad_norm": 0.3171910345554352, + "learning_rate": 0.00014307698306450565, + "loss": 1.2059, + "step": 21914 + }, + { + "epoch": 0.28477542991636384, + "grad_norm": 0.4601474106311798, + "learning_rate": 0.00014307438360259428, + "loss": 1.455, + "step": 21915 + }, + { + "epoch": 0.28478842446027974, + "grad_norm": 0.3594374358654022, + "learning_rate": 0.0001430717841406829, + "loss": 1.2365, + "step": 21916 + }, + { + "epoch": 0.2848014190041956, + "grad_norm": 0.4302177131175995, + "learning_rate": 0.0001430691846787715, + "loss": 1.4431, + "step": 21917 + }, + { + "epoch": 0.2848144135481115, + "grad_norm": 0.3090600073337555, + "learning_rate": 0.00014306658521686012, + "loss": 1.2196, + "step": 21918 + }, + { + "epoch": 0.28482740809202733, + "grad_norm": 0.36141103506088257, + "learning_rate": 0.00014306398575494872, + "loss": 1.4712, + "step": 21919 + }, + { + "epoch": 0.28484040263594324, + "grad_norm": 0.5438603162765503, + "learning_rate": 0.00014306138629303737, + "loss": 1.4388, + "step": 21920 + }, + { + "epoch": 0.2848533971798591, + "grad_norm": 0.3602411448955536, + "learning_rate": 0.00014305878683112597, + "loss": 1.5405, + "step": 21921 + }, + { + "epoch": 0.284866391723775, + "grad_norm": 0.34778353571891785, + "learning_rate": 0.00014305618736921457, + "loss": 1.487, + "step": 21922 + }, + { + "epoch": 0.2848793862676908, + "grad_norm": 0.492312490940094, + "learning_rate": 0.0001430535879073032, + "loss": 1.3588, + "step": 21923 + }, + { + "epoch": 0.2848923808116067, + "grad_norm": 0.3657194674015045, + "learning_rate": 0.00014305098844539181, + "loss": 1.256, + "step": 21924 + }, + { + "epoch": 0.2849053753555226, + "grad_norm": 0.4633488357067108, + "learning_rate": 0.00014304838898348044, + "loss": 1.4007, + "step": 21925 + }, + { + "epoch": 0.2849183698994385, + "grad_norm": 0.4802319407463074, + "learning_rate": 0.00014304578952156904, + "loss": 1.4698, + "step": 21926 + }, + { + "epoch": 0.2849313644433544, + "grad_norm": 0.36779287457466125, + "learning_rate": 0.00014304319005965766, + "loss": 1.2843, + "step": 21927 + }, + { + "epoch": 0.2849443589872702, + "grad_norm": 0.38392651081085205, + "learning_rate": 0.00014304059059774629, + "loss": 1.2688, + "step": 21928 + }, + { + "epoch": 0.2849573535311861, + "grad_norm": 0.2964380979537964, + "learning_rate": 0.00014303799113583488, + "loss": 1.1265, + "step": 21929 + }, + { + "epoch": 0.28497034807510196, + "grad_norm": 0.28442779183387756, + "learning_rate": 0.0001430353916739235, + "loss": 1.3179, + "step": 21930 + }, + { + "epoch": 0.28498334261901787, + "grad_norm": 0.3989967405796051, + "learning_rate": 0.0001430327922120121, + "loss": 1.2743, + "step": 21931 + }, + { + "epoch": 0.2849963371629337, + "grad_norm": 0.46536925435066223, + "learning_rate": 0.00014303019275010076, + "loss": 1.8473, + "step": 21932 + }, + { + "epoch": 0.2850093317068496, + "grad_norm": 0.43414443731307983, + "learning_rate": 0.00014302759328818935, + "loss": 1.4503, + "step": 21933 + }, + { + "epoch": 0.28502232625076546, + "grad_norm": 0.35513585805892944, + "learning_rate": 0.00014302499382627795, + "loss": 1.2259, + "step": 21934 + }, + { + "epoch": 0.28503532079468136, + "grad_norm": 0.4347069561481476, + "learning_rate": 0.00014302239436436658, + "loss": 1.4936, + "step": 21935 + }, + { + "epoch": 0.2850483153385972, + "grad_norm": 0.4843854010105133, + "learning_rate": 0.0001430197949024552, + "loss": 1.3628, + "step": 21936 + }, + { + "epoch": 0.2850613098825131, + "grad_norm": 0.4745556712150574, + "learning_rate": 0.00014301719544054382, + "loss": 1.5012, + "step": 21937 + }, + { + "epoch": 0.28507430442642895, + "grad_norm": 0.4343568682670593, + "learning_rate": 0.00014301459597863242, + "loss": 1.4119, + "step": 21938 + }, + { + "epoch": 0.28508729897034485, + "grad_norm": 0.42833149433135986, + "learning_rate": 0.00014301199651672105, + "loss": 1.473, + "step": 21939 + }, + { + "epoch": 0.2851002935142607, + "grad_norm": 0.41668805480003357, + "learning_rate": 0.00014300939705480967, + "loss": 1.4469, + "step": 21940 + }, + { + "epoch": 0.2851132880581766, + "grad_norm": 0.395904004573822, + "learning_rate": 0.00014300679759289827, + "loss": 1.4087, + "step": 21941 + }, + { + "epoch": 0.28512628260209244, + "grad_norm": 0.3692234456539154, + "learning_rate": 0.0001430041981309869, + "loss": 1.2834, + "step": 21942 + }, + { + "epoch": 0.28513927714600834, + "grad_norm": 0.4075605571269989, + "learning_rate": 0.0001430015986690755, + "loss": 1.5301, + "step": 21943 + }, + { + "epoch": 0.2851522716899242, + "grad_norm": 0.3128395974636078, + "learning_rate": 0.00014299899920716414, + "loss": 1.339, + "step": 21944 + }, + { + "epoch": 0.2851652662338401, + "grad_norm": 0.4022481441497803, + "learning_rate": 0.00014299639974525274, + "loss": 1.1985, + "step": 21945 + }, + { + "epoch": 0.28517826077775593, + "grad_norm": 0.4021647274494171, + "learning_rate": 0.00014299380028334136, + "loss": 1.4608, + "step": 21946 + }, + { + "epoch": 0.28519125532167183, + "grad_norm": 0.3396584689617157, + "learning_rate": 0.00014299120082142996, + "loss": 1.3398, + "step": 21947 + }, + { + "epoch": 0.2852042498655877, + "grad_norm": 0.45777085423469543, + "learning_rate": 0.00014298860135951859, + "loss": 1.3457, + "step": 21948 + }, + { + "epoch": 0.2852172444095036, + "grad_norm": 0.44055110216140747, + "learning_rate": 0.0001429860018976072, + "loss": 1.492, + "step": 21949 + }, + { + "epoch": 0.2852302389534194, + "grad_norm": 0.4318700432777405, + "learning_rate": 0.0001429834024356958, + "loss": 1.3895, + "step": 21950 + }, + { + "epoch": 0.2852432334973353, + "grad_norm": 0.3629011809825897, + "learning_rate": 0.00014298080297378443, + "loss": 1.4125, + "step": 21951 + }, + { + "epoch": 0.28525622804125117, + "grad_norm": 0.31618040800094604, + "learning_rate": 0.00014297820351187306, + "loss": 1.4318, + "step": 21952 + }, + { + "epoch": 0.28526922258516707, + "grad_norm": 0.41475939750671387, + "learning_rate": 0.00014297560404996165, + "loss": 1.5863, + "step": 21953 + }, + { + "epoch": 0.2852822171290829, + "grad_norm": 0.44185543060302734, + "learning_rate": 0.00014297300458805028, + "loss": 1.2739, + "step": 21954 + }, + { + "epoch": 0.2852952116729988, + "grad_norm": 0.4192999303340912, + "learning_rate": 0.0001429704051261389, + "loss": 1.4073, + "step": 21955 + }, + { + "epoch": 0.28530820621691466, + "grad_norm": 0.3721044659614563, + "learning_rate": 0.00014296780566422753, + "loss": 1.3611, + "step": 21956 + }, + { + "epoch": 0.28532120076083056, + "grad_norm": 0.38076090812683105, + "learning_rate": 0.00014296520620231612, + "loss": 1.3161, + "step": 21957 + }, + { + "epoch": 0.2853341953047464, + "grad_norm": 0.3386537432670593, + "learning_rate": 0.00014296260674040475, + "loss": 1.341, + "step": 21958 + }, + { + "epoch": 0.2853471898486623, + "grad_norm": 0.34403541684150696, + "learning_rate": 0.00014296000727849337, + "loss": 1.3572, + "step": 21959 + }, + { + "epoch": 0.28536018439257815, + "grad_norm": 0.41266053915023804, + "learning_rate": 0.00014295740781658197, + "loss": 1.5182, + "step": 21960 + }, + { + "epoch": 0.28537317893649405, + "grad_norm": 0.336268812417984, + "learning_rate": 0.0001429548083546706, + "loss": 1.3286, + "step": 21961 + }, + { + "epoch": 0.2853861734804099, + "grad_norm": 0.4718099534511566, + "learning_rate": 0.0001429522088927592, + "loss": 1.3731, + "step": 21962 + }, + { + "epoch": 0.2853991680243258, + "grad_norm": 0.31338202953338623, + "learning_rate": 0.00014294960943084782, + "loss": 1.3117, + "step": 21963 + }, + { + "epoch": 0.28541216256824165, + "grad_norm": 0.34216970205307007, + "learning_rate": 0.00014294700996893644, + "loss": 1.2764, + "step": 21964 + }, + { + "epoch": 0.28542515711215755, + "grad_norm": 0.4323378801345825, + "learning_rate": 0.00014294441050702504, + "loss": 1.4339, + "step": 21965 + }, + { + "epoch": 0.2854381516560734, + "grad_norm": 0.3430556356906891, + "learning_rate": 0.00014294181104511366, + "loss": 1.369, + "step": 21966 + }, + { + "epoch": 0.2854511461999893, + "grad_norm": 0.35220974683761597, + "learning_rate": 0.0001429392115832023, + "loss": 1.3689, + "step": 21967 + }, + { + "epoch": 0.28546414074390514, + "grad_norm": 0.31723353266716003, + "learning_rate": 0.0001429366121212909, + "loss": 1.26, + "step": 21968 + }, + { + "epoch": 0.28547713528782104, + "grad_norm": 0.41802477836608887, + "learning_rate": 0.0001429340126593795, + "loss": 1.6105, + "step": 21969 + }, + { + "epoch": 0.2854901298317369, + "grad_norm": 0.39274874329566956, + "learning_rate": 0.00014293141319746813, + "loss": 1.5149, + "step": 21970 + }, + { + "epoch": 0.2855031243756528, + "grad_norm": 0.34680861234664917, + "learning_rate": 0.00014292881373555676, + "loss": 1.2788, + "step": 21971 + }, + { + "epoch": 0.28551611891956863, + "grad_norm": 0.33555078506469727, + "learning_rate": 0.00014292621427364536, + "loss": 1.2289, + "step": 21972 + }, + { + "epoch": 0.28552911346348453, + "grad_norm": 0.4141026735305786, + "learning_rate": 0.00014292361481173398, + "loss": 1.1982, + "step": 21973 + }, + { + "epoch": 0.2855421080074004, + "grad_norm": 0.3478696942329407, + "learning_rate": 0.00014292101534982258, + "loss": 1.6218, + "step": 21974 + }, + { + "epoch": 0.2855551025513163, + "grad_norm": 0.40909144282341003, + "learning_rate": 0.00014291841588791123, + "loss": 1.4228, + "step": 21975 + }, + { + "epoch": 0.2855680970952321, + "grad_norm": 0.404229998588562, + "learning_rate": 0.00014291581642599983, + "loss": 1.4067, + "step": 21976 + }, + { + "epoch": 0.285581091639148, + "grad_norm": 0.37593090534210205, + "learning_rate": 0.00014291321696408842, + "loss": 1.4202, + "step": 21977 + }, + { + "epoch": 0.28559408618306387, + "grad_norm": 0.3663129210472107, + "learning_rate": 0.00014291061750217705, + "loss": 1.6132, + "step": 21978 + }, + { + "epoch": 0.28560708072697977, + "grad_norm": 0.30304816365242004, + "learning_rate": 0.00014290801804026567, + "loss": 1.2752, + "step": 21979 + }, + { + "epoch": 0.2856200752708956, + "grad_norm": 0.34673941135406494, + "learning_rate": 0.0001429054185783543, + "loss": 1.3966, + "step": 21980 + }, + { + "epoch": 0.2856330698148115, + "grad_norm": 0.3668404519557953, + "learning_rate": 0.0001429028191164429, + "loss": 1.4455, + "step": 21981 + }, + { + "epoch": 0.28564606435872736, + "grad_norm": 0.44638675451278687, + "learning_rate": 0.00014290021965453152, + "loss": 1.2834, + "step": 21982 + }, + { + "epoch": 0.28565905890264326, + "grad_norm": 0.3655369281768799, + "learning_rate": 0.00014289762019262014, + "loss": 1.238, + "step": 21983 + }, + { + "epoch": 0.2856720534465591, + "grad_norm": 0.4036513566970825, + "learning_rate": 0.00014289502073070874, + "loss": 1.3708, + "step": 21984 + }, + { + "epoch": 0.285685047990475, + "grad_norm": 0.408968061208725, + "learning_rate": 0.00014289242126879737, + "loss": 1.4938, + "step": 21985 + }, + { + "epoch": 0.28569804253439085, + "grad_norm": 0.4376639127731323, + "learning_rate": 0.00014288982180688596, + "loss": 1.3572, + "step": 21986 + }, + { + "epoch": 0.28571103707830675, + "grad_norm": 0.3937339186668396, + "learning_rate": 0.00014288722234497461, + "loss": 1.3805, + "step": 21987 + }, + { + "epoch": 0.2857240316222226, + "grad_norm": 0.3632708191871643, + "learning_rate": 0.0001428846228830632, + "loss": 1.3606, + "step": 21988 + }, + { + "epoch": 0.2857370261661385, + "grad_norm": 0.4313856363296509, + "learning_rate": 0.0001428820234211518, + "loss": 1.3387, + "step": 21989 + }, + { + "epoch": 0.28575002071005434, + "grad_norm": 0.33378085494041443, + "learning_rate": 0.00014287942395924046, + "loss": 1.467, + "step": 21990 + }, + { + "epoch": 0.28576301525397024, + "grad_norm": 0.41261959075927734, + "learning_rate": 0.00014287682449732906, + "loss": 1.4259, + "step": 21991 + }, + { + "epoch": 0.2857760097978861, + "grad_norm": 0.4721905589103699, + "learning_rate": 0.00014287422503541768, + "loss": 1.3853, + "step": 21992 + }, + { + "epoch": 0.285789004341802, + "grad_norm": 0.35229530930519104, + "learning_rate": 0.00014287162557350628, + "loss": 1.2444, + "step": 21993 + }, + { + "epoch": 0.28580199888571783, + "grad_norm": 0.5010068416595459, + "learning_rate": 0.0001428690261115949, + "loss": 1.4539, + "step": 21994 + }, + { + "epoch": 0.28581499342963373, + "grad_norm": 0.42573896050453186, + "learning_rate": 0.00014286642664968353, + "loss": 1.4227, + "step": 21995 + }, + { + "epoch": 0.2858279879735496, + "grad_norm": 0.4087473452091217, + "learning_rate": 0.00014286382718777213, + "loss": 1.425, + "step": 21996 + }, + { + "epoch": 0.2858409825174655, + "grad_norm": 0.3637833297252655, + "learning_rate": 0.00014286122772586075, + "loss": 1.2943, + "step": 21997 + }, + { + "epoch": 0.2858539770613813, + "grad_norm": 0.489261656999588, + "learning_rate": 0.00014285862826394938, + "loss": 1.5085, + "step": 21998 + }, + { + "epoch": 0.2858669716052972, + "grad_norm": 0.38434356451034546, + "learning_rate": 0.000142856028802038, + "loss": 1.4885, + "step": 21999 + }, + { + "epoch": 0.28587996614921307, + "grad_norm": 0.4948313534259796, + "learning_rate": 0.0001428534293401266, + "loss": 1.6242, + "step": 22000 + }, + { + "epoch": 0.285892960693129, + "grad_norm": 0.512351930141449, + "learning_rate": 0.00014285082987821522, + "loss": 1.5307, + "step": 22001 + }, + { + "epoch": 0.2859059552370449, + "grad_norm": 0.3727751672267914, + "learning_rate": 0.00014284823041630385, + "loss": 1.5014, + "step": 22002 + }, + { + "epoch": 0.2859189497809607, + "grad_norm": 0.3941037058830261, + "learning_rate": 0.00014284563095439244, + "loss": 1.4953, + "step": 22003 + }, + { + "epoch": 0.2859319443248766, + "grad_norm": 0.4126706123352051, + "learning_rate": 0.00014284303149248107, + "loss": 1.3524, + "step": 22004 + }, + { + "epoch": 0.28594493886879246, + "grad_norm": 0.38840505480766296, + "learning_rate": 0.00014284043203056967, + "loss": 1.3747, + "step": 22005 + }, + { + "epoch": 0.28595793341270836, + "grad_norm": 0.4557211995124817, + "learning_rate": 0.0001428378325686583, + "loss": 1.3452, + "step": 22006 + }, + { + "epoch": 0.2859709279566242, + "grad_norm": 0.41547250747680664, + "learning_rate": 0.00014283523310674691, + "loss": 1.3542, + "step": 22007 + }, + { + "epoch": 0.2859839225005401, + "grad_norm": 0.37983959913253784, + "learning_rate": 0.0001428326336448355, + "loss": 1.2848, + "step": 22008 + }, + { + "epoch": 0.28599691704445596, + "grad_norm": 0.43346232175827026, + "learning_rate": 0.00014283003418292414, + "loss": 1.3939, + "step": 22009 + }, + { + "epoch": 0.28600991158837186, + "grad_norm": 0.5540909171104431, + "learning_rate": 0.00014282743472101276, + "loss": 1.4492, + "step": 22010 + }, + { + "epoch": 0.2860229061322877, + "grad_norm": 0.41637828946113586, + "learning_rate": 0.00014282483525910139, + "loss": 1.6654, + "step": 22011 + }, + { + "epoch": 0.2860359006762036, + "grad_norm": 0.4486018419265747, + "learning_rate": 0.00014282223579718998, + "loss": 1.5164, + "step": 22012 + }, + { + "epoch": 0.28604889522011945, + "grad_norm": 0.35889574885368347, + "learning_rate": 0.0001428196363352786, + "loss": 1.3653, + "step": 22013 + }, + { + "epoch": 0.28606188976403535, + "grad_norm": 0.31671467423439026, + "learning_rate": 0.00014281703687336723, + "loss": 1.3542, + "step": 22014 + }, + { + "epoch": 0.2860748843079512, + "grad_norm": 0.4389597773551941, + "learning_rate": 0.00014281443741145583, + "loss": 1.5172, + "step": 22015 + }, + { + "epoch": 0.2860878788518671, + "grad_norm": 0.3850475251674652, + "learning_rate": 0.00014281183794954445, + "loss": 1.6949, + "step": 22016 + }, + { + "epoch": 0.28610087339578294, + "grad_norm": 0.3959624767303467, + "learning_rate": 0.00014280923848763305, + "loss": 1.5044, + "step": 22017 + }, + { + "epoch": 0.28611386793969884, + "grad_norm": 0.4096791446208954, + "learning_rate": 0.00014280663902572168, + "loss": 1.4464, + "step": 22018 + }, + { + "epoch": 0.2861268624836147, + "grad_norm": 0.42850667238235474, + "learning_rate": 0.0001428040395638103, + "loss": 1.3657, + "step": 22019 + }, + { + "epoch": 0.2861398570275306, + "grad_norm": 0.46171823143959045, + "learning_rate": 0.0001428014401018989, + "loss": 1.3529, + "step": 22020 + }, + { + "epoch": 0.28615285157144643, + "grad_norm": 0.26572272181510925, + "learning_rate": 0.00014279884063998752, + "loss": 1.1545, + "step": 22021 + }, + { + "epoch": 0.28616584611536233, + "grad_norm": 0.36273351311683655, + "learning_rate": 0.00014279624117807615, + "loss": 1.3032, + "step": 22022 + }, + { + "epoch": 0.2861788406592782, + "grad_norm": 0.4640752077102661, + "learning_rate": 0.00014279364171616477, + "loss": 1.4083, + "step": 22023 + }, + { + "epoch": 0.2861918352031941, + "grad_norm": 0.4503646790981293, + "learning_rate": 0.00014279104225425337, + "loss": 1.5933, + "step": 22024 + }, + { + "epoch": 0.2862048297471099, + "grad_norm": 0.3152863681316376, + "learning_rate": 0.000142788442792342, + "loss": 1.4376, + "step": 22025 + }, + { + "epoch": 0.2862178242910258, + "grad_norm": 0.356189489364624, + "learning_rate": 0.00014278584333043062, + "loss": 1.5175, + "step": 22026 + }, + { + "epoch": 0.28623081883494167, + "grad_norm": 0.4671732783317566, + "learning_rate": 0.00014278324386851921, + "loss": 1.5777, + "step": 22027 + }, + { + "epoch": 0.28624381337885757, + "grad_norm": 0.36938387155532837, + "learning_rate": 0.00014278064440660784, + "loss": 1.3272, + "step": 22028 + }, + { + "epoch": 0.2862568079227734, + "grad_norm": 0.44200819730758667, + "learning_rate": 0.00014277804494469646, + "loss": 1.4551, + "step": 22029 + }, + { + "epoch": 0.2862698024666893, + "grad_norm": 0.4520149528980255, + "learning_rate": 0.0001427754454827851, + "loss": 1.4775, + "step": 22030 + }, + { + "epoch": 0.28628279701060516, + "grad_norm": 0.3840465843677521, + "learning_rate": 0.00014277284602087369, + "loss": 1.3288, + "step": 22031 + }, + { + "epoch": 0.28629579155452106, + "grad_norm": 0.4465623199939728, + "learning_rate": 0.00014277024655896228, + "loss": 1.4295, + "step": 22032 + }, + { + "epoch": 0.2863087860984369, + "grad_norm": 0.33145877718925476, + "learning_rate": 0.00014276764709705093, + "loss": 1.5137, + "step": 22033 + }, + { + "epoch": 0.2863217806423528, + "grad_norm": 0.38937899470329285, + "learning_rate": 0.00014276504763513953, + "loss": 1.4834, + "step": 22034 + }, + { + "epoch": 0.28633477518626865, + "grad_norm": 0.328317254781723, + "learning_rate": 0.00014276244817322816, + "loss": 1.6545, + "step": 22035 + }, + { + "epoch": 0.28634776973018455, + "grad_norm": 0.44835636019706726, + "learning_rate": 0.00014275984871131675, + "loss": 1.2589, + "step": 22036 + }, + { + "epoch": 0.2863607642741004, + "grad_norm": 0.3727401793003082, + "learning_rate": 0.00014275724924940538, + "loss": 1.4606, + "step": 22037 + }, + { + "epoch": 0.2863737588180163, + "grad_norm": 0.43079492449760437, + "learning_rate": 0.000142754649787494, + "loss": 1.6143, + "step": 22038 + }, + { + "epoch": 0.28638675336193214, + "grad_norm": 0.35338684916496277, + "learning_rate": 0.0001427520503255826, + "loss": 1.2741, + "step": 22039 + }, + { + "epoch": 0.28639974790584805, + "grad_norm": 0.33504289388656616, + "learning_rate": 0.00014274945086367122, + "loss": 1.1859, + "step": 22040 + }, + { + "epoch": 0.2864127424497639, + "grad_norm": 0.41287708282470703, + "learning_rate": 0.00014274685140175985, + "loss": 1.3995, + "step": 22041 + }, + { + "epoch": 0.2864257369936798, + "grad_norm": 0.3278936743736267, + "learning_rate": 0.00014274425193984847, + "loss": 1.365, + "step": 22042 + }, + { + "epoch": 0.28643873153759564, + "grad_norm": 0.4115864634513855, + "learning_rate": 0.00014274165247793707, + "loss": 1.4876, + "step": 22043 + }, + { + "epoch": 0.28645172608151154, + "grad_norm": 0.3723506033420563, + "learning_rate": 0.00014273905301602567, + "loss": 1.5142, + "step": 22044 + }, + { + "epoch": 0.2864647206254274, + "grad_norm": 0.5202172994613647, + "learning_rate": 0.00014273645355411432, + "loss": 1.5319, + "step": 22045 + }, + { + "epoch": 0.2864777151693433, + "grad_norm": 0.49196693301200867, + "learning_rate": 0.00014273385409220292, + "loss": 1.5103, + "step": 22046 + }, + { + "epoch": 0.28649070971325913, + "grad_norm": 0.4838505983352661, + "learning_rate": 0.00014273125463029154, + "loss": 1.4901, + "step": 22047 + }, + { + "epoch": 0.28650370425717503, + "grad_norm": 0.3634736239910126, + "learning_rate": 0.00014272865516838014, + "loss": 1.4783, + "step": 22048 + }, + { + "epoch": 0.2865166988010909, + "grad_norm": 0.42378202080726624, + "learning_rate": 0.00014272605570646876, + "loss": 1.4223, + "step": 22049 + }, + { + "epoch": 0.2865296933450068, + "grad_norm": 0.4459422528743744, + "learning_rate": 0.0001427234562445574, + "loss": 1.4177, + "step": 22050 + }, + { + "epoch": 0.2865426878889226, + "grad_norm": 0.44379082322120667, + "learning_rate": 0.00014272085678264599, + "loss": 1.4822, + "step": 22051 + }, + { + "epoch": 0.2865556824328385, + "grad_norm": 0.3417443037033081, + "learning_rate": 0.0001427182573207346, + "loss": 1.1639, + "step": 22052 + }, + { + "epoch": 0.28656867697675437, + "grad_norm": 0.35039427876472473, + "learning_rate": 0.00014271565785882323, + "loss": 1.3411, + "step": 22053 + }, + { + "epoch": 0.28658167152067027, + "grad_norm": 0.3391748368740082, + "learning_rate": 0.00014271305839691186, + "loss": 1.2356, + "step": 22054 + }, + { + "epoch": 0.2865946660645861, + "grad_norm": 0.3190998435020447, + "learning_rate": 0.00014271045893500046, + "loss": 1.4417, + "step": 22055 + }, + { + "epoch": 0.286607660608502, + "grad_norm": 0.40838944911956787, + "learning_rate": 0.00014270785947308905, + "loss": 1.2985, + "step": 22056 + }, + { + "epoch": 0.28662065515241786, + "grad_norm": 0.3903173506259918, + "learning_rate": 0.0001427052600111777, + "loss": 1.1493, + "step": 22057 + }, + { + "epoch": 0.28663364969633376, + "grad_norm": 0.3411411941051483, + "learning_rate": 0.0001427026605492663, + "loss": 1.2035, + "step": 22058 + }, + { + "epoch": 0.2866466442402496, + "grad_norm": 0.40367719531059265, + "learning_rate": 0.00014270006108735493, + "loss": 1.3588, + "step": 22059 + }, + { + "epoch": 0.2866596387841655, + "grad_norm": 0.4369624853134155, + "learning_rate": 0.00014269746162544352, + "loss": 1.4349, + "step": 22060 + }, + { + "epoch": 0.28667263332808135, + "grad_norm": 0.40241745114326477, + "learning_rate": 0.00014269486216353215, + "loss": 1.3375, + "step": 22061 + }, + { + "epoch": 0.28668562787199725, + "grad_norm": 0.3780624568462372, + "learning_rate": 0.00014269226270162077, + "loss": 1.4254, + "step": 22062 + }, + { + "epoch": 0.2866986224159131, + "grad_norm": 0.4062449336051941, + "learning_rate": 0.00014268966323970937, + "loss": 1.4223, + "step": 22063 + }, + { + "epoch": 0.286711616959829, + "grad_norm": 0.48995670676231384, + "learning_rate": 0.00014268706377779802, + "loss": 1.4169, + "step": 22064 + }, + { + "epoch": 0.28672461150374484, + "grad_norm": 0.38684117794036865, + "learning_rate": 0.00014268446431588662, + "loss": 1.4493, + "step": 22065 + }, + { + "epoch": 0.28673760604766074, + "grad_norm": 0.43404072523117065, + "learning_rate": 0.00014268186485397524, + "loss": 1.3545, + "step": 22066 + }, + { + "epoch": 0.2867506005915766, + "grad_norm": 0.46732303500175476, + "learning_rate": 0.00014267926539206384, + "loss": 1.5281, + "step": 22067 + }, + { + "epoch": 0.2867635951354925, + "grad_norm": 0.32384055852890015, + "learning_rate": 0.00014267666593015247, + "loss": 1.4781, + "step": 22068 + }, + { + "epoch": 0.28677658967940833, + "grad_norm": 0.43550577759742737, + "learning_rate": 0.0001426740664682411, + "loss": 1.3708, + "step": 22069 + }, + { + "epoch": 0.28678958422332423, + "grad_norm": 0.4591449201107025, + "learning_rate": 0.0001426714670063297, + "loss": 1.514, + "step": 22070 + }, + { + "epoch": 0.2868025787672401, + "grad_norm": 0.43799638748168945, + "learning_rate": 0.0001426688675444183, + "loss": 1.4926, + "step": 22071 + }, + { + "epoch": 0.286815573311156, + "grad_norm": 0.48175352811813354, + "learning_rate": 0.00014266626808250694, + "loss": 1.4686, + "step": 22072 + }, + { + "epoch": 0.2868285678550718, + "grad_norm": 0.43021318316459656, + "learning_rate": 0.00014266366862059553, + "loss": 1.5257, + "step": 22073 + }, + { + "epoch": 0.2868415623989877, + "grad_norm": 0.39312949776649475, + "learning_rate": 0.00014266106915868416, + "loss": 1.5201, + "step": 22074 + }, + { + "epoch": 0.28685455694290357, + "grad_norm": 0.43819543719291687, + "learning_rate": 0.00014265846969677276, + "loss": 1.5464, + "step": 22075 + }, + { + "epoch": 0.28686755148681947, + "grad_norm": 0.43548449873924255, + "learning_rate": 0.0001426558702348614, + "loss": 1.2925, + "step": 22076 + }, + { + "epoch": 0.2868805460307354, + "grad_norm": 0.46215489506721497, + "learning_rate": 0.00014265327077295, + "loss": 1.5256, + "step": 22077 + }, + { + "epoch": 0.2868935405746512, + "grad_norm": 0.35817450284957886, + "learning_rate": 0.00014265067131103863, + "loss": 1.5043, + "step": 22078 + }, + { + "epoch": 0.2869065351185671, + "grad_norm": 0.23793365061283112, + "learning_rate": 0.00014264807184912723, + "loss": 1.4752, + "step": 22079 + }, + { + "epoch": 0.28691952966248296, + "grad_norm": 0.42317914962768555, + "learning_rate": 0.00014264547238721585, + "loss": 1.4091, + "step": 22080 + }, + { + "epoch": 0.28693252420639886, + "grad_norm": 0.45541349053382874, + "learning_rate": 0.00014264287292530448, + "loss": 1.516, + "step": 22081 + }, + { + "epoch": 0.2869455187503147, + "grad_norm": 0.40810176730155945, + "learning_rate": 0.00014264027346339307, + "loss": 1.4239, + "step": 22082 + }, + { + "epoch": 0.2869585132942306, + "grad_norm": 0.37985411286354065, + "learning_rate": 0.0001426376740014817, + "loss": 1.5568, + "step": 22083 + }, + { + "epoch": 0.28697150783814646, + "grad_norm": 0.37615418434143066, + "learning_rate": 0.00014263507453957032, + "loss": 1.508, + "step": 22084 + }, + { + "epoch": 0.28698450238206236, + "grad_norm": 0.4067232012748718, + "learning_rate": 0.00014263247507765895, + "loss": 1.4641, + "step": 22085 + }, + { + "epoch": 0.2869974969259782, + "grad_norm": 0.3505003750324249, + "learning_rate": 0.00014262987561574754, + "loss": 1.3532, + "step": 22086 + }, + { + "epoch": 0.2870104914698941, + "grad_norm": 0.41911816596984863, + "learning_rate": 0.00014262727615383614, + "loss": 1.5465, + "step": 22087 + }, + { + "epoch": 0.28702348601380995, + "grad_norm": 0.4251181185245514, + "learning_rate": 0.0001426246766919248, + "loss": 1.4784, + "step": 22088 + }, + { + "epoch": 0.28703648055772585, + "grad_norm": 0.3102303147315979, + "learning_rate": 0.0001426220772300134, + "loss": 1.3195, + "step": 22089 + }, + { + "epoch": 0.2870494751016417, + "grad_norm": 0.5132701992988586, + "learning_rate": 0.00014261947776810202, + "loss": 1.6155, + "step": 22090 + }, + { + "epoch": 0.2870624696455576, + "grad_norm": 0.45485636591911316, + "learning_rate": 0.0001426168783061906, + "loss": 1.2436, + "step": 22091 + }, + { + "epoch": 0.28707546418947344, + "grad_norm": 0.3383045494556427, + "learning_rate": 0.00014261427884427924, + "loss": 1.3814, + "step": 22092 + }, + { + "epoch": 0.28708845873338934, + "grad_norm": 0.4377923011779785, + "learning_rate": 0.00014261167938236786, + "loss": 1.5571, + "step": 22093 + }, + { + "epoch": 0.2871014532773052, + "grad_norm": 0.30352792143821716, + "learning_rate": 0.00014260907992045646, + "loss": 1.2497, + "step": 22094 + }, + { + "epoch": 0.2871144478212211, + "grad_norm": 0.3492357134819031, + "learning_rate": 0.00014260648045854508, + "loss": 1.3974, + "step": 22095 + }, + { + "epoch": 0.28712744236513693, + "grad_norm": 0.29284554719924927, + "learning_rate": 0.0001426038809966337, + "loss": 1.3344, + "step": 22096 + }, + { + "epoch": 0.28714043690905283, + "grad_norm": 0.5212596654891968, + "learning_rate": 0.00014260128153472233, + "loss": 1.295, + "step": 22097 + }, + { + "epoch": 0.2871534314529687, + "grad_norm": 0.37402665615081787, + "learning_rate": 0.00014259868207281093, + "loss": 1.4609, + "step": 22098 + }, + { + "epoch": 0.2871664259968846, + "grad_norm": 0.40469640493392944, + "learning_rate": 0.00014259608261089953, + "loss": 1.6954, + "step": 22099 + }, + { + "epoch": 0.2871794205408004, + "grad_norm": 0.49761828780174255, + "learning_rate": 0.00014259348314898818, + "loss": 1.3617, + "step": 22100 + }, + { + "epoch": 0.2871924150847163, + "grad_norm": 0.5624523758888245, + "learning_rate": 0.00014259088368707678, + "loss": 1.417, + "step": 22101 + }, + { + "epoch": 0.28720540962863217, + "grad_norm": 0.37021806836128235, + "learning_rate": 0.0001425882842251654, + "loss": 1.4123, + "step": 22102 + }, + { + "epoch": 0.28721840417254807, + "grad_norm": 0.45372435450553894, + "learning_rate": 0.00014258568476325403, + "loss": 1.4109, + "step": 22103 + }, + { + "epoch": 0.2872313987164639, + "grad_norm": 0.47401976585388184, + "learning_rate": 0.00014258308530134262, + "loss": 1.5865, + "step": 22104 + }, + { + "epoch": 0.2872443932603798, + "grad_norm": 0.45055750012397766, + "learning_rate": 0.00014258048583943125, + "loss": 1.4061, + "step": 22105 + }, + { + "epoch": 0.28725738780429566, + "grad_norm": 0.4332391917705536, + "learning_rate": 0.00014257788637751984, + "loss": 1.4039, + "step": 22106 + }, + { + "epoch": 0.28727038234821156, + "grad_norm": 0.36800718307495117, + "learning_rate": 0.0001425752869156085, + "loss": 1.5256, + "step": 22107 + }, + { + "epoch": 0.2872833768921274, + "grad_norm": 0.43751880526542664, + "learning_rate": 0.0001425726874536971, + "loss": 1.6016, + "step": 22108 + }, + { + "epoch": 0.2872963714360433, + "grad_norm": 0.3916257619857788, + "learning_rate": 0.00014257008799178572, + "loss": 1.3052, + "step": 22109 + }, + { + "epoch": 0.28730936597995915, + "grad_norm": 0.4078250229358673, + "learning_rate": 0.00014256748852987432, + "loss": 1.2841, + "step": 22110 + }, + { + "epoch": 0.28732236052387505, + "grad_norm": 0.4308535158634186, + "learning_rate": 0.00014256488906796294, + "loss": 1.403, + "step": 22111 + }, + { + "epoch": 0.2873353550677909, + "grad_norm": 0.4177975058555603, + "learning_rate": 0.00014256228960605156, + "loss": 1.3683, + "step": 22112 + }, + { + "epoch": 0.2873483496117068, + "grad_norm": 0.4185788333415985, + "learning_rate": 0.00014255969014414016, + "loss": 1.274, + "step": 22113 + }, + { + "epoch": 0.28736134415562264, + "grad_norm": 0.43693187832832336, + "learning_rate": 0.00014255709068222879, + "loss": 1.4506, + "step": 22114 + }, + { + "epoch": 0.28737433869953855, + "grad_norm": 0.33323588967323303, + "learning_rate": 0.0001425544912203174, + "loss": 1.517, + "step": 22115 + }, + { + "epoch": 0.2873873332434544, + "grad_norm": 0.4163329005241394, + "learning_rate": 0.000142551891758406, + "loss": 1.2719, + "step": 22116 + }, + { + "epoch": 0.2874003277873703, + "grad_norm": 0.4504295885562897, + "learning_rate": 0.00014254929229649463, + "loss": 1.4478, + "step": 22117 + }, + { + "epoch": 0.28741332233128614, + "grad_norm": 0.38723647594451904, + "learning_rate": 0.00014254669283458323, + "loss": 1.2207, + "step": 22118 + }, + { + "epoch": 0.28742631687520204, + "grad_norm": 0.4415624439716339, + "learning_rate": 0.00014254409337267188, + "loss": 1.496, + "step": 22119 + }, + { + "epoch": 0.2874393114191179, + "grad_norm": 0.26319268345832825, + "learning_rate": 0.00014254149391076048, + "loss": 1.5161, + "step": 22120 + }, + { + "epoch": 0.2874523059630338, + "grad_norm": 0.4632225036621094, + "learning_rate": 0.0001425388944488491, + "loss": 1.6013, + "step": 22121 + }, + { + "epoch": 0.28746530050694963, + "grad_norm": 0.36476877331733704, + "learning_rate": 0.0001425362949869377, + "loss": 1.28, + "step": 22122 + }, + { + "epoch": 0.28747829505086553, + "grad_norm": 0.3627232313156128, + "learning_rate": 0.00014253369552502633, + "loss": 1.4236, + "step": 22123 + }, + { + "epoch": 0.2874912895947814, + "grad_norm": 0.40497633814811707, + "learning_rate": 0.00014253109606311495, + "loss": 1.4835, + "step": 22124 + }, + { + "epoch": 0.2875042841386973, + "grad_norm": 0.38339635729789734, + "learning_rate": 0.00014252849660120355, + "loss": 1.4145, + "step": 22125 + }, + { + "epoch": 0.2875172786826131, + "grad_norm": 0.3669421970844269, + "learning_rate": 0.00014252589713929217, + "loss": 1.4088, + "step": 22126 + }, + { + "epoch": 0.287530273226529, + "grad_norm": 0.35283127427101135, + "learning_rate": 0.0001425232976773808, + "loss": 1.3145, + "step": 22127 + }, + { + "epoch": 0.28754326777044487, + "grad_norm": 0.35127025842666626, + "learning_rate": 0.0001425206982154694, + "loss": 1.4328, + "step": 22128 + }, + { + "epoch": 0.28755626231436077, + "grad_norm": 0.4407002329826355, + "learning_rate": 0.00014251809875355802, + "loss": 1.4376, + "step": 22129 + }, + { + "epoch": 0.2875692568582766, + "grad_norm": 0.3614174723625183, + "learning_rate": 0.00014251549929164662, + "loss": 1.1961, + "step": 22130 + }, + { + "epoch": 0.2875822514021925, + "grad_norm": 0.3617641031742096, + "learning_rate": 0.00014251289982973527, + "loss": 1.3747, + "step": 22131 + }, + { + "epoch": 0.28759524594610836, + "grad_norm": 0.37700214982032776, + "learning_rate": 0.00014251030036782386, + "loss": 1.3639, + "step": 22132 + }, + { + "epoch": 0.28760824049002426, + "grad_norm": 0.37006232142448425, + "learning_rate": 0.0001425077009059125, + "loss": 1.347, + "step": 22133 + }, + { + "epoch": 0.2876212350339401, + "grad_norm": 0.43554365634918213, + "learning_rate": 0.00014250510144400109, + "loss": 1.4454, + "step": 22134 + }, + { + "epoch": 0.287634229577856, + "grad_norm": 0.3919074237346649, + "learning_rate": 0.0001425025019820897, + "loss": 1.4605, + "step": 22135 + }, + { + "epoch": 0.28764722412177185, + "grad_norm": 0.45605579018592834, + "learning_rate": 0.00014249990252017833, + "loss": 1.3907, + "step": 22136 + }, + { + "epoch": 0.28766021866568775, + "grad_norm": 0.3348766565322876, + "learning_rate": 0.00014249730305826693, + "loss": 1.4485, + "step": 22137 + }, + { + "epoch": 0.2876732132096036, + "grad_norm": 0.3504857122898102, + "learning_rate": 0.00014249470359635558, + "loss": 1.5131, + "step": 22138 + }, + { + "epoch": 0.2876862077535195, + "grad_norm": 0.5046715140342712, + "learning_rate": 0.00014249210413444418, + "loss": 1.5235, + "step": 22139 + }, + { + "epoch": 0.28769920229743534, + "grad_norm": 0.3486250340938568, + "learning_rate": 0.00014248950467253278, + "loss": 1.3206, + "step": 22140 + }, + { + "epoch": 0.28771219684135124, + "grad_norm": 0.35158732533454895, + "learning_rate": 0.0001424869052106214, + "loss": 1.4862, + "step": 22141 + }, + { + "epoch": 0.2877251913852671, + "grad_norm": 0.43166446685791016, + "learning_rate": 0.00014248430574871003, + "loss": 1.3788, + "step": 22142 + }, + { + "epoch": 0.287738185929183, + "grad_norm": 0.3212583363056183, + "learning_rate": 0.00014248170628679865, + "loss": 1.4891, + "step": 22143 + }, + { + "epoch": 0.28775118047309883, + "grad_norm": 0.4297787547111511, + "learning_rate": 0.00014247910682488725, + "loss": 1.4005, + "step": 22144 + }, + { + "epoch": 0.28776417501701473, + "grad_norm": 0.4180818498134613, + "learning_rate": 0.00014247650736297587, + "loss": 1.6154, + "step": 22145 + }, + { + "epoch": 0.2877771695609306, + "grad_norm": 0.473725825548172, + "learning_rate": 0.0001424739079010645, + "loss": 1.419, + "step": 22146 + }, + { + "epoch": 0.2877901641048465, + "grad_norm": 0.42257606983184814, + "learning_rate": 0.0001424713084391531, + "loss": 1.4625, + "step": 22147 + }, + { + "epoch": 0.2878031586487623, + "grad_norm": 0.3577720820903778, + "learning_rate": 0.00014246870897724172, + "loss": 1.5566, + "step": 22148 + }, + { + "epoch": 0.2878161531926782, + "grad_norm": 0.3756045997142792, + "learning_rate": 0.00014246610951533032, + "loss": 1.4821, + "step": 22149 + }, + { + "epoch": 0.28782914773659407, + "grad_norm": 0.42378586530685425, + "learning_rate": 0.00014246351005341897, + "loss": 1.5888, + "step": 22150 + }, + { + "epoch": 0.28784214228050997, + "grad_norm": 0.39652448892593384, + "learning_rate": 0.00014246091059150757, + "loss": 1.5133, + "step": 22151 + }, + { + "epoch": 0.2878551368244258, + "grad_norm": 0.41539499163627625, + "learning_rate": 0.0001424583111295962, + "loss": 1.2951, + "step": 22152 + }, + { + "epoch": 0.2878681313683417, + "grad_norm": 0.43573832511901855, + "learning_rate": 0.0001424557116676848, + "loss": 1.4112, + "step": 22153 + }, + { + "epoch": 0.2878811259122576, + "grad_norm": 0.46801653504371643, + "learning_rate": 0.0001424531122057734, + "loss": 1.446, + "step": 22154 + }, + { + "epoch": 0.28789412045617346, + "grad_norm": 0.4033289849758148, + "learning_rate": 0.00014245051274386204, + "loss": 1.3774, + "step": 22155 + }, + { + "epoch": 0.28790711500008936, + "grad_norm": 0.3874218165874481, + "learning_rate": 0.00014244791328195063, + "loss": 1.4161, + "step": 22156 + }, + { + "epoch": 0.2879201095440052, + "grad_norm": 0.42272356152534485, + "learning_rate": 0.00014244531382003926, + "loss": 1.4871, + "step": 22157 + }, + { + "epoch": 0.2879331040879211, + "grad_norm": 0.3884390592575073, + "learning_rate": 0.00014244271435812788, + "loss": 1.5558, + "step": 22158 + }, + { + "epoch": 0.28794609863183696, + "grad_norm": 0.29191964864730835, + "learning_rate": 0.00014244011489621648, + "loss": 1.5111, + "step": 22159 + }, + { + "epoch": 0.28795909317575286, + "grad_norm": 0.39024919271469116, + "learning_rate": 0.0001424375154343051, + "loss": 1.2363, + "step": 22160 + }, + { + "epoch": 0.2879720877196687, + "grad_norm": 0.3433208465576172, + "learning_rate": 0.0001424349159723937, + "loss": 1.4404, + "step": 22161 + }, + { + "epoch": 0.2879850822635846, + "grad_norm": 0.4786560535430908, + "learning_rate": 0.00014243231651048235, + "loss": 1.4269, + "step": 22162 + }, + { + "epoch": 0.28799807680750045, + "grad_norm": 0.41161152720451355, + "learning_rate": 0.00014242971704857095, + "loss": 1.3291, + "step": 22163 + }, + { + "epoch": 0.28801107135141635, + "grad_norm": 0.40924957394599915, + "learning_rate": 0.00014242711758665958, + "loss": 1.3783, + "step": 22164 + }, + { + "epoch": 0.2880240658953322, + "grad_norm": 0.44676119089126587, + "learning_rate": 0.00014242451812474817, + "loss": 1.4976, + "step": 22165 + }, + { + "epoch": 0.2880370604392481, + "grad_norm": 0.3842381536960602, + "learning_rate": 0.0001424219186628368, + "loss": 1.3076, + "step": 22166 + }, + { + "epoch": 0.28805005498316394, + "grad_norm": 0.397760272026062, + "learning_rate": 0.00014241931920092542, + "loss": 1.3117, + "step": 22167 + }, + { + "epoch": 0.28806304952707984, + "grad_norm": 0.46874600648880005, + "learning_rate": 0.00014241671973901402, + "loss": 1.5023, + "step": 22168 + }, + { + "epoch": 0.2880760440709957, + "grad_norm": 0.3981539011001587, + "learning_rate": 0.00014241412027710264, + "loss": 1.4564, + "step": 22169 + }, + { + "epoch": 0.2880890386149116, + "grad_norm": 0.3509082496166229, + "learning_rate": 0.00014241152081519127, + "loss": 1.4025, + "step": 22170 + }, + { + "epoch": 0.28810203315882743, + "grad_norm": 0.3865593373775482, + "learning_rate": 0.00014240892135327987, + "loss": 1.3551, + "step": 22171 + }, + { + "epoch": 0.28811502770274333, + "grad_norm": 0.7735856771469116, + "learning_rate": 0.0001424063218913685, + "loss": 1.337, + "step": 22172 + }, + { + "epoch": 0.2881280222466592, + "grad_norm": 0.32929879426956177, + "learning_rate": 0.0001424037224294571, + "loss": 1.3845, + "step": 22173 + }, + { + "epoch": 0.2881410167905751, + "grad_norm": 0.3991185128688812, + "learning_rate": 0.00014240112296754574, + "loss": 1.2685, + "step": 22174 + }, + { + "epoch": 0.2881540113344909, + "grad_norm": 0.4119705557823181, + "learning_rate": 0.00014239852350563434, + "loss": 1.272, + "step": 22175 + }, + { + "epoch": 0.2881670058784068, + "grad_norm": 0.2189141809940338, + "learning_rate": 0.00014239592404372296, + "loss": 1.2547, + "step": 22176 + }, + { + "epoch": 0.28818000042232267, + "grad_norm": 0.38488346338272095, + "learning_rate": 0.0001423933245818116, + "loss": 1.3921, + "step": 22177 + }, + { + "epoch": 0.28819299496623857, + "grad_norm": 0.41958320140838623, + "learning_rate": 0.00014239072511990018, + "loss": 1.5372, + "step": 22178 + }, + { + "epoch": 0.2882059895101544, + "grad_norm": 0.4101869761943817, + "learning_rate": 0.0001423881256579888, + "loss": 1.36, + "step": 22179 + }, + { + "epoch": 0.2882189840540703, + "grad_norm": 0.45164361596107483, + "learning_rate": 0.0001423855261960774, + "loss": 1.4036, + "step": 22180 + }, + { + "epoch": 0.28823197859798616, + "grad_norm": 0.4409153461456299, + "learning_rate": 0.00014238292673416606, + "loss": 1.5524, + "step": 22181 + }, + { + "epoch": 0.28824497314190206, + "grad_norm": 0.3638860881328583, + "learning_rate": 0.00014238032727225465, + "loss": 1.5387, + "step": 22182 + }, + { + "epoch": 0.2882579676858179, + "grad_norm": 0.5224159359931946, + "learning_rate": 0.00014237772781034325, + "loss": 1.4571, + "step": 22183 + }, + { + "epoch": 0.2882709622297338, + "grad_norm": 0.437107115983963, + "learning_rate": 0.00014237512834843188, + "loss": 1.5554, + "step": 22184 + }, + { + "epoch": 0.28828395677364965, + "grad_norm": 0.4087488055229187, + "learning_rate": 0.0001423725288865205, + "loss": 1.447, + "step": 22185 + }, + { + "epoch": 0.28829695131756555, + "grad_norm": 0.39433395862579346, + "learning_rate": 0.00014236992942460913, + "loss": 1.5002, + "step": 22186 + }, + { + "epoch": 0.2883099458614814, + "grad_norm": 0.26936182379722595, + "learning_rate": 0.00014236732996269772, + "loss": 1.6188, + "step": 22187 + }, + { + "epoch": 0.2883229404053973, + "grad_norm": 0.3768483102321625, + "learning_rate": 0.00014236473050078635, + "loss": 1.418, + "step": 22188 + }, + { + "epoch": 0.28833593494931314, + "grad_norm": 0.4075142443180084, + "learning_rate": 0.00014236213103887497, + "loss": 1.3955, + "step": 22189 + }, + { + "epoch": 0.28834892949322904, + "grad_norm": 0.3793126046657562, + "learning_rate": 0.00014235953157696357, + "loss": 1.1325, + "step": 22190 + }, + { + "epoch": 0.2883619240371449, + "grad_norm": 0.3578402101993561, + "learning_rate": 0.0001423569321150522, + "loss": 1.2844, + "step": 22191 + }, + { + "epoch": 0.2883749185810608, + "grad_norm": 0.3542401194572449, + "learning_rate": 0.0001423543326531408, + "loss": 1.4327, + "step": 22192 + }, + { + "epoch": 0.28838791312497664, + "grad_norm": 0.3764530420303345, + "learning_rate": 0.00014235173319122944, + "loss": 1.2426, + "step": 22193 + }, + { + "epoch": 0.28840090766889254, + "grad_norm": 0.37350791692733765, + "learning_rate": 0.00014234913372931804, + "loss": 1.2829, + "step": 22194 + }, + { + "epoch": 0.2884139022128084, + "grad_norm": 0.38806912302970886, + "learning_rate": 0.00014234653426740664, + "loss": 1.4265, + "step": 22195 + }, + { + "epoch": 0.2884268967567243, + "grad_norm": 0.39289748668670654, + "learning_rate": 0.00014234393480549526, + "loss": 1.4502, + "step": 22196 + }, + { + "epoch": 0.2884398913006401, + "grad_norm": 0.47973915934562683, + "learning_rate": 0.00014234133534358389, + "loss": 1.3023, + "step": 22197 + }, + { + "epoch": 0.28845288584455603, + "grad_norm": 0.3785458505153656, + "learning_rate": 0.0001423387358816725, + "loss": 1.378, + "step": 22198 + }, + { + "epoch": 0.2884658803884719, + "grad_norm": 0.4501010775566101, + "learning_rate": 0.0001423361364197611, + "loss": 1.5021, + "step": 22199 + }, + { + "epoch": 0.2884788749323878, + "grad_norm": 0.5293017029762268, + "learning_rate": 0.00014233353695784973, + "loss": 1.4096, + "step": 22200 + }, + { + "epoch": 0.2884918694763036, + "grad_norm": 0.4222677946090698, + "learning_rate": 0.00014233093749593836, + "loss": 1.4493, + "step": 22201 + }, + { + "epoch": 0.2885048640202195, + "grad_norm": 0.4977155923843384, + "learning_rate": 0.00014232833803402695, + "loss": 1.4248, + "step": 22202 + }, + { + "epoch": 0.28851785856413537, + "grad_norm": 0.344005823135376, + "learning_rate": 0.00014232573857211558, + "loss": 1.2453, + "step": 22203 + }, + { + "epoch": 0.28853085310805127, + "grad_norm": 0.4305126965045929, + "learning_rate": 0.00014232313911020418, + "loss": 1.5422, + "step": 22204 + }, + { + "epoch": 0.2885438476519671, + "grad_norm": 0.45520490407943726, + "learning_rate": 0.00014232053964829283, + "loss": 1.4611, + "step": 22205 + }, + { + "epoch": 0.288556842195883, + "grad_norm": 0.43232953548431396, + "learning_rate": 0.00014231794018638143, + "loss": 1.5281, + "step": 22206 + }, + { + "epoch": 0.28856983673979886, + "grad_norm": 0.36836835741996765, + "learning_rate": 0.00014231534072447005, + "loss": 1.4601, + "step": 22207 + }, + { + "epoch": 0.28858283128371476, + "grad_norm": 0.46151643991470337, + "learning_rate": 0.00014231274126255865, + "loss": 1.3012, + "step": 22208 + }, + { + "epoch": 0.2885958258276306, + "grad_norm": 0.3296111524105072, + "learning_rate": 0.00014231014180064727, + "loss": 1.1617, + "step": 22209 + }, + { + "epoch": 0.2886088203715465, + "grad_norm": 0.4365094304084778, + "learning_rate": 0.0001423075423387359, + "loss": 1.4654, + "step": 22210 + }, + { + "epoch": 0.28862181491546235, + "grad_norm": 0.4050542116165161, + "learning_rate": 0.0001423049428768245, + "loss": 1.4388, + "step": 22211 + }, + { + "epoch": 0.28863480945937825, + "grad_norm": 0.37126821279525757, + "learning_rate": 0.00014230234341491312, + "loss": 1.3354, + "step": 22212 + }, + { + "epoch": 0.2886478040032941, + "grad_norm": 0.29640763998031616, + "learning_rate": 0.00014229974395300174, + "loss": 1.344, + "step": 22213 + }, + { + "epoch": 0.28866079854721, + "grad_norm": 0.3536563813686371, + "learning_rate": 0.00014229714449109034, + "loss": 1.4147, + "step": 22214 + }, + { + "epoch": 0.28867379309112584, + "grad_norm": 0.3737310469150543, + "learning_rate": 0.00014229454502917896, + "loss": 1.476, + "step": 22215 + }, + { + "epoch": 0.28868678763504174, + "grad_norm": 0.515973687171936, + "learning_rate": 0.0001422919455672676, + "loss": 1.4826, + "step": 22216 + }, + { + "epoch": 0.2886997821789576, + "grad_norm": 0.3244648277759552, + "learning_rate": 0.0001422893461053562, + "loss": 1.3116, + "step": 22217 + }, + { + "epoch": 0.2887127767228735, + "grad_norm": 0.3121008276939392, + "learning_rate": 0.0001422867466434448, + "loss": 1.3423, + "step": 22218 + }, + { + "epoch": 0.28872577126678933, + "grad_norm": 0.3961296081542969, + "learning_rate": 0.00014228414718153344, + "loss": 1.4464, + "step": 22219 + }, + { + "epoch": 0.28873876581070523, + "grad_norm": 0.41946539282798767, + "learning_rate": 0.00014228154771962206, + "loss": 1.3208, + "step": 22220 + }, + { + "epoch": 0.2887517603546211, + "grad_norm": 0.3853977918624878, + "learning_rate": 0.00014227894825771066, + "loss": 1.2638, + "step": 22221 + }, + { + "epoch": 0.288764754898537, + "grad_norm": 0.4243699014186859, + "learning_rate": 0.00014227634879579928, + "loss": 1.2983, + "step": 22222 + }, + { + "epoch": 0.2887777494424528, + "grad_norm": 0.32959455251693726, + "learning_rate": 0.00014227374933388788, + "loss": 1.2961, + "step": 22223 + }, + { + "epoch": 0.2887907439863687, + "grad_norm": 0.44616636633872986, + "learning_rate": 0.0001422711498719765, + "loss": 1.3889, + "step": 22224 + }, + { + "epoch": 0.28880373853028457, + "grad_norm": 0.38505083322525024, + "learning_rate": 0.00014226855041006513, + "loss": 1.4369, + "step": 22225 + }, + { + "epoch": 0.28881673307420047, + "grad_norm": 0.44189339876174927, + "learning_rate": 0.00014226595094815373, + "loss": 1.4329, + "step": 22226 + }, + { + "epoch": 0.2888297276181163, + "grad_norm": 0.515619695186615, + "learning_rate": 0.00014226335148624235, + "loss": 1.5656, + "step": 22227 + }, + { + "epoch": 0.2888427221620322, + "grad_norm": 0.48045679926872253, + "learning_rate": 0.00014226075202433097, + "loss": 1.4018, + "step": 22228 + }, + { + "epoch": 0.28885571670594806, + "grad_norm": 0.5401654243469238, + "learning_rate": 0.0001422581525624196, + "loss": 1.6124, + "step": 22229 + }, + { + "epoch": 0.28886871124986396, + "grad_norm": 0.29047438502311707, + "learning_rate": 0.0001422555531005082, + "loss": 1.5682, + "step": 22230 + }, + { + "epoch": 0.28888170579377986, + "grad_norm": 0.42067721486091614, + "learning_rate": 0.00014225295363859682, + "loss": 1.3397, + "step": 22231 + }, + { + "epoch": 0.2888947003376957, + "grad_norm": 0.43651214241981506, + "learning_rate": 0.00014225035417668545, + "loss": 1.5441, + "step": 22232 + }, + { + "epoch": 0.2889076948816116, + "grad_norm": 0.45472288131713867, + "learning_rate": 0.00014224775471477404, + "loss": 1.3925, + "step": 22233 + }, + { + "epoch": 0.28892068942552745, + "grad_norm": 0.5030890703201294, + "learning_rate": 0.00014224515525286267, + "loss": 1.6147, + "step": 22234 + }, + { + "epoch": 0.28893368396944336, + "grad_norm": 0.32014161348342896, + "learning_rate": 0.00014224255579095126, + "loss": 1.2553, + "step": 22235 + }, + { + "epoch": 0.2889466785133592, + "grad_norm": 0.40538862347602844, + "learning_rate": 0.00014223995632903992, + "loss": 1.4536, + "step": 22236 + }, + { + "epoch": 0.2889596730572751, + "grad_norm": 0.4062918424606323, + "learning_rate": 0.0001422373568671285, + "loss": 1.5067, + "step": 22237 + }, + { + "epoch": 0.28897266760119095, + "grad_norm": 0.4442928731441498, + "learning_rate": 0.0001422347574052171, + "loss": 1.5302, + "step": 22238 + }, + { + "epoch": 0.28898566214510685, + "grad_norm": 0.5418941378593445, + "learning_rate": 0.00014223215794330574, + "loss": 1.3773, + "step": 22239 + }, + { + "epoch": 0.2889986566890227, + "grad_norm": 0.4543876051902771, + "learning_rate": 0.00014222955848139436, + "loss": 1.4275, + "step": 22240 + }, + { + "epoch": 0.2890116512329386, + "grad_norm": 0.3564375042915344, + "learning_rate": 0.00014222695901948298, + "loss": 1.3042, + "step": 22241 + }, + { + "epoch": 0.28902464577685444, + "grad_norm": 0.3918377161026001, + "learning_rate": 0.00014222435955757158, + "loss": 1.4558, + "step": 22242 + }, + { + "epoch": 0.28903764032077034, + "grad_norm": 0.419392466545105, + "learning_rate": 0.0001422217600956602, + "loss": 1.4747, + "step": 22243 + }, + { + "epoch": 0.2890506348646862, + "grad_norm": 0.4483223855495453, + "learning_rate": 0.00014221916063374883, + "loss": 1.6658, + "step": 22244 + }, + { + "epoch": 0.2890636294086021, + "grad_norm": 0.3335586488246918, + "learning_rate": 0.00014221656117183743, + "loss": 1.4099, + "step": 22245 + }, + { + "epoch": 0.28907662395251793, + "grad_norm": 0.3058409094810486, + "learning_rate": 0.00014221396170992605, + "loss": 1.1076, + "step": 22246 + }, + { + "epoch": 0.28908961849643383, + "grad_norm": 0.4099843204021454, + "learning_rate": 0.00014221136224801465, + "loss": 1.3358, + "step": 22247 + }, + { + "epoch": 0.2891026130403497, + "grad_norm": 0.4615851938724518, + "learning_rate": 0.0001422087627861033, + "loss": 1.4019, + "step": 22248 + }, + { + "epoch": 0.2891156075842656, + "grad_norm": 0.4688842296600342, + "learning_rate": 0.0001422061633241919, + "loss": 1.4556, + "step": 22249 + }, + { + "epoch": 0.2891286021281814, + "grad_norm": 0.4001471698284149, + "learning_rate": 0.0001422035638622805, + "loss": 1.394, + "step": 22250 + }, + { + "epoch": 0.2891415966720973, + "grad_norm": 0.4646671712398529, + "learning_rate": 0.00014220096440036915, + "loss": 1.3567, + "step": 22251 + }, + { + "epoch": 0.28915459121601317, + "grad_norm": 0.37723442912101746, + "learning_rate": 0.00014219836493845775, + "loss": 1.2708, + "step": 22252 + }, + { + "epoch": 0.28916758575992907, + "grad_norm": 0.35815441608428955, + "learning_rate": 0.00014219576547654637, + "loss": 1.4626, + "step": 22253 + }, + { + "epoch": 0.2891805803038449, + "grad_norm": 0.3989640176296234, + "learning_rate": 0.00014219316601463497, + "loss": 1.3824, + "step": 22254 + }, + { + "epoch": 0.2891935748477608, + "grad_norm": 0.4239923655986786, + "learning_rate": 0.0001421905665527236, + "loss": 1.403, + "step": 22255 + }, + { + "epoch": 0.28920656939167666, + "grad_norm": 0.42054277658462524, + "learning_rate": 0.00014218796709081222, + "loss": 1.6153, + "step": 22256 + }, + { + "epoch": 0.28921956393559256, + "grad_norm": 0.32436731457710266, + "learning_rate": 0.0001421853676289008, + "loss": 1.387, + "step": 22257 + }, + { + "epoch": 0.2892325584795084, + "grad_norm": 0.446170836687088, + "learning_rate": 0.00014218276816698944, + "loss": 1.3896, + "step": 22258 + }, + { + "epoch": 0.2892455530234243, + "grad_norm": 0.4039406478404999, + "learning_rate": 0.00014218016870507806, + "loss": 1.4743, + "step": 22259 + }, + { + "epoch": 0.28925854756734015, + "grad_norm": 0.42505866289138794, + "learning_rate": 0.0001421775692431667, + "loss": 1.5496, + "step": 22260 + }, + { + "epoch": 0.28927154211125605, + "grad_norm": 0.32116004824638367, + "learning_rate": 0.00014217496978125528, + "loss": 1.6292, + "step": 22261 + }, + { + "epoch": 0.2892845366551719, + "grad_norm": 0.44610050320625305, + "learning_rate": 0.00014217237031934388, + "loss": 1.3935, + "step": 22262 + }, + { + "epoch": 0.2892975311990878, + "grad_norm": 0.4346407651901245, + "learning_rate": 0.00014216977085743253, + "loss": 1.3005, + "step": 22263 + }, + { + "epoch": 0.28931052574300364, + "grad_norm": 0.4863552749156952, + "learning_rate": 0.00014216717139552113, + "loss": 1.342, + "step": 22264 + }, + { + "epoch": 0.28932352028691954, + "grad_norm": 0.36966603994369507, + "learning_rate": 0.00014216457193360976, + "loss": 1.3606, + "step": 22265 + }, + { + "epoch": 0.2893365148308354, + "grad_norm": 0.4484783709049225, + "learning_rate": 0.00014216197247169835, + "loss": 1.6543, + "step": 22266 + }, + { + "epoch": 0.2893495093747513, + "grad_norm": 0.4564906060695648, + "learning_rate": 0.00014215937300978698, + "loss": 1.3202, + "step": 22267 + }, + { + "epoch": 0.28936250391866714, + "grad_norm": 0.4410177767276764, + "learning_rate": 0.0001421567735478756, + "loss": 1.5001, + "step": 22268 + }, + { + "epoch": 0.28937549846258304, + "grad_norm": 0.3964931070804596, + "learning_rate": 0.0001421541740859642, + "loss": 1.3534, + "step": 22269 + }, + { + "epoch": 0.2893884930064989, + "grad_norm": 0.32138121128082275, + "learning_rate": 0.00014215157462405282, + "loss": 1.2751, + "step": 22270 + }, + { + "epoch": 0.2894014875504148, + "grad_norm": 0.312162846326828, + "learning_rate": 0.00014214897516214145, + "loss": 1.3733, + "step": 22271 + }, + { + "epoch": 0.2894144820943306, + "grad_norm": 0.3795371949672699, + "learning_rate": 0.00014214637570023007, + "loss": 1.5391, + "step": 22272 + }, + { + "epoch": 0.28942747663824653, + "grad_norm": 0.4658038020133972, + "learning_rate": 0.00014214377623831867, + "loss": 1.5096, + "step": 22273 + }, + { + "epoch": 0.2894404711821624, + "grad_norm": 0.4609101414680481, + "learning_rate": 0.0001421411767764073, + "loss": 1.4755, + "step": 22274 + }, + { + "epoch": 0.2894534657260783, + "grad_norm": 0.39089611172676086, + "learning_rate": 0.00014213857731449592, + "loss": 1.4187, + "step": 22275 + }, + { + "epoch": 0.2894664602699941, + "grad_norm": 0.3116099238395691, + "learning_rate": 0.00014213597785258452, + "loss": 1.5299, + "step": 22276 + }, + { + "epoch": 0.28947945481391, + "grad_norm": 0.34981757402420044, + "learning_rate": 0.00014213337839067314, + "loss": 1.461, + "step": 22277 + }, + { + "epoch": 0.28949244935782587, + "grad_norm": 0.30512478947639465, + "learning_rate": 0.00014213077892876174, + "loss": 1.4557, + "step": 22278 + }, + { + "epoch": 0.28950544390174177, + "grad_norm": 0.3608631193637848, + "learning_rate": 0.00014212817946685036, + "loss": 1.2629, + "step": 22279 + }, + { + "epoch": 0.2895184384456576, + "grad_norm": 0.37413182854652405, + "learning_rate": 0.000142125580004939, + "loss": 1.5088, + "step": 22280 + }, + { + "epoch": 0.2895314329895735, + "grad_norm": 0.4897593855857849, + "learning_rate": 0.00014212298054302758, + "loss": 1.3233, + "step": 22281 + }, + { + "epoch": 0.28954442753348936, + "grad_norm": 0.4357750117778778, + "learning_rate": 0.0001421203810811162, + "loss": 1.4592, + "step": 22282 + }, + { + "epoch": 0.28955742207740526, + "grad_norm": 0.4132704734802246, + "learning_rate": 0.00014211778161920483, + "loss": 1.5849, + "step": 22283 + }, + { + "epoch": 0.2895704166213211, + "grad_norm": 0.44093960523605347, + "learning_rate": 0.00014211518215729346, + "loss": 1.3972, + "step": 22284 + }, + { + "epoch": 0.289583411165237, + "grad_norm": 0.42616814374923706, + "learning_rate": 0.00014211258269538205, + "loss": 1.4716, + "step": 22285 + }, + { + "epoch": 0.28959640570915285, + "grad_norm": 0.25706642866134644, + "learning_rate": 0.00014210998323347068, + "loss": 1.3038, + "step": 22286 + }, + { + "epoch": 0.28960940025306875, + "grad_norm": 0.38505569100379944, + "learning_rate": 0.0001421073837715593, + "loss": 1.7213, + "step": 22287 + }, + { + "epoch": 0.2896223947969846, + "grad_norm": 0.36318185925483704, + "learning_rate": 0.0001421047843096479, + "loss": 1.4037, + "step": 22288 + }, + { + "epoch": 0.2896353893409005, + "grad_norm": 0.4311409294605255, + "learning_rate": 0.00014210218484773653, + "loss": 1.4954, + "step": 22289 + }, + { + "epoch": 0.28964838388481634, + "grad_norm": 0.47099241614341736, + "learning_rate": 0.00014209958538582515, + "loss": 1.419, + "step": 22290 + }, + { + "epoch": 0.28966137842873224, + "grad_norm": 0.3859619200229645, + "learning_rate": 0.00014209698592391377, + "loss": 1.4748, + "step": 22291 + }, + { + "epoch": 0.2896743729726481, + "grad_norm": 0.3615507483482361, + "learning_rate": 0.00014209438646200237, + "loss": 1.4257, + "step": 22292 + }, + { + "epoch": 0.289687367516564, + "grad_norm": 0.3239363729953766, + "learning_rate": 0.00014209178700009097, + "loss": 1.3568, + "step": 22293 + }, + { + "epoch": 0.28970036206047983, + "grad_norm": 0.43754154443740845, + "learning_rate": 0.00014208918753817962, + "loss": 1.4336, + "step": 22294 + }, + { + "epoch": 0.28971335660439573, + "grad_norm": 0.41938117146492004, + "learning_rate": 0.00014208658807626822, + "loss": 1.4998, + "step": 22295 + }, + { + "epoch": 0.2897263511483116, + "grad_norm": 0.3349857032299042, + "learning_rate": 0.00014208398861435684, + "loss": 1.284, + "step": 22296 + }, + { + "epoch": 0.2897393456922275, + "grad_norm": 0.42732977867126465, + "learning_rate": 0.00014208138915244544, + "loss": 1.4753, + "step": 22297 + }, + { + "epoch": 0.2897523402361433, + "grad_norm": 0.3861484229564667, + "learning_rate": 0.00014207878969053406, + "loss": 1.5407, + "step": 22298 + }, + { + "epoch": 0.2897653347800592, + "grad_norm": 0.33476418256759644, + "learning_rate": 0.0001420761902286227, + "loss": 1.3591, + "step": 22299 + }, + { + "epoch": 0.28977832932397507, + "grad_norm": 0.7007179260253906, + "learning_rate": 0.0001420735907667113, + "loss": 1.6933, + "step": 22300 + }, + { + "epoch": 0.28979132386789097, + "grad_norm": 0.5477127432823181, + "learning_rate": 0.0001420709913047999, + "loss": 1.6322, + "step": 22301 + }, + { + "epoch": 0.2898043184118068, + "grad_norm": 0.38956218957901, + "learning_rate": 0.00014206839184288854, + "loss": 1.3232, + "step": 22302 + }, + { + "epoch": 0.2898173129557227, + "grad_norm": 0.42501935362815857, + "learning_rate": 0.00014206579238097716, + "loss": 1.6154, + "step": 22303 + }, + { + "epoch": 0.28983030749963856, + "grad_norm": 0.35152241587638855, + "learning_rate": 0.00014206319291906576, + "loss": 1.4486, + "step": 22304 + }, + { + "epoch": 0.28984330204355446, + "grad_norm": 0.33880022168159485, + "learning_rate": 0.00014206059345715435, + "loss": 1.2453, + "step": 22305 + }, + { + "epoch": 0.28985629658747036, + "grad_norm": 0.5051475763320923, + "learning_rate": 0.000142057993995243, + "loss": 1.2604, + "step": 22306 + }, + { + "epoch": 0.2898692911313862, + "grad_norm": 0.4556049108505249, + "learning_rate": 0.0001420553945333316, + "loss": 1.4191, + "step": 22307 + }, + { + "epoch": 0.2898822856753021, + "grad_norm": 0.3710220158100128, + "learning_rate": 0.00014205279507142023, + "loss": 1.5994, + "step": 22308 + }, + { + "epoch": 0.28989528021921795, + "grad_norm": 0.39599597454071045, + "learning_rate": 0.00014205019560950883, + "loss": 1.6606, + "step": 22309 + }, + { + "epoch": 0.28990827476313386, + "grad_norm": 0.3968614637851715, + "learning_rate": 0.00014204759614759745, + "loss": 1.2073, + "step": 22310 + }, + { + "epoch": 0.2899212693070497, + "grad_norm": 0.4041449725627899, + "learning_rate": 0.00014204499668568607, + "loss": 1.5553, + "step": 22311 + }, + { + "epoch": 0.2899342638509656, + "grad_norm": 0.3937256634235382, + "learning_rate": 0.00014204239722377467, + "loss": 1.37, + "step": 22312 + }, + { + "epoch": 0.28994725839488145, + "grad_norm": 0.40615394711494446, + "learning_rate": 0.0001420397977618633, + "loss": 1.5182, + "step": 22313 + }, + { + "epoch": 0.28996025293879735, + "grad_norm": 0.3599732220172882, + "learning_rate": 0.00014203719829995192, + "loss": 1.4869, + "step": 22314 + }, + { + "epoch": 0.2899732474827132, + "grad_norm": 0.3950166702270508, + "learning_rate": 0.00014203459883804055, + "loss": 1.23, + "step": 22315 + }, + { + "epoch": 0.2899862420266291, + "grad_norm": 0.4353402554988861, + "learning_rate": 0.00014203199937612914, + "loss": 1.5926, + "step": 22316 + }, + { + "epoch": 0.28999923657054494, + "grad_norm": 0.37088438868522644, + "learning_rate": 0.00014202939991421774, + "loss": 1.3402, + "step": 22317 + }, + { + "epoch": 0.29001223111446084, + "grad_norm": 0.47764694690704346, + "learning_rate": 0.0001420268004523064, + "loss": 1.3889, + "step": 22318 + }, + { + "epoch": 0.2900252256583767, + "grad_norm": 0.3792191743850708, + "learning_rate": 0.000142024200990395, + "loss": 1.3261, + "step": 22319 + }, + { + "epoch": 0.2900382202022926, + "grad_norm": 0.41932064294815063, + "learning_rate": 0.00014202160152848361, + "loss": 1.4122, + "step": 22320 + }, + { + "epoch": 0.29005121474620843, + "grad_norm": 0.43686172366142273, + "learning_rate": 0.0001420190020665722, + "loss": 1.5433, + "step": 22321 + }, + { + "epoch": 0.29006420929012433, + "grad_norm": 0.2098013162612915, + "learning_rate": 0.00014201640260466084, + "loss": 0.9717, + "step": 22322 + }, + { + "epoch": 0.2900772038340402, + "grad_norm": 0.3696129024028778, + "learning_rate": 0.00014201380314274946, + "loss": 1.3771, + "step": 22323 + }, + { + "epoch": 0.2900901983779561, + "grad_norm": 0.43324437737464905, + "learning_rate": 0.00014201120368083806, + "loss": 1.4719, + "step": 22324 + }, + { + "epoch": 0.2901031929218719, + "grad_norm": 0.4683477282524109, + "learning_rate": 0.0001420086042189267, + "loss": 1.5837, + "step": 22325 + }, + { + "epoch": 0.2901161874657878, + "grad_norm": 0.4767380654811859, + "learning_rate": 0.0001420060047570153, + "loss": 1.4855, + "step": 22326 + }, + { + "epoch": 0.29012918200970367, + "grad_norm": 0.3886423408985138, + "learning_rate": 0.00014200340529510393, + "loss": 1.2985, + "step": 22327 + }, + { + "epoch": 0.29014217655361957, + "grad_norm": 0.3977794051170349, + "learning_rate": 0.00014200080583319253, + "loss": 1.4754, + "step": 22328 + }, + { + "epoch": 0.2901551710975354, + "grad_norm": 0.25363174080848694, + "learning_rate": 0.00014199820637128115, + "loss": 1.2019, + "step": 22329 + }, + { + "epoch": 0.2901681656414513, + "grad_norm": 0.3842352330684662, + "learning_rate": 0.00014199560690936978, + "loss": 1.545, + "step": 22330 + }, + { + "epoch": 0.29018116018536716, + "grad_norm": 0.3418513834476471, + "learning_rate": 0.00014199300744745837, + "loss": 1.4383, + "step": 22331 + }, + { + "epoch": 0.29019415472928306, + "grad_norm": 0.33965203166007996, + "learning_rate": 0.000141990407985547, + "loss": 1.4286, + "step": 22332 + }, + { + "epoch": 0.2902071492731989, + "grad_norm": 0.43528953194618225, + "learning_rate": 0.00014198780852363562, + "loss": 1.3904, + "step": 22333 + }, + { + "epoch": 0.2902201438171148, + "grad_norm": 0.46454524993896484, + "learning_rate": 0.00014198520906172422, + "loss": 1.2898, + "step": 22334 + }, + { + "epoch": 0.29023313836103065, + "grad_norm": 0.4105285704135895, + "learning_rate": 0.00014198260959981285, + "loss": 1.4581, + "step": 22335 + }, + { + "epoch": 0.29024613290494655, + "grad_norm": 0.3978787660598755, + "learning_rate": 0.00014198001013790144, + "loss": 1.4891, + "step": 22336 + }, + { + "epoch": 0.2902591274488624, + "grad_norm": 0.35916993021965027, + "learning_rate": 0.0001419774106759901, + "loss": 1.2862, + "step": 22337 + }, + { + "epoch": 0.2902721219927783, + "grad_norm": 0.39996638894081116, + "learning_rate": 0.0001419748112140787, + "loss": 1.3422, + "step": 22338 + }, + { + "epoch": 0.29028511653669414, + "grad_norm": 0.3090335726737976, + "learning_rate": 0.00014197221175216732, + "loss": 1.14, + "step": 22339 + }, + { + "epoch": 0.29029811108061004, + "grad_norm": 0.506252646446228, + "learning_rate": 0.00014196961229025591, + "loss": 1.506, + "step": 22340 + }, + { + "epoch": 0.2903111056245259, + "grad_norm": 0.42328906059265137, + "learning_rate": 0.00014196701282834454, + "loss": 1.4337, + "step": 22341 + }, + { + "epoch": 0.2903241001684418, + "grad_norm": 0.4637744128704071, + "learning_rate": 0.00014196441336643316, + "loss": 1.3951, + "step": 22342 + }, + { + "epoch": 0.29033709471235764, + "grad_norm": 0.3766438663005829, + "learning_rate": 0.00014196181390452176, + "loss": 1.5259, + "step": 22343 + }, + { + "epoch": 0.29035008925627354, + "grad_norm": 0.3739299476146698, + "learning_rate": 0.00014195921444261038, + "loss": 1.4014, + "step": 22344 + }, + { + "epoch": 0.2903630838001894, + "grad_norm": 0.34221434593200684, + "learning_rate": 0.000141956614980699, + "loss": 1.3113, + "step": 22345 + }, + { + "epoch": 0.2903760783441053, + "grad_norm": 0.4266754388809204, + "learning_rate": 0.0001419540155187876, + "loss": 1.3435, + "step": 22346 + }, + { + "epoch": 0.2903890728880211, + "grad_norm": 0.4251234233379364, + "learning_rate": 0.00014195141605687623, + "loss": 1.3403, + "step": 22347 + }, + { + "epoch": 0.290402067431937, + "grad_norm": 0.3070501387119293, + "learning_rate": 0.00014194881659496483, + "loss": 1.2095, + "step": 22348 + }, + { + "epoch": 0.2904150619758529, + "grad_norm": 0.3991011381149292, + "learning_rate": 0.00014194621713305348, + "loss": 1.5122, + "step": 22349 + }, + { + "epoch": 0.2904280565197688, + "grad_norm": 0.4335017204284668, + "learning_rate": 0.00014194361767114208, + "loss": 1.3991, + "step": 22350 + }, + { + "epoch": 0.2904410510636846, + "grad_norm": 0.28476572036743164, + "learning_rate": 0.0001419410182092307, + "loss": 1.4286, + "step": 22351 + }, + { + "epoch": 0.2904540456076005, + "grad_norm": 0.3671368658542633, + "learning_rate": 0.0001419384187473193, + "loss": 1.5065, + "step": 22352 + }, + { + "epoch": 0.29046704015151636, + "grad_norm": 0.3427566885948181, + "learning_rate": 0.00014193581928540792, + "loss": 1.3048, + "step": 22353 + }, + { + "epoch": 0.29048003469543227, + "grad_norm": 0.34179338812828064, + "learning_rate": 0.00014193321982349655, + "loss": 1.1822, + "step": 22354 + }, + { + "epoch": 0.2904930292393481, + "grad_norm": 0.3781779408454895, + "learning_rate": 0.00014193062036158515, + "loss": 1.3153, + "step": 22355 + }, + { + "epoch": 0.290506023783264, + "grad_norm": 0.39216864109039307, + "learning_rate": 0.00014192802089967377, + "loss": 1.2711, + "step": 22356 + }, + { + "epoch": 0.29051901832717986, + "grad_norm": 0.4567340612411499, + "learning_rate": 0.0001419254214377624, + "loss": 1.6723, + "step": 22357 + }, + { + "epoch": 0.29053201287109576, + "grad_norm": 0.4034908711910248, + "learning_rate": 0.00014192282197585102, + "loss": 1.5544, + "step": 22358 + }, + { + "epoch": 0.2905450074150116, + "grad_norm": 0.4637525677680969, + "learning_rate": 0.00014192022251393962, + "loss": 1.2819, + "step": 22359 + }, + { + "epoch": 0.2905580019589275, + "grad_norm": 0.4224262833595276, + "learning_rate": 0.00014191762305202824, + "loss": 1.3303, + "step": 22360 + }, + { + "epoch": 0.29057099650284335, + "grad_norm": 0.4207250773906708, + "learning_rate": 0.00014191502359011687, + "loss": 1.5319, + "step": 22361 + }, + { + "epoch": 0.29058399104675925, + "grad_norm": 0.37056565284729004, + "learning_rate": 0.00014191242412820546, + "loss": 1.4878, + "step": 22362 + }, + { + "epoch": 0.2905969855906751, + "grad_norm": 0.37557870149612427, + "learning_rate": 0.0001419098246662941, + "loss": 1.3815, + "step": 22363 + }, + { + "epoch": 0.290609980134591, + "grad_norm": 0.4046560227870941, + "learning_rate": 0.0001419072252043827, + "loss": 1.3142, + "step": 22364 + }, + { + "epoch": 0.29062297467850684, + "grad_norm": 0.3729954957962036, + "learning_rate": 0.0001419046257424713, + "loss": 1.265, + "step": 22365 + }, + { + "epoch": 0.29063596922242274, + "grad_norm": 0.310863196849823, + "learning_rate": 0.00014190202628055993, + "loss": 1.342, + "step": 22366 + }, + { + "epoch": 0.2906489637663386, + "grad_norm": 0.4439898431301117, + "learning_rate": 0.00014189942681864853, + "loss": 1.4744, + "step": 22367 + }, + { + "epoch": 0.2906619583102545, + "grad_norm": 0.5251782536506653, + "learning_rate": 0.00014189682735673718, + "loss": 1.4744, + "step": 22368 + }, + { + "epoch": 0.29067495285417033, + "grad_norm": 0.381529837846756, + "learning_rate": 0.00014189422789482578, + "loss": 1.29, + "step": 22369 + }, + { + "epoch": 0.29068794739808623, + "grad_norm": 0.4463101327419281, + "learning_rate": 0.0001418916284329144, + "loss": 1.4288, + "step": 22370 + }, + { + "epoch": 0.2907009419420021, + "grad_norm": 0.43048325181007385, + "learning_rate": 0.000141889028971003, + "loss": 1.3991, + "step": 22371 + }, + { + "epoch": 0.290713936485918, + "grad_norm": 0.32835185527801514, + "learning_rate": 0.00014188642950909163, + "loss": 1.2291, + "step": 22372 + }, + { + "epoch": 0.2907269310298338, + "grad_norm": 0.3897741734981537, + "learning_rate": 0.00014188383004718025, + "loss": 1.6042, + "step": 22373 + }, + { + "epoch": 0.2907399255737497, + "grad_norm": 0.46307021379470825, + "learning_rate": 0.00014188123058526885, + "loss": 1.5311, + "step": 22374 + }, + { + "epoch": 0.29075292011766557, + "grad_norm": 0.43575552105903625, + "learning_rate": 0.00014187863112335747, + "loss": 1.1857, + "step": 22375 + }, + { + "epoch": 0.29076591466158147, + "grad_norm": 0.5436030030250549, + "learning_rate": 0.0001418760316614461, + "loss": 1.3523, + "step": 22376 + }, + { + "epoch": 0.2907789092054973, + "grad_norm": 0.35704275965690613, + "learning_rate": 0.0001418734321995347, + "loss": 1.438, + "step": 22377 + }, + { + "epoch": 0.2907919037494132, + "grad_norm": 0.5289881229400635, + "learning_rate": 0.00014187083273762332, + "loss": 1.5381, + "step": 22378 + }, + { + "epoch": 0.29080489829332906, + "grad_norm": 0.41413334012031555, + "learning_rate": 0.00014186823327571192, + "loss": 1.3201, + "step": 22379 + }, + { + "epoch": 0.29081789283724496, + "grad_norm": 0.38084661960601807, + "learning_rate": 0.00014186563381380057, + "loss": 1.4525, + "step": 22380 + }, + { + "epoch": 0.2908308873811608, + "grad_norm": 0.4375143349170685, + "learning_rate": 0.00014186303435188917, + "loss": 1.4496, + "step": 22381 + }, + { + "epoch": 0.2908438819250767, + "grad_norm": 0.37477463483810425, + "learning_rate": 0.0001418604348899778, + "loss": 1.437, + "step": 22382 + }, + { + "epoch": 0.2908568764689926, + "grad_norm": 0.38776907324790955, + "learning_rate": 0.0001418578354280664, + "loss": 1.4108, + "step": 22383 + }, + { + "epoch": 0.29086987101290845, + "grad_norm": 0.3997366726398468, + "learning_rate": 0.000141855235966155, + "loss": 1.496, + "step": 22384 + }, + { + "epoch": 0.29088286555682435, + "grad_norm": 0.410274863243103, + "learning_rate": 0.00014185263650424364, + "loss": 1.3605, + "step": 22385 + }, + { + "epoch": 0.2908958601007402, + "grad_norm": 0.451054185628891, + "learning_rate": 0.00014185003704233223, + "loss": 1.4451, + "step": 22386 + }, + { + "epoch": 0.2909088546446561, + "grad_norm": 0.4546957314014435, + "learning_rate": 0.00014184743758042086, + "loss": 1.4672, + "step": 22387 + }, + { + "epoch": 0.29092184918857195, + "grad_norm": 0.38290470838546753, + "learning_rate": 0.00014184483811850948, + "loss": 1.3172, + "step": 22388 + }, + { + "epoch": 0.29093484373248785, + "grad_norm": 0.2988804280757904, + "learning_rate": 0.00014184223865659808, + "loss": 1.3318, + "step": 22389 + }, + { + "epoch": 0.2909478382764037, + "grad_norm": 0.32097315788269043, + "learning_rate": 0.0001418396391946867, + "loss": 1.4046, + "step": 22390 + }, + { + "epoch": 0.2909608328203196, + "grad_norm": 0.32626211643218994, + "learning_rate": 0.0001418370397327753, + "loss": 1.355, + "step": 22391 + }, + { + "epoch": 0.29097382736423544, + "grad_norm": 0.3629591166973114, + "learning_rate": 0.00014183444027086395, + "loss": 1.2518, + "step": 22392 + }, + { + "epoch": 0.29098682190815134, + "grad_norm": 0.3561261296272278, + "learning_rate": 0.00014183184080895255, + "loss": 1.3262, + "step": 22393 + }, + { + "epoch": 0.2909998164520672, + "grad_norm": 0.44344791769981384, + "learning_rate": 0.00014182924134704118, + "loss": 1.36, + "step": 22394 + }, + { + "epoch": 0.2910128109959831, + "grad_norm": 0.3925451636314392, + "learning_rate": 0.00014182664188512977, + "loss": 1.4019, + "step": 22395 + }, + { + "epoch": 0.29102580553989893, + "grad_norm": 0.41915786266326904, + "learning_rate": 0.0001418240424232184, + "loss": 1.3088, + "step": 22396 + }, + { + "epoch": 0.29103880008381483, + "grad_norm": 0.3616812527179718, + "learning_rate": 0.00014182144296130702, + "loss": 1.3475, + "step": 22397 + }, + { + "epoch": 0.2910517946277307, + "grad_norm": 0.34788963198661804, + "learning_rate": 0.00014181884349939562, + "loss": 1.2185, + "step": 22398 + }, + { + "epoch": 0.2910647891716466, + "grad_norm": 0.4292566478252411, + "learning_rate": 0.00014181624403748427, + "loss": 1.3199, + "step": 22399 + }, + { + "epoch": 0.2910777837155624, + "grad_norm": 0.4887067973613739, + "learning_rate": 0.00014181364457557287, + "loss": 1.5501, + "step": 22400 + }, + { + "epoch": 0.2910907782594783, + "grad_norm": 0.39569053053855896, + "learning_rate": 0.00014181104511366147, + "loss": 1.4242, + "step": 22401 + }, + { + "epoch": 0.29110377280339417, + "grad_norm": 0.34402787685394287, + "learning_rate": 0.0001418084456517501, + "loss": 1.2173, + "step": 22402 + }, + { + "epoch": 0.29111676734731007, + "grad_norm": 0.45720356702804565, + "learning_rate": 0.00014180584618983871, + "loss": 1.4169, + "step": 22403 + }, + { + "epoch": 0.2911297618912259, + "grad_norm": 0.37784141302108765, + "learning_rate": 0.00014180324672792734, + "loss": 1.3462, + "step": 22404 + }, + { + "epoch": 0.2911427564351418, + "grad_norm": 0.41201251745224, + "learning_rate": 0.00014180064726601594, + "loss": 1.5398, + "step": 22405 + }, + { + "epoch": 0.29115575097905766, + "grad_norm": 0.4607985317707062, + "learning_rate": 0.00014179804780410456, + "loss": 1.4393, + "step": 22406 + }, + { + "epoch": 0.29116874552297356, + "grad_norm": 0.39322248101234436, + "learning_rate": 0.00014179544834219318, + "loss": 1.1679, + "step": 22407 + }, + { + "epoch": 0.2911817400668894, + "grad_norm": 0.4623670279979706, + "learning_rate": 0.00014179284888028178, + "loss": 1.5951, + "step": 22408 + }, + { + "epoch": 0.2911947346108053, + "grad_norm": 0.37860068678855896, + "learning_rate": 0.0001417902494183704, + "loss": 1.4761, + "step": 22409 + }, + { + "epoch": 0.29120772915472115, + "grad_norm": 0.34126970171928406, + "learning_rate": 0.000141787649956459, + "loss": 1.4322, + "step": 22410 + }, + { + "epoch": 0.29122072369863705, + "grad_norm": 0.258602499961853, + "learning_rate": 0.00014178505049454766, + "loss": 1.3408, + "step": 22411 + }, + { + "epoch": 0.2912337182425529, + "grad_norm": 0.41152387857437134, + "learning_rate": 0.00014178245103263625, + "loss": 1.273, + "step": 22412 + }, + { + "epoch": 0.2912467127864688, + "grad_norm": 0.441094309091568, + "learning_rate": 0.00014177985157072488, + "loss": 1.5128, + "step": 22413 + }, + { + "epoch": 0.29125970733038464, + "grad_norm": 0.40566110610961914, + "learning_rate": 0.00014177725210881348, + "loss": 1.3256, + "step": 22414 + }, + { + "epoch": 0.29127270187430054, + "grad_norm": 0.4134718179702759, + "learning_rate": 0.0001417746526469021, + "loss": 1.3458, + "step": 22415 + }, + { + "epoch": 0.2912856964182164, + "grad_norm": 0.46284419298171997, + "learning_rate": 0.00014177205318499072, + "loss": 1.4527, + "step": 22416 + }, + { + "epoch": 0.2912986909621323, + "grad_norm": 0.4089805781841278, + "learning_rate": 0.00014176945372307932, + "loss": 1.4001, + "step": 22417 + }, + { + "epoch": 0.29131168550604813, + "grad_norm": 0.5223478674888611, + "learning_rate": 0.00014176685426116795, + "loss": 1.4858, + "step": 22418 + }, + { + "epoch": 0.29132468004996404, + "grad_norm": 0.41372770071029663, + "learning_rate": 0.00014176425479925657, + "loss": 1.3085, + "step": 22419 + }, + { + "epoch": 0.2913376745938799, + "grad_norm": 0.3995048701763153, + "learning_rate": 0.00014176165533734517, + "loss": 1.4552, + "step": 22420 + }, + { + "epoch": 0.2913506691377958, + "grad_norm": 0.40415453910827637, + "learning_rate": 0.0001417590558754338, + "loss": 1.3393, + "step": 22421 + }, + { + "epoch": 0.2913636636817116, + "grad_norm": 0.3481190800666809, + "learning_rate": 0.0001417564564135224, + "loss": 1.2948, + "step": 22422 + }, + { + "epoch": 0.2913766582256275, + "grad_norm": 0.39264971017837524, + "learning_rate": 0.00014175385695161104, + "loss": 1.307, + "step": 22423 + }, + { + "epoch": 0.2913896527695434, + "grad_norm": 0.36539313197135925, + "learning_rate": 0.00014175125748969964, + "loss": 1.4567, + "step": 22424 + }, + { + "epoch": 0.2914026473134593, + "grad_norm": 0.4535652697086334, + "learning_rate": 0.00014174865802778826, + "loss": 1.3782, + "step": 22425 + }, + { + "epoch": 0.2914156418573751, + "grad_norm": 0.37952205538749695, + "learning_rate": 0.00014174605856587686, + "loss": 1.3139, + "step": 22426 + }, + { + "epoch": 0.291428636401291, + "grad_norm": 0.38149702548980713, + "learning_rate": 0.00014174345910396548, + "loss": 1.4591, + "step": 22427 + }, + { + "epoch": 0.29144163094520686, + "grad_norm": 0.45030805468559265, + "learning_rate": 0.0001417408596420541, + "loss": 1.5303, + "step": 22428 + }, + { + "epoch": 0.29145462548912277, + "grad_norm": 0.281517893075943, + "learning_rate": 0.0001417382601801427, + "loss": 1.3519, + "step": 22429 + }, + { + "epoch": 0.2914676200330386, + "grad_norm": 0.45179814100265503, + "learning_rate": 0.00014173566071823133, + "loss": 1.307, + "step": 22430 + }, + { + "epoch": 0.2914806145769545, + "grad_norm": 0.356875479221344, + "learning_rate": 0.00014173306125631996, + "loss": 1.317, + "step": 22431 + }, + { + "epoch": 0.29149360912087036, + "grad_norm": 0.36342817544937134, + "learning_rate": 0.00014173046179440855, + "loss": 1.3144, + "step": 22432 + }, + { + "epoch": 0.29150660366478626, + "grad_norm": 0.3269090950489044, + "learning_rate": 0.00014172786233249718, + "loss": 1.4073, + "step": 22433 + }, + { + "epoch": 0.2915195982087021, + "grad_norm": 0.3639696538448334, + "learning_rate": 0.0001417252628705858, + "loss": 1.3686, + "step": 22434 + }, + { + "epoch": 0.291532592752618, + "grad_norm": 0.46873170137405396, + "learning_rate": 0.00014172266340867443, + "loss": 1.3213, + "step": 22435 + }, + { + "epoch": 0.29154558729653385, + "grad_norm": 0.30299144983291626, + "learning_rate": 0.00014172006394676302, + "loss": 1.2294, + "step": 22436 + }, + { + "epoch": 0.29155858184044975, + "grad_norm": 0.46435976028442383, + "learning_rate": 0.00014171746448485165, + "loss": 1.4906, + "step": 22437 + }, + { + "epoch": 0.2915715763843656, + "grad_norm": 0.4323849380016327, + "learning_rate": 0.00014171486502294027, + "loss": 1.4484, + "step": 22438 + }, + { + "epoch": 0.2915845709282815, + "grad_norm": 0.43398523330688477, + "learning_rate": 0.00014171226556102887, + "loss": 1.5972, + "step": 22439 + }, + { + "epoch": 0.29159756547219734, + "grad_norm": 0.3140985667705536, + "learning_rate": 0.0001417096660991175, + "loss": 1.179, + "step": 22440 + }, + { + "epoch": 0.29161056001611324, + "grad_norm": 0.4335821568965912, + "learning_rate": 0.0001417070666372061, + "loss": 1.4986, + "step": 22441 + }, + { + "epoch": 0.2916235545600291, + "grad_norm": 0.40765589475631714, + "learning_rate": 0.00014170446717529474, + "loss": 1.2942, + "step": 22442 + }, + { + "epoch": 0.291636549103945, + "grad_norm": 0.37462612986564636, + "learning_rate": 0.00014170186771338334, + "loss": 1.5981, + "step": 22443 + }, + { + "epoch": 0.29164954364786083, + "grad_norm": 0.4370235800743103, + "learning_rate": 0.00014169926825147194, + "loss": 1.5524, + "step": 22444 + }, + { + "epoch": 0.29166253819177673, + "grad_norm": 0.34201183915138245, + "learning_rate": 0.00014169666878956056, + "loss": 1.2176, + "step": 22445 + }, + { + "epoch": 0.2916755327356926, + "grad_norm": 0.3491097092628479, + "learning_rate": 0.0001416940693276492, + "loss": 1.4063, + "step": 22446 + }, + { + "epoch": 0.2916885272796085, + "grad_norm": 0.4681399166584015, + "learning_rate": 0.0001416914698657378, + "loss": 1.4079, + "step": 22447 + }, + { + "epoch": 0.2917015218235243, + "grad_norm": 0.3690841794013977, + "learning_rate": 0.0001416888704038264, + "loss": 1.2544, + "step": 22448 + }, + { + "epoch": 0.2917145163674402, + "grad_norm": 0.5175653100013733, + "learning_rate": 0.00014168627094191503, + "loss": 1.3572, + "step": 22449 + }, + { + "epoch": 0.29172751091135607, + "grad_norm": 0.36344748735427856, + "learning_rate": 0.00014168367148000366, + "loss": 1.3683, + "step": 22450 + }, + { + "epoch": 0.29174050545527197, + "grad_norm": 0.31031015515327454, + "learning_rate": 0.00014168107201809226, + "loss": 1.2095, + "step": 22451 + }, + { + "epoch": 0.2917534999991878, + "grad_norm": 0.34097951650619507, + "learning_rate": 0.00014167847255618088, + "loss": 1.3518, + "step": 22452 + }, + { + "epoch": 0.2917664945431037, + "grad_norm": 0.4360902011394501, + "learning_rate": 0.00014167587309426948, + "loss": 1.4026, + "step": 22453 + }, + { + "epoch": 0.29177948908701956, + "grad_norm": 0.36822423338890076, + "learning_rate": 0.00014167327363235813, + "loss": 1.2999, + "step": 22454 + }, + { + "epoch": 0.29179248363093546, + "grad_norm": 0.40952008962631226, + "learning_rate": 0.00014167067417044673, + "loss": 1.372, + "step": 22455 + }, + { + "epoch": 0.2918054781748513, + "grad_norm": 0.4163241386413574, + "learning_rate": 0.00014166807470853532, + "loss": 1.4147, + "step": 22456 + }, + { + "epoch": 0.2918184727187672, + "grad_norm": 0.40895166993141174, + "learning_rate": 0.00014166547524662395, + "loss": 1.4159, + "step": 22457 + }, + { + "epoch": 0.2918314672626831, + "grad_norm": 0.35260632634162903, + "learning_rate": 0.00014166287578471257, + "loss": 1.235, + "step": 22458 + }, + { + "epoch": 0.29184446180659895, + "grad_norm": 0.34396541118621826, + "learning_rate": 0.0001416602763228012, + "loss": 1.4638, + "step": 22459 + }, + { + "epoch": 0.29185745635051485, + "grad_norm": 0.39983272552490234, + "learning_rate": 0.0001416576768608898, + "loss": 1.2562, + "step": 22460 + }, + { + "epoch": 0.2918704508944307, + "grad_norm": 0.33869585394859314, + "learning_rate": 0.00014165507739897842, + "loss": 1.3032, + "step": 22461 + }, + { + "epoch": 0.2918834454383466, + "grad_norm": 0.39762187004089355, + "learning_rate": 0.00014165247793706704, + "loss": 1.3093, + "step": 22462 + }, + { + "epoch": 0.29189643998226245, + "grad_norm": 0.38377419114112854, + "learning_rate": 0.00014164987847515564, + "loss": 1.5378, + "step": 22463 + }, + { + "epoch": 0.29190943452617835, + "grad_norm": 0.43484777212142944, + "learning_rate": 0.00014164727901324427, + "loss": 1.4626, + "step": 22464 + }, + { + "epoch": 0.2919224290700942, + "grad_norm": 0.28662002086639404, + "learning_rate": 0.00014164467955133286, + "loss": 1.3209, + "step": 22465 + }, + { + "epoch": 0.2919354236140101, + "grad_norm": 0.4069984555244446, + "learning_rate": 0.00014164208008942151, + "loss": 1.4037, + "step": 22466 + }, + { + "epoch": 0.29194841815792594, + "grad_norm": 0.36249905824661255, + "learning_rate": 0.0001416394806275101, + "loss": 1.2412, + "step": 22467 + }, + { + "epoch": 0.29196141270184184, + "grad_norm": 0.42795199155807495, + "learning_rate": 0.0001416368811655987, + "loss": 1.4898, + "step": 22468 + }, + { + "epoch": 0.2919744072457577, + "grad_norm": 0.40475648641586304, + "learning_rate": 0.00014163428170368733, + "loss": 1.5042, + "step": 22469 + }, + { + "epoch": 0.2919874017896736, + "grad_norm": 0.573387622833252, + "learning_rate": 0.00014163168224177596, + "loss": 1.4308, + "step": 22470 + }, + { + "epoch": 0.29200039633358943, + "grad_norm": 0.4268193244934082, + "learning_rate": 0.00014162908277986458, + "loss": 1.2659, + "step": 22471 + }, + { + "epoch": 0.29201339087750533, + "grad_norm": 0.4079134464263916, + "learning_rate": 0.00014162648331795318, + "loss": 1.4543, + "step": 22472 + }, + { + "epoch": 0.2920263854214212, + "grad_norm": 0.34781429171562195, + "learning_rate": 0.0001416238838560418, + "loss": 1.3429, + "step": 22473 + }, + { + "epoch": 0.2920393799653371, + "grad_norm": 0.3656001091003418, + "learning_rate": 0.00014162128439413043, + "loss": 1.2805, + "step": 22474 + }, + { + "epoch": 0.2920523745092529, + "grad_norm": 0.4839644432067871, + "learning_rate": 0.00014161868493221903, + "loss": 1.555, + "step": 22475 + }, + { + "epoch": 0.2920653690531688, + "grad_norm": 0.43905937671661377, + "learning_rate": 0.00014161608547030765, + "loss": 1.5976, + "step": 22476 + }, + { + "epoch": 0.29207836359708467, + "grad_norm": 0.4452652037143707, + "learning_rate": 0.00014161348600839628, + "loss": 1.2814, + "step": 22477 + }, + { + "epoch": 0.29209135814100057, + "grad_norm": 0.41431668400764465, + "learning_rate": 0.0001416108865464849, + "loss": 1.4741, + "step": 22478 + }, + { + "epoch": 0.2921043526849164, + "grad_norm": 0.3766244649887085, + "learning_rate": 0.0001416082870845735, + "loss": 1.2444, + "step": 22479 + }, + { + "epoch": 0.2921173472288323, + "grad_norm": 0.49738484621047974, + "learning_rate": 0.00014160568762266212, + "loss": 1.5616, + "step": 22480 + }, + { + "epoch": 0.29213034177274816, + "grad_norm": 0.4044188857078552, + "learning_rate": 0.00014160308816075075, + "loss": 1.5336, + "step": 22481 + }, + { + "epoch": 0.29214333631666406, + "grad_norm": 0.41952499747276306, + "learning_rate": 0.00014160048869883934, + "loss": 1.6241, + "step": 22482 + }, + { + "epoch": 0.2921563308605799, + "grad_norm": 0.3903580605983734, + "learning_rate": 0.00014159788923692797, + "loss": 1.3418, + "step": 22483 + }, + { + "epoch": 0.2921693254044958, + "grad_norm": 0.4092293083667755, + "learning_rate": 0.00014159528977501657, + "loss": 1.2682, + "step": 22484 + }, + { + "epoch": 0.29218231994841165, + "grad_norm": 0.34794676303863525, + "learning_rate": 0.0001415926903131052, + "loss": 1.5098, + "step": 22485 + }, + { + "epoch": 0.29219531449232755, + "grad_norm": 0.41119280457496643, + "learning_rate": 0.00014159009085119381, + "loss": 1.544, + "step": 22486 + }, + { + "epoch": 0.2922083090362434, + "grad_norm": 0.42848271131515503, + "learning_rate": 0.0001415874913892824, + "loss": 1.3591, + "step": 22487 + }, + { + "epoch": 0.2922213035801593, + "grad_norm": 0.4547380805015564, + "learning_rate": 0.00014158489192737104, + "loss": 1.3756, + "step": 22488 + }, + { + "epoch": 0.29223429812407514, + "grad_norm": 0.40840330719947815, + "learning_rate": 0.00014158229246545966, + "loss": 1.4207, + "step": 22489 + }, + { + "epoch": 0.29224729266799104, + "grad_norm": 0.37380197644233704, + "learning_rate": 0.00014157969300354829, + "loss": 1.4824, + "step": 22490 + }, + { + "epoch": 0.2922602872119069, + "grad_norm": 0.4293728172779083, + "learning_rate": 0.00014157709354163688, + "loss": 1.5876, + "step": 22491 + }, + { + "epoch": 0.2922732817558228, + "grad_norm": 0.4066116213798523, + "learning_rate": 0.0001415744940797255, + "loss": 1.3308, + "step": 22492 + }, + { + "epoch": 0.29228627629973863, + "grad_norm": 0.40848809480667114, + "learning_rate": 0.00014157189461781413, + "loss": 1.4137, + "step": 22493 + }, + { + "epoch": 0.29229927084365454, + "grad_norm": 0.32751092314720154, + "learning_rate": 0.00014156929515590273, + "loss": 1.3348, + "step": 22494 + }, + { + "epoch": 0.2923122653875704, + "grad_norm": 0.4927367866039276, + "learning_rate": 0.00014156669569399135, + "loss": 1.4771, + "step": 22495 + }, + { + "epoch": 0.2923252599314863, + "grad_norm": 0.5153440237045288, + "learning_rate": 0.00014156409623207995, + "loss": 1.5009, + "step": 22496 + }, + { + "epoch": 0.2923382544754021, + "grad_norm": 0.3810153603553772, + "learning_rate": 0.0001415614967701686, + "loss": 1.4025, + "step": 22497 + }, + { + "epoch": 0.292351249019318, + "grad_norm": 0.3840183913707733, + "learning_rate": 0.0001415588973082572, + "loss": 1.2577, + "step": 22498 + }, + { + "epoch": 0.29236424356323387, + "grad_norm": 0.31107601523399353, + "learning_rate": 0.0001415562978463458, + "loss": 1.292, + "step": 22499 + }, + { + "epoch": 0.2923772381071498, + "grad_norm": 0.5184291005134583, + "learning_rate": 0.00014155369838443442, + "loss": 1.5384, + "step": 22500 + }, + { + "epoch": 0.2923902326510656, + "grad_norm": 0.4737551808357239, + "learning_rate": 0.00014155109892252305, + "loss": 1.5834, + "step": 22501 + }, + { + "epoch": 0.2924032271949815, + "grad_norm": 0.47723016142845154, + "learning_rate": 0.00014154849946061167, + "loss": 1.4968, + "step": 22502 + }, + { + "epoch": 0.29241622173889736, + "grad_norm": 0.4479424059391022, + "learning_rate": 0.00014154589999870027, + "loss": 1.3638, + "step": 22503 + }, + { + "epoch": 0.29242921628281326, + "grad_norm": 0.4283653497695923, + "learning_rate": 0.0001415433005367889, + "loss": 1.3799, + "step": 22504 + }, + { + "epoch": 0.2924422108267291, + "grad_norm": 0.4166092574596405, + "learning_rate": 0.00014154070107487752, + "loss": 1.4467, + "step": 22505 + }, + { + "epoch": 0.292455205370645, + "grad_norm": 0.5127121210098267, + "learning_rate": 0.00014153810161296611, + "loss": 1.39, + "step": 22506 + }, + { + "epoch": 0.29246819991456086, + "grad_norm": 0.4101192355155945, + "learning_rate": 0.00014153550215105474, + "loss": 1.2877, + "step": 22507 + }, + { + "epoch": 0.29248119445847676, + "grad_norm": 0.431466668844223, + "learning_rate": 0.00014153290268914336, + "loss": 1.6306, + "step": 22508 + }, + { + "epoch": 0.2924941890023926, + "grad_norm": 0.4160490930080414, + "learning_rate": 0.000141530303227232, + "loss": 1.5468, + "step": 22509 + }, + { + "epoch": 0.2925071835463085, + "grad_norm": 0.3985010087490082, + "learning_rate": 0.00014152770376532059, + "loss": 1.2803, + "step": 22510 + }, + { + "epoch": 0.29252017809022435, + "grad_norm": 0.4404599070549011, + "learning_rate": 0.00014152510430340918, + "loss": 1.5743, + "step": 22511 + }, + { + "epoch": 0.29253317263414025, + "grad_norm": 0.37941256165504456, + "learning_rate": 0.00014152250484149783, + "loss": 1.5251, + "step": 22512 + }, + { + "epoch": 0.2925461671780561, + "grad_norm": 0.4095732271671295, + "learning_rate": 0.00014151990537958643, + "loss": 1.4543, + "step": 22513 + }, + { + "epoch": 0.292559161721972, + "grad_norm": 0.39623120427131653, + "learning_rate": 0.00014151730591767506, + "loss": 1.5119, + "step": 22514 + }, + { + "epoch": 0.29257215626588784, + "grad_norm": 0.45352184772491455, + "learning_rate": 0.00014151470645576365, + "loss": 1.5995, + "step": 22515 + }, + { + "epoch": 0.29258515080980374, + "grad_norm": 0.34750935435295105, + "learning_rate": 0.00014151210699385228, + "loss": 1.1497, + "step": 22516 + }, + { + "epoch": 0.2925981453537196, + "grad_norm": 0.4564761519432068, + "learning_rate": 0.0001415095075319409, + "loss": 1.4583, + "step": 22517 + }, + { + "epoch": 0.2926111398976355, + "grad_norm": 0.40920811891555786, + "learning_rate": 0.0001415069080700295, + "loss": 1.3728, + "step": 22518 + }, + { + "epoch": 0.29262413444155133, + "grad_norm": 0.42259570956230164, + "learning_rate": 0.00014150430860811812, + "loss": 1.3961, + "step": 22519 + }, + { + "epoch": 0.29263712898546723, + "grad_norm": 0.33282750844955444, + "learning_rate": 0.00014150170914620675, + "loss": 1.3534, + "step": 22520 + }, + { + "epoch": 0.2926501235293831, + "grad_norm": 0.4240438938140869, + "learning_rate": 0.00014149910968429537, + "loss": 1.3855, + "step": 22521 + }, + { + "epoch": 0.292663118073299, + "grad_norm": 0.3609941601753235, + "learning_rate": 0.00014149651022238397, + "loss": 1.451, + "step": 22522 + }, + { + "epoch": 0.2926761126172148, + "grad_norm": 0.29714062809944153, + "learning_rate": 0.00014149391076047257, + "loss": 1.2573, + "step": 22523 + }, + { + "epoch": 0.2926891071611307, + "grad_norm": 0.3870581388473511, + "learning_rate": 0.00014149131129856122, + "loss": 1.4361, + "step": 22524 + }, + { + "epoch": 0.29270210170504657, + "grad_norm": 0.37668898701667786, + "learning_rate": 0.00014148871183664982, + "loss": 1.3807, + "step": 22525 + }, + { + "epoch": 0.29271509624896247, + "grad_norm": 0.43750301003456116, + "learning_rate": 0.00014148611237473844, + "loss": 1.5885, + "step": 22526 + }, + { + "epoch": 0.2927280907928783, + "grad_norm": 0.36418819427490234, + "learning_rate": 0.00014148351291282704, + "loss": 1.3241, + "step": 22527 + }, + { + "epoch": 0.2927410853367942, + "grad_norm": 0.5421130061149597, + "learning_rate": 0.00014148091345091566, + "loss": 1.5953, + "step": 22528 + }, + { + "epoch": 0.29275407988071006, + "grad_norm": 0.4437992572784424, + "learning_rate": 0.0001414783139890043, + "loss": 1.4474, + "step": 22529 + }, + { + "epoch": 0.29276707442462596, + "grad_norm": 0.48086389899253845, + "learning_rate": 0.00014147571452709289, + "loss": 1.3507, + "step": 22530 + }, + { + "epoch": 0.2927800689685418, + "grad_norm": 0.5275854468345642, + "learning_rate": 0.0001414731150651815, + "loss": 1.4328, + "step": 22531 + }, + { + "epoch": 0.2927930635124577, + "grad_norm": 0.35602810978889465, + "learning_rate": 0.00014147051560327013, + "loss": 1.3794, + "step": 22532 + }, + { + "epoch": 0.29280605805637355, + "grad_norm": 0.3511948585510254, + "learning_rate": 0.00014146791614135876, + "loss": 1.325, + "step": 22533 + }, + { + "epoch": 0.29281905260028945, + "grad_norm": 0.35132676362991333, + "learning_rate": 0.00014146531667944736, + "loss": 1.6158, + "step": 22534 + }, + { + "epoch": 0.29283204714420535, + "grad_norm": 0.24422088265419006, + "learning_rate": 0.00014146271721753598, + "loss": 1.1416, + "step": 22535 + }, + { + "epoch": 0.2928450416881212, + "grad_norm": 0.4303850829601288, + "learning_rate": 0.0001414601177556246, + "loss": 1.4665, + "step": 22536 + }, + { + "epoch": 0.2928580362320371, + "grad_norm": 0.44975119829177856, + "learning_rate": 0.0001414575182937132, + "loss": 1.5136, + "step": 22537 + }, + { + "epoch": 0.29287103077595295, + "grad_norm": 0.3508610725402832, + "learning_rate": 0.00014145491883180183, + "loss": 1.3161, + "step": 22538 + }, + { + "epoch": 0.29288402531986885, + "grad_norm": 0.46991166472435, + "learning_rate": 0.00014145231936989042, + "loss": 1.4926, + "step": 22539 + }, + { + "epoch": 0.2928970198637847, + "grad_norm": 0.5001912117004395, + "learning_rate": 0.00014144971990797905, + "loss": 1.3944, + "step": 22540 + }, + { + "epoch": 0.2929100144077006, + "grad_norm": 0.4130523204803467, + "learning_rate": 0.00014144712044606767, + "loss": 1.5294, + "step": 22541 + }, + { + "epoch": 0.29292300895161644, + "grad_norm": 0.4101564288139343, + "learning_rate": 0.00014144452098415627, + "loss": 1.5661, + "step": 22542 + }, + { + "epoch": 0.29293600349553234, + "grad_norm": 0.2994188070297241, + "learning_rate": 0.0001414419215222449, + "loss": 1.3108, + "step": 22543 + }, + { + "epoch": 0.2929489980394482, + "grad_norm": 0.3450506031513214, + "learning_rate": 0.00014143932206033352, + "loss": 1.2291, + "step": 22544 + }, + { + "epoch": 0.2929619925833641, + "grad_norm": 0.43679311871528625, + "learning_rate": 0.00014143672259842214, + "loss": 1.3722, + "step": 22545 + }, + { + "epoch": 0.29297498712727993, + "grad_norm": 0.3442310690879822, + "learning_rate": 0.00014143412313651074, + "loss": 1.2909, + "step": 22546 + }, + { + "epoch": 0.29298798167119583, + "grad_norm": 0.3955119550228119, + "learning_rate": 0.00014143152367459937, + "loss": 1.5496, + "step": 22547 + }, + { + "epoch": 0.2930009762151117, + "grad_norm": 0.28952857851982117, + "learning_rate": 0.000141428924212688, + "loss": 1.4617, + "step": 22548 + }, + { + "epoch": 0.2930139707590276, + "grad_norm": 0.44652268290519714, + "learning_rate": 0.0001414263247507766, + "loss": 1.4031, + "step": 22549 + }, + { + "epoch": 0.2930269653029434, + "grad_norm": 0.38711878657341003, + "learning_rate": 0.0001414237252888652, + "loss": 1.4184, + "step": 22550 + }, + { + "epoch": 0.2930399598468593, + "grad_norm": 0.43617695569992065, + "learning_rate": 0.00014142112582695384, + "loss": 1.4482, + "step": 22551 + }, + { + "epoch": 0.29305295439077517, + "grad_norm": 0.32677868008613586, + "learning_rate": 0.00014141852636504243, + "loss": 1.568, + "step": 22552 + }, + { + "epoch": 0.29306594893469107, + "grad_norm": 0.44942548871040344, + "learning_rate": 0.00014141592690313106, + "loss": 1.3152, + "step": 22553 + }, + { + "epoch": 0.2930789434786069, + "grad_norm": 0.4771665036678314, + "learning_rate": 0.00014141332744121966, + "loss": 1.5968, + "step": 22554 + }, + { + "epoch": 0.2930919380225228, + "grad_norm": 0.4449734687805176, + "learning_rate": 0.0001414107279793083, + "loss": 1.351, + "step": 22555 + }, + { + "epoch": 0.29310493256643866, + "grad_norm": 0.37848758697509766, + "learning_rate": 0.0001414081285173969, + "loss": 1.6488, + "step": 22556 + }, + { + "epoch": 0.29311792711035456, + "grad_norm": 0.39109450578689575, + "learning_rate": 0.00014140552905548553, + "loss": 1.2372, + "step": 22557 + }, + { + "epoch": 0.2931309216542704, + "grad_norm": 0.317754864692688, + "learning_rate": 0.00014140292959357413, + "loss": 1.2596, + "step": 22558 + }, + { + "epoch": 0.2931439161981863, + "grad_norm": 0.4028441905975342, + "learning_rate": 0.00014140033013166275, + "loss": 1.2158, + "step": 22559 + }, + { + "epoch": 0.29315691074210215, + "grad_norm": 0.36479660868644714, + "learning_rate": 0.00014139773066975138, + "loss": 1.3898, + "step": 22560 + }, + { + "epoch": 0.29316990528601805, + "grad_norm": 0.39816340804100037, + "learning_rate": 0.00014139513120783997, + "loss": 1.3612, + "step": 22561 + }, + { + "epoch": 0.2931828998299339, + "grad_norm": 0.393535852432251, + "learning_rate": 0.0001413925317459286, + "loss": 1.5222, + "step": 22562 + }, + { + "epoch": 0.2931958943738498, + "grad_norm": 0.3182189166545868, + "learning_rate": 0.00014138993228401722, + "loss": 1.431, + "step": 22563 + }, + { + "epoch": 0.29320888891776564, + "grad_norm": 0.37469854950904846, + "learning_rate": 0.00014138733282210585, + "loss": 1.3974, + "step": 22564 + }, + { + "epoch": 0.29322188346168154, + "grad_norm": 0.3932574689388275, + "learning_rate": 0.00014138473336019444, + "loss": 1.3816, + "step": 22565 + }, + { + "epoch": 0.2932348780055974, + "grad_norm": 0.4263599216938019, + "learning_rate": 0.00014138213389828304, + "loss": 1.6606, + "step": 22566 + }, + { + "epoch": 0.2932478725495133, + "grad_norm": 0.4822588860988617, + "learning_rate": 0.0001413795344363717, + "loss": 1.4894, + "step": 22567 + }, + { + "epoch": 0.29326086709342913, + "grad_norm": 0.3361617624759674, + "learning_rate": 0.0001413769349744603, + "loss": 1.3536, + "step": 22568 + }, + { + "epoch": 0.29327386163734503, + "grad_norm": 0.39120563864707947, + "learning_rate": 0.00014137433551254891, + "loss": 1.299, + "step": 22569 + }, + { + "epoch": 0.2932868561812609, + "grad_norm": 0.4127618670463562, + "learning_rate": 0.0001413717360506375, + "loss": 1.6053, + "step": 22570 + }, + { + "epoch": 0.2932998507251768, + "grad_norm": 0.3370152711868286, + "learning_rate": 0.00014136913658872614, + "loss": 1.4671, + "step": 22571 + }, + { + "epoch": 0.2933128452690926, + "grad_norm": 0.49673357605934143, + "learning_rate": 0.00014136653712681476, + "loss": 1.425, + "step": 22572 + }, + { + "epoch": 0.2933258398130085, + "grad_norm": 0.3558320701122284, + "learning_rate": 0.00014136393766490336, + "loss": 1.2506, + "step": 22573 + }, + { + "epoch": 0.29333883435692437, + "grad_norm": 0.44406816363334656, + "learning_rate": 0.00014136133820299198, + "loss": 1.322, + "step": 22574 + }, + { + "epoch": 0.2933518289008403, + "grad_norm": 0.407133549451828, + "learning_rate": 0.0001413587387410806, + "loss": 1.5989, + "step": 22575 + }, + { + "epoch": 0.2933648234447561, + "grad_norm": 0.4633191227912903, + "learning_rate": 0.00014135613927916923, + "loss": 1.4413, + "step": 22576 + }, + { + "epoch": 0.293377817988672, + "grad_norm": 0.4019451439380646, + "learning_rate": 0.00014135353981725783, + "loss": 1.4451, + "step": 22577 + }, + { + "epoch": 0.29339081253258786, + "grad_norm": 0.3871854245662689, + "learning_rate": 0.00014135094035534643, + "loss": 1.5382, + "step": 22578 + }, + { + "epoch": 0.29340380707650376, + "grad_norm": 0.42469462752342224, + "learning_rate": 0.00014134834089343508, + "loss": 1.359, + "step": 22579 + }, + { + "epoch": 0.2934168016204196, + "grad_norm": 0.4095105826854706, + "learning_rate": 0.00014134574143152368, + "loss": 1.1768, + "step": 22580 + }, + { + "epoch": 0.2934297961643355, + "grad_norm": 0.2695428729057312, + "learning_rate": 0.0001413431419696123, + "loss": 1.5073, + "step": 22581 + }, + { + "epoch": 0.29344279070825136, + "grad_norm": 0.41797783970832825, + "learning_rate": 0.00014134054250770092, + "loss": 1.4181, + "step": 22582 + }, + { + "epoch": 0.29345578525216726, + "grad_norm": 0.4157312214374542, + "learning_rate": 0.00014133794304578952, + "loss": 1.4828, + "step": 22583 + }, + { + "epoch": 0.2934687797960831, + "grad_norm": 0.3593350350856781, + "learning_rate": 0.00014133534358387815, + "loss": 1.2943, + "step": 22584 + }, + { + "epoch": 0.293481774339999, + "grad_norm": 0.39659184217453003, + "learning_rate": 0.00014133274412196674, + "loss": 1.51, + "step": 22585 + }, + { + "epoch": 0.29349476888391485, + "grad_norm": 0.4719579219818115, + "learning_rate": 0.0001413301446600554, + "loss": 1.393, + "step": 22586 + }, + { + "epoch": 0.29350776342783075, + "grad_norm": 0.3489084541797638, + "learning_rate": 0.000141327545198144, + "loss": 1.2905, + "step": 22587 + }, + { + "epoch": 0.2935207579717466, + "grad_norm": 0.3658064305782318, + "learning_rate": 0.00014132494573623262, + "loss": 1.4529, + "step": 22588 + }, + { + "epoch": 0.2935337525156625, + "grad_norm": 0.41651803255081177, + "learning_rate": 0.00014132234627432121, + "loss": 1.4316, + "step": 22589 + }, + { + "epoch": 0.29354674705957834, + "grad_norm": 0.33188119530677795, + "learning_rate": 0.00014131974681240984, + "loss": 1.4201, + "step": 22590 + }, + { + "epoch": 0.29355974160349424, + "grad_norm": 0.431319922208786, + "learning_rate": 0.00014131714735049846, + "loss": 1.3938, + "step": 22591 + }, + { + "epoch": 0.2935727361474101, + "grad_norm": 0.3691128194332123, + "learning_rate": 0.00014131454788858706, + "loss": 1.3736, + "step": 22592 + }, + { + "epoch": 0.293585730691326, + "grad_norm": 0.3895038962364197, + "learning_rate": 0.00014131194842667569, + "loss": 1.3666, + "step": 22593 + }, + { + "epoch": 0.29359872523524183, + "grad_norm": 0.3395620882511139, + "learning_rate": 0.0001413093489647643, + "loss": 1.371, + "step": 22594 + }, + { + "epoch": 0.29361171977915773, + "grad_norm": 0.45897865295410156, + "learning_rate": 0.0001413067495028529, + "loss": 1.4082, + "step": 22595 + }, + { + "epoch": 0.2936247143230736, + "grad_norm": 0.4305770695209503, + "learning_rate": 0.00014130415004094153, + "loss": 1.3365, + "step": 22596 + }, + { + "epoch": 0.2936377088669895, + "grad_norm": 0.45783206820487976, + "learning_rate": 0.00014130155057903013, + "loss": 1.2981, + "step": 22597 + }, + { + "epoch": 0.2936507034109053, + "grad_norm": 0.4359472990036011, + "learning_rate": 0.00014129895111711878, + "loss": 1.5193, + "step": 22598 + }, + { + "epoch": 0.2936636979548212, + "grad_norm": 0.4860736131668091, + "learning_rate": 0.00014129635165520738, + "loss": 1.4043, + "step": 22599 + }, + { + "epoch": 0.29367669249873707, + "grad_norm": 0.40545913577079773, + "learning_rate": 0.000141293752193296, + "loss": 1.4422, + "step": 22600 + }, + { + "epoch": 0.29368968704265297, + "grad_norm": 0.452754944562912, + "learning_rate": 0.0001412911527313846, + "loss": 1.3803, + "step": 22601 + }, + { + "epoch": 0.2937026815865688, + "grad_norm": 0.40198904275894165, + "learning_rate": 0.00014128855326947322, + "loss": 1.453, + "step": 22602 + }, + { + "epoch": 0.2937156761304847, + "grad_norm": 0.38908687233924866, + "learning_rate": 0.00014128595380756185, + "loss": 1.2037, + "step": 22603 + }, + { + "epoch": 0.29372867067440056, + "grad_norm": 0.3717786371707916, + "learning_rate": 0.00014128335434565045, + "loss": 1.4296, + "step": 22604 + }, + { + "epoch": 0.29374166521831646, + "grad_norm": 0.31460779905319214, + "learning_rate": 0.00014128075488373907, + "loss": 1.3849, + "step": 22605 + }, + { + "epoch": 0.2937546597622323, + "grad_norm": 0.4288352429866791, + "learning_rate": 0.0001412781554218277, + "loss": 1.2045, + "step": 22606 + }, + { + "epoch": 0.2937676543061482, + "grad_norm": 0.343243807554245, + "learning_rate": 0.0001412755559599163, + "loss": 1.4274, + "step": 22607 + }, + { + "epoch": 0.29378064885006405, + "grad_norm": 0.4744374752044678, + "learning_rate": 0.00014127295649800492, + "loss": 1.3088, + "step": 22608 + }, + { + "epoch": 0.29379364339397995, + "grad_norm": 0.3874797523021698, + "learning_rate": 0.00014127035703609351, + "loss": 1.5031, + "step": 22609 + }, + { + "epoch": 0.29380663793789585, + "grad_norm": 0.4258202612400055, + "learning_rate": 0.00014126775757418217, + "loss": 1.3359, + "step": 22610 + }, + { + "epoch": 0.2938196324818117, + "grad_norm": 0.33291882276535034, + "learning_rate": 0.00014126515811227076, + "loss": 1.392, + "step": 22611 + }, + { + "epoch": 0.2938326270257276, + "grad_norm": 0.38168609142303467, + "learning_rate": 0.0001412625586503594, + "loss": 1.2818, + "step": 22612 + }, + { + "epoch": 0.29384562156964344, + "grad_norm": 0.38539960980415344, + "learning_rate": 0.00014125995918844799, + "loss": 1.2209, + "step": 22613 + }, + { + "epoch": 0.29385861611355935, + "grad_norm": 0.46443766355514526, + "learning_rate": 0.0001412573597265366, + "loss": 1.3587, + "step": 22614 + }, + { + "epoch": 0.2938716106574752, + "grad_norm": 0.47241437435150146, + "learning_rate": 0.00014125476026462523, + "loss": 1.4568, + "step": 22615 + }, + { + "epoch": 0.2938846052013911, + "grad_norm": 0.43388858437538147, + "learning_rate": 0.00014125216080271383, + "loss": 1.3528, + "step": 22616 + }, + { + "epoch": 0.29389759974530694, + "grad_norm": 0.42175722122192383, + "learning_rate": 0.00014124956134080246, + "loss": 1.367, + "step": 22617 + }, + { + "epoch": 0.29391059428922284, + "grad_norm": 0.3789048492908478, + "learning_rate": 0.00014124696187889108, + "loss": 1.5094, + "step": 22618 + }, + { + "epoch": 0.2939235888331387, + "grad_norm": 0.44789692759513855, + "learning_rate": 0.0001412443624169797, + "loss": 1.5487, + "step": 22619 + }, + { + "epoch": 0.2939365833770546, + "grad_norm": 0.4851679801940918, + "learning_rate": 0.0001412417629550683, + "loss": 1.3995, + "step": 22620 + }, + { + "epoch": 0.29394957792097043, + "grad_norm": 0.4190042316913605, + "learning_rate": 0.00014123916349315693, + "loss": 1.3758, + "step": 22621 + }, + { + "epoch": 0.29396257246488633, + "grad_norm": 0.4972772002220154, + "learning_rate": 0.00014123656403124555, + "loss": 1.3953, + "step": 22622 + }, + { + "epoch": 0.2939755670088022, + "grad_norm": 0.37061765789985657, + "learning_rate": 0.00014123396456933415, + "loss": 1.4499, + "step": 22623 + }, + { + "epoch": 0.2939885615527181, + "grad_norm": 0.45653384923934937, + "learning_rate": 0.00014123136510742277, + "loss": 1.4304, + "step": 22624 + }, + { + "epoch": 0.2940015560966339, + "grad_norm": 0.685883641242981, + "learning_rate": 0.0001412287656455114, + "loss": 1.2556, + "step": 22625 + }, + { + "epoch": 0.2940145506405498, + "grad_norm": 0.3788043260574341, + "learning_rate": 0.0001412261661836, + "loss": 1.5572, + "step": 22626 + }, + { + "epoch": 0.29402754518446567, + "grad_norm": 0.33320727944374084, + "learning_rate": 0.00014122356672168862, + "loss": 1.4471, + "step": 22627 + }, + { + "epoch": 0.29404053972838157, + "grad_norm": 0.41008460521698, + "learning_rate": 0.00014122096725977722, + "loss": 1.5881, + "step": 22628 + }, + { + "epoch": 0.2940535342722974, + "grad_norm": 0.28735849261283875, + "learning_rate": 0.00014121836779786587, + "loss": 1.4514, + "step": 22629 + }, + { + "epoch": 0.2940665288162133, + "grad_norm": 0.3744684159755707, + "learning_rate": 0.00014121576833595447, + "loss": 1.4418, + "step": 22630 + }, + { + "epoch": 0.29407952336012916, + "grad_norm": 0.3476904332637787, + "learning_rate": 0.0001412131688740431, + "loss": 1.5407, + "step": 22631 + }, + { + "epoch": 0.29409251790404506, + "grad_norm": 0.5504875183105469, + "learning_rate": 0.0001412105694121317, + "loss": 1.4748, + "step": 22632 + }, + { + "epoch": 0.2941055124479609, + "grad_norm": 0.5731304287910461, + "learning_rate": 0.0001412079699502203, + "loss": 1.3007, + "step": 22633 + }, + { + "epoch": 0.2941185069918768, + "grad_norm": 0.4084780514240265, + "learning_rate": 0.00014120537048830894, + "loss": 1.3574, + "step": 22634 + }, + { + "epoch": 0.29413150153579265, + "grad_norm": 0.4878765344619751, + "learning_rate": 0.00014120277102639753, + "loss": 1.3662, + "step": 22635 + }, + { + "epoch": 0.29414449607970855, + "grad_norm": 0.38960886001586914, + "learning_rate": 0.00014120017156448616, + "loss": 1.5241, + "step": 22636 + }, + { + "epoch": 0.2941574906236244, + "grad_norm": 0.4260435402393341, + "learning_rate": 0.00014119757210257478, + "loss": 1.3908, + "step": 22637 + }, + { + "epoch": 0.2941704851675403, + "grad_norm": 0.41243478655815125, + "learning_rate": 0.00014119497264066338, + "loss": 1.47, + "step": 22638 + }, + { + "epoch": 0.29418347971145614, + "grad_norm": 0.4653548002243042, + "learning_rate": 0.000141192373178752, + "loss": 1.5074, + "step": 22639 + }, + { + "epoch": 0.29419647425537204, + "grad_norm": 0.48945680260658264, + "learning_rate": 0.0001411897737168406, + "loss": 1.3663, + "step": 22640 + }, + { + "epoch": 0.2942094687992879, + "grad_norm": 0.40850454568862915, + "learning_rate": 0.00014118717425492925, + "loss": 1.5302, + "step": 22641 + }, + { + "epoch": 0.2942224633432038, + "grad_norm": 0.48850518465042114, + "learning_rate": 0.00014118457479301785, + "loss": 1.406, + "step": 22642 + }, + { + "epoch": 0.29423545788711963, + "grad_norm": 0.4024600684642792, + "learning_rate": 0.00014118197533110648, + "loss": 1.3428, + "step": 22643 + }, + { + "epoch": 0.29424845243103553, + "grad_norm": 0.47545182704925537, + "learning_rate": 0.00014117937586919507, + "loss": 1.4902, + "step": 22644 + }, + { + "epoch": 0.2942614469749514, + "grad_norm": 0.44334152340888977, + "learning_rate": 0.0001411767764072837, + "loss": 1.3876, + "step": 22645 + }, + { + "epoch": 0.2942744415188673, + "grad_norm": 0.2862689197063446, + "learning_rate": 0.00014117417694537232, + "loss": 1.4468, + "step": 22646 + }, + { + "epoch": 0.2942874360627831, + "grad_norm": 0.42577341198921204, + "learning_rate": 0.00014117157748346092, + "loss": 1.3768, + "step": 22647 + }, + { + "epoch": 0.294300430606699, + "grad_norm": 0.41813918948173523, + "learning_rate": 0.00014116897802154954, + "loss": 1.5569, + "step": 22648 + }, + { + "epoch": 0.29431342515061487, + "grad_norm": 0.4774334132671356, + "learning_rate": 0.00014116637855963817, + "loss": 1.3415, + "step": 22649 + }, + { + "epoch": 0.29432641969453077, + "grad_norm": 0.42769134044647217, + "learning_rate": 0.00014116377909772677, + "loss": 1.2718, + "step": 22650 + }, + { + "epoch": 0.2943394142384466, + "grad_norm": 0.45138004422187805, + "learning_rate": 0.0001411611796358154, + "loss": 1.4508, + "step": 22651 + }, + { + "epoch": 0.2943524087823625, + "grad_norm": 0.4319975972175598, + "learning_rate": 0.000141158580173904, + "loss": 1.3672, + "step": 22652 + }, + { + "epoch": 0.29436540332627836, + "grad_norm": 0.38144925236701965, + "learning_rate": 0.00014115598071199264, + "loss": 1.3791, + "step": 22653 + }, + { + "epoch": 0.29437839787019426, + "grad_norm": 0.41073185205459595, + "learning_rate": 0.00014115338125008124, + "loss": 1.5162, + "step": 22654 + }, + { + "epoch": 0.2943913924141101, + "grad_norm": 0.35939306020736694, + "learning_rate": 0.00014115078178816986, + "loss": 1.5065, + "step": 22655 + }, + { + "epoch": 0.294404386958026, + "grad_norm": 0.407172828912735, + "learning_rate": 0.00014114818232625846, + "loss": 1.3363, + "step": 22656 + }, + { + "epoch": 0.29441738150194185, + "grad_norm": 0.3219147324562073, + "learning_rate": 0.00014114558286434708, + "loss": 1.2946, + "step": 22657 + }, + { + "epoch": 0.29443037604585776, + "grad_norm": 0.3669741451740265, + "learning_rate": 0.0001411429834024357, + "loss": 1.409, + "step": 22658 + }, + { + "epoch": 0.2944433705897736, + "grad_norm": 0.4027775228023529, + "learning_rate": 0.0001411403839405243, + "loss": 1.2179, + "step": 22659 + }, + { + "epoch": 0.2944563651336895, + "grad_norm": 0.31190550327301025, + "learning_rate": 0.00014113778447861296, + "loss": 1.2808, + "step": 22660 + }, + { + "epoch": 0.29446935967760535, + "grad_norm": 0.28889214992523193, + "learning_rate": 0.00014113518501670155, + "loss": 1.4286, + "step": 22661 + }, + { + "epoch": 0.29448235422152125, + "grad_norm": 0.38850679993629456, + "learning_rate": 0.00014113258555479015, + "loss": 1.2951, + "step": 22662 + }, + { + "epoch": 0.2944953487654371, + "grad_norm": 0.41878315806388855, + "learning_rate": 0.00014112998609287878, + "loss": 1.469, + "step": 22663 + }, + { + "epoch": 0.294508343309353, + "grad_norm": 0.4294496476650238, + "learning_rate": 0.0001411273866309674, + "loss": 1.4834, + "step": 22664 + }, + { + "epoch": 0.29452133785326884, + "grad_norm": 0.42085373401641846, + "learning_rate": 0.00014112478716905603, + "loss": 1.416, + "step": 22665 + }, + { + "epoch": 0.29453433239718474, + "grad_norm": 0.46573328971862793, + "learning_rate": 0.00014112218770714462, + "loss": 1.4659, + "step": 22666 + }, + { + "epoch": 0.2945473269411006, + "grad_norm": 0.3660159409046173, + "learning_rate": 0.00014111958824523325, + "loss": 1.318, + "step": 22667 + }, + { + "epoch": 0.2945603214850165, + "grad_norm": 0.37564074993133545, + "learning_rate": 0.00014111698878332187, + "loss": 1.3054, + "step": 22668 + }, + { + "epoch": 0.29457331602893233, + "grad_norm": 0.46375614404678345, + "learning_rate": 0.00014111438932141047, + "loss": 1.6232, + "step": 22669 + }, + { + "epoch": 0.29458631057284823, + "grad_norm": 0.3933001160621643, + "learning_rate": 0.0001411117898594991, + "loss": 1.4715, + "step": 22670 + }, + { + "epoch": 0.2945993051167641, + "grad_norm": 0.4217098653316498, + "learning_rate": 0.0001411091903975877, + "loss": 1.6454, + "step": 22671 + }, + { + "epoch": 0.29461229966068, + "grad_norm": 0.34935858845710754, + "learning_rate": 0.00014110659093567634, + "loss": 1.402, + "step": 22672 + }, + { + "epoch": 0.2946252942045958, + "grad_norm": 0.4610530734062195, + "learning_rate": 0.00014110399147376494, + "loss": 1.4101, + "step": 22673 + }, + { + "epoch": 0.2946382887485117, + "grad_norm": 0.3640984296798706, + "learning_rate": 0.00014110139201185354, + "loss": 1.3667, + "step": 22674 + }, + { + "epoch": 0.29465128329242757, + "grad_norm": 0.4347236156463623, + "learning_rate": 0.00014109879254994216, + "loss": 1.4123, + "step": 22675 + }, + { + "epoch": 0.29466427783634347, + "grad_norm": 0.3800946772098541, + "learning_rate": 0.00014109619308803079, + "loss": 1.2625, + "step": 22676 + }, + { + "epoch": 0.2946772723802593, + "grad_norm": 0.3644084930419922, + "learning_rate": 0.0001410935936261194, + "loss": 1.3465, + "step": 22677 + }, + { + "epoch": 0.2946902669241752, + "grad_norm": 0.4131089448928833, + "learning_rate": 0.000141090994164208, + "loss": 1.4395, + "step": 22678 + }, + { + "epoch": 0.29470326146809106, + "grad_norm": 0.4133332371711731, + "learning_rate": 0.00014108839470229663, + "loss": 1.2441, + "step": 22679 + }, + { + "epoch": 0.29471625601200696, + "grad_norm": 0.38967373967170715, + "learning_rate": 0.00014108579524038526, + "loss": 1.5048, + "step": 22680 + }, + { + "epoch": 0.2947292505559228, + "grad_norm": 0.38563454151153564, + "learning_rate": 0.00014108319577847385, + "loss": 1.271, + "step": 22681 + }, + { + "epoch": 0.2947422450998387, + "grad_norm": 0.4786517918109894, + "learning_rate": 0.00014108059631656248, + "loss": 1.3529, + "step": 22682 + }, + { + "epoch": 0.29475523964375455, + "grad_norm": 0.3604390025138855, + "learning_rate": 0.00014107799685465108, + "loss": 1.5334, + "step": 22683 + }, + { + "epoch": 0.29476823418767045, + "grad_norm": 0.4230327904224396, + "learning_rate": 0.00014107539739273973, + "loss": 1.3628, + "step": 22684 + }, + { + "epoch": 0.2947812287315863, + "grad_norm": 0.4077970087528229, + "learning_rate": 0.00014107279793082833, + "loss": 1.2496, + "step": 22685 + }, + { + "epoch": 0.2947942232755022, + "grad_norm": 0.48415517807006836, + "learning_rate": 0.00014107019846891695, + "loss": 1.5605, + "step": 22686 + }, + { + "epoch": 0.2948072178194181, + "grad_norm": 0.40656521916389465, + "learning_rate": 0.00014106759900700555, + "loss": 1.3913, + "step": 22687 + }, + { + "epoch": 0.29482021236333394, + "grad_norm": 0.47370409965515137, + "learning_rate": 0.00014106499954509417, + "loss": 1.39, + "step": 22688 + }, + { + "epoch": 0.29483320690724985, + "grad_norm": 0.38415780663490295, + "learning_rate": 0.0001410624000831828, + "loss": 1.4926, + "step": 22689 + }, + { + "epoch": 0.2948462014511657, + "grad_norm": 0.3729836940765381, + "learning_rate": 0.0001410598006212714, + "loss": 1.4634, + "step": 22690 + }, + { + "epoch": 0.2948591959950816, + "grad_norm": 0.5336193442344666, + "learning_rate": 0.00014105720115936002, + "loss": 1.5676, + "step": 22691 + }, + { + "epoch": 0.29487219053899744, + "grad_norm": 0.3772420585155487, + "learning_rate": 0.00014105460169744864, + "loss": 1.5012, + "step": 22692 + }, + { + "epoch": 0.29488518508291334, + "grad_norm": 0.4394102394580841, + "learning_rate": 0.00014105200223553724, + "loss": 1.3906, + "step": 22693 + }, + { + "epoch": 0.2948981796268292, + "grad_norm": 0.4405618906021118, + "learning_rate": 0.00014104940277362586, + "loss": 1.3566, + "step": 22694 + }, + { + "epoch": 0.2949111741707451, + "grad_norm": 0.4179721474647522, + "learning_rate": 0.0001410468033117145, + "loss": 1.3191, + "step": 22695 + }, + { + "epoch": 0.29492416871466093, + "grad_norm": 0.3888811767101288, + "learning_rate": 0.0001410442038498031, + "loss": 1.6286, + "step": 22696 + }, + { + "epoch": 0.29493716325857683, + "grad_norm": 0.41354015469551086, + "learning_rate": 0.0001410416043878917, + "loss": 1.2846, + "step": 22697 + }, + { + "epoch": 0.2949501578024927, + "grad_norm": 0.3298114538192749, + "learning_rate": 0.00014103900492598033, + "loss": 1.246, + "step": 22698 + }, + { + "epoch": 0.2949631523464086, + "grad_norm": 0.4141797423362732, + "learning_rate": 0.00014103640546406896, + "loss": 1.1815, + "step": 22699 + }, + { + "epoch": 0.2949761468903244, + "grad_norm": 0.41696545481681824, + "learning_rate": 0.00014103380600215756, + "loss": 1.4957, + "step": 22700 + }, + { + "epoch": 0.2949891414342403, + "grad_norm": 0.3861754536628723, + "learning_rate": 0.00014103120654024618, + "loss": 1.3783, + "step": 22701 + }, + { + "epoch": 0.29500213597815617, + "grad_norm": 0.4896663427352905, + "learning_rate": 0.00014102860707833478, + "loss": 1.3522, + "step": 22702 + }, + { + "epoch": 0.29501513052207207, + "grad_norm": 0.3898656368255615, + "learning_rate": 0.00014102600761642343, + "loss": 1.3368, + "step": 22703 + }, + { + "epoch": 0.2950281250659879, + "grad_norm": 0.34179216623306274, + "learning_rate": 0.00014102340815451203, + "loss": 1.3682, + "step": 22704 + }, + { + "epoch": 0.2950411196099038, + "grad_norm": 0.44247201085090637, + "learning_rate": 0.00014102080869260062, + "loss": 1.4201, + "step": 22705 + }, + { + "epoch": 0.29505411415381966, + "grad_norm": 0.43878957629203796, + "learning_rate": 0.00014101820923068925, + "loss": 1.3786, + "step": 22706 + }, + { + "epoch": 0.29506710869773556, + "grad_norm": 0.4125325381755829, + "learning_rate": 0.00014101560976877787, + "loss": 1.4007, + "step": 22707 + }, + { + "epoch": 0.2950801032416514, + "grad_norm": 0.43359920382499695, + "learning_rate": 0.0001410130103068665, + "loss": 1.5341, + "step": 22708 + }, + { + "epoch": 0.2950930977855673, + "grad_norm": 0.45451676845550537, + "learning_rate": 0.0001410104108449551, + "loss": 1.4745, + "step": 22709 + }, + { + "epoch": 0.29510609232948315, + "grad_norm": 0.37634754180908203, + "learning_rate": 0.00014100781138304372, + "loss": 1.4869, + "step": 22710 + }, + { + "epoch": 0.29511908687339905, + "grad_norm": 0.5383051037788391, + "learning_rate": 0.00014100521192113234, + "loss": 1.4963, + "step": 22711 + }, + { + "epoch": 0.2951320814173149, + "grad_norm": 0.3083975613117218, + "learning_rate": 0.00014100261245922094, + "loss": 1.08, + "step": 22712 + }, + { + "epoch": 0.2951450759612308, + "grad_norm": 0.3205197751522064, + "learning_rate": 0.00014100001299730957, + "loss": 1.2795, + "step": 22713 + }, + { + "epoch": 0.29515807050514664, + "grad_norm": 0.3008273243904114, + "learning_rate": 0.00014099741353539816, + "loss": 1.5117, + "step": 22714 + }, + { + "epoch": 0.29517106504906254, + "grad_norm": 0.3480885922908783, + "learning_rate": 0.00014099481407348682, + "loss": 1.3758, + "step": 22715 + }, + { + "epoch": 0.2951840595929784, + "grad_norm": 0.3936046361923218, + "learning_rate": 0.0001409922146115754, + "loss": 1.3568, + "step": 22716 + }, + { + "epoch": 0.2951970541368943, + "grad_norm": 0.45173177123069763, + "learning_rate": 0.000140989615149664, + "loss": 1.5368, + "step": 22717 + }, + { + "epoch": 0.29521004868081013, + "grad_norm": 0.47739657759666443, + "learning_rate": 0.00014098701568775263, + "loss": 1.4721, + "step": 22718 + }, + { + "epoch": 0.29522304322472603, + "grad_norm": 0.3324936628341675, + "learning_rate": 0.00014098441622584126, + "loss": 1.5117, + "step": 22719 + }, + { + "epoch": 0.2952360377686419, + "grad_norm": 0.36412155628204346, + "learning_rate": 0.00014098181676392988, + "loss": 1.4161, + "step": 22720 + }, + { + "epoch": 0.2952490323125578, + "grad_norm": 0.4264192581176758, + "learning_rate": 0.00014097921730201848, + "loss": 1.3919, + "step": 22721 + }, + { + "epoch": 0.2952620268564736, + "grad_norm": 0.37785351276397705, + "learning_rate": 0.0001409766178401071, + "loss": 1.4503, + "step": 22722 + }, + { + "epoch": 0.2952750214003895, + "grad_norm": 0.445453941822052, + "learning_rate": 0.00014097401837819573, + "loss": 1.5297, + "step": 22723 + }, + { + "epoch": 0.29528801594430537, + "grad_norm": 0.4093923270702362, + "learning_rate": 0.00014097141891628433, + "loss": 1.3703, + "step": 22724 + }, + { + "epoch": 0.29530101048822127, + "grad_norm": 0.4471838176250458, + "learning_rate": 0.00014096881945437295, + "loss": 1.4905, + "step": 22725 + }, + { + "epoch": 0.2953140050321371, + "grad_norm": 0.3949466049671173, + "learning_rate": 0.00014096621999246155, + "loss": 1.5135, + "step": 22726 + }, + { + "epoch": 0.295326999576053, + "grad_norm": 0.4689790606498718, + "learning_rate": 0.0001409636205305502, + "loss": 1.5711, + "step": 22727 + }, + { + "epoch": 0.29533999411996886, + "grad_norm": 0.433599054813385, + "learning_rate": 0.0001409610210686388, + "loss": 1.6089, + "step": 22728 + }, + { + "epoch": 0.29535298866388476, + "grad_norm": 0.3340052664279938, + "learning_rate": 0.0001409584216067274, + "loss": 1.4621, + "step": 22729 + }, + { + "epoch": 0.2953659832078006, + "grad_norm": 0.3817252516746521, + "learning_rate": 0.00014095582214481602, + "loss": 1.3275, + "step": 22730 + }, + { + "epoch": 0.2953789777517165, + "grad_norm": 0.3421393930912018, + "learning_rate": 0.00014095322268290464, + "loss": 1.4728, + "step": 22731 + }, + { + "epoch": 0.29539197229563235, + "grad_norm": 0.30604031682014465, + "learning_rate": 0.00014095062322099327, + "loss": 1.0493, + "step": 22732 + }, + { + "epoch": 0.29540496683954826, + "grad_norm": 0.3763476610183716, + "learning_rate": 0.00014094802375908187, + "loss": 1.5927, + "step": 22733 + }, + { + "epoch": 0.2954179613834641, + "grad_norm": 0.3829968571662903, + "learning_rate": 0.0001409454242971705, + "loss": 1.5236, + "step": 22734 + }, + { + "epoch": 0.29543095592738, + "grad_norm": 0.36088457703590393, + "learning_rate": 0.00014094282483525912, + "loss": 1.3462, + "step": 22735 + }, + { + "epoch": 0.29544395047129585, + "grad_norm": 0.3757702112197876, + "learning_rate": 0.0001409402253733477, + "loss": 1.3773, + "step": 22736 + }, + { + "epoch": 0.29545694501521175, + "grad_norm": 0.4474758207798004, + "learning_rate": 0.00014093762591143634, + "loss": 1.5072, + "step": 22737 + }, + { + "epoch": 0.2954699395591276, + "grad_norm": 0.3275909423828125, + "learning_rate": 0.00014093502644952496, + "loss": 1.3505, + "step": 22738 + }, + { + "epoch": 0.2954829341030435, + "grad_norm": 0.4505625367164612, + "learning_rate": 0.00014093242698761359, + "loss": 1.4056, + "step": 22739 + }, + { + "epoch": 0.29549592864695934, + "grad_norm": 0.4183221161365509, + "learning_rate": 0.00014092982752570218, + "loss": 1.4081, + "step": 22740 + }, + { + "epoch": 0.29550892319087524, + "grad_norm": 0.35911083221435547, + "learning_rate": 0.0001409272280637908, + "loss": 1.4545, + "step": 22741 + }, + { + "epoch": 0.2955219177347911, + "grad_norm": 0.4382403790950775, + "learning_rate": 0.00014092462860187943, + "loss": 1.4295, + "step": 22742 + }, + { + "epoch": 0.295534912278707, + "grad_norm": 0.4173792898654938, + "learning_rate": 0.00014092202913996803, + "loss": 1.2766, + "step": 22743 + }, + { + "epoch": 0.29554790682262283, + "grad_norm": 0.39180731773376465, + "learning_rate": 0.00014091942967805665, + "loss": 1.5351, + "step": 22744 + }, + { + "epoch": 0.29556090136653873, + "grad_norm": 0.5312597751617432, + "learning_rate": 0.00014091683021614525, + "loss": 1.4906, + "step": 22745 + }, + { + "epoch": 0.2955738959104546, + "grad_norm": 0.4281651973724365, + "learning_rate": 0.00014091423075423388, + "loss": 1.4218, + "step": 22746 + }, + { + "epoch": 0.2955868904543705, + "grad_norm": 0.3991905152797699, + "learning_rate": 0.0001409116312923225, + "loss": 1.46, + "step": 22747 + }, + { + "epoch": 0.2955998849982863, + "grad_norm": 0.5015237331390381, + "learning_rate": 0.0001409090318304111, + "loss": 1.392, + "step": 22748 + }, + { + "epoch": 0.2956128795422022, + "grad_norm": 0.45229706168174744, + "learning_rate": 0.00014090643236849972, + "loss": 1.3191, + "step": 22749 + }, + { + "epoch": 0.29562587408611807, + "grad_norm": 0.4183344542980194, + "learning_rate": 0.00014090383290658835, + "loss": 1.538, + "step": 22750 + }, + { + "epoch": 0.29563886863003397, + "grad_norm": 0.4537038505077362, + "learning_rate": 0.00014090123344467697, + "loss": 1.3393, + "step": 22751 + }, + { + "epoch": 0.2956518631739498, + "grad_norm": 0.47749584913253784, + "learning_rate": 0.00014089863398276557, + "loss": 1.3506, + "step": 22752 + }, + { + "epoch": 0.2956648577178657, + "grad_norm": 0.4244959354400635, + "learning_rate": 0.0001408960345208542, + "loss": 1.7011, + "step": 22753 + }, + { + "epoch": 0.29567785226178156, + "grad_norm": 0.35170838236808777, + "learning_rate": 0.00014089343505894282, + "loss": 1.6055, + "step": 22754 + }, + { + "epoch": 0.29569084680569746, + "grad_norm": 0.42715394496917725, + "learning_rate": 0.00014089083559703142, + "loss": 1.3151, + "step": 22755 + }, + { + "epoch": 0.2957038413496133, + "grad_norm": 0.3868791460990906, + "learning_rate": 0.00014088823613512004, + "loss": 1.4112, + "step": 22756 + }, + { + "epoch": 0.2957168358935292, + "grad_norm": 0.37718990445137024, + "learning_rate": 0.00014088563667320864, + "loss": 1.3493, + "step": 22757 + }, + { + "epoch": 0.29572983043744505, + "grad_norm": 0.34347158670425415, + "learning_rate": 0.00014088303721129726, + "loss": 1.4015, + "step": 22758 + }, + { + "epoch": 0.29574282498136095, + "grad_norm": 0.43162864446640015, + "learning_rate": 0.00014088043774938589, + "loss": 1.6092, + "step": 22759 + }, + { + "epoch": 0.2957558195252768, + "grad_norm": 0.49372825026512146, + "learning_rate": 0.00014087783828747448, + "loss": 1.3606, + "step": 22760 + }, + { + "epoch": 0.2957688140691927, + "grad_norm": 0.3937578797340393, + "learning_rate": 0.0001408752388255631, + "loss": 1.1894, + "step": 22761 + }, + { + "epoch": 0.29578180861310854, + "grad_norm": 0.5224334597587585, + "learning_rate": 0.00014087263936365173, + "loss": 1.328, + "step": 22762 + }, + { + "epoch": 0.29579480315702444, + "grad_norm": 0.3274579346179962, + "learning_rate": 0.00014087003990174036, + "loss": 1.3335, + "step": 22763 + }, + { + "epoch": 0.29580779770094034, + "grad_norm": 0.37509703636169434, + "learning_rate": 0.00014086744043982895, + "loss": 1.3058, + "step": 22764 + }, + { + "epoch": 0.2958207922448562, + "grad_norm": 0.4386065900325775, + "learning_rate": 0.00014086484097791758, + "loss": 1.4496, + "step": 22765 + }, + { + "epoch": 0.2958337867887721, + "grad_norm": 0.4717961847782135, + "learning_rate": 0.0001408622415160062, + "loss": 1.5254, + "step": 22766 + }, + { + "epoch": 0.29584678133268794, + "grad_norm": 0.36254382133483887, + "learning_rate": 0.0001408596420540948, + "loss": 1.2415, + "step": 22767 + }, + { + "epoch": 0.29585977587660384, + "grad_norm": 0.36956173181533813, + "learning_rate": 0.00014085704259218343, + "loss": 1.2223, + "step": 22768 + }, + { + "epoch": 0.2958727704205197, + "grad_norm": 0.4694001376628876, + "learning_rate": 0.00014085444313027205, + "loss": 1.7242, + "step": 22769 + }, + { + "epoch": 0.2958857649644356, + "grad_norm": 0.4347313642501831, + "learning_rate": 0.00014085184366836067, + "loss": 1.5408, + "step": 22770 + }, + { + "epoch": 0.2958987595083514, + "grad_norm": 0.3834713399410248, + "learning_rate": 0.00014084924420644927, + "loss": 1.3332, + "step": 22771 + }, + { + "epoch": 0.29591175405226733, + "grad_norm": 0.38068875670433044, + "learning_rate": 0.00014084664474453787, + "loss": 1.407, + "step": 22772 + }, + { + "epoch": 0.2959247485961832, + "grad_norm": 0.3845745623111725, + "learning_rate": 0.00014084404528262652, + "loss": 1.4291, + "step": 22773 + }, + { + "epoch": 0.2959377431400991, + "grad_norm": 0.40543684363365173, + "learning_rate": 0.00014084144582071512, + "loss": 1.2186, + "step": 22774 + }, + { + "epoch": 0.2959507376840149, + "grad_norm": 0.3033972680568695, + "learning_rate": 0.00014083884635880374, + "loss": 1.2849, + "step": 22775 + }, + { + "epoch": 0.2959637322279308, + "grad_norm": 0.32294130325317383, + "learning_rate": 0.00014083624689689234, + "loss": 1.4622, + "step": 22776 + }, + { + "epoch": 0.29597672677184667, + "grad_norm": 0.44553059339523315, + "learning_rate": 0.00014083364743498096, + "loss": 1.4221, + "step": 22777 + }, + { + "epoch": 0.29598972131576257, + "grad_norm": 0.3012782037258148, + "learning_rate": 0.0001408310479730696, + "loss": 1.3275, + "step": 22778 + }, + { + "epoch": 0.2960027158596784, + "grad_norm": 0.4275689125061035, + "learning_rate": 0.00014082844851115819, + "loss": 1.6086, + "step": 22779 + }, + { + "epoch": 0.2960157104035943, + "grad_norm": 0.33903008699417114, + "learning_rate": 0.0001408258490492468, + "loss": 1.177, + "step": 22780 + }, + { + "epoch": 0.29602870494751016, + "grad_norm": 0.5214288234710693, + "learning_rate": 0.00014082324958733544, + "loss": 1.3999, + "step": 22781 + }, + { + "epoch": 0.29604169949142606, + "grad_norm": 0.4583309590816498, + "learning_rate": 0.00014082065012542406, + "loss": 1.5295, + "step": 22782 + }, + { + "epoch": 0.2960546940353419, + "grad_norm": 0.39608946442604065, + "learning_rate": 0.00014081805066351266, + "loss": 1.2594, + "step": 22783 + }, + { + "epoch": 0.2960676885792578, + "grad_norm": 0.3155381679534912, + "learning_rate": 0.00014081545120160125, + "loss": 1.4035, + "step": 22784 + }, + { + "epoch": 0.29608068312317365, + "grad_norm": 0.4135020673274994, + "learning_rate": 0.0001408128517396899, + "loss": 1.372, + "step": 22785 + }, + { + "epoch": 0.29609367766708955, + "grad_norm": 0.4492252469062805, + "learning_rate": 0.0001408102522777785, + "loss": 1.3946, + "step": 22786 + }, + { + "epoch": 0.2961066722110054, + "grad_norm": 0.3661917448043823, + "learning_rate": 0.00014080765281586713, + "loss": 1.6238, + "step": 22787 + }, + { + "epoch": 0.2961196667549213, + "grad_norm": 0.35723355412483215, + "learning_rate": 0.00014080505335395573, + "loss": 1.2971, + "step": 22788 + }, + { + "epoch": 0.29613266129883714, + "grad_norm": 0.4470587968826294, + "learning_rate": 0.00014080245389204435, + "loss": 1.4378, + "step": 22789 + }, + { + "epoch": 0.29614565584275304, + "grad_norm": 0.4286514222621918, + "learning_rate": 0.00014079985443013297, + "loss": 1.4807, + "step": 22790 + }, + { + "epoch": 0.2961586503866689, + "grad_norm": 0.37609192728996277, + "learning_rate": 0.00014079725496822157, + "loss": 1.2514, + "step": 22791 + }, + { + "epoch": 0.2961716449305848, + "grad_norm": 0.2713671624660492, + "learning_rate": 0.0001407946555063102, + "loss": 1.4892, + "step": 22792 + }, + { + "epoch": 0.29618463947450063, + "grad_norm": 0.38295048475265503, + "learning_rate": 0.00014079205604439882, + "loss": 1.328, + "step": 22793 + }, + { + "epoch": 0.29619763401841653, + "grad_norm": 0.4896673858165741, + "learning_rate": 0.00014078945658248745, + "loss": 1.4202, + "step": 22794 + }, + { + "epoch": 0.2962106285623324, + "grad_norm": 0.47017353773117065, + "learning_rate": 0.00014078685712057604, + "loss": 1.4478, + "step": 22795 + }, + { + "epoch": 0.2962236231062483, + "grad_norm": 0.3654111623764038, + "learning_rate": 0.00014078425765866464, + "loss": 1.1031, + "step": 22796 + }, + { + "epoch": 0.2962366176501641, + "grad_norm": 0.30157145857810974, + "learning_rate": 0.0001407816581967533, + "loss": 1.3447, + "step": 22797 + }, + { + "epoch": 0.29624961219408, + "grad_norm": 0.45422545075416565, + "learning_rate": 0.0001407790587348419, + "loss": 1.4342, + "step": 22798 + }, + { + "epoch": 0.29626260673799587, + "grad_norm": 0.27733734250068665, + "learning_rate": 0.0001407764592729305, + "loss": 1.3901, + "step": 22799 + }, + { + "epoch": 0.29627560128191177, + "grad_norm": 0.47230327129364014, + "learning_rate": 0.0001407738598110191, + "loss": 1.4753, + "step": 22800 + }, + { + "epoch": 0.2962885958258276, + "grad_norm": 0.42203742265701294, + "learning_rate": 0.00014077126034910774, + "loss": 1.4846, + "step": 22801 + }, + { + "epoch": 0.2963015903697435, + "grad_norm": 0.40571749210357666, + "learning_rate": 0.00014076866088719636, + "loss": 1.347, + "step": 22802 + }, + { + "epoch": 0.29631458491365936, + "grad_norm": 0.5059195160865784, + "learning_rate": 0.00014076606142528496, + "loss": 1.5918, + "step": 22803 + }, + { + "epoch": 0.29632757945757526, + "grad_norm": 0.3949039578437805, + "learning_rate": 0.00014076346196337358, + "loss": 1.396, + "step": 22804 + }, + { + "epoch": 0.2963405740014911, + "grad_norm": 0.3804425299167633, + "learning_rate": 0.0001407608625014622, + "loss": 1.2227, + "step": 22805 + }, + { + "epoch": 0.296353568545407, + "grad_norm": 0.39050114154815674, + "learning_rate": 0.00014075826303955083, + "loss": 1.43, + "step": 22806 + }, + { + "epoch": 0.29636656308932285, + "grad_norm": 0.3925984799861908, + "learning_rate": 0.00014075566357763943, + "loss": 1.34, + "step": 22807 + }, + { + "epoch": 0.29637955763323875, + "grad_norm": 0.4106157422065735, + "learning_rate": 0.00014075306411572805, + "loss": 1.1953, + "step": 22808 + }, + { + "epoch": 0.2963925521771546, + "grad_norm": 0.30718663334846497, + "learning_rate": 0.00014075046465381668, + "loss": 1.4182, + "step": 22809 + }, + { + "epoch": 0.2964055467210705, + "grad_norm": 0.31157761812210083, + "learning_rate": 0.00014074786519190527, + "loss": 1.3287, + "step": 22810 + }, + { + "epoch": 0.29641854126498635, + "grad_norm": 0.3276211619377136, + "learning_rate": 0.0001407452657299939, + "loss": 1.3914, + "step": 22811 + }, + { + "epoch": 0.29643153580890225, + "grad_norm": 0.3580071032047272, + "learning_rate": 0.00014074266626808252, + "loss": 1.4117, + "step": 22812 + }, + { + "epoch": 0.2964445303528181, + "grad_norm": 0.3786758482456207, + "learning_rate": 0.00014074006680617112, + "loss": 1.4506, + "step": 22813 + }, + { + "epoch": 0.296457524896734, + "grad_norm": 0.3502340614795685, + "learning_rate": 0.00014073746734425975, + "loss": 1.4399, + "step": 22814 + }, + { + "epoch": 0.29647051944064984, + "grad_norm": 0.42407429218292236, + "learning_rate": 0.00014073486788234834, + "loss": 1.4414, + "step": 22815 + }, + { + "epoch": 0.29648351398456574, + "grad_norm": 0.34258678555488586, + "learning_rate": 0.000140732268420437, + "loss": 1.4689, + "step": 22816 + }, + { + "epoch": 0.2964965085284816, + "grad_norm": 0.4263593852519989, + "learning_rate": 0.0001407296689585256, + "loss": 1.4483, + "step": 22817 + }, + { + "epoch": 0.2965095030723975, + "grad_norm": 0.5015770792961121, + "learning_rate": 0.00014072706949661422, + "loss": 1.6009, + "step": 22818 + }, + { + "epoch": 0.29652249761631333, + "grad_norm": 0.3282269835472107, + "learning_rate": 0.0001407244700347028, + "loss": 1.3456, + "step": 22819 + }, + { + "epoch": 0.29653549216022923, + "grad_norm": 0.3061131238937378, + "learning_rate": 0.00014072187057279144, + "loss": 1.6779, + "step": 22820 + }, + { + "epoch": 0.2965484867041451, + "grad_norm": 0.3473403751850128, + "learning_rate": 0.00014071927111088006, + "loss": 1.4375, + "step": 22821 + }, + { + "epoch": 0.296561481248061, + "grad_norm": 0.41655465960502625, + "learning_rate": 0.00014071667164896866, + "loss": 1.5354, + "step": 22822 + }, + { + "epoch": 0.2965744757919768, + "grad_norm": 0.40060603618621826, + "learning_rate": 0.00014071407218705728, + "loss": 1.4237, + "step": 22823 + }, + { + "epoch": 0.2965874703358927, + "grad_norm": 0.3970382809638977, + "learning_rate": 0.0001407114727251459, + "loss": 1.2728, + "step": 22824 + }, + { + "epoch": 0.29660046487980857, + "grad_norm": 0.5272574424743652, + "learning_rate": 0.00014070887326323453, + "loss": 1.5391, + "step": 22825 + }, + { + "epoch": 0.29661345942372447, + "grad_norm": 0.40819504857063293, + "learning_rate": 0.00014070627380132313, + "loss": 1.5664, + "step": 22826 + }, + { + "epoch": 0.2966264539676403, + "grad_norm": 0.4603120982646942, + "learning_rate": 0.00014070367433941173, + "loss": 1.4675, + "step": 22827 + }, + { + "epoch": 0.2966394485115562, + "grad_norm": 0.3939252197742462, + "learning_rate": 0.00014070107487750038, + "loss": 1.4344, + "step": 22828 + }, + { + "epoch": 0.29665244305547206, + "grad_norm": 0.4045909345149994, + "learning_rate": 0.00014069847541558898, + "loss": 1.4307, + "step": 22829 + }, + { + "epoch": 0.29666543759938796, + "grad_norm": 0.32135406136512756, + "learning_rate": 0.0001406958759536776, + "loss": 1.3987, + "step": 22830 + }, + { + "epoch": 0.2966784321433038, + "grad_norm": 0.3731941282749176, + "learning_rate": 0.0001406932764917662, + "loss": 1.4561, + "step": 22831 + }, + { + "epoch": 0.2966914266872197, + "grad_norm": 0.443343847990036, + "learning_rate": 0.00014069067702985482, + "loss": 1.5198, + "step": 22832 + }, + { + "epoch": 0.29670442123113555, + "grad_norm": 0.3785175085067749, + "learning_rate": 0.00014068807756794345, + "loss": 1.4399, + "step": 22833 + }, + { + "epoch": 0.29671741577505145, + "grad_norm": 0.4535701274871826, + "learning_rate": 0.00014068547810603205, + "loss": 1.4742, + "step": 22834 + }, + { + "epoch": 0.2967304103189673, + "grad_norm": 0.3857325315475464, + "learning_rate": 0.00014068287864412067, + "loss": 1.443, + "step": 22835 + }, + { + "epoch": 0.2967434048628832, + "grad_norm": 0.3766133189201355, + "learning_rate": 0.0001406802791822093, + "loss": 1.3728, + "step": 22836 + }, + { + "epoch": 0.29675639940679904, + "grad_norm": 0.3807442784309387, + "learning_rate": 0.00014067767972029792, + "loss": 1.4144, + "step": 22837 + }, + { + "epoch": 0.29676939395071494, + "grad_norm": 0.3636348843574524, + "learning_rate": 0.00014067508025838652, + "loss": 1.3078, + "step": 22838 + }, + { + "epoch": 0.29678238849463084, + "grad_norm": 0.44451650977134705, + "learning_rate": 0.0001406724807964751, + "loss": 1.6234, + "step": 22839 + }, + { + "epoch": 0.2967953830385467, + "grad_norm": 0.4159308969974518, + "learning_rate": 0.00014066988133456376, + "loss": 1.3185, + "step": 22840 + }, + { + "epoch": 0.2968083775824626, + "grad_norm": 0.37671971321105957, + "learning_rate": 0.00014066728187265236, + "loss": 1.543, + "step": 22841 + }, + { + "epoch": 0.29682137212637844, + "grad_norm": 0.4537704885005951, + "learning_rate": 0.000140664682410741, + "loss": 1.5102, + "step": 22842 + }, + { + "epoch": 0.29683436667029434, + "grad_norm": 0.34417566657066345, + "learning_rate": 0.0001406620829488296, + "loss": 1.4457, + "step": 22843 + }, + { + "epoch": 0.2968473612142102, + "grad_norm": 0.3451550006866455, + "learning_rate": 0.0001406594834869182, + "loss": 1.5407, + "step": 22844 + }, + { + "epoch": 0.2968603557581261, + "grad_norm": 0.4700402319431305, + "learning_rate": 0.00014065688402500683, + "loss": 1.5849, + "step": 22845 + }, + { + "epoch": 0.2968733503020419, + "grad_norm": 0.41609492897987366, + "learning_rate": 0.00014065428456309543, + "loss": 1.6633, + "step": 22846 + }, + { + "epoch": 0.29688634484595783, + "grad_norm": 0.48423802852630615, + "learning_rate": 0.00014065168510118408, + "loss": 1.5358, + "step": 22847 + }, + { + "epoch": 0.2968993393898737, + "grad_norm": 0.360832542181015, + "learning_rate": 0.00014064908563927268, + "loss": 1.5546, + "step": 22848 + }, + { + "epoch": 0.2969123339337896, + "grad_norm": 0.3818899095058441, + "learning_rate": 0.0001406464861773613, + "loss": 1.3849, + "step": 22849 + }, + { + "epoch": 0.2969253284777054, + "grad_norm": 0.3651469647884369, + "learning_rate": 0.0001406438867154499, + "loss": 1.4863, + "step": 22850 + }, + { + "epoch": 0.2969383230216213, + "grad_norm": 0.4357500970363617, + "learning_rate": 0.00014064128725353853, + "loss": 1.4161, + "step": 22851 + }, + { + "epoch": 0.29695131756553717, + "grad_norm": 0.4675513803958893, + "learning_rate": 0.00014063868779162715, + "loss": 1.4354, + "step": 22852 + }, + { + "epoch": 0.29696431210945307, + "grad_norm": 0.36629101634025574, + "learning_rate": 0.00014063608832971575, + "loss": 1.3113, + "step": 22853 + }, + { + "epoch": 0.2969773066533689, + "grad_norm": 0.4100399315357208, + "learning_rate": 0.00014063348886780437, + "loss": 1.3803, + "step": 22854 + }, + { + "epoch": 0.2969903011972848, + "grad_norm": 0.3664558529853821, + "learning_rate": 0.000140630889405893, + "loss": 1.3159, + "step": 22855 + }, + { + "epoch": 0.29700329574120066, + "grad_norm": 0.2958900034427643, + "learning_rate": 0.0001406282899439816, + "loss": 1.1542, + "step": 22856 + }, + { + "epoch": 0.29701629028511656, + "grad_norm": 0.32148393988609314, + "learning_rate": 0.00014062569048207022, + "loss": 1.5216, + "step": 22857 + }, + { + "epoch": 0.2970292848290324, + "grad_norm": 0.37673118710517883, + "learning_rate": 0.00014062309102015882, + "loss": 1.288, + "step": 22858 + }, + { + "epoch": 0.2970422793729483, + "grad_norm": 0.36361804604530334, + "learning_rate": 0.00014062049155824747, + "loss": 1.5888, + "step": 22859 + }, + { + "epoch": 0.29705527391686415, + "grad_norm": 0.3854396641254425, + "learning_rate": 0.00014061789209633606, + "loss": 1.3321, + "step": 22860 + }, + { + "epoch": 0.29706826846078005, + "grad_norm": 0.3788318634033203, + "learning_rate": 0.0001406152926344247, + "loss": 1.3712, + "step": 22861 + }, + { + "epoch": 0.2970812630046959, + "grad_norm": 0.3288755416870117, + "learning_rate": 0.0001406126931725133, + "loss": 1.4977, + "step": 22862 + }, + { + "epoch": 0.2970942575486118, + "grad_norm": 0.35468214750289917, + "learning_rate": 0.0001406100937106019, + "loss": 1.3864, + "step": 22863 + }, + { + "epoch": 0.29710725209252764, + "grad_norm": 0.39672982692718506, + "learning_rate": 0.00014060749424869054, + "loss": 1.1931, + "step": 22864 + }, + { + "epoch": 0.29712024663644354, + "grad_norm": 0.3686577379703522, + "learning_rate": 0.00014060489478677913, + "loss": 1.4555, + "step": 22865 + }, + { + "epoch": 0.2971332411803594, + "grad_norm": 0.4682312607765198, + "learning_rate": 0.00014060229532486776, + "loss": 1.4446, + "step": 22866 + }, + { + "epoch": 0.2971462357242753, + "grad_norm": 0.3407752811908722, + "learning_rate": 0.00014059969586295638, + "loss": 1.3824, + "step": 22867 + }, + { + "epoch": 0.29715923026819113, + "grad_norm": 0.4182986617088318, + "learning_rate": 0.00014059709640104498, + "loss": 1.4133, + "step": 22868 + }, + { + "epoch": 0.29717222481210703, + "grad_norm": 0.4380069375038147, + "learning_rate": 0.0001405944969391336, + "loss": 1.4291, + "step": 22869 + }, + { + "epoch": 0.2971852193560229, + "grad_norm": 0.47763022780418396, + "learning_rate": 0.0001405918974772222, + "loss": 1.402, + "step": 22870 + }, + { + "epoch": 0.2971982138999388, + "grad_norm": 0.30499228835105896, + "learning_rate": 0.00014058929801531085, + "loss": 1.2681, + "step": 22871 + }, + { + "epoch": 0.2972112084438546, + "grad_norm": 0.4225768446922302, + "learning_rate": 0.00014058669855339945, + "loss": 1.5723, + "step": 22872 + }, + { + "epoch": 0.2972242029877705, + "grad_norm": 0.3985782265663147, + "learning_rate": 0.00014058409909148807, + "loss": 1.2473, + "step": 22873 + }, + { + "epoch": 0.29723719753168637, + "grad_norm": 0.3871857225894928, + "learning_rate": 0.00014058149962957667, + "loss": 1.4637, + "step": 22874 + }, + { + "epoch": 0.29725019207560227, + "grad_norm": 0.28158804774284363, + "learning_rate": 0.0001405789001676653, + "loss": 1.5074, + "step": 22875 + }, + { + "epoch": 0.2972631866195181, + "grad_norm": 0.40317776799201965, + "learning_rate": 0.00014057630070575392, + "loss": 1.4289, + "step": 22876 + }, + { + "epoch": 0.297276181163434, + "grad_norm": 0.29599377512931824, + "learning_rate": 0.00014057370124384252, + "loss": 1.3555, + "step": 22877 + }, + { + "epoch": 0.29728917570734986, + "grad_norm": 0.3496891260147095, + "learning_rate": 0.00014057110178193114, + "loss": 1.6558, + "step": 22878 + }, + { + "epoch": 0.29730217025126576, + "grad_norm": 0.38670259714126587, + "learning_rate": 0.00014056850232001977, + "loss": 1.2619, + "step": 22879 + }, + { + "epoch": 0.2973151647951816, + "grad_norm": 0.3137998580932617, + "learning_rate": 0.00014056590285810836, + "loss": 1.2972, + "step": 22880 + }, + { + "epoch": 0.2973281593390975, + "grad_norm": 0.47068697214126587, + "learning_rate": 0.000140563303396197, + "loss": 1.5808, + "step": 22881 + }, + { + "epoch": 0.29734115388301335, + "grad_norm": 0.5047212839126587, + "learning_rate": 0.00014056070393428561, + "loss": 1.4334, + "step": 22882 + }, + { + "epoch": 0.29735414842692925, + "grad_norm": 0.4658738076686859, + "learning_rate": 0.00014055810447237424, + "loss": 1.6403, + "step": 22883 + }, + { + "epoch": 0.2973671429708451, + "grad_norm": 0.411141574382782, + "learning_rate": 0.00014055550501046284, + "loss": 1.3849, + "step": 22884 + }, + { + "epoch": 0.297380137514761, + "grad_norm": 0.38476452231407166, + "learning_rate": 0.00014055290554855146, + "loss": 1.432, + "step": 22885 + }, + { + "epoch": 0.29739313205867685, + "grad_norm": 0.41409575939178467, + "learning_rate": 0.00014055030608664008, + "loss": 1.6512, + "step": 22886 + }, + { + "epoch": 0.29740612660259275, + "grad_norm": 0.3472932279109955, + "learning_rate": 0.00014054770662472868, + "loss": 1.3431, + "step": 22887 + }, + { + "epoch": 0.2974191211465086, + "grad_norm": 0.33591771125793457, + "learning_rate": 0.0001405451071628173, + "loss": 1.2956, + "step": 22888 + }, + { + "epoch": 0.2974321156904245, + "grad_norm": 0.24137292802333832, + "learning_rate": 0.0001405425077009059, + "loss": 1.2802, + "step": 22889 + }, + { + "epoch": 0.29744511023434034, + "grad_norm": 0.3404938280582428, + "learning_rate": 0.00014053990823899456, + "loss": 1.2568, + "step": 22890 + }, + { + "epoch": 0.29745810477825624, + "grad_norm": 0.33813583850860596, + "learning_rate": 0.00014053730877708315, + "loss": 1.2234, + "step": 22891 + }, + { + "epoch": 0.2974710993221721, + "grad_norm": 0.32802852988243103, + "learning_rate": 0.00014053470931517178, + "loss": 1.1815, + "step": 22892 + }, + { + "epoch": 0.297484093866088, + "grad_norm": 0.4178912937641144, + "learning_rate": 0.00014053210985326037, + "loss": 1.3813, + "step": 22893 + }, + { + "epoch": 0.29749708841000383, + "grad_norm": 0.32503899931907654, + "learning_rate": 0.000140529510391349, + "loss": 1.5414, + "step": 22894 + }, + { + "epoch": 0.29751008295391973, + "grad_norm": 0.4295741319656372, + "learning_rate": 0.00014052691092943762, + "loss": 1.5876, + "step": 22895 + }, + { + "epoch": 0.2975230774978356, + "grad_norm": 0.4701733887195587, + "learning_rate": 0.00014052431146752622, + "loss": 1.3618, + "step": 22896 + }, + { + "epoch": 0.2975360720417515, + "grad_norm": 0.33705857396125793, + "learning_rate": 0.00014052171200561485, + "loss": 1.345, + "step": 22897 + }, + { + "epoch": 0.2975490665856673, + "grad_norm": 0.4426041841506958, + "learning_rate": 0.00014051911254370347, + "loss": 1.5497, + "step": 22898 + }, + { + "epoch": 0.2975620611295832, + "grad_norm": 0.41073349118232727, + "learning_rate": 0.00014051651308179207, + "loss": 1.3098, + "step": 22899 + }, + { + "epoch": 0.29757505567349907, + "grad_norm": 0.34824374318122864, + "learning_rate": 0.0001405139136198807, + "loss": 1.4465, + "step": 22900 + }, + { + "epoch": 0.29758805021741497, + "grad_norm": 0.38049209117889404, + "learning_rate": 0.0001405113141579693, + "loss": 1.5442, + "step": 22901 + }, + { + "epoch": 0.2976010447613308, + "grad_norm": 0.3623986542224884, + "learning_rate": 0.00014050871469605794, + "loss": 1.407, + "step": 22902 + }, + { + "epoch": 0.2976140393052467, + "grad_norm": 0.4316590130329132, + "learning_rate": 0.00014050611523414654, + "loss": 1.3392, + "step": 22903 + }, + { + "epoch": 0.29762703384916256, + "grad_norm": 0.3892042338848114, + "learning_rate": 0.00014050351577223516, + "loss": 1.5788, + "step": 22904 + }, + { + "epoch": 0.29764002839307846, + "grad_norm": 0.3722061514854431, + "learning_rate": 0.00014050091631032376, + "loss": 1.3286, + "step": 22905 + }, + { + "epoch": 0.2976530229369943, + "grad_norm": 0.40507733821868896, + "learning_rate": 0.00014049831684841238, + "loss": 1.5016, + "step": 22906 + }, + { + "epoch": 0.2976660174809102, + "grad_norm": 0.3747061789035797, + "learning_rate": 0.000140495717386501, + "loss": 1.2771, + "step": 22907 + }, + { + "epoch": 0.29767901202482605, + "grad_norm": 0.4782727062702179, + "learning_rate": 0.0001404931179245896, + "loss": 1.4754, + "step": 22908 + }, + { + "epoch": 0.29769200656874195, + "grad_norm": 0.38695231080055237, + "learning_rate": 0.00014049051846267823, + "loss": 1.4502, + "step": 22909 + }, + { + "epoch": 0.2977050011126578, + "grad_norm": 0.3897361755371094, + "learning_rate": 0.00014048791900076686, + "loss": 1.4328, + "step": 22910 + }, + { + "epoch": 0.2977179956565737, + "grad_norm": 0.3365316092967987, + "learning_rate": 0.00014048531953885545, + "loss": 1.3066, + "step": 22911 + }, + { + "epoch": 0.29773099020048954, + "grad_norm": 0.3724667429924011, + "learning_rate": 0.00014048272007694408, + "loss": 1.2428, + "step": 22912 + }, + { + "epoch": 0.29774398474440544, + "grad_norm": 0.37072405219078064, + "learning_rate": 0.00014048012061503267, + "loss": 1.4466, + "step": 22913 + }, + { + "epoch": 0.2977569792883213, + "grad_norm": 0.3356662094593048, + "learning_rate": 0.00014047752115312133, + "loss": 1.477, + "step": 22914 + }, + { + "epoch": 0.2977699738322372, + "grad_norm": 0.3485557734966278, + "learning_rate": 0.00014047492169120992, + "loss": 1.6869, + "step": 22915 + }, + { + "epoch": 0.2977829683761531, + "grad_norm": 0.4210974872112274, + "learning_rate": 0.00014047232222929855, + "loss": 1.3762, + "step": 22916 + }, + { + "epoch": 0.29779596292006894, + "grad_norm": 0.3594282865524292, + "learning_rate": 0.00014046972276738717, + "loss": 1.3609, + "step": 22917 + }, + { + "epoch": 0.29780895746398484, + "grad_norm": 0.3362147808074951, + "learning_rate": 0.00014046712330547577, + "loss": 1.3309, + "step": 22918 + }, + { + "epoch": 0.2978219520079007, + "grad_norm": 0.4683127999305725, + "learning_rate": 0.0001404645238435644, + "loss": 1.4555, + "step": 22919 + }, + { + "epoch": 0.2978349465518166, + "grad_norm": 0.32860347628593445, + "learning_rate": 0.000140461924381653, + "loss": 1.2936, + "step": 22920 + }, + { + "epoch": 0.2978479410957324, + "grad_norm": 0.49081334471702576, + "learning_rate": 0.00014045932491974164, + "loss": 1.4337, + "step": 22921 + }, + { + "epoch": 0.2978609356396483, + "grad_norm": 0.42803627252578735, + "learning_rate": 0.00014045672545783024, + "loss": 1.4939, + "step": 22922 + }, + { + "epoch": 0.2978739301835642, + "grad_norm": 0.4179244041442871, + "learning_rate": 0.00014045412599591884, + "loss": 1.3672, + "step": 22923 + }, + { + "epoch": 0.2978869247274801, + "grad_norm": 0.4239777624607086, + "learning_rate": 0.00014045152653400746, + "loss": 1.3249, + "step": 22924 + }, + { + "epoch": 0.2978999192713959, + "grad_norm": 0.37392351031303406, + "learning_rate": 0.0001404489270720961, + "loss": 1.2115, + "step": 22925 + }, + { + "epoch": 0.2979129138153118, + "grad_norm": 0.39103397727012634, + "learning_rate": 0.0001404463276101847, + "loss": 1.382, + "step": 22926 + }, + { + "epoch": 0.29792590835922766, + "grad_norm": 0.3098256587982178, + "learning_rate": 0.0001404437281482733, + "loss": 1.3627, + "step": 22927 + }, + { + "epoch": 0.29793890290314357, + "grad_norm": 0.40651196241378784, + "learning_rate": 0.00014044112868636193, + "loss": 1.4748, + "step": 22928 + }, + { + "epoch": 0.2979518974470594, + "grad_norm": 0.4172825515270233, + "learning_rate": 0.00014043852922445056, + "loss": 1.5185, + "step": 22929 + }, + { + "epoch": 0.2979648919909753, + "grad_norm": 0.42139530181884766, + "learning_rate": 0.00014043592976253916, + "loss": 1.4345, + "step": 22930 + }, + { + "epoch": 0.29797788653489116, + "grad_norm": 0.4424646496772766, + "learning_rate": 0.00014043333030062778, + "loss": 1.4346, + "step": 22931 + }, + { + "epoch": 0.29799088107880706, + "grad_norm": 0.38615378737449646, + "learning_rate": 0.00014043073083871638, + "loss": 1.1767, + "step": 22932 + }, + { + "epoch": 0.2980038756227229, + "grad_norm": 0.44978195428848267, + "learning_rate": 0.00014042813137680503, + "loss": 1.4309, + "step": 22933 + }, + { + "epoch": 0.2980168701666388, + "grad_norm": 0.3788641393184662, + "learning_rate": 0.00014042553191489363, + "loss": 1.2985, + "step": 22934 + }, + { + "epoch": 0.29802986471055465, + "grad_norm": 0.4054301381111145, + "learning_rate": 0.00014042293245298222, + "loss": 1.3065, + "step": 22935 + }, + { + "epoch": 0.29804285925447055, + "grad_norm": 0.42570415139198303, + "learning_rate": 0.00014042033299107085, + "loss": 1.3586, + "step": 22936 + }, + { + "epoch": 0.2980558537983864, + "grad_norm": 0.312277227640152, + "learning_rate": 0.00014041773352915947, + "loss": 1.5036, + "step": 22937 + }, + { + "epoch": 0.2980688483423023, + "grad_norm": 0.43636733293533325, + "learning_rate": 0.0001404151340672481, + "loss": 1.4062, + "step": 22938 + }, + { + "epoch": 0.29808184288621814, + "grad_norm": 0.4913323223590851, + "learning_rate": 0.0001404125346053367, + "loss": 1.3741, + "step": 22939 + }, + { + "epoch": 0.29809483743013404, + "grad_norm": 0.4800909161567688, + "learning_rate": 0.00014040993514342532, + "loss": 1.4679, + "step": 22940 + }, + { + "epoch": 0.2981078319740499, + "grad_norm": 0.39575737714767456, + "learning_rate": 0.00014040733568151394, + "loss": 1.3649, + "step": 22941 + }, + { + "epoch": 0.2981208265179658, + "grad_norm": 0.41685548424720764, + "learning_rate": 0.00014040473621960254, + "loss": 1.2472, + "step": 22942 + }, + { + "epoch": 0.29813382106188163, + "grad_norm": 0.2750242054462433, + "learning_rate": 0.00014040213675769117, + "loss": 1.2321, + "step": 22943 + }, + { + "epoch": 0.29814681560579753, + "grad_norm": 0.42563796043395996, + "learning_rate": 0.00014039953729577976, + "loss": 1.4788, + "step": 22944 + }, + { + "epoch": 0.2981598101497134, + "grad_norm": 0.452678382396698, + "learning_rate": 0.00014039693783386841, + "loss": 1.7496, + "step": 22945 + }, + { + "epoch": 0.2981728046936293, + "grad_norm": 0.4093649089336395, + "learning_rate": 0.000140394338371957, + "loss": 1.4877, + "step": 22946 + }, + { + "epoch": 0.2981857992375451, + "grad_norm": 0.37021756172180176, + "learning_rate": 0.00014039173891004564, + "loss": 1.4622, + "step": 22947 + }, + { + "epoch": 0.298198793781461, + "grad_norm": 0.3859061300754547, + "learning_rate": 0.00014038913944813423, + "loss": 1.4197, + "step": 22948 + }, + { + "epoch": 0.29821178832537687, + "grad_norm": 0.3743303418159485, + "learning_rate": 0.00014038653998622286, + "loss": 1.4015, + "step": 22949 + }, + { + "epoch": 0.29822478286929277, + "grad_norm": 0.5168335437774658, + "learning_rate": 0.00014038394052431148, + "loss": 1.4413, + "step": 22950 + }, + { + "epoch": 0.2982377774132086, + "grad_norm": 0.44695064425468445, + "learning_rate": 0.00014038134106240008, + "loss": 1.3717, + "step": 22951 + }, + { + "epoch": 0.2982507719571245, + "grad_norm": 0.3422711491584778, + "learning_rate": 0.0001403787416004887, + "loss": 1.4909, + "step": 22952 + }, + { + "epoch": 0.29826376650104036, + "grad_norm": 0.49021583795547485, + "learning_rate": 0.00014037614213857733, + "loss": 1.472, + "step": 22953 + }, + { + "epoch": 0.29827676104495626, + "grad_norm": 0.40482184290885925, + "learning_rate": 0.00014037354267666593, + "loss": 1.1841, + "step": 22954 + }, + { + "epoch": 0.2982897555888721, + "grad_norm": 0.4326929450035095, + "learning_rate": 0.00014037094321475455, + "loss": 1.2391, + "step": 22955 + }, + { + "epoch": 0.298302750132788, + "grad_norm": 0.4419623613357544, + "learning_rate": 0.00014036834375284317, + "loss": 1.2396, + "step": 22956 + }, + { + "epoch": 0.29831574467670385, + "grad_norm": 0.4637749493122101, + "learning_rate": 0.0001403657442909318, + "loss": 1.4447, + "step": 22957 + }, + { + "epoch": 0.29832873922061975, + "grad_norm": 0.2633437216281891, + "learning_rate": 0.0001403631448290204, + "loss": 1.377, + "step": 22958 + }, + { + "epoch": 0.2983417337645356, + "grad_norm": 0.3231419026851654, + "learning_rate": 0.00014036054536710902, + "loss": 1.281, + "step": 22959 + }, + { + "epoch": 0.2983547283084515, + "grad_norm": 0.40608328580856323, + "learning_rate": 0.00014035794590519765, + "loss": 1.3984, + "step": 22960 + }, + { + "epoch": 0.29836772285236735, + "grad_norm": 0.4633846879005432, + "learning_rate": 0.00014035534644328624, + "loss": 1.7658, + "step": 22961 + }, + { + "epoch": 0.29838071739628325, + "grad_norm": 0.3306026756763458, + "learning_rate": 0.00014035274698137487, + "loss": 1.4273, + "step": 22962 + }, + { + "epoch": 0.2983937119401991, + "grad_norm": 0.4051123857498169, + "learning_rate": 0.00014035014751946347, + "loss": 1.3625, + "step": 22963 + }, + { + "epoch": 0.298406706484115, + "grad_norm": 0.4277397096157074, + "learning_rate": 0.0001403475480575521, + "loss": 1.4658, + "step": 22964 + }, + { + "epoch": 0.29841970102803084, + "grad_norm": 0.29705703258514404, + "learning_rate": 0.00014034494859564071, + "loss": 1.3797, + "step": 22965 + }, + { + "epoch": 0.29843269557194674, + "grad_norm": 0.3473593592643738, + "learning_rate": 0.0001403423491337293, + "loss": 1.2868, + "step": 22966 + }, + { + "epoch": 0.2984456901158626, + "grad_norm": 0.4127417802810669, + "learning_rate": 0.00014033974967181794, + "loss": 1.5055, + "step": 22967 + }, + { + "epoch": 0.2984586846597785, + "grad_norm": 0.2935155928134918, + "learning_rate": 0.00014033715020990656, + "loss": 1.3117, + "step": 22968 + }, + { + "epoch": 0.29847167920369433, + "grad_norm": 0.39981845021247864, + "learning_rate": 0.00014033455074799518, + "loss": 1.2983, + "step": 22969 + }, + { + "epoch": 0.29848467374761023, + "grad_norm": 0.49690040946006775, + "learning_rate": 0.00014033195128608378, + "loss": 1.3331, + "step": 22970 + }, + { + "epoch": 0.2984976682915261, + "grad_norm": 0.4131467044353485, + "learning_rate": 0.0001403293518241724, + "loss": 1.4258, + "step": 22971 + }, + { + "epoch": 0.298510662835442, + "grad_norm": 0.5347421765327454, + "learning_rate": 0.00014032675236226103, + "loss": 1.4407, + "step": 22972 + }, + { + "epoch": 0.2985236573793578, + "grad_norm": 0.4536976218223572, + "learning_rate": 0.00014032415290034963, + "loss": 1.3972, + "step": 22973 + }, + { + "epoch": 0.2985366519232737, + "grad_norm": 0.4558336138725281, + "learning_rate": 0.00014032155343843825, + "loss": 1.6056, + "step": 22974 + }, + { + "epoch": 0.29854964646718957, + "grad_norm": 0.4119022786617279, + "learning_rate": 0.00014031895397652685, + "loss": 1.4241, + "step": 22975 + }, + { + "epoch": 0.29856264101110547, + "grad_norm": 0.4163656532764435, + "learning_rate": 0.0001403163545146155, + "loss": 1.5093, + "step": 22976 + }, + { + "epoch": 0.2985756355550213, + "grad_norm": 0.2803994119167328, + "learning_rate": 0.0001403137550527041, + "loss": 1.2156, + "step": 22977 + }, + { + "epoch": 0.2985886300989372, + "grad_norm": 0.3968595564365387, + "learning_rate": 0.0001403111555907927, + "loss": 1.4486, + "step": 22978 + }, + { + "epoch": 0.29860162464285306, + "grad_norm": 0.3674934506416321, + "learning_rate": 0.00014030855612888132, + "loss": 1.5421, + "step": 22979 + }, + { + "epoch": 0.29861461918676896, + "grad_norm": 0.38411805033683777, + "learning_rate": 0.00014030595666696995, + "loss": 1.3499, + "step": 22980 + }, + { + "epoch": 0.2986276137306848, + "grad_norm": 0.5203903317451477, + "learning_rate": 0.00014030335720505857, + "loss": 1.5487, + "step": 22981 + }, + { + "epoch": 0.2986406082746007, + "grad_norm": 0.414045125246048, + "learning_rate": 0.00014030075774314717, + "loss": 1.5586, + "step": 22982 + }, + { + "epoch": 0.29865360281851655, + "grad_norm": 0.30370041728019714, + "learning_rate": 0.0001402981582812358, + "loss": 1.2743, + "step": 22983 + }, + { + "epoch": 0.29866659736243245, + "grad_norm": 0.3711134195327759, + "learning_rate": 0.00014029555881932442, + "loss": 1.4256, + "step": 22984 + }, + { + "epoch": 0.2986795919063483, + "grad_norm": 0.43841317296028137, + "learning_rate": 0.00014029295935741301, + "loss": 1.446, + "step": 22985 + }, + { + "epoch": 0.2986925864502642, + "grad_norm": 0.4599684774875641, + "learning_rate": 0.00014029035989550164, + "loss": 1.3719, + "step": 22986 + }, + { + "epoch": 0.29870558099418004, + "grad_norm": 0.4624248445034027, + "learning_rate": 0.00014028776043359024, + "loss": 1.4111, + "step": 22987 + }, + { + "epoch": 0.29871857553809594, + "grad_norm": 0.41547250747680664, + "learning_rate": 0.0001402851609716789, + "loss": 1.587, + "step": 22988 + }, + { + "epoch": 0.2987315700820118, + "grad_norm": 0.44423219561576843, + "learning_rate": 0.00014028256150976748, + "loss": 1.4156, + "step": 22989 + }, + { + "epoch": 0.2987445646259277, + "grad_norm": 0.33778834342956543, + "learning_rate": 0.00014027996204785608, + "loss": 1.1865, + "step": 22990 + }, + { + "epoch": 0.2987575591698436, + "grad_norm": 0.4605976939201355, + "learning_rate": 0.00014027736258594473, + "loss": 1.4909, + "step": 22991 + }, + { + "epoch": 0.29877055371375943, + "grad_norm": 0.43227675557136536, + "learning_rate": 0.00014027476312403333, + "loss": 1.6118, + "step": 22992 + }, + { + "epoch": 0.29878354825767534, + "grad_norm": 0.43669548630714417, + "learning_rate": 0.00014027216366212196, + "loss": 1.5071, + "step": 22993 + }, + { + "epoch": 0.2987965428015912, + "grad_norm": 0.3404190242290497, + "learning_rate": 0.00014026956420021055, + "loss": 1.4458, + "step": 22994 + }, + { + "epoch": 0.2988095373455071, + "grad_norm": 0.40627357363700867, + "learning_rate": 0.00014026696473829918, + "loss": 1.4604, + "step": 22995 + }, + { + "epoch": 0.2988225318894229, + "grad_norm": 0.5124145150184631, + "learning_rate": 0.0001402643652763878, + "loss": 1.2954, + "step": 22996 + }, + { + "epoch": 0.2988355264333388, + "grad_norm": 0.4437195956707001, + "learning_rate": 0.0001402617658144764, + "loss": 1.3254, + "step": 22997 + }, + { + "epoch": 0.2988485209772547, + "grad_norm": 0.33872994780540466, + "learning_rate": 0.00014025916635256502, + "loss": 1.242, + "step": 22998 + }, + { + "epoch": 0.2988615155211706, + "grad_norm": 0.4779316186904907, + "learning_rate": 0.00014025656689065365, + "loss": 1.4364, + "step": 22999 + }, + { + "epoch": 0.2988745100650864, + "grad_norm": 0.46553751826286316, + "learning_rate": 0.00014025396742874227, + "loss": 1.4367, + "step": 23000 + }, + { + "epoch": 0.2988875046090023, + "grad_norm": 0.5074887871742249, + "learning_rate": 0.00014025136796683087, + "loss": 1.3992, + "step": 23001 + }, + { + "epoch": 0.29890049915291816, + "grad_norm": 0.3086678087711334, + "learning_rate": 0.00014024876850491947, + "loss": 1.3144, + "step": 23002 + }, + { + "epoch": 0.29891349369683407, + "grad_norm": 0.45222100615501404, + "learning_rate": 0.00014024616904300812, + "loss": 1.4294, + "step": 23003 + }, + { + "epoch": 0.2989264882407499, + "grad_norm": 0.42579686641693115, + "learning_rate": 0.00014024356958109672, + "loss": 1.4623, + "step": 23004 + }, + { + "epoch": 0.2989394827846658, + "grad_norm": 0.23428599536418915, + "learning_rate": 0.00014024097011918534, + "loss": 1.1466, + "step": 23005 + }, + { + "epoch": 0.29895247732858166, + "grad_norm": 0.4408715069293976, + "learning_rate": 0.00014023837065727394, + "loss": 1.4951, + "step": 23006 + }, + { + "epoch": 0.29896547187249756, + "grad_norm": 0.34068456292152405, + "learning_rate": 0.00014023577119536256, + "loss": 1.3378, + "step": 23007 + }, + { + "epoch": 0.2989784664164134, + "grad_norm": 0.44888728857040405, + "learning_rate": 0.0001402331717334512, + "loss": 1.3897, + "step": 23008 + }, + { + "epoch": 0.2989914609603293, + "grad_norm": 0.35491135716438293, + "learning_rate": 0.00014023057227153978, + "loss": 1.3693, + "step": 23009 + }, + { + "epoch": 0.29900445550424515, + "grad_norm": 0.38651758432388306, + "learning_rate": 0.0001402279728096284, + "loss": 1.3369, + "step": 23010 + }, + { + "epoch": 0.29901745004816105, + "grad_norm": 0.3042561709880829, + "learning_rate": 0.00014022537334771703, + "loss": 1.3415, + "step": 23011 + }, + { + "epoch": 0.2990304445920769, + "grad_norm": 0.42663854360580444, + "learning_rate": 0.00014022277388580566, + "loss": 1.5087, + "step": 23012 + }, + { + "epoch": 0.2990434391359928, + "grad_norm": 0.5376585721969604, + "learning_rate": 0.00014022017442389426, + "loss": 1.5156, + "step": 23013 + }, + { + "epoch": 0.29905643367990864, + "grad_norm": 0.4362395405769348, + "learning_rate": 0.00014021757496198288, + "loss": 1.3487, + "step": 23014 + }, + { + "epoch": 0.29906942822382454, + "grad_norm": 0.38434898853302, + "learning_rate": 0.0001402149755000715, + "loss": 1.3713, + "step": 23015 + }, + { + "epoch": 0.2990824227677404, + "grad_norm": 0.396319717168808, + "learning_rate": 0.0001402123760381601, + "loss": 1.3007, + "step": 23016 + }, + { + "epoch": 0.2990954173116563, + "grad_norm": 0.4017016589641571, + "learning_rate": 0.00014020977657624873, + "loss": 1.3074, + "step": 23017 + }, + { + "epoch": 0.29910841185557213, + "grad_norm": 0.3336453437805176, + "learning_rate": 0.00014020717711433732, + "loss": 1.342, + "step": 23018 + }, + { + "epoch": 0.29912140639948803, + "grad_norm": 0.32417479157447815, + "learning_rate": 0.00014020457765242595, + "loss": 1.3644, + "step": 23019 + }, + { + "epoch": 0.2991344009434039, + "grad_norm": 0.3886622488498688, + "learning_rate": 0.00014020197819051457, + "loss": 1.2076, + "step": 23020 + }, + { + "epoch": 0.2991473954873198, + "grad_norm": 0.44363850355148315, + "learning_rate": 0.00014019937872860317, + "loss": 1.3943, + "step": 23021 + }, + { + "epoch": 0.2991603900312356, + "grad_norm": 0.4339125454425812, + "learning_rate": 0.0001401967792666918, + "loss": 1.5611, + "step": 23022 + }, + { + "epoch": 0.2991733845751515, + "grad_norm": 0.5525286197662354, + "learning_rate": 0.00014019417980478042, + "loss": 1.3549, + "step": 23023 + }, + { + "epoch": 0.29918637911906737, + "grad_norm": 0.3635425269603729, + "learning_rate": 0.00014019158034286904, + "loss": 1.2835, + "step": 23024 + }, + { + "epoch": 0.29919937366298327, + "grad_norm": 0.7514035701751709, + "learning_rate": 0.00014018898088095764, + "loss": 1.3743, + "step": 23025 + }, + { + "epoch": 0.2992123682068991, + "grad_norm": 0.3714289367198944, + "learning_rate": 0.00014018638141904627, + "loss": 1.3128, + "step": 23026 + }, + { + "epoch": 0.299225362750815, + "grad_norm": 0.4191341698169708, + "learning_rate": 0.0001401837819571349, + "loss": 1.4467, + "step": 23027 + }, + { + "epoch": 0.29923835729473086, + "grad_norm": 0.37737709283828735, + "learning_rate": 0.0001401811824952235, + "loss": 1.2746, + "step": 23028 + }, + { + "epoch": 0.29925135183864676, + "grad_norm": 0.35028916597366333, + "learning_rate": 0.0001401785830333121, + "loss": 1.3699, + "step": 23029 + }, + { + "epoch": 0.2992643463825626, + "grad_norm": 0.3594965636730194, + "learning_rate": 0.00014017598357140074, + "loss": 1.4369, + "step": 23030 + }, + { + "epoch": 0.2992773409264785, + "grad_norm": 0.3582841157913208, + "learning_rate": 0.00014017338410948936, + "loss": 1.4876, + "step": 23031 + }, + { + "epoch": 0.29929033547039435, + "grad_norm": 0.3327685594558716, + "learning_rate": 0.00014017078464757796, + "loss": 1.6182, + "step": 23032 + }, + { + "epoch": 0.29930333001431025, + "grad_norm": 0.37973544001579285, + "learning_rate": 0.00014016818518566656, + "loss": 1.3351, + "step": 23033 + }, + { + "epoch": 0.2993163245582261, + "grad_norm": 0.3545832633972168, + "learning_rate": 0.0001401655857237552, + "loss": 1.3836, + "step": 23034 + }, + { + "epoch": 0.299329319102142, + "grad_norm": 0.39142656326293945, + "learning_rate": 0.0001401629862618438, + "loss": 1.4798, + "step": 23035 + }, + { + "epoch": 0.29934231364605784, + "grad_norm": 0.3702673316001892, + "learning_rate": 0.00014016038679993243, + "loss": 1.2912, + "step": 23036 + }, + { + "epoch": 0.29935530818997375, + "grad_norm": 0.4710434675216675, + "learning_rate": 0.00014015778733802103, + "loss": 1.5469, + "step": 23037 + }, + { + "epoch": 0.2993683027338896, + "grad_norm": 0.4846549928188324, + "learning_rate": 0.00014015518787610965, + "loss": 1.5023, + "step": 23038 + }, + { + "epoch": 0.2993812972778055, + "grad_norm": 0.2982243299484253, + "learning_rate": 0.00014015258841419828, + "loss": 1.3477, + "step": 23039 + }, + { + "epoch": 0.29939429182172134, + "grad_norm": 0.42551302909851074, + "learning_rate": 0.00014014998895228687, + "loss": 1.4403, + "step": 23040 + }, + { + "epoch": 0.29940728636563724, + "grad_norm": 0.34329622983932495, + "learning_rate": 0.0001401473894903755, + "loss": 1.2153, + "step": 23041 + }, + { + "epoch": 0.2994202809095531, + "grad_norm": 0.308676540851593, + "learning_rate": 0.00014014479002846412, + "loss": 1.4109, + "step": 23042 + }, + { + "epoch": 0.299433275453469, + "grad_norm": 0.33030661940574646, + "learning_rate": 0.00014014219056655275, + "loss": 1.4284, + "step": 23043 + }, + { + "epoch": 0.29944626999738483, + "grad_norm": 0.36275357007980347, + "learning_rate": 0.00014013959110464134, + "loss": 1.2559, + "step": 23044 + }, + { + "epoch": 0.29945926454130073, + "grad_norm": 0.3830799460411072, + "learning_rate": 0.00014013699164272994, + "loss": 1.3998, + "step": 23045 + }, + { + "epoch": 0.2994722590852166, + "grad_norm": 0.35228869318962097, + "learning_rate": 0.0001401343921808186, + "loss": 1.3099, + "step": 23046 + }, + { + "epoch": 0.2994852536291325, + "grad_norm": 0.32076987624168396, + "learning_rate": 0.0001401317927189072, + "loss": 1.4854, + "step": 23047 + }, + { + "epoch": 0.2994982481730483, + "grad_norm": 0.4578842520713806, + "learning_rate": 0.00014012919325699581, + "loss": 1.4898, + "step": 23048 + }, + { + "epoch": 0.2995112427169642, + "grad_norm": 0.43700602650642395, + "learning_rate": 0.0001401265937950844, + "loss": 1.5548, + "step": 23049 + }, + { + "epoch": 0.29952423726088007, + "grad_norm": 0.3971840739250183, + "learning_rate": 0.00014012399433317304, + "loss": 1.3444, + "step": 23050 + }, + { + "epoch": 0.29953723180479597, + "grad_norm": 0.44781315326690674, + "learning_rate": 0.00014012139487126166, + "loss": 1.5399, + "step": 23051 + }, + { + "epoch": 0.2995502263487118, + "grad_norm": 0.42125004529953003, + "learning_rate": 0.00014011879540935026, + "loss": 1.2929, + "step": 23052 + }, + { + "epoch": 0.2995632208926277, + "grad_norm": 0.4396473467350006, + "learning_rate": 0.00014011619594743888, + "loss": 1.5312, + "step": 23053 + }, + { + "epoch": 0.29957621543654356, + "grad_norm": 0.3253374993801117, + "learning_rate": 0.0001401135964855275, + "loss": 1.5133, + "step": 23054 + }, + { + "epoch": 0.29958920998045946, + "grad_norm": 0.5467861294746399, + "learning_rate": 0.00014011099702361613, + "loss": 1.4122, + "step": 23055 + }, + { + "epoch": 0.2996022045243753, + "grad_norm": 0.32921624183654785, + "learning_rate": 0.00014010839756170473, + "loss": 1.4513, + "step": 23056 + }, + { + "epoch": 0.2996151990682912, + "grad_norm": 0.36699020862579346, + "learning_rate": 0.00014010579809979333, + "loss": 1.2875, + "step": 23057 + }, + { + "epoch": 0.29962819361220705, + "grad_norm": 0.41242703795433044, + "learning_rate": 0.00014010319863788198, + "loss": 1.3874, + "step": 23058 + }, + { + "epoch": 0.29964118815612295, + "grad_norm": 0.39426305890083313, + "learning_rate": 0.00014010059917597058, + "loss": 1.5611, + "step": 23059 + }, + { + "epoch": 0.2996541827000388, + "grad_norm": 0.4162886142730713, + "learning_rate": 0.0001400979997140592, + "loss": 1.3413, + "step": 23060 + }, + { + "epoch": 0.2996671772439547, + "grad_norm": 0.32781875133514404, + "learning_rate": 0.0001400954002521478, + "loss": 1.3974, + "step": 23061 + }, + { + "epoch": 0.29968017178787054, + "grad_norm": 0.43102505803108215, + "learning_rate": 0.00014009280079023642, + "loss": 1.3725, + "step": 23062 + }, + { + "epoch": 0.29969316633178644, + "grad_norm": 0.37109994888305664, + "learning_rate": 0.00014009020132832505, + "loss": 1.537, + "step": 23063 + }, + { + "epoch": 0.2997061608757023, + "grad_norm": 0.4329696595668793, + "learning_rate": 0.00014008760186641364, + "loss": 1.4747, + "step": 23064 + }, + { + "epoch": 0.2997191554196182, + "grad_norm": 0.3689322769641876, + "learning_rate": 0.0001400850024045023, + "loss": 1.4872, + "step": 23065 + }, + { + "epoch": 0.29973214996353403, + "grad_norm": 0.2741352617740631, + "learning_rate": 0.0001400824029425909, + "loss": 1.1206, + "step": 23066 + }, + { + "epoch": 0.29974514450744993, + "grad_norm": 0.45320233702659607, + "learning_rate": 0.00014007980348067952, + "loss": 1.5149, + "step": 23067 + }, + { + "epoch": 0.29975813905136584, + "grad_norm": 0.5117446184158325, + "learning_rate": 0.00014007720401876811, + "loss": 1.5151, + "step": 23068 + }, + { + "epoch": 0.2997711335952817, + "grad_norm": 0.3062855005264282, + "learning_rate": 0.00014007460455685674, + "loss": 1.4095, + "step": 23069 + }, + { + "epoch": 0.2997841281391976, + "grad_norm": 0.3812221586704254, + "learning_rate": 0.00014007200509494536, + "loss": 1.3836, + "step": 23070 + }, + { + "epoch": 0.2997971226831134, + "grad_norm": 0.4222165048122406, + "learning_rate": 0.00014006940563303396, + "loss": 1.4239, + "step": 23071 + }, + { + "epoch": 0.2998101172270293, + "grad_norm": 0.4736957550048828, + "learning_rate": 0.00014006680617112259, + "loss": 1.5331, + "step": 23072 + }, + { + "epoch": 0.29982311177094517, + "grad_norm": 0.4582005739212036, + "learning_rate": 0.0001400642067092112, + "loss": 1.434, + "step": 23073 + }, + { + "epoch": 0.2998361063148611, + "grad_norm": 0.41837435960769653, + "learning_rate": 0.0001400616072472998, + "loss": 1.2434, + "step": 23074 + }, + { + "epoch": 0.2998491008587769, + "grad_norm": 0.44448328018188477, + "learning_rate": 0.00014005900778538843, + "loss": 1.63, + "step": 23075 + }, + { + "epoch": 0.2998620954026928, + "grad_norm": 0.3510551154613495, + "learning_rate": 0.00014005640832347703, + "loss": 1.3541, + "step": 23076 + }, + { + "epoch": 0.29987508994660866, + "grad_norm": 0.3945156931877136, + "learning_rate": 0.00014005380886156568, + "loss": 1.3171, + "step": 23077 + }, + { + "epoch": 0.29988808449052456, + "grad_norm": 0.2724815011024475, + "learning_rate": 0.00014005120939965428, + "loss": 1.161, + "step": 23078 + }, + { + "epoch": 0.2999010790344404, + "grad_norm": 0.4988335371017456, + "learning_rate": 0.0001400486099377429, + "loss": 1.3636, + "step": 23079 + }, + { + "epoch": 0.2999140735783563, + "grad_norm": 0.4144153594970703, + "learning_rate": 0.0001400460104758315, + "loss": 1.4459, + "step": 23080 + }, + { + "epoch": 0.29992706812227216, + "grad_norm": 0.35689717531204224, + "learning_rate": 0.00014004341101392012, + "loss": 1.335, + "step": 23081 + }, + { + "epoch": 0.29994006266618806, + "grad_norm": 0.444684237241745, + "learning_rate": 0.00014004081155200875, + "loss": 1.4402, + "step": 23082 + }, + { + "epoch": 0.2999530572101039, + "grad_norm": 0.3817139267921448, + "learning_rate": 0.00014003821209009735, + "loss": 1.4088, + "step": 23083 + }, + { + "epoch": 0.2999660517540198, + "grad_norm": 0.4249608814716339, + "learning_rate": 0.00014003561262818597, + "loss": 1.5001, + "step": 23084 + }, + { + "epoch": 0.29997904629793565, + "grad_norm": 0.3391485810279846, + "learning_rate": 0.0001400330131662746, + "loss": 1.6685, + "step": 23085 + }, + { + "epoch": 0.29999204084185155, + "grad_norm": 0.47837451100349426, + "learning_rate": 0.0001400304137043632, + "loss": 1.5881, + "step": 23086 + }, + { + "epoch": 0.3000050353857674, + "grad_norm": 0.33048415184020996, + "learning_rate": 0.00014002781424245182, + "loss": 1.2404, + "step": 23087 + }, + { + "epoch": 0.3000180299296833, + "grad_norm": 0.45419323444366455, + "learning_rate": 0.00014002521478054041, + "loss": 1.4115, + "step": 23088 + }, + { + "epoch": 0.30003102447359914, + "grad_norm": 0.4446699917316437, + "learning_rate": 0.00014002261531862907, + "loss": 1.4144, + "step": 23089 + }, + { + "epoch": 0.30004401901751504, + "grad_norm": 0.32597842812538147, + "learning_rate": 0.00014002001585671766, + "loss": 1.4544, + "step": 23090 + }, + { + "epoch": 0.3000570135614309, + "grad_norm": 0.35293325781822205, + "learning_rate": 0.0001400174163948063, + "loss": 1.3647, + "step": 23091 + }, + { + "epoch": 0.3000700081053468, + "grad_norm": 0.3385941684246063, + "learning_rate": 0.00014001481693289489, + "loss": 1.1412, + "step": 23092 + }, + { + "epoch": 0.30008300264926263, + "grad_norm": 0.4868931770324707, + "learning_rate": 0.0001400122174709835, + "loss": 1.311, + "step": 23093 + }, + { + "epoch": 0.30009599719317853, + "grad_norm": 0.3418442904949188, + "learning_rate": 0.00014000961800907213, + "loss": 1.3247, + "step": 23094 + }, + { + "epoch": 0.3001089917370944, + "grad_norm": 0.41976794600486755, + "learning_rate": 0.00014000701854716073, + "loss": 1.4714, + "step": 23095 + }, + { + "epoch": 0.3001219862810103, + "grad_norm": 0.42028120160102844, + "learning_rate": 0.00014000441908524936, + "loss": 1.5775, + "step": 23096 + }, + { + "epoch": 0.3001349808249261, + "grad_norm": 0.46938571333885193, + "learning_rate": 0.00014000181962333798, + "loss": 1.3944, + "step": 23097 + }, + { + "epoch": 0.300147975368842, + "grad_norm": 0.45536401867866516, + "learning_rate": 0.0001399992201614266, + "loss": 1.526, + "step": 23098 + }, + { + "epoch": 0.30016096991275787, + "grad_norm": 0.34066665172576904, + "learning_rate": 0.0001399966206995152, + "loss": 1.311, + "step": 23099 + }, + { + "epoch": 0.30017396445667377, + "grad_norm": 0.35779157280921936, + "learning_rate": 0.0001399940212376038, + "loss": 1.3758, + "step": 23100 + }, + { + "epoch": 0.3001869590005896, + "grad_norm": 0.4279794692993164, + "learning_rate": 0.00013999142177569245, + "loss": 1.4, + "step": 23101 + }, + { + "epoch": 0.3001999535445055, + "grad_norm": 0.33099332451820374, + "learning_rate": 0.00013998882231378105, + "loss": 1.3823, + "step": 23102 + }, + { + "epoch": 0.30021294808842136, + "grad_norm": 0.42596107721328735, + "learning_rate": 0.00013998622285186967, + "loss": 1.4782, + "step": 23103 + }, + { + "epoch": 0.30022594263233726, + "grad_norm": 0.4145480990409851, + "learning_rate": 0.0001399836233899583, + "loss": 1.3681, + "step": 23104 + }, + { + "epoch": 0.3002389371762531, + "grad_norm": 0.2508307099342346, + "learning_rate": 0.0001399810239280469, + "loss": 1.3107, + "step": 23105 + }, + { + "epoch": 0.300251931720169, + "grad_norm": 0.45985832810401917, + "learning_rate": 0.00013997842446613552, + "loss": 1.3947, + "step": 23106 + }, + { + "epoch": 0.30026492626408485, + "grad_norm": 0.40395018458366394, + "learning_rate": 0.00013997582500422412, + "loss": 1.4372, + "step": 23107 + }, + { + "epoch": 0.30027792080800075, + "grad_norm": 0.41711878776550293, + "learning_rate": 0.00013997322554231277, + "loss": 1.5646, + "step": 23108 + }, + { + "epoch": 0.3002909153519166, + "grad_norm": 0.39429208636283875, + "learning_rate": 0.00013997062608040137, + "loss": 1.3827, + "step": 23109 + }, + { + "epoch": 0.3003039098958325, + "grad_norm": 0.4032944142818451, + "learning_rate": 0.00013996802661849, + "loss": 1.5617, + "step": 23110 + }, + { + "epoch": 0.30031690443974834, + "grad_norm": 0.31945449113845825, + "learning_rate": 0.0001399654271565786, + "loss": 1.2784, + "step": 23111 + }, + { + "epoch": 0.30032989898366425, + "grad_norm": 0.309740275144577, + "learning_rate": 0.0001399628276946672, + "loss": 1.2569, + "step": 23112 + }, + { + "epoch": 0.3003428935275801, + "grad_norm": 0.338676393032074, + "learning_rate": 0.00013996022823275584, + "loss": 1.4824, + "step": 23113 + }, + { + "epoch": 0.300355888071496, + "grad_norm": 0.4370415508747101, + "learning_rate": 0.00013995762877084443, + "loss": 1.722, + "step": 23114 + }, + { + "epoch": 0.30036888261541184, + "grad_norm": 0.4167619049549103, + "learning_rate": 0.00013995502930893306, + "loss": 1.3218, + "step": 23115 + }, + { + "epoch": 0.30038187715932774, + "grad_norm": 0.3589772880077362, + "learning_rate": 0.00013995242984702168, + "loss": 1.3254, + "step": 23116 + }, + { + "epoch": 0.3003948717032436, + "grad_norm": 0.395468145608902, + "learning_rate": 0.00013994983038511028, + "loss": 1.3911, + "step": 23117 + }, + { + "epoch": 0.3004078662471595, + "grad_norm": 0.5283803939819336, + "learning_rate": 0.0001399472309231989, + "loss": 1.5771, + "step": 23118 + }, + { + "epoch": 0.30042086079107533, + "grad_norm": 0.34886303544044495, + "learning_rate": 0.0001399446314612875, + "loss": 1.5127, + "step": 23119 + }, + { + "epoch": 0.30043385533499123, + "grad_norm": 0.4214344322681427, + "learning_rate": 0.00013994203199937615, + "loss": 1.4522, + "step": 23120 + }, + { + "epoch": 0.3004468498789071, + "grad_norm": 0.3599209189414978, + "learning_rate": 0.00013993943253746475, + "loss": 1.4764, + "step": 23121 + }, + { + "epoch": 0.300459844422823, + "grad_norm": 0.3773268163204193, + "learning_rate": 0.00013993683307555338, + "loss": 1.3414, + "step": 23122 + }, + { + "epoch": 0.3004728389667388, + "grad_norm": 0.3108392357826233, + "learning_rate": 0.00013993423361364197, + "loss": 1.2555, + "step": 23123 + }, + { + "epoch": 0.3004858335106547, + "grad_norm": 0.4072191119194031, + "learning_rate": 0.0001399316341517306, + "loss": 1.3073, + "step": 23124 + }, + { + "epoch": 0.30049882805457057, + "grad_norm": 0.4205571711063385, + "learning_rate": 0.00013992903468981922, + "loss": 1.4057, + "step": 23125 + }, + { + "epoch": 0.30051182259848647, + "grad_norm": 0.481813907623291, + "learning_rate": 0.00013992643522790782, + "loss": 1.4237, + "step": 23126 + }, + { + "epoch": 0.3005248171424023, + "grad_norm": 0.46717846393585205, + "learning_rate": 0.00013992383576599644, + "loss": 1.321, + "step": 23127 + }, + { + "epoch": 0.3005378116863182, + "grad_norm": 0.3609062731266022, + "learning_rate": 0.00013992123630408507, + "loss": 1.4154, + "step": 23128 + }, + { + "epoch": 0.30055080623023406, + "grad_norm": 0.37674105167388916, + "learning_rate": 0.00013991863684217367, + "loss": 1.2722, + "step": 23129 + }, + { + "epoch": 0.30056380077414996, + "grad_norm": 0.5592606067657471, + "learning_rate": 0.0001399160373802623, + "loss": 1.268, + "step": 23130 + }, + { + "epoch": 0.3005767953180658, + "grad_norm": 0.32691535353660583, + "learning_rate": 0.0001399134379183509, + "loss": 1.3656, + "step": 23131 + }, + { + "epoch": 0.3005897898619817, + "grad_norm": 0.36100149154663086, + "learning_rate": 0.00013991083845643954, + "loss": 1.3526, + "step": 23132 + }, + { + "epoch": 0.30060278440589755, + "grad_norm": 0.4244671165943146, + "learning_rate": 0.00013990823899452814, + "loss": 1.543, + "step": 23133 + }, + { + "epoch": 0.30061577894981345, + "grad_norm": 0.40261876583099365, + "learning_rate": 0.00013990563953261676, + "loss": 1.445, + "step": 23134 + }, + { + "epoch": 0.3006287734937293, + "grad_norm": 0.35691580176353455, + "learning_rate": 0.00013990304007070536, + "loss": 1.4398, + "step": 23135 + }, + { + "epoch": 0.3006417680376452, + "grad_norm": 0.4153222441673279, + "learning_rate": 0.00013990044060879398, + "loss": 1.289, + "step": 23136 + }, + { + "epoch": 0.30065476258156104, + "grad_norm": 0.5712962746620178, + "learning_rate": 0.0001398978411468826, + "loss": 1.5766, + "step": 23137 + }, + { + "epoch": 0.30066775712547694, + "grad_norm": 0.3895614743232727, + "learning_rate": 0.0001398952416849712, + "loss": 1.3107, + "step": 23138 + }, + { + "epoch": 0.3006807516693928, + "grad_norm": 0.4627132713794708, + "learning_rate": 0.00013989264222305986, + "loss": 1.3964, + "step": 23139 + }, + { + "epoch": 0.3006937462133087, + "grad_norm": 0.42954251170158386, + "learning_rate": 0.00013989004276114845, + "loss": 1.2456, + "step": 23140 + }, + { + "epoch": 0.30070674075722453, + "grad_norm": 0.36836904287338257, + "learning_rate": 0.00013988744329923705, + "loss": 1.4635, + "step": 23141 + }, + { + "epoch": 0.30071973530114043, + "grad_norm": 0.3514178693294525, + "learning_rate": 0.00013988484383732568, + "loss": 1.3925, + "step": 23142 + }, + { + "epoch": 0.30073272984505633, + "grad_norm": 0.44862452149391174, + "learning_rate": 0.0001398822443754143, + "loss": 1.438, + "step": 23143 + }, + { + "epoch": 0.3007457243889722, + "grad_norm": 0.5299797654151917, + "learning_rate": 0.00013987964491350292, + "loss": 1.4577, + "step": 23144 + }, + { + "epoch": 0.3007587189328881, + "grad_norm": 0.4457918703556061, + "learning_rate": 0.00013987704545159152, + "loss": 1.6489, + "step": 23145 + }, + { + "epoch": 0.3007717134768039, + "grad_norm": 0.4561452865600586, + "learning_rate": 0.00013987444598968015, + "loss": 1.3688, + "step": 23146 + }, + { + "epoch": 0.3007847080207198, + "grad_norm": 0.44524261355400085, + "learning_rate": 0.00013987184652776877, + "loss": 1.4955, + "step": 23147 + }, + { + "epoch": 0.30079770256463567, + "grad_norm": 0.41511401534080505, + "learning_rate": 0.00013986924706585737, + "loss": 1.1362, + "step": 23148 + }, + { + "epoch": 0.3008106971085516, + "grad_norm": 0.42589128017425537, + "learning_rate": 0.000139866647603946, + "loss": 1.4206, + "step": 23149 + }, + { + "epoch": 0.3008236916524674, + "grad_norm": 0.38631191849708557, + "learning_rate": 0.0001398640481420346, + "loss": 1.3594, + "step": 23150 + }, + { + "epoch": 0.3008366861963833, + "grad_norm": 0.3594703674316406, + "learning_rate": 0.00013986144868012324, + "loss": 1.5019, + "step": 23151 + }, + { + "epoch": 0.30084968074029916, + "grad_norm": 0.37220489978790283, + "learning_rate": 0.00013985884921821184, + "loss": 1.4041, + "step": 23152 + }, + { + "epoch": 0.30086267528421506, + "grad_norm": 0.3390521705150604, + "learning_rate": 0.00013985624975630046, + "loss": 1.5505, + "step": 23153 + }, + { + "epoch": 0.3008756698281309, + "grad_norm": 0.3974217474460602, + "learning_rate": 0.00013985365029438906, + "loss": 1.4717, + "step": 23154 + }, + { + "epoch": 0.3008886643720468, + "grad_norm": 0.38737231492996216, + "learning_rate": 0.00013985105083247769, + "loss": 1.2632, + "step": 23155 + }, + { + "epoch": 0.30090165891596266, + "grad_norm": 0.38725540041923523, + "learning_rate": 0.0001398484513705663, + "loss": 1.4045, + "step": 23156 + }, + { + "epoch": 0.30091465345987856, + "grad_norm": 0.46442216634750366, + "learning_rate": 0.0001398458519086549, + "loss": 1.3241, + "step": 23157 + }, + { + "epoch": 0.3009276480037944, + "grad_norm": 0.4750121235847473, + "learning_rate": 0.00013984325244674353, + "loss": 1.5128, + "step": 23158 + }, + { + "epoch": 0.3009406425477103, + "grad_norm": 0.37743815779685974, + "learning_rate": 0.00013984065298483216, + "loss": 1.5687, + "step": 23159 + }, + { + "epoch": 0.30095363709162615, + "grad_norm": 0.43370598554611206, + "learning_rate": 0.00013983805352292075, + "loss": 1.2567, + "step": 23160 + }, + { + "epoch": 0.30096663163554205, + "grad_norm": 0.3631453812122345, + "learning_rate": 0.00013983545406100938, + "loss": 1.365, + "step": 23161 + }, + { + "epoch": 0.3009796261794579, + "grad_norm": 0.2913861870765686, + "learning_rate": 0.00013983285459909798, + "loss": 1.4156, + "step": 23162 + }, + { + "epoch": 0.3009926207233738, + "grad_norm": 0.4645940363407135, + "learning_rate": 0.00013983025513718663, + "loss": 1.4296, + "step": 23163 + }, + { + "epoch": 0.30100561526728964, + "grad_norm": 0.3260025978088379, + "learning_rate": 0.00013982765567527522, + "loss": 1.2212, + "step": 23164 + }, + { + "epoch": 0.30101860981120554, + "grad_norm": 0.37264716625213623, + "learning_rate": 0.00013982505621336385, + "loss": 1.6016, + "step": 23165 + }, + { + "epoch": 0.3010316043551214, + "grad_norm": 0.3139488101005554, + "learning_rate": 0.00013982245675145245, + "loss": 1.3311, + "step": 23166 + }, + { + "epoch": 0.3010445988990373, + "grad_norm": 0.39901939034461975, + "learning_rate": 0.00013981985728954107, + "loss": 1.5765, + "step": 23167 + }, + { + "epoch": 0.30105759344295313, + "grad_norm": 0.42495572566986084, + "learning_rate": 0.0001398172578276297, + "loss": 1.611, + "step": 23168 + }, + { + "epoch": 0.30107058798686903, + "grad_norm": 0.37210941314697266, + "learning_rate": 0.0001398146583657183, + "loss": 1.5092, + "step": 23169 + }, + { + "epoch": 0.3010835825307849, + "grad_norm": 0.4773200452327728, + "learning_rate": 0.00013981205890380692, + "loss": 1.4687, + "step": 23170 + }, + { + "epoch": 0.3010965770747008, + "grad_norm": 0.3680039644241333, + "learning_rate": 0.00013980945944189554, + "loss": 1.4688, + "step": 23171 + }, + { + "epoch": 0.3011095716186166, + "grad_norm": 0.40396901965141296, + "learning_rate": 0.00013980685997998414, + "loss": 1.4098, + "step": 23172 + }, + { + "epoch": 0.3011225661625325, + "grad_norm": 0.29876863956451416, + "learning_rate": 0.00013980426051807276, + "loss": 1.2946, + "step": 23173 + }, + { + "epoch": 0.30113556070644837, + "grad_norm": 0.337939590215683, + "learning_rate": 0.00013980166105616136, + "loss": 1.3637, + "step": 23174 + }, + { + "epoch": 0.30114855525036427, + "grad_norm": 0.3283732831478119, + "learning_rate": 0.00013979906159425, + "loss": 1.3861, + "step": 23175 + }, + { + "epoch": 0.3011615497942801, + "grad_norm": 0.35722818970680237, + "learning_rate": 0.0001397964621323386, + "loss": 1.3949, + "step": 23176 + }, + { + "epoch": 0.301174544338196, + "grad_norm": 0.5842116475105286, + "learning_rate": 0.00013979386267042723, + "loss": 1.4274, + "step": 23177 + }, + { + "epoch": 0.30118753888211186, + "grad_norm": 0.37793973088264465, + "learning_rate": 0.00013979126320851586, + "loss": 1.375, + "step": 23178 + }, + { + "epoch": 0.30120053342602776, + "grad_norm": 0.40954336524009705, + "learning_rate": 0.00013978866374660446, + "loss": 1.5104, + "step": 23179 + }, + { + "epoch": 0.3012135279699436, + "grad_norm": 0.38299664855003357, + "learning_rate": 0.00013978606428469308, + "loss": 1.3905, + "step": 23180 + }, + { + "epoch": 0.3012265225138595, + "grad_norm": 0.3575705587863922, + "learning_rate": 0.00013978346482278168, + "loss": 1.3747, + "step": 23181 + }, + { + "epoch": 0.30123951705777535, + "grad_norm": 0.34922829270362854, + "learning_rate": 0.00013978086536087033, + "loss": 1.3434, + "step": 23182 + }, + { + "epoch": 0.30125251160169125, + "grad_norm": 0.3702467381954193, + "learning_rate": 0.00013977826589895893, + "loss": 1.4672, + "step": 23183 + }, + { + "epoch": 0.3012655061456071, + "grad_norm": 0.35502973198890686, + "learning_rate": 0.00013977566643704752, + "loss": 1.3925, + "step": 23184 + }, + { + "epoch": 0.301278500689523, + "grad_norm": 0.4230843484401703, + "learning_rate": 0.00013977306697513615, + "loss": 1.4039, + "step": 23185 + }, + { + "epoch": 0.30129149523343884, + "grad_norm": 0.4708244204521179, + "learning_rate": 0.00013977046751322477, + "loss": 1.435, + "step": 23186 + }, + { + "epoch": 0.30130448977735474, + "grad_norm": 0.314054012298584, + "learning_rate": 0.0001397678680513134, + "loss": 1.4189, + "step": 23187 + }, + { + "epoch": 0.3013174843212706, + "grad_norm": 0.45249736309051514, + "learning_rate": 0.000139765268589402, + "loss": 1.3491, + "step": 23188 + }, + { + "epoch": 0.3013304788651865, + "grad_norm": 0.45952489972114563, + "learning_rate": 0.00013976266912749062, + "loss": 1.5901, + "step": 23189 + }, + { + "epoch": 0.30134347340910234, + "grad_norm": 0.3819359540939331, + "learning_rate": 0.00013976006966557924, + "loss": 1.2112, + "step": 23190 + }, + { + "epoch": 0.30135646795301824, + "grad_norm": 0.3522294759750366, + "learning_rate": 0.00013975747020366784, + "loss": 1.2512, + "step": 23191 + }, + { + "epoch": 0.3013694624969341, + "grad_norm": 0.36187493801116943, + "learning_rate": 0.00013975487074175647, + "loss": 1.3138, + "step": 23192 + }, + { + "epoch": 0.30138245704085, + "grad_norm": 0.4160166084766388, + "learning_rate": 0.00013975227127984506, + "loss": 1.3973, + "step": 23193 + }, + { + "epoch": 0.30139545158476583, + "grad_norm": 0.36178678274154663, + "learning_rate": 0.00013974967181793372, + "loss": 1.355, + "step": 23194 + }, + { + "epoch": 0.30140844612868173, + "grad_norm": 0.5714994668960571, + "learning_rate": 0.0001397470723560223, + "loss": 1.5371, + "step": 23195 + }, + { + "epoch": 0.3014214406725976, + "grad_norm": 0.40891629457473755, + "learning_rate": 0.0001397444728941109, + "loss": 1.6034, + "step": 23196 + }, + { + "epoch": 0.3014344352165135, + "grad_norm": 0.4667307138442993, + "learning_rate": 0.00013974187343219953, + "loss": 1.4555, + "step": 23197 + }, + { + "epoch": 0.3014474297604293, + "grad_norm": 0.39944300055503845, + "learning_rate": 0.00013973927397028816, + "loss": 1.4359, + "step": 23198 + }, + { + "epoch": 0.3014604243043452, + "grad_norm": 0.36141452193260193, + "learning_rate": 0.00013973667450837678, + "loss": 1.2626, + "step": 23199 + }, + { + "epoch": 0.30147341884826107, + "grad_norm": 0.4503568708896637, + "learning_rate": 0.00013973407504646538, + "loss": 1.2701, + "step": 23200 + }, + { + "epoch": 0.30148641339217697, + "grad_norm": 0.3694996237754822, + "learning_rate": 0.000139731475584554, + "loss": 1.1636, + "step": 23201 + }, + { + "epoch": 0.3014994079360928, + "grad_norm": 0.3991595506668091, + "learning_rate": 0.00013972887612264263, + "loss": 1.513, + "step": 23202 + }, + { + "epoch": 0.3015124024800087, + "grad_norm": 0.4340282082557678, + "learning_rate": 0.00013972627666073123, + "loss": 1.5162, + "step": 23203 + }, + { + "epoch": 0.30152539702392456, + "grad_norm": 0.37580135464668274, + "learning_rate": 0.00013972367719881985, + "loss": 1.3635, + "step": 23204 + }, + { + "epoch": 0.30153839156784046, + "grad_norm": 0.457426518201828, + "learning_rate": 0.00013972107773690845, + "loss": 1.3668, + "step": 23205 + }, + { + "epoch": 0.3015513861117563, + "grad_norm": 0.4442581236362457, + "learning_rate": 0.0001397184782749971, + "loss": 1.463, + "step": 23206 + }, + { + "epoch": 0.3015643806556722, + "grad_norm": 0.26839351654052734, + "learning_rate": 0.0001397158788130857, + "loss": 1.3775, + "step": 23207 + }, + { + "epoch": 0.30157737519958805, + "grad_norm": 0.31884241104125977, + "learning_rate": 0.0001397132793511743, + "loss": 1.3226, + "step": 23208 + }, + { + "epoch": 0.30159036974350395, + "grad_norm": 0.305759459733963, + "learning_rate": 0.00013971067988926292, + "loss": 1.344, + "step": 23209 + }, + { + "epoch": 0.3016033642874198, + "grad_norm": 0.35471776127815247, + "learning_rate": 0.00013970808042735154, + "loss": 1.3643, + "step": 23210 + }, + { + "epoch": 0.3016163588313357, + "grad_norm": 0.4749649465084076, + "learning_rate": 0.00013970548096544017, + "loss": 1.2789, + "step": 23211 + }, + { + "epoch": 0.30162935337525154, + "grad_norm": 0.3606630861759186, + "learning_rate": 0.00013970288150352877, + "loss": 1.2798, + "step": 23212 + }, + { + "epoch": 0.30164234791916744, + "grad_norm": 0.4138084948062897, + "learning_rate": 0.0001397002820416174, + "loss": 1.3784, + "step": 23213 + }, + { + "epoch": 0.3016553424630833, + "grad_norm": 0.4225277602672577, + "learning_rate": 0.00013969768257970602, + "loss": 1.5316, + "step": 23214 + }, + { + "epoch": 0.3016683370069992, + "grad_norm": 0.5402829647064209, + "learning_rate": 0.0001396950831177946, + "loss": 1.3919, + "step": 23215 + }, + { + "epoch": 0.30168133155091503, + "grad_norm": 0.37081778049468994, + "learning_rate": 0.00013969248365588324, + "loss": 1.2534, + "step": 23216 + }, + { + "epoch": 0.30169432609483093, + "grad_norm": 0.4349399507045746, + "learning_rate": 0.00013968988419397186, + "loss": 1.3948, + "step": 23217 + }, + { + "epoch": 0.3017073206387468, + "grad_norm": 0.4180797338485718, + "learning_rate": 0.00013968728473206049, + "loss": 1.4315, + "step": 23218 + }, + { + "epoch": 0.3017203151826627, + "grad_norm": 0.34367337822914124, + "learning_rate": 0.00013968468527014908, + "loss": 1.3839, + "step": 23219 + }, + { + "epoch": 0.3017333097265786, + "grad_norm": 0.4184907376766205, + "learning_rate": 0.0001396820858082377, + "loss": 1.3688, + "step": 23220 + }, + { + "epoch": 0.3017463042704944, + "grad_norm": 0.26103711128234863, + "learning_rate": 0.00013967948634632633, + "loss": 1.4243, + "step": 23221 + }, + { + "epoch": 0.3017592988144103, + "grad_norm": 0.5270993113517761, + "learning_rate": 0.00013967688688441493, + "loss": 1.3227, + "step": 23222 + }, + { + "epoch": 0.30177229335832617, + "grad_norm": 0.3441081643104553, + "learning_rate": 0.00013967428742250355, + "loss": 1.4017, + "step": 23223 + }, + { + "epoch": 0.30178528790224207, + "grad_norm": 0.3834455907344818, + "learning_rate": 0.00013967168796059215, + "loss": 1.2201, + "step": 23224 + }, + { + "epoch": 0.3017982824461579, + "grad_norm": 0.5141701102256775, + "learning_rate": 0.00013966908849868078, + "loss": 1.4855, + "step": 23225 + }, + { + "epoch": 0.3018112769900738, + "grad_norm": 0.30219364166259766, + "learning_rate": 0.0001396664890367694, + "loss": 1.2627, + "step": 23226 + }, + { + "epoch": 0.30182427153398966, + "grad_norm": 0.35873734951019287, + "learning_rate": 0.000139663889574858, + "loss": 1.3197, + "step": 23227 + }, + { + "epoch": 0.30183726607790556, + "grad_norm": 0.4135015606880188, + "learning_rate": 0.00013966129011294662, + "loss": 1.6383, + "step": 23228 + }, + { + "epoch": 0.3018502606218214, + "grad_norm": 0.432526558637619, + "learning_rate": 0.00013965869065103525, + "loss": 1.5395, + "step": 23229 + }, + { + "epoch": 0.3018632551657373, + "grad_norm": 0.3854345381259918, + "learning_rate": 0.00013965609118912387, + "loss": 1.5485, + "step": 23230 + }, + { + "epoch": 0.30187624970965315, + "grad_norm": 0.4011388421058655, + "learning_rate": 0.00013965349172721247, + "loss": 1.248, + "step": 23231 + }, + { + "epoch": 0.30188924425356906, + "grad_norm": 0.3831024169921875, + "learning_rate": 0.0001396508922653011, + "loss": 1.3951, + "step": 23232 + }, + { + "epoch": 0.3019022387974849, + "grad_norm": 0.41663724184036255, + "learning_rate": 0.00013964829280338972, + "loss": 1.705, + "step": 23233 + }, + { + "epoch": 0.3019152333414008, + "grad_norm": 0.450585275888443, + "learning_rate": 0.00013964569334147832, + "loss": 1.4459, + "step": 23234 + }, + { + "epoch": 0.30192822788531665, + "grad_norm": 0.40704357624053955, + "learning_rate": 0.00013964309387956694, + "loss": 1.2648, + "step": 23235 + }, + { + "epoch": 0.30194122242923255, + "grad_norm": 0.34594273567199707, + "learning_rate": 0.00013964049441765554, + "loss": 1.5103, + "step": 23236 + }, + { + "epoch": 0.3019542169731484, + "grad_norm": 0.32361677289009094, + "learning_rate": 0.0001396378949557442, + "loss": 1.3969, + "step": 23237 + }, + { + "epoch": 0.3019672115170643, + "grad_norm": 0.5021398663520813, + "learning_rate": 0.00013963529549383279, + "loss": 1.5684, + "step": 23238 + }, + { + "epoch": 0.30198020606098014, + "grad_norm": 0.41178274154663086, + "learning_rate": 0.00013963269603192138, + "loss": 1.2743, + "step": 23239 + }, + { + "epoch": 0.30199320060489604, + "grad_norm": 0.4706447124481201, + "learning_rate": 0.00013963009657001, + "loss": 1.3471, + "step": 23240 + }, + { + "epoch": 0.3020061951488119, + "grad_norm": 0.3766452670097351, + "learning_rate": 0.00013962749710809863, + "loss": 1.4541, + "step": 23241 + }, + { + "epoch": 0.3020191896927278, + "grad_norm": 0.30931195616722107, + "learning_rate": 0.00013962489764618726, + "loss": 1.232, + "step": 23242 + }, + { + "epoch": 0.30203218423664363, + "grad_norm": 0.43955984711647034, + "learning_rate": 0.00013962229818427585, + "loss": 1.2698, + "step": 23243 + }, + { + "epoch": 0.30204517878055953, + "grad_norm": 0.31557130813598633, + "learning_rate": 0.00013961969872236448, + "loss": 1.4353, + "step": 23244 + }, + { + "epoch": 0.3020581733244754, + "grad_norm": 0.374362975358963, + "learning_rate": 0.0001396170992604531, + "loss": 1.3089, + "step": 23245 + }, + { + "epoch": 0.3020711678683913, + "grad_norm": 0.28767290711402893, + "learning_rate": 0.0001396144997985417, + "loss": 1.287, + "step": 23246 + }, + { + "epoch": 0.3020841624123071, + "grad_norm": 0.39166656136512756, + "learning_rate": 0.00013961190033663032, + "loss": 1.4792, + "step": 23247 + }, + { + "epoch": 0.302097156956223, + "grad_norm": 0.33174094557762146, + "learning_rate": 0.00013960930087471892, + "loss": 1.2213, + "step": 23248 + }, + { + "epoch": 0.30211015150013887, + "grad_norm": 0.42901986837387085, + "learning_rate": 0.00013960670141280757, + "loss": 1.3961, + "step": 23249 + }, + { + "epoch": 0.30212314604405477, + "grad_norm": 0.4432438910007477, + "learning_rate": 0.00013960410195089617, + "loss": 1.2548, + "step": 23250 + }, + { + "epoch": 0.3021361405879706, + "grad_norm": 0.3181210458278656, + "learning_rate": 0.00013960150248898477, + "loss": 1.3349, + "step": 23251 + }, + { + "epoch": 0.3021491351318865, + "grad_norm": 0.3390786349773407, + "learning_rate": 0.00013959890302707342, + "loss": 1.4645, + "step": 23252 + }, + { + "epoch": 0.30216212967580236, + "grad_norm": 0.3404589295387268, + "learning_rate": 0.00013959630356516202, + "loss": 1.3128, + "step": 23253 + }, + { + "epoch": 0.30217512421971826, + "grad_norm": 0.30252063274383545, + "learning_rate": 0.00013959370410325064, + "loss": 1.1207, + "step": 23254 + }, + { + "epoch": 0.3021881187636341, + "grad_norm": 0.2729305028915405, + "learning_rate": 0.00013959110464133924, + "loss": 1.5024, + "step": 23255 + }, + { + "epoch": 0.30220111330755, + "grad_norm": 0.30618563294410706, + "learning_rate": 0.00013958850517942786, + "loss": 1.1178, + "step": 23256 + }, + { + "epoch": 0.30221410785146585, + "grad_norm": 0.3727412223815918, + "learning_rate": 0.0001395859057175165, + "loss": 1.4083, + "step": 23257 + }, + { + "epoch": 0.30222710239538175, + "grad_norm": 0.3510522246360779, + "learning_rate": 0.00013958330625560509, + "loss": 1.5114, + "step": 23258 + }, + { + "epoch": 0.3022400969392976, + "grad_norm": 0.3374330699443817, + "learning_rate": 0.0001395807067936937, + "loss": 1.107, + "step": 23259 + }, + { + "epoch": 0.3022530914832135, + "grad_norm": 0.4756379723548889, + "learning_rate": 0.00013957810733178233, + "loss": 1.5095, + "step": 23260 + }, + { + "epoch": 0.30226608602712934, + "grad_norm": 0.3482968807220459, + "learning_rate": 0.00013957550786987096, + "loss": 1.171, + "step": 23261 + }, + { + "epoch": 0.30227908057104524, + "grad_norm": 0.31565549969673157, + "learning_rate": 0.00013957290840795956, + "loss": 1.4808, + "step": 23262 + }, + { + "epoch": 0.3022920751149611, + "grad_norm": 0.4882533848285675, + "learning_rate": 0.00013957030894604815, + "loss": 1.3616, + "step": 23263 + }, + { + "epoch": 0.302305069658877, + "grad_norm": 0.25556454062461853, + "learning_rate": 0.0001395677094841368, + "loss": 1.2692, + "step": 23264 + }, + { + "epoch": 0.30231806420279284, + "grad_norm": 0.3218751847743988, + "learning_rate": 0.0001395651100222254, + "loss": 1.4677, + "step": 23265 + }, + { + "epoch": 0.30233105874670874, + "grad_norm": 0.3643142580986023, + "learning_rate": 0.00013956251056031403, + "loss": 1.5306, + "step": 23266 + }, + { + "epoch": 0.3023440532906246, + "grad_norm": 0.33794793486595154, + "learning_rate": 0.00013955991109840262, + "loss": 1.5031, + "step": 23267 + }, + { + "epoch": 0.3023570478345405, + "grad_norm": 0.35743266344070435, + "learning_rate": 0.00013955731163649125, + "loss": 1.2518, + "step": 23268 + }, + { + "epoch": 0.3023700423784563, + "grad_norm": 0.41200628876686096, + "learning_rate": 0.00013955471217457987, + "loss": 1.3647, + "step": 23269 + }, + { + "epoch": 0.30238303692237223, + "grad_norm": 0.4245513379573822, + "learning_rate": 0.00013955211271266847, + "loss": 1.2949, + "step": 23270 + }, + { + "epoch": 0.3023960314662881, + "grad_norm": 0.40892940759658813, + "learning_rate": 0.0001395495132507571, + "loss": 1.3158, + "step": 23271 + }, + { + "epoch": 0.302409026010204, + "grad_norm": 0.2947690784931183, + "learning_rate": 0.00013954691378884572, + "loss": 1.4545, + "step": 23272 + }, + { + "epoch": 0.3024220205541198, + "grad_norm": 0.4097446799278259, + "learning_rate": 0.00013954431432693434, + "loss": 1.3724, + "step": 23273 + }, + { + "epoch": 0.3024350150980357, + "grad_norm": 0.39230504631996155, + "learning_rate": 0.00013954171486502294, + "loss": 1.2693, + "step": 23274 + }, + { + "epoch": 0.30244800964195157, + "grad_norm": 0.3813019394874573, + "learning_rate": 0.00013953911540311157, + "loss": 1.538, + "step": 23275 + }, + { + "epoch": 0.30246100418586747, + "grad_norm": 0.38401761651039124, + "learning_rate": 0.0001395365159412002, + "loss": 1.3451, + "step": 23276 + }, + { + "epoch": 0.3024739987297833, + "grad_norm": 0.4438844621181488, + "learning_rate": 0.0001395339164792888, + "loss": 1.4136, + "step": 23277 + }, + { + "epoch": 0.3024869932736992, + "grad_norm": 0.49788713455200195, + "learning_rate": 0.0001395313170173774, + "loss": 1.5324, + "step": 23278 + }, + { + "epoch": 0.30249998781761506, + "grad_norm": 0.4323347508907318, + "learning_rate": 0.000139528717555466, + "loss": 1.5753, + "step": 23279 + }, + { + "epoch": 0.30251298236153096, + "grad_norm": 0.432938814163208, + "learning_rate": 0.00013952611809355463, + "loss": 1.598, + "step": 23280 + }, + { + "epoch": 0.3025259769054468, + "grad_norm": 0.47529903054237366, + "learning_rate": 0.00013952351863164326, + "loss": 1.433, + "step": 23281 + }, + { + "epoch": 0.3025389714493627, + "grad_norm": 0.2561517059803009, + "learning_rate": 0.00013952091916973186, + "loss": 1.4588, + "step": 23282 + }, + { + "epoch": 0.30255196599327855, + "grad_norm": 0.37453752756118774, + "learning_rate": 0.00013951831970782048, + "loss": 1.3911, + "step": 23283 + }, + { + "epoch": 0.30256496053719445, + "grad_norm": 0.3329564034938812, + "learning_rate": 0.0001395157202459091, + "loss": 1.4052, + "step": 23284 + }, + { + "epoch": 0.3025779550811103, + "grad_norm": 0.45538899302482605, + "learning_rate": 0.00013951312078399773, + "loss": 1.5904, + "step": 23285 + }, + { + "epoch": 0.3025909496250262, + "grad_norm": 0.4049103260040283, + "learning_rate": 0.00013951052132208633, + "loss": 1.4333, + "step": 23286 + }, + { + "epoch": 0.30260394416894204, + "grad_norm": 0.41849347949028015, + "learning_rate": 0.00013950792186017495, + "loss": 1.4867, + "step": 23287 + }, + { + "epoch": 0.30261693871285794, + "grad_norm": 0.3378688395023346, + "learning_rate": 0.00013950532239826358, + "loss": 1.2821, + "step": 23288 + }, + { + "epoch": 0.3026299332567738, + "grad_norm": 0.317399799823761, + "learning_rate": 0.00013950272293635217, + "loss": 1.2492, + "step": 23289 + }, + { + "epoch": 0.3026429278006897, + "grad_norm": 0.32929331064224243, + "learning_rate": 0.0001395001234744408, + "loss": 1.4858, + "step": 23290 + }, + { + "epoch": 0.30265592234460553, + "grad_norm": 0.41066640615463257, + "learning_rate": 0.00013949752401252942, + "loss": 1.4886, + "step": 23291 + }, + { + "epoch": 0.30266891688852143, + "grad_norm": 0.44785404205322266, + "learning_rate": 0.00013949492455061802, + "loss": 1.403, + "step": 23292 + }, + { + "epoch": 0.3026819114324373, + "grad_norm": 0.3656848967075348, + "learning_rate": 0.00013949232508870664, + "loss": 1.3891, + "step": 23293 + }, + { + "epoch": 0.3026949059763532, + "grad_norm": 0.47436001896858215, + "learning_rate": 0.00013948972562679524, + "loss": 1.4735, + "step": 23294 + }, + { + "epoch": 0.302707900520269, + "grad_norm": 0.3732442557811737, + "learning_rate": 0.0001394871261648839, + "loss": 1.3321, + "step": 23295 + }, + { + "epoch": 0.3027208950641849, + "grad_norm": 0.5020295977592468, + "learning_rate": 0.0001394845267029725, + "loss": 1.4719, + "step": 23296 + }, + { + "epoch": 0.3027338896081008, + "grad_norm": 0.45214781165122986, + "learning_rate": 0.00013948192724106112, + "loss": 1.4315, + "step": 23297 + }, + { + "epoch": 0.30274688415201667, + "grad_norm": 0.3320401608943939, + "learning_rate": 0.0001394793277791497, + "loss": 1.4257, + "step": 23298 + }, + { + "epoch": 0.30275987869593257, + "grad_norm": 0.357349693775177, + "learning_rate": 0.00013947672831723834, + "loss": 1.4468, + "step": 23299 + }, + { + "epoch": 0.3027728732398484, + "grad_norm": 0.4401451051235199, + "learning_rate": 0.00013947412885532696, + "loss": 1.5007, + "step": 23300 + }, + { + "epoch": 0.3027858677837643, + "grad_norm": 0.4379909932613373, + "learning_rate": 0.00013947152939341556, + "loss": 1.4731, + "step": 23301 + }, + { + "epoch": 0.30279886232768016, + "grad_norm": 0.44491416215896606, + "learning_rate": 0.00013946892993150418, + "loss": 1.4754, + "step": 23302 + }, + { + "epoch": 0.30281185687159606, + "grad_norm": 0.5017706751823425, + "learning_rate": 0.0001394663304695928, + "loss": 1.4334, + "step": 23303 + }, + { + "epoch": 0.3028248514155119, + "grad_norm": 0.4530334770679474, + "learning_rate": 0.00013946373100768143, + "loss": 1.475, + "step": 23304 + }, + { + "epoch": 0.3028378459594278, + "grad_norm": 0.4294649660587311, + "learning_rate": 0.00013946113154577003, + "loss": 1.3431, + "step": 23305 + }, + { + "epoch": 0.30285084050334365, + "grad_norm": 0.3677988648414612, + "learning_rate": 0.00013945853208385863, + "loss": 1.482, + "step": 23306 + }, + { + "epoch": 0.30286383504725956, + "grad_norm": 0.3039516806602478, + "learning_rate": 0.00013945593262194728, + "loss": 1.1731, + "step": 23307 + }, + { + "epoch": 0.3028768295911754, + "grad_norm": 0.3287495970726013, + "learning_rate": 0.00013945333316003588, + "loss": 1.3002, + "step": 23308 + }, + { + "epoch": 0.3028898241350913, + "grad_norm": 0.3238582909107208, + "learning_rate": 0.0001394507336981245, + "loss": 1.3247, + "step": 23309 + }, + { + "epoch": 0.30290281867900715, + "grad_norm": 0.46046897768974304, + "learning_rate": 0.0001394481342362131, + "loss": 1.5692, + "step": 23310 + }, + { + "epoch": 0.30291581322292305, + "grad_norm": 0.4231153428554535, + "learning_rate": 0.00013944553477430172, + "loss": 1.5392, + "step": 23311 + }, + { + "epoch": 0.3029288077668389, + "grad_norm": 0.46809911727905273, + "learning_rate": 0.00013944293531239035, + "loss": 1.4985, + "step": 23312 + }, + { + "epoch": 0.3029418023107548, + "grad_norm": 0.3634107708930969, + "learning_rate": 0.00013944033585047894, + "loss": 1.3924, + "step": 23313 + }, + { + "epoch": 0.30295479685467064, + "grad_norm": 0.41893237829208374, + "learning_rate": 0.00013943773638856757, + "loss": 1.4398, + "step": 23314 + }, + { + "epoch": 0.30296779139858654, + "grad_norm": 0.4360145628452301, + "learning_rate": 0.0001394351369266562, + "loss": 1.3139, + "step": 23315 + }, + { + "epoch": 0.3029807859425024, + "grad_norm": 0.3686993718147278, + "learning_rate": 0.00013943253746474482, + "loss": 1.467, + "step": 23316 + }, + { + "epoch": 0.3029937804864183, + "grad_norm": 0.27927976846694946, + "learning_rate": 0.00013942993800283342, + "loss": 1.3602, + "step": 23317 + }, + { + "epoch": 0.30300677503033413, + "grad_norm": 0.378081738948822, + "learning_rate": 0.000139427338540922, + "loss": 1.3723, + "step": 23318 + }, + { + "epoch": 0.30301976957425003, + "grad_norm": 0.40298354625701904, + "learning_rate": 0.00013942473907901066, + "loss": 1.4525, + "step": 23319 + }, + { + "epoch": 0.3030327641181659, + "grad_norm": 0.35920295119285583, + "learning_rate": 0.00013942213961709926, + "loss": 1.4268, + "step": 23320 + }, + { + "epoch": 0.3030457586620818, + "grad_norm": 0.3600679636001587, + "learning_rate": 0.00013941954015518789, + "loss": 1.6027, + "step": 23321 + }, + { + "epoch": 0.3030587532059976, + "grad_norm": 0.4024488031864166, + "learning_rate": 0.00013941694069327648, + "loss": 1.3513, + "step": 23322 + }, + { + "epoch": 0.3030717477499135, + "grad_norm": 0.3834989070892334, + "learning_rate": 0.0001394143412313651, + "loss": 1.304, + "step": 23323 + }, + { + "epoch": 0.30308474229382937, + "grad_norm": 0.3793964385986328, + "learning_rate": 0.00013941174176945373, + "loss": 1.389, + "step": 23324 + }, + { + "epoch": 0.30309773683774527, + "grad_norm": 0.4368106722831726, + "learning_rate": 0.00013940914230754233, + "loss": 1.4158, + "step": 23325 + }, + { + "epoch": 0.3031107313816611, + "grad_norm": 0.43449971079826355, + "learning_rate": 0.00013940654284563098, + "loss": 1.4015, + "step": 23326 + }, + { + "epoch": 0.303123725925577, + "grad_norm": 0.48215702176094055, + "learning_rate": 0.00013940394338371958, + "loss": 1.41, + "step": 23327 + }, + { + "epoch": 0.30313672046949286, + "grad_norm": 0.4209710359573364, + "learning_rate": 0.0001394013439218082, + "loss": 1.4505, + "step": 23328 + }, + { + "epoch": 0.30314971501340876, + "grad_norm": 0.36241504549980164, + "learning_rate": 0.0001393987444598968, + "loss": 1.4124, + "step": 23329 + }, + { + "epoch": 0.3031627095573246, + "grad_norm": 0.29592418670654297, + "learning_rate": 0.00013939614499798543, + "loss": 1.392, + "step": 23330 + }, + { + "epoch": 0.3031757041012405, + "grad_norm": 0.3353019654750824, + "learning_rate": 0.00013939354553607405, + "loss": 1.3459, + "step": 23331 + }, + { + "epoch": 0.30318869864515635, + "grad_norm": 0.3616652190685272, + "learning_rate": 0.00013939094607416265, + "loss": 1.4107, + "step": 23332 + }, + { + "epoch": 0.30320169318907225, + "grad_norm": 0.24782046675682068, + "learning_rate": 0.00013938834661225127, + "loss": 1.2818, + "step": 23333 + }, + { + "epoch": 0.3032146877329881, + "grad_norm": 0.36093559861183167, + "learning_rate": 0.0001393857471503399, + "loss": 1.2839, + "step": 23334 + }, + { + "epoch": 0.303227682276904, + "grad_norm": 0.2977079749107361, + "learning_rate": 0.0001393831476884285, + "loss": 1.3725, + "step": 23335 + }, + { + "epoch": 0.30324067682081984, + "grad_norm": 0.39603355526924133, + "learning_rate": 0.00013938054822651712, + "loss": 1.5214, + "step": 23336 + }, + { + "epoch": 0.30325367136473574, + "grad_norm": 0.284260094165802, + "learning_rate": 0.00013937794876460572, + "loss": 1.3551, + "step": 23337 + }, + { + "epoch": 0.3032666659086516, + "grad_norm": 0.35100439190864563, + "learning_rate": 0.00013937534930269437, + "loss": 1.457, + "step": 23338 + }, + { + "epoch": 0.3032796604525675, + "grad_norm": 0.43551748991012573, + "learning_rate": 0.00013937274984078296, + "loss": 1.3806, + "step": 23339 + }, + { + "epoch": 0.30329265499648334, + "grad_norm": 0.48077815771102905, + "learning_rate": 0.0001393701503788716, + "loss": 1.5811, + "step": 23340 + }, + { + "epoch": 0.30330564954039924, + "grad_norm": 0.3833942413330078, + "learning_rate": 0.00013936755091696019, + "loss": 1.497, + "step": 23341 + }, + { + "epoch": 0.3033186440843151, + "grad_norm": 0.4411120116710663, + "learning_rate": 0.0001393649514550488, + "loss": 1.5894, + "step": 23342 + }, + { + "epoch": 0.303331638628231, + "grad_norm": 0.3648945689201355, + "learning_rate": 0.00013936235199313744, + "loss": 1.302, + "step": 23343 + }, + { + "epoch": 0.3033446331721468, + "grad_norm": 0.4178422689437866, + "learning_rate": 0.00013935975253122603, + "loss": 1.4245, + "step": 23344 + }, + { + "epoch": 0.3033576277160627, + "grad_norm": 0.3966846764087677, + "learning_rate": 0.00013935715306931466, + "loss": 1.3355, + "step": 23345 + }, + { + "epoch": 0.3033706222599786, + "grad_norm": 0.2796681225299835, + "learning_rate": 0.00013935455360740328, + "loss": 1.0936, + "step": 23346 + }, + { + "epoch": 0.3033836168038945, + "grad_norm": 0.41754207015037537, + "learning_rate": 0.00013935195414549188, + "loss": 1.3595, + "step": 23347 + }, + { + "epoch": 0.3033966113478103, + "grad_norm": 0.37742170691490173, + "learning_rate": 0.0001393493546835805, + "loss": 1.4647, + "step": 23348 + }, + { + "epoch": 0.3034096058917262, + "grad_norm": 0.38867121934890747, + "learning_rate": 0.0001393467552216691, + "loss": 1.5306, + "step": 23349 + }, + { + "epoch": 0.30342260043564206, + "grad_norm": 0.5812363624572754, + "learning_rate": 0.00013934415575975775, + "loss": 1.5353, + "step": 23350 + }, + { + "epoch": 0.30343559497955797, + "grad_norm": 0.40888872742652893, + "learning_rate": 0.00013934155629784635, + "loss": 1.2623, + "step": 23351 + }, + { + "epoch": 0.3034485895234738, + "grad_norm": 0.35665449500083923, + "learning_rate": 0.00013933895683593497, + "loss": 1.3862, + "step": 23352 + }, + { + "epoch": 0.3034615840673897, + "grad_norm": 0.3410831391811371, + "learning_rate": 0.00013933635737402357, + "loss": 1.3896, + "step": 23353 + }, + { + "epoch": 0.30347457861130556, + "grad_norm": 0.46788543462753296, + "learning_rate": 0.0001393337579121122, + "loss": 1.3214, + "step": 23354 + }, + { + "epoch": 0.30348757315522146, + "grad_norm": 0.30422621965408325, + "learning_rate": 0.00013933115845020082, + "loss": 1.2889, + "step": 23355 + }, + { + "epoch": 0.3035005676991373, + "grad_norm": 0.3669605851173401, + "learning_rate": 0.00013932855898828942, + "loss": 1.2636, + "step": 23356 + }, + { + "epoch": 0.3035135622430532, + "grad_norm": 0.35256868600845337, + "learning_rate": 0.00013932595952637804, + "loss": 1.4778, + "step": 23357 + }, + { + "epoch": 0.30352655678696905, + "grad_norm": 0.3997613787651062, + "learning_rate": 0.00013932336006446667, + "loss": 1.4352, + "step": 23358 + }, + { + "epoch": 0.30353955133088495, + "grad_norm": 0.3804091215133667, + "learning_rate": 0.0001393207606025553, + "loss": 1.2913, + "step": 23359 + }, + { + "epoch": 0.3035525458748008, + "grad_norm": 0.3649604618549347, + "learning_rate": 0.0001393181611406439, + "loss": 1.3199, + "step": 23360 + }, + { + "epoch": 0.3035655404187167, + "grad_norm": 0.3914465606212616, + "learning_rate": 0.00013931556167873249, + "loss": 1.3312, + "step": 23361 + }, + { + "epoch": 0.30357853496263254, + "grad_norm": 0.4296543598175049, + "learning_rate": 0.00013931296221682114, + "loss": 1.3644, + "step": 23362 + }, + { + "epoch": 0.30359152950654844, + "grad_norm": 0.4134215712547302, + "learning_rate": 0.00013931036275490974, + "loss": 1.3889, + "step": 23363 + }, + { + "epoch": 0.3036045240504643, + "grad_norm": 0.46308422088623047, + "learning_rate": 0.00013930776329299836, + "loss": 1.4569, + "step": 23364 + }, + { + "epoch": 0.3036175185943802, + "grad_norm": 0.3566986918449402, + "learning_rate": 0.00013930516383108698, + "loss": 1.1827, + "step": 23365 + }, + { + "epoch": 0.30363051313829603, + "grad_norm": 0.43237459659576416, + "learning_rate": 0.00013930256436917558, + "loss": 1.3108, + "step": 23366 + }, + { + "epoch": 0.30364350768221193, + "grad_norm": 0.3919980227947235, + "learning_rate": 0.0001392999649072642, + "loss": 1.5722, + "step": 23367 + }, + { + "epoch": 0.3036565022261278, + "grad_norm": 0.38708871603012085, + "learning_rate": 0.0001392973654453528, + "loss": 1.4127, + "step": 23368 + }, + { + "epoch": 0.3036694967700437, + "grad_norm": 0.36836186051368713, + "learning_rate": 0.00013929476598344145, + "loss": 1.174, + "step": 23369 + }, + { + "epoch": 0.3036824913139595, + "grad_norm": 0.31136471033096313, + "learning_rate": 0.00013929216652153005, + "loss": 1.2656, + "step": 23370 + }, + { + "epoch": 0.3036954858578754, + "grad_norm": 0.3176463842391968, + "learning_rate": 0.00013928956705961868, + "loss": 1.2335, + "step": 23371 + }, + { + "epoch": 0.3037084804017913, + "grad_norm": 0.40809836983680725, + "learning_rate": 0.00013928696759770727, + "loss": 1.349, + "step": 23372 + }, + { + "epoch": 0.30372147494570717, + "grad_norm": 0.40170395374298096, + "learning_rate": 0.0001392843681357959, + "loss": 1.474, + "step": 23373 + }, + { + "epoch": 0.30373446948962307, + "grad_norm": 0.4301799237728119, + "learning_rate": 0.00013928176867388452, + "loss": 1.4337, + "step": 23374 + }, + { + "epoch": 0.3037474640335389, + "grad_norm": 0.45454856753349304, + "learning_rate": 0.00013927916921197312, + "loss": 1.4051, + "step": 23375 + }, + { + "epoch": 0.3037604585774548, + "grad_norm": 0.52679842710495, + "learning_rate": 0.00013927656975006174, + "loss": 1.4775, + "step": 23376 + }, + { + "epoch": 0.30377345312137066, + "grad_norm": 0.3931540846824646, + "learning_rate": 0.00013927397028815037, + "loss": 1.4335, + "step": 23377 + }, + { + "epoch": 0.30378644766528656, + "grad_norm": 0.45587897300720215, + "learning_rate": 0.00013927137082623897, + "loss": 1.3549, + "step": 23378 + }, + { + "epoch": 0.3037994422092024, + "grad_norm": 0.37841564416885376, + "learning_rate": 0.0001392687713643276, + "loss": 1.4803, + "step": 23379 + }, + { + "epoch": 0.3038124367531183, + "grad_norm": 0.4343169927597046, + "learning_rate": 0.0001392661719024162, + "loss": 1.3295, + "step": 23380 + }, + { + "epoch": 0.30382543129703415, + "grad_norm": 0.43603068590164185, + "learning_rate": 0.00013926357244050484, + "loss": 1.3117, + "step": 23381 + }, + { + "epoch": 0.30383842584095005, + "grad_norm": 0.35854434967041016, + "learning_rate": 0.00013926097297859344, + "loss": 1.4942, + "step": 23382 + }, + { + "epoch": 0.3038514203848659, + "grad_norm": 0.5235525369644165, + "learning_rate": 0.00013925837351668206, + "loss": 1.3438, + "step": 23383 + }, + { + "epoch": 0.3038644149287818, + "grad_norm": 0.3831540644168854, + "learning_rate": 0.00013925577405477066, + "loss": 1.5348, + "step": 23384 + }, + { + "epoch": 0.30387740947269765, + "grad_norm": 0.45061415433883667, + "learning_rate": 0.00013925317459285928, + "loss": 1.5049, + "step": 23385 + }, + { + "epoch": 0.30389040401661355, + "grad_norm": 0.3730837106704712, + "learning_rate": 0.0001392505751309479, + "loss": 1.2029, + "step": 23386 + }, + { + "epoch": 0.3039033985605294, + "grad_norm": 0.47303473949432373, + "learning_rate": 0.0001392479756690365, + "loss": 1.5152, + "step": 23387 + }, + { + "epoch": 0.3039163931044453, + "grad_norm": 0.2890664041042328, + "learning_rate": 0.00013924537620712513, + "loss": 1.3942, + "step": 23388 + }, + { + "epoch": 0.30392938764836114, + "grad_norm": 0.45294487476348877, + "learning_rate": 0.00013924277674521375, + "loss": 1.5301, + "step": 23389 + }, + { + "epoch": 0.30394238219227704, + "grad_norm": 0.43785783648490906, + "learning_rate": 0.00013924017728330235, + "loss": 1.2817, + "step": 23390 + }, + { + "epoch": 0.3039553767361929, + "grad_norm": 0.4318059980869293, + "learning_rate": 0.00013923757782139098, + "loss": 1.3844, + "step": 23391 + }, + { + "epoch": 0.3039683712801088, + "grad_norm": 0.500150203704834, + "learning_rate": 0.00013923497835947957, + "loss": 1.3624, + "step": 23392 + }, + { + "epoch": 0.30398136582402463, + "grad_norm": 0.4585595428943634, + "learning_rate": 0.00013923237889756823, + "loss": 1.5629, + "step": 23393 + }, + { + "epoch": 0.30399436036794053, + "grad_norm": 0.36613747477531433, + "learning_rate": 0.00013922977943565682, + "loss": 1.3208, + "step": 23394 + }, + { + "epoch": 0.3040073549118564, + "grad_norm": 0.38989126682281494, + "learning_rate": 0.00013922717997374545, + "loss": 1.4224, + "step": 23395 + }, + { + "epoch": 0.3040203494557723, + "grad_norm": 0.34212422370910645, + "learning_rate": 0.00013922458051183404, + "loss": 1.2932, + "step": 23396 + }, + { + "epoch": 0.3040333439996881, + "grad_norm": 0.5155880451202393, + "learning_rate": 0.00013922198104992267, + "loss": 1.3404, + "step": 23397 + }, + { + "epoch": 0.304046338543604, + "grad_norm": 0.4678805470466614, + "learning_rate": 0.0001392193815880113, + "loss": 1.5571, + "step": 23398 + }, + { + "epoch": 0.30405933308751987, + "grad_norm": 0.39795011281967163, + "learning_rate": 0.0001392167821260999, + "loss": 1.3241, + "step": 23399 + }, + { + "epoch": 0.30407232763143577, + "grad_norm": 0.40905988216400146, + "learning_rate": 0.00013921418266418854, + "loss": 1.3328, + "step": 23400 + }, + { + "epoch": 0.3040853221753516, + "grad_norm": 0.44862300157546997, + "learning_rate": 0.00013921158320227714, + "loss": 1.3618, + "step": 23401 + }, + { + "epoch": 0.3040983167192675, + "grad_norm": 0.35704198479652405, + "learning_rate": 0.00013920898374036574, + "loss": 1.4144, + "step": 23402 + }, + { + "epoch": 0.30411131126318336, + "grad_norm": 0.44293662905693054, + "learning_rate": 0.00013920638427845436, + "loss": 1.3012, + "step": 23403 + }, + { + "epoch": 0.30412430580709926, + "grad_norm": 0.41678890585899353, + "learning_rate": 0.000139203784816543, + "loss": 1.5451, + "step": 23404 + }, + { + "epoch": 0.3041373003510151, + "grad_norm": 0.41258934140205383, + "learning_rate": 0.0001392011853546316, + "loss": 1.3727, + "step": 23405 + }, + { + "epoch": 0.304150294894931, + "grad_norm": 1.7785333395004272, + "learning_rate": 0.0001391985858927202, + "loss": 1.3302, + "step": 23406 + }, + { + "epoch": 0.30416328943884685, + "grad_norm": 0.4313473403453827, + "learning_rate": 0.00013919598643080883, + "loss": 1.3551, + "step": 23407 + }, + { + "epoch": 0.30417628398276275, + "grad_norm": 0.44515150785446167, + "learning_rate": 0.00013919338696889746, + "loss": 1.3341, + "step": 23408 + }, + { + "epoch": 0.3041892785266786, + "grad_norm": 0.36140763759613037, + "learning_rate": 0.00013919078750698605, + "loss": 1.3879, + "step": 23409 + }, + { + "epoch": 0.3042022730705945, + "grad_norm": 0.45209184288978577, + "learning_rate": 0.00013918818804507468, + "loss": 1.3747, + "step": 23410 + }, + { + "epoch": 0.30421526761451034, + "grad_norm": 0.4458887279033661, + "learning_rate": 0.00013918558858316328, + "loss": 1.4881, + "step": 23411 + }, + { + "epoch": 0.30422826215842624, + "grad_norm": 0.46184462308883667, + "learning_rate": 0.00013918298912125193, + "loss": 1.5488, + "step": 23412 + }, + { + "epoch": 0.3042412567023421, + "grad_norm": 0.4529193043708801, + "learning_rate": 0.00013918038965934053, + "loss": 1.3648, + "step": 23413 + }, + { + "epoch": 0.304254251246258, + "grad_norm": 0.33885693550109863, + "learning_rate": 0.00013917779019742912, + "loss": 1.3015, + "step": 23414 + }, + { + "epoch": 0.30426724579017383, + "grad_norm": 0.4012843370437622, + "learning_rate": 0.00013917519073551775, + "loss": 1.4004, + "step": 23415 + }, + { + "epoch": 0.30428024033408974, + "grad_norm": 0.4831693172454834, + "learning_rate": 0.00013917259127360637, + "loss": 1.4555, + "step": 23416 + }, + { + "epoch": 0.3042932348780056, + "grad_norm": 0.44907328486442566, + "learning_rate": 0.000139169991811695, + "loss": 1.3931, + "step": 23417 + }, + { + "epoch": 0.3043062294219215, + "grad_norm": 0.43334949016571045, + "learning_rate": 0.0001391673923497836, + "loss": 1.3305, + "step": 23418 + }, + { + "epoch": 0.3043192239658373, + "grad_norm": 0.3851233124732971, + "learning_rate": 0.00013916479288787222, + "loss": 1.3957, + "step": 23419 + }, + { + "epoch": 0.3043322185097532, + "grad_norm": 0.45768246054649353, + "learning_rate": 0.00013916219342596084, + "loss": 1.4605, + "step": 23420 + }, + { + "epoch": 0.3043452130536691, + "grad_norm": 0.3982701003551483, + "learning_rate": 0.00013915959396404944, + "loss": 1.3328, + "step": 23421 + }, + { + "epoch": 0.304358207597585, + "grad_norm": 0.29739758372306824, + "learning_rate": 0.00013915699450213806, + "loss": 1.5081, + "step": 23422 + }, + { + "epoch": 0.3043712021415008, + "grad_norm": 0.4620401859283447, + "learning_rate": 0.00013915439504022666, + "loss": 1.4313, + "step": 23423 + }, + { + "epoch": 0.3043841966854167, + "grad_norm": 0.3972269594669342, + "learning_rate": 0.00013915179557831531, + "loss": 1.4865, + "step": 23424 + }, + { + "epoch": 0.30439719122933256, + "grad_norm": 0.5003836154937744, + "learning_rate": 0.0001391491961164039, + "loss": 1.3011, + "step": 23425 + }, + { + "epoch": 0.30441018577324847, + "grad_norm": 0.3578912019729614, + "learning_rate": 0.00013914659665449254, + "loss": 1.4613, + "step": 23426 + }, + { + "epoch": 0.3044231803171643, + "grad_norm": 0.3965323269367218, + "learning_rate": 0.00013914399719258113, + "loss": 1.3485, + "step": 23427 + }, + { + "epoch": 0.3044361748610802, + "grad_norm": 0.5065497756004333, + "learning_rate": 0.00013914139773066976, + "loss": 1.5553, + "step": 23428 + }, + { + "epoch": 0.30444916940499606, + "grad_norm": 0.3976811468601227, + "learning_rate": 0.00013913879826875838, + "loss": 1.4789, + "step": 23429 + }, + { + "epoch": 0.30446216394891196, + "grad_norm": 0.3483099341392517, + "learning_rate": 0.00013913619880684698, + "loss": 1.2753, + "step": 23430 + }, + { + "epoch": 0.3044751584928278, + "grad_norm": 0.41991087794303894, + "learning_rate": 0.0001391335993449356, + "loss": 1.428, + "step": 23431 + }, + { + "epoch": 0.3044881530367437, + "grad_norm": 0.40444377064704895, + "learning_rate": 0.00013913099988302423, + "loss": 1.3348, + "step": 23432 + }, + { + "epoch": 0.30450114758065955, + "grad_norm": 0.3690468370914459, + "learning_rate": 0.00013912840042111283, + "loss": 1.317, + "step": 23433 + }, + { + "epoch": 0.30451414212457545, + "grad_norm": 0.37328040599823, + "learning_rate": 0.00013912580095920145, + "loss": 1.24, + "step": 23434 + }, + { + "epoch": 0.3045271366684913, + "grad_norm": 0.3880142271518707, + "learning_rate": 0.00013912320149729005, + "loss": 1.4818, + "step": 23435 + }, + { + "epoch": 0.3045401312124072, + "grad_norm": 0.392406702041626, + "learning_rate": 0.0001391206020353787, + "loss": 1.2328, + "step": 23436 + }, + { + "epoch": 0.30455312575632304, + "grad_norm": 0.395727276802063, + "learning_rate": 0.0001391180025734673, + "loss": 1.3575, + "step": 23437 + }, + { + "epoch": 0.30456612030023894, + "grad_norm": 0.41765278577804565, + "learning_rate": 0.00013911540311155592, + "loss": 1.4517, + "step": 23438 + }, + { + "epoch": 0.3045791148441548, + "grad_norm": 0.3868548274040222, + "learning_rate": 0.00013911280364964455, + "loss": 1.3772, + "step": 23439 + }, + { + "epoch": 0.3045921093880707, + "grad_norm": 0.3593936562538147, + "learning_rate": 0.00013911020418773314, + "loss": 1.3226, + "step": 23440 + }, + { + "epoch": 0.30460510393198653, + "grad_norm": 0.46014881134033203, + "learning_rate": 0.00013910760472582177, + "loss": 1.487, + "step": 23441 + }, + { + "epoch": 0.30461809847590243, + "grad_norm": 0.4650191068649292, + "learning_rate": 0.00013910500526391036, + "loss": 1.5201, + "step": 23442 + }, + { + "epoch": 0.3046310930198183, + "grad_norm": 0.3960723578929901, + "learning_rate": 0.00013910240580199902, + "loss": 1.4578, + "step": 23443 + }, + { + "epoch": 0.3046440875637342, + "grad_norm": 0.41256600618362427, + "learning_rate": 0.00013909980634008761, + "loss": 1.5645, + "step": 23444 + }, + { + "epoch": 0.30465708210765, + "grad_norm": 0.3604339063167572, + "learning_rate": 0.0001390972068781762, + "loss": 1.5961, + "step": 23445 + }, + { + "epoch": 0.3046700766515659, + "grad_norm": 0.4878902733325958, + "learning_rate": 0.00013909460741626484, + "loss": 1.5096, + "step": 23446 + }, + { + "epoch": 0.30468307119548177, + "grad_norm": 0.42814451456069946, + "learning_rate": 0.00013909200795435346, + "loss": 1.5028, + "step": 23447 + }, + { + "epoch": 0.30469606573939767, + "grad_norm": 0.4190797507762909, + "learning_rate": 0.00013908940849244208, + "loss": 1.4652, + "step": 23448 + }, + { + "epoch": 0.30470906028331357, + "grad_norm": 0.34647899866104126, + "learning_rate": 0.00013908680903053068, + "loss": 1.4682, + "step": 23449 + }, + { + "epoch": 0.3047220548272294, + "grad_norm": 0.4343963861465454, + "learning_rate": 0.0001390842095686193, + "loss": 1.3618, + "step": 23450 + }, + { + "epoch": 0.3047350493711453, + "grad_norm": 0.3554687798023224, + "learning_rate": 0.00013908161010670793, + "loss": 1.4868, + "step": 23451 + }, + { + "epoch": 0.30474804391506116, + "grad_norm": 0.3702371418476105, + "learning_rate": 0.00013907901064479653, + "loss": 1.3101, + "step": 23452 + }, + { + "epoch": 0.30476103845897706, + "grad_norm": 0.45455601811408997, + "learning_rate": 0.00013907641118288515, + "loss": 1.6089, + "step": 23453 + }, + { + "epoch": 0.3047740330028929, + "grad_norm": 0.4820840656757355, + "learning_rate": 0.00013907381172097375, + "loss": 1.4987, + "step": 23454 + }, + { + "epoch": 0.3047870275468088, + "grad_norm": 0.38765203952789307, + "learning_rate": 0.0001390712122590624, + "loss": 1.538, + "step": 23455 + }, + { + "epoch": 0.30480002209072465, + "grad_norm": 0.4099428355693817, + "learning_rate": 0.000139068612797151, + "loss": 1.4147, + "step": 23456 + }, + { + "epoch": 0.30481301663464055, + "grad_norm": 0.4511684775352478, + "learning_rate": 0.0001390660133352396, + "loss": 1.5418, + "step": 23457 + }, + { + "epoch": 0.3048260111785564, + "grad_norm": 0.4140436351299286, + "learning_rate": 0.00013906341387332822, + "loss": 1.3171, + "step": 23458 + }, + { + "epoch": 0.3048390057224723, + "grad_norm": 0.4105619192123413, + "learning_rate": 0.00013906081441141685, + "loss": 1.3859, + "step": 23459 + }, + { + "epoch": 0.30485200026638815, + "grad_norm": 0.39106282591819763, + "learning_rate": 0.00013905821494950547, + "loss": 1.3089, + "step": 23460 + }, + { + "epoch": 0.30486499481030405, + "grad_norm": 0.48305389285087585, + "learning_rate": 0.00013905561548759407, + "loss": 1.4178, + "step": 23461 + }, + { + "epoch": 0.3048779893542199, + "grad_norm": 0.3905850946903229, + "learning_rate": 0.0001390530160256827, + "loss": 1.3447, + "step": 23462 + }, + { + "epoch": 0.3048909838981358, + "grad_norm": 0.37989890575408936, + "learning_rate": 0.00013905041656377132, + "loss": 1.414, + "step": 23463 + }, + { + "epoch": 0.30490397844205164, + "grad_norm": 0.4079782962799072, + "learning_rate": 0.0001390478171018599, + "loss": 1.3805, + "step": 23464 + }, + { + "epoch": 0.30491697298596754, + "grad_norm": 0.4466724991798401, + "learning_rate": 0.00013904521763994854, + "loss": 1.4437, + "step": 23465 + }, + { + "epoch": 0.3049299675298834, + "grad_norm": 0.40522077679634094, + "learning_rate": 0.00013904261817803714, + "loss": 1.4954, + "step": 23466 + }, + { + "epoch": 0.3049429620737993, + "grad_norm": 0.3816321790218353, + "learning_rate": 0.0001390400187161258, + "loss": 1.4651, + "step": 23467 + }, + { + "epoch": 0.30495595661771513, + "grad_norm": 0.4906289577484131, + "learning_rate": 0.00013903741925421438, + "loss": 1.6576, + "step": 23468 + }, + { + "epoch": 0.30496895116163103, + "grad_norm": 0.37748995423316956, + "learning_rate": 0.00013903481979230298, + "loss": 1.5941, + "step": 23469 + }, + { + "epoch": 0.3049819457055469, + "grad_norm": 0.2999838888645172, + "learning_rate": 0.0001390322203303916, + "loss": 1.4557, + "step": 23470 + }, + { + "epoch": 0.3049949402494628, + "grad_norm": 0.3470779061317444, + "learning_rate": 0.00013902962086848023, + "loss": 1.234, + "step": 23471 + }, + { + "epoch": 0.3050079347933786, + "grad_norm": 0.3533174395561218, + "learning_rate": 0.00013902702140656886, + "loss": 1.2147, + "step": 23472 + }, + { + "epoch": 0.3050209293372945, + "grad_norm": 0.3998168110847473, + "learning_rate": 0.00013902442194465745, + "loss": 1.327, + "step": 23473 + }, + { + "epoch": 0.30503392388121037, + "grad_norm": 0.31588008999824524, + "learning_rate": 0.00013902182248274608, + "loss": 1.3918, + "step": 23474 + }, + { + "epoch": 0.30504691842512627, + "grad_norm": 0.34923332929611206, + "learning_rate": 0.0001390192230208347, + "loss": 1.2754, + "step": 23475 + }, + { + "epoch": 0.3050599129690421, + "grad_norm": 0.34490811824798584, + "learning_rate": 0.0001390166235589233, + "loss": 1.288, + "step": 23476 + }, + { + "epoch": 0.305072907512958, + "grad_norm": 0.4352579116821289, + "learning_rate": 0.00013901402409701192, + "loss": 1.4201, + "step": 23477 + }, + { + "epoch": 0.30508590205687386, + "grad_norm": 0.457725465297699, + "learning_rate": 0.00013901142463510055, + "loss": 1.4294, + "step": 23478 + }, + { + "epoch": 0.30509889660078976, + "grad_norm": 0.4381331503391266, + "learning_rate": 0.00013900882517318917, + "loss": 1.3799, + "step": 23479 + }, + { + "epoch": 0.3051118911447056, + "grad_norm": 0.4067361652851105, + "learning_rate": 0.00013900622571127777, + "loss": 1.3466, + "step": 23480 + }, + { + "epoch": 0.3051248856886215, + "grad_norm": 0.41730648279190063, + "learning_rate": 0.0001390036262493664, + "loss": 1.6048, + "step": 23481 + }, + { + "epoch": 0.30513788023253735, + "grad_norm": 0.38951125741004944, + "learning_rate": 0.00013900102678745502, + "loss": 1.2188, + "step": 23482 + }, + { + "epoch": 0.30515087477645325, + "grad_norm": 0.4512186050415039, + "learning_rate": 0.00013899842732554362, + "loss": 1.3749, + "step": 23483 + }, + { + "epoch": 0.3051638693203691, + "grad_norm": 0.3760858178138733, + "learning_rate": 0.00013899582786363224, + "loss": 1.3459, + "step": 23484 + }, + { + "epoch": 0.305176863864285, + "grad_norm": 0.3248966634273529, + "learning_rate": 0.00013899322840172084, + "loss": 1.3515, + "step": 23485 + }, + { + "epoch": 0.30518985840820084, + "grad_norm": 0.295704185962677, + "learning_rate": 0.00013899062893980946, + "loss": 1.2406, + "step": 23486 + }, + { + "epoch": 0.30520285295211674, + "grad_norm": 0.4719599485397339, + "learning_rate": 0.0001389880294778981, + "loss": 1.5597, + "step": 23487 + }, + { + "epoch": 0.3052158474960326, + "grad_norm": 0.4941157400608063, + "learning_rate": 0.00013898543001598668, + "loss": 1.5769, + "step": 23488 + }, + { + "epoch": 0.3052288420399485, + "grad_norm": 0.32661327719688416, + "learning_rate": 0.0001389828305540753, + "loss": 1.3615, + "step": 23489 + }, + { + "epoch": 0.30524183658386433, + "grad_norm": 0.5071525573730469, + "learning_rate": 0.00013898023109216393, + "loss": 1.3819, + "step": 23490 + }, + { + "epoch": 0.30525483112778024, + "grad_norm": 0.43125683069229126, + "learning_rate": 0.00013897763163025256, + "loss": 1.4411, + "step": 23491 + }, + { + "epoch": 0.3052678256716961, + "grad_norm": 0.3250223696231842, + "learning_rate": 0.00013897503216834116, + "loss": 1.3657, + "step": 23492 + }, + { + "epoch": 0.305280820215612, + "grad_norm": 0.3734956979751587, + "learning_rate": 0.00013897243270642978, + "loss": 1.362, + "step": 23493 + }, + { + "epoch": 0.3052938147595278, + "grad_norm": 0.38495510816574097, + "learning_rate": 0.0001389698332445184, + "loss": 1.4573, + "step": 23494 + }, + { + "epoch": 0.3053068093034437, + "grad_norm": 0.30793297290802, + "learning_rate": 0.000138967233782607, + "loss": 1.3172, + "step": 23495 + }, + { + "epoch": 0.30531980384735957, + "grad_norm": 0.49670201539993286, + "learning_rate": 0.00013896463432069563, + "loss": 1.3595, + "step": 23496 + }, + { + "epoch": 0.3053327983912755, + "grad_norm": 0.21125417947769165, + "learning_rate": 0.00013896203485878422, + "loss": 1.254, + "step": 23497 + }, + { + "epoch": 0.3053457929351913, + "grad_norm": 0.38317611813545227, + "learning_rate": 0.00013895943539687285, + "loss": 1.3221, + "step": 23498 + }, + { + "epoch": 0.3053587874791072, + "grad_norm": 0.4652711749076843, + "learning_rate": 0.00013895683593496147, + "loss": 1.3433, + "step": 23499 + }, + { + "epoch": 0.30537178202302306, + "grad_norm": 0.4497043490409851, + "learning_rate": 0.00013895423647305007, + "loss": 1.546, + "step": 23500 + }, + { + "epoch": 0.30538477656693896, + "grad_norm": 0.5556791424751282, + "learning_rate": 0.0001389516370111387, + "loss": 1.2698, + "step": 23501 + }, + { + "epoch": 0.3053977711108548, + "grad_norm": 0.3903411626815796, + "learning_rate": 0.00013894903754922732, + "loss": 1.3535, + "step": 23502 + }, + { + "epoch": 0.3054107656547707, + "grad_norm": 0.4146912097930908, + "learning_rate": 0.00013894643808731594, + "loss": 1.48, + "step": 23503 + }, + { + "epoch": 0.30542376019868656, + "grad_norm": 0.34024927020072937, + "learning_rate": 0.00013894383862540454, + "loss": 1.0548, + "step": 23504 + }, + { + "epoch": 0.30543675474260246, + "grad_norm": 0.33760544657707214, + "learning_rate": 0.00013894123916349316, + "loss": 1.2827, + "step": 23505 + }, + { + "epoch": 0.3054497492865183, + "grad_norm": 0.4486847519874573, + "learning_rate": 0.0001389386397015818, + "loss": 1.2843, + "step": 23506 + }, + { + "epoch": 0.3054627438304342, + "grad_norm": 0.4389313757419586, + "learning_rate": 0.0001389360402396704, + "loss": 1.3684, + "step": 23507 + }, + { + "epoch": 0.30547573837435005, + "grad_norm": 0.42779406905174255, + "learning_rate": 0.000138933440777759, + "loss": 1.3537, + "step": 23508 + }, + { + "epoch": 0.30548873291826595, + "grad_norm": 0.28546014428138733, + "learning_rate": 0.0001389308413158476, + "loss": 1.506, + "step": 23509 + }, + { + "epoch": 0.3055017274621818, + "grad_norm": 0.3954133987426758, + "learning_rate": 0.00013892824185393626, + "loss": 1.404, + "step": 23510 + }, + { + "epoch": 0.3055147220060977, + "grad_norm": 0.36492669582366943, + "learning_rate": 0.00013892564239202486, + "loss": 1.2677, + "step": 23511 + }, + { + "epoch": 0.30552771655001354, + "grad_norm": 0.3887994587421417, + "learning_rate": 0.00013892304293011346, + "loss": 1.2982, + "step": 23512 + }, + { + "epoch": 0.30554071109392944, + "grad_norm": 0.39080923795700073, + "learning_rate": 0.0001389204434682021, + "loss": 1.4723, + "step": 23513 + }, + { + "epoch": 0.3055537056378453, + "grad_norm": 0.430855393409729, + "learning_rate": 0.0001389178440062907, + "loss": 1.5383, + "step": 23514 + }, + { + "epoch": 0.3055667001817612, + "grad_norm": 0.39280420541763306, + "learning_rate": 0.00013891524454437933, + "loss": 1.4976, + "step": 23515 + }, + { + "epoch": 0.30557969472567703, + "grad_norm": 0.40288299322128296, + "learning_rate": 0.00013891264508246793, + "loss": 1.3175, + "step": 23516 + }, + { + "epoch": 0.30559268926959293, + "grad_norm": 0.46204444766044617, + "learning_rate": 0.00013891004562055655, + "loss": 1.4789, + "step": 23517 + }, + { + "epoch": 0.3056056838135088, + "grad_norm": 0.38068604469299316, + "learning_rate": 0.00013890744615864517, + "loss": 1.5661, + "step": 23518 + }, + { + "epoch": 0.3056186783574247, + "grad_norm": 0.4827874004840851, + "learning_rate": 0.00013890484669673377, + "loss": 1.4093, + "step": 23519 + }, + { + "epoch": 0.3056316729013405, + "grad_norm": 0.31266921758651733, + "learning_rate": 0.0001389022472348224, + "loss": 1.3273, + "step": 23520 + }, + { + "epoch": 0.3056446674452564, + "grad_norm": 0.29505765438079834, + "learning_rate": 0.00013889964777291102, + "loss": 1.3602, + "step": 23521 + }, + { + "epoch": 0.30565766198917227, + "grad_norm": 0.2704898416996002, + "learning_rate": 0.00013889704831099965, + "loss": 1.3186, + "step": 23522 + }, + { + "epoch": 0.30567065653308817, + "grad_norm": 0.4260876178741455, + "learning_rate": 0.00013889444884908824, + "loss": 1.4234, + "step": 23523 + }, + { + "epoch": 0.30568365107700407, + "grad_norm": 0.39754945039749146, + "learning_rate": 0.00013889184938717684, + "loss": 1.5558, + "step": 23524 + }, + { + "epoch": 0.3056966456209199, + "grad_norm": 0.3910837173461914, + "learning_rate": 0.0001388892499252655, + "loss": 1.3731, + "step": 23525 + }, + { + "epoch": 0.3057096401648358, + "grad_norm": 0.4193885326385498, + "learning_rate": 0.0001388866504633541, + "loss": 1.4238, + "step": 23526 + }, + { + "epoch": 0.30572263470875166, + "grad_norm": 0.3782515227794647, + "learning_rate": 0.00013888405100144271, + "loss": 1.4786, + "step": 23527 + }, + { + "epoch": 0.30573562925266756, + "grad_norm": 0.3675345778465271, + "learning_rate": 0.0001388814515395313, + "loss": 1.3544, + "step": 23528 + }, + { + "epoch": 0.3057486237965834, + "grad_norm": 0.34074875712394714, + "learning_rate": 0.00013887885207761994, + "loss": 1.4319, + "step": 23529 + }, + { + "epoch": 0.3057616183404993, + "grad_norm": 0.38775891065597534, + "learning_rate": 0.00013887625261570856, + "loss": 1.3225, + "step": 23530 + }, + { + "epoch": 0.30577461288441515, + "grad_norm": 0.41392791271209717, + "learning_rate": 0.00013887365315379716, + "loss": 1.2939, + "step": 23531 + }, + { + "epoch": 0.30578760742833105, + "grad_norm": 0.34924671053886414, + "learning_rate": 0.00013887105369188578, + "loss": 1.3975, + "step": 23532 + }, + { + "epoch": 0.3058006019722469, + "grad_norm": 0.47477591037750244, + "learning_rate": 0.0001388684542299744, + "loss": 1.5919, + "step": 23533 + }, + { + "epoch": 0.3058135965161628, + "grad_norm": 0.3081744313240051, + "learning_rate": 0.00013886585476806303, + "loss": 1.2359, + "step": 23534 + }, + { + "epoch": 0.30582659106007865, + "grad_norm": 0.3801209628582001, + "learning_rate": 0.00013886325530615163, + "loss": 1.5442, + "step": 23535 + }, + { + "epoch": 0.30583958560399455, + "grad_norm": 0.40903663635253906, + "learning_rate": 0.00013886065584424025, + "loss": 1.4123, + "step": 23536 + }, + { + "epoch": 0.3058525801479104, + "grad_norm": 0.4087826609611511, + "learning_rate": 0.00013885805638232888, + "loss": 1.5494, + "step": 23537 + }, + { + "epoch": 0.3058655746918263, + "grad_norm": 0.368113249540329, + "learning_rate": 0.00013885545692041747, + "loss": 1.3457, + "step": 23538 + }, + { + "epoch": 0.30587856923574214, + "grad_norm": 0.39751601219177246, + "learning_rate": 0.0001388528574585061, + "loss": 1.1442, + "step": 23539 + }, + { + "epoch": 0.30589156377965804, + "grad_norm": 0.5110365748405457, + "learning_rate": 0.0001388502579965947, + "loss": 1.5872, + "step": 23540 + }, + { + "epoch": 0.3059045583235739, + "grad_norm": 0.29193973541259766, + "learning_rate": 0.00013884765853468332, + "loss": 1.4078, + "step": 23541 + }, + { + "epoch": 0.3059175528674898, + "grad_norm": 0.3715648353099823, + "learning_rate": 0.00013884505907277195, + "loss": 1.5586, + "step": 23542 + }, + { + "epoch": 0.30593054741140563, + "grad_norm": 0.3424482047557831, + "learning_rate": 0.00013884245961086054, + "loss": 1.3866, + "step": 23543 + }, + { + "epoch": 0.30594354195532153, + "grad_norm": 0.3644959330558777, + "learning_rate": 0.00013883986014894917, + "loss": 1.3772, + "step": 23544 + }, + { + "epoch": 0.3059565364992374, + "grad_norm": 0.5370931625366211, + "learning_rate": 0.0001388372606870378, + "loss": 1.4216, + "step": 23545 + }, + { + "epoch": 0.3059695310431533, + "grad_norm": 0.33715203404426575, + "learning_rate": 0.00013883466122512642, + "loss": 1.3976, + "step": 23546 + }, + { + "epoch": 0.3059825255870691, + "grad_norm": 0.5049725770950317, + "learning_rate": 0.00013883206176321501, + "loss": 1.4355, + "step": 23547 + }, + { + "epoch": 0.305995520130985, + "grad_norm": 0.41145607829093933, + "learning_rate": 0.00013882946230130364, + "loss": 1.4619, + "step": 23548 + }, + { + "epoch": 0.30600851467490087, + "grad_norm": 0.4065941572189331, + "learning_rate": 0.00013882686283939226, + "loss": 1.5665, + "step": 23549 + }, + { + "epoch": 0.30602150921881677, + "grad_norm": 0.4424576461315155, + "learning_rate": 0.00013882426337748086, + "loss": 1.5069, + "step": 23550 + }, + { + "epoch": 0.3060345037627326, + "grad_norm": 0.473283976316452, + "learning_rate": 0.00013882166391556948, + "loss": 1.4298, + "step": 23551 + }, + { + "epoch": 0.3060474983066485, + "grad_norm": 0.41806769371032715, + "learning_rate": 0.0001388190644536581, + "loss": 1.4097, + "step": 23552 + }, + { + "epoch": 0.30606049285056436, + "grad_norm": 0.3266026973724365, + "learning_rate": 0.0001388164649917467, + "loss": 1.1847, + "step": 23553 + }, + { + "epoch": 0.30607348739448026, + "grad_norm": 0.3820030689239502, + "learning_rate": 0.00013881386552983533, + "loss": 1.3584, + "step": 23554 + }, + { + "epoch": 0.3060864819383961, + "grad_norm": 0.3995163142681122, + "learning_rate": 0.00013881126606792393, + "loss": 1.535, + "step": 23555 + }, + { + "epoch": 0.306099476482312, + "grad_norm": 0.4605192244052887, + "learning_rate": 0.00013880866660601258, + "loss": 1.5605, + "step": 23556 + }, + { + "epoch": 0.30611247102622785, + "grad_norm": 0.3773915469646454, + "learning_rate": 0.00013880606714410118, + "loss": 1.3675, + "step": 23557 + }, + { + "epoch": 0.30612546557014375, + "grad_norm": 0.41772910952568054, + "learning_rate": 0.0001388034676821898, + "loss": 1.3495, + "step": 23558 + }, + { + "epoch": 0.3061384601140596, + "grad_norm": 0.3977172076702118, + "learning_rate": 0.0001388008682202784, + "loss": 1.5292, + "step": 23559 + }, + { + "epoch": 0.3061514546579755, + "grad_norm": 0.35015615820884705, + "learning_rate": 0.00013879826875836702, + "loss": 1.3182, + "step": 23560 + }, + { + "epoch": 0.30616444920189134, + "grad_norm": 0.3995659053325653, + "learning_rate": 0.00013879566929645565, + "loss": 1.5231, + "step": 23561 + }, + { + "epoch": 0.30617744374580724, + "grad_norm": 0.5172699689865112, + "learning_rate": 0.00013879306983454425, + "loss": 1.5096, + "step": 23562 + }, + { + "epoch": 0.3061904382897231, + "grad_norm": 0.32431915402412415, + "learning_rate": 0.00013879047037263287, + "loss": 1.421, + "step": 23563 + }, + { + "epoch": 0.306203432833639, + "grad_norm": 0.3113725483417511, + "learning_rate": 0.0001387878709107215, + "loss": 1.3885, + "step": 23564 + }, + { + "epoch": 0.30621642737755483, + "grad_norm": 0.37816914916038513, + "learning_rate": 0.00013878527144881012, + "loss": 1.2267, + "step": 23565 + }, + { + "epoch": 0.30622942192147073, + "grad_norm": 0.4050650894641876, + "learning_rate": 0.00013878267198689872, + "loss": 1.424, + "step": 23566 + }, + { + "epoch": 0.3062424164653866, + "grad_norm": 0.4346819519996643, + "learning_rate": 0.00013878007252498731, + "loss": 1.3197, + "step": 23567 + }, + { + "epoch": 0.3062554110093025, + "grad_norm": 0.3116249144077301, + "learning_rate": 0.00013877747306307597, + "loss": 1.4365, + "step": 23568 + }, + { + "epoch": 0.3062684055532183, + "grad_norm": 0.4409507215023041, + "learning_rate": 0.00013877487360116456, + "loss": 1.3679, + "step": 23569 + }, + { + "epoch": 0.3062814000971342, + "grad_norm": 0.3509639501571655, + "learning_rate": 0.0001387722741392532, + "loss": 1.2883, + "step": 23570 + }, + { + "epoch": 0.30629439464105007, + "grad_norm": 0.3960501253604889, + "learning_rate": 0.00013876967467734178, + "loss": 1.5578, + "step": 23571 + }, + { + "epoch": 0.306307389184966, + "grad_norm": 0.4010438919067383, + "learning_rate": 0.0001387670752154304, + "loss": 1.3401, + "step": 23572 + }, + { + "epoch": 0.3063203837288818, + "grad_norm": 0.3445011079311371, + "learning_rate": 0.00013876447575351903, + "loss": 1.4625, + "step": 23573 + }, + { + "epoch": 0.3063333782727977, + "grad_norm": 0.4155280292034149, + "learning_rate": 0.00013876187629160763, + "loss": 1.4966, + "step": 23574 + }, + { + "epoch": 0.30634637281671356, + "grad_norm": 0.31351906061172485, + "learning_rate": 0.00013875927682969626, + "loss": 1.3953, + "step": 23575 + }, + { + "epoch": 0.30635936736062946, + "grad_norm": 0.3552655577659607, + "learning_rate": 0.00013875667736778488, + "loss": 1.2937, + "step": 23576 + }, + { + "epoch": 0.3063723619045453, + "grad_norm": 0.4302465617656708, + "learning_rate": 0.0001387540779058735, + "loss": 1.34, + "step": 23577 + }, + { + "epoch": 0.3063853564484612, + "grad_norm": 0.27516257762908936, + "learning_rate": 0.0001387514784439621, + "loss": 1.3098, + "step": 23578 + }, + { + "epoch": 0.30639835099237706, + "grad_norm": 0.4331068992614746, + "learning_rate": 0.0001387488789820507, + "loss": 1.4499, + "step": 23579 + }, + { + "epoch": 0.30641134553629296, + "grad_norm": 0.4370339810848236, + "learning_rate": 0.00013874627952013935, + "loss": 1.3013, + "step": 23580 + }, + { + "epoch": 0.3064243400802088, + "grad_norm": 0.3664869964122772, + "learning_rate": 0.00013874368005822795, + "loss": 1.3151, + "step": 23581 + }, + { + "epoch": 0.3064373346241247, + "grad_norm": 0.44402244687080383, + "learning_rate": 0.00013874108059631657, + "loss": 1.4454, + "step": 23582 + }, + { + "epoch": 0.30645032916804055, + "grad_norm": 0.4439980089664459, + "learning_rate": 0.00013873848113440517, + "loss": 1.2174, + "step": 23583 + }, + { + "epoch": 0.30646332371195645, + "grad_norm": 0.37649279832839966, + "learning_rate": 0.0001387358816724938, + "loss": 1.5126, + "step": 23584 + }, + { + "epoch": 0.3064763182558723, + "grad_norm": 0.4243823289871216, + "learning_rate": 0.00013873328221058242, + "loss": 1.4926, + "step": 23585 + }, + { + "epoch": 0.3064893127997882, + "grad_norm": 0.35294297337532043, + "learning_rate": 0.00013873068274867102, + "loss": 1.2073, + "step": 23586 + }, + { + "epoch": 0.30650230734370404, + "grad_norm": 0.3987620770931244, + "learning_rate": 0.00013872808328675967, + "loss": 1.4063, + "step": 23587 + }, + { + "epoch": 0.30651530188761994, + "grad_norm": 0.4728260636329651, + "learning_rate": 0.00013872548382484827, + "loss": 1.5492, + "step": 23588 + }, + { + "epoch": 0.3065282964315358, + "grad_norm": 0.4484679102897644, + "learning_rate": 0.0001387228843629369, + "loss": 1.4972, + "step": 23589 + }, + { + "epoch": 0.3065412909754517, + "grad_norm": 0.3717074990272522, + "learning_rate": 0.0001387202849010255, + "loss": 1.3849, + "step": 23590 + }, + { + "epoch": 0.30655428551936753, + "grad_norm": 0.40513846278190613, + "learning_rate": 0.0001387176854391141, + "loss": 1.3796, + "step": 23591 + }, + { + "epoch": 0.30656728006328343, + "grad_norm": 0.42069143056869507, + "learning_rate": 0.00013871508597720274, + "loss": 1.2732, + "step": 23592 + }, + { + "epoch": 0.3065802746071993, + "grad_norm": 0.45256495475769043, + "learning_rate": 0.00013871248651529133, + "loss": 1.4469, + "step": 23593 + }, + { + "epoch": 0.3065932691511152, + "grad_norm": 0.31257379055023193, + "learning_rate": 0.00013870988705337996, + "loss": 1.6586, + "step": 23594 + }, + { + "epoch": 0.306606263695031, + "grad_norm": 0.39899003505706787, + "learning_rate": 0.00013870728759146858, + "loss": 1.3932, + "step": 23595 + }, + { + "epoch": 0.3066192582389469, + "grad_norm": 0.3584810197353363, + "learning_rate": 0.00013870468812955718, + "loss": 1.334, + "step": 23596 + }, + { + "epoch": 0.30663225278286277, + "grad_norm": 0.510430634021759, + "learning_rate": 0.0001387020886676458, + "loss": 1.6248, + "step": 23597 + }, + { + "epoch": 0.30664524732677867, + "grad_norm": 0.46201857924461365, + "learning_rate": 0.0001386994892057344, + "loss": 1.4542, + "step": 23598 + }, + { + "epoch": 0.3066582418706945, + "grad_norm": 0.4728403389453888, + "learning_rate": 0.00013869688974382305, + "loss": 1.5001, + "step": 23599 + }, + { + "epoch": 0.3066712364146104, + "grad_norm": 0.32483214139938354, + "learning_rate": 0.00013869429028191165, + "loss": 1.3561, + "step": 23600 + }, + { + "epoch": 0.3066842309585263, + "grad_norm": 0.35493841767311096, + "learning_rate": 0.00013869169082000028, + "loss": 1.1853, + "step": 23601 + }, + { + "epoch": 0.30669722550244216, + "grad_norm": 0.410774827003479, + "learning_rate": 0.00013868909135808887, + "loss": 1.5055, + "step": 23602 + }, + { + "epoch": 0.30671022004635806, + "grad_norm": 0.30181410908699036, + "learning_rate": 0.0001386864918961775, + "loss": 1.2662, + "step": 23603 + }, + { + "epoch": 0.3067232145902739, + "grad_norm": 0.4347074627876282, + "learning_rate": 0.00013868389243426612, + "loss": 1.4768, + "step": 23604 + }, + { + "epoch": 0.3067362091341898, + "grad_norm": 0.41375666856765747, + "learning_rate": 0.00013868129297235472, + "loss": 1.3018, + "step": 23605 + }, + { + "epoch": 0.30674920367810565, + "grad_norm": 0.41536417603492737, + "learning_rate": 0.00013867869351044334, + "loss": 1.4455, + "step": 23606 + }, + { + "epoch": 0.30676219822202155, + "grad_norm": 0.45644810795783997, + "learning_rate": 0.00013867609404853197, + "loss": 1.3005, + "step": 23607 + }, + { + "epoch": 0.3067751927659374, + "grad_norm": 0.4884204566478729, + "learning_rate": 0.00013867349458662057, + "loss": 1.4056, + "step": 23608 + }, + { + "epoch": 0.3067881873098533, + "grad_norm": 0.33736222982406616, + "learning_rate": 0.0001386708951247092, + "loss": 1.3616, + "step": 23609 + }, + { + "epoch": 0.30680118185376914, + "grad_norm": 0.43201127648353577, + "learning_rate": 0.0001386682956627978, + "loss": 1.5119, + "step": 23610 + }, + { + "epoch": 0.30681417639768505, + "grad_norm": 0.5068404674530029, + "learning_rate": 0.00013866569620088644, + "loss": 1.3493, + "step": 23611 + }, + { + "epoch": 0.3068271709416009, + "grad_norm": 0.3931178152561188, + "learning_rate": 0.00013866309673897504, + "loss": 1.3852, + "step": 23612 + }, + { + "epoch": 0.3068401654855168, + "grad_norm": 0.4263322353363037, + "learning_rate": 0.00013866049727706366, + "loss": 1.3801, + "step": 23613 + }, + { + "epoch": 0.30685316002943264, + "grad_norm": 0.2817239463329315, + "learning_rate": 0.00013865789781515226, + "loss": 1.2746, + "step": 23614 + }, + { + "epoch": 0.30686615457334854, + "grad_norm": 0.40504273772239685, + "learning_rate": 0.00013865529835324088, + "loss": 1.5184, + "step": 23615 + }, + { + "epoch": 0.3068791491172644, + "grad_norm": 0.4184659421443939, + "learning_rate": 0.0001386526988913295, + "loss": 1.5096, + "step": 23616 + }, + { + "epoch": 0.3068921436611803, + "grad_norm": 0.4181513786315918, + "learning_rate": 0.0001386500994294181, + "loss": 1.3144, + "step": 23617 + }, + { + "epoch": 0.30690513820509613, + "grad_norm": 0.2820641100406647, + "learning_rate": 0.00013864749996750673, + "loss": 1.3355, + "step": 23618 + }, + { + "epoch": 0.30691813274901203, + "grad_norm": 0.5276575088500977, + "learning_rate": 0.00013864490050559535, + "loss": 1.524, + "step": 23619 + }, + { + "epoch": 0.3069311272929279, + "grad_norm": 0.4829508364200592, + "learning_rate": 0.00013864230104368395, + "loss": 1.5497, + "step": 23620 + }, + { + "epoch": 0.3069441218368438, + "grad_norm": 0.3474687933921814, + "learning_rate": 0.00013863970158177258, + "loss": 1.4189, + "step": 23621 + }, + { + "epoch": 0.3069571163807596, + "grad_norm": 0.40267297625541687, + "learning_rate": 0.0001386371021198612, + "loss": 1.2774, + "step": 23622 + }, + { + "epoch": 0.3069701109246755, + "grad_norm": 0.4492112696170807, + "learning_rate": 0.00013863450265794982, + "loss": 1.5127, + "step": 23623 + }, + { + "epoch": 0.30698310546859137, + "grad_norm": 0.4088647663593292, + "learning_rate": 0.00013863190319603842, + "loss": 1.3064, + "step": 23624 + }, + { + "epoch": 0.30699610001250727, + "grad_norm": 0.4043883681297302, + "learning_rate": 0.00013862930373412705, + "loss": 1.4856, + "step": 23625 + }, + { + "epoch": 0.3070090945564231, + "grad_norm": 0.5127471685409546, + "learning_rate": 0.00013862670427221567, + "loss": 1.5484, + "step": 23626 + }, + { + "epoch": 0.307022089100339, + "grad_norm": 0.48295533657073975, + "learning_rate": 0.00013862410481030427, + "loss": 1.4358, + "step": 23627 + }, + { + "epoch": 0.30703508364425486, + "grad_norm": 0.3892424404621124, + "learning_rate": 0.0001386215053483929, + "loss": 1.3772, + "step": 23628 + }, + { + "epoch": 0.30704807818817076, + "grad_norm": 0.3554915487766266, + "learning_rate": 0.0001386189058864815, + "loss": 1.4281, + "step": 23629 + }, + { + "epoch": 0.3070610727320866, + "grad_norm": 0.3611208200454712, + "learning_rate": 0.00013861630642457014, + "loss": 1.5503, + "step": 23630 + }, + { + "epoch": 0.3070740672760025, + "grad_norm": 0.3918308913707733, + "learning_rate": 0.00013861370696265874, + "loss": 1.4471, + "step": 23631 + }, + { + "epoch": 0.30708706181991835, + "grad_norm": 0.33947885036468506, + "learning_rate": 0.00013861110750074736, + "loss": 1.3327, + "step": 23632 + }, + { + "epoch": 0.30710005636383425, + "grad_norm": 0.40992066264152527, + "learning_rate": 0.00013860850803883596, + "loss": 1.3111, + "step": 23633 + }, + { + "epoch": 0.3071130509077501, + "grad_norm": 0.4609469771385193, + "learning_rate": 0.00013860590857692459, + "loss": 1.7449, + "step": 23634 + }, + { + "epoch": 0.307126045451666, + "grad_norm": 0.3659321665763855, + "learning_rate": 0.0001386033091150132, + "loss": 1.2248, + "step": 23635 + }, + { + "epoch": 0.30713903999558184, + "grad_norm": 0.25856778025627136, + "learning_rate": 0.0001386007096531018, + "loss": 1.1068, + "step": 23636 + }, + { + "epoch": 0.30715203453949774, + "grad_norm": 0.49618345499038696, + "learning_rate": 0.00013859811019119043, + "loss": 1.5069, + "step": 23637 + }, + { + "epoch": 0.3071650290834136, + "grad_norm": 0.4758076071739197, + "learning_rate": 0.00013859551072927906, + "loss": 1.5385, + "step": 23638 + }, + { + "epoch": 0.3071780236273295, + "grad_norm": 0.3862117826938629, + "learning_rate": 0.00013859291126736765, + "loss": 1.3563, + "step": 23639 + }, + { + "epoch": 0.30719101817124533, + "grad_norm": 0.40093135833740234, + "learning_rate": 0.00013859031180545628, + "loss": 1.3412, + "step": 23640 + }, + { + "epoch": 0.30720401271516123, + "grad_norm": 0.4450172185897827, + "learning_rate": 0.00013858771234354488, + "loss": 1.47, + "step": 23641 + }, + { + "epoch": 0.3072170072590771, + "grad_norm": 0.4352661371231079, + "learning_rate": 0.00013858511288163353, + "loss": 1.4592, + "step": 23642 + }, + { + "epoch": 0.307230001802993, + "grad_norm": 0.3355281352996826, + "learning_rate": 0.00013858251341972212, + "loss": 1.3442, + "step": 23643 + }, + { + "epoch": 0.3072429963469088, + "grad_norm": 0.36961600184440613, + "learning_rate": 0.00013857991395781075, + "loss": 1.3783, + "step": 23644 + }, + { + "epoch": 0.3072559908908247, + "grad_norm": 0.43317127227783203, + "learning_rate": 0.00013857731449589935, + "loss": 1.4284, + "step": 23645 + }, + { + "epoch": 0.30726898543474057, + "grad_norm": 0.5049710273742676, + "learning_rate": 0.00013857471503398797, + "loss": 1.4564, + "step": 23646 + }, + { + "epoch": 0.30728197997865647, + "grad_norm": 0.4409331977367401, + "learning_rate": 0.0001385721155720766, + "loss": 1.4723, + "step": 23647 + }, + { + "epoch": 0.3072949745225723, + "grad_norm": 0.33348605036735535, + "learning_rate": 0.0001385695161101652, + "loss": 1.3064, + "step": 23648 + }, + { + "epoch": 0.3073079690664882, + "grad_norm": 0.4932759702205658, + "learning_rate": 0.00013856691664825382, + "loss": 1.5257, + "step": 23649 + }, + { + "epoch": 0.30732096361040406, + "grad_norm": 0.5131253600120544, + "learning_rate": 0.00013856431718634244, + "loss": 1.5473, + "step": 23650 + }, + { + "epoch": 0.30733395815431996, + "grad_norm": 0.3839573264122009, + "learning_rate": 0.00013856171772443104, + "loss": 1.2628, + "step": 23651 + }, + { + "epoch": 0.3073469526982358, + "grad_norm": 0.36517488956451416, + "learning_rate": 0.00013855911826251966, + "loss": 1.5401, + "step": 23652 + }, + { + "epoch": 0.3073599472421517, + "grad_norm": 0.45546606183052063, + "learning_rate": 0.00013855651880060826, + "loss": 1.3099, + "step": 23653 + }, + { + "epoch": 0.30737294178606756, + "grad_norm": 0.48339593410491943, + "learning_rate": 0.0001385539193386969, + "loss": 1.3436, + "step": 23654 + }, + { + "epoch": 0.30738593632998346, + "grad_norm": 0.27648887038230896, + "learning_rate": 0.0001385513198767855, + "loss": 1.0945, + "step": 23655 + }, + { + "epoch": 0.3073989308738993, + "grad_norm": 0.42309150099754333, + "learning_rate": 0.00013854872041487413, + "loss": 1.2807, + "step": 23656 + }, + { + "epoch": 0.3074119254178152, + "grad_norm": 0.3902518153190613, + "learning_rate": 0.00013854612095296273, + "loss": 1.3453, + "step": 23657 + }, + { + "epoch": 0.30742491996173105, + "grad_norm": 0.3287469446659088, + "learning_rate": 0.00013854352149105136, + "loss": 1.2096, + "step": 23658 + }, + { + "epoch": 0.30743791450564695, + "grad_norm": 0.4263773560523987, + "learning_rate": 0.00013854092202913998, + "loss": 1.4426, + "step": 23659 + }, + { + "epoch": 0.3074509090495628, + "grad_norm": 0.4225476086139679, + "learning_rate": 0.00013853832256722858, + "loss": 1.5974, + "step": 23660 + }, + { + "epoch": 0.3074639035934787, + "grad_norm": 0.4083250164985657, + "learning_rate": 0.00013853572310531723, + "loss": 1.3769, + "step": 23661 + }, + { + "epoch": 0.30747689813739454, + "grad_norm": 0.427945077419281, + "learning_rate": 0.00013853312364340583, + "loss": 1.6552, + "step": 23662 + }, + { + "epoch": 0.30748989268131044, + "grad_norm": 0.461101233959198, + "learning_rate": 0.00013853052418149442, + "loss": 1.2396, + "step": 23663 + }, + { + "epoch": 0.3075028872252263, + "grad_norm": 0.32514646649360657, + "learning_rate": 0.00013852792471958305, + "loss": 1.2542, + "step": 23664 + }, + { + "epoch": 0.3075158817691422, + "grad_norm": 0.46653270721435547, + "learning_rate": 0.00013852532525767167, + "loss": 1.5008, + "step": 23665 + }, + { + "epoch": 0.30752887631305803, + "grad_norm": 0.4038919508457184, + "learning_rate": 0.0001385227257957603, + "loss": 1.405, + "step": 23666 + }, + { + "epoch": 0.30754187085697393, + "grad_norm": 0.2607548236846924, + "learning_rate": 0.0001385201263338489, + "loss": 1.2664, + "step": 23667 + }, + { + "epoch": 0.3075548654008898, + "grad_norm": 0.3488636612892151, + "learning_rate": 0.00013851752687193752, + "loss": 1.4976, + "step": 23668 + }, + { + "epoch": 0.3075678599448057, + "grad_norm": 0.4039755165576935, + "learning_rate": 0.00013851492741002614, + "loss": 1.4058, + "step": 23669 + }, + { + "epoch": 0.3075808544887215, + "grad_norm": 0.4989168047904968, + "learning_rate": 0.00013851232794811474, + "loss": 1.4851, + "step": 23670 + }, + { + "epoch": 0.3075938490326374, + "grad_norm": 0.39854735136032104, + "learning_rate": 0.00013850972848620337, + "loss": 1.4742, + "step": 23671 + }, + { + "epoch": 0.30760684357655327, + "grad_norm": 0.41402918100357056, + "learning_rate": 0.00013850712902429196, + "loss": 1.3461, + "step": 23672 + }, + { + "epoch": 0.30761983812046917, + "grad_norm": 0.3732379078865051, + "learning_rate": 0.00013850452956238061, + "loss": 1.3689, + "step": 23673 + }, + { + "epoch": 0.307632832664385, + "grad_norm": 0.35589849948883057, + "learning_rate": 0.0001385019301004692, + "loss": 1.4185, + "step": 23674 + }, + { + "epoch": 0.3076458272083009, + "grad_norm": 0.37869277596473694, + "learning_rate": 0.0001384993306385578, + "loss": 1.5981, + "step": 23675 + }, + { + "epoch": 0.3076588217522168, + "grad_norm": 0.486847847700119, + "learning_rate": 0.00013849673117664643, + "loss": 1.5504, + "step": 23676 + }, + { + "epoch": 0.30767181629613266, + "grad_norm": 0.43533119559288025, + "learning_rate": 0.00013849413171473506, + "loss": 1.2873, + "step": 23677 + }, + { + "epoch": 0.30768481084004856, + "grad_norm": 0.4138009250164032, + "learning_rate": 0.00013849153225282368, + "loss": 1.4943, + "step": 23678 + }, + { + "epoch": 0.3076978053839644, + "grad_norm": 0.42973774671554565, + "learning_rate": 0.00013848893279091228, + "loss": 1.4223, + "step": 23679 + }, + { + "epoch": 0.3077107999278803, + "grad_norm": 0.4351615607738495, + "learning_rate": 0.0001384863333290009, + "loss": 1.307, + "step": 23680 + }, + { + "epoch": 0.30772379447179615, + "grad_norm": 0.3348578214645386, + "learning_rate": 0.00013848373386708953, + "loss": 1.4563, + "step": 23681 + }, + { + "epoch": 0.30773678901571205, + "grad_norm": 0.3403237462043762, + "learning_rate": 0.00013848113440517813, + "loss": 1.4801, + "step": 23682 + }, + { + "epoch": 0.3077497835596279, + "grad_norm": 0.33509063720703125, + "learning_rate": 0.00013847853494326675, + "loss": 1.3741, + "step": 23683 + }, + { + "epoch": 0.3077627781035438, + "grad_norm": 0.39930030703544617, + "learning_rate": 0.00013847593548135535, + "loss": 1.2608, + "step": 23684 + }, + { + "epoch": 0.30777577264745964, + "grad_norm": 0.43121635913848877, + "learning_rate": 0.000138473336019444, + "loss": 1.4514, + "step": 23685 + }, + { + "epoch": 0.30778876719137555, + "grad_norm": 0.4374832212924957, + "learning_rate": 0.0001384707365575326, + "loss": 1.4659, + "step": 23686 + }, + { + "epoch": 0.3078017617352914, + "grad_norm": 0.4245457649230957, + "learning_rate": 0.00013846813709562122, + "loss": 1.378, + "step": 23687 + }, + { + "epoch": 0.3078147562792073, + "grad_norm": 0.4714190363883972, + "learning_rate": 0.00013846553763370982, + "loss": 1.5325, + "step": 23688 + }, + { + "epoch": 0.30782775082312314, + "grad_norm": 0.46281975507736206, + "learning_rate": 0.00013846293817179844, + "loss": 1.5707, + "step": 23689 + }, + { + "epoch": 0.30784074536703904, + "grad_norm": 0.3224961459636688, + "learning_rate": 0.00013846033870988707, + "loss": 1.1255, + "step": 23690 + }, + { + "epoch": 0.3078537399109549, + "grad_norm": 0.42874646186828613, + "learning_rate": 0.00013845773924797567, + "loss": 1.5171, + "step": 23691 + }, + { + "epoch": 0.3078667344548708, + "grad_norm": 0.29915958642959595, + "learning_rate": 0.0001384551397860643, + "loss": 1.4222, + "step": 23692 + }, + { + "epoch": 0.30787972899878663, + "grad_norm": 0.37287771701812744, + "learning_rate": 0.00013845254032415291, + "loss": 1.2152, + "step": 23693 + }, + { + "epoch": 0.30789272354270253, + "grad_norm": 0.3369549810886383, + "learning_rate": 0.0001384499408622415, + "loss": 1.5369, + "step": 23694 + }, + { + "epoch": 0.3079057180866184, + "grad_norm": 0.3251512050628662, + "learning_rate": 0.00013844734140033014, + "loss": 1.3568, + "step": 23695 + }, + { + "epoch": 0.3079187126305343, + "grad_norm": 0.38703179359436035, + "learning_rate": 0.00013844474193841876, + "loss": 1.2748, + "step": 23696 + }, + { + "epoch": 0.3079317071744501, + "grad_norm": 0.3561120927333832, + "learning_rate": 0.00013844214247650739, + "loss": 1.3144, + "step": 23697 + }, + { + "epoch": 0.307944701718366, + "grad_norm": 0.338692307472229, + "learning_rate": 0.00013843954301459598, + "loss": 1.1319, + "step": 23698 + }, + { + "epoch": 0.30795769626228187, + "grad_norm": 0.42351263761520386, + "learning_rate": 0.0001384369435526846, + "loss": 1.4552, + "step": 23699 + }, + { + "epoch": 0.30797069080619777, + "grad_norm": 0.3722483217716217, + "learning_rate": 0.00013843434409077323, + "loss": 1.3144, + "step": 23700 + }, + { + "epoch": 0.3079836853501136, + "grad_norm": 0.3416104316711426, + "learning_rate": 0.00013843174462886183, + "loss": 1.3443, + "step": 23701 + }, + { + "epoch": 0.3079966798940295, + "grad_norm": 0.4270249605178833, + "learning_rate": 0.00013842914516695045, + "loss": 1.3215, + "step": 23702 + }, + { + "epoch": 0.30800967443794536, + "grad_norm": 0.33300483226776123, + "learning_rate": 0.00013842654570503905, + "loss": 1.4587, + "step": 23703 + }, + { + "epoch": 0.30802266898186126, + "grad_norm": 0.27737119793891907, + "learning_rate": 0.00013842394624312768, + "loss": 1.2077, + "step": 23704 + }, + { + "epoch": 0.3080356635257771, + "grad_norm": 0.37972211837768555, + "learning_rate": 0.0001384213467812163, + "loss": 1.4058, + "step": 23705 + }, + { + "epoch": 0.308048658069693, + "grad_norm": 0.394265741109848, + "learning_rate": 0.0001384187473193049, + "loss": 1.5386, + "step": 23706 + }, + { + "epoch": 0.30806165261360885, + "grad_norm": 0.3523576855659485, + "learning_rate": 0.00013841614785739352, + "loss": 1.393, + "step": 23707 + }, + { + "epoch": 0.30807464715752475, + "grad_norm": 0.5150784254074097, + "learning_rate": 0.00013841354839548215, + "loss": 1.6029, + "step": 23708 + }, + { + "epoch": 0.3080876417014406, + "grad_norm": 0.5116024017333984, + "learning_rate": 0.00013841094893357077, + "loss": 1.4506, + "step": 23709 + }, + { + "epoch": 0.3081006362453565, + "grad_norm": 0.31057223677635193, + "learning_rate": 0.00013840834947165937, + "loss": 1.2613, + "step": 23710 + }, + { + "epoch": 0.30811363078927234, + "grad_norm": 0.4687395393848419, + "learning_rate": 0.000138405750009748, + "loss": 1.5061, + "step": 23711 + }, + { + "epoch": 0.30812662533318824, + "grad_norm": 0.3337137997150421, + "learning_rate": 0.00013840315054783662, + "loss": 1.3149, + "step": 23712 + }, + { + "epoch": 0.3081396198771041, + "grad_norm": 0.351261705160141, + "learning_rate": 0.00013840055108592521, + "loss": 1.5241, + "step": 23713 + }, + { + "epoch": 0.30815261442102, + "grad_norm": 0.361054390668869, + "learning_rate": 0.00013839795162401384, + "loss": 1.3152, + "step": 23714 + }, + { + "epoch": 0.30816560896493583, + "grad_norm": 0.34404996037483215, + "learning_rate": 0.00013839535216210244, + "loss": 1.4769, + "step": 23715 + }, + { + "epoch": 0.30817860350885173, + "grad_norm": 0.3886276185512543, + "learning_rate": 0.0001383927527001911, + "loss": 1.3482, + "step": 23716 + }, + { + "epoch": 0.3081915980527676, + "grad_norm": 0.3152378797531128, + "learning_rate": 0.00013839015323827969, + "loss": 1.374, + "step": 23717 + }, + { + "epoch": 0.3082045925966835, + "grad_norm": 0.42092639207839966, + "learning_rate": 0.00013838755377636828, + "loss": 1.5369, + "step": 23718 + }, + { + "epoch": 0.3082175871405993, + "grad_norm": 0.36865299940109253, + "learning_rate": 0.0001383849543144569, + "loss": 1.3525, + "step": 23719 + }, + { + "epoch": 0.3082305816845152, + "grad_norm": 0.4288714826107025, + "learning_rate": 0.00013838235485254553, + "loss": 1.4217, + "step": 23720 + }, + { + "epoch": 0.30824357622843107, + "grad_norm": 0.37806907296180725, + "learning_rate": 0.00013837975539063416, + "loss": 1.3303, + "step": 23721 + }, + { + "epoch": 0.30825657077234697, + "grad_norm": 0.380581796169281, + "learning_rate": 0.00013837715592872275, + "loss": 1.2867, + "step": 23722 + }, + { + "epoch": 0.3082695653162628, + "grad_norm": 0.342325896024704, + "learning_rate": 0.00013837455646681138, + "loss": 1.351, + "step": 23723 + }, + { + "epoch": 0.3082825598601787, + "grad_norm": 0.36525553464889526, + "learning_rate": 0.0001383719570049, + "loss": 1.3957, + "step": 23724 + }, + { + "epoch": 0.30829555440409456, + "grad_norm": 0.3850937783718109, + "learning_rate": 0.0001383693575429886, + "loss": 1.3506, + "step": 23725 + }, + { + "epoch": 0.30830854894801046, + "grad_norm": 0.3460325300693512, + "learning_rate": 0.00013836675808107722, + "loss": 1.2405, + "step": 23726 + }, + { + "epoch": 0.3083215434919263, + "grad_norm": 0.36706075072288513, + "learning_rate": 0.00013836415861916582, + "loss": 1.5763, + "step": 23727 + }, + { + "epoch": 0.3083345380358422, + "grad_norm": 0.40368518233299255, + "learning_rate": 0.00013836155915725447, + "loss": 1.2892, + "step": 23728 + }, + { + "epoch": 0.30834753257975805, + "grad_norm": 0.3619697093963623, + "learning_rate": 0.00013835895969534307, + "loss": 1.4129, + "step": 23729 + }, + { + "epoch": 0.30836052712367396, + "grad_norm": 0.4654667377471924, + "learning_rate": 0.00013835636023343167, + "loss": 1.4071, + "step": 23730 + }, + { + "epoch": 0.3083735216675898, + "grad_norm": 0.4690954387187958, + "learning_rate": 0.0001383537607715203, + "loss": 1.6251, + "step": 23731 + }, + { + "epoch": 0.3083865162115057, + "grad_norm": 0.43373632431030273, + "learning_rate": 0.00013835116130960892, + "loss": 1.4548, + "step": 23732 + }, + { + "epoch": 0.30839951075542155, + "grad_norm": 0.3968580961227417, + "learning_rate": 0.00013834856184769754, + "loss": 1.269, + "step": 23733 + }, + { + "epoch": 0.30841250529933745, + "grad_norm": 0.404070645570755, + "learning_rate": 0.00013834596238578614, + "loss": 1.407, + "step": 23734 + }, + { + "epoch": 0.3084254998432533, + "grad_norm": 0.3268396854400635, + "learning_rate": 0.00013834336292387476, + "loss": 1.4472, + "step": 23735 + }, + { + "epoch": 0.3084384943871692, + "grad_norm": 0.46975499391555786, + "learning_rate": 0.0001383407634619634, + "loss": 1.3193, + "step": 23736 + }, + { + "epoch": 0.30845148893108504, + "grad_norm": 0.37644270062446594, + "learning_rate": 0.00013833816400005199, + "loss": 1.5597, + "step": 23737 + }, + { + "epoch": 0.30846448347500094, + "grad_norm": 0.37298479676246643, + "learning_rate": 0.0001383355645381406, + "loss": 1.5519, + "step": 23738 + }, + { + "epoch": 0.3084774780189168, + "grad_norm": 0.40612509846687317, + "learning_rate": 0.00013833296507622923, + "loss": 1.504, + "step": 23739 + }, + { + "epoch": 0.3084904725628327, + "grad_norm": 0.43061867356300354, + "learning_rate": 0.00013833036561431786, + "loss": 1.3877, + "step": 23740 + }, + { + "epoch": 0.30850346710674853, + "grad_norm": 0.39031246304512024, + "learning_rate": 0.00013832776615240646, + "loss": 1.2832, + "step": 23741 + }, + { + "epoch": 0.30851646165066443, + "grad_norm": 0.40483370423316956, + "learning_rate": 0.00013832516669049508, + "loss": 1.412, + "step": 23742 + }, + { + "epoch": 0.3085294561945803, + "grad_norm": 0.5106333494186401, + "learning_rate": 0.0001383225672285837, + "loss": 1.5594, + "step": 23743 + }, + { + "epoch": 0.3085424507384962, + "grad_norm": 0.39754360914230347, + "learning_rate": 0.0001383199677666723, + "loss": 1.4502, + "step": 23744 + }, + { + "epoch": 0.308555445282412, + "grad_norm": 0.3896426558494568, + "learning_rate": 0.00013831736830476093, + "loss": 1.4657, + "step": 23745 + }, + { + "epoch": 0.3085684398263279, + "grad_norm": 0.39586618542671204, + "learning_rate": 0.00013831476884284952, + "loss": 1.4429, + "step": 23746 + }, + { + "epoch": 0.30858143437024377, + "grad_norm": 0.42053499817848206, + "learning_rate": 0.00013831216938093815, + "loss": 1.4572, + "step": 23747 + }, + { + "epoch": 0.30859442891415967, + "grad_norm": 0.4519786834716797, + "learning_rate": 0.00013830956991902677, + "loss": 1.4165, + "step": 23748 + }, + { + "epoch": 0.3086074234580755, + "grad_norm": 0.35225239396095276, + "learning_rate": 0.00013830697045711537, + "loss": 1.2124, + "step": 23749 + }, + { + "epoch": 0.3086204180019914, + "grad_norm": 0.4474756419658661, + "learning_rate": 0.000138304370995204, + "loss": 1.4339, + "step": 23750 + }, + { + "epoch": 0.30863341254590726, + "grad_norm": 0.4044516086578369, + "learning_rate": 0.00013830177153329262, + "loss": 1.3497, + "step": 23751 + }, + { + "epoch": 0.30864640708982316, + "grad_norm": 0.42009204626083374, + "learning_rate": 0.00013829917207138124, + "loss": 1.5014, + "step": 23752 + }, + { + "epoch": 0.30865940163373906, + "grad_norm": 0.3916882276535034, + "learning_rate": 0.00013829657260946984, + "loss": 1.3825, + "step": 23753 + }, + { + "epoch": 0.3086723961776549, + "grad_norm": 0.48238059878349304, + "learning_rate": 0.00013829397314755847, + "loss": 1.5252, + "step": 23754 + }, + { + "epoch": 0.3086853907215708, + "grad_norm": 0.4167851507663727, + "learning_rate": 0.0001382913736856471, + "loss": 1.3571, + "step": 23755 + }, + { + "epoch": 0.30869838526548665, + "grad_norm": 0.3993912637233734, + "learning_rate": 0.0001382887742237357, + "loss": 1.4099, + "step": 23756 + }, + { + "epoch": 0.30871137980940255, + "grad_norm": 0.3138660490512848, + "learning_rate": 0.0001382861747618243, + "loss": 1.3357, + "step": 23757 + }, + { + "epoch": 0.3087243743533184, + "grad_norm": 0.40589895844459534, + "learning_rate": 0.0001382835752999129, + "loss": 1.6265, + "step": 23758 + }, + { + "epoch": 0.3087373688972343, + "grad_norm": 0.4522473216056824, + "learning_rate": 0.00013828097583800153, + "loss": 1.3554, + "step": 23759 + }, + { + "epoch": 0.30875036344115014, + "grad_norm": 0.4732970893383026, + "learning_rate": 0.00013827837637609016, + "loss": 1.4338, + "step": 23760 + }, + { + "epoch": 0.30876335798506604, + "grad_norm": 0.42332056164741516, + "learning_rate": 0.00013827577691417876, + "loss": 1.3054, + "step": 23761 + }, + { + "epoch": 0.3087763525289819, + "grad_norm": 0.39770859479904175, + "learning_rate": 0.00013827317745226738, + "loss": 1.5026, + "step": 23762 + }, + { + "epoch": 0.3087893470728978, + "grad_norm": 0.3936413526535034, + "learning_rate": 0.000138270577990356, + "loss": 1.4661, + "step": 23763 + }, + { + "epoch": 0.30880234161681364, + "grad_norm": 0.42304402589797974, + "learning_rate": 0.00013826797852844463, + "loss": 1.4781, + "step": 23764 + }, + { + "epoch": 0.30881533616072954, + "grad_norm": 0.3309645652770996, + "learning_rate": 0.00013826537906653323, + "loss": 1.4208, + "step": 23765 + }, + { + "epoch": 0.3088283307046454, + "grad_norm": 0.358285516500473, + "learning_rate": 0.00013826277960462185, + "loss": 1.2081, + "step": 23766 + }, + { + "epoch": 0.3088413252485613, + "grad_norm": 0.4188380539417267, + "learning_rate": 0.00013826018014271048, + "loss": 1.3715, + "step": 23767 + }, + { + "epoch": 0.3088543197924771, + "grad_norm": 0.31712132692337036, + "learning_rate": 0.00013825758068079907, + "loss": 1.507, + "step": 23768 + }, + { + "epoch": 0.30886731433639303, + "grad_norm": 0.4045106768608093, + "learning_rate": 0.0001382549812188877, + "loss": 1.3644, + "step": 23769 + }, + { + "epoch": 0.3088803088803089, + "grad_norm": 0.41257748007774353, + "learning_rate": 0.00013825238175697632, + "loss": 1.3753, + "step": 23770 + }, + { + "epoch": 0.3088933034242248, + "grad_norm": 0.4561169743537903, + "learning_rate": 0.00013824978229506495, + "loss": 1.4705, + "step": 23771 + }, + { + "epoch": 0.3089062979681406, + "grad_norm": 0.3939547836780548, + "learning_rate": 0.00013824718283315354, + "loss": 1.5, + "step": 23772 + }, + { + "epoch": 0.3089192925120565, + "grad_norm": 0.36217886209487915, + "learning_rate": 0.00013824458337124214, + "loss": 1.2748, + "step": 23773 + }, + { + "epoch": 0.30893228705597237, + "grad_norm": 0.41370636224746704, + "learning_rate": 0.0001382419839093308, + "loss": 1.3303, + "step": 23774 + }, + { + "epoch": 0.30894528159988827, + "grad_norm": 0.42174336314201355, + "learning_rate": 0.0001382393844474194, + "loss": 1.4376, + "step": 23775 + }, + { + "epoch": 0.3089582761438041, + "grad_norm": 0.40029215812683105, + "learning_rate": 0.00013823678498550801, + "loss": 1.419, + "step": 23776 + }, + { + "epoch": 0.30897127068772, + "grad_norm": 0.38913801312446594, + "learning_rate": 0.0001382341855235966, + "loss": 1.3723, + "step": 23777 + }, + { + "epoch": 0.30898426523163586, + "grad_norm": 0.41501766443252563, + "learning_rate": 0.00013823158606168524, + "loss": 1.3912, + "step": 23778 + }, + { + "epoch": 0.30899725977555176, + "grad_norm": 0.38698235154151917, + "learning_rate": 0.00013822898659977386, + "loss": 1.1963, + "step": 23779 + }, + { + "epoch": 0.3090102543194676, + "grad_norm": 0.35782167315483093, + "learning_rate": 0.00013822638713786246, + "loss": 1.1406, + "step": 23780 + }, + { + "epoch": 0.3090232488633835, + "grad_norm": 0.3086980879306793, + "learning_rate": 0.00013822378767595108, + "loss": 1.4323, + "step": 23781 + }, + { + "epoch": 0.30903624340729935, + "grad_norm": 0.370387464761734, + "learning_rate": 0.0001382211882140397, + "loss": 1.2246, + "step": 23782 + }, + { + "epoch": 0.30904923795121525, + "grad_norm": 0.42195382714271545, + "learning_rate": 0.00013821858875212833, + "loss": 1.2092, + "step": 23783 + }, + { + "epoch": 0.3090622324951311, + "grad_norm": 0.37231874465942383, + "learning_rate": 0.00013821598929021693, + "loss": 1.4821, + "step": 23784 + }, + { + "epoch": 0.309075227039047, + "grad_norm": 0.36413538455963135, + "learning_rate": 0.00013821338982830553, + "loss": 1.4356, + "step": 23785 + }, + { + "epoch": 0.30908822158296284, + "grad_norm": 0.4143666923046112, + "learning_rate": 0.00013821079036639418, + "loss": 1.439, + "step": 23786 + }, + { + "epoch": 0.30910121612687874, + "grad_norm": 0.41055288910865784, + "learning_rate": 0.00013820819090448278, + "loss": 1.2347, + "step": 23787 + }, + { + "epoch": 0.3091142106707946, + "grad_norm": 0.5114784836769104, + "learning_rate": 0.0001382055914425714, + "loss": 1.4228, + "step": 23788 + }, + { + "epoch": 0.3091272052147105, + "grad_norm": 0.4596235454082489, + "learning_rate": 0.00013820299198066, + "loss": 1.4927, + "step": 23789 + }, + { + "epoch": 0.30914019975862633, + "grad_norm": 0.3260226845741272, + "learning_rate": 0.00013820039251874862, + "loss": 1.1972, + "step": 23790 + }, + { + "epoch": 0.30915319430254223, + "grad_norm": 0.51402348279953, + "learning_rate": 0.00013819779305683725, + "loss": 1.3992, + "step": 23791 + }, + { + "epoch": 0.3091661888464581, + "grad_norm": 0.4389089047908783, + "learning_rate": 0.00013819519359492584, + "loss": 1.463, + "step": 23792 + }, + { + "epoch": 0.309179183390374, + "grad_norm": 0.4586727023124695, + "learning_rate": 0.00013819259413301447, + "loss": 1.3317, + "step": 23793 + }, + { + "epoch": 0.3091921779342898, + "grad_norm": 0.38384613394737244, + "learning_rate": 0.0001381899946711031, + "loss": 1.3695, + "step": 23794 + }, + { + "epoch": 0.3092051724782057, + "grad_norm": 0.4415891170501709, + "learning_rate": 0.00013818739520919172, + "loss": 1.3687, + "step": 23795 + }, + { + "epoch": 0.30921816702212157, + "grad_norm": 0.45097705721855164, + "learning_rate": 0.00013818479574728031, + "loss": 1.4676, + "step": 23796 + }, + { + "epoch": 0.30923116156603747, + "grad_norm": 0.444897323846817, + "learning_rate": 0.0001381821962853689, + "loss": 1.4107, + "step": 23797 + }, + { + "epoch": 0.3092441561099533, + "grad_norm": 0.4139679968357086, + "learning_rate": 0.00013817959682345756, + "loss": 1.415, + "step": 23798 + }, + { + "epoch": 0.3092571506538692, + "grad_norm": 0.28131258487701416, + "learning_rate": 0.00013817699736154616, + "loss": 1.2654, + "step": 23799 + }, + { + "epoch": 0.30927014519778506, + "grad_norm": 0.5153700709342957, + "learning_rate": 0.00013817439789963479, + "loss": 1.5621, + "step": 23800 + }, + { + "epoch": 0.30928313974170096, + "grad_norm": 0.39196959137916565, + "learning_rate": 0.00013817179843772338, + "loss": 1.5736, + "step": 23801 + }, + { + "epoch": 0.3092961342856168, + "grad_norm": 0.43468302488327026, + "learning_rate": 0.000138169198975812, + "loss": 1.4264, + "step": 23802 + }, + { + "epoch": 0.3093091288295327, + "grad_norm": 0.328887939453125, + "learning_rate": 0.00013816659951390063, + "loss": 1.2911, + "step": 23803 + }, + { + "epoch": 0.30932212337344855, + "grad_norm": 0.3720610737800598, + "learning_rate": 0.00013816400005198923, + "loss": 1.4383, + "step": 23804 + }, + { + "epoch": 0.30933511791736445, + "grad_norm": 0.4355788826942444, + "learning_rate": 0.00013816140059007785, + "loss": 1.4154, + "step": 23805 + }, + { + "epoch": 0.3093481124612803, + "grad_norm": 0.41605398058891296, + "learning_rate": 0.00013815880112816648, + "loss": 1.4601, + "step": 23806 + }, + { + "epoch": 0.3093611070051962, + "grad_norm": 0.23450107872486115, + "learning_rate": 0.0001381562016662551, + "loss": 1.2207, + "step": 23807 + }, + { + "epoch": 0.30937410154911205, + "grad_norm": 0.45504528284072876, + "learning_rate": 0.0001381536022043437, + "loss": 1.4695, + "step": 23808 + }, + { + "epoch": 0.30938709609302795, + "grad_norm": 0.34779179096221924, + "learning_rate": 0.00013815100274243232, + "loss": 1.1928, + "step": 23809 + }, + { + "epoch": 0.3094000906369438, + "grad_norm": 0.4406035840511322, + "learning_rate": 0.00013814840328052095, + "loss": 1.496, + "step": 23810 + }, + { + "epoch": 0.3094130851808597, + "grad_norm": 0.3680429756641388, + "learning_rate": 0.00013814580381860955, + "loss": 1.2381, + "step": 23811 + }, + { + "epoch": 0.30942607972477554, + "grad_norm": 0.3156750202178955, + "learning_rate": 0.00013814320435669817, + "loss": 1.3683, + "step": 23812 + }, + { + "epoch": 0.30943907426869144, + "grad_norm": 0.4043225944042206, + "learning_rate": 0.0001381406048947868, + "loss": 1.5869, + "step": 23813 + }, + { + "epoch": 0.3094520688126073, + "grad_norm": 0.45116084814071655, + "learning_rate": 0.0001381380054328754, + "loss": 1.4057, + "step": 23814 + }, + { + "epoch": 0.3094650633565232, + "grad_norm": 0.3871017396450043, + "learning_rate": 0.00013813540597096402, + "loss": 1.3121, + "step": 23815 + }, + { + "epoch": 0.30947805790043903, + "grad_norm": 0.4697607457637787, + "learning_rate": 0.00013813280650905261, + "loss": 1.5304, + "step": 23816 + }, + { + "epoch": 0.30949105244435493, + "grad_norm": 0.39670342206954956, + "learning_rate": 0.00013813020704714127, + "loss": 1.5512, + "step": 23817 + }, + { + "epoch": 0.3095040469882708, + "grad_norm": 0.48963800072669983, + "learning_rate": 0.00013812760758522986, + "loss": 1.309, + "step": 23818 + }, + { + "epoch": 0.3095170415321867, + "grad_norm": 0.3558309078216553, + "learning_rate": 0.0001381250081233185, + "loss": 1.3864, + "step": 23819 + }, + { + "epoch": 0.3095300360761025, + "grad_norm": 0.3990516662597656, + "learning_rate": 0.00013812240866140709, + "loss": 1.472, + "step": 23820 + }, + { + "epoch": 0.3095430306200184, + "grad_norm": 0.38804891705513, + "learning_rate": 0.0001381198091994957, + "loss": 1.5044, + "step": 23821 + }, + { + "epoch": 0.30955602516393427, + "grad_norm": 0.37251004576683044, + "learning_rate": 0.00013811720973758433, + "loss": 1.4186, + "step": 23822 + }, + { + "epoch": 0.30956901970785017, + "grad_norm": 0.46192091703414917, + "learning_rate": 0.00013811461027567293, + "loss": 1.3742, + "step": 23823 + }, + { + "epoch": 0.309582014251766, + "grad_norm": 0.40239229798316956, + "learning_rate": 0.00013811201081376156, + "loss": 1.448, + "step": 23824 + }, + { + "epoch": 0.3095950087956819, + "grad_norm": 0.35058051347732544, + "learning_rate": 0.00013810941135185018, + "loss": 1.4288, + "step": 23825 + }, + { + "epoch": 0.30960800333959776, + "grad_norm": 0.43243512511253357, + "learning_rate": 0.0001381068118899388, + "loss": 1.3979, + "step": 23826 + }, + { + "epoch": 0.30962099788351366, + "grad_norm": 0.39954861998558044, + "learning_rate": 0.0001381042124280274, + "loss": 1.6014, + "step": 23827 + }, + { + "epoch": 0.30963399242742956, + "grad_norm": 0.34985974431037903, + "learning_rate": 0.000138101612966116, + "loss": 1.3475, + "step": 23828 + }, + { + "epoch": 0.3096469869713454, + "grad_norm": 0.3275580406188965, + "learning_rate": 0.00013809901350420465, + "loss": 1.4011, + "step": 23829 + }, + { + "epoch": 0.3096599815152613, + "grad_norm": 0.49311327934265137, + "learning_rate": 0.00013809641404229325, + "loss": 1.3243, + "step": 23830 + }, + { + "epoch": 0.30967297605917715, + "grad_norm": 0.43973541259765625, + "learning_rate": 0.00013809381458038187, + "loss": 1.334, + "step": 23831 + }, + { + "epoch": 0.30968597060309305, + "grad_norm": 0.42753127217292786, + "learning_rate": 0.00013809121511847047, + "loss": 1.4136, + "step": 23832 + }, + { + "epoch": 0.3096989651470089, + "grad_norm": 0.43281760811805725, + "learning_rate": 0.0001380886156565591, + "loss": 1.3877, + "step": 23833 + }, + { + "epoch": 0.3097119596909248, + "grad_norm": 0.49253880977630615, + "learning_rate": 0.00013808601619464772, + "loss": 1.4942, + "step": 23834 + }, + { + "epoch": 0.30972495423484064, + "grad_norm": 0.4486534595489502, + "learning_rate": 0.00013808341673273632, + "loss": 1.4632, + "step": 23835 + }, + { + "epoch": 0.30973794877875654, + "grad_norm": 0.41377606987953186, + "learning_rate": 0.00013808081727082494, + "loss": 1.4024, + "step": 23836 + }, + { + "epoch": 0.3097509433226724, + "grad_norm": 0.4178917109966278, + "learning_rate": 0.00013807821780891357, + "loss": 1.5282, + "step": 23837 + }, + { + "epoch": 0.3097639378665883, + "grad_norm": 0.44078192114830017, + "learning_rate": 0.0001380756183470022, + "loss": 1.2529, + "step": 23838 + }, + { + "epoch": 0.30977693241050414, + "grad_norm": 0.3333128094673157, + "learning_rate": 0.0001380730188850908, + "loss": 1.2448, + "step": 23839 + }, + { + "epoch": 0.30978992695442004, + "grad_norm": 0.4325866401195526, + "learning_rate": 0.00013807041942317939, + "loss": 1.3517, + "step": 23840 + }, + { + "epoch": 0.3098029214983359, + "grad_norm": 0.40073856711387634, + "learning_rate": 0.00013806781996126804, + "loss": 1.285, + "step": 23841 + }, + { + "epoch": 0.3098159160422518, + "grad_norm": 0.3905218243598938, + "learning_rate": 0.00013806522049935663, + "loss": 1.5221, + "step": 23842 + }, + { + "epoch": 0.3098289105861676, + "grad_norm": 0.4415857791900635, + "learning_rate": 0.00013806262103744526, + "loss": 1.4371, + "step": 23843 + }, + { + "epoch": 0.30984190513008353, + "grad_norm": 0.37541815638542175, + "learning_rate": 0.00013806002157553388, + "loss": 1.362, + "step": 23844 + }, + { + "epoch": 0.3098548996739994, + "grad_norm": 0.4728158116340637, + "learning_rate": 0.00013805742211362248, + "loss": 1.5003, + "step": 23845 + }, + { + "epoch": 0.3098678942179153, + "grad_norm": 0.3400465250015259, + "learning_rate": 0.0001380548226517111, + "loss": 1.4446, + "step": 23846 + }, + { + "epoch": 0.3098808887618311, + "grad_norm": 0.4620821475982666, + "learning_rate": 0.0001380522231897997, + "loss": 1.55, + "step": 23847 + }, + { + "epoch": 0.309893883305747, + "grad_norm": 0.4416901767253876, + "learning_rate": 0.00013804962372788835, + "loss": 1.514, + "step": 23848 + }, + { + "epoch": 0.30990687784966287, + "grad_norm": 0.37687355279922485, + "learning_rate": 0.00013804702426597695, + "loss": 1.249, + "step": 23849 + }, + { + "epoch": 0.30991987239357877, + "grad_norm": 0.4145062565803528, + "learning_rate": 0.00013804442480406558, + "loss": 1.3067, + "step": 23850 + }, + { + "epoch": 0.3099328669374946, + "grad_norm": 0.39866968989372253, + "learning_rate": 0.00013804182534215417, + "loss": 1.6535, + "step": 23851 + }, + { + "epoch": 0.3099458614814105, + "grad_norm": 0.42530226707458496, + "learning_rate": 0.0001380392258802428, + "loss": 1.3922, + "step": 23852 + }, + { + "epoch": 0.30995885602532636, + "grad_norm": 0.45034313201904297, + "learning_rate": 0.00013803662641833142, + "loss": 1.5332, + "step": 23853 + }, + { + "epoch": 0.30997185056924226, + "grad_norm": 0.43751785159111023, + "learning_rate": 0.00013803402695642002, + "loss": 1.749, + "step": 23854 + }, + { + "epoch": 0.3099848451131581, + "grad_norm": 0.43934813141822815, + "learning_rate": 0.00013803142749450864, + "loss": 1.4842, + "step": 23855 + }, + { + "epoch": 0.309997839657074, + "grad_norm": 0.36137276887893677, + "learning_rate": 0.00013802882803259727, + "loss": 1.3301, + "step": 23856 + }, + { + "epoch": 0.31001083420098985, + "grad_norm": 0.39494848251342773, + "learning_rate": 0.00013802622857068587, + "loss": 1.3921, + "step": 23857 + }, + { + "epoch": 0.31002382874490575, + "grad_norm": 0.3148389756679535, + "learning_rate": 0.0001380236291087745, + "loss": 1.3547, + "step": 23858 + }, + { + "epoch": 0.3100368232888216, + "grad_norm": 0.3343336582183838, + "learning_rate": 0.0001380210296468631, + "loss": 1.5378, + "step": 23859 + }, + { + "epoch": 0.3100498178327375, + "grad_norm": 0.496134489774704, + "learning_rate": 0.00013801843018495174, + "loss": 1.5298, + "step": 23860 + }, + { + "epoch": 0.31006281237665334, + "grad_norm": 0.46657252311706543, + "learning_rate": 0.00013801583072304034, + "loss": 1.3416, + "step": 23861 + }, + { + "epoch": 0.31007580692056924, + "grad_norm": 0.45345184206962585, + "learning_rate": 0.00013801323126112896, + "loss": 1.2855, + "step": 23862 + }, + { + "epoch": 0.3100888014644851, + "grad_norm": 0.42172691226005554, + "learning_rate": 0.00013801063179921756, + "loss": 1.4361, + "step": 23863 + }, + { + "epoch": 0.310101796008401, + "grad_norm": 0.35896340012550354, + "learning_rate": 0.00013800803233730618, + "loss": 1.3581, + "step": 23864 + }, + { + "epoch": 0.31011479055231683, + "grad_norm": 0.41128477454185486, + "learning_rate": 0.0001380054328753948, + "loss": 1.3925, + "step": 23865 + }, + { + "epoch": 0.31012778509623273, + "grad_norm": 0.2846456468105316, + "learning_rate": 0.0001380028334134834, + "loss": 1.2609, + "step": 23866 + }, + { + "epoch": 0.3101407796401486, + "grad_norm": 0.3456208109855652, + "learning_rate": 0.00013800023395157203, + "loss": 1.2951, + "step": 23867 + }, + { + "epoch": 0.3101537741840645, + "grad_norm": 0.40712153911590576, + "learning_rate": 0.00013799763448966065, + "loss": 1.4023, + "step": 23868 + }, + { + "epoch": 0.3101667687279803, + "grad_norm": 0.4775449335575104, + "learning_rate": 0.00013799503502774925, + "loss": 1.1947, + "step": 23869 + }, + { + "epoch": 0.3101797632718962, + "grad_norm": 0.39188286662101746, + "learning_rate": 0.00013799243556583788, + "loss": 1.2654, + "step": 23870 + }, + { + "epoch": 0.31019275781581207, + "grad_norm": 0.4361734986305237, + "learning_rate": 0.00013798983610392647, + "loss": 1.3095, + "step": 23871 + }, + { + "epoch": 0.31020575235972797, + "grad_norm": 0.48571932315826416, + "learning_rate": 0.00013798723664201513, + "loss": 1.5007, + "step": 23872 + }, + { + "epoch": 0.3102187469036438, + "grad_norm": 0.38242021203041077, + "learning_rate": 0.00013798463718010372, + "loss": 1.3414, + "step": 23873 + }, + { + "epoch": 0.3102317414475597, + "grad_norm": 0.429584801197052, + "learning_rate": 0.00013798203771819235, + "loss": 1.3548, + "step": 23874 + }, + { + "epoch": 0.31024473599147556, + "grad_norm": 0.4487929046154022, + "learning_rate": 0.00013797943825628094, + "loss": 1.4083, + "step": 23875 + }, + { + "epoch": 0.31025773053539146, + "grad_norm": 0.44359031319618225, + "learning_rate": 0.00013797683879436957, + "loss": 1.297, + "step": 23876 + }, + { + "epoch": 0.3102707250793073, + "grad_norm": 0.3933943510055542, + "learning_rate": 0.0001379742393324582, + "loss": 1.1522, + "step": 23877 + }, + { + "epoch": 0.3102837196232232, + "grad_norm": 0.419813334941864, + "learning_rate": 0.0001379716398705468, + "loss": 1.3617, + "step": 23878 + }, + { + "epoch": 0.31029671416713905, + "grad_norm": 0.37526410818099976, + "learning_rate": 0.00013796904040863542, + "loss": 1.222, + "step": 23879 + }, + { + "epoch": 0.31030970871105495, + "grad_norm": 0.4736362397670746, + "learning_rate": 0.00013796644094672404, + "loss": 1.4928, + "step": 23880 + }, + { + "epoch": 0.3103227032549708, + "grad_norm": 0.2675705850124359, + "learning_rate": 0.00013796384148481264, + "loss": 1.3189, + "step": 23881 + }, + { + "epoch": 0.3103356977988867, + "grad_norm": 0.4965255558490753, + "learning_rate": 0.00013796124202290126, + "loss": 1.3363, + "step": 23882 + }, + { + "epoch": 0.31034869234280255, + "grad_norm": 0.4060961604118347, + "learning_rate": 0.00013795864256098989, + "loss": 1.2321, + "step": 23883 + }, + { + "epoch": 0.31036168688671845, + "grad_norm": 0.349907785654068, + "learning_rate": 0.0001379560430990785, + "loss": 1.59, + "step": 23884 + }, + { + "epoch": 0.3103746814306343, + "grad_norm": 0.39603179693222046, + "learning_rate": 0.0001379534436371671, + "loss": 1.4503, + "step": 23885 + }, + { + "epoch": 0.3103876759745502, + "grad_norm": 0.5262779593467712, + "learning_rate": 0.00013795084417525573, + "loss": 1.4417, + "step": 23886 + }, + { + "epoch": 0.31040067051846604, + "grad_norm": 0.46533364057540894, + "learning_rate": 0.00013794824471334436, + "loss": 1.6625, + "step": 23887 + }, + { + "epoch": 0.31041366506238194, + "grad_norm": 0.3112325668334961, + "learning_rate": 0.00013794564525143295, + "loss": 1.1391, + "step": 23888 + }, + { + "epoch": 0.3104266596062978, + "grad_norm": 0.5040755867958069, + "learning_rate": 0.00013794304578952158, + "loss": 1.4223, + "step": 23889 + }, + { + "epoch": 0.3104396541502137, + "grad_norm": 0.3455553948879242, + "learning_rate": 0.00013794044632761018, + "loss": 1.3353, + "step": 23890 + }, + { + "epoch": 0.31045264869412953, + "grad_norm": 0.40547439455986023, + "learning_rate": 0.00013793784686569883, + "loss": 1.3239, + "step": 23891 + }, + { + "epoch": 0.31046564323804543, + "grad_norm": 0.4873808026313782, + "learning_rate": 0.00013793524740378743, + "loss": 1.4291, + "step": 23892 + }, + { + "epoch": 0.3104786377819613, + "grad_norm": 0.3819611966609955, + "learning_rate": 0.00013793264794187605, + "loss": 1.236, + "step": 23893 + }, + { + "epoch": 0.3104916323258772, + "grad_norm": 0.4641636610031128, + "learning_rate": 0.00013793004847996465, + "loss": 1.4637, + "step": 23894 + }, + { + "epoch": 0.310504626869793, + "grad_norm": 0.3523745536804199, + "learning_rate": 0.00013792744901805327, + "loss": 1.1905, + "step": 23895 + }, + { + "epoch": 0.3105176214137089, + "grad_norm": 0.44878754019737244, + "learning_rate": 0.0001379248495561419, + "loss": 1.603, + "step": 23896 + }, + { + "epoch": 0.31053061595762477, + "grad_norm": 0.3051418662071228, + "learning_rate": 0.0001379222500942305, + "loss": 1.5226, + "step": 23897 + }, + { + "epoch": 0.31054361050154067, + "grad_norm": 0.39380788803100586, + "learning_rate": 0.00013791965063231912, + "loss": 1.36, + "step": 23898 + }, + { + "epoch": 0.3105566050454565, + "grad_norm": 0.4346606731414795, + "learning_rate": 0.00013791705117040774, + "loss": 1.5733, + "step": 23899 + }, + { + "epoch": 0.3105695995893724, + "grad_norm": 0.39551079273223877, + "learning_rate": 0.00013791445170849634, + "loss": 1.5093, + "step": 23900 + }, + { + "epoch": 0.31058259413328826, + "grad_norm": 0.4321194291114807, + "learning_rate": 0.00013791185224658496, + "loss": 1.3279, + "step": 23901 + }, + { + "epoch": 0.31059558867720416, + "grad_norm": 0.33355411887168884, + "learning_rate": 0.00013790925278467356, + "loss": 1.3862, + "step": 23902 + }, + { + "epoch": 0.31060858322112, + "grad_norm": 0.4714154601097107, + "learning_rate": 0.0001379066533227622, + "loss": 1.5895, + "step": 23903 + }, + { + "epoch": 0.3106215777650359, + "grad_norm": 0.3886106014251709, + "learning_rate": 0.0001379040538608508, + "loss": 1.2789, + "step": 23904 + }, + { + "epoch": 0.3106345723089518, + "grad_norm": 0.37490972876548767, + "learning_rate": 0.00013790145439893944, + "loss": 1.3529, + "step": 23905 + }, + { + "epoch": 0.31064756685286765, + "grad_norm": 0.30774742364883423, + "learning_rate": 0.00013789885493702803, + "loss": 1.5286, + "step": 23906 + }, + { + "epoch": 0.31066056139678355, + "grad_norm": 0.3692784607410431, + "learning_rate": 0.00013789625547511666, + "loss": 1.3646, + "step": 23907 + }, + { + "epoch": 0.3106735559406994, + "grad_norm": 0.41448262333869934, + "learning_rate": 0.00013789365601320528, + "loss": 1.2563, + "step": 23908 + }, + { + "epoch": 0.3106865504846153, + "grad_norm": 0.3954908847808838, + "learning_rate": 0.00013789105655129388, + "loss": 1.2521, + "step": 23909 + }, + { + "epoch": 0.31069954502853114, + "grad_norm": 0.39265531301498413, + "learning_rate": 0.0001378884570893825, + "loss": 1.5022, + "step": 23910 + }, + { + "epoch": 0.31071253957244704, + "grad_norm": 0.3893373906612396, + "learning_rate": 0.00013788585762747113, + "loss": 1.4652, + "step": 23911 + }, + { + "epoch": 0.3107255341163629, + "grad_norm": 0.5740199685096741, + "learning_rate": 0.00013788325816555973, + "loss": 1.1323, + "step": 23912 + }, + { + "epoch": 0.3107385286602788, + "grad_norm": 0.39213043451309204, + "learning_rate": 0.00013788065870364835, + "loss": 1.5641, + "step": 23913 + }, + { + "epoch": 0.31075152320419464, + "grad_norm": 0.4795304834842682, + "learning_rate": 0.00013787805924173695, + "loss": 1.3815, + "step": 23914 + }, + { + "epoch": 0.31076451774811054, + "grad_norm": 0.6655697226524353, + "learning_rate": 0.0001378754597798256, + "loss": 1.4232, + "step": 23915 + }, + { + "epoch": 0.3107775122920264, + "grad_norm": 0.35082104802131653, + "learning_rate": 0.0001378728603179142, + "loss": 1.1995, + "step": 23916 + }, + { + "epoch": 0.3107905068359423, + "grad_norm": 0.418209433555603, + "learning_rate": 0.00013787026085600282, + "loss": 1.4208, + "step": 23917 + }, + { + "epoch": 0.3108035013798581, + "grad_norm": 0.31623315811157227, + "learning_rate": 0.00013786766139409144, + "loss": 1.2735, + "step": 23918 + }, + { + "epoch": 0.310816495923774, + "grad_norm": 0.4177623987197876, + "learning_rate": 0.00013786506193218004, + "loss": 1.5594, + "step": 23919 + }, + { + "epoch": 0.3108294904676899, + "grad_norm": 0.3654765486717224, + "learning_rate": 0.00013786246247026867, + "loss": 1.4608, + "step": 23920 + }, + { + "epoch": 0.3108424850116058, + "grad_norm": 0.42051026225090027, + "learning_rate": 0.00013785986300835726, + "loss": 1.5841, + "step": 23921 + }, + { + "epoch": 0.3108554795555216, + "grad_norm": 0.3291291892528534, + "learning_rate": 0.00013785726354644592, + "loss": 1.4767, + "step": 23922 + }, + { + "epoch": 0.3108684740994375, + "grad_norm": 0.39693668484687805, + "learning_rate": 0.0001378546640845345, + "loss": 1.6129, + "step": 23923 + }, + { + "epoch": 0.31088146864335336, + "grad_norm": 0.4205494523048401, + "learning_rate": 0.0001378520646226231, + "loss": 1.319, + "step": 23924 + }, + { + "epoch": 0.31089446318726927, + "grad_norm": 0.4231456518173218, + "learning_rate": 0.00013784946516071173, + "loss": 1.5289, + "step": 23925 + }, + { + "epoch": 0.3109074577311851, + "grad_norm": 0.386369526386261, + "learning_rate": 0.00013784686569880036, + "loss": 1.415, + "step": 23926 + }, + { + "epoch": 0.310920452275101, + "grad_norm": 0.43499818444252014, + "learning_rate": 0.00013784426623688898, + "loss": 1.5224, + "step": 23927 + }, + { + "epoch": 0.31093344681901686, + "grad_norm": 0.3281128406524658, + "learning_rate": 0.00013784166677497758, + "loss": 1.1598, + "step": 23928 + }, + { + "epoch": 0.31094644136293276, + "grad_norm": 0.4508039653301239, + "learning_rate": 0.0001378390673130662, + "loss": 1.3316, + "step": 23929 + }, + { + "epoch": 0.3109594359068486, + "grad_norm": 0.43688860535621643, + "learning_rate": 0.00013783646785115483, + "loss": 1.5154, + "step": 23930 + }, + { + "epoch": 0.3109724304507645, + "grad_norm": 0.48640280961990356, + "learning_rate": 0.00013783386838924343, + "loss": 1.4405, + "step": 23931 + }, + { + "epoch": 0.31098542499468035, + "grad_norm": 0.40494704246520996, + "learning_rate": 0.00013783126892733205, + "loss": 1.4462, + "step": 23932 + }, + { + "epoch": 0.31099841953859625, + "grad_norm": 0.4335281252861023, + "learning_rate": 0.00013782866946542065, + "loss": 1.3523, + "step": 23933 + }, + { + "epoch": 0.3110114140825121, + "grad_norm": 0.30513033270835876, + "learning_rate": 0.0001378260700035093, + "loss": 1.4079, + "step": 23934 + }, + { + "epoch": 0.311024408626428, + "grad_norm": 0.3715493381023407, + "learning_rate": 0.0001378234705415979, + "loss": 1.402, + "step": 23935 + }, + { + "epoch": 0.31103740317034384, + "grad_norm": 0.46856236457824707, + "learning_rate": 0.0001378208710796865, + "loss": 1.3601, + "step": 23936 + }, + { + "epoch": 0.31105039771425974, + "grad_norm": 0.4111510217189789, + "learning_rate": 0.00013781827161777512, + "loss": 1.3899, + "step": 23937 + }, + { + "epoch": 0.3110633922581756, + "grad_norm": 0.30440622568130493, + "learning_rate": 0.00013781567215586374, + "loss": 1.5925, + "step": 23938 + }, + { + "epoch": 0.3110763868020915, + "grad_norm": 0.38103538751602173, + "learning_rate": 0.00013781307269395237, + "loss": 1.4483, + "step": 23939 + }, + { + "epoch": 0.31108938134600733, + "grad_norm": 0.4235471487045288, + "learning_rate": 0.00013781047323204097, + "loss": 1.394, + "step": 23940 + }, + { + "epoch": 0.31110237588992323, + "grad_norm": 0.42663297057151794, + "learning_rate": 0.0001378078737701296, + "loss": 1.2704, + "step": 23941 + }, + { + "epoch": 0.3111153704338391, + "grad_norm": 0.3289090394973755, + "learning_rate": 0.00013780527430821822, + "loss": 1.4361, + "step": 23942 + }, + { + "epoch": 0.311128364977755, + "grad_norm": 0.40683725476264954, + "learning_rate": 0.0001378026748463068, + "loss": 1.2902, + "step": 23943 + }, + { + "epoch": 0.3111413595216708, + "grad_norm": 0.39345088601112366, + "learning_rate": 0.00013780007538439544, + "loss": 1.4135, + "step": 23944 + }, + { + "epoch": 0.3111543540655867, + "grad_norm": 0.48659366369247437, + "learning_rate": 0.00013779747592248403, + "loss": 1.3726, + "step": 23945 + }, + { + "epoch": 0.31116734860950257, + "grad_norm": 0.37349313497543335, + "learning_rate": 0.0001377948764605727, + "loss": 1.3369, + "step": 23946 + }, + { + "epoch": 0.31118034315341847, + "grad_norm": 0.3508731424808502, + "learning_rate": 0.00013779227699866128, + "loss": 1.4158, + "step": 23947 + }, + { + "epoch": 0.3111933376973343, + "grad_norm": 0.4582858979701996, + "learning_rate": 0.0001377896775367499, + "loss": 1.4077, + "step": 23948 + }, + { + "epoch": 0.3112063322412502, + "grad_norm": 0.37235763669013977, + "learning_rate": 0.0001377870780748385, + "loss": 1.3243, + "step": 23949 + }, + { + "epoch": 0.31121932678516606, + "grad_norm": 0.45014306902885437, + "learning_rate": 0.00013778447861292713, + "loss": 1.4114, + "step": 23950 + }, + { + "epoch": 0.31123232132908196, + "grad_norm": 0.42739301919937134, + "learning_rate": 0.00013778187915101575, + "loss": 1.5206, + "step": 23951 + }, + { + "epoch": 0.3112453158729978, + "grad_norm": 0.3125927448272705, + "learning_rate": 0.00013777927968910435, + "loss": 1.2116, + "step": 23952 + }, + { + "epoch": 0.3112583104169137, + "grad_norm": 0.4059589207172394, + "learning_rate": 0.00013777668022719298, + "loss": 1.2822, + "step": 23953 + }, + { + "epoch": 0.31127130496082955, + "grad_norm": 0.37575000524520874, + "learning_rate": 0.0001377740807652816, + "loss": 1.4164, + "step": 23954 + }, + { + "epoch": 0.31128429950474545, + "grad_norm": 0.32792118191719055, + "learning_rate": 0.0001377714813033702, + "loss": 1.2217, + "step": 23955 + }, + { + "epoch": 0.3112972940486613, + "grad_norm": 0.41451743245124817, + "learning_rate": 0.00013776888184145882, + "loss": 1.4349, + "step": 23956 + }, + { + "epoch": 0.3113102885925772, + "grad_norm": 0.3205423951148987, + "learning_rate": 0.00013776628237954745, + "loss": 1.3984, + "step": 23957 + }, + { + "epoch": 0.31132328313649305, + "grad_norm": 0.44887575507164, + "learning_rate": 0.00013776368291763607, + "loss": 1.4088, + "step": 23958 + }, + { + "epoch": 0.31133627768040895, + "grad_norm": 0.39050063490867615, + "learning_rate": 0.00013776108345572467, + "loss": 1.5116, + "step": 23959 + }, + { + "epoch": 0.3113492722243248, + "grad_norm": 0.38548868894577026, + "learning_rate": 0.0001377584839938133, + "loss": 1.609, + "step": 23960 + }, + { + "epoch": 0.3113622667682407, + "grad_norm": 0.4732484221458435, + "learning_rate": 0.00013775588453190192, + "loss": 1.5134, + "step": 23961 + }, + { + "epoch": 0.31137526131215654, + "grad_norm": 0.3074033856391907, + "learning_rate": 0.00013775328506999052, + "loss": 1.1652, + "step": 23962 + }, + { + "epoch": 0.31138825585607244, + "grad_norm": 0.3868245780467987, + "learning_rate": 0.00013775068560807914, + "loss": 1.3733, + "step": 23963 + }, + { + "epoch": 0.3114012503999883, + "grad_norm": 0.4524303674697876, + "learning_rate": 0.00013774808614616774, + "loss": 1.3938, + "step": 23964 + }, + { + "epoch": 0.3114142449439042, + "grad_norm": 0.3855283260345459, + "learning_rate": 0.00013774548668425636, + "loss": 1.3352, + "step": 23965 + }, + { + "epoch": 0.31142723948782003, + "grad_norm": 0.32370954751968384, + "learning_rate": 0.00013774288722234499, + "loss": 1.3453, + "step": 23966 + }, + { + "epoch": 0.31144023403173593, + "grad_norm": 0.4870944619178772, + "learning_rate": 0.00013774028776043358, + "loss": 1.3989, + "step": 23967 + }, + { + "epoch": 0.3114532285756518, + "grad_norm": 0.405438631772995, + "learning_rate": 0.0001377376882985222, + "loss": 1.4075, + "step": 23968 + }, + { + "epoch": 0.3114662231195677, + "grad_norm": 0.34983202815055847, + "learning_rate": 0.00013773508883661083, + "loss": 1.5481, + "step": 23969 + }, + { + "epoch": 0.3114792176634835, + "grad_norm": 0.2988024055957794, + "learning_rate": 0.00013773248937469946, + "loss": 1.333, + "step": 23970 + }, + { + "epoch": 0.3114922122073994, + "grad_norm": 0.3165874779224396, + "learning_rate": 0.00013772988991278805, + "loss": 1.2441, + "step": 23971 + }, + { + "epoch": 0.31150520675131527, + "grad_norm": 0.3601977527141571, + "learning_rate": 0.00013772729045087668, + "loss": 1.3238, + "step": 23972 + }, + { + "epoch": 0.31151820129523117, + "grad_norm": 0.4172667860984802, + "learning_rate": 0.0001377246909889653, + "loss": 1.3174, + "step": 23973 + }, + { + "epoch": 0.311531195839147, + "grad_norm": 0.4173881709575653, + "learning_rate": 0.0001377220915270539, + "loss": 1.5729, + "step": 23974 + }, + { + "epoch": 0.3115441903830629, + "grad_norm": 0.3168468773365021, + "learning_rate": 0.00013771949206514253, + "loss": 1.2685, + "step": 23975 + }, + { + "epoch": 0.31155718492697876, + "grad_norm": 0.4073374569416046, + "learning_rate": 0.00013771689260323112, + "loss": 1.3985, + "step": 23976 + }, + { + "epoch": 0.31157017947089466, + "grad_norm": 0.3657204210758209, + "learning_rate": 0.00013771429314131977, + "loss": 1.3847, + "step": 23977 + }, + { + "epoch": 0.3115831740148105, + "grad_norm": 0.32920265197753906, + "learning_rate": 0.00013771169367940837, + "loss": 1.3454, + "step": 23978 + }, + { + "epoch": 0.3115961685587264, + "grad_norm": 0.4257546365261078, + "learning_rate": 0.00013770909421749697, + "loss": 1.3956, + "step": 23979 + }, + { + "epoch": 0.31160916310264225, + "grad_norm": 0.38961026072502136, + "learning_rate": 0.0001377064947555856, + "loss": 1.4517, + "step": 23980 + }, + { + "epoch": 0.31162215764655815, + "grad_norm": 0.4626120328903198, + "learning_rate": 0.00013770389529367422, + "loss": 1.4823, + "step": 23981 + }, + { + "epoch": 0.31163515219047405, + "grad_norm": 0.4362609386444092, + "learning_rate": 0.00013770129583176284, + "loss": 1.376, + "step": 23982 + }, + { + "epoch": 0.3116481467343899, + "grad_norm": 0.47997599840164185, + "learning_rate": 0.00013769869636985144, + "loss": 1.403, + "step": 23983 + }, + { + "epoch": 0.3116611412783058, + "grad_norm": 0.33067432045936584, + "learning_rate": 0.00013769609690794006, + "loss": 1.2723, + "step": 23984 + }, + { + "epoch": 0.31167413582222164, + "grad_norm": 0.46800974011421204, + "learning_rate": 0.0001376934974460287, + "loss": 1.5817, + "step": 23985 + }, + { + "epoch": 0.31168713036613754, + "grad_norm": 0.4393022954463959, + "learning_rate": 0.00013769089798411729, + "loss": 1.2901, + "step": 23986 + }, + { + "epoch": 0.3117001249100534, + "grad_norm": 0.3599379360675812, + "learning_rate": 0.0001376882985222059, + "loss": 1.4042, + "step": 23987 + }, + { + "epoch": 0.3117131194539693, + "grad_norm": 0.4527144730091095, + "learning_rate": 0.0001376856990602945, + "loss": 1.4807, + "step": 23988 + }, + { + "epoch": 0.31172611399788513, + "grad_norm": 0.388300359249115, + "learning_rate": 0.00013768309959838316, + "loss": 1.4414, + "step": 23989 + }, + { + "epoch": 0.31173910854180104, + "grad_norm": 0.37205156683921814, + "learning_rate": 0.00013768050013647176, + "loss": 1.3013, + "step": 23990 + }, + { + "epoch": 0.3117521030857169, + "grad_norm": 0.3441292345523834, + "learning_rate": 0.00013767790067456035, + "loss": 1.5675, + "step": 23991 + }, + { + "epoch": 0.3117650976296328, + "grad_norm": 0.3798999488353729, + "learning_rate": 0.00013767530121264898, + "loss": 1.2948, + "step": 23992 + }, + { + "epoch": 0.3117780921735486, + "grad_norm": 0.4052700102329254, + "learning_rate": 0.0001376727017507376, + "loss": 1.2227, + "step": 23993 + }, + { + "epoch": 0.3117910867174645, + "grad_norm": 0.43474170565605164, + "learning_rate": 0.00013767010228882623, + "loss": 1.3954, + "step": 23994 + }, + { + "epoch": 0.3118040812613804, + "grad_norm": 0.3933478891849518, + "learning_rate": 0.00013766750282691483, + "loss": 1.4149, + "step": 23995 + }, + { + "epoch": 0.3118170758052963, + "grad_norm": 0.45425668358802795, + "learning_rate": 0.00013766490336500345, + "loss": 1.3104, + "step": 23996 + }, + { + "epoch": 0.3118300703492121, + "grad_norm": 0.40313950181007385, + "learning_rate": 0.00013766230390309207, + "loss": 1.4334, + "step": 23997 + }, + { + "epoch": 0.311843064893128, + "grad_norm": 0.33744296431541443, + "learning_rate": 0.00013765970444118067, + "loss": 1.375, + "step": 23998 + }, + { + "epoch": 0.31185605943704386, + "grad_norm": 0.43625134229660034, + "learning_rate": 0.0001376571049792693, + "loss": 1.5292, + "step": 23999 + }, + { + "epoch": 0.31186905398095977, + "grad_norm": 0.488025426864624, + "learning_rate": 0.00013765450551735792, + "loss": 1.49, + "step": 24000 + }, + { + "epoch": 0.3118820485248756, + "grad_norm": 0.3705483675003052, + "learning_rate": 0.00013765190605544655, + "loss": 1.4062, + "step": 24001 + }, + { + "epoch": 0.3118950430687915, + "grad_norm": 0.3765580952167511, + "learning_rate": 0.00013764930659353514, + "loss": 1.4171, + "step": 24002 + }, + { + "epoch": 0.31190803761270736, + "grad_norm": 0.3116188645362854, + "learning_rate": 0.00013764670713162374, + "loss": 1.4177, + "step": 24003 + }, + { + "epoch": 0.31192103215662326, + "grad_norm": 0.510449230670929, + "learning_rate": 0.0001376441076697124, + "loss": 1.4859, + "step": 24004 + }, + { + "epoch": 0.3119340267005391, + "grad_norm": 0.41841232776641846, + "learning_rate": 0.000137641508207801, + "loss": 1.5218, + "step": 24005 + }, + { + "epoch": 0.311947021244455, + "grad_norm": 0.36413973569869995, + "learning_rate": 0.0001376389087458896, + "loss": 1.4506, + "step": 24006 + }, + { + "epoch": 0.31196001578837085, + "grad_norm": 0.40747568011283875, + "learning_rate": 0.0001376363092839782, + "loss": 1.3849, + "step": 24007 + }, + { + "epoch": 0.31197301033228675, + "grad_norm": 0.40432819724082947, + "learning_rate": 0.00013763370982206684, + "loss": 1.4365, + "step": 24008 + }, + { + "epoch": 0.3119860048762026, + "grad_norm": 0.3930312693119049, + "learning_rate": 0.00013763111036015546, + "loss": 1.4956, + "step": 24009 + }, + { + "epoch": 0.3119989994201185, + "grad_norm": 0.2995540499687195, + "learning_rate": 0.00013762851089824406, + "loss": 1.462, + "step": 24010 + }, + { + "epoch": 0.31201199396403434, + "grad_norm": 0.449510782957077, + "learning_rate": 0.00013762591143633268, + "loss": 1.4356, + "step": 24011 + }, + { + "epoch": 0.31202498850795024, + "grad_norm": 0.3419528305530548, + "learning_rate": 0.0001376233119744213, + "loss": 1.3369, + "step": 24012 + }, + { + "epoch": 0.3120379830518661, + "grad_norm": 0.44447213411331177, + "learning_rate": 0.00013762071251250993, + "loss": 1.372, + "step": 24013 + }, + { + "epoch": 0.312050977595782, + "grad_norm": 0.43342724442481995, + "learning_rate": 0.00013761811305059853, + "loss": 1.5641, + "step": 24014 + }, + { + "epoch": 0.31206397213969783, + "grad_norm": 0.4521684944629669, + "learning_rate": 0.00013761551358868715, + "loss": 1.5426, + "step": 24015 + }, + { + "epoch": 0.31207696668361373, + "grad_norm": 0.5262646675109863, + "learning_rate": 0.00013761291412677578, + "loss": 1.303, + "step": 24016 + }, + { + "epoch": 0.3120899612275296, + "grad_norm": 0.36772117018699646, + "learning_rate": 0.00013761031466486437, + "loss": 1.2938, + "step": 24017 + }, + { + "epoch": 0.3121029557714455, + "grad_norm": 0.3977145850658417, + "learning_rate": 0.000137607715202953, + "loss": 1.3204, + "step": 24018 + }, + { + "epoch": 0.3121159503153613, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0001376051157410416, + "loss": 1.5278, + "step": 24019 + }, + { + "epoch": 0.3121289448592772, + "grad_norm": 0.3801257908344269, + "learning_rate": 0.00013760251627913022, + "loss": 1.3724, + "step": 24020 + }, + { + "epoch": 0.31214193940319307, + "grad_norm": 0.33764639496803284, + "learning_rate": 0.00013759991681721885, + "loss": 1.2946, + "step": 24021 + }, + { + "epoch": 0.31215493394710897, + "grad_norm": 0.3801344335079193, + "learning_rate": 0.00013759731735530744, + "loss": 1.2576, + "step": 24022 + }, + { + "epoch": 0.3121679284910248, + "grad_norm": 0.3762162923812866, + "learning_rate": 0.00013759471789339607, + "loss": 1.3091, + "step": 24023 + }, + { + "epoch": 0.3121809230349407, + "grad_norm": 0.49149996042251587, + "learning_rate": 0.0001375921184314847, + "loss": 1.45, + "step": 24024 + }, + { + "epoch": 0.31219391757885656, + "grad_norm": 0.4664984345436096, + "learning_rate": 0.00013758951896957332, + "loss": 1.2782, + "step": 24025 + }, + { + "epoch": 0.31220691212277246, + "grad_norm": 0.3481917679309845, + "learning_rate": 0.0001375869195076619, + "loss": 1.4094, + "step": 24026 + }, + { + "epoch": 0.3122199066666883, + "grad_norm": 0.39677417278289795, + "learning_rate": 0.00013758432004575054, + "loss": 1.5093, + "step": 24027 + }, + { + "epoch": 0.3122329012106042, + "grad_norm": 0.4046335816383362, + "learning_rate": 0.00013758172058383916, + "loss": 1.4802, + "step": 24028 + }, + { + "epoch": 0.31224589575452005, + "grad_norm": 0.4369329512119293, + "learning_rate": 0.00013757912112192776, + "loss": 1.3582, + "step": 24029 + }, + { + "epoch": 0.31225889029843595, + "grad_norm": 0.36028096079826355, + "learning_rate": 0.00013757652166001638, + "loss": 1.2879, + "step": 24030 + }, + { + "epoch": 0.3122718848423518, + "grad_norm": 0.5566724538803101, + "learning_rate": 0.000137573922198105, + "loss": 1.331, + "step": 24031 + }, + { + "epoch": 0.3122848793862677, + "grad_norm": 0.34386762976646423, + "learning_rate": 0.00013757132273619363, + "loss": 1.1165, + "step": 24032 + }, + { + "epoch": 0.31229787393018354, + "grad_norm": 0.4286525547504425, + "learning_rate": 0.00013756872327428223, + "loss": 1.4363, + "step": 24033 + }, + { + "epoch": 0.31231086847409945, + "grad_norm": 0.4375098645687103, + "learning_rate": 0.00013756612381237083, + "loss": 1.2707, + "step": 24034 + }, + { + "epoch": 0.3123238630180153, + "grad_norm": 0.3450906276702881, + "learning_rate": 0.00013756352435045948, + "loss": 1.3853, + "step": 24035 + }, + { + "epoch": 0.3123368575619312, + "grad_norm": 0.4314018487930298, + "learning_rate": 0.00013756092488854808, + "loss": 1.4795, + "step": 24036 + }, + { + "epoch": 0.31234985210584704, + "grad_norm": 0.415788859128952, + "learning_rate": 0.0001375583254266367, + "loss": 1.3929, + "step": 24037 + }, + { + "epoch": 0.31236284664976294, + "grad_norm": 0.4273725748062134, + "learning_rate": 0.0001375557259647253, + "loss": 1.4401, + "step": 24038 + }, + { + "epoch": 0.3123758411936788, + "grad_norm": 0.38512519001960754, + "learning_rate": 0.00013755312650281392, + "loss": 1.4073, + "step": 24039 + }, + { + "epoch": 0.3123888357375947, + "grad_norm": 0.34402963519096375, + "learning_rate": 0.00013755052704090255, + "loss": 1.5044, + "step": 24040 + }, + { + "epoch": 0.31240183028151053, + "grad_norm": 0.3577123284339905, + "learning_rate": 0.00013754792757899115, + "loss": 1.2278, + "step": 24041 + }, + { + "epoch": 0.31241482482542643, + "grad_norm": 0.40234658122062683, + "learning_rate": 0.00013754532811707977, + "loss": 1.3492, + "step": 24042 + }, + { + "epoch": 0.3124278193693423, + "grad_norm": 0.435512512922287, + "learning_rate": 0.0001375427286551684, + "loss": 1.5132, + "step": 24043 + }, + { + "epoch": 0.3124408139132582, + "grad_norm": 0.2758682370185852, + "learning_rate": 0.00013754012919325702, + "loss": 1.4252, + "step": 24044 + }, + { + "epoch": 0.312453808457174, + "grad_norm": 0.4181942641735077, + "learning_rate": 0.00013753752973134562, + "loss": 1.4579, + "step": 24045 + }, + { + "epoch": 0.3124668030010899, + "grad_norm": 0.4430083632469177, + "learning_rate": 0.0001375349302694342, + "loss": 1.2538, + "step": 24046 + }, + { + "epoch": 0.31247979754500577, + "grad_norm": 0.46867311000823975, + "learning_rate": 0.00013753233080752286, + "loss": 1.4697, + "step": 24047 + }, + { + "epoch": 0.31249279208892167, + "grad_norm": 0.38926905393600464, + "learning_rate": 0.00013752973134561146, + "loss": 1.6609, + "step": 24048 + }, + { + "epoch": 0.3125057866328375, + "grad_norm": 0.35909736156463623, + "learning_rate": 0.0001375271318837001, + "loss": 1.5465, + "step": 24049 + }, + { + "epoch": 0.3125187811767534, + "grad_norm": 0.32579079270362854, + "learning_rate": 0.00013752453242178868, + "loss": 1.1768, + "step": 24050 + }, + { + "epoch": 0.31253177572066926, + "grad_norm": 0.39720970392227173, + "learning_rate": 0.0001375219329598773, + "loss": 1.5772, + "step": 24051 + }, + { + "epoch": 0.31254477026458516, + "grad_norm": 0.40443727374076843, + "learning_rate": 0.00013751933349796593, + "loss": 1.409, + "step": 24052 + }, + { + "epoch": 0.312557764808501, + "grad_norm": 0.303507536649704, + "learning_rate": 0.00013751673403605453, + "loss": 1.2178, + "step": 24053 + }, + { + "epoch": 0.3125707593524169, + "grad_norm": 0.4055541455745697, + "learning_rate": 0.00013751413457414316, + "loss": 1.4201, + "step": 24054 + }, + { + "epoch": 0.31258375389633275, + "grad_norm": 0.3177415132522583, + "learning_rate": 0.00013751153511223178, + "loss": 1.5134, + "step": 24055 + }, + { + "epoch": 0.31259674844024865, + "grad_norm": 0.30238714814186096, + "learning_rate": 0.0001375089356503204, + "loss": 1.1993, + "step": 24056 + }, + { + "epoch": 0.31260974298416455, + "grad_norm": 0.44723403453826904, + "learning_rate": 0.000137506336188409, + "loss": 1.4595, + "step": 24057 + }, + { + "epoch": 0.3126227375280804, + "grad_norm": 0.31841811537742615, + "learning_rate": 0.0001375037367264976, + "loss": 1.2656, + "step": 24058 + }, + { + "epoch": 0.3126357320719963, + "grad_norm": 0.4292653203010559, + "learning_rate": 0.00013750113726458625, + "loss": 1.5004, + "step": 24059 + }, + { + "epoch": 0.31264872661591214, + "grad_norm": 0.41497403383255005, + "learning_rate": 0.00013749853780267485, + "loss": 1.4755, + "step": 24060 + }, + { + "epoch": 0.31266172115982804, + "grad_norm": 0.41533756256103516, + "learning_rate": 0.00013749593834076347, + "loss": 1.4158, + "step": 24061 + }, + { + "epoch": 0.3126747157037439, + "grad_norm": 0.36261487007141113, + "learning_rate": 0.00013749333887885207, + "loss": 1.5097, + "step": 24062 + }, + { + "epoch": 0.3126877102476598, + "grad_norm": 0.4451095759868622, + "learning_rate": 0.0001374907394169407, + "loss": 1.384, + "step": 24063 + }, + { + "epoch": 0.31270070479157563, + "grad_norm": 0.49637743830680847, + "learning_rate": 0.00013748813995502932, + "loss": 1.3524, + "step": 24064 + }, + { + "epoch": 0.31271369933549154, + "grad_norm": 0.38129115104675293, + "learning_rate": 0.00013748554049311792, + "loss": 1.3337, + "step": 24065 + }, + { + "epoch": 0.3127266938794074, + "grad_norm": 0.4188744127750397, + "learning_rate": 0.00013748294103120654, + "loss": 1.2504, + "step": 24066 + }, + { + "epoch": 0.3127396884233233, + "grad_norm": 0.37698012590408325, + "learning_rate": 0.00013748034156929516, + "loss": 1.3171, + "step": 24067 + }, + { + "epoch": 0.3127526829672391, + "grad_norm": 0.4148073196411133, + "learning_rate": 0.0001374777421073838, + "loss": 1.374, + "step": 24068 + }, + { + "epoch": 0.312765677511155, + "grad_norm": 0.38230788707733154, + "learning_rate": 0.0001374751426454724, + "loss": 1.3433, + "step": 24069 + }, + { + "epoch": 0.31277867205507087, + "grad_norm": 0.4472162425518036, + "learning_rate": 0.000137472543183561, + "loss": 1.2701, + "step": 24070 + }, + { + "epoch": 0.3127916665989868, + "grad_norm": 0.41548895835876465, + "learning_rate": 0.00013746994372164964, + "loss": 1.4901, + "step": 24071 + }, + { + "epoch": 0.3128046611429026, + "grad_norm": 0.3675248920917511, + "learning_rate": 0.00013746734425973823, + "loss": 1.3729, + "step": 24072 + }, + { + "epoch": 0.3128176556868185, + "grad_norm": 0.41979625821113586, + "learning_rate": 0.00013746474479782686, + "loss": 1.3662, + "step": 24073 + }, + { + "epoch": 0.31283065023073436, + "grad_norm": 0.41736137866973877, + "learning_rate": 0.00013746214533591548, + "loss": 1.6323, + "step": 24074 + }, + { + "epoch": 0.31284364477465026, + "grad_norm": 0.41358327865600586, + "learning_rate": 0.00013745954587400408, + "loss": 1.416, + "step": 24075 + }, + { + "epoch": 0.3128566393185661, + "grad_norm": 0.3807070255279541, + "learning_rate": 0.0001374569464120927, + "loss": 1.5321, + "step": 24076 + }, + { + "epoch": 0.312869633862482, + "grad_norm": 0.41228362917900085, + "learning_rate": 0.0001374543469501813, + "loss": 1.7842, + "step": 24077 + }, + { + "epoch": 0.31288262840639786, + "grad_norm": 0.36300885677337646, + "learning_rate": 0.00013745174748826995, + "loss": 1.4059, + "step": 24078 + }, + { + "epoch": 0.31289562295031376, + "grad_norm": 0.4169485569000244, + "learning_rate": 0.00013744914802635855, + "loss": 1.3575, + "step": 24079 + }, + { + "epoch": 0.3129086174942296, + "grad_norm": 0.45995306968688965, + "learning_rate": 0.00013744654856444717, + "loss": 1.4774, + "step": 24080 + }, + { + "epoch": 0.3129216120381455, + "grad_norm": 0.4167274832725525, + "learning_rate": 0.00013744394910253577, + "loss": 1.4083, + "step": 24081 + }, + { + "epoch": 0.31293460658206135, + "grad_norm": 0.33171501755714417, + "learning_rate": 0.0001374413496406244, + "loss": 1.5144, + "step": 24082 + }, + { + "epoch": 0.31294760112597725, + "grad_norm": 0.38084664940834045, + "learning_rate": 0.00013743875017871302, + "loss": 1.4078, + "step": 24083 + }, + { + "epoch": 0.3129605956698931, + "grad_norm": 0.36482226848602295, + "learning_rate": 0.00013743615071680162, + "loss": 1.4199, + "step": 24084 + }, + { + "epoch": 0.312973590213809, + "grad_norm": 0.35472533106803894, + "learning_rate": 0.00013743355125489024, + "loss": 1.5307, + "step": 24085 + }, + { + "epoch": 0.31298658475772484, + "grad_norm": 0.4131404459476471, + "learning_rate": 0.00013743095179297887, + "loss": 1.3654, + "step": 24086 + }, + { + "epoch": 0.31299957930164074, + "grad_norm": 0.4545859694480896, + "learning_rate": 0.00013742835233106746, + "loss": 1.4369, + "step": 24087 + }, + { + "epoch": 0.3130125738455566, + "grad_norm": 0.4544179141521454, + "learning_rate": 0.0001374257528691561, + "loss": 1.4936, + "step": 24088 + }, + { + "epoch": 0.3130255683894725, + "grad_norm": 0.38573402166366577, + "learning_rate": 0.0001374231534072447, + "loss": 1.52, + "step": 24089 + }, + { + "epoch": 0.31303856293338833, + "grad_norm": 0.3297925293445587, + "learning_rate": 0.00013742055394533334, + "loss": 1.3784, + "step": 24090 + }, + { + "epoch": 0.31305155747730423, + "grad_norm": 0.38560348749160767, + "learning_rate": 0.00013741795448342194, + "loss": 1.4111, + "step": 24091 + }, + { + "epoch": 0.3130645520212201, + "grad_norm": 0.4217059314250946, + "learning_rate": 0.00013741535502151056, + "loss": 1.5637, + "step": 24092 + }, + { + "epoch": 0.313077546565136, + "grad_norm": 0.4886864125728607, + "learning_rate": 0.00013741275555959916, + "loss": 1.5556, + "step": 24093 + }, + { + "epoch": 0.3130905411090518, + "grad_norm": 0.36298999190330505, + "learning_rate": 0.00013741015609768778, + "loss": 1.2336, + "step": 24094 + }, + { + "epoch": 0.3131035356529677, + "grad_norm": 0.28728771209716797, + "learning_rate": 0.0001374075566357764, + "loss": 1.534, + "step": 24095 + }, + { + "epoch": 0.31311653019688357, + "grad_norm": 0.4165332317352295, + "learning_rate": 0.000137404957173865, + "loss": 1.4454, + "step": 24096 + }, + { + "epoch": 0.31312952474079947, + "grad_norm": 0.33897554874420166, + "learning_rate": 0.00013740235771195363, + "loss": 1.3528, + "step": 24097 + }, + { + "epoch": 0.3131425192847153, + "grad_norm": 0.4142508804798126, + "learning_rate": 0.00013739975825004225, + "loss": 1.3996, + "step": 24098 + }, + { + "epoch": 0.3131555138286312, + "grad_norm": 0.47479376196861267, + "learning_rate": 0.00013739715878813088, + "loss": 1.2737, + "step": 24099 + }, + { + "epoch": 0.31316850837254706, + "grad_norm": 0.42113324999809265, + "learning_rate": 0.00013739455932621947, + "loss": 1.4923, + "step": 24100 + }, + { + "epoch": 0.31318150291646296, + "grad_norm": 0.40646979212760925, + "learning_rate": 0.00013739195986430807, + "loss": 1.3435, + "step": 24101 + }, + { + "epoch": 0.3131944974603788, + "grad_norm": 0.4228103756904602, + "learning_rate": 0.00013738936040239672, + "loss": 1.3386, + "step": 24102 + }, + { + "epoch": 0.3132074920042947, + "grad_norm": 0.487327516078949, + "learning_rate": 0.00013738676094048532, + "loss": 1.3879, + "step": 24103 + }, + { + "epoch": 0.31322048654821055, + "grad_norm": 0.33931151032447815, + "learning_rate": 0.00013738416147857395, + "loss": 1.4628, + "step": 24104 + }, + { + "epoch": 0.31323348109212645, + "grad_norm": 0.4432222843170166, + "learning_rate": 0.00013738156201666257, + "loss": 1.5147, + "step": 24105 + }, + { + "epoch": 0.3132464756360423, + "grad_norm": 0.41895508766174316, + "learning_rate": 0.00013737896255475117, + "loss": 1.5267, + "step": 24106 + }, + { + "epoch": 0.3132594701799582, + "grad_norm": 0.3876771032810211, + "learning_rate": 0.0001373763630928398, + "loss": 1.4458, + "step": 24107 + }, + { + "epoch": 0.31327246472387404, + "grad_norm": 0.33563360571861267, + "learning_rate": 0.0001373737636309284, + "loss": 1.3171, + "step": 24108 + }, + { + "epoch": 0.31328545926778995, + "grad_norm": 0.5040119886398315, + "learning_rate": 0.00013737116416901704, + "loss": 1.468, + "step": 24109 + }, + { + "epoch": 0.3132984538117058, + "grad_norm": 0.38198140263557434, + "learning_rate": 0.00013736856470710564, + "loss": 1.2265, + "step": 24110 + }, + { + "epoch": 0.3133114483556217, + "grad_norm": 0.4412464499473572, + "learning_rate": 0.00013736596524519426, + "loss": 1.4326, + "step": 24111 + }, + { + "epoch": 0.31332444289953754, + "grad_norm": 0.3524186909198761, + "learning_rate": 0.00013736336578328286, + "loss": 1.3943, + "step": 24112 + }, + { + "epoch": 0.31333743744345344, + "grad_norm": 0.36579394340515137, + "learning_rate": 0.00013736076632137148, + "loss": 1.27, + "step": 24113 + }, + { + "epoch": 0.3133504319873693, + "grad_norm": 0.46102070808410645, + "learning_rate": 0.0001373581668594601, + "loss": 1.3613, + "step": 24114 + }, + { + "epoch": 0.3133634265312852, + "grad_norm": 0.3797166645526886, + "learning_rate": 0.0001373555673975487, + "loss": 1.4383, + "step": 24115 + }, + { + "epoch": 0.31337642107520103, + "grad_norm": 0.3789106011390686, + "learning_rate": 0.00013735296793563733, + "loss": 1.2324, + "step": 24116 + }, + { + "epoch": 0.31338941561911693, + "grad_norm": 0.3841444253921509, + "learning_rate": 0.00013735036847372596, + "loss": 1.4528, + "step": 24117 + }, + { + "epoch": 0.3134024101630328, + "grad_norm": 0.38438114523887634, + "learning_rate": 0.00013734776901181455, + "loss": 1.316, + "step": 24118 + }, + { + "epoch": 0.3134154047069487, + "grad_norm": 0.40486881136894226, + "learning_rate": 0.00013734516954990318, + "loss": 1.4839, + "step": 24119 + }, + { + "epoch": 0.3134283992508645, + "grad_norm": 0.3532828986644745, + "learning_rate": 0.00013734257008799177, + "loss": 1.5072, + "step": 24120 + }, + { + "epoch": 0.3134413937947804, + "grad_norm": 0.4030096232891083, + "learning_rate": 0.00013733997062608043, + "loss": 1.4009, + "step": 24121 + }, + { + "epoch": 0.31345438833869627, + "grad_norm": 0.4326718747615814, + "learning_rate": 0.00013733737116416902, + "loss": 1.3561, + "step": 24122 + }, + { + "epoch": 0.31346738288261217, + "grad_norm": 0.42302659153938293, + "learning_rate": 0.00013733477170225765, + "loss": 1.3948, + "step": 24123 + }, + { + "epoch": 0.313480377426528, + "grad_norm": 0.3909808397293091, + "learning_rate": 0.00013733217224034625, + "loss": 1.1477, + "step": 24124 + }, + { + "epoch": 0.3134933719704439, + "grad_norm": 0.3916614353656769, + "learning_rate": 0.00013732957277843487, + "loss": 1.3368, + "step": 24125 + }, + { + "epoch": 0.31350636651435976, + "grad_norm": 0.47921115159988403, + "learning_rate": 0.0001373269733165235, + "loss": 1.6167, + "step": 24126 + }, + { + "epoch": 0.31351936105827566, + "grad_norm": 0.4598756432533264, + "learning_rate": 0.0001373243738546121, + "loss": 1.4836, + "step": 24127 + }, + { + "epoch": 0.3135323556021915, + "grad_norm": 0.29184019565582275, + "learning_rate": 0.00013732177439270072, + "loss": 1.4215, + "step": 24128 + }, + { + "epoch": 0.3135453501461074, + "grad_norm": 0.3766686022281647, + "learning_rate": 0.00013731917493078934, + "loss": 1.4199, + "step": 24129 + }, + { + "epoch": 0.31355834469002325, + "grad_norm": 0.4528316259384155, + "learning_rate": 0.00013731657546887794, + "loss": 1.3668, + "step": 24130 + }, + { + "epoch": 0.31357133923393915, + "grad_norm": 0.3740106225013733, + "learning_rate": 0.00013731397600696656, + "loss": 1.2701, + "step": 24131 + }, + { + "epoch": 0.313584333777855, + "grad_norm": 0.4456227123737335, + "learning_rate": 0.00013731137654505516, + "loss": 1.2809, + "step": 24132 + }, + { + "epoch": 0.3135973283217709, + "grad_norm": 0.3641541302204132, + "learning_rate": 0.0001373087770831438, + "loss": 1.5589, + "step": 24133 + }, + { + "epoch": 0.3136103228656868, + "grad_norm": 0.4384995102882385, + "learning_rate": 0.0001373061776212324, + "loss": 1.5314, + "step": 24134 + }, + { + "epoch": 0.31362331740960264, + "grad_norm": 0.3754170536994934, + "learning_rate": 0.00013730357815932103, + "loss": 1.4484, + "step": 24135 + }, + { + "epoch": 0.31363631195351854, + "grad_norm": 0.4135434329509735, + "learning_rate": 0.00013730097869740963, + "loss": 1.3965, + "step": 24136 + }, + { + "epoch": 0.3136493064974344, + "grad_norm": 0.42548951506614685, + "learning_rate": 0.00013729837923549826, + "loss": 1.4212, + "step": 24137 + }, + { + "epoch": 0.3136623010413503, + "grad_norm": 0.3537392020225525, + "learning_rate": 0.00013729577977358688, + "loss": 1.2549, + "step": 24138 + }, + { + "epoch": 0.31367529558526613, + "grad_norm": 0.4996125102043152, + "learning_rate": 0.00013729318031167548, + "loss": 1.3863, + "step": 24139 + }, + { + "epoch": 0.31368829012918203, + "grad_norm": 0.34995129704475403, + "learning_rate": 0.0001372905808497641, + "loss": 1.265, + "step": 24140 + }, + { + "epoch": 0.3137012846730979, + "grad_norm": 0.31927287578582764, + "learning_rate": 0.00013728798138785273, + "loss": 1.4223, + "step": 24141 + }, + { + "epoch": 0.3137142792170138, + "grad_norm": 0.3679982125759125, + "learning_rate": 0.00013728538192594132, + "loss": 1.4303, + "step": 24142 + }, + { + "epoch": 0.3137272737609296, + "grad_norm": 0.3511558473110199, + "learning_rate": 0.00013728278246402995, + "loss": 1.4301, + "step": 24143 + }, + { + "epoch": 0.3137402683048455, + "grad_norm": 0.2987215518951416, + "learning_rate": 0.00013728018300211857, + "loss": 1.4566, + "step": 24144 + }, + { + "epoch": 0.31375326284876137, + "grad_norm": 0.42606785893440247, + "learning_rate": 0.0001372775835402072, + "loss": 1.5058, + "step": 24145 + }, + { + "epoch": 0.3137662573926773, + "grad_norm": 0.3913020193576813, + "learning_rate": 0.0001372749840782958, + "loss": 1.5466, + "step": 24146 + }, + { + "epoch": 0.3137792519365931, + "grad_norm": 0.38378703594207764, + "learning_rate": 0.00013727238461638442, + "loss": 1.3251, + "step": 24147 + }, + { + "epoch": 0.313792246480509, + "grad_norm": 0.45694097876548767, + "learning_rate": 0.00013726978515447304, + "loss": 1.4177, + "step": 24148 + }, + { + "epoch": 0.31380524102442486, + "grad_norm": 0.3502528667449951, + "learning_rate": 0.00013726718569256164, + "loss": 1.3085, + "step": 24149 + }, + { + "epoch": 0.31381823556834076, + "grad_norm": 0.41781380772590637, + "learning_rate": 0.00013726458623065027, + "loss": 1.4455, + "step": 24150 + }, + { + "epoch": 0.3138312301122566, + "grad_norm": 0.2427089363336563, + "learning_rate": 0.00013726198676873886, + "loss": 1.1942, + "step": 24151 + }, + { + "epoch": 0.3138442246561725, + "grad_norm": 0.3682045638561249, + "learning_rate": 0.00013725938730682751, + "loss": 1.4915, + "step": 24152 + }, + { + "epoch": 0.31385721920008836, + "grad_norm": 0.3806772232055664, + "learning_rate": 0.0001372567878449161, + "loss": 1.4897, + "step": 24153 + }, + { + "epoch": 0.31387021374400426, + "grad_norm": 0.4181349277496338, + "learning_rate": 0.00013725418838300474, + "loss": 1.51, + "step": 24154 + }, + { + "epoch": 0.3138832082879201, + "grad_norm": 0.40033575892448425, + "learning_rate": 0.00013725158892109333, + "loss": 1.5686, + "step": 24155 + }, + { + "epoch": 0.313896202831836, + "grad_norm": 0.382345587015152, + "learning_rate": 0.00013724898945918196, + "loss": 1.3563, + "step": 24156 + }, + { + "epoch": 0.31390919737575185, + "grad_norm": 0.36429542303085327, + "learning_rate": 0.00013724638999727058, + "loss": 1.5089, + "step": 24157 + }, + { + "epoch": 0.31392219191966775, + "grad_norm": 0.35979387164115906, + "learning_rate": 0.00013724379053535918, + "loss": 1.3182, + "step": 24158 + }, + { + "epoch": 0.3139351864635836, + "grad_norm": 0.39158037304878235, + "learning_rate": 0.0001372411910734478, + "loss": 1.4904, + "step": 24159 + }, + { + "epoch": 0.3139481810074995, + "grad_norm": 0.421166330575943, + "learning_rate": 0.00013723859161153643, + "loss": 1.5244, + "step": 24160 + }, + { + "epoch": 0.31396117555141534, + "grad_norm": 0.35423725843429565, + "learning_rate": 0.00013723599214962503, + "loss": 1.4312, + "step": 24161 + }, + { + "epoch": 0.31397417009533124, + "grad_norm": 0.404064416885376, + "learning_rate": 0.00013723339268771365, + "loss": 1.4624, + "step": 24162 + }, + { + "epoch": 0.3139871646392471, + "grad_norm": 0.5315631031990051, + "learning_rate": 0.00013723079322580225, + "loss": 1.5876, + "step": 24163 + }, + { + "epoch": 0.314000159183163, + "grad_norm": 0.37470829486846924, + "learning_rate": 0.0001372281937638909, + "loss": 1.3917, + "step": 24164 + }, + { + "epoch": 0.31401315372707883, + "grad_norm": 0.3997611403465271, + "learning_rate": 0.0001372255943019795, + "loss": 1.3947, + "step": 24165 + }, + { + "epoch": 0.31402614827099473, + "grad_norm": 0.4182421863079071, + "learning_rate": 0.00013722299484006812, + "loss": 1.4538, + "step": 24166 + }, + { + "epoch": 0.3140391428149106, + "grad_norm": 0.37487128376960754, + "learning_rate": 0.00013722039537815672, + "loss": 1.3848, + "step": 24167 + }, + { + "epoch": 0.3140521373588265, + "grad_norm": 0.39135023951530457, + "learning_rate": 0.00013721779591624534, + "loss": 1.332, + "step": 24168 + }, + { + "epoch": 0.3140651319027423, + "grad_norm": 0.3737618029117584, + "learning_rate": 0.00013721519645433397, + "loss": 1.2217, + "step": 24169 + }, + { + "epoch": 0.3140781264466582, + "grad_norm": 0.30085426568984985, + "learning_rate": 0.00013721259699242257, + "loss": 1.5156, + "step": 24170 + }, + { + "epoch": 0.31409112099057407, + "grad_norm": 0.349826842546463, + "learning_rate": 0.0001372099975305112, + "loss": 1.5919, + "step": 24171 + }, + { + "epoch": 0.31410411553448997, + "grad_norm": 0.4013342559337616, + "learning_rate": 0.00013720739806859981, + "loss": 1.5131, + "step": 24172 + }, + { + "epoch": 0.3141171100784058, + "grad_norm": 0.45974043011665344, + "learning_rate": 0.0001372047986066884, + "loss": 1.3831, + "step": 24173 + }, + { + "epoch": 0.3141301046223217, + "grad_norm": 0.4515654742717743, + "learning_rate": 0.00013720219914477704, + "loss": 1.4738, + "step": 24174 + }, + { + "epoch": 0.31414309916623756, + "grad_norm": 0.3971514403820038, + "learning_rate": 0.00013719959968286563, + "loss": 1.331, + "step": 24175 + }, + { + "epoch": 0.31415609371015346, + "grad_norm": 0.45709437131881714, + "learning_rate": 0.00013719700022095428, + "loss": 1.4114, + "step": 24176 + }, + { + "epoch": 0.3141690882540693, + "grad_norm": 0.36304983496665955, + "learning_rate": 0.00013719440075904288, + "loss": 1.29, + "step": 24177 + }, + { + "epoch": 0.3141820827979852, + "grad_norm": 0.3682185411453247, + "learning_rate": 0.0001371918012971315, + "loss": 1.3481, + "step": 24178 + }, + { + "epoch": 0.31419507734190105, + "grad_norm": 0.47754621505737305, + "learning_rate": 0.00013718920183522013, + "loss": 1.2722, + "step": 24179 + }, + { + "epoch": 0.31420807188581695, + "grad_norm": 0.4243612587451935, + "learning_rate": 0.00013718660237330873, + "loss": 1.3513, + "step": 24180 + }, + { + "epoch": 0.3142210664297328, + "grad_norm": 0.381763219833374, + "learning_rate": 0.00013718400291139735, + "loss": 1.4275, + "step": 24181 + }, + { + "epoch": 0.3142340609736487, + "grad_norm": 0.42432597279548645, + "learning_rate": 0.00013718140344948595, + "loss": 1.4977, + "step": 24182 + }, + { + "epoch": 0.31424705551756454, + "grad_norm": 0.4389258921146393, + "learning_rate": 0.0001371788039875746, + "loss": 1.5341, + "step": 24183 + }, + { + "epoch": 0.31426005006148044, + "grad_norm": 0.33113405108451843, + "learning_rate": 0.0001371762045256632, + "loss": 1.4842, + "step": 24184 + }, + { + "epoch": 0.3142730446053963, + "grad_norm": 0.34596988558769226, + "learning_rate": 0.0001371736050637518, + "loss": 1.3565, + "step": 24185 + }, + { + "epoch": 0.3142860391493122, + "grad_norm": 0.39609968662261963, + "learning_rate": 0.00013717100560184042, + "loss": 1.4609, + "step": 24186 + }, + { + "epoch": 0.31429903369322804, + "grad_norm": 0.3026363253593445, + "learning_rate": 0.00013716840613992905, + "loss": 1.1944, + "step": 24187 + }, + { + "epoch": 0.31431202823714394, + "grad_norm": 0.41935089230537415, + "learning_rate": 0.00013716580667801767, + "loss": 1.4014, + "step": 24188 + }, + { + "epoch": 0.3143250227810598, + "grad_norm": 0.40155860781669617, + "learning_rate": 0.00013716320721610627, + "loss": 1.4568, + "step": 24189 + }, + { + "epoch": 0.3143380173249757, + "grad_norm": 0.4086877703666687, + "learning_rate": 0.0001371606077541949, + "loss": 1.4024, + "step": 24190 + }, + { + "epoch": 0.31435101186889153, + "grad_norm": 0.4069059193134308, + "learning_rate": 0.00013715800829228352, + "loss": 1.4331, + "step": 24191 + }, + { + "epoch": 0.31436400641280743, + "grad_norm": 0.37308403849601746, + "learning_rate": 0.00013715540883037211, + "loss": 1.4, + "step": 24192 + }, + { + "epoch": 0.3143770009567233, + "grad_norm": 0.38528764247894287, + "learning_rate": 0.00013715280936846074, + "loss": 1.2503, + "step": 24193 + }, + { + "epoch": 0.3143899955006392, + "grad_norm": 0.4985155165195465, + "learning_rate": 0.00013715020990654934, + "loss": 1.4655, + "step": 24194 + }, + { + "epoch": 0.314402990044555, + "grad_norm": 0.34910356998443604, + "learning_rate": 0.000137147610444638, + "loss": 1.2278, + "step": 24195 + }, + { + "epoch": 0.3144159845884709, + "grad_norm": 0.3683018982410431, + "learning_rate": 0.00013714501098272658, + "loss": 1.4835, + "step": 24196 + }, + { + "epoch": 0.31442897913238677, + "grad_norm": 0.43597593903541565, + "learning_rate": 0.00013714241152081518, + "loss": 1.3774, + "step": 24197 + }, + { + "epoch": 0.31444197367630267, + "grad_norm": 0.3188539743423462, + "learning_rate": 0.0001371398120589038, + "loss": 1.4925, + "step": 24198 + }, + { + "epoch": 0.3144549682202185, + "grad_norm": 0.4241398274898529, + "learning_rate": 0.00013713721259699243, + "loss": 1.246, + "step": 24199 + }, + { + "epoch": 0.3144679627641344, + "grad_norm": 0.5855793356895447, + "learning_rate": 0.00013713461313508106, + "loss": 1.5874, + "step": 24200 + }, + { + "epoch": 0.31448095730805026, + "grad_norm": 0.2817264795303345, + "learning_rate": 0.00013713201367316965, + "loss": 1.2687, + "step": 24201 + }, + { + "epoch": 0.31449395185196616, + "grad_norm": 0.41730374097824097, + "learning_rate": 0.00013712941421125828, + "loss": 1.562, + "step": 24202 + }, + { + "epoch": 0.314506946395882, + "grad_norm": 0.4404489994049072, + "learning_rate": 0.0001371268147493469, + "loss": 1.527, + "step": 24203 + }, + { + "epoch": 0.3145199409397979, + "grad_norm": 0.35299018025398254, + "learning_rate": 0.0001371242152874355, + "loss": 1.4261, + "step": 24204 + }, + { + "epoch": 0.31453293548371375, + "grad_norm": 0.37917932868003845, + "learning_rate": 0.00013712161582552412, + "loss": 1.4166, + "step": 24205 + }, + { + "epoch": 0.31454593002762965, + "grad_norm": 0.42596685886383057, + "learning_rate": 0.00013711901636361272, + "loss": 1.4279, + "step": 24206 + }, + { + "epoch": 0.3145589245715455, + "grad_norm": 0.3967914879322052, + "learning_rate": 0.00013711641690170137, + "loss": 1.3899, + "step": 24207 + }, + { + "epoch": 0.3145719191154614, + "grad_norm": 0.4533579349517822, + "learning_rate": 0.00013711381743978997, + "loss": 1.5057, + "step": 24208 + }, + { + "epoch": 0.3145849136593773, + "grad_norm": 0.36241182684898376, + "learning_rate": 0.00013711121797787857, + "loss": 1.1778, + "step": 24209 + }, + { + "epoch": 0.31459790820329314, + "grad_norm": 0.3608736991882324, + "learning_rate": 0.0001371086185159672, + "loss": 1.3887, + "step": 24210 + }, + { + "epoch": 0.31461090274720904, + "grad_norm": 0.29960396885871887, + "learning_rate": 0.00013710601905405582, + "loss": 1.1806, + "step": 24211 + }, + { + "epoch": 0.3146238972911249, + "grad_norm": 0.438826322555542, + "learning_rate": 0.00013710341959214444, + "loss": 1.4569, + "step": 24212 + }, + { + "epoch": 0.3146368918350408, + "grad_norm": 0.35067325830459595, + "learning_rate": 0.00013710082013023304, + "loss": 1.4293, + "step": 24213 + }, + { + "epoch": 0.31464988637895663, + "grad_norm": 0.495954304933548, + "learning_rate": 0.00013709822066832166, + "loss": 1.6212, + "step": 24214 + }, + { + "epoch": 0.31466288092287253, + "grad_norm": 0.3596116900444031, + "learning_rate": 0.0001370956212064103, + "loss": 1.4736, + "step": 24215 + }, + { + "epoch": 0.3146758754667884, + "grad_norm": 0.3464513421058655, + "learning_rate": 0.00013709302174449888, + "loss": 1.5765, + "step": 24216 + }, + { + "epoch": 0.3146888700107043, + "grad_norm": 0.36833566427230835, + "learning_rate": 0.0001370904222825875, + "loss": 1.5671, + "step": 24217 + }, + { + "epoch": 0.3147018645546201, + "grad_norm": 0.497585654258728, + "learning_rate": 0.00013708782282067613, + "loss": 1.3884, + "step": 24218 + }, + { + "epoch": 0.314714859098536, + "grad_norm": 0.4037121832370758, + "learning_rate": 0.00013708522335876476, + "loss": 1.5154, + "step": 24219 + }, + { + "epoch": 0.31472785364245187, + "grad_norm": 0.5084320902824402, + "learning_rate": 0.00013708262389685336, + "loss": 1.5147, + "step": 24220 + }, + { + "epoch": 0.31474084818636777, + "grad_norm": 0.35329383611679077, + "learning_rate": 0.00013708002443494198, + "loss": 1.3351, + "step": 24221 + }, + { + "epoch": 0.3147538427302836, + "grad_norm": 0.4006134867668152, + "learning_rate": 0.0001370774249730306, + "loss": 1.309, + "step": 24222 + }, + { + "epoch": 0.3147668372741995, + "grad_norm": 0.45675128698349, + "learning_rate": 0.0001370748255111192, + "loss": 1.445, + "step": 24223 + }, + { + "epoch": 0.31477983181811536, + "grad_norm": 0.4189942181110382, + "learning_rate": 0.00013707222604920783, + "loss": 1.4819, + "step": 24224 + }, + { + "epoch": 0.31479282636203126, + "grad_norm": 0.444923460483551, + "learning_rate": 0.00013706962658729642, + "loss": 1.4941, + "step": 24225 + }, + { + "epoch": 0.3148058209059471, + "grad_norm": 0.398495078086853, + "learning_rate": 0.00013706702712538505, + "loss": 1.3008, + "step": 24226 + }, + { + "epoch": 0.314818815449863, + "grad_norm": 0.34429240226745605, + "learning_rate": 0.00013706442766347367, + "loss": 1.3207, + "step": 24227 + }, + { + "epoch": 0.31483180999377886, + "grad_norm": 0.4360469579696655, + "learning_rate": 0.00013706182820156227, + "loss": 1.4586, + "step": 24228 + }, + { + "epoch": 0.31484480453769476, + "grad_norm": 0.3913939297199249, + "learning_rate": 0.0001370592287396509, + "loss": 1.3652, + "step": 24229 + }, + { + "epoch": 0.3148577990816106, + "grad_norm": 0.4482981264591217, + "learning_rate": 0.00013705662927773952, + "loss": 1.5234, + "step": 24230 + }, + { + "epoch": 0.3148707936255265, + "grad_norm": 0.3716791868209839, + "learning_rate": 0.00013705402981582814, + "loss": 1.4302, + "step": 24231 + }, + { + "epoch": 0.31488378816944235, + "grad_norm": 0.43635624647140503, + "learning_rate": 0.00013705143035391674, + "loss": 1.3219, + "step": 24232 + }, + { + "epoch": 0.31489678271335825, + "grad_norm": 0.3972419202327728, + "learning_rate": 0.00013704883089200537, + "loss": 1.4784, + "step": 24233 + }, + { + "epoch": 0.3149097772572741, + "grad_norm": 0.45062461495399475, + "learning_rate": 0.000137046231430094, + "loss": 1.4234, + "step": 24234 + }, + { + "epoch": 0.31492277180119, + "grad_norm": 0.45785340666770935, + "learning_rate": 0.0001370436319681826, + "loss": 1.3391, + "step": 24235 + }, + { + "epoch": 0.31493576634510584, + "grad_norm": 0.42377781867980957, + "learning_rate": 0.0001370410325062712, + "loss": 1.4262, + "step": 24236 + }, + { + "epoch": 0.31494876088902174, + "grad_norm": 0.5246242880821228, + "learning_rate": 0.0001370384330443598, + "loss": 1.5391, + "step": 24237 + }, + { + "epoch": 0.3149617554329376, + "grad_norm": 0.4232649505138397, + "learning_rate": 0.00013703583358244846, + "loss": 1.41, + "step": 24238 + }, + { + "epoch": 0.3149747499768535, + "grad_norm": 0.3410038352012634, + "learning_rate": 0.00013703323412053706, + "loss": 1.0745, + "step": 24239 + }, + { + "epoch": 0.31498774452076933, + "grad_norm": 0.4006120264530182, + "learning_rate": 0.00013703063465862566, + "loss": 1.4459, + "step": 24240 + }, + { + "epoch": 0.31500073906468523, + "grad_norm": 0.45589473843574524, + "learning_rate": 0.00013702803519671428, + "loss": 1.3432, + "step": 24241 + }, + { + "epoch": 0.3150137336086011, + "grad_norm": 0.34500938653945923, + "learning_rate": 0.0001370254357348029, + "loss": 1.3072, + "step": 24242 + }, + { + "epoch": 0.315026728152517, + "grad_norm": 0.3783598840236664, + "learning_rate": 0.00013702283627289153, + "loss": 1.2806, + "step": 24243 + }, + { + "epoch": 0.3150397226964328, + "grad_norm": 0.4570356011390686, + "learning_rate": 0.00013702023681098013, + "loss": 1.5026, + "step": 24244 + }, + { + "epoch": 0.3150527172403487, + "grad_norm": 0.42782655358314514, + "learning_rate": 0.00013701763734906875, + "loss": 1.4194, + "step": 24245 + }, + { + "epoch": 0.31506571178426457, + "grad_norm": 0.3588072657585144, + "learning_rate": 0.00013701503788715738, + "loss": 1.4905, + "step": 24246 + }, + { + "epoch": 0.31507870632818047, + "grad_norm": 0.37493863701820374, + "learning_rate": 0.00013701243842524597, + "loss": 1.3526, + "step": 24247 + }, + { + "epoch": 0.3150917008720963, + "grad_norm": 0.3687364161014557, + "learning_rate": 0.0001370098389633346, + "loss": 1.1768, + "step": 24248 + }, + { + "epoch": 0.3151046954160122, + "grad_norm": 0.3847581148147583, + "learning_rate": 0.0001370072395014232, + "loss": 1.3121, + "step": 24249 + }, + { + "epoch": 0.31511768995992806, + "grad_norm": 0.4969902038574219, + "learning_rate": 0.00013700464003951185, + "loss": 1.5232, + "step": 24250 + }, + { + "epoch": 0.31513068450384396, + "grad_norm": 0.3823295533657074, + "learning_rate": 0.00013700204057760044, + "loss": 1.3681, + "step": 24251 + }, + { + "epoch": 0.3151436790477598, + "grad_norm": 0.5384612679481506, + "learning_rate": 0.00013699944111568904, + "loss": 1.5687, + "step": 24252 + }, + { + "epoch": 0.3151566735916757, + "grad_norm": 0.45925915241241455, + "learning_rate": 0.0001369968416537777, + "loss": 1.4197, + "step": 24253 + }, + { + "epoch": 0.31516966813559155, + "grad_norm": 0.33666902780532837, + "learning_rate": 0.0001369942421918663, + "loss": 1.5279, + "step": 24254 + }, + { + "epoch": 0.31518266267950745, + "grad_norm": 0.3572155833244324, + "learning_rate": 0.00013699164272995491, + "loss": 1.4185, + "step": 24255 + }, + { + "epoch": 0.3151956572234233, + "grad_norm": 0.36339375376701355, + "learning_rate": 0.0001369890432680435, + "loss": 1.345, + "step": 24256 + }, + { + "epoch": 0.3152086517673392, + "grad_norm": 0.4763093888759613, + "learning_rate": 0.00013698644380613214, + "loss": 1.495, + "step": 24257 + }, + { + "epoch": 0.31522164631125504, + "grad_norm": 0.46020177006721497, + "learning_rate": 0.00013698384434422076, + "loss": 1.5206, + "step": 24258 + }, + { + "epoch": 0.31523464085517094, + "grad_norm": 0.36117589473724365, + "learning_rate": 0.00013698124488230936, + "loss": 1.2139, + "step": 24259 + }, + { + "epoch": 0.3152476353990868, + "grad_norm": 0.3548513352870941, + "learning_rate": 0.00013697864542039798, + "loss": 1.3241, + "step": 24260 + }, + { + "epoch": 0.3152606299430027, + "grad_norm": 0.36290788650512695, + "learning_rate": 0.0001369760459584866, + "loss": 1.3601, + "step": 24261 + }, + { + "epoch": 0.31527362448691854, + "grad_norm": 0.31978094577789307, + "learning_rate": 0.00013697344649657523, + "loss": 1.3904, + "step": 24262 + }, + { + "epoch": 0.31528661903083444, + "grad_norm": 0.4334924519062042, + "learning_rate": 0.00013697084703466383, + "loss": 1.4669, + "step": 24263 + }, + { + "epoch": 0.3152996135747503, + "grad_norm": 0.3809143900871277, + "learning_rate": 0.00013696824757275243, + "loss": 1.5479, + "step": 24264 + }, + { + "epoch": 0.3153126081186662, + "grad_norm": 0.39382120966911316, + "learning_rate": 0.00013696564811084108, + "loss": 1.4626, + "step": 24265 + }, + { + "epoch": 0.315325602662582, + "grad_norm": 0.347513347864151, + "learning_rate": 0.00013696304864892968, + "loss": 1.3503, + "step": 24266 + }, + { + "epoch": 0.31533859720649793, + "grad_norm": 0.3870210647583008, + "learning_rate": 0.0001369604491870183, + "loss": 1.6351, + "step": 24267 + }, + { + "epoch": 0.3153515917504138, + "grad_norm": 0.4059775769710541, + "learning_rate": 0.0001369578497251069, + "loss": 1.271, + "step": 24268 + }, + { + "epoch": 0.3153645862943297, + "grad_norm": 0.4513508677482605, + "learning_rate": 0.00013695525026319552, + "loss": 1.4766, + "step": 24269 + }, + { + "epoch": 0.3153775808382455, + "grad_norm": 0.45753729343414307, + "learning_rate": 0.00013695265080128415, + "loss": 1.4137, + "step": 24270 + }, + { + "epoch": 0.3153905753821614, + "grad_norm": 0.39034897089004517, + "learning_rate": 0.00013695005133937274, + "loss": 1.3052, + "step": 24271 + }, + { + "epoch": 0.31540356992607727, + "grad_norm": 0.38706445693969727, + "learning_rate": 0.00013694745187746137, + "loss": 1.5688, + "step": 24272 + }, + { + "epoch": 0.31541656446999317, + "grad_norm": 0.42655348777770996, + "learning_rate": 0.00013694485241555, + "loss": 1.3109, + "step": 24273 + }, + { + "epoch": 0.315429559013909, + "grad_norm": 0.3179061710834503, + "learning_rate": 0.00013694225295363862, + "loss": 1.3148, + "step": 24274 + }, + { + "epoch": 0.3154425535578249, + "grad_norm": 0.37439557909965515, + "learning_rate": 0.00013693965349172721, + "loss": 1.4902, + "step": 24275 + }, + { + "epoch": 0.31545554810174076, + "grad_norm": 0.3858317732810974, + "learning_rate": 0.00013693705402981584, + "loss": 1.3114, + "step": 24276 + }, + { + "epoch": 0.31546854264565666, + "grad_norm": 0.3992762565612793, + "learning_rate": 0.00013693445456790446, + "loss": 1.4296, + "step": 24277 + }, + { + "epoch": 0.3154815371895725, + "grad_norm": 0.2564488649368286, + "learning_rate": 0.00013693185510599306, + "loss": 1.1431, + "step": 24278 + }, + { + "epoch": 0.3154945317334884, + "grad_norm": 0.36163845658302307, + "learning_rate": 0.00013692925564408169, + "loss": 1.3885, + "step": 24279 + }, + { + "epoch": 0.31550752627740425, + "grad_norm": 0.3678258955478668, + "learning_rate": 0.00013692665618217028, + "loss": 1.443, + "step": 24280 + }, + { + "epoch": 0.31552052082132015, + "grad_norm": 0.40115487575531006, + "learning_rate": 0.0001369240567202589, + "loss": 1.502, + "step": 24281 + }, + { + "epoch": 0.315533515365236, + "grad_norm": 0.3350605070590973, + "learning_rate": 0.00013692145725834753, + "loss": 1.2839, + "step": 24282 + }, + { + "epoch": 0.3155465099091519, + "grad_norm": 0.31742534041404724, + "learning_rate": 0.00013691885779643613, + "loss": 1.3904, + "step": 24283 + }, + { + "epoch": 0.31555950445306774, + "grad_norm": 0.44456052780151367, + "learning_rate": 0.00013691625833452475, + "loss": 1.5256, + "step": 24284 + }, + { + "epoch": 0.31557249899698364, + "grad_norm": 0.36216914653778076, + "learning_rate": 0.00013691365887261338, + "loss": 1.4848, + "step": 24285 + }, + { + "epoch": 0.31558549354089954, + "grad_norm": 0.4361882209777832, + "learning_rate": 0.000136911059410702, + "loss": 1.3248, + "step": 24286 + }, + { + "epoch": 0.3155984880848154, + "grad_norm": 0.47620388865470886, + "learning_rate": 0.0001369084599487906, + "loss": 1.5696, + "step": 24287 + }, + { + "epoch": 0.3156114826287313, + "grad_norm": 0.42277124524116516, + "learning_rate": 0.00013690586048687922, + "loss": 1.3461, + "step": 24288 + }, + { + "epoch": 0.31562447717264713, + "grad_norm": 0.3535692095756531, + "learning_rate": 0.00013690326102496785, + "loss": 1.3456, + "step": 24289 + }, + { + "epoch": 0.31563747171656303, + "grad_norm": 0.4585002660751343, + "learning_rate": 0.00013690066156305645, + "loss": 1.5086, + "step": 24290 + }, + { + "epoch": 0.3156504662604789, + "grad_norm": 0.2959149181842804, + "learning_rate": 0.00013689806210114507, + "loss": 1.4206, + "step": 24291 + }, + { + "epoch": 0.3156634608043948, + "grad_norm": 0.44144290685653687, + "learning_rate": 0.0001368954626392337, + "loss": 1.3529, + "step": 24292 + }, + { + "epoch": 0.3156764553483106, + "grad_norm": 0.43028753995895386, + "learning_rate": 0.0001368928631773223, + "loss": 1.495, + "step": 24293 + }, + { + "epoch": 0.3156894498922265, + "grad_norm": 0.3719899356365204, + "learning_rate": 0.00013689026371541092, + "loss": 1.5469, + "step": 24294 + }, + { + "epoch": 0.31570244443614237, + "grad_norm": 0.34897610545158386, + "learning_rate": 0.00013688766425349951, + "loss": 1.2544, + "step": 24295 + }, + { + "epoch": 0.31571543898005827, + "grad_norm": 0.39846864342689514, + "learning_rate": 0.00013688506479158817, + "loss": 1.4804, + "step": 24296 + }, + { + "epoch": 0.3157284335239741, + "grad_norm": 0.4127960801124573, + "learning_rate": 0.00013688246532967676, + "loss": 1.4743, + "step": 24297 + }, + { + "epoch": 0.31574142806789, + "grad_norm": 0.37189939618110657, + "learning_rate": 0.0001368798658677654, + "loss": 1.426, + "step": 24298 + }, + { + "epoch": 0.31575442261180586, + "grad_norm": 0.39628031849861145, + "learning_rate": 0.00013687726640585399, + "loss": 1.4488, + "step": 24299 + }, + { + "epoch": 0.31576741715572176, + "grad_norm": 0.4661518335342407, + "learning_rate": 0.0001368746669439426, + "loss": 1.5333, + "step": 24300 + }, + { + "epoch": 0.3157804116996376, + "grad_norm": 0.37219560146331787, + "learning_rate": 0.00013687206748203123, + "loss": 1.4406, + "step": 24301 + }, + { + "epoch": 0.3157934062435535, + "grad_norm": 0.3905365765094757, + "learning_rate": 0.00013686946802011983, + "loss": 1.3698, + "step": 24302 + }, + { + "epoch": 0.31580640078746935, + "grad_norm": 0.5504800081253052, + "learning_rate": 0.00013686686855820846, + "loss": 1.4798, + "step": 24303 + }, + { + "epoch": 0.31581939533138526, + "grad_norm": 0.4288892447948456, + "learning_rate": 0.00013686426909629708, + "loss": 1.4789, + "step": 24304 + }, + { + "epoch": 0.3158323898753011, + "grad_norm": 0.4315957725048065, + "learning_rate": 0.0001368616696343857, + "loss": 1.6025, + "step": 24305 + }, + { + "epoch": 0.315845384419217, + "grad_norm": 0.48499569296836853, + "learning_rate": 0.0001368590701724743, + "loss": 1.4618, + "step": 24306 + }, + { + "epoch": 0.31585837896313285, + "grad_norm": 0.39639511704444885, + "learning_rate": 0.0001368564707105629, + "loss": 1.7133, + "step": 24307 + }, + { + "epoch": 0.31587137350704875, + "grad_norm": 0.41407209634780884, + "learning_rate": 0.00013685387124865155, + "loss": 1.5126, + "step": 24308 + }, + { + "epoch": 0.3158843680509646, + "grad_norm": 0.32924962043762207, + "learning_rate": 0.00013685127178674015, + "loss": 1.3487, + "step": 24309 + }, + { + "epoch": 0.3158973625948805, + "grad_norm": 0.3358137905597687, + "learning_rate": 0.00013684867232482877, + "loss": 1.5968, + "step": 24310 + }, + { + "epoch": 0.31591035713879634, + "grad_norm": 0.5075027942657471, + "learning_rate": 0.00013684607286291737, + "loss": 1.6422, + "step": 24311 + }, + { + "epoch": 0.31592335168271224, + "grad_norm": 0.5027745366096497, + "learning_rate": 0.000136843473401006, + "loss": 1.3337, + "step": 24312 + }, + { + "epoch": 0.3159363462266281, + "grad_norm": 0.42853492498397827, + "learning_rate": 0.00013684087393909462, + "loss": 1.2563, + "step": 24313 + }, + { + "epoch": 0.315949340770544, + "grad_norm": 0.3679285943508148, + "learning_rate": 0.00013683827447718322, + "loss": 1.3319, + "step": 24314 + }, + { + "epoch": 0.31596233531445983, + "grad_norm": 0.30248886346817017, + "learning_rate": 0.00013683567501527184, + "loss": 1.4425, + "step": 24315 + }, + { + "epoch": 0.31597532985837573, + "grad_norm": 0.3021405339241028, + "learning_rate": 0.00013683307555336047, + "loss": 1.1713, + "step": 24316 + }, + { + "epoch": 0.3159883244022916, + "grad_norm": 0.4111071527004242, + "learning_rate": 0.0001368304760914491, + "loss": 1.3056, + "step": 24317 + }, + { + "epoch": 0.3160013189462075, + "grad_norm": 0.40387460589408875, + "learning_rate": 0.0001368278766295377, + "loss": 1.3966, + "step": 24318 + }, + { + "epoch": 0.3160143134901233, + "grad_norm": 0.42876961827278137, + "learning_rate": 0.00013682527716762629, + "loss": 1.4523, + "step": 24319 + }, + { + "epoch": 0.3160273080340392, + "grad_norm": 0.4366833567619324, + "learning_rate": 0.00013682267770571494, + "loss": 1.3512, + "step": 24320 + }, + { + "epoch": 0.31604030257795507, + "grad_norm": 0.2831141948699951, + "learning_rate": 0.00013682007824380353, + "loss": 1.4583, + "step": 24321 + }, + { + "epoch": 0.31605329712187097, + "grad_norm": 0.5409504175186157, + "learning_rate": 0.00013681747878189216, + "loss": 1.5404, + "step": 24322 + }, + { + "epoch": 0.3160662916657868, + "grad_norm": 0.30610910058021545, + "learning_rate": 0.00013681487931998076, + "loss": 1.3331, + "step": 24323 + }, + { + "epoch": 0.3160792862097027, + "grad_norm": 0.40455469489097595, + "learning_rate": 0.00013681227985806938, + "loss": 1.6068, + "step": 24324 + }, + { + "epoch": 0.31609228075361856, + "grad_norm": 0.3958268463611603, + "learning_rate": 0.000136809680396158, + "loss": 1.3528, + "step": 24325 + }, + { + "epoch": 0.31610527529753446, + "grad_norm": 0.4272357225418091, + "learning_rate": 0.0001368070809342466, + "loss": 1.2241, + "step": 24326 + }, + { + "epoch": 0.3161182698414503, + "grad_norm": 0.40770307183265686, + "learning_rate": 0.00013680448147233525, + "loss": 1.3467, + "step": 24327 + }, + { + "epoch": 0.3161312643853662, + "grad_norm": 0.35952791571617126, + "learning_rate": 0.00013680188201042385, + "loss": 1.0945, + "step": 24328 + }, + { + "epoch": 0.31614425892928205, + "grad_norm": 0.4818912148475647, + "learning_rate": 0.00013679928254851248, + "loss": 1.3933, + "step": 24329 + }, + { + "epoch": 0.31615725347319795, + "grad_norm": 0.34593772888183594, + "learning_rate": 0.00013679668308660107, + "loss": 1.2958, + "step": 24330 + }, + { + "epoch": 0.3161702480171138, + "grad_norm": 0.37886014580726624, + "learning_rate": 0.0001367940836246897, + "loss": 1.5966, + "step": 24331 + }, + { + "epoch": 0.3161832425610297, + "grad_norm": 0.42048585414886475, + "learning_rate": 0.00013679148416277832, + "loss": 1.4293, + "step": 24332 + }, + { + "epoch": 0.31619623710494554, + "grad_norm": 0.4240248501300812, + "learning_rate": 0.00013678888470086692, + "loss": 1.45, + "step": 24333 + }, + { + "epoch": 0.31620923164886144, + "grad_norm": 0.3015948534011841, + "learning_rate": 0.00013678628523895554, + "loss": 1.1447, + "step": 24334 + }, + { + "epoch": 0.3162222261927773, + "grad_norm": 0.3940720558166504, + "learning_rate": 0.00013678368577704417, + "loss": 1.4425, + "step": 24335 + }, + { + "epoch": 0.3162352207366932, + "grad_norm": 0.43135395646095276, + "learning_rate": 0.00013678108631513277, + "loss": 1.3101, + "step": 24336 + }, + { + "epoch": 0.31624821528060904, + "grad_norm": 0.3022572100162506, + "learning_rate": 0.0001367784868532214, + "loss": 1.4673, + "step": 24337 + }, + { + "epoch": 0.31626120982452494, + "grad_norm": 0.3595219552516937, + "learning_rate": 0.00013677588739131, + "loss": 1.4218, + "step": 24338 + }, + { + "epoch": 0.3162742043684408, + "grad_norm": 0.3659757673740387, + "learning_rate": 0.00013677328792939864, + "loss": 1.2944, + "step": 24339 + }, + { + "epoch": 0.3162871989123567, + "grad_norm": 0.4848015308380127, + "learning_rate": 0.00013677068846748724, + "loss": 1.4138, + "step": 24340 + }, + { + "epoch": 0.3163001934562725, + "grad_norm": 0.38484156131744385, + "learning_rate": 0.00013676808900557586, + "loss": 1.605, + "step": 24341 + }, + { + "epoch": 0.3163131880001884, + "grad_norm": 0.40679213404655457, + "learning_rate": 0.00013676548954366446, + "loss": 1.4427, + "step": 24342 + }, + { + "epoch": 0.3163261825441043, + "grad_norm": 0.3918675184249878, + "learning_rate": 0.00013676289008175308, + "loss": 1.3539, + "step": 24343 + }, + { + "epoch": 0.3163391770880202, + "grad_norm": 0.40854018926620483, + "learning_rate": 0.0001367602906198417, + "loss": 1.4043, + "step": 24344 + }, + { + "epoch": 0.316352171631936, + "grad_norm": 0.5385652184486389, + "learning_rate": 0.0001367576911579303, + "loss": 1.3421, + "step": 24345 + }, + { + "epoch": 0.3163651661758519, + "grad_norm": 0.41769295930862427, + "learning_rate": 0.00013675509169601893, + "loss": 1.4397, + "step": 24346 + }, + { + "epoch": 0.31637816071976776, + "grad_norm": 0.46131348609924316, + "learning_rate": 0.00013675249223410755, + "loss": 1.2472, + "step": 24347 + }, + { + "epoch": 0.31639115526368367, + "grad_norm": 0.3219773471355438, + "learning_rate": 0.00013674989277219615, + "loss": 1.246, + "step": 24348 + }, + { + "epoch": 0.3164041498075995, + "grad_norm": 0.4026868939399719, + "learning_rate": 0.00013674729331028478, + "loss": 1.4756, + "step": 24349 + }, + { + "epoch": 0.3164171443515154, + "grad_norm": 0.4557186961174011, + "learning_rate": 0.00013674469384837337, + "loss": 1.6064, + "step": 24350 + }, + { + "epoch": 0.31643013889543126, + "grad_norm": 0.4154195487499237, + "learning_rate": 0.00013674209438646202, + "loss": 1.5842, + "step": 24351 + }, + { + "epoch": 0.31644313343934716, + "grad_norm": 0.44961991906166077, + "learning_rate": 0.00013673949492455062, + "loss": 1.3923, + "step": 24352 + }, + { + "epoch": 0.316456127983263, + "grad_norm": 0.4193685054779053, + "learning_rate": 0.00013673689546263925, + "loss": 1.4905, + "step": 24353 + }, + { + "epoch": 0.3164691225271789, + "grad_norm": 0.4472082555294037, + "learning_rate": 0.00013673429600072784, + "loss": 1.2917, + "step": 24354 + }, + { + "epoch": 0.31648211707109475, + "grad_norm": 0.3476710617542267, + "learning_rate": 0.00013673169653881647, + "loss": 1.1783, + "step": 24355 + }, + { + "epoch": 0.31649511161501065, + "grad_norm": 0.4199211895465851, + "learning_rate": 0.0001367290970769051, + "loss": 1.2929, + "step": 24356 + }, + { + "epoch": 0.3165081061589265, + "grad_norm": 0.3336026072502136, + "learning_rate": 0.0001367264976149937, + "loss": 1.4057, + "step": 24357 + }, + { + "epoch": 0.3165211007028424, + "grad_norm": 0.4056035280227661, + "learning_rate": 0.00013672389815308231, + "loss": 1.3765, + "step": 24358 + }, + { + "epoch": 0.31653409524675824, + "grad_norm": 0.32808977365493774, + "learning_rate": 0.00013672129869117094, + "loss": 1.3913, + "step": 24359 + }, + { + "epoch": 0.31654708979067414, + "grad_norm": 0.4635668992996216, + "learning_rate": 0.00013671869922925956, + "loss": 1.3776, + "step": 24360 + }, + { + "epoch": 0.31656008433459004, + "grad_norm": 0.42094820737838745, + "learning_rate": 0.00013671609976734816, + "loss": 1.4913, + "step": 24361 + }, + { + "epoch": 0.3165730788785059, + "grad_norm": 0.508424699306488, + "learning_rate": 0.00013671350030543676, + "loss": 1.316, + "step": 24362 + }, + { + "epoch": 0.3165860734224218, + "grad_norm": 0.42761772871017456, + "learning_rate": 0.0001367109008435254, + "loss": 1.3915, + "step": 24363 + }, + { + "epoch": 0.31659906796633763, + "grad_norm": 0.3329997658729553, + "learning_rate": 0.000136708301381614, + "loss": 1.5453, + "step": 24364 + }, + { + "epoch": 0.31661206251025353, + "grad_norm": 0.47200146317481995, + "learning_rate": 0.00013670570191970263, + "loss": 1.4411, + "step": 24365 + }, + { + "epoch": 0.3166250570541694, + "grad_norm": 0.3911210298538208, + "learning_rate": 0.00013670310245779126, + "loss": 1.4591, + "step": 24366 + }, + { + "epoch": 0.3166380515980853, + "grad_norm": 0.36426842212677, + "learning_rate": 0.00013670050299587985, + "loss": 1.2939, + "step": 24367 + }, + { + "epoch": 0.3166510461420011, + "grad_norm": 0.48546144366264343, + "learning_rate": 0.00013669790353396848, + "loss": 1.4095, + "step": 24368 + }, + { + "epoch": 0.316664040685917, + "grad_norm": 0.43470498919487, + "learning_rate": 0.00013669530407205708, + "loss": 1.5054, + "step": 24369 + }, + { + "epoch": 0.31667703522983287, + "grad_norm": 0.29904910922050476, + "learning_rate": 0.00013669270461014573, + "loss": 1.3634, + "step": 24370 + }, + { + "epoch": 0.31669002977374877, + "grad_norm": 0.4916445314884186, + "learning_rate": 0.00013669010514823432, + "loss": 1.4675, + "step": 24371 + }, + { + "epoch": 0.3167030243176646, + "grad_norm": 0.42997390031814575, + "learning_rate": 0.00013668750568632295, + "loss": 1.4468, + "step": 24372 + }, + { + "epoch": 0.3167160188615805, + "grad_norm": 0.4192761182785034, + "learning_rate": 0.00013668490622441155, + "loss": 1.3557, + "step": 24373 + }, + { + "epoch": 0.31672901340549636, + "grad_norm": 0.46975022554397583, + "learning_rate": 0.00013668230676250017, + "loss": 1.5046, + "step": 24374 + }, + { + "epoch": 0.31674200794941226, + "grad_norm": 0.3747991621494293, + "learning_rate": 0.0001366797073005888, + "loss": 1.4379, + "step": 24375 + }, + { + "epoch": 0.3167550024933281, + "grad_norm": 0.3558545708656311, + "learning_rate": 0.0001366771078386774, + "loss": 1.2838, + "step": 24376 + }, + { + "epoch": 0.316767997037244, + "grad_norm": 0.45769351720809937, + "learning_rate": 0.00013667450837676602, + "loss": 1.5688, + "step": 24377 + }, + { + "epoch": 0.31678099158115985, + "grad_norm": 0.367517352104187, + "learning_rate": 0.00013667190891485464, + "loss": 1.2833, + "step": 24378 + }, + { + "epoch": 0.31679398612507575, + "grad_norm": 0.36312514543533325, + "learning_rate": 0.00013666930945294324, + "loss": 1.3183, + "step": 24379 + }, + { + "epoch": 0.3168069806689916, + "grad_norm": 0.40454697608947754, + "learning_rate": 0.00013666670999103186, + "loss": 1.2847, + "step": 24380 + }, + { + "epoch": 0.3168199752129075, + "grad_norm": 0.4117373526096344, + "learning_rate": 0.00013666411052912046, + "loss": 1.2778, + "step": 24381 + }, + { + "epoch": 0.31683296975682335, + "grad_norm": 0.42060205340385437, + "learning_rate": 0.0001366615110672091, + "loss": 1.4025, + "step": 24382 + }, + { + "epoch": 0.31684596430073925, + "grad_norm": 0.36232438683509827, + "learning_rate": 0.0001366589116052977, + "loss": 1.4401, + "step": 24383 + }, + { + "epoch": 0.3168589588446551, + "grad_norm": 0.38088318705558777, + "learning_rate": 0.00013665631214338633, + "loss": 1.3573, + "step": 24384 + }, + { + "epoch": 0.316871953388571, + "grad_norm": 0.37629103660583496, + "learning_rate": 0.00013665371268147493, + "loss": 1.4336, + "step": 24385 + }, + { + "epoch": 0.31688494793248684, + "grad_norm": 0.29912474751472473, + "learning_rate": 0.00013665111321956356, + "loss": 1.3813, + "step": 24386 + }, + { + "epoch": 0.31689794247640274, + "grad_norm": 0.3750167787075043, + "learning_rate": 0.00013664851375765218, + "loss": 1.4466, + "step": 24387 + }, + { + "epoch": 0.3169109370203186, + "grad_norm": 0.395393431186676, + "learning_rate": 0.00013664591429574078, + "loss": 1.3905, + "step": 24388 + }, + { + "epoch": 0.3169239315642345, + "grad_norm": 0.40814340114593506, + "learning_rate": 0.0001366433148338294, + "loss": 1.5117, + "step": 24389 + }, + { + "epoch": 0.31693692610815033, + "grad_norm": 0.31867504119873047, + "learning_rate": 0.00013664071537191803, + "loss": 1.2665, + "step": 24390 + }, + { + "epoch": 0.31694992065206623, + "grad_norm": 0.3972594738006592, + "learning_rate": 0.00013663811591000662, + "loss": 1.527, + "step": 24391 + }, + { + "epoch": 0.3169629151959821, + "grad_norm": 0.3781193196773529, + "learning_rate": 0.00013663551644809525, + "loss": 1.5671, + "step": 24392 + }, + { + "epoch": 0.316975909739898, + "grad_norm": 0.3944792151451111, + "learning_rate": 0.00013663291698618385, + "loss": 1.414, + "step": 24393 + }, + { + "epoch": 0.3169889042838138, + "grad_norm": 0.40063533186912537, + "learning_rate": 0.0001366303175242725, + "loss": 1.4716, + "step": 24394 + }, + { + "epoch": 0.3170018988277297, + "grad_norm": 0.43710336089134216, + "learning_rate": 0.0001366277180623611, + "loss": 1.3416, + "step": 24395 + }, + { + "epoch": 0.31701489337164557, + "grad_norm": 0.3572905659675598, + "learning_rate": 0.00013662511860044972, + "loss": 1.3681, + "step": 24396 + }, + { + "epoch": 0.31702788791556147, + "grad_norm": 0.37871429324150085, + "learning_rate": 0.00013662251913853832, + "loss": 1.3879, + "step": 24397 + }, + { + "epoch": 0.3170408824594773, + "grad_norm": 0.3703164756298065, + "learning_rate": 0.00013661991967662694, + "loss": 1.4376, + "step": 24398 + }, + { + "epoch": 0.3170538770033932, + "grad_norm": 0.38005539774894714, + "learning_rate": 0.00013661732021471557, + "loss": 1.4788, + "step": 24399 + }, + { + "epoch": 0.31706687154730906, + "grad_norm": 0.3313455283641815, + "learning_rate": 0.00013661472075280416, + "loss": 1.2636, + "step": 24400 + }, + { + "epoch": 0.31707986609122496, + "grad_norm": 0.43130943179130554, + "learning_rate": 0.00013661212129089282, + "loss": 1.5056, + "step": 24401 + }, + { + "epoch": 0.3170928606351408, + "grad_norm": 0.39680278301239014, + "learning_rate": 0.0001366095218289814, + "loss": 1.5009, + "step": 24402 + }, + { + "epoch": 0.3171058551790567, + "grad_norm": 0.40444695949554443, + "learning_rate": 0.00013660692236707, + "loss": 1.4351, + "step": 24403 + }, + { + "epoch": 0.31711884972297255, + "grad_norm": 0.4056309163570404, + "learning_rate": 0.00013660432290515863, + "loss": 1.4454, + "step": 24404 + }, + { + "epoch": 0.31713184426688845, + "grad_norm": 0.39997339248657227, + "learning_rate": 0.00013660172344324726, + "loss": 1.3238, + "step": 24405 + }, + { + "epoch": 0.3171448388108043, + "grad_norm": 0.3984402120113373, + "learning_rate": 0.00013659912398133588, + "loss": 1.3976, + "step": 24406 + }, + { + "epoch": 0.3171578333547202, + "grad_norm": 0.3693009614944458, + "learning_rate": 0.00013659652451942448, + "loss": 1.4603, + "step": 24407 + }, + { + "epoch": 0.31717082789863604, + "grad_norm": 0.28103724122047424, + "learning_rate": 0.0001365939250575131, + "loss": 1.3755, + "step": 24408 + }, + { + "epoch": 0.31718382244255194, + "grad_norm": 0.40090128779411316, + "learning_rate": 0.00013659132559560173, + "loss": 1.4489, + "step": 24409 + }, + { + "epoch": 0.3171968169864678, + "grad_norm": 0.44954749941825867, + "learning_rate": 0.00013658872613369033, + "loss": 1.4129, + "step": 24410 + }, + { + "epoch": 0.3172098115303837, + "grad_norm": 0.3736685812473297, + "learning_rate": 0.00013658612667177895, + "loss": 1.4027, + "step": 24411 + }, + { + "epoch": 0.31722280607429953, + "grad_norm": 0.41179800033569336, + "learning_rate": 0.00013658352720986755, + "loss": 1.5489, + "step": 24412 + }, + { + "epoch": 0.31723580061821544, + "grad_norm": 0.32932525873184204, + "learning_rate": 0.0001365809277479562, + "loss": 1.3133, + "step": 24413 + }, + { + "epoch": 0.3172487951621313, + "grad_norm": 0.3847118318080902, + "learning_rate": 0.0001365783282860448, + "loss": 1.5251, + "step": 24414 + }, + { + "epoch": 0.3172617897060472, + "grad_norm": 0.3626595735549927, + "learning_rate": 0.0001365757288241334, + "loss": 1.5451, + "step": 24415 + }, + { + "epoch": 0.317274784249963, + "grad_norm": 0.37977007031440735, + "learning_rate": 0.00013657312936222202, + "loss": 1.3042, + "step": 24416 + }, + { + "epoch": 0.3172877787938789, + "grad_norm": 0.34163352847099304, + "learning_rate": 0.00013657052990031064, + "loss": 1.4929, + "step": 24417 + }, + { + "epoch": 0.3173007733377948, + "grad_norm": 0.5282631516456604, + "learning_rate": 0.00013656793043839927, + "loss": 1.5479, + "step": 24418 + }, + { + "epoch": 0.3173137678817107, + "grad_norm": 0.3378818929195404, + "learning_rate": 0.00013656533097648787, + "loss": 1.4773, + "step": 24419 + }, + { + "epoch": 0.3173267624256265, + "grad_norm": 0.3953704237937927, + "learning_rate": 0.0001365627315145765, + "loss": 1.4961, + "step": 24420 + }, + { + "epoch": 0.3173397569695424, + "grad_norm": 0.16165396571159363, + "learning_rate": 0.00013656013205266512, + "loss": 1.0871, + "step": 24421 + }, + { + "epoch": 0.31735275151345826, + "grad_norm": 0.35105374455451965, + "learning_rate": 0.0001365575325907537, + "loss": 1.426, + "step": 24422 + }, + { + "epoch": 0.31736574605737417, + "grad_norm": 0.4470350742340088, + "learning_rate": 0.00013655493312884234, + "loss": 1.4488, + "step": 24423 + }, + { + "epoch": 0.31737874060129, + "grad_norm": 0.43733686208724976, + "learning_rate": 0.00013655233366693093, + "loss": 1.411, + "step": 24424 + }, + { + "epoch": 0.3173917351452059, + "grad_norm": 0.41593149304389954, + "learning_rate": 0.00013654973420501959, + "loss": 1.3897, + "step": 24425 + }, + { + "epoch": 0.31740472968912176, + "grad_norm": 0.36359885334968567, + "learning_rate": 0.00013654713474310818, + "loss": 1.3737, + "step": 24426 + }, + { + "epoch": 0.31741772423303766, + "grad_norm": 0.38714805245399475, + "learning_rate": 0.0001365445352811968, + "loss": 1.423, + "step": 24427 + }, + { + "epoch": 0.3174307187769535, + "grad_norm": 0.3751436173915863, + "learning_rate": 0.0001365419358192854, + "loss": 1.4568, + "step": 24428 + }, + { + "epoch": 0.3174437133208694, + "grad_norm": 0.4020783007144928, + "learning_rate": 0.00013653933635737403, + "loss": 1.3229, + "step": 24429 + }, + { + "epoch": 0.31745670786478525, + "grad_norm": 0.4770466685295105, + "learning_rate": 0.00013653673689546265, + "loss": 1.5079, + "step": 24430 + }, + { + "epoch": 0.31746970240870115, + "grad_norm": 0.33619800209999084, + "learning_rate": 0.00013653413743355125, + "loss": 1.0921, + "step": 24431 + }, + { + "epoch": 0.317482696952617, + "grad_norm": 0.4646088480949402, + "learning_rate": 0.00013653153797163988, + "loss": 1.4453, + "step": 24432 + }, + { + "epoch": 0.3174956914965329, + "grad_norm": 0.4613949656486511, + "learning_rate": 0.0001365289385097285, + "loss": 1.4958, + "step": 24433 + }, + { + "epoch": 0.31750868604044874, + "grad_norm": 0.45388227701187134, + "learning_rate": 0.0001365263390478171, + "loss": 1.2837, + "step": 24434 + }, + { + "epoch": 0.31752168058436464, + "grad_norm": 0.4936314523220062, + "learning_rate": 0.00013652373958590572, + "loss": 1.453, + "step": 24435 + }, + { + "epoch": 0.3175346751282805, + "grad_norm": 0.39332345128059387, + "learning_rate": 0.00013652114012399432, + "loss": 1.4375, + "step": 24436 + }, + { + "epoch": 0.3175476696721964, + "grad_norm": 0.4769171476364136, + "learning_rate": 0.00013651854066208297, + "loss": 1.3027, + "step": 24437 + }, + { + "epoch": 0.3175606642161123, + "grad_norm": 0.3637208342552185, + "learning_rate": 0.00013651594120017157, + "loss": 1.3096, + "step": 24438 + }, + { + "epoch": 0.31757365876002813, + "grad_norm": 0.3802751302719116, + "learning_rate": 0.0001365133417382602, + "loss": 1.3362, + "step": 24439 + }, + { + "epoch": 0.31758665330394403, + "grad_norm": 0.4256417155265808, + "learning_rate": 0.00013651074227634882, + "loss": 1.4033, + "step": 24440 + }, + { + "epoch": 0.3175996478478599, + "grad_norm": 0.38301560282707214, + "learning_rate": 0.00013650814281443742, + "loss": 1.3392, + "step": 24441 + }, + { + "epoch": 0.3176126423917758, + "grad_norm": 0.3901337683200836, + "learning_rate": 0.00013650554335252604, + "loss": 1.2899, + "step": 24442 + }, + { + "epoch": 0.3176256369356916, + "grad_norm": 0.4726414084434509, + "learning_rate": 0.00013650294389061464, + "loss": 1.4639, + "step": 24443 + }, + { + "epoch": 0.3176386314796075, + "grad_norm": 0.4089820086956024, + "learning_rate": 0.0001365003444287033, + "loss": 1.5096, + "step": 24444 + }, + { + "epoch": 0.31765162602352337, + "grad_norm": 0.43032199144363403, + "learning_rate": 0.00013649774496679189, + "loss": 1.364, + "step": 24445 + }, + { + "epoch": 0.31766462056743927, + "grad_norm": 0.47647249698638916, + "learning_rate": 0.00013649514550488048, + "loss": 1.5068, + "step": 24446 + }, + { + "epoch": 0.3176776151113551, + "grad_norm": 0.4064078629016876, + "learning_rate": 0.0001364925460429691, + "loss": 1.294, + "step": 24447 + }, + { + "epoch": 0.317690609655271, + "grad_norm": 0.4681776463985443, + "learning_rate": 0.00013648994658105773, + "loss": 1.3685, + "step": 24448 + }, + { + "epoch": 0.31770360419918686, + "grad_norm": 0.4651210606098175, + "learning_rate": 0.00013648734711914636, + "loss": 1.4488, + "step": 24449 + }, + { + "epoch": 0.31771659874310276, + "grad_norm": 0.4250487983226776, + "learning_rate": 0.00013648474765723495, + "loss": 1.4641, + "step": 24450 + }, + { + "epoch": 0.3177295932870186, + "grad_norm": 0.4269929528236389, + "learning_rate": 0.00013648214819532358, + "loss": 1.4989, + "step": 24451 + }, + { + "epoch": 0.3177425878309345, + "grad_norm": 0.38231131434440613, + "learning_rate": 0.0001364795487334122, + "loss": 1.4425, + "step": 24452 + }, + { + "epoch": 0.31775558237485035, + "grad_norm": 0.4581094980239868, + "learning_rate": 0.0001364769492715008, + "loss": 1.4599, + "step": 24453 + }, + { + "epoch": 0.31776857691876625, + "grad_norm": 0.42847728729248047, + "learning_rate": 0.00013647434980958943, + "loss": 1.3118, + "step": 24454 + }, + { + "epoch": 0.3177815714626821, + "grad_norm": 0.3296990692615509, + "learning_rate": 0.00013647175034767802, + "loss": 1.302, + "step": 24455 + }, + { + "epoch": 0.317794566006598, + "grad_norm": 0.3605743646621704, + "learning_rate": 0.00013646915088576667, + "loss": 1.4442, + "step": 24456 + }, + { + "epoch": 0.31780756055051385, + "grad_norm": 0.3527849018573761, + "learning_rate": 0.00013646655142385527, + "loss": 1.4337, + "step": 24457 + }, + { + "epoch": 0.31782055509442975, + "grad_norm": 0.2750887870788574, + "learning_rate": 0.00013646395196194387, + "loss": 1.4747, + "step": 24458 + }, + { + "epoch": 0.3178335496383456, + "grad_norm": 0.33957940340042114, + "learning_rate": 0.0001364613525000325, + "loss": 1.4128, + "step": 24459 + }, + { + "epoch": 0.3178465441822615, + "grad_norm": 0.34004250168800354, + "learning_rate": 0.00013645875303812112, + "loss": 1.1761, + "step": 24460 + }, + { + "epoch": 0.31785953872617734, + "grad_norm": 0.38412028551101685, + "learning_rate": 0.00013645615357620974, + "loss": 1.4794, + "step": 24461 + }, + { + "epoch": 0.31787253327009324, + "grad_norm": 0.48290520906448364, + "learning_rate": 0.00013645355411429834, + "loss": 1.4543, + "step": 24462 + }, + { + "epoch": 0.3178855278140091, + "grad_norm": 0.45453929901123047, + "learning_rate": 0.00013645095465238696, + "loss": 1.3192, + "step": 24463 + }, + { + "epoch": 0.317898522357925, + "grad_norm": 0.47529467940330505, + "learning_rate": 0.0001364483551904756, + "loss": 1.4917, + "step": 24464 + }, + { + "epoch": 0.31791151690184083, + "grad_norm": 0.35940366983413696, + "learning_rate": 0.00013644575572856419, + "loss": 1.3336, + "step": 24465 + }, + { + "epoch": 0.31792451144575673, + "grad_norm": 0.4172157347202301, + "learning_rate": 0.0001364431562666528, + "loss": 1.433, + "step": 24466 + }, + { + "epoch": 0.3179375059896726, + "grad_norm": 0.4016782343387604, + "learning_rate": 0.0001364405568047414, + "loss": 1.3937, + "step": 24467 + }, + { + "epoch": 0.3179505005335885, + "grad_norm": 0.40001797676086426, + "learning_rate": 0.00013643795734283006, + "loss": 1.2662, + "step": 24468 + }, + { + "epoch": 0.3179634950775043, + "grad_norm": 0.34742429852485657, + "learning_rate": 0.00013643535788091866, + "loss": 1.2846, + "step": 24469 + }, + { + "epoch": 0.3179764896214202, + "grad_norm": 0.34739571809768677, + "learning_rate": 0.00013643275841900725, + "loss": 1.2309, + "step": 24470 + }, + { + "epoch": 0.31798948416533607, + "grad_norm": 0.3443008065223694, + "learning_rate": 0.00013643015895709588, + "loss": 1.4515, + "step": 24471 + }, + { + "epoch": 0.31800247870925197, + "grad_norm": 0.3984827995300293, + "learning_rate": 0.0001364275594951845, + "loss": 1.4079, + "step": 24472 + }, + { + "epoch": 0.3180154732531678, + "grad_norm": 0.38483062386512756, + "learning_rate": 0.00013642496003327313, + "loss": 1.4874, + "step": 24473 + }, + { + "epoch": 0.3180284677970837, + "grad_norm": 0.34384283423423767, + "learning_rate": 0.00013642236057136172, + "loss": 1.4637, + "step": 24474 + }, + { + "epoch": 0.31804146234099956, + "grad_norm": 0.25040942430496216, + "learning_rate": 0.00013641976110945035, + "loss": 1.3285, + "step": 24475 + }, + { + "epoch": 0.31805445688491546, + "grad_norm": 0.4131932258605957, + "learning_rate": 0.00013641716164753897, + "loss": 1.5199, + "step": 24476 + }, + { + "epoch": 0.3180674514288313, + "grad_norm": 0.4325697124004364, + "learning_rate": 0.00013641456218562757, + "loss": 1.4554, + "step": 24477 + }, + { + "epoch": 0.3180804459727472, + "grad_norm": 0.3533201813697815, + "learning_rate": 0.0001364119627237162, + "loss": 1.5592, + "step": 24478 + }, + { + "epoch": 0.31809344051666305, + "grad_norm": 0.42637214064598083, + "learning_rate": 0.00013640936326180482, + "loss": 1.5279, + "step": 24479 + }, + { + "epoch": 0.31810643506057895, + "grad_norm": 0.34816643595695496, + "learning_rate": 0.00013640676379989344, + "loss": 1.4878, + "step": 24480 + }, + { + "epoch": 0.3181194296044948, + "grad_norm": 0.3948005437850952, + "learning_rate": 0.00013640416433798204, + "loss": 1.3542, + "step": 24481 + }, + { + "epoch": 0.3181324241484107, + "grad_norm": 0.343655526638031, + "learning_rate": 0.00013640156487607067, + "loss": 1.2842, + "step": 24482 + }, + { + "epoch": 0.31814541869232654, + "grad_norm": 0.40343061089515686, + "learning_rate": 0.0001363989654141593, + "loss": 1.3605, + "step": 24483 + }, + { + "epoch": 0.31815841323624244, + "grad_norm": 0.4316779375076294, + "learning_rate": 0.0001363963659522479, + "loss": 1.4025, + "step": 24484 + }, + { + "epoch": 0.3181714077801583, + "grad_norm": 0.41991502046585083, + "learning_rate": 0.0001363937664903365, + "loss": 1.3039, + "step": 24485 + }, + { + "epoch": 0.3181844023240742, + "grad_norm": 0.46094104647636414, + "learning_rate": 0.0001363911670284251, + "loss": 1.2499, + "step": 24486 + }, + { + "epoch": 0.31819739686799003, + "grad_norm": 0.39675092697143555, + "learning_rate": 0.00013638856756651373, + "loss": 1.4136, + "step": 24487 + }, + { + "epoch": 0.31821039141190594, + "grad_norm": 0.40125471353530884, + "learning_rate": 0.00013638596810460236, + "loss": 1.3759, + "step": 24488 + }, + { + "epoch": 0.3182233859558218, + "grad_norm": 0.4595598578453064, + "learning_rate": 0.00013638336864269096, + "loss": 1.4896, + "step": 24489 + }, + { + "epoch": 0.3182363804997377, + "grad_norm": 0.33391091227531433, + "learning_rate": 0.00013638076918077958, + "loss": 1.3795, + "step": 24490 + }, + { + "epoch": 0.3182493750436535, + "grad_norm": 0.32041823863983154, + "learning_rate": 0.0001363781697188682, + "loss": 1.431, + "step": 24491 + }, + { + "epoch": 0.3182623695875694, + "grad_norm": 0.3728831112384796, + "learning_rate": 0.00013637557025695683, + "loss": 1.4762, + "step": 24492 + }, + { + "epoch": 0.31827536413148527, + "grad_norm": 0.44518592953681946, + "learning_rate": 0.00013637297079504543, + "loss": 1.3638, + "step": 24493 + }, + { + "epoch": 0.3182883586754012, + "grad_norm": 0.4794606864452362, + "learning_rate": 0.00013637037133313405, + "loss": 1.5528, + "step": 24494 + }, + { + "epoch": 0.318301353219317, + "grad_norm": 0.305264949798584, + "learning_rate": 0.00013636777187122268, + "loss": 1.3475, + "step": 24495 + }, + { + "epoch": 0.3183143477632329, + "grad_norm": 0.3816368281841278, + "learning_rate": 0.00013636517240931127, + "loss": 1.3319, + "step": 24496 + }, + { + "epoch": 0.31832734230714876, + "grad_norm": 0.30250993371009827, + "learning_rate": 0.0001363625729473999, + "loss": 1.3405, + "step": 24497 + }, + { + "epoch": 0.31834033685106466, + "grad_norm": 0.4202573299407959, + "learning_rate": 0.0001363599734854885, + "loss": 1.3743, + "step": 24498 + }, + { + "epoch": 0.3183533313949805, + "grad_norm": 0.36171600222587585, + "learning_rate": 0.00013635737402357712, + "loss": 1.5809, + "step": 24499 + }, + { + "epoch": 0.3183663259388964, + "grad_norm": 0.460409015417099, + "learning_rate": 0.00013635477456166574, + "loss": 1.5428, + "step": 24500 + }, + { + "epoch": 0.31837932048281226, + "grad_norm": 0.4011229872703552, + "learning_rate": 0.00013635217509975434, + "loss": 1.3342, + "step": 24501 + }, + { + "epoch": 0.31839231502672816, + "grad_norm": 0.33957865834236145, + "learning_rate": 0.00013634957563784297, + "loss": 1.4648, + "step": 24502 + }, + { + "epoch": 0.318405309570644, + "grad_norm": 0.42703142762184143, + "learning_rate": 0.0001363469761759316, + "loss": 1.4846, + "step": 24503 + }, + { + "epoch": 0.3184183041145599, + "grad_norm": 0.3616746962070465, + "learning_rate": 0.00013634437671402022, + "loss": 1.2663, + "step": 24504 + }, + { + "epoch": 0.31843129865847575, + "grad_norm": 0.45498013496398926, + "learning_rate": 0.0001363417772521088, + "loss": 1.3494, + "step": 24505 + }, + { + "epoch": 0.31844429320239165, + "grad_norm": 0.4362391233444214, + "learning_rate": 0.00013633917779019744, + "loss": 1.203, + "step": 24506 + }, + { + "epoch": 0.3184572877463075, + "grad_norm": 0.42321792244911194, + "learning_rate": 0.00013633657832828606, + "loss": 1.2931, + "step": 24507 + }, + { + "epoch": 0.3184702822902234, + "grad_norm": 0.42419230937957764, + "learning_rate": 0.00013633397886637466, + "loss": 1.3792, + "step": 24508 + }, + { + "epoch": 0.31848327683413924, + "grad_norm": 0.46707555651664734, + "learning_rate": 0.00013633137940446328, + "loss": 1.3083, + "step": 24509 + }, + { + "epoch": 0.31849627137805514, + "grad_norm": 0.3309074342250824, + "learning_rate": 0.00013632877994255188, + "loss": 1.4309, + "step": 24510 + }, + { + "epoch": 0.318509265921971, + "grad_norm": 0.4789426624774933, + "learning_rate": 0.00013632618048064053, + "loss": 1.4981, + "step": 24511 + }, + { + "epoch": 0.3185222604658869, + "grad_norm": 0.4254739582538605, + "learning_rate": 0.00013632358101872913, + "loss": 1.3034, + "step": 24512 + }, + { + "epoch": 0.31853525500980273, + "grad_norm": 0.4464763104915619, + "learning_rate": 0.00013632098155681773, + "loss": 1.424, + "step": 24513 + }, + { + "epoch": 0.31854824955371863, + "grad_norm": 0.44271036982536316, + "learning_rate": 0.00013631838209490638, + "loss": 1.4236, + "step": 24514 + }, + { + "epoch": 0.31856124409763453, + "grad_norm": 0.4108887016773224, + "learning_rate": 0.00013631578263299498, + "loss": 1.311, + "step": 24515 + }, + { + "epoch": 0.3185742386415504, + "grad_norm": 0.42524009943008423, + "learning_rate": 0.0001363131831710836, + "loss": 1.3427, + "step": 24516 + }, + { + "epoch": 0.3185872331854663, + "grad_norm": 0.3634762465953827, + "learning_rate": 0.0001363105837091722, + "loss": 1.4374, + "step": 24517 + }, + { + "epoch": 0.3186002277293821, + "grad_norm": 0.34638872742652893, + "learning_rate": 0.00013630798424726082, + "loss": 1.2739, + "step": 24518 + }, + { + "epoch": 0.318613222273298, + "grad_norm": 0.5592221617698669, + "learning_rate": 0.00013630538478534945, + "loss": 1.3902, + "step": 24519 + }, + { + "epoch": 0.31862621681721387, + "grad_norm": 0.4069071114063263, + "learning_rate": 0.00013630278532343804, + "loss": 1.432, + "step": 24520 + }, + { + "epoch": 0.31863921136112977, + "grad_norm": 0.44740116596221924, + "learning_rate": 0.00013630018586152667, + "loss": 1.4657, + "step": 24521 + }, + { + "epoch": 0.3186522059050456, + "grad_norm": 0.5179588794708252, + "learning_rate": 0.0001362975863996153, + "loss": 1.3662, + "step": 24522 + }, + { + "epoch": 0.3186652004489615, + "grad_norm": 0.46699976921081543, + "learning_rate": 0.00013629498693770392, + "loss": 1.2919, + "step": 24523 + }, + { + "epoch": 0.31867819499287736, + "grad_norm": 0.3441481590270996, + "learning_rate": 0.00013629238747579252, + "loss": 1.4648, + "step": 24524 + }, + { + "epoch": 0.31869118953679326, + "grad_norm": 0.38569527864456177, + "learning_rate": 0.0001362897880138811, + "loss": 1.4444, + "step": 24525 + }, + { + "epoch": 0.3187041840807091, + "grad_norm": 0.3232535123825073, + "learning_rate": 0.00013628718855196976, + "loss": 1.2475, + "step": 24526 + }, + { + "epoch": 0.318717178624625, + "grad_norm": 0.36247608065605164, + "learning_rate": 0.00013628458909005836, + "loss": 1.2857, + "step": 24527 + }, + { + "epoch": 0.31873017316854085, + "grad_norm": 0.448169082403183, + "learning_rate": 0.00013628198962814699, + "loss": 1.4624, + "step": 24528 + }, + { + "epoch": 0.31874316771245675, + "grad_norm": 0.37179121375083923, + "learning_rate": 0.00013627939016623558, + "loss": 1.3567, + "step": 24529 + }, + { + "epoch": 0.3187561622563726, + "grad_norm": 0.4183000922203064, + "learning_rate": 0.0001362767907043242, + "loss": 1.1763, + "step": 24530 + }, + { + "epoch": 0.3187691568002885, + "grad_norm": 0.4072781205177307, + "learning_rate": 0.00013627419124241283, + "loss": 1.1975, + "step": 24531 + }, + { + "epoch": 0.31878215134420435, + "grad_norm": 0.48555609583854675, + "learning_rate": 0.00013627159178050143, + "loss": 1.5033, + "step": 24532 + }, + { + "epoch": 0.31879514588812025, + "grad_norm": 0.44378501176834106, + "learning_rate": 0.00013626899231859005, + "loss": 1.5301, + "step": 24533 + }, + { + "epoch": 0.3188081404320361, + "grad_norm": 0.4073767066001892, + "learning_rate": 0.00013626639285667868, + "loss": 1.2428, + "step": 24534 + }, + { + "epoch": 0.318821134975952, + "grad_norm": 0.3824664056301117, + "learning_rate": 0.0001362637933947673, + "loss": 1.3102, + "step": 24535 + }, + { + "epoch": 0.31883412951986784, + "grad_norm": 0.4208299517631531, + "learning_rate": 0.0001362611939328559, + "loss": 1.3669, + "step": 24536 + }, + { + "epoch": 0.31884712406378374, + "grad_norm": 0.37531328201293945, + "learning_rate": 0.0001362585944709445, + "loss": 1.5238, + "step": 24537 + }, + { + "epoch": 0.3188601186076996, + "grad_norm": 0.391003280878067, + "learning_rate": 0.00013625599500903315, + "loss": 1.3356, + "step": 24538 + }, + { + "epoch": 0.3188731131516155, + "grad_norm": 0.3075683116912842, + "learning_rate": 0.00013625339554712175, + "loss": 1.3381, + "step": 24539 + }, + { + "epoch": 0.31888610769553133, + "grad_norm": 0.49173039197921753, + "learning_rate": 0.00013625079608521037, + "loss": 1.3635, + "step": 24540 + }, + { + "epoch": 0.31889910223944723, + "grad_norm": 0.44790974259376526, + "learning_rate": 0.00013624819662329897, + "loss": 1.2755, + "step": 24541 + }, + { + "epoch": 0.3189120967833631, + "grad_norm": 0.4942026734352112, + "learning_rate": 0.0001362455971613876, + "loss": 1.5959, + "step": 24542 + }, + { + "epoch": 0.318925091327279, + "grad_norm": 0.4439903795719147, + "learning_rate": 0.00013624299769947622, + "loss": 1.2486, + "step": 24543 + }, + { + "epoch": 0.3189380858711948, + "grad_norm": 0.41642075777053833, + "learning_rate": 0.00013624039823756482, + "loss": 1.3676, + "step": 24544 + }, + { + "epoch": 0.3189510804151107, + "grad_norm": 0.41230639815330505, + "learning_rate": 0.00013623779877565344, + "loss": 1.4296, + "step": 24545 + }, + { + "epoch": 0.31896407495902657, + "grad_norm": 0.31965962052345276, + "learning_rate": 0.00013623519931374206, + "loss": 1.44, + "step": 24546 + }, + { + "epoch": 0.31897706950294247, + "grad_norm": 0.3615824580192566, + "learning_rate": 0.0001362325998518307, + "loss": 1.3134, + "step": 24547 + }, + { + "epoch": 0.3189900640468583, + "grad_norm": 0.37412142753601074, + "learning_rate": 0.00013623000038991929, + "loss": 1.373, + "step": 24548 + }, + { + "epoch": 0.3190030585907742, + "grad_norm": 0.40013277530670166, + "learning_rate": 0.0001362274009280079, + "loss": 1.4433, + "step": 24549 + }, + { + "epoch": 0.31901605313469006, + "grad_norm": 0.4531174302101135, + "learning_rate": 0.00013622480146609654, + "loss": 1.5444, + "step": 24550 + }, + { + "epoch": 0.31902904767860596, + "grad_norm": 0.4477156102657318, + "learning_rate": 0.00013622220200418513, + "loss": 1.2742, + "step": 24551 + }, + { + "epoch": 0.3190420422225218, + "grad_norm": 0.4236210882663727, + "learning_rate": 0.00013621960254227376, + "loss": 1.4847, + "step": 24552 + }, + { + "epoch": 0.3190550367664377, + "grad_norm": 0.36249276995658875, + "learning_rate": 0.00013621700308036238, + "loss": 1.1774, + "step": 24553 + }, + { + "epoch": 0.31906803131035355, + "grad_norm": 0.3766421377658844, + "learning_rate": 0.00013621440361845098, + "loss": 1.4264, + "step": 24554 + }, + { + "epoch": 0.31908102585426945, + "grad_norm": 0.4403527081012726, + "learning_rate": 0.0001362118041565396, + "loss": 1.4028, + "step": 24555 + }, + { + "epoch": 0.3190940203981853, + "grad_norm": 0.4380132555961609, + "learning_rate": 0.0001362092046946282, + "loss": 1.5295, + "step": 24556 + }, + { + "epoch": 0.3191070149421012, + "grad_norm": 0.25155001878738403, + "learning_rate": 0.00013620660523271685, + "loss": 1.2453, + "step": 24557 + }, + { + "epoch": 0.31912000948601704, + "grad_norm": 0.46866559982299805, + "learning_rate": 0.00013620400577080545, + "loss": 1.5336, + "step": 24558 + }, + { + "epoch": 0.31913300402993294, + "grad_norm": 0.325844943523407, + "learning_rate": 0.00013620140630889407, + "loss": 1.3731, + "step": 24559 + }, + { + "epoch": 0.3191459985738488, + "grad_norm": 0.26560500264167786, + "learning_rate": 0.00013619880684698267, + "loss": 1.4453, + "step": 24560 + }, + { + "epoch": 0.3191589931177647, + "grad_norm": 0.3765057325363159, + "learning_rate": 0.0001361962073850713, + "loss": 1.3048, + "step": 24561 + }, + { + "epoch": 0.31917198766168053, + "grad_norm": 0.31004953384399414, + "learning_rate": 0.00013619360792315992, + "loss": 1.3569, + "step": 24562 + }, + { + "epoch": 0.31918498220559643, + "grad_norm": 0.40252864360809326, + "learning_rate": 0.00013619100846124852, + "loss": 1.4125, + "step": 24563 + }, + { + "epoch": 0.3191979767495123, + "grad_norm": 0.25679269433021545, + "learning_rate": 0.00013618840899933714, + "loss": 1.2879, + "step": 24564 + }, + { + "epoch": 0.3192109712934282, + "grad_norm": 0.378481924533844, + "learning_rate": 0.00013618580953742577, + "loss": 1.4235, + "step": 24565 + }, + { + "epoch": 0.319223965837344, + "grad_norm": 0.4207339882850647, + "learning_rate": 0.0001361832100755144, + "loss": 1.501, + "step": 24566 + }, + { + "epoch": 0.3192369603812599, + "grad_norm": 0.4911903440952301, + "learning_rate": 0.000136180610613603, + "loss": 1.6087, + "step": 24567 + }, + { + "epoch": 0.31924995492517577, + "grad_norm": 0.32365554571151733, + "learning_rate": 0.00013617801115169159, + "loss": 1.3274, + "step": 24568 + }, + { + "epoch": 0.3192629494690917, + "grad_norm": 0.42182356119155884, + "learning_rate": 0.00013617541168978024, + "loss": 1.552, + "step": 24569 + }, + { + "epoch": 0.3192759440130075, + "grad_norm": 0.34021878242492676, + "learning_rate": 0.00013617281222786884, + "loss": 1.2254, + "step": 24570 + }, + { + "epoch": 0.3192889385569234, + "grad_norm": 0.3618484139442444, + "learning_rate": 0.00013617021276595746, + "loss": 1.2343, + "step": 24571 + }, + { + "epoch": 0.31930193310083926, + "grad_norm": 0.40864869952201843, + "learning_rate": 0.00013616761330404606, + "loss": 1.6207, + "step": 24572 + }, + { + "epoch": 0.31931492764475516, + "grad_norm": 0.3376300632953644, + "learning_rate": 0.00013616501384213468, + "loss": 1.2565, + "step": 24573 + }, + { + "epoch": 0.319327922188671, + "grad_norm": 0.3087749481201172, + "learning_rate": 0.0001361624143802233, + "loss": 1.5313, + "step": 24574 + }, + { + "epoch": 0.3193409167325869, + "grad_norm": 0.4960716664791107, + "learning_rate": 0.0001361598149183119, + "loss": 1.4389, + "step": 24575 + }, + { + "epoch": 0.31935391127650276, + "grad_norm": 0.4120165705680847, + "learning_rate": 0.00013615721545640053, + "loss": 1.58, + "step": 24576 + }, + { + "epoch": 0.31936690582041866, + "grad_norm": 0.45616215467453003, + "learning_rate": 0.00013615461599448915, + "loss": 1.5636, + "step": 24577 + }, + { + "epoch": 0.3193799003643345, + "grad_norm": 0.3470858931541443, + "learning_rate": 0.00013615201653257778, + "loss": 1.4587, + "step": 24578 + }, + { + "epoch": 0.3193928949082504, + "grad_norm": 0.41595685482025146, + "learning_rate": 0.00013614941707066637, + "loss": 1.3261, + "step": 24579 + }, + { + "epoch": 0.31940588945216625, + "grad_norm": 0.40102699398994446, + "learning_rate": 0.00013614681760875497, + "loss": 1.2579, + "step": 24580 + }, + { + "epoch": 0.31941888399608215, + "grad_norm": 0.3551299571990967, + "learning_rate": 0.00013614421814684362, + "loss": 1.0856, + "step": 24581 + }, + { + "epoch": 0.319431878539998, + "grad_norm": 0.41390490531921387, + "learning_rate": 0.00013614161868493222, + "loss": 1.327, + "step": 24582 + }, + { + "epoch": 0.3194448730839139, + "grad_norm": 0.37795621156692505, + "learning_rate": 0.00013613901922302085, + "loss": 1.4272, + "step": 24583 + }, + { + "epoch": 0.31945786762782974, + "grad_norm": 0.4226207435131073, + "learning_rate": 0.00013613641976110944, + "loss": 1.4477, + "step": 24584 + }, + { + "epoch": 0.31947086217174564, + "grad_norm": 0.3721774220466614, + "learning_rate": 0.00013613382029919807, + "loss": 1.685, + "step": 24585 + }, + { + "epoch": 0.3194838567156615, + "grad_norm": 0.43431785702705383, + "learning_rate": 0.0001361312208372867, + "loss": 1.3605, + "step": 24586 + }, + { + "epoch": 0.3194968512595774, + "grad_norm": 0.5108338594436646, + "learning_rate": 0.0001361286213753753, + "loss": 1.3863, + "step": 24587 + }, + { + "epoch": 0.31950984580349323, + "grad_norm": 0.39368823170661926, + "learning_rate": 0.00013612602191346394, + "loss": 1.4914, + "step": 24588 + }, + { + "epoch": 0.31952284034740913, + "grad_norm": 0.33489078283309937, + "learning_rate": 0.00013612342245155254, + "loss": 1.4493, + "step": 24589 + }, + { + "epoch": 0.31953583489132503, + "grad_norm": 0.4886419177055359, + "learning_rate": 0.00013612082298964116, + "loss": 1.6374, + "step": 24590 + }, + { + "epoch": 0.3195488294352409, + "grad_norm": 0.36556005477905273, + "learning_rate": 0.00013611822352772976, + "loss": 1.2086, + "step": 24591 + }, + { + "epoch": 0.3195618239791568, + "grad_norm": 0.3822033703327179, + "learning_rate": 0.00013611562406581838, + "loss": 1.4478, + "step": 24592 + }, + { + "epoch": 0.3195748185230726, + "grad_norm": 0.3966180980205536, + "learning_rate": 0.000136113024603907, + "loss": 1.2841, + "step": 24593 + }, + { + "epoch": 0.3195878130669885, + "grad_norm": 0.4178012013435364, + "learning_rate": 0.0001361104251419956, + "loss": 1.3748, + "step": 24594 + }, + { + "epoch": 0.31960080761090437, + "grad_norm": 0.4416796863079071, + "learning_rate": 0.00013610782568008423, + "loss": 1.5387, + "step": 24595 + }, + { + "epoch": 0.31961380215482027, + "grad_norm": 0.447081595659256, + "learning_rate": 0.00013610522621817285, + "loss": 1.3171, + "step": 24596 + }, + { + "epoch": 0.3196267966987361, + "grad_norm": 0.33643415570259094, + "learning_rate": 0.00013610262675626145, + "loss": 1.0847, + "step": 24597 + }, + { + "epoch": 0.319639791242652, + "grad_norm": 0.38043490052223206, + "learning_rate": 0.00013610002729435008, + "loss": 1.2841, + "step": 24598 + }, + { + "epoch": 0.31965278578656786, + "grad_norm": 0.34751492738723755, + "learning_rate": 0.00013609742783243867, + "loss": 1.3364, + "step": 24599 + }, + { + "epoch": 0.31966578033048376, + "grad_norm": 0.640848696231842, + "learning_rate": 0.00013609482837052733, + "loss": 1.611, + "step": 24600 + }, + { + "epoch": 0.3196787748743996, + "grad_norm": 0.4591352939605713, + "learning_rate": 0.00013609222890861592, + "loss": 1.5975, + "step": 24601 + }, + { + "epoch": 0.3196917694183155, + "grad_norm": 0.41186749935150146, + "learning_rate": 0.00013608962944670455, + "loss": 1.4004, + "step": 24602 + }, + { + "epoch": 0.31970476396223135, + "grad_norm": 0.39533838629722595, + "learning_rate": 0.00013608702998479315, + "loss": 1.5247, + "step": 24603 + }, + { + "epoch": 0.31971775850614725, + "grad_norm": 0.4489741027355194, + "learning_rate": 0.00013608443052288177, + "loss": 1.4234, + "step": 24604 + }, + { + "epoch": 0.3197307530500631, + "grad_norm": 0.48175880312919617, + "learning_rate": 0.0001360818310609704, + "loss": 1.5163, + "step": 24605 + }, + { + "epoch": 0.319743747593979, + "grad_norm": 0.3358345031738281, + "learning_rate": 0.000136079231599059, + "loss": 1.5759, + "step": 24606 + }, + { + "epoch": 0.31975674213789484, + "grad_norm": 0.46662598848342896, + "learning_rate": 0.00013607663213714762, + "loss": 1.3458, + "step": 24607 + }, + { + "epoch": 0.31976973668181075, + "grad_norm": 0.3371489346027374, + "learning_rate": 0.00013607403267523624, + "loss": 1.502, + "step": 24608 + }, + { + "epoch": 0.3197827312257266, + "grad_norm": 0.3279533088207245, + "learning_rate": 0.00013607143321332484, + "loss": 1.3046, + "step": 24609 + }, + { + "epoch": 0.3197957257696425, + "grad_norm": 0.351125568151474, + "learning_rate": 0.00013606883375141346, + "loss": 1.4309, + "step": 24610 + }, + { + "epoch": 0.31980872031355834, + "grad_norm": 0.3855311870574951, + "learning_rate": 0.00013606623428950206, + "loss": 1.3136, + "step": 24611 + }, + { + "epoch": 0.31982171485747424, + "grad_norm": 0.42153966426849365, + "learning_rate": 0.0001360636348275907, + "loss": 1.2424, + "step": 24612 + }, + { + "epoch": 0.3198347094013901, + "grad_norm": 0.45653510093688965, + "learning_rate": 0.0001360610353656793, + "loss": 1.3847, + "step": 24613 + }, + { + "epoch": 0.319847703945306, + "grad_norm": 0.27487313747406006, + "learning_rate": 0.00013605843590376793, + "loss": 1.1458, + "step": 24614 + }, + { + "epoch": 0.31986069848922183, + "grad_norm": 0.4824073314666748, + "learning_rate": 0.00013605583644185653, + "loss": 1.3576, + "step": 24615 + }, + { + "epoch": 0.31987369303313773, + "grad_norm": 0.31202632188796997, + "learning_rate": 0.00013605323697994515, + "loss": 1.276, + "step": 24616 + }, + { + "epoch": 0.3198866875770536, + "grad_norm": 0.5924356579780579, + "learning_rate": 0.00013605063751803378, + "loss": 1.5418, + "step": 24617 + }, + { + "epoch": 0.3198996821209695, + "grad_norm": 0.3384953737258911, + "learning_rate": 0.00013604803805612238, + "loss": 1.2339, + "step": 24618 + }, + { + "epoch": 0.3199126766648853, + "grad_norm": 0.5104926824569702, + "learning_rate": 0.000136045438594211, + "loss": 1.2862, + "step": 24619 + }, + { + "epoch": 0.3199256712088012, + "grad_norm": 0.39461395144462585, + "learning_rate": 0.00013604283913229963, + "loss": 1.4092, + "step": 24620 + }, + { + "epoch": 0.31993866575271707, + "grad_norm": 0.38680028915405273, + "learning_rate": 0.00013604023967038822, + "loss": 1.356, + "step": 24621 + }, + { + "epoch": 0.31995166029663297, + "grad_norm": 0.4401646554470062, + "learning_rate": 0.00013603764020847685, + "loss": 1.3714, + "step": 24622 + }, + { + "epoch": 0.3199646548405488, + "grad_norm": 0.3675658404827118, + "learning_rate": 0.00013603504074656544, + "loss": 1.3751, + "step": 24623 + }, + { + "epoch": 0.3199776493844647, + "grad_norm": 0.41579878330230713, + "learning_rate": 0.0001360324412846541, + "loss": 1.4316, + "step": 24624 + }, + { + "epoch": 0.31999064392838056, + "grad_norm": 0.3984793424606323, + "learning_rate": 0.0001360298418227427, + "loss": 1.2428, + "step": 24625 + }, + { + "epoch": 0.32000363847229646, + "grad_norm": 0.3353992998600006, + "learning_rate": 0.00013602724236083132, + "loss": 1.2812, + "step": 24626 + }, + { + "epoch": 0.3200166330162123, + "grad_norm": 0.35817191004753113, + "learning_rate": 0.00013602464289891994, + "loss": 1.3114, + "step": 24627 + }, + { + "epoch": 0.3200296275601282, + "grad_norm": 0.35655084252357483, + "learning_rate": 0.00013602204343700854, + "loss": 1.2821, + "step": 24628 + }, + { + "epoch": 0.32004262210404405, + "grad_norm": 0.336092472076416, + "learning_rate": 0.00013601944397509716, + "loss": 1.3803, + "step": 24629 + }, + { + "epoch": 0.32005561664795995, + "grad_norm": 0.34984907507896423, + "learning_rate": 0.00013601684451318576, + "loss": 1.5084, + "step": 24630 + }, + { + "epoch": 0.3200686111918758, + "grad_norm": 0.4446631669998169, + "learning_rate": 0.00013601424505127441, + "loss": 1.4192, + "step": 24631 + }, + { + "epoch": 0.3200816057357917, + "grad_norm": 0.44216668605804443, + "learning_rate": 0.000136011645589363, + "loss": 1.3771, + "step": 24632 + }, + { + "epoch": 0.32009460027970754, + "grad_norm": 0.3548837900161743, + "learning_rate": 0.00013600904612745164, + "loss": 1.3578, + "step": 24633 + }, + { + "epoch": 0.32010759482362344, + "grad_norm": 0.3141326904296875, + "learning_rate": 0.00013600644666554023, + "loss": 1.3227, + "step": 24634 + }, + { + "epoch": 0.3201205893675393, + "grad_norm": 0.4169851541519165, + "learning_rate": 0.00013600384720362886, + "loss": 1.3578, + "step": 24635 + }, + { + "epoch": 0.3201335839114552, + "grad_norm": 0.42173251509666443, + "learning_rate": 0.00013600124774171748, + "loss": 1.3471, + "step": 24636 + }, + { + "epoch": 0.32014657845537103, + "grad_norm": 0.41726696491241455, + "learning_rate": 0.00013599864827980608, + "loss": 1.4472, + "step": 24637 + }, + { + "epoch": 0.32015957299928693, + "grad_norm": 0.3060067594051361, + "learning_rate": 0.0001359960488178947, + "loss": 1.366, + "step": 24638 + }, + { + "epoch": 0.3201725675432028, + "grad_norm": 0.33108460903167725, + "learning_rate": 0.00013599344935598333, + "loss": 1.4009, + "step": 24639 + }, + { + "epoch": 0.3201855620871187, + "grad_norm": 0.5050896406173706, + "learning_rate": 0.00013599084989407193, + "loss": 1.49, + "step": 24640 + }, + { + "epoch": 0.3201985566310345, + "grad_norm": 0.5148933529853821, + "learning_rate": 0.00013598825043216055, + "loss": 1.4794, + "step": 24641 + }, + { + "epoch": 0.3202115511749504, + "grad_norm": 0.39288076758384705, + "learning_rate": 0.00013598565097024915, + "loss": 1.3284, + "step": 24642 + }, + { + "epoch": 0.32022454571886627, + "grad_norm": 0.42341890931129456, + "learning_rate": 0.0001359830515083378, + "loss": 1.519, + "step": 24643 + }, + { + "epoch": 0.32023754026278217, + "grad_norm": 0.3518048822879791, + "learning_rate": 0.0001359804520464264, + "loss": 1.4584, + "step": 24644 + }, + { + "epoch": 0.320250534806698, + "grad_norm": 0.3701116740703583, + "learning_rate": 0.00013597785258451502, + "loss": 1.26, + "step": 24645 + }, + { + "epoch": 0.3202635293506139, + "grad_norm": 0.43790045380592346, + "learning_rate": 0.00013597525312260362, + "loss": 1.2672, + "step": 24646 + }, + { + "epoch": 0.32027652389452976, + "grad_norm": 0.47366979718208313, + "learning_rate": 0.00013597265366069224, + "loss": 1.3905, + "step": 24647 + }, + { + "epoch": 0.32028951843844566, + "grad_norm": 0.4065723419189453, + "learning_rate": 0.00013597005419878087, + "loss": 1.5294, + "step": 24648 + }, + { + "epoch": 0.3203025129823615, + "grad_norm": 0.4497450590133667, + "learning_rate": 0.00013596745473686946, + "loss": 1.4367, + "step": 24649 + }, + { + "epoch": 0.3203155075262774, + "grad_norm": 0.4001935124397278, + "learning_rate": 0.0001359648552749581, + "loss": 1.326, + "step": 24650 + }, + { + "epoch": 0.32032850207019326, + "grad_norm": 0.409372478723526, + "learning_rate": 0.00013596225581304671, + "loss": 1.2982, + "step": 24651 + }, + { + "epoch": 0.32034149661410916, + "grad_norm": 0.26022660732269287, + "learning_rate": 0.0001359596563511353, + "loss": 1.4038, + "step": 24652 + }, + { + "epoch": 0.320354491158025, + "grad_norm": 0.4460715055465698, + "learning_rate": 0.00013595705688922394, + "loss": 1.3915, + "step": 24653 + }, + { + "epoch": 0.3203674857019409, + "grad_norm": 0.3659096956253052, + "learning_rate": 0.00013595445742731253, + "loss": 1.4536, + "step": 24654 + }, + { + "epoch": 0.32038048024585675, + "grad_norm": 0.4425286650657654, + "learning_rate": 0.00013595185796540118, + "loss": 1.4586, + "step": 24655 + }, + { + "epoch": 0.32039347478977265, + "grad_norm": 0.34227854013442993, + "learning_rate": 0.00013594925850348978, + "loss": 1.3676, + "step": 24656 + }, + { + "epoch": 0.3204064693336885, + "grad_norm": 0.3165017068386078, + "learning_rate": 0.0001359466590415784, + "loss": 1.3435, + "step": 24657 + }, + { + "epoch": 0.3204194638776044, + "grad_norm": 0.338697612285614, + "learning_rate": 0.000135944059579667, + "loss": 1.3449, + "step": 24658 + }, + { + "epoch": 0.32043245842152024, + "grad_norm": 0.4276241660118103, + "learning_rate": 0.00013594146011775563, + "loss": 1.4797, + "step": 24659 + }, + { + "epoch": 0.32044545296543614, + "grad_norm": 0.5089761018753052, + "learning_rate": 0.00013593886065584425, + "loss": 1.3462, + "step": 24660 + }, + { + "epoch": 0.320458447509352, + "grad_norm": 0.37063679099082947, + "learning_rate": 0.00013593626119393285, + "loss": 1.3394, + "step": 24661 + }, + { + "epoch": 0.3204714420532679, + "grad_norm": 0.47729796171188354, + "learning_rate": 0.0001359336617320215, + "loss": 1.5818, + "step": 24662 + }, + { + "epoch": 0.32048443659718373, + "grad_norm": 0.3525777757167816, + "learning_rate": 0.0001359310622701101, + "loss": 1.2506, + "step": 24663 + }, + { + "epoch": 0.32049743114109963, + "grad_norm": 0.3640398383140564, + "learning_rate": 0.0001359284628081987, + "loss": 1.3161, + "step": 24664 + }, + { + "epoch": 0.3205104256850155, + "grad_norm": 0.368281751871109, + "learning_rate": 0.00013592586334628732, + "loss": 1.4794, + "step": 24665 + }, + { + "epoch": 0.3205234202289314, + "grad_norm": 0.4180395007133484, + "learning_rate": 0.00013592326388437595, + "loss": 1.3675, + "step": 24666 + }, + { + "epoch": 0.3205364147728473, + "grad_norm": 0.4629068672657013, + "learning_rate": 0.00013592066442246457, + "loss": 1.5641, + "step": 24667 + }, + { + "epoch": 0.3205494093167631, + "grad_norm": 0.4522596299648285, + "learning_rate": 0.00013591806496055317, + "loss": 1.4391, + "step": 24668 + }, + { + "epoch": 0.320562403860679, + "grad_norm": 0.4398055374622345, + "learning_rate": 0.0001359154654986418, + "loss": 1.517, + "step": 24669 + }, + { + "epoch": 0.32057539840459487, + "grad_norm": 0.38920411467552185, + "learning_rate": 0.00013591286603673042, + "loss": 1.4407, + "step": 24670 + }, + { + "epoch": 0.32058839294851077, + "grad_norm": 0.4751942753791809, + "learning_rate": 0.00013591026657481901, + "loss": 1.5235, + "step": 24671 + }, + { + "epoch": 0.3206013874924266, + "grad_norm": 0.3321654200553894, + "learning_rate": 0.00013590766711290764, + "loss": 1.2633, + "step": 24672 + }, + { + "epoch": 0.3206143820363425, + "grad_norm": 0.44496938586235046, + "learning_rate": 0.00013590506765099624, + "loss": 1.5561, + "step": 24673 + }, + { + "epoch": 0.32062737658025836, + "grad_norm": 0.42505893111228943, + "learning_rate": 0.0001359024681890849, + "loss": 1.3232, + "step": 24674 + }, + { + "epoch": 0.32064037112417426, + "grad_norm": 0.46920567750930786, + "learning_rate": 0.00013589986872717348, + "loss": 1.4445, + "step": 24675 + }, + { + "epoch": 0.3206533656680901, + "grad_norm": 0.39228391647338867, + "learning_rate": 0.00013589726926526208, + "loss": 1.3508, + "step": 24676 + }, + { + "epoch": 0.320666360212006, + "grad_norm": 0.42103394865989685, + "learning_rate": 0.0001358946698033507, + "loss": 1.4086, + "step": 24677 + }, + { + "epoch": 0.32067935475592185, + "grad_norm": 0.3482295870780945, + "learning_rate": 0.00013589207034143933, + "loss": 1.2377, + "step": 24678 + }, + { + "epoch": 0.32069234929983775, + "grad_norm": 0.33322474360466003, + "learning_rate": 0.00013588947087952796, + "loss": 1.3471, + "step": 24679 + }, + { + "epoch": 0.3207053438437536, + "grad_norm": 0.3872907757759094, + "learning_rate": 0.00013588687141761655, + "loss": 1.3884, + "step": 24680 + }, + { + "epoch": 0.3207183383876695, + "grad_norm": 0.35781705379486084, + "learning_rate": 0.00013588427195570518, + "loss": 1.5207, + "step": 24681 + }, + { + "epoch": 0.32073133293158534, + "grad_norm": 0.34672942757606506, + "learning_rate": 0.0001358816724937938, + "loss": 1.3324, + "step": 24682 + }, + { + "epoch": 0.32074432747550125, + "grad_norm": 0.4029582738876343, + "learning_rate": 0.0001358790730318824, + "loss": 1.3019, + "step": 24683 + }, + { + "epoch": 0.3207573220194171, + "grad_norm": 0.39951464533805847, + "learning_rate": 0.00013587647356997102, + "loss": 1.6366, + "step": 24684 + }, + { + "epoch": 0.320770316563333, + "grad_norm": 0.37376925349235535, + "learning_rate": 0.00013587387410805962, + "loss": 1.2487, + "step": 24685 + }, + { + "epoch": 0.32078331110724884, + "grad_norm": 0.3113340139389038, + "learning_rate": 0.00013587127464614827, + "loss": 1.2078, + "step": 24686 + }, + { + "epoch": 0.32079630565116474, + "grad_norm": 0.3956928253173828, + "learning_rate": 0.00013586867518423687, + "loss": 1.4368, + "step": 24687 + }, + { + "epoch": 0.3208093001950806, + "grad_norm": 0.4499419033527374, + "learning_rate": 0.0001358660757223255, + "loss": 1.3592, + "step": 24688 + }, + { + "epoch": 0.3208222947389965, + "grad_norm": 0.3688843250274658, + "learning_rate": 0.0001358634762604141, + "loss": 1.1919, + "step": 24689 + }, + { + "epoch": 0.32083528928291233, + "grad_norm": 0.35973894596099854, + "learning_rate": 0.00013586087679850272, + "loss": 1.2937, + "step": 24690 + }, + { + "epoch": 0.32084828382682823, + "grad_norm": 0.43575525283813477, + "learning_rate": 0.00013585827733659134, + "loss": 1.4523, + "step": 24691 + }, + { + "epoch": 0.3208612783707441, + "grad_norm": 0.4094131290912628, + "learning_rate": 0.00013585567787467994, + "loss": 1.4535, + "step": 24692 + }, + { + "epoch": 0.32087427291466, + "grad_norm": 0.3185214698314667, + "learning_rate": 0.00013585307841276856, + "loss": 1.3424, + "step": 24693 + }, + { + "epoch": 0.3208872674585758, + "grad_norm": 0.3318707346916199, + "learning_rate": 0.0001358504789508572, + "loss": 1.2323, + "step": 24694 + }, + { + "epoch": 0.3209002620024917, + "grad_norm": 0.34532883763313293, + "learning_rate": 0.00013584787948894578, + "loss": 1.4138, + "step": 24695 + }, + { + "epoch": 0.32091325654640757, + "grad_norm": 0.3023165762424469, + "learning_rate": 0.0001358452800270344, + "loss": 1.1863, + "step": 24696 + }, + { + "epoch": 0.32092625109032347, + "grad_norm": 0.3690437376499176, + "learning_rate": 0.000135842680565123, + "loss": 1.3486, + "step": 24697 + }, + { + "epoch": 0.3209392456342393, + "grad_norm": 0.5054942965507507, + "learning_rate": 0.00013584008110321166, + "loss": 1.5228, + "step": 24698 + }, + { + "epoch": 0.3209522401781552, + "grad_norm": 0.49830055236816406, + "learning_rate": 0.00013583748164130026, + "loss": 1.4806, + "step": 24699 + }, + { + "epoch": 0.32096523472207106, + "grad_norm": 0.43226420879364014, + "learning_rate": 0.00013583488217938888, + "loss": 1.3502, + "step": 24700 + }, + { + "epoch": 0.32097822926598696, + "grad_norm": 0.35177141427993774, + "learning_rate": 0.0001358322827174775, + "loss": 1.4824, + "step": 24701 + }, + { + "epoch": 0.3209912238099028, + "grad_norm": 0.4701346457004547, + "learning_rate": 0.0001358296832555661, + "loss": 1.3295, + "step": 24702 + }, + { + "epoch": 0.3210042183538187, + "grad_norm": 0.5246003866195679, + "learning_rate": 0.00013582708379365473, + "loss": 1.4603, + "step": 24703 + }, + { + "epoch": 0.32101721289773455, + "grad_norm": 0.34467628598213196, + "learning_rate": 0.00013582448433174332, + "loss": 1.2419, + "step": 24704 + }, + { + "epoch": 0.32103020744165045, + "grad_norm": 0.4125974178314209, + "learning_rate": 0.00013582188486983195, + "loss": 1.5919, + "step": 24705 + }, + { + "epoch": 0.3210432019855663, + "grad_norm": 0.407257616519928, + "learning_rate": 0.00013581928540792057, + "loss": 1.4451, + "step": 24706 + }, + { + "epoch": 0.3210561965294822, + "grad_norm": 0.4813106060028076, + "learning_rate": 0.00013581668594600917, + "loss": 1.5407, + "step": 24707 + }, + { + "epoch": 0.32106919107339804, + "grad_norm": 0.4208795428276062, + "learning_rate": 0.0001358140864840978, + "loss": 1.3404, + "step": 24708 + }, + { + "epoch": 0.32108218561731394, + "grad_norm": 0.38644152879714966, + "learning_rate": 0.00013581148702218642, + "loss": 1.3186, + "step": 24709 + }, + { + "epoch": 0.3210951801612298, + "grad_norm": 0.4278043508529663, + "learning_rate": 0.00013580888756027504, + "loss": 1.3186, + "step": 24710 + }, + { + "epoch": 0.3211081747051457, + "grad_norm": 0.43379321694374084, + "learning_rate": 0.00013580628809836364, + "loss": 1.2023, + "step": 24711 + }, + { + "epoch": 0.32112116924906153, + "grad_norm": 0.2837704122066498, + "learning_rate": 0.00013580368863645227, + "loss": 1.2742, + "step": 24712 + }, + { + "epoch": 0.32113416379297743, + "grad_norm": 0.3324788510799408, + "learning_rate": 0.0001358010891745409, + "loss": 1.2826, + "step": 24713 + }, + { + "epoch": 0.3211471583368933, + "grad_norm": 0.40308767557144165, + "learning_rate": 0.0001357984897126295, + "loss": 1.4953, + "step": 24714 + }, + { + "epoch": 0.3211601528808092, + "grad_norm": 0.3645550012588501, + "learning_rate": 0.0001357958902507181, + "loss": 1.3198, + "step": 24715 + }, + { + "epoch": 0.321173147424725, + "grad_norm": 0.37720733880996704, + "learning_rate": 0.0001357932907888067, + "loss": 1.5122, + "step": 24716 + }, + { + "epoch": 0.3211861419686409, + "grad_norm": 0.476252019405365, + "learning_rate": 0.00013579069132689536, + "loss": 1.4569, + "step": 24717 + }, + { + "epoch": 0.32119913651255677, + "grad_norm": 0.31597527861595154, + "learning_rate": 0.00013578809186498396, + "loss": 1.4193, + "step": 24718 + }, + { + "epoch": 0.32121213105647267, + "grad_norm": 0.4516609013080597, + "learning_rate": 0.00013578549240307256, + "loss": 1.561, + "step": 24719 + }, + { + "epoch": 0.3212251256003885, + "grad_norm": 0.33386701345443726, + "learning_rate": 0.00013578289294116118, + "loss": 1.27, + "step": 24720 + }, + { + "epoch": 0.3212381201443044, + "grad_norm": 0.31866100430488586, + "learning_rate": 0.0001357802934792498, + "loss": 1.2678, + "step": 24721 + }, + { + "epoch": 0.32125111468822026, + "grad_norm": 0.46269989013671875, + "learning_rate": 0.00013577769401733843, + "loss": 1.4505, + "step": 24722 + }, + { + "epoch": 0.32126410923213616, + "grad_norm": 0.4592706561088562, + "learning_rate": 0.00013577509455542703, + "loss": 1.559, + "step": 24723 + }, + { + "epoch": 0.321277103776052, + "grad_norm": 0.41620030999183655, + "learning_rate": 0.00013577249509351565, + "loss": 1.4544, + "step": 24724 + }, + { + "epoch": 0.3212900983199679, + "grad_norm": 0.38035884499549866, + "learning_rate": 0.00013576989563160427, + "loss": 1.2912, + "step": 24725 + }, + { + "epoch": 0.32130309286388375, + "grad_norm": 0.45668527483940125, + "learning_rate": 0.00013576729616969287, + "loss": 1.4422, + "step": 24726 + }, + { + "epoch": 0.32131608740779966, + "grad_norm": 0.3690143823623657, + "learning_rate": 0.0001357646967077815, + "loss": 1.3352, + "step": 24727 + }, + { + "epoch": 0.3213290819517155, + "grad_norm": 0.3793088495731354, + "learning_rate": 0.0001357620972458701, + "loss": 1.3989, + "step": 24728 + }, + { + "epoch": 0.3213420764956314, + "grad_norm": 0.416709303855896, + "learning_rate": 0.00013575949778395875, + "loss": 1.3578, + "step": 24729 + }, + { + "epoch": 0.32135507103954725, + "grad_norm": 0.37327855825424194, + "learning_rate": 0.00013575689832204734, + "loss": 1.426, + "step": 24730 + }, + { + "epoch": 0.32136806558346315, + "grad_norm": 0.3240900933742523, + "learning_rate": 0.00013575429886013594, + "loss": 1.4102, + "step": 24731 + }, + { + "epoch": 0.321381060127379, + "grad_norm": 0.36944425106048584, + "learning_rate": 0.00013575169939822457, + "loss": 1.2829, + "step": 24732 + }, + { + "epoch": 0.3213940546712949, + "grad_norm": 0.47413942217826843, + "learning_rate": 0.0001357490999363132, + "loss": 1.3273, + "step": 24733 + }, + { + "epoch": 0.32140704921521074, + "grad_norm": 0.3661559522151947, + "learning_rate": 0.00013574650047440181, + "loss": 1.2789, + "step": 24734 + }, + { + "epoch": 0.32142004375912664, + "grad_norm": 0.47811728715896606, + "learning_rate": 0.0001357439010124904, + "loss": 1.4873, + "step": 24735 + }, + { + "epoch": 0.3214330383030425, + "grad_norm": 0.42382270097732544, + "learning_rate": 0.00013574130155057904, + "loss": 1.4457, + "step": 24736 + }, + { + "epoch": 0.3214460328469584, + "grad_norm": 0.43184298276901245, + "learning_rate": 0.00013573870208866766, + "loss": 1.3813, + "step": 24737 + }, + { + "epoch": 0.32145902739087423, + "grad_norm": 0.36754274368286133, + "learning_rate": 0.00013573610262675626, + "loss": 1.1517, + "step": 24738 + }, + { + "epoch": 0.32147202193479013, + "grad_norm": 0.3488259017467499, + "learning_rate": 0.00013573350316484488, + "loss": 1.2776, + "step": 24739 + }, + { + "epoch": 0.321485016478706, + "grad_norm": 0.3927743434906006, + "learning_rate": 0.0001357309037029335, + "loss": 1.4579, + "step": 24740 + }, + { + "epoch": 0.3214980110226219, + "grad_norm": 0.3752271831035614, + "learning_rate": 0.00013572830424102213, + "loss": 1.3483, + "step": 24741 + }, + { + "epoch": 0.3215110055665378, + "grad_norm": 0.3438195586204529, + "learning_rate": 0.00013572570477911073, + "loss": 1.4341, + "step": 24742 + }, + { + "epoch": 0.3215240001104536, + "grad_norm": 0.3992052674293518, + "learning_rate": 0.00013572310531719933, + "loss": 1.4993, + "step": 24743 + }, + { + "epoch": 0.3215369946543695, + "grad_norm": 0.48372069001197815, + "learning_rate": 0.00013572050585528798, + "loss": 1.4975, + "step": 24744 + }, + { + "epoch": 0.32154998919828537, + "grad_norm": 0.43863922357559204, + "learning_rate": 0.00013571790639337657, + "loss": 1.4671, + "step": 24745 + }, + { + "epoch": 0.32156298374220127, + "grad_norm": 0.3579051196575165, + "learning_rate": 0.0001357153069314652, + "loss": 1.2875, + "step": 24746 + }, + { + "epoch": 0.3215759782861171, + "grad_norm": 0.45592498779296875, + "learning_rate": 0.0001357127074695538, + "loss": 1.4144, + "step": 24747 + }, + { + "epoch": 0.321588972830033, + "grad_norm": 0.43024441599845886, + "learning_rate": 0.00013571010800764242, + "loss": 1.5898, + "step": 24748 + }, + { + "epoch": 0.32160196737394886, + "grad_norm": 0.4178403913974762, + "learning_rate": 0.00013570750854573105, + "loss": 1.413, + "step": 24749 + }, + { + "epoch": 0.32161496191786476, + "grad_norm": 0.4219180643558502, + "learning_rate": 0.00013570490908381964, + "loss": 1.3591, + "step": 24750 + }, + { + "epoch": 0.3216279564617806, + "grad_norm": 0.42791593074798584, + "learning_rate": 0.00013570230962190827, + "loss": 1.3157, + "step": 24751 + }, + { + "epoch": 0.3216409510056965, + "grad_norm": 0.39800095558166504, + "learning_rate": 0.0001356997101599969, + "loss": 1.3912, + "step": 24752 + }, + { + "epoch": 0.32165394554961235, + "grad_norm": 0.4012994170188904, + "learning_rate": 0.00013569711069808552, + "loss": 1.4016, + "step": 24753 + }, + { + "epoch": 0.32166694009352825, + "grad_norm": 0.37264829874038696, + "learning_rate": 0.00013569451123617411, + "loss": 1.3934, + "step": 24754 + }, + { + "epoch": 0.3216799346374441, + "grad_norm": 0.3317003846168518, + "learning_rate": 0.00013569191177426274, + "loss": 1.4107, + "step": 24755 + }, + { + "epoch": 0.32169292918136, + "grad_norm": 0.4236525297164917, + "learning_rate": 0.00013568931231235136, + "loss": 1.5237, + "step": 24756 + }, + { + "epoch": 0.32170592372527584, + "grad_norm": 0.3545015752315521, + "learning_rate": 0.00013568671285043996, + "loss": 1.326, + "step": 24757 + }, + { + "epoch": 0.32171891826919174, + "grad_norm": 0.4817945063114166, + "learning_rate": 0.00013568411338852858, + "loss": 1.466, + "step": 24758 + }, + { + "epoch": 0.3217319128131076, + "grad_norm": 0.4174249768257141, + "learning_rate": 0.00013568151392661718, + "loss": 1.3172, + "step": 24759 + }, + { + "epoch": 0.3217449073570235, + "grad_norm": 0.3138663172721863, + "learning_rate": 0.0001356789144647058, + "loss": 1.5928, + "step": 24760 + }, + { + "epoch": 0.32175790190093934, + "grad_norm": 0.4899853765964508, + "learning_rate": 0.00013567631500279443, + "loss": 1.4632, + "step": 24761 + }, + { + "epoch": 0.32177089644485524, + "grad_norm": 0.40293407440185547, + "learning_rate": 0.00013567371554088303, + "loss": 1.3502, + "step": 24762 + }, + { + "epoch": 0.3217838909887711, + "grad_norm": 0.3393685519695282, + "learning_rate": 0.00013567111607897165, + "loss": 1.4755, + "step": 24763 + }, + { + "epoch": 0.321796885532687, + "grad_norm": 0.38959047198295593, + "learning_rate": 0.00013566851661706028, + "loss": 1.3434, + "step": 24764 + }, + { + "epoch": 0.32180988007660283, + "grad_norm": 0.3684130609035492, + "learning_rate": 0.0001356659171551489, + "loss": 1.4785, + "step": 24765 + }, + { + "epoch": 0.32182287462051873, + "grad_norm": 0.35236477851867676, + "learning_rate": 0.0001356633176932375, + "loss": 1.4781, + "step": 24766 + }, + { + "epoch": 0.3218358691644346, + "grad_norm": 0.3297717869281769, + "learning_rate": 0.00013566071823132612, + "loss": 1.2998, + "step": 24767 + }, + { + "epoch": 0.3218488637083505, + "grad_norm": 0.4193660616874695, + "learning_rate": 0.00013565811876941475, + "loss": 1.402, + "step": 24768 + }, + { + "epoch": 0.3218618582522663, + "grad_norm": 0.4387204349040985, + "learning_rate": 0.00013565551930750335, + "loss": 1.4731, + "step": 24769 + }, + { + "epoch": 0.3218748527961822, + "grad_norm": 0.39443477988243103, + "learning_rate": 0.00013565291984559197, + "loss": 1.3148, + "step": 24770 + }, + { + "epoch": 0.32188784734009807, + "grad_norm": 0.37812450528144836, + "learning_rate": 0.00013565032038368057, + "loss": 1.4579, + "step": 24771 + }, + { + "epoch": 0.32190084188401397, + "grad_norm": 0.4122190475463867, + "learning_rate": 0.00013564772092176922, + "loss": 1.2654, + "step": 24772 + }, + { + "epoch": 0.3219138364279298, + "grad_norm": 0.3226117491722107, + "learning_rate": 0.00013564512145985782, + "loss": 1.3342, + "step": 24773 + }, + { + "epoch": 0.3219268309718457, + "grad_norm": 0.4152553975582123, + "learning_rate": 0.00013564252199794641, + "loss": 1.3738, + "step": 24774 + }, + { + "epoch": 0.32193982551576156, + "grad_norm": 0.43611374497413635, + "learning_rate": 0.00013563992253603507, + "loss": 1.3646, + "step": 24775 + }, + { + "epoch": 0.32195282005967746, + "grad_norm": 0.49749991297721863, + "learning_rate": 0.00013563732307412366, + "loss": 1.528, + "step": 24776 + }, + { + "epoch": 0.3219658146035933, + "grad_norm": 0.38084685802459717, + "learning_rate": 0.0001356347236122123, + "loss": 1.3476, + "step": 24777 + }, + { + "epoch": 0.3219788091475092, + "grad_norm": 0.4132595956325531, + "learning_rate": 0.00013563212415030088, + "loss": 1.4489, + "step": 24778 + }, + { + "epoch": 0.32199180369142505, + "grad_norm": 0.668992817401886, + "learning_rate": 0.0001356295246883895, + "loss": 1.4169, + "step": 24779 + }, + { + "epoch": 0.32200479823534095, + "grad_norm": 0.4263128340244293, + "learning_rate": 0.00013562692522647813, + "loss": 1.2513, + "step": 24780 + }, + { + "epoch": 0.3220177927792568, + "grad_norm": 0.4289945960044861, + "learning_rate": 0.00013562432576456673, + "loss": 1.4288, + "step": 24781 + }, + { + "epoch": 0.3220307873231727, + "grad_norm": 0.476245254278183, + "learning_rate": 0.00013562172630265536, + "loss": 1.3965, + "step": 24782 + }, + { + "epoch": 0.32204378186708854, + "grad_norm": 0.4400055408477783, + "learning_rate": 0.00013561912684074398, + "loss": 1.4549, + "step": 24783 + }, + { + "epoch": 0.32205677641100444, + "grad_norm": 0.374044269323349, + "learning_rate": 0.0001356165273788326, + "loss": 1.4691, + "step": 24784 + }, + { + "epoch": 0.3220697709549203, + "grad_norm": 0.3557385802268982, + "learning_rate": 0.0001356139279169212, + "loss": 1.2804, + "step": 24785 + }, + { + "epoch": 0.3220827654988362, + "grad_norm": 0.333656370639801, + "learning_rate": 0.0001356113284550098, + "loss": 1.4894, + "step": 24786 + }, + { + "epoch": 0.32209576004275203, + "grad_norm": 0.440645694732666, + "learning_rate": 0.00013560872899309845, + "loss": 1.4679, + "step": 24787 + }, + { + "epoch": 0.32210875458666793, + "grad_norm": 0.34813442826271057, + "learning_rate": 0.00013560612953118705, + "loss": 1.4026, + "step": 24788 + }, + { + "epoch": 0.3221217491305838, + "grad_norm": 0.40143880248069763, + "learning_rate": 0.00013560353006927567, + "loss": 1.6164, + "step": 24789 + }, + { + "epoch": 0.3221347436744997, + "grad_norm": 0.4037768840789795, + "learning_rate": 0.00013560093060736427, + "loss": 1.4597, + "step": 24790 + }, + { + "epoch": 0.3221477382184155, + "grad_norm": 0.42063459753990173, + "learning_rate": 0.0001355983311454529, + "loss": 1.388, + "step": 24791 + }, + { + "epoch": 0.3221607327623314, + "grad_norm": 0.450285941362381, + "learning_rate": 0.00013559573168354152, + "loss": 1.6518, + "step": 24792 + }, + { + "epoch": 0.32217372730624727, + "grad_norm": 0.3764720857143402, + "learning_rate": 0.00013559313222163012, + "loss": 1.5588, + "step": 24793 + }, + { + "epoch": 0.32218672185016317, + "grad_norm": 0.4656607508659363, + "learning_rate": 0.00013559053275971874, + "loss": 1.3533, + "step": 24794 + }, + { + "epoch": 0.322199716394079, + "grad_norm": 0.3740445375442505, + "learning_rate": 0.00013558793329780737, + "loss": 1.2915, + "step": 24795 + }, + { + "epoch": 0.3222127109379949, + "grad_norm": 0.37352094054222107, + "learning_rate": 0.000135585333835896, + "loss": 1.5986, + "step": 24796 + }, + { + "epoch": 0.32222570548191076, + "grad_norm": 0.4056890308856964, + "learning_rate": 0.0001355827343739846, + "loss": 1.3736, + "step": 24797 + }, + { + "epoch": 0.32223870002582666, + "grad_norm": 0.5165325403213501, + "learning_rate": 0.00013558013491207318, + "loss": 1.5007, + "step": 24798 + }, + { + "epoch": 0.3222516945697425, + "grad_norm": 0.3987845778465271, + "learning_rate": 0.00013557753545016184, + "loss": 1.4611, + "step": 24799 + }, + { + "epoch": 0.3222646891136584, + "grad_norm": 0.3437821567058563, + "learning_rate": 0.00013557493598825043, + "loss": 1.2405, + "step": 24800 + }, + { + "epoch": 0.32227768365757425, + "grad_norm": 0.4448677599430084, + "learning_rate": 0.00013557233652633906, + "loss": 1.4778, + "step": 24801 + }, + { + "epoch": 0.32229067820149015, + "grad_norm": 0.3452218174934387, + "learning_rate": 0.00013556973706442766, + "loss": 1.3146, + "step": 24802 + }, + { + "epoch": 0.322303672745406, + "grad_norm": 0.3361167907714844, + "learning_rate": 0.00013556713760251628, + "loss": 1.6413, + "step": 24803 + }, + { + "epoch": 0.3223166672893219, + "grad_norm": 0.42919233441352844, + "learning_rate": 0.0001355645381406049, + "loss": 1.3712, + "step": 24804 + }, + { + "epoch": 0.32232966183323775, + "grad_norm": 0.4290870726108551, + "learning_rate": 0.0001355619386786935, + "loss": 1.5637, + "step": 24805 + }, + { + "epoch": 0.32234265637715365, + "grad_norm": 0.37546175718307495, + "learning_rate": 0.00013555933921678213, + "loss": 1.1305, + "step": 24806 + }, + { + "epoch": 0.3223556509210695, + "grad_norm": 0.45757678151130676, + "learning_rate": 0.00013555673975487075, + "loss": 1.5652, + "step": 24807 + }, + { + "epoch": 0.3223686454649854, + "grad_norm": 0.4538206458091736, + "learning_rate": 0.00013555414029295938, + "loss": 1.4418, + "step": 24808 + }, + { + "epoch": 0.32238164000890124, + "grad_norm": 0.4278055429458618, + "learning_rate": 0.00013555154083104797, + "loss": 1.3544, + "step": 24809 + }, + { + "epoch": 0.32239463455281714, + "grad_norm": 0.34231773018836975, + "learning_rate": 0.0001355489413691366, + "loss": 1.5204, + "step": 24810 + }, + { + "epoch": 0.322407629096733, + "grad_norm": 0.4222254455089569, + "learning_rate": 0.00013554634190722522, + "loss": 1.2565, + "step": 24811 + }, + { + "epoch": 0.3224206236406489, + "grad_norm": 0.46273741126060486, + "learning_rate": 0.00013554374244531382, + "loss": 1.3795, + "step": 24812 + }, + { + "epoch": 0.32243361818456473, + "grad_norm": 0.4589740037918091, + "learning_rate": 0.00013554114298340244, + "loss": 1.3784, + "step": 24813 + }, + { + "epoch": 0.32244661272848063, + "grad_norm": 0.5123797059059143, + "learning_rate": 0.00013553854352149107, + "loss": 1.4594, + "step": 24814 + }, + { + "epoch": 0.3224596072723965, + "grad_norm": 0.2949802875518799, + "learning_rate": 0.00013553594405957967, + "loss": 1.4215, + "step": 24815 + }, + { + "epoch": 0.3224726018163124, + "grad_norm": 0.370859831571579, + "learning_rate": 0.0001355333445976683, + "loss": 1.2735, + "step": 24816 + }, + { + "epoch": 0.3224855963602282, + "grad_norm": 0.38516542315483093, + "learning_rate": 0.0001355307451357569, + "loss": 1.5608, + "step": 24817 + }, + { + "epoch": 0.3224985909041441, + "grad_norm": 0.3918180465698242, + "learning_rate": 0.00013552814567384554, + "loss": 1.3215, + "step": 24818 + }, + { + "epoch": 0.32251158544806, + "grad_norm": 0.36429646611213684, + "learning_rate": 0.00013552554621193414, + "loss": 1.2987, + "step": 24819 + }, + { + "epoch": 0.32252457999197587, + "grad_norm": 0.3534800112247467, + "learning_rate": 0.00013552294675002276, + "loss": 1.4025, + "step": 24820 + }, + { + "epoch": 0.32253757453589177, + "grad_norm": 0.4373858869075775, + "learning_rate": 0.00013552034728811136, + "loss": 1.3019, + "step": 24821 + }, + { + "epoch": 0.3225505690798076, + "grad_norm": 0.4387717545032501, + "learning_rate": 0.00013551774782619998, + "loss": 1.4948, + "step": 24822 + }, + { + "epoch": 0.3225635636237235, + "grad_norm": 0.4137800335884094, + "learning_rate": 0.0001355151483642886, + "loss": 1.5045, + "step": 24823 + }, + { + "epoch": 0.32257655816763936, + "grad_norm": 0.3808870315551758, + "learning_rate": 0.0001355125489023772, + "loss": 1.3537, + "step": 24824 + }, + { + "epoch": 0.32258955271155526, + "grad_norm": 0.5064056515693665, + "learning_rate": 0.00013550994944046583, + "loss": 1.3483, + "step": 24825 + }, + { + "epoch": 0.3226025472554711, + "grad_norm": 0.26330268383026123, + "learning_rate": 0.00013550734997855445, + "loss": 1.2074, + "step": 24826 + }, + { + "epoch": 0.322615541799387, + "grad_norm": 0.39733344316482544, + "learning_rate": 0.00013550475051664305, + "loss": 1.4038, + "step": 24827 + }, + { + "epoch": 0.32262853634330285, + "grad_norm": 0.37604841589927673, + "learning_rate": 0.00013550215105473168, + "loss": 1.5098, + "step": 24828 + }, + { + "epoch": 0.32264153088721875, + "grad_norm": 0.41053780913352966, + "learning_rate": 0.00013549955159282027, + "loss": 1.4856, + "step": 24829 + }, + { + "epoch": 0.3226545254311346, + "grad_norm": 0.33901020884513855, + "learning_rate": 0.00013549695213090892, + "loss": 1.5203, + "step": 24830 + }, + { + "epoch": 0.3226675199750505, + "grad_norm": 0.33161652088165283, + "learning_rate": 0.00013549435266899752, + "loss": 1.476, + "step": 24831 + }, + { + "epoch": 0.32268051451896634, + "grad_norm": 0.43718859553337097, + "learning_rate": 0.00013549175320708615, + "loss": 1.4997, + "step": 24832 + }, + { + "epoch": 0.32269350906288224, + "grad_norm": 0.39829009771347046, + "learning_rate": 0.00013548915374517474, + "loss": 1.5696, + "step": 24833 + }, + { + "epoch": 0.3227065036067981, + "grad_norm": 0.4563552141189575, + "learning_rate": 0.00013548655428326337, + "loss": 1.5666, + "step": 24834 + }, + { + "epoch": 0.322719498150714, + "grad_norm": 0.3627989590167999, + "learning_rate": 0.000135483954821352, + "loss": 1.322, + "step": 24835 + }, + { + "epoch": 0.32273249269462984, + "grad_norm": 0.47464463114738464, + "learning_rate": 0.0001354813553594406, + "loss": 1.3886, + "step": 24836 + }, + { + "epoch": 0.32274548723854574, + "grad_norm": 0.4163193106651306, + "learning_rate": 0.00013547875589752921, + "loss": 1.4001, + "step": 24837 + }, + { + "epoch": 0.3227584817824616, + "grad_norm": 0.3628351390361786, + "learning_rate": 0.00013547615643561784, + "loss": 1.4763, + "step": 24838 + }, + { + "epoch": 0.3227714763263775, + "grad_norm": 0.4316408932209015, + "learning_rate": 0.00013547355697370646, + "loss": 1.4874, + "step": 24839 + }, + { + "epoch": 0.3227844708702933, + "grad_norm": 0.36311185359954834, + "learning_rate": 0.00013547095751179506, + "loss": 1.3914, + "step": 24840 + }, + { + "epoch": 0.32279746541420923, + "grad_norm": 0.4402531385421753, + "learning_rate": 0.00013546835804988366, + "loss": 1.2641, + "step": 24841 + }, + { + "epoch": 0.3228104599581251, + "grad_norm": 0.48618343472480774, + "learning_rate": 0.0001354657585879723, + "loss": 1.4057, + "step": 24842 + }, + { + "epoch": 0.322823454502041, + "grad_norm": 0.4292903244495392, + "learning_rate": 0.0001354631591260609, + "loss": 1.2999, + "step": 24843 + }, + { + "epoch": 0.3228364490459568, + "grad_norm": 0.4178447127342224, + "learning_rate": 0.00013546055966414953, + "loss": 1.5188, + "step": 24844 + }, + { + "epoch": 0.3228494435898727, + "grad_norm": 0.37929201126098633, + "learning_rate": 0.00013545796020223813, + "loss": 1.4396, + "step": 24845 + }, + { + "epoch": 0.32286243813378857, + "grad_norm": 0.4620269536972046, + "learning_rate": 0.00013545536074032675, + "loss": 1.3595, + "step": 24846 + }, + { + "epoch": 0.32287543267770447, + "grad_norm": 0.39598965644836426, + "learning_rate": 0.00013545276127841538, + "loss": 1.2685, + "step": 24847 + }, + { + "epoch": 0.3228884272216203, + "grad_norm": 0.30083295702934265, + "learning_rate": 0.00013545016181650398, + "loss": 1.2249, + "step": 24848 + }, + { + "epoch": 0.3229014217655362, + "grad_norm": 0.404985249042511, + "learning_rate": 0.00013544756235459263, + "loss": 1.3546, + "step": 24849 + }, + { + "epoch": 0.32291441630945206, + "grad_norm": 0.4552009403705597, + "learning_rate": 0.00013544496289268122, + "loss": 1.4632, + "step": 24850 + }, + { + "epoch": 0.32292741085336796, + "grad_norm": 0.4433949589729309, + "learning_rate": 0.00013544236343076985, + "loss": 1.263, + "step": 24851 + }, + { + "epoch": 0.3229404053972838, + "grad_norm": 0.3626491129398346, + "learning_rate": 0.00013543976396885845, + "loss": 1.3176, + "step": 24852 + }, + { + "epoch": 0.3229533999411997, + "grad_norm": 0.4268733859062195, + "learning_rate": 0.00013543716450694707, + "loss": 1.4841, + "step": 24853 + }, + { + "epoch": 0.32296639448511555, + "grad_norm": 0.39087337255477905, + "learning_rate": 0.0001354345650450357, + "loss": 1.2964, + "step": 24854 + }, + { + "epoch": 0.32297938902903145, + "grad_norm": 0.40472298860549927, + "learning_rate": 0.0001354319655831243, + "loss": 1.4587, + "step": 24855 + }, + { + "epoch": 0.3229923835729473, + "grad_norm": 0.538482666015625, + "learning_rate": 0.00013542936612121292, + "loss": 1.4917, + "step": 24856 + }, + { + "epoch": 0.3230053781168632, + "grad_norm": 0.2860257923603058, + "learning_rate": 0.00013542676665930154, + "loss": 1.6233, + "step": 24857 + }, + { + "epoch": 0.32301837266077904, + "grad_norm": 0.37848132848739624, + "learning_rate": 0.00013542416719739014, + "loss": 1.5776, + "step": 24858 + }, + { + "epoch": 0.32303136720469494, + "grad_norm": 0.3342726528644562, + "learning_rate": 0.00013542156773547876, + "loss": 1.2313, + "step": 24859 + }, + { + "epoch": 0.3230443617486108, + "grad_norm": 0.3620634973049164, + "learning_rate": 0.00013541896827356736, + "loss": 1.3532, + "step": 24860 + }, + { + "epoch": 0.3230573562925267, + "grad_norm": 0.27235373854637146, + "learning_rate": 0.000135416368811656, + "loss": 1.4234, + "step": 24861 + }, + { + "epoch": 0.32307035083644253, + "grad_norm": 0.3243304491043091, + "learning_rate": 0.0001354137693497446, + "loss": 1.2027, + "step": 24862 + }, + { + "epoch": 0.32308334538035843, + "grad_norm": 0.28318798542022705, + "learning_rate": 0.00013541116988783323, + "loss": 1.2209, + "step": 24863 + }, + { + "epoch": 0.3230963399242743, + "grad_norm": 0.47671380639076233, + "learning_rate": 0.00013540857042592183, + "loss": 1.5632, + "step": 24864 + }, + { + "epoch": 0.3231093344681902, + "grad_norm": 0.4260321855545044, + "learning_rate": 0.00013540597096401046, + "loss": 1.4437, + "step": 24865 + }, + { + "epoch": 0.323122329012106, + "grad_norm": 0.41248229146003723, + "learning_rate": 0.00013540337150209908, + "loss": 1.5356, + "step": 24866 + }, + { + "epoch": 0.3231353235560219, + "grad_norm": 0.3432072699069977, + "learning_rate": 0.00013540077204018768, + "loss": 1.3537, + "step": 24867 + }, + { + "epoch": 0.32314831809993777, + "grad_norm": 0.36243635416030884, + "learning_rate": 0.0001353981725782763, + "loss": 1.4697, + "step": 24868 + }, + { + "epoch": 0.32316131264385367, + "grad_norm": 0.4090852737426758, + "learning_rate": 0.00013539557311636493, + "loss": 1.1962, + "step": 24869 + }, + { + "epoch": 0.3231743071877695, + "grad_norm": 0.3374473452568054, + "learning_rate": 0.00013539297365445352, + "loss": 1.3431, + "step": 24870 + }, + { + "epoch": 0.3231873017316854, + "grad_norm": 0.32808125019073486, + "learning_rate": 0.00013539037419254215, + "loss": 1.1792, + "step": 24871 + }, + { + "epoch": 0.32320029627560126, + "grad_norm": 0.3953491747379303, + "learning_rate": 0.00013538777473063075, + "loss": 1.4313, + "step": 24872 + }, + { + "epoch": 0.32321329081951716, + "grad_norm": 0.30446162819862366, + "learning_rate": 0.0001353851752687194, + "loss": 1.344, + "step": 24873 + }, + { + "epoch": 0.323226285363433, + "grad_norm": 0.43865498900413513, + "learning_rate": 0.000135382575806808, + "loss": 1.4877, + "step": 24874 + }, + { + "epoch": 0.3232392799073489, + "grad_norm": 0.4455358386039734, + "learning_rate": 0.00013537997634489662, + "loss": 1.43, + "step": 24875 + }, + { + "epoch": 0.32325227445126475, + "grad_norm": 0.35313087701797485, + "learning_rate": 0.00013537737688298522, + "loss": 1.5881, + "step": 24876 + }, + { + "epoch": 0.32326526899518065, + "grad_norm": 0.37257739901542664, + "learning_rate": 0.00013537477742107384, + "loss": 1.3303, + "step": 24877 + }, + { + "epoch": 0.3232782635390965, + "grad_norm": 0.5111738443374634, + "learning_rate": 0.00013537217795916247, + "loss": 1.4266, + "step": 24878 + }, + { + "epoch": 0.3232912580830124, + "grad_norm": 0.38831818103790283, + "learning_rate": 0.00013536957849725106, + "loss": 1.4175, + "step": 24879 + }, + { + "epoch": 0.32330425262692825, + "grad_norm": 0.37848910689353943, + "learning_rate": 0.0001353669790353397, + "loss": 1.345, + "step": 24880 + }, + { + "epoch": 0.32331724717084415, + "grad_norm": 0.319864958524704, + "learning_rate": 0.0001353643795734283, + "loss": 1.3104, + "step": 24881 + }, + { + "epoch": 0.32333024171476, + "grad_norm": 0.4297686517238617, + "learning_rate": 0.0001353617801115169, + "loss": 1.3595, + "step": 24882 + }, + { + "epoch": 0.3233432362586759, + "grad_norm": 0.4045651853084564, + "learning_rate": 0.00013535918064960553, + "loss": 1.4907, + "step": 24883 + }, + { + "epoch": 0.32335623080259174, + "grad_norm": 0.4724644422531128, + "learning_rate": 0.00013535658118769416, + "loss": 1.5476, + "step": 24884 + }, + { + "epoch": 0.32336922534650764, + "grad_norm": 0.3693622350692749, + "learning_rate": 0.00013535398172578278, + "loss": 1.2824, + "step": 24885 + }, + { + "epoch": 0.3233822198904235, + "grad_norm": 0.46172967553138733, + "learning_rate": 0.00013535138226387138, + "loss": 1.529, + "step": 24886 + }, + { + "epoch": 0.3233952144343394, + "grad_norm": 0.47828933596611023, + "learning_rate": 0.00013534878280196, + "loss": 1.3226, + "step": 24887 + }, + { + "epoch": 0.32340820897825523, + "grad_norm": 0.33672595024108887, + "learning_rate": 0.00013534618334004863, + "loss": 1.4788, + "step": 24888 + }, + { + "epoch": 0.32342120352217113, + "grad_norm": 0.4923304617404938, + "learning_rate": 0.00013534358387813723, + "loss": 1.4222, + "step": 24889 + }, + { + "epoch": 0.323434198066087, + "grad_norm": 0.39976876974105835, + "learning_rate": 0.00013534098441622585, + "loss": 1.3924, + "step": 24890 + }, + { + "epoch": 0.3234471926100029, + "grad_norm": 0.3337671458721161, + "learning_rate": 0.00013533838495431445, + "loss": 1.2557, + "step": 24891 + }, + { + "epoch": 0.3234601871539187, + "grad_norm": 0.41363513469696045, + "learning_rate": 0.0001353357854924031, + "loss": 1.4137, + "step": 24892 + }, + { + "epoch": 0.3234731816978346, + "grad_norm": 0.5088506937026978, + "learning_rate": 0.0001353331860304917, + "loss": 1.4961, + "step": 24893 + }, + { + "epoch": 0.3234861762417505, + "grad_norm": 0.3360183537006378, + "learning_rate": 0.00013533058656858032, + "loss": 1.3568, + "step": 24894 + }, + { + "epoch": 0.32349917078566637, + "grad_norm": 0.3352796733379364, + "learning_rate": 0.00013532798710666892, + "loss": 1.2787, + "step": 24895 + }, + { + "epoch": 0.32351216532958227, + "grad_norm": 0.35485097765922546, + "learning_rate": 0.00013532538764475754, + "loss": 1.4271, + "step": 24896 + }, + { + "epoch": 0.3235251598734981, + "grad_norm": 0.4580633044242859, + "learning_rate": 0.00013532278818284617, + "loss": 1.2401, + "step": 24897 + }, + { + "epoch": 0.323538154417414, + "grad_norm": 0.517603874206543, + "learning_rate": 0.00013532018872093477, + "loss": 1.4979, + "step": 24898 + }, + { + "epoch": 0.32355114896132986, + "grad_norm": 0.2718959450721741, + "learning_rate": 0.0001353175892590234, + "loss": 1.3858, + "step": 24899 + }, + { + "epoch": 0.32356414350524576, + "grad_norm": 0.39144110679626465, + "learning_rate": 0.00013531498979711201, + "loss": 1.3775, + "step": 24900 + }, + { + "epoch": 0.3235771380491616, + "grad_norm": 0.429180771112442, + "learning_rate": 0.0001353123903352006, + "loss": 1.499, + "step": 24901 + }, + { + "epoch": 0.3235901325930775, + "grad_norm": 0.4488694667816162, + "learning_rate": 0.00013530979087328924, + "loss": 1.5914, + "step": 24902 + }, + { + "epoch": 0.32360312713699335, + "grad_norm": 0.3418270945549011, + "learning_rate": 0.00013530719141137783, + "loss": 1.1116, + "step": 24903 + }, + { + "epoch": 0.32361612168090925, + "grad_norm": 0.4319004714488983, + "learning_rate": 0.00013530459194946649, + "loss": 1.2198, + "step": 24904 + }, + { + "epoch": 0.3236291162248251, + "grad_norm": 0.3784480690956116, + "learning_rate": 0.00013530199248755508, + "loss": 1.2532, + "step": 24905 + }, + { + "epoch": 0.323642110768741, + "grad_norm": 0.36385276913642883, + "learning_rate": 0.0001352993930256437, + "loss": 1.2948, + "step": 24906 + }, + { + "epoch": 0.32365510531265684, + "grad_norm": 0.45260900259017944, + "learning_rate": 0.0001352967935637323, + "loss": 1.3254, + "step": 24907 + }, + { + "epoch": 0.32366809985657274, + "grad_norm": 0.4192999601364136, + "learning_rate": 0.00013529419410182093, + "loss": 1.3874, + "step": 24908 + }, + { + "epoch": 0.3236810944004886, + "grad_norm": 0.4241378903388977, + "learning_rate": 0.00013529159463990955, + "loss": 1.3429, + "step": 24909 + }, + { + "epoch": 0.3236940889444045, + "grad_norm": 0.4041059911251068, + "learning_rate": 0.00013528899517799815, + "loss": 1.3765, + "step": 24910 + }, + { + "epoch": 0.32370708348832034, + "grad_norm": 0.38063371181488037, + "learning_rate": 0.00013528639571608678, + "loss": 1.3264, + "step": 24911 + }, + { + "epoch": 0.32372007803223624, + "grad_norm": 0.43497347831726074, + "learning_rate": 0.0001352837962541754, + "loss": 1.4182, + "step": 24912 + }, + { + "epoch": 0.3237330725761521, + "grad_norm": 0.38060855865478516, + "learning_rate": 0.000135281196792264, + "loss": 1.5217, + "step": 24913 + }, + { + "epoch": 0.323746067120068, + "grad_norm": 0.4803584814071655, + "learning_rate": 0.00013527859733035262, + "loss": 1.4066, + "step": 24914 + }, + { + "epoch": 0.3237590616639838, + "grad_norm": 0.5258299112319946, + "learning_rate": 0.00013527599786844122, + "loss": 1.4874, + "step": 24915 + }, + { + "epoch": 0.3237720562078997, + "grad_norm": 0.386028915643692, + "learning_rate": 0.00013527339840652987, + "loss": 1.2463, + "step": 24916 + }, + { + "epoch": 0.3237850507518156, + "grad_norm": 0.43555545806884766, + "learning_rate": 0.00013527079894461847, + "loss": 1.6694, + "step": 24917 + }, + { + "epoch": 0.3237980452957315, + "grad_norm": 0.5101915597915649, + "learning_rate": 0.0001352681994827071, + "loss": 1.366, + "step": 24918 + }, + { + "epoch": 0.3238110398396473, + "grad_norm": 0.3759496212005615, + "learning_rate": 0.0001352656000207957, + "loss": 1.4064, + "step": 24919 + }, + { + "epoch": 0.3238240343835632, + "grad_norm": 0.32454878091812134, + "learning_rate": 0.00013526300055888431, + "loss": 1.4186, + "step": 24920 + }, + { + "epoch": 0.32383702892747906, + "grad_norm": 0.39061838388442993, + "learning_rate": 0.00013526040109697294, + "loss": 1.3787, + "step": 24921 + }, + { + "epoch": 0.32385002347139497, + "grad_norm": 0.47894737124443054, + "learning_rate": 0.00013525780163506154, + "loss": 1.3398, + "step": 24922 + }, + { + "epoch": 0.3238630180153108, + "grad_norm": 0.31642550230026245, + "learning_rate": 0.0001352552021731502, + "loss": 1.3317, + "step": 24923 + }, + { + "epoch": 0.3238760125592267, + "grad_norm": 0.4408561587333679, + "learning_rate": 0.00013525260271123879, + "loss": 1.5135, + "step": 24924 + }, + { + "epoch": 0.32388900710314256, + "grad_norm": 0.38072502613067627, + "learning_rate": 0.00013525000324932738, + "loss": 1.4164, + "step": 24925 + }, + { + "epoch": 0.32390200164705846, + "grad_norm": 0.5071101784706116, + "learning_rate": 0.000135247403787416, + "loss": 1.441, + "step": 24926 + }, + { + "epoch": 0.3239149961909743, + "grad_norm": 0.4267844259738922, + "learning_rate": 0.00013524480432550463, + "loss": 1.557, + "step": 24927 + }, + { + "epoch": 0.3239279907348902, + "grad_norm": 0.3761705160140991, + "learning_rate": 0.00013524220486359326, + "loss": 1.5055, + "step": 24928 + }, + { + "epoch": 0.32394098527880605, + "grad_norm": 0.4288162291049957, + "learning_rate": 0.00013523960540168185, + "loss": 1.4598, + "step": 24929 + }, + { + "epoch": 0.32395397982272195, + "grad_norm": 0.4203217029571533, + "learning_rate": 0.00013523700593977048, + "loss": 1.3949, + "step": 24930 + }, + { + "epoch": 0.3239669743666378, + "grad_norm": 0.4510311186313629, + "learning_rate": 0.0001352344064778591, + "loss": 1.3672, + "step": 24931 + }, + { + "epoch": 0.3239799689105537, + "grad_norm": 0.42196300625801086, + "learning_rate": 0.0001352318070159477, + "loss": 1.3068, + "step": 24932 + }, + { + "epoch": 0.32399296345446954, + "grad_norm": 0.35303065180778503, + "learning_rate": 0.00013522920755403632, + "loss": 1.4527, + "step": 24933 + }, + { + "epoch": 0.32400595799838544, + "grad_norm": 0.389010488986969, + "learning_rate": 0.00013522660809212492, + "loss": 1.398, + "step": 24934 + }, + { + "epoch": 0.3240189525423013, + "grad_norm": 0.35926100611686707, + "learning_rate": 0.00013522400863021357, + "loss": 1.4196, + "step": 24935 + }, + { + "epoch": 0.3240319470862172, + "grad_norm": 0.42396387457847595, + "learning_rate": 0.00013522140916830217, + "loss": 1.4498, + "step": 24936 + }, + { + "epoch": 0.32404494163013303, + "grad_norm": 0.4404793083667755, + "learning_rate": 0.00013521880970639077, + "loss": 1.5801, + "step": 24937 + }, + { + "epoch": 0.32405793617404893, + "grad_norm": 0.36292192339897156, + "learning_rate": 0.0001352162102444794, + "loss": 1.3481, + "step": 24938 + }, + { + "epoch": 0.3240709307179648, + "grad_norm": 0.4069438576698303, + "learning_rate": 0.00013521361078256802, + "loss": 1.3579, + "step": 24939 + }, + { + "epoch": 0.3240839252618807, + "grad_norm": 0.32507357001304626, + "learning_rate": 0.00013521101132065664, + "loss": 1.3694, + "step": 24940 + }, + { + "epoch": 0.3240969198057965, + "grad_norm": 0.4535185992717743, + "learning_rate": 0.00013520841185874524, + "loss": 1.5158, + "step": 24941 + }, + { + "epoch": 0.3241099143497124, + "grad_norm": 0.3327348232269287, + "learning_rate": 0.00013520581239683386, + "loss": 1.4626, + "step": 24942 + }, + { + "epoch": 0.32412290889362827, + "grad_norm": 0.4001970589160919, + "learning_rate": 0.0001352032129349225, + "loss": 1.3684, + "step": 24943 + }, + { + "epoch": 0.32413590343754417, + "grad_norm": 0.36679449677467346, + "learning_rate": 0.00013520061347301109, + "loss": 1.2966, + "step": 24944 + }, + { + "epoch": 0.32414889798146, + "grad_norm": 0.3613438904285431, + "learning_rate": 0.0001351980140110997, + "loss": 1.63, + "step": 24945 + }, + { + "epoch": 0.3241618925253759, + "grad_norm": 0.39503341913223267, + "learning_rate": 0.0001351954145491883, + "loss": 1.3538, + "step": 24946 + }, + { + "epoch": 0.32417488706929176, + "grad_norm": 0.39064785838127136, + "learning_rate": 0.00013519281508727696, + "loss": 1.5296, + "step": 24947 + }, + { + "epoch": 0.32418788161320766, + "grad_norm": 0.4340202510356903, + "learning_rate": 0.00013519021562536556, + "loss": 1.3433, + "step": 24948 + }, + { + "epoch": 0.3242008761571235, + "grad_norm": 0.33337584137916565, + "learning_rate": 0.00013518761616345415, + "loss": 1.4876, + "step": 24949 + }, + { + "epoch": 0.3242138707010394, + "grad_norm": 0.4356640577316284, + "learning_rate": 0.00013518501670154278, + "loss": 1.3719, + "step": 24950 + }, + { + "epoch": 0.32422686524495525, + "grad_norm": 0.4716472923755646, + "learning_rate": 0.0001351824172396314, + "loss": 1.4316, + "step": 24951 + }, + { + "epoch": 0.32423985978887115, + "grad_norm": 0.49827995896339417, + "learning_rate": 0.00013517981777772003, + "loss": 1.3297, + "step": 24952 + }, + { + "epoch": 0.324252854332787, + "grad_norm": 0.4101789891719818, + "learning_rate": 0.00013517721831580862, + "loss": 1.3277, + "step": 24953 + }, + { + "epoch": 0.3242658488767029, + "grad_norm": 0.492003858089447, + "learning_rate": 0.00013517461885389725, + "loss": 1.5181, + "step": 24954 + }, + { + "epoch": 0.32427884342061875, + "grad_norm": 0.35504239797592163, + "learning_rate": 0.00013517201939198587, + "loss": 1.2464, + "step": 24955 + }, + { + "epoch": 0.32429183796453465, + "grad_norm": 0.49131059646606445, + "learning_rate": 0.00013516941993007447, + "loss": 1.3402, + "step": 24956 + }, + { + "epoch": 0.3243048325084505, + "grad_norm": 0.34662574529647827, + "learning_rate": 0.0001351668204681631, + "loss": 1.1408, + "step": 24957 + }, + { + "epoch": 0.3243178270523664, + "grad_norm": 0.4073435366153717, + "learning_rate": 0.00013516422100625172, + "loss": 1.2208, + "step": 24958 + }, + { + "epoch": 0.32433082159628224, + "grad_norm": 0.40237903594970703, + "learning_rate": 0.00013516162154434034, + "loss": 1.3924, + "step": 24959 + }, + { + "epoch": 0.32434381614019814, + "grad_norm": 0.4088524878025055, + "learning_rate": 0.00013515902208242894, + "loss": 1.3625, + "step": 24960 + }, + { + "epoch": 0.324356810684114, + "grad_norm": 0.3493786156177521, + "learning_rate": 0.00013515642262051757, + "loss": 1.2137, + "step": 24961 + }, + { + "epoch": 0.3243698052280299, + "grad_norm": 0.3620012700557709, + "learning_rate": 0.0001351538231586062, + "loss": 1.5046, + "step": 24962 + }, + { + "epoch": 0.32438279977194573, + "grad_norm": 0.43123066425323486, + "learning_rate": 0.0001351512236966948, + "loss": 1.3069, + "step": 24963 + }, + { + "epoch": 0.32439579431586163, + "grad_norm": 0.33102044463157654, + "learning_rate": 0.0001351486242347834, + "loss": 1.5371, + "step": 24964 + }, + { + "epoch": 0.3244087888597775, + "grad_norm": 0.2837887704372406, + "learning_rate": 0.000135146024772872, + "loss": 1.5202, + "step": 24965 + }, + { + "epoch": 0.3244217834036934, + "grad_norm": 0.43771886825561523, + "learning_rate": 0.00013514342531096063, + "loss": 1.3736, + "step": 24966 + }, + { + "epoch": 0.3244347779476092, + "grad_norm": 0.4169369339942932, + "learning_rate": 0.00013514082584904926, + "loss": 1.3102, + "step": 24967 + }, + { + "epoch": 0.3244477724915251, + "grad_norm": 0.4754272997379303, + "learning_rate": 0.00013513822638713786, + "loss": 1.4647, + "step": 24968 + }, + { + "epoch": 0.32446076703544097, + "grad_norm": 0.35552677512168884, + "learning_rate": 0.00013513562692522648, + "loss": 1.3328, + "step": 24969 + }, + { + "epoch": 0.32447376157935687, + "grad_norm": 0.3923947513103485, + "learning_rate": 0.0001351330274633151, + "loss": 1.5282, + "step": 24970 + }, + { + "epoch": 0.32448675612327277, + "grad_norm": 0.3813493847846985, + "learning_rate": 0.00013513042800140373, + "loss": 1.3916, + "step": 24971 + }, + { + "epoch": 0.3244997506671886, + "grad_norm": 0.40924695134162903, + "learning_rate": 0.00013512782853949233, + "loss": 1.4469, + "step": 24972 + }, + { + "epoch": 0.3245127452111045, + "grad_norm": 0.5052087903022766, + "learning_rate": 0.00013512522907758095, + "loss": 1.5353, + "step": 24973 + }, + { + "epoch": 0.32452573975502036, + "grad_norm": 0.36621546745300293, + "learning_rate": 0.00013512262961566958, + "loss": 1.3115, + "step": 24974 + }, + { + "epoch": 0.32453873429893626, + "grad_norm": 0.42935553193092346, + "learning_rate": 0.00013512003015375817, + "loss": 1.4772, + "step": 24975 + }, + { + "epoch": 0.3245517288428521, + "grad_norm": 0.45067891478538513, + "learning_rate": 0.0001351174306918468, + "loss": 1.3963, + "step": 24976 + }, + { + "epoch": 0.324564723386768, + "grad_norm": 0.334783136844635, + "learning_rate": 0.0001351148312299354, + "loss": 1.2486, + "step": 24977 + }, + { + "epoch": 0.32457771793068385, + "grad_norm": 0.3122904300689697, + "learning_rate": 0.00013511223176802405, + "loss": 1.5302, + "step": 24978 + }, + { + "epoch": 0.32459071247459975, + "grad_norm": 0.4262402057647705, + "learning_rate": 0.00013510963230611264, + "loss": 1.2689, + "step": 24979 + }, + { + "epoch": 0.3246037070185156, + "grad_norm": 0.34411951899528503, + "learning_rate": 0.00013510703284420124, + "loss": 1.4389, + "step": 24980 + }, + { + "epoch": 0.3246167015624315, + "grad_norm": 0.3515605330467224, + "learning_rate": 0.00013510443338228987, + "loss": 1.3707, + "step": 24981 + }, + { + "epoch": 0.32462969610634734, + "grad_norm": 0.40537312626838684, + "learning_rate": 0.0001351018339203785, + "loss": 1.3657, + "step": 24982 + }, + { + "epoch": 0.32464269065026324, + "grad_norm": 0.3802444040775299, + "learning_rate": 0.00013509923445846712, + "loss": 1.5547, + "step": 24983 + }, + { + "epoch": 0.3246556851941791, + "grad_norm": 0.38722100853919983, + "learning_rate": 0.0001350966349965557, + "loss": 1.408, + "step": 24984 + }, + { + "epoch": 0.324668679738095, + "grad_norm": 0.39462536573410034, + "learning_rate": 0.00013509403553464434, + "loss": 1.4789, + "step": 24985 + }, + { + "epoch": 0.32468167428201083, + "grad_norm": 0.420680969953537, + "learning_rate": 0.00013509143607273296, + "loss": 1.3267, + "step": 24986 + }, + { + "epoch": 0.32469466882592674, + "grad_norm": 0.33159300684928894, + "learning_rate": 0.00013508883661082156, + "loss": 1.5063, + "step": 24987 + }, + { + "epoch": 0.3247076633698426, + "grad_norm": 0.3995024263858795, + "learning_rate": 0.00013508623714891018, + "loss": 1.6027, + "step": 24988 + }, + { + "epoch": 0.3247206579137585, + "grad_norm": 0.36618131399154663, + "learning_rate": 0.00013508363768699878, + "loss": 1.4363, + "step": 24989 + }, + { + "epoch": 0.3247336524576743, + "grad_norm": 0.38703015446662903, + "learning_rate": 0.00013508103822508743, + "loss": 1.5189, + "step": 24990 + }, + { + "epoch": 0.3247466470015902, + "grad_norm": 0.32923591136932373, + "learning_rate": 0.00013507843876317603, + "loss": 1.6484, + "step": 24991 + }, + { + "epoch": 0.3247596415455061, + "grad_norm": 0.4181039035320282, + "learning_rate": 0.00013507583930126463, + "loss": 1.4062, + "step": 24992 + }, + { + "epoch": 0.324772636089422, + "grad_norm": 0.41602247953414917, + "learning_rate": 0.00013507323983935325, + "loss": 1.5106, + "step": 24993 + }, + { + "epoch": 0.3247856306333378, + "grad_norm": 0.27113261818885803, + "learning_rate": 0.00013507064037744188, + "loss": 1.3445, + "step": 24994 + }, + { + "epoch": 0.3247986251772537, + "grad_norm": 0.4463062584400177, + "learning_rate": 0.0001350680409155305, + "loss": 1.4254, + "step": 24995 + }, + { + "epoch": 0.32481161972116956, + "grad_norm": 0.30592623353004456, + "learning_rate": 0.0001350654414536191, + "loss": 1.2206, + "step": 24996 + }, + { + "epoch": 0.32482461426508547, + "grad_norm": 0.3911055028438568, + "learning_rate": 0.00013506284199170772, + "loss": 1.3311, + "step": 24997 + }, + { + "epoch": 0.3248376088090013, + "grad_norm": 0.45315492153167725, + "learning_rate": 0.00013506024252979635, + "loss": 1.1246, + "step": 24998 + }, + { + "epoch": 0.3248506033529172, + "grad_norm": 0.4048317074775696, + "learning_rate": 0.00013505764306788494, + "loss": 1.4223, + "step": 24999 + }, + { + "epoch": 0.32486359789683306, + "grad_norm": 0.35791638493537903, + "learning_rate": 0.00013505504360597357, + "loss": 1.5504, + "step": 25000 + }, + { + "epoch": 0.32487659244074896, + "grad_norm": 0.46022653579711914, + "learning_rate": 0.0001350524441440622, + "loss": 1.4974, + "step": 25001 + }, + { + "epoch": 0.3248895869846648, + "grad_norm": 0.417204886674881, + "learning_rate": 0.00013504984468215082, + "loss": 1.6056, + "step": 25002 + }, + { + "epoch": 0.3249025815285807, + "grad_norm": 0.38330891728401184, + "learning_rate": 0.00013504724522023942, + "loss": 1.4404, + "step": 25003 + }, + { + "epoch": 0.32491557607249655, + "grad_norm": 0.5061829686164856, + "learning_rate": 0.000135044645758328, + "loss": 1.424, + "step": 25004 + }, + { + "epoch": 0.32492857061641245, + "grad_norm": 0.4341750144958496, + "learning_rate": 0.00013504204629641666, + "loss": 1.379, + "step": 25005 + }, + { + "epoch": 0.3249415651603283, + "grad_norm": 0.3365899622440338, + "learning_rate": 0.00013503944683450526, + "loss": 1.281, + "step": 25006 + }, + { + "epoch": 0.3249545597042442, + "grad_norm": 0.5352455973625183, + "learning_rate": 0.00013503684737259389, + "loss": 1.567, + "step": 25007 + }, + { + "epoch": 0.32496755424816004, + "grad_norm": 0.4097532331943512, + "learning_rate": 0.00013503424791068248, + "loss": 1.6571, + "step": 25008 + }, + { + "epoch": 0.32498054879207594, + "grad_norm": 0.3723010718822479, + "learning_rate": 0.0001350316484487711, + "loss": 1.4807, + "step": 25009 + }, + { + "epoch": 0.3249935433359918, + "grad_norm": 0.3988194167613983, + "learning_rate": 0.00013502904898685973, + "loss": 1.4546, + "step": 25010 + }, + { + "epoch": 0.3250065378799077, + "grad_norm": 0.4118337631225586, + "learning_rate": 0.00013502644952494833, + "loss": 1.2544, + "step": 25011 + }, + { + "epoch": 0.32501953242382353, + "grad_norm": 0.4609181582927704, + "learning_rate": 0.00013502385006303695, + "loss": 1.5052, + "step": 25012 + }, + { + "epoch": 0.32503252696773943, + "grad_norm": 0.4377482533454895, + "learning_rate": 0.00013502125060112558, + "loss": 1.518, + "step": 25013 + }, + { + "epoch": 0.3250455215116553, + "grad_norm": 0.3532160222530365, + "learning_rate": 0.0001350186511392142, + "loss": 1.3827, + "step": 25014 + }, + { + "epoch": 0.3250585160555712, + "grad_norm": 0.449031263589859, + "learning_rate": 0.0001350160516773028, + "loss": 1.4279, + "step": 25015 + }, + { + "epoch": 0.325071510599487, + "grad_norm": 0.3605179488658905, + "learning_rate": 0.00013501345221539142, + "loss": 1.2892, + "step": 25016 + }, + { + "epoch": 0.3250845051434029, + "grad_norm": 0.4708857238292694, + "learning_rate": 0.00013501085275348005, + "loss": 1.665, + "step": 25017 + }, + { + "epoch": 0.32509749968731877, + "grad_norm": 0.35279399156570435, + "learning_rate": 0.00013500825329156865, + "loss": 1.2813, + "step": 25018 + }, + { + "epoch": 0.32511049423123467, + "grad_norm": 0.352827787399292, + "learning_rate": 0.00013500565382965727, + "loss": 1.1623, + "step": 25019 + }, + { + "epoch": 0.3251234887751505, + "grad_norm": 0.34444841742515564, + "learning_rate": 0.00013500305436774587, + "loss": 1.4464, + "step": 25020 + }, + { + "epoch": 0.3251364833190664, + "grad_norm": 0.4622649550437927, + "learning_rate": 0.0001350004549058345, + "loss": 1.4626, + "step": 25021 + }, + { + "epoch": 0.32514947786298226, + "grad_norm": 0.32305943965911865, + "learning_rate": 0.00013499785544392312, + "loss": 1.301, + "step": 25022 + }, + { + "epoch": 0.32516247240689816, + "grad_norm": 0.32979416847229004, + "learning_rate": 0.00013499525598201171, + "loss": 1.4031, + "step": 25023 + }, + { + "epoch": 0.325175466950814, + "grad_norm": 0.5701051354408264, + "learning_rate": 0.00013499265652010034, + "loss": 1.6597, + "step": 25024 + }, + { + "epoch": 0.3251884614947299, + "grad_norm": 0.3824736177921295, + "learning_rate": 0.00013499005705818896, + "loss": 1.3873, + "step": 25025 + }, + { + "epoch": 0.32520145603864575, + "grad_norm": 0.4354667365550995, + "learning_rate": 0.0001349874575962776, + "loss": 1.3016, + "step": 25026 + }, + { + "epoch": 0.32521445058256165, + "grad_norm": 0.3852863311767578, + "learning_rate": 0.00013498485813436619, + "loss": 1.5023, + "step": 25027 + }, + { + "epoch": 0.3252274451264775, + "grad_norm": 0.4297827482223511, + "learning_rate": 0.0001349822586724548, + "loss": 1.3444, + "step": 25028 + }, + { + "epoch": 0.3252404396703934, + "grad_norm": 0.4374180734157562, + "learning_rate": 0.00013497965921054343, + "loss": 1.5081, + "step": 25029 + }, + { + "epoch": 0.32525343421430924, + "grad_norm": 0.3657471537590027, + "learning_rate": 0.00013497705974863203, + "loss": 1.3981, + "step": 25030 + }, + { + "epoch": 0.32526642875822515, + "grad_norm": 0.2984415590763092, + "learning_rate": 0.00013497446028672066, + "loss": 1.3992, + "step": 25031 + }, + { + "epoch": 0.325279423302141, + "grad_norm": 0.47639337182044983, + "learning_rate": 0.00013497186082480928, + "loss": 1.4649, + "step": 25032 + }, + { + "epoch": 0.3252924178460569, + "grad_norm": 0.4582609534263611, + "learning_rate": 0.00013496926136289788, + "loss": 1.5148, + "step": 25033 + }, + { + "epoch": 0.32530541238997274, + "grad_norm": 0.505190908908844, + "learning_rate": 0.0001349666619009865, + "loss": 1.4259, + "step": 25034 + }, + { + "epoch": 0.32531840693388864, + "grad_norm": 0.4094711244106293, + "learning_rate": 0.0001349640624390751, + "loss": 1.4837, + "step": 25035 + }, + { + "epoch": 0.3253314014778045, + "grad_norm": 0.3223525583744049, + "learning_rate": 0.00013496146297716375, + "loss": 1.4357, + "step": 25036 + }, + { + "epoch": 0.3253443960217204, + "grad_norm": 0.42297056317329407, + "learning_rate": 0.00013495886351525235, + "loss": 1.4376, + "step": 25037 + }, + { + "epoch": 0.32535739056563623, + "grad_norm": 0.32708024978637695, + "learning_rate": 0.00013495626405334097, + "loss": 1.114, + "step": 25038 + }, + { + "epoch": 0.32537038510955213, + "grad_norm": 0.48127609491348267, + "learning_rate": 0.00013495366459142957, + "loss": 1.3963, + "step": 25039 + }, + { + "epoch": 0.325383379653468, + "grad_norm": 0.39153796434402466, + "learning_rate": 0.0001349510651295182, + "loss": 1.4529, + "step": 25040 + }, + { + "epoch": 0.3253963741973839, + "grad_norm": 0.3022606074810028, + "learning_rate": 0.00013494846566760682, + "loss": 1.5539, + "step": 25041 + }, + { + "epoch": 0.3254093687412997, + "grad_norm": 0.5268205404281616, + "learning_rate": 0.00013494586620569542, + "loss": 1.3033, + "step": 25042 + }, + { + "epoch": 0.3254223632852156, + "grad_norm": 0.35959258675575256, + "learning_rate": 0.00013494326674378404, + "loss": 1.4074, + "step": 25043 + }, + { + "epoch": 0.32543535782913147, + "grad_norm": 0.4490048885345459, + "learning_rate": 0.00013494066728187267, + "loss": 1.36, + "step": 25044 + }, + { + "epoch": 0.32544835237304737, + "grad_norm": 0.3637920916080475, + "learning_rate": 0.0001349380678199613, + "loss": 1.4068, + "step": 25045 + }, + { + "epoch": 0.32546134691696327, + "grad_norm": 0.36366933584213257, + "learning_rate": 0.0001349354683580499, + "loss": 1.4479, + "step": 25046 + }, + { + "epoch": 0.3254743414608791, + "grad_norm": 0.45708683133125305, + "learning_rate": 0.00013493286889613849, + "loss": 1.3328, + "step": 25047 + }, + { + "epoch": 0.325487336004795, + "grad_norm": 0.3925706446170807, + "learning_rate": 0.00013493026943422714, + "loss": 1.3605, + "step": 25048 + }, + { + "epoch": 0.32550033054871086, + "grad_norm": 0.33771613240242004, + "learning_rate": 0.00013492766997231573, + "loss": 1.4516, + "step": 25049 + }, + { + "epoch": 0.32551332509262676, + "grad_norm": 0.4395461678504944, + "learning_rate": 0.00013492507051040436, + "loss": 1.6518, + "step": 25050 + }, + { + "epoch": 0.3255263196365426, + "grad_norm": 0.4612828195095062, + "learning_rate": 0.00013492247104849296, + "loss": 1.4781, + "step": 25051 + }, + { + "epoch": 0.3255393141804585, + "grad_norm": 0.34174174070358276, + "learning_rate": 0.00013491987158658158, + "loss": 1.2694, + "step": 25052 + }, + { + "epoch": 0.32555230872437435, + "grad_norm": 0.36931562423706055, + "learning_rate": 0.0001349172721246702, + "loss": 1.4181, + "step": 25053 + }, + { + "epoch": 0.32556530326829025, + "grad_norm": 0.3899327218532562, + "learning_rate": 0.0001349146726627588, + "loss": 1.3871, + "step": 25054 + }, + { + "epoch": 0.3255782978122061, + "grad_norm": 0.42077282071113586, + "learning_rate": 0.00013491207320084743, + "loss": 1.4662, + "step": 25055 + }, + { + "epoch": 0.325591292356122, + "grad_norm": 0.4258863925933838, + "learning_rate": 0.00013490947373893605, + "loss": 1.4813, + "step": 25056 + }, + { + "epoch": 0.32560428690003784, + "grad_norm": 0.5056393146514893, + "learning_rate": 0.00013490687427702468, + "loss": 1.5901, + "step": 25057 + }, + { + "epoch": 0.32561728144395374, + "grad_norm": 0.4341283142566681, + "learning_rate": 0.00013490427481511327, + "loss": 1.4062, + "step": 25058 + }, + { + "epoch": 0.3256302759878696, + "grad_norm": 0.38846975564956665, + "learning_rate": 0.00013490167535320187, + "loss": 1.5391, + "step": 25059 + }, + { + "epoch": 0.3256432705317855, + "grad_norm": 0.39466699957847595, + "learning_rate": 0.00013489907589129052, + "loss": 1.1182, + "step": 25060 + }, + { + "epoch": 0.32565626507570133, + "grad_norm": 0.32375431060791016, + "learning_rate": 0.00013489647642937912, + "loss": 1.2492, + "step": 25061 + }, + { + "epoch": 0.32566925961961724, + "grad_norm": 0.36306890845298767, + "learning_rate": 0.00013489387696746774, + "loss": 1.4695, + "step": 25062 + }, + { + "epoch": 0.3256822541635331, + "grad_norm": 0.2906920313835144, + "learning_rate": 0.00013489127750555634, + "loss": 1.1623, + "step": 25063 + }, + { + "epoch": 0.325695248707449, + "grad_norm": 0.41639089584350586, + "learning_rate": 0.00013488867804364497, + "loss": 1.5278, + "step": 25064 + }, + { + "epoch": 0.3257082432513648, + "grad_norm": 0.40844419598579407, + "learning_rate": 0.0001348860785817336, + "loss": 1.29, + "step": 25065 + }, + { + "epoch": 0.3257212377952807, + "grad_norm": 0.40919041633605957, + "learning_rate": 0.0001348834791198222, + "loss": 1.4418, + "step": 25066 + }, + { + "epoch": 0.32573423233919657, + "grad_norm": 0.4001988470554352, + "learning_rate": 0.0001348808796579108, + "loss": 1.4395, + "step": 25067 + }, + { + "epoch": 0.3257472268831125, + "grad_norm": 0.38842952251434326, + "learning_rate": 0.00013487828019599944, + "loss": 1.336, + "step": 25068 + }, + { + "epoch": 0.3257602214270283, + "grad_norm": 0.4620618224143982, + "learning_rate": 0.00013487568073408806, + "loss": 1.559, + "step": 25069 + }, + { + "epoch": 0.3257732159709442, + "grad_norm": 0.4246152639389038, + "learning_rate": 0.00013487308127217666, + "loss": 1.4155, + "step": 25070 + }, + { + "epoch": 0.32578621051486006, + "grad_norm": 0.35777661204338074, + "learning_rate": 0.00013487048181026528, + "loss": 1.4633, + "step": 25071 + }, + { + "epoch": 0.32579920505877596, + "grad_norm": 0.3936436176300049, + "learning_rate": 0.0001348678823483539, + "loss": 1.4218, + "step": 25072 + }, + { + "epoch": 0.3258121996026918, + "grad_norm": 0.4077737331390381, + "learning_rate": 0.0001348652828864425, + "loss": 1.3473, + "step": 25073 + }, + { + "epoch": 0.3258251941466077, + "grad_norm": 0.5033107995986938, + "learning_rate": 0.00013486268342453113, + "loss": 1.3831, + "step": 25074 + }, + { + "epoch": 0.32583818869052356, + "grad_norm": 0.34461936354637146, + "learning_rate": 0.00013486008396261975, + "loss": 1.3294, + "step": 25075 + }, + { + "epoch": 0.32585118323443946, + "grad_norm": 0.34761327505111694, + "learning_rate": 0.00013485748450070835, + "loss": 1.3322, + "step": 25076 + }, + { + "epoch": 0.3258641777783553, + "grad_norm": 0.4112671911716461, + "learning_rate": 0.00013485488503879698, + "loss": 1.4145, + "step": 25077 + }, + { + "epoch": 0.3258771723222712, + "grad_norm": 0.34792712330818176, + "learning_rate": 0.00013485228557688557, + "loss": 1.4282, + "step": 25078 + }, + { + "epoch": 0.32589016686618705, + "grad_norm": 0.49308788776397705, + "learning_rate": 0.00013484968611497423, + "loss": 1.4411, + "step": 25079 + }, + { + "epoch": 0.32590316141010295, + "grad_norm": 0.3949072062969208, + "learning_rate": 0.00013484708665306282, + "loss": 1.3714, + "step": 25080 + }, + { + "epoch": 0.3259161559540188, + "grad_norm": 0.41147202253341675, + "learning_rate": 0.00013484448719115145, + "loss": 1.4397, + "step": 25081 + }, + { + "epoch": 0.3259291504979347, + "grad_norm": 0.3794117867946625, + "learning_rate": 0.00013484188772924004, + "loss": 1.5291, + "step": 25082 + }, + { + "epoch": 0.32594214504185054, + "grad_norm": 0.4321388304233551, + "learning_rate": 0.00013483928826732867, + "loss": 1.4051, + "step": 25083 + }, + { + "epoch": 0.32595513958576644, + "grad_norm": 0.44398024678230286, + "learning_rate": 0.0001348366888054173, + "loss": 1.412, + "step": 25084 + }, + { + "epoch": 0.3259681341296823, + "grad_norm": 0.4830962121486664, + "learning_rate": 0.0001348340893435059, + "loss": 1.4736, + "step": 25085 + }, + { + "epoch": 0.3259811286735982, + "grad_norm": 0.3724367320537567, + "learning_rate": 0.00013483148988159452, + "loss": 1.2677, + "step": 25086 + }, + { + "epoch": 0.32599412321751403, + "grad_norm": 0.36601653695106506, + "learning_rate": 0.00013482889041968314, + "loss": 1.4373, + "step": 25087 + }, + { + "epoch": 0.32600711776142993, + "grad_norm": 0.36193615198135376, + "learning_rate": 0.00013482629095777174, + "loss": 1.3491, + "step": 25088 + }, + { + "epoch": 0.3260201123053458, + "grad_norm": 0.425514817237854, + "learning_rate": 0.00013482369149586036, + "loss": 1.4519, + "step": 25089 + }, + { + "epoch": 0.3260331068492617, + "grad_norm": 0.4317486882209778, + "learning_rate": 0.00013482109203394896, + "loss": 1.3881, + "step": 25090 + }, + { + "epoch": 0.3260461013931775, + "grad_norm": 0.5020955801010132, + "learning_rate": 0.0001348184925720376, + "loss": 1.5875, + "step": 25091 + }, + { + "epoch": 0.3260590959370934, + "grad_norm": 0.41325709223747253, + "learning_rate": 0.0001348158931101262, + "loss": 1.436, + "step": 25092 + }, + { + "epoch": 0.32607209048100927, + "grad_norm": 0.36766985058784485, + "learning_rate": 0.00013481329364821483, + "loss": 1.3766, + "step": 25093 + }, + { + "epoch": 0.32608508502492517, + "grad_norm": 0.39926642179489136, + "learning_rate": 0.00013481069418630343, + "loss": 1.4934, + "step": 25094 + }, + { + "epoch": 0.326098079568841, + "grad_norm": 0.2830159664154053, + "learning_rate": 0.00013480809472439205, + "loss": 1.1838, + "step": 25095 + }, + { + "epoch": 0.3261110741127569, + "grad_norm": 0.34247270226478577, + "learning_rate": 0.00013480549526248068, + "loss": 1.373, + "step": 25096 + }, + { + "epoch": 0.32612406865667276, + "grad_norm": 0.38097715377807617, + "learning_rate": 0.00013480289580056928, + "loss": 1.2018, + "step": 25097 + }, + { + "epoch": 0.32613706320058866, + "grad_norm": 0.4696395993232727, + "learning_rate": 0.0001348002963386579, + "loss": 1.3039, + "step": 25098 + }, + { + "epoch": 0.3261500577445045, + "grad_norm": 0.300123393535614, + "learning_rate": 0.00013479769687674653, + "loss": 1.4659, + "step": 25099 + }, + { + "epoch": 0.3261630522884204, + "grad_norm": 0.3671986162662506, + "learning_rate": 0.00013479509741483515, + "loss": 1.2592, + "step": 25100 + }, + { + "epoch": 0.32617604683233625, + "grad_norm": 0.45503172278404236, + "learning_rate": 0.00013479249795292375, + "loss": 1.5242, + "step": 25101 + }, + { + "epoch": 0.32618904137625215, + "grad_norm": 0.4303828775882721, + "learning_rate": 0.00013478989849101234, + "loss": 1.4291, + "step": 25102 + }, + { + "epoch": 0.326202035920168, + "grad_norm": 0.4110209047794342, + "learning_rate": 0.000134787299029101, + "loss": 1.3396, + "step": 25103 + }, + { + "epoch": 0.3262150304640839, + "grad_norm": 0.41984519362449646, + "learning_rate": 0.0001347846995671896, + "loss": 1.3952, + "step": 25104 + }, + { + "epoch": 0.32622802500799974, + "grad_norm": 0.3950515687465668, + "learning_rate": 0.00013478210010527822, + "loss": 1.1887, + "step": 25105 + }, + { + "epoch": 0.32624101955191565, + "grad_norm": 0.2811693847179413, + "learning_rate": 0.00013477950064336684, + "loss": 1.3754, + "step": 25106 + }, + { + "epoch": 0.3262540140958315, + "grad_norm": 0.34692201018333435, + "learning_rate": 0.00013477690118145544, + "loss": 1.5367, + "step": 25107 + }, + { + "epoch": 0.3262670086397474, + "grad_norm": 0.3976317346096039, + "learning_rate": 0.00013477430171954406, + "loss": 1.3272, + "step": 25108 + }, + { + "epoch": 0.32628000318366324, + "grad_norm": 0.3629496693611145, + "learning_rate": 0.00013477170225763266, + "loss": 1.2415, + "step": 25109 + }, + { + "epoch": 0.32629299772757914, + "grad_norm": 0.37405526638031006, + "learning_rate": 0.0001347691027957213, + "loss": 1.4472, + "step": 25110 + }, + { + "epoch": 0.326305992271495, + "grad_norm": 0.37751832604408264, + "learning_rate": 0.0001347665033338099, + "loss": 1.1198, + "step": 25111 + }, + { + "epoch": 0.3263189868154109, + "grad_norm": 0.4611285328865051, + "learning_rate": 0.00013476390387189854, + "loss": 1.3738, + "step": 25112 + }, + { + "epoch": 0.32633198135932673, + "grad_norm": 0.37156814336776733, + "learning_rate": 0.00013476130440998713, + "loss": 1.2268, + "step": 25113 + }, + { + "epoch": 0.32634497590324263, + "grad_norm": 0.4042040705680847, + "learning_rate": 0.00013475870494807576, + "loss": 1.4387, + "step": 25114 + }, + { + "epoch": 0.3263579704471585, + "grad_norm": 0.3807907700538635, + "learning_rate": 0.00013475610548616438, + "loss": 1.2069, + "step": 25115 + }, + { + "epoch": 0.3263709649910744, + "grad_norm": 0.3175273537635803, + "learning_rate": 0.00013475350602425298, + "loss": 1.4251, + "step": 25116 + }, + { + "epoch": 0.3263839595349902, + "grad_norm": 0.46717217564582825, + "learning_rate": 0.0001347509065623416, + "loss": 1.335, + "step": 25117 + }, + { + "epoch": 0.3263969540789061, + "grad_norm": 0.38880303502082825, + "learning_rate": 0.00013474830710043023, + "loss": 1.4468, + "step": 25118 + }, + { + "epoch": 0.32640994862282197, + "grad_norm": 0.35459861159324646, + "learning_rate": 0.00013474570763851883, + "loss": 1.1196, + "step": 25119 + }, + { + "epoch": 0.32642294316673787, + "grad_norm": 0.42634567618370056, + "learning_rate": 0.00013474310817660745, + "loss": 1.2734, + "step": 25120 + }, + { + "epoch": 0.3264359377106537, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.00013474050871469605, + "loss": 1.4219, + "step": 25121 + }, + { + "epoch": 0.3264489322545696, + "grad_norm": 0.4204493463039398, + "learning_rate": 0.0001347379092527847, + "loss": 1.2902, + "step": 25122 + }, + { + "epoch": 0.3264619267984855, + "grad_norm": 0.4918639361858368, + "learning_rate": 0.0001347353097908733, + "loss": 1.4356, + "step": 25123 + }, + { + "epoch": 0.32647492134240136, + "grad_norm": 0.3572603166103363, + "learning_rate": 0.00013473271032896192, + "loss": 1.3307, + "step": 25124 + }, + { + "epoch": 0.32648791588631726, + "grad_norm": 0.42816242575645447, + "learning_rate": 0.00013473011086705052, + "loss": 1.4716, + "step": 25125 + }, + { + "epoch": 0.3265009104302331, + "grad_norm": 0.3943621516227722, + "learning_rate": 0.00013472751140513914, + "loss": 1.4461, + "step": 25126 + }, + { + "epoch": 0.326513904974149, + "grad_norm": 0.48014211654663086, + "learning_rate": 0.00013472491194322777, + "loss": 1.5333, + "step": 25127 + }, + { + "epoch": 0.32652689951806485, + "grad_norm": 0.36372148990631104, + "learning_rate": 0.00013472231248131636, + "loss": 1.3142, + "step": 25128 + }, + { + "epoch": 0.32653989406198075, + "grad_norm": 0.4038459062576294, + "learning_rate": 0.000134719713019405, + "loss": 1.4598, + "step": 25129 + }, + { + "epoch": 0.3265528886058966, + "grad_norm": 0.4052509069442749, + "learning_rate": 0.0001347171135574936, + "loss": 1.3734, + "step": 25130 + }, + { + "epoch": 0.3265658831498125, + "grad_norm": 0.32374992966651917, + "learning_rate": 0.0001347145140955822, + "loss": 1.5525, + "step": 25131 + }, + { + "epoch": 0.32657887769372834, + "grad_norm": 0.22822009027004242, + "learning_rate": 0.00013471191463367084, + "loss": 1.4732, + "step": 25132 + }, + { + "epoch": 0.32659187223764424, + "grad_norm": 0.4433041214942932, + "learning_rate": 0.00013470931517175943, + "loss": 1.4581, + "step": 25133 + }, + { + "epoch": 0.3266048667815601, + "grad_norm": 0.392037957906723, + "learning_rate": 0.00013470671570984808, + "loss": 1.6374, + "step": 25134 + }, + { + "epoch": 0.326617861325476, + "grad_norm": 0.47803208231925964, + "learning_rate": 0.00013470411624793668, + "loss": 1.3338, + "step": 25135 + }, + { + "epoch": 0.32663085586939183, + "grad_norm": 0.4125545620918274, + "learning_rate": 0.0001347015167860253, + "loss": 1.4345, + "step": 25136 + }, + { + "epoch": 0.32664385041330773, + "grad_norm": 0.390426367521286, + "learning_rate": 0.0001346989173241139, + "loss": 1.2749, + "step": 25137 + }, + { + "epoch": 0.3266568449572236, + "grad_norm": 0.37080010771751404, + "learning_rate": 0.00013469631786220253, + "loss": 1.4645, + "step": 25138 + }, + { + "epoch": 0.3266698395011395, + "grad_norm": 0.34431877732276917, + "learning_rate": 0.00013469371840029115, + "loss": 1.3404, + "step": 25139 + }, + { + "epoch": 0.3266828340450553, + "grad_norm": 0.5365375876426697, + "learning_rate": 0.00013469111893837975, + "loss": 1.4166, + "step": 25140 + }, + { + "epoch": 0.3266958285889712, + "grad_norm": 0.506577730178833, + "learning_rate": 0.00013468851947646837, + "loss": 1.5908, + "step": 25141 + }, + { + "epoch": 0.32670882313288707, + "grad_norm": 0.43247413635253906, + "learning_rate": 0.000134685920014557, + "loss": 1.3231, + "step": 25142 + }, + { + "epoch": 0.326721817676803, + "grad_norm": 0.3522419333457947, + "learning_rate": 0.0001346833205526456, + "loss": 1.318, + "step": 25143 + }, + { + "epoch": 0.3267348122207188, + "grad_norm": 0.4904137849807739, + "learning_rate": 0.00013468072109073422, + "loss": 1.5135, + "step": 25144 + }, + { + "epoch": 0.3267478067646347, + "grad_norm": 0.32583436369895935, + "learning_rate": 0.00013467812162882284, + "loss": 1.3889, + "step": 25145 + }, + { + "epoch": 0.32676080130855056, + "grad_norm": 0.4270898401737213, + "learning_rate": 0.00013467552216691147, + "loss": 1.4651, + "step": 25146 + }, + { + "epoch": 0.32677379585246646, + "grad_norm": 0.4327910840511322, + "learning_rate": 0.00013467292270500007, + "loss": 1.3234, + "step": 25147 + }, + { + "epoch": 0.3267867903963823, + "grad_norm": 0.3867788314819336, + "learning_rate": 0.0001346703232430887, + "loss": 1.3614, + "step": 25148 + }, + { + "epoch": 0.3267997849402982, + "grad_norm": 0.416170209646225, + "learning_rate": 0.00013466772378117732, + "loss": 1.3736, + "step": 25149 + }, + { + "epoch": 0.32681277948421406, + "grad_norm": 0.4406786859035492, + "learning_rate": 0.0001346651243192659, + "loss": 1.4754, + "step": 25150 + }, + { + "epoch": 0.32682577402812996, + "grad_norm": 0.41690707206726074, + "learning_rate": 0.00013466252485735454, + "loss": 1.4068, + "step": 25151 + }, + { + "epoch": 0.3268387685720458, + "grad_norm": 0.3712075352668762, + "learning_rate": 0.00013465992539544314, + "loss": 1.3816, + "step": 25152 + }, + { + "epoch": 0.3268517631159617, + "grad_norm": 0.4014545977115631, + "learning_rate": 0.0001346573259335318, + "loss": 1.5239, + "step": 25153 + }, + { + "epoch": 0.32686475765987755, + "grad_norm": 0.22958482801914215, + "learning_rate": 0.00013465472647162038, + "loss": 0.9796, + "step": 25154 + }, + { + "epoch": 0.32687775220379345, + "grad_norm": 0.3111101984977722, + "learning_rate": 0.00013465212700970898, + "loss": 1.3745, + "step": 25155 + }, + { + "epoch": 0.3268907467477093, + "grad_norm": 0.49364593625068665, + "learning_rate": 0.0001346495275477976, + "loss": 1.5498, + "step": 25156 + }, + { + "epoch": 0.3269037412916252, + "grad_norm": 0.3933982849121094, + "learning_rate": 0.00013464692808588623, + "loss": 1.2648, + "step": 25157 + }, + { + "epoch": 0.32691673583554104, + "grad_norm": 0.3878893554210663, + "learning_rate": 0.00013464432862397485, + "loss": 1.3151, + "step": 25158 + }, + { + "epoch": 0.32692973037945694, + "grad_norm": 0.37900814414024353, + "learning_rate": 0.00013464172916206345, + "loss": 1.4053, + "step": 25159 + }, + { + "epoch": 0.3269427249233728, + "grad_norm": 0.4502893388271332, + "learning_rate": 0.00013463912970015208, + "loss": 1.5794, + "step": 25160 + }, + { + "epoch": 0.3269557194672887, + "grad_norm": 0.3121495544910431, + "learning_rate": 0.0001346365302382407, + "loss": 1.399, + "step": 25161 + }, + { + "epoch": 0.32696871401120453, + "grad_norm": 0.4192117750644684, + "learning_rate": 0.0001346339307763293, + "loss": 1.416, + "step": 25162 + }, + { + "epoch": 0.32698170855512043, + "grad_norm": 0.43233877420425415, + "learning_rate": 0.00013463133131441792, + "loss": 1.4482, + "step": 25163 + }, + { + "epoch": 0.3269947030990363, + "grad_norm": 0.4069402813911438, + "learning_rate": 0.00013462873185250652, + "loss": 1.4392, + "step": 25164 + }, + { + "epoch": 0.3270076976429522, + "grad_norm": 0.41604119539260864, + "learning_rate": 0.00013462613239059517, + "loss": 1.4955, + "step": 25165 + }, + { + "epoch": 0.327020692186868, + "grad_norm": 0.35321110486984253, + "learning_rate": 0.00013462353292868377, + "loss": 1.4087, + "step": 25166 + }, + { + "epoch": 0.3270336867307839, + "grad_norm": 0.4783344566822052, + "learning_rate": 0.0001346209334667724, + "loss": 1.3354, + "step": 25167 + }, + { + "epoch": 0.32704668127469977, + "grad_norm": 0.3716732859611511, + "learning_rate": 0.000134618334004861, + "loss": 1.6129, + "step": 25168 + }, + { + "epoch": 0.32705967581861567, + "grad_norm": 0.3891408145427704, + "learning_rate": 0.00013461573454294962, + "loss": 1.3723, + "step": 25169 + }, + { + "epoch": 0.3270726703625315, + "grad_norm": 0.44201067090034485, + "learning_rate": 0.00013461313508103824, + "loss": 1.431, + "step": 25170 + }, + { + "epoch": 0.3270856649064474, + "grad_norm": 0.3926926553249359, + "learning_rate": 0.00013461053561912684, + "loss": 1.1106, + "step": 25171 + }, + { + "epoch": 0.32709865945036326, + "grad_norm": 0.3201066255569458, + "learning_rate": 0.00013460793615721546, + "loss": 1.2104, + "step": 25172 + }, + { + "epoch": 0.32711165399427916, + "grad_norm": 0.4122949242591858, + "learning_rate": 0.0001346053366953041, + "loss": 1.3383, + "step": 25173 + }, + { + "epoch": 0.327124648538195, + "grad_norm": 0.5389511585235596, + "learning_rate": 0.00013460273723339268, + "loss": 1.4564, + "step": 25174 + }, + { + "epoch": 0.3271376430821109, + "grad_norm": 0.4164218008518219, + "learning_rate": 0.0001346001377714813, + "loss": 1.6989, + "step": 25175 + }, + { + "epoch": 0.32715063762602675, + "grad_norm": 0.40415647625923157, + "learning_rate": 0.0001345975383095699, + "loss": 1.3596, + "step": 25176 + }, + { + "epoch": 0.32716363216994265, + "grad_norm": 0.4228530824184418, + "learning_rate": 0.00013459493884765856, + "loss": 1.4409, + "step": 25177 + }, + { + "epoch": 0.3271766267138585, + "grad_norm": 0.4279729723930359, + "learning_rate": 0.00013459233938574715, + "loss": 1.3715, + "step": 25178 + }, + { + "epoch": 0.3271896212577744, + "grad_norm": 0.4416216015815735, + "learning_rate": 0.00013458973992383578, + "loss": 1.3926, + "step": 25179 + }, + { + "epoch": 0.32720261580169024, + "grad_norm": 0.4428456425666809, + "learning_rate": 0.0001345871404619244, + "loss": 1.4309, + "step": 25180 + }, + { + "epoch": 0.32721561034560614, + "grad_norm": 0.485161691904068, + "learning_rate": 0.000134584541000013, + "loss": 1.4378, + "step": 25181 + }, + { + "epoch": 0.327228604889522, + "grad_norm": 0.42604485154151917, + "learning_rate": 0.00013458194153810163, + "loss": 1.2462, + "step": 25182 + }, + { + "epoch": 0.3272415994334379, + "grad_norm": 0.510232150554657, + "learning_rate": 0.00013457934207619022, + "loss": 1.5616, + "step": 25183 + }, + { + "epoch": 0.32725459397735374, + "grad_norm": 0.43469035625457764, + "learning_rate": 0.00013457674261427887, + "loss": 1.583, + "step": 25184 + }, + { + "epoch": 0.32726758852126964, + "grad_norm": 0.4172048270702362, + "learning_rate": 0.00013457414315236747, + "loss": 1.3413, + "step": 25185 + }, + { + "epoch": 0.3272805830651855, + "grad_norm": 0.39685049653053284, + "learning_rate": 0.00013457154369045607, + "loss": 1.4196, + "step": 25186 + }, + { + "epoch": 0.3272935776091014, + "grad_norm": 0.43034377694129944, + "learning_rate": 0.0001345689442285447, + "loss": 1.4554, + "step": 25187 + }, + { + "epoch": 0.32730657215301723, + "grad_norm": 0.409824013710022, + "learning_rate": 0.00013456634476663332, + "loss": 1.3962, + "step": 25188 + }, + { + "epoch": 0.32731956669693313, + "grad_norm": 0.45401108264923096, + "learning_rate": 0.00013456374530472194, + "loss": 1.4953, + "step": 25189 + }, + { + "epoch": 0.327332561240849, + "grad_norm": 0.35680845379829407, + "learning_rate": 0.00013456114584281054, + "loss": 1.2949, + "step": 25190 + }, + { + "epoch": 0.3273455557847649, + "grad_norm": 0.313738077878952, + "learning_rate": 0.00013455854638089916, + "loss": 1.3872, + "step": 25191 + }, + { + "epoch": 0.3273585503286807, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0001345559469189878, + "loss": 1.3798, + "step": 25192 + }, + { + "epoch": 0.3273715448725966, + "grad_norm": 0.37819743156433105, + "learning_rate": 0.0001345533474570764, + "loss": 1.3276, + "step": 25193 + }, + { + "epoch": 0.32738453941651247, + "grad_norm": 0.4011194407939911, + "learning_rate": 0.000134550747995165, + "loss": 1.5889, + "step": 25194 + }, + { + "epoch": 0.32739753396042837, + "grad_norm": 0.37969887256622314, + "learning_rate": 0.0001345481485332536, + "loss": 1.3793, + "step": 25195 + }, + { + "epoch": 0.3274105285043442, + "grad_norm": 0.412293016910553, + "learning_rate": 0.00013454554907134226, + "loss": 1.3066, + "step": 25196 + }, + { + "epoch": 0.3274235230482601, + "grad_norm": 0.39382970333099365, + "learning_rate": 0.00013454294960943086, + "loss": 1.587, + "step": 25197 + }, + { + "epoch": 0.32743651759217596, + "grad_norm": 0.37101104855537415, + "learning_rate": 0.00013454035014751945, + "loss": 1.205, + "step": 25198 + }, + { + "epoch": 0.32744951213609186, + "grad_norm": 0.38875311613082886, + "learning_rate": 0.00013453775068560808, + "loss": 1.5434, + "step": 25199 + }, + { + "epoch": 0.32746250668000776, + "grad_norm": 0.4111554026603699, + "learning_rate": 0.0001345351512236967, + "loss": 1.4005, + "step": 25200 + }, + { + "epoch": 0.3274755012239236, + "grad_norm": 0.38296395540237427, + "learning_rate": 0.00013453255176178533, + "loss": 1.4733, + "step": 25201 + }, + { + "epoch": 0.3274884957678395, + "grad_norm": 0.4104140102863312, + "learning_rate": 0.00013452995229987393, + "loss": 1.2903, + "step": 25202 + }, + { + "epoch": 0.32750149031175535, + "grad_norm": 0.4143497347831726, + "learning_rate": 0.00013452735283796255, + "loss": 1.3873, + "step": 25203 + }, + { + "epoch": 0.32751448485567125, + "grad_norm": 0.40942829847335815, + "learning_rate": 0.00013452475337605117, + "loss": 1.2982, + "step": 25204 + }, + { + "epoch": 0.3275274793995871, + "grad_norm": 0.4146082401275635, + "learning_rate": 0.00013452215391413977, + "loss": 1.2994, + "step": 25205 + }, + { + "epoch": 0.327540473943503, + "grad_norm": 0.39570704102516174, + "learning_rate": 0.0001345195544522284, + "loss": 1.4247, + "step": 25206 + }, + { + "epoch": 0.32755346848741884, + "grad_norm": 0.27458903193473816, + "learning_rate": 0.000134516954990317, + "loss": 1.4095, + "step": 25207 + }, + { + "epoch": 0.32756646303133474, + "grad_norm": 0.28139635920524597, + "learning_rate": 0.00013451435552840565, + "loss": 1.3069, + "step": 25208 + }, + { + "epoch": 0.3275794575752506, + "grad_norm": 0.37578094005584717, + "learning_rate": 0.00013451175606649424, + "loss": 1.3259, + "step": 25209 + }, + { + "epoch": 0.3275924521191665, + "grad_norm": 0.5164992213249207, + "learning_rate": 0.00013450915660458284, + "loss": 1.683, + "step": 25210 + }, + { + "epoch": 0.32760544666308233, + "grad_norm": 0.4637851417064667, + "learning_rate": 0.00013450655714267146, + "loss": 1.4682, + "step": 25211 + }, + { + "epoch": 0.32761844120699823, + "grad_norm": 0.527933657169342, + "learning_rate": 0.0001345039576807601, + "loss": 1.3064, + "step": 25212 + }, + { + "epoch": 0.3276314357509141, + "grad_norm": 0.34882813692092896, + "learning_rate": 0.00013450135821884871, + "loss": 1.4003, + "step": 25213 + }, + { + "epoch": 0.32764443029483, + "grad_norm": 0.34134653210639954, + "learning_rate": 0.0001344987587569373, + "loss": 1.2132, + "step": 25214 + }, + { + "epoch": 0.3276574248387458, + "grad_norm": 0.37958288192749023, + "learning_rate": 0.00013449615929502594, + "loss": 1.4585, + "step": 25215 + }, + { + "epoch": 0.3276704193826617, + "grad_norm": 0.45946335792541504, + "learning_rate": 0.00013449355983311456, + "loss": 1.5578, + "step": 25216 + }, + { + "epoch": 0.32768341392657757, + "grad_norm": 0.47827282547950745, + "learning_rate": 0.00013449096037120316, + "loss": 1.3504, + "step": 25217 + }, + { + "epoch": 0.32769640847049347, + "grad_norm": 0.3432607054710388, + "learning_rate": 0.00013448836090929178, + "loss": 1.5206, + "step": 25218 + }, + { + "epoch": 0.3277094030144093, + "grad_norm": 0.48154816031455994, + "learning_rate": 0.0001344857614473804, + "loss": 1.5157, + "step": 25219 + }, + { + "epoch": 0.3277223975583252, + "grad_norm": 0.4297764003276825, + "learning_rate": 0.00013448316198546903, + "loss": 1.3819, + "step": 25220 + }, + { + "epoch": 0.32773539210224106, + "grad_norm": 0.34937822818756104, + "learning_rate": 0.00013448056252355763, + "loss": 1.3598, + "step": 25221 + }, + { + "epoch": 0.32774838664615696, + "grad_norm": 0.36012890934944153, + "learning_rate": 0.00013447796306164625, + "loss": 1.3775, + "step": 25222 + }, + { + "epoch": 0.3277613811900728, + "grad_norm": 0.3779444992542267, + "learning_rate": 0.00013447536359973488, + "loss": 1.3051, + "step": 25223 + }, + { + "epoch": 0.3277743757339887, + "grad_norm": 0.33696267008781433, + "learning_rate": 0.00013447276413782347, + "loss": 1.4542, + "step": 25224 + }, + { + "epoch": 0.32778737027790456, + "grad_norm": 0.3947320878505707, + "learning_rate": 0.0001344701646759121, + "loss": 1.4633, + "step": 25225 + }, + { + "epoch": 0.32780036482182046, + "grad_norm": 0.36321836709976196, + "learning_rate": 0.0001344675652140007, + "loss": 1.5153, + "step": 25226 + }, + { + "epoch": 0.3278133593657363, + "grad_norm": 0.49338874220848083, + "learning_rate": 0.00013446496575208932, + "loss": 1.4833, + "step": 25227 + }, + { + "epoch": 0.3278263539096522, + "grad_norm": 0.37824904918670654, + "learning_rate": 0.00013446236629017795, + "loss": 1.4234, + "step": 25228 + }, + { + "epoch": 0.32783934845356805, + "grad_norm": 0.36573806405067444, + "learning_rate": 0.00013445976682826654, + "loss": 1.3192, + "step": 25229 + }, + { + "epoch": 0.32785234299748395, + "grad_norm": 0.40590664744377136, + "learning_rate": 0.00013445716736635517, + "loss": 1.4116, + "step": 25230 + }, + { + "epoch": 0.3278653375413998, + "grad_norm": 0.35290586948394775, + "learning_rate": 0.0001344545679044438, + "loss": 1.2989, + "step": 25231 + }, + { + "epoch": 0.3278783320853157, + "grad_norm": 0.35414546728134155, + "learning_rate": 0.00013445196844253242, + "loss": 1.5103, + "step": 25232 + }, + { + "epoch": 0.32789132662923154, + "grad_norm": 0.41480574011802673, + "learning_rate": 0.000134449368980621, + "loss": 1.5032, + "step": 25233 + }, + { + "epoch": 0.32790432117314744, + "grad_norm": 0.40544453263282776, + "learning_rate": 0.00013444676951870964, + "loss": 1.368, + "step": 25234 + }, + { + "epoch": 0.3279173157170633, + "grad_norm": 0.38026750087738037, + "learning_rate": 0.00013444417005679826, + "loss": 1.6145, + "step": 25235 + }, + { + "epoch": 0.3279303102609792, + "grad_norm": 0.46215152740478516, + "learning_rate": 0.00013444157059488686, + "loss": 1.1759, + "step": 25236 + }, + { + "epoch": 0.32794330480489503, + "grad_norm": 0.3808053135871887, + "learning_rate": 0.00013443897113297548, + "loss": 1.4905, + "step": 25237 + }, + { + "epoch": 0.32795629934881093, + "grad_norm": 0.40001416206359863, + "learning_rate": 0.00013443637167106408, + "loss": 1.3214, + "step": 25238 + }, + { + "epoch": 0.3279692938927268, + "grad_norm": 0.39595237374305725, + "learning_rate": 0.0001344337722091527, + "loss": 1.5032, + "step": 25239 + }, + { + "epoch": 0.3279822884366427, + "grad_norm": 0.40611952543258667, + "learning_rate": 0.00013443117274724133, + "loss": 1.6659, + "step": 25240 + }, + { + "epoch": 0.3279952829805585, + "grad_norm": 0.3672429919242859, + "learning_rate": 0.00013442857328532993, + "loss": 1.1421, + "step": 25241 + }, + { + "epoch": 0.3280082775244744, + "grad_norm": 0.44474703073501587, + "learning_rate": 0.00013442597382341855, + "loss": 1.5541, + "step": 25242 + }, + { + "epoch": 0.32802127206839027, + "grad_norm": 0.4027102291584015, + "learning_rate": 0.00013442337436150718, + "loss": 1.5021, + "step": 25243 + }, + { + "epoch": 0.32803426661230617, + "grad_norm": 0.3847944438457489, + "learning_rate": 0.0001344207748995958, + "loss": 1.5262, + "step": 25244 + }, + { + "epoch": 0.328047261156222, + "grad_norm": 0.42310675978660583, + "learning_rate": 0.0001344181754376844, + "loss": 1.3288, + "step": 25245 + }, + { + "epoch": 0.3280602557001379, + "grad_norm": 0.41820088028907776, + "learning_rate": 0.00013441557597577302, + "loss": 1.4223, + "step": 25246 + }, + { + "epoch": 0.32807325024405376, + "grad_norm": 0.3190530836582184, + "learning_rate": 0.00013441297651386165, + "loss": 1.3857, + "step": 25247 + }, + { + "epoch": 0.32808624478796966, + "grad_norm": 0.41750451922416687, + "learning_rate": 0.00013441037705195025, + "loss": 1.3981, + "step": 25248 + }, + { + "epoch": 0.3280992393318855, + "grad_norm": 0.35738861560821533, + "learning_rate": 0.00013440777759003887, + "loss": 1.3984, + "step": 25249 + }, + { + "epoch": 0.3281122338758014, + "grad_norm": 0.43311652541160583, + "learning_rate": 0.00013440517812812747, + "loss": 1.3313, + "step": 25250 + }, + { + "epoch": 0.32812522841971725, + "grad_norm": 0.35025453567504883, + "learning_rate": 0.00013440257866621612, + "loss": 1.1331, + "step": 25251 + }, + { + "epoch": 0.32813822296363315, + "grad_norm": 0.58076411485672, + "learning_rate": 0.00013439997920430472, + "loss": 1.4556, + "step": 25252 + }, + { + "epoch": 0.328151217507549, + "grad_norm": 0.38680019974708557, + "learning_rate": 0.0001343973797423933, + "loss": 1.0974, + "step": 25253 + }, + { + "epoch": 0.3281642120514649, + "grad_norm": 0.44846659898757935, + "learning_rate": 0.00013439478028048194, + "loss": 1.3886, + "step": 25254 + }, + { + "epoch": 0.32817720659538074, + "grad_norm": 0.3297249674797058, + "learning_rate": 0.00013439218081857056, + "loss": 1.2803, + "step": 25255 + }, + { + "epoch": 0.32819020113929664, + "grad_norm": 0.3909626007080078, + "learning_rate": 0.0001343895813566592, + "loss": 1.3719, + "step": 25256 + }, + { + "epoch": 0.3282031956832125, + "grad_norm": 0.3747754693031311, + "learning_rate": 0.00013438698189474778, + "loss": 1.577, + "step": 25257 + }, + { + "epoch": 0.3282161902271284, + "grad_norm": 0.4181773364543915, + "learning_rate": 0.0001343843824328364, + "loss": 1.3302, + "step": 25258 + }, + { + "epoch": 0.32822918477104424, + "grad_norm": 0.40097978711128235, + "learning_rate": 0.00013438178297092503, + "loss": 1.3113, + "step": 25259 + }, + { + "epoch": 0.32824217931496014, + "grad_norm": 0.388907790184021, + "learning_rate": 0.00013437918350901363, + "loss": 1.4804, + "step": 25260 + }, + { + "epoch": 0.328255173858876, + "grad_norm": 0.39584091305732727, + "learning_rate": 0.00013437658404710226, + "loss": 1.2277, + "step": 25261 + }, + { + "epoch": 0.3282681684027919, + "grad_norm": 0.44478240609169006, + "learning_rate": 0.00013437398458519088, + "loss": 1.6657, + "step": 25262 + }, + { + "epoch": 0.3282811629467077, + "grad_norm": 0.35414913296699524, + "learning_rate": 0.0001343713851232795, + "loss": 1.3972, + "step": 25263 + }, + { + "epoch": 0.32829415749062363, + "grad_norm": 0.3457530736923218, + "learning_rate": 0.0001343687856613681, + "loss": 1.229, + "step": 25264 + }, + { + "epoch": 0.3283071520345395, + "grad_norm": 0.3642204701900482, + "learning_rate": 0.0001343661861994567, + "loss": 1.3387, + "step": 25265 + }, + { + "epoch": 0.3283201465784554, + "grad_norm": 0.46774348616600037, + "learning_rate": 0.00013436358673754535, + "loss": 1.4054, + "step": 25266 + }, + { + "epoch": 0.3283331411223712, + "grad_norm": 0.46813157200813293, + "learning_rate": 0.00013436098727563395, + "loss": 1.5388, + "step": 25267 + }, + { + "epoch": 0.3283461356662871, + "grad_norm": 0.4620802104473114, + "learning_rate": 0.00013435838781372257, + "loss": 1.4207, + "step": 25268 + }, + { + "epoch": 0.32835913021020297, + "grad_norm": 0.37374627590179443, + "learning_rate": 0.00013435578835181117, + "loss": 1.5331, + "step": 25269 + }, + { + "epoch": 0.32837212475411887, + "grad_norm": 0.2873345911502838, + "learning_rate": 0.0001343531888898998, + "loss": 1.296, + "step": 25270 + }, + { + "epoch": 0.3283851192980347, + "grad_norm": 0.35456565022468567, + "learning_rate": 0.00013435058942798842, + "loss": 1.5246, + "step": 25271 + }, + { + "epoch": 0.3283981138419506, + "grad_norm": 0.41262704133987427, + "learning_rate": 0.00013434798996607702, + "loss": 1.4908, + "step": 25272 + }, + { + "epoch": 0.32841110838586646, + "grad_norm": 0.45627617835998535, + "learning_rate": 0.00013434539050416564, + "loss": 1.3803, + "step": 25273 + }, + { + "epoch": 0.32842410292978236, + "grad_norm": 0.40540611743927, + "learning_rate": 0.00013434279104225427, + "loss": 1.4691, + "step": 25274 + }, + { + "epoch": 0.32843709747369826, + "grad_norm": 0.3801732063293457, + "learning_rate": 0.0001343401915803429, + "loss": 1.303, + "step": 25275 + }, + { + "epoch": 0.3284500920176141, + "grad_norm": 0.3658905327320099, + "learning_rate": 0.0001343375921184315, + "loss": 1.5613, + "step": 25276 + }, + { + "epoch": 0.32846308656153, + "grad_norm": 0.48025932908058167, + "learning_rate": 0.00013433499265652008, + "loss": 1.3701, + "step": 25277 + }, + { + "epoch": 0.32847608110544585, + "grad_norm": 0.4760112464427948, + "learning_rate": 0.00013433239319460874, + "loss": 1.4226, + "step": 25278 + }, + { + "epoch": 0.32848907564936175, + "grad_norm": 0.4144611656665802, + "learning_rate": 0.00013432979373269733, + "loss": 1.3887, + "step": 25279 + }, + { + "epoch": 0.3285020701932776, + "grad_norm": 0.46555212140083313, + "learning_rate": 0.00013432719427078596, + "loss": 1.3211, + "step": 25280 + }, + { + "epoch": 0.3285150647371935, + "grad_norm": 0.46026021242141724, + "learning_rate": 0.00013432459480887456, + "loss": 1.5042, + "step": 25281 + }, + { + "epoch": 0.32852805928110934, + "grad_norm": 0.40246525406837463, + "learning_rate": 0.00013432199534696318, + "loss": 1.4642, + "step": 25282 + }, + { + "epoch": 0.32854105382502524, + "grad_norm": 0.4118730127811432, + "learning_rate": 0.0001343193958850518, + "loss": 1.4044, + "step": 25283 + }, + { + "epoch": 0.3285540483689411, + "grad_norm": 0.3817659914493561, + "learning_rate": 0.0001343167964231404, + "loss": 1.4898, + "step": 25284 + }, + { + "epoch": 0.328567042912857, + "grad_norm": 0.3684300482273102, + "learning_rate": 0.00013431419696122903, + "loss": 1.3707, + "step": 25285 + }, + { + "epoch": 0.32858003745677283, + "grad_norm": 0.4501408636569977, + "learning_rate": 0.00013431159749931765, + "loss": 1.5859, + "step": 25286 + }, + { + "epoch": 0.32859303200068873, + "grad_norm": 0.40602347254753113, + "learning_rate": 0.00013430899803740627, + "loss": 1.2278, + "step": 25287 + }, + { + "epoch": 0.3286060265446046, + "grad_norm": 0.3946186900138855, + "learning_rate": 0.00013430639857549487, + "loss": 1.3844, + "step": 25288 + }, + { + "epoch": 0.3286190210885205, + "grad_norm": 0.4072436988353729, + "learning_rate": 0.0001343037991135835, + "loss": 1.2894, + "step": 25289 + }, + { + "epoch": 0.3286320156324363, + "grad_norm": 0.33170178532600403, + "learning_rate": 0.00013430119965167212, + "loss": 1.4158, + "step": 25290 + }, + { + "epoch": 0.3286450101763522, + "grad_norm": 0.27600911259651184, + "learning_rate": 0.00013429860018976072, + "loss": 1.3802, + "step": 25291 + }, + { + "epoch": 0.32865800472026807, + "grad_norm": 0.3940262496471405, + "learning_rate": 0.00013429600072784934, + "loss": 1.436, + "step": 25292 + }, + { + "epoch": 0.32867099926418397, + "grad_norm": 0.30938389897346497, + "learning_rate": 0.00013429340126593797, + "loss": 1.2563, + "step": 25293 + }, + { + "epoch": 0.3286839938080998, + "grad_norm": 0.3945947587490082, + "learning_rate": 0.00013429080180402656, + "loss": 1.5157, + "step": 25294 + }, + { + "epoch": 0.3286969883520157, + "grad_norm": 0.45850762724876404, + "learning_rate": 0.0001342882023421152, + "loss": 1.6917, + "step": 25295 + }, + { + "epoch": 0.32870998289593156, + "grad_norm": 0.33542680740356445, + "learning_rate": 0.0001342856028802038, + "loss": 1.5708, + "step": 25296 + }, + { + "epoch": 0.32872297743984746, + "grad_norm": 0.4499053955078125, + "learning_rate": 0.00013428300341829244, + "loss": 1.5506, + "step": 25297 + }, + { + "epoch": 0.3287359719837633, + "grad_norm": 0.3834080100059509, + "learning_rate": 0.00013428040395638104, + "loss": 1.5082, + "step": 25298 + }, + { + "epoch": 0.3287489665276792, + "grad_norm": 0.4806734323501587, + "learning_rate": 0.00013427780449446966, + "loss": 1.5844, + "step": 25299 + }, + { + "epoch": 0.32876196107159505, + "grad_norm": 0.3630460202693939, + "learning_rate": 0.00013427520503255826, + "loss": 1.3928, + "step": 25300 + }, + { + "epoch": 0.32877495561551096, + "grad_norm": 0.35897013545036316, + "learning_rate": 0.00013427260557064688, + "loss": 1.5129, + "step": 25301 + }, + { + "epoch": 0.3287879501594268, + "grad_norm": 0.5187689661979675, + "learning_rate": 0.0001342700061087355, + "loss": 1.53, + "step": 25302 + }, + { + "epoch": 0.3288009447033427, + "grad_norm": 0.4041348099708557, + "learning_rate": 0.0001342674066468241, + "loss": 1.5057, + "step": 25303 + }, + { + "epoch": 0.32881393924725855, + "grad_norm": 0.36623117327690125, + "learning_rate": 0.00013426480718491273, + "loss": 1.1344, + "step": 25304 + }, + { + "epoch": 0.32882693379117445, + "grad_norm": 0.4020625352859497, + "learning_rate": 0.00013426220772300135, + "loss": 1.4503, + "step": 25305 + }, + { + "epoch": 0.3288399283350903, + "grad_norm": 0.4368743896484375, + "learning_rate": 0.00013425960826108998, + "loss": 1.28, + "step": 25306 + }, + { + "epoch": 0.3288529228790062, + "grad_norm": 0.37784093618392944, + "learning_rate": 0.00013425700879917857, + "loss": 1.3131, + "step": 25307 + }, + { + "epoch": 0.32886591742292204, + "grad_norm": 0.42796891927719116, + "learning_rate": 0.00013425440933726717, + "loss": 1.4572, + "step": 25308 + }, + { + "epoch": 0.32887891196683794, + "grad_norm": 0.3729535937309265, + "learning_rate": 0.00013425180987535582, + "loss": 1.2365, + "step": 25309 + }, + { + "epoch": 0.3288919065107538, + "grad_norm": 0.3360167145729065, + "learning_rate": 0.00013424921041344442, + "loss": 1.4642, + "step": 25310 + }, + { + "epoch": 0.3289049010546697, + "grad_norm": 0.3128669559955597, + "learning_rate": 0.00013424661095153305, + "loss": 1.4907, + "step": 25311 + }, + { + "epoch": 0.32891789559858553, + "grad_norm": 0.40844419598579407, + "learning_rate": 0.00013424401148962164, + "loss": 1.3444, + "step": 25312 + }, + { + "epoch": 0.32893089014250143, + "grad_norm": 0.3773769736289978, + "learning_rate": 0.00013424141202771027, + "loss": 1.4485, + "step": 25313 + }, + { + "epoch": 0.3289438846864173, + "grad_norm": 0.47407764196395874, + "learning_rate": 0.0001342388125657989, + "loss": 1.5178, + "step": 25314 + }, + { + "epoch": 0.3289568792303332, + "grad_norm": 0.4541483521461487, + "learning_rate": 0.0001342362131038875, + "loss": 1.3417, + "step": 25315 + }, + { + "epoch": 0.328969873774249, + "grad_norm": 0.5842644572257996, + "learning_rate": 0.00013423361364197611, + "loss": 1.504, + "step": 25316 + }, + { + "epoch": 0.3289828683181649, + "grad_norm": 0.40054672956466675, + "learning_rate": 0.00013423101418006474, + "loss": 1.466, + "step": 25317 + }, + { + "epoch": 0.32899586286208077, + "grad_norm": 0.250788152217865, + "learning_rate": 0.00013422841471815336, + "loss": 1.358, + "step": 25318 + }, + { + "epoch": 0.32900885740599667, + "grad_norm": 0.39067402482032776, + "learning_rate": 0.00013422581525624196, + "loss": 1.4238, + "step": 25319 + }, + { + "epoch": 0.3290218519499125, + "grad_norm": 0.4361144006252289, + "learning_rate": 0.00013422321579433056, + "loss": 1.3413, + "step": 25320 + }, + { + "epoch": 0.3290348464938284, + "grad_norm": 0.3501947224140167, + "learning_rate": 0.0001342206163324192, + "loss": 1.5347, + "step": 25321 + }, + { + "epoch": 0.32904784103774426, + "grad_norm": 0.5104490518569946, + "learning_rate": 0.0001342180168705078, + "loss": 1.395, + "step": 25322 + }, + { + "epoch": 0.32906083558166016, + "grad_norm": 0.4331497251987457, + "learning_rate": 0.00013421541740859643, + "loss": 1.7013, + "step": 25323 + }, + { + "epoch": 0.329073830125576, + "grad_norm": 0.42801639437675476, + "learning_rate": 0.00013421281794668503, + "loss": 1.5066, + "step": 25324 + }, + { + "epoch": 0.3290868246694919, + "grad_norm": 0.389602929353714, + "learning_rate": 0.00013421021848477365, + "loss": 1.4475, + "step": 25325 + }, + { + "epoch": 0.32909981921340775, + "grad_norm": 0.3439443111419678, + "learning_rate": 0.00013420761902286228, + "loss": 1.3567, + "step": 25326 + }, + { + "epoch": 0.32911281375732365, + "grad_norm": 0.28177815675735474, + "learning_rate": 0.00013420501956095087, + "loss": 1.3039, + "step": 25327 + }, + { + "epoch": 0.3291258083012395, + "grad_norm": 0.4260914921760559, + "learning_rate": 0.0001342024200990395, + "loss": 1.4301, + "step": 25328 + }, + { + "epoch": 0.3291388028451554, + "grad_norm": 0.4295695126056671, + "learning_rate": 0.00013419982063712812, + "loss": 1.5908, + "step": 25329 + }, + { + "epoch": 0.32915179738907124, + "grad_norm": 0.45482027530670166, + "learning_rate": 0.00013419722117521675, + "loss": 1.331, + "step": 25330 + }, + { + "epoch": 0.32916479193298714, + "grad_norm": 0.3673102855682373, + "learning_rate": 0.00013419462171330535, + "loss": 1.4821, + "step": 25331 + }, + { + "epoch": 0.329177786476903, + "grad_norm": 0.3738667666912079, + "learning_rate": 0.00013419202225139397, + "loss": 1.447, + "step": 25332 + }, + { + "epoch": 0.3291907810208189, + "grad_norm": 0.36126378178596497, + "learning_rate": 0.0001341894227894826, + "loss": 1.3497, + "step": 25333 + }, + { + "epoch": 0.32920377556473474, + "grad_norm": 0.3273408114910126, + "learning_rate": 0.0001341868233275712, + "loss": 1.3492, + "step": 25334 + }, + { + "epoch": 0.32921677010865064, + "grad_norm": 0.49449607729911804, + "learning_rate": 0.00013418422386565982, + "loss": 1.327, + "step": 25335 + }, + { + "epoch": 0.3292297646525665, + "grad_norm": 0.3891625702381134, + "learning_rate": 0.00013418162440374844, + "loss": 1.3715, + "step": 25336 + }, + { + "epoch": 0.3292427591964824, + "grad_norm": 0.36737412214279175, + "learning_rate": 0.00013417902494183704, + "loss": 1.5767, + "step": 25337 + }, + { + "epoch": 0.3292557537403982, + "grad_norm": 0.41383060812950134, + "learning_rate": 0.00013417642547992566, + "loss": 1.5812, + "step": 25338 + }, + { + "epoch": 0.32926874828431413, + "grad_norm": 0.44203418493270874, + "learning_rate": 0.00013417382601801426, + "loss": 1.4539, + "step": 25339 + }, + { + "epoch": 0.32928174282823, + "grad_norm": 0.36490094661712646, + "learning_rate": 0.0001341712265561029, + "loss": 1.5176, + "step": 25340 + }, + { + "epoch": 0.3292947373721459, + "grad_norm": 0.3501736521720886, + "learning_rate": 0.0001341686270941915, + "loss": 1.2783, + "step": 25341 + }, + { + "epoch": 0.3293077319160617, + "grad_norm": 0.3369479477405548, + "learning_rate": 0.00013416602763228013, + "loss": 1.4937, + "step": 25342 + }, + { + "epoch": 0.3293207264599776, + "grad_norm": 0.4171413779258728, + "learning_rate": 0.00013416342817036873, + "loss": 1.417, + "step": 25343 + }, + { + "epoch": 0.32933372100389346, + "grad_norm": 0.33711177110671997, + "learning_rate": 0.00013416082870845736, + "loss": 1.2241, + "step": 25344 + }, + { + "epoch": 0.32934671554780937, + "grad_norm": 0.42718514800071716, + "learning_rate": 0.00013415822924654598, + "loss": 1.6906, + "step": 25345 + }, + { + "epoch": 0.3293597100917252, + "grad_norm": 0.3979288339614868, + "learning_rate": 0.00013415562978463458, + "loss": 1.5149, + "step": 25346 + }, + { + "epoch": 0.3293727046356411, + "grad_norm": 0.3372088074684143, + "learning_rate": 0.0001341530303227232, + "loss": 1.3083, + "step": 25347 + }, + { + "epoch": 0.32938569917955696, + "grad_norm": 0.21296252310276031, + "learning_rate": 0.00013415043086081183, + "loss": 1.2374, + "step": 25348 + }, + { + "epoch": 0.32939869372347286, + "grad_norm": 0.3407288193702698, + "learning_rate": 0.00013414783139890042, + "loss": 1.4008, + "step": 25349 + }, + { + "epoch": 0.3294116882673887, + "grad_norm": 0.347440630197525, + "learning_rate": 0.00013414523193698905, + "loss": 1.5054, + "step": 25350 + }, + { + "epoch": 0.3294246828113046, + "grad_norm": 0.4117650091648102, + "learning_rate": 0.00013414263247507765, + "loss": 1.5126, + "step": 25351 + }, + { + "epoch": 0.3294376773552205, + "grad_norm": 0.39046260714530945, + "learning_rate": 0.0001341400330131663, + "loss": 1.3533, + "step": 25352 + }, + { + "epoch": 0.32945067189913635, + "grad_norm": 0.44210928678512573, + "learning_rate": 0.0001341374335512549, + "loss": 1.5414, + "step": 25353 + }, + { + "epoch": 0.32946366644305225, + "grad_norm": 0.41369375586509705, + "learning_rate": 0.00013413483408934352, + "loss": 1.0772, + "step": 25354 + }, + { + "epoch": 0.3294766609869681, + "grad_norm": 0.4005845785140991, + "learning_rate": 0.00013413223462743212, + "loss": 1.3835, + "step": 25355 + }, + { + "epoch": 0.329489655530884, + "grad_norm": 0.4949833154678345, + "learning_rate": 0.00013412963516552074, + "loss": 1.3986, + "step": 25356 + }, + { + "epoch": 0.32950265007479984, + "grad_norm": 0.34249964356422424, + "learning_rate": 0.00013412703570360937, + "loss": 1.5586, + "step": 25357 + }, + { + "epoch": 0.32951564461871574, + "grad_norm": 0.41654446721076965, + "learning_rate": 0.00013412443624169796, + "loss": 1.3539, + "step": 25358 + }, + { + "epoch": 0.3295286391626316, + "grad_norm": 0.3785189688205719, + "learning_rate": 0.0001341218367797866, + "loss": 1.4091, + "step": 25359 + }, + { + "epoch": 0.3295416337065475, + "grad_norm": 0.47925683856010437, + "learning_rate": 0.0001341192373178752, + "loss": 1.4224, + "step": 25360 + }, + { + "epoch": 0.32955462825046333, + "grad_norm": 0.4160226881504059, + "learning_rate": 0.0001341166378559638, + "loss": 1.4358, + "step": 25361 + }, + { + "epoch": 0.32956762279437923, + "grad_norm": 0.46545690298080444, + "learning_rate": 0.00013411403839405243, + "loss": 1.2741, + "step": 25362 + }, + { + "epoch": 0.3295806173382951, + "grad_norm": 0.47618338465690613, + "learning_rate": 0.00013411143893214103, + "loss": 1.3474, + "step": 25363 + }, + { + "epoch": 0.329593611882211, + "grad_norm": 0.3372313380241394, + "learning_rate": 0.00013410883947022968, + "loss": 1.5282, + "step": 25364 + }, + { + "epoch": 0.3296066064261268, + "grad_norm": 0.45731109380722046, + "learning_rate": 0.00013410624000831828, + "loss": 1.3606, + "step": 25365 + }, + { + "epoch": 0.3296196009700427, + "grad_norm": 0.4226211607456207, + "learning_rate": 0.0001341036405464069, + "loss": 1.4463, + "step": 25366 + }, + { + "epoch": 0.32963259551395857, + "grad_norm": 0.38409683108329773, + "learning_rate": 0.00013410104108449553, + "loss": 1.4347, + "step": 25367 + }, + { + "epoch": 0.32964559005787447, + "grad_norm": 0.4026913344860077, + "learning_rate": 0.00013409844162258413, + "loss": 1.3023, + "step": 25368 + }, + { + "epoch": 0.3296585846017903, + "grad_norm": 0.4126409888267517, + "learning_rate": 0.00013409584216067275, + "loss": 1.3066, + "step": 25369 + }, + { + "epoch": 0.3296715791457062, + "grad_norm": 0.3737824261188507, + "learning_rate": 0.00013409324269876135, + "loss": 1.5713, + "step": 25370 + }, + { + "epoch": 0.32968457368962206, + "grad_norm": 0.3570338487625122, + "learning_rate": 0.00013409064323685, + "loss": 1.3271, + "step": 25371 + }, + { + "epoch": 0.32969756823353796, + "grad_norm": 0.399305522441864, + "learning_rate": 0.0001340880437749386, + "loss": 1.2789, + "step": 25372 + }, + { + "epoch": 0.3297105627774538, + "grad_norm": 0.3623657524585724, + "learning_rate": 0.00013408544431302722, + "loss": 1.3151, + "step": 25373 + }, + { + "epoch": 0.3297235573213697, + "grad_norm": 0.4736778736114502, + "learning_rate": 0.00013408284485111582, + "loss": 1.4529, + "step": 25374 + }, + { + "epoch": 0.32973655186528555, + "grad_norm": 0.4068353474140167, + "learning_rate": 0.00013408024538920444, + "loss": 1.4499, + "step": 25375 + }, + { + "epoch": 0.32974954640920145, + "grad_norm": 0.4361303746700287, + "learning_rate": 0.00013407764592729307, + "loss": 1.478, + "step": 25376 + }, + { + "epoch": 0.3297625409531173, + "grad_norm": 0.4379952847957611, + "learning_rate": 0.00013407504646538167, + "loss": 1.3812, + "step": 25377 + }, + { + "epoch": 0.3297755354970332, + "grad_norm": 0.38622012734413147, + "learning_rate": 0.0001340724470034703, + "loss": 1.4063, + "step": 25378 + }, + { + "epoch": 0.32978853004094905, + "grad_norm": 0.5305108428001404, + "learning_rate": 0.00013406984754155891, + "loss": 1.3855, + "step": 25379 + }, + { + "epoch": 0.32980152458486495, + "grad_norm": 0.44919219613075256, + "learning_rate": 0.0001340672480796475, + "loss": 1.3975, + "step": 25380 + }, + { + "epoch": 0.3298145191287808, + "grad_norm": 0.2653268873691559, + "learning_rate": 0.00013406464861773614, + "loss": 1.4539, + "step": 25381 + }, + { + "epoch": 0.3298275136726967, + "grad_norm": 0.28848984837532043, + "learning_rate": 0.00013406204915582473, + "loss": 1.2992, + "step": 25382 + }, + { + "epoch": 0.32984050821661254, + "grad_norm": 0.4742273986339569, + "learning_rate": 0.00013405944969391339, + "loss": 1.4019, + "step": 25383 + }, + { + "epoch": 0.32985350276052844, + "grad_norm": 0.3242991268634796, + "learning_rate": 0.00013405685023200198, + "loss": 1.3752, + "step": 25384 + }, + { + "epoch": 0.3298664973044443, + "grad_norm": 0.39433974027633667, + "learning_rate": 0.0001340542507700906, + "loss": 1.4665, + "step": 25385 + }, + { + "epoch": 0.3298794918483602, + "grad_norm": 0.2532108426094055, + "learning_rate": 0.0001340516513081792, + "loss": 1.0141, + "step": 25386 + }, + { + "epoch": 0.32989248639227603, + "grad_norm": 0.35759055614471436, + "learning_rate": 0.00013404905184626783, + "loss": 1.4838, + "step": 25387 + }, + { + "epoch": 0.32990548093619193, + "grad_norm": 0.42066437005996704, + "learning_rate": 0.00013404645238435645, + "loss": 1.2467, + "step": 25388 + }, + { + "epoch": 0.3299184754801078, + "grad_norm": 0.3852258324623108, + "learning_rate": 0.00013404385292244505, + "loss": 1.3166, + "step": 25389 + }, + { + "epoch": 0.3299314700240237, + "grad_norm": 0.3838479816913605, + "learning_rate": 0.00013404125346053368, + "loss": 1.2427, + "step": 25390 + }, + { + "epoch": 0.3299444645679395, + "grad_norm": 0.4362042248249054, + "learning_rate": 0.0001340386539986223, + "loss": 1.3048, + "step": 25391 + }, + { + "epoch": 0.3299574591118554, + "grad_norm": 0.41609805822372437, + "learning_rate": 0.0001340360545367109, + "loss": 1.3693, + "step": 25392 + }, + { + "epoch": 0.32997045365577127, + "grad_norm": 0.45666131377220154, + "learning_rate": 0.00013403345507479952, + "loss": 1.291, + "step": 25393 + }, + { + "epoch": 0.32998344819968717, + "grad_norm": 0.3742736279964447, + "learning_rate": 0.00013403085561288812, + "loss": 1.4613, + "step": 25394 + }, + { + "epoch": 0.329996442743603, + "grad_norm": 0.48365676403045654, + "learning_rate": 0.00013402825615097677, + "loss": 1.3548, + "step": 25395 + }, + { + "epoch": 0.3300094372875189, + "grad_norm": 0.402761846780777, + "learning_rate": 0.00013402565668906537, + "loss": 1.4136, + "step": 25396 + }, + { + "epoch": 0.33002243183143476, + "grad_norm": 0.37098076939582825, + "learning_rate": 0.000134023057227154, + "loss": 1.2083, + "step": 25397 + }, + { + "epoch": 0.33003542637535066, + "grad_norm": 0.4020889401435852, + "learning_rate": 0.0001340204577652426, + "loss": 1.4296, + "step": 25398 + }, + { + "epoch": 0.3300484209192665, + "grad_norm": 0.4559450149536133, + "learning_rate": 0.00013401785830333121, + "loss": 1.4123, + "step": 25399 + }, + { + "epoch": 0.3300614154631824, + "grad_norm": 0.45479264855384827, + "learning_rate": 0.00013401525884141984, + "loss": 1.5204, + "step": 25400 + }, + { + "epoch": 0.33007441000709825, + "grad_norm": 0.47942304611206055, + "learning_rate": 0.00013401265937950844, + "loss": 1.5651, + "step": 25401 + }, + { + "epoch": 0.33008740455101415, + "grad_norm": 0.37381014227867126, + "learning_rate": 0.00013401005991759706, + "loss": 1.2103, + "step": 25402 + }, + { + "epoch": 0.33010039909493, + "grad_norm": 0.27371224761009216, + "learning_rate": 0.00013400746045568569, + "loss": 1.0815, + "step": 25403 + }, + { + "epoch": 0.3301133936388459, + "grad_norm": 0.35925397276878357, + "learning_rate": 0.00013400486099377428, + "loss": 1.4117, + "step": 25404 + }, + { + "epoch": 0.33012638818276174, + "grad_norm": 0.4015498757362366, + "learning_rate": 0.0001340022615318629, + "loss": 1.6293, + "step": 25405 + }, + { + "epoch": 0.33013938272667764, + "grad_norm": 0.43442967534065247, + "learning_rate": 0.00013399966206995153, + "loss": 1.5014, + "step": 25406 + }, + { + "epoch": 0.3301523772705935, + "grad_norm": 0.38886693120002747, + "learning_rate": 0.00013399706260804016, + "loss": 1.2442, + "step": 25407 + }, + { + "epoch": 0.3301653718145094, + "grad_norm": 0.3544869124889374, + "learning_rate": 0.00013399446314612875, + "loss": 1.3819, + "step": 25408 + }, + { + "epoch": 0.33017836635842523, + "grad_norm": 0.5248733758926392, + "learning_rate": 0.00013399186368421738, + "loss": 1.4933, + "step": 25409 + }, + { + "epoch": 0.33019136090234114, + "grad_norm": 0.49900826811790466, + "learning_rate": 0.000133989264222306, + "loss": 1.5402, + "step": 25410 + }, + { + "epoch": 0.330204355446257, + "grad_norm": 0.3594284951686859, + "learning_rate": 0.0001339866647603946, + "loss": 1.3585, + "step": 25411 + }, + { + "epoch": 0.3302173499901729, + "grad_norm": 0.3940020799636841, + "learning_rate": 0.00013398406529848322, + "loss": 1.2415, + "step": 25412 + }, + { + "epoch": 0.3302303445340887, + "grad_norm": 0.2958138585090637, + "learning_rate": 0.00013398146583657182, + "loss": 1.5371, + "step": 25413 + }, + { + "epoch": 0.3302433390780046, + "grad_norm": 0.2538348436355591, + "learning_rate": 0.00013397886637466047, + "loss": 1.331, + "step": 25414 + }, + { + "epoch": 0.3302563336219205, + "grad_norm": 0.39588114619255066, + "learning_rate": 0.00013397626691274907, + "loss": 1.3503, + "step": 25415 + }, + { + "epoch": 0.3302693281658364, + "grad_norm": 0.46107110381126404, + "learning_rate": 0.00013397366745083767, + "loss": 1.6041, + "step": 25416 + }, + { + "epoch": 0.3302823227097522, + "grad_norm": 0.3055363595485687, + "learning_rate": 0.0001339710679889263, + "loss": 1.3291, + "step": 25417 + }, + { + "epoch": 0.3302953172536681, + "grad_norm": 0.34722602367401123, + "learning_rate": 0.00013396846852701492, + "loss": 1.2665, + "step": 25418 + }, + { + "epoch": 0.33030831179758396, + "grad_norm": 0.43605929613113403, + "learning_rate": 0.00013396586906510354, + "loss": 1.5043, + "step": 25419 + }, + { + "epoch": 0.33032130634149987, + "grad_norm": 0.40294522047042847, + "learning_rate": 0.00013396326960319214, + "loss": 1.2429, + "step": 25420 + }, + { + "epoch": 0.3303343008854157, + "grad_norm": 0.3467344641685486, + "learning_rate": 0.00013396067014128076, + "loss": 1.1676, + "step": 25421 + }, + { + "epoch": 0.3303472954293316, + "grad_norm": 0.5104806423187256, + "learning_rate": 0.0001339580706793694, + "loss": 1.4379, + "step": 25422 + }, + { + "epoch": 0.33036028997324746, + "grad_norm": 0.30016499757766724, + "learning_rate": 0.00013395547121745799, + "loss": 1.3433, + "step": 25423 + }, + { + "epoch": 0.33037328451716336, + "grad_norm": 0.5396904349327087, + "learning_rate": 0.0001339528717555466, + "loss": 1.4127, + "step": 25424 + }, + { + "epoch": 0.3303862790610792, + "grad_norm": 0.41141584515571594, + "learning_rate": 0.0001339502722936352, + "loss": 1.3464, + "step": 25425 + }, + { + "epoch": 0.3303992736049951, + "grad_norm": 0.33225777745246887, + "learning_rate": 0.00013394767283172386, + "loss": 1.4562, + "step": 25426 + }, + { + "epoch": 0.330412268148911, + "grad_norm": 0.4551813006401062, + "learning_rate": 0.00013394507336981246, + "loss": 1.6444, + "step": 25427 + }, + { + "epoch": 0.33042526269282685, + "grad_norm": 0.4935167729854584, + "learning_rate": 0.00013394247390790108, + "loss": 1.3621, + "step": 25428 + }, + { + "epoch": 0.33043825723674275, + "grad_norm": 0.3895745575428009, + "learning_rate": 0.00013393987444598968, + "loss": 1.1186, + "step": 25429 + }, + { + "epoch": 0.3304512517806586, + "grad_norm": 0.3657846450805664, + "learning_rate": 0.0001339372749840783, + "loss": 1.5083, + "step": 25430 + }, + { + "epoch": 0.3304642463245745, + "grad_norm": 0.44281864166259766, + "learning_rate": 0.00013393467552216693, + "loss": 1.327, + "step": 25431 + }, + { + "epoch": 0.33047724086849034, + "grad_norm": 0.36560437083244324, + "learning_rate": 0.00013393207606025552, + "loss": 1.5263, + "step": 25432 + }, + { + "epoch": 0.33049023541240624, + "grad_norm": 0.49306923151016235, + "learning_rate": 0.00013392947659834415, + "loss": 1.5285, + "step": 25433 + }, + { + "epoch": 0.3305032299563221, + "grad_norm": 0.43165305256843567, + "learning_rate": 0.00013392687713643277, + "loss": 1.3839, + "step": 25434 + }, + { + "epoch": 0.330516224500238, + "grad_norm": 0.39934834837913513, + "learning_rate": 0.00013392427767452137, + "loss": 1.3574, + "step": 25435 + }, + { + "epoch": 0.33052921904415383, + "grad_norm": 0.3733825385570526, + "learning_rate": 0.00013392167821261, + "loss": 1.1742, + "step": 25436 + }, + { + "epoch": 0.33054221358806973, + "grad_norm": 0.33662697672843933, + "learning_rate": 0.0001339190787506986, + "loss": 1.2535, + "step": 25437 + }, + { + "epoch": 0.3305552081319856, + "grad_norm": 0.3938794434070587, + "learning_rate": 0.00013391647928878724, + "loss": 1.3558, + "step": 25438 + }, + { + "epoch": 0.3305682026759015, + "grad_norm": 0.33154577016830444, + "learning_rate": 0.00013391387982687584, + "loss": 1.3742, + "step": 25439 + }, + { + "epoch": 0.3305811972198173, + "grad_norm": 0.5127232074737549, + "learning_rate": 0.00013391128036496447, + "loss": 1.5486, + "step": 25440 + }, + { + "epoch": 0.3305941917637332, + "grad_norm": 0.39770156145095825, + "learning_rate": 0.0001339086809030531, + "loss": 1.4735, + "step": 25441 + }, + { + "epoch": 0.33060718630764907, + "grad_norm": 0.40657123923301697, + "learning_rate": 0.0001339060814411417, + "loss": 1.2754, + "step": 25442 + }, + { + "epoch": 0.33062018085156497, + "grad_norm": 0.36890271306037903, + "learning_rate": 0.0001339034819792303, + "loss": 1.2363, + "step": 25443 + }, + { + "epoch": 0.3306331753954808, + "grad_norm": 0.41669103503227234, + "learning_rate": 0.0001339008825173189, + "loss": 1.4988, + "step": 25444 + }, + { + "epoch": 0.3306461699393967, + "grad_norm": 0.45236214995384216, + "learning_rate": 0.00013389828305540753, + "loss": 1.2997, + "step": 25445 + }, + { + "epoch": 0.33065916448331256, + "grad_norm": 0.37558236718177795, + "learning_rate": 0.00013389568359349616, + "loss": 1.4876, + "step": 25446 + }, + { + "epoch": 0.33067215902722846, + "grad_norm": 0.3748414218425751, + "learning_rate": 0.00013389308413158476, + "loss": 1.4397, + "step": 25447 + }, + { + "epoch": 0.3306851535711443, + "grad_norm": 0.2971377968788147, + "learning_rate": 0.00013389048466967338, + "loss": 1.5025, + "step": 25448 + }, + { + "epoch": 0.3306981481150602, + "grad_norm": 0.41289663314819336, + "learning_rate": 0.000133887885207762, + "loss": 1.4438, + "step": 25449 + }, + { + "epoch": 0.33071114265897605, + "grad_norm": 0.4127791225910187, + "learning_rate": 0.00013388528574585063, + "loss": 1.427, + "step": 25450 + }, + { + "epoch": 0.33072413720289195, + "grad_norm": 0.43375536799430847, + "learning_rate": 0.00013388268628393923, + "loss": 1.4366, + "step": 25451 + }, + { + "epoch": 0.3307371317468078, + "grad_norm": 0.4230966866016388, + "learning_rate": 0.00013388008682202785, + "loss": 1.3474, + "step": 25452 + }, + { + "epoch": 0.3307501262907237, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.00013387748736011648, + "loss": 1.3783, + "step": 25453 + }, + { + "epoch": 0.33076312083463955, + "grad_norm": 0.453925222158432, + "learning_rate": 0.00013387488789820507, + "loss": 1.5539, + "step": 25454 + }, + { + "epoch": 0.33077611537855545, + "grad_norm": 0.45712530612945557, + "learning_rate": 0.0001338722884362937, + "loss": 1.4423, + "step": 25455 + }, + { + "epoch": 0.3307891099224713, + "grad_norm": 0.31733739376068115, + "learning_rate": 0.0001338696889743823, + "loss": 1.3004, + "step": 25456 + }, + { + "epoch": 0.3308021044663872, + "grad_norm": 0.33850136399269104, + "learning_rate": 0.00013386708951247095, + "loss": 1.4268, + "step": 25457 + }, + { + "epoch": 0.33081509901030304, + "grad_norm": 0.34473028779029846, + "learning_rate": 0.00013386449005055954, + "loss": 1.5118, + "step": 25458 + }, + { + "epoch": 0.33082809355421894, + "grad_norm": 0.35425084829330444, + "learning_rate": 0.00013386189058864814, + "loss": 1.4399, + "step": 25459 + }, + { + "epoch": 0.3308410880981348, + "grad_norm": 0.43701741099357605, + "learning_rate": 0.00013385929112673677, + "loss": 1.3944, + "step": 25460 + }, + { + "epoch": 0.3308540826420507, + "grad_norm": 0.376965194940567, + "learning_rate": 0.0001338566916648254, + "loss": 1.3592, + "step": 25461 + }, + { + "epoch": 0.33086707718596653, + "grad_norm": 0.29334285855293274, + "learning_rate": 0.00013385409220291401, + "loss": 1.5212, + "step": 25462 + }, + { + "epoch": 0.33088007172988243, + "grad_norm": 0.39277926087379456, + "learning_rate": 0.0001338514927410026, + "loss": 1.4898, + "step": 25463 + }, + { + "epoch": 0.3308930662737983, + "grad_norm": 0.4381749927997589, + "learning_rate": 0.00013384889327909124, + "loss": 1.2541, + "step": 25464 + }, + { + "epoch": 0.3309060608177142, + "grad_norm": 0.330308198928833, + "learning_rate": 0.00013384629381717986, + "loss": 1.5591, + "step": 25465 + }, + { + "epoch": 0.33091905536163, + "grad_norm": 0.44727402925491333, + "learning_rate": 0.00013384369435526846, + "loss": 1.4599, + "step": 25466 + }, + { + "epoch": 0.3309320499055459, + "grad_norm": 0.28293436765670776, + "learning_rate": 0.00013384109489335708, + "loss": 1.5182, + "step": 25467 + }, + { + "epoch": 0.33094504444946177, + "grad_norm": 0.4533950984477997, + "learning_rate": 0.00013383849543144568, + "loss": 1.4373, + "step": 25468 + }, + { + "epoch": 0.33095803899337767, + "grad_norm": 0.40868476033210754, + "learning_rate": 0.00013383589596953433, + "loss": 1.3619, + "step": 25469 + }, + { + "epoch": 0.3309710335372935, + "grad_norm": 0.4499734342098236, + "learning_rate": 0.00013383329650762293, + "loss": 1.5575, + "step": 25470 + }, + { + "epoch": 0.3309840280812094, + "grad_norm": 0.45362964272499084, + "learning_rate": 0.00013383069704571153, + "loss": 1.5309, + "step": 25471 + }, + { + "epoch": 0.33099702262512526, + "grad_norm": 0.5177431106567383, + "learning_rate": 0.00013382809758380015, + "loss": 1.4244, + "step": 25472 + }, + { + "epoch": 0.33101001716904116, + "grad_norm": 0.4701518416404724, + "learning_rate": 0.00013382549812188878, + "loss": 1.4907, + "step": 25473 + }, + { + "epoch": 0.331023011712957, + "grad_norm": 0.38079217076301575, + "learning_rate": 0.0001338228986599774, + "loss": 1.2489, + "step": 25474 + }, + { + "epoch": 0.3310360062568729, + "grad_norm": 0.383716881275177, + "learning_rate": 0.000133820299198066, + "loss": 1.6244, + "step": 25475 + }, + { + "epoch": 0.33104900080078875, + "grad_norm": 0.33251166343688965, + "learning_rate": 0.00013381769973615462, + "loss": 1.4712, + "step": 25476 + }, + { + "epoch": 0.33106199534470465, + "grad_norm": 0.4782688617706299, + "learning_rate": 0.00013381510027424325, + "loss": 1.3404, + "step": 25477 + }, + { + "epoch": 0.3310749898886205, + "grad_norm": 0.34370654821395874, + "learning_rate": 0.00013381250081233184, + "loss": 1.3602, + "step": 25478 + }, + { + "epoch": 0.3310879844325364, + "grad_norm": 0.3607679009437561, + "learning_rate": 0.00013380990135042047, + "loss": 1.2124, + "step": 25479 + }, + { + "epoch": 0.33110097897645224, + "grad_norm": 0.41087839007377625, + "learning_rate": 0.0001338073018885091, + "loss": 1.242, + "step": 25480 + }, + { + "epoch": 0.33111397352036814, + "grad_norm": 0.35053449869155884, + "learning_rate": 0.00013380470242659772, + "loss": 1.3611, + "step": 25481 + }, + { + "epoch": 0.331126968064284, + "grad_norm": 0.28869110345840454, + "learning_rate": 0.00013380210296468631, + "loss": 1.321, + "step": 25482 + }, + { + "epoch": 0.3311399626081999, + "grad_norm": 0.4529825448989868, + "learning_rate": 0.00013379950350277494, + "loss": 1.3559, + "step": 25483 + }, + { + "epoch": 0.33115295715211573, + "grad_norm": 0.3635726571083069, + "learning_rate": 0.00013379690404086356, + "loss": 1.2835, + "step": 25484 + }, + { + "epoch": 0.33116595169603164, + "grad_norm": 0.29228654503822327, + "learning_rate": 0.00013379430457895216, + "loss": 1.4655, + "step": 25485 + }, + { + "epoch": 0.3311789462399475, + "grad_norm": 0.45637771487236023, + "learning_rate": 0.00013379170511704079, + "loss": 1.4376, + "step": 25486 + }, + { + "epoch": 0.3311919407838634, + "grad_norm": 0.41637957096099854, + "learning_rate": 0.00013378910565512938, + "loss": 1.5206, + "step": 25487 + }, + { + "epoch": 0.3312049353277792, + "grad_norm": 0.5057008862495422, + "learning_rate": 0.000133786506193218, + "loss": 1.5834, + "step": 25488 + }, + { + "epoch": 0.3312179298716951, + "grad_norm": 0.3937355577945709, + "learning_rate": 0.00013378390673130663, + "loss": 1.5438, + "step": 25489 + }, + { + "epoch": 0.33123092441561097, + "grad_norm": 0.3645849823951721, + "learning_rate": 0.00013378130726939523, + "loss": 1.4476, + "step": 25490 + }, + { + "epoch": 0.3312439189595269, + "grad_norm": 0.5379676818847656, + "learning_rate": 0.00013377870780748385, + "loss": 1.5305, + "step": 25491 + }, + { + "epoch": 0.3312569135034427, + "grad_norm": 0.45664548873901367, + "learning_rate": 0.00013377610834557248, + "loss": 1.4633, + "step": 25492 + }, + { + "epoch": 0.3312699080473586, + "grad_norm": 0.40925315022468567, + "learning_rate": 0.0001337735088836611, + "loss": 1.3933, + "step": 25493 + }, + { + "epoch": 0.33128290259127446, + "grad_norm": 0.34462788701057434, + "learning_rate": 0.0001337709094217497, + "loss": 1.2266, + "step": 25494 + }, + { + "epoch": 0.33129589713519036, + "grad_norm": 0.48976314067840576, + "learning_rate": 0.00013376830995983832, + "loss": 1.4696, + "step": 25495 + }, + { + "epoch": 0.3313088916791062, + "grad_norm": 0.3538649380207062, + "learning_rate": 0.00013376571049792695, + "loss": 1.3508, + "step": 25496 + }, + { + "epoch": 0.3313218862230221, + "grad_norm": 0.4076395332813263, + "learning_rate": 0.00013376311103601555, + "loss": 1.328, + "step": 25497 + }, + { + "epoch": 0.33133488076693796, + "grad_norm": 0.3823518753051758, + "learning_rate": 0.00013376051157410417, + "loss": 1.2838, + "step": 25498 + }, + { + "epoch": 0.33134787531085386, + "grad_norm": 0.4611894190311432, + "learning_rate": 0.00013375791211219277, + "loss": 1.5482, + "step": 25499 + }, + { + "epoch": 0.3313608698547697, + "grad_norm": 0.3460150957107544, + "learning_rate": 0.0001337553126502814, + "loss": 1.2385, + "step": 25500 + }, + { + "epoch": 0.3313738643986856, + "grad_norm": 0.3888540267944336, + "learning_rate": 0.00013375271318837002, + "loss": 1.4351, + "step": 25501 + }, + { + "epoch": 0.33138685894260145, + "grad_norm": 0.3436017632484436, + "learning_rate": 0.00013375011372645861, + "loss": 1.3135, + "step": 25502 + }, + { + "epoch": 0.33139985348651735, + "grad_norm": 0.4628414511680603, + "learning_rate": 0.00013374751426454724, + "loss": 1.342, + "step": 25503 + }, + { + "epoch": 0.33141284803043325, + "grad_norm": 0.3945187032222748, + "learning_rate": 0.00013374491480263586, + "loss": 1.4171, + "step": 25504 + }, + { + "epoch": 0.3314258425743491, + "grad_norm": 0.44210487604141235, + "learning_rate": 0.0001337423153407245, + "loss": 1.4375, + "step": 25505 + }, + { + "epoch": 0.331438837118265, + "grad_norm": 0.42043545842170715, + "learning_rate": 0.00013373971587881309, + "loss": 1.4242, + "step": 25506 + }, + { + "epoch": 0.33145183166218084, + "grad_norm": 0.33141228556632996, + "learning_rate": 0.0001337371164169017, + "loss": 1.29, + "step": 25507 + }, + { + "epoch": 0.33146482620609674, + "grad_norm": 0.28985899686813354, + "learning_rate": 0.00013373451695499033, + "loss": 1.3966, + "step": 25508 + }, + { + "epoch": 0.3314778207500126, + "grad_norm": 0.45740869641304016, + "learning_rate": 0.00013373191749307893, + "loss": 1.3164, + "step": 25509 + }, + { + "epoch": 0.3314908152939285, + "grad_norm": 0.3912624716758728, + "learning_rate": 0.00013372931803116756, + "loss": 1.7083, + "step": 25510 + }, + { + "epoch": 0.33150380983784433, + "grad_norm": 0.4109328091144562, + "learning_rate": 0.00013372671856925615, + "loss": 1.5296, + "step": 25511 + }, + { + "epoch": 0.33151680438176023, + "grad_norm": 0.35951530933380127, + "learning_rate": 0.0001337241191073448, + "loss": 1.3281, + "step": 25512 + }, + { + "epoch": 0.3315297989256761, + "grad_norm": 0.3745634853839874, + "learning_rate": 0.0001337215196454334, + "loss": 1.4327, + "step": 25513 + }, + { + "epoch": 0.331542793469592, + "grad_norm": 0.3250645697116852, + "learning_rate": 0.000133718920183522, + "loss": 1.3297, + "step": 25514 + }, + { + "epoch": 0.3315557880135078, + "grad_norm": 0.39504584670066833, + "learning_rate": 0.00013371632072161065, + "loss": 1.455, + "step": 25515 + }, + { + "epoch": 0.3315687825574237, + "grad_norm": 0.33048221468925476, + "learning_rate": 0.00013371372125969925, + "loss": 1.2809, + "step": 25516 + }, + { + "epoch": 0.33158177710133957, + "grad_norm": 0.3931441903114319, + "learning_rate": 0.00013371112179778787, + "loss": 1.6605, + "step": 25517 + }, + { + "epoch": 0.33159477164525547, + "grad_norm": 0.42793184518814087, + "learning_rate": 0.00013370852233587647, + "loss": 1.3966, + "step": 25518 + }, + { + "epoch": 0.3316077661891713, + "grad_norm": 0.3611810803413391, + "learning_rate": 0.0001337059228739651, + "loss": 1.423, + "step": 25519 + }, + { + "epoch": 0.3316207607330872, + "grad_norm": 0.3704386353492737, + "learning_rate": 0.00013370332341205372, + "loss": 1.401, + "step": 25520 + }, + { + "epoch": 0.33163375527700306, + "grad_norm": 0.5026872158050537, + "learning_rate": 0.00013370072395014232, + "loss": 1.2996, + "step": 25521 + }, + { + "epoch": 0.33164674982091896, + "grad_norm": 0.31302785873413086, + "learning_rate": 0.00013369812448823094, + "loss": 1.2171, + "step": 25522 + }, + { + "epoch": 0.3316597443648348, + "grad_norm": 0.34155482053756714, + "learning_rate": 0.00013369552502631957, + "loss": 1.3272, + "step": 25523 + }, + { + "epoch": 0.3316727389087507, + "grad_norm": 0.392689973115921, + "learning_rate": 0.0001336929255644082, + "loss": 1.3675, + "step": 25524 + }, + { + "epoch": 0.33168573345266655, + "grad_norm": 0.42233341932296753, + "learning_rate": 0.0001336903261024968, + "loss": 1.554, + "step": 25525 + }, + { + "epoch": 0.33169872799658245, + "grad_norm": 0.588141679763794, + "learning_rate": 0.00013368772664058539, + "loss": 1.2468, + "step": 25526 + }, + { + "epoch": 0.3317117225404983, + "grad_norm": 0.35416871309280396, + "learning_rate": 0.00013368512717867404, + "loss": 1.4324, + "step": 25527 + }, + { + "epoch": 0.3317247170844142, + "grad_norm": 0.454398512840271, + "learning_rate": 0.00013368252771676263, + "loss": 1.5324, + "step": 25528 + }, + { + "epoch": 0.33173771162833005, + "grad_norm": 0.3870798945426941, + "learning_rate": 0.00013367992825485126, + "loss": 1.4309, + "step": 25529 + }, + { + "epoch": 0.33175070617224595, + "grad_norm": 0.430955708026886, + "learning_rate": 0.00013367732879293986, + "loss": 1.5778, + "step": 25530 + }, + { + "epoch": 0.3317637007161618, + "grad_norm": 0.4626177251338959, + "learning_rate": 0.00013367472933102848, + "loss": 1.6093, + "step": 25531 + }, + { + "epoch": 0.3317766952600777, + "grad_norm": 0.5052372217178345, + "learning_rate": 0.0001336721298691171, + "loss": 1.3736, + "step": 25532 + }, + { + "epoch": 0.33178968980399354, + "grad_norm": 0.3977068066596985, + "learning_rate": 0.0001336695304072057, + "loss": 1.5113, + "step": 25533 + }, + { + "epoch": 0.33180268434790944, + "grad_norm": 0.2562795579433441, + "learning_rate": 0.00013366693094529433, + "loss": 1.2995, + "step": 25534 + }, + { + "epoch": 0.3318156788918253, + "grad_norm": 0.4043462872505188, + "learning_rate": 0.00013366433148338295, + "loss": 1.3254, + "step": 25535 + }, + { + "epoch": 0.3318286734357412, + "grad_norm": 0.5225029587745667, + "learning_rate": 0.00013366173202147158, + "loss": 1.4949, + "step": 25536 + }, + { + "epoch": 0.33184166797965703, + "grad_norm": 0.5557762980461121, + "learning_rate": 0.00013365913255956017, + "loss": 1.4127, + "step": 25537 + }, + { + "epoch": 0.33185466252357293, + "grad_norm": 0.38120874762535095, + "learning_rate": 0.00013365653309764877, + "loss": 1.35, + "step": 25538 + }, + { + "epoch": 0.3318676570674888, + "grad_norm": 0.4264238774776459, + "learning_rate": 0.00013365393363573742, + "loss": 1.4785, + "step": 25539 + }, + { + "epoch": 0.3318806516114047, + "grad_norm": 0.3222150504589081, + "learning_rate": 0.00013365133417382602, + "loss": 1.2357, + "step": 25540 + }, + { + "epoch": 0.3318936461553205, + "grad_norm": 0.46247807145118713, + "learning_rate": 0.00013364873471191464, + "loss": 1.5866, + "step": 25541 + }, + { + "epoch": 0.3319066406992364, + "grad_norm": 0.34905382990837097, + "learning_rate": 0.00013364613525000324, + "loss": 1.3636, + "step": 25542 + }, + { + "epoch": 0.33191963524315227, + "grad_norm": 0.4535878300666809, + "learning_rate": 0.00013364353578809187, + "loss": 1.4386, + "step": 25543 + }, + { + "epoch": 0.33193262978706817, + "grad_norm": 0.3320789635181427, + "learning_rate": 0.0001336409363261805, + "loss": 1.42, + "step": 25544 + }, + { + "epoch": 0.331945624330984, + "grad_norm": 0.31830722093582153, + "learning_rate": 0.0001336383368642691, + "loss": 1.2966, + "step": 25545 + }, + { + "epoch": 0.3319586188748999, + "grad_norm": 0.3413156569004059, + "learning_rate": 0.0001336357374023577, + "loss": 1.4182, + "step": 25546 + }, + { + "epoch": 0.33197161341881576, + "grad_norm": 0.3917073607444763, + "learning_rate": 0.00013363313794044634, + "loss": 1.552, + "step": 25547 + }, + { + "epoch": 0.33198460796273166, + "grad_norm": 0.4104979336261749, + "learning_rate": 0.00013363053847853496, + "loss": 1.4159, + "step": 25548 + }, + { + "epoch": 0.3319976025066475, + "grad_norm": 0.40208619832992554, + "learning_rate": 0.00013362793901662356, + "loss": 1.2627, + "step": 25549 + }, + { + "epoch": 0.3320105970505634, + "grad_norm": 0.3399253785610199, + "learning_rate": 0.00013362533955471218, + "loss": 1.2989, + "step": 25550 + }, + { + "epoch": 0.33202359159447925, + "grad_norm": 0.36069270968437195, + "learning_rate": 0.0001336227400928008, + "loss": 1.427, + "step": 25551 + }, + { + "epoch": 0.33203658613839515, + "grad_norm": 0.3751765787601471, + "learning_rate": 0.0001336201406308894, + "loss": 1.2451, + "step": 25552 + }, + { + "epoch": 0.332049580682311, + "grad_norm": 0.39233654737472534, + "learning_rate": 0.00013361754116897803, + "loss": 1.363, + "step": 25553 + }, + { + "epoch": 0.3320625752262269, + "grad_norm": 0.3892533481121063, + "learning_rate": 0.00013361494170706665, + "loss": 1.5235, + "step": 25554 + }, + { + "epoch": 0.33207556977014274, + "grad_norm": 0.3848094940185547, + "learning_rate": 0.00013361234224515525, + "loss": 1.4284, + "step": 25555 + }, + { + "epoch": 0.33208856431405864, + "grad_norm": 0.49205902218818665, + "learning_rate": 0.00013360974278324388, + "loss": 1.5989, + "step": 25556 + }, + { + "epoch": 0.3321015588579745, + "grad_norm": 0.36181342601776123, + "learning_rate": 0.00013360714332133247, + "loss": 1.4242, + "step": 25557 + }, + { + "epoch": 0.3321145534018904, + "grad_norm": 0.42041993141174316, + "learning_rate": 0.00013360454385942112, + "loss": 1.3902, + "step": 25558 + }, + { + "epoch": 0.33212754794580623, + "grad_norm": 0.33958718180656433, + "learning_rate": 0.00013360194439750972, + "loss": 1.3398, + "step": 25559 + }, + { + "epoch": 0.33214054248972213, + "grad_norm": 0.3924887478351593, + "learning_rate": 0.00013359934493559835, + "loss": 1.3547, + "step": 25560 + }, + { + "epoch": 0.332153537033638, + "grad_norm": 0.3785068392753601, + "learning_rate": 0.00013359674547368694, + "loss": 1.3232, + "step": 25561 + }, + { + "epoch": 0.3321665315775539, + "grad_norm": 0.37241414189338684, + "learning_rate": 0.00013359414601177557, + "loss": 1.2776, + "step": 25562 + }, + { + "epoch": 0.3321795261214697, + "grad_norm": 0.45610663294792175, + "learning_rate": 0.0001335915465498642, + "loss": 1.4505, + "step": 25563 + }, + { + "epoch": 0.3321925206653856, + "grad_norm": 0.3319467008113861, + "learning_rate": 0.0001335889470879528, + "loss": 1.2296, + "step": 25564 + }, + { + "epoch": 0.33220551520930147, + "grad_norm": 0.5282312631607056, + "learning_rate": 0.00013358634762604141, + "loss": 1.3536, + "step": 25565 + }, + { + "epoch": 0.3322185097532174, + "grad_norm": 0.39996835589408875, + "learning_rate": 0.00013358374816413004, + "loss": 1.5491, + "step": 25566 + }, + { + "epoch": 0.3322315042971332, + "grad_norm": 0.39466020464897156, + "learning_rate": 0.00013358114870221864, + "loss": 1.4308, + "step": 25567 + }, + { + "epoch": 0.3322444988410491, + "grad_norm": 0.381485253572464, + "learning_rate": 0.00013357854924030726, + "loss": 1.3573, + "step": 25568 + }, + { + "epoch": 0.33225749338496496, + "grad_norm": 0.34763917326927185, + "learning_rate": 0.00013357594977839586, + "loss": 1.3254, + "step": 25569 + }, + { + "epoch": 0.33227048792888086, + "grad_norm": 0.3791951537132263, + "learning_rate": 0.0001335733503164845, + "loss": 1.3656, + "step": 25570 + }, + { + "epoch": 0.3322834824727967, + "grad_norm": 0.4989057779312134, + "learning_rate": 0.0001335707508545731, + "loss": 1.5201, + "step": 25571 + }, + { + "epoch": 0.3322964770167126, + "grad_norm": 0.37039926648139954, + "learning_rate": 0.00013356815139266173, + "loss": 1.5187, + "step": 25572 + }, + { + "epoch": 0.33230947156062846, + "grad_norm": 0.34916022419929504, + "learning_rate": 0.00013356555193075033, + "loss": 1.4985, + "step": 25573 + }, + { + "epoch": 0.33232246610454436, + "grad_norm": 0.5183528065681458, + "learning_rate": 0.00013356295246883895, + "loss": 1.4452, + "step": 25574 + }, + { + "epoch": 0.3323354606484602, + "grad_norm": 0.3225412964820862, + "learning_rate": 0.00013356035300692758, + "loss": 1.3835, + "step": 25575 + }, + { + "epoch": 0.3323484551923761, + "grad_norm": 0.502003014087677, + "learning_rate": 0.00013355775354501618, + "loss": 1.5655, + "step": 25576 + }, + { + "epoch": 0.33236144973629195, + "grad_norm": 0.3813602030277252, + "learning_rate": 0.0001335551540831048, + "loss": 1.3902, + "step": 25577 + }, + { + "epoch": 0.33237444428020785, + "grad_norm": 0.3464597165584564, + "learning_rate": 0.00013355255462119342, + "loss": 1.3222, + "step": 25578 + }, + { + "epoch": 0.33238743882412375, + "grad_norm": 0.3733803927898407, + "learning_rate": 0.00013354995515928205, + "loss": 1.4504, + "step": 25579 + }, + { + "epoch": 0.3324004333680396, + "grad_norm": 0.5423606038093567, + "learning_rate": 0.00013354735569737065, + "loss": 1.4372, + "step": 25580 + }, + { + "epoch": 0.3324134279119555, + "grad_norm": 0.32530727982521057, + "learning_rate": 0.00013354475623545924, + "loss": 1.2882, + "step": 25581 + }, + { + "epoch": 0.33242642245587134, + "grad_norm": 0.3465639352798462, + "learning_rate": 0.0001335421567735479, + "loss": 1.3681, + "step": 25582 + }, + { + "epoch": 0.33243941699978724, + "grad_norm": 0.31461164355278015, + "learning_rate": 0.0001335395573116365, + "loss": 1.3331, + "step": 25583 + }, + { + "epoch": 0.3324524115437031, + "grad_norm": 0.4370171129703522, + "learning_rate": 0.00013353695784972512, + "loss": 1.3451, + "step": 25584 + }, + { + "epoch": 0.332465406087619, + "grad_norm": 0.3450091779232025, + "learning_rate": 0.00013353435838781371, + "loss": 1.5385, + "step": 25585 + }, + { + "epoch": 0.33247840063153483, + "grad_norm": 0.404261976480484, + "learning_rate": 0.00013353175892590234, + "loss": 1.2719, + "step": 25586 + }, + { + "epoch": 0.33249139517545073, + "grad_norm": 0.39896342158317566, + "learning_rate": 0.00013352915946399096, + "loss": 1.4844, + "step": 25587 + }, + { + "epoch": 0.3325043897193666, + "grad_norm": 0.4159028232097626, + "learning_rate": 0.00013352656000207956, + "loss": 1.3059, + "step": 25588 + }, + { + "epoch": 0.3325173842632825, + "grad_norm": 0.42139920592308044, + "learning_rate": 0.0001335239605401682, + "loss": 1.3424, + "step": 25589 + }, + { + "epoch": 0.3325303788071983, + "grad_norm": 0.4097001850605011, + "learning_rate": 0.0001335213610782568, + "loss": 1.7296, + "step": 25590 + }, + { + "epoch": 0.3325433733511142, + "grad_norm": 0.4299386143684387, + "learning_rate": 0.00013351876161634543, + "loss": 1.3901, + "step": 25591 + }, + { + "epoch": 0.33255636789503007, + "grad_norm": 0.2708926796913147, + "learning_rate": 0.00013351616215443403, + "loss": 1.204, + "step": 25592 + }, + { + "epoch": 0.33256936243894597, + "grad_norm": 0.4295893907546997, + "learning_rate": 0.00013351356269252266, + "loss": 1.455, + "step": 25593 + }, + { + "epoch": 0.3325823569828618, + "grad_norm": 0.4148605763912201, + "learning_rate": 0.00013351096323061128, + "loss": 1.3552, + "step": 25594 + }, + { + "epoch": 0.3325953515267777, + "grad_norm": 0.36075738072395325, + "learning_rate": 0.00013350836376869988, + "loss": 1.3259, + "step": 25595 + }, + { + "epoch": 0.33260834607069356, + "grad_norm": 0.4220554828643799, + "learning_rate": 0.0001335057643067885, + "loss": 1.3586, + "step": 25596 + }, + { + "epoch": 0.33262134061460946, + "grad_norm": 0.4005264937877655, + "learning_rate": 0.00013350316484487713, + "loss": 1.2632, + "step": 25597 + }, + { + "epoch": 0.3326343351585253, + "grad_norm": 0.38507142663002014, + "learning_rate": 0.00013350056538296572, + "loss": 1.2469, + "step": 25598 + }, + { + "epoch": 0.3326473297024412, + "grad_norm": 0.4175419509410858, + "learning_rate": 0.00013349796592105435, + "loss": 1.4666, + "step": 25599 + }, + { + "epoch": 0.33266032424635705, + "grad_norm": 0.35873863101005554, + "learning_rate": 0.00013349536645914295, + "loss": 1.2894, + "step": 25600 + }, + { + "epoch": 0.33267331879027295, + "grad_norm": 0.45133766531944275, + "learning_rate": 0.0001334927669972316, + "loss": 1.3752, + "step": 25601 + }, + { + "epoch": 0.3326863133341888, + "grad_norm": 0.6153174042701721, + "learning_rate": 0.0001334901675353202, + "loss": 1.3217, + "step": 25602 + }, + { + "epoch": 0.3326993078781047, + "grad_norm": 0.4296231269836426, + "learning_rate": 0.00013348756807340882, + "loss": 1.3398, + "step": 25603 + }, + { + "epoch": 0.33271230242202054, + "grad_norm": 0.4200472831726074, + "learning_rate": 0.00013348496861149742, + "loss": 1.296, + "step": 25604 + }, + { + "epoch": 0.33272529696593645, + "grad_norm": 0.38560792803764343, + "learning_rate": 0.00013348236914958604, + "loss": 1.495, + "step": 25605 + }, + { + "epoch": 0.3327382915098523, + "grad_norm": 0.393822580575943, + "learning_rate": 0.00013347976968767467, + "loss": 1.4542, + "step": 25606 + }, + { + "epoch": 0.3327512860537682, + "grad_norm": 0.4270383417606354, + "learning_rate": 0.00013347717022576326, + "loss": 1.3654, + "step": 25607 + }, + { + "epoch": 0.33276428059768404, + "grad_norm": 0.3826548457145691, + "learning_rate": 0.0001334745707638519, + "loss": 1.3082, + "step": 25608 + }, + { + "epoch": 0.33277727514159994, + "grad_norm": 0.33963364362716675, + "learning_rate": 0.0001334719713019405, + "loss": 1.192, + "step": 25609 + }, + { + "epoch": 0.3327902696855158, + "grad_norm": 0.27436015009880066, + "learning_rate": 0.0001334693718400291, + "loss": 1.4287, + "step": 25610 + }, + { + "epoch": 0.3328032642294317, + "grad_norm": 0.42664679884910583, + "learning_rate": 0.00013346677237811773, + "loss": 1.4623, + "step": 25611 + }, + { + "epoch": 0.33281625877334753, + "grad_norm": 0.4803803563117981, + "learning_rate": 0.00013346417291620633, + "loss": 1.4997, + "step": 25612 + }, + { + "epoch": 0.33282925331726343, + "grad_norm": 0.3724839389324188, + "learning_rate": 0.00013346157345429498, + "loss": 1.4123, + "step": 25613 + }, + { + "epoch": 0.3328422478611793, + "grad_norm": 0.25665339827537537, + "learning_rate": 0.00013345897399238358, + "loss": 1.409, + "step": 25614 + }, + { + "epoch": 0.3328552424050952, + "grad_norm": 0.32765892148017883, + "learning_rate": 0.0001334563745304722, + "loss": 1.2473, + "step": 25615 + }, + { + "epoch": 0.332868236949011, + "grad_norm": 0.3515726923942566, + "learning_rate": 0.0001334537750685608, + "loss": 1.5841, + "step": 25616 + }, + { + "epoch": 0.3328812314929269, + "grad_norm": 0.3732127249240875, + "learning_rate": 0.00013345117560664943, + "loss": 1.5871, + "step": 25617 + }, + { + "epoch": 0.33289422603684277, + "grad_norm": 0.4103303849697113, + "learning_rate": 0.00013344857614473805, + "loss": 1.2642, + "step": 25618 + }, + { + "epoch": 0.33290722058075867, + "grad_norm": 0.4249461889266968, + "learning_rate": 0.00013344597668282665, + "loss": 1.5431, + "step": 25619 + }, + { + "epoch": 0.3329202151246745, + "grad_norm": 0.2997204065322876, + "learning_rate": 0.00013344337722091527, + "loss": 1.1373, + "step": 25620 + }, + { + "epoch": 0.3329332096685904, + "grad_norm": 0.38188666105270386, + "learning_rate": 0.0001334407777590039, + "loss": 1.3343, + "step": 25621 + }, + { + "epoch": 0.33294620421250626, + "grad_norm": 0.4125663936138153, + "learning_rate": 0.0001334381782970925, + "loss": 1.4628, + "step": 25622 + }, + { + "epoch": 0.33295919875642216, + "grad_norm": 0.3723592758178711, + "learning_rate": 0.00013343557883518112, + "loss": 1.2897, + "step": 25623 + }, + { + "epoch": 0.332972193300338, + "grad_norm": 0.4307374656200409, + "learning_rate": 0.00013343297937326972, + "loss": 1.4864, + "step": 25624 + }, + { + "epoch": 0.3329851878442539, + "grad_norm": 0.4564815163612366, + "learning_rate": 0.00013343037991135837, + "loss": 1.4334, + "step": 25625 + }, + { + "epoch": 0.33299818238816975, + "grad_norm": 0.3599027693271637, + "learning_rate": 0.00013342778044944697, + "loss": 1.4046, + "step": 25626 + }, + { + "epoch": 0.33301117693208565, + "grad_norm": 0.4610879421234131, + "learning_rate": 0.0001334251809875356, + "loss": 1.5288, + "step": 25627 + }, + { + "epoch": 0.3330241714760015, + "grad_norm": 0.47012439370155334, + "learning_rate": 0.00013342258152562422, + "loss": 1.4034, + "step": 25628 + }, + { + "epoch": 0.3330371660199174, + "grad_norm": 0.3730287253856659, + "learning_rate": 0.0001334199820637128, + "loss": 1.3497, + "step": 25629 + }, + { + "epoch": 0.33305016056383324, + "grad_norm": 0.36285486817359924, + "learning_rate": 0.00013341738260180144, + "loss": 1.4784, + "step": 25630 + }, + { + "epoch": 0.33306315510774914, + "grad_norm": 0.3468230962753296, + "learning_rate": 0.00013341478313989003, + "loss": 1.5671, + "step": 25631 + }, + { + "epoch": 0.333076149651665, + "grad_norm": 0.3691968023777008, + "learning_rate": 0.00013341218367797869, + "loss": 1.4311, + "step": 25632 + }, + { + "epoch": 0.3330891441955809, + "grad_norm": 0.38946670293807983, + "learning_rate": 0.00013340958421606728, + "loss": 1.3988, + "step": 25633 + }, + { + "epoch": 0.33310213873949673, + "grad_norm": 0.5536205768585205, + "learning_rate": 0.0001334069847541559, + "loss": 1.4267, + "step": 25634 + }, + { + "epoch": 0.33311513328341263, + "grad_norm": 0.40093138813972473, + "learning_rate": 0.0001334043852922445, + "loss": 1.3898, + "step": 25635 + }, + { + "epoch": 0.3331281278273285, + "grad_norm": 0.27834880352020264, + "learning_rate": 0.00013340178583033313, + "loss": 1.3848, + "step": 25636 + }, + { + "epoch": 0.3331411223712444, + "grad_norm": 0.369975209236145, + "learning_rate": 0.00013339918636842175, + "loss": 1.4382, + "step": 25637 + }, + { + "epoch": 0.3331541169151602, + "grad_norm": 0.34010785818099976, + "learning_rate": 0.00013339658690651035, + "loss": 1.5149, + "step": 25638 + }, + { + "epoch": 0.3331671114590761, + "grad_norm": 0.4089600741863251, + "learning_rate": 0.00013339398744459898, + "loss": 1.4833, + "step": 25639 + }, + { + "epoch": 0.33318010600299197, + "grad_norm": 0.43873879313468933, + "learning_rate": 0.0001333913879826876, + "loss": 1.4863, + "step": 25640 + }, + { + "epoch": 0.33319310054690787, + "grad_norm": 0.33970677852630615, + "learning_rate": 0.0001333887885207762, + "loss": 1.4104, + "step": 25641 + }, + { + "epoch": 0.3332060950908237, + "grad_norm": 0.3152744174003601, + "learning_rate": 0.00013338618905886482, + "loss": 1.4255, + "step": 25642 + }, + { + "epoch": 0.3332190896347396, + "grad_norm": 0.3619931638240814, + "learning_rate": 0.00013338358959695342, + "loss": 1.0646, + "step": 25643 + }, + { + "epoch": 0.33323208417865546, + "grad_norm": 0.39924100041389465, + "learning_rate": 0.00013338099013504207, + "loss": 1.3965, + "step": 25644 + }, + { + "epoch": 0.33324507872257136, + "grad_norm": 0.3600206971168518, + "learning_rate": 0.00013337839067313067, + "loss": 1.3047, + "step": 25645 + }, + { + "epoch": 0.3332580732664872, + "grad_norm": 0.3806486129760742, + "learning_rate": 0.0001333757912112193, + "loss": 1.4779, + "step": 25646 + }, + { + "epoch": 0.3332710678104031, + "grad_norm": 0.4065740406513214, + "learning_rate": 0.0001333731917493079, + "loss": 1.6361, + "step": 25647 + }, + { + "epoch": 0.33328406235431896, + "grad_norm": 0.362339586019516, + "learning_rate": 0.00013337059228739652, + "loss": 1.445, + "step": 25648 + }, + { + "epoch": 0.33329705689823486, + "grad_norm": 0.3874898850917816, + "learning_rate": 0.00013336799282548514, + "loss": 1.4442, + "step": 25649 + }, + { + "epoch": 0.3333100514421507, + "grad_norm": 0.4575158655643463, + "learning_rate": 0.00013336539336357374, + "loss": 1.3938, + "step": 25650 + }, + { + "epoch": 0.3333230459860666, + "grad_norm": 0.3464610278606415, + "learning_rate": 0.00013336279390166236, + "loss": 1.4896, + "step": 25651 + }, + { + "epoch": 0.33333604052998245, + "grad_norm": 0.4411802589893341, + "learning_rate": 0.00013336019443975099, + "loss": 1.3937, + "step": 25652 + }, + { + "epoch": 0.33334903507389835, + "grad_norm": 0.41082653403282166, + "learning_rate": 0.00013335759497783958, + "loss": 1.3852, + "step": 25653 + }, + { + "epoch": 0.3333620296178142, + "grad_norm": 0.3968498408794403, + "learning_rate": 0.0001333549955159282, + "loss": 1.484, + "step": 25654 + }, + { + "epoch": 0.3333750241617301, + "grad_norm": 0.25850117206573486, + "learning_rate": 0.0001333523960540168, + "loss": 1.2697, + "step": 25655 + }, + { + "epoch": 0.333388018705646, + "grad_norm": 0.435116171836853, + "learning_rate": 0.00013334979659210546, + "loss": 1.384, + "step": 25656 + }, + { + "epoch": 0.33340101324956184, + "grad_norm": 0.2973543405532837, + "learning_rate": 0.00013334719713019405, + "loss": 1.205, + "step": 25657 + }, + { + "epoch": 0.33341400779347774, + "grad_norm": 0.3141220211982727, + "learning_rate": 0.00013334459766828268, + "loss": 1.3016, + "step": 25658 + }, + { + "epoch": 0.3334270023373936, + "grad_norm": 0.4753108322620392, + "learning_rate": 0.00013334199820637128, + "loss": 1.351, + "step": 25659 + }, + { + "epoch": 0.3334399968813095, + "grad_norm": 0.4370015859603882, + "learning_rate": 0.0001333393987444599, + "loss": 1.377, + "step": 25660 + }, + { + "epoch": 0.33345299142522533, + "grad_norm": 0.48066309094429016, + "learning_rate": 0.00013333679928254853, + "loss": 1.5265, + "step": 25661 + }, + { + "epoch": 0.33346598596914123, + "grad_norm": 0.41677922010421753, + "learning_rate": 0.00013333419982063712, + "loss": 1.4218, + "step": 25662 + }, + { + "epoch": 0.3334789805130571, + "grad_norm": 0.29214680194854736, + "learning_rate": 0.00013333160035872577, + "loss": 1.2593, + "step": 25663 + }, + { + "epoch": 0.333491975056973, + "grad_norm": 0.3423587679862976, + "learning_rate": 0.00013332900089681437, + "loss": 1.622, + "step": 25664 + }, + { + "epoch": 0.3335049696008888, + "grad_norm": 0.3217088580131531, + "learning_rate": 0.00013332640143490297, + "loss": 1.5343, + "step": 25665 + }, + { + "epoch": 0.3335179641448047, + "grad_norm": 0.42120659351348877, + "learning_rate": 0.0001333238019729916, + "loss": 1.5382, + "step": 25666 + }, + { + "epoch": 0.33353095868872057, + "grad_norm": 0.3697757124900818, + "learning_rate": 0.00013332120251108022, + "loss": 1.4421, + "step": 25667 + }, + { + "epoch": 0.33354395323263647, + "grad_norm": 0.4235227406024933, + "learning_rate": 0.00013331860304916884, + "loss": 1.409, + "step": 25668 + }, + { + "epoch": 0.3335569477765523, + "grad_norm": 0.4443199634552002, + "learning_rate": 0.00013331600358725744, + "loss": 1.2413, + "step": 25669 + }, + { + "epoch": 0.3335699423204682, + "grad_norm": 0.5446480512619019, + "learning_rate": 0.00013331340412534606, + "loss": 1.2817, + "step": 25670 + }, + { + "epoch": 0.33358293686438406, + "grad_norm": 0.3863662779331207, + "learning_rate": 0.0001333108046634347, + "loss": 1.4404, + "step": 25671 + }, + { + "epoch": 0.33359593140829996, + "grad_norm": 0.3658975064754486, + "learning_rate": 0.00013330820520152329, + "loss": 1.4263, + "step": 25672 + }, + { + "epoch": 0.3336089259522158, + "grad_norm": 0.3328890800476074, + "learning_rate": 0.0001333056057396119, + "loss": 1.2018, + "step": 25673 + }, + { + "epoch": 0.3336219204961317, + "grad_norm": 0.41930091381073, + "learning_rate": 0.0001333030062777005, + "loss": 1.4204, + "step": 25674 + }, + { + "epoch": 0.33363491504004755, + "grad_norm": 0.42810705304145813, + "learning_rate": 0.00013330040681578916, + "loss": 1.3207, + "step": 25675 + }, + { + "epoch": 0.33364790958396345, + "grad_norm": 0.41794756054878235, + "learning_rate": 0.00013329780735387776, + "loss": 1.3049, + "step": 25676 + }, + { + "epoch": 0.3336609041278793, + "grad_norm": 0.3177722096443176, + "learning_rate": 0.00013329520789196635, + "loss": 1.3336, + "step": 25677 + }, + { + "epoch": 0.3336738986717952, + "grad_norm": 0.4186551868915558, + "learning_rate": 0.00013329260843005498, + "loss": 1.3983, + "step": 25678 + }, + { + "epoch": 0.33368689321571104, + "grad_norm": 0.3689252436161041, + "learning_rate": 0.0001332900089681436, + "loss": 1.437, + "step": 25679 + }, + { + "epoch": 0.33369988775962695, + "grad_norm": 0.5300559997558594, + "learning_rate": 0.00013328740950623223, + "loss": 1.4918, + "step": 25680 + }, + { + "epoch": 0.3337128823035428, + "grad_norm": 0.30538153648376465, + "learning_rate": 0.00013328481004432083, + "loss": 1.3849, + "step": 25681 + }, + { + "epoch": 0.3337258768474587, + "grad_norm": 0.3892616033554077, + "learning_rate": 0.00013328221058240945, + "loss": 1.361, + "step": 25682 + }, + { + "epoch": 0.33373887139137454, + "grad_norm": 0.41723451018333435, + "learning_rate": 0.00013327961112049807, + "loss": 1.3645, + "step": 25683 + }, + { + "epoch": 0.33375186593529044, + "grad_norm": 0.4507998824119568, + "learning_rate": 0.00013327701165858667, + "loss": 1.4467, + "step": 25684 + }, + { + "epoch": 0.3337648604792063, + "grad_norm": 0.4456848204135895, + "learning_rate": 0.0001332744121966753, + "loss": 1.5981, + "step": 25685 + }, + { + "epoch": 0.3337778550231222, + "grad_norm": 0.42757314443588257, + "learning_rate": 0.0001332718127347639, + "loss": 1.2731, + "step": 25686 + }, + { + "epoch": 0.33379084956703803, + "grad_norm": 0.4181702136993408, + "learning_rate": 0.00013326921327285254, + "loss": 1.519, + "step": 25687 + }, + { + "epoch": 0.33380384411095393, + "grad_norm": 0.36280614137649536, + "learning_rate": 0.00013326661381094114, + "loss": 1.5972, + "step": 25688 + }, + { + "epoch": 0.3338168386548698, + "grad_norm": 0.41598019003868103, + "learning_rate": 0.00013326401434902977, + "loss": 1.2564, + "step": 25689 + }, + { + "epoch": 0.3338298331987857, + "grad_norm": 0.33791980147361755, + "learning_rate": 0.00013326141488711836, + "loss": 1.1705, + "step": 25690 + }, + { + "epoch": 0.3338428277427015, + "grad_norm": 0.4271943271160126, + "learning_rate": 0.000133258815425207, + "loss": 1.2781, + "step": 25691 + }, + { + "epoch": 0.3338558222866174, + "grad_norm": 0.4520389139652252, + "learning_rate": 0.0001332562159632956, + "loss": 1.4017, + "step": 25692 + }, + { + "epoch": 0.33386881683053327, + "grad_norm": 0.3014369308948517, + "learning_rate": 0.0001332536165013842, + "loss": 1.3252, + "step": 25693 + }, + { + "epoch": 0.33388181137444917, + "grad_norm": 0.3556816875934601, + "learning_rate": 0.00013325101703947283, + "loss": 1.3722, + "step": 25694 + }, + { + "epoch": 0.333894805918365, + "grad_norm": 0.3957395553588867, + "learning_rate": 0.00013324841757756146, + "loss": 1.313, + "step": 25695 + }, + { + "epoch": 0.3339078004622809, + "grad_norm": 0.4657697379589081, + "learning_rate": 0.00013324581811565006, + "loss": 1.5053, + "step": 25696 + }, + { + "epoch": 0.33392079500619676, + "grad_norm": 0.38660985231399536, + "learning_rate": 0.00013324321865373868, + "loss": 1.4131, + "step": 25697 + }, + { + "epoch": 0.33393378955011266, + "grad_norm": 0.4783041179180145, + "learning_rate": 0.00013324061919182728, + "loss": 1.487, + "step": 25698 + }, + { + "epoch": 0.3339467840940285, + "grad_norm": 0.3154374361038208, + "learning_rate": 0.00013323801972991593, + "loss": 1.362, + "step": 25699 + }, + { + "epoch": 0.3339597786379444, + "grad_norm": 0.3468500077724457, + "learning_rate": 0.00013323542026800453, + "loss": 1.1607, + "step": 25700 + }, + { + "epoch": 0.33397277318186025, + "grad_norm": 0.3841954469680786, + "learning_rate": 0.00013323282080609315, + "loss": 1.3338, + "step": 25701 + }, + { + "epoch": 0.33398576772577615, + "grad_norm": 0.34123364090919495, + "learning_rate": 0.00013323022134418178, + "loss": 1.4662, + "step": 25702 + }, + { + "epoch": 0.333998762269692, + "grad_norm": 0.3330534100532532, + "learning_rate": 0.00013322762188227037, + "loss": 1.3348, + "step": 25703 + }, + { + "epoch": 0.3340117568136079, + "grad_norm": 0.4104132652282715, + "learning_rate": 0.000133225022420359, + "loss": 1.5437, + "step": 25704 + }, + { + "epoch": 0.33402475135752374, + "grad_norm": 0.41392773389816284, + "learning_rate": 0.0001332224229584476, + "loss": 1.4591, + "step": 25705 + }, + { + "epoch": 0.33403774590143964, + "grad_norm": 0.3826049864292145, + "learning_rate": 0.00013321982349653622, + "loss": 1.3916, + "step": 25706 + }, + { + "epoch": 0.3340507404453555, + "grad_norm": 0.5106659531593323, + "learning_rate": 0.00013321722403462484, + "loss": 1.3409, + "step": 25707 + }, + { + "epoch": 0.3340637349892714, + "grad_norm": 0.3896544873714447, + "learning_rate": 0.00013321462457271344, + "loss": 1.4043, + "step": 25708 + }, + { + "epoch": 0.33407672953318723, + "grad_norm": 0.3524024486541748, + "learning_rate": 0.00013321202511080207, + "loss": 1.3384, + "step": 25709 + }, + { + "epoch": 0.33408972407710313, + "grad_norm": 0.381040096282959, + "learning_rate": 0.0001332094256488907, + "loss": 1.1831, + "step": 25710 + }, + { + "epoch": 0.334102718621019, + "grad_norm": 0.35315778851509094, + "learning_rate": 0.00013320682618697932, + "loss": 1.5032, + "step": 25711 + }, + { + "epoch": 0.3341157131649349, + "grad_norm": 0.37174782156944275, + "learning_rate": 0.0001332042267250679, + "loss": 1.5872, + "step": 25712 + }, + { + "epoch": 0.3341287077088507, + "grad_norm": 0.387458473443985, + "learning_rate": 0.00013320162726315654, + "loss": 1.2877, + "step": 25713 + }, + { + "epoch": 0.3341417022527666, + "grad_norm": 0.42157042026519775, + "learning_rate": 0.00013319902780124516, + "loss": 1.4704, + "step": 25714 + }, + { + "epoch": 0.33415469679668247, + "grad_norm": 0.47148948907852173, + "learning_rate": 0.00013319642833933376, + "loss": 1.3166, + "step": 25715 + }, + { + "epoch": 0.33416769134059837, + "grad_norm": 0.43402060866355896, + "learning_rate": 0.00013319382887742238, + "loss": 1.2751, + "step": 25716 + }, + { + "epoch": 0.3341806858845142, + "grad_norm": 0.33133432269096375, + "learning_rate": 0.00013319122941551098, + "loss": 1.2877, + "step": 25717 + }, + { + "epoch": 0.3341936804284301, + "grad_norm": 0.4010087847709656, + "learning_rate": 0.00013318862995359963, + "loss": 1.4587, + "step": 25718 + }, + { + "epoch": 0.33420667497234596, + "grad_norm": 0.4143335521221161, + "learning_rate": 0.00013318603049168823, + "loss": 1.4603, + "step": 25719 + }, + { + "epoch": 0.33421966951626186, + "grad_norm": 0.46601802110671997, + "learning_rate": 0.00013318343102977683, + "loss": 1.5658, + "step": 25720 + }, + { + "epoch": 0.3342326640601777, + "grad_norm": 0.45379146933555603, + "learning_rate": 0.00013318083156786545, + "loss": 1.3591, + "step": 25721 + }, + { + "epoch": 0.3342456586040936, + "grad_norm": 0.39463135600090027, + "learning_rate": 0.00013317823210595408, + "loss": 1.3518, + "step": 25722 + }, + { + "epoch": 0.33425865314800945, + "grad_norm": 0.41754457354545593, + "learning_rate": 0.0001331756326440427, + "loss": 1.4463, + "step": 25723 + }, + { + "epoch": 0.33427164769192536, + "grad_norm": 0.3736172914505005, + "learning_rate": 0.0001331730331821313, + "loss": 1.4334, + "step": 25724 + }, + { + "epoch": 0.3342846422358412, + "grad_norm": 0.47630414366722107, + "learning_rate": 0.00013317043372021992, + "loss": 1.6443, + "step": 25725 + }, + { + "epoch": 0.3342976367797571, + "grad_norm": 0.3538120985031128, + "learning_rate": 0.00013316783425830855, + "loss": 1.4333, + "step": 25726 + }, + { + "epoch": 0.33431063132367295, + "grad_norm": 0.39376553893089294, + "learning_rate": 0.00013316523479639714, + "loss": 1.4702, + "step": 25727 + }, + { + "epoch": 0.33432362586758885, + "grad_norm": 0.39656057953834534, + "learning_rate": 0.00013316263533448577, + "loss": 1.5044, + "step": 25728 + }, + { + "epoch": 0.3343366204115047, + "grad_norm": 0.3424912095069885, + "learning_rate": 0.00013316003587257437, + "loss": 1.4403, + "step": 25729 + }, + { + "epoch": 0.3343496149554206, + "grad_norm": 0.40744975209236145, + "learning_rate": 0.00013315743641066302, + "loss": 1.4297, + "step": 25730 + }, + { + "epoch": 0.33436260949933644, + "grad_norm": 0.38761138916015625, + "learning_rate": 0.00013315483694875162, + "loss": 1.265, + "step": 25731 + }, + { + "epoch": 0.33437560404325234, + "grad_norm": 0.39098161458969116, + "learning_rate": 0.0001331522374868402, + "loss": 1.3046, + "step": 25732 + }, + { + "epoch": 0.33438859858716824, + "grad_norm": 0.30262884497642517, + "learning_rate": 0.00013314963802492884, + "loss": 1.358, + "step": 25733 + }, + { + "epoch": 0.3344015931310841, + "grad_norm": 0.3829196095466614, + "learning_rate": 0.00013314703856301746, + "loss": 1.3496, + "step": 25734 + }, + { + "epoch": 0.334414587675, + "grad_norm": 0.3335346281528473, + "learning_rate": 0.0001331444391011061, + "loss": 1.4643, + "step": 25735 + }, + { + "epoch": 0.33442758221891583, + "grad_norm": 0.3577483892440796, + "learning_rate": 0.00013314183963919468, + "loss": 1.2778, + "step": 25736 + }, + { + "epoch": 0.33444057676283173, + "grad_norm": 0.3655077815055847, + "learning_rate": 0.0001331392401772833, + "loss": 1.3219, + "step": 25737 + }, + { + "epoch": 0.3344535713067476, + "grad_norm": 0.43895676732063293, + "learning_rate": 0.00013313664071537193, + "loss": 1.3642, + "step": 25738 + }, + { + "epoch": 0.3344665658506635, + "grad_norm": 0.30158793926239014, + "learning_rate": 0.00013313404125346053, + "loss": 1.2761, + "step": 25739 + }, + { + "epoch": 0.3344795603945793, + "grad_norm": 0.3677535951137543, + "learning_rate": 0.00013313144179154915, + "loss": 1.4953, + "step": 25740 + }, + { + "epoch": 0.3344925549384952, + "grad_norm": 0.3854217231273651, + "learning_rate": 0.00013312884232963778, + "loss": 1.5315, + "step": 25741 + }, + { + "epoch": 0.33450554948241107, + "grad_norm": 0.5100762844085693, + "learning_rate": 0.0001331262428677264, + "loss": 1.4422, + "step": 25742 + }, + { + "epoch": 0.33451854402632697, + "grad_norm": 0.3941008150577545, + "learning_rate": 0.000133123643405815, + "loss": 1.3095, + "step": 25743 + }, + { + "epoch": 0.3345315385702428, + "grad_norm": 0.5803819298744202, + "learning_rate": 0.0001331210439439036, + "loss": 1.355, + "step": 25744 + }, + { + "epoch": 0.3345445331141587, + "grad_norm": 0.4111936390399933, + "learning_rate": 0.00013311844448199225, + "loss": 1.3819, + "step": 25745 + }, + { + "epoch": 0.33455752765807456, + "grad_norm": 0.28085440397262573, + "learning_rate": 0.00013311584502008085, + "loss": 1.3052, + "step": 25746 + }, + { + "epoch": 0.33457052220199046, + "grad_norm": 0.4010433852672577, + "learning_rate": 0.00013311324555816947, + "loss": 1.4377, + "step": 25747 + }, + { + "epoch": 0.3345835167459063, + "grad_norm": 0.41501107811927795, + "learning_rate": 0.00013311064609625807, + "loss": 1.4122, + "step": 25748 + }, + { + "epoch": 0.3345965112898222, + "grad_norm": 0.48217329382896423, + "learning_rate": 0.0001331080466343467, + "loss": 1.3691, + "step": 25749 + }, + { + "epoch": 0.33460950583373805, + "grad_norm": 0.3848549723625183, + "learning_rate": 0.00013310544717243532, + "loss": 1.4608, + "step": 25750 + }, + { + "epoch": 0.33462250037765395, + "grad_norm": 0.6189988255500793, + "learning_rate": 0.00013310284771052392, + "loss": 1.5218, + "step": 25751 + }, + { + "epoch": 0.3346354949215698, + "grad_norm": 0.44294923543930054, + "learning_rate": 0.00013310024824861254, + "loss": 1.3063, + "step": 25752 + }, + { + "epoch": 0.3346484894654857, + "grad_norm": 0.3766375780105591, + "learning_rate": 0.00013309764878670116, + "loss": 1.3734, + "step": 25753 + }, + { + "epoch": 0.33466148400940154, + "grad_norm": 0.4893748164176941, + "learning_rate": 0.0001330950493247898, + "loss": 1.4105, + "step": 25754 + }, + { + "epoch": 0.33467447855331744, + "grad_norm": 0.3725035488605499, + "learning_rate": 0.00013309244986287839, + "loss": 1.2359, + "step": 25755 + }, + { + "epoch": 0.3346874730972333, + "grad_norm": 0.3913702666759491, + "learning_rate": 0.000133089850400967, + "loss": 1.5945, + "step": 25756 + }, + { + "epoch": 0.3347004676411492, + "grad_norm": 0.46373018622398376, + "learning_rate": 0.00013308725093905564, + "loss": 1.487, + "step": 25757 + }, + { + "epoch": 0.33471346218506504, + "grad_norm": 0.43601059913635254, + "learning_rate": 0.00013308465147714423, + "loss": 1.6358, + "step": 25758 + }, + { + "epoch": 0.33472645672898094, + "grad_norm": 0.3991638720035553, + "learning_rate": 0.00013308205201523286, + "loss": 1.5986, + "step": 25759 + }, + { + "epoch": 0.3347394512728968, + "grad_norm": 0.4584387540817261, + "learning_rate": 0.00013307945255332145, + "loss": 1.4354, + "step": 25760 + }, + { + "epoch": 0.3347524458168127, + "grad_norm": 0.3756411671638489, + "learning_rate": 0.00013307685309141008, + "loss": 1.371, + "step": 25761 + }, + { + "epoch": 0.33476544036072853, + "grad_norm": 0.38803601264953613, + "learning_rate": 0.0001330742536294987, + "loss": 1.2757, + "step": 25762 + }, + { + "epoch": 0.33477843490464443, + "grad_norm": 0.5287535190582275, + "learning_rate": 0.0001330716541675873, + "loss": 1.6751, + "step": 25763 + }, + { + "epoch": 0.3347914294485603, + "grad_norm": 0.42449700832366943, + "learning_rate": 0.00013306905470567593, + "loss": 1.3604, + "step": 25764 + }, + { + "epoch": 0.3348044239924762, + "grad_norm": 0.37302276492118835, + "learning_rate": 0.00013306645524376455, + "loss": 1.2632, + "step": 25765 + }, + { + "epoch": 0.334817418536392, + "grad_norm": 0.3807085156440735, + "learning_rate": 0.00013306385578185317, + "loss": 1.3199, + "step": 25766 + }, + { + "epoch": 0.3348304130803079, + "grad_norm": 0.3239312171936035, + "learning_rate": 0.00013306125631994177, + "loss": 1.2471, + "step": 25767 + }, + { + "epoch": 0.33484340762422377, + "grad_norm": 0.39483028650283813, + "learning_rate": 0.0001330586568580304, + "loss": 1.5344, + "step": 25768 + }, + { + "epoch": 0.33485640216813967, + "grad_norm": 0.4829374849796295, + "learning_rate": 0.00013305605739611902, + "loss": 1.4222, + "step": 25769 + }, + { + "epoch": 0.3348693967120555, + "grad_norm": 0.3475134074687958, + "learning_rate": 0.00013305345793420762, + "loss": 1.3184, + "step": 25770 + }, + { + "epoch": 0.3348823912559714, + "grad_norm": 0.39459505677223206, + "learning_rate": 0.00013305085847229624, + "loss": 1.3622, + "step": 25771 + }, + { + "epoch": 0.33489538579988726, + "grad_norm": 0.408558189868927, + "learning_rate": 0.00013304825901038484, + "loss": 1.4059, + "step": 25772 + }, + { + "epoch": 0.33490838034380316, + "grad_norm": 0.49281829595565796, + "learning_rate": 0.00013304565954847346, + "loss": 1.5471, + "step": 25773 + }, + { + "epoch": 0.334921374887719, + "grad_norm": 0.3547395169734955, + "learning_rate": 0.0001330430600865621, + "loss": 1.3622, + "step": 25774 + }, + { + "epoch": 0.3349343694316349, + "grad_norm": 0.38016456365585327, + "learning_rate": 0.00013304046062465069, + "loss": 1.384, + "step": 25775 + }, + { + "epoch": 0.33494736397555075, + "grad_norm": 0.3344155251979828, + "learning_rate": 0.00013303786116273934, + "loss": 1.208, + "step": 25776 + }, + { + "epoch": 0.33496035851946665, + "grad_norm": 0.43175214529037476, + "learning_rate": 0.00013303526170082794, + "loss": 1.486, + "step": 25777 + }, + { + "epoch": 0.3349733530633825, + "grad_norm": 0.35000523924827576, + "learning_rate": 0.00013303266223891656, + "loss": 1.3439, + "step": 25778 + }, + { + "epoch": 0.3349863476072984, + "grad_norm": 0.4333122670650482, + "learning_rate": 0.00013303006277700516, + "loss": 1.4226, + "step": 25779 + }, + { + "epoch": 0.33499934215121424, + "grad_norm": 0.46730920672416687, + "learning_rate": 0.00013302746331509378, + "loss": 1.3632, + "step": 25780 + }, + { + "epoch": 0.33501233669513014, + "grad_norm": 0.5109378695487976, + "learning_rate": 0.0001330248638531824, + "loss": 1.4994, + "step": 25781 + }, + { + "epoch": 0.335025331239046, + "grad_norm": 0.38987481594085693, + "learning_rate": 0.000133022264391271, + "loss": 1.4897, + "step": 25782 + }, + { + "epoch": 0.3350383257829619, + "grad_norm": 0.3619634211063385, + "learning_rate": 0.00013301966492935963, + "loss": 1.4555, + "step": 25783 + }, + { + "epoch": 0.33505132032687773, + "grad_norm": 0.42930787801742554, + "learning_rate": 0.00013301706546744825, + "loss": 1.4001, + "step": 25784 + }, + { + "epoch": 0.33506431487079363, + "grad_norm": 0.2995814383029938, + "learning_rate": 0.00013301446600553688, + "loss": 1.4406, + "step": 25785 + }, + { + "epoch": 0.3350773094147095, + "grad_norm": 0.42854374647140503, + "learning_rate": 0.00013301186654362547, + "loss": 1.3489, + "step": 25786 + }, + { + "epoch": 0.3350903039586254, + "grad_norm": 0.36997437477111816, + "learning_rate": 0.00013300926708171407, + "loss": 1.374, + "step": 25787 + }, + { + "epoch": 0.3351032985025412, + "grad_norm": 0.3460169732570648, + "learning_rate": 0.00013300666761980272, + "loss": 1.1957, + "step": 25788 + }, + { + "epoch": 0.3351162930464571, + "grad_norm": 0.40232178568840027, + "learning_rate": 0.00013300406815789132, + "loss": 1.3714, + "step": 25789 + }, + { + "epoch": 0.33512928759037297, + "grad_norm": 0.3390718102455139, + "learning_rate": 0.00013300146869597995, + "loss": 1.5198, + "step": 25790 + }, + { + "epoch": 0.33514228213428887, + "grad_norm": 0.4513596296310425, + "learning_rate": 0.00013299886923406854, + "loss": 1.3926, + "step": 25791 + }, + { + "epoch": 0.3351552766782047, + "grad_norm": 0.3837360739707947, + "learning_rate": 0.00013299626977215717, + "loss": 1.5218, + "step": 25792 + }, + { + "epoch": 0.3351682712221206, + "grad_norm": 0.35585376620292664, + "learning_rate": 0.0001329936703102458, + "loss": 1.4913, + "step": 25793 + }, + { + "epoch": 0.33518126576603646, + "grad_norm": 0.40672287344932556, + "learning_rate": 0.0001329910708483344, + "loss": 1.425, + "step": 25794 + }, + { + "epoch": 0.33519426030995236, + "grad_norm": 0.4959190785884857, + "learning_rate": 0.000132988471386423, + "loss": 1.545, + "step": 25795 + }, + { + "epoch": 0.3352072548538682, + "grad_norm": 0.39881736040115356, + "learning_rate": 0.00013298587192451164, + "loss": 1.4014, + "step": 25796 + }, + { + "epoch": 0.3352202493977841, + "grad_norm": 0.25943171977996826, + "learning_rate": 0.00013298327246260026, + "loss": 1.4929, + "step": 25797 + }, + { + "epoch": 0.33523324394169995, + "grad_norm": 0.4061547815799713, + "learning_rate": 0.00013298067300068886, + "loss": 1.5397, + "step": 25798 + }, + { + "epoch": 0.33524623848561586, + "grad_norm": 0.38471323251724243, + "learning_rate": 0.00013297807353877746, + "loss": 1.3281, + "step": 25799 + }, + { + "epoch": 0.3352592330295317, + "grad_norm": 0.38609546422958374, + "learning_rate": 0.0001329754740768661, + "loss": 1.5517, + "step": 25800 + }, + { + "epoch": 0.3352722275734476, + "grad_norm": 0.35709348320961, + "learning_rate": 0.0001329728746149547, + "loss": 1.3181, + "step": 25801 + }, + { + "epoch": 0.33528522211736345, + "grad_norm": 0.4715689420700073, + "learning_rate": 0.00013297027515304333, + "loss": 1.4992, + "step": 25802 + }, + { + "epoch": 0.33529821666127935, + "grad_norm": 0.3763044774532318, + "learning_rate": 0.00013296767569113193, + "loss": 1.2592, + "step": 25803 + }, + { + "epoch": 0.3353112112051952, + "grad_norm": 0.33089184761047363, + "learning_rate": 0.00013296507622922055, + "loss": 1.268, + "step": 25804 + }, + { + "epoch": 0.3353242057491111, + "grad_norm": 0.45010942220687866, + "learning_rate": 0.00013296247676730918, + "loss": 1.3536, + "step": 25805 + }, + { + "epoch": 0.33533720029302694, + "grad_norm": 0.42581161856651306, + "learning_rate": 0.00013295987730539777, + "loss": 1.3623, + "step": 25806 + }, + { + "epoch": 0.33535019483694284, + "grad_norm": 0.3786059617996216, + "learning_rate": 0.0001329572778434864, + "loss": 1.1144, + "step": 25807 + }, + { + "epoch": 0.33536318938085874, + "grad_norm": 0.39766258001327515, + "learning_rate": 0.00013295467838157502, + "loss": 1.5018, + "step": 25808 + }, + { + "epoch": 0.3353761839247746, + "grad_norm": 0.4654091000556946, + "learning_rate": 0.00013295207891966365, + "loss": 1.3901, + "step": 25809 + }, + { + "epoch": 0.3353891784686905, + "grad_norm": 0.32602956891059875, + "learning_rate": 0.00013294947945775225, + "loss": 1.3941, + "step": 25810 + }, + { + "epoch": 0.33540217301260633, + "grad_norm": 0.33514222502708435, + "learning_rate": 0.00013294687999584087, + "loss": 1.4649, + "step": 25811 + }, + { + "epoch": 0.33541516755652223, + "grad_norm": 0.29577383399009705, + "learning_rate": 0.0001329442805339295, + "loss": 1.3222, + "step": 25812 + }, + { + "epoch": 0.3354281621004381, + "grad_norm": 0.3306135833263397, + "learning_rate": 0.0001329416810720181, + "loss": 1.394, + "step": 25813 + }, + { + "epoch": 0.335441156644354, + "grad_norm": 0.36510902643203735, + "learning_rate": 0.00013293908161010672, + "loss": 1.2757, + "step": 25814 + }, + { + "epoch": 0.3354541511882698, + "grad_norm": 0.39862626791000366, + "learning_rate": 0.00013293648214819534, + "loss": 1.4258, + "step": 25815 + }, + { + "epoch": 0.3354671457321857, + "grad_norm": 0.4043697118759155, + "learning_rate": 0.00013293388268628394, + "loss": 1.4016, + "step": 25816 + }, + { + "epoch": 0.33548014027610157, + "grad_norm": 0.34013503789901733, + "learning_rate": 0.00013293128322437256, + "loss": 1.3396, + "step": 25817 + }, + { + "epoch": 0.33549313482001747, + "grad_norm": 0.4256230294704437, + "learning_rate": 0.00013292868376246116, + "loss": 1.4932, + "step": 25818 + }, + { + "epoch": 0.3355061293639333, + "grad_norm": 0.38826268911361694, + "learning_rate": 0.0001329260843005498, + "loss": 1.4574, + "step": 25819 + }, + { + "epoch": 0.3355191239078492, + "grad_norm": 0.33866193890571594, + "learning_rate": 0.0001329234848386384, + "loss": 1.5768, + "step": 25820 + }, + { + "epoch": 0.33553211845176506, + "grad_norm": 0.3599224090576172, + "learning_rate": 0.00013292088537672703, + "loss": 1.3958, + "step": 25821 + }, + { + "epoch": 0.33554511299568096, + "grad_norm": 0.3819292485713959, + "learning_rate": 0.00013291828591481563, + "loss": 1.1807, + "step": 25822 + }, + { + "epoch": 0.3355581075395968, + "grad_norm": 0.43045055866241455, + "learning_rate": 0.00013291568645290426, + "loss": 1.4105, + "step": 25823 + }, + { + "epoch": 0.3355711020835127, + "grad_norm": 0.2808796167373657, + "learning_rate": 0.00013291308699099288, + "loss": 1.2963, + "step": 25824 + }, + { + "epoch": 0.33558409662742855, + "grad_norm": 0.39081984758377075, + "learning_rate": 0.00013291048752908148, + "loss": 1.3504, + "step": 25825 + }, + { + "epoch": 0.33559709117134445, + "grad_norm": 0.36974290013313293, + "learning_rate": 0.0001329078880671701, + "loss": 1.2827, + "step": 25826 + }, + { + "epoch": 0.3356100857152603, + "grad_norm": 0.3717120885848999, + "learning_rate": 0.00013290528860525873, + "loss": 1.5719, + "step": 25827 + }, + { + "epoch": 0.3356230802591762, + "grad_norm": 0.36996975541114807, + "learning_rate": 0.00013290268914334732, + "loss": 1.4233, + "step": 25828 + }, + { + "epoch": 0.33563607480309204, + "grad_norm": 0.40690329670906067, + "learning_rate": 0.00013290008968143595, + "loss": 1.5165, + "step": 25829 + }, + { + "epoch": 0.33564906934700794, + "grad_norm": 0.37998270988464355, + "learning_rate": 0.00013289749021952455, + "loss": 1.4288, + "step": 25830 + }, + { + "epoch": 0.3356620638909238, + "grad_norm": 0.3923504650592804, + "learning_rate": 0.0001328948907576132, + "loss": 1.3807, + "step": 25831 + }, + { + "epoch": 0.3356750584348397, + "grad_norm": 0.394951730966568, + "learning_rate": 0.0001328922912957018, + "loss": 1.4369, + "step": 25832 + }, + { + "epoch": 0.33568805297875554, + "grad_norm": 0.4979211390018463, + "learning_rate": 0.00013288969183379042, + "loss": 1.3597, + "step": 25833 + }, + { + "epoch": 0.33570104752267144, + "grad_norm": 0.5070407390594482, + "learning_rate": 0.00013288709237187902, + "loss": 1.4285, + "step": 25834 + }, + { + "epoch": 0.3357140420665873, + "grad_norm": 0.316627562046051, + "learning_rate": 0.00013288449290996764, + "loss": 1.3436, + "step": 25835 + }, + { + "epoch": 0.3357270366105032, + "grad_norm": 0.472935289144516, + "learning_rate": 0.00013288189344805626, + "loss": 1.3824, + "step": 25836 + }, + { + "epoch": 0.335740031154419, + "grad_norm": 0.3751969635486603, + "learning_rate": 0.00013287929398614486, + "loss": 1.3853, + "step": 25837 + }, + { + "epoch": 0.33575302569833493, + "grad_norm": 0.37136751413345337, + "learning_rate": 0.0001328766945242335, + "loss": 1.3163, + "step": 25838 + }, + { + "epoch": 0.3357660202422508, + "grad_norm": 0.48989471793174744, + "learning_rate": 0.0001328740950623221, + "loss": 1.3307, + "step": 25839 + }, + { + "epoch": 0.3357790147861667, + "grad_norm": 0.2639951705932617, + "learning_rate": 0.00013287149560041074, + "loss": 1.2027, + "step": 25840 + }, + { + "epoch": 0.3357920093300825, + "grad_norm": 0.39515629410743713, + "learning_rate": 0.00013286889613849933, + "loss": 1.433, + "step": 25841 + }, + { + "epoch": 0.3358050038739984, + "grad_norm": 0.3363722860813141, + "learning_rate": 0.00013286629667658793, + "loss": 1.2829, + "step": 25842 + }, + { + "epoch": 0.33581799841791427, + "grad_norm": 0.38438382744789124, + "learning_rate": 0.00013286369721467658, + "loss": 1.4802, + "step": 25843 + }, + { + "epoch": 0.33583099296183017, + "grad_norm": 0.4884340167045593, + "learning_rate": 0.00013286109775276518, + "loss": 1.4087, + "step": 25844 + }, + { + "epoch": 0.335843987505746, + "grad_norm": 0.43961596488952637, + "learning_rate": 0.0001328584982908538, + "loss": 1.5294, + "step": 25845 + }, + { + "epoch": 0.3358569820496619, + "grad_norm": 0.33718201518058777, + "learning_rate": 0.0001328558988289424, + "loss": 1.2932, + "step": 25846 + }, + { + "epoch": 0.33586997659357776, + "grad_norm": 0.270742267370224, + "learning_rate": 0.00013285329936703103, + "loss": 1.2658, + "step": 25847 + }, + { + "epoch": 0.33588297113749366, + "grad_norm": 0.4900417923927307, + "learning_rate": 0.00013285069990511965, + "loss": 1.4858, + "step": 25848 + }, + { + "epoch": 0.3358959656814095, + "grad_norm": 0.534781813621521, + "learning_rate": 0.00013284810044320825, + "loss": 1.389, + "step": 25849 + }, + { + "epoch": 0.3359089602253254, + "grad_norm": 0.4320332705974579, + "learning_rate": 0.0001328455009812969, + "loss": 1.5189, + "step": 25850 + }, + { + "epoch": 0.33592195476924125, + "grad_norm": 0.42957374453544617, + "learning_rate": 0.0001328429015193855, + "loss": 1.5125, + "step": 25851 + }, + { + "epoch": 0.33593494931315715, + "grad_norm": 0.47541356086730957, + "learning_rate": 0.00013284030205747412, + "loss": 1.4657, + "step": 25852 + }, + { + "epoch": 0.335947943857073, + "grad_norm": 0.3425551950931549, + "learning_rate": 0.00013283770259556272, + "loss": 1.4741, + "step": 25853 + }, + { + "epoch": 0.3359609384009889, + "grad_norm": 0.41446205973625183, + "learning_rate": 0.00013283510313365134, + "loss": 1.3261, + "step": 25854 + }, + { + "epoch": 0.33597393294490474, + "grad_norm": 0.3287166357040405, + "learning_rate": 0.00013283250367173997, + "loss": 1.3456, + "step": 25855 + }, + { + "epoch": 0.33598692748882064, + "grad_norm": 0.39911869168281555, + "learning_rate": 0.00013282990420982856, + "loss": 1.3534, + "step": 25856 + }, + { + "epoch": 0.3359999220327365, + "grad_norm": 0.3255561888217926, + "learning_rate": 0.0001328273047479172, + "loss": 1.3262, + "step": 25857 + }, + { + "epoch": 0.3360129165766524, + "grad_norm": 0.3266236484050751, + "learning_rate": 0.00013282470528600581, + "loss": 1.303, + "step": 25858 + }, + { + "epoch": 0.33602591112056823, + "grad_norm": 0.48608916997909546, + "learning_rate": 0.0001328221058240944, + "loss": 1.3884, + "step": 25859 + }, + { + "epoch": 0.33603890566448413, + "grad_norm": 0.4095337986946106, + "learning_rate": 0.00013281950636218304, + "loss": 1.6156, + "step": 25860 + }, + { + "epoch": 0.3360519002084, + "grad_norm": 0.46016305685043335, + "learning_rate": 0.00013281690690027163, + "loss": 1.6192, + "step": 25861 + }, + { + "epoch": 0.3360648947523159, + "grad_norm": 0.4501890540122986, + "learning_rate": 0.00013281430743836028, + "loss": 1.4899, + "step": 25862 + }, + { + "epoch": 0.3360778892962317, + "grad_norm": 0.37623700499534607, + "learning_rate": 0.00013281170797644888, + "loss": 1.4898, + "step": 25863 + }, + { + "epoch": 0.3360908838401476, + "grad_norm": 0.41207200288772583, + "learning_rate": 0.0001328091085145375, + "loss": 1.4318, + "step": 25864 + }, + { + "epoch": 0.33610387838406347, + "grad_norm": 0.3514039218425751, + "learning_rate": 0.0001328065090526261, + "loss": 1.2774, + "step": 25865 + }, + { + "epoch": 0.33611687292797937, + "grad_norm": 0.4079730212688446, + "learning_rate": 0.00013280390959071473, + "loss": 1.5517, + "step": 25866 + }, + { + "epoch": 0.3361298674718952, + "grad_norm": 0.3291737735271454, + "learning_rate": 0.00013280131012880335, + "loss": 1.1904, + "step": 25867 + }, + { + "epoch": 0.3361428620158111, + "grad_norm": 0.4333759546279907, + "learning_rate": 0.00013279871066689195, + "loss": 1.482, + "step": 25868 + }, + { + "epoch": 0.33615585655972696, + "grad_norm": 0.4017113149166107, + "learning_rate": 0.00013279611120498057, + "loss": 1.4207, + "step": 25869 + }, + { + "epoch": 0.33616885110364286, + "grad_norm": 0.33533087372779846, + "learning_rate": 0.0001327935117430692, + "loss": 1.3787, + "step": 25870 + }, + { + "epoch": 0.3361818456475587, + "grad_norm": 0.39227885007858276, + "learning_rate": 0.0001327909122811578, + "loss": 1.5333, + "step": 25871 + }, + { + "epoch": 0.3361948401914746, + "grad_norm": 0.36286699771881104, + "learning_rate": 0.00013278831281924642, + "loss": 1.4355, + "step": 25872 + }, + { + "epoch": 0.33620783473539045, + "grad_norm": 0.34125709533691406, + "learning_rate": 0.00013278571335733502, + "loss": 1.3908, + "step": 25873 + }, + { + "epoch": 0.33622082927930635, + "grad_norm": 0.24990318715572357, + "learning_rate": 0.00013278311389542367, + "loss": 1.4261, + "step": 25874 + }, + { + "epoch": 0.3362338238232222, + "grad_norm": 0.4212367832660675, + "learning_rate": 0.00013278051443351227, + "loss": 1.4594, + "step": 25875 + }, + { + "epoch": 0.3362468183671381, + "grad_norm": 0.5457077026367188, + "learning_rate": 0.0001327779149716009, + "loss": 1.3095, + "step": 25876 + }, + { + "epoch": 0.33625981291105395, + "grad_norm": 0.36358198523521423, + "learning_rate": 0.0001327753155096895, + "loss": 1.3502, + "step": 25877 + }, + { + "epoch": 0.33627280745496985, + "grad_norm": 0.39090877771377563, + "learning_rate": 0.00013277271604777811, + "loss": 1.3575, + "step": 25878 + }, + { + "epoch": 0.3362858019988857, + "grad_norm": 0.4438571035861969, + "learning_rate": 0.00013277011658586674, + "loss": 1.4258, + "step": 25879 + }, + { + "epoch": 0.3362987965428016, + "grad_norm": 0.4956974387168884, + "learning_rate": 0.00013276751712395534, + "loss": 1.4071, + "step": 25880 + }, + { + "epoch": 0.33631179108671744, + "grad_norm": 0.33568087220191956, + "learning_rate": 0.00013276491766204396, + "loss": 1.3551, + "step": 25881 + }, + { + "epoch": 0.33632478563063334, + "grad_norm": 0.41659367084503174, + "learning_rate": 0.00013276231820013258, + "loss": 1.4955, + "step": 25882 + }, + { + "epoch": 0.3363377801745492, + "grad_norm": 0.47589755058288574, + "learning_rate": 0.00013275971873822118, + "loss": 1.4073, + "step": 25883 + }, + { + "epoch": 0.3363507747184651, + "grad_norm": 0.3754766881465912, + "learning_rate": 0.0001327571192763098, + "loss": 1.3217, + "step": 25884 + }, + { + "epoch": 0.336363769262381, + "grad_norm": 0.37256643176078796, + "learning_rate": 0.0001327545198143984, + "loss": 1.2539, + "step": 25885 + }, + { + "epoch": 0.33637676380629683, + "grad_norm": 0.4344462752342224, + "learning_rate": 0.00013275192035248706, + "loss": 1.4253, + "step": 25886 + }, + { + "epoch": 0.33638975835021273, + "grad_norm": 0.4055934250354767, + "learning_rate": 0.00013274932089057565, + "loss": 1.4551, + "step": 25887 + }, + { + "epoch": 0.3364027528941286, + "grad_norm": 0.46688514947891235, + "learning_rate": 0.00013274672142866428, + "loss": 1.4554, + "step": 25888 + }, + { + "epoch": 0.3364157474380445, + "grad_norm": 0.3879428803920746, + "learning_rate": 0.0001327441219667529, + "loss": 1.5035, + "step": 25889 + }, + { + "epoch": 0.3364287419819603, + "grad_norm": 0.3789123594760895, + "learning_rate": 0.0001327415225048415, + "loss": 1.5252, + "step": 25890 + }, + { + "epoch": 0.3364417365258762, + "grad_norm": 0.35874906182289124, + "learning_rate": 0.00013273892304293012, + "loss": 1.2911, + "step": 25891 + }, + { + "epoch": 0.33645473106979207, + "grad_norm": 0.32427549362182617, + "learning_rate": 0.00013273632358101872, + "loss": 1.3824, + "step": 25892 + }, + { + "epoch": 0.33646772561370797, + "grad_norm": 0.36753812432289124, + "learning_rate": 0.00013273372411910737, + "loss": 1.4153, + "step": 25893 + }, + { + "epoch": 0.3364807201576238, + "grad_norm": 0.296047180891037, + "learning_rate": 0.00013273112465719597, + "loss": 1.3456, + "step": 25894 + }, + { + "epoch": 0.3364937147015397, + "grad_norm": 0.3588009178638458, + "learning_rate": 0.0001327285251952846, + "loss": 1.3898, + "step": 25895 + }, + { + "epoch": 0.33650670924545556, + "grad_norm": 0.4116191267967224, + "learning_rate": 0.0001327259257333732, + "loss": 1.3677, + "step": 25896 + }, + { + "epoch": 0.33651970378937146, + "grad_norm": 0.3759332001209259, + "learning_rate": 0.00013272332627146182, + "loss": 1.3373, + "step": 25897 + }, + { + "epoch": 0.3365326983332873, + "grad_norm": 0.4630047380924225, + "learning_rate": 0.00013272072680955044, + "loss": 1.4346, + "step": 25898 + }, + { + "epoch": 0.3365456928772032, + "grad_norm": 0.2555157244205475, + "learning_rate": 0.00013271812734763904, + "loss": 1.3256, + "step": 25899 + }, + { + "epoch": 0.33655868742111905, + "grad_norm": 0.40940627455711365, + "learning_rate": 0.00013271552788572766, + "loss": 1.4412, + "step": 25900 + }, + { + "epoch": 0.33657168196503495, + "grad_norm": 0.3870319724082947, + "learning_rate": 0.0001327129284238163, + "loss": 1.3399, + "step": 25901 + }, + { + "epoch": 0.3365846765089508, + "grad_norm": 0.4527110457420349, + "learning_rate": 0.00013271032896190488, + "loss": 1.4017, + "step": 25902 + }, + { + "epoch": 0.3365976710528667, + "grad_norm": 0.46222782135009766, + "learning_rate": 0.0001327077294999935, + "loss": 1.4352, + "step": 25903 + }, + { + "epoch": 0.33661066559678254, + "grad_norm": 0.4205856919288635, + "learning_rate": 0.0001327051300380821, + "loss": 1.1065, + "step": 25904 + }, + { + "epoch": 0.33662366014069844, + "grad_norm": 0.39422059059143066, + "learning_rate": 0.00013270253057617076, + "loss": 1.4335, + "step": 25905 + }, + { + "epoch": 0.3366366546846143, + "grad_norm": 0.41192176938056946, + "learning_rate": 0.00013269993111425936, + "loss": 1.4046, + "step": 25906 + }, + { + "epoch": 0.3366496492285302, + "grad_norm": 0.33498743176460266, + "learning_rate": 0.00013269733165234798, + "loss": 1.5912, + "step": 25907 + }, + { + "epoch": 0.33666264377244604, + "grad_norm": 0.3808746635913849, + "learning_rate": 0.00013269473219043658, + "loss": 1.4101, + "step": 25908 + }, + { + "epoch": 0.33667563831636194, + "grad_norm": 0.4875222444534302, + "learning_rate": 0.0001326921327285252, + "loss": 1.4948, + "step": 25909 + }, + { + "epoch": 0.3366886328602778, + "grad_norm": 0.38794437050819397, + "learning_rate": 0.00013268953326661383, + "loss": 1.5841, + "step": 25910 + }, + { + "epoch": 0.3367016274041937, + "grad_norm": 0.4314601421356201, + "learning_rate": 0.00013268693380470242, + "loss": 1.4745, + "step": 25911 + }, + { + "epoch": 0.3367146219481095, + "grad_norm": 0.37771472334861755, + "learning_rate": 0.00013268433434279105, + "loss": 1.4342, + "step": 25912 + }, + { + "epoch": 0.33672761649202543, + "grad_norm": 0.4475351572036743, + "learning_rate": 0.00013268173488087967, + "loss": 1.4324, + "step": 25913 + }, + { + "epoch": 0.3367406110359413, + "grad_norm": 0.4147765040397644, + "learning_rate": 0.00013267913541896827, + "loss": 1.4885, + "step": 25914 + }, + { + "epoch": 0.3367536055798572, + "grad_norm": 0.4396967887878418, + "learning_rate": 0.0001326765359570569, + "loss": 1.3747, + "step": 25915 + }, + { + "epoch": 0.336766600123773, + "grad_norm": 0.3707548975944519, + "learning_rate": 0.0001326739364951455, + "loss": 1.3314, + "step": 25916 + }, + { + "epoch": 0.3367795946676889, + "grad_norm": 0.4012846350669861, + "learning_rate": 0.00013267133703323414, + "loss": 1.3984, + "step": 25917 + }, + { + "epoch": 0.33679258921160476, + "grad_norm": 0.43612775206565857, + "learning_rate": 0.00013266873757132274, + "loss": 1.5301, + "step": 25918 + }, + { + "epoch": 0.33680558375552067, + "grad_norm": 0.41023901104927063, + "learning_rate": 0.00013266613810941137, + "loss": 1.519, + "step": 25919 + }, + { + "epoch": 0.3368185782994365, + "grad_norm": 0.3649330139160156, + "learning_rate": 0.00013266353864749996, + "loss": 1.2271, + "step": 25920 + }, + { + "epoch": 0.3368315728433524, + "grad_norm": 0.4352780282497406, + "learning_rate": 0.0001326609391855886, + "loss": 1.5554, + "step": 25921 + }, + { + "epoch": 0.33684456738726826, + "grad_norm": 0.4940381944179535, + "learning_rate": 0.0001326583397236772, + "loss": 1.4621, + "step": 25922 + }, + { + "epoch": 0.33685756193118416, + "grad_norm": 0.42648664116859436, + "learning_rate": 0.0001326557402617658, + "loss": 1.5415, + "step": 25923 + }, + { + "epoch": 0.3368705564751, + "grad_norm": 0.37579816579818726, + "learning_rate": 0.00013265314079985446, + "loss": 1.4407, + "step": 25924 + }, + { + "epoch": 0.3368835510190159, + "grad_norm": 0.36903703212738037, + "learning_rate": 0.00013265054133794306, + "loss": 1.3951, + "step": 25925 + }, + { + "epoch": 0.33689654556293175, + "grad_norm": 0.3801431357860565, + "learning_rate": 0.00013264794187603166, + "loss": 1.3851, + "step": 25926 + }, + { + "epoch": 0.33690954010684765, + "grad_norm": 0.48554742336273193, + "learning_rate": 0.00013264534241412028, + "loss": 1.4229, + "step": 25927 + }, + { + "epoch": 0.3369225346507635, + "grad_norm": 0.38802024722099304, + "learning_rate": 0.0001326427429522089, + "loss": 1.4832, + "step": 25928 + }, + { + "epoch": 0.3369355291946794, + "grad_norm": 0.49022141098976135, + "learning_rate": 0.00013264014349029753, + "loss": 1.4171, + "step": 25929 + }, + { + "epoch": 0.33694852373859524, + "grad_norm": 0.4355834126472473, + "learning_rate": 0.00013263754402838613, + "loss": 1.4349, + "step": 25930 + }, + { + "epoch": 0.33696151828251114, + "grad_norm": 0.34544193744659424, + "learning_rate": 0.00013263494456647475, + "loss": 1.1949, + "step": 25931 + }, + { + "epoch": 0.336974512826427, + "grad_norm": 0.4932114779949188, + "learning_rate": 0.00013263234510456338, + "loss": 1.5736, + "step": 25932 + }, + { + "epoch": 0.3369875073703429, + "grad_norm": 0.4376586675643921, + "learning_rate": 0.00013262974564265197, + "loss": 1.4379, + "step": 25933 + }, + { + "epoch": 0.33700050191425873, + "grad_norm": 0.4078996181488037, + "learning_rate": 0.0001326271461807406, + "loss": 1.4992, + "step": 25934 + }, + { + "epoch": 0.33701349645817463, + "grad_norm": 0.4634312689304352, + "learning_rate": 0.0001326245467188292, + "loss": 1.4847, + "step": 25935 + }, + { + "epoch": 0.3370264910020905, + "grad_norm": 0.34474724531173706, + "learning_rate": 0.00013262194725691785, + "loss": 1.1529, + "step": 25936 + }, + { + "epoch": 0.3370394855460064, + "grad_norm": 0.39053812623023987, + "learning_rate": 0.00013261934779500644, + "loss": 1.5433, + "step": 25937 + }, + { + "epoch": 0.3370524800899222, + "grad_norm": 0.38804084062576294, + "learning_rate": 0.00013261674833309504, + "loss": 1.3174, + "step": 25938 + }, + { + "epoch": 0.3370654746338381, + "grad_norm": 0.45724987983703613, + "learning_rate": 0.00013261414887118367, + "loss": 1.4434, + "step": 25939 + }, + { + "epoch": 0.33707846917775397, + "grad_norm": 0.34311631321907043, + "learning_rate": 0.0001326115494092723, + "loss": 1.1802, + "step": 25940 + }, + { + "epoch": 0.33709146372166987, + "grad_norm": 0.400392085313797, + "learning_rate": 0.00013260894994736091, + "loss": 1.3581, + "step": 25941 + }, + { + "epoch": 0.3371044582655857, + "grad_norm": 0.4552929103374481, + "learning_rate": 0.0001326063504854495, + "loss": 1.5644, + "step": 25942 + }, + { + "epoch": 0.3371174528095016, + "grad_norm": 0.40228772163391113, + "learning_rate": 0.00013260375102353814, + "loss": 1.3824, + "step": 25943 + }, + { + "epoch": 0.33713044735341746, + "grad_norm": 0.39098143577575684, + "learning_rate": 0.00013260115156162676, + "loss": 1.3173, + "step": 25944 + }, + { + "epoch": 0.33714344189733336, + "grad_norm": 0.3960305154323578, + "learning_rate": 0.00013259855209971536, + "loss": 1.3335, + "step": 25945 + }, + { + "epoch": 0.3371564364412492, + "grad_norm": 0.3813297152519226, + "learning_rate": 0.00013259595263780398, + "loss": 1.2998, + "step": 25946 + }, + { + "epoch": 0.3371694309851651, + "grad_norm": 0.46862417459487915, + "learning_rate": 0.00013259335317589258, + "loss": 1.4435, + "step": 25947 + }, + { + "epoch": 0.33718242552908095, + "grad_norm": 0.4491961598396301, + "learning_rate": 0.00013259075371398123, + "loss": 1.5086, + "step": 25948 + }, + { + "epoch": 0.33719542007299685, + "grad_norm": 0.4344651401042938, + "learning_rate": 0.00013258815425206983, + "loss": 1.347, + "step": 25949 + }, + { + "epoch": 0.3372084146169127, + "grad_norm": 0.5068522095680237, + "learning_rate": 0.00013258555479015843, + "loss": 1.5285, + "step": 25950 + }, + { + "epoch": 0.3372214091608286, + "grad_norm": 0.4222278594970703, + "learning_rate": 0.00013258295532824705, + "loss": 1.525, + "step": 25951 + }, + { + "epoch": 0.33723440370474445, + "grad_norm": 0.39252403378486633, + "learning_rate": 0.00013258035586633568, + "loss": 1.3428, + "step": 25952 + }, + { + "epoch": 0.33724739824866035, + "grad_norm": 0.46583855152130127, + "learning_rate": 0.0001325777564044243, + "loss": 1.6201, + "step": 25953 + }, + { + "epoch": 0.3372603927925762, + "grad_norm": 0.42092910408973694, + "learning_rate": 0.0001325751569425129, + "loss": 1.4626, + "step": 25954 + }, + { + "epoch": 0.3372733873364921, + "grad_norm": 0.3947404623031616, + "learning_rate": 0.00013257255748060152, + "loss": 1.5335, + "step": 25955 + }, + { + "epoch": 0.33728638188040794, + "grad_norm": 0.33180686831474304, + "learning_rate": 0.00013256995801869015, + "loss": 1.3432, + "step": 25956 + }, + { + "epoch": 0.33729937642432384, + "grad_norm": 0.43609657883644104, + "learning_rate": 0.00013256735855677874, + "loss": 1.5709, + "step": 25957 + }, + { + "epoch": 0.3373123709682397, + "grad_norm": 0.4034457206726074, + "learning_rate": 0.00013256475909486737, + "loss": 1.3492, + "step": 25958 + }, + { + "epoch": 0.3373253655121556, + "grad_norm": 0.41907432675361633, + "learning_rate": 0.00013256215963295597, + "loss": 1.4434, + "step": 25959 + }, + { + "epoch": 0.3373383600560715, + "grad_norm": 0.2986696660518646, + "learning_rate": 0.00013255956017104462, + "loss": 1.5661, + "step": 25960 + }, + { + "epoch": 0.33735135459998733, + "grad_norm": 0.39098668098449707, + "learning_rate": 0.00013255696070913321, + "loss": 1.4509, + "step": 25961 + }, + { + "epoch": 0.33736434914390323, + "grad_norm": 0.39820170402526855, + "learning_rate": 0.00013255436124722184, + "loss": 1.3919, + "step": 25962 + }, + { + "epoch": 0.3373773436878191, + "grad_norm": 0.49654802680015564, + "learning_rate": 0.00013255176178531046, + "loss": 1.3231, + "step": 25963 + }, + { + "epoch": 0.337390338231735, + "grad_norm": 0.3687657117843628, + "learning_rate": 0.00013254916232339906, + "loss": 1.5856, + "step": 25964 + }, + { + "epoch": 0.3374033327756508, + "grad_norm": 0.3904503583908081, + "learning_rate": 0.00013254656286148768, + "loss": 1.4621, + "step": 25965 + }, + { + "epoch": 0.3374163273195667, + "grad_norm": 0.4072244465351105, + "learning_rate": 0.00013254396339957628, + "loss": 1.351, + "step": 25966 + }, + { + "epoch": 0.33742932186348257, + "grad_norm": 0.40208739042282104, + "learning_rate": 0.0001325413639376649, + "loss": 1.3602, + "step": 25967 + }, + { + "epoch": 0.33744231640739847, + "grad_norm": 0.4271987974643707, + "learning_rate": 0.00013253876447575353, + "loss": 1.2959, + "step": 25968 + }, + { + "epoch": 0.3374553109513143, + "grad_norm": 0.555769145488739, + "learning_rate": 0.00013253616501384213, + "loss": 1.5875, + "step": 25969 + }, + { + "epoch": 0.3374683054952302, + "grad_norm": 0.39692923426628113, + "learning_rate": 0.00013253356555193075, + "loss": 1.3605, + "step": 25970 + }, + { + "epoch": 0.33748130003914606, + "grad_norm": 0.38616666197776794, + "learning_rate": 0.00013253096609001938, + "loss": 1.3348, + "step": 25971 + }, + { + "epoch": 0.33749429458306196, + "grad_norm": 0.3021883964538574, + "learning_rate": 0.000132528366628108, + "loss": 1.2672, + "step": 25972 + }, + { + "epoch": 0.3375072891269778, + "grad_norm": 0.40427741408348083, + "learning_rate": 0.0001325257671661966, + "loss": 1.2641, + "step": 25973 + }, + { + "epoch": 0.3375202836708937, + "grad_norm": 0.3402095437049866, + "learning_rate": 0.00013252316770428522, + "loss": 1.4119, + "step": 25974 + }, + { + "epoch": 0.33753327821480955, + "grad_norm": 0.43896281719207764, + "learning_rate": 0.00013252056824237385, + "loss": 1.3589, + "step": 25975 + }, + { + "epoch": 0.33754627275872545, + "grad_norm": 0.36701881885528564, + "learning_rate": 0.00013251796878046245, + "loss": 1.447, + "step": 25976 + }, + { + "epoch": 0.3375592673026413, + "grad_norm": 0.3691614270210266, + "learning_rate": 0.00013251536931855107, + "loss": 1.3894, + "step": 25977 + }, + { + "epoch": 0.3375722618465572, + "grad_norm": 0.44661641120910645, + "learning_rate": 0.00013251276985663967, + "loss": 1.2686, + "step": 25978 + }, + { + "epoch": 0.33758525639047304, + "grad_norm": 0.3732391893863678, + "learning_rate": 0.0001325101703947283, + "loss": 1.2704, + "step": 25979 + }, + { + "epoch": 0.33759825093438894, + "grad_norm": 0.2902577519416809, + "learning_rate": 0.00013250757093281692, + "loss": 1.34, + "step": 25980 + }, + { + "epoch": 0.3376112454783048, + "grad_norm": 0.36671948432922363, + "learning_rate": 0.00013250497147090551, + "loss": 1.3817, + "step": 25981 + }, + { + "epoch": 0.3376242400222207, + "grad_norm": 0.4579131305217743, + "learning_rate": 0.00013250237200899414, + "loss": 1.4356, + "step": 25982 + }, + { + "epoch": 0.33763723456613653, + "grad_norm": 0.3641127943992615, + "learning_rate": 0.00013249977254708276, + "loss": 1.4254, + "step": 25983 + }, + { + "epoch": 0.33765022911005244, + "grad_norm": 0.4204072654247284, + "learning_rate": 0.0001324971730851714, + "loss": 1.4322, + "step": 25984 + }, + { + "epoch": 0.3376632236539683, + "grad_norm": 0.5142878293991089, + "learning_rate": 0.00013249457362325998, + "loss": 1.4629, + "step": 25985 + }, + { + "epoch": 0.3376762181978842, + "grad_norm": 0.43719133734703064, + "learning_rate": 0.0001324919741613486, + "loss": 1.4805, + "step": 25986 + }, + { + "epoch": 0.3376892127418, + "grad_norm": 0.2883050739765167, + "learning_rate": 0.00013248937469943723, + "loss": 1.2557, + "step": 25987 + }, + { + "epoch": 0.3377022072857159, + "grad_norm": 0.3705081641674042, + "learning_rate": 0.00013248677523752583, + "loss": 1.5661, + "step": 25988 + }, + { + "epoch": 0.3377152018296318, + "grad_norm": 0.5233997106552124, + "learning_rate": 0.00013248417577561446, + "loss": 1.3378, + "step": 25989 + }, + { + "epoch": 0.3377281963735477, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.00013248157631370305, + "loss": 1.3971, + "step": 25990 + }, + { + "epoch": 0.3377411909174635, + "grad_norm": 0.4464869797229767, + "learning_rate": 0.0001324789768517917, + "loss": 1.2914, + "step": 25991 + }, + { + "epoch": 0.3377541854613794, + "grad_norm": 0.3482622504234314, + "learning_rate": 0.0001324763773898803, + "loss": 1.2218, + "step": 25992 + }, + { + "epoch": 0.33776718000529526, + "grad_norm": 0.455708771944046, + "learning_rate": 0.0001324737779279689, + "loss": 1.3833, + "step": 25993 + }, + { + "epoch": 0.33778017454921117, + "grad_norm": 0.37493157386779785, + "learning_rate": 0.00013247117846605752, + "loss": 1.3375, + "step": 25994 + }, + { + "epoch": 0.337793169093127, + "grad_norm": 0.4280388355255127, + "learning_rate": 0.00013246857900414615, + "loss": 1.3368, + "step": 25995 + }, + { + "epoch": 0.3378061636370429, + "grad_norm": 0.3672119379043579, + "learning_rate": 0.00013246597954223477, + "loss": 1.2575, + "step": 25996 + }, + { + "epoch": 0.33781915818095876, + "grad_norm": 0.30698785185813904, + "learning_rate": 0.00013246338008032337, + "loss": 1.2533, + "step": 25997 + }, + { + "epoch": 0.33783215272487466, + "grad_norm": 0.363212525844574, + "learning_rate": 0.000132460780618412, + "loss": 1.2954, + "step": 25998 + }, + { + "epoch": 0.3378451472687905, + "grad_norm": 0.3986384868621826, + "learning_rate": 0.00013245818115650062, + "loss": 1.4081, + "step": 25999 + }, + { + "epoch": 0.3378581418127064, + "grad_norm": 0.3540073037147522, + "learning_rate": 0.00013245558169458922, + "loss": 1.4012, + "step": 26000 + }, + { + "epoch": 0.33787113635662225, + "grad_norm": 0.3556385338306427, + "learning_rate": 0.00013245298223267784, + "loss": 1.3937, + "step": 26001 + }, + { + "epoch": 0.33788413090053815, + "grad_norm": 0.4650766849517822, + "learning_rate": 0.00013245038277076647, + "loss": 1.5194, + "step": 26002 + }, + { + "epoch": 0.337897125444454, + "grad_norm": 0.41923680901527405, + "learning_rate": 0.0001324477833088551, + "loss": 1.4581, + "step": 26003 + }, + { + "epoch": 0.3379101199883699, + "grad_norm": 0.369552880525589, + "learning_rate": 0.0001324451838469437, + "loss": 1.0854, + "step": 26004 + }, + { + "epoch": 0.33792311453228574, + "grad_norm": 0.37159621715545654, + "learning_rate": 0.00013244258438503228, + "loss": 1.5795, + "step": 26005 + }, + { + "epoch": 0.33793610907620164, + "grad_norm": 0.40106499195098877, + "learning_rate": 0.00013243998492312094, + "loss": 1.3117, + "step": 26006 + }, + { + "epoch": 0.3379491036201175, + "grad_norm": 0.43580904603004456, + "learning_rate": 0.00013243738546120953, + "loss": 1.5166, + "step": 26007 + }, + { + "epoch": 0.3379620981640334, + "grad_norm": 0.35146501660346985, + "learning_rate": 0.00013243478599929816, + "loss": 1.5207, + "step": 26008 + }, + { + "epoch": 0.33797509270794923, + "grad_norm": 0.47249868512153625, + "learning_rate": 0.00013243218653738676, + "loss": 1.4799, + "step": 26009 + }, + { + "epoch": 0.33798808725186513, + "grad_norm": 0.44152122735977173, + "learning_rate": 0.00013242958707547538, + "loss": 1.3723, + "step": 26010 + }, + { + "epoch": 0.338001081795781, + "grad_norm": 0.4457405209541321, + "learning_rate": 0.000132426987613564, + "loss": 1.3819, + "step": 26011 + }, + { + "epoch": 0.3380140763396969, + "grad_norm": 0.4266115128993988, + "learning_rate": 0.0001324243881516526, + "loss": 1.4162, + "step": 26012 + }, + { + "epoch": 0.3380270708836127, + "grad_norm": 0.34012266993522644, + "learning_rate": 0.00013242178868974123, + "loss": 1.2692, + "step": 26013 + }, + { + "epoch": 0.3380400654275286, + "grad_norm": 0.47643885016441345, + "learning_rate": 0.00013241918922782985, + "loss": 1.5577, + "step": 26014 + }, + { + "epoch": 0.33805305997144447, + "grad_norm": 0.42859864234924316, + "learning_rate": 0.00013241658976591848, + "loss": 1.3983, + "step": 26015 + }, + { + "epoch": 0.33806605451536037, + "grad_norm": 0.32130736112594604, + "learning_rate": 0.00013241399030400707, + "loss": 1.2395, + "step": 26016 + }, + { + "epoch": 0.3380790490592762, + "grad_norm": 0.37041670083999634, + "learning_rate": 0.0001324113908420957, + "loss": 1.3293, + "step": 26017 + }, + { + "epoch": 0.3380920436031921, + "grad_norm": 0.3703097701072693, + "learning_rate": 0.00013240879138018432, + "loss": 1.1806, + "step": 26018 + }, + { + "epoch": 0.33810503814710796, + "grad_norm": 0.4163018763065338, + "learning_rate": 0.00013240619191827292, + "loss": 1.5549, + "step": 26019 + }, + { + "epoch": 0.33811803269102386, + "grad_norm": 0.3582121729850769, + "learning_rate": 0.00013240359245636154, + "loss": 1.3888, + "step": 26020 + }, + { + "epoch": 0.3381310272349397, + "grad_norm": 0.31785061955451965, + "learning_rate": 0.00013240099299445014, + "loss": 1.2873, + "step": 26021 + }, + { + "epoch": 0.3381440217788556, + "grad_norm": 0.3992772400379181, + "learning_rate": 0.00013239839353253877, + "loss": 1.2759, + "step": 26022 + }, + { + "epoch": 0.33815701632277145, + "grad_norm": 0.29709020256996155, + "learning_rate": 0.0001323957940706274, + "loss": 1.2396, + "step": 26023 + }, + { + "epoch": 0.33817001086668735, + "grad_norm": 0.3581277132034302, + "learning_rate": 0.000132393194608716, + "loss": 1.3635, + "step": 26024 + }, + { + "epoch": 0.3381830054106032, + "grad_norm": 0.43390825390815735, + "learning_rate": 0.0001323905951468046, + "loss": 1.3634, + "step": 26025 + }, + { + "epoch": 0.3381959999545191, + "grad_norm": 0.33940964937210083, + "learning_rate": 0.00013238799568489324, + "loss": 1.6098, + "step": 26026 + }, + { + "epoch": 0.33820899449843494, + "grad_norm": 0.3332218527793884, + "learning_rate": 0.00013238539622298186, + "loss": 1.388, + "step": 26027 + }, + { + "epoch": 0.33822198904235085, + "grad_norm": 0.3670242130756378, + "learning_rate": 0.00013238279676107046, + "loss": 1.3298, + "step": 26028 + }, + { + "epoch": 0.3382349835862667, + "grad_norm": 0.37164583802223206, + "learning_rate": 0.00013238019729915908, + "loss": 1.3288, + "step": 26029 + }, + { + "epoch": 0.3382479781301826, + "grad_norm": 0.44527381658554077, + "learning_rate": 0.0001323775978372477, + "loss": 1.3691, + "step": 26030 + }, + { + "epoch": 0.33826097267409844, + "grad_norm": 0.408902645111084, + "learning_rate": 0.0001323749983753363, + "loss": 1.3029, + "step": 26031 + }, + { + "epoch": 0.33827396721801434, + "grad_norm": 0.3251926004886627, + "learning_rate": 0.00013237239891342493, + "loss": 1.3576, + "step": 26032 + }, + { + "epoch": 0.3382869617619302, + "grad_norm": 0.4064948558807373, + "learning_rate": 0.00013236979945151353, + "loss": 1.5685, + "step": 26033 + }, + { + "epoch": 0.3382999563058461, + "grad_norm": 0.39309945702552795, + "learning_rate": 0.00013236719998960215, + "loss": 1.3101, + "step": 26034 + }, + { + "epoch": 0.33831295084976193, + "grad_norm": 0.3306905925273895, + "learning_rate": 0.00013236460052769078, + "loss": 1.5342, + "step": 26035 + }, + { + "epoch": 0.33832594539367783, + "grad_norm": 0.26769521832466125, + "learning_rate": 0.00013236200106577937, + "loss": 1.2381, + "step": 26036 + }, + { + "epoch": 0.33833893993759373, + "grad_norm": 0.38197287917137146, + "learning_rate": 0.00013235940160386802, + "loss": 1.3308, + "step": 26037 + }, + { + "epoch": 0.3383519344815096, + "grad_norm": 0.320083886384964, + "learning_rate": 0.00013235680214195662, + "loss": 1.4075, + "step": 26038 + }, + { + "epoch": 0.3383649290254255, + "grad_norm": 0.4613439738750458, + "learning_rate": 0.00013235420268004525, + "loss": 1.2781, + "step": 26039 + }, + { + "epoch": 0.3383779235693413, + "grad_norm": 0.47412562370300293, + "learning_rate": 0.00013235160321813384, + "loss": 1.4591, + "step": 26040 + }, + { + "epoch": 0.3383909181132572, + "grad_norm": 0.44963017106056213, + "learning_rate": 0.00013234900375622247, + "loss": 1.3288, + "step": 26041 + }, + { + "epoch": 0.33840391265717307, + "grad_norm": 0.40758216381073, + "learning_rate": 0.0001323464042943111, + "loss": 1.3454, + "step": 26042 + }, + { + "epoch": 0.33841690720108897, + "grad_norm": 0.3422310948371887, + "learning_rate": 0.0001323438048323997, + "loss": 1.421, + "step": 26043 + }, + { + "epoch": 0.3384299017450048, + "grad_norm": 0.45112258195877075, + "learning_rate": 0.00013234120537048831, + "loss": 1.4576, + "step": 26044 + }, + { + "epoch": 0.3384428962889207, + "grad_norm": 0.3183678090572357, + "learning_rate": 0.00013233860590857694, + "loss": 1.2271, + "step": 26045 + }, + { + "epoch": 0.33845589083283656, + "grad_norm": 0.34023067355155945, + "learning_rate": 0.00013233600644666556, + "loss": 1.5048, + "step": 26046 + }, + { + "epoch": 0.33846888537675246, + "grad_norm": 0.3973219394683838, + "learning_rate": 0.00013233340698475416, + "loss": 1.4479, + "step": 26047 + }, + { + "epoch": 0.3384818799206683, + "grad_norm": 0.37197497487068176, + "learning_rate": 0.00013233080752284276, + "loss": 1.6001, + "step": 26048 + }, + { + "epoch": 0.3384948744645842, + "grad_norm": 0.38511350750923157, + "learning_rate": 0.0001323282080609314, + "loss": 1.133, + "step": 26049 + }, + { + "epoch": 0.33850786900850005, + "grad_norm": 0.46855947375297546, + "learning_rate": 0.00013232560859902, + "loss": 1.548, + "step": 26050 + }, + { + "epoch": 0.33852086355241595, + "grad_norm": 0.344058632850647, + "learning_rate": 0.00013232300913710863, + "loss": 1.2832, + "step": 26051 + }, + { + "epoch": 0.3385338580963318, + "grad_norm": 0.48236799240112305, + "learning_rate": 0.00013232040967519723, + "loss": 1.477, + "step": 26052 + }, + { + "epoch": 0.3385468526402477, + "grad_norm": 0.3212074339389801, + "learning_rate": 0.00013231781021328585, + "loss": 1.1441, + "step": 26053 + }, + { + "epoch": 0.33855984718416354, + "grad_norm": 0.31936028599739075, + "learning_rate": 0.00013231521075137448, + "loss": 1.5827, + "step": 26054 + }, + { + "epoch": 0.33857284172807944, + "grad_norm": 0.4629895091056824, + "learning_rate": 0.00013231261128946308, + "loss": 1.352, + "step": 26055 + }, + { + "epoch": 0.3385858362719953, + "grad_norm": 0.43505293130874634, + "learning_rate": 0.0001323100118275517, + "loss": 1.4927, + "step": 26056 + }, + { + "epoch": 0.3385988308159112, + "grad_norm": 0.32325783371925354, + "learning_rate": 0.00013230741236564032, + "loss": 1.3873, + "step": 26057 + }, + { + "epoch": 0.33861182535982703, + "grad_norm": 0.3562498986721039, + "learning_rate": 0.00013230481290372895, + "loss": 1.3323, + "step": 26058 + }, + { + "epoch": 0.33862481990374294, + "grad_norm": 0.3656165897846222, + "learning_rate": 0.00013230221344181755, + "loss": 1.718, + "step": 26059 + }, + { + "epoch": 0.3386378144476588, + "grad_norm": 0.43518126010894775, + "learning_rate": 0.00013229961397990614, + "loss": 1.3138, + "step": 26060 + }, + { + "epoch": 0.3386508089915747, + "grad_norm": 0.3630739748477936, + "learning_rate": 0.0001322970145179948, + "loss": 1.2582, + "step": 26061 + }, + { + "epoch": 0.3386638035354905, + "grad_norm": 0.35151228308677673, + "learning_rate": 0.0001322944150560834, + "loss": 1.4146, + "step": 26062 + }, + { + "epoch": 0.3386767980794064, + "grad_norm": 0.47331181168556213, + "learning_rate": 0.00013229181559417202, + "loss": 1.2425, + "step": 26063 + }, + { + "epoch": 0.33868979262332227, + "grad_norm": 0.47240084409713745, + "learning_rate": 0.00013228921613226061, + "loss": 1.3923, + "step": 26064 + }, + { + "epoch": 0.3387027871672382, + "grad_norm": 0.35691216588020325, + "learning_rate": 0.00013228661667034924, + "loss": 1.402, + "step": 26065 + }, + { + "epoch": 0.338715781711154, + "grad_norm": 0.4560912251472473, + "learning_rate": 0.00013228401720843786, + "loss": 1.3788, + "step": 26066 + }, + { + "epoch": 0.3387287762550699, + "grad_norm": 0.46608152985572815, + "learning_rate": 0.00013228141774652646, + "loss": 1.6213, + "step": 26067 + }, + { + "epoch": 0.33874177079898576, + "grad_norm": 0.40156424045562744, + "learning_rate": 0.00013227881828461509, + "loss": 1.3351, + "step": 26068 + }, + { + "epoch": 0.33875476534290166, + "grad_norm": 0.42579886317253113, + "learning_rate": 0.0001322762188227037, + "loss": 1.3216, + "step": 26069 + }, + { + "epoch": 0.3387677598868175, + "grad_norm": 0.42317482829093933, + "learning_rate": 0.00013227361936079233, + "loss": 1.3659, + "step": 26070 + }, + { + "epoch": 0.3387807544307334, + "grad_norm": 0.4158824682235718, + "learning_rate": 0.00013227101989888093, + "loss": 1.5008, + "step": 26071 + }, + { + "epoch": 0.33879374897464926, + "grad_norm": 0.4330485463142395, + "learning_rate": 0.00013226842043696956, + "loss": 1.4683, + "step": 26072 + }, + { + "epoch": 0.33880674351856516, + "grad_norm": 0.47141727805137634, + "learning_rate": 0.00013226582097505818, + "loss": 1.4663, + "step": 26073 + }, + { + "epoch": 0.338819738062481, + "grad_norm": 0.3531055450439453, + "learning_rate": 0.00013226322151314678, + "loss": 1.3336, + "step": 26074 + }, + { + "epoch": 0.3388327326063969, + "grad_norm": 0.39789867401123047, + "learning_rate": 0.0001322606220512354, + "loss": 1.3242, + "step": 26075 + }, + { + "epoch": 0.33884572715031275, + "grad_norm": 0.4301973581314087, + "learning_rate": 0.00013225802258932403, + "loss": 1.2429, + "step": 26076 + }, + { + "epoch": 0.33885872169422865, + "grad_norm": 0.34061765670776367, + "learning_rate": 0.00013225542312741262, + "loss": 1.3111, + "step": 26077 + }, + { + "epoch": 0.3388717162381445, + "grad_norm": 0.3037078380584717, + "learning_rate": 0.00013225282366550125, + "loss": 1.2519, + "step": 26078 + }, + { + "epoch": 0.3388847107820604, + "grad_norm": 0.38923218846321106, + "learning_rate": 0.00013225022420358985, + "loss": 1.3227, + "step": 26079 + }, + { + "epoch": 0.33889770532597624, + "grad_norm": 0.33996447920799255, + "learning_rate": 0.0001322476247416785, + "loss": 1.2638, + "step": 26080 + }, + { + "epoch": 0.33891069986989214, + "grad_norm": 0.35176604986190796, + "learning_rate": 0.0001322450252797671, + "loss": 1.4445, + "step": 26081 + }, + { + "epoch": 0.338923694413808, + "grad_norm": 0.3652324974536896, + "learning_rate": 0.00013224242581785572, + "loss": 1.4231, + "step": 26082 + }, + { + "epoch": 0.3389366889577239, + "grad_norm": 0.4295262396335602, + "learning_rate": 0.00013223982635594432, + "loss": 1.4937, + "step": 26083 + }, + { + "epoch": 0.33894968350163973, + "grad_norm": 0.5046804547309875, + "learning_rate": 0.00013223722689403294, + "loss": 1.4346, + "step": 26084 + }, + { + "epoch": 0.33896267804555563, + "grad_norm": 0.5072299242019653, + "learning_rate": 0.00013223462743212157, + "loss": 1.4525, + "step": 26085 + }, + { + "epoch": 0.3389756725894715, + "grad_norm": 1.0560909509658813, + "learning_rate": 0.00013223202797021016, + "loss": 1.522, + "step": 26086 + }, + { + "epoch": 0.3389886671333874, + "grad_norm": 0.445686936378479, + "learning_rate": 0.0001322294285082988, + "loss": 1.3364, + "step": 26087 + }, + { + "epoch": 0.3390016616773032, + "grad_norm": 0.40685030817985535, + "learning_rate": 0.0001322268290463874, + "loss": 1.3276, + "step": 26088 + }, + { + "epoch": 0.3390146562212191, + "grad_norm": 0.366913378238678, + "learning_rate": 0.000132224229584476, + "loss": 1.1726, + "step": 26089 + }, + { + "epoch": 0.33902765076513497, + "grad_norm": 0.40471911430358887, + "learning_rate": 0.00013222163012256463, + "loss": 1.4212, + "step": 26090 + }, + { + "epoch": 0.33904064530905087, + "grad_norm": 0.4582507312297821, + "learning_rate": 0.00013221903066065323, + "loss": 1.4037, + "step": 26091 + }, + { + "epoch": 0.3390536398529667, + "grad_norm": 0.3212548792362213, + "learning_rate": 0.00013221643119874188, + "loss": 1.339, + "step": 26092 + }, + { + "epoch": 0.3390666343968826, + "grad_norm": 0.3519456088542938, + "learning_rate": 0.00013221383173683048, + "loss": 1.4781, + "step": 26093 + }, + { + "epoch": 0.33907962894079846, + "grad_norm": 0.31467771530151367, + "learning_rate": 0.0001322112322749191, + "loss": 1.1384, + "step": 26094 + }, + { + "epoch": 0.33909262348471436, + "grad_norm": 0.5373119711875916, + "learning_rate": 0.0001322086328130077, + "loss": 1.3816, + "step": 26095 + }, + { + "epoch": 0.3391056180286302, + "grad_norm": 0.3905687928199768, + "learning_rate": 0.00013220603335109633, + "loss": 1.3385, + "step": 26096 + }, + { + "epoch": 0.3391186125725461, + "grad_norm": 0.43328219652175903, + "learning_rate": 0.00013220343388918495, + "loss": 1.3244, + "step": 26097 + }, + { + "epoch": 0.33913160711646195, + "grad_norm": 0.5360528826713562, + "learning_rate": 0.00013220083442727355, + "loss": 1.4831, + "step": 26098 + }, + { + "epoch": 0.33914460166037785, + "grad_norm": 0.390375018119812, + "learning_rate": 0.00013219823496536217, + "loss": 1.3543, + "step": 26099 + }, + { + "epoch": 0.3391575962042937, + "grad_norm": 0.5416364669799805, + "learning_rate": 0.0001321956355034508, + "loss": 1.4785, + "step": 26100 + }, + { + "epoch": 0.3391705907482096, + "grad_norm": 0.45381808280944824, + "learning_rate": 0.00013219303604153942, + "loss": 1.6316, + "step": 26101 + }, + { + "epoch": 0.33918358529212544, + "grad_norm": 0.24971823394298553, + "learning_rate": 0.00013219043657962802, + "loss": 1.2042, + "step": 26102 + }, + { + "epoch": 0.33919657983604135, + "grad_norm": 0.35749295353889465, + "learning_rate": 0.00013218783711771662, + "loss": 1.3901, + "step": 26103 + }, + { + "epoch": 0.3392095743799572, + "grad_norm": 0.4308362901210785, + "learning_rate": 0.00013218523765580527, + "loss": 1.5455, + "step": 26104 + }, + { + "epoch": 0.3392225689238731, + "grad_norm": 0.48581263422966003, + "learning_rate": 0.00013218263819389387, + "loss": 1.4545, + "step": 26105 + }, + { + "epoch": 0.33923556346778894, + "grad_norm": 0.4493102431297302, + "learning_rate": 0.0001321800387319825, + "loss": 1.3151, + "step": 26106 + }, + { + "epoch": 0.33924855801170484, + "grad_norm": 0.4331951439380646, + "learning_rate": 0.0001321774392700711, + "loss": 1.6084, + "step": 26107 + }, + { + "epoch": 0.3392615525556207, + "grad_norm": 0.3062371015548706, + "learning_rate": 0.0001321748398081597, + "loss": 1.4859, + "step": 26108 + }, + { + "epoch": 0.3392745470995366, + "grad_norm": 0.4729411005973816, + "learning_rate": 0.00013217224034624834, + "loss": 1.5745, + "step": 26109 + }, + { + "epoch": 0.33928754164345243, + "grad_norm": 0.4998309910297394, + "learning_rate": 0.00013216964088433693, + "loss": 1.6605, + "step": 26110 + }, + { + "epoch": 0.33930053618736833, + "grad_norm": 0.5050880312919617, + "learning_rate": 0.00013216704142242559, + "loss": 1.4067, + "step": 26111 + }, + { + "epoch": 0.33931353073128423, + "grad_norm": 0.4295085668563843, + "learning_rate": 0.00013216444196051418, + "loss": 1.4078, + "step": 26112 + }, + { + "epoch": 0.3393265252752001, + "grad_norm": 0.3796772360801697, + "learning_rate": 0.0001321618424986028, + "loss": 1.2763, + "step": 26113 + }, + { + "epoch": 0.339339519819116, + "grad_norm": 0.4142545461654663, + "learning_rate": 0.0001321592430366914, + "loss": 1.2981, + "step": 26114 + }, + { + "epoch": 0.3393525143630318, + "grad_norm": 0.3426409363746643, + "learning_rate": 0.00013215664357478003, + "loss": 1.3433, + "step": 26115 + }, + { + "epoch": 0.3393655089069477, + "grad_norm": 0.3506077229976654, + "learning_rate": 0.00013215404411286865, + "loss": 1.4286, + "step": 26116 + }, + { + "epoch": 0.33937850345086357, + "grad_norm": 0.3806890547275543, + "learning_rate": 0.00013215144465095725, + "loss": 1.4377, + "step": 26117 + }, + { + "epoch": 0.33939149799477947, + "grad_norm": 0.3685814142227173, + "learning_rate": 0.00013214884518904588, + "loss": 1.3635, + "step": 26118 + }, + { + "epoch": 0.3394044925386953, + "grad_norm": 0.4511130750179291, + "learning_rate": 0.0001321462457271345, + "loss": 1.5876, + "step": 26119 + }, + { + "epoch": 0.3394174870826112, + "grad_norm": 0.49052268266677856, + "learning_rate": 0.0001321436462652231, + "loss": 1.599, + "step": 26120 + }, + { + "epoch": 0.33943048162652706, + "grad_norm": 0.37976324558258057, + "learning_rate": 0.00013214104680331172, + "loss": 1.3536, + "step": 26121 + }, + { + "epoch": 0.33944347617044296, + "grad_norm": 0.3351363241672516, + "learning_rate": 0.00013213844734140032, + "loss": 1.3554, + "step": 26122 + }, + { + "epoch": 0.3394564707143588, + "grad_norm": 0.38419032096862793, + "learning_rate": 0.00013213584787948897, + "loss": 1.3478, + "step": 26123 + }, + { + "epoch": 0.3394694652582747, + "grad_norm": 0.3367789387702942, + "learning_rate": 0.00013213324841757757, + "loss": 1.2381, + "step": 26124 + }, + { + "epoch": 0.33948245980219055, + "grad_norm": 0.4104432165622711, + "learning_rate": 0.0001321306489556662, + "loss": 1.366, + "step": 26125 + }, + { + "epoch": 0.33949545434610645, + "grad_norm": 0.3378024101257324, + "learning_rate": 0.0001321280494937548, + "loss": 1.2678, + "step": 26126 + }, + { + "epoch": 0.3395084488900223, + "grad_norm": 0.3339763581752777, + "learning_rate": 0.00013212545003184341, + "loss": 1.4026, + "step": 26127 + }, + { + "epoch": 0.3395214434339382, + "grad_norm": 0.43284308910369873, + "learning_rate": 0.00013212285056993204, + "loss": 1.4484, + "step": 26128 + }, + { + "epoch": 0.33953443797785404, + "grad_norm": 0.3656160235404968, + "learning_rate": 0.00013212025110802064, + "loss": 1.4621, + "step": 26129 + }, + { + "epoch": 0.33954743252176994, + "grad_norm": 0.5169016122817993, + "learning_rate": 0.00013211765164610926, + "loss": 1.2912, + "step": 26130 + }, + { + "epoch": 0.3395604270656858, + "grad_norm": 0.385970801115036, + "learning_rate": 0.00013211505218419789, + "loss": 1.3042, + "step": 26131 + }, + { + "epoch": 0.3395734216096017, + "grad_norm": 0.43052437901496887, + "learning_rate": 0.00013211245272228648, + "loss": 1.5935, + "step": 26132 + }, + { + "epoch": 0.33958641615351753, + "grad_norm": 0.4337923526763916, + "learning_rate": 0.0001321098532603751, + "loss": 1.2833, + "step": 26133 + }, + { + "epoch": 0.33959941069743343, + "grad_norm": 0.3738989531993866, + "learning_rate": 0.0001321072537984637, + "loss": 1.4643, + "step": 26134 + }, + { + "epoch": 0.3396124052413493, + "grad_norm": 0.4272087514400482, + "learning_rate": 0.00013210465433655236, + "loss": 1.569, + "step": 26135 + }, + { + "epoch": 0.3396253997852652, + "grad_norm": 0.40140500664711, + "learning_rate": 0.00013210205487464095, + "loss": 1.2109, + "step": 26136 + }, + { + "epoch": 0.339638394329181, + "grad_norm": 0.6054965257644653, + "learning_rate": 0.00013209945541272958, + "loss": 1.4751, + "step": 26137 + }, + { + "epoch": 0.3396513888730969, + "grad_norm": 0.29412147402763367, + "learning_rate": 0.00013209685595081818, + "loss": 1.33, + "step": 26138 + }, + { + "epoch": 0.33966438341701277, + "grad_norm": 0.3866554796695709, + "learning_rate": 0.0001320942564889068, + "loss": 1.5938, + "step": 26139 + }, + { + "epoch": 0.3396773779609287, + "grad_norm": 0.35024407505989075, + "learning_rate": 0.00013209165702699542, + "loss": 1.2907, + "step": 26140 + }, + { + "epoch": 0.3396903725048445, + "grad_norm": 0.48450350761413574, + "learning_rate": 0.00013208905756508402, + "loss": 1.5128, + "step": 26141 + }, + { + "epoch": 0.3397033670487604, + "grad_norm": 0.3828144073486328, + "learning_rate": 0.00013208645810317265, + "loss": 1.343, + "step": 26142 + }, + { + "epoch": 0.33971636159267626, + "grad_norm": 0.4006251096725464, + "learning_rate": 0.00013208385864126127, + "loss": 1.3495, + "step": 26143 + }, + { + "epoch": 0.33972935613659216, + "grad_norm": 0.3299696445465088, + "learning_rate": 0.00013208125917934987, + "loss": 1.3069, + "step": 26144 + }, + { + "epoch": 0.339742350680508, + "grad_norm": 0.5259748697280884, + "learning_rate": 0.0001320786597174385, + "loss": 1.3785, + "step": 26145 + }, + { + "epoch": 0.3397553452244239, + "grad_norm": 0.32864344120025635, + "learning_rate": 0.00013207606025552712, + "loss": 1.2414, + "step": 26146 + }, + { + "epoch": 0.33976833976833976, + "grad_norm": 0.24723125994205475, + "learning_rate": 0.00013207346079361574, + "loss": 1.2345, + "step": 26147 + }, + { + "epoch": 0.33978133431225566, + "grad_norm": 0.38176044821739197, + "learning_rate": 0.00013207086133170434, + "loss": 1.4247, + "step": 26148 + }, + { + "epoch": 0.3397943288561715, + "grad_norm": 0.22277477383613586, + "learning_rate": 0.00013206826186979296, + "loss": 1.3702, + "step": 26149 + }, + { + "epoch": 0.3398073234000874, + "grad_norm": 0.45568305253982544, + "learning_rate": 0.0001320656624078816, + "loss": 1.3779, + "step": 26150 + }, + { + "epoch": 0.33982031794400325, + "grad_norm": 0.37732306122779846, + "learning_rate": 0.00013206306294597019, + "loss": 1.3485, + "step": 26151 + }, + { + "epoch": 0.33983331248791915, + "grad_norm": 0.48806843161582947, + "learning_rate": 0.0001320604634840588, + "loss": 1.4271, + "step": 26152 + }, + { + "epoch": 0.339846307031835, + "grad_norm": 0.3434215486049652, + "learning_rate": 0.0001320578640221474, + "loss": 1.3304, + "step": 26153 + }, + { + "epoch": 0.3398593015757509, + "grad_norm": 0.3597976267337799, + "learning_rate": 0.00013205526456023606, + "loss": 1.4033, + "step": 26154 + }, + { + "epoch": 0.33987229611966674, + "grad_norm": 0.3724515736103058, + "learning_rate": 0.00013205266509832466, + "loss": 1.3631, + "step": 26155 + }, + { + "epoch": 0.33988529066358264, + "grad_norm": 0.3474212884902954, + "learning_rate": 0.00013205006563641325, + "loss": 1.3655, + "step": 26156 + }, + { + "epoch": 0.3398982852074985, + "grad_norm": 0.3434038460254669, + "learning_rate": 0.00013204746617450188, + "loss": 1.4879, + "step": 26157 + }, + { + "epoch": 0.3399112797514144, + "grad_norm": 0.362981379032135, + "learning_rate": 0.0001320448667125905, + "loss": 1.2338, + "step": 26158 + }, + { + "epoch": 0.33992427429533023, + "grad_norm": 0.37427300214767456, + "learning_rate": 0.00013204226725067913, + "loss": 1.4734, + "step": 26159 + }, + { + "epoch": 0.33993726883924613, + "grad_norm": 0.43690022826194763, + "learning_rate": 0.00013203966778876772, + "loss": 1.3909, + "step": 26160 + }, + { + "epoch": 0.339950263383162, + "grad_norm": 0.3793913424015045, + "learning_rate": 0.00013203706832685635, + "loss": 1.4261, + "step": 26161 + }, + { + "epoch": 0.3399632579270779, + "grad_norm": 0.39530012011528015, + "learning_rate": 0.00013203446886494497, + "loss": 1.3707, + "step": 26162 + }, + { + "epoch": 0.3399762524709937, + "grad_norm": 0.45010656118392944, + "learning_rate": 0.00013203186940303357, + "loss": 1.4455, + "step": 26163 + }, + { + "epoch": 0.3399892470149096, + "grad_norm": 0.3283880650997162, + "learning_rate": 0.0001320292699411222, + "loss": 1.5056, + "step": 26164 + }, + { + "epoch": 0.34000224155882547, + "grad_norm": 0.3500770032405853, + "learning_rate": 0.0001320266704792108, + "loss": 1.2825, + "step": 26165 + }, + { + "epoch": 0.34001523610274137, + "grad_norm": 0.4843919575214386, + "learning_rate": 0.00013202407101729944, + "loss": 1.3489, + "step": 26166 + }, + { + "epoch": 0.3400282306466572, + "grad_norm": 0.39160025119781494, + "learning_rate": 0.00013202147155538804, + "loss": 1.3675, + "step": 26167 + }, + { + "epoch": 0.3400412251905731, + "grad_norm": 0.4595074951648712, + "learning_rate": 0.00013201887209347667, + "loss": 1.4157, + "step": 26168 + }, + { + "epoch": 0.34005421973448896, + "grad_norm": 0.3389313220977783, + "learning_rate": 0.00013201627263156526, + "loss": 1.4829, + "step": 26169 + }, + { + "epoch": 0.34006721427840486, + "grad_norm": 0.4173663258552551, + "learning_rate": 0.0001320136731696539, + "loss": 1.4554, + "step": 26170 + }, + { + "epoch": 0.3400802088223207, + "grad_norm": 0.43081143498420715, + "learning_rate": 0.0001320110737077425, + "loss": 1.25, + "step": 26171 + }, + { + "epoch": 0.3400932033662366, + "grad_norm": 0.42154356837272644, + "learning_rate": 0.0001320084742458311, + "loss": 1.3325, + "step": 26172 + }, + { + "epoch": 0.34010619791015245, + "grad_norm": 0.4978295564651489, + "learning_rate": 0.00013200587478391973, + "loss": 1.4311, + "step": 26173 + }, + { + "epoch": 0.34011919245406835, + "grad_norm": 0.3951527178287506, + "learning_rate": 0.00013200327532200836, + "loss": 1.317, + "step": 26174 + }, + { + "epoch": 0.3401321869979842, + "grad_norm": 0.35416314005851746, + "learning_rate": 0.00013200067586009696, + "loss": 1.4091, + "step": 26175 + }, + { + "epoch": 0.3401451815419001, + "grad_norm": 0.4264552891254425, + "learning_rate": 0.00013199807639818558, + "loss": 1.3756, + "step": 26176 + }, + { + "epoch": 0.34015817608581594, + "grad_norm": 0.4598437547683716, + "learning_rate": 0.00013199547693627418, + "loss": 1.2823, + "step": 26177 + }, + { + "epoch": 0.34017117062973184, + "grad_norm": 0.45161259174346924, + "learning_rate": 0.00013199287747436283, + "loss": 1.6098, + "step": 26178 + }, + { + "epoch": 0.3401841651736477, + "grad_norm": 0.3187314569950104, + "learning_rate": 0.00013199027801245143, + "loss": 1.4541, + "step": 26179 + }, + { + "epoch": 0.3401971597175636, + "grad_norm": 0.3738307058811188, + "learning_rate": 0.00013198767855054005, + "loss": 1.2449, + "step": 26180 + }, + { + "epoch": 0.34021015426147944, + "grad_norm": 0.37954947352409363, + "learning_rate": 0.00013198507908862865, + "loss": 1.4872, + "step": 26181 + }, + { + "epoch": 0.34022314880539534, + "grad_norm": 0.3951531648635864, + "learning_rate": 0.00013198247962671727, + "loss": 1.4865, + "step": 26182 + }, + { + "epoch": 0.3402361433493112, + "grad_norm": 0.38394370675086975, + "learning_rate": 0.0001319798801648059, + "loss": 1.4791, + "step": 26183 + }, + { + "epoch": 0.3402491378932271, + "grad_norm": 0.366731733083725, + "learning_rate": 0.0001319772807028945, + "loss": 1.4386, + "step": 26184 + }, + { + "epoch": 0.34026213243714293, + "grad_norm": 0.3587181568145752, + "learning_rate": 0.00013197468124098315, + "loss": 1.4507, + "step": 26185 + }, + { + "epoch": 0.34027512698105883, + "grad_norm": 0.41163700819015503, + "learning_rate": 0.00013197208177907174, + "loss": 1.3388, + "step": 26186 + }, + { + "epoch": 0.3402881215249747, + "grad_norm": 0.4417600929737091, + "learning_rate": 0.00013196948231716034, + "loss": 1.3494, + "step": 26187 + }, + { + "epoch": 0.3403011160688906, + "grad_norm": 0.4449005126953125, + "learning_rate": 0.00013196688285524897, + "loss": 1.5053, + "step": 26188 + }, + { + "epoch": 0.3403141106128065, + "grad_norm": 0.34276872873306274, + "learning_rate": 0.0001319642833933376, + "loss": 1.3984, + "step": 26189 + }, + { + "epoch": 0.3403271051567223, + "grad_norm": 0.3789070248603821, + "learning_rate": 0.00013196168393142622, + "loss": 1.2547, + "step": 26190 + }, + { + "epoch": 0.3403400997006382, + "grad_norm": 0.4343651533126831, + "learning_rate": 0.0001319590844695148, + "loss": 1.4138, + "step": 26191 + }, + { + "epoch": 0.34035309424455407, + "grad_norm": 0.3517058491706848, + "learning_rate": 0.00013195648500760344, + "loss": 1.6342, + "step": 26192 + }, + { + "epoch": 0.34036608878846997, + "grad_norm": 0.3006521761417389, + "learning_rate": 0.00013195388554569206, + "loss": 1.4442, + "step": 26193 + }, + { + "epoch": 0.3403790833323858, + "grad_norm": 0.430831640958786, + "learning_rate": 0.00013195128608378066, + "loss": 1.4491, + "step": 26194 + }, + { + "epoch": 0.3403920778763017, + "grad_norm": 0.3935178816318512, + "learning_rate": 0.00013194868662186928, + "loss": 1.4217, + "step": 26195 + }, + { + "epoch": 0.34040507242021756, + "grad_norm": 0.3533515930175781, + "learning_rate": 0.00013194608715995788, + "loss": 1.41, + "step": 26196 + }, + { + "epoch": 0.34041806696413346, + "grad_norm": 0.43116259574890137, + "learning_rate": 0.00013194348769804653, + "loss": 1.4256, + "step": 26197 + }, + { + "epoch": 0.3404310615080493, + "grad_norm": 0.37773051857948303, + "learning_rate": 0.00013194088823613513, + "loss": 1.3644, + "step": 26198 + }, + { + "epoch": 0.3404440560519652, + "grad_norm": 0.4361073672771454, + "learning_rate": 0.00013193828877422373, + "loss": 1.5483, + "step": 26199 + }, + { + "epoch": 0.34045705059588105, + "grad_norm": 0.365222305059433, + "learning_rate": 0.00013193568931231235, + "loss": 1.5489, + "step": 26200 + }, + { + "epoch": 0.34047004513979695, + "grad_norm": 0.3768000602722168, + "learning_rate": 0.00013193308985040098, + "loss": 1.3091, + "step": 26201 + }, + { + "epoch": 0.3404830396837128, + "grad_norm": 0.3791691064834595, + "learning_rate": 0.0001319304903884896, + "loss": 1.4619, + "step": 26202 + }, + { + "epoch": 0.3404960342276287, + "grad_norm": 0.40866950154304504, + "learning_rate": 0.0001319278909265782, + "loss": 1.2275, + "step": 26203 + }, + { + "epoch": 0.34050902877154454, + "grad_norm": 0.31759610772132874, + "learning_rate": 0.00013192529146466682, + "loss": 1.5058, + "step": 26204 + }, + { + "epoch": 0.34052202331546044, + "grad_norm": 0.3818267583847046, + "learning_rate": 0.00013192269200275545, + "loss": 1.3961, + "step": 26205 + }, + { + "epoch": 0.3405350178593763, + "grad_norm": 0.37706467509269714, + "learning_rate": 0.00013192009254084404, + "loss": 1.5099, + "step": 26206 + }, + { + "epoch": 0.3405480124032922, + "grad_norm": 0.2985096573829651, + "learning_rate": 0.00013191749307893267, + "loss": 1.6341, + "step": 26207 + }, + { + "epoch": 0.34056100694720803, + "grad_norm": 0.2959730923175812, + "learning_rate": 0.00013191489361702127, + "loss": 1.4541, + "step": 26208 + }, + { + "epoch": 0.34057400149112393, + "grad_norm": 0.4692249000072479, + "learning_rate": 0.00013191229415510992, + "loss": 1.4288, + "step": 26209 + }, + { + "epoch": 0.3405869960350398, + "grad_norm": 0.41624554991722107, + "learning_rate": 0.00013190969469319852, + "loss": 1.4315, + "step": 26210 + }, + { + "epoch": 0.3405999905789557, + "grad_norm": 0.499685674905777, + "learning_rate": 0.0001319070952312871, + "loss": 1.5436, + "step": 26211 + }, + { + "epoch": 0.3406129851228715, + "grad_norm": 0.3813563585281372, + "learning_rate": 0.00013190449576937574, + "loss": 1.4574, + "step": 26212 + }, + { + "epoch": 0.3406259796667874, + "grad_norm": 0.29567989706993103, + "learning_rate": 0.00013190189630746436, + "loss": 1.371, + "step": 26213 + }, + { + "epoch": 0.34063897421070327, + "grad_norm": 0.4973951280117035, + "learning_rate": 0.00013189929684555299, + "loss": 1.3602, + "step": 26214 + }, + { + "epoch": 0.34065196875461917, + "grad_norm": 0.4051089584827423, + "learning_rate": 0.00013189669738364158, + "loss": 1.4089, + "step": 26215 + }, + { + "epoch": 0.340664963298535, + "grad_norm": 0.3603508770465851, + "learning_rate": 0.0001318940979217302, + "loss": 1.2967, + "step": 26216 + }, + { + "epoch": 0.3406779578424509, + "grad_norm": 0.44503557682037354, + "learning_rate": 0.00013189149845981883, + "loss": 1.3451, + "step": 26217 + }, + { + "epoch": 0.34069095238636676, + "grad_norm": 0.348886638879776, + "learning_rate": 0.00013188889899790743, + "loss": 1.272, + "step": 26218 + }, + { + "epoch": 0.34070394693028266, + "grad_norm": 0.3516038656234741, + "learning_rate": 0.00013188629953599605, + "loss": 1.4474, + "step": 26219 + }, + { + "epoch": 0.3407169414741985, + "grad_norm": 0.4363178312778473, + "learning_rate": 0.00013188370007408468, + "loss": 1.479, + "step": 26220 + }, + { + "epoch": 0.3407299360181144, + "grad_norm": 0.3630240261554718, + "learning_rate": 0.0001318811006121733, + "loss": 1.566, + "step": 26221 + }, + { + "epoch": 0.34074293056203026, + "grad_norm": 0.4306725263595581, + "learning_rate": 0.0001318785011502619, + "loss": 1.4945, + "step": 26222 + }, + { + "epoch": 0.34075592510594616, + "grad_norm": 0.1859276294708252, + "learning_rate": 0.00013187590168835053, + "loss": 1.2217, + "step": 26223 + }, + { + "epoch": 0.340768919649862, + "grad_norm": 0.4466590881347656, + "learning_rate": 0.00013187330222643915, + "loss": 1.4849, + "step": 26224 + }, + { + "epoch": 0.3407819141937779, + "grad_norm": 0.3336814343929291, + "learning_rate": 0.00013187070276452775, + "loss": 1.4181, + "step": 26225 + }, + { + "epoch": 0.34079490873769375, + "grad_norm": 0.33973827958106995, + "learning_rate": 0.00013186810330261637, + "loss": 1.3845, + "step": 26226 + }, + { + "epoch": 0.34080790328160965, + "grad_norm": 0.39488059282302856, + "learning_rate": 0.00013186550384070497, + "loss": 1.4804, + "step": 26227 + }, + { + "epoch": 0.3408208978255255, + "grad_norm": 0.43439504504203796, + "learning_rate": 0.0001318629043787936, + "loss": 1.4096, + "step": 26228 + }, + { + "epoch": 0.3408338923694414, + "grad_norm": 0.4110659956932068, + "learning_rate": 0.00013186030491688222, + "loss": 1.4024, + "step": 26229 + }, + { + "epoch": 0.34084688691335724, + "grad_norm": 0.4611261487007141, + "learning_rate": 0.00013185770545497082, + "loss": 1.2091, + "step": 26230 + }, + { + "epoch": 0.34085988145727314, + "grad_norm": 0.4604453444480896, + "learning_rate": 0.00013185510599305944, + "loss": 1.3202, + "step": 26231 + }, + { + "epoch": 0.340872876001189, + "grad_norm": 0.3813309371471405, + "learning_rate": 0.00013185250653114806, + "loss": 1.2697, + "step": 26232 + }, + { + "epoch": 0.3408858705451049, + "grad_norm": 0.36649268865585327, + "learning_rate": 0.0001318499070692367, + "loss": 1.1675, + "step": 26233 + }, + { + "epoch": 0.34089886508902073, + "grad_norm": 0.3964530825614929, + "learning_rate": 0.00013184730760732529, + "loss": 1.5673, + "step": 26234 + }, + { + "epoch": 0.34091185963293663, + "grad_norm": 0.3706746995449066, + "learning_rate": 0.0001318447081454139, + "loss": 1.5572, + "step": 26235 + }, + { + "epoch": 0.3409248541768525, + "grad_norm": 0.540446400642395, + "learning_rate": 0.00013184210868350253, + "loss": 1.4675, + "step": 26236 + }, + { + "epoch": 0.3409378487207684, + "grad_norm": 0.4490174651145935, + "learning_rate": 0.00013183950922159113, + "loss": 1.4082, + "step": 26237 + }, + { + "epoch": 0.3409508432646842, + "grad_norm": 0.4045741856098175, + "learning_rate": 0.00013183690975967976, + "loss": 1.2907, + "step": 26238 + }, + { + "epoch": 0.3409638378086001, + "grad_norm": 0.355634868144989, + "learning_rate": 0.00013183431029776835, + "loss": 1.3647, + "step": 26239 + }, + { + "epoch": 0.34097683235251597, + "grad_norm": 0.43347349762916565, + "learning_rate": 0.00013183171083585698, + "loss": 1.4101, + "step": 26240 + }, + { + "epoch": 0.34098982689643187, + "grad_norm": 0.4328981637954712, + "learning_rate": 0.0001318291113739456, + "loss": 1.4145, + "step": 26241 + }, + { + "epoch": 0.3410028214403477, + "grad_norm": 0.3859333097934723, + "learning_rate": 0.0001318265119120342, + "loss": 1.2177, + "step": 26242 + }, + { + "epoch": 0.3410158159842636, + "grad_norm": 0.38912269473075867, + "learning_rate": 0.00013182391245012282, + "loss": 1.4744, + "step": 26243 + }, + { + "epoch": 0.34102881052817946, + "grad_norm": 0.4100562334060669, + "learning_rate": 0.00013182131298821145, + "loss": 1.5057, + "step": 26244 + }, + { + "epoch": 0.34104180507209536, + "grad_norm": 0.2997613251209259, + "learning_rate": 0.00013181871352630007, + "loss": 1.221, + "step": 26245 + }, + { + "epoch": 0.3410547996160112, + "grad_norm": 0.32728850841522217, + "learning_rate": 0.00013181611406438867, + "loss": 1.3384, + "step": 26246 + }, + { + "epoch": 0.3410677941599271, + "grad_norm": 0.27415427565574646, + "learning_rate": 0.0001318135146024773, + "loss": 1.3474, + "step": 26247 + }, + { + "epoch": 0.34108078870384295, + "grad_norm": 0.478397011756897, + "learning_rate": 0.00013181091514056592, + "loss": 1.5275, + "step": 26248 + }, + { + "epoch": 0.34109378324775885, + "grad_norm": 0.40981540083885193, + "learning_rate": 0.00013180831567865452, + "loss": 1.4127, + "step": 26249 + }, + { + "epoch": 0.3411067777916747, + "grad_norm": 0.42245998978614807, + "learning_rate": 0.00013180571621674314, + "loss": 1.4214, + "step": 26250 + }, + { + "epoch": 0.3411197723355906, + "grad_norm": 0.332137793302536, + "learning_rate": 0.00013180311675483174, + "loss": 1.2254, + "step": 26251 + }, + { + "epoch": 0.34113276687950644, + "grad_norm": 0.392443984746933, + "learning_rate": 0.0001318005172929204, + "loss": 1.5185, + "step": 26252 + }, + { + "epoch": 0.34114576142342234, + "grad_norm": 0.30165979266166687, + "learning_rate": 0.000131797917831009, + "loss": 1.3045, + "step": 26253 + }, + { + "epoch": 0.3411587559673382, + "grad_norm": 0.4552200436592102, + "learning_rate": 0.00013179531836909759, + "loss": 1.3728, + "step": 26254 + }, + { + "epoch": 0.3411717505112541, + "grad_norm": 0.3729575574398041, + "learning_rate": 0.0001317927189071862, + "loss": 1.3288, + "step": 26255 + }, + { + "epoch": 0.34118474505516994, + "grad_norm": 0.4388963282108307, + "learning_rate": 0.00013179011944527483, + "loss": 1.3155, + "step": 26256 + }, + { + "epoch": 0.34119773959908584, + "grad_norm": 0.40844663977622986, + "learning_rate": 0.00013178751998336346, + "loss": 1.3226, + "step": 26257 + }, + { + "epoch": 0.3412107341430017, + "grad_norm": 0.4488868713378906, + "learning_rate": 0.00013178492052145206, + "loss": 1.3919, + "step": 26258 + }, + { + "epoch": 0.3412237286869176, + "grad_norm": 0.4920104742050171, + "learning_rate": 0.00013178232105954068, + "loss": 1.5657, + "step": 26259 + }, + { + "epoch": 0.3412367232308334, + "grad_norm": 0.4138815999031067, + "learning_rate": 0.0001317797215976293, + "loss": 1.2676, + "step": 26260 + }, + { + "epoch": 0.34124971777474933, + "grad_norm": 0.37703418731689453, + "learning_rate": 0.0001317771221357179, + "loss": 1.362, + "step": 26261 + }, + { + "epoch": 0.3412627123186652, + "grad_norm": 0.4716152250766754, + "learning_rate": 0.00013177452267380653, + "loss": 1.561, + "step": 26262 + }, + { + "epoch": 0.3412757068625811, + "grad_norm": 0.35938727855682373, + "learning_rate": 0.00013177192321189515, + "loss": 1.3984, + "step": 26263 + }, + { + "epoch": 0.341288701406497, + "grad_norm": 0.39183974266052246, + "learning_rate": 0.00013176932374998378, + "loss": 1.4437, + "step": 26264 + }, + { + "epoch": 0.3413016959504128, + "grad_norm": 0.48845890164375305, + "learning_rate": 0.00013176672428807237, + "loss": 1.5872, + "step": 26265 + }, + { + "epoch": 0.3413146904943287, + "grad_norm": 0.3081307113170624, + "learning_rate": 0.00013176412482616097, + "loss": 1.473, + "step": 26266 + }, + { + "epoch": 0.34132768503824457, + "grad_norm": 0.45425915718078613, + "learning_rate": 0.00013176152536424962, + "loss": 1.5067, + "step": 26267 + }, + { + "epoch": 0.34134067958216047, + "grad_norm": 0.26841649413108826, + "learning_rate": 0.00013175892590233822, + "loss": 1.3336, + "step": 26268 + }, + { + "epoch": 0.3413536741260763, + "grad_norm": 0.43170174956321716, + "learning_rate": 0.00013175632644042684, + "loss": 1.5657, + "step": 26269 + }, + { + "epoch": 0.3413666686699922, + "grad_norm": 0.40054938197135925, + "learning_rate": 0.00013175372697851544, + "loss": 1.3851, + "step": 26270 + }, + { + "epoch": 0.34137966321390806, + "grad_norm": 0.42931756377220154, + "learning_rate": 0.00013175112751660407, + "loss": 1.3425, + "step": 26271 + }, + { + "epoch": 0.34139265775782396, + "grad_norm": 0.3564052879810333, + "learning_rate": 0.0001317485280546927, + "loss": 1.4348, + "step": 26272 + }, + { + "epoch": 0.3414056523017398, + "grad_norm": 0.3920668065547943, + "learning_rate": 0.0001317459285927813, + "loss": 1.5408, + "step": 26273 + }, + { + "epoch": 0.3414186468456557, + "grad_norm": 0.3827129900455475, + "learning_rate": 0.0001317433291308699, + "loss": 1.5328, + "step": 26274 + }, + { + "epoch": 0.34143164138957155, + "grad_norm": 0.34597861766815186, + "learning_rate": 0.00013174072966895854, + "loss": 1.5048, + "step": 26275 + }, + { + "epoch": 0.34144463593348745, + "grad_norm": 0.4332326054573059, + "learning_rate": 0.00013173813020704716, + "loss": 1.4551, + "step": 26276 + }, + { + "epoch": 0.3414576304774033, + "grad_norm": 0.40205588936805725, + "learning_rate": 0.00013173553074513576, + "loss": 1.4662, + "step": 26277 + }, + { + "epoch": 0.3414706250213192, + "grad_norm": 0.38862383365631104, + "learning_rate": 0.00013173293128322436, + "loss": 1.2794, + "step": 26278 + }, + { + "epoch": 0.34148361956523504, + "grad_norm": 0.3297480046749115, + "learning_rate": 0.000131730331821313, + "loss": 1.4107, + "step": 26279 + }, + { + "epoch": 0.34149661410915094, + "grad_norm": 0.3797451853752136, + "learning_rate": 0.0001317277323594016, + "loss": 1.3059, + "step": 26280 + }, + { + "epoch": 0.3415096086530668, + "grad_norm": 0.354782372713089, + "learning_rate": 0.00013172513289749023, + "loss": 1.286, + "step": 26281 + }, + { + "epoch": 0.3415226031969827, + "grad_norm": 0.47224223613739014, + "learning_rate": 0.00013172253343557883, + "loss": 1.4397, + "step": 26282 + }, + { + "epoch": 0.34153559774089853, + "grad_norm": 0.43819892406463623, + "learning_rate": 0.00013171993397366745, + "loss": 1.4415, + "step": 26283 + }, + { + "epoch": 0.34154859228481443, + "grad_norm": 0.43576580286026, + "learning_rate": 0.00013171733451175608, + "loss": 1.4579, + "step": 26284 + }, + { + "epoch": 0.3415615868287303, + "grad_norm": 0.5775370001792908, + "learning_rate": 0.00013171473504984467, + "loss": 1.4571, + "step": 26285 + }, + { + "epoch": 0.3415745813726462, + "grad_norm": 0.3763059377670288, + "learning_rate": 0.0001317121355879333, + "loss": 1.4663, + "step": 26286 + }, + { + "epoch": 0.341587575916562, + "grad_norm": 0.4656793475151062, + "learning_rate": 0.00013170953612602192, + "loss": 1.4845, + "step": 26287 + }, + { + "epoch": 0.3416005704604779, + "grad_norm": 0.5006188750267029, + "learning_rate": 0.00013170693666411055, + "loss": 1.4441, + "step": 26288 + }, + { + "epoch": 0.34161356500439377, + "grad_norm": 0.3583909571170807, + "learning_rate": 0.00013170433720219914, + "loss": 1.5034, + "step": 26289 + }, + { + "epoch": 0.34162655954830967, + "grad_norm": 0.4445924460887909, + "learning_rate": 0.00013170173774028777, + "loss": 1.4116, + "step": 26290 + }, + { + "epoch": 0.3416395540922255, + "grad_norm": 0.47544562816619873, + "learning_rate": 0.0001316991382783764, + "loss": 1.3125, + "step": 26291 + }, + { + "epoch": 0.3416525486361414, + "grad_norm": 0.3971347510814667, + "learning_rate": 0.000131696538816465, + "loss": 1.3046, + "step": 26292 + }, + { + "epoch": 0.34166554318005726, + "grad_norm": 0.3860016465187073, + "learning_rate": 0.00013169393935455362, + "loss": 1.2685, + "step": 26293 + }, + { + "epoch": 0.34167853772397316, + "grad_norm": 0.4071313738822937, + "learning_rate": 0.00013169133989264224, + "loss": 1.4901, + "step": 26294 + }, + { + "epoch": 0.341691532267889, + "grad_norm": 0.3571883738040924, + "learning_rate": 0.00013168874043073084, + "loss": 1.4147, + "step": 26295 + }, + { + "epoch": 0.3417045268118049, + "grad_norm": 0.45985203981399536, + "learning_rate": 0.00013168614096881946, + "loss": 1.512, + "step": 26296 + }, + { + "epoch": 0.34171752135572075, + "grad_norm": 0.33661729097366333, + "learning_rate": 0.00013168354150690806, + "loss": 1.3382, + "step": 26297 + }, + { + "epoch": 0.34173051589963666, + "grad_norm": 0.3625209331512451, + "learning_rate": 0.0001316809420449967, + "loss": 1.3341, + "step": 26298 + }, + { + "epoch": 0.3417435104435525, + "grad_norm": 0.44608303904533386, + "learning_rate": 0.0001316783425830853, + "loss": 1.3986, + "step": 26299 + }, + { + "epoch": 0.3417565049874684, + "grad_norm": 0.41623011231422424, + "learning_rate": 0.00013167574312117393, + "loss": 1.4593, + "step": 26300 + }, + { + "epoch": 0.34176949953138425, + "grad_norm": 0.42953404784202576, + "learning_rate": 0.00013167314365926253, + "loss": 1.6288, + "step": 26301 + }, + { + "epoch": 0.34178249407530015, + "grad_norm": 0.45586636662483215, + "learning_rate": 0.00013167054419735115, + "loss": 1.3778, + "step": 26302 + }, + { + "epoch": 0.341795488619216, + "grad_norm": 0.47565069794654846, + "learning_rate": 0.00013166794473543978, + "loss": 1.4072, + "step": 26303 + }, + { + "epoch": 0.3418084831631319, + "grad_norm": 0.3788107633590698, + "learning_rate": 0.00013166534527352838, + "loss": 1.2263, + "step": 26304 + }, + { + "epoch": 0.34182147770704774, + "grad_norm": 0.5201157927513123, + "learning_rate": 0.000131662745811617, + "loss": 1.6562, + "step": 26305 + }, + { + "epoch": 0.34183447225096364, + "grad_norm": 0.5794119238853455, + "learning_rate": 0.00013166014634970563, + "loss": 1.4535, + "step": 26306 + }, + { + "epoch": 0.3418474667948795, + "grad_norm": 0.3820491135120392, + "learning_rate": 0.00013165754688779425, + "loss": 1.5125, + "step": 26307 + }, + { + "epoch": 0.3418604613387954, + "grad_norm": 0.498670369386673, + "learning_rate": 0.00013165494742588285, + "loss": 1.4182, + "step": 26308 + }, + { + "epoch": 0.34187345588271123, + "grad_norm": 0.3038325607776642, + "learning_rate": 0.00013165234796397144, + "loss": 1.3584, + "step": 26309 + }, + { + "epoch": 0.34188645042662713, + "grad_norm": 0.41189032793045044, + "learning_rate": 0.0001316497485020601, + "loss": 1.3637, + "step": 26310 + }, + { + "epoch": 0.341899444970543, + "grad_norm": 0.39935967326164246, + "learning_rate": 0.0001316471490401487, + "loss": 1.5526, + "step": 26311 + }, + { + "epoch": 0.3419124395144589, + "grad_norm": 0.4516189396381378, + "learning_rate": 0.00013164454957823732, + "loss": 1.2475, + "step": 26312 + }, + { + "epoch": 0.3419254340583747, + "grad_norm": 0.35357481241226196, + "learning_rate": 0.00013164195011632592, + "loss": 1.3867, + "step": 26313 + }, + { + "epoch": 0.3419384286022906, + "grad_norm": 0.3095816969871521, + "learning_rate": 0.00013163935065441454, + "loss": 1.2348, + "step": 26314 + }, + { + "epoch": 0.34195142314620647, + "grad_norm": 0.32389524579048157, + "learning_rate": 0.00013163675119250316, + "loss": 1.4023, + "step": 26315 + }, + { + "epoch": 0.34196441769012237, + "grad_norm": 0.35221102833747864, + "learning_rate": 0.00013163415173059176, + "loss": 1.1919, + "step": 26316 + }, + { + "epoch": 0.3419774122340382, + "grad_norm": 0.3853890001773834, + "learning_rate": 0.00013163155226868039, + "loss": 1.3998, + "step": 26317 + }, + { + "epoch": 0.3419904067779541, + "grad_norm": 0.3337884843349457, + "learning_rate": 0.000131628952806769, + "loss": 1.3211, + "step": 26318 + }, + { + "epoch": 0.34200340132186996, + "grad_norm": 0.4588981568813324, + "learning_rate": 0.00013162635334485764, + "loss": 1.2218, + "step": 26319 + }, + { + "epoch": 0.34201639586578586, + "grad_norm": 0.752056896686554, + "learning_rate": 0.00013162375388294623, + "loss": 1.414, + "step": 26320 + }, + { + "epoch": 0.3420293904097017, + "grad_norm": 0.4785199463367462, + "learning_rate": 0.00013162115442103483, + "loss": 1.6106, + "step": 26321 + }, + { + "epoch": 0.3420423849536176, + "grad_norm": 0.35274800658226013, + "learning_rate": 0.00013161855495912348, + "loss": 1.1984, + "step": 26322 + }, + { + "epoch": 0.34205537949753345, + "grad_norm": 0.4226943850517273, + "learning_rate": 0.00013161595549721208, + "loss": 1.3982, + "step": 26323 + }, + { + "epoch": 0.34206837404144935, + "grad_norm": 0.4360480010509491, + "learning_rate": 0.0001316133560353007, + "loss": 1.3852, + "step": 26324 + }, + { + "epoch": 0.3420813685853652, + "grad_norm": 0.3864213526248932, + "learning_rate": 0.0001316107565733893, + "loss": 1.3523, + "step": 26325 + }, + { + "epoch": 0.3420943631292811, + "grad_norm": 0.42303675413131714, + "learning_rate": 0.00013160815711147793, + "loss": 1.4889, + "step": 26326 + }, + { + "epoch": 0.34210735767319694, + "grad_norm": 0.3875943422317505, + "learning_rate": 0.00013160555764956655, + "loss": 1.4875, + "step": 26327 + }, + { + "epoch": 0.34212035221711284, + "grad_norm": 0.4419989287853241, + "learning_rate": 0.00013160295818765515, + "loss": 1.3568, + "step": 26328 + }, + { + "epoch": 0.3421333467610287, + "grad_norm": 0.3648955821990967, + "learning_rate": 0.00013160035872574377, + "loss": 1.3217, + "step": 26329 + }, + { + "epoch": 0.3421463413049446, + "grad_norm": 0.4119963049888611, + "learning_rate": 0.0001315977592638324, + "loss": 1.4249, + "step": 26330 + }, + { + "epoch": 0.34215933584886044, + "grad_norm": 0.3409191370010376, + "learning_rate": 0.00013159515980192102, + "loss": 1.298, + "step": 26331 + }, + { + "epoch": 0.34217233039277634, + "grad_norm": 0.40300676226615906, + "learning_rate": 0.00013159256034000962, + "loss": 1.4704, + "step": 26332 + }, + { + "epoch": 0.3421853249366922, + "grad_norm": 0.4108428359031677, + "learning_rate": 0.00013158996087809824, + "loss": 1.4081, + "step": 26333 + }, + { + "epoch": 0.3421983194806081, + "grad_norm": 0.47484689950942993, + "learning_rate": 0.00013158736141618687, + "loss": 1.4511, + "step": 26334 + }, + { + "epoch": 0.3422113140245239, + "grad_norm": 0.3842059373855591, + "learning_rate": 0.00013158476195427546, + "loss": 1.5228, + "step": 26335 + }, + { + "epoch": 0.34222430856843983, + "grad_norm": 0.42994076013565063, + "learning_rate": 0.0001315821624923641, + "loss": 1.3756, + "step": 26336 + }, + { + "epoch": 0.3422373031123557, + "grad_norm": 0.511379599571228, + "learning_rate": 0.0001315795630304527, + "loss": 1.4124, + "step": 26337 + }, + { + "epoch": 0.3422502976562716, + "grad_norm": 0.50417160987854, + "learning_rate": 0.0001315769635685413, + "loss": 1.4752, + "step": 26338 + }, + { + "epoch": 0.3422632922001874, + "grad_norm": 0.39912721514701843, + "learning_rate": 0.00013157436410662994, + "loss": 1.5553, + "step": 26339 + }, + { + "epoch": 0.3422762867441033, + "grad_norm": 0.3984702229499817, + "learning_rate": 0.00013157176464471853, + "loss": 1.3134, + "step": 26340 + }, + { + "epoch": 0.3422892812880192, + "grad_norm": 0.4423327147960663, + "learning_rate": 0.00013156916518280718, + "loss": 1.3988, + "step": 26341 + }, + { + "epoch": 0.34230227583193507, + "grad_norm": 0.36527183651924133, + "learning_rate": 0.00013156656572089578, + "loss": 1.4895, + "step": 26342 + }, + { + "epoch": 0.34231527037585097, + "grad_norm": 0.37877795100212097, + "learning_rate": 0.0001315639662589844, + "loss": 1.3614, + "step": 26343 + }, + { + "epoch": 0.3423282649197668, + "grad_norm": 0.37745794653892517, + "learning_rate": 0.000131561366797073, + "loss": 1.4064, + "step": 26344 + }, + { + "epoch": 0.3423412594636827, + "grad_norm": 0.49252164363861084, + "learning_rate": 0.00013155876733516163, + "loss": 1.4495, + "step": 26345 + }, + { + "epoch": 0.34235425400759856, + "grad_norm": 0.411770224571228, + "learning_rate": 0.00013155616787325025, + "loss": 1.3277, + "step": 26346 + }, + { + "epoch": 0.34236724855151446, + "grad_norm": 0.4192935526371002, + "learning_rate": 0.00013155356841133885, + "loss": 1.5386, + "step": 26347 + }, + { + "epoch": 0.3423802430954303, + "grad_norm": 0.32775774598121643, + "learning_rate": 0.00013155096894942747, + "loss": 1.3214, + "step": 26348 + }, + { + "epoch": 0.3423932376393462, + "grad_norm": 0.4045034646987915, + "learning_rate": 0.0001315483694875161, + "loss": 1.3583, + "step": 26349 + }, + { + "epoch": 0.34240623218326205, + "grad_norm": 0.39173516631126404, + "learning_rate": 0.0001315457700256047, + "loss": 1.5276, + "step": 26350 + }, + { + "epoch": 0.34241922672717795, + "grad_norm": 0.34003376960754395, + "learning_rate": 0.00013154317056369332, + "loss": 1.4005, + "step": 26351 + }, + { + "epoch": 0.3424322212710938, + "grad_norm": 0.37397250533103943, + "learning_rate": 0.00013154057110178192, + "loss": 1.1944, + "step": 26352 + }, + { + "epoch": 0.3424452158150097, + "grad_norm": 0.42243069410324097, + "learning_rate": 0.00013153797163987057, + "loss": 1.4837, + "step": 26353 + }, + { + "epoch": 0.34245821035892554, + "grad_norm": 0.35920339822769165, + "learning_rate": 0.00013153537217795917, + "loss": 1.1708, + "step": 26354 + }, + { + "epoch": 0.34247120490284144, + "grad_norm": 0.4497944116592407, + "learning_rate": 0.0001315327727160478, + "loss": 1.3322, + "step": 26355 + }, + { + "epoch": 0.3424841994467573, + "grad_norm": 0.41501423716545105, + "learning_rate": 0.0001315301732541364, + "loss": 1.4838, + "step": 26356 + }, + { + "epoch": 0.3424971939906732, + "grad_norm": 0.4426480531692505, + "learning_rate": 0.000131527573792225, + "loss": 1.4626, + "step": 26357 + }, + { + "epoch": 0.34251018853458903, + "grad_norm": 0.5210657715797424, + "learning_rate": 0.00013152497433031364, + "loss": 1.4778, + "step": 26358 + }, + { + "epoch": 0.34252318307850493, + "grad_norm": 0.35102003812789917, + "learning_rate": 0.00013152237486840224, + "loss": 1.3906, + "step": 26359 + }, + { + "epoch": 0.3425361776224208, + "grad_norm": 0.35952049493789673, + "learning_rate": 0.00013151977540649086, + "loss": 1.3698, + "step": 26360 + }, + { + "epoch": 0.3425491721663367, + "grad_norm": 0.3831641972064972, + "learning_rate": 0.00013151717594457948, + "loss": 1.4593, + "step": 26361 + }, + { + "epoch": 0.3425621667102525, + "grad_norm": 0.36221179366111755, + "learning_rate": 0.00013151457648266808, + "loss": 1.4441, + "step": 26362 + }, + { + "epoch": 0.3425751612541684, + "grad_norm": 0.44126009941101074, + "learning_rate": 0.0001315119770207567, + "loss": 1.4505, + "step": 26363 + }, + { + "epoch": 0.34258815579808427, + "grad_norm": 0.34182173013687134, + "learning_rate": 0.0001315093775588453, + "loss": 1.2765, + "step": 26364 + }, + { + "epoch": 0.34260115034200017, + "grad_norm": 0.3579317331314087, + "learning_rate": 0.00013150677809693395, + "loss": 1.2909, + "step": 26365 + }, + { + "epoch": 0.342614144885916, + "grad_norm": 0.46075743436813354, + "learning_rate": 0.00013150417863502255, + "loss": 1.4719, + "step": 26366 + }, + { + "epoch": 0.3426271394298319, + "grad_norm": 0.4803047776222229, + "learning_rate": 0.00013150157917311118, + "loss": 1.3745, + "step": 26367 + }, + { + "epoch": 0.34264013397374776, + "grad_norm": 0.4405366778373718, + "learning_rate": 0.0001314989797111998, + "loss": 1.4704, + "step": 26368 + }, + { + "epoch": 0.34265312851766366, + "grad_norm": 0.40220585465431213, + "learning_rate": 0.0001314963802492884, + "loss": 1.1502, + "step": 26369 + }, + { + "epoch": 0.3426661230615795, + "grad_norm": 0.42921125888824463, + "learning_rate": 0.00013149378078737702, + "loss": 1.3817, + "step": 26370 + }, + { + "epoch": 0.3426791176054954, + "grad_norm": 0.4432045519351959, + "learning_rate": 0.00013149118132546562, + "loss": 1.4615, + "step": 26371 + }, + { + "epoch": 0.34269211214941125, + "grad_norm": 0.3832964599132538, + "learning_rate": 0.00013148858186355427, + "loss": 1.2793, + "step": 26372 + }, + { + "epoch": 0.34270510669332716, + "grad_norm": 0.4057629406452179, + "learning_rate": 0.00013148598240164287, + "loss": 1.3216, + "step": 26373 + }, + { + "epoch": 0.342718101237243, + "grad_norm": 0.4439396262168884, + "learning_rate": 0.0001314833829397315, + "loss": 1.4825, + "step": 26374 + }, + { + "epoch": 0.3427310957811589, + "grad_norm": 0.45235151052474976, + "learning_rate": 0.0001314807834778201, + "loss": 1.4786, + "step": 26375 + }, + { + "epoch": 0.34274409032507475, + "grad_norm": 0.5342997312545776, + "learning_rate": 0.00013147818401590872, + "loss": 1.4621, + "step": 26376 + }, + { + "epoch": 0.34275708486899065, + "grad_norm": 0.3664233386516571, + "learning_rate": 0.00013147558455399734, + "loss": 1.4069, + "step": 26377 + }, + { + "epoch": 0.3427700794129065, + "grad_norm": 0.5223405361175537, + "learning_rate": 0.00013147298509208594, + "loss": 1.4488, + "step": 26378 + }, + { + "epoch": 0.3427830739568224, + "grad_norm": 0.46037352085113525, + "learning_rate": 0.00013147038563017456, + "loss": 1.5085, + "step": 26379 + }, + { + "epoch": 0.34279606850073824, + "grad_norm": 0.36788487434387207, + "learning_rate": 0.0001314677861682632, + "loss": 1.2701, + "step": 26380 + }, + { + "epoch": 0.34280906304465414, + "grad_norm": 0.5282098054885864, + "learning_rate": 0.00013146518670635178, + "loss": 1.4128, + "step": 26381 + }, + { + "epoch": 0.34282205758857, + "grad_norm": 0.47046396136283875, + "learning_rate": 0.0001314625872444404, + "loss": 1.4163, + "step": 26382 + }, + { + "epoch": 0.3428350521324859, + "grad_norm": 0.40179768204689026, + "learning_rate": 0.000131459987782529, + "loss": 1.3198, + "step": 26383 + }, + { + "epoch": 0.34284804667640173, + "grad_norm": 0.4061926305294037, + "learning_rate": 0.00013145738832061766, + "loss": 1.3725, + "step": 26384 + }, + { + "epoch": 0.34286104122031763, + "grad_norm": 0.27053365111351013, + "learning_rate": 0.00013145478885870625, + "loss": 1.1807, + "step": 26385 + }, + { + "epoch": 0.3428740357642335, + "grad_norm": 0.4072888493537903, + "learning_rate": 0.00013145218939679488, + "loss": 1.3772, + "step": 26386 + }, + { + "epoch": 0.3428870303081494, + "grad_norm": 0.4764579236507416, + "learning_rate": 0.00013144958993488348, + "loss": 1.4268, + "step": 26387 + }, + { + "epoch": 0.3429000248520652, + "grad_norm": 0.38692694902420044, + "learning_rate": 0.0001314469904729721, + "loss": 1.2772, + "step": 26388 + }, + { + "epoch": 0.3429130193959811, + "grad_norm": 0.49210596084594727, + "learning_rate": 0.00013144439101106073, + "loss": 1.3939, + "step": 26389 + }, + { + "epoch": 0.34292601393989697, + "grad_norm": 0.4415774345397949, + "learning_rate": 0.00013144179154914932, + "loss": 1.4295, + "step": 26390 + }, + { + "epoch": 0.34293900848381287, + "grad_norm": 0.3748113214969635, + "learning_rate": 0.00013143919208723795, + "loss": 1.4526, + "step": 26391 + }, + { + "epoch": 0.3429520030277287, + "grad_norm": 0.36553555727005005, + "learning_rate": 0.00013143659262532657, + "loss": 1.4133, + "step": 26392 + }, + { + "epoch": 0.3429649975716446, + "grad_norm": 0.35615047812461853, + "learning_rate": 0.00013143399316341517, + "loss": 1.4006, + "step": 26393 + }, + { + "epoch": 0.34297799211556046, + "grad_norm": 0.32209745049476624, + "learning_rate": 0.0001314313937015038, + "loss": 1.4184, + "step": 26394 + }, + { + "epoch": 0.34299098665947636, + "grad_norm": 0.35506588220596313, + "learning_rate": 0.0001314287942395924, + "loss": 1.4175, + "step": 26395 + }, + { + "epoch": 0.3430039812033922, + "grad_norm": 0.3618049621582031, + "learning_rate": 0.00013142619477768104, + "loss": 1.342, + "step": 26396 + }, + { + "epoch": 0.3430169757473081, + "grad_norm": 0.46010950207710266, + "learning_rate": 0.00013142359531576964, + "loss": 1.3659, + "step": 26397 + }, + { + "epoch": 0.34302997029122395, + "grad_norm": 0.33406463265419006, + "learning_rate": 0.00013142099585385826, + "loss": 1.3513, + "step": 26398 + }, + { + "epoch": 0.34304296483513985, + "grad_norm": 0.43464308977127075, + "learning_rate": 0.00013141839639194686, + "loss": 1.4307, + "step": 26399 + }, + { + "epoch": 0.3430559593790557, + "grad_norm": 0.29055631160736084, + "learning_rate": 0.0001314157969300355, + "loss": 1.4803, + "step": 26400 + }, + { + "epoch": 0.3430689539229716, + "grad_norm": 0.4285714030265808, + "learning_rate": 0.0001314131974681241, + "loss": 1.3632, + "step": 26401 + }, + { + "epoch": 0.34308194846688744, + "grad_norm": 0.4142897129058838, + "learning_rate": 0.0001314105980062127, + "loss": 1.3268, + "step": 26402 + }, + { + "epoch": 0.34309494301080334, + "grad_norm": 0.4252566695213318, + "learning_rate": 0.00013140799854430133, + "loss": 1.4349, + "step": 26403 + }, + { + "epoch": 0.3431079375547192, + "grad_norm": 0.3657315671443939, + "learning_rate": 0.00013140539908238996, + "loss": 1.3544, + "step": 26404 + }, + { + "epoch": 0.3431209320986351, + "grad_norm": 0.4156716465950012, + "learning_rate": 0.00013140279962047855, + "loss": 1.3758, + "step": 26405 + }, + { + "epoch": 0.34313392664255093, + "grad_norm": 0.49092602729797363, + "learning_rate": 0.00013140020015856718, + "loss": 1.4939, + "step": 26406 + }, + { + "epoch": 0.34314692118646684, + "grad_norm": 0.3937337100505829, + "learning_rate": 0.0001313976006966558, + "loss": 1.4222, + "step": 26407 + }, + { + "epoch": 0.3431599157303827, + "grad_norm": 0.39711228013038635, + "learning_rate": 0.00013139500123474443, + "loss": 1.4624, + "step": 26408 + }, + { + "epoch": 0.3431729102742986, + "grad_norm": 0.464324027299881, + "learning_rate": 0.00013139240177283303, + "loss": 1.5072, + "step": 26409 + }, + { + "epoch": 0.3431859048182144, + "grad_norm": 0.43275925517082214, + "learning_rate": 0.00013138980231092165, + "loss": 1.2666, + "step": 26410 + }, + { + "epoch": 0.3431988993621303, + "grad_norm": 0.43466436862945557, + "learning_rate": 0.00013138720284901027, + "loss": 1.4789, + "step": 26411 + }, + { + "epoch": 0.3432118939060462, + "grad_norm": 0.3580063283443451, + "learning_rate": 0.00013138460338709887, + "loss": 1.6326, + "step": 26412 + }, + { + "epoch": 0.3432248884499621, + "grad_norm": 0.2598384916782379, + "learning_rate": 0.0001313820039251875, + "loss": 1.1258, + "step": 26413 + }, + { + "epoch": 0.3432378829938779, + "grad_norm": 0.38475027680397034, + "learning_rate": 0.0001313794044632761, + "loss": 1.4263, + "step": 26414 + }, + { + "epoch": 0.3432508775377938, + "grad_norm": 0.32091692090034485, + "learning_rate": 0.00013137680500136475, + "loss": 1.2125, + "step": 26415 + }, + { + "epoch": 0.34326387208170966, + "grad_norm": 0.3703199028968811, + "learning_rate": 0.00013137420553945334, + "loss": 1.2698, + "step": 26416 + }, + { + "epoch": 0.34327686662562557, + "grad_norm": 0.27267199754714966, + "learning_rate": 0.00013137160607754194, + "loss": 1.2436, + "step": 26417 + }, + { + "epoch": 0.34328986116954147, + "grad_norm": 0.5563708543777466, + "learning_rate": 0.00013136900661563056, + "loss": 1.6145, + "step": 26418 + }, + { + "epoch": 0.3433028557134573, + "grad_norm": 0.3313523530960083, + "learning_rate": 0.0001313664071537192, + "loss": 1.6008, + "step": 26419 + }, + { + "epoch": 0.3433158502573732, + "grad_norm": 0.44421207904815674, + "learning_rate": 0.00013136380769180781, + "loss": 1.303, + "step": 26420 + }, + { + "epoch": 0.34332884480128906, + "grad_norm": 0.33951789140701294, + "learning_rate": 0.0001313612082298964, + "loss": 1.3742, + "step": 26421 + }, + { + "epoch": 0.34334183934520496, + "grad_norm": 0.3753160238265991, + "learning_rate": 0.00013135860876798504, + "loss": 1.2552, + "step": 26422 + }, + { + "epoch": 0.3433548338891208, + "grad_norm": 0.40816816687583923, + "learning_rate": 0.00013135600930607366, + "loss": 1.5655, + "step": 26423 + }, + { + "epoch": 0.3433678284330367, + "grad_norm": 0.3919028043746948, + "learning_rate": 0.00013135340984416226, + "loss": 1.4693, + "step": 26424 + }, + { + "epoch": 0.34338082297695255, + "grad_norm": 0.42948704957962036, + "learning_rate": 0.00013135081038225088, + "loss": 1.4758, + "step": 26425 + }, + { + "epoch": 0.34339381752086845, + "grad_norm": 0.40445271134376526, + "learning_rate": 0.00013134821092033948, + "loss": 1.4842, + "step": 26426 + }, + { + "epoch": 0.3434068120647843, + "grad_norm": 0.3974856734275818, + "learning_rate": 0.00013134561145842813, + "loss": 1.4983, + "step": 26427 + }, + { + "epoch": 0.3434198066087002, + "grad_norm": 0.3403288424015045, + "learning_rate": 0.00013134301199651673, + "loss": 1.6849, + "step": 26428 + }, + { + "epoch": 0.34343280115261604, + "grad_norm": 0.26605531573295593, + "learning_rate": 0.00013134041253460535, + "loss": 1.3151, + "step": 26429 + }, + { + "epoch": 0.34344579569653194, + "grad_norm": 0.5248103737831116, + "learning_rate": 0.00013133781307269395, + "loss": 1.3908, + "step": 26430 + }, + { + "epoch": 0.3434587902404478, + "grad_norm": 0.3893047571182251, + "learning_rate": 0.00013133521361078257, + "loss": 1.2712, + "step": 26431 + }, + { + "epoch": 0.3434717847843637, + "grad_norm": 0.43012064695358276, + "learning_rate": 0.0001313326141488712, + "loss": 1.2313, + "step": 26432 + }, + { + "epoch": 0.34348477932827953, + "grad_norm": 0.4553283154964447, + "learning_rate": 0.0001313300146869598, + "loss": 1.465, + "step": 26433 + }, + { + "epoch": 0.34349777387219543, + "grad_norm": 0.5589954257011414, + "learning_rate": 0.00013132741522504842, + "loss": 1.4338, + "step": 26434 + }, + { + "epoch": 0.3435107684161113, + "grad_norm": 0.31945690512657166, + "learning_rate": 0.00013132481576313705, + "loss": 1.2953, + "step": 26435 + }, + { + "epoch": 0.3435237629600272, + "grad_norm": 0.4094160497188568, + "learning_rate": 0.00013132221630122564, + "loss": 1.4035, + "step": 26436 + }, + { + "epoch": 0.343536757503943, + "grad_norm": 0.4629078209400177, + "learning_rate": 0.00013131961683931427, + "loss": 1.6031, + "step": 26437 + }, + { + "epoch": 0.3435497520478589, + "grad_norm": 0.5530632138252258, + "learning_rate": 0.00013131701737740286, + "loss": 1.5559, + "step": 26438 + }, + { + "epoch": 0.34356274659177477, + "grad_norm": 0.46392112970352173, + "learning_rate": 0.00013131441791549152, + "loss": 1.4361, + "step": 26439 + }, + { + "epoch": 0.34357574113569067, + "grad_norm": 0.4042486548423767, + "learning_rate": 0.00013131181845358011, + "loss": 1.4919, + "step": 26440 + }, + { + "epoch": 0.3435887356796065, + "grad_norm": 0.3291502892971039, + "learning_rate": 0.00013130921899166874, + "loss": 1.2254, + "step": 26441 + }, + { + "epoch": 0.3436017302235224, + "grad_norm": 0.39213046431541443, + "learning_rate": 0.00013130661952975736, + "loss": 1.4108, + "step": 26442 + }, + { + "epoch": 0.34361472476743826, + "grad_norm": 0.41376858949661255, + "learning_rate": 0.00013130402006784596, + "loss": 1.4525, + "step": 26443 + }, + { + "epoch": 0.34362771931135416, + "grad_norm": 0.5738300681114197, + "learning_rate": 0.00013130142060593458, + "loss": 1.5174, + "step": 26444 + }, + { + "epoch": 0.34364071385527, + "grad_norm": 0.3326481580734253, + "learning_rate": 0.00013129882114402318, + "loss": 1.6406, + "step": 26445 + }, + { + "epoch": 0.3436537083991859, + "grad_norm": 0.49983862042427063, + "learning_rate": 0.0001312962216821118, + "loss": 1.6276, + "step": 26446 + }, + { + "epoch": 0.34366670294310175, + "grad_norm": 0.4768376648426056, + "learning_rate": 0.00013129362222020043, + "loss": 1.5746, + "step": 26447 + }, + { + "epoch": 0.34367969748701765, + "grad_norm": 0.5384788513183594, + "learning_rate": 0.00013129102275828903, + "loss": 1.4349, + "step": 26448 + }, + { + "epoch": 0.3436926920309335, + "grad_norm": 0.42418918013572693, + "learning_rate": 0.00013128842329637765, + "loss": 1.4907, + "step": 26449 + }, + { + "epoch": 0.3437056865748494, + "grad_norm": 0.3909682631492615, + "learning_rate": 0.00013128582383446628, + "loss": 1.4411, + "step": 26450 + }, + { + "epoch": 0.34371868111876525, + "grad_norm": 0.3991309106349945, + "learning_rate": 0.0001312832243725549, + "loss": 1.3129, + "step": 26451 + }, + { + "epoch": 0.34373167566268115, + "grad_norm": 0.38891950249671936, + "learning_rate": 0.0001312806249106435, + "loss": 1.2726, + "step": 26452 + }, + { + "epoch": 0.343744670206597, + "grad_norm": 0.3750346601009369, + "learning_rate": 0.00013127802544873212, + "loss": 1.3529, + "step": 26453 + }, + { + "epoch": 0.3437576647505129, + "grad_norm": 0.3789776861667633, + "learning_rate": 0.00013127542598682075, + "loss": 1.3423, + "step": 26454 + }, + { + "epoch": 0.34377065929442874, + "grad_norm": 0.35564276576042175, + "learning_rate": 0.00013127282652490935, + "loss": 1.2671, + "step": 26455 + }, + { + "epoch": 0.34378365383834464, + "grad_norm": 0.5056749582290649, + "learning_rate": 0.00013127022706299797, + "loss": 1.5405, + "step": 26456 + }, + { + "epoch": 0.3437966483822605, + "grad_norm": 0.33762025833129883, + "learning_rate": 0.00013126762760108657, + "loss": 1.4346, + "step": 26457 + }, + { + "epoch": 0.3438096429261764, + "grad_norm": 0.336229532957077, + "learning_rate": 0.00013126502813917522, + "loss": 1.2583, + "step": 26458 + }, + { + "epoch": 0.34382263747009223, + "grad_norm": 0.40589988231658936, + "learning_rate": 0.00013126242867726382, + "loss": 1.309, + "step": 26459 + }, + { + "epoch": 0.34383563201400813, + "grad_norm": 0.4543602466583252, + "learning_rate": 0.00013125982921535241, + "loss": 1.3681, + "step": 26460 + }, + { + "epoch": 0.343848626557924, + "grad_norm": 0.3799445629119873, + "learning_rate": 0.00013125722975344104, + "loss": 1.1839, + "step": 26461 + }, + { + "epoch": 0.3438616211018399, + "grad_norm": 0.37944915890693665, + "learning_rate": 0.00013125463029152966, + "loss": 1.3555, + "step": 26462 + }, + { + "epoch": 0.3438746156457557, + "grad_norm": 0.436146080493927, + "learning_rate": 0.0001312520308296183, + "loss": 1.4196, + "step": 26463 + }, + { + "epoch": 0.3438876101896716, + "grad_norm": 0.4754045307636261, + "learning_rate": 0.00013124943136770688, + "loss": 1.532, + "step": 26464 + }, + { + "epoch": 0.34390060473358747, + "grad_norm": 0.29849299788475037, + "learning_rate": 0.0001312468319057955, + "loss": 1.4282, + "step": 26465 + }, + { + "epoch": 0.34391359927750337, + "grad_norm": 0.37399810552597046, + "learning_rate": 0.00013124423244388413, + "loss": 1.5605, + "step": 26466 + }, + { + "epoch": 0.3439265938214192, + "grad_norm": 0.4111667573451996, + "learning_rate": 0.00013124163298197273, + "loss": 1.2352, + "step": 26467 + }, + { + "epoch": 0.3439395883653351, + "grad_norm": 0.36064958572387695, + "learning_rate": 0.00013123903352006136, + "loss": 1.3691, + "step": 26468 + }, + { + "epoch": 0.34395258290925096, + "grad_norm": 0.3570909798145294, + "learning_rate": 0.00013123643405814995, + "loss": 1.6023, + "step": 26469 + }, + { + "epoch": 0.34396557745316686, + "grad_norm": 0.44390809535980225, + "learning_rate": 0.0001312338345962386, + "loss": 1.5732, + "step": 26470 + }, + { + "epoch": 0.3439785719970827, + "grad_norm": 0.3735039234161377, + "learning_rate": 0.0001312312351343272, + "loss": 1.4107, + "step": 26471 + }, + { + "epoch": 0.3439915665409986, + "grad_norm": 0.44730323553085327, + "learning_rate": 0.0001312286356724158, + "loss": 1.261, + "step": 26472 + }, + { + "epoch": 0.34400456108491445, + "grad_norm": 0.3879878520965576, + "learning_rate": 0.00013122603621050442, + "loss": 1.5137, + "step": 26473 + }, + { + "epoch": 0.34401755562883035, + "grad_norm": 0.41281330585479736, + "learning_rate": 0.00013122343674859305, + "loss": 1.3499, + "step": 26474 + }, + { + "epoch": 0.3440305501727462, + "grad_norm": 0.35124334692955017, + "learning_rate": 0.00013122083728668167, + "loss": 1.3447, + "step": 26475 + }, + { + "epoch": 0.3440435447166621, + "grad_norm": 0.39903172850608826, + "learning_rate": 0.00013121823782477027, + "loss": 1.5345, + "step": 26476 + }, + { + "epoch": 0.34405653926057794, + "grad_norm": 0.43390360474586487, + "learning_rate": 0.0001312156383628589, + "loss": 1.5127, + "step": 26477 + }, + { + "epoch": 0.34406953380449384, + "grad_norm": 0.39824920892715454, + "learning_rate": 0.00013121303890094752, + "loss": 1.4264, + "step": 26478 + }, + { + "epoch": 0.3440825283484097, + "grad_norm": 0.41766518354415894, + "learning_rate": 0.00013121043943903612, + "loss": 1.4143, + "step": 26479 + }, + { + "epoch": 0.3440955228923256, + "grad_norm": 0.3217308819293976, + "learning_rate": 0.00013120783997712474, + "loss": 1.3848, + "step": 26480 + }, + { + "epoch": 0.34410851743624143, + "grad_norm": 0.3809047043323517, + "learning_rate": 0.00013120524051521337, + "loss": 1.3955, + "step": 26481 + }, + { + "epoch": 0.34412151198015734, + "grad_norm": 0.48214903473854065, + "learning_rate": 0.000131202641053302, + "loss": 1.3745, + "step": 26482 + }, + { + "epoch": 0.3441345065240732, + "grad_norm": 0.4011686146259308, + "learning_rate": 0.0001312000415913906, + "loss": 1.5607, + "step": 26483 + }, + { + "epoch": 0.3441475010679891, + "grad_norm": 0.3684973120689392, + "learning_rate": 0.00013119744212947918, + "loss": 1.248, + "step": 26484 + }, + { + "epoch": 0.3441604956119049, + "grad_norm": 0.46288734674453735, + "learning_rate": 0.00013119484266756784, + "loss": 1.4456, + "step": 26485 + }, + { + "epoch": 0.3441734901558208, + "grad_norm": 0.38262632489204407, + "learning_rate": 0.00013119224320565643, + "loss": 1.5189, + "step": 26486 + }, + { + "epoch": 0.3441864846997367, + "grad_norm": 0.48087427020072937, + "learning_rate": 0.00013118964374374506, + "loss": 1.7132, + "step": 26487 + }, + { + "epoch": 0.3441994792436526, + "grad_norm": 0.3711980879306793, + "learning_rate": 0.00013118704428183366, + "loss": 1.401, + "step": 26488 + }, + { + "epoch": 0.3442124737875684, + "grad_norm": 0.40589219331741333, + "learning_rate": 0.00013118444481992228, + "loss": 1.2023, + "step": 26489 + }, + { + "epoch": 0.3442254683314843, + "grad_norm": 0.4111505448818207, + "learning_rate": 0.0001311818453580109, + "loss": 1.3354, + "step": 26490 + }, + { + "epoch": 0.34423846287540016, + "grad_norm": 0.40154072642326355, + "learning_rate": 0.0001311792458960995, + "loss": 1.547, + "step": 26491 + }, + { + "epoch": 0.34425145741931606, + "grad_norm": 0.3786192536354065, + "learning_rate": 0.00013117664643418813, + "loss": 1.3041, + "step": 26492 + }, + { + "epoch": 0.34426445196323197, + "grad_norm": 0.33170366287231445, + "learning_rate": 0.00013117404697227675, + "loss": 1.3759, + "step": 26493 + }, + { + "epoch": 0.3442774465071478, + "grad_norm": 0.2915847599506378, + "learning_rate": 0.00013117144751036538, + "loss": 1.2715, + "step": 26494 + }, + { + "epoch": 0.3442904410510637, + "grad_norm": 0.4430015981197357, + "learning_rate": 0.00013116884804845397, + "loss": 1.2897, + "step": 26495 + }, + { + "epoch": 0.34430343559497956, + "grad_norm": 0.480791300535202, + "learning_rate": 0.0001311662485865426, + "loss": 1.4735, + "step": 26496 + }, + { + "epoch": 0.34431643013889546, + "grad_norm": 0.4185304045677185, + "learning_rate": 0.00013116364912463122, + "loss": 1.2849, + "step": 26497 + }, + { + "epoch": 0.3443294246828113, + "grad_norm": 0.3496531546115875, + "learning_rate": 0.00013116104966271982, + "loss": 1.3791, + "step": 26498 + }, + { + "epoch": 0.3443424192267272, + "grad_norm": 0.34698158502578735, + "learning_rate": 0.00013115845020080844, + "loss": 1.3564, + "step": 26499 + }, + { + "epoch": 0.34435541377064305, + "grad_norm": 0.5261573195457458, + "learning_rate": 0.00013115585073889704, + "loss": 1.4895, + "step": 26500 + }, + { + "epoch": 0.34436840831455895, + "grad_norm": 0.4059036076068878, + "learning_rate": 0.00013115325127698567, + "loss": 1.2928, + "step": 26501 + }, + { + "epoch": 0.3443814028584748, + "grad_norm": 0.3983599543571472, + "learning_rate": 0.0001311506518150743, + "loss": 1.5055, + "step": 26502 + }, + { + "epoch": 0.3443943974023907, + "grad_norm": 0.3382245600223541, + "learning_rate": 0.0001311480523531629, + "loss": 1.4103, + "step": 26503 + }, + { + "epoch": 0.34440739194630654, + "grad_norm": 0.3494957983493805, + "learning_rate": 0.0001311454528912515, + "loss": 1.4274, + "step": 26504 + }, + { + "epoch": 0.34442038649022244, + "grad_norm": 0.45912179350852966, + "learning_rate": 0.00013114285342934014, + "loss": 1.6341, + "step": 26505 + }, + { + "epoch": 0.3444333810341383, + "grad_norm": 0.38130703568458557, + "learning_rate": 0.00013114025396742876, + "loss": 1.45, + "step": 26506 + }, + { + "epoch": 0.3444463755780542, + "grad_norm": 0.37535321712493896, + "learning_rate": 0.00013113765450551736, + "loss": 1.2743, + "step": 26507 + }, + { + "epoch": 0.34445937012197003, + "grad_norm": 0.4131513237953186, + "learning_rate": 0.00013113505504360598, + "loss": 1.3767, + "step": 26508 + }, + { + "epoch": 0.34447236466588593, + "grad_norm": 0.45072776079177856, + "learning_rate": 0.0001311324555816946, + "loss": 1.5252, + "step": 26509 + }, + { + "epoch": 0.3444853592098018, + "grad_norm": 0.36597904562950134, + "learning_rate": 0.0001311298561197832, + "loss": 1.2754, + "step": 26510 + }, + { + "epoch": 0.3444983537537177, + "grad_norm": 0.3923594653606415, + "learning_rate": 0.00013112725665787183, + "loss": 1.3506, + "step": 26511 + }, + { + "epoch": 0.3445113482976335, + "grad_norm": 0.38187533617019653, + "learning_rate": 0.00013112465719596043, + "loss": 1.4836, + "step": 26512 + }, + { + "epoch": 0.3445243428415494, + "grad_norm": 0.3593902587890625, + "learning_rate": 0.00013112205773404908, + "loss": 1.2763, + "step": 26513 + }, + { + "epoch": 0.34453733738546527, + "grad_norm": 0.3674440085887909, + "learning_rate": 0.00013111945827213767, + "loss": 1.3149, + "step": 26514 + }, + { + "epoch": 0.34455033192938117, + "grad_norm": 0.42395225167274475, + "learning_rate": 0.00013111685881022627, + "loss": 1.5541, + "step": 26515 + }, + { + "epoch": 0.344563326473297, + "grad_norm": 0.3809218108654022, + "learning_rate": 0.00013111425934831492, + "loss": 1.4042, + "step": 26516 + }, + { + "epoch": 0.3445763210172129, + "grad_norm": 0.44831323623657227, + "learning_rate": 0.00013111165988640352, + "loss": 1.3498, + "step": 26517 + }, + { + "epoch": 0.34458931556112876, + "grad_norm": 0.4570062756538391, + "learning_rate": 0.00013110906042449215, + "loss": 1.5425, + "step": 26518 + }, + { + "epoch": 0.34460231010504466, + "grad_norm": 0.38537171483039856, + "learning_rate": 0.00013110646096258074, + "loss": 1.4267, + "step": 26519 + }, + { + "epoch": 0.3446153046489605, + "grad_norm": 0.32493555545806885, + "learning_rate": 0.00013110386150066937, + "loss": 1.3463, + "step": 26520 + }, + { + "epoch": 0.3446282991928764, + "grad_norm": 0.4687088131904602, + "learning_rate": 0.000131101262038758, + "loss": 1.4415, + "step": 26521 + }, + { + "epoch": 0.34464129373679225, + "grad_norm": 0.486675500869751, + "learning_rate": 0.0001310986625768466, + "loss": 1.4634, + "step": 26522 + }, + { + "epoch": 0.34465428828070815, + "grad_norm": 0.3574809730052948, + "learning_rate": 0.00013109606311493521, + "loss": 1.4279, + "step": 26523 + }, + { + "epoch": 0.344667282824624, + "grad_norm": 0.2816152572631836, + "learning_rate": 0.00013109346365302384, + "loss": 1.383, + "step": 26524 + }, + { + "epoch": 0.3446802773685399, + "grad_norm": 0.3294565677642822, + "learning_rate": 0.00013109086419111246, + "loss": 1.5036, + "step": 26525 + }, + { + "epoch": 0.34469327191245575, + "grad_norm": 0.34105944633483887, + "learning_rate": 0.00013108826472920106, + "loss": 1.4198, + "step": 26526 + }, + { + "epoch": 0.34470626645637165, + "grad_norm": 0.37580713629722595, + "learning_rate": 0.00013108566526728966, + "loss": 1.1172, + "step": 26527 + }, + { + "epoch": 0.3447192610002875, + "grad_norm": 0.47000688314437866, + "learning_rate": 0.0001310830658053783, + "loss": 1.4908, + "step": 26528 + }, + { + "epoch": 0.3447322555442034, + "grad_norm": 0.5564127564430237, + "learning_rate": 0.0001310804663434669, + "loss": 1.5379, + "step": 26529 + }, + { + "epoch": 0.34474525008811924, + "grad_norm": 0.5190191268920898, + "learning_rate": 0.00013107786688155553, + "loss": 1.4677, + "step": 26530 + }, + { + "epoch": 0.34475824463203514, + "grad_norm": 0.4767417311668396, + "learning_rate": 0.00013107526741964413, + "loss": 1.4555, + "step": 26531 + }, + { + "epoch": 0.344771239175951, + "grad_norm": 0.29780545830726624, + "learning_rate": 0.00013107266795773275, + "loss": 1.5077, + "step": 26532 + }, + { + "epoch": 0.3447842337198669, + "grad_norm": 0.3425752520561218, + "learning_rate": 0.00013107006849582138, + "loss": 1.3988, + "step": 26533 + }, + { + "epoch": 0.34479722826378273, + "grad_norm": 0.5032296776771545, + "learning_rate": 0.00013106746903390997, + "loss": 1.5185, + "step": 26534 + }, + { + "epoch": 0.34481022280769863, + "grad_norm": 0.40545040369033813, + "learning_rate": 0.0001310648695719986, + "loss": 1.3336, + "step": 26535 + }, + { + "epoch": 0.3448232173516145, + "grad_norm": 0.4259105324745178, + "learning_rate": 0.00013106227011008722, + "loss": 1.3649, + "step": 26536 + }, + { + "epoch": 0.3448362118955304, + "grad_norm": 0.3445252478122711, + "learning_rate": 0.00013105967064817585, + "loss": 1.1858, + "step": 26537 + }, + { + "epoch": 0.3448492064394462, + "grad_norm": 0.39967963099479675, + "learning_rate": 0.00013105707118626445, + "loss": 1.2473, + "step": 26538 + }, + { + "epoch": 0.3448622009833621, + "grad_norm": 0.31189975142478943, + "learning_rate": 0.00013105447172435304, + "loss": 0.9183, + "step": 26539 + }, + { + "epoch": 0.34487519552727797, + "grad_norm": 0.4261758327484131, + "learning_rate": 0.0001310518722624417, + "loss": 1.3791, + "step": 26540 + }, + { + "epoch": 0.34488819007119387, + "grad_norm": 0.37235456705093384, + "learning_rate": 0.0001310492728005303, + "loss": 1.3171, + "step": 26541 + }, + { + "epoch": 0.3449011846151097, + "grad_norm": 0.5004569292068481, + "learning_rate": 0.00013104667333861892, + "loss": 1.4813, + "step": 26542 + }, + { + "epoch": 0.3449141791590256, + "grad_norm": 0.7393776178359985, + "learning_rate": 0.00013104407387670751, + "loss": 1.4146, + "step": 26543 + }, + { + "epoch": 0.34492717370294146, + "grad_norm": 0.36590343713760376, + "learning_rate": 0.00013104147441479614, + "loss": 1.3753, + "step": 26544 + }, + { + "epoch": 0.34494016824685736, + "grad_norm": 0.3392195403575897, + "learning_rate": 0.00013103887495288476, + "loss": 1.2776, + "step": 26545 + }, + { + "epoch": 0.3449531627907732, + "grad_norm": 0.44265690445899963, + "learning_rate": 0.00013103627549097336, + "loss": 1.1844, + "step": 26546 + }, + { + "epoch": 0.3449661573346891, + "grad_norm": 0.3379330635070801, + "learning_rate": 0.00013103367602906198, + "loss": 1.3021, + "step": 26547 + }, + { + "epoch": 0.34497915187860495, + "grad_norm": 0.41267129778862, + "learning_rate": 0.0001310310765671506, + "loss": 1.5288, + "step": 26548 + }, + { + "epoch": 0.34499214642252085, + "grad_norm": 0.46174588799476624, + "learning_rate": 0.00013102847710523923, + "loss": 1.3397, + "step": 26549 + }, + { + "epoch": 0.3450051409664367, + "grad_norm": 0.5087870955467224, + "learning_rate": 0.00013102587764332783, + "loss": 1.3289, + "step": 26550 + }, + { + "epoch": 0.3450181355103526, + "grad_norm": 0.47850853204727173, + "learning_rate": 0.00013102327818141646, + "loss": 1.5565, + "step": 26551 + }, + { + "epoch": 0.34503113005426844, + "grad_norm": 0.4383699893951416, + "learning_rate": 0.00013102067871950508, + "loss": 1.3686, + "step": 26552 + }, + { + "epoch": 0.34504412459818434, + "grad_norm": 0.3639032542705536, + "learning_rate": 0.00013101807925759368, + "loss": 1.4646, + "step": 26553 + }, + { + "epoch": 0.3450571191421002, + "grad_norm": 0.3041555881500244, + "learning_rate": 0.0001310154797956823, + "loss": 1.3459, + "step": 26554 + }, + { + "epoch": 0.3450701136860161, + "grad_norm": 0.44253215193748474, + "learning_rate": 0.00013101288033377093, + "loss": 1.4899, + "step": 26555 + }, + { + "epoch": 0.34508310822993193, + "grad_norm": 0.36604005098342896, + "learning_rate": 0.00013101028087185952, + "loss": 1.5035, + "step": 26556 + }, + { + "epoch": 0.34509610277384783, + "grad_norm": 0.45684728026390076, + "learning_rate": 0.00013100768140994815, + "loss": 1.637, + "step": 26557 + }, + { + "epoch": 0.3451090973177637, + "grad_norm": 0.31577402353286743, + "learning_rate": 0.00013100508194803675, + "loss": 1.3657, + "step": 26558 + }, + { + "epoch": 0.3451220918616796, + "grad_norm": 0.3353492021560669, + "learning_rate": 0.0001310024824861254, + "loss": 1.3518, + "step": 26559 + }, + { + "epoch": 0.3451350864055954, + "grad_norm": 0.43974122405052185, + "learning_rate": 0.000130999883024214, + "loss": 1.5166, + "step": 26560 + }, + { + "epoch": 0.3451480809495113, + "grad_norm": 0.36859339475631714, + "learning_rate": 0.00013099728356230262, + "loss": 1.2987, + "step": 26561 + }, + { + "epoch": 0.34516107549342717, + "grad_norm": 0.33191508054733276, + "learning_rate": 0.00013099468410039122, + "loss": 1.3629, + "step": 26562 + }, + { + "epoch": 0.3451740700373431, + "grad_norm": 0.3837544918060303, + "learning_rate": 0.00013099208463847984, + "loss": 1.3121, + "step": 26563 + }, + { + "epoch": 0.3451870645812589, + "grad_norm": 0.4141685366630554, + "learning_rate": 0.00013098948517656847, + "loss": 1.3303, + "step": 26564 + }, + { + "epoch": 0.3452000591251748, + "grad_norm": 0.35764312744140625, + "learning_rate": 0.00013098688571465706, + "loss": 1.2393, + "step": 26565 + }, + { + "epoch": 0.34521305366909066, + "grad_norm": 0.43173328042030334, + "learning_rate": 0.0001309842862527457, + "loss": 1.3957, + "step": 26566 + }, + { + "epoch": 0.34522604821300656, + "grad_norm": 0.403044193983078, + "learning_rate": 0.0001309816867908343, + "loss": 1.3548, + "step": 26567 + }, + { + "epoch": 0.3452390427569224, + "grad_norm": 0.35216888785362244, + "learning_rate": 0.0001309790873289229, + "loss": 1.3546, + "step": 26568 + }, + { + "epoch": 0.3452520373008383, + "grad_norm": 0.38506150245666504, + "learning_rate": 0.00013097648786701153, + "loss": 1.4264, + "step": 26569 + }, + { + "epoch": 0.3452650318447542, + "grad_norm": 0.4423760175704956, + "learning_rate": 0.00013097388840510013, + "loss": 1.1305, + "step": 26570 + }, + { + "epoch": 0.34527802638867006, + "grad_norm": 0.3367474377155304, + "learning_rate": 0.00013097128894318878, + "loss": 1.4004, + "step": 26571 + }, + { + "epoch": 0.34529102093258596, + "grad_norm": 0.4223332703113556, + "learning_rate": 0.00013096868948127738, + "loss": 1.3798, + "step": 26572 + }, + { + "epoch": 0.3453040154765018, + "grad_norm": 0.44328758120536804, + "learning_rate": 0.000130966090019366, + "loss": 1.3648, + "step": 26573 + }, + { + "epoch": 0.3453170100204177, + "grad_norm": 0.3861069977283478, + "learning_rate": 0.0001309634905574546, + "loss": 1.4447, + "step": 26574 + }, + { + "epoch": 0.34533000456433355, + "grad_norm": 0.36092132329940796, + "learning_rate": 0.00013096089109554323, + "loss": 1.5799, + "step": 26575 + }, + { + "epoch": 0.34534299910824945, + "grad_norm": 0.38764095306396484, + "learning_rate": 0.00013095829163363185, + "loss": 1.2236, + "step": 26576 + }, + { + "epoch": 0.3453559936521653, + "grad_norm": 0.42050817608833313, + "learning_rate": 0.00013095569217172045, + "loss": 1.5395, + "step": 26577 + }, + { + "epoch": 0.3453689881960812, + "grad_norm": 0.40338295698165894, + "learning_rate": 0.00013095309270980907, + "loss": 1.4459, + "step": 26578 + }, + { + "epoch": 0.34538198273999704, + "grad_norm": 0.37771138548851013, + "learning_rate": 0.0001309504932478977, + "loss": 1.4327, + "step": 26579 + }, + { + "epoch": 0.34539497728391294, + "grad_norm": 0.34933656454086304, + "learning_rate": 0.00013094789378598632, + "loss": 1.2706, + "step": 26580 + }, + { + "epoch": 0.3454079718278288, + "grad_norm": 0.4201446771621704, + "learning_rate": 0.00013094529432407492, + "loss": 1.4194, + "step": 26581 + }, + { + "epoch": 0.3454209663717447, + "grad_norm": 0.46379080414772034, + "learning_rate": 0.00013094269486216352, + "loss": 1.4112, + "step": 26582 + }, + { + "epoch": 0.34543396091566053, + "grad_norm": 0.4353771507740021, + "learning_rate": 0.00013094009540025217, + "loss": 1.2551, + "step": 26583 + }, + { + "epoch": 0.34544695545957643, + "grad_norm": 0.3479970395565033, + "learning_rate": 0.00013093749593834077, + "loss": 1.3093, + "step": 26584 + }, + { + "epoch": 0.3454599500034923, + "grad_norm": 0.34489932656288147, + "learning_rate": 0.0001309348964764294, + "loss": 1.4885, + "step": 26585 + }, + { + "epoch": 0.3454729445474082, + "grad_norm": 0.3877830505371094, + "learning_rate": 0.000130932297014518, + "loss": 1.3846, + "step": 26586 + }, + { + "epoch": 0.345485939091324, + "grad_norm": 0.5589036345481873, + "learning_rate": 0.0001309296975526066, + "loss": 1.5886, + "step": 26587 + }, + { + "epoch": 0.3454989336352399, + "grad_norm": 0.4398100972175598, + "learning_rate": 0.00013092709809069524, + "loss": 1.4444, + "step": 26588 + }, + { + "epoch": 0.34551192817915577, + "grad_norm": 0.44797220826148987, + "learning_rate": 0.00013092449862878383, + "loss": 1.3693, + "step": 26589 + }, + { + "epoch": 0.34552492272307167, + "grad_norm": 0.40396520495414734, + "learning_rate": 0.00013092189916687246, + "loss": 1.1889, + "step": 26590 + }, + { + "epoch": 0.3455379172669875, + "grad_norm": 0.4713892936706543, + "learning_rate": 0.00013091929970496108, + "loss": 1.4271, + "step": 26591 + }, + { + "epoch": 0.3455509118109034, + "grad_norm": 0.4433099925518036, + "learning_rate": 0.0001309167002430497, + "loss": 1.3485, + "step": 26592 + }, + { + "epoch": 0.34556390635481926, + "grad_norm": 0.36831218004226685, + "learning_rate": 0.0001309141007811383, + "loss": 1.2387, + "step": 26593 + }, + { + "epoch": 0.34557690089873516, + "grad_norm": 0.4953293204307556, + "learning_rate": 0.00013091150131922693, + "loss": 1.4972, + "step": 26594 + }, + { + "epoch": 0.345589895442651, + "grad_norm": 0.3527107834815979, + "learning_rate": 0.00013090890185731555, + "loss": 1.4307, + "step": 26595 + }, + { + "epoch": 0.3456028899865669, + "grad_norm": 0.35049378871917725, + "learning_rate": 0.00013090630239540415, + "loss": 1.4342, + "step": 26596 + }, + { + "epoch": 0.34561588453048275, + "grad_norm": 0.4256710410118103, + "learning_rate": 0.00013090370293349278, + "loss": 1.3892, + "step": 26597 + }, + { + "epoch": 0.34562887907439865, + "grad_norm": 0.44184941053390503, + "learning_rate": 0.0001309011034715814, + "loss": 1.4305, + "step": 26598 + }, + { + "epoch": 0.3456418736183145, + "grad_norm": 0.41900575160980225, + "learning_rate": 0.00013089850400967, + "loss": 1.4564, + "step": 26599 + }, + { + "epoch": 0.3456548681622304, + "grad_norm": 0.26965197920799255, + "learning_rate": 0.00013089590454775862, + "loss": 1.2301, + "step": 26600 + }, + { + "epoch": 0.34566786270614624, + "grad_norm": 0.3843028247356415, + "learning_rate": 0.00013089330508584722, + "loss": 1.3771, + "step": 26601 + }, + { + "epoch": 0.34568085725006215, + "grad_norm": 0.3360713720321655, + "learning_rate": 0.00013089070562393587, + "loss": 1.2677, + "step": 26602 + }, + { + "epoch": 0.345693851793978, + "grad_norm": 0.3312324285507202, + "learning_rate": 0.00013088810616202447, + "loss": 1.3179, + "step": 26603 + }, + { + "epoch": 0.3457068463378939, + "grad_norm": 0.4654258191585541, + "learning_rate": 0.0001308855067001131, + "loss": 1.3397, + "step": 26604 + }, + { + "epoch": 0.34571984088180974, + "grad_norm": 0.414943665266037, + "learning_rate": 0.0001308829072382017, + "loss": 1.3137, + "step": 26605 + }, + { + "epoch": 0.34573283542572564, + "grad_norm": 0.42995309829711914, + "learning_rate": 0.00013088030777629031, + "loss": 1.3915, + "step": 26606 + }, + { + "epoch": 0.3457458299696415, + "grad_norm": 0.33729854226112366, + "learning_rate": 0.00013087770831437894, + "loss": 1.2833, + "step": 26607 + }, + { + "epoch": 0.3457588245135574, + "grad_norm": 0.424701064825058, + "learning_rate": 0.00013087510885246754, + "loss": 1.3046, + "step": 26608 + }, + { + "epoch": 0.34577181905747323, + "grad_norm": 0.4337643086910248, + "learning_rate": 0.00013087250939055616, + "loss": 1.4366, + "step": 26609 + }, + { + "epoch": 0.34578481360138913, + "grad_norm": 0.4414233863353729, + "learning_rate": 0.00013086990992864479, + "loss": 1.5143, + "step": 26610 + }, + { + "epoch": 0.345797808145305, + "grad_norm": 0.3761984407901764, + "learning_rate": 0.00013086731046673338, + "loss": 1.447, + "step": 26611 + }, + { + "epoch": 0.3458108026892209, + "grad_norm": 0.33630993962287903, + "learning_rate": 0.000130864711004822, + "loss": 1.3806, + "step": 26612 + }, + { + "epoch": 0.3458237972331367, + "grad_norm": 0.40293318033218384, + "learning_rate": 0.0001308621115429106, + "loss": 1.3526, + "step": 26613 + }, + { + "epoch": 0.3458367917770526, + "grad_norm": 0.3696369230747223, + "learning_rate": 0.00013085951208099926, + "loss": 1.4276, + "step": 26614 + }, + { + "epoch": 0.34584978632096847, + "grad_norm": 0.3684592843055725, + "learning_rate": 0.00013085691261908785, + "loss": 1.3795, + "step": 26615 + }, + { + "epoch": 0.34586278086488437, + "grad_norm": 0.3810883164405823, + "learning_rate": 0.00013085431315717648, + "loss": 1.2979, + "step": 26616 + }, + { + "epoch": 0.3458757754088002, + "grad_norm": 0.3024573028087616, + "learning_rate": 0.00013085171369526508, + "loss": 1.2637, + "step": 26617 + }, + { + "epoch": 0.3458887699527161, + "grad_norm": 0.3940378725528717, + "learning_rate": 0.0001308491142333537, + "loss": 1.3488, + "step": 26618 + }, + { + "epoch": 0.34590176449663196, + "grad_norm": 0.5149592161178589, + "learning_rate": 0.00013084651477144232, + "loss": 1.5238, + "step": 26619 + }, + { + "epoch": 0.34591475904054786, + "grad_norm": 0.4384099543094635, + "learning_rate": 0.00013084391530953092, + "loss": 1.5454, + "step": 26620 + }, + { + "epoch": 0.3459277535844637, + "grad_norm": 0.25954023003578186, + "learning_rate": 0.00013084131584761955, + "loss": 1.4853, + "step": 26621 + }, + { + "epoch": 0.3459407481283796, + "grad_norm": 0.3572298288345337, + "learning_rate": 0.00013083871638570817, + "loss": 1.3969, + "step": 26622 + }, + { + "epoch": 0.34595374267229545, + "grad_norm": 0.45168083906173706, + "learning_rate": 0.00013083611692379677, + "loss": 1.3728, + "step": 26623 + }, + { + "epoch": 0.34596673721621135, + "grad_norm": 0.27782830595970154, + "learning_rate": 0.0001308335174618854, + "loss": 1.1622, + "step": 26624 + }, + { + "epoch": 0.3459797317601272, + "grad_norm": 0.3413933217525482, + "learning_rate": 0.000130830917999974, + "loss": 1.2639, + "step": 26625 + }, + { + "epoch": 0.3459927263040431, + "grad_norm": 0.4766233563423157, + "learning_rate": 0.00013082831853806264, + "loss": 1.4792, + "step": 26626 + }, + { + "epoch": 0.34600572084795894, + "grad_norm": 0.35271376371383667, + "learning_rate": 0.00013082571907615124, + "loss": 1.2007, + "step": 26627 + }, + { + "epoch": 0.34601871539187484, + "grad_norm": 0.31171727180480957, + "learning_rate": 0.00013082311961423986, + "loss": 1.4985, + "step": 26628 + }, + { + "epoch": 0.3460317099357907, + "grad_norm": 0.4962668716907501, + "learning_rate": 0.0001308205201523285, + "loss": 1.5039, + "step": 26629 + }, + { + "epoch": 0.3460447044797066, + "grad_norm": 0.30937057733535767, + "learning_rate": 0.00013081792069041709, + "loss": 1.1504, + "step": 26630 + }, + { + "epoch": 0.34605769902362243, + "grad_norm": 0.4515775144100189, + "learning_rate": 0.0001308153212285057, + "loss": 1.4177, + "step": 26631 + }, + { + "epoch": 0.34607069356753833, + "grad_norm": 0.375863641500473, + "learning_rate": 0.0001308127217665943, + "loss": 1.3634, + "step": 26632 + }, + { + "epoch": 0.3460836881114542, + "grad_norm": 0.4325713813304901, + "learning_rate": 0.00013081012230468296, + "loss": 1.351, + "step": 26633 + }, + { + "epoch": 0.3460966826553701, + "grad_norm": 0.4273795187473297, + "learning_rate": 0.00013080752284277156, + "loss": 1.5213, + "step": 26634 + }, + { + "epoch": 0.3461096771992859, + "grad_norm": 0.4006188213825226, + "learning_rate": 0.00013080492338086018, + "loss": 1.4764, + "step": 26635 + }, + { + "epoch": 0.3461226717432018, + "grad_norm": 0.3644360601902008, + "learning_rate": 0.00013080232391894878, + "loss": 1.5916, + "step": 26636 + }, + { + "epoch": 0.34613566628711767, + "grad_norm": 0.4127972424030304, + "learning_rate": 0.0001307997244570374, + "loss": 1.4561, + "step": 26637 + }, + { + "epoch": 0.34614866083103357, + "grad_norm": 0.3162440359592438, + "learning_rate": 0.00013079712499512603, + "loss": 1.3836, + "step": 26638 + }, + { + "epoch": 0.3461616553749494, + "grad_norm": 0.39263442158699036, + "learning_rate": 0.00013079452553321462, + "loss": 1.3344, + "step": 26639 + }, + { + "epoch": 0.3461746499188653, + "grad_norm": 0.3766639828681946, + "learning_rate": 0.00013079192607130325, + "loss": 1.1546, + "step": 26640 + }, + { + "epoch": 0.34618764446278116, + "grad_norm": 0.3880411982536316, + "learning_rate": 0.00013078932660939187, + "loss": 1.4707, + "step": 26641 + }, + { + "epoch": 0.34620063900669706, + "grad_norm": 0.3839316964149475, + "learning_rate": 0.00013078672714748047, + "loss": 1.4076, + "step": 26642 + }, + { + "epoch": 0.3462136335506129, + "grad_norm": 0.40619587898254395, + "learning_rate": 0.0001307841276855691, + "loss": 1.3638, + "step": 26643 + }, + { + "epoch": 0.3462266280945288, + "grad_norm": 0.4590248167514801, + "learning_rate": 0.0001307815282236577, + "loss": 1.2623, + "step": 26644 + }, + { + "epoch": 0.3462396226384447, + "grad_norm": 0.4797360897064209, + "learning_rate": 0.00013077892876174634, + "loss": 1.5481, + "step": 26645 + }, + { + "epoch": 0.34625261718236056, + "grad_norm": 0.28102216124534607, + "learning_rate": 0.00013077632929983494, + "loss": 1.3325, + "step": 26646 + }, + { + "epoch": 0.34626561172627646, + "grad_norm": 0.34471723437309265, + "learning_rate": 0.00013077372983792357, + "loss": 1.2331, + "step": 26647 + }, + { + "epoch": 0.3462786062701923, + "grad_norm": 0.43607595562934875, + "learning_rate": 0.00013077113037601216, + "loss": 1.3849, + "step": 26648 + }, + { + "epoch": 0.3462916008141082, + "grad_norm": 0.362563818693161, + "learning_rate": 0.0001307685309141008, + "loss": 1.2369, + "step": 26649 + }, + { + "epoch": 0.34630459535802405, + "grad_norm": 0.3580750823020935, + "learning_rate": 0.0001307659314521894, + "loss": 1.6221, + "step": 26650 + }, + { + "epoch": 0.34631758990193995, + "grad_norm": 0.43542084097862244, + "learning_rate": 0.000130763331990278, + "loss": 1.3917, + "step": 26651 + }, + { + "epoch": 0.3463305844458558, + "grad_norm": 0.33879831433296204, + "learning_rate": 0.00013076073252836663, + "loss": 1.344, + "step": 26652 + }, + { + "epoch": 0.3463435789897717, + "grad_norm": 0.4282108545303345, + "learning_rate": 0.00013075813306645526, + "loss": 1.3396, + "step": 26653 + }, + { + "epoch": 0.34635657353368754, + "grad_norm": 0.3643760681152344, + "learning_rate": 0.00013075553360454386, + "loss": 1.2406, + "step": 26654 + }, + { + "epoch": 0.34636956807760344, + "grad_norm": 0.34198275208473206, + "learning_rate": 0.00013075293414263248, + "loss": 1.2846, + "step": 26655 + }, + { + "epoch": 0.3463825626215193, + "grad_norm": 0.41564545035362244, + "learning_rate": 0.00013075033468072108, + "loss": 1.3382, + "step": 26656 + }, + { + "epoch": 0.3463955571654352, + "grad_norm": 0.34525421261787415, + "learning_rate": 0.00013074773521880973, + "loss": 1.4865, + "step": 26657 + }, + { + "epoch": 0.34640855170935103, + "grad_norm": 0.4296342730522156, + "learning_rate": 0.00013074513575689833, + "loss": 1.4088, + "step": 26658 + }, + { + "epoch": 0.34642154625326693, + "grad_norm": 0.4237171411514282, + "learning_rate": 0.00013074253629498695, + "loss": 1.5191, + "step": 26659 + }, + { + "epoch": 0.3464345407971828, + "grad_norm": 0.3386423885822296, + "learning_rate": 0.00013073993683307555, + "loss": 1.5748, + "step": 26660 + }, + { + "epoch": 0.3464475353410987, + "grad_norm": 0.4119246006011963, + "learning_rate": 0.00013073733737116417, + "loss": 1.4767, + "step": 26661 + }, + { + "epoch": 0.3464605298850145, + "grad_norm": 0.3888741433620453, + "learning_rate": 0.0001307347379092528, + "loss": 1.1758, + "step": 26662 + }, + { + "epoch": 0.3464735244289304, + "grad_norm": 0.37108558416366577, + "learning_rate": 0.0001307321384473414, + "loss": 1.3987, + "step": 26663 + }, + { + "epoch": 0.34648651897284627, + "grad_norm": 0.356509268283844, + "learning_rate": 0.00013072953898543002, + "loss": 1.4093, + "step": 26664 + }, + { + "epoch": 0.34649951351676217, + "grad_norm": 0.39120975136756897, + "learning_rate": 0.00013072693952351864, + "loss": 1.4408, + "step": 26665 + }, + { + "epoch": 0.346512508060678, + "grad_norm": 0.45946523547172546, + "learning_rate": 0.00013072434006160724, + "loss": 1.3367, + "step": 26666 + }, + { + "epoch": 0.3465255026045939, + "grad_norm": 0.4458352327346802, + "learning_rate": 0.00013072174059969587, + "loss": 1.2939, + "step": 26667 + }, + { + "epoch": 0.34653849714850976, + "grad_norm": 0.3228728175163269, + "learning_rate": 0.0001307191411377845, + "loss": 1.396, + "step": 26668 + }, + { + "epoch": 0.34655149169242566, + "grad_norm": 0.38279810547828674, + "learning_rate": 0.00013071654167587311, + "loss": 1.5347, + "step": 26669 + }, + { + "epoch": 0.3465644862363415, + "grad_norm": 0.5361925363540649, + "learning_rate": 0.0001307139422139617, + "loss": 1.3269, + "step": 26670 + }, + { + "epoch": 0.3465774807802574, + "grad_norm": 0.3816174566745758, + "learning_rate": 0.00013071134275205034, + "loss": 1.4083, + "step": 26671 + }, + { + "epoch": 0.34659047532417325, + "grad_norm": 0.3572666645050049, + "learning_rate": 0.00013070874329013896, + "loss": 1.319, + "step": 26672 + }, + { + "epoch": 0.34660346986808915, + "grad_norm": 0.3896945118904114, + "learning_rate": 0.00013070614382822756, + "loss": 1.3536, + "step": 26673 + }, + { + "epoch": 0.346616464412005, + "grad_norm": 0.4612908363342285, + "learning_rate": 0.00013070354436631618, + "loss": 1.4627, + "step": 26674 + }, + { + "epoch": 0.3466294589559209, + "grad_norm": 0.43143564462661743, + "learning_rate": 0.00013070094490440478, + "loss": 1.3865, + "step": 26675 + }, + { + "epoch": 0.34664245349983674, + "grad_norm": 0.3683503568172455, + "learning_rate": 0.00013069834544249343, + "loss": 1.451, + "step": 26676 + }, + { + "epoch": 0.34665544804375265, + "grad_norm": 0.3972083628177643, + "learning_rate": 0.00013069574598058203, + "loss": 1.4988, + "step": 26677 + }, + { + "epoch": 0.3466684425876685, + "grad_norm": 0.43939390778541565, + "learning_rate": 0.00013069314651867063, + "loss": 1.3289, + "step": 26678 + }, + { + "epoch": 0.3466814371315844, + "grad_norm": 0.4743555188179016, + "learning_rate": 0.00013069054705675925, + "loss": 1.523, + "step": 26679 + }, + { + "epoch": 0.34669443167550024, + "grad_norm": 0.457882285118103, + "learning_rate": 0.00013068794759484788, + "loss": 1.4088, + "step": 26680 + }, + { + "epoch": 0.34670742621941614, + "grad_norm": 0.3829633593559265, + "learning_rate": 0.0001306853481329365, + "loss": 1.3891, + "step": 26681 + }, + { + "epoch": 0.346720420763332, + "grad_norm": 0.4332887530326843, + "learning_rate": 0.0001306827486710251, + "loss": 1.6148, + "step": 26682 + }, + { + "epoch": 0.3467334153072479, + "grad_norm": 0.505901575088501, + "learning_rate": 0.00013068014920911372, + "loss": 1.3889, + "step": 26683 + }, + { + "epoch": 0.34674640985116373, + "grad_norm": 0.3263007402420044, + "learning_rate": 0.00013067754974720235, + "loss": 1.2105, + "step": 26684 + }, + { + "epoch": 0.34675940439507963, + "grad_norm": 0.429423987865448, + "learning_rate": 0.00013067495028529094, + "loss": 1.4806, + "step": 26685 + }, + { + "epoch": 0.3467723989389955, + "grad_norm": 0.3776237964630127, + "learning_rate": 0.00013067235082337957, + "loss": 1.3164, + "step": 26686 + }, + { + "epoch": 0.3467853934829114, + "grad_norm": 0.3904382586479187, + "learning_rate": 0.00013066975136146817, + "loss": 1.4389, + "step": 26687 + }, + { + "epoch": 0.3467983880268272, + "grad_norm": 0.38412243127822876, + "learning_rate": 0.00013066715189955682, + "loss": 1.5245, + "step": 26688 + }, + { + "epoch": 0.3468113825707431, + "grad_norm": 0.452934592962265, + "learning_rate": 0.00013066455243764541, + "loss": 1.4766, + "step": 26689 + }, + { + "epoch": 0.34682437711465897, + "grad_norm": 0.4092235267162323, + "learning_rate": 0.000130661952975734, + "loss": 1.4664, + "step": 26690 + }, + { + "epoch": 0.34683737165857487, + "grad_norm": 0.39480167627334595, + "learning_rate": 0.00013065935351382264, + "loss": 1.3915, + "step": 26691 + }, + { + "epoch": 0.3468503662024907, + "grad_norm": 0.2849251627922058, + "learning_rate": 0.00013065675405191126, + "loss": 1.5287, + "step": 26692 + }, + { + "epoch": 0.3468633607464066, + "grad_norm": 0.36102426052093506, + "learning_rate": 0.00013065415458999989, + "loss": 1.1372, + "step": 26693 + }, + { + "epoch": 0.34687635529032246, + "grad_norm": 0.3119557797908783, + "learning_rate": 0.00013065155512808848, + "loss": 1.4428, + "step": 26694 + }, + { + "epoch": 0.34688934983423836, + "grad_norm": 0.5146140456199646, + "learning_rate": 0.0001306489556661771, + "loss": 1.6175, + "step": 26695 + }, + { + "epoch": 0.3469023443781542, + "grad_norm": 0.4329960346221924, + "learning_rate": 0.00013064635620426573, + "loss": 1.3941, + "step": 26696 + }, + { + "epoch": 0.3469153389220701, + "grad_norm": 0.48260965943336487, + "learning_rate": 0.00013064375674235433, + "loss": 1.2661, + "step": 26697 + }, + { + "epoch": 0.34692833346598595, + "grad_norm": 0.4369378089904785, + "learning_rate": 0.00013064115728044295, + "loss": 1.4921, + "step": 26698 + }, + { + "epoch": 0.34694132800990185, + "grad_norm": 0.40378421545028687, + "learning_rate": 0.00013063855781853155, + "loss": 1.3804, + "step": 26699 + }, + { + "epoch": 0.3469543225538177, + "grad_norm": 0.2600562572479248, + "learning_rate": 0.0001306359583566202, + "loss": 1.228, + "step": 26700 + }, + { + "epoch": 0.3469673170977336, + "grad_norm": 0.4316212236881256, + "learning_rate": 0.0001306333588947088, + "loss": 1.3007, + "step": 26701 + }, + { + "epoch": 0.34698031164164944, + "grad_norm": 0.3639039099216461, + "learning_rate": 0.00013063075943279742, + "loss": 1.3644, + "step": 26702 + }, + { + "epoch": 0.34699330618556534, + "grad_norm": 0.3018897771835327, + "learning_rate": 0.00013062815997088605, + "loss": 1.3853, + "step": 26703 + }, + { + "epoch": 0.3470063007294812, + "grad_norm": 0.3685210049152374, + "learning_rate": 0.00013062556050897465, + "loss": 1.3648, + "step": 26704 + }, + { + "epoch": 0.3470192952733971, + "grad_norm": 0.3241848051548004, + "learning_rate": 0.00013062296104706327, + "loss": 1.4097, + "step": 26705 + }, + { + "epoch": 0.34703228981731293, + "grad_norm": 0.4015924334526062, + "learning_rate": 0.00013062036158515187, + "loss": 1.3189, + "step": 26706 + }, + { + "epoch": 0.34704528436122883, + "grad_norm": 0.4488102197647095, + "learning_rate": 0.0001306177621232405, + "loss": 1.5717, + "step": 26707 + }, + { + "epoch": 0.3470582789051447, + "grad_norm": 0.3320499658584595, + "learning_rate": 0.00013061516266132912, + "loss": 1.3487, + "step": 26708 + }, + { + "epoch": 0.3470712734490606, + "grad_norm": 0.32914769649505615, + "learning_rate": 0.00013061256319941771, + "loss": 1.3243, + "step": 26709 + }, + { + "epoch": 0.3470842679929764, + "grad_norm": 0.37523412704467773, + "learning_rate": 0.00013060996373750634, + "loss": 1.3217, + "step": 26710 + }, + { + "epoch": 0.3470972625368923, + "grad_norm": 0.4538055956363678, + "learning_rate": 0.00013060736427559496, + "loss": 1.2689, + "step": 26711 + }, + { + "epoch": 0.34711025708080817, + "grad_norm": 0.3820892572402954, + "learning_rate": 0.0001306047648136836, + "loss": 1.333, + "step": 26712 + }, + { + "epoch": 0.34712325162472407, + "grad_norm": 0.4614258408546448, + "learning_rate": 0.00013060216535177219, + "loss": 1.5112, + "step": 26713 + }, + { + "epoch": 0.3471362461686399, + "grad_norm": 0.37766528129577637, + "learning_rate": 0.0001305995658898608, + "loss": 1.5765, + "step": 26714 + }, + { + "epoch": 0.3471492407125558, + "grad_norm": 0.4048602879047394, + "learning_rate": 0.00013059696642794943, + "loss": 1.4194, + "step": 26715 + }, + { + "epoch": 0.34716223525647166, + "grad_norm": 0.35259419679641724, + "learning_rate": 0.00013059436696603803, + "loss": 1.3734, + "step": 26716 + }, + { + "epoch": 0.34717522980038756, + "grad_norm": 0.3524955213069916, + "learning_rate": 0.00013059176750412666, + "loss": 1.2932, + "step": 26717 + }, + { + "epoch": 0.3471882243443034, + "grad_norm": 0.42704930901527405, + "learning_rate": 0.00013058916804221525, + "loss": 1.368, + "step": 26718 + }, + { + "epoch": 0.3472012188882193, + "grad_norm": 0.42683956027030945, + "learning_rate": 0.0001305865685803039, + "loss": 1.2146, + "step": 26719 + }, + { + "epoch": 0.34721421343213515, + "grad_norm": 0.48381561040878296, + "learning_rate": 0.0001305839691183925, + "loss": 1.5479, + "step": 26720 + }, + { + "epoch": 0.34722720797605106, + "grad_norm": 0.4491442143917084, + "learning_rate": 0.0001305813696564811, + "loss": 1.4738, + "step": 26721 + }, + { + "epoch": 0.34724020251996696, + "grad_norm": 0.4334605634212494, + "learning_rate": 0.00013057877019456972, + "loss": 1.2744, + "step": 26722 + }, + { + "epoch": 0.3472531970638828, + "grad_norm": 0.3497220277786255, + "learning_rate": 0.00013057617073265835, + "loss": 1.3667, + "step": 26723 + }, + { + "epoch": 0.3472661916077987, + "grad_norm": 0.3376922607421875, + "learning_rate": 0.00013057357127074697, + "loss": 1.2066, + "step": 26724 + }, + { + "epoch": 0.34727918615171455, + "grad_norm": 0.37966978549957275, + "learning_rate": 0.00013057097180883557, + "loss": 1.2996, + "step": 26725 + }, + { + "epoch": 0.34729218069563045, + "grad_norm": 0.3898547887802124, + "learning_rate": 0.0001305683723469242, + "loss": 1.3304, + "step": 26726 + }, + { + "epoch": 0.3473051752395463, + "grad_norm": 0.3639774024486542, + "learning_rate": 0.00013056577288501282, + "loss": 1.2869, + "step": 26727 + }, + { + "epoch": 0.3473181697834622, + "grad_norm": 0.3686040937900543, + "learning_rate": 0.00013056317342310142, + "loss": 1.1486, + "step": 26728 + }, + { + "epoch": 0.34733116432737804, + "grad_norm": 0.38157588243484497, + "learning_rate": 0.00013056057396119004, + "loss": 1.3202, + "step": 26729 + }, + { + "epoch": 0.34734415887129394, + "grad_norm": 0.47151950001716614, + "learning_rate": 0.00013055797449927864, + "loss": 1.5539, + "step": 26730 + }, + { + "epoch": 0.3473571534152098, + "grad_norm": 0.4067578911781311, + "learning_rate": 0.0001305553750373673, + "loss": 1.298, + "step": 26731 + }, + { + "epoch": 0.3473701479591257, + "grad_norm": 0.49372127652168274, + "learning_rate": 0.0001305527755754559, + "loss": 1.5364, + "step": 26732 + }, + { + "epoch": 0.34738314250304153, + "grad_norm": 0.4783504605293274, + "learning_rate": 0.00013055017611354449, + "loss": 1.4912, + "step": 26733 + }, + { + "epoch": 0.34739613704695743, + "grad_norm": 0.33491459488868713, + "learning_rate": 0.0001305475766516331, + "loss": 1.422, + "step": 26734 + }, + { + "epoch": 0.3474091315908733, + "grad_norm": 0.35459378361701965, + "learning_rate": 0.00013054497718972173, + "loss": 1.5814, + "step": 26735 + }, + { + "epoch": 0.3474221261347892, + "grad_norm": 0.46049872040748596, + "learning_rate": 0.00013054237772781036, + "loss": 1.4832, + "step": 26736 + }, + { + "epoch": 0.347435120678705, + "grad_norm": 0.26960837841033936, + "learning_rate": 0.00013053977826589896, + "loss": 1.4086, + "step": 26737 + }, + { + "epoch": 0.3474481152226209, + "grad_norm": 0.4439995288848877, + "learning_rate": 0.00013053717880398758, + "loss": 1.5378, + "step": 26738 + }, + { + "epoch": 0.34746110976653677, + "grad_norm": 0.3669470548629761, + "learning_rate": 0.0001305345793420762, + "loss": 1.4257, + "step": 26739 + }, + { + "epoch": 0.34747410431045267, + "grad_norm": 0.42249906063079834, + "learning_rate": 0.0001305319798801648, + "loss": 1.4669, + "step": 26740 + }, + { + "epoch": 0.3474870988543685, + "grad_norm": 0.3158291280269623, + "learning_rate": 0.00013052938041825343, + "loss": 1.2623, + "step": 26741 + }, + { + "epoch": 0.3475000933982844, + "grad_norm": 0.358104407787323, + "learning_rate": 0.00013052678095634205, + "loss": 1.2407, + "step": 26742 + }, + { + "epoch": 0.34751308794220026, + "grad_norm": 0.39465901255607605, + "learning_rate": 0.00013052418149443068, + "loss": 1.4655, + "step": 26743 + }, + { + "epoch": 0.34752608248611616, + "grad_norm": 0.31541532278060913, + "learning_rate": 0.00013052158203251927, + "loss": 1.3835, + "step": 26744 + }, + { + "epoch": 0.347539077030032, + "grad_norm": 0.4479454457759857, + "learning_rate": 0.00013051898257060787, + "loss": 1.4042, + "step": 26745 + }, + { + "epoch": 0.3475520715739479, + "grad_norm": 0.28242114186286926, + "learning_rate": 0.00013051638310869652, + "loss": 1.375, + "step": 26746 + }, + { + "epoch": 0.34756506611786375, + "grad_norm": 0.41462913155555725, + "learning_rate": 0.00013051378364678512, + "loss": 1.5129, + "step": 26747 + }, + { + "epoch": 0.34757806066177965, + "grad_norm": 0.3445412218570709, + "learning_rate": 0.00013051118418487374, + "loss": 1.5114, + "step": 26748 + }, + { + "epoch": 0.3475910552056955, + "grad_norm": 0.44674918055534363, + "learning_rate": 0.00013050858472296234, + "loss": 1.3324, + "step": 26749 + }, + { + "epoch": 0.3476040497496114, + "grad_norm": 0.46157950162887573, + "learning_rate": 0.00013050598526105097, + "loss": 1.4279, + "step": 26750 + }, + { + "epoch": 0.34761704429352724, + "grad_norm": 0.36281871795654297, + "learning_rate": 0.0001305033857991396, + "loss": 1.332, + "step": 26751 + }, + { + "epoch": 0.34763003883744314, + "grad_norm": 0.3747524619102478, + "learning_rate": 0.0001305007863372282, + "loss": 1.4512, + "step": 26752 + }, + { + "epoch": 0.347643033381359, + "grad_norm": 0.37799760699272156, + "learning_rate": 0.0001304981868753168, + "loss": 1.4614, + "step": 26753 + }, + { + "epoch": 0.3476560279252749, + "grad_norm": 0.426281601190567, + "learning_rate": 0.00013049558741340544, + "loss": 1.4905, + "step": 26754 + }, + { + "epoch": 0.34766902246919074, + "grad_norm": 0.4806407392024994, + "learning_rate": 0.00013049298795149406, + "loss": 1.4491, + "step": 26755 + }, + { + "epoch": 0.34768201701310664, + "grad_norm": 0.3714763820171356, + "learning_rate": 0.00013049038848958266, + "loss": 1.3374, + "step": 26756 + }, + { + "epoch": 0.3476950115570225, + "grad_norm": 0.38324642181396484, + "learning_rate": 0.00013048778902767128, + "loss": 1.359, + "step": 26757 + }, + { + "epoch": 0.3477080061009384, + "grad_norm": 0.376619815826416, + "learning_rate": 0.0001304851895657599, + "loss": 1.2543, + "step": 26758 + }, + { + "epoch": 0.34772100064485423, + "grad_norm": 0.3975328505039215, + "learning_rate": 0.0001304825901038485, + "loss": 1.2356, + "step": 26759 + }, + { + "epoch": 0.34773399518877013, + "grad_norm": 0.40708211064338684, + "learning_rate": 0.00013047999064193713, + "loss": 1.3125, + "step": 26760 + }, + { + "epoch": 0.347746989732686, + "grad_norm": 0.39774417877197266, + "learning_rate": 0.00013047739118002573, + "loss": 1.3985, + "step": 26761 + }, + { + "epoch": 0.3477599842766019, + "grad_norm": 0.3474942743778229, + "learning_rate": 0.00013047479171811435, + "loss": 1.4585, + "step": 26762 + }, + { + "epoch": 0.3477729788205177, + "grad_norm": 0.4041062295436859, + "learning_rate": 0.00013047219225620298, + "loss": 1.2488, + "step": 26763 + }, + { + "epoch": 0.3477859733644336, + "grad_norm": 0.43606680631637573, + "learning_rate": 0.00013046959279429157, + "loss": 1.4448, + "step": 26764 + }, + { + "epoch": 0.34779896790834947, + "grad_norm": 0.4023717939853668, + "learning_rate": 0.0001304669933323802, + "loss": 1.4276, + "step": 26765 + }, + { + "epoch": 0.34781196245226537, + "grad_norm": 0.4473191201686859, + "learning_rate": 0.00013046439387046882, + "loss": 1.5601, + "step": 26766 + }, + { + "epoch": 0.3478249569961812, + "grad_norm": 0.39667677879333496, + "learning_rate": 0.00013046179440855745, + "loss": 1.2787, + "step": 26767 + }, + { + "epoch": 0.3478379515400971, + "grad_norm": 0.505514919757843, + "learning_rate": 0.00013045919494664604, + "loss": 1.6057, + "step": 26768 + }, + { + "epoch": 0.34785094608401296, + "grad_norm": 0.33252087235450745, + "learning_rate": 0.00013045659548473467, + "loss": 1.3894, + "step": 26769 + }, + { + "epoch": 0.34786394062792886, + "grad_norm": 0.36331695318222046, + "learning_rate": 0.0001304539960228233, + "loss": 1.1501, + "step": 26770 + }, + { + "epoch": 0.3478769351718447, + "grad_norm": 0.33615437150001526, + "learning_rate": 0.0001304513965609119, + "loss": 1.2659, + "step": 26771 + }, + { + "epoch": 0.3478899297157606, + "grad_norm": 0.4483484923839569, + "learning_rate": 0.00013044879709900052, + "loss": 1.4512, + "step": 26772 + }, + { + "epoch": 0.34790292425967645, + "grad_norm": 0.36265531182289124, + "learning_rate": 0.0001304461976370891, + "loss": 1.531, + "step": 26773 + }, + { + "epoch": 0.34791591880359235, + "grad_norm": 0.500744640827179, + "learning_rate": 0.00013044359817517774, + "loss": 1.4346, + "step": 26774 + }, + { + "epoch": 0.3479289133475082, + "grad_norm": 0.4104214310646057, + "learning_rate": 0.00013044099871326636, + "loss": 1.5788, + "step": 26775 + }, + { + "epoch": 0.3479419078914241, + "grad_norm": 0.3251439332962036, + "learning_rate": 0.00013043839925135496, + "loss": 1.2545, + "step": 26776 + }, + { + "epoch": 0.34795490243533994, + "grad_norm": 0.4101673364639282, + "learning_rate": 0.0001304357997894436, + "loss": 1.3646, + "step": 26777 + }, + { + "epoch": 0.34796789697925584, + "grad_norm": 0.31956610083580017, + "learning_rate": 0.0001304332003275322, + "loss": 1.4088, + "step": 26778 + }, + { + "epoch": 0.3479808915231717, + "grad_norm": 0.396186500787735, + "learning_rate": 0.00013043060086562083, + "loss": 1.6465, + "step": 26779 + }, + { + "epoch": 0.3479938860670876, + "grad_norm": 0.3561484217643738, + "learning_rate": 0.00013042800140370943, + "loss": 1.3215, + "step": 26780 + }, + { + "epoch": 0.34800688061100343, + "grad_norm": 0.40662887692451477, + "learning_rate": 0.00013042540194179805, + "loss": 1.5822, + "step": 26781 + }, + { + "epoch": 0.34801987515491933, + "grad_norm": 0.4449978172779083, + "learning_rate": 0.00013042280247988668, + "loss": 1.2456, + "step": 26782 + }, + { + "epoch": 0.3480328696988352, + "grad_norm": 0.5672455430030823, + "learning_rate": 0.00013042020301797528, + "loss": 1.4257, + "step": 26783 + }, + { + "epoch": 0.3480458642427511, + "grad_norm": 0.46624547243118286, + "learning_rate": 0.0001304176035560639, + "loss": 1.6072, + "step": 26784 + }, + { + "epoch": 0.3480588587866669, + "grad_norm": 0.4050205945968628, + "learning_rate": 0.00013041500409415252, + "loss": 1.2362, + "step": 26785 + }, + { + "epoch": 0.3480718533305828, + "grad_norm": 0.3998050093650818, + "learning_rate": 0.00013041240463224115, + "loss": 1.5535, + "step": 26786 + }, + { + "epoch": 0.34808484787449867, + "grad_norm": 0.3982885777950287, + "learning_rate": 0.00013040980517032975, + "loss": 1.443, + "step": 26787 + }, + { + "epoch": 0.34809784241841457, + "grad_norm": 0.38016390800476074, + "learning_rate": 0.00013040720570841834, + "loss": 1.1967, + "step": 26788 + }, + { + "epoch": 0.3481108369623304, + "grad_norm": 0.4149615168571472, + "learning_rate": 0.000130404606246507, + "loss": 1.3022, + "step": 26789 + }, + { + "epoch": 0.3481238315062463, + "grad_norm": 0.46945932507514954, + "learning_rate": 0.0001304020067845956, + "loss": 1.3634, + "step": 26790 + }, + { + "epoch": 0.34813682605016216, + "grad_norm": 0.45238369703292847, + "learning_rate": 0.00013039940732268422, + "loss": 1.3256, + "step": 26791 + }, + { + "epoch": 0.34814982059407806, + "grad_norm": 0.258750855922699, + "learning_rate": 0.00013039680786077281, + "loss": 1.3508, + "step": 26792 + }, + { + "epoch": 0.3481628151379939, + "grad_norm": 0.542230486869812, + "learning_rate": 0.00013039420839886144, + "loss": 1.477, + "step": 26793 + }, + { + "epoch": 0.3481758096819098, + "grad_norm": 0.3329165279865265, + "learning_rate": 0.00013039160893695006, + "loss": 1.5693, + "step": 26794 + }, + { + "epoch": 0.34818880422582565, + "grad_norm": 0.3818724751472473, + "learning_rate": 0.00013038900947503866, + "loss": 1.3297, + "step": 26795 + }, + { + "epoch": 0.34820179876974156, + "grad_norm": 0.4799937903881073, + "learning_rate": 0.00013038641001312729, + "loss": 1.4337, + "step": 26796 + }, + { + "epoch": 0.34821479331365746, + "grad_norm": 0.424570769071579, + "learning_rate": 0.0001303838105512159, + "loss": 1.4419, + "step": 26797 + }, + { + "epoch": 0.3482277878575733, + "grad_norm": 0.42631858587265015, + "learning_rate": 0.00013038121108930453, + "loss": 1.4089, + "step": 26798 + }, + { + "epoch": 0.3482407824014892, + "grad_norm": 0.31591910123825073, + "learning_rate": 0.00013037861162739313, + "loss": 1.3228, + "step": 26799 + }, + { + "epoch": 0.34825377694540505, + "grad_norm": 0.3717425763607025, + "learning_rate": 0.00013037601216548173, + "loss": 1.2834, + "step": 26800 + }, + { + "epoch": 0.34826677148932095, + "grad_norm": 0.4752770960330963, + "learning_rate": 0.00013037341270357038, + "loss": 1.482, + "step": 26801 + }, + { + "epoch": 0.3482797660332368, + "grad_norm": 0.5003453493118286, + "learning_rate": 0.00013037081324165898, + "loss": 1.6048, + "step": 26802 + }, + { + "epoch": 0.3482927605771527, + "grad_norm": 0.46734803915023804, + "learning_rate": 0.0001303682137797476, + "loss": 1.2822, + "step": 26803 + }, + { + "epoch": 0.34830575512106854, + "grad_norm": 0.30715063214302063, + "learning_rate": 0.0001303656143178362, + "loss": 1.3288, + "step": 26804 + }, + { + "epoch": 0.34831874966498444, + "grad_norm": 0.4108499586582184, + "learning_rate": 0.00013036301485592482, + "loss": 1.3815, + "step": 26805 + }, + { + "epoch": 0.3483317442089003, + "grad_norm": 0.36533400416374207, + "learning_rate": 0.00013036041539401345, + "loss": 1.1965, + "step": 26806 + }, + { + "epoch": 0.3483447387528162, + "grad_norm": 0.5020785927772522, + "learning_rate": 0.00013035781593210205, + "loss": 1.5708, + "step": 26807 + }, + { + "epoch": 0.34835773329673203, + "grad_norm": 0.48276105523109436, + "learning_rate": 0.00013035521647019067, + "loss": 1.2978, + "step": 26808 + }, + { + "epoch": 0.34837072784064793, + "grad_norm": 0.5764045715332031, + "learning_rate": 0.0001303526170082793, + "loss": 1.4011, + "step": 26809 + }, + { + "epoch": 0.3483837223845638, + "grad_norm": 0.5054870843887329, + "learning_rate": 0.00013035001754636792, + "loss": 1.5591, + "step": 26810 + }, + { + "epoch": 0.3483967169284797, + "grad_norm": 0.3600458800792694, + "learning_rate": 0.00013034741808445652, + "loss": 1.5582, + "step": 26811 + }, + { + "epoch": 0.3484097114723955, + "grad_norm": 0.3598695397377014, + "learning_rate": 0.00013034481862254511, + "loss": 1.1718, + "step": 26812 + }, + { + "epoch": 0.3484227060163114, + "grad_norm": 0.35998156666755676, + "learning_rate": 0.00013034221916063377, + "loss": 1.4594, + "step": 26813 + }, + { + "epoch": 0.34843570056022727, + "grad_norm": 0.3428550362586975, + "learning_rate": 0.00013033961969872236, + "loss": 1.4121, + "step": 26814 + }, + { + "epoch": 0.34844869510414317, + "grad_norm": 0.48719048500061035, + "learning_rate": 0.000130337020236811, + "loss": 1.2761, + "step": 26815 + }, + { + "epoch": 0.348461689648059, + "grad_norm": 0.43609923124313354, + "learning_rate": 0.0001303344207748996, + "loss": 1.3259, + "step": 26816 + }, + { + "epoch": 0.3484746841919749, + "grad_norm": 0.3070835471153259, + "learning_rate": 0.0001303318213129882, + "loss": 1.3896, + "step": 26817 + }, + { + "epoch": 0.34848767873589076, + "grad_norm": 0.4117427468299866, + "learning_rate": 0.00013032922185107683, + "loss": 1.1216, + "step": 26818 + }, + { + "epoch": 0.34850067327980666, + "grad_norm": 0.34535476565361023, + "learning_rate": 0.00013032662238916543, + "loss": 1.3092, + "step": 26819 + }, + { + "epoch": 0.3485136678237225, + "grad_norm": 0.368813693523407, + "learning_rate": 0.00013032402292725408, + "loss": 1.3062, + "step": 26820 + }, + { + "epoch": 0.3485266623676384, + "grad_norm": 0.3915729820728302, + "learning_rate": 0.00013032142346534268, + "loss": 1.3569, + "step": 26821 + }, + { + "epoch": 0.34853965691155425, + "grad_norm": 0.3854377567768097, + "learning_rate": 0.0001303188240034313, + "loss": 1.466, + "step": 26822 + }, + { + "epoch": 0.34855265145547015, + "grad_norm": 0.45055168867111206, + "learning_rate": 0.0001303162245415199, + "loss": 1.487, + "step": 26823 + }, + { + "epoch": 0.348565645999386, + "grad_norm": 0.4203272759914398, + "learning_rate": 0.00013031362507960853, + "loss": 1.4011, + "step": 26824 + }, + { + "epoch": 0.3485786405433019, + "grad_norm": 0.3517386019229889, + "learning_rate": 0.00013031102561769715, + "loss": 1.3355, + "step": 26825 + }, + { + "epoch": 0.34859163508721774, + "grad_norm": 0.35518479347229004, + "learning_rate": 0.00013030842615578575, + "loss": 1.3006, + "step": 26826 + }, + { + "epoch": 0.34860462963113364, + "grad_norm": 0.42156311869621277, + "learning_rate": 0.00013030582669387437, + "loss": 1.6495, + "step": 26827 + }, + { + "epoch": 0.3486176241750495, + "grad_norm": 0.3892824649810791, + "learning_rate": 0.000130303227231963, + "loss": 1.3528, + "step": 26828 + }, + { + "epoch": 0.3486306187189654, + "grad_norm": 0.3573375642299652, + "learning_rate": 0.0001303006277700516, + "loss": 1.456, + "step": 26829 + }, + { + "epoch": 0.34864361326288124, + "grad_norm": 0.377510130405426, + "learning_rate": 0.00013029802830814022, + "loss": 1.5098, + "step": 26830 + }, + { + "epoch": 0.34865660780679714, + "grad_norm": 0.4737508296966553, + "learning_rate": 0.00013029542884622882, + "loss": 1.3942, + "step": 26831 + }, + { + "epoch": 0.348669602350713, + "grad_norm": 0.5155438780784607, + "learning_rate": 0.00013029282938431747, + "loss": 1.6184, + "step": 26832 + }, + { + "epoch": 0.3486825968946289, + "grad_norm": 0.41911548376083374, + "learning_rate": 0.00013029022992240607, + "loss": 1.3075, + "step": 26833 + }, + { + "epoch": 0.3486955914385447, + "grad_norm": 0.393661230802536, + "learning_rate": 0.0001302876304604947, + "loss": 1.4884, + "step": 26834 + }, + { + "epoch": 0.34870858598246063, + "grad_norm": 0.41192901134490967, + "learning_rate": 0.0001302850309985833, + "loss": 1.3713, + "step": 26835 + }, + { + "epoch": 0.3487215805263765, + "grad_norm": 0.40410315990448, + "learning_rate": 0.0001302824315366719, + "loss": 1.3532, + "step": 26836 + }, + { + "epoch": 0.3487345750702924, + "grad_norm": 0.37459951639175415, + "learning_rate": 0.00013027983207476054, + "loss": 1.626, + "step": 26837 + }, + { + "epoch": 0.3487475696142082, + "grad_norm": 0.5089786052703857, + "learning_rate": 0.00013027723261284913, + "loss": 1.3874, + "step": 26838 + }, + { + "epoch": 0.3487605641581241, + "grad_norm": 0.42086324095726013, + "learning_rate": 0.00013027463315093776, + "loss": 1.4244, + "step": 26839 + }, + { + "epoch": 0.34877355870203997, + "grad_norm": 0.4996035695075989, + "learning_rate": 0.00013027203368902638, + "loss": 1.5482, + "step": 26840 + }, + { + "epoch": 0.34878655324595587, + "grad_norm": 0.3828352987766266, + "learning_rate": 0.000130269434227115, + "loss": 1.4537, + "step": 26841 + }, + { + "epoch": 0.3487995477898717, + "grad_norm": 0.38782718777656555, + "learning_rate": 0.0001302668347652036, + "loss": 1.5231, + "step": 26842 + }, + { + "epoch": 0.3488125423337876, + "grad_norm": 0.3696490526199341, + "learning_rate": 0.0001302642353032922, + "loss": 1.4333, + "step": 26843 + }, + { + "epoch": 0.34882553687770346, + "grad_norm": 0.457909494638443, + "learning_rate": 0.00013026163584138085, + "loss": 1.5323, + "step": 26844 + }, + { + "epoch": 0.34883853142161936, + "grad_norm": 0.45369207859039307, + "learning_rate": 0.00013025903637946945, + "loss": 1.5404, + "step": 26845 + }, + { + "epoch": 0.3488515259655352, + "grad_norm": 0.39002755284309387, + "learning_rate": 0.00013025643691755808, + "loss": 1.3482, + "step": 26846 + }, + { + "epoch": 0.3488645205094511, + "grad_norm": 0.4447665214538574, + "learning_rate": 0.00013025383745564667, + "loss": 1.3912, + "step": 26847 + }, + { + "epoch": 0.34887751505336695, + "grad_norm": 0.516338050365448, + "learning_rate": 0.0001302512379937353, + "loss": 1.5053, + "step": 26848 + }, + { + "epoch": 0.34889050959728285, + "grad_norm": 0.3584909439086914, + "learning_rate": 0.00013024863853182392, + "loss": 1.7162, + "step": 26849 + }, + { + "epoch": 0.3489035041411987, + "grad_norm": 0.45109349489212036, + "learning_rate": 0.00013024603906991252, + "loss": 1.4726, + "step": 26850 + }, + { + "epoch": 0.3489164986851146, + "grad_norm": 0.48669570684432983, + "learning_rate": 0.00013024343960800117, + "loss": 1.277, + "step": 26851 + }, + { + "epoch": 0.34892949322903044, + "grad_norm": 0.3479592204093933, + "learning_rate": 0.00013024084014608977, + "loss": 1.3113, + "step": 26852 + }, + { + "epoch": 0.34894248777294634, + "grad_norm": 0.43328437209129333, + "learning_rate": 0.0001302382406841784, + "loss": 1.6825, + "step": 26853 + }, + { + "epoch": 0.3489554823168622, + "grad_norm": 0.3987380862236023, + "learning_rate": 0.000130235641222267, + "loss": 1.2621, + "step": 26854 + }, + { + "epoch": 0.3489684768607781, + "grad_norm": 0.49009841680526733, + "learning_rate": 0.00013023304176035562, + "loss": 1.5059, + "step": 26855 + }, + { + "epoch": 0.34898147140469393, + "grad_norm": 0.4359487295150757, + "learning_rate": 0.00013023044229844424, + "loss": 1.3974, + "step": 26856 + }, + { + "epoch": 0.34899446594860983, + "grad_norm": 0.3093532919883728, + "learning_rate": 0.00013022784283653284, + "loss": 1.4209, + "step": 26857 + }, + { + "epoch": 0.3490074604925257, + "grad_norm": 0.33768537640571594, + "learning_rate": 0.00013022524337462146, + "loss": 1.3923, + "step": 26858 + }, + { + "epoch": 0.3490204550364416, + "grad_norm": 0.36312350630760193, + "learning_rate": 0.00013022264391271009, + "loss": 1.1169, + "step": 26859 + }, + { + "epoch": 0.3490334495803574, + "grad_norm": 0.460262268781662, + "learning_rate": 0.00013022004445079868, + "loss": 1.372, + "step": 26860 + }, + { + "epoch": 0.3490464441242733, + "grad_norm": 0.3945809006690979, + "learning_rate": 0.0001302174449888873, + "loss": 1.4191, + "step": 26861 + }, + { + "epoch": 0.34905943866818917, + "grad_norm": 0.3828829228878021, + "learning_rate": 0.0001302148455269759, + "loss": 1.41, + "step": 26862 + }, + { + "epoch": 0.34907243321210507, + "grad_norm": 0.4079018533229828, + "learning_rate": 0.00013021224606506456, + "loss": 1.4159, + "step": 26863 + }, + { + "epoch": 0.3490854277560209, + "grad_norm": 0.37046951055526733, + "learning_rate": 0.00013020964660315315, + "loss": 1.3911, + "step": 26864 + }, + { + "epoch": 0.3490984222999368, + "grad_norm": 0.37089216709136963, + "learning_rate": 0.00013020704714124178, + "loss": 1.3024, + "step": 26865 + }, + { + "epoch": 0.34911141684385266, + "grad_norm": 0.33971282839775085, + "learning_rate": 0.00013020444767933038, + "loss": 1.4925, + "step": 26866 + }, + { + "epoch": 0.34912441138776856, + "grad_norm": 0.3835345208644867, + "learning_rate": 0.000130201848217419, + "loss": 1.476, + "step": 26867 + }, + { + "epoch": 0.3491374059316844, + "grad_norm": 0.3725162148475647, + "learning_rate": 0.00013019924875550763, + "loss": 1.1449, + "step": 26868 + }, + { + "epoch": 0.3491504004756003, + "grad_norm": 0.494057834148407, + "learning_rate": 0.00013019664929359622, + "loss": 1.4632, + "step": 26869 + }, + { + "epoch": 0.34916339501951615, + "grad_norm": 0.3196750283241272, + "learning_rate": 0.00013019404983168485, + "loss": 1.3638, + "step": 26870 + }, + { + "epoch": 0.34917638956343205, + "grad_norm": 0.37443065643310547, + "learning_rate": 0.00013019145036977347, + "loss": 1.4362, + "step": 26871 + }, + { + "epoch": 0.3491893841073479, + "grad_norm": 0.4043956696987152, + "learning_rate": 0.00013018885090786207, + "loss": 1.2854, + "step": 26872 + }, + { + "epoch": 0.3492023786512638, + "grad_norm": 0.45044976472854614, + "learning_rate": 0.0001301862514459507, + "loss": 1.4914, + "step": 26873 + }, + { + "epoch": 0.3492153731951797, + "grad_norm": 0.4577068090438843, + "learning_rate": 0.0001301836519840393, + "loss": 1.392, + "step": 26874 + }, + { + "epoch": 0.34922836773909555, + "grad_norm": 0.4043601453304291, + "learning_rate": 0.00013018105252212794, + "loss": 1.5481, + "step": 26875 + }, + { + "epoch": 0.34924136228301145, + "grad_norm": 0.39608028531074524, + "learning_rate": 0.00013017845306021654, + "loss": 1.3272, + "step": 26876 + }, + { + "epoch": 0.3492543568269273, + "grad_norm": 0.3926863372325897, + "learning_rate": 0.00013017585359830516, + "loss": 1.3866, + "step": 26877 + }, + { + "epoch": 0.3492673513708432, + "grad_norm": 0.32485154271125793, + "learning_rate": 0.00013017325413639376, + "loss": 1.0555, + "step": 26878 + }, + { + "epoch": 0.34928034591475904, + "grad_norm": 0.44558027386665344, + "learning_rate": 0.00013017065467448239, + "loss": 1.5047, + "step": 26879 + }, + { + "epoch": 0.34929334045867494, + "grad_norm": 0.45488807559013367, + "learning_rate": 0.000130168055212571, + "loss": 1.2739, + "step": 26880 + }, + { + "epoch": 0.3493063350025908, + "grad_norm": 0.40049830079078674, + "learning_rate": 0.0001301654557506596, + "loss": 1.3355, + "step": 26881 + }, + { + "epoch": 0.3493193295465067, + "grad_norm": 0.4068833589553833, + "learning_rate": 0.00013016285628874823, + "loss": 1.3403, + "step": 26882 + }, + { + "epoch": 0.34933232409042253, + "grad_norm": 0.29012948274612427, + "learning_rate": 0.00013016025682683686, + "loss": 1.4189, + "step": 26883 + }, + { + "epoch": 0.34934531863433843, + "grad_norm": 0.43037736415863037, + "learning_rate": 0.00013015765736492545, + "loss": 1.3888, + "step": 26884 + }, + { + "epoch": 0.3493583131782543, + "grad_norm": 0.4870368540287018, + "learning_rate": 0.00013015505790301408, + "loss": 1.6691, + "step": 26885 + }, + { + "epoch": 0.3493713077221702, + "grad_norm": 0.3602190613746643, + "learning_rate": 0.00013015245844110268, + "loss": 1.4264, + "step": 26886 + }, + { + "epoch": 0.349384302266086, + "grad_norm": 0.35174962878227234, + "learning_rate": 0.00013014985897919133, + "loss": 1.4528, + "step": 26887 + }, + { + "epoch": 0.3493972968100019, + "grad_norm": 0.37076127529144287, + "learning_rate": 0.00013014725951727993, + "loss": 1.4095, + "step": 26888 + }, + { + "epoch": 0.34941029135391777, + "grad_norm": 0.45997482538223267, + "learning_rate": 0.00013014466005536855, + "loss": 1.5328, + "step": 26889 + }, + { + "epoch": 0.34942328589783367, + "grad_norm": 0.34510675072669983, + "learning_rate": 0.00013014206059345717, + "loss": 1.4021, + "step": 26890 + }, + { + "epoch": 0.3494362804417495, + "grad_norm": 0.3288014829158783, + "learning_rate": 0.00013013946113154577, + "loss": 1.2502, + "step": 26891 + }, + { + "epoch": 0.3494492749856654, + "grad_norm": 0.436376690864563, + "learning_rate": 0.0001301368616696344, + "loss": 1.3732, + "step": 26892 + }, + { + "epoch": 0.34946226952958126, + "grad_norm": 0.4917808771133423, + "learning_rate": 0.000130134262207723, + "loss": 1.411, + "step": 26893 + }, + { + "epoch": 0.34947526407349716, + "grad_norm": 0.33655521273612976, + "learning_rate": 0.00013013166274581165, + "loss": 1.3825, + "step": 26894 + }, + { + "epoch": 0.349488258617413, + "grad_norm": 0.40204986929893494, + "learning_rate": 0.00013012906328390024, + "loss": 1.3366, + "step": 26895 + }, + { + "epoch": 0.3495012531613289, + "grad_norm": 0.3732942044734955, + "learning_rate": 0.00013012646382198884, + "loss": 1.4675, + "step": 26896 + }, + { + "epoch": 0.34951424770524475, + "grad_norm": 0.3512749671936035, + "learning_rate": 0.00013012386436007746, + "loss": 1.3461, + "step": 26897 + }, + { + "epoch": 0.34952724224916065, + "grad_norm": 0.5294820666313171, + "learning_rate": 0.0001301212648981661, + "loss": 1.48, + "step": 26898 + }, + { + "epoch": 0.3495402367930765, + "grad_norm": 0.4390273094177246, + "learning_rate": 0.0001301186654362547, + "loss": 1.4389, + "step": 26899 + }, + { + "epoch": 0.3495532313369924, + "grad_norm": 0.44069093465805054, + "learning_rate": 0.0001301160659743433, + "loss": 1.4742, + "step": 26900 + }, + { + "epoch": 0.34956622588090824, + "grad_norm": 0.3821607232093811, + "learning_rate": 0.00013011346651243194, + "loss": 1.4528, + "step": 26901 + }, + { + "epoch": 0.34957922042482414, + "grad_norm": 0.4463363587856293, + "learning_rate": 0.00013011086705052056, + "loss": 1.3085, + "step": 26902 + }, + { + "epoch": 0.34959221496874, + "grad_norm": 0.4405565857887268, + "learning_rate": 0.00013010826758860916, + "loss": 1.4502, + "step": 26903 + }, + { + "epoch": 0.3496052095126559, + "grad_norm": 0.457396924495697, + "learning_rate": 0.00013010566812669778, + "loss": 1.3796, + "step": 26904 + }, + { + "epoch": 0.34961820405657174, + "grad_norm": 0.4090796709060669, + "learning_rate": 0.00013010306866478638, + "loss": 1.2769, + "step": 26905 + }, + { + "epoch": 0.34963119860048764, + "grad_norm": 0.5057259202003479, + "learning_rate": 0.00013010046920287503, + "loss": 1.3692, + "step": 26906 + }, + { + "epoch": 0.3496441931444035, + "grad_norm": 0.3464180529117584, + "learning_rate": 0.00013009786974096363, + "loss": 1.4632, + "step": 26907 + }, + { + "epoch": 0.3496571876883194, + "grad_norm": 0.3629237115383148, + "learning_rate": 0.00013009527027905225, + "loss": 1.3575, + "step": 26908 + }, + { + "epoch": 0.3496701822322352, + "grad_norm": 0.3495189845561981, + "learning_rate": 0.00013009267081714085, + "loss": 1.64, + "step": 26909 + }, + { + "epoch": 0.34968317677615113, + "grad_norm": 0.513372540473938, + "learning_rate": 0.00013009007135522947, + "loss": 1.3079, + "step": 26910 + }, + { + "epoch": 0.349696171320067, + "grad_norm": 0.37216681241989136, + "learning_rate": 0.0001300874718933181, + "loss": 1.3351, + "step": 26911 + }, + { + "epoch": 0.3497091658639829, + "grad_norm": 0.38222554326057434, + "learning_rate": 0.0001300848724314067, + "loss": 1.3744, + "step": 26912 + }, + { + "epoch": 0.3497221604078987, + "grad_norm": 0.4410099983215332, + "learning_rate": 0.00013008227296949532, + "loss": 1.53, + "step": 26913 + }, + { + "epoch": 0.3497351549518146, + "grad_norm": 0.41673049330711365, + "learning_rate": 0.00013007967350758394, + "loss": 1.3899, + "step": 26914 + }, + { + "epoch": 0.34974814949573046, + "grad_norm": 0.3506511449813843, + "learning_rate": 0.00013007707404567254, + "loss": 1.5135, + "step": 26915 + }, + { + "epoch": 0.34976114403964637, + "grad_norm": 0.4729503095149994, + "learning_rate": 0.00013007447458376117, + "loss": 1.4096, + "step": 26916 + }, + { + "epoch": 0.3497741385835622, + "grad_norm": 0.36889171600341797, + "learning_rate": 0.00013007187512184976, + "loss": 1.5223, + "step": 26917 + }, + { + "epoch": 0.3497871331274781, + "grad_norm": 0.35369524359703064, + "learning_rate": 0.00013006927565993842, + "loss": 1.4416, + "step": 26918 + }, + { + "epoch": 0.34980012767139396, + "grad_norm": 0.42414289712905884, + "learning_rate": 0.000130066676198027, + "loss": 1.4806, + "step": 26919 + }, + { + "epoch": 0.34981312221530986, + "grad_norm": 0.35942816734313965, + "learning_rate": 0.00013006407673611564, + "loss": 1.202, + "step": 26920 + }, + { + "epoch": 0.3498261167592257, + "grad_norm": 0.3879469931125641, + "learning_rate": 0.00013006147727420424, + "loss": 1.4859, + "step": 26921 + }, + { + "epoch": 0.3498391113031416, + "grad_norm": 0.43745309114456177, + "learning_rate": 0.00013005887781229286, + "loss": 1.5778, + "step": 26922 + }, + { + "epoch": 0.34985210584705745, + "grad_norm": 0.4024139940738678, + "learning_rate": 0.00013005627835038148, + "loss": 1.3276, + "step": 26923 + }, + { + "epoch": 0.34986510039097335, + "grad_norm": 0.5062936544418335, + "learning_rate": 0.00013005367888847008, + "loss": 1.4731, + "step": 26924 + }, + { + "epoch": 0.3498780949348892, + "grad_norm": 0.5520234107971191, + "learning_rate": 0.00013005107942655873, + "loss": 1.4388, + "step": 26925 + }, + { + "epoch": 0.3498910894788051, + "grad_norm": 0.3350699543952942, + "learning_rate": 0.00013004847996464733, + "loss": 1.5799, + "step": 26926 + }, + { + "epoch": 0.34990408402272094, + "grad_norm": 0.42720645666122437, + "learning_rate": 0.00013004588050273593, + "loss": 1.4483, + "step": 26927 + }, + { + "epoch": 0.34991707856663684, + "grad_norm": 0.4357052445411682, + "learning_rate": 0.00013004328104082455, + "loss": 1.6186, + "step": 26928 + }, + { + "epoch": 0.3499300731105527, + "grad_norm": 0.39714014530181885, + "learning_rate": 0.00013004068157891318, + "loss": 1.5038, + "step": 26929 + }, + { + "epoch": 0.3499430676544686, + "grad_norm": 0.3437274396419525, + "learning_rate": 0.0001300380821170018, + "loss": 1.2741, + "step": 26930 + }, + { + "epoch": 0.34995606219838443, + "grad_norm": 0.4881211817264557, + "learning_rate": 0.0001300354826550904, + "loss": 1.3654, + "step": 26931 + }, + { + "epoch": 0.34996905674230033, + "grad_norm": 0.4385480284690857, + "learning_rate": 0.00013003288319317902, + "loss": 1.602, + "step": 26932 + }, + { + "epoch": 0.3499820512862162, + "grad_norm": 0.3212842047214508, + "learning_rate": 0.00013003028373126765, + "loss": 1.2611, + "step": 26933 + }, + { + "epoch": 0.3499950458301321, + "grad_norm": 0.3364401161670685, + "learning_rate": 0.00013002768426935624, + "loss": 1.3029, + "step": 26934 + }, + { + "epoch": 0.3500080403740479, + "grad_norm": 0.33741095662117004, + "learning_rate": 0.00013002508480744487, + "loss": 1.39, + "step": 26935 + }, + { + "epoch": 0.3500210349179638, + "grad_norm": 0.43944478034973145, + "learning_rate": 0.00013002248534553347, + "loss": 1.3808, + "step": 26936 + }, + { + "epoch": 0.35003402946187967, + "grad_norm": 0.4914529025554657, + "learning_rate": 0.00013001988588362212, + "loss": 1.4968, + "step": 26937 + }, + { + "epoch": 0.35004702400579557, + "grad_norm": 0.38273531198501587, + "learning_rate": 0.00013001728642171072, + "loss": 1.3837, + "step": 26938 + }, + { + "epoch": 0.3500600185497114, + "grad_norm": 0.35967475175857544, + "learning_rate": 0.0001300146869597993, + "loss": 1.3749, + "step": 26939 + }, + { + "epoch": 0.3500730130936273, + "grad_norm": 0.4515627324581146, + "learning_rate": 0.00013001208749788794, + "loss": 1.4449, + "step": 26940 + }, + { + "epoch": 0.35008600763754316, + "grad_norm": 0.35798168182373047, + "learning_rate": 0.00013000948803597656, + "loss": 1.2316, + "step": 26941 + }, + { + "epoch": 0.35009900218145906, + "grad_norm": 0.45054376125335693, + "learning_rate": 0.0001300068885740652, + "loss": 1.5569, + "step": 26942 + }, + { + "epoch": 0.3501119967253749, + "grad_norm": 0.413839727640152, + "learning_rate": 0.00013000428911215378, + "loss": 1.3274, + "step": 26943 + }, + { + "epoch": 0.3501249912692908, + "grad_norm": 0.3641895055770874, + "learning_rate": 0.0001300016896502424, + "loss": 1.4187, + "step": 26944 + }, + { + "epoch": 0.35013798581320665, + "grad_norm": 0.4793921709060669, + "learning_rate": 0.00012999909018833103, + "loss": 1.4823, + "step": 26945 + }, + { + "epoch": 0.35015098035712255, + "grad_norm": 0.4696202874183655, + "learning_rate": 0.00012999649072641963, + "loss": 1.3019, + "step": 26946 + }, + { + "epoch": 0.3501639749010384, + "grad_norm": 0.38293203711509705, + "learning_rate": 0.00012999389126450825, + "loss": 1.3395, + "step": 26947 + }, + { + "epoch": 0.3501769694449543, + "grad_norm": 0.384132444858551, + "learning_rate": 0.00012999129180259685, + "loss": 1.3047, + "step": 26948 + }, + { + "epoch": 0.35018996398887015, + "grad_norm": 0.3273977041244507, + "learning_rate": 0.0001299886923406855, + "loss": 1.357, + "step": 26949 + }, + { + "epoch": 0.35020295853278605, + "grad_norm": 0.4120301306247711, + "learning_rate": 0.0001299860928787741, + "loss": 1.5079, + "step": 26950 + }, + { + "epoch": 0.35021595307670195, + "grad_norm": 0.3835808038711548, + "learning_rate": 0.0001299834934168627, + "loss": 1.4467, + "step": 26951 + }, + { + "epoch": 0.3502289476206178, + "grad_norm": 0.38245677947998047, + "learning_rate": 0.00012998089395495132, + "loss": 1.5248, + "step": 26952 + }, + { + "epoch": 0.3502419421645337, + "grad_norm": 0.3783007860183716, + "learning_rate": 0.00012997829449303995, + "loss": 1.5285, + "step": 26953 + }, + { + "epoch": 0.35025493670844954, + "grad_norm": 0.39009571075439453, + "learning_rate": 0.00012997569503112857, + "loss": 1.309, + "step": 26954 + }, + { + "epoch": 0.35026793125236544, + "grad_norm": 0.3074268698692322, + "learning_rate": 0.00012997309556921717, + "loss": 1.2015, + "step": 26955 + }, + { + "epoch": 0.3502809257962813, + "grad_norm": 0.3684777319431305, + "learning_rate": 0.0001299704961073058, + "loss": 1.3467, + "step": 26956 + }, + { + "epoch": 0.3502939203401972, + "grad_norm": 0.377359002828598, + "learning_rate": 0.00012996789664539442, + "loss": 1.2908, + "step": 26957 + }, + { + "epoch": 0.35030691488411303, + "grad_norm": 0.4214313328266144, + "learning_rate": 0.00012996529718348302, + "loss": 1.3175, + "step": 26958 + }, + { + "epoch": 0.35031990942802893, + "grad_norm": 0.37894168496131897, + "learning_rate": 0.00012996269772157164, + "loss": 1.4382, + "step": 26959 + }, + { + "epoch": 0.3503329039719448, + "grad_norm": 0.40779784321784973, + "learning_rate": 0.00012996009825966024, + "loss": 1.3777, + "step": 26960 + }, + { + "epoch": 0.3503458985158607, + "grad_norm": 0.3485451340675354, + "learning_rate": 0.0001299574987977489, + "loss": 1.2494, + "step": 26961 + }, + { + "epoch": 0.3503588930597765, + "grad_norm": 0.38289153575897217, + "learning_rate": 0.0001299548993358375, + "loss": 1.4358, + "step": 26962 + }, + { + "epoch": 0.3503718876036924, + "grad_norm": 0.3391575217247009, + "learning_rate": 0.0001299522998739261, + "loss": 1.3516, + "step": 26963 + }, + { + "epoch": 0.35038488214760827, + "grad_norm": 0.3853176534175873, + "learning_rate": 0.00012994970041201474, + "loss": 1.1945, + "step": 26964 + }, + { + "epoch": 0.35039787669152417, + "grad_norm": 0.4643430709838867, + "learning_rate": 0.00012994710095010333, + "loss": 1.487, + "step": 26965 + }, + { + "epoch": 0.35041087123544, + "grad_norm": 0.41897791624069214, + "learning_rate": 0.00012994450148819196, + "loss": 1.4653, + "step": 26966 + }, + { + "epoch": 0.3504238657793559, + "grad_norm": 0.40874582529067993, + "learning_rate": 0.00012994190202628055, + "loss": 1.5163, + "step": 26967 + }, + { + "epoch": 0.35043686032327176, + "grad_norm": 0.5114034414291382, + "learning_rate": 0.00012993930256436918, + "loss": 1.433, + "step": 26968 + }, + { + "epoch": 0.35044985486718766, + "grad_norm": 0.40875813364982605, + "learning_rate": 0.0001299367031024578, + "loss": 1.3619, + "step": 26969 + }, + { + "epoch": 0.3504628494111035, + "grad_norm": 0.35730940103530884, + "learning_rate": 0.0001299341036405464, + "loss": 1.4359, + "step": 26970 + }, + { + "epoch": 0.3504758439550194, + "grad_norm": 0.4098184108734131, + "learning_rate": 0.00012993150417863503, + "loss": 1.3103, + "step": 26971 + }, + { + "epoch": 0.35048883849893525, + "grad_norm": 0.45835843682289124, + "learning_rate": 0.00012992890471672365, + "loss": 1.4403, + "step": 26972 + }, + { + "epoch": 0.35050183304285115, + "grad_norm": 0.4666747748851776, + "learning_rate": 0.00012992630525481227, + "loss": 1.52, + "step": 26973 + }, + { + "epoch": 0.350514827586767, + "grad_norm": 0.3246106803417206, + "learning_rate": 0.00012992370579290087, + "loss": 1.3549, + "step": 26974 + }, + { + "epoch": 0.3505278221306829, + "grad_norm": 0.3853197395801544, + "learning_rate": 0.0001299211063309895, + "loss": 1.3704, + "step": 26975 + }, + { + "epoch": 0.35054081667459874, + "grad_norm": 0.4811348021030426, + "learning_rate": 0.00012991850686907812, + "loss": 1.5558, + "step": 26976 + }, + { + "epoch": 0.35055381121851464, + "grad_norm": 0.39772531390190125, + "learning_rate": 0.00012991590740716672, + "loss": 1.1971, + "step": 26977 + }, + { + "epoch": 0.3505668057624305, + "grad_norm": 0.3588211238384247, + "learning_rate": 0.00012991330794525534, + "loss": 1.246, + "step": 26978 + }, + { + "epoch": 0.3505798003063464, + "grad_norm": 0.39411279559135437, + "learning_rate": 0.00012991070848334394, + "loss": 1.2463, + "step": 26979 + }, + { + "epoch": 0.35059279485026223, + "grad_norm": 0.34600964188575745, + "learning_rate": 0.00012990810902143256, + "loss": 1.336, + "step": 26980 + }, + { + "epoch": 0.35060578939417814, + "grad_norm": 0.30417564511299133, + "learning_rate": 0.0001299055095595212, + "loss": 1.113, + "step": 26981 + }, + { + "epoch": 0.350618783938094, + "grad_norm": 0.49325844645500183, + "learning_rate": 0.0001299029100976098, + "loss": 1.4323, + "step": 26982 + }, + { + "epoch": 0.3506317784820099, + "grad_norm": 0.40701404213905334, + "learning_rate": 0.0001299003106356984, + "loss": 1.4937, + "step": 26983 + }, + { + "epoch": 0.3506447730259257, + "grad_norm": 0.44545090198516846, + "learning_rate": 0.00012989771117378704, + "loss": 1.2507, + "step": 26984 + }, + { + "epoch": 0.3506577675698416, + "grad_norm": 0.32665714621543884, + "learning_rate": 0.00012989511171187566, + "loss": 1.3411, + "step": 26985 + }, + { + "epoch": 0.3506707621137575, + "grad_norm": 0.36822062730789185, + "learning_rate": 0.00012989251224996426, + "loss": 1.2569, + "step": 26986 + }, + { + "epoch": 0.3506837566576734, + "grad_norm": 0.416346937417984, + "learning_rate": 0.00012988991278805288, + "loss": 1.4851, + "step": 26987 + }, + { + "epoch": 0.3506967512015892, + "grad_norm": 0.4807894229888916, + "learning_rate": 0.0001298873133261415, + "loss": 1.3956, + "step": 26988 + }, + { + "epoch": 0.3507097457455051, + "grad_norm": 0.46177607774734497, + "learning_rate": 0.0001298847138642301, + "loss": 1.2707, + "step": 26989 + }, + { + "epoch": 0.35072274028942096, + "grad_norm": 0.39404296875, + "learning_rate": 0.00012988211440231873, + "loss": 1.5894, + "step": 26990 + }, + { + "epoch": 0.35073573483333687, + "grad_norm": 0.38684603571891785, + "learning_rate": 0.00012987951494040733, + "loss": 1.2964, + "step": 26991 + }, + { + "epoch": 0.3507487293772527, + "grad_norm": 0.4723605513572693, + "learning_rate": 0.00012987691547849598, + "loss": 1.479, + "step": 26992 + }, + { + "epoch": 0.3507617239211686, + "grad_norm": 0.37446537613868713, + "learning_rate": 0.00012987431601658457, + "loss": 1.5069, + "step": 26993 + }, + { + "epoch": 0.35077471846508446, + "grad_norm": 0.4684825837612152, + "learning_rate": 0.00012987171655467317, + "loss": 1.3526, + "step": 26994 + }, + { + "epoch": 0.35078771300900036, + "grad_norm": 0.4224631190299988, + "learning_rate": 0.0001298691170927618, + "loss": 1.3635, + "step": 26995 + }, + { + "epoch": 0.3508007075529162, + "grad_norm": 0.40388378500938416, + "learning_rate": 0.00012986651763085042, + "loss": 1.4961, + "step": 26996 + }, + { + "epoch": 0.3508137020968321, + "grad_norm": 0.3414419889450073, + "learning_rate": 0.00012986391816893905, + "loss": 1.2616, + "step": 26997 + }, + { + "epoch": 0.35082669664074795, + "grad_norm": 0.33642011880874634, + "learning_rate": 0.00012986131870702764, + "loss": 1.4362, + "step": 26998 + }, + { + "epoch": 0.35083969118466385, + "grad_norm": 0.2976331114768982, + "learning_rate": 0.00012985871924511627, + "loss": 1.2113, + "step": 26999 + }, + { + "epoch": 0.3508526857285797, + "grad_norm": 0.33675840497016907, + "learning_rate": 0.0001298561197832049, + "loss": 1.218, + "step": 27000 + }, + { + "epoch": 0.3508656802724956, + "grad_norm": 0.37695372104644775, + "learning_rate": 0.0001298535203212935, + "loss": 1.4911, + "step": 27001 + }, + { + "epoch": 0.35087867481641144, + "grad_norm": 0.43729162216186523, + "learning_rate": 0.0001298509208593821, + "loss": 1.5009, + "step": 27002 + }, + { + "epoch": 0.35089166936032734, + "grad_norm": 0.3892037272453308, + "learning_rate": 0.00012984832139747074, + "loss": 1.2716, + "step": 27003 + }, + { + "epoch": 0.3509046639042432, + "grad_norm": 0.32991844415664673, + "learning_rate": 0.00012984572193555936, + "loss": 1.1744, + "step": 27004 + }, + { + "epoch": 0.3509176584481591, + "grad_norm": 0.3075510561466217, + "learning_rate": 0.00012984312247364796, + "loss": 1.3327, + "step": 27005 + }, + { + "epoch": 0.35093065299207493, + "grad_norm": 0.4822843670845032, + "learning_rate": 0.00012984052301173656, + "loss": 1.411, + "step": 27006 + }, + { + "epoch": 0.35094364753599083, + "grad_norm": 0.4144551753997803, + "learning_rate": 0.0001298379235498252, + "loss": 1.2215, + "step": 27007 + }, + { + "epoch": 0.3509566420799067, + "grad_norm": 0.3838679790496826, + "learning_rate": 0.0001298353240879138, + "loss": 1.4537, + "step": 27008 + }, + { + "epoch": 0.3509696366238226, + "grad_norm": 0.3840310275554657, + "learning_rate": 0.00012983272462600243, + "loss": 1.4244, + "step": 27009 + }, + { + "epoch": 0.3509826311677384, + "grad_norm": 0.33375999331474304, + "learning_rate": 0.00012983012516409103, + "loss": 1.1103, + "step": 27010 + }, + { + "epoch": 0.3509956257116543, + "grad_norm": 0.4148995280265808, + "learning_rate": 0.00012982752570217965, + "loss": 1.5907, + "step": 27011 + }, + { + "epoch": 0.35100862025557017, + "grad_norm": 0.4446263611316681, + "learning_rate": 0.00012982492624026828, + "loss": 1.3562, + "step": 27012 + }, + { + "epoch": 0.35102161479948607, + "grad_norm": 0.4836627244949341, + "learning_rate": 0.00012982232677835687, + "loss": 1.475, + "step": 27013 + }, + { + "epoch": 0.3510346093434019, + "grad_norm": 0.37680524587631226, + "learning_rate": 0.0001298197273164455, + "loss": 1.3046, + "step": 27014 + }, + { + "epoch": 0.3510476038873178, + "grad_norm": 0.29187628626823425, + "learning_rate": 0.00012981712785453412, + "loss": 1.1886, + "step": 27015 + }, + { + "epoch": 0.35106059843123366, + "grad_norm": 0.3478575646877289, + "learning_rate": 0.00012981452839262275, + "loss": 1.4898, + "step": 27016 + }, + { + "epoch": 0.35107359297514956, + "grad_norm": 0.4063595235347748, + "learning_rate": 0.00012981192893071135, + "loss": 1.4405, + "step": 27017 + }, + { + "epoch": 0.3510865875190654, + "grad_norm": 0.3334345817565918, + "learning_rate": 0.00012980932946879994, + "loss": 1.2063, + "step": 27018 + }, + { + "epoch": 0.3510995820629813, + "grad_norm": 0.3710629940032959, + "learning_rate": 0.0001298067300068886, + "loss": 1.1747, + "step": 27019 + }, + { + "epoch": 0.35111257660689715, + "grad_norm": 0.4263536036014557, + "learning_rate": 0.0001298041305449772, + "loss": 1.5293, + "step": 27020 + }, + { + "epoch": 0.35112557115081305, + "grad_norm": 0.4655008614063263, + "learning_rate": 0.00012980153108306582, + "loss": 1.4345, + "step": 27021 + }, + { + "epoch": 0.3511385656947289, + "grad_norm": 0.3801930844783783, + "learning_rate": 0.0001297989316211544, + "loss": 1.3823, + "step": 27022 + }, + { + "epoch": 0.3511515602386448, + "grad_norm": 0.3835110366344452, + "learning_rate": 0.00012979633215924304, + "loss": 1.7593, + "step": 27023 + }, + { + "epoch": 0.35116455478256065, + "grad_norm": 0.2913677990436554, + "learning_rate": 0.00012979373269733166, + "loss": 1.3644, + "step": 27024 + }, + { + "epoch": 0.35117754932647655, + "grad_norm": 0.5896442532539368, + "learning_rate": 0.00012979113323542026, + "loss": 1.5097, + "step": 27025 + }, + { + "epoch": 0.35119054387039245, + "grad_norm": 0.44798359274864197, + "learning_rate": 0.00012978853377350888, + "loss": 1.5235, + "step": 27026 + }, + { + "epoch": 0.3512035384143083, + "grad_norm": 0.2839646637439728, + "learning_rate": 0.0001297859343115975, + "loss": 1.214, + "step": 27027 + }, + { + "epoch": 0.3512165329582242, + "grad_norm": 0.2983718812465668, + "learning_rate": 0.00012978333484968613, + "loss": 1.3195, + "step": 27028 + }, + { + "epoch": 0.35122952750214004, + "grad_norm": 0.44161394238471985, + "learning_rate": 0.00012978073538777473, + "loss": 1.4013, + "step": 27029 + }, + { + "epoch": 0.35124252204605594, + "grad_norm": 0.43229126930236816, + "learning_rate": 0.00012977813592586336, + "loss": 1.3821, + "step": 27030 + }, + { + "epoch": 0.3512555165899718, + "grad_norm": 0.4676183760166168, + "learning_rate": 0.00012977553646395198, + "loss": 1.2401, + "step": 27031 + }, + { + "epoch": 0.3512685111338877, + "grad_norm": 0.42624327540397644, + "learning_rate": 0.00012977293700204058, + "loss": 1.4408, + "step": 27032 + }, + { + "epoch": 0.35128150567780353, + "grad_norm": 0.47141867876052856, + "learning_rate": 0.0001297703375401292, + "loss": 1.4197, + "step": 27033 + }, + { + "epoch": 0.35129450022171943, + "grad_norm": 0.47578155994415283, + "learning_rate": 0.0001297677380782178, + "loss": 1.4183, + "step": 27034 + }, + { + "epoch": 0.3513074947656353, + "grad_norm": 0.3964194357395172, + "learning_rate": 0.00012976513861630642, + "loss": 1.641, + "step": 27035 + }, + { + "epoch": 0.3513204893095512, + "grad_norm": 0.47579044103622437, + "learning_rate": 0.00012976253915439505, + "loss": 1.4875, + "step": 27036 + }, + { + "epoch": 0.351333483853467, + "grad_norm": 0.4011628329753876, + "learning_rate": 0.00012975993969248365, + "loss": 1.298, + "step": 27037 + }, + { + "epoch": 0.3513464783973829, + "grad_norm": 0.3676300346851349, + "learning_rate": 0.0001297573402305723, + "loss": 1.4357, + "step": 27038 + }, + { + "epoch": 0.35135947294129877, + "grad_norm": 0.3938177525997162, + "learning_rate": 0.0001297547407686609, + "loss": 1.2096, + "step": 27039 + }, + { + "epoch": 0.35137246748521467, + "grad_norm": 0.3831042945384979, + "learning_rate": 0.00012975214130674952, + "loss": 1.4742, + "step": 27040 + }, + { + "epoch": 0.3513854620291305, + "grad_norm": 0.3779260814189911, + "learning_rate": 0.00012974954184483812, + "loss": 1.4277, + "step": 27041 + }, + { + "epoch": 0.3513984565730464, + "grad_norm": 0.3410851061344147, + "learning_rate": 0.00012974694238292674, + "loss": 1.1344, + "step": 27042 + }, + { + "epoch": 0.35141145111696226, + "grad_norm": 0.509138286113739, + "learning_rate": 0.00012974434292101537, + "loss": 1.3833, + "step": 27043 + }, + { + "epoch": 0.35142444566087816, + "grad_norm": 0.43648701906204224, + "learning_rate": 0.00012974174345910396, + "loss": 1.3019, + "step": 27044 + }, + { + "epoch": 0.351437440204794, + "grad_norm": 0.469257652759552, + "learning_rate": 0.0001297391439971926, + "loss": 1.3618, + "step": 27045 + }, + { + "epoch": 0.3514504347487099, + "grad_norm": 0.5228627920150757, + "learning_rate": 0.0001297365445352812, + "loss": 1.6148, + "step": 27046 + }, + { + "epoch": 0.35146342929262575, + "grad_norm": 0.4003678858280182, + "learning_rate": 0.00012973394507336984, + "loss": 1.3875, + "step": 27047 + }, + { + "epoch": 0.35147642383654165, + "grad_norm": 0.45567476749420166, + "learning_rate": 0.00012973134561145843, + "loss": 1.3138, + "step": 27048 + }, + { + "epoch": 0.3514894183804575, + "grad_norm": 0.48438018560409546, + "learning_rate": 0.00012972874614954703, + "loss": 1.4265, + "step": 27049 + }, + { + "epoch": 0.3515024129243734, + "grad_norm": 0.400463730096817, + "learning_rate": 0.00012972614668763568, + "loss": 1.4113, + "step": 27050 + }, + { + "epoch": 0.35151540746828924, + "grad_norm": 0.36620020866394043, + "learning_rate": 0.00012972354722572428, + "loss": 1.2006, + "step": 27051 + }, + { + "epoch": 0.35152840201220514, + "grad_norm": 0.42971378564834595, + "learning_rate": 0.0001297209477638129, + "loss": 1.4268, + "step": 27052 + }, + { + "epoch": 0.351541396556121, + "grad_norm": 0.4140094220638275, + "learning_rate": 0.0001297183483019015, + "loss": 1.471, + "step": 27053 + }, + { + "epoch": 0.3515543911000369, + "grad_norm": 0.34700700640678406, + "learning_rate": 0.00012971574883999013, + "loss": 1.4169, + "step": 27054 + }, + { + "epoch": 0.35156738564395273, + "grad_norm": 0.3546464145183563, + "learning_rate": 0.00012971314937807875, + "loss": 1.281, + "step": 27055 + }, + { + "epoch": 0.35158038018786864, + "grad_norm": 0.3365534842014313, + "learning_rate": 0.00012971054991616735, + "loss": 1.3327, + "step": 27056 + }, + { + "epoch": 0.3515933747317845, + "grad_norm": 0.4163077473640442, + "learning_rate": 0.00012970795045425597, + "loss": 1.542, + "step": 27057 + }, + { + "epoch": 0.3516063692757004, + "grad_norm": 0.4851019084453583, + "learning_rate": 0.0001297053509923446, + "loss": 1.19, + "step": 27058 + }, + { + "epoch": 0.3516193638196162, + "grad_norm": 0.35443252325057983, + "learning_rate": 0.00012970275153043322, + "loss": 1.4416, + "step": 27059 + }, + { + "epoch": 0.3516323583635321, + "grad_norm": 0.4046309292316437, + "learning_rate": 0.00012970015206852182, + "loss": 1.3784, + "step": 27060 + }, + { + "epoch": 0.351645352907448, + "grad_norm": 0.4896560311317444, + "learning_rate": 0.00012969755260661042, + "loss": 1.3577, + "step": 27061 + }, + { + "epoch": 0.3516583474513639, + "grad_norm": 0.3035734295845032, + "learning_rate": 0.00012969495314469907, + "loss": 1.4203, + "step": 27062 + }, + { + "epoch": 0.3516713419952797, + "grad_norm": 0.3710680902004242, + "learning_rate": 0.00012969235368278766, + "loss": 1.633, + "step": 27063 + }, + { + "epoch": 0.3516843365391956, + "grad_norm": 0.41660815477371216, + "learning_rate": 0.0001296897542208763, + "loss": 1.3859, + "step": 27064 + }, + { + "epoch": 0.35169733108311146, + "grad_norm": 0.5244542956352234, + "learning_rate": 0.0001296871547589649, + "loss": 1.4916, + "step": 27065 + }, + { + "epoch": 0.35171032562702736, + "grad_norm": 0.3543093204498291, + "learning_rate": 0.0001296845552970535, + "loss": 1.4181, + "step": 27066 + }, + { + "epoch": 0.3517233201709432, + "grad_norm": 0.3983149230480194, + "learning_rate": 0.00012968195583514214, + "loss": 1.3706, + "step": 27067 + }, + { + "epoch": 0.3517363147148591, + "grad_norm": 0.41664624214172363, + "learning_rate": 0.00012967935637323073, + "loss": 1.5179, + "step": 27068 + }, + { + "epoch": 0.35174930925877496, + "grad_norm": 0.3637050986289978, + "learning_rate": 0.00012967675691131936, + "loss": 1.4566, + "step": 27069 + }, + { + "epoch": 0.35176230380269086, + "grad_norm": 0.45833367109298706, + "learning_rate": 0.00012967415744940798, + "loss": 1.3251, + "step": 27070 + }, + { + "epoch": 0.3517752983466067, + "grad_norm": 0.35691601037979126, + "learning_rate": 0.0001296715579874966, + "loss": 1.3208, + "step": 27071 + }, + { + "epoch": 0.3517882928905226, + "grad_norm": 0.39897042512893677, + "learning_rate": 0.0001296689585255852, + "loss": 1.4172, + "step": 27072 + }, + { + "epoch": 0.35180128743443845, + "grad_norm": 0.4390867054462433, + "learning_rate": 0.00012966635906367383, + "loss": 1.4651, + "step": 27073 + }, + { + "epoch": 0.35181428197835435, + "grad_norm": 0.3452596366405487, + "learning_rate": 0.00012966375960176245, + "loss": 1.3759, + "step": 27074 + }, + { + "epoch": 0.3518272765222702, + "grad_norm": 0.37790384888648987, + "learning_rate": 0.00012966116013985105, + "loss": 1.2155, + "step": 27075 + }, + { + "epoch": 0.3518402710661861, + "grad_norm": 0.42543140053749084, + "learning_rate": 0.00012965856067793967, + "loss": 1.5491, + "step": 27076 + }, + { + "epoch": 0.35185326561010194, + "grad_norm": 0.46987539529800415, + "learning_rate": 0.0001296559612160283, + "loss": 1.4277, + "step": 27077 + }, + { + "epoch": 0.35186626015401784, + "grad_norm": 0.28291940689086914, + "learning_rate": 0.0001296533617541169, + "loss": 1.3385, + "step": 27078 + }, + { + "epoch": 0.3518792546979337, + "grad_norm": 0.42285341024398804, + "learning_rate": 0.00012965076229220552, + "loss": 1.3215, + "step": 27079 + }, + { + "epoch": 0.3518922492418496, + "grad_norm": 0.4595276117324829, + "learning_rate": 0.00012964816283029412, + "loss": 1.3864, + "step": 27080 + }, + { + "epoch": 0.35190524378576543, + "grad_norm": 0.3763505816459656, + "learning_rate": 0.00012964556336838277, + "loss": 1.4072, + "step": 27081 + }, + { + "epoch": 0.35191823832968133, + "grad_norm": 0.36641186475753784, + "learning_rate": 0.00012964296390647137, + "loss": 1.3056, + "step": 27082 + }, + { + "epoch": 0.3519312328735972, + "grad_norm": 0.49726569652557373, + "learning_rate": 0.00012964036444456, + "loss": 1.5565, + "step": 27083 + }, + { + "epoch": 0.3519442274175131, + "grad_norm": 0.4177541732788086, + "learning_rate": 0.0001296377649826486, + "loss": 1.2578, + "step": 27084 + }, + { + "epoch": 0.3519572219614289, + "grad_norm": 0.48122891783714294, + "learning_rate": 0.00012963516552073721, + "loss": 1.2529, + "step": 27085 + }, + { + "epoch": 0.3519702165053448, + "grad_norm": 0.3749167323112488, + "learning_rate": 0.00012963256605882584, + "loss": 1.2844, + "step": 27086 + }, + { + "epoch": 0.35198321104926067, + "grad_norm": 0.4928489923477173, + "learning_rate": 0.00012962996659691444, + "loss": 1.3423, + "step": 27087 + }, + { + "epoch": 0.35199620559317657, + "grad_norm": 0.3367745876312256, + "learning_rate": 0.00012962736713500306, + "loss": 1.4079, + "step": 27088 + }, + { + "epoch": 0.3520092001370924, + "grad_norm": 0.4701220691204071, + "learning_rate": 0.00012962476767309168, + "loss": 1.5198, + "step": 27089 + }, + { + "epoch": 0.3520221946810083, + "grad_norm": 0.2655181288719177, + "learning_rate": 0.00012962216821118028, + "loss": 1.3074, + "step": 27090 + }, + { + "epoch": 0.35203518922492416, + "grad_norm": 0.4165365993976593, + "learning_rate": 0.0001296195687492689, + "loss": 1.3321, + "step": 27091 + }, + { + "epoch": 0.35204818376884006, + "grad_norm": 0.45527905225753784, + "learning_rate": 0.0001296169692873575, + "loss": 1.3616, + "step": 27092 + }, + { + "epoch": 0.3520611783127559, + "grad_norm": 0.41198909282684326, + "learning_rate": 0.00012961436982544616, + "loss": 1.3157, + "step": 27093 + }, + { + "epoch": 0.3520741728566718, + "grad_norm": 0.3758623003959656, + "learning_rate": 0.00012961177036353475, + "loss": 1.1563, + "step": 27094 + }, + { + "epoch": 0.35208716740058765, + "grad_norm": 0.366791695356369, + "learning_rate": 0.00012960917090162338, + "loss": 1.3808, + "step": 27095 + }, + { + "epoch": 0.35210016194450355, + "grad_norm": 0.3308677673339844, + "learning_rate": 0.00012960657143971197, + "loss": 1.166, + "step": 27096 + }, + { + "epoch": 0.3521131564884194, + "grad_norm": 0.3171396255493164, + "learning_rate": 0.0001296039719778006, + "loss": 1.282, + "step": 27097 + }, + { + "epoch": 0.3521261510323353, + "grad_norm": 0.41879427433013916, + "learning_rate": 0.00012960137251588922, + "loss": 1.3349, + "step": 27098 + }, + { + "epoch": 0.35213914557625114, + "grad_norm": 0.3509661853313446, + "learning_rate": 0.00012959877305397782, + "loss": 1.5242, + "step": 27099 + }, + { + "epoch": 0.35215214012016705, + "grad_norm": 0.45570144057273865, + "learning_rate": 0.00012959617359206645, + "loss": 1.2994, + "step": 27100 + }, + { + "epoch": 0.3521651346640829, + "grad_norm": 0.2961921691894531, + "learning_rate": 0.00012959357413015507, + "loss": 1.145, + "step": 27101 + }, + { + "epoch": 0.3521781292079988, + "grad_norm": 0.4828557074069977, + "learning_rate": 0.00012959097466824367, + "loss": 1.4225, + "step": 27102 + }, + { + "epoch": 0.3521911237519147, + "grad_norm": 0.3583434820175171, + "learning_rate": 0.0001295883752063323, + "loss": 1.2372, + "step": 27103 + }, + { + "epoch": 0.35220411829583054, + "grad_norm": 0.41975805163383484, + "learning_rate": 0.0001295857757444209, + "loss": 1.3186, + "step": 27104 + }, + { + "epoch": 0.35221711283974644, + "grad_norm": 0.39681947231292725, + "learning_rate": 0.00012958317628250954, + "loss": 1.501, + "step": 27105 + }, + { + "epoch": 0.3522301073836623, + "grad_norm": 0.3591127097606659, + "learning_rate": 0.00012958057682059814, + "loss": 1.4966, + "step": 27106 + }, + { + "epoch": 0.3522431019275782, + "grad_norm": 0.446712464094162, + "learning_rate": 0.00012957797735868676, + "loss": 1.2911, + "step": 27107 + }, + { + "epoch": 0.35225609647149403, + "grad_norm": 0.40670046210289, + "learning_rate": 0.00012957537789677536, + "loss": 1.5025, + "step": 27108 + }, + { + "epoch": 0.35226909101540993, + "grad_norm": 0.3264428675174713, + "learning_rate": 0.00012957277843486398, + "loss": 1.2477, + "step": 27109 + }, + { + "epoch": 0.3522820855593258, + "grad_norm": 0.41159600019454956, + "learning_rate": 0.0001295701789729526, + "loss": 1.4344, + "step": 27110 + }, + { + "epoch": 0.3522950801032417, + "grad_norm": 0.3865565359592438, + "learning_rate": 0.0001295675795110412, + "loss": 1.3583, + "step": 27111 + }, + { + "epoch": 0.3523080746471575, + "grad_norm": 0.47535240650177, + "learning_rate": 0.00012956498004912986, + "loss": 1.5369, + "step": 27112 + }, + { + "epoch": 0.3523210691910734, + "grad_norm": 0.4841874837875366, + "learning_rate": 0.00012956238058721846, + "loss": 1.3149, + "step": 27113 + }, + { + "epoch": 0.35233406373498927, + "grad_norm": 0.36783015727996826, + "learning_rate": 0.00012955978112530708, + "loss": 1.38, + "step": 27114 + }, + { + "epoch": 0.35234705827890517, + "grad_norm": 0.3593432307243347, + "learning_rate": 0.00012955718166339568, + "loss": 1.3898, + "step": 27115 + }, + { + "epoch": 0.352360052822821, + "grad_norm": 0.41562482714653015, + "learning_rate": 0.0001295545822014843, + "loss": 1.2801, + "step": 27116 + }, + { + "epoch": 0.3523730473667369, + "grad_norm": 0.4107935428619385, + "learning_rate": 0.00012955198273957293, + "loss": 1.3241, + "step": 27117 + }, + { + "epoch": 0.35238604191065276, + "grad_norm": 0.4419505298137665, + "learning_rate": 0.00012954938327766152, + "loss": 1.4997, + "step": 27118 + }, + { + "epoch": 0.35239903645456866, + "grad_norm": 0.5432904362678528, + "learning_rate": 0.00012954678381575015, + "loss": 1.5411, + "step": 27119 + }, + { + "epoch": 0.3524120309984845, + "grad_norm": 0.39166587591171265, + "learning_rate": 0.00012954418435383877, + "loss": 1.4298, + "step": 27120 + }, + { + "epoch": 0.3524250255424004, + "grad_norm": 0.4792831242084503, + "learning_rate": 0.00012954158489192737, + "loss": 1.3857, + "step": 27121 + }, + { + "epoch": 0.35243802008631625, + "grad_norm": 0.3664616346359253, + "learning_rate": 0.000129538985430016, + "loss": 1.4441, + "step": 27122 + }, + { + "epoch": 0.35245101463023215, + "grad_norm": 0.3138395845890045, + "learning_rate": 0.0001295363859681046, + "loss": 1.2474, + "step": 27123 + }, + { + "epoch": 0.352464009174148, + "grad_norm": 0.46682190895080566, + "learning_rate": 0.00012953378650619324, + "loss": 1.3826, + "step": 27124 + }, + { + "epoch": 0.3524770037180639, + "grad_norm": 0.40159592032432556, + "learning_rate": 0.00012953118704428184, + "loss": 1.3264, + "step": 27125 + }, + { + "epoch": 0.35248999826197974, + "grad_norm": 0.36960569024086, + "learning_rate": 0.00012952858758237047, + "loss": 1.554, + "step": 27126 + }, + { + "epoch": 0.35250299280589564, + "grad_norm": 0.47910141944885254, + "learning_rate": 0.00012952598812045906, + "loss": 1.5541, + "step": 27127 + }, + { + "epoch": 0.3525159873498115, + "grad_norm": 0.3956829905509949, + "learning_rate": 0.0001295233886585477, + "loss": 1.3818, + "step": 27128 + }, + { + "epoch": 0.3525289818937274, + "grad_norm": 0.4648318290710449, + "learning_rate": 0.0001295207891966363, + "loss": 1.4809, + "step": 27129 + }, + { + "epoch": 0.35254197643764323, + "grad_norm": 0.37654101848602295, + "learning_rate": 0.0001295181897347249, + "loss": 1.4072, + "step": 27130 + }, + { + "epoch": 0.35255497098155913, + "grad_norm": 0.4834626019001007, + "learning_rate": 0.00012951559027281353, + "loss": 1.3761, + "step": 27131 + }, + { + "epoch": 0.352567965525475, + "grad_norm": 0.31139543652534485, + "learning_rate": 0.00012951299081090216, + "loss": 1.2309, + "step": 27132 + }, + { + "epoch": 0.3525809600693909, + "grad_norm": 0.3681308329105377, + "learning_rate": 0.00012951039134899076, + "loss": 1.5518, + "step": 27133 + }, + { + "epoch": 0.3525939546133067, + "grad_norm": 0.4737289845943451, + "learning_rate": 0.00012950779188707938, + "loss": 1.4512, + "step": 27134 + }, + { + "epoch": 0.3526069491572226, + "grad_norm": 0.4096894860267639, + "learning_rate": 0.00012950519242516798, + "loss": 1.3149, + "step": 27135 + }, + { + "epoch": 0.35261994370113847, + "grad_norm": 0.4310626685619354, + "learning_rate": 0.00012950259296325663, + "loss": 1.4938, + "step": 27136 + }, + { + "epoch": 0.3526329382450544, + "grad_norm": 0.4049728810787201, + "learning_rate": 0.00012949999350134523, + "loss": 1.3232, + "step": 27137 + }, + { + "epoch": 0.3526459327889702, + "grad_norm": 0.49418267607688904, + "learning_rate": 0.00012949739403943385, + "loss": 1.4927, + "step": 27138 + }, + { + "epoch": 0.3526589273328861, + "grad_norm": 0.37416353821754456, + "learning_rate": 0.00012949479457752245, + "loss": 1.2577, + "step": 27139 + }, + { + "epoch": 0.35267192187680196, + "grad_norm": 0.40460726618766785, + "learning_rate": 0.00012949219511561107, + "loss": 1.4623, + "step": 27140 + }, + { + "epoch": 0.35268491642071786, + "grad_norm": 0.4240929186344147, + "learning_rate": 0.0001294895956536997, + "loss": 1.3855, + "step": 27141 + }, + { + "epoch": 0.3526979109646337, + "grad_norm": 0.468641996383667, + "learning_rate": 0.0001294869961917883, + "loss": 1.4968, + "step": 27142 + }, + { + "epoch": 0.3527109055085496, + "grad_norm": 0.3761010766029358, + "learning_rate": 0.00012948439672987692, + "loss": 1.3565, + "step": 27143 + }, + { + "epoch": 0.35272390005246546, + "grad_norm": 0.36480721831321716, + "learning_rate": 0.00012948179726796554, + "loss": 1.6263, + "step": 27144 + }, + { + "epoch": 0.35273689459638136, + "grad_norm": 0.4512459337711334, + "learning_rate": 0.00012947919780605414, + "loss": 1.4567, + "step": 27145 + }, + { + "epoch": 0.3527498891402972, + "grad_norm": 0.4153778553009033, + "learning_rate": 0.00012947659834414277, + "loss": 1.4001, + "step": 27146 + }, + { + "epoch": 0.3527628836842131, + "grad_norm": 0.4784587323665619, + "learning_rate": 0.0001294739988822314, + "loss": 1.6973, + "step": 27147 + }, + { + "epoch": 0.35277587822812895, + "grad_norm": 0.46332332491874695, + "learning_rate": 0.00012947139942032001, + "loss": 1.3256, + "step": 27148 + }, + { + "epoch": 0.35278887277204485, + "grad_norm": 0.4082441031932831, + "learning_rate": 0.0001294687999584086, + "loss": 1.4366, + "step": 27149 + }, + { + "epoch": 0.3528018673159607, + "grad_norm": 0.32346153259277344, + "learning_rate": 0.00012946620049649724, + "loss": 1.4199, + "step": 27150 + }, + { + "epoch": 0.3528148618598766, + "grad_norm": 0.43141740560531616, + "learning_rate": 0.00012946360103458586, + "loss": 1.5345, + "step": 27151 + }, + { + "epoch": 0.35282785640379244, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00012946100157267446, + "loss": 1.2965, + "step": 27152 + }, + { + "epoch": 0.35284085094770834, + "grad_norm": 0.37873899936676025, + "learning_rate": 0.00012945840211076308, + "loss": 1.4427, + "step": 27153 + }, + { + "epoch": 0.3528538454916242, + "grad_norm": 0.2640284299850464, + "learning_rate": 0.00012945580264885168, + "loss": 1.4798, + "step": 27154 + }, + { + "epoch": 0.3528668400355401, + "grad_norm": 0.356067419052124, + "learning_rate": 0.00012945320318694033, + "loss": 1.4675, + "step": 27155 + }, + { + "epoch": 0.35287983457945593, + "grad_norm": 0.4387631118297577, + "learning_rate": 0.00012945060372502893, + "loss": 1.3971, + "step": 27156 + }, + { + "epoch": 0.35289282912337183, + "grad_norm": 0.3957112729549408, + "learning_rate": 0.00012944800426311753, + "loss": 1.4403, + "step": 27157 + }, + { + "epoch": 0.3529058236672877, + "grad_norm": 0.4088270962238312, + "learning_rate": 0.00012944540480120615, + "loss": 1.5837, + "step": 27158 + }, + { + "epoch": 0.3529188182112036, + "grad_norm": 0.4400434195995331, + "learning_rate": 0.00012944280533929478, + "loss": 1.5252, + "step": 27159 + }, + { + "epoch": 0.3529318127551194, + "grad_norm": 0.34892958402633667, + "learning_rate": 0.0001294402058773834, + "loss": 1.5356, + "step": 27160 + }, + { + "epoch": 0.3529448072990353, + "grad_norm": 0.4146108031272888, + "learning_rate": 0.000129437606415472, + "loss": 1.4409, + "step": 27161 + }, + { + "epoch": 0.35295780184295117, + "grad_norm": 0.33908921480178833, + "learning_rate": 0.00012943500695356062, + "loss": 1.3888, + "step": 27162 + }, + { + "epoch": 0.35297079638686707, + "grad_norm": 0.3601709306240082, + "learning_rate": 0.00012943240749164925, + "loss": 1.1899, + "step": 27163 + }, + { + "epoch": 0.3529837909307829, + "grad_norm": 0.36996033787727356, + "learning_rate": 0.00012942980802973784, + "loss": 1.2757, + "step": 27164 + }, + { + "epoch": 0.3529967854746988, + "grad_norm": 0.3836931884288788, + "learning_rate": 0.00012942720856782647, + "loss": 1.3537, + "step": 27165 + }, + { + "epoch": 0.35300978001861466, + "grad_norm": 0.3262748122215271, + "learning_rate": 0.00012942460910591507, + "loss": 1.3677, + "step": 27166 + }, + { + "epoch": 0.35302277456253056, + "grad_norm": 0.48026683926582336, + "learning_rate": 0.00012942200964400372, + "loss": 1.4142, + "step": 27167 + }, + { + "epoch": 0.3530357691064464, + "grad_norm": 0.34279999136924744, + "learning_rate": 0.00012941941018209231, + "loss": 1.3388, + "step": 27168 + }, + { + "epoch": 0.3530487636503623, + "grad_norm": 0.29648464918136597, + "learning_rate": 0.00012941681072018094, + "loss": 1.4247, + "step": 27169 + }, + { + "epoch": 0.35306175819427815, + "grad_norm": 0.4288727343082428, + "learning_rate": 0.00012941421125826954, + "loss": 1.3239, + "step": 27170 + }, + { + "epoch": 0.35307475273819405, + "grad_norm": 0.43024468421936035, + "learning_rate": 0.00012941161179635816, + "loss": 1.2727, + "step": 27171 + }, + { + "epoch": 0.3530877472821099, + "grad_norm": 0.38322314620018005, + "learning_rate": 0.00012940901233444679, + "loss": 1.4501, + "step": 27172 + }, + { + "epoch": 0.3531007418260258, + "grad_norm": 0.3667447865009308, + "learning_rate": 0.00012940641287253538, + "loss": 1.2903, + "step": 27173 + }, + { + "epoch": 0.35311373636994164, + "grad_norm": 0.17939794063568115, + "learning_rate": 0.000129403813410624, + "loss": 1.2339, + "step": 27174 + }, + { + "epoch": 0.35312673091385754, + "grad_norm": 0.4549000561237335, + "learning_rate": 0.00012940121394871263, + "loss": 1.349, + "step": 27175 + }, + { + "epoch": 0.3531397254577734, + "grad_norm": 0.5015128254890442, + "learning_rate": 0.00012939861448680123, + "loss": 1.564, + "step": 27176 + }, + { + "epoch": 0.3531527200016893, + "grad_norm": 0.41706499457359314, + "learning_rate": 0.00012939601502488985, + "loss": 1.3077, + "step": 27177 + }, + { + "epoch": 0.3531657145456052, + "grad_norm": 0.3436248004436493, + "learning_rate": 0.00012939341556297845, + "loss": 1.1769, + "step": 27178 + }, + { + "epoch": 0.35317870908952104, + "grad_norm": 0.3895522952079773, + "learning_rate": 0.0001293908161010671, + "loss": 1.5675, + "step": 27179 + }, + { + "epoch": 0.35319170363343694, + "grad_norm": 0.3534156084060669, + "learning_rate": 0.0001293882166391557, + "loss": 1.5711, + "step": 27180 + }, + { + "epoch": 0.3532046981773528, + "grad_norm": 0.3871992826461792, + "learning_rate": 0.00012938561717724432, + "loss": 1.3985, + "step": 27181 + }, + { + "epoch": 0.3532176927212687, + "grad_norm": 0.475013792514801, + "learning_rate": 0.00012938301771533292, + "loss": 1.6038, + "step": 27182 + }, + { + "epoch": 0.35323068726518453, + "grad_norm": 0.4193267822265625, + "learning_rate": 0.00012938041825342155, + "loss": 1.337, + "step": 27183 + }, + { + "epoch": 0.35324368180910043, + "grad_norm": 0.34763145446777344, + "learning_rate": 0.00012937781879151017, + "loss": 1.3848, + "step": 27184 + }, + { + "epoch": 0.3532566763530163, + "grad_norm": 0.3938160538673401, + "learning_rate": 0.00012937521932959877, + "loss": 1.4728, + "step": 27185 + }, + { + "epoch": 0.3532696708969322, + "grad_norm": 0.34862634539604187, + "learning_rate": 0.0001293726198676874, + "loss": 1.4367, + "step": 27186 + }, + { + "epoch": 0.353282665440848, + "grad_norm": 0.4235605299472809, + "learning_rate": 0.00012937002040577602, + "loss": 1.3693, + "step": 27187 + }, + { + "epoch": 0.3532956599847639, + "grad_norm": 0.4253822863101959, + "learning_rate": 0.00012936742094386461, + "loss": 1.386, + "step": 27188 + }, + { + "epoch": 0.35330865452867977, + "grad_norm": 0.4068506956100464, + "learning_rate": 0.00012936482148195324, + "loss": 1.287, + "step": 27189 + }, + { + "epoch": 0.35332164907259567, + "grad_norm": 0.38031384348869324, + "learning_rate": 0.00012936222202004186, + "loss": 1.2728, + "step": 27190 + }, + { + "epoch": 0.3533346436165115, + "grad_norm": 0.48991456627845764, + "learning_rate": 0.0001293596225581305, + "loss": 1.3372, + "step": 27191 + }, + { + "epoch": 0.3533476381604274, + "grad_norm": 0.45399340987205505, + "learning_rate": 0.00012935702309621909, + "loss": 1.3801, + "step": 27192 + }, + { + "epoch": 0.35336063270434326, + "grad_norm": 0.3930378258228302, + "learning_rate": 0.0001293544236343077, + "loss": 1.4351, + "step": 27193 + }, + { + "epoch": 0.35337362724825916, + "grad_norm": 0.3967180848121643, + "learning_rate": 0.00012935182417239633, + "loss": 1.4975, + "step": 27194 + }, + { + "epoch": 0.353386621792175, + "grad_norm": 0.5270243287086487, + "learning_rate": 0.00012934922471048493, + "loss": 1.4861, + "step": 27195 + }, + { + "epoch": 0.3533996163360909, + "grad_norm": 0.3890226185321808, + "learning_rate": 0.00012934662524857356, + "loss": 1.1952, + "step": 27196 + }, + { + "epoch": 0.35341261088000675, + "grad_norm": 0.4488624632358551, + "learning_rate": 0.00012934402578666215, + "loss": 1.1772, + "step": 27197 + }, + { + "epoch": 0.35342560542392265, + "grad_norm": 0.4957972764968872, + "learning_rate": 0.0001293414263247508, + "loss": 1.4716, + "step": 27198 + }, + { + "epoch": 0.3534385999678385, + "grad_norm": 0.3510872423648834, + "learning_rate": 0.0001293388268628394, + "loss": 1.2411, + "step": 27199 + }, + { + "epoch": 0.3534515945117544, + "grad_norm": 0.42558473348617554, + "learning_rate": 0.000129336227400928, + "loss": 1.4113, + "step": 27200 + }, + { + "epoch": 0.35346458905567024, + "grad_norm": 0.3287118673324585, + "learning_rate": 0.00012933362793901662, + "loss": 1.2199, + "step": 27201 + }, + { + "epoch": 0.35347758359958614, + "grad_norm": 0.4286823570728302, + "learning_rate": 0.00012933102847710525, + "loss": 1.614, + "step": 27202 + }, + { + "epoch": 0.353490578143502, + "grad_norm": 0.48619240522384644, + "learning_rate": 0.00012932842901519387, + "loss": 1.445, + "step": 27203 + }, + { + "epoch": 0.3535035726874179, + "grad_norm": 0.3716752827167511, + "learning_rate": 0.00012932582955328247, + "loss": 1.4941, + "step": 27204 + }, + { + "epoch": 0.35351656723133373, + "grad_norm": 0.35998740792274475, + "learning_rate": 0.0001293232300913711, + "loss": 1.3316, + "step": 27205 + }, + { + "epoch": 0.35352956177524963, + "grad_norm": 0.34886422753334045, + "learning_rate": 0.00012932063062945972, + "loss": 1.3542, + "step": 27206 + }, + { + "epoch": 0.3535425563191655, + "grad_norm": 0.42458242177963257, + "learning_rate": 0.00012931803116754832, + "loss": 1.4558, + "step": 27207 + }, + { + "epoch": 0.3535555508630814, + "grad_norm": 0.3253622055053711, + "learning_rate": 0.00012931543170563694, + "loss": 1.1567, + "step": 27208 + }, + { + "epoch": 0.3535685454069972, + "grad_norm": 0.4473830759525299, + "learning_rate": 0.00012931283224372554, + "loss": 1.3769, + "step": 27209 + }, + { + "epoch": 0.3535815399509131, + "grad_norm": 0.37994784116744995, + "learning_rate": 0.0001293102327818142, + "loss": 1.3377, + "step": 27210 + }, + { + "epoch": 0.35359453449482897, + "grad_norm": 0.29859626293182373, + "learning_rate": 0.0001293076333199028, + "loss": 1.2177, + "step": 27211 + }, + { + "epoch": 0.35360752903874487, + "grad_norm": 0.49773484468460083, + "learning_rate": 0.00012930503385799138, + "loss": 1.5254, + "step": 27212 + }, + { + "epoch": 0.3536205235826607, + "grad_norm": 0.4549420475959778, + "learning_rate": 0.00012930243439608, + "loss": 1.4952, + "step": 27213 + }, + { + "epoch": 0.3536335181265766, + "grad_norm": 0.3952312469482422, + "learning_rate": 0.00012929983493416863, + "loss": 1.3511, + "step": 27214 + }, + { + "epoch": 0.35364651267049246, + "grad_norm": 0.47965988516807556, + "learning_rate": 0.00012929723547225726, + "loss": 1.4842, + "step": 27215 + }, + { + "epoch": 0.35365950721440836, + "grad_norm": 0.37479913234710693, + "learning_rate": 0.00012929463601034586, + "loss": 1.3256, + "step": 27216 + }, + { + "epoch": 0.3536725017583242, + "grad_norm": 0.3680041432380676, + "learning_rate": 0.00012929203654843448, + "loss": 1.3475, + "step": 27217 + }, + { + "epoch": 0.3536854963022401, + "grad_norm": 0.38537779450416565, + "learning_rate": 0.0001292894370865231, + "loss": 1.4512, + "step": 27218 + }, + { + "epoch": 0.35369849084615596, + "grad_norm": 0.3340739905834198, + "learning_rate": 0.0001292868376246117, + "loss": 1.4271, + "step": 27219 + }, + { + "epoch": 0.35371148539007186, + "grad_norm": 0.3916126787662506, + "learning_rate": 0.00012928423816270033, + "loss": 1.3101, + "step": 27220 + }, + { + "epoch": 0.3537244799339877, + "grad_norm": 0.41248396039009094, + "learning_rate": 0.00012928163870078892, + "loss": 1.3531, + "step": 27221 + }, + { + "epoch": 0.3537374744779036, + "grad_norm": 0.4454796016216278, + "learning_rate": 0.00012927903923887758, + "loss": 1.3815, + "step": 27222 + }, + { + "epoch": 0.35375046902181945, + "grad_norm": 1.0947011709213257, + "learning_rate": 0.00012927643977696617, + "loss": 1.4745, + "step": 27223 + }, + { + "epoch": 0.35376346356573535, + "grad_norm": 0.4864475131034851, + "learning_rate": 0.00012927384031505477, + "loss": 1.3942, + "step": 27224 + }, + { + "epoch": 0.3537764581096512, + "grad_norm": 0.39578163623809814, + "learning_rate": 0.00012927124085314342, + "loss": 1.3082, + "step": 27225 + }, + { + "epoch": 0.3537894526535671, + "grad_norm": 0.38561388850212097, + "learning_rate": 0.00012926864139123202, + "loss": 1.3415, + "step": 27226 + }, + { + "epoch": 0.35380244719748294, + "grad_norm": 0.5119971632957458, + "learning_rate": 0.00012926604192932064, + "loss": 1.4067, + "step": 27227 + }, + { + "epoch": 0.35381544174139884, + "grad_norm": 0.4380015432834625, + "learning_rate": 0.00012926344246740924, + "loss": 1.3924, + "step": 27228 + }, + { + "epoch": 0.3538284362853147, + "grad_norm": 0.38569170236587524, + "learning_rate": 0.00012926084300549787, + "loss": 1.4107, + "step": 27229 + }, + { + "epoch": 0.3538414308292306, + "grad_norm": 0.37366804480552673, + "learning_rate": 0.0001292582435435865, + "loss": 1.3113, + "step": 27230 + }, + { + "epoch": 0.35385442537314643, + "grad_norm": 0.3786996603012085, + "learning_rate": 0.0001292556440816751, + "loss": 1.3082, + "step": 27231 + }, + { + "epoch": 0.35386741991706233, + "grad_norm": 0.4101344645023346, + "learning_rate": 0.0001292530446197637, + "loss": 1.2962, + "step": 27232 + }, + { + "epoch": 0.3538804144609782, + "grad_norm": 0.3904150724411011, + "learning_rate": 0.00012925044515785234, + "loss": 1.3084, + "step": 27233 + }, + { + "epoch": 0.3538934090048941, + "grad_norm": 0.49593988060951233, + "learning_rate": 0.00012924784569594096, + "loss": 1.588, + "step": 27234 + }, + { + "epoch": 0.3539064035488099, + "grad_norm": 0.47312068939208984, + "learning_rate": 0.00012924524623402956, + "loss": 1.5722, + "step": 27235 + }, + { + "epoch": 0.3539193980927258, + "grad_norm": 0.43164587020874023, + "learning_rate": 0.00012924264677211818, + "loss": 1.4255, + "step": 27236 + }, + { + "epoch": 0.35393239263664167, + "grad_norm": 0.43883249163627625, + "learning_rate": 0.0001292400473102068, + "loss": 1.4523, + "step": 27237 + }, + { + "epoch": 0.35394538718055757, + "grad_norm": 0.4415944218635559, + "learning_rate": 0.0001292374478482954, + "loss": 1.3986, + "step": 27238 + }, + { + "epoch": 0.3539583817244734, + "grad_norm": 0.4156629145145416, + "learning_rate": 0.00012923484838638403, + "loss": 1.3301, + "step": 27239 + }, + { + "epoch": 0.3539713762683893, + "grad_norm": 0.37376484274864197, + "learning_rate": 0.00012923224892447263, + "loss": 1.405, + "step": 27240 + }, + { + "epoch": 0.35398437081230516, + "grad_norm": 0.2952210307121277, + "learning_rate": 0.00012922964946256125, + "loss": 1.3493, + "step": 27241 + }, + { + "epoch": 0.35399736535622106, + "grad_norm": 0.36737576127052307, + "learning_rate": 0.00012922705000064988, + "loss": 1.4874, + "step": 27242 + }, + { + "epoch": 0.3540103599001369, + "grad_norm": 0.4822121262550354, + "learning_rate": 0.00012922445053873847, + "loss": 1.3707, + "step": 27243 + }, + { + "epoch": 0.3540233544440528, + "grad_norm": 0.4565078318119049, + "learning_rate": 0.0001292218510768271, + "loss": 1.4621, + "step": 27244 + }, + { + "epoch": 0.35403634898796865, + "grad_norm": 0.429116815328598, + "learning_rate": 0.00012921925161491572, + "loss": 1.4263, + "step": 27245 + }, + { + "epoch": 0.35404934353188455, + "grad_norm": 0.38773947954177856, + "learning_rate": 0.00012921665215300435, + "loss": 1.4314, + "step": 27246 + }, + { + "epoch": 0.3540623380758004, + "grad_norm": 0.37396326661109924, + "learning_rate": 0.00012921405269109294, + "loss": 1.3845, + "step": 27247 + }, + { + "epoch": 0.3540753326197163, + "grad_norm": 0.42212367057800293, + "learning_rate": 0.00012921145322918157, + "loss": 1.3265, + "step": 27248 + }, + { + "epoch": 0.35408832716363214, + "grad_norm": 0.4560120403766632, + "learning_rate": 0.0001292088537672702, + "loss": 1.4377, + "step": 27249 + }, + { + "epoch": 0.35410132170754804, + "grad_norm": 0.40525567531585693, + "learning_rate": 0.0001292062543053588, + "loss": 1.3804, + "step": 27250 + }, + { + "epoch": 0.3541143162514639, + "grad_norm": 0.43293580412864685, + "learning_rate": 0.00012920365484344741, + "loss": 1.5648, + "step": 27251 + }, + { + "epoch": 0.3541273107953798, + "grad_norm": 0.44749411940574646, + "learning_rate": 0.000129201055381536, + "loss": 1.4421, + "step": 27252 + }, + { + "epoch": 0.35414030533929564, + "grad_norm": 0.3926527202129364, + "learning_rate": 0.00012919845591962466, + "loss": 1.5333, + "step": 27253 + }, + { + "epoch": 0.35415329988321154, + "grad_norm": 0.4460231363773346, + "learning_rate": 0.00012919585645771326, + "loss": 1.2362, + "step": 27254 + }, + { + "epoch": 0.35416629442712744, + "grad_norm": 0.4694913625717163, + "learning_rate": 0.00012919325699580186, + "loss": 1.5574, + "step": 27255 + }, + { + "epoch": 0.3541792889710433, + "grad_norm": 0.40756139159202576, + "learning_rate": 0.00012919065753389048, + "loss": 1.3267, + "step": 27256 + }, + { + "epoch": 0.3541922835149592, + "grad_norm": 0.4112973213195801, + "learning_rate": 0.0001291880580719791, + "loss": 1.2599, + "step": 27257 + }, + { + "epoch": 0.35420527805887503, + "grad_norm": 0.3808411657810211, + "learning_rate": 0.00012918545861006773, + "loss": 1.2661, + "step": 27258 + }, + { + "epoch": 0.35421827260279093, + "grad_norm": 0.4318644106388092, + "learning_rate": 0.00012918285914815633, + "loss": 1.5312, + "step": 27259 + }, + { + "epoch": 0.3542312671467068, + "grad_norm": 0.4153105318546295, + "learning_rate": 0.00012918025968624495, + "loss": 1.2959, + "step": 27260 + }, + { + "epoch": 0.3542442616906227, + "grad_norm": 0.479297935962677, + "learning_rate": 0.00012917766022433358, + "loss": 1.3265, + "step": 27261 + }, + { + "epoch": 0.3542572562345385, + "grad_norm": 0.40943631529808044, + "learning_rate": 0.00012917506076242218, + "loss": 1.311, + "step": 27262 + }, + { + "epoch": 0.3542702507784544, + "grad_norm": 0.4646322429180145, + "learning_rate": 0.0001291724613005108, + "loss": 1.4378, + "step": 27263 + }, + { + "epoch": 0.35428324532237027, + "grad_norm": 0.44986772537231445, + "learning_rate": 0.00012916986183859942, + "loss": 1.4146, + "step": 27264 + }, + { + "epoch": 0.35429623986628617, + "grad_norm": 0.3947587311267853, + "learning_rate": 0.00012916726237668805, + "loss": 1.6124, + "step": 27265 + }, + { + "epoch": 0.354309234410202, + "grad_norm": 0.3918352723121643, + "learning_rate": 0.00012916466291477665, + "loss": 1.5189, + "step": 27266 + }, + { + "epoch": 0.3543222289541179, + "grad_norm": 0.3979182243347168, + "learning_rate": 0.00012916206345286524, + "loss": 1.2607, + "step": 27267 + }, + { + "epoch": 0.35433522349803376, + "grad_norm": 0.3903525769710541, + "learning_rate": 0.0001291594639909539, + "loss": 1.4899, + "step": 27268 + }, + { + "epoch": 0.35434821804194966, + "grad_norm": 0.5187159776687622, + "learning_rate": 0.0001291568645290425, + "loss": 1.3973, + "step": 27269 + }, + { + "epoch": 0.3543612125858655, + "grad_norm": 0.37096890807151794, + "learning_rate": 0.00012915426506713112, + "loss": 1.2167, + "step": 27270 + }, + { + "epoch": 0.3543742071297814, + "grad_norm": 0.40710756182670593, + "learning_rate": 0.00012915166560521971, + "loss": 1.2715, + "step": 27271 + }, + { + "epoch": 0.35438720167369725, + "grad_norm": 0.42568182945251465, + "learning_rate": 0.00012914906614330834, + "loss": 1.3001, + "step": 27272 + }, + { + "epoch": 0.35440019621761315, + "grad_norm": 0.36408406496047974, + "learning_rate": 0.00012914646668139696, + "loss": 1.5063, + "step": 27273 + }, + { + "epoch": 0.354413190761529, + "grad_norm": 0.4338377118110657, + "learning_rate": 0.00012914386721948556, + "loss": 1.2803, + "step": 27274 + }, + { + "epoch": 0.3544261853054449, + "grad_norm": 0.3782581090927124, + "learning_rate": 0.00012914126775757419, + "loss": 1.3557, + "step": 27275 + }, + { + "epoch": 0.35443917984936074, + "grad_norm": 0.44673991203308105, + "learning_rate": 0.0001291386682956628, + "loss": 1.2482, + "step": 27276 + }, + { + "epoch": 0.35445217439327664, + "grad_norm": 0.4182107150554657, + "learning_rate": 0.00012913606883375143, + "loss": 1.3941, + "step": 27277 + }, + { + "epoch": 0.3544651689371925, + "grad_norm": 0.4898601174354553, + "learning_rate": 0.00012913346937184003, + "loss": 1.4784, + "step": 27278 + }, + { + "epoch": 0.3544781634811084, + "grad_norm": 0.47608789801597595, + "learning_rate": 0.00012913086990992863, + "loss": 1.5314, + "step": 27279 + }, + { + "epoch": 0.35449115802502423, + "grad_norm": 0.3632807433605194, + "learning_rate": 0.00012912827044801728, + "loss": 1.4266, + "step": 27280 + }, + { + "epoch": 0.35450415256894013, + "grad_norm": 0.34378114342689514, + "learning_rate": 0.00012912567098610588, + "loss": 1.1938, + "step": 27281 + }, + { + "epoch": 0.354517147112856, + "grad_norm": 0.38386866450309753, + "learning_rate": 0.0001291230715241945, + "loss": 1.5958, + "step": 27282 + }, + { + "epoch": 0.3545301416567719, + "grad_norm": 0.4135749042034149, + "learning_rate": 0.0001291204720622831, + "loss": 1.2388, + "step": 27283 + }, + { + "epoch": 0.3545431362006877, + "grad_norm": 0.5608319044113159, + "learning_rate": 0.00012911787260037172, + "loss": 1.5145, + "step": 27284 + }, + { + "epoch": 0.3545561307446036, + "grad_norm": 0.35127127170562744, + "learning_rate": 0.00012911527313846035, + "loss": 1.3196, + "step": 27285 + }, + { + "epoch": 0.35456912528851947, + "grad_norm": 0.5179656744003296, + "learning_rate": 0.00012911267367654895, + "loss": 1.4301, + "step": 27286 + }, + { + "epoch": 0.35458211983243537, + "grad_norm": 0.40099936723709106, + "learning_rate": 0.00012911007421463757, + "loss": 1.3277, + "step": 27287 + }, + { + "epoch": 0.3545951143763512, + "grad_norm": 0.3803557753562927, + "learning_rate": 0.0001291074747527262, + "loss": 1.5526, + "step": 27288 + }, + { + "epoch": 0.3546081089202671, + "grad_norm": 0.45398080348968506, + "learning_rate": 0.00012910487529081482, + "loss": 1.4372, + "step": 27289 + }, + { + "epoch": 0.35462110346418296, + "grad_norm": 0.405096173286438, + "learning_rate": 0.00012910227582890342, + "loss": 1.527, + "step": 27290 + }, + { + "epoch": 0.35463409800809886, + "grad_norm": 0.4905530512332916, + "learning_rate": 0.00012909967636699204, + "loss": 1.4064, + "step": 27291 + }, + { + "epoch": 0.3546470925520147, + "grad_norm": 0.3833281397819519, + "learning_rate": 0.00012909707690508067, + "loss": 1.3354, + "step": 27292 + }, + { + "epoch": 0.3546600870959306, + "grad_norm": 0.3851037919521332, + "learning_rate": 0.00012909447744316926, + "loss": 1.3762, + "step": 27293 + }, + { + "epoch": 0.35467308163984645, + "grad_norm": 0.45993658900260925, + "learning_rate": 0.0001290918779812579, + "loss": 1.5244, + "step": 27294 + }, + { + "epoch": 0.35468607618376236, + "grad_norm": 0.5152266621589661, + "learning_rate": 0.00012908927851934649, + "loss": 1.4363, + "step": 27295 + }, + { + "epoch": 0.3546990707276782, + "grad_norm": 0.542110025882721, + "learning_rate": 0.0001290866790574351, + "loss": 1.2823, + "step": 27296 + }, + { + "epoch": 0.3547120652715941, + "grad_norm": 0.35849252343177795, + "learning_rate": 0.00012908407959552373, + "loss": 1.3021, + "step": 27297 + }, + { + "epoch": 0.35472505981550995, + "grad_norm": 0.4606815278530121, + "learning_rate": 0.00012908148013361233, + "loss": 1.3716, + "step": 27298 + }, + { + "epoch": 0.35473805435942585, + "grad_norm": 0.45573890209198, + "learning_rate": 0.00012907888067170098, + "loss": 1.5586, + "step": 27299 + }, + { + "epoch": 0.3547510489033417, + "grad_norm": 0.5061871409416199, + "learning_rate": 0.00012907628120978958, + "loss": 1.5335, + "step": 27300 + }, + { + "epoch": 0.3547640434472576, + "grad_norm": 0.45350974798202515, + "learning_rate": 0.0001290736817478782, + "loss": 1.483, + "step": 27301 + }, + { + "epoch": 0.35477703799117344, + "grad_norm": 0.3578108847141266, + "learning_rate": 0.0001290710822859668, + "loss": 1.349, + "step": 27302 + }, + { + "epoch": 0.35479003253508934, + "grad_norm": 0.5508697032928467, + "learning_rate": 0.00012906848282405543, + "loss": 1.4382, + "step": 27303 + }, + { + "epoch": 0.3548030270790052, + "grad_norm": 0.382597416639328, + "learning_rate": 0.00012906588336214405, + "loss": 1.4373, + "step": 27304 + }, + { + "epoch": 0.3548160216229211, + "grad_norm": 0.39887455105781555, + "learning_rate": 0.00012906328390023265, + "loss": 1.2122, + "step": 27305 + }, + { + "epoch": 0.35482901616683693, + "grad_norm": 0.24349313974380493, + "learning_rate": 0.00012906068443832127, + "loss": 1.108, + "step": 27306 + }, + { + "epoch": 0.35484201071075283, + "grad_norm": 0.2975713312625885, + "learning_rate": 0.0001290580849764099, + "loss": 1.4866, + "step": 27307 + }, + { + "epoch": 0.3548550052546687, + "grad_norm": 0.388852596282959, + "learning_rate": 0.0001290554855144985, + "loss": 1.3193, + "step": 27308 + }, + { + "epoch": 0.3548679997985846, + "grad_norm": 0.2948521375656128, + "learning_rate": 0.00012905288605258712, + "loss": 1.1614, + "step": 27309 + }, + { + "epoch": 0.3548809943425004, + "grad_norm": 0.27335628867149353, + "learning_rate": 0.00012905028659067572, + "loss": 1.2761, + "step": 27310 + }, + { + "epoch": 0.3548939888864163, + "grad_norm": 0.40031322836875916, + "learning_rate": 0.00012904768712876437, + "loss": 1.5413, + "step": 27311 + }, + { + "epoch": 0.35490698343033217, + "grad_norm": 0.4333665668964386, + "learning_rate": 0.00012904508766685297, + "loss": 1.2508, + "step": 27312 + }, + { + "epoch": 0.35491997797424807, + "grad_norm": 0.368606835603714, + "learning_rate": 0.0001290424882049416, + "loss": 1.3583, + "step": 27313 + }, + { + "epoch": 0.3549329725181639, + "grad_norm": 0.39081260561943054, + "learning_rate": 0.0001290398887430302, + "loss": 1.3101, + "step": 27314 + }, + { + "epoch": 0.3549459670620798, + "grad_norm": 0.509438693523407, + "learning_rate": 0.0001290372892811188, + "loss": 1.3052, + "step": 27315 + }, + { + "epoch": 0.35495896160599566, + "grad_norm": 0.31404542922973633, + "learning_rate": 0.00012903468981920744, + "loss": 1.441, + "step": 27316 + }, + { + "epoch": 0.35497195614991156, + "grad_norm": 0.3912501037120819, + "learning_rate": 0.00012903209035729603, + "loss": 1.3667, + "step": 27317 + }, + { + "epoch": 0.3549849506938274, + "grad_norm": 0.40713170170783997, + "learning_rate": 0.00012902949089538466, + "loss": 1.4105, + "step": 27318 + }, + { + "epoch": 0.3549979452377433, + "grad_norm": 0.37688878178596497, + "learning_rate": 0.00012902689143347328, + "loss": 1.2671, + "step": 27319 + }, + { + "epoch": 0.35501093978165915, + "grad_norm": 0.4456031620502472, + "learning_rate": 0.0001290242919715619, + "loss": 1.4529, + "step": 27320 + }, + { + "epoch": 0.35502393432557505, + "grad_norm": 0.4662650525569916, + "learning_rate": 0.0001290216925096505, + "loss": 1.3751, + "step": 27321 + }, + { + "epoch": 0.3550369288694909, + "grad_norm": 0.39828863739967346, + "learning_rate": 0.0001290190930477391, + "loss": 1.4193, + "step": 27322 + }, + { + "epoch": 0.3550499234134068, + "grad_norm": 0.4406076967716217, + "learning_rate": 0.00012901649358582775, + "loss": 1.4869, + "step": 27323 + }, + { + "epoch": 0.35506291795732264, + "grad_norm": 0.3906269669532776, + "learning_rate": 0.00012901389412391635, + "loss": 1.4086, + "step": 27324 + }, + { + "epoch": 0.35507591250123854, + "grad_norm": 0.4025513231754303, + "learning_rate": 0.00012901129466200498, + "loss": 1.2439, + "step": 27325 + }, + { + "epoch": 0.3550889070451544, + "grad_norm": 0.35108527541160583, + "learning_rate": 0.00012900869520009357, + "loss": 1.3176, + "step": 27326 + }, + { + "epoch": 0.3551019015890703, + "grad_norm": 0.32911616563796997, + "learning_rate": 0.0001290060957381822, + "loss": 1.3763, + "step": 27327 + }, + { + "epoch": 0.35511489613298614, + "grad_norm": 0.47493571043014526, + "learning_rate": 0.00012900349627627082, + "loss": 1.4917, + "step": 27328 + }, + { + "epoch": 0.35512789067690204, + "grad_norm": 0.2588498294353485, + "learning_rate": 0.00012900089681435942, + "loss": 1.4285, + "step": 27329 + }, + { + "epoch": 0.35514088522081794, + "grad_norm": 0.38209205865859985, + "learning_rate": 0.00012899829735244804, + "loss": 1.4594, + "step": 27330 + }, + { + "epoch": 0.3551538797647338, + "grad_norm": 0.36052706837654114, + "learning_rate": 0.00012899569789053667, + "loss": 1.3562, + "step": 27331 + }, + { + "epoch": 0.3551668743086497, + "grad_norm": 0.44969382882118225, + "learning_rate": 0.0001289930984286253, + "loss": 1.666, + "step": 27332 + }, + { + "epoch": 0.35517986885256553, + "grad_norm": 0.3375230133533478, + "learning_rate": 0.0001289904989667139, + "loss": 1.2547, + "step": 27333 + }, + { + "epoch": 0.35519286339648143, + "grad_norm": 0.44575703144073486, + "learning_rate": 0.00012898789950480251, + "loss": 1.5851, + "step": 27334 + }, + { + "epoch": 0.3552058579403973, + "grad_norm": 0.31467291712760925, + "learning_rate": 0.00012898530004289114, + "loss": 1.41, + "step": 27335 + }, + { + "epoch": 0.3552188524843132, + "grad_norm": 0.305544376373291, + "learning_rate": 0.00012898270058097974, + "loss": 1.2327, + "step": 27336 + }, + { + "epoch": 0.355231847028229, + "grad_norm": 0.3942440152168274, + "learning_rate": 0.00012898010111906836, + "loss": 1.3341, + "step": 27337 + }, + { + "epoch": 0.3552448415721449, + "grad_norm": 0.3977477252483368, + "learning_rate": 0.00012897750165715699, + "loss": 1.2542, + "step": 27338 + }, + { + "epoch": 0.35525783611606077, + "grad_norm": 0.38865166902542114, + "learning_rate": 0.00012897490219524558, + "loss": 1.4307, + "step": 27339 + }, + { + "epoch": 0.35527083065997667, + "grad_norm": 0.33131730556488037, + "learning_rate": 0.0001289723027333342, + "loss": 1.343, + "step": 27340 + }, + { + "epoch": 0.3552838252038925, + "grad_norm": 0.3502008616924286, + "learning_rate": 0.0001289697032714228, + "loss": 1.2184, + "step": 27341 + }, + { + "epoch": 0.3552968197478084, + "grad_norm": 0.43779516220092773, + "learning_rate": 0.00012896710380951146, + "loss": 1.4779, + "step": 27342 + }, + { + "epoch": 0.35530981429172426, + "grad_norm": 0.4649774134159088, + "learning_rate": 0.00012896450434760005, + "loss": 1.5927, + "step": 27343 + }, + { + "epoch": 0.35532280883564016, + "grad_norm": 0.40542006492614746, + "learning_rate": 0.00012896190488568868, + "loss": 1.5022, + "step": 27344 + }, + { + "epoch": 0.355335803379556, + "grad_norm": 0.43315234780311584, + "learning_rate": 0.00012895930542377728, + "loss": 1.4163, + "step": 27345 + }, + { + "epoch": 0.3553487979234719, + "grad_norm": 0.3740081489086151, + "learning_rate": 0.0001289567059618659, + "loss": 1.3333, + "step": 27346 + }, + { + "epoch": 0.35536179246738775, + "grad_norm": 0.364037424325943, + "learning_rate": 0.00012895410649995452, + "loss": 1.5248, + "step": 27347 + }, + { + "epoch": 0.35537478701130365, + "grad_norm": 0.3614572584629059, + "learning_rate": 0.00012895150703804312, + "loss": 1.2211, + "step": 27348 + }, + { + "epoch": 0.3553877815552195, + "grad_norm": 0.34527918696403503, + "learning_rate": 0.00012894890757613175, + "loss": 1.4175, + "step": 27349 + }, + { + "epoch": 0.3554007760991354, + "grad_norm": 0.4455159306526184, + "learning_rate": 0.00012894630811422037, + "loss": 1.2227, + "step": 27350 + }, + { + "epoch": 0.35541377064305124, + "grad_norm": 0.3693142235279083, + "learning_rate": 0.00012894370865230897, + "loss": 1.4173, + "step": 27351 + }, + { + "epoch": 0.35542676518696714, + "grad_norm": 0.3905201256275177, + "learning_rate": 0.0001289411091903976, + "loss": 1.4463, + "step": 27352 + }, + { + "epoch": 0.355439759730883, + "grad_norm": 0.2974683344364166, + "learning_rate": 0.0001289385097284862, + "loss": 1.2861, + "step": 27353 + }, + { + "epoch": 0.3554527542747989, + "grad_norm": 0.4927957057952881, + "learning_rate": 0.00012893591026657484, + "loss": 1.3052, + "step": 27354 + }, + { + "epoch": 0.35546574881871473, + "grad_norm": 0.3714558482170105, + "learning_rate": 0.00012893331080466344, + "loss": 1.5349, + "step": 27355 + }, + { + "epoch": 0.35547874336263063, + "grad_norm": 0.39975398778915405, + "learning_rate": 0.00012893071134275206, + "loss": 1.5843, + "step": 27356 + }, + { + "epoch": 0.3554917379065465, + "grad_norm": 0.37283632159233093, + "learning_rate": 0.00012892811188084066, + "loss": 1.4262, + "step": 27357 + }, + { + "epoch": 0.3555047324504624, + "grad_norm": 0.3456113636493683, + "learning_rate": 0.00012892551241892929, + "loss": 1.385, + "step": 27358 + }, + { + "epoch": 0.3555177269943782, + "grad_norm": 0.343330442905426, + "learning_rate": 0.0001289229129570179, + "loss": 1.3461, + "step": 27359 + }, + { + "epoch": 0.3555307215382941, + "grad_norm": 0.3786865472793579, + "learning_rate": 0.0001289203134951065, + "loss": 1.4356, + "step": 27360 + }, + { + "epoch": 0.35554371608220997, + "grad_norm": 0.38240817189216614, + "learning_rate": 0.00012891771403319513, + "loss": 1.419, + "step": 27361 + }, + { + "epoch": 0.35555671062612587, + "grad_norm": 0.39295902848243713, + "learning_rate": 0.00012891511457128376, + "loss": 1.2585, + "step": 27362 + }, + { + "epoch": 0.3555697051700417, + "grad_norm": 0.40160566568374634, + "learning_rate": 0.00012891251510937235, + "loss": 1.4868, + "step": 27363 + }, + { + "epoch": 0.3555826997139576, + "grad_norm": 0.4206991493701935, + "learning_rate": 0.00012890991564746098, + "loss": 1.3647, + "step": 27364 + }, + { + "epoch": 0.35559569425787346, + "grad_norm": 0.4079906642436981, + "learning_rate": 0.00012890731618554958, + "loss": 1.5184, + "step": 27365 + }, + { + "epoch": 0.35560868880178936, + "grad_norm": 0.47296544909477234, + "learning_rate": 0.00012890471672363823, + "loss": 1.4384, + "step": 27366 + }, + { + "epoch": 0.3556216833457052, + "grad_norm": 0.37764808535575867, + "learning_rate": 0.00012890211726172682, + "loss": 1.3668, + "step": 27367 + }, + { + "epoch": 0.3556346778896211, + "grad_norm": 0.36749958992004395, + "learning_rate": 0.00012889951779981545, + "loss": 1.6478, + "step": 27368 + }, + { + "epoch": 0.35564767243353695, + "grad_norm": 0.4778698682785034, + "learning_rate": 0.00012889691833790405, + "loss": 1.5483, + "step": 27369 + }, + { + "epoch": 0.35566066697745286, + "grad_norm": 0.3563973903656006, + "learning_rate": 0.00012889431887599267, + "loss": 1.6435, + "step": 27370 + }, + { + "epoch": 0.3556736615213687, + "grad_norm": 0.448848694562912, + "learning_rate": 0.0001288917194140813, + "loss": 1.3224, + "step": 27371 + }, + { + "epoch": 0.3556866560652846, + "grad_norm": 0.34736618399620056, + "learning_rate": 0.0001288891199521699, + "loss": 1.3078, + "step": 27372 + }, + { + "epoch": 0.35569965060920045, + "grad_norm": 0.3348294794559479, + "learning_rate": 0.00012888652049025854, + "loss": 1.242, + "step": 27373 + }, + { + "epoch": 0.35571264515311635, + "grad_norm": 0.573210597038269, + "learning_rate": 0.00012888392102834714, + "loss": 1.4365, + "step": 27374 + }, + { + "epoch": 0.3557256396970322, + "grad_norm": 0.37353646755218506, + "learning_rate": 0.00012888132156643577, + "loss": 1.2811, + "step": 27375 + }, + { + "epoch": 0.3557386342409481, + "grad_norm": 0.26454856991767883, + "learning_rate": 0.00012887872210452436, + "loss": 1.2599, + "step": 27376 + }, + { + "epoch": 0.35575162878486394, + "grad_norm": 0.3324933350086212, + "learning_rate": 0.000128876122642613, + "loss": 1.1641, + "step": 27377 + }, + { + "epoch": 0.35576462332877984, + "grad_norm": 0.4507709741592407, + "learning_rate": 0.0001288735231807016, + "loss": 1.5577, + "step": 27378 + }, + { + "epoch": 0.3557776178726957, + "grad_norm": 0.37117379903793335, + "learning_rate": 0.0001288709237187902, + "loss": 1.4714, + "step": 27379 + }, + { + "epoch": 0.3557906124166116, + "grad_norm": 0.2662966251373291, + "learning_rate": 0.00012886832425687883, + "loss": 1.1298, + "step": 27380 + }, + { + "epoch": 0.35580360696052743, + "grad_norm": 0.43862196803092957, + "learning_rate": 0.00012886572479496746, + "loss": 1.4317, + "step": 27381 + }, + { + "epoch": 0.35581660150444333, + "grad_norm": 0.35771024227142334, + "learning_rate": 0.00012886312533305606, + "loss": 1.3211, + "step": 27382 + }, + { + "epoch": 0.3558295960483592, + "grad_norm": 0.4146856665611267, + "learning_rate": 0.00012886052587114468, + "loss": 1.4294, + "step": 27383 + }, + { + "epoch": 0.3558425905922751, + "grad_norm": 0.35615599155426025, + "learning_rate": 0.00012885792640923328, + "loss": 1.4626, + "step": 27384 + }, + { + "epoch": 0.3558555851361909, + "grad_norm": 0.4137577712535858, + "learning_rate": 0.00012885532694732193, + "loss": 1.5483, + "step": 27385 + }, + { + "epoch": 0.3558685796801068, + "grad_norm": 0.3834209740161896, + "learning_rate": 0.00012885272748541053, + "loss": 1.5189, + "step": 27386 + }, + { + "epoch": 0.35588157422402267, + "grad_norm": 0.38186535239219666, + "learning_rate": 0.00012885012802349915, + "loss": 1.4562, + "step": 27387 + }, + { + "epoch": 0.35589456876793857, + "grad_norm": 0.30644282698631287, + "learning_rate": 0.00012884752856158775, + "loss": 1.3494, + "step": 27388 + }, + { + "epoch": 0.3559075633118544, + "grad_norm": 0.3626924753189087, + "learning_rate": 0.00012884492909967637, + "loss": 1.269, + "step": 27389 + }, + { + "epoch": 0.3559205578557703, + "grad_norm": 0.3227306604385376, + "learning_rate": 0.000128842329637765, + "loss": 1.3591, + "step": 27390 + }, + { + "epoch": 0.35593355239968616, + "grad_norm": 0.44583258032798767, + "learning_rate": 0.0001288397301758536, + "loss": 1.4902, + "step": 27391 + }, + { + "epoch": 0.35594654694360206, + "grad_norm": 0.5193873047828674, + "learning_rate": 0.00012883713071394222, + "loss": 1.3556, + "step": 27392 + }, + { + "epoch": 0.3559595414875179, + "grad_norm": 0.39028802514076233, + "learning_rate": 0.00012883453125203084, + "loss": 1.3033, + "step": 27393 + }, + { + "epoch": 0.3559725360314338, + "grad_norm": 0.2958873212337494, + "learning_rate": 0.00012883193179011944, + "loss": 1.4745, + "step": 27394 + }, + { + "epoch": 0.35598553057534965, + "grad_norm": 0.4339183270931244, + "learning_rate": 0.00012882933232820807, + "loss": 1.6533, + "step": 27395 + }, + { + "epoch": 0.35599852511926555, + "grad_norm": 0.34015390276908875, + "learning_rate": 0.00012882673286629666, + "loss": 1.3683, + "step": 27396 + }, + { + "epoch": 0.3560115196631814, + "grad_norm": 0.3121531903743744, + "learning_rate": 0.00012882413340438532, + "loss": 1.3909, + "step": 27397 + }, + { + "epoch": 0.3560245142070973, + "grad_norm": 0.3852488398551941, + "learning_rate": 0.0001288215339424739, + "loss": 1.5461, + "step": 27398 + }, + { + "epoch": 0.35603750875101314, + "grad_norm": 0.4048745036125183, + "learning_rate": 0.00012881893448056254, + "loss": 1.4399, + "step": 27399 + }, + { + "epoch": 0.35605050329492904, + "grad_norm": 0.2905421257019043, + "learning_rate": 0.00012881633501865113, + "loss": 1.3293, + "step": 27400 + }, + { + "epoch": 0.3560634978388449, + "grad_norm": 0.46253320574760437, + "learning_rate": 0.00012881373555673976, + "loss": 1.4594, + "step": 27401 + }, + { + "epoch": 0.3560764923827608, + "grad_norm": 0.40848439931869507, + "learning_rate": 0.00012881113609482838, + "loss": 1.4353, + "step": 27402 + }, + { + "epoch": 0.35608948692667663, + "grad_norm": 0.44682836532592773, + "learning_rate": 0.00012880853663291698, + "loss": 1.4041, + "step": 27403 + }, + { + "epoch": 0.35610248147059254, + "grad_norm": 0.42151501774787903, + "learning_rate": 0.0001288059371710056, + "loss": 1.4256, + "step": 27404 + }, + { + "epoch": 0.3561154760145084, + "grad_norm": 0.3288726210594177, + "learning_rate": 0.00012880333770909423, + "loss": 1.2968, + "step": 27405 + }, + { + "epoch": 0.3561284705584243, + "grad_norm": 0.42383715510368347, + "learning_rate": 0.00012880073824718283, + "loss": 1.4676, + "step": 27406 + }, + { + "epoch": 0.3561414651023402, + "grad_norm": 0.35092878341674805, + "learning_rate": 0.00012879813878527145, + "loss": 1.3394, + "step": 27407 + }, + { + "epoch": 0.356154459646256, + "grad_norm": 0.34372085332870483, + "learning_rate": 0.00012879553932336008, + "loss": 1.2541, + "step": 27408 + }, + { + "epoch": 0.35616745419017193, + "grad_norm": 0.35861751437187195, + "learning_rate": 0.0001287929398614487, + "loss": 1.2553, + "step": 27409 + }, + { + "epoch": 0.3561804487340878, + "grad_norm": 0.42946261167526245, + "learning_rate": 0.0001287903403995373, + "loss": 1.4311, + "step": 27410 + }, + { + "epoch": 0.3561934432780037, + "grad_norm": 0.43926334381103516, + "learning_rate": 0.00012878774093762592, + "loss": 1.4788, + "step": 27411 + }, + { + "epoch": 0.3562064378219195, + "grad_norm": 0.3741862177848816, + "learning_rate": 0.00012878514147571455, + "loss": 1.412, + "step": 27412 + }, + { + "epoch": 0.3562194323658354, + "grad_norm": 0.3580864369869232, + "learning_rate": 0.00012878254201380314, + "loss": 1.3169, + "step": 27413 + }, + { + "epoch": 0.35623242690975127, + "grad_norm": 0.38417330384254456, + "learning_rate": 0.00012877994255189177, + "loss": 1.4038, + "step": 27414 + }, + { + "epoch": 0.35624542145366717, + "grad_norm": 0.3287806510925293, + "learning_rate": 0.00012877734308998037, + "loss": 1.2851, + "step": 27415 + }, + { + "epoch": 0.356258415997583, + "grad_norm": 0.32369324564933777, + "learning_rate": 0.00012877474362806902, + "loss": 1.2773, + "step": 27416 + }, + { + "epoch": 0.3562714105414989, + "grad_norm": 0.3283182382583618, + "learning_rate": 0.00012877214416615762, + "loss": 1.3611, + "step": 27417 + }, + { + "epoch": 0.35628440508541476, + "grad_norm": 0.49619248509407043, + "learning_rate": 0.0001287695447042462, + "loss": 1.4695, + "step": 27418 + }, + { + "epoch": 0.35629739962933066, + "grad_norm": 0.35933759808540344, + "learning_rate": 0.00012876694524233484, + "loss": 1.3998, + "step": 27419 + }, + { + "epoch": 0.3563103941732465, + "grad_norm": 0.3288777768611908, + "learning_rate": 0.00012876434578042346, + "loss": 1.2439, + "step": 27420 + }, + { + "epoch": 0.3563233887171624, + "grad_norm": 0.37923625111579895, + "learning_rate": 0.00012876174631851209, + "loss": 1.5645, + "step": 27421 + }, + { + "epoch": 0.35633638326107825, + "grad_norm": 0.47454309463500977, + "learning_rate": 0.00012875914685660068, + "loss": 1.4808, + "step": 27422 + }, + { + "epoch": 0.35634937780499415, + "grad_norm": 0.4253118932247162, + "learning_rate": 0.0001287565473946893, + "loss": 1.3443, + "step": 27423 + }, + { + "epoch": 0.35636237234891, + "grad_norm": 0.34108883142471313, + "learning_rate": 0.00012875394793277793, + "loss": 1.2262, + "step": 27424 + }, + { + "epoch": 0.3563753668928259, + "grad_norm": 0.41169416904449463, + "learning_rate": 0.00012875134847086653, + "loss": 1.3079, + "step": 27425 + }, + { + "epoch": 0.35638836143674174, + "grad_norm": 0.43785667419433594, + "learning_rate": 0.00012874874900895515, + "loss": 1.4222, + "step": 27426 + }, + { + "epoch": 0.35640135598065764, + "grad_norm": 0.41733235120773315, + "learning_rate": 0.00012874614954704375, + "loss": 1.5373, + "step": 27427 + }, + { + "epoch": 0.3564143505245735, + "grad_norm": 0.4157243072986603, + "learning_rate": 0.0001287435500851324, + "loss": 1.5147, + "step": 27428 + }, + { + "epoch": 0.3564273450684894, + "grad_norm": 0.4268386960029602, + "learning_rate": 0.000128740950623221, + "loss": 1.4156, + "step": 27429 + }, + { + "epoch": 0.35644033961240523, + "grad_norm": 0.43282243609428406, + "learning_rate": 0.0001287383511613096, + "loss": 1.5022, + "step": 27430 + }, + { + "epoch": 0.35645333415632113, + "grad_norm": 0.4985935389995575, + "learning_rate": 0.00012873575169939822, + "loss": 1.3959, + "step": 27431 + }, + { + "epoch": 0.356466328700237, + "grad_norm": 0.3616192936897278, + "learning_rate": 0.00012873315223748685, + "loss": 1.3045, + "step": 27432 + }, + { + "epoch": 0.3564793232441529, + "grad_norm": 0.44508063793182373, + "learning_rate": 0.00012873055277557547, + "loss": 1.4425, + "step": 27433 + }, + { + "epoch": 0.3564923177880687, + "grad_norm": 0.4529131054878235, + "learning_rate": 0.00012872795331366407, + "loss": 1.5997, + "step": 27434 + }, + { + "epoch": 0.3565053123319846, + "grad_norm": 0.4284322261810303, + "learning_rate": 0.0001287253538517527, + "loss": 1.6232, + "step": 27435 + }, + { + "epoch": 0.35651830687590047, + "grad_norm": 0.40824034810066223, + "learning_rate": 0.00012872275438984132, + "loss": 1.535, + "step": 27436 + }, + { + "epoch": 0.35653130141981637, + "grad_norm": 0.4123683273792267, + "learning_rate": 0.00012872015492792992, + "loss": 1.4224, + "step": 27437 + }, + { + "epoch": 0.3565442959637322, + "grad_norm": 0.3949636220932007, + "learning_rate": 0.00012871755546601854, + "loss": 1.4208, + "step": 27438 + }, + { + "epoch": 0.3565572905076481, + "grad_norm": 0.36995771527290344, + "learning_rate": 0.00012871495600410714, + "loss": 1.4248, + "step": 27439 + }, + { + "epoch": 0.35657028505156396, + "grad_norm": 0.38794466853141785, + "learning_rate": 0.0001287123565421958, + "loss": 1.4921, + "step": 27440 + }, + { + "epoch": 0.35658327959547986, + "grad_norm": 0.4458300471305847, + "learning_rate": 0.00012870975708028439, + "loss": 1.5408, + "step": 27441 + }, + { + "epoch": 0.3565962741393957, + "grad_norm": 0.45325544476509094, + "learning_rate": 0.000128707157618373, + "loss": 1.2174, + "step": 27442 + }, + { + "epoch": 0.3566092686833116, + "grad_norm": 0.37380313873291016, + "learning_rate": 0.0001287045581564616, + "loss": 1.3825, + "step": 27443 + }, + { + "epoch": 0.35662226322722745, + "grad_norm": 0.43868035078048706, + "learning_rate": 0.00012870195869455023, + "loss": 1.5105, + "step": 27444 + }, + { + "epoch": 0.35663525777114335, + "grad_norm": 0.4251108467578888, + "learning_rate": 0.00012869935923263886, + "loss": 1.3589, + "step": 27445 + }, + { + "epoch": 0.3566482523150592, + "grad_norm": 0.4346287250518799, + "learning_rate": 0.00012869675977072745, + "loss": 1.7435, + "step": 27446 + }, + { + "epoch": 0.3566612468589751, + "grad_norm": 0.3544836640357971, + "learning_rate": 0.00012869416030881608, + "loss": 1.2252, + "step": 27447 + }, + { + "epoch": 0.35667424140289095, + "grad_norm": 0.27312132716178894, + "learning_rate": 0.0001286915608469047, + "loss": 1.3102, + "step": 27448 + }, + { + "epoch": 0.35668723594680685, + "grad_norm": 0.33369505405426025, + "learning_rate": 0.0001286889613849933, + "loss": 1.1703, + "step": 27449 + }, + { + "epoch": 0.3567002304907227, + "grad_norm": 0.4068581163883209, + "learning_rate": 0.00012868636192308193, + "loss": 1.4872, + "step": 27450 + }, + { + "epoch": 0.3567132250346386, + "grad_norm": 0.3219605088233948, + "learning_rate": 0.00012868376246117055, + "loss": 1.6284, + "step": 27451 + }, + { + "epoch": 0.35672621957855444, + "grad_norm": 0.4218311011791229, + "learning_rate": 0.00012868116299925917, + "loss": 1.1814, + "step": 27452 + }, + { + "epoch": 0.35673921412247034, + "grad_norm": 0.37565284967422485, + "learning_rate": 0.00012867856353734777, + "loss": 1.3931, + "step": 27453 + }, + { + "epoch": 0.3567522086663862, + "grad_norm": 0.40845251083374023, + "learning_rate": 0.0001286759640754364, + "loss": 1.2751, + "step": 27454 + }, + { + "epoch": 0.3567652032103021, + "grad_norm": 0.20542186498641968, + "learning_rate": 0.00012867336461352502, + "loss": 1.2302, + "step": 27455 + }, + { + "epoch": 0.35677819775421793, + "grad_norm": 0.4377051293849945, + "learning_rate": 0.00012867076515161362, + "loss": 1.3382, + "step": 27456 + }, + { + "epoch": 0.35679119229813383, + "grad_norm": 0.3814966082572937, + "learning_rate": 0.00012866816568970224, + "loss": 1.3556, + "step": 27457 + }, + { + "epoch": 0.3568041868420497, + "grad_norm": 0.26771900057792664, + "learning_rate": 0.00012866556622779084, + "loss": 1.1319, + "step": 27458 + }, + { + "epoch": 0.3568171813859656, + "grad_norm": 0.3916628658771515, + "learning_rate": 0.0001286629667658795, + "loss": 1.2944, + "step": 27459 + }, + { + "epoch": 0.3568301759298814, + "grad_norm": 0.43458664417266846, + "learning_rate": 0.0001286603673039681, + "loss": 1.5212, + "step": 27460 + }, + { + "epoch": 0.3568431704737973, + "grad_norm": 0.4136747419834137, + "learning_rate": 0.00012865776784205669, + "loss": 1.4247, + "step": 27461 + }, + { + "epoch": 0.35685616501771317, + "grad_norm": 0.41235896944999695, + "learning_rate": 0.0001286551683801453, + "loss": 1.3885, + "step": 27462 + }, + { + "epoch": 0.35686915956162907, + "grad_norm": 0.4405613839626312, + "learning_rate": 0.00012865256891823393, + "loss": 1.5091, + "step": 27463 + }, + { + "epoch": 0.3568821541055449, + "grad_norm": 0.30737170577049255, + "learning_rate": 0.00012864996945632256, + "loss": 1.2896, + "step": 27464 + }, + { + "epoch": 0.3568951486494608, + "grad_norm": 0.4509899914264679, + "learning_rate": 0.00012864736999441116, + "loss": 1.5135, + "step": 27465 + }, + { + "epoch": 0.35690814319337666, + "grad_norm": 0.4336063861846924, + "learning_rate": 0.00012864477053249978, + "loss": 1.2786, + "step": 27466 + }, + { + "epoch": 0.35692113773729256, + "grad_norm": 0.4078902304172516, + "learning_rate": 0.0001286421710705884, + "loss": 1.543, + "step": 27467 + }, + { + "epoch": 0.3569341322812084, + "grad_norm": 0.455644816160202, + "learning_rate": 0.000128639571608677, + "loss": 1.4249, + "step": 27468 + }, + { + "epoch": 0.3569471268251243, + "grad_norm": 0.329316109418869, + "learning_rate": 0.00012863697214676563, + "loss": 1.2674, + "step": 27469 + }, + { + "epoch": 0.35696012136904015, + "grad_norm": 0.465891033411026, + "learning_rate": 0.00012863437268485423, + "loss": 1.5171, + "step": 27470 + }, + { + "epoch": 0.35697311591295605, + "grad_norm": 0.44211751222610474, + "learning_rate": 0.00012863177322294288, + "loss": 1.3645, + "step": 27471 + }, + { + "epoch": 0.3569861104568719, + "grad_norm": 0.4569490849971771, + "learning_rate": 0.00012862917376103147, + "loss": 1.344, + "step": 27472 + }, + { + "epoch": 0.3569991050007878, + "grad_norm": 0.4001865088939667, + "learning_rate": 0.00012862657429912007, + "loss": 1.4994, + "step": 27473 + }, + { + "epoch": 0.35701209954470364, + "grad_norm": 0.34411925077438354, + "learning_rate": 0.0001286239748372087, + "loss": 1.2782, + "step": 27474 + }, + { + "epoch": 0.35702509408861954, + "grad_norm": 0.4111136496067047, + "learning_rate": 0.00012862137537529732, + "loss": 1.3819, + "step": 27475 + }, + { + "epoch": 0.3570380886325354, + "grad_norm": 0.42017242312431335, + "learning_rate": 0.00012861877591338594, + "loss": 1.6094, + "step": 27476 + }, + { + "epoch": 0.3570510831764513, + "grad_norm": 0.5055058002471924, + "learning_rate": 0.00012861617645147454, + "loss": 1.2962, + "step": 27477 + }, + { + "epoch": 0.35706407772036713, + "grad_norm": 0.31187164783477783, + "learning_rate": 0.00012861357698956317, + "loss": 1.206, + "step": 27478 + }, + { + "epoch": 0.35707707226428304, + "grad_norm": 0.3526204526424408, + "learning_rate": 0.0001286109775276518, + "loss": 1.3635, + "step": 27479 + }, + { + "epoch": 0.3570900668081989, + "grad_norm": 0.3856695294380188, + "learning_rate": 0.0001286083780657404, + "loss": 1.3929, + "step": 27480 + }, + { + "epoch": 0.3571030613521148, + "grad_norm": 0.38815370202064514, + "learning_rate": 0.000128605778603829, + "loss": 1.4646, + "step": 27481 + }, + { + "epoch": 0.3571160558960306, + "grad_norm": 0.3988179862499237, + "learning_rate": 0.00012860317914191764, + "loss": 1.2555, + "step": 27482 + }, + { + "epoch": 0.3571290504399465, + "grad_norm": 0.5218237042427063, + "learning_rate": 0.00012860057968000626, + "loss": 1.5987, + "step": 27483 + }, + { + "epoch": 0.35714204498386243, + "grad_norm": 0.3820565342903137, + "learning_rate": 0.00012859798021809486, + "loss": 1.4103, + "step": 27484 + }, + { + "epoch": 0.3571550395277783, + "grad_norm": 0.4730757474899292, + "learning_rate": 0.00012859538075618346, + "loss": 1.3554, + "step": 27485 + }, + { + "epoch": 0.3571680340716942, + "grad_norm": 0.4169571101665497, + "learning_rate": 0.0001285927812942721, + "loss": 1.3402, + "step": 27486 + }, + { + "epoch": 0.35718102861561, + "grad_norm": 0.3566894829273224, + "learning_rate": 0.0001285901818323607, + "loss": 1.281, + "step": 27487 + }, + { + "epoch": 0.3571940231595259, + "grad_norm": 0.4983883798122406, + "learning_rate": 0.00012858758237044933, + "loss": 1.4113, + "step": 27488 + }, + { + "epoch": 0.35720701770344176, + "grad_norm": 0.33130237460136414, + "learning_rate": 0.00012858498290853793, + "loss": 1.483, + "step": 27489 + }, + { + "epoch": 0.35722001224735767, + "grad_norm": 0.575483500957489, + "learning_rate": 0.00012858238344662655, + "loss": 1.6155, + "step": 27490 + }, + { + "epoch": 0.3572330067912735, + "grad_norm": 0.3217366635799408, + "learning_rate": 0.00012857978398471518, + "loss": 1.2439, + "step": 27491 + }, + { + "epoch": 0.3572460013351894, + "grad_norm": 0.405439555644989, + "learning_rate": 0.00012857718452280377, + "loss": 1.4758, + "step": 27492 + }, + { + "epoch": 0.35725899587910526, + "grad_norm": 0.422473281621933, + "learning_rate": 0.0001285745850608924, + "loss": 1.4858, + "step": 27493 + }, + { + "epoch": 0.35727199042302116, + "grad_norm": 0.49928635358810425, + "learning_rate": 0.00012857198559898102, + "loss": 1.3273, + "step": 27494 + }, + { + "epoch": 0.357284984966937, + "grad_norm": 0.3200952708721161, + "learning_rate": 0.00012856938613706965, + "loss": 1.4406, + "step": 27495 + }, + { + "epoch": 0.3572979795108529, + "grad_norm": 0.49757829308509827, + "learning_rate": 0.00012856678667515824, + "loss": 1.4077, + "step": 27496 + }, + { + "epoch": 0.35731097405476875, + "grad_norm": 0.43589434027671814, + "learning_rate": 0.00012856418721324687, + "loss": 1.2956, + "step": 27497 + }, + { + "epoch": 0.35732396859868465, + "grad_norm": 0.3679733872413635, + "learning_rate": 0.0001285615877513355, + "loss": 1.1184, + "step": 27498 + }, + { + "epoch": 0.3573369631426005, + "grad_norm": 0.4152955114841461, + "learning_rate": 0.0001285589882894241, + "loss": 1.3858, + "step": 27499 + }, + { + "epoch": 0.3573499576865164, + "grad_norm": 0.36925140023231506, + "learning_rate": 0.00012855638882751272, + "loss": 1.3428, + "step": 27500 + }, + { + "epoch": 0.35736295223043224, + "grad_norm": 0.3072669208049774, + "learning_rate": 0.0001285537893656013, + "loss": 1.2925, + "step": 27501 + }, + { + "epoch": 0.35737594677434814, + "grad_norm": 0.44781291484832764, + "learning_rate": 0.00012855118990368994, + "loss": 1.4501, + "step": 27502 + }, + { + "epoch": 0.357388941318264, + "grad_norm": 0.3549068570137024, + "learning_rate": 0.00012854859044177856, + "loss": 1.2465, + "step": 27503 + }, + { + "epoch": 0.3574019358621799, + "grad_norm": 0.35514843463897705, + "learning_rate": 0.00012854599097986716, + "loss": 1.4616, + "step": 27504 + }, + { + "epoch": 0.35741493040609573, + "grad_norm": 0.4505949020385742, + "learning_rate": 0.00012854339151795578, + "loss": 1.3495, + "step": 27505 + }, + { + "epoch": 0.35742792495001163, + "grad_norm": 0.39385169744491577, + "learning_rate": 0.0001285407920560444, + "loss": 1.4904, + "step": 27506 + }, + { + "epoch": 0.3574409194939275, + "grad_norm": 0.42098578810691833, + "learning_rate": 0.00012853819259413303, + "loss": 1.2555, + "step": 27507 + }, + { + "epoch": 0.3574539140378434, + "grad_norm": 0.3905838429927826, + "learning_rate": 0.00012853559313222163, + "loss": 1.5281, + "step": 27508 + }, + { + "epoch": 0.3574669085817592, + "grad_norm": 0.45986589789390564, + "learning_rate": 0.00012853299367031025, + "loss": 1.3821, + "step": 27509 + }, + { + "epoch": 0.3574799031256751, + "grad_norm": 0.423350065946579, + "learning_rate": 0.00012853039420839888, + "loss": 1.3925, + "step": 27510 + }, + { + "epoch": 0.35749289766959097, + "grad_norm": 0.36655890941619873, + "learning_rate": 0.00012852779474648748, + "loss": 1.2027, + "step": 27511 + }, + { + "epoch": 0.35750589221350687, + "grad_norm": 0.47736120223999023, + "learning_rate": 0.0001285251952845761, + "loss": 1.4323, + "step": 27512 + }, + { + "epoch": 0.3575188867574227, + "grad_norm": 0.41521215438842773, + "learning_rate": 0.0001285225958226647, + "loss": 1.2866, + "step": 27513 + }, + { + "epoch": 0.3575318813013386, + "grad_norm": 0.4382805824279785, + "learning_rate": 0.00012851999636075332, + "loss": 1.3405, + "step": 27514 + }, + { + "epoch": 0.35754487584525446, + "grad_norm": 0.4032374322414398, + "learning_rate": 0.00012851739689884195, + "loss": 1.489, + "step": 27515 + }, + { + "epoch": 0.35755787038917036, + "grad_norm": 0.26001858711242676, + "learning_rate": 0.00012851479743693054, + "loss": 1.105, + "step": 27516 + }, + { + "epoch": 0.3575708649330862, + "grad_norm": 0.4461202025413513, + "learning_rate": 0.00012851219797501917, + "loss": 1.3607, + "step": 27517 + }, + { + "epoch": 0.3575838594770021, + "grad_norm": 0.44025787711143494, + "learning_rate": 0.0001285095985131078, + "loss": 1.5405, + "step": 27518 + }, + { + "epoch": 0.35759685402091795, + "grad_norm": 0.33944806456565857, + "learning_rate": 0.00012850699905119642, + "loss": 1.1997, + "step": 27519 + }, + { + "epoch": 0.35760984856483385, + "grad_norm": 0.40193989872932434, + "learning_rate": 0.00012850439958928502, + "loss": 1.4727, + "step": 27520 + }, + { + "epoch": 0.3576228431087497, + "grad_norm": 0.33706724643707275, + "learning_rate": 0.00012850180012737364, + "loss": 1.0974, + "step": 27521 + }, + { + "epoch": 0.3576358376526656, + "grad_norm": 0.32669657468795776, + "learning_rate": 0.00012849920066546226, + "loss": 1.48, + "step": 27522 + }, + { + "epoch": 0.35764883219658145, + "grad_norm": 0.3747687339782715, + "learning_rate": 0.00012849660120355086, + "loss": 1.5573, + "step": 27523 + }, + { + "epoch": 0.35766182674049735, + "grad_norm": 0.32767003774642944, + "learning_rate": 0.00012849400174163949, + "loss": 1.4037, + "step": 27524 + }, + { + "epoch": 0.3576748212844132, + "grad_norm": 0.35751771926879883, + "learning_rate": 0.0001284914022797281, + "loss": 1.2994, + "step": 27525 + }, + { + "epoch": 0.3576878158283291, + "grad_norm": 0.45140567421913147, + "learning_rate": 0.00012848880281781674, + "loss": 1.2774, + "step": 27526 + }, + { + "epoch": 0.35770081037224494, + "grad_norm": 0.4241960048675537, + "learning_rate": 0.00012848620335590533, + "loss": 1.4395, + "step": 27527 + }, + { + "epoch": 0.35771380491616084, + "grad_norm": 0.4169099032878876, + "learning_rate": 0.00012848360389399393, + "loss": 1.4031, + "step": 27528 + }, + { + "epoch": 0.3577267994600767, + "grad_norm": 0.4897788166999817, + "learning_rate": 0.00012848100443208258, + "loss": 1.4363, + "step": 27529 + }, + { + "epoch": 0.3577397940039926, + "grad_norm": 0.37553176283836365, + "learning_rate": 0.00012847840497017118, + "loss": 1.4147, + "step": 27530 + }, + { + "epoch": 0.35775278854790843, + "grad_norm": 0.3327403664588928, + "learning_rate": 0.0001284758055082598, + "loss": 1.387, + "step": 27531 + }, + { + "epoch": 0.35776578309182433, + "grad_norm": 0.363903671503067, + "learning_rate": 0.0001284732060463484, + "loss": 1.3835, + "step": 27532 + }, + { + "epoch": 0.3577787776357402, + "grad_norm": 0.3305162787437439, + "learning_rate": 0.00012847060658443703, + "loss": 1.3024, + "step": 27533 + }, + { + "epoch": 0.3577917721796561, + "grad_norm": 0.36481979489326477, + "learning_rate": 0.00012846800712252565, + "loss": 1.2966, + "step": 27534 + }, + { + "epoch": 0.3578047667235719, + "grad_norm": 0.5287383794784546, + "learning_rate": 0.00012846540766061425, + "loss": 1.5463, + "step": 27535 + }, + { + "epoch": 0.3578177612674878, + "grad_norm": 0.2881213128566742, + "learning_rate": 0.00012846280819870287, + "loss": 1.3323, + "step": 27536 + }, + { + "epoch": 0.35783075581140367, + "grad_norm": 0.41407474875450134, + "learning_rate": 0.0001284602087367915, + "loss": 1.3141, + "step": 27537 + }, + { + "epoch": 0.35784375035531957, + "grad_norm": 0.3082997798919678, + "learning_rate": 0.00012845760927488012, + "loss": 1.3548, + "step": 27538 + }, + { + "epoch": 0.3578567448992354, + "grad_norm": 0.410317987203598, + "learning_rate": 0.00012845500981296872, + "loss": 1.4282, + "step": 27539 + }, + { + "epoch": 0.3578697394431513, + "grad_norm": 0.39177659153938293, + "learning_rate": 0.00012845241035105732, + "loss": 1.4706, + "step": 27540 + }, + { + "epoch": 0.35788273398706716, + "grad_norm": 0.4329338073730469, + "learning_rate": 0.00012844981088914597, + "loss": 1.4902, + "step": 27541 + }, + { + "epoch": 0.35789572853098306, + "grad_norm": 0.41480037569999695, + "learning_rate": 0.00012844721142723456, + "loss": 1.512, + "step": 27542 + }, + { + "epoch": 0.3579087230748989, + "grad_norm": 0.3825821578502655, + "learning_rate": 0.0001284446119653232, + "loss": 1.6259, + "step": 27543 + }, + { + "epoch": 0.3579217176188148, + "grad_norm": 0.4038439393043518, + "learning_rate": 0.00012844201250341179, + "loss": 1.6682, + "step": 27544 + }, + { + "epoch": 0.35793471216273065, + "grad_norm": 0.4784427881240845, + "learning_rate": 0.0001284394130415004, + "loss": 1.32, + "step": 27545 + }, + { + "epoch": 0.35794770670664655, + "grad_norm": 0.41692790389060974, + "learning_rate": 0.00012843681357958904, + "loss": 1.374, + "step": 27546 + }, + { + "epoch": 0.3579607012505624, + "grad_norm": 0.5039559602737427, + "learning_rate": 0.00012843421411767763, + "loss": 1.5656, + "step": 27547 + }, + { + "epoch": 0.3579736957944783, + "grad_norm": 0.31632688641548157, + "learning_rate": 0.00012843161465576626, + "loss": 1.3362, + "step": 27548 + }, + { + "epoch": 0.35798669033839414, + "grad_norm": 0.45519399642944336, + "learning_rate": 0.00012842901519385488, + "loss": 1.3214, + "step": 27549 + }, + { + "epoch": 0.35799968488231004, + "grad_norm": 0.4742424488067627, + "learning_rate": 0.0001284264157319435, + "loss": 1.5111, + "step": 27550 + }, + { + "epoch": 0.3580126794262259, + "grad_norm": 0.5005501508712769, + "learning_rate": 0.0001284238162700321, + "loss": 1.6841, + "step": 27551 + }, + { + "epoch": 0.3580256739701418, + "grad_norm": 0.3781135678291321, + "learning_rate": 0.00012842121680812073, + "loss": 1.5331, + "step": 27552 + }, + { + "epoch": 0.35803866851405763, + "grad_norm": 0.4068751931190491, + "learning_rate": 0.00012841861734620935, + "loss": 1.4153, + "step": 27553 + }, + { + "epoch": 0.35805166305797353, + "grad_norm": 0.3550473153591156, + "learning_rate": 0.00012841601788429795, + "loss": 1.4758, + "step": 27554 + }, + { + "epoch": 0.3580646576018894, + "grad_norm": 0.43151313066482544, + "learning_rate": 0.00012841341842238657, + "loss": 1.402, + "step": 27555 + }, + { + "epoch": 0.3580776521458053, + "grad_norm": 0.44430312514305115, + "learning_rate": 0.0001284108189604752, + "loss": 1.4594, + "step": 27556 + }, + { + "epoch": 0.3580906466897211, + "grad_norm": 0.36316725611686707, + "learning_rate": 0.0001284082194985638, + "loss": 1.5865, + "step": 27557 + }, + { + "epoch": 0.358103641233637, + "grad_norm": 0.4119855463504791, + "learning_rate": 0.00012840562003665242, + "loss": 1.5897, + "step": 27558 + }, + { + "epoch": 0.3581166357775529, + "grad_norm": 0.2742438316345215, + "learning_rate": 0.00012840302057474102, + "loss": 1.1691, + "step": 27559 + }, + { + "epoch": 0.3581296303214688, + "grad_norm": 0.4849892258644104, + "learning_rate": 0.00012840042111282967, + "loss": 1.4513, + "step": 27560 + }, + { + "epoch": 0.3581426248653847, + "grad_norm": 0.44913890957832336, + "learning_rate": 0.00012839782165091827, + "loss": 1.3446, + "step": 27561 + }, + { + "epoch": 0.3581556194093005, + "grad_norm": 0.3804006278514862, + "learning_rate": 0.0001283952221890069, + "loss": 1.1593, + "step": 27562 + }, + { + "epoch": 0.3581686139532164, + "grad_norm": 0.654653787612915, + "learning_rate": 0.0001283926227270955, + "loss": 1.4065, + "step": 27563 + }, + { + "epoch": 0.35818160849713226, + "grad_norm": 0.3345881998538971, + "learning_rate": 0.0001283900232651841, + "loss": 1.5086, + "step": 27564 + }, + { + "epoch": 0.35819460304104817, + "grad_norm": 0.4422236382961273, + "learning_rate": 0.00012838742380327274, + "loss": 1.3207, + "step": 27565 + }, + { + "epoch": 0.358207597584964, + "grad_norm": 0.40724867582321167, + "learning_rate": 0.00012838482434136134, + "loss": 1.4222, + "step": 27566 + }, + { + "epoch": 0.3582205921288799, + "grad_norm": 0.3789246082305908, + "learning_rate": 0.00012838222487944996, + "loss": 1.2805, + "step": 27567 + }, + { + "epoch": 0.35823358667279576, + "grad_norm": 0.39470186829566956, + "learning_rate": 0.00012837962541753858, + "loss": 1.426, + "step": 27568 + }, + { + "epoch": 0.35824658121671166, + "grad_norm": 0.36517176032066345, + "learning_rate": 0.00012837702595562718, + "loss": 1.4738, + "step": 27569 + }, + { + "epoch": 0.3582595757606275, + "grad_norm": 0.42506855726242065, + "learning_rate": 0.0001283744264937158, + "loss": 1.4405, + "step": 27570 + }, + { + "epoch": 0.3582725703045434, + "grad_norm": 0.5019168853759766, + "learning_rate": 0.0001283718270318044, + "loss": 1.5072, + "step": 27571 + }, + { + "epoch": 0.35828556484845925, + "grad_norm": 0.3862207233905792, + "learning_rate": 0.00012836922756989306, + "loss": 1.2913, + "step": 27572 + }, + { + "epoch": 0.35829855939237515, + "grad_norm": 0.42447784543037415, + "learning_rate": 0.00012836662810798165, + "loss": 1.5047, + "step": 27573 + }, + { + "epoch": 0.358311553936291, + "grad_norm": 0.2949811518192291, + "learning_rate": 0.00012836402864607028, + "loss": 1.218, + "step": 27574 + }, + { + "epoch": 0.3583245484802069, + "grad_norm": 0.43490588665008545, + "learning_rate": 0.00012836142918415887, + "loss": 1.5563, + "step": 27575 + }, + { + "epoch": 0.35833754302412274, + "grad_norm": 0.4238477051258087, + "learning_rate": 0.0001283588297222475, + "loss": 1.4812, + "step": 27576 + }, + { + "epoch": 0.35835053756803864, + "grad_norm": 0.5049501061439514, + "learning_rate": 0.00012835623026033612, + "loss": 1.5118, + "step": 27577 + }, + { + "epoch": 0.3583635321119545, + "grad_norm": 0.40976300835609436, + "learning_rate": 0.00012835363079842472, + "loss": 1.3149, + "step": 27578 + }, + { + "epoch": 0.3583765266558704, + "grad_norm": 0.3435368239879608, + "learning_rate": 0.00012835103133651335, + "loss": 1.2494, + "step": 27579 + }, + { + "epoch": 0.35838952119978623, + "grad_norm": 0.4117524325847626, + "learning_rate": 0.00012834843187460197, + "loss": 1.3695, + "step": 27580 + }, + { + "epoch": 0.35840251574370213, + "grad_norm": 0.40709027647972107, + "learning_rate": 0.0001283458324126906, + "loss": 1.4684, + "step": 27581 + }, + { + "epoch": 0.358415510287618, + "grad_norm": 0.4361148476600647, + "learning_rate": 0.0001283432329507792, + "loss": 1.4955, + "step": 27582 + }, + { + "epoch": 0.3584285048315339, + "grad_norm": 0.36271005868911743, + "learning_rate": 0.0001283406334888678, + "loss": 1.4855, + "step": 27583 + }, + { + "epoch": 0.3584414993754497, + "grad_norm": 0.43793946504592896, + "learning_rate": 0.00012833803402695644, + "loss": 1.514, + "step": 27584 + }, + { + "epoch": 0.3584544939193656, + "grad_norm": 0.4169389307498932, + "learning_rate": 0.00012833543456504504, + "loss": 1.2333, + "step": 27585 + }, + { + "epoch": 0.35846748846328147, + "grad_norm": 0.42226526141166687, + "learning_rate": 0.00012833283510313366, + "loss": 1.2977, + "step": 27586 + }, + { + "epoch": 0.35848048300719737, + "grad_norm": 0.3446049094200134, + "learning_rate": 0.00012833023564122226, + "loss": 1.2861, + "step": 27587 + }, + { + "epoch": 0.3584934775511132, + "grad_norm": 0.4019283354282379, + "learning_rate": 0.00012832763617931088, + "loss": 1.2001, + "step": 27588 + }, + { + "epoch": 0.3585064720950291, + "grad_norm": 0.4221995174884796, + "learning_rate": 0.0001283250367173995, + "loss": 1.2721, + "step": 27589 + }, + { + "epoch": 0.35851946663894496, + "grad_norm": 0.46465909481048584, + "learning_rate": 0.0001283224372554881, + "loss": 1.4662, + "step": 27590 + }, + { + "epoch": 0.35853246118286086, + "grad_norm": 0.336833655834198, + "learning_rate": 0.00012831983779357673, + "loss": 1.4819, + "step": 27591 + }, + { + "epoch": 0.3585454557267767, + "grad_norm": 0.42345377802848816, + "learning_rate": 0.00012831723833166536, + "loss": 1.3168, + "step": 27592 + }, + { + "epoch": 0.3585584502706926, + "grad_norm": 0.4284893870353699, + "learning_rate": 0.00012831463886975398, + "loss": 1.7065, + "step": 27593 + }, + { + "epoch": 0.35857144481460845, + "grad_norm": 0.36274608969688416, + "learning_rate": 0.00012831203940784258, + "loss": 1.5371, + "step": 27594 + }, + { + "epoch": 0.35858443935852435, + "grad_norm": 0.3589741587638855, + "learning_rate": 0.0001283094399459312, + "loss": 1.3476, + "step": 27595 + }, + { + "epoch": 0.3585974339024402, + "grad_norm": 0.360270619392395, + "learning_rate": 0.00012830684048401983, + "loss": 1.6086, + "step": 27596 + }, + { + "epoch": 0.3586104284463561, + "grad_norm": 0.41673773527145386, + "learning_rate": 0.00012830424102210842, + "loss": 1.319, + "step": 27597 + }, + { + "epoch": 0.35862342299027195, + "grad_norm": 0.45880311727523804, + "learning_rate": 0.00012830164156019705, + "loss": 1.3824, + "step": 27598 + }, + { + "epoch": 0.35863641753418785, + "grad_norm": 0.46175432205200195, + "learning_rate": 0.00012829904209828567, + "loss": 1.2331, + "step": 27599 + }, + { + "epoch": 0.3586494120781037, + "grad_norm": 0.5502286553382874, + "learning_rate": 0.00012829644263637427, + "loss": 1.335, + "step": 27600 + }, + { + "epoch": 0.3586624066220196, + "grad_norm": 0.39677104353904724, + "learning_rate": 0.0001282938431744629, + "loss": 1.2464, + "step": 27601 + }, + { + "epoch": 0.35867540116593544, + "grad_norm": 0.39820152521133423, + "learning_rate": 0.0001282912437125515, + "loss": 1.2381, + "step": 27602 + }, + { + "epoch": 0.35868839570985134, + "grad_norm": 0.40472689270973206, + "learning_rate": 0.00012828864425064014, + "loss": 1.3148, + "step": 27603 + }, + { + "epoch": 0.3587013902537672, + "grad_norm": 0.3810681104660034, + "learning_rate": 0.00012828604478872874, + "loss": 1.4141, + "step": 27604 + }, + { + "epoch": 0.3587143847976831, + "grad_norm": 0.31121259927749634, + "learning_rate": 0.00012828344532681736, + "loss": 1.3646, + "step": 27605 + }, + { + "epoch": 0.35872737934159893, + "grad_norm": 0.3821033239364624, + "learning_rate": 0.00012828084586490596, + "loss": 1.3919, + "step": 27606 + }, + { + "epoch": 0.35874037388551483, + "grad_norm": 0.4427330791950226, + "learning_rate": 0.0001282782464029946, + "loss": 1.6134, + "step": 27607 + }, + { + "epoch": 0.3587533684294307, + "grad_norm": 0.41570109128952026, + "learning_rate": 0.0001282756469410832, + "loss": 1.3399, + "step": 27608 + }, + { + "epoch": 0.3587663629733466, + "grad_norm": 0.42868563532829285, + "learning_rate": 0.0001282730474791718, + "loss": 1.3288, + "step": 27609 + }, + { + "epoch": 0.3587793575172624, + "grad_norm": 0.3560831844806671, + "learning_rate": 0.00012827044801726043, + "loss": 1.379, + "step": 27610 + }, + { + "epoch": 0.3587923520611783, + "grad_norm": 0.427074670791626, + "learning_rate": 0.00012826784855534906, + "loss": 1.3527, + "step": 27611 + }, + { + "epoch": 0.35880534660509417, + "grad_norm": 0.39446666836738586, + "learning_rate": 0.00012826524909343765, + "loss": 1.5074, + "step": 27612 + }, + { + "epoch": 0.35881834114901007, + "grad_norm": 0.31512296199798584, + "learning_rate": 0.00012826264963152628, + "loss": 1.1866, + "step": 27613 + }, + { + "epoch": 0.3588313356929259, + "grad_norm": 0.3750813603401184, + "learning_rate": 0.00012826005016961488, + "loss": 1.4543, + "step": 27614 + }, + { + "epoch": 0.3588443302368418, + "grad_norm": 0.3376021981239319, + "learning_rate": 0.00012825745070770353, + "loss": 1.3941, + "step": 27615 + }, + { + "epoch": 0.35885732478075766, + "grad_norm": 0.42299073934555054, + "learning_rate": 0.00012825485124579213, + "loss": 1.4769, + "step": 27616 + }, + { + "epoch": 0.35887031932467356, + "grad_norm": 0.44412702322006226, + "learning_rate": 0.00012825225178388075, + "loss": 1.3452, + "step": 27617 + }, + { + "epoch": 0.3588833138685894, + "grad_norm": 0.3990938067436218, + "learning_rate": 0.00012824965232196935, + "loss": 1.5445, + "step": 27618 + }, + { + "epoch": 0.3588963084125053, + "grad_norm": 0.3706763982772827, + "learning_rate": 0.00012824705286005797, + "loss": 1.2381, + "step": 27619 + }, + { + "epoch": 0.35890930295642115, + "grad_norm": 0.3917693495750427, + "learning_rate": 0.0001282444533981466, + "loss": 1.2236, + "step": 27620 + }, + { + "epoch": 0.35892229750033705, + "grad_norm": 0.5068303346633911, + "learning_rate": 0.0001282418539362352, + "loss": 1.4606, + "step": 27621 + }, + { + "epoch": 0.3589352920442529, + "grad_norm": 0.4087268114089966, + "learning_rate": 0.00012823925447432382, + "loss": 1.5043, + "step": 27622 + }, + { + "epoch": 0.3589482865881688, + "grad_norm": 0.3701595366001129, + "learning_rate": 0.00012823665501241244, + "loss": 1.4456, + "step": 27623 + }, + { + "epoch": 0.35896128113208464, + "grad_norm": 0.36677101254463196, + "learning_rate": 0.00012823405555050104, + "loss": 1.3954, + "step": 27624 + }, + { + "epoch": 0.35897427567600054, + "grad_norm": 0.5198606252670288, + "learning_rate": 0.00012823145608858966, + "loss": 1.5948, + "step": 27625 + }, + { + "epoch": 0.3589872702199164, + "grad_norm": 0.37631678581237793, + "learning_rate": 0.00012822885662667826, + "loss": 1.2855, + "step": 27626 + }, + { + "epoch": 0.3590002647638323, + "grad_norm": 0.47325000166893005, + "learning_rate": 0.00012822625716476691, + "loss": 1.5006, + "step": 27627 + }, + { + "epoch": 0.35901325930774813, + "grad_norm": 0.3766353726387024, + "learning_rate": 0.0001282236577028555, + "loss": 1.4653, + "step": 27628 + }, + { + "epoch": 0.35902625385166403, + "grad_norm": 0.44791173934936523, + "learning_rate": 0.00012822105824094414, + "loss": 1.334, + "step": 27629 + }, + { + "epoch": 0.3590392483955799, + "grad_norm": 0.39020416140556335, + "learning_rate": 0.00012821845877903276, + "loss": 1.2794, + "step": 27630 + }, + { + "epoch": 0.3590522429394958, + "grad_norm": 0.4643324613571167, + "learning_rate": 0.00012821585931712136, + "loss": 1.5083, + "step": 27631 + }, + { + "epoch": 0.3590652374834116, + "grad_norm": 0.40096136927604675, + "learning_rate": 0.00012821325985520998, + "loss": 1.3139, + "step": 27632 + }, + { + "epoch": 0.3590782320273275, + "grad_norm": 0.35726621747016907, + "learning_rate": 0.00012821066039329858, + "loss": 1.3973, + "step": 27633 + }, + { + "epoch": 0.35909122657124337, + "grad_norm": 0.36392661929130554, + "learning_rate": 0.00012820806093138723, + "loss": 1.2806, + "step": 27634 + }, + { + "epoch": 0.35910422111515927, + "grad_norm": 0.40534746646881104, + "learning_rate": 0.00012820546146947583, + "loss": 1.5025, + "step": 27635 + }, + { + "epoch": 0.3591172156590752, + "grad_norm": 0.33008432388305664, + "learning_rate": 0.00012820286200756445, + "loss": 1.3684, + "step": 27636 + }, + { + "epoch": 0.359130210202991, + "grad_norm": 0.3452047109603882, + "learning_rate": 0.00012820026254565305, + "loss": 1.3064, + "step": 27637 + }, + { + "epoch": 0.3591432047469069, + "grad_norm": 0.46656790375709534, + "learning_rate": 0.00012819766308374167, + "loss": 1.4379, + "step": 27638 + }, + { + "epoch": 0.35915619929082276, + "grad_norm": 0.42533358931541443, + "learning_rate": 0.0001281950636218303, + "loss": 1.419, + "step": 27639 + }, + { + "epoch": 0.35916919383473866, + "grad_norm": 0.47659459710121155, + "learning_rate": 0.0001281924641599189, + "loss": 1.3854, + "step": 27640 + }, + { + "epoch": 0.3591821883786545, + "grad_norm": 0.32988694310188293, + "learning_rate": 0.00012818986469800752, + "loss": 1.2151, + "step": 27641 + }, + { + "epoch": 0.3591951829225704, + "grad_norm": 0.31307247281074524, + "learning_rate": 0.00012818726523609615, + "loss": 1.1736, + "step": 27642 + }, + { + "epoch": 0.35920817746648626, + "grad_norm": 0.39512899518013, + "learning_rate": 0.00012818466577418474, + "loss": 1.1759, + "step": 27643 + }, + { + "epoch": 0.35922117201040216, + "grad_norm": 0.4873647391796112, + "learning_rate": 0.00012818206631227337, + "loss": 1.6339, + "step": 27644 + }, + { + "epoch": 0.359234166554318, + "grad_norm": 0.32334479689598083, + "learning_rate": 0.00012817946685036196, + "loss": 1.3449, + "step": 27645 + }, + { + "epoch": 0.3592471610982339, + "grad_norm": 0.4204961955547333, + "learning_rate": 0.00012817686738845062, + "loss": 1.4573, + "step": 27646 + }, + { + "epoch": 0.35926015564214975, + "grad_norm": 0.5079982280731201, + "learning_rate": 0.00012817426792653921, + "loss": 1.3406, + "step": 27647 + }, + { + "epoch": 0.35927315018606565, + "grad_norm": 0.34709957242012024, + "learning_rate": 0.00012817166846462784, + "loss": 1.4508, + "step": 27648 + }, + { + "epoch": 0.3592861447299815, + "grad_norm": 0.48476696014404297, + "learning_rate": 0.00012816906900271644, + "loss": 1.4477, + "step": 27649 + }, + { + "epoch": 0.3592991392738974, + "grad_norm": 0.4945536255836487, + "learning_rate": 0.00012816646954080506, + "loss": 1.4686, + "step": 27650 + }, + { + "epoch": 0.35931213381781324, + "grad_norm": 0.3563969135284424, + "learning_rate": 0.00012816387007889368, + "loss": 1.257, + "step": 27651 + }, + { + "epoch": 0.35932512836172914, + "grad_norm": 0.4547992944717407, + "learning_rate": 0.00012816127061698228, + "loss": 1.4987, + "step": 27652 + }, + { + "epoch": 0.359338122905645, + "grad_norm": 0.38634777069091797, + "learning_rate": 0.0001281586711550709, + "loss": 1.3582, + "step": 27653 + }, + { + "epoch": 0.3593511174495609, + "grad_norm": 0.4946206510066986, + "learning_rate": 0.00012815607169315953, + "loss": 1.5002, + "step": 27654 + }, + { + "epoch": 0.35936411199347673, + "grad_norm": 0.4899272322654724, + "learning_rate": 0.00012815347223124813, + "loss": 1.4997, + "step": 27655 + }, + { + "epoch": 0.35937710653739263, + "grad_norm": 0.42135342955589294, + "learning_rate": 0.00012815087276933675, + "loss": 1.298, + "step": 27656 + }, + { + "epoch": 0.3593901010813085, + "grad_norm": 0.4033015966415405, + "learning_rate": 0.00012814827330742535, + "loss": 1.3741, + "step": 27657 + }, + { + "epoch": 0.3594030956252244, + "grad_norm": 0.44045010209083557, + "learning_rate": 0.000128145673845514, + "loss": 1.4239, + "step": 27658 + }, + { + "epoch": 0.3594160901691402, + "grad_norm": 0.44414782524108887, + "learning_rate": 0.0001281430743836026, + "loss": 1.4081, + "step": 27659 + }, + { + "epoch": 0.3594290847130561, + "grad_norm": 0.34166398644447327, + "learning_rate": 0.00012814047492169122, + "loss": 1.1138, + "step": 27660 + }, + { + "epoch": 0.35944207925697197, + "grad_norm": 0.4519447088241577, + "learning_rate": 0.00012813787545977982, + "loss": 1.2966, + "step": 27661 + }, + { + "epoch": 0.35945507380088787, + "grad_norm": 0.41849958896636963, + "learning_rate": 0.00012813527599786845, + "loss": 1.4028, + "step": 27662 + }, + { + "epoch": 0.3594680683448037, + "grad_norm": 0.4420962631702423, + "learning_rate": 0.00012813267653595707, + "loss": 1.3693, + "step": 27663 + }, + { + "epoch": 0.3594810628887196, + "grad_norm": 0.41066598892211914, + "learning_rate": 0.00012813007707404567, + "loss": 1.5534, + "step": 27664 + }, + { + "epoch": 0.35949405743263546, + "grad_norm": 0.3112351596355438, + "learning_rate": 0.0001281274776121343, + "loss": 1.3731, + "step": 27665 + }, + { + "epoch": 0.35950705197655136, + "grad_norm": 0.38092565536499023, + "learning_rate": 0.00012812487815022292, + "loss": 1.36, + "step": 27666 + }, + { + "epoch": 0.3595200465204672, + "grad_norm": 0.3766894042491913, + "learning_rate": 0.00012812227868831151, + "loss": 1.3587, + "step": 27667 + }, + { + "epoch": 0.3595330410643831, + "grad_norm": 0.3930797576904297, + "learning_rate": 0.00012811967922640014, + "loss": 1.2604, + "step": 27668 + }, + { + "epoch": 0.35954603560829895, + "grad_norm": 0.46917724609375, + "learning_rate": 0.00012811707976448876, + "loss": 1.1305, + "step": 27669 + }, + { + "epoch": 0.35955903015221485, + "grad_norm": 0.5573945641517639, + "learning_rate": 0.0001281144803025774, + "loss": 1.3345, + "step": 27670 + }, + { + "epoch": 0.3595720246961307, + "grad_norm": 0.3489684760570526, + "learning_rate": 0.00012811188084066598, + "loss": 1.426, + "step": 27671 + }, + { + "epoch": 0.3595850192400466, + "grad_norm": 0.3526850640773773, + "learning_rate": 0.0001281092813787546, + "loss": 1.561, + "step": 27672 + }, + { + "epoch": 0.35959801378396244, + "grad_norm": 0.2495710402727127, + "learning_rate": 0.00012810668191684323, + "loss": 1.4678, + "step": 27673 + }, + { + "epoch": 0.35961100832787835, + "grad_norm": 0.3864297568798065, + "learning_rate": 0.00012810408245493183, + "loss": 1.5309, + "step": 27674 + }, + { + "epoch": 0.3596240028717942, + "grad_norm": 0.5073360800743103, + "learning_rate": 0.00012810148299302046, + "loss": 1.3306, + "step": 27675 + }, + { + "epoch": 0.3596369974157101, + "grad_norm": 0.4561956226825714, + "learning_rate": 0.00012809888353110905, + "loss": 1.5017, + "step": 27676 + }, + { + "epoch": 0.35964999195962594, + "grad_norm": 0.4788547456264496, + "learning_rate": 0.0001280962840691977, + "loss": 1.3734, + "step": 27677 + }, + { + "epoch": 0.35966298650354184, + "grad_norm": 0.4014264643192291, + "learning_rate": 0.0001280936846072863, + "loss": 1.4943, + "step": 27678 + }, + { + "epoch": 0.3596759810474577, + "grad_norm": 0.3657355308532715, + "learning_rate": 0.0001280910851453749, + "loss": 1.292, + "step": 27679 + }, + { + "epoch": 0.3596889755913736, + "grad_norm": 0.5509775876998901, + "learning_rate": 0.00012808848568346352, + "loss": 1.4267, + "step": 27680 + }, + { + "epoch": 0.35970197013528943, + "grad_norm": 0.4427236020565033, + "learning_rate": 0.00012808588622155215, + "loss": 1.3906, + "step": 27681 + }, + { + "epoch": 0.35971496467920533, + "grad_norm": 0.44188711047172546, + "learning_rate": 0.00012808328675964077, + "loss": 1.7388, + "step": 27682 + }, + { + "epoch": 0.3597279592231212, + "grad_norm": 0.3850189447402954, + "learning_rate": 0.00012808068729772937, + "loss": 1.5433, + "step": 27683 + }, + { + "epoch": 0.3597409537670371, + "grad_norm": 0.39822354912757874, + "learning_rate": 0.000128078087835818, + "loss": 1.2922, + "step": 27684 + }, + { + "epoch": 0.3597539483109529, + "grad_norm": 0.3820672929286957, + "learning_rate": 0.00012807548837390662, + "loss": 1.6341, + "step": 27685 + }, + { + "epoch": 0.3597669428548688, + "grad_norm": 0.37761199474334717, + "learning_rate": 0.00012807288891199522, + "loss": 1.3627, + "step": 27686 + }, + { + "epoch": 0.35977993739878467, + "grad_norm": 0.4008931517601013, + "learning_rate": 0.00012807028945008384, + "loss": 1.3191, + "step": 27687 + }, + { + "epoch": 0.35979293194270057, + "grad_norm": 0.40517768263816833, + "learning_rate": 0.00012806768998817244, + "loss": 1.3468, + "step": 27688 + }, + { + "epoch": 0.3598059264866164, + "grad_norm": 0.4657978117465973, + "learning_rate": 0.0001280650905262611, + "loss": 1.4435, + "step": 27689 + }, + { + "epoch": 0.3598189210305323, + "grad_norm": 0.4199911952018738, + "learning_rate": 0.0001280624910643497, + "loss": 1.5789, + "step": 27690 + }, + { + "epoch": 0.35983191557444816, + "grad_norm": 0.4553356468677521, + "learning_rate": 0.00012805989160243828, + "loss": 1.2788, + "step": 27691 + }, + { + "epoch": 0.35984491011836406, + "grad_norm": 0.36766916513442993, + "learning_rate": 0.0001280572921405269, + "loss": 1.2288, + "step": 27692 + }, + { + "epoch": 0.3598579046622799, + "grad_norm": 0.4244515299797058, + "learning_rate": 0.00012805469267861553, + "loss": 1.3975, + "step": 27693 + }, + { + "epoch": 0.3598708992061958, + "grad_norm": 0.44811877608299255, + "learning_rate": 0.00012805209321670416, + "loss": 1.2439, + "step": 27694 + }, + { + "epoch": 0.35988389375011165, + "grad_norm": 0.3755172789096832, + "learning_rate": 0.00012804949375479276, + "loss": 1.5773, + "step": 27695 + }, + { + "epoch": 0.35989688829402755, + "grad_norm": 0.2625598907470703, + "learning_rate": 0.00012804689429288138, + "loss": 1.4265, + "step": 27696 + }, + { + "epoch": 0.3599098828379434, + "grad_norm": 0.3401976227760315, + "learning_rate": 0.00012804429483097, + "loss": 1.2307, + "step": 27697 + }, + { + "epoch": 0.3599228773818593, + "grad_norm": 0.41952764987945557, + "learning_rate": 0.0001280416953690586, + "loss": 1.335, + "step": 27698 + }, + { + "epoch": 0.35993587192577514, + "grad_norm": 0.3857037425041199, + "learning_rate": 0.00012803909590714723, + "loss": 1.3957, + "step": 27699 + }, + { + "epoch": 0.35994886646969104, + "grad_norm": 0.45489734411239624, + "learning_rate": 0.00012803649644523582, + "loss": 1.3912, + "step": 27700 + }, + { + "epoch": 0.3599618610136069, + "grad_norm": 0.39681991934776306, + "learning_rate": 0.00012803389698332448, + "loss": 1.4129, + "step": 27701 + }, + { + "epoch": 0.3599748555575228, + "grad_norm": 0.46642985939979553, + "learning_rate": 0.00012803129752141307, + "loss": 1.2736, + "step": 27702 + }, + { + "epoch": 0.35998785010143863, + "grad_norm": 0.4191685616970062, + "learning_rate": 0.0001280286980595017, + "loss": 1.3242, + "step": 27703 + }, + { + "epoch": 0.36000084464535453, + "grad_norm": 0.5292420387268066, + "learning_rate": 0.00012802609859759032, + "loss": 1.4502, + "step": 27704 + }, + { + "epoch": 0.3600138391892704, + "grad_norm": 0.4451882243156433, + "learning_rate": 0.00012802349913567892, + "loss": 1.1633, + "step": 27705 + }, + { + "epoch": 0.3600268337331863, + "grad_norm": 0.4311913847923279, + "learning_rate": 0.00012802089967376754, + "loss": 1.3569, + "step": 27706 + }, + { + "epoch": 0.3600398282771021, + "grad_norm": 0.4199172854423523, + "learning_rate": 0.00012801830021185614, + "loss": 1.381, + "step": 27707 + }, + { + "epoch": 0.360052822821018, + "grad_norm": 0.41160690784454346, + "learning_rate": 0.00012801570074994477, + "loss": 1.4665, + "step": 27708 + }, + { + "epoch": 0.36006581736493387, + "grad_norm": 0.3560848832130432, + "learning_rate": 0.0001280131012880334, + "loss": 1.2832, + "step": 27709 + }, + { + "epoch": 0.36007881190884977, + "grad_norm": 0.3419175148010254, + "learning_rate": 0.000128010501826122, + "loss": 1.6159, + "step": 27710 + }, + { + "epoch": 0.3600918064527657, + "grad_norm": 0.3952697813510895, + "learning_rate": 0.0001280079023642106, + "loss": 1.3438, + "step": 27711 + }, + { + "epoch": 0.3601048009966815, + "grad_norm": 0.38377606868743896, + "learning_rate": 0.00012800530290229924, + "loss": 1.4806, + "step": 27712 + }, + { + "epoch": 0.3601177955405974, + "grad_norm": 0.4361295998096466, + "learning_rate": 0.00012800270344038786, + "loss": 1.3901, + "step": 27713 + }, + { + "epoch": 0.36013079008451326, + "grad_norm": 0.401187002658844, + "learning_rate": 0.00012800010397847646, + "loss": 1.5078, + "step": 27714 + }, + { + "epoch": 0.36014378462842916, + "grad_norm": 0.43393754959106445, + "learning_rate": 0.00012799750451656508, + "loss": 1.3964, + "step": 27715 + }, + { + "epoch": 0.360156779172345, + "grad_norm": 0.3406221568584442, + "learning_rate": 0.0001279949050546537, + "loss": 1.3653, + "step": 27716 + }, + { + "epoch": 0.3601697737162609, + "grad_norm": 0.37753725051879883, + "learning_rate": 0.0001279923055927423, + "loss": 1.4628, + "step": 27717 + }, + { + "epoch": 0.36018276826017676, + "grad_norm": 0.42783549427986145, + "learning_rate": 0.00012798970613083093, + "loss": 1.4597, + "step": 27718 + }, + { + "epoch": 0.36019576280409266, + "grad_norm": 0.36001017689704895, + "learning_rate": 0.00012798710666891953, + "loss": 1.4056, + "step": 27719 + }, + { + "epoch": 0.3602087573480085, + "grad_norm": 0.4802101254463196, + "learning_rate": 0.00012798450720700815, + "loss": 1.5855, + "step": 27720 + }, + { + "epoch": 0.3602217518919244, + "grad_norm": 0.4641880393028259, + "learning_rate": 0.00012798190774509678, + "loss": 1.339, + "step": 27721 + }, + { + "epoch": 0.36023474643584025, + "grad_norm": 0.49150583148002625, + "learning_rate": 0.00012797930828318537, + "loss": 1.3896, + "step": 27722 + }, + { + "epoch": 0.36024774097975615, + "grad_norm": 0.2644767463207245, + "learning_rate": 0.000127976708821274, + "loss": 1.2511, + "step": 27723 + }, + { + "epoch": 0.360260735523672, + "grad_norm": 0.37700560688972473, + "learning_rate": 0.00012797410935936262, + "loss": 1.3508, + "step": 27724 + }, + { + "epoch": 0.3602737300675879, + "grad_norm": 0.48504844307899475, + "learning_rate": 0.00012797150989745125, + "loss": 1.4253, + "step": 27725 + }, + { + "epoch": 0.36028672461150374, + "grad_norm": 0.3321053087711334, + "learning_rate": 0.00012796891043553984, + "loss": 1.4786, + "step": 27726 + }, + { + "epoch": 0.36029971915541964, + "grad_norm": 0.43959081172943115, + "learning_rate": 0.00012796631097362847, + "loss": 1.5355, + "step": 27727 + }, + { + "epoch": 0.3603127136993355, + "grad_norm": 0.3881930112838745, + "learning_rate": 0.0001279637115117171, + "loss": 1.4316, + "step": 27728 + }, + { + "epoch": 0.3603257082432514, + "grad_norm": 0.3957481384277344, + "learning_rate": 0.0001279611120498057, + "loss": 1.4775, + "step": 27729 + }, + { + "epoch": 0.36033870278716723, + "grad_norm": 0.32693642377853394, + "learning_rate": 0.00012795851258789431, + "loss": 1.1513, + "step": 27730 + }, + { + "epoch": 0.36035169733108313, + "grad_norm": 0.44906678795814514, + "learning_rate": 0.0001279559131259829, + "loss": 1.3994, + "step": 27731 + }, + { + "epoch": 0.360364691874999, + "grad_norm": 0.4644584357738495, + "learning_rate": 0.00012795331366407156, + "loss": 1.4257, + "step": 27732 + }, + { + "epoch": 0.3603776864189149, + "grad_norm": 0.38901931047439575, + "learning_rate": 0.00012795071420216016, + "loss": 1.16, + "step": 27733 + }, + { + "epoch": 0.3603906809628307, + "grad_norm": 0.39654111862182617, + "learning_rate": 0.00012794811474024876, + "loss": 1.4416, + "step": 27734 + }, + { + "epoch": 0.3604036755067466, + "grad_norm": 0.4276122450828552, + "learning_rate": 0.00012794551527833738, + "loss": 1.2246, + "step": 27735 + }, + { + "epoch": 0.36041667005066247, + "grad_norm": 0.4020725190639496, + "learning_rate": 0.000127942915816426, + "loss": 1.2876, + "step": 27736 + }, + { + "epoch": 0.36042966459457837, + "grad_norm": 0.274831622838974, + "learning_rate": 0.00012794031635451463, + "loss": 1.2217, + "step": 27737 + }, + { + "epoch": 0.3604426591384942, + "grad_norm": 0.2875143885612488, + "learning_rate": 0.00012793771689260323, + "loss": 1.4213, + "step": 27738 + }, + { + "epoch": 0.3604556536824101, + "grad_norm": 0.3644232153892517, + "learning_rate": 0.00012793511743069185, + "loss": 1.2063, + "step": 27739 + }, + { + "epoch": 0.36046864822632596, + "grad_norm": 0.5233795046806335, + "learning_rate": 0.00012793251796878048, + "loss": 1.3546, + "step": 27740 + }, + { + "epoch": 0.36048164277024186, + "grad_norm": 0.43412065505981445, + "learning_rate": 0.00012792991850686908, + "loss": 1.465, + "step": 27741 + }, + { + "epoch": 0.3604946373141577, + "grad_norm": 0.29403597116470337, + "learning_rate": 0.0001279273190449577, + "loss": 1.3138, + "step": 27742 + }, + { + "epoch": 0.3605076318580736, + "grad_norm": 0.35137149691581726, + "learning_rate": 0.00012792471958304632, + "loss": 1.3716, + "step": 27743 + }, + { + "epoch": 0.36052062640198945, + "grad_norm": 0.494189977645874, + "learning_rate": 0.00012792212012113495, + "loss": 1.5509, + "step": 27744 + }, + { + "epoch": 0.36053362094590535, + "grad_norm": 0.4517155587673187, + "learning_rate": 0.00012791952065922355, + "loss": 1.1368, + "step": 27745 + }, + { + "epoch": 0.3605466154898212, + "grad_norm": 0.5183231830596924, + "learning_rate": 0.00012791692119731214, + "loss": 1.3575, + "step": 27746 + }, + { + "epoch": 0.3605596100337371, + "grad_norm": 0.37608230113983154, + "learning_rate": 0.0001279143217354008, + "loss": 1.4324, + "step": 27747 + }, + { + "epoch": 0.36057260457765294, + "grad_norm": 0.3984592854976654, + "learning_rate": 0.0001279117222734894, + "loss": 1.5375, + "step": 27748 + }, + { + "epoch": 0.36058559912156884, + "grad_norm": 0.4559206962585449, + "learning_rate": 0.00012790912281157802, + "loss": 1.3844, + "step": 27749 + }, + { + "epoch": 0.3605985936654847, + "grad_norm": 0.44641441106796265, + "learning_rate": 0.00012790652334966661, + "loss": 1.4414, + "step": 27750 + }, + { + "epoch": 0.3606115882094006, + "grad_norm": 0.4737507998943329, + "learning_rate": 0.00012790392388775524, + "loss": 1.2061, + "step": 27751 + }, + { + "epoch": 0.36062458275331644, + "grad_norm": 0.468979150056839, + "learning_rate": 0.00012790132442584386, + "loss": 1.4307, + "step": 27752 + }, + { + "epoch": 0.36063757729723234, + "grad_norm": 0.44866955280303955, + "learning_rate": 0.00012789872496393246, + "loss": 1.6161, + "step": 27753 + }, + { + "epoch": 0.3606505718411482, + "grad_norm": 0.4902353286743164, + "learning_rate": 0.00012789612550202108, + "loss": 1.5617, + "step": 27754 + }, + { + "epoch": 0.3606635663850641, + "grad_norm": 0.4259644150733948, + "learning_rate": 0.0001278935260401097, + "loss": 1.3698, + "step": 27755 + }, + { + "epoch": 0.36067656092897993, + "grad_norm": 0.4154307544231415, + "learning_rate": 0.00012789092657819833, + "loss": 1.3558, + "step": 27756 + }, + { + "epoch": 0.36068955547289583, + "grad_norm": 0.3634119927883148, + "learning_rate": 0.00012788832711628693, + "loss": 1.445, + "step": 27757 + }, + { + "epoch": 0.3607025500168117, + "grad_norm": 0.4950021207332611, + "learning_rate": 0.00012788572765437556, + "loss": 1.2763, + "step": 27758 + }, + { + "epoch": 0.3607155445607276, + "grad_norm": 0.3662574291229248, + "learning_rate": 0.00012788312819246418, + "loss": 1.4004, + "step": 27759 + }, + { + "epoch": 0.3607285391046434, + "grad_norm": 0.3208286166191101, + "learning_rate": 0.00012788052873055278, + "loss": 1.3769, + "step": 27760 + }, + { + "epoch": 0.3607415336485593, + "grad_norm": 0.26576370000839233, + "learning_rate": 0.0001278779292686414, + "loss": 1.1657, + "step": 27761 + }, + { + "epoch": 0.36075452819247517, + "grad_norm": 0.4860704243183136, + "learning_rate": 0.00012787532980673, + "loss": 1.3634, + "step": 27762 + }, + { + "epoch": 0.36076752273639107, + "grad_norm": 0.3989601135253906, + "learning_rate": 0.00012787273034481862, + "loss": 1.3924, + "step": 27763 + }, + { + "epoch": 0.3607805172803069, + "grad_norm": 0.42672085762023926, + "learning_rate": 0.00012787013088290725, + "loss": 1.3004, + "step": 27764 + }, + { + "epoch": 0.3607935118242228, + "grad_norm": 0.29372063279151917, + "learning_rate": 0.00012786753142099585, + "loss": 1.0006, + "step": 27765 + }, + { + "epoch": 0.36080650636813866, + "grad_norm": 0.4319475591182709, + "learning_rate": 0.00012786493195908447, + "loss": 1.4514, + "step": 27766 + }, + { + "epoch": 0.36081950091205456, + "grad_norm": 0.4387374222278595, + "learning_rate": 0.0001278623324971731, + "loss": 1.421, + "step": 27767 + }, + { + "epoch": 0.3608324954559704, + "grad_norm": 0.39316609501838684, + "learning_rate": 0.00012785973303526172, + "loss": 1.419, + "step": 27768 + }, + { + "epoch": 0.3608454899998863, + "grad_norm": 0.4666074216365814, + "learning_rate": 0.00012785713357335032, + "loss": 1.4141, + "step": 27769 + }, + { + "epoch": 0.36085848454380215, + "grad_norm": 0.4391013979911804, + "learning_rate": 0.00012785453411143894, + "loss": 1.5427, + "step": 27770 + }, + { + "epoch": 0.36087147908771805, + "grad_norm": 0.4647830128669739, + "learning_rate": 0.00012785193464952757, + "loss": 1.4831, + "step": 27771 + }, + { + "epoch": 0.3608844736316339, + "grad_norm": 0.3286523222923279, + "learning_rate": 0.00012784933518761616, + "loss": 1.2075, + "step": 27772 + }, + { + "epoch": 0.3608974681755498, + "grad_norm": 0.4326229989528656, + "learning_rate": 0.0001278467357257048, + "loss": 1.2158, + "step": 27773 + }, + { + "epoch": 0.36091046271946564, + "grad_norm": 0.34523260593414307, + "learning_rate": 0.00012784413626379338, + "loss": 1.3759, + "step": 27774 + }, + { + "epoch": 0.36092345726338154, + "grad_norm": 0.4005174934864044, + "learning_rate": 0.000127841536801882, + "loss": 1.3219, + "step": 27775 + }, + { + "epoch": 0.3609364518072974, + "grad_norm": 0.38597485423088074, + "learning_rate": 0.00012783893733997063, + "loss": 1.6224, + "step": 27776 + }, + { + "epoch": 0.3609494463512133, + "grad_norm": 0.3368265628814697, + "learning_rate": 0.00012783633787805923, + "loss": 1.3623, + "step": 27777 + }, + { + "epoch": 0.36096244089512913, + "grad_norm": 0.3311925530433655, + "learning_rate": 0.00012783373841614788, + "loss": 1.4814, + "step": 27778 + }, + { + "epoch": 0.36097543543904503, + "grad_norm": 0.42169272899627686, + "learning_rate": 0.00012783113895423648, + "loss": 1.5797, + "step": 27779 + }, + { + "epoch": 0.3609884299829609, + "grad_norm": 0.4065757095813751, + "learning_rate": 0.0001278285394923251, + "loss": 1.3247, + "step": 27780 + }, + { + "epoch": 0.3610014245268768, + "grad_norm": 0.5299268960952759, + "learning_rate": 0.0001278259400304137, + "loss": 1.4152, + "step": 27781 + }, + { + "epoch": 0.3610144190707926, + "grad_norm": 0.4666915237903595, + "learning_rate": 0.00012782334056850233, + "loss": 1.6351, + "step": 27782 + }, + { + "epoch": 0.3610274136147085, + "grad_norm": 0.44816410541534424, + "learning_rate": 0.00012782074110659095, + "loss": 1.4535, + "step": 27783 + }, + { + "epoch": 0.36104040815862437, + "grad_norm": 0.37693458795547485, + "learning_rate": 0.00012781814164467955, + "loss": 1.2685, + "step": 27784 + }, + { + "epoch": 0.36105340270254027, + "grad_norm": 0.4154645800590515, + "learning_rate": 0.00012781554218276817, + "loss": 1.3166, + "step": 27785 + }, + { + "epoch": 0.3610663972464561, + "grad_norm": 0.3993780314922333, + "learning_rate": 0.0001278129427208568, + "loss": 1.4926, + "step": 27786 + }, + { + "epoch": 0.361079391790372, + "grad_norm": 0.40013110637664795, + "learning_rate": 0.00012781034325894542, + "loss": 1.4259, + "step": 27787 + }, + { + "epoch": 0.3610923863342879, + "grad_norm": 0.43420761823654175, + "learning_rate": 0.00012780774379703402, + "loss": 1.5246, + "step": 27788 + }, + { + "epoch": 0.36110538087820376, + "grad_norm": 0.3763276934623718, + "learning_rate": 0.00012780514433512262, + "loss": 1.4771, + "step": 27789 + }, + { + "epoch": 0.36111837542211966, + "grad_norm": 0.4337875247001648, + "learning_rate": 0.00012780254487321127, + "loss": 1.5364, + "step": 27790 + }, + { + "epoch": 0.3611313699660355, + "grad_norm": 0.397617906332016, + "learning_rate": 0.00012779994541129987, + "loss": 1.528, + "step": 27791 + }, + { + "epoch": 0.3611443645099514, + "grad_norm": 0.3005382716655731, + "learning_rate": 0.0001277973459493885, + "loss": 1.1786, + "step": 27792 + }, + { + "epoch": 0.36115735905386726, + "grad_norm": 0.378738135099411, + "learning_rate": 0.0001277947464874771, + "loss": 1.481, + "step": 27793 + }, + { + "epoch": 0.36117035359778316, + "grad_norm": 0.3815094232559204, + "learning_rate": 0.0001277921470255657, + "loss": 1.1579, + "step": 27794 + }, + { + "epoch": 0.361183348141699, + "grad_norm": 0.45481762290000916, + "learning_rate": 0.00012778954756365434, + "loss": 1.3996, + "step": 27795 + }, + { + "epoch": 0.3611963426856149, + "grad_norm": 0.38075512647628784, + "learning_rate": 0.00012778694810174293, + "loss": 1.4065, + "step": 27796 + }, + { + "epoch": 0.36120933722953075, + "grad_norm": 0.4106857180595398, + "learning_rate": 0.00012778434863983156, + "loss": 1.4362, + "step": 27797 + }, + { + "epoch": 0.36122233177344665, + "grad_norm": 0.4493132531642914, + "learning_rate": 0.00012778174917792018, + "loss": 1.5314, + "step": 27798 + }, + { + "epoch": 0.3612353263173625, + "grad_norm": 0.3695198893547058, + "learning_rate": 0.0001277791497160088, + "loss": 1.2503, + "step": 27799 + }, + { + "epoch": 0.3612483208612784, + "grad_norm": 0.3015180826187134, + "learning_rate": 0.0001277765502540974, + "loss": 1.2105, + "step": 27800 + }, + { + "epoch": 0.36126131540519424, + "grad_norm": 0.369755357503891, + "learning_rate": 0.000127773950792186, + "loss": 1.5278, + "step": 27801 + }, + { + "epoch": 0.36127430994911014, + "grad_norm": 0.37347927689552307, + "learning_rate": 0.00012777135133027465, + "loss": 1.522, + "step": 27802 + }, + { + "epoch": 0.361287304493026, + "grad_norm": 0.4142630398273468, + "learning_rate": 0.00012776875186836325, + "loss": 1.5668, + "step": 27803 + }, + { + "epoch": 0.3613002990369419, + "grad_norm": 0.4206830859184265, + "learning_rate": 0.00012776615240645188, + "loss": 1.3385, + "step": 27804 + }, + { + "epoch": 0.36131329358085773, + "grad_norm": 0.4634554982185364, + "learning_rate": 0.00012776355294454047, + "loss": 1.5281, + "step": 27805 + }, + { + "epoch": 0.36132628812477363, + "grad_norm": 0.44164320826530457, + "learning_rate": 0.0001277609534826291, + "loss": 1.4743, + "step": 27806 + }, + { + "epoch": 0.3613392826686895, + "grad_norm": 0.3963533937931061, + "learning_rate": 0.00012775835402071772, + "loss": 1.4595, + "step": 27807 + }, + { + "epoch": 0.3613522772126054, + "grad_norm": 0.3321821987628937, + "learning_rate": 0.00012775575455880632, + "loss": 1.4171, + "step": 27808 + }, + { + "epoch": 0.3613652717565212, + "grad_norm": 0.472731351852417, + "learning_rate": 0.00012775315509689494, + "loss": 1.4773, + "step": 27809 + }, + { + "epoch": 0.3613782663004371, + "grad_norm": 0.34022805094718933, + "learning_rate": 0.00012775055563498357, + "loss": 1.1777, + "step": 27810 + }, + { + "epoch": 0.36139126084435297, + "grad_norm": 0.4276570677757263, + "learning_rate": 0.0001277479561730722, + "loss": 1.5319, + "step": 27811 + }, + { + "epoch": 0.36140425538826887, + "grad_norm": 0.3602466285228729, + "learning_rate": 0.0001277453567111608, + "loss": 1.3432, + "step": 27812 + }, + { + "epoch": 0.3614172499321847, + "grad_norm": 0.35369378328323364, + "learning_rate": 0.0001277427572492494, + "loss": 1.2578, + "step": 27813 + }, + { + "epoch": 0.3614302444761006, + "grad_norm": 0.31022390723228455, + "learning_rate": 0.00012774015778733804, + "loss": 1.3267, + "step": 27814 + }, + { + "epoch": 0.36144323902001646, + "grad_norm": 0.3798324763774872, + "learning_rate": 0.00012773755832542664, + "loss": 1.3003, + "step": 27815 + }, + { + "epoch": 0.36145623356393236, + "grad_norm": 0.4148436188697815, + "learning_rate": 0.00012773495886351526, + "loss": 1.4778, + "step": 27816 + }, + { + "epoch": 0.3614692281078482, + "grad_norm": 0.44393473863601685, + "learning_rate": 0.00012773235940160389, + "loss": 1.5811, + "step": 27817 + }, + { + "epoch": 0.3614822226517641, + "grad_norm": 0.47508734464645386, + "learning_rate": 0.00012772975993969248, + "loss": 1.562, + "step": 27818 + }, + { + "epoch": 0.36149521719567995, + "grad_norm": 0.3382413387298584, + "learning_rate": 0.0001277271604777811, + "loss": 1.4004, + "step": 27819 + }, + { + "epoch": 0.36150821173959585, + "grad_norm": 0.3241707980632782, + "learning_rate": 0.0001277245610158697, + "loss": 1.094, + "step": 27820 + }, + { + "epoch": 0.3615212062835117, + "grad_norm": 0.2444005161523819, + "learning_rate": 0.00012772196155395836, + "loss": 1.3469, + "step": 27821 + }, + { + "epoch": 0.3615342008274276, + "grad_norm": 0.4090452790260315, + "learning_rate": 0.00012771936209204695, + "loss": 1.3498, + "step": 27822 + }, + { + "epoch": 0.36154719537134344, + "grad_norm": 0.48385652899742126, + "learning_rate": 0.00012771676263013558, + "loss": 1.4867, + "step": 27823 + }, + { + "epoch": 0.36156018991525934, + "grad_norm": 0.47417891025543213, + "learning_rate": 0.00012771416316822418, + "loss": 1.3614, + "step": 27824 + }, + { + "epoch": 0.3615731844591752, + "grad_norm": 0.4144386649131775, + "learning_rate": 0.0001277115637063128, + "loss": 1.439, + "step": 27825 + }, + { + "epoch": 0.3615861790030911, + "grad_norm": 0.38045093417167664, + "learning_rate": 0.00012770896424440142, + "loss": 1.4307, + "step": 27826 + }, + { + "epoch": 0.36159917354700694, + "grad_norm": 0.6519449949264526, + "learning_rate": 0.00012770636478249002, + "loss": 1.3359, + "step": 27827 + }, + { + "epoch": 0.36161216809092284, + "grad_norm": 0.4009718894958496, + "learning_rate": 0.00012770376532057865, + "loss": 1.3478, + "step": 27828 + }, + { + "epoch": 0.3616251626348387, + "grad_norm": 0.30755242705345154, + "learning_rate": 0.00012770116585866727, + "loss": 1.2132, + "step": 27829 + }, + { + "epoch": 0.3616381571787546, + "grad_norm": 0.478539377450943, + "learning_rate": 0.00012769856639675587, + "loss": 1.4943, + "step": 27830 + }, + { + "epoch": 0.3616511517226704, + "grad_norm": 0.32504621148109436, + "learning_rate": 0.0001276959669348445, + "loss": 1.4372, + "step": 27831 + }, + { + "epoch": 0.36166414626658633, + "grad_norm": 0.4168856143951416, + "learning_rate": 0.0001276933674729331, + "loss": 1.4452, + "step": 27832 + }, + { + "epoch": 0.3616771408105022, + "grad_norm": 0.42507296800613403, + "learning_rate": 0.00012769076801102174, + "loss": 1.5417, + "step": 27833 + }, + { + "epoch": 0.3616901353544181, + "grad_norm": 0.5024882555007935, + "learning_rate": 0.00012768816854911034, + "loss": 1.5667, + "step": 27834 + }, + { + "epoch": 0.3617031298983339, + "grad_norm": 0.40814459323883057, + "learning_rate": 0.00012768556908719896, + "loss": 1.3775, + "step": 27835 + }, + { + "epoch": 0.3617161244422498, + "grad_norm": 0.44818344712257385, + "learning_rate": 0.00012768296962528756, + "loss": 1.4195, + "step": 27836 + }, + { + "epoch": 0.36172911898616567, + "grad_norm": 0.5035160779953003, + "learning_rate": 0.00012768037016337619, + "loss": 1.3621, + "step": 27837 + }, + { + "epoch": 0.36174211353008157, + "grad_norm": 0.2922307848930359, + "learning_rate": 0.0001276777707014648, + "loss": 1.493, + "step": 27838 + }, + { + "epoch": 0.3617551080739974, + "grad_norm": 0.40868890285491943, + "learning_rate": 0.0001276751712395534, + "loss": 1.2985, + "step": 27839 + }, + { + "epoch": 0.3617681026179133, + "grad_norm": 0.35993722081184387, + "learning_rate": 0.00012767257177764203, + "loss": 1.3174, + "step": 27840 + }, + { + "epoch": 0.36178109716182916, + "grad_norm": 0.4102262556552887, + "learning_rate": 0.00012766997231573066, + "loss": 1.4433, + "step": 27841 + }, + { + "epoch": 0.36179409170574506, + "grad_norm": 0.3526569604873657, + "learning_rate": 0.00012766737285381928, + "loss": 1.5094, + "step": 27842 + }, + { + "epoch": 0.3618070862496609, + "grad_norm": 0.3262081444263458, + "learning_rate": 0.00012766477339190788, + "loss": 1.3418, + "step": 27843 + }, + { + "epoch": 0.3618200807935768, + "grad_norm": 0.5079373717308044, + "learning_rate": 0.00012766217392999648, + "loss": 1.4933, + "step": 27844 + }, + { + "epoch": 0.36183307533749265, + "grad_norm": 0.4362841546535492, + "learning_rate": 0.00012765957446808513, + "loss": 1.5118, + "step": 27845 + }, + { + "epoch": 0.36184606988140855, + "grad_norm": 0.3379189074039459, + "learning_rate": 0.00012765697500617372, + "loss": 1.4802, + "step": 27846 + }, + { + "epoch": 0.3618590644253244, + "grad_norm": 0.3922182321548462, + "learning_rate": 0.00012765437554426235, + "loss": 1.3502, + "step": 27847 + }, + { + "epoch": 0.3618720589692403, + "grad_norm": 0.4121309518814087, + "learning_rate": 0.00012765177608235095, + "loss": 1.4688, + "step": 27848 + }, + { + "epoch": 0.36188505351315614, + "grad_norm": 0.42894285917282104, + "learning_rate": 0.00012764917662043957, + "loss": 1.3544, + "step": 27849 + }, + { + "epoch": 0.36189804805707204, + "grad_norm": 0.43683096766471863, + "learning_rate": 0.0001276465771585282, + "loss": 1.3238, + "step": 27850 + }, + { + "epoch": 0.3619110426009879, + "grad_norm": 0.39007100462913513, + "learning_rate": 0.0001276439776966168, + "loss": 1.3881, + "step": 27851 + }, + { + "epoch": 0.3619240371449038, + "grad_norm": 0.3597339987754822, + "learning_rate": 0.00012764137823470542, + "loss": 1.3517, + "step": 27852 + }, + { + "epoch": 0.36193703168881963, + "grad_norm": 0.2994987368583679, + "learning_rate": 0.00012763877877279404, + "loss": 1.187, + "step": 27853 + }, + { + "epoch": 0.36195002623273553, + "grad_norm": 0.39084556698799133, + "learning_rate": 0.00012763617931088267, + "loss": 1.3499, + "step": 27854 + }, + { + "epoch": 0.3619630207766514, + "grad_norm": 0.4339793920516968, + "learning_rate": 0.00012763357984897126, + "loss": 1.2556, + "step": 27855 + }, + { + "epoch": 0.3619760153205673, + "grad_norm": 0.31620359420776367, + "learning_rate": 0.0001276309803870599, + "loss": 1.2077, + "step": 27856 + }, + { + "epoch": 0.3619890098644831, + "grad_norm": 0.5104015469551086, + "learning_rate": 0.0001276283809251485, + "loss": 1.4807, + "step": 27857 + }, + { + "epoch": 0.362002004408399, + "grad_norm": 0.5380813479423523, + "learning_rate": 0.0001276257814632371, + "loss": 1.4119, + "step": 27858 + }, + { + "epoch": 0.36201499895231487, + "grad_norm": 0.4350837767124176, + "learning_rate": 0.00012762318200132573, + "loss": 1.619, + "step": 27859 + }, + { + "epoch": 0.36202799349623077, + "grad_norm": 0.3735024034976959, + "learning_rate": 0.00012762058253941436, + "loss": 1.4184, + "step": 27860 + }, + { + "epoch": 0.3620409880401466, + "grad_norm": 0.39258772134780884, + "learning_rate": 0.00012761798307750296, + "loss": 1.382, + "step": 27861 + }, + { + "epoch": 0.3620539825840625, + "grad_norm": 0.35262545943260193, + "learning_rate": 0.00012761538361559158, + "loss": 1.2955, + "step": 27862 + }, + { + "epoch": 0.3620669771279784, + "grad_norm": 0.4838973879814148, + "learning_rate": 0.00012761278415368018, + "loss": 1.4091, + "step": 27863 + }, + { + "epoch": 0.36207997167189426, + "grad_norm": 0.4565093517303467, + "learning_rate": 0.00012761018469176883, + "loss": 1.3676, + "step": 27864 + }, + { + "epoch": 0.36209296621581016, + "grad_norm": 0.43859195709228516, + "learning_rate": 0.00012760758522985743, + "loss": 1.4905, + "step": 27865 + }, + { + "epoch": 0.362105960759726, + "grad_norm": 0.4944971203804016, + "learning_rate": 0.00012760498576794605, + "loss": 1.5892, + "step": 27866 + }, + { + "epoch": 0.3621189553036419, + "grad_norm": 0.39383307099342346, + "learning_rate": 0.00012760238630603465, + "loss": 1.4989, + "step": 27867 + }, + { + "epoch": 0.36213194984755775, + "grad_norm": 0.519035816192627, + "learning_rate": 0.00012759978684412327, + "loss": 1.4191, + "step": 27868 + }, + { + "epoch": 0.36214494439147366, + "grad_norm": 0.4462484121322632, + "learning_rate": 0.0001275971873822119, + "loss": 1.3063, + "step": 27869 + }, + { + "epoch": 0.3621579389353895, + "grad_norm": 0.35490527749061584, + "learning_rate": 0.0001275945879203005, + "loss": 1.3366, + "step": 27870 + }, + { + "epoch": 0.3621709334793054, + "grad_norm": 0.45181557536125183, + "learning_rate": 0.00012759198845838912, + "loss": 1.2444, + "step": 27871 + }, + { + "epoch": 0.36218392802322125, + "grad_norm": 0.4016639292240143, + "learning_rate": 0.00012758938899647774, + "loss": 1.4877, + "step": 27872 + }, + { + "epoch": 0.36219692256713715, + "grad_norm": 0.340221643447876, + "learning_rate": 0.00012758678953456634, + "loss": 1.3932, + "step": 27873 + }, + { + "epoch": 0.362209917111053, + "grad_norm": 0.47415557503700256, + "learning_rate": 0.00012758419007265497, + "loss": 1.3907, + "step": 27874 + }, + { + "epoch": 0.3622229116549689, + "grad_norm": 0.3619421720504761, + "learning_rate": 0.00012758159061074356, + "loss": 1.2047, + "step": 27875 + }, + { + "epoch": 0.36223590619888474, + "grad_norm": 0.3746826946735382, + "learning_rate": 0.00012757899114883221, + "loss": 1.5186, + "step": 27876 + }, + { + "epoch": 0.36224890074280064, + "grad_norm": 0.4093152582645416, + "learning_rate": 0.0001275763916869208, + "loss": 1.3872, + "step": 27877 + }, + { + "epoch": 0.3622618952867165, + "grad_norm": 0.4057905673980713, + "learning_rate": 0.00012757379222500944, + "loss": 1.473, + "step": 27878 + }, + { + "epoch": 0.3622748898306324, + "grad_norm": 0.5744732618331909, + "learning_rate": 0.00012757119276309803, + "loss": 1.4239, + "step": 27879 + }, + { + "epoch": 0.36228788437454823, + "grad_norm": 0.39384716749191284, + "learning_rate": 0.00012756859330118666, + "loss": 1.3799, + "step": 27880 + }, + { + "epoch": 0.36230087891846413, + "grad_norm": 0.35178133845329285, + "learning_rate": 0.00012756599383927528, + "loss": 1.321, + "step": 27881 + }, + { + "epoch": 0.36231387346238, + "grad_norm": 0.4607468843460083, + "learning_rate": 0.00012756339437736388, + "loss": 1.5119, + "step": 27882 + }, + { + "epoch": 0.3623268680062959, + "grad_norm": 0.3953493535518646, + "learning_rate": 0.0001275607949154525, + "loss": 1.3561, + "step": 27883 + }, + { + "epoch": 0.3623398625502117, + "grad_norm": 0.4084276258945465, + "learning_rate": 0.00012755819545354113, + "loss": 1.5949, + "step": 27884 + }, + { + "epoch": 0.3623528570941276, + "grad_norm": 0.4487241804599762, + "learning_rate": 0.00012755559599162973, + "loss": 1.483, + "step": 27885 + }, + { + "epoch": 0.36236585163804347, + "grad_norm": 0.3903324604034424, + "learning_rate": 0.00012755299652971835, + "loss": 1.3356, + "step": 27886 + }, + { + "epoch": 0.36237884618195937, + "grad_norm": 0.39339789748191833, + "learning_rate": 0.00012755039706780695, + "loss": 1.3676, + "step": 27887 + }, + { + "epoch": 0.3623918407258752, + "grad_norm": 0.36171844601631165, + "learning_rate": 0.0001275477976058956, + "loss": 1.3902, + "step": 27888 + }, + { + "epoch": 0.3624048352697911, + "grad_norm": 0.4454975724220276, + "learning_rate": 0.0001275451981439842, + "loss": 1.3918, + "step": 27889 + }, + { + "epoch": 0.36241782981370696, + "grad_norm": 0.43110841512680054, + "learning_rate": 0.00012754259868207282, + "loss": 1.5098, + "step": 27890 + }, + { + "epoch": 0.36243082435762286, + "grad_norm": 0.38996773958206177, + "learning_rate": 0.00012753999922016145, + "loss": 1.5115, + "step": 27891 + }, + { + "epoch": 0.3624438189015387, + "grad_norm": 0.31067830324172974, + "learning_rate": 0.00012753739975825004, + "loss": 1.4401, + "step": 27892 + }, + { + "epoch": 0.3624568134454546, + "grad_norm": 0.43410617113113403, + "learning_rate": 0.00012753480029633867, + "loss": 1.1939, + "step": 27893 + }, + { + "epoch": 0.36246980798937045, + "grad_norm": 0.3915244936943054, + "learning_rate": 0.00012753220083442727, + "loss": 1.5003, + "step": 27894 + }, + { + "epoch": 0.36248280253328635, + "grad_norm": 0.2988142967224121, + "learning_rate": 0.00012752960137251592, + "loss": 1.3511, + "step": 27895 + }, + { + "epoch": 0.3624957970772022, + "grad_norm": 0.3417602479457855, + "learning_rate": 0.00012752700191060451, + "loss": 1.1653, + "step": 27896 + }, + { + "epoch": 0.3625087916211181, + "grad_norm": 0.4114759862422943, + "learning_rate": 0.0001275244024486931, + "loss": 1.4774, + "step": 27897 + }, + { + "epoch": 0.36252178616503394, + "grad_norm": 0.3979776203632355, + "learning_rate": 0.00012752180298678174, + "loss": 1.4565, + "step": 27898 + }, + { + "epoch": 0.36253478070894984, + "grad_norm": 0.5564953088760376, + "learning_rate": 0.00012751920352487036, + "loss": 1.498, + "step": 27899 + }, + { + "epoch": 0.3625477752528657, + "grad_norm": 0.3537488877773285, + "learning_rate": 0.00012751660406295899, + "loss": 1.444, + "step": 27900 + }, + { + "epoch": 0.3625607697967816, + "grad_norm": 0.46747323870658875, + "learning_rate": 0.00012751400460104758, + "loss": 1.2902, + "step": 27901 + }, + { + "epoch": 0.36257376434069744, + "grad_norm": 0.37444618344306946, + "learning_rate": 0.0001275114051391362, + "loss": 1.5085, + "step": 27902 + }, + { + "epoch": 0.36258675888461334, + "grad_norm": 0.4861817955970764, + "learning_rate": 0.00012750880567722483, + "loss": 1.4716, + "step": 27903 + }, + { + "epoch": 0.3625997534285292, + "grad_norm": 0.41579338908195496, + "learning_rate": 0.00012750620621531343, + "loss": 1.3574, + "step": 27904 + }, + { + "epoch": 0.3626127479724451, + "grad_norm": 0.47768378257751465, + "learning_rate": 0.00012750360675340205, + "loss": 1.3915, + "step": 27905 + }, + { + "epoch": 0.3626257425163609, + "grad_norm": 0.24530890583992004, + "learning_rate": 0.00012750100729149065, + "loss": 1.3806, + "step": 27906 + }, + { + "epoch": 0.36263873706027683, + "grad_norm": 0.3343988358974457, + "learning_rate": 0.0001274984078295793, + "loss": 1.2221, + "step": 27907 + }, + { + "epoch": 0.3626517316041927, + "grad_norm": 0.3423810601234436, + "learning_rate": 0.0001274958083676679, + "loss": 1.1695, + "step": 27908 + }, + { + "epoch": 0.3626647261481086, + "grad_norm": 0.4304436147212982, + "learning_rate": 0.00012749320890575652, + "loss": 1.2429, + "step": 27909 + }, + { + "epoch": 0.3626777206920244, + "grad_norm": 0.34206873178482056, + "learning_rate": 0.00012749060944384512, + "loss": 1.2436, + "step": 27910 + }, + { + "epoch": 0.3626907152359403, + "grad_norm": 0.468522310256958, + "learning_rate": 0.00012748800998193375, + "loss": 1.3922, + "step": 27911 + }, + { + "epoch": 0.36270370977985616, + "grad_norm": 0.4226739704608917, + "learning_rate": 0.00012748541052002237, + "loss": 1.4326, + "step": 27912 + }, + { + "epoch": 0.36271670432377207, + "grad_norm": 0.34291574358940125, + "learning_rate": 0.00012748281105811097, + "loss": 1.5153, + "step": 27913 + }, + { + "epoch": 0.3627296988676879, + "grad_norm": 0.44214433431625366, + "learning_rate": 0.0001274802115961996, + "loss": 1.3395, + "step": 27914 + }, + { + "epoch": 0.3627426934116038, + "grad_norm": 0.4218233823776245, + "learning_rate": 0.00012747761213428822, + "loss": 1.5031, + "step": 27915 + }, + { + "epoch": 0.36275568795551966, + "grad_norm": 0.4764331579208374, + "learning_rate": 0.00012747501267237681, + "loss": 1.414, + "step": 27916 + }, + { + "epoch": 0.36276868249943556, + "grad_norm": 0.40904611349105835, + "learning_rate": 0.00012747241321046544, + "loss": 1.3522, + "step": 27917 + }, + { + "epoch": 0.3627816770433514, + "grad_norm": 0.46175211668014526, + "learning_rate": 0.00012746981374855404, + "loss": 1.4644, + "step": 27918 + }, + { + "epoch": 0.3627946715872673, + "grad_norm": 0.4082449674606323, + "learning_rate": 0.0001274672142866427, + "loss": 1.4438, + "step": 27919 + }, + { + "epoch": 0.36280766613118315, + "grad_norm": 0.7580820918083191, + "learning_rate": 0.00012746461482473129, + "loss": 1.3861, + "step": 27920 + }, + { + "epoch": 0.36282066067509905, + "grad_norm": 0.43384575843811035, + "learning_rate": 0.0001274620153628199, + "loss": 1.467, + "step": 27921 + }, + { + "epoch": 0.3628336552190149, + "grad_norm": 0.47677040100097656, + "learning_rate": 0.0001274594159009085, + "loss": 1.4714, + "step": 27922 + }, + { + "epoch": 0.3628466497629308, + "grad_norm": 0.3647285997867584, + "learning_rate": 0.00012745681643899713, + "loss": 1.3462, + "step": 27923 + }, + { + "epoch": 0.36285964430684664, + "grad_norm": 0.3311726152896881, + "learning_rate": 0.00012745421697708576, + "loss": 1.2766, + "step": 27924 + }, + { + "epoch": 0.36287263885076254, + "grad_norm": 0.43895748257637024, + "learning_rate": 0.00012745161751517435, + "loss": 1.2864, + "step": 27925 + }, + { + "epoch": 0.3628856333946784, + "grad_norm": 0.3090461492538452, + "learning_rate": 0.00012744901805326298, + "loss": 1.2642, + "step": 27926 + }, + { + "epoch": 0.3628986279385943, + "grad_norm": 0.47124722599983215, + "learning_rate": 0.0001274464185913516, + "loss": 1.1927, + "step": 27927 + }, + { + "epoch": 0.36291162248251013, + "grad_norm": 0.3850697875022888, + "learning_rate": 0.0001274438191294402, + "loss": 1.6199, + "step": 27928 + }, + { + "epoch": 0.36292461702642603, + "grad_norm": 0.4305574297904968, + "learning_rate": 0.00012744121966752882, + "loss": 1.6524, + "step": 27929 + }, + { + "epoch": 0.3629376115703419, + "grad_norm": 0.3600034713745117, + "learning_rate": 0.00012743862020561745, + "loss": 1.2012, + "step": 27930 + }, + { + "epoch": 0.3629506061142578, + "grad_norm": 0.3226478397846222, + "learning_rate": 0.00012743602074370607, + "loss": 1.4266, + "step": 27931 + }, + { + "epoch": 0.3629636006581736, + "grad_norm": 0.44012942910194397, + "learning_rate": 0.00012743342128179467, + "loss": 1.3038, + "step": 27932 + }, + { + "epoch": 0.3629765952020895, + "grad_norm": 0.4136284291744232, + "learning_rate": 0.0001274308218198833, + "loss": 1.3312, + "step": 27933 + }, + { + "epoch": 0.36298958974600537, + "grad_norm": 0.4837016463279724, + "learning_rate": 0.00012742822235797192, + "loss": 1.385, + "step": 27934 + }, + { + "epoch": 0.36300258428992127, + "grad_norm": 0.2655482590198517, + "learning_rate": 0.00012742562289606052, + "loss": 1.2812, + "step": 27935 + }, + { + "epoch": 0.3630155788338371, + "grad_norm": 0.4441456198692322, + "learning_rate": 0.00012742302343414914, + "loss": 1.3846, + "step": 27936 + }, + { + "epoch": 0.363028573377753, + "grad_norm": 0.4971577823162079, + "learning_rate": 0.00012742042397223774, + "loss": 1.4031, + "step": 27937 + }, + { + "epoch": 0.36304156792166886, + "grad_norm": 0.3880029320716858, + "learning_rate": 0.0001274178245103264, + "loss": 1.2369, + "step": 27938 + }, + { + "epoch": 0.36305456246558476, + "grad_norm": 0.4071570932865143, + "learning_rate": 0.000127415225048415, + "loss": 1.3813, + "step": 27939 + }, + { + "epoch": 0.36306755700950066, + "grad_norm": 0.3111426830291748, + "learning_rate": 0.00012741262558650359, + "loss": 1.1114, + "step": 27940 + }, + { + "epoch": 0.3630805515534165, + "grad_norm": 0.2971980571746826, + "learning_rate": 0.0001274100261245922, + "loss": 1.3144, + "step": 27941 + }, + { + "epoch": 0.3630935460973324, + "grad_norm": 0.4115547239780426, + "learning_rate": 0.00012740742666268083, + "loss": 1.1417, + "step": 27942 + }, + { + "epoch": 0.36310654064124825, + "grad_norm": 0.3459772765636444, + "learning_rate": 0.00012740482720076946, + "loss": 1.502, + "step": 27943 + }, + { + "epoch": 0.36311953518516416, + "grad_norm": 0.3610880374908447, + "learning_rate": 0.00012740222773885806, + "loss": 1.2536, + "step": 27944 + }, + { + "epoch": 0.36313252972908, + "grad_norm": 0.3762784004211426, + "learning_rate": 0.00012739962827694668, + "loss": 1.2442, + "step": 27945 + }, + { + "epoch": 0.3631455242729959, + "grad_norm": 0.34537798166275024, + "learning_rate": 0.0001273970288150353, + "loss": 1.3429, + "step": 27946 + }, + { + "epoch": 0.36315851881691175, + "grad_norm": 0.4199179708957672, + "learning_rate": 0.0001273944293531239, + "loss": 1.2794, + "step": 27947 + }, + { + "epoch": 0.36317151336082765, + "grad_norm": 0.4479343593120575, + "learning_rate": 0.00012739182989121253, + "loss": 1.3417, + "step": 27948 + }, + { + "epoch": 0.3631845079047435, + "grad_norm": 0.49493348598480225, + "learning_rate": 0.00012738923042930112, + "loss": 1.3672, + "step": 27949 + }, + { + "epoch": 0.3631975024486594, + "grad_norm": 0.46684977412223816, + "learning_rate": 0.00012738663096738978, + "loss": 1.5311, + "step": 27950 + }, + { + "epoch": 0.36321049699257524, + "grad_norm": 0.39218395948410034, + "learning_rate": 0.00012738403150547837, + "loss": 1.4725, + "step": 27951 + }, + { + "epoch": 0.36322349153649114, + "grad_norm": 0.4167647659778595, + "learning_rate": 0.00012738143204356697, + "loss": 1.4079, + "step": 27952 + }, + { + "epoch": 0.363236486080407, + "grad_norm": 0.5562061667442322, + "learning_rate": 0.0001273788325816556, + "loss": 1.4342, + "step": 27953 + }, + { + "epoch": 0.3632494806243229, + "grad_norm": 0.42765992879867554, + "learning_rate": 0.00012737623311974422, + "loss": 1.444, + "step": 27954 + }, + { + "epoch": 0.36326247516823873, + "grad_norm": 0.3909742832183838, + "learning_rate": 0.00012737363365783284, + "loss": 1.4311, + "step": 27955 + }, + { + "epoch": 0.36327546971215463, + "grad_norm": 0.4262784421443939, + "learning_rate": 0.00012737103419592144, + "loss": 1.4109, + "step": 27956 + }, + { + "epoch": 0.3632884642560705, + "grad_norm": 0.42134782671928406, + "learning_rate": 0.00012736843473401007, + "loss": 1.4371, + "step": 27957 + }, + { + "epoch": 0.3633014587999864, + "grad_norm": 0.45517075061798096, + "learning_rate": 0.0001273658352720987, + "loss": 1.501, + "step": 27958 + }, + { + "epoch": 0.3633144533439022, + "grad_norm": 0.4511476457118988, + "learning_rate": 0.0001273632358101873, + "loss": 1.5152, + "step": 27959 + }, + { + "epoch": 0.3633274478878181, + "grad_norm": 0.4538951516151428, + "learning_rate": 0.0001273606363482759, + "loss": 1.3827, + "step": 27960 + }, + { + "epoch": 0.36334044243173397, + "grad_norm": 0.3794161379337311, + "learning_rate": 0.0001273580368863645, + "loss": 1.4722, + "step": 27961 + }, + { + "epoch": 0.36335343697564987, + "grad_norm": 0.44694384932518005, + "learning_rate": 0.00012735543742445316, + "loss": 1.3154, + "step": 27962 + }, + { + "epoch": 0.3633664315195657, + "grad_norm": 0.30698803067207336, + "learning_rate": 0.00012735283796254176, + "loss": 1.2275, + "step": 27963 + }, + { + "epoch": 0.3633794260634816, + "grad_norm": 0.3932294547557831, + "learning_rate": 0.00012735023850063038, + "loss": 1.3938, + "step": 27964 + }, + { + "epoch": 0.36339242060739746, + "grad_norm": 0.48068439960479736, + "learning_rate": 0.000127347639038719, + "loss": 1.5854, + "step": 27965 + }, + { + "epoch": 0.36340541515131336, + "grad_norm": 0.5042515397071838, + "learning_rate": 0.0001273450395768076, + "loss": 1.4511, + "step": 27966 + }, + { + "epoch": 0.3634184096952292, + "grad_norm": 0.4765413999557495, + "learning_rate": 0.00012734244011489623, + "loss": 1.3311, + "step": 27967 + }, + { + "epoch": 0.3634314042391451, + "grad_norm": 0.3270588219165802, + "learning_rate": 0.00012733984065298483, + "loss": 1.4466, + "step": 27968 + }, + { + "epoch": 0.36344439878306095, + "grad_norm": 0.3830268979072571, + "learning_rate": 0.00012733724119107345, + "loss": 1.3375, + "step": 27969 + }, + { + "epoch": 0.36345739332697685, + "grad_norm": 0.31781959533691406, + "learning_rate": 0.00012733464172916208, + "loss": 1.2993, + "step": 27970 + }, + { + "epoch": 0.3634703878708927, + "grad_norm": 0.4149746894836426, + "learning_rate": 0.00012733204226725067, + "loss": 1.3424, + "step": 27971 + }, + { + "epoch": 0.3634833824148086, + "grad_norm": 0.3852953612804413, + "learning_rate": 0.0001273294428053393, + "loss": 1.4706, + "step": 27972 + }, + { + "epoch": 0.36349637695872444, + "grad_norm": 0.3324621021747589, + "learning_rate": 0.00012732684334342792, + "loss": 1.4881, + "step": 27973 + }, + { + "epoch": 0.36350937150264034, + "grad_norm": 0.41167035698890686, + "learning_rate": 0.00012732424388151655, + "loss": 1.3677, + "step": 27974 + }, + { + "epoch": 0.3635223660465562, + "grad_norm": 0.412930428981781, + "learning_rate": 0.00012732164441960514, + "loss": 1.3112, + "step": 27975 + }, + { + "epoch": 0.3635353605904721, + "grad_norm": 0.3955928385257721, + "learning_rate": 0.00012731904495769377, + "loss": 1.6119, + "step": 27976 + }, + { + "epoch": 0.36354835513438793, + "grad_norm": 0.47529542446136475, + "learning_rate": 0.0001273164454957824, + "loss": 1.2549, + "step": 27977 + }, + { + "epoch": 0.36356134967830384, + "grad_norm": 0.5175417065620422, + "learning_rate": 0.000127313846033871, + "loss": 1.579, + "step": 27978 + }, + { + "epoch": 0.3635743442222197, + "grad_norm": 0.3789021670818329, + "learning_rate": 0.00012731124657195962, + "loss": 1.4066, + "step": 27979 + }, + { + "epoch": 0.3635873387661356, + "grad_norm": 0.442864328622818, + "learning_rate": 0.0001273086471100482, + "loss": 1.3544, + "step": 27980 + }, + { + "epoch": 0.3636003333100514, + "grad_norm": 0.3379141092300415, + "learning_rate": 0.00012730604764813684, + "loss": 1.2663, + "step": 27981 + }, + { + "epoch": 0.3636133278539673, + "grad_norm": 0.3805772662162781, + "learning_rate": 0.00012730344818622546, + "loss": 1.6819, + "step": 27982 + }, + { + "epoch": 0.3636263223978832, + "grad_norm": 0.3813496232032776, + "learning_rate": 0.00012730084872431406, + "loss": 1.4624, + "step": 27983 + }, + { + "epoch": 0.3636393169417991, + "grad_norm": 0.42601850628852844, + "learning_rate": 0.00012729824926240268, + "loss": 1.548, + "step": 27984 + }, + { + "epoch": 0.3636523114857149, + "grad_norm": 0.3615221083164215, + "learning_rate": 0.0001272956498004913, + "loss": 1.3247, + "step": 27985 + }, + { + "epoch": 0.3636653060296308, + "grad_norm": 0.3807869851589203, + "learning_rate": 0.00012729305033857993, + "loss": 1.4613, + "step": 27986 + }, + { + "epoch": 0.36367830057354666, + "grad_norm": 0.38697120547294617, + "learning_rate": 0.00012729045087666853, + "loss": 1.4758, + "step": 27987 + }, + { + "epoch": 0.36369129511746257, + "grad_norm": 0.4261217415332794, + "learning_rate": 0.00012728785141475715, + "loss": 1.4357, + "step": 27988 + }, + { + "epoch": 0.3637042896613784, + "grad_norm": 0.35582613945007324, + "learning_rate": 0.00012728525195284578, + "loss": 1.3145, + "step": 27989 + }, + { + "epoch": 0.3637172842052943, + "grad_norm": 0.39728814363479614, + "learning_rate": 0.00012728265249093438, + "loss": 1.562, + "step": 27990 + }, + { + "epoch": 0.36373027874921016, + "grad_norm": 0.40408873558044434, + "learning_rate": 0.000127280053029023, + "loss": 1.4113, + "step": 27991 + }, + { + "epoch": 0.36374327329312606, + "grad_norm": 0.5270810723304749, + "learning_rate": 0.0001272774535671116, + "loss": 1.3387, + "step": 27992 + }, + { + "epoch": 0.3637562678370419, + "grad_norm": 0.3873375356197357, + "learning_rate": 0.00012727485410520025, + "loss": 1.5123, + "step": 27993 + }, + { + "epoch": 0.3637692623809578, + "grad_norm": 0.41070127487182617, + "learning_rate": 0.00012727225464328885, + "loss": 1.2837, + "step": 27994 + }, + { + "epoch": 0.36378225692487365, + "grad_norm": 0.3857309818267822, + "learning_rate": 0.00012726965518137744, + "loss": 1.2547, + "step": 27995 + }, + { + "epoch": 0.36379525146878955, + "grad_norm": 0.420284628868103, + "learning_rate": 0.00012726705571946607, + "loss": 1.2891, + "step": 27996 + }, + { + "epoch": 0.3638082460127054, + "grad_norm": 0.4320487380027771, + "learning_rate": 0.0001272644562575547, + "loss": 1.4931, + "step": 27997 + }, + { + "epoch": 0.3638212405566213, + "grad_norm": 0.309845894575119, + "learning_rate": 0.00012726185679564332, + "loss": 1.1223, + "step": 27998 + }, + { + "epoch": 0.36383423510053714, + "grad_norm": 0.4259468615055084, + "learning_rate": 0.00012725925733373192, + "loss": 1.491, + "step": 27999 + }, + { + "epoch": 0.36384722964445304, + "grad_norm": 0.48759549856185913, + "learning_rate": 0.00012725665787182054, + "loss": 1.5763, + "step": 28000 + }, + { + "epoch": 0.3638602241883689, + "grad_norm": 0.4181251525878906, + "learning_rate": 0.00012725405840990916, + "loss": 1.2331, + "step": 28001 + }, + { + "epoch": 0.3638732187322848, + "grad_norm": 0.396205335855484, + "learning_rate": 0.00012725145894799776, + "loss": 1.2252, + "step": 28002 + }, + { + "epoch": 0.36388621327620063, + "grad_norm": 0.4210398495197296, + "learning_rate": 0.00012724885948608639, + "loss": 1.3981, + "step": 28003 + }, + { + "epoch": 0.36389920782011653, + "grad_norm": 0.40787240862846375, + "learning_rate": 0.000127246260024175, + "loss": 1.3534, + "step": 28004 + }, + { + "epoch": 0.3639122023640324, + "grad_norm": 0.39276304841041565, + "learning_rate": 0.00012724366056226363, + "loss": 1.3414, + "step": 28005 + }, + { + "epoch": 0.3639251969079483, + "grad_norm": 0.45182257890701294, + "learning_rate": 0.00012724106110035223, + "loss": 1.5151, + "step": 28006 + }, + { + "epoch": 0.3639381914518641, + "grad_norm": 0.4759543836116791, + "learning_rate": 0.00012723846163844083, + "loss": 1.3491, + "step": 28007 + }, + { + "epoch": 0.36395118599578, + "grad_norm": 0.48742738366127014, + "learning_rate": 0.00012723586217652948, + "loss": 1.4055, + "step": 28008 + }, + { + "epoch": 0.36396418053969587, + "grad_norm": 0.3423019349575043, + "learning_rate": 0.00012723326271461808, + "loss": 1.3521, + "step": 28009 + }, + { + "epoch": 0.36397717508361177, + "grad_norm": 0.37417492270469666, + "learning_rate": 0.0001272306632527067, + "loss": 1.2074, + "step": 28010 + }, + { + "epoch": 0.3639901696275276, + "grad_norm": 0.4327795207500458, + "learning_rate": 0.0001272280637907953, + "loss": 1.3384, + "step": 28011 + }, + { + "epoch": 0.3640031641714435, + "grad_norm": 0.3741166591644287, + "learning_rate": 0.00012722546432888392, + "loss": 1.3832, + "step": 28012 + }, + { + "epoch": 0.36401615871535936, + "grad_norm": 0.32205912470817566, + "learning_rate": 0.00012722286486697255, + "loss": 1.3758, + "step": 28013 + }, + { + "epoch": 0.36402915325927526, + "grad_norm": 0.43938589096069336, + "learning_rate": 0.00012722026540506115, + "loss": 1.3601, + "step": 28014 + }, + { + "epoch": 0.36404214780319116, + "grad_norm": 0.3736664056777954, + "learning_rate": 0.00012721766594314977, + "loss": 1.3785, + "step": 28015 + }, + { + "epoch": 0.364055142347107, + "grad_norm": 0.4308818280696869, + "learning_rate": 0.0001272150664812384, + "loss": 1.3639, + "step": 28016 + }, + { + "epoch": 0.3640681368910229, + "grad_norm": 0.36459285020828247, + "learning_rate": 0.00012721246701932702, + "loss": 1.2955, + "step": 28017 + }, + { + "epoch": 0.36408113143493875, + "grad_norm": 0.3885711133480072, + "learning_rate": 0.00012720986755741562, + "loss": 1.2944, + "step": 28018 + }, + { + "epoch": 0.36409412597885465, + "grad_norm": 0.37572920322418213, + "learning_rate": 0.00012720726809550422, + "loss": 1.4485, + "step": 28019 + }, + { + "epoch": 0.3641071205227705, + "grad_norm": 0.41908925771713257, + "learning_rate": 0.00012720466863359287, + "loss": 1.2875, + "step": 28020 + }, + { + "epoch": 0.3641201150666864, + "grad_norm": 0.41559898853302, + "learning_rate": 0.00012720206917168146, + "loss": 1.5178, + "step": 28021 + }, + { + "epoch": 0.36413310961060225, + "grad_norm": 0.5386263132095337, + "learning_rate": 0.0001271994697097701, + "loss": 1.363, + "step": 28022 + }, + { + "epoch": 0.36414610415451815, + "grad_norm": 0.35852131247520447, + "learning_rate": 0.00012719687024785869, + "loss": 1.4462, + "step": 28023 + }, + { + "epoch": 0.364159098698434, + "grad_norm": 0.3842683732509613, + "learning_rate": 0.0001271942707859473, + "loss": 1.368, + "step": 28024 + }, + { + "epoch": 0.3641720932423499, + "grad_norm": 0.3549666106700897, + "learning_rate": 0.00012719167132403593, + "loss": 1.4171, + "step": 28025 + }, + { + "epoch": 0.36418508778626574, + "grad_norm": 0.5157406330108643, + "learning_rate": 0.00012718907186212453, + "loss": 1.561, + "step": 28026 + }, + { + "epoch": 0.36419808233018164, + "grad_norm": 0.31362292170524597, + "learning_rate": 0.00012718647240021316, + "loss": 1.2652, + "step": 28027 + }, + { + "epoch": 0.3642110768740975, + "grad_norm": 0.3563797175884247, + "learning_rate": 0.00012718387293830178, + "loss": 1.4917, + "step": 28028 + }, + { + "epoch": 0.3642240714180134, + "grad_norm": 0.3759686052799225, + "learning_rate": 0.0001271812734763904, + "loss": 1.5185, + "step": 28029 + }, + { + "epoch": 0.36423706596192923, + "grad_norm": 0.3824428617954254, + "learning_rate": 0.000127178674014479, + "loss": 1.6421, + "step": 28030 + }, + { + "epoch": 0.36425006050584513, + "grad_norm": 0.48471498489379883, + "learning_rate": 0.00012717607455256763, + "loss": 1.1653, + "step": 28031 + }, + { + "epoch": 0.364263055049761, + "grad_norm": 0.4868031144142151, + "learning_rate": 0.00012717347509065625, + "loss": 1.4659, + "step": 28032 + }, + { + "epoch": 0.3642760495936769, + "grad_norm": 0.24223241209983826, + "learning_rate": 0.00012717087562874485, + "loss": 1.2754, + "step": 28033 + }, + { + "epoch": 0.3642890441375927, + "grad_norm": 0.4007316529750824, + "learning_rate": 0.00012716827616683347, + "loss": 1.4847, + "step": 28034 + }, + { + "epoch": 0.3643020386815086, + "grad_norm": 0.42385217547416687, + "learning_rate": 0.00012716567670492207, + "loss": 1.5243, + "step": 28035 + }, + { + "epoch": 0.36431503322542447, + "grad_norm": 0.37853702902793884, + "learning_rate": 0.0001271630772430107, + "loss": 1.332, + "step": 28036 + }, + { + "epoch": 0.36432802776934037, + "grad_norm": 0.44703051447868347, + "learning_rate": 0.00012716047778109932, + "loss": 1.5835, + "step": 28037 + }, + { + "epoch": 0.3643410223132562, + "grad_norm": 0.38588809967041016, + "learning_rate": 0.00012715787831918792, + "loss": 1.3347, + "step": 28038 + }, + { + "epoch": 0.3643540168571721, + "grad_norm": 0.4064985513687134, + "learning_rate": 0.00012715527885727657, + "loss": 1.4309, + "step": 28039 + }, + { + "epoch": 0.36436701140108796, + "grad_norm": 0.5217446088790894, + "learning_rate": 0.00012715267939536517, + "loss": 1.4052, + "step": 28040 + }, + { + "epoch": 0.36438000594500386, + "grad_norm": 0.40893876552581787, + "learning_rate": 0.0001271500799334538, + "loss": 1.5905, + "step": 28041 + }, + { + "epoch": 0.3643930004889197, + "grad_norm": 0.3365531861782074, + "learning_rate": 0.0001271474804715424, + "loss": 1.1859, + "step": 28042 + }, + { + "epoch": 0.3644059950328356, + "grad_norm": 0.37935712933540344, + "learning_rate": 0.000127144881009631, + "loss": 1.4425, + "step": 28043 + }, + { + "epoch": 0.36441898957675145, + "grad_norm": 0.31487390398979187, + "learning_rate": 0.00012714228154771964, + "loss": 1.3752, + "step": 28044 + }, + { + "epoch": 0.36443198412066735, + "grad_norm": 0.44216734170913696, + "learning_rate": 0.00012713968208580823, + "loss": 1.3929, + "step": 28045 + }, + { + "epoch": 0.3644449786645832, + "grad_norm": 0.36743104457855225, + "learning_rate": 0.00012713708262389686, + "loss": 1.3501, + "step": 28046 + }, + { + "epoch": 0.3644579732084991, + "grad_norm": 0.37534141540527344, + "learning_rate": 0.00012713448316198548, + "loss": 1.5126, + "step": 28047 + }, + { + "epoch": 0.36447096775241494, + "grad_norm": 0.418866902589798, + "learning_rate": 0.0001271318837000741, + "loss": 1.3158, + "step": 28048 + }, + { + "epoch": 0.36448396229633084, + "grad_norm": 0.37248414754867554, + "learning_rate": 0.0001271292842381627, + "loss": 1.3707, + "step": 28049 + }, + { + "epoch": 0.3644969568402467, + "grad_norm": 0.3232172727584839, + "learning_rate": 0.0001271266847762513, + "loss": 1.2997, + "step": 28050 + }, + { + "epoch": 0.3645099513841626, + "grad_norm": 0.4679691195487976, + "learning_rate": 0.00012712408531433995, + "loss": 1.4489, + "step": 28051 + }, + { + "epoch": 0.36452294592807843, + "grad_norm": 0.4077663719654083, + "learning_rate": 0.00012712148585242855, + "loss": 1.3671, + "step": 28052 + }, + { + "epoch": 0.36453594047199434, + "grad_norm": 0.4671263098716736, + "learning_rate": 0.00012711888639051718, + "loss": 1.3064, + "step": 28053 + }, + { + "epoch": 0.3645489350159102, + "grad_norm": 0.37598350644111633, + "learning_rate": 0.00012711628692860577, + "loss": 1.505, + "step": 28054 + }, + { + "epoch": 0.3645619295598261, + "grad_norm": 0.3588161766529083, + "learning_rate": 0.0001271136874666944, + "loss": 1.2779, + "step": 28055 + }, + { + "epoch": 0.3645749241037419, + "grad_norm": 0.3823160231113434, + "learning_rate": 0.00012711108800478302, + "loss": 1.387, + "step": 28056 + }, + { + "epoch": 0.3645879186476578, + "grad_norm": 0.4202607572078705, + "learning_rate": 0.00012710848854287162, + "loss": 1.3979, + "step": 28057 + }, + { + "epoch": 0.3646009131915737, + "grad_norm": 0.5225715637207031, + "learning_rate": 0.00012710588908096024, + "loss": 1.4796, + "step": 28058 + }, + { + "epoch": 0.3646139077354896, + "grad_norm": 0.30882301926612854, + "learning_rate": 0.00012710328961904887, + "loss": 1.4556, + "step": 28059 + }, + { + "epoch": 0.3646269022794054, + "grad_norm": 0.3137337863445282, + "learning_rate": 0.0001271006901571375, + "loss": 1.3561, + "step": 28060 + }, + { + "epoch": 0.3646398968233213, + "grad_norm": 0.4012397825717926, + "learning_rate": 0.0001270980906952261, + "loss": 1.377, + "step": 28061 + }, + { + "epoch": 0.36465289136723716, + "grad_norm": 0.38306254148483276, + "learning_rate": 0.0001270954912333147, + "loss": 1.3023, + "step": 28062 + }, + { + "epoch": 0.36466588591115306, + "grad_norm": 0.46783527731895447, + "learning_rate": 0.00012709289177140334, + "loss": 1.5391, + "step": 28063 + }, + { + "epoch": 0.3646788804550689, + "grad_norm": 0.4088996648788452, + "learning_rate": 0.00012709029230949194, + "loss": 1.4238, + "step": 28064 + }, + { + "epoch": 0.3646918749989848, + "grad_norm": 0.40283480286598206, + "learning_rate": 0.00012708769284758056, + "loss": 1.4434, + "step": 28065 + }, + { + "epoch": 0.36470486954290066, + "grad_norm": 0.356254518032074, + "learning_rate": 0.00012708509338566916, + "loss": 1.4431, + "step": 28066 + }, + { + "epoch": 0.36471786408681656, + "grad_norm": 0.4371912479400635, + "learning_rate": 0.00012708249392375778, + "loss": 1.3425, + "step": 28067 + }, + { + "epoch": 0.3647308586307324, + "grad_norm": 0.3706740438938141, + "learning_rate": 0.0001270798944618464, + "loss": 1.384, + "step": 28068 + }, + { + "epoch": 0.3647438531746483, + "grad_norm": 0.5011845827102661, + "learning_rate": 0.000127077294999935, + "loss": 1.4167, + "step": 28069 + }, + { + "epoch": 0.36475684771856415, + "grad_norm": 0.3658675253391266, + "learning_rate": 0.00012707469553802363, + "loss": 1.313, + "step": 28070 + }, + { + "epoch": 0.36476984226248005, + "grad_norm": 0.5029263496398926, + "learning_rate": 0.00012707209607611225, + "loss": 1.5346, + "step": 28071 + }, + { + "epoch": 0.3647828368063959, + "grad_norm": 0.3849468231201172, + "learning_rate": 0.00012706949661420088, + "loss": 1.5605, + "step": 28072 + }, + { + "epoch": 0.3647958313503118, + "grad_norm": 0.4321690797805786, + "learning_rate": 0.00012706689715228948, + "loss": 1.3432, + "step": 28073 + }, + { + "epoch": 0.36480882589422764, + "grad_norm": 0.28547438979148865, + "learning_rate": 0.00012706429769037807, + "loss": 1.3526, + "step": 28074 + }, + { + "epoch": 0.36482182043814354, + "grad_norm": 0.42576876282691956, + "learning_rate": 0.00012706169822846673, + "loss": 1.4022, + "step": 28075 + }, + { + "epoch": 0.3648348149820594, + "grad_norm": 0.2830517888069153, + "learning_rate": 0.00012705909876655532, + "loss": 1.0827, + "step": 28076 + }, + { + "epoch": 0.3648478095259753, + "grad_norm": 0.3930256962776184, + "learning_rate": 0.00012705649930464395, + "loss": 1.504, + "step": 28077 + }, + { + "epoch": 0.36486080406989113, + "grad_norm": 0.3815311789512634, + "learning_rate": 0.00012705389984273257, + "loss": 1.4094, + "step": 28078 + }, + { + "epoch": 0.36487379861380703, + "grad_norm": 0.31158214807510376, + "learning_rate": 0.00012705130038082117, + "loss": 1.2232, + "step": 28079 + }, + { + "epoch": 0.3648867931577229, + "grad_norm": 0.36985287070274353, + "learning_rate": 0.0001270487009189098, + "loss": 1.4014, + "step": 28080 + }, + { + "epoch": 0.3648997877016388, + "grad_norm": 0.36557716131210327, + "learning_rate": 0.0001270461014569984, + "loss": 1.3851, + "step": 28081 + }, + { + "epoch": 0.3649127822455546, + "grad_norm": 0.4449576735496521, + "learning_rate": 0.00012704350199508704, + "loss": 1.4077, + "step": 28082 + }, + { + "epoch": 0.3649257767894705, + "grad_norm": 0.3640066087245941, + "learning_rate": 0.00012704090253317564, + "loss": 1.2831, + "step": 28083 + }, + { + "epoch": 0.36493877133338637, + "grad_norm": 0.36832019686698914, + "learning_rate": 0.00012703830307126426, + "loss": 1.449, + "step": 28084 + }, + { + "epoch": 0.36495176587730227, + "grad_norm": 0.4077436923980713, + "learning_rate": 0.00012703570360935286, + "loss": 1.3886, + "step": 28085 + }, + { + "epoch": 0.3649647604212181, + "grad_norm": 0.3969815969467163, + "learning_rate": 0.00012703310414744149, + "loss": 1.3984, + "step": 28086 + }, + { + "epoch": 0.364977754965134, + "grad_norm": 0.43063801527023315, + "learning_rate": 0.0001270305046855301, + "loss": 1.3701, + "step": 28087 + }, + { + "epoch": 0.36499074950904986, + "grad_norm": 0.42025133967399597, + "learning_rate": 0.0001270279052236187, + "loss": 1.5501, + "step": 28088 + }, + { + "epoch": 0.36500374405296576, + "grad_norm": 0.4629644453525543, + "learning_rate": 0.00012702530576170733, + "loss": 1.4808, + "step": 28089 + }, + { + "epoch": 0.3650167385968816, + "grad_norm": 0.3486928343772888, + "learning_rate": 0.00012702270629979596, + "loss": 1.4206, + "step": 28090 + }, + { + "epoch": 0.3650297331407975, + "grad_norm": 0.3603616952896118, + "learning_rate": 0.00012702010683788455, + "loss": 1.5814, + "step": 28091 + }, + { + "epoch": 0.3650427276847134, + "grad_norm": 0.37927836179733276, + "learning_rate": 0.00012701750737597318, + "loss": 1.2954, + "step": 28092 + }, + { + "epoch": 0.36505572222862925, + "grad_norm": 0.44568052887916565, + "learning_rate": 0.00012701490791406178, + "loss": 1.3147, + "step": 28093 + }, + { + "epoch": 0.36506871677254515, + "grad_norm": 0.4193950891494751, + "learning_rate": 0.00012701230845215043, + "loss": 1.3607, + "step": 28094 + }, + { + "epoch": 0.365081711316461, + "grad_norm": 0.3798714876174927, + "learning_rate": 0.00012700970899023903, + "loss": 1.4552, + "step": 28095 + }, + { + "epoch": 0.3650947058603769, + "grad_norm": 0.3370906710624695, + "learning_rate": 0.00012700710952832765, + "loss": 1.4651, + "step": 28096 + }, + { + "epoch": 0.36510770040429275, + "grad_norm": 0.43046578764915466, + "learning_rate": 0.00012700451006641625, + "loss": 1.4425, + "step": 28097 + }, + { + "epoch": 0.36512069494820865, + "grad_norm": 0.34880244731903076, + "learning_rate": 0.00012700191060450487, + "loss": 1.2982, + "step": 28098 + }, + { + "epoch": 0.3651336894921245, + "grad_norm": 0.3743686079978943, + "learning_rate": 0.0001269993111425935, + "loss": 1.2331, + "step": 28099 + }, + { + "epoch": 0.3651466840360404, + "grad_norm": 0.38713330030441284, + "learning_rate": 0.0001269967116806821, + "loss": 1.3827, + "step": 28100 + }, + { + "epoch": 0.36515967857995624, + "grad_norm": 0.39366090297698975, + "learning_rate": 0.00012699411221877072, + "loss": 1.4139, + "step": 28101 + }, + { + "epoch": 0.36517267312387214, + "grad_norm": 0.30771100521087646, + "learning_rate": 0.00012699151275685934, + "loss": 1.4529, + "step": 28102 + }, + { + "epoch": 0.365185667667788, + "grad_norm": 0.33973583579063416, + "learning_rate": 0.00012698891329494794, + "loss": 1.4037, + "step": 28103 + }, + { + "epoch": 0.3651986622117039, + "grad_norm": 0.46407073736190796, + "learning_rate": 0.00012698631383303656, + "loss": 1.4893, + "step": 28104 + }, + { + "epoch": 0.36521165675561973, + "grad_norm": 0.37019267678260803, + "learning_rate": 0.00012698371437112516, + "loss": 1.4, + "step": 28105 + }, + { + "epoch": 0.36522465129953563, + "grad_norm": 0.5243204236030579, + "learning_rate": 0.0001269811149092138, + "loss": 1.3632, + "step": 28106 + }, + { + "epoch": 0.3652376458434515, + "grad_norm": 0.37223151326179504, + "learning_rate": 0.0001269785154473024, + "loss": 1.1576, + "step": 28107 + }, + { + "epoch": 0.3652506403873674, + "grad_norm": 0.328691691160202, + "learning_rate": 0.00012697591598539104, + "loss": 1.6465, + "step": 28108 + }, + { + "epoch": 0.3652636349312832, + "grad_norm": 0.39232245087623596, + "learning_rate": 0.00012697331652347963, + "loss": 1.4403, + "step": 28109 + }, + { + "epoch": 0.3652766294751991, + "grad_norm": 0.4726991057395935, + "learning_rate": 0.00012697071706156826, + "loss": 1.3703, + "step": 28110 + }, + { + "epoch": 0.36528962401911497, + "grad_norm": 0.4225909113883972, + "learning_rate": 0.00012696811759965688, + "loss": 1.3159, + "step": 28111 + }, + { + "epoch": 0.36530261856303087, + "grad_norm": 0.49309974908828735, + "learning_rate": 0.00012696551813774548, + "loss": 1.4466, + "step": 28112 + }, + { + "epoch": 0.3653156131069467, + "grad_norm": 0.39265117049217224, + "learning_rate": 0.00012696291867583413, + "loss": 1.3459, + "step": 28113 + }, + { + "epoch": 0.3653286076508626, + "grad_norm": 0.32008376717567444, + "learning_rate": 0.00012696031921392273, + "loss": 1.3289, + "step": 28114 + }, + { + "epoch": 0.36534160219477846, + "grad_norm": 0.378717839717865, + "learning_rate": 0.00012695771975201135, + "loss": 1.3664, + "step": 28115 + }, + { + "epoch": 0.36535459673869436, + "grad_norm": 0.42208704352378845, + "learning_rate": 0.00012695512029009995, + "loss": 1.2614, + "step": 28116 + }, + { + "epoch": 0.3653675912826102, + "grad_norm": 0.384769469499588, + "learning_rate": 0.00012695252082818857, + "loss": 1.29, + "step": 28117 + }, + { + "epoch": 0.3653805858265261, + "grad_norm": 0.3501450717449188, + "learning_rate": 0.0001269499213662772, + "loss": 1.1789, + "step": 28118 + }, + { + "epoch": 0.36539358037044195, + "grad_norm": 0.3689560890197754, + "learning_rate": 0.0001269473219043658, + "loss": 1.4202, + "step": 28119 + }, + { + "epoch": 0.36540657491435785, + "grad_norm": 0.409323513507843, + "learning_rate": 0.00012694472244245442, + "loss": 1.4544, + "step": 28120 + }, + { + "epoch": 0.3654195694582737, + "grad_norm": 0.3456791937351227, + "learning_rate": 0.00012694212298054305, + "loss": 1.3124, + "step": 28121 + }, + { + "epoch": 0.3654325640021896, + "grad_norm": 0.4129564166069031, + "learning_rate": 0.00012693952351863164, + "loss": 1.2841, + "step": 28122 + }, + { + "epoch": 0.36544555854610544, + "grad_norm": 0.36690765619277954, + "learning_rate": 0.00012693692405672027, + "loss": 1.2794, + "step": 28123 + }, + { + "epoch": 0.36545855309002134, + "grad_norm": 0.4636576473712921, + "learning_rate": 0.00012693432459480886, + "loss": 1.3251, + "step": 28124 + }, + { + "epoch": 0.3654715476339372, + "grad_norm": 0.3891012966632843, + "learning_rate": 0.00012693172513289752, + "loss": 1.4565, + "step": 28125 + }, + { + "epoch": 0.3654845421778531, + "grad_norm": 0.32809528708457947, + "learning_rate": 0.0001269291256709861, + "loss": 1.3799, + "step": 28126 + }, + { + "epoch": 0.36549753672176893, + "grad_norm": 0.42696109414100647, + "learning_rate": 0.00012692652620907474, + "loss": 1.3017, + "step": 28127 + }, + { + "epoch": 0.36551053126568483, + "grad_norm": 0.2179551124572754, + "learning_rate": 0.00012692392674716334, + "loss": 1.1535, + "step": 28128 + }, + { + "epoch": 0.3655235258096007, + "grad_norm": 0.4546618163585663, + "learning_rate": 0.00012692132728525196, + "loss": 1.2079, + "step": 28129 + }, + { + "epoch": 0.3655365203535166, + "grad_norm": 0.44335225224494934, + "learning_rate": 0.00012691872782334058, + "loss": 1.6024, + "step": 28130 + }, + { + "epoch": 0.3655495148974324, + "grad_norm": 0.6570788621902466, + "learning_rate": 0.00012691612836142918, + "loss": 1.4937, + "step": 28131 + }, + { + "epoch": 0.3655625094413483, + "grad_norm": 0.44252750277519226, + "learning_rate": 0.0001269135288995178, + "loss": 1.3296, + "step": 28132 + }, + { + "epoch": 0.36557550398526417, + "grad_norm": 0.43089139461517334, + "learning_rate": 0.00012691092943760643, + "loss": 1.5463, + "step": 28133 + }, + { + "epoch": 0.3655884985291801, + "grad_norm": 0.33625930547714233, + "learning_rate": 0.00012690832997569503, + "loss": 1.2925, + "step": 28134 + }, + { + "epoch": 0.3656014930730959, + "grad_norm": 0.3515777289867401, + "learning_rate": 0.00012690573051378365, + "loss": 1.582, + "step": 28135 + }, + { + "epoch": 0.3656144876170118, + "grad_norm": 0.3245621919631958, + "learning_rate": 0.00012690313105187225, + "loss": 1.3586, + "step": 28136 + }, + { + "epoch": 0.36562748216092766, + "grad_norm": 0.46025052666664124, + "learning_rate": 0.0001269005315899609, + "loss": 1.3535, + "step": 28137 + }, + { + "epoch": 0.36564047670484356, + "grad_norm": 0.4299743175506592, + "learning_rate": 0.0001268979321280495, + "loss": 1.2106, + "step": 28138 + }, + { + "epoch": 0.3656534712487594, + "grad_norm": 0.34638112783432007, + "learning_rate": 0.00012689533266613812, + "loss": 1.1963, + "step": 28139 + }, + { + "epoch": 0.3656664657926753, + "grad_norm": 0.38650864362716675, + "learning_rate": 0.00012689273320422672, + "loss": 1.4405, + "step": 28140 + }, + { + "epoch": 0.36567946033659116, + "grad_norm": 0.5232280492782593, + "learning_rate": 0.00012689013374231535, + "loss": 1.4255, + "step": 28141 + }, + { + "epoch": 0.36569245488050706, + "grad_norm": 0.3474600613117218, + "learning_rate": 0.00012688753428040397, + "loss": 1.256, + "step": 28142 + }, + { + "epoch": 0.3657054494244229, + "grad_norm": 0.4987822473049164, + "learning_rate": 0.00012688493481849257, + "loss": 1.392, + "step": 28143 + }, + { + "epoch": 0.3657184439683388, + "grad_norm": 0.42552152276039124, + "learning_rate": 0.0001268823353565812, + "loss": 1.5524, + "step": 28144 + }, + { + "epoch": 0.36573143851225465, + "grad_norm": 0.3580000400543213, + "learning_rate": 0.00012687973589466982, + "loss": 1.4081, + "step": 28145 + }, + { + "epoch": 0.36574443305617055, + "grad_norm": 0.38259294629096985, + "learning_rate": 0.0001268771364327584, + "loss": 1.6565, + "step": 28146 + }, + { + "epoch": 0.3657574276000864, + "grad_norm": 0.44206780195236206, + "learning_rate": 0.00012687453697084704, + "loss": 1.5425, + "step": 28147 + }, + { + "epoch": 0.3657704221440023, + "grad_norm": 0.44376686215400696, + "learning_rate": 0.00012687193750893564, + "loss": 1.3856, + "step": 28148 + }, + { + "epoch": 0.36578341668791814, + "grad_norm": 0.34192654490470886, + "learning_rate": 0.0001268693380470243, + "loss": 1.483, + "step": 28149 + }, + { + "epoch": 0.36579641123183404, + "grad_norm": 0.3827860355377197, + "learning_rate": 0.00012686673858511288, + "loss": 1.3772, + "step": 28150 + }, + { + "epoch": 0.3658094057757499, + "grad_norm": 0.42208024859428406, + "learning_rate": 0.0001268641391232015, + "loss": 1.3912, + "step": 28151 + }, + { + "epoch": 0.3658224003196658, + "grad_norm": 0.430095374584198, + "learning_rate": 0.00012686153966129013, + "loss": 1.4129, + "step": 28152 + }, + { + "epoch": 0.36583539486358163, + "grad_norm": 0.2927623391151428, + "learning_rate": 0.00012685894019937873, + "loss": 1.3213, + "step": 28153 + }, + { + "epoch": 0.36584838940749753, + "grad_norm": 0.35218799114227295, + "learning_rate": 0.00012685634073746735, + "loss": 1.3798, + "step": 28154 + }, + { + "epoch": 0.3658613839514134, + "grad_norm": 0.35474035143852234, + "learning_rate": 0.00012685374127555595, + "loss": 1.5253, + "step": 28155 + }, + { + "epoch": 0.3658743784953293, + "grad_norm": 0.5100380778312683, + "learning_rate": 0.0001268511418136446, + "loss": 1.4388, + "step": 28156 + }, + { + "epoch": 0.3658873730392451, + "grad_norm": 0.37747254967689514, + "learning_rate": 0.0001268485423517332, + "loss": 1.5742, + "step": 28157 + }, + { + "epoch": 0.365900367583161, + "grad_norm": 0.3083898723125458, + "learning_rate": 0.0001268459428898218, + "loss": 1.3759, + "step": 28158 + }, + { + "epoch": 0.36591336212707687, + "grad_norm": 0.373930960893631, + "learning_rate": 0.00012684334342791042, + "loss": 1.4642, + "step": 28159 + }, + { + "epoch": 0.36592635667099277, + "grad_norm": 0.35953137278556824, + "learning_rate": 0.00012684074396599905, + "loss": 1.4968, + "step": 28160 + }, + { + "epoch": 0.3659393512149086, + "grad_norm": 0.4357090890407562, + "learning_rate": 0.00012683814450408767, + "loss": 1.3088, + "step": 28161 + }, + { + "epoch": 0.3659523457588245, + "grad_norm": 0.41514185070991516, + "learning_rate": 0.00012683554504217627, + "loss": 1.2275, + "step": 28162 + }, + { + "epoch": 0.36596534030274036, + "grad_norm": 0.3018457293510437, + "learning_rate": 0.0001268329455802649, + "loss": 1.4676, + "step": 28163 + }, + { + "epoch": 0.36597833484665626, + "grad_norm": 0.29435214400291443, + "learning_rate": 0.00012683034611835352, + "loss": 1.3255, + "step": 28164 + }, + { + "epoch": 0.3659913293905721, + "grad_norm": 0.4148188531398773, + "learning_rate": 0.00012682774665644212, + "loss": 1.4706, + "step": 28165 + }, + { + "epoch": 0.366004323934488, + "grad_norm": 0.4417075216770172, + "learning_rate": 0.00012682514719453074, + "loss": 1.3882, + "step": 28166 + }, + { + "epoch": 0.36601731847840385, + "grad_norm": 0.2957364022731781, + "learning_rate": 0.00012682254773261934, + "loss": 1.2805, + "step": 28167 + }, + { + "epoch": 0.36603031302231975, + "grad_norm": 0.39461666345596313, + "learning_rate": 0.000126819948270708, + "loss": 1.3663, + "step": 28168 + }, + { + "epoch": 0.36604330756623565, + "grad_norm": 0.41828155517578125, + "learning_rate": 0.0001268173488087966, + "loss": 1.3407, + "step": 28169 + }, + { + "epoch": 0.3660563021101515, + "grad_norm": 0.42200860381126404, + "learning_rate": 0.0001268147493468852, + "loss": 1.4452, + "step": 28170 + }, + { + "epoch": 0.3660692966540674, + "grad_norm": 0.4721120595932007, + "learning_rate": 0.0001268121498849738, + "loss": 1.5777, + "step": 28171 + }, + { + "epoch": 0.36608229119798325, + "grad_norm": 0.38554278016090393, + "learning_rate": 0.00012680955042306243, + "loss": 1.3583, + "step": 28172 + }, + { + "epoch": 0.36609528574189915, + "grad_norm": 0.4189189076423645, + "learning_rate": 0.00012680695096115106, + "loss": 1.2832, + "step": 28173 + }, + { + "epoch": 0.366108280285815, + "grad_norm": 0.38415658473968506, + "learning_rate": 0.00012680435149923965, + "loss": 1.4226, + "step": 28174 + }, + { + "epoch": 0.3661212748297309, + "grad_norm": 0.48040997982025146, + "learning_rate": 0.00012680175203732828, + "loss": 1.5027, + "step": 28175 + }, + { + "epoch": 0.36613426937364674, + "grad_norm": 0.43938133120536804, + "learning_rate": 0.0001267991525754169, + "loss": 1.4759, + "step": 28176 + }, + { + "epoch": 0.36614726391756264, + "grad_norm": 0.4448304772377014, + "learning_rate": 0.0001267965531135055, + "loss": 1.437, + "step": 28177 + }, + { + "epoch": 0.3661602584614785, + "grad_norm": 0.4525499939918518, + "learning_rate": 0.00012679395365159413, + "loss": 1.3965, + "step": 28178 + }, + { + "epoch": 0.3661732530053944, + "grad_norm": 0.44312888383865356, + "learning_rate": 0.00012679135418968272, + "loss": 1.2919, + "step": 28179 + }, + { + "epoch": 0.36618624754931023, + "grad_norm": 0.3774242699146271, + "learning_rate": 0.00012678875472777137, + "loss": 1.4792, + "step": 28180 + }, + { + "epoch": 0.36619924209322613, + "grad_norm": 0.4145831763744354, + "learning_rate": 0.00012678615526585997, + "loss": 1.4226, + "step": 28181 + }, + { + "epoch": 0.366212236637142, + "grad_norm": 0.4463750720024109, + "learning_rate": 0.0001267835558039486, + "loss": 1.4233, + "step": 28182 + }, + { + "epoch": 0.3662252311810579, + "grad_norm": 0.4231424033641815, + "learning_rate": 0.0001267809563420372, + "loss": 1.4051, + "step": 28183 + }, + { + "epoch": 0.3662382257249737, + "grad_norm": 0.40167200565338135, + "learning_rate": 0.00012677835688012582, + "loss": 1.5238, + "step": 28184 + }, + { + "epoch": 0.3662512202688896, + "grad_norm": 0.5366488695144653, + "learning_rate": 0.00012677575741821444, + "loss": 1.3343, + "step": 28185 + }, + { + "epoch": 0.36626421481280547, + "grad_norm": 0.4084893763065338, + "learning_rate": 0.00012677315795630304, + "loss": 1.3832, + "step": 28186 + }, + { + "epoch": 0.36627720935672137, + "grad_norm": 0.35248279571533203, + "learning_rate": 0.00012677055849439166, + "loss": 1.488, + "step": 28187 + }, + { + "epoch": 0.3662902039006372, + "grad_norm": 0.4183511435985565, + "learning_rate": 0.0001267679590324803, + "loss": 1.3968, + "step": 28188 + }, + { + "epoch": 0.3663031984445531, + "grad_norm": 0.33337122201919556, + "learning_rate": 0.0001267653595705689, + "loss": 1.1879, + "step": 28189 + }, + { + "epoch": 0.36631619298846896, + "grad_norm": 0.3395618200302124, + "learning_rate": 0.0001267627601086575, + "loss": 1.2035, + "step": 28190 + }, + { + "epoch": 0.36632918753238486, + "grad_norm": 0.4477768540382385, + "learning_rate": 0.00012676016064674614, + "loss": 1.5642, + "step": 28191 + }, + { + "epoch": 0.3663421820763007, + "grad_norm": 0.3394140899181366, + "learning_rate": 0.00012675756118483476, + "loss": 1.3381, + "step": 28192 + }, + { + "epoch": 0.3663551766202166, + "grad_norm": 0.4839639663696289, + "learning_rate": 0.00012675496172292336, + "loss": 1.4964, + "step": 28193 + }, + { + "epoch": 0.36636817116413245, + "grad_norm": 0.37405848503112793, + "learning_rate": 0.00012675236226101198, + "loss": 1.3034, + "step": 28194 + }, + { + "epoch": 0.36638116570804835, + "grad_norm": 0.5386796593666077, + "learning_rate": 0.0001267497627991006, + "loss": 1.4253, + "step": 28195 + }, + { + "epoch": 0.3663941602519642, + "grad_norm": 0.4870552718639374, + "learning_rate": 0.0001267471633371892, + "loss": 1.3083, + "step": 28196 + }, + { + "epoch": 0.3664071547958801, + "grad_norm": 0.4764971137046814, + "learning_rate": 0.00012674456387527783, + "loss": 1.3278, + "step": 28197 + }, + { + "epoch": 0.36642014933979594, + "grad_norm": 0.3245146572589874, + "learning_rate": 0.00012674196441336643, + "loss": 1.3507, + "step": 28198 + }, + { + "epoch": 0.36643314388371184, + "grad_norm": 0.44167420268058777, + "learning_rate": 0.00012673936495145508, + "loss": 1.3914, + "step": 28199 + }, + { + "epoch": 0.3664461384276277, + "grad_norm": 0.4022807776927948, + "learning_rate": 0.00012673676548954367, + "loss": 1.316, + "step": 28200 + }, + { + "epoch": 0.3664591329715436, + "grad_norm": 0.3878355026245117, + "learning_rate": 0.00012673416602763227, + "loss": 1.3399, + "step": 28201 + }, + { + "epoch": 0.36647212751545943, + "grad_norm": 0.3598688244819641, + "learning_rate": 0.0001267315665657209, + "loss": 1.383, + "step": 28202 + }, + { + "epoch": 0.36648512205937533, + "grad_norm": 0.400646448135376, + "learning_rate": 0.00012672896710380952, + "loss": 1.4203, + "step": 28203 + }, + { + "epoch": 0.3664981166032912, + "grad_norm": 0.430014431476593, + "learning_rate": 0.00012672636764189815, + "loss": 1.4413, + "step": 28204 + }, + { + "epoch": 0.3665111111472071, + "grad_norm": 0.3218499720096588, + "learning_rate": 0.00012672376817998674, + "loss": 1.2877, + "step": 28205 + }, + { + "epoch": 0.3665241056911229, + "grad_norm": 0.40727245807647705, + "learning_rate": 0.00012672116871807537, + "loss": 1.2765, + "step": 28206 + }, + { + "epoch": 0.3665371002350388, + "grad_norm": 0.3618388772010803, + "learning_rate": 0.000126718569256164, + "loss": 1.4602, + "step": 28207 + }, + { + "epoch": 0.36655009477895467, + "grad_norm": 0.41891559958457947, + "learning_rate": 0.0001267159697942526, + "loss": 1.5062, + "step": 28208 + }, + { + "epoch": 0.36656308932287057, + "grad_norm": 0.39407289028167725, + "learning_rate": 0.00012671337033234121, + "loss": 1.5623, + "step": 28209 + }, + { + "epoch": 0.3665760838667864, + "grad_norm": 0.3864799439907074, + "learning_rate": 0.0001267107708704298, + "loss": 1.4045, + "step": 28210 + }, + { + "epoch": 0.3665890784107023, + "grad_norm": 0.3171851336956024, + "learning_rate": 0.00012670817140851846, + "loss": 1.2591, + "step": 28211 + }, + { + "epoch": 0.36660207295461816, + "grad_norm": 0.4939090609550476, + "learning_rate": 0.00012670557194660706, + "loss": 1.4209, + "step": 28212 + }, + { + "epoch": 0.36661506749853406, + "grad_norm": 0.383779913187027, + "learning_rate": 0.00012670297248469566, + "loss": 1.2718, + "step": 28213 + }, + { + "epoch": 0.3666280620424499, + "grad_norm": 0.4203174114227295, + "learning_rate": 0.00012670037302278428, + "loss": 1.43, + "step": 28214 + }, + { + "epoch": 0.3666410565863658, + "grad_norm": 0.4474756121635437, + "learning_rate": 0.0001266977735608729, + "loss": 1.4156, + "step": 28215 + }, + { + "epoch": 0.36665405113028166, + "grad_norm": 0.3777061402797699, + "learning_rate": 0.00012669517409896153, + "loss": 1.45, + "step": 28216 + }, + { + "epoch": 0.36666704567419756, + "grad_norm": 0.3842798173427582, + "learning_rate": 0.00012669257463705013, + "loss": 1.3492, + "step": 28217 + }, + { + "epoch": 0.3666800402181134, + "grad_norm": 0.34121498465538025, + "learning_rate": 0.00012668997517513875, + "loss": 1.3476, + "step": 28218 + }, + { + "epoch": 0.3666930347620293, + "grad_norm": 0.4007270634174347, + "learning_rate": 0.00012668737571322738, + "loss": 1.3771, + "step": 28219 + }, + { + "epoch": 0.36670602930594515, + "grad_norm": 0.4401649534702301, + "learning_rate": 0.00012668477625131597, + "loss": 1.4885, + "step": 28220 + }, + { + "epoch": 0.36671902384986105, + "grad_norm": 0.4479008913040161, + "learning_rate": 0.0001266821767894046, + "loss": 1.3543, + "step": 28221 + }, + { + "epoch": 0.3667320183937769, + "grad_norm": 0.3135763108730316, + "learning_rate": 0.0001266795773274932, + "loss": 1.3169, + "step": 28222 + }, + { + "epoch": 0.3667450129376928, + "grad_norm": 0.3499266803264618, + "learning_rate": 0.00012667697786558185, + "loss": 1.3856, + "step": 28223 + }, + { + "epoch": 0.36675800748160864, + "grad_norm": 0.3431118428707123, + "learning_rate": 0.00012667437840367045, + "loss": 1.4253, + "step": 28224 + }, + { + "epoch": 0.36677100202552454, + "grad_norm": 0.3204564154148102, + "learning_rate": 0.00012667177894175904, + "loss": 1.2607, + "step": 28225 + }, + { + "epoch": 0.3667839965694404, + "grad_norm": 0.31765884160995483, + "learning_rate": 0.0001266691794798477, + "loss": 1.2611, + "step": 28226 + }, + { + "epoch": 0.3667969911133563, + "grad_norm": 0.38403427600860596, + "learning_rate": 0.0001266665800179363, + "loss": 1.3393, + "step": 28227 + }, + { + "epoch": 0.36680998565727213, + "grad_norm": 0.3300393521785736, + "learning_rate": 0.00012666398055602492, + "loss": 1.1441, + "step": 28228 + }, + { + "epoch": 0.36682298020118803, + "grad_norm": 0.45865610241889954, + "learning_rate": 0.00012666138109411351, + "loss": 1.536, + "step": 28229 + }, + { + "epoch": 0.3668359747451039, + "grad_norm": 0.3928692042827606, + "learning_rate": 0.00012665878163220214, + "loss": 1.5818, + "step": 28230 + }, + { + "epoch": 0.3668489692890198, + "grad_norm": 0.371881902217865, + "learning_rate": 0.00012665618217029076, + "loss": 1.3133, + "step": 28231 + }, + { + "epoch": 0.3668619638329356, + "grad_norm": 0.3203613758087158, + "learning_rate": 0.00012665358270837936, + "loss": 1.4208, + "step": 28232 + }, + { + "epoch": 0.3668749583768515, + "grad_norm": 0.3743828237056732, + "learning_rate": 0.00012665098324646798, + "loss": 1.3392, + "step": 28233 + }, + { + "epoch": 0.36688795292076737, + "grad_norm": 0.400187224149704, + "learning_rate": 0.0001266483837845566, + "loss": 1.4545, + "step": 28234 + }, + { + "epoch": 0.36690094746468327, + "grad_norm": 0.37582534551620483, + "learning_rate": 0.00012664578432264523, + "loss": 1.455, + "step": 28235 + }, + { + "epoch": 0.3669139420085991, + "grad_norm": 0.3804473876953125, + "learning_rate": 0.00012664318486073383, + "loss": 1.4632, + "step": 28236 + }, + { + "epoch": 0.366926936552515, + "grad_norm": 0.37541574239730835, + "learning_rate": 0.00012664058539882246, + "loss": 1.3594, + "step": 28237 + }, + { + "epoch": 0.36693993109643086, + "grad_norm": 0.38975194096565247, + "learning_rate": 0.00012663798593691108, + "loss": 1.5815, + "step": 28238 + }, + { + "epoch": 0.36695292564034676, + "grad_norm": 0.34498849511146545, + "learning_rate": 0.00012663538647499968, + "loss": 1.3103, + "step": 28239 + }, + { + "epoch": 0.3669659201842626, + "grad_norm": 0.33500340580940247, + "learning_rate": 0.0001266327870130883, + "loss": 1.2509, + "step": 28240 + }, + { + "epoch": 0.3669789147281785, + "grad_norm": 0.3520366847515106, + "learning_rate": 0.0001266301875511769, + "loss": 1.4546, + "step": 28241 + }, + { + "epoch": 0.36699190927209435, + "grad_norm": 0.4794207811355591, + "learning_rate": 0.00012662758808926552, + "loss": 1.4834, + "step": 28242 + }, + { + "epoch": 0.36700490381601025, + "grad_norm": 0.4109174311161041, + "learning_rate": 0.00012662498862735415, + "loss": 1.3443, + "step": 28243 + }, + { + "epoch": 0.36701789835992615, + "grad_norm": 0.4426645338535309, + "learning_rate": 0.00012662238916544275, + "loss": 1.4635, + "step": 28244 + }, + { + "epoch": 0.367030892903842, + "grad_norm": 0.42775824666023254, + "learning_rate": 0.00012661978970353137, + "loss": 1.2225, + "step": 28245 + }, + { + "epoch": 0.3670438874477579, + "grad_norm": 0.31784045696258545, + "learning_rate": 0.00012661719024162, + "loss": 1.1861, + "step": 28246 + }, + { + "epoch": 0.36705688199167374, + "grad_norm": 0.38443872332572937, + "learning_rate": 0.00012661459077970862, + "loss": 1.2705, + "step": 28247 + }, + { + "epoch": 0.36706987653558965, + "grad_norm": 0.3345561921596527, + "learning_rate": 0.00012661199131779722, + "loss": 1.1758, + "step": 28248 + }, + { + "epoch": 0.3670828710795055, + "grad_norm": 0.3287811279296875, + "learning_rate": 0.00012660939185588584, + "loss": 1.3391, + "step": 28249 + }, + { + "epoch": 0.3670958656234214, + "grad_norm": 0.3764609098434448, + "learning_rate": 0.00012660679239397447, + "loss": 1.3094, + "step": 28250 + }, + { + "epoch": 0.36710886016733724, + "grad_norm": 0.5600670576095581, + "learning_rate": 0.00012660419293206306, + "loss": 1.443, + "step": 28251 + }, + { + "epoch": 0.36712185471125314, + "grad_norm": 0.3201703131198883, + "learning_rate": 0.0001266015934701517, + "loss": 1.5131, + "step": 28252 + }, + { + "epoch": 0.367134849255169, + "grad_norm": 0.514254093170166, + "learning_rate": 0.00012659899400824028, + "loss": 1.4315, + "step": 28253 + }, + { + "epoch": 0.3671478437990849, + "grad_norm": 0.5076304078102112, + "learning_rate": 0.00012659639454632894, + "loss": 1.4243, + "step": 28254 + }, + { + "epoch": 0.36716083834300073, + "grad_norm": 0.38884350657463074, + "learning_rate": 0.00012659379508441753, + "loss": 1.375, + "step": 28255 + }, + { + "epoch": 0.36717383288691663, + "grad_norm": 0.46446534991264343, + "learning_rate": 0.00012659119562250613, + "loss": 1.5978, + "step": 28256 + }, + { + "epoch": 0.3671868274308325, + "grad_norm": 0.39796850085258484, + "learning_rate": 0.00012658859616059476, + "loss": 1.355, + "step": 28257 + }, + { + "epoch": 0.3671998219747484, + "grad_norm": 0.42515382170677185, + "learning_rate": 0.00012658599669868338, + "loss": 1.4486, + "step": 28258 + }, + { + "epoch": 0.3672128165186642, + "grad_norm": 0.46917420625686646, + "learning_rate": 0.000126583397236772, + "loss": 1.6279, + "step": 28259 + }, + { + "epoch": 0.3672258110625801, + "grad_norm": 0.4049796164035797, + "learning_rate": 0.0001265807977748606, + "loss": 1.1622, + "step": 28260 + }, + { + "epoch": 0.36723880560649597, + "grad_norm": 0.5218256115913391, + "learning_rate": 0.00012657819831294923, + "loss": 1.41, + "step": 28261 + }, + { + "epoch": 0.36725180015041187, + "grad_norm": 0.40468713641166687, + "learning_rate": 0.00012657559885103785, + "loss": 1.2532, + "step": 28262 + }, + { + "epoch": 0.3672647946943277, + "grad_norm": 0.34589505195617676, + "learning_rate": 0.00012657299938912645, + "loss": 1.2171, + "step": 28263 + }, + { + "epoch": 0.3672777892382436, + "grad_norm": 0.3997263014316559, + "learning_rate": 0.00012657039992721507, + "loss": 1.4543, + "step": 28264 + }, + { + "epoch": 0.36729078378215946, + "grad_norm": 0.4658871591091156, + "learning_rate": 0.0001265678004653037, + "loss": 1.3436, + "step": 28265 + }, + { + "epoch": 0.36730377832607536, + "grad_norm": 0.38897818326950073, + "learning_rate": 0.00012656520100339232, + "loss": 1.3865, + "step": 28266 + }, + { + "epoch": 0.3673167728699912, + "grad_norm": 0.3992270827293396, + "learning_rate": 0.00012656260154148092, + "loss": 1.4514, + "step": 28267 + }, + { + "epoch": 0.3673297674139071, + "grad_norm": 0.3155049979686737, + "learning_rate": 0.00012656000207956952, + "loss": 1.349, + "step": 28268 + }, + { + "epoch": 0.36734276195782295, + "grad_norm": 0.3793800473213196, + "learning_rate": 0.00012655740261765817, + "loss": 1.1923, + "step": 28269 + }, + { + "epoch": 0.36735575650173885, + "grad_norm": 0.3289342522621155, + "learning_rate": 0.00012655480315574677, + "loss": 1.4689, + "step": 28270 + }, + { + "epoch": 0.3673687510456547, + "grad_norm": 0.4585122764110565, + "learning_rate": 0.0001265522036938354, + "loss": 1.4682, + "step": 28271 + }, + { + "epoch": 0.3673817455895706, + "grad_norm": 0.5174217820167542, + "learning_rate": 0.000126549604231924, + "loss": 1.3983, + "step": 28272 + }, + { + "epoch": 0.36739474013348644, + "grad_norm": 0.38904857635498047, + "learning_rate": 0.0001265470047700126, + "loss": 1.5605, + "step": 28273 + }, + { + "epoch": 0.36740773467740234, + "grad_norm": 0.4413037896156311, + "learning_rate": 0.00012654440530810124, + "loss": 1.4244, + "step": 28274 + }, + { + "epoch": 0.3674207292213182, + "grad_norm": 0.3603969216346741, + "learning_rate": 0.00012654180584618983, + "loss": 1.4262, + "step": 28275 + }, + { + "epoch": 0.3674337237652341, + "grad_norm": 0.4461103677749634, + "learning_rate": 0.00012653920638427846, + "loss": 1.4628, + "step": 28276 + }, + { + "epoch": 0.36744671830914993, + "grad_norm": 0.3080480694770813, + "learning_rate": 0.00012653660692236708, + "loss": 1.49, + "step": 28277 + }, + { + "epoch": 0.36745971285306583, + "grad_norm": 0.3298843204975128, + "learning_rate": 0.0001265340074604557, + "loss": 1.3883, + "step": 28278 + }, + { + "epoch": 0.3674727073969817, + "grad_norm": 0.35455086827278137, + "learning_rate": 0.0001265314079985443, + "loss": 1.4008, + "step": 28279 + }, + { + "epoch": 0.3674857019408976, + "grad_norm": 0.4058314263820648, + "learning_rate": 0.0001265288085366329, + "loss": 1.4439, + "step": 28280 + }, + { + "epoch": 0.3674986964848134, + "grad_norm": 0.5180437564849854, + "learning_rate": 0.00012652620907472155, + "loss": 1.4467, + "step": 28281 + }, + { + "epoch": 0.3675116910287293, + "grad_norm": 0.25658056139945984, + "learning_rate": 0.00012652360961281015, + "loss": 1.4205, + "step": 28282 + }, + { + "epoch": 0.36752468557264517, + "grad_norm": 0.402537077665329, + "learning_rate": 0.00012652101015089877, + "loss": 1.3648, + "step": 28283 + }, + { + "epoch": 0.36753768011656107, + "grad_norm": 0.4145286977291107, + "learning_rate": 0.00012651841068898737, + "loss": 1.3977, + "step": 28284 + }, + { + "epoch": 0.3675506746604769, + "grad_norm": 0.31924372911453247, + "learning_rate": 0.000126515811227076, + "loss": 1.5362, + "step": 28285 + }, + { + "epoch": 0.3675636692043928, + "grad_norm": 0.38005587458610535, + "learning_rate": 0.00012651321176516462, + "loss": 1.3518, + "step": 28286 + }, + { + "epoch": 0.36757666374830866, + "grad_norm": 0.5064309239387512, + "learning_rate": 0.00012651061230325322, + "loss": 1.3775, + "step": 28287 + }, + { + "epoch": 0.36758965829222456, + "grad_norm": 0.43269461393356323, + "learning_rate": 0.00012650801284134184, + "loss": 1.4696, + "step": 28288 + }, + { + "epoch": 0.3676026528361404, + "grad_norm": 0.38758280873298645, + "learning_rate": 0.00012650541337943047, + "loss": 1.2844, + "step": 28289 + }, + { + "epoch": 0.3676156473800563, + "grad_norm": 0.3623393476009369, + "learning_rate": 0.0001265028139175191, + "loss": 1.4764, + "step": 28290 + }, + { + "epoch": 0.36762864192397215, + "grad_norm": 0.4177338778972626, + "learning_rate": 0.0001265002144556077, + "loss": 1.5634, + "step": 28291 + }, + { + "epoch": 0.36764163646788806, + "grad_norm": 0.4605614244937897, + "learning_rate": 0.00012649761499369631, + "loss": 1.3638, + "step": 28292 + }, + { + "epoch": 0.3676546310118039, + "grad_norm": 0.3411669135093689, + "learning_rate": 0.00012649501553178494, + "loss": 1.3509, + "step": 28293 + }, + { + "epoch": 0.3676676255557198, + "grad_norm": 0.43192151188850403, + "learning_rate": 0.00012649241606987354, + "loss": 1.4854, + "step": 28294 + }, + { + "epoch": 0.36768062009963565, + "grad_norm": 0.49286994338035583, + "learning_rate": 0.00012648981660796216, + "loss": 1.5278, + "step": 28295 + }, + { + "epoch": 0.36769361464355155, + "grad_norm": 0.46842652559280396, + "learning_rate": 0.00012648721714605076, + "loss": 1.5172, + "step": 28296 + }, + { + "epoch": 0.3677066091874674, + "grad_norm": 0.3526531755924225, + "learning_rate": 0.00012648461768413938, + "loss": 1.3694, + "step": 28297 + }, + { + "epoch": 0.3677196037313833, + "grad_norm": 0.46419310569763184, + "learning_rate": 0.000126482018222228, + "loss": 1.3507, + "step": 28298 + }, + { + "epoch": 0.36773259827529914, + "grad_norm": 0.38507261872291565, + "learning_rate": 0.0001264794187603166, + "loss": 1.4583, + "step": 28299 + }, + { + "epoch": 0.36774559281921504, + "grad_norm": 0.5182105302810669, + "learning_rate": 0.00012647681929840526, + "loss": 1.4289, + "step": 28300 + }, + { + "epoch": 0.3677585873631309, + "grad_norm": 0.3965875804424286, + "learning_rate": 0.00012647421983649385, + "loss": 1.3298, + "step": 28301 + }, + { + "epoch": 0.3677715819070468, + "grad_norm": 0.31373509764671326, + "learning_rate": 0.00012647162037458248, + "loss": 1.2522, + "step": 28302 + }, + { + "epoch": 0.36778457645096263, + "grad_norm": 0.4045376777648926, + "learning_rate": 0.00012646902091267107, + "loss": 1.4169, + "step": 28303 + }, + { + "epoch": 0.36779757099487853, + "grad_norm": 0.4659777581691742, + "learning_rate": 0.0001264664214507597, + "loss": 1.5066, + "step": 28304 + }, + { + "epoch": 0.3678105655387944, + "grad_norm": 0.4321700930595398, + "learning_rate": 0.00012646382198884832, + "loss": 1.3231, + "step": 28305 + }, + { + "epoch": 0.3678235600827103, + "grad_norm": 0.30213743448257446, + "learning_rate": 0.00012646122252693692, + "loss": 1.4256, + "step": 28306 + }, + { + "epoch": 0.3678365546266261, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.00012645862306502555, + "loss": 1.4331, + "step": 28307 + }, + { + "epoch": 0.367849549170542, + "grad_norm": 0.6103479862213135, + "learning_rate": 0.00012645602360311417, + "loss": 1.6721, + "step": 28308 + }, + { + "epoch": 0.36786254371445787, + "grad_norm": 0.4632102847099304, + "learning_rate": 0.00012645342414120277, + "loss": 1.4686, + "step": 28309 + }, + { + "epoch": 0.36787553825837377, + "grad_norm": 0.3962036073207855, + "learning_rate": 0.0001264508246792914, + "loss": 1.488, + "step": 28310 + }, + { + "epoch": 0.3678885328022896, + "grad_norm": 0.39585331082344055, + "learning_rate": 0.00012644822521738, + "loss": 1.3485, + "step": 28311 + }, + { + "epoch": 0.3679015273462055, + "grad_norm": 0.33448514342308044, + "learning_rate": 0.00012644562575546864, + "loss": 1.2157, + "step": 28312 + }, + { + "epoch": 0.36791452189012136, + "grad_norm": 0.46917641162872314, + "learning_rate": 0.00012644302629355724, + "loss": 1.2645, + "step": 28313 + }, + { + "epoch": 0.36792751643403726, + "grad_norm": 0.30787110328674316, + "learning_rate": 0.00012644042683164586, + "loss": 1.2682, + "step": 28314 + }, + { + "epoch": 0.3679405109779531, + "grad_norm": 0.47201284766197205, + "learning_rate": 0.00012643782736973446, + "loss": 1.3786, + "step": 28315 + }, + { + "epoch": 0.367953505521869, + "grad_norm": 0.4249972999095917, + "learning_rate": 0.00012643522790782308, + "loss": 1.3464, + "step": 28316 + }, + { + "epoch": 0.36796650006578485, + "grad_norm": 0.29537349939346313, + "learning_rate": 0.0001264326284459117, + "loss": 1.4019, + "step": 28317 + }, + { + "epoch": 0.36797949460970075, + "grad_norm": 0.3659409284591675, + "learning_rate": 0.0001264300289840003, + "loss": 1.3854, + "step": 28318 + }, + { + "epoch": 0.3679924891536166, + "grad_norm": 0.4648875892162323, + "learning_rate": 0.00012642742952208893, + "loss": 1.385, + "step": 28319 + }, + { + "epoch": 0.3680054836975325, + "grad_norm": 0.48431596159935, + "learning_rate": 0.00012642483006017756, + "loss": 1.6043, + "step": 28320 + }, + { + "epoch": 0.3680184782414484, + "grad_norm": 0.47812896966934204, + "learning_rate": 0.00012642223059826618, + "loss": 1.5167, + "step": 28321 + }, + { + "epoch": 0.36803147278536424, + "grad_norm": 0.4868755340576172, + "learning_rate": 0.00012641963113635478, + "loss": 1.5123, + "step": 28322 + }, + { + "epoch": 0.36804446732928014, + "grad_norm": 0.398409366607666, + "learning_rate": 0.00012641703167444337, + "loss": 1.4557, + "step": 28323 + }, + { + "epoch": 0.368057461873196, + "grad_norm": 0.4370778203010559, + "learning_rate": 0.00012641443221253203, + "loss": 1.5169, + "step": 28324 + }, + { + "epoch": 0.3680704564171119, + "grad_norm": 0.3462732434272766, + "learning_rate": 0.00012641183275062062, + "loss": 1.6015, + "step": 28325 + }, + { + "epoch": 0.36808345096102774, + "grad_norm": 0.41952386498451233, + "learning_rate": 0.00012640923328870925, + "loss": 1.3269, + "step": 28326 + }, + { + "epoch": 0.36809644550494364, + "grad_norm": 0.4484606385231018, + "learning_rate": 0.00012640663382679785, + "loss": 1.5717, + "step": 28327 + }, + { + "epoch": 0.3681094400488595, + "grad_norm": 0.3657273054122925, + "learning_rate": 0.00012640403436488647, + "loss": 1.4062, + "step": 28328 + }, + { + "epoch": 0.3681224345927754, + "grad_norm": 0.5000094771385193, + "learning_rate": 0.0001264014349029751, + "loss": 1.3152, + "step": 28329 + }, + { + "epoch": 0.36813542913669123, + "grad_norm": 0.4084291458129883, + "learning_rate": 0.0001263988354410637, + "loss": 1.5985, + "step": 28330 + }, + { + "epoch": 0.36814842368060713, + "grad_norm": 0.437229722738266, + "learning_rate": 0.00012639623597915232, + "loss": 1.3473, + "step": 28331 + }, + { + "epoch": 0.368161418224523, + "grad_norm": 0.3651241958141327, + "learning_rate": 0.00012639363651724094, + "loss": 1.2269, + "step": 28332 + }, + { + "epoch": 0.3681744127684389, + "grad_norm": 0.42561301589012146, + "learning_rate": 0.00012639103705532957, + "loss": 1.3654, + "step": 28333 + }, + { + "epoch": 0.3681874073123547, + "grad_norm": 0.394216924905777, + "learning_rate": 0.00012638843759341816, + "loss": 1.3629, + "step": 28334 + }, + { + "epoch": 0.3682004018562706, + "grad_norm": 0.40950807929039, + "learning_rate": 0.0001263858381315068, + "loss": 1.5047, + "step": 28335 + }, + { + "epoch": 0.36821339640018647, + "grad_norm": 0.27787330746650696, + "learning_rate": 0.0001263832386695954, + "loss": 1.4821, + "step": 28336 + }, + { + "epoch": 0.36822639094410237, + "grad_norm": 0.3892754018306732, + "learning_rate": 0.000126380639207684, + "loss": 1.3792, + "step": 28337 + }, + { + "epoch": 0.3682393854880182, + "grad_norm": 0.38460496068000793, + "learning_rate": 0.00012637803974577263, + "loss": 1.3473, + "step": 28338 + }, + { + "epoch": 0.3682523800319341, + "grad_norm": 0.44265082478523254, + "learning_rate": 0.00012637544028386126, + "loss": 1.361, + "step": 28339 + }, + { + "epoch": 0.36826537457584996, + "grad_norm": 0.4281325936317444, + "learning_rate": 0.00012637284082194986, + "loss": 1.4208, + "step": 28340 + }, + { + "epoch": 0.36827836911976586, + "grad_norm": 0.4138733446598053, + "learning_rate": 0.00012637024136003848, + "loss": 1.4761, + "step": 28341 + }, + { + "epoch": 0.3682913636636817, + "grad_norm": 0.405500590801239, + "learning_rate": 0.00012636764189812708, + "loss": 1.4006, + "step": 28342 + }, + { + "epoch": 0.3683043582075976, + "grad_norm": 0.39190343022346497, + "learning_rate": 0.00012636504243621573, + "loss": 1.2992, + "step": 28343 + }, + { + "epoch": 0.36831735275151345, + "grad_norm": 0.47981125116348267, + "learning_rate": 0.00012636244297430433, + "loss": 1.49, + "step": 28344 + }, + { + "epoch": 0.36833034729542935, + "grad_norm": 0.40345466136932373, + "learning_rate": 0.00012635984351239295, + "loss": 1.3537, + "step": 28345 + }, + { + "epoch": 0.3683433418393452, + "grad_norm": 0.3976694941520691, + "learning_rate": 0.00012635724405048155, + "loss": 1.5583, + "step": 28346 + }, + { + "epoch": 0.3683563363832611, + "grad_norm": 0.31501248478889465, + "learning_rate": 0.00012635464458857017, + "loss": 1.3989, + "step": 28347 + }, + { + "epoch": 0.36836933092717694, + "grad_norm": 0.45425599813461304, + "learning_rate": 0.0001263520451266588, + "loss": 1.4561, + "step": 28348 + }, + { + "epoch": 0.36838232547109284, + "grad_norm": 0.3244284987449646, + "learning_rate": 0.0001263494456647474, + "loss": 1.2902, + "step": 28349 + }, + { + "epoch": 0.3683953200150087, + "grad_norm": 0.5544261336326599, + "learning_rate": 0.00012634684620283602, + "loss": 1.4324, + "step": 28350 + }, + { + "epoch": 0.3684083145589246, + "grad_norm": 0.37385526299476624, + "learning_rate": 0.00012634424674092464, + "loss": 1.4953, + "step": 28351 + }, + { + "epoch": 0.36842130910284043, + "grad_norm": 0.3878785967826843, + "learning_rate": 0.00012634164727901324, + "loss": 1.2415, + "step": 28352 + }, + { + "epoch": 0.36843430364675633, + "grad_norm": 0.32700225710868835, + "learning_rate": 0.00012633904781710187, + "loss": 1.2772, + "step": 28353 + }, + { + "epoch": 0.3684472981906722, + "grad_norm": 0.43131348490715027, + "learning_rate": 0.00012633644835519046, + "loss": 1.3612, + "step": 28354 + }, + { + "epoch": 0.3684602927345881, + "grad_norm": 0.36507701873779297, + "learning_rate": 0.00012633384889327911, + "loss": 1.4487, + "step": 28355 + }, + { + "epoch": 0.3684732872785039, + "grad_norm": 0.4436410367488861, + "learning_rate": 0.0001263312494313677, + "loss": 1.5529, + "step": 28356 + }, + { + "epoch": 0.3684862818224198, + "grad_norm": 0.5941357612609863, + "learning_rate": 0.00012632864996945634, + "loss": 1.5078, + "step": 28357 + }, + { + "epoch": 0.36849927636633567, + "grad_norm": 0.5877930521965027, + "learning_rate": 0.00012632605050754493, + "loss": 1.3498, + "step": 28358 + }, + { + "epoch": 0.36851227091025157, + "grad_norm": 0.4041191339492798, + "learning_rate": 0.00012632345104563356, + "loss": 1.4086, + "step": 28359 + }, + { + "epoch": 0.3685252654541674, + "grad_norm": 0.3810862600803375, + "learning_rate": 0.00012632085158372218, + "loss": 1.4334, + "step": 28360 + }, + { + "epoch": 0.3685382599980833, + "grad_norm": 0.4194347560405731, + "learning_rate": 0.00012631825212181078, + "loss": 1.3721, + "step": 28361 + }, + { + "epoch": 0.36855125454199916, + "grad_norm": 0.30258432030677795, + "learning_rate": 0.0001263156526598994, + "loss": 1.2923, + "step": 28362 + }, + { + "epoch": 0.36856424908591506, + "grad_norm": 0.44581466913223267, + "learning_rate": 0.00012631305319798803, + "loss": 1.3805, + "step": 28363 + }, + { + "epoch": 0.3685772436298309, + "grad_norm": 0.4554748237133026, + "learning_rate": 0.00012631045373607663, + "loss": 1.4398, + "step": 28364 + }, + { + "epoch": 0.3685902381737468, + "grad_norm": 0.4317418038845062, + "learning_rate": 0.00012630785427416525, + "loss": 1.4953, + "step": 28365 + }, + { + "epoch": 0.36860323271766265, + "grad_norm": 0.3941706418991089, + "learning_rate": 0.00012630525481225385, + "loss": 1.2766, + "step": 28366 + }, + { + "epoch": 0.36861622726157856, + "grad_norm": 0.48049888014793396, + "learning_rate": 0.0001263026553503425, + "loss": 1.4382, + "step": 28367 + }, + { + "epoch": 0.3686292218054944, + "grad_norm": 0.4745429754257202, + "learning_rate": 0.0001263000558884311, + "loss": 1.3276, + "step": 28368 + }, + { + "epoch": 0.3686422163494103, + "grad_norm": 0.5584399104118347, + "learning_rate": 0.00012629745642651972, + "loss": 1.4464, + "step": 28369 + }, + { + "epoch": 0.36865521089332615, + "grad_norm": 0.2976830303668976, + "learning_rate": 0.00012629485696460832, + "loss": 1.2346, + "step": 28370 + }, + { + "epoch": 0.36866820543724205, + "grad_norm": 0.44537925720214844, + "learning_rate": 0.00012629225750269694, + "loss": 1.3879, + "step": 28371 + }, + { + "epoch": 0.3686811999811579, + "grad_norm": 0.4536598324775696, + "learning_rate": 0.00012628965804078557, + "loss": 1.4492, + "step": 28372 + }, + { + "epoch": 0.3686941945250738, + "grad_norm": 0.41047587990760803, + "learning_rate": 0.00012628705857887417, + "loss": 1.5961, + "step": 28373 + }, + { + "epoch": 0.36870718906898964, + "grad_norm": 0.4140320420265198, + "learning_rate": 0.00012628445911696282, + "loss": 1.2937, + "step": 28374 + }, + { + "epoch": 0.36872018361290554, + "grad_norm": 0.4037458598613739, + "learning_rate": 0.00012628185965505141, + "loss": 1.5198, + "step": 28375 + }, + { + "epoch": 0.3687331781568214, + "grad_norm": 0.3343091905117035, + "learning_rate": 0.00012627926019314004, + "loss": 1.253, + "step": 28376 + }, + { + "epoch": 0.3687461727007373, + "grad_norm": 0.34510770440101624, + "learning_rate": 0.00012627666073122864, + "loss": 1.5007, + "step": 28377 + }, + { + "epoch": 0.36875916724465313, + "grad_norm": 0.47452855110168457, + "learning_rate": 0.00012627406126931726, + "loss": 1.436, + "step": 28378 + }, + { + "epoch": 0.36877216178856903, + "grad_norm": 0.3934592306613922, + "learning_rate": 0.00012627146180740589, + "loss": 1.2946, + "step": 28379 + }, + { + "epoch": 0.3687851563324849, + "grad_norm": 0.44692307710647583, + "learning_rate": 0.00012626886234549448, + "loss": 1.3827, + "step": 28380 + }, + { + "epoch": 0.3687981508764008, + "grad_norm": 0.277104914188385, + "learning_rate": 0.0001262662628835831, + "loss": 1.3933, + "step": 28381 + }, + { + "epoch": 0.3688111454203166, + "grad_norm": 0.31293246150016785, + "learning_rate": 0.00012626366342167173, + "loss": 1.3516, + "step": 28382 + }, + { + "epoch": 0.3688241399642325, + "grad_norm": 0.38781681656837463, + "learning_rate": 0.00012626106395976033, + "loss": 1.1197, + "step": 28383 + }, + { + "epoch": 0.36883713450814837, + "grad_norm": 0.3892761766910553, + "learning_rate": 0.00012625846449784895, + "loss": 1.376, + "step": 28384 + }, + { + "epoch": 0.36885012905206427, + "grad_norm": 0.4109431207180023, + "learning_rate": 0.00012625586503593755, + "loss": 1.3412, + "step": 28385 + }, + { + "epoch": 0.3688631235959801, + "grad_norm": 0.4051346182823181, + "learning_rate": 0.0001262532655740262, + "loss": 1.4347, + "step": 28386 + }, + { + "epoch": 0.368876118139896, + "grad_norm": 0.43324047327041626, + "learning_rate": 0.0001262506661121148, + "loss": 1.4616, + "step": 28387 + }, + { + "epoch": 0.36888911268381186, + "grad_norm": 0.44072192907333374, + "learning_rate": 0.00012624806665020342, + "loss": 1.2729, + "step": 28388 + }, + { + "epoch": 0.36890210722772776, + "grad_norm": 0.5031135678291321, + "learning_rate": 0.00012624546718829202, + "loss": 1.4062, + "step": 28389 + }, + { + "epoch": 0.3689151017716436, + "grad_norm": 0.31822875142097473, + "learning_rate": 0.00012624286772638065, + "loss": 1.3635, + "step": 28390 + }, + { + "epoch": 0.3689280963155595, + "grad_norm": 0.37226349115371704, + "learning_rate": 0.00012624026826446927, + "loss": 1.2548, + "step": 28391 + }, + { + "epoch": 0.36894109085947535, + "grad_norm": 0.426704466342926, + "learning_rate": 0.00012623766880255787, + "loss": 1.2026, + "step": 28392 + }, + { + "epoch": 0.36895408540339125, + "grad_norm": 0.36013510823249817, + "learning_rate": 0.0001262350693406465, + "loss": 1.3736, + "step": 28393 + }, + { + "epoch": 0.3689670799473071, + "grad_norm": 0.4186386466026306, + "learning_rate": 0.00012623246987873512, + "loss": 1.5031, + "step": 28394 + }, + { + "epoch": 0.368980074491223, + "grad_norm": 0.38532984256744385, + "learning_rate": 0.00012622987041682371, + "loss": 1.3806, + "step": 28395 + }, + { + "epoch": 0.3689930690351389, + "grad_norm": 0.40690773725509644, + "learning_rate": 0.00012622727095491234, + "loss": 1.5597, + "step": 28396 + }, + { + "epoch": 0.36900606357905474, + "grad_norm": 0.42174431681632996, + "learning_rate": 0.00012622467149300094, + "loss": 1.2916, + "step": 28397 + }, + { + "epoch": 0.36901905812297064, + "grad_norm": 0.286306232213974, + "learning_rate": 0.0001262220720310896, + "loss": 1.3654, + "step": 28398 + }, + { + "epoch": 0.3690320526668865, + "grad_norm": 0.39403626322746277, + "learning_rate": 0.00012621947256917819, + "loss": 1.3287, + "step": 28399 + }, + { + "epoch": 0.3690450472108024, + "grad_norm": 0.3883787989616394, + "learning_rate": 0.0001262168731072668, + "loss": 1.5887, + "step": 28400 + }, + { + "epoch": 0.36905804175471824, + "grad_norm": 0.46099182963371277, + "learning_rate": 0.0001262142736453554, + "loss": 1.2755, + "step": 28401 + }, + { + "epoch": 0.36907103629863414, + "grad_norm": 0.48636871576309204, + "learning_rate": 0.00012621167418344403, + "loss": 1.3673, + "step": 28402 + }, + { + "epoch": 0.36908403084255, + "grad_norm": 0.3928277790546417, + "learning_rate": 0.00012620907472153266, + "loss": 1.3767, + "step": 28403 + }, + { + "epoch": 0.3690970253864659, + "grad_norm": 0.37164103984832764, + "learning_rate": 0.00012620647525962125, + "loss": 1.2692, + "step": 28404 + }, + { + "epoch": 0.3691100199303817, + "grad_norm": 0.435621052980423, + "learning_rate": 0.00012620387579770988, + "loss": 1.4863, + "step": 28405 + }, + { + "epoch": 0.36912301447429763, + "grad_norm": 0.3375067412853241, + "learning_rate": 0.0001262012763357985, + "loss": 1.4317, + "step": 28406 + }, + { + "epoch": 0.3691360090182135, + "grad_norm": 0.38092947006225586, + "learning_rate": 0.0001261986768738871, + "loss": 1.1144, + "step": 28407 + }, + { + "epoch": 0.3691490035621294, + "grad_norm": 0.37608110904693604, + "learning_rate": 0.00012619607741197572, + "loss": 1.2479, + "step": 28408 + }, + { + "epoch": 0.3691619981060452, + "grad_norm": 0.4085616171360016, + "learning_rate": 0.00012619347795006435, + "loss": 1.2836, + "step": 28409 + }, + { + "epoch": 0.3691749926499611, + "grad_norm": 0.3654405176639557, + "learning_rate": 0.00012619087848815297, + "loss": 1.4252, + "step": 28410 + }, + { + "epoch": 0.36918798719387697, + "grad_norm": 0.3107021152973175, + "learning_rate": 0.00012618827902624157, + "loss": 1.5306, + "step": 28411 + }, + { + "epoch": 0.36920098173779287, + "grad_norm": 0.20088303089141846, + "learning_rate": 0.0001261856795643302, + "loss": 1.1483, + "step": 28412 + }, + { + "epoch": 0.3692139762817087, + "grad_norm": 0.37039002776145935, + "learning_rate": 0.00012618308010241882, + "loss": 1.4032, + "step": 28413 + }, + { + "epoch": 0.3692269708256246, + "grad_norm": 0.383558988571167, + "learning_rate": 0.00012618048064050742, + "loss": 1.2915, + "step": 28414 + }, + { + "epoch": 0.36923996536954046, + "grad_norm": 0.3268488347530365, + "learning_rate": 0.00012617788117859604, + "loss": 1.2759, + "step": 28415 + }, + { + "epoch": 0.36925295991345636, + "grad_norm": 0.2907080054283142, + "learning_rate": 0.00012617528171668464, + "loss": 1.221, + "step": 28416 + }, + { + "epoch": 0.3692659544573722, + "grad_norm": 0.3817647695541382, + "learning_rate": 0.0001261726822547733, + "loss": 1.6906, + "step": 28417 + }, + { + "epoch": 0.3692789490012881, + "grad_norm": 0.3616327941417694, + "learning_rate": 0.0001261700827928619, + "loss": 1.3307, + "step": 28418 + }, + { + "epoch": 0.36929194354520395, + "grad_norm": 0.35940033197402954, + "learning_rate": 0.00012616748333095049, + "loss": 1.3133, + "step": 28419 + }, + { + "epoch": 0.36930493808911985, + "grad_norm": 0.4270550012588501, + "learning_rate": 0.0001261648838690391, + "loss": 1.4502, + "step": 28420 + }, + { + "epoch": 0.3693179326330357, + "grad_norm": 0.32805201411247253, + "learning_rate": 0.00012616228440712773, + "loss": 1.257, + "step": 28421 + }, + { + "epoch": 0.3693309271769516, + "grad_norm": 0.3111568093299866, + "learning_rate": 0.00012615968494521636, + "loss": 1.2839, + "step": 28422 + }, + { + "epoch": 0.36934392172086744, + "grad_norm": 0.29012373089790344, + "learning_rate": 0.00012615708548330496, + "loss": 1.3184, + "step": 28423 + }, + { + "epoch": 0.36935691626478334, + "grad_norm": 0.3929380476474762, + "learning_rate": 0.00012615448602139358, + "loss": 1.4195, + "step": 28424 + }, + { + "epoch": 0.3693699108086992, + "grad_norm": 0.39281922578811646, + "learning_rate": 0.0001261518865594822, + "loss": 1.5138, + "step": 28425 + }, + { + "epoch": 0.3693829053526151, + "grad_norm": 0.46634259819984436, + "learning_rate": 0.0001261492870975708, + "loss": 1.4582, + "step": 28426 + }, + { + "epoch": 0.36939589989653093, + "grad_norm": 0.38453537225723267, + "learning_rate": 0.00012614668763565943, + "loss": 1.3435, + "step": 28427 + }, + { + "epoch": 0.36940889444044683, + "grad_norm": 0.36820077896118164, + "learning_rate": 0.00012614408817374802, + "loss": 1.4042, + "step": 28428 + }, + { + "epoch": 0.3694218889843627, + "grad_norm": 0.5294620990753174, + "learning_rate": 0.00012614148871183668, + "loss": 1.415, + "step": 28429 + }, + { + "epoch": 0.3694348835282786, + "grad_norm": 0.3890458941459656, + "learning_rate": 0.00012613888924992527, + "loss": 1.5321, + "step": 28430 + }, + { + "epoch": 0.3694478780721944, + "grad_norm": 0.4221755862236023, + "learning_rate": 0.00012613628978801387, + "loss": 1.3836, + "step": 28431 + }, + { + "epoch": 0.3694608726161103, + "grad_norm": 0.3718966841697693, + "learning_rate": 0.0001261336903261025, + "loss": 1.3411, + "step": 28432 + }, + { + "epoch": 0.36947386716002617, + "grad_norm": 0.5034308433532715, + "learning_rate": 0.00012613109086419112, + "loss": 1.5535, + "step": 28433 + }, + { + "epoch": 0.36948686170394207, + "grad_norm": 0.36583542823791504, + "learning_rate": 0.00012612849140227974, + "loss": 1.3992, + "step": 28434 + }, + { + "epoch": 0.3694998562478579, + "grad_norm": 0.7009222507476807, + "learning_rate": 0.00012612589194036834, + "loss": 1.3751, + "step": 28435 + }, + { + "epoch": 0.3695128507917738, + "grad_norm": 0.43068015575408936, + "learning_rate": 0.00012612329247845697, + "loss": 1.5226, + "step": 28436 + }, + { + "epoch": 0.36952584533568966, + "grad_norm": 0.4313845932483673, + "learning_rate": 0.0001261206930165456, + "loss": 1.3913, + "step": 28437 + }, + { + "epoch": 0.36953883987960556, + "grad_norm": 0.37349241971969604, + "learning_rate": 0.0001261180935546342, + "loss": 1.3403, + "step": 28438 + }, + { + "epoch": 0.3695518344235214, + "grad_norm": 0.45809611678123474, + "learning_rate": 0.0001261154940927228, + "loss": 1.4647, + "step": 28439 + }, + { + "epoch": 0.3695648289674373, + "grad_norm": 0.2655130922794342, + "learning_rate": 0.0001261128946308114, + "loss": 1.1656, + "step": 28440 + }, + { + "epoch": 0.36957782351135315, + "grad_norm": 0.41603967547416687, + "learning_rate": 0.00012611029516890006, + "loss": 1.5333, + "step": 28441 + }, + { + "epoch": 0.36959081805526905, + "grad_norm": 0.39628106355667114, + "learning_rate": 0.00012610769570698866, + "loss": 1.5212, + "step": 28442 + }, + { + "epoch": 0.3696038125991849, + "grad_norm": 0.44422391057014465, + "learning_rate": 0.00012610509624507728, + "loss": 1.4105, + "step": 28443 + }, + { + "epoch": 0.3696168071431008, + "grad_norm": 0.48169004917144775, + "learning_rate": 0.00012610249678316588, + "loss": 1.4759, + "step": 28444 + }, + { + "epoch": 0.36962980168701665, + "grad_norm": 0.4032215178012848, + "learning_rate": 0.0001260998973212545, + "loss": 1.3084, + "step": 28445 + }, + { + "epoch": 0.36964279623093255, + "grad_norm": 0.4138106405735016, + "learning_rate": 0.00012609729785934313, + "loss": 1.433, + "step": 28446 + }, + { + "epoch": 0.3696557907748484, + "grad_norm": 0.27430957555770874, + "learning_rate": 0.00012609469839743173, + "loss": 1.3192, + "step": 28447 + }, + { + "epoch": 0.3696687853187643, + "grad_norm": 0.4157394766807556, + "learning_rate": 0.00012609209893552035, + "loss": 1.4377, + "step": 28448 + }, + { + "epoch": 0.36968177986268014, + "grad_norm": 0.4084011912345886, + "learning_rate": 0.00012608949947360898, + "loss": 1.3974, + "step": 28449 + }, + { + "epoch": 0.36969477440659604, + "grad_norm": 0.2879941761493683, + "learning_rate": 0.00012608690001169757, + "loss": 1.3122, + "step": 28450 + }, + { + "epoch": 0.3697077689505119, + "grad_norm": 0.443834125995636, + "learning_rate": 0.0001260843005497862, + "loss": 1.335, + "step": 28451 + }, + { + "epoch": 0.3697207634944278, + "grad_norm": 0.5865363478660583, + "learning_rate": 0.00012608170108787482, + "loss": 1.4374, + "step": 28452 + }, + { + "epoch": 0.36973375803834363, + "grad_norm": 0.4429281949996948, + "learning_rate": 0.00012607910162596345, + "loss": 1.4711, + "step": 28453 + }, + { + "epoch": 0.36974675258225953, + "grad_norm": 0.5329416990280151, + "learning_rate": 0.00012607650216405204, + "loss": 1.4233, + "step": 28454 + }, + { + "epoch": 0.3697597471261754, + "grad_norm": 0.39693841338157654, + "learning_rate": 0.00012607390270214067, + "loss": 1.2098, + "step": 28455 + }, + { + "epoch": 0.3697727416700913, + "grad_norm": 0.4363026022911072, + "learning_rate": 0.0001260713032402293, + "loss": 1.3654, + "step": 28456 + }, + { + "epoch": 0.3697857362140071, + "grad_norm": 0.46276989579200745, + "learning_rate": 0.0001260687037783179, + "loss": 1.5165, + "step": 28457 + }, + { + "epoch": 0.369798730757923, + "grad_norm": 0.4555894732475281, + "learning_rate": 0.00012606610431640651, + "loss": 1.4129, + "step": 28458 + }, + { + "epoch": 0.36981172530183887, + "grad_norm": 0.3975951075553894, + "learning_rate": 0.0001260635048544951, + "loss": 1.2047, + "step": 28459 + }, + { + "epoch": 0.36982471984575477, + "grad_norm": 0.31107133626937866, + "learning_rate": 0.00012606090539258376, + "loss": 1.459, + "step": 28460 + }, + { + "epoch": 0.3698377143896706, + "grad_norm": 0.45250436663627625, + "learning_rate": 0.00012605830593067236, + "loss": 1.5376, + "step": 28461 + }, + { + "epoch": 0.3698507089335865, + "grad_norm": 0.37765440344810486, + "learning_rate": 0.00012605570646876096, + "loss": 1.4927, + "step": 28462 + }, + { + "epoch": 0.36986370347750236, + "grad_norm": 0.4555840492248535, + "learning_rate": 0.00012605310700684958, + "loss": 1.462, + "step": 28463 + }, + { + "epoch": 0.36987669802141826, + "grad_norm": 0.4437340199947357, + "learning_rate": 0.0001260505075449382, + "loss": 1.22, + "step": 28464 + }, + { + "epoch": 0.3698896925653341, + "grad_norm": 0.385195255279541, + "learning_rate": 0.00012604790808302683, + "loss": 1.3634, + "step": 28465 + }, + { + "epoch": 0.36990268710925, + "grad_norm": 0.37381511926651, + "learning_rate": 0.00012604530862111543, + "loss": 1.4284, + "step": 28466 + }, + { + "epoch": 0.36991568165316585, + "grad_norm": 0.3051937520503998, + "learning_rate": 0.00012604270915920405, + "loss": 1.2756, + "step": 28467 + }, + { + "epoch": 0.36992867619708175, + "grad_norm": 0.4351508319377899, + "learning_rate": 0.00012604010969729268, + "loss": 1.3764, + "step": 28468 + }, + { + "epoch": 0.3699416707409976, + "grad_norm": 0.41596466302871704, + "learning_rate": 0.00012603751023538128, + "loss": 1.3805, + "step": 28469 + }, + { + "epoch": 0.3699546652849135, + "grad_norm": 0.5010490417480469, + "learning_rate": 0.0001260349107734699, + "loss": 1.485, + "step": 28470 + }, + { + "epoch": 0.36996765982882934, + "grad_norm": 0.3573402464389801, + "learning_rate": 0.0001260323113115585, + "loss": 1.2764, + "step": 28471 + }, + { + "epoch": 0.36998065437274524, + "grad_norm": 0.34636861085891724, + "learning_rate": 0.00012602971184964715, + "loss": 1.4044, + "step": 28472 + }, + { + "epoch": 0.36999364891666114, + "grad_norm": 0.34264034032821655, + "learning_rate": 0.00012602711238773575, + "loss": 1.2301, + "step": 28473 + }, + { + "epoch": 0.370006643460577, + "grad_norm": 0.35069140791893005, + "learning_rate": 0.00012602451292582434, + "loss": 1.4195, + "step": 28474 + }, + { + "epoch": 0.3700196380044929, + "grad_norm": 0.4894803464412689, + "learning_rate": 0.00012602191346391297, + "loss": 1.3296, + "step": 28475 + }, + { + "epoch": 0.37003263254840874, + "grad_norm": 0.37858906388282776, + "learning_rate": 0.0001260193140020016, + "loss": 1.2516, + "step": 28476 + }, + { + "epoch": 0.37004562709232464, + "grad_norm": 0.33961084485054016, + "learning_rate": 0.00012601671454009022, + "loss": 1.4527, + "step": 28477 + }, + { + "epoch": 0.3700586216362405, + "grad_norm": 0.3975807726383209, + "learning_rate": 0.00012601411507817881, + "loss": 1.4288, + "step": 28478 + }, + { + "epoch": 0.3700716161801564, + "grad_norm": 0.3572922348976135, + "learning_rate": 0.00012601151561626744, + "loss": 1.4308, + "step": 28479 + }, + { + "epoch": 0.3700846107240722, + "grad_norm": 0.4830910265445709, + "learning_rate": 0.00012600891615435606, + "loss": 1.5647, + "step": 28480 + }, + { + "epoch": 0.37009760526798813, + "grad_norm": 0.30390429496765137, + "learning_rate": 0.00012600631669244466, + "loss": 1.3876, + "step": 28481 + }, + { + "epoch": 0.370110599811904, + "grad_norm": 0.4210784137248993, + "learning_rate": 0.00012600371723053329, + "loss": 1.3366, + "step": 28482 + }, + { + "epoch": 0.3701235943558199, + "grad_norm": 0.28564774990081787, + "learning_rate": 0.00012600111776862188, + "loss": 1.3688, + "step": 28483 + }, + { + "epoch": 0.3701365888997357, + "grad_norm": 0.4326128363609314, + "learning_rate": 0.00012599851830671053, + "loss": 1.4626, + "step": 28484 + }, + { + "epoch": 0.3701495834436516, + "grad_norm": 0.3149370551109314, + "learning_rate": 0.00012599591884479913, + "loss": 1.2964, + "step": 28485 + }, + { + "epoch": 0.37016257798756746, + "grad_norm": 0.3751177191734314, + "learning_rate": 0.00012599331938288773, + "loss": 1.2786, + "step": 28486 + }, + { + "epoch": 0.37017557253148337, + "grad_norm": 0.3783768117427826, + "learning_rate": 0.00012599071992097638, + "loss": 1.3866, + "step": 28487 + }, + { + "epoch": 0.3701885670753992, + "grad_norm": 0.3886982500553131, + "learning_rate": 0.00012598812045906498, + "loss": 1.4466, + "step": 28488 + }, + { + "epoch": 0.3702015616193151, + "grad_norm": 0.4140985310077667, + "learning_rate": 0.0001259855209971536, + "loss": 1.4811, + "step": 28489 + }, + { + "epoch": 0.37021455616323096, + "grad_norm": 0.40521103143692017, + "learning_rate": 0.0001259829215352422, + "loss": 1.4413, + "step": 28490 + }, + { + "epoch": 0.37022755070714686, + "grad_norm": 0.44567880034446716, + "learning_rate": 0.00012598032207333082, + "loss": 1.4824, + "step": 28491 + }, + { + "epoch": 0.3702405452510627, + "grad_norm": 0.3174348771572113, + "learning_rate": 0.00012597772261141945, + "loss": 1.4906, + "step": 28492 + }, + { + "epoch": 0.3702535397949786, + "grad_norm": 0.4010981321334839, + "learning_rate": 0.00012597512314950805, + "loss": 1.6142, + "step": 28493 + }, + { + "epoch": 0.37026653433889445, + "grad_norm": 0.33209729194641113, + "learning_rate": 0.00012597252368759667, + "loss": 1.267, + "step": 28494 + }, + { + "epoch": 0.37027952888281035, + "grad_norm": 0.4276234805583954, + "learning_rate": 0.0001259699242256853, + "loss": 1.3047, + "step": 28495 + }, + { + "epoch": 0.3702925234267262, + "grad_norm": 0.42527666687965393, + "learning_rate": 0.00012596732476377392, + "loss": 1.3985, + "step": 28496 + }, + { + "epoch": 0.3703055179706421, + "grad_norm": 0.40639492869377136, + "learning_rate": 0.00012596472530186252, + "loss": 1.5361, + "step": 28497 + }, + { + "epoch": 0.37031851251455794, + "grad_norm": 0.43494170904159546, + "learning_rate": 0.00012596212583995114, + "loss": 1.3271, + "step": 28498 + }, + { + "epoch": 0.37033150705847384, + "grad_norm": 0.3743082582950592, + "learning_rate": 0.00012595952637803977, + "loss": 1.6363, + "step": 28499 + }, + { + "epoch": 0.3703445016023897, + "grad_norm": 0.4028823673725128, + "learning_rate": 0.00012595692691612836, + "loss": 1.3723, + "step": 28500 + }, + { + "epoch": 0.3703574961463056, + "grad_norm": 0.48029401898384094, + "learning_rate": 0.000125954327454217, + "loss": 1.4518, + "step": 28501 + }, + { + "epoch": 0.37037049069022143, + "grad_norm": 0.4387197196483612, + "learning_rate": 0.00012595172799230559, + "loss": 1.624, + "step": 28502 + }, + { + "epoch": 0.37038348523413733, + "grad_norm": 0.3294973373413086, + "learning_rate": 0.0001259491285303942, + "loss": 1.4965, + "step": 28503 + }, + { + "epoch": 0.3703964797780532, + "grad_norm": 0.44095978140830994, + "learning_rate": 0.00012594652906848283, + "loss": 1.3095, + "step": 28504 + }, + { + "epoch": 0.3704094743219691, + "grad_norm": 0.4823935925960541, + "learning_rate": 0.00012594392960657143, + "loss": 1.5212, + "step": 28505 + }, + { + "epoch": 0.3704224688658849, + "grad_norm": 0.5020119547843933, + "learning_rate": 0.00012594133014466006, + "loss": 1.4705, + "step": 28506 + }, + { + "epoch": 0.3704354634098008, + "grad_norm": 0.39172613620758057, + "learning_rate": 0.00012593873068274868, + "loss": 1.4425, + "step": 28507 + }, + { + "epoch": 0.37044845795371667, + "grad_norm": 0.4444243907928467, + "learning_rate": 0.0001259361312208373, + "loss": 1.4462, + "step": 28508 + }, + { + "epoch": 0.37046145249763257, + "grad_norm": 0.34541818499565125, + "learning_rate": 0.0001259335317589259, + "loss": 1.223, + "step": 28509 + }, + { + "epoch": 0.3704744470415484, + "grad_norm": 0.35177627205848694, + "learning_rate": 0.00012593093229701453, + "loss": 1.6182, + "step": 28510 + }, + { + "epoch": 0.3704874415854643, + "grad_norm": 0.3601895570755005, + "learning_rate": 0.00012592833283510315, + "loss": 1.3744, + "step": 28511 + }, + { + "epoch": 0.37050043612938016, + "grad_norm": 0.42994552850723267, + "learning_rate": 0.00012592573337319175, + "loss": 1.2998, + "step": 28512 + }, + { + "epoch": 0.37051343067329606, + "grad_norm": 0.42121946811676025, + "learning_rate": 0.00012592313391128037, + "loss": 1.3006, + "step": 28513 + }, + { + "epoch": 0.3705264252172119, + "grad_norm": 0.5001784563064575, + "learning_rate": 0.00012592053444936897, + "loss": 1.4156, + "step": 28514 + }, + { + "epoch": 0.3705394197611278, + "grad_norm": 0.5242595672607422, + "learning_rate": 0.0001259179349874576, + "loss": 1.3819, + "step": 28515 + }, + { + "epoch": 0.37055241430504365, + "grad_norm": 0.43644097447395325, + "learning_rate": 0.00012591533552554622, + "loss": 1.5083, + "step": 28516 + }, + { + "epoch": 0.37056540884895955, + "grad_norm": 0.3800097703933716, + "learning_rate": 0.00012591273606363482, + "loss": 1.5342, + "step": 28517 + }, + { + "epoch": 0.3705784033928754, + "grad_norm": 0.41643908619880676, + "learning_rate": 0.00012591013660172344, + "loss": 1.2589, + "step": 28518 + }, + { + "epoch": 0.3705913979367913, + "grad_norm": 0.39021429419517517, + "learning_rate": 0.00012590753713981207, + "loss": 1.3798, + "step": 28519 + }, + { + "epoch": 0.37060439248070715, + "grad_norm": 0.49075019359588623, + "learning_rate": 0.0001259049376779007, + "loss": 1.4627, + "step": 28520 + }, + { + "epoch": 0.37061738702462305, + "grad_norm": 0.33392399549484253, + "learning_rate": 0.0001259023382159893, + "loss": 1.5124, + "step": 28521 + }, + { + "epoch": 0.3706303815685389, + "grad_norm": 0.4660463035106659, + "learning_rate": 0.0001258997387540779, + "loss": 1.3184, + "step": 28522 + }, + { + "epoch": 0.3706433761124548, + "grad_norm": 0.48202791810035706, + "learning_rate": 0.00012589713929216654, + "loss": 1.5474, + "step": 28523 + }, + { + "epoch": 0.37065637065637064, + "grad_norm": 0.43193021416664124, + "learning_rate": 0.00012589453983025513, + "loss": 1.3771, + "step": 28524 + }, + { + "epoch": 0.37066936520028654, + "grad_norm": 0.6539021134376526, + "learning_rate": 0.00012589194036834376, + "loss": 1.4231, + "step": 28525 + }, + { + "epoch": 0.3706823597442024, + "grad_norm": 0.3315410017967224, + "learning_rate": 0.00012588934090643238, + "loss": 1.3109, + "step": 28526 + }, + { + "epoch": 0.3706953542881183, + "grad_norm": 0.4619930386543274, + "learning_rate": 0.000125886741444521, + "loss": 1.5056, + "step": 28527 + }, + { + "epoch": 0.37070834883203413, + "grad_norm": 0.42916443943977356, + "learning_rate": 0.0001258841419826096, + "loss": 1.3638, + "step": 28528 + }, + { + "epoch": 0.37072134337595003, + "grad_norm": 0.3815104067325592, + "learning_rate": 0.0001258815425206982, + "loss": 1.4779, + "step": 28529 + }, + { + "epoch": 0.3707343379198659, + "grad_norm": 0.3461533188819885, + "learning_rate": 0.00012587894305878685, + "loss": 1.4815, + "step": 28530 + }, + { + "epoch": 0.3707473324637818, + "grad_norm": 0.3713798522949219, + "learning_rate": 0.00012587634359687545, + "loss": 1.3427, + "step": 28531 + }, + { + "epoch": 0.3707603270076976, + "grad_norm": 0.3443409204483032, + "learning_rate": 0.00012587374413496408, + "loss": 1.3924, + "step": 28532 + }, + { + "epoch": 0.3707733215516135, + "grad_norm": 0.3767041265964508, + "learning_rate": 0.00012587114467305267, + "loss": 1.4005, + "step": 28533 + }, + { + "epoch": 0.37078631609552937, + "grad_norm": 0.31080323457717896, + "learning_rate": 0.0001258685452111413, + "loss": 1.4707, + "step": 28534 + }, + { + "epoch": 0.37079931063944527, + "grad_norm": 0.3808943033218384, + "learning_rate": 0.00012586594574922992, + "loss": 1.4928, + "step": 28535 + }, + { + "epoch": 0.3708123051833611, + "grad_norm": 0.43265923857688904, + "learning_rate": 0.00012586334628731852, + "loss": 1.4685, + "step": 28536 + }, + { + "epoch": 0.370825299727277, + "grad_norm": 0.2630341351032257, + "learning_rate": 0.00012586074682540714, + "loss": 1.3499, + "step": 28537 + }, + { + "epoch": 0.37083829427119286, + "grad_norm": 0.34474700689315796, + "learning_rate": 0.00012585814736349577, + "loss": 1.3261, + "step": 28538 + }, + { + "epoch": 0.37085128881510876, + "grad_norm": 0.4624866843223572, + "learning_rate": 0.0001258555479015844, + "loss": 1.4161, + "step": 28539 + }, + { + "epoch": 0.3708642833590246, + "grad_norm": 0.3981010913848877, + "learning_rate": 0.000125852948439673, + "loss": 1.3475, + "step": 28540 + }, + { + "epoch": 0.3708772779029405, + "grad_norm": 0.42675501108169556, + "learning_rate": 0.0001258503489777616, + "loss": 1.5631, + "step": 28541 + }, + { + "epoch": 0.37089027244685635, + "grad_norm": 0.38703203201293945, + "learning_rate": 0.00012584774951585024, + "loss": 1.3689, + "step": 28542 + }, + { + "epoch": 0.37090326699077225, + "grad_norm": 0.370449423789978, + "learning_rate": 0.00012584515005393884, + "loss": 1.3381, + "step": 28543 + }, + { + "epoch": 0.3709162615346881, + "grad_norm": 0.3972005546092987, + "learning_rate": 0.00012584255059202746, + "loss": 1.458, + "step": 28544 + }, + { + "epoch": 0.370929256078604, + "grad_norm": 0.47045713663101196, + "learning_rate": 0.00012583995113011606, + "loss": 1.2855, + "step": 28545 + }, + { + "epoch": 0.37094225062251984, + "grad_norm": 0.42684119939804077, + "learning_rate": 0.00012583735166820468, + "loss": 1.4915, + "step": 28546 + }, + { + "epoch": 0.37095524516643574, + "grad_norm": 0.39620745182037354, + "learning_rate": 0.0001258347522062933, + "loss": 1.424, + "step": 28547 + }, + { + "epoch": 0.37096823971035164, + "grad_norm": 0.4511460065841675, + "learning_rate": 0.0001258321527443819, + "loss": 1.4513, + "step": 28548 + }, + { + "epoch": 0.3709812342542675, + "grad_norm": 0.3990863263607025, + "learning_rate": 0.00012582955328247053, + "loss": 1.572, + "step": 28549 + }, + { + "epoch": 0.3709942287981834, + "grad_norm": 0.37590500712394714, + "learning_rate": 0.00012582695382055915, + "loss": 1.306, + "step": 28550 + }, + { + "epoch": 0.37100722334209923, + "grad_norm": 0.4584839940071106, + "learning_rate": 0.00012582435435864778, + "loss": 1.3541, + "step": 28551 + }, + { + "epoch": 0.37102021788601514, + "grad_norm": 0.42680874466896057, + "learning_rate": 0.00012582175489673638, + "loss": 1.2581, + "step": 28552 + }, + { + "epoch": 0.371033212429931, + "grad_norm": 0.46659156680107117, + "learning_rate": 0.00012581915543482497, + "loss": 1.3899, + "step": 28553 + }, + { + "epoch": 0.3710462069738469, + "grad_norm": 0.29457229375839233, + "learning_rate": 0.00012581655597291362, + "loss": 1.3952, + "step": 28554 + }, + { + "epoch": 0.3710592015177627, + "grad_norm": 0.38635507225990295, + "learning_rate": 0.00012581395651100222, + "loss": 1.5377, + "step": 28555 + }, + { + "epoch": 0.3710721960616786, + "grad_norm": 0.4559784531593323, + "learning_rate": 0.00012581135704909085, + "loss": 1.4913, + "step": 28556 + }, + { + "epoch": 0.3710851906055945, + "grad_norm": 0.4491625726222992, + "learning_rate": 0.00012580875758717944, + "loss": 1.5112, + "step": 28557 + }, + { + "epoch": 0.3710981851495104, + "grad_norm": 0.48063036799430847, + "learning_rate": 0.00012580615812526807, + "loss": 1.4863, + "step": 28558 + }, + { + "epoch": 0.3711111796934262, + "grad_norm": 0.4372749626636505, + "learning_rate": 0.0001258035586633567, + "loss": 1.4965, + "step": 28559 + }, + { + "epoch": 0.3711241742373421, + "grad_norm": 0.35763171315193176, + "learning_rate": 0.0001258009592014453, + "loss": 1.4775, + "step": 28560 + }, + { + "epoch": 0.37113716878125796, + "grad_norm": 0.3866100311279297, + "learning_rate": 0.00012579835973953394, + "loss": 1.4834, + "step": 28561 + }, + { + "epoch": 0.37115016332517387, + "grad_norm": 0.4046694338321686, + "learning_rate": 0.00012579576027762254, + "loss": 1.3932, + "step": 28562 + }, + { + "epoch": 0.3711631578690897, + "grad_norm": 0.31172484159469604, + "learning_rate": 0.00012579316081571116, + "loss": 1.459, + "step": 28563 + }, + { + "epoch": 0.3711761524130056, + "grad_norm": 0.3125612437725067, + "learning_rate": 0.00012579056135379976, + "loss": 1.0811, + "step": 28564 + }, + { + "epoch": 0.37118914695692146, + "grad_norm": 0.3897337019443512, + "learning_rate": 0.00012578796189188839, + "loss": 1.5187, + "step": 28565 + }, + { + "epoch": 0.37120214150083736, + "grad_norm": 0.3549739718437195, + "learning_rate": 0.000125785362429977, + "loss": 1.2253, + "step": 28566 + }, + { + "epoch": 0.3712151360447532, + "grad_norm": 0.564928412437439, + "learning_rate": 0.0001257827629680656, + "loss": 1.4104, + "step": 28567 + }, + { + "epoch": 0.3712281305886691, + "grad_norm": 0.4044153094291687, + "learning_rate": 0.00012578016350615423, + "loss": 1.3929, + "step": 28568 + }, + { + "epoch": 0.37124112513258495, + "grad_norm": 0.3604017198085785, + "learning_rate": 0.00012577756404424286, + "loss": 1.201, + "step": 28569 + }, + { + "epoch": 0.37125411967650085, + "grad_norm": 0.48543795943260193, + "learning_rate": 0.00012577496458233145, + "loss": 1.4587, + "step": 28570 + }, + { + "epoch": 0.3712671142204167, + "grad_norm": 0.4029945135116577, + "learning_rate": 0.00012577236512042008, + "loss": 1.5741, + "step": 28571 + }, + { + "epoch": 0.3712801087643326, + "grad_norm": 0.3804973065853119, + "learning_rate": 0.00012576976565850868, + "loss": 1.374, + "step": 28572 + }, + { + "epoch": 0.37129310330824844, + "grad_norm": 0.3861284554004669, + "learning_rate": 0.00012576716619659733, + "loss": 1.3911, + "step": 28573 + }, + { + "epoch": 0.37130609785216434, + "grad_norm": 0.36409443616867065, + "learning_rate": 0.00012576456673468592, + "loss": 1.4648, + "step": 28574 + }, + { + "epoch": 0.3713190923960802, + "grad_norm": 0.3111497163772583, + "learning_rate": 0.00012576196727277455, + "loss": 1.413, + "step": 28575 + }, + { + "epoch": 0.3713320869399961, + "grad_norm": 0.3570258915424347, + "learning_rate": 0.00012575936781086315, + "loss": 1.7669, + "step": 28576 + }, + { + "epoch": 0.37134508148391193, + "grad_norm": 0.3728431165218353, + "learning_rate": 0.00012575676834895177, + "loss": 1.1999, + "step": 28577 + }, + { + "epoch": 0.37135807602782783, + "grad_norm": 0.47614988684654236, + "learning_rate": 0.0001257541688870404, + "loss": 1.5474, + "step": 28578 + }, + { + "epoch": 0.3713710705717437, + "grad_norm": 0.37860941886901855, + "learning_rate": 0.000125751569425129, + "loss": 1.413, + "step": 28579 + }, + { + "epoch": 0.3713840651156596, + "grad_norm": 0.4034661054611206, + "learning_rate": 0.00012574896996321762, + "loss": 1.4269, + "step": 28580 + }, + { + "epoch": 0.3713970596595754, + "grad_norm": 0.46161821484565735, + "learning_rate": 0.00012574637050130624, + "loss": 1.3991, + "step": 28581 + }, + { + "epoch": 0.3714100542034913, + "grad_norm": 0.3956683278083801, + "learning_rate": 0.00012574377103939487, + "loss": 1.5259, + "step": 28582 + }, + { + "epoch": 0.37142304874740717, + "grad_norm": 0.42626920342445374, + "learning_rate": 0.00012574117157748346, + "loss": 1.4586, + "step": 28583 + }, + { + "epoch": 0.37143604329132307, + "grad_norm": 0.3239297866821289, + "learning_rate": 0.00012573857211557206, + "loss": 1.258, + "step": 28584 + }, + { + "epoch": 0.3714490378352389, + "grad_norm": 0.45306044816970825, + "learning_rate": 0.0001257359726536607, + "loss": 1.4138, + "step": 28585 + }, + { + "epoch": 0.3714620323791548, + "grad_norm": 0.4213177263736725, + "learning_rate": 0.0001257333731917493, + "loss": 1.329, + "step": 28586 + }, + { + "epoch": 0.37147502692307066, + "grad_norm": 0.47822171449661255, + "learning_rate": 0.00012573077372983793, + "loss": 1.4275, + "step": 28587 + }, + { + "epoch": 0.37148802146698656, + "grad_norm": 0.34636977314949036, + "learning_rate": 0.00012572817426792653, + "loss": 1.4334, + "step": 28588 + }, + { + "epoch": 0.3715010160109024, + "grad_norm": 0.30887115001678467, + "learning_rate": 0.00012572557480601516, + "loss": 1.5388, + "step": 28589 + }, + { + "epoch": 0.3715140105548183, + "grad_norm": 0.39939185976982117, + "learning_rate": 0.00012572297534410378, + "loss": 1.4764, + "step": 28590 + }, + { + "epoch": 0.37152700509873415, + "grad_norm": 0.3110973834991455, + "learning_rate": 0.00012572037588219238, + "loss": 1.3193, + "step": 28591 + }, + { + "epoch": 0.37153999964265005, + "grad_norm": 0.4323619306087494, + "learning_rate": 0.000125717776420281, + "loss": 1.3408, + "step": 28592 + }, + { + "epoch": 0.3715529941865659, + "grad_norm": 0.30838853120803833, + "learning_rate": 0.00012571517695836963, + "loss": 1.3376, + "step": 28593 + }, + { + "epoch": 0.3715659887304818, + "grad_norm": 0.47952741384506226, + "learning_rate": 0.00012571257749645825, + "loss": 1.4203, + "step": 28594 + }, + { + "epoch": 0.37157898327439765, + "grad_norm": 0.43382659554481506, + "learning_rate": 0.00012570997803454685, + "loss": 1.3298, + "step": 28595 + }, + { + "epoch": 0.37159197781831355, + "grad_norm": 0.27219194173812866, + "learning_rate": 0.00012570737857263547, + "loss": 1.491, + "step": 28596 + }, + { + "epoch": 0.3716049723622294, + "grad_norm": 0.43320417404174805, + "learning_rate": 0.0001257047791107241, + "loss": 1.4877, + "step": 28597 + }, + { + "epoch": 0.3716179669061453, + "grad_norm": 0.3958994746208191, + "learning_rate": 0.0001257021796488127, + "loss": 1.2832, + "step": 28598 + }, + { + "epoch": 0.37163096145006114, + "grad_norm": 0.37890514731407166, + "learning_rate": 0.00012569958018690132, + "loss": 1.3926, + "step": 28599 + }, + { + "epoch": 0.37164395599397704, + "grad_norm": 0.5272101163864136, + "learning_rate": 0.00012569698072498994, + "loss": 1.673, + "step": 28600 + }, + { + "epoch": 0.3716569505378929, + "grad_norm": 0.3599000871181488, + "learning_rate": 0.00012569438126307854, + "loss": 1.5321, + "step": 28601 + }, + { + "epoch": 0.3716699450818088, + "grad_norm": 0.4670974612236023, + "learning_rate": 0.00012569178180116717, + "loss": 1.3906, + "step": 28602 + }, + { + "epoch": 0.37168293962572463, + "grad_norm": 0.32362043857574463, + "learning_rate": 0.00012568918233925576, + "loss": 1.2695, + "step": 28603 + }, + { + "epoch": 0.37169593416964053, + "grad_norm": 0.27641040086746216, + "learning_rate": 0.00012568658287734442, + "loss": 1.2319, + "step": 28604 + }, + { + "epoch": 0.3717089287135564, + "grad_norm": 0.4326845407485962, + "learning_rate": 0.000125683983415433, + "loss": 1.5359, + "step": 28605 + }, + { + "epoch": 0.3717219232574723, + "grad_norm": 0.4204167425632477, + "learning_rate": 0.00012568138395352164, + "loss": 1.4979, + "step": 28606 + }, + { + "epoch": 0.3717349178013881, + "grad_norm": 0.41838812828063965, + "learning_rate": 0.00012567878449161023, + "loss": 1.4636, + "step": 28607 + }, + { + "epoch": 0.371747912345304, + "grad_norm": 0.4017530381679535, + "learning_rate": 0.00012567618502969886, + "loss": 1.4571, + "step": 28608 + }, + { + "epoch": 0.37176090688921987, + "grad_norm": 0.33240532875061035, + "learning_rate": 0.00012567358556778748, + "loss": 1.2888, + "step": 28609 + }, + { + "epoch": 0.37177390143313577, + "grad_norm": 0.4004509449005127, + "learning_rate": 0.00012567098610587608, + "loss": 1.43, + "step": 28610 + }, + { + "epoch": 0.3717868959770516, + "grad_norm": 0.3449625074863434, + "learning_rate": 0.0001256683866439647, + "loss": 1.2762, + "step": 28611 + }, + { + "epoch": 0.3717998905209675, + "grad_norm": 0.31445473432540894, + "learning_rate": 0.00012566578718205333, + "loss": 1.3907, + "step": 28612 + }, + { + "epoch": 0.37181288506488336, + "grad_norm": 0.39500120282173157, + "learning_rate": 0.00012566318772014193, + "loss": 1.3571, + "step": 28613 + }, + { + "epoch": 0.37182587960879926, + "grad_norm": 0.3693854808807373, + "learning_rate": 0.00012566058825823055, + "loss": 1.4698, + "step": 28614 + }, + { + "epoch": 0.3718388741527151, + "grad_norm": 0.44018974900245667, + "learning_rate": 0.00012565798879631915, + "loss": 1.4344, + "step": 28615 + }, + { + "epoch": 0.371851868696631, + "grad_norm": 0.3605126440525055, + "learning_rate": 0.0001256553893344078, + "loss": 1.4717, + "step": 28616 + }, + { + "epoch": 0.37186486324054685, + "grad_norm": 0.42302262783050537, + "learning_rate": 0.0001256527898724964, + "loss": 1.3964, + "step": 28617 + }, + { + "epoch": 0.37187785778446275, + "grad_norm": 0.24802769720554352, + "learning_rate": 0.00012565019041058502, + "loss": 1.2781, + "step": 28618 + }, + { + "epoch": 0.3718908523283786, + "grad_norm": 0.42664703726768494, + "learning_rate": 0.00012564759094867362, + "loss": 1.6889, + "step": 28619 + }, + { + "epoch": 0.3719038468722945, + "grad_norm": 0.4162429869174957, + "learning_rate": 0.00012564499148676224, + "loss": 1.3692, + "step": 28620 + }, + { + "epoch": 0.37191684141621034, + "grad_norm": 0.4092422425746918, + "learning_rate": 0.00012564239202485087, + "loss": 1.5516, + "step": 28621 + }, + { + "epoch": 0.37192983596012624, + "grad_norm": 0.3229100704193115, + "learning_rate": 0.00012563979256293947, + "loss": 1.4682, + "step": 28622 + }, + { + "epoch": 0.3719428305040421, + "grad_norm": 0.3950127363204956, + "learning_rate": 0.0001256371931010281, + "loss": 1.3339, + "step": 28623 + }, + { + "epoch": 0.371955825047958, + "grad_norm": 0.3432612419128418, + "learning_rate": 0.00012563459363911672, + "loss": 1.0709, + "step": 28624 + }, + { + "epoch": 0.3719688195918739, + "grad_norm": 0.321338027715683, + "learning_rate": 0.0001256319941772053, + "loss": 1.3111, + "step": 28625 + }, + { + "epoch": 0.37198181413578973, + "grad_norm": 0.48300492763519287, + "learning_rate": 0.00012562939471529394, + "loss": 1.3849, + "step": 28626 + }, + { + "epoch": 0.37199480867970564, + "grad_norm": 0.34484028816223145, + "learning_rate": 0.00012562679525338253, + "loss": 1.3808, + "step": 28627 + }, + { + "epoch": 0.3720078032236215, + "grad_norm": 0.3866361975669861, + "learning_rate": 0.00012562419579147119, + "loss": 1.3935, + "step": 28628 + }, + { + "epoch": 0.3720207977675374, + "grad_norm": 0.41459280252456665, + "learning_rate": 0.00012562159632955978, + "loss": 1.5649, + "step": 28629 + }, + { + "epoch": 0.3720337923114532, + "grad_norm": 0.3238179683685303, + "learning_rate": 0.0001256189968676484, + "loss": 1.49, + "step": 28630 + }, + { + "epoch": 0.3720467868553691, + "grad_norm": 0.27497413754463196, + "learning_rate": 0.000125616397405737, + "loss": 1.4515, + "step": 28631 + }, + { + "epoch": 0.372059781399285, + "grad_norm": 0.42133787274360657, + "learning_rate": 0.00012561379794382563, + "loss": 1.4946, + "step": 28632 + }, + { + "epoch": 0.3720727759432009, + "grad_norm": 0.4290090501308441, + "learning_rate": 0.00012561119848191425, + "loss": 1.5536, + "step": 28633 + }, + { + "epoch": 0.3720857704871167, + "grad_norm": 0.4357638359069824, + "learning_rate": 0.00012560859902000285, + "loss": 1.3266, + "step": 28634 + }, + { + "epoch": 0.3720987650310326, + "grad_norm": 0.35319873690605164, + "learning_rate": 0.0001256059995580915, + "loss": 1.2838, + "step": 28635 + }, + { + "epoch": 0.37211175957494846, + "grad_norm": 0.2876637578010559, + "learning_rate": 0.0001256034000961801, + "loss": 1.2703, + "step": 28636 + }, + { + "epoch": 0.37212475411886436, + "grad_norm": 0.40520432591438293, + "learning_rate": 0.0001256008006342687, + "loss": 1.3238, + "step": 28637 + }, + { + "epoch": 0.3721377486627802, + "grad_norm": 0.4260050654411316, + "learning_rate": 0.00012559820117235732, + "loss": 1.4977, + "step": 28638 + }, + { + "epoch": 0.3721507432066961, + "grad_norm": 0.3838316798210144, + "learning_rate": 0.00012559560171044595, + "loss": 1.592, + "step": 28639 + }, + { + "epoch": 0.37216373775061196, + "grad_norm": 0.33360186219215393, + "learning_rate": 0.00012559300224853457, + "loss": 1.5221, + "step": 28640 + }, + { + "epoch": 0.37217673229452786, + "grad_norm": 0.3735339641571045, + "learning_rate": 0.00012559040278662317, + "loss": 1.3799, + "step": 28641 + }, + { + "epoch": 0.3721897268384437, + "grad_norm": 0.41494402289390564, + "learning_rate": 0.0001255878033247118, + "loss": 1.5151, + "step": 28642 + }, + { + "epoch": 0.3722027213823596, + "grad_norm": 0.3460751175880432, + "learning_rate": 0.00012558520386280042, + "loss": 1.4712, + "step": 28643 + }, + { + "epoch": 0.37221571592627545, + "grad_norm": 0.38725244998931885, + "learning_rate": 0.00012558260440088902, + "loss": 1.356, + "step": 28644 + }, + { + "epoch": 0.37222871047019135, + "grad_norm": 0.45364412665367126, + "learning_rate": 0.00012558000493897764, + "loss": 1.2966, + "step": 28645 + }, + { + "epoch": 0.3722417050141072, + "grad_norm": 0.35522225499153137, + "learning_rate": 0.00012557740547706624, + "loss": 1.347, + "step": 28646 + }, + { + "epoch": 0.3722546995580231, + "grad_norm": 0.3954715430736542, + "learning_rate": 0.0001255748060151549, + "loss": 1.3806, + "step": 28647 + }, + { + "epoch": 0.37226769410193894, + "grad_norm": 0.2840288281440735, + "learning_rate": 0.00012557220655324349, + "loss": 1.3557, + "step": 28648 + }, + { + "epoch": 0.37228068864585484, + "grad_norm": 0.4001220166683197, + "learning_rate": 0.0001255696070913321, + "loss": 1.4431, + "step": 28649 + }, + { + "epoch": 0.3722936831897707, + "grad_norm": 0.4043634831905365, + "learning_rate": 0.0001255670076294207, + "loss": 1.3139, + "step": 28650 + }, + { + "epoch": 0.3723066777336866, + "grad_norm": 0.3944031298160553, + "learning_rate": 0.00012556440816750933, + "loss": 1.3241, + "step": 28651 + }, + { + "epoch": 0.37231967227760243, + "grad_norm": 0.3356615900993347, + "learning_rate": 0.00012556180870559796, + "loss": 1.0619, + "step": 28652 + }, + { + "epoch": 0.37233266682151833, + "grad_norm": 0.568498432636261, + "learning_rate": 0.00012555920924368655, + "loss": 1.4497, + "step": 28653 + }, + { + "epoch": 0.3723456613654342, + "grad_norm": 0.5096614956855774, + "learning_rate": 0.00012555660978177518, + "loss": 1.3966, + "step": 28654 + }, + { + "epoch": 0.3723586559093501, + "grad_norm": 0.45051145553588867, + "learning_rate": 0.0001255540103198638, + "loss": 1.156, + "step": 28655 + }, + { + "epoch": 0.3723716504532659, + "grad_norm": 0.3604600429534912, + "learning_rate": 0.0001255514108579524, + "loss": 1.4989, + "step": 28656 + }, + { + "epoch": 0.3723846449971818, + "grad_norm": 0.3429659307003021, + "learning_rate": 0.00012554881139604103, + "loss": 1.2856, + "step": 28657 + }, + { + "epoch": 0.37239763954109767, + "grad_norm": 0.38033246994018555, + "learning_rate": 0.00012554621193412962, + "loss": 1.4322, + "step": 28658 + }, + { + "epoch": 0.37241063408501357, + "grad_norm": 0.4652290940284729, + "learning_rate": 0.00012554361247221827, + "loss": 1.3399, + "step": 28659 + }, + { + "epoch": 0.3724236286289294, + "grad_norm": 0.4360791742801666, + "learning_rate": 0.00012554101301030687, + "loss": 1.406, + "step": 28660 + }, + { + "epoch": 0.3724366231728453, + "grad_norm": 0.3296932280063629, + "learning_rate": 0.0001255384135483955, + "loss": 1.3443, + "step": 28661 + }, + { + "epoch": 0.37244961771676116, + "grad_norm": 0.39879682660102844, + "learning_rate": 0.0001255358140864841, + "loss": 1.4026, + "step": 28662 + }, + { + "epoch": 0.37246261226067706, + "grad_norm": 0.2461155205965042, + "learning_rate": 0.00012553321462457272, + "loss": 1.168, + "step": 28663 + }, + { + "epoch": 0.3724756068045929, + "grad_norm": 0.3588959872722626, + "learning_rate": 0.00012553061516266134, + "loss": 1.2884, + "step": 28664 + }, + { + "epoch": 0.3724886013485088, + "grad_norm": 0.5581744313240051, + "learning_rate": 0.00012552801570074994, + "loss": 1.3158, + "step": 28665 + }, + { + "epoch": 0.37250159589242465, + "grad_norm": 0.4687407910823822, + "learning_rate": 0.00012552541623883856, + "loss": 1.5517, + "step": 28666 + }, + { + "epoch": 0.37251459043634055, + "grad_norm": 0.27124786376953125, + "learning_rate": 0.0001255228167769272, + "loss": 1.3155, + "step": 28667 + }, + { + "epoch": 0.3725275849802564, + "grad_norm": 0.3334617614746094, + "learning_rate": 0.00012552021731501579, + "loss": 1.4242, + "step": 28668 + }, + { + "epoch": 0.3725405795241723, + "grad_norm": 0.4493698477745056, + "learning_rate": 0.0001255176178531044, + "loss": 1.372, + "step": 28669 + }, + { + "epoch": 0.37255357406808814, + "grad_norm": 0.4473382234573364, + "learning_rate": 0.00012551501839119304, + "loss": 1.4243, + "step": 28670 + }, + { + "epoch": 0.37256656861200405, + "grad_norm": 0.3975907564163208, + "learning_rate": 0.00012551241892928166, + "loss": 1.5818, + "step": 28671 + }, + { + "epoch": 0.3725795631559199, + "grad_norm": 0.31713929772377014, + "learning_rate": 0.00012550981946737026, + "loss": 1.2746, + "step": 28672 + }, + { + "epoch": 0.3725925576998358, + "grad_norm": 0.4670765995979309, + "learning_rate": 0.00012550722000545888, + "loss": 1.4332, + "step": 28673 + }, + { + "epoch": 0.37260555224375164, + "grad_norm": 0.42125341296195984, + "learning_rate": 0.0001255046205435475, + "loss": 1.491, + "step": 28674 + }, + { + "epoch": 0.37261854678766754, + "grad_norm": 0.36299535632133484, + "learning_rate": 0.0001255020210816361, + "loss": 1.3692, + "step": 28675 + }, + { + "epoch": 0.3726315413315834, + "grad_norm": 0.43252214789390564, + "learning_rate": 0.00012549942161972473, + "loss": 1.2532, + "step": 28676 + }, + { + "epoch": 0.3726445358754993, + "grad_norm": 0.4870414435863495, + "learning_rate": 0.00012549682215781333, + "loss": 1.3925, + "step": 28677 + }, + { + "epoch": 0.37265753041941513, + "grad_norm": 0.32571282982826233, + "learning_rate": 0.00012549422269590198, + "loss": 1.2693, + "step": 28678 + }, + { + "epoch": 0.37267052496333103, + "grad_norm": 0.39593273401260376, + "learning_rate": 0.00012549162323399057, + "loss": 1.2947, + "step": 28679 + }, + { + "epoch": 0.3726835195072469, + "grad_norm": 0.37479060888290405, + "learning_rate": 0.00012548902377207917, + "loss": 1.1472, + "step": 28680 + }, + { + "epoch": 0.3726965140511628, + "grad_norm": 0.41043952107429504, + "learning_rate": 0.0001254864243101678, + "loss": 1.3782, + "step": 28681 + }, + { + "epoch": 0.3727095085950786, + "grad_norm": 0.39029228687286377, + "learning_rate": 0.00012548382484825642, + "loss": 1.2535, + "step": 28682 + }, + { + "epoch": 0.3727225031389945, + "grad_norm": 0.4228711426258087, + "learning_rate": 0.00012548122538634504, + "loss": 1.2723, + "step": 28683 + }, + { + "epoch": 0.37273549768291037, + "grad_norm": 0.3423106372356415, + "learning_rate": 0.00012547862592443364, + "loss": 1.4927, + "step": 28684 + }, + { + "epoch": 0.37274849222682627, + "grad_norm": 0.36802372336387634, + "learning_rate": 0.00012547602646252227, + "loss": 1.4354, + "step": 28685 + }, + { + "epoch": 0.3727614867707421, + "grad_norm": 0.42316174507141113, + "learning_rate": 0.0001254734270006109, + "loss": 1.5389, + "step": 28686 + }, + { + "epoch": 0.372774481314658, + "grad_norm": 0.39688268303871155, + "learning_rate": 0.0001254708275386995, + "loss": 1.4057, + "step": 28687 + }, + { + "epoch": 0.37278747585857386, + "grad_norm": 0.37469565868377686, + "learning_rate": 0.0001254682280767881, + "loss": 1.5899, + "step": 28688 + }, + { + "epoch": 0.37280047040248976, + "grad_norm": 0.3846202492713928, + "learning_rate": 0.0001254656286148767, + "loss": 1.3641, + "step": 28689 + }, + { + "epoch": 0.3728134649464056, + "grad_norm": 0.3596822917461395, + "learning_rate": 0.00012546302915296536, + "loss": 1.4972, + "step": 28690 + }, + { + "epoch": 0.3728264594903215, + "grad_norm": 0.473092257976532, + "learning_rate": 0.00012546042969105396, + "loss": 1.3877, + "step": 28691 + }, + { + "epoch": 0.37283945403423735, + "grad_norm": 0.41913488507270813, + "learning_rate": 0.00012545783022914256, + "loss": 1.3067, + "step": 28692 + }, + { + "epoch": 0.37285244857815325, + "grad_norm": 0.42669397592544556, + "learning_rate": 0.00012545523076723118, + "loss": 1.3986, + "step": 28693 + }, + { + "epoch": 0.3728654431220691, + "grad_norm": 0.31775355339050293, + "learning_rate": 0.0001254526313053198, + "loss": 1.5008, + "step": 28694 + }, + { + "epoch": 0.372878437665985, + "grad_norm": 0.43441590666770935, + "learning_rate": 0.00012545003184340843, + "loss": 1.3251, + "step": 28695 + }, + { + "epoch": 0.37289143220990084, + "grad_norm": 0.34615200757980347, + "learning_rate": 0.00012544743238149703, + "loss": 1.4758, + "step": 28696 + }, + { + "epoch": 0.37290442675381674, + "grad_norm": 0.37057366967201233, + "learning_rate": 0.00012544483291958565, + "loss": 1.4136, + "step": 28697 + }, + { + "epoch": 0.3729174212977326, + "grad_norm": 0.3675193786621094, + "learning_rate": 0.00012544223345767428, + "loss": 1.3833, + "step": 28698 + }, + { + "epoch": 0.3729304158416485, + "grad_norm": 0.40878060460090637, + "learning_rate": 0.00012543963399576287, + "loss": 1.3567, + "step": 28699 + }, + { + "epoch": 0.37294341038556433, + "grad_norm": 0.36005470156669617, + "learning_rate": 0.0001254370345338515, + "loss": 1.4398, + "step": 28700 + }, + { + "epoch": 0.37295640492948023, + "grad_norm": 0.3990764617919922, + "learning_rate": 0.0001254344350719401, + "loss": 1.4644, + "step": 28701 + }, + { + "epoch": 0.37296939947339613, + "grad_norm": 0.42862221598625183, + "learning_rate": 0.00012543183561002875, + "loss": 1.4198, + "step": 28702 + }, + { + "epoch": 0.372982394017312, + "grad_norm": 0.4164692759513855, + "learning_rate": 0.00012542923614811734, + "loss": 1.2998, + "step": 28703 + }, + { + "epoch": 0.3729953885612279, + "grad_norm": 0.43596258759498596, + "learning_rate": 0.00012542663668620597, + "loss": 1.299, + "step": 28704 + }, + { + "epoch": 0.3730083831051437, + "grad_norm": 0.5348424315452576, + "learning_rate": 0.00012542403722429457, + "loss": 1.4191, + "step": 28705 + }, + { + "epoch": 0.3730213776490596, + "grad_norm": 0.39547592401504517, + "learning_rate": 0.0001254214377623832, + "loss": 1.5583, + "step": 28706 + }, + { + "epoch": 0.37303437219297547, + "grad_norm": 0.4528743624687195, + "learning_rate": 0.00012541883830047182, + "loss": 1.3959, + "step": 28707 + }, + { + "epoch": 0.3730473667368914, + "grad_norm": 0.3225213587284088, + "learning_rate": 0.0001254162388385604, + "loss": 1.1863, + "step": 28708 + }, + { + "epoch": 0.3730603612808072, + "grad_norm": 0.33787041902542114, + "learning_rate": 0.00012541363937664904, + "loss": 1.5928, + "step": 28709 + }, + { + "epoch": 0.3730733558247231, + "grad_norm": 0.4740648567676544, + "learning_rate": 0.00012541103991473766, + "loss": 1.6671, + "step": 28710 + }, + { + "epoch": 0.37308635036863896, + "grad_norm": 0.3897722363471985, + "learning_rate": 0.00012540844045282626, + "loss": 1.4861, + "step": 28711 + }, + { + "epoch": 0.37309934491255486, + "grad_norm": 0.4286596477031708, + "learning_rate": 0.00012540584099091488, + "loss": 1.2548, + "step": 28712 + }, + { + "epoch": 0.3731123394564707, + "grad_norm": 0.41074612736701965, + "learning_rate": 0.0001254032415290035, + "loss": 1.565, + "step": 28713 + }, + { + "epoch": 0.3731253340003866, + "grad_norm": 0.33220306038856506, + "learning_rate": 0.00012540064206709213, + "loss": 1.3268, + "step": 28714 + }, + { + "epoch": 0.37313832854430246, + "grad_norm": 0.4868376553058624, + "learning_rate": 0.00012539804260518073, + "loss": 1.3465, + "step": 28715 + }, + { + "epoch": 0.37315132308821836, + "grad_norm": 0.31483379006385803, + "learning_rate": 0.00012539544314326935, + "loss": 1.4502, + "step": 28716 + }, + { + "epoch": 0.3731643176321342, + "grad_norm": 0.37703099846839905, + "learning_rate": 0.00012539284368135798, + "loss": 1.3666, + "step": 28717 + }, + { + "epoch": 0.3731773121760501, + "grad_norm": 0.3846674859523773, + "learning_rate": 0.00012539024421944658, + "loss": 1.4064, + "step": 28718 + }, + { + "epoch": 0.37319030671996595, + "grad_norm": 0.3678659200668335, + "learning_rate": 0.0001253876447575352, + "loss": 1.323, + "step": 28719 + }, + { + "epoch": 0.37320330126388185, + "grad_norm": 0.3345188498497009, + "learning_rate": 0.0001253850452956238, + "loss": 1.3551, + "step": 28720 + }, + { + "epoch": 0.3732162958077977, + "grad_norm": 0.48107364773750305, + "learning_rate": 0.00012538244583371242, + "loss": 1.3925, + "step": 28721 + }, + { + "epoch": 0.3732292903517136, + "grad_norm": 0.4644206166267395, + "learning_rate": 0.00012537984637180105, + "loss": 1.4438, + "step": 28722 + }, + { + "epoch": 0.37324228489562944, + "grad_norm": 0.30679380893707275, + "learning_rate": 0.00012537724690988964, + "loss": 1.3272, + "step": 28723 + }, + { + "epoch": 0.37325527943954534, + "grad_norm": 0.40435296297073364, + "learning_rate": 0.00012537464744797827, + "loss": 1.3818, + "step": 28724 + }, + { + "epoch": 0.3732682739834612, + "grad_norm": 0.4277142286300659, + "learning_rate": 0.0001253720479860669, + "loss": 1.3127, + "step": 28725 + }, + { + "epoch": 0.3732812685273771, + "grad_norm": 0.4050789475440979, + "learning_rate": 0.00012536944852415552, + "loss": 1.4671, + "step": 28726 + }, + { + "epoch": 0.37329426307129293, + "grad_norm": 0.2995666265487671, + "learning_rate": 0.00012536684906224412, + "loss": 1.2414, + "step": 28727 + }, + { + "epoch": 0.37330725761520883, + "grad_norm": 0.3702373802661896, + "learning_rate": 0.00012536424960033274, + "loss": 1.4577, + "step": 28728 + }, + { + "epoch": 0.3733202521591247, + "grad_norm": 0.4661139249801636, + "learning_rate": 0.00012536165013842136, + "loss": 1.4977, + "step": 28729 + }, + { + "epoch": 0.3733332467030406, + "grad_norm": 0.3852730989456177, + "learning_rate": 0.00012535905067650996, + "loss": 1.4309, + "step": 28730 + }, + { + "epoch": 0.3733462412469564, + "grad_norm": 0.35303887724876404, + "learning_rate": 0.0001253564512145986, + "loss": 1.3882, + "step": 28731 + }, + { + "epoch": 0.3733592357908723, + "grad_norm": 0.41061705350875854, + "learning_rate": 0.00012535385175268718, + "loss": 1.4584, + "step": 28732 + }, + { + "epoch": 0.37337223033478817, + "grad_norm": 0.33713802695274353, + "learning_rate": 0.00012535125229077584, + "loss": 1.2744, + "step": 28733 + }, + { + "epoch": 0.37338522487870407, + "grad_norm": 0.4064413905143738, + "learning_rate": 0.00012534865282886443, + "loss": 1.4572, + "step": 28734 + }, + { + "epoch": 0.3733982194226199, + "grad_norm": 0.33813905715942383, + "learning_rate": 0.00012534605336695303, + "loss": 1.3166, + "step": 28735 + }, + { + "epoch": 0.3734112139665358, + "grad_norm": 0.41231778264045715, + "learning_rate": 0.00012534345390504165, + "loss": 1.3165, + "step": 28736 + }, + { + "epoch": 0.37342420851045166, + "grad_norm": 0.4169091284275055, + "learning_rate": 0.00012534085444313028, + "loss": 1.3911, + "step": 28737 + }, + { + "epoch": 0.37343720305436756, + "grad_norm": 0.38153883814811707, + "learning_rate": 0.0001253382549812189, + "loss": 1.2459, + "step": 28738 + }, + { + "epoch": 0.3734501975982834, + "grad_norm": 0.24761615693569183, + "learning_rate": 0.0001253356555193075, + "loss": 1.3432, + "step": 28739 + }, + { + "epoch": 0.3734631921421993, + "grad_norm": 0.4171951115131378, + "learning_rate": 0.00012533305605739613, + "loss": 1.2287, + "step": 28740 + }, + { + "epoch": 0.37347618668611515, + "grad_norm": 0.36548227071762085, + "learning_rate": 0.00012533045659548475, + "loss": 1.6066, + "step": 28741 + }, + { + "epoch": 0.37348918123003105, + "grad_norm": 0.4641980230808258, + "learning_rate": 0.00012532785713357335, + "loss": 1.5063, + "step": 28742 + }, + { + "epoch": 0.3735021757739469, + "grad_norm": 0.47401878237724304, + "learning_rate": 0.00012532525767166197, + "loss": 1.4163, + "step": 28743 + }, + { + "epoch": 0.3735151703178628, + "grad_norm": 0.3747689127922058, + "learning_rate": 0.0001253226582097506, + "loss": 1.4072, + "step": 28744 + }, + { + "epoch": 0.37352816486177864, + "grad_norm": 0.5015178918838501, + "learning_rate": 0.00012532005874783922, + "loss": 1.4538, + "step": 28745 + }, + { + "epoch": 0.37354115940569455, + "grad_norm": 0.4278414845466614, + "learning_rate": 0.00012531745928592782, + "loss": 1.2608, + "step": 28746 + }, + { + "epoch": 0.3735541539496104, + "grad_norm": 0.46396157145500183, + "learning_rate": 0.00012531485982401642, + "loss": 1.4209, + "step": 28747 + }, + { + "epoch": 0.3735671484935263, + "grad_norm": 0.32740846276283264, + "learning_rate": 0.00012531226036210507, + "loss": 1.3874, + "step": 28748 + }, + { + "epoch": 0.37358014303744214, + "grad_norm": 0.3755617141723633, + "learning_rate": 0.00012530966090019366, + "loss": 1.3691, + "step": 28749 + }, + { + "epoch": 0.37359313758135804, + "grad_norm": 0.3294253945350647, + "learning_rate": 0.0001253070614382823, + "loss": 1.2616, + "step": 28750 + }, + { + "epoch": 0.3736061321252739, + "grad_norm": 0.32209479808807373, + "learning_rate": 0.0001253044619763709, + "loss": 1.352, + "step": 28751 + }, + { + "epoch": 0.3736191266691898, + "grad_norm": 0.34468215703964233, + "learning_rate": 0.0001253018625144595, + "loss": 1.2463, + "step": 28752 + }, + { + "epoch": 0.37363212121310563, + "grad_norm": 0.4348553717136383, + "learning_rate": 0.00012529926305254814, + "loss": 1.3622, + "step": 28753 + }, + { + "epoch": 0.37364511575702153, + "grad_norm": 0.4141204059123993, + "learning_rate": 0.00012529666359063673, + "loss": 1.3976, + "step": 28754 + }, + { + "epoch": 0.3736581103009374, + "grad_norm": 0.4958474636077881, + "learning_rate": 0.00012529406412872536, + "loss": 1.4994, + "step": 28755 + }, + { + "epoch": 0.3736711048448533, + "grad_norm": 0.404459148645401, + "learning_rate": 0.00012529146466681398, + "loss": 1.5054, + "step": 28756 + }, + { + "epoch": 0.3736840993887691, + "grad_norm": 0.4204579293727875, + "learning_rate": 0.0001252888652049026, + "loss": 1.2679, + "step": 28757 + }, + { + "epoch": 0.373697093932685, + "grad_norm": 0.33855143189430237, + "learning_rate": 0.0001252862657429912, + "loss": 1.0715, + "step": 28758 + }, + { + "epoch": 0.37371008847660087, + "grad_norm": 0.3690027892589569, + "learning_rate": 0.0001252836662810798, + "loss": 1.2481, + "step": 28759 + }, + { + "epoch": 0.37372308302051677, + "grad_norm": 0.36489149928092957, + "learning_rate": 0.00012528106681916845, + "loss": 1.3135, + "step": 28760 + }, + { + "epoch": 0.3737360775644326, + "grad_norm": 0.4587075114250183, + "learning_rate": 0.00012527846735725705, + "loss": 1.4474, + "step": 28761 + }, + { + "epoch": 0.3737490721083485, + "grad_norm": 0.4402141571044922, + "learning_rate": 0.00012527586789534567, + "loss": 1.5649, + "step": 28762 + }, + { + "epoch": 0.37376206665226436, + "grad_norm": 0.4330047369003296, + "learning_rate": 0.00012527326843343427, + "loss": 1.3132, + "step": 28763 + }, + { + "epoch": 0.37377506119618026, + "grad_norm": 0.3839380443096161, + "learning_rate": 0.0001252706689715229, + "loss": 1.3463, + "step": 28764 + }, + { + "epoch": 0.3737880557400961, + "grad_norm": 0.3916539251804352, + "learning_rate": 0.00012526806950961152, + "loss": 1.4293, + "step": 28765 + }, + { + "epoch": 0.373801050284012, + "grad_norm": 0.2990649938583374, + "learning_rate": 0.00012526547004770012, + "loss": 1.3049, + "step": 28766 + }, + { + "epoch": 0.37381404482792785, + "grad_norm": 0.37046363949775696, + "learning_rate": 0.00012526287058578874, + "loss": 1.3364, + "step": 28767 + }, + { + "epoch": 0.37382703937184375, + "grad_norm": 0.4471253454685211, + "learning_rate": 0.00012526027112387737, + "loss": 1.4361, + "step": 28768 + }, + { + "epoch": 0.3738400339157596, + "grad_norm": 0.41291218996047974, + "learning_rate": 0.000125257671661966, + "loss": 1.3924, + "step": 28769 + }, + { + "epoch": 0.3738530284596755, + "grad_norm": 0.3176535665988922, + "learning_rate": 0.0001252550722000546, + "loss": 1.267, + "step": 28770 + }, + { + "epoch": 0.37386602300359134, + "grad_norm": 0.5047698020935059, + "learning_rate": 0.0001252524727381432, + "loss": 1.5015, + "step": 28771 + }, + { + "epoch": 0.37387901754750724, + "grad_norm": 0.46696439385414124, + "learning_rate": 0.00012524987327623184, + "loss": 1.4474, + "step": 28772 + }, + { + "epoch": 0.3738920120914231, + "grad_norm": 0.4528016149997711, + "learning_rate": 0.00012524727381432044, + "loss": 1.5101, + "step": 28773 + }, + { + "epoch": 0.373905006635339, + "grad_norm": 0.41030579805374146, + "learning_rate": 0.00012524467435240906, + "loss": 1.2877, + "step": 28774 + }, + { + "epoch": 0.37391800117925483, + "grad_norm": 0.4559067189693451, + "learning_rate": 0.00012524207489049766, + "loss": 1.4248, + "step": 28775 + }, + { + "epoch": 0.37393099572317073, + "grad_norm": 0.4302278459072113, + "learning_rate": 0.00012523947542858628, + "loss": 1.4347, + "step": 28776 + }, + { + "epoch": 0.37394399026708663, + "grad_norm": 0.41090983152389526, + "learning_rate": 0.0001252368759666749, + "loss": 1.3745, + "step": 28777 + }, + { + "epoch": 0.3739569848110025, + "grad_norm": 0.35330748558044434, + "learning_rate": 0.0001252342765047635, + "loss": 1.3543, + "step": 28778 + }, + { + "epoch": 0.3739699793549184, + "grad_norm": 0.6061524748802185, + "learning_rate": 0.00012523167704285213, + "loss": 1.4235, + "step": 28779 + }, + { + "epoch": 0.3739829738988342, + "grad_norm": 0.3606860637664795, + "learning_rate": 0.00012522907758094075, + "loss": 1.3685, + "step": 28780 + }, + { + "epoch": 0.3739959684427501, + "grad_norm": 0.4220470190048218, + "learning_rate": 0.00012522647811902938, + "loss": 1.2141, + "step": 28781 + }, + { + "epoch": 0.37400896298666597, + "grad_norm": 0.44551536440849304, + "learning_rate": 0.00012522387865711797, + "loss": 1.4827, + "step": 28782 + }, + { + "epoch": 0.37402195753058187, + "grad_norm": 0.36242103576660156, + "learning_rate": 0.0001252212791952066, + "loss": 1.3, + "step": 28783 + }, + { + "epoch": 0.3740349520744977, + "grad_norm": 0.3974902033805847, + "learning_rate": 0.00012521867973329522, + "loss": 1.4835, + "step": 28784 + }, + { + "epoch": 0.3740479466184136, + "grad_norm": 0.4516390562057495, + "learning_rate": 0.00012521608027138382, + "loss": 1.3384, + "step": 28785 + }, + { + "epoch": 0.37406094116232946, + "grad_norm": 0.31487998366355896, + "learning_rate": 0.00012521348080947245, + "loss": 1.2772, + "step": 28786 + }, + { + "epoch": 0.37407393570624536, + "grad_norm": 0.3444060981273651, + "learning_rate": 0.00012521088134756107, + "loss": 1.3413, + "step": 28787 + }, + { + "epoch": 0.3740869302501612, + "grad_norm": 0.34494084119796753, + "learning_rate": 0.0001252082818856497, + "loss": 1.5097, + "step": 28788 + }, + { + "epoch": 0.3740999247940771, + "grad_norm": 0.3277198374271393, + "learning_rate": 0.0001252056824237383, + "loss": 1.3112, + "step": 28789 + }, + { + "epoch": 0.37411291933799296, + "grad_norm": 0.4672078788280487, + "learning_rate": 0.0001252030829618269, + "loss": 1.3987, + "step": 28790 + }, + { + "epoch": 0.37412591388190886, + "grad_norm": 0.44889727234840393, + "learning_rate": 0.00012520048349991554, + "loss": 1.3671, + "step": 28791 + }, + { + "epoch": 0.3741389084258247, + "grad_norm": 0.29564836621284485, + "learning_rate": 0.00012519788403800414, + "loss": 1.3785, + "step": 28792 + }, + { + "epoch": 0.3741519029697406, + "grad_norm": 0.5507976412773132, + "learning_rate": 0.00012519528457609276, + "loss": 1.2517, + "step": 28793 + }, + { + "epoch": 0.37416489751365645, + "grad_norm": 0.412579745054245, + "learning_rate": 0.00012519268511418136, + "loss": 1.3978, + "step": 28794 + }, + { + "epoch": 0.37417789205757235, + "grad_norm": 0.3830762803554535, + "learning_rate": 0.00012519008565226998, + "loss": 1.2006, + "step": 28795 + }, + { + "epoch": 0.3741908866014882, + "grad_norm": 0.28367650508880615, + "learning_rate": 0.0001251874861903586, + "loss": 1.1969, + "step": 28796 + }, + { + "epoch": 0.3742038811454041, + "grad_norm": 0.4791542887687683, + "learning_rate": 0.0001251848867284472, + "loss": 1.4949, + "step": 28797 + }, + { + "epoch": 0.37421687568931994, + "grad_norm": 0.3939990699291229, + "learning_rate": 0.00012518228726653583, + "loss": 1.6424, + "step": 28798 + }, + { + "epoch": 0.37422987023323584, + "grad_norm": 0.34524670243263245, + "learning_rate": 0.00012517968780462446, + "loss": 1.341, + "step": 28799 + }, + { + "epoch": 0.3742428647771517, + "grad_norm": 0.37180066108703613, + "learning_rate": 0.00012517708834271308, + "loss": 1.3833, + "step": 28800 + }, + { + "epoch": 0.3742558593210676, + "grad_norm": 0.4311344623565674, + "learning_rate": 0.00012517448888080168, + "loss": 1.4499, + "step": 28801 + }, + { + "epoch": 0.37426885386498343, + "grad_norm": 0.32370734214782715, + "learning_rate": 0.00012517188941889027, + "loss": 1.3205, + "step": 28802 + }, + { + "epoch": 0.37428184840889933, + "grad_norm": 0.3162273168563843, + "learning_rate": 0.00012516928995697893, + "loss": 1.3224, + "step": 28803 + }, + { + "epoch": 0.3742948429528152, + "grad_norm": 0.3170686960220337, + "learning_rate": 0.00012516669049506752, + "loss": 1.2121, + "step": 28804 + }, + { + "epoch": 0.3743078374967311, + "grad_norm": 0.37588006258010864, + "learning_rate": 0.00012516409103315615, + "loss": 1.401, + "step": 28805 + }, + { + "epoch": 0.3743208320406469, + "grad_norm": 0.5735766291618347, + "learning_rate": 0.00012516149157124475, + "loss": 1.4528, + "step": 28806 + }, + { + "epoch": 0.3743338265845628, + "grad_norm": 0.40847253799438477, + "learning_rate": 0.00012515889210933337, + "loss": 1.2933, + "step": 28807 + }, + { + "epoch": 0.37434682112847867, + "grad_norm": 0.4204402267932892, + "learning_rate": 0.000125156292647422, + "loss": 1.4996, + "step": 28808 + }, + { + "epoch": 0.37435981567239457, + "grad_norm": 0.3809019923210144, + "learning_rate": 0.0001251536931855106, + "loss": 1.449, + "step": 28809 + }, + { + "epoch": 0.3743728102163104, + "grad_norm": 0.40501073002815247, + "learning_rate": 0.00012515109372359922, + "loss": 1.3807, + "step": 28810 + }, + { + "epoch": 0.3743858047602263, + "grad_norm": 0.45273464918136597, + "learning_rate": 0.00012514849426168784, + "loss": 1.3613, + "step": 28811 + }, + { + "epoch": 0.37439879930414216, + "grad_norm": 0.4391747713088989, + "learning_rate": 0.00012514589479977647, + "loss": 1.4603, + "step": 28812 + }, + { + "epoch": 0.37441179384805806, + "grad_norm": 0.3503970503807068, + "learning_rate": 0.00012514329533786506, + "loss": 1.2576, + "step": 28813 + }, + { + "epoch": 0.3744247883919739, + "grad_norm": 0.25196388363838196, + "learning_rate": 0.00012514069587595366, + "loss": 1.4823, + "step": 28814 + }, + { + "epoch": 0.3744377829358898, + "grad_norm": 0.5326829552650452, + "learning_rate": 0.0001251380964140423, + "loss": 1.4918, + "step": 28815 + }, + { + "epoch": 0.37445077747980565, + "grad_norm": 0.47540566325187683, + "learning_rate": 0.0001251354969521309, + "loss": 1.422, + "step": 28816 + }, + { + "epoch": 0.37446377202372155, + "grad_norm": 0.4210006594657898, + "learning_rate": 0.00012513289749021953, + "loss": 1.5774, + "step": 28817 + }, + { + "epoch": 0.3744767665676374, + "grad_norm": 0.42402294278144836, + "learning_rate": 0.00012513029802830816, + "loss": 1.416, + "step": 28818 + }, + { + "epoch": 0.3744897611115533, + "grad_norm": 0.3172576129436493, + "learning_rate": 0.00012512769856639676, + "loss": 1.5137, + "step": 28819 + }, + { + "epoch": 0.37450275565546914, + "grad_norm": 0.4440973401069641, + "learning_rate": 0.00012512509910448538, + "loss": 1.3478, + "step": 28820 + }, + { + "epoch": 0.37451575019938504, + "grad_norm": 0.4081081748008728, + "learning_rate": 0.00012512249964257398, + "loss": 1.3932, + "step": 28821 + }, + { + "epoch": 0.3745287447433009, + "grad_norm": 0.4473981261253357, + "learning_rate": 0.00012511990018066263, + "loss": 1.3888, + "step": 28822 + }, + { + "epoch": 0.3745417392872168, + "grad_norm": 0.38845670223236084, + "learning_rate": 0.00012511730071875123, + "loss": 1.2379, + "step": 28823 + }, + { + "epoch": 0.37455473383113264, + "grad_norm": 0.45279181003570557, + "learning_rate": 0.00012511470125683985, + "loss": 1.4901, + "step": 28824 + }, + { + "epoch": 0.37456772837504854, + "grad_norm": 0.30662423372268677, + "learning_rate": 0.00012511210179492845, + "loss": 1.1286, + "step": 28825 + }, + { + "epoch": 0.3745807229189644, + "grad_norm": 0.4232569634914398, + "learning_rate": 0.00012510950233301707, + "loss": 1.2701, + "step": 28826 + }, + { + "epoch": 0.3745937174628803, + "grad_norm": 0.3922737240791321, + "learning_rate": 0.0001251069028711057, + "loss": 1.3488, + "step": 28827 + }, + { + "epoch": 0.3746067120067961, + "grad_norm": 0.34042033553123474, + "learning_rate": 0.0001251043034091943, + "loss": 1.4408, + "step": 28828 + }, + { + "epoch": 0.37461970655071203, + "grad_norm": 0.321860671043396, + "learning_rate": 0.00012510170394728292, + "loss": 1.332, + "step": 28829 + }, + { + "epoch": 0.3746327010946279, + "grad_norm": 0.38447946310043335, + "learning_rate": 0.00012509910448537154, + "loss": 1.4279, + "step": 28830 + }, + { + "epoch": 0.3746456956385438, + "grad_norm": 0.36960867047309875, + "learning_rate": 0.00012509650502346014, + "loss": 1.3425, + "step": 28831 + }, + { + "epoch": 0.3746586901824596, + "grad_norm": 0.4848098158836365, + "learning_rate": 0.00012509390556154876, + "loss": 1.3095, + "step": 28832 + }, + { + "epoch": 0.3746716847263755, + "grad_norm": 0.45199307799339294, + "learning_rate": 0.00012509130609963736, + "loss": 1.6055, + "step": 28833 + }, + { + "epoch": 0.37468467927029137, + "grad_norm": 0.3760932385921478, + "learning_rate": 0.00012508870663772601, + "loss": 1.3039, + "step": 28834 + }, + { + "epoch": 0.37469767381420727, + "grad_norm": 0.4585811495780945, + "learning_rate": 0.0001250861071758146, + "loss": 1.5426, + "step": 28835 + }, + { + "epoch": 0.3747106683581231, + "grad_norm": 0.39848774671554565, + "learning_rate": 0.00012508350771390324, + "loss": 1.3304, + "step": 28836 + }, + { + "epoch": 0.374723662902039, + "grad_norm": 0.47406378388404846, + "learning_rate": 0.00012508090825199183, + "loss": 1.4261, + "step": 28837 + }, + { + "epoch": 0.37473665744595486, + "grad_norm": 0.24699153006076813, + "learning_rate": 0.00012507830879008046, + "loss": 1.1634, + "step": 28838 + }, + { + "epoch": 0.37474965198987076, + "grad_norm": 0.3858836889266968, + "learning_rate": 0.00012507570932816908, + "loss": 1.4059, + "step": 28839 + }, + { + "epoch": 0.3747626465337866, + "grad_norm": 0.4618825614452362, + "learning_rate": 0.00012507310986625768, + "loss": 1.3787, + "step": 28840 + }, + { + "epoch": 0.3747756410777025, + "grad_norm": 0.38702648878097534, + "learning_rate": 0.0001250705104043463, + "loss": 1.4838, + "step": 28841 + }, + { + "epoch": 0.37478863562161835, + "grad_norm": 0.422344446182251, + "learning_rate": 0.00012506791094243493, + "loss": 1.5091, + "step": 28842 + }, + { + "epoch": 0.37480163016553425, + "grad_norm": 0.529836893081665, + "learning_rate": 0.00012506531148052353, + "loss": 1.3878, + "step": 28843 + }, + { + "epoch": 0.3748146247094501, + "grad_norm": 0.31421998143196106, + "learning_rate": 0.00012506271201861215, + "loss": 1.4457, + "step": 28844 + }, + { + "epoch": 0.374827619253366, + "grad_norm": 0.4187467694282532, + "learning_rate": 0.00012506011255670075, + "loss": 1.5248, + "step": 28845 + }, + { + "epoch": 0.37484061379728184, + "grad_norm": 0.4240720868110657, + "learning_rate": 0.0001250575130947894, + "loss": 1.381, + "step": 28846 + }, + { + "epoch": 0.37485360834119774, + "grad_norm": 0.46227872371673584, + "learning_rate": 0.000125054913632878, + "loss": 1.5045, + "step": 28847 + }, + { + "epoch": 0.3748666028851136, + "grad_norm": 0.44956791400909424, + "learning_rate": 0.00012505231417096662, + "loss": 1.3974, + "step": 28848 + }, + { + "epoch": 0.3748795974290295, + "grad_norm": 0.3727710247039795, + "learning_rate": 0.00012504971470905522, + "loss": 1.3807, + "step": 28849 + }, + { + "epoch": 0.37489259197294533, + "grad_norm": 0.3538811504840851, + "learning_rate": 0.00012504711524714384, + "loss": 1.3123, + "step": 28850 + }, + { + "epoch": 0.37490558651686123, + "grad_norm": 0.3979772925376892, + "learning_rate": 0.00012504451578523247, + "loss": 1.4317, + "step": 28851 + }, + { + "epoch": 0.3749185810607771, + "grad_norm": 0.34732866287231445, + "learning_rate": 0.00012504191632332106, + "loss": 1.5715, + "step": 28852 + }, + { + "epoch": 0.374931575604693, + "grad_norm": 0.4288010895252228, + "learning_rate": 0.0001250393168614097, + "loss": 1.4755, + "step": 28853 + }, + { + "epoch": 0.3749445701486089, + "grad_norm": 0.3757539391517639, + "learning_rate": 0.00012503671739949831, + "loss": 1.466, + "step": 28854 + }, + { + "epoch": 0.3749575646925247, + "grad_norm": 0.46227288246154785, + "learning_rate": 0.00012503411793758694, + "loss": 1.3452, + "step": 28855 + }, + { + "epoch": 0.3749705592364406, + "grad_norm": 0.25591906905174255, + "learning_rate": 0.00012503151847567554, + "loss": 1.1903, + "step": 28856 + }, + { + "epoch": 0.37498355378035647, + "grad_norm": 0.4343232214450836, + "learning_rate": 0.00012502891901376416, + "loss": 1.4139, + "step": 28857 + }, + { + "epoch": 0.37499654832427237, + "grad_norm": 0.43891939520835876, + "learning_rate": 0.00012502631955185278, + "loss": 1.3417, + "step": 28858 + }, + { + "epoch": 0.3750095428681882, + "grad_norm": 0.37385502457618713, + "learning_rate": 0.00012502372008994138, + "loss": 1.3168, + "step": 28859 + }, + { + "epoch": 0.3750225374121041, + "grad_norm": 0.3902006447315216, + "learning_rate": 0.00012502112062803, + "loss": 1.5619, + "step": 28860 + }, + { + "epoch": 0.37503553195601996, + "grad_norm": 0.38616517186164856, + "learning_rate": 0.00012501852116611863, + "loss": 1.5015, + "step": 28861 + }, + { + "epoch": 0.37504852649993586, + "grad_norm": 0.24514099955558777, + "learning_rate": 0.00012501592170420723, + "loss": 1.3169, + "step": 28862 + }, + { + "epoch": 0.3750615210438517, + "grad_norm": 0.40264204144477844, + "learning_rate": 0.00012501332224229585, + "loss": 1.4906, + "step": 28863 + }, + { + "epoch": 0.3750745155877676, + "grad_norm": 0.36614781618118286, + "learning_rate": 0.00012501072278038445, + "loss": 1.3317, + "step": 28864 + }, + { + "epoch": 0.37508751013168345, + "grad_norm": 0.39818108081817627, + "learning_rate": 0.0001250081233184731, + "loss": 1.4702, + "step": 28865 + }, + { + "epoch": 0.37510050467559936, + "grad_norm": 0.405103862285614, + "learning_rate": 0.0001250055238565617, + "loss": 1.4284, + "step": 28866 + }, + { + "epoch": 0.3751134992195152, + "grad_norm": 0.46252959966659546, + "learning_rate": 0.00012500292439465032, + "loss": 1.43, + "step": 28867 + }, + { + "epoch": 0.3751264937634311, + "grad_norm": 0.41500088572502136, + "learning_rate": 0.00012500032493273892, + "loss": 1.4114, + "step": 28868 + }, + { + "epoch": 0.37513948830734695, + "grad_norm": 0.4190671443939209, + "learning_rate": 0.00012499772547082755, + "loss": 1.4935, + "step": 28869 + }, + { + "epoch": 0.37515248285126285, + "grad_norm": 0.36047592759132385, + "learning_rate": 0.00012499512600891617, + "loss": 1.2584, + "step": 28870 + }, + { + "epoch": 0.3751654773951787, + "grad_norm": 0.33831408619880676, + "learning_rate": 0.00012499252654700477, + "loss": 1.4996, + "step": 28871 + }, + { + "epoch": 0.3751784719390946, + "grad_norm": 0.41569969058036804, + "learning_rate": 0.0001249899270850934, + "loss": 1.4035, + "step": 28872 + }, + { + "epoch": 0.37519146648301044, + "grad_norm": 0.38575318455696106, + "learning_rate": 0.00012498732762318202, + "loss": 1.5862, + "step": 28873 + }, + { + "epoch": 0.37520446102692634, + "grad_norm": 0.38970300555229187, + "learning_rate": 0.00012498472816127061, + "loss": 1.3781, + "step": 28874 + }, + { + "epoch": 0.3752174555708422, + "grad_norm": 0.4353445768356323, + "learning_rate": 0.00012498212869935924, + "loss": 1.3112, + "step": 28875 + }, + { + "epoch": 0.3752304501147581, + "grad_norm": 0.7563292384147644, + "learning_rate": 0.00012497952923744784, + "loss": 1.301, + "step": 28876 + }, + { + "epoch": 0.37524344465867393, + "grad_norm": 0.39830726385116577, + "learning_rate": 0.0001249769297755365, + "loss": 1.4271, + "step": 28877 + }, + { + "epoch": 0.37525643920258983, + "grad_norm": 0.3310627341270447, + "learning_rate": 0.00012497433031362508, + "loss": 1.3393, + "step": 28878 + }, + { + "epoch": 0.3752694337465057, + "grad_norm": 0.4384199380874634, + "learning_rate": 0.0001249717308517137, + "loss": 1.5294, + "step": 28879 + }, + { + "epoch": 0.3752824282904216, + "grad_norm": 0.45933350920677185, + "learning_rate": 0.0001249691313898023, + "loss": 1.3951, + "step": 28880 + }, + { + "epoch": 0.3752954228343374, + "grad_norm": 0.44464603066444397, + "learning_rate": 0.00012496653192789093, + "loss": 1.4112, + "step": 28881 + }, + { + "epoch": 0.3753084173782533, + "grad_norm": 0.2777060568332672, + "learning_rate": 0.00012496393246597956, + "loss": 1.3047, + "step": 28882 + }, + { + "epoch": 0.37532141192216917, + "grad_norm": 0.36590129137039185, + "learning_rate": 0.00012496133300406815, + "loss": 1.245, + "step": 28883 + }, + { + "epoch": 0.37533440646608507, + "grad_norm": 0.44382137060165405, + "learning_rate": 0.00012495873354215678, + "loss": 1.367, + "step": 28884 + }, + { + "epoch": 0.3753474010100009, + "grad_norm": 0.48704805970191956, + "learning_rate": 0.0001249561340802454, + "loss": 1.5727, + "step": 28885 + }, + { + "epoch": 0.3753603955539168, + "grad_norm": 0.33053481578826904, + "learning_rate": 0.000124953534618334, + "loss": 1.4756, + "step": 28886 + }, + { + "epoch": 0.37537339009783266, + "grad_norm": 0.4279806315898895, + "learning_rate": 0.00012495093515642262, + "loss": 1.573, + "step": 28887 + }, + { + "epoch": 0.37538638464174856, + "grad_norm": 0.429724782705307, + "learning_rate": 0.00012494833569451122, + "loss": 1.5761, + "step": 28888 + }, + { + "epoch": 0.3753993791856644, + "grad_norm": 0.43826866149902344, + "learning_rate": 0.00012494573623259987, + "loss": 1.3706, + "step": 28889 + }, + { + "epoch": 0.3754123737295803, + "grad_norm": 0.4755299687385559, + "learning_rate": 0.00012494313677068847, + "loss": 1.4164, + "step": 28890 + }, + { + "epoch": 0.37542536827349615, + "grad_norm": 0.40580055117607117, + "learning_rate": 0.0001249405373087771, + "loss": 1.3211, + "step": 28891 + }, + { + "epoch": 0.37543836281741205, + "grad_norm": 0.38015103340148926, + "learning_rate": 0.00012493793784686572, + "loss": 1.4253, + "step": 28892 + }, + { + "epoch": 0.3754513573613279, + "grad_norm": 0.4138756990432739, + "learning_rate": 0.00012493533838495432, + "loss": 1.1728, + "step": 28893 + }, + { + "epoch": 0.3754643519052438, + "grad_norm": 0.41246840357780457, + "learning_rate": 0.00012493273892304294, + "loss": 1.5317, + "step": 28894 + }, + { + "epoch": 0.37547734644915964, + "grad_norm": 0.34073248505592346, + "learning_rate": 0.00012493013946113154, + "loss": 1.5484, + "step": 28895 + }, + { + "epoch": 0.37549034099307554, + "grad_norm": 0.4287576377391815, + "learning_rate": 0.0001249275399992202, + "loss": 1.3092, + "step": 28896 + }, + { + "epoch": 0.3755033355369914, + "grad_norm": 0.38764679431915283, + "learning_rate": 0.0001249249405373088, + "loss": 1.2772, + "step": 28897 + }, + { + "epoch": 0.3755163300809073, + "grad_norm": 0.4042341709136963, + "learning_rate": 0.00012492234107539738, + "loss": 1.3907, + "step": 28898 + }, + { + "epoch": 0.37552932462482314, + "grad_norm": 0.4195457398891449, + "learning_rate": 0.000124919741613486, + "loss": 1.3287, + "step": 28899 + }, + { + "epoch": 0.37554231916873904, + "grad_norm": 0.31598949432373047, + "learning_rate": 0.00012491714215157463, + "loss": 1.3299, + "step": 28900 + }, + { + "epoch": 0.3755553137126549, + "grad_norm": 0.348940908908844, + "learning_rate": 0.00012491454268966326, + "loss": 1.6232, + "step": 28901 + }, + { + "epoch": 0.3755683082565708, + "grad_norm": 0.35178977251052856, + "learning_rate": 0.00012491194322775186, + "loss": 1.3992, + "step": 28902 + }, + { + "epoch": 0.3755813028004866, + "grad_norm": 0.3172970712184906, + "learning_rate": 0.00012490934376584048, + "loss": 1.2761, + "step": 28903 + }, + { + "epoch": 0.37559429734440253, + "grad_norm": 0.36124634742736816, + "learning_rate": 0.0001249067443039291, + "loss": 1.1986, + "step": 28904 + }, + { + "epoch": 0.3756072918883184, + "grad_norm": 0.4207603335380554, + "learning_rate": 0.0001249041448420177, + "loss": 1.5873, + "step": 28905 + }, + { + "epoch": 0.3756202864322343, + "grad_norm": 0.36357760429382324, + "learning_rate": 0.00012490154538010633, + "loss": 1.3592, + "step": 28906 + }, + { + "epoch": 0.3756332809761501, + "grad_norm": 0.4150446355342865, + "learning_rate": 0.00012489894591819492, + "loss": 1.4162, + "step": 28907 + }, + { + "epoch": 0.375646275520066, + "grad_norm": 0.40322908759117126, + "learning_rate": 0.00012489634645628358, + "loss": 1.5258, + "step": 28908 + }, + { + "epoch": 0.37565927006398186, + "grad_norm": 0.4171421229839325, + "learning_rate": 0.00012489374699437217, + "loss": 1.5131, + "step": 28909 + }, + { + "epoch": 0.37567226460789777, + "grad_norm": 0.3973075747489929, + "learning_rate": 0.0001248911475324608, + "loss": 1.3498, + "step": 28910 + }, + { + "epoch": 0.3756852591518136, + "grad_norm": 0.3319651782512665, + "learning_rate": 0.0001248885480705494, + "loss": 1.3227, + "step": 28911 + }, + { + "epoch": 0.3756982536957295, + "grad_norm": 0.4336509108543396, + "learning_rate": 0.00012488594860863802, + "loss": 1.2831, + "step": 28912 + }, + { + "epoch": 0.37571124823964536, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.00012488334914672664, + "loss": 1.3754, + "step": 28913 + }, + { + "epoch": 0.37572424278356126, + "grad_norm": 0.40609779953956604, + "learning_rate": 0.00012488074968481524, + "loss": 1.2653, + "step": 28914 + }, + { + "epoch": 0.3757372373274771, + "grad_norm": 0.4330745339393616, + "learning_rate": 0.00012487815022290387, + "loss": 1.4787, + "step": 28915 + }, + { + "epoch": 0.375750231871393, + "grad_norm": 0.4194702208042145, + "learning_rate": 0.0001248755507609925, + "loss": 1.4082, + "step": 28916 + }, + { + "epoch": 0.37576322641530885, + "grad_norm": 0.4286632239818573, + "learning_rate": 0.0001248729512990811, + "loss": 1.5517, + "step": 28917 + }, + { + "epoch": 0.37577622095922475, + "grad_norm": 0.3197442591190338, + "learning_rate": 0.0001248703518371697, + "loss": 1.4543, + "step": 28918 + }, + { + "epoch": 0.3757892155031406, + "grad_norm": 0.4218446612358093, + "learning_rate": 0.0001248677523752583, + "loss": 1.2784, + "step": 28919 + }, + { + "epoch": 0.3758022100470565, + "grad_norm": 0.3768215477466583, + "learning_rate": 0.00012486515291334696, + "loss": 1.4439, + "step": 28920 + }, + { + "epoch": 0.37581520459097234, + "grad_norm": 0.2715786397457123, + "learning_rate": 0.00012486255345143556, + "loss": 1.59, + "step": 28921 + }, + { + "epoch": 0.37582819913488824, + "grad_norm": 0.3572438955307007, + "learning_rate": 0.00012485995398952418, + "loss": 1.3555, + "step": 28922 + }, + { + "epoch": 0.3758411936788041, + "grad_norm": 0.4257218539714813, + "learning_rate": 0.00012485735452761278, + "loss": 1.5358, + "step": 28923 + }, + { + "epoch": 0.37585418822272, + "grad_norm": 0.5142207145690918, + "learning_rate": 0.0001248547550657014, + "loss": 1.3167, + "step": 28924 + }, + { + "epoch": 0.37586718276663583, + "grad_norm": 0.3964592218399048, + "learning_rate": 0.00012485215560379003, + "loss": 1.3761, + "step": 28925 + }, + { + "epoch": 0.37588017731055173, + "grad_norm": 0.3714204728603363, + "learning_rate": 0.00012484955614187863, + "loss": 1.509, + "step": 28926 + }, + { + "epoch": 0.3758931718544676, + "grad_norm": 0.40894579887390137, + "learning_rate": 0.00012484695667996725, + "loss": 1.4759, + "step": 28927 + }, + { + "epoch": 0.3759061663983835, + "grad_norm": 0.5266218781471252, + "learning_rate": 0.00012484435721805588, + "loss": 1.4613, + "step": 28928 + }, + { + "epoch": 0.3759191609422994, + "grad_norm": 0.41685837507247925, + "learning_rate": 0.00012484175775614447, + "loss": 1.363, + "step": 28929 + }, + { + "epoch": 0.3759321554862152, + "grad_norm": 0.41698765754699707, + "learning_rate": 0.0001248391582942331, + "loss": 1.5562, + "step": 28930 + }, + { + "epoch": 0.3759451500301311, + "grad_norm": 0.3285476565361023, + "learning_rate": 0.00012483655883232172, + "loss": 1.4279, + "step": 28931 + }, + { + "epoch": 0.37595814457404697, + "grad_norm": 0.40252307057380676, + "learning_rate": 0.00012483395937041035, + "loss": 1.2857, + "step": 28932 + }, + { + "epoch": 0.37597113911796287, + "grad_norm": 0.4399270713329315, + "learning_rate": 0.00012483135990849894, + "loss": 1.2879, + "step": 28933 + }, + { + "epoch": 0.3759841336618787, + "grad_norm": 0.4111799895763397, + "learning_rate": 0.00012482876044658757, + "loss": 1.4794, + "step": 28934 + }, + { + "epoch": 0.3759971282057946, + "grad_norm": 0.4470709562301636, + "learning_rate": 0.0001248261609846762, + "loss": 1.5317, + "step": 28935 + }, + { + "epoch": 0.37601012274971046, + "grad_norm": 0.5309771299362183, + "learning_rate": 0.0001248235615227648, + "loss": 1.4524, + "step": 28936 + }, + { + "epoch": 0.37602311729362636, + "grad_norm": 0.33288291096687317, + "learning_rate": 0.00012482096206085341, + "loss": 1.2423, + "step": 28937 + }, + { + "epoch": 0.3760361118375422, + "grad_norm": 0.36665570735931396, + "learning_rate": 0.000124818362598942, + "loss": 1.4693, + "step": 28938 + }, + { + "epoch": 0.3760491063814581, + "grad_norm": 0.4501255750656128, + "learning_rate": 0.00012481576313703066, + "loss": 1.267, + "step": 28939 + }, + { + "epoch": 0.37606210092537395, + "grad_norm": 0.3907860517501831, + "learning_rate": 0.00012481316367511926, + "loss": 1.3128, + "step": 28940 + }, + { + "epoch": 0.37607509546928986, + "grad_norm": 0.35737356543540955, + "learning_rate": 0.00012481056421320786, + "loss": 1.3525, + "step": 28941 + }, + { + "epoch": 0.3760880900132057, + "grad_norm": 0.48332130908966064, + "learning_rate": 0.00012480796475129648, + "loss": 1.5365, + "step": 28942 + }, + { + "epoch": 0.3761010845571216, + "grad_norm": 0.3659692704677582, + "learning_rate": 0.0001248053652893851, + "loss": 1.2856, + "step": 28943 + }, + { + "epoch": 0.37611407910103745, + "grad_norm": 0.4015612304210663, + "learning_rate": 0.00012480276582747373, + "loss": 1.2625, + "step": 28944 + }, + { + "epoch": 0.37612707364495335, + "grad_norm": 0.40080517530441284, + "learning_rate": 0.00012480016636556233, + "loss": 1.2334, + "step": 28945 + }, + { + "epoch": 0.3761400681888692, + "grad_norm": 0.4071347713470459, + "learning_rate": 0.00012479756690365095, + "loss": 1.3845, + "step": 28946 + }, + { + "epoch": 0.3761530627327851, + "grad_norm": 0.42958763241767883, + "learning_rate": 0.00012479496744173958, + "loss": 1.4909, + "step": 28947 + }, + { + "epoch": 0.37616605727670094, + "grad_norm": 0.39240241050720215, + "learning_rate": 0.00012479236797982818, + "loss": 1.2925, + "step": 28948 + }, + { + "epoch": 0.37617905182061684, + "grad_norm": 0.3925537168979645, + "learning_rate": 0.0001247897685179168, + "loss": 1.413, + "step": 28949 + }, + { + "epoch": 0.3761920463645327, + "grad_norm": 0.4498641788959503, + "learning_rate": 0.0001247871690560054, + "loss": 1.248, + "step": 28950 + }, + { + "epoch": 0.3762050409084486, + "grad_norm": 0.4823785424232483, + "learning_rate": 0.00012478456959409405, + "loss": 1.5332, + "step": 28951 + }, + { + "epoch": 0.37621803545236443, + "grad_norm": 0.41888949275016785, + "learning_rate": 0.00012478197013218265, + "loss": 1.4827, + "step": 28952 + }, + { + "epoch": 0.37623102999628033, + "grad_norm": 0.43660542368888855, + "learning_rate": 0.00012477937067027124, + "loss": 1.3882, + "step": 28953 + }, + { + "epoch": 0.3762440245401962, + "grad_norm": 0.33248305320739746, + "learning_rate": 0.00012477677120835987, + "loss": 1.3288, + "step": 28954 + }, + { + "epoch": 0.3762570190841121, + "grad_norm": 0.3973116874694824, + "learning_rate": 0.0001247741717464485, + "loss": 1.3128, + "step": 28955 + }, + { + "epoch": 0.3762700136280279, + "grad_norm": 0.42672908306121826, + "learning_rate": 0.00012477157228453712, + "loss": 1.5794, + "step": 28956 + }, + { + "epoch": 0.3762830081719438, + "grad_norm": 0.32531246542930603, + "learning_rate": 0.00012476897282262571, + "loss": 1.0845, + "step": 28957 + }, + { + "epoch": 0.37629600271585967, + "grad_norm": 0.26367253065109253, + "learning_rate": 0.00012476637336071434, + "loss": 1.2971, + "step": 28958 + }, + { + "epoch": 0.37630899725977557, + "grad_norm": 0.5099393725395203, + "learning_rate": 0.00012476377389880296, + "loss": 1.408, + "step": 28959 + }, + { + "epoch": 0.3763219918036914, + "grad_norm": 0.3670794665813446, + "learning_rate": 0.00012476117443689156, + "loss": 1.4446, + "step": 28960 + }, + { + "epoch": 0.3763349863476073, + "grad_norm": 0.47548606991767883, + "learning_rate": 0.00012475857497498019, + "loss": 1.4179, + "step": 28961 + }, + { + "epoch": 0.37634798089152316, + "grad_norm": 0.3915907144546509, + "learning_rate": 0.00012475597551306878, + "loss": 1.4244, + "step": 28962 + }, + { + "epoch": 0.37636097543543906, + "grad_norm": 0.3869493007659912, + "learning_rate": 0.00012475337605115743, + "loss": 1.468, + "step": 28963 + }, + { + "epoch": 0.3763739699793549, + "grad_norm": 0.43332675099372864, + "learning_rate": 0.00012475077658924603, + "loss": 1.4047, + "step": 28964 + }, + { + "epoch": 0.3763869645232708, + "grad_norm": 0.4835144281387329, + "learning_rate": 0.00012474817712733463, + "loss": 1.4065, + "step": 28965 + }, + { + "epoch": 0.37639995906718665, + "grad_norm": 0.26890265941619873, + "learning_rate": 0.00012474557766542328, + "loss": 1.2324, + "step": 28966 + }, + { + "epoch": 0.37641295361110255, + "grad_norm": 0.3907824754714966, + "learning_rate": 0.00012474297820351188, + "loss": 1.4259, + "step": 28967 + }, + { + "epoch": 0.3764259481550184, + "grad_norm": 0.4691123962402344, + "learning_rate": 0.0001247403787416005, + "loss": 1.2552, + "step": 28968 + }, + { + "epoch": 0.3764389426989343, + "grad_norm": 0.43298959732055664, + "learning_rate": 0.0001247377792796891, + "loss": 1.4952, + "step": 28969 + }, + { + "epoch": 0.37645193724285014, + "grad_norm": 0.46592846512794495, + "learning_rate": 0.00012473517981777772, + "loss": 1.5468, + "step": 28970 + }, + { + "epoch": 0.37646493178676604, + "grad_norm": 0.4093274474143982, + "learning_rate": 0.00012473258035586635, + "loss": 1.4561, + "step": 28971 + }, + { + "epoch": 0.3764779263306819, + "grad_norm": 0.43703800439834595, + "learning_rate": 0.00012472998089395495, + "loss": 1.4916, + "step": 28972 + }, + { + "epoch": 0.3764909208745978, + "grad_norm": 0.30699682235717773, + "learning_rate": 0.00012472738143204357, + "loss": 1.2114, + "step": 28973 + }, + { + "epoch": 0.37650391541851363, + "grad_norm": 0.3838783800601959, + "learning_rate": 0.0001247247819701322, + "loss": 1.4807, + "step": 28974 + }, + { + "epoch": 0.37651690996242954, + "grad_norm": 0.31833335757255554, + "learning_rate": 0.00012472218250822082, + "loss": 1.2898, + "step": 28975 + }, + { + "epoch": 0.3765299045063454, + "grad_norm": 0.4873744249343872, + "learning_rate": 0.00012471958304630942, + "loss": 1.3817, + "step": 28976 + }, + { + "epoch": 0.3765428990502613, + "grad_norm": 0.3133584260940552, + "learning_rate": 0.00012471698358439804, + "loss": 1.3875, + "step": 28977 + }, + { + "epoch": 0.3765558935941771, + "grad_norm": 0.37226349115371704, + "learning_rate": 0.00012471438412248667, + "loss": 1.362, + "step": 28978 + }, + { + "epoch": 0.376568888138093, + "grad_norm": 0.4297705888748169, + "learning_rate": 0.00012471178466057526, + "loss": 1.3149, + "step": 28979 + }, + { + "epoch": 0.3765818826820089, + "grad_norm": 0.4352920949459076, + "learning_rate": 0.0001247091851986639, + "loss": 1.3512, + "step": 28980 + }, + { + "epoch": 0.3765948772259248, + "grad_norm": 0.4327623248100281, + "learning_rate": 0.00012470658573675248, + "loss": 1.468, + "step": 28981 + }, + { + "epoch": 0.3766078717698406, + "grad_norm": 0.32561036944389343, + "learning_rate": 0.0001247039862748411, + "loss": 1.2933, + "step": 28982 + }, + { + "epoch": 0.3766208663137565, + "grad_norm": 0.44417065382003784, + "learning_rate": 0.00012470138681292973, + "loss": 1.2152, + "step": 28983 + }, + { + "epoch": 0.37663386085767236, + "grad_norm": 0.4013122618198395, + "learning_rate": 0.00012469878735101833, + "loss": 1.5456, + "step": 28984 + }, + { + "epoch": 0.37664685540158827, + "grad_norm": 0.4067772924900055, + "learning_rate": 0.00012469618788910696, + "loss": 1.3099, + "step": 28985 + }, + { + "epoch": 0.3766598499455041, + "grad_norm": 0.5173984169960022, + "learning_rate": 0.00012469358842719558, + "loss": 1.5832, + "step": 28986 + }, + { + "epoch": 0.37667284448942, + "grad_norm": 0.41349759697914124, + "learning_rate": 0.0001246909889652842, + "loss": 1.5248, + "step": 28987 + }, + { + "epoch": 0.37668583903333586, + "grad_norm": 0.416258841753006, + "learning_rate": 0.0001246883895033728, + "loss": 1.5054, + "step": 28988 + }, + { + "epoch": 0.37669883357725176, + "grad_norm": 0.42920196056365967, + "learning_rate": 0.00012468579004146143, + "loss": 1.4458, + "step": 28989 + }, + { + "epoch": 0.3767118281211676, + "grad_norm": 0.3957315683364868, + "learning_rate": 0.00012468319057955005, + "loss": 1.2049, + "step": 28990 + }, + { + "epoch": 0.3767248226650835, + "grad_norm": 0.4205652177333832, + "learning_rate": 0.00012468059111763865, + "loss": 1.2859, + "step": 28991 + }, + { + "epoch": 0.37673781720899935, + "grad_norm": 0.3964245021343231, + "learning_rate": 0.00012467799165572727, + "loss": 1.5463, + "step": 28992 + }, + { + "epoch": 0.37675081175291525, + "grad_norm": 0.3582054376602173, + "learning_rate": 0.00012467539219381587, + "loss": 1.2587, + "step": 28993 + }, + { + "epoch": 0.3767638062968311, + "grad_norm": 0.4287501871585846, + "learning_rate": 0.00012467279273190452, + "loss": 1.5084, + "step": 28994 + }, + { + "epoch": 0.376776800840747, + "grad_norm": 0.39114248752593994, + "learning_rate": 0.00012467019326999312, + "loss": 1.4415, + "step": 28995 + }, + { + "epoch": 0.37678979538466284, + "grad_norm": 0.493540495634079, + "learning_rate": 0.00012466759380808172, + "loss": 1.4702, + "step": 28996 + }, + { + "epoch": 0.37680278992857874, + "grad_norm": 0.5261231660842896, + "learning_rate": 0.00012466499434617034, + "loss": 1.3912, + "step": 28997 + }, + { + "epoch": 0.3768157844724946, + "grad_norm": 0.32692450284957886, + "learning_rate": 0.00012466239488425897, + "loss": 1.311, + "step": 28998 + }, + { + "epoch": 0.3768287790164105, + "grad_norm": 0.8313952088356018, + "learning_rate": 0.0001246597954223476, + "loss": 1.715, + "step": 28999 + }, + { + "epoch": 0.37684177356032633, + "grad_norm": 0.25373613834381104, + "learning_rate": 0.0001246571959604362, + "loss": 1.1988, + "step": 29000 + }, + { + "epoch": 0.37685476810424223, + "grad_norm": 0.3408701717853546, + "learning_rate": 0.0001246545964985248, + "loss": 1.3072, + "step": 29001 + }, + { + "epoch": 0.3768677626481581, + "grad_norm": 0.4798407554626465, + "learning_rate": 0.00012465199703661344, + "loss": 1.4693, + "step": 29002 + }, + { + "epoch": 0.376880757192074, + "grad_norm": 0.4050613343715668, + "learning_rate": 0.00012464939757470203, + "loss": 1.45, + "step": 29003 + }, + { + "epoch": 0.3768937517359898, + "grad_norm": 0.4492764174938202, + "learning_rate": 0.00012464679811279066, + "loss": 1.4175, + "step": 29004 + }, + { + "epoch": 0.3769067462799057, + "grad_norm": 0.43367472290992737, + "learning_rate": 0.00012464419865087928, + "loss": 1.2815, + "step": 29005 + }, + { + "epoch": 0.3769197408238216, + "grad_norm": 0.35858961939811707, + "learning_rate": 0.0001246415991889679, + "loss": 1.1285, + "step": 29006 + }, + { + "epoch": 0.37693273536773747, + "grad_norm": 0.301493376493454, + "learning_rate": 0.0001246389997270565, + "loss": 1.4326, + "step": 29007 + }, + { + "epoch": 0.37694572991165337, + "grad_norm": 0.4287569224834442, + "learning_rate": 0.0001246364002651451, + "loss": 1.1246, + "step": 29008 + }, + { + "epoch": 0.3769587244555692, + "grad_norm": 0.4359901547431946, + "learning_rate": 0.00012463380080323375, + "loss": 1.5789, + "step": 29009 + }, + { + "epoch": 0.3769717189994851, + "grad_norm": 0.46248188614845276, + "learning_rate": 0.00012463120134132235, + "loss": 1.4456, + "step": 29010 + }, + { + "epoch": 0.37698471354340096, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.00012462860187941098, + "loss": 1.5375, + "step": 29011 + }, + { + "epoch": 0.37699770808731686, + "grad_norm": 0.2823288142681122, + "learning_rate": 0.00012462600241749957, + "loss": 1.2573, + "step": 29012 + }, + { + "epoch": 0.3770107026312327, + "grad_norm": 0.5519055724143982, + "learning_rate": 0.0001246234029555882, + "loss": 1.6317, + "step": 29013 + }, + { + "epoch": 0.3770236971751486, + "grad_norm": 0.3213790953159332, + "learning_rate": 0.00012462080349367682, + "loss": 1.3412, + "step": 29014 + }, + { + "epoch": 0.37703669171906445, + "grad_norm": 0.35477882623672485, + "learning_rate": 0.00012461820403176542, + "loss": 1.2711, + "step": 29015 + }, + { + "epoch": 0.37704968626298035, + "grad_norm": 0.36729753017425537, + "learning_rate": 0.00012461560456985404, + "loss": 1.3556, + "step": 29016 + }, + { + "epoch": 0.3770626808068962, + "grad_norm": 0.5463625192642212, + "learning_rate": 0.00012461300510794267, + "loss": 1.3132, + "step": 29017 + }, + { + "epoch": 0.3770756753508121, + "grad_norm": 0.4464842975139618, + "learning_rate": 0.0001246104056460313, + "loss": 1.637, + "step": 29018 + }, + { + "epoch": 0.37708866989472795, + "grad_norm": 0.3476654589176178, + "learning_rate": 0.0001246078061841199, + "loss": 1.1511, + "step": 29019 + }, + { + "epoch": 0.37710166443864385, + "grad_norm": 0.323486864566803, + "learning_rate": 0.0001246052067222085, + "loss": 1.3813, + "step": 29020 + }, + { + "epoch": 0.3771146589825597, + "grad_norm": 0.395973801612854, + "learning_rate": 0.00012460260726029714, + "loss": 1.4752, + "step": 29021 + }, + { + "epoch": 0.3771276535264756, + "grad_norm": 0.4222963750362396, + "learning_rate": 0.00012460000779838574, + "loss": 1.5462, + "step": 29022 + }, + { + "epoch": 0.37714064807039144, + "grad_norm": 0.3361075222492218, + "learning_rate": 0.00012459740833647436, + "loss": 1.3312, + "step": 29023 + }, + { + "epoch": 0.37715364261430734, + "grad_norm": 0.47997570037841797, + "learning_rate": 0.00012459480887456296, + "loss": 1.4521, + "step": 29024 + }, + { + "epoch": 0.3771666371582232, + "grad_norm": 0.4050688147544861, + "learning_rate": 0.00012459220941265158, + "loss": 1.2776, + "step": 29025 + }, + { + "epoch": 0.3771796317021391, + "grad_norm": 0.4550187885761261, + "learning_rate": 0.0001245896099507402, + "loss": 1.277, + "step": 29026 + }, + { + "epoch": 0.37719262624605493, + "grad_norm": 0.3913280665874481, + "learning_rate": 0.0001245870104888288, + "loss": 1.4726, + "step": 29027 + }, + { + "epoch": 0.37720562078997083, + "grad_norm": 0.4334585964679718, + "learning_rate": 0.00012458441102691743, + "loss": 1.2845, + "step": 29028 + }, + { + "epoch": 0.3772186153338867, + "grad_norm": 0.4494137465953827, + "learning_rate": 0.00012458181156500605, + "loss": 1.4394, + "step": 29029 + }, + { + "epoch": 0.3772316098778026, + "grad_norm": 0.3425474464893341, + "learning_rate": 0.00012457921210309468, + "loss": 1.4459, + "step": 29030 + }, + { + "epoch": 0.3772446044217184, + "grad_norm": 0.4476107358932495, + "learning_rate": 0.00012457661264118328, + "loss": 1.252, + "step": 29031 + }, + { + "epoch": 0.3772575989656343, + "grad_norm": 0.31464043259620667, + "learning_rate": 0.0001245740131792719, + "loss": 1.1194, + "step": 29032 + }, + { + "epoch": 0.37727059350955017, + "grad_norm": 0.4742145836353302, + "learning_rate": 0.00012457141371736052, + "loss": 1.4444, + "step": 29033 + }, + { + "epoch": 0.37728358805346607, + "grad_norm": 0.5120161175727844, + "learning_rate": 0.00012456881425544912, + "loss": 1.3086, + "step": 29034 + }, + { + "epoch": 0.3772965825973819, + "grad_norm": 0.354155033826828, + "learning_rate": 0.00012456621479353775, + "loss": 1.4172, + "step": 29035 + }, + { + "epoch": 0.3773095771412978, + "grad_norm": 0.44087985157966614, + "learning_rate": 0.00012456361533162634, + "loss": 1.4429, + "step": 29036 + }, + { + "epoch": 0.37732257168521366, + "grad_norm": 0.4338029623031616, + "learning_rate": 0.00012456101586971497, + "loss": 1.3781, + "step": 29037 + }, + { + "epoch": 0.37733556622912956, + "grad_norm": 0.4211170971393585, + "learning_rate": 0.0001245584164078036, + "loss": 1.413, + "step": 29038 + }, + { + "epoch": 0.3773485607730454, + "grad_norm": 0.43683192133903503, + "learning_rate": 0.0001245558169458922, + "loss": 1.3641, + "step": 29039 + }, + { + "epoch": 0.3773615553169613, + "grad_norm": 0.42215219140052795, + "learning_rate": 0.00012455321748398084, + "loss": 1.5607, + "step": 29040 + }, + { + "epoch": 0.37737454986087715, + "grad_norm": 0.4348534941673279, + "learning_rate": 0.00012455061802206944, + "loss": 1.4015, + "step": 29041 + }, + { + "epoch": 0.37738754440479305, + "grad_norm": 0.2570900619029999, + "learning_rate": 0.00012454801856015806, + "loss": 1.2447, + "step": 29042 + }, + { + "epoch": 0.3774005389487089, + "grad_norm": 0.38586023449897766, + "learning_rate": 0.00012454541909824666, + "loss": 1.2944, + "step": 29043 + }, + { + "epoch": 0.3774135334926248, + "grad_norm": 0.3790202736854553, + "learning_rate": 0.00012454281963633529, + "loss": 1.5477, + "step": 29044 + }, + { + "epoch": 0.37742652803654064, + "grad_norm": 0.4255603551864624, + "learning_rate": 0.0001245402201744239, + "loss": 1.3821, + "step": 29045 + }, + { + "epoch": 0.37743952258045654, + "grad_norm": 0.47181031107902527, + "learning_rate": 0.0001245376207125125, + "loss": 1.467, + "step": 29046 + }, + { + "epoch": 0.3774525171243724, + "grad_norm": 0.400684118270874, + "learning_rate": 0.00012453502125060113, + "loss": 1.3834, + "step": 29047 + }, + { + "epoch": 0.3774655116682883, + "grad_norm": 0.316967248916626, + "learning_rate": 0.00012453242178868976, + "loss": 1.21, + "step": 29048 + }, + { + "epoch": 0.37747850621220413, + "grad_norm": 0.3553643524646759, + "learning_rate": 0.00012452982232677835, + "loss": 1.3608, + "step": 29049 + }, + { + "epoch": 0.37749150075612004, + "grad_norm": 0.47424566745758057, + "learning_rate": 0.00012452722286486698, + "loss": 1.503, + "step": 29050 + }, + { + "epoch": 0.3775044953000359, + "grad_norm": 0.4041392207145691, + "learning_rate": 0.00012452462340295558, + "loss": 1.4028, + "step": 29051 + }, + { + "epoch": 0.3775174898439518, + "grad_norm": 0.3005121946334839, + "learning_rate": 0.00012452202394104423, + "loss": 1.1557, + "step": 29052 + }, + { + "epoch": 0.3775304843878676, + "grad_norm": 0.3968088626861572, + "learning_rate": 0.00012451942447913282, + "loss": 1.4114, + "step": 29053 + }, + { + "epoch": 0.3775434789317835, + "grad_norm": 0.3767034709453583, + "learning_rate": 0.00012451682501722145, + "loss": 1.4948, + "step": 29054 + }, + { + "epoch": 0.3775564734756994, + "grad_norm": 0.41209688782691956, + "learning_rate": 0.00012451422555531005, + "loss": 1.4503, + "step": 29055 + }, + { + "epoch": 0.3775694680196153, + "grad_norm": 0.42090338468551636, + "learning_rate": 0.00012451162609339867, + "loss": 1.4419, + "step": 29056 + }, + { + "epoch": 0.3775824625635311, + "grad_norm": 0.4267866313457489, + "learning_rate": 0.0001245090266314873, + "loss": 1.3762, + "step": 29057 + }, + { + "epoch": 0.377595457107447, + "grad_norm": 0.38681766390800476, + "learning_rate": 0.0001245064271695759, + "loss": 1.1559, + "step": 29058 + }, + { + "epoch": 0.37760845165136286, + "grad_norm": 0.2588450014591217, + "learning_rate": 0.00012450382770766452, + "loss": 1.3235, + "step": 29059 + }, + { + "epoch": 0.37762144619527876, + "grad_norm": 0.3225371539592743, + "learning_rate": 0.00012450122824575314, + "loss": 1.2559, + "step": 29060 + }, + { + "epoch": 0.3776344407391946, + "grad_norm": 0.2933025658130646, + "learning_rate": 0.00012449862878384177, + "loss": 1.3796, + "step": 29061 + }, + { + "epoch": 0.3776474352831105, + "grad_norm": 0.4021296203136444, + "learning_rate": 0.00012449602932193036, + "loss": 1.2806, + "step": 29062 + }, + { + "epoch": 0.37766042982702636, + "grad_norm": 0.3588813543319702, + "learning_rate": 0.00012449342986001896, + "loss": 1.5836, + "step": 29063 + }, + { + "epoch": 0.37767342437094226, + "grad_norm": 0.5445573329925537, + "learning_rate": 0.0001244908303981076, + "loss": 1.5183, + "step": 29064 + }, + { + "epoch": 0.3776864189148581, + "grad_norm": 0.40056294202804565, + "learning_rate": 0.0001244882309361962, + "loss": 1.351, + "step": 29065 + }, + { + "epoch": 0.377699413458774, + "grad_norm": 0.3994879722595215, + "learning_rate": 0.00012448563147428483, + "loss": 1.5433, + "step": 29066 + }, + { + "epoch": 0.37771240800268985, + "grad_norm": 0.45303842425346375, + "learning_rate": 0.00012448303201237343, + "loss": 1.4605, + "step": 29067 + }, + { + "epoch": 0.37772540254660575, + "grad_norm": 0.510071337223053, + "learning_rate": 0.00012448043255046206, + "loss": 1.4775, + "step": 29068 + }, + { + "epoch": 0.3777383970905216, + "grad_norm": 0.3420342206954956, + "learning_rate": 0.00012447783308855068, + "loss": 1.3244, + "step": 29069 + }, + { + "epoch": 0.3777513916344375, + "grad_norm": 0.4512944221496582, + "learning_rate": 0.00012447523362663928, + "loss": 1.6217, + "step": 29070 + }, + { + "epoch": 0.37776438617835334, + "grad_norm": 0.39227572083473206, + "learning_rate": 0.0001244726341647279, + "loss": 1.3262, + "step": 29071 + }, + { + "epoch": 0.37777738072226924, + "grad_norm": 0.3984995484352112, + "learning_rate": 0.00012447003470281653, + "loss": 1.5671, + "step": 29072 + }, + { + "epoch": 0.3777903752661851, + "grad_norm": 0.3556367754936218, + "learning_rate": 0.00012446743524090515, + "loss": 1.3485, + "step": 29073 + }, + { + "epoch": 0.377803369810101, + "grad_norm": 0.33799123764038086, + "learning_rate": 0.00012446483577899375, + "loss": 1.2686, + "step": 29074 + }, + { + "epoch": 0.37781636435401683, + "grad_norm": 0.28384819626808167, + "learning_rate": 0.00012446223631708235, + "loss": 1.2731, + "step": 29075 + }, + { + "epoch": 0.37782935889793273, + "grad_norm": 0.42464518547058105, + "learning_rate": 0.000124459636855171, + "loss": 1.3001, + "step": 29076 + }, + { + "epoch": 0.3778423534418486, + "grad_norm": 0.48076972365379333, + "learning_rate": 0.0001244570373932596, + "loss": 1.442, + "step": 29077 + }, + { + "epoch": 0.3778553479857645, + "grad_norm": 0.33230268955230713, + "learning_rate": 0.00012445443793134822, + "loss": 1.506, + "step": 29078 + }, + { + "epoch": 0.3778683425296803, + "grad_norm": 0.34044328331947327, + "learning_rate": 0.00012445183846943684, + "loss": 1.137, + "step": 29079 + }, + { + "epoch": 0.3778813370735962, + "grad_norm": 0.3845383822917938, + "learning_rate": 0.00012444923900752544, + "loss": 1.3085, + "step": 29080 + }, + { + "epoch": 0.3778943316175121, + "grad_norm": 0.3190215826034546, + "learning_rate": 0.00012444663954561407, + "loss": 1.2845, + "step": 29081 + }, + { + "epoch": 0.37790732616142797, + "grad_norm": 0.3442033529281616, + "learning_rate": 0.00012444404008370266, + "loss": 1.3278, + "step": 29082 + }, + { + "epoch": 0.37792032070534387, + "grad_norm": 0.4151146411895752, + "learning_rate": 0.00012444144062179132, + "loss": 1.3828, + "step": 29083 + }, + { + "epoch": 0.3779333152492597, + "grad_norm": 0.38955193758010864, + "learning_rate": 0.0001244388411598799, + "loss": 1.3126, + "step": 29084 + }, + { + "epoch": 0.3779463097931756, + "grad_norm": 0.38300663232803345, + "learning_rate": 0.00012443624169796854, + "loss": 1.3438, + "step": 29085 + }, + { + "epoch": 0.37795930433709146, + "grad_norm": 0.4433871805667877, + "learning_rate": 0.00012443364223605713, + "loss": 1.4953, + "step": 29086 + }, + { + "epoch": 0.37797229888100736, + "grad_norm": 0.5659882426261902, + "learning_rate": 0.00012443104277414576, + "loss": 1.4436, + "step": 29087 + }, + { + "epoch": 0.3779852934249232, + "grad_norm": 0.35576027631759644, + "learning_rate": 0.00012442844331223438, + "loss": 1.3506, + "step": 29088 + }, + { + "epoch": 0.3779982879688391, + "grad_norm": 0.38196861743927, + "learning_rate": 0.00012442584385032298, + "loss": 1.4136, + "step": 29089 + }, + { + "epoch": 0.37801128251275495, + "grad_norm": 0.39096394181251526, + "learning_rate": 0.0001244232443884116, + "loss": 1.2255, + "step": 29090 + }, + { + "epoch": 0.37802427705667085, + "grad_norm": 0.4709497094154358, + "learning_rate": 0.00012442064492650023, + "loss": 1.5984, + "step": 29091 + }, + { + "epoch": 0.3780372716005867, + "grad_norm": 0.49661585688591003, + "learning_rate": 0.00012441804546458883, + "loss": 1.2049, + "step": 29092 + }, + { + "epoch": 0.3780502661445026, + "grad_norm": 0.4364995062351227, + "learning_rate": 0.00012441544600267745, + "loss": 1.4643, + "step": 29093 + }, + { + "epoch": 0.37806326068841845, + "grad_norm": 0.40390875935554504, + "learning_rate": 0.00012441284654076605, + "loss": 1.4835, + "step": 29094 + }, + { + "epoch": 0.37807625523233435, + "grad_norm": 0.25178074836730957, + "learning_rate": 0.0001244102470788547, + "loss": 1.4639, + "step": 29095 + }, + { + "epoch": 0.3780892497762502, + "grad_norm": 0.3871273100376129, + "learning_rate": 0.0001244076476169433, + "loss": 1.4521, + "step": 29096 + }, + { + "epoch": 0.3781022443201661, + "grad_norm": 0.447313129901886, + "learning_rate": 0.00012440504815503192, + "loss": 1.3362, + "step": 29097 + }, + { + "epoch": 0.37811523886408194, + "grad_norm": 0.3746159076690674, + "learning_rate": 0.00012440244869312052, + "loss": 1.5628, + "step": 29098 + }, + { + "epoch": 0.37812823340799784, + "grad_norm": 0.3722781538963318, + "learning_rate": 0.00012439984923120914, + "loss": 1.3143, + "step": 29099 + }, + { + "epoch": 0.3781412279519137, + "grad_norm": 0.4642501175403595, + "learning_rate": 0.00012439724976929777, + "loss": 1.4844, + "step": 29100 + }, + { + "epoch": 0.3781542224958296, + "grad_norm": 0.37560561299324036, + "learning_rate": 0.00012439465030738637, + "loss": 1.3506, + "step": 29101 + }, + { + "epoch": 0.37816721703974543, + "grad_norm": 0.38260748982429504, + "learning_rate": 0.000124392050845475, + "loss": 1.4681, + "step": 29102 + }, + { + "epoch": 0.37818021158366133, + "grad_norm": 0.39832690358161926, + "learning_rate": 0.00012438945138356361, + "loss": 1.4453, + "step": 29103 + }, + { + "epoch": 0.3781932061275772, + "grad_norm": 0.3914296329021454, + "learning_rate": 0.0001243868519216522, + "loss": 1.5234, + "step": 29104 + }, + { + "epoch": 0.3782062006714931, + "grad_norm": 0.532298743724823, + "learning_rate": 0.00012438425245974084, + "loss": 1.5036, + "step": 29105 + }, + { + "epoch": 0.3782191952154089, + "grad_norm": 0.45808789134025574, + "learning_rate": 0.00012438165299782943, + "loss": 1.2574, + "step": 29106 + }, + { + "epoch": 0.3782321897593248, + "grad_norm": 0.47658196091651917, + "learning_rate": 0.00012437905353591809, + "loss": 1.4875, + "step": 29107 + }, + { + "epoch": 0.37824518430324067, + "grad_norm": 0.37270355224609375, + "learning_rate": 0.00012437645407400668, + "loss": 1.4023, + "step": 29108 + }, + { + "epoch": 0.37825817884715657, + "grad_norm": 0.39826536178588867, + "learning_rate": 0.0001243738546120953, + "loss": 1.5969, + "step": 29109 + }, + { + "epoch": 0.3782711733910724, + "grad_norm": 0.37886396050453186, + "learning_rate": 0.0001243712551501839, + "loss": 1.1517, + "step": 29110 + }, + { + "epoch": 0.3782841679349883, + "grad_norm": 0.4825325310230255, + "learning_rate": 0.00012436865568827253, + "loss": 1.4136, + "step": 29111 + }, + { + "epoch": 0.37829716247890416, + "grad_norm": 0.3799412250518799, + "learning_rate": 0.00012436605622636115, + "loss": 1.3989, + "step": 29112 + }, + { + "epoch": 0.37831015702282006, + "grad_norm": 0.3932362496852875, + "learning_rate": 0.00012436345676444975, + "loss": 1.4309, + "step": 29113 + }, + { + "epoch": 0.3783231515667359, + "grad_norm": 0.3965602517127991, + "learning_rate": 0.00012436085730253838, + "loss": 1.3298, + "step": 29114 + }, + { + "epoch": 0.3783361461106518, + "grad_norm": 0.4701803922653198, + "learning_rate": 0.000124358257840627, + "loss": 1.7342, + "step": 29115 + }, + { + "epoch": 0.37834914065456765, + "grad_norm": 0.3027585446834564, + "learning_rate": 0.00012435565837871562, + "loss": 1.4163, + "step": 29116 + }, + { + "epoch": 0.37836213519848355, + "grad_norm": 0.42055410146713257, + "learning_rate": 0.00012435305891680422, + "loss": 1.3831, + "step": 29117 + }, + { + "epoch": 0.3783751297423994, + "grad_norm": 0.4090738296508789, + "learning_rate": 0.00012435045945489285, + "loss": 1.376, + "step": 29118 + }, + { + "epoch": 0.3783881242863153, + "grad_norm": 0.32407262921333313, + "learning_rate": 0.00012434785999298147, + "loss": 1.4236, + "step": 29119 + }, + { + "epoch": 0.37840111883023114, + "grad_norm": 0.4219169318675995, + "learning_rate": 0.00012434526053107007, + "loss": 1.4333, + "step": 29120 + }, + { + "epoch": 0.37841411337414704, + "grad_norm": 0.3304325342178345, + "learning_rate": 0.0001243426610691587, + "loss": 1.2584, + "step": 29121 + }, + { + "epoch": 0.3784271079180629, + "grad_norm": 0.41518527269363403, + "learning_rate": 0.00012434006160724732, + "loss": 1.3859, + "step": 29122 + }, + { + "epoch": 0.3784401024619788, + "grad_norm": 0.35314589738845825, + "learning_rate": 0.00012433746214533591, + "loss": 1.4197, + "step": 29123 + }, + { + "epoch": 0.37845309700589463, + "grad_norm": 0.39648914337158203, + "learning_rate": 0.00012433486268342454, + "loss": 1.3606, + "step": 29124 + }, + { + "epoch": 0.37846609154981053, + "grad_norm": 0.3424660265445709, + "learning_rate": 0.00012433226322151314, + "loss": 1.3252, + "step": 29125 + }, + { + "epoch": 0.3784790860937264, + "grad_norm": 0.4410635828971863, + "learning_rate": 0.0001243296637596018, + "loss": 1.4207, + "step": 29126 + }, + { + "epoch": 0.3784920806376423, + "grad_norm": 0.4133591055870056, + "learning_rate": 0.00012432706429769039, + "loss": 1.584, + "step": 29127 + }, + { + "epoch": 0.3785050751815581, + "grad_norm": 0.43514785170555115, + "learning_rate": 0.000124324464835779, + "loss": 1.3868, + "step": 29128 + }, + { + "epoch": 0.378518069725474, + "grad_norm": 0.38888445496559143, + "learning_rate": 0.0001243218653738676, + "loss": 1.2944, + "step": 29129 + }, + { + "epoch": 0.37853106426938987, + "grad_norm": 0.381853312253952, + "learning_rate": 0.00012431926591195623, + "loss": 1.4695, + "step": 29130 + }, + { + "epoch": 0.3785440588133058, + "grad_norm": 0.3694745898246765, + "learning_rate": 0.00012431666645004486, + "loss": 1.6475, + "step": 29131 + }, + { + "epoch": 0.3785570533572216, + "grad_norm": 0.45428037643432617, + "learning_rate": 0.00012431406698813345, + "loss": 1.4406, + "step": 29132 + }, + { + "epoch": 0.3785700479011375, + "grad_norm": 0.316353440284729, + "learning_rate": 0.00012431146752622208, + "loss": 1.2983, + "step": 29133 + }, + { + "epoch": 0.37858304244505336, + "grad_norm": 0.3684375584125519, + "learning_rate": 0.0001243088680643107, + "loss": 1.1729, + "step": 29134 + }, + { + "epoch": 0.37859603698896926, + "grad_norm": 0.41225841641426086, + "learning_rate": 0.0001243062686023993, + "loss": 1.4182, + "step": 29135 + }, + { + "epoch": 0.3786090315328851, + "grad_norm": 0.4168204367160797, + "learning_rate": 0.00012430366914048792, + "loss": 1.4061, + "step": 29136 + }, + { + "epoch": 0.378622026076801, + "grad_norm": 0.39882737398147583, + "learning_rate": 0.00012430106967857652, + "loss": 1.2722, + "step": 29137 + }, + { + "epoch": 0.37863502062071686, + "grad_norm": 0.38166508078575134, + "learning_rate": 0.00012429847021666517, + "loss": 1.4264, + "step": 29138 + }, + { + "epoch": 0.37864801516463276, + "grad_norm": 0.35266152024269104, + "learning_rate": 0.00012429587075475377, + "loss": 1.3299, + "step": 29139 + }, + { + "epoch": 0.3786610097085486, + "grad_norm": 0.4482724368572235, + "learning_rate": 0.0001242932712928424, + "loss": 1.408, + "step": 29140 + }, + { + "epoch": 0.3786740042524645, + "grad_norm": 0.37660476565361023, + "learning_rate": 0.000124290671830931, + "loss": 1.2338, + "step": 29141 + }, + { + "epoch": 0.37868699879638035, + "grad_norm": 0.33763495087623596, + "learning_rate": 0.00012428807236901962, + "loss": 1.4744, + "step": 29142 + }, + { + "epoch": 0.37869999334029625, + "grad_norm": 0.38571202754974365, + "learning_rate": 0.00012428547290710824, + "loss": 1.4643, + "step": 29143 + }, + { + "epoch": 0.3787129878842121, + "grad_norm": 0.4321689009666443, + "learning_rate": 0.00012428287344519684, + "loss": 1.468, + "step": 29144 + }, + { + "epoch": 0.378725982428128, + "grad_norm": 0.43656328320503235, + "learning_rate": 0.00012428027398328546, + "loss": 1.4904, + "step": 29145 + }, + { + "epoch": 0.37873897697204384, + "grad_norm": 0.44920194149017334, + "learning_rate": 0.0001242776745213741, + "loss": 1.4925, + "step": 29146 + }, + { + "epoch": 0.37875197151595974, + "grad_norm": 0.4356619119644165, + "learning_rate": 0.00012427507505946269, + "loss": 1.451, + "step": 29147 + }, + { + "epoch": 0.3787649660598756, + "grad_norm": 0.3751243054866791, + "learning_rate": 0.0001242724755975513, + "loss": 1.534, + "step": 29148 + }, + { + "epoch": 0.3787779606037915, + "grad_norm": 0.460116446018219, + "learning_rate": 0.0001242698761356399, + "loss": 1.4667, + "step": 29149 + }, + { + "epoch": 0.37879095514770733, + "grad_norm": 0.4050978720188141, + "learning_rate": 0.00012426727667372856, + "loss": 1.3061, + "step": 29150 + }, + { + "epoch": 0.37880394969162323, + "grad_norm": 0.4400278627872467, + "learning_rate": 0.00012426467721181716, + "loss": 1.2966, + "step": 29151 + }, + { + "epoch": 0.3788169442355391, + "grad_norm": 0.44580578804016113, + "learning_rate": 0.00012426207774990578, + "loss": 1.5289, + "step": 29152 + }, + { + "epoch": 0.378829938779455, + "grad_norm": 0.42245128750801086, + "learning_rate": 0.0001242594782879944, + "loss": 1.3893, + "step": 29153 + }, + { + "epoch": 0.3788429333233708, + "grad_norm": 0.3842412531375885, + "learning_rate": 0.000124256878826083, + "loss": 1.3115, + "step": 29154 + }, + { + "epoch": 0.3788559278672867, + "grad_norm": 0.5081456899642944, + "learning_rate": 0.00012425427936417163, + "loss": 1.5385, + "step": 29155 + }, + { + "epoch": 0.37886892241120257, + "grad_norm": 0.3887665271759033, + "learning_rate": 0.00012425167990226022, + "loss": 1.3776, + "step": 29156 + }, + { + "epoch": 0.37888191695511847, + "grad_norm": 0.30462446808815, + "learning_rate": 0.00012424908044034888, + "loss": 1.3108, + "step": 29157 + }, + { + "epoch": 0.37889491149903437, + "grad_norm": 0.34881502389907837, + "learning_rate": 0.00012424648097843747, + "loss": 1.3075, + "step": 29158 + }, + { + "epoch": 0.3789079060429502, + "grad_norm": 0.3741593062877655, + "learning_rate": 0.00012424388151652607, + "loss": 1.3194, + "step": 29159 + }, + { + "epoch": 0.3789209005868661, + "grad_norm": 0.4332476854324341, + "learning_rate": 0.0001242412820546147, + "loss": 1.3211, + "step": 29160 + }, + { + "epoch": 0.37893389513078196, + "grad_norm": 0.46578484773635864, + "learning_rate": 0.00012423868259270332, + "loss": 1.3973, + "step": 29161 + }, + { + "epoch": 0.37894688967469786, + "grad_norm": 0.42272937297821045, + "learning_rate": 0.00012423608313079194, + "loss": 1.4261, + "step": 29162 + }, + { + "epoch": 0.3789598842186137, + "grad_norm": 0.39170020818710327, + "learning_rate": 0.00012423348366888054, + "loss": 1.3626, + "step": 29163 + }, + { + "epoch": 0.3789728787625296, + "grad_norm": 0.49651801586151123, + "learning_rate": 0.00012423088420696917, + "loss": 1.4106, + "step": 29164 + }, + { + "epoch": 0.37898587330644545, + "grad_norm": 0.5419514775276184, + "learning_rate": 0.0001242282847450578, + "loss": 1.5202, + "step": 29165 + }, + { + "epoch": 0.37899886785036135, + "grad_norm": 0.4186174273490906, + "learning_rate": 0.0001242256852831464, + "loss": 1.352, + "step": 29166 + }, + { + "epoch": 0.3790118623942772, + "grad_norm": 0.469951331615448, + "learning_rate": 0.000124223085821235, + "loss": 1.393, + "step": 29167 + }, + { + "epoch": 0.3790248569381931, + "grad_norm": 0.4513460695743561, + "learning_rate": 0.0001242204863593236, + "loss": 1.4655, + "step": 29168 + }, + { + "epoch": 0.37903785148210895, + "grad_norm": 0.4261421859264374, + "learning_rate": 0.00012421788689741226, + "loss": 1.5796, + "step": 29169 + }, + { + "epoch": 0.37905084602602485, + "grad_norm": 0.3736957013607025, + "learning_rate": 0.00012421528743550086, + "loss": 1.3083, + "step": 29170 + }, + { + "epoch": 0.3790638405699407, + "grad_norm": 0.3792990744113922, + "learning_rate": 0.00012421268797358946, + "loss": 1.32, + "step": 29171 + }, + { + "epoch": 0.3790768351138566, + "grad_norm": 0.4717152714729309, + "learning_rate": 0.00012421008851167808, + "loss": 1.4518, + "step": 29172 + }, + { + "epoch": 0.37908982965777244, + "grad_norm": 0.3989870846271515, + "learning_rate": 0.0001242074890497667, + "loss": 1.3954, + "step": 29173 + }, + { + "epoch": 0.37910282420168834, + "grad_norm": 0.4924822151660919, + "learning_rate": 0.00012420488958785533, + "loss": 1.3549, + "step": 29174 + }, + { + "epoch": 0.3791158187456042, + "grad_norm": 0.44121262431144714, + "learning_rate": 0.00012420229012594393, + "loss": 1.3776, + "step": 29175 + }, + { + "epoch": 0.3791288132895201, + "grad_norm": 0.4238477647304535, + "learning_rate": 0.00012419969066403255, + "loss": 1.2493, + "step": 29176 + }, + { + "epoch": 0.37914180783343593, + "grad_norm": 0.42497536540031433, + "learning_rate": 0.00012419709120212118, + "loss": 1.4564, + "step": 29177 + }, + { + "epoch": 0.37915480237735183, + "grad_norm": 0.3816337287425995, + "learning_rate": 0.00012419449174020977, + "loss": 1.2512, + "step": 29178 + }, + { + "epoch": 0.3791677969212677, + "grad_norm": 0.35534584522247314, + "learning_rate": 0.0001241918922782984, + "loss": 1.3603, + "step": 29179 + }, + { + "epoch": 0.3791807914651836, + "grad_norm": 0.5033047795295715, + "learning_rate": 0.000124189292816387, + "loss": 1.3541, + "step": 29180 + }, + { + "epoch": 0.3791937860090994, + "grad_norm": 0.4293133616447449, + "learning_rate": 0.00012418669335447565, + "loss": 1.4724, + "step": 29181 + }, + { + "epoch": 0.3792067805530153, + "grad_norm": 0.358553409576416, + "learning_rate": 0.00012418409389256424, + "loss": 1.4491, + "step": 29182 + }, + { + "epoch": 0.37921977509693117, + "grad_norm": 0.40651682019233704, + "learning_rate": 0.00012418149443065287, + "loss": 1.5088, + "step": 29183 + }, + { + "epoch": 0.37923276964084707, + "grad_norm": 0.3773633539676666, + "learning_rate": 0.00012417889496874147, + "loss": 1.2438, + "step": 29184 + }, + { + "epoch": 0.3792457641847629, + "grad_norm": 0.35284101963043213, + "learning_rate": 0.0001241762955068301, + "loss": 1.402, + "step": 29185 + }, + { + "epoch": 0.3792587587286788, + "grad_norm": 0.34899717569351196, + "learning_rate": 0.00012417369604491872, + "loss": 1.3044, + "step": 29186 + }, + { + "epoch": 0.37927175327259466, + "grad_norm": 0.4575243294239044, + "learning_rate": 0.0001241710965830073, + "loss": 1.5636, + "step": 29187 + }, + { + "epoch": 0.37928474781651056, + "grad_norm": 0.4361872971057892, + "learning_rate": 0.00012416849712109594, + "loss": 1.3333, + "step": 29188 + }, + { + "epoch": 0.3792977423604264, + "grad_norm": 0.35873571038246155, + "learning_rate": 0.00012416589765918456, + "loss": 1.312, + "step": 29189 + }, + { + "epoch": 0.3793107369043423, + "grad_norm": 0.46585625410079956, + "learning_rate": 0.00012416329819727316, + "loss": 1.4023, + "step": 29190 + }, + { + "epoch": 0.37932373144825815, + "grad_norm": 0.37591755390167236, + "learning_rate": 0.00012416069873536178, + "loss": 1.3332, + "step": 29191 + }, + { + "epoch": 0.37933672599217405, + "grad_norm": 0.47125518321990967, + "learning_rate": 0.0001241580992734504, + "loss": 1.46, + "step": 29192 + }, + { + "epoch": 0.3793497205360899, + "grad_norm": 0.4839010238647461, + "learning_rate": 0.00012415549981153903, + "loss": 1.4585, + "step": 29193 + }, + { + "epoch": 0.3793627150800058, + "grad_norm": 0.46539804339408875, + "learning_rate": 0.00012415290034962763, + "loss": 1.1312, + "step": 29194 + }, + { + "epoch": 0.37937570962392164, + "grad_norm": 0.5114302039146423, + "learning_rate": 0.00012415030088771625, + "loss": 1.3347, + "step": 29195 + }, + { + "epoch": 0.37938870416783754, + "grad_norm": 0.5146238803863525, + "learning_rate": 0.00012414770142580488, + "loss": 1.4793, + "step": 29196 + }, + { + "epoch": 0.3794016987117534, + "grad_norm": 0.5047728419303894, + "learning_rate": 0.00012414510196389348, + "loss": 1.2871, + "step": 29197 + }, + { + "epoch": 0.3794146932556693, + "grad_norm": 0.4356023967266083, + "learning_rate": 0.0001241425025019821, + "loss": 1.4527, + "step": 29198 + }, + { + "epoch": 0.37942768779958513, + "grad_norm": 0.3981245160102844, + "learning_rate": 0.0001241399030400707, + "loss": 1.3795, + "step": 29199 + }, + { + "epoch": 0.37944068234350103, + "grad_norm": 0.3890610933303833, + "learning_rate": 0.00012413730357815935, + "loss": 1.4034, + "step": 29200 + }, + { + "epoch": 0.3794536768874169, + "grad_norm": 0.3172937035560608, + "learning_rate": 0.00012413470411624795, + "loss": 1.2939, + "step": 29201 + }, + { + "epoch": 0.3794666714313328, + "grad_norm": 0.4470987617969513, + "learning_rate": 0.00012413210465433654, + "loss": 1.3325, + "step": 29202 + }, + { + "epoch": 0.3794796659752486, + "grad_norm": 0.39845916628837585, + "learning_rate": 0.00012412950519242517, + "loss": 1.4547, + "step": 29203 + }, + { + "epoch": 0.3794926605191645, + "grad_norm": 0.3770211637020111, + "learning_rate": 0.0001241269057305138, + "loss": 1.3263, + "step": 29204 + }, + { + "epoch": 0.37950565506308037, + "grad_norm": 0.5465637445449829, + "learning_rate": 0.00012412430626860242, + "loss": 1.2462, + "step": 29205 + }, + { + "epoch": 0.3795186496069963, + "grad_norm": 0.3737373948097229, + "learning_rate": 0.00012412170680669102, + "loss": 1.4105, + "step": 29206 + }, + { + "epoch": 0.3795316441509121, + "grad_norm": 0.3600003123283386, + "learning_rate": 0.00012411910734477964, + "loss": 1.3738, + "step": 29207 + }, + { + "epoch": 0.379544638694828, + "grad_norm": 0.4306640326976776, + "learning_rate": 0.00012411650788286826, + "loss": 1.1999, + "step": 29208 + }, + { + "epoch": 0.37955763323874386, + "grad_norm": 0.39429956674575806, + "learning_rate": 0.00012411390842095686, + "loss": 1.467, + "step": 29209 + }, + { + "epoch": 0.37957062778265976, + "grad_norm": 0.37216323614120483, + "learning_rate": 0.00012411130895904549, + "loss": 1.4452, + "step": 29210 + }, + { + "epoch": 0.3795836223265756, + "grad_norm": 0.4850517809391022, + "learning_rate": 0.00012410870949713408, + "loss": 1.386, + "step": 29211 + }, + { + "epoch": 0.3795966168704915, + "grad_norm": 0.3833988606929779, + "learning_rate": 0.00012410611003522274, + "loss": 1.3681, + "step": 29212 + }, + { + "epoch": 0.37960961141440736, + "grad_norm": 0.4129987061023712, + "learning_rate": 0.00012410351057331133, + "loss": 1.1961, + "step": 29213 + }, + { + "epoch": 0.37962260595832326, + "grad_norm": 0.38009902834892273, + "learning_rate": 0.00012410091111139993, + "loss": 1.232, + "step": 29214 + }, + { + "epoch": 0.3796356005022391, + "grad_norm": 0.4127764403820038, + "learning_rate": 0.00012409831164948855, + "loss": 1.4375, + "step": 29215 + }, + { + "epoch": 0.379648595046155, + "grad_norm": 0.3812805712223053, + "learning_rate": 0.00012409571218757718, + "loss": 1.2951, + "step": 29216 + }, + { + "epoch": 0.37966158959007085, + "grad_norm": 0.36285439133644104, + "learning_rate": 0.0001240931127256658, + "loss": 1.4076, + "step": 29217 + }, + { + "epoch": 0.37967458413398675, + "grad_norm": 0.44615450501441956, + "learning_rate": 0.0001240905132637544, + "loss": 1.4534, + "step": 29218 + }, + { + "epoch": 0.3796875786779026, + "grad_norm": 0.33002763986587524, + "learning_rate": 0.00012408791380184303, + "loss": 1.2594, + "step": 29219 + }, + { + "epoch": 0.3797005732218185, + "grad_norm": 0.4580899775028229, + "learning_rate": 0.00012408531433993165, + "loss": 1.4329, + "step": 29220 + }, + { + "epoch": 0.37971356776573434, + "grad_norm": 0.3683582544326782, + "learning_rate": 0.00012408271487802025, + "loss": 1.4915, + "step": 29221 + }, + { + "epoch": 0.37972656230965024, + "grad_norm": 0.4178430438041687, + "learning_rate": 0.00012408011541610887, + "loss": 1.395, + "step": 29222 + }, + { + "epoch": 0.3797395568535661, + "grad_norm": 0.39848238229751587, + "learning_rate": 0.00012407751595419747, + "loss": 1.3463, + "step": 29223 + }, + { + "epoch": 0.379752551397482, + "grad_norm": 0.4193390905857086, + "learning_rate": 0.00012407491649228612, + "loss": 1.3417, + "step": 29224 + }, + { + "epoch": 0.37976554594139783, + "grad_norm": 0.4338294565677643, + "learning_rate": 0.00012407231703037472, + "loss": 1.4001, + "step": 29225 + }, + { + "epoch": 0.37977854048531373, + "grad_norm": 0.374187171459198, + "learning_rate": 0.00012406971756846332, + "loss": 1.3563, + "step": 29226 + }, + { + "epoch": 0.3797915350292296, + "grad_norm": 0.37506723403930664, + "learning_rate": 0.00012406711810655197, + "loss": 1.4996, + "step": 29227 + }, + { + "epoch": 0.3798045295731455, + "grad_norm": 0.3967728018760681, + "learning_rate": 0.00012406451864464056, + "loss": 1.2992, + "step": 29228 + }, + { + "epoch": 0.3798175241170613, + "grad_norm": 0.4145708680152893, + "learning_rate": 0.0001240619191827292, + "loss": 1.5146, + "step": 29229 + }, + { + "epoch": 0.3798305186609772, + "grad_norm": 0.3018800914287567, + "learning_rate": 0.00012405931972081779, + "loss": 1.4163, + "step": 29230 + }, + { + "epoch": 0.37984351320489307, + "grad_norm": 0.3748125731945038, + "learning_rate": 0.0001240567202589064, + "loss": 1.2653, + "step": 29231 + }, + { + "epoch": 0.37985650774880897, + "grad_norm": 0.3817799985408783, + "learning_rate": 0.00012405412079699504, + "loss": 1.3769, + "step": 29232 + }, + { + "epoch": 0.37986950229272487, + "grad_norm": 0.32482659816741943, + "learning_rate": 0.00012405152133508363, + "loss": 1.489, + "step": 29233 + }, + { + "epoch": 0.3798824968366407, + "grad_norm": 0.40616363286972046, + "learning_rate": 0.00012404892187317226, + "loss": 1.2141, + "step": 29234 + }, + { + "epoch": 0.3798954913805566, + "grad_norm": 0.42527300119400024, + "learning_rate": 0.00012404632241126088, + "loss": 1.6836, + "step": 29235 + }, + { + "epoch": 0.37990848592447246, + "grad_norm": 0.3806702494621277, + "learning_rate": 0.0001240437229493495, + "loss": 1.3101, + "step": 29236 + }, + { + "epoch": 0.37992148046838836, + "grad_norm": 0.4548983871936798, + "learning_rate": 0.0001240411234874381, + "loss": 1.4323, + "step": 29237 + }, + { + "epoch": 0.3799344750123042, + "grad_norm": 0.44386056065559387, + "learning_rate": 0.00012403852402552673, + "loss": 1.3975, + "step": 29238 + }, + { + "epoch": 0.3799474695562201, + "grad_norm": 0.3747439384460449, + "learning_rate": 0.00012403592456361535, + "loss": 1.4673, + "step": 29239 + }, + { + "epoch": 0.37996046410013595, + "grad_norm": 0.4109708368778229, + "learning_rate": 0.00012403332510170395, + "loss": 1.617, + "step": 29240 + }, + { + "epoch": 0.37997345864405185, + "grad_norm": 0.41999250650405884, + "learning_rate": 0.00012403072563979257, + "loss": 1.5366, + "step": 29241 + }, + { + "epoch": 0.3799864531879677, + "grad_norm": 0.4750569760799408, + "learning_rate": 0.00012402812617788117, + "loss": 1.3749, + "step": 29242 + }, + { + "epoch": 0.3799994477318836, + "grad_norm": 0.36913996934890747, + "learning_rate": 0.0001240255267159698, + "loss": 1.3594, + "step": 29243 + }, + { + "epoch": 0.38001244227579944, + "grad_norm": 0.33061403036117554, + "learning_rate": 0.00012402292725405842, + "loss": 1.504, + "step": 29244 + }, + { + "epoch": 0.38002543681971535, + "grad_norm": 0.3785783648490906, + "learning_rate": 0.00012402032779214702, + "loss": 1.2643, + "step": 29245 + }, + { + "epoch": 0.3800384313636312, + "grad_norm": 0.4089740812778473, + "learning_rate": 0.00012401772833023564, + "loss": 1.4331, + "step": 29246 + }, + { + "epoch": 0.3800514259075471, + "grad_norm": 0.4011761546134949, + "learning_rate": 0.00012401512886832427, + "loss": 1.4471, + "step": 29247 + }, + { + "epoch": 0.38006442045146294, + "grad_norm": 0.4180662930011749, + "learning_rate": 0.0001240125294064129, + "loss": 1.3963, + "step": 29248 + }, + { + "epoch": 0.38007741499537884, + "grad_norm": 0.3726418614387512, + "learning_rate": 0.0001240099299445015, + "loss": 1.7263, + "step": 29249 + }, + { + "epoch": 0.3800904095392947, + "grad_norm": 0.43294036388397217, + "learning_rate": 0.0001240073304825901, + "loss": 1.4756, + "step": 29250 + }, + { + "epoch": 0.3801034040832106, + "grad_norm": 0.41812747716903687, + "learning_rate": 0.00012400473102067874, + "loss": 1.5949, + "step": 29251 + }, + { + "epoch": 0.38011639862712643, + "grad_norm": 0.47401514649391174, + "learning_rate": 0.00012400213155876733, + "loss": 1.4265, + "step": 29252 + }, + { + "epoch": 0.38012939317104233, + "grad_norm": 0.3371295928955078, + "learning_rate": 0.00012399953209685596, + "loss": 1.4049, + "step": 29253 + }, + { + "epoch": 0.3801423877149582, + "grad_norm": 0.39377865195274353, + "learning_rate": 0.00012399693263494456, + "loss": 1.5149, + "step": 29254 + }, + { + "epoch": 0.3801553822588741, + "grad_norm": 0.39270544052124023, + "learning_rate": 0.00012399433317303318, + "loss": 1.4356, + "step": 29255 + }, + { + "epoch": 0.3801683768027899, + "grad_norm": 0.38318485021591187, + "learning_rate": 0.0001239917337111218, + "loss": 1.512, + "step": 29256 + }, + { + "epoch": 0.3801813713467058, + "grad_norm": 0.38301828503608704, + "learning_rate": 0.0001239891342492104, + "loss": 1.1555, + "step": 29257 + }, + { + "epoch": 0.38019436589062167, + "grad_norm": 0.45692166686058044, + "learning_rate": 0.00012398653478729903, + "loss": 1.2978, + "step": 29258 + }, + { + "epoch": 0.38020736043453757, + "grad_norm": 0.46812117099761963, + "learning_rate": 0.00012398393532538765, + "loss": 1.4326, + "step": 29259 + }, + { + "epoch": 0.3802203549784534, + "grad_norm": 0.3846263587474823, + "learning_rate": 0.00012398133586347628, + "loss": 1.4002, + "step": 29260 + }, + { + "epoch": 0.3802333495223693, + "grad_norm": 0.3237459659576416, + "learning_rate": 0.00012397873640156487, + "loss": 1.3219, + "step": 29261 + }, + { + "epoch": 0.38024634406628516, + "grad_norm": 0.3671210706233978, + "learning_rate": 0.0001239761369396535, + "loss": 1.3859, + "step": 29262 + }, + { + "epoch": 0.38025933861020106, + "grad_norm": 0.35202863812446594, + "learning_rate": 0.00012397353747774212, + "loss": 1.1257, + "step": 29263 + }, + { + "epoch": 0.3802723331541169, + "grad_norm": 0.3972894251346588, + "learning_rate": 0.00012397093801583072, + "loss": 1.1971, + "step": 29264 + }, + { + "epoch": 0.3802853276980328, + "grad_norm": 0.35992005467414856, + "learning_rate": 0.00012396833855391934, + "loss": 1.3961, + "step": 29265 + }, + { + "epoch": 0.38029832224194865, + "grad_norm": 0.5016254782676697, + "learning_rate": 0.00012396573909200797, + "loss": 1.5307, + "step": 29266 + }, + { + "epoch": 0.38031131678586455, + "grad_norm": 0.4317588210105896, + "learning_rate": 0.0001239631396300966, + "loss": 1.5, + "step": 29267 + }, + { + "epoch": 0.3803243113297804, + "grad_norm": 0.35746684670448303, + "learning_rate": 0.0001239605401681852, + "loss": 1.3658, + "step": 29268 + }, + { + "epoch": 0.3803373058736963, + "grad_norm": 0.47025319933891296, + "learning_rate": 0.0001239579407062738, + "loss": 1.3123, + "step": 29269 + }, + { + "epoch": 0.38035030041761214, + "grad_norm": 0.4329600930213928, + "learning_rate": 0.00012395534124436244, + "loss": 1.3053, + "step": 29270 + }, + { + "epoch": 0.38036329496152804, + "grad_norm": 0.34861868619918823, + "learning_rate": 0.00012395274178245104, + "loss": 1.2645, + "step": 29271 + }, + { + "epoch": 0.3803762895054439, + "grad_norm": 0.521388590335846, + "learning_rate": 0.00012395014232053966, + "loss": 1.6508, + "step": 29272 + }, + { + "epoch": 0.3803892840493598, + "grad_norm": 0.41579383611679077, + "learning_rate": 0.00012394754285862826, + "loss": 1.682, + "step": 29273 + }, + { + "epoch": 0.38040227859327563, + "grad_norm": 0.47176480293273926, + "learning_rate": 0.00012394494339671688, + "loss": 1.3106, + "step": 29274 + }, + { + "epoch": 0.38041527313719153, + "grad_norm": 0.30683043599128723, + "learning_rate": 0.0001239423439348055, + "loss": 1.6188, + "step": 29275 + }, + { + "epoch": 0.3804282676811074, + "grad_norm": 0.37323975563049316, + "learning_rate": 0.0001239397444728941, + "loss": 1.3084, + "step": 29276 + }, + { + "epoch": 0.3804412622250233, + "grad_norm": 0.4219805598258972, + "learning_rate": 0.00012393714501098273, + "loss": 1.4363, + "step": 29277 + }, + { + "epoch": 0.3804542567689391, + "grad_norm": 0.4615146219730377, + "learning_rate": 0.00012393454554907135, + "loss": 1.3311, + "step": 29278 + }, + { + "epoch": 0.380467251312855, + "grad_norm": 0.44219422340393066, + "learning_rate": 0.00012393194608715998, + "loss": 1.4973, + "step": 29279 + }, + { + "epoch": 0.38048024585677087, + "grad_norm": 0.36293652653694153, + "learning_rate": 0.00012392934662524858, + "loss": 1.3596, + "step": 29280 + }, + { + "epoch": 0.38049324040068677, + "grad_norm": 0.31426486372947693, + "learning_rate": 0.00012392674716333717, + "loss": 1.453, + "step": 29281 + }, + { + "epoch": 0.3805062349446026, + "grad_norm": 0.3880532383918762, + "learning_rate": 0.00012392414770142583, + "loss": 1.351, + "step": 29282 + }, + { + "epoch": 0.3805192294885185, + "grad_norm": 0.32018741965293884, + "learning_rate": 0.00012392154823951442, + "loss": 1.378, + "step": 29283 + }, + { + "epoch": 0.38053222403243436, + "grad_norm": 0.4035203754901886, + "learning_rate": 0.00012391894877760305, + "loss": 1.3425, + "step": 29284 + }, + { + "epoch": 0.38054521857635026, + "grad_norm": 0.42208942770957947, + "learning_rate": 0.00012391634931569164, + "loss": 1.4648, + "step": 29285 + }, + { + "epoch": 0.3805582131202661, + "grad_norm": 0.49098819494247437, + "learning_rate": 0.00012391374985378027, + "loss": 1.3835, + "step": 29286 + }, + { + "epoch": 0.380571207664182, + "grad_norm": 0.4460628628730774, + "learning_rate": 0.0001239111503918689, + "loss": 1.3597, + "step": 29287 + }, + { + "epoch": 0.38058420220809785, + "grad_norm": 0.4968879222869873, + "learning_rate": 0.0001239085509299575, + "loss": 1.3622, + "step": 29288 + }, + { + "epoch": 0.38059719675201376, + "grad_norm": 0.4973567724227905, + "learning_rate": 0.00012390595146804612, + "loss": 1.2452, + "step": 29289 + }, + { + "epoch": 0.3806101912959296, + "grad_norm": 0.3727114498615265, + "learning_rate": 0.00012390335200613474, + "loss": 1.4074, + "step": 29290 + }, + { + "epoch": 0.3806231858398455, + "grad_norm": 0.2838941514492035, + "learning_rate": 0.00012390075254422336, + "loss": 1.1543, + "step": 29291 + }, + { + "epoch": 0.38063618038376135, + "grad_norm": 0.4578581154346466, + "learning_rate": 0.00012389815308231196, + "loss": 1.3251, + "step": 29292 + }, + { + "epoch": 0.38064917492767725, + "grad_norm": 0.321943074464798, + "learning_rate": 0.00012389555362040059, + "loss": 1.3916, + "step": 29293 + }, + { + "epoch": 0.3806621694715931, + "grad_norm": 0.4434391260147095, + "learning_rate": 0.0001238929541584892, + "loss": 1.4198, + "step": 29294 + }, + { + "epoch": 0.380675164015509, + "grad_norm": 0.42371317744255066, + "learning_rate": 0.0001238903546965778, + "loss": 1.42, + "step": 29295 + }, + { + "epoch": 0.38068815855942484, + "grad_norm": 0.40165191888809204, + "learning_rate": 0.00012388775523466643, + "loss": 1.3288, + "step": 29296 + }, + { + "epoch": 0.38070115310334074, + "grad_norm": 0.390564888715744, + "learning_rate": 0.00012388515577275503, + "loss": 1.3019, + "step": 29297 + }, + { + "epoch": 0.3807141476472566, + "grad_norm": 0.3627987205982208, + "learning_rate": 0.00012388255631084365, + "loss": 1.3453, + "step": 29298 + }, + { + "epoch": 0.3807271421911725, + "grad_norm": 0.38638800382614136, + "learning_rate": 0.00012387995684893228, + "loss": 1.112, + "step": 29299 + }, + { + "epoch": 0.38074013673508833, + "grad_norm": 0.4235222637653351, + "learning_rate": 0.00012387735738702088, + "loss": 1.2382, + "step": 29300 + }, + { + "epoch": 0.38075313127900423, + "grad_norm": 0.4219317138195038, + "learning_rate": 0.00012387475792510953, + "loss": 1.5246, + "step": 29301 + }, + { + "epoch": 0.3807661258229201, + "grad_norm": 0.36694619059562683, + "learning_rate": 0.00012387215846319813, + "loss": 1.3284, + "step": 29302 + }, + { + "epoch": 0.380779120366836, + "grad_norm": 0.4686177372932434, + "learning_rate": 0.00012386955900128675, + "loss": 1.3858, + "step": 29303 + }, + { + "epoch": 0.3807921149107518, + "grad_norm": 0.42843008041381836, + "learning_rate": 0.00012386695953937535, + "loss": 1.2314, + "step": 29304 + }, + { + "epoch": 0.3808051094546677, + "grad_norm": 0.47080984711647034, + "learning_rate": 0.00012386436007746397, + "loss": 1.4952, + "step": 29305 + }, + { + "epoch": 0.38081810399858357, + "grad_norm": 0.3955569863319397, + "learning_rate": 0.0001238617606155526, + "loss": 1.4162, + "step": 29306 + }, + { + "epoch": 0.38083109854249947, + "grad_norm": 0.44336366653442383, + "learning_rate": 0.0001238591611536412, + "loss": 1.2713, + "step": 29307 + }, + { + "epoch": 0.3808440930864153, + "grad_norm": 0.310170978307724, + "learning_rate": 0.00012385656169172982, + "loss": 1.1964, + "step": 29308 + }, + { + "epoch": 0.3808570876303312, + "grad_norm": 0.4874942898750305, + "learning_rate": 0.00012385396222981844, + "loss": 1.5207, + "step": 29309 + }, + { + "epoch": 0.3808700821742471, + "grad_norm": 0.3653622269630432, + "learning_rate": 0.00012385136276790704, + "loss": 1.4265, + "step": 29310 + }, + { + "epoch": 0.38088307671816296, + "grad_norm": 0.44529467821121216, + "learning_rate": 0.00012384876330599566, + "loss": 1.3345, + "step": 29311 + }, + { + "epoch": 0.38089607126207886, + "grad_norm": 0.42509302496910095, + "learning_rate": 0.00012384616384408426, + "loss": 1.2987, + "step": 29312 + }, + { + "epoch": 0.3809090658059947, + "grad_norm": 0.47302448749542236, + "learning_rate": 0.0001238435643821729, + "loss": 1.3404, + "step": 29313 + }, + { + "epoch": 0.3809220603499106, + "grad_norm": 0.4644275903701782, + "learning_rate": 0.0001238409649202615, + "loss": 1.4853, + "step": 29314 + }, + { + "epoch": 0.38093505489382645, + "grad_norm": 0.3776567280292511, + "learning_rate": 0.00012383836545835014, + "loss": 1.4354, + "step": 29315 + }, + { + "epoch": 0.38094804943774235, + "grad_norm": 0.37674030661582947, + "learning_rate": 0.00012383576599643873, + "loss": 1.2761, + "step": 29316 + }, + { + "epoch": 0.3809610439816582, + "grad_norm": 0.45794782042503357, + "learning_rate": 0.00012383316653452736, + "loss": 1.4461, + "step": 29317 + }, + { + "epoch": 0.3809740385255741, + "grad_norm": 0.39756396412849426, + "learning_rate": 0.00012383056707261598, + "loss": 1.3745, + "step": 29318 + }, + { + "epoch": 0.38098703306948994, + "grad_norm": 0.4186994731426239, + "learning_rate": 0.00012382796761070458, + "loss": 1.508, + "step": 29319 + }, + { + "epoch": 0.38100002761340585, + "grad_norm": 0.6823927760124207, + "learning_rate": 0.0001238253681487932, + "loss": 1.4673, + "step": 29320 + }, + { + "epoch": 0.3810130221573217, + "grad_norm": 0.3511705696582794, + "learning_rate": 0.00012382276868688183, + "loss": 1.3597, + "step": 29321 + }, + { + "epoch": 0.3810260167012376, + "grad_norm": 0.459009051322937, + "learning_rate": 0.00012382016922497045, + "loss": 1.5418, + "step": 29322 + }, + { + "epoch": 0.38103901124515344, + "grad_norm": 0.24147070944309235, + "learning_rate": 0.00012381756976305905, + "loss": 1.2157, + "step": 29323 + }, + { + "epoch": 0.38105200578906934, + "grad_norm": 0.27654290199279785, + "learning_rate": 0.00012381497030114765, + "loss": 1.2676, + "step": 29324 + }, + { + "epoch": 0.3810650003329852, + "grad_norm": 0.3977309465408325, + "learning_rate": 0.0001238123708392363, + "loss": 1.5296, + "step": 29325 + }, + { + "epoch": 0.3810779948769011, + "grad_norm": 0.4082179367542267, + "learning_rate": 0.0001238097713773249, + "loss": 1.2637, + "step": 29326 + }, + { + "epoch": 0.38109098942081693, + "grad_norm": 0.35059911012649536, + "learning_rate": 0.00012380717191541352, + "loss": 1.5569, + "step": 29327 + }, + { + "epoch": 0.38110398396473283, + "grad_norm": 0.365424782037735, + "learning_rate": 0.00012380457245350212, + "loss": 1.3473, + "step": 29328 + }, + { + "epoch": 0.3811169785086487, + "grad_norm": 0.37108683586120605, + "learning_rate": 0.00012380197299159074, + "loss": 1.4112, + "step": 29329 + }, + { + "epoch": 0.3811299730525646, + "grad_norm": 0.3869462311267853, + "learning_rate": 0.00012379937352967937, + "loss": 1.308, + "step": 29330 + }, + { + "epoch": 0.3811429675964804, + "grad_norm": 0.4423421025276184, + "learning_rate": 0.00012379677406776796, + "loss": 1.5004, + "step": 29331 + }, + { + "epoch": 0.3811559621403963, + "grad_norm": 0.3770103454589844, + "learning_rate": 0.0001237941746058566, + "loss": 1.3411, + "step": 29332 + }, + { + "epoch": 0.38116895668431217, + "grad_norm": 0.3661179840564728, + "learning_rate": 0.0001237915751439452, + "loss": 1.3521, + "step": 29333 + }, + { + "epoch": 0.38118195122822807, + "grad_norm": 0.29008179903030396, + "learning_rate": 0.00012378897568203384, + "loss": 1.3433, + "step": 29334 + }, + { + "epoch": 0.3811949457721439, + "grad_norm": 0.3734094500541687, + "learning_rate": 0.00012378637622012244, + "loss": 1.4482, + "step": 29335 + }, + { + "epoch": 0.3812079403160598, + "grad_norm": 0.43825024366378784, + "learning_rate": 0.00012378377675821103, + "loss": 1.4307, + "step": 29336 + }, + { + "epoch": 0.38122093485997566, + "grad_norm": 0.3639650344848633, + "learning_rate": 0.00012378117729629968, + "loss": 1.515, + "step": 29337 + }, + { + "epoch": 0.38123392940389156, + "grad_norm": 0.41644492745399475, + "learning_rate": 0.00012377857783438828, + "loss": 1.4221, + "step": 29338 + }, + { + "epoch": 0.3812469239478074, + "grad_norm": 0.3838006854057312, + "learning_rate": 0.0001237759783724769, + "loss": 1.4273, + "step": 29339 + }, + { + "epoch": 0.3812599184917233, + "grad_norm": 0.48059695959091187, + "learning_rate": 0.00012377337891056553, + "loss": 1.5367, + "step": 29340 + }, + { + "epoch": 0.38127291303563915, + "grad_norm": 0.37707996368408203, + "learning_rate": 0.00012377077944865413, + "loss": 1.3081, + "step": 29341 + }, + { + "epoch": 0.38128590757955505, + "grad_norm": 0.31958329677581787, + "learning_rate": 0.00012376817998674275, + "loss": 1.0924, + "step": 29342 + }, + { + "epoch": 0.3812989021234709, + "grad_norm": 0.3857501447200775, + "learning_rate": 0.00012376558052483135, + "loss": 1.2971, + "step": 29343 + }, + { + "epoch": 0.3813118966673868, + "grad_norm": 0.32319754362106323, + "learning_rate": 0.00012376298106292, + "loss": 1.2435, + "step": 29344 + }, + { + "epoch": 0.38132489121130264, + "grad_norm": 0.472208172082901, + "learning_rate": 0.0001237603816010086, + "loss": 1.4088, + "step": 29345 + }, + { + "epoch": 0.38133788575521854, + "grad_norm": 0.41490796208381653, + "learning_rate": 0.00012375778213909722, + "loss": 1.4036, + "step": 29346 + }, + { + "epoch": 0.3813508802991344, + "grad_norm": 0.37932026386260986, + "learning_rate": 0.00012375518267718582, + "loss": 1.2381, + "step": 29347 + }, + { + "epoch": 0.3813638748430503, + "grad_norm": 0.4356116056442261, + "learning_rate": 0.00012375258321527445, + "loss": 1.369, + "step": 29348 + }, + { + "epoch": 0.38137686938696613, + "grad_norm": 0.43697962164878845, + "learning_rate": 0.00012374998375336307, + "loss": 1.4287, + "step": 29349 + }, + { + "epoch": 0.38138986393088203, + "grad_norm": 0.24116088449954987, + "learning_rate": 0.00012374738429145167, + "loss": 1.1915, + "step": 29350 + }, + { + "epoch": 0.3814028584747979, + "grad_norm": 0.45889362692832947, + "learning_rate": 0.0001237447848295403, + "loss": 1.2901, + "step": 29351 + }, + { + "epoch": 0.3814158530187138, + "grad_norm": 0.4414624571800232, + "learning_rate": 0.00012374218536762892, + "loss": 1.457, + "step": 29352 + }, + { + "epoch": 0.3814288475626296, + "grad_norm": 0.4113902449607849, + "learning_rate": 0.0001237395859057175, + "loss": 1.3579, + "step": 29353 + }, + { + "epoch": 0.3814418421065455, + "grad_norm": 0.3689514994621277, + "learning_rate": 0.00012373698644380614, + "loss": 1.5436, + "step": 29354 + }, + { + "epoch": 0.38145483665046137, + "grad_norm": 0.5008370876312256, + "learning_rate": 0.00012373438698189474, + "loss": 1.4787, + "step": 29355 + }, + { + "epoch": 0.38146783119437727, + "grad_norm": 0.29000452160835266, + "learning_rate": 0.0001237317875199834, + "loss": 1.4038, + "step": 29356 + }, + { + "epoch": 0.3814808257382931, + "grad_norm": 0.3488605320453644, + "learning_rate": 0.00012372918805807198, + "loss": 1.3101, + "step": 29357 + }, + { + "epoch": 0.381493820282209, + "grad_norm": 0.4578481912612915, + "learning_rate": 0.0001237265885961606, + "loss": 1.4426, + "step": 29358 + }, + { + "epoch": 0.38150681482612486, + "grad_norm": 0.437642902135849, + "learning_rate": 0.0001237239891342492, + "loss": 1.339, + "step": 29359 + }, + { + "epoch": 0.38151980937004076, + "grad_norm": 0.43414634466171265, + "learning_rate": 0.00012372138967233783, + "loss": 1.5506, + "step": 29360 + }, + { + "epoch": 0.3815328039139566, + "grad_norm": 0.3625759482383728, + "learning_rate": 0.00012371879021042646, + "loss": 1.3672, + "step": 29361 + }, + { + "epoch": 0.3815457984578725, + "grad_norm": 0.4236309230327606, + "learning_rate": 0.00012371619074851505, + "loss": 1.46, + "step": 29362 + }, + { + "epoch": 0.38155879300178835, + "grad_norm": 0.3547542691230774, + "learning_rate": 0.00012371359128660368, + "loss": 1.3847, + "step": 29363 + }, + { + "epoch": 0.38157178754570426, + "grad_norm": 0.41549909114837646, + "learning_rate": 0.0001237109918246923, + "loss": 1.4217, + "step": 29364 + }, + { + "epoch": 0.3815847820896201, + "grad_norm": 0.26602962613105774, + "learning_rate": 0.0001237083923627809, + "loss": 1.2376, + "step": 29365 + }, + { + "epoch": 0.381597776633536, + "grad_norm": 0.49340665340423584, + "learning_rate": 0.00012370579290086952, + "loss": 1.3961, + "step": 29366 + }, + { + "epoch": 0.38161077117745185, + "grad_norm": 0.4609818160533905, + "learning_rate": 0.00012370319343895812, + "loss": 1.5643, + "step": 29367 + }, + { + "epoch": 0.38162376572136775, + "grad_norm": 0.3672391176223755, + "learning_rate": 0.00012370059397704677, + "loss": 1.3276, + "step": 29368 + }, + { + "epoch": 0.3816367602652836, + "grad_norm": 0.37062928080558777, + "learning_rate": 0.00012369799451513537, + "loss": 1.4721, + "step": 29369 + }, + { + "epoch": 0.3816497548091995, + "grad_norm": 0.40355733036994934, + "learning_rate": 0.000123695395053224, + "loss": 1.3654, + "step": 29370 + }, + { + "epoch": 0.38166274935311534, + "grad_norm": 0.39091750979423523, + "learning_rate": 0.0001236927955913126, + "loss": 1.5978, + "step": 29371 + }, + { + "epoch": 0.38167574389703124, + "grad_norm": 0.5253551602363586, + "learning_rate": 0.00012369019612940122, + "loss": 1.5154, + "step": 29372 + }, + { + "epoch": 0.3816887384409471, + "grad_norm": 0.34352824091911316, + "learning_rate": 0.00012368759666748984, + "loss": 1.4004, + "step": 29373 + }, + { + "epoch": 0.381701732984863, + "grad_norm": 0.40059515833854675, + "learning_rate": 0.00012368499720557844, + "loss": 1.1791, + "step": 29374 + }, + { + "epoch": 0.38171472752877883, + "grad_norm": 0.4644610583782196, + "learning_rate": 0.0001236823977436671, + "loss": 1.3207, + "step": 29375 + }, + { + "epoch": 0.38172772207269473, + "grad_norm": 0.4177878797054291, + "learning_rate": 0.0001236797982817557, + "loss": 1.4119, + "step": 29376 + }, + { + "epoch": 0.3817407166166106, + "grad_norm": 0.31053271889686584, + "learning_rate": 0.00012367719881984428, + "loss": 1.2872, + "step": 29377 + }, + { + "epoch": 0.3817537111605265, + "grad_norm": 0.4227723181247711, + "learning_rate": 0.0001236745993579329, + "loss": 1.5247, + "step": 29378 + }, + { + "epoch": 0.3817667057044423, + "grad_norm": 0.4705777168273926, + "learning_rate": 0.00012367199989602153, + "loss": 1.334, + "step": 29379 + }, + { + "epoch": 0.3817797002483582, + "grad_norm": 0.2954559028148651, + "learning_rate": 0.00012366940043411016, + "loss": 1.1607, + "step": 29380 + }, + { + "epoch": 0.38179269479227407, + "grad_norm": 0.35180020332336426, + "learning_rate": 0.00012366680097219875, + "loss": 1.3692, + "step": 29381 + }, + { + "epoch": 0.38180568933618997, + "grad_norm": 0.34350132942199707, + "learning_rate": 0.00012366420151028738, + "loss": 1.1326, + "step": 29382 + }, + { + "epoch": 0.3818186838801058, + "grad_norm": 0.4459732472896576, + "learning_rate": 0.000123661602048376, + "loss": 1.3347, + "step": 29383 + }, + { + "epoch": 0.3818316784240217, + "grad_norm": 0.3138584494590759, + "learning_rate": 0.0001236590025864646, + "loss": 1.4915, + "step": 29384 + }, + { + "epoch": 0.38184467296793756, + "grad_norm": 0.34164419770240784, + "learning_rate": 0.00012365640312455323, + "loss": 1.5184, + "step": 29385 + }, + { + "epoch": 0.38185766751185346, + "grad_norm": 0.4256591200828552, + "learning_rate": 0.00012365380366264182, + "loss": 1.2372, + "step": 29386 + }, + { + "epoch": 0.38187066205576936, + "grad_norm": 0.3848235309123993, + "learning_rate": 0.00012365120420073047, + "loss": 1.3494, + "step": 29387 + }, + { + "epoch": 0.3818836565996852, + "grad_norm": 0.4413069188594818, + "learning_rate": 0.00012364860473881907, + "loss": 1.5947, + "step": 29388 + }, + { + "epoch": 0.3818966511436011, + "grad_norm": 0.36427780985832214, + "learning_rate": 0.0001236460052769077, + "loss": 1.3516, + "step": 29389 + }, + { + "epoch": 0.38190964568751695, + "grad_norm": 0.4441884756088257, + "learning_rate": 0.0001236434058149963, + "loss": 1.3335, + "step": 29390 + }, + { + "epoch": 0.38192264023143285, + "grad_norm": 0.3923276364803314, + "learning_rate": 0.00012364080635308492, + "loss": 1.4158, + "step": 29391 + }, + { + "epoch": 0.3819356347753487, + "grad_norm": 0.33791735768318176, + "learning_rate": 0.00012363820689117354, + "loss": 1.2208, + "step": 29392 + }, + { + "epoch": 0.3819486293192646, + "grad_norm": 0.46055054664611816, + "learning_rate": 0.00012363560742926214, + "loss": 1.6191, + "step": 29393 + }, + { + "epoch": 0.38196162386318044, + "grad_norm": 0.5053493976593018, + "learning_rate": 0.00012363300796735076, + "loss": 1.3984, + "step": 29394 + }, + { + "epoch": 0.38197461840709634, + "grad_norm": 0.38026535511016846, + "learning_rate": 0.0001236304085054394, + "loss": 1.4039, + "step": 29395 + }, + { + "epoch": 0.3819876129510122, + "grad_norm": 0.322826623916626, + "learning_rate": 0.000123627809043528, + "loss": 1.3155, + "step": 29396 + }, + { + "epoch": 0.3820006074949281, + "grad_norm": 0.4060744047164917, + "learning_rate": 0.0001236252095816166, + "loss": 1.4254, + "step": 29397 + }, + { + "epoch": 0.38201360203884394, + "grad_norm": 0.4446217715740204, + "learning_rate": 0.0001236226101197052, + "loss": 1.4378, + "step": 29398 + }, + { + "epoch": 0.38202659658275984, + "grad_norm": 0.4494479298591614, + "learning_rate": 0.00012362001065779386, + "loss": 1.2752, + "step": 29399 + }, + { + "epoch": 0.3820395911266757, + "grad_norm": 0.30591675639152527, + "learning_rate": 0.00012361741119588246, + "loss": 1.5168, + "step": 29400 + }, + { + "epoch": 0.3820525856705916, + "grad_norm": 0.3413150906562805, + "learning_rate": 0.00012361481173397108, + "loss": 1.6, + "step": 29401 + }, + { + "epoch": 0.3820655802145074, + "grad_norm": 0.4443274140357971, + "learning_rate": 0.00012361221227205968, + "loss": 1.4589, + "step": 29402 + }, + { + "epoch": 0.38207857475842333, + "grad_norm": 0.37355321645736694, + "learning_rate": 0.0001236096128101483, + "loss": 1.4099, + "step": 29403 + }, + { + "epoch": 0.3820915693023392, + "grad_norm": 0.38128727674484253, + "learning_rate": 0.00012360701334823693, + "loss": 1.4049, + "step": 29404 + }, + { + "epoch": 0.3821045638462551, + "grad_norm": 0.4323970079421997, + "learning_rate": 0.00012360441388632553, + "loss": 1.4708, + "step": 29405 + }, + { + "epoch": 0.3821175583901709, + "grad_norm": 0.4144299626350403, + "learning_rate": 0.00012360181442441415, + "loss": 1.4166, + "step": 29406 + }, + { + "epoch": 0.3821305529340868, + "grad_norm": 0.4436207115650177, + "learning_rate": 0.00012359921496250277, + "loss": 1.4823, + "step": 29407 + }, + { + "epoch": 0.38214354747800267, + "grad_norm": 0.39162495732307434, + "learning_rate": 0.00012359661550059137, + "loss": 1.5734, + "step": 29408 + }, + { + "epoch": 0.38215654202191857, + "grad_norm": 0.4349675178527832, + "learning_rate": 0.00012359401603868, + "loss": 1.4324, + "step": 29409 + }, + { + "epoch": 0.3821695365658344, + "grad_norm": 0.3897383511066437, + "learning_rate": 0.0001235914165767686, + "loss": 1.5939, + "step": 29410 + }, + { + "epoch": 0.3821825311097503, + "grad_norm": 0.36471816897392273, + "learning_rate": 0.00012358881711485725, + "loss": 1.2838, + "step": 29411 + }, + { + "epoch": 0.38219552565366616, + "grad_norm": 0.4042739272117615, + "learning_rate": 0.00012358621765294584, + "loss": 1.3122, + "step": 29412 + }, + { + "epoch": 0.38220852019758206, + "grad_norm": 0.42443063855171204, + "learning_rate": 0.00012358361819103447, + "loss": 1.596, + "step": 29413 + }, + { + "epoch": 0.3822215147414979, + "grad_norm": 0.384961873292923, + "learning_rate": 0.0001235810187291231, + "loss": 1.3624, + "step": 29414 + }, + { + "epoch": 0.3822345092854138, + "grad_norm": 0.37326520681381226, + "learning_rate": 0.0001235784192672117, + "loss": 1.2551, + "step": 29415 + }, + { + "epoch": 0.38224750382932965, + "grad_norm": 0.48011258244514465, + "learning_rate": 0.00012357581980530031, + "loss": 1.473, + "step": 29416 + }, + { + "epoch": 0.38226049837324555, + "grad_norm": 0.3795984089374542, + "learning_rate": 0.0001235732203433889, + "loss": 1.4254, + "step": 29417 + }, + { + "epoch": 0.3822734929171614, + "grad_norm": 0.4432600140571594, + "learning_rate": 0.00012357062088147756, + "loss": 1.4637, + "step": 29418 + }, + { + "epoch": 0.3822864874610773, + "grad_norm": 0.33412620425224304, + "learning_rate": 0.00012356802141956616, + "loss": 1.6703, + "step": 29419 + }, + { + "epoch": 0.38229948200499314, + "grad_norm": 0.44140705466270447, + "learning_rate": 0.00012356542195765476, + "loss": 1.5791, + "step": 29420 + }, + { + "epoch": 0.38231247654890904, + "grad_norm": 0.38218456506729126, + "learning_rate": 0.00012356282249574338, + "loss": 1.4292, + "step": 29421 + }, + { + "epoch": 0.3823254710928249, + "grad_norm": 0.47046366333961487, + "learning_rate": 0.000123560223033832, + "loss": 1.4469, + "step": 29422 + }, + { + "epoch": 0.3823384656367408, + "grad_norm": 0.36526861786842346, + "learning_rate": 0.00012355762357192063, + "loss": 1.3608, + "step": 29423 + }, + { + "epoch": 0.38235146018065663, + "grad_norm": 0.3890009820461273, + "learning_rate": 0.00012355502411000923, + "loss": 1.4135, + "step": 29424 + }, + { + "epoch": 0.38236445472457253, + "grad_norm": 0.37007662653923035, + "learning_rate": 0.00012355242464809785, + "loss": 1.1086, + "step": 29425 + }, + { + "epoch": 0.3823774492684884, + "grad_norm": 0.39993101358413696, + "learning_rate": 0.00012354982518618648, + "loss": 1.3535, + "step": 29426 + }, + { + "epoch": 0.3823904438124043, + "grad_norm": 0.37410008907318115, + "learning_rate": 0.00012354722572427507, + "loss": 1.4623, + "step": 29427 + }, + { + "epoch": 0.3824034383563201, + "grad_norm": 0.3292379379272461, + "learning_rate": 0.0001235446262623637, + "loss": 1.3055, + "step": 29428 + }, + { + "epoch": 0.382416432900236, + "grad_norm": 0.3753480613231659, + "learning_rate": 0.0001235420268004523, + "loss": 1.2823, + "step": 29429 + }, + { + "epoch": 0.38242942744415187, + "grad_norm": 0.38356128334999084, + "learning_rate": 0.00012353942733854095, + "loss": 1.4063, + "step": 29430 + }, + { + "epoch": 0.38244242198806777, + "grad_norm": 0.4168642461299896, + "learning_rate": 0.00012353682787662955, + "loss": 1.2351, + "step": 29431 + }, + { + "epoch": 0.3824554165319836, + "grad_norm": 0.42736533284187317, + "learning_rate": 0.00012353422841471814, + "loss": 1.5735, + "step": 29432 + }, + { + "epoch": 0.3824684110758995, + "grad_norm": 0.3064168095588684, + "learning_rate": 0.00012353162895280677, + "loss": 1.4054, + "step": 29433 + }, + { + "epoch": 0.38248140561981536, + "grad_norm": 0.4678229093551636, + "learning_rate": 0.0001235290294908954, + "loss": 1.5785, + "step": 29434 + }, + { + "epoch": 0.38249440016373126, + "grad_norm": 0.3700430989265442, + "learning_rate": 0.00012352643002898402, + "loss": 1.3764, + "step": 29435 + }, + { + "epoch": 0.3825073947076471, + "grad_norm": 0.42482495307922363, + "learning_rate": 0.00012352383056707261, + "loss": 1.5867, + "step": 29436 + }, + { + "epoch": 0.382520389251563, + "grad_norm": 0.42966827750205994, + "learning_rate": 0.00012352123110516124, + "loss": 1.5272, + "step": 29437 + }, + { + "epoch": 0.38253338379547885, + "grad_norm": 0.39197731018066406, + "learning_rate": 0.00012351863164324986, + "loss": 1.4112, + "step": 29438 + }, + { + "epoch": 0.38254637833939475, + "grad_norm": 0.4871208369731903, + "learning_rate": 0.00012351603218133846, + "loss": 1.3997, + "step": 29439 + }, + { + "epoch": 0.3825593728833106, + "grad_norm": 0.38346433639526367, + "learning_rate": 0.00012351343271942708, + "loss": 1.3051, + "step": 29440 + }, + { + "epoch": 0.3825723674272265, + "grad_norm": 0.28971898555755615, + "learning_rate": 0.00012351083325751568, + "loss": 1.3486, + "step": 29441 + }, + { + "epoch": 0.38258536197114235, + "grad_norm": 0.4238617420196533, + "learning_rate": 0.00012350823379560433, + "loss": 1.3721, + "step": 29442 + }, + { + "epoch": 0.38259835651505825, + "grad_norm": 0.4442681670188904, + "learning_rate": 0.00012350563433369293, + "loss": 1.4932, + "step": 29443 + }, + { + "epoch": 0.3826113510589741, + "grad_norm": 0.3989102244377136, + "learning_rate": 0.00012350303487178156, + "loss": 1.4284, + "step": 29444 + }, + { + "epoch": 0.38262434560289, + "grad_norm": 0.3324430286884308, + "learning_rate": 0.00012350043540987015, + "loss": 1.4139, + "step": 29445 + }, + { + "epoch": 0.38263734014680584, + "grad_norm": 0.39921894669532776, + "learning_rate": 0.00012349783594795878, + "loss": 1.4239, + "step": 29446 + }, + { + "epoch": 0.38265033469072174, + "grad_norm": 0.4210700988769531, + "learning_rate": 0.0001234952364860474, + "loss": 1.4715, + "step": 29447 + }, + { + "epoch": 0.3826633292346376, + "grad_norm": 0.40407630801200867, + "learning_rate": 0.000123492637024136, + "loss": 1.3505, + "step": 29448 + }, + { + "epoch": 0.3826763237785535, + "grad_norm": 0.4380398988723755, + "learning_rate": 0.00012349003756222462, + "loss": 1.5281, + "step": 29449 + }, + { + "epoch": 0.38268931832246933, + "grad_norm": 0.3914535641670227, + "learning_rate": 0.00012348743810031325, + "loss": 1.5056, + "step": 29450 + }, + { + "epoch": 0.38270231286638523, + "grad_norm": 0.35153651237487793, + "learning_rate": 0.00012348483863840185, + "loss": 1.4343, + "step": 29451 + }, + { + "epoch": 0.3827153074103011, + "grad_norm": 0.44689682126045227, + "learning_rate": 0.00012348223917649047, + "loss": 1.3514, + "step": 29452 + }, + { + "epoch": 0.382728301954217, + "grad_norm": 0.39732825756073, + "learning_rate": 0.0001234796397145791, + "loss": 1.4462, + "step": 29453 + }, + { + "epoch": 0.3827412964981328, + "grad_norm": 0.3514556586742401, + "learning_rate": 0.00012347704025266772, + "loss": 1.503, + "step": 29454 + }, + { + "epoch": 0.3827542910420487, + "grad_norm": 0.30261436104774475, + "learning_rate": 0.00012347444079075632, + "loss": 1.1625, + "step": 29455 + }, + { + "epoch": 0.38276728558596457, + "grad_norm": 0.29890885949134827, + "learning_rate": 0.00012347184132884494, + "loss": 1.2394, + "step": 29456 + }, + { + "epoch": 0.38278028012988047, + "grad_norm": 0.3462114930152893, + "learning_rate": 0.00012346924186693357, + "loss": 1.2561, + "step": 29457 + }, + { + "epoch": 0.3827932746737963, + "grad_norm": 0.35676610469818115, + "learning_rate": 0.00012346664240502216, + "loss": 1.4892, + "step": 29458 + }, + { + "epoch": 0.3828062692177122, + "grad_norm": 0.45956483483314514, + "learning_rate": 0.0001234640429431108, + "loss": 1.3537, + "step": 29459 + }, + { + "epoch": 0.38281926376162806, + "grad_norm": 0.5853527784347534, + "learning_rate": 0.00012346144348119938, + "loss": 1.4982, + "step": 29460 + }, + { + "epoch": 0.38283225830554396, + "grad_norm": 0.4398396909236908, + "learning_rate": 0.000123458844019288, + "loss": 1.3552, + "step": 29461 + }, + { + "epoch": 0.38284525284945986, + "grad_norm": 0.3814709782600403, + "learning_rate": 0.00012345624455737663, + "loss": 1.3995, + "step": 29462 + }, + { + "epoch": 0.3828582473933757, + "grad_norm": 0.49610692262649536, + "learning_rate": 0.00012345364509546523, + "loss": 1.5677, + "step": 29463 + }, + { + "epoch": 0.3828712419372916, + "grad_norm": 0.4440975785255432, + "learning_rate": 0.00012345104563355386, + "loss": 1.4149, + "step": 29464 + }, + { + "epoch": 0.38288423648120745, + "grad_norm": 0.39127984642982483, + "learning_rate": 0.00012344844617164248, + "loss": 1.3244, + "step": 29465 + }, + { + "epoch": 0.38289723102512335, + "grad_norm": 0.4027152955532074, + "learning_rate": 0.0001234458467097311, + "loss": 1.294, + "step": 29466 + }, + { + "epoch": 0.3829102255690392, + "grad_norm": 0.42077744007110596, + "learning_rate": 0.0001234432472478197, + "loss": 1.4829, + "step": 29467 + }, + { + "epoch": 0.3829232201129551, + "grad_norm": 0.3501160442829132, + "learning_rate": 0.00012344064778590833, + "loss": 1.3497, + "step": 29468 + }, + { + "epoch": 0.38293621465687094, + "grad_norm": 0.427538126707077, + "learning_rate": 0.00012343804832399695, + "loss": 1.387, + "step": 29469 + }, + { + "epoch": 0.38294920920078684, + "grad_norm": 0.3432125449180603, + "learning_rate": 0.00012343544886208555, + "loss": 1.4066, + "step": 29470 + }, + { + "epoch": 0.3829622037447027, + "grad_norm": 0.30831846594810486, + "learning_rate": 0.00012343284940017417, + "loss": 1.2973, + "step": 29471 + }, + { + "epoch": 0.3829751982886186, + "grad_norm": 0.4371612071990967, + "learning_rate": 0.00012343024993826277, + "loss": 1.529, + "step": 29472 + }, + { + "epoch": 0.38298819283253444, + "grad_norm": 0.44449731707572937, + "learning_rate": 0.00012342765047635142, + "loss": 1.4206, + "step": 29473 + }, + { + "epoch": 0.38300118737645034, + "grad_norm": 0.3294375538825989, + "learning_rate": 0.00012342505101444002, + "loss": 1.2176, + "step": 29474 + }, + { + "epoch": 0.3830141819203662, + "grad_norm": 0.43963056802749634, + "learning_rate": 0.00012342245155252862, + "loss": 1.3801, + "step": 29475 + }, + { + "epoch": 0.3830271764642821, + "grad_norm": 0.3999042212963104, + "learning_rate": 0.00012341985209061724, + "loss": 1.366, + "step": 29476 + }, + { + "epoch": 0.3830401710081979, + "grad_norm": 0.33256688714027405, + "learning_rate": 0.00012341725262870587, + "loss": 1.2743, + "step": 29477 + }, + { + "epoch": 0.38305316555211383, + "grad_norm": 0.3898864686489105, + "learning_rate": 0.0001234146531667945, + "loss": 1.5186, + "step": 29478 + }, + { + "epoch": 0.3830661600960297, + "grad_norm": 0.45742136240005493, + "learning_rate": 0.0001234120537048831, + "loss": 1.4502, + "step": 29479 + }, + { + "epoch": 0.3830791546399456, + "grad_norm": 0.4187413156032562, + "learning_rate": 0.0001234094542429717, + "loss": 1.4334, + "step": 29480 + }, + { + "epoch": 0.3830921491838614, + "grad_norm": 0.3867330253124237, + "learning_rate": 0.00012340685478106034, + "loss": 1.5103, + "step": 29481 + }, + { + "epoch": 0.3831051437277773, + "grad_norm": 0.2904622554779053, + "learning_rate": 0.00012340425531914893, + "loss": 0.9805, + "step": 29482 + }, + { + "epoch": 0.38311813827169316, + "grad_norm": 0.4402660131454468, + "learning_rate": 0.00012340165585723756, + "loss": 1.4872, + "step": 29483 + }, + { + "epoch": 0.38313113281560907, + "grad_norm": 0.4186619222164154, + "learning_rate": 0.00012339905639532616, + "loss": 1.4515, + "step": 29484 + }, + { + "epoch": 0.3831441273595249, + "grad_norm": 0.3732668459415436, + "learning_rate": 0.0001233964569334148, + "loss": 1.5705, + "step": 29485 + }, + { + "epoch": 0.3831571219034408, + "grad_norm": 0.4282355010509491, + "learning_rate": 0.0001233938574715034, + "loss": 1.4904, + "step": 29486 + }, + { + "epoch": 0.38317011644735666, + "grad_norm": 0.36313551664352417, + "learning_rate": 0.000123391258009592, + "loss": 1.4112, + "step": 29487 + }, + { + "epoch": 0.38318311099127256, + "grad_norm": 0.44452571868896484, + "learning_rate": 0.00012338865854768065, + "loss": 1.4113, + "step": 29488 + }, + { + "epoch": 0.3831961055351884, + "grad_norm": 0.4374120235443115, + "learning_rate": 0.00012338605908576925, + "loss": 1.4564, + "step": 29489 + }, + { + "epoch": 0.3832091000791043, + "grad_norm": 0.44149017333984375, + "learning_rate": 0.00012338345962385788, + "loss": 1.5246, + "step": 29490 + }, + { + "epoch": 0.38322209462302015, + "grad_norm": 0.27829477190971375, + "learning_rate": 0.00012338086016194647, + "loss": 1.22, + "step": 29491 + }, + { + "epoch": 0.38323508916693605, + "grad_norm": 0.3236914575099945, + "learning_rate": 0.0001233782607000351, + "loss": 1.299, + "step": 29492 + }, + { + "epoch": 0.3832480837108519, + "grad_norm": 0.49193939566612244, + "learning_rate": 0.00012337566123812372, + "loss": 1.5156, + "step": 29493 + }, + { + "epoch": 0.3832610782547678, + "grad_norm": 0.37566548585891724, + "learning_rate": 0.00012337306177621232, + "loss": 1.4679, + "step": 29494 + }, + { + "epoch": 0.38327407279868364, + "grad_norm": 0.46173906326293945, + "learning_rate": 0.00012337046231430094, + "loss": 1.4902, + "step": 29495 + }, + { + "epoch": 0.38328706734259954, + "grad_norm": 0.44643354415893555, + "learning_rate": 0.00012336786285238957, + "loss": 1.3632, + "step": 29496 + }, + { + "epoch": 0.3833000618865154, + "grad_norm": 0.395754337310791, + "learning_rate": 0.0001233652633904782, + "loss": 1.5402, + "step": 29497 + }, + { + "epoch": 0.3833130564304313, + "grad_norm": 0.42827412486076355, + "learning_rate": 0.0001233626639285668, + "loss": 1.3144, + "step": 29498 + }, + { + "epoch": 0.38332605097434713, + "grad_norm": 0.5095736980438232, + "learning_rate": 0.00012336006446665541, + "loss": 1.2139, + "step": 29499 + }, + { + "epoch": 0.38333904551826303, + "grad_norm": 0.406919926404953, + "learning_rate": 0.00012335746500474404, + "loss": 1.4349, + "step": 29500 + }, + { + "epoch": 0.3833520400621789, + "grad_norm": 0.31829166412353516, + "learning_rate": 0.00012335486554283264, + "loss": 1.3305, + "step": 29501 + }, + { + "epoch": 0.3833650346060948, + "grad_norm": 0.3732173442840576, + "learning_rate": 0.00012335226608092126, + "loss": 1.5454, + "step": 29502 + }, + { + "epoch": 0.3833780291500106, + "grad_norm": 0.513748049736023, + "learning_rate": 0.00012334966661900986, + "loss": 1.3684, + "step": 29503 + }, + { + "epoch": 0.3833910236939265, + "grad_norm": 0.34807541966438293, + "learning_rate": 0.00012334706715709848, + "loss": 1.336, + "step": 29504 + }, + { + "epoch": 0.38340401823784237, + "grad_norm": 0.37416428327560425, + "learning_rate": 0.0001233444676951871, + "loss": 1.4171, + "step": 29505 + }, + { + "epoch": 0.38341701278175827, + "grad_norm": 0.4538436532020569, + "learning_rate": 0.0001233418682332757, + "loss": 1.3917, + "step": 29506 + }, + { + "epoch": 0.3834300073256741, + "grad_norm": 0.3606007993221283, + "learning_rate": 0.00012333926877136433, + "loss": 1.6012, + "step": 29507 + }, + { + "epoch": 0.38344300186959, + "grad_norm": 0.47051510214805603, + "learning_rate": 0.00012333666930945295, + "loss": 1.519, + "step": 29508 + }, + { + "epoch": 0.38345599641350586, + "grad_norm": 0.3099076747894287, + "learning_rate": 0.00012333406984754158, + "loss": 1.3667, + "step": 29509 + }, + { + "epoch": 0.38346899095742176, + "grad_norm": 0.32664576172828674, + "learning_rate": 0.00012333147038563018, + "loss": 1.5163, + "step": 29510 + }, + { + "epoch": 0.3834819855013376, + "grad_norm": 0.446612685918808, + "learning_rate": 0.0001233288709237188, + "loss": 1.4837, + "step": 29511 + }, + { + "epoch": 0.3834949800452535, + "grad_norm": 0.3849194645881653, + "learning_rate": 0.00012332627146180742, + "loss": 1.3815, + "step": 29512 + }, + { + "epoch": 0.38350797458916935, + "grad_norm": 0.4123111069202423, + "learning_rate": 0.00012332367199989602, + "loss": 1.4184, + "step": 29513 + }, + { + "epoch": 0.38352096913308525, + "grad_norm": 0.40780967473983765, + "learning_rate": 0.00012332107253798465, + "loss": 1.3085, + "step": 29514 + }, + { + "epoch": 0.3835339636770011, + "grad_norm": 0.34047502279281616, + "learning_rate": 0.00012331847307607324, + "loss": 1.3118, + "step": 29515 + }, + { + "epoch": 0.383546958220917, + "grad_norm": 0.4311307370662689, + "learning_rate": 0.00012331587361416187, + "loss": 1.2915, + "step": 29516 + }, + { + "epoch": 0.38355995276483285, + "grad_norm": 0.3975888192653656, + "learning_rate": 0.0001233132741522505, + "loss": 1.3128, + "step": 29517 + }, + { + "epoch": 0.38357294730874875, + "grad_norm": 0.4133479595184326, + "learning_rate": 0.0001233106746903391, + "loss": 1.2339, + "step": 29518 + }, + { + "epoch": 0.3835859418526646, + "grad_norm": 0.3120031952857971, + "learning_rate": 0.00012330807522842771, + "loss": 1.3423, + "step": 29519 + }, + { + "epoch": 0.3835989363965805, + "grad_norm": 0.4945957660675049, + "learning_rate": 0.00012330547576651634, + "loss": 1.4794, + "step": 29520 + }, + { + "epoch": 0.38361193094049634, + "grad_norm": 0.40636706352233887, + "learning_rate": 0.00012330287630460496, + "loss": 1.3666, + "step": 29521 + }, + { + "epoch": 0.38362492548441224, + "grad_norm": 0.3585086762905121, + "learning_rate": 0.00012330027684269356, + "loss": 1.4693, + "step": 29522 + }, + { + "epoch": 0.3836379200283281, + "grad_norm": 0.45814022421836853, + "learning_rate": 0.00012329767738078218, + "loss": 1.3958, + "step": 29523 + }, + { + "epoch": 0.383650914572244, + "grad_norm": 0.37405315041542053, + "learning_rate": 0.0001232950779188708, + "loss": 1.3487, + "step": 29524 + }, + { + "epoch": 0.38366390911615983, + "grad_norm": 0.3508339822292328, + "learning_rate": 0.0001232924784569594, + "loss": 1.3029, + "step": 29525 + }, + { + "epoch": 0.38367690366007573, + "grad_norm": 0.5569481253623962, + "learning_rate": 0.00012328987899504803, + "loss": 1.3542, + "step": 29526 + }, + { + "epoch": 0.3836898982039916, + "grad_norm": 0.43022823333740234, + "learning_rate": 0.00012328727953313666, + "loss": 1.4787, + "step": 29527 + }, + { + "epoch": 0.3837028927479075, + "grad_norm": 0.45691606402397156, + "learning_rate": 0.00012328468007122528, + "loss": 1.5234, + "step": 29528 + }, + { + "epoch": 0.3837158872918233, + "grad_norm": 0.444759339094162, + "learning_rate": 0.00012328208060931388, + "loss": 1.3691, + "step": 29529 + }, + { + "epoch": 0.3837288818357392, + "grad_norm": 0.4338513910770416, + "learning_rate": 0.00012327948114740247, + "loss": 1.3558, + "step": 29530 + }, + { + "epoch": 0.38374187637965507, + "grad_norm": 0.44795432686805725, + "learning_rate": 0.00012327688168549113, + "loss": 1.5349, + "step": 29531 + }, + { + "epoch": 0.38375487092357097, + "grad_norm": 0.5515525937080383, + "learning_rate": 0.00012327428222357972, + "loss": 1.4683, + "step": 29532 + }, + { + "epoch": 0.3837678654674868, + "grad_norm": 0.3268814980983734, + "learning_rate": 0.00012327168276166835, + "loss": 1.5168, + "step": 29533 + }, + { + "epoch": 0.3837808600114027, + "grad_norm": 0.32242539525032043, + "learning_rate": 0.00012326908329975695, + "loss": 1.4235, + "step": 29534 + }, + { + "epoch": 0.38379385455531856, + "grad_norm": 0.35423004627227783, + "learning_rate": 0.00012326648383784557, + "loss": 1.4055, + "step": 29535 + }, + { + "epoch": 0.38380684909923446, + "grad_norm": 0.3132795989513397, + "learning_rate": 0.0001232638843759342, + "loss": 1.2203, + "step": 29536 + }, + { + "epoch": 0.3838198436431503, + "grad_norm": 0.2962224781513214, + "learning_rate": 0.0001232612849140228, + "loss": 1.3078, + "step": 29537 + }, + { + "epoch": 0.3838328381870662, + "grad_norm": 0.4085319936275482, + "learning_rate": 0.00012325868545211142, + "loss": 1.553, + "step": 29538 + }, + { + "epoch": 0.3838458327309821, + "grad_norm": 0.4380180239677429, + "learning_rate": 0.00012325608599020004, + "loss": 1.3859, + "step": 29539 + }, + { + "epoch": 0.38385882727489795, + "grad_norm": 0.34929758310317993, + "learning_rate": 0.00012325348652828867, + "loss": 1.3987, + "step": 29540 + }, + { + "epoch": 0.38387182181881385, + "grad_norm": 0.430632621049881, + "learning_rate": 0.00012325088706637726, + "loss": 1.3116, + "step": 29541 + }, + { + "epoch": 0.3838848163627297, + "grad_norm": 0.41115549206733704, + "learning_rate": 0.00012324828760446586, + "loss": 1.475, + "step": 29542 + }, + { + "epoch": 0.3838978109066456, + "grad_norm": 0.4312101900577545, + "learning_rate": 0.0001232456881425545, + "loss": 1.3531, + "step": 29543 + }, + { + "epoch": 0.38391080545056144, + "grad_norm": 0.423052042722702, + "learning_rate": 0.0001232430886806431, + "loss": 1.5625, + "step": 29544 + }, + { + "epoch": 0.38392379999447734, + "grad_norm": 0.39454153180122375, + "learning_rate": 0.00012324048921873173, + "loss": 1.4679, + "step": 29545 + }, + { + "epoch": 0.3839367945383932, + "grad_norm": 0.4111841917037964, + "learning_rate": 0.00012323788975682033, + "loss": 1.4577, + "step": 29546 + }, + { + "epoch": 0.3839497890823091, + "grad_norm": 0.3759179711341858, + "learning_rate": 0.00012323529029490896, + "loss": 1.4753, + "step": 29547 + }, + { + "epoch": 0.38396278362622493, + "grad_norm": 0.3814232349395752, + "learning_rate": 0.00012323269083299758, + "loss": 1.2555, + "step": 29548 + }, + { + "epoch": 0.38397577817014084, + "grad_norm": 0.41831669211387634, + "learning_rate": 0.00012323009137108618, + "loss": 1.6321, + "step": 29549 + }, + { + "epoch": 0.3839887727140567, + "grad_norm": 0.34405896067619324, + "learning_rate": 0.0001232274919091748, + "loss": 1.381, + "step": 29550 + }, + { + "epoch": 0.3840017672579726, + "grad_norm": 0.3482199013233185, + "learning_rate": 0.00012322489244726343, + "loss": 1.4558, + "step": 29551 + }, + { + "epoch": 0.3840147618018884, + "grad_norm": 0.47335493564605713, + "learning_rate": 0.00012322229298535205, + "loss": 1.4289, + "step": 29552 + }, + { + "epoch": 0.3840277563458043, + "grad_norm": 0.4393950402736664, + "learning_rate": 0.00012321969352344065, + "loss": 1.4487, + "step": 29553 + }, + { + "epoch": 0.3840407508897202, + "grad_norm": 0.2793898284435272, + "learning_rate": 0.00012321709406152925, + "loss": 1.3675, + "step": 29554 + }, + { + "epoch": 0.3840537454336361, + "grad_norm": 0.3861125409603119, + "learning_rate": 0.0001232144945996179, + "loss": 1.1869, + "step": 29555 + }, + { + "epoch": 0.3840667399775519, + "grad_norm": 0.30514398217201233, + "learning_rate": 0.0001232118951377065, + "loss": 1.1335, + "step": 29556 + }, + { + "epoch": 0.3840797345214678, + "grad_norm": 0.3920758068561554, + "learning_rate": 0.00012320929567579512, + "loss": 1.2021, + "step": 29557 + }, + { + "epoch": 0.38409272906538366, + "grad_norm": 0.29421690106391907, + "learning_rate": 0.00012320669621388372, + "loss": 1.3537, + "step": 29558 + }, + { + "epoch": 0.38410572360929957, + "grad_norm": 0.39906632900238037, + "learning_rate": 0.00012320409675197234, + "loss": 1.5168, + "step": 29559 + }, + { + "epoch": 0.3841187181532154, + "grad_norm": 0.36401134729385376, + "learning_rate": 0.00012320149729006097, + "loss": 1.492, + "step": 29560 + }, + { + "epoch": 0.3841317126971313, + "grad_norm": 0.41493383049964905, + "learning_rate": 0.00012319889782814956, + "loss": 1.417, + "step": 29561 + }, + { + "epoch": 0.38414470724104716, + "grad_norm": 0.38168710470199585, + "learning_rate": 0.00012319629836623821, + "loss": 1.4164, + "step": 29562 + }, + { + "epoch": 0.38415770178496306, + "grad_norm": 0.4548499584197998, + "learning_rate": 0.0001231936989043268, + "loss": 1.4051, + "step": 29563 + }, + { + "epoch": 0.3841706963288789, + "grad_norm": 0.4543074667453766, + "learning_rate": 0.00012319109944241544, + "loss": 1.5098, + "step": 29564 + }, + { + "epoch": 0.3841836908727948, + "grad_norm": 0.32889440655708313, + "learning_rate": 0.00012318849998050403, + "loss": 1.2761, + "step": 29565 + }, + { + "epoch": 0.38419668541671065, + "grad_norm": 0.6412518620491028, + "learning_rate": 0.00012318590051859266, + "loss": 1.471, + "step": 29566 + }, + { + "epoch": 0.38420967996062655, + "grad_norm": 0.4148516058921814, + "learning_rate": 0.00012318330105668128, + "loss": 1.6807, + "step": 29567 + }, + { + "epoch": 0.3842226745045424, + "grad_norm": 0.38719743490219116, + "learning_rate": 0.00012318070159476988, + "loss": 1.2277, + "step": 29568 + }, + { + "epoch": 0.3842356690484583, + "grad_norm": 0.4391591548919678, + "learning_rate": 0.0001231781021328585, + "loss": 1.6889, + "step": 29569 + }, + { + "epoch": 0.38424866359237414, + "grad_norm": 0.3942606449127197, + "learning_rate": 0.00012317550267094713, + "loss": 1.3006, + "step": 29570 + }, + { + "epoch": 0.38426165813629004, + "grad_norm": 0.3238622546195984, + "learning_rate": 0.00012317290320903573, + "loss": 1.3655, + "step": 29571 + }, + { + "epoch": 0.3842746526802059, + "grad_norm": 0.4696352183818817, + "learning_rate": 0.00012317030374712435, + "loss": 1.4848, + "step": 29572 + }, + { + "epoch": 0.3842876472241218, + "grad_norm": 0.4001320004463196, + "learning_rate": 0.00012316770428521295, + "loss": 1.4334, + "step": 29573 + }, + { + "epoch": 0.38430064176803763, + "grad_norm": 0.4030158221721649, + "learning_rate": 0.0001231651048233016, + "loss": 1.3915, + "step": 29574 + }, + { + "epoch": 0.38431363631195353, + "grad_norm": 0.3935050964355469, + "learning_rate": 0.0001231625053613902, + "loss": 1.5405, + "step": 29575 + }, + { + "epoch": 0.3843266308558694, + "grad_norm": 0.33803674578666687, + "learning_rate": 0.00012315990589947882, + "loss": 1.2854, + "step": 29576 + }, + { + "epoch": 0.3843396253997853, + "grad_norm": 0.538811445236206, + "learning_rate": 0.00012315730643756742, + "loss": 1.5299, + "step": 29577 + }, + { + "epoch": 0.3843526199437011, + "grad_norm": 0.3705592453479767, + "learning_rate": 0.00012315470697565604, + "loss": 1.4275, + "step": 29578 + }, + { + "epoch": 0.384365614487617, + "grad_norm": 0.32899147272109985, + "learning_rate": 0.00012315210751374467, + "loss": 1.321, + "step": 29579 + }, + { + "epoch": 0.38437860903153287, + "grad_norm": 0.4022553861141205, + "learning_rate": 0.00012314950805183327, + "loss": 1.3398, + "step": 29580 + }, + { + "epoch": 0.38439160357544877, + "grad_norm": 0.4059377610683441, + "learning_rate": 0.0001231469085899219, + "loss": 1.4669, + "step": 29581 + }, + { + "epoch": 0.3844045981193646, + "grad_norm": 0.2949143946170807, + "learning_rate": 0.00012314430912801051, + "loss": 1.5091, + "step": 29582 + }, + { + "epoch": 0.3844175926632805, + "grad_norm": 0.2948555648326874, + "learning_rate": 0.0001231417096660991, + "loss": 1.2982, + "step": 29583 + }, + { + "epoch": 0.38443058720719636, + "grad_norm": 0.5847094655036926, + "learning_rate": 0.00012313911020418774, + "loss": 1.6183, + "step": 29584 + }, + { + "epoch": 0.38444358175111226, + "grad_norm": 0.4073833227157593, + "learning_rate": 0.00012313651074227633, + "loss": 1.328, + "step": 29585 + }, + { + "epoch": 0.3844565762950281, + "grad_norm": 0.3363688588142395, + "learning_rate": 0.00012313391128036499, + "loss": 1.2448, + "step": 29586 + }, + { + "epoch": 0.384469570838944, + "grad_norm": 0.36529943346977234, + "learning_rate": 0.00012313131181845358, + "loss": 1.2179, + "step": 29587 + }, + { + "epoch": 0.38448256538285985, + "grad_norm": 0.4881378412246704, + "learning_rate": 0.0001231287123565422, + "loss": 1.3075, + "step": 29588 + }, + { + "epoch": 0.38449555992677575, + "grad_norm": 0.4256056249141693, + "learning_rate": 0.0001231261128946308, + "loss": 1.4528, + "step": 29589 + }, + { + "epoch": 0.3845085544706916, + "grad_norm": 0.3573410212993622, + "learning_rate": 0.00012312351343271943, + "loss": 1.1659, + "step": 29590 + }, + { + "epoch": 0.3845215490146075, + "grad_norm": 0.40770286321640015, + "learning_rate": 0.00012312091397080805, + "loss": 1.4056, + "step": 29591 + }, + { + "epoch": 0.38453454355852335, + "grad_norm": 0.44021254777908325, + "learning_rate": 0.00012311831450889665, + "loss": 1.3254, + "step": 29592 + }, + { + "epoch": 0.38454753810243925, + "grad_norm": 0.2869035005569458, + "learning_rate": 0.00012311571504698528, + "loss": 1.2718, + "step": 29593 + }, + { + "epoch": 0.3845605326463551, + "grad_norm": 0.4177190363407135, + "learning_rate": 0.0001231131155850739, + "loss": 1.5365, + "step": 29594 + }, + { + "epoch": 0.384573527190271, + "grad_norm": 0.39106976985931396, + "learning_rate": 0.00012311051612316252, + "loss": 1.4844, + "step": 29595 + }, + { + "epoch": 0.38458652173418684, + "grad_norm": 0.3891622722148895, + "learning_rate": 0.00012310791666125112, + "loss": 1.3454, + "step": 29596 + }, + { + "epoch": 0.38459951627810274, + "grad_norm": 0.3690756559371948, + "learning_rate": 0.00012310531719933975, + "loss": 1.397, + "step": 29597 + }, + { + "epoch": 0.3846125108220186, + "grad_norm": 0.4759562313556671, + "learning_rate": 0.00012310271773742837, + "loss": 1.3455, + "step": 29598 + }, + { + "epoch": 0.3846255053659345, + "grad_norm": 0.3874732553958893, + "learning_rate": 0.00012310011827551697, + "loss": 1.3089, + "step": 29599 + }, + { + "epoch": 0.38463849990985033, + "grad_norm": 0.4730784595012665, + "learning_rate": 0.0001230975188136056, + "loss": 1.5655, + "step": 29600 + }, + { + "epoch": 0.38465149445376623, + "grad_norm": 0.40431496500968933, + "learning_rate": 0.00012309491935169422, + "loss": 1.3822, + "step": 29601 + }, + { + "epoch": 0.3846644889976821, + "grad_norm": 0.4325055181980133, + "learning_rate": 0.00012309231988978281, + "loss": 1.399, + "step": 29602 + }, + { + "epoch": 0.384677483541598, + "grad_norm": 0.40868815779685974, + "learning_rate": 0.00012308972042787144, + "loss": 1.532, + "step": 29603 + }, + { + "epoch": 0.3846904780855138, + "grad_norm": 0.3345896899700165, + "learning_rate": 0.00012308712096596004, + "loss": 1.4445, + "step": 29604 + }, + { + "epoch": 0.3847034726294297, + "grad_norm": 0.4426919221878052, + "learning_rate": 0.0001230845215040487, + "loss": 1.3472, + "step": 29605 + }, + { + "epoch": 0.38471646717334557, + "grad_norm": 0.43984127044677734, + "learning_rate": 0.00012308192204213729, + "loss": 1.6056, + "step": 29606 + }, + { + "epoch": 0.38472946171726147, + "grad_norm": 0.43081796169281006, + "learning_rate": 0.0001230793225802259, + "loss": 1.5311, + "step": 29607 + }, + { + "epoch": 0.3847424562611773, + "grad_norm": 0.3965555429458618, + "learning_rate": 0.0001230767231183145, + "loss": 1.4376, + "step": 29608 + }, + { + "epoch": 0.3847554508050932, + "grad_norm": 0.45082512497901917, + "learning_rate": 0.00012307412365640313, + "loss": 1.2607, + "step": 29609 + }, + { + "epoch": 0.38476844534900906, + "grad_norm": 0.4424546957015991, + "learning_rate": 0.00012307152419449176, + "loss": 1.3874, + "step": 29610 + }, + { + "epoch": 0.38478143989292496, + "grad_norm": 0.38146525621414185, + "learning_rate": 0.00012306892473258035, + "loss": 1.3446, + "step": 29611 + }, + { + "epoch": 0.3847944344368408, + "grad_norm": 0.4125452935695648, + "learning_rate": 0.00012306632527066898, + "loss": 1.585, + "step": 29612 + }, + { + "epoch": 0.3848074289807567, + "grad_norm": 0.3718058466911316, + "learning_rate": 0.0001230637258087576, + "loss": 1.3402, + "step": 29613 + }, + { + "epoch": 0.3848204235246726, + "grad_norm": 0.3712333142757416, + "learning_rate": 0.0001230611263468462, + "loss": 1.4449, + "step": 29614 + }, + { + "epoch": 0.38483341806858845, + "grad_norm": 0.4591827392578125, + "learning_rate": 0.00012305852688493482, + "loss": 1.3396, + "step": 29615 + }, + { + "epoch": 0.38484641261250435, + "grad_norm": 0.38640880584716797, + "learning_rate": 0.00012305592742302342, + "loss": 1.2972, + "step": 29616 + }, + { + "epoch": 0.3848594071564202, + "grad_norm": 0.3804060220718384, + "learning_rate": 0.00012305332796111207, + "loss": 1.4104, + "step": 29617 + }, + { + "epoch": 0.3848724017003361, + "grad_norm": 0.4230775237083435, + "learning_rate": 0.00012305072849920067, + "loss": 1.441, + "step": 29618 + }, + { + "epoch": 0.38488539624425194, + "grad_norm": 0.3696955740451813, + "learning_rate": 0.0001230481290372893, + "loss": 1.5557, + "step": 29619 + }, + { + "epoch": 0.38489839078816784, + "grad_norm": 0.38091495633125305, + "learning_rate": 0.0001230455295753779, + "loss": 1.3525, + "step": 29620 + }, + { + "epoch": 0.3849113853320837, + "grad_norm": 0.5071821808815002, + "learning_rate": 0.00012304293011346652, + "loss": 1.5072, + "step": 29621 + }, + { + "epoch": 0.3849243798759996, + "grad_norm": 0.40129274129867554, + "learning_rate": 0.00012304033065155514, + "loss": 1.475, + "step": 29622 + }, + { + "epoch": 0.38493737441991543, + "grad_norm": 0.42611274123191833, + "learning_rate": 0.00012303773118964374, + "loss": 1.3045, + "step": 29623 + }, + { + "epoch": 0.38495036896383134, + "grad_norm": 0.28798148036003113, + "learning_rate": 0.00012303513172773236, + "loss": 1.4347, + "step": 29624 + }, + { + "epoch": 0.3849633635077472, + "grad_norm": 0.30114781856536865, + "learning_rate": 0.000123032532265821, + "loss": 1.444, + "step": 29625 + }, + { + "epoch": 0.3849763580516631, + "grad_norm": 0.33857262134552, + "learning_rate": 0.00012302993280390959, + "loss": 1.2281, + "step": 29626 + }, + { + "epoch": 0.3849893525955789, + "grad_norm": 0.34680691361427307, + "learning_rate": 0.0001230273333419982, + "loss": 1.4113, + "step": 29627 + }, + { + "epoch": 0.3850023471394948, + "grad_norm": 0.36102545261383057, + "learning_rate": 0.0001230247338800868, + "loss": 1.3754, + "step": 29628 + }, + { + "epoch": 0.3850153416834107, + "grad_norm": 0.42711347341537476, + "learning_rate": 0.00012302213441817546, + "loss": 1.3388, + "step": 29629 + }, + { + "epoch": 0.3850283362273266, + "grad_norm": 0.36953380703926086, + "learning_rate": 0.00012301953495626406, + "loss": 1.5219, + "step": 29630 + }, + { + "epoch": 0.3850413307712424, + "grad_norm": 0.3848010301589966, + "learning_rate": 0.00012301693549435268, + "loss": 1.216, + "step": 29631 + }, + { + "epoch": 0.3850543253151583, + "grad_norm": 0.3525538742542267, + "learning_rate": 0.00012301433603244128, + "loss": 1.3489, + "step": 29632 + }, + { + "epoch": 0.38506731985907416, + "grad_norm": 0.4170626997947693, + "learning_rate": 0.0001230117365705299, + "loss": 1.3912, + "step": 29633 + }, + { + "epoch": 0.38508031440299006, + "grad_norm": 0.4323761463165283, + "learning_rate": 0.00012300913710861853, + "loss": 1.4716, + "step": 29634 + }, + { + "epoch": 0.3850933089469059, + "grad_norm": 0.4812493622303009, + "learning_rate": 0.00012300653764670712, + "loss": 1.545, + "step": 29635 + }, + { + "epoch": 0.3851063034908218, + "grad_norm": 0.3876829445362091, + "learning_rate": 0.00012300393818479578, + "loss": 1.562, + "step": 29636 + }, + { + "epoch": 0.38511929803473766, + "grad_norm": 0.4575363099575043, + "learning_rate": 0.00012300133872288437, + "loss": 1.467, + "step": 29637 + }, + { + "epoch": 0.38513229257865356, + "grad_norm": 0.411072701215744, + "learning_rate": 0.00012299873926097297, + "loss": 1.364, + "step": 29638 + }, + { + "epoch": 0.3851452871225694, + "grad_norm": 0.3790915906429291, + "learning_rate": 0.0001229961397990616, + "loss": 1.4937, + "step": 29639 + }, + { + "epoch": 0.3851582816664853, + "grad_norm": 0.3745225667953491, + "learning_rate": 0.00012299354033715022, + "loss": 1.4222, + "step": 29640 + }, + { + "epoch": 0.38517127621040115, + "grad_norm": 0.48106110095977783, + "learning_rate": 0.00012299094087523884, + "loss": 1.6468, + "step": 29641 + }, + { + "epoch": 0.38518427075431705, + "grad_norm": 0.39493224024772644, + "learning_rate": 0.00012298834141332744, + "loss": 1.5423, + "step": 29642 + }, + { + "epoch": 0.3851972652982329, + "grad_norm": 0.3688465356826782, + "learning_rate": 0.00012298574195141607, + "loss": 1.3025, + "step": 29643 + }, + { + "epoch": 0.3852102598421488, + "grad_norm": 0.38973426818847656, + "learning_rate": 0.0001229831424895047, + "loss": 1.4366, + "step": 29644 + }, + { + "epoch": 0.38522325438606464, + "grad_norm": 0.4233192801475525, + "learning_rate": 0.0001229805430275933, + "loss": 1.4256, + "step": 29645 + }, + { + "epoch": 0.38523624892998054, + "grad_norm": 0.34802868962287903, + "learning_rate": 0.0001229779435656819, + "loss": 1.3975, + "step": 29646 + }, + { + "epoch": 0.3852492434738964, + "grad_norm": 0.4500443935394287, + "learning_rate": 0.0001229753441037705, + "loss": 1.6002, + "step": 29647 + }, + { + "epoch": 0.3852622380178123, + "grad_norm": 0.3810059726238251, + "learning_rate": 0.00012297274464185916, + "loss": 1.2729, + "step": 29648 + }, + { + "epoch": 0.38527523256172813, + "grad_norm": 0.5015361309051514, + "learning_rate": 0.00012297014517994776, + "loss": 1.2725, + "step": 29649 + }, + { + "epoch": 0.38528822710564403, + "grad_norm": 0.38909563422203064, + "learning_rate": 0.00012296754571803638, + "loss": 1.3373, + "step": 29650 + }, + { + "epoch": 0.3853012216495599, + "grad_norm": 0.36756131052970886, + "learning_rate": 0.00012296494625612498, + "loss": 1.2384, + "step": 29651 + }, + { + "epoch": 0.3853142161934758, + "grad_norm": 0.34497302770614624, + "learning_rate": 0.0001229623467942136, + "loss": 1.2016, + "step": 29652 + }, + { + "epoch": 0.3853272107373916, + "grad_norm": 0.38281774520874023, + "learning_rate": 0.00012295974733230223, + "loss": 1.3533, + "step": 29653 + }, + { + "epoch": 0.3853402052813075, + "grad_norm": 0.41373544931411743, + "learning_rate": 0.00012295714787039083, + "loss": 1.3843, + "step": 29654 + }, + { + "epoch": 0.38535319982522337, + "grad_norm": 0.4446506202220917, + "learning_rate": 0.00012295454840847945, + "loss": 1.5086, + "step": 29655 + }, + { + "epoch": 0.38536619436913927, + "grad_norm": 0.43855687975883484, + "learning_rate": 0.00012295194894656808, + "loss": 1.448, + "step": 29656 + }, + { + "epoch": 0.3853791889130551, + "grad_norm": 0.4544562101364136, + "learning_rate": 0.00012294934948465667, + "loss": 1.2667, + "step": 29657 + }, + { + "epoch": 0.385392183456971, + "grad_norm": 0.39347976446151733, + "learning_rate": 0.0001229467500227453, + "loss": 1.1744, + "step": 29658 + }, + { + "epoch": 0.38540517800088686, + "grad_norm": 0.36112740635871887, + "learning_rate": 0.0001229441505608339, + "loss": 1.1105, + "step": 29659 + }, + { + "epoch": 0.38541817254480276, + "grad_norm": 0.43231871724128723, + "learning_rate": 0.00012294155109892255, + "loss": 1.3788, + "step": 29660 + }, + { + "epoch": 0.3854311670887186, + "grad_norm": 0.4059285819530487, + "learning_rate": 0.00012293895163701114, + "loss": 1.2127, + "step": 29661 + }, + { + "epoch": 0.3854441616326345, + "grad_norm": 0.38333094120025635, + "learning_rate": 0.00012293635217509977, + "loss": 1.419, + "step": 29662 + }, + { + "epoch": 0.38545715617655035, + "grad_norm": 0.4240880012512207, + "learning_rate": 0.00012293375271318837, + "loss": 1.3824, + "step": 29663 + }, + { + "epoch": 0.38547015072046625, + "grad_norm": 0.4265848994255066, + "learning_rate": 0.000122931153251277, + "loss": 1.29, + "step": 29664 + }, + { + "epoch": 0.3854831452643821, + "grad_norm": 0.3585989475250244, + "learning_rate": 0.00012292855378936561, + "loss": 1.3191, + "step": 29665 + }, + { + "epoch": 0.385496139808298, + "grad_norm": 0.44446685910224915, + "learning_rate": 0.0001229259543274542, + "loss": 1.5389, + "step": 29666 + }, + { + "epoch": 0.38550913435221384, + "grad_norm": 0.43914347887039185, + "learning_rate": 0.00012292335486554284, + "loss": 1.3775, + "step": 29667 + }, + { + "epoch": 0.38552212889612975, + "grad_norm": 0.45096486806869507, + "learning_rate": 0.00012292075540363146, + "loss": 1.3388, + "step": 29668 + }, + { + "epoch": 0.3855351234400456, + "grad_norm": 0.45299309492111206, + "learning_rate": 0.00012291815594172006, + "loss": 1.2909, + "step": 29669 + }, + { + "epoch": 0.3855481179839615, + "grad_norm": 0.3884403109550476, + "learning_rate": 0.00012291555647980868, + "loss": 1.4764, + "step": 29670 + }, + { + "epoch": 0.38556111252787734, + "grad_norm": 0.37652429938316345, + "learning_rate": 0.0001229129570178973, + "loss": 1.3974, + "step": 29671 + }, + { + "epoch": 0.38557410707179324, + "grad_norm": 0.42271530628204346, + "learning_rate": 0.00012291035755598593, + "loss": 1.617, + "step": 29672 + }, + { + "epoch": 0.3855871016157091, + "grad_norm": 0.4457968771457672, + "learning_rate": 0.00012290775809407453, + "loss": 1.2743, + "step": 29673 + }, + { + "epoch": 0.385600096159625, + "grad_norm": 0.4090537130832672, + "learning_rate": 0.00012290515863216315, + "loss": 1.2524, + "step": 29674 + }, + { + "epoch": 0.38561309070354083, + "grad_norm": 0.4532301425933838, + "learning_rate": 0.00012290255917025178, + "loss": 1.5549, + "step": 29675 + }, + { + "epoch": 0.38562608524745673, + "grad_norm": 0.39488521218299866, + "learning_rate": 0.00012289995970834038, + "loss": 1.1212, + "step": 29676 + }, + { + "epoch": 0.3856390797913726, + "grad_norm": 0.3525855243206024, + "learning_rate": 0.000122897360246429, + "loss": 1.1209, + "step": 29677 + }, + { + "epoch": 0.3856520743352885, + "grad_norm": 0.346203476190567, + "learning_rate": 0.0001228947607845176, + "loss": 1.1241, + "step": 29678 + }, + { + "epoch": 0.3856650688792043, + "grad_norm": 0.43990054726600647, + "learning_rate": 0.00012289216132260625, + "loss": 1.5306, + "step": 29679 + }, + { + "epoch": 0.3856780634231202, + "grad_norm": 0.340364933013916, + "learning_rate": 0.00012288956186069485, + "loss": 1.2509, + "step": 29680 + }, + { + "epoch": 0.38569105796703607, + "grad_norm": 0.37132421135902405, + "learning_rate": 0.00012288696239878344, + "loss": 1.2538, + "step": 29681 + }, + { + "epoch": 0.38570405251095197, + "grad_norm": 0.4005928337574005, + "learning_rate": 0.00012288436293687207, + "loss": 1.2828, + "step": 29682 + }, + { + "epoch": 0.3857170470548678, + "grad_norm": 0.4884699881076813, + "learning_rate": 0.0001228817634749607, + "loss": 1.4636, + "step": 29683 + }, + { + "epoch": 0.3857300415987837, + "grad_norm": 0.5030505061149597, + "learning_rate": 0.00012287916401304932, + "loss": 1.482, + "step": 29684 + }, + { + "epoch": 0.38574303614269956, + "grad_norm": 0.3435817360877991, + "learning_rate": 0.00012287656455113791, + "loss": 1.4532, + "step": 29685 + }, + { + "epoch": 0.38575603068661546, + "grad_norm": 0.2859126329421997, + "learning_rate": 0.00012287396508922654, + "loss": 1.2355, + "step": 29686 + }, + { + "epoch": 0.3857690252305313, + "grad_norm": 0.32572945952415466, + "learning_rate": 0.00012287136562731516, + "loss": 1.301, + "step": 29687 + }, + { + "epoch": 0.3857820197744472, + "grad_norm": 0.33779045939445496, + "learning_rate": 0.00012286876616540376, + "loss": 1.2462, + "step": 29688 + }, + { + "epoch": 0.38579501431836305, + "grad_norm": 0.3701564371585846, + "learning_rate": 0.00012286616670349239, + "loss": 1.2648, + "step": 29689 + }, + { + "epoch": 0.38580800886227895, + "grad_norm": 0.4164462983608246, + "learning_rate": 0.00012286356724158098, + "loss": 1.3668, + "step": 29690 + }, + { + "epoch": 0.38582100340619485, + "grad_norm": 0.3467954695224762, + "learning_rate": 0.00012286096777966963, + "loss": 1.5213, + "step": 29691 + }, + { + "epoch": 0.3858339979501107, + "grad_norm": 0.40671205520629883, + "learning_rate": 0.00012285836831775823, + "loss": 1.691, + "step": 29692 + }, + { + "epoch": 0.3858469924940266, + "grad_norm": 0.47208172082901, + "learning_rate": 0.00012285576885584683, + "loss": 1.4094, + "step": 29693 + }, + { + "epoch": 0.38585998703794244, + "grad_norm": 0.34101757407188416, + "learning_rate": 0.00012285316939393545, + "loss": 1.3154, + "step": 29694 + }, + { + "epoch": 0.38587298158185834, + "grad_norm": 0.3383236229419708, + "learning_rate": 0.00012285056993202408, + "loss": 1.3567, + "step": 29695 + }, + { + "epoch": 0.3858859761257742, + "grad_norm": 0.4301341474056244, + "learning_rate": 0.0001228479704701127, + "loss": 1.4164, + "step": 29696 + }, + { + "epoch": 0.3858989706696901, + "grad_norm": 0.3285837173461914, + "learning_rate": 0.0001228453710082013, + "loss": 1.4719, + "step": 29697 + }, + { + "epoch": 0.38591196521360593, + "grad_norm": 0.38588473200798035, + "learning_rate": 0.00012284277154628992, + "loss": 1.3768, + "step": 29698 + }, + { + "epoch": 0.38592495975752183, + "grad_norm": 0.4318685233592987, + "learning_rate": 0.00012284017208437855, + "loss": 1.2705, + "step": 29699 + }, + { + "epoch": 0.3859379543014377, + "grad_norm": 0.42043429613113403, + "learning_rate": 0.00012283757262246715, + "loss": 1.4581, + "step": 29700 + }, + { + "epoch": 0.3859509488453536, + "grad_norm": 0.4800224304199219, + "learning_rate": 0.00012283497316055577, + "loss": 1.4179, + "step": 29701 + }, + { + "epoch": 0.3859639433892694, + "grad_norm": 0.30927640199661255, + "learning_rate": 0.00012283237369864437, + "loss": 1.2575, + "step": 29702 + }, + { + "epoch": 0.3859769379331853, + "grad_norm": 0.48445573449134827, + "learning_rate": 0.00012282977423673302, + "loss": 1.463, + "step": 29703 + }, + { + "epoch": 0.38598993247710117, + "grad_norm": 0.418739378452301, + "learning_rate": 0.00012282717477482162, + "loss": 1.4311, + "step": 29704 + }, + { + "epoch": 0.3860029270210171, + "grad_norm": 0.4125359356403351, + "learning_rate": 0.00012282457531291024, + "loss": 1.2874, + "step": 29705 + }, + { + "epoch": 0.3860159215649329, + "grad_norm": 0.37390682101249695, + "learning_rate": 0.00012282197585099884, + "loss": 1.3034, + "step": 29706 + }, + { + "epoch": 0.3860289161088488, + "grad_norm": 0.3878862261772156, + "learning_rate": 0.00012281937638908746, + "loss": 1.3842, + "step": 29707 + }, + { + "epoch": 0.38604191065276466, + "grad_norm": 0.38967305421829224, + "learning_rate": 0.0001228167769271761, + "loss": 1.3761, + "step": 29708 + }, + { + "epoch": 0.38605490519668056, + "grad_norm": 0.3685172200202942, + "learning_rate": 0.00012281417746526469, + "loss": 1.3453, + "step": 29709 + }, + { + "epoch": 0.3860678997405964, + "grad_norm": 0.49002605676651, + "learning_rate": 0.0001228115780033533, + "loss": 1.4268, + "step": 29710 + }, + { + "epoch": 0.3860808942845123, + "grad_norm": 0.4744281470775604, + "learning_rate": 0.00012280897854144193, + "loss": 1.4328, + "step": 29711 + }, + { + "epoch": 0.38609388882842816, + "grad_norm": 0.42512616515159607, + "learning_rate": 0.00012280637907953053, + "loss": 1.4905, + "step": 29712 + }, + { + "epoch": 0.38610688337234406, + "grad_norm": 0.4794875383377075, + "learning_rate": 0.00012280377961761916, + "loss": 1.413, + "step": 29713 + }, + { + "epoch": 0.3861198779162599, + "grad_norm": 0.4263433516025543, + "learning_rate": 0.00012280118015570778, + "loss": 1.4963, + "step": 29714 + }, + { + "epoch": 0.3861328724601758, + "grad_norm": 0.40015482902526855, + "learning_rate": 0.0001227985806937964, + "loss": 1.4116, + "step": 29715 + }, + { + "epoch": 0.38614586700409165, + "grad_norm": 0.43053826689720154, + "learning_rate": 0.000122795981231885, + "loss": 1.3597, + "step": 29716 + }, + { + "epoch": 0.38615886154800755, + "grad_norm": 0.3474050760269165, + "learning_rate": 0.00012279338176997363, + "loss": 1.4843, + "step": 29717 + }, + { + "epoch": 0.3861718560919234, + "grad_norm": 0.26055964827537537, + "learning_rate": 0.00012279078230806225, + "loss": 1.4287, + "step": 29718 + }, + { + "epoch": 0.3861848506358393, + "grad_norm": 0.45391812920570374, + "learning_rate": 0.00012278818284615085, + "loss": 1.4813, + "step": 29719 + }, + { + "epoch": 0.38619784517975514, + "grad_norm": 0.41253888607025146, + "learning_rate": 0.00012278558338423947, + "loss": 1.4172, + "step": 29720 + }, + { + "epoch": 0.38621083972367104, + "grad_norm": 0.41941237449645996, + "learning_rate": 0.00012278298392232807, + "loss": 1.211, + "step": 29721 + }, + { + "epoch": 0.3862238342675869, + "grad_norm": 0.4305826425552368, + "learning_rate": 0.0001227803844604167, + "loss": 1.5506, + "step": 29722 + }, + { + "epoch": 0.3862368288115028, + "grad_norm": 0.4402470886707306, + "learning_rate": 0.00012277778499850532, + "loss": 1.6148, + "step": 29723 + }, + { + "epoch": 0.38624982335541863, + "grad_norm": 0.39274129271507263, + "learning_rate": 0.00012277518553659392, + "loss": 1.2169, + "step": 29724 + }, + { + "epoch": 0.38626281789933453, + "grad_norm": 0.33297446370124817, + "learning_rate": 0.00012277258607468254, + "loss": 1.3153, + "step": 29725 + }, + { + "epoch": 0.3862758124432504, + "grad_norm": 0.39429450035095215, + "learning_rate": 0.00012276998661277117, + "loss": 1.3565, + "step": 29726 + }, + { + "epoch": 0.3862888069871663, + "grad_norm": 0.34069812297821045, + "learning_rate": 0.0001227673871508598, + "loss": 1.2784, + "step": 29727 + }, + { + "epoch": 0.3863018015310821, + "grad_norm": 0.4167749583721161, + "learning_rate": 0.0001227647876889484, + "loss": 1.578, + "step": 29728 + }, + { + "epoch": 0.386314796074998, + "grad_norm": 0.3590206801891327, + "learning_rate": 0.000122762188227037, + "loss": 1.3422, + "step": 29729 + }, + { + "epoch": 0.38632779061891387, + "grad_norm": 0.39976903796195984, + "learning_rate": 0.00012275958876512564, + "loss": 1.3873, + "step": 29730 + }, + { + "epoch": 0.38634078516282977, + "grad_norm": 0.48879480361938477, + "learning_rate": 0.00012275698930321423, + "loss": 1.3372, + "step": 29731 + }, + { + "epoch": 0.3863537797067456, + "grad_norm": 0.37933698296546936, + "learning_rate": 0.00012275438984130286, + "loss": 1.6007, + "step": 29732 + }, + { + "epoch": 0.3863667742506615, + "grad_norm": 0.39360150694847107, + "learning_rate": 0.00012275179037939146, + "loss": 1.3832, + "step": 29733 + }, + { + "epoch": 0.38637976879457736, + "grad_norm": 0.3616618514060974, + "learning_rate": 0.0001227491909174801, + "loss": 1.328, + "step": 29734 + }, + { + "epoch": 0.38639276333849326, + "grad_norm": 0.47436678409576416, + "learning_rate": 0.0001227465914555687, + "loss": 1.5347, + "step": 29735 + }, + { + "epoch": 0.3864057578824091, + "grad_norm": 0.38394227623939514, + "learning_rate": 0.0001227439919936573, + "loss": 1.5284, + "step": 29736 + }, + { + "epoch": 0.386418752426325, + "grad_norm": 0.44200798869132996, + "learning_rate": 0.00012274139253174593, + "loss": 1.3914, + "step": 29737 + }, + { + "epoch": 0.38643174697024085, + "grad_norm": 0.3390139937400818, + "learning_rate": 0.00012273879306983455, + "loss": 1.1884, + "step": 29738 + }, + { + "epoch": 0.38644474151415675, + "grad_norm": 0.37402334809303284, + "learning_rate": 0.00012273619360792318, + "loss": 1.4467, + "step": 29739 + }, + { + "epoch": 0.3864577360580726, + "grad_norm": 0.3869030475616455, + "learning_rate": 0.00012273359414601177, + "loss": 1.5286, + "step": 29740 + }, + { + "epoch": 0.3864707306019885, + "grad_norm": 0.33115485310554504, + "learning_rate": 0.0001227309946841004, + "loss": 1.3785, + "step": 29741 + }, + { + "epoch": 0.38648372514590434, + "grad_norm": 0.42454832792282104, + "learning_rate": 0.00012272839522218902, + "loss": 1.5248, + "step": 29742 + }, + { + "epoch": 0.38649671968982025, + "grad_norm": 0.38917481899261475, + "learning_rate": 0.00012272579576027762, + "loss": 1.5065, + "step": 29743 + }, + { + "epoch": 0.3865097142337361, + "grad_norm": 0.37830406427383423, + "learning_rate": 0.00012272319629836624, + "loss": 1.4404, + "step": 29744 + }, + { + "epoch": 0.386522708777652, + "grad_norm": 0.39022427797317505, + "learning_rate": 0.00012272059683645487, + "loss": 1.4768, + "step": 29745 + }, + { + "epoch": 0.38653570332156784, + "grad_norm": 0.5081313848495483, + "learning_rate": 0.0001227179973745435, + "loss": 1.4071, + "step": 29746 + }, + { + "epoch": 0.38654869786548374, + "grad_norm": 0.37116414308547974, + "learning_rate": 0.0001227153979126321, + "loss": 1.4866, + "step": 29747 + }, + { + "epoch": 0.3865616924093996, + "grad_norm": 0.3568953573703766, + "learning_rate": 0.0001227127984507207, + "loss": 1.3873, + "step": 29748 + }, + { + "epoch": 0.3865746869533155, + "grad_norm": 0.39687058329582214, + "learning_rate": 0.00012271019898880934, + "loss": 1.4239, + "step": 29749 + }, + { + "epoch": 0.38658768149723133, + "grad_norm": 0.4455108344554901, + "learning_rate": 0.00012270759952689794, + "loss": 1.3989, + "step": 29750 + }, + { + "epoch": 0.38660067604114723, + "grad_norm": 0.39055877923965454, + "learning_rate": 0.00012270500006498656, + "loss": 1.3154, + "step": 29751 + }, + { + "epoch": 0.3866136705850631, + "grad_norm": 0.3702305853366852, + "learning_rate": 0.00012270240060307516, + "loss": 1.095, + "step": 29752 + }, + { + "epoch": 0.386626665128979, + "grad_norm": 0.5071846842765808, + "learning_rate": 0.00012269980114116378, + "loss": 1.5827, + "step": 29753 + }, + { + "epoch": 0.3866396596728948, + "grad_norm": 0.3344666659832001, + "learning_rate": 0.0001226972016792524, + "loss": 1.3228, + "step": 29754 + }, + { + "epoch": 0.3866526542168107, + "grad_norm": 0.3974967896938324, + "learning_rate": 0.000122694602217341, + "loss": 1.6777, + "step": 29755 + }, + { + "epoch": 0.38666564876072657, + "grad_norm": 0.4079102873802185, + "learning_rate": 0.00012269200275542963, + "loss": 1.3265, + "step": 29756 + }, + { + "epoch": 0.38667864330464247, + "grad_norm": 0.3736034333705902, + "learning_rate": 0.00012268940329351825, + "loss": 1.337, + "step": 29757 + }, + { + "epoch": 0.3866916378485583, + "grad_norm": 0.4684407114982605, + "learning_rate": 0.00012268680383160688, + "loss": 1.4321, + "step": 29758 + }, + { + "epoch": 0.3867046323924742, + "grad_norm": 0.3875333368778229, + "learning_rate": 0.00012268420436969548, + "loss": 1.3352, + "step": 29759 + }, + { + "epoch": 0.38671762693639006, + "grad_norm": 0.4370006322860718, + "learning_rate": 0.00012268160490778407, + "loss": 1.4266, + "step": 29760 + }, + { + "epoch": 0.38673062148030596, + "grad_norm": 0.4806390404701233, + "learning_rate": 0.00012267900544587273, + "loss": 1.5746, + "step": 29761 + }, + { + "epoch": 0.3867436160242218, + "grad_norm": 0.41084182262420654, + "learning_rate": 0.00012267640598396132, + "loss": 1.468, + "step": 29762 + }, + { + "epoch": 0.3867566105681377, + "grad_norm": 0.39975109696388245, + "learning_rate": 0.00012267380652204995, + "loss": 1.5812, + "step": 29763 + }, + { + "epoch": 0.38676960511205355, + "grad_norm": 0.3874363601207733, + "learning_rate": 0.00012267120706013854, + "loss": 1.4981, + "step": 29764 + }, + { + "epoch": 0.38678259965596945, + "grad_norm": 0.38484007120132446, + "learning_rate": 0.00012266860759822717, + "loss": 1.4525, + "step": 29765 + }, + { + "epoch": 0.38679559419988535, + "grad_norm": 0.33379194140434265, + "learning_rate": 0.0001226660081363158, + "loss": 1.2499, + "step": 29766 + }, + { + "epoch": 0.3868085887438012, + "grad_norm": 0.3669845163822174, + "learning_rate": 0.0001226634086744044, + "loss": 1.4502, + "step": 29767 + }, + { + "epoch": 0.3868215832877171, + "grad_norm": 0.482738733291626, + "learning_rate": 0.00012266080921249302, + "loss": 1.2857, + "step": 29768 + }, + { + "epoch": 0.38683457783163294, + "grad_norm": 0.3862116038799286, + "learning_rate": 0.00012265820975058164, + "loss": 1.3294, + "step": 29769 + }, + { + "epoch": 0.38684757237554884, + "grad_norm": 0.4594738185405731, + "learning_rate": 0.00012265561028867026, + "loss": 1.346, + "step": 29770 + }, + { + "epoch": 0.3868605669194647, + "grad_norm": 0.4819294214248657, + "learning_rate": 0.00012265301082675886, + "loss": 1.2718, + "step": 29771 + }, + { + "epoch": 0.3868735614633806, + "grad_norm": 0.3536228835582733, + "learning_rate": 0.00012265041136484749, + "loss": 1.5036, + "step": 29772 + }, + { + "epoch": 0.38688655600729643, + "grad_norm": 0.35345199704170227, + "learning_rate": 0.0001226478119029361, + "loss": 1.3253, + "step": 29773 + }, + { + "epoch": 0.38689955055121233, + "grad_norm": 0.3304901123046875, + "learning_rate": 0.0001226452124410247, + "loss": 1.2689, + "step": 29774 + }, + { + "epoch": 0.3869125450951282, + "grad_norm": 0.4230820834636688, + "learning_rate": 0.00012264261297911333, + "loss": 1.3563, + "step": 29775 + }, + { + "epoch": 0.3869255396390441, + "grad_norm": 0.3944951295852661, + "learning_rate": 0.00012264001351720193, + "loss": 1.3652, + "step": 29776 + }, + { + "epoch": 0.3869385341829599, + "grad_norm": 0.3445724546909332, + "learning_rate": 0.00012263741405529055, + "loss": 1.293, + "step": 29777 + }, + { + "epoch": 0.3869515287268758, + "grad_norm": 0.45801207423210144, + "learning_rate": 0.00012263481459337918, + "loss": 1.4637, + "step": 29778 + }, + { + "epoch": 0.38696452327079167, + "grad_norm": 0.3891269862651825, + "learning_rate": 0.00012263221513146778, + "loss": 1.19, + "step": 29779 + }, + { + "epoch": 0.3869775178147076, + "grad_norm": 0.44965341687202454, + "learning_rate": 0.0001226296156695564, + "loss": 1.5173, + "step": 29780 + }, + { + "epoch": 0.3869905123586234, + "grad_norm": 0.4066510498523712, + "learning_rate": 0.00012262701620764503, + "loss": 1.3143, + "step": 29781 + }, + { + "epoch": 0.3870035069025393, + "grad_norm": 0.3551245331764221, + "learning_rate": 0.00012262441674573365, + "loss": 1.3002, + "step": 29782 + }, + { + "epoch": 0.38701650144645516, + "grad_norm": 0.34226372838020325, + "learning_rate": 0.00012262181728382225, + "loss": 1.3357, + "step": 29783 + }, + { + "epoch": 0.38702949599037106, + "grad_norm": 0.37835416197776794, + "learning_rate": 0.00012261921782191087, + "loss": 1.2966, + "step": 29784 + }, + { + "epoch": 0.3870424905342869, + "grad_norm": 0.42439281940460205, + "learning_rate": 0.0001226166183599995, + "loss": 1.4746, + "step": 29785 + }, + { + "epoch": 0.3870554850782028, + "grad_norm": 0.39346417784690857, + "learning_rate": 0.0001226140188980881, + "loss": 1.3842, + "step": 29786 + }, + { + "epoch": 0.38706847962211866, + "grad_norm": 0.4251452386379242, + "learning_rate": 0.00012261141943617672, + "loss": 1.4836, + "step": 29787 + }, + { + "epoch": 0.38708147416603456, + "grad_norm": 0.4480631947517395, + "learning_rate": 0.00012260881997426534, + "loss": 1.3541, + "step": 29788 + }, + { + "epoch": 0.3870944687099504, + "grad_norm": 0.3870769441127777, + "learning_rate": 0.00012260622051235394, + "loss": 1.4393, + "step": 29789 + }, + { + "epoch": 0.3871074632538663, + "grad_norm": 0.3966757357120514, + "learning_rate": 0.00012260362105044256, + "loss": 1.4477, + "step": 29790 + }, + { + "epoch": 0.38712045779778215, + "grad_norm": 0.4037858843803406, + "learning_rate": 0.00012260102158853116, + "loss": 1.3016, + "step": 29791 + }, + { + "epoch": 0.38713345234169805, + "grad_norm": 0.432411253452301, + "learning_rate": 0.0001225984221266198, + "loss": 1.3313, + "step": 29792 + }, + { + "epoch": 0.3871464468856139, + "grad_norm": 0.42230790853500366, + "learning_rate": 0.0001225958226647084, + "loss": 1.2872, + "step": 29793 + }, + { + "epoch": 0.3871594414295298, + "grad_norm": 0.4460132122039795, + "learning_rate": 0.00012259322320279703, + "loss": 1.4183, + "step": 29794 + }, + { + "epoch": 0.38717243597344564, + "grad_norm": 0.388439416885376, + "learning_rate": 0.00012259062374088563, + "loss": 1.3582, + "step": 29795 + }, + { + "epoch": 0.38718543051736154, + "grad_norm": 0.370025634765625, + "learning_rate": 0.00012258802427897426, + "loss": 1.3086, + "step": 29796 + }, + { + "epoch": 0.3871984250612774, + "grad_norm": 0.36072489619255066, + "learning_rate": 0.00012258542481706288, + "loss": 1.2024, + "step": 29797 + }, + { + "epoch": 0.3872114196051933, + "grad_norm": 0.3985675573348999, + "learning_rate": 0.00012258282535515148, + "loss": 1.4394, + "step": 29798 + }, + { + "epoch": 0.38722441414910913, + "grad_norm": 0.38333627581596375, + "learning_rate": 0.0001225802258932401, + "loss": 1.3597, + "step": 29799 + }, + { + "epoch": 0.38723740869302503, + "grad_norm": 0.47742190957069397, + "learning_rate": 0.00012257762643132873, + "loss": 1.6916, + "step": 29800 + }, + { + "epoch": 0.3872504032369409, + "grad_norm": 0.4569127559661865, + "learning_rate": 0.00012257502696941735, + "loss": 1.7027, + "step": 29801 + }, + { + "epoch": 0.3872633977808568, + "grad_norm": 0.2834679186344147, + "learning_rate": 0.00012257242750750595, + "loss": 1.2926, + "step": 29802 + }, + { + "epoch": 0.3872763923247726, + "grad_norm": 0.33267778158187866, + "learning_rate": 0.00012256982804559455, + "loss": 1.3553, + "step": 29803 + }, + { + "epoch": 0.3872893868686885, + "grad_norm": 0.35983896255493164, + "learning_rate": 0.0001225672285836832, + "loss": 1.3815, + "step": 29804 + }, + { + "epoch": 0.38730238141260437, + "grad_norm": 0.3674542009830475, + "learning_rate": 0.0001225646291217718, + "loss": 1.2445, + "step": 29805 + }, + { + "epoch": 0.38731537595652027, + "grad_norm": 0.44520899653434753, + "learning_rate": 0.00012256202965986042, + "loss": 1.3601, + "step": 29806 + }, + { + "epoch": 0.3873283705004361, + "grad_norm": 0.3955947160720825, + "learning_rate": 0.00012255943019794902, + "loss": 1.3747, + "step": 29807 + }, + { + "epoch": 0.387341365044352, + "grad_norm": 0.46005773544311523, + "learning_rate": 0.00012255683073603764, + "loss": 1.5152, + "step": 29808 + }, + { + "epoch": 0.38735435958826786, + "grad_norm": 0.4513423442840576, + "learning_rate": 0.00012255423127412627, + "loss": 1.6036, + "step": 29809 + }, + { + "epoch": 0.38736735413218376, + "grad_norm": 0.25207453966140747, + "learning_rate": 0.00012255163181221486, + "loss": 1.2006, + "step": 29810 + }, + { + "epoch": 0.3873803486760996, + "grad_norm": 0.4496898353099823, + "learning_rate": 0.0001225490323503035, + "loss": 1.4391, + "step": 29811 + }, + { + "epoch": 0.3873933432200155, + "grad_norm": 0.4895389974117279, + "learning_rate": 0.0001225464328883921, + "loss": 1.3897, + "step": 29812 + }, + { + "epoch": 0.38740633776393135, + "grad_norm": 0.7299759984016418, + "learning_rate": 0.00012254383342648074, + "loss": 1.2374, + "step": 29813 + }, + { + "epoch": 0.38741933230784725, + "grad_norm": 0.4900203049182892, + "learning_rate": 0.00012254123396456933, + "loss": 1.3894, + "step": 29814 + }, + { + "epoch": 0.3874323268517631, + "grad_norm": 0.34242701530456543, + "learning_rate": 0.00012253863450265793, + "loss": 1.2608, + "step": 29815 + }, + { + "epoch": 0.387445321395679, + "grad_norm": 0.5025469660758972, + "learning_rate": 0.00012253603504074658, + "loss": 1.5498, + "step": 29816 + }, + { + "epoch": 0.38745831593959484, + "grad_norm": 0.48988255858421326, + "learning_rate": 0.00012253343557883518, + "loss": 1.3759, + "step": 29817 + }, + { + "epoch": 0.38747131048351074, + "grad_norm": 0.3595893085002899, + "learning_rate": 0.0001225308361169238, + "loss": 1.3703, + "step": 29818 + }, + { + "epoch": 0.3874843050274266, + "grad_norm": 0.3543982207775116, + "learning_rate": 0.0001225282366550124, + "loss": 1.2888, + "step": 29819 + }, + { + "epoch": 0.3874972995713425, + "grad_norm": 0.4523742198944092, + "learning_rate": 0.00012252563719310103, + "loss": 1.3428, + "step": 29820 + }, + { + "epoch": 0.38751029411525834, + "grad_norm": 0.39910241961479187, + "learning_rate": 0.00012252303773118965, + "loss": 1.4277, + "step": 29821 + }, + { + "epoch": 0.38752328865917424, + "grad_norm": 0.4761836528778076, + "learning_rate": 0.00012252043826927825, + "loss": 1.3918, + "step": 29822 + }, + { + "epoch": 0.3875362832030901, + "grad_norm": 0.3799963593482971, + "learning_rate": 0.0001225178388073669, + "loss": 1.2415, + "step": 29823 + }, + { + "epoch": 0.387549277747006, + "grad_norm": 0.4324726462364197, + "learning_rate": 0.0001225152393454555, + "loss": 1.3634, + "step": 29824 + }, + { + "epoch": 0.3875622722909218, + "grad_norm": 0.4833022952079773, + "learning_rate": 0.00012251263988354412, + "loss": 1.5643, + "step": 29825 + }, + { + "epoch": 0.38757526683483773, + "grad_norm": 0.4042578637599945, + "learning_rate": 0.00012251004042163272, + "loss": 1.3777, + "step": 29826 + }, + { + "epoch": 0.3875882613787536, + "grad_norm": 0.3437054455280304, + "learning_rate": 0.00012250744095972134, + "loss": 1.3368, + "step": 29827 + }, + { + "epoch": 0.3876012559226695, + "grad_norm": 0.31347450613975525, + "learning_rate": 0.00012250484149780997, + "loss": 1.3307, + "step": 29828 + }, + { + "epoch": 0.3876142504665853, + "grad_norm": 0.38781213760375977, + "learning_rate": 0.00012250224203589857, + "loss": 1.4691, + "step": 29829 + }, + { + "epoch": 0.3876272450105012, + "grad_norm": 0.37810152769088745, + "learning_rate": 0.0001224996425739872, + "loss": 1.5532, + "step": 29830 + }, + { + "epoch": 0.38764023955441707, + "grad_norm": 0.3242591619491577, + "learning_rate": 0.00012249704311207582, + "loss": 1.3749, + "step": 29831 + }, + { + "epoch": 0.38765323409833297, + "grad_norm": 0.2856314182281494, + "learning_rate": 0.0001224944436501644, + "loss": 1.2521, + "step": 29832 + }, + { + "epoch": 0.3876662286422488, + "grad_norm": 0.39256522059440613, + "learning_rate": 0.00012249184418825304, + "loss": 1.4652, + "step": 29833 + }, + { + "epoch": 0.3876792231861647, + "grad_norm": 0.3368414044380188, + "learning_rate": 0.00012248924472634163, + "loss": 1.3384, + "step": 29834 + }, + { + "epoch": 0.38769221773008056, + "grad_norm": 0.4324895143508911, + "learning_rate": 0.00012248664526443029, + "loss": 1.5427, + "step": 29835 + }, + { + "epoch": 0.38770521227399646, + "grad_norm": 0.32702910900115967, + "learning_rate": 0.00012248404580251888, + "loss": 1.2738, + "step": 29836 + }, + { + "epoch": 0.3877182068179123, + "grad_norm": 0.4195633828639984, + "learning_rate": 0.0001224814463406075, + "loss": 1.4554, + "step": 29837 + }, + { + "epoch": 0.3877312013618282, + "grad_norm": 0.46183133125305176, + "learning_rate": 0.0001224788468786961, + "loss": 1.3694, + "step": 29838 + }, + { + "epoch": 0.38774419590574405, + "grad_norm": 0.39727213978767395, + "learning_rate": 0.00012247624741678473, + "loss": 1.2311, + "step": 29839 + }, + { + "epoch": 0.38775719044965995, + "grad_norm": 0.4146970808506012, + "learning_rate": 0.00012247364795487335, + "loss": 1.2301, + "step": 29840 + }, + { + "epoch": 0.3877701849935758, + "grad_norm": 0.33627963066101074, + "learning_rate": 0.00012247104849296195, + "loss": 1.1917, + "step": 29841 + }, + { + "epoch": 0.3877831795374917, + "grad_norm": 0.39488857984542847, + "learning_rate": 0.00012246844903105058, + "loss": 1.4307, + "step": 29842 + }, + { + "epoch": 0.3877961740814076, + "grad_norm": 0.4596952199935913, + "learning_rate": 0.0001224658495691392, + "loss": 1.3376, + "step": 29843 + }, + { + "epoch": 0.38780916862532344, + "grad_norm": 0.36482855677604675, + "learning_rate": 0.0001224632501072278, + "loss": 1.2941, + "step": 29844 + }, + { + "epoch": 0.38782216316923934, + "grad_norm": 0.37410250306129456, + "learning_rate": 0.00012246065064531642, + "loss": 1.4677, + "step": 29845 + }, + { + "epoch": 0.3878351577131552, + "grad_norm": 0.3789508640766144, + "learning_rate": 0.00012245805118340502, + "loss": 1.445, + "step": 29846 + }, + { + "epoch": 0.3878481522570711, + "grad_norm": 0.34893906116485596, + "learning_rate": 0.00012245545172149367, + "loss": 1.3977, + "step": 29847 + }, + { + "epoch": 0.38786114680098693, + "grad_norm": 0.35791364312171936, + "learning_rate": 0.00012245285225958227, + "loss": 1.489, + "step": 29848 + }, + { + "epoch": 0.38787414134490283, + "grad_norm": 0.32428452372550964, + "learning_rate": 0.0001224502527976709, + "loss": 1.3289, + "step": 29849 + }, + { + "epoch": 0.3878871358888187, + "grad_norm": 0.4137621223926544, + "learning_rate": 0.0001224476533357595, + "loss": 1.3861, + "step": 29850 + }, + { + "epoch": 0.3879001304327346, + "grad_norm": 0.3547743558883667, + "learning_rate": 0.00012244505387384812, + "loss": 1.5293, + "step": 29851 + }, + { + "epoch": 0.3879131249766504, + "grad_norm": 0.3912314176559448, + "learning_rate": 0.00012244245441193674, + "loss": 1.4246, + "step": 29852 + }, + { + "epoch": 0.3879261195205663, + "grad_norm": 0.40899860858917236, + "learning_rate": 0.00012243985495002534, + "loss": 1.3946, + "step": 29853 + }, + { + "epoch": 0.38793911406448217, + "grad_norm": 0.3769967555999756, + "learning_rate": 0.00012243725548811396, + "loss": 1.2737, + "step": 29854 + }, + { + "epoch": 0.38795210860839807, + "grad_norm": 0.4838295876979828, + "learning_rate": 0.00012243465602620259, + "loss": 1.5054, + "step": 29855 + }, + { + "epoch": 0.3879651031523139, + "grad_norm": 0.4463987946510315, + "learning_rate": 0.0001224320565642912, + "loss": 1.4829, + "step": 29856 + }, + { + "epoch": 0.3879780976962298, + "grad_norm": 0.24521949887275696, + "learning_rate": 0.0001224294571023798, + "loss": 1.3377, + "step": 29857 + }, + { + "epoch": 0.38799109224014566, + "grad_norm": 0.45139047503471375, + "learning_rate": 0.00012242685764046843, + "loss": 1.4292, + "step": 29858 + }, + { + "epoch": 0.38800408678406156, + "grad_norm": 0.4290568232536316, + "learning_rate": 0.00012242425817855706, + "loss": 1.557, + "step": 29859 + }, + { + "epoch": 0.3880170813279774, + "grad_norm": 0.3773767054080963, + "learning_rate": 0.00012242165871664565, + "loss": 1.334, + "step": 29860 + }, + { + "epoch": 0.3880300758718933, + "grad_norm": 0.27543938159942627, + "learning_rate": 0.00012241905925473428, + "loss": 1.2525, + "step": 29861 + }, + { + "epoch": 0.38804307041580915, + "grad_norm": 0.34243497252464294, + "learning_rate": 0.0001224164597928229, + "loss": 1.1238, + "step": 29862 + }, + { + "epoch": 0.38805606495972506, + "grad_norm": 0.4504084587097168, + "learning_rate": 0.0001224138603309115, + "loss": 1.2286, + "step": 29863 + }, + { + "epoch": 0.3880690595036409, + "grad_norm": 0.4480748474597931, + "learning_rate": 0.00012241126086900013, + "loss": 1.4154, + "step": 29864 + }, + { + "epoch": 0.3880820540475568, + "grad_norm": 0.5171236395835876, + "learning_rate": 0.00012240866140708872, + "loss": 1.2709, + "step": 29865 + }, + { + "epoch": 0.38809504859147265, + "grad_norm": 0.38499966263771057, + "learning_rate": 0.00012240606194517737, + "loss": 1.3089, + "step": 29866 + }, + { + "epoch": 0.38810804313538855, + "grad_norm": 0.35608506202697754, + "learning_rate": 0.00012240346248326597, + "loss": 1.5467, + "step": 29867 + }, + { + "epoch": 0.3881210376793044, + "grad_norm": 0.4202132821083069, + "learning_rate": 0.0001224008630213546, + "loss": 1.5517, + "step": 29868 + }, + { + "epoch": 0.3881340322232203, + "grad_norm": 0.331220418214798, + "learning_rate": 0.0001223982635594432, + "loss": 1.1938, + "step": 29869 + }, + { + "epoch": 0.38814702676713614, + "grad_norm": 0.35433098673820496, + "learning_rate": 0.00012239566409753182, + "loss": 1.3799, + "step": 29870 + }, + { + "epoch": 0.38816002131105204, + "grad_norm": 0.5086492896080017, + "learning_rate": 0.00012239306463562044, + "loss": 1.4711, + "step": 29871 + }, + { + "epoch": 0.3881730158549679, + "grad_norm": 0.4203968346118927, + "learning_rate": 0.00012239046517370904, + "loss": 1.4212, + "step": 29872 + }, + { + "epoch": 0.3881860103988838, + "grad_norm": 0.5037051439285278, + "learning_rate": 0.00012238786571179766, + "loss": 1.5708, + "step": 29873 + }, + { + "epoch": 0.38819900494279963, + "grad_norm": 0.3497743308544159, + "learning_rate": 0.0001223852662498863, + "loss": 1.2971, + "step": 29874 + }, + { + "epoch": 0.38821199948671553, + "grad_norm": 0.35463201999664307, + "learning_rate": 0.00012238266678797489, + "loss": 1.4071, + "step": 29875 + }, + { + "epoch": 0.3882249940306314, + "grad_norm": 0.4891999661922455, + "learning_rate": 0.0001223800673260635, + "loss": 1.5178, + "step": 29876 + }, + { + "epoch": 0.3882379885745473, + "grad_norm": 0.39860543608665466, + "learning_rate": 0.0001223774678641521, + "loss": 1.2939, + "step": 29877 + }, + { + "epoch": 0.3882509831184631, + "grad_norm": 0.3721354603767395, + "learning_rate": 0.00012237486840224076, + "loss": 1.3603, + "step": 29878 + }, + { + "epoch": 0.388263977662379, + "grad_norm": 0.43463608622550964, + "learning_rate": 0.00012237226894032936, + "loss": 1.4447, + "step": 29879 + }, + { + "epoch": 0.38827697220629487, + "grad_norm": 0.3414178490638733, + "learning_rate": 0.00012236966947841798, + "loss": 1.2873, + "step": 29880 + }, + { + "epoch": 0.38828996675021077, + "grad_norm": 0.3297525644302368, + "learning_rate": 0.00012236707001650658, + "loss": 1.4836, + "step": 29881 + }, + { + "epoch": 0.3883029612941266, + "grad_norm": 0.4560333490371704, + "learning_rate": 0.0001223644705545952, + "loss": 1.3534, + "step": 29882 + }, + { + "epoch": 0.3883159558380425, + "grad_norm": 0.4555419385433197, + "learning_rate": 0.00012236187109268383, + "loss": 1.4457, + "step": 29883 + }, + { + "epoch": 0.38832895038195836, + "grad_norm": 0.30804577469825745, + "learning_rate": 0.00012235927163077243, + "loss": 1.0228, + "step": 29884 + }, + { + "epoch": 0.38834194492587426, + "grad_norm": 0.3303452432155609, + "learning_rate": 0.00012235667216886105, + "loss": 1.3975, + "step": 29885 + }, + { + "epoch": 0.3883549394697901, + "grad_norm": 0.2807534337043762, + "learning_rate": 0.00012235407270694967, + "loss": 1.1644, + "step": 29886 + }, + { + "epoch": 0.388367934013706, + "grad_norm": 0.4142604470252991, + "learning_rate": 0.00012235147324503827, + "loss": 1.3199, + "step": 29887 + }, + { + "epoch": 0.38838092855762185, + "grad_norm": 0.40739983320236206, + "learning_rate": 0.0001223488737831269, + "loss": 1.5283, + "step": 29888 + }, + { + "epoch": 0.38839392310153775, + "grad_norm": 0.46091997623443604, + "learning_rate": 0.0001223462743212155, + "loss": 1.3917, + "step": 29889 + }, + { + "epoch": 0.3884069176454536, + "grad_norm": 0.4044102430343628, + "learning_rate": 0.00012234367485930415, + "loss": 1.4336, + "step": 29890 + }, + { + "epoch": 0.3884199121893695, + "grad_norm": 0.38766008615493774, + "learning_rate": 0.00012234107539739274, + "loss": 1.194, + "step": 29891 + }, + { + "epoch": 0.38843290673328534, + "grad_norm": 0.38603052496910095, + "learning_rate": 0.00012233847593548137, + "loss": 1.2864, + "step": 29892 + }, + { + "epoch": 0.38844590127720124, + "grad_norm": 0.4562664330005646, + "learning_rate": 0.00012233587647356996, + "loss": 1.5865, + "step": 29893 + }, + { + "epoch": 0.3884588958211171, + "grad_norm": 0.4481065273284912, + "learning_rate": 0.0001223332770116586, + "loss": 1.2825, + "step": 29894 + }, + { + "epoch": 0.388471890365033, + "grad_norm": 0.435531884431839, + "learning_rate": 0.0001223306775497472, + "loss": 1.2549, + "step": 29895 + }, + { + "epoch": 0.38848488490894884, + "grad_norm": 0.38524964451789856, + "learning_rate": 0.0001223280780878358, + "loss": 1.4728, + "step": 29896 + }, + { + "epoch": 0.38849787945286474, + "grad_norm": 0.4691812992095947, + "learning_rate": 0.00012232547862592446, + "loss": 1.5518, + "step": 29897 + }, + { + "epoch": 0.3885108739967806, + "grad_norm": 0.525292694568634, + "learning_rate": 0.00012232287916401306, + "loss": 1.5359, + "step": 29898 + }, + { + "epoch": 0.3885238685406965, + "grad_norm": 0.5330997109413147, + "learning_rate": 0.00012232027970210166, + "loss": 1.2279, + "step": 29899 + }, + { + "epoch": 0.3885368630846123, + "grad_norm": 0.41754022240638733, + "learning_rate": 0.00012231768024019028, + "loss": 1.5521, + "step": 29900 + }, + { + "epoch": 0.38854985762852823, + "grad_norm": 0.44540759921073914, + "learning_rate": 0.0001223150807782789, + "loss": 1.4199, + "step": 29901 + }, + { + "epoch": 0.3885628521724441, + "grad_norm": 0.39884260296821594, + "learning_rate": 0.00012231248131636753, + "loss": 1.5216, + "step": 29902 + }, + { + "epoch": 0.38857584671636, + "grad_norm": 0.44731393456459045, + "learning_rate": 0.00012230988185445613, + "loss": 1.3533, + "step": 29903 + }, + { + "epoch": 0.3885888412602758, + "grad_norm": 0.36042359471321106, + "learning_rate": 0.00012230728239254475, + "loss": 1.2104, + "step": 29904 + }, + { + "epoch": 0.3886018358041917, + "grad_norm": 0.3310069739818573, + "learning_rate": 0.00012230468293063338, + "loss": 1.3148, + "step": 29905 + }, + { + "epoch": 0.38861483034810756, + "grad_norm": 0.3812437355518341, + "learning_rate": 0.00012230208346872197, + "loss": 1.5159, + "step": 29906 + }, + { + "epoch": 0.38862782489202347, + "grad_norm": 0.3621780276298523, + "learning_rate": 0.0001222994840068106, + "loss": 1.2924, + "step": 29907 + }, + { + "epoch": 0.3886408194359393, + "grad_norm": 0.47713184356689453, + "learning_rate": 0.0001222968845448992, + "loss": 1.6367, + "step": 29908 + }, + { + "epoch": 0.3886538139798552, + "grad_norm": 0.3496624529361725, + "learning_rate": 0.00012229428508298785, + "loss": 1.3997, + "step": 29909 + }, + { + "epoch": 0.38866680852377106, + "grad_norm": 0.30547887086868286, + "learning_rate": 0.00012229168562107645, + "loss": 1.4512, + "step": 29910 + }, + { + "epoch": 0.38867980306768696, + "grad_norm": 0.4067460596561432, + "learning_rate": 0.00012228908615916507, + "loss": 1.3028, + "step": 29911 + }, + { + "epoch": 0.3886927976116028, + "grad_norm": 0.36536532640457153, + "learning_rate": 0.00012228648669725367, + "loss": 1.4354, + "step": 29912 + }, + { + "epoch": 0.3887057921555187, + "grad_norm": 0.4010399878025055, + "learning_rate": 0.0001222838872353423, + "loss": 1.2371, + "step": 29913 + }, + { + "epoch": 0.38871878669943455, + "grad_norm": 0.48741745948791504, + "learning_rate": 0.00012228128777343092, + "loss": 1.4684, + "step": 29914 + }, + { + "epoch": 0.38873178124335045, + "grad_norm": 0.502138614654541, + "learning_rate": 0.0001222786883115195, + "loss": 1.43, + "step": 29915 + }, + { + "epoch": 0.3887447757872663, + "grad_norm": 0.3628387153148651, + "learning_rate": 0.00012227608884960814, + "loss": 1.3516, + "step": 29916 + }, + { + "epoch": 0.3887577703311822, + "grad_norm": 0.42328667640686035, + "learning_rate": 0.00012227348938769676, + "loss": 1.4117, + "step": 29917 + }, + { + "epoch": 0.38877076487509804, + "grad_norm": 0.36301475763320923, + "learning_rate": 0.00012227088992578536, + "loss": 1.2453, + "step": 29918 + }, + { + "epoch": 0.38878375941901394, + "grad_norm": 0.4574221074581146, + "learning_rate": 0.00012226829046387398, + "loss": 1.3164, + "step": 29919 + }, + { + "epoch": 0.38879675396292984, + "grad_norm": 0.5163411498069763, + "learning_rate": 0.00012226569100196258, + "loss": 1.4778, + "step": 29920 + }, + { + "epoch": 0.3888097485068457, + "grad_norm": 0.4122473895549774, + "learning_rate": 0.00012226309154005123, + "loss": 1.3811, + "step": 29921 + }, + { + "epoch": 0.3888227430507616, + "grad_norm": 0.42602020502090454, + "learning_rate": 0.00012226049207813983, + "loss": 1.5016, + "step": 29922 + }, + { + "epoch": 0.38883573759467743, + "grad_norm": 0.44233569502830505, + "learning_rate": 0.00012225789261622845, + "loss": 1.3322, + "step": 29923 + }, + { + "epoch": 0.38884873213859333, + "grad_norm": 0.35932064056396484, + "learning_rate": 0.00012225529315431705, + "loss": 1.2632, + "step": 29924 + }, + { + "epoch": 0.3888617266825092, + "grad_norm": 0.3449583351612091, + "learning_rate": 0.00012225269369240568, + "loss": 1.1625, + "step": 29925 + }, + { + "epoch": 0.3888747212264251, + "grad_norm": 0.43039581179618835, + "learning_rate": 0.0001222500942304943, + "loss": 1.3463, + "step": 29926 + }, + { + "epoch": 0.3888877157703409, + "grad_norm": 0.41915011405944824, + "learning_rate": 0.0001222474947685829, + "loss": 1.2752, + "step": 29927 + }, + { + "epoch": 0.3889007103142568, + "grad_norm": 0.49146372079849243, + "learning_rate": 0.00012224489530667152, + "loss": 1.5694, + "step": 29928 + }, + { + "epoch": 0.38891370485817267, + "grad_norm": 0.37312987446784973, + "learning_rate": 0.00012224229584476015, + "loss": 1.2486, + "step": 29929 + }, + { + "epoch": 0.38892669940208857, + "grad_norm": 0.4269731938838959, + "learning_rate": 0.00012223969638284875, + "loss": 1.3001, + "step": 29930 + }, + { + "epoch": 0.3889396939460044, + "grad_norm": 0.38001537322998047, + "learning_rate": 0.00012223709692093737, + "loss": 1.307, + "step": 29931 + }, + { + "epoch": 0.3889526884899203, + "grad_norm": 0.4600802958011627, + "learning_rate": 0.000122234497459026, + "loss": 1.4522, + "step": 29932 + }, + { + "epoch": 0.38896568303383616, + "grad_norm": 0.4012828767299652, + "learning_rate": 0.00012223189799711462, + "loss": 1.5026, + "step": 29933 + }, + { + "epoch": 0.38897867757775206, + "grad_norm": 0.39937078952789307, + "learning_rate": 0.00012222929853520322, + "loss": 1.3766, + "step": 29934 + }, + { + "epoch": 0.3889916721216679, + "grad_norm": 0.4260209798812866, + "learning_rate": 0.00012222669907329184, + "loss": 1.5353, + "step": 29935 + }, + { + "epoch": 0.3890046666655838, + "grad_norm": 0.4753043055534363, + "learning_rate": 0.00012222409961138046, + "loss": 1.2501, + "step": 29936 + }, + { + "epoch": 0.38901766120949965, + "grad_norm": 0.3294399082660675, + "learning_rate": 0.00012222150014946906, + "loss": 1.5208, + "step": 29937 + }, + { + "epoch": 0.38903065575341556, + "grad_norm": 0.4828750193119049, + "learning_rate": 0.0001222189006875577, + "loss": 1.3379, + "step": 29938 + }, + { + "epoch": 0.3890436502973314, + "grad_norm": 0.48309463262557983, + "learning_rate": 0.00012221630122564628, + "loss": 1.5085, + "step": 29939 + }, + { + "epoch": 0.3890566448412473, + "grad_norm": 0.4016285836696625, + "learning_rate": 0.00012221370176373494, + "loss": 1.5111, + "step": 29940 + }, + { + "epoch": 0.38906963938516315, + "grad_norm": 0.4267188310623169, + "learning_rate": 0.00012221110230182353, + "loss": 1.4579, + "step": 29941 + }, + { + "epoch": 0.38908263392907905, + "grad_norm": 0.3516971468925476, + "learning_rate": 0.00012220850283991213, + "loss": 1.4335, + "step": 29942 + }, + { + "epoch": 0.3890956284729949, + "grad_norm": 0.37167680263519287, + "learning_rate": 0.00012220590337800075, + "loss": 1.4175, + "step": 29943 + }, + { + "epoch": 0.3891086230169108, + "grad_norm": 0.39827388525009155, + "learning_rate": 0.00012220330391608938, + "loss": 1.3063, + "step": 29944 + }, + { + "epoch": 0.38912161756082664, + "grad_norm": 0.34414955973625183, + "learning_rate": 0.000122200704454178, + "loss": 1.3821, + "step": 29945 + }, + { + "epoch": 0.38913461210474254, + "grad_norm": 0.3396826982498169, + "learning_rate": 0.0001221981049922666, + "loss": 1.4192, + "step": 29946 + }, + { + "epoch": 0.3891476066486584, + "grad_norm": 0.2613384425640106, + "learning_rate": 0.00012219550553035523, + "loss": 1.3163, + "step": 29947 + }, + { + "epoch": 0.3891606011925743, + "grad_norm": 0.46040889620780945, + "learning_rate": 0.00012219290606844385, + "loss": 1.3776, + "step": 29948 + }, + { + "epoch": 0.38917359573649013, + "grad_norm": 0.3295203447341919, + "learning_rate": 0.00012219030660653245, + "loss": 1.3544, + "step": 29949 + }, + { + "epoch": 0.38918659028040603, + "grad_norm": 0.3597117066383362, + "learning_rate": 0.00012218770714462107, + "loss": 1.3296, + "step": 29950 + }, + { + "epoch": 0.3891995848243219, + "grad_norm": 0.40903425216674805, + "learning_rate": 0.00012218510768270967, + "loss": 1.477, + "step": 29951 + }, + { + "epoch": 0.3892125793682378, + "grad_norm": 0.39330413937568665, + "learning_rate": 0.00012218250822079832, + "loss": 1.4333, + "step": 29952 + }, + { + "epoch": 0.3892255739121536, + "grad_norm": 0.3866601586341858, + "learning_rate": 0.00012217990875888692, + "loss": 1.4585, + "step": 29953 + }, + { + "epoch": 0.3892385684560695, + "grad_norm": 0.4674544334411621, + "learning_rate": 0.00012217730929697552, + "loss": 1.5484, + "step": 29954 + }, + { + "epoch": 0.38925156299998537, + "grad_norm": 0.4730983376502991, + "learning_rate": 0.00012217470983506414, + "loss": 1.4724, + "step": 29955 + }, + { + "epoch": 0.38926455754390127, + "grad_norm": 0.5048171281814575, + "learning_rate": 0.00012217211037315276, + "loss": 1.4768, + "step": 29956 + }, + { + "epoch": 0.3892775520878171, + "grad_norm": 0.4015820026397705, + "learning_rate": 0.0001221695109112414, + "loss": 1.3504, + "step": 29957 + }, + { + "epoch": 0.389290546631733, + "grad_norm": 0.3920265734195709, + "learning_rate": 0.00012216691144933, + "loss": 1.3823, + "step": 29958 + }, + { + "epoch": 0.38930354117564886, + "grad_norm": 0.4584847092628479, + "learning_rate": 0.0001221643119874186, + "loss": 1.5047, + "step": 29959 + }, + { + "epoch": 0.38931653571956476, + "grad_norm": 0.41017472743988037, + "learning_rate": 0.00012216171252550724, + "loss": 1.306, + "step": 29960 + }, + { + "epoch": 0.3893295302634806, + "grad_norm": 0.3940582573413849, + "learning_rate": 0.00012215911306359583, + "loss": 1.3614, + "step": 29961 + }, + { + "epoch": 0.3893425248073965, + "grad_norm": 0.40765923261642456, + "learning_rate": 0.00012215651360168446, + "loss": 1.4318, + "step": 29962 + }, + { + "epoch": 0.38935551935131235, + "grad_norm": 0.2614087164402008, + "learning_rate": 0.00012215391413977305, + "loss": 1.4034, + "step": 29963 + }, + { + "epoch": 0.38936851389522825, + "grad_norm": 0.4263293445110321, + "learning_rate": 0.0001221513146778617, + "loss": 1.5482, + "step": 29964 + }, + { + "epoch": 0.3893815084391441, + "grad_norm": 0.3633301258087158, + "learning_rate": 0.0001221487152159503, + "loss": 1.3543, + "step": 29965 + }, + { + "epoch": 0.38939450298306, + "grad_norm": 0.33783143758773804, + "learning_rate": 0.0001221461157540389, + "loss": 1.4277, + "step": 29966 + }, + { + "epoch": 0.38940749752697584, + "grad_norm": 0.33738669753074646, + "learning_rate": 0.00012214351629212753, + "loss": 1.3975, + "step": 29967 + }, + { + "epoch": 0.38942049207089174, + "grad_norm": 0.4516810178756714, + "learning_rate": 0.00012214091683021615, + "loss": 1.2123, + "step": 29968 + }, + { + "epoch": 0.3894334866148076, + "grad_norm": 0.41029927134513855, + "learning_rate": 0.00012213831736830477, + "loss": 1.2133, + "step": 29969 + }, + { + "epoch": 0.3894464811587235, + "grad_norm": 0.4828219711780548, + "learning_rate": 0.00012213571790639337, + "loss": 1.4444, + "step": 29970 + }, + { + "epoch": 0.38945947570263934, + "grad_norm": 0.32723382115364075, + "learning_rate": 0.000122133118444482, + "loss": 1.2271, + "step": 29971 + }, + { + "epoch": 0.38947247024655524, + "grad_norm": 0.479070782661438, + "learning_rate": 0.00012213051898257062, + "loss": 1.4988, + "step": 29972 + }, + { + "epoch": 0.3894854647904711, + "grad_norm": 0.2727302014827728, + "learning_rate": 0.00012212791952065922, + "loss": 1.4481, + "step": 29973 + }, + { + "epoch": 0.389498459334387, + "grad_norm": 0.3500700294971466, + "learning_rate": 0.00012212532005874784, + "loss": 1.3542, + "step": 29974 + }, + { + "epoch": 0.3895114538783028, + "grad_norm": 0.46399515867233276, + "learning_rate": 0.00012212272059683647, + "loss": 1.5085, + "step": 29975 + }, + { + "epoch": 0.3895244484222187, + "grad_norm": 0.4055573046207428, + "learning_rate": 0.0001221201211349251, + "loss": 1.3124, + "step": 29976 + }, + { + "epoch": 0.3895374429661346, + "grad_norm": 0.448212206363678, + "learning_rate": 0.0001221175216730137, + "loss": 1.3962, + "step": 29977 + }, + { + "epoch": 0.3895504375100505, + "grad_norm": 0.448320209980011, + "learning_rate": 0.00012211492221110231, + "loss": 1.4562, + "step": 29978 + }, + { + "epoch": 0.3895634320539663, + "grad_norm": 0.5457174181938171, + "learning_rate": 0.00012211232274919094, + "loss": 1.474, + "step": 29979 + }, + { + "epoch": 0.3895764265978822, + "grad_norm": 0.35137537121772766, + "learning_rate": 0.00012210972328727954, + "loss": 1.2626, + "step": 29980 + }, + { + "epoch": 0.38958942114179806, + "grad_norm": 0.37876641750335693, + "learning_rate": 0.00012210712382536816, + "loss": 1.2808, + "step": 29981 + }, + { + "epoch": 0.38960241568571397, + "grad_norm": 0.38676291704177856, + "learning_rate": 0.00012210452436345676, + "loss": 1.3523, + "step": 29982 + }, + { + "epoch": 0.3896154102296298, + "grad_norm": 0.43328115344047546, + "learning_rate": 0.00012210192490154538, + "loss": 1.2519, + "step": 29983 + }, + { + "epoch": 0.3896284047735457, + "grad_norm": 0.4008753299713135, + "learning_rate": 0.000122099325439634, + "loss": 1.4583, + "step": 29984 + }, + { + "epoch": 0.38964139931746156, + "grad_norm": 0.476602166891098, + "learning_rate": 0.0001220967259777226, + "loss": 1.4223, + "step": 29985 + }, + { + "epoch": 0.38965439386137746, + "grad_norm": 0.37312886118888855, + "learning_rate": 0.00012209412651581123, + "loss": 1.3438, + "step": 29986 + }, + { + "epoch": 0.3896673884052933, + "grad_norm": 0.4831525385379791, + "learning_rate": 0.00012209152705389985, + "loss": 1.4136, + "step": 29987 + }, + { + "epoch": 0.3896803829492092, + "grad_norm": 0.4432586431503296, + "learning_rate": 0.00012208892759198848, + "loss": 1.3598, + "step": 29988 + }, + { + "epoch": 0.38969337749312505, + "grad_norm": 0.37609317898750305, + "learning_rate": 0.00012208632813007707, + "loss": 1.3896, + "step": 29989 + }, + { + "epoch": 0.38970637203704095, + "grad_norm": 0.334155410528183, + "learning_rate": 0.0001220837286681657, + "loss": 1.4285, + "step": 29990 + }, + { + "epoch": 0.3897193665809568, + "grad_norm": 0.42372921109199524, + "learning_rate": 0.00012208112920625432, + "loss": 1.3467, + "step": 29991 + }, + { + "epoch": 0.3897323611248727, + "grad_norm": 0.40522122383117676, + "learning_rate": 0.00012207852974434292, + "loss": 1.5003, + "step": 29992 + }, + { + "epoch": 0.38974535566878854, + "grad_norm": 0.4145407974720001, + "learning_rate": 0.00012207593028243155, + "loss": 1.6396, + "step": 29993 + }, + { + "epoch": 0.38975835021270444, + "grad_norm": 0.394796758890152, + "learning_rate": 0.00012207333082052014, + "loss": 1.383, + "step": 29994 + }, + { + "epoch": 0.38977134475662034, + "grad_norm": 0.42275163531303406, + "learning_rate": 0.0001220707313586088, + "loss": 1.2702, + "step": 29995 + }, + { + "epoch": 0.3897843393005362, + "grad_norm": 0.3760804235935211, + "learning_rate": 0.00012206813189669739, + "loss": 1.3983, + "step": 29996 + }, + { + "epoch": 0.3897973338444521, + "grad_norm": 0.48112455010414124, + "learning_rate": 0.000122065532434786, + "loss": 1.4734, + "step": 29997 + }, + { + "epoch": 0.38981032838836793, + "grad_norm": 0.47838762402534485, + "learning_rate": 0.00012206293297287461, + "loss": 1.3017, + "step": 29998 + }, + { + "epoch": 0.38982332293228383, + "grad_norm": 0.40505892038345337, + "learning_rate": 0.00012206033351096324, + "loss": 1.291, + "step": 29999 + }, + { + "epoch": 0.3898363174761997, + "grad_norm": 0.3999340832233429, + "learning_rate": 0.00012205773404905185, + "loss": 1.6041, + "step": 30000 + }, + { + "epoch": 0.3898493120201156, + "grad_norm": 0.4098939299583435, + "learning_rate": 0.00012205513458714046, + "loss": 1.5688, + "step": 30001 + }, + { + "epoch": 0.3898623065640314, + "grad_norm": 0.38183093070983887, + "learning_rate": 0.00012205253512522907, + "loss": 1.3582, + "step": 30002 + }, + { + "epoch": 0.3898753011079473, + "grad_norm": 0.4575520157814026, + "learning_rate": 0.00012204993566331771, + "loss": 1.1648, + "step": 30003 + }, + { + "epoch": 0.38988829565186317, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.00012204733620140632, + "loss": 1.2611, + "step": 30004 + }, + { + "epoch": 0.38990129019577907, + "grad_norm": 0.4148501455783844, + "learning_rate": 0.00012204473673949493, + "loss": 1.3929, + "step": 30005 + }, + { + "epoch": 0.3899142847396949, + "grad_norm": 0.36465275287628174, + "learning_rate": 0.00012204213727758356, + "loss": 1.4328, + "step": 30006 + }, + { + "epoch": 0.3899272792836108, + "grad_norm": 0.3923112452030182, + "learning_rate": 0.00012203953781567217, + "loss": 1.4388, + "step": 30007 + }, + { + "epoch": 0.38994027382752666, + "grad_norm": 0.35317128896713257, + "learning_rate": 0.00012203693835376078, + "loss": 1.3366, + "step": 30008 + }, + { + "epoch": 0.38995326837144256, + "grad_norm": 0.36921223998069763, + "learning_rate": 0.00012203433889184939, + "loss": 1.2926, + "step": 30009 + }, + { + "epoch": 0.3899662629153584, + "grad_norm": 0.34526345133781433, + "learning_rate": 0.00012203173942993801, + "loss": 1.334, + "step": 30010 + }, + { + "epoch": 0.3899792574592743, + "grad_norm": 0.44029760360717773, + "learning_rate": 0.00012202913996802662, + "loss": 1.2934, + "step": 30011 + }, + { + "epoch": 0.38999225200319015, + "grad_norm": 0.4285171627998352, + "learning_rate": 0.00012202654050611523, + "loss": 1.4166, + "step": 30012 + }, + { + "epoch": 0.39000524654710605, + "grad_norm": 0.39532583951950073, + "learning_rate": 0.00012202394104420385, + "loss": 1.61, + "step": 30013 + }, + { + "epoch": 0.3900182410910219, + "grad_norm": 0.3894072473049164, + "learning_rate": 0.00012202134158229248, + "loss": 1.467, + "step": 30014 + }, + { + "epoch": 0.3900312356349378, + "grad_norm": 0.3797360360622406, + "learning_rate": 0.0001220187421203811, + "loss": 1.443, + "step": 30015 + }, + { + "epoch": 0.39004423017885365, + "grad_norm": 0.5540939569473267, + "learning_rate": 0.0001220161426584697, + "loss": 1.3629, + "step": 30016 + }, + { + "epoch": 0.39005722472276955, + "grad_norm": 0.34394213557243347, + "learning_rate": 0.00012201354319655832, + "loss": 1.4709, + "step": 30017 + }, + { + "epoch": 0.3900702192666854, + "grad_norm": 0.40100544691085815, + "learning_rate": 0.00012201094373464694, + "loss": 1.3265, + "step": 30018 + }, + { + "epoch": 0.3900832138106013, + "grad_norm": 0.5185689330101013, + "learning_rate": 0.00012200834427273555, + "loss": 1.3884, + "step": 30019 + }, + { + "epoch": 0.39009620835451714, + "grad_norm": 0.3766475021839142, + "learning_rate": 0.00012200574481082416, + "loss": 1.3758, + "step": 30020 + }, + { + "epoch": 0.39010920289843304, + "grad_norm": 0.4197940528392792, + "learning_rate": 0.00012200314534891277, + "loss": 1.4424, + "step": 30021 + }, + { + "epoch": 0.3901221974423489, + "grad_norm": 0.29997891187667847, + "learning_rate": 0.00012200054588700141, + "loss": 1.2417, + "step": 30022 + }, + { + "epoch": 0.3901351919862648, + "grad_norm": 0.4618741273880005, + "learning_rate": 0.00012199794642509001, + "loss": 1.3843, + "step": 30023 + }, + { + "epoch": 0.39014818653018063, + "grad_norm": 0.545307993888855, + "learning_rate": 0.00012199534696317862, + "loss": 1.4275, + "step": 30024 + }, + { + "epoch": 0.39016118107409653, + "grad_norm": 0.43186327815055847, + "learning_rate": 0.00012199274750126723, + "loss": 1.444, + "step": 30025 + }, + { + "epoch": 0.3901741756180124, + "grad_norm": 0.34508249163627625, + "learning_rate": 0.00012199014803935587, + "loss": 1.4424, + "step": 30026 + }, + { + "epoch": 0.3901871701619283, + "grad_norm": 0.3395686745643616, + "learning_rate": 0.00012198754857744448, + "loss": 1.4116, + "step": 30027 + }, + { + "epoch": 0.3902001647058441, + "grad_norm": 0.3685622215270996, + "learning_rate": 0.00012198494911553309, + "loss": 1.3384, + "step": 30028 + }, + { + "epoch": 0.39021315924976, + "grad_norm": 0.39594343304634094, + "learning_rate": 0.0001219823496536217, + "loss": 1.4782, + "step": 30029 + }, + { + "epoch": 0.39022615379367587, + "grad_norm": 0.4663515090942383, + "learning_rate": 0.00012197975019171033, + "loss": 1.3149, + "step": 30030 + }, + { + "epoch": 0.39023914833759177, + "grad_norm": 0.40722721815109253, + "learning_rate": 0.00012197715072979894, + "loss": 1.3377, + "step": 30031 + }, + { + "epoch": 0.3902521428815076, + "grad_norm": 0.4376753866672516, + "learning_rate": 0.00012197455126788755, + "loss": 1.4647, + "step": 30032 + }, + { + "epoch": 0.3902651374254235, + "grad_norm": 0.4399189054965973, + "learning_rate": 0.00012197195180597616, + "loss": 1.2996, + "step": 30033 + }, + { + "epoch": 0.39027813196933936, + "grad_norm": 0.4426305592060089, + "learning_rate": 0.0001219693523440648, + "loss": 1.2137, + "step": 30034 + }, + { + "epoch": 0.39029112651325526, + "grad_norm": 0.4404280185699463, + "learning_rate": 0.00012196675288215341, + "loss": 1.4289, + "step": 30035 + }, + { + "epoch": 0.3903041210571711, + "grad_norm": 0.3486882150173187, + "learning_rate": 0.000121964153420242, + "loss": 1.3242, + "step": 30036 + }, + { + "epoch": 0.390317115601087, + "grad_norm": 0.40671306848526, + "learning_rate": 0.00012196155395833062, + "loss": 1.4706, + "step": 30037 + }, + { + "epoch": 0.39033011014500285, + "grad_norm": 0.3922707140445709, + "learning_rate": 0.00012195895449641925, + "loss": 1.4094, + "step": 30038 + }, + { + "epoch": 0.39034310468891875, + "grad_norm": 0.38598543405532837, + "learning_rate": 0.00012195635503450787, + "loss": 1.266, + "step": 30039 + }, + { + "epoch": 0.3903560992328346, + "grad_norm": 0.43411827087402344, + "learning_rate": 0.00012195375557259648, + "loss": 1.2387, + "step": 30040 + }, + { + "epoch": 0.3903690937767505, + "grad_norm": 0.4489670395851135, + "learning_rate": 0.00012195115611068509, + "loss": 1.5064, + "step": 30041 + }, + { + "epoch": 0.39038208832066634, + "grad_norm": 0.4152943193912506, + "learning_rate": 0.00012194855664877371, + "loss": 1.4929, + "step": 30042 + }, + { + "epoch": 0.39039508286458224, + "grad_norm": 0.4033881723880768, + "learning_rate": 0.00012194595718686232, + "loss": 1.495, + "step": 30043 + }, + { + "epoch": 0.3904080774084981, + "grad_norm": 0.3771362006664276, + "learning_rate": 0.00012194335772495093, + "loss": 1.4386, + "step": 30044 + }, + { + "epoch": 0.390421071952414, + "grad_norm": 0.40339937806129456, + "learning_rate": 0.00012194075826303957, + "loss": 1.3571, + "step": 30045 + }, + { + "epoch": 0.39043406649632983, + "grad_norm": 0.43747755885124207, + "learning_rate": 0.00012193815880112818, + "loss": 1.32, + "step": 30046 + }, + { + "epoch": 0.39044706104024574, + "grad_norm": 0.34134072065353394, + "learning_rate": 0.00012193555933921679, + "loss": 1.3762, + "step": 30047 + }, + { + "epoch": 0.3904600555841616, + "grad_norm": 0.42860689759254456, + "learning_rate": 0.00012193295987730539, + "loss": 1.3962, + "step": 30048 + }, + { + "epoch": 0.3904730501280775, + "grad_norm": 0.3270234763622284, + "learning_rate": 0.00012193036041539403, + "loss": 1.5259, + "step": 30049 + }, + { + "epoch": 0.3904860446719933, + "grad_norm": 0.38852789998054504, + "learning_rate": 0.00012192776095348264, + "loss": 1.4467, + "step": 30050 + }, + { + "epoch": 0.3904990392159092, + "grad_norm": 0.32282713055610657, + "learning_rate": 0.00012192516149157125, + "loss": 1.3952, + "step": 30051 + }, + { + "epoch": 0.3905120337598251, + "grad_norm": 0.3749118149280548, + "learning_rate": 0.00012192256202965986, + "loss": 1.4419, + "step": 30052 + }, + { + "epoch": 0.390525028303741, + "grad_norm": 0.37093400955200195, + "learning_rate": 0.00012191996256774849, + "loss": 1.3535, + "step": 30053 + }, + { + "epoch": 0.3905380228476568, + "grad_norm": 0.42257416248321533, + "learning_rate": 0.0001219173631058371, + "loss": 1.3667, + "step": 30054 + }, + { + "epoch": 0.3905510173915727, + "grad_norm": 0.45308542251586914, + "learning_rate": 0.00012191476364392571, + "loss": 1.6331, + "step": 30055 + }, + { + "epoch": 0.39056401193548856, + "grad_norm": 0.40969449281692505, + "learning_rate": 0.00012191216418201432, + "loss": 1.5026, + "step": 30056 + }, + { + "epoch": 0.39057700647940446, + "grad_norm": 0.5387527942657471, + "learning_rate": 0.00012190956472010296, + "loss": 1.577, + "step": 30057 + }, + { + "epoch": 0.3905900010233203, + "grad_norm": 0.4781966209411621, + "learning_rate": 0.00012190696525819157, + "loss": 1.4391, + "step": 30058 + }, + { + "epoch": 0.3906029955672362, + "grad_norm": 0.47998109459877014, + "learning_rate": 0.00012190436579628018, + "loss": 1.408, + "step": 30059 + }, + { + "epoch": 0.39061599011115206, + "grad_norm": 0.30010759830474854, + "learning_rate": 0.00012190176633436879, + "loss": 1.2555, + "step": 30060 + }, + { + "epoch": 0.39062898465506796, + "grad_norm": 0.4064951539039612, + "learning_rate": 0.00012189916687245741, + "loss": 1.5438, + "step": 30061 + }, + { + "epoch": 0.3906419791989838, + "grad_norm": 0.3994278609752655, + "learning_rate": 0.00012189656741054602, + "loss": 1.3502, + "step": 30062 + }, + { + "epoch": 0.3906549737428997, + "grad_norm": 0.5379024744033813, + "learning_rate": 0.00012189396794863464, + "loss": 1.3312, + "step": 30063 + }, + { + "epoch": 0.39066796828681555, + "grad_norm": 0.3372834324836731, + "learning_rate": 0.00012189136848672325, + "loss": 1.5487, + "step": 30064 + }, + { + "epoch": 0.39068096283073145, + "grad_norm": 0.41192808747291565, + "learning_rate": 0.00012188876902481187, + "loss": 1.3943, + "step": 30065 + }, + { + "epoch": 0.3906939573746473, + "grad_norm": 0.4473399519920349, + "learning_rate": 0.00012188616956290048, + "loss": 1.1819, + "step": 30066 + }, + { + "epoch": 0.3907069519185632, + "grad_norm": 0.40857449173927307, + "learning_rate": 0.00012188357010098909, + "loss": 1.5914, + "step": 30067 + }, + { + "epoch": 0.39071994646247904, + "grad_norm": 0.35110679268836975, + "learning_rate": 0.0001218809706390777, + "loss": 1.2856, + "step": 30068 + }, + { + "epoch": 0.39073294100639494, + "grad_norm": 0.33694663643836975, + "learning_rate": 0.00012187837117716634, + "loss": 1.5575, + "step": 30069 + }, + { + "epoch": 0.3907459355503108, + "grad_norm": 0.4459562301635742, + "learning_rate": 0.00012187577171525495, + "loss": 1.2977, + "step": 30070 + }, + { + "epoch": 0.3907589300942267, + "grad_norm": 0.49202728271484375, + "learning_rate": 0.00012187317225334356, + "loss": 1.574, + "step": 30071 + }, + { + "epoch": 0.3907719246381426, + "grad_norm": 0.4667641818523407, + "learning_rate": 0.00012187057279143217, + "loss": 1.5255, + "step": 30072 + }, + { + "epoch": 0.39078491918205843, + "grad_norm": 0.3807390630245209, + "learning_rate": 0.0001218679733295208, + "loss": 1.5656, + "step": 30073 + }, + { + "epoch": 0.39079791372597433, + "grad_norm": 0.44292476773262024, + "learning_rate": 0.00012186537386760941, + "loss": 1.3653, + "step": 30074 + }, + { + "epoch": 0.3908109082698902, + "grad_norm": 0.5953220725059509, + "learning_rate": 0.00012186277440569802, + "loss": 1.4531, + "step": 30075 + }, + { + "epoch": 0.3908239028138061, + "grad_norm": 0.5154035687446594, + "learning_rate": 0.00012186017494378663, + "loss": 1.512, + "step": 30076 + }, + { + "epoch": 0.3908368973577219, + "grad_norm": 0.40332356095314026, + "learning_rate": 0.00012185757548187527, + "loss": 1.2878, + "step": 30077 + }, + { + "epoch": 0.3908498919016378, + "grad_norm": 0.36124491691589355, + "learning_rate": 0.00012185497601996387, + "loss": 1.4294, + "step": 30078 + }, + { + "epoch": 0.39086288644555367, + "grad_norm": 0.27464374899864197, + "learning_rate": 0.00012185237655805248, + "loss": 1.3954, + "step": 30079 + }, + { + "epoch": 0.39087588098946957, + "grad_norm": 0.35874730348587036, + "learning_rate": 0.00012184977709614112, + "loss": 1.5174, + "step": 30080 + }, + { + "epoch": 0.3908888755333854, + "grad_norm": 0.3673916161060333, + "learning_rate": 0.00012184717763422973, + "loss": 1.3723, + "step": 30081 + }, + { + "epoch": 0.3909018700773013, + "grad_norm": 0.40562140941619873, + "learning_rate": 0.00012184457817231834, + "loss": 1.3638, + "step": 30082 + }, + { + "epoch": 0.39091486462121716, + "grad_norm": 0.3026283085346222, + "learning_rate": 0.00012184197871040695, + "loss": 1.3411, + "step": 30083 + }, + { + "epoch": 0.39092785916513306, + "grad_norm": 0.44750916957855225, + "learning_rate": 0.00012183937924849557, + "loss": 1.379, + "step": 30084 + }, + { + "epoch": 0.3909408537090489, + "grad_norm": 0.41736695170402527, + "learning_rate": 0.00012183677978658418, + "loss": 1.3509, + "step": 30085 + }, + { + "epoch": 0.3909538482529648, + "grad_norm": 0.32945263385772705, + "learning_rate": 0.0001218341803246728, + "loss": 1.5389, + "step": 30086 + }, + { + "epoch": 0.39096684279688065, + "grad_norm": 0.3785216212272644, + "learning_rate": 0.0001218315808627614, + "loss": 1.3913, + "step": 30087 + }, + { + "epoch": 0.39097983734079655, + "grad_norm": 0.3218432366847992, + "learning_rate": 0.00012182898140085004, + "loss": 1.3599, + "step": 30088 + }, + { + "epoch": 0.3909928318847124, + "grad_norm": 0.3952227532863617, + "learning_rate": 0.00012182638193893866, + "loss": 1.1908, + "step": 30089 + }, + { + "epoch": 0.3910058264286283, + "grad_norm": 0.4091629981994629, + "learning_rate": 0.00012182378247702725, + "loss": 1.3822, + "step": 30090 + }, + { + "epoch": 0.39101882097254415, + "grad_norm": 0.3612927794456482, + "learning_rate": 0.00012182118301511586, + "loss": 1.5119, + "step": 30091 + }, + { + "epoch": 0.39103181551646005, + "grad_norm": 0.3626982271671295, + "learning_rate": 0.0001218185835532045, + "loss": 1.5039, + "step": 30092 + }, + { + "epoch": 0.3910448100603759, + "grad_norm": 0.40796971321105957, + "learning_rate": 0.00012181598409129311, + "loss": 1.4785, + "step": 30093 + }, + { + "epoch": 0.3910578046042918, + "grad_norm": 0.3584350347518921, + "learning_rate": 0.00012181338462938172, + "loss": 1.3826, + "step": 30094 + }, + { + "epoch": 0.39107079914820764, + "grad_norm": 0.46865078806877136, + "learning_rate": 0.00012181078516747033, + "loss": 1.4512, + "step": 30095 + }, + { + "epoch": 0.39108379369212354, + "grad_norm": 0.35887107253074646, + "learning_rate": 0.00012180818570555896, + "loss": 1.4173, + "step": 30096 + }, + { + "epoch": 0.3910967882360394, + "grad_norm": 0.42081189155578613, + "learning_rate": 0.00012180558624364757, + "loss": 1.2792, + "step": 30097 + }, + { + "epoch": 0.3911097827799553, + "grad_norm": 0.4254751205444336, + "learning_rate": 0.00012180298678173618, + "loss": 1.2319, + "step": 30098 + }, + { + "epoch": 0.39112277732387113, + "grad_norm": 0.4039698541164398, + "learning_rate": 0.00012180038731982479, + "loss": 1.286, + "step": 30099 + }, + { + "epoch": 0.39113577186778703, + "grad_norm": 0.4513438642024994, + "learning_rate": 0.00012179778785791343, + "loss": 1.2674, + "step": 30100 + }, + { + "epoch": 0.3911487664117029, + "grad_norm": 0.37958231568336487, + "learning_rate": 0.00012179518839600204, + "loss": 1.4493, + "step": 30101 + }, + { + "epoch": 0.3911617609556188, + "grad_norm": 0.3481837511062622, + "learning_rate": 0.00012179258893409065, + "loss": 1.4223, + "step": 30102 + }, + { + "epoch": 0.3911747554995346, + "grad_norm": 0.395068883895874, + "learning_rate": 0.00012178998947217925, + "loss": 1.2806, + "step": 30103 + }, + { + "epoch": 0.3911877500434505, + "grad_norm": 0.35109856724739075, + "learning_rate": 0.00012178739001026789, + "loss": 1.3602, + "step": 30104 + }, + { + "epoch": 0.39120074458736637, + "grad_norm": 0.33429616689682007, + "learning_rate": 0.0001217847905483565, + "loss": 1.1632, + "step": 30105 + }, + { + "epoch": 0.39121373913128227, + "grad_norm": 0.3121984004974365, + "learning_rate": 0.00012178219108644511, + "loss": 1.4174, + "step": 30106 + }, + { + "epoch": 0.3912267336751981, + "grad_norm": 0.4757743179798126, + "learning_rate": 0.00012177959162453372, + "loss": 1.5094, + "step": 30107 + }, + { + "epoch": 0.391239728219114, + "grad_norm": 0.3582538366317749, + "learning_rate": 0.00012177699216262234, + "loss": 1.3933, + "step": 30108 + }, + { + "epoch": 0.39125272276302986, + "grad_norm": 0.3541100323200226, + "learning_rate": 0.00012177439270071096, + "loss": 1.2232, + "step": 30109 + }, + { + "epoch": 0.39126571730694576, + "grad_norm": 0.35205531120300293, + "learning_rate": 0.00012177179323879957, + "loss": 1.4804, + "step": 30110 + }, + { + "epoch": 0.3912787118508616, + "grad_norm": 0.4998394548892975, + "learning_rate": 0.00012176919377688818, + "loss": 1.6934, + "step": 30111 + }, + { + "epoch": 0.3912917063947775, + "grad_norm": 0.36380964517593384, + "learning_rate": 0.00012176659431497682, + "loss": 1.349, + "step": 30112 + }, + { + "epoch": 0.39130470093869335, + "grad_norm": 0.4160205125808716, + "learning_rate": 0.00012176399485306543, + "loss": 1.381, + "step": 30113 + }, + { + "epoch": 0.39131769548260925, + "grad_norm": 0.3899326026439667, + "learning_rate": 0.00012176139539115404, + "loss": 1.2929, + "step": 30114 + }, + { + "epoch": 0.3913306900265251, + "grad_norm": 0.3528885841369629, + "learning_rate": 0.00012175879592924265, + "loss": 1.3306, + "step": 30115 + }, + { + "epoch": 0.391343684570441, + "grad_norm": 0.5143755078315735, + "learning_rate": 0.00012175619646733127, + "loss": 1.3214, + "step": 30116 + }, + { + "epoch": 0.39135667911435684, + "grad_norm": 0.3883429169654846, + "learning_rate": 0.00012175359700541988, + "loss": 1.5306, + "step": 30117 + }, + { + "epoch": 0.39136967365827274, + "grad_norm": 0.4060009717941284, + "learning_rate": 0.0001217509975435085, + "loss": 1.3712, + "step": 30118 + }, + { + "epoch": 0.3913826682021886, + "grad_norm": 0.44311872124671936, + "learning_rate": 0.00012174839808159713, + "loss": 1.1517, + "step": 30119 + }, + { + "epoch": 0.3913956627461045, + "grad_norm": 0.3149387538433075, + "learning_rate": 0.00012174579861968573, + "loss": 1.5049, + "step": 30120 + }, + { + "epoch": 0.39140865729002033, + "grad_norm": 0.41133204102516174, + "learning_rate": 0.00012174319915777434, + "loss": 1.3962, + "step": 30121 + }, + { + "epoch": 0.39142165183393623, + "grad_norm": 0.3749205470085144, + "learning_rate": 0.00012174059969586295, + "loss": 1.3365, + "step": 30122 + }, + { + "epoch": 0.3914346463778521, + "grad_norm": 0.31378787755966187, + "learning_rate": 0.00012173800023395159, + "loss": 1.3222, + "step": 30123 + }, + { + "epoch": 0.391447640921768, + "grad_norm": 0.4185314178466797, + "learning_rate": 0.0001217354007720402, + "loss": 1.4204, + "step": 30124 + }, + { + "epoch": 0.3914606354656838, + "grad_norm": 0.4197222888469696, + "learning_rate": 0.00012173280131012881, + "loss": 1.3296, + "step": 30125 + }, + { + "epoch": 0.3914736300095997, + "grad_norm": 0.46947818994522095, + "learning_rate": 0.00012173020184821742, + "loss": 1.3735, + "step": 30126 + }, + { + "epoch": 0.39148662455351557, + "grad_norm": 0.3419879972934723, + "learning_rate": 0.00012172760238630605, + "loss": 1.4667, + "step": 30127 + }, + { + "epoch": 0.3914996190974315, + "grad_norm": 0.40291690826416016, + "learning_rate": 0.00012172500292439466, + "loss": 1.626, + "step": 30128 + }, + { + "epoch": 0.3915126136413473, + "grad_norm": 0.3707294166088104, + "learning_rate": 0.00012172240346248327, + "loss": 1.4204, + "step": 30129 + }, + { + "epoch": 0.3915256081852632, + "grad_norm": 0.3771126866340637, + "learning_rate": 0.00012171980400057188, + "loss": 1.4433, + "step": 30130 + }, + { + "epoch": 0.39153860272917906, + "grad_norm": 0.46512410044670105, + "learning_rate": 0.00012171720453866052, + "loss": 1.4042, + "step": 30131 + }, + { + "epoch": 0.39155159727309496, + "grad_norm": 0.3892116844654083, + "learning_rate": 0.00012171460507674912, + "loss": 1.3139, + "step": 30132 + }, + { + "epoch": 0.3915645918170108, + "grad_norm": 0.3910089433193207, + "learning_rate": 0.00012171200561483773, + "loss": 1.1926, + "step": 30133 + }, + { + "epoch": 0.3915775863609267, + "grad_norm": 0.39351779222488403, + "learning_rate": 0.00012170940615292634, + "loss": 1.453, + "step": 30134 + }, + { + "epoch": 0.39159058090484256, + "grad_norm": 0.40423285961151123, + "learning_rate": 0.00012170680669101498, + "loss": 1.2075, + "step": 30135 + }, + { + "epoch": 0.39160357544875846, + "grad_norm": 0.30895480513572693, + "learning_rate": 0.00012170420722910359, + "loss": 1.3786, + "step": 30136 + }, + { + "epoch": 0.3916165699926743, + "grad_norm": 0.29699960350990295, + "learning_rate": 0.0001217016077671922, + "loss": 1.3196, + "step": 30137 + }, + { + "epoch": 0.3916295645365902, + "grad_norm": 0.37178802490234375, + "learning_rate": 0.00012169900830528081, + "loss": 1.5311, + "step": 30138 + }, + { + "epoch": 0.39164255908050605, + "grad_norm": 0.4345197081565857, + "learning_rate": 0.00012169640884336943, + "loss": 1.2716, + "step": 30139 + }, + { + "epoch": 0.39165555362442195, + "grad_norm": 0.44118285179138184, + "learning_rate": 0.00012169380938145804, + "loss": 1.2205, + "step": 30140 + }, + { + "epoch": 0.3916685481683378, + "grad_norm": 0.3031332790851593, + "learning_rate": 0.00012169120991954665, + "loss": 1.2755, + "step": 30141 + }, + { + "epoch": 0.3916815427122537, + "grad_norm": 0.42799246311187744, + "learning_rate": 0.00012168861045763527, + "loss": 1.6241, + "step": 30142 + }, + { + "epoch": 0.39169453725616954, + "grad_norm": 0.42290955781936646, + "learning_rate": 0.0001216860109957239, + "loss": 1.4982, + "step": 30143 + }, + { + "epoch": 0.39170753180008544, + "grad_norm": 0.3088582456111908, + "learning_rate": 0.00012168341153381251, + "loss": 1.4289, + "step": 30144 + }, + { + "epoch": 0.3917205263440013, + "grad_norm": 0.27989429235458374, + "learning_rate": 0.00012168081207190111, + "loss": 1.242, + "step": 30145 + }, + { + "epoch": 0.3917335208879172, + "grad_norm": 0.39245644211769104, + "learning_rate": 0.00012167821260998972, + "loss": 1.3646, + "step": 30146 + }, + { + "epoch": 0.3917465154318331, + "grad_norm": 0.3112770617008209, + "learning_rate": 0.00012167561314807836, + "loss": 1.1425, + "step": 30147 + }, + { + "epoch": 0.39175950997574893, + "grad_norm": 0.348631888628006, + "learning_rate": 0.00012167301368616697, + "loss": 1.4115, + "step": 30148 + }, + { + "epoch": 0.39177250451966483, + "grad_norm": 0.5282865762710571, + "learning_rate": 0.00012167041422425558, + "loss": 1.3226, + "step": 30149 + }, + { + "epoch": 0.3917854990635807, + "grad_norm": 0.36630749702453613, + "learning_rate": 0.0001216678147623442, + "loss": 1.2992, + "step": 30150 + }, + { + "epoch": 0.3917984936074966, + "grad_norm": 0.4147343039512634, + "learning_rate": 0.00012166521530043282, + "loss": 1.5258, + "step": 30151 + }, + { + "epoch": 0.3918114881514124, + "grad_norm": 0.5318707823753357, + "learning_rate": 0.00012166261583852143, + "loss": 1.4118, + "step": 30152 + }, + { + "epoch": 0.3918244826953283, + "grad_norm": 0.4136781692504883, + "learning_rate": 0.00012166001637661004, + "loss": 1.4336, + "step": 30153 + }, + { + "epoch": 0.39183747723924417, + "grad_norm": 0.36295342445373535, + "learning_rate": 0.00012165741691469868, + "loss": 1.2866, + "step": 30154 + }, + { + "epoch": 0.39185047178316007, + "grad_norm": 0.36061781644821167, + "learning_rate": 0.00012165481745278729, + "loss": 1.5548, + "step": 30155 + }, + { + "epoch": 0.3918634663270759, + "grad_norm": 0.40959620475769043, + "learning_rate": 0.0001216522179908759, + "loss": 1.2291, + "step": 30156 + }, + { + "epoch": 0.3918764608709918, + "grad_norm": 0.27087658643722534, + "learning_rate": 0.00012164961852896451, + "loss": 1.3663, + "step": 30157 + }, + { + "epoch": 0.39188945541490766, + "grad_norm": 0.31021416187286377, + "learning_rate": 0.00012164701906705314, + "loss": 1.2343, + "step": 30158 + }, + { + "epoch": 0.39190244995882356, + "grad_norm": 0.3856663405895233, + "learning_rate": 0.00012164441960514175, + "loss": 1.5296, + "step": 30159 + }, + { + "epoch": 0.3919154445027394, + "grad_norm": 0.4690866768360138, + "learning_rate": 0.00012164182014323036, + "loss": 1.4471, + "step": 30160 + }, + { + "epoch": 0.3919284390466553, + "grad_norm": 0.46747124195098877, + "learning_rate": 0.00012163922068131897, + "loss": 1.3471, + "step": 30161 + }, + { + "epoch": 0.39194143359057115, + "grad_norm": 0.24842660129070282, + "learning_rate": 0.00012163662121940759, + "loss": 1.3677, + "step": 30162 + }, + { + "epoch": 0.39195442813448705, + "grad_norm": 0.3894916772842407, + "learning_rate": 0.0001216340217574962, + "loss": 1.3395, + "step": 30163 + }, + { + "epoch": 0.3919674226784029, + "grad_norm": 0.4911866784095764, + "learning_rate": 0.00012163142229558481, + "loss": 1.4341, + "step": 30164 + }, + { + "epoch": 0.3919804172223188, + "grad_norm": 0.43905109167099, + "learning_rate": 0.00012162882283367343, + "loss": 1.596, + "step": 30165 + }, + { + "epoch": 0.39199341176623465, + "grad_norm": 0.375514954328537, + "learning_rate": 0.00012162622337176206, + "loss": 1.676, + "step": 30166 + }, + { + "epoch": 0.39200640631015055, + "grad_norm": 0.352326363325119, + "learning_rate": 0.00012162362390985067, + "loss": 1.3772, + "step": 30167 + }, + { + "epoch": 0.3920194008540664, + "grad_norm": 0.37222519516944885, + "learning_rate": 0.00012162102444793929, + "loss": 1.3941, + "step": 30168 + }, + { + "epoch": 0.3920323953979823, + "grad_norm": 0.41673657298088074, + "learning_rate": 0.0001216184249860279, + "loss": 1.4676, + "step": 30169 + }, + { + "epoch": 0.39204538994189814, + "grad_norm": 0.4229004383087158, + "learning_rate": 0.00012161582552411652, + "loss": 1.4797, + "step": 30170 + }, + { + "epoch": 0.39205838448581404, + "grad_norm": 0.3822682499885559, + "learning_rate": 0.00012161322606220513, + "loss": 1.177, + "step": 30171 + }, + { + "epoch": 0.3920713790297299, + "grad_norm": 0.3789348006248474, + "learning_rate": 0.00012161062660029374, + "loss": 1.3862, + "step": 30172 + }, + { + "epoch": 0.3920843735736458, + "grad_norm": 0.42446979880332947, + "learning_rate": 0.00012160802713838235, + "loss": 1.285, + "step": 30173 + }, + { + "epoch": 0.39209736811756163, + "grad_norm": 0.4142511785030365, + "learning_rate": 0.00012160542767647098, + "loss": 1.4233, + "step": 30174 + }, + { + "epoch": 0.39211036266147753, + "grad_norm": 0.24715737998485565, + "learning_rate": 0.00012160282821455959, + "loss": 1.378, + "step": 30175 + }, + { + "epoch": 0.3921233572053934, + "grad_norm": 0.399018794298172, + "learning_rate": 0.0001216002287526482, + "loss": 1.3117, + "step": 30176 + }, + { + "epoch": 0.3921363517493093, + "grad_norm": 0.31237298250198364, + "learning_rate": 0.00012159762929073681, + "loss": 1.5218, + "step": 30177 + }, + { + "epoch": 0.3921493462932251, + "grad_norm": 0.404621422290802, + "learning_rate": 0.00012159502982882545, + "loss": 1.369, + "step": 30178 + }, + { + "epoch": 0.392162340837141, + "grad_norm": 0.4243197739124298, + "learning_rate": 0.00012159243036691406, + "loss": 1.4715, + "step": 30179 + }, + { + "epoch": 0.39217533538105687, + "grad_norm": 0.41246849298477173, + "learning_rate": 0.00012158983090500267, + "loss": 1.4675, + "step": 30180 + }, + { + "epoch": 0.39218832992497277, + "grad_norm": 0.38864678144454956, + "learning_rate": 0.00012158723144309128, + "loss": 1.6011, + "step": 30181 + }, + { + "epoch": 0.3922013244688886, + "grad_norm": 0.31902214884757996, + "learning_rate": 0.0001215846319811799, + "loss": 1.4662, + "step": 30182 + }, + { + "epoch": 0.3922143190128045, + "grad_norm": 0.4103827476501465, + "learning_rate": 0.00012158203251926852, + "loss": 1.2086, + "step": 30183 + }, + { + "epoch": 0.39222731355672036, + "grad_norm": 0.440344899892807, + "learning_rate": 0.00012157943305735713, + "loss": 1.5084, + "step": 30184 + }, + { + "epoch": 0.39224030810063626, + "grad_norm": 0.5932542681694031, + "learning_rate": 0.00012157683359544574, + "loss": 1.4913, + "step": 30185 + }, + { + "epoch": 0.3922533026445521, + "grad_norm": 0.34640777111053467, + "learning_rate": 0.00012157423413353438, + "loss": 1.4807, + "step": 30186 + }, + { + "epoch": 0.392266297188468, + "grad_norm": 0.4098737835884094, + "learning_rate": 0.00012157163467162297, + "loss": 1.1511, + "step": 30187 + }, + { + "epoch": 0.39227929173238385, + "grad_norm": 0.4336796700954437, + "learning_rate": 0.00012156903520971159, + "loss": 1.2743, + "step": 30188 + }, + { + "epoch": 0.39229228627629975, + "grad_norm": 0.4209417998790741, + "learning_rate": 0.0001215664357478002, + "loss": 1.5153, + "step": 30189 + }, + { + "epoch": 0.3923052808202156, + "grad_norm": 0.40461570024490356, + "learning_rate": 0.00012156383628588883, + "loss": 1.168, + "step": 30190 + }, + { + "epoch": 0.3923182753641315, + "grad_norm": 0.38222548365592957, + "learning_rate": 0.00012156123682397745, + "loss": 1.4155, + "step": 30191 + }, + { + "epoch": 0.39233126990804734, + "grad_norm": 0.46849504113197327, + "learning_rate": 0.00012155863736206606, + "loss": 1.3355, + "step": 30192 + }, + { + "epoch": 0.39234426445196324, + "grad_norm": 0.2952819764614105, + "learning_rate": 0.00012155603790015468, + "loss": 1.3886, + "step": 30193 + }, + { + "epoch": 0.3923572589958791, + "grad_norm": 0.2982991337776184, + "learning_rate": 0.00012155343843824329, + "loss": 1.2543, + "step": 30194 + }, + { + "epoch": 0.392370253539795, + "grad_norm": 0.4075080454349518, + "learning_rate": 0.0001215508389763319, + "loss": 1.4977, + "step": 30195 + }, + { + "epoch": 0.39238324808371083, + "grad_norm": 0.4195147156715393, + "learning_rate": 0.00012154823951442051, + "loss": 1.3202, + "step": 30196 + }, + { + "epoch": 0.39239624262762673, + "grad_norm": 0.4231061041355133, + "learning_rate": 0.00012154564005250915, + "loss": 1.5016, + "step": 30197 + }, + { + "epoch": 0.3924092371715426, + "grad_norm": 0.4115855395793915, + "learning_rate": 0.00012154304059059776, + "loss": 1.3714, + "step": 30198 + }, + { + "epoch": 0.3924222317154585, + "grad_norm": 0.31283944845199585, + "learning_rate": 0.00012154044112868637, + "loss": 1.4056, + "step": 30199 + }, + { + "epoch": 0.3924352262593743, + "grad_norm": 0.4133973717689514, + "learning_rate": 0.00012153784166677497, + "loss": 1.2775, + "step": 30200 + }, + { + "epoch": 0.3924482208032902, + "grad_norm": 0.3579365015029907, + "learning_rate": 0.00012153524220486361, + "loss": 1.4062, + "step": 30201 + }, + { + "epoch": 0.39246121534720607, + "grad_norm": 0.2966579496860504, + "learning_rate": 0.00012153264274295222, + "loss": 1.1802, + "step": 30202 + }, + { + "epoch": 0.392474209891122, + "grad_norm": 0.18775449693202972, + "learning_rate": 0.00012153004328104083, + "loss": 1.4207, + "step": 30203 + }, + { + "epoch": 0.3924872044350378, + "grad_norm": 0.3445811867713928, + "learning_rate": 0.00012152744381912944, + "loss": 1.5866, + "step": 30204 + }, + { + "epoch": 0.3925001989789537, + "grad_norm": 0.32643625140190125, + "learning_rate": 0.00012152484435721807, + "loss": 1.3237, + "step": 30205 + }, + { + "epoch": 0.39251319352286956, + "grad_norm": 0.3694562315940857, + "learning_rate": 0.00012152224489530668, + "loss": 1.3713, + "step": 30206 + }, + { + "epoch": 0.39252618806678546, + "grad_norm": 0.4584919214248657, + "learning_rate": 0.00012151964543339529, + "loss": 1.5977, + "step": 30207 + }, + { + "epoch": 0.3925391826107013, + "grad_norm": 0.4383241832256317, + "learning_rate": 0.0001215170459714839, + "loss": 1.4826, + "step": 30208 + }, + { + "epoch": 0.3925521771546172, + "grad_norm": 0.42453089356422424, + "learning_rate": 0.00012151444650957254, + "loss": 1.3531, + "step": 30209 + }, + { + "epoch": 0.39256517169853306, + "grad_norm": 0.3423071801662445, + "learning_rate": 0.00012151184704766115, + "loss": 1.3022, + "step": 30210 + }, + { + "epoch": 0.39257816624244896, + "grad_norm": 0.34484803676605225, + "learning_rate": 0.00012150924758574976, + "loss": 1.146, + "step": 30211 + }, + { + "epoch": 0.3925911607863648, + "grad_norm": 0.3715389668941498, + "learning_rate": 0.00012150664812383836, + "loss": 1.5697, + "step": 30212 + }, + { + "epoch": 0.3926041553302807, + "grad_norm": 0.383456289768219, + "learning_rate": 0.000121504048661927, + "loss": 1.3851, + "step": 30213 + }, + { + "epoch": 0.39261714987419655, + "grad_norm": 0.4641530513763428, + "learning_rate": 0.0001215014492000156, + "loss": 1.2807, + "step": 30214 + }, + { + "epoch": 0.39263014441811245, + "grad_norm": 0.3006752133369446, + "learning_rate": 0.00012149884973810422, + "loss": 1.2938, + "step": 30215 + }, + { + "epoch": 0.3926431389620283, + "grad_norm": 0.40168288350105286, + "learning_rate": 0.00012149625027619283, + "loss": 1.3246, + "step": 30216 + }, + { + "epoch": 0.3926561335059442, + "grad_norm": 0.3213330805301666, + "learning_rate": 0.00012149365081428145, + "loss": 1.3463, + "step": 30217 + }, + { + "epoch": 0.39266912804986004, + "grad_norm": 0.40640586614608765, + "learning_rate": 0.00012149105135237006, + "loss": 1.345, + "step": 30218 + }, + { + "epoch": 0.39268212259377594, + "grad_norm": 0.5321385860443115, + "learning_rate": 0.00012148845189045867, + "loss": 1.2842, + "step": 30219 + }, + { + "epoch": 0.3926951171376918, + "grad_norm": 0.363692969083786, + "learning_rate": 0.00012148585242854728, + "loss": 1.4173, + "step": 30220 + }, + { + "epoch": 0.3927081116816077, + "grad_norm": 0.4474540650844574, + "learning_rate": 0.00012148325296663592, + "loss": 1.2595, + "step": 30221 + }, + { + "epoch": 0.39272110622552353, + "grad_norm": 0.39879482984542847, + "learning_rate": 0.00012148065350472453, + "loss": 1.3373, + "step": 30222 + }, + { + "epoch": 0.39273410076943943, + "grad_norm": 0.49099016189575195, + "learning_rate": 0.00012147805404281314, + "loss": 1.4743, + "step": 30223 + }, + { + "epoch": 0.39274709531335533, + "grad_norm": 0.5243951082229614, + "learning_rate": 0.00012147545458090175, + "loss": 1.4535, + "step": 30224 + }, + { + "epoch": 0.3927600898572712, + "grad_norm": 0.37081024050712585, + "learning_rate": 0.00012147285511899038, + "loss": 1.3589, + "step": 30225 + }, + { + "epoch": 0.3927730844011871, + "grad_norm": 0.38306891918182373, + "learning_rate": 0.00012147025565707899, + "loss": 1.4424, + "step": 30226 + }, + { + "epoch": 0.3927860789451029, + "grad_norm": 0.4476652145385742, + "learning_rate": 0.0001214676561951676, + "loss": 1.3947, + "step": 30227 + }, + { + "epoch": 0.3927990734890188, + "grad_norm": 0.40072041749954224, + "learning_rate": 0.00012146505673325624, + "loss": 1.3507, + "step": 30228 + }, + { + "epoch": 0.39281206803293467, + "grad_norm": 0.4932629466056824, + "learning_rate": 0.00012146245727134484, + "loss": 1.4886, + "step": 30229 + }, + { + "epoch": 0.39282506257685057, + "grad_norm": 0.31090280413627625, + "learning_rate": 0.00012145985780943345, + "loss": 1.3406, + "step": 30230 + }, + { + "epoch": 0.3928380571207664, + "grad_norm": 0.3673417866230011, + "learning_rate": 0.00012145725834752206, + "loss": 1.427, + "step": 30231 + }, + { + "epoch": 0.3928510516646823, + "grad_norm": 0.5585647821426392, + "learning_rate": 0.0001214546588856107, + "loss": 1.3765, + "step": 30232 + }, + { + "epoch": 0.39286404620859816, + "grad_norm": 0.3452046513557434, + "learning_rate": 0.00012145205942369931, + "loss": 1.3163, + "step": 30233 + }, + { + "epoch": 0.39287704075251406, + "grad_norm": 0.33295050263404846, + "learning_rate": 0.00012144945996178792, + "loss": 1.1413, + "step": 30234 + }, + { + "epoch": 0.3928900352964299, + "grad_norm": 0.48066234588623047, + "learning_rate": 0.00012144686049987653, + "loss": 1.3916, + "step": 30235 + }, + { + "epoch": 0.3929030298403458, + "grad_norm": 0.3673097491264343, + "learning_rate": 0.00012144426103796515, + "loss": 1.3864, + "step": 30236 + }, + { + "epoch": 0.39291602438426165, + "grad_norm": 0.45697200298309326, + "learning_rate": 0.00012144166157605376, + "loss": 1.4431, + "step": 30237 + }, + { + "epoch": 0.39292901892817755, + "grad_norm": 0.4453720152378082, + "learning_rate": 0.00012143906211414238, + "loss": 1.5751, + "step": 30238 + }, + { + "epoch": 0.3929420134720934, + "grad_norm": 0.4138123393058777, + "learning_rate": 0.00012143646265223099, + "loss": 1.296, + "step": 30239 + }, + { + "epoch": 0.3929550080160093, + "grad_norm": 0.3418317139148712, + "learning_rate": 0.00012143386319031962, + "loss": 1.222, + "step": 30240 + }, + { + "epoch": 0.39296800255992514, + "grad_norm": 0.3883409798145294, + "learning_rate": 0.00012143126372840824, + "loss": 1.4565, + "step": 30241 + }, + { + "epoch": 0.39298099710384105, + "grad_norm": 0.46394437551498413, + "learning_rate": 0.00012142866426649683, + "loss": 1.4492, + "step": 30242 + }, + { + "epoch": 0.3929939916477569, + "grad_norm": 0.42542657256126404, + "learning_rate": 0.00012142606480458544, + "loss": 1.4919, + "step": 30243 + }, + { + "epoch": 0.3930069861916728, + "grad_norm": 0.3854852616786957, + "learning_rate": 0.00012142346534267408, + "loss": 1.413, + "step": 30244 + }, + { + "epoch": 0.39301998073558864, + "grad_norm": 0.3956025242805481, + "learning_rate": 0.00012142086588076269, + "loss": 1.269, + "step": 30245 + }, + { + "epoch": 0.39303297527950454, + "grad_norm": 0.4396824836730957, + "learning_rate": 0.0001214182664188513, + "loss": 1.5778, + "step": 30246 + }, + { + "epoch": 0.3930459698234204, + "grad_norm": 0.3475947678089142, + "learning_rate": 0.00012141566695693991, + "loss": 1.4049, + "step": 30247 + }, + { + "epoch": 0.3930589643673363, + "grad_norm": 0.5264297127723694, + "learning_rate": 0.00012141306749502854, + "loss": 1.5911, + "step": 30248 + }, + { + "epoch": 0.39307195891125213, + "grad_norm": 0.44453394412994385, + "learning_rate": 0.00012141046803311715, + "loss": 1.5334, + "step": 30249 + }, + { + "epoch": 0.39308495345516803, + "grad_norm": 0.3368965685367584, + "learning_rate": 0.00012140786857120576, + "loss": 1.47, + "step": 30250 + }, + { + "epoch": 0.3930979479990839, + "grad_norm": 0.3358776569366455, + "learning_rate": 0.00012140526910929437, + "loss": 1.3875, + "step": 30251 + }, + { + "epoch": 0.3931109425429998, + "grad_norm": 0.4137432873249054, + "learning_rate": 0.00012140266964738301, + "loss": 1.5743, + "step": 30252 + }, + { + "epoch": 0.3931239370869156, + "grad_norm": 0.43048447370529175, + "learning_rate": 0.00012140007018547162, + "loss": 1.2444, + "step": 30253 + }, + { + "epoch": 0.3931369316308315, + "grad_norm": 0.4377857744693756, + "learning_rate": 0.00012139747072356022, + "loss": 1.4032, + "step": 30254 + }, + { + "epoch": 0.39314992617474737, + "grad_norm": 0.43612468242645264, + "learning_rate": 0.00012139487126164883, + "loss": 1.674, + "step": 30255 + }, + { + "epoch": 0.39316292071866327, + "grad_norm": 0.39175236225128174, + "learning_rate": 0.00012139227179973747, + "loss": 1.3721, + "step": 30256 + }, + { + "epoch": 0.3931759152625791, + "grad_norm": 0.3942610025405884, + "learning_rate": 0.00012138967233782608, + "loss": 1.3836, + "step": 30257 + }, + { + "epoch": 0.393188909806495, + "grad_norm": 0.4507838785648346, + "learning_rate": 0.00012138707287591469, + "loss": 1.2908, + "step": 30258 + }, + { + "epoch": 0.39320190435041086, + "grad_norm": 0.42537006735801697, + "learning_rate": 0.0001213844734140033, + "loss": 1.5651, + "step": 30259 + }, + { + "epoch": 0.39321489889432676, + "grad_norm": 0.46596047282218933, + "learning_rate": 0.00012138187395209192, + "loss": 1.5946, + "step": 30260 + }, + { + "epoch": 0.3932278934382426, + "grad_norm": 0.38619983196258545, + "learning_rate": 0.00012137927449018054, + "loss": 1.3514, + "step": 30261 + }, + { + "epoch": 0.3932408879821585, + "grad_norm": 0.3787907361984253, + "learning_rate": 0.00012137667502826915, + "loss": 1.4278, + "step": 30262 + }, + { + "epoch": 0.39325388252607435, + "grad_norm": 0.4231860041618347, + "learning_rate": 0.00012137407556635776, + "loss": 1.5253, + "step": 30263 + }, + { + "epoch": 0.39326687706999025, + "grad_norm": 0.45576971769332886, + "learning_rate": 0.0001213714761044464, + "loss": 1.4211, + "step": 30264 + }, + { + "epoch": 0.3932798716139061, + "grad_norm": 0.46972087025642395, + "learning_rate": 0.000121368876642535, + "loss": 1.3754, + "step": 30265 + }, + { + "epoch": 0.393292866157822, + "grad_norm": 0.3763978183269501, + "learning_rate": 0.00012136627718062362, + "loss": 1.5172, + "step": 30266 + }, + { + "epoch": 0.39330586070173784, + "grad_norm": 0.3517427444458008, + "learning_rate": 0.00012136367771871224, + "loss": 1.4023, + "step": 30267 + }, + { + "epoch": 0.39331885524565374, + "grad_norm": 0.5221895575523376, + "learning_rate": 0.00012136107825680085, + "loss": 1.554, + "step": 30268 + }, + { + "epoch": 0.3933318497895696, + "grad_norm": 0.34739628434181213, + "learning_rate": 0.00012135847879488946, + "loss": 1.3073, + "step": 30269 + }, + { + "epoch": 0.3933448443334855, + "grad_norm": 0.313321977853775, + "learning_rate": 0.00012135587933297807, + "loss": 1.1969, + "step": 30270 + }, + { + "epoch": 0.39335783887740133, + "grad_norm": 0.416465699672699, + "learning_rate": 0.0001213532798710667, + "loss": 1.4437, + "step": 30271 + }, + { + "epoch": 0.39337083342131723, + "grad_norm": 0.452899694442749, + "learning_rate": 0.00012135068040915531, + "loss": 1.3348, + "step": 30272 + }, + { + "epoch": 0.3933838279652331, + "grad_norm": 0.35301804542541504, + "learning_rate": 0.00012134808094724392, + "loss": 1.3065, + "step": 30273 + }, + { + "epoch": 0.393396822509149, + "grad_norm": 0.41131705045700073, + "learning_rate": 0.00012134548148533253, + "loss": 1.4868, + "step": 30274 + }, + { + "epoch": 0.3934098170530648, + "grad_norm": 0.4610689580440521, + "learning_rate": 0.00012134288202342117, + "loss": 1.444, + "step": 30275 + }, + { + "epoch": 0.3934228115969807, + "grad_norm": 0.4829813539981842, + "learning_rate": 0.00012134028256150978, + "loss": 1.5999, + "step": 30276 + }, + { + "epoch": 0.39343580614089657, + "grad_norm": 0.4535762369632721, + "learning_rate": 0.00012133768309959839, + "loss": 1.5563, + "step": 30277 + }, + { + "epoch": 0.39344880068481247, + "grad_norm": 0.4518049955368042, + "learning_rate": 0.000121335083637687, + "loss": 1.5499, + "step": 30278 + }, + { + "epoch": 0.3934617952287283, + "grad_norm": 0.38682714104652405, + "learning_rate": 0.00012133248417577563, + "loss": 1.5489, + "step": 30279 + }, + { + "epoch": 0.3934747897726442, + "grad_norm": 0.454598069190979, + "learning_rate": 0.00012132988471386424, + "loss": 1.5383, + "step": 30280 + }, + { + "epoch": 0.39348778431656006, + "grad_norm": 0.3806298077106476, + "learning_rate": 0.00012132728525195285, + "loss": 1.4287, + "step": 30281 + }, + { + "epoch": 0.39350077886047596, + "grad_norm": 0.4003894031047821, + "learning_rate": 0.00012132468579004146, + "loss": 1.3787, + "step": 30282 + }, + { + "epoch": 0.3935137734043918, + "grad_norm": 0.41036278009414673, + "learning_rate": 0.0001213220863281301, + "loss": 1.3188, + "step": 30283 + }, + { + "epoch": 0.3935267679483077, + "grad_norm": 0.40194404125213623, + "learning_rate": 0.0001213194868662187, + "loss": 1.4512, + "step": 30284 + }, + { + "epoch": 0.39353976249222355, + "grad_norm": 0.4183902442455292, + "learning_rate": 0.0001213168874043073, + "loss": 1.3122, + "step": 30285 + }, + { + "epoch": 0.39355275703613946, + "grad_norm": 0.3271028399467468, + "learning_rate": 0.00012131428794239592, + "loss": 1.512, + "step": 30286 + }, + { + "epoch": 0.3935657515800553, + "grad_norm": 0.40299180150032043, + "learning_rate": 0.00012131168848048456, + "loss": 1.4801, + "step": 30287 + }, + { + "epoch": 0.3935787461239712, + "grad_norm": 0.4294005334377289, + "learning_rate": 0.00012130908901857317, + "loss": 1.4308, + "step": 30288 + }, + { + "epoch": 0.39359174066788705, + "grad_norm": 0.451560914516449, + "learning_rate": 0.00012130648955666178, + "loss": 1.3914, + "step": 30289 + }, + { + "epoch": 0.39360473521180295, + "grad_norm": 0.30970409512519836, + "learning_rate": 0.00012130389009475039, + "loss": 1.2857, + "step": 30290 + }, + { + "epoch": 0.3936177297557188, + "grad_norm": 0.45484453439712524, + "learning_rate": 0.00012130129063283901, + "loss": 1.4404, + "step": 30291 + }, + { + "epoch": 0.3936307242996347, + "grad_norm": 0.24319183826446533, + "learning_rate": 0.00012129869117092762, + "loss": 1.1451, + "step": 30292 + }, + { + "epoch": 0.39364371884355054, + "grad_norm": 0.3762994706630707, + "learning_rate": 0.00012129609170901623, + "loss": 1.4822, + "step": 30293 + }, + { + "epoch": 0.39365671338746644, + "grad_norm": 0.4294280409812927, + "learning_rate": 0.00012129349224710485, + "loss": 1.3901, + "step": 30294 + }, + { + "epoch": 0.3936697079313823, + "grad_norm": 0.4294148087501526, + "learning_rate": 0.00012129089278519348, + "loss": 1.6459, + "step": 30295 + }, + { + "epoch": 0.3936827024752982, + "grad_norm": 0.33589833974838257, + "learning_rate": 0.00012128829332328208, + "loss": 1.3232, + "step": 30296 + }, + { + "epoch": 0.39369569701921403, + "grad_norm": 0.436295747756958, + "learning_rate": 0.00012128569386137069, + "loss": 1.3937, + "step": 30297 + }, + { + "epoch": 0.39370869156312993, + "grad_norm": 0.44970396161079407, + "learning_rate": 0.0001212830943994593, + "loss": 1.5046, + "step": 30298 + }, + { + "epoch": 0.39372168610704583, + "grad_norm": 0.4379139542579651, + "learning_rate": 0.00012128049493754794, + "loss": 1.4892, + "step": 30299 + }, + { + "epoch": 0.3937346806509617, + "grad_norm": 0.43271517753601074, + "learning_rate": 0.00012127789547563655, + "loss": 1.5246, + "step": 30300 + }, + { + "epoch": 0.3937476751948776, + "grad_norm": 0.40532201528549194, + "learning_rate": 0.00012127529601372516, + "loss": 1.375, + "step": 30301 + }, + { + "epoch": 0.3937606697387934, + "grad_norm": 0.46064314246177673, + "learning_rate": 0.00012127269655181379, + "loss": 1.3886, + "step": 30302 + }, + { + "epoch": 0.3937736642827093, + "grad_norm": 0.3226160407066345, + "learning_rate": 0.0001212700970899024, + "loss": 1.348, + "step": 30303 + }, + { + "epoch": 0.39378665882662517, + "grad_norm": 0.4500960111618042, + "learning_rate": 0.00012126749762799101, + "loss": 1.473, + "step": 30304 + }, + { + "epoch": 0.39379965337054107, + "grad_norm": 0.29087477922439575, + "learning_rate": 0.00012126489816607962, + "loss": 1.197, + "step": 30305 + }, + { + "epoch": 0.3938126479144569, + "grad_norm": 0.5905337333679199, + "learning_rate": 0.00012126229870416826, + "loss": 1.2978, + "step": 30306 + }, + { + "epoch": 0.3938256424583728, + "grad_norm": 0.4344117045402527, + "learning_rate": 0.00012125969924225687, + "loss": 1.4983, + "step": 30307 + }, + { + "epoch": 0.39383863700228866, + "grad_norm": 0.4091038405895233, + "learning_rate": 0.00012125709978034548, + "loss": 1.3895, + "step": 30308 + }, + { + "epoch": 0.39385163154620456, + "grad_norm": 0.49192380905151367, + "learning_rate": 0.00012125450031843408, + "loss": 1.4325, + "step": 30309 + }, + { + "epoch": 0.3938646260901204, + "grad_norm": 0.3093331754207611, + "learning_rate": 0.00012125190085652272, + "loss": 1.4219, + "step": 30310 + }, + { + "epoch": 0.3938776206340363, + "grad_norm": 0.4658687114715576, + "learning_rate": 0.00012124930139461133, + "loss": 1.3826, + "step": 30311 + }, + { + "epoch": 0.39389061517795215, + "grad_norm": 0.39100682735443115, + "learning_rate": 0.00012124670193269994, + "loss": 1.3979, + "step": 30312 + }, + { + "epoch": 0.39390360972186805, + "grad_norm": 0.4419659674167633, + "learning_rate": 0.00012124410247078855, + "loss": 1.4854, + "step": 30313 + }, + { + "epoch": 0.3939166042657839, + "grad_norm": 0.3633359372615814, + "learning_rate": 0.00012124150300887717, + "loss": 1.6146, + "step": 30314 + }, + { + "epoch": 0.3939295988096998, + "grad_norm": 0.35677504539489746, + "learning_rate": 0.00012123890354696578, + "loss": 1.2367, + "step": 30315 + }, + { + "epoch": 0.39394259335361564, + "grad_norm": 0.34865784645080566, + "learning_rate": 0.0001212363040850544, + "loss": 1.2232, + "step": 30316 + }, + { + "epoch": 0.39395558789753155, + "grad_norm": 0.4777868390083313, + "learning_rate": 0.000121233704623143, + "loss": 1.5137, + "step": 30317 + }, + { + "epoch": 0.3939685824414474, + "grad_norm": 0.38286593556404114, + "learning_rate": 0.00012123110516123164, + "loss": 1.3217, + "step": 30318 + }, + { + "epoch": 0.3939815769853633, + "grad_norm": 0.38891956210136414, + "learning_rate": 0.00012122850569932025, + "loss": 1.3912, + "step": 30319 + }, + { + "epoch": 0.39399457152927914, + "grad_norm": 0.43460381031036377, + "learning_rate": 0.00012122590623740887, + "loss": 1.184, + "step": 30320 + }, + { + "epoch": 0.39400756607319504, + "grad_norm": 0.36464667320251465, + "learning_rate": 0.00012122330677549748, + "loss": 1.1896, + "step": 30321 + }, + { + "epoch": 0.3940205606171109, + "grad_norm": 0.43258702754974365, + "learning_rate": 0.0001212207073135861, + "loss": 1.5437, + "step": 30322 + }, + { + "epoch": 0.3940335551610268, + "grad_norm": 0.4469812214374542, + "learning_rate": 0.00012121810785167471, + "loss": 1.4213, + "step": 30323 + }, + { + "epoch": 0.39404654970494263, + "grad_norm": 0.3775706887245178, + "learning_rate": 0.00012121550838976332, + "loss": 1.4928, + "step": 30324 + }, + { + "epoch": 0.39405954424885853, + "grad_norm": 0.4082251191139221, + "learning_rate": 0.00012121290892785193, + "loss": 1.5051, + "step": 30325 + }, + { + "epoch": 0.3940725387927744, + "grad_norm": 0.4016304612159729, + "learning_rate": 0.00012121030946594056, + "loss": 1.4906, + "step": 30326 + }, + { + "epoch": 0.3940855333366903, + "grad_norm": 0.4051564931869507, + "learning_rate": 0.00012120771000402917, + "loss": 1.3146, + "step": 30327 + }, + { + "epoch": 0.3940985278806061, + "grad_norm": 0.32665255665779114, + "learning_rate": 0.00012120511054211778, + "loss": 1.1814, + "step": 30328 + }, + { + "epoch": 0.394111522424522, + "grad_norm": 0.4152357876300812, + "learning_rate": 0.00012120251108020639, + "loss": 1.3874, + "step": 30329 + }, + { + "epoch": 0.39412451696843787, + "grad_norm": 0.48771196603775024, + "learning_rate": 0.00012119991161829503, + "loss": 1.5541, + "step": 30330 + }, + { + "epoch": 0.39413751151235377, + "grad_norm": 0.4088663160800934, + "learning_rate": 0.00012119731215638364, + "loss": 1.3029, + "step": 30331 + }, + { + "epoch": 0.3941505060562696, + "grad_norm": 0.4661770462989807, + "learning_rate": 0.00012119471269447225, + "loss": 1.3627, + "step": 30332 + }, + { + "epoch": 0.3941635006001855, + "grad_norm": 0.3803216218948364, + "learning_rate": 0.00012119211323256086, + "loss": 1.3756, + "step": 30333 + }, + { + "epoch": 0.39417649514410136, + "grad_norm": 0.3827464282512665, + "learning_rate": 0.00012118951377064949, + "loss": 1.4559, + "step": 30334 + }, + { + "epoch": 0.39418948968801726, + "grad_norm": 0.43791788816452026, + "learning_rate": 0.0001211869143087381, + "loss": 1.3957, + "step": 30335 + }, + { + "epoch": 0.3942024842319331, + "grad_norm": 0.45772799849510193, + "learning_rate": 0.00012118431484682671, + "loss": 1.5785, + "step": 30336 + }, + { + "epoch": 0.394215478775849, + "grad_norm": 0.3095083236694336, + "learning_rate": 0.00012118171538491532, + "loss": 1.6462, + "step": 30337 + }, + { + "epoch": 0.39422847331976485, + "grad_norm": 0.6071044206619263, + "learning_rate": 0.00012117911592300394, + "loss": 1.4753, + "step": 30338 + }, + { + "epoch": 0.39424146786368075, + "grad_norm": 0.3454511761665344, + "learning_rate": 0.00012117651646109255, + "loss": 1.2334, + "step": 30339 + }, + { + "epoch": 0.3942544624075966, + "grad_norm": 0.44996631145477295, + "learning_rate": 0.00012117391699918117, + "loss": 1.2994, + "step": 30340 + }, + { + "epoch": 0.3942674569515125, + "grad_norm": 0.40120968222618103, + "learning_rate": 0.0001211713175372698, + "loss": 1.4215, + "step": 30341 + }, + { + "epoch": 0.39428045149542834, + "grad_norm": 0.3478165864944458, + "learning_rate": 0.00012116871807535841, + "loss": 1.1979, + "step": 30342 + }, + { + "epoch": 0.39429344603934424, + "grad_norm": 0.3893889784812927, + "learning_rate": 0.00012116611861344702, + "loss": 1.3458, + "step": 30343 + }, + { + "epoch": 0.3943064405832601, + "grad_norm": 0.46901631355285645, + "learning_rate": 0.00012116351915153564, + "loss": 1.3871, + "step": 30344 + }, + { + "epoch": 0.394319435127176, + "grad_norm": 0.4929788410663605, + "learning_rate": 0.00012116091968962426, + "loss": 1.4148, + "step": 30345 + }, + { + "epoch": 0.39433242967109183, + "grad_norm": 0.47531694173812866, + "learning_rate": 0.00012115832022771287, + "loss": 1.2549, + "step": 30346 + }, + { + "epoch": 0.39434542421500773, + "grad_norm": 0.3352787494659424, + "learning_rate": 0.00012115572076580148, + "loss": 1.4831, + "step": 30347 + }, + { + "epoch": 0.3943584187589236, + "grad_norm": 0.3755118250846863, + "learning_rate": 0.00012115312130389009, + "loss": 1.4359, + "step": 30348 + }, + { + "epoch": 0.3943714133028395, + "grad_norm": 0.37394869327545166, + "learning_rate": 0.00012115052184197873, + "loss": 1.5113, + "step": 30349 + }, + { + "epoch": 0.3943844078467553, + "grad_norm": 0.33011576533317566, + "learning_rate": 0.00012114792238006734, + "loss": 1.2471, + "step": 30350 + }, + { + "epoch": 0.3943974023906712, + "grad_norm": 0.47930094599723816, + "learning_rate": 0.00012114532291815594, + "loss": 1.3596, + "step": 30351 + }, + { + "epoch": 0.39441039693458707, + "grad_norm": 0.44782111048698425, + "learning_rate": 0.00012114272345624455, + "loss": 1.4292, + "step": 30352 + }, + { + "epoch": 0.39442339147850297, + "grad_norm": 0.32904288172721863, + "learning_rate": 0.00012114012399433319, + "loss": 1.3898, + "step": 30353 + }, + { + "epoch": 0.3944363860224188, + "grad_norm": 0.45402792096138, + "learning_rate": 0.0001211375245324218, + "loss": 1.4728, + "step": 30354 + }, + { + "epoch": 0.3944493805663347, + "grad_norm": 0.32263854146003723, + "learning_rate": 0.00012113492507051041, + "loss": 1.2434, + "step": 30355 + }, + { + "epoch": 0.39446237511025056, + "grad_norm": 0.33296889066696167, + "learning_rate": 0.00012113232560859902, + "loss": 1.2789, + "step": 30356 + }, + { + "epoch": 0.39447536965416646, + "grad_norm": 0.4589162766933441, + "learning_rate": 0.00012112972614668765, + "loss": 1.4358, + "step": 30357 + }, + { + "epoch": 0.3944883641980823, + "grad_norm": 0.42167553305625916, + "learning_rate": 0.00012112712668477626, + "loss": 1.4254, + "step": 30358 + }, + { + "epoch": 0.3945013587419982, + "grad_norm": 0.3578254282474518, + "learning_rate": 0.00012112452722286487, + "loss": 1.2167, + "step": 30359 + }, + { + "epoch": 0.39451435328591405, + "grad_norm": 0.40141355991363525, + "learning_rate": 0.00012112192776095348, + "loss": 1.4833, + "step": 30360 + }, + { + "epoch": 0.39452734782982996, + "grad_norm": 0.43685653805732727, + "learning_rate": 0.00012111932829904212, + "loss": 1.3553, + "step": 30361 + }, + { + "epoch": 0.3945403423737458, + "grad_norm": 0.3432730734348297, + "learning_rate": 0.00012111672883713073, + "loss": 1.4015, + "step": 30362 + }, + { + "epoch": 0.3945533369176617, + "grad_norm": 0.4469623267650604, + "learning_rate": 0.00012111412937521934, + "loss": 1.3108, + "step": 30363 + }, + { + "epoch": 0.39456633146157755, + "grad_norm": 0.39759576320648193, + "learning_rate": 0.00012111152991330794, + "loss": 1.244, + "step": 30364 + }, + { + "epoch": 0.39457932600549345, + "grad_norm": 0.41511356830596924, + "learning_rate": 0.00012110893045139657, + "loss": 1.5218, + "step": 30365 + }, + { + "epoch": 0.3945923205494093, + "grad_norm": 0.4499237835407257, + "learning_rate": 0.00012110633098948518, + "loss": 1.4689, + "step": 30366 + }, + { + "epoch": 0.3946053150933252, + "grad_norm": 0.3941147029399872, + "learning_rate": 0.0001211037315275738, + "loss": 1.4664, + "step": 30367 + }, + { + "epoch": 0.39461830963724104, + "grad_norm": 0.4820142686367035, + "learning_rate": 0.0001211011320656624, + "loss": 1.6, + "step": 30368 + }, + { + "epoch": 0.39463130418115694, + "grad_norm": 0.41343823075294495, + "learning_rate": 0.00012109853260375103, + "loss": 1.5006, + "step": 30369 + }, + { + "epoch": 0.3946442987250728, + "grad_norm": 0.37576255202293396, + "learning_rate": 0.00012109593314183964, + "loss": 1.2638, + "step": 30370 + }, + { + "epoch": 0.3946572932689887, + "grad_norm": 0.42644011974334717, + "learning_rate": 0.00012109333367992825, + "loss": 1.4276, + "step": 30371 + }, + { + "epoch": 0.39467028781290453, + "grad_norm": 0.43201544880867004, + "learning_rate": 0.00012109073421801686, + "loss": 1.3923, + "step": 30372 + }, + { + "epoch": 0.39468328235682043, + "grad_norm": 0.39694398641586304, + "learning_rate": 0.0001210881347561055, + "loss": 1.3865, + "step": 30373 + }, + { + "epoch": 0.3946962769007363, + "grad_norm": 0.40186285972595215, + "learning_rate": 0.00012108553529419411, + "loss": 1.2719, + "step": 30374 + }, + { + "epoch": 0.3947092714446522, + "grad_norm": 0.4058942198753357, + "learning_rate": 0.00012108293583228272, + "loss": 1.4401, + "step": 30375 + }, + { + "epoch": 0.3947222659885681, + "grad_norm": 0.429519921541214, + "learning_rate": 0.00012108033637037135, + "loss": 1.6299, + "step": 30376 + }, + { + "epoch": 0.3947352605324839, + "grad_norm": 0.35235705971717834, + "learning_rate": 0.00012107773690845996, + "loss": 1.4687, + "step": 30377 + }, + { + "epoch": 0.3947482550763998, + "grad_norm": 0.3400331735610962, + "learning_rate": 0.00012107513744654857, + "loss": 1.4229, + "step": 30378 + }, + { + "epoch": 0.39476124962031567, + "grad_norm": 0.4269712269306183, + "learning_rate": 0.00012107253798463718, + "loss": 1.5344, + "step": 30379 + }, + { + "epoch": 0.39477424416423157, + "grad_norm": 0.4160550534725189, + "learning_rate": 0.0001210699385227258, + "loss": 1.3546, + "step": 30380 + }, + { + "epoch": 0.3947872387081474, + "grad_norm": 0.33458447456359863, + "learning_rate": 0.00012106733906081442, + "loss": 1.4475, + "step": 30381 + }, + { + "epoch": 0.3948002332520633, + "grad_norm": 0.3680357336997986, + "learning_rate": 0.00012106473959890303, + "loss": 1.3641, + "step": 30382 + }, + { + "epoch": 0.39481322779597916, + "grad_norm": 0.4701084494590759, + "learning_rate": 0.00012106214013699164, + "loss": 1.5225, + "step": 30383 + }, + { + "epoch": 0.39482622233989506, + "grad_norm": 0.33250707387924194, + "learning_rate": 0.00012105954067508028, + "loss": 1.294, + "step": 30384 + }, + { + "epoch": 0.3948392168838109, + "grad_norm": 0.4820285141468048, + "learning_rate": 0.00012105694121316889, + "loss": 1.4186, + "step": 30385 + }, + { + "epoch": 0.3948522114277268, + "grad_norm": 0.3237875699996948, + "learning_rate": 0.0001210543417512575, + "loss": 1.5293, + "step": 30386 + }, + { + "epoch": 0.39486520597164265, + "grad_norm": 0.4344033896923065, + "learning_rate": 0.00012105174228934611, + "loss": 1.4003, + "step": 30387 + }, + { + "epoch": 0.39487820051555855, + "grad_norm": 0.29124560952186584, + "learning_rate": 0.00012104914282743473, + "loss": 1.266, + "step": 30388 + }, + { + "epoch": 0.3948911950594744, + "grad_norm": 0.3328922986984253, + "learning_rate": 0.00012104654336552334, + "loss": 1.4182, + "step": 30389 + }, + { + "epoch": 0.3949041896033903, + "grad_norm": 0.3238808214664459, + "learning_rate": 0.00012104394390361196, + "loss": 1.4589, + "step": 30390 + }, + { + "epoch": 0.39491718414730614, + "grad_norm": 0.3149753212928772, + "learning_rate": 0.00012104134444170057, + "loss": 1.352, + "step": 30391 + }, + { + "epoch": 0.39493017869122204, + "grad_norm": 0.4066602289676666, + "learning_rate": 0.0001210387449797892, + "loss": 1.5107, + "step": 30392 + }, + { + "epoch": 0.3949431732351379, + "grad_norm": 0.3294428586959839, + "learning_rate": 0.0001210361455178778, + "loss": 1.1583, + "step": 30393 + }, + { + "epoch": 0.3949561677790538, + "grad_norm": 0.3261302411556244, + "learning_rate": 0.00012103354605596641, + "loss": 1.2195, + "step": 30394 + }, + { + "epoch": 0.39496916232296964, + "grad_norm": 0.41166698932647705, + "learning_rate": 0.00012103094659405502, + "loss": 1.3815, + "step": 30395 + }, + { + "epoch": 0.39498215686688554, + "grad_norm": 0.5023846626281738, + "learning_rate": 0.00012102834713214366, + "loss": 1.2746, + "step": 30396 + }, + { + "epoch": 0.3949951514108014, + "grad_norm": 0.42920002341270447, + "learning_rate": 0.00012102574767023227, + "loss": 1.4611, + "step": 30397 + }, + { + "epoch": 0.3950081459547173, + "grad_norm": 0.3440791666507721, + "learning_rate": 0.00012102314820832088, + "loss": 1.4547, + "step": 30398 + }, + { + "epoch": 0.3950211404986331, + "grad_norm": 0.3438780605792999, + "learning_rate": 0.0001210205487464095, + "loss": 1.2532, + "step": 30399 + }, + { + "epoch": 0.39503413504254903, + "grad_norm": 0.37750938534736633, + "learning_rate": 0.00012101794928449812, + "loss": 1.4519, + "step": 30400 + }, + { + "epoch": 0.3950471295864649, + "grad_norm": 0.3575557470321655, + "learning_rate": 0.00012101534982258673, + "loss": 1.4045, + "step": 30401 + }, + { + "epoch": 0.3950601241303808, + "grad_norm": 0.30885812640190125, + "learning_rate": 0.00012101275036067534, + "loss": 1.4515, + "step": 30402 + }, + { + "epoch": 0.3950731186742966, + "grad_norm": 0.3775915503501892, + "learning_rate": 0.00012101015089876395, + "loss": 1.4948, + "step": 30403 + }, + { + "epoch": 0.3950861132182125, + "grad_norm": 0.36845511198043823, + "learning_rate": 0.00012100755143685259, + "loss": 1.2433, + "step": 30404 + }, + { + "epoch": 0.39509910776212837, + "grad_norm": 0.32144248485565186, + "learning_rate": 0.0001210049519749412, + "loss": 1.545, + "step": 30405 + }, + { + "epoch": 0.39511210230604427, + "grad_norm": 0.32726970314979553, + "learning_rate": 0.0001210023525130298, + "loss": 1.5395, + "step": 30406 + }, + { + "epoch": 0.3951250968499601, + "grad_norm": 0.2911970615386963, + "learning_rate": 0.00012099975305111841, + "loss": 1.1765, + "step": 30407 + }, + { + "epoch": 0.395138091393876, + "grad_norm": 0.46382129192352295, + "learning_rate": 0.00012099715358920705, + "loss": 1.4141, + "step": 30408 + }, + { + "epoch": 0.39515108593779186, + "grad_norm": 0.36765822768211365, + "learning_rate": 0.00012099455412729566, + "loss": 1.3772, + "step": 30409 + }, + { + "epoch": 0.39516408048170776, + "grad_norm": 0.38965848088264465, + "learning_rate": 0.00012099195466538427, + "loss": 1.5916, + "step": 30410 + }, + { + "epoch": 0.3951770750256236, + "grad_norm": 0.3931178152561188, + "learning_rate": 0.00012098935520347288, + "loss": 1.3754, + "step": 30411 + }, + { + "epoch": 0.3951900695695395, + "grad_norm": 0.41154351830482483, + "learning_rate": 0.0001209867557415615, + "loss": 1.4641, + "step": 30412 + }, + { + "epoch": 0.39520306411345535, + "grad_norm": 0.3723491132259369, + "learning_rate": 0.00012098415627965012, + "loss": 1.4292, + "step": 30413 + }, + { + "epoch": 0.39521605865737125, + "grad_norm": 0.3092310428619385, + "learning_rate": 0.00012098155681773873, + "loss": 1.2588, + "step": 30414 + }, + { + "epoch": 0.3952290532012871, + "grad_norm": 0.4849766492843628, + "learning_rate": 0.00012097895735582736, + "loss": 1.4532, + "step": 30415 + }, + { + "epoch": 0.395242047745203, + "grad_norm": 0.45692840218544006, + "learning_rate": 0.00012097635789391598, + "loss": 1.4348, + "step": 30416 + }, + { + "epoch": 0.39525504228911884, + "grad_norm": 0.35874131321907043, + "learning_rate": 0.00012097375843200459, + "loss": 1.2967, + "step": 30417 + }, + { + "epoch": 0.39526803683303474, + "grad_norm": 0.4234062731266022, + "learning_rate": 0.00012097115897009318, + "loss": 1.3856, + "step": 30418 + }, + { + "epoch": 0.3952810313769506, + "grad_norm": 0.5291977524757385, + "learning_rate": 0.00012096855950818182, + "loss": 1.5585, + "step": 30419 + }, + { + "epoch": 0.3952940259208665, + "grad_norm": 0.43282121419906616, + "learning_rate": 0.00012096596004627043, + "loss": 1.309, + "step": 30420 + }, + { + "epoch": 0.39530702046478233, + "grad_norm": 0.42477747797966003, + "learning_rate": 0.00012096336058435904, + "loss": 1.2817, + "step": 30421 + }, + { + "epoch": 0.39532001500869823, + "grad_norm": 0.43694546818733215, + "learning_rate": 0.00012096076112244765, + "loss": 1.4716, + "step": 30422 + }, + { + "epoch": 0.3953330095526141, + "grad_norm": 0.46578672528266907, + "learning_rate": 0.00012095816166053628, + "loss": 1.5188, + "step": 30423 + }, + { + "epoch": 0.39534600409653, + "grad_norm": 0.40985047817230225, + "learning_rate": 0.00012095556219862489, + "loss": 1.3575, + "step": 30424 + }, + { + "epoch": 0.3953589986404458, + "grad_norm": 0.41353029012680054, + "learning_rate": 0.0001209529627367135, + "loss": 1.2939, + "step": 30425 + }, + { + "epoch": 0.3953719931843617, + "grad_norm": 0.4211789071559906, + "learning_rate": 0.00012095036327480211, + "loss": 1.544, + "step": 30426 + }, + { + "epoch": 0.39538498772827757, + "grad_norm": 0.49660423398017883, + "learning_rate": 0.00012094776381289075, + "loss": 1.2689, + "step": 30427 + }, + { + "epoch": 0.39539798227219347, + "grad_norm": 0.37554702162742615, + "learning_rate": 0.00012094516435097936, + "loss": 1.3869, + "step": 30428 + }, + { + "epoch": 0.3954109768161093, + "grad_norm": 0.3733389973640442, + "learning_rate": 0.00012094256488906797, + "loss": 1.4184, + "step": 30429 + }, + { + "epoch": 0.3954239713600252, + "grad_norm": 0.4084286689758301, + "learning_rate": 0.00012093996542715658, + "loss": 1.3995, + "step": 30430 + }, + { + "epoch": 0.39543696590394106, + "grad_norm": 0.37877383828163147, + "learning_rate": 0.00012093736596524521, + "loss": 1.5102, + "step": 30431 + }, + { + "epoch": 0.39544996044785696, + "grad_norm": 0.3838941156864166, + "learning_rate": 0.00012093476650333382, + "loss": 1.465, + "step": 30432 + }, + { + "epoch": 0.3954629549917728, + "grad_norm": 0.5083638429641724, + "learning_rate": 0.00012093216704142243, + "loss": 1.5578, + "step": 30433 + }, + { + "epoch": 0.3954759495356887, + "grad_norm": 0.4485577642917633, + "learning_rate": 0.00012092956757951104, + "loss": 1.2474, + "step": 30434 + }, + { + "epoch": 0.39548894407960455, + "grad_norm": 0.4122265577316284, + "learning_rate": 0.00012092696811759966, + "loss": 1.6011, + "step": 30435 + }, + { + "epoch": 0.39550193862352045, + "grad_norm": 0.4267435669898987, + "learning_rate": 0.00012092436865568828, + "loss": 1.4997, + "step": 30436 + }, + { + "epoch": 0.3955149331674363, + "grad_norm": 0.4243057370185852, + "learning_rate": 0.00012092176919377689, + "loss": 1.4076, + "step": 30437 + }, + { + "epoch": 0.3955279277113522, + "grad_norm": 0.4130112826824188, + "learning_rate": 0.0001209191697318655, + "loss": 1.3049, + "step": 30438 + }, + { + "epoch": 0.39554092225526805, + "grad_norm": 0.3776995837688446, + "learning_rate": 0.00012091657026995414, + "loss": 1.437, + "step": 30439 + }, + { + "epoch": 0.39555391679918395, + "grad_norm": 0.4199751317501068, + "learning_rate": 0.00012091397080804275, + "loss": 1.3364, + "step": 30440 + }, + { + "epoch": 0.3955669113430998, + "grad_norm": 0.3213954567909241, + "learning_rate": 0.00012091137134613136, + "loss": 1.4629, + "step": 30441 + }, + { + "epoch": 0.3955799058870157, + "grad_norm": 0.3428385853767395, + "learning_rate": 0.00012090877188421997, + "loss": 1.315, + "step": 30442 + }, + { + "epoch": 0.39559290043093154, + "grad_norm": 0.517498254776001, + "learning_rate": 0.00012090617242230859, + "loss": 1.5401, + "step": 30443 + }, + { + "epoch": 0.39560589497484744, + "grad_norm": 0.3723388612270355, + "learning_rate": 0.0001209035729603972, + "loss": 1.2064, + "step": 30444 + }, + { + "epoch": 0.3956188895187633, + "grad_norm": 0.4449765682220459, + "learning_rate": 0.00012090097349848581, + "loss": 1.4529, + "step": 30445 + }, + { + "epoch": 0.3956318840626792, + "grad_norm": 0.4224238097667694, + "learning_rate": 0.00012089837403657443, + "loss": 1.2925, + "step": 30446 + }, + { + "epoch": 0.39564487860659503, + "grad_norm": 0.36337023973464966, + "learning_rate": 0.00012089577457466306, + "loss": 1.354, + "step": 30447 + }, + { + "epoch": 0.39565787315051093, + "grad_norm": 0.33868369460105896, + "learning_rate": 0.00012089317511275166, + "loss": 1.5934, + "step": 30448 + }, + { + "epoch": 0.3956708676944268, + "grad_norm": 0.3751220107078552, + "learning_rate": 0.00012089057565084027, + "loss": 1.5654, + "step": 30449 + }, + { + "epoch": 0.3956838622383427, + "grad_norm": 0.5109931826591492, + "learning_rate": 0.00012088797618892888, + "loss": 1.3383, + "step": 30450 + }, + { + "epoch": 0.3956968567822586, + "grad_norm": 0.5026741623878479, + "learning_rate": 0.00012088537672701752, + "loss": 1.4177, + "step": 30451 + }, + { + "epoch": 0.3957098513261744, + "grad_norm": 0.36857569217681885, + "learning_rate": 0.00012088277726510613, + "loss": 1.3923, + "step": 30452 + }, + { + "epoch": 0.3957228458700903, + "grad_norm": 0.34272661805152893, + "learning_rate": 0.00012088017780319474, + "loss": 1.1314, + "step": 30453 + }, + { + "epoch": 0.39573584041400617, + "grad_norm": 0.32468923926353455, + "learning_rate": 0.00012087757834128337, + "loss": 1.2391, + "step": 30454 + }, + { + "epoch": 0.39574883495792207, + "grad_norm": 0.38658881187438965, + "learning_rate": 0.00012087497887937198, + "loss": 1.2582, + "step": 30455 + }, + { + "epoch": 0.3957618295018379, + "grad_norm": 0.38872388005256653, + "learning_rate": 0.00012087237941746059, + "loss": 1.2183, + "step": 30456 + }, + { + "epoch": 0.3957748240457538, + "grad_norm": 0.389353483915329, + "learning_rate": 0.0001208697799555492, + "loss": 1.3856, + "step": 30457 + }, + { + "epoch": 0.39578781858966966, + "grad_norm": 0.41509899497032166, + "learning_rate": 0.00012086718049363784, + "loss": 1.3443, + "step": 30458 + }, + { + "epoch": 0.39580081313358556, + "grad_norm": 0.3822564482688904, + "learning_rate": 0.00012086458103172645, + "loss": 1.2186, + "step": 30459 + }, + { + "epoch": 0.3958138076775014, + "grad_norm": 0.4116131067276001, + "learning_rate": 0.00012086198156981505, + "loss": 1.3594, + "step": 30460 + }, + { + "epoch": 0.3958268022214173, + "grad_norm": 0.45288777351379395, + "learning_rate": 0.00012085938210790366, + "loss": 1.4753, + "step": 30461 + }, + { + "epoch": 0.39583979676533315, + "grad_norm": 0.44897639751434326, + "learning_rate": 0.0001208567826459923, + "loss": 1.446, + "step": 30462 + }, + { + "epoch": 0.39585279130924905, + "grad_norm": 0.2966291606426239, + "learning_rate": 0.0001208541831840809, + "loss": 1.4837, + "step": 30463 + }, + { + "epoch": 0.3958657858531649, + "grad_norm": 0.40922674536705017, + "learning_rate": 0.00012085158372216952, + "loss": 1.1878, + "step": 30464 + }, + { + "epoch": 0.3958787803970808, + "grad_norm": 0.40658944845199585, + "learning_rate": 0.00012084898426025813, + "loss": 1.3913, + "step": 30465 + }, + { + "epoch": 0.39589177494099664, + "grad_norm": 0.3612760007381439, + "learning_rate": 0.00012084638479834675, + "loss": 1.4402, + "step": 30466 + }, + { + "epoch": 0.39590476948491254, + "grad_norm": 0.4137723445892334, + "learning_rate": 0.00012084378533643536, + "loss": 1.3328, + "step": 30467 + }, + { + "epoch": 0.3959177640288284, + "grad_norm": 0.42410802841186523, + "learning_rate": 0.00012084118587452397, + "loss": 1.4157, + "step": 30468 + }, + { + "epoch": 0.3959307585727443, + "grad_norm": 0.48106735944747925, + "learning_rate": 0.00012083858641261259, + "loss": 1.5204, + "step": 30469 + }, + { + "epoch": 0.39594375311666014, + "grad_norm": 0.41061797738075256, + "learning_rate": 0.00012083598695070122, + "loss": 1.3481, + "step": 30470 + }, + { + "epoch": 0.39595674766057604, + "grad_norm": 0.390939861536026, + "learning_rate": 0.00012083338748878983, + "loss": 1.4766, + "step": 30471 + }, + { + "epoch": 0.3959697422044919, + "grad_norm": 0.4585546851158142, + "learning_rate": 0.00012083078802687844, + "loss": 1.4192, + "step": 30472 + }, + { + "epoch": 0.3959827367484078, + "grad_norm": 0.3996250629425049, + "learning_rate": 0.00012082818856496704, + "loss": 1.4573, + "step": 30473 + }, + { + "epoch": 0.3959957312923236, + "grad_norm": 0.36808812618255615, + "learning_rate": 0.00012082558910305568, + "loss": 1.2857, + "step": 30474 + }, + { + "epoch": 0.39600872583623953, + "grad_norm": 0.49980711936950684, + "learning_rate": 0.00012082298964114429, + "loss": 1.2549, + "step": 30475 + }, + { + "epoch": 0.3960217203801554, + "grad_norm": 0.40274959802627563, + "learning_rate": 0.0001208203901792329, + "loss": 1.2752, + "step": 30476 + }, + { + "epoch": 0.3960347149240713, + "grad_norm": 0.40584805607795715, + "learning_rate": 0.00012081779071732151, + "loss": 1.302, + "step": 30477 + }, + { + "epoch": 0.3960477094679871, + "grad_norm": 0.3568449914455414, + "learning_rate": 0.00012081519125541014, + "loss": 1.3545, + "step": 30478 + }, + { + "epoch": 0.396060704011903, + "grad_norm": 0.3956317603588104, + "learning_rate": 0.00012081259179349875, + "loss": 1.2355, + "step": 30479 + }, + { + "epoch": 0.39607369855581886, + "grad_norm": 0.39014559984207153, + "learning_rate": 0.00012080999233158736, + "loss": 1.4363, + "step": 30480 + }, + { + "epoch": 0.39608669309973477, + "grad_norm": 0.31834426522254944, + "learning_rate": 0.00012080739286967597, + "loss": 1.4256, + "step": 30481 + }, + { + "epoch": 0.3960996876436506, + "grad_norm": 0.4114624559879303, + "learning_rate": 0.00012080479340776461, + "loss": 1.4327, + "step": 30482 + }, + { + "epoch": 0.3961126821875665, + "grad_norm": 0.2922571301460266, + "learning_rate": 0.00012080219394585322, + "loss": 1.0886, + "step": 30483 + }, + { + "epoch": 0.39612567673148236, + "grad_norm": 0.5182799696922302, + "learning_rate": 0.00012079959448394183, + "loss": 1.396, + "step": 30484 + }, + { + "epoch": 0.39613867127539826, + "grad_norm": 0.35662055015563965, + "learning_rate": 0.00012079699502203044, + "loss": 1.5107, + "step": 30485 + }, + { + "epoch": 0.3961516658193141, + "grad_norm": 0.41533610224723816, + "learning_rate": 0.00012079439556011907, + "loss": 1.4378, + "step": 30486 + }, + { + "epoch": 0.39616466036323, + "grad_norm": 0.38804373145103455, + "learning_rate": 0.00012079179609820768, + "loss": 1.3224, + "step": 30487 + }, + { + "epoch": 0.39617765490714585, + "grad_norm": 0.42738789319992065, + "learning_rate": 0.00012078919663629629, + "loss": 1.3672, + "step": 30488 + }, + { + "epoch": 0.39619064945106175, + "grad_norm": 0.4870091378688812, + "learning_rate": 0.00012078659717438493, + "loss": 1.5115, + "step": 30489 + }, + { + "epoch": 0.3962036439949776, + "grad_norm": 0.33631303906440735, + "learning_rate": 0.00012078399771247352, + "loss": 1.3926, + "step": 30490 + }, + { + "epoch": 0.3962166385388935, + "grad_norm": 0.2987799644470215, + "learning_rate": 0.00012078139825056213, + "loss": 1.246, + "step": 30491 + }, + { + "epoch": 0.39622963308280934, + "grad_norm": 0.3712847828865051, + "learning_rate": 0.00012077879878865074, + "loss": 1.3163, + "step": 30492 + }, + { + "epoch": 0.39624262762672524, + "grad_norm": 0.366850882768631, + "learning_rate": 0.00012077619932673938, + "loss": 1.3545, + "step": 30493 + }, + { + "epoch": 0.3962556221706411, + "grad_norm": 0.38335683941841125, + "learning_rate": 0.000120773599864828, + "loss": 1.1562, + "step": 30494 + }, + { + "epoch": 0.396268616714557, + "grad_norm": 0.49375998973846436, + "learning_rate": 0.0001207710004029166, + "loss": 1.3706, + "step": 30495 + }, + { + "epoch": 0.39628161125847283, + "grad_norm": 0.321294903755188, + "learning_rate": 0.00012076840094100522, + "loss": 1.3135, + "step": 30496 + }, + { + "epoch": 0.39629460580238873, + "grad_norm": 0.370956152677536, + "learning_rate": 0.00012076580147909384, + "loss": 1.4765, + "step": 30497 + }, + { + "epoch": 0.3963076003463046, + "grad_norm": 0.39633017778396606, + "learning_rate": 0.00012076320201718245, + "loss": 1.4525, + "step": 30498 + }, + { + "epoch": 0.3963205948902205, + "grad_norm": 0.4501740336418152, + "learning_rate": 0.00012076060255527106, + "loss": 1.5941, + "step": 30499 + }, + { + "epoch": 0.3963335894341363, + "grad_norm": 0.35627079010009766, + "learning_rate": 0.00012075800309335967, + "loss": 1.3806, + "step": 30500 + }, + { + "epoch": 0.3963465839780522, + "grad_norm": 0.3913305699825287, + "learning_rate": 0.00012075540363144831, + "loss": 1.2485, + "step": 30501 + }, + { + "epoch": 0.39635957852196807, + "grad_norm": 0.39382895827293396, + "learning_rate": 0.00012075280416953691, + "loss": 1.3889, + "step": 30502 + }, + { + "epoch": 0.39637257306588397, + "grad_norm": 0.4183182120323181, + "learning_rate": 0.00012075020470762552, + "loss": 1.5115, + "step": 30503 + }, + { + "epoch": 0.3963855676097998, + "grad_norm": 0.3839147686958313, + "learning_rate": 0.00012074760524571413, + "loss": 1.4127, + "step": 30504 + }, + { + "epoch": 0.3963985621537157, + "grad_norm": 0.43862468004226685, + "learning_rate": 0.00012074500578380277, + "loss": 1.3861, + "step": 30505 + }, + { + "epoch": 0.39641155669763156, + "grad_norm": 0.34355974197387695, + "learning_rate": 0.00012074240632189138, + "loss": 1.3486, + "step": 30506 + }, + { + "epoch": 0.39642455124154746, + "grad_norm": 0.38287320733070374, + "learning_rate": 0.00012073980685997999, + "loss": 1.3295, + "step": 30507 + }, + { + "epoch": 0.3964375457854633, + "grad_norm": 0.2911398410797119, + "learning_rate": 0.0001207372073980686, + "loss": 1.2995, + "step": 30508 + }, + { + "epoch": 0.3964505403293792, + "grad_norm": 0.34748217463493347, + "learning_rate": 0.00012073460793615723, + "loss": 1.2762, + "step": 30509 + }, + { + "epoch": 0.39646353487329505, + "grad_norm": 0.4181280732154846, + "learning_rate": 0.00012073200847424584, + "loss": 1.5515, + "step": 30510 + }, + { + "epoch": 0.39647652941721095, + "grad_norm": 0.3230595588684082, + "learning_rate": 0.00012072940901233445, + "loss": 1.3266, + "step": 30511 + }, + { + "epoch": 0.3964895239611268, + "grad_norm": 0.3540262281894684, + "learning_rate": 0.00012072680955042306, + "loss": 1.2559, + "step": 30512 + }, + { + "epoch": 0.3965025185050427, + "grad_norm": 0.5047813057899475, + "learning_rate": 0.0001207242100885117, + "loss": 1.4668, + "step": 30513 + }, + { + "epoch": 0.39651551304895855, + "grad_norm": 0.3461460471153259, + "learning_rate": 0.00012072161062660031, + "loss": 1.4371, + "step": 30514 + }, + { + "epoch": 0.39652850759287445, + "grad_norm": 0.3908366858959198, + "learning_rate": 0.0001207190111646889, + "loss": 1.3088, + "step": 30515 + }, + { + "epoch": 0.3965415021367903, + "grad_norm": 0.43267524242401123, + "learning_rate": 0.00012071641170277752, + "loss": 1.419, + "step": 30516 + }, + { + "epoch": 0.3965544966807062, + "grad_norm": 0.31342220306396484, + "learning_rate": 0.00012071381224086615, + "loss": 1.3394, + "step": 30517 + }, + { + "epoch": 0.39656749122462204, + "grad_norm": 0.4734180271625519, + "learning_rate": 0.00012071121277895476, + "loss": 1.4767, + "step": 30518 + }, + { + "epoch": 0.39658048576853794, + "grad_norm": 0.4645110070705414, + "learning_rate": 0.00012070861331704338, + "loss": 1.6725, + "step": 30519 + }, + { + "epoch": 0.3965934803124538, + "grad_norm": 0.324693888425827, + "learning_rate": 0.00012070601385513199, + "loss": 1.3567, + "step": 30520 + }, + { + "epoch": 0.3966064748563697, + "grad_norm": 0.3301478326320648, + "learning_rate": 0.00012070341439322061, + "loss": 1.1773, + "step": 30521 + }, + { + "epoch": 0.39661946940028553, + "grad_norm": 0.43589335680007935, + "learning_rate": 0.00012070081493130922, + "loss": 1.3763, + "step": 30522 + }, + { + "epoch": 0.39663246394420143, + "grad_norm": 0.43325603008270264, + "learning_rate": 0.00012069821546939783, + "loss": 1.3717, + "step": 30523 + }, + { + "epoch": 0.3966454584881173, + "grad_norm": 0.37894830107688904, + "learning_rate": 0.00012069561600748644, + "loss": 1.3014, + "step": 30524 + }, + { + "epoch": 0.3966584530320332, + "grad_norm": 0.38113725185394287, + "learning_rate": 0.00012069301654557508, + "loss": 1.3669, + "step": 30525 + }, + { + "epoch": 0.396671447575949, + "grad_norm": 0.5112550854682922, + "learning_rate": 0.00012069041708366369, + "loss": 1.4037, + "step": 30526 + }, + { + "epoch": 0.3966844421198649, + "grad_norm": 0.39900729060173035, + "learning_rate": 0.0001206878176217523, + "loss": 1.2266, + "step": 30527 + }, + { + "epoch": 0.3966974366637808, + "grad_norm": 0.490354061126709, + "learning_rate": 0.00012068521815984093, + "loss": 1.3599, + "step": 30528 + }, + { + "epoch": 0.39671043120769667, + "grad_norm": 0.5201057195663452, + "learning_rate": 0.00012068261869792954, + "loss": 1.5922, + "step": 30529 + }, + { + "epoch": 0.39672342575161257, + "grad_norm": 0.36522430181503296, + "learning_rate": 0.00012068001923601815, + "loss": 1.457, + "step": 30530 + }, + { + "epoch": 0.3967364202955284, + "grad_norm": 0.43237805366516113, + "learning_rate": 0.00012067741977410676, + "loss": 1.422, + "step": 30531 + }, + { + "epoch": 0.3967494148394443, + "grad_norm": 0.4094823896884918, + "learning_rate": 0.00012067482031219539, + "loss": 1.4329, + "step": 30532 + }, + { + "epoch": 0.39676240938336016, + "grad_norm": 0.2513434588909149, + "learning_rate": 0.000120672220850284, + "loss": 1.0847, + "step": 30533 + }, + { + "epoch": 0.39677540392727606, + "grad_norm": 0.42192405462265015, + "learning_rate": 0.00012066962138837261, + "loss": 1.5105, + "step": 30534 + }, + { + "epoch": 0.3967883984711919, + "grad_norm": 0.45719724893569946, + "learning_rate": 0.00012066702192646122, + "loss": 1.375, + "step": 30535 + }, + { + "epoch": 0.3968013930151078, + "grad_norm": 0.44061875343322754, + "learning_rate": 0.00012066442246454986, + "loss": 1.3445, + "step": 30536 + }, + { + "epoch": 0.39681438755902365, + "grad_norm": 0.27138903737068176, + "learning_rate": 0.00012066182300263847, + "loss": 1.3426, + "step": 30537 + }, + { + "epoch": 0.39682738210293955, + "grad_norm": 0.3566970229148865, + "learning_rate": 0.00012065922354072708, + "loss": 1.3506, + "step": 30538 + }, + { + "epoch": 0.3968403766468554, + "grad_norm": 0.47373026609420776, + "learning_rate": 0.00012065662407881569, + "loss": 1.4535, + "step": 30539 + }, + { + "epoch": 0.3968533711907713, + "grad_norm": 0.3562994599342346, + "learning_rate": 0.00012065402461690431, + "loss": 1.4934, + "step": 30540 + }, + { + "epoch": 0.39686636573468714, + "grad_norm": 0.3392718434333801, + "learning_rate": 0.00012065142515499292, + "loss": 1.3738, + "step": 30541 + }, + { + "epoch": 0.39687936027860304, + "grad_norm": 0.4067341685295105, + "learning_rate": 0.00012064882569308154, + "loss": 1.2405, + "step": 30542 + }, + { + "epoch": 0.3968923548225189, + "grad_norm": 0.43516093492507935, + "learning_rate": 0.00012064622623117015, + "loss": 1.4706, + "step": 30543 + }, + { + "epoch": 0.3969053493664348, + "grad_norm": 0.5096096396446228, + "learning_rate": 0.00012064362676925877, + "loss": 1.5871, + "step": 30544 + }, + { + "epoch": 0.39691834391035064, + "grad_norm": 0.43477702140808105, + "learning_rate": 0.00012064102730734738, + "loss": 1.5679, + "step": 30545 + }, + { + "epoch": 0.39693133845426654, + "grad_norm": 0.4271295666694641, + "learning_rate": 0.00012063842784543599, + "loss": 1.4513, + "step": 30546 + }, + { + "epoch": 0.3969443329981824, + "grad_norm": 0.32982924580574036, + "learning_rate": 0.0001206358283835246, + "loss": 1.4264, + "step": 30547 + }, + { + "epoch": 0.3969573275420983, + "grad_norm": 0.3597833216190338, + "learning_rate": 0.00012063322892161324, + "loss": 1.4237, + "step": 30548 + }, + { + "epoch": 0.3969703220860141, + "grad_norm": 0.4055320620536804, + "learning_rate": 0.00012063062945970185, + "loss": 1.5282, + "step": 30549 + }, + { + "epoch": 0.39698331662993, + "grad_norm": 0.39031103253364563, + "learning_rate": 0.00012062802999779046, + "loss": 1.4969, + "step": 30550 + }, + { + "epoch": 0.3969963111738459, + "grad_norm": 0.45249009132385254, + "learning_rate": 0.00012062543053587907, + "loss": 1.4753, + "step": 30551 + }, + { + "epoch": 0.3970093057177618, + "grad_norm": 0.3768482506275177, + "learning_rate": 0.0001206228310739677, + "loss": 1.2406, + "step": 30552 + }, + { + "epoch": 0.3970223002616776, + "grad_norm": 0.3541805148124695, + "learning_rate": 0.00012062023161205631, + "loss": 1.3852, + "step": 30553 + }, + { + "epoch": 0.3970352948055935, + "grad_norm": 0.3437105119228363, + "learning_rate": 0.00012061763215014492, + "loss": 1.4595, + "step": 30554 + }, + { + "epoch": 0.39704828934950936, + "grad_norm": 0.3743917644023895, + "learning_rate": 0.00012061503268823353, + "loss": 1.5073, + "step": 30555 + }, + { + "epoch": 0.39706128389342527, + "grad_norm": 0.4299183785915375, + "learning_rate": 0.00012061243322632217, + "loss": 1.2275, + "step": 30556 + }, + { + "epoch": 0.3970742784373411, + "grad_norm": 0.4112553894519806, + "learning_rate": 0.00012060983376441077, + "loss": 1.4164, + "step": 30557 + }, + { + "epoch": 0.397087272981257, + "grad_norm": 0.3559432923793793, + "learning_rate": 0.00012060723430249938, + "loss": 1.3988, + "step": 30558 + }, + { + "epoch": 0.39710026752517286, + "grad_norm": 0.40167734026908875, + "learning_rate": 0.00012060463484058799, + "loss": 1.1957, + "step": 30559 + }, + { + "epoch": 0.39711326206908876, + "grad_norm": 0.4780190885066986, + "learning_rate": 0.00012060203537867663, + "loss": 1.5468, + "step": 30560 + }, + { + "epoch": 0.3971262566130046, + "grad_norm": 0.32918885350227356, + "learning_rate": 0.00012059943591676524, + "loss": 1.4744, + "step": 30561 + }, + { + "epoch": 0.3971392511569205, + "grad_norm": 0.4281514585018158, + "learning_rate": 0.00012059683645485385, + "loss": 1.3534, + "step": 30562 + }, + { + "epoch": 0.39715224570083635, + "grad_norm": 0.37919455766677856, + "learning_rate": 0.00012059423699294247, + "loss": 1.2382, + "step": 30563 + }, + { + "epoch": 0.39716524024475225, + "grad_norm": 0.34532901644706726, + "learning_rate": 0.00012059163753103108, + "loss": 1.5231, + "step": 30564 + }, + { + "epoch": 0.3971782347886681, + "grad_norm": 0.4213751554489136, + "learning_rate": 0.0001205890380691197, + "loss": 1.4574, + "step": 30565 + }, + { + "epoch": 0.397191229332584, + "grad_norm": 0.3961006999015808, + "learning_rate": 0.0001205864386072083, + "loss": 1.3479, + "step": 30566 + }, + { + "epoch": 0.39720422387649984, + "grad_norm": 0.4103583097457886, + "learning_rate": 0.00012058383914529694, + "loss": 1.315, + "step": 30567 + }, + { + "epoch": 0.39721721842041574, + "grad_norm": 0.3253219723701477, + "learning_rate": 0.00012058123968338556, + "loss": 1.1293, + "step": 30568 + }, + { + "epoch": 0.3972302129643316, + "grad_norm": 0.3826769292354584, + "learning_rate": 0.00012057864022147417, + "loss": 1.3111, + "step": 30569 + }, + { + "epoch": 0.3972432075082475, + "grad_norm": 0.44675466418266296, + "learning_rate": 0.00012057604075956276, + "loss": 1.5, + "step": 30570 + }, + { + "epoch": 0.39725620205216333, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0001205734412976514, + "loss": 1.2416, + "step": 30571 + }, + { + "epoch": 0.39726919659607923, + "grad_norm": 0.5032706260681152, + "learning_rate": 0.00012057084183574001, + "loss": 1.4933, + "step": 30572 + }, + { + "epoch": 0.3972821911399951, + "grad_norm": 0.2902430295944214, + "learning_rate": 0.00012056824237382862, + "loss": 1.4969, + "step": 30573 + }, + { + "epoch": 0.397295185683911, + "grad_norm": 0.47435638308525085, + "learning_rate": 0.00012056564291191723, + "loss": 1.3721, + "step": 30574 + }, + { + "epoch": 0.3973081802278268, + "grad_norm": 0.3958859145641327, + "learning_rate": 0.00012056304345000586, + "loss": 1.4035, + "step": 30575 + }, + { + "epoch": 0.3973211747717427, + "grad_norm": 0.3804663419723511, + "learning_rate": 0.00012056044398809447, + "loss": 1.4726, + "step": 30576 + }, + { + "epoch": 0.39733416931565857, + "grad_norm": 0.39885711669921875, + "learning_rate": 0.00012055784452618308, + "loss": 1.5113, + "step": 30577 + }, + { + "epoch": 0.39734716385957447, + "grad_norm": 0.4091666638851166, + "learning_rate": 0.00012055524506427169, + "loss": 1.5481, + "step": 30578 + }, + { + "epoch": 0.3973601584034903, + "grad_norm": 0.484757661819458, + "learning_rate": 0.00012055264560236033, + "loss": 1.4901, + "step": 30579 + }, + { + "epoch": 0.3973731529474062, + "grad_norm": 0.37343254685401917, + "learning_rate": 0.00012055004614044894, + "loss": 1.2495, + "step": 30580 + }, + { + "epoch": 0.39738614749132206, + "grad_norm": 0.41060778498649597, + "learning_rate": 0.00012054744667853755, + "loss": 1.2152, + "step": 30581 + }, + { + "epoch": 0.39739914203523796, + "grad_norm": 0.4466177225112915, + "learning_rate": 0.00012054484721662615, + "loss": 1.6325, + "step": 30582 + }, + { + "epoch": 0.3974121365791538, + "grad_norm": 0.40157777070999146, + "learning_rate": 0.00012054224775471479, + "loss": 1.6397, + "step": 30583 + }, + { + "epoch": 0.3974251311230697, + "grad_norm": 0.36034226417541504, + "learning_rate": 0.0001205396482928034, + "loss": 1.1805, + "step": 30584 + }, + { + "epoch": 0.39743812566698555, + "grad_norm": 0.3764556348323822, + "learning_rate": 0.00012053704883089201, + "loss": 1.3388, + "step": 30585 + }, + { + "epoch": 0.39745112021090145, + "grad_norm": 0.39236927032470703, + "learning_rate": 0.00012053444936898062, + "loss": 1.506, + "step": 30586 + }, + { + "epoch": 0.3974641147548173, + "grad_norm": 0.36235329508781433, + "learning_rate": 0.00012053184990706924, + "loss": 1.2361, + "step": 30587 + }, + { + "epoch": 0.3974771092987332, + "grad_norm": 0.2898573577404022, + "learning_rate": 0.00012052925044515786, + "loss": 1.1999, + "step": 30588 + }, + { + "epoch": 0.39749010384264905, + "grad_norm": 0.4543534219264984, + "learning_rate": 0.00012052665098324647, + "loss": 1.5407, + "step": 30589 + }, + { + "epoch": 0.39750309838656495, + "grad_norm": 0.3502747118473053, + "learning_rate": 0.00012052405152133508, + "loss": 1.3537, + "step": 30590 + }, + { + "epoch": 0.3975160929304808, + "grad_norm": 0.4579504728317261, + "learning_rate": 0.00012052145205942372, + "loss": 1.7329, + "step": 30591 + }, + { + "epoch": 0.3975290874743967, + "grad_norm": 0.4381742775440216, + "learning_rate": 0.00012051885259751233, + "loss": 1.4102, + "step": 30592 + }, + { + "epoch": 0.39754208201831254, + "grad_norm": 0.34841784834861755, + "learning_rate": 0.00012051625313560094, + "loss": 1.3533, + "step": 30593 + }, + { + "epoch": 0.39755507656222844, + "grad_norm": 0.45948076248168945, + "learning_rate": 0.00012051365367368955, + "loss": 1.4975, + "step": 30594 + }, + { + "epoch": 0.3975680711061443, + "grad_norm": 0.3173246383666992, + "learning_rate": 0.00012051105421177817, + "loss": 1.3703, + "step": 30595 + }, + { + "epoch": 0.3975810656500602, + "grad_norm": 0.4007013738155365, + "learning_rate": 0.00012050845474986678, + "loss": 1.3819, + "step": 30596 + }, + { + "epoch": 0.39759406019397603, + "grad_norm": 0.3364884853363037, + "learning_rate": 0.0001205058552879554, + "loss": 1.1384, + "step": 30597 + }, + { + "epoch": 0.39760705473789193, + "grad_norm": 0.3834405541419983, + "learning_rate": 0.000120503255826044, + "loss": 1.3962, + "step": 30598 + }, + { + "epoch": 0.3976200492818078, + "grad_norm": 0.4373567998409271, + "learning_rate": 0.00012050065636413263, + "loss": 1.4803, + "step": 30599 + }, + { + "epoch": 0.3976330438257237, + "grad_norm": 0.3747972548007965, + "learning_rate": 0.00012049805690222124, + "loss": 1.3698, + "step": 30600 + }, + { + "epoch": 0.3976460383696395, + "grad_norm": 0.39132165908813477, + "learning_rate": 0.00012049545744030985, + "loss": 1.4599, + "step": 30601 + }, + { + "epoch": 0.3976590329135554, + "grad_norm": 0.46721139550209045, + "learning_rate": 0.00012049285797839849, + "loss": 1.257, + "step": 30602 + }, + { + "epoch": 0.39767202745747127, + "grad_norm": 0.3414823114871979, + "learning_rate": 0.0001204902585164871, + "loss": 1.628, + "step": 30603 + }, + { + "epoch": 0.39768502200138717, + "grad_norm": 0.3354717195034027, + "learning_rate": 0.00012048765905457571, + "loss": 1.419, + "step": 30604 + }, + { + "epoch": 0.39769801654530307, + "grad_norm": 0.47253406047821045, + "learning_rate": 0.00012048505959266432, + "loss": 1.3272, + "step": 30605 + }, + { + "epoch": 0.3977110110892189, + "grad_norm": 0.39107823371887207, + "learning_rate": 0.00012048246013075295, + "loss": 1.4431, + "step": 30606 + }, + { + "epoch": 0.3977240056331348, + "grad_norm": 0.4482027292251587, + "learning_rate": 0.00012047986066884156, + "loss": 1.6257, + "step": 30607 + }, + { + "epoch": 0.39773700017705066, + "grad_norm": 0.42433831095695496, + "learning_rate": 0.00012047726120693017, + "loss": 1.3668, + "step": 30608 + }, + { + "epoch": 0.39774999472096656, + "grad_norm": 0.42059847712516785, + "learning_rate": 0.00012047466174501878, + "loss": 1.4938, + "step": 30609 + }, + { + "epoch": 0.3977629892648824, + "grad_norm": 0.4015505015850067, + "learning_rate": 0.00012047206228310742, + "loss": 1.4419, + "step": 30610 + }, + { + "epoch": 0.3977759838087983, + "grad_norm": 0.35424479842185974, + "learning_rate": 0.00012046946282119603, + "loss": 1.5279, + "step": 30611 + }, + { + "epoch": 0.39778897835271415, + "grad_norm": 0.4345249533653259, + "learning_rate": 0.00012046686335928463, + "loss": 1.5128, + "step": 30612 + }, + { + "epoch": 0.39780197289663005, + "grad_norm": 0.40703484416007996, + "learning_rate": 0.00012046426389737324, + "loss": 1.5333, + "step": 30613 + }, + { + "epoch": 0.3978149674405459, + "grad_norm": 0.3266160190105438, + "learning_rate": 0.00012046166443546187, + "loss": 1.3241, + "step": 30614 + }, + { + "epoch": 0.3978279619844618, + "grad_norm": 0.346336305141449, + "learning_rate": 0.00012045906497355049, + "loss": 1.3503, + "step": 30615 + }, + { + "epoch": 0.39784095652837764, + "grad_norm": 0.4651446044445038, + "learning_rate": 0.0001204564655116391, + "loss": 1.5313, + "step": 30616 + }, + { + "epoch": 0.39785395107229354, + "grad_norm": 0.4894111156463623, + "learning_rate": 0.00012045386604972771, + "loss": 1.3909, + "step": 30617 + }, + { + "epoch": 0.3978669456162094, + "grad_norm": 0.43553051352500916, + "learning_rate": 0.00012045126658781633, + "loss": 1.2227, + "step": 30618 + }, + { + "epoch": 0.3978799401601253, + "grad_norm": 0.42442312836647034, + "learning_rate": 0.00012044866712590494, + "loss": 1.3431, + "step": 30619 + }, + { + "epoch": 0.39789293470404113, + "grad_norm": 0.39639878273010254, + "learning_rate": 0.00012044606766399355, + "loss": 1.2855, + "step": 30620 + }, + { + "epoch": 0.39790592924795704, + "grad_norm": 0.42936867475509644, + "learning_rate": 0.00012044346820208216, + "loss": 1.3673, + "step": 30621 + }, + { + "epoch": 0.3979189237918729, + "grad_norm": 0.4305630028247833, + "learning_rate": 0.0001204408687401708, + "loss": 1.2348, + "step": 30622 + }, + { + "epoch": 0.3979319183357888, + "grad_norm": 0.39179790019989014, + "learning_rate": 0.00012043826927825941, + "loss": 1.2855, + "step": 30623 + }, + { + "epoch": 0.3979449128797046, + "grad_norm": 0.4149450659751892, + "learning_rate": 0.00012043566981634801, + "loss": 1.3681, + "step": 30624 + }, + { + "epoch": 0.3979579074236205, + "grad_norm": 0.4168071150779724, + "learning_rate": 0.00012043307035443662, + "loss": 1.3684, + "step": 30625 + }, + { + "epoch": 0.3979709019675364, + "grad_norm": 0.2427930235862732, + "learning_rate": 0.00012043047089252526, + "loss": 1.3372, + "step": 30626 + }, + { + "epoch": 0.3979838965114523, + "grad_norm": 0.3487960398197174, + "learning_rate": 0.00012042787143061387, + "loss": 1.3398, + "step": 30627 + }, + { + "epoch": 0.3979968910553681, + "grad_norm": 0.4447000324726105, + "learning_rate": 0.00012042527196870248, + "loss": 1.48, + "step": 30628 + }, + { + "epoch": 0.398009885599284, + "grad_norm": 0.40953731536865234, + "learning_rate": 0.00012042267250679109, + "loss": 1.4033, + "step": 30629 + }, + { + "epoch": 0.39802288014319986, + "grad_norm": 0.417463093996048, + "learning_rate": 0.00012042007304487972, + "loss": 1.3576, + "step": 30630 + }, + { + "epoch": 0.39803587468711576, + "grad_norm": 0.39330989122390747, + "learning_rate": 0.00012041747358296833, + "loss": 1.2288, + "step": 30631 + }, + { + "epoch": 0.3980488692310316, + "grad_norm": 0.43738213181495667, + "learning_rate": 0.00012041487412105694, + "loss": 1.3999, + "step": 30632 + }, + { + "epoch": 0.3980618637749475, + "grad_norm": 0.40446197986602783, + "learning_rate": 0.00012041227465914555, + "loss": 1.4824, + "step": 30633 + }, + { + "epoch": 0.39807485831886336, + "grad_norm": 0.4569695293903351, + "learning_rate": 0.00012040967519723419, + "loss": 1.4332, + "step": 30634 + }, + { + "epoch": 0.39808785286277926, + "grad_norm": 0.4315387010574341, + "learning_rate": 0.0001204070757353228, + "loss": 1.5125, + "step": 30635 + }, + { + "epoch": 0.3981008474066951, + "grad_norm": 0.41952329874038696, + "learning_rate": 0.00012040447627341141, + "loss": 1.3004, + "step": 30636 + }, + { + "epoch": 0.398113841950611, + "grad_norm": 0.46102291345596313, + "learning_rate": 0.00012040187681150003, + "loss": 1.2783, + "step": 30637 + }, + { + "epoch": 0.39812683649452685, + "grad_norm": 0.3600445091724396, + "learning_rate": 0.00012039927734958865, + "loss": 1.2823, + "step": 30638 + }, + { + "epoch": 0.39813983103844275, + "grad_norm": 0.39268195629119873, + "learning_rate": 0.00012039667788767726, + "loss": 1.5793, + "step": 30639 + }, + { + "epoch": 0.3981528255823586, + "grad_norm": 0.42229700088500977, + "learning_rate": 0.00012039407842576587, + "loss": 1.635, + "step": 30640 + }, + { + "epoch": 0.3981658201262745, + "grad_norm": 0.3998759686946869, + "learning_rate": 0.00012039147896385449, + "loss": 1.3335, + "step": 30641 + }, + { + "epoch": 0.39817881467019034, + "grad_norm": 0.4558330476284027, + "learning_rate": 0.0001203888795019431, + "loss": 1.4519, + "step": 30642 + }, + { + "epoch": 0.39819180921410624, + "grad_norm": 0.39772799611091614, + "learning_rate": 0.00012038628004003171, + "loss": 1.3465, + "step": 30643 + }, + { + "epoch": 0.3982048037580221, + "grad_norm": 0.3977593779563904, + "learning_rate": 0.00012038368057812032, + "loss": 1.3635, + "step": 30644 + }, + { + "epoch": 0.398217798301938, + "grad_norm": 0.4713815748691559, + "learning_rate": 0.00012038108111620896, + "loss": 1.3811, + "step": 30645 + }, + { + "epoch": 0.39823079284585383, + "grad_norm": 0.32866600155830383, + "learning_rate": 0.00012037848165429757, + "loss": 1.2731, + "step": 30646 + }, + { + "epoch": 0.39824378738976973, + "grad_norm": 0.38345086574554443, + "learning_rate": 0.00012037588219238618, + "loss": 1.3226, + "step": 30647 + }, + { + "epoch": 0.3982567819336856, + "grad_norm": 0.46418246626853943, + "learning_rate": 0.0001203732827304748, + "loss": 1.4317, + "step": 30648 + }, + { + "epoch": 0.3982697764776015, + "grad_norm": 0.3906811475753784, + "learning_rate": 0.00012037068326856342, + "loss": 1.3708, + "step": 30649 + }, + { + "epoch": 0.3982827710215173, + "grad_norm": 0.3054935038089752, + "learning_rate": 0.00012036808380665203, + "loss": 1.2675, + "step": 30650 + }, + { + "epoch": 0.3982957655654332, + "grad_norm": 0.44424697756767273, + "learning_rate": 0.00012036548434474064, + "loss": 1.5847, + "step": 30651 + }, + { + "epoch": 0.39830876010934907, + "grad_norm": 0.4200749099254608, + "learning_rate": 0.00012036288488282925, + "loss": 1.4534, + "step": 30652 + }, + { + "epoch": 0.39832175465326497, + "grad_norm": 0.40228980779647827, + "learning_rate": 0.00012036028542091789, + "loss": 1.5031, + "step": 30653 + }, + { + "epoch": 0.3983347491971808, + "grad_norm": 0.5039932727813721, + "learning_rate": 0.00012035768595900649, + "loss": 1.5185, + "step": 30654 + }, + { + "epoch": 0.3983477437410967, + "grad_norm": 0.4084838926792145, + "learning_rate": 0.0001203550864970951, + "loss": 1.3826, + "step": 30655 + }, + { + "epoch": 0.39836073828501256, + "grad_norm": 0.4750950336456299, + "learning_rate": 0.00012035248703518371, + "loss": 1.4573, + "step": 30656 + }, + { + "epoch": 0.39837373282892846, + "grad_norm": 0.4051925539970398, + "learning_rate": 0.00012034988757327235, + "loss": 1.5065, + "step": 30657 + }, + { + "epoch": 0.3983867273728443, + "grad_norm": 0.4245031476020813, + "learning_rate": 0.00012034728811136096, + "loss": 1.4599, + "step": 30658 + }, + { + "epoch": 0.3983997219167602, + "grad_norm": 0.3715025782585144, + "learning_rate": 0.00012034468864944957, + "loss": 1.4545, + "step": 30659 + }, + { + "epoch": 0.39841271646067605, + "grad_norm": 0.49048787355422974, + "learning_rate": 0.00012034208918753818, + "loss": 1.4418, + "step": 30660 + }, + { + "epoch": 0.39842571100459195, + "grad_norm": 0.3705085217952728, + "learning_rate": 0.0001203394897256268, + "loss": 1.3518, + "step": 30661 + }, + { + "epoch": 0.3984387055485078, + "grad_norm": 0.35156843066215515, + "learning_rate": 0.00012033689026371542, + "loss": 1.4056, + "step": 30662 + }, + { + "epoch": 0.3984517000924237, + "grad_norm": 0.38524767756462097, + "learning_rate": 0.00012033429080180403, + "loss": 1.2149, + "step": 30663 + }, + { + "epoch": 0.39846469463633954, + "grad_norm": 0.4420812129974365, + "learning_rate": 0.00012033169133989264, + "loss": 1.5197, + "step": 30664 + }, + { + "epoch": 0.39847768918025545, + "grad_norm": 0.41359129548072815, + "learning_rate": 0.00012032909187798128, + "loss": 1.385, + "step": 30665 + }, + { + "epoch": 0.3984906837241713, + "grad_norm": 0.4534244239330292, + "learning_rate": 0.00012032649241606987, + "loss": 1.4598, + "step": 30666 + }, + { + "epoch": 0.3985036782680872, + "grad_norm": 0.43217048048973083, + "learning_rate": 0.00012032389295415848, + "loss": 1.2485, + "step": 30667 + }, + { + "epoch": 0.39851667281200304, + "grad_norm": 0.3915845453739166, + "learning_rate": 0.0001203212934922471, + "loss": 1.2805, + "step": 30668 + }, + { + "epoch": 0.39852966735591894, + "grad_norm": 0.43231967091560364, + "learning_rate": 0.00012031869403033573, + "loss": 1.1903, + "step": 30669 + }, + { + "epoch": 0.3985426618998348, + "grad_norm": 0.40213385224342346, + "learning_rate": 0.00012031609456842434, + "loss": 1.5774, + "step": 30670 + }, + { + "epoch": 0.3985556564437507, + "grad_norm": 0.3723752498626709, + "learning_rate": 0.00012031349510651296, + "loss": 1.3481, + "step": 30671 + }, + { + "epoch": 0.39856865098766653, + "grad_norm": 0.32263416051864624, + "learning_rate": 0.00012031089564460157, + "loss": 1.3231, + "step": 30672 + }, + { + "epoch": 0.39858164553158243, + "grad_norm": 0.37805575132369995, + "learning_rate": 0.00012030829618269019, + "loss": 1.4731, + "step": 30673 + }, + { + "epoch": 0.3985946400754983, + "grad_norm": 0.43951764702796936, + "learning_rate": 0.0001203056967207788, + "loss": 1.4792, + "step": 30674 + }, + { + "epoch": 0.3986076346194142, + "grad_norm": 0.3575046956539154, + "learning_rate": 0.00012030309725886741, + "loss": 1.4373, + "step": 30675 + }, + { + "epoch": 0.39862062916333, + "grad_norm": 0.3849867284297943, + "learning_rate": 0.00012030049779695605, + "loss": 1.2911, + "step": 30676 + }, + { + "epoch": 0.3986336237072459, + "grad_norm": 0.44810420274734497, + "learning_rate": 0.00012029789833504466, + "loss": 1.4358, + "step": 30677 + }, + { + "epoch": 0.39864661825116177, + "grad_norm": 0.3901337683200836, + "learning_rate": 0.00012029529887313327, + "loss": 1.3077, + "step": 30678 + }, + { + "epoch": 0.39865961279507767, + "grad_norm": 0.3956059217453003, + "learning_rate": 0.00012029269941122187, + "loss": 1.3711, + "step": 30679 + }, + { + "epoch": 0.39867260733899357, + "grad_norm": 0.45140060782432556, + "learning_rate": 0.00012029009994931051, + "loss": 1.5245, + "step": 30680 + }, + { + "epoch": 0.3986856018829094, + "grad_norm": 0.4881375730037689, + "learning_rate": 0.00012028750048739912, + "loss": 1.5835, + "step": 30681 + }, + { + "epoch": 0.3986985964268253, + "grad_norm": 0.339712530374527, + "learning_rate": 0.00012028490102548773, + "loss": 1.2691, + "step": 30682 + }, + { + "epoch": 0.39871159097074116, + "grad_norm": 0.4990187883377075, + "learning_rate": 0.00012028230156357634, + "loss": 1.4265, + "step": 30683 + }, + { + "epoch": 0.39872458551465706, + "grad_norm": 0.41261714696884155, + "learning_rate": 0.00012027970210166497, + "loss": 1.4037, + "step": 30684 + }, + { + "epoch": 0.3987375800585729, + "grad_norm": 0.409064918756485, + "learning_rate": 0.00012027710263975358, + "loss": 1.4489, + "step": 30685 + }, + { + "epoch": 0.3987505746024888, + "grad_norm": 0.46332839131355286, + "learning_rate": 0.00012027450317784219, + "loss": 1.5264, + "step": 30686 + }, + { + "epoch": 0.39876356914640465, + "grad_norm": 0.4505152702331543, + "learning_rate": 0.0001202719037159308, + "loss": 1.4464, + "step": 30687 + }, + { + "epoch": 0.39877656369032055, + "grad_norm": 0.4598102569580078, + "learning_rate": 0.00012026930425401944, + "loss": 1.5445, + "step": 30688 + }, + { + "epoch": 0.3987895582342364, + "grad_norm": 0.4157380759716034, + "learning_rate": 0.00012026670479210805, + "loss": 1.6055, + "step": 30689 + }, + { + "epoch": 0.3988025527781523, + "grad_norm": 0.3972194194793701, + "learning_rate": 0.00012026410533019666, + "loss": 1.5474, + "step": 30690 + }, + { + "epoch": 0.39881554732206814, + "grad_norm": 0.3970952033996582, + "learning_rate": 0.00012026150586828527, + "loss": 1.3554, + "step": 30691 + }, + { + "epoch": 0.39882854186598404, + "grad_norm": 0.30305469036102295, + "learning_rate": 0.0001202589064063739, + "loss": 1.1761, + "step": 30692 + }, + { + "epoch": 0.3988415364098999, + "grad_norm": 0.34345945715904236, + "learning_rate": 0.0001202563069444625, + "loss": 1.3842, + "step": 30693 + }, + { + "epoch": 0.3988545309538158, + "grad_norm": 0.4360566735267639, + "learning_rate": 0.00012025370748255112, + "loss": 1.4139, + "step": 30694 + }, + { + "epoch": 0.39886752549773163, + "grad_norm": 0.4208584427833557, + "learning_rate": 0.00012025110802063973, + "loss": 1.3481, + "step": 30695 + }, + { + "epoch": 0.39888052004164753, + "grad_norm": 0.3233270049095154, + "learning_rate": 0.00012024850855872835, + "loss": 1.2849, + "step": 30696 + }, + { + "epoch": 0.3988935145855634, + "grad_norm": 0.36880871653556824, + "learning_rate": 0.00012024590909681696, + "loss": 1.1013, + "step": 30697 + }, + { + "epoch": 0.3989065091294793, + "grad_norm": 0.3352777063846588, + "learning_rate": 0.00012024330963490557, + "loss": 1.33, + "step": 30698 + }, + { + "epoch": 0.3989195036733951, + "grad_norm": 0.3982882797718048, + "learning_rate": 0.00012024071017299418, + "loss": 1.3385, + "step": 30699 + }, + { + "epoch": 0.398932498217311, + "grad_norm": 0.2909446656703949, + "learning_rate": 0.00012023811071108282, + "loss": 1.2409, + "step": 30700 + }, + { + "epoch": 0.39894549276122687, + "grad_norm": 0.4888071119785309, + "learning_rate": 0.00012023551124917143, + "loss": 1.5196, + "step": 30701 + }, + { + "epoch": 0.3989584873051428, + "grad_norm": 0.4029799997806549, + "learning_rate": 0.00012023291178726004, + "loss": 1.4665, + "step": 30702 + }, + { + "epoch": 0.3989714818490586, + "grad_norm": 0.448395311832428, + "learning_rate": 0.00012023031232534865, + "loss": 1.3786, + "step": 30703 + }, + { + "epoch": 0.3989844763929745, + "grad_norm": 0.4449112117290497, + "learning_rate": 0.00012022771286343728, + "loss": 1.2909, + "step": 30704 + }, + { + "epoch": 0.39899747093689036, + "grad_norm": 0.42984306812286377, + "learning_rate": 0.00012022511340152589, + "loss": 1.4295, + "step": 30705 + }, + { + "epoch": 0.39901046548080626, + "grad_norm": 0.29719048738479614, + "learning_rate": 0.0001202225139396145, + "loss": 1.2586, + "step": 30706 + }, + { + "epoch": 0.3990234600247221, + "grad_norm": 0.36108139157295227, + "learning_rate": 0.00012021991447770311, + "loss": 1.3847, + "step": 30707 + }, + { + "epoch": 0.399036454568638, + "grad_norm": 0.5173310041427612, + "learning_rate": 0.00012021731501579174, + "loss": 1.4372, + "step": 30708 + }, + { + "epoch": 0.39904944911255386, + "grad_norm": 0.5210025906562805, + "learning_rate": 0.00012021471555388035, + "loss": 1.4764, + "step": 30709 + }, + { + "epoch": 0.39906244365646976, + "grad_norm": 0.4139043092727661, + "learning_rate": 0.00012021211609196896, + "loss": 1.3859, + "step": 30710 + }, + { + "epoch": 0.3990754382003856, + "grad_norm": 0.27381631731987, + "learning_rate": 0.0001202095166300576, + "loss": 1.5237, + "step": 30711 + }, + { + "epoch": 0.3990884327443015, + "grad_norm": 0.3617205321788788, + "learning_rate": 0.00012020691716814621, + "loss": 1.421, + "step": 30712 + }, + { + "epoch": 0.39910142728821735, + "grad_norm": 0.3965858221054077, + "learning_rate": 0.00012020431770623482, + "loss": 1.465, + "step": 30713 + }, + { + "epoch": 0.39911442183213325, + "grad_norm": 0.4448576867580414, + "learning_rate": 0.00012020171824432343, + "loss": 1.4211, + "step": 30714 + }, + { + "epoch": 0.3991274163760491, + "grad_norm": 0.46282872557640076, + "learning_rate": 0.00012019911878241205, + "loss": 1.5665, + "step": 30715 + }, + { + "epoch": 0.399140410919965, + "grad_norm": 0.4164327383041382, + "learning_rate": 0.00012019651932050066, + "loss": 1.2611, + "step": 30716 + }, + { + "epoch": 0.39915340546388084, + "grad_norm": 0.40648898482322693, + "learning_rate": 0.00012019391985858928, + "loss": 1.2993, + "step": 30717 + }, + { + "epoch": 0.39916640000779674, + "grad_norm": 0.38534462451934814, + "learning_rate": 0.00012019132039667789, + "loss": 1.3671, + "step": 30718 + }, + { + "epoch": 0.3991793945517126, + "grad_norm": 0.3989992141723633, + "learning_rate": 0.00012018872093476652, + "loss": 1.6946, + "step": 30719 + }, + { + "epoch": 0.3991923890956285, + "grad_norm": 0.28701189160346985, + "learning_rate": 0.00012018612147285514, + "loss": 1.0815, + "step": 30720 + }, + { + "epoch": 0.39920538363954433, + "grad_norm": 0.3366556167602539, + "learning_rate": 0.00012018352201094373, + "loss": 1.3896, + "step": 30721 + }, + { + "epoch": 0.39921837818346023, + "grad_norm": 0.3729995787143707, + "learning_rate": 0.00012018092254903234, + "loss": 1.301, + "step": 30722 + }, + { + "epoch": 0.3992313727273761, + "grad_norm": 0.37458181381225586, + "learning_rate": 0.00012017832308712098, + "loss": 1.4842, + "step": 30723 + }, + { + "epoch": 0.399244367271292, + "grad_norm": 0.3517019748687744, + "learning_rate": 0.00012017572362520959, + "loss": 1.3655, + "step": 30724 + }, + { + "epoch": 0.3992573618152078, + "grad_norm": 0.33635079860687256, + "learning_rate": 0.0001201731241632982, + "loss": 1.2718, + "step": 30725 + }, + { + "epoch": 0.3992703563591237, + "grad_norm": 0.4275568425655365, + "learning_rate": 0.00012017052470138681, + "loss": 1.4739, + "step": 30726 + }, + { + "epoch": 0.39928335090303957, + "grad_norm": 0.39404749870300293, + "learning_rate": 0.00012016792523947544, + "loss": 1.2442, + "step": 30727 + }, + { + "epoch": 0.39929634544695547, + "grad_norm": 0.36775586009025574, + "learning_rate": 0.00012016532577756405, + "loss": 1.4552, + "step": 30728 + }, + { + "epoch": 0.3993093399908713, + "grad_norm": 0.3561258912086487, + "learning_rate": 0.00012016272631565266, + "loss": 1.4317, + "step": 30729 + }, + { + "epoch": 0.3993223345347872, + "grad_norm": 0.395398885011673, + "learning_rate": 0.00012016012685374127, + "loss": 1.5401, + "step": 30730 + }, + { + "epoch": 0.39933532907870306, + "grad_norm": 0.4343360960483551, + "learning_rate": 0.00012015752739182991, + "loss": 1.3702, + "step": 30731 + }, + { + "epoch": 0.39934832362261896, + "grad_norm": 0.4516986608505249, + "learning_rate": 0.00012015492792991852, + "loss": 1.5651, + "step": 30732 + }, + { + "epoch": 0.3993613181665348, + "grad_norm": 0.4154672622680664, + "learning_rate": 0.00012015232846800713, + "loss": 1.5392, + "step": 30733 + }, + { + "epoch": 0.3993743127104507, + "grad_norm": 0.39604562520980835, + "learning_rate": 0.00012014972900609573, + "loss": 1.2259, + "step": 30734 + }, + { + "epoch": 0.39938730725436655, + "grad_norm": 0.4680011570453644, + "learning_rate": 0.00012014712954418437, + "loss": 1.4026, + "step": 30735 + }, + { + "epoch": 0.39940030179828245, + "grad_norm": 0.4952586889266968, + "learning_rate": 0.00012014453008227298, + "loss": 1.4814, + "step": 30736 + }, + { + "epoch": 0.3994132963421983, + "grad_norm": 0.39622732996940613, + "learning_rate": 0.00012014193062036159, + "loss": 1.2221, + "step": 30737 + }, + { + "epoch": 0.3994262908861142, + "grad_norm": 0.4304730296134949, + "learning_rate": 0.0001201393311584502, + "loss": 1.2051, + "step": 30738 + }, + { + "epoch": 0.39943928543003004, + "grad_norm": 0.41334205865859985, + "learning_rate": 0.00012013673169653882, + "loss": 1.4043, + "step": 30739 + }, + { + "epoch": 0.39945227997394595, + "grad_norm": 0.509895920753479, + "learning_rate": 0.00012013413223462744, + "loss": 1.5151, + "step": 30740 + }, + { + "epoch": 0.3994652745178618, + "grad_norm": 0.5046713352203369, + "learning_rate": 0.00012013153277271605, + "loss": 1.4272, + "step": 30741 + }, + { + "epoch": 0.3994782690617777, + "grad_norm": 0.5115154981613159, + "learning_rate": 0.00012012893331080466, + "loss": 1.5151, + "step": 30742 + }, + { + "epoch": 0.39949126360569354, + "grad_norm": 0.4074965715408325, + "learning_rate": 0.0001201263338488933, + "loss": 1.3666, + "step": 30743 + }, + { + "epoch": 0.39950425814960944, + "grad_norm": 0.45780643820762634, + "learning_rate": 0.0001201237343869819, + "loss": 1.3173, + "step": 30744 + }, + { + "epoch": 0.3995172526935253, + "grad_norm": 0.4388483464717865, + "learning_rate": 0.00012012113492507052, + "loss": 1.425, + "step": 30745 + }, + { + "epoch": 0.3995302472374412, + "grad_norm": 0.3814353942871094, + "learning_rate": 0.00012011853546315911, + "loss": 1.5571, + "step": 30746 + }, + { + "epoch": 0.39954324178135703, + "grad_norm": 0.4243190884590149, + "learning_rate": 0.00012011593600124775, + "loss": 1.5229, + "step": 30747 + }, + { + "epoch": 0.39955623632527293, + "grad_norm": 0.39877456426620483, + "learning_rate": 0.00012011333653933636, + "loss": 1.3513, + "step": 30748 + }, + { + "epoch": 0.3995692308691888, + "grad_norm": 0.3427993357181549, + "learning_rate": 0.00012011073707742497, + "loss": 1.4455, + "step": 30749 + }, + { + "epoch": 0.3995822254131047, + "grad_norm": 0.4350269138813019, + "learning_rate": 0.0001201081376155136, + "loss": 1.3482, + "step": 30750 + }, + { + "epoch": 0.3995952199570205, + "grad_norm": 0.4132777750492096, + "learning_rate": 0.00012010553815360221, + "loss": 1.3185, + "step": 30751 + }, + { + "epoch": 0.3996082145009364, + "grad_norm": 0.38242480158805847, + "learning_rate": 0.00012010293869169082, + "loss": 1.3265, + "step": 30752 + }, + { + "epoch": 0.39962120904485227, + "grad_norm": 0.4838862121105194, + "learning_rate": 0.00012010033922977943, + "loss": 1.6599, + "step": 30753 + }, + { + "epoch": 0.39963420358876817, + "grad_norm": 0.3450539708137512, + "learning_rate": 0.00012009773976786807, + "loss": 1.344, + "step": 30754 + }, + { + "epoch": 0.399647198132684, + "grad_norm": 0.48688775300979614, + "learning_rate": 0.00012009514030595668, + "loss": 1.4564, + "step": 30755 + }, + { + "epoch": 0.3996601926765999, + "grad_norm": 0.39791902899742126, + "learning_rate": 0.00012009254084404529, + "loss": 1.4225, + "step": 30756 + }, + { + "epoch": 0.3996731872205158, + "grad_norm": 0.35362982749938965, + "learning_rate": 0.0001200899413821339, + "loss": 1.3233, + "step": 30757 + }, + { + "epoch": 0.39968618176443166, + "grad_norm": 0.36608344316482544, + "learning_rate": 0.00012008734192022253, + "loss": 1.355, + "step": 30758 + }, + { + "epoch": 0.39969917630834756, + "grad_norm": 0.44153866171836853, + "learning_rate": 0.00012008474245831114, + "loss": 1.4646, + "step": 30759 + }, + { + "epoch": 0.3997121708522634, + "grad_norm": 0.4325862526893616, + "learning_rate": 0.00012008214299639975, + "loss": 1.4314, + "step": 30760 + }, + { + "epoch": 0.3997251653961793, + "grad_norm": 0.38937950134277344, + "learning_rate": 0.00012007954353448836, + "loss": 1.3571, + "step": 30761 + }, + { + "epoch": 0.39973815994009515, + "grad_norm": 0.45320913195610046, + "learning_rate": 0.000120076944072577, + "loss": 1.703, + "step": 30762 + }, + { + "epoch": 0.39975115448401105, + "grad_norm": 0.45183277130126953, + "learning_rate": 0.0001200743446106656, + "loss": 1.4915, + "step": 30763 + }, + { + "epoch": 0.3997641490279269, + "grad_norm": 0.3765687942504883, + "learning_rate": 0.0001200717451487542, + "loss": 1.1622, + "step": 30764 + }, + { + "epoch": 0.3997771435718428, + "grad_norm": 0.3917236328125, + "learning_rate": 0.00012006914568684282, + "loss": 1.3739, + "step": 30765 + }, + { + "epoch": 0.39979013811575864, + "grad_norm": 0.36005303263664246, + "learning_rate": 0.00012006654622493145, + "loss": 1.2984, + "step": 30766 + }, + { + "epoch": 0.39980313265967454, + "grad_norm": 0.330409973859787, + "learning_rate": 0.00012006394676302007, + "loss": 1.212, + "step": 30767 + }, + { + "epoch": 0.3998161272035904, + "grad_norm": 0.44632336497306824, + "learning_rate": 0.00012006134730110868, + "loss": 1.4048, + "step": 30768 + }, + { + "epoch": 0.3998291217475063, + "grad_norm": 0.3802608251571655, + "learning_rate": 0.00012005874783919729, + "loss": 1.3177, + "step": 30769 + }, + { + "epoch": 0.39984211629142213, + "grad_norm": 0.31766894459724426, + "learning_rate": 0.00012005614837728591, + "loss": 1.3982, + "step": 30770 + }, + { + "epoch": 0.39985511083533803, + "grad_norm": 0.3211827576160431, + "learning_rate": 0.00012005354891537452, + "loss": 1.3492, + "step": 30771 + }, + { + "epoch": 0.3998681053792539, + "grad_norm": 0.4244026243686676, + "learning_rate": 0.00012005094945346313, + "loss": 1.3055, + "step": 30772 + }, + { + "epoch": 0.3998810999231698, + "grad_norm": 0.40082234144210815, + "learning_rate": 0.00012004834999155174, + "loss": 1.3247, + "step": 30773 + }, + { + "epoch": 0.3998940944670856, + "grad_norm": 0.46327075362205505, + "learning_rate": 0.00012004575052964038, + "loss": 1.2136, + "step": 30774 + }, + { + "epoch": 0.3999070890110015, + "grad_norm": 0.3778066039085388, + "learning_rate": 0.000120043151067729, + "loss": 1.615, + "step": 30775 + }, + { + "epoch": 0.39992008355491737, + "grad_norm": 0.43111875653266907, + "learning_rate": 0.00012004055160581759, + "loss": 1.5014, + "step": 30776 + }, + { + "epoch": 0.3999330780988333, + "grad_norm": 0.3785284757614136, + "learning_rate": 0.0001200379521439062, + "loss": 1.2903, + "step": 30777 + }, + { + "epoch": 0.3999460726427491, + "grad_norm": 0.5251731276512146, + "learning_rate": 0.00012003535268199484, + "loss": 1.5504, + "step": 30778 + }, + { + "epoch": 0.399959067186665, + "grad_norm": 0.4645079970359802, + "learning_rate": 0.00012003275322008345, + "loss": 1.3668, + "step": 30779 + }, + { + "epoch": 0.39997206173058086, + "grad_norm": 0.34047141671180725, + "learning_rate": 0.00012003015375817206, + "loss": 1.34, + "step": 30780 + }, + { + "epoch": 0.39998505627449676, + "grad_norm": 0.36831164360046387, + "learning_rate": 0.00012002755429626067, + "loss": 1.3671, + "step": 30781 + }, + { + "epoch": 0.3999980508184126, + "grad_norm": 0.2827679514884949, + "learning_rate": 0.0001200249548343493, + "loss": 1.1858, + "step": 30782 + }, + { + "epoch": 0.4000110453623285, + "grad_norm": 0.29537034034729004, + "learning_rate": 0.00012002235537243791, + "loss": 1.2754, + "step": 30783 + }, + { + "epoch": 0.40002403990624436, + "grad_norm": 0.3191387355327606, + "learning_rate": 0.00012001975591052652, + "loss": 1.3112, + "step": 30784 + }, + { + "epoch": 0.40003703445016026, + "grad_norm": 0.38101980090141296, + "learning_rate": 0.00012001715644861516, + "loss": 1.5365, + "step": 30785 + }, + { + "epoch": 0.4000500289940761, + "grad_norm": 0.3570547103881836, + "learning_rate": 0.00012001455698670377, + "loss": 1.3776, + "step": 30786 + }, + { + "epoch": 0.400063023537992, + "grad_norm": 0.4522722065448761, + "learning_rate": 0.00012001195752479238, + "loss": 1.2787, + "step": 30787 + }, + { + "epoch": 0.40007601808190785, + "grad_norm": 0.44312748312950134, + "learning_rate": 0.00012000935806288098, + "loss": 1.378, + "step": 30788 + }, + { + "epoch": 0.40008901262582375, + "grad_norm": 0.4156210720539093, + "learning_rate": 0.00012000675860096961, + "loss": 1.5378, + "step": 30789 + }, + { + "epoch": 0.4001020071697396, + "grad_norm": 0.43703708052635193, + "learning_rate": 0.00012000415913905823, + "loss": 1.3802, + "step": 30790 + }, + { + "epoch": 0.4001150017136555, + "grad_norm": 0.3735348880290985, + "learning_rate": 0.00012000155967714684, + "loss": 1.4785, + "step": 30791 + }, + { + "epoch": 0.40012799625757134, + "grad_norm": 0.41813093423843384, + "learning_rate": 0.00011999896021523545, + "loss": 1.4512, + "step": 30792 + }, + { + "epoch": 0.40014099080148724, + "grad_norm": 0.506083071231842, + "learning_rate": 0.00011999636075332407, + "loss": 1.4751, + "step": 30793 + }, + { + "epoch": 0.4001539853454031, + "grad_norm": 0.3321952223777771, + "learning_rate": 0.00011999376129141268, + "loss": 1.4753, + "step": 30794 + }, + { + "epoch": 0.400166979889319, + "grad_norm": 0.42100703716278076, + "learning_rate": 0.0001199911618295013, + "loss": 1.3513, + "step": 30795 + }, + { + "epoch": 0.40017997443323483, + "grad_norm": 0.3634747266769409, + "learning_rate": 0.0001199885623675899, + "loss": 1.3472, + "step": 30796 + }, + { + "epoch": 0.40019296897715073, + "grad_norm": 0.4350329637527466, + "learning_rate": 0.00011998596290567854, + "loss": 1.306, + "step": 30797 + }, + { + "epoch": 0.4002059635210666, + "grad_norm": 0.4530585706233978, + "learning_rate": 0.00011998336344376715, + "loss": 1.6067, + "step": 30798 + }, + { + "epoch": 0.4002189580649825, + "grad_norm": 0.39739182591438293, + "learning_rate": 0.00011998076398185576, + "loss": 1.4186, + "step": 30799 + }, + { + "epoch": 0.4002319526088983, + "grad_norm": 0.43387311697006226, + "learning_rate": 0.00011997816451994438, + "loss": 1.5991, + "step": 30800 + }, + { + "epoch": 0.4002449471528142, + "grad_norm": 0.35849398374557495, + "learning_rate": 0.000119975565058033, + "loss": 1.6729, + "step": 30801 + }, + { + "epoch": 0.40025794169673007, + "grad_norm": 0.36983904242515564, + "learning_rate": 0.00011997296559612161, + "loss": 1.2913, + "step": 30802 + }, + { + "epoch": 0.40027093624064597, + "grad_norm": 0.40881189703941345, + "learning_rate": 0.00011997036613421022, + "loss": 1.3664, + "step": 30803 + }, + { + "epoch": 0.4002839307845618, + "grad_norm": 0.37786686420440674, + "learning_rate": 0.00011996776667229883, + "loss": 1.3601, + "step": 30804 + }, + { + "epoch": 0.4002969253284777, + "grad_norm": 0.3708288371562958, + "learning_rate": 0.00011996516721038746, + "loss": 1.35, + "step": 30805 + }, + { + "epoch": 0.40030991987239356, + "grad_norm": 0.38175007700920105, + "learning_rate": 0.00011996256774847607, + "loss": 1.3863, + "step": 30806 + }, + { + "epoch": 0.40032291441630946, + "grad_norm": 0.43318769335746765, + "learning_rate": 0.00011995996828656468, + "loss": 1.508, + "step": 30807 + }, + { + "epoch": 0.4003359089602253, + "grad_norm": 0.41580840945243835, + "learning_rate": 0.00011995736882465329, + "loss": 1.1919, + "step": 30808 + }, + { + "epoch": 0.4003489035041412, + "grad_norm": 0.4472562372684479, + "learning_rate": 0.00011995476936274193, + "loss": 1.2593, + "step": 30809 + }, + { + "epoch": 0.40036189804805705, + "grad_norm": 0.3981015384197235, + "learning_rate": 0.00011995216990083054, + "loss": 1.6482, + "step": 30810 + }, + { + "epoch": 0.40037489259197295, + "grad_norm": 0.29677554965019226, + "learning_rate": 0.00011994957043891915, + "loss": 1.1991, + "step": 30811 + }, + { + "epoch": 0.4003878871358888, + "grad_norm": 0.45702308416366577, + "learning_rate": 0.00011994697097700776, + "loss": 1.3663, + "step": 30812 + }, + { + "epoch": 0.4004008816798047, + "grad_norm": 0.40553992986679077, + "learning_rate": 0.00011994437151509639, + "loss": 1.5633, + "step": 30813 + }, + { + "epoch": 0.40041387622372054, + "grad_norm": 0.3707307279109955, + "learning_rate": 0.000119941772053185, + "loss": 1.345, + "step": 30814 + }, + { + "epoch": 0.40042687076763644, + "grad_norm": 0.4028571546077728, + "learning_rate": 0.00011993917259127361, + "loss": 1.5261, + "step": 30815 + }, + { + "epoch": 0.4004398653115523, + "grad_norm": 0.38304877281188965, + "learning_rate": 0.00011993657312936222, + "loss": 1.3486, + "step": 30816 + }, + { + "epoch": 0.4004528598554682, + "grad_norm": 0.38557168841362, + "learning_rate": 0.00011993397366745086, + "loss": 1.342, + "step": 30817 + }, + { + "epoch": 0.40046585439938404, + "grad_norm": 0.5038419961929321, + "learning_rate": 0.00011993137420553945, + "loss": 1.6818, + "step": 30818 + }, + { + "epoch": 0.40047884894329994, + "grad_norm": 0.3253868818283081, + "learning_rate": 0.00011992877474362806, + "loss": 1.3631, + "step": 30819 + }, + { + "epoch": 0.4004918434872158, + "grad_norm": 0.42589250206947327, + "learning_rate": 0.00011992617528171668, + "loss": 1.3477, + "step": 30820 + }, + { + "epoch": 0.4005048380311317, + "grad_norm": 0.45339396595954895, + "learning_rate": 0.00011992357581980531, + "loss": 1.5092, + "step": 30821 + }, + { + "epoch": 0.4005178325750475, + "grad_norm": 0.35119137167930603, + "learning_rate": 0.00011992097635789392, + "loss": 1.2571, + "step": 30822 + }, + { + "epoch": 0.40053082711896343, + "grad_norm": 0.42659008502960205, + "learning_rate": 0.00011991837689598254, + "loss": 1.4706, + "step": 30823 + }, + { + "epoch": 0.4005438216628793, + "grad_norm": 0.5652179718017578, + "learning_rate": 0.00011991577743407116, + "loss": 1.4996, + "step": 30824 + }, + { + "epoch": 0.4005568162067952, + "grad_norm": 0.4107537269592285, + "learning_rate": 0.00011991317797215977, + "loss": 1.3935, + "step": 30825 + }, + { + "epoch": 0.400569810750711, + "grad_norm": 0.42623329162597656, + "learning_rate": 0.00011991057851024838, + "loss": 1.55, + "step": 30826 + }, + { + "epoch": 0.4005828052946269, + "grad_norm": 0.5213897824287415, + "learning_rate": 0.00011990797904833699, + "loss": 1.5624, + "step": 30827 + }, + { + "epoch": 0.40059579983854277, + "grad_norm": 0.37603679299354553, + "learning_rate": 0.00011990537958642563, + "loss": 1.1814, + "step": 30828 + }, + { + "epoch": 0.40060879438245867, + "grad_norm": 0.36184534430503845, + "learning_rate": 0.00011990278012451424, + "loss": 1.1948, + "step": 30829 + }, + { + "epoch": 0.4006217889263745, + "grad_norm": 0.41600966453552246, + "learning_rate": 0.00011990018066260284, + "loss": 1.4123, + "step": 30830 + }, + { + "epoch": 0.4006347834702904, + "grad_norm": 0.4303798973560333, + "learning_rate": 0.00011989758120069145, + "loss": 1.5242, + "step": 30831 + }, + { + "epoch": 0.4006477780142063, + "grad_norm": 0.5037251114845276, + "learning_rate": 0.00011989498173878009, + "loss": 1.5592, + "step": 30832 + }, + { + "epoch": 0.40066077255812216, + "grad_norm": 0.43412378430366516, + "learning_rate": 0.0001198923822768687, + "loss": 1.5731, + "step": 30833 + }, + { + "epoch": 0.40067376710203806, + "grad_norm": 0.38229018449783325, + "learning_rate": 0.00011988978281495731, + "loss": 1.371, + "step": 30834 + }, + { + "epoch": 0.4006867616459539, + "grad_norm": 0.370576947927475, + "learning_rate": 0.00011988718335304592, + "loss": 1.2974, + "step": 30835 + }, + { + "epoch": 0.4006997561898698, + "grad_norm": 0.3848870098590851, + "learning_rate": 0.00011988458389113455, + "loss": 1.1385, + "step": 30836 + }, + { + "epoch": 0.40071275073378565, + "grad_norm": 0.2757657766342163, + "learning_rate": 0.00011988198442922316, + "loss": 1.2153, + "step": 30837 + }, + { + "epoch": 0.40072574527770155, + "grad_norm": 0.5632579922676086, + "learning_rate": 0.00011987938496731177, + "loss": 1.4664, + "step": 30838 + }, + { + "epoch": 0.4007387398216174, + "grad_norm": 0.303602010011673, + "learning_rate": 0.00011987678550540038, + "loss": 1.462, + "step": 30839 + }, + { + "epoch": 0.4007517343655333, + "grad_norm": 0.35670235753059387, + "learning_rate": 0.00011987418604348902, + "loss": 1.4834, + "step": 30840 + }, + { + "epoch": 0.40076472890944914, + "grad_norm": 0.41477686166763306, + "learning_rate": 0.00011987158658157763, + "loss": 1.3831, + "step": 30841 + }, + { + "epoch": 0.40077772345336504, + "grad_norm": 0.34395071864128113, + "learning_rate": 0.00011986898711966624, + "loss": 1.268, + "step": 30842 + }, + { + "epoch": 0.4007907179972809, + "grad_norm": 0.3681591749191284, + "learning_rate": 0.00011986638765775484, + "loss": 1.2065, + "step": 30843 + }, + { + "epoch": 0.4008037125411968, + "grad_norm": 0.43257269263267517, + "learning_rate": 0.00011986378819584347, + "loss": 1.2028, + "step": 30844 + }, + { + "epoch": 0.40081670708511263, + "grad_norm": 0.41485995054244995, + "learning_rate": 0.00011986118873393208, + "loss": 1.2752, + "step": 30845 + }, + { + "epoch": 0.40082970162902853, + "grad_norm": 0.4048994183540344, + "learning_rate": 0.0001198585892720207, + "loss": 1.5178, + "step": 30846 + }, + { + "epoch": 0.4008426961729444, + "grad_norm": 0.3581618368625641, + "learning_rate": 0.0001198559898101093, + "loss": 1.482, + "step": 30847 + }, + { + "epoch": 0.4008556907168603, + "grad_norm": 0.4165598750114441, + "learning_rate": 0.00011985339034819793, + "loss": 1.476, + "step": 30848 + }, + { + "epoch": 0.4008686852607761, + "grad_norm": 0.3122071921825409, + "learning_rate": 0.00011985079088628654, + "loss": 1.2657, + "step": 30849 + }, + { + "epoch": 0.400881679804692, + "grad_norm": 0.4559158980846405, + "learning_rate": 0.00011984819142437515, + "loss": 1.5182, + "step": 30850 + }, + { + "epoch": 0.40089467434860787, + "grad_norm": 0.42092403769493103, + "learning_rate": 0.00011984559196246376, + "loss": 1.4914, + "step": 30851 + }, + { + "epoch": 0.40090766889252377, + "grad_norm": 0.35695913434028625, + "learning_rate": 0.0001198429925005524, + "loss": 1.5658, + "step": 30852 + }, + { + "epoch": 0.4009206634364396, + "grad_norm": 0.4209710955619812, + "learning_rate": 0.00011984039303864101, + "loss": 1.4249, + "step": 30853 + }, + { + "epoch": 0.4009336579803555, + "grad_norm": 0.3818749487400055, + "learning_rate": 0.00011983779357672962, + "loss": 1.3164, + "step": 30854 + }, + { + "epoch": 0.40094665252427136, + "grad_norm": 0.5126658082008362, + "learning_rate": 0.00011983519411481823, + "loss": 1.2886, + "step": 30855 + }, + { + "epoch": 0.40095964706818726, + "grad_norm": 0.5513366460800171, + "learning_rate": 0.00011983259465290686, + "loss": 1.3874, + "step": 30856 + }, + { + "epoch": 0.4009726416121031, + "grad_norm": 0.32973718643188477, + "learning_rate": 0.00011982999519099547, + "loss": 1.237, + "step": 30857 + }, + { + "epoch": 0.400985636156019, + "grad_norm": 0.38109949231147766, + "learning_rate": 0.00011982739572908408, + "loss": 1.1659, + "step": 30858 + }, + { + "epoch": 0.40099863069993485, + "grad_norm": 0.37945684790611267, + "learning_rate": 0.00011982479626717272, + "loss": 1.3481, + "step": 30859 + }, + { + "epoch": 0.40101162524385076, + "grad_norm": 0.3536010682582855, + "learning_rate": 0.00011982219680526132, + "loss": 1.3725, + "step": 30860 + }, + { + "epoch": 0.4010246197877666, + "grad_norm": 0.3550262153148651, + "learning_rate": 0.00011981959734334993, + "loss": 1.3076, + "step": 30861 + }, + { + "epoch": 0.4010376143316825, + "grad_norm": 0.42360740900039673, + "learning_rate": 0.00011981699788143854, + "loss": 1.5057, + "step": 30862 + }, + { + "epoch": 0.40105060887559835, + "grad_norm": 0.42317911982536316, + "learning_rate": 0.00011981439841952718, + "loss": 1.5379, + "step": 30863 + }, + { + "epoch": 0.40106360341951425, + "grad_norm": 0.4288017749786377, + "learning_rate": 0.00011981179895761579, + "loss": 1.4198, + "step": 30864 + }, + { + "epoch": 0.4010765979634301, + "grad_norm": 0.3426763117313385, + "learning_rate": 0.0001198091994957044, + "loss": 1.4674, + "step": 30865 + }, + { + "epoch": 0.401089592507346, + "grad_norm": 0.3653353154659271, + "learning_rate": 0.00011980660003379301, + "loss": 1.5023, + "step": 30866 + }, + { + "epoch": 0.40110258705126184, + "grad_norm": 0.37957748770713806, + "learning_rate": 0.00011980400057188163, + "loss": 1.3676, + "step": 30867 + }, + { + "epoch": 0.40111558159517774, + "grad_norm": 0.38366758823394775, + "learning_rate": 0.00011980140110997024, + "loss": 1.2874, + "step": 30868 + }, + { + "epoch": 0.4011285761390936, + "grad_norm": 0.3444020748138428, + "learning_rate": 0.00011979880164805886, + "loss": 1.2087, + "step": 30869 + }, + { + "epoch": 0.4011415706830095, + "grad_norm": 0.3868485689163208, + "learning_rate": 0.00011979620218614747, + "loss": 1.3455, + "step": 30870 + }, + { + "epoch": 0.40115456522692533, + "grad_norm": 0.38688522577285767, + "learning_rate": 0.0001197936027242361, + "loss": 1.26, + "step": 30871 + }, + { + "epoch": 0.40116755977084123, + "grad_norm": 0.4591645300388336, + "learning_rate": 0.0001197910032623247, + "loss": 1.4208, + "step": 30872 + }, + { + "epoch": 0.4011805543147571, + "grad_norm": 0.34246429800987244, + "learning_rate": 0.00011978840380041331, + "loss": 1.1288, + "step": 30873 + }, + { + "epoch": 0.401193548858673, + "grad_norm": 0.4164416193962097, + "learning_rate": 0.00011978580433850192, + "loss": 1.3145, + "step": 30874 + }, + { + "epoch": 0.4012065434025888, + "grad_norm": 0.43146395683288574, + "learning_rate": 0.00011978320487659056, + "loss": 1.3508, + "step": 30875 + }, + { + "epoch": 0.4012195379465047, + "grad_norm": 0.38753631711006165, + "learning_rate": 0.00011978060541467917, + "loss": 1.3951, + "step": 30876 + }, + { + "epoch": 0.40123253249042057, + "grad_norm": 0.43959835171699524, + "learning_rate": 0.00011977800595276778, + "loss": 1.3638, + "step": 30877 + }, + { + "epoch": 0.40124552703433647, + "grad_norm": 0.4973922669887543, + "learning_rate": 0.0001197754064908564, + "loss": 1.3873, + "step": 30878 + }, + { + "epoch": 0.4012585215782523, + "grad_norm": 0.41170361638069153, + "learning_rate": 0.00011977280702894502, + "loss": 1.4862, + "step": 30879 + }, + { + "epoch": 0.4012715161221682, + "grad_norm": 0.422812819480896, + "learning_rate": 0.00011977020756703363, + "loss": 1.2926, + "step": 30880 + }, + { + "epoch": 0.40128451066608406, + "grad_norm": 0.4882308542728424, + "learning_rate": 0.00011976760810512224, + "loss": 1.4544, + "step": 30881 + }, + { + "epoch": 0.40129750520999996, + "grad_norm": 0.3994767665863037, + "learning_rate": 0.00011976500864321085, + "loss": 1.3482, + "step": 30882 + }, + { + "epoch": 0.4013104997539158, + "grad_norm": 0.46767398715019226, + "learning_rate": 0.00011976240918129949, + "loss": 1.3036, + "step": 30883 + }, + { + "epoch": 0.4013234942978317, + "grad_norm": 0.42908281087875366, + "learning_rate": 0.0001197598097193881, + "loss": 1.4414, + "step": 30884 + }, + { + "epoch": 0.40133648884174755, + "grad_norm": 0.42466217279434204, + "learning_rate": 0.0001197572102574767, + "loss": 1.5046, + "step": 30885 + }, + { + "epoch": 0.40134948338566345, + "grad_norm": 0.3394346535205841, + "learning_rate": 0.00011975461079556531, + "loss": 1.4717, + "step": 30886 + }, + { + "epoch": 0.4013624779295793, + "grad_norm": 0.3789116442203522, + "learning_rate": 0.00011975201133365395, + "loss": 1.2896, + "step": 30887 + }, + { + "epoch": 0.4013754724734952, + "grad_norm": 0.40706583857536316, + "learning_rate": 0.00011974941187174256, + "loss": 1.5988, + "step": 30888 + }, + { + "epoch": 0.40138846701741104, + "grad_norm": 0.43128660321235657, + "learning_rate": 0.00011974681240983117, + "loss": 1.6382, + "step": 30889 + }, + { + "epoch": 0.40140146156132694, + "grad_norm": 0.27285197377204895, + "learning_rate": 0.00011974421294791978, + "loss": 1.6409, + "step": 30890 + }, + { + "epoch": 0.4014144561052428, + "grad_norm": 0.48049065470695496, + "learning_rate": 0.0001197416134860084, + "loss": 1.3976, + "step": 30891 + }, + { + "epoch": 0.4014274506491587, + "grad_norm": 0.30589044094085693, + "learning_rate": 0.00011973901402409701, + "loss": 1.2662, + "step": 30892 + }, + { + "epoch": 0.40144044519307454, + "grad_norm": 0.4326670467853546, + "learning_rate": 0.00011973641456218563, + "loss": 1.3781, + "step": 30893 + }, + { + "epoch": 0.40145343973699044, + "grad_norm": 0.4140307307243347, + "learning_rate": 0.00011973381510027424, + "loss": 1.2867, + "step": 30894 + }, + { + "epoch": 0.4014664342809063, + "grad_norm": 0.37885743379592896, + "learning_rate": 0.00011973121563836287, + "loss": 1.533, + "step": 30895 + }, + { + "epoch": 0.4014794288248222, + "grad_norm": 0.40460264682769775, + "learning_rate": 0.00011972861617645149, + "loss": 1.3126, + "step": 30896 + }, + { + "epoch": 0.401492423368738, + "grad_norm": 0.29098427295684814, + "learning_rate": 0.0001197260167145401, + "loss": 1.1961, + "step": 30897 + }, + { + "epoch": 0.40150541791265393, + "grad_norm": 0.37326356768608093, + "learning_rate": 0.00011972341725262872, + "loss": 1.3055, + "step": 30898 + }, + { + "epoch": 0.4015184124565698, + "grad_norm": 0.349769651889801, + "learning_rate": 0.00011972081779071733, + "loss": 1.4681, + "step": 30899 + }, + { + "epoch": 0.4015314070004857, + "grad_norm": 0.41014358401298523, + "learning_rate": 0.00011971821832880594, + "loss": 1.5628, + "step": 30900 + }, + { + "epoch": 0.4015444015444015, + "grad_norm": 0.5560110211372375, + "learning_rate": 0.00011971561886689455, + "loss": 1.3391, + "step": 30901 + }, + { + "epoch": 0.4015573960883174, + "grad_norm": 0.47671815752983093, + "learning_rate": 0.00011971301940498318, + "loss": 1.477, + "step": 30902 + }, + { + "epoch": 0.40157039063223327, + "grad_norm": 0.4210048317909241, + "learning_rate": 0.00011971041994307179, + "loss": 1.3543, + "step": 30903 + }, + { + "epoch": 0.40158338517614917, + "grad_norm": 0.4483545422554016, + "learning_rate": 0.0001197078204811604, + "loss": 1.6265, + "step": 30904 + }, + { + "epoch": 0.401596379720065, + "grad_norm": 0.3446093499660492, + "learning_rate": 0.00011970522101924901, + "loss": 1.4344, + "step": 30905 + }, + { + "epoch": 0.4016093742639809, + "grad_norm": 0.27788135409355164, + "learning_rate": 0.00011970262155733765, + "loss": 1.4237, + "step": 30906 + }, + { + "epoch": 0.40162236880789676, + "grad_norm": 0.5458555817604065, + "learning_rate": 0.00011970002209542626, + "loss": 1.4991, + "step": 30907 + }, + { + "epoch": 0.40163536335181266, + "grad_norm": 0.39793604612350464, + "learning_rate": 0.00011969742263351487, + "loss": 1.4515, + "step": 30908 + }, + { + "epoch": 0.40164835789572856, + "grad_norm": 0.4374416470527649, + "learning_rate": 0.00011969482317160348, + "loss": 1.4077, + "step": 30909 + }, + { + "epoch": 0.4016613524396444, + "grad_norm": 0.3765833079814911, + "learning_rate": 0.0001196922237096921, + "loss": 1.4179, + "step": 30910 + }, + { + "epoch": 0.4016743469835603, + "grad_norm": 0.41321155428886414, + "learning_rate": 0.00011968962424778072, + "loss": 1.3748, + "step": 30911 + }, + { + "epoch": 0.40168734152747615, + "grad_norm": 0.33002957701683044, + "learning_rate": 0.00011968702478586933, + "loss": 1.2833, + "step": 30912 + }, + { + "epoch": 0.40170033607139205, + "grad_norm": 0.36609795689582825, + "learning_rate": 0.00011968442532395794, + "loss": 1.4742, + "step": 30913 + }, + { + "epoch": 0.4017133306153079, + "grad_norm": 0.5107268691062927, + "learning_rate": 0.00011968182586204656, + "loss": 1.4252, + "step": 30914 + }, + { + "epoch": 0.4017263251592238, + "grad_norm": 0.42915090918540955, + "learning_rate": 0.00011967922640013517, + "loss": 1.4848, + "step": 30915 + }, + { + "epoch": 0.40173931970313964, + "grad_norm": 0.34450864791870117, + "learning_rate": 0.00011967662693822379, + "loss": 1.4102, + "step": 30916 + }, + { + "epoch": 0.40175231424705554, + "grad_norm": 0.5170263648033142, + "learning_rate": 0.0001196740274763124, + "loss": 1.5575, + "step": 30917 + }, + { + "epoch": 0.4017653087909714, + "grad_norm": 0.39502233266830444, + "learning_rate": 0.00011967142801440103, + "loss": 1.3843, + "step": 30918 + }, + { + "epoch": 0.4017783033348873, + "grad_norm": 0.24875958263874054, + "learning_rate": 0.00011966882855248965, + "loss": 1.3372, + "step": 30919 + }, + { + "epoch": 0.40179129787880313, + "grad_norm": 0.321890652179718, + "learning_rate": 0.00011966622909057826, + "loss": 1.6299, + "step": 30920 + }, + { + "epoch": 0.40180429242271903, + "grad_norm": 0.40718942880630493, + "learning_rate": 0.00011966362962866687, + "loss": 1.4659, + "step": 30921 + }, + { + "epoch": 0.4018172869666349, + "grad_norm": 0.39859914779663086, + "learning_rate": 0.00011966103016675549, + "loss": 1.4074, + "step": 30922 + }, + { + "epoch": 0.4018302815105508, + "grad_norm": 0.3389623463153839, + "learning_rate": 0.0001196584307048441, + "loss": 1.3038, + "step": 30923 + }, + { + "epoch": 0.4018432760544666, + "grad_norm": 0.3802888095378876, + "learning_rate": 0.00011965583124293271, + "loss": 1.4235, + "step": 30924 + }, + { + "epoch": 0.4018562705983825, + "grad_norm": 0.2866087853908539, + "learning_rate": 0.00011965323178102132, + "loss": 1.3044, + "step": 30925 + }, + { + "epoch": 0.40186926514229837, + "grad_norm": 0.325508713722229, + "learning_rate": 0.00011965063231910996, + "loss": 1.425, + "step": 30926 + }, + { + "epoch": 0.40188225968621427, + "grad_norm": 0.34628865122795105, + "learning_rate": 0.00011964803285719856, + "loss": 1.4094, + "step": 30927 + }, + { + "epoch": 0.4018952542301301, + "grad_norm": 0.4367072582244873, + "learning_rate": 0.00011964543339528717, + "loss": 1.443, + "step": 30928 + }, + { + "epoch": 0.401908248774046, + "grad_norm": 0.3642234206199646, + "learning_rate": 0.00011964283393337578, + "loss": 1.2116, + "step": 30929 + }, + { + "epoch": 0.40192124331796186, + "grad_norm": 0.5267978310585022, + "learning_rate": 0.00011964023447146442, + "loss": 1.5136, + "step": 30930 + }, + { + "epoch": 0.40193423786187776, + "grad_norm": 0.33697745203971863, + "learning_rate": 0.00011963763500955303, + "loss": 1.4535, + "step": 30931 + }, + { + "epoch": 0.4019472324057936, + "grad_norm": 0.3666036128997803, + "learning_rate": 0.00011963503554764164, + "loss": 1.2289, + "step": 30932 + }, + { + "epoch": 0.4019602269497095, + "grad_norm": 0.4058707356452942, + "learning_rate": 0.00011963243608573027, + "loss": 1.7209, + "step": 30933 + }, + { + "epoch": 0.40197322149362535, + "grad_norm": 0.331142395734787, + "learning_rate": 0.00011962983662381888, + "loss": 1.233, + "step": 30934 + }, + { + "epoch": 0.40198621603754126, + "grad_norm": 0.4929024279117584, + "learning_rate": 0.00011962723716190749, + "loss": 1.5457, + "step": 30935 + }, + { + "epoch": 0.4019992105814571, + "grad_norm": 0.3453991115093231, + "learning_rate": 0.0001196246376999961, + "loss": 1.2923, + "step": 30936 + }, + { + "epoch": 0.402012205125373, + "grad_norm": 0.3793419897556305, + "learning_rate": 0.00011962203823808474, + "loss": 1.3573, + "step": 30937 + }, + { + "epoch": 0.40202519966928885, + "grad_norm": 0.37099671363830566, + "learning_rate": 0.00011961943877617335, + "loss": 1.4699, + "step": 30938 + }, + { + "epoch": 0.40203819421320475, + "grad_norm": 0.3517524003982544, + "learning_rate": 0.00011961683931426196, + "loss": 1.3488, + "step": 30939 + }, + { + "epoch": 0.4020511887571206, + "grad_norm": 0.4371648132801056, + "learning_rate": 0.00011961423985235056, + "loss": 1.339, + "step": 30940 + }, + { + "epoch": 0.4020641833010365, + "grad_norm": 0.41290485858917236, + "learning_rate": 0.0001196116403904392, + "loss": 1.4669, + "step": 30941 + }, + { + "epoch": 0.40207717784495234, + "grad_norm": 0.5190026164054871, + "learning_rate": 0.0001196090409285278, + "loss": 1.4955, + "step": 30942 + }, + { + "epoch": 0.40209017238886824, + "grad_norm": 0.3623102009296417, + "learning_rate": 0.00011960644146661642, + "loss": 1.4592, + "step": 30943 + }, + { + "epoch": 0.4021031669327841, + "grad_norm": 0.2695774734020233, + "learning_rate": 0.00011960384200470503, + "loss": 1.3351, + "step": 30944 + }, + { + "epoch": 0.4021161614767, + "grad_norm": 0.40172308683395386, + "learning_rate": 0.00011960124254279365, + "loss": 1.477, + "step": 30945 + }, + { + "epoch": 0.40212915602061583, + "grad_norm": 0.4356275498867035, + "learning_rate": 0.00011959864308088226, + "loss": 1.4529, + "step": 30946 + }, + { + "epoch": 0.40214215056453173, + "grad_norm": 0.4433925747871399, + "learning_rate": 0.00011959604361897087, + "loss": 1.314, + "step": 30947 + }, + { + "epoch": 0.4021551451084476, + "grad_norm": 0.4564923048019409, + "learning_rate": 0.00011959344415705948, + "loss": 1.3194, + "step": 30948 + }, + { + "epoch": 0.4021681396523635, + "grad_norm": 0.38128161430358887, + "learning_rate": 0.00011959084469514812, + "loss": 1.3865, + "step": 30949 + }, + { + "epoch": 0.4021811341962793, + "grad_norm": 0.48772263526916504, + "learning_rate": 0.00011958824523323673, + "loss": 1.474, + "step": 30950 + }, + { + "epoch": 0.4021941287401952, + "grad_norm": 0.49761760234832764, + "learning_rate": 0.00011958564577132534, + "loss": 1.3363, + "step": 30951 + }, + { + "epoch": 0.40220712328411107, + "grad_norm": 0.5154112577438354, + "learning_rate": 0.00011958304630941394, + "loss": 1.4275, + "step": 30952 + }, + { + "epoch": 0.40222011782802697, + "grad_norm": 0.25657325983047485, + "learning_rate": 0.00011958044684750258, + "loss": 1.1663, + "step": 30953 + }, + { + "epoch": 0.4022331123719428, + "grad_norm": 0.29696914553642273, + "learning_rate": 0.00011957784738559119, + "loss": 1.0954, + "step": 30954 + }, + { + "epoch": 0.4022461069158587, + "grad_norm": 0.4007854759693146, + "learning_rate": 0.0001195752479236798, + "loss": 1.5016, + "step": 30955 + }, + { + "epoch": 0.40225910145977456, + "grad_norm": 0.4521598815917969, + "learning_rate": 0.00011957264846176841, + "loss": 1.3747, + "step": 30956 + }, + { + "epoch": 0.40227209600369046, + "grad_norm": 0.4260927438735962, + "learning_rate": 0.00011957004899985704, + "loss": 1.4452, + "step": 30957 + }, + { + "epoch": 0.4022850905476063, + "grad_norm": 0.374565064907074, + "learning_rate": 0.00011956744953794565, + "loss": 1.2725, + "step": 30958 + }, + { + "epoch": 0.4022980850915222, + "grad_norm": 0.3917993903160095, + "learning_rate": 0.00011956485007603426, + "loss": 1.5097, + "step": 30959 + }, + { + "epoch": 0.40231107963543805, + "grad_norm": 0.33806294202804565, + "learning_rate": 0.00011956225061412287, + "loss": 1.2169, + "step": 30960 + }, + { + "epoch": 0.40232407417935395, + "grad_norm": 0.42782077193260193, + "learning_rate": 0.00011955965115221151, + "loss": 1.4093, + "step": 30961 + }, + { + "epoch": 0.4023370687232698, + "grad_norm": 0.4071703851222992, + "learning_rate": 0.00011955705169030012, + "loss": 1.3272, + "step": 30962 + }, + { + "epoch": 0.4023500632671857, + "grad_norm": 0.3569739758968353, + "learning_rate": 0.00011955445222838873, + "loss": 1.2467, + "step": 30963 + }, + { + "epoch": 0.40236305781110154, + "grad_norm": 0.42998987436294556, + "learning_rate": 0.00011955185276647734, + "loss": 1.422, + "step": 30964 + }, + { + "epoch": 0.40237605235501744, + "grad_norm": 0.35882484912872314, + "learning_rate": 0.00011954925330456597, + "loss": 1.4329, + "step": 30965 + }, + { + "epoch": 0.4023890468989333, + "grad_norm": 0.48883292078971863, + "learning_rate": 0.00011954665384265458, + "loss": 1.4236, + "step": 30966 + }, + { + "epoch": 0.4024020414428492, + "grad_norm": 0.32677263021469116, + "learning_rate": 0.00011954405438074319, + "loss": 1.2707, + "step": 30967 + }, + { + "epoch": 0.40241503598676504, + "grad_norm": 0.47621282935142517, + "learning_rate": 0.0001195414549188318, + "loss": 1.45, + "step": 30968 + }, + { + "epoch": 0.40242803053068094, + "grad_norm": 0.4009459614753723, + "learning_rate": 0.00011953885545692042, + "loss": 1.3891, + "step": 30969 + }, + { + "epoch": 0.4024410250745968, + "grad_norm": 0.3691713809967041, + "learning_rate": 0.00011953625599500903, + "loss": 1.4875, + "step": 30970 + }, + { + "epoch": 0.4024540196185127, + "grad_norm": 0.3944145739078522, + "learning_rate": 0.00011953365653309764, + "loss": 1.4293, + "step": 30971 + }, + { + "epoch": 0.4024670141624285, + "grad_norm": 0.36506539583206177, + "learning_rate": 0.00011953105707118628, + "loss": 1.4355, + "step": 30972 + }, + { + "epoch": 0.4024800087063444, + "grad_norm": 0.30306610465049744, + "learning_rate": 0.0001195284576092749, + "loss": 1.3112, + "step": 30973 + }, + { + "epoch": 0.4024930032502603, + "grad_norm": 0.4144796133041382, + "learning_rate": 0.0001195258581473635, + "loss": 1.403, + "step": 30974 + }, + { + "epoch": 0.4025059977941762, + "grad_norm": 0.4814968407154083, + "learning_rate": 0.00011952325868545212, + "loss": 1.2996, + "step": 30975 + }, + { + "epoch": 0.402518992338092, + "grad_norm": 0.46252134442329407, + "learning_rate": 0.00011952065922354074, + "loss": 1.5327, + "step": 30976 + }, + { + "epoch": 0.4025319868820079, + "grad_norm": 0.3180639445781708, + "learning_rate": 0.00011951805976162935, + "loss": 1.3248, + "step": 30977 + }, + { + "epoch": 0.40254498142592376, + "grad_norm": 0.3820574879646301, + "learning_rate": 0.00011951546029971796, + "loss": 1.2548, + "step": 30978 + }, + { + "epoch": 0.40255797596983967, + "grad_norm": 0.43728670477867126, + "learning_rate": 0.00011951286083780657, + "loss": 1.3186, + "step": 30979 + }, + { + "epoch": 0.4025709705137555, + "grad_norm": 0.3367365002632141, + "learning_rate": 0.00011951026137589521, + "loss": 1.2738, + "step": 30980 + }, + { + "epoch": 0.4025839650576714, + "grad_norm": 0.4072990119457245, + "learning_rate": 0.00011950766191398382, + "loss": 1.3514, + "step": 30981 + }, + { + "epoch": 0.40259695960158726, + "grad_norm": 0.3772618770599365, + "learning_rate": 0.00011950506245207242, + "loss": 1.3191, + "step": 30982 + }, + { + "epoch": 0.40260995414550316, + "grad_norm": 0.41574108600616455, + "learning_rate": 0.00011950246299016103, + "loss": 1.3599, + "step": 30983 + }, + { + "epoch": 0.40262294868941906, + "grad_norm": 0.4149223566055298, + "learning_rate": 0.00011949986352824967, + "loss": 1.3292, + "step": 30984 + }, + { + "epoch": 0.4026359432333349, + "grad_norm": 0.28939738869667053, + "learning_rate": 0.00011949726406633828, + "loss": 1.0865, + "step": 30985 + }, + { + "epoch": 0.4026489377772508, + "grad_norm": 0.4369574189186096, + "learning_rate": 0.00011949466460442689, + "loss": 1.3737, + "step": 30986 + }, + { + "epoch": 0.40266193232116665, + "grad_norm": 0.43800514936447144, + "learning_rate": 0.0001194920651425155, + "loss": 1.7039, + "step": 30987 + }, + { + "epoch": 0.40267492686508255, + "grad_norm": 0.4189130365848541, + "learning_rate": 0.00011948946568060413, + "loss": 1.4047, + "step": 30988 + }, + { + "epoch": 0.4026879214089984, + "grad_norm": 0.3541613519191742, + "learning_rate": 0.00011948686621869274, + "loss": 1.3784, + "step": 30989 + }, + { + "epoch": 0.4027009159529143, + "grad_norm": 0.4362553656101227, + "learning_rate": 0.00011948426675678135, + "loss": 1.4625, + "step": 30990 + }, + { + "epoch": 0.40271391049683014, + "grad_norm": 0.3711378872394562, + "learning_rate": 0.00011948166729486996, + "loss": 1.3243, + "step": 30991 + }, + { + "epoch": 0.40272690504074604, + "grad_norm": 0.40367165207862854, + "learning_rate": 0.0001194790678329586, + "loss": 1.364, + "step": 30992 + }, + { + "epoch": 0.4027398995846619, + "grad_norm": 0.2890492379665375, + "learning_rate": 0.00011947646837104721, + "loss": 1.4631, + "step": 30993 + }, + { + "epoch": 0.4027528941285778, + "grad_norm": 0.4290468692779541, + "learning_rate": 0.0001194738689091358, + "loss": 1.6929, + "step": 30994 + }, + { + "epoch": 0.40276588867249363, + "grad_norm": 0.5350620746612549, + "learning_rate": 0.00011947126944722442, + "loss": 1.4708, + "step": 30995 + }, + { + "epoch": 0.40277888321640953, + "grad_norm": 0.4404739439487457, + "learning_rate": 0.00011946866998531305, + "loss": 1.4449, + "step": 30996 + }, + { + "epoch": 0.4027918777603254, + "grad_norm": 0.43121930956840515, + "learning_rate": 0.00011946607052340166, + "loss": 1.5191, + "step": 30997 + }, + { + "epoch": 0.4028048723042413, + "grad_norm": 0.3534773886203766, + "learning_rate": 0.00011946347106149028, + "loss": 1.4113, + "step": 30998 + }, + { + "epoch": 0.4028178668481571, + "grad_norm": 0.3682509958744049, + "learning_rate": 0.00011946087159957889, + "loss": 1.3675, + "step": 30999 + }, + { + "epoch": 0.402830861392073, + "grad_norm": 0.49179255962371826, + "learning_rate": 0.00011945827213766751, + "loss": 1.3383, + "step": 31000 + }, + { + "epoch": 0.40284385593598887, + "grad_norm": 0.3741339445114136, + "learning_rate": 0.00011945567267575612, + "loss": 1.3002, + "step": 31001 + }, + { + "epoch": 0.40285685047990477, + "grad_norm": 0.40714019536972046, + "learning_rate": 0.00011945307321384473, + "loss": 1.4236, + "step": 31002 + }, + { + "epoch": 0.4028698450238206, + "grad_norm": 0.4596451222896576, + "learning_rate": 0.00011945047375193334, + "loss": 1.4803, + "step": 31003 + }, + { + "epoch": 0.4028828395677365, + "grad_norm": 0.37432006001472473, + "learning_rate": 0.00011944787429002198, + "loss": 1.2799, + "step": 31004 + }, + { + "epoch": 0.40289583411165236, + "grad_norm": 0.33053556084632874, + "learning_rate": 0.00011944527482811059, + "loss": 1.3286, + "step": 31005 + }, + { + "epoch": 0.40290882865556826, + "grad_norm": 0.43211057782173157, + "learning_rate": 0.0001194426753661992, + "loss": 1.5037, + "step": 31006 + }, + { + "epoch": 0.4029218231994841, + "grad_norm": 0.41410520672798157, + "learning_rate": 0.00011944007590428783, + "loss": 1.3322, + "step": 31007 + }, + { + "epoch": 0.4029348177434, + "grad_norm": 0.3912518620491028, + "learning_rate": 0.00011943747644237644, + "loss": 1.3655, + "step": 31008 + }, + { + "epoch": 0.40294781228731585, + "grad_norm": 0.43086865544319153, + "learning_rate": 0.00011943487698046505, + "loss": 1.3079, + "step": 31009 + }, + { + "epoch": 0.40296080683123175, + "grad_norm": 0.40380388498306274, + "learning_rate": 0.00011943227751855366, + "loss": 1.3576, + "step": 31010 + }, + { + "epoch": 0.4029738013751476, + "grad_norm": 0.4473000466823578, + "learning_rate": 0.00011942967805664228, + "loss": 1.4773, + "step": 31011 + }, + { + "epoch": 0.4029867959190635, + "grad_norm": 0.3912469744682312, + "learning_rate": 0.0001194270785947309, + "loss": 1.3301, + "step": 31012 + }, + { + "epoch": 0.40299979046297935, + "grad_norm": 0.3911738097667694, + "learning_rate": 0.00011942447913281951, + "loss": 1.3142, + "step": 31013 + }, + { + "epoch": 0.40301278500689525, + "grad_norm": 0.5078325271606445, + "learning_rate": 0.00011942187967090812, + "loss": 1.3698, + "step": 31014 + }, + { + "epoch": 0.4030257795508111, + "grad_norm": 0.45078396797180176, + "learning_rate": 0.00011941928020899676, + "loss": 1.4535, + "step": 31015 + }, + { + "epoch": 0.403038774094727, + "grad_norm": 0.3845955431461334, + "learning_rate": 0.00011941668074708537, + "loss": 1.3843, + "step": 31016 + }, + { + "epoch": 0.40305176863864284, + "grad_norm": 0.42612841725349426, + "learning_rate": 0.00011941408128517398, + "loss": 1.4616, + "step": 31017 + }, + { + "epoch": 0.40306476318255874, + "grad_norm": 0.3638926148414612, + "learning_rate": 0.00011941148182326259, + "loss": 1.4092, + "step": 31018 + }, + { + "epoch": 0.4030777577264746, + "grad_norm": 0.39457377791404724, + "learning_rate": 0.00011940888236135121, + "loss": 1.4205, + "step": 31019 + }, + { + "epoch": 0.4030907522703905, + "grad_norm": 0.3126545548439026, + "learning_rate": 0.00011940628289943982, + "loss": 1.0962, + "step": 31020 + }, + { + "epoch": 0.40310374681430633, + "grad_norm": 0.35651418566703796, + "learning_rate": 0.00011940368343752843, + "loss": 1.383, + "step": 31021 + }, + { + "epoch": 0.40311674135822223, + "grad_norm": 0.5040359497070312, + "learning_rate": 0.00011940108397561705, + "loss": 1.4347, + "step": 31022 + }, + { + "epoch": 0.4031297359021381, + "grad_norm": 0.45616036653518677, + "learning_rate": 0.00011939848451370568, + "loss": 1.4728, + "step": 31023 + }, + { + "epoch": 0.403142730446054, + "grad_norm": 0.3578883707523346, + "learning_rate": 0.00011939588505179428, + "loss": 1.2405, + "step": 31024 + }, + { + "epoch": 0.4031557249899698, + "grad_norm": 0.3503490388393402, + "learning_rate": 0.00011939328558988289, + "loss": 1.455, + "step": 31025 + }, + { + "epoch": 0.4031687195338857, + "grad_norm": 0.336767315864563, + "learning_rate": 0.0001193906861279715, + "loss": 1.4416, + "step": 31026 + }, + { + "epoch": 0.40318171407780157, + "grad_norm": 0.446418434381485, + "learning_rate": 0.00011938808666606014, + "loss": 1.5306, + "step": 31027 + }, + { + "epoch": 0.40319470862171747, + "grad_norm": 0.42929142713546753, + "learning_rate": 0.00011938548720414875, + "loss": 1.2, + "step": 31028 + }, + { + "epoch": 0.4032077031656333, + "grad_norm": 0.42894983291625977, + "learning_rate": 0.00011938288774223736, + "loss": 1.4694, + "step": 31029 + }, + { + "epoch": 0.4032206977095492, + "grad_norm": 0.31042197346687317, + "learning_rate": 0.00011938028828032597, + "loss": 1.4006, + "step": 31030 + }, + { + "epoch": 0.40323369225346506, + "grad_norm": 0.4395008087158203, + "learning_rate": 0.0001193776888184146, + "loss": 1.4823, + "step": 31031 + }, + { + "epoch": 0.40324668679738096, + "grad_norm": 0.46104106307029724, + "learning_rate": 0.00011937508935650321, + "loss": 1.5517, + "step": 31032 + }, + { + "epoch": 0.4032596813412968, + "grad_norm": 0.2806963324546814, + "learning_rate": 0.00011937248989459182, + "loss": 1.242, + "step": 31033 + }, + { + "epoch": 0.4032726758852127, + "grad_norm": 0.3751618564128876, + "learning_rate": 0.00011936989043268043, + "loss": 1.4648, + "step": 31034 + }, + { + "epoch": 0.40328567042912855, + "grad_norm": 0.4882134795188904, + "learning_rate": 0.00011936729097076907, + "loss": 1.3877, + "step": 31035 + }, + { + "epoch": 0.40329866497304445, + "grad_norm": 0.34530094265937805, + "learning_rate": 0.00011936469150885767, + "loss": 1.2336, + "step": 31036 + }, + { + "epoch": 0.4033116595169603, + "grad_norm": 0.4550773501396179, + "learning_rate": 0.00011936209204694628, + "loss": 1.2656, + "step": 31037 + }, + { + "epoch": 0.4033246540608762, + "grad_norm": 0.41220822930336, + "learning_rate": 0.00011935949258503489, + "loss": 1.21, + "step": 31038 + }, + { + "epoch": 0.40333764860479204, + "grad_norm": 0.44122928380966187, + "learning_rate": 0.00011935689312312353, + "loss": 1.5191, + "step": 31039 + }, + { + "epoch": 0.40335064314870794, + "grad_norm": 0.4742184281349182, + "learning_rate": 0.00011935429366121214, + "loss": 1.574, + "step": 31040 + }, + { + "epoch": 0.4033636376926238, + "grad_norm": 0.29416942596435547, + "learning_rate": 0.00011935169419930075, + "loss": 1.2699, + "step": 31041 + }, + { + "epoch": 0.4033766322365397, + "grad_norm": 0.5073818564414978, + "learning_rate": 0.00011934909473738936, + "loss": 1.5844, + "step": 31042 + }, + { + "epoch": 0.40338962678045553, + "grad_norm": 0.5408205986022949, + "learning_rate": 0.00011934649527547798, + "loss": 1.5044, + "step": 31043 + }, + { + "epoch": 0.40340262132437144, + "grad_norm": 0.41733425855636597, + "learning_rate": 0.0001193438958135666, + "loss": 1.6471, + "step": 31044 + }, + { + "epoch": 0.4034156158682873, + "grad_norm": 0.4319666922092438, + "learning_rate": 0.0001193412963516552, + "loss": 1.4255, + "step": 31045 + }, + { + "epoch": 0.4034286104122032, + "grad_norm": 0.4576224088668823, + "learning_rate": 0.00011933869688974384, + "loss": 1.349, + "step": 31046 + }, + { + "epoch": 0.403441604956119, + "grad_norm": 0.44393396377563477, + "learning_rate": 0.00011933609742783245, + "loss": 1.1948, + "step": 31047 + }, + { + "epoch": 0.4034545995000349, + "grad_norm": 0.5083323121070862, + "learning_rate": 0.00011933349796592107, + "loss": 1.5725, + "step": 31048 + }, + { + "epoch": 0.4034675940439508, + "grad_norm": 0.4106099605560303, + "learning_rate": 0.00011933089850400966, + "loss": 1.3906, + "step": 31049 + }, + { + "epoch": 0.4034805885878667, + "grad_norm": 0.3360599875450134, + "learning_rate": 0.0001193282990420983, + "loss": 1.2085, + "step": 31050 + }, + { + "epoch": 0.4034935831317825, + "grad_norm": 0.37186679244041443, + "learning_rate": 0.00011932569958018691, + "loss": 1.4006, + "step": 31051 + }, + { + "epoch": 0.4035065776756984, + "grad_norm": 0.3861527144908905, + "learning_rate": 0.00011932310011827552, + "loss": 1.4841, + "step": 31052 + }, + { + "epoch": 0.40351957221961426, + "grad_norm": 0.30264320969581604, + "learning_rate": 0.00011932050065636413, + "loss": 1.1926, + "step": 31053 + }, + { + "epoch": 0.40353256676353016, + "grad_norm": 0.38274046778678894, + "learning_rate": 0.00011931790119445276, + "loss": 1.4578, + "step": 31054 + }, + { + "epoch": 0.403545561307446, + "grad_norm": 0.2625962197780609, + "learning_rate": 0.00011931530173254137, + "loss": 1.2895, + "step": 31055 + }, + { + "epoch": 0.4035585558513619, + "grad_norm": 0.4568151533603668, + "learning_rate": 0.00011931270227062998, + "loss": 1.3637, + "step": 31056 + }, + { + "epoch": 0.40357155039527776, + "grad_norm": 0.26751509308815, + "learning_rate": 0.00011931010280871859, + "loss": 1.2122, + "step": 31057 + }, + { + "epoch": 0.40358454493919366, + "grad_norm": 0.406337171792984, + "learning_rate": 0.00011930750334680723, + "loss": 1.4235, + "step": 31058 + }, + { + "epoch": 0.4035975394831095, + "grad_norm": 0.36575061082839966, + "learning_rate": 0.00011930490388489584, + "loss": 1.3201, + "step": 31059 + }, + { + "epoch": 0.4036105340270254, + "grad_norm": 0.572536826133728, + "learning_rate": 0.00011930230442298445, + "loss": 1.4371, + "step": 31060 + }, + { + "epoch": 0.4036235285709413, + "grad_norm": 0.29167428612709045, + "learning_rate": 0.00011929970496107306, + "loss": 1.3815, + "step": 31061 + }, + { + "epoch": 0.40363652311485715, + "grad_norm": 0.3699348270893097, + "learning_rate": 0.00011929710549916169, + "loss": 1.3976, + "step": 31062 + }, + { + "epoch": 0.40364951765877305, + "grad_norm": 0.36710453033447266, + "learning_rate": 0.0001192945060372503, + "loss": 1.6085, + "step": 31063 + }, + { + "epoch": 0.4036625122026889, + "grad_norm": 0.39688268303871155, + "learning_rate": 0.00011929190657533891, + "loss": 1.5108, + "step": 31064 + }, + { + "epoch": 0.4036755067466048, + "grad_norm": 0.43722018599510193, + "learning_rate": 0.00011928930711342752, + "loss": 1.6239, + "step": 31065 + }, + { + "epoch": 0.40368850129052064, + "grad_norm": 0.39373287558555603, + "learning_rate": 0.00011928670765151614, + "loss": 1.5558, + "step": 31066 + }, + { + "epoch": 0.40370149583443654, + "grad_norm": 0.3575771152973175, + "learning_rate": 0.00011928410818960475, + "loss": 1.3021, + "step": 31067 + }, + { + "epoch": 0.4037144903783524, + "grad_norm": 0.4879028797149658, + "learning_rate": 0.00011928150872769337, + "loss": 1.4182, + "step": 31068 + }, + { + "epoch": 0.4037274849222683, + "grad_norm": 0.3960452377796173, + "learning_rate": 0.00011927890926578198, + "loss": 1.6554, + "step": 31069 + }, + { + "epoch": 0.40374047946618413, + "grad_norm": 0.36082056164741516, + "learning_rate": 0.00011927630980387061, + "loss": 1.3421, + "step": 31070 + }, + { + "epoch": 0.40375347401010003, + "grad_norm": 0.44531652331352234, + "learning_rate": 0.00011927371034195923, + "loss": 1.3684, + "step": 31071 + }, + { + "epoch": 0.4037664685540159, + "grad_norm": 0.38977527618408203, + "learning_rate": 0.00011927111088004784, + "loss": 1.3236, + "step": 31072 + }, + { + "epoch": 0.4037794630979318, + "grad_norm": 0.4571976959705353, + "learning_rate": 0.00011926851141813645, + "loss": 1.4554, + "step": 31073 + }, + { + "epoch": 0.4037924576418476, + "grad_norm": 0.48396363854408264, + "learning_rate": 0.00011926591195622507, + "loss": 1.1672, + "step": 31074 + }, + { + "epoch": 0.4038054521857635, + "grad_norm": 0.4495755732059479, + "learning_rate": 0.00011926331249431368, + "loss": 1.4583, + "step": 31075 + }, + { + "epoch": 0.40381844672967937, + "grad_norm": 0.36246901750564575, + "learning_rate": 0.0001192607130324023, + "loss": 1.3106, + "step": 31076 + }, + { + "epoch": 0.40383144127359527, + "grad_norm": 0.31022751331329346, + "learning_rate": 0.0001192581135704909, + "loss": 1.4672, + "step": 31077 + }, + { + "epoch": 0.4038444358175111, + "grad_norm": 0.46667397022247314, + "learning_rate": 0.00011925551410857953, + "loss": 1.434, + "step": 31078 + }, + { + "epoch": 0.403857430361427, + "grad_norm": 0.321641206741333, + "learning_rate": 0.00011925291464666814, + "loss": 1.4739, + "step": 31079 + }, + { + "epoch": 0.40387042490534286, + "grad_norm": 0.37200087308883667, + "learning_rate": 0.00011925031518475675, + "loss": 1.3111, + "step": 31080 + }, + { + "epoch": 0.40388341944925876, + "grad_norm": 0.24284672737121582, + "learning_rate": 0.00011924771572284536, + "loss": 1.1892, + "step": 31081 + }, + { + "epoch": 0.4038964139931746, + "grad_norm": 0.4175315797328949, + "learning_rate": 0.000119245116260934, + "loss": 1.3551, + "step": 31082 + }, + { + "epoch": 0.4039094085370905, + "grad_norm": 0.3691502809524536, + "learning_rate": 0.00011924251679902261, + "loss": 1.585, + "step": 31083 + }, + { + "epoch": 0.40392240308100635, + "grad_norm": 0.4940640330314636, + "learning_rate": 0.00011923991733711122, + "loss": 1.482, + "step": 31084 + }, + { + "epoch": 0.40393539762492225, + "grad_norm": 0.42931312322616577, + "learning_rate": 0.00011923731787519985, + "loss": 1.3735, + "step": 31085 + }, + { + "epoch": 0.4039483921688381, + "grad_norm": 0.44503793120384216, + "learning_rate": 0.00011923471841328846, + "loss": 1.4508, + "step": 31086 + }, + { + "epoch": 0.403961386712754, + "grad_norm": 0.46254003047943115, + "learning_rate": 0.00011923211895137707, + "loss": 1.3514, + "step": 31087 + }, + { + "epoch": 0.40397438125666985, + "grad_norm": 0.41324159502983093, + "learning_rate": 0.00011922951948946568, + "loss": 1.4286, + "step": 31088 + }, + { + "epoch": 0.40398737580058575, + "grad_norm": 0.41091713309288025, + "learning_rate": 0.00011922692002755432, + "loss": 1.4806, + "step": 31089 + }, + { + "epoch": 0.4040003703445016, + "grad_norm": 0.3074330985546112, + "learning_rate": 0.00011922432056564293, + "loss": 1.2096, + "step": 31090 + }, + { + "epoch": 0.4040133648884175, + "grad_norm": 0.37177252769470215, + "learning_rate": 0.00011922172110373153, + "loss": 1.4092, + "step": 31091 + }, + { + "epoch": 0.40402635943233334, + "grad_norm": 0.3361234962940216, + "learning_rate": 0.00011921912164182014, + "loss": 1.1664, + "step": 31092 + }, + { + "epoch": 0.40403935397624924, + "grad_norm": 0.3064337372779846, + "learning_rate": 0.00011921652217990877, + "loss": 1.5362, + "step": 31093 + }, + { + "epoch": 0.4040523485201651, + "grad_norm": 0.5648393630981445, + "learning_rate": 0.00011921392271799739, + "loss": 1.5108, + "step": 31094 + }, + { + "epoch": 0.404065343064081, + "grad_norm": 0.39658382534980774, + "learning_rate": 0.000119211323256086, + "loss": 1.2835, + "step": 31095 + }, + { + "epoch": 0.40407833760799683, + "grad_norm": 0.3781071603298187, + "learning_rate": 0.00011920872379417461, + "loss": 1.3464, + "step": 31096 + }, + { + "epoch": 0.40409133215191273, + "grad_norm": 0.4796808660030365, + "learning_rate": 0.00011920612433226323, + "loss": 1.3243, + "step": 31097 + }, + { + "epoch": 0.4041043266958286, + "grad_norm": 0.31574904918670654, + "learning_rate": 0.00011920352487035184, + "loss": 1.3697, + "step": 31098 + }, + { + "epoch": 0.4041173212397445, + "grad_norm": 0.396132230758667, + "learning_rate": 0.00011920092540844045, + "loss": 1.2984, + "step": 31099 + }, + { + "epoch": 0.4041303157836603, + "grad_norm": 0.3588292598724365, + "learning_rate": 0.00011919832594652906, + "loss": 1.3467, + "step": 31100 + }, + { + "epoch": 0.4041433103275762, + "grad_norm": 0.3943256437778473, + "learning_rate": 0.0001191957264846177, + "loss": 1.3035, + "step": 31101 + }, + { + "epoch": 0.40415630487149207, + "grad_norm": 0.4043674170970917, + "learning_rate": 0.00011919312702270631, + "loss": 1.2018, + "step": 31102 + }, + { + "epoch": 0.40416929941540797, + "grad_norm": 0.46197620034217834, + "learning_rate": 0.00011919052756079492, + "loss": 1.4024, + "step": 31103 + }, + { + "epoch": 0.4041822939593238, + "grad_norm": 0.2812095284461975, + "learning_rate": 0.00011918792809888352, + "loss": 1.3708, + "step": 31104 + }, + { + "epoch": 0.4041952885032397, + "grad_norm": 0.39766836166381836, + "learning_rate": 0.00011918532863697216, + "loss": 1.4108, + "step": 31105 + }, + { + "epoch": 0.40420828304715556, + "grad_norm": 0.4104083776473999, + "learning_rate": 0.00011918272917506077, + "loss": 1.3334, + "step": 31106 + }, + { + "epoch": 0.40422127759107146, + "grad_norm": 0.45874708890914917, + "learning_rate": 0.00011918012971314938, + "loss": 1.4111, + "step": 31107 + }, + { + "epoch": 0.4042342721349873, + "grad_norm": 0.36362719535827637, + "learning_rate": 0.00011917753025123799, + "loss": 1.4653, + "step": 31108 + }, + { + "epoch": 0.4042472666789032, + "grad_norm": 0.40747252106666565, + "learning_rate": 0.00011917493078932662, + "loss": 1.4284, + "step": 31109 + }, + { + "epoch": 0.40426026122281905, + "grad_norm": 0.34531745314598083, + "learning_rate": 0.00011917233132741523, + "loss": 1.257, + "step": 31110 + }, + { + "epoch": 0.40427325576673495, + "grad_norm": 0.49280163645744324, + "learning_rate": 0.00011916973186550384, + "loss": 1.3904, + "step": 31111 + }, + { + "epoch": 0.4042862503106508, + "grad_norm": 0.4759371280670166, + "learning_rate": 0.00011916713240359245, + "loss": 1.2108, + "step": 31112 + }, + { + "epoch": 0.4042992448545667, + "grad_norm": 0.3847648799419403, + "learning_rate": 0.00011916453294168109, + "loss": 1.2552, + "step": 31113 + }, + { + "epoch": 0.40431223939848254, + "grad_norm": 0.42719778418540955, + "learning_rate": 0.0001191619334797697, + "loss": 1.7538, + "step": 31114 + }, + { + "epoch": 0.40432523394239844, + "grad_norm": 0.5223241448402405, + "learning_rate": 0.00011915933401785831, + "loss": 1.4725, + "step": 31115 + }, + { + "epoch": 0.4043382284863143, + "grad_norm": 0.33119985461235046, + "learning_rate": 0.00011915673455594691, + "loss": 1.3077, + "step": 31116 + }, + { + "epoch": 0.4043512230302302, + "grad_norm": 0.32815834879875183, + "learning_rate": 0.00011915413509403555, + "loss": 1.352, + "step": 31117 + }, + { + "epoch": 0.40436421757414603, + "grad_norm": 0.359495609998703, + "learning_rate": 0.00011915153563212416, + "loss": 1.2257, + "step": 31118 + }, + { + "epoch": 0.40437721211806193, + "grad_norm": 0.40769317746162415, + "learning_rate": 0.00011914893617021277, + "loss": 1.4266, + "step": 31119 + }, + { + "epoch": 0.4043902066619778, + "grad_norm": 0.38165783882141113, + "learning_rate": 0.00011914633670830139, + "loss": 1.2537, + "step": 31120 + }, + { + "epoch": 0.4044032012058937, + "grad_norm": 0.33748123049736023, + "learning_rate": 0.00011914373724639, + "loss": 1.2966, + "step": 31121 + }, + { + "epoch": 0.4044161957498095, + "grad_norm": 0.5030595660209656, + "learning_rate": 0.00011914113778447861, + "loss": 1.4478, + "step": 31122 + }, + { + "epoch": 0.4044291902937254, + "grad_norm": 0.32752615213394165, + "learning_rate": 0.00011913853832256722, + "loss": 1.2675, + "step": 31123 + }, + { + "epoch": 0.40444218483764127, + "grad_norm": 0.400953471660614, + "learning_rate": 0.00011913593886065586, + "loss": 1.3851, + "step": 31124 + }, + { + "epoch": 0.4044551793815572, + "grad_norm": 0.3274124562740326, + "learning_rate": 0.00011913333939874447, + "loss": 1.1848, + "step": 31125 + }, + { + "epoch": 0.404468173925473, + "grad_norm": 0.273478627204895, + "learning_rate": 0.00011913073993683308, + "loss": 1.2055, + "step": 31126 + }, + { + "epoch": 0.4044811684693889, + "grad_norm": 0.3876365125179291, + "learning_rate": 0.0001191281404749217, + "loss": 1.4481, + "step": 31127 + }, + { + "epoch": 0.40449416301330476, + "grad_norm": 0.580828845500946, + "learning_rate": 0.00011912554101301032, + "loss": 1.3591, + "step": 31128 + }, + { + "epoch": 0.40450715755722066, + "grad_norm": 0.29494908452033997, + "learning_rate": 0.00011912294155109893, + "loss": 1.2555, + "step": 31129 + }, + { + "epoch": 0.4045201521011365, + "grad_norm": 0.4451282024383545, + "learning_rate": 0.00011912034208918754, + "loss": 1.4914, + "step": 31130 + }, + { + "epoch": 0.4045331466450524, + "grad_norm": 0.4182499945163727, + "learning_rate": 0.00011911774262727615, + "loss": 1.3394, + "step": 31131 + }, + { + "epoch": 0.40454614118896826, + "grad_norm": 0.36886388063430786, + "learning_rate": 0.00011911514316536479, + "loss": 1.2085, + "step": 31132 + }, + { + "epoch": 0.40455913573288416, + "grad_norm": 0.41878238320350647, + "learning_rate": 0.00011911254370345339, + "loss": 1.4315, + "step": 31133 + }, + { + "epoch": 0.4045721302768, + "grad_norm": 0.3612288534641266, + "learning_rate": 0.000119109944241542, + "loss": 1.1751, + "step": 31134 + }, + { + "epoch": 0.4045851248207159, + "grad_norm": 0.39339253306388855, + "learning_rate": 0.00011910734477963061, + "loss": 1.4236, + "step": 31135 + }, + { + "epoch": 0.40459811936463175, + "grad_norm": 0.39024654030799866, + "learning_rate": 0.00011910474531771925, + "loss": 1.3813, + "step": 31136 + }, + { + "epoch": 0.40461111390854765, + "grad_norm": 0.39640891551971436, + "learning_rate": 0.00011910214585580786, + "loss": 1.3948, + "step": 31137 + }, + { + "epoch": 0.40462410845246355, + "grad_norm": 0.40515291690826416, + "learning_rate": 0.00011909954639389647, + "loss": 1.3752, + "step": 31138 + }, + { + "epoch": 0.4046371029963794, + "grad_norm": 0.519363284111023, + "learning_rate": 0.00011909694693198508, + "loss": 1.2778, + "step": 31139 + }, + { + "epoch": 0.4046500975402953, + "grad_norm": 0.4499009847640991, + "learning_rate": 0.0001190943474700737, + "loss": 1.438, + "step": 31140 + }, + { + "epoch": 0.40466309208421114, + "grad_norm": 0.3646060526371002, + "learning_rate": 0.00011909174800816232, + "loss": 1.202, + "step": 31141 + }, + { + "epoch": 0.40467608662812704, + "grad_norm": 0.3819766640663147, + "learning_rate": 0.00011908914854625093, + "loss": 1.5017, + "step": 31142 + }, + { + "epoch": 0.4046890811720429, + "grad_norm": 0.4532409608364105, + "learning_rate": 0.00011908654908433954, + "loss": 1.3157, + "step": 31143 + }, + { + "epoch": 0.4047020757159588, + "grad_norm": 0.41658562421798706, + "learning_rate": 0.00011908394962242818, + "loss": 1.4714, + "step": 31144 + }, + { + "epoch": 0.40471507025987463, + "grad_norm": 0.3146139681339264, + "learning_rate": 0.00011908135016051679, + "loss": 1.5751, + "step": 31145 + }, + { + "epoch": 0.40472806480379053, + "grad_norm": 0.36841073632240295, + "learning_rate": 0.00011907875069860538, + "loss": 1.2612, + "step": 31146 + }, + { + "epoch": 0.4047410593477064, + "grad_norm": 0.3991709351539612, + "learning_rate": 0.000119076151236694, + "loss": 1.4681, + "step": 31147 + }, + { + "epoch": 0.4047540538916223, + "grad_norm": 0.505599856376648, + "learning_rate": 0.00011907355177478263, + "loss": 1.6774, + "step": 31148 + }, + { + "epoch": 0.4047670484355381, + "grad_norm": 0.3352959156036377, + "learning_rate": 0.00011907095231287124, + "loss": 1.1709, + "step": 31149 + }, + { + "epoch": 0.404780042979454, + "grad_norm": 0.43015971779823303, + "learning_rate": 0.00011906835285095986, + "loss": 1.4067, + "step": 31150 + }, + { + "epoch": 0.40479303752336987, + "grad_norm": 0.3387715816497803, + "learning_rate": 0.00011906575338904847, + "loss": 1.2522, + "step": 31151 + }, + { + "epoch": 0.40480603206728577, + "grad_norm": 0.3760809004306793, + "learning_rate": 0.00011906315392713709, + "loss": 1.3888, + "step": 31152 + }, + { + "epoch": 0.4048190266112016, + "grad_norm": 0.40547674894332886, + "learning_rate": 0.0001190605544652257, + "loss": 1.3071, + "step": 31153 + }, + { + "epoch": 0.4048320211551175, + "grad_norm": 0.30349001288414, + "learning_rate": 0.00011905795500331431, + "loss": 1.3224, + "step": 31154 + }, + { + "epoch": 0.40484501569903336, + "grad_norm": 0.3696651756763458, + "learning_rate": 0.00011905535554140292, + "loss": 1.3081, + "step": 31155 + }, + { + "epoch": 0.40485801024294926, + "grad_norm": 0.38968411087989807, + "learning_rate": 0.00011905275607949156, + "loss": 1.2858, + "step": 31156 + }, + { + "epoch": 0.4048710047868651, + "grad_norm": 0.41720980405807495, + "learning_rate": 0.00011905015661758017, + "loss": 1.4594, + "step": 31157 + }, + { + "epoch": 0.404883999330781, + "grad_norm": 0.49381259083747864, + "learning_rate": 0.00011904755715566877, + "loss": 1.3585, + "step": 31158 + }, + { + "epoch": 0.40489699387469685, + "grad_norm": 0.31462299823760986, + "learning_rate": 0.00011904495769375741, + "loss": 1.3144, + "step": 31159 + }, + { + "epoch": 0.40490998841861275, + "grad_norm": 0.47582030296325684, + "learning_rate": 0.00011904235823184602, + "loss": 1.3183, + "step": 31160 + }, + { + "epoch": 0.4049229829625286, + "grad_norm": 0.4573417901992798, + "learning_rate": 0.00011903975876993463, + "loss": 1.4331, + "step": 31161 + }, + { + "epoch": 0.4049359775064445, + "grad_norm": 0.42500001192092896, + "learning_rate": 0.00011903715930802324, + "loss": 1.4202, + "step": 31162 + }, + { + "epoch": 0.40494897205036035, + "grad_norm": 0.48033180832862854, + "learning_rate": 0.00011903455984611186, + "loss": 1.4146, + "step": 31163 + }, + { + "epoch": 0.40496196659427625, + "grad_norm": 0.4172731041908264, + "learning_rate": 0.00011903196038420048, + "loss": 1.4106, + "step": 31164 + }, + { + "epoch": 0.4049749611381921, + "grad_norm": 0.4109196960926056, + "learning_rate": 0.00011902936092228909, + "loss": 1.0949, + "step": 31165 + }, + { + "epoch": 0.404987955682108, + "grad_norm": 0.5177567601203918, + "learning_rate": 0.0001190267614603777, + "loss": 1.5222, + "step": 31166 + }, + { + "epoch": 0.40500095022602384, + "grad_norm": 0.49179255962371826, + "learning_rate": 0.00011902416199846634, + "loss": 1.3764, + "step": 31167 + }, + { + "epoch": 0.40501394476993974, + "grad_norm": 0.48563122749328613, + "learning_rate": 0.00011902156253655495, + "loss": 1.3855, + "step": 31168 + }, + { + "epoch": 0.4050269393138556, + "grad_norm": 0.36977359652519226, + "learning_rate": 0.00011901896307464356, + "loss": 1.3776, + "step": 31169 + }, + { + "epoch": 0.4050399338577715, + "grad_norm": 0.42352986335754395, + "learning_rate": 0.00011901636361273217, + "loss": 1.2775, + "step": 31170 + }, + { + "epoch": 0.40505292840168733, + "grad_norm": 0.3878430724143982, + "learning_rate": 0.00011901376415082079, + "loss": 1.3114, + "step": 31171 + }, + { + "epoch": 0.40506592294560323, + "grad_norm": 0.4392317235469818, + "learning_rate": 0.0001190111646889094, + "loss": 1.3503, + "step": 31172 + }, + { + "epoch": 0.4050789174895191, + "grad_norm": 0.4080732464790344, + "learning_rate": 0.00011900856522699801, + "loss": 1.3054, + "step": 31173 + }, + { + "epoch": 0.405091912033435, + "grad_norm": 0.4280981123447418, + "learning_rate": 0.00011900596576508663, + "loss": 1.4308, + "step": 31174 + }, + { + "epoch": 0.4051049065773508, + "grad_norm": 0.38070032000541687, + "learning_rate": 0.00011900336630317525, + "loss": 1.2601, + "step": 31175 + }, + { + "epoch": 0.4051179011212667, + "grad_norm": 0.4179835319519043, + "learning_rate": 0.00011900076684126386, + "loss": 1.4414, + "step": 31176 + }, + { + "epoch": 0.40513089566518257, + "grad_norm": 0.4899497628211975, + "learning_rate": 0.00011899816737935247, + "loss": 1.4814, + "step": 31177 + }, + { + "epoch": 0.40514389020909847, + "grad_norm": 0.38369104266166687, + "learning_rate": 0.00011899556791744108, + "loss": 1.2864, + "step": 31178 + }, + { + "epoch": 0.4051568847530143, + "grad_norm": 0.39225828647613525, + "learning_rate": 0.00011899296845552972, + "loss": 1.3075, + "step": 31179 + }, + { + "epoch": 0.4051698792969302, + "grad_norm": 0.3863079845905304, + "learning_rate": 0.00011899036899361833, + "loss": 1.5609, + "step": 31180 + }, + { + "epoch": 0.40518287384084606, + "grad_norm": 0.34979888796806335, + "learning_rate": 0.00011898776953170694, + "loss": 1.2385, + "step": 31181 + }, + { + "epoch": 0.40519586838476196, + "grad_norm": 0.40420815348625183, + "learning_rate": 0.00011898517006979555, + "loss": 1.3738, + "step": 31182 + }, + { + "epoch": 0.4052088629286778, + "grad_norm": 0.4980183243751526, + "learning_rate": 0.00011898257060788418, + "loss": 1.4821, + "step": 31183 + }, + { + "epoch": 0.4052218574725937, + "grad_norm": 0.26523557305336, + "learning_rate": 0.00011897997114597279, + "loss": 1.2932, + "step": 31184 + }, + { + "epoch": 0.40523485201650955, + "grad_norm": 0.4374240040779114, + "learning_rate": 0.0001189773716840614, + "loss": 1.3174, + "step": 31185 + }, + { + "epoch": 0.40524784656042545, + "grad_norm": 0.4220539927482605, + "learning_rate": 0.00011897477222215001, + "loss": 1.3367, + "step": 31186 + }, + { + "epoch": 0.4052608411043413, + "grad_norm": 0.38757675886154175, + "learning_rate": 0.00011897217276023865, + "loss": 1.4012, + "step": 31187 + }, + { + "epoch": 0.4052738356482572, + "grad_norm": 0.39984625577926636, + "learning_rate": 0.00011896957329832725, + "loss": 1.3633, + "step": 31188 + }, + { + "epoch": 0.40528683019217304, + "grad_norm": 0.3410710394382477, + "learning_rate": 0.00011896697383641586, + "loss": 1.316, + "step": 31189 + }, + { + "epoch": 0.40529982473608894, + "grad_norm": 0.4982589781284332, + "learning_rate": 0.00011896437437450447, + "loss": 1.4565, + "step": 31190 + }, + { + "epoch": 0.4053128192800048, + "grad_norm": 0.495503693819046, + "learning_rate": 0.0001189617749125931, + "loss": 1.5095, + "step": 31191 + }, + { + "epoch": 0.4053258138239207, + "grad_norm": 0.3099004030227661, + "learning_rate": 0.00011895917545068172, + "loss": 1.2597, + "step": 31192 + }, + { + "epoch": 0.40533880836783653, + "grad_norm": 0.430729478597641, + "learning_rate": 0.00011895657598877033, + "loss": 1.4133, + "step": 31193 + }, + { + "epoch": 0.40535180291175243, + "grad_norm": 0.4965914189815521, + "learning_rate": 0.00011895397652685895, + "loss": 1.4518, + "step": 31194 + }, + { + "epoch": 0.4053647974556683, + "grad_norm": 0.3689592182636261, + "learning_rate": 0.00011895137706494756, + "loss": 1.326, + "step": 31195 + }, + { + "epoch": 0.4053777919995842, + "grad_norm": 0.5080020427703857, + "learning_rate": 0.00011894877760303617, + "loss": 1.1317, + "step": 31196 + }, + { + "epoch": 0.4053907865435, + "grad_norm": 0.37928450107574463, + "learning_rate": 0.00011894617814112479, + "loss": 1.4909, + "step": 31197 + }, + { + "epoch": 0.4054037810874159, + "grad_norm": 0.5257855653762817, + "learning_rate": 0.00011894357867921342, + "loss": 1.4967, + "step": 31198 + }, + { + "epoch": 0.40541677563133177, + "grad_norm": 0.3613608181476593, + "learning_rate": 0.00011894097921730203, + "loss": 1.473, + "step": 31199 + }, + { + "epoch": 0.4054297701752477, + "grad_norm": 0.38127535581588745, + "learning_rate": 0.00011893837975539063, + "loss": 1.3844, + "step": 31200 + }, + { + "epoch": 0.4054427647191635, + "grad_norm": 0.3791249096393585, + "learning_rate": 0.00011893578029347924, + "loss": 1.3042, + "step": 31201 + }, + { + "epoch": 0.4054557592630794, + "grad_norm": 0.3875804841518402, + "learning_rate": 0.00011893318083156788, + "loss": 1.4339, + "step": 31202 + }, + { + "epoch": 0.40546875380699526, + "grad_norm": 0.31077125668525696, + "learning_rate": 0.00011893058136965649, + "loss": 1.1927, + "step": 31203 + }, + { + "epoch": 0.40548174835091116, + "grad_norm": 0.32536572217941284, + "learning_rate": 0.0001189279819077451, + "loss": 1.4645, + "step": 31204 + }, + { + "epoch": 0.405494742894827, + "grad_norm": 0.3629818558692932, + "learning_rate": 0.00011892538244583371, + "loss": 1.3366, + "step": 31205 + }, + { + "epoch": 0.4055077374387429, + "grad_norm": 0.37646088004112244, + "learning_rate": 0.00011892278298392234, + "loss": 1.2986, + "step": 31206 + }, + { + "epoch": 0.40552073198265876, + "grad_norm": 0.3595403730869293, + "learning_rate": 0.00011892018352201095, + "loss": 1.4383, + "step": 31207 + }, + { + "epoch": 0.40553372652657466, + "grad_norm": 0.32527852058410645, + "learning_rate": 0.00011891758406009956, + "loss": 1.4019, + "step": 31208 + }, + { + "epoch": 0.4055467210704905, + "grad_norm": 0.372487336397171, + "learning_rate": 0.00011891498459818817, + "loss": 1.4469, + "step": 31209 + }, + { + "epoch": 0.4055597156144064, + "grad_norm": 0.3933354318141937, + "learning_rate": 0.00011891238513627681, + "loss": 1.4328, + "step": 31210 + }, + { + "epoch": 0.40557271015832225, + "grad_norm": 0.43458229303359985, + "learning_rate": 0.00011890978567436542, + "loss": 1.2702, + "step": 31211 + }, + { + "epoch": 0.40558570470223815, + "grad_norm": 0.4470691382884979, + "learning_rate": 0.00011890718621245403, + "loss": 1.4581, + "step": 31212 + }, + { + "epoch": 0.40559869924615405, + "grad_norm": 0.35403117537498474, + "learning_rate": 0.00011890458675054263, + "loss": 1.4024, + "step": 31213 + }, + { + "epoch": 0.4056116937900699, + "grad_norm": 0.4237093925476074, + "learning_rate": 0.00011890198728863127, + "loss": 1.5175, + "step": 31214 + }, + { + "epoch": 0.4056246883339858, + "grad_norm": 0.35908371210098267, + "learning_rate": 0.00011889938782671988, + "loss": 1.2216, + "step": 31215 + }, + { + "epoch": 0.40563768287790164, + "grad_norm": 0.5276117324829102, + "learning_rate": 0.00011889678836480849, + "loss": 1.5198, + "step": 31216 + }, + { + "epoch": 0.40565067742181754, + "grad_norm": 0.3601825535297394, + "learning_rate": 0.0001188941889028971, + "loss": 1.3974, + "step": 31217 + }, + { + "epoch": 0.4056636719657334, + "grad_norm": 0.4296603798866272, + "learning_rate": 0.00011889158944098572, + "loss": 1.5108, + "step": 31218 + }, + { + "epoch": 0.4056766665096493, + "grad_norm": 0.4216512143611908, + "learning_rate": 0.00011888898997907433, + "loss": 1.3085, + "step": 31219 + }, + { + "epoch": 0.40568966105356513, + "grad_norm": 0.42654138803482056, + "learning_rate": 0.00011888639051716295, + "loss": 1.4722, + "step": 31220 + }, + { + "epoch": 0.40570265559748103, + "grad_norm": 0.40836450457572937, + "learning_rate": 0.00011888379105525156, + "loss": 1.4478, + "step": 31221 + }, + { + "epoch": 0.4057156501413969, + "grad_norm": 0.4115433990955353, + "learning_rate": 0.0001188811915933402, + "loss": 1.4614, + "step": 31222 + }, + { + "epoch": 0.4057286446853128, + "grad_norm": 0.2881373167037964, + "learning_rate": 0.0001188785921314288, + "loss": 1.3968, + "step": 31223 + }, + { + "epoch": 0.4057416392292286, + "grad_norm": 0.4314188063144684, + "learning_rate": 0.00011887599266951742, + "loss": 1.4829, + "step": 31224 + }, + { + "epoch": 0.4057546337731445, + "grad_norm": 0.360015869140625, + "learning_rate": 0.00011887339320760603, + "loss": 1.4896, + "step": 31225 + }, + { + "epoch": 0.40576762831706037, + "grad_norm": 0.3828166723251343, + "learning_rate": 0.00011887079374569465, + "loss": 1.4096, + "step": 31226 + }, + { + "epoch": 0.40578062286097627, + "grad_norm": 0.501865804195404, + "learning_rate": 0.00011886819428378326, + "loss": 1.413, + "step": 31227 + }, + { + "epoch": 0.4057936174048921, + "grad_norm": 0.414804607629776, + "learning_rate": 0.00011886559482187187, + "loss": 1.5255, + "step": 31228 + }, + { + "epoch": 0.405806611948808, + "grad_norm": 0.3666289150714874, + "learning_rate": 0.00011886299535996048, + "loss": 1.3012, + "step": 31229 + }, + { + "epoch": 0.40581960649272386, + "grad_norm": 0.4416143298149109, + "learning_rate": 0.00011886039589804911, + "loss": 1.5082, + "step": 31230 + }, + { + "epoch": 0.40583260103663976, + "grad_norm": 0.38721030950546265, + "learning_rate": 0.00011885779643613772, + "loss": 1.4334, + "step": 31231 + }, + { + "epoch": 0.4058455955805556, + "grad_norm": 0.38310903310775757, + "learning_rate": 0.00011885519697422633, + "loss": 1.6196, + "step": 31232 + }, + { + "epoch": 0.4058585901244715, + "grad_norm": 0.510926365852356, + "learning_rate": 0.00011885259751231497, + "loss": 1.4067, + "step": 31233 + }, + { + "epoch": 0.40587158466838735, + "grad_norm": 0.3168974220752716, + "learning_rate": 0.00011884999805040358, + "loss": 1.3574, + "step": 31234 + }, + { + "epoch": 0.40588457921230325, + "grad_norm": 0.45922380685806274, + "learning_rate": 0.00011884739858849219, + "loss": 1.3532, + "step": 31235 + }, + { + "epoch": 0.4058975737562191, + "grad_norm": 0.332072377204895, + "learning_rate": 0.0001188447991265808, + "loss": 1.5918, + "step": 31236 + }, + { + "epoch": 0.405910568300135, + "grad_norm": 0.5135757327079773, + "learning_rate": 0.00011884219966466943, + "loss": 1.4235, + "step": 31237 + }, + { + "epoch": 0.40592356284405084, + "grad_norm": 0.35514453053474426, + "learning_rate": 0.00011883960020275804, + "loss": 1.3901, + "step": 31238 + }, + { + "epoch": 0.40593655738796675, + "grad_norm": 0.3588900566101074, + "learning_rate": 0.00011883700074084665, + "loss": 1.3905, + "step": 31239 + }, + { + "epoch": 0.4059495519318826, + "grad_norm": 0.46052685379981995, + "learning_rate": 0.00011883440127893526, + "loss": 1.3979, + "step": 31240 + }, + { + "epoch": 0.4059625464757985, + "grad_norm": 0.40927180647850037, + "learning_rate": 0.0001188318018170239, + "loss": 1.1847, + "step": 31241 + }, + { + "epoch": 0.40597554101971434, + "grad_norm": 0.33551061153411865, + "learning_rate": 0.0001188292023551125, + "loss": 1.3197, + "step": 31242 + }, + { + "epoch": 0.40598853556363024, + "grad_norm": 0.34213122725486755, + "learning_rate": 0.0001188266028932011, + "loss": 1.3425, + "step": 31243 + }, + { + "epoch": 0.4060015301075461, + "grad_norm": 0.2508379817008972, + "learning_rate": 0.00011882400343128972, + "loss": 1.222, + "step": 31244 + }, + { + "epoch": 0.406014524651462, + "grad_norm": 0.3861617147922516, + "learning_rate": 0.00011882140396937835, + "loss": 1.3168, + "step": 31245 + }, + { + "epoch": 0.40602751919537783, + "grad_norm": 0.4054805338382721, + "learning_rate": 0.00011881880450746697, + "loss": 1.5717, + "step": 31246 + }, + { + "epoch": 0.40604051373929373, + "grad_norm": 0.5045730471611023, + "learning_rate": 0.00011881620504555558, + "loss": 1.273, + "step": 31247 + }, + { + "epoch": 0.4060535082832096, + "grad_norm": 0.4012494385242462, + "learning_rate": 0.00011881360558364419, + "loss": 1.3772, + "step": 31248 + }, + { + "epoch": 0.4060665028271255, + "grad_norm": 0.39588335156440735, + "learning_rate": 0.00011881100612173281, + "loss": 1.5071, + "step": 31249 + }, + { + "epoch": 0.4060794973710413, + "grad_norm": 0.33543112874031067, + "learning_rate": 0.00011880840665982142, + "loss": 1.4006, + "step": 31250 + }, + { + "epoch": 0.4060924919149572, + "grad_norm": 0.3463461399078369, + "learning_rate": 0.00011880580719791003, + "loss": 1.4128, + "step": 31251 + }, + { + "epoch": 0.40610548645887307, + "grad_norm": 0.42646366357803345, + "learning_rate": 0.00011880320773599864, + "loss": 1.3896, + "step": 31252 + }, + { + "epoch": 0.40611848100278897, + "grad_norm": 0.396257221698761, + "learning_rate": 0.00011880060827408728, + "loss": 1.542, + "step": 31253 + }, + { + "epoch": 0.4061314755467048, + "grad_norm": 0.38964149355888367, + "learning_rate": 0.0001187980088121759, + "loss": 1.4644, + "step": 31254 + }, + { + "epoch": 0.4061444700906207, + "grad_norm": 0.3986106216907501, + "learning_rate": 0.00011879540935026449, + "loss": 1.3015, + "step": 31255 + }, + { + "epoch": 0.40615746463453656, + "grad_norm": 0.41529545187950134, + "learning_rate": 0.0001187928098883531, + "loss": 1.5393, + "step": 31256 + }, + { + "epoch": 0.40617045917845246, + "grad_norm": 0.40253400802612305, + "learning_rate": 0.00011879021042644174, + "loss": 1.4339, + "step": 31257 + }, + { + "epoch": 0.4061834537223683, + "grad_norm": 0.43429264426231384, + "learning_rate": 0.00011878761096453035, + "loss": 1.2613, + "step": 31258 + }, + { + "epoch": 0.4061964482662842, + "grad_norm": 0.2691577672958374, + "learning_rate": 0.00011878501150261896, + "loss": 1.4561, + "step": 31259 + }, + { + "epoch": 0.40620944281020005, + "grad_norm": 0.40096017718315125, + "learning_rate": 0.00011878241204070757, + "loss": 1.4678, + "step": 31260 + }, + { + "epoch": 0.40622243735411595, + "grad_norm": 0.4558047950267792, + "learning_rate": 0.0001187798125787962, + "loss": 1.3217, + "step": 31261 + }, + { + "epoch": 0.4062354318980318, + "grad_norm": 0.3821234107017517, + "learning_rate": 0.00011877721311688481, + "loss": 1.2376, + "step": 31262 + }, + { + "epoch": 0.4062484264419477, + "grad_norm": 0.40908125042915344, + "learning_rate": 0.00011877461365497342, + "loss": 1.635, + "step": 31263 + }, + { + "epoch": 0.40626142098586354, + "grad_norm": 0.3726956844329834, + "learning_rate": 0.00011877201419306203, + "loss": 1.4287, + "step": 31264 + }, + { + "epoch": 0.40627441552977944, + "grad_norm": 0.4645557701587677, + "learning_rate": 0.00011876941473115067, + "loss": 1.5008, + "step": 31265 + }, + { + "epoch": 0.4062874100736953, + "grad_norm": 0.4110972583293915, + "learning_rate": 0.00011876681526923928, + "loss": 1.344, + "step": 31266 + }, + { + "epoch": 0.4063004046176112, + "grad_norm": 0.4408775269985199, + "learning_rate": 0.00011876421580732789, + "loss": 1.4052, + "step": 31267 + }, + { + "epoch": 0.40631339916152703, + "grad_norm": 0.384743869304657, + "learning_rate": 0.00011876161634541651, + "loss": 1.4074, + "step": 31268 + }, + { + "epoch": 0.40632639370544293, + "grad_norm": 0.4842003285884857, + "learning_rate": 0.00011875901688350513, + "loss": 1.3696, + "step": 31269 + }, + { + "epoch": 0.4063393882493588, + "grad_norm": 0.5091818571090698, + "learning_rate": 0.00011875641742159374, + "loss": 1.6031, + "step": 31270 + }, + { + "epoch": 0.4063523827932747, + "grad_norm": 0.34551045298576355, + "learning_rate": 0.00011875381795968235, + "loss": 1.4607, + "step": 31271 + }, + { + "epoch": 0.4063653773371905, + "grad_norm": 0.3323245346546173, + "learning_rate": 0.00011875121849777097, + "loss": 1.0766, + "step": 31272 + }, + { + "epoch": 0.4063783718811064, + "grad_norm": 0.70066899061203, + "learning_rate": 0.00011874861903585958, + "loss": 1.2979, + "step": 31273 + }, + { + "epoch": 0.40639136642502227, + "grad_norm": 0.45902806520462036, + "learning_rate": 0.0001187460195739482, + "loss": 1.3914, + "step": 31274 + }, + { + "epoch": 0.40640436096893817, + "grad_norm": 0.447151780128479, + "learning_rate": 0.0001187434201120368, + "loss": 1.2701, + "step": 31275 + }, + { + "epoch": 0.406417355512854, + "grad_norm": 0.2966899871826172, + "learning_rate": 0.00011874082065012544, + "loss": 1.212, + "step": 31276 + }, + { + "epoch": 0.4064303500567699, + "grad_norm": 0.37040624022483826, + "learning_rate": 0.00011873822118821405, + "loss": 1.1205, + "step": 31277 + }, + { + "epoch": 0.40644334460068576, + "grad_norm": 0.4391426742076874, + "learning_rate": 0.00011873562172630266, + "loss": 1.4174, + "step": 31278 + }, + { + "epoch": 0.40645633914460166, + "grad_norm": 0.30764180421829224, + "learning_rate": 0.00011873302226439128, + "loss": 1.3751, + "step": 31279 + }, + { + "epoch": 0.4064693336885175, + "grad_norm": 0.35065677762031555, + "learning_rate": 0.0001187304228024799, + "loss": 1.2258, + "step": 31280 + }, + { + "epoch": 0.4064823282324334, + "grad_norm": 0.3802269399166107, + "learning_rate": 0.00011872782334056851, + "loss": 1.5252, + "step": 31281 + }, + { + "epoch": 0.40649532277634925, + "grad_norm": 0.45753800868988037, + "learning_rate": 0.00011872522387865712, + "loss": 1.2664, + "step": 31282 + }, + { + "epoch": 0.40650831732026516, + "grad_norm": 0.41795745491981506, + "learning_rate": 0.00011872262441674573, + "loss": 1.4199, + "step": 31283 + }, + { + "epoch": 0.406521311864181, + "grad_norm": 0.4111131727695465, + "learning_rate": 0.00011872002495483436, + "loss": 1.3316, + "step": 31284 + }, + { + "epoch": 0.4065343064080969, + "grad_norm": 0.3360668122768402, + "learning_rate": 0.00011871742549292297, + "loss": 1.2689, + "step": 31285 + }, + { + "epoch": 0.40654730095201275, + "grad_norm": 0.3608076274394989, + "learning_rate": 0.00011871482603101158, + "loss": 1.6717, + "step": 31286 + }, + { + "epoch": 0.40656029549592865, + "grad_norm": 0.3970521092414856, + "learning_rate": 0.00011871222656910019, + "loss": 1.4533, + "step": 31287 + }, + { + "epoch": 0.4065732900398445, + "grad_norm": 0.40583324432373047, + "learning_rate": 0.00011870962710718883, + "loss": 1.2344, + "step": 31288 + }, + { + "epoch": 0.4065862845837604, + "grad_norm": 0.2829705774784088, + "learning_rate": 0.00011870702764527744, + "loss": 1.2993, + "step": 31289 + }, + { + "epoch": 0.4065992791276763, + "grad_norm": 0.41404134035110474, + "learning_rate": 0.00011870442818336605, + "loss": 1.5585, + "step": 31290 + }, + { + "epoch": 0.40661227367159214, + "grad_norm": 0.392275333404541, + "learning_rate": 0.00011870182872145466, + "loss": 1.2174, + "step": 31291 + }, + { + "epoch": 0.40662526821550804, + "grad_norm": 0.41608232259750366, + "learning_rate": 0.00011869922925954328, + "loss": 1.3351, + "step": 31292 + }, + { + "epoch": 0.4066382627594239, + "grad_norm": 0.30293115973472595, + "learning_rate": 0.0001186966297976319, + "loss": 1.3793, + "step": 31293 + }, + { + "epoch": 0.4066512573033398, + "grad_norm": 0.3355635106563568, + "learning_rate": 0.00011869403033572051, + "loss": 1.2915, + "step": 31294 + }, + { + "epoch": 0.40666425184725563, + "grad_norm": 0.47416988015174866, + "learning_rate": 0.00011869143087380912, + "loss": 1.4625, + "step": 31295 + }, + { + "epoch": 0.40667724639117153, + "grad_norm": 0.37027180194854736, + "learning_rate": 0.00011868883141189776, + "loss": 1.439, + "step": 31296 + }, + { + "epoch": 0.4066902409350874, + "grad_norm": 0.3935788869857788, + "learning_rate": 0.00011868623194998635, + "loss": 1.4832, + "step": 31297 + }, + { + "epoch": 0.4067032354790033, + "grad_norm": 0.3641882836818695, + "learning_rate": 0.00011868363248807496, + "loss": 1.4582, + "step": 31298 + }, + { + "epoch": 0.4067162300229191, + "grad_norm": 0.31681060791015625, + "learning_rate": 0.00011868103302616358, + "loss": 1.2232, + "step": 31299 + }, + { + "epoch": 0.406729224566835, + "grad_norm": 0.44886142015457153, + "learning_rate": 0.00011867843356425221, + "loss": 1.4125, + "step": 31300 + }, + { + "epoch": 0.40674221911075087, + "grad_norm": 0.37310972809791565, + "learning_rate": 0.00011867583410234082, + "loss": 1.3819, + "step": 31301 + }, + { + "epoch": 0.40675521365466677, + "grad_norm": 0.4611188471317291, + "learning_rate": 0.00011867323464042943, + "loss": 1.4512, + "step": 31302 + }, + { + "epoch": 0.4067682081985826, + "grad_norm": 0.3597484827041626, + "learning_rate": 0.00011867063517851805, + "loss": 1.3436, + "step": 31303 + }, + { + "epoch": 0.4067812027424985, + "grad_norm": 0.42691949009895325, + "learning_rate": 0.00011866803571660667, + "loss": 1.424, + "step": 31304 + }, + { + "epoch": 0.40679419728641436, + "grad_norm": 0.5390656590461731, + "learning_rate": 0.00011866543625469528, + "loss": 1.5211, + "step": 31305 + }, + { + "epoch": 0.40680719183033026, + "grad_norm": 0.41673198342323303, + "learning_rate": 0.00011866283679278389, + "loss": 1.464, + "step": 31306 + }, + { + "epoch": 0.4068201863742461, + "grad_norm": 0.39755260944366455, + "learning_rate": 0.00011866023733087253, + "loss": 1.3604, + "step": 31307 + }, + { + "epoch": 0.406833180918162, + "grad_norm": 0.3578665554523468, + "learning_rate": 0.00011865763786896114, + "loss": 1.3936, + "step": 31308 + }, + { + "epoch": 0.40684617546207785, + "grad_norm": 0.4285860061645508, + "learning_rate": 0.00011865503840704975, + "loss": 1.4835, + "step": 31309 + }, + { + "epoch": 0.40685917000599375, + "grad_norm": 0.43358784914016724, + "learning_rate": 0.00011865243894513835, + "loss": 1.4817, + "step": 31310 + }, + { + "epoch": 0.4068721645499096, + "grad_norm": 0.4867531955242157, + "learning_rate": 0.00011864983948322699, + "loss": 1.3186, + "step": 31311 + }, + { + "epoch": 0.4068851590938255, + "grad_norm": 0.310566246509552, + "learning_rate": 0.0001186472400213156, + "loss": 1.5392, + "step": 31312 + }, + { + "epoch": 0.40689815363774134, + "grad_norm": 0.4140184223651886, + "learning_rate": 0.00011864464055940421, + "loss": 1.4285, + "step": 31313 + }, + { + "epoch": 0.40691114818165725, + "grad_norm": 0.4563678205013275, + "learning_rate": 0.00011864204109749282, + "loss": 1.3861, + "step": 31314 + }, + { + "epoch": 0.4069241427255731, + "grad_norm": 0.40866297483444214, + "learning_rate": 0.00011863944163558144, + "loss": 1.4891, + "step": 31315 + }, + { + "epoch": 0.406937137269489, + "grad_norm": 0.4171508550643921, + "learning_rate": 0.00011863684217367006, + "loss": 1.5697, + "step": 31316 + }, + { + "epoch": 0.40695013181340484, + "grad_norm": 0.4434012770652771, + "learning_rate": 0.00011863424271175867, + "loss": 1.4789, + "step": 31317 + }, + { + "epoch": 0.40696312635732074, + "grad_norm": 0.4278556704521179, + "learning_rate": 0.00011863164324984728, + "loss": 1.4238, + "step": 31318 + }, + { + "epoch": 0.4069761209012366, + "grad_norm": 0.3635933995246887, + "learning_rate": 0.00011862904378793592, + "loss": 1.4068, + "step": 31319 + }, + { + "epoch": 0.4069891154451525, + "grad_norm": 0.37037113308906555, + "learning_rate": 0.00011862644432602453, + "loss": 1.3402, + "step": 31320 + }, + { + "epoch": 0.40700210998906833, + "grad_norm": 0.3789578974246979, + "learning_rate": 0.00011862384486411314, + "loss": 1.36, + "step": 31321 + }, + { + "epoch": 0.40701510453298423, + "grad_norm": 0.4212924540042877, + "learning_rate": 0.00011862124540220175, + "loss": 1.4364, + "step": 31322 + }, + { + "epoch": 0.4070280990769001, + "grad_norm": 0.4389123320579529, + "learning_rate": 0.00011861864594029037, + "loss": 1.3789, + "step": 31323 + }, + { + "epoch": 0.407041093620816, + "grad_norm": 0.372774600982666, + "learning_rate": 0.00011861604647837898, + "loss": 1.5103, + "step": 31324 + }, + { + "epoch": 0.4070540881647318, + "grad_norm": 0.463238388299942, + "learning_rate": 0.0001186134470164676, + "loss": 1.4617, + "step": 31325 + }, + { + "epoch": 0.4070670827086477, + "grad_norm": 0.45295020937919617, + "learning_rate": 0.0001186108475545562, + "loss": 1.3962, + "step": 31326 + }, + { + "epoch": 0.40708007725256357, + "grad_norm": 0.3821038007736206, + "learning_rate": 0.00011860824809264483, + "loss": 1.2124, + "step": 31327 + }, + { + "epoch": 0.40709307179647947, + "grad_norm": 0.2766205966472626, + "learning_rate": 0.00011860564863073344, + "loss": 1.3476, + "step": 31328 + }, + { + "epoch": 0.4071060663403953, + "grad_norm": 0.39438456296920776, + "learning_rate": 0.00011860304916882205, + "loss": 1.4519, + "step": 31329 + }, + { + "epoch": 0.4071190608843112, + "grad_norm": 0.3885607421398163, + "learning_rate": 0.00011860044970691066, + "loss": 1.445, + "step": 31330 + }, + { + "epoch": 0.40713205542822706, + "grad_norm": 0.3620757758617401, + "learning_rate": 0.0001185978502449993, + "loss": 1.4061, + "step": 31331 + }, + { + "epoch": 0.40714504997214296, + "grad_norm": 0.49417468905448914, + "learning_rate": 0.00011859525078308791, + "loss": 1.4369, + "step": 31332 + }, + { + "epoch": 0.4071580445160588, + "grad_norm": 0.4415302574634552, + "learning_rate": 0.00011859265132117652, + "loss": 1.399, + "step": 31333 + }, + { + "epoch": 0.4071710390599747, + "grad_norm": 0.3604930639266968, + "learning_rate": 0.00011859005185926513, + "loss": 1.6794, + "step": 31334 + }, + { + "epoch": 0.40718403360389055, + "grad_norm": 0.4214653968811035, + "learning_rate": 0.00011858745239735376, + "loss": 1.5821, + "step": 31335 + }, + { + "epoch": 0.40719702814780645, + "grad_norm": 0.4235413670539856, + "learning_rate": 0.00011858485293544237, + "loss": 1.4422, + "step": 31336 + }, + { + "epoch": 0.4072100226917223, + "grad_norm": 0.41996026039123535, + "learning_rate": 0.00011858225347353098, + "loss": 1.4043, + "step": 31337 + }, + { + "epoch": 0.4072230172356382, + "grad_norm": 0.38430002331733704, + "learning_rate": 0.00011857965401161959, + "loss": 1.3189, + "step": 31338 + }, + { + "epoch": 0.40723601177955404, + "grad_norm": 0.4642822742462158, + "learning_rate": 0.00011857705454970822, + "loss": 1.5658, + "step": 31339 + }, + { + "epoch": 0.40724900632346994, + "grad_norm": 0.4557071626186371, + "learning_rate": 0.00011857445508779683, + "loss": 1.5322, + "step": 31340 + }, + { + "epoch": 0.4072620008673858, + "grad_norm": 0.4533839225769043, + "learning_rate": 0.00011857185562588544, + "loss": 1.4225, + "step": 31341 + }, + { + "epoch": 0.4072749954113017, + "grad_norm": 0.36180832982063293, + "learning_rate": 0.00011856925616397408, + "loss": 1.178, + "step": 31342 + }, + { + "epoch": 0.40728798995521753, + "grad_norm": 0.47234398126602173, + "learning_rate": 0.00011856665670206269, + "loss": 1.3295, + "step": 31343 + }, + { + "epoch": 0.40730098449913343, + "grad_norm": 0.4748702049255371, + "learning_rate": 0.0001185640572401513, + "loss": 1.4596, + "step": 31344 + }, + { + "epoch": 0.4073139790430493, + "grad_norm": 0.44142329692840576, + "learning_rate": 0.00011856145777823991, + "loss": 1.2811, + "step": 31345 + }, + { + "epoch": 0.4073269735869652, + "grad_norm": 0.41287118196487427, + "learning_rate": 0.00011855885831632853, + "loss": 1.4395, + "step": 31346 + }, + { + "epoch": 0.407339968130881, + "grad_norm": 0.36212319135665894, + "learning_rate": 0.00011855625885441714, + "loss": 1.2541, + "step": 31347 + }, + { + "epoch": 0.4073529626747969, + "grad_norm": 0.36613714694976807, + "learning_rate": 0.00011855365939250575, + "loss": 1.4965, + "step": 31348 + }, + { + "epoch": 0.40736595721871277, + "grad_norm": 0.4289287328720093, + "learning_rate": 0.00011855105993059437, + "loss": 1.5802, + "step": 31349 + }, + { + "epoch": 0.40737895176262867, + "grad_norm": 0.4733085036277771, + "learning_rate": 0.000118548460468683, + "loss": 1.4927, + "step": 31350 + }, + { + "epoch": 0.4073919463065445, + "grad_norm": 0.43434008955955505, + "learning_rate": 0.00011854586100677161, + "loss": 1.45, + "step": 31351 + }, + { + "epoch": 0.4074049408504604, + "grad_norm": 0.44725659489631653, + "learning_rate": 0.00011854326154486021, + "loss": 1.5963, + "step": 31352 + }, + { + "epoch": 0.40741793539437626, + "grad_norm": 0.3191600739955902, + "learning_rate": 0.00011854066208294882, + "loss": 1.3828, + "step": 31353 + }, + { + "epoch": 0.40743092993829216, + "grad_norm": 0.3672683835029602, + "learning_rate": 0.00011853806262103746, + "loss": 1.3777, + "step": 31354 + }, + { + "epoch": 0.407443924482208, + "grad_norm": 0.4257959723472595, + "learning_rate": 0.00011853546315912607, + "loss": 1.4296, + "step": 31355 + }, + { + "epoch": 0.4074569190261239, + "grad_norm": 0.32201388478279114, + "learning_rate": 0.00011853286369721468, + "loss": 1.3911, + "step": 31356 + }, + { + "epoch": 0.40746991357003975, + "grad_norm": 0.26250603795051575, + "learning_rate": 0.0001185302642353033, + "loss": 1.1523, + "step": 31357 + }, + { + "epoch": 0.40748290811395566, + "grad_norm": 0.4814162254333496, + "learning_rate": 0.00011852766477339192, + "loss": 1.4619, + "step": 31358 + }, + { + "epoch": 0.4074959026578715, + "grad_norm": 0.3212600350379944, + "learning_rate": 0.00011852506531148053, + "loss": 1.4189, + "step": 31359 + }, + { + "epoch": 0.4075088972017874, + "grad_norm": 0.3166644871234894, + "learning_rate": 0.00011852246584956914, + "loss": 1.4032, + "step": 31360 + }, + { + "epoch": 0.40752189174570325, + "grad_norm": 0.372885137796402, + "learning_rate": 0.00011851986638765775, + "loss": 1.3095, + "step": 31361 + }, + { + "epoch": 0.40753488628961915, + "grad_norm": 0.48347485065460205, + "learning_rate": 0.00011851726692574639, + "loss": 1.431, + "step": 31362 + }, + { + "epoch": 0.407547880833535, + "grad_norm": 0.38742896914482117, + "learning_rate": 0.000118514667463835, + "loss": 1.2352, + "step": 31363 + }, + { + "epoch": 0.4075608753774509, + "grad_norm": 0.4425623416900635, + "learning_rate": 0.0001185120680019236, + "loss": 1.4051, + "step": 31364 + }, + { + "epoch": 0.4075738699213668, + "grad_norm": 0.4310416281223297, + "learning_rate": 0.00011850946854001221, + "loss": 1.3918, + "step": 31365 + }, + { + "epoch": 0.40758686446528264, + "grad_norm": 0.3456686735153198, + "learning_rate": 0.00011850686907810085, + "loss": 1.2985, + "step": 31366 + }, + { + "epoch": 0.40759985900919854, + "grad_norm": 0.4211243987083435, + "learning_rate": 0.00011850426961618946, + "loss": 1.4318, + "step": 31367 + }, + { + "epoch": 0.4076128535531144, + "grad_norm": 0.31994906067848206, + "learning_rate": 0.00011850167015427807, + "loss": 1.3502, + "step": 31368 + }, + { + "epoch": 0.4076258480970303, + "grad_norm": 0.4743326008319855, + "learning_rate": 0.00011849907069236668, + "loss": 1.3237, + "step": 31369 + }, + { + "epoch": 0.40763884264094613, + "grad_norm": 0.36632925271987915, + "learning_rate": 0.0001184964712304553, + "loss": 1.4197, + "step": 31370 + }, + { + "epoch": 0.40765183718486203, + "grad_norm": 0.32511237263679504, + "learning_rate": 0.00011849387176854391, + "loss": 1.4717, + "step": 31371 + }, + { + "epoch": 0.4076648317287779, + "grad_norm": 0.3998405337333679, + "learning_rate": 0.00011849127230663253, + "loss": 1.3779, + "step": 31372 + }, + { + "epoch": 0.4076778262726938, + "grad_norm": 0.34404999017715454, + "learning_rate": 0.00011848867284472114, + "loss": 1.3555, + "step": 31373 + }, + { + "epoch": 0.4076908208166096, + "grad_norm": 0.40047603845596313, + "learning_rate": 0.00011848607338280977, + "loss": 1.1931, + "step": 31374 + }, + { + "epoch": 0.4077038153605255, + "grad_norm": 0.395526260137558, + "learning_rate": 0.00011848347392089839, + "loss": 1.3446, + "step": 31375 + }, + { + "epoch": 0.40771680990444137, + "grad_norm": 0.42978525161743164, + "learning_rate": 0.000118480874458987, + "loss": 1.3874, + "step": 31376 + }, + { + "epoch": 0.40772980444835727, + "grad_norm": 0.382686585187912, + "learning_rate": 0.0001184782749970756, + "loss": 1.2812, + "step": 31377 + }, + { + "epoch": 0.4077427989922731, + "grad_norm": 0.39854878187179565, + "learning_rate": 0.00011847567553516423, + "loss": 1.5112, + "step": 31378 + }, + { + "epoch": 0.407755793536189, + "grad_norm": 0.4491034746170044, + "learning_rate": 0.00011847307607325284, + "loss": 1.3977, + "step": 31379 + }, + { + "epoch": 0.40776878808010486, + "grad_norm": 0.4140099883079529, + "learning_rate": 0.00011847047661134145, + "loss": 1.2671, + "step": 31380 + }, + { + "epoch": 0.40778178262402076, + "grad_norm": 0.32000020146369934, + "learning_rate": 0.00011846787714943008, + "loss": 1.1727, + "step": 31381 + }, + { + "epoch": 0.4077947771679366, + "grad_norm": 0.4219103455543518, + "learning_rate": 0.00011846527768751869, + "loss": 1.5198, + "step": 31382 + }, + { + "epoch": 0.4078077717118525, + "grad_norm": 0.47607484459877014, + "learning_rate": 0.0001184626782256073, + "loss": 1.5757, + "step": 31383 + }, + { + "epoch": 0.40782076625576835, + "grad_norm": 0.2943762242794037, + "learning_rate": 0.00011846007876369591, + "loss": 1.3111, + "step": 31384 + }, + { + "epoch": 0.40783376079968425, + "grad_norm": 0.45588958263397217, + "learning_rate": 0.00011845747930178455, + "loss": 1.3689, + "step": 31385 + }, + { + "epoch": 0.4078467553436001, + "grad_norm": 0.3965607285499573, + "learning_rate": 0.00011845487983987316, + "loss": 1.4404, + "step": 31386 + }, + { + "epoch": 0.407859749887516, + "grad_norm": 0.3627980947494507, + "learning_rate": 0.00011845228037796177, + "loss": 1.4611, + "step": 31387 + }, + { + "epoch": 0.40787274443143184, + "grad_norm": 0.409006804227829, + "learning_rate": 0.00011844968091605038, + "loss": 1.444, + "step": 31388 + }, + { + "epoch": 0.40788573897534774, + "grad_norm": 0.485850989818573, + "learning_rate": 0.000118447081454139, + "loss": 1.5869, + "step": 31389 + }, + { + "epoch": 0.4078987335192636, + "grad_norm": 0.35226863622665405, + "learning_rate": 0.00011844448199222762, + "loss": 1.4892, + "step": 31390 + }, + { + "epoch": 0.4079117280631795, + "grad_norm": 0.38993802666664124, + "learning_rate": 0.00011844188253031623, + "loss": 1.4883, + "step": 31391 + }, + { + "epoch": 0.40792472260709534, + "grad_norm": 0.39846158027648926, + "learning_rate": 0.00011843928306840484, + "loss": 1.4949, + "step": 31392 + }, + { + "epoch": 0.40793771715101124, + "grad_norm": 0.4725247323513031, + "learning_rate": 0.00011843668360649348, + "loss": 1.4223, + "step": 31393 + }, + { + "epoch": 0.4079507116949271, + "grad_norm": 0.2601841390132904, + "learning_rate": 0.00011843408414458207, + "loss": 1.0304, + "step": 31394 + }, + { + "epoch": 0.407963706238843, + "grad_norm": 0.3902965188026428, + "learning_rate": 0.00011843148468267069, + "loss": 1.3575, + "step": 31395 + }, + { + "epoch": 0.4079767007827588, + "grad_norm": 0.45944881439208984, + "learning_rate": 0.0001184288852207593, + "loss": 1.5357, + "step": 31396 + }, + { + "epoch": 0.40798969532667473, + "grad_norm": 0.36571744084358215, + "learning_rate": 0.00011842628575884793, + "loss": 1.5448, + "step": 31397 + }, + { + "epoch": 0.4080026898705906, + "grad_norm": 0.4541155695915222, + "learning_rate": 0.00011842368629693655, + "loss": 1.4336, + "step": 31398 + }, + { + "epoch": 0.4080156844145065, + "grad_norm": 0.2900446653366089, + "learning_rate": 0.00011842108683502516, + "loss": 1.3786, + "step": 31399 + }, + { + "epoch": 0.4080286789584223, + "grad_norm": 0.4956468641757965, + "learning_rate": 0.00011841848737311377, + "loss": 1.5467, + "step": 31400 + }, + { + "epoch": 0.4080416735023382, + "grad_norm": 0.37299689650535583, + "learning_rate": 0.00011841588791120239, + "loss": 1.2458, + "step": 31401 + }, + { + "epoch": 0.40805466804625407, + "grad_norm": 0.3782311677932739, + "learning_rate": 0.000118413288449291, + "loss": 1.3841, + "step": 31402 + }, + { + "epoch": 0.40806766259016997, + "grad_norm": 0.45050573348999023, + "learning_rate": 0.00011841068898737961, + "loss": 1.4763, + "step": 31403 + }, + { + "epoch": 0.4080806571340858, + "grad_norm": 0.4397805631160736, + "learning_rate": 0.00011840808952546822, + "loss": 1.3661, + "step": 31404 + }, + { + "epoch": 0.4080936516780017, + "grad_norm": 0.4004155099391937, + "learning_rate": 0.00011840549006355686, + "loss": 1.5833, + "step": 31405 + }, + { + "epoch": 0.40810664622191756, + "grad_norm": 0.4636945426464081, + "learning_rate": 0.00011840289060164546, + "loss": 1.4436, + "step": 31406 + }, + { + "epoch": 0.40811964076583346, + "grad_norm": 0.4077390432357788, + "learning_rate": 0.00011840029113973407, + "loss": 1.3702, + "step": 31407 + }, + { + "epoch": 0.4081326353097493, + "grad_norm": 0.32488545775413513, + "learning_rate": 0.00011839769167782268, + "loss": 1.34, + "step": 31408 + }, + { + "epoch": 0.4081456298536652, + "grad_norm": 0.4090341627597809, + "learning_rate": 0.00011839509221591132, + "loss": 1.2725, + "step": 31409 + }, + { + "epoch": 0.40815862439758105, + "grad_norm": 0.40529438853263855, + "learning_rate": 0.00011839249275399993, + "loss": 1.4517, + "step": 31410 + }, + { + "epoch": 0.40817161894149695, + "grad_norm": 0.4599699079990387, + "learning_rate": 0.00011838989329208854, + "loss": 1.3348, + "step": 31411 + }, + { + "epoch": 0.4081846134854128, + "grad_norm": 0.45246192812919617, + "learning_rate": 0.00011838729383017715, + "loss": 1.5333, + "step": 31412 + }, + { + "epoch": 0.4081976080293287, + "grad_norm": 0.372466504573822, + "learning_rate": 0.00011838469436826578, + "loss": 1.3214, + "step": 31413 + }, + { + "epoch": 0.40821060257324454, + "grad_norm": 0.4106236398220062, + "learning_rate": 0.00011838209490635439, + "loss": 1.5255, + "step": 31414 + }, + { + "epoch": 0.40822359711716044, + "grad_norm": 0.4159178137779236, + "learning_rate": 0.000118379495444443, + "loss": 1.4433, + "step": 31415 + }, + { + "epoch": 0.4082365916610763, + "grad_norm": 0.4291822016239166, + "learning_rate": 0.00011837689598253164, + "loss": 1.3729, + "step": 31416 + }, + { + "epoch": 0.4082495862049922, + "grad_norm": 0.38729721307754517, + "learning_rate": 0.00011837429652062025, + "loss": 1.2555, + "step": 31417 + }, + { + "epoch": 0.40826258074890803, + "grad_norm": 0.3638719618320465, + "learning_rate": 0.00011837169705870886, + "loss": 1.4664, + "step": 31418 + }, + { + "epoch": 0.40827557529282393, + "grad_norm": 0.2866068184375763, + "learning_rate": 0.00011836909759679746, + "loss": 1.5283, + "step": 31419 + }, + { + "epoch": 0.4082885698367398, + "grad_norm": 0.3282183110713959, + "learning_rate": 0.0001183664981348861, + "loss": 1.1876, + "step": 31420 + }, + { + "epoch": 0.4083015643806557, + "grad_norm": 0.42717763781547546, + "learning_rate": 0.0001183638986729747, + "loss": 1.2725, + "step": 31421 + }, + { + "epoch": 0.4083145589245715, + "grad_norm": 0.45406442880630493, + "learning_rate": 0.00011836129921106332, + "loss": 1.2582, + "step": 31422 + }, + { + "epoch": 0.4083275534684874, + "grad_norm": 0.37367990612983704, + "learning_rate": 0.00011835869974915193, + "loss": 1.4234, + "step": 31423 + }, + { + "epoch": 0.40834054801240327, + "grad_norm": 0.4946652352809906, + "learning_rate": 0.00011835610028724055, + "loss": 1.5869, + "step": 31424 + }, + { + "epoch": 0.40835354255631917, + "grad_norm": 0.37659645080566406, + "learning_rate": 0.00011835350082532916, + "loss": 1.339, + "step": 31425 + }, + { + "epoch": 0.408366537100235, + "grad_norm": 0.454621821641922, + "learning_rate": 0.00011835090136341777, + "loss": 1.2961, + "step": 31426 + }, + { + "epoch": 0.4083795316441509, + "grad_norm": 0.5059607028961182, + "learning_rate": 0.00011834830190150638, + "loss": 1.4858, + "step": 31427 + }, + { + "epoch": 0.40839252618806676, + "grad_norm": 0.4538361728191376, + "learning_rate": 0.00011834570243959502, + "loss": 1.322, + "step": 31428 + }, + { + "epoch": 0.40840552073198266, + "grad_norm": 0.36083900928497314, + "learning_rate": 0.00011834310297768363, + "loss": 1.3338, + "step": 31429 + }, + { + "epoch": 0.4084185152758985, + "grad_norm": 0.4355923533439636, + "learning_rate": 0.00011834050351577224, + "loss": 1.3334, + "step": 31430 + }, + { + "epoch": 0.4084315098198144, + "grad_norm": 0.3410107493400574, + "learning_rate": 0.00011833790405386085, + "loss": 1.2189, + "step": 31431 + }, + { + "epoch": 0.40844450436373025, + "grad_norm": 0.4521462321281433, + "learning_rate": 0.00011833530459194948, + "loss": 1.6769, + "step": 31432 + }, + { + "epoch": 0.40845749890764615, + "grad_norm": 0.35141193866729736, + "learning_rate": 0.00011833270513003809, + "loss": 1.1651, + "step": 31433 + }, + { + "epoch": 0.408470493451562, + "grad_norm": 0.40481558442115784, + "learning_rate": 0.0001183301056681267, + "loss": 1.4379, + "step": 31434 + }, + { + "epoch": 0.4084834879954779, + "grad_norm": 0.4880070090293884, + "learning_rate": 0.00011832750620621531, + "loss": 1.5579, + "step": 31435 + }, + { + "epoch": 0.40849648253939375, + "grad_norm": 0.41805538535118103, + "learning_rate": 0.00011832490674430394, + "loss": 1.4931, + "step": 31436 + }, + { + "epoch": 0.40850947708330965, + "grad_norm": 0.48485302925109863, + "learning_rate": 0.00011832230728239255, + "loss": 1.5111, + "step": 31437 + }, + { + "epoch": 0.4085224716272255, + "grad_norm": 0.39246925711631775, + "learning_rate": 0.00011831970782048116, + "loss": 1.4664, + "step": 31438 + }, + { + "epoch": 0.4085354661711414, + "grad_norm": 0.39697298407554626, + "learning_rate": 0.00011831710835856977, + "loss": 1.3511, + "step": 31439 + }, + { + "epoch": 0.40854846071505724, + "grad_norm": 0.3886861801147461, + "learning_rate": 0.00011831450889665841, + "loss": 1.538, + "step": 31440 + }, + { + "epoch": 0.40856145525897314, + "grad_norm": 0.3768399953842163, + "learning_rate": 0.00011831190943474702, + "loss": 1.3991, + "step": 31441 + }, + { + "epoch": 0.40857444980288904, + "grad_norm": 0.349935919046402, + "learning_rate": 0.00011830930997283563, + "loss": 1.4697, + "step": 31442 + }, + { + "epoch": 0.4085874443468049, + "grad_norm": 0.4222748875617981, + "learning_rate": 0.00011830671051092424, + "loss": 1.494, + "step": 31443 + }, + { + "epoch": 0.4086004388907208, + "grad_norm": 0.39832213521003723, + "learning_rate": 0.00011830411104901286, + "loss": 1.2905, + "step": 31444 + }, + { + "epoch": 0.40861343343463663, + "grad_norm": 0.3936651051044464, + "learning_rate": 0.00011830151158710148, + "loss": 1.3466, + "step": 31445 + }, + { + "epoch": 0.40862642797855253, + "grad_norm": 0.3840542733669281, + "learning_rate": 0.00011829891212519009, + "loss": 1.3802, + "step": 31446 + }, + { + "epoch": 0.4086394225224684, + "grad_norm": 0.42872774600982666, + "learning_rate": 0.0001182963126632787, + "loss": 1.605, + "step": 31447 + }, + { + "epoch": 0.4086524170663843, + "grad_norm": 0.39350372552871704, + "learning_rate": 0.00011829371320136732, + "loss": 1.4396, + "step": 31448 + }, + { + "epoch": 0.4086654116103001, + "grad_norm": 0.4355899691581726, + "learning_rate": 0.00011829111373945593, + "loss": 1.334, + "step": 31449 + }, + { + "epoch": 0.408678406154216, + "grad_norm": 0.44449347257614136, + "learning_rate": 0.00011828851427754454, + "loss": 1.4616, + "step": 31450 + }, + { + "epoch": 0.40869140069813187, + "grad_norm": 0.3463626801967621, + "learning_rate": 0.00011828591481563315, + "loss": 1.4628, + "step": 31451 + }, + { + "epoch": 0.40870439524204777, + "grad_norm": 0.42759230732917786, + "learning_rate": 0.00011828331535372179, + "loss": 1.4798, + "step": 31452 + }, + { + "epoch": 0.4087173897859636, + "grad_norm": 0.613426148891449, + "learning_rate": 0.0001182807158918104, + "loss": 1.3774, + "step": 31453 + }, + { + "epoch": 0.4087303843298795, + "grad_norm": 0.3927464783191681, + "learning_rate": 0.00011827811642989901, + "loss": 1.4392, + "step": 31454 + }, + { + "epoch": 0.40874337887379536, + "grad_norm": 0.33047032356262207, + "learning_rate": 0.00011827551696798764, + "loss": 1.5495, + "step": 31455 + }, + { + "epoch": 0.40875637341771126, + "grad_norm": 0.38525456190109253, + "learning_rate": 0.00011827291750607625, + "loss": 1.3869, + "step": 31456 + }, + { + "epoch": 0.4087693679616271, + "grad_norm": 0.4371989071369171, + "learning_rate": 0.00011827031804416486, + "loss": 1.4229, + "step": 31457 + }, + { + "epoch": 0.408782362505543, + "grad_norm": 0.4202074110507965, + "learning_rate": 0.00011826771858225347, + "loss": 1.448, + "step": 31458 + }, + { + "epoch": 0.40879535704945885, + "grad_norm": 0.3906514346599579, + "learning_rate": 0.00011826511912034211, + "loss": 1.4543, + "step": 31459 + }, + { + "epoch": 0.40880835159337475, + "grad_norm": 0.3407418131828308, + "learning_rate": 0.00011826251965843072, + "loss": 1.4444, + "step": 31460 + }, + { + "epoch": 0.4088213461372906, + "grad_norm": 0.41271138191223145, + "learning_rate": 0.00011825992019651932, + "loss": 1.4037, + "step": 31461 + }, + { + "epoch": 0.4088343406812065, + "grad_norm": 0.3754023611545563, + "learning_rate": 0.00011825732073460793, + "loss": 1.4579, + "step": 31462 + }, + { + "epoch": 0.40884733522512234, + "grad_norm": 0.37632033228874207, + "learning_rate": 0.00011825472127269657, + "loss": 1.4101, + "step": 31463 + }, + { + "epoch": 0.40886032976903824, + "grad_norm": 0.42989492416381836, + "learning_rate": 0.00011825212181078518, + "loss": 1.3728, + "step": 31464 + }, + { + "epoch": 0.4088733243129541, + "grad_norm": 0.39576902985572815, + "learning_rate": 0.00011824952234887379, + "loss": 1.5051, + "step": 31465 + }, + { + "epoch": 0.40888631885687, + "grad_norm": 0.3420815169811249, + "learning_rate": 0.0001182469228869624, + "loss": 1.293, + "step": 31466 + }, + { + "epoch": 0.40889931340078584, + "grad_norm": 0.43998003005981445, + "learning_rate": 0.00011824432342505102, + "loss": 1.4895, + "step": 31467 + }, + { + "epoch": 0.40891230794470174, + "grad_norm": 0.3893432915210724, + "learning_rate": 0.00011824172396313964, + "loss": 1.383, + "step": 31468 + }, + { + "epoch": 0.4089253024886176, + "grad_norm": 0.4175719618797302, + "learning_rate": 0.00011823912450122825, + "loss": 1.4606, + "step": 31469 + }, + { + "epoch": 0.4089382970325335, + "grad_norm": 0.402387797832489, + "learning_rate": 0.00011823652503931686, + "loss": 1.4206, + "step": 31470 + }, + { + "epoch": 0.4089512915764493, + "grad_norm": 0.4163857102394104, + "learning_rate": 0.0001182339255774055, + "loss": 1.4177, + "step": 31471 + }, + { + "epoch": 0.40896428612036523, + "grad_norm": 0.3899572491645813, + "learning_rate": 0.0001182313261154941, + "loss": 1.5447, + "step": 31472 + }, + { + "epoch": 0.4089772806642811, + "grad_norm": 0.5240683555603027, + "learning_rate": 0.00011822872665358272, + "loss": 1.4688, + "step": 31473 + }, + { + "epoch": 0.408990275208197, + "grad_norm": 0.4555453062057495, + "learning_rate": 0.00011822612719167131, + "loss": 1.4944, + "step": 31474 + }, + { + "epoch": 0.4090032697521128, + "grad_norm": 0.39444679021835327, + "learning_rate": 0.00011822352772975995, + "loss": 1.5195, + "step": 31475 + }, + { + "epoch": 0.4090162642960287, + "grad_norm": 0.4139522612094879, + "learning_rate": 0.00011822092826784856, + "loss": 1.4102, + "step": 31476 + }, + { + "epoch": 0.40902925883994457, + "grad_norm": 0.47355741262435913, + "learning_rate": 0.00011821832880593717, + "loss": 1.2987, + "step": 31477 + }, + { + "epoch": 0.40904225338386047, + "grad_norm": 0.33981433510780334, + "learning_rate": 0.00011821572934402579, + "loss": 1.2082, + "step": 31478 + }, + { + "epoch": 0.4090552479277763, + "grad_norm": 0.3353487253189087, + "learning_rate": 0.00011821312988211441, + "loss": 1.1433, + "step": 31479 + }, + { + "epoch": 0.4090682424716922, + "grad_norm": 0.47620120644569397, + "learning_rate": 0.00011821053042020302, + "loss": 1.5979, + "step": 31480 + }, + { + "epoch": 0.40908123701560806, + "grad_norm": 0.41410520672798157, + "learning_rate": 0.00011820793095829163, + "loss": 1.4131, + "step": 31481 + }, + { + "epoch": 0.40909423155952396, + "grad_norm": 0.366629034280777, + "learning_rate": 0.00011820533149638024, + "loss": 1.5615, + "step": 31482 + }, + { + "epoch": 0.4091072261034398, + "grad_norm": 0.34971490502357483, + "learning_rate": 0.00011820273203446888, + "loss": 1.7237, + "step": 31483 + }, + { + "epoch": 0.4091202206473557, + "grad_norm": 0.463933527469635, + "learning_rate": 0.00011820013257255749, + "loss": 1.3382, + "step": 31484 + }, + { + "epoch": 0.40913321519127155, + "grad_norm": 0.48832187056541443, + "learning_rate": 0.0001181975331106461, + "loss": 1.5046, + "step": 31485 + }, + { + "epoch": 0.40914620973518745, + "grad_norm": 0.41951534152030945, + "learning_rate": 0.00011819493364873471, + "loss": 1.4755, + "step": 31486 + }, + { + "epoch": 0.4091592042791033, + "grad_norm": 0.42187410593032837, + "learning_rate": 0.00011819233418682334, + "loss": 1.3842, + "step": 31487 + }, + { + "epoch": 0.4091721988230192, + "grad_norm": 0.42559242248535156, + "learning_rate": 0.00011818973472491195, + "loss": 1.2474, + "step": 31488 + }, + { + "epoch": 0.40918519336693504, + "grad_norm": 0.31046196818351746, + "learning_rate": 0.00011818713526300056, + "loss": 1.4083, + "step": 31489 + }, + { + "epoch": 0.40919818791085094, + "grad_norm": 0.5612074732780457, + "learning_rate": 0.00011818453580108918, + "loss": 1.4781, + "step": 31490 + }, + { + "epoch": 0.4092111824547668, + "grad_norm": 0.4055081605911255, + "learning_rate": 0.0001181819363391778, + "loss": 1.3433, + "step": 31491 + }, + { + "epoch": 0.4092241769986827, + "grad_norm": 0.4123353958129883, + "learning_rate": 0.0001181793368772664, + "loss": 1.3228, + "step": 31492 + }, + { + "epoch": 0.40923717154259853, + "grad_norm": 0.3822682797908783, + "learning_rate": 0.00011817673741535502, + "loss": 1.4473, + "step": 31493 + }, + { + "epoch": 0.40925016608651443, + "grad_norm": 0.40904977917671204, + "learning_rate": 0.00011817413795344366, + "loss": 1.475, + "step": 31494 + }, + { + "epoch": 0.4092631606304303, + "grad_norm": 0.3862096965312958, + "learning_rate": 0.00011817153849153227, + "loss": 1.5589, + "step": 31495 + }, + { + "epoch": 0.4092761551743462, + "grad_norm": 0.32827651500701904, + "learning_rate": 0.00011816893902962088, + "loss": 1.2963, + "step": 31496 + }, + { + "epoch": 0.409289149718262, + "grad_norm": 0.41040390729904175, + "learning_rate": 0.00011816633956770949, + "loss": 1.28, + "step": 31497 + }, + { + "epoch": 0.4093021442621779, + "grad_norm": 0.4335785210132599, + "learning_rate": 0.00011816374010579811, + "loss": 1.4285, + "step": 31498 + }, + { + "epoch": 0.40931513880609377, + "grad_norm": 0.33759036660194397, + "learning_rate": 0.00011816114064388672, + "loss": 1.4329, + "step": 31499 + }, + { + "epoch": 0.40932813335000967, + "grad_norm": 0.41463494300842285, + "learning_rate": 0.00011815854118197533, + "loss": 1.4981, + "step": 31500 + }, + { + "epoch": 0.4093411278939255, + "grad_norm": 0.3643905222415924, + "learning_rate": 0.00011815594172006395, + "loss": 1.3193, + "step": 31501 + }, + { + "epoch": 0.4093541224378414, + "grad_norm": 0.32527557015419006, + "learning_rate": 0.00011815334225815258, + "loss": 1.1753, + "step": 31502 + }, + { + "epoch": 0.40936711698175726, + "grad_norm": 0.4599539041519165, + "learning_rate": 0.00011815074279624118, + "loss": 1.4875, + "step": 31503 + }, + { + "epoch": 0.40938011152567316, + "grad_norm": 0.49181321263313293, + "learning_rate": 0.00011814814333432979, + "loss": 1.3654, + "step": 31504 + }, + { + "epoch": 0.409393106069589, + "grad_norm": 0.3689109981060028, + "learning_rate": 0.0001181455438724184, + "loss": 1.3361, + "step": 31505 + }, + { + "epoch": 0.4094061006135049, + "grad_norm": 0.3569023311138153, + "learning_rate": 0.00011814294441050704, + "loss": 1.4822, + "step": 31506 + }, + { + "epoch": 0.40941909515742075, + "grad_norm": 0.3547045886516571, + "learning_rate": 0.00011814034494859565, + "loss": 1.5036, + "step": 31507 + }, + { + "epoch": 0.40943208970133665, + "grad_norm": 0.3776378035545349, + "learning_rate": 0.00011813774548668426, + "loss": 1.4238, + "step": 31508 + }, + { + "epoch": 0.4094450842452525, + "grad_norm": 0.47640812397003174, + "learning_rate": 0.00011813514602477287, + "loss": 1.3573, + "step": 31509 + }, + { + "epoch": 0.4094580787891684, + "grad_norm": 0.32027679681777954, + "learning_rate": 0.0001181325465628615, + "loss": 1.3882, + "step": 31510 + }, + { + "epoch": 0.40947107333308425, + "grad_norm": 0.32853272557258606, + "learning_rate": 0.00011812994710095011, + "loss": 1.5877, + "step": 31511 + }, + { + "epoch": 0.40948406787700015, + "grad_norm": 0.43371883034706116, + "learning_rate": 0.00011812734763903872, + "loss": 1.4665, + "step": 31512 + }, + { + "epoch": 0.409497062420916, + "grad_norm": 0.41488316655158997, + "learning_rate": 0.00011812474817712733, + "loss": 1.4706, + "step": 31513 + }, + { + "epoch": 0.4095100569648319, + "grad_norm": 0.33448073267936707, + "learning_rate": 0.00011812214871521597, + "loss": 1.416, + "step": 31514 + }, + { + "epoch": 0.40952305150874774, + "grad_norm": 0.3807866871356964, + "learning_rate": 0.00011811954925330458, + "loss": 1.5369, + "step": 31515 + }, + { + "epoch": 0.40953604605266364, + "grad_norm": 0.32609882950782776, + "learning_rate": 0.00011811694979139318, + "loss": 1.4505, + "step": 31516 + }, + { + "epoch": 0.40954904059657954, + "grad_norm": 0.36160367727279663, + "learning_rate": 0.00011811435032948179, + "loss": 1.3877, + "step": 31517 + }, + { + "epoch": 0.4095620351404954, + "grad_norm": 0.4232065975666046, + "learning_rate": 0.00011811175086757043, + "loss": 1.3083, + "step": 31518 + }, + { + "epoch": 0.4095750296844113, + "grad_norm": 0.41500887274742126, + "learning_rate": 0.00011810915140565904, + "loss": 1.5131, + "step": 31519 + }, + { + "epoch": 0.40958802422832713, + "grad_norm": 0.3550424873828888, + "learning_rate": 0.00011810655194374765, + "loss": 1.2894, + "step": 31520 + }, + { + "epoch": 0.40960101877224303, + "grad_norm": 0.44544121623039246, + "learning_rate": 0.00011810395248183626, + "loss": 1.5123, + "step": 31521 + }, + { + "epoch": 0.4096140133161589, + "grad_norm": 0.34656384587287903, + "learning_rate": 0.00011810135301992488, + "loss": 1.3418, + "step": 31522 + }, + { + "epoch": 0.4096270078600748, + "grad_norm": 0.4084930419921875, + "learning_rate": 0.0001180987535580135, + "loss": 1.2707, + "step": 31523 + }, + { + "epoch": 0.4096400024039906, + "grad_norm": 0.39699652791023254, + "learning_rate": 0.0001180961540961021, + "loss": 1.3302, + "step": 31524 + }, + { + "epoch": 0.4096529969479065, + "grad_norm": 0.4151805639266968, + "learning_rate": 0.00011809355463419072, + "loss": 1.3127, + "step": 31525 + }, + { + "epoch": 0.40966599149182237, + "grad_norm": 0.43207848072052, + "learning_rate": 0.00011809095517227935, + "loss": 1.4389, + "step": 31526 + }, + { + "epoch": 0.40967898603573827, + "grad_norm": 0.2966243624687195, + "learning_rate": 0.00011808835571036797, + "loss": 1.2099, + "step": 31527 + }, + { + "epoch": 0.4096919805796541, + "grad_norm": 0.4899061620235443, + "learning_rate": 0.00011808575624845658, + "loss": 1.4809, + "step": 31528 + }, + { + "epoch": 0.40970497512357, + "grad_norm": 0.36162999272346497, + "learning_rate": 0.0001180831567865452, + "loss": 1.4903, + "step": 31529 + }, + { + "epoch": 0.40971796966748586, + "grad_norm": 0.3901997208595276, + "learning_rate": 0.00011808055732463381, + "loss": 1.3494, + "step": 31530 + }, + { + "epoch": 0.40973096421140176, + "grad_norm": 0.4472411274909973, + "learning_rate": 0.00011807795786272242, + "loss": 1.4126, + "step": 31531 + }, + { + "epoch": 0.4097439587553176, + "grad_norm": 0.40811723470687866, + "learning_rate": 0.00011807535840081103, + "loss": 1.3222, + "step": 31532 + }, + { + "epoch": 0.4097569532992335, + "grad_norm": 0.38060200214385986, + "learning_rate": 0.00011807275893889966, + "loss": 1.5181, + "step": 31533 + }, + { + "epoch": 0.40976994784314935, + "grad_norm": 0.33410435914993286, + "learning_rate": 0.00011807015947698827, + "loss": 1.344, + "step": 31534 + }, + { + "epoch": 0.40978294238706525, + "grad_norm": 0.4646623730659485, + "learning_rate": 0.00011806756001507688, + "loss": 1.387, + "step": 31535 + }, + { + "epoch": 0.4097959369309811, + "grad_norm": 0.36100417375564575, + "learning_rate": 0.00011806496055316549, + "loss": 1.2898, + "step": 31536 + }, + { + "epoch": 0.409808931474897, + "grad_norm": 0.398863285779953, + "learning_rate": 0.00011806236109125413, + "loss": 1.4409, + "step": 31537 + }, + { + "epoch": 0.40982192601881284, + "grad_norm": 0.388494610786438, + "learning_rate": 0.00011805976162934274, + "loss": 1.4975, + "step": 31538 + }, + { + "epoch": 0.40983492056272874, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00011805716216743135, + "loss": 1.3661, + "step": 31539 + }, + { + "epoch": 0.4098479151066446, + "grad_norm": 0.40365058183670044, + "learning_rate": 0.00011805456270551996, + "loss": 1.5221, + "step": 31540 + }, + { + "epoch": 0.4098609096505605, + "grad_norm": 0.4773750901222229, + "learning_rate": 0.00011805196324360859, + "loss": 1.4561, + "step": 31541 + }, + { + "epoch": 0.40987390419447634, + "grad_norm": 0.48428115248680115, + "learning_rate": 0.0001180493637816972, + "loss": 1.4226, + "step": 31542 + }, + { + "epoch": 0.40988689873839224, + "grad_norm": 0.2745041251182556, + "learning_rate": 0.00011804676431978581, + "loss": 1.3222, + "step": 31543 + }, + { + "epoch": 0.4098998932823081, + "grad_norm": 0.3983737528324127, + "learning_rate": 0.00011804416485787442, + "loss": 1.2943, + "step": 31544 + }, + { + "epoch": 0.409912887826224, + "grad_norm": 0.4040609300136566, + "learning_rate": 0.00011804156539596304, + "loss": 1.4124, + "step": 31545 + }, + { + "epoch": 0.4099258823701398, + "grad_norm": 0.3810339570045471, + "learning_rate": 0.00011803896593405165, + "loss": 1.3874, + "step": 31546 + }, + { + "epoch": 0.4099388769140557, + "grad_norm": 0.39410123229026794, + "learning_rate": 0.00011803636647214027, + "loss": 1.3064, + "step": 31547 + }, + { + "epoch": 0.4099518714579716, + "grad_norm": 0.3857850730419159, + "learning_rate": 0.00011803376701022888, + "loss": 1.368, + "step": 31548 + }, + { + "epoch": 0.4099648660018875, + "grad_norm": 0.36780086159706116, + "learning_rate": 0.00011803116754831751, + "loss": 1.3956, + "step": 31549 + }, + { + "epoch": 0.4099778605458033, + "grad_norm": 0.41981688141822815, + "learning_rate": 0.00011802856808640613, + "loss": 1.3771, + "step": 31550 + }, + { + "epoch": 0.4099908550897192, + "grad_norm": 0.4461262822151184, + "learning_rate": 0.00011802596862449474, + "loss": 1.4245, + "step": 31551 + }, + { + "epoch": 0.41000384963363506, + "grad_norm": 0.4512978196144104, + "learning_rate": 0.00011802336916258335, + "loss": 1.3088, + "step": 31552 + }, + { + "epoch": 0.41001684417755097, + "grad_norm": 0.38236308097839355, + "learning_rate": 0.00011802076970067197, + "loss": 1.2624, + "step": 31553 + }, + { + "epoch": 0.4100298387214668, + "grad_norm": 0.43214765191078186, + "learning_rate": 0.00011801817023876058, + "loss": 1.4063, + "step": 31554 + }, + { + "epoch": 0.4100428332653827, + "grad_norm": 0.3449733257293701, + "learning_rate": 0.0001180155707768492, + "loss": 1.2786, + "step": 31555 + }, + { + "epoch": 0.41005582780929856, + "grad_norm": 0.344591349363327, + "learning_rate": 0.0001180129713149378, + "loss": 1.4327, + "step": 31556 + }, + { + "epoch": 0.41006882235321446, + "grad_norm": 0.47518298029899597, + "learning_rate": 0.00011801037185302644, + "loss": 1.5384, + "step": 31557 + }, + { + "epoch": 0.4100818168971303, + "grad_norm": 0.4852653741836548, + "learning_rate": 0.00011800777239111504, + "loss": 1.494, + "step": 31558 + }, + { + "epoch": 0.4100948114410462, + "grad_norm": 0.37454286217689514, + "learning_rate": 0.00011800517292920365, + "loss": 1.322, + "step": 31559 + }, + { + "epoch": 0.41010780598496205, + "grad_norm": 0.4366607367992401, + "learning_rate": 0.00011800257346729226, + "loss": 1.5601, + "step": 31560 + }, + { + "epoch": 0.41012080052887795, + "grad_norm": 0.453578382730484, + "learning_rate": 0.0001179999740053809, + "loss": 1.3266, + "step": 31561 + }, + { + "epoch": 0.4101337950727938, + "grad_norm": 0.4191003739833832, + "learning_rate": 0.00011799737454346951, + "loss": 1.3402, + "step": 31562 + }, + { + "epoch": 0.4101467896167097, + "grad_norm": 0.46088704466819763, + "learning_rate": 0.00011799477508155812, + "loss": 1.4136, + "step": 31563 + }, + { + "epoch": 0.41015978416062554, + "grad_norm": 0.43562641739845276, + "learning_rate": 0.00011799217561964675, + "loss": 1.3783, + "step": 31564 + }, + { + "epoch": 0.41017277870454144, + "grad_norm": 0.37579742074012756, + "learning_rate": 0.00011798957615773536, + "loss": 1.2726, + "step": 31565 + }, + { + "epoch": 0.4101857732484573, + "grad_norm": 0.3702225983142853, + "learning_rate": 0.00011798697669582397, + "loss": 1.2204, + "step": 31566 + }, + { + "epoch": 0.4101987677923732, + "grad_norm": 0.5580505132675171, + "learning_rate": 0.00011798437723391258, + "loss": 1.6742, + "step": 31567 + }, + { + "epoch": 0.41021176233628903, + "grad_norm": 0.4354010820388794, + "learning_rate": 0.00011798177777200122, + "loss": 1.3901, + "step": 31568 + }, + { + "epoch": 0.41022475688020493, + "grad_norm": 0.39802223443984985, + "learning_rate": 0.00011797917831008983, + "loss": 1.3698, + "step": 31569 + }, + { + "epoch": 0.4102377514241208, + "grad_norm": 0.4317769706249237, + "learning_rate": 0.00011797657884817844, + "loss": 1.5276, + "step": 31570 + }, + { + "epoch": 0.4102507459680367, + "grad_norm": 0.39785075187683105, + "learning_rate": 0.00011797397938626704, + "loss": 1.4359, + "step": 31571 + }, + { + "epoch": 0.4102637405119525, + "grad_norm": 0.42543643712997437, + "learning_rate": 0.00011797137992435567, + "loss": 1.6143, + "step": 31572 + }, + { + "epoch": 0.4102767350558684, + "grad_norm": 0.33673784136772156, + "learning_rate": 0.00011796878046244428, + "loss": 1.5353, + "step": 31573 + }, + { + "epoch": 0.41028972959978427, + "grad_norm": 0.40286701917648315, + "learning_rate": 0.0001179661810005329, + "loss": 1.4414, + "step": 31574 + }, + { + "epoch": 0.41030272414370017, + "grad_norm": 0.514765739440918, + "learning_rate": 0.0001179635815386215, + "loss": 1.4206, + "step": 31575 + }, + { + "epoch": 0.410315718687616, + "grad_norm": 0.4378872513771057, + "learning_rate": 0.00011796098207671013, + "loss": 1.366, + "step": 31576 + }, + { + "epoch": 0.4103287132315319, + "grad_norm": 0.4727979898452759, + "learning_rate": 0.00011795838261479874, + "loss": 1.473, + "step": 31577 + }, + { + "epoch": 0.41034170777544776, + "grad_norm": 0.39839720726013184, + "learning_rate": 0.00011795578315288735, + "loss": 1.499, + "step": 31578 + }, + { + "epoch": 0.41035470231936366, + "grad_norm": 0.47844377160072327, + "learning_rate": 0.00011795318369097596, + "loss": 1.3596, + "step": 31579 + }, + { + "epoch": 0.4103676968632795, + "grad_norm": 0.35658955574035645, + "learning_rate": 0.0001179505842290646, + "loss": 1.3111, + "step": 31580 + }, + { + "epoch": 0.4103806914071954, + "grad_norm": 0.33304885029792786, + "learning_rate": 0.00011794798476715321, + "loss": 1.5764, + "step": 31581 + }, + { + "epoch": 0.41039368595111125, + "grad_norm": 0.4718421399593353, + "learning_rate": 0.00011794538530524182, + "loss": 1.3568, + "step": 31582 + }, + { + "epoch": 0.41040668049502715, + "grad_norm": 0.4658700227737427, + "learning_rate": 0.00011794278584333042, + "loss": 1.5012, + "step": 31583 + }, + { + "epoch": 0.410419675038943, + "grad_norm": 0.4477006494998932, + "learning_rate": 0.00011794018638141906, + "loss": 1.5038, + "step": 31584 + }, + { + "epoch": 0.4104326695828589, + "grad_norm": 0.41209080815315247, + "learning_rate": 0.00011793758691950767, + "loss": 1.6472, + "step": 31585 + }, + { + "epoch": 0.41044566412677475, + "grad_norm": 0.3747536242008209, + "learning_rate": 0.00011793498745759628, + "loss": 1.2997, + "step": 31586 + }, + { + "epoch": 0.41045865867069065, + "grad_norm": 0.4215029776096344, + "learning_rate": 0.00011793238799568489, + "loss": 1.4704, + "step": 31587 + }, + { + "epoch": 0.4104716532146065, + "grad_norm": 0.47362276911735535, + "learning_rate": 0.00011792978853377352, + "loss": 1.5515, + "step": 31588 + }, + { + "epoch": 0.4104846477585224, + "grad_norm": 0.3811095654964447, + "learning_rate": 0.00011792718907186213, + "loss": 1.391, + "step": 31589 + }, + { + "epoch": 0.41049764230243824, + "grad_norm": 0.3897624909877777, + "learning_rate": 0.00011792458960995074, + "loss": 1.3357, + "step": 31590 + }, + { + "epoch": 0.41051063684635414, + "grad_norm": 0.43030741810798645, + "learning_rate": 0.00011792199014803935, + "loss": 1.5572, + "step": 31591 + }, + { + "epoch": 0.41052363139027, + "grad_norm": 0.35224151611328125, + "learning_rate": 0.00011791939068612799, + "loss": 1.2773, + "step": 31592 + }, + { + "epoch": 0.4105366259341859, + "grad_norm": 0.382892370223999, + "learning_rate": 0.0001179167912242166, + "loss": 1.3806, + "step": 31593 + }, + { + "epoch": 0.4105496204781018, + "grad_norm": 0.31598880887031555, + "learning_rate": 0.00011791419176230521, + "loss": 1.2217, + "step": 31594 + }, + { + "epoch": 0.41056261502201763, + "grad_norm": 0.36728498339653015, + "learning_rate": 0.00011791159230039382, + "loss": 1.2947, + "step": 31595 + }, + { + "epoch": 0.41057560956593353, + "grad_norm": 0.4650548994541168, + "learning_rate": 0.00011790899283848244, + "loss": 1.3923, + "step": 31596 + }, + { + "epoch": 0.4105886041098494, + "grad_norm": 0.3864407241344452, + "learning_rate": 0.00011790639337657106, + "loss": 1.2028, + "step": 31597 + }, + { + "epoch": 0.4106015986537653, + "grad_norm": 0.39786413311958313, + "learning_rate": 0.00011790379391465967, + "loss": 1.3598, + "step": 31598 + }, + { + "epoch": 0.4106145931976811, + "grad_norm": 0.42337566614151, + "learning_rate": 0.00011790119445274828, + "loss": 1.2514, + "step": 31599 + }, + { + "epoch": 0.410627587741597, + "grad_norm": 0.43584102392196655, + "learning_rate": 0.0001178985949908369, + "loss": 1.4114, + "step": 31600 + }, + { + "epoch": 0.41064058228551287, + "grad_norm": 0.36493369936943054, + "learning_rate": 0.00011789599552892551, + "loss": 1.2058, + "step": 31601 + }, + { + "epoch": 0.41065357682942877, + "grad_norm": 0.39915576577186584, + "learning_rate": 0.00011789339606701412, + "loss": 1.1882, + "step": 31602 + }, + { + "epoch": 0.4106665713733446, + "grad_norm": 0.43771296739578247, + "learning_rate": 0.00011789079660510276, + "loss": 1.2833, + "step": 31603 + }, + { + "epoch": 0.4106795659172605, + "grad_norm": 0.39958369731903076, + "learning_rate": 0.00011788819714319137, + "loss": 1.4624, + "step": 31604 + }, + { + "epoch": 0.41069256046117636, + "grad_norm": 0.45124125480651855, + "learning_rate": 0.00011788559768127998, + "loss": 1.3059, + "step": 31605 + }, + { + "epoch": 0.41070555500509226, + "grad_norm": 0.3760499060153961, + "learning_rate": 0.0001178829982193686, + "loss": 1.3808, + "step": 31606 + }, + { + "epoch": 0.4107185495490081, + "grad_norm": 0.3665977716445923, + "learning_rate": 0.00011788039875745722, + "loss": 1.3556, + "step": 31607 + }, + { + "epoch": 0.410731544092924, + "grad_norm": 0.28853094577789307, + "learning_rate": 0.00011787779929554583, + "loss": 1.3003, + "step": 31608 + }, + { + "epoch": 0.41074453863683985, + "grad_norm": 0.3686813712120056, + "learning_rate": 0.00011787519983363444, + "loss": 1.3116, + "step": 31609 + }, + { + "epoch": 0.41075753318075575, + "grad_norm": 0.4340326189994812, + "learning_rate": 0.00011787260037172305, + "loss": 1.4141, + "step": 31610 + }, + { + "epoch": 0.4107705277246716, + "grad_norm": 0.4526670277118683, + "learning_rate": 0.00011787000090981169, + "loss": 1.3634, + "step": 31611 + }, + { + "epoch": 0.4107835222685875, + "grad_norm": 0.33790960907936096, + "learning_rate": 0.00011786740144790029, + "loss": 1.4982, + "step": 31612 + }, + { + "epoch": 0.41079651681250334, + "grad_norm": 0.3485633134841919, + "learning_rate": 0.0001178648019859889, + "loss": 1.2363, + "step": 31613 + }, + { + "epoch": 0.41080951135641924, + "grad_norm": 0.3062633275985718, + "learning_rate": 0.00011786220252407751, + "loss": 1.4422, + "step": 31614 + }, + { + "epoch": 0.4108225059003351, + "grad_norm": 0.38441792130470276, + "learning_rate": 0.00011785960306216615, + "loss": 1.3213, + "step": 31615 + }, + { + "epoch": 0.410835500444251, + "grad_norm": 0.46414437890052795, + "learning_rate": 0.00011785700360025476, + "loss": 1.4924, + "step": 31616 + }, + { + "epoch": 0.41084849498816683, + "grad_norm": 0.4170149862766266, + "learning_rate": 0.00011785440413834337, + "loss": 1.2918, + "step": 31617 + }, + { + "epoch": 0.41086148953208274, + "grad_norm": 0.44067803025245667, + "learning_rate": 0.00011785180467643198, + "loss": 1.3788, + "step": 31618 + }, + { + "epoch": 0.4108744840759986, + "grad_norm": 0.33015716075897217, + "learning_rate": 0.0001178492052145206, + "loss": 1.322, + "step": 31619 + }, + { + "epoch": 0.4108874786199145, + "grad_norm": 0.39627066254615784, + "learning_rate": 0.00011784660575260922, + "loss": 1.2984, + "step": 31620 + }, + { + "epoch": 0.4109004731638303, + "grad_norm": 0.4287610352039337, + "learning_rate": 0.00011784400629069783, + "loss": 1.413, + "step": 31621 + }, + { + "epoch": 0.4109134677077462, + "grad_norm": 0.3972589373588562, + "learning_rate": 0.00011784140682878644, + "loss": 1.4957, + "step": 31622 + }, + { + "epoch": 0.4109264622516621, + "grad_norm": 0.4931383430957794, + "learning_rate": 0.00011783880736687508, + "loss": 1.3749, + "step": 31623 + }, + { + "epoch": 0.410939456795578, + "grad_norm": 0.4313546121120453, + "learning_rate": 0.00011783620790496369, + "loss": 1.4123, + "step": 31624 + }, + { + "epoch": 0.4109524513394938, + "grad_norm": 0.41138026118278503, + "learning_rate": 0.00011783360844305228, + "loss": 1.2964, + "step": 31625 + }, + { + "epoch": 0.4109654458834097, + "grad_norm": 0.43869635462760925, + "learning_rate": 0.0001178310089811409, + "loss": 1.45, + "step": 31626 + }, + { + "epoch": 0.41097844042732556, + "grad_norm": 0.39452725648880005, + "learning_rate": 0.00011782840951922953, + "loss": 1.4999, + "step": 31627 + }, + { + "epoch": 0.41099143497124146, + "grad_norm": 0.44536057114601135, + "learning_rate": 0.00011782581005731814, + "loss": 1.4383, + "step": 31628 + }, + { + "epoch": 0.4110044295151573, + "grad_norm": 0.4568173289299011, + "learning_rate": 0.00011782321059540675, + "loss": 1.3343, + "step": 31629 + }, + { + "epoch": 0.4110174240590732, + "grad_norm": 0.37603095173835754, + "learning_rate": 0.00011782061113349537, + "loss": 1.4932, + "step": 31630 + }, + { + "epoch": 0.41103041860298906, + "grad_norm": 0.31568434834480286, + "learning_rate": 0.00011781801167158399, + "loss": 1.657, + "step": 31631 + }, + { + "epoch": 0.41104341314690496, + "grad_norm": 0.4077213406562805, + "learning_rate": 0.0001178154122096726, + "loss": 1.4164, + "step": 31632 + }, + { + "epoch": 0.4110564076908208, + "grad_norm": 0.44103261828422546, + "learning_rate": 0.00011781281274776121, + "loss": 1.501, + "step": 31633 + }, + { + "epoch": 0.4110694022347367, + "grad_norm": 0.42536574602127075, + "learning_rate": 0.00011781021328584982, + "loss": 1.5286, + "step": 31634 + }, + { + "epoch": 0.41108239677865255, + "grad_norm": 0.32663974165916443, + "learning_rate": 0.00011780761382393846, + "loss": 1.4775, + "step": 31635 + }, + { + "epoch": 0.41109539132256845, + "grad_norm": 0.39440450072288513, + "learning_rate": 0.00011780501436202707, + "loss": 1.3331, + "step": 31636 + }, + { + "epoch": 0.4111083858664843, + "grad_norm": 0.4558618664741516, + "learning_rate": 0.00011780241490011568, + "loss": 1.452, + "step": 31637 + }, + { + "epoch": 0.4111213804104002, + "grad_norm": 0.39504119753837585, + "learning_rate": 0.00011779981543820431, + "loss": 1.2357, + "step": 31638 + }, + { + "epoch": 0.41113437495431604, + "grad_norm": 0.4278838336467743, + "learning_rate": 0.00011779721597629292, + "loss": 1.2892, + "step": 31639 + }, + { + "epoch": 0.41114736949823194, + "grad_norm": 0.5051475763320923, + "learning_rate": 0.00011779461651438153, + "loss": 1.3866, + "step": 31640 + }, + { + "epoch": 0.4111603640421478, + "grad_norm": 0.32163652777671814, + "learning_rate": 0.00011779201705247014, + "loss": 1.2396, + "step": 31641 + }, + { + "epoch": 0.4111733585860637, + "grad_norm": 0.36184391379356384, + "learning_rate": 0.00011778941759055876, + "loss": 1.3137, + "step": 31642 + }, + { + "epoch": 0.41118635312997953, + "grad_norm": 0.3968331515789032, + "learning_rate": 0.00011778681812864738, + "loss": 1.3664, + "step": 31643 + }, + { + "epoch": 0.41119934767389543, + "grad_norm": 0.481471985578537, + "learning_rate": 0.00011778421866673599, + "loss": 1.4686, + "step": 31644 + }, + { + "epoch": 0.4112123422178113, + "grad_norm": 0.4758431017398834, + "learning_rate": 0.0001177816192048246, + "loss": 1.4014, + "step": 31645 + }, + { + "epoch": 0.4112253367617272, + "grad_norm": 0.2949455976486206, + "learning_rate": 0.00011777901974291324, + "loss": 1.2726, + "step": 31646 + }, + { + "epoch": 0.411238331305643, + "grad_norm": 0.3730049431324005, + "learning_rate": 0.00011777642028100185, + "loss": 1.3608, + "step": 31647 + }, + { + "epoch": 0.4112513258495589, + "grad_norm": 0.45002204179763794, + "learning_rate": 0.00011777382081909046, + "loss": 1.1865, + "step": 31648 + }, + { + "epoch": 0.41126432039347477, + "grad_norm": 0.38180041313171387, + "learning_rate": 0.00011777122135717907, + "loss": 1.5136, + "step": 31649 + }, + { + "epoch": 0.41127731493739067, + "grad_norm": 0.3299858570098877, + "learning_rate": 0.00011776862189526769, + "loss": 1.3172, + "step": 31650 + }, + { + "epoch": 0.4112903094813065, + "grad_norm": 0.35219427943229675, + "learning_rate": 0.0001177660224333563, + "loss": 1.3956, + "step": 31651 + }, + { + "epoch": 0.4113033040252224, + "grad_norm": 0.47893497347831726, + "learning_rate": 0.00011776342297144491, + "loss": 1.353, + "step": 31652 + }, + { + "epoch": 0.41131629856913826, + "grad_norm": 0.3493189811706543, + "learning_rate": 0.00011776082350953353, + "loss": 1.4158, + "step": 31653 + }, + { + "epoch": 0.41132929311305416, + "grad_norm": 0.5018308758735657, + "learning_rate": 0.00011775822404762215, + "loss": 1.4385, + "step": 31654 + }, + { + "epoch": 0.41134228765697, + "grad_norm": 0.4680333435535431, + "learning_rate": 0.00011775562458571076, + "loss": 1.5763, + "step": 31655 + }, + { + "epoch": 0.4113552822008859, + "grad_norm": 0.38837650418281555, + "learning_rate": 0.00011775302512379937, + "loss": 1.4964, + "step": 31656 + }, + { + "epoch": 0.41136827674480175, + "grad_norm": 0.39791902899742126, + "learning_rate": 0.00011775042566188798, + "loss": 1.4055, + "step": 31657 + }, + { + "epoch": 0.41138127128871765, + "grad_norm": 0.30373644828796387, + "learning_rate": 0.00011774782619997662, + "loss": 1.3375, + "step": 31658 + }, + { + "epoch": 0.4113942658326335, + "grad_norm": 0.3376322388648987, + "learning_rate": 0.00011774522673806523, + "loss": 1.2151, + "step": 31659 + }, + { + "epoch": 0.4114072603765494, + "grad_norm": 0.4305064380168915, + "learning_rate": 0.00011774262727615384, + "loss": 1.37, + "step": 31660 + }, + { + "epoch": 0.41142025492046524, + "grad_norm": 0.4347439110279083, + "learning_rate": 0.00011774002781424245, + "loss": 1.4325, + "step": 31661 + }, + { + "epoch": 0.41143324946438115, + "grad_norm": 0.4432355761528015, + "learning_rate": 0.00011773742835233108, + "loss": 1.2511, + "step": 31662 + }, + { + "epoch": 0.411446244008297, + "grad_norm": 0.44055166840553284, + "learning_rate": 0.00011773482889041969, + "loss": 1.6505, + "step": 31663 + }, + { + "epoch": 0.4114592385522129, + "grad_norm": 0.42715615034103394, + "learning_rate": 0.0001177322294285083, + "loss": 1.4526, + "step": 31664 + }, + { + "epoch": 0.41147223309612874, + "grad_norm": 0.3953370749950409, + "learning_rate": 0.00011772962996659691, + "loss": 1.6368, + "step": 31665 + }, + { + "epoch": 0.41148522764004464, + "grad_norm": 0.3512831926345825, + "learning_rate": 0.00011772703050468555, + "loss": 1.3893, + "step": 31666 + }, + { + "epoch": 0.4114982221839605, + "grad_norm": 0.33947882056236267, + "learning_rate": 0.00011772443104277415, + "loss": 1.2292, + "step": 31667 + }, + { + "epoch": 0.4115112167278764, + "grad_norm": 0.4190872609615326, + "learning_rate": 0.00011772183158086276, + "loss": 1.4618, + "step": 31668 + }, + { + "epoch": 0.41152421127179223, + "grad_norm": 0.3576473295688629, + "learning_rate": 0.00011771923211895137, + "loss": 1.2912, + "step": 31669 + }, + { + "epoch": 0.41153720581570813, + "grad_norm": 0.45064982771873474, + "learning_rate": 0.00011771663265704, + "loss": 1.3917, + "step": 31670 + }, + { + "epoch": 0.41155020035962403, + "grad_norm": 0.3616502285003662, + "learning_rate": 0.00011771403319512862, + "loss": 1.3541, + "step": 31671 + }, + { + "epoch": 0.4115631949035399, + "grad_norm": 0.4322145879268646, + "learning_rate": 0.00011771143373321723, + "loss": 1.3843, + "step": 31672 + }, + { + "epoch": 0.4115761894474558, + "grad_norm": 0.4824223220348358, + "learning_rate": 0.00011770883427130584, + "loss": 1.1931, + "step": 31673 + }, + { + "epoch": 0.4115891839913716, + "grad_norm": 0.22588540613651276, + "learning_rate": 0.00011770623480939446, + "loss": 1.3035, + "step": 31674 + }, + { + "epoch": 0.4116021785352875, + "grad_norm": 0.41667428612709045, + "learning_rate": 0.00011770363534748307, + "loss": 1.4036, + "step": 31675 + }, + { + "epoch": 0.41161517307920337, + "grad_norm": 0.429872989654541, + "learning_rate": 0.00011770103588557169, + "loss": 1.4253, + "step": 31676 + }, + { + "epoch": 0.41162816762311927, + "grad_norm": 0.4447936415672302, + "learning_rate": 0.00011769843642366032, + "loss": 1.5114, + "step": 31677 + }, + { + "epoch": 0.4116411621670351, + "grad_norm": 0.47786134481430054, + "learning_rate": 0.00011769583696174893, + "loss": 1.3995, + "step": 31678 + }, + { + "epoch": 0.411654156710951, + "grad_norm": 0.42816466093063354, + "learning_rate": 0.00011769323749983755, + "loss": 1.3131, + "step": 31679 + }, + { + "epoch": 0.41166715125486686, + "grad_norm": 0.38027897477149963, + "learning_rate": 0.00011769063803792614, + "loss": 1.4852, + "step": 31680 + }, + { + "epoch": 0.41168014579878276, + "grad_norm": 0.3440971374511719, + "learning_rate": 0.00011768803857601478, + "loss": 1.3942, + "step": 31681 + }, + { + "epoch": 0.4116931403426986, + "grad_norm": 0.3857981264591217, + "learning_rate": 0.00011768543911410339, + "loss": 1.3695, + "step": 31682 + }, + { + "epoch": 0.4117061348866145, + "grad_norm": 0.3942941725254059, + "learning_rate": 0.000117682839652192, + "loss": 1.3741, + "step": 31683 + }, + { + "epoch": 0.41171912943053035, + "grad_norm": 0.40192854404449463, + "learning_rate": 0.00011768024019028061, + "loss": 1.3453, + "step": 31684 + }, + { + "epoch": 0.41173212397444625, + "grad_norm": 0.4272104799747467, + "learning_rate": 0.00011767764072836924, + "loss": 1.4978, + "step": 31685 + }, + { + "epoch": 0.4117451185183621, + "grad_norm": 0.4035555124282837, + "learning_rate": 0.00011767504126645785, + "loss": 1.6438, + "step": 31686 + }, + { + "epoch": 0.411758113062278, + "grad_norm": 0.4157753586769104, + "learning_rate": 0.00011767244180454646, + "loss": 1.3634, + "step": 31687 + }, + { + "epoch": 0.41177110760619384, + "grad_norm": 0.3271634876728058, + "learning_rate": 0.00011766984234263507, + "loss": 1.3578, + "step": 31688 + }, + { + "epoch": 0.41178410215010974, + "grad_norm": 0.48875632882118225, + "learning_rate": 0.00011766724288072371, + "loss": 1.5354, + "step": 31689 + }, + { + "epoch": 0.4117970966940256, + "grad_norm": 0.3462316393852234, + "learning_rate": 0.00011766464341881232, + "loss": 1.4017, + "step": 31690 + }, + { + "epoch": 0.4118100912379415, + "grad_norm": 0.3762851059436798, + "learning_rate": 0.00011766204395690093, + "loss": 1.4698, + "step": 31691 + }, + { + "epoch": 0.41182308578185733, + "grad_norm": 0.3897680640220642, + "learning_rate": 0.00011765944449498954, + "loss": 1.6004, + "step": 31692 + }, + { + "epoch": 0.41183608032577323, + "grad_norm": 0.41915521025657654, + "learning_rate": 0.00011765684503307817, + "loss": 1.443, + "step": 31693 + }, + { + "epoch": 0.4118490748696891, + "grad_norm": 0.5198265910148621, + "learning_rate": 0.00011765424557116678, + "loss": 1.4722, + "step": 31694 + }, + { + "epoch": 0.411862069413605, + "grad_norm": 0.3126645088195801, + "learning_rate": 0.00011765164610925539, + "loss": 1.3796, + "step": 31695 + }, + { + "epoch": 0.4118750639575208, + "grad_norm": 0.47881627082824707, + "learning_rate": 0.000117649046647344, + "loss": 1.3092, + "step": 31696 + }, + { + "epoch": 0.4118880585014367, + "grad_norm": 0.3607673645019531, + "learning_rate": 0.00011764644718543262, + "loss": 1.3685, + "step": 31697 + }, + { + "epoch": 0.41190105304535257, + "grad_norm": 0.3296034336090088, + "learning_rate": 0.00011764384772352123, + "loss": 1.2878, + "step": 31698 + }, + { + "epoch": 0.4119140475892685, + "grad_norm": 0.44708120822906494, + "learning_rate": 0.00011764124826160985, + "loss": 1.2914, + "step": 31699 + }, + { + "epoch": 0.4119270421331843, + "grad_norm": 0.4136098623275757, + "learning_rate": 0.00011763864879969846, + "loss": 1.4351, + "step": 31700 + }, + { + "epoch": 0.4119400366771002, + "grad_norm": 0.3559790253639221, + "learning_rate": 0.0001176360493377871, + "loss": 1.5522, + "step": 31701 + }, + { + "epoch": 0.41195303122101606, + "grad_norm": 0.39897531270980835, + "learning_rate": 0.0001176334498758757, + "loss": 1.5008, + "step": 31702 + }, + { + "epoch": 0.41196602576493196, + "grad_norm": 0.46392497420310974, + "learning_rate": 0.00011763085041396432, + "loss": 1.3661, + "step": 31703 + }, + { + "epoch": 0.4119790203088478, + "grad_norm": 0.47510677576065063, + "learning_rate": 0.00011762825095205293, + "loss": 1.6113, + "step": 31704 + }, + { + "epoch": 0.4119920148527637, + "grad_norm": 0.3748581111431122, + "learning_rate": 0.00011762565149014155, + "loss": 1.2199, + "step": 31705 + }, + { + "epoch": 0.41200500939667956, + "grad_norm": 0.38417956233024597, + "learning_rate": 0.00011762305202823016, + "loss": 1.4346, + "step": 31706 + }, + { + "epoch": 0.41201800394059546, + "grad_norm": 0.41488510370254517, + "learning_rate": 0.00011762045256631877, + "loss": 1.3613, + "step": 31707 + }, + { + "epoch": 0.4120309984845113, + "grad_norm": 0.3389635682106018, + "learning_rate": 0.00011761785310440738, + "loss": 1.4276, + "step": 31708 + }, + { + "epoch": 0.4120439930284272, + "grad_norm": 0.3359288275241852, + "learning_rate": 0.00011761525364249601, + "loss": 1.2936, + "step": 31709 + }, + { + "epoch": 0.41205698757234305, + "grad_norm": 0.3816105127334595, + "learning_rate": 0.00011761265418058462, + "loss": 1.3175, + "step": 31710 + }, + { + "epoch": 0.41206998211625895, + "grad_norm": 0.3633844256401062, + "learning_rate": 0.00011761005471867323, + "loss": 1.4938, + "step": 31711 + }, + { + "epoch": 0.4120829766601748, + "grad_norm": 0.43474116921424866, + "learning_rate": 0.00011760745525676184, + "loss": 1.2917, + "step": 31712 + }, + { + "epoch": 0.4120959712040907, + "grad_norm": 0.4055688679218292, + "learning_rate": 0.00011760485579485048, + "loss": 1.4573, + "step": 31713 + }, + { + "epoch": 0.41210896574800654, + "grad_norm": 0.3859024941921234, + "learning_rate": 0.00011760225633293909, + "loss": 1.2373, + "step": 31714 + }, + { + "epoch": 0.41212196029192244, + "grad_norm": 0.6114421486854553, + "learning_rate": 0.0001175996568710277, + "loss": 1.4964, + "step": 31715 + }, + { + "epoch": 0.4121349548358383, + "grad_norm": 0.45155957341194153, + "learning_rate": 0.00011759705740911633, + "loss": 1.4001, + "step": 31716 + }, + { + "epoch": 0.4121479493797542, + "grad_norm": 0.45348045229911804, + "learning_rate": 0.00011759445794720494, + "loss": 1.3784, + "step": 31717 + }, + { + "epoch": 0.41216094392367003, + "grad_norm": 0.45521479845046997, + "learning_rate": 0.00011759185848529355, + "loss": 1.4011, + "step": 31718 + }, + { + "epoch": 0.41217393846758593, + "grad_norm": 0.48154446482658386, + "learning_rate": 0.00011758925902338216, + "loss": 1.4296, + "step": 31719 + }, + { + "epoch": 0.4121869330115018, + "grad_norm": 0.3529930114746094, + "learning_rate": 0.0001175866595614708, + "loss": 1.3059, + "step": 31720 + }, + { + "epoch": 0.4121999275554177, + "grad_norm": 0.4399464428424835, + "learning_rate": 0.00011758406009955941, + "loss": 1.552, + "step": 31721 + }, + { + "epoch": 0.4122129220993335, + "grad_norm": 0.42122894525527954, + "learning_rate": 0.000117581460637648, + "loss": 1.4771, + "step": 31722 + }, + { + "epoch": 0.4122259166432494, + "grad_norm": 0.39818650484085083, + "learning_rate": 0.00011757886117573662, + "loss": 1.4132, + "step": 31723 + }, + { + "epoch": 0.41223891118716527, + "grad_norm": 0.34989452362060547, + "learning_rate": 0.00011757626171382525, + "loss": 1.2825, + "step": 31724 + }, + { + "epoch": 0.41225190573108117, + "grad_norm": 0.4541240632534027, + "learning_rate": 0.00011757366225191386, + "loss": 1.4453, + "step": 31725 + }, + { + "epoch": 0.412264900274997, + "grad_norm": 0.3833162486553192, + "learning_rate": 0.00011757106279000248, + "loss": 1.3005, + "step": 31726 + }, + { + "epoch": 0.4122778948189129, + "grad_norm": 0.27818942070007324, + "learning_rate": 0.00011756846332809109, + "loss": 1.4477, + "step": 31727 + }, + { + "epoch": 0.41229088936282876, + "grad_norm": 0.43792203068733215, + "learning_rate": 0.00011756586386617971, + "loss": 1.5172, + "step": 31728 + }, + { + "epoch": 0.41230388390674466, + "grad_norm": 0.4293571710586548, + "learning_rate": 0.00011756326440426832, + "loss": 1.4108, + "step": 31729 + }, + { + "epoch": 0.4123168784506605, + "grad_norm": 0.41879352927207947, + "learning_rate": 0.00011756066494235693, + "loss": 1.4593, + "step": 31730 + }, + { + "epoch": 0.4123298729945764, + "grad_norm": 0.4028339087963104, + "learning_rate": 0.00011755806548044554, + "loss": 1.634, + "step": 31731 + }, + { + "epoch": 0.41234286753849225, + "grad_norm": 0.42921629548072815, + "learning_rate": 0.00011755546601853418, + "loss": 1.4531, + "step": 31732 + }, + { + "epoch": 0.41235586208240815, + "grad_norm": 0.45704343914985657, + "learning_rate": 0.00011755286655662279, + "loss": 1.5203, + "step": 31733 + }, + { + "epoch": 0.412368856626324, + "grad_norm": 0.36380091309547424, + "learning_rate": 0.0001175502670947114, + "loss": 1.3405, + "step": 31734 + }, + { + "epoch": 0.4123818511702399, + "grad_norm": 0.4240029454231262, + "learning_rate": 0.0001175476676328, + "loss": 1.5142, + "step": 31735 + }, + { + "epoch": 0.41239484571415574, + "grad_norm": 0.38916006684303284, + "learning_rate": 0.00011754506817088864, + "loss": 1.5145, + "step": 31736 + }, + { + "epoch": 0.41240784025807165, + "grad_norm": 0.4423098564147949, + "learning_rate": 0.00011754246870897725, + "loss": 1.4158, + "step": 31737 + }, + { + "epoch": 0.4124208348019875, + "grad_norm": 0.42087674140930176, + "learning_rate": 0.00011753986924706586, + "loss": 1.4163, + "step": 31738 + }, + { + "epoch": 0.4124338293459034, + "grad_norm": 0.4316350221633911, + "learning_rate": 0.00011753726978515447, + "loss": 1.4245, + "step": 31739 + }, + { + "epoch": 0.41244682388981924, + "grad_norm": 0.4177074730396271, + "learning_rate": 0.0001175346703232431, + "loss": 1.5875, + "step": 31740 + }, + { + "epoch": 0.41245981843373514, + "grad_norm": 0.40894240140914917, + "learning_rate": 0.00011753207086133171, + "loss": 1.3955, + "step": 31741 + }, + { + "epoch": 0.412472812977651, + "grad_norm": 0.4216795861721039, + "learning_rate": 0.00011752947139942032, + "loss": 1.4871, + "step": 31742 + }, + { + "epoch": 0.4124858075215669, + "grad_norm": 0.2925211191177368, + "learning_rate": 0.00011752687193750893, + "loss": 1.4822, + "step": 31743 + }, + { + "epoch": 0.41249880206548273, + "grad_norm": 0.41941073536872864, + "learning_rate": 0.00011752427247559757, + "loss": 1.4004, + "step": 31744 + }, + { + "epoch": 0.41251179660939863, + "grad_norm": 0.45500990748405457, + "learning_rate": 0.00011752167301368618, + "loss": 1.4676, + "step": 31745 + }, + { + "epoch": 0.41252479115331453, + "grad_norm": 0.4242511987686157, + "learning_rate": 0.00011751907355177479, + "loss": 1.1443, + "step": 31746 + }, + { + "epoch": 0.4125377856972304, + "grad_norm": 0.41123929619789124, + "learning_rate": 0.00011751647408986339, + "loss": 1.5644, + "step": 31747 + }, + { + "epoch": 0.4125507802411463, + "grad_norm": 0.4290185272693634, + "learning_rate": 0.00011751387462795202, + "loss": 1.4074, + "step": 31748 + }, + { + "epoch": 0.4125637747850621, + "grad_norm": 0.33407479524612427, + "learning_rate": 0.00011751127516604064, + "loss": 1.2584, + "step": 31749 + }, + { + "epoch": 0.412576769328978, + "grad_norm": 0.38675615191459656, + "learning_rate": 0.00011750867570412925, + "loss": 1.2887, + "step": 31750 + }, + { + "epoch": 0.41258976387289387, + "grad_norm": 0.4034826159477234, + "learning_rate": 0.00011750607624221787, + "loss": 1.4663, + "step": 31751 + }, + { + "epoch": 0.41260275841680977, + "grad_norm": 0.4832096993923187, + "learning_rate": 0.00011750347678030648, + "loss": 1.6129, + "step": 31752 + }, + { + "epoch": 0.4126157529607256, + "grad_norm": 0.5136739611625671, + "learning_rate": 0.00011750087731839509, + "loss": 1.5366, + "step": 31753 + }, + { + "epoch": 0.4126287475046415, + "grad_norm": 0.301689088344574, + "learning_rate": 0.0001174982778564837, + "loss": 1.1754, + "step": 31754 + }, + { + "epoch": 0.41264174204855736, + "grad_norm": 0.3035409450531006, + "learning_rate": 0.00011749567839457234, + "loss": 1.2916, + "step": 31755 + }, + { + "epoch": 0.41265473659247326, + "grad_norm": 0.38761475682258606, + "learning_rate": 0.00011749307893266095, + "loss": 1.4195, + "step": 31756 + }, + { + "epoch": 0.4126677311363891, + "grad_norm": 0.3026151955127716, + "learning_rate": 0.00011749047947074956, + "loss": 1.4134, + "step": 31757 + }, + { + "epoch": 0.412680725680305, + "grad_norm": 0.3911232650279999, + "learning_rate": 0.00011748788000883817, + "loss": 1.3184, + "step": 31758 + }, + { + "epoch": 0.41269372022422085, + "grad_norm": 0.305425763130188, + "learning_rate": 0.0001174852805469268, + "loss": 1.4759, + "step": 31759 + }, + { + "epoch": 0.41270671476813675, + "grad_norm": 0.3558710217475891, + "learning_rate": 0.00011748268108501541, + "loss": 1.5441, + "step": 31760 + }, + { + "epoch": 0.4127197093120526, + "grad_norm": 0.3979869484901428, + "learning_rate": 0.00011748008162310402, + "loss": 1.4779, + "step": 31761 + }, + { + "epoch": 0.4127327038559685, + "grad_norm": 0.4547790586948395, + "learning_rate": 0.00011747748216119263, + "loss": 1.4595, + "step": 31762 + }, + { + "epoch": 0.41274569839988434, + "grad_norm": 0.3734130561351776, + "learning_rate": 0.00011747488269928127, + "loss": 1.3162, + "step": 31763 + }, + { + "epoch": 0.41275869294380024, + "grad_norm": 0.34993186593055725, + "learning_rate": 0.00011747228323736987, + "loss": 1.1964, + "step": 31764 + }, + { + "epoch": 0.4127716874877161, + "grad_norm": 0.2659223973751068, + "learning_rate": 0.00011746968377545848, + "loss": 1.1795, + "step": 31765 + }, + { + "epoch": 0.412784682031632, + "grad_norm": 0.38865891098976135, + "learning_rate": 0.00011746708431354709, + "loss": 1.3731, + "step": 31766 + }, + { + "epoch": 0.41279767657554783, + "grad_norm": 0.46732452511787415, + "learning_rate": 0.00011746448485163573, + "loss": 1.5205, + "step": 31767 + }, + { + "epoch": 0.41281067111946373, + "grad_norm": 0.46368417143821716, + "learning_rate": 0.00011746188538972434, + "loss": 1.3195, + "step": 31768 + }, + { + "epoch": 0.4128236656633796, + "grad_norm": 0.36316293478012085, + "learning_rate": 0.00011745928592781295, + "loss": 1.131, + "step": 31769 + }, + { + "epoch": 0.4128366602072955, + "grad_norm": 0.4895988702774048, + "learning_rate": 0.00011745668646590156, + "loss": 1.55, + "step": 31770 + }, + { + "epoch": 0.4128496547512113, + "grad_norm": 0.38821110129356384, + "learning_rate": 0.00011745408700399018, + "loss": 1.1851, + "step": 31771 + }, + { + "epoch": 0.4128626492951272, + "grad_norm": 0.40321868658065796, + "learning_rate": 0.0001174514875420788, + "loss": 1.3723, + "step": 31772 + }, + { + "epoch": 0.41287564383904307, + "grad_norm": 0.4872627556324005, + "learning_rate": 0.0001174488880801674, + "loss": 1.3602, + "step": 31773 + }, + { + "epoch": 0.412888638382959, + "grad_norm": 0.3604411482810974, + "learning_rate": 0.00011744628861825602, + "loss": 1.6038, + "step": 31774 + }, + { + "epoch": 0.4129016329268748, + "grad_norm": 0.38879451155662537, + "learning_rate": 0.00011744368915634466, + "loss": 1.3975, + "step": 31775 + }, + { + "epoch": 0.4129146274707907, + "grad_norm": 0.3540457785129547, + "learning_rate": 0.00011744108969443327, + "loss": 1.3911, + "step": 31776 + }, + { + "epoch": 0.41292762201470656, + "grad_norm": 0.3881840407848358, + "learning_rate": 0.00011743849023252186, + "loss": 1.4983, + "step": 31777 + }, + { + "epoch": 0.41294061655862246, + "grad_norm": 0.39743998646736145, + "learning_rate": 0.00011743589077061047, + "loss": 1.5392, + "step": 31778 + }, + { + "epoch": 0.4129536111025383, + "grad_norm": 0.36503171920776367, + "learning_rate": 0.00011743329130869911, + "loss": 1.567, + "step": 31779 + }, + { + "epoch": 0.4129666056464542, + "grad_norm": 0.43378350138664246, + "learning_rate": 0.00011743069184678772, + "loss": 1.5279, + "step": 31780 + }, + { + "epoch": 0.41297960019037006, + "grad_norm": 0.35806694626808167, + "learning_rate": 0.00011742809238487633, + "loss": 1.2686, + "step": 31781 + }, + { + "epoch": 0.41299259473428596, + "grad_norm": 0.3607901334762573, + "learning_rate": 0.00011742549292296495, + "loss": 1.412, + "step": 31782 + }, + { + "epoch": 0.4130055892782018, + "grad_norm": 0.4307047128677368, + "learning_rate": 0.00011742289346105357, + "loss": 1.3682, + "step": 31783 + }, + { + "epoch": 0.4130185838221177, + "grad_norm": 0.422247976064682, + "learning_rate": 0.00011742029399914218, + "loss": 1.4062, + "step": 31784 + }, + { + "epoch": 0.41303157836603355, + "grad_norm": 0.39503756165504456, + "learning_rate": 0.00011741769453723079, + "loss": 1.476, + "step": 31785 + }, + { + "epoch": 0.41304457290994945, + "grad_norm": 0.4251067638397217, + "learning_rate": 0.0001174150950753194, + "loss": 1.3316, + "step": 31786 + }, + { + "epoch": 0.4130575674538653, + "grad_norm": 0.37648358941078186, + "learning_rate": 0.00011741249561340804, + "loss": 1.4498, + "step": 31787 + }, + { + "epoch": 0.4130705619977812, + "grad_norm": 0.43251916766166687, + "learning_rate": 0.00011740989615149665, + "loss": 1.4284, + "step": 31788 + }, + { + "epoch": 0.41308355654169704, + "grad_norm": 0.3351757526397705, + "learning_rate": 0.00011740729668958525, + "loss": 1.2354, + "step": 31789 + }, + { + "epoch": 0.41309655108561294, + "grad_norm": 0.4672812521457672, + "learning_rate": 0.00011740469722767389, + "loss": 1.4183, + "step": 31790 + }, + { + "epoch": 0.4131095456295288, + "grad_norm": 0.48855289816856384, + "learning_rate": 0.0001174020977657625, + "loss": 1.3432, + "step": 31791 + }, + { + "epoch": 0.4131225401734447, + "grad_norm": 0.43809136748313904, + "learning_rate": 0.00011739949830385111, + "loss": 1.3662, + "step": 31792 + }, + { + "epoch": 0.41313553471736053, + "grad_norm": 0.4194406270980835, + "learning_rate": 0.00011739689884193972, + "loss": 1.3783, + "step": 31793 + }, + { + "epoch": 0.41314852926127643, + "grad_norm": 0.43101072311401367, + "learning_rate": 0.00011739429938002834, + "loss": 1.3867, + "step": 31794 + }, + { + "epoch": 0.4131615238051923, + "grad_norm": 0.33521032333374023, + "learning_rate": 0.00011739169991811696, + "loss": 1.2898, + "step": 31795 + }, + { + "epoch": 0.4131745183491082, + "grad_norm": 0.4578614830970764, + "learning_rate": 0.00011738910045620557, + "loss": 1.4371, + "step": 31796 + }, + { + "epoch": 0.413187512893024, + "grad_norm": 0.3994629681110382, + "learning_rate": 0.00011738650099429418, + "loss": 1.5436, + "step": 31797 + }, + { + "epoch": 0.4132005074369399, + "grad_norm": 0.33394962549209595, + "learning_rate": 0.00011738390153238282, + "loss": 1.5098, + "step": 31798 + }, + { + "epoch": 0.41321350198085577, + "grad_norm": 0.49559107422828674, + "learning_rate": 0.00011738130207047143, + "loss": 1.4752, + "step": 31799 + }, + { + "epoch": 0.41322649652477167, + "grad_norm": 0.4040876626968384, + "learning_rate": 0.00011737870260856004, + "loss": 1.3613, + "step": 31800 + }, + { + "epoch": 0.4132394910686875, + "grad_norm": 0.46256762742996216, + "learning_rate": 0.00011737610314664865, + "loss": 1.3513, + "step": 31801 + }, + { + "epoch": 0.4132524856126034, + "grad_norm": 0.44079864025115967, + "learning_rate": 0.00011737350368473727, + "loss": 1.378, + "step": 31802 + }, + { + "epoch": 0.41326548015651926, + "grad_norm": 0.389039009809494, + "learning_rate": 0.00011737090422282588, + "loss": 1.4559, + "step": 31803 + }, + { + "epoch": 0.41327847470043516, + "grad_norm": 0.36529913544654846, + "learning_rate": 0.0001173683047609145, + "loss": 1.505, + "step": 31804 + }, + { + "epoch": 0.413291469244351, + "grad_norm": 0.3017125427722931, + "learning_rate": 0.0001173657052990031, + "loss": 1.2248, + "step": 31805 + }, + { + "epoch": 0.4133044637882669, + "grad_norm": 0.36836498975753784, + "learning_rate": 0.00011736310583709173, + "loss": 1.5097, + "step": 31806 + }, + { + "epoch": 0.41331745833218275, + "grad_norm": 0.3604908883571625, + "learning_rate": 0.00011736050637518034, + "loss": 1.3791, + "step": 31807 + }, + { + "epoch": 0.41333045287609865, + "grad_norm": 0.33806708455085754, + "learning_rate": 0.00011735790691326895, + "loss": 1.238, + "step": 31808 + }, + { + "epoch": 0.4133434474200145, + "grad_norm": 0.4385598599910736, + "learning_rate": 0.00011735530745135756, + "loss": 1.6464, + "step": 31809 + }, + { + "epoch": 0.4133564419639304, + "grad_norm": 0.4294182062149048, + "learning_rate": 0.0001173527079894462, + "loss": 1.433, + "step": 31810 + }, + { + "epoch": 0.41336943650784624, + "grad_norm": 0.37577179074287415, + "learning_rate": 0.00011735010852753481, + "loss": 1.4455, + "step": 31811 + }, + { + "epoch": 0.41338243105176214, + "grad_norm": 0.387668639421463, + "learning_rate": 0.00011734750906562342, + "loss": 1.637, + "step": 31812 + }, + { + "epoch": 0.413395425595678, + "grad_norm": 0.48617732524871826, + "learning_rate": 0.00011734490960371203, + "loss": 1.4254, + "step": 31813 + }, + { + "epoch": 0.4134084201395939, + "grad_norm": 0.4484746754169464, + "learning_rate": 0.00011734231014180066, + "loss": 1.2367, + "step": 31814 + }, + { + "epoch": 0.41342141468350974, + "grad_norm": 0.41519516706466675, + "learning_rate": 0.00011733971067988927, + "loss": 1.5143, + "step": 31815 + }, + { + "epoch": 0.41343440922742564, + "grad_norm": 0.40131473541259766, + "learning_rate": 0.00011733711121797788, + "loss": 1.456, + "step": 31816 + }, + { + "epoch": 0.4134474037713415, + "grad_norm": 0.35350683331489563, + "learning_rate": 0.00011733451175606649, + "loss": 1.5, + "step": 31817 + }, + { + "epoch": 0.4134603983152574, + "grad_norm": 0.3574559688568115, + "learning_rate": 0.00011733191229415512, + "loss": 1.2697, + "step": 31818 + }, + { + "epoch": 0.4134733928591732, + "grad_norm": 0.4214079678058624, + "learning_rate": 0.00011732931283224373, + "loss": 1.4518, + "step": 31819 + }, + { + "epoch": 0.41348638740308913, + "grad_norm": 0.432996928691864, + "learning_rate": 0.00011732671337033234, + "loss": 1.2822, + "step": 31820 + }, + { + "epoch": 0.413499381947005, + "grad_norm": 0.38598212599754333, + "learning_rate": 0.00011732411390842095, + "loss": 1.4522, + "step": 31821 + }, + { + "epoch": 0.4135123764909209, + "grad_norm": 0.39914241433143616, + "learning_rate": 0.00011732151444650959, + "loss": 1.4127, + "step": 31822 + }, + { + "epoch": 0.4135253710348368, + "grad_norm": 0.3408644497394562, + "learning_rate": 0.0001173189149845982, + "loss": 1.2182, + "step": 31823 + }, + { + "epoch": 0.4135383655787526, + "grad_norm": 0.47123709321022034, + "learning_rate": 0.00011731631552268681, + "loss": 1.4532, + "step": 31824 + }, + { + "epoch": 0.4135513601226685, + "grad_norm": 0.3978356420993805, + "learning_rate": 0.00011731371606077543, + "loss": 1.3469, + "step": 31825 + }, + { + "epoch": 0.41356435466658437, + "grad_norm": 0.4253008961677551, + "learning_rate": 0.00011731111659886404, + "loss": 1.4096, + "step": 31826 + }, + { + "epoch": 0.41357734921050027, + "grad_norm": 0.4575842618942261, + "learning_rate": 0.00011730851713695265, + "loss": 1.4485, + "step": 31827 + }, + { + "epoch": 0.4135903437544161, + "grad_norm": 0.3874261677265167, + "learning_rate": 0.00011730591767504127, + "loss": 1.5444, + "step": 31828 + }, + { + "epoch": 0.413603338298332, + "grad_norm": 0.4312470257282257, + "learning_rate": 0.0001173033182131299, + "loss": 1.3847, + "step": 31829 + }, + { + "epoch": 0.41361633284224786, + "grad_norm": 0.4055170714855194, + "learning_rate": 0.00011730071875121851, + "loss": 1.6956, + "step": 31830 + }, + { + "epoch": 0.41362932738616376, + "grad_norm": 0.4001989960670471, + "learning_rate": 0.00011729811928930711, + "loss": 1.4912, + "step": 31831 + }, + { + "epoch": 0.4136423219300796, + "grad_norm": 0.4116976261138916, + "learning_rate": 0.00011729551982739572, + "loss": 1.418, + "step": 31832 + }, + { + "epoch": 0.4136553164739955, + "grad_norm": 0.5365820527076721, + "learning_rate": 0.00011729292036548436, + "loss": 1.2788, + "step": 31833 + }, + { + "epoch": 0.41366831101791135, + "grad_norm": 0.3660898506641388, + "learning_rate": 0.00011729032090357297, + "loss": 1.2591, + "step": 31834 + }, + { + "epoch": 0.41368130556182725, + "grad_norm": 0.4454982280731201, + "learning_rate": 0.00011728772144166158, + "loss": 1.3251, + "step": 31835 + }, + { + "epoch": 0.4136943001057431, + "grad_norm": 0.4318486154079437, + "learning_rate": 0.00011728512197975019, + "loss": 1.3411, + "step": 31836 + }, + { + "epoch": 0.413707294649659, + "grad_norm": 0.3369883894920349, + "learning_rate": 0.00011728252251783882, + "loss": 1.2816, + "step": 31837 + }, + { + "epoch": 0.41372028919357484, + "grad_norm": 0.4071979522705078, + "learning_rate": 0.00011727992305592743, + "loss": 1.3644, + "step": 31838 + }, + { + "epoch": 0.41373328373749074, + "grad_norm": 0.38106846809387207, + "learning_rate": 0.00011727732359401604, + "loss": 1.5451, + "step": 31839 + }, + { + "epoch": 0.4137462782814066, + "grad_norm": 0.33907535672187805, + "learning_rate": 0.00011727472413210465, + "loss": 1.3698, + "step": 31840 + }, + { + "epoch": 0.4137592728253225, + "grad_norm": 0.3957054018974304, + "learning_rate": 0.00011727212467019329, + "loss": 1.2806, + "step": 31841 + }, + { + "epoch": 0.41377226736923833, + "grad_norm": 0.5480278134346008, + "learning_rate": 0.0001172695252082819, + "loss": 1.4715, + "step": 31842 + }, + { + "epoch": 0.41378526191315423, + "grad_norm": 0.35822397470474243, + "learning_rate": 0.00011726692574637051, + "loss": 1.5817, + "step": 31843 + }, + { + "epoch": 0.4137982564570701, + "grad_norm": 0.47707533836364746, + "learning_rate": 0.00011726432628445911, + "loss": 1.3573, + "step": 31844 + }, + { + "epoch": 0.413811251000986, + "grad_norm": 0.3140091001987457, + "learning_rate": 0.00011726172682254775, + "loss": 1.5765, + "step": 31845 + }, + { + "epoch": 0.4138242455449018, + "grad_norm": 0.26068300008773804, + "learning_rate": 0.00011725912736063636, + "loss": 1.3899, + "step": 31846 + }, + { + "epoch": 0.4138372400888177, + "grad_norm": 0.3251701593399048, + "learning_rate": 0.00011725652789872497, + "loss": 1.3564, + "step": 31847 + }, + { + "epoch": 0.41385023463273357, + "grad_norm": 0.3763917088508606, + "learning_rate": 0.00011725392843681358, + "loss": 1.2111, + "step": 31848 + }, + { + "epoch": 0.41386322917664947, + "grad_norm": 0.2945026159286499, + "learning_rate": 0.0001172513289749022, + "loss": 1.3622, + "step": 31849 + }, + { + "epoch": 0.4138762237205653, + "grad_norm": 0.39809390902519226, + "learning_rate": 0.00011724872951299081, + "loss": 1.5935, + "step": 31850 + }, + { + "epoch": 0.4138892182644812, + "grad_norm": 0.3888184726238251, + "learning_rate": 0.00011724613005107942, + "loss": 1.2957, + "step": 31851 + }, + { + "epoch": 0.41390221280839706, + "grad_norm": 0.37752488255500793, + "learning_rate": 0.00011724353058916804, + "loss": 1.4743, + "step": 31852 + }, + { + "epoch": 0.41391520735231296, + "grad_norm": 0.31938958168029785, + "learning_rate": 0.00011724093112725667, + "loss": 1.3758, + "step": 31853 + }, + { + "epoch": 0.4139282018962288, + "grad_norm": 0.5017914175987244, + "learning_rate": 0.00011723833166534528, + "loss": 1.5197, + "step": 31854 + }, + { + "epoch": 0.4139411964401447, + "grad_norm": 0.4456081688404083, + "learning_rate": 0.0001172357322034339, + "loss": 1.5705, + "step": 31855 + }, + { + "epoch": 0.41395419098406055, + "grad_norm": 0.45309075713157654, + "learning_rate": 0.0001172331327415225, + "loss": 1.3788, + "step": 31856 + }, + { + "epoch": 0.41396718552797646, + "grad_norm": 0.35578569769859314, + "learning_rate": 0.00011723053327961113, + "loss": 1.4373, + "step": 31857 + }, + { + "epoch": 0.4139801800718923, + "grad_norm": 0.3951316177845001, + "learning_rate": 0.00011722793381769974, + "loss": 1.429, + "step": 31858 + }, + { + "epoch": 0.4139931746158082, + "grad_norm": 0.327208012342453, + "learning_rate": 0.00011722533435578835, + "loss": 1.4253, + "step": 31859 + }, + { + "epoch": 0.41400616915972405, + "grad_norm": 0.39285707473754883, + "learning_rate": 0.00011722273489387696, + "loss": 1.5298, + "step": 31860 + }, + { + "epoch": 0.41401916370363995, + "grad_norm": 0.39437335729599, + "learning_rate": 0.00011722013543196559, + "loss": 1.4457, + "step": 31861 + }, + { + "epoch": 0.4140321582475558, + "grad_norm": 0.37862467765808105, + "learning_rate": 0.0001172175359700542, + "loss": 1.4461, + "step": 31862 + }, + { + "epoch": 0.4140451527914717, + "grad_norm": 0.3078564703464508, + "learning_rate": 0.00011721493650814281, + "loss": 1.4327, + "step": 31863 + }, + { + "epoch": 0.41405814733538754, + "grad_norm": 0.4065170884132385, + "learning_rate": 0.00011721233704623145, + "loss": 1.2792, + "step": 31864 + }, + { + "epoch": 0.41407114187930344, + "grad_norm": 0.46307018399238586, + "learning_rate": 0.00011720973758432006, + "loss": 1.3912, + "step": 31865 + }, + { + "epoch": 0.4140841364232193, + "grad_norm": 0.3540421426296234, + "learning_rate": 0.00011720713812240867, + "loss": 1.4259, + "step": 31866 + }, + { + "epoch": 0.4140971309671352, + "grad_norm": 0.34455418586730957, + "learning_rate": 0.00011720453866049728, + "loss": 1.5064, + "step": 31867 + }, + { + "epoch": 0.41411012551105103, + "grad_norm": 0.35030514001846313, + "learning_rate": 0.0001172019391985859, + "loss": 1.199, + "step": 31868 + }, + { + "epoch": 0.41412312005496693, + "grad_norm": 0.3843987286090851, + "learning_rate": 0.00011719933973667452, + "loss": 1.3091, + "step": 31869 + }, + { + "epoch": 0.4141361145988828, + "grad_norm": 0.310016006231308, + "learning_rate": 0.00011719674027476313, + "loss": 1.3143, + "step": 31870 + }, + { + "epoch": 0.4141491091427987, + "grad_norm": 0.560285747051239, + "learning_rate": 0.00011719414081285174, + "loss": 1.2457, + "step": 31871 + }, + { + "epoch": 0.4141621036867145, + "grad_norm": 0.43611177802085876, + "learning_rate": 0.00011719154135094038, + "loss": 1.3695, + "step": 31872 + }, + { + "epoch": 0.4141750982306304, + "grad_norm": 0.40217819809913635, + "learning_rate": 0.00011718894188902897, + "loss": 1.4057, + "step": 31873 + }, + { + "epoch": 0.41418809277454627, + "grad_norm": 0.35014262795448303, + "learning_rate": 0.00011718634242711758, + "loss": 1.3944, + "step": 31874 + }, + { + "epoch": 0.41420108731846217, + "grad_norm": 0.4676024913787842, + "learning_rate": 0.0001171837429652062, + "loss": 1.2367, + "step": 31875 + }, + { + "epoch": 0.414214081862378, + "grad_norm": 0.42215174436569214, + "learning_rate": 0.00011718114350329483, + "loss": 1.3562, + "step": 31876 + }, + { + "epoch": 0.4142270764062939, + "grad_norm": 0.4098634719848633, + "learning_rate": 0.00011717854404138344, + "loss": 1.5743, + "step": 31877 + }, + { + "epoch": 0.41424007095020976, + "grad_norm": 0.48227232694625854, + "learning_rate": 0.00011717594457947206, + "loss": 1.4586, + "step": 31878 + }, + { + "epoch": 0.41425306549412566, + "grad_norm": 0.3555050194263458, + "learning_rate": 0.00011717334511756067, + "loss": 1.1874, + "step": 31879 + }, + { + "epoch": 0.4142660600380415, + "grad_norm": 0.4402369260787964, + "learning_rate": 0.00011717074565564929, + "loss": 1.4736, + "step": 31880 + }, + { + "epoch": 0.4142790545819574, + "grad_norm": 0.4640365540981293, + "learning_rate": 0.0001171681461937379, + "loss": 1.459, + "step": 31881 + }, + { + "epoch": 0.41429204912587325, + "grad_norm": 0.3996478021144867, + "learning_rate": 0.00011716554673182651, + "loss": 1.3094, + "step": 31882 + }, + { + "epoch": 0.41430504366978915, + "grad_norm": 0.3977143168449402, + "learning_rate": 0.00011716294726991512, + "loss": 1.3877, + "step": 31883 + }, + { + "epoch": 0.414318038213705, + "grad_norm": 0.2982369363307953, + "learning_rate": 0.00011716034780800376, + "loss": 1.1032, + "step": 31884 + }, + { + "epoch": 0.4143310327576209, + "grad_norm": 0.5057021379470825, + "learning_rate": 0.00011715774834609237, + "loss": 1.5814, + "step": 31885 + }, + { + "epoch": 0.41434402730153674, + "grad_norm": 0.3497140407562256, + "learning_rate": 0.00011715514888418097, + "loss": 1.4647, + "step": 31886 + }, + { + "epoch": 0.41435702184545264, + "grad_norm": 0.4693131446838379, + "learning_rate": 0.00011715254942226958, + "loss": 1.4025, + "step": 31887 + }, + { + "epoch": 0.4143700163893685, + "grad_norm": 0.3365936577320099, + "learning_rate": 0.00011714994996035822, + "loss": 1.4658, + "step": 31888 + }, + { + "epoch": 0.4143830109332844, + "grad_norm": 0.390459269285202, + "learning_rate": 0.00011714735049844683, + "loss": 1.2443, + "step": 31889 + }, + { + "epoch": 0.41439600547720024, + "grad_norm": 0.3920002281665802, + "learning_rate": 0.00011714475103653544, + "loss": 1.3216, + "step": 31890 + }, + { + "epoch": 0.41440900002111614, + "grad_norm": 0.3819471299648285, + "learning_rate": 0.00011714215157462405, + "loss": 1.1292, + "step": 31891 + }, + { + "epoch": 0.414421994565032, + "grad_norm": 0.3853062391281128, + "learning_rate": 0.00011713955211271268, + "loss": 1.3044, + "step": 31892 + }, + { + "epoch": 0.4144349891089479, + "grad_norm": 0.3963586986064911, + "learning_rate": 0.00011713695265080129, + "loss": 1.437, + "step": 31893 + }, + { + "epoch": 0.4144479836528637, + "grad_norm": 0.4465895891189575, + "learning_rate": 0.0001171343531888899, + "loss": 1.4627, + "step": 31894 + }, + { + "epoch": 0.41446097819677963, + "grad_norm": 0.4670734405517578, + "learning_rate": 0.00011713175372697851, + "loss": 1.357, + "step": 31895 + }, + { + "epoch": 0.4144739727406955, + "grad_norm": 0.40060892701148987, + "learning_rate": 0.00011712915426506715, + "loss": 1.3375, + "step": 31896 + }, + { + "epoch": 0.4144869672846114, + "grad_norm": 0.4602660834789276, + "learning_rate": 0.00011712655480315576, + "loss": 1.4134, + "step": 31897 + }, + { + "epoch": 0.4144999618285273, + "grad_norm": 0.41173529624938965, + "learning_rate": 0.00011712395534124437, + "loss": 1.2859, + "step": 31898 + }, + { + "epoch": 0.4145129563724431, + "grad_norm": 0.27473360300064087, + "learning_rate": 0.000117121355879333, + "loss": 1.2874, + "step": 31899 + }, + { + "epoch": 0.414525950916359, + "grad_norm": 0.3998537063598633, + "learning_rate": 0.0001171187564174216, + "loss": 1.5878, + "step": 31900 + }, + { + "epoch": 0.41453894546027487, + "grad_norm": 0.308927983045578, + "learning_rate": 0.00011711615695551022, + "loss": 1.2909, + "step": 31901 + }, + { + "epoch": 0.41455194000419077, + "grad_norm": 0.44094279408454895, + "learning_rate": 0.00011711355749359883, + "loss": 1.5492, + "step": 31902 + }, + { + "epoch": 0.4145649345481066, + "grad_norm": 0.3875715136528015, + "learning_rate": 0.00011711095803168745, + "loss": 1.5206, + "step": 31903 + }, + { + "epoch": 0.4145779290920225, + "grad_norm": 0.38723552227020264, + "learning_rate": 0.00011710835856977606, + "loss": 1.2304, + "step": 31904 + }, + { + "epoch": 0.41459092363593836, + "grad_norm": 0.4022468626499176, + "learning_rate": 0.00011710575910786467, + "loss": 1.4756, + "step": 31905 + }, + { + "epoch": 0.41460391817985426, + "grad_norm": 0.4944455325603485, + "learning_rate": 0.00011710315964595328, + "loss": 1.4196, + "step": 31906 + }, + { + "epoch": 0.4146169127237701, + "grad_norm": 0.3947686553001404, + "learning_rate": 0.00011710056018404192, + "loss": 1.4881, + "step": 31907 + }, + { + "epoch": 0.414629907267686, + "grad_norm": 0.33791208267211914, + "learning_rate": 0.00011709796072213053, + "loss": 1.2132, + "step": 31908 + }, + { + "epoch": 0.41464290181160185, + "grad_norm": 0.4213022291660309, + "learning_rate": 0.00011709536126021914, + "loss": 1.4172, + "step": 31909 + }, + { + "epoch": 0.41465589635551775, + "grad_norm": 0.3720908761024475, + "learning_rate": 0.00011709276179830775, + "loss": 1.3545, + "step": 31910 + }, + { + "epoch": 0.4146688908994336, + "grad_norm": 0.3788302540779114, + "learning_rate": 0.00011709016233639638, + "loss": 1.3457, + "step": 31911 + }, + { + "epoch": 0.4146818854433495, + "grad_norm": 0.34555670619010925, + "learning_rate": 0.00011708756287448499, + "loss": 1.1321, + "step": 31912 + }, + { + "epoch": 0.41469487998726534, + "grad_norm": 0.4369875192642212, + "learning_rate": 0.0001170849634125736, + "loss": 1.6589, + "step": 31913 + }, + { + "epoch": 0.41470787453118124, + "grad_norm": 0.4749024212360382, + "learning_rate": 0.00011708236395066221, + "loss": 1.477, + "step": 31914 + }, + { + "epoch": 0.4147208690750971, + "grad_norm": 0.46750178933143616, + "learning_rate": 0.00011707976448875084, + "loss": 1.3789, + "step": 31915 + }, + { + "epoch": 0.414733863619013, + "grad_norm": 0.3610598146915436, + "learning_rate": 0.00011707716502683945, + "loss": 1.273, + "step": 31916 + }, + { + "epoch": 0.41474685816292883, + "grad_norm": 0.4847123920917511, + "learning_rate": 0.00011707456556492806, + "loss": 1.4229, + "step": 31917 + }, + { + "epoch": 0.41475985270684473, + "grad_norm": 0.43790897727012634, + "learning_rate": 0.00011707196610301667, + "loss": 1.3437, + "step": 31918 + }, + { + "epoch": 0.4147728472507606, + "grad_norm": 0.4149693548679352, + "learning_rate": 0.00011706936664110531, + "loss": 1.4696, + "step": 31919 + }, + { + "epoch": 0.4147858417946765, + "grad_norm": 0.48989924788475037, + "learning_rate": 0.00011706676717919392, + "loss": 1.5989, + "step": 31920 + }, + { + "epoch": 0.4147988363385923, + "grad_norm": 0.4579598009586334, + "learning_rate": 0.00011706416771728253, + "loss": 1.5312, + "step": 31921 + }, + { + "epoch": 0.4148118308825082, + "grad_norm": 0.47385668754577637, + "learning_rate": 0.00011706156825537114, + "loss": 1.4213, + "step": 31922 + }, + { + "epoch": 0.41482482542642407, + "grad_norm": 0.42061296105384827, + "learning_rate": 0.00011705896879345976, + "loss": 1.5059, + "step": 31923 + }, + { + "epoch": 0.41483781997033997, + "grad_norm": 0.40995970368385315, + "learning_rate": 0.00011705636933154838, + "loss": 1.4611, + "step": 31924 + }, + { + "epoch": 0.4148508145142558, + "grad_norm": 0.38926807045936584, + "learning_rate": 0.00011705376986963699, + "loss": 1.5426, + "step": 31925 + }, + { + "epoch": 0.4148638090581717, + "grad_norm": 0.3989414572715759, + "learning_rate": 0.0001170511704077256, + "loss": 1.5748, + "step": 31926 + }, + { + "epoch": 0.41487680360208756, + "grad_norm": 0.3369590938091278, + "learning_rate": 0.00011704857094581424, + "loss": 1.3904, + "step": 31927 + }, + { + "epoch": 0.41488979814600346, + "grad_norm": 0.4963065981864929, + "learning_rate": 0.00011704597148390283, + "loss": 1.5211, + "step": 31928 + }, + { + "epoch": 0.4149027926899193, + "grad_norm": 0.5032174587249756, + "learning_rate": 0.00011704337202199144, + "loss": 1.6569, + "step": 31929 + }, + { + "epoch": 0.4149157872338352, + "grad_norm": 0.4077114760875702, + "learning_rate": 0.00011704077256008005, + "loss": 1.2416, + "step": 31930 + }, + { + "epoch": 0.41492878177775105, + "grad_norm": 0.4691038131713867, + "learning_rate": 0.00011703817309816869, + "loss": 1.4702, + "step": 31931 + }, + { + "epoch": 0.41494177632166696, + "grad_norm": 0.37105709314346313, + "learning_rate": 0.0001170355736362573, + "loss": 1.4023, + "step": 31932 + }, + { + "epoch": 0.4149547708655828, + "grad_norm": 0.4176577031612396, + "learning_rate": 0.00011703297417434591, + "loss": 1.3434, + "step": 31933 + }, + { + "epoch": 0.4149677654094987, + "grad_norm": 0.35434553027153015, + "learning_rate": 0.00011703037471243453, + "loss": 1.3154, + "step": 31934 + }, + { + "epoch": 0.41498075995341455, + "grad_norm": 0.44269946217536926, + "learning_rate": 0.00011702777525052315, + "loss": 1.5711, + "step": 31935 + }, + { + "epoch": 0.41499375449733045, + "grad_norm": 0.4185575544834137, + "learning_rate": 0.00011702517578861176, + "loss": 1.3754, + "step": 31936 + }, + { + "epoch": 0.4150067490412463, + "grad_norm": 0.4493005573749542, + "learning_rate": 0.00011702257632670037, + "loss": 1.4258, + "step": 31937 + }, + { + "epoch": 0.4150197435851622, + "grad_norm": 0.3071945309638977, + "learning_rate": 0.00011701997686478901, + "loss": 1.5116, + "step": 31938 + }, + { + "epoch": 0.41503273812907804, + "grad_norm": 0.43161505460739136, + "learning_rate": 0.00011701737740287762, + "loss": 1.4191, + "step": 31939 + }, + { + "epoch": 0.41504573267299394, + "grad_norm": 0.39987698197364807, + "learning_rate": 0.00011701477794096623, + "loss": 1.3056, + "step": 31940 + }, + { + "epoch": 0.4150587272169098, + "grad_norm": 0.3714471161365509, + "learning_rate": 0.00011701217847905483, + "loss": 1.4079, + "step": 31941 + }, + { + "epoch": 0.4150717217608257, + "grad_norm": 0.31806376576423645, + "learning_rate": 0.00011700957901714347, + "loss": 1.5493, + "step": 31942 + }, + { + "epoch": 0.41508471630474153, + "grad_norm": 0.3202168345451355, + "learning_rate": 0.00011700697955523208, + "loss": 1.4257, + "step": 31943 + }, + { + "epoch": 0.41509771084865743, + "grad_norm": 0.39771831035614014, + "learning_rate": 0.00011700438009332069, + "loss": 1.2237, + "step": 31944 + }, + { + "epoch": 0.4151107053925733, + "grad_norm": 0.29544389247894287, + "learning_rate": 0.0001170017806314093, + "loss": 1.3502, + "step": 31945 + }, + { + "epoch": 0.4151236999364892, + "grad_norm": 0.39181992411613464, + "learning_rate": 0.00011699918116949792, + "loss": 1.4435, + "step": 31946 + }, + { + "epoch": 0.415136694480405, + "grad_norm": 0.5179322361946106, + "learning_rate": 0.00011699658170758654, + "loss": 1.3349, + "step": 31947 + }, + { + "epoch": 0.4151496890243209, + "grad_norm": 0.38667455315589905, + "learning_rate": 0.00011699398224567515, + "loss": 1.3788, + "step": 31948 + }, + { + "epoch": 0.41516268356823677, + "grad_norm": 0.41124311089515686, + "learning_rate": 0.00011699138278376376, + "loss": 1.4866, + "step": 31949 + }, + { + "epoch": 0.41517567811215267, + "grad_norm": 0.49463340640068054, + "learning_rate": 0.0001169887833218524, + "loss": 1.3833, + "step": 31950 + }, + { + "epoch": 0.4151886726560685, + "grad_norm": 0.3388766348361969, + "learning_rate": 0.000116986183859941, + "loss": 1.3869, + "step": 31951 + }, + { + "epoch": 0.4152016671999844, + "grad_norm": 0.426586776971817, + "learning_rate": 0.00011698358439802962, + "loss": 1.532, + "step": 31952 + }, + { + "epoch": 0.41521466174390026, + "grad_norm": 0.3509022295475006, + "learning_rate": 0.00011698098493611821, + "loss": 1.3991, + "step": 31953 + }, + { + "epoch": 0.41522765628781616, + "grad_norm": 0.33200231194496155, + "learning_rate": 0.00011697838547420685, + "loss": 1.5806, + "step": 31954 + }, + { + "epoch": 0.415240650831732, + "grad_norm": 0.3706192374229431, + "learning_rate": 0.00011697578601229546, + "loss": 1.4681, + "step": 31955 + }, + { + "epoch": 0.4152536453756479, + "grad_norm": 0.33666497468948364, + "learning_rate": 0.00011697318655038407, + "loss": 1.3989, + "step": 31956 + }, + { + "epoch": 0.41526663991956375, + "grad_norm": 0.5073790550231934, + "learning_rate": 0.00011697058708847269, + "loss": 1.3996, + "step": 31957 + }, + { + "epoch": 0.41527963446347965, + "grad_norm": 0.4137423634529114, + "learning_rate": 0.00011696798762656131, + "loss": 1.2645, + "step": 31958 + }, + { + "epoch": 0.4152926290073955, + "grad_norm": 0.32358792424201965, + "learning_rate": 0.00011696538816464992, + "loss": 1.3724, + "step": 31959 + }, + { + "epoch": 0.4153056235513114, + "grad_norm": 0.4449808597564697, + "learning_rate": 0.00011696278870273853, + "loss": 1.5268, + "step": 31960 + }, + { + "epoch": 0.41531861809522724, + "grad_norm": 0.43996110558509827, + "learning_rate": 0.00011696018924082714, + "loss": 1.4571, + "step": 31961 + }, + { + "epoch": 0.41533161263914314, + "grad_norm": 0.3297157883644104, + "learning_rate": 0.00011695758977891578, + "loss": 1.5195, + "step": 31962 + }, + { + "epoch": 0.415344607183059, + "grad_norm": 0.40892839431762695, + "learning_rate": 0.00011695499031700439, + "loss": 1.3716, + "step": 31963 + }, + { + "epoch": 0.4153576017269749, + "grad_norm": 0.3498488664627075, + "learning_rate": 0.000116952390855093, + "loss": 1.2525, + "step": 31964 + }, + { + "epoch": 0.41537059627089074, + "grad_norm": 0.3851849436759949, + "learning_rate": 0.00011694979139318161, + "loss": 1.4043, + "step": 31965 + }, + { + "epoch": 0.41538359081480664, + "grad_norm": 0.4749259054660797, + "learning_rate": 0.00011694719193127024, + "loss": 1.3713, + "step": 31966 + }, + { + "epoch": 0.4153965853587225, + "grad_norm": 0.43708065152168274, + "learning_rate": 0.00011694459246935885, + "loss": 1.412, + "step": 31967 + }, + { + "epoch": 0.4154095799026384, + "grad_norm": 0.4439527988433838, + "learning_rate": 0.00011694199300744746, + "loss": 1.4843, + "step": 31968 + }, + { + "epoch": 0.4154225744465542, + "grad_norm": 0.31829768419265747, + "learning_rate": 0.00011693939354553607, + "loss": 1.5222, + "step": 31969 + }, + { + "epoch": 0.4154355689904701, + "grad_norm": 0.38333284854888916, + "learning_rate": 0.0001169367940836247, + "loss": 1.5597, + "step": 31970 + }, + { + "epoch": 0.415448563534386, + "grad_norm": 0.5027971863746643, + "learning_rate": 0.0001169341946217133, + "loss": 1.5013, + "step": 31971 + }, + { + "epoch": 0.4154615580783019, + "grad_norm": 0.4251062572002411, + "learning_rate": 0.00011693159515980192, + "loss": 1.4224, + "step": 31972 + }, + { + "epoch": 0.4154745526222177, + "grad_norm": 0.46308383345603943, + "learning_rate": 0.00011692899569789055, + "loss": 1.4266, + "step": 31973 + }, + { + "epoch": 0.4154875471661336, + "grad_norm": 0.4564632177352905, + "learning_rate": 0.00011692639623597917, + "loss": 1.3427, + "step": 31974 + }, + { + "epoch": 0.4155005417100495, + "grad_norm": 0.4805711805820465, + "learning_rate": 0.00011692379677406778, + "loss": 1.46, + "step": 31975 + }, + { + "epoch": 0.41551353625396537, + "grad_norm": 0.4796523153781891, + "learning_rate": 0.00011692119731215639, + "loss": 1.4555, + "step": 31976 + }, + { + "epoch": 0.41552653079788127, + "grad_norm": 0.4168841242790222, + "learning_rate": 0.00011691859785024501, + "loss": 1.3406, + "step": 31977 + }, + { + "epoch": 0.4155395253417971, + "grad_norm": 0.3594214618206024, + "learning_rate": 0.00011691599838833362, + "loss": 1.2304, + "step": 31978 + }, + { + "epoch": 0.415552519885713, + "grad_norm": 0.3321438431739807, + "learning_rate": 0.00011691339892642223, + "loss": 1.2159, + "step": 31979 + }, + { + "epoch": 0.41556551442962886, + "grad_norm": 0.4258446991443634, + "learning_rate": 0.00011691079946451084, + "loss": 1.301, + "step": 31980 + }, + { + "epoch": 0.41557850897354476, + "grad_norm": 0.3417115807533264, + "learning_rate": 0.00011690820000259948, + "loss": 1.4174, + "step": 31981 + }, + { + "epoch": 0.4155915035174606, + "grad_norm": 0.47925522923469543, + "learning_rate": 0.0001169056005406881, + "loss": 1.4056, + "step": 31982 + }, + { + "epoch": 0.4156044980613765, + "grad_norm": 0.38244304060935974, + "learning_rate": 0.00011690300107877669, + "loss": 1.3481, + "step": 31983 + }, + { + "epoch": 0.41561749260529235, + "grad_norm": 0.4504685699939728, + "learning_rate": 0.0001169004016168653, + "loss": 1.3532, + "step": 31984 + }, + { + "epoch": 0.41563048714920825, + "grad_norm": 0.39021608233451843, + "learning_rate": 0.00011689780215495394, + "loss": 1.41, + "step": 31985 + }, + { + "epoch": 0.4156434816931241, + "grad_norm": 0.3274133503437042, + "learning_rate": 0.00011689520269304255, + "loss": 1.3299, + "step": 31986 + }, + { + "epoch": 0.41565647623704, + "grad_norm": 0.37035664916038513, + "learning_rate": 0.00011689260323113116, + "loss": 1.384, + "step": 31987 + }, + { + "epoch": 0.41566947078095584, + "grad_norm": 0.4943562150001526, + "learning_rate": 0.00011689000376921977, + "loss": 1.2374, + "step": 31988 + }, + { + "epoch": 0.41568246532487174, + "grad_norm": 0.43165430426597595, + "learning_rate": 0.0001168874043073084, + "loss": 1.512, + "step": 31989 + }, + { + "epoch": 0.4156954598687876, + "grad_norm": 0.4384680390357971, + "learning_rate": 0.00011688480484539701, + "loss": 1.544, + "step": 31990 + }, + { + "epoch": 0.4157084544127035, + "grad_norm": 0.4445725679397583, + "learning_rate": 0.00011688220538348562, + "loss": 1.544, + "step": 31991 + }, + { + "epoch": 0.41572144895661933, + "grad_norm": 0.3301665186882019, + "learning_rate": 0.00011687960592157423, + "loss": 1.4456, + "step": 31992 + }, + { + "epoch": 0.41573444350053523, + "grad_norm": 0.3533686697483063, + "learning_rate": 0.00011687700645966287, + "loss": 1.2128, + "step": 31993 + }, + { + "epoch": 0.4157474380444511, + "grad_norm": 0.4108389914035797, + "learning_rate": 0.00011687440699775148, + "loss": 1.5507, + "step": 31994 + }, + { + "epoch": 0.415760432588367, + "grad_norm": 0.3706664443016052, + "learning_rate": 0.00011687180753584008, + "loss": 1.474, + "step": 31995 + }, + { + "epoch": 0.4157734271322828, + "grad_norm": 0.4060254693031311, + "learning_rate": 0.00011686920807392869, + "loss": 1.5062, + "step": 31996 + }, + { + "epoch": 0.4157864216761987, + "grad_norm": 0.43567201495170593, + "learning_rate": 0.00011686660861201733, + "loss": 1.3666, + "step": 31997 + }, + { + "epoch": 0.41579941622011457, + "grad_norm": 0.47659802436828613, + "learning_rate": 0.00011686400915010594, + "loss": 1.4337, + "step": 31998 + }, + { + "epoch": 0.41581241076403047, + "grad_norm": 0.4370724558830261, + "learning_rate": 0.00011686140968819455, + "loss": 1.1913, + "step": 31999 + }, + { + "epoch": 0.4158254053079463, + "grad_norm": 0.3840669095516205, + "learning_rate": 0.00011685881022628316, + "loss": 1.2953, + "step": 32000 + }, + { + "epoch": 0.4158383998518622, + "grad_norm": 0.42917683720588684, + "learning_rate": 0.00011685621076437178, + "loss": 1.463, + "step": 32001 + }, + { + "epoch": 0.41585139439577806, + "grad_norm": 0.2843080759048462, + "learning_rate": 0.0001168536113024604, + "loss": 1.3629, + "step": 32002 + }, + { + "epoch": 0.41586438893969396, + "grad_norm": 0.3443150818347931, + "learning_rate": 0.000116851011840549, + "loss": 1.2903, + "step": 32003 + }, + { + "epoch": 0.4158773834836098, + "grad_norm": 0.32550954818725586, + "learning_rate": 0.00011684841237863762, + "loss": 1.3704, + "step": 32004 + }, + { + "epoch": 0.4158903780275257, + "grad_norm": 0.47192445397377014, + "learning_rate": 0.00011684581291672625, + "loss": 1.4963, + "step": 32005 + }, + { + "epoch": 0.41590337257144155, + "grad_norm": 0.3771071135997772, + "learning_rate": 0.00011684321345481486, + "loss": 1.4452, + "step": 32006 + }, + { + "epoch": 0.41591636711535745, + "grad_norm": 0.36138543486595154, + "learning_rate": 0.00011684061399290348, + "loss": 1.3879, + "step": 32007 + }, + { + "epoch": 0.4159293616592733, + "grad_norm": 0.40519770979881287, + "learning_rate": 0.00011683801453099207, + "loss": 1.3834, + "step": 32008 + }, + { + "epoch": 0.4159423562031892, + "grad_norm": 0.4360518753528595, + "learning_rate": 0.00011683541506908071, + "loss": 1.4514, + "step": 32009 + }, + { + "epoch": 0.41595535074710505, + "grad_norm": 0.30168673396110535, + "learning_rate": 0.00011683281560716932, + "loss": 1.4035, + "step": 32010 + }, + { + "epoch": 0.41596834529102095, + "grad_norm": 0.43757927417755127, + "learning_rate": 0.00011683021614525793, + "loss": 1.4893, + "step": 32011 + }, + { + "epoch": 0.4159813398349368, + "grad_norm": 0.3698733448982239, + "learning_rate": 0.00011682761668334656, + "loss": 1.2706, + "step": 32012 + }, + { + "epoch": 0.4159943343788527, + "grad_norm": 0.3862118422985077, + "learning_rate": 0.00011682501722143517, + "loss": 1.5849, + "step": 32013 + }, + { + "epoch": 0.41600732892276854, + "grad_norm": 0.48158594965934753, + "learning_rate": 0.00011682241775952378, + "loss": 1.7763, + "step": 32014 + }, + { + "epoch": 0.41602032346668444, + "grad_norm": 0.6137977838516235, + "learning_rate": 0.00011681981829761239, + "loss": 1.5746, + "step": 32015 + }, + { + "epoch": 0.4160333180106003, + "grad_norm": 0.37493470311164856, + "learning_rate": 0.00011681721883570103, + "loss": 1.4512, + "step": 32016 + }, + { + "epoch": 0.4160463125545162, + "grad_norm": 0.43486329913139343, + "learning_rate": 0.00011681461937378964, + "loss": 1.2644, + "step": 32017 + }, + { + "epoch": 0.41605930709843203, + "grad_norm": 0.3970802426338196, + "learning_rate": 0.00011681201991187825, + "loss": 1.2633, + "step": 32018 + }, + { + "epoch": 0.41607230164234793, + "grad_norm": 0.48810404539108276, + "learning_rate": 0.00011680942044996686, + "loss": 1.4063, + "step": 32019 + }, + { + "epoch": 0.4160852961862638, + "grad_norm": 0.3442177176475525, + "learning_rate": 0.00011680682098805549, + "loss": 1.3658, + "step": 32020 + }, + { + "epoch": 0.4160982907301797, + "grad_norm": 0.44915226101875305, + "learning_rate": 0.0001168042215261441, + "loss": 1.386, + "step": 32021 + }, + { + "epoch": 0.4161112852740955, + "grad_norm": 0.38550862669944763, + "learning_rate": 0.00011680162206423271, + "loss": 1.3968, + "step": 32022 + }, + { + "epoch": 0.4161242798180114, + "grad_norm": 0.38212844729423523, + "learning_rate": 0.00011679902260232132, + "loss": 1.4965, + "step": 32023 + }, + { + "epoch": 0.41613727436192727, + "grad_norm": 0.35502326488494873, + "learning_rate": 0.00011679642314040996, + "loss": 1.3074, + "step": 32024 + }, + { + "epoch": 0.41615026890584317, + "grad_norm": 0.43055737018585205, + "learning_rate": 0.00011679382367849855, + "loss": 1.2291, + "step": 32025 + }, + { + "epoch": 0.416163263449759, + "grad_norm": 0.34200364351272583, + "learning_rate": 0.00011679122421658716, + "loss": 1.4356, + "step": 32026 + }, + { + "epoch": 0.4161762579936749, + "grad_norm": 0.42000001668930054, + "learning_rate": 0.00011678862475467578, + "loss": 1.389, + "step": 32027 + }, + { + "epoch": 0.41618925253759076, + "grad_norm": 0.3771750032901764, + "learning_rate": 0.00011678602529276441, + "loss": 1.2203, + "step": 32028 + }, + { + "epoch": 0.41620224708150666, + "grad_norm": 0.42215681076049805, + "learning_rate": 0.00011678342583085302, + "loss": 1.4547, + "step": 32029 + }, + { + "epoch": 0.4162152416254225, + "grad_norm": 0.4950105845928192, + "learning_rate": 0.00011678082636894164, + "loss": 1.3919, + "step": 32030 + }, + { + "epoch": 0.4162282361693384, + "grad_norm": 0.33268195390701294, + "learning_rate": 0.00011677822690703025, + "loss": 1.4816, + "step": 32031 + }, + { + "epoch": 0.41624123071325425, + "grad_norm": 0.3720681369304657, + "learning_rate": 0.00011677562744511887, + "loss": 1.4295, + "step": 32032 + }, + { + "epoch": 0.41625422525717015, + "grad_norm": 0.4514576494693756, + "learning_rate": 0.00011677302798320748, + "loss": 1.4904, + "step": 32033 + }, + { + "epoch": 0.416267219801086, + "grad_norm": 0.3868033289909363, + "learning_rate": 0.00011677042852129609, + "loss": 1.3003, + "step": 32034 + }, + { + "epoch": 0.4162802143450019, + "grad_norm": 0.4159703850746155, + "learning_rate": 0.0001167678290593847, + "loss": 1.3896, + "step": 32035 + }, + { + "epoch": 0.41629320888891774, + "grad_norm": 0.4479852020740509, + "learning_rate": 0.00011676522959747334, + "loss": 1.5902, + "step": 32036 + }, + { + "epoch": 0.41630620343283364, + "grad_norm": 0.4594501852989197, + "learning_rate": 0.00011676263013556194, + "loss": 1.3688, + "step": 32037 + }, + { + "epoch": 0.4163191979767495, + "grad_norm": 0.371725857257843, + "learning_rate": 0.00011676003067365055, + "loss": 1.2638, + "step": 32038 + }, + { + "epoch": 0.4163321925206654, + "grad_norm": 0.3222507834434509, + "learning_rate": 0.00011675743121173916, + "loss": 1.2595, + "step": 32039 + }, + { + "epoch": 0.41634518706458123, + "grad_norm": 0.3404005467891693, + "learning_rate": 0.0001167548317498278, + "loss": 1.4802, + "step": 32040 + }, + { + "epoch": 0.41635818160849714, + "grad_norm": 0.38105377554893494, + "learning_rate": 0.00011675223228791641, + "loss": 1.3554, + "step": 32041 + }, + { + "epoch": 0.416371176152413, + "grad_norm": 0.4509718418121338, + "learning_rate": 0.00011674963282600502, + "loss": 1.328, + "step": 32042 + }, + { + "epoch": 0.4163841706963289, + "grad_norm": 0.4476536512374878, + "learning_rate": 0.00011674703336409363, + "loss": 1.3025, + "step": 32043 + }, + { + "epoch": 0.4163971652402447, + "grad_norm": 0.34343191981315613, + "learning_rate": 0.00011674443390218226, + "loss": 1.3024, + "step": 32044 + }, + { + "epoch": 0.4164101597841606, + "grad_norm": 0.4132869839668274, + "learning_rate": 0.00011674183444027087, + "loss": 1.4322, + "step": 32045 + }, + { + "epoch": 0.4164231543280765, + "grad_norm": 0.42127904295921326, + "learning_rate": 0.00011673923497835948, + "loss": 1.2302, + "step": 32046 + }, + { + "epoch": 0.4164361488719924, + "grad_norm": 0.305693119764328, + "learning_rate": 0.00011673663551644812, + "loss": 1.3637, + "step": 32047 + }, + { + "epoch": 0.4164491434159082, + "grad_norm": 0.4029181897640228, + "learning_rate": 0.00011673403605453673, + "loss": 1.4786, + "step": 32048 + }, + { + "epoch": 0.4164621379598241, + "grad_norm": 0.3723565340042114, + "learning_rate": 0.00011673143659262534, + "loss": 1.3932, + "step": 32049 + }, + { + "epoch": 0.41647513250374, + "grad_norm": 0.4205729067325592, + "learning_rate": 0.00011672883713071394, + "loss": 1.3722, + "step": 32050 + }, + { + "epoch": 0.41648812704765587, + "grad_norm": 0.5118665099143982, + "learning_rate": 0.00011672623766880257, + "loss": 1.3516, + "step": 32051 + }, + { + "epoch": 0.41650112159157177, + "grad_norm": 0.3601110279560089, + "learning_rate": 0.00011672363820689118, + "loss": 1.2533, + "step": 32052 + }, + { + "epoch": 0.4165141161354876, + "grad_norm": 0.40130096673965454, + "learning_rate": 0.0001167210387449798, + "loss": 1.2874, + "step": 32053 + }, + { + "epoch": 0.4165271106794035, + "grad_norm": 0.47753527760505676, + "learning_rate": 0.0001167184392830684, + "loss": 1.4642, + "step": 32054 + }, + { + "epoch": 0.41654010522331936, + "grad_norm": 0.36688604950904846, + "learning_rate": 0.00011671583982115703, + "loss": 1.454, + "step": 32055 + }, + { + "epoch": 0.41655309976723526, + "grad_norm": 0.4216481149196625, + "learning_rate": 0.00011671324035924564, + "loss": 1.4542, + "step": 32056 + }, + { + "epoch": 0.4165660943111511, + "grad_norm": 0.38279709219932556, + "learning_rate": 0.00011671064089733425, + "loss": 1.3379, + "step": 32057 + }, + { + "epoch": 0.416579088855067, + "grad_norm": 0.4438285231590271, + "learning_rate": 0.00011670804143542286, + "loss": 1.4319, + "step": 32058 + }, + { + "epoch": 0.41659208339898285, + "grad_norm": 0.3428190052509308, + "learning_rate": 0.0001167054419735115, + "loss": 1.2587, + "step": 32059 + }, + { + "epoch": 0.41660507794289875, + "grad_norm": 0.36510488390922546, + "learning_rate": 0.00011670284251160011, + "loss": 1.2772, + "step": 32060 + }, + { + "epoch": 0.4166180724868146, + "grad_norm": 0.424780935049057, + "learning_rate": 0.00011670024304968872, + "loss": 1.3283, + "step": 32061 + }, + { + "epoch": 0.4166310670307305, + "grad_norm": 0.46007877588272095, + "learning_rate": 0.00011669764358777733, + "loss": 1.3853, + "step": 32062 + }, + { + "epoch": 0.41664406157464634, + "grad_norm": 0.4064677953720093, + "learning_rate": 0.00011669504412586596, + "loss": 1.3263, + "step": 32063 + }, + { + "epoch": 0.41665705611856224, + "grad_norm": 0.40020063519477844, + "learning_rate": 0.00011669244466395457, + "loss": 1.5091, + "step": 32064 + }, + { + "epoch": 0.4166700506624781, + "grad_norm": 0.36137667298316956, + "learning_rate": 0.00011668984520204318, + "loss": 1.1833, + "step": 32065 + }, + { + "epoch": 0.416683045206394, + "grad_norm": 0.32115164399147034, + "learning_rate": 0.00011668724574013179, + "loss": 1.3577, + "step": 32066 + }, + { + "epoch": 0.41669603975030983, + "grad_norm": 0.33294054865837097, + "learning_rate": 0.00011668464627822042, + "loss": 1.2544, + "step": 32067 + }, + { + "epoch": 0.41670903429422573, + "grad_norm": 0.39383426308631897, + "learning_rate": 0.00011668204681630903, + "loss": 1.24, + "step": 32068 + }, + { + "epoch": 0.4167220288381416, + "grad_norm": 0.40805667638778687, + "learning_rate": 0.00011667944735439764, + "loss": 1.5659, + "step": 32069 + }, + { + "epoch": 0.4167350233820575, + "grad_norm": 0.36091023683547974, + "learning_rate": 0.00011667684789248625, + "loss": 1.2805, + "step": 32070 + }, + { + "epoch": 0.4167480179259733, + "grad_norm": 0.34582990407943726, + "learning_rate": 0.00011667424843057489, + "loss": 1.4041, + "step": 32071 + }, + { + "epoch": 0.4167610124698892, + "grad_norm": 0.34655246138572693, + "learning_rate": 0.0001166716489686635, + "loss": 1.4454, + "step": 32072 + }, + { + "epoch": 0.41677400701380507, + "grad_norm": 0.45380038022994995, + "learning_rate": 0.00011666904950675211, + "loss": 1.4222, + "step": 32073 + }, + { + "epoch": 0.41678700155772097, + "grad_norm": 0.4173750579357147, + "learning_rate": 0.00011666645004484072, + "loss": 1.5818, + "step": 32074 + }, + { + "epoch": 0.4167999961016368, + "grad_norm": 0.3961604833602905, + "learning_rate": 0.00011666385058292934, + "loss": 1.3394, + "step": 32075 + }, + { + "epoch": 0.4168129906455527, + "grad_norm": 0.3669795095920563, + "learning_rate": 0.00011666125112101796, + "loss": 1.2676, + "step": 32076 + }, + { + "epoch": 0.41682598518946856, + "grad_norm": 0.41534513235092163, + "learning_rate": 0.00011665865165910657, + "loss": 1.2664, + "step": 32077 + }, + { + "epoch": 0.41683897973338446, + "grad_norm": 0.3199285566806793, + "learning_rate": 0.00011665605219719518, + "loss": 1.2609, + "step": 32078 + }, + { + "epoch": 0.4168519742773003, + "grad_norm": 0.5193150043487549, + "learning_rate": 0.0001166534527352838, + "loss": 1.3395, + "step": 32079 + }, + { + "epoch": 0.4168649688212162, + "grad_norm": 0.26878538727760315, + "learning_rate": 0.00011665085327337241, + "loss": 1.1751, + "step": 32080 + }, + { + "epoch": 0.41687796336513205, + "grad_norm": 0.3845578730106354, + "learning_rate": 0.00011664825381146102, + "loss": 1.4235, + "step": 32081 + }, + { + "epoch": 0.41689095790904795, + "grad_norm": 0.5733974575996399, + "learning_rate": 0.00011664565434954963, + "loss": 1.3856, + "step": 32082 + }, + { + "epoch": 0.4169039524529638, + "grad_norm": 0.4119655191898346, + "learning_rate": 0.00011664305488763827, + "loss": 1.5462, + "step": 32083 + }, + { + "epoch": 0.4169169469968797, + "grad_norm": 0.2903675436973572, + "learning_rate": 0.00011664045542572688, + "loss": 1.3881, + "step": 32084 + }, + { + "epoch": 0.41692994154079555, + "grad_norm": 0.46879932284355164, + "learning_rate": 0.0001166378559638155, + "loss": 1.492, + "step": 32085 + }, + { + "epoch": 0.41694293608471145, + "grad_norm": 0.38961219787597656, + "learning_rate": 0.00011663525650190412, + "loss": 1.3829, + "step": 32086 + }, + { + "epoch": 0.4169559306286273, + "grad_norm": 0.45521771907806396, + "learning_rate": 0.00011663265703999273, + "loss": 1.2809, + "step": 32087 + }, + { + "epoch": 0.4169689251725432, + "grad_norm": 0.3626646399497986, + "learning_rate": 0.00011663005757808134, + "loss": 1.4247, + "step": 32088 + }, + { + "epoch": 0.41698191971645904, + "grad_norm": 0.36435046792030334, + "learning_rate": 0.00011662745811616995, + "loss": 1.4133, + "step": 32089 + }, + { + "epoch": 0.41699491426037494, + "grad_norm": 0.4388221502304077, + "learning_rate": 0.00011662485865425859, + "loss": 1.3028, + "step": 32090 + }, + { + "epoch": 0.4170079088042908, + "grad_norm": 0.37433111667633057, + "learning_rate": 0.0001166222591923472, + "loss": 1.3355, + "step": 32091 + }, + { + "epoch": 0.4170209033482067, + "grad_norm": 0.3647276759147644, + "learning_rate": 0.0001166196597304358, + "loss": 1.1414, + "step": 32092 + }, + { + "epoch": 0.41703389789212253, + "grad_norm": 0.4272243082523346, + "learning_rate": 0.00011661706026852441, + "loss": 1.4518, + "step": 32093 + }, + { + "epoch": 0.41704689243603843, + "grad_norm": 0.3637191355228424, + "learning_rate": 0.00011661446080661305, + "loss": 1.3407, + "step": 32094 + }, + { + "epoch": 0.4170598869799543, + "grad_norm": 0.40804797410964966, + "learning_rate": 0.00011661186134470166, + "loss": 1.3154, + "step": 32095 + }, + { + "epoch": 0.4170728815238702, + "grad_norm": 0.4135993421077728, + "learning_rate": 0.00011660926188279027, + "loss": 1.4536, + "step": 32096 + }, + { + "epoch": 0.417085876067786, + "grad_norm": 0.42042699456214905, + "learning_rate": 0.00011660666242087888, + "loss": 1.3422, + "step": 32097 + }, + { + "epoch": 0.4170988706117019, + "grad_norm": 0.40308523178100586, + "learning_rate": 0.0001166040629589675, + "loss": 1.5127, + "step": 32098 + }, + { + "epoch": 0.41711186515561777, + "grad_norm": 0.3179549276828766, + "learning_rate": 0.00011660146349705612, + "loss": 1.384, + "step": 32099 + }, + { + "epoch": 0.41712485969953367, + "grad_norm": 0.43993109464645386, + "learning_rate": 0.00011659886403514473, + "loss": 1.401, + "step": 32100 + }, + { + "epoch": 0.4171378542434495, + "grad_norm": 0.3192737400531769, + "learning_rate": 0.00011659626457323334, + "loss": 1.3574, + "step": 32101 + }, + { + "epoch": 0.4171508487873654, + "grad_norm": 0.4138120114803314, + "learning_rate": 0.00011659366511132197, + "loss": 1.5481, + "step": 32102 + }, + { + "epoch": 0.41716384333128126, + "grad_norm": 0.22258590161800385, + "learning_rate": 0.00011659106564941059, + "loss": 1.3111, + "step": 32103 + }, + { + "epoch": 0.41717683787519716, + "grad_norm": 0.36392152309417725, + "learning_rate": 0.0001165884661874992, + "loss": 1.4194, + "step": 32104 + }, + { + "epoch": 0.417189832419113, + "grad_norm": 0.3645169734954834, + "learning_rate": 0.0001165858667255878, + "loss": 1.3049, + "step": 32105 + }, + { + "epoch": 0.4172028269630289, + "grad_norm": 0.39033353328704834, + "learning_rate": 0.00011658326726367643, + "loss": 1.41, + "step": 32106 + }, + { + "epoch": 0.41721582150694475, + "grad_norm": 0.3834075927734375, + "learning_rate": 0.00011658066780176504, + "loss": 1.2408, + "step": 32107 + }, + { + "epoch": 0.41722881605086065, + "grad_norm": 0.34865954518318176, + "learning_rate": 0.00011657806833985365, + "loss": 1.2383, + "step": 32108 + }, + { + "epoch": 0.4172418105947765, + "grad_norm": 0.37171727418899536, + "learning_rate": 0.00011657546887794227, + "loss": 1.3279, + "step": 32109 + }, + { + "epoch": 0.4172548051386924, + "grad_norm": 0.434829980134964, + "learning_rate": 0.00011657286941603089, + "loss": 1.2527, + "step": 32110 + }, + { + "epoch": 0.41726779968260824, + "grad_norm": 0.4001743495464325, + "learning_rate": 0.0001165702699541195, + "loss": 1.2942, + "step": 32111 + }, + { + "epoch": 0.41728079422652414, + "grad_norm": 0.45036807656288147, + "learning_rate": 0.00011656767049220811, + "loss": 1.583, + "step": 32112 + }, + { + "epoch": 0.41729378877044, + "grad_norm": 0.4461669623851776, + "learning_rate": 0.00011656507103029672, + "loss": 1.274, + "step": 32113 + }, + { + "epoch": 0.4173067833143559, + "grad_norm": 0.38426461815834045, + "learning_rate": 0.00011656247156838536, + "loss": 1.3694, + "step": 32114 + }, + { + "epoch": 0.41731977785827173, + "grad_norm": 0.41188353300094604, + "learning_rate": 0.00011655987210647397, + "loss": 1.4398, + "step": 32115 + }, + { + "epoch": 0.41733277240218764, + "grad_norm": 0.4044858515262604, + "learning_rate": 0.00011655727264456258, + "loss": 1.4566, + "step": 32116 + }, + { + "epoch": 0.4173457669461035, + "grad_norm": 0.3475191593170166, + "learning_rate": 0.00011655467318265118, + "loss": 1.475, + "step": 32117 + }, + { + "epoch": 0.4173587614900194, + "grad_norm": 0.3644906282424927, + "learning_rate": 0.00011655207372073982, + "loss": 1.2814, + "step": 32118 + }, + { + "epoch": 0.4173717560339352, + "grad_norm": 0.3164217472076416, + "learning_rate": 0.00011654947425882843, + "loss": 1.2209, + "step": 32119 + }, + { + "epoch": 0.4173847505778511, + "grad_norm": 0.43380212783813477, + "learning_rate": 0.00011654687479691704, + "loss": 1.3547, + "step": 32120 + }, + { + "epoch": 0.41739774512176697, + "grad_norm": 0.3839067220687866, + "learning_rate": 0.00011654427533500566, + "loss": 1.2857, + "step": 32121 + }, + { + "epoch": 0.4174107396656829, + "grad_norm": 0.4403882622718811, + "learning_rate": 0.00011654167587309427, + "loss": 1.4929, + "step": 32122 + }, + { + "epoch": 0.4174237342095987, + "grad_norm": 0.4642385244369507, + "learning_rate": 0.00011653907641118289, + "loss": 1.4891, + "step": 32123 + }, + { + "epoch": 0.4174367287535146, + "grad_norm": 0.5472173094749451, + "learning_rate": 0.0001165364769492715, + "loss": 1.4842, + "step": 32124 + }, + { + "epoch": 0.41744972329743046, + "grad_norm": 0.4123762547969818, + "learning_rate": 0.00011653387748736013, + "loss": 1.6344, + "step": 32125 + }, + { + "epoch": 0.41746271784134636, + "grad_norm": 0.38550078868865967, + "learning_rate": 0.00011653127802544875, + "loss": 1.276, + "step": 32126 + }, + { + "epoch": 0.41747571238526227, + "grad_norm": 0.34991931915283203, + "learning_rate": 0.00011652867856353736, + "loss": 1.3246, + "step": 32127 + }, + { + "epoch": 0.4174887069291781, + "grad_norm": 0.2843521535396576, + "learning_rate": 0.00011652607910162597, + "loss": 1.3355, + "step": 32128 + }, + { + "epoch": 0.417501701473094, + "grad_norm": 0.49471455812454224, + "learning_rate": 0.00011652347963971459, + "loss": 1.3609, + "step": 32129 + }, + { + "epoch": 0.41751469601700986, + "grad_norm": 0.3524303734302521, + "learning_rate": 0.0001165208801778032, + "loss": 1.3714, + "step": 32130 + }, + { + "epoch": 0.41752769056092576, + "grad_norm": 0.40377113223075867, + "learning_rate": 0.00011651828071589181, + "loss": 1.2913, + "step": 32131 + }, + { + "epoch": 0.4175406851048416, + "grad_norm": 0.2524406313896179, + "learning_rate": 0.00011651568125398042, + "loss": 1.1992, + "step": 32132 + }, + { + "epoch": 0.4175536796487575, + "grad_norm": 0.3914828896522522, + "learning_rate": 0.00011651308179206906, + "loss": 1.2464, + "step": 32133 + }, + { + "epoch": 0.41756667419267335, + "grad_norm": 0.3798538148403168, + "learning_rate": 0.00011651048233015766, + "loss": 1.3539, + "step": 32134 + }, + { + "epoch": 0.41757966873658925, + "grad_norm": 0.517610490322113, + "learning_rate": 0.00011650788286824627, + "loss": 1.3179, + "step": 32135 + }, + { + "epoch": 0.4175926632805051, + "grad_norm": 0.4488769769668579, + "learning_rate": 0.00011650528340633488, + "loss": 1.4978, + "step": 32136 + }, + { + "epoch": 0.417605657824421, + "grad_norm": 0.3117702305316925, + "learning_rate": 0.00011650268394442352, + "loss": 1.2528, + "step": 32137 + }, + { + "epoch": 0.41761865236833684, + "grad_norm": 0.35263174772262573, + "learning_rate": 0.00011650008448251213, + "loss": 1.2663, + "step": 32138 + }, + { + "epoch": 0.41763164691225274, + "grad_norm": 0.40952765941619873, + "learning_rate": 0.00011649748502060074, + "loss": 1.5245, + "step": 32139 + }, + { + "epoch": 0.4176446414561686, + "grad_norm": 0.4192451238632202, + "learning_rate": 0.00011649488555868935, + "loss": 1.4788, + "step": 32140 + }, + { + "epoch": 0.4176576360000845, + "grad_norm": 0.4450977146625519, + "learning_rate": 0.00011649228609677798, + "loss": 1.4296, + "step": 32141 + }, + { + "epoch": 0.41767063054400033, + "grad_norm": 0.3687826097011566, + "learning_rate": 0.00011648968663486659, + "loss": 1.4947, + "step": 32142 + }, + { + "epoch": 0.41768362508791623, + "grad_norm": 0.43868663907051086, + "learning_rate": 0.0001164870871729552, + "loss": 1.4133, + "step": 32143 + }, + { + "epoch": 0.4176966196318321, + "grad_norm": 0.3895968496799469, + "learning_rate": 0.00011648448771104381, + "loss": 1.5238, + "step": 32144 + }, + { + "epoch": 0.417709614175748, + "grad_norm": 0.39096376299858093, + "learning_rate": 0.00011648188824913245, + "loss": 1.4127, + "step": 32145 + }, + { + "epoch": 0.4177226087196638, + "grad_norm": 0.43277764320373535, + "learning_rate": 0.00011647928878722106, + "loss": 1.4414, + "step": 32146 + }, + { + "epoch": 0.4177356032635797, + "grad_norm": 0.347722589969635, + "learning_rate": 0.00011647668932530966, + "loss": 1.2694, + "step": 32147 + }, + { + "epoch": 0.41774859780749557, + "grad_norm": 0.5539126396179199, + "learning_rate": 0.00011647408986339827, + "loss": 1.3442, + "step": 32148 + }, + { + "epoch": 0.41776159235141147, + "grad_norm": 0.39943423867225647, + "learning_rate": 0.0001164714904014869, + "loss": 1.4288, + "step": 32149 + }, + { + "epoch": 0.4177745868953273, + "grad_norm": 0.3799853026866913, + "learning_rate": 0.00011646889093957552, + "loss": 1.2454, + "step": 32150 + }, + { + "epoch": 0.4177875814392432, + "grad_norm": 0.31893599033355713, + "learning_rate": 0.00011646629147766413, + "loss": 1.471, + "step": 32151 + }, + { + "epoch": 0.41780057598315906, + "grad_norm": 0.3551245331764221, + "learning_rate": 0.00011646369201575274, + "loss": 1.2318, + "step": 32152 + }, + { + "epoch": 0.41781357052707496, + "grad_norm": 0.44132158160209656, + "learning_rate": 0.00011646109255384136, + "loss": 1.4269, + "step": 32153 + }, + { + "epoch": 0.4178265650709908, + "grad_norm": 0.36985716223716736, + "learning_rate": 0.00011645849309192997, + "loss": 1.5795, + "step": 32154 + }, + { + "epoch": 0.4178395596149067, + "grad_norm": 0.525563657283783, + "learning_rate": 0.00011645589363001858, + "loss": 1.5245, + "step": 32155 + }, + { + "epoch": 0.41785255415882255, + "grad_norm": 0.4491332471370697, + "learning_rate": 0.0001164532941681072, + "loss": 1.4541, + "step": 32156 + }, + { + "epoch": 0.41786554870273845, + "grad_norm": 0.42616525292396545, + "learning_rate": 0.00011645069470619583, + "loss": 1.5457, + "step": 32157 + }, + { + "epoch": 0.4178785432466543, + "grad_norm": 0.4395771026611328, + "learning_rate": 0.00011644809524428444, + "loss": 1.4163, + "step": 32158 + }, + { + "epoch": 0.4178915377905702, + "grad_norm": 0.3255276083946228, + "learning_rate": 0.00011644549578237304, + "loss": 1.3785, + "step": 32159 + }, + { + "epoch": 0.41790453233448605, + "grad_norm": 0.39209187030792236, + "learning_rate": 0.00011644289632046168, + "loss": 1.3643, + "step": 32160 + }, + { + "epoch": 0.41791752687840195, + "grad_norm": 0.40622061491012573, + "learning_rate": 0.00011644029685855029, + "loss": 1.4214, + "step": 32161 + }, + { + "epoch": 0.4179305214223178, + "grad_norm": 0.44107410311698914, + "learning_rate": 0.0001164376973966389, + "loss": 1.3739, + "step": 32162 + }, + { + "epoch": 0.4179435159662337, + "grad_norm": 0.33544695377349854, + "learning_rate": 0.00011643509793472751, + "loss": 1.3716, + "step": 32163 + }, + { + "epoch": 0.41795651051014954, + "grad_norm": 0.3974848687648773, + "learning_rate": 0.00011643249847281614, + "loss": 1.3629, + "step": 32164 + }, + { + "epoch": 0.41796950505406544, + "grad_norm": 0.37257468700408936, + "learning_rate": 0.00011642989901090475, + "loss": 1.3994, + "step": 32165 + }, + { + "epoch": 0.4179824995979813, + "grad_norm": 0.4503837823867798, + "learning_rate": 0.00011642729954899336, + "loss": 1.2988, + "step": 32166 + }, + { + "epoch": 0.4179954941418972, + "grad_norm": 0.440286248922348, + "learning_rate": 0.00011642470008708197, + "loss": 1.3539, + "step": 32167 + }, + { + "epoch": 0.41800848868581303, + "grad_norm": 0.48148369789123535, + "learning_rate": 0.00011642210062517061, + "loss": 1.4163, + "step": 32168 + }, + { + "epoch": 0.41802148322972893, + "grad_norm": 0.3875707983970642, + "learning_rate": 0.00011641950116325922, + "loss": 1.2131, + "step": 32169 + }, + { + "epoch": 0.4180344777736448, + "grad_norm": 0.4461034834384918, + "learning_rate": 0.00011641690170134783, + "loss": 1.479, + "step": 32170 + }, + { + "epoch": 0.4180474723175607, + "grad_norm": 0.3959667682647705, + "learning_rate": 0.00011641430223943644, + "loss": 1.3602, + "step": 32171 + }, + { + "epoch": 0.4180604668614765, + "grad_norm": 0.4443519711494446, + "learning_rate": 0.00011641170277752507, + "loss": 1.4481, + "step": 32172 + }, + { + "epoch": 0.4180734614053924, + "grad_norm": 0.40153220295906067, + "learning_rate": 0.00011640910331561368, + "loss": 1.4109, + "step": 32173 + }, + { + "epoch": 0.41808645594930827, + "grad_norm": 0.34715530276298523, + "learning_rate": 0.00011640650385370229, + "loss": 1.3425, + "step": 32174 + }, + { + "epoch": 0.41809945049322417, + "grad_norm": 0.42567458748817444, + "learning_rate": 0.0001164039043917909, + "loss": 1.3745, + "step": 32175 + }, + { + "epoch": 0.41811244503714, + "grad_norm": 0.33880865573883057, + "learning_rate": 0.00011640130492987952, + "loss": 1.356, + "step": 32176 + }, + { + "epoch": 0.4181254395810559, + "grad_norm": 0.40803760290145874, + "learning_rate": 0.00011639870546796813, + "loss": 1.4618, + "step": 32177 + }, + { + "epoch": 0.41813843412497176, + "grad_norm": 0.44775599241256714, + "learning_rate": 0.00011639610600605674, + "loss": 1.6087, + "step": 32178 + }, + { + "epoch": 0.41815142866888766, + "grad_norm": 0.29526832699775696, + "learning_rate": 0.00011639350654414536, + "loss": 1.1344, + "step": 32179 + }, + { + "epoch": 0.4181644232128035, + "grad_norm": 0.4349403977394104, + "learning_rate": 0.000116390907082234, + "loss": 1.3462, + "step": 32180 + }, + { + "epoch": 0.4181774177567194, + "grad_norm": 0.33959266543388367, + "learning_rate": 0.0001163883076203226, + "loss": 1.2271, + "step": 32181 + }, + { + "epoch": 0.41819041230063525, + "grad_norm": 0.4348263144493103, + "learning_rate": 0.00011638570815841122, + "loss": 1.438, + "step": 32182 + }, + { + "epoch": 0.41820340684455115, + "grad_norm": 0.3986891210079193, + "learning_rate": 0.00011638310869649983, + "loss": 1.4484, + "step": 32183 + }, + { + "epoch": 0.418216401388467, + "grad_norm": 0.4220660924911499, + "learning_rate": 0.00011638050923458845, + "loss": 1.456, + "step": 32184 + }, + { + "epoch": 0.4182293959323829, + "grad_norm": 0.36104515194892883, + "learning_rate": 0.00011637790977267706, + "loss": 1.4926, + "step": 32185 + }, + { + "epoch": 0.41824239047629874, + "grad_norm": 0.4250923693180084, + "learning_rate": 0.00011637531031076567, + "loss": 1.4356, + "step": 32186 + }, + { + "epoch": 0.41825538502021464, + "grad_norm": 0.3160364329814911, + "learning_rate": 0.00011637271084885428, + "loss": 1.2582, + "step": 32187 + }, + { + "epoch": 0.4182683795641305, + "grad_norm": 0.4519636332988739, + "learning_rate": 0.00011637011138694292, + "loss": 1.3902, + "step": 32188 + }, + { + "epoch": 0.4182813741080464, + "grad_norm": 0.3768227994441986, + "learning_rate": 0.00011636751192503152, + "loss": 1.4594, + "step": 32189 + }, + { + "epoch": 0.41829436865196223, + "grad_norm": 0.42983752489089966, + "learning_rate": 0.00011636491246312013, + "loss": 1.2179, + "step": 32190 + }, + { + "epoch": 0.41830736319587813, + "grad_norm": 0.4631871283054352, + "learning_rate": 0.00011636231300120874, + "loss": 1.4057, + "step": 32191 + }, + { + "epoch": 0.418320357739794, + "grad_norm": 0.486556738615036, + "learning_rate": 0.00011635971353929738, + "loss": 1.5015, + "step": 32192 + }, + { + "epoch": 0.4183333522837099, + "grad_norm": 0.4325386583805084, + "learning_rate": 0.00011635711407738599, + "loss": 1.4477, + "step": 32193 + }, + { + "epoch": 0.4183463468276257, + "grad_norm": 0.4420927166938782, + "learning_rate": 0.0001163545146154746, + "loss": 1.5671, + "step": 32194 + }, + { + "epoch": 0.4183593413715416, + "grad_norm": 0.45967647433280945, + "learning_rate": 0.00011635191515356323, + "loss": 1.4263, + "step": 32195 + }, + { + "epoch": 0.41837233591545747, + "grad_norm": 0.32406577467918396, + "learning_rate": 0.00011634931569165184, + "loss": 1.3456, + "step": 32196 + }, + { + "epoch": 0.4183853304593734, + "grad_norm": 0.43036600947380066, + "learning_rate": 0.00011634671622974045, + "loss": 1.3495, + "step": 32197 + }, + { + "epoch": 0.4183983250032892, + "grad_norm": 0.48746275901794434, + "learning_rate": 0.00011634411676782906, + "loss": 1.3075, + "step": 32198 + }, + { + "epoch": 0.4184113195472051, + "grad_norm": 0.36737489700317383, + "learning_rate": 0.0001163415173059177, + "loss": 1.4042, + "step": 32199 + }, + { + "epoch": 0.41842431409112096, + "grad_norm": 0.41635018587112427, + "learning_rate": 0.00011633891784400631, + "loss": 1.4237, + "step": 32200 + }, + { + "epoch": 0.41843730863503686, + "grad_norm": 0.3519030213356018, + "learning_rate": 0.0001163363183820949, + "loss": 1.2519, + "step": 32201 + }, + { + "epoch": 0.41845030317895276, + "grad_norm": 0.5549514889717102, + "learning_rate": 0.00011633371892018352, + "loss": 1.4754, + "step": 32202 + }, + { + "epoch": 0.4184632977228686, + "grad_norm": 0.4481637179851532, + "learning_rate": 0.00011633111945827215, + "loss": 1.3793, + "step": 32203 + }, + { + "epoch": 0.4184762922667845, + "grad_norm": 0.43877995014190674, + "learning_rate": 0.00011632851999636076, + "loss": 1.2521, + "step": 32204 + }, + { + "epoch": 0.41848928681070036, + "grad_norm": 0.4198109805583954, + "learning_rate": 0.00011632592053444938, + "loss": 1.2728, + "step": 32205 + }, + { + "epoch": 0.41850228135461626, + "grad_norm": 0.3588009178638458, + "learning_rate": 0.00011632332107253799, + "loss": 1.3339, + "step": 32206 + }, + { + "epoch": 0.4185152758985321, + "grad_norm": 0.33439791202545166, + "learning_rate": 0.00011632072161062661, + "loss": 1.2998, + "step": 32207 + }, + { + "epoch": 0.418528270442448, + "grad_norm": 0.40653684735298157, + "learning_rate": 0.00011631812214871522, + "loss": 1.4201, + "step": 32208 + }, + { + "epoch": 0.41854126498636385, + "grad_norm": 0.3330845236778259, + "learning_rate": 0.00011631552268680383, + "loss": 1.4003, + "step": 32209 + }, + { + "epoch": 0.41855425953027975, + "grad_norm": 0.3548836410045624, + "learning_rate": 0.00011631292322489244, + "loss": 1.3528, + "step": 32210 + }, + { + "epoch": 0.4185672540741956, + "grad_norm": 0.3680754601955414, + "learning_rate": 0.00011631032376298108, + "loss": 1.3234, + "step": 32211 + }, + { + "epoch": 0.4185802486181115, + "grad_norm": 0.3345813453197479, + "learning_rate": 0.00011630772430106969, + "loss": 1.386, + "step": 32212 + }, + { + "epoch": 0.41859324316202734, + "grad_norm": 0.374685674905777, + "learning_rate": 0.0001163051248391583, + "loss": 1.2733, + "step": 32213 + }, + { + "epoch": 0.41860623770594324, + "grad_norm": 0.46408379077911377, + "learning_rate": 0.0001163025253772469, + "loss": 1.3596, + "step": 32214 + }, + { + "epoch": 0.4186192322498591, + "grad_norm": 0.3770532011985779, + "learning_rate": 0.00011629992591533554, + "loss": 1.339, + "step": 32215 + }, + { + "epoch": 0.418632226793775, + "grad_norm": 0.38618651032447815, + "learning_rate": 0.00011629732645342415, + "loss": 1.4408, + "step": 32216 + }, + { + "epoch": 0.41864522133769083, + "grad_norm": 0.3468989133834839, + "learning_rate": 0.00011629472699151276, + "loss": 1.3171, + "step": 32217 + }, + { + "epoch": 0.41865821588160673, + "grad_norm": 0.37816137075424194, + "learning_rate": 0.00011629212752960137, + "loss": 1.5181, + "step": 32218 + }, + { + "epoch": 0.4186712104255226, + "grad_norm": 0.3315829634666443, + "learning_rate": 0.00011628952806769, + "loss": 1.3443, + "step": 32219 + }, + { + "epoch": 0.4186842049694385, + "grad_norm": 0.41978517174720764, + "learning_rate": 0.00011628692860577861, + "loss": 1.4834, + "step": 32220 + }, + { + "epoch": 0.4186971995133543, + "grad_norm": 0.4145573377609253, + "learning_rate": 0.00011628432914386722, + "loss": 1.6032, + "step": 32221 + }, + { + "epoch": 0.4187101940572702, + "grad_norm": 0.37847137451171875, + "learning_rate": 0.00011628172968195583, + "loss": 1.1296, + "step": 32222 + }, + { + "epoch": 0.41872318860118607, + "grad_norm": 0.5768716335296631, + "learning_rate": 0.00011627913022004447, + "loss": 1.3121, + "step": 32223 + }, + { + "epoch": 0.41873618314510197, + "grad_norm": 0.45342352986335754, + "learning_rate": 0.00011627653075813308, + "loss": 1.5283, + "step": 32224 + }, + { + "epoch": 0.4187491776890178, + "grad_norm": 0.4855320155620575, + "learning_rate": 0.00011627393129622169, + "loss": 1.5873, + "step": 32225 + }, + { + "epoch": 0.4187621722329337, + "grad_norm": 0.5236383080482483, + "learning_rate": 0.0001162713318343103, + "loss": 1.4403, + "step": 32226 + }, + { + "epoch": 0.41877516677684956, + "grad_norm": 0.38042962551116943, + "learning_rate": 0.00011626873237239892, + "loss": 1.3865, + "step": 32227 + }, + { + "epoch": 0.41878816132076546, + "grad_norm": 0.4319530129432678, + "learning_rate": 0.00011626613291048754, + "loss": 1.4187, + "step": 32228 + }, + { + "epoch": 0.4188011558646813, + "grad_norm": 0.35347607731819153, + "learning_rate": 0.00011626353344857615, + "loss": 1.2113, + "step": 32229 + }, + { + "epoch": 0.4188141504085972, + "grad_norm": 0.46501338481903076, + "learning_rate": 0.00011626093398666476, + "loss": 1.548, + "step": 32230 + }, + { + "epoch": 0.41882714495251305, + "grad_norm": 0.45698726177215576, + "learning_rate": 0.00011625833452475338, + "loss": 1.363, + "step": 32231 + }, + { + "epoch": 0.41884013949642895, + "grad_norm": 0.3451613187789917, + "learning_rate": 0.00011625573506284199, + "loss": 1.3086, + "step": 32232 + }, + { + "epoch": 0.4188531340403448, + "grad_norm": 0.4871053099632263, + "learning_rate": 0.0001162531356009306, + "loss": 1.454, + "step": 32233 + }, + { + "epoch": 0.4188661285842607, + "grad_norm": 0.2793738842010498, + "learning_rate": 0.00011625053613901924, + "loss": 1.183, + "step": 32234 + }, + { + "epoch": 0.41887912312817654, + "grad_norm": 0.3700677156448364, + "learning_rate": 0.00011624793667710785, + "loss": 1.2283, + "step": 32235 + }, + { + "epoch": 0.41889211767209245, + "grad_norm": 0.3881552517414093, + "learning_rate": 0.00011624533721519646, + "loss": 1.4366, + "step": 32236 + }, + { + "epoch": 0.4189051122160083, + "grad_norm": 0.5991576910018921, + "learning_rate": 0.00011624273775328507, + "loss": 1.2616, + "step": 32237 + }, + { + "epoch": 0.4189181067599242, + "grad_norm": 0.3525272607803345, + "learning_rate": 0.0001162401382913737, + "loss": 1.4306, + "step": 32238 + }, + { + "epoch": 0.41893110130384004, + "grad_norm": 0.3290645480155945, + "learning_rate": 0.00011623753882946231, + "loss": 1.2525, + "step": 32239 + }, + { + "epoch": 0.41894409584775594, + "grad_norm": 0.3410657048225403, + "learning_rate": 0.00011623493936755092, + "loss": 1.5184, + "step": 32240 + }, + { + "epoch": 0.4189570903916718, + "grad_norm": 0.2856634557247162, + "learning_rate": 0.00011623233990563953, + "loss": 1.3853, + "step": 32241 + }, + { + "epoch": 0.4189700849355877, + "grad_norm": 0.3366034924983978, + "learning_rate": 0.00011622974044372817, + "loss": 1.3233, + "step": 32242 + }, + { + "epoch": 0.41898307947950353, + "grad_norm": 0.47480490803718567, + "learning_rate": 0.00011622714098181677, + "loss": 1.4806, + "step": 32243 + }, + { + "epoch": 0.41899607402341943, + "grad_norm": 0.2718394994735718, + "learning_rate": 0.00011622454151990538, + "loss": 1.3126, + "step": 32244 + }, + { + "epoch": 0.4190090685673353, + "grad_norm": 0.37457048892974854, + "learning_rate": 0.00011622194205799399, + "loss": 1.35, + "step": 32245 + }, + { + "epoch": 0.4190220631112512, + "grad_norm": 0.5084694623947144, + "learning_rate": 0.00011621934259608263, + "loss": 1.366, + "step": 32246 + }, + { + "epoch": 0.419035057655167, + "grad_norm": 0.4383896291255951, + "learning_rate": 0.00011621674313417124, + "loss": 1.3434, + "step": 32247 + }, + { + "epoch": 0.4190480521990829, + "grad_norm": 0.423346608877182, + "learning_rate": 0.00011621414367225985, + "loss": 1.3151, + "step": 32248 + }, + { + "epoch": 0.41906104674299877, + "grad_norm": 0.333220511674881, + "learning_rate": 0.00011621154421034846, + "loss": 1.3636, + "step": 32249 + }, + { + "epoch": 0.41907404128691467, + "grad_norm": 0.4135946035385132, + "learning_rate": 0.00011620894474843708, + "loss": 1.4954, + "step": 32250 + }, + { + "epoch": 0.4190870358308305, + "grad_norm": 0.31489089131355286, + "learning_rate": 0.0001162063452865257, + "loss": 1.5427, + "step": 32251 + }, + { + "epoch": 0.4191000303747464, + "grad_norm": 0.474231094121933, + "learning_rate": 0.0001162037458246143, + "loss": 1.4955, + "step": 32252 + }, + { + "epoch": 0.41911302491866226, + "grad_norm": 0.4646984338760376, + "learning_rate": 0.00011620114636270292, + "loss": 1.4317, + "step": 32253 + }, + { + "epoch": 0.41912601946257816, + "grad_norm": 0.41386303305625916, + "learning_rate": 0.00011619854690079155, + "loss": 1.3944, + "step": 32254 + }, + { + "epoch": 0.419139014006494, + "grad_norm": 0.40486401319503784, + "learning_rate": 0.00011619594743888017, + "loss": 1.173, + "step": 32255 + }, + { + "epoch": 0.4191520085504099, + "grad_norm": 0.4298326075077057, + "learning_rate": 0.00011619334797696876, + "loss": 1.4026, + "step": 32256 + }, + { + "epoch": 0.41916500309432575, + "grad_norm": 0.3901742994785309, + "learning_rate": 0.00011619074851505737, + "loss": 1.4589, + "step": 32257 + }, + { + "epoch": 0.41917799763824165, + "grad_norm": 0.4658268094062805, + "learning_rate": 0.00011618814905314601, + "loss": 1.3743, + "step": 32258 + }, + { + "epoch": 0.4191909921821575, + "grad_norm": 0.43392351269721985, + "learning_rate": 0.00011618554959123462, + "loss": 1.5418, + "step": 32259 + }, + { + "epoch": 0.4192039867260734, + "grad_norm": 0.44961997866630554, + "learning_rate": 0.00011618295012932323, + "loss": 1.5159, + "step": 32260 + }, + { + "epoch": 0.41921698126998924, + "grad_norm": 0.3624570369720459, + "learning_rate": 0.00011618035066741184, + "loss": 1.2327, + "step": 32261 + }, + { + "epoch": 0.41922997581390514, + "grad_norm": 0.43030011653900146, + "learning_rate": 0.00011617775120550047, + "loss": 1.3285, + "step": 32262 + }, + { + "epoch": 0.419242970357821, + "grad_norm": 0.34005704522132874, + "learning_rate": 0.00011617515174358908, + "loss": 1.5247, + "step": 32263 + }, + { + "epoch": 0.4192559649017369, + "grad_norm": 0.371330589056015, + "learning_rate": 0.00011617255228167769, + "loss": 1.3898, + "step": 32264 + }, + { + "epoch": 0.41926895944565273, + "grad_norm": 0.32238394021987915, + "learning_rate": 0.0001161699528197663, + "loss": 1.3084, + "step": 32265 + }, + { + "epoch": 0.41928195398956863, + "grad_norm": 0.41983339190483093, + "learning_rate": 0.00011616735335785494, + "loss": 1.2997, + "step": 32266 + }, + { + "epoch": 0.4192949485334845, + "grad_norm": 0.4474751055240631, + "learning_rate": 0.00011616475389594355, + "loss": 1.3101, + "step": 32267 + }, + { + "epoch": 0.4193079430774004, + "grad_norm": 0.4586068093776703, + "learning_rate": 0.00011616215443403216, + "loss": 1.2624, + "step": 32268 + }, + { + "epoch": 0.4193209376213162, + "grad_norm": 0.4348476231098175, + "learning_rate": 0.00011615955497212079, + "loss": 1.4687, + "step": 32269 + }, + { + "epoch": 0.4193339321652321, + "grad_norm": 0.30033254623413086, + "learning_rate": 0.0001161569555102094, + "loss": 1.3672, + "step": 32270 + }, + { + "epoch": 0.41934692670914797, + "grad_norm": 0.44349902868270874, + "learning_rate": 0.00011615435604829801, + "loss": 1.3642, + "step": 32271 + }, + { + "epoch": 0.41935992125306387, + "grad_norm": 0.3840700685977936, + "learning_rate": 0.00011615175658638662, + "loss": 1.4131, + "step": 32272 + }, + { + "epoch": 0.4193729157969797, + "grad_norm": 0.4895966053009033, + "learning_rate": 0.00011614915712447524, + "loss": 1.6234, + "step": 32273 + }, + { + "epoch": 0.4193859103408956, + "grad_norm": 0.465359628200531, + "learning_rate": 0.00011614655766256385, + "loss": 1.5284, + "step": 32274 + }, + { + "epoch": 0.41939890488481146, + "grad_norm": 0.47041377425193787, + "learning_rate": 0.00011614395820065247, + "loss": 1.2472, + "step": 32275 + }, + { + "epoch": 0.41941189942872736, + "grad_norm": 0.46924617886543274, + "learning_rate": 0.00011614135873874108, + "loss": 1.537, + "step": 32276 + }, + { + "epoch": 0.4194248939726432, + "grad_norm": 0.41342389583587646, + "learning_rate": 0.00011613875927682971, + "loss": 1.1781, + "step": 32277 + }, + { + "epoch": 0.4194378885165591, + "grad_norm": 0.476694256067276, + "learning_rate": 0.00011613615981491833, + "loss": 1.3921, + "step": 32278 + }, + { + "epoch": 0.419450883060475, + "grad_norm": 0.28122037649154663, + "learning_rate": 0.00011613356035300694, + "loss": 1.2522, + "step": 32279 + }, + { + "epoch": 0.41946387760439086, + "grad_norm": 0.3382534980773926, + "learning_rate": 0.00011613096089109555, + "loss": 1.3992, + "step": 32280 + }, + { + "epoch": 0.41947687214830676, + "grad_norm": 0.4447752833366394, + "learning_rate": 0.00011612836142918417, + "loss": 1.4606, + "step": 32281 + }, + { + "epoch": 0.4194898666922226, + "grad_norm": 0.4879113435745239, + "learning_rate": 0.00011612576196727278, + "loss": 1.4304, + "step": 32282 + }, + { + "epoch": 0.4195028612361385, + "grad_norm": 0.5712043642997742, + "learning_rate": 0.0001161231625053614, + "loss": 1.3906, + "step": 32283 + }, + { + "epoch": 0.41951585578005435, + "grad_norm": 0.3488975167274475, + "learning_rate": 0.00011612056304345, + "loss": 1.3649, + "step": 32284 + }, + { + "epoch": 0.41952885032397025, + "grad_norm": 0.40721893310546875, + "learning_rate": 0.00011611796358153863, + "loss": 1.4337, + "step": 32285 + }, + { + "epoch": 0.4195418448678861, + "grad_norm": 0.38627147674560547, + "learning_rate": 0.00011611536411962724, + "loss": 1.4006, + "step": 32286 + }, + { + "epoch": 0.419554839411802, + "grad_norm": 0.5106608867645264, + "learning_rate": 0.00011611276465771585, + "loss": 1.4741, + "step": 32287 + }, + { + "epoch": 0.41956783395571784, + "grad_norm": 0.443295419216156, + "learning_rate": 0.00011611016519580446, + "loss": 1.5234, + "step": 32288 + }, + { + "epoch": 0.41958082849963374, + "grad_norm": 0.3780266046524048, + "learning_rate": 0.0001161075657338931, + "loss": 1.3881, + "step": 32289 + }, + { + "epoch": 0.4195938230435496, + "grad_norm": 0.49872884154319763, + "learning_rate": 0.00011610496627198171, + "loss": 1.5286, + "step": 32290 + }, + { + "epoch": 0.4196068175874655, + "grad_norm": 0.3499641716480255, + "learning_rate": 0.00011610236681007032, + "loss": 1.3981, + "step": 32291 + }, + { + "epoch": 0.41961981213138133, + "grad_norm": 0.4204707145690918, + "learning_rate": 0.00011609976734815893, + "loss": 1.5844, + "step": 32292 + }, + { + "epoch": 0.41963280667529723, + "grad_norm": 0.41294464468955994, + "learning_rate": 0.00011609716788624756, + "loss": 1.3681, + "step": 32293 + }, + { + "epoch": 0.4196458012192131, + "grad_norm": 0.33723780512809753, + "learning_rate": 0.00011609456842433617, + "loss": 1.436, + "step": 32294 + }, + { + "epoch": 0.419658795763129, + "grad_norm": 0.35678631067276, + "learning_rate": 0.00011609196896242478, + "loss": 1.4095, + "step": 32295 + }, + { + "epoch": 0.4196717903070448, + "grad_norm": 0.3804474472999573, + "learning_rate": 0.00011608936950051339, + "loss": 1.3847, + "step": 32296 + }, + { + "epoch": 0.4196847848509607, + "grad_norm": 0.47448214888572693, + "learning_rate": 0.00011608677003860203, + "loss": 1.4121, + "step": 32297 + }, + { + "epoch": 0.41969777939487657, + "grad_norm": 0.4430135190486908, + "learning_rate": 0.00011608417057669063, + "loss": 1.2286, + "step": 32298 + }, + { + "epoch": 0.41971077393879247, + "grad_norm": 0.4234481155872345, + "learning_rate": 0.00011608157111477924, + "loss": 1.6024, + "step": 32299 + }, + { + "epoch": 0.4197237684827083, + "grad_norm": 0.25687652826309204, + "learning_rate": 0.00011607897165286785, + "loss": 1.2493, + "step": 32300 + }, + { + "epoch": 0.4197367630266242, + "grad_norm": 0.5005450248718262, + "learning_rate": 0.00011607637219095649, + "loss": 1.404, + "step": 32301 + }, + { + "epoch": 0.41974975757054006, + "grad_norm": 0.39949968457221985, + "learning_rate": 0.0001160737727290451, + "loss": 1.5043, + "step": 32302 + }, + { + "epoch": 0.41976275211445596, + "grad_norm": 0.4189079701900482, + "learning_rate": 0.00011607117326713371, + "loss": 1.4088, + "step": 32303 + }, + { + "epoch": 0.4197757466583718, + "grad_norm": 0.39433741569519043, + "learning_rate": 0.00011606857380522232, + "loss": 1.5064, + "step": 32304 + }, + { + "epoch": 0.4197887412022877, + "grad_norm": 0.36041754484176636, + "learning_rate": 0.00011606597434331094, + "loss": 1.2977, + "step": 32305 + }, + { + "epoch": 0.41980173574620355, + "grad_norm": 0.4624941647052765, + "learning_rate": 0.00011606337488139955, + "loss": 1.4347, + "step": 32306 + }, + { + "epoch": 0.41981473029011945, + "grad_norm": 0.4705483019351959, + "learning_rate": 0.00011606077541948816, + "loss": 1.4634, + "step": 32307 + }, + { + "epoch": 0.4198277248340353, + "grad_norm": 0.3786124289035797, + "learning_rate": 0.0001160581759575768, + "loss": 1.323, + "step": 32308 + }, + { + "epoch": 0.4198407193779512, + "grad_norm": 0.3233451545238495, + "learning_rate": 0.00011605557649566541, + "loss": 1.2716, + "step": 32309 + }, + { + "epoch": 0.41985371392186704, + "grad_norm": 0.34478846192359924, + "learning_rate": 0.00011605297703375402, + "loss": 1.3264, + "step": 32310 + }, + { + "epoch": 0.41986670846578295, + "grad_norm": 0.25865858793258667, + "learning_rate": 0.00011605037757184262, + "loss": 1.3168, + "step": 32311 + }, + { + "epoch": 0.4198797030096988, + "grad_norm": 0.3190869987010956, + "learning_rate": 0.00011604777810993126, + "loss": 1.483, + "step": 32312 + }, + { + "epoch": 0.4198926975536147, + "grad_norm": 0.45041441917419434, + "learning_rate": 0.00011604517864801987, + "loss": 1.4352, + "step": 32313 + }, + { + "epoch": 0.41990569209753054, + "grad_norm": 0.3702928125858307, + "learning_rate": 0.00011604257918610848, + "loss": 1.3017, + "step": 32314 + }, + { + "epoch": 0.41991868664144644, + "grad_norm": 0.45153993368148804, + "learning_rate": 0.00011603997972419709, + "loss": 1.3046, + "step": 32315 + }, + { + "epoch": 0.4199316811853623, + "grad_norm": 0.4125082492828369, + "learning_rate": 0.00011603738026228572, + "loss": 1.2592, + "step": 32316 + }, + { + "epoch": 0.4199446757292782, + "grad_norm": 0.43710389733314514, + "learning_rate": 0.00011603478080037433, + "loss": 1.3791, + "step": 32317 + }, + { + "epoch": 0.41995767027319403, + "grad_norm": 0.31893301010131836, + "learning_rate": 0.00011603218133846294, + "loss": 1.4254, + "step": 32318 + }, + { + "epoch": 0.41997066481710993, + "grad_norm": 0.3802414834499359, + "learning_rate": 0.00011602958187655155, + "loss": 1.4624, + "step": 32319 + }, + { + "epoch": 0.4199836593610258, + "grad_norm": 0.4751666188240051, + "learning_rate": 0.00011602698241464019, + "loss": 1.4983, + "step": 32320 + }, + { + "epoch": 0.4199966539049417, + "grad_norm": 0.3258700668811798, + "learning_rate": 0.0001160243829527288, + "loss": 1.4564, + "step": 32321 + }, + { + "epoch": 0.4200096484488575, + "grad_norm": 0.31875380873680115, + "learning_rate": 0.00011602178349081741, + "loss": 1.4417, + "step": 32322 + }, + { + "epoch": 0.4200226429927734, + "grad_norm": 0.37079137563705444, + "learning_rate": 0.00011601918402890601, + "loss": 1.3303, + "step": 32323 + }, + { + "epoch": 0.42003563753668927, + "grad_norm": 0.3270212411880493, + "learning_rate": 0.00011601658456699465, + "loss": 1.0694, + "step": 32324 + }, + { + "epoch": 0.42004863208060517, + "grad_norm": 0.39389216899871826, + "learning_rate": 0.00011601398510508326, + "loss": 1.3781, + "step": 32325 + }, + { + "epoch": 0.420061626624521, + "grad_norm": 0.43165719509124756, + "learning_rate": 0.00011601138564317187, + "loss": 1.3574, + "step": 32326 + }, + { + "epoch": 0.4200746211684369, + "grad_norm": 0.4926842749118805, + "learning_rate": 0.00011600878618126048, + "loss": 1.4515, + "step": 32327 + }, + { + "epoch": 0.42008761571235276, + "grad_norm": 0.42954862117767334, + "learning_rate": 0.0001160061867193491, + "loss": 1.5662, + "step": 32328 + }, + { + "epoch": 0.42010061025626866, + "grad_norm": 0.3975192606449127, + "learning_rate": 0.00011600358725743771, + "loss": 1.4922, + "step": 32329 + }, + { + "epoch": 0.4201136048001845, + "grad_norm": 0.3848022520542145, + "learning_rate": 0.00011600098779552632, + "loss": 1.293, + "step": 32330 + }, + { + "epoch": 0.4201265993441004, + "grad_norm": 0.36023685336112976, + "learning_rate": 0.00011599838833361494, + "loss": 1.3819, + "step": 32331 + }, + { + "epoch": 0.42013959388801625, + "grad_norm": 0.35359495878219604, + "learning_rate": 0.00011599578887170357, + "loss": 1.3193, + "step": 32332 + }, + { + "epoch": 0.42015258843193215, + "grad_norm": 0.4508792757987976, + "learning_rate": 0.00011599318940979218, + "loss": 1.4522, + "step": 32333 + }, + { + "epoch": 0.420165582975848, + "grad_norm": 0.4280056655406952, + "learning_rate": 0.0001159905899478808, + "loss": 1.4169, + "step": 32334 + }, + { + "epoch": 0.4201785775197639, + "grad_norm": 0.392936646938324, + "learning_rate": 0.0001159879904859694, + "loss": 1.4968, + "step": 32335 + }, + { + "epoch": 0.42019157206367974, + "grad_norm": 0.3698326051235199, + "learning_rate": 0.00011598539102405803, + "loss": 1.5217, + "step": 32336 + }, + { + "epoch": 0.42020456660759564, + "grad_norm": 0.3180531859397888, + "learning_rate": 0.00011598279156214664, + "loss": 1.3354, + "step": 32337 + }, + { + "epoch": 0.4202175611515115, + "grad_norm": 0.32269519567489624, + "learning_rate": 0.00011598019210023525, + "loss": 1.1977, + "step": 32338 + }, + { + "epoch": 0.4202305556954274, + "grad_norm": 0.4611388146877289, + "learning_rate": 0.00011597759263832386, + "loss": 1.5446, + "step": 32339 + }, + { + "epoch": 0.42024355023934323, + "grad_norm": 0.36423933506011963, + "learning_rate": 0.00011597499317641249, + "loss": 1.3829, + "step": 32340 + }, + { + "epoch": 0.42025654478325913, + "grad_norm": 0.4284282326698303, + "learning_rate": 0.0001159723937145011, + "loss": 1.5391, + "step": 32341 + }, + { + "epoch": 0.420269539327175, + "grad_norm": 0.40483248233795166, + "learning_rate": 0.00011596979425258971, + "loss": 1.4558, + "step": 32342 + }, + { + "epoch": 0.4202825338710909, + "grad_norm": 0.38728538155555725, + "learning_rate": 0.00011596719479067835, + "loss": 1.3604, + "step": 32343 + }, + { + "epoch": 0.4202955284150067, + "grad_norm": 0.4580335021018982, + "learning_rate": 0.00011596459532876696, + "loss": 1.295, + "step": 32344 + }, + { + "epoch": 0.4203085229589226, + "grad_norm": 0.30721211433410645, + "learning_rate": 0.00011596199586685557, + "loss": 1.2357, + "step": 32345 + }, + { + "epoch": 0.42032151750283847, + "grad_norm": 0.3419477939605713, + "learning_rate": 0.00011595939640494418, + "loss": 1.4356, + "step": 32346 + }, + { + "epoch": 0.42033451204675437, + "grad_norm": 0.3426543176174164, + "learning_rate": 0.0001159567969430328, + "loss": 1.379, + "step": 32347 + }, + { + "epoch": 0.4203475065906702, + "grad_norm": 0.3871789872646332, + "learning_rate": 0.00011595419748112142, + "loss": 1.4562, + "step": 32348 + }, + { + "epoch": 0.4203605011345861, + "grad_norm": 0.40265098214149475, + "learning_rate": 0.00011595159801921003, + "loss": 1.3805, + "step": 32349 + }, + { + "epoch": 0.42037349567850196, + "grad_norm": 0.4091636538505554, + "learning_rate": 0.00011594899855729864, + "loss": 1.461, + "step": 32350 + }, + { + "epoch": 0.42038649022241786, + "grad_norm": 0.36400511860847473, + "learning_rate": 0.00011594639909538728, + "loss": 1.3014, + "step": 32351 + }, + { + "epoch": 0.4203994847663337, + "grad_norm": 0.4031924307346344, + "learning_rate": 0.00011594379963347589, + "loss": 1.356, + "step": 32352 + }, + { + "epoch": 0.4204124793102496, + "grad_norm": 0.3914255201816559, + "learning_rate": 0.00011594120017156448, + "loss": 1.42, + "step": 32353 + }, + { + "epoch": 0.42042547385416545, + "grad_norm": 0.38627889752388, + "learning_rate": 0.0001159386007096531, + "loss": 1.337, + "step": 32354 + }, + { + "epoch": 0.42043846839808136, + "grad_norm": 0.556472659111023, + "learning_rate": 0.00011593600124774173, + "loss": 1.2022, + "step": 32355 + }, + { + "epoch": 0.42045146294199726, + "grad_norm": 0.46469444036483765, + "learning_rate": 0.00011593340178583034, + "loss": 1.3123, + "step": 32356 + }, + { + "epoch": 0.4204644574859131, + "grad_norm": 0.35841071605682373, + "learning_rate": 0.00011593080232391896, + "loss": 1.2154, + "step": 32357 + }, + { + "epoch": 0.420477452029829, + "grad_norm": 0.43212684988975525, + "learning_rate": 0.00011592820286200757, + "loss": 1.6088, + "step": 32358 + }, + { + "epoch": 0.42049044657374485, + "grad_norm": 0.46963879466056824, + "learning_rate": 0.00011592560340009619, + "loss": 1.4245, + "step": 32359 + }, + { + "epoch": 0.42050344111766075, + "grad_norm": 0.32945647835731506, + "learning_rate": 0.0001159230039381848, + "loss": 1.4608, + "step": 32360 + }, + { + "epoch": 0.4205164356615766, + "grad_norm": 0.48712360858917236, + "learning_rate": 0.00011592040447627341, + "loss": 1.5481, + "step": 32361 + }, + { + "epoch": 0.4205294302054925, + "grad_norm": 0.437644898891449, + "learning_rate": 0.00011591780501436202, + "loss": 1.3199, + "step": 32362 + }, + { + "epoch": 0.42054242474940834, + "grad_norm": 0.3633168041706085, + "learning_rate": 0.00011591520555245066, + "loss": 1.3424, + "step": 32363 + }, + { + "epoch": 0.42055541929332424, + "grad_norm": 0.35237812995910645, + "learning_rate": 0.00011591260609053927, + "loss": 1.36, + "step": 32364 + }, + { + "epoch": 0.4205684138372401, + "grad_norm": 0.3826889395713806, + "learning_rate": 0.00011591000662862787, + "loss": 1.3326, + "step": 32365 + }, + { + "epoch": 0.420581408381156, + "grad_norm": 0.3915887773036957, + "learning_rate": 0.00011590740716671648, + "loss": 1.5658, + "step": 32366 + }, + { + "epoch": 0.42059440292507183, + "grad_norm": 0.45203033089637756, + "learning_rate": 0.00011590480770480512, + "loss": 1.3102, + "step": 32367 + }, + { + "epoch": 0.42060739746898773, + "grad_norm": 0.4447691738605499, + "learning_rate": 0.00011590220824289373, + "loss": 1.3894, + "step": 32368 + }, + { + "epoch": 0.4206203920129036, + "grad_norm": 0.4356711804866791, + "learning_rate": 0.00011589960878098234, + "loss": 1.5708, + "step": 32369 + }, + { + "epoch": 0.4206333865568195, + "grad_norm": 0.45708414912223816, + "learning_rate": 0.00011589700931907095, + "loss": 1.518, + "step": 32370 + }, + { + "epoch": 0.4206463811007353, + "grad_norm": 0.40342941880226135, + "learning_rate": 0.00011589440985715958, + "loss": 1.4355, + "step": 32371 + }, + { + "epoch": 0.4206593756446512, + "grad_norm": 0.4190525710582733, + "learning_rate": 0.00011589181039524819, + "loss": 1.4225, + "step": 32372 + }, + { + "epoch": 0.42067237018856707, + "grad_norm": 0.39301496744155884, + "learning_rate": 0.0001158892109333368, + "loss": 1.2063, + "step": 32373 + }, + { + "epoch": 0.42068536473248297, + "grad_norm": 0.3339405953884125, + "learning_rate": 0.00011588661147142541, + "loss": 1.4193, + "step": 32374 + }, + { + "epoch": 0.4206983592763988, + "grad_norm": 0.37180769443511963, + "learning_rate": 0.00011588401200951405, + "loss": 1.4794, + "step": 32375 + }, + { + "epoch": 0.4207113538203147, + "grad_norm": 0.4803014099597931, + "learning_rate": 0.00011588141254760266, + "loss": 1.3188, + "step": 32376 + }, + { + "epoch": 0.42072434836423056, + "grad_norm": 0.37827426195144653, + "learning_rate": 0.00011587881308569127, + "loss": 1.4775, + "step": 32377 + }, + { + "epoch": 0.42073734290814646, + "grad_norm": 0.3197339177131653, + "learning_rate": 0.00011587621362377987, + "loss": 1.2691, + "step": 32378 + }, + { + "epoch": 0.4207503374520623, + "grad_norm": 0.45096883177757263, + "learning_rate": 0.0001158736141618685, + "loss": 1.6366, + "step": 32379 + }, + { + "epoch": 0.4207633319959782, + "grad_norm": 0.3647748529911041, + "learning_rate": 0.00011587101469995711, + "loss": 1.4396, + "step": 32380 + }, + { + "epoch": 0.42077632653989405, + "grad_norm": 0.4202468693256378, + "learning_rate": 0.00011586841523804573, + "loss": 1.3329, + "step": 32381 + }, + { + "epoch": 0.42078932108380995, + "grad_norm": 0.3129234313964844, + "learning_rate": 0.00011586581577613435, + "loss": 1.2121, + "step": 32382 + }, + { + "epoch": 0.4208023156277258, + "grad_norm": 0.461628258228302, + "learning_rate": 0.00011586321631422296, + "loss": 1.2551, + "step": 32383 + }, + { + "epoch": 0.4208153101716417, + "grad_norm": 0.3742586672306061, + "learning_rate": 0.00011586061685231157, + "loss": 1.4465, + "step": 32384 + }, + { + "epoch": 0.42082830471555754, + "grad_norm": 0.3721318542957306, + "learning_rate": 0.00011585801739040018, + "loss": 1.4255, + "step": 32385 + }, + { + "epoch": 0.42084129925947344, + "grad_norm": 0.298602819442749, + "learning_rate": 0.00011585541792848882, + "loss": 1.5013, + "step": 32386 + }, + { + "epoch": 0.4208542938033893, + "grad_norm": 0.37362757325172424, + "learning_rate": 0.00011585281846657743, + "loss": 1.5203, + "step": 32387 + }, + { + "epoch": 0.4208672883473052, + "grad_norm": 0.4179818630218506, + "learning_rate": 0.00011585021900466604, + "loss": 1.3826, + "step": 32388 + }, + { + "epoch": 0.42088028289122104, + "grad_norm": 0.38251161575317383, + "learning_rate": 0.00011584761954275465, + "loss": 1.2808, + "step": 32389 + }, + { + "epoch": 0.42089327743513694, + "grad_norm": 0.37224841117858887, + "learning_rate": 0.00011584502008084328, + "loss": 1.4339, + "step": 32390 + }, + { + "epoch": 0.4209062719790528, + "grad_norm": 0.33931422233581543, + "learning_rate": 0.00011584242061893189, + "loss": 1.43, + "step": 32391 + }, + { + "epoch": 0.4209192665229687, + "grad_norm": 0.4628772437572479, + "learning_rate": 0.0001158398211570205, + "loss": 1.455, + "step": 32392 + }, + { + "epoch": 0.4209322610668845, + "grad_norm": 0.3676503002643585, + "learning_rate": 0.00011583722169510911, + "loss": 1.5402, + "step": 32393 + }, + { + "epoch": 0.42094525561080043, + "grad_norm": 0.41572582721710205, + "learning_rate": 0.00011583462223319775, + "loss": 1.7797, + "step": 32394 + }, + { + "epoch": 0.4209582501547163, + "grad_norm": 0.2746230959892273, + "learning_rate": 0.00011583202277128635, + "loss": 1.4575, + "step": 32395 + }, + { + "epoch": 0.4209712446986322, + "grad_norm": 0.36138656735420227, + "learning_rate": 0.00011582942330937496, + "loss": 1.398, + "step": 32396 + }, + { + "epoch": 0.420984239242548, + "grad_norm": 0.42010214924812317, + "learning_rate": 0.00011582682384746357, + "loss": 1.554, + "step": 32397 + }, + { + "epoch": 0.4209972337864639, + "grad_norm": 0.3860754370689392, + "learning_rate": 0.0001158242243855522, + "loss": 1.3548, + "step": 32398 + }, + { + "epoch": 0.42101022833037977, + "grad_norm": 0.34347113966941833, + "learning_rate": 0.00011582162492364082, + "loss": 1.3631, + "step": 32399 + }, + { + "epoch": 0.42102322287429567, + "grad_norm": 0.42275938391685486, + "learning_rate": 0.00011581902546172943, + "loss": 1.422, + "step": 32400 + }, + { + "epoch": 0.4210362174182115, + "grad_norm": 0.3430614471435547, + "learning_rate": 0.00011581642599981804, + "loss": 1.2599, + "step": 32401 + }, + { + "epoch": 0.4210492119621274, + "grad_norm": 0.4406384229660034, + "learning_rate": 0.00011581382653790666, + "loss": 1.3838, + "step": 32402 + }, + { + "epoch": 0.42106220650604326, + "grad_norm": 0.3799358904361725, + "learning_rate": 0.00011581122707599527, + "loss": 1.317, + "step": 32403 + }, + { + "epoch": 0.42107520104995916, + "grad_norm": 0.4151029586791992, + "learning_rate": 0.00011580862761408389, + "loss": 1.4482, + "step": 32404 + }, + { + "epoch": 0.421088195593875, + "grad_norm": 0.45835432410240173, + "learning_rate": 0.0001158060281521725, + "loss": 1.5123, + "step": 32405 + }, + { + "epoch": 0.4211011901377909, + "grad_norm": 0.47239863872528076, + "learning_rate": 0.00011580342869026113, + "loss": 1.3094, + "step": 32406 + }, + { + "epoch": 0.42111418468170675, + "grad_norm": 0.4508815109729767, + "learning_rate": 0.00011580082922834973, + "loss": 1.3516, + "step": 32407 + }, + { + "epoch": 0.42112717922562265, + "grad_norm": 0.39499521255493164, + "learning_rate": 0.00011579822976643834, + "loss": 1.3663, + "step": 32408 + }, + { + "epoch": 0.4211401737695385, + "grad_norm": 0.38398101925849915, + "learning_rate": 0.00011579563030452695, + "loss": 1.4474, + "step": 32409 + }, + { + "epoch": 0.4211531683134544, + "grad_norm": 0.42292532324790955, + "learning_rate": 0.00011579303084261559, + "loss": 1.4365, + "step": 32410 + }, + { + "epoch": 0.42116616285737024, + "grad_norm": 0.3487517535686493, + "learning_rate": 0.0001157904313807042, + "loss": 1.1576, + "step": 32411 + }, + { + "epoch": 0.42117915740128614, + "grad_norm": 0.4398074746131897, + "learning_rate": 0.00011578783191879281, + "loss": 1.2777, + "step": 32412 + }, + { + "epoch": 0.421192151945202, + "grad_norm": 0.2918659448623657, + "learning_rate": 0.00011578523245688142, + "loss": 1.3149, + "step": 32413 + }, + { + "epoch": 0.4212051464891179, + "grad_norm": 0.3867429196834564, + "learning_rate": 0.00011578263299497005, + "loss": 1.5261, + "step": 32414 + }, + { + "epoch": 0.42121814103303373, + "grad_norm": 0.4029358923435211, + "learning_rate": 0.00011578003353305866, + "loss": 1.404, + "step": 32415 + }, + { + "epoch": 0.42123113557694963, + "grad_norm": 0.2864662706851959, + "learning_rate": 0.00011577743407114727, + "loss": 1.3101, + "step": 32416 + }, + { + "epoch": 0.4212441301208655, + "grad_norm": 0.40627962350845337, + "learning_rate": 0.00011577483460923588, + "loss": 1.4149, + "step": 32417 + }, + { + "epoch": 0.4212571246647814, + "grad_norm": 0.45507243275642395, + "learning_rate": 0.00011577223514732452, + "loss": 1.4147, + "step": 32418 + }, + { + "epoch": 0.4212701192086972, + "grad_norm": 0.5283242464065552, + "learning_rate": 0.00011576963568541313, + "loss": 1.5259, + "step": 32419 + }, + { + "epoch": 0.4212831137526131, + "grad_norm": 0.4729507267475128, + "learning_rate": 0.00011576703622350173, + "loss": 1.4683, + "step": 32420 + }, + { + "epoch": 0.42129610829652897, + "grad_norm": 0.3660680949687958, + "learning_rate": 0.00011576443676159037, + "loss": 1.4725, + "step": 32421 + }, + { + "epoch": 0.42130910284044487, + "grad_norm": 0.34618934988975525, + "learning_rate": 0.00011576183729967898, + "loss": 1.4073, + "step": 32422 + }, + { + "epoch": 0.4213220973843607, + "grad_norm": 0.4827609360218048, + "learning_rate": 0.00011575923783776759, + "loss": 1.4843, + "step": 32423 + }, + { + "epoch": 0.4213350919282766, + "grad_norm": 0.4057334363460541, + "learning_rate": 0.0001157566383758562, + "loss": 1.2198, + "step": 32424 + }, + { + "epoch": 0.42134808647219246, + "grad_norm": 0.4930001199245453, + "learning_rate": 0.00011575403891394482, + "loss": 1.4337, + "step": 32425 + }, + { + "epoch": 0.42136108101610836, + "grad_norm": 0.39155885577201843, + "learning_rate": 0.00011575143945203343, + "loss": 1.4534, + "step": 32426 + }, + { + "epoch": 0.4213740755600242, + "grad_norm": 0.38546162843704224, + "learning_rate": 0.00011574883999012205, + "loss": 1.5163, + "step": 32427 + }, + { + "epoch": 0.4213870701039401, + "grad_norm": 0.351840615272522, + "learning_rate": 0.00011574624052821066, + "loss": 1.4753, + "step": 32428 + }, + { + "epoch": 0.42140006464785595, + "grad_norm": 0.41888755559921265, + "learning_rate": 0.0001157436410662993, + "loss": 1.3816, + "step": 32429 + }, + { + "epoch": 0.42141305919177185, + "grad_norm": 0.3877086341381073, + "learning_rate": 0.0001157410416043879, + "loss": 1.2635, + "step": 32430 + }, + { + "epoch": 0.42142605373568776, + "grad_norm": 0.35030055046081543, + "learning_rate": 0.00011573844214247652, + "loss": 1.4179, + "step": 32431 + }, + { + "epoch": 0.4214390482796036, + "grad_norm": 0.39953339099884033, + "learning_rate": 0.00011573584268056513, + "loss": 1.2546, + "step": 32432 + }, + { + "epoch": 0.4214520428235195, + "grad_norm": 0.4239491820335388, + "learning_rate": 0.00011573324321865375, + "loss": 1.563, + "step": 32433 + }, + { + "epoch": 0.42146503736743535, + "grad_norm": 0.2957848012447357, + "learning_rate": 0.00011573064375674236, + "loss": 1.2331, + "step": 32434 + }, + { + "epoch": 0.42147803191135125, + "grad_norm": 0.4347279667854309, + "learning_rate": 0.00011572804429483097, + "loss": 1.2822, + "step": 32435 + }, + { + "epoch": 0.4214910264552671, + "grad_norm": 0.2783936858177185, + "learning_rate": 0.00011572544483291958, + "loss": 1.3009, + "step": 32436 + }, + { + "epoch": 0.421504020999183, + "grad_norm": 0.25434961915016174, + "learning_rate": 0.00011572284537100821, + "loss": 1.23, + "step": 32437 + }, + { + "epoch": 0.42151701554309884, + "grad_norm": 0.4028712213039398, + "learning_rate": 0.00011572024590909682, + "loss": 1.4765, + "step": 32438 + }, + { + "epoch": 0.42153001008701474, + "grad_norm": 0.2965194284915924, + "learning_rate": 0.00011571764644718543, + "loss": 1.3242, + "step": 32439 + }, + { + "epoch": 0.4215430046309306, + "grad_norm": 0.29522132873535156, + "learning_rate": 0.00011571504698527404, + "loss": 1.2425, + "step": 32440 + }, + { + "epoch": 0.4215559991748465, + "grad_norm": 0.3713493347167969, + "learning_rate": 0.00011571244752336268, + "loss": 1.3776, + "step": 32441 + }, + { + "epoch": 0.42156899371876233, + "grad_norm": 0.28610777854919434, + "learning_rate": 0.00011570984806145129, + "loss": 1.4438, + "step": 32442 + }, + { + "epoch": 0.42158198826267823, + "grad_norm": 0.3771226704120636, + "learning_rate": 0.0001157072485995399, + "loss": 1.3282, + "step": 32443 + }, + { + "epoch": 0.4215949828065941, + "grad_norm": 0.44139981269836426, + "learning_rate": 0.00011570464913762851, + "loss": 1.3342, + "step": 32444 + }, + { + "epoch": 0.42160797735051, + "grad_norm": 0.34314075112342834, + "learning_rate": 0.00011570204967571714, + "loss": 1.3384, + "step": 32445 + }, + { + "epoch": 0.4216209718944258, + "grad_norm": 0.43567466735839844, + "learning_rate": 0.00011569945021380575, + "loss": 1.4709, + "step": 32446 + }, + { + "epoch": 0.4216339664383417, + "grad_norm": 0.4547687768936157, + "learning_rate": 0.00011569685075189436, + "loss": 1.3817, + "step": 32447 + }, + { + "epoch": 0.42164696098225757, + "grad_norm": 0.4425061345100403, + "learning_rate": 0.00011569425128998297, + "loss": 1.3218, + "step": 32448 + }, + { + "epoch": 0.42165995552617347, + "grad_norm": 0.4090802073478699, + "learning_rate": 0.0001156916518280716, + "loss": 1.4825, + "step": 32449 + }, + { + "epoch": 0.4216729500700893, + "grad_norm": 0.3710377812385559, + "learning_rate": 0.0001156890523661602, + "loss": 1.3153, + "step": 32450 + }, + { + "epoch": 0.4216859446140052, + "grad_norm": 0.36442914605140686, + "learning_rate": 0.00011568645290424882, + "loss": 1.3846, + "step": 32451 + }, + { + "epoch": 0.42169893915792106, + "grad_norm": 0.3278874456882477, + "learning_rate": 0.00011568385344233743, + "loss": 1.5574, + "step": 32452 + }, + { + "epoch": 0.42171193370183696, + "grad_norm": 0.3954211175441742, + "learning_rate": 0.00011568125398042607, + "loss": 1.3831, + "step": 32453 + }, + { + "epoch": 0.4217249282457528, + "grad_norm": 0.48358291387557983, + "learning_rate": 0.00011567865451851468, + "loss": 1.3582, + "step": 32454 + }, + { + "epoch": 0.4217379227896687, + "grad_norm": 0.3833157420158386, + "learning_rate": 0.00011567605505660329, + "loss": 1.3909, + "step": 32455 + }, + { + "epoch": 0.42175091733358455, + "grad_norm": 0.42973846197128296, + "learning_rate": 0.00011567345559469191, + "loss": 1.3112, + "step": 32456 + }, + { + "epoch": 0.42176391187750045, + "grad_norm": 0.418171763420105, + "learning_rate": 0.00011567085613278052, + "loss": 1.4071, + "step": 32457 + }, + { + "epoch": 0.4217769064214163, + "grad_norm": 0.34298932552337646, + "learning_rate": 0.00011566825667086913, + "loss": 1.3623, + "step": 32458 + }, + { + "epoch": 0.4217899009653322, + "grad_norm": 0.42031776905059814, + "learning_rate": 0.00011566565720895774, + "loss": 1.2802, + "step": 32459 + }, + { + "epoch": 0.42180289550924804, + "grad_norm": 0.32334578037261963, + "learning_rate": 0.00011566305774704638, + "loss": 1.2745, + "step": 32460 + }, + { + "epoch": 0.42181589005316394, + "grad_norm": 0.3761952221393585, + "learning_rate": 0.000115660458285135, + "loss": 1.5443, + "step": 32461 + }, + { + "epoch": 0.4218288845970798, + "grad_norm": 0.3793239891529083, + "learning_rate": 0.00011565785882322359, + "loss": 1.352, + "step": 32462 + }, + { + "epoch": 0.4218418791409957, + "grad_norm": 0.41486847400665283, + "learning_rate": 0.0001156552593613122, + "loss": 1.2028, + "step": 32463 + }, + { + "epoch": 0.42185487368491154, + "grad_norm": 0.5224927067756653, + "learning_rate": 0.00011565265989940084, + "loss": 1.3581, + "step": 32464 + }, + { + "epoch": 0.42186786822882744, + "grad_norm": 0.4299875497817993, + "learning_rate": 0.00011565006043748945, + "loss": 1.2916, + "step": 32465 + }, + { + "epoch": 0.4218808627727433, + "grad_norm": 0.4397324323654175, + "learning_rate": 0.00011564746097557806, + "loss": 1.1838, + "step": 32466 + }, + { + "epoch": 0.4218938573166592, + "grad_norm": 0.4065145254135132, + "learning_rate": 0.00011564486151366667, + "loss": 1.5422, + "step": 32467 + }, + { + "epoch": 0.421906851860575, + "grad_norm": 0.30650031566619873, + "learning_rate": 0.0001156422620517553, + "loss": 1.3082, + "step": 32468 + }, + { + "epoch": 0.42191984640449093, + "grad_norm": 0.40959760546684265, + "learning_rate": 0.00011563966258984391, + "loss": 1.2993, + "step": 32469 + }, + { + "epoch": 0.4219328409484068, + "grad_norm": 0.40634217858314514, + "learning_rate": 0.00011563706312793252, + "loss": 1.5222, + "step": 32470 + }, + { + "epoch": 0.4219458354923227, + "grad_norm": 0.4975477159023285, + "learning_rate": 0.00011563446366602113, + "loss": 1.4965, + "step": 32471 + }, + { + "epoch": 0.4219588300362385, + "grad_norm": 0.29792264103889465, + "learning_rate": 0.00011563186420410977, + "loss": 1.3949, + "step": 32472 + }, + { + "epoch": 0.4219718245801544, + "grad_norm": 0.4249768853187561, + "learning_rate": 0.00011562926474219838, + "loss": 1.3595, + "step": 32473 + }, + { + "epoch": 0.42198481912407027, + "grad_norm": 0.40076184272766113, + "learning_rate": 0.00011562666528028699, + "loss": 1.4992, + "step": 32474 + }, + { + "epoch": 0.42199781366798617, + "grad_norm": 0.3255220055580139, + "learning_rate": 0.00011562406581837559, + "loss": 1.3101, + "step": 32475 + }, + { + "epoch": 0.422010808211902, + "grad_norm": 0.27454039454460144, + "learning_rate": 0.00011562146635646423, + "loss": 1.3713, + "step": 32476 + }, + { + "epoch": 0.4220238027558179, + "grad_norm": 0.3896216154098511, + "learning_rate": 0.00011561886689455284, + "loss": 1.3059, + "step": 32477 + }, + { + "epoch": 0.42203679729973376, + "grad_norm": 0.33177557587623596, + "learning_rate": 0.00011561626743264145, + "loss": 1.2798, + "step": 32478 + }, + { + "epoch": 0.42204979184364966, + "grad_norm": 0.264780193567276, + "learning_rate": 0.00011561366797073006, + "loss": 1.366, + "step": 32479 + }, + { + "epoch": 0.4220627863875655, + "grad_norm": 0.4660983681678772, + "learning_rate": 0.00011561106850881868, + "loss": 1.4207, + "step": 32480 + }, + { + "epoch": 0.4220757809314814, + "grad_norm": 0.3910813629627228, + "learning_rate": 0.0001156084690469073, + "loss": 1.3643, + "step": 32481 + }, + { + "epoch": 0.42208877547539725, + "grad_norm": 0.4356350302696228, + "learning_rate": 0.0001156058695849959, + "loss": 1.526, + "step": 32482 + }, + { + "epoch": 0.42210177001931315, + "grad_norm": 0.34705057740211487, + "learning_rate": 0.00011560327012308452, + "loss": 1.4244, + "step": 32483 + }, + { + "epoch": 0.422114764563229, + "grad_norm": 0.3503780961036682, + "learning_rate": 0.00011560067066117315, + "loss": 1.3386, + "step": 32484 + }, + { + "epoch": 0.4221277591071449, + "grad_norm": 0.39164572954177856, + "learning_rate": 0.00011559807119926176, + "loss": 1.5428, + "step": 32485 + }, + { + "epoch": 0.42214075365106074, + "grad_norm": 0.36391976475715637, + "learning_rate": 0.00011559547173735038, + "loss": 1.3764, + "step": 32486 + }, + { + "epoch": 0.42215374819497664, + "grad_norm": 0.4350518584251404, + "learning_rate": 0.00011559287227543897, + "loss": 1.3942, + "step": 32487 + }, + { + "epoch": 0.4221667427388925, + "grad_norm": 0.444780558347702, + "learning_rate": 0.00011559027281352761, + "loss": 1.3879, + "step": 32488 + }, + { + "epoch": 0.4221797372828084, + "grad_norm": 0.5134239196777344, + "learning_rate": 0.00011558767335161622, + "loss": 1.4388, + "step": 32489 + }, + { + "epoch": 0.42219273182672423, + "grad_norm": 0.42801064252853394, + "learning_rate": 0.00011558507388970483, + "loss": 1.5003, + "step": 32490 + }, + { + "epoch": 0.42220572637064013, + "grad_norm": 0.4335562288761139, + "learning_rate": 0.00011558247442779344, + "loss": 1.3406, + "step": 32491 + }, + { + "epoch": 0.422218720914556, + "grad_norm": 0.29763683676719666, + "learning_rate": 0.00011557987496588207, + "loss": 1.4418, + "step": 32492 + }, + { + "epoch": 0.4222317154584719, + "grad_norm": 0.4186922311782837, + "learning_rate": 0.00011557727550397068, + "loss": 1.4223, + "step": 32493 + }, + { + "epoch": 0.4222447100023877, + "grad_norm": 0.4694053828716278, + "learning_rate": 0.00011557467604205929, + "loss": 1.4558, + "step": 32494 + }, + { + "epoch": 0.4222577045463036, + "grad_norm": 0.4468023478984833, + "learning_rate": 0.00011557207658014793, + "loss": 1.466, + "step": 32495 + }, + { + "epoch": 0.42227069909021947, + "grad_norm": 0.46099603176116943, + "learning_rate": 0.00011556947711823654, + "loss": 1.1997, + "step": 32496 + }, + { + "epoch": 0.42228369363413537, + "grad_norm": 0.4393361508846283, + "learning_rate": 0.00011556687765632515, + "loss": 1.3667, + "step": 32497 + }, + { + "epoch": 0.4222966881780512, + "grad_norm": 0.38493579626083374, + "learning_rate": 0.00011556427819441376, + "loss": 1.4841, + "step": 32498 + }, + { + "epoch": 0.4223096827219671, + "grad_norm": 0.5098912119865417, + "learning_rate": 0.00011556167873250239, + "loss": 1.2183, + "step": 32499 + }, + { + "epoch": 0.42232267726588296, + "grad_norm": 0.30311039090156555, + "learning_rate": 0.000115559079270591, + "loss": 1.3999, + "step": 32500 + }, + { + "epoch": 0.42233567180979886, + "grad_norm": 0.3550584018230438, + "learning_rate": 0.00011555647980867961, + "loss": 1.2319, + "step": 32501 + }, + { + "epoch": 0.4223486663537147, + "grad_norm": 0.31162363290786743, + "learning_rate": 0.00011555388034676822, + "loss": 1.2692, + "step": 32502 + }, + { + "epoch": 0.4223616608976306, + "grad_norm": 0.3799562454223633, + "learning_rate": 0.00011555128088485686, + "loss": 1.3296, + "step": 32503 + }, + { + "epoch": 0.42237465544154645, + "grad_norm": 0.3360872268676758, + "learning_rate": 0.00011554868142294545, + "loss": 1.3262, + "step": 32504 + }, + { + "epoch": 0.42238764998546235, + "grad_norm": 0.3756132423877716, + "learning_rate": 0.00011554608196103406, + "loss": 1.5579, + "step": 32505 + }, + { + "epoch": 0.4224006445293782, + "grad_norm": 0.43924999237060547, + "learning_rate": 0.00011554348249912268, + "loss": 1.292, + "step": 32506 + }, + { + "epoch": 0.4224136390732941, + "grad_norm": 0.5720948576927185, + "learning_rate": 0.00011554088303721131, + "loss": 1.396, + "step": 32507 + }, + { + "epoch": 0.42242663361721, + "grad_norm": 0.38311123847961426, + "learning_rate": 0.00011553828357529992, + "loss": 1.3476, + "step": 32508 + }, + { + "epoch": 0.42243962816112585, + "grad_norm": 0.42853695154190063, + "learning_rate": 0.00011553568411338854, + "loss": 1.3229, + "step": 32509 + }, + { + "epoch": 0.42245262270504175, + "grad_norm": 0.4066378176212311, + "learning_rate": 0.00011553308465147715, + "loss": 1.4389, + "step": 32510 + }, + { + "epoch": 0.4224656172489576, + "grad_norm": 0.4813633859157562, + "learning_rate": 0.00011553048518956577, + "loss": 1.3278, + "step": 32511 + }, + { + "epoch": 0.4224786117928735, + "grad_norm": 0.39550501108169556, + "learning_rate": 0.00011552788572765438, + "loss": 1.2252, + "step": 32512 + }, + { + "epoch": 0.42249160633678934, + "grad_norm": 0.5144707560539246, + "learning_rate": 0.00011552528626574299, + "loss": 1.6007, + "step": 32513 + }, + { + "epoch": 0.42250460088070524, + "grad_norm": 0.23891358077526093, + "learning_rate": 0.0001155226868038316, + "loss": 1.1676, + "step": 32514 + }, + { + "epoch": 0.4225175954246211, + "grad_norm": 0.36541077494621277, + "learning_rate": 0.00011552008734192024, + "loss": 1.523, + "step": 32515 + }, + { + "epoch": 0.422530589968537, + "grad_norm": 0.37614455819129944, + "learning_rate": 0.00011551748788000885, + "loss": 1.5062, + "step": 32516 + }, + { + "epoch": 0.42254358451245283, + "grad_norm": 0.46348482370376587, + "learning_rate": 0.00011551488841809745, + "loss": 1.4108, + "step": 32517 + }, + { + "epoch": 0.42255657905636873, + "grad_norm": 0.5071980953216553, + "learning_rate": 0.00011551228895618606, + "loss": 1.4145, + "step": 32518 + }, + { + "epoch": 0.4225695736002846, + "grad_norm": 0.40767702460289, + "learning_rate": 0.0001155096894942747, + "loss": 1.4985, + "step": 32519 + }, + { + "epoch": 0.4225825681442005, + "grad_norm": 0.42947205901145935, + "learning_rate": 0.00011550709003236331, + "loss": 1.4034, + "step": 32520 + }, + { + "epoch": 0.4225955626881163, + "grad_norm": 0.5178409814834595, + "learning_rate": 0.00011550449057045192, + "loss": 1.5899, + "step": 32521 + }, + { + "epoch": 0.4226085572320322, + "grad_norm": 0.33328777551651, + "learning_rate": 0.00011550189110854053, + "loss": 1.3116, + "step": 32522 + }, + { + "epoch": 0.42262155177594807, + "grad_norm": 0.4110775291919708, + "learning_rate": 0.00011549929164662916, + "loss": 1.5896, + "step": 32523 + }, + { + "epoch": 0.42263454631986397, + "grad_norm": 0.38887089490890503, + "learning_rate": 0.00011549669218471777, + "loss": 1.1956, + "step": 32524 + }, + { + "epoch": 0.4226475408637798, + "grad_norm": 0.4104284644126892, + "learning_rate": 0.00011549409272280638, + "loss": 1.208, + "step": 32525 + }, + { + "epoch": 0.4226605354076957, + "grad_norm": 0.37943828105926514, + "learning_rate": 0.00011549149326089499, + "loss": 1.3752, + "step": 32526 + }, + { + "epoch": 0.42267352995161156, + "grad_norm": 0.3851604461669922, + "learning_rate": 0.00011548889379898363, + "loss": 1.4294, + "step": 32527 + }, + { + "epoch": 0.42268652449552746, + "grad_norm": 0.31594303250312805, + "learning_rate": 0.00011548629433707224, + "loss": 1.3908, + "step": 32528 + }, + { + "epoch": 0.4226995190394433, + "grad_norm": 0.32268786430358887, + "learning_rate": 0.00011548369487516083, + "loss": 1.3349, + "step": 32529 + }, + { + "epoch": 0.4227125135833592, + "grad_norm": 0.412300705909729, + "learning_rate": 0.00011548109541324947, + "loss": 1.4975, + "step": 32530 + }, + { + "epoch": 0.42272550812727505, + "grad_norm": 0.36216339468955994, + "learning_rate": 0.00011547849595133808, + "loss": 1.4701, + "step": 32531 + }, + { + "epoch": 0.42273850267119095, + "grad_norm": 0.38794395327568054, + "learning_rate": 0.0001154758964894267, + "loss": 1.4327, + "step": 32532 + }, + { + "epoch": 0.4227514972151068, + "grad_norm": 0.522821307182312, + "learning_rate": 0.0001154732970275153, + "loss": 1.4971, + "step": 32533 + }, + { + "epoch": 0.4227644917590227, + "grad_norm": 0.4122038185596466, + "learning_rate": 0.00011547069756560393, + "loss": 1.3446, + "step": 32534 + }, + { + "epoch": 0.42277748630293854, + "grad_norm": 0.3933035731315613, + "learning_rate": 0.00011546809810369254, + "loss": 1.4748, + "step": 32535 + }, + { + "epoch": 0.42279048084685444, + "grad_norm": 0.39935502409935, + "learning_rate": 0.00011546549864178115, + "loss": 1.3745, + "step": 32536 + }, + { + "epoch": 0.4228034753907703, + "grad_norm": 0.3199804425239563, + "learning_rate": 0.00011546289917986976, + "loss": 1.1982, + "step": 32537 + }, + { + "epoch": 0.4228164699346862, + "grad_norm": 0.41713863611221313, + "learning_rate": 0.0001154602997179584, + "loss": 1.4462, + "step": 32538 + }, + { + "epoch": 0.42282946447860204, + "grad_norm": 0.4183112680912018, + "learning_rate": 0.00011545770025604701, + "loss": 1.441, + "step": 32539 + }, + { + "epoch": 0.42284245902251794, + "grad_norm": 0.4007284343242645, + "learning_rate": 0.00011545510079413562, + "loss": 1.4168, + "step": 32540 + }, + { + "epoch": 0.4228554535664338, + "grad_norm": 0.3678448796272278, + "learning_rate": 0.00011545250133222423, + "loss": 1.4615, + "step": 32541 + }, + { + "epoch": 0.4228684481103497, + "grad_norm": 0.43753302097320557, + "learning_rate": 0.00011544990187031286, + "loss": 1.2577, + "step": 32542 + }, + { + "epoch": 0.4228814426542655, + "grad_norm": 0.28028228878974915, + "learning_rate": 0.00011544730240840147, + "loss": 1.3049, + "step": 32543 + }, + { + "epoch": 0.4228944371981814, + "grad_norm": 0.43065887689590454, + "learning_rate": 0.00011544470294649008, + "loss": 1.4066, + "step": 32544 + }, + { + "epoch": 0.4229074317420973, + "grad_norm": 0.4156615734100342, + "learning_rate": 0.00011544210348457869, + "loss": 1.5258, + "step": 32545 + }, + { + "epoch": 0.4229204262860132, + "grad_norm": 0.3791862428188324, + "learning_rate": 0.00011543950402266732, + "loss": 1.5816, + "step": 32546 + }, + { + "epoch": 0.422933420829929, + "grad_norm": 0.44544869661331177, + "learning_rate": 0.00011543690456075593, + "loss": 1.3462, + "step": 32547 + }, + { + "epoch": 0.4229464153738449, + "grad_norm": 0.459179550409317, + "learning_rate": 0.00011543430509884454, + "loss": 1.3869, + "step": 32548 + }, + { + "epoch": 0.42295940991776076, + "grad_norm": 0.3827952444553375, + "learning_rate": 0.00011543170563693315, + "loss": 1.4707, + "step": 32549 + }, + { + "epoch": 0.42297240446167667, + "grad_norm": 0.4364517033100128, + "learning_rate": 0.00011542910617502179, + "loss": 1.3031, + "step": 32550 + }, + { + "epoch": 0.4229853990055925, + "grad_norm": 0.3850482106208801, + "learning_rate": 0.0001154265067131104, + "loss": 1.2848, + "step": 32551 + }, + { + "epoch": 0.4229983935495084, + "grad_norm": 0.4495599567890167, + "learning_rate": 0.00011542390725119901, + "loss": 1.491, + "step": 32552 + }, + { + "epoch": 0.42301138809342426, + "grad_norm": 0.3892969787120819, + "learning_rate": 0.00011542130778928762, + "loss": 1.5318, + "step": 32553 + }, + { + "epoch": 0.42302438263734016, + "grad_norm": 0.4029179811477661, + "learning_rate": 0.00011541870832737624, + "loss": 1.4513, + "step": 32554 + }, + { + "epoch": 0.423037377181256, + "grad_norm": 0.40128234028816223, + "learning_rate": 0.00011541610886546485, + "loss": 1.4578, + "step": 32555 + }, + { + "epoch": 0.4230503717251719, + "grad_norm": 0.4413566589355469, + "learning_rate": 0.00011541350940355347, + "loss": 1.3805, + "step": 32556 + }, + { + "epoch": 0.42306336626908775, + "grad_norm": 0.3579275608062744, + "learning_rate": 0.00011541090994164208, + "loss": 1.282, + "step": 32557 + }, + { + "epoch": 0.42307636081300365, + "grad_norm": 0.43506690859794617, + "learning_rate": 0.00011540831047973071, + "loss": 1.4102, + "step": 32558 + }, + { + "epoch": 0.4230893553569195, + "grad_norm": 0.3977290987968445, + "learning_rate": 0.00011540571101781931, + "loss": 1.2649, + "step": 32559 + }, + { + "epoch": 0.4231023499008354, + "grad_norm": 0.3319036066532135, + "learning_rate": 0.00011540311155590792, + "loss": 1.464, + "step": 32560 + }, + { + "epoch": 0.42311534444475124, + "grad_norm": 0.32621195912361145, + "learning_rate": 0.00011540051209399653, + "loss": 1.2574, + "step": 32561 + }, + { + "epoch": 0.42312833898866714, + "grad_norm": 0.5112295746803284, + "learning_rate": 0.00011539791263208517, + "loss": 1.3805, + "step": 32562 + }, + { + "epoch": 0.423141333532583, + "grad_norm": 0.29058554768562317, + "learning_rate": 0.00011539531317017378, + "loss": 1.2061, + "step": 32563 + }, + { + "epoch": 0.4231543280764989, + "grad_norm": 0.3931564688682556, + "learning_rate": 0.0001153927137082624, + "loss": 1.1972, + "step": 32564 + }, + { + "epoch": 0.42316732262041473, + "grad_norm": 0.4237231910228729, + "learning_rate": 0.000115390114246351, + "loss": 1.4078, + "step": 32565 + }, + { + "epoch": 0.42318031716433063, + "grad_norm": 0.45243990421295166, + "learning_rate": 0.00011538751478443963, + "loss": 1.3205, + "step": 32566 + }, + { + "epoch": 0.4231933117082465, + "grad_norm": 0.5141027569770813, + "learning_rate": 0.00011538491532252824, + "loss": 1.5201, + "step": 32567 + }, + { + "epoch": 0.4232063062521624, + "grad_norm": 0.3802530765533447, + "learning_rate": 0.00011538231586061685, + "loss": 1.4894, + "step": 32568 + }, + { + "epoch": 0.4232193007960782, + "grad_norm": 0.43684080243110657, + "learning_rate": 0.00011537971639870549, + "loss": 1.2676, + "step": 32569 + }, + { + "epoch": 0.4232322953399941, + "grad_norm": 0.34049174189567566, + "learning_rate": 0.0001153771169367941, + "loss": 1.2872, + "step": 32570 + }, + { + "epoch": 0.42324528988390997, + "grad_norm": 0.40717872977256775, + "learning_rate": 0.0001153745174748827, + "loss": 1.3117, + "step": 32571 + }, + { + "epoch": 0.42325828442782587, + "grad_norm": 0.4071490466594696, + "learning_rate": 0.00011537191801297131, + "loss": 1.4712, + "step": 32572 + }, + { + "epoch": 0.4232712789717417, + "grad_norm": 0.5588322877883911, + "learning_rate": 0.00011536931855105995, + "loss": 1.501, + "step": 32573 + }, + { + "epoch": 0.4232842735156576, + "grad_norm": 0.47346457839012146, + "learning_rate": 0.00011536671908914856, + "loss": 1.3461, + "step": 32574 + }, + { + "epoch": 0.42329726805957346, + "grad_norm": 0.31851914525032043, + "learning_rate": 0.00011536411962723717, + "loss": 1.2281, + "step": 32575 + }, + { + "epoch": 0.42331026260348936, + "grad_norm": 0.29420894384384155, + "learning_rate": 0.00011536152016532578, + "loss": 1.4417, + "step": 32576 + }, + { + "epoch": 0.4233232571474052, + "grad_norm": 0.4689401686191559, + "learning_rate": 0.0001153589207034144, + "loss": 1.3539, + "step": 32577 + }, + { + "epoch": 0.4233362516913211, + "grad_norm": 0.3141169548034668, + "learning_rate": 0.00011535632124150301, + "loss": 1.2189, + "step": 32578 + }, + { + "epoch": 0.42334924623523695, + "grad_norm": 0.45030298829078674, + "learning_rate": 0.00011535372177959163, + "loss": 1.5321, + "step": 32579 + }, + { + "epoch": 0.42336224077915285, + "grad_norm": 0.35827505588531494, + "learning_rate": 0.00011535112231768024, + "loss": 1.2821, + "step": 32580 + }, + { + "epoch": 0.4233752353230687, + "grad_norm": 0.35243159532546997, + "learning_rate": 0.00011534852285576887, + "loss": 1.2912, + "step": 32581 + }, + { + "epoch": 0.4233882298669846, + "grad_norm": 0.36464205384254456, + "learning_rate": 0.00011534592339385749, + "loss": 1.468, + "step": 32582 + }, + { + "epoch": 0.4234012244109005, + "grad_norm": 0.3930002450942993, + "learning_rate": 0.0001153433239319461, + "loss": 1.4181, + "step": 32583 + }, + { + "epoch": 0.42341421895481635, + "grad_norm": 0.4512020945549011, + "learning_rate": 0.0001153407244700347, + "loss": 1.2813, + "step": 32584 + }, + { + "epoch": 0.42342721349873225, + "grad_norm": 0.2981812059879303, + "learning_rate": 0.00011533812500812333, + "loss": 1.3263, + "step": 32585 + }, + { + "epoch": 0.4234402080426481, + "grad_norm": 0.46833929419517517, + "learning_rate": 0.00011533552554621194, + "loss": 1.4143, + "step": 32586 + }, + { + "epoch": 0.423453202586564, + "grad_norm": 0.3698435425758362, + "learning_rate": 0.00011533292608430055, + "loss": 1.4559, + "step": 32587 + }, + { + "epoch": 0.42346619713047984, + "grad_norm": 0.30801665782928467, + "learning_rate": 0.00011533032662238916, + "loss": 1.2508, + "step": 32588 + }, + { + "epoch": 0.42347919167439574, + "grad_norm": 0.3188161253929138, + "learning_rate": 0.00011532772716047779, + "loss": 1.2101, + "step": 32589 + }, + { + "epoch": 0.4234921862183116, + "grad_norm": 0.40829670429229736, + "learning_rate": 0.0001153251276985664, + "loss": 1.394, + "step": 32590 + }, + { + "epoch": 0.4235051807622275, + "grad_norm": 0.508387565612793, + "learning_rate": 0.00011532252823665501, + "loss": 1.5417, + "step": 32591 + }, + { + "epoch": 0.42351817530614333, + "grad_norm": 0.44753769040107727, + "learning_rate": 0.00011531992877474362, + "loss": 1.3346, + "step": 32592 + }, + { + "epoch": 0.42353116985005923, + "grad_norm": 0.44312670826911926, + "learning_rate": 0.00011531732931283226, + "loss": 1.437, + "step": 32593 + }, + { + "epoch": 0.4235441643939751, + "grad_norm": 0.47309330105781555, + "learning_rate": 0.00011531472985092087, + "loss": 1.485, + "step": 32594 + }, + { + "epoch": 0.423557158937891, + "grad_norm": 0.36506420373916626, + "learning_rate": 0.00011531213038900948, + "loss": 1.4639, + "step": 32595 + }, + { + "epoch": 0.4235701534818068, + "grad_norm": 0.3561936318874359, + "learning_rate": 0.00011530953092709809, + "loss": 1.3578, + "step": 32596 + }, + { + "epoch": 0.4235831480257227, + "grad_norm": 0.44975969195365906, + "learning_rate": 0.00011530693146518672, + "loss": 1.4312, + "step": 32597 + }, + { + "epoch": 0.42359614256963857, + "grad_norm": 0.4023270905017853, + "learning_rate": 0.00011530433200327533, + "loss": 1.3393, + "step": 32598 + }, + { + "epoch": 0.42360913711355447, + "grad_norm": 0.345290869474411, + "learning_rate": 0.00011530173254136394, + "loss": 1.5085, + "step": 32599 + }, + { + "epoch": 0.4236221316574703, + "grad_norm": 0.44778358936309814, + "learning_rate": 0.00011529913307945255, + "loss": 1.5911, + "step": 32600 + }, + { + "epoch": 0.4236351262013862, + "grad_norm": 0.4356503188610077, + "learning_rate": 0.00011529653361754117, + "loss": 1.4426, + "step": 32601 + }, + { + "epoch": 0.42364812074530206, + "grad_norm": 0.4331046938896179, + "learning_rate": 0.00011529393415562979, + "loss": 1.3076, + "step": 32602 + }, + { + "epoch": 0.42366111528921796, + "grad_norm": 0.3995019793510437, + "learning_rate": 0.0001152913346937184, + "loss": 1.4856, + "step": 32603 + }, + { + "epoch": 0.4236741098331338, + "grad_norm": 0.39607229828834534, + "learning_rate": 0.00011528873523180703, + "loss": 1.3929, + "step": 32604 + }, + { + "epoch": 0.4236871043770497, + "grad_norm": 0.3992483913898468, + "learning_rate": 0.00011528613576989565, + "loss": 1.3409, + "step": 32605 + }, + { + "epoch": 0.42370009892096555, + "grad_norm": 0.41545766592025757, + "learning_rate": 0.00011528353630798426, + "loss": 1.512, + "step": 32606 + }, + { + "epoch": 0.42371309346488145, + "grad_norm": 0.43989789485931396, + "learning_rate": 0.00011528093684607287, + "loss": 1.4635, + "step": 32607 + }, + { + "epoch": 0.4237260880087973, + "grad_norm": 0.31132909655570984, + "learning_rate": 0.00011527833738416149, + "loss": 1.1587, + "step": 32608 + }, + { + "epoch": 0.4237390825527132, + "grad_norm": 0.3846941888332367, + "learning_rate": 0.0001152757379222501, + "loss": 1.4719, + "step": 32609 + }, + { + "epoch": 0.42375207709662904, + "grad_norm": 0.2898976504802704, + "learning_rate": 0.00011527313846033871, + "loss": 1.3125, + "step": 32610 + }, + { + "epoch": 0.42376507164054494, + "grad_norm": 0.38989585638046265, + "learning_rate": 0.00011527053899842732, + "loss": 1.3963, + "step": 32611 + }, + { + "epoch": 0.4237780661844608, + "grad_norm": 0.278799444437027, + "learning_rate": 0.00011526793953651596, + "loss": 1.4038, + "step": 32612 + }, + { + "epoch": 0.4237910607283767, + "grad_norm": 0.3773708641529083, + "learning_rate": 0.00011526534007460456, + "loss": 1.1559, + "step": 32613 + }, + { + "epoch": 0.42380405527229253, + "grad_norm": 0.3979058265686035, + "learning_rate": 0.00011526274061269317, + "loss": 1.1841, + "step": 32614 + }, + { + "epoch": 0.42381704981620844, + "grad_norm": 0.464068204164505, + "learning_rate": 0.00011526014115078178, + "loss": 1.5539, + "step": 32615 + }, + { + "epoch": 0.4238300443601243, + "grad_norm": 0.43467697501182556, + "learning_rate": 0.00011525754168887042, + "loss": 1.456, + "step": 32616 + }, + { + "epoch": 0.4238430389040402, + "grad_norm": 0.34923499822616577, + "learning_rate": 0.00011525494222695903, + "loss": 1.2825, + "step": 32617 + }, + { + "epoch": 0.423856033447956, + "grad_norm": 0.32817941904067993, + "learning_rate": 0.00011525234276504764, + "loss": 1.4485, + "step": 32618 + }, + { + "epoch": 0.4238690279918719, + "grad_norm": 0.480048805475235, + "learning_rate": 0.00011524974330313625, + "loss": 1.6184, + "step": 32619 + }, + { + "epoch": 0.4238820225357878, + "grad_norm": 0.4450722336769104, + "learning_rate": 0.00011524714384122488, + "loss": 1.5258, + "step": 32620 + }, + { + "epoch": 0.4238950170797037, + "grad_norm": 0.338461697101593, + "learning_rate": 0.00011524454437931349, + "loss": 1.3314, + "step": 32621 + }, + { + "epoch": 0.4239080116236195, + "grad_norm": 0.4182823598384857, + "learning_rate": 0.0001152419449174021, + "loss": 1.2723, + "step": 32622 + }, + { + "epoch": 0.4239210061675354, + "grad_norm": 0.36841315031051636, + "learning_rate": 0.00011523934545549071, + "loss": 1.3257, + "step": 32623 + }, + { + "epoch": 0.42393400071145126, + "grad_norm": 0.45369064807891846, + "learning_rate": 0.00011523674599357935, + "loss": 1.3305, + "step": 32624 + }, + { + "epoch": 0.42394699525536717, + "grad_norm": 0.5095656514167786, + "learning_rate": 0.00011523414653166796, + "loss": 1.3531, + "step": 32625 + }, + { + "epoch": 0.423959989799283, + "grad_norm": 0.42047765851020813, + "learning_rate": 0.00011523154706975656, + "loss": 1.4227, + "step": 32626 + }, + { + "epoch": 0.4239729843431989, + "grad_norm": 0.46105897426605225, + "learning_rate": 0.00011522894760784517, + "loss": 1.2701, + "step": 32627 + }, + { + "epoch": 0.42398597888711476, + "grad_norm": 0.5402507185935974, + "learning_rate": 0.0001152263481459338, + "loss": 1.326, + "step": 32628 + }, + { + "epoch": 0.42399897343103066, + "grad_norm": 0.4956165850162506, + "learning_rate": 0.00011522374868402242, + "loss": 1.4874, + "step": 32629 + }, + { + "epoch": 0.4240119679749465, + "grad_norm": 0.553966224193573, + "learning_rate": 0.00011522114922211103, + "loss": 1.2835, + "step": 32630 + }, + { + "epoch": 0.4240249625188624, + "grad_norm": 0.4198230504989624, + "learning_rate": 0.00011521854976019964, + "loss": 1.4184, + "step": 32631 + }, + { + "epoch": 0.42403795706277825, + "grad_norm": 0.29994654655456543, + "learning_rate": 0.00011521595029828826, + "loss": 1.2848, + "step": 32632 + }, + { + "epoch": 0.42405095160669415, + "grad_norm": 0.4693107008934021, + "learning_rate": 0.00011521335083637687, + "loss": 1.374, + "step": 32633 + }, + { + "epoch": 0.42406394615061, + "grad_norm": 0.3967727720737457, + "learning_rate": 0.00011521075137446548, + "loss": 1.3083, + "step": 32634 + }, + { + "epoch": 0.4240769406945259, + "grad_norm": 0.40576738119125366, + "learning_rate": 0.0001152081519125541, + "loss": 1.3345, + "step": 32635 + }, + { + "epoch": 0.42408993523844174, + "grad_norm": 0.4012446999549866, + "learning_rate": 0.00011520555245064273, + "loss": 1.2862, + "step": 32636 + }, + { + "epoch": 0.42410292978235764, + "grad_norm": 0.42559412121772766, + "learning_rate": 0.00011520295298873134, + "loss": 1.3528, + "step": 32637 + }, + { + "epoch": 0.4241159243262735, + "grad_norm": 0.43126824498176575, + "learning_rate": 0.00011520035352681996, + "loss": 1.5463, + "step": 32638 + }, + { + "epoch": 0.4241289188701894, + "grad_norm": 0.5899081826210022, + "learning_rate": 0.00011519775406490855, + "loss": 1.3077, + "step": 32639 + }, + { + "epoch": 0.42414191341410523, + "grad_norm": 0.4572855830192566, + "learning_rate": 0.00011519515460299719, + "loss": 1.2637, + "step": 32640 + }, + { + "epoch": 0.42415490795802113, + "grad_norm": 0.5459520220756531, + "learning_rate": 0.0001151925551410858, + "loss": 1.4556, + "step": 32641 + }, + { + "epoch": 0.424167902501937, + "grad_norm": 0.42976656556129456, + "learning_rate": 0.00011518995567917441, + "loss": 1.7099, + "step": 32642 + }, + { + "epoch": 0.4241808970458529, + "grad_norm": 0.4294690787792206, + "learning_rate": 0.00011518735621726304, + "loss": 1.5077, + "step": 32643 + }, + { + "epoch": 0.4241938915897687, + "grad_norm": 0.3748714327812195, + "learning_rate": 0.00011518475675535165, + "loss": 1.3059, + "step": 32644 + }, + { + "epoch": 0.4242068861336846, + "grad_norm": 0.4206548035144806, + "learning_rate": 0.00011518215729344026, + "loss": 1.5961, + "step": 32645 + }, + { + "epoch": 0.42421988067760047, + "grad_norm": 0.3011281192302704, + "learning_rate": 0.00011517955783152887, + "loss": 1.2929, + "step": 32646 + }, + { + "epoch": 0.42423287522151637, + "grad_norm": 0.378834992647171, + "learning_rate": 0.00011517695836961751, + "loss": 1.0831, + "step": 32647 + }, + { + "epoch": 0.4242458697654322, + "grad_norm": 0.3436242938041687, + "learning_rate": 0.00011517435890770612, + "loss": 1.5377, + "step": 32648 + }, + { + "epoch": 0.4242588643093481, + "grad_norm": 0.37045952677726746, + "learning_rate": 0.00011517175944579473, + "loss": 1.4286, + "step": 32649 + }, + { + "epoch": 0.42427185885326396, + "grad_norm": 0.3832129240036011, + "learning_rate": 0.00011516915998388334, + "loss": 1.4825, + "step": 32650 + }, + { + "epoch": 0.42428485339717986, + "grad_norm": 0.321992963552475, + "learning_rate": 0.00011516656052197196, + "loss": 1.4818, + "step": 32651 + }, + { + "epoch": 0.4242978479410957, + "grad_norm": 0.46432238817214966, + "learning_rate": 0.00011516396106006058, + "loss": 1.3464, + "step": 32652 + }, + { + "epoch": 0.4243108424850116, + "grad_norm": 0.47884055972099304, + "learning_rate": 0.00011516136159814919, + "loss": 1.4246, + "step": 32653 + }, + { + "epoch": 0.42432383702892745, + "grad_norm": 0.3578111231327057, + "learning_rate": 0.0001151587621362378, + "loss": 1.3495, + "step": 32654 + }, + { + "epoch": 0.42433683157284335, + "grad_norm": 0.3594319224357605, + "learning_rate": 0.00011515616267432642, + "loss": 1.3246, + "step": 32655 + }, + { + "epoch": 0.4243498261167592, + "grad_norm": 0.41986316442489624, + "learning_rate": 0.00011515356321241503, + "loss": 1.3238, + "step": 32656 + }, + { + "epoch": 0.4243628206606751, + "grad_norm": 0.3603053390979767, + "learning_rate": 0.00011515096375050364, + "loss": 1.4085, + "step": 32657 + }, + { + "epoch": 0.42437581520459094, + "grad_norm": 0.391559898853302, + "learning_rate": 0.00011514836428859226, + "loss": 1.3955, + "step": 32658 + }, + { + "epoch": 0.42438880974850685, + "grad_norm": 0.4199407994747162, + "learning_rate": 0.00011514576482668089, + "loss": 1.1347, + "step": 32659 + }, + { + "epoch": 0.42440180429242275, + "grad_norm": 0.3639289140701294, + "learning_rate": 0.0001151431653647695, + "loss": 1.1818, + "step": 32660 + }, + { + "epoch": 0.4244147988363386, + "grad_norm": 0.35207751393318176, + "learning_rate": 0.00011514056590285811, + "loss": 1.1047, + "step": 32661 + }, + { + "epoch": 0.4244277933802545, + "grad_norm": 0.43022620677948, + "learning_rate": 0.00011513796644094673, + "loss": 1.3284, + "step": 32662 + }, + { + "epoch": 0.42444078792417034, + "grad_norm": 0.3480023145675659, + "learning_rate": 0.00011513536697903535, + "loss": 1.3873, + "step": 32663 + }, + { + "epoch": 0.42445378246808624, + "grad_norm": 0.9453961253166199, + "learning_rate": 0.00011513276751712396, + "loss": 1.2621, + "step": 32664 + }, + { + "epoch": 0.4244667770120021, + "grad_norm": 0.38237282633781433, + "learning_rate": 0.00011513016805521257, + "loss": 1.3627, + "step": 32665 + }, + { + "epoch": 0.424479771555918, + "grad_norm": 0.4273073077201843, + "learning_rate": 0.00011512756859330118, + "loss": 1.3197, + "step": 32666 + }, + { + "epoch": 0.42449276609983383, + "grad_norm": 0.45577436685562134, + "learning_rate": 0.00011512496913138982, + "loss": 1.5395, + "step": 32667 + }, + { + "epoch": 0.42450576064374973, + "grad_norm": 0.35005468130111694, + "learning_rate": 0.00011512236966947842, + "loss": 1.408, + "step": 32668 + }, + { + "epoch": 0.4245187551876656, + "grad_norm": 0.49522069096565247, + "learning_rate": 0.00011511977020756703, + "loss": 1.4457, + "step": 32669 + }, + { + "epoch": 0.4245317497315815, + "grad_norm": 0.3975871205329895, + "learning_rate": 0.00011511717074565564, + "loss": 1.4151, + "step": 32670 + }, + { + "epoch": 0.4245447442754973, + "grad_norm": 0.463381826877594, + "learning_rate": 0.00011511457128374428, + "loss": 1.4582, + "step": 32671 + }, + { + "epoch": 0.4245577388194132, + "grad_norm": 0.4626269042491913, + "learning_rate": 0.00011511197182183289, + "loss": 1.4629, + "step": 32672 + }, + { + "epoch": 0.42457073336332907, + "grad_norm": 0.3940832018852234, + "learning_rate": 0.0001151093723599215, + "loss": 1.3748, + "step": 32673 + }, + { + "epoch": 0.42458372790724497, + "grad_norm": 0.4912465512752533, + "learning_rate": 0.00011510677289801011, + "loss": 1.4908, + "step": 32674 + }, + { + "epoch": 0.4245967224511608, + "grad_norm": 0.3597962260246277, + "learning_rate": 0.00011510417343609874, + "loss": 1.4021, + "step": 32675 + }, + { + "epoch": 0.4246097169950767, + "grad_norm": 0.37891116738319397, + "learning_rate": 0.00011510157397418735, + "loss": 1.3063, + "step": 32676 + }, + { + "epoch": 0.42462271153899256, + "grad_norm": 0.3783966302871704, + "learning_rate": 0.00011509897451227596, + "loss": 1.3974, + "step": 32677 + }, + { + "epoch": 0.42463570608290846, + "grad_norm": 0.41415533423423767, + "learning_rate": 0.0001150963750503646, + "loss": 1.4527, + "step": 32678 + }, + { + "epoch": 0.4246487006268243, + "grad_norm": 0.4733615815639496, + "learning_rate": 0.0001150937755884532, + "loss": 1.4599, + "step": 32679 + }, + { + "epoch": 0.4246616951707402, + "grad_norm": 0.4656376540660858, + "learning_rate": 0.00011509117612654182, + "loss": 1.4003, + "step": 32680 + }, + { + "epoch": 0.42467468971465605, + "grad_norm": 0.3168615698814392, + "learning_rate": 0.00011508857666463041, + "loss": 1.298, + "step": 32681 + }, + { + "epoch": 0.42468768425857195, + "grad_norm": 0.3645986318588257, + "learning_rate": 0.00011508597720271905, + "loss": 1.2577, + "step": 32682 + }, + { + "epoch": 0.4247006788024878, + "grad_norm": 0.2938631772994995, + "learning_rate": 0.00011508337774080766, + "loss": 1.2895, + "step": 32683 + }, + { + "epoch": 0.4247136733464037, + "grad_norm": 0.5293096303939819, + "learning_rate": 0.00011508077827889627, + "loss": 1.627, + "step": 32684 + }, + { + "epoch": 0.42472666789031954, + "grad_norm": 0.4071999490261078, + "learning_rate": 0.00011507817881698489, + "loss": 1.4628, + "step": 32685 + }, + { + "epoch": 0.42473966243423544, + "grad_norm": 0.30241674184799194, + "learning_rate": 0.00011507557935507351, + "loss": 1.4766, + "step": 32686 + }, + { + "epoch": 0.4247526569781513, + "grad_norm": 0.4153682589530945, + "learning_rate": 0.00011507297989316212, + "loss": 1.6435, + "step": 32687 + }, + { + "epoch": 0.4247656515220672, + "grad_norm": 0.42855629324913025, + "learning_rate": 0.00011507038043125073, + "loss": 1.4003, + "step": 32688 + }, + { + "epoch": 0.42477864606598303, + "grad_norm": 0.4692609906196594, + "learning_rate": 0.00011506778096933934, + "loss": 1.2338, + "step": 32689 + }, + { + "epoch": 0.42479164060989894, + "grad_norm": 0.5062618255615234, + "learning_rate": 0.00011506518150742798, + "loss": 1.5371, + "step": 32690 + }, + { + "epoch": 0.4248046351538148, + "grad_norm": 0.4251377582550049, + "learning_rate": 0.00011506258204551659, + "loss": 1.301, + "step": 32691 + }, + { + "epoch": 0.4248176296977307, + "grad_norm": 0.4042539596557617, + "learning_rate": 0.0001150599825836052, + "loss": 1.5439, + "step": 32692 + }, + { + "epoch": 0.4248306242416465, + "grad_norm": 0.37823086977005005, + "learning_rate": 0.0001150573831216938, + "loss": 1.4319, + "step": 32693 + }, + { + "epoch": 0.4248436187855624, + "grad_norm": 0.33654046058654785, + "learning_rate": 0.00011505478365978244, + "loss": 1.3817, + "step": 32694 + }, + { + "epoch": 0.42485661332947827, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.00011505218419787105, + "loss": 1.4325, + "step": 32695 + }, + { + "epoch": 0.4248696078733942, + "grad_norm": 0.3422922194004059, + "learning_rate": 0.00011504958473595966, + "loss": 1.5663, + "step": 32696 + }, + { + "epoch": 0.42488260241731, + "grad_norm": 0.3530375063419342, + "learning_rate": 0.00011504698527404827, + "loss": 1.2248, + "step": 32697 + }, + { + "epoch": 0.4248955969612259, + "grad_norm": 0.32024598121643066, + "learning_rate": 0.0001150443858121369, + "loss": 1.2639, + "step": 32698 + }, + { + "epoch": 0.42490859150514176, + "grad_norm": 0.44534948468208313, + "learning_rate": 0.0001150417863502255, + "loss": 1.4372, + "step": 32699 + }, + { + "epoch": 0.42492158604905766, + "grad_norm": 0.42533090710639954, + "learning_rate": 0.00011503918688831412, + "loss": 1.5056, + "step": 32700 + }, + { + "epoch": 0.4249345805929735, + "grad_norm": 0.4291832149028778, + "learning_rate": 0.00011503658742640273, + "loss": 1.3363, + "step": 32701 + }, + { + "epoch": 0.4249475751368894, + "grad_norm": 0.3901008069515228, + "learning_rate": 0.00011503398796449137, + "loss": 1.573, + "step": 32702 + }, + { + "epoch": 0.42496056968080526, + "grad_norm": 0.4670076072216034, + "learning_rate": 0.00011503138850257998, + "loss": 1.4713, + "step": 32703 + }, + { + "epoch": 0.42497356422472116, + "grad_norm": 0.3456535339355469, + "learning_rate": 0.00011502878904066859, + "loss": 1.451, + "step": 32704 + }, + { + "epoch": 0.424986558768637, + "grad_norm": 0.37462544441223145, + "learning_rate": 0.0001150261895787572, + "loss": 1.3581, + "step": 32705 + }, + { + "epoch": 0.4249995533125529, + "grad_norm": 0.43311840295791626, + "learning_rate": 0.00011502359011684582, + "loss": 1.4809, + "step": 32706 + }, + { + "epoch": 0.42501254785646875, + "grad_norm": 0.4312298595905304, + "learning_rate": 0.00011502099065493443, + "loss": 1.4353, + "step": 32707 + }, + { + "epoch": 0.42502554240038465, + "grad_norm": 0.2707349359989166, + "learning_rate": 0.00011501839119302305, + "loss": 1.4312, + "step": 32708 + }, + { + "epoch": 0.4250385369443005, + "grad_norm": 0.3925691246986389, + "learning_rate": 0.00011501579173111166, + "loss": 1.2244, + "step": 32709 + }, + { + "epoch": 0.4250515314882164, + "grad_norm": 0.4297676384449005, + "learning_rate": 0.00011501319226920028, + "loss": 1.4111, + "step": 32710 + }, + { + "epoch": 0.42506452603213224, + "grad_norm": 0.44752100110054016, + "learning_rate": 0.00011501059280728889, + "loss": 1.5595, + "step": 32711 + }, + { + "epoch": 0.42507752057604814, + "grad_norm": 0.45761632919311523, + "learning_rate": 0.0001150079933453775, + "loss": 1.2665, + "step": 32712 + }, + { + "epoch": 0.425090515119964, + "grad_norm": 0.480825811624527, + "learning_rate": 0.00011500539388346611, + "loss": 1.4883, + "step": 32713 + }, + { + "epoch": 0.4251035096638799, + "grad_norm": 0.35824570059776306, + "learning_rate": 0.00011500279442155475, + "loss": 1.6681, + "step": 32714 + }, + { + "epoch": 0.42511650420779573, + "grad_norm": 0.4294672906398773, + "learning_rate": 0.00011500019495964336, + "loss": 1.4592, + "step": 32715 + }, + { + "epoch": 0.42512949875171163, + "grad_norm": 0.36337533593177795, + "learning_rate": 0.00011499759549773197, + "loss": 1.4034, + "step": 32716 + }, + { + "epoch": 0.4251424932956275, + "grad_norm": 0.2813102900981903, + "learning_rate": 0.0001149949960358206, + "loss": 1.3132, + "step": 32717 + }, + { + "epoch": 0.4251554878395434, + "grad_norm": 0.4075600802898407, + "learning_rate": 0.00011499239657390921, + "loss": 1.4144, + "step": 32718 + }, + { + "epoch": 0.4251684823834592, + "grad_norm": 0.5380195379257202, + "learning_rate": 0.00011498979711199782, + "loss": 1.2932, + "step": 32719 + }, + { + "epoch": 0.4251814769273751, + "grad_norm": 0.4188152253627777, + "learning_rate": 0.00011498719765008643, + "loss": 1.4946, + "step": 32720 + }, + { + "epoch": 0.42519447147129097, + "grad_norm": 0.4549977481365204, + "learning_rate": 0.00011498459818817507, + "loss": 1.5807, + "step": 32721 + }, + { + "epoch": 0.42520746601520687, + "grad_norm": 0.45418059825897217, + "learning_rate": 0.00011498199872626368, + "loss": 1.3901, + "step": 32722 + }, + { + "epoch": 0.4252204605591227, + "grad_norm": 0.4834255278110504, + "learning_rate": 0.00011497939926435228, + "loss": 1.489, + "step": 32723 + }, + { + "epoch": 0.4252334551030386, + "grad_norm": 0.4556974172592163, + "learning_rate": 0.00011497679980244089, + "loss": 1.2928, + "step": 32724 + }, + { + "epoch": 0.42524644964695446, + "grad_norm": 0.3241259753704071, + "learning_rate": 0.00011497420034052953, + "loss": 1.2466, + "step": 32725 + }, + { + "epoch": 0.42525944419087036, + "grad_norm": 0.39881256222724915, + "learning_rate": 0.00011497160087861814, + "loss": 1.0398, + "step": 32726 + }, + { + "epoch": 0.4252724387347862, + "grad_norm": 0.3281596004962921, + "learning_rate": 0.00011496900141670675, + "loss": 1.5064, + "step": 32727 + }, + { + "epoch": 0.4252854332787021, + "grad_norm": 0.47772088646888733, + "learning_rate": 0.00011496640195479536, + "loss": 1.641, + "step": 32728 + }, + { + "epoch": 0.42529842782261795, + "grad_norm": 0.5090879797935486, + "learning_rate": 0.00011496380249288398, + "loss": 1.541, + "step": 32729 + }, + { + "epoch": 0.42531142236653385, + "grad_norm": 0.48298248648643494, + "learning_rate": 0.0001149612030309726, + "loss": 1.3389, + "step": 32730 + }, + { + "epoch": 0.4253244169104497, + "grad_norm": 0.40232449769973755, + "learning_rate": 0.0001149586035690612, + "loss": 1.4262, + "step": 32731 + }, + { + "epoch": 0.4253374114543656, + "grad_norm": 0.42198362946510315, + "learning_rate": 0.00011495600410714982, + "loss": 1.1582, + "step": 32732 + }, + { + "epoch": 0.42535040599828144, + "grad_norm": 0.4367421567440033, + "learning_rate": 0.00011495340464523845, + "loss": 1.4233, + "step": 32733 + }, + { + "epoch": 0.42536340054219735, + "grad_norm": 0.31430256366729736, + "learning_rate": 0.00011495080518332707, + "loss": 1.3616, + "step": 32734 + }, + { + "epoch": 0.42537639508611325, + "grad_norm": 0.3407075107097626, + "learning_rate": 0.00011494820572141566, + "loss": 1.2431, + "step": 32735 + }, + { + "epoch": 0.4253893896300291, + "grad_norm": 0.45105311274528503, + "learning_rate": 0.00011494560625950427, + "loss": 1.4104, + "step": 32736 + }, + { + "epoch": 0.425402384173945, + "grad_norm": 0.46722736954689026, + "learning_rate": 0.00011494300679759291, + "loss": 1.3827, + "step": 32737 + }, + { + "epoch": 0.42541537871786084, + "grad_norm": 0.4503607749938965, + "learning_rate": 0.00011494040733568152, + "loss": 1.4144, + "step": 32738 + }, + { + "epoch": 0.42542837326177674, + "grad_norm": 0.369139701128006, + "learning_rate": 0.00011493780787377013, + "loss": 1.3201, + "step": 32739 + }, + { + "epoch": 0.4254413678056926, + "grad_norm": 0.36658594012260437, + "learning_rate": 0.00011493520841185874, + "loss": 1.2704, + "step": 32740 + }, + { + "epoch": 0.4254543623496085, + "grad_norm": 0.47784295678138733, + "learning_rate": 0.00011493260894994737, + "loss": 1.4383, + "step": 32741 + }, + { + "epoch": 0.42546735689352433, + "grad_norm": 0.3908814787864685, + "learning_rate": 0.00011493000948803598, + "loss": 1.3918, + "step": 32742 + }, + { + "epoch": 0.42548035143744023, + "grad_norm": 0.3372659683227539, + "learning_rate": 0.00011492741002612459, + "loss": 1.3738, + "step": 32743 + }, + { + "epoch": 0.4254933459813561, + "grad_norm": 0.3924589455127716, + "learning_rate": 0.0001149248105642132, + "loss": 1.4096, + "step": 32744 + }, + { + "epoch": 0.425506340525272, + "grad_norm": 0.46133190393447876, + "learning_rate": 0.00011492221110230184, + "loss": 1.4488, + "step": 32745 + }, + { + "epoch": 0.4255193350691878, + "grad_norm": 0.539125382900238, + "learning_rate": 0.00011491961164039045, + "loss": 1.3589, + "step": 32746 + }, + { + "epoch": 0.4255323296131037, + "grad_norm": 0.4047953486442566, + "learning_rate": 0.00011491701217847906, + "loss": 1.4049, + "step": 32747 + }, + { + "epoch": 0.42554532415701957, + "grad_norm": 0.4258427619934082, + "learning_rate": 0.00011491441271656766, + "loss": 1.4506, + "step": 32748 + }, + { + "epoch": 0.42555831870093547, + "grad_norm": 0.384223997592926, + "learning_rate": 0.0001149118132546563, + "loss": 1.414, + "step": 32749 + }, + { + "epoch": 0.4255713132448513, + "grad_norm": 0.34071582555770874, + "learning_rate": 0.00011490921379274491, + "loss": 1.2714, + "step": 32750 + }, + { + "epoch": 0.4255843077887672, + "grad_norm": 0.48124393820762634, + "learning_rate": 0.00011490661433083352, + "loss": 1.3624, + "step": 32751 + }, + { + "epoch": 0.42559730233268306, + "grad_norm": 0.41478610038757324, + "learning_rate": 0.00011490401486892214, + "loss": 1.2993, + "step": 32752 + }, + { + "epoch": 0.42561029687659896, + "grad_norm": 0.4133775532245636, + "learning_rate": 0.00011490141540701075, + "loss": 1.2451, + "step": 32753 + }, + { + "epoch": 0.4256232914205148, + "grad_norm": 0.45846426486968994, + "learning_rate": 0.00011489881594509937, + "loss": 1.3328, + "step": 32754 + }, + { + "epoch": 0.4256362859644307, + "grad_norm": 0.4006989002227783, + "learning_rate": 0.00011489621648318798, + "loss": 1.289, + "step": 32755 + }, + { + "epoch": 0.42564928050834655, + "grad_norm": 0.44254937767982483, + "learning_rate": 0.00011489361702127661, + "loss": 1.604, + "step": 32756 + }, + { + "epoch": 0.42566227505226245, + "grad_norm": 0.354093462228775, + "learning_rate": 0.00011489101755936523, + "loss": 1.4043, + "step": 32757 + }, + { + "epoch": 0.4256752695961783, + "grad_norm": 0.5432113409042358, + "learning_rate": 0.00011488841809745384, + "loss": 1.2742, + "step": 32758 + }, + { + "epoch": 0.4256882641400942, + "grad_norm": 0.486877977848053, + "learning_rate": 0.00011488581863554245, + "loss": 1.5019, + "step": 32759 + }, + { + "epoch": 0.42570125868401004, + "grad_norm": 0.4443177282810211, + "learning_rate": 0.00011488321917363107, + "loss": 1.2686, + "step": 32760 + }, + { + "epoch": 0.42571425322792594, + "grad_norm": 0.4437646269798279, + "learning_rate": 0.00011488061971171968, + "loss": 1.3695, + "step": 32761 + }, + { + "epoch": 0.4257272477718418, + "grad_norm": 0.4943767189979553, + "learning_rate": 0.0001148780202498083, + "loss": 1.4588, + "step": 32762 + }, + { + "epoch": 0.4257402423157577, + "grad_norm": 0.43306395411491394, + "learning_rate": 0.0001148754207878969, + "loss": 1.5101, + "step": 32763 + }, + { + "epoch": 0.42575323685967353, + "grad_norm": 0.45462557673454285, + "learning_rate": 0.00011487282132598554, + "loss": 1.3479, + "step": 32764 + }, + { + "epoch": 0.42576623140358943, + "grad_norm": 0.3237178921699524, + "learning_rate": 0.00011487022186407414, + "loss": 1.4843, + "step": 32765 + }, + { + "epoch": 0.4257792259475053, + "grad_norm": 0.38279351592063904, + "learning_rate": 0.00011486762240216275, + "loss": 1.3532, + "step": 32766 + }, + { + "epoch": 0.4257922204914212, + "grad_norm": 0.40862855315208435, + "learning_rate": 0.00011486502294025136, + "loss": 1.4185, + "step": 32767 + }, + { + "epoch": 0.425805215035337, + "grad_norm": 0.3366811275482178, + "learning_rate": 0.00011486242347834, + "loss": 1.3415, + "step": 32768 + }, + { + "epoch": 0.4258182095792529, + "grad_norm": 0.29549768567085266, + "learning_rate": 0.00011485982401642861, + "loss": 1.319, + "step": 32769 + }, + { + "epoch": 0.42583120412316877, + "grad_norm": 0.42901721596717834, + "learning_rate": 0.00011485722455451722, + "loss": 1.2681, + "step": 32770 + }, + { + "epoch": 0.4258441986670847, + "grad_norm": 0.41053691506385803, + "learning_rate": 0.00011485462509260583, + "loss": 1.4367, + "step": 32771 + }, + { + "epoch": 0.4258571932110005, + "grad_norm": 0.36776745319366455, + "learning_rate": 0.00011485202563069446, + "loss": 1.4827, + "step": 32772 + }, + { + "epoch": 0.4258701877549164, + "grad_norm": 0.42250028252601624, + "learning_rate": 0.00011484942616878307, + "loss": 1.3056, + "step": 32773 + }, + { + "epoch": 0.42588318229883226, + "grad_norm": 0.4061184227466583, + "learning_rate": 0.00011484682670687168, + "loss": 1.3856, + "step": 32774 + }, + { + "epoch": 0.42589617684274816, + "grad_norm": 0.3884889781475067, + "learning_rate": 0.00011484422724496029, + "loss": 1.4417, + "step": 32775 + }, + { + "epoch": 0.425909171386664, + "grad_norm": 0.36790668964385986, + "learning_rate": 0.00011484162778304893, + "loss": 1.2486, + "step": 32776 + }, + { + "epoch": 0.4259221659305799, + "grad_norm": 0.38973650336265564, + "learning_rate": 0.00011483902832113753, + "loss": 1.3887, + "step": 32777 + }, + { + "epoch": 0.42593516047449576, + "grad_norm": 0.48880186676979065, + "learning_rate": 0.00011483642885922614, + "loss": 1.4904, + "step": 32778 + }, + { + "epoch": 0.42594815501841166, + "grad_norm": 0.4057094156742096, + "learning_rate": 0.00011483382939731475, + "loss": 1.4893, + "step": 32779 + }, + { + "epoch": 0.4259611495623275, + "grad_norm": 0.5092507004737854, + "learning_rate": 0.00011483122993540339, + "loss": 1.5405, + "step": 32780 + }, + { + "epoch": 0.4259741441062434, + "grad_norm": 0.37149399518966675, + "learning_rate": 0.000114828630473492, + "loss": 1.2962, + "step": 32781 + }, + { + "epoch": 0.42598713865015925, + "grad_norm": 0.4929821789264679, + "learning_rate": 0.00011482603101158061, + "loss": 1.394, + "step": 32782 + }, + { + "epoch": 0.42600013319407515, + "grad_norm": 0.474163293838501, + "learning_rate": 0.00011482343154966922, + "loss": 1.4778, + "step": 32783 + }, + { + "epoch": 0.426013127737991, + "grad_norm": 0.39708057045936584, + "learning_rate": 0.00011482083208775784, + "loss": 1.4183, + "step": 32784 + }, + { + "epoch": 0.4260261222819069, + "grad_norm": 0.47365882992744446, + "learning_rate": 0.00011481823262584645, + "loss": 1.3563, + "step": 32785 + }, + { + "epoch": 0.42603911682582274, + "grad_norm": 0.44962528347969055, + "learning_rate": 0.00011481563316393506, + "loss": 1.5358, + "step": 32786 + }, + { + "epoch": 0.42605211136973864, + "grad_norm": 0.5008590817451477, + "learning_rate": 0.00011481303370202368, + "loss": 1.4158, + "step": 32787 + }, + { + "epoch": 0.4260651059136545, + "grad_norm": 0.3530007004737854, + "learning_rate": 0.00011481043424011231, + "loss": 1.177, + "step": 32788 + }, + { + "epoch": 0.4260781004575704, + "grad_norm": 0.41533127427101135, + "learning_rate": 0.00011480783477820092, + "loss": 1.3842, + "step": 32789 + }, + { + "epoch": 0.42609109500148623, + "grad_norm": 0.36696693301200867, + "learning_rate": 0.00011480523531628952, + "loss": 1.2127, + "step": 32790 + }, + { + "epoch": 0.42610408954540213, + "grad_norm": 0.4217143952846527, + "learning_rate": 0.00011480263585437816, + "loss": 1.3774, + "step": 32791 + }, + { + "epoch": 0.426117084089318, + "grad_norm": 0.4862287938594818, + "learning_rate": 0.00011480003639246677, + "loss": 1.3672, + "step": 32792 + }, + { + "epoch": 0.4261300786332339, + "grad_norm": 0.3787732422351837, + "learning_rate": 0.00011479743693055538, + "loss": 1.2573, + "step": 32793 + }, + { + "epoch": 0.4261430731771497, + "grad_norm": 0.3426017463207245, + "learning_rate": 0.00011479483746864399, + "loss": 1.3609, + "step": 32794 + }, + { + "epoch": 0.4261560677210656, + "grad_norm": 0.41839492321014404, + "learning_rate": 0.00011479223800673262, + "loss": 1.4041, + "step": 32795 + }, + { + "epoch": 0.42616906226498147, + "grad_norm": 0.35034388303756714, + "learning_rate": 0.00011478963854482123, + "loss": 1.4561, + "step": 32796 + }, + { + "epoch": 0.42618205680889737, + "grad_norm": 0.44032689929008484, + "learning_rate": 0.00011478703908290984, + "loss": 1.5159, + "step": 32797 + }, + { + "epoch": 0.4261950513528132, + "grad_norm": 0.29131001234054565, + "learning_rate": 0.00011478443962099845, + "loss": 1.1066, + "step": 32798 + }, + { + "epoch": 0.4262080458967291, + "grad_norm": 0.39421477913856506, + "learning_rate": 0.00011478184015908709, + "loss": 1.2233, + "step": 32799 + }, + { + "epoch": 0.42622104044064496, + "grad_norm": 0.38550177216529846, + "learning_rate": 0.0001147792406971757, + "loss": 1.3127, + "step": 32800 + }, + { + "epoch": 0.42623403498456086, + "grad_norm": 0.4063124656677246, + "learning_rate": 0.00011477664123526431, + "loss": 1.4169, + "step": 32801 + }, + { + "epoch": 0.4262470295284767, + "grad_norm": 0.3618965744972229, + "learning_rate": 0.00011477404177335292, + "loss": 1.3311, + "step": 32802 + }, + { + "epoch": 0.4262600240723926, + "grad_norm": 0.4103372395038605, + "learning_rate": 0.00011477144231144154, + "loss": 1.3457, + "step": 32803 + }, + { + "epoch": 0.42627301861630845, + "grad_norm": 0.310663104057312, + "learning_rate": 0.00011476884284953016, + "loss": 1.1693, + "step": 32804 + }, + { + "epoch": 0.42628601316022435, + "grad_norm": 0.39910319447517395, + "learning_rate": 0.00011476624338761877, + "loss": 1.4011, + "step": 32805 + }, + { + "epoch": 0.4262990077041402, + "grad_norm": 0.4499560296535492, + "learning_rate": 0.00011476364392570738, + "loss": 1.2533, + "step": 32806 + }, + { + "epoch": 0.4263120022480561, + "grad_norm": 0.33073529601097107, + "learning_rate": 0.000114761044463796, + "loss": 1.3588, + "step": 32807 + }, + { + "epoch": 0.42632499679197194, + "grad_norm": 0.31497201323509216, + "learning_rate": 0.00011475844500188461, + "loss": 1.2694, + "step": 32808 + }, + { + "epoch": 0.42633799133588784, + "grad_norm": 0.3413882851600647, + "learning_rate": 0.00011475584553997322, + "loss": 1.2689, + "step": 32809 + }, + { + "epoch": 0.4263509858798037, + "grad_norm": 0.4022754430770874, + "learning_rate": 0.00011475324607806183, + "loss": 1.2566, + "step": 32810 + }, + { + "epoch": 0.4263639804237196, + "grad_norm": 0.42208191752433777, + "learning_rate": 0.00011475064661615047, + "loss": 1.2734, + "step": 32811 + }, + { + "epoch": 0.4263769749676355, + "grad_norm": 0.4367223381996155, + "learning_rate": 0.00011474804715423908, + "loss": 1.3305, + "step": 32812 + }, + { + "epoch": 0.42638996951155134, + "grad_norm": 0.37246769666671753, + "learning_rate": 0.0001147454476923277, + "loss": 1.403, + "step": 32813 + }, + { + "epoch": 0.42640296405546724, + "grad_norm": 0.6029079556465149, + "learning_rate": 0.0001147428482304163, + "loss": 1.6034, + "step": 32814 + }, + { + "epoch": 0.4264159585993831, + "grad_norm": 0.44445306062698364, + "learning_rate": 0.00011474024876850493, + "loss": 1.3958, + "step": 32815 + }, + { + "epoch": 0.426428953143299, + "grad_norm": 0.3457375168800354, + "learning_rate": 0.00011473764930659354, + "loss": 1.1933, + "step": 32816 + }, + { + "epoch": 0.42644194768721483, + "grad_norm": 0.472170889377594, + "learning_rate": 0.00011473504984468215, + "loss": 1.6479, + "step": 32817 + }, + { + "epoch": 0.42645494223113073, + "grad_norm": 0.4075019061565399, + "learning_rate": 0.00011473245038277076, + "loss": 1.2688, + "step": 32818 + }, + { + "epoch": 0.4264679367750466, + "grad_norm": 0.3741489052772522, + "learning_rate": 0.00011472985092085939, + "loss": 1.4706, + "step": 32819 + }, + { + "epoch": 0.4264809313189625, + "grad_norm": 0.4712775647640228, + "learning_rate": 0.000114727251458948, + "loss": 1.4307, + "step": 32820 + }, + { + "epoch": 0.4264939258628783, + "grad_norm": 0.4487972855567932, + "learning_rate": 0.00011472465199703661, + "loss": 1.2472, + "step": 32821 + }, + { + "epoch": 0.4265069204067942, + "grad_norm": 0.39073678851127625, + "learning_rate": 0.00011472205253512522, + "loss": 1.0702, + "step": 32822 + }, + { + "epoch": 0.42651991495071007, + "grad_norm": 0.4891730844974518, + "learning_rate": 0.00011471945307321386, + "loss": 1.3841, + "step": 32823 + }, + { + "epoch": 0.42653290949462597, + "grad_norm": 0.39912983775138855, + "learning_rate": 0.00011471685361130247, + "loss": 1.2607, + "step": 32824 + }, + { + "epoch": 0.4265459040385418, + "grad_norm": 0.3668476343154907, + "learning_rate": 0.00011471425414939108, + "loss": 1.3151, + "step": 32825 + }, + { + "epoch": 0.4265588985824577, + "grad_norm": 0.47021716833114624, + "learning_rate": 0.0001147116546874797, + "loss": 1.3251, + "step": 32826 + }, + { + "epoch": 0.42657189312637356, + "grad_norm": 0.5033183693885803, + "learning_rate": 0.00011470905522556832, + "loss": 1.4136, + "step": 32827 + }, + { + "epoch": 0.42658488767028946, + "grad_norm": 0.43883442878723145, + "learning_rate": 0.00011470645576365693, + "loss": 1.3018, + "step": 32828 + }, + { + "epoch": 0.4265978822142053, + "grad_norm": 0.3371163606643677, + "learning_rate": 0.00011470385630174554, + "loss": 1.4162, + "step": 32829 + }, + { + "epoch": 0.4266108767581212, + "grad_norm": 0.4342816472053528, + "learning_rate": 0.00011470125683983418, + "loss": 1.4202, + "step": 32830 + }, + { + "epoch": 0.42662387130203705, + "grad_norm": 0.3728175759315491, + "learning_rate": 0.00011469865737792279, + "loss": 1.3711, + "step": 32831 + }, + { + "epoch": 0.42663686584595295, + "grad_norm": 0.3101612627506256, + "learning_rate": 0.00011469605791601138, + "loss": 1.5332, + "step": 32832 + }, + { + "epoch": 0.4266498603898688, + "grad_norm": 0.38016143441200256, + "learning_rate": 0.0001146934584541, + "loss": 1.6912, + "step": 32833 + }, + { + "epoch": 0.4266628549337847, + "grad_norm": 0.348294734954834, + "learning_rate": 0.00011469085899218863, + "loss": 1.3105, + "step": 32834 + }, + { + "epoch": 0.42667584947770054, + "grad_norm": 0.4668010175228119, + "learning_rate": 0.00011468825953027724, + "loss": 1.5294, + "step": 32835 + }, + { + "epoch": 0.42668884402161644, + "grad_norm": 0.4506417214870453, + "learning_rate": 0.00011468566006836585, + "loss": 1.5579, + "step": 32836 + }, + { + "epoch": 0.4267018385655323, + "grad_norm": 0.4124908149242401, + "learning_rate": 0.00011468306060645447, + "loss": 1.2575, + "step": 32837 + }, + { + "epoch": 0.4267148331094482, + "grad_norm": 0.3964749574661255, + "learning_rate": 0.00011468046114454309, + "loss": 1.3125, + "step": 32838 + }, + { + "epoch": 0.42672782765336403, + "grad_norm": 0.46739378571510315, + "learning_rate": 0.0001146778616826317, + "loss": 1.5113, + "step": 32839 + }, + { + "epoch": 0.42674082219727993, + "grad_norm": 0.38561102747917175, + "learning_rate": 0.00011467526222072031, + "loss": 1.3133, + "step": 32840 + }, + { + "epoch": 0.4267538167411958, + "grad_norm": 0.4189145863056183, + "learning_rate": 0.00011467266275880892, + "loss": 1.3998, + "step": 32841 + }, + { + "epoch": 0.4267668112851117, + "grad_norm": 0.393160879611969, + "learning_rate": 0.00011467006329689756, + "loss": 1.3429, + "step": 32842 + }, + { + "epoch": 0.4267798058290275, + "grad_norm": 0.4541653096675873, + "learning_rate": 0.00011466746383498617, + "loss": 1.5587, + "step": 32843 + }, + { + "epoch": 0.4267928003729434, + "grad_norm": 0.4426872432231903, + "learning_rate": 0.00011466486437307478, + "loss": 1.3673, + "step": 32844 + }, + { + "epoch": 0.42680579491685927, + "grad_norm": 0.4444718658924103, + "learning_rate": 0.00011466226491116338, + "loss": 1.5381, + "step": 32845 + }, + { + "epoch": 0.42681878946077517, + "grad_norm": 0.3532416820526123, + "learning_rate": 0.00011465966544925202, + "loss": 1.2414, + "step": 32846 + }, + { + "epoch": 0.426831784004691, + "grad_norm": 0.33352282643318176, + "learning_rate": 0.00011465706598734063, + "loss": 1.5366, + "step": 32847 + }, + { + "epoch": 0.4268447785486069, + "grad_norm": 0.390583336353302, + "learning_rate": 0.00011465446652542924, + "loss": 1.2317, + "step": 32848 + }, + { + "epoch": 0.42685777309252276, + "grad_norm": 0.31314635276794434, + "learning_rate": 0.00011465186706351785, + "loss": 1.2794, + "step": 32849 + }, + { + "epoch": 0.42687076763643866, + "grad_norm": 0.3666491210460663, + "learning_rate": 0.00011464926760160648, + "loss": 1.3526, + "step": 32850 + }, + { + "epoch": 0.4268837621803545, + "grad_norm": 0.4602649211883545, + "learning_rate": 0.00011464666813969509, + "loss": 1.4471, + "step": 32851 + }, + { + "epoch": 0.4268967567242704, + "grad_norm": 0.339557409286499, + "learning_rate": 0.0001146440686777837, + "loss": 1.3946, + "step": 32852 + }, + { + "epoch": 0.42690975126818625, + "grad_norm": 0.3528422713279724, + "learning_rate": 0.00011464146921587231, + "loss": 1.1821, + "step": 32853 + }, + { + "epoch": 0.42692274581210216, + "grad_norm": 0.4109194874763489, + "learning_rate": 0.00011463886975396095, + "loss": 1.2582, + "step": 32854 + }, + { + "epoch": 0.426935740356018, + "grad_norm": 0.4166772663593292, + "learning_rate": 0.00011463627029204956, + "loss": 1.3765, + "step": 32855 + }, + { + "epoch": 0.4269487348999339, + "grad_norm": 0.43684664368629456, + "learning_rate": 0.00011463367083013817, + "loss": 1.5558, + "step": 32856 + }, + { + "epoch": 0.42696172944384975, + "grad_norm": 0.4646078050136566, + "learning_rate": 0.00011463107136822677, + "loss": 1.4777, + "step": 32857 + }, + { + "epoch": 0.42697472398776565, + "grad_norm": 0.36714062094688416, + "learning_rate": 0.0001146284719063154, + "loss": 1.3341, + "step": 32858 + }, + { + "epoch": 0.4269877185316815, + "grad_norm": 0.4614235758781433, + "learning_rate": 0.00011462587244440401, + "loss": 1.3862, + "step": 32859 + }, + { + "epoch": 0.4270007130755974, + "grad_norm": 0.3928585350513458, + "learning_rate": 0.00011462327298249263, + "loss": 1.3795, + "step": 32860 + }, + { + "epoch": 0.42701370761951324, + "grad_norm": 0.28215593099594116, + "learning_rate": 0.00011462067352058124, + "loss": 1.3317, + "step": 32861 + }, + { + "epoch": 0.42702670216342914, + "grad_norm": 0.3727930188179016, + "learning_rate": 0.00011461807405866986, + "loss": 1.4699, + "step": 32862 + }, + { + "epoch": 0.427039696707345, + "grad_norm": 0.40743106603622437, + "learning_rate": 0.00011461547459675847, + "loss": 1.5454, + "step": 32863 + }, + { + "epoch": 0.4270526912512609, + "grad_norm": 0.3678722679615021, + "learning_rate": 0.00011461287513484708, + "loss": 1.481, + "step": 32864 + }, + { + "epoch": 0.42706568579517673, + "grad_norm": 0.4831506311893463, + "learning_rate": 0.00011461027567293572, + "loss": 1.4107, + "step": 32865 + }, + { + "epoch": 0.42707868033909263, + "grad_norm": 0.5250071287155151, + "learning_rate": 0.00011460767621102433, + "loss": 1.5032, + "step": 32866 + }, + { + "epoch": 0.4270916748830085, + "grad_norm": 0.38608986139297485, + "learning_rate": 0.00011460507674911294, + "loss": 1.4034, + "step": 32867 + }, + { + "epoch": 0.4271046694269244, + "grad_norm": 0.41576865315437317, + "learning_rate": 0.00011460247728720155, + "loss": 1.358, + "step": 32868 + }, + { + "epoch": 0.4271176639708402, + "grad_norm": 0.42396080493927, + "learning_rate": 0.00011459987782529018, + "loss": 1.3935, + "step": 32869 + }, + { + "epoch": 0.4271306585147561, + "grad_norm": 0.35341450572013855, + "learning_rate": 0.00011459727836337879, + "loss": 1.5241, + "step": 32870 + }, + { + "epoch": 0.42714365305867197, + "grad_norm": 0.2943039536476135, + "learning_rate": 0.0001145946789014674, + "loss": 1.4861, + "step": 32871 + }, + { + "epoch": 0.42715664760258787, + "grad_norm": 0.40003466606140137, + "learning_rate": 0.00011459207943955601, + "loss": 1.548, + "step": 32872 + }, + { + "epoch": 0.4271696421465037, + "grad_norm": 0.38376590609550476, + "learning_rate": 0.00011458947997764465, + "loss": 1.2353, + "step": 32873 + }, + { + "epoch": 0.4271826366904196, + "grad_norm": 0.36843591928482056, + "learning_rate": 0.00011458688051573325, + "loss": 1.2981, + "step": 32874 + }, + { + "epoch": 0.42719563123433546, + "grad_norm": 0.3761000335216522, + "learning_rate": 0.00011458428105382186, + "loss": 1.3807, + "step": 32875 + }, + { + "epoch": 0.42720862577825136, + "grad_norm": 0.396903395652771, + "learning_rate": 0.00011458168159191047, + "loss": 1.3137, + "step": 32876 + }, + { + "epoch": 0.4272216203221672, + "grad_norm": 0.41537466645240784, + "learning_rate": 0.0001145790821299991, + "loss": 1.6739, + "step": 32877 + }, + { + "epoch": 0.4272346148660831, + "grad_norm": 0.48561975359916687, + "learning_rate": 0.00011457648266808772, + "loss": 1.5377, + "step": 32878 + }, + { + "epoch": 0.42724760940999895, + "grad_norm": 0.41702041029930115, + "learning_rate": 0.00011457388320617633, + "loss": 1.476, + "step": 32879 + }, + { + "epoch": 0.42726060395391485, + "grad_norm": 0.353404700756073, + "learning_rate": 0.00011457128374426494, + "loss": 1.2124, + "step": 32880 + }, + { + "epoch": 0.4272735984978307, + "grad_norm": 0.43179455399513245, + "learning_rate": 0.00011456868428235356, + "loss": 1.5913, + "step": 32881 + }, + { + "epoch": 0.4272865930417466, + "grad_norm": 0.44365885853767395, + "learning_rate": 0.00011456608482044217, + "loss": 1.479, + "step": 32882 + }, + { + "epoch": 0.42729958758566244, + "grad_norm": 0.5209981799125671, + "learning_rate": 0.00011456348535853079, + "loss": 1.6482, + "step": 32883 + }, + { + "epoch": 0.42731258212957834, + "grad_norm": 0.4402356743812561, + "learning_rate": 0.0001145608858966194, + "loss": 1.2782, + "step": 32884 + }, + { + "epoch": 0.4273255766734942, + "grad_norm": 0.4412650763988495, + "learning_rate": 0.00011455828643470803, + "loss": 1.1813, + "step": 32885 + }, + { + "epoch": 0.4273385712174101, + "grad_norm": 0.37789034843444824, + "learning_rate": 0.00011455568697279665, + "loss": 1.3956, + "step": 32886 + }, + { + "epoch": 0.42735156576132594, + "grad_norm": 0.3888697624206543, + "learning_rate": 0.00011455308751088524, + "loss": 1.3653, + "step": 32887 + }, + { + "epoch": 0.42736456030524184, + "grad_norm": 0.384438693523407, + "learning_rate": 0.00011455048804897385, + "loss": 1.2726, + "step": 32888 + }, + { + "epoch": 0.42737755484915774, + "grad_norm": 0.45589837431907654, + "learning_rate": 0.00011454788858706249, + "loss": 1.2597, + "step": 32889 + }, + { + "epoch": 0.4273905493930736, + "grad_norm": 0.3793381154537201, + "learning_rate": 0.0001145452891251511, + "loss": 1.5491, + "step": 32890 + }, + { + "epoch": 0.4274035439369895, + "grad_norm": 0.5033237338066101, + "learning_rate": 0.00011454268966323971, + "loss": 1.5788, + "step": 32891 + }, + { + "epoch": 0.42741653848090533, + "grad_norm": 0.40526923537254333, + "learning_rate": 0.00011454009020132832, + "loss": 1.368, + "step": 32892 + }, + { + "epoch": 0.42742953302482123, + "grad_norm": 0.4253199100494385, + "learning_rate": 0.00011453749073941695, + "loss": 1.3319, + "step": 32893 + }, + { + "epoch": 0.4274425275687371, + "grad_norm": 0.3368600308895111, + "learning_rate": 0.00011453489127750556, + "loss": 1.3149, + "step": 32894 + }, + { + "epoch": 0.427455522112653, + "grad_norm": 0.46693888306617737, + "learning_rate": 0.00011453229181559417, + "loss": 1.3481, + "step": 32895 + }, + { + "epoch": 0.4274685166565688, + "grad_norm": 0.4167735278606415, + "learning_rate": 0.00011452969235368278, + "loss": 1.3895, + "step": 32896 + }, + { + "epoch": 0.4274815112004847, + "grad_norm": 0.4482892155647278, + "learning_rate": 0.00011452709289177142, + "loss": 1.4057, + "step": 32897 + }, + { + "epoch": 0.42749450574440057, + "grad_norm": 0.4834119379520416, + "learning_rate": 0.00011452449342986003, + "loss": 1.3401, + "step": 32898 + }, + { + "epoch": 0.42750750028831647, + "grad_norm": 0.326749712228775, + "learning_rate": 0.00011452189396794863, + "loss": 1.3199, + "step": 32899 + }, + { + "epoch": 0.4275204948322323, + "grad_norm": 0.4899827539920807, + "learning_rate": 0.00011451929450603727, + "loss": 1.4786, + "step": 32900 + }, + { + "epoch": 0.4275334893761482, + "grad_norm": 0.5009106397628784, + "learning_rate": 0.00011451669504412588, + "loss": 1.5277, + "step": 32901 + }, + { + "epoch": 0.42754648392006406, + "grad_norm": 0.31033840775489807, + "learning_rate": 0.00011451409558221449, + "loss": 1.512, + "step": 32902 + }, + { + "epoch": 0.42755947846397996, + "grad_norm": 0.4763345718383789, + "learning_rate": 0.0001145114961203031, + "loss": 1.4909, + "step": 32903 + }, + { + "epoch": 0.4275724730078958, + "grad_norm": 0.3603958189487457, + "learning_rate": 0.00011450889665839172, + "loss": 1.413, + "step": 32904 + }, + { + "epoch": 0.4275854675518117, + "grad_norm": 0.312365859746933, + "learning_rate": 0.00011450629719648033, + "loss": 1.2864, + "step": 32905 + }, + { + "epoch": 0.42759846209572755, + "grad_norm": 0.4907492995262146, + "learning_rate": 0.00011450369773456895, + "loss": 1.5186, + "step": 32906 + }, + { + "epoch": 0.42761145663964345, + "grad_norm": 0.34510377049446106, + "learning_rate": 0.00011450109827265756, + "loss": 1.2269, + "step": 32907 + }, + { + "epoch": 0.4276244511835593, + "grad_norm": 0.4158482849597931, + "learning_rate": 0.0001144984988107462, + "loss": 1.3944, + "step": 32908 + }, + { + "epoch": 0.4276374457274752, + "grad_norm": 0.48456472158432007, + "learning_rate": 0.0001144958993488348, + "loss": 1.5591, + "step": 32909 + }, + { + "epoch": 0.42765044027139104, + "grad_norm": 0.34927916526794434, + "learning_rate": 0.00011449329988692342, + "loss": 1.332, + "step": 32910 + }, + { + "epoch": 0.42766343481530694, + "grad_norm": 0.4181821346282959, + "learning_rate": 0.00011449070042501203, + "loss": 1.2098, + "step": 32911 + }, + { + "epoch": 0.4276764293592228, + "grad_norm": 0.4651978611946106, + "learning_rate": 0.00011448810096310065, + "loss": 1.5268, + "step": 32912 + }, + { + "epoch": 0.4276894239031387, + "grad_norm": 0.3932968080043793, + "learning_rate": 0.00011448550150118926, + "loss": 1.2987, + "step": 32913 + }, + { + "epoch": 0.42770241844705453, + "grad_norm": 0.44433948397636414, + "learning_rate": 0.00011448290203927787, + "loss": 1.294, + "step": 32914 + }, + { + "epoch": 0.42771541299097043, + "grad_norm": 0.31703805923461914, + "learning_rate": 0.00011448030257736648, + "loss": 1.1767, + "step": 32915 + }, + { + "epoch": 0.4277284075348863, + "grad_norm": 0.32806262373924255, + "learning_rate": 0.00011447770311545511, + "loss": 1.3066, + "step": 32916 + }, + { + "epoch": 0.4277414020788022, + "grad_norm": 0.3266885578632355, + "learning_rate": 0.00011447510365354372, + "loss": 1.1738, + "step": 32917 + }, + { + "epoch": 0.427754396622718, + "grad_norm": 0.41356852650642395, + "learning_rate": 0.00011447250419163233, + "loss": 1.3799, + "step": 32918 + }, + { + "epoch": 0.4277673911666339, + "grad_norm": 0.4480923116207123, + "learning_rate": 0.00011446990472972094, + "loss": 1.4505, + "step": 32919 + }, + { + "epoch": 0.42778038571054977, + "grad_norm": 0.41598185896873474, + "learning_rate": 0.00011446730526780958, + "loss": 1.4908, + "step": 32920 + }, + { + "epoch": 0.42779338025446567, + "grad_norm": 0.5049240589141846, + "learning_rate": 0.00011446470580589819, + "loss": 1.3075, + "step": 32921 + }, + { + "epoch": 0.4278063747983815, + "grad_norm": 0.4384255111217499, + "learning_rate": 0.0001144621063439868, + "loss": 1.2741, + "step": 32922 + }, + { + "epoch": 0.4278193693422974, + "grad_norm": 0.3281305730342865, + "learning_rate": 0.00011445950688207541, + "loss": 1.3135, + "step": 32923 + }, + { + "epoch": 0.42783236388621326, + "grad_norm": 0.498891144990921, + "learning_rate": 0.00011445690742016404, + "loss": 1.5401, + "step": 32924 + }, + { + "epoch": 0.42784535843012916, + "grad_norm": 0.4500311315059662, + "learning_rate": 0.00011445430795825265, + "loss": 1.3548, + "step": 32925 + }, + { + "epoch": 0.427858352974045, + "grad_norm": 0.39651259779930115, + "learning_rate": 0.00011445170849634126, + "loss": 1.3717, + "step": 32926 + }, + { + "epoch": 0.4278713475179609, + "grad_norm": 0.3072621822357178, + "learning_rate": 0.00011444910903442987, + "loss": 1.3399, + "step": 32927 + }, + { + "epoch": 0.42788434206187675, + "grad_norm": 0.5017160177230835, + "learning_rate": 0.00011444650957251851, + "loss": 1.3851, + "step": 32928 + }, + { + "epoch": 0.42789733660579266, + "grad_norm": 0.3628110885620117, + "learning_rate": 0.0001144439101106071, + "loss": 1.4355, + "step": 32929 + }, + { + "epoch": 0.4279103311497085, + "grad_norm": 0.38326093554496765, + "learning_rate": 0.00011444131064869572, + "loss": 1.6049, + "step": 32930 + }, + { + "epoch": 0.4279233256936244, + "grad_norm": 0.33080607652664185, + "learning_rate": 0.00011443871118678433, + "loss": 1.2293, + "step": 32931 + }, + { + "epoch": 0.42793632023754025, + "grad_norm": 0.39626824855804443, + "learning_rate": 0.00011443611172487296, + "loss": 1.3841, + "step": 32932 + }, + { + "epoch": 0.42794931478145615, + "grad_norm": 0.4182763695716858, + "learning_rate": 0.00011443351226296158, + "loss": 1.3319, + "step": 32933 + }, + { + "epoch": 0.427962309325372, + "grad_norm": 0.39371421933174133, + "learning_rate": 0.00011443091280105019, + "loss": 1.279, + "step": 32934 + }, + { + "epoch": 0.4279753038692879, + "grad_norm": 0.4124096632003784, + "learning_rate": 0.0001144283133391388, + "loss": 1.2485, + "step": 32935 + }, + { + "epoch": 0.42798829841320374, + "grad_norm": 0.4605541527271271, + "learning_rate": 0.00011442571387722742, + "loss": 1.4292, + "step": 32936 + }, + { + "epoch": 0.42800129295711964, + "grad_norm": 0.3719262182712555, + "learning_rate": 0.00011442311441531603, + "loss": 1.3938, + "step": 32937 + }, + { + "epoch": 0.4280142875010355, + "grad_norm": 0.45827800035476685, + "learning_rate": 0.00011442051495340464, + "loss": 1.4153, + "step": 32938 + }, + { + "epoch": 0.4280272820449514, + "grad_norm": 0.38764849305152893, + "learning_rate": 0.00011441791549149328, + "loss": 1.2289, + "step": 32939 + }, + { + "epoch": 0.42804027658886723, + "grad_norm": 0.561897873878479, + "learning_rate": 0.00011441531602958189, + "loss": 1.417, + "step": 32940 + }, + { + "epoch": 0.42805327113278313, + "grad_norm": 0.5016710758209229, + "learning_rate": 0.00011441271656767049, + "loss": 1.5871, + "step": 32941 + }, + { + "epoch": 0.428066265676699, + "grad_norm": 0.3721367418766022, + "learning_rate": 0.0001144101171057591, + "loss": 1.5916, + "step": 32942 + }, + { + "epoch": 0.4280792602206149, + "grad_norm": 0.35788917541503906, + "learning_rate": 0.00011440751764384774, + "loss": 1.2411, + "step": 32943 + }, + { + "epoch": 0.4280922547645307, + "grad_norm": 0.3961165249347687, + "learning_rate": 0.00011440491818193635, + "loss": 1.3592, + "step": 32944 + }, + { + "epoch": 0.4281052493084466, + "grad_norm": 0.3320601284503937, + "learning_rate": 0.00011440231872002496, + "loss": 1.2992, + "step": 32945 + }, + { + "epoch": 0.42811824385236247, + "grad_norm": 0.43274441361427307, + "learning_rate": 0.00011439971925811357, + "loss": 1.455, + "step": 32946 + }, + { + "epoch": 0.42813123839627837, + "grad_norm": 0.3733232021331787, + "learning_rate": 0.0001143971197962022, + "loss": 1.4835, + "step": 32947 + }, + { + "epoch": 0.4281442329401942, + "grad_norm": 0.39285239577293396, + "learning_rate": 0.00011439452033429081, + "loss": 1.2906, + "step": 32948 + }, + { + "epoch": 0.4281572274841101, + "grad_norm": 0.36075690388679504, + "learning_rate": 0.00011439192087237942, + "loss": 1.4588, + "step": 32949 + }, + { + "epoch": 0.42817022202802596, + "grad_norm": 0.37205448746681213, + "learning_rate": 0.00011438932141046803, + "loss": 1.4358, + "step": 32950 + }, + { + "epoch": 0.42818321657194186, + "grad_norm": 0.3537094295024872, + "learning_rate": 0.00011438672194855667, + "loss": 1.4761, + "step": 32951 + }, + { + "epoch": 0.4281962111158577, + "grad_norm": 0.42746567726135254, + "learning_rate": 0.00011438412248664528, + "loss": 1.2044, + "step": 32952 + }, + { + "epoch": 0.4282092056597736, + "grad_norm": 0.35676309466362, + "learning_rate": 0.00011438152302473389, + "loss": 1.3542, + "step": 32953 + }, + { + "epoch": 0.42822220020368945, + "grad_norm": 0.470663845539093, + "learning_rate": 0.00011437892356282249, + "loss": 1.4246, + "step": 32954 + }, + { + "epoch": 0.42823519474760535, + "grad_norm": 0.41334274411201477, + "learning_rate": 0.00011437632410091112, + "loss": 1.5076, + "step": 32955 + }, + { + "epoch": 0.4282481892915212, + "grad_norm": 0.40773844718933105, + "learning_rate": 0.00011437372463899974, + "loss": 1.4476, + "step": 32956 + }, + { + "epoch": 0.4282611838354371, + "grad_norm": 0.3873310685157776, + "learning_rate": 0.00011437112517708835, + "loss": 1.5608, + "step": 32957 + }, + { + "epoch": 0.42827417837935294, + "grad_norm": 0.4539563059806824, + "learning_rate": 0.00011436852571517696, + "loss": 1.3616, + "step": 32958 + }, + { + "epoch": 0.42828717292326884, + "grad_norm": 0.3360161781311035, + "learning_rate": 0.00011436592625326558, + "loss": 1.4219, + "step": 32959 + }, + { + "epoch": 0.4283001674671847, + "grad_norm": 0.38950201869010925, + "learning_rate": 0.00011436332679135419, + "loss": 1.341, + "step": 32960 + }, + { + "epoch": 0.4283131620111006, + "grad_norm": 0.4776470363140106, + "learning_rate": 0.0001143607273294428, + "loss": 1.3799, + "step": 32961 + }, + { + "epoch": 0.42832615655501644, + "grad_norm": 0.35883405804634094, + "learning_rate": 0.00011435812786753141, + "loss": 1.2749, + "step": 32962 + }, + { + "epoch": 0.42833915109893234, + "grad_norm": 0.4309590458869934, + "learning_rate": 0.00011435552840562005, + "loss": 1.4162, + "step": 32963 + }, + { + "epoch": 0.42835214564284824, + "grad_norm": 0.37539729475975037, + "learning_rate": 0.00011435292894370866, + "loss": 1.3487, + "step": 32964 + }, + { + "epoch": 0.4283651401867641, + "grad_norm": 0.42053520679473877, + "learning_rate": 0.00011435032948179727, + "loss": 1.5003, + "step": 32965 + }, + { + "epoch": 0.42837813473068, + "grad_norm": 0.38253238797187805, + "learning_rate": 0.00011434773001988589, + "loss": 1.3482, + "step": 32966 + }, + { + "epoch": 0.4283911292745958, + "grad_norm": 0.4032425582408905, + "learning_rate": 0.00011434513055797451, + "loss": 1.5095, + "step": 32967 + }, + { + "epoch": 0.42840412381851173, + "grad_norm": 0.4528466463088989, + "learning_rate": 0.00011434253109606312, + "loss": 1.3803, + "step": 32968 + }, + { + "epoch": 0.4284171183624276, + "grad_norm": 0.34825149178504944, + "learning_rate": 0.00011433993163415173, + "loss": 1.5305, + "step": 32969 + }, + { + "epoch": 0.4284301129063435, + "grad_norm": 0.4225640892982483, + "learning_rate": 0.00011433733217224034, + "loss": 1.3553, + "step": 32970 + }, + { + "epoch": 0.4284431074502593, + "grad_norm": 0.41909220814704895, + "learning_rate": 0.00011433473271032897, + "loss": 1.3717, + "step": 32971 + }, + { + "epoch": 0.4284561019941752, + "grad_norm": 0.33388692140579224, + "learning_rate": 0.00011433213324841758, + "loss": 1.3239, + "step": 32972 + }, + { + "epoch": 0.42846909653809107, + "grad_norm": 0.33994460105895996, + "learning_rate": 0.00011432953378650619, + "loss": 1.3269, + "step": 32973 + }, + { + "epoch": 0.42848209108200697, + "grad_norm": 0.3454279601573944, + "learning_rate": 0.00011432693432459483, + "loss": 1.2758, + "step": 32974 + }, + { + "epoch": 0.4284950856259228, + "grad_norm": 0.3173542618751526, + "learning_rate": 0.00011432433486268344, + "loss": 1.4344, + "step": 32975 + }, + { + "epoch": 0.4285080801698387, + "grad_norm": 0.40768688917160034, + "learning_rate": 0.00011432173540077205, + "loss": 1.4672, + "step": 32976 + }, + { + "epoch": 0.42852107471375456, + "grad_norm": 0.367645263671875, + "learning_rate": 0.00011431913593886066, + "loss": 1.2757, + "step": 32977 + }, + { + "epoch": 0.42853406925767046, + "grad_norm": 0.3693621754646301, + "learning_rate": 0.00011431653647694928, + "loss": 1.4747, + "step": 32978 + }, + { + "epoch": 0.4285470638015863, + "grad_norm": 0.5344828367233276, + "learning_rate": 0.0001143139370150379, + "loss": 1.3818, + "step": 32979 + }, + { + "epoch": 0.4285600583455022, + "grad_norm": 0.38717135787010193, + "learning_rate": 0.0001143113375531265, + "loss": 1.4041, + "step": 32980 + }, + { + "epoch": 0.42857305288941805, + "grad_norm": 0.29870977997779846, + "learning_rate": 0.00011430873809121512, + "loss": 1.2573, + "step": 32981 + }, + { + "epoch": 0.42858604743333395, + "grad_norm": 0.4521723985671997, + "learning_rate": 0.00011430613862930376, + "loss": 1.3095, + "step": 32982 + }, + { + "epoch": 0.4285990419772498, + "grad_norm": 0.3614078760147095, + "learning_rate": 0.00011430353916739235, + "loss": 1.2895, + "step": 32983 + }, + { + "epoch": 0.4286120365211657, + "grad_norm": 0.4489821791648865, + "learning_rate": 0.00011430093970548096, + "loss": 1.415, + "step": 32984 + }, + { + "epoch": 0.42862503106508154, + "grad_norm": 0.32144278287887573, + "learning_rate": 0.00011429834024356957, + "loss": 1.2743, + "step": 32985 + }, + { + "epoch": 0.42863802560899744, + "grad_norm": 0.3889378607273102, + "learning_rate": 0.00011429574078165821, + "loss": 1.4236, + "step": 32986 + }, + { + "epoch": 0.4286510201529133, + "grad_norm": 0.3499564230442047, + "learning_rate": 0.00011429314131974682, + "loss": 1.5122, + "step": 32987 + }, + { + "epoch": 0.4286640146968292, + "grad_norm": 0.305021196603775, + "learning_rate": 0.00011429054185783543, + "loss": 1.2263, + "step": 32988 + }, + { + "epoch": 0.42867700924074503, + "grad_norm": 0.4088214635848999, + "learning_rate": 0.00011428794239592405, + "loss": 1.6406, + "step": 32989 + }, + { + "epoch": 0.42869000378466093, + "grad_norm": 0.39947709441185, + "learning_rate": 0.00011428534293401267, + "loss": 1.4852, + "step": 32990 + }, + { + "epoch": 0.4287029983285768, + "grad_norm": 0.4040416181087494, + "learning_rate": 0.00011428274347210128, + "loss": 1.3412, + "step": 32991 + }, + { + "epoch": 0.4287159928724927, + "grad_norm": 0.4759654402732849, + "learning_rate": 0.00011428014401018989, + "loss": 1.6076, + "step": 32992 + }, + { + "epoch": 0.4287289874164085, + "grad_norm": 0.4862821102142334, + "learning_rate": 0.0001142775445482785, + "loss": 1.5148, + "step": 32993 + }, + { + "epoch": 0.4287419819603244, + "grad_norm": 0.4117686152458191, + "learning_rate": 0.00011427494508636714, + "loss": 1.4097, + "step": 32994 + }, + { + "epoch": 0.42875497650424027, + "grad_norm": 0.34281232953071594, + "learning_rate": 0.00011427234562445575, + "loss": 1.3735, + "step": 32995 + }, + { + "epoch": 0.42876797104815617, + "grad_norm": 0.3933635354042053, + "learning_rate": 0.00011426974616254435, + "loss": 1.2935, + "step": 32996 + }, + { + "epoch": 0.428780965592072, + "grad_norm": 0.436548113822937, + "learning_rate": 0.00011426714670063296, + "loss": 1.3785, + "step": 32997 + }, + { + "epoch": 0.4287939601359879, + "grad_norm": 0.46581515669822693, + "learning_rate": 0.0001142645472387216, + "loss": 1.5153, + "step": 32998 + }, + { + "epoch": 0.42880695467990376, + "grad_norm": 0.41836127638816833, + "learning_rate": 0.00011426194777681021, + "loss": 1.2761, + "step": 32999 + }, + { + "epoch": 0.42881994922381966, + "grad_norm": 0.4111693203449249, + "learning_rate": 0.00011425934831489882, + "loss": 1.5732, + "step": 33000 + }, + { + "epoch": 0.4288329437677355, + "grad_norm": 0.23935194313526154, + "learning_rate": 0.00011425674885298743, + "loss": 1.3852, + "step": 33001 + }, + { + "epoch": 0.4288459383116514, + "grad_norm": 0.37584179639816284, + "learning_rate": 0.00011425414939107606, + "loss": 1.3571, + "step": 33002 + }, + { + "epoch": 0.42885893285556725, + "grad_norm": 0.35828694701194763, + "learning_rate": 0.00011425154992916467, + "loss": 1.4326, + "step": 33003 + }, + { + "epoch": 0.42887192739948315, + "grad_norm": 0.3447136878967285, + "learning_rate": 0.00011424895046725328, + "loss": 1.3819, + "step": 33004 + }, + { + "epoch": 0.428884921943399, + "grad_norm": 0.2607092559337616, + "learning_rate": 0.00011424635100534189, + "loss": 1.1683, + "step": 33005 + }, + { + "epoch": 0.4288979164873149, + "grad_norm": 0.38082775473594666, + "learning_rate": 0.00011424375154343053, + "loss": 1.531, + "step": 33006 + }, + { + "epoch": 0.42891091103123075, + "grad_norm": 0.4063643217086792, + "learning_rate": 0.00011424115208151914, + "loss": 1.361, + "step": 33007 + }, + { + "epoch": 0.42892390557514665, + "grad_norm": 0.36824119091033936, + "learning_rate": 0.00011423855261960775, + "loss": 1.298, + "step": 33008 + }, + { + "epoch": 0.4289369001190625, + "grad_norm": 0.37015676498413086, + "learning_rate": 0.00011423595315769635, + "loss": 1.1818, + "step": 33009 + }, + { + "epoch": 0.4289498946629784, + "grad_norm": 0.39529991149902344, + "learning_rate": 0.00011423335369578498, + "loss": 1.5586, + "step": 33010 + }, + { + "epoch": 0.42896288920689424, + "grad_norm": 0.4550600051879883, + "learning_rate": 0.0001142307542338736, + "loss": 1.3405, + "step": 33011 + }, + { + "epoch": 0.42897588375081014, + "grad_norm": 0.4093617796897888, + "learning_rate": 0.0001142281547719622, + "loss": 1.5043, + "step": 33012 + }, + { + "epoch": 0.428988878294726, + "grad_norm": 0.36031603813171387, + "learning_rate": 0.00011422555531005083, + "loss": 1.5713, + "step": 33013 + }, + { + "epoch": 0.4290018728386419, + "grad_norm": 0.48979198932647705, + "learning_rate": 0.00011422295584813944, + "loss": 1.3559, + "step": 33014 + }, + { + "epoch": 0.42901486738255773, + "grad_norm": 0.39934849739074707, + "learning_rate": 0.00011422035638622805, + "loss": 1.3094, + "step": 33015 + }, + { + "epoch": 0.42902786192647363, + "grad_norm": 0.36826372146606445, + "learning_rate": 0.00011421775692431666, + "loss": 1.1625, + "step": 33016 + }, + { + "epoch": 0.4290408564703895, + "grad_norm": 0.2788737416267395, + "learning_rate": 0.0001142151574624053, + "loss": 1.0874, + "step": 33017 + }, + { + "epoch": 0.4290538510143054, + "grad_norm": 0.40675240755081177, + "learning_rate": 0.00011421255800049391, + "loss": 1.5414, + "step": 33018 + }, + { + "epoch": 0.4290668455582212, + "grad_norm": 0.39083871245384216, + "learning_rate": 0.00011420995853858252, + "loss": 1.4494, + "step": 33019 + }, + { + "epoch": 0.4290798401021371, + "grad_norm": 0.34771040081977844, + "learning_rate": 0.00011420735907667113, + "loss": 1.4102, + "step": 33020 + }, + { + "epoch": 0.42909283464605297, + "grad_norm": 0.31723377108573914, + "learning_rate": 0.00011420475961475976, + "loss": 1.2182, + "step": 33021 + }, + { + "epoch": 0.42910582918996887, + "grad_norm": 0.40262946486473083, + "learning_rate": 0.00011420216015284837, + "loss": 1.5661, + "step": 33022 + }, + { + "epoch": 0.4291188237338847, + "grad_norm": 0.5085108876228333, + "learning_rate": 0.00011419956069093698, + "loss": 1.4595, + "step": 33023 + }, + { + "epoch": 0.4291318182778006, + "grad_norm": 0.44215112924575806, + "learning_rate": 0.00011419696122902559, + "loss": 1.4176, + "step": 33024 + }, + { + "epoch": 0.42914481282171646, + "grad_norm": 0.3943015933036804, + "learning_rate": 0.00011419436176711422, + "loss": 1.528, + "step": 33025 + }, + { + "epoch": 0.42915780736563236, + "grad_norm": 0.34902501106262207, + "learning_rate": 0.00011419176230520283, + "loss": 1.3306, + "step": 33026 + }, + { + "epoch": 0.4291708019095482, + "grad_norm": 0.4553138315677643, + "learning_rate": 0.00011418916284329144, + "loss": 1.4682, + "step": 33027 + }, + { + "epoch": 0.4291837964534641, + "grad_norm": 0.3910723924636841, + "learning_rate": 0.00011418656338138005, + "loss": 1.3087, + "step": 33028 + }, + { + "epoch": 0.42919679099737995, + "grad_norm": 0.35567012429237366, + "learning_rate": 0.00011418396391946869, + "loss": 1.4543, + "step": 33029 + }, + { + "epoch": 0.42920978554129585, + "grad_norm": 0.467305064201355, + "learning_rate": 0.0001141813644575573, + "loss": 1.4359, + "step": 33030 + }, + { + "epoch": 0.4292227800852117, + "grad_norm": 0.48820051550865173, + "learning_rate": 0.00011417876499564591, + "loss": 1.4388, + "step": 33031 + }, + { + "epoch": 0.4292357746291276, + "grad_norm": 0.43212008476257324, + "learning_rate": 0.00011417616553373452, + "loss": 1.5225, + "step": 33032 + }, + { + "epoch": 0.42924876917304344, + "grad_norm": 0.3583143353462219, + "learning_rate": 0.00011417356607182314, + "loss": 1.3766, + "step": 33033 + }, + { + "epoch": 0.42926176371695934, + "grad_norm": 0.46112048625946045, + "learning_rate": 0.00011417096660991175, + "loss": 1.4813, + "step": 33034 + }, + { + "epoch": 0.4292747582608752, + "grad_norm": 0.40445926785469055, + "learning_rate": 0.00011416836714800037, + "loss": 1.3102, + "step": 33035 + }, + { + "epoch": 0.4292877528047911, + "grad_norm": 0.2742837071418762, + "learning_rate": 0.00011416576768608898, + "loss": 1.4291, + "step": 33036 + }, + { + "epoch": 0.42930074734870693, + "grad_norm": 0.3757851719856262, + "learning_rate": 0.00011416316822417761, + "loss": 1.3656, + "step": 33037 + }, + { + "epoch": 0.42931374189262284, + "grad_norm": 0.38286471366882324, + "learning_rate": 0.00011416056876226621, + "loss": 1.4859, + "step": 33038 + }, + { + "epoch": 0.4293267364365387, + "grad_norm": 0.39252573251724243, + "learning_rate": 0.00011415796930035482, + "loss": 1.4248, + "step": 33039 + }, + { + "epoch": 0.4293397309804546, + "grad_norm": 0.36668860912323, + "learning_rate": 0.00011415536983844343, + "loss": 1.3993, + "step": 33040 + }, + { + "epoch": 0.4293527255243705, + "grad_norm": 0.47206979990005493, + "learning_rate": 0.00011415277037653207, + "loss": 1.5113, + "step": 33041 + }, + { + "epoch": 0.4293657200682863, + "grad_norm": 0.4524155557155609, + "learning_rate": 0.00011415017091462068, + "loss": 1.3287, + "step": 33042 + }, + { + "epoch": 0.42937871461220223, + "grad_norm": 0.4893929958343506, + "learning_rate": 0.0001141475714527093, + "loss": 1.3532, + "step": 33043 + }, + { + "epoch": 0.4293917091561181, + "grad_norm": 0.4571665823459625, + "learning_rate": 0.0001141449719907979, + "loss": 1.6273, + "step": 33044 + }, + { + "epoch": 0.429404703700034, + "grad_norm": 0.420337975025177, + "learning_rate": 0.00011414237252888653, + "loss": 1.4461, + "step": 33045 + }, + { + "epoch": 0.4294176982439498, + "grad_norm": 0.3624253273010254, + "learning_rate": 0.00011413977306697514, + "loss": 1.2497, + "step": 33046 + }, + { + "epoch": 0.4294306927878657, + "grad_norm": 0.5219370722770691, + "learning_rate": 0.00011413717360506375, + "loss": 1.4837, + "step": 33047 + }, + { + "epoch": 0.42944368733178157, + "grad_norm": 0.3202259838581085, + "learning_rate": 0.00011413457414315236, + "loss": 1.3773, + "step": 33048 + }, + { + "epoch": 0.42945668187569747, + "grad_norm": 0.3458300232887268, + "learning_rate": 0.000114131974681241, + "loss": 1.3374, + "step": 33049 + }, + { + "epoch": 0.4294696764196133, + "grad_norm": 0.27415987849235535, + "learning_rate": 0.00011412937521932961, + "loss": 1.4492, + "step": 33050 + }, + { + "epoch": 0.4294826709635292, + "grad_norm": 0.48517686128616333, + "learning_rate": 0.00011412677575741821, + "loss": 1.3907, + "step": 33051 + }, + { + "epoch": 0.42949566550744506, + "grad_norm": 0.2366357147693634, + "learning_rate": 0.00011412417629550685, + "loss": 1.1548, + "step": 33052 + }, + { + "epoch": 0.42950866005136096, + "grad_norm": 0.464781254529953, + "learning_rate": 0.00011412157683359546, + "loss": 1.3002, + "step": 33053 + }, + { + "epoch": 0.4295216545952768, + "grad_norm": 0.436920166015625, + "learning_rate": 0.00011411897737168407, + "loss": 1.6171, + "step": 33054 + }, + { + "epoch": 0.4295346491391927, + "grad_norm": 0.35520246624946594, + "learning_rate": 0.00011411637790977268, + "loss": 1.4255, + "step": 33055 + }, + { + "epoch": 0.42954764368310855, + "grad_norm": 0.37296155095100403, + "learning_rate": 0.0001141137784478613, + "loss": 1.3958, + "step": 33056 + }, + { + "epoch": 0.42956063822702445, + "grad_norm": 0.4117509722709656, + "learning_rate": 0.00011411117898594991, + "loss": 1.5684, + "step": 33057 + }, + { + "epoch": 0.4295736327709403, + "grad_norm": 0.43161967396736145, + "learning_rate": 0.00011410857952403853, + "loss": 1.2941, + "step": 33058 + }, + { + "epoch": 0.4295866273148562, + "grad_norm": 0.3396981656551361, + "learning_rate": 0.00011410598006212714, + "loss": 1.3691, + "step": 33059 + }, + { + "epoch": 0.42959962185877204, + "grad_norm": 0.3769652247428894, + "learning_rate": 0.00011410338060021577, + "loss": 1.3833, + "step": 33060 + }, + { + "epoch": 0.42961261640268794, + "grad_norm": 0.3140166699886322, + "learning_rate": 0.00011410078113830438, + "loss": 1.5036, + "step": 33061 + }, + { + "epoch": 0.4296256109466038, + "grad_norm": 0.45537781715393066, + "learning_rate": 0.000114098181676393, + "loss": 1.2985, + "step": 33062 + }, + { + "epoch": 0.4296386054905197, + "grad_norm": 0.4387262463569641, + "learning_rate": 0.0001140955822144816, + "loss": 1.2902, + "step": 33063 + }, + { + "epoch": 0.42965160003443553, + "grad_norm": 0.4398992359638214, + "learning_rate": 0.00011409298275257023, + "loss": 1.3651, + "step": 33064 + }, + { + "epoch": 0.42966459457835143, + "grad_norm": 0.3066038489341736, + "learning_rate": 0.00011409038329065884, + "loss": 1.3958, + "step": 33065 + }, + { + "epoch": 0.4296775891222673, + "grad_norm": 0.3835153877735138, + "learning_rate": 0.00011408778382874745, + "loss": 1.5516, + "step": 33066 + }, + { + "epoch": 0.4296905836661832, + "grad_norm": 0.33736687898635864, + "learning_rate": 0.00011408518436683606, + "loss": 1.3188, + "step": 33067 + }, + { + "epoch": 0.429703578210099, + "grad_norm": 0.28521960973739624, + "learning_rate": 0.00011408258490492469, + "loss": 1.2023, + "step": 33068 + }, + { + "epoch": 0.4297165727540149, + "grad_norm": 0.36845168471336365, + "learning_rate": 0.0001140799854430133, + "loss": 1.3836, + "step": 33069 + }, + { + "epoch": 0.42972956729793077, + "grad_norm": 0.36686450242996216, + "learning_rate": 0.00011407738598110191, + "loss": 1.5415, + "step": 33070 + }, + { + "epoch": 0.42974256184184667, + "grad_norm": 0.3897012770175934, + "learning_rate": 0.00011407478651919052, + "loss": 1.4018, + "step": 33071 + }, + { + "epoch": 0.4297555563857625, + "grad_norm": 0.43043002486228943, + "learning_rate": 0.00011407218705727916, + "loss": 1.4643, + "step": 33072 + }, + { + "epoch": 0.4297685509296784, + "grad_norm": 0.44798365235328674, + "learning_rate": 0.00011406958759536777, + "loss": 1.5364, + "step": 33073 + }, + { + "epoch": 0.42978154547359426, + "grad_norm": 0.47974103689193726, + "learning_rate": 0.00011406698813345638, + "loss": 1.5396, + "step": 33074 + }, + { + "epoch": 0.42979454001751016, + "grad_norm": 0.40785491466522217, + "learning_rate": 0.00011406438867154499, + "loss": 1.575, + "step": 33075 + }, + { + "epoch": 0.429807534561426, + "grad_norm": 0.47836023569107056, + "learning_rate": 0.00011406178920963362, + "loss": 1.4555, + "step": 33076 + }, + { + "epoch": 0.4298205291053419, + "grad_norm": 0.3881927728652954, + "learning_rate": 0.00011405918974772223, + "loss": 1.3507, + "step": 33077 + }, + { + "epoch": 0.42983352364925775, + "grad_norm": 0.428774893283844, + "learning_rate": 0.00011405659028581084, + "loss": 1.5527, + "step": 33078 + }, + { + "epoch": 0.42984651819317365, + "grad_norm": 0.40334850549697876, + "learning_rate": 0.00011405399082389945, + "loss": 1.3412, + "step": 33079 + }, + { + "epoch": 0.4298595127370895, + "grad_norm": 0.3947594165802002, + "learning_rate": 0.00011405139136198807, + "loss": 1.3321, + "step": 33080 + }, + { + "epoch": 0.4298725072810054, + "grad_norm": 0.3086243271827698, + "learning_rate": 0.00011404879190007668, + "loss": 1.2419, + "step": 33081 + }, + { + "epoch": 0.42988550182492125, + "grad_norm": 0.4754907786846161, + "learning_rate": 0.0001140461924381653, + "loss": 1.532, + "step": 33082 + }, + { + "epoch": 0.42989849636883715, + "grad_norm": 0.36356648802757263, + "learning_rate": 0.0001140435929762539, + "loss": 1.4077, + "step": 33083 + }, + { + "epoch": 0.429911490912753, + "grad_norm": 0.4019680917263031, + "learning_rate": 0.00011404099351434254, + "loss": 1.4357, + "step": 33084 + }, + { + "epoch": 0.4299244854566689, + "grad_norm": 0.32742324471473694, + "learning_rate": 0.00011403839405243116, + "loss": 1.4432, + "step": 33085 + }, + { + "epoch": 0.42993748000058474, + "grad_norm": 0.47397276759147644, + "learning_rate": 0.00011403579459051977, + "loss": 1.4284, + "step": 33086 + }, + { + "epoch": 0.42995047454450064, + "grad_norm": 0.40912535786628723, + "learning_rate": 0.00011403319512860839, + "loss": 1.5713, + "step": 33087 + }, + { + "epoch": 0.4299634690884165, + "grad_norm": 0.40757817029953003, + "learning_rate": 0.000114030595666697, + "loss": 1.5159, + "step": 33088 + }, + { + "epoch": 0.4299764636323324, + "grad_norm": 0.332685649394989, + "learning_rate": 0.00011402799620478561, + "loss": 1.2956, + "step": 33089 + }, + { + "epoch": 0.42998945817624823, + "grad_norm": 0.4536055326461792, + "learning_rate": 0.00011402539674287422, + "loss": 1.212, + "step": 33090 + }, + { + "epoch": 0.43000245272016413, + "grad_norm": 0.38578975200653076, + "learning_rate": 0.00011402279728096286, + "loss": 1.2481, + "step": 33091 + }, + { + "epoch": 0.43001544726408, + "grad_norm": 0.3576241135597229, + "learning_rate": 0.00011402019781905147, + "loss": 1.3822, + "step": 33092 + }, + { + "epoch": 0.4300284418079959, + "grad_norm": 0.37304365634918213, + "learning_rate": 0.00011401759835714007, + "loss": 1.2057, + "step": 33093 + }, + { + "epoch": 0.4300414363519117, + "grad_norm": 0.4942501485347748, + "learning_rate": 0.00011401499889522868, + "loss": 1.4689, + "step": 33094 + }, + { + "epoch": 0.4300544308958276, + "grad_norm": 0.45764344930648804, + "learning_rate": 0.00011401239943331732, + "loss": 1.362, + "step": 33095 + }, + { + "epoch": 0.43006742543974347, + "grad_norm": 0.42469510436058044, + "learning_rate": 0.00011400979997140593, + "loss": 1.3188, + "step": 33096 + }, + { + "epoch": 0.43008041998365937, + "grad_norm": 0.33602777123451233, + "learning_rate": 0.00011400720050949454, + "loss": 1.2276, + "step": 33097 + }, + { + "epoch": 0.4300934145275752, + "grad_norm": 0.4492737054824829, + "learning_rate": 0.00011400460104758315, + "loss": 1.5507, + "step": 33098 + }, + { + "epoch": 0.4301064090714911, + "grad_norm": 0.3442053496837616, + "learning_rate": 0.00011400200158567178, + "loss": 1.4496, + "step": 33099 + }, + { + "epoch": 0.43011940361540696, + "grad_norm": 0.41710934042930603, + "learning_rate": 0.00011399940212376039, + "loss": 1.2895, + "step": 33100 + }, + { + "epoch": 0.43013239815932286, + "grad_norm": 0.46953511238098145, + "learning_rate": 0.000113996802661849, + "loss": 1.4354, + "step": 33101 + }, + { + "epoch": 0.4301453927032387, + "grad_norm": 0.34530511498451233, + "learning_rate": 0.00011399420319993761, + "loss": 1.4933, + "step": 33102 + }, + { + "epoch": 0.4301583872471546, + "grad_norm": 0.33698585629463196, + "learning_rate": 0.00011399160373802625, + "loss": 1.3439, + "step": 33103 + }, + { + "epoch": 0.43017138179107045, + "grad_norm": 0.3705131709575653, + "learning_rate": 0.00011398900427611486, + "loss": 1.4833, + "step": 33104 + }, + { + "epoch": 0.43018437633498635, + "grad_norm": 0.3394135534763336, + "learning_rate": 0.00011398640481420346, + "loss": 1.0934, + "step": 33105 + }, + { + "epoch": 0.4301973708789022, + "grad_norm": 0.4138874411582947, + "learning_rate": 0.00011398380535229207, + "loss": 1.4256, + "step": 33106 + }, + { + "epoch": 0.4302103654228181, + "grad_norm": 0.3919612467288971, + "learning_rate": 0.0001139812058903807, + "loss": 1.3404, + "step": 33107 + }, + { + "epoch": 0.43022335996673394, + "grad_norm": 0.444354772567749, + "learning_rate": 0.00011397860642846932, + "loss": 1.2738, + "step": 33108 + }, + { + "epoch": 0.43023635451064984, + "grad_norm": 0.44634923338890076, + "learning_rate": 0.00011397600696655793, + "loss": 1.5357, + "step": 33109 + }, + { + "epoch": 0.4302493490545657, + "grad_norm": 0.494880348443985, + "learning_rate": 0.00011397340750464654, + "loss": 1.5915, + "step": 33110 + }, + { + "epoch": 0.4302623435984816, + "grad_norm": 0.26488107442855835, + "learning_rate": 0.00011397080804273516, + "loss": 1.4708, + "step": 33111 + }, + { + "epoch": 0.43027533814239743, + "grad_norm": 0.461359441280365, + "learning_rate": 0.00011396820858082377, + "loss": 1.3508, + "step": 33112 + }, + { + "epoch": 0.43028833268631334, + "grad_norm": 0.43496787548065186, + "learning_rate": 0.00011396560911891238, + "loss": 1.4757, + "step": 33113 + }, + { + "epoch": 0.4303013272302292, + "grad_norm": 0.39551621675491333, + "learning_rate": 0.000113963009657001, + "loss": 1.42, + "step": 33114 + }, + { + "epoch": 0.4303143217741451, + "grad_norm": 0.3881276547908783, + "learning_rate": 0.00011396041019508963, + "loss": 1.3462, + "step": 33115 + }, + { + "epoch": 0.430327316318061, + "grad_norm": 0.4036378264427185, + "learning_rate": 0.00011395781073317824, + "loss": 1.3031, + "step": 33116 + }, + { + "epoch": 0.4303403108619768, + "grad_norm": 0.465343177318573, + "learning_rate": 0.00011395521127126685, + "loss": 1.4617, + "step": 33117 + }, + { + "epoch": 0.4303533054058927, + "grad_norm": 0.4165148437023163, + "learning_rate": 0.00011395261180935545, + "loss": 1.3492, + "step": 33118 + }, + { + "epoch": 0.4303662999498086, + "grad_norm": 0.3834114372730255, + "learning_rate": 0.00011395001234744409, + "loss": 1.3815, + "step": 33119 + }, + { + "epoch": 0.4303792944937245, + "grad_norm": 0.4060989022254944, + "learning_rate": 0.0001139474128855327, + "loss": 1.2717, + "step": 33120 + }, + { + "epoch": 0.4303922890376403, + "grad_norm": 0.4501494765281677, + "learning_rate": 0.00011394481342362131, + "loss": 1.448, + "step": 33121 + }, + { + "epoch": 0.4304052835815562, + "grad_norm": 0.33736148476600647, + "learning_rate": 0.00011394221396170992, + "loss": 1.3055, + "step": 33122 + }, + { + "epoch": 0.43041827812547206, + "grad_norm": 0.3838721811771393, + "learning_rate": 0.00011393961449979855, + "loss": 1.5451, + "step": 33123 + }, + { + "epoch": 0.43043127266938797, + "grad_norm": 0.5082861185073853, + "learning_rate": 0.00011393701503788716, + "loss": 1.464, + "step": 33124 + }, + { + "epoch": 0.4304442672133038, + "grad_norm": 0.3409280776977539, + "learning_rate": 0.00011393441557597577, + "loss": 1.156, + "step": 33125 + }, + { + "epoch": 0.4304572617572197, + "grad_norm": 0.3471972644329071, + "learning_rate": 0.00011393181611406441, + "loss": 1.4289, + "step": 33126 + }, + { + "epoch": 0.43047025630113556, + "grad_norm": 0.45658400654792786, + "learning_rate": 0.00011392921665215302, + "loss": 1.405, + "step": 33127 + }, + { + "epoch": 0.43048325084505146, + "grad_norm": 0.31989625096321106, + "learning_rate": 0.00011392661719024163, + "loss": 1.4028, + "step": 33128 + }, + { + "epoch": 0.4304962453889673, + "grad_norm": 0.4005883038043976, + "learning_rate": 0.00011392401772833024, + "loss": 1.3442, + "step": 33129 + }, + { + "epoch": 0.4305092399328832, + "grad_norm": 0.4131905138492584, + "learning_rate": 0.00011392141826641886, + "loss": 1.3235, + "step": 33130 + }, + { + "epoch": 0.43052223447679905, + "grad_norm": 0.4129081666469574, + "learning_rate": 0.00011391881880450748, + "loss": 1.4005, + "step": 33131 + }, + { + "epoch": 0.43053522902071495, + "grad_norm": 0.3129737079143524, + "learning_rate": 0.00011391621934259609, + "loss": 1.4682, + "step": 33132 + }, + { + "epoch": 0.4305482235646308, + "grad_norm": 0.4177081882953644, + "learning_rate": 0.0001139136198806847, + "loss": 1.3637, + "step": 33133 + }, + { + "epoch": 0.4305612181085467, + "grad_norm": 0.4250037670135498, + "learning_rate": 0.00011391102041877334, + "loss": 1.3565, + "step": 33134 + }, + { + "epoch": 0.43057421265246254, + "grad_norm": 0.31789255142211914, + "learning_rate": 0.00011390842095686193, + "loss": 1.4654, + "step": 33135 + }, + { + "epoch": 0.43058720719637844, + "grad_norm": 0.4526450037956238, + "learning_rate": 0.00011390582149495054, + "loss": 1.4473, + "step": 33136 + }, + { + "epoch": 0.4306002017402943, + "grad_norm": 0.41610458493232727, + "learning_rate": 0.00011390322203303915, + "loss": 1.4201, + "step": 33137 + }, + { + "epoch": 0.4306131962842102, + "grad_norm": 0.502224326133728, + "learning_rate": 0.00011390062257112779, + "loss": 1.3476, + "step": 33138 + }, + { + "epoch": 0.43062619082812603, + "grad_norm": 0.40534213185310364, + "learning_rate": 0.0001138980231092164, + "loss": 1.5252, + "step": 33139 + }, + { + "epoch": 0.43063918537204193, + "grad_norm": 0.37727993726730347, + "learning_rate": 0.00011389542364730501, + "loss": 1.335, + "step": 33140 + }, + { + "epoch": 0.4306521799159578, + "grad_norm": 0.32960546016693115, + "learning_rate": 0.00011389282418539363, + "loss": 1.255, + "step": 33141 + }, + { + "epoch": 0.4306651744598737, + "grad_norm": 0.3949643075466156, + "learning_rate": 0.00011389022472348225, + "loss": 1.4645, + "step": 33142 + }, + { + "epoch": 0.4306781690037895, + "grad_norm": 0.3354935944080353, + "learning_rate": 0.00011388762526157086, + "loss": 1.3703, + "step": 33143 + }, + { + "epoch": 0.4306911635477054, + "grad_norm": 0.4239766299724579, + "learning_rate": 0.00011388502579965947, + "loss": 1.4255, + "step": 33144 + }, + { + "epoch": 0.43070415809162127, + "grad_norm": 0.35579726099967957, + "learning_rate": 0.00011388242633774808, + "loss": 1.3935, + "step": 33145 + }, + { + "epoch": 0.43071715263553717, + "grad_norm": 0.3417304456233978, + "learning_rate": 0.00011387982687583672, + "loss": 1.3113, + "step": 33146 + }, + { + "epoch": 0.430730147179453, + "grad_norm": 0.49395790696144104, + "learning_rate": 0.00011387722741392532, + "loss": 1.4068, + "step": 33147 + }, + { + "epoch": 0.4307431417233689, + "grad_norm": 0.35646697878837585, + "learning_rate": 0.00011387462795201393, + "loss": 1.4002, + "step": 33148 + }, + { + "epoch": 0.43075613626728476, + "grad_norm": 0.3382672965526581, + "learning_rate": 0.00011387202849010254, + "loss": 1.4763, + "step": 33149 + }, + { + "epoch": 0.43076913081120066, + "grad_norm": 0.3735864460468292, + "learning_rate": 0.00011386942902819118, + "loss": 1.3321, + "step": 33150 + }, + { + "epoch": 0.4307821253551165, + "grad_norm": 0.4748136103153229, + "learning_rate": 0.00011386682956627979, + "loss": 1.4862, + "step": 33151 + }, + { + "epoch": 0.4307951198990324, + "grad_norm": 0.3942922055721283, + "learning_rate": 0.0001138642301043684, + "loss": 1.5218, + "step": 33152 + }, + { + "epoch": 0.43080811444294825, + "grad_norm": 0.42013445496559143, + "learning_rate": 0.00011386163064245701, + "loss": 1.2873, + "step": 33153 + }, + { + "epoch": 0.43082110898686415, + "grad_norm": 0.4014489948749542, + "learning_rate": 0.00011385903118054564, + "loss": 1.1865, + "step": 33154 + }, + { + "epoch": 0.43083410353078, + "grad_norm": 0.3736909329891205, + "learning_rate": 0.00011385643171863425, + "loss": 1.3899, + "step": 33155 + }, + { + "epoch": 0.4308470980746959, + "grad_norm": 0.3160135746002197, + "learning_rate": 0.00011385383225672286, + "loss": 1.3598, + "step": 33156 + }, + { + "epoch": 0.43086009261861175, + "grad_norm": 0.6537340879440308, + "learning_rate": 0.00011385123279481147, + "loss": 1.4791, + "step": 33157 + }, + { + "epoch": 0.43087308716252765, + "grad_norm": 0.36841726303100586, + "learning_rate": 0.0001138486333329001, + "loss": 1.3764, + "step": 33158 + }, + { + "epoch": 0.4308860817064435, + "grad_norm": 0.34699544310569763, + "learning_rate": 0.00011384603387098872, + "loss": 1.4489, + "step": 33159 + }, + { + "epoch": 0.4308990762503594, + "grad_norm": 0.4063728153705597, + "learning_rate": 0.00011384343440907731, + "loss": 1.3759, + "step": 33160 + }, + { + "epoch": 0.43091207079427524, + "grad_norm": 0.4053795635700226, + "learning_rate": 0.00011384083494716595, + "loss": 1.3979, + "step": 33161 + }, + { + "epoch": 0.43092506533819114, + "grad_norm": 0.41829726099967957, + "learning_rate": 0.00011383823548525456, + "loss": 1.8126, + "step": 33162 + }, + { + "epoch": 0.430938059882107, + "grad_norm": 0.39402708411216736, + "learning_rate": 0.00011383563602334317, + "loss": 1.4858, + "step": 33163 + }, + { + "epoch": 0.4309510544260229, + "grad_norm": 0.3054930865764618, + "learning_rate": 0.00011383303656143179, + "loss": 1.283, + "step": 33164 + }, + { + "epoch": 0.43096404896993873, + "grad_norm": 0.42324545979499817, + "learning_rate": 0.00011383043709952041, + "loss": 1.3945, + "step": 33165 + }, + { + "epoch": 0.43097704351385463, + "grad_norm": 0.33468741178512573, + "learning_rate": 0.00011382783763760902, + "loss": 1.3577, + "step": 33166 + }, + { + "epoch": 0.4309900380577705, + "grad_norm": 0.4499208629131317, + "learning_rate": 0.00011382523817569763, + "loss": 1.4147, + "step": 33167 + }, + { + "epoch": 0.4310030326016864, + "grad_norm": 0.5325268507003784, + "learning_rate": 0.00011382263871378624, + "loss": 1.3786, + "step": 33168 + }, + { + "epoch": 0.4310160271456022, + "grad_norm": 0.3340540826320648, + "learning_rate": 0.00011382003925187488, + "loss": 1.3203, + "step": 33169 + }, + { + "epoch": 0.4310290216895181, + "grad_norm": 0.38758203387260437, + "learning_rate": 0.00011381743978996349, + "loss": 1.3524, + "step": 33170 + }, + { + "epoch": 0.43104201623343397, + "grad_norm": 0.3518804609775543, + "learning_rate": 0.0001138148403280521, + "loss": 1.5326, + "step": 33171 + }, + { + "epoch": 0.43105501077734987, + "grad_norm": 0.4840235114097595, + "learning_rate": 0.00011381224086614071, + "loss": 1.6046, + "step": 33172 + }, + { + "epoch": 0.4310680053212657, + "grad_norm": 0.41478368639945984, + "learning_rate": 0.00011380964140422934, + "loss": 1.5209, + "step": 33173 + }, + { + "epoch": 0.4310809998651816, + "grad_norm": 0.3656295835971832, + "learning_rate": 0.00011380704194231795, + "loss": 1.1945, + "step": 33174 + }, + { + "epoch": 0.43109399440909746, + "grad_norm": 0.44753339886665344, + "learning_rate": 0.00011380444248040656, + "loss": 1.3426, + "step": 33175 + }, + { + "epoch": 0.43110698895301336, + "grad_norm": 0.4699760377407074, + "learning_rate": 0.00011380184301849517, + "loss": 1.5319, + "step": 33176 + }, + { + "epoch": 0.4311199834969292, + "grad_norm": 0.3339541256427765, + "learning_rate": 0.0001137992435565838, + "loss": 1.3831, + "step": 33177 + }, + { + "epoch": 0.4311329780408451, + "grad_norm": 0.35164934396743774, + "learning_rate": 0.0001137966440946724, + "loss": 1.2086, + "step": 33178 + }, + { + "epoch": 0.43114597258476095, + "grad_norm": 0.4146008789539337, + "learning_rate": 0.00011379404463276102, + "loss": 1.1877, + "step": 33179 + }, + { + "epoch": 0.43115896712867685, + "grad_norm": 0.3636126220226288, + "learning_rate": 0.00011379144517084963, + "loss": 1.5375, + "step": 33180 + }, + { + "epoch": 0.4311719616725927, + "grad_norm": 0.342949241399765, + "learning_rate": 0.00011378884570893827, + "loss": 1.2516, + "step": 33181 + }, + { + "epoch": 0.4311849562165086, + "grad_norm": 0.44998618960380554, + "learning_rate": 0.00011378624624702688, + "loss": 1.3039, + "step": 33182 + }, + { + "epoch": 0.43119795076042444, + "grad_norm": 0.42032068967819214, + "learning_rate": 0.00011378364678511549, + "loss": 1.432, + "step": 33183 + }, + { + "epoch": 0.43121094530434034, + "grad_norm": 0.4338682293891907, + "learning_rate": 0.0001137810473232041, + "loss": 1.3524, + "step": 33184 + }, + { + "epoch": 0.4312239398482562, + "grad_norm": 0.39883100986480713, + "learning_rate": 0.00011377844786129272, + "loss": 1.2826, + "step": 33185 + }, + { + "epoch": 0.4312369343921721, + "grad_norm": 0.40389248728752136, + "learning_rate": 0.00011377584839938133, + "loss": 1.4106, + "step": 33186 + }, + { + "epoch": 0.43124992893608793, + "grad_norm": 0.39609894156455994, + "learning_rate": 0.00011377324893746995, + "loss": 1.3038, + "step": 33187 + }, + { + "epoch": 0.43126292348000383, + "grad_norm": 0.3595294952392578, + "learning_rate": 0.00011377064947555856, + "loss": 1.3779, + "step": 33188 + }, + { + "epoch": 0.4312759180239197, + "grad_norm": 0.49085184931755066, + "learning_rate": 0.00011376805001364718, + "loss": 1.498, + "step": 33189 + }, + { + "epoch": 0.4312889125678356, + "grad_norm": 0.5130632519721985, + "learning_rate": 0.00011376545055173579, + "loss": 1.4194, + "step": 33190 + }, + { + "epoch": 0.4313019071117514, + "grad_norm": 0.4032772183418274, + "learning_rate": 0.0001137628510898244, + "loss": 1.3503, + "step": 33191 + }, + { + "epoch": 0.4313149016556673, + "grad_norm": 0.3786592185497284, + "learning_rate": 0.00011376025162791301, + "loss": 1.3481, + "step": 33192 + }, + { + "epoch": 0.4313278961995832, + "grad_norm": 0.43501991033554077, + "learning_rate": 0.00011375765216600165, + "loss": 1.4536, + "step": 33193 + }, + { + "epoch": 0.4313408907434991, + "grad_norm": 0.41508761048316956, + "learning_rate": 0.00011375505270409026, + "loss": 1.1944, + "step": 33194 + }, + { + "epoch": 0.431353885287415, + "grad_norm": 0.3628128468990326, + "learning_rate": 0.00011375245324217887, + "loss": 1.52, + "step": 33195 + }, + { + "epoch": 0.4313668798313308, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.00011374985378026748, + "loss": 1.1061, + "step": 33196 + }, + { + "epoch": 0.4313798743752467, + "grad_norm": 0.4141942262649536, + "learning_rate": 0.00011374725431835611, + "loss": 1.4285, + "step": 33197 + }, + { + "epoch": 0.43139286891916256, + "grad_norm": 0.4284278452396393, + "learning_rate": 0.00011374465485644472, + "loss": 1.384, + "step": 33198 + }, + { + "epoch": 0.43140586346307846, + "grad_norm": 0.28185179829597473, + "learning_rate": 0.00011374205539453333, + "loss": 1.439, + "step": 33199 + }, + { + "epoch": 0.4314188580069943, + "grad_norm": 0.39052891731262207, + "learning_rate": 0.00011373945593262197, + "loss": 1.3289, + "step": 33200 + }, + { + "epoch": 0.4314318525509102, + "grad_norm": 0.3845687508583069, + "learning_rate": 0.00011373685647071058, + "loss": 1.2464, + "step": 33201 + }, + { + "epoch": 0.43144484709482606, + "grad_norm": 0.28828588128089905, + "learning_rate": 0.00011373425700879918, + "loss": 1.2802, + "step": 33202 + }, + { + "epoch": 0.43145784163874196, + "grad_norm": 0.4119218587875366, + "learning_rate": 0.00011373165754688779, + "loss": 1.3812, + "step": 33203 + }, + { + "epoch": 0.4314708361826578, + "grad_norm": 0.3059438467025757, + "learning_rate": 0.00011372905808497643, + "loss": 1.7177, + "step": 33204 + }, + { + "epoch": 0.4314838307265737, + "grad_norm": 0.4431051015853882, + "learning_rate": 0.00011372645862306504, + "loss": 1.5406, + "step": 33205 + }, + { + "epoch": 0.43149682527048955, + "grad_norm": 0.4355873763561249, + "learning_rate": 0.00011372385916115365, + "loss": 1.3495, + "step": 33206 + }, + { + "epoch": 0.43150981981440545, + "grad_norm": 0.3914143443107605, + "learning_rate": 0.00011372125969924226, + "loss": 1.3385, + "step": 33207 + }, + { + "epoch": 0.4315228143583213, + "grad_norm": 0.4504346549510956, + "learning_rate": 0.00011371866023733088, + "loss": 1.5157, + "step": 33208 + }, + { + "epoch": 0.4315358089022372, + "grad_norm": 0.32465875148773193, + "learning_rate": 0.0001137160607754195, + "loss": 1.2986, + "step": 33209 + }, + { + "epoch": 0.43154880344615304, + "grad_norm": 0.4478144645690918, + "learning_rate": 0.0001137134613135081, + "loss": 1.4112, + "step": 33210 + }, + { + "epoch": 0.43156179799006894, + "grad_norm": 0.4574632942676544, + "learning_rate": 0.00011371086185159672, + "loss": 1.2701, + "step": 33211 + }, + { + "epoch": 0.4315747925339848, + "grad_norm": 0.3956300914287567, + "learning_rate": 0.00011370826238968535, + "loss": 1.3532, + "step": 33212 + }, + { + "epoch": 0.4315877870779007, + "grad_norm": 0.29667261242866516, + "learning_rate": 0.00011370566292777396, + "loss": 1.2325, + "step": 33213 + }, + { + "epoch": 0.43160078162181653, + "grad_norm": 0.4740503430366516, + "learning_rate": 0.00011370306346586258, + "loss": 1.5815, + "step": 33214 + }, + { + "epoch": 0.43161377616573243, + "grad_norm": 0.4166916012763977, + "learning_rate": 0.00011370046400395117, + "loss": 1.3698, + "step": 33215 + }, + { + "epoch": 0.4316267707096483, + "grad_norm": 0.32976484298706055, + "learning_rate": 0.00011369786454203981, + "loss": 1.277, + "step": 33216 + }, + { + "epoch": 0.4316397652535642, + "grad_norm": 0.3904076814651489, + "learning_rate": 0.00011369526508012842, + "loss": 1.4539, + "step": 33217 + }, + { + "epoch": 0.43165275979748, + "grad_norm": 0.42991331219673157, + "learning_rate": 0.00011369266561821703, + "loss": 1.5142, + "step": 33218 + }, + { + "epoch": 0.4316657543413959, + "grad_norm": 0.43651294708251953, + "learning_rate": 0.00011369006615630564, + "loss": 1.3554, + "step": 33219 + }, + { + "epoch": 0.43167874888531177, + "grad_norm": 0.3718758821487427, + "learning_rate": 0.00011368746669439427, + "loss": 1.4874, + "step": 33220 + }, + { + "epoch": 0.43169174342922767, + "grad_norm": 0.41168835759162903, + "learning_rate": 0.00011368486723248288, + "loss": 1.3757, + "step": 33221 + }, + { + "epoch": 0.4317047379731435, + "grad_norm": 0.39248085021972656, + "learning_rate": 0.00011368226777057149, + "loss": 1.4277, + "step": 33222 + }, + { + "epoch": 0.4317177325170594, + "grad_norm": 0.34415584802627563, + "learning_rate": 0.0001136796683086601, + "loss": 1.2693, + "step": 33223 + }, + { + "epoch": 0.43173072706097526, + "grad_norm": 0.3893500864505768, + "learning_rate": 0.00011367706884674874, + "loss": 1.3699, + "step": 33224 + }, + { + "epoch": 0.43174372160489116, + "grad_norm": 0.46702802181243896, + "learning_rate": 0.00011367446938483735, + "loss": 1.3849, + "step": 33225 + }, + { + "epoch": 0.431756716148807, + "grad_norm": 0.3870939016342163, + "learning_rate": 0.00011367186992292596, + "loss": 1.4418, + "step": 33226 + }, + { + "epoch": 0.4317697106927229, + "grad_norm": 0.4282057583332062, + "learning_rate": 0.00011366927046101457, + "loss": 1.422, + "step": 33227 + }, + { + "epoch": 0.43178270523663875, + "grad_norm": 0.3599450886249542, + "learning_rate": 0.0001136666709991032, + "loss": 1.2941, + "step": 33228 + }, + { + "epoch": 0.43179569978055465, + "grad_norm": 0.39071422815322876, + "learning_rate": 0.00011366407153719181, + "loss": 1.2355, + "step": 33229 + }, + { + "epoch": 0.4318086943244705, + "grad_norm": 0.4582749307155609, + "learning_rate": 0.00011366147207528042, + "loss": 1.4414, + "step": 33230 + }, + { + "epoch": 0.4318216888683864, + "grad_norm": 0.45632022619247437, + "learning_rate": 0.00011365887261336903, + "loss": 1.432, + "step": 33231 + }, + { + "epoch": 0.43183468341230224, + "grad_norm": 0.32240864634513855, + "learning_rate": 0.00011365627315145765, + "loss": 1.4783, + "step": 33232 + }, + { + "epoch": 0.43184767795621815, + "grad_norm": 0.46361517906188965, + "learning_rate": 0.00011365367368954626, + "loss": 1.3672, + "step": 33233 + }, + { + "epoch": 0.431860672500134, + "grad_norm": 0.4368123412132263, + "learning_rate": 0.00011365107422763488, + "loss": 1.2213, + "step": 33234 + }, + { + "epoch": 0.4318736670440499, + "grad_norm": 0.4330582022666931, + "learning_rate": 0.00011364847476572351, + "loss": 1.4719, + "step": 33235 + }, + { + "epoch": 0.43188666158796574, + "grad_norm": 0.35051003098487854, + "learning_rate": 0.00011364587530381212, + "loss": 1.3664, + "step": 33236 + }, + { + "epoch": 0.43189965613188164, + "grad_norm": 0.42034879326820374, + "learning_rate": 0.00011364327584190074, + "loss": 1.406, + "step": 33237 + }, + { + "epoch": 0.4319126506757975, + "grad_norm": 0.44087105989456177, + "learning_rate": 0.00011364067637998935, + "loss": 1.5003, + "step": 33238 + }, + { + "epoch": 0.4319256452197134, + "grad_norm": 0.38869795203208923, + "learning_rate": 0.00011363807691807797, + "loss": 1.2656, + "step": 33239 + }, + { + "epoch": 0.43193863976362923, + "grad_norm": 0.464563250541687, + "learning_rate": 0.00011363547745616658, + "loss": 1.4988, + "step": 33240 + }, + { + "epoch": 0.43195163430754513, + "grad_norm": 0.38857123255729675, + "learning_rate": 0.00011363287799425519, + "loss": 1.3995, + "step": 33241 + }, + { + "epoch": 0.431964628851461, + "grad_norm": 0.4506318271160126, + "learning_rate": 0.0001136302785323438, + "loss": 1.5083, + "step": 33242 + }, + { + "epoch": 0.4319776233953769, + "grad_norm": 0.42720407247543335, + "learning_rate": 0.00011362767907043244, + "loss": 1.4089, + "step": 33243 + }, + { + "epoch": 0.4319906179392927, + "grad_norm": 0.4419236183166504, + "learning_rate": 0.00011362507960852104, + "loss": 1.4772, + "step": 33244 + }, + { + "epoch": 0.4320036124832086, + "grad_norm": 0.32073158025741577, + "learning_rate": 0.00011362248014660965, + "loss": 1.2382, + "step": 33245 + }, + { + "epoch": 0.43201660702712447, + "grad_norm": 0.32058602571487427, + "learning_rate": 0.00011361988068469826, + "loss": 1.1853, + "step": 33246 + }, + { + "epoch": 0.43202960157104037, + "grad_norm": 0.3077157735824585, + "learning_rate": 0.0001136172812227869, + "loss": 1.6345, + "step": 33247 + }, + { + "epoch": 0.4320425961149562, + "grad_norm": 0.43998798727989197, + "learning_rate": 0.00011361468176087551, + "loss": 1.4264, + "step": 33248 + }, + { + "epoch": 0.4320555906588721, + "grad_norm": 0.3312949240207672, + "learning_rate": 0.00011361208229896412, + "loss": 1.2769, + "step": 33249 + }, + { + "epoch": 0.43206858520278796, + "grad_norm": 0.38544297218322754, + "learning_rate": 0.00011360948283705273, + "loss": 1.3399, + "step": 33250 + }, + { + "epoch": 0.43208157974670386, + "grad_norm": 0.42414095997810364, + "learning_rate": 0.00011360688337514136, + "loss": 1.4023, + "step": 33251 + }, + { + "epoch": 0.4320945742906197, + "grad_norm": 0.385626882314682, + "learning_rate": 0.00011360428391322997, + "loss": 1.1628, + "step": 33252 + }, + { + "epoch": 0.4321075688345356, + "grad_norm": 0.5207545161247253, + "learning_rate": 0.00011360168445131858, + "loss": 1.4965, + "step": 33253 + }, + { + "epoch": 0.43212056337845145, + "grad_norm": 0.27895423769950867, + "learning_rate": 0.00011359908498940719, + "loss": 1.0972, + "step": 33254 + }, + { + "epoch": 0.43213355792236735, + "grad_norm": 0.4397144019603729, + "learning_rate": 0.00011359648552749583, + "loss": 1.379, + "step": 33255 + }, + { + "epoch": 0.4321465524662832, + "grad_norm": 0.3078467845916748, + "learning_rate": 0.00011359388606558444, + "loss": 1.3298, + "step": 33256 + }, + { + "epoch": 0.4321595470101991, + "grad_norm": 0.3218854069709778, + "learning_rate": 0.00011359128660367304, + "loss": 1.2533, + "step": 33257 + }, + { + "epoch": 0.43217254155411494, + "grad_norm": 0.37579187750816345, + "learning_rate": 0.00011358868714176165, + "loss": 1.4169, + "step": 33258 + }, + { + "epoch": 0.43218553609803084, + "grad_norm": 0.4179782271385193, + "learning_rate": 0.00011358608767985028, + "loss": 1.4597, + "step": 33259 + }, + { + "epoch": 0.4321985306419467, + "grad_norm": 0.33286842703819275, + "learning_rate": 0.0001135834882179389, + "loss": 1.406, + "step": 33260 + }, + { + "epoch": 0.4322115251858626, + "grad_norm": 0.41115471720695496, + "learning_rate": 0.0001135808887560275, + "loss": 1.5031, + "step": 33261 + }, + { + "epoch": 0.43222451972977843, + "grad_norm": 0.3228372633457184, + "learning_rate": 0.00011357828929411612, + "loss": 1.5566, + "step": 33262 + }, + { + "epoch": 0.43223751427369433, + "grad_norm": 0.37301644682884216, + "learning_rate": 0.00011357568983220474, + "loss": 1.4507, + "step": 33263 + }, + { + "epoch": 0.4322505088176102, + "grad_norm": 0.3995104134082794, + "learning_rate": 0.00011357309037029335, + "loss": 1.5073, + "step": 33264 + }, + { + "epoch": 0.4322635033615261, + "grad_norm": 0.45963793992996216, + "learning_rate": 0.00011357049090838196, + "loss": 1.6684, + "step": 33265 + }, + { + "epoch": 0.4322764979054419, + "grad_norm": 0.44271236658096313, + "learning_rate": 0.00011356789144647057, + "loss": 1.3368, + "step": 33266 + }, + { + "epoch": 0.4322894924493578, + "grad_norm": 0.4104548990726471, + "learning_rate": 0.00011356529198455921, + "loss": 1.3336, + "step": 33267 + }, + { + "epoch": 0.4323024869932737, + "grad_norm": 0.3448936939239502, + "learning_rate": 0.00011356269252264782, + "loss": 1.1577, + "step": 33268 + }, + { + "epoch": 0.43231548153718957, + "grad_norm": 0.4223630428314209, + "learning_rate": 0.00011356009306073642, + "loss": 1.5959, + "step": 33269 + }, + { + "epoch": 0.4323284760811055, + "grad_norm": 0.40309464931488037, + "learning_rate": 0.00011355749359882503, + "loss": 1.2942, + "step": 33270 + }, + { + "epoch": 0.4323414706250213, + "grad_norm": 0.3207789361476898, + "learning_rate": 0.00011355489413691367, + "loss": 1.3433, + "step": 33271 + }, + { + "epoch": 0.4323544651689372, + "grad_norm": 0.42869865894317627, + "learning_rate": 0.00011355229467500228, + "loss": 1.3548, + "step": 33272 + }, + { + "epoch": 0.43236745971285306, + "grad_norm": 0.4022025465965271, + "learning_rate": 0.00011354969521309089, + "loss": 1.5777, + "step": 33273 + }, + { + "epoch": 0.43238045425676896, + "grad_norm": 0.401841938495636, + "learning_rate": 0.00011354709575117952, + "loss": 1.4223, + "step": 33274 + }, + { + "epoch": 0.4323934488006848, + "grad_norm": 0.39523476362228394, + "learning_rate": 0.00011354449628926813, + "loss": 1.3229, + "step": 33275 + }, + { + "epoch": 0.4324064433446007, + "grad_norm": 0.3510282337665558, + "learning_rate": 0.00011354189682735674, + "loss": 1.3569, + "step": 33276 + }, + { + "epoch": 0.43241943788851656, + "grad_norm": 0.3840654790401459, + "learning_rate": 0.00011353929736544535, + "loss": 1.3249, + "step": 33277 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.3250669538974762, + "learning_rate": 0.00011353669790353399, + "loss": 1.4034, + "step": 33278 + }, + { + "epoch": 0.4324454269763483, + "grad_norm": 0.4078018069267273, + "learning_rate": 0.0001135340984416226, + "loss": 1.4428, + "step": 33279 + }, + { + "epoch": 0.4324584215202642, + "grad_norm": 0.40031298995018005, + "learning_rate": 0.00011353149897971121, + "loss": 1.3446, + "step": 33280 + }, + { + "epoch": 0.43247141606418005, + "grad_norm": 0.4504551887512207, + "learning_rate": 0.00011352889951779982, + "loss": 1.4756, + "step": 33281 + }, + { + "epoch": 0.43248441060809595, + "grad_norm": 0.4024350345134735, + "learning_rate": 0.00011352630005588844, + "loss": 1.2388, + "step": 33282 + }, + { + "epoch": 0.4324974051520118, + "grad_norm": 0.36275625228881836, + "learning_rate": 0.00011352370059397706, + "loss": 1.5586, + "step": 33283 + }, + { + "epoch": 0.4325103996959277, + "grad_norm": 0.3074016273021698, + "learning_rate": 0.00011352110113206567, + "loss": 1.4634, + "step": 33284 + }, + { + "epoch": 0.43252339423984354, + "grad_norm": 0.3277377486228943, + "learning_rate": 0.00011351850167015428, + "loss": 1.2472, + "step": 33285 + }, + { + "epoch": 0.43253638878375944, + "grad_norm": 0.4185287654399872, + "learning_rate": 0.0001135159022082429, + "loss": 1.442, + "step": 33286 + }, + { + "epoch": 0.4325493833276753, + "grad_norm": 0.37106043100357056, + "learning_rate": 0.00011351330274633151, + "loss": 1.4118, + "step": 33287 + }, + { + "epoch": 0.4325623778715912, + "grad_norm": 0.40792375802993774, + "learning_rate": 0.00011351070328442012, + "loss": 1.5117, + "step": 33288 + }, + { + "epoch": 0.43257537241550703, + "grad_norm": 0.3630428910255432, + "learning_rate": 0.00011350810382250873, + "loss": 1.4757, + "step": 33289 + }, + { + "epoch": 0.43258836695942293, + "grad_norm": 0.3440118432044983, + "learning_rate": 0.00011350550436059737, + "loss": 1.2884, + "step": 33290 + }, + { + "epoch": 0.4326013615033388, + "grad_norm": 0.3732414245605469, + "learning_rate": 0.00011350290489868598, + "loss": 1.3255, + "step": 33291 + }, + { + "epoch": 0.4326143560472547, + "grad_norm": 0.4982890188694, + "learning_rate": 0.0001135003054367746, + "loss": 1.4479, + "step": 33292 + }, + { + "epoch": 0.4326273505911705, + "grad_norm": 0.3514576554298401, + "learning_rate": 0.0001134977059748632, + "loss": 1.4575, + "step": 33293 + }, + { + "epoch": 0.4326403451350864, + "grad_norm": 0.517388641834259, + "learning_rate": 0.00011349510651295183, + "loss": 1.3765, + "step": 33294 + }, + { + "epoch": 0.43265333967900227, + "grad_norm": 0.47659197449684143, + "learning_rate": 0.00011349250705104044, + "loss": 1.4819, + "step": 33295 + }, + { + "epoch": 0.43266633422291817, + "grad_norm": 0.3034079372882843, + "learning_rate": 0.00011348990758912905, + "loss": 1.1563, + "step": 33296 + }, + { + "epoch": 0.432679328766834, + "grad_norm": 0.4922278821468353, + "learning_rate": 0.00011348730812721766, + "loss": 1.6657, + "step": 33297 + }, + { + "epoch": 0.4326923233107499, + "grad_norm": 0.3886752426624298, + "learning_rate": 0.0001134847086653063, + "loss": 1.4479, + "step": 33298 + }, + { + "epoch": 0.43270531785466576, + "grad_norm": 0.39772507548332214, + "learning_rate": 0.0001134821092033949, + "loss": 1.3902, + "step": 33299 + }, + { + "epoch": 0.43271831239858166, + "grad_norm": 0.39935287833213806, + "learning_rate": 0.00011347950974148351, + "loss": 1.0775, + "step": 33300 + }, + { + "epoch": 0.4327313069424975, + "grad_norm": 0.4971461594104767, + "learning_rate": 0.00011347691027957212, + "loss": 1.3956, + "step": 33301 + }, + { + "epoch": 0.4327443014864134, + "grad_norm": 0.40562406182289124, + "learning_rate": 0.00011347431081766076, + "loss": 1.4968, + "step": 33302 + }, + { + "epoch": 0.43275729603032925, + "grad_norm": 0.37555643916130066, + "learning_rate": 0.00011347171135574937, + "loss": 1.5903, + "step": 33303 + }, + { + "epoch": 0.43277029057424515, + "grad_norm": 0.3111216425895691, + "learning_rate": 0.00011346911189383798, + "loss": 1.2038, + "step": 33304 + }, + { + "epoch": 0.432783285118161, + "grad_norm": 0.4424316883087158, + "learning_rate": 0.00011346651243192659, + "loss": 1.4002, + "step": 33305 + }, + { + "epoch": 0.4327962796620769, + "grad_norm": 0.3849847614765167, + "learning_rate": 0.00011346391297001522, + "loss": 1.3532, + "step": 33306 + }, + { + "epoch": 0.43280927420599274, + "grad_norm": 0.473209947347641, + "learning_rate": 0.00011346131350810383, + "loss": 1.3683, + "step": 33307 + }, + { + "epoch": 0.43282226874990865, + "grad_norm": 0.40562304854393005, + "learning_rate": 0.00011345871404619244, + "loss": 1.25, + "step": 33308 + }, + { + "epoch": 0.4328352632938245, + "grad_norm": 0.44929763674736023, + "learning_rate": 0.00011345611458428108, + "loss": 1.3771, + "step": 33309 + }, + { + "epoch": 0.4328482578377404, + "grad_norm": 0.4249218702316284, + "learning_rate": 0.00011345351512236969, + "loss": 1.5043, + "step": 33310 + }, + { + "epoch": 0.43286125238165624, + "grad_norm": 0.47616755962371826, + "learning_rate": 0.00011345091566045828, + "loss": 1.4547, + "step": 33311 + }, + { + "epoch": 0.43287424692557214, + "grad_norm": 0.368272989988327, + "learning_rate": 0.0001134483161985469, + "loss": 1.4776, + "step": 33312 + }, + { + "epoch": 0.432887241469488, + "grad_norm": 0.31713753938674927, + "learning_rate": 0.00011344571673663553, + "loss": 1.2758, + "step": 33313 + }, + { + "epoch": 0.4329002360134039, + "grad_norm": 0.3299190104007721, + "learning_rate": 0.00011344311727472414, + "loss": 1.4371, + "step": 33314 + }, + { + "epoch": 0.43291323055731973, + "grad_norm": 0.4193507730960846, + "learning_rate": 0.00011344051781281275, + "loss": 1.4943, + "step": 33315 + }, + { + "epoch": 0.43292622510123563, + "grad_norm": 0.36613646149635315, + "learning_rate": 0.00011343791835090137, + "loss": 1.3403, + "step": 33316 + }, + { + "epoch": 0.4329392196451515, + "grad_norm": 0.3816149830818176, + "learning_rate": 0.00011343531888898999, + "loss": 1.3818, + "step": 33317 + }, + { + "epoch": 0.4329522141890674, + "grad_norm": 0.31470587849617004, + "learning_rate": 0.0001134327194270786, + "loss": 1.3734, + "step": 33318 + }, + { + "epoch": 0.4329652087329832, + "grad_norm": 0.3731381893157959, + "learning_rate": 0.00011343011996516721, + "loss": 1.3972, + "step": 33319 + }, + { + "epoch": 0.4329782032768991, + "grad_norm": 0.4419451355934143, + "learning_rate": 0.00011342752050325582, + "loss": 1.5374, + "step": 33320 + }, + { + "epoch": 0.43299119782081497, + "grad_norm": 0.5802931785583496, + "learning_rate": 0.00011342492104134446, + "loss": 1.5212, + "step": 33321 + }, + { + "epoch": 0.43300419236473087, + "grad_norm": 0.45645299553871155, + "learning_rate": 0.00011342232157943307, + "loss": 1.4509, + "step": 33322 + }, + { + "epoch": 0.4330171869086467, + "grad_norm": 0.4089113175868988, + "learning_rate": 0.00011341972211752168, + "loss": 1.4412, + "step": 33323 + }, + { + "epoch": 0.4330301814525626, + "grad_norm": 0.4606892168521881, + "learning_rate": 0.00011341712265561028, + "loss": 1.5184, + "step": 33324 + }, + { + "epoch": 0.43304317599647846, + "grad_norm": 0.4504558742046356, + "learning_rate": 0.00011341452319369892, + "loss": 1.3575, + "step": 33325 + }, + { + "epoch": 0.43305617054039436, + "grad_norm": 0.29926320910453796, + "learning_rate": 0.00011341192373178753, + "loss": 1.4309, + "step": 33326 + }, + { + "epoch": 0.4330691650843102, + "grad_norm": 0.45521196722984314, + "learning_rate": 0.00011340932426987614, + "loss": 1.6272, + "step": 33327 + }, + { + "epoch": 0.4330821596282261, + "grad_norm": 0.4077664911746979, + "learning_rate": 0.00011340672480796475, + "loss": 1.2156, + "step": 33328 + }, + { + "epoch": 0.43309515417214195, + "grad_norm": 0.3805752694606781, + "learning_rate": 0.00011340412534605338, + "loss": 1.3498, + "step": 33329 + }, + { + "epoch": 0.43310814871605785, + "grad_norm": 0.4026910066604614, + "learning_rate": 0.00011340152588414199, + "loss": 1.4695, + "step": 33330 + }, + { + "epoch": 0.4331211432599737, + "grad_norm": 0.3863276243209839, + "learning_rate": 0.0001133989264222306, + "loss": 1.4246, + "step": 33331 + }, + { + "epoch": 0.4331341378038896, + "grad_norm": 0.43163633346557617, + "learning_rate": 0.00011339632696031921, + "loss": 1.3903, + "step": 33332 + }, + { + "epoch": 0.43314713234780544, + "grad_norm": 0.3720497786998749, + "learning_rate": 0.00011339372749840785, + "loss": 1.3878, + "step": 33333 + }, + { + "epoch": 0.43316012689172134, + "grad_norm": 0.45112699270248413, + "learning_rate": 0.00011339112803649646, + "loss": 1.3059, + "step": 33334 + }, + { + "epoch": 0.4331731214356372, + "grad_norm": 0.3637961745262146, + "learning_rate": 0.00011338852857458507, + "loss": 1.2886, + "step": 33335 + }, + { + "epoch": 0.4331861159795531, + "grad_norm": 0.4268046021461487, + "learning_rate": 0.00011338592911267368, + "loss": 1.3298, + "step": 33336 + }, + { + "epoch": 0.43319911052346893, + "grad_norm": 0.5041301250457764, + "learning_rate": 0.0001133833296507623, + "loss": 1.48, + "step": 33337 + }, + { + "epoch": 0.43321210506738483, + "grad_norm": 0.4450848698616028, + "learning_rate": 0.00011338073018885091, + "loss": 1.4316, + "step": 33338 + }, + { + "epoch": 0.4332250996113007, + "grad_norm": 0.38326382637023926, + "learning_rate": 0.00011337813072693952, + "loss": 1.3024, + "step": 33339 + }, + { + "epoch": 0.4332380941552166, + "grad_norm": 0.34570571780204773, + "learning_rate": 0.00011337553126502814, + "loss": 1.2976, + "step": 33340 + }, + { + "epoch": 0.4332510886991324, + "grad_norm": 0.44194158911705017, + "learning_rate": 0.00011337293180311676, + "loss": 1.3505, + "step": 33341 + }, + { + "epoch": 0.4332640832430483, + "grad_norm": 0.47158992290496826, + "learning_rate": 0.00011337033234120537, + "loss": 1.2014, + "step": 33342 + }, + { + "epoch": 0.43327707778696417, + "grad_norm": 0.46597978472709656, + "learning_rate": 0.00011336773287929398, + "loss": 1.4681, + "step": 33343 + }, + { + "epoch": 0.43329007233088007, + "grad_norm": 0.40148627758026123, + "learning_rate": 0.00011336513341738259, + "loss": 1.3262, + "step": 33344 + }, + { + "epoch": 0.433303066874796, + "grad_norm": 0.37109822034835815, + "learning_rate": 0.00011336253395547123, + "loss": 1.4103, + "step": 33345 + }, + { + "epoch": 0.4333160614187118, + "grad_norm": 0.40506136417388916, + "learning_rate": 0.00011335993449355984, + "loss": 1.2947, + "step": 33346 + }, + { + "epoch": 0.4333290559626277, + "grad_norm": 0.48681625723838806, + "learning_rate": 0.00011335733503164845, + "loss": 1.4877, + "step": 33347 + }, + { + "epoch": 0.43334205050654356, + "grad_norm": 0.4513232409954071, + "learning_rate": 0.00011335473556973708, + "loss": 1.5822, + "step": 33348 + }, + { + "epoch": 0.43335504505045946, + "grad_norm": 0.34064745903015137, + "learning_rate": 0.00011335213610782569, + "loss": 1.3418, + "step": 33349 + }, + { + "epoch": 0.4333680395943753, + "grad_norm": 0.39339929819107056, + "learning_rate": 0.0001133495366459143, + "loss": 1.2815, + "step": 33350 + }, + { + "epoch": 0.4333810341382912, + "grad_norm": 0.4671693444252014, + "learning_rate": 0.00011334693718400291, + "loss": 1.5291, + "step": 33351 + }, + { + "epoch": 0.43339402868220706, + "grad_norm": 0.3874957859516144, + "learning_rate": 0.00011334433772209155, + "loss": 1.4403, + "step": 33352 + }, + { + "epoch": 0.43340702322612296, + "grad_norm": 0.39423203468322754, + "learning_rate": 0.00011334173826018015, + "loss": 1.2113, + "step": 33353 + }, + { + "epoch": 0.4334200177700388, + "grad_norm": 0.31017324328422546, + "learning_rate": 0.00011333913879826876, + "loss": 1.4776, + "step": 33354 + }, + { + "epoch": 0.4334330123139547, + "grad_norm": 0.4219195246696472, + "learning_rate": 0.00011333653933635737, + "loss": 1.4052, + "step": 33355 + }, + { + "epoch": 0.43344600685787055, + "grad_norm": 0.26179036498069763, + "learning_rate": 0.000113333939874446, + "loss": 1.1457, + "step": 33356 + }, + { + "epoch": 0.43345900140178645, + "grad_norm": 0.3898164629936218, + "learning_rate": 0.00011333134041253462, + "loss": 1.2837, + "step": 33357 + }, + { + "epoch": 0.4334719959457023, + "grad_norm": 0.4827081263065338, + "learning_rate": 0.00011332874095062323, + "loss": 1.3817, + "step": 33358 + }, + { + "epoch": 0.4334849904896182, + "grad_norm": 0.3659784495830536, + "learning_rate": 0.00011332614148871184, + "loss": 1.4893, + "step": 33359 + }, + { + "epoch": 0.43349798503353404, + "grad_norm": 0.44223302602767944, + "learning_rate": 0.00011332354202680046, + "loss": 1.4722, + "step": 33360 + }, + { + "epoch": 0.43351097957744994, + "grad_norm": 0.42899152636528015, + "learning_rate": 0.00011332094256488907, + "loss": 1.6069, + "step": 33361 + }, + { + "epoch": 0.4335239741213658, + "grad_norm": 0.4769567847251892, + "learning_rate": 0.00011331834310297768, + "loss": 1.5355, + "step": 33362 + }, + { + "epoch": 0.4335369686652817, + "grad_norm": 0.36739885807037354, + "learning_rate": 0.0001133157436410663, + "loss": 1.2098, + "step": 33363 + }, + { + "epoch": 0.43354996320919753, + "grad_norm": 0.38264384865760803, + "learning_rate": 0.00011331314417915493, + "loss": 1.3599, + "step": 33364 + }, + { + "epoch": 0.43356295775311343, + "grad_norm": 0.48409056663513184, + "learning_rate": 0.00011331054471724354, + "loss": 1.4006, + "step": 33365 + }, + { + "epoch": 0.4335759522970293, + "grad_norm": 0.4064169228076935, + "learning_rate": 0.00011330794525533214, + "loss": 1.3664, + "step": 33366 + }, + { + "epoch": 0.4335889468409452, + "grad_norm": 0.3536081314086914, + "learning_rate": 0.00011330534579342075, + "loss": 1.3164, + "step": 33367 + }, + { + "epoch": 0.433601941384861, + "grad_norm": 0.3633044362068176, + "learning_rate": 0.00011330274633150939, + "loss": 1.3938, + "step": 33368 + }, + { + "epoch": 0.4336149359287769, + "grad_norm": 0.39726391434669495, + "learning_rate": 0.000113300146869598, + "loss": 1.2013, + "step": 33369 + }, + { + "epoch": 0.43362793047269277, + "grad_norm": 0.4329591393470764, + "learning_rate": 0.00011329754740768661, + "loss": 1.3728, + "step": 33370 + }, + { + "epoch": 0.43364092501660867, + "grad_norm": 0.48028022050857544, + "learning_rate": 0.00011329494794577522, + "loss": 1.4975, + "step": 33371 + }, + { + "epoch": 0.4336539195605245, + "grad_norm": 0.5066609382629395, + "learning_rate": 0.00011329234848386385, + "loss": 1.4266, + "step": 33372 + }, + { + "epoch": 0.4336669141044404, + "grad_norm": 0.4257539212703705, + "learning_rate": 0.00011328974902195246, + "loss": 1.5235, + "step": 33373 + }, + { + "epoch": 0.43367990864835626, + "grad_norm": 0.39514589309692383, + "learning_rate": 0.00011328714956004107, + "loss": 1.3934, + "step": 33374 + }, + { + "epoch": 0.43369290319227216, + "grad_norm": 0.41533178091049194, + "learning_rate": 0.00011328455009812968, + "loss": 1.4558, + "step": 33375 + }, + { + "epoch": 0.433705897736188, + "grad_norm": 0.3777025640010834, + "learning_rate": 0.00011328195063621832, + "loss": 1.3505, + "step": 33376 + }, + { + "epoch": 0.4337188922801039, + "grad_norm": 0.48106345534324646, + "learning_rate": 0.00011327935117430693, + "loss": 1.5528, + "step": 33377 + }, + { + "epoch": 0.43373188682401975, + "grad_norm": 0.41469866037368774, + "learning_rate": 0.00011327675171239554, + "loss": 1.2378, + "step": 33378 + }, + { + "epoch": 0.43374488136793565, + "grad_norm": 0.37692227959632874, + "learning_rate": 0.00011327415225048414, + "loss": 1.4141, + "step": 33379 + }, + { + "epoch": 0.4337578759118515, + "grad_norm": 0.4641399681568146, + "learning_rate": 0.00011327155278857278, + "loss": 1.4116, + "step": 33380 + }, + { + "epoch": 0.4337708704557674, + "grad_norm": 0.22812844812870026, + "learning_rate": 0.00011326895332666139, + "loss": 1.2506, + "step": 33381 + }, + { + "epoch": 0.43378386499968324, + "grad_norm": 0.31077444553375244, + "learning_rate": 0.00011326635386475, + "loss": 1.399, + "step": 33382 + }, + { + "epoch": 0.43379685954359914, + "grad_norm": 0.4887525141239166, + "learning_rate": 0.00011326375440283862, + "loss": 1.4804, + "step": 33383 + }, + { + "epoch": 0.433809854087515, + "grad_norm": 0.43618637323379517, + "learning_rate": 0.00011326115494092723, + "loss": 1.5407, + "step": 33384 + }, + { + "epoch": 0.4338228486314309, + "grad_norm": 0.3965262472629547, + "learning_rate": 0.00011325855547901584, + "loss": 1.3818, + "step": 33385 + }, + { + "epoch": 0.43383584317534674, + "grad_norm": 0.3204534351825714, + "learning_rate": 0.00011325595601710446, + "loss": 1.2097, + "step": 33386 + }, + { + "epoch": 0.43384883771926264, + "grad_norm": 0.33079731464385986, + "learning_rate": 0.0001132533565551931, + "loss": 1.3774, + "step": 33387 + }, + { + "epoch": 0.4338618322631785, + "grad_norm": 0.34844499826431274, + "learning_rate": 0.0001132507570932817, + "loss": 1.1496, + "step": 33388 + }, + { + "epoch": 0.4338748268070944, + "grad_norm": 0.40162885189056396, + "learning_rate": 0.00011324815763137032, + "loss": 1.2784, + "step": 33389 + }, + { + "epoch": 0.4338878213510102, + "grad_norm": 0.47484856843948364, + "learning_rate": 0.00011324555816945893, + "loss": 1.3759, + "step": 33390 + }, + { + "epoch": 0.43390081589492613, + "grad_norm": 0.38232091069221497, + "learning_rate": 0.00011324295870754755, + "loss": 1.3834, + "step": 33391 + }, + { + "epoch": 0.433913810438842, + "grad_norm": 0.4098871350288391, + "learning_rate": 0.00011324035924563616, + "loss": 1.3013, + "step": 33392 + }, + { + "epoch": 0.4339268049827579, + "grad_norm": 0.44997867941856384, + "learning_rate": 0.00011323775978372477, + "loss": 1.4015, + "step": 33393 + }, + { + "epoch": 0.4339397995266737, + "grad_norm": 0.40890321135520935, + "learning_rate": 0.00011323516032181338, + "loss": 1.5688, + "step": 33394 + }, + { + "epoch": 0.4339527940705896, + "grad_norm": 0.4250136911869049, + "learning_rate": 0.00011323256085990201, + "loss": 1.407, + "step": 33395 + }, + { + "epoch": 0.43396578861450547, + "grad_norm": 0.4520784020423889, + "learning_rate": 0.00011322996139799062, + "loss": 1.3771, + "step": 33396 + }, + { + "epoch": 0.43397878315842137, + "grad_norm": 0.3191169500350952, + "learning_rate": 0.00011322736193607923, + "loss": 1.1592, + "step": 33397 + }, + { + "epoch": 0.4339917777023372, + "grad_norm": 0.32721102237701416, + "learning_rate": 0.00011322476247416784, + "loss": 1.1837, + "step": 33398 + }, + { + "epoch": 0.4340047722462531, + "grad_norm": 0.43327850103378296, + "learning_rate": 0.00011322216301225648, + "loss": 1.3023, + "step": 33399 + }, + { + "epoch": 0.43401776679016896, + "grad_norm": 0.48186078667640686, + "learning_rate": 0.00011321956355034509, + "loss": 1.4413, + "step": 33400 + }, + { + "epoch": 0.43403076133408486, + "grad_norm": 0.4613114595413208, + "learning_rate": 0.0001132169640884337, + "loss": 1.4906, + "step": 33401 + }, + { + "epoch": 0.4340437558780007, + "grad_norm": 0.4332413375377655, + "learning_rate": 0.00011321436462652231, + "loss": 1.3221, + "step": 33402 + }, + { + "epoch": 0.4340567504219166, + "grad_norm": 0.3787354826927185, + "learning_rate": 0.00011321176516461094, + "loss": 1.2435, + "step": 33403 + }, + { + "epoch": 0.43406974496583245, + "grad_norm": 0.3831295073032379, + "learning_rate": 0.00011320916570269955, + "loss": 1.299, + "step": 33404 + }, + { + "epoch": 0.43408273950974835, + "grad_norm": 0.38606923818588257, + "learning_rate": 0.00011320656624078816, + "loss": 1.537, + "step": 33405 + }, + { + "epoch": 0.4340957340536642, + "grad_norm": 0.4439369738101959, + "learning_rate": 0.00011320396677887677, + "loss": 1.3845, + "step": 33406 + }, + { + "epoch": 0.4341087285975801, + "grad_norm": 0.42838701605796814, + "learning_rate": 0.00011320136731696541, + "loss": 1.4798, + "step": 33407 + }, + { + "epoch": 0.43412172314149594, + "grad_norm": 0.3909963369369507, + "learning_rate": 0.000113198767855054, + "loss": 1.2312, + "step": 33408 + }, + { + "epoch": 0.43413471768541184, + "grad_norm": 0.407919704914093, + "learning_rate": 0.00011319616839314262, + "loss": 1.3801, + "step": 33409 + }, + { + "epoch": 0.4341477122293277, + "grad_norm": 0.44364839792251587, + "learning_rate": 0.00011319356893123123, + "loss": 1.387, + "step": 33410 + }, + { + "epoch": 0.4341607067732436, + "grad_norm": 0.4279322028160095, + "learning_rate": 0.00011319096946931986, + "loss": 1.3925, + "step": 33411 + }, + { + "epoch": 0.43417370131715943, + "grad_norm": 0.4127967059612274, + "learning_rate": 0.00011318837000740848, + "loss": 1.3729, + "step": 33412 + }, + { + "epoch": 0.43418669586107533, + "grad_norm": 0.47077178955078125, + "learning_rate": 0.00011318577054549709, + "loss": 1.464, + "step": 33413 + }, + { + "epoch": 0.4341996904049912, + "grad_norm": 0.3866066038608551, + "learning_rate": 0.0001131831710835857, + "loss": 1.3356, + "step": 33414 + }, + { + "epoch": 0.4342126849489071, + "grad_norm": 0.3365674614906311, + "learning_rate": 0.00011318057162167432, + "loss": 1.5151, + "step": 33415 + }, + { + "epoch": 0.4342256794928229, + "grad_norm": 0.41441819071769714, + "learning_rate": 0.00011317797215976293, + "loss": 1.5264, + "step": 33416 + }, + { + "epoch": 0.4342386740367388, + "grad_norm": 0.48482784628868103, + "learning_rate": 0.00011317537269785154, + "loss": 1.5655, + "step": 33417 + }, + { + "epoch": 0.43425166858065467, + "grad_norm": 0.4661523997783661, + "learning_rate": 0.00011317277323594015, + "loss": 1.4614, + "step": 33418 + }, + { + "epoch": 0.43426466312457057, + "grad_norm": 0.3824726343154907, + "learning_rate": 0.00011317017377402879, + "loss": 1.2984, + "step": 33419 + }, + { + "epoch": 0.43427765766848647, + "grad_norm": 0.4088524580001831, + "learning_rate": 0.0001131675743121174, + "loss": 1.469, + "step": 33420 + }, + { + "epoch": 0.4342906522124023, + "grad_norm": 0.4773986041545868, + "learning_rate": 0.000113164974850206, + "loss": 1.4669, + "step": 33421 + }, + { + "epoch": 0.4343036467563182, + "grad_norm": 0.4116101861000061, + "learning_rate": 0.00011316237538829464, + "loss": 1.4242, + "step": 33422 + }, + { + "epoch": 0.43431664130023406, + "grad_norm": 0.38664594292640686, + "learning_rate": 0.00011315977592638325, + "loss": 1.2727, + "step": 33423 + }, + { + "epoch": 0.43432963584414996, + "grad_norm": 0.41536426544189453, + "learning_rate": 0.00011315717646447186, + "loss": 1.448, + "step": 33424 + }, + { + "epoch": 0.4343426303880658, + "grad_norm": 0.34537506103515625, + "learning_rate": 0.00011315457700256047, + "loss": 1.4336, + "step": 33425 + }, + { + "epoch": 0.4343556249319817, + "grad_norm": 0.41290032863616943, + "learning_rate": 0.0001131519775406491, + "loss": 1.6197, + "step": 33426 + }, + { + "epoch": 0.43436861947589755, + "grad_norm": 0.45269858837127686, + "learning_rate": 0.00011314937807873771, + "loss": 1.3998, + "step": 33427 + }, + { + "epoch": 0.43438161401981346, + "grad_norm": 0.435006320476532, + "learning_rate": 0.00011314677861682632, + "loss": 1.3104, + "step": 33428 + }, + { + "epoch": 0.4343946085637293, + "grad_norm": 0.3992103040218353, + "learning_rate": 0.00011314417915491493, + "loss": 1.4176, + "step": 33429 + }, + { + "epoch": 0.4344076031076452, + "grad_norm": 0.3852788209915161, + "learning_rate": 0.00011314157969300357, + "loss": 1.2839, + "step": 33430 + }, + { + "epoch": 0.43442059765156105, + "grad_norm": 0.47402387857437134, + "learning_rate": 0.00011313898023109218, + "loss": 1.3537, + "step": 33431 + }, + { + "epoch": 0.43443359219547695, + "grad_norm": 0.3946602940559387, + "learning_rate": 0.00011313638076918079, + "loss": 1.2797, + "step": 33432 + }, + { + "epoch": 0.4344465867393928, + "grad_norm": 0.4738875925540924, + "learning_rate": 0.0001131337813072694, + "loss": 1.4255, + "step": 33433 + }, + { + "epoch": 0.4344595812833087, + "grad_norm": 0.34690219163894653, + "learning_rate": 0.00011313118184535802, + "loss": 1.2765, + "step": 33434 + }, + { + "epoch": 0.43447257582722454, + "grad_norm": 0.43955838680267334, + "learning_rate": 0.00011312858238344664, + "loss": 1.401, + "step": 33435 + }, + { + "epoch": 0.43448557037114044, + "grad_norm": 0.4817427396774292, + "learning_rate": 0.00011312598292153525, + "loss": 1.5673, + "step": 33436 + }, + { + "epoch": 0.4344985649150563, + "grad_norm": 0.3731817305088043, + "learning_rate": 0.00011312338345962386, + "loss": 1.3954, + "step": 33437 + }, + { + "epoch": 0.4345115594589722, + "grad_norm": 0.4648199677467346, + "learning_rate": 0.00011312078399771248, + "loss": 1.4883, + "step": 33438 + }, + { + "epoch": 0.43452455400288803, + "grad_norm": 0.40364593267440796, + "learning_rate": 0.00011311818453580109, + "loss": 1.2678, + "step": 33439 + }, + { + "epoch": 0.43453754854680393, + "grad_norm": 0.43110036849975586, + "learning_rate": 0.0001131155850738897, + "loss": 1.534, + "step": 33440 + }, + { + "epoch": 0.4345505430907198, + "grad_norm": 0.3076140880584717, + "learning_rate": 0.00011311298561197831, + "loss": 1.2552, + "step": 33441 + }, + { + "epoch": 0.4345635376346357, + "grad_norm": 0.4156789779663086, + "learning_rate": 0.00011311038615006695, + "loss": 1.4134, + "step": 33442 + }, + { + "epoch": 0.4345765321785515, + "grad_norm": 0.35401976108551025, + "learning_rate": 0.00011310778668815556, + "loss": 1.3956, + "step": 33443 + }, + { + "epoch": 0.4345895267224674, + "grad_norm": 0.43652963638305664, + "learning_rate": 0.00011310518722624417, + "loss": 1.3795, + "step": 33444 + }, + { + "epoch": 0.43460252126638327, + "grad_norm": 0.6564937233924866, + "learning_rate": 0.00011310258776433279, + "loss": 1.3569, + "step": 33445 + }, + { + "epoch": 0.43461551581029917, + "grad_norm": 0.49574828147888184, + "learning_rate": 0.00011309998830242141, + "loss": 1.4067, + "step": 33446 + }, + { + "epoch": 0.434628510354215, + "grad_norm": 0.3732493817806244, + "learning_rate": 0.00011309738884051002, + "loss": 1.3411, + "step": 33447 + }, + { + "epoch": 0.4346415048981309, + "grad_norm": 0.41613703966140747, + "learning_rate": 0.00011309478937859863, + "loss": 1.4204, + "step": 33448 + }, + { + "epoch": 0.43465449944204676, + "grad_norm": 0.35996541380882263, + "learning_rate": 0.00011309218991668724, + "loss": 1.4027, + "step": 33449 + }, + { + "epoch": 0.43466749398596266, + "grad_norm": 0.3963604271411896, + "learning_rate": 0.00011308959045477587, + "loss": 1.3183, + "step": 33450 + }, + { + "epoch": 0.4346804885298785, + "grad_norm": 0.46713197231292725, + "learning_rate": 0.00011308699099286448, + "loss": 1.5591, + "step": 33451 + }, + { + "epoch": 0.4346934830737944, + "grad_norm": 0.4088074564933777, + "learning_rate": 0.00011308439153095309, + "loss": 1.3804, + "step": 33452 + }, + { + "epoch": 0.43470647761771025, + "grad_norm": 0.4444734752178192, + "learning_rate": 0.0001130817920690417, + "loss": 1.2932, + "step": 33453 + }, + { + "epoch": 0.43471947216162615, + "grad_norm": 0.4183602035045624, + "learning_rate": 0.00011307919260713034, + "loss": 1.3085, + "step": 33454 + }, + { + "epoch": 0.434732466705542, + "grad_norm": 0.38002219796180725, + "learning_rate": 0.00011307659314521895, + "loss": 1.2622, + "step": 33455 + }, + { + "epoch": 0.4347454612494579, + "grad_norm": 0.4457433819770813, + "learning_rate": 0.00011307399368330756, + "loss": 1.2834, + "step": 33456 + }, + { + "epoch": 0.43475845579337374, + "grad_norm": 0.40768909454345703, + "learning_rate": 0.00011307139422139618, + "loss": 1.2711, + "step": 33457 + }, + { + "epoch": 0.43477145033728964, + "grad_norm": 0.4220108091831207, + "learning_rate": 0.0001130687947594848, + "loss": 1.3668, + "step": 33458 + }, + { + "epoch": 0.4347844448812055, + "grad_norm": 0.3860601484775543, + "learning_rate": 0.0001130661952975734, + "loss": 1.4977, + "step": 33459 + }, + { + "epoch": 0.4347974394251214, + "grad_norm": 0.4006190299987793, + "learning_rate": 0.00011306359583566202, + "loss": 1.4083, + "step": 33460 + }, + { + "epoch": 0.43481043396903724, + "grad_norm": 0.2855985164642334, + "learning_rate": 0.00011306099637375065, + "loss": 1.437, + "step": 33461 + }, + { + "epoch": 0.43482342851295314, + "grad_norm": 0.47407209873199463, + "learning_rate": 0.00011305839691183927, + "loss": 1.5727, + "step": 33462 + }, + { + "epoch": 0.434836423056869, + "grad_norm": 0.46544012427330017, + "learning_rate": 0.00011305579744992786, + "loss": 1.4005, + "step": 33463 + }, + { + "epoch": 0.4348494176007849, + "grad_norm": 0.418704628944397, + "learning_rate": 0.00011305319798801647, + "loss": 1.3765, + "step": 33464 + }, + { + "epoch": 0.4348624121447007, + "grad_norm": 0.3312271237373352, + "learning_rate": 0.00011305059852610511, + "loss": 1.1666, + "step": 33465 + }, + { + "epoch": 0.43487540668861663, + "grad_norm": 0.34655091166496277, + "learning_rate": 0.00011304799906419372, + "loss": 1.1152, + "step": 33466 + }, + { + "epoch": 0.4348884012325325, + "grad_norm": 0.4476008713245392, + "learning_rate": 0.00011304539960228233, + "loss": 1.5592, + "step": 33467 + }, + { + "epoch": 0.4349013957764484, + "grad_norm": 0.4199071228504181, + "learning_rate": 0.00011304280014037095, + "loss": 1.2558, + "step": 33468 + }, + { + "epoch": 0.4349143903203642, + "grad_norm": 0.42124468088150024, + "learning_rate": 0.00011304020067845957, + "loss": 1.3893, + "step": 33469 + }, + { + "epoch": 0.4349273848642801, + "grad_norm": 0.42469683289527893, + "learning_rate": 0.00011303760121654818, + "loss": 1.2345, + "step": 33470 + }, + { + "epoch": 0.43494037940819597, + "grad_norm": 0.4101039469242096, + "learning_rate": 0.00011303500175463679, + "loss": 1.5741, + "step": 33471 + }, + { + "epoch": 0.43495337395211187, + "grad_norm": 0.45337605476379395, + "learning_rate": 0.0001130324022927254, + "loss": 1.5221, + "step": 33472 + }, + { + "epoch": 0.4349663684960277, + "grad_norm": 0.49228453636169434, + "learning_rate": 0.00011302980283081404, + "loss": 1.3706, + "step": 33473 + }, + { + "epoch": 0.4349793630399436, + "grad_norm": 0.3609216511249542, + "learning_rate": 0.00011302720336890265, + "loss": 1.4534, + "step": 33474 + }, + { + "epoch": 0.43499235758385946, + "grad_norm": 0.3879944682121277, + "learning_rate": 0.00011302460390699126, + "loss": 1.3322, + "step": 33475 + }, + { + "epoch": 0.43500535212777536, + "grad_norm": 0.4508839249610901, + "learning_rate": 0.00011302200444507986, + "loss": 1.2471, + "step": 33476 + }, + { + "epoch": 0.4350183466716912, + "grad_norm": 0.4435296356678009, + "learning_rate": 0.0001130194049831685, + "loss": 1.2675, + "step": 33477 + }, + { + "epoch": 0.4350313412156071, + "grad_norm": 0.4883398711681366, + "learning_rate": 0.00011301680552125711, + "loss": 1.4605, + "step": 33478 + }, + { + "epoch": 0.43504433575952295, + "grad_norm": 0.4156615436077118, + "learning_rate": 0.00011301420605934572, + "loss": 1.2154, + "step": 33479 + }, + { + "epoch": 0.43505733030343885, + "grad_norm": 0.45453551411628723, + "learning_rate": 0.00011301160659743433, + "loss": 1.2391, + "step": 33480 + }, + { + "epoch": 0.4350703248473547, + "grad_norm": 0.33524975180625916, + "learning_rate": 0.00011300900713552295, + "loss": 1.3147, + "step": 33481 + }, + { + "epoch": 0.4350833193912706, + "grad_norm": 0.3049945831298828, + "learning_rate": 0.00011300640767361157, + "loss": 1.4444, + "step": 33482 + }, + { + "epoch": 0.43509631393518644, + "grad_norm": 0.40221959352493286, + "learning_rate": 0.00011300380821170018, + "loss": 1.4626, + "step": 33483 + }, + { + "epoch": 0.43510930847910234, + "grad_norm": 0.3802822232246399, + "learning_rate": 0.00011300120874978879, + "loss": 1.3301, + "step": 33484 + }, + { + "epoch": 0.4351223030230182, + "grad_norm": 0.39566537737846375, + "learning_rate": 0.00011299860928787743, + "loss": 1.3197, + "step": 33485 + }, + { + "epoch": 0.4351352975669341, + "grad_norm": 0.33191347122192383, + "learning_rate": 0.00011299600982596604, + "loss": 1.3181, + "step": 33486 + }, + { + "epoch": 0.43514829211084993, + "grad_norm": 0.3309651017189026, + "learning_rate": 0.00011299341036405465, + "loss": 1.5071, + "step": 33487 + }, + { + "epoch": 0.43516128665476583, + "grad_norm": 0.33770081400871277, + "learning_rate": 0.00011299081090214324, + "loss": 1.3857, + "step": 33488 + }, + { + "epoch": 0.4351742811986817, + "grad_norm": 0.4571974575519562, + "learning_rate": 0.00011298821144023188, + "loss": 1.3903, + "step": 33489 + }, + { + "epoch": 0.4351872757425976, + "grad_norm": 0.406531423330307, + "learning_rate": 0.0001129856119783205, + "loss": 1.4535, + "step": 33490 + }, + { + "epoch": 0.4352002702865134, + "grad_norm": 0.48849910497665405, + "learning_rate": 0.0001129830125164091, + "loss": 1.4285, + "step": 33491 + }, + { + "epoch": 0.4352132648304293, + "grad_norm": 0.36898303031921387, + "learning_rate": 0.00011298041305449772, + "loss": 1.1151, + "step": 33492 + }, + { + "epoch": 0.43522625937434517, + "grad_norm": 0.36265623569488525, + "learning_rate": 0.00011297781359258634, + "loss": 1.449, + "step": 33493 + }, + { + "epoch": 0.43523925391826107, + "grad_norm": 0.40674179792404175, + "learning_rate": 0.00011297521413067495, + "loss": 1.2248, + "step": 33494 + }, + { + "epoch": 0.4352522484621769, + "grad_norm": 0.46880534291267395, + "learning_rate": 0.00011297261466876356, + "loss": 1.4557, + "step": 33495 + }, + { + "epoch": 0.4352652430060928, + "grad_norm": 0.3855893909931183, + "learning_rate": 0.0001129700152068522, + "loss": 1.2897, + "step": 33496 + }, + { + "epoch": 0.4352782375500087, + "grad_norm": 0.3856970965862274, + "learning_rate": 0.00011296741574494081, + "loss": 1.3147, + "step": 33497 + }, + { + "epoch": 0.43529123209392456, + "grad_norm": 0.4600497782230377, + "learning_rate": 0.00011296481628302942, + "loss": 1.643, + "step": 33498 + }, + { + "epoch": 0.43530422663784046, + "grad_norm": 0.30839085578918457, + "learning_rate": 0.00011296221682111803, + "loss": 1.3637, + "step": 33499 + }, + { + "epoch": 0.4353172211817563, + "grad_norm": 0.38388967514038086, + "learning_rate": 0.00011295961735920666, + "loss": 1.4948, + "step": 33500 + }, + { + "epoch": 0.4353302157256722, + "grad_norm": 0.27634376287460327, + "learning_rate": 0.00011295701789729527, + "loss": 1.3606, + "step": 33501 + }, + { + "epoch": 0.43534321026958805, + "grad_norm": 0.6087124943733215, + "learning_rate": 0.00011295441843538388, + "loss": 1.2832, + "step": 33502 + }, + { + "epoch": 0.43535620481350396, + "grad_norm": 0.43419352173805237, + "learning_rate": 0.00011295181897347249, + "loss": 1.3046, + "step": 33503 + }, + { + "epoch": 0.4353691993574198, + "grad_norm": 0.7712401747703552, + "learning_rate": 0.00011294921951156113, + "loss": 1.4573, + "step": 33504 + }, + { + "epoch": 0.4353821939013357, + "grad_norm": 0.36036282777786255, + "learning_rate": 0.00011294662004964973, + "loss": 1.4464, + "step": 33505 + }, + { + "epoch": 0.43539518844525155, + "grad_norm": 0.6549562811851501, + "learning_rate": 0.00011294402058773834, + "loss": 1.5476, + "step": 33506 + }, + { + "epoch": 0.43540818298916745, + "grad_norm": 0.29075387120246887, + "learning_rate": 0.00011294142112582695, + "loss": 1.5202, + "step": 33507 + }, + { + "epoch": 0.4354211775330833, + "grad_norm": 0.4662638306617737, + "learning_rate": 0.00011293882166391559, + "loss": 1.4401, + "step": 33508 + }, + { + "epoch": 0.4354341720769992, + "grad_norm": 0.3522166609764099, + "learning_rate": 0.0001129362222020042, + "loss": 1.3283, + "step": 33509 + }, + { + "epoch": 0.43544716662091504, + "grad_norm": 0.49034950137138367, + "learning_rate": 0.00011293362274009281, + "loss": 1.4127, + "step": 33510 + }, + { + "epoch": 0.43546016116483094, + "grad_norm": 0.3325579762458801, + "learning_rate": 0.00011293102327818142, + "loss": 1.3642, + "step": 33511 + }, + { + "epoch": 0.4354731557087468, + "grad_norm": 0.364096075296402, + "learning_rate": 0.00011292842381627004, + "loss": 1.4818, + "step": 33512 + }, + { + "epoch": 0.4354861502526627, + "grad_norm": 0.42379266023635864, + "learning_rate": 0.00011292582435435865, + "loss": 1.4459, + "step": 33513 + }, + { + "epoch": 0.43549914479657853, + "grad_norm": 0.4427761733531952, + "learning_rate": 0.00011292322489244726, + "loss": 1.5096, + "step": 33514 + }, + { + "epoch": 0.43551213934049443, + "grad_norm": 0.4284376800060272, + "learning_rate": 0.00011292062543053588, + "loss": 1.5818, + "step": 33515 + }, + { + "epoch": 0.4355251338844103, + "grad_norm": 0.4817059338092804, + "learning_rate": 0.00011291802596862451, + "loss": 1.3576, + "step": 33516 + }, + { + "epoch": 0.4355381284283262, + "grad_norm": 0.37710994482040405, + "learning_rate": 0.00011291542650671311, + "loss": 1.4815, + "step": 33517 + }, + { + "epoch": 0.435551122972242, + "grad_norm": 0.3414710462093353, + "learning_rate": 0.00011291282704480172, + "loss": 1.3065, + "step": 33518 + }, + { + "epoch": 0.4355641175161579, + "grad_norm": 0.3931276798248291, + "learning_rate": 0.00011291022758289033, + "loss": 1.3274, + "step": 33519 + }, + { + "epoch": 0.43557711206007377, + "grad_norm": 0.43264785408973694, + "learning_rate": 0.00011290762812097897, + "loss": 1.4062, + "step": 33520 + }, + { + "epoch": 0.43559010660398967, + "grad_norm": 0.3304761052131653, + "learning_rate": 0.00011290502865906758, + "loss": 1.1919, + "step": 33521 + }, + { + "epoch": 0.4356031011479055, + "grad_norm": 0.38635075092315674, + "learning_rate": 0.00011290242919715619, + "loss": 1.365, + "step": 33522 + }, + { + "epoch": 0.4356160956918214, + "grad_norm": 0.3684718906879425, + "learning_rate": 0.0001128998297352448, + "loss": 1.594, + "step": 33523 + }, + { + "epoch": 0.43562909023573726, + "grad_norm": 0.416511207818985, + "learning_rate": 0.00011289723027333343, + "loss": 1.442, + "step": 33524 + }, + { + "epoch": 0.43564208477965316, + "grad_norm": 0.4223576784133911, + "learning_rate": 0.00011289463081142204, + "loss": 1.578, + "step": 33525 + }, + { + "epoch": 0.435655079323569, + "grad_norm": 0.365560382604599, + "learning_rate": 0.00011289203134951065, + "loss": 1.2924, + "step": 33526 + }, + { + "epoch": 0.4356680738674849, + "grad_norm": 0.36633387207984924, + "learning_rate": 0.00011288943188759926, + "loss": 1.2276, + "step": 33527 + }, + { + "epoch": 0.43568106841140075, + "grad_norm": 0.4006326496601105, + "learning_rate": 0.0001128868324256879, + "loss": 1.34, + "step": 33528 + }, + { + "epoch": 0.43569406295531665, + "grad_norm": 0.45010441541671753, + "learning_rate": 0.00011288423296377651, + "loss": 1.3851, + "step": 33529 + }, + { + "epoch": 0.4357070574992325, + "grad_norm": 0.3831198215484619, + "learning_rate": 0.00011288163350186511, + "loss": 1.3443, + "step": 33530 + }, + { + "epoch": 0.4357200520431484, + "grad_norm": 0.4434194564819336, + "learning_rate": 0.00011287903403995375, + "loss": 1.5173, + "step": 33531 + }, + { + "epoch": 0.43573304658706424, + "grad_norm": 0.38336092233657837, + "learning_rate": 0.00011287643457804236, + "loss": 1.5005, + "step": 33532 + }, + { + "epoch": 0.43574604113098014, + "grad_norm": 0.32698726654052734, + "learning_rate": 0.00011287383511613097, + "loss": 1.3211, + "step": 33533 + }, + { + "epoch": 0.435759035674896, + "grad_norm": 0.4255834221839905, + "learning_rate": 0.00011287123565421958, + "loss": 1.4003, + "step": 33534 + }, + { + "epoch": 0.4357720302188119, + "grad_norm": 0.42821648716926575, + "learning_rate": 0.0001128686361923082, + "loss": 1.3387, + "step": 33535 + }, + { + "epoch": 0.43578502476272774, + "grad_norm": 0.4874619245529175, + "learning_rate": 0.00011286603673039681, + "loss": 1.3532, + "step": 33536 + }, + { + "epoch": 0.43579801930664364, + "grad_norm": 0.4532897472381592, + "learning_rate": 0.00011286343726848542, + "loss": 1.3817, + "step": 33537 + }, + { + "epoch": 0.4358110138505595, + "grad_norm": 0.4371196925640106, + "learning_rate": 0.00011286083780657404, + "loss": 1.528, + "step": 33538 + }, + { + "epoch": 0.4358240083944754, + "grad_norm": 0.3872183561325073, + "learning_rate": 0.00011285823834466267, + "loss": 1.4688, + "step": 33539 + }, + { + "epoch": 0.4358370029383912, + "grad_norm": 0.3467790186405182, + "learning_rate": 0.00011285563888275128, + "loss": 1.4403, + "step": 33540 + }, + { + "epoch": 0.4358499974823071, + "grad_norm": 0.3333219885826111, + "learning_rate": 0.0001128530394208399, + "loss": 1.3376, + "step": 33541 + }, + { + "epoch": 0.435862992026223, + "grad_norm": 0.3824302852153778, + "learning_rate": 0.0001128504399589285, + "loss": 1.3371, + "step": 33542 + }, + { + "epoch": 0.4358759865701389, + "grad_norm": 0.39576634764671326, + "learning_rate": 0.00011284784049701713, + "loss": 1.5762, + "step": 33543 + }, + { + "epoch": 0.4358889811140547, + "grad_norm": 0.3787519931793213, + "learning_rate": 0.00011284524103510574, + "loss": 1.3393, + "step": 33544 + }, + { + "epoch": 0.4359019756579706, + "grad_norm": 0.36629825830459595, + "learning_rate": 0.00011284264157319435, + "loss": 1.3414, + "step": 33545 + }, + { + "epoch": 0.43591497020188646, + "grad_norm": 0.4001936614513397, + "learning_rate": 0.00011284004211128296, + "loss": 1.5644, + "step": 33546 + }, + { + "epoch": 0.43592796474580237, + "grad_norm": 0.4340262711048126, + "learning_rate": 0.00011283744264937159, + "loss": 1.3626, + "step": 33547 + }, + { + "epoch": 0.4359409592897182, + "grad_norm": 0.4547877609729767, + "learning_rate": 0.0001128348431874602, + "loss": 1.3608, + "step": 33548 + }, + { + "epoch": 0.4359539538336341, + "grad_norm": 0.38235533237457275, + "learning_rate": 0.00011283224372554881, + "loss": 1.3387, + "step": 33549 + }, + { + "epoch": 0.43596694837754996, + "grad_norm": 0.2960379123687744, + "learning_rate": 0.00011282964426363742, + "loss": 1.1271, + "step": 33550 + }, + { + "epoch": 0.43597994292146586, + "grad_norm": 0.39158883690834045, + "learning_rate": 0.00011282704480172606, + "loss": 1.5222, + "step": 33551 + }, + { + "epoch": 0.4359929374653817, + "grad_norm": 0.3735707700252533, + "learning_rate": 0.00011282444533981467, + "loss": 1.4341, + "step": 33552 + }, + { + "epoch": 0.4360059320092976, + "grad_norm": 0.40538403391838074, + "learning_rate": 0.00011282184587790328, + "loss": 1.38, + "step": 33553 + }, + { + "epoch": 0.43601892655321345, + "grad_norm": 0.39453834295272827, + "learning_rate": 0.00011281924641599189, + "loss": 1.5268, + "step": 33554 + }, + { + "epoch": 0.43603192109712935, + "grad_norm": 0.36798128485679626, + "learning_rate": 0.00011281664695408052, + "loss": 1.1918, + "step": 33555 + }, + { + "epoch": 0.4360449156410452, + "grad_norm": 0.4932467043399811, + "learning_rate": 0.00011281404749216913, + "loss": 1.6268, + "step": 33556 + }, + { + "epoch": 0.4360579101849611, + "grad_norm": 0.532631516456604, + "learning_rate": 0.00011281144803025774, + "loss": 1.6405, + "step": 33557 + }, + { + "epoch": 0.43607090472887694, + "grad_norm": 0.40453746914863586, + "learning_rate": 0.00011280884856834635, + "loss": 1.2374, + "step": 33558 + }, + { + "epoch": 0.43608389927279284, + "grad_norm": 0.5885547399520874, + "learning_rate": 0.00011280624910643497, + "loss": 1.5621, + "step": 33559 + }, + { + "epoch": 0.4360968938167087, + "grad_norm": 0.4040106236934662, + "learning_rate": 0.00011280364964452358, + "loss": 1.4886, + "step": 33560 + }, + { + "epoch": 0.4361098883606246, + "grad_norm": 0.4366855323314667, + "learning_rate": 0.0001128010501826122, + "loss": 1.4403, + "step": 33561 + }, + { + "epoch": 0.43612288290454043, + "grad_norm": 0.29536354541778564, + "learning_rate": 0.0001127984507207008, + "loss": 1.2681, + "step": 33562 + }, + { + "epoch": 0.43613587744845633, + "grad_norm": 0.42942678928375244, + "learning_rate": 0.00011279585125878944, + "loss": 1.5517, + "step": 33563 + }, + { + "epoch": 0.4361488719923722, + "grad_norm": 0.30667543411254883, + "learning_rate": 0.00011279325179687806, + "loss": 1.3038, + "step": 33564 + }, + { + "epoch": 0.4361618665362881, + "grad_norm": 0.4050809144973755, + "learning_rate": 0.00011279065233496667, + "loss": 1.3096, + "step": 33565 + }, + { + "epoch": 0.4361748610802039, + "grad_norm": 0.38385143876075745, + "learning_rate": 0.00011278805287305528, + "loss": 1.2196, + "step": 33566 + }, + { + "epoch": 0.4361878556241198, + "grad_norm": 0.4730268120765686, + "learning_rate": 0.0001127854534111439, + "loss": 1.1984, + "step": 33567 + }, + { + "epoch": 0.43620085016803567, + "grad_norm": 0.45110154151916504, + "learning_rate": 0.00011278285394923251, + "loss": 1.5623, + "step": 33568 + }, + { + "epoch": 0.43621384471195157, + "grad_norm": 0.2870447635650635, + "learning_rate": 0.00011278025448732112, + "loss": 1.0624, + "step": 33569 + }, + { + "epoch": 0.4362268392558674, + "grad_norm": 0.3653133511543274, + "learning_rate": 0.00011277765502540976, + "loss": 1.3821, + "step": 33570 + }, + { + "epoch": 0.4362398337997833, + "grad_norm": 0.37583813071250916, + "learning_rate": 0.00011277505556349837, + "loss": 1.3205, + "step": 33571 + }, + { + "epoch": 0.43625282834369916, + "grad_norm": 0.4050169587135315, + "learning_rate": 0.00011277245610158697, + "loss": 1.5281, + "step": 33572 + }, + { + "epoch": 0.43626582288761506, + "grad_norm": 0.4745836853981018, + "learning_rate": 0.00011276985663967558, + "loss": 1.4027, + "step": 33573 + }, + { + "epoch": 0.43627881743153096, + "grad_norm": 0.34038689732551575, + "learning_rate": 0.00011276725717776422, + "loss": 1.1926, + "step": 33574 + }, + { + "epoch": 0.4362918119754468, + "grad_norm": 0.36607515811920166, + "learning_rate": 0.00011276465771585283, + "loss": 1.3093, + "step": 33575 + }, + { + "epoch": 0.4363048065193627, + "grad_norm": 0.416373610496521, + "learning_rate": 0.00011276205825394144, + "loss": 1.3037, + "step": 33576 + }, + { + "epoch": 0.43631780106327855, + "grad_norm": 0.4066951274871826, + "learning_rate": 0.00011275945879203005, + "loss": 1.1177, + "step": 33577 + }, + { + "epoch": 0.43633079560719445, + "grad_norm": 0.3136371076107025, + "learning_rate": 0.00011275685933011868, + "loss": 1.3289, + "step": 33578 + }, + { + "epoch": 0.4363437901511103, + "grad_norm": 0.4498012065887451, + "learning_rate": 0.00011275425986820729, + "loss": 1.3866, + "step": 33579 + }, + { + "epoch": 0.4363567846950262, + "grad_norm": 0.3883761763572693, + "learning_rate": 0.0001127516604062959, + "loss": 1.4278, + "step": 33580 + }, + { + "epoch": 0.43636977923894205, + "grad_norm": 0.4388565123081207, + "learning_rate": 0.00011274906094438451, + "loss": 1.4257, + "step": 33581 + }, + { + "epoch": 0.43638277378285795, + "grad_norm": 0.4390762150287628, + "learning_rate": 0.00011274646148247315, + "loss": 1.3223, + "step": 33582 + }, + { + "epoch": 0.4363957683267738, + "grad_norm": 0.3192100524902344, + "learning_rate": 0.00011274386202056176, + "loss": 1.5773, + "step": 33583 + }, + { + "epoch": 0.4364087628706897, + "grad_norm": 0.3366519510746002, + "learning_rate": 0.00011274126255865037, + "loss": 1.421, + "step": 33584 + }, + { + "epoch": 0.43642175741460554, + "grad_norm": 0.34497714042663574, + "learning_rate": 0.00011273866309673897, + "loss": 1.2098, + "step": 33585 + }, + { + "epoch": 0.43643475195852144, + "grad_norm": 0.42119815945625305, + "learning_rate": 0.0001127360636348276, + "loss": 1.2516, + "step": 33586 + }, + { + "epoch": 0.4364477465024373, + "grad_norm": 0.42016011476516724, + "learning_rate": 0.00011273346417291622, + "loss": 1.3852, + "step": 33587 + }, + { + "epoch": 0.4364607410463532, + "grad_norm": 0.373702734708786, + "learning_rate": 0.00011273086471100483, + "loss": 1.5105, + "step": 33588 + }, + { + "epoch": 0.43647373559026903, + "grad_norm": 0.40728864073753357, + "learning_rate": 0.00011272826524909344, + "loss": 1.7561, + "step": 33589 + }, + { + "epoch": 0.43648673013418493, + "grad_norm": 0.31583356857299805, + "learning_rate": 0.00011272566578718206, + "loss": 1.2995, + "step": 33590 + }, + { + "epoch": 0.4364997246781008, + "grad_norm": 0.37355634570121765, + "learning_rate": 0.00011272306632527067, + "loss": 1.3015, + "step": 33591 + }, + { + "epoch": 0.4365127192220167, + "grad_norm": 0.37475037574768066, + "learning_rate": 0.00011272046686335928, + "loss": 1.2336, + "step": 33592 + }, + { + "epoch": 0.4365257137659325, + "grad_norm": 0.4435719847679138, + "learning_rate": 0.0001127178674014479, + "loss": 1.4132, + "step": 33593 + }, + { + "epoch": 0.4365387083098484, + "grad_norm": 0.3804897665977478, + "learning_rate": 0.00011271526793953653, + "loss": 1.4095, + "step": 33594 + }, + { + "epoch": 0.43655170285376427, + "grad_norm": 0.2668040990829468, + "learning_rate": 0.00011271266847762514, + "loss": 1.1922, + "step": 33595 + }, + { + "epoch": 0.43656469739768017, + "grad_norm": 0.3474498391151428, + "learning_rate": 0.00011271006901571375, + "loss": 1.4769, + "step": 33596 + }, + { + "epoch": 0.436577691941596, + "grad_norm": 0.34615659713745117, + "learning_rate": 0.00011270746955380237, + "loss": 1.2778, + "step": 33597 + }, + { + "epoch": 0.4365906864855119, + "grad_norm": 0.4304315745830536, + "learning_rate": 0.00011270487009189099, + "loss": 1.5868, + "step": 33598 + }, + { + "epoch": 0.43660368102942776, + "grad_norm": 0.3263946771621704, + "learning_rate": 0.0001127022706299796, + "loss": 1.2583, + "step": 33599 + }, + { + "epoch": 0.43661667557334366, + "grad_norm": 0.36526963114738464, + "learning_rate": 0.00011269967116806821, + "loss": 1.3842, + "step": 33600 + }, + { + "epoch": 0.4366296701172595, + "grad_norm": 0.44879162311553955, + "learning_rate": 0.00011269707170615682, + "loss": 1.3323, + "step": 33601 + }, + { + "epoch": 0.4366426646611754, + "grad_norm": 0.2959417700767517, + "learning_rate": 0.00011269447224424545, + "loss": 1.4114, + "step": 33602 + }, + { + "epoch": 0.43665565920509125, + "grad_norm": 0.4669698178768158, + "learning_rate": 0.00011269187278233406, + "loss": 1.5505, + "step": 33603 + }, + { + "epoch": 0.43666865374900715, + "grad_norm": 0.3637484312057495, + "learning_rate": 0.00011268927332042267, + "loss": 1.3649, + "step": 33604 + }, + { + "epoch": 0.436681648292923, + "grad_norm": 0.3010956048965454, + "learning_rate": 0.0001126866738585113, + "loss": 1.1147, + "step": 33605 + }, + { + "epoch": 0.4366946428368389, + "grad_norm": 0.4042312502861023, + "learning_rate": 0.00011268407439659992, + "loss": 1.2878, + "step": 33606 + }, + { + "epoch": 0.43670763738075474, + "grad_norm": 0.35928875207901, + "learning_rate": 0.00011268147493468853, + "loss": 1.3874, + "step": 33607 + }, + { + "epoch": 0.43672063192467064, + "grad_norm": 0.45363011956214905, + "learning_rate": 0.00011267887547277714, + "loss": 1.4329, + "step": 33608 + }, + { + "epoch": 0.4367336264685865, + "grad_norm": 0.5129709839820862, + "learning_rate": 0.00011267627601086576, + "loss": 1.3038, + "step": 33609 + }, + { + "epoch": 0.4367466210125024, + "grad_norm": 0.5133653879165649, + "learning_rate": 0.00011267367654895437, + "loss": 1.4084, + "step": 33610 + }, + { + "epoch": 0.43675961555641823, + "grad_norm": 0.3572450280189514, + "learning_rate": 0.00011267107708704299, + "loss": 1.4602, + "step": 33611 + }, + { + "epoch": 0.43677261010033414, + "grad_norm": 0.43474170565605164, + "learning_rate": 0.0001126684776251316, + "loss": 1.5156, + "step": 33612 + }, + { + "epoch": 0.43678560464425, + "grad_norm": 0.4533253312110901, + "learning_rate": 0.00011266587816322023, + "loss": 1.4582, + "step": 33613 + }, + { + "epoch": 0.4367985991881659, + "grad_norm": 0.4947125017642975, + "learning_rate": 0.00011266327870130883, + "loss": 1.3736, + "step": 33614 + }, + { + "epoch": 0.4368115937320817, + "grad_norm": 0.3331683874130249, + "learning_rate": 0.00011266067923939744, + "loss": 1.2773, + "step": 33615 + }, + { + "epoch": 0.4368245882759976, + "grad_norm": 0.4979170560836792, + "learning_rate": 0.00011265807977748605, + "loss": 1.4752, + "step": 33616 + }, + { + "epoch": 0.4368375828199135, + "grad_norm": 0.3768923878669739, + "learning_rate": 0.00011265548031557469, + "loss": 1.203, + "step": 33617 + }, + { + "epoch": 0.4368505773638294, + "grad_norm": 0.43261897563934326, + "learning_rate": 0.0001126528808536633, + "loss": 1.4611, + "step": 33618 + }, + { + "epoch": 0.4368635719077452, + "grad_norm": 0.3522312045097351, + "learning_rate": 0.00011265028139175191, + "loss": 1.2645, + "step": 33619 + }, + { + "epoch": 0.4368765664516611, + "grad_norm": 0.38643181324005127, + "learning_rate": 0.00011264768192984052, + "loss": 1.4573, + "step": 33620 + }, + { + "epoch": 0.43688956099557696, + "grad_norm": 0.4221051037311554, + "learning_rate": 0.00011264508246792915, + "loss": 1.5097, + "step": 33621 + }, + { + "epoch": 0.43690255553949287, + "grad_norm": 0.35679060220718384, + "learning_rate": 0.00011264248300601776, + "loss": 1.4753, + "step": 33622 + }, + { + "epoch": 0.4369155500834087, + "grad_norm": 0.3596331775188446, + "learning_rate": 0.00011263988354410637, + "loss": 1.4367, + "step": 33623 + }, + { + "epoch": 0.4369285446273246, + "grad_norm": 0.3658272922039032, + "learning_rate": 0.00011263728408219498, + "loss": 1.3374, + "step": 33624 + }, + { + "epoch": 0.43694153917124046, + "grad_norm": 0.3894549310207367, + "learning_rate": 0.00011263468462028362, + "loss": 1.4432, + "step": 33625 + }, + { + "epoch": 0.43695453371515636, + "grad_norm": 0.3065608739852905, + "learning_rate": 0.00011263208515837223, + "loss": 1.1178, + "step": 33626 + }, + { + "epoch": 0.4369675282590722, + "grad_norm": 0.48100993037223816, + "learning_rate": 0.00011262948569646083, + "loss": 1.7989, + "step": 33627 + }, + { + "epoch": 0.4369805228029881, + "grad_norm": 0.4452144205570221, + "learning_rate": 0.00011262688623454944, + "loss": 1.2811, + "step": 33628 + }, + { + "epoch": 0.43699351734690395, + "grad_norm": 0.6012799739837646, + "learning_rate": 0.00011262428677263808, + "loss": 1.6313, + "step": 33629 + }, + { + "epoch": 0.43700651189081985, + "grad_norm": 0.3175499737262726, + "learning_rate": 0.00011262168731072669, + "loss": 1.4229, + "step": 33630 + }, + { + "epoch": 0.4370195064347357, + "grad_norm": 0.3955228328704834, + "learning_rate": 0.0001126190878488153, + "loss": 1.5599, + "step": 33631 + }, + { + "epoch": 0.4370325009786516, + "grad_norm": 0.3448098599910736, + "learning_rate": 0.00011261648838690391, + "loss": 1.2207, + "step": 33632 + }, + { + "epoch": 0.43704549552256744, + "grad_norm": 0.30379921197891235, + "learning_rate": 0.00011261388892499253, + "loss": 1.6611, + "step": 33633 + }, + { + "epoch": 0.43705849006648334, + "grad_norm": 0.45026421546936035, + "learning_rate": 0.00011261128946308115, + "loss": 1.4578, + "step": 33634 + }, + { + "epoch": 0.4370714846103992, + "grad_norm": 0.38558822870254517, + "learning_rate": 0.00011260869000116976, + "loss": 1.418, + "step": 33635 + }, + { + "epoch": 0.4370844791543151, + "grad_norm": 0.5054637789726257, + "learning_rate": 0.00011260609053925837, + "loss": 1.5613, + "step": 33636 + }, + { + "epoch": 0.43709747369823093, + "grad_norm": 0.3358413279056549, + "learning_rate": 0.000112603491077347, + "loss": 1.3445, + "step": 33637 + }, + { + "epoch": 0.43711046824214683, + "grad_norm": 0.386957049369812, + "learning_rate": 0.00011260089161543562, + "loss": 1.159, + "step": 33638 + }, + { + "epoch": 0.4371234627860627, + "grad_norm": 0.33620503544807434, + "learning_rate": 0.00011259829215352423, + "loss": 1.4737, + "step": 33639 + }, + { + "epoch": 0.4371364573299786, + "grad_norm": 0.3374916613101959, + "learning_rate": 0.00011259569269161282, + "loss": 1.3144, + "step": 33640 + }, + { + "epoch": 0.4371494518738944, + "grad_norm": 0.3896370232105255, + "learning_rate": 0.00011259309322970146, + "loss": 1.2875, + "step": 33641 + }, + { + "epoch": 0.4371624464178103, + "grad_norm": 0.4411768615245819, + "learning_rate": 0.00011259049376779007, + "loss": 1.372, + "step": 33642 + }, + { + "epoch": 0.43717544096172617, + "grad_norm": 0.520294725894928, + "learning_rate": 0.00011258789430587868, + "loss": 1.4626, + "step": 33643 + }, + { + "epoch": 0.43718843550564207, + "grad_norm": 0.3707142174243927, + "learning_rate": 0.00011258529484396731, + "loss": 1.3581, + "step": 33644 + }, + { + "epoch": 0.4372014300495579, + "grad_norm": 0.44626373052597046, + "learning_rate": 0.00011258269538205592, + "loss": 1.4722, + "step": 33645 + }, + { + "epoch": 0.4372144245934738, + "grad_norm": 0.3588748276233673, + "learning_rate": 0.00011258009592014453, + "loss": 1.5229, + "step": 33646 + }, + { + "epoch": 0.43722741913738966, + "grad_norm": 0.3996641933917999, + "learning_rate": 0.00011257749645823314, + "loss": 1.3164, + "step": 33647 + }, + { + "epoch": 0.43724041368130556, + "grad_norm": 0.356577605009079, + "learning_rate": 0.00011257489699632178, + "loss": 1.4843, + "step": 33648 + }, + { + "epoch": 0.43725340822522146, + "grad_norm": 0.4103662669658661, + "learning_rate": 0.00011257229753441039, + "loss": 1.5392, + "step": 33649 + }, + { + "epoch": 0.4372664027691373, + "grad_norm": 0.384285032749176, + "learning_rate": 0.000112569698072499, + "loss": 1.4368, + "step": 33650 + }, + { + "epoch": 0.4372793973130532, + "grad_norm": 0.4841556251049042, + "learning_rate": 0.00011256709861058761, + "loss": 1.6611, + "step": 33651 + }, + { + "epoch": 0.43729239185696905, + "grad_norm": 0.3750399053096771, + "learning_rate": 0.00011256449914867624, + "loss": 1.4013, + "step": 33652 + }, + { + "epoch": 0.43730538640088495, + "grad_norm": 0.38729211688041687, + "learning_rate": 0.00011256189968676485, + "loss": 1.2715, + "step": 33653 + }, + { + "epoch": 0.4373183809448008, + "grad_norm": 0.43664470314979553, + "learning_rate": 0.00011255930022485346, + "loss": 1.532, + "step": 33654 + }, + { + "epoch": 0.4373313754887167, + "grad_norm": 0.41050025820732117, + "learning_rate": 0.00011255670076294207, + "loss": 1.3998, + "step": 33655 + }, + { + "epoch": 0.43734437003263255, + "grad_norm": 0.4281301200389862, + "learning_rate": 0.0001125541013010307, + "loss": 1.279, + "step": 33656 + }, + { + "epoch": 0.43735736457654845, + "grad_norm": 0.3945081830024719, + "learning_rate": 0.0001125515018391193, + "loss": 1.4622, + "step": 33657 + }, + { + "epoch": 0.4373703591204643, + "grad_norm": 0.42187169194221497, + "learning_rate": 0.00011254890237720792, + "loss": 1.3252, + "step": 33658 + }, + { + "epoch": 0.4373833536643802, + "grad_norm": 0.32891568541526794, + "learning_rate": 0.00011254630291529653, + "loss": 1.3998, + "step": 33659 + }, + { + "epoch": 0.43739634820829604, + "grad_norm": 0.3880465030670166, + "learning_rate": 0.00011254370345338517, + "loss": 1.3065, + "step": 33660 + }, + { + "epoch": 0.43740934275221194, + "grad_norm": 0.4180728495121002, + "learning_rate": 0.00011254110399147378, + "loss": 1.5468, + "step": 33661 + }, + { + "epoch": 0.4374223372961278, + "grad_norm": 0.4271094799041748, + "learning_rate": 0.00011253850452956239, + "loss": 1.4774, + "step": 33662 + }, + { + "epoch": 0.4374353318400437, + "grad_norm": 0.4075973331928253, + "learning_rate": 0.000112535905067651, + "loss": 1.4235, + "step": 33663 + }, + { + "epoch": 0.43744832638395953, + "grad_norm": 0.3313482105731964, + "learning_rate": 0.00011253330560573962, + "loss": 1.2185, + "step": 33664 + }, + { + "epoch": 0.43746132092787543, + "grad_norm": 0.3519590198993683, + "learning_rate": 0.00011253070614382823, + "loss": 1.4339, + "step": 33665 + }, + { + "epoch": 0.4374743154717913, + "grad_norm": 0.4447045922279358, + "learning_rate": 0.00011252810668191684, + "loss": 1.4901, + "step": 33666 + }, + { + "epoch": 0.4374873100157072, + "grad_norm": 0.4138498604297638, + "learning_rate": 0.00011252550722000546, + "loss": 1.3637, + "step": 33667 + }, + { + "epoch": 0.437500304559623, + "grad_norm": 0.42780208587646484, + "learning_rate": 0.0001125229077580941, + "loss": 1.3771, + "step": 33668 + }, + { + "epoch": 0.4375132991035389, + "grad_norm": 0.38943788409233093, + "learning_rate": 0.00011252030829618269, + "loss": 1.421, + "step": 33669 + }, + { + "epoch": 0.43752629364745477, + "grad_norm": 0.42851778864860535, + "learning_rate": 0.0001125177088342713, + "loss": 1.3423, + "step": 33670 + }, + { + "epoch": 0.43753928819137067, + "grad_norm": 0.41370466351509094, + "learning_rate": 0.00011251510937235991, + "loss": 1.2065, + "step": 33671 + }, + { + "epoch": 0.4375522827352865, + "grad_norm": 0.5071435570716858, + "learning_rate": 0.00011251250991044855, + "loss": 1.5005, + "step": 33672 + }, + { + "epoch": 0.4375652772792024, + "grad_norm": 0.38066232204437256, + "learning_rate": 0.00011250991044853716, + "loss": 1.4508, + "step": 33673 + }, + { + "epoch": 0.43757827182311826, + "grad_norm": 0.40746909379959106, + "learning_rate": 0.00011250731098662577, + "loss": 1.3133, + "step": 33674 + }, + { + "epoch": 0.43759126636703416, + "grad_norm": 0.41708096861839294, + "learning_rate": 0.00011250471152471438, + "loss": 1.3271, + "step": 33675 + }, + { + "epoch": 0.43760426091095, + "grad_norm": 0.38377097249031067, + "learning_rate": 0.00011250211206280301, + "loss": 1.3973, + "step": 33676 + }, + { + "epoch": 0.4376172554548659, + "grad_norm": 0.4810415804386139, + "learning_rate": 0.00011249951260089162, + "loss": 1.2204, + "step": 33677 + }, + { + "epoch": 0.43763024999878175, + "grad_norm": 0.3533743917942047, + "learning_rate": 0.00011249691313898023, + "loss": 1.3246, + "step": 33678 + }, + { + "epoch": 0.43764324454269765, + "grad_norm": 0.3186552822589874, + "learning_rate": 0.00011249431367706884, + "loss": 1.305, + "step": 33679 + }, + { + "epoch": 0.4376562390866135, + "grad_norm": 0.4591079354286194, + "learning_rate": 0.00011249171421515748, + "loss": 1.4328, + "step": 33680 + }, + { + "epoch": 0.4376692336305294, + "grad_norm": 0.3991362750530243, + "learning_rate": 0.00011248911475324609, + "loss": 1.3158, + "step": 33681 + }, + { + "epoch": 0.43768222817444524, + "grad_norm": 0.41839659214019775, + "learning_rate": 0.00011248651529133469, + "loss": 1.462, + "step": 33682 + }, + { + "epoch": 0.43769522271836114, + "grad_norm": 0.32797348499298096, + "learning_rate": 0.00011248391582942333, + "loss": 1.4652, + "step": 33683 + }, + { + "epoch": 0.437708217262277, + "grad_norm": 0.30605408549308777, + "learning_rate": 0.00011248131636751194, + "loss": 1.3496, + "step": 33684 + }, + { + "epoch": 0.4377212118061929, + "grad_norm": 0.3311157524585724, + "learning_rate": 0.00011247871690560055, + "loss": 1.4715, + "step": 33685 + }, + { + "epoch": 0.43773420635010873, + "grad_norm": 0.359284371137619, + "learning_rate": 0.00011247611744368916, + "loss": 1.3078, + "step": 33686 + }, + { + "epoch": 0.43774720089402464, + "grad_norm": 0.44039100408554077, + "learning_rate": 0.00011247351798177778, + "loss": 1.2357, + "step": 33687 + }, + { + "epoch": 0.4377601954379405, + "grad_norm": 0.39011868834495544, + "learning_rate": 0.0001124709185198664, + "loss": 1.4811, + "step": 33688 + }, + { + "epoch": 0.4377731899818564, + "grad_norm": 0.40913689136505127, + "learning_rate": 0.000112468319057955, + "loss": 1.4818, + "step": 33689 + }, + { + "epoch": 0.4377861845257722, + "grad_norm": 0.39085620641708374, + "learning_rate": 0.00011246571959604362, + "loss": 1.4016, + "step": 33690 + }, + { + "epoch": 0.4377991790696881, + "grad_norm": 0.4359629452228546, + "learning_rate": 0.00011246312013413225, + "loss": 1.4596, + "step": 33691 + }, + { + "epoch": 0.43781217361360397, + "grad_norm": 0.31746187806129456, + "learning_rate": 0.00011246052067222086, + "loss": 1.5756, + "step": 33692 + }, + { + "epoch": 0.4378251681575199, + "grad_norm": 0.3196691870689392, + "learning_rate": 0.00011245792121030948, + "loss": 1.2098, + "step": 33693 + }, + { + "epoch": 0.4378381627014357, + "grad_norm": 0.43217554688453674, + "learning_rate": 0.00011245532174839807, + "loss": 1.2046, + "step": 33694 + }, + { + "epoch": 0.4378511572453516, + "grad_norm": 0.3626560568809509, + "learning_rate": 0.00011245272228648671, + "loss": 1.5426, + "step": 33695 + }, + { + "epoch": 0.43786415178926746, + "grad_norm": 0.3620617091655731, + "learning_rate": 0.00011245012282457532, + "loss": 1.2463, + "step": 33696 + }, + { + "epoch": 0.43787714633318336, + "grad_norm": 0.357759028673172, + "learning_rate": 0.00011244752336266393, + "loss": 1.3739, + "step": 33697 + }, + { + "epoch": 0.4378901408770992, + "grad_norm": 0.3382255434989929, + "learning_rate": 0.00011244492390075254, + "loss": 1.2824, + "step": 33698 + }, + { + "epoch": 0.4379031354210151, + "grad_norm": 0.503534734249115, + "learning_rate": 0.00011244232443884117, + "loss": 1.4395, + "step": 33699 + }, + { + "epoch": 0.43791612996493096, + "grad_norm": 0.3558909595012665, + "learning_rate": 0.00011243972497692978, + "loss": 1.1281, + "step": 33700 + }, + { + "epoch": 0.43792912450884686, + "grad_norm": 0.33497726917266846, + "learning_rate": 0.00011243712551501839, + "loss": 1.2917, + "step": 33701 + }, + { + "epoch": 0.4379421190527627, + "grad_norm": 0.3172791302204132, + "learning_rate": 0.000112434526053107, + "loss": 1.3284, + "step": 33702 + }, + { + "epoch": 0.4379551135966786, + "grad_norm": 0.39789000153541565, + "learning_rate": 0.00011243192659119564, + "loss": 1.5001, + "step": 33703 + }, + { + "epoch": 0.43796810814059445, + "grad_norm": 0.37026578187942505, + "learning_rate": 0.00011242932712928425, + "loss": 1.287, + "step": 33704 + }, + { + "epoch": 0.43798110268451035, + "grad_norm": 0.36092573404312134, + "learning_rate": 0.00011242672766737286, + "loss": 1.2986, + "step": 33705 + }, + { + "epoch": 0.4379940972284262, + "grad_norm": 0.3719044625759125, + "learning_rate": 0.00011242412820546147, + "loss": 1.2152, + "step": 33706 + }, + { + "epoch": 0.4380070917723421, + "grad_norm": 0.4057196080684662, + "learning_rate": 0.0001124215287435501, + "loss": 1.4332, + "step": 33707 + }, + { + "epoch": 0.43802008631625794, + "grad_norm": 0.38312944769859314, + "learning_rate": 0.00011241892928163871, + "loss": 1.4172, + "step": 33708 + }, + { + "epoch": 0.43803308086017384, + "grad_norm": 0.4060550630092621, + "learning_rate": 0.00011241632981972732, + "loss": 1.4773, + "step": 33709 + }, + { + "epoch": 0.4380460754040897, + "grad_norm": 0.36576610803604126, + "learning_rate": 0.00011241373035781593, + "loss": 1.49, + "step": 33710 + }, + { + "epoch": 0.4380590699480056, + "grad_norm": 0.3998563289642334, + "learning_rate": 0.00011241113089590455, + "loss": 1.4628, + "step": 33711 + }, + { + "epoch": 0.43807206449192143, + "grad_norm": 0.33990710973739624, + "learning_rate": 0.00011240853143399316, + "loss": 1.3322, + "step": 33712 + }, + { + "epoch": 0.43808505903583733, + "grad_norm": 0.47588205337524414, + "learning_rate": 0.00011240593197208178, + "loss": 1.3308, + "step": 33713 + }, + { + "epoch": 0.4380980535797532, + "grad_norm": 0.38509654998779297, + "learning_rate": 0.00011240333251017039, + "loss": 1.4915, + "step": 33714 + }, + { + "epoch": 0.4381110481236691, + "grad_norm": 0.4186917543411255, + "learning_rate": 0.00011240073304825902, + "loss": 1.4181, + "step": 33715 + }, + { + "epoch": 0.4381240426675849, + "grad_norm": 0.46086210012435913, + "learning_rate": 0.00011239813358634764, + "loss": 1.4794, + "step": 33716 + }, + { + "epoch": 0.4381370372115008, + "grad_norm": 0.3558335304260254, + "learning_rate": 0.00011239553412443625, + "loss": 1.2663, + "step": 33717 + }, + { + "epoch": 0.43815003175541667, + "grad_norm": 0.3251190781593323, + "learning_rate": 0.00011239293466252487, + "loss": 1.2861, + "step": 33718 + }, + { + "epoch": 0.43816302629933257, + "grad_norm": 0.41585326194763184, + "learning_rate": 0.00011239033520061348, + "loss": 1.4611, + "step": 33719 + }, + { + "epoch": 0.4381760208432484, + "grad_norm": 0.49898022413253784, + "learning_rate": 0.00011238773573870209, + "loss": 1.3779, + "step": 33720 + }, + { + "epoch": 0.4381890153871643, + "grad_norm": 0.36906060576438904, + "learning_rate": 0.0001123851362767907, + "loss": 1.1995, + "step": 33721 + }, + { + "epoch": 0.43820200993108016, + "grad_norm": 0.4501431882381439, + "learning_rate": 0.00011238253681487934, + "loss": 1.4462, + "step": 33722 + }, + { + "epoch": 0.43821500447499606, + "grad_norm": 0.37703192234039307, + "learning_rate": 0.00011237993735296794, + "loss": 1.1854, + "step": 33723 + }, + { + "epoch": 0.4382279990189119, + "grad_norm": 0.44412747025489807, + "learning_rate": 0.00011237733789105655, + "loss": 1.2644, + "step": 33724 + }, + { + "epoch": 0.4382409935628278, + "grad_norm": 0.3478719890117645, + "learning_rate": 0.00011237473842914516, + "loss": 1.3366, + "step": 33725 + }, + { + "epoch": 0.4382539881067437, + "grad_norm": 0.39723125100135803, + "learning_rate": 0.0001123721389672338, + "loss": 1.4417, + "step": 33726 + }, + { + "epoch": 0.43826698265065955, + "grad_norm": 0.45014920830726624, + "learning_rate": 0.00011236953950532241, + "loss": 1.3125, + "step": 33727 + }, + { + "epoch": 0.43827997719457545, + "grad_norm": 0.5657960772514343, + "learning_rate": 0.00011236694004341102, + "loss": 1.6342, + "step": 33728 + }, + { + "epoch": 0.4382929717384913, + "grad_norm": 0.403454065322876, + "learning_rate": 0.00011236434058149963, + "loss": 1.5114, + "step": 33729 + }, + { + "epoch": 0.4383059662824072, + "grad_norm": 0.33389076590538025, + "learning_rate": 0.00011236174111958826, + "loss": 1.2206, + "step": 33730 + }, + { + "epoch": 0.43831896082632305, + "grad_norm": 0.3333199918270111, + "learning_rate": 0.00011235914165767687, + "loss": 1.2222, + "step": 33731 + }, + { + "epoch": 0.43833195537023895, + "grad_norm": 0.4092356264591217, + "learning_rate": 0.00011235654219576548, + "loss": 1.2094, + "step": 33732 + }, + { + "epoch": 0.4383449499141548, + "grad_norm": 0.4158893823623657, + "learning_rate": 0.00011235394273385409, + "loss": 1.6692, + "step": 33733 + }, + { + "epoch": 0.4383579444580707, + "grad_norm": 0.3535400331020355, + "learning_rate": 0.00011235134327194273, + "loss": 1.2455, + "step": 33734 + }, + { + "epoch": 0.43837093900198654, + "grad_norm": 0.37516915798187256, + "learning_rate": 0.00011234874381003134, + "loss": 1.23, + "step": 33735 + }, + { + "epoch": 0.43838393354590244, + "grad_norm": 0.325690895318985, + "learning_rate": 0.00011234614434811994, + "loss": 1.3849, + "step": 33736 + }, + { + "epoch": 0.4383969280898183, + "grad_norm": 0.37609657645225525, + "learning_rate": 0.00011234354488620855, + "loss": 1.2639, + "step": 33737 + }, + { + "epoch": 0.4384099226337342, + "grad_norm": 0.4492342174053192, + "learning_rate": 0.00011234094542429718, + "loss": 1.3149, + "step": 33738 + }, + { + "epoch": 0.43842291717765003, + "grad_norm": 0.4404865503311157, + "learning_rate": 0.0001123383459623858, + "loss": 1.4392, + "step": 33739 + }, + { + "epoch": 0.43843591172156593, + "grad_norm": 0.3790774941444397, + "learning_rate": 0.0001123357465004744, + "loss": 1.1568, + "step": 33740 + }, + { + "epoch": 0.4384489062654818, + "grad_norm": 0.4236330986022949, + "learning_rate": 0.00011233314703856302, + "loss": 1.5279, + "step": 33741 + }, + { + "epoch": 0.4384619008093977, + "grad_norm": 0.42766568064689636, + "learning_rate": 0.00011233054757665164, + "loss": 1.4771, + "step": 33742 + }, + { + "epoch": 0.4384748953533135, + "grad_norm": 0.3860890865325928, + "learning_rate": 0.00011232794811474025, + "loss": 1.4847, + "step": 33743 + }, + { + "epoch": 0.4384878898972294, + "grad_norm": 0.32998207211494446, + "learning_rate": 0.00011232534865282886, + "loss": 1.3991, + "step": 33744 + }, + { + "epoch": 0.43850088444114527, + "grad_norm": 0.4026249349117279, + "learning_rate": 0.00011232274919091747, + "loss": 1.7186, + "step": 33745 + }, + { + "epoch": 0.43851387898506117, + "grad_norm": 0.446280300617218, + "learning_rate": 0.00011232014972900611, + "loss": 1.3755, + "step": 33746 + }, + { + "epoch": 0.438526873528977, + "grad_norm": 0.3363453447818756, + "learning_rate": 0.00011231755026709472, + "loss": 1.3076, + "step": 33747 + }, + { + "epoch": 0.4385398680728929, + "grad_norm": 0.45325642824172974, + "learning_rate": 0.00011231495080518333, + "loss": 1.3107, + "step": 33748 + }, + { + "epoch": 0.43855286261680876, + "grad_norm": 0.44539177417755127, + "learning_rate": 0.00011231235134327193, + "loss": 1.2655, + "step": 33749 + }, + { + "epoch": 0.43856585716072466, + "grad_norm": 0.356073796749115, + "learning_rate": 0.00011230975188136057, + "loss": 1.2275, + "step": 33750 + }, + { + "epoch": 0.4385788517046405, + "grad_norm": 0.3830162584781647, + "learning_rate": 0.00011230715241944918, + "loss": 1.2292, + "step": 33751 + }, + { + "epoch": 0.4385918462485564, + "grad_norm": 0.3281376361846924, + "learning_rate": 0.00011230455295753779, + "loss": 1.3824, + "step": 33752 + }, + { + "epoch": 0.43860484079247225, + "grad_norm": 0.5264029502868652, + "learning_rate": 0.0001123019534956264, + "loss": 1.3585, + "step": 33753 + }, + { + "epoch": 0.43861783533638815, + "grad_norm": 0.24634402990341187, + "learning_rate": 0.00011229935403371503, + "loss": 1.2681, + "step": 33754 + }, + { + "epoch": 0.438630829880304, + "grad_norm": 0.46418091654777527, + "learning_rate": 0.00011229675457180364, + "loss": 1.5891, + "step": 33755 + }, + { + "epoch": 0.4386438244242199, + "grad_norm": 0.36810949444770813, + "learning_rate": 0.00011229415510989225, + "loss": 1.2621, + "step": 33756 + }, + { + "epoch": 0.43865681896813574, + "grad_norm": 0.4122721254825592, + "learning_rate": 0.00011229155564798089, + "loss": 1.2912, + "step": 33757 + }, + { + "epoch": 0.43866981351205164, + "grad_norm": 0.4004136323928833, + "learning_rate": 0.0001122889561860695, + "loss": 1.5792, + "step": 33758 + }, + { + "epoch": 0.4386828080559675, + "grad_norm": 0.4265916347503662, + "learning_rate": 0.00011228635672415811, + "loss": 1.4328, + "step": 33759 + }, + { + "epoch": 0.4386958025998834, + "grad_norm": 0.5141733288764954, + "learning_rate": 0.00011228375726224672, + "loss": 1.4674, + "step": 33760 + }, + { + "epoch": 0.43870879714379923, + "grad_norm": 0.5589814186096191, + "learning_rate": 0.00011228115780033534, + "loss": 1.3478, + "step": 33761 + }, + { + "epoch": 0.43872179168771513, + "grad_norm": 0.3440369963645935, + "learning_rate": 0.00011227855833842395, + "loss": 1.4411, + "step": 33762 + }, + { + "epoch": 0.438734786231631, + "grad_norm": 0.34824028611183167, + "learning_rate": 0.00011227595887651257, + "loss": 1.4798, + "step": 33763 + }, + { + "epoch": 0.4387477807755469, + "grad_norm": 0.3495718836784363, + "learning_rate": 0.00011227335941460118, + "loss": 1.1955, + "step": 33764 + }, + { + "epoch": 0.4387607753194627, + "grad_norm": 0.4167521595954895, + "learning_rate": 0.0001122707599526898, + "loss": 1.451, + "step": 33765 + }, + { + "epoch": 0.4387737698633786, + "grad_norm": 0.4216267168521881, + "learning_rate": 0.00011226816049077841, + "loss": 1.5064, + "step": 33766 + }, + { + "epoch": 0.43878676440729447, + "grad_norm": 0.41661110520362854, + "learning_rate": 0.00011226556102886702, + "loss": 1.5417, + "step": 33767 + }, + { + "epoch": 0.4387997589512104, + "grad_norm": 0.4137970805168152, + "learning_rate": 0.00011226296156695563, + "loss": 1.3145, + "step": 33768 + }, + { + "epoch": 0.4388127534951262, + "grad_norm": 0.36150655150413513, + "learning_rate": 0.00011226036210504427, + "loss": 1.3673, + "step": 33769 + }, + { + "epoch": 0.4388257480390421, + "grad_norm": 0.3358671963214874, + "learning_rate": 0.00011225776264313288, + "loss": 1.248, + "step": 33770 + }, + { + "epoch": 0.43883874258295796, + "grad_norm": 0.4472552239894867, + "learning_rate": 0.0001122551631812215, + "loss": 1.5463, + "step": 33771 + }, + { + "epoch": 0.43885173712687386, + "grad_norm": 0.3578947186470032, + "learning_rate": 0.0001122525637193101, + "loss": 1.3043, + "step": 33772 + }, + { + "epoch": 0.4388647316707897, + "grad_norm": 0.39294353127479553, + "learning_rate": 0.00011224996425739873, + "loss": 1.3364, + "step": 33773 + }, + { + "epoch": 0.4388777262147056, + "grad_norm": 0.3782839775085449, + "learning_rate": 0.00011224736479548734, + "loss": 1.5311, + "step": 33774 + }, + { + "epoch": 0.43889072075862146, + "grad_norm": 0.3367482125759125, + "learning_rate": 0.00011224476533357595, + "loss": 1.194, + "step": 33775 + }, + { + "epoch": 0.43890371530253736, + "grad_norm": 0.4283084571361542, + "learning_rate": 0.00011224216587166456, + "loss": 1.4278, + "step": 33776 + }, + { + "epoch": 0.4389167098464532, + "grad_norm": 0.499356210231781, + "learning_rate": 0.0001122395664097532, + "loss": 1.3436, + "step": 33777 + }, + { + "epoch": 0.4389297043903691, + "grad_norm": 0.4606949985027313, + "learning_rate": 0.0001122369669478418, + "loss": 1.5584, + "step": 33778 + }, + { + "epoch": 0.43894269893428495, + "grad_norm": 0.4238845705986023, + "learning_rate": 0.00011223436748593041, + "loss": 1.4395, + "step": 33779 + }, + { + "epoch": 0.43895569347820085, + "grad_norm": 0.40323111414909363, + "learning_rate": 0.00011223176802401902, + "loss": 1.2444, + "step": 33780 + }, + { + "epoch": 0.4389686880221167, + "grad_norm": 0.46773505210876465, + "learning_rate": 0.00011222916856210766, + "loss": 1.4941, + "step": 33781 + }, + { + "epoch": 0.4389816825660326, + "grad_norm": 0.4639147222042084, + "learning_rate": 0.00011222656910019627, + "loss": 1.5372, + "step": 33782 + }, + { + "epoch": 0.43899467710994844, + "grad_norm": 0.3454759418964386, + "learning_rate": 0.00011222396963828488, + "loss": 1.5796, + "step": 33783 + }, + { + "epoch": 0.43900767165386434, + "grad_norm": 0.4438466429710388, + "learning_rate": 0.00011222137017637349, + "loss": 1.3273, + "step": 33784 + }, + { + "epoch": 0.4390206661977802, + "grad_norm": 0.36770185828208923, + "learning_rate": 0.00011221877071446211, + "loss": 1.426, + "step": 33785 + }, + { + "epoch": 0.4390336607416961, + "grad_norm": 0.4122646749019623, + "learning_rate": 0.00011221617125255073, + "loss": 1.6401, + "step": 33786 + }, + { + "epoch": 0.43904665528561193, + "grad_norm": 0.3544377386569977, + "learning_rate": 0.00011221357179063934, + "loss": 1.4221, + "step": 33787 + }, + { + "epoch": 0.43905964982952783, + "grad_norm": 0.3139743506908417, + "learning_rate": 0.00011221097232872795, + "loss": 1.3035, + "step": 33788 + }, + { + "epoch": 0.4390726443734437, + "grad_norm": 0.40578898787498474, + "learning_rate": 0.00011220837286681659, + "loss": 1.4468, + "step": 33789 + }, + { + "epoch": 0.4390856389173596, + "grad_norm": 0.29836177825927734, + "learning_rate": 0.0001122057734049052, + "loss": 1.3728, + "step": 33790 + }, + { + "epoch": 0.4390986334612754, + "grad_norm": 0.4170806407928467, + "learning_rate": 0.0001122031739429938, + "loss": 1.3737, + "step": 33791 + }, + { + "epoch": 0.4391116280051913, + "grad_norm": 0.5037851929664612, + "learning_rate": 0.00011220057448108243, + "loss": 1.5151, + "step": 33792 + }, + { + "epoch": 0.43912462254910717, + "grad_norm": 0.3820703327655792, + "learning_rate": 0.00011219797501917104, + "loss": 1.3445, + "step": 33793 + }, + { + "epoch": 0.43913761709302307, + "grad_norm": 0.37377795577049255, + "learning_rate": 0.00011219537555725965, + "loss": 1.2305, + "step": 33794 + }, + { + "epoch": 0.4391506116369389, + "grad_norm": 0.3625583350658417, + "learning_rate": 0.00011219277609534826, + "loss": 1.4772, + "step": 33795 + }, + { + "epoch": 0.4391636061808548, + "grad_norm": 0.3164084553718567, + "learning_rate": 0.00011219017663343689, + "loss": 1.2018, + "step": 33796 + }, + { + "epoch": 0.43917660072477066, + "grad_norm": 0.38175728917121887, + "learning_rate": 0.0001121875771715255, + "loss": 1.425, + "step": 33797 + }, + { + "epoch": 0.43918959526868656, + "grad_norm": 0.36510688066482544, + "learning_rate": 0.00011218497770961411, + "loss": 1.2334, + "step": 33798 + }, + { + "epoch": 0.4392025898126024, + "grad_norm": 0.37086033821105957, + "learning_rate": 0.00011218237824770272, + "loss": 1.3364, + "step": 33799 + }, + { + "epoch": 0.4392155843565183, + "grad_norm": 0.446889728307724, + "learning_rate": 0.00011217977878579136, + "loss": 1.5079, + "step": 33800 + }, + { + "epoch": 0.4392285789004342, + "grad_norm": 0.3069266676902771, + "learning_rate": 0.00011217717932387997, + "loss": 1.2707, + "step": 33801 + }, + { + "epoch": 0.43924157344435005, + "grad_norm": 0.4998326003551483, + "learning_rate": 0.00011217457986196858, + "loss": 1.4017, + "step": 33802 + }, + { + "epoch": 0.43925456798826595, + "grad_norm": 0.4500516951084137, + "learning_rate": 0.00011217198040005719, + "loss": 1.4176, + "step": 33803 + }, + { + "epoch": 0.4392675625321818, + "grad_norm": 0.4580056071281433, + "learning_rate": 0.00011216938093814582, + "loss": 1.312, + "step": 33804 + }, + { + "epoch": 0.4392805570760977, + "grad_norm": 0.41984865069389343, + "learning_rate": 0.00011216678147623443, + "loss": 1.4081, + "step": 33805 + }, + { + "epoch": 0.43929355162001354, + "grad_norm": 0.368792325258255, + "learning_rate": 0.00011216418201432304, + "loss": 1.3692, + "step": 33806 + }, + { + "epoch": 0.43930654616392945, + "grad_norm": 0.37742581963539124, + "learning_rate": 0.00011216158255241165, + "loss": 1.4457, + "step": 33807 + }, + { + "epoch": 0.4393195407078453, + "grad_norm": 0.3170832395553589, + "learning_rate": 0.00011215898309050027, + "loss": 1.269, + "step": 33808 + }, + { + "epoch": 0.4393325352517612, + "grad_norm": 0.3944149613380432, + "learning_rate": 0.00011215638362858889, + "loss": 1.3628, + "step": 33809 + }, + { + "epoch": 0.43934552979567704, + "grad_norm": 0.49544981122016907, + "learning_rate": 0.0001121537841666775, + "loss": 1.243, + "step": 33810 + }, + { + "epoch": 0.43935852433959294, + "grad_norm": 0.39050644636154175, + "learning_rate": 0.00011215118470476611, + "loss": 1.3888, + "step": 33811 + }, + { + "epoch": 0.4393715188835088, + "grad_norm": 0.32727938890457153, + "learning_rate": 0.00011214858524285475, + "loss": 1.3225, + "step": 33812 + }, + { + "epoch": 0.4393845134274247, + "grad_norm": 0.49213749170303345, + "learning_rate": 0.00011214598578094336, + "loss": 1.4431, + "step": 33813 + }, + { + "epoch": 0.43939750797134053, + "grad_norm": 0.3536868393421173, + "learning_rate": 0.00011214338631903197, + "loss": 1.4022, + "step": 33814 + }, + { + "epoch": 0.43941050251525643, + "grad_norm": 0.49926725029945374, + "learning_rate": 0.00011214078685712058, + "loss": 1.315, + "step": 33815 + }, + { + "epoch": 0.4394234970591723, + "grad_norm": 0.5178827047348022, + "learning_rate": 0.0001121381873952092, + "loss": 1.3487, + "step": 33816 + }, + { + "epoch": 0.4394364916030882, + "grad_norm": 0.37049800157546997, + "learning_rate": 0.00011213558793329781, + "loss": 1.298, + "step": 33817 + }, + { + "epoch": 0.439449486147004, + "grad_norm": 0.4458928108215332, + "learning_rate": 0.00011213298847138642, + "loss": 1.4293, + "step": 33818 + }, + { + "epoch": 0.4394624806909199, + "grad_norm": 0.44171980023384094, + "learning_rate": 0.00011213038900947504, + "loss": 1.372, + "step": 33819 + }, + { + "epoch": 0.43947547523483577, + "grad_norm": 0.5388789772987366, + "learning_rate": 0.00011212778954756366, + "loss": 1.2785, + "step": 33820 + }, + { + "epoch": 0.43948846977875167, + "grad_norm": 0.33343589305877686, + "learning_rate": 0.00011212519008565227, + "loss": 1.5505, + "step": 33821 + }, + { + "epoch": 0.4395014643226675, + "grad_norm": 0.41944271326065063, + "learning_rate": 0.00011212259062374088, + "loss": 1.5171, + "step": 33822 + }, + { + "epoch": 0.4395144588665834, + "grad_norm": 0.37391194701194763, + "learning_rate": 0.00011211999116182949, + "loss": 1.2939, + "step": 33823 + }, + { + "epoch": 0.43952745341049926, + "grad_norm": 0.42650604248046875, + "learning_rate": 0.00011211739169991813, + "loss": 1.5289, + "step": 33824 + }, + { + "epoch": 0.43954044795441516, + "grad_norm": 0.2201998084783554, + "learning_rate": 0.00011211479223800674, + "loss": 1.1309, + "step": 33825 + }, + { + "epoch": 0.439553442498331, + "grad_norm": 0.42188969254493713, + "learning_rate": 0.00011211219277609535, + "loss": 1.3368, + "step": 33826 + }, + { + "epoch": 0.4395664370422469, + "grad_norm": 0.3888471722602844, + "learning_rate": 0.00011210959331418396, + "loss": 1.3842, + "step": 33827 + }, + { + "epoch": 0.43957943158616275, + "grad_norm": 0.3639715611934662, + "learning_rate": 0.00011210699385227259, + "loss": 1.4096, + "step": 33828 + }, + { + "epoch": 0.43959242613007865, + "grad_norm": 0.4172443747520447, + "learning_rate": 0.0001121043943903612, + "loss": 1.4051, + "step": 33829 + }, + { + "epoch": 0.4396054206739945, + "grad_norm": 0.42720672488212585, + "learning_rate": 0.00011210179492844981, + "loss": 1.3579, + "step": 33830 + }, + { + "epoch": 0.4396184152179104, + "grad_norm": 0.38850098848342896, + "learning_rate": 0.00011209919546653845, + "loss": 1.3702, + "step": 33831 + }, + { + "epoch": 0.43963140976182624, + "grad_norm": 0.3582231104373932, + "learning_rate": 0.00011209659600462706, + "loss": 1.3259, + "step": 33832 + }, + { + "epoch": 0.43964440430574214, + "grad_norm": 0.33361801505088806, + "learning_rate": 0.00011209399654271566, + "loss": 1.3253, + "step": 33833 + }, + { + "epoch": 0.439657398849658, + "grad_norm": 0.3146879971027374, + "learning_rate": 0.00011209139708080427, + "loss": 1.4785, + "step": 33834 + }, + { + "epoch": 0.4396703933935739, + "grad_norm": 0.27605631947517395, + "learning_rate": 0.0001120887976188929, + "loss": 1.4296, + "step": 33835 + }, + { + "epoch": 0.43968338793748973, + "grad_norm": 0.4640986919403076, + "learning_rate": 0.00011208619815698152, + "loss": 1.6448, + "step": 33836 + }, + { + "epoch": 0.43969638248140563, + "grad_norm": 0.3718549907207489, + "learning_rate": 0.00011208359869507013, + "loss": 1.2728, + "step": 33837 + }, + { + "epoch": 0.4397093770253215, + "grad_norm": 0.4482578635215759, + "learning_rate": 0.00011208099923315874, + "loss": 1.2995, + "step": 33838 + }, + { + "epoch": 0.4397223715692374, + "grad_norm": 0.5183087587356567, + "learning_rate": 0.00011207839977124736, + "loss": 1.3979, + "step": 33839 + }, + { + "epoch": 0.4397353661131532, + "grad_norm": 0.4024648666381836, + "learning_rate": 0.00011207580030933597, + "loss": 1.4587, + "step": 33840 + }, + { + "epoch": 0.4397483606570691, + "grad_norm": 0.4905690550804138, + "learning_rate": 0.00011207320084742458, + "loss": 1.57, + "step": 33841 + }, + { + "epoch": 0.43976135520098497, + "grad_norm": 0.40656524896621704, + "learning_rate": 0.0001120706013855132, + "loss": 1.3712, + "step": 33842 + }, + { + "epoch": 0.43977434974490087, + "grad_norm": 0.4429897964000702, + "learning_rate": 0.00011206800192360183, + "loss": 1.5636, + "step": 33843 + }, + { + "epoch": 0.4397873442888167, + "grad_norm": 0.4273623824119568, + "learning_rate": 0.00011206540246169044, + "loss": 1.3503, + "step": 33844 + }, + { + "epoch": 0.4398003388327326, + "grad_norm": 0.3867172300815582, + "learning_rate": 0.00011206280299977906, + "loss": 1.4371, + "step": 33845 + }, + { + "epoch": 0.43981333337664846, + "grad_norm": 0.4311963617801666, + "learning_rate": 0.00011206020353786765, + "loss": 1.3528, + "step": 33846 + }, + { + "epoch": 0.43982632792056436, + "grad_norm": 0.3302820026874542, + "learning_rate": 0.00011205760407595629, + "loss": 1.2851, + "step": 33847 + }, + { + "epoch": 0.4398393224644802, + "grad_norm": 0.45143556594848633, + "learning_rate": 0.0001120550046140449, + "loss": 1.346, + "step": 33848 + }, + { + "epoch": 0.4398523170083961, + "grad_norm": 0.3516835570335388, + "learning_rate": 0.00011205240515213351, + "loss": 1.2949, + "step": 33849 + }, + { + "epoch": 0.43986531155231196, + "grad_norm": 0.47119244933128357, + "learning_rate": 0.00011204980569022212, + "loss": 1.4602, + "step": 33850 + }, + { + "epoch": 0.43987830609622786, + "grad_norm": 0.4856862425804138, + "learning_rate": 0.00011204720622831075, + "loss": 1.5038, + "step": 33851 + }, + { + "epoch": 0.4398913006401437, + "grad_norm": 0.4068794548511505, + "learning_rate": 0.00011204460676639936, + "loss": 1.3051, + "step": 33852 + }, + { + "epoch": 0.4399042951840596, + "grad_norm": 0.5328830480575562, + "learning_rate": 0.00011204200730448797, + "loss": 1.4396, + "step": 33853 + }, + { + "epoch": 0.43991728972797545, + "grad_norm": 0.4361875653266907, + "learning_rate": 0.00011203940784257658, + "loss": 1.4499, + "step": 33854 + }, + { + "epoch": 0.43993028427189135, + "grad_norm": 0.37575486302375793, + "learning_rate": 0.00011203680838066522, + "loss": 1.4291, + "step": 33855 + }, + { + "epoch": 0.4399432788158072, + "grad_norm": 0.48397478461265564, + "learning_rate": 0.00011203420891875383, + "loss": 1.6989, + "step": 33856 + }, + { + "epoch": 0.4399562733597231, + "grad_norm": 0.35768431425094604, + "learning_rate": 0.00011203160945684244, + "loss": 1.0911, + "step": 33857 + }, + { + "epoch": 0.43996926790363894, + "grad_norm": 0.3889373540878296, + "learning_rate": 0.00011202900999493104, + "loss": 1.2627, + "step": 33858 + }, + { + "epoch": 0.43998226244755484, + "grad_norm": 0.4137880504131317, + "learning_rate": 0.00011202641053301968, + "loss": 1.2442, + "step": 33859 + }, + { + "epoch": 0.4399952569914707, + "grad_norm": 0.3847433924674988, + "learning_rate": 0.00011202381107110829, + "loss": 1.3286, + "step": 33860 + }, + { + "epoch": 0.4400082515353866, + "grad_norm": 0.4896392226219177, + "learning_rate": 0.0001120212116091969, + "loss": 1.4377, + "step": 33861 + }, + { + "epoch": 0.44002124607930243, + "grad_norm": 0.3672564923763275, + "learning_rate": 0.00011201861214728551, + "loss": 1.419, + "step": 33862 + }, + { + "epoch": 0.44003424062321833, + "grad_norm": 0.46419256925582886, + "learning_rate": 0.00011201601268537413, + "loss": 1.5734, + "step": 33863 + }, + { + "epoch": 0.4400472351671342, + "grad_norm": 0.4771384596824646, + "learning_rate": 0.00011201341322346274, + "loss": 1.3484, + "step": 33864 + }, + { + "epoch": 0.4400602297110501, + "grad_norm": 0.3740924596786499, + "learning_rate": 0.00011201081376155136, + "loss": 1.0841, + "step": 33865 + }, + { + "epoch": 0.4400732242549659, + "grad_norm": 0.4816083312034607, + "learning_rate": 0.00011200821429963999, + "loss": 1.5036, + "step": 33866 + }, + { + "epoch": 0.4400862187988818, + "grad_norm": 0.35915622115135193, + "learning_rate": 0.0001120056148377286, + "loss": 1.2459, + "step": 33867 + }, + { + "epoch": 0.44009921334279767, + "grad_norm": 0.3351829946041107, + "learning_rate": 0.00011200301537581722, + "loss": 1.1748, + "step": 33868 + }, + { + "epoch": 0.44011220788671357, + "grad_norm": 0.4634656608104706, + "learning_rate": 0.00011200041591390583, + "loss": 1.5248, + "step": 33869 + }, + { + "epoch": 0.4401252024306294, + "grad_norm": 0.4507223963737488, + "learning_rate": 0.00011199781645199445, + "loss": 1.4229, + "step": 33870 + }, + { + "epoch": 0.4401381969745453, + "grad_norm": 0.3536686599254608, + "learning_rate": 0.00011199521699008306, + "loss": 1.3137, + "step": 33871 + }, + { + "epoch": 0.44015119151846116, + "grad_norm": 0.49509260058403015, + "learning_rate": 0.00011199261752817167, + "loss": 1.2775, + "step": 33872 + }, + { + "epoch": 0.44016418606237706, + "grad_norm": 0.5351803302764893, + "learning_rate": 0.00011199001806626028, + "loss": 1.4439, + "step": 33873 + }, + { + "epoch": 0.4401771806062929, + "grad_norm": 0.3896837830543518, + "learning_rate": 0.00011198741860434892, + "loss": 1.4647, + "step": 33874 + }, + { + "epoch": 0.4401901751502088, + "grad_norm": 0.46184828877449036, + "learning_rate": 0.00011198481914243752, + "loss": 1.5146, + "step": 33875 + }, + { + "epoch": 0.44020316969412465, + "grad_norm": 0.45069700479507446, + "learning_rate": 0.00011198221968052613, + "loss": 1.3163, + "step": 33876 + }, + { + "epoch": 0.44021616423804055, + "grad_norm": 0.5432813763618469, + "learning_rate": 0.00011197962021861474, + "loss": 1.3329, + "step": 33877 + }, + { + "epoch": 0.44022915878195645, + "grad_norm": 0.328911155462265, + "learning_rate": 0.00011197702075670338, + "loss": 1.3623, + "step": 33878 + }, + { + "epoch": 0.4402421533258723, + "grad_norm": 0.2985921800136566, + "learning_rate": 0.00011197442129479199, + "loss": 1.3356, + "step": 33879 + }, + { + "epoch": 0.4402551478697882, + "grad_norm": 0.31085917353630066, + "learning_rate": 0.0001119718218328806, + "loss": 1.1905, + "step": 33880 + }, + { + "epoch": 0.44026814241370404, + "grad_norm": 0.4325365424156189, + "learning_rate": 0.00011196922237096921, + "loss": 1.1185, + "step": 33881 + }, + { + "epoch": 0.44028113695761995, + "grad_norm": 0.41128620505332947, + "learning_rate": 0.00011196662290905784, + "loss": 1.6154, + "step": 33882 + }, + { + "epoch": 0.4402941315015358, + "grad_norm": 0.43714362382888794, + "learning_rate": 0.00011196402344714645, + "loss": 1.3221, + "step": 33883 + }, + { + "epoch": 0.4403071260454517, + "grad_norm": 0.317518025636673, + "learning_rate": 0.00011196142398523506, + "loss": 1.2304, + "step": 33884 + }, + { + "epoch": 0.44032012058936754, + "grad_norm": 0.34049302339553833, + "learning_rate": 0.00011195882452332367, + "loss": 1.2478, + "step": 33885 + }, + { + "epoch": 0.44033311513328344, + "grad_norm": 0.38715073466300964, + "learning_rate": 0.0001119562250614123, + "loss": 1.5095, + "step": 33886 + }, + { + "epoch": 0.4403461096771993, + "grad_norm": 0.4641999304294586, + "learning_rate": 0.00011195362559950092, + "loss": 1.5959, + "step": 33887 + }, + { + "epoch": 0.4403591042211152, + "grad_norm": 0.39159679412841797, + "learning_rate": 0.00011195102613758952, + "loss": 1.3798, + "step": 33888 + }, + { + "epoch": 0.44037209876503103, + "grad_norm": 0.39525845646858215, + "learning_rate": 0.00011194842667567813, + "loss": 1.3122, + "step": 33889 + }, + { + "epoch": 0.44038509330894693, + "grad_norm": 0.43412327766418457, + "learning_rate": 0.00011194582721376676, + "loss": 1.3667, + "step": 33890 + }, + { + "epoch": 0.4403980878528628, + "grad_norm": 0.3792523741722107, + "learning_rate": 0.00011194322775185537, + "loss": 1.4723, + "step": 33891 + }, + { + "epoch": 0.4404110823967787, + "grad_norm": 0.3380373418331146, + "learning_rate": 0.00011194062828994399, + "loss": 1.1903, + "step": 33892 + }, + { + "epoch": 0.4404240769406945, + "grad_norm": 0.42319849133491516, + "learning_rate": 0.0001119380288280326, + "loss": 1.3378, + "step": 33893 + }, + { + "epoch": 0.4404370714846104, + "grad_norm": 0.4520934522151947, + "learning_rate": 0.00011193542936612122, + "loss": 1.3675, + "step": 33894 + }, + { + "epoch": 0.44045006602852627, + "grad_norm": 0.4807420074939728, + "learning_rate": 0.00011193282990420983, + "loss": 1.5238, + "step": 33895 + }, + { + "epoch": 0.44046306057244217, + "grad_norm": 0.425485759973526, + "learning_rate": 0.00011193023044229844, + "loss": 1.3506, + "step": 33896 + }, + { + "epoch": 0.440476055116358, + "grad_norm": 0.3834744393825531, + "learning_rate": 0.00011192763098038705, + "loss": 1.4788, + "step": 33897 + }, + { + "epoch": 0.4404890496602739, + "grad_norm": 0.4828564524650574, + "learning_rate": 0.00011192503151847569, + "loss": 1.3775, + "step": 33898 + }, + { + "epoch": 0.44050204420418976, + "grad_norm": 0.4058476686477661, + "learning_rate": 0.0001119224320565643, + "loss": 1.597, + "step": 33899 + }, + { + "epoch": 0.44051503874810566, + "grad_norm": 0.37737974524497986, + "learning_rate": 0.0001119198325946529, + "loss": 1.4249, + "step": 33900 + }, + { + "epoch": 0.4405280332920215, + "grad_norm": 0.3890879452228546, + "learning_rate": 0.00011191723313274151, + "loss": 1.4542, + "step": 33901 + }, + { + "epoch": 0.4405410278359374, + "grad_norm": 0.4908926486968994, + "learning_rate": 0.00011191463367083015, + "loss": 1.3563, + "step": 33902 + }, + { + "epoch": 0.44055402237985325, + "grad_norm": 0.42307162284851074, + "learning_rate": 0.00011191203420891876, + "loss": 1.3151, + "step": 33903 + }, + { + "epoch": 0.44056701692376915, + "grad_norm": 0.3393646478652954, + "learning_rate": 0.00011190943474700737, + "loss": 1.1379, + "step": 33904 + }, + { + "epoch": 0.440580011467685, + "grad_norm": 0.3769477903842926, + "learning_rate": 0.000111906835285096, + "loss": 1.1791, + "step": 33905 + }, + { + "epoch": 0.4405930060116009, + "grad_norm": 0.48580682277679443, + "learning_rate": 0.0001119042358231846, + "loss": 1.4894, + "step": 33906 + }, + { + "epoch": 0.44060600055551674, + "grad_norm": 0.3570936620235443, + "learning_rate": 0.00011190163636127322, + "loss": 1.3161, + "step": 33907 + }, + { + "epoch": 0.44061899509943264, + "grad_norm": 0.3767772614955902, + "learning_rate": 0.00011189903689936183, + "loss": 1.3216, + "step": 33908 + }, + { + "epoch": 0.4406319896433485, + "grad_norm": 0.4276905953884125, + "learning_rate": 0.00011189643743745047, + "loss": 1.4417, + "step": 33909 + }, + { + "epoch": 0.4406449841872644, + "grad_norm": 0.3743925392627716, + "learning_rate": 0.00011189383797553908, + "loss": 1.3313, + "step": 33910 + }, + { + "epoch": 0.44065797873118023, + "grad_norm": 0.4535468518733978, + "learning_rate": 0.00011189123851362769, + "loss": 1.4864, + "step": 33911 + }, + { + "epoch": 0.44067097327509613, + "grad_norm": 0.2709183990955353, + "learning_rate": 0.0001118886390517163, + "loss": 1.6668, + "step": 33912 + }, + { + "epoch": 0.440683967819012, + "grad_norm": 0.3509170413017273, + "learning_rate": 0.00011188603958980492, + "loss": 1.2957, + "step": 33913 + }, + { + "epoch": 0.4406969623629279, + "grad_norm": 0.330238938331604, + "learning_rate": 0.00011188344012789353, + "loss": 1.3562, + "step": 33914 + }, + { + "epoch": 0.4407099569068437, + "grad_norm": 0.47974923253059387, + "learning_rate": 0.00011188084066598215, + "loss": 1.5172, + "step": 33915 + }, + { + "epoch": 0.4407229514507596, + "grad_norm": 0.4952934980392456, + "learning_rate": 0.00011187824120407076, + "loss": 1.3622, + "step": 33916 + }, + { + "epoch": 0.44073594599467547, + "grad_norm": 0.37118789553642273, + "learning_rate": 0.00011187564174215938, + "loss": 1.2489, + "step": 33917 + }, + { + "epoch": 0.44074894053859137, + "grad_norm": 0.23424631357192993, + "learning_rate": 0.00011187304228024799, + "loss": 1.2305, + "step": 33918 + }, + { + "epoch": 0.4407619350825072, + "grad_norm": 0.384907603263855, + "learning_rate": 0.0001118704428183366, + "loss": 1.5708, + "step": 33919 + }, + { + "epoch": 0.4407749296264231, + "grad_norm": 0.31708404421806335, + "learning_rate": 0.00011186784335642521, + "loss": 1.0588, + "step": 33920 + }, + { + "epoch": 0.44078792417033896, + "grad_norm": 0.4358888864517212, + "learning_rate": 0.00011186524389451385, + "loss": 1.3023, + "step": 33921 + }, + { + "epoch": 0.44080091871425486, + "grad_norm": 0.36452725529670715, + "learning_rate": 0.00011186264443260246, + "loss": 1.3971, + "step": 33922 + }, + { + "epoch": 0.4408139132581707, + "grad_norm": 0.44835686683654785, + "learning_rate": 0.00011186004497069107, + "loss": 1.5644, + "step": 33923 + }, + { + "epoch": 0.4408269078020866, + "grad_norm": 0.4204321801662445, + "learning_rate": 0.00011185744550877968, + "loss": 1.2754, + "step": 33924 + }, + { + "epoch": 0.44083990234600245, + "grad_norm": 0.357796847820282, + "learning_rate": 0.00011185484604686831, + "loss": 1.5435, + "step": 33925 + }, + { + "epoch": 0.44085289688991836, + "grad_norm": 0.4933660924434662, + "learning_rate": 0.00011185224658495692, + "loss": 1.5398, + "step": 33926 + }, + { + "epoch": 0.4408658914338342, + "grad_norm": 0.4068813621997833, + "learning_rate": 0.00011184964712304553, + "loss": 1.3599, + "step": 33927 + }, + { + "epoch": 0.4408788859777501, + "grad_norm": 0.41411903500556946, + "learning_rate": 0.00011184704766113414, + "loss": 1.4149, + "step": 33928 + }, + { + "epoch": 0.44089188052166595, + "grad_norm": 0.4143437147140503, + "learning_rate": 0.00011184444819922278, + "loss": 1.4579, + "step": 33929 + }, + { + "epoch": 0.44090487506558185, + "grad_norm": 0.4094889461994171, + "learning_rate": 0.00011184184873731138, + "loss": 1.5253, + "step": 33930 + }, + { + "epoch": 0.4409178696094977, + "grad_norm": 0.388461709022522, + "learning_rate": 0.00011183924927539999, + "loss": 1.3487, + "step": 33931 + }, + { + "epoch": 0.4409308641534136, + "grad_norm": 0.3599866032600403, + "learning_rate": 0.0001118366498134886, + "loss": 1.3316, + "step": 33932 + }, + { + "epoch": 0.44094385869732944, + "grad_norm": 0.3491574823856354, + "learning_rate": 0.00011183405035157724, + "loss": 1.2722, + "step": 33933 + }, + { + "epoch": 0.44095685324124534, + "grad_norm": 0.3522743284702301, + "learning_rate": 0.00011183145088966585, + "loss": 1.2204, + "step": 33934 + }, + { + "epoch": 0.4409698477851612, + "grad_norm": 0.38592448830604553, + "learning_rate": 0.00011182885142775446, + "loss": 1.6383, + "step": 33935 + }, + { + "epoch": 0.4409828423290771, + "grad_norm": 0.3955994248390198, + "learning_rate": 0.00011182625196584307, + "loss": 1.4827, + "step": 33936 + }, + { + "epoch": 0.44099583687299293, + "grad_norm": 0.36901840567588806, + "learning_rate": 0.0001118236525039317, + "loss": 1.2574, + "step": 33937 + }, + { + "epoch": 0.44100883141690883, + "grad_norm": 0.4609015882015228, + "learning_rate": 0.0001118210530420203, + "loss": 1.3762, + "step": 33938 + }, + { + "epoch": 0.4410218259608247, + "grad_norm": 0.3924921452999115, + "learning_rate": 0.00011181845358010892, + "loss": 1.301, + "step": 33939 + }, + { + "epoch": 0.4410348205047406, + "grad_norm": 0.42089369893074036, + "learning_rate": 0.00011181585411819755, + "loss": 1.5235, + "step": 33940 + }, + { + "epoch": 0.4410478150486564, + "grad_norm": 0.45505690574645996, + "learning_rate": 0.00011181325465628617, + "loss": 1.4353, + "step": 33941 + }, + { + "epoch": 0.4410608095925723, + "grad_norm": 0.388092964887619, + "learning_rate": 0.00011181065519437476, + "loss": 1.1597, + "step": 33942 + }, + { + "epoch": 0.44107380413648817, + "grad_norm": 0.37217697501182556, + "learning_rate": 0.00011180805573246337, + "loss": 1.2961, + "step": 33943 + }, + { + "epoch": 0.44108679868040407, + "grad_norm": 0.44210129976272583, + "learning_rate": 0.00011180545627055201, + "loss": 1.3265, + "step": 33944 + }, + { + "epoch": 0.4410997932243199, + "grad_norm": 0.36260077357292175, + "learning_rate": 0.00011180285680864062, + "loss": 1.3572, + "step": 33945 + }, + { + "epoch": 0.4411127877682358, + "grad_norm": 0.4829398989677429, + "learning_rate": 0.00011180025734672923, + "loss": 1.5183, + "step": 33946 + }, + { + "epoch": 0.44112578231215166, + "grad_norm": 0.46915286779403687, + "learning_rate": 0.00011179765788481784, + "loss": 1.682, + "step": 33947 + }, + { + "epoch": 0.44113877685606756, + "grad_norm": 0.5006355047225952, + "learning_rate": 0.00011179505842290647, + "loss": 1.3184, + "step": 33948 + }, + { + "epoch": 0.4411517713999834, + "grad_norm": 0.4542192816734314, + "learning_rate": 0.00011179245896099508, + "loss": 1.3721, + "step": 33949 + }, + { + "epoch": 0.4411647659438993, + "grad_norm": 0.35697054862976074, + "learning_rate": 0.00011178985949908369, + "loss": 1.3939, + "step": 33950 + }, + { + "epoch": 0.44117776048781515, + "grad_norm": 0.4992166757583618, + "learning_rate": 0.0001117872600371723, + "loss": 1.3845, + "step": 33951 + }, + { + "epoch": 0.44119075503173105, + "grad_norm": 0.35691800713539124, + "learning_rate": 0.00011178466057526094, + "loss": 1.3644, + "step": 33952 + }, + { + "epoch": 0.44120374957564695, + "grad_norm": 0.5175820589065552, + "learning_rate": 0.00011178206111334955, + "loss": 1.4384, + "step": 33953 + }, + { + "epoch": 0.4412167441195628, + "grad_norm": 0.3665894865989685, + "learning_rate": 0.00011177946165143816, + "loss": 1.3688, + "step": 33954 + }, + { + "epoch": 0.4412297386634787, + "grad_norm": 0.39191189408302307, + "learning_rate": 0.00011177686218952676, + "loss": 1.4866, + "step": 33955 + }, + { + "epoch": 0.44124273320739454, + "grad_norm": 0.35874414443969727, + "learning_rate": 0.0001117742627276154, + "loss": 1.3883, + "step": 33956 + }, + { + "epoch": 0.44125572775131044, + "grad_norm": 0.40424150228500366, + "learning_rate": 0.00011177166326570401, + "loss": 1.4475, + "step": 33957 + }, + { + "epoch": 0.4412687222952263, + "grad_norm": 0.4540855288505554, + "learning_rate": 0.00011176906380379262, + "loss": 1.2652, + "step": 33958 + }, + { + "epoch": 0.4412817168391422, + "grad_norm": 0.37405267357826233, + "learning_rate": 0.00011176646434188123, + "loss": 1.4242, + "step": 33959 + }, + { + "epoch": 0.44129471138305804, + "grad_norm": 0.42453327775001526, + "learning_rate": 0.00011176386487996985, + "loss": 1.3518, + "step": 33960 + }, + { + "epoch": 0.44130770592697394, + "grad_norm": 0.3619401454925537, + "learning_rate": 0.00011176126541805847, + "loss": 1.3275, + "step": 33961 + }, + { + "epoch": 0.4413207004708898, + "grad_norm": 0.41241100430488586, + "learning_rate": 0.00011175866595614708, + "loss": 1.3013, + "step": 33962 + }, + { + "epoch": 0.4413336950148057, + "grad_norm": 0.3413083553314209, + "learning_rate": 0.00011175606649423569, + "loss": 1.3262, + "step": 33963 + }, + { + "epoch": 0.4413466895587215, + "grad_norm": 0.3298988342285156, + "learning_rate": 0.00011175346703232433, + "loss": 1.3523, + "step": 33964 + }, + { + "epoch": 0.44135968410263743, + "grad_norm": 0.4547967314720154, + "learning_rate": 0.00011175086757041294, + "loss": 1.3906, + "step": 33965 + }, + { + "epoch": 0.4413726786465533, + "grad_norm": 0.3494797646999359, + "learning_rate": 0.00011174826810850155, + "loss": 1.5019, + "step": 33966 + }, + { + "epoch": 0.4413856731904692, + "grad_norm": 0.4248200058937073, + "learning_rate": 0.00011174566864659016, + "loss": 1.5683, + "step": 33967 + }, + { + "epoch": 0.441398667734385, + "grad_norm": 0.3502233326435089, + "learning_rate": 0.00011174306918467878, + "loss": 1.3526, + "step": 33968 + }, + { + "epoch": 0.4414116622783009, + "grad_norm": 0.41335996985435486, + "learning_rate": 0.0001117404697227674, + "loss": 1.4993, + "step": 33969 + }, + { + "epoch": 0.44142465682221677, + "grad_norm": 0.43141835927963257, + "learning_rate": 0.000111737870260856, + "loss": 1.2899, + "step": 33970 + }, + { + "epoch": 0.44143765136613267, + "grad_norm": 0.4510093927383423, + "learning_rate": 0.00011173527079894462, + "loss": 1.373, + "step": 33971 + }, + { + "epoch": 0.4414506459100485, + "grad_norm": 0.42366617918014526, + "learning_rate": 0.00011173267133703324, + "loss": 1.3472, + "step": 33972 + }, + { + "epoch": 0.4414636404539644, + "grad_norm": 0.35545095801353455, + "learning_rate": 0.00011173007187512185, + "loss": 1.3568, + "step": 33973 + }, + { + "epoch": 0.44147663499788026, + "grad_norm": 0.42061683535575867, + "learning_rate": 0.00011172747241321046, + "loss": 1.3641, + "step": 33974 + }, + { + "epoch": 0.44148962954179616, + "grad_norm": 0.4463011920452118, + "learning_rate": 0.00011172487295129907, + "loss": 1.6497, + "step": 33975 + }, + { + "epoch": 0.441502624085712, + "grad_norm": 0.4948456585407257, + "learning_rate": 0.00011172227348938771, + "loss": 1.4132, + "step": 33976 + }, + { + "epoch": 0.4415156186296279, + "grad_norm": 0.46984952688217163, + "learning_rate": 0.00011171967402747632, + "loss": 1.6391, + "step": 33977 + }, + { + "epoch": 0.44152861317354375, + "grad_norm": 0.36725401878356934, + "learning_rate": 0.00011171707456556493, + "loss": 1.3872, + "step": 33978 + }, + { + "epoch": 0.44154160771745965, + "grad_norm": 0.28851690888404846, + "learning_rate": 0.00011171447510365356, + "loss": 1.2723, + "step": 33979 + }, + { + "epoch": 0.4415546022613755, + "grad_norm": 0.42832598090171814, + "learning_rate": 0.00011171187564174217, + "loss": 1.4838, + "step": 33980 + }, + { + "epoch": 0.4415675968052914, + "grad_norm": 0.4003250002861023, + "learning_rate": 0.00011170927617983078, + "loss": 1.422, + "step": 33981 + }, + { + "epoch": 0.44158059134920724, + "grad_norm": 0.41524478793144226, + "learning_rate": 0.00011170667671791939, + "loss": 1.4542, + "step": 33982 + }, + { + "epoch": 0.44159358589312314, + "grad_norm": 0.3334995210170746, + "learning_rate": 0.00011170407725600803, + "loss": 1.2805, + "step": 33983 + }, + { + "epoch": 0.441606580437039, + "grad_norm": 0.3070078194141388, + "learning_rate": 0.00011170147779409663, + "loss": 1.4513, + "step": 33984 + }, + { + "epoch": 0.4416195749809549, + "grad_norm": 0.4811932146549225, + "learning_rate": 0.00011169887833218524, + "loss": 1.4415, + "step": 33985 + }, + { + "epoch": 0.44163256952487073, + "grad_norm": 0.4797988831996918, + "learning_rate": 0.00011169627887027385, + "loss": 1.4277, + "step": 33986 + }, + { + "epoch": 0.44164556406878663, + "grad_norm": 0.3617285490036011, + "learning_rate": 0.00011169367940836249, + "loss": 1.2887, + "step": 33987 + }, + { + "epoch": 0.4416585586127025, + "grad_norm": 0.5502768158912659, + "learning_rate": 0.0001116910799464511, + "loss": 1.4889, + "step": 33988 + }, + { + "epoch": 0.4416715531566184, + "grad_norm": 0.337211549282074, + "learning_rate": 0.00011168848048453971, + "loss": 1.4075, + "step": 33989 + }, + { + "epoch": 0.4416845477005342, + "grad_norm": 0.37986770272254944, + "learning_rate": 0.00011168588102262832, + "loss": 1.4778, + "step": 33990 + }, + { + "epoch": 0.4416975422444501, + "grad_norm": 0.42858952283859253, + "learning_rate": 0.00011168328156071694, + "loss": 1.3399, + "step": 33991 + }, + { + "epoch": 0.44171053678836597, + "grad_norm": 0.3652910590171814, + "learning_rate": 0.00011168068209880555, + "loss": 1.3614, + "step": 33992 + }, + { + "epoch": 0.44172353133228187, + "grad_norm": 0.43751946091651917, + "learning_rate": 0.00011167808263689416, + "loss": 1.501, + "step": 33993 + }, + { + "epoch": 0.4417365258761977, + "grad_norm": 0.4212469160556793, + "learning_rate": 0.00011167548317498278, + "loss": 1.3336, + "step": 33994 + }, + { + "epoch": 0.4417495204201136, + "grad_norm": 0.336783230304718, + "learning_rate": 0.00011167288371307141, + "loss": 1.2714, + "step": 33995 + }, + { + "epoch": 0.44176251496402946, + "grad_norm": 0.41767245531082153, + "learning_rate": 0.00011167028425116002, + "loss": 1.339, + "step": 33996 + }, + { + "epoch": 0.44177550950794536, + "grad_norm": 0.46003207564353943, + "learning_rate": 0.00011166768478924862, + "loss": 1.4393, + "step": 33997 + }, + { + "epoch": 0.4417885040518612, + "grad_norm": 0.411734014749527, + "learning_rate": 0.00011166508532733723, + "loss": 1.4603, + "step": 33998 + }, + { + "epoch": 0.4418014985957771, + "grad_norm": 0.44157516956329346, + "learning_rate": 0.00011166248586542587, + "loss": 1.4364, + "step": 33999 + }, + { + "epoch": 0.44181449313969295, + "grad_norm": 0.39477619528770447, + "learning_rate": 0.00011165988640351448, + "loss": 1.441, + "step": 34000 + }, + { + "epoch": 0.44182748768360885, + "grad_norm": 0.4544408619403839, + "learning_rate": 0.00011165728694160309, + "loss": 1.4328, + "step": 34001 + }, + { + "epoch": 0.4418404822275247, + "grad_norm": 0.3563541769981384, + "learning_rate": 0.0001116546874796917, + "loss": 1.6331, + "step": 34002 + }, + { + "epoch": 0.4418534767714406, + "grad_norm": 0.40583398938179016, + "learning_rate": 0.00011165208801778033, + "loss": 1.6264, + "step": 34003 + }, + { + "epoch": 0.44186647131535645, + "grad_norm": 0.4503598213195801, + "learning_rate": 0.00011164948855586894, + "loss": 1.4198, + "step": 34004 + }, + { + "epoch": 0.44187946585927235, + "grad_norm": 0.352499783039093, + "learning_rate": 0.00011164688909395755, + "loss": 1.3936, + "step": 34005 + }, + { + "epoch": 0.4418924604031882, + "grad_norm": 0.4193965792655945, + "learning_rate": 0.00011164428963204616, + "loss": 1.3939, + "step": 34006 + }, + { + "epoch": 0.4419054549471041, + "grad_norm": 0.41913798451423645, + "learning_rate": 0.0001116416901701348, + "loss": 1.3813, + "step": 34007 + }, + { + "epoch": 0.44191844949101994, + "grad_norm": 0.46930497884750366, + "learning_rate": 0.00011163909070822341, + "loss": 1.5188, + "step": 34008 + }, + { + "epoch": 0.44193144403493584, + "grad_norm": 0.43586206436157227, + "learning_rate": 0.00011163649124631202, + "loss": 1.4712, + "step": 34009 + }, + { + "epoch": 0.4419444385788517, + "grad_norm": 0.48863834142684937, + "learning_rate": 0.00011163389178440062, + "loss": 1.557, + "step": 34010 + }, + { + "epoch": 0.4419574331227676, + "grad_norm": 0.3437492549419403, + "learning_rate": 0.00011163129232248926, + "loss": 1.2922, + "step": 34011 + }, + { + "epoch": 0.44197042766668343, + "grad_norm": 0.36360836029052734, + "learning_rate": 0.00011162869286057787, + "loss": 1.3329, + "step": 34012 + }, + { + "epoch": 0.44198342221059933, + "grad_norm": 0.4119648337364197, + "learning_rate": 0.00011162609339866648, + "loss": 1.4021, + "step": 34013 + }, + { + "epoch": 0.4419964167545152, + "grad_norm": 0.4602004885673523, + "learning_rate": 0.0001116234939367551, + "loss": 1.3554, + "step": 34014 + }, + { + "epoch": 0.4420094112984311, + "grad_norm": 0.34458449482917786, + "learning_rate": 0.00011162089447484371, + "loss": 1.3315, + "step": 34015 + }, + { + "epoch": 0.4420224058423469, + "grad_norm": 0.448383092880249, + "learning_rate": 0.00011161829501293232, + "loss": 1.4859, + "step": 34016 + }, + { + "epoch": 0.4420354003862628, + "grad_norm": 0.4149613380432129, + "learning_rate": 0.00011161569555102094, + "loss": 1.4072, + "step": 34017 + }, + { + "epoch": 0.44204839493017867, + "grad_norm": 0.41026368737220764, + "learning_rate": 0.00011161309608910957, + "loss": 1.3627, + "step": 34018 + }, + { + "epoch": 0.44206138947409457, + "grad_norm": 0.4544207751750946, + "learning_rate": 0.00011161049662719818, + "loss": 1.3649, + "step": 34019 + }, + { + "epoch": 0.4420743840180104, + "grad_norm": 0.408314049243927, + "learning_rate": 0.0001116078971652868, + "loss": 1.6069, + "step": 34020 + }, + { + "epoch": 0.4420873785619263, + "grad_norm": 0.5312645435333252, + "learning_rate": 0.0001116052977033754, + "loss": 1.4035, + "step": 34021 + }, + { + "epoch": 0.44210037310584216, + "grad_norm": 0.4624485969543457, + "learning_rate": 0.00011160269824146403, + "loss": 1.4847, + "step": 34022 + }, + { + "epoch": 0.44211336764975806, + "grad_norm": 0.5031031966209412, + "learning_rate": 0.00011160009877955264, + "loss": 1.5948, + "step": 34023 + }, + { + "epoch": 0.4421263621936739, + "grad_norm": 0.37778013944625854, + "learning_rate": 0.00011159749931764125, + "loss": 1.5047, + "step": 34024 + }, + { + "epoch": 0.4421393567375898, + "grad_norm": 0.3765183091163635, + "learning_rate": 0.00011159489985572986, + "loss": 1.3389, + "step": 34025 + }, + { + "epoch": 0.44215235128150565, + "grad_norm": 0.4213264286518097, + "learning_rate": 0.00011159230039381849, + "loss": 1.5316, + "step": 34026 + }, + { + "epoch": 0.44216534582542155, + "grad_norm": 0.4199904501438141, + "learning_rate": 0.0001115897009319071, + "loss": 1.4879, + "step": 34027 + }, + { + "epoch": 0.4421783403693374, + "grad_norm": 0.46864089369773865, + "learning_rate": 0.00011158710146999571, + "loss": 1.4932, + "step": 34028 + }, + { + "epoch": 0.4421913349132533, + "grad_norm": 0.4181559383869171, + "learning_rate": 0.00011158450200808432, + "loss": 1.6198, + "step": 34029 + }, + { + "epoch": 0.4422043294571692, + "grad_norm": 0.4206635653972626, + "learning_rate": 0.00011158190254617296, + "loss": 1.3485, + "step": 34030 + }, + { + "epoch": 0.44221732400108504, + "grad_norm": 0.4653114080429077, + "learning_rate": 0.00011157930308426157, + "loss": 1.4887, + "step": 34031 + }, + { + "epoch": 0.44223031854500094, + "grad_norm": 0.42721882462501526, + "learning_rate": 0.00011157670362235018, + "loss": 1.3516, + "step": 34032 + }, + { + "epoch": 0.4422433130889168, + "grad_norm": 0.32499122619628906, + "learning_rate": 0.00011157410416043879, + "loss": 1.3322, + "step": 34033 + }, + { + "epoch": 0.4422563076328327, + "grad_norm": 0.35488706827163696, + "learning_rate": 0.00011157150469852742, + "loss": 1.5073, + "step": 34034 + }, + { + "epoch": 0.44226930217674854, + "grad_norm": 0.34038349986076355, + "learning_rate": 0.00011156890523661603, + "loss": 1.4003, + "step": 34035 + }, + { + "epoch": 0.44228229672066444, + "grad_norm": 0.4432404041290283, + "learning_rate": 0.00011156630577470464, + "loss": 1.3601, + "step": 34036 + }, + { + "epoch": 0.4422952912645803, + "grad_norm": 0.48277029395103455, + "learning_rate": 0.00011156370631279325, + "loss": 1.5142, + "step": 34037 + }, + { + "epoch": 0.4423082858084962, + "grad_norm": 0.3559846878051758, + "learning_rate": 0.00011156110685088189, + "loss": 1.3681, + "step": 34038 + }, + { + "epoch": 0.442321280352412, + "grad_norm": 0.44590258598327637, + "learning_rate": 0.00011155850738897048, + "loss": 1.3953, + "step": 34039 + }, + { + "epoch": 0.44233427489632793, + "grad_norm": 0.3752189576625824, + "learning_rate": 0.0001115559079270591, + "loss": 1.2726, + "step": 34040 + }, + { + "epoch": 0.4423472694402438, + "grad_norm": 0.32315489649772644, + "learning_rate": 0.0001115533084651477, + "loss": 1.4106, + "step": 34041 + }, + { + "epoch": 0.4423602639841597, + "grad_norm": 0.4178207218647003, + "learning_rate": 0.00011155070900323634, + "loss": 1.2862, + "step": 34042 + }, + { + "epoch": 0.4423732585280755, + "grad_norm": 0.41131237149238586, + "learning_rate": 0.00011154810954132495, + "loss": 1.3821, + "step": 34043 + }, + { + "epoch": 0.4423862530719914, + "grad_norm": 0.4077105224132538, + "learning_rate": 0.00011154551007941357, + "loss": 1.4065, + "step": 34044 + }, + { + "epoch": 0.44239924761590727, + "grad_norm": 0.42594483494758606, + "learning_rate": 0.00011154291061750218, + "loss": 1.3171, + "step": 34045 + }, + { + "epoch": 0.44241224215982317, + "grad_norm": 0.38548025488853455, + "learning_rate": 0.0001115403111555908, + "loss": 1.3482, + "step": 34046 + }, + { + "epoch": 0.442425236703739, + "grad_norm": 0.4321030080318451, + "learning_rate": 0.00011153771169367941, + "loss": 1.3745, + "step": 34047 + }, + { + "epoch": 0.4424382312476549, + "grad_norm": 0.39287373423576355, + "learning_rate": 0.00011153511223176802, + "loss": 1.1635, + "step": 34048 + }, + { + "epoch": 0.44245122579157076, + "grad_norm": 0.44399556517601013, + "learning_rate": 0.00011153251276985663, + "loss": 1.4099, + "step": 34049 + }, + { + "epoch": 0.44246422033548666, + "grad_norm": 0.3703921139240265, + "learning_rate": 0.00011152991330794527, + "loss": 1.3196, + "step": 34050 + }, + { + "epoch": 0.4424772148794025, + "grad_norm": 0.37313687801361084, + "learning_rate": 0.00011152731384603388, + "loss": 1.3712, + "step": 34051 + }, + { + "epoch": 0.4424902094233184, + "grad_norm": 0.3553673028945923, + "learning_rate": 0.00011152471438412248, + "loss": 1.2903, + "step": 34052 + }, + { + "epoch": 0.44250320396723425, + "grad_norm": 0.4691884517669678, + "learning_rate": 0.00011152211492221112, + "loss": 1.3274, + "step": 34053 + }, + { + "epoch": 0.44251619851115015, + "grad_norm": 0.36158594489097595, + "learning_rate": 0.00011151951546029973, + "loss": 1.1861, + "step": 34054 + }, + { + "epoch": 0.442529193055066, + "grad_norm": 0.4169381856918335, + "learning_rate": 0.00011151691599838834, + "loss": 1.1811, + "step": 34055 + }, + { + "epoch": 0.4425421875989819, + "grad_norm": 0.4420263469219208, + "learning_rate": 0.00011151431653647695, + "loss": 1.523, + "step": 34056 + }, + { + "epoch": 0.44255518214289774, + "grad_norm": 0.48514723777770996, + "learning_rate": 0.00011151171707456558, + "loss": 1.3097, + "step": 34057 + }, + { + "epoch": 0.44256817668681364, + "grad_norm": 0.4264439642429352, + "learning_rate": 0.00011150911761265419, + "loss": 1.2568, + "step": 34058 + }, + { + "epoch": 0.4425811712307295, + "grad_norm": 0.34638628363609314, + "learning_rate": 0.0001115065181507428, + "loss": 1.26, + "step": 34059 + }, + { + "epoch": 0.4425941657746454, + "grad_norm": 0.3641755282878876, + "learning_rate": 0.00011150391868883141, + "loss": 1.2437, + "step": 34060 + }, + { + "epoch": 0.44260716031856123, + "grad_norm": 0.3337748050689697, + "learning_rate": 0.00011150131922692005, + "loss": 1.5411, + "step": 34061 + }, + { + "epoch": 0.44262015486247713, + "grad_norm": 0.42045578360557556, + "learning_rate": 0.00011149871976500866, + "loss": 1.5602, + "step": 34062 + }, + { + "epoch": 0.442633149406393, + "grad_norm": 0.3415861427783966, + "learning_rate": 0.00011149612030309727, + "loss": 1.4709, + "step": 34063 + }, + { + "epoch": 0.4426461439503089, + "grad_norm": 0.4140772521495819, + "learning_rate": 0.00011149352084118587, + "loss": 1.5819, + "step": 34064 + }, + { + "epoch": 0.4426591384942247, + "grad_norm": 0.34868860244750977, + "learning_rate": 0.0001114909213792745, + "loss": 1.4697, + "step": 34065 + }, + { + "epoch": 0.4426721330381406, + "grad_norm": 0.4092274010181427, + "learning_rate": 0.00011148832191736311, + "loss": 1.4222, + "step": 34066 + }, + { + "epoch": 0.44268512758205647, + "grad_norm": 0.4174451529979706, + "learning_rate": 0.00011148572245545173, + "loss": 1.5791, + "step": 34067 + }, + { + "epoch": 0.44269812212597237, + "grad_norm": 0.4202478229999542, + "learning_rate": 0.00011148312299354034, + "loss": 1.3845, + "step": 34068 + }, + { + "epoch": 0.4427111166698882, + "grad_norm": 0.4580093026161194, + "learning_rate": 0.00011148052353162896, + "loss": 1.4447, + "step": 34069 + }, + { + "epoch": 0.4427241112138041, + "grad_norm": 0.4523022472858429, + "learning_rate": 0.00011147792406971757, + "loss": 1.3951, + "step": 34070 + }, + { + "epoch": 0.44273710575771996, + "grad_norm": 0.4134688079357147, + "learning_rate": 0.00011147532460780618, + "loss": 1.3314, + "step": 34071 + }, + { + "epoch": 0.44275010030163586, + "grad_norm": 0.42702803015708923, + "learning_rate": 0.0001114727251458948, + "loss": 1.402, + "step": 34072 + }, + { + "epoch": 0.4427630948455517, + "grad_norm": 0.4308590590953827, + "learning_rate": 0.00011147012568398343, + "loss": 1.3203, + "step": 34073 + }, + { + "epoch": 0.4427760893894676, + "grad_norm": 0.4127539098262787, + "learning_rate": 0.00011146752622207204, + "loss": 1.3437, + "step": 34074 + }, + { + "epoch": 0.44278908393338345, + "grad_norm": 0.2835022807121277, + "learning_rate": 0.00011146492676016065, + "loss": 1.37, + "step": 34075 + }, + { + "epoch": 0.44280207847729935, + "grad_norm": 0.3778228461742401, + "learning_rate": 0.00011146232729824926, + "loss": 1.4776, + "step": 34076 + }, + { + "epoch": 0.4428150730212152, + "grad_norm": 0.44808027148246765, + "learning_rate": 0.00011145972783633789, + "loss": 1.4628, + "step": 34077 + }, + { + "epoch": 0.4428280675651311, + "grad_norm": 0.3863624632358551, + "learning_rate": 0.0001114571283744265, + "loss": 1.1686, + "step": 34078 + }, + { + "epoch": 0.44284106210904695, + "grad_norm": 0.3555189371109009, + "learning_rate": 0.00011145452891251511, + "loss": 1.6037, + "step": 34079 + }, + { + "epoch": 0.44285405665296285, + "grad_norm": 0.3349853754043579, + "learning_rate": 0.00011145192945060372, + "loss": 1.1066, + "step": 34080 + }, + { + "epoch": 0.4428670511968787, + "grad_norm": 0.3734571039676666, + "learning_rate": 0.00011144932998869235, + "loss": 1.4559, + "step": 34081 + }, + { + "epoch": 0.4428800457407946, + "grad_norm": 0.38879725337028503, + "learning_rate": 0.00011144673052678096, + "loss": 1.3851, + "step": 34082 + }, + { + "epoch": 0.44289304028471044, + "grad_norm": 0.43281811475753784, + "learning_rate": 0.00011144413106486957, + "loss": 1.3811, + "step": 34083 + }, + { + "epoch": 0.44290603482862634, + "grad_norm": 0.4733874797821045, + "learning_rate": 0.00011144153160295818, + "loss": 1.5529, + "step": 34084 + }, + { + "epoch": 0.4429190293725422, + "grad_norm": 0.40559878945350647, + "learning_rate": 0.00011143893214104682, + "loss": 1.5049, + "step": 34085 + }, + { + "epoch": 0.4429320239164581, + "grad_norm": 0.3868595361709595, + "learning_rate": 0.00011143633267913543, + "loss": 1.2088, + "step": 34086 + }, + { + "epoch": 0.44294501846037393, + "grad_norm": 0.3120768666267395, + "learning_rate": 0.00011143373321722404, + "loss": 1.4182, + "step": 34087 + }, + { + "epoch": 0.44295801300428983, + "grad_norm": 0.341911643743515, + "learning_rate": 0.00011143113375531266, + "loss": 1.222, + "step": 34088 + }, + { + "epoch": 0.4429710075482057, + "grad_norm": 0.4585247337818146, + "learning_rate": 0.00011142853429340127, + "loss": 1.3681, + "step": 34089 + }, + { + "epoch": 0.4429840020921216, + "grad_norm": 0.4295271933078766, + "learning_rate": 0.00011142593483148989, + "loss": 1.3337, + "step": 34090 + }, + { + "epoch": 0.4429969966360374, + "grad_norm": 0.3395138382911682, + "learning_rate": 0.0001114233353695785, + "loss": 1.4285, + "step": 34091 + }, + { + "epoch": 0.4430099911799533, + "grad_norm": 0.4354317784309387, + "learning_rate": 0.00011142073590766713, + "loss": 1.2586, + "step": 34092 + }, + { + "epoch": 0.44302298572386917, + "grad_norm": 0.3374302089214325, + "learning_rate": 0.00011141813644575575, + "loss": 1.4206, + "step": 34093 + }, + { + "epoch": 0.44303598026778507, + "grad_norm": 0.43610942363739014, + "learning_rate": 0.00011141553698384434, + "loss": 1.4458, + "step": 34094 + }, + { + "epoch": 0.4430489748117009, + "grad_norm": 0.3810350000858307, + "learning_rate": 0.00011141293752193295, + "loss": 1.3943, + "step": 34095 + }, + { + "epoch": 0.4430619693556168, + "grad_norm": 0.4225288927555084, + "learning_rate": 0.00011141033806002159, + "loss": 1.4428, + "step": 34096 + }, + { + "epoch": 0.44307496389953266, + "grad_norm": 0.3530668318271637, + "learning_rate": 0.0001114077385981102, + "loss": 1.5031, + "step": 34097 + }, + { + "epoch": 0.44308795844344856, + "grad_norm": 0.2999928295612335, + "learning_rate": 0.00011140513913619881, + "loss": 1.1151, + "step": 34098 + }, + { + "epoch": 0.4431009529873644, + "grad_norm": 0.37528547644615173, + "learning_rate": 0.00011140253967428742, + "loss": 1.6368, + "step": 34099 + }, + { + "epoch": 0.4431139475312803, + "grad_norm": 0.3746996223926544, + "learning_rate": 0.00011139994021237605, + "loss": 1.331, + "step": 34100 + }, + { + "epoch": 0.44312694207519615, + "grad_norm": 0.4597117006778717, + "learning_rate": 0.00011139734075046466, + "loss": 1.3023, + "step": 34101 + }, + { + "epoch": 0.44313993661911205, + "grad_norm": 0.4087245762348175, + "learning_rate": 0.00011139474128855327, + "loss": 1.4536, + "step": 34102 + }, + { + "epoch": 0.4431529311630279, + "grad_norm": 0.365296334028244, + "learning_rate": 0.00011139214182664188, + "loss": 1.3955, + "step": 34103 + }, + { + "epoch": 0.4431659257069438, + "grad_norm": 0.37722986936569214, + "learning_rate": 0.00011138954236473052, + "loss": 1.4297, + "step": 34104 + }, + { + "epoch": 0.44317892025085964, + "grad_norm": 0.3135521113872528, + "learning_rate": 0.00011138694290281913, + "loss": 1.2679, + "step": 34105 + }, + { + "epoch": 0.44319191479477554, + "grad_norm": 0.4270419180393219, + "learning_rate": 0.00011138434344090773, + "loss": 1.3106, + "step": 34106 + }, + { + "epoch": 0.44320490933869144, + "grad_norm": 0.42852821946144104, + "learning_rate": 0.00011138174397899634, + "loss": 1.5136, + "step": 34107 + }, + { + "epoch": 0.4432179038826073, + "grad_norm": 0.4589688777923584, + "learning_rate": 0.00011137914451708498, + "loss": 1.2346, + "step": 34108 + }, + { + "epoch": 0.4432308984265232, + "grad_norm": 0.30132973194122314, + "learning_rate": 0.00011137654505517359, + "loss": 1.2054, + "step": 34109 + }, + { + "epoch": 0.44324389297043904, + "grad_norm": 0.4193493723869324, + "learning_rate": 0.0001113739455932622, + "loss": 1.2162, + "step": 34110 + }, + { + "epoch": 0.44325688751435494, + "grad_norm": 0.49219849705696106, + "learning_rate": 0.00011137134613135081, + "loss": 1.5356, + "step": 34111 + }, + { + "epoch": 0.4432698820582708, + "grad_norm": 0.2943589687347412, + "learning_rate": 0.00011136874666943943, + "loss": 1.2607, + "step": 34112 + }, + { + "epoch": 0.4432828766021867, + "grad_norm": 0.4865548610687256, + "learning_rate": 0.00011136614720752805, + "loss": 1.4802, + "step": 34113 + }, + { + "epoch": 0.4432958711461025, + "grad_norm": 0.5202329754829407, + "learning_rate": 0.00011136354774561666, + "loss": 1.5178, + "step": 34114 + }, + { + "epoch": 0.4433088656900184, + "grad_norm": 0.40417933464050293, + "learning_rate": 0.00011136094828370527, + "loss": 1.3158, + "step": 34115 + }, + { + "epoch": 0.4433218602339343, + "grad_norm": 0.39617106318473816, + "learning_rate": 0.0001113583488217939, + "loss": 1.3696, + "step": 34116 + }, + { + "epoch": 0.4433348547778502, + "grad_norm": 0.46501678228378296, + "learning_rate": 0.00011135574935988252, + "loss": 1.3862, + "step": 34117 + }, + { + "epoch": 0.443347849321766, + "grad_norm": 0.3566831946372986, + "learning_rate": 0.00011135314989797113, + "loss": 1.2796, + "step": 34118 + }, + { + "epoch": 0.4433608438656819, + "grad_norm": 0.4189980924129486, + "learning_rate": 0.00011135055043605972, + "loss": 1.2754, + "step": 34119 + }, + { + "epoch": 0.44337383840959776, + "grad_norm": 0.3705172538757324, + "learning_rate": 0.00011134795097414836, + "loss": 1.5568, + "step": 34120 + }, + { + "epoch": 0.44338683295351367, + "grad_norm": 0.26607921719551086, + "learning_rate": 0.00011134535151223697, + "loss": 1.2465, + "step": 34121 + }, + { + "epoch": 0.4433998274974295, + "grad_norm": 0.3442211449146271, + "learning_rate": 0.00011134275205032558, + "loss": 1.4224, + "step": 34122 + }, + { + "epoch": 0.4434128220413454, + "grad_norm": 0.3846757411956787, + "learning_rate": 0.0001113401525884142, + "loss": 1.3092, + "step": 34123 + }, + { + "epoch": 0.44342581658526126, + "grad_norm": 0.39371925592422485, + "learning_rate": 0.00011133755312650282, + "loss": 1.2766, + "step": 34124 + }, + { + "epoch": 0.44343881112917716, + "grad_norm": 0.42275723814964294, + "learning_rate": 0.00011133495366459143, + "loss": 1.4954, + "step": 34125 + }, + { + "epoch": 0.443451805673093, + "grad_norm": 0.33947134017944336, + "learning_rate": 0.00011133235420268004, + "loss": 1.3359, + "step": 34126 + }, + { + "epoch": 0.4434648002170089, + "grad_norm": 0.3975488543510437, + "learning_rate": 0.00011132975474076868, + "loss": 1.2365, + "step": 34127 + }, + { + "epoch": 0.44347779476092475, + "grad_norm": 0.3726840019226074, + "learning_rate": 0.00011132715527885729, + "loss": 1.5712, + "step": 34128 + }, + { + "epoch": 0.44349078930484065, + "grad_norm": 0.4731629192829132, + "learning_rate": 0.0001113245558169459, + "loss": 1.432, + "step": 34129 + }, + { + "epoch": 0.4435037838487565, + "grad_norm": 0.422348290681839, + "learning_rate": 0.00011132195635503451, + "loss": 1.5213, + "step": 34130 + }, + { + "epoch": 0.4435167783926724, + "grad_norm": 0.5347578525543213, + "learning_rate": 0.00011131935689312314, + "loss": 1.2417, + "step": 34131 + }, + { + "epoch": 0.44352977293658824, + "grad_norm": 0.46815255284309387, + "learning_rate": 0.00011131675743121175, + "loss": 1.4664, + "step": 34132 + }, + { + "epoch": 0.44354276748050414, + "grad_norm": 0.3828120529651642, + "learning_rate": 0.00011131415796930036, + "loss": 1.372, + "step": 34133 + }, + { + "epoch": 0.44355576202442, + "grad_norm": 0.4296015501022339, + "learning_rate": 0.00011131155850738897, + "loss": 1.496, + "step": 34134 + }, + { + "epoch": 0.4435687565683359, + "grad_norm": 0.30861225724220276, + "learning_rate": 0.00011130895904547761, + "loss": 1.1474, + "step": 34135 + }, + { + "epoch": 0.44358175111225173, + "grad_norm": 0.4997391700744629, + "learning_rate": 0.0001113063595835662, + "loss": 1.322, + "step": 34136 + }, + { + "epoch": 0.44359474565616763, + "grad_norm": 0.43963226675987244, + "learning_rate": 0.00011130376012165482, + "loss": 1.3161, + "step": 34137 + }, + { + "epoch": 0.4436077402000835, + "grad_norm": 0.42827117443084717, + "learning_rate": 0.00011130116065974343, + "loss": 1.4298, + "step": 34138 + }, + { + "epoch": 0.4436207347439994, + "grad_norm": 0.389369934797287, + "learning_rate": 0.00011129856119783207, + "loss": 1.5127, + "step": 34139 + }, + { + "epoch": 0.4436337292879152, + "grad_norm": 0.36223915219306946, + "learning_rate": 0.00011129596173592068, + "loss": 1.3228, + "step": 34140 + }, + { + "epoch": 0.4436467238318311, + "grad_norm": 0.415510892868042, + "learning_rate": 0.00011129336227400929, + "loss": 1.4097, + "step": 34141 + }, + { + "epoch": 0.44365971837574697, + "grad_norm": 0.44629859924316406, + "learning_rate": 0.0001112907628120979, + "loss": 1.3878, + "step": 34142 + }, + { + "epoch": 0.44367271291966287, + "grad_norm": 0.4795943796634674, + "learning_rate": 0.00011128816335018652, + "loss": 1.4461, + "step": 34143 + }, + { + "epoch": 0.4436857074635787, + "grad_norm": 0.4738703668117523, + "learning_rate": 0.00011128556388827513, + "loss": 1.4526, + "step": 34144 + }, + { + "epoch": 0.4436987020074946, + "grad_norm": 0.3595874309539795, + "learning_rate": 0.00011128296442636374, + "loss": 1.3764, + "step": 34145 + }, + { + "epoch": 0.44371169655141046, + "grad_norm": 0.32588106393814087, + "learning_rate": 0.00011128036496445236, + "loss": 1.3746, + "step": 34146 + }, + { + "epoch": 0.44372469109532636, + "grad_norm": 0.4356873035430908, + "learning_rate": 0.00011127776550254099, + "loss": 1.5584, + "step": 34147 + }, + { + "epoch": 0.4437376856392422, + "grad_norm": 0.35701823234558105, + "learning_rate": 0.00011127516604062959, + "loss": 1.1765, + "step": 34148 + }, + { + "epoch": 0.4437506801831581, + "grad_norm": 0.5933940410614014, + "learning_rate": 0.0001112725665787182, + "loss": 1.3891, + "step": 34149 + }, + { + "epoch": 0.44376367472707395, + "grad_norm": 0.31582701206207275, + "learning_rate": 0.00011126996711680681, + "loss": 1.3308, + "step": 34150 + }, + { + "epoch": 0.44377666927098985, + "grad_norm": 0.3645959198474884, + "learning_rate": 0.00011126736765489545, + "loss": 1.2528, + "step": 34151 + }, + { + "epoch": 0.4437896638149057, + "grad_norm": 0.39703112840652466, + "learning_rate": 0.00011126476819298406, + "loss": 1.3323, + "step": 34152 + }, + { + "epoch": 0.4438026583588216, + "grad_norm": 0.38320615887641907, + "learning_rate": 0.00011126216873107267, + "loss": 1.2923, + "step": 34153 + }, + { + "epoch": 0.44381565290273745, + "grad_norm": 0.3426392078399658, + "learning_rate": 0.00011125956926916128, + "loss": 1.2773, + "step": 34154 + }, + { + "epoch": 0.44382864744665335, + "grad_norm": 0.4377420246601105, + "learning_rate": 0.00011125696980724991, + "loss": 1.2048, + "step": 34155 + }, + { + "epoch": 0.4438416419905692, + "grad_norm": 0.4587497115135193, + "learning_rate": 0.00011125437034533852, + "loss": 1.3823, + "step": 34156 + }, + { + "epoch": 0.4438546365344851, + "grad_norm": 0.3848799169063568, + "learning_rate": 0.00011125177088342713, + "loss": 1.3896, + "step": 34157 + }, + { + "epoch": 0.44386763107840094, + "grad_norm": 0.28870266675949097, + "learning_rate": 0.00011124917142151574, + "loss": 1.1698, + "step": 34158 + }, + { + "epoch": 0.44388062562231684, + "grad_norm": 0.34361258149147034, + "learning_rate": 0.00011124657195960438, + "loss": 1.3781, + "step": 34159 + }, + { + "epoch": 0.4438936201662327, + "grad_norm": 0.4734368622303009, + "learning_rate": 0.00011124397249769299, + "loss": 1.2124, + "step": 34160 + }, + { + "epoch": 0.4439066147101486, + "grad_norm": 0.4326978921890259, + "learning_rate": 0.00011124137303578159, + "loss": 1.5378, + "step": 34161 + }, + { + "epoch": 0.44391960925406443, + "grad_norm": 0.40271615982055664, + "learning_rate": 0.00011123877357387022, + "loss": 1.3707, + "step": 34162 + }, + { + "epoch": 0.44393260379798033, + "grad_norm": 0.41837170720100403, + "learning_rate": 0.00011123617411195884, + "loss": 1.4119, + "step": 34163 + }, + { + "epoch": 0.4439455983418962, + "grad_norm": 0.37991392612457275, + "learning_rate": 0.00011123357465004745, + "loss": 1.3521, + "step": 34164 + }, + { + "epoch": 0.4439585928858121, + "grad_norm": 0.5376604199409485, + "learning_rate": 0.00011123097518813606, + "loss": 1.3996, + "step": 34165 + }, + { + "epoch": 0.4439715874297279, + "grad_norm": 0.3981701135635376, + "learning_rate": 0.00011122837572622468, + "loss": 1.4386, + "step": 34166 + }, + { + "epoch": 0.4439845819736438, + "grad_norm": 0.3578491508960724, + "learning_rate": 0.00011122577626431329, + "loss": 1.5333, + "step": 34167 + }, + { + "epoch": 0.44399757651755967, + "grad_norm": 0.42328494787216187, + "learning_rate": 0.0001112231768024019, + "loss": 1.5149, + "step": 34168 + }, + { + "epoch": 0.44401057106147557, + "grad_norm": 0.35145577788352966, + "learning_rate": 0.00011122057734049051, + "loss": 1.3048, + "step": 34169 + }, + { + "epoch": 0.4440235656053914, + "grad_norm": 0.5234702825546265, + "learning_rate": 0.00011121797787857915, + "loss": 1.4279, + "step": 34170 + }, + { + "epoch": 0.4440365601493073, + "grad_norm": 0.3667518198490143, + "learning_rate": 0.00011121537841666776, + "loss": 1.4894, + "step": 34171 + }, + { + "epoch": 0.44404955469322316, + "grad_norm": 0.31042590737342834, + "learning_rate": 0.00011121277895475637, + "loss": 1.4751, + "step": 34172 + }, + { + "epoch": 0.44406254923713906, + "grad_norm": 0.4522731304168701, + "learning_rate": 0.00011121017949284499, + "loss": 1.3506, + "step": 34173 + }, + { + "epoch": 0.4440755437810549, + "grad_norm": 0.41778403520584106, + "learning_rate": 0.00011120758003093361, + "loss": 1.3166, + "step": 34174 + }, + { + "epoch": 0.4440885383249708, + "grad_norm": 0.33895090222358704, + "learning_rate": 0.00011120498056902222, + "loss": 1.3856, + "step": 34175 + }, + { + "epoch": 0.44410153286888665, + "grad_norm": 0.5224469900131226, + "learning_rate": 0.00011120238110711083, + "loss": 1.537, + "step": 34176 + }, + { + "epoch": 0.44411452741280255, + "grad_norm": 0.37412944436073303, + "learning_rate": 0.00011119978164519944, + "loss": 1.2369, + "step": 34177 + }, + { + "epoch": 0.4441275219567184, + "grad_norm": 0.3774406909942627, + "learning_rate": 0.00011119718218328807, + "loss": 1.4191, + "step": 34178 + }, + { + "epoch": 0.4441405165006343, + "grad_norm": 0.3684643805027008, + "learning_rate": 0.00011119458272137668, + "loss": 1.2032, + "step": 34179 + }, + { + "epoch": 0.44415351104455014, + "grad_norm": 0.33266207575798035, + "learning_rate": 0.00011119198325946529, + "loss": 1.2905, + "step": 34180 + }, + { + "epoch": 0.44416650558846604, + "grad_norm": 0.3841158151626587, + "learning_rate": 0.0001111893837975539, + "loss": 1.4892, + "step": 34181 + }, + { + "epoch": 0.44417950013238194, + "grad_norm": 0.46613848209381104, + "learning_rate": 0.00011118678433564254, + "loss": 1.4964, + "step": 34182 + }, + { + "epoch": 0.4441924946762978, + "grad_norm": 0.5643818974494934, + "learning_rate": 0.00011118418487373115, + "loss": 1.4406, + "step": 34183 + }, + { + "epoch": 0.4442054892202137, + "grad_norm": 0.3386726975440979, + "learning_rate": 0.00011118158541181976, + "loss": 1.3488, + "step": 34184 + }, + { + "epoch": 0.44421848376412953, + "grad_norm": 0.40742337703704834, + "learning_rate": 0.00011117898594990837, + "loss": 1.1391, + "step": 34185 + }, + { + "epoch": 0.44423147830804544, + "grad_norm": 0.4673249423503876, + "learning_rate": 0.000111176386487997, + "loss": 1.5867, + "step": 34186 + }, + { + "epoch": 0.4442444728519613, + "grad_norm": 0.45573797821998596, + "learning_rate": 0.0001111737870260856, + "loss": 1.3814, + "step": 34187 + }, + { + "epoch": 0.4442574673958772, + "grad_norm": 0.45553094148635864, + "learning_rate": 0.00011117118756417422, + "loss": 1.4798, + "step": 34188 + }, + { + "epoch": 0.444270461939793, + "grad_norm": 0.47225266695022583, + "learning_rate": 0.00011116858810226283, + "loss": 1.4148, + "step": 34189 + }, + { + "epoch": 0.4442834564837089, + "grad_norm": 0.5917590856552124, + "learning_rate": 0.00011116598864035145, + "loss": 1.4876, + "step": 34190 + }, + { + "epoch": 0.4442964510276248, + "grad_norm": 0.6290504336357117, + "learning_rate": 0.00011116338917844006, + "loss": 1.4817, + "step": 34191 + }, + { + "epoch": 0.4443094455715407, + "grad_norm": 0.4107941687107086, + "learning_rate": 0.00011116078971652867, + "loss": 1.3413, + "step": 34192 + }, + { + "epoch": 0.4443224401154565, + "grad_norm": 0.4469742476940155, + "learning_rate": 0.00011115819025461729, + "loss": 1.2886, + "step": 34193 + }, + { + "epoch": 0.4443354346593724, + "grad_norm": 0.39618921279907227, + "learning_rate": 0.00011115559079270592, + "loss": 1.5528, + "step": 34194 + }, + { + "epoch": 0.44434842920328826, + "grad_norm": 0.4320566654205322, + "learning_rate": 0.00011115299133079453, + "loss": 1.3278, + "step": 34195 + }, + { + "epoch": 0.44436142374720417, + "grad_norm": 0.444646418094635, + "learning_rate": 0.00011115039186888315, + "loss": 1.4441, + "step": 34196 + }, + { + "epoch": 0.44437441829112, + "grad_norm": 0.3444182872772217, + "learning_rate": 0.00011114779240697176, + "loss": 1.5241, + "step": 34197 + }, + { + "epoch": 0.4443874128350359, + "grad_norm": 0.41096407175064087, + "learning_rate": 0.00011114519294506038, + "loss": 1.3688, + "step": 34198 + }, + { + "epoch": 0.44440040737895176, + "grad_norm": 0.36175450682640076, + "learning_rate": 0.00011114259348314899, + "loss": 1.3182, + "step": 34199 + }, + { + "epoch": 0.44441340192286766, + "grad_norm": 0.4703044593334198, + "learning_rate": 0.0001111399940212376, + "loss": 1.368, + "step": 34200 + }, + { + "epoch": 0.4444263964667835, + "grad_norm": 0.40687480568885803, + "learning_rate": 0.00011113739455932624, + "loss": 1.4014, + "step": 34201 + }, + { + "epoch": 0.4444393910106994, + "grad_norm": 0.39292845129966736, + "learning_rate": 0.00011113479509741485, + "loss": 1.5355, + "step": 34202 + }, + { + "epoch": 0.44445238555461525, + "grad_norm": 0.3628689646720886, + "learning_rate": 0.00011113219563550345, + "loss": 1.2313, + "step": 34203 + }, + { + "epoch": 0.44446538009853115, + "grad_norm": 0.40662142634391785, + "learning_rate": 0.00011112959617359206, + "loss": 1.1035, + "step": 34204 + }, + { + "epoch": 0.444478374642447, + "grad_norm": 0.40217074751853943, + "learning_rate": 0.0001111269967116807, + "loss": 1.3639, + "step": 34205 + }, + { + "epoch": 0.4444913691863629, + "grad_norm": 0.3601079285144806, + "learning_rate": 0.00011112439724976931, + "loss": 1.3261, + "step": 34206 + }, + { + "epoch": 0.44450436373027874, + "grad_norm": 0.4247356355190277, + "learning_rate": 0.00011112179778785792, + "loss": 1.3593, + "step": 34207 + }, + { + "epoch": 0.44451735827419464, + "grad_norm": 0.4169764220714569, + "learning_rate": 0.00011111919832594653, + "loss": 1.4128, + "step": 34208 + }, + { + "epoch": 0.4445303528181105, + "grad_norm": 0.4534686207771301, + "learning_rate": 0.00011111659886403516, + "loss": 1.298, + "step": 34209 + }, + { + "epoch": 0.4445433473620264, + "grad_norm": 0.4269981384277344, + "learning_rate": 0.00011111399940212377, + "loss": 1.3738, + "step": 34210 + }, + { + "epoch": 0.44455634190594223, + "grad_norm": 0.4119773507118225, + "learning_rate": 0.00011111139994021238, + "loss": 1.3386, + "step": 34211 + }, + { + "epoch": 0.44456933644985813, + "grad_norm": 0.39342597126960754, + "learning_rate": 0.00011110880047830099, + "loss": 1.463, + "step": 34212 + }, + { + "epoch": 0.444582330993774, + "grad_norm": 0.36418724060058594, + "learning_rate": 0.00011110620101638963, + "loss": 1.4184, + "step": 34213 + }, + { + "epoch": 0.4445953255376899, + "grad_norm": 0.3751234710216522, + "learning_rate": 0.00011110360155447824, + "loss": 1.3731, + "step": 34214 + }, + { + "epoch": 0.4446083200816057, + "grad_norm": 0.3351527154445648, + "learning_rate": 0.00011110100209256685, + "loss": 1.2924, + "step": 34215 + }, + { + "epoch": 0.4446213146255216, + "grad_norm": 0.3286838233470917, + "learning_rate": 0.00011109840263065545, + "loss": 1.3045, + "step": 34216 + }, + { + "epoch": 0.44463430916943747, + "grad_norm": 0.34017321467399597, + "learning_rate": 0.00011109580316874408, + "loss": 1.326, + "step": 34217 + }, + { + "epoch": 0.44464730371335337, + "grad_norm": 0.4791562557220459, + "learning_rate": 0.0001110932037068327, + "loss": 1.3919, + "step": 34218 + }, + { + "epoch": 0.4446602982572692, + "grad_norm": 0.33656057715415955, + "learning_rate": 0.0001110906042449213, + "loss": 1.4916, + "step": 34219 + }, + { + "epoch": 0.4446732928011851, + "grad_norm": 0.48291119933128357, + "learning_rate": 0.00011108800478300992, + "loss": 1.3309, + "step": 34220 + }, + { + "epoch": 0.44468628734510096, + "grad_norm": 0.42001625895500183, + "learning_rate": 0.00011108540532109854, + "loss": 1.4117, + "step": 34221 + }, + { + "epoch": 0.44469928188901686, + "grad_norm": 0.28259068727493286, + "learning_rate": 0.00011108280585918715, + "loss": 1.2551, + "step": 34222 + }, + { + "epoch": 0.4447122764329327, + "grad_norm": 0.3199802339076996, + "learning_rate": 0.00011108020639727576, + "loss": 1.5147, + "step": 34223 + }, + { + "epoch": 0.4447252709768486, + "grad_norm": 0.389824777841568, + "learning_rate": 0.00011107760693536437, + "loss": 1.3221, + "step": 34224 + }, + { + "epoch": 0.44473826552076445, + "grad_norm": 0.4450884461402893, + "learning_rate": 0.00011107500747345301, + "loss": 1.382, + "step": 34225 + }, + { + "epoch": 0.44475126006468035, + "grad_norm": 0.289813369512558, + "learning_rate": 0.00011107240801154162, + "loss": 1.3737, + "step": 34226 + }, + { + "epoch": 0.4447642546085962, + "grad_norm": 0.46458229422569275, + "learning_rate": 0.00011106980854963023, + "loss": 1.49, + "step": 34227 + }, + { + "epoch": 0.4447772491525121, + "grad_norm": 0.35163161158561707, + "learning_rate": 0.00011106720908771883, + "loss": 1.2006, + "step": 34228 + }, + { + "epoch": 0.44479024369642794, + "grad_norm": 0.38532984256744385, + "learning_rate": 0.00011106460962580747, + "loss": 1.6946, + "step": 34229 + }, + { + "epoch": 0.44480323824034385, + "grad_norm": 0.42093268036842346, + "learning_rate": 0.00011106201016389608, + "loss": 1.3829, + "step": 34230 + }, + { + "epoch": 0.4448162327842597, + "grad_norm": 0.3260221779346466, + "learning_rate": 0.00011105941070198469, + "loss": 1.2602, + "step": 34231 + }, + { + "epoch": 0.4448292273281756, + "grad_norm": 0.3451603651046753, + "learning_rate": 0.0001110568112400733, + "loss": 1.2731, + "step": 34232 + }, + { + "epoch": 0.44484222187209144, + "grad_norm": 0.4718239903450012, + "learning_rate": 0.00011105421177816193, + "loss": 1.442, + "step": 34233 + }, + { + "epoch": 0.44485521641600734, + "grad_norm": 0.3400033712387085, + "learning_rate": 0.00011105161231625054, + "loss": 1.2875, + "step": 34234 + }, + { + "epoch": 0.4448682109599232, + "grad_norm": 0.3504631519317627, + "learning_rate": 0.00011104901285433915, + "loss": 1.2278, + "step": 34235 + }, + { + "epoch": 0.4448812055038391, + "grad_norm": 0.44772931933403015, + "learning_rate": 0.00011104641339242779, + "loss": 1.2935, + "step": 34236 + }, + { + "epoch": 0.44489420004775493, + "grad_norm": 0.5015880465507507, + "learning_rate": 0.0001110438139305164, + "loss": 1.4458, + "step": 34237 + }, + { + "epoch": 0.44490719459167083, + "grad_norm": 0.406540185213089, + "learning_rate": 0.00011104121446860501, + "loss": 1.4993, + "step": 34238 + }, + { + "epoch": 0.4449201891355867, + "grad_norm": 0.3785719573497772, + "learning_rate": 0.00011103861500669362, + "loss": 1.4054, + "step": 34239 + }, + { + "epoch": 0.4449331836795026, + "grad_norm": 0.3008996248245239, + "learning_rate": 0.00011103601554478224, + "loss": 1.1654, + "step": 34240 + }, + { + "epoch": 0.4449461782234184, + "grad_norm": 0.37364912033081055, + "learning_rate": 0.00011103341608287085, + "loss": 1.3846, + "step": 34241 + }, + { + "epoch": 0.4449591727673343, + "grad_norm": 0.4528624713420868, + "learning_rate": 0.00011103081662095947, + "loss": 1.4743, + "step": 34242 + }, + { + "epoch": 0.44497216731125017, + "grad_norm": 0.41597968339920044, + "learning_rate": 0.00011102821715904808, + "loss": 1.3934, + "step": 34243 + }, + { + "epoch": 0.44498516185516607, + "grad_norm": 0.444993793964386, + "learning_rate": 0.00011102561769713671, + "loss": 1.3334, + "step": 34244 + }, + { + "epoch": 0.4449981563990819, + "grad_norm": 0.4268054664134979, + "learning_rate": 0.00011102301823522531, + "loss": 1.3748, + "step": 34245 + }, + { + "epoch": 0.4450111509429978, + "grad_norm": 0.46148446202278137, + "learning_rate": 0.00011102041877331392, + "loss": 1.3597, + "step": 34246 + }, + { + "epoch": 0.44502414548691366, + "grad_norm": 0.38514938950538635, + "learning_rate": 0.00011101781931140253, + "loss": 1.2185, + "step": 34247 + }, + { + "epoch": 0.44503714003082956, + "grad_norm": 0.4315408766269684, + "learning_rate": 0.00011101521984949117, + "loss": 1.3127, + "step": 34248 + }, + { + "epoch": 0.4450501345747454, + "grad_norm": 0.3872237205505371, + "learning_rate": 0.00011101262038757978, + "loss": 1.3722, + "step": 34249 + }, + { + "epoch": 0.4450631291186613, + "grad_norm": 0.448140949010849, + "learning_rate": 0.0001110100209256684, + "loss": 1.3254, + "step": 34250 + }, + { + "epoch": 0.44507612366257715, + "grad_norm": 0.4356841444969177, + "learning_rate": 0.000111007421463757, + "loss": 1.266, + "step": 34251 + }, + { + "epoch": 0.44508911820649305, + "grad_norm": 0.3740954101085663, + "learning_rate": 0.00011100482200184563, + "loss": 1.3384, + "step": 34252 + }, + { + "epoch": 0.4451021127504089, + "grad_norm": 0.4336915910243988, + "learning_rate": 0.00011100222253993424, + "loss": 1.4123, + "step": 34253 + }, + { + "epoch": 0.4451151072943248, + "grad_norm": 0.3967514932155609, + "learning_rate": 0.00011099962307802285, + "loss": 1.4324, + "step": 34254 + }, + { + "epoch": 0.44512810183824064, + "grad_norm": 0.5248040556907654, + "learning_rate": 0.00011099702361611146, + "loss": 1.4017, + "step": 34255 + }, + { + "epoch": 0.44514109638215654, + "grad_norm": 0.35255858302116394, + "learning_rate": 0.0001109944241542001, + "loss": 1.446, + "step": 34256 + }, + { + "epoch": 0.4451540909260724, + "grad_norm": 0.48291704058647156, + "learning_rate": 0.00011099182469228871, + "loss": 1.4915, + "step": 34257 + }, + { + "epoch": 0.4451670854699883, + "grad_norm": 0.4787138104438782, + "learning_rate": 0.00011098922523037731, + "loss": 1.3658, + "step": 34258 + }, + { + "epoch": 0.4451800800139042, + "grad_norm": 0.3056149184703827, + "learning_rate": 0.00011098662576846592, + "loss": 1.3565, + "step": 34259 + }, + { + "epoch": 0.44519307455782003, + "grad_norm": 0.4579535126686096, + "learning_rate": 0.00011098402630655456, + "loss": 1.5442, + "step": 34260 + }, + { + "epoch": 0.44520606910173594, + "grad_norm": 0.44152453541755676, + "learning_rate": 0.00011098142684464317, + "loss": 1.4411, + "step": 34261 + }, + { + "epoch": 0.4452190636456518, + "grad_norm": 0.4361974000930786, + "learning_rate": 0.00011097882738273178, + "loss": 1.5052, + "step": 34262 + }, + { + "epoch": 0.4452320581895677, + "grad_norm": 0.5200278162956238, + "learning_rate": 0.00011097622792082039, + "loss": 1.4301, + "step": 34263 + }, + { + "epoch": 0.4452450527334835, + "grad_norm": 0.3853200078010559, + "learning_rate": 0.00011097362845890901, + "loss": 1.5305, + "step": 34264 + }, + { + "epoch": 0.4452580472773994, + "grad_norm": 0.37018293142318726, + "learning_rate": 0.00011097102899699763, + "loss": 1.395, + "step": 34265 + }, + { + "epoch": 0.44527104182131527, + "grad_norm": 0.3669099807739258, + "learning_rate": 0.00011096842953508624, + "loss": 1.3937, + "step": 34266 + }, + { + "epoch": 0.4452840363652312, + "grad_norm": 0.46139511466026306, + "learning_rate": 0.00011096583007317485, + "loss": 1.4425, + "step": 34267 + }, + { + "epoch": 0.445297030909147, + "grad_norm": 0.3863597810268402, + "learning_rate": 0.00011096323061126349, + "loss": 1.5086, + "step": 34268 + }, + { + "epoch": 0.4453100254530629, + "grad_norm": 0.42210930585861206, + "learning_rate": 0.0001109606311493521, + "loss": 1.3558, + "step": 34269 + }, + { + "epoch": 0.44532301999697876, + "grad_norm": 0.3843497335910797, + "learning_rate": 0.0001109580316874407, + "loss": 1.2021, + "step": 34270 + }, + { + "epoch": 0.44533601454089466, + "grad_norm": 0.3850443363189697, + "learning_rate": 0.0001109554322255293, + "loss": 1.412, + "step": 34271 + }, + { + "epoch": 0.4453490090848105, + "grad_norm": 0.4331141412258148, + "learning_rate": 0.00011095283276361794, + "loss": 1.3311, + "step": 34272 + }, + { + "epoch": 0.4453620036287264, + "grad_norm": 0.37451043725013733, + "learning_rate": 0.00011095023330170655, + "loss": 1.4598, + "step": 34273 + }, + { + "epoch": 0.44537499817264226, + "grad_norm": 0.3784250319004059, + "learning_rate": 0.00011094763383979516, + "loss": 1.3684, + "step": 34274 + }, + { + "epoch": 0.44538799271655816, + "grad_norm": 0.35812050104141235, + "learning_rate": 0.00011094503437788379, + "loss": 1.2087, + "step": 34275 + }, + { + "epoch": 0.445400987260474, + "grad_norm": 0.4282328486442566, + "learning_rate": 0.0001109424349159724, + "loss": 1.5197, + "step": 34276 + }, + { + "epoch": 0.4454139818043899, + "grad_norm": 0.45058783888816833, + "learning_rate": 0.00011093983545406101, + "loss": 1.3608, + "step": 34277 + }, + { + "epoch": 0.44542697634830575, + "grad_norm": 0.41182488203048706, + "learning_rate": 0.00011093723599214962, + "loss": 1.6681, + "step": 34278 + }, + { + "epoch": 0.44543997089222165, + "grad_norm": 0.3434402644634247, + "learning_rate": 0.00011093463653023826, + "loss": 1.2741, + "step": 34279 + }, + { + "epoch": 0.4454529654361375, + "grad_norm": 0.3482365012168884, + "learning_rate": 0.00011093203706832687, + "loss": 1.5003, + "step": 34280 + }, + { + "epoch": 0.4454659599800534, + "grad_norm": 0.36723026633262634, + "learning_rate": 0.00011092943760641548, + "loss": 1.5793, + "step": 34281 + }, + { + "epoch": 0.44547895452396924, + "grad_norm": 0.3383881151676178, + "learning_rate": 0.00011092683814450409, + "loss": 1.3145, + "step": 34282 + }, + { + "epoch": 0.44549194906788514, + "grad_norm": 0.44511863589286804, + "learning_rate": 0.00011092423868259272, + "loss": 1.3123, + "step": 34283 + }, + { + "epoch": 0.445504943611801, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.00011092163922068133, + "loss": 1.161, + "step": 34284 + }, + { + "epoch": 0.4455179381557169, + "grad_norm": 0.3754224479198456, + "learning_rate": 0.00011091903975876994, + "loss": 1.6114, + "step": 34285 + }, + { + "epoch": 0.44553093269963273, + "grad_norm": 0.34770339727401733, + "learning_rate": 0.00011091644029685855, + "loss": 1.3797, + "step": 34286 + }, + { + "epoch": 0.44554392724354863, + "grad_norm": 0.3011084198951721, + "learning_rate": 0.00011091384083494717, + "loss": 1.3314, + "step": 34287 + }, + { + "epoch": 0.4455569217874645, + "grad_norm": 0.48076823353767395, + "learning_rate": 0.00011091124137303579, + "loss": 1.4217, + "step": 34288 + }, + { + "epoch": 0.4455699163313804, + "grad_norm": 0.41937559843063354, + "learning_rate": 0.0001109086419111244, + "loss": 1.2841, + "step": 34289 + }, + { + "epoch": 0.4455829108752962, + "grad_norm": 0.35681232810020447, + "learning_rate": 0.00011090604244921301, + "loss": 1.3194, + "step": 34290 + }, + { + "epoch": 0.4455959054192121, + "grad_norm": 0.3575684428215027, + "learning_rate": 0.00011090344298730164, + "loss": 1.4678, + "step": 34291 + }, + { + "epoch": 0.44560889996312797, + "grad_norm": 0.376727432012558, + "learning_rate": 0.00011090084352539026, + "loss": 1.4629, + "step": 34292 + }, + { + "epoch": 0.44562189450704387, + "grad_norm": 0.4357430338859558, + "learning_rate": 0.00011089824406347887, + "loss": 1.4222, + "step": 34293 + }, + { + "epoch": 0.4456348890509597, + "grad_norm": 0.4371733069419861, + "learning_rate": 0.00011089564460156748, + "loss": 1.3941, + "step": 34294 + }, + { + "epoch": 0.4456478835948756, + "grad_norm": 0.38030749559402466, + "learning_rate": 0.0001108930451396561, + "loss": 1.3299, + "step": 34295 + }, + { + "epoch": 0.44566087813879146, + "grad_norm": 0.41900601983070374, + "learning_rate": 0.00011089044567774471, + "loss": 1.2317, + "step": 34296 + }, + { + "epoch": 0.44567387268270736, + "grad_norm": 0.4090757369995117, + "learning_rate": 0.00011088784621583332, + "loss": 1.4425, + "step": 34297 + }, + { + "epoch": 0.4456868672266232, + "grad_norm": 0.34591686725616455, + "learning_rate": 0.00011088524675392194, + "loss": 1.3357, + "step": 34298 + }, + { + "epoch": 0.4456998617705391, + "grad_norm": 0.466488242149353, + "learning_rate": 0.00011088264729201057, + "loss": 1.5224, + "step": 34299 + }, + { + "epoch": 0.44571285631445495, + "grad_norm": 0.39455318450927734, + "learning_rate": 0.00011088004783009917, + "loss": 1.5287, + "step": 34300 + }, + { + "epoch": 0.44572585085837085, + "grad_norm": 0.4686304032802582, + "learning_rate": 0.00011087744836818778, + "loss": 1.4426, + "step": 34301 + }, + { + "epoch": 0.4457388454022867, + "grad_norm": 0.49372440576553345, + "learning_rate": 0.00011087484890627639, + "loss": 1.514, + "step": 34302 + }, + { + "epoch": 0.4457518399462026, + "grad_norm": 0.3624727129936218, + "learning_rate": 0.00011087224944436503, + "loss": 1.2991, + "step": 34303 + }, + { + "epoch": 0.44576483449011844, + "grad_norm": 0.36775651574134827, + "learning_rate": 0.00011086964998245364, + "loss": 1.4698, + "step": 34304 + }, + { + "epoch": 0.44577782903403435, + "grad_norm": 0.4583088457584381, + "learning_rate": 0.00011086705052054225, + "loss": 1.3263, + "step": 34305 + }, + { + "epoch": 0.4457908235779502, + "grad_norm": 0.3856181204319, + "learning_rate": 0.00011086445105863086, + "loss": 1.345, + "step": 34306 + }, + { + "epoch": 0.4458038181218661, + "grad_norm": 0.3952924609184265, + "learning_rate": 0.00011086185159671949, + "loss": 1.4165, + "step": 34307 + }, + { + "epoch": 0.44581681266578194, + "grad_norm": 0.45340263843536377, + "learning_rate": 0.0001108592521348081, + "loss": 1.3613, + "step": 34308 + }, + { + "epoch": 0.44582980720969784, + "grad_norm": 0.4164819121360779, + "learning_rate": 0.00011085665267289671, + "loss": 1.4309, + "step": 34309 + }, + { + "epoch": 0.4458428017536137, + "grad_norm": 0.34728381037712097, + "learning_rate": 0.00011085405321098532, + "loss": 1.3772, + "step": 34310 + }, + { + "epoch": 0.4458557962975296, + "grad_norm": 0.414786159992218, + "learning_rate": 0.00011085145374907396, + "loss": 1.2413, + "step": 34311 + }, + { + "epoch": 0.44586879084144543, + "grad_norm": 0.4210526645183563, + "learning_rate": 0.00011084885428716256, + "loss": 1.3341, + "step": 34312 + }, + { + "epoch": 0.44588178538536133, + "grad_norm": 0.3422400653362274, + "learning_rate": 0.00011084625482525117, + "loss": 1.3576, + "step": 34313 + }, + { + "epoch": 0.4458947799292772, + "grad_norm": 0.4079273045063019, + "learning_rate": 0.0001108436553633398, + "loss": 1.5042, + "step": 34314 + }, + { + "epoch": 0.4459077744731931, + "grad_norm": 0.36557114124298096, + "learning_rate": 0.00011084105590142842, + "loss": 1.2557, + "step": 34315 + }, + { + "epoch": 0.4459207690171089, + "grad_norm": 0.3832559883594513, + "learning_rate": 0.00011083845643951703, + "loss": 1.3405, + "step": 34316 + }, + { + "epoch": 0.4459337635610248, + "grad_norm": 0.4136294722557068, + "learning_rate": 0.00011083585697760564, + "loss": 1.4496, + "step": 34317 + }, + { + "epoch": 0.44594675810494067, + "grad_norm": 0.3425063490867615, + "learning_rate": 0.00011083325751569426, + "loss": 1.4967, + "step": 34318 + }, + { + "epoch": 0.44595975264885657, + "grad_norm": 0.24988465011119843, + "learning_rate": 0.00011083065805378287, + "loss": 1.3096, + "step": 34319 + }, + { + "epoch": 0.4459727471927724, + "grad_norm": 0.3460886776447296, + "learning_rate": 0.00011082805859187148, + "loss": 1.2446, + "step": 34320 + }, + { + "epoch": 0.4459857417366883, + "grad_norm": 0.4185510575771332, + "learning_rate": 0.0001108254591299601, + "loss": 1.5271, + "step": 34321 + }, + { + "epoch": 0.44599873628060416, + "grad_norm": 0.34249839186668396, + "learning_rate": 0.00011082285966804873, + "loss": 1.3808, + "step": 34322 + }, + { + "epoch": 0.44601173082452006, + "grad_norm": 0.4375338852405548, + "learning_rate": 0.00011082026020613734, + "loss": 1.5688, + "step": 34323 + }, + { + "epoch": 0.4460247253684359, + "grad_norm": 0.36479687690734863, + "learning_rate": 0.00011081766074422595, + "loss": 1.2566, + "step": 34324 + }, + { + "epoch": 0.4460377199123518, + "grad_norm": 0.3792046308517456, + "learning_rate": 0.00011081506128231455, + "loss": 1.4592, + "step": 34325 + }, + { + "epoch": 0.44605071445626765, + "grad_norm": 0.4051467776298523, + "learning_rate": 0.00011081246182040319, + "loss": 1.3335, + "step": 34326 + }, + { + "epoch": 0.44606370900018355, + "grad_norm": 0.49074748158454895, + "learning_rate": 0.0001108098623584918, + "loss": 1.443, + "step": 34327 + }, + { + "epoch": 0.4460767035440994, + "grad_norm": 0.4439942538738251, + "learning_rate": 0.00011080726289658041, + "loss": 1.5425, + "step": 34328 + }, + { + "epoch": 0.4460896980880153, + "grad_norm": 0.35491228103637695, + "learning_rate": 0.00011080466343466902, + "loss": 1.3911, + "step": 34329 + }, + { + "epoch": 0.44610269263193114, + "grad_norm": 0.39883947372436523, + "learning_rate": 0.00011080206397275765, + "loss": 1.397, + "step": 34330 + }, + { + "epoch": 0.44611568717584704, + "grad_norm": 0.3358071446418762, + "learning_rate": 0.00011079946451084626, + "loss": 1.3331, + "step": 34331 + }, + { + "epoch": 0.4461286817197629, + "grad_norm": 0.4334869980812073, + "learning_rate": 0.00011079686504893487, + "loss": 1.4793, + "step": 34332 + }, + { + "epoch": 0.4461416762636788, + "grad_norm": 0.4272914230823517, + "learning_rate": 0.00011079426558702348, + "loss": 1.6548, + "step": 34333 + }, + { + "epoch": 0.4461546708075947, + "grad_norm": 0.34586331248283386, + "learning_rate": 0.00011079166612511212, + "loss": 1.2473, + "step": 34334 + }, + { + "epoch": 0.44616766535151053, + "grad_norm": 0.5395427346229553, + "learning_rate": 0.00011078906666320073, + "loss": 1.2849, + "step": 34335 + }, + { + "epoch": 0.44618065989542643, + "grad_norm": 0.3320787250995636, + "learning_rate": 0.00011078646720128934, + "loss": 1.4171, + "step": 34336 + }, + { + "epoch": 0.4461936544393423, + "grad_norm": 0.48876485228538513, + "learning_rate": 0.00011078386773937795, + "loss": 1.6469, + "step": 34337 + }, + { + "epoch": 0.4462066489832582, + "grad_norm": 0.36836597323417664, + "learning_rate": 0.00011078126827746658, + "loss": 1.3296, + "step": 34338 + }, + { + "epoch": 0.446219643527174, + "grad_norm": 0.4083898365497589, + "learning_rate": 0.00011077866881555519, + "loss": 1.6228, + "step": 34339 + }, + { + "epoch": 0.4462326380710899, + "grad_norm": 0.4508657157421112, + "learning_rate": 0.0001107760693536438, + "loss": 1.5912, + "step": 34340 + }, + { + "epoch": 0.44624563261500577, + "grad_norm": 0.3489711582660675, + "learning_rate": 0.00011077346989173241, + "loss": 1.5068, + "step": 34341 + }, + { + "epoch": 0.4462586271589217, + "grad_norm": 0.34017565846443176, + "learning_rate": 0.00011077087042982103, + "loss": 1.3839, + "step": 34342 + }, + { + "epoch": 0.4462716217028375, + "grad_norm": 0.4049300253391266, + "learning_rate": 0.00011076827096790964, + "loss": 1.4943, + "step": 34343 + }, + { + "epoch": 0.4462846162467534, + "grad_norm": 0.318954735994339, + "learning_rate": 0.00011076567150599825, + "loss": 1.2029, + "step": 34344 + }, + { + "epoch": 0.44629761079066926, + "grad_norm": 0.3333900272846222, + "learning_rate": 0.00011076307204408687, + "loss": 1.2969, + "step": 34345 + }, + { + "epoch": 0.44631060533458516, + "grad_norm": 0.444560170173645, + "learning_rate": 0.0001107604725821755, + "loss": 1.5042, + "step": 34346 + }, + { + "epoch": 0.446323599878501, + "grad_norm": 0.5023630261421204, + "learning_rate": 0.00011075787312026411, + "loss": 1.5149, + "step": 34347 + }, + { + "epoch": 0.4463365944224169, + "grad_norm": 0.3006480038166046, + "learning_rate": 0.00011075527365835273, + "loss": 1.3705, + "step": 34348 + }, + { + "epoch": 0.44634958896633276, + "grad_norm": 0.4001215100288391, + "learning_rate": 0.00011075267419644135, + "loss": 1.3932, + "step": 34349 + }, + { + "epoch": 0.44636258351024866, + "grad_norm": 0.34508216381073, + "learning_rate": 0.00011075007473452996, + "loss": 1.2686, + "step": 34350 + }, + { + "epoch": 0.4463755780541645, + "grad_norm": 0.4526347517967224, + "learning_rate": 0.00011074747527261857, + "loss": 1.601, + "step": 34351 + }, + { + "epoch": 0.4463885725980804, + "grad_norm": 0.33056217432022095, + "learning_rate": 0.00011074487581070718, + "loss": 1.377, + "step": 34352 + }, + { + "epoch": 0.44640156714199625, + "grad_norm": 0.41972944140434265, + "learning_rate": 0.00011074227634879582, + "loss": 1.4054, + "step": 34353 + }, + { + "epoch": 0.44641456168591215, + "grad_norm": 0.5326032042503357, + "learning_rate": 0.00011073967688688442, + "loss": 1.4111, + "step": 34354 + }, + { + "epoch": 0.446427556229828, + "grad_norm": 0.40371203422546387, + "learning_rate": 0.00011073707742497303, + "loss": 1.418, + "step": 34355 + }, + { + "epoch": 0.4464405507737439, + "grad_norm": 0.3251248300075531, + "learning_rate": 0.00011073447796306164, + "loss": 1.35, + "step": 34356 + }, + { + "epoch": 0.44645354531765974, + "grad_norm": 0.3798600137233734, + "learning_rate": 0.00011073187850115028, + "loss": 1.2489, + "step": 34357 + }, + { + "epoch": 0.44646653986157564, + "grad_norm": 0.3740684688091278, + "learning_rate": 0.00011072927903923889, + "loss": 1.3124, + "step": 34358 + }, + { + "epoch": 0.4464795344054915, + "grad_norm": 0.39169231057167053, + "learning_rate": 0.0001107266795773275, + "loss": 1.5055, + "step": 34359 + }, + { + "epoch": 0.4464925289494074, + "grad_norm": 0.3304338753223419, + "learning_rate": 0.00011072408011541611, + "loss": 1.2963, + "step": 34360 + }, + { + "epoch": 0.44650552349332323, + "grad_norm": 0.33984822034835815, + "learning_rate": 0.00011072148065350474, + "loss": 1.5574, + "step": 34361 + }, + { + "epoch": 0.44651851803723913, + "grad_norm": 0.3603609502315521, + "learning_rate": 0.00011071888119159335, + "loss": 1.2787, + "step": 34362 + }, + { + "epoch": 0.446531512581155, + "grad_norm": 0.37465938925743103, + "learning_rate": 0.00011071628172968196, + "loss": 1.3149, + "step": 34363 + }, + { + "epoch": 0.4465445071250709, + "grad_norm": 0.4414732754230499, + "learning_rate": 0.00011071368226777057, + "loss": 1.607, + "step": 34364 + }, + { + "epoch": 0.4465575016689867, + "grad_norm": 0.4735172986984253, + "learning_rate": 0.0001107110828058592, + "loss": 1.395, + "step": 34365 + }, + { + "epoch": 0.4465704962129026, + "grad_norm": 0.36253899335861206, + "learning_rate": 0.00011070848334394782, + "loss": 1.3045, + "step": 34366 + }, + { + "epoch": 0.44658349075681847, + "grad_norm": 0.3687044382095337, + "learning_rate": 0.00011070588388203641, + "loss": 1.3396, + "step": 34367 + }, + { + "epoch": 0.44659648530073437, + "grad_norm": 0.3425123691558838, + "learning_rate": 0.00011070328442012503, + "loss": 1.5265, + "step": 34368 + }, + { + "epoch": 0.4466094798446502, + "grad_norm": 0.4352796673774719, + "learning_rate": 0.00011070068495821366, + "loss": 1.4022, + "step": 34369 + }, + { + "epoch": 0.4466224743885661, + "grad_norm": 0.39405304193496704, + "learning_rate": 0.00011069808549630227, + "loss": 1.6418, + "step": 34370 + }, + { + "epoch": 0.44663546893248196, + "grad_norm": 0.33813199400901794, + "learning_rate": 0.00011069548603439089, + "loss": 1.2852, + "step": 34371 + }, + { + "epoch": 0.44664846347639786, + "grad_norm": 0.32915568351745605, + "learning_rate": 0.0001106928865724795, + "loss": 1.4585, + "step": 34372 + }, + { + "epoch": 0.4466614580203137, + "grad_norm": 0.49793750047683716, + "learning_rate": 0.00011069028711056812, + "loss": 1.5444, + "step": 34373 + }, + { + "epoch": 0.4466744525642296, + "grad_norm": 0.48389121890068054, + "learning_rate": 0.00011068768764865673, + "loss": 1.329, + "step": 34374 + }, + { + "epoch": 0.44668744710814545, + "grad_norm": 0.4808426797389984, + "learning_rate": 0.00011068508818674534, + "loss": 1.3835, + "step": 34375 + }, + { + "epoch": 0.44670044165206135, + "grad_norm": 0.43283843994140625, + "learning_rate": 0.00011068248872483395, + "loss": 1.55, + "step": 34376 + }, + { + "epoch": 0.4467134361959772, + "grad_norm": 0.3492809236049652, + "learning_rate": 0.00011067988926292259, + "loss": 1.3043, + "step": 34377 + }, + { + "epoch": 0.4467264307398931, + "grad_norm": 0.414581298828125, + "learning_rate": 0.0001106772898010112, + "loss": 1.3926, + "step": 34378 + }, + { + "epoch": 0.44673942528380894, + "grad_norm": 0.4179449677467346, + "learning_rate": 0.00011067469033909981, + "loss": 1.462, + "step": 34379 + }, + { + "epoch": 0.44675241982772484, + "grad_norm": 0.43760010600090027, + "learning_rate": 0.00011067209087718841, + "loss": 1.4484, + "step": 34380 + }, + { + "epoch": 0.4467654143716407, + "grad_norm": 0.34663277864456177, + "learning_rate": 0.00011066949141527705, + "loss": 1.4346, + "step": 34381 + }, + { + "epoch": 0.4467784089155566, + "grad_norm": 0.29440104961395264, + "learning_rate": 0.00011066689195336566, + "loss": 1.3672, + "step": 34382 + }, + { + "epoch": 0.44679140345947244, + "grad_norm": 0.40804335474967957, + "learning_rate": 0.00011066429249145427, + "loss": 1.5027, + "step": 34383 + }, + { + "epoch": 0.44680439800338834, + "grad_norm": 0.4632614850997925, + "learning_rate": 0.00011066169302954288, + "loss": 1.5145, + "step": 34384 + }, + { + "epoch": 0.4468173925473042, + "grad_norm": 0.35557109117507935, + "learning_rate": 0.0001106590935676315, + "loss": 1.552, + "step": 34385 + }, + { + "epoch": 0.4468303870912201, + "grad_norm": 0.5257606506347656, + "learning_rate": 0.00011065649410572012, + "loss": 1.5322, + "step": 34386 + }, + { + "epoch": 0.44684338163513593, + "grad_norm": 0.4096625745296478, + "learning_rate": 0.00011065389464380873, + "loss": 1.6029, + "step": 34387 + }, + { + "epoch": 0.44685637617905183, + "grad_norm": 0.5139939785003662, + "learning_rate": 0.00011065129518189737, + "loss": 1.4038, + "step": 34388 + }, + { + "epoch": 0.4468693707229677, + "grad_norm": 0.38154035806655884, + "learning_rate": 0.00011064869571998598, + "loss": 1.3196, + "step": 34389 + }, + { + "epoch": 0.4468823652668836, + "grad_norm": 0.40146270394325256, + "learning_rate": 0.00011064609625807459, + "loss": 1.3116, + "step": 34390 + }, + { + "epoch": 0.4468953598107994, + "grad_norm": 0.4216993451118469, + "learning_rate": 0.0001106434967961632, + "loss": 1.2049, + "step": 34391 + }, + { + "epoch": 0.4469083543547153, + "grad_norm": 0.4717008173465729, + "learning_rate": 0.00011064089733425182, + "loss": 1.1986, + "step": 34392 + }, + { + "epoch": 0.44692134889863117, + "grad_norm": 0.3446395695209503, + "learning_rate": 0.00011063829787234043, + "loss": 1.3047, + "step": 34393 + }, + { + "epoch": 0.44693434344254707, + "grad_norm": 0.42920851707458496, + "learning_rate": 0.00011063569841042905, + "loss": 1.5195, + "step": 34394 + }, + { + "epoch": 0.4469473379864629, + "grad_norm": 0.34217706322669983, + "learning_rate": 0.00011063309894851766, + "loss": 1.499, + "step": 34395 + }, + { + "epoch": 0.4469603325303788, + "grad_norm": 0.340866357088089, + "learning_rate": 0.00011063049948660628, + "loss": 1.6144, + "step": 34396 + }, + { + "epoch": 0.44697332707429466, + "grad_norm": 0.4250718355178833, + "learning_rate": 0.00011062790002469489, + "loss": 1.5402, + "step": 34397 + }, + { + "epoch": 0.44698632161821056, + "grad_norm": 0.38750213384628296, + "learning_rate": 0.0001106253005627835, + "loss": 1.3533, + "step": 34398 + }, + { + "epoch": 0.4469993161621264, + "grad_norm": 0.4290032684803009, + "learning_rate": 0.00011062270110087211, + "loss": 1.3773, + "step": 34399 + }, + { + "epoch": 0.4470123107060423, + "grad_norm": 0.4724940359592438, + "learning_rate": 0.00011062010163896075, + "loss": 1.3699, + "step": 34400 + }, + { + "epoch": 0.44702530524995815, + "grad_norm": 0.3969428837299347, + "learning_rate": 0.00011061750217704936, + "loss": 1.2504, + "step": 34401 + }, + { + "epoch": 0.44703829979387405, + "grad_norm": 2.914214611053467, + "learning_rate": 0.00011061490271513797, + "loss": 1.3565, + "step": 34402 + }, + { + "epoch": 0.4470512943377899, + "grad_norm": 0.47409042716026306, + "learning_rate": 0.00011061230325322658, + "loss": 1.4433, + "step": 34403 + }, + { + "epoch": 0.4470642888817058, + "grad_norm": 0.4451931416988373, + "learning_rate": 0.00011060970379131521, + "loss": 1.3567, + "step": 34404 + }, + { + "epoch": 0.44707728342562164, + "grad_norm": 0.4385353624820709, + "learning_rate": 0.00011060710432940382, + "loss": 1.2889, + "step": 34405 + }, + { + "epoch": 0.44709027796953754, + "grad_norm": 0.3574380576610565, + "learning_rate": 0.00011060450486749243, + "loss": 1.4282, + "step": 34406 + }, + { + "epoch": 0.4471032725134534, + "grad_norm": 0.38908064365386963, + "learning_rate": 0.00011060190540558104, + "loss": 1.2899, + "step": 34407 + }, + { + "epoch": 0.4471162670573693, + "grad_norm": 0.4133598506450653, + "learning_rate": 0.00011059930594366968, + "loss": 1.4081, + "step": 34408 + }, + { + "epoch": 0.44712926160128513, + "grad_norm": 0.4932892322540283, + "learning_rate": 0.00011059670648175828, + "loss": 1.519, + "step": 34409 + }, + { + "epoch": 0.44714225614520103, + "grad_norm": 0.4468093514442444, + "learning_rate": 0.00011059410701984689, + "loss": 1.4823, + "step": 34410 + }, + { + "epoch": 0.44715525068911693, + "grad_norm": 0.38885483145713806, + "learning_rate": 0.0001105915075579355, + "loss": 1.3554, + "step": 34411 + }, + { + "epoch": 0.4471682452330328, + "grad_norm": 0.2962060570716858, + "learning_rate": 0.00011058890809602414, + "loss": 1.2117, + "step": 34412 + }, + { + "epoch": 0.4471812397769487, + "grad_norm": 0.36810505390167236, + "learning_rate": 0.00011058630863411275, + "loss": 1.2322, + "step": 34413 + }, + { + "epoch": 0.4471942343208645, + "grad_norm": 0.37185022234916687, + "learning_rate": 0.00011058370917220136, + "loss": 1.3384, + "step": 34414 + }, + { + "epoch": 0.4472072288647804, + "grad_norm": 0.3865322470664978, + "learning_rate": 0.00011058110971028997, + "loss": 1.3367, + "step": 34415 + }, + { + "epoch": 0.44722022340869627, + "grad_norm": 0.35716453194618225, + "learning_rate": 0.0001105785102483786, + "loss": 1.3146, + "step": 34416 + }, + { + "epoch": 0.44723321795261217, + "grad_norm": 0.3830145001411438, + "learning_rate": 0.0001105759107864672, + "loss": 1.2192, + "step": 34417 + }, + { + "epoch": 0.447246212496528, + "grad_norm": 0.3806319832801819, + "learning_rate": 0.00011057331132455582, + "loss": 1.3687, + "step": 34418 + }, + { + "epoch": 0.4472592070404439, + "grad_norm": 0.4754382073879242, + "learning_rate": 0.00011057071186264443, + "loss": 1.679, + "step": 34419 + }, + { + "epoch": 0.44727220158435976, + "grad_norm": 0.479158490896225, + "learning_rate": 0.00011056811240073306, + "loss": 1.2415, + "step": 34420 + }, + { + "epoch": 0.44728519612827566, + "grad_norm": 0.5324766635894775, + "learning_rate": 0.00011056551293882168, + "loss": 1.5294, + "step": 34421 + }, + { + "epoch": 0.4472981906721915, + "grad_norm": 0.44869571924209595, + "learning_rate": 0.00011056291347691027, + "loss": 1.4757, + "step": 34422 + }, + { + "epoch": 0.4473111852161074, + "grad_norm": 0.4435884356498718, + "learning_rate": 0.00011056031401499891, + "loss": 1.3288, + "step": 34423 + }, + { + "epoch": 0.44732417976002325, + "grad_norm": 0.39496248960494995, + "learning_rate": 0.00011055771455308752, + "loss": 1.4739, + "step": 34424 + }, + { + "epoch": 0.44733717430393916, + "grad_norm": 0.3510694205760956, + "learning_rate": 0.00011055511509117613, + "loss": 1.3261, + "step": 34425 + }, + { + "epoch": 0.447350168847855, + "grad_norm": 0.37289169430732727, + "learning_rate": 0.00011055251562926474, + "loss": 1.5314, + "step": 34426 + }, + { + "epoch": 0.4473631633917709, + "grad_norm": 0.46063491702079773, + "learning_rate": 0.00011054991616735337, + "loss": 1.4496, + "step": 34427 + }, + { + "epoch": 0.44737615793568675, + "grad_norm": 0.396872341632843, + "learning_rate": 0.00011054731670544198, + "loss": 1.4397, + "step": 34428 + }, + { + "epoch": 0.44738915247960265, + "grad_norm": 0.48116815090179443, + "learning_rate": 0.00011054471724353059, + "loss": 1.4658, + "step": 34429 + }, + { + "epoch": 0.4474021470235185, + "grad_norm": 0.3058229386806488, + "learning_rate": 0.0001105421177816192, + "loss": 1.4166, + "step": 34430 + }, + { + "epoch": 0.4474151415674344, + "grad_norm": 0.4168516993522644, + "learning_rate": 0.00011053951831970784, + "loss": 1.1931, + "step": 34431 + }, + { + "epoch": 0.44742813611135024, + "grad_norm": 0.3161821961402893, + "learning_rate": 0.00011053691885779645, + "loss": 1.3013, + "step": 34432 + }, + { + "epoch": 0.44744113065526614, + "grad_norm": 0.5002384781837463, + "learning_rate": 0.00011053431939588506, + "loss": 1.4364, + "step": 34433 + }, + { + "epoch": 0.447454125199182, + "grad_norm": 0.4513053894042969, + "learning_rate": 0.00011053171993397366, + "loss": 1.2283, + "step": 34434 + }, + { + "epoch": 0.4474671197430979, + "grad_norm": 0.4575236439704895, + "learning_rate": 0.0001105291204720623, + "loss": 1.3915, + "step": 34435 + }, + { + "epoch": 0.44748011428701373, + "grad_norm": 0.46848222613334656, + "learning_rate": 0.00011052652101015091, + "loss": 1.4672, + "step": 34436 + }, + { + "epoch": 0.44749310883092963, + "grad_norm": 0.44768261909484863, + "learning_rate": 0.00011052392154823952, + "loss": 1.4768, + "step": 34437 + }, + { + "epoch": 0.4475061033748455, + "grad_norm": 0.3839968740940094, + "learning_rate": 0.00011052132208632813, + "loss": 1.3417, + "step": 34438 + }, + { + "epoch": 0.4475190979187614, + "grad_norm": 0.4355773329734802, + "learning_rate": 0.00011051872262441675, + "loss": 1.4958, + "step": 34439 + }, + { + "epoch": 0.4475320924626772, + "grad_norm": 0.32546356320381165, + "learning_rate": 0.00011051612316250536, + "loss": 1.4501, + "step": 34440 + }, + { + "epoch": 0.4475450870065931, + "grad_norm": 0.42174601554870605, + "learning_rate": 0.00011051352370059398, + "loss": 1.4063, + "step": 34441 + }, + { + "epoch": 0.44755808155050897, + "grad_norm": 0.3652845025062561, + "learning_rate": 0.00011051092423868259, + "loss": 1.2826, + "step": 34442 + }, + { + "epoch": 0.44757107609442487, + "grad_norm": 0.419465035200119, + "learning_rate": 0.00011050832477677122, + "loss": 1.4158, + "step": 34443 + }, + { + "epoch": 0.4475840706383407, + "grad_norm": 0.37714046239852905, + "learning_rate": 0.00011050572531485984, + "loss": 1.1297, + "step": 34444 + }, + { + "epoch": 0.4475970651822566, + "grad_norm": 0.3914308547973633, + "learning_rate": 0.00011050312585294845, + "loss": 1.3916, + "step": 34445 + }, + { + "epoch": 0.44761005972617246, + "grad_norm": 0.3930824100971222, + "learning_rate": 0.00011050052639103706, + "loss": 1.2877, + "step": 34446 + }, + { + "epoch": 0.44762305427008836, + "grad_norm": 0.452641099691391, + "learning_rate": 0.00011049792692912568, + "loss": 1.3427, + "step": 34447 + }, + { + "epoch": 0.4476360488140042, + "grad_norm": 0.4003320634365082, + "learning_rate": 0.00011049532746721429, + "loss": 1.2218, + "step": 34448 + }, + { + "epoch": 0.4476490433579201, + "grad_norm": 0.4607212543487549, + "learning_rate": 0.0001104927280053029, + "loss": 1.4456, + "step": 34449 + }, + { + "epoch": 0.44766203790183595, + "grad_norm": 0.3892253339290619, + "learning_rate": 0.00011049012854339151, + "loss": 1.3554, + "step": 34450 + }, + { + "epoch": 0.44767503244575185, + "grad_norm": 0.3758752942085266, + "learning_rate": 0.00011048752908148014, + "loss": 1.3145, + "step": 34451 + }, + { + "epoch": 0.4476880269896677, + "grad_norm": 0.44572144746780396, + "learning_rate": 0.00011048492961956875, + "loss": 1.1913, + "step": 34452 + }, + { + "epoch": 0.4477010215335836, + "grad_norm": 0.33415356278419495, + "learning_rate": 0.00011048233015765736, + "loss": 1.2895, + "step": 34453 + }, + { + "epoch": 0.44771401607749944, + "grad_norm": 0.4133395850658417, + "learning_rate": 0.00011047973069574597, + "loss": 1.4047, + "step": 34454 + }, + { + "epoch": 0.44772701062141534, + "grad_norm": 0.3269536793231964, + "learning_rate": 0.00011047713123383461, + "loss": 1.3029, + "step": 34455 + }, + { + "epoch": 0.4477400051653312, + "grad_norm": 0.35477206110954285, + "learning_rate": 0.00011047453177192322, + "loss": 1.25, + "step": 34456 + }, + { + "epoch": 0.4477529997092471, + "grad_norm": 0.31776291131973267, + "learning_rate": 0.00011047193231001183, + "loss": 1.4291, + "step": 34457 + }, + { + "epoch": 0.44776599425316294, + "grad_norm": 0.3256598114967346, + "learning_rate": 0.00011046933284810044, + "loss": 1.391, + "step": 34458 + }, + { + "epoch": 0.44777898879707884, + "grad_norm": 0.4343120753765106, + "learning_rate": 0.00011046673338618907, + "loss": 1.4171, + "step": 34459 + }, + { + "epoch": 0.4477919833409947, + "grad_norm": 0.5201101899147034, + "learning_rate": 0.00011046413392427768, + "loss": 1.5068, + "step": 34460 + }, + { + "epoch": 0.4478049778849106, + "grad_norm": 0.4345944821834564, + "learning_rate": 0.00011046153446236629, + "loss": 1.4696, + "step": 34461 + }, + { + "epoch": 0.4478179724288264, + "grad_norm": 0.2510223686695099, + "learning_rate": 0.00011045893500045493, + "loss": 1.169, + "step": 34462 + }, + { + "epoch": 0.44783096697274233, + "grad_norm": 0.4620783030986786, + "learning_rate": 0.00011045633553854354, + "loss": 1.4724, + "step": 34463 + }, + { + "epoch": 0.4478439615166582, + "grad_norm": 0.45253750681877136, + "learning_rate": 0.00011045373607663214, + "loss": 1.424, + "step": 34464 + }, + { + "epoch": 0.4478569560605741, + "grad_norm": 0.41018766164779663, + "learning_rate": 0.00011045113661472075, + "loss": 1.517, + "step": 34465 + }, + { + "epoch": 0.4478699506044899, + "grad_norm": 0.35038521885871887, + "learning_rate": 0.00011044853715280938, + "loss": 1.1256, + "step": 34466 + }, + { + "epoch": 0.4478829451484058, + "grad_norm": 0.33538463711738586, + "learning_rate": 0.000110445937690898, + "loss": 1.7104, + "step": 34467 + }, + { + "epoch": 0.44789593969232167, + "grad_norm": 0.4250282347202301, + "learning_rate": 0.0001104433382289866, + "loss": 1.4537, + "step": 34468 + }, + { + "epoch": 0.44790893423623757, + "grad_norm": 0.445030152797699, + "learning_rate": 0.00011044073876707522, + "loss": 1.3583, + "step": 34469 + }, + { + "epoch": 0.4479219287801534, + "grad_norm": 0.3021256923675537, + "learning_rate": 0.00011043813930516384, + "loss": 0.97, + "step": 34470 + }, + { + "epoch": 0.4479349233240693, + "grad_norm": 0.43961846828460693, + "learning_rate": 0.00011043553984325245, + "loss": 1.4222, + "step": 34471 + }, + { + "epoch": 0.44794791786798516, + "grad_norm": 0.4001900553703308, + "learning_rate": 0.00011043294038134106, + "loss": 1.5473, + "step": 34472 + }, + { + "epoch": 0.44796091241190106, + "grad_norm": 0.40726974606513977, + "learning_rate": 0.00011043034091942967, + "loss": 1.294, + "step": 34473 + }, + { + "epoch": 0.4479739069558169, + "grad_norm": 0.32053637504577637, + "learning_rate": 0.00011042774145751831, + "loss": 1.4564, + "step": 34474 + }, + { + "epoch": 0.4479869014997328, + "grad_norm": 0.3984828293323517, + "learning_rate": 0.00011042514199560692, + "loss": 1.3669, + "step": 34475 + }, + { + "epoch": 0.44799989604364865, + "grad_norm": 0.4609595239162445, + "learning_rate": 0.00011042254253369552, + "loss": 1.4772, + "step": 34476 + }, + { + "epoch": 0.44801289058756455, + "grad_norm": 0.3621473014354706, + "learning_rate": 0.00011041994307178413, + "loss": 1.3998, + "step": 34477 + }, + { + "epoch": 0.4480258851314804, + "grad_norm": 0.4369111955165863, + "learning_rate": 0.00011041734360987277, + "loss": 1.4628, + "step": 34478 + }, + { + "epoch": 0.4480388796753963, + "grad_norm": 0.3987017571926117, + "learning_rate": 0.00011041474414796138, + "loss": 1.41, + "step": 34479 + }, + { + "epoch": 0.44805187421931214, + "grad_norm": 0.3982883095741272, + "learning_rate": 0.00011041214468604999, + "loss": 1.2265, + "step": 34480 + }, + { + "epoch": 0.44806486876322804, + "grad_norm": 0.36007747054100037, + "learning_rate": 0.0001104095452241386, + "loss": 1.4363, + "step": 34481 + }, + { + "epoch": 0.4480778633071439, + "grad_norm": 0.3950411081314087, + "learning_rate": 0.00011040694576222723, + "loss": 1.3897, + "step": 34482 + }, + { + "epoch": 0.4480908578510598, + "grad_norm": 0.5346966981887817, + "learning_rate": 0.00011040434630031584, + "loss": 1.4435, + "step": 34483 + }, + { + "epoch": 0.44810385239497563, + "grad_norm": 0.32372596859931946, + "learning_rate": 0.00011040174683840445, + "loss": 1.3193, + "step": 34484 + }, + { + "epoch": 0.44811684693889153, + "grad_norm": 0.36244630813598633, + "learning_rate": 0.00011039914737649306, + "loss": 1.2904, + "step": 34485 + }, + { + "epoch": 0.44812984148280743, + "grad_norm": 0.3881213963031769, + "learning_rate": 0.0001103965479145817, + "loss": 1.477, + "step": 34486 + }, + { + "epoch": 0.4481428360267233, + "grad_norm": 0.3589574992656708, + "learning_rate": 0.00011039394845267031, + "loss": 1.4025, + "step": 34487 + }, + { + "epoch": 0.4481558305706392, + "grad_norm": 0.4735058844089508, + "learning_rate": 0.00011039134899075892, + "loss": 1.3803, + "step": 34488 + }, + { + "epoch": 0.448168825114555, + "grad_norm": 0.398260235786438, + "learning_rate": 0.00011038874952884752, + "loss": 1.1483, + "step": 34489 + }, + { + "epoch": 0.4481818196584709, + "grad_norm": 0.3908102810382843, + "learning_rate": 0.00011038615006693616, + "loss": 1.3377, + "step": 34490 + }, + { + "epoch": 0.44819481420238677, + "grad_norm": 0.4203278124332428, + "learning_rate": 0.00011038355060502477, + "loss": 1.3267, + "step": 34491 + }, + { + "epoch": 0.44820780874630267, + "grad_norm": 0.40055733919143677, + "learning_rate": 0.00011038095114311338, + "loss": 1.6408, + "step": 34492 + }, + { + "epoch": 0.4482208032902185, + "grad_norm": 0.401552677154541, + "learning_rate": 0.00011037835168120199, + "loss": 1.3387, + "step": 34493 + }, + { + "epoch": 0.4482337978341344, + "grad_norm": 0.3981100022792816, + "learning_rate": 0.00011037575221929061, + "loss": 1.3632, + "step": 34494 + }, + { + "epoch": 0.44824679237805026, + "grad_norm": 0.3752848505973816, + "learning_rate": 0.00011037315275737922, + "loss": 1.34, + "step": 34495 + }, + { + "epoch": 0.44825978692196616, + "grad_norm": 0.5562670826911926, + "learning_rate": 0.00011037055329546783, + "loss": 1.4506, + "step": 34496 + }, + { + "epoch": 0.448272781465882, + "grad_norm": 0.3460952937602997, + "learning_rate": 0.00011036795383355647, + "loss": 1.1565, + "step": 34497 + }, + { + "epoch": 0.4482857760097979, + "grad_norm": 0.3721766471862793, + "learning_rate": 0.00011036535437164508, + "loss": 1.1986, + "step": 34498 + }, + { + "epoch": 0.44829877055371375, + "grad_norm": 0.4068165719509125, + "learning_rate": 0.0001103627549097337, + "loss": 1.314, + "step": 34499 + }, + { + "epoch": 0.44831176509762966, + "grad_norm": 0.5032851696014404, + "learning_rate": 0.0001103601554478223, + "loss": 1.406, + "step": 34500 + }, + { + "epoch": 0.4483247596415455, + "grad_norm": 0.43065133690834045, + "learning_rate": 0.00011035755598591093, + "loss": 1.6215, + "step": 34501 + }, + { + "epoch": 0.4483377541854614, + "grad_norm": 0.35741227865219116, + "learning_rate": 0.00011035495652399954, + "loss": 1.4652, + "step": 34502 + }, + { + "epoch": 0.44835074872937725, + "grad_norm": 0.425245076417923, + "learning_rate": 0.00011035235706208815, + "loss": 1.4227, + "step": 34503 + }, + { + "epoch": 0.44836374327329315, + "grad_norm": 0.4913164973258972, + "learning_rate": 0.00011034975760017676, + "loss": 1.4429, + "step": 34504 + }, + { + "epoch": 0.448376737817209, + "grad_norm": 0.4327201247215271, + "learning_rate": 0.0001103471581382654, + "loss": 1.3286, + "step": 34505 + }, + { + "epoch": 0.4483897323611249, + "grad_norm": 0.33906733989715576, + "learning_rate": 0.000110344558676354, + "loss": 1.2818, + "step": 34506 + }, + { + "epoch": 0.44840272690504074, + "grad_norm": 0.3966307044029236, + "learning_rate": 0.00011034195921444261, + "loss": 1.3729, + "step": 34507 + }, + { + "epoch": 0.44841572144895664, + "grad_norm": 0.5355483293533325, + "learning_rate": 0.00011033935975253122, + "loss": 1.3975, + "step": 34508 + }, + { + "epoch": 0.4484287159928725, + "grad_norm": 0.4202527105808258, + "learning_rate": 0.00011033676029061986, + "loss": 1.4654, + "step": 34509 + }, + { + "epoch": 0.4484417105367884, + "grad_norm": 0.28879261016845703, + "learning_rate": 0.00011033416082870847, + "loss": 1.2599, + "step": 34510 + }, + { + "epoch": 0.44845470508070423, + "grad_norm": 0.42864328622817993, + "learning_rate": 0.00011033156136679708, + "loss": 1.4591, + "step": 34511 + }, + { + "epoch": 0.44846769962462013, + "grad_norm": 0.4456816613674164, + "learning_rate": 0.00011032896190488569, + "loss": 1.2534, + "step": 34512 + }, + { + "epoch": 0.448480694168536, + "grad_norm": 0.41308531165122986, + "learning_rate": 0.00011032636244297432, + "loss": 1.3351, + "step": 34513 + }, + { + "epoch": 0.4484936887124519, + "grad_norm": 0.47774264216423035, + "learning_rate": 0.00011032376298106293, + "loss": 1.4716, + "step": 34514 + }, + { + "epoch": 0.4485066832563677, + "grad_norm": 0.36156168580055237, + "learning_rate": 0.00011032116351915154, + "loss": 1.376, + "step": 34515 + }, + { + "epoch": 0.4485196778002836, + "grad_norm": 0.4695911407470703, + "learning_rate": 0.00011031856405724015, + "loss": 1.5211, + "step": 34516 + }, + { + "epoch": 0.44853267234419947, + "grad_norm": 0.43510448932647705, + "learning_rate": 0.00011031596459532879, + "loss": 1.4971, + "step": 34517 + }, + { + "epoch": 0.44854566688811537, + "grad_norm": 0.37574025988578796, + "learning_rate": 0.00011031336513341738, + "loss": 1.3829, + "step": 34518 + }, + { + "epoch": 0.4485586614320312, + "grad_norm": 0.3916834890842438, + "learning_rate": 0.000110310765671506, + "loss": 1.3704, + "step": 34519 + }, + { + "epoch": 0.4485716559759471, + "grad_norm": 0.3476482331752777, + "learning_rate": 0.0001103081662095946, + "loss": 1.3747, + "step": 34520 + }, + { + "epoch": 0.44858465051986296, + "grad_norm": 0.30428409576416016, + "learning_rate": 0.00011030556674768324, + "loss": 1.6379, + "step": 34521 + }, + { + "epoch": 0.44859764506377886, + "grad_norm": 0.3509369492530823, + "learning_rate": 0.00011030296728577185, + "loss": 1.6192, + "step": 34522 + }, + { + "epoch": 0.4486106396076947, + "grad_norm": 0.41059157252311707, + "learning_rate": 0.00011030036782386047, + "loss": 1.4077, + "step": 34523 + }, + { + "epoch": 0.4486236341516106, + "grad_norm": 0.4512932002544403, + "learning_rate": 0.00011029776836194908, + "loss": 1.3083, + "step": 34524 + }, + { + "epoch": 0.44863662869552645, + "grad_norm": 0.40130892395973206, + "learning_rate": 0.0001102951689000377, + "loss": 1.4744, + "step": 34525 + }, + { + "epoch": 0.44864962323944235, + "grad_norm": 0.3996976613998413, + "learning_rate": 0.00011029256943812631, + "loss": 1.5036, + "step": 34526 + }, + { + "epoch": 0.4486626177833582, + "grad_norm": 0.3865920305252075, + "learning_rate": 0.00011028996997621492, + "loss": 1.2762, + "step": 34527 + }, + { + "epoch": 0.4486756123272741, + "grad_norm": 0.4725949466228485, + "learning_rate": 0.00011028737051430353, + "loss": 1.3748, + "step": 34528 + }, + { + "epoch": 0.44868860687118994, + "grad_norm": 0.4591662883758545, + "learning_rate": 0.00011028477105239217, + "loss": 1.5218, + "step": 34529 + }, + { + "epoch": 0.44870160141510584, + "grad_norm": 0.4514273703098297, + "learning_rate": 0.00011028217159048078, + "loss": 1.4663, + "step": 34530 + }, + { + "epoch": 0.4487145959590217, + "grad_norm": 0.5583901405334473, + "learning_rate": 0.00011027957212856938, + "loss": 1.5054, + "step": 34531 + }, + { + "epoch": 0.4487275905029376, + "grad_norm": 0.3910730183124542, + "learning_rate": 0.00011027697266665799, + "loss": 1.2571, + "step": 34532 + }, + { + "epoch": 0.44874058504685344, + "grad_norm": 0.4569634795188904, + "learning_rate": 0.00011027437320474663, + "loss": 1.5548, + "step": 34533 + }, + { + "epoch": 0.44875357959076934, + "grad_norm": 0.3992309272289276, + "learning_rate": 0.00011027177374283524, + "loss": 1.3556, + "step": 34534 + }, + { + "epoch": 0.4487665741346852, + "grad_norm": 0.3682311773300171, + "learning_rate": 0.00011026917428092385, + "loss": 1.5128, + "step": 34535 + }, + { + "epoch": 0.4487795686786011, + "grad_norm": 0.4828411042690277, + "learning_rate": 0.00011026657481901248, + "loss": 1.4408, + "step": 34536 + }, + { + "epoch": 0.4487925632225169, + "grad_norm": 0.3480648994445801, + "learning_rate": 0.00011026397535710109, + "loss": 1.3544, + "step": 34537 + }, + { + "epoch": 0.4488055577664328, + "grad_norm": 0.4272501468658447, + "learning_rate": 0.0001102613758951897, + "loss": 1.5158, + "step": 34538 + }, + { + "epoch": 0.4488185523103487, + "grad_norm": 0.4681031107902527, + "learning_rate": 0.00011025877643327831, + "loss": 1.3275, + "step": 34539 + }, + { + "epoch": 0.4488315468542646, + "grad_norm": 0.40973857045173645, + "learning_rate": 0.00011025617697136695, + "loss": 1.4468, + "step": 34540 + }, + { + "epoch": 0.4488445413981804, + "grad_norm": 0.3795461654663086, + "learning_rate": 0.00011025357750945556, + "loss": 1.3368, + "step": 34541 + }, + { + "epoch": 0.4488575359420963, + "grad_norm": 0.4213436245918274, + "learning_rate": 0.00011025097804754417, + "loss": 1.3384, + "step": 34542 + }, + { + "epoch": 0.44887053048601216, + "grad_norm": 0.41806358098983765, + "learning_rate": 0.00011024837858563278, + "loss": 1.4118, + "step": 34543 + }, + { + "epoch": 0.44888352502992807, + "grad_norm": 0.43470683693885803, + "learning_rate": 0.0001102457791237214, + "loss": 1.5172, + "step": 34544 + }, + { + "epoch": 0.4488965195738439, + "grad_norm": 0.33741140365600586, + "learning_rate": 0.00011024317966181001, + "loss": 1.2366, + "step": 34545 + }, + { + "epoch": 0.4489095141177598, + "grad_norm": 0.3379230201244354, + "learning_rate": 0.00011024058019989863, + "loss": 1.112, + "step": 34546 + }, + { + "epoch": 0.44892250866167566, + "grad_norm": 0.3742949068546295, + "learning_rate": 0.00011023798073798724, + "loss": 1.5657, + "step": 34547 + }, + { + "epoch": 0.44893550320559156, + "grad_norm": 0.39107751846313477, + "learning_rate": 0.00011023538127607586, + "loss": 1.2031, + "step": 34548 + }, + { + "epoch": 0.4489484977495074, + "grad_norm": 0.420554518699646, + "learning_rate": 0.00011023278181416447, + "loss": 1.4858, + "step": 34549 + }, + { + "epoch": 0.4489614922934233, + "grad_norm": 0.45585426688194275, + "learning_rate": 0.00011023018235225308, + "loss": 1.3626, + "step": 34550 + }, + { + "epoch": 0.44897448683733915, + "grad_norm": 0.3173621594905853, + "learning_rate": 0.0001102275828903417, + "loss": 1.3216, + "step": 34551 + }, + { + "epoch": 0.44898748138125505, + "grad_norm": 0.41747456789016724, + "learning_rate": 0.00011022498342843033, + "loss": 1.4171, + "step": 34552 + }, + { + "epoch": 0.4490004759251709, + "grad_norm": 0.47379279136657715, + "learning_rate": 0.00011022238396651894, + "loss": 1.3124, + "step": 34553 + }, + { + "epoch": 0.4490134704690868, + "grad_norm": 0.4038969576358795, + "learning_rate": 0.00011021978450460755, + "loss": 1.3056, + "step": 34554 + }, + { + "epoch": 0.44902646501300264, + "grad_norm": 0.3103243410587311, + "learning_rate": 0.00011021718504269616, + "loss": 1.2316, + "step": 34555 + }, + { + "epoch": 0.44903945955691854, + "grad_norm": 0.37601181864738464, + "learning_rate": 0.00011021458558078479, + "loss": 1.4122, + "step": 34556 + }, + { + "epoch": 0.4490524541008344, + "grad_norm": 0.4181826412677765, + "learning_rate": 0.0001102119861188734, + "loss": 1.3827, + "step": 34557 + }, + { + "epoch": 0.4490654486447503, + "grad_norm": 0.34731724858283997, + "learning_rate": 0.00011020938665696201, + "loss": 1.4421, + "step": 34558 + }, + { + "epoch": 0.44907844318866613, + "grad_norm": 0.4501745402812958, + "learning_rate": 0.00011020678719505062, + "loss": 1.4119, + "step": 34559 + }, + { + "epoch": 0.44909143773258203, + "grad_norm": 0.4302853047847748, + "learning_rate": 0.00011020418773313925, + "loss": 1.4055, + "step": 34560 + }, + { + "epoch": 0.4491044322764979, + "grad_norm": 0.4178692102432251, + "learning_rate": 0.00011020158827122786, + "loss": 1.4248, + "step": 34561 + }, + { + "epoch": 0.4491174268204138, + "grad_norm": 0.37288540601730347, + "learning_rate": 0.00011019898880931647, + "loss": 1.4454, + "step": 34562 + }, + { + "epoch": 0.4491304213643297, + "grad_norm": 0.538740336894989, + "learning_rate": 0.00011019638934740508, + "loss": 1.4985, + "step": 34563 + }, + { + "epoch": 0.4491434159082455, + "grad_norm": 0.3887413442134857, + "learning_rate": 0.00011019378988549372, + "loss": 1.3569, + "step": 34564 + }, + { + "epoch": 0.4491564104521614, + "grad_norm": 0.4002898633480072, + "learning_rate": 0.00011019119042358233, + "loss": 1.402, + "step": 34565 + }, + { + "epoch": 0.44916940499607727, + "grad_norm": 0.4054885506629944, + "learning_rate": 0.00011018859096167094, + "loss": 1.3356, + "step": 34566 + }, + { + "epoch": 0.44918239953999317, + "grad_norm": 0.40970197319984436, + "learning_rate": 0.00011018599149975955, + "loss": 1.3802, + "step": 34567 + }, + { + "epoch": 0.449195394083909, + "grad_norm": 0.26434653997421265, + "learning_rate": 0.00011018339203784817, + "loss": 1.1602, + "step": 34568 + }, + { + "epoch": 0.4492083886278249, + "grad_norm": 0.3916490375995636, + "learning_rate": 0.00011018079257593678, + "loss": 1.38, + "step": 34569 + }, + { + "epoch": 0.44922138317174076, + "grad_norm": 0.5934067368507385, + "learning_rate": 0.0001101781931140254, + "loss": 1.562, + "step": 34570 + }, + { + "epoch": 0.44923437771565666, + "grad_norm": 0.41711217164993286, + "learning_rate": 0.00011017559365211403, + "loss": 1.3323, + "step": 34571 + }, + { + "epoch": 0.4492473722595725, + "grad_norm": 0.3497278392314911, + "learning_rate": 0.00011017299419020264, + "loss": 1.343, + "step": 34572 + }, + { + "epoch": 0.4492603668034884, + "grad_norm": 0.3907304108142853, + "learning_rate": 0.00011017039472829124, + "loss": 1.6952, + "step": 34573 + }, + { + "epoch": 0.44927336134740425, + "grad_norm": 0.444450318813324, + "learning_rate": 0.00011016779526637985, + "loss": 1.407, + "step": 34574 + }, + { + "epoch": 0.44928635589132015, + "grad_norm": 0.3275794982910156, + "learning_rate": 0.00011016519580446849, + "loss": 1.4977, + "step": 34575 + }, + { + "epoch": 0.449299350435236, + "grad_norm": 0.4418283700942993, + "learning_rate": 0.0001101625963425571, + "loss": 1.2848, + "step": 34576 + }, + { + "epoch": 0.4493123449791519, + "grad_norm": 0.3973557949066162, + "learning_rate": 0.00011015999688064571, + "loss": 1.5528, + "step": 34577 + }, + { + "epoch": 0.44932533952306775, + "grad_norm": 0.5718251466751099, + "learning_rate": 0.00011015739741873432, + "loss": 1.554, + "step": 34578 + }, + { + "epoch": 0.44933833406698365, + "grad_norm": 0.4450122117996216, + "learning_rate": 0.00011015479795682295, + "loss": 1.4901, + "step": 34579 + }, + { + "epoch": 0.4493513286108995, + "grad_norm": 0.4234688878059387, + "learning_rate": 0.00011015219849491156, + "loss": 1.5151, + "step": 34580 + }, + { + "epoch": 0.4493643231548154, + "grad_norm": 0.2874080240726471, + "learning_rate": 0.00011014959903300017, + "loss": 1.2937, + "step": 34581 + }, + { + "epoch": 0.44937731769873124, + "grad_norm": 0.40544554591178894, + "learning_rate": 0.00011014699957108878, + "loss": 1.4859, + "step": 34582 + }, + { + "epoch": 0.44939031224264714, + "grad_norm": 0.439264178276062, + "learning_rate": 0.00011014440010917742, + "loss": 1.2281, + "step": 34583 + }, + { + "epoch": 0.449403306786563, + "grad_norm": 0.3725937008857727, + "learning_rate": 0.00011014180064726603, + "loss": 1.5603, + "step": 34584 + }, + { + "epoch": 0.4494163013304789, + "grad_norm": 0.35467728972435, + "learning_rate": 0.00011013920118535464, + "loss": 1.3394, + "step": 34585 + }, + { + "epoch": 0.44942929587439473, + "grad_norm": 0.5001130700111389, + "learning_rate": 0.00011013660172344324, + "loss": 1.4047, + "step": 34586 + }, + { + "epoch": 0.44944229041831063, + "grad_norm": 0.3618234395980835, + "learning_rate": 0.00011013400226153188, + "loss": 1.2259, + "step": 34587 + }, + { + "epoch": 0.4494552849622265, + "grad_norm": 0.42493629455566406, + "learning_rate": 0.00011013140279962049, + "loss": 1.3431, + "step": 34588 + }, + { + "epoch": 0.4494682795061424, + "grad_norm": 0.4427499771118164, + "learning_rate": 0.0001101288033377091, + "loss": 1.442, + "step": 34589 + }, + { + "epoch": 0.4494812740500582, + "grad_norm": 0.5740325450897217, + "learning_rate": 0.00011012620387579771, + "loss": 1.4735, + "step": 34590 + }, + { + "epoch": 0.4494942685939741, + "grad_norm": 0.3771803677082062, + "learning_rate": 0.00011012360441388633, + "loss": 1.2133, + "step": 34591 + }, + { + "epoch": 0.44950726313788997, + "grad_norm": 0.35837626457214355, + "learning_rate": 0.00011012100495197494, + "loss": 1.5354, + "step": 34592 + }, + { + "epoch": 0.44952025768180587, + "grad_norm": 0.3096320629119873, + "learning_rate": 0.00011011840549006356, + "loss": 1.2037, + "step": 34593 + }, + { + "epoch": 0.4495332522257217, + "grad_norm": 0.40373578667640686, + "learning_rate": 0.00011011580602815217, + "loss": 1.544, + "step": 34594 + }, + { + "epoch": 0.4495462467696376, + "grad_norm": 0.2228272259235382, + "learning_rate": 0.0001101132065662408, + "loss": 1.1012, + "step": 34595 + }, + { + "epoch": 0.44955924131355346, + "grad_norm": 0.42053523659706116, + "learning_rate": 0.00011011060710432942, + "loss": 1.4426, + "step": 34596 + }, + { + "epoch": 0.44957223585746936, + "grad_norm": 0.4178027808666229, + "learning_rate": 0.00011010800764241803, + "loss": 1.3974, + "step": 34597 + }, + { + "epoch": 0.4495852304013852, + "grad_norm": 0.3505808711051941, + "learning_rate": 0.00011010540818050662, + "loss": 1.2488, + "step": 34598 + }, + { + "epoch": 0.4495982249453011, + "grad_norm": 0.5439279079437256, + "learning_rate": 0.00011010280871859526, + "loss": 1.4412, + "step": 34599 + }, + { + "epoch": 0.44961121948921695, + "grad_norm": 0.4107949435710907, + "learning_rate": 0.00011010020925668387, + "loss": 1.5524, + "step": 34600 + }, + { + "epoch": 0.44962421403313285, + "grad_norm": 0.4186403155326843, + "learning_rate": 0.00011009760979477248, + "loss": 1.2991, + "step": 34601 + }, + { + "epoch": 0.4496372085770487, + "grad_norm": 0.5023602843284607, + "learning_rate": 0.0001100950103328611, + "loss": 1.4306, + "step": 34602 + }, + { + "epoch": 0.4496502031209646, + "grad_norm": 0.3692898750305176, + "learning_rate": 0.00011009241087094972, + "loss": 1.4973, + "step": 34603 + }, + { + "epoch": 0.44966319766488044, + "grad_norm": 0.35382771492004395, + "learning_rate": 0.00011008981140903833, + "loss": 1.24, + "step": 34604 + }, + { + "epoch": 0.44967619220879634, + "grad_norm": 0.3985137939453125, + "learning_rate": 0.00011008721194712694, + "loss": 1.3116, + "step": 34605 + }, + { + "epoch": 0.4496891867527122, + "grad_norm": 0.46171829104423523, + "learning_rate": 0.00011008461248521555, + "loss": 1.1653, + "step": 34606 + }, + { + "epoch": 0.4497021812966281, + "grad_norm": 0.4294068515300751, + "learning_rate": 0.00011008201302330419, + "loss": 1.4257, + "step": 34607 + }, + { + "epoch": 0.44971517584054393, + "grad_norm": 0.43269744515419006, + "learning_rate": 0.0001100794135613928, + "loss": 1.5493, + "step": 34608 + }, + { + "epoch": 0.44972817038445984, + "grad_norm": 0.4605969786643982, + "learning_rate": 0.00011007681409948141, + "loss": 1.5076, + "step": 34609 + }, + { + "epoch": 0.4497411649283757, + "grad_norm": 0.4703240692615509, + "learning_rate": 0.00011007421463757004, + "loss": 1.309, + "step": 34610 + }, + { + "epoch": 0.4497541594722916, + "grad_norm": 0.4340241253376007, + "learning_rate": 0.00011007161517565865, + "loss": 1.3003, + "step": 34611 + }, + { + "epoch": 0.4497671540162074, + "grad_norm": 0.4455007016658783, + "learning_rate": 0.00011006901571374726, + "loss": 1.4387, + "step": 34612 + }, + { + "epoch": 0.4497801485601233, + "grad_norm": 0.41624656319618225, + "learning_rate": 0.00011006641625183587, + "loss": 1.4398, + "step": 34613 + }, + { + "epoch": 0.4497931431040392, + "grad_norm": 0.3853873312473297, + "learning_rate": 0.00011006381678992451, + "loss": 1.4882, + "step": 34614 + }, + { + "epoch": 0.4498061376479551, + "grad_norm": 0.4715087115764618, + "learning_rate": 0.0001100612173280131, + "loss": 1.4296, + "step": 34615 + }, + { + "epoch": 0.4498191321918709, + "grad_norm": 0.4221077859401703, + "learning_rate": 0.00011005861786610172, + "loss": 1.3067, + "step": 34616 + }, + { + "epoch": 0.4498321267357868, + "grad_norm": 0.37524187564849854, + "learning_rate": 0.00011005601840419033, + "loss": 1.314, + "step": 34617 + }, + { + "epoch": 0.44984512127970266, + "grad_norm": 0.3193286061286926, + "learning_rate": 0.00011005341894227896, + "loss": 1.4158, + "step": 34618 + }, + { + "epoch": 0.44985811582361857, + "grad_norm": 0.2873690128326416, + "learning_rate": 0.00011005081948036758, + "loss": 1.249, + "step": 34619 + }, + { + "epoch": 0.4498711103675344, + "grad_norm": 0.5461328625679016, + "learning_rate": 0.00011004822001845619, + "loss": 1.4104, + "step": 34620 + }, + { + "epoch": 0.4498841049114503, + "grad_norm": 0.45275986194610596, + "learning_rate": 0.0001100456205565448, + "loss": 1.412, + "step": 34621 + }, + { + "epoch": 0.44989709945536616, + "grad_norm": 0.39862167835235596, + "learning_rate": 0.00011004302109463342, + "loss": 1.4582, + "step": 34622 + }, + { + "epoch": 0.44991009399928206, + "grad_norm": 0.4049936532974243, + "learning_rate": 0.00011004042163272203, + "loss": 1.3718, + "step": 34623 + }, + { + "epoch": 0.4499230885431979, + "grad_norm": 0.4950532019138336, + "learning_rate": 0.00011003782217081064, + "loss": 1.2276, + "step": 34624 + }, + { + "epoch": 0.4499360830871138, + "grad_norm": 0.33813920617103577, + "learning_rate": 0.00011003522270889925, + "loss": 1.5944, + "step": 34625 + }, + { + "epoch": 0.44994907763102965, + "grad_norm": 0.5054971575737, + "learning_rate": 0.00011003262324698789, + "loss": 1.5062, + "step": 34626 + }, + { + "epoch": 0.44996207217494555, + "grad_norm": 0.4089203178882599, + "learning_rate": 0.0001100300237850765, + "loss": 1.4684, + "step": 34627 + }, + { + "epoch": 0.4499750667188614, + "grad_norm": 0.3653579652309418, + "learning_rate": 0.0001100274243231651, + "loss": 1.3285, + "step": 34628 + }, + { + "epoch": 0.4499880612627773, + "grad_norm": 0.4717418849468231, + "learning_rate": 0.00011002482486125371, + "loss": 1.2705, + "step": 34629 + }, + { + "epoch": 0.45000105580669314, + "grad_norm": 0.41295260190963745, + "learning_rate": 0.00011002222539934235, + "loss": 1.4125, + "step": 34630 + }, + { + "epoch": 0.45001405035060904, + "grad_norm": 0.32432249188423157, + "learning_rate": 0.00011001962593743096, + "loss": 1.2787, + "step": 34631 + }, + { + "epoch": 0.4500270448945249, + "grad_norm": 0.39494869112968445, + "learning_rate": 0.00011001702647551957, + "loss": 1.6407, + "step": 34632 + }, + { + "epoch": 0.4500400394384408, + "grad_norm": 0.45177191495895386, + "learning_rate": 0.00011001442701360818, + "loss": 1.42, + "step": 34633 + }, + { + "epoch": 0.45005303398235663, + "grad_norm": 0.36676496267318726, + "learning_rate": 0.00011001182755169681, + "loss": 1.3545, + "step": 34634 + }, + { + "epoch": 0.45006602852627253, + "grad_norm": 0.39819958806037903, + "learning_rate": 0.00011000922808978542, + "loss": 1.487, + "step": 34635 + }, + { + "epoch": 0.4500790230701884, + "grad_norm": 0.3898221254348755, + "learning_rate": 0.00011000662862787403, + "loss": 1.4685, + "step": 34636 + }, + { + "epoch": 0.4500920176141043, + "grad_norm": 0.3104458153247833, + "learning_rate": 0.00011000402916596264, + "loss": 1.3507, + "step": 34637 + }, + { + "epoch": 0.4501050121580201, + "grad_norm": 0.44826236367225647, + "learning_rate": 0.00011000142970405128, + "loss": 1.3661, + "step": 34638 + }, + { + "epoch": 0.450118006701936, + "grad_norm": 0.41079476475715637, + "learning_rate": 0.00010999883024213989, + "loss": 1.3416, + "step": 34639 + }, + { + "epoch": 0.4501310012458519, + "grad_norm": 0.3759309947490692, + "learning_rate": 0.00010999623078022849, + "loss": 1.528, + "step": 34640 + }, + { + "epoch": 0.45014399578976777, + "grad_norm": 0.33685725927352905, + "learning_rate": 0.0001099936313183171, + "loss": 1.1044, + "step": 34641 + }, + { + "epoch": 0.45015699033368367, + "grad_norm": 0.378028005361557, + "learning_rate": 0.00010999103185640574, + "loss": 1.1589, + "step": 34642 + }, + { + "epoch": 0.4501699848775995, + "grad_norm": 0.452457457780838, + "learning_rate": 0.00010998843239449435, + "loss": 1.5207, + "step": 34643 + }, + { + "epoch": 0.4501829794215154, + "grad_norm": 0.3738781213760376, + "learning_rate": 0.00010998583293258296, + "loss": 1.4272, + "step": 34644 + }, + { + "epoch": 0.45019597396543126, + "grad_norm": 0.46590104699134827, + "learning_rate": 0.00010998323347067158, + "loss": 1.4186, + "step": 34645 + }, + { + "epoch": 0.45020896850934716, + "grad_norm": 0.42637282609939575, + "learning_rate": 0.00010998063400876019, + "loss": 1.3875, + "step": 34646 + }, + { + "epoch": 0.450221963053263, + "grad_norm": 0.44957152009010315, + "learning_rate": 0.0001099780345468488, + "loss": 1.3859, + "step": 34647 + }, + { + "epoch": 0.4502349575971789, + "grad_norm": 0.4742974638938904, + "learning_rate": 0.00010997543508493741, + "loss": 1.4974, + "step": 34648 + }, + { + "epoch": 0.45024795214109475, + "grad_norm": 0.4378099739551544, + "learning_rate": 0.00010997283562302605, + "loss": 1.4062, + "step": 34649 + }, + { + "epoch": 0.45026094668501065, + "grad_norm": 0.4108748137950897, + "learning_rate": 0.00010997023616111466, + "loss": 1.3695, + "step": 34650 + }, + { + "epoch": 0.4502739412289265, + "grad_norm": 0.4649498164653778, + "learning_rate": 0.00010996763669920327, + "loss": 1.3068, + "step": 34651 + }, + { + "epoch": 0.4502869357728424, + "grad_norm": 0.42827990651130676, + "learning_rate": 0.00010996503723729189, + "loss": 1.4296, + "step": 34652 + }, + { + "epoch": 0.45029993031675825, + "grad_norm": 0.36306333541870117, + "learning_rate": 0.00010996243777538051, + "loss": 1.4684, + "step": 34653 + }, + { + "epoch": 0.45031292486067415, + "grad_norm": 0.4497435986995697, + "learning_rate": 0.00010995983831346912, + "loss": 1.4764, + "step": 34654 + }, + { + "epoch": 0.45032591940459, + "grad_norm": 0.3822482228279114, + "learning_rate": 0.00010995723885155773, + "loss": 1.6684, + "step": 34655 + }, + { + "epoch": 0.4503389139485059, + "grad_norm": 0.3489624857902527, + "learning_rate": 0.00010995463938964634, + "loss": 1.3735, + "step": 34656 + }, + { + "epoch": 0.45035190849242174, + "grad_norm": 0.33736705780029297, + "learning_rate": 0.00010995203992773497, + "loss": 1.2245, + "step": 34657 + }, + { + "epoch": 0.45036490303633764, + "grad_norm": 0.29852402210235596, + "learning_rate": 0.00010994944046582358, + "loss": 1.3669, + "step": 34658 + }, + { + "epoch": 0.4503778975802535, + "grad_norm": 0.4361458718776703, + "learning_rate": 0.00010994684100391219, + "loss": 1.6184, + "step": 34659 + }, + { + "epoch": 0.4503908921241694, + "grad_norm": 0.43799087405204773, + "learning_rate": 0.0001099442415420008, + "loss": 1.3209, + "step": 34660 + }, + { + "epoch": 0.45040388666808523, + "grad_norm": 0.4746192693710327, + "learning_rate": 0.00010994164208008944, + "loss": 1.5739, + "step": 34661 + }, + { + "epoch": 0.45041688121200113, + "grad_norm": 0.4274558424949646, + "learning_rate": 0.00010993904261817805, + "loss": 1.3528, + "step": 34662 + }, + { + "epoch": 0.450429875755917, + "grad_norm": 0.35588523745536804, + "learning_rate": 0.00010993644315626666, + "loss": 1.3957, + "step": 34663 + }, + { + "epoch": 0.4504428702998329, + "grad_norm": 0.3102421164512634, + "learning_rate": 0.00010993384369435527, + "loss": 1.3901, + "step": 34664 + }, + { + "epoch": 0.4504558648437487, + "grad_norm": 0.39387017488479614, + "learning_rate": 0.0001099312442324439, + "loss": 1.5012, + "step": 34665 + }, + { + "epoch": 0.4504688593876646, + "grad_norm": 0.38885587453842163, + "learning_rate": 0.0001099286447705325, + "loss": 1.5434, + "step": 34666 + }, + { + "epoch": 0.45048185393158047, + "grad_norm": 0.3618861734867096, + "learning_rate": 0.00010992604530862112, + "loss": 1.3914, + "step": 34667 + }, + { + "epoch": 0.45049484847549637, + "grad_norm": 0.44911736249923706, + "learning_rate": 0.00010992344584670973, + "loss": 1.5369, + "step": 34668 + }, + { + "epoch": 0.4505078430194122, + "grad_norm": 0.3879886567592621, + "learning_rate": 0.00010992084638479837, + "loss": 1.1754, + "step": 34669 + }, + { + "epoch": 0.4505208375633281, + "grad_norm": 0.4212639331817627, + "learning_rate": 0.00010991824692288696, + "loss": 1.2827, + "step": 34670 + }, + { + "epoch": 0.45053383210724396, + "grad_norm": 0.41131022572517395, + "learning_rate": 0.00010991564746097557, + "loss": 1.3663, + "step": 34671 + }, + { + "epoch": 0.45054682665115986, + "grad_norm": 0.5232630968093872, + "learning_rate": 0.00010991304799906419, + "loss": 1.4822, + "step": 34672 + }, + { + "epoch": 0.4505598211950757, + "grad_norm": 0.42684683203697205, + "learning_rate": 0.00010991044853715282, + "loss": 1.3651, + "step": 34673 + }, + { + "epoch": 0.4505728157389916, + "grad_norm": 0.5546239614486694, + "learning_rate": 0.00010990784907524143, + "loss": 1.398, + "step": 34674 + }, + { + "epoch": 0.45058581028290745, + "grad_norm": 0.3987673223018646, + "learning_rate": 0.00010990524961333005, + "loss": 1.4861, + "step": 34675 + }, + { + "epoch": 0.45059880482682335, + "grad_norm": 0.4570951461791992, + "learning_rate": 0.00010990265015141866, + "loss": 1.3202, + "step": 34676 + }, + { + "epoch": 0.4506117993707392, + "grad_norm": 0.421896368265152, + "learning_rate": 0.00010990005068950728, + "loss": 1.392, + "step": 34677 + }, + { + "epoch": 0.4506247939146551, + "grad_norm": 0.34930211305618286, + "learning_rate": 0.00010989745122759589, + "loss": 1.6332, + "step": 34678 + }, + { + "epoch": 0.45063778845857094, + "grad_norm": 0.43266966938972473, + "learning_rate": 0.0001098948517656845, + "loss": 1.3176, + "step": 34679 + }, + { + "epoch": 0.45065078300248684, + "grad_norm": 0.42050305008888245, + "learning_rate": 0.00010989225230377311, + "loss": 1.6115, + "step": 34680 + }, + { + "epoch": 0.4506637775464027, + "grad_norm": 0.42231515049934387, + "learning_rate": 0.00010988965284186175, + "loss": 1.518, + "step": 34681 + }, + { + "epoch": 0.4506767720903186, + "grad_norm": 0.3048645853996277, + "learning_rate": 0.00010988705337995035, + "loss": 1.2405, + "step": 34682 + }, + { + "epoch": 0.45068976663423443, + "grad_norm": 0.3793469965457916, + "learning_rate": 0.00010988445391803896, + "loss": 1.4223, + "step": 34683 + }, + { + "epoch": 0.45070276117815034, + "grad_norm": 0.3899625837802887, + "learning_rate": 0.0001098818544561276, + "loss": 1.2016, + "step": 34684 + }, + { + "epoch": 0.4507157557220662, + "grad_norm": 0.33458301424980164, + "learning_rate": 0.00010987925499421621, + "loss": 1.1769, + "step": 34685 + }, + { + "epoch": 0.4507287502659821, + "grad_norm": 0.4201454818248749, + "learning_rate": 0.00010987665553230482, + "loss": 1.3793, + "step": 34686 + }, + { + "epoch": 0.4507417448098979, + "grad_norm": 0.2987552583217621, + "learning_rate": 0.00010987405607039343, + "loss": 1.4608, + "step": 34687 + }, + { + "epoch": 0.4507547393538138, + "grad_norm": 0.49060991406440735, + "learning_rate": 0.00010987145660848206, + "loss": 1.4257, + "step": 34688 + }, + { + "epoch": 0.45076773389772967, + "grad_norm": 0.4518246650695801, + "learning_rate": 0.00010986885714657067, + "loss": 1.4056, + "step": 34689 + }, + { + "epoch": 0.4507807284416456, + "grad_norm": 0.38075777888298035, + "learning_rate": 0.00010986625768465928, + "loss": 1.399, + "step": 34690 + }, + { + "epoch": 0.4507937229855614, + "grad_norm": 0.348307341337204, + "learning_rate": 0.00010986365822274789, + "loss": 1.348, + "step": 34691 + }, + { + "epoch": 0.4508067175294773, + "grad_norm": 0.4711441695690155, + "learning_rate": 0.00010986105876083653, + "loss": 1.6102, + "step": 34692 + }, + { + "epoch": 0.45081971207339316, + "grad_norm": 0.3929104804992676, + "learning_rate": 0.00010985845929892514, + "loss": 1.4359, + "step": 34693 + }, + { + "epoch": 0.45083270661730906, + "grad_norm": 0.45121195912361145, + "learning_rate": 0.00010985585983701375, + "loss": 1.5505, + "step": 34694 + }, + { + "epoch": 0.4508457011612249, + "grad_norm": 0.27183642983436584, + "learning_rate": 0.00010985326037510235, + "loss": 1.2986, + "step": 34695 + }, + { + "epoch": 0.4508586957051408, + "grad_norm": 0.4760079085826874, + "learning_rate": 0.00010985066091319098, + "loss": 1.3248, + "step": 34696 + }, + { + "epoch": 0.45087169024905666, + "grad_norm": 0.3645307421684265, + "learning_rate": 0.0001098480614512796, + "loss": 1.3651, + "step": 34697 + }, + { + "epoch": 0.45088468479297256, + "grad_norm": 0.4608154892921448, + "learning_rate": 0.0001098454619893682, + "loss": 1.587, + "step": 34698 + }, + { + "epoch": 0.4508976793368884, + "grad_norm": 0.2584189772605896, + "learning_rate": 0.00010984286252745682, + "loss": 1.3133, + "step": 34699 + }, + { + "epoch": 0.4509106738808043, + "grad_norm": 0.4810093343257904, + "learning_rate": 0.00010984026306554544, + "loss": 1.433, + "step": 34700 + }, + { + "epoch": 0.45092366842472015, + "grad_norm": 0.4034838378429413, + "learning_rate": 0.00010983766360363405, + "loss": 1.5186, + "step": 34701 + }, + { + "epoch": 0.45093666296863605, + "grad_norm": 0.2878677248954773, + "learning_rate": 0.00010983506414172266, + "loss": 1.3735, + "step": 34702 + }, + { + "epoch": 0.4509496575125519, + "grad_norm": 0.3981530964374542, + "learning_rate": 0.00010983246467981127, + "loss": 1.4509, + "step": 34703 + }, + { + "epoch": 0.4509626520564678, + "grad_norm": 0.3467041552066803, + "learning_rate": 0.00010982986521789991, + "loss": 1.2871, + "step": 34704 + }, + { + "epoch": 0.45097564660038364, + "grad_norm": 0.43963155150413513, + "learning_rate": 0.00010982726575598852, + "loss": 1.5168, + "step": 34705 + }, + { + "epoch": 0.45098864114429954, + "grad_norm": 0.4081425070762634, + "learning_rate": 0.00010982466629407713, + "loss": 1.2588, + "step": 34706 + }, + { + "epoch": 0.4510016356882154, + "grad_norm": 0.4085533916950226, + "learning_rate": 0.00010982206683216574, + "loss": 1.5025, + "step": 34707 + }, + { + "epoch": 0.4510146302321313, + "grad_norm": 0.4102005064487457, + "learning_rate": 0.00010981946737025437, + "loss": 1.513, + "step": 34708 + }, + { + "epoch": 0.45102762477604713, + "grad_norm": 0.37340712547302246, + "learning_rate": 0.00010981686790834298, + "loss": 1.3159, + "step": 34709 + }, + { + "epoch": 0.45104061931996303, + "grad_norm": 0.3997344672679901, + "learning_rate": 0.00010981426844643159, + "loss": 1.4921, + "step": 34710 + }, + { + "epoch": 0.4510536138638789, + "grad_norm": 0.3900180757045746, + "learning_rate": 0.0001098116689845202, + "loss": 1.4969, + "step": 34711 + }, + { + "epoch": 0.4510666084077948, + "grad_norm": 0.38005638122558594, + "learning_rate": 0.00010980906952260883, + "loss": 1.3136, + "step": 34712 + }, + { + "epoch": 0.4510796029517106, + "grad_norm": 0.4513770341873169, + "learning_rate": 0.00010980647006069744, + "loss": 1.5497, + "step": 34713 + }, + { + "epoch": 0.4510925974956265, + "grad_norm": 0.3341437876224518, + "learning_rate": 0.00010980387059878605, + "loss": 1.1552, + "step": 34714 + }, + { + "epoch": 0.4511055920395424, + "grad_norm": 0.3980044424533844, + "learning_rate": 0.00010980127113687466, + "loss": 1.4221, + "step": 34715 + }, + { + "epoch": 0.45111858658345827, + "grad_norm": 0.3460882306098938, + "learning_rate": 0.0001097986716749633, + "loss": 1.3419, + "step": 34716 + }, + { + "epoch": 0.45113158112737417, + "grad_norm": 0.42624014616012573, + "learning_rate": 0.00010979607221305191, + "loss": 1.3568, + "step": 34717 + }, + { + "epoch": 0.45114457567129, + "grad_norm": 0.2534487247467041, + "learning_rate": 0.00010979347275114052, + "loss": 1.1958, + "step": 34718 + }, + { + "epoch": 0.4511575702152059, + "grad_norm": 0.37547239661216736, + "learning_rate": 0.00010979087328922914, + "loss": 1.3855, + "step": 34719 + }, + { + "epoch": 0.45117056475912176, + "grad_norm": 0.37876471877098083, + "learning_rate": 0.00010978827382731775, + "loss": 1.3258, + "step": 34720 + }, + { + "epoch": 0.45118355930303766, + "grad_norm": 0.4101529121398926, + "learning_rate": 0.00010978567436540636, + "loss": 1.4229, + "step": 34721 + }, + { + "epoch": 0.4511965538469535, + "grad_norm": 0.3159874379634857, + "learning_rate": 0.00010978307490349498, + "loss": 1.2605, + "step": 34722 + }, + { + "epoch": 0.4512095483908694, + "grad_norm": 0.2980130612850189, + "learning_rate": 0.00010978047544158361, + "loss": 1.201, + "step": 34723 + }, + { + "epoch": 0.45122254293478525, + "grad_norm": 0.4047219157218933, + "learning_rate": 0.00010977787597967221, + "loss": 1.5541, + "step": 34724 + }, + { + "epoch": 0.45123553747870115, + "grad_norm": 0.4015253782272339, + "learning_rate": 0.00010977527651776082, + "loss": 1.4204, + "step": 34725 + }, + { + "epoch": 0.451248532022617, + "grad_norm": 0.4201032817363739, + "learning_rate": 0.00010977267705584943, + "loss": 1.3287, + "step": 34726 + }, + { + "epoch": 0.4512615265665329, + "grad_norm": 0.40899431705474854, + "learning_rate": 0.00010977007759393807, + "loss": 1.4549, + "step": 34727 + }, + { + "epoch": 0.45127452111044875, + "grad_norm": 0.4537949562072754, + "learning_rate": 0.00010976747813202668, + "loss": 1.5097, + "step": 34728 + }, + { + "epoch": 0.45128751565436465, + "grad_norm": 0.3765506148338318, + "learning_rate": 0.00010976487867011529, + "loss": 1.0697, + "step": 34729 + }, + { + "epoch": 0.4513005101982805, + "grad_norm": 0.4242401719093323, + "learning_rate": 0.0001097622792082039, + "loss": 1.4187, + "step": 34730 + }, + { + "epoch": 0.4513135047421964, + "grad_norm": 0.30027660727500916, + "learning_rate": 0.00010975967974629253, + "loss": 1.3223, + "step": 34731 + }, + { + "epoch": 0.45132649928611224, + "grad_norm": 0.38335204124450684, + "learning_rate": 0.00010975708028438114, + "loss": 1.2228, + "step": 34732 + }, + { + "epoch": 0.45133949383002814, + "grad_norm": 0.409382700920105, + "learning_rate": 0.00010975448082246975, + "loss": 1.4447, + "step": 34733 + }, + { + "epoch": 0.451352488373944, + "grad_norm": 0.3845069408416748, + "learning_rate": 0.00010975188136055836, + "loss": 1.4398, + "step": 34734 + }, + { + "epoch": 0.4513654829178599, + "grad_norm": 0.46727463603019714, + "learning_rate": 0.000109749281898647, + "loss": 1.4233, + "step": 34735 + }, + { + "epoch": 0.45137847746177573, + "grad_norm": 0.363198846578598, + "learning_rate": 0.00010974668243673561, + "loss": 1.3647, + "step": 34736 + }, + { + "epoch": 0.45139147200569163, + "grad_norm": 0.39461666345596313, + "learning_rate": 0.00010974408297482421, + "loss": 1.2735, + "step": 34737 + }, + { + "epoch": 0.4514044665496075, + "grad_norm": 0.35019561648368835, + "learning_rate": 0.00010974148351291282, + "loss": 1.4918, + "step": 34738 + }, + { + "epoch": 0.4514174610935234, + "grad_norm": 0.3905196487903595, + "learning_rate": 0.00010973888405100146, + "loss": 1.39, + "step": 34739 + }, + { + "epoch": 0.4514304556374392, + "grad_norm": 0.45560622215270996, + "learning_rate": 0.00010973628458909007, + "loss": 1.4607, + "step": 34740 + }, + { + "epoch": 0.4514434501813551, + "grad_norm": 0.42589449882507324, + "learning_rate": 0.00010973368512717868, + "loss": 1.3714, + "step": 34741 + }, + { + "epoch": 0.45145644472527097, + "grad_norm": 0.5541911721229553, + "learning_rate": 0.00010973108566526729, + "loss": 1.2459, + "step": 34742 + }, + { + "epoch": 0.45146943926918687, + "grad_norm": 0.4173346757888794, + "learning_rate": 0.00010972848620335591, + "loss": 1.3731, + "step": 34743 + }, + { + "epoch": 0.4514824338131027, + "grad_norm": 0.37347304821014404, + "learning_rate": 0.00010972588674144452, + "loss": 1.5033, + "step": 34744 + }, + { + "epoch": 0.4514954283570186, + "grad_norm": 0.38289675116539, + "learning_rate": 0.00010972328727953314, + "loss": 1.3628, + "step": 34745 + }, + { + "epoch": 0.45150842290093446, + "grad_norm": 0.4913569390773773, + "learning_rate": 0.00010972068781762175, + "loss": 1.433, + "step": 34746 + }, + { + "epoch": 0.45152141744485036, + "grad_norm": 0.4125804901123047, + "learning_rate": 0.00010971808835571038, + "loss": 1.3467, + "step": 34747 + }, + { + "epoch": 0.4515344119887662, + "grad_norm": 0.3636202812194824, + "learning_rate": 0.000109715488893799, + "loss": 1.4488, + "step": 34748 + }, + { + "epoch": 0.4515474065326821, + "grad_norm": 0.3246307373046875, + "learning_rate": 0.0001097128894318876, + "loss": 1.3717, + "step": 34749 + }, + { + "epoch": 0.45156040107659795, + "grad_norm": 0.36629313230514526, + "learning_rate": 0.0001097102899699762, + "loss": 1.2703, + "step": 34750 + }, + { + "epoch": 0.45157339562051385, + "grad_norm": 0.3734608590602875, + "learning_rate": 0.00010970769050806484, + "loss": 1.3177, + "step": 34751 + }, + { + "epoch": 0.4515863901644297, + "grad_norm": 0.3563080430030823, + "learning_rate": 0.00010970509104615345, + "loss": 1.3234, + "step": 34752 + }, + { + "epoch": 0.4515993847083456, + "grad_norm": 0.4973289370536804, + "learning_rate": 0.00010970249158424206, + "loss": 1.3033, + "step": 34753 + }, + { + "epoch": 0.45161237925226144, + "grad_norm": 0.33764708042144775, + "learning_rate": 0.00010969989212233067, + "loss": 1.3726, + "step": 34754 + }, + { + "epoch": 0.45162537379617734, + "grad_norm": 0.3473716676235199, + "learning_rate": 0.0001096972926604193, + "loss": 1.1291, + "step": 34755 + }, + { + "epoch": 0.4516383683400932, + "grad_norm": 0.4495391249656677, + "learning_rate": 0.00010969469319850791, + "loss": 1.4441, + "step": 34756 + }, + { + "epoch": 0.4516513628840091, + "grad_norm": 0.4151090085506439, + "learning_rate": 0.00010969209373659652, + "loss": 1.3128, + "step": 34757 + }, + { + "epoch": 0.45166435742792493, + "grad_norm": 0.506668746471405, + "learning_rate": 0.00010968949427468516, + "loss": 1.5258, + "step": 34758 + }, + { + "epoch": 0.45167735197184083, + "grad_norm": 0.4666251838207245, + "learning_rate": 0.00010968689481277377, + "loss": 1.3781, + "step": 34759 + }, + { + "epoch": 0.4516903465157567, + "grad_norm": 0.3775852918624878, + "learning_rate": 0.00010968429535086238, + "loss": 1.4777, + "step": 34760 + }, + { + "epoch": 0.4517033410596726, + "grad_norm": 0.39806291460990906, + "learning_rate": 0.00010968169588895099, + "loss": 1.329, + "step": 34761 + }, + { + "epoch": 0.4517163356035884, + "grad_norm": 0.38272690773010254, + "learning_rate": 0.00010967909642703962, + "loss": 1.5868, + "step": 34762 + }, + { + "epoch": 0.4517293301475043, + "grad_norm": 0.3841523826122284, + "learning_rate": 0.00010967649696512823, + "loss": 1.3323, + "step": 34763 + }, + { + "epoch": 0.45174232469142017, + "grad_norm": 0.4096437692642212, + "learning_rate": 0.00010967389750321684, + "loss": 1.2445, + "step": 34764 + }, + { + "epoch": 0.4517553192353361, + "grad_norm": 0.3654595911502838, + "learning_rate": 0.00010967129804130545, + "loss": 1.5167, + "step": 34765 + }, + { + "epoch": 0.4517683137792519, + "grad_norm": 0.5160888433456421, + "learning_rate": 0.00010966869857939407, + "loss": 1.4523, + "step": 34766 + }, + { + "epoch": 0.4517813083231678, + "grad_norm": 0.5552314519882202, + "learning_rate": 0.00010966609911748268, + "loss": 1.4886, + "step": 34767 + }, + { + "epoch": 0.45179430286708366, + "grad_norm": 0.2955682873725891, + "learning_rate": 0.0001096634996555713, + "loss": 1.3102, + "step": 34768 + }, + { + "epoch": 0.45180729741099956, + "grad_norm": 0.38363808393478394, + "learning_rate": 0.0001096609001936599, + "loss": 1.4417, + "step": 34769 + }, + { + "epoch": 0.4518202919549154, + "grad_norm": 0.3800087571144104, + "learning_rate": 0.00010965830073174854, + "loss": 1.1341, + "step": 34770 + }, + { + "epoch": 0.4518332864988313, + "grad_norm": 0.45169904828071594, + "learning_rate": 0.00010965570126983716, + "loss": 1.3101, + "step": 34771 + }, + { + "epoch": 0.45184628104274716, + "grad_norm": 0.3953300416469574, + "learning_rate": 0.00010965310180792577, + "loss": 1.4107, + "step": 34772 + }, + { + "epoch": 0.45185927558666306, + "grad_norm": 0.3940204381942749, + "learning_rate": 0.00010965050234601438, + "loss": 1.421, + "step": 34773 + }, + { + "epoch": 0.4518722701305789, + "grad_norm": 0.3225698471069336, + "learning_rate": 0.000109647902884103, + "loss": 1.4676, + "step": 34774 + }, + { + "epoch": 0.4518852646744948, + "grad_norm": 0.4055837094783783, + "learning_rate": 0.00010964530342219161, + "loss": 1.5282, + "step": 34775 + }, + { + "epoch": 0.45189825921841065, + "grad_norm": 0.38688719272613525, + "learning_rate": 0.00010964270396028022, + "loss": 1.3816, + "step": 34776 + }, + { + "epoch": 0.45191125376232655, + "grad_norm": 0.392902135848999, + "learning_rate": 0.00010964010449836883, + "loss": 1.2913, + "step": 34777 + }, + { + "epoch": 0.4519242483062424, + "grad_norm": 0.43108752369880676, + "learning_rate": 0.00010963750503645747, + "loss": 1.4338, + "step": 34778 + }, + { + "epoch": 0.4519372428501583, + "grad_norm": 0.45072755217552185, + "learning_rate": 0.00010963490557454607, + "loss": 1.5077, + "step": 34779 + }, + { + "epoch": 0.45195023739407414, + "grad_norm": 0.30004703998565674, + "learning_rate": 0.00010963230611263468, + "loss": 1.3252, + "step": 34780 + }, + { + "epoch": 0.45196323193799004, + "grad_norm": 0.3558892607688904, + "learning_rate": 0.00010962970665072329, + "loss": 1.3765, + "step": 34781 + }, + { + "epoch": 0.4519762264819059, + "grad_norm": 0.38909611105918884, + "learning_rate": 0.00010962710718881193, + "loss": 1.3962, + "step": 34782 + }, + { + "epoch": 0.4519892210258218, + "grad_norm": 0.3764425218105316, + "learning_rate": 0.00010962450772690054, + "loss": 1.4578, + "step": 34783 + }, + { + "epoch": 0.45200221556973763, + "grad_norm": 0.3882215917110443, + "learning_rate": 0.00010962190826498915, + "loss": 1.4854, + "step": 34784 + }, + { + "epoch": 0.45201521011365353, + "grad_norm": 0.5179300308227539, + "learning_rate": 0.00010961930880307776, + "loss": 1.4046, + "step": 34785 + }, + { + "epoch": 0.4520282046575694, + "grad_norm": 0.32083791494369507, + "learning_rate": 0.00010961670934116639, + "loss": 1.2995, + "step": 34786 + }, + { + "epoch": 0.4520411992014853, + "grad_norm": 0.38110214471817017, + "learning_rate": 0.000109614109879255, + "loss": 1.5026, + "step": 34787 + }, + { + "epoch": 0.4520541937454011, + "grad_norm": 0.4439060688018799, + "learning_rate": 0.00010961151041734361, + "loss": 1.4338, + "step": 34788 + }, + { + "epoch": 0.452067188289317, + "grad_norm": 0.3672008812427521, + "learning_rate": 0.00010960891095543222, + "loss": 1.4792, + "step": 34789 + }, + { + "epoch": 0.45208018283323287, + "grad_norm": 0.5323629975318909, + "learning_rate": 0.00010960631149352086, + "loss": 1.5222, + "step": 34790 + }, + { + "epoch": 0.45209317737714877, + "grad_norm": 0.3720775246620178, + "learning_rate": 0.00010960371203160947, + "loss": 1.4019, + "step": 34791 + }, + { + "epoch": 0.45210617192106467, + "grad_norm": 0.3579128086566925, + "learning_rate": 0.00010960111256969807, + "loss": 1.3139, + "step": 34792 + }, + { + "epoch": 0.4521191664649805, + "grad_norm": 0.3261306881904602, + "learning_rate": 0.0001095985131077867, + "loss": 1.1858, + "step": 34793 + }, + { + "epoch": 0.4521321610088964, + "grad_norm": 0.7582377791404724, + "learning_rate": 0.00010959591364587532, + "loss": 1.4017, + "step": 34794 + }, + { + "epoch": 0.45214515555281226, + "grad_norm": 0.33804845809936523, + "learning_rate": 0.00010959331418396393, + "loss": 1.473, + "step": 34795 + }, + { + "epoch": 0.45215815009672816, + "grad_norm": 0.32930707931518555, + "learning_rate": 0.00010959071472205254, + "loss": 1.3993, + "step": 34796 + }, + { + "epoch": 0.452171144640644, + "grad_norm": 0.3443145155906677, + "learning_rate": 0.00010958811526014116, + "loss": 1.3995, + "step": 34797 + }, + { + "epoch": 0.4521841391845599, + "grad_norm": 0.3540690541267395, + "learning_rate": 0.00010958551579822977, + "loss": 1.1822, + "step": 34798 + }, + { + "epoch": 0.45219713372847575, + "grad_norm": 0.4201768636703491, + "learning_rate": 0.00010958291633631838, + "loss": 1.2247, + "step": 34799 + }, + { + "epoch": 0.45221012827239165, + "grad_norm": 0.3197118639945984, + "learning_rate": 0.000109580316874407, + "loss": 1.411, + "step": 34800 + }, + { + "epoch": 0.4522231228163075, + "grad_norm": 0.40830856561660767, + "learning_rate": 0.00010957771741249563, + "loss": 1.3058, + "step": 34801 + }, + { + "epoch": 0.4522361173602234, + "grad_norm": 0.46550801396369934, + "learning_rate": 0.00010957511795058424, + "loss": 1.377, + "step": 34802 + }, + { + "epoch": 0.45224911190413924, + "grad_norm": 0.33782848715782166, + "learning_rate": 0.00010957251848867285, + "loss": 1.3992, + "step": 34803 + }, + { + "epoch": 0.45226210644805515, + "grad_norm": 0.44523853063583374, + "learning_rate": 0.00010956991902676145, + "loss": 1.4648, + "step": 34804 + }, + { + "epoch": 0.452275100991971, + "grad_norm": 0.5018695592880249, + "learning_rate": 0.00010956731956485009, + "loss": 1.5237, + "step": 34805 + }, + { + "epoch": 0.4522880955358869, + "grad_norm": 0.388094425201416, + "learning_rate": 0.0001095647201029387, + "loss": 1.4756, + "step": 34806 + }, + { + "epoch": 0.45230109007980274, + "grad_norm": 0.3974766135215759, + "learning_rate": 0.00010956212064102731, + "loss": 1.3491, + "step": 34807 + }, + { + "epoch": 0.45231408462371864, + "grad_norm": 0.4385204017162323, + "learning_rate": 0.00010955952117911592, + "loss": 1.4852, + "step": 34808 + }, + { + "epoch": 0.4523270791676345, + "grad_norm": 0.4855717718601227, + "learning_rate": 0.00010955692171720455, + "loss": 1.4553, + "step": 34809 + }, + { + "epoch": 0.4523400737115504, + "grad_norm": 0.44082731008529663, + "learning_rate": 0.00010955432225529316, + "loss": 1.3377, + "step": 34810 + }, + { + "epoch": 0.45235306825546623, + "grad_norm": 0.5200600028038025, + "learning_rate": 0.00010955172279338177, + "loss": 1.4526, + "step": 34811 + }, + { + "epoch": 0.45236606279938213, + "grad_norm": 0.4211069941520691, + "learning_rate": 0.00010954912333147038, + "loss": 1.4951, + "step": 34812 + }, + { + "epoch": 0.452379057343298, + "grad_norm": 0.3686727285385132, + "learning_rate": 0.00010954652386955902, + "loss": 1.4319, + "step": 34813 + }, + { + "epoch": 0.4523920518872139, + "grad_norm": 0.3405270278453827, + "learning_rate": 0.00010954392440764763, + "loss": 1.5108, + "step": 34814 + }, + { + "epoch": 0.4524050464311297, + "grad_norm": 0.4647314250469208, + "learning_rate": 0.00010954132494573624, + "loss": 1.6067, + "step": 34815 + }, + { + "epoch": 0.4524180409750456, + "grad_norm": 0.4550323486328125, + "learning_rate": 0.00010953872548382485, + "loss": 1.2762, + "step": 34816 + }, + { + "epoch": 0.45243103551896147, + "grad_norm": 0.33415529131889343, + "learning_rate": 0.00010953612602191348, + "loss": 1.4567, + "step": 34817 + }, + { + "epoch": 0.45244403006287737, + "grad_norm": 0.4055238664150238, + "learning_rate": 0.00010953352656000209, + "loss": 1.3988, + "step": 34818 + }, + { + "epoch": 0.4524570246067932, + "grad_norm": 0.37410417199134827, + "learning_rate": 0.0001095309270980907, + "loss": 1.3187, + "step": 34819 + }, + { + "epoch": 0.4524700191507091, + "grad_norm": 0.49783769249916077, + "learning_rate": 0.00010952832763617931, + "loss": 1.5924, + "step": 34820 + }, + { + "epoch": 0.45248301369462496, + "grad_norm": 0.3820335566997528, + "learning_rate": 0.00010952572817426793, + "loss": 1.3797, + "step": 34821 + }, + { + "epoch": 0.45249600823854086, + "grad_norm": 0.42440611124038696, + "learning_rate": 0.00010952312871235654, + "loss": 1.6008, + "step": 34822 + }, + { + "epoch": 0.4525090027824567, + "grad_norm": 0.4065231680870056, + "learning_rate": 0.00010952052925044515, + "loss": 1.4774, + "step": 34823 + }, + { + "epoch": 0.4525219973263726, + "grad_norm": 0.37790054082870483, + "learning_rate": 0.00010951792978853377, + "loss": 1.3494, + "step": 34824 + }, + { + "epoch": 0.45253499187028845, + "grad_norm": 0.3961838483810425, + "learning_rate": 0.0001095153303266224, + "loss": 1.3042, + "step": 34825 + }, + { + "epoch": 0.45254798641420435, + "grad_norm": 0.3773347735404968, + "learning_rate": 0.00010951273086471101, + "loss": 1.2974, + "step": 34826 + }, + { + "epoch": 0.4525609809581202, + "grad_norm": 0.3110443949699402, + "learning_rate": 0.00010951013140279963, + "loss": 1.5116, + "step": 34827 + }, + { + "epoch": 0.4525739755020361, + "grad_norm": 0.46696510910987854, + "learning_rate": 0.00010950753194088824, + "loss": 1.3896, + "step": 34828 + }, + { + "epoch": 0.45258697004595194, + "grad_norm": 0.42911845445632935, + "learning_rate": 0.00010950493247897686, + "loss": 1.2848, + "step": 34829 + }, + { + "epoch": 0.45259996458986784, + "grad_norm": 0.48784470558166504, + "learning_rate": 0.00010950233301706547, + "loss": 1.2603, + "step": 34830 + }, + { + "epoch": 0.4526129591337837, + "grad_norm": 0.33039045333862305, + "learning_rate": 0.00010949973355515408, + "loss": 1.4433, + "step": 34831 + }, + { + "epoch": 0.4526259536776996, + "grad_norm": 0.4040879011154175, + "learning_rate": 0.00010949713409324272, + "loss": 1.2944, + "step": 34832 + }, + { + "epoch": 0.45263894822161543, + "grad_norm": 0.4118824601173401, + "learning_rate": 0.00010949453463133133, + "loss": 1.1739, + "step": 34833 + }, + { + "epoch": 0.45265194276553133, + "grad_norm": 0.47655031085014343, + "learning_rate": 0.00010949193516941993, + "loss": 1.3565, + "step": 34834 + }, + { + "epoch": 0.4526649373094472, + "grad_norm": 0.4509572386741638, + "learning_rate": 0.00010948933570750854, + "loss": 1.4582, + "step": 34835 + }, + { + "epoch": 0.4526779318533631, + "grad_norm": 0.46587246656417847, + "learning_rate": 0.00010948673624559718, + "loss": 1.5548, + "step": 34836 + }, + { + "epoch": 0.4526909263972789, + "grad_norm": 0.443010538816452, + "learning_rate": 0.00010948413678368579, + "loss": 1.3653, + "step": 34837 + }, + { + "epoch": 0.4527039209411948, + "grad_norm": 0.4608035087585449, + "learning_rate": 0.0001094815373217744, + "loss": 1.4124, + "step": 34838 + }, + { + "epoch": 0.45271691548511067, + "grad_norm": 0.4232497215270996, + "learning_rate": 0.00010947893785986301, + "loss": 1.5286, + "step": 34839 + }, + { + "epoch": 0.45272991002902657, + "grad_norm": 0.32030290365219116, + "learning_rate": 0.00010947633839795163, + "loss": 1.1871, + "step": 34840 + }, + { + "epoch": 0.4527429045729424, + "grad_norm": 0.36549127101898193, + "learning_rate": 0.00010947373893604025, + "loss": 1.3377, + "step": 34841 + }, + { + "epoch": 0.4527558991168583, + "grad_norm": 0.3893648684024811, + "learning_rate": 0.00010947113947412886, + "loss": 1.3878, + "step": 34842 + }, + { + "epoch": 0.45276889366077416, + "grad_norm": 0.40730881690979004, + "learning_rate": 0.00010946854001221747, + "loss": 1.5255, + "step": 34843 + }, + { + "epoch": 0.45278188820469006, + "grad_norm": 0.4550569951534271, + "learning_rate": 0.0001094659405503061, + "loss": 1.5847, + "step": 34844 + }, + { + "epoch": 0.4527948827486059, + "grad_norm": 0.39973410964012146, + "learning_rate": 0.00010946334108839472, + "loss": 1.4134, + "step": 34845 + }, + { + "epoch": 0.4528078772925218, + "grad_norm": 0.43830522894859314, + "learning_rate": 0.00010946074162648331, + "loss": 1.4112, + "step": 34846 + }, + { + "epoch": 0.45282087183643766, + "grad_norm": 0.399680495262146, + "learning_rate": 0.00010945814216457193, + "loss": 1.5347, + "step": 34847 + }, + { + "epoch": 0.45283386638035356, + "grad_norm": 0.32835090160369873, + "learning_rate": 0.00010945554270266056, + "loss": 1.4869, + "step": 34848 + }, + { + "epoch": 0.4528468609242694, + "grad_norm": 0.29390570521354675, + "learning_rate": 0.00010945294324074917, + "loss": 1.3645, + "step": 34849 + }, + { + "epoch": 0.4528598554681853, + "grad_norm": 0.40344110131263733, + "learning_rate": 0.00010945034377883778, + "loss": 1.293, + "step": 34850 + }, + { + "epoch": 0.45287285001210115, + "grad_norm": 0.2858838737010956, + "learning_rate": 0.0001094477443169264, + "loss": 1.1227, + "step": 34851 + }, + { + "epoch": 0.45288584455601705, + "grad_norm": 0.39529165625572205, + "learning_rate": 0.00010944514485501502, + "loss": 1.4179, + "step": 34852 + }, + { + "epoch": 0.4528988390999329, + "grad_norm": 0.40686482191085815, + "learning_rate": 0.00010944254539310363, + "loss": 1.3506, + "step": 34853 + }, + { + "epoch": 0.4529118336438488, + "grad_norm": 0.34736207127571106, + "learning_rate": 0.00010943994593119224, + "loss": 1.3002, + "step": 34854 + }, + { + "epoch": 0.45292482818776464, + "grad_norm": 0.36969509720802307, + "learning_rate": 0.00010943734646928085, + "loss": 1.155, + "step": 34855 + }, + { + "epoch": 0.45293782273168054, + "grad_norm": 0.3658212125301361, + "learning_rate": 0.00010943474700736949, + "loss": 1.3451, + "step": 34856 + }, + { + "epoch": 0.4529508172755964, + "grad_norm": 0.3960089087486267, + "learning_rate": 0.0001094321475454581, + "loss": 1.2906, + "step": 34857 + }, + { + "epoch": 0.4529638118195123, + "grad_norm": 0.3687858581542969, + "learning_rate": 0.00010942954808354671, + "loss": 1.2775, + "step": 34858 + }, + { + "epoch": 0.45297680636342813, + "grad_norm": 0.39189523458480835, + "learning_rate": 0.00010942694862163531, + "loss": 1.3935, + "step": 34859 + }, + { + "epoch": 0.45298980090734403, + "grad_norm": 0.3685706853866577, + "learning_rate": 0.00010942434915972395, + "loss": 1.4652, + "step": 34860 + }, + { + "epoch": 0.4530027954512599, + "grad_norm": 0.48676052689552307, + "learning_rate": 0.00010942174969781256, + "loss": 1.4153, + "step": 34861 + }, + { + "epoch": 0.4530157899951758, + "grad_norm": 0.4634558856487274, + "learning_rate": 0.00010941915023590117, + "loss": 1.3157, + "step": 34862 + }, + { + "epoch": 0.4530287845390916, + "grad_norm": 0.4022471010684967, + "learning_rate": 0.00010941655077398978, + "loss": 1.4496, + "step": 34863 + }, + { + "epoch": 0.4530417790830075, + "grad_norm": 0.3934744894504547, + "learning_rate": 0.0001094139513120784, + "loss": 1.3123, + "step": 34864 + }, + { + "epoch": 0.45305477362692337, + "grad_norm": 0.4103308320045471, + "learning_rate": 0.00010941135185016702, + "loss": 1.468, + "step": 34865 + }, + { + "epoch": 0.45306776817083927, + "grad_norm": 0.5170601606369019, + "learning_rate": 0.00010940875238825563, + "loss": 1.3108, + "step": 34866 + }, + { + "epoch": 0.45308076271475517, + "grad_norm": 0.37900015711784363, + "learning_rate": 0.00010940615292634427, + "loss": 1.2779, + "step": 34867 + }, + { + "epoch": 0.453093757258671, + "grad_norm": 0.37011009454727173, + "learning_rate": 0.00010940355346443288, + "loss": 1.4667, + "step": 34868 + }, + { + "epoch": 0.4531067518025869, + "grad_norm": 0.32416194677352905, + "learning_rate": 0.00010940095400252149, + "loss": 1.2279, + "step": 34869 + }, + { + "epoch": 0.45311974634650276, + "grad_norm": 0.3607569932937622, + "learning_rate": 0.0001093983545406101, + "loss": 1.3245, + "step": 34870 + }, + { + "epoch": 0.45313274089041866, + "grad_norm": 0.40057358145713806, + "learning_rate": 0.00010939575507869872, + "loss": 1.3865, + "step": 34871 + }, + { + "epoch": 0.4531457354343345, + "grad_norm": 0.430584192276001, + "learning_rate": 0.00010939315561678733, + "loss": 1.4241, + "step": 34872 + }, + { + "epoch": 0.4531587299782504, + "grad_norm": 0.4813686013221741, + "learning_rate": 0.00010939055615487594, + "loss": 1.4919, + "step": 34873 + }, + { + "epoch": 0.45317172452216625, + "grad_norm": 0.37951257824897766, + "learning_rate": 0.00010938795669296456, + "loss": 1.3455, + "step": 34874 + }, + { + "epoch": 0.45318471906608215, + "grad_norm": 0.36754757165908813, + "learning_rate": 0.0001093853572310532, + "loss": 1.3525, + "step": 34875 + }, + { + "epoch": 0.453197713609998, + "grad_norm": 0.36494866013526917, + "learning_rate": 0.00010938275776914179, + "loss": 1.4013, + "step": 34876 + }, + { + "epoch": 0.4532107081539139, + "grad_norm": 0.3692297339439392, + "learning_rate": 0.0001093801583072304, + "loss": 1.4688, + "step": 34877 + }, + { + "epoch": 0.45322370269782974, + "grad_norm": 0.3461282551288605, + "learning_rate": 0.00010937755884531901, + "loss": 1.512, + "step": 34878 + }, + { + "epoch": 0.45323669724174565, + "grad_norm": 0.39226582646369934, + "learning_rate": 0.00010937495938340765, + "loss": 1.3067, + "step": 34879 + }, + { + "epoch": 0.4532496917856615, + "grad_norm": 0.35757702589035034, + "learning_rate": 0.00010937235992149626, + "loss": 1.3291, + "step": 34880 + }, + { + "epoch": 0.4532626863295774, + "grad_norm": 0.44034165143966675, + "learning_rate": 0.00010936976045958487, + "loss": 1.3574, + "step": 34881 + }, + { + "epoch": 0.45327568087349324, + "grad_norm": 0.33873802423477173, + "learning_rate": 0.00010936716099767348, + "loss": 1.3109, + "step": 34882 + }, + { + "epoch": 0.45328867541740914, + "grad_norm": 0.3512853682041168, + "learning_rate": 0.00010936456153576211, + "loss": 1.2203, + "step": 34883 + }, + { + "epoch": 0.453301669961325, + "grad_norm": 0.2922482490539551, + "learning_rate": 0.00010936196207385072, + "loss": 1.1629, + "step": 34884 + }, + { + "epoch": 0.4533146645052409, + "grad_norm": 0.4449670910835266, + "learning_rate": 0.00010935936261193933, + "loss": 1.4845, + "step": 34885 + }, + { + "epoch": 0.45332765904915673, + "grad_norm": 0.46181347966194153, + "learning_rate": 0.00010935676315002794, + "loss": 1.3276, + "step": 34886 + }, + { + "epoch": 0.45334065359307263, + "grad_norm": 0.30708736181259155, + "learning_rate": 0.00010935416368811658, + "loss": 1.5237, + "step": 34887 + }, + { + "epoch": 0.4533536481369885, + "grad_norm": 0.38255080580711365, + "learning_rate": 0.00010935156422620518, + "loss": 1.3888, + "step": 34888 + }, + { + "epoch": 0.4533666426809044, + "grad_norm": 0.4061680734157562, + "learning_rate": 0.00010934896476429379, + "loss": 1.4784, + "step": 34889 + }, + { + "epoch": 0.4533796372248202, + "grad_norm": 0.3476581871509552, + "learning_rate": 0.0001093463653023824, + "loss": 1.2786, + "step": 34890 + }, + { + "epoch": 0.4533926317687361, + "grad_norm": 0.4036935865879059, + "learning_rate": 0.00010934376584047104, + "loss": 1.4064, + "step": 34891 + }, + { + "epoch": 0.45340562631265197, + "grad_norm": 0.48464447259902954, + "learning_rate": 0.00010934116637855965, + "loss": 1.38, + "step": 34892 + }, + { + "epoch": 0.45341862085656787, + "grad_norm": 0.5203121304512024, + "learning_rate": 0.00010933856691664826, + "loss": 1.4632, + "step": 34893 + }, + { + "epoch": 0.4534316154004837, + "grad_norm": 0.3555506765842438, + "learning_rate": 0.00010933596745473687, + "loss": 1.5015, + "step": 34894 + }, + { + "epoch": 0.4534446099443996, + "grad_norm": 0.39428168535232544, + "learning_rate": 0.0001093333679928255, + "loss": 1.2106, + "step": 34895 + }, + { + "epoch": 0.45345760448831546, + "grad_norm": 0.498831570148468, + "learning_rate": 0.0001093307685309141, + "loss": 1.5395, + "step": 34896 + }, + { + "epoch": 0.45347059903223136, + "grad_norm": 0.4345197379589081, + "learning_rate": 0.00010932816906900272, + "loss": 1.3917, + "step": 34897 + }, + { + "epoch": 0.4534835935761472, + "grad_norm": 0.37654387950897217, + "learning_rate": 0.00010932556960709133, + "loss": 1.3873, + "step": 34898 + }, + { + "epoch": 0.4534965881200631, + "grad_norm": 0.3658028244972229, + "learning_rate": 0.00010932297014517996, + "loss": 1.2453, + "step": 34899 + }, + { + "epoch": 0.45350958266397895, + "grad_norm": 0.3189201354980469, + "learning_rate": 0.00010932037068326858, + "loss": 1.1739, + "step": 34900 + }, + { + "epoch": 0.45352257720789485, + "grad_norm": 0.4434395730495453, + "learning_rate": 0.00010931777122135717, + "loss": 1.4704, + "step": 34901 + }, + { + "epoch": 0.4535355717518107, + "grad_norm": 0.41789525747299194, + "learning_rate": 0.00010931517175944578, + "loss": 1.3896, + "step": 34902 + }, + { + "epoch": 0.4535485662957266, + "grad_norm": 0.35161980986595154, + "learning_rate": 0.00010931257229753442, + "loss": 1.4959, + "step": 34903 + }, + { + "epoch": 0.45356156083964244, + "grad_norm": 0.5179147720336914, + "learning_rate": 0.00010930997283562303, + "loss": 1.6515, + "step": 34904 + }, + { + "epoch": 0.45357455538355834, + "grad_norm": 0.4979860782623291, + "learning_rate": 0.00010930737337371164, + "loss": 1.3833, + "step": 34905 + }, + { + "epoch": 0.4535875499274742, + "grad_norm": 0.44387370347976685, + "learning_rate": 0.00010930477391180027, + "loss": 1.3929, + "step": 34906 + }, + { + "epoch": 0.4536005444713901, + "grad_norm": 0.4943463206291199, + "learning_rate": 0.00010930217444988888, + "loss": 1.4795, + "step": 34907 + }, + { + "epoch": 0.45361353901530593, + "grad_norm": 0.48257526755332947, + "learning_rate": 0.00010929957498797749, + "loss": 1.4454, + "step": 34908 + }, + { + "epoch": 0.45362653355922183, + "grad_norm": 0.4452836811542511, + "learning_rate": 0.0001092969755260661, + "loss": 1.351, + "step": 34909 + }, + { + "epoch": 0.4536395281031377, + "grad_norm": 0.4060641825199127, + "learning_rate": 0.00010929437606415474, + "loss": 1.4029, + "step": 34910 + }, + { + "epoch": 0.4536525226470536, + "grad_norm": 0.4017835259437561, + "learning_rate": 0.00010929177660224335, + "loss": 1.3236, + "step": 34911 + }, + { + "epoch": 0.4536655171909694, + "grad_norm": 0.3780839145183563, + "learning_rate": 0.00010928917714033196, + "loss": 1.1553, + "step": 34912 + }, + { + "epoch": 0.4536785117348853, + "grad_norm": 0.46542033553123474, + "learning_rate": 0.00010928657767842057, + "loss": 1.5239, + "step": 34913 + }, + { + "epoch": 0.45369150627880117, + "grad_norm": 0.3736023008823395, + "learning_rate": 0.0001092839782165092, + "loss": 1.3707, + "step": 34914 + }, + { + "epoch": 0.45370450082271707, + "grad_norm": 0.41032758355140686, + "learning_rate": 0.00010928137875459781, + "loss": 1.4228, + "step": 34915 + }, + { + "epoch": 0.4537174953666329, + "grad_norm": 0.47862017154693604, + "learning_rate": 0.00010927877929268642, + "loss": 1.4247, + "step": 34916 + }, + { + "epoch": 0.4537304899105488, + "grad_norm": 0.41875341534614563, + "learning_rate": 0.00010927617983077503, + "loss": 1.3976, + "step": 34917 + }, + { + "epoch": 0.45374348445446466, + "grad_norm": 0.43008360266685486, + "learning_rate": 0.00010927358036886365, + "loss": 1.4219, + "step": 34918 + }, + { + "epoch": 0.45375647899838056, + "grad_norm": 0.39129018783569336, + "learning_rate": 0.00010927098090695226, + "loss": 1.4176, + "step": 34919 + }, + { + "epoch": 0.4537694735422964, + "grad_norm": 0.3203194737434387, + "learning_rate": 0.00010926838144504088, + "loss": 1.2215, + "step": 34920 + }, + { + "epoch": 0.4537824680862123, + "grad_norm": 0.26123708486557007, + "learning_rate": 0.00010926578198312949, + "loss": 1.3103, + "step": 34921 + }, + { + "epoch": 0.45379546263012815, + "grad_norm": 0.29463663697242737, + "learning_rate": 0.00010926318252121812, + "loss": 1.454, + "step": 34922 + }, + { + "epoch": 0.45380845717404406, + "grad_norm": 0.43968164920806885, + "learning_rate": 0.00010926058305930674, + "loss": 1.3738, + "step": 34923 + }, + { + "epoch": 0.4538214517179599, + "grad_norm": 0.3553709387779236, + "learning_rate": 0.00010925798359739535, + "loss": 1.2153, + "step": 34924 + }, + { + "epoch": 0.4538344462618758, + "grad_norm": 0.39629679918289185, + "learning_rate": 0.00010925538413548396, + "loss": 1.3975, + "step": 34925 + }, + { + "epoch": 0.45384744080579165, + "grad_norm": 0.48441317677497864, + "learning_rate": 0.00010925278467357258, + "loss": 1.5037, + "step": 34926 + }, + { + "epoch": 0.45386043534970755, + "grad_norm": 0.30706319212913513, + "learning_rate": 0.00010925018521166119, + "loss": 1.3416, + "step": 34927 + }, + { + "epoch": 0.4538734298936234, + "grad_norm": 0.384332537651062, + "learning_rate": 0.0001092475857497498, + "loss": 1.3945, + "step": 34928 + }, + { + "epoch": 0.4538864244375393, + "grad_norm": 0.3666355013847351, + "learning_rate": 0.00010924498628783841, + "loss": 1.376, + "step": 34929 + }, + { + "epoch": 0.45389941898145514, + "grad_norm": 0.40043532848358154, + "learning_rate": 0.00010924238682592704, + "loss": 1.3757, + "step": 34930 + }, + { + "epoch": 0.45391241352537104, + "grad_norm": 0.4931693375110626, + "learning_rate": 0.00010923978736401565, + "loss": 1.4474, + "step": 34931 + }, + { + "epoch": 0.4539254080692869, + "grad_norm": 0.3278084397315979, + "learning_rate": 0.00010923718790210426, + "loss": 1.6461, + "step": 34932 + }, + { + "epoch": 0.4539384026132028, + "grad_norm": 0.3510821461677551, + "learning_rate": 0.00010923458844019287, + "loss": 1.4728, + "step": 34933 + }, + { + "epoch": 0.45395139715711863, + "grad_norm": 0.4659602642059326, + "learning_rate": 0.00010923198897828151, + "loss": 1.3234, + "step": 34934 + }, + { + "epoch": 0.45396439170103453, + "grad_norm": 0.35574060678482056, + "learning_rate": 0.00010922938951637012, + "loss": 1.1814, + "step": 34935 + }, + { + "epoch": 0.4539773862449504, + "grad_norm": 0.3933261036872864, + "learning_rate": 0.00010922679005445873, + "loss": 1.1879, + "step": 34936 + }, + { + "epoch": 0.4539903807888663, + "grad_norm": 0.3575122058391571, + "learning_rate": 0.00010922419059254734, + "loss": 1.4312, + "step": 34937 + }, + { + "epoch": 0.4540033753327821, + "grad_norm": 0.487956166267395, + "learning_rate": 0.00010922159113063597, + "loss": 1.4441, + "step": 34938 + }, + { + "epoch": 0.454016369876698, + "grad_norm": 0.42550304532051086, + "learning_rate": 0.00010921899166872458, + "loss": 1.3532, + "step": 34939 + }, + { + "epoch": 0.45402936442061387, + "grad_norm": 0.3599705398082733, + "learning_rate": 0.00010921639220681319, + "loss": 1.5448, + "step": 34940 + }, + { + "epoch": 0.45404235896452977, + "grad_norm": 0.4104795455932617, + "learning_rate": 0.0001092137927449018, + "loss": 1.215, + "step": 34941 + }, + { + "epoch": 0.4540553535084456, + "grad_norm": 0.3183121383190155, + "learning_rate": 0.00010921119328299044, + "loss": 1.5554, + "step": 34942 + }, + { + "epoch": 0.4540683480523615, + "grad_norm": 0.4363402724266052, + "learning_rate": 0.00010920859382107904, + "loss": 1.3455, + "step": 34943 + }, + { + "epoch": 0.4540813425962774, + "grad_norm": 0.3751155734062195, + "learning_rate": 0.00010920599435916765, + "loss": 1.4028, + "step": 34944 + }, + { + "epoch": 0.45409433714019326, + "grad_norm": 0.4963170289993286, + "learning_rate": 0.00010920339489725628, + "loss": 1.3854, + "step": 34945 + }, + { + "epoch": 0.45410733168410916, + "grad_norm": 0.4422619938850403, + "learning_rate": 0.0001092007954353449, + "loss": 1.2962, + "step": 34946 + }, + { + "epoch": 0.454120326228025, + "grad_norm": 0.39218616485595703, + "learning_rate": 0.0001091981959734335, + "loss": 1.4155, + "step": 34947 + }, + { + "epoch": 0.4541333207719409, + "grad_norm": 0.3759183883666992, + "learning_rate": 0.00010919559651152212, + "loss": 1.3537, + "step": 34948 + }, + { + "epoch": 0.45414631531585675, + "grad_norm": 0.3689135015010834, + "learning_rate": 0.00010919299704961074, + "loss": 1.3818, + "step": 34949 + }, + { + "epoch": 0.45415930985977265, + "grad_norm": 0.40971893072128296, + "learning_rate": 0.00010919039758769935, + "loss": 1.4074, + "step": 34950 + }, + { + "epoch": 0.4541723044036885, + "grad_norm": 0.33112218976020813, + "learning_rate": 0.00010918779812578796, + "loss": 1.204, + "step": 34951 + }, + { + "epoch": 0.4541852989476044, + "grad_norm": 0.4678493142127991, + "learning_rate": 0.00010918519866387657, + "loss": 1.3324, + "step": 34952 + }, + { + "epoch": 0.45419829349152024, + "grad_norm": 0.4082872271537781, + "learning_rate": 0.00010918259920196521, + "loss": 1.4707, + "step": 34953 + }, + { + "epoch": 0.45421128803543614, + "grad_norm": 0.39554452896118164, + "learning_rate": 0.00010917999974005382, + "loss": 1.3348, + "step": 34954 + }, + { + "epoch": 0.454224282579352, + "grad_norm": 0.4113595187664032, + "learning_rate": 0.00010917740027814243, + "loss": 1.423, + "step": 34955 + }, + { + "epoch": 0.4542372771232679, + "grad_norm": 0.34707385301589966, + "learning_rate": 0.00010917480081623103, + "loss": 1.3233, + "step": 34956 + }, + { + "epoch": 0.45425027166718374, + "grad_norm": 0.3725568652153015, + "learning_rate": 0.00010917220135431967, + "loss": 1.4826, + "step": 34957 + }, + { + "epoch": 0.45426326621109964, + "grad_norm": 0.39915165305137634, + "learning_rate": 0.00010916960189240828, + "loss": 1.3383, + "step": 34958 + }, + { + "epoch": 0.4542762607550155, + "grad_norm": 0.3920382559299469, + "learning_rate": 0.00010916700243049689, + "loss": 1.4451, + "step": 34959 + }, + { + "epoch": 0.4542892552989314, + "grad_norm": 0.4853487014770508, + "learning_rate": 0.0001091644029685855, + "loss": 1.3799, + "step": 34960 + }, + { + "epoch": 0.45430224984284723, + "grad_norm": 0.46993762254714966, + "learning_rate": 0.00010916180350667413, + "loss": 1.4378, + "step": 34961 + }, + { + "epoch": 0.45431524438676313, + "grad_norm": 0.33537721633911133, + "learning_rate": 0.00010915920404476274, + "loss": 1.321, + "step": 34962 + }, + { + "epoch": 0.454328238930679, + "grad_norm": 0.466763973236084, + "learning_rate": 0.00010915660458285135, + "loss": 1.4883, + "step": 34963 + }, + { + "epoch": 0.4543412334745949, + "grad_norm": 0.47694411873817444, + "learning_rate": 0.00010915400512093996, + "loss": 1.2671, + "step": 34964 + }, + { + "epoch": 0.4543542280185107, + "grad_norm": 0.3795941174030304, + "learning_rate": 0.0001091514056590286, + "loss": 1.421, + "step": 34965 + }, + { + "epoch": 0.4543672225624266, + "grad_norm": 0.4208022356033325, + "learning_rate": 0.00010914880619711721, + "loss": 1.4141, + "step": 34966 + }, + { + "epoch": 0.45438021710634247, + "grad_norm": 0.3960261940956116, + "learning_rate": 0.00010914620673520582, + "loss": 1.3542, + "step": 34967 + }, + { + "epoch": 0.45439321165025837, + "grad_norm": 0.4825434982776642, + "learning_rate": 0.00010914360727329442, + "loss": 1.5779, + "step": 34968 + }, + { + "epoch": 0.4544062061941742, + "grad_norm": 0.41211599111557007, + "learning_rate": 0.00010914100781138305, + "loss": 1.283, + "step": 34969 + }, + { + "epoch": 0.4544192007380901, + "grad_norm": 0.4351232349872589, + "learning_rate": 0.00010913840834947167, + "loss": 1.3697, + "step": 34970 + }, + { + "epoch": 0.45443219528200596, + "grad_norm": 0.40193799138069153, + "learning_rate": 0.00010913580888756028, + "loss": 1.1993, + "step": 34971 + }, + { + "epoch": 0.45444518982592186, + "grad_norm": 0.455183744430542, + "learning_rate": 0.00010913320942564889, + "loss": 1.3317, + "step": 34972 + }, + { + "epoch": 0.4544581843698377, + "grad_norm": 0.39767202734947205, + "learning_rate": 0.00010913060996373751, + "loss": 1.3024, + "step": 34973 + }, + { + "epoch": 0.4544711789137536, + "grad_norm": 0.3617262840270996, + "learning_rate": 0.00010912801050182612, + "loss": 1.3822, + "step": 34974 + }, + { + "epoch": 0.45448417345766945, + "grad_norm": 0.42992904782295227, + "learning_rate": 0.00010912541103991473, + "loss": 1.4194, + "step": 34975 + }, + { + "epoch": 0.45449716800158535, + "grad_norm": 0.448257714509964, + "learning_rate": 0.00010912281157800335, + "loss": 1.5057, + "step": 34976 + }, + { + "epoch": 0.4545101625455012, + "grad_norm": 0.45390164852142334, + "learning_rate": 0.00010912021211609198, + "loss": 1.6032, + "step": 34977 + }, + { + "epoch": 0.4545231570894171, + "grad_norm": 0.4097226858139038, + "learning_rate": 0.0001091176126541806, + "loss": 1.5089, + "step": 34978 + }, + { + "epoch": 0.45453615163333294, + "grad_norm": 0.44250762462615967, + "learning_rate": 0.0001091150131922692, + "loss": 1.3101, + "step": 34979 + }, + { + "epoch": 0.45454914617724884, + "grad_norm": 0.2933673560619354, + "learning_rate": 0.00010911241373035783, + "loss": 1.1957, + "step": 34980 + }, + { + "epoch": 0.4545621407211647, + "grad_norm": 0.4554346799850464, + "learning_rate": 0.00010910981426844644, + "loss": 1.4014, + "step": 34981 + }, + { + "epoch": 0.4545751352650806, + "grad_norm": 0.25293442606925964, + "learning_rate": 0.00010910721480653505, + "loss": 1.2888, + "step": 34982 + }, + { + "epoch": 0.45458812980899643, + "grad_norm": 0.3799055516719818, + "learning_rate": 0.00010910461534462366, + "loss": 1.4616, + "step": 34983 + }, + { + "epoch": 0.45460112435291233, + "grad_norm": 0.3837970495223999, + "learning_rate": 0.0001091020158827123, + "loss": 1.3854, + "step": 34984 + }, + { + "epoch": 0.4546141188968282, + "grad_norm": 0.4756086468696594, + "learning_rate": 0.0001090994164208009, + "loss": 1.3803, + "step": 34985 + }, + { + "epoch": 0.4546271134407441, + "grad_norm": 0.3744949698448181, + "learning_rate": 0.00010909681695888951, + "loss": 1.5854, + "step": 34986 + }, + { + "epoch": 0.4546401079846599, + "grad_norm": 0.32548439502716064, + "learning_rate": 0.00010909421749697812, + "loss": 1.2678, + "step": 34987 + }, + { + "epoch": 0.4546531025285758, + "grad_norm": 0.3792979121208191, + "learning_rate": 0.00010909161803506676, + "loss": 1.3154, + "step": 34988 + }, + { + "epoch": 0.45466609707249167, + "grad_norm": 0.43225038051605225, + "learning_rate": 0.00010908901857315537, + "loss": 1.3494, + "step": 34989 + }, + { + "epoch": 0.45467909161640757, + "grad_norm": 0.4257349371910095, + "learning_rate": 0.00010908641911124398, + "loss": 1.4762, + "step": 34990 + }, + { + "epoch": 0.4546920861603234, + "grad_norm": 0.30412521958351135, + "learning_rate": 0.00010908381964933259, + "loss": 1.5024, + "step": 34991 + }, + { + "epoch": 0.4547050807042393, + "grad_norm": 0.4065783619880676, + "learning_rate": 0.00010908122018742121, + "loss": 1.2806, + "step": 34992 + }, + { + "epoch": 0.45471807524815516, + "grad_norm": 0.3466070294380188, + "learning_rate": 0.00010907862072550983, + "loss": 1.3104, + "step": 34993 + }, + { + "epoch": 0.45473106979207106, + "grad_norm": 0.3153553903102875, + "learning_rate": 0.00010907602126359844, + "loss": 1.3188, + "step": 34994 + }, + { + "epoch": 0.4547440643359869, + "grad_norm": 0.39845412969589233, + "learning_rate": 0.00010907342180168705, + "loss": 1.4495, + "step": 34995 + }, + { + "epoch": 0.4547570588799028, + "grad_norm": 0.3720419108867645, + "learning_rate": 0.00010907082233977569, + "loss": 1.3898, + "step": 34996 + }, + { + "epoch": 0.45477005342381865, + "grad_norm": 0.30603674054145813, + "learning_rate": 0.0001090682228778643, + "loss": 1.2955, + "step": 34997 + }, + { + "epoch": 0.45478304796773455, + "grad_norm": 0.4608227014541626, + "learning_rate": 0.0001090656234159529, + "loss": 1.37, + "step": 34998 + }, + { + "epoch": 0.4547960425116504, + "grad_norm": 0.4668556749820709, + "learning_rate": 0.0001090630239540415, + "loss": 1.4287, + "step": 34999 + }, + { + "epoch": 0.4548090370555663, + "grad_norm": 0.28385376930236816, + "learning_rate": 0.00010906042449213014, + "loss": 1.2802, + "step": 35000 + }, + { + "epoch": 0.45482203159948215, + "grad_norm": 0.459938108921051, + "learning_rate": 0.00010905782503021875, + "loss": 1.4945, + "step": 35001 + }, + { + "epoch": 0.45483502614339805, + "grad_norm": 0.4360826909542084, + "learning_rate": 0.00010905522556830736, + "loss": 1.4677, + "step": 35002 + }, + { + "epoch": 0.4548480206873139, + "grad_norm": 0.4446322023868561, + "learning_rate": 0.00010905262610639598, + "loss": 1.4812, + "step": 35003 + }, + { + "epoch": 0.4548610152312298, + "grad_norm": 0.3647370934486389, + "learning_rate": 0.0001090500266444846, + "loss": 1.3933, + "step": 35004 + }, + { + "epoch": 0.45487400977514564, + "grad_norm": 0.33514007925987244, + "learning_rate": 0.00010904742718257321, + "loss": 1.3609, + "step": 35005 + }, + { + "epoch": 0.45488700431906154, + "grad_norm": 0.3846753239631653, + "learning_rate": 0.00010904482772066182, + "loss": 1.327, + "step": 35006 + }, + { + "epoch": 0.4548999988629774, + "grad_norm": 0.38477998971939087, + "learning_rate": 0.00010904222825875043, + "loss": 1.4785, + "step": 35007 + }, + { + "epoch": 0.4549129934068933, + "grad_norm": 0.43844062089920044, + "learning_rate": 0.00010903962879683907, + "loss": 1.4052, + "step": 35008 + }, + { + "epoch": 0.45492598795080913, + "grad_norm": 0.3931030035018921, + "learning_rate": 0.00010903702933492768, + "loss": 1.4713, + "step": 35009 + }, + { + "epoch": 0.45493898249472503, + "grad_norm": 0.4574306011199951, + "learning_rate": 0.00010903442987301628, + "loss": 1.5572, + "step": 35010 + }, + { + "epoch": 0.4549519770386409, + "grad_norm": 0.5211104154586792, + "learning_rate": 0.00010903183041110489, + "loss": 1.2276, + "step": 35011 + }, + { + "epoch": 0.4549649715825568, + "grad_norm": 0.3815925717353821, + "learning_rate": 0.00010902923094919353, + "loss": 1.4566, + "step": 35012 + }, + { + "epoch": 0.4549779661264726, + "grad_norm": 0.47390589118003845, + "learning_rate": 0.00010902663148728214, + "loss": 1.3785, + "step": 35013 + }, + { + "epoch": 0.4549909606703885, + "grad_norm": 0.36726316809654236, + "learning_rate": 0.00010902403202537075, + "loss": 1.62, + "step": 35014 + }, + { + "epoch": 0.45500395521430437, + "grad_norm": 0.3422521650791168, + "learning_rate": 0.00010902143256345936, + "loss": 1.2442, + "step": 35015 + }, + { + "epoch": 0.45501694975822027, + "grad_norm": 0.4108089208602905, + "learning_rate": 0.00010901883310154799, + "loss": 1.3889, + "step": 35016 + }, + { + "epoch": 0.4550299443021361, + "grad_norm": 0.43522727489471436, + "learning_rate": 0.0001090162336396366, + "loss": 1.4911, + "step": 35017 + }, + { + "epoch": 0.455042938846052, + "grad_norm": 0.4079974591732025, + "learning_rate": 0.00010901363417772521, + "loss": 1.3812, + "step": 35018 + }, + { + "epoch": 0.4550559333899679, + "grad_norm": 0.39696234464645386, + "learning_rate": 0.00010901103471581385, + "loss": 1.5166, + "step": 35019 + }, + { + "epoch": 0.45506892793388376, + "grad_norm": 0.4703359603881836, + "learning_rate": 0.00010900843525390246, + "loss": 1.5466, + "step": 35020 + }, + { + "epoch": 0.45508192247779966, + "grad_norm": 0.47171080112457275, + "learning_rate": 0.00010900583579199107, + "loss": 1.4436, + "step": 35021 + }, + { + "epoch": 0.4550949170217155, + "grad_norm": 0.3912294805049896, + "learning_rate": 0.00010900323633007968, + "loss": 1.4944, + "step": 35022 + }, + { + "epoch": 0.4551079115656314, + "grad_norm": 0.36724093556404114, + "learning_rate": 0.0001090006368681683, + "loss": 1.3478, + "step": 35023 + }, + { + "epoch": 0.45512090610954725, + "grad_norm": 0.3526167571544647, + "learning_rate": 0.00010899803740625691, + "loss": 1.1736, + "step": 35024 + }, + { + "epoch": 0.45513390065346315, + "grad_norm": 0.38478177785873413, + "learning_rate": 0.00010899543794434552, + "loss": 1.3599, + "step": 35025 + }, + { + "epoch": 0.455146895197379, + "grad_norm": 0.46311745047569275, + "learning_rate": 0.00010899283848243414, + "loss": 1.4867, + "step": 35026 + }, + { + "epoch": 0.4551598897412949, + "grad_norm": 0.4363219738006592, + "learning_rate": 0.00010899023902052276, + "loss": 1.2768, + "step": 35027 + }, + { + "epoch": 0.45517288428521074, + "grad_norm": 0.3876301944255829, + "learning_rate": 0.00010898763955861137, + "loss": 1.4893, + "step": 35028 + }, + { + "epoch": 0.45518587882912664, + "grad_norm": 0.3941620886325836, + "learning_rate": 0.00010898504009669998, + "loss": 1.3935, + "step": 35029 + }, + { + "epoch": 0.4551988733730425, + "grad_norm": 0.23850257694721222, + "learning_rate": 0.00010898244063478859, + "loss": 1.1561, + "step": 35030 + }, + { + "epoch": 0.4552118679169584, + "grad_norm": 0.44576647877693176, + "learning_rate": 0.00010897984117287723, + "loss": 1.4445, + "step": 35031 + }, + { + "epoch": 0.45522486246087424, + "grad_norm": 0.37551572918891907, + "learning_rate": 0.00010897724171096584, + "loss": 1.3057, + "step": 35032 + }, + { + "epoch": 0.45523785700479014, + "grad_norm": 0.4259045720100403, + "learning_rate": 0.00010897464224905445, + "loss": 1.48, + "step": 35033 + }, + { + "epoch": 0.455250851548706, + "grad_norm": 0.35197895765304565, + "learning_rate": 0.00010897204278714306, + "loss": 1.344, + "step": 35034 + }, + { + "epoch": 0.4552638460926219, + "grad_norm": 0.29217153787612915, + "learning_rate": 0.00010896944332523169, + "loss": 1.2978, + "step": 35035 + }, + { + "epoch": 0.4552768406365377, + "grad_norm": 0.4392157793045044, + "learning_rate": 0.0001089668438633203, + "loss": 1.4654, + "step": 35036 + }, + { + "epoch": 0.45528983518045363, + "grad_norm": 0.3665829002857208, + "learning_rate": 0.00010896424440140891, + "loss": 1.2654, + "step": 35037 + }, + { + "epoch": 0.4553028297243695, + "grad_norm": 0.41133347153663635, + "learning_rate": 0.00010896164493949752, + "loss": 1.3439, + "step": 35038 + }, + { + "epoch": 0.4553158242682854, + "grad_norm": 0.41872677206993103, + "learning_rate": 0.00010895904547758616, + "loss": 1.5281, + "step": 35039 + }, + { + "epoch": 0.4553288188122012, + "grad_norm": 0.439369797706604, + "learning_rate": 0.00010895644601567476, + "loss": 1.3763, + "step": 35040 + }, + { + "epoch": 0.4553418133561171, + "grad_norm": 0.40229979157447815, + "learning_rate": 0.00010895384655376337, + "loss": 1.5444, + "step": 35041 + }, + { + "epoch": 0.45535480790003297, + "grad_norm": 0.4021202325820923, + "learning_rate": 0.00010895124709185198, + "loss": 1.4073, + "step": 35042 + }, + { + "epoch": 0.45536780244394887, + "grad_norm": 0.3818398714065552, + "learning_rate": 0.00010894864762994062, + "loss": 1.4037, + "step": 35043 + }, + { + "epoch": 0.4553807969878647, + "grad_norm": 0.39342767000198364, + "learning_rate": 0.00010894604816802923, + "loss": 1.4022, + "step": 35044 + }, + { + "epoch": 0.4553937915317806, + "grad_norm": 0.36151599884033203, + "learning_rate": 0.00010894344870611784, + "loss": 1.2688, + "step": 35045 + }, + { + "epoch": 0.45540678607569646, + "grad_norm": 0.36676326394081116, + "learning_rate": 0.00010894084924420645, + "loss": 1.2197, + "step": 35046 + }, + { + "epoch": 0.45541978061961236, + "grad_norm": 0.3971310257911682, + "learning_rate": 0.00010893824978229507, + "loss": 1.379, + "step": 35047 + }, + { + "epoch": 0.4554327751635282, + "grad_norm": 0.4179786443710327, + "learning_rate": 0.00010893565032038368, + "loss": 1.4474, + "step": 35048 + }, + { + "epoch": 0.4554457697074441, + "grad_norm": 0.37974312901496887, + "learning_rate": 0.0001089330508584723, + "loss": 1.4708, + "step": 35049 + }, + { + "epoch": 0.45545876425135995, + "grad_norm": 0.31294378638267517, + "learning_rate": 0.0001089304513965609, + "loss": 1.2887, + "step": 35050 + }, + { + "epoch": 0.45547175879527585, + "grad_norm": 0.3727509379386902, + "learning_rate": 0.00010892785193464954, + "loss": 1.1632, + "step": 35051 + }, + { + "epoch": 0.4554847533391917, + "grad_norm": 0.4583154618740082, + "learning_rate": 0.00010892525247273814, + "loss": 1.3536, + "step": 35052 + }, + { + "epoch": 0.4554977478831076, + "grad_norm": 0.3647528886795044, + "learning_rate": 0.00010892265301082675, + "loss": 1.4213, + "step": 35053 + }, + { + "epoch": 0.45551074242702344, + "grad_norm": 0.3386082053184509, + "learning_rate": 0.00010892005354891539, + "loss": 1.3928, + "step": 35054 + }, + { + "epoch": 0.45552373697093934, + "grad_norm": 0.35960879921913147, + "learning_rate": 0.000108917454087004, + "loss": 1.2011, + "step": 35055 + }, + { + "epoch": 0.4555367315148552, + "grad_norm": 0.439457505941391, + "learning_rate": 0.00010891485462509261, + "loss": 1.3372, + "step": 35056 + }, + { + "epoch": 0.4555497260587711, + "grad_norm": 0.40421563386917114, + "learning_rate": 0.00010891225516318122, + "loss": 1.5815, + "step": 35057 + }, + { + "epoch": 0.45556272060268693, + "grad_norm": 0.40897563099861145, + "learning_rate": 0.00010890965570126985, + "loss": 1.4969, + "step": 35058 + }, + { + "epoch": 0.45557571514660283, + "grad_norm": 0.33841508626937866, + "learning_rate": 0.00010890705623935846, + "loss": 1.5263, + "step": 35059 + }, + { + "epoch": 0.4555887096905187, + "grad_norm": 0.3402288556098938, + "learning_rate": 0.00010890445677744707, + "loss": 1.5226, + "step": 35060 + }, + { + "epoch": 0.4556017042344346, + "grad_norm": 0.3883363902568817, + "learning_rate": 0.00010890185731553568, + "loss": 1.3213, + "step": 35061 + }, + { + "epoch": 0.4556146987783504, + "grad_norm": 0.4305150508880615, + "learning_rate": 0.00010889925785362432, + "loss": 1.4375, + "step": 35062 + }, + { + "epoch": 0.4556276933222663, + "grad_norm": 0.37741154432296753, + "learning_rate": 0.00010889665839171293, + "loss": 1.3679, + "step": 35063 + }, + { + "epoch": 0.45564068786618217, + "grad_norm": 0.3971658945083618, + "learning_rate": 0.00010889405892980154, + "loss": 1.4676, + "step": 35064 + }, + { + "epoch": 0.45565368241009807, + "grad_norm": 0.5680103898048401, + "learning_rate": 0.00010889145946789014, + "loss": 1.4097, + "step": 35065 + }, + { + "epoch": 0.4556666769540139, + "grad_norm": 0.4103486239910126, + "learning_rate": 0.00010888886000597878, + "loss": 1.4053, + "step": 35066 + }, + { + "epoch": 0.4556796714979298, + "grad_norm": 0.29176458716392517, + "learning_rate": 0.00010888626054406739, + "loss": 1.2888, + "step": 35067 + }, + { + "epoch": 0.45569266604184566, + "grad_norm": 0.3674878180027008, + "learning_rate": 0.000108883661082156, + "loss": 1.3179, + "step": 35068 + }, + { + "epoch": 0.45570566058576156, + "grad_norm": 0.3420224189758301, + "learning_rate": 0.00010888106162024461, + "loss": 1.3068, + "step": 35069 + }, + { + "epoch": 0.4557186551296774, + "grad_norm": 0.3458615243434906, + "learning_rate": 0.00010887846215833323, + "loss": 1.2571, + "step": 35070 + }, + { + "epoch": 0.4557316496735933, + "grad_norm": 0.4455699324607849, + "learning_rate": 0.00010887586269642184, + "loss": 1.4624, + "step": 35071 + }, + { + "epoch": 0.45574464421750915, + "grad_norm": 0.44694995880126953, + "learning_rate": 0.00010887326323451046, + "loss": 1.4674, + "step": 35072 + }, + { + "epoch": 0.45575763876142505, + "grad_norm": 0.4394267201423645, + "learning_rate": 0.00010887066377259907, + "loss": 1.5299, + "step": 35073 + }, + { + "epoch": 0.4557706333053409, + "grad_norm": 0.5419975519180298, + "learning_rate": 0.0001088680643106877, + "loss": 1.5702, + "step": 35074 + }, + { + "epoch": 0.4557836278492568, + "grad_norm": 0.4657742977142334, + "learning_rate": 0.00010886546484877632, + "loss": 1.3794, + "step": 35075 + }, + { + "epoch": 0.45579662239317265, + "grad_norm": 0.3677768111228943, + "learning_rate": 0.00010886286538686493, + "loss": 1.3042, + "step": 35076 + }, + { + "epoch": 0.45580961693708855, + "grad_norm": 0.38906824588775635, + "learning_rate": 0.00010886026592495354, + "loss": 1.3395, + "step": 35077 + }, + { + "epoch": 0.4558226114810044, + "grad_norm": 0.441157728433609, + "learning_rate": 0.00010885766646304216, + "loss": 1.3994, + "step": 35078 + }, + { + "epoch": 0.4558356060249203, + "grad_norm": 0.40818026661872864, + "learning_rate": 0.00010885506700113077, + "loss": 1.377, + "step": 35079 + }, + { + "epoch": 0.45584860056883614, + "grad_norm": 0.3919728100299835, + "learning_rate": 0.00010885246753921938, + "loss": 1.4499, + "step": 35080 + }, + { + "epoch": 0.45586159511275204, + "grad_norm": 0.4807562530040741, + "learning_rate": 0.000108849868077308, + "loss": 1.2835, + "step": 35081 + }, + { + "epoch": 0.4558745896566679, + "grad_norm": 0.43233057856559753, + "learning_rate": 0.00010884726861539662, + "loss": 1.2688, + "step": 35082 + }, + { + "epoch": 0.4558875842005838, + "grad_norm": 0.37879419326782227, + "learning_rate": 0.00010884466915348523, + "loss": 1.3436, + "step": 35083 + }, + { + "epoch": 0.45590057874449963, + "grad_norm": 0.4727247953414917, + "learning_rate": 0.00010884206969157384, + "loss": 1.3208, + "step": 35084 + }, + { + "epoch": 0.45591357328841553, + "grad_norm": 0.36132052540779114, + "learning_rate": 0.00010883947022966245, + "loss": 1.4261, + "step": 35085 + }, + { + "epoch": 0.4559265678323314, + "grad_norm": 0.4445934593677521, + "learning_rate": 0.00010883687076775109, + "loss": 1.3316, + "step": 35086 + }, + { + "epoch": 0.4559395623762473, + "grad_norm": 0.3887840211391449, + "learning_rate": 0.0001088342713058397, + "loss": 1.401, + "step": 35087 + }, + { + "epoch": 0.4559525569201631, + "grad_norm": 0.41173064708709717, + "learning_rate": 0.00010883167184392831, + "loss": 1.3259, + "step": 35088 + }, + { + "epoch": 0.455965551464079, + "grad_norm": 0.3394840657711029, + "learning_rate": 0.00010882907238201692, + "loss": 1.1398, + "step": 35089 + }, + { + "epoch": 0.45597854600799487, + "grad_norm": 0.48709434270858765, + "learning_rate": 0.00010882647292010555, + "loss": 1.378, + "step": 35090 + }, + { + "epoch": 0.45599154055191077, + "grad_norm": 0.3950914144515991, + "learning_rate": 0.00010882387345819416, + "loss": 1.4163, + "step": 35091 + }, + { + "epoch": 0.4560045350958266, + "grad_norm": 0.4286964535713196, + "learning_rate": 0.00010882127399628277, + "loss": 1.5206, + "step": 35092 + }, + { + "epoch": 0.4560175296397425, + "grad_norm": 0.339445024728775, + "learning_rate": 0.00010881867453437141, + "loss": 1.5024, + "step": 35093 + }, + { + "epoch": 0.45603052418365836, + "grad_norm": 0.49044355750083923, + "learning_rate": 0.00010881607507246, + "loss": 1.3305, + "step": 35094 + }, + { + "epoch": 0.45604351872757426, + "grad_norm": 0.39040762186050415, + "learning_rate": 0.00010881347561054862, + "loss": 1.2209, + "step": 35095 + }, + { + "epoch": 0.45605651327149016, + "grad_norm": 0.475614458322525, + "learning_rate": 0.00010881087614863723, + "loss": 1.4488, + "step": 35096 + }, + { + "epoch": 0.456069507815406, + "grad_norm": 0.42221200466156006, + "learning_rate": 0.00010880827668672586, + "loss": 1.479, + "step": 35097 + }, + { + "epoch": 0.4560825023593219, + "grad_norm": 0.7619783878326416, + "learning_rate": 0.00010880567722481448, + "loss": 1.4547, + "step": 35098 + }, + { + "epoch": 0.45609549690323775, + "grad_norm": 0.4462756812572479, + "learning_rate": 0.00010880307776290309, + "loss": 1.527, + "step": 35099 + }, + { + "epoch": 0.45610849144715365, + "grad_norm": 0.5439983010292053, + "learning_rate": 0.0001088004783009917, + "loss": 1.4804, + "step": 35100 + }, + { + "epoch": 0.4561214859910695, + "grad_norm": 0.3381611108779907, + "learning_rate": 0.00010879787883908032, + "loss": 1.2476, + "step": 35101 + }, + { + "epoch": 0.4561344805349854, + "grad_norm": 0.49465855956077576, + "learning_rate": 0.00010879527937716893, + "loss": 1.4513, + "step": 35102 + }, + { + "epoch": 0.45614747507890124, + "grad_norm": 0.34988129138946533, + "learning_rate": 0.00010879267991525754, + "loss": 1.3758, + "step": 35103 + }, + { + "epoch": 0.45616046962281714, + "grad_norm": 0.37241464853286743, + "learning_rate": 0.00010879008045334615, + "loss": 1.3197, + "step": 35104 + }, + { + "epoch": 0.456173464166733, + "grad_norm": 0.3859103322029114, + "learning_rate": 0.00010878748099143479, + "loss": 1.3371, + "step": 35105 + }, + { + "epoch": 0.4561864587106489, + "grad_norm": 0.4179384410381317, + "learning_rate": 0.0001087848815295234, + "loss": 1.2031, + "step": 35106 + }, + { + "epoch": 0.45619945325456474, + "grad_norm": 0.42201900482177734, + "learning_rate": 0.000108782282067612, + "loss": 1.3764, + "step": 35107 + }, + { + "epoch": 0.45621244779848064, + "grad_norm": 0.4361856281757355, + "learning_rate": 0.00010877968260570061, + "loss": 1.2826, + "step": 35108 + }, + { + "epoch": 0.4562254423423965, + "grad_norm": 0.3928603231906891, + "learning_rate": 0.00010877708314378925, + "loss": 1.4327, + "step": 35109 + }, + { + "epoch": 0.4562384368863124, + "grad_norm": 0.3992486298084259, + "learning_rate": 0.00010877448368187786, + "loss": 1.3784, + "step": 35110 + }, + { + "epoch": 0.4562514314302282, + "grad_norm": 0.30212706327438354, + "learning_rate": 0.00010877188421996647, + "loss": 1.3223, + "step": 35111 + }, + { + "epoch": 0.4562644259741441, + "grad_norm": 0.43496283888816833, + "learning_rate": 0.00010876928475805508, + "loss": 1.4327, + "step": 35112 + }, + { + "epoch": 0.45627742051806, + "grad_norm": 0.31552693247795105, + "learning_rate": 0.0001087666852961437, + "loss": 1.5971, + "step": 35113 + }, + { + "epoch": 0.4562904150619759, + "grad_norm": 0.39178699254989624, + "learning_rate": 0.00010876408583423232, + "loss": 1.4298, + "step": 35114 + }, + { + "epoch": 0.4563034096058917, + "grad_norm": 0.28121933341026306, + "learning_rate": 0.00010876148637232093, + "loss": 1.5715, + "step": 35115 + }, + { + "epoch": 0.4563164041498076, + "grad_norm": 0.45745649933815, + "learning_rate": 0.00010875888691040954, + "loss": 1.5354, + "step": 35116 + }, + { + "epoch": 0.45632939869372346, + "grad_norm": 0.3201655447483063, + "learning_rate": 0.00010875628744849818, + "loss": 1.3782, + "step": 35117 + }, + { + "epoch": 0.45634239323763937, + "grad_norm": 0.33472123742103577, + "learning_rate": 0.00010875368798658679, + "loss": 1.4377, + "step": 35118 + }, + { + "epoch": 0.4563553877815552, + "grad_norm": 0.5092899799346924, + "learning_rate": 0.0001087510885246754, + "loss": 1.2806, + "step": 35119 + }, + { + "epoch": 0.4563683823254711, + "grad_norm": 0.3529927432537079, + "learning_rate": 0.000108748489062764, + "loss": 1.3199, + "step": 35120 + }, + { + "epoch": 0.45638137686938696, + "grad_norm": 0.3585021495819092, + "learning_rate": 0.00010874588960085263, + "loss": 1.5215, + "step": 35121 + }, + { + "epoch": 0.45639437141330286, + "grad_norm": 0.4989951550960541, + "learning_rate": 0.00010874329013894125, + "loss": 1.4494, + "step": 35122 + }, + { + "epoch": 0.4564073659572187, + "grad_norm": 0.5339763760566711, + "learning_rate": 0.00010874069067702986, + "loss": 1.2754, + "step": 35123 + }, + { + "epoch": 0.4564203605011346, + "grad_norm": 0.34773895144462585, + "learning_rate": 0.00010873809121511847, + "loss": 1.2247, + "step": 35124 + }, + { + "epoch": 0.45643335504505045, + "grad_norm": 0.4995030164718628, + "learning_rate": 0.00010873549175320709, + "loss": 1.5771, + "step": 35125 + }, + { + "epoch": 0.45644634958896635, + "grad_norm": 0.4398910403251648, + "learning_rate": 0.0001087328922912957, + "loss": 1.442, + "step": 35126 + }, + { + "epoch": 0.4564593441328822, + "grad_norm": 0.3295517563819885, + "learning_rate": 0.00010873029282938431, + "loss": 1.3751, + "step": 35127 + }, + { + "epoch": 0.4564723386767981, + "grad_norm": 0.5005344748497009, + "learning_rate": 0.00010872769336747295, + "loss": 1.4498, + "step": 35128 + }, + { + "epoch": 0.45648533322071394, + "grad_norm": 0.5326129794120789, + "learning_rate": 0.00010872509390556156, + "loss": 1.4191, + "step": 35129 + }, + { + "epoch": 0.45649832776462984, + "grad_norm": 0.4533020853996277, + "learning_rate": 0.00010872249444365017, + "loss": 1.4076, + "step": 35130 + }, + { + "epoch": 0.4565113223085457, + "grad_norm": 0.370254248380661, + "learning_rate": 0.00010871989498173878, + "loss": 1.222, + "step": 35131 + }, + { + "epoch": 0.4565243168524616, + "grad_norm": 0.4136718213558197, + "learning_rate": 0.00010871729551982741, + "loss": 1.4117, + "step": 35132 + }, + { + "epoch": 0.45653731139637743, + "grad_norm": 0.41696450114250183, + "learning_rate": 0.00010871469605791602, + "loss": 1.3432, + "step": 35133 + }, + { + "epoch": 0.45655030594029333, + "grad_norm": 0.4033799469470978, + "learning_rate": 0.00010871209659600463, + "loss": 1.5524, + "step": 35134 + }, + { + "epoch": 0.4565633004842092, + "grad_norm": 0.3737904727458954, + "learning_rate": 0.00010870949713409324, + "loss": 1.468, + "step": 35135 + }, + { + "epoch": 0.4565762950281251, + "grad_norm": 0.35311636328697205, + "learning_rate": 0.00010870689767218187, + "loss": 1.1839, + "step": 35136 + }, + { + "epoch": 0.4565892895720409, + "grad_norm": 0.46390852332115173, + "learning_rate": 0.00010870429821027048, + "loss": 1.4522, + "step": 35137 + }, + { + "epoch": 0.4566022841159568, + "grad_norm": 0.4372662603855133, + "learning_rate": 0.00010870169874835909, + "loss": 1.2583, + "step": 35138 + }, + { + "epoch": 0.45661527865987267, + "grad_norm": 0.3897049129009247, + "learning_rate": 0.0001086990992864477, + "loss": 1.3216, + "step": 35139 + }, + { + "epoch": 0.45662827320378857, + "grad_norm": 0.405662477016449, + "learning_rate": 0.00010869649982453634, + "loss": 1.4033, + "step": 35140 + }, + { + "epoch": 0.4566412677477044, + "grad_norm": 0.4160982370376587, + "learning_rate": 0.00010869390036262495, + "loss": 1.5137, + "step": 35141 + }, + { + "epoch": 0.4566542622916203, + "grad_norm": 0.40514206886291504, + "learning_rate": 0.00010869130090071356, + "loss": 1.2781, + "step": 35142 + }, + { + "epoch": 0.45666725683553616, + "grad_norm": 0.33861401677131653, + "learning_rate": 0.00010868870143880217, + "loss": 1.3123, + "step": 35143 + }, + { + "epoch": 0.45668025137945206, + "grad_norm": 0.4271220266819, + "learning_rate": 0.0001086861019768908, + "loss": 1.3868, + "step": 35144 + }, + { + "epoch": 0.4566932459233679, + "grad_norm": 0.42899203300476074, + "learning_rate": 0.0001086835025149794, + "loss": 1.304, + "step": 35145 + }, + { + "epoch": 0.4567062404672838, + "grad_norm": 0.3812744915485382, + "learning_rate": 0.00010868090305306802, + "loss": 1.2488, + "step": 35146 + }, + { + "epoch": 0.45671923501119965, + "grad_norm": 0.3733411431312561, + "learning_rate": 0.00010867830359115663, + "loss": 1.4233, + "step": 35147 + }, + { + "epoch": 0.45673222955511555, + "grad_norm": 0.3744480609893799, + "learning_rate": 0.00010867570412924527, + "loss": 1.366, + "step": 35148 + }, + { + "epoch": 0.4567452240990314, + "grad_norm": 0.33710265159606934, + "learning_rate": 0.00010867310466733386, + "loss": 1.358, + "step": 35149 + }, + { + "epoch": 0.4567582186429473, + "grad_norm": 0.39333394169807434, + "learning_rate": 0.00010867050520542247, + "loss": 1.4081, + "step": 35150 + }, + { + "epoch": 0.45677121318686315, + "grad_norm": 0.4287506937980652, + "learning_rate": 0.00010866790574351108, + "loss": 1.3353, + "step": 35151 + }, + { + "epoch": 0.45678420773077905, + "grad_norm": 0.437557190656662, + "learning_rate": 0.00010866530628159972, + "loss": 1.3684, + "step": 35152 + }, + { + "epoch": 0.4567972022746949, + "grad_norm": 0.3804386258125305, + "learning_rate": 0.00010866270681968833, + "loss": 1.1482, + "step": 35153 + }, + { + "epoch": 0.4568101968186108, + "grad_norm": 0.44130581617355347, + "learning_rate": 0.00010866010735777694, + "loss": 1.4892, + "step": 35154 + }, + { + "epoch": 0.45682319136252664, + "grad_norm": 0.3461728096008301, + "learning_rate": 0.00010865750789586556, + "loss": 1.3876, + "step": 35155 + }, + { + "epoch": 0.45683618590644254, + "grad_norm": 0.4408833384513855, + "learning_rate": 0.00010865490843395418, + "loss": 1.4337, + "step": 35156 + }, + { + "epoch": 0.4568491804503584, + "grad_norm": 0.3418590724468231, + "learning_rate": 0.00010865230897204279, + "loss": 1.408, + "step": 35157 + }, + { + "epoch": 0.4568621749942743, + "grad_norm": 0.2727314233779907, + "learning_rate": 0.0001086497095101314, + "loss": 1.2416, + "step": 35158 + }, + { + "epoch": 0.45687516953819013, + "grad_norm": 0.4399529695510864, + "learning_rate": 0.00010864711004822001, + "loss": 1.419, + "step": 35159 + }, + { + "epoch": 0.45688816408210603, + "grad_norm": 0.5010734796524048, + "learning_rate": 0.00010864451058630865, + "loss": 1.4272, + "step": 35160 + }, + { + "epoch": 0.4569011586260219, + "grad_norm": 0.3213130831718445, + "learning_rate": 0.00010864191112439726, + "loss": 1.4723, + "step": 35161 + }, + { + "epoch": 0.4569141531699378, + "grad_norm": 0.39463120698928833, + "learning_rate": 0.00010863931166248586, + "loss": 1.4758, + "step": 35162 + }, + { + "epoch": 0.4569271477138536, + "grad_norm": 0.40219464898109436, + "learning_rate": 0.00010863671220057447, + "loss": 1.815, + "step": 35163 + }, + { + "epoch": 0.4569401422577695, + "grad_norm": 0.44015780091285706, + "learning_rate": 0.00010863411273866311, + "loss": 1.3933, + "step": 35164 + }, + { + "epoch": 0.45695313680168537, + "grad_norm": 0.3716558516025543, + "learning_rate": 0.00010863151327675172, + "loss": 1.3976, + "step": 35165 + }, + { + "epoch": 0.45696613134560127, + "grad_norm": 0.4074903130531311, + "learning_rate": 0.00010862891381484033, + "loss": 1.5522, + "step": 35166 + }, + { + "epoch": 0.4569791258895171, + "grad_norm": 0.48126131296157837, + "learning_rate": 0.00010862631435292895, + "loss": 1.3402, + "step": 35167 + }, + { + "epoch": 0.456992120433433, + "grad_norm": 0.41142916679382324, + "learning_rate": 0.00010862371489101757, + "loss": 1.464, + "step": 35168 + }, + { + "epoch": 0.45700511497734886, + "grad_norm": 0.4035705029964447, + "learning_rate": 0.00010862111542910618, + "loss": 1.4421, + "step": 35169 + }, + { + "epoch": 0.45701810952126476, + "grad_norm": 0.3333204984664917, + "learning_rate": 0.00010861851596719479, + "loss": 1.4334, + "step": 35170 + }, + { + "epoch": 0.45703110406518066, + "grad_norm": 0.40880200266838074, + "learning_rate": 0.00010861591650528343, + "loss": 1.1942, + "step": 35171 + }, + { + "epoch": 0.4570440986090965, + "grad_norm": 0.404093474149704, + "learning_rate": 0.00010861331704337204, + "loss": 1.3103, + "step": 35172 + }, + { + "epoch": 0.4570570931530124, + "grad_norm": 0.45474931597709656, + "learning_rate": 0.00010861071758146065, + "loss": 1.5753, + "step": 35173 + }, + { + "epoch": 0.45707008769692825, + "grad_norm": 0.4474669098854065, + "learning_rate": 0.00010860811811954924, + "loss": 1.4011, + "step": 35174 + }, + { + "epoch": 0.45708308224084415, + "grad_norm": 0.3757783770561218, + "learning_rate": 0.00010860551865763788, + "loss": 1.5408, + "step": 35175 + }, + { + "epoch": 0.45709607678476, + "grad_norm": 0.30709052085876465, + "learning_rate": 0.0001086029191957265, + "loss": 1.3539, + "step": 35176 + }, + { + "epoch": 0.4571090713286759, + "grad_norm": 0.3956267833709717, + "learning_rate": 0.0001086003197338151, + "loss": 1.4266, + "step": 35177 + }, + { + "epoch": 0.45712206587259174, + "grad_norm": 0.45040905475616455, + "learning_rate": 0.00010859772027190372, + "loss": 1.3055, + "step": 35178 + }, + { + "epoch": 0.45713506041650764, + "grad_norm": 0.2968219518661499, + "learning_rate": 0.00010859512080999234, + "loss": 1.4739, + "step": 35179 + }, + { + "epoch": 0.4571480549604235, + "grad_norm": 0.41308730840682983, + "learning_rate": 0.00010859252134808095, + "loss": 1.2848, + "step": 35180 + }, + { + "epoch": 0.4571610495043394, + "grad_norm": 0.37023502588272095, + "learning_rate": 0.00010858992188616956, + "loss": 1.3686, + "step": 35181 + }, + { + "epoch": 0.45717404404825523, + "grad_norm": 0.26160967350006104, + "learning_rate": 0.00010858732242425817, + "loss": 1.1939, + "step": 35182 + }, + { + "epoch": 0.45718703859217114, + "grad_norm": 0.2825961709022522, + "learning_rate": 0.00010858472296234681, + "loss": 1.4978, + "step": 35183 + }, + { + "epoch": 0.457200033136087, + "grad_norm": 0.3860016465187073, + "learning_rate": 0.00010858212350043542, + "loss": 1.4197, + "step": 35184 + }, + { + "epoch": 0.4572130276800029, + "grad_norm": 0.4376005530357361, + "learning_rate": 0.00010857952403852403, + "loss": 1.3316, + "step": 35185 + }, + { + "epoch": 0.4572260222239187, + "grad_norm": 0.49659183621406555, + "learning_rate": 0.00010857692457661264, + "loss": 1.4649, + "step": 35186 + }, + { + "epoch": 0.4572390167678346, + "grad_norm": 0.40608954429626465, + "learning_rate": 0.00010857432511470127, + "loss": 1.3845, + "step": 35187 + }, + { + "epoch": 0.4572520113117505, + "grad_norm": 0.4390445947647095, + "learning_rate": 0.00010857172565278988, + "loss": 1.4882, + "step": 35188 + }, + { + "epoch": 0.4572650058556664, + "grad_norm": 0.41920405626296997, + "learning_rate": 0.00010856912619087849, + "loss": 1.4217, + "step": 35189 + }, + { + "epoch": 0.4572780003995822, + "grad_norm": 0.40508297085762024, + "learning_rate": 0.0001085665267289671, + "loss": 1.1064, + "step": 35190 + }, + { + "epoch": 0.4572909949434981, + "grad_norm": 0.38219398260116577, + "learning_rate": 0.00010856392726705573, + "loss": 1.4556, + "step": 35191 + }, + { + "epoch": 0.45730398948741396, + "grad_norm": 0.2865232825279236, + "learning_rate": 0.00010856132780514434, + "loss": 1.2694, + "step": 35192 + }, + { + "epoch": 0.45731698403132987, + "grad_norm": 0.3481275737285614, + "learning_rate": 0.00010855872834323295, + "loss": 1.2036, + "step": 35193 + }, + { + "epoch": 0.4573299785752457, + "grad_norm": 0.2854389548301697, + "learning_rate": 0.00010855612888132156, + "loss": 1.4205, + "step": 35194 + }, + { + "epoch": 0.4573429731191616, + "grad_norm": 0.43644359707832336, + "learning_rate": 0.0001085535294194102, + "loss": 1.3665, + "step": 35195 + }, + { + "epoch": 0.45735596766307746, + "grad_norm": 0.45816734433174133, + "learning_rate": 0.00010855092995749881, + "loss": 1.4786, + "step": 35196 + }, + { + "epoch": 0.45736896220699336, + "grad_norm": 0.5114655494689941, + "learning_rate": 0.00010854833049558742, + "loss": 1.4149, + "step": 35197 + }, + { + "epoch": 0.4573819567509092, + "grad_norm": 0.33392441272735596, + "learning_rate": 0.00010854573103367603, + "loss": 1.4848, + "step": 35198 + }, + { + "epoch": 0.4573949512948251, + "grad_norm": 0.3224254250526428, + "learning_rate": 0.00010854313157176465, + "loss": 1.2942, + "step": 35199 + }, + { + "epoch": 0.45740794583874095, + "grad_norm": 0.2591899335384369, + "learning_rate": 0.00010854053210985326, + "loss": 1.4688, + "step": 35200 + }, + { + "epoch": 0.45742094038265685, + "grad_norm": 0.4480980932712555, + "learning_rate": 0.00010853793264794188, + "loss": 1.4249, + "step": 35201 + }, + { + "epoch": 0.4574339349265727, + "grad_norm": 0.47865980863571167, + "learning_rate": 0.00010853533318603051, + "loss": 1.4411, + "step": 35202 + }, + { + "epoch": 0.4574469294704886, + "grad_norm": 0.3556857407093048, + "learning_rate": 0.00010853273372411912, + "loss": 1.3199, + "step": 35203 + }, + { + "epoch": 0.45745992401440444, + "grad_norm": 0.45210039615631104, + "learning_rate": 0.00010853013426220772, + "loss": 1.5048, + "step": 35204 + }, + { + "epoch": 0.45747291855832034, + "grad_norm": 0.5351685285568237, + "learning_rate": 0.00010852753480029633, + "loss": 1.4864, + "step": 35205 + }, + { + "epoch": 0.4574859131022362, + "grad_norm": 0.4799736738204956, + "learning_rate": 0.00010852493533838497, + "loss": 1.5032, + "step": 35206 + }, + { + "epoch": 0.4574989076461521, + "grad_norm": 0.43956881761550903, + "learning_rate": 0.00010852233587647358, + "loss": 1.5017, + "step": 35207 + }, + { + "epoch": 0.45751190219006793, + "grad_norm": 0.5154749155044556, + "learning_rate": 0.00010851973641456219, + "loss": 1.6559, + "step": 35208 + }, + { + "epoch": 0.45752489673398383, + "grad_norm": 0.4488048553466797, + "learning_rate": 0.0001085171369526508, + "loss": 1.3764, + "step": 35209 + }, + { + "epoch": 0.4575378912778997, + "grad_norm": 0.3972122073173523, + "learning_rate": 0.00010851453749073943, + "loss": 1.6702, + "step": 35210 + }, + { + "epoch": 0.4575508858218156, + "grad_norm": 0.32879453897476196, + "learning_rate": 0.00010851193802882804, + "loss": 1.3974, + "step": 35211 + }, + { + "epoch": 0.4575638803657314, + "grad_norm": 0.33414342999458313, + "learning_rate": 0.00010850933856691665, + "loss": 1.2937, + "step": 35212 + }, + { + "epoch": 0.4575768749096473, + "grad_norm": 0.4241344630718231, + "learning_rate": 0.00010850673910500526, + "loss": 1.5058, + "step": 35213 + }, + { + "epoch": 0.45758986945356317, + "grad_norm": 0.3188968598842621, + "learning_rate": 0.0001085041396430939, + "loss": 1.3327, + "step": 35214 + }, + { + "epoch": 0.45760286399747907, + "grad_norm": 0.3835933804512024, + "learning_rate": 0.00010850154018118251, + "loss": 1.5318, + "step": 35215 + }, + { + "epoch": 0.4576158585413949, + "grad_norm": 0.3659355342388153, + "learning_rate": 0.00010849894071927111, + "loss": 1.585, + "step": 35216 + }, + { + "epoch": 0.4576288530853108, + "grad_norm": 0.44043996930122375, + "learning_rate": 0.00010849634125735972, + "loss": 1.1948, + "step": 35217 + }, + { + "epoch": 0.45764184762922666, + "grad_norm": 0.44834479689598083, + "learning_rate": 0.00010849374179544836, + "loss": 1.3876, + "step": 35218 + }, + { + "epoch": 0.45765484217314256, + "grad_norm": 0.3071345388889313, + "learning_rate": 0.00010849114233353697, + "loss": 1.3022, + "step": 35219 + }, + { + "epoch": 0.4576678367170584, + "grad_norm": 0.3260709047317505, + "learning_rate": 0.00010848854287162558, + "loss": 1.2277, + "step": 35220 + }, + { + "epoch": 0.4576808312609743, + "grad_norm": 0.35543254017829895, + "learning_rate": 0.00010848594340971419, + "loss": 1.3643, + "step": 35221 + }, + { + "epoch": 0.45769382580489015, + "grad_norm": 0.3343386948108673, + "learning_rate": 0.00010848334394780281, + "loss": 1.2112, + "step": 35222 + }, + { + "epoch": 0.45770682034880605, + "grad_norm": 0.39798232913017273, + "learning_rate": 0.00010848074448589142, + "loss": 1.5742, + "step": 35223 + }, + { + "epoch": 0.4577198148927219, + "grad_norm": 0.3496485650539398, + "learning_rate": 0.00010847814502398004, + "loss": 1.3852, + "step": 35224 + }, + { + "epoch": 0.4577328094366378, + "grad_norm": 0.3821001648902893, + "learning_rate": 0.00010847554556206865, + "loss": 1.1962, + "step": 35225 + }, + { + "epoch": 0.45774580398055364, + "grad_norm": 0.28176069259643555, + "learning_rate": 0.00010847294610015728, + "loss": 1.1889, + "step": 35226 + }, + { + "epoch": 0.45775879852446955, + "grad_norm": 0.42420098185539246, + "learning_rate": 0.0001084703466382459, + "loss": 1.2605, + "step": 35227 + }, + { + "epoch": 0.4577717930683854, + "grad_norm": 0.4934530258178711, + "learning_rate": 0.0001084677471763345, + "loss": 1.5385, + "step": 35228 + }, + { + "epoch": 0.4577847876123013, + "grad_norm": 0.43733447790145874, + "learning_rate": 0.0001084651477144231, + "loss": 1.3628, + "step": 35229 + }, + { + "epoch": 0.45779778215621714, + "grad_norm": 0.4146369397640228, + "learning_rate": 0.00010846254825251174, + "loss": 1.4634, + "step": 35230 + }, + { + "epoch": 0.45781077670013304, + "grad_norm": 0.36894673109054565, + "learning_rate": 0.00010845994879060035, + "loss": 1.3395, + "step": 35231 + }, + { + "epoch": 0.4578237712440489, + "grad_norm": 0.37321045994758606, + "learning_rate": 0.00010845734932868896, + "loss": 1.2889, + "step": 35232 + }, + { + "epoch": 0.4578367657879648, + "grad_norm": 0.37834569811820984, + "learning_rate": 0.00010845474986677757, + "loss": 1.3993, + "step": 35233 + }, + { + "epoch": 0.45784976033188063, + "grad_norm": 0.3205845057964325, + "learning_rate": 0.0001084521504048662, + "loss": 1.1027, + "step": 35234 + }, + { + "epoch": 0.45786275487579653, + "grad_norm": 0.42798498272895813, + "learning_rate": 0.00010844955094295481, + "loss": 1.1531, + "step": 35235 + }, + { + "epoch": 0.4578757494197124, + "grad_norm": 0.4186842739582062, + "learning_rate": 0.00010844695148104342, + "loss": 1.4299, + "step": 35236 + }, + { + "epoch": 0.4578887439636283, + "grad_norm": 0.41149231791496277, + "learning_rate": 0.00010844435201913203, + "loss": 1.4173, + "step": 35237 + }, + { + "epoch": 0.4579017385075441, + "grad_norm": 0.3289128243923187, + "learning_rate": 0.00010844175255722067, + "loss": 1.4453, + "step": 35238 + }, + { + "epoch": 0.45791473305146, + "grad_norm": 0.5800630450248718, + "learning_rate": 0.00010843915309530928, + "loss": 1.3749, + "step": 35239 + }, + { + "epoch": 0.45792772759537587, + "grad_norm": 0.4329250752925873, + "learning_rate": 0.00010843655363339789, + "loss": 1.4118, + "step": 35240 + }, + { + "epoch": 0.45794072213929177, + "grad_norm": 0.38029828667640686, + "learning_rate": 0.00010843395417148652, + "loss": 1.4105, + "step": 35241 + }, + { + "epoch": 0.4579537166832076, + "grad_norm": 0.4577818214893341, + "learning_rate": 0.00010843135470957513, + "loss": 1.4297, + "step": 35242 + }, + { + "epoch": 0.4579667112271235, + "grad_norm": 0.4916505217552185, + "learning_rate": 0.00010842875524766374, + "loss": 1.5481, + "step": 35243 + }, + { + "epoch": 0.45797970577103936, + "grad_norm": 0.38707616925239563, + "learning_rate": 0.00010842615578575235, + "loss": 1.3447, + "step": 35244 + }, + { + "epoch": 0.45799270031495526, + "grad_norm": 0.4361681044101715, + "learning_rate": 0.00010842355632384099, + "loss": 1.3706, + "step": 35245 + }, + { + "epoch": 0.4580056948588711, + "grad_norm": 0.3618282973766327, + "learning_rate": 0.00010842095686192958, + "loss": 1.2397, + "step": 35246 + }, + { + "epoch": 0.458018689402787, + "grad_norm": 0.3238358795642853, + "learning_rate": 0.0001084183574000182, + "loss": 1.2986, + "step": 35247 + }, + { + "epoch": 0.4580316839467029, + "grad_norm": 0.37960800528526306, + "learning_rate": 0.0001084157579381068, + "loss": 1.2536, + "step": 35248 + }, + { + "epoch": 0.45804467849061875, + "grad_norm": 0.44496873021125793, + "learning_rate": 0.00010841315847619544, + "loss": 1.5482, + "step": 35249 + }, + { + "epoch": 0.45805767303453465, + "grad_norm": 0.3626144826412201, + "learning_rate": 0.00010841055901428405, + "loss": 1.2164, + "step": 35250 + }, + { + "epoch": 0.4580706675784505, + "grad_norm": 0.36149296164512634, + "learning_rate": 0.00010840795955237267, + "loss": 1.2694, + "step": 35251 + }, + { + "epoch": 0.4580836621223664, + "grad_norm": 0.3782879710197449, + "learning_rate": 0.00010840536009046128, + "loss": 1.2774, + "step": 35252 + }, + { + "epoch": 0.45809665666628224, + "grad_norm": 0.48522794246673584, + "learning_rate": 0.0001084027606285499, + "loss": 1.5855, + "step": 35253 + }, + { + "epoch": 0.45810965121019814, + "grad_norm": 0.4705965220928192, + "learning_rate": 0.00010840016116663851, + "loss": 1.3872, + "step": 35254 + }, + { + "epoch": 0.458122645754114, + "grad_norm": 0.2958342134952545, + "learning_rate": 0.00010839756170472712, + "loss": 1.1364, + "step": 35255 + }, + { + "epoch": 0.4581356402980299, + "grad_norm": 0.29890361428260803, + "learning_rate": 0.00010839496224281573, + "loss": 1.1778, + "step": 35256 + }, + { + "epoch": 0.45814863484194573, + "grad_norm": 0.33634698390960693, + "learning_rate": 0.00010839236278090437, + "loss": 1.3648, + "step": 35257 + }, + { + "epoch": 0.45816162938586164, + "grad_norm": 0.4534910321235657, + "learning_rate": 0.00010838976331899297, + "loss": 1.5126, + "step": 35258 + }, + { + "epoch": 0.4581746239297775, + "grad_norm": 0.4674331247806549, + "learning_rate": 0.00010838716385708158, + "loss": 1.3965, + "step": 35259 + }, + { + "epoch": 0.4581876184736934, + "grad_norm": 0.4149826765060425, + "learning_rate": 0.00010838456439517019, + "loss": 1.3724, + "step": 35260 + }, + { + "epoch": 0.4582006130176092, + "grad_norm": 0.43923062086105347, + "learning_rate": 0.00010838196493325883, + "loss": 1.5823, + "step": 35261 + }, + { + "epoch": 0.4582136075615251, + "grad_norm": 0.5054402947425842, + "learning_rate": 0.00010837936547134744, + "loss": 1.4279, + "step": 35262 + }, + { + "epoch": 0.45822660210544097, + "grad_norm": 0.42773324251174927, + "learning_rate": 0.00010837676600943605, + "loss": 1.4269, + "step": 35263 + }, + { + "epoch": 0.4582395966493569, + "grad_norm": 0.47015732526779175, + "learning_rate": 0.00010837416654752466, + "loss": 1.5479, + "step": 35264 + }, + { + "epoch": 0.4582525911932727, + "grad_norm": 0.4380808174610138, + "learning_rate": 0.00010837156708561329, + "loss": 1.4651, + "step": 35265 + }, + { + "epoch": 0.4582655857371886, + "grad_norm": 0.3586990237236023, + "learning_rate": 0.0001083689676237019, + "loss": 1.1728, + "step": 35266 + }, + { + "epoch": 0.45827858028110446, + "grad_norm": 0.7863790392875671, + "learning_rate": 0.00010836636816179051, + "loss": 1.5038, + "step": 35267 + }, + { + "epoch": 0.45829157482502036, + "grad_norm": 0.32435423135757446, + "learning_rate": 0.00010836376869987912, + "loss": 1.1982, + "step": 35268 + }, + { + "epoch": 0.4583045693689362, + "grad_norm": 0.28237786889076233, + "learning_rate": 0.00010836116923796776, + "loss": 1.1257, + "step": 35269 + }, + { + "epoch": 0.4583175639128521, + "grad_norm": 0.43614304065704346, + "learning_rate": 0.00010835856977605637, + "loss": 1.3819, + "step": 35270 + }, + { + "epoch": 0.45833055845676796, + "grad_norm": 0.4806976318359375, + "learning_rate": 0.00010835597031414497, + "loss": 1.4418, + "step": 35271 + }, + { + "epoch": 0.45834355300068386, + "grad_norm": 0.41072726249694824, + "learning_rate": 0.00010835337085223358, + "loss": 1.4635, + "step": 35272 + }, + { + "epoch": 0.4583565475445997, + "grad_norm": 0.40402135252952576, + "learning_rate": 0.00010835077139032221, + "loss": 1.4885, + "step": 35273 + }, + { + "epoch": 0.4583695420885156, + "grad_norm": 0.40256422758102417, + "learning_rate": 0.00010834817192841083, + "loss": 1.3439, + "step": 35274 + }, + { + "epoch": 0.45838253663243145, + "grad_norm": 0.36553245782852173, + "learning_rate": 0.00010834557246649944, + "loss": 1.5257, + "step": 35275 + }, + { + "epoch": 0.45839553117634735, + "grad_norm": 0.377447247505188, + "learning_rate": 0.00010834297300458806, + "loss": 1.3524, + "step": 35276 + }, + { + "epoch": 0.4584085257202632, + "grad_norm": 0.2974754273891449, + "learning_rate": 0.00010834037354267667, + "loss": 1.1978, + "step": 35277 + }, + { + "epoch": 0.4584215202641791, + "grad_norm": 0.37702056765556335, + "learning_rate": 0.00010833777408076528, + "loss": 1.3135, + "step": 35278 + }, + { + "epoch": 0.45843451480809494, + "grad_norm": 0.2538222670555115, + "learning_rate": 0.0001083351746188539, + "loss": 1.5245, + "step": 35279 + }, + { + "epoch": 0.45844750935201084, + "grad_norm": 0.4145028591156006, + "learning_rate": 0.00010833257515694253, + "loss": 1.4954, + "step": 35280 + }, + { + "epoch": 0.4584605038959267, + "grad_norm": 0.4104863703250885, + "learning_rate": 0.00010832997569503114, + "loss": 1.1267, + "step": 35281 + }, + { + "epoch": 0.4584734984398426, + "grad_norm": 0.4241240918636322, + "learning_rate": 0.00010832737623311975, + "loss": 1.2789, + "step": 35282 + }, + { + "epoch": 0.45848649298375843, + "grad_norm": 0.4301542043685913, + "learning_rate": 0.00010832477677120836, + "loss": 1.4101, + "step": 35283 + }, + { + "epoch": 0.45849948752767433, + "grad_norm": 0.38485342264175415, + "learning_rate": 0.00010832217730929699, + "loss": 1.4676, + "step": 35284 + }, + { + "epoch": 0.4585124820715902, + "grad_norm": 0.39717787504196167, + "learning_rate": 0.0001083195778473856, + "loss": 1.5187, + "step": 35285 + }, + { + "epoch": 0.4585254766155061, + "grad_norm": 0.3654116988182068, + "learning_rate": 0.00010831697838547421, + "loss": 1.3772, + "step": 35286 + }, + { + "epoch": 0.4585384711594219, + "grad_norm": 0.3713887929916382, + "learning_rate": 0.00010831437892356282, + "loss": 1.2216, + "step": 35287 + }, + { + "epoch": 0.4585514657033378, + "grad_norm": 0.46072137355804443, + "learning_rate": 0.00010831177946165145, + "loss": 1.4036, + "step": 35288 + }, + { + "epoch": 0.45856446024725367, + "grad_norm": 0.47606176137924194, + "learning_rate": 0.00010830917999974006, + "loss": 1.5429, + "step": 35289 + }, + { + "epoch": 0.45857745479116957, + "grad_norm": 0.4153774082660675, + "learning_rate": 0.00010830658053782867, + "loss": 1.4051, + "step": 35290 + }, + { + "epoch": 0.4585904493350854, + "grad_norm": 0.4043768346309662, + "learning_rate": 0.00010830398107591728, + "loss": 1.4037, + "step": 35291 + }, + { + "epoch": 0.4586034438790013, + "grad_norm": 0.3259259760379791, + "learning_rate": 0.00010830138161400592, + "loss": 1.3242, + "step": 35292 + }, + { + "epoch": 0.45861643842291716, + "grad_norm": 0.414194256067276, + "learning_rate": 0.00010829878215209453, + "loss": 1.1967, + "step": 35293 + }, + { + "epoch": 0.45862943296683306, + "grad_norm": 0.39186134934425354, + "learning_rate": 0.00010829618269018314, + "loss": 1.1955, + "step": 35294 + }, + { + "epoch": 0.4586424275107489, + "grad_norm": 0.35264942049980164, + "learning_rate": 0.00010829358322827175, + "loss": 1.4243, + "step": 35295 + }, + { + "epoch": 0.4586554220546648, + "grad_norm": 0.3861359655857086, + "learning_rate": 0.00010829098376636037, + "loss": 1.4644, + "step": 35296 + }, + { + "epoch": 0.45866841659858065, + "grad_norm": 0.4659987986087799, + "learning_rate": 0.00010828838430444899, + "loss": 1.3118, + "step": 35297 + }, + { + "epoch": 0.45868141114249655, + "grad_norm": 0.31979990005493164, + "learning_rate": 0.0001082857848425376, + "loss": 1.2998, + "step": 35298 + }, + { + "epoch": 0.4586944056864124, + "grad_norm": 0.4205688238143921, + "learning_rate": 0.00010828318538062621, + "loss": 1.6669, + "step": 35299 + }, + { + "epoch": 0.4587074002303283, + "grad_norm": 0.4614083468914032, + "learning_rate": 0.00010828058591871483, + "loss": 1.4177, + "step": 35300 + }, + { + "epoch": 0.45872039477424414, + "grad_norm": 0.4444385766983032, + "learning_rate": 0.00010827798645680344, + "loss": 1.5298, + "step": 35301 + }, + { + "epoch": 0.45873338931816005, + "grad_norm": 0.4552019536495209, + "learning_rate": 0.00010827538699489205, + "loss": 1.3475, + "step": 35302 + }, + { + "epoch": 0.4587463838620759, + "grad_norm": 0.406088262796402, + "learning_rate": 0.00010827278753298066, + "loss": 1.3874, + "step": 35303 + }, + { + "epoch": 0.4587593784059918, + "grad_norm": 0.3940732181072235, + "learning_rate": 0.0001082701880710693, + "loss": 1.2402, + "step": 35304 + }, + { + "epoch": 0.45877237294990764, + "grad_norm": 0.3841974437236786, + "learning_rate": 0.00010826758860915791, + "loss": 1.3434, + "step": 35305 + }, + { + "epoch": 0.45878536749382354, + "grad_norm": 0.33626532554626465, + "learning_rate": 0.00010826498914724652, + "loss": 1.4824, + "step": 35306 + }, + { + "epoch": 0.4587983620377394, + "grad_norm": 0.3972049057483673, + "learning_rate": 0.00010826238968533514, + "loss": 1.2772, + "step": 35307 + }, + { + "epoch": 0.4588113565816553, + "grad_norm": 0.4353432357311249, + "learning_rate": 0.00010825979022342376, + "loss": 1.4568, + "step": 35308 + }, + { + "epoch": 0.45882435112557113, + "grad_norm": 0.44833609461784363, + "learning_rate": 0.00010825719076151237, + "loss": 1.5144, + "step": 35309 + }, + { + "epoch": 0.45883734566948703, + "grad_norm": 0.31149277091026306, + "learning_rate": 0.00010825459129960098, + "loss": 1.3918, + "step": 35310 + }, + { + "epoch": 0.4588503402134029, + "grad_norm": 0.35204797983169556, + "learning_rate": 0.00010825199183768959, + "loss": 1.3002, + "step": 35311 + }, + { + "epoch": 0.4588633347573188, + "grad_norm": 0.41535937786102295, + "learning_rate": 0.00010824939237577823, + "loss": 1.5458, + "step": 35312 + }, + { + "epoch": 0.4588763293012346, + "grad_norm": 0.35590389370918274, + "learning_rate": 0.00010824679291386683, + "loss": 1.3897, + "step": 35313 + }, + { + "epoch": 0.4588893238451505, + "grad_norm": 0.4094884693622589, + "learning_rate": 0.00010824419345195544, + "loss": 1.2885, + "step": 35314 + }, + { + "epoch": 0.45890231838906637, + "grad_norm": 0.5071133971214294, + "learning_rate": 0.00010824159399004408, + "loss": 1.4719, + "step": 35315 + }, + { + "epoch": 0.45891531293298227, + "grad_norm": 0.49353688955307007, + "learning_rate": 0.00010823899452813269, + "loss": 1.5224, + "step": 35316 + }, + { + "epoch": 0.4589283074768981, + "grad_norm": 0.4569607973098755, + "learning_rate": 0.0001082363950662213, + "loss": 1.4647, + "step": 35317 + }, + { + "epoch": 0.458941302020814, + "grad_norm": 0.4611174166202545, + "learning_rate": 0.00010823379560430991, + "loss": 1.4312, + "step": 35318 + }, + { + "epoch": 0.45895429656472986, + "grad_norm": 0.4620760679244995, + "learning_rate": 0.00010823119614239853, + "loss": 1.3677, + "step": 35319 + }, + { + "epoch": 0.45896729110864576, + "grad_norm": 0.6529434323310852, + "learning_rate": 0.00010822859668048715, + "loss": 1.4053, + "step": 35320 + }, + { + "epoch": 0.4589802856525616, + "grad_norm": 0.28861069679260254, + "learning_rate": 0.00010822599721857576, + "loss": 1.3218, + "step": 35321 + }, + { + "epoch": 0.4589932801964775, + "grad_norm": 0.4238835573196411, + "learning_rate": 0.00010822339775666437, + "loss": 1.299, + "step": 35322 + }, + { + "epoch": 0.45900627474039335, + "grad_norm": 0.2763981819152832, + "learning_rate": 0.000108220798294753, + "loss": 1.1626, + "step": 35323 + }, + { + "epoch": 0.45901926928430925, + "grad_norm": 0.43273213505744934, + "learning_rate": 0.00010821819883284162, + "loss": 1.4008, + "step": 35324 + }, + { + "epoch": 0.45903226382822515, + "grad_norm": 0.4475223422050476, + "learning_rate": 0.00010821559937093023, + "loss": 1.542, + "step": 35325 + }, + { + "epoch": 0.459045258372141, + "grad_norm": 0.39497828483581543, + "learning_rate": 0.00010821299990901882, + "loss": 1.428, + "step": 35326 + }, + { + "epoch": 0.4590582529160569, + "grad_norm": 0.46633705496788025, + "learning_rate": 0.00010821040044710746, + "loss": 1.3103, + "step": 35327 + }, + { + "epoch": 0.45907124745997274, + "grad_norm": 0.42853882908821106, + "learning_rate": 0.00010820780098519607, + "loss": 1.44, + "step": 35328 + }, + { + "epoch": 0.45908424200388864, + "grad_norm": 0.3508547246456146, + "learning_rate": 0.00010820520152328468, + "loss": 1.3196, + "step": 35329 + }, + { + "epoch": 0.4590972365478045, + "grad_norm": 0.34700489044189453, + "learning_rate": 0.0001082026020613733, + "loss": 1.3816, + "step": 35330 + }, + { + "epoch": 0.4591102310917204, + "grad_norm": 0.4014454185962677, + "learning_rate": 0.00010820000259946192, + "loss": 1.4173, + "step": 35331 + }, + { + "epoch": 0.45912322563563623, + "grad_norm": 0.43163809180259705, + "learning_rate": 0.00010819740313755053, + "loss": 1.3621, + "step": 35332 + }, + { + "epoch": 0.45913622017955213, + "grad_norm": 0.4336693286895752, + "learning_rate": 0.00010819480367563914, + "loss": 1.3109, + "step": 35333 + }, + { + "epoch": 0.459149214723468, + "grad_norm": 0.37387824058532715, + "learning_rate": 0.00010819220421372775, + "loss": 1.4071, + "step": 35334 + }, + { + "epoch": 0.4591622092673839, + "grad_norm": 0.375489205121994, + "learning_rate": 0.00010818960475181639, + "loss": 1.2277, + "step": 35335 + }, + { + "epoch": 0.4591752038112997, + "grad_norm": 0.4288072884082794, + "learning_rate": 0.000108187005289905, + "loss": 1.5621, + "step": 35336 + }, + { + "epoch": 0.4591881983552156, + "grad_norm": 0.37284740805625916, + "learning_rate": 0.00010818440582799361, + "loss": 1.153, + "step": 35337 + }, + { + "epoch": 0.45920119289913147, + "grad_norm": 0.389782577753067, + "learning_rate": 0.00010818180636608222, + "loss": 1.2462, + "step": 35338 + }, + { + "epoch": 0.4592141874430474, + "grad_norm": 0.36751076579093933, + "learning_rate": 0.00010817920690417085, + "loss": 1.2772, + "step": 35339 + }, + { + "epoch": 0.4592271819869632, + "grad_norm": 0.2914290726184845, + "learning_rate": 0.00010817660744225946, + "loss": 1.1934, + "step": 35340 + }, + { + "epoch": 0.4592401765308791, + "grad_norm": 0.46337658166885376, + "learning_rate": 0.00010817400798034807, + "loss": 1.4619, + "step": 35341 + }, + { + "epoch": 0.45925317107479496, + "grad_norm": 0.40661898255348206, + "learning_rate": 0.00010817140851843668, + "loss": 1.4287, + "step": 35342 + }, + { + "epoch": 0.45926616561871086, + "grad_norm": 0.37092944979667664, + "learning_rate": 0.0001081688090565253, + "loss": 1.3679, + "step": 35343 + }, + { + "epoch": 0.4592791601626267, + "grad_norm": 0.4093421995639801, + "learning_rate": 0.00010816620959461392, + "loss": 1.3896, + "step": 35344 + }, + { + "epoch": 0.4592921547065426, + "grad_norm": 0.35497674345970154, + "learning_rate": 0.00010816361013270253, + "loss": 1.3961, + "step": 35345 + }, + { + "epoch": 0.45930514925045846, + "grad_norm": 0.42267873883247375, + "learning_rate": 0.00010816101067079114, + "loss": 1.3523, + "step": 35346 + }, + { + "epoch": 0.45931814379437436, + "grad_norm": 0.31545206904411316, + "learning_rate": 0.00010815841120887978, + "loss": 1.2695, + "step": 35347 + }, + { + "epoch": 0.4593311383382902, + "grad_norm": 0.412085622549057, + "learning_rate": 0.00010815581174696839, + "loss": 1.5897, + "step": 35348 + }, + { + "epoch": 0.4593441328822061, + "grad_norm": 0.42375436425209045, + "learning_rate": 0.000108153212285057, + "loss": 1.2868, + "step": 35349 + }, + { + "epoch": 0.45935712742612195, + "grad_norm": 0.4174690544605255, + "learning_rate": 0.00010815061282314562, + "loss": 1.2312, + "step": 35350 + }, + { + "epoch": 0.45937012197003785, + "grad_norm": 0.4588826298713684, + "learning_rate": 0.00010814801336123423, + "loss": 1.4285, + "step": 35351 + }, + { + "epoch": 0.4593831165139537, + "grad_norm": 0.3641478419303894, + "learning_rate": 0.00010814541389932284, + "loss": 1.3225, + "step": 35352 + }, + { + "epoch": 0.4593961110578696, + "grad_norm": 0.33253809809684753, + "learning_rate": 0.00010814281443741146, + "loss": 1.2146, + "step": 35353 + }, + { + "epoch": 0.45940910560178544, + "grad_norm": 0.35771578550338745, + "learning_rate": 0.0001081402149755001, + "loss": 1.4171, + "step": 35354 + }, + { + "epoch": 0.45942210014570134, + "grad_norm": 0.32943418622016907, + "learning_rate": 0.00010813761551358869, + "loss": 1.292, + "step": 35355 + }, + { + "epoch": 0.4594350946896172, + "grad_norm": 0.49043115973472595, + "learning_rate": 0.0001081350160516773, + "loss": 1.4025, + "step": 35356 + }, + { + "epoch": 0.4594480892335331, + "grad_norm": 0.44581982493400574, + "learning_rate": 0.00010813241658976591, + "loss": 1.3602, + "step": 35357 + }, + { + "epoch": 0.45946108377744893, + "grad_norm": 0.40665319561958313, + "learning_rate": 0.00010812981712785455, + "loss": 1.3481, + "step": 35358 + }, + { + "epoch": 0.45947407832136483, + "grad_norm": 0.4171801805496216, + "learning_rate": 0.00010812721766594316, + "loss": 1.2928, + "step": 35359 + }, + { + "epoch": 0.4594870728652807, + "grad_norm": 0.44965749979019165, + "learning_rate": 0.00010812461820403177, + "loss": 1.543, + "step": 35360 + }, + { + "epoch": 0.4595000674091966, + "grad_norm": 0.433795690536499, + "learning_rate": 0.00010812201874212038, + "loss": 1.2605, + "step": 35361 + }, + { + "epoch": 0.4595130619531124, + "grad_norm": 0.43946996331214905, + "learning_rate": 0.00010811941928020901, + "loss": 1.4665, + "step": 35362 + }, + { + "epoch": 0.4595260564970283, + "grad_norm": 0.45820093154907227, + "learning_rate": 0.00010811681981829762, + "loss": 1.3561, + "step": 35363 + }, + { + "epoch": 0.45953905104094417, + "grad_norm": 0.4809021055698395, + "learning_rate": 0.00010811422035638623, + "loss": 1.4838, + "step": 35364 + }, + { + "epoch": 0.45955204558486007, + "grad_norm": 0.3748052716255188, + "learning_rate": 0.00010811162089447484, + "loss": 1.2569, + "step": 35365 + }, + { + "epoch": 0.4595650401287759, + "grad_norm": 0.3864237070083618, + "learning_rate": 0.00010810902143256348, + "loss": 1.3356, + "step": 35366 + }, + { + "epoch": 0.4595780346726918, + "grad_norm": 0.25659817457199097, + "learning_rate": 0.00010810642197065209, + "loss": 1.3025, + "step": 35367 + }, + { + "epoch": 0.45959102921660766, + "grad_norm": 0.3454335629940033, + "learning_rate": 0.00010810382250874069, + "loss": 1.4603, + "step": 35368 + }, + { + "epoch": 0.45960402376052356, + "grad_norm": 0.45242515206336975, + "learning_rate": 0.0001081012230468293, + "loss": 1.4749, + "step": 35369 + }, + { + "epoch": 0.4596170183044394, + "grad_norm": 0.3889758884906769, + "learning_rate": 0.00010809862358491794, + "loss": 1.4544, + "step": 35370 + }, + { + "epoch": 0.4596300128483553, + "grad_norm": 0.39321789145469666, + "learning_rate": 0.00010809602412300655, + "loss": 1.4275, + "step": 35371 + }, + { + "epoch": 0.45964300739227115, + "grad_norm": 0.5043295621871948, + "learning_rate": 0.00010809342466109516, + "loss": 1.4521, + "step": 35372 + }, + { + "epoch": 0.45965600193618705, + "grad_norm": 0.2803584635257721, + "learning_rate": 0.00010809082519918377, + "loss": 1.3983, + "step": 35373 + }, + { + "epoch": 0.4596689964801029, + "grad_norm": 0.49717387557029724, + "learning_rate": 0.00010808822573727239, + "loss": 1.5593, + "step": 35374 + }, + { + "epoch": 0.4596819910240188, + "grad_norm": 0.3836407959461212, + "learning_rate": 0.000108085626275361, + "loss": 1.457, + "step": 35375 + }, + { + "epoch": 0.45969498556793464, + "grad_norm": 0.39679092168807983, + "learning_rate": 0.00010808302681344962, + "loss": 1.3771, + "step": 35376 + }, + { + "epoch": 0.45970798011185054, + "grad_norm": 0.33145424723625183, + "learning_rate": 0.00010808042735153823, + "loss": 1.3228, + "step": 35377 + }, + { + "epoch": 0.4597209746557664, + "grad_norm": 0.48426875472068787, + "learning_rate": 0.00010807782788962686, + "loss": 1.2804, + "step": 35378 + }, + { + "epoch": 0.4597339691996823, + "grad_norm": 0.3295377194881439, + "learning_rate": 0.00010807522842771547, + "loss": 1.1717, + "step": 35379 + }, + { + "epoch": 0.45974696374359814, + "grad_norm": 0.3442242443561554, + "learning_rate": 0.00010807262896580409, + "loss": 1.3973, + "step": 35380 + }, + { + "epoch": 0.45975995828751404, + "grad_norm": 0.40900155901908875, + "learning_rate": 0.00010807002950389268, + "loss": 1.3272, + "step": 35381 + }, + { + "epoch": 0.4597729528314299, + "grad_norm": 0.4533456861972809, + "learning_rate": 0.00010806743004198132, + "loss": 1.3745, + "step": 35382 + }, + { + "epoch": 0.4597859473753458, + "grad_norm": 0.4188876748085022, + "learning_rate": 0.00010806483058006993, + "loss": 1.4518, + "step": 35383 + }, + { + "epoch": 0.45979894191926163, + "grad_norm": 0.4020141065120697, + "learning_rate": 0.00010806223111815854, + "loss": 1.3151, + "step": 35384 + }, + { + "epoch": 0.45981193646317753, + "grad_norm": 0.44772377610206604, + "learning_rate": 0.00010805963165624715, + "loss": 1.4599, + "step": 35385 + }, + { + "epoch": 0.4598249310070934, + "grad_norm": 0.4504566788673401, + "learning_rate": 0.00010805703219433578, + "loss": 1.5226, + "step": 35386 + }, + { + "epoch": 0.4598379255510093, + "grad_norm": 0.3728441894054413, + "learning_rate": 0.00010805443273242439, + "loss": 1.312, + "step": 35387 + }, + { + "epoch": 0.4598509200949251, + "grad_norm": 0.3854468762874603, + "learning_rate": 0.000108051833270513, + "loss": 1.2771, + "step": 35388 + }, + { + "epoch": 0.459863914638841, + "grad_norm": 0.40002134442329407, + "learning_rate": 0.00010804923380860164, + "loss": 1.4039, + "step": 35389 + }, + { + "epoch": 0.45987690918275687, + "grad_norm": 0.3797298073768616, + "learning_rate": 0.00010804663434669025, + "loss": 1.3823, + "step": 35390 + }, + { + "epoch": 0.45988990372667277, + "grad_norm": 0.3901807963848114, + "learning_rate": 0.00010804403488477886, + "loss": 1.4519, + "step": 35391 + }, + { + "epoch": 0.4599028982705886, + "grad_norm": 0.48419633507728577, + "learning_rate": 0.00010804143542286747, + "loss": 1.5055, + "step": 35392 + }, + { + "epoch": 0.4599158928145045, + "grad_norm": 0.31742924451828003, + "learning_rate": 0.0001080388359609561, + "loss": 1.2731, + "step": 35393 + }, + { + "epoch": 0.45992888735842036, + "grad_norm": 0.3422873020172119, + "learning_rate": 0.0001080362364990447, + "loss": 1.4728, + "step": 35394 + }, + { + "epoch": 0.45994188190233626, + "grad_norm": 0.4367825388908386, + "learning_rate": 0.00010803363703713332, + "loss": 1.4299, + "step": 35395 + }, + { + "epoch": 0.4599548764462521, + "grad_norm": 0.3603319525718689, + "learning_rate": 0.00010803103757522193, + "loss": 1.394, + "step": 35396 + }, + { + "epoch": 0.459967870990168, + "grad_norm": 0.2595377266407013, + "learning_rate": 0.00010802843811331055, + "loss": 1.2498, + "step": 35397 + }, + { + "epoch": 0.45998086553408385, + "grad_norm": 0.3504444658756256, + "learning_rate": 0.00010802583865139916, + "loss": 1.2118, + "step": 35398 + }, + { + "epoch": 0.45999386007799975, + "grad_norm": 0.3181664049625397, + "learning_rate": 0.00010802323918948777, + "loss": 1.2841, + "step": 35399 + }, + { + "epoch": 0.46000685462191565, + "grad_norm": 0.33218568563461304, + "learning_rate": 0.00010802063972757639, + "loss": 1.3559, + "step": 35400 + }, + { + "epoch": 0.4600198491658315, + "grad_norm": 0.4189182221889496, + "learning_rate": 0.00010801804026566502, + "loss": 1.2489, + "step": 35401 + }, + { + "epoch": 0.4600328437097474, + "grad_norm": 0.5004599690437317, + "learning_rate": 0.00010801544080375363, + "loss": 1.4222, + "step": 35402 + }, + { + "epoch": 0.46004583825366324, + "grad_norm": 0.3731621503829956, + "learning_rate": 0.00010801284134184225, + "loss": 1.4027, + "step": 35403 + }, + { + "epoch": 0.46005883279757914, + "grad_norm": 0.45000743865966797, + "learning_rate": 0.00010801024187993086, + "loss": 1.3806, + "step": 35404 + }, + { + "epoch": 0.460071827341495, + "grad_norm": 0.3740728497505188, + "learning_rate": 0.00010800764241801948, + "loss": 1.2653, + "step": 35405 + }, + { + "epoch": 0.4600848218854109, + "grad_norm": 0.3123493194580078, + "learning_rate": 0.00010800504295610809, + "loss": 1.2877, + "step": 35406 + }, + { + "epoch": 0.46009781642932673, + "grad_norm": 0.4068834185600281, + "learning_rate": 0.0001080024434941967, + "loss": 1.4178, + "step": 35407 + }, + { + "epoch": 0.46011081097324263, + "grad_norm": 0.37257692217826843, + "learning_rate": 0.00010799984403228531, + "loss": 1.3142, + "step": 35408 + }, + { + "epoch": 0.4601238055171585, + "grad_norm": 0.32890579104423523, + "learning_rate": 0.00010799724457037395, + "loss": 1.4415, + "step": 35409 + }, + { + "epoch": 0.4601368000610744, + "grad_norm": 0.3722566068172455, + "learning_rate": 0.00010799464510846255, + "loss": 1.2349, + "step": 35410 + }, + { + "epoch": 0.4601497946049902, + "grad_norm": 0.3525223433971405, + "learning_rate": 0.00010799204564655116, + "loss": 1.4428, + "step": 35411 + }, + { + "epoch": 0.4601627891489061, + "grad_norm": 0.34788239002227783, + "learning_rate": 0.00010798944618463977, + "loss": 1.4219, + "step": 35412 + }, + { + "epoch": 0.46017578369282197, + "grad_norm": 0.34184110164642334, + "learning_rate": 0.00010798684672272841, + "loss": 1.4317, + "step": 35413 + }, + { + "epoch": 0.46018877823673787, + "grad_norm": 0.4213789105415344, + "learning_rate": 0.00010798424726081702, + "loss": 1.2621, + "step": 35414 + }, + { + "epoch": 0.4602017727806537, + "grad_norm": 0.3489427864551544, + "learning_rate": 0.00010798164779890563, + "loss": 1.3305, + "step": 35415 + }, + { + "epoch": 0.4602147673245696, + "grad_norm": 0.4887595772743225, + "learning_rate": 0.00010797904833699424, + "loss": 1.2549, + "step": 35416 + }, + { + "epoch": 0.46022776186848546, + "grad_norm": 0.40959304571151733, + "learning_rate": 0.00010797644887508287, + "loss": 1.2858, + "step": 35417 + }, + { + "epoch": 0.46024075641240136, + "grad_norm": 0.39794984459877014, + "learning_rate": 0.00010797384941317148, + "loss": 1.6147, + "step": 35418 + }, + { + "epoch": 0.4602537509563172, + "grad_norm": 0.36695581674575806, + "learning_rate": 0.00010797124995126009, + "loss": 1.3242, + "step": 35419 + }, + { + "epoch": 0.4602667455002331, + "grad_norm": 0.37768495082855225, + "learning_rate": 0.0001079686504893487, + "loss": 1.3552, + "step": 35420 + }, + { + "epoch": 0.46027974004414896, + "grad_norm": 0.418940931558609, + "learning_rate": 0.00010796605102743734, + "loss": 1.4552, + "step": 35421 + }, + { + "epoch": 0.46029273458806486, + "grad_norm": 0.4219701886177063, + "learning_rate": 0.00010796345156552593, + "loss": 1.4322, + "step": 35422 + }, + { + "epoch": 0.4603057291319807, + "grad_norm": 0.335228830575943, + "learning_rate": 0.00010796085210361455, + "loss": 1.2513, + "step": 35423 + }, + { + "epoch": 0.4603187236758966, + "grad_norm": 0.3027455806732178, + "learning_rate": 0.00010795825264170318, + "loss": 1.6087, + "step": 35424 + }, + { + "epoch": 0.46033171821981245, + "grad_norm": 0.40976259112358093, + "learning_rate": 0.0001079556531797918, + "loss": 1.3497, + "step": 35425 + }, + { + "epoch": 0.46034471276372835, + "grad_norm": 0.35381394624710083, + "learning_rate": 0.0001079530537178804, + "loss": 1.1744, + "step": 35426 + }, + { + "epoch": 0.4603577073076442, + "grad_norm": 0.35212162137031555, + "learning_rate": 0.00010795045425596902, + "loss": 1.2204, + "step": 35427 + }, + { + "epoch": 0.4603707018515601, + "grad_norm": 0.330342173576355, + "learning_rate": 0.00010794785479405764, + "loss": 1.34, + "step": 35428 + }, + { + "epoch": 0.46038369639547594, + "grad_norm": 0.44823870062828064, + "learning_rate": 0.00010794525533214625, + "loss": 1.3155, + "step": 35429 + }, + { + "epoch": 0.46039669093939184, + "grad_norm": 0.5320379137992859, + "learning_rate": 0.00010794265587023486, + "loss": 1.4441, + "step": 35430 + }, + { + "epoch": 0.4604096854833077, + "grad_norm": 0.4652509391307831, + "learning_rate": 0.00010794005640832347, + "loss": 1.5682, + "step": 35431 + }, + { + "epoch": 0.4604226800272236, + "grad_norm": 0.45554524660110474, + "learning_rate": 0.00010793745694641211, + "loss": 1.3535, + "step": 35432 + }, + { + "epoch": 0.46043567457113943, + "grad_norm": 0.4099522531032562, + "learning_rate": 0.00010793485748450072, + "loss": 1.4467, + "step": 35433 + }, + { + "epoch": 0.46044866911505533, + "grad_norm": 0.4228973984718323, + "learning_rate": 0.00010793225802258933, + "loss": 1.7375, + "step": 35434 + }, + { + "epoch": 0.4604616636589712, + "grad_norm": 0.44818004965782166, + "learning_rate": 0.00010792965856067793, + "loss": 1.3276, + "step": 35435 + }, + { + "epoch": 0.4604746582028871, + "grad_norm": 0.3492596447467804, + "learning_rate": 0.00010792705909876657, + "loss": 1.1877, + "step": 35436 + }, + { + "epoch": 0.4604876527468029, + "grad_norm": 0.3943553864955902, + "learning_rate": 0.00010792445963685518, + "loss": 1.6689, + "step": 35437 + }, + { + "epoch": 0.4605006472907188, + "grad_norm": 0.41357937455177307, + "learning_rate": 0.00010792186017494379, + "loss": 1.3888, + "step": 35438 + }, + { + "epoch": 0.46051364183463467, + "grad_norm": 0.43631890416145325, + "learning_rate": 0.0001079192607130324, + "loss": 1.5142, + "step": 35439 + }, + { + "epoch": 0.46052663637855057, + "grad_norm": 0.41835543513298035, + "learning_rate": 0.00010791666125112103, + "loss": 1.318, + "step": 35440 + }, + { + "epoch": 0.4605396309224664, + "grad_norm": 0.4383360743522644, + "learning_rate": 0.00010791406178920964, + "loss": 1.4763, + "step": 35441 + }, + { + "epoch": 0.4605526254663823, + "grad_norm": 0.29719245433807373, + "learning_rate": 0.00010791146232729825, + "loss": 1.26, + "step": 35442 + }, + { + "epoch": 0.46056562001029816, + "grad_norm": 0.3552534580230713, + "learning_rate": 0.00010790886286538686, + "loss": 1.2009, + "step": 35443 + }, + { + "epoch": 0.46057861455421406, + "grad_norm": 0.34179362654685974, + "learning_rate": 0.0001079062634034755, + "loss": 1.1932, + "step": 35444 + }, + { + "epoch": 0.4605916090981299, + "grad_norm": 0.4392559826374054, + "learning_rate": 0.00010790366394156411, + "loss": 1.4365, + "step": 35445 + }, + { + "epoch": 0.4606046036420458, + "grad_norm": 0.442568838596344, + "learning_rate": 0.00010790106447965272, + "loss": 1.4342, + "step": 35446 + }, + { + "epoch": 0.46061759818596165, + "grad_norm": 0.385448694229126, + "learning_rate": 0.00010789846501774133, + "loss": 1.3128, + "step": 35447 + }, + { + "epoch": 0.46063059272987755, + "grad_norm": 0.3692645728588104, + "learning_rate": 0.00010789586555582995, + "loss": 1.2831, + "step": 35448 + }, + { + "epoch": 0.4606435872737934, + "grad_norm": 0.3242857754230499, + "learning_rate": 0.00010789326609391857, + "loss": 1.2167, + "step": 35449 + }, + { + "epoch": 0.4606565818177093, + "grad_norm": 0.34786468744277954, + "learning_rate": 0.00010789066663200718, + "loss": 1.1588, + "step": 35450 + }, + { + "epoch": 0.46066957636162514, + "grad_norm": 0.4622904658317566, + "learning_rate": 0.00010788806717009579, + "loss": 1.3661, + "step": 35451 + }, + { + "epoch": 0.46068257090554104, + "grad_norm": 0.36055120825767517, + "learning_rate": 0.00010788546770818441, + "loss": 1.4405, + "step": 35452 + }, + { + "epoch": 0.4606955654494569, + "grad_norm": 0.2418884038925171, + "learning_rate": 0.00010788286824627302, + "loss": 1.246, + "step": 35453 + }, + { + "epoch": 0.4607085599933728, + "grad_norm": 0.4132326543331146, + "learning_rate": 0.00010788026878436163, + "loss": 1.3699, + "step": 35454 + }, + { + "epoch": 0.46072155453728864, + "grad_norm": 0.38137102127075195, + "learning_rate": 0.00010787766932245024, + "loss": 1.5493, + "step": 35455 + }, + { + "epoch": 0.46073454908120454, + "grad_norm": 0.3754677474498749, + "learning_rate": 0.00010787506986053888, + "loss": 1.3205, + "step": 35456 + }, + { + "epoch": 0.4607475436251204, + "grad_norm": 0.356577068567276, + "learning_rate": 0.0001078724703986275, + "loss": 1.3615, + "step": 35457 + }, + { + "epoch": 0.4607605381690363, + "grad_norm": 0.3879176676273346, + "learning_rate": 0.0001078698709367161, + "loss": 1.6491, + "step": 35458 + }, + { + "epoch": 0.4607735327129521, + "grad_norm": 0.4713696539402008, + "learning_rate": 0.00010786727147480472, + "loss": 1.5473, + "step": 35459 + }, + { + "epoch": 0.46078652725686803, + "grad_norm": 0.30050498247146606, + "learning_rate": 0.00010786467201289334, + "loss": 1.4121, + "step": 35460 + }, + { + "epoch": 0.4607995218007839, + "grad_norm": 0.3920830488204956, + "learning_rate": 0.00010786207255098195, + "loss": 1.3375, + "step": 35461 + }, + { + "epoch": 0.4608125163446998, + "grad_norm": 0.435604989528656, + "learning_rate": 0.00010785947308907056, + "loss": 1.3839, + "step": 35462 + }, + { + "epoch": 0.4608255108886156, + "grad_norm": 0.4181916117668152, + "learning_rate": 0.0001078568736271592, + "loss": 1.4135, + "step": 35463 + }, + { + "epoch": 0.4608385054325315, + "grad_norm": 0.36078110337257385, + "learning_rate": 0.0001078542741652478, + "loss": 1.4142, + "step": 35464 + }, + { + "epoch": 0.46085149997644737, + "grad_norm": 0.3596632182598114, + "learning_rate": 0.00010785167470333641, + "loss": 1.5991, + "step": 35465 + }, + { + "epoch": 0.46086449452036327, + "grad_norm": 0.5564031004905701, + "learning_rate": 0.00010784907524142502, + "loss": 1.4019, + "step": 35466 + }, + { + "epoch": 0.4608774890642791, + "grad_norm": 0.36761605739593506, + "learning_rate": 0.00010784647577951366, + "loss": 1.3271, + "step": 35467 + }, + { + "epoch": 0.460890483608195, + "grad_norm": 0.4357858896255493, + "learning_rate": 0.00010784387631760227, + "loss": 1.2326, + "step": 35468 + }, + { + "epoch": 0.46090347815211086, + "grad_norm": 0.395498126745224, + "learning_rate": 0.00010784127685569088, + "loss": 1.3378, + "step": 35469 + }, + { + "epoch": 0.46091647269602676, + "grad_norm": 0.3968724310398102, + "learning_rate": 0.00010783867739377949, + "loss": 1.3906, + "step": 35470 + }, + { + "epoch": 0.4609294672399426, + "grad_norm": 0.4514101445674896, + "learning_rate": 0.00010783607793186811, + "loss": 1.408, + "step": 35471 + }, + { + "epoch": 0.4609424617838585, + "grad_norm": 0.3034187853336334, + "learning_rate": 0.00010783347846995673, + "loss": 1.5912, + "step": 35472 + }, + { + "epoch": 0.46095545632777435, + "grad_norm": 0.43063607811927795, + "learning_rate": 0.00010783087900804534, + "loss": 1.37, + "step": 35473 + }, + { + "epoch": 0.46096845087169025, + "grad_norm": 0.4944630265235901, + "learning_rate": 0.00010782827954613395, + "loss": 1.4736, + "step": 35474 + }, + { + "epoch": 0.4609814454156061, + "grad_norm": 0.35504454374313354, + "learning_rate": 0.00010782568008422259, + "loss": 1.4337, + "step": 35475 + }, + { + "epoch": 0.460994439959522, + "grad_norm": 0.4347068965435028, + "learning_rate": 0.0001078230806223112, + "loss": 1.6104, + "step": 35476 + }, + { + "epoch": 0.4610074345034379, + "grad_norm": 0.4077186584472656, + "learning_rate": 0.0001078204811603998, + "loss": 1.3612, + "step": 35477 + }, + { + "epoch": 0.46102042904735374, + "grad_norm": 0.42140883207321167, + "learning_rate": 0.0001078178816984884, + "loss": 1.2237, + "step": 35478 + }, + { + "epoch": 0.46103342359126964, + "grad_norm": 0.39524775743484497, + "learning_rate": 0.00010781528223657704, + "loss": 1.2876, + "step": 35479 + }, + { + "epoch": 0.4610464181351855, + "grad_norm": 0.4483894109725952, + "learning_rate": 0.00010781268277466565, + "loss": 1.5246, + "step": 35480 + }, + { + "epoch": 0.4610594126791014, + "grad_norm": 0.3846283555030823, + "learning_rate": 0.00010781008331275426, + "loss": 1.3072, + "step": 35481 + }, + { + "epoch": 0.46107240722301723, + "grad_norm": 0.4728820025920868, + "learning_rate": 0.00010780748385084288, + "loss": 1.3395, + "step": 35482 + }, + { + "epoch": 0.46108540176693313, + "grad_norm": 0.5418142676353455, + "learning_rate": 0.0001078048843889315, + "loss": 1.5274, + "step": 35483 + }, + { + "epoch": 0.461098396310849, + "grad_norm": 0.4721275568008423, + "learning_rate": 0.00010780228492702011, + "loss": 1.3512, + "step": 35484 + }, + { + "epoch": 0.4611113908547649, + "grad_norm": 0.4507755637168884, + "learning_rate": 0.00010779968546510872, + "loss": 1.564, + "step": 35485 + }, + { + "epoch": 0.4611243853986807, + "grad_norm": 0.3780573606491089, + "learning_rate": 0.00010779708600319733, + "loss": 1.2574, + "step": 35486 + }, + { + "epoch": 0.4611373799425966, + "grad_norm": 0.40787768363952637, + "learning_rate": 0.00010779448654128597, + "loss": 1.2394, + "step": 35487 + }, + { + "epoch": 0.46115037448651247, + "grad_norm": 0.4653986096382141, + "learning_rate": 0.00010779188707937458, + "loss": 1.3631, + "step": 35488 + }, + { + "epoch": 0.46116336903042837, + "grad_norm": 0.42890244722366333, + "learning_rate": 0.00010778928761746319, + "loss": 1.4325, + "step": 35489 + }, + { + "epoch": 0.4611763635743442, + "grad_norm": 0.4308032989501953, + "learning_rate": 0.00010778668815555179, + "loss": 1.4119, + "step": 35490 + }, + { + "epoch": 0.4611893581182601, + "grad_norm": 0.2435729205608368, + "learning_rate": 0.00010778408869364043, + "loss": 1.3705, + "step": 35491 + }, + { + "epoch": 0.46120235266217596, + "grad_norm": 0.37892818450927734, + "learning_rate": 0.00010778148923172904, + "loss": 1.6448, + "step": 35492 + }, + { + "epoch": 0.46121534720609186, + "grad_norm": 0.3605073094367981, + "learning_rate": 0.00010777888976981765, + "loss": 1.1891, + "step": 35493 + }, + { + "epoch": 0.4612283417500077, + "grad_norm": 0.43432673811912537, + "learning_rate": 0.00010777629030790626, + "loss": 1.3704, + "step": 35494 + }, + { + "epoch": 0.4612413362939236, + "grad_norm": 0.42827916145324707, + "learning_rate": 0.00010777369084599489, + "loss": 1.5073, + "step": 35495 + }, + { + "epoch": 0.46125433083783945, + "grad_norm": 0.3474370241165161, + "learning_rate": 0.0001077710913840835, + "loss": 1.2173, + "step": 35496 + }, + { + "epoch": 0.46126732538175536, + "grad_norm": 0.39356929063796997, + "learning_rate": 0.00010776849192217211, + "loss": 1.3685, + "step": 35497 + }, + { + "epoch": 0.4612803199256712, + "grad_norm": 0.31644105911254883, + "learning_rate": 0.00010776589246026075, + "loss": 1.3982, + "step": 35498 + }, + { + "epoch": 0.4612933144695871, + "grad_norm": 0.41147375106811523, + "learning_rate": 0.00010776329299834936, + "loss": 1.2236, + "step": 35499 + }, + { + "epoch": 0.46130630901350295, + "grad_norm": 0.4182385802268982, + "learning_rate": 0.00010776069353643797, + "loss": 1.3749, + "step": 35500 + }, + { + "epoch": 0.46131930355741885, + "grad_norm": 0.3499833643436432, + "learning_rate": 0.00010775809407452658, + "loss": 1.4099, + "step": 35501 + }, + { + "epoch": 0.4613322981013347, + "grad_norm": 0.49604490399360657, + "learning_rate": 0.0001077554946126152, + "loss": 1.4814, + "step": 35502 + }, + { + "epoch": 0.4613452926452506, + "grad_norm": 0.4186013340950012, + "learning_rate": 0.00010775289515070381, + "loss": 1.3458, + "step": 35503 + }, + { + "epoch": 0.46135828718916644, + "grad_norm": 0.5104401111602783, + "learning_rate": 0.00010775029568879242, + "loss": 1.4289, + "step": 35504 + }, + { + "epoch": 0.46137128173308234, + "grad_norm": 0.31747984886169434, + "learning_rate": 0.00010774769622688104, + "loss": 1.3687, + "step": 35505 + }, + { + "epoch": 0.4613842762769982, + "grad_norm": 0.4271358251571655, + "learning_rate": 0.00010774509676496966, + "loss": 1.361, + "step": 35506 + }, + { + "epoch": 0.4613972708209141, + "grad_norm": 0.26651349663734436, + "learning_rate": 0.00010774249730305827, + "loss": 1.3158, + "step": 35507 + }, + { + "epoch": 0.46141026536482993, + "grad_norm": 0.47682732343673706, + "learning_rate": 0.00010773989784114688, + "loss": 1.4754, + "step": 35508 + }, + { + "epoch": 0.46142325990874583, + "grad_norm": 0.45204803347587585, + "learning_rate": 0.00010773729837923549, + "loss": 1.309, + "step": 35509 + }, + { + "epoch": 0.4614362544526617, + "grad_norm": 0.386753648519516, + "learning_rate": 0.00010773469891732413, + "loss": 1.402, + "step": 35510 + }, + { + "epoch": 0.4614492489965776, + "grad_norm": 0.475290447473526, + "learning_rate": 0.00010773209945541274, + "loss": 1.2681, + "step": 35511 + }, + { + "epoch": 0.4614622435404934, + "grad_norm": 0.3560897409915924, + "learning_rate": 0.00010772949999350135, + "loss": 1.2405, + "step": 35512 + }, + { + "epoch": 0.4614752380844093, + "grad_norm": 0.4530765116214752, + "learning_rate": 0.00010772690053158996, + "loss": 1.4329, + "step": 35513 + }, + { + "epoch": 0.46148823262832517, + "grad_norm": 0.3111562728881836, + "learning_rate": 0.00010772430106967859, + "loss": 1.1261, + "step": 35514 + }, + { + "epoch": 0.46150122717224107, + "grad_norm": 0.39080044627189636, + "learning_rate": 0.0001077217016077672, + "loss": 1.399, + "step": 35515 + }, + { + "epoch": 0.4615142217161569, + "grad_norm": 0.44555526971817017, + "learning_rate": 0.00010771910214585581, + "loss": 1.4906, + "step": 35516 + }, + { + "epoch": 0.4615272162600728, + "grad_norm": 0.40763089060783386, + "learning_rate": 0.00010771650268394442, + "loss": 1.4015, + "step": 35517 + }, + { + "epoch": 0.46154021080398866, + "grad_norm": 0.4661819636821747, + "learning_rate": 0.00010771390322203306, + "loss": 1.5345, + "step": 35518 + }, + { + "epoch": 0.46155320534790456, + "grad_norm": 0.37947434186935425, + "learning_rate": 0.00010771130376012166, + "loss": 1.4025, + "step": 35519 + }, + { + "epoch": 0.4615661998918204, + "grad_norm": 0.40372148156166077, + "learning_rate": 0.00010770870429821027, + "loss": 1.2942, + "step": 35520 + }, + { + "epoch": 0.4615791944357363, + "grad_norm": 0.3758772015571594, + "learning_rate": 0.00010770610483629888, + "loss": 1.3351, + "step": 35521 + }, + { + "epoch": 0.46159218897965215, + "grad_norm": 0.40708017349243164, + "learning_rate": 0.00010770350537438752, + "loss": 1.4488, + "step": 35522 + }, + { + "epoch": 0.46160518352356805, + "grad_norm": 0.49547505378723145, + "learning_rate": 0.00010770090591247613, + "loss": 1.4185, + "step": 35523 + }, + { + "epoch": 0.4616181780674839, + "grad_norm": 0.41007938981056213, + "learning_rate": 0.00010769830645056474, + "loss": 1.5052, + "step": 35524 + }, + { + "epoch": 0.4616311726113998, + "grad_norm": 0.4089713394641876, + "learning_rate": 0.00010769570698865335, + "loss": 1.1844, + "step": 35525 + }, + { + "epoch": 0.46164416715531564, + "grad_norm": 0.3328966498374939, + "learning_rate": 0.00010769310752674197, + "loss": 1.5177, + "step": 35526 + }, + { + "epoch": 0.46165716169923154, + "grad_norm": 0.39055484533309937, + "learning_rate": 0.00010769050806483058, + "loss": 1.4707, + "step": 35527 + }, + { + "epoch": 0.4616701562431474, + "grad_norm": 0.441169410943985, + "learning_rate": 0.0001076879086029192, + "loss": 1.423, + "step": 35528 + }, + { + "epoch": 0.4616831507870633, + "grad_norm": 0.37846776843070984, + "learning_rate": 0.0001076853091410078, + "loss": 1.4166, + "step": 35529 + }, + { + "epoch": 0.46169614533097914, + "grad_norm": 0.4110094904899597, + "learning_rate": 0.00010768270967909644, + "loss": 1.2504, + "step": 35530 + }, + { + "epoch": 0.46170913987489504, + "grad_norm": 0.37566396594047546, + "learning_rate": 0.00010768011021718505, + "loss": 1.2148, + "step": 35531 + }, + { + "epoch": 0.4617221344188109, + "grad_norm": 0.4372609555721283, + "learning_rate": 0.00010767751075527365, + "loss": 1.3089, + "step": 35532 + }, + { + "epoch": 0.4617351289627268, + "grad_norm": 0.4025667905807495, + "learning_rate": 0.00010767491129336226, + "loss": 1.4338, + "step": 35533 + }, + { + "epoch": 0.4617481235066426, + "grad_norm": 0.37487006187438965, + "learning_rate": 0.0001076723118314509, + "loss": 1.2738, + "step": 35534 + }, + { + "epoch": 0.46176111805055853, + "grad_norm": 0.4419200122356415, + "learning_rate": 0.00010766971236953951, + "loss": 1.4552, + "step": 35535 + }, + { + "epoch": 0.4617741125944744, + "grad_norm": 0.3112601041793823, + "learning_rate": 0.00010766711290762812, + "loss": 1.2519, + "step": 35536 + }, + { + "epoch": 0.4617871071383903, + "grad_norm": 0.4411623775959015, + "learning_rate": 0.00010766451344571675, + "loss": 1.4343, + "step": 35537 + }, + { + "epoch": 0.4618001016823061, + "grad_norm": 0.4714076817035675, + "learning_rate": 0.00010766191398380536, + "loss": 1.4513, + "step": 35538 + }, + { + "epoch": 0.461813096226222, + "grad_norm": 0.44928380846977234, + "learning_rate": 0.00010765931452189397, + "loss": 1.4452, + "step": 35539 + }, + { + "epoch": 0.46182609077013786, + "grad_norm": 0.4142343997955322, + "learning_rate": 0.00010765671505998258, + "loss": 1.3852, + "step": 35540 + }, + { + "epoch": 0.46183908531405377, + "grad_norm": 0.41743364930152893, + "learning_rate": 0.00010765411559807122, + "loss": 1.4237, + "step": 35541 + }, + { + "epoch": 0.4618520798579696, + "grad_norm": 0.4155822694301605, + "learning_rate": 0.00010765151613615983, + "loss": 1.4403, + "step": 35542 + }, + { + "epoch": 0.4618650744018855, + "grad_norm": 0.2787812352180481, + "learning_rate": 0.00010764891667424844, + "loss": 1.0245, + "step": 35543 + }, + { + "epoch": 0.46187806894580136, + "grad_norm": 0.5130904316902161, + "learning_rate": 0.00010764631721233705, + "loss": 1.3957, + "step": 35544 + }, + { + "epoch": 0.46189106348971726, + "grad_norm": 0.40980926156044006, + "learning_rate": 0.00010764371775042568, + "loss": 1.2248, + "step": 35545 + }, + { + "epoch": 0.4619040580336331, + "grad_norm": 0.4229581356048584, + "learning_rate": 0.00010764111828851429, + "loss": 1.4666, + "step": 35546 + }, + { + "epoch": 0.461917052577549, + "grad_norm": 0.3763701319694519, + "learning_rate": 0.0001076385188266029, + "loss": 1.3778, + "step": 35547 + }, + { + "epoch": 0.46193004712146485, + "grad_norm": 0.43439486622810364, + "learning_rate": 0.00010763591936469151, + "loss": 1.351, + "step": 35548 + }, + { + "epoch": 0.46194304166538075, + "grad_norm": 0.3355864882469177, + "learning_rate": 0.00010763331990278013, + "loss": 1.2932, + "step": 35549 + }, + { + "epoch": 0.4619560362092966, + "grad_norm": 0.32369762659072876, + "learning_rate": 0.00010763072044086874, + "loss": 1.2564, + "step": 35550 + }, + { + "epoch": 0.4619690307532125, + "grad_norm": 0.42036017775535583, + "learning_rate": 0.00010762812097895735, + "loss": 1.4687, + "step": 35551 + }, + { + "epoch": 0.4619820252971284, + "grad_norm": 0.49597010016441345, + "learning_rate": 0.00010762552151704597, + "loss": 1.564, + "step": 35552 + }, + { + "epoch": 0.46199501984104424, + "grad_norm": 0.5177572965621948, + "learning_rate": 0.0001076229220551346, + "loss": 1.3751, + "step": 35553 + }, + { + "epoch": 0.46200801438496014, + "grad_norm": 0.35545286536216736, + "learning_rate": 0.00010762032259322321, + "loss": 1.3324, + "step": 35554 + }, + { + "epoch": 0.462021008928876, + "grad_norm": 0.4604887366294861, + "learning_rate": 0.00010761772313131183, + "loss": 1.4888, + "step": 35555 + }, + { + "epoch": 0.4620340034727919, + "grad_norm": 0.4378258287906647, + "learning_rate": 0.00010761512366940044, + "loss": 1.5625, + "step": 35556 + }, + { + "epoch": 0.46204699801670773, + "grad_norm": 0.499294251203537, + "learning_rate": 0.00010761252420748906, + "loss": 1.3961, + "step": 35557 + }, + { + "epoch": 0.46205999256062363, + "grad_norm": 0.4454141855239868, + "learning_rate": 0.00010760992474557767, + "loss": 1.3144, + "step": 35558 + }, + { + "epoch": 0.4620729871045395, + "grad_norm": 0.3458491861820221, + "learning_rate": 0.00010760732528366628, + "loss": 1.3078, + "step": 35559 + }, + { + "epoch": 0.4620859816484554, + "grad_norm": 0.2784202992916107, + "learning_rate": 0.0001076047258217549, + "loss": 1.2677, + "step": 35560 + }, + { + "epoch": 0.4620989761923712, + "grad_norm": 0.4437010586261749, + "learning_rate": 0.00010760212635984352, + "loss": 1.3901, + "step": 35561 + }, + { + "epoch": 0.4621119707362871, + "grad_norm": 0.42371848225593567, + "learning_rate": 0.00010759952689793213, + "loss": 1.398, + "step": 35562 + }, + { + "epoch": 0.46212496528020297, + "grad_norm": 0.45266059041023254, + "learning_rate": 0.00010759692743602074, + "loss": 1.3946, + "step": 35563 + }, + { + "epoch": 0.46213795982411887, + "grad_norm": 0.4476514458656311, + "learning_rate": 0.00010759432797410935, + "loss": 1.5033, + "step": 35564 + }, + { + "epoch": 0.4621509543680347, + "grad_norm": 0.3082617521286011, + "learning_rate": 0.00010759172851219799, + "loss": 1.361, + "step": 35565 + }, + { + "epoch": 0.4621639489119506, + "grad_norm": 0.29336756467819214, + "learning_rate": 0.0001075891290502866, + "loss": 1.1328, + "step": 35566 + }, + { + "epoch": 0.46217694345586646, + "grad_norm": 0.37239986658096313, + "learning_rate": 0.00010758652958837521, + "loss": 1.4044, + "step": 35567 + }, + { + "epoch": 0.46218993799978236, + "grad_norm": 0.4684375822544098, + "learning_rate": 0.00010758393012646382, + "loss": 1.0556, + "step": 35568 + }, + { + "epoch": 0.4622029325436982, + "grad_norm": 0.38781532645225525, + "learning_rate": 0.00010758133066455245, + "loss": 1.2834, + "step": 35569 + }, + { + "epoch": 0.4622159270876141, + "grad_norm": 0.4089552164077759, + "learning_rate": 0.00010757873120264106, + "loss": 1.5639, + "step": 35570 + }, + { + "epoch": 0.46222892163152995, + "grad_norm": 0.3284449577331543, + "learning_rate": 0.00010757613174072967, + "loss": 1.2763, + "step": 35571 + }, + { + "epoch": 0.46224191617544585, + "grad_norm": 0.3963087499141693, + "learning_rate": 0.0001075735322788183, + "loss": 1.2888, + "step": 35572 + }, + { + "epoch": 0.4622549107193617, + "grad_norm": 0.4717348515987396, + "learning_rate": 0.00010757093281690692, + "loss": 1.4842, + "step": 35573 + }, + { + "epoch": 0.4622679052632776, + "grad_norm": 0.383375346660614, + "learning_rate": 0.00010756833335499551, + "loss": 1.3755, + "step": 35574 + }, + { + "epoch": 0.46228089980719345, + "grad_norm": 0.34396892786026, + "learning_rate": 0.00010756573389308413, + "loss": 1.4222, + "step": 35575 + }, + { + "epoch": 0.46229389435110935, + "grad_norm": 0.44204697012901306, + "learning_rate": 0.00010756313443117276, + "loss": 1.3069, + "step": 35576 + }, + { + "epoch": 0.4623068888950252, + "grad_norm": 0.37377914786338806, + "learning_rate": 0.00010756053496926137, + "loss": 1.3046, + "step": 35577 + }, + { + "epoch": 0.4623198834389411, + "grad_norm": 0.29263922572135925, + "learning_rate": 0.00010755793550734999, + "loss": 1.3045, + "step": 35578 + }, + { + "epoch": 0.46233287798285694, + "grad_norm": 0.37501809000968933, + "learning_rate": 0.0001075553360454386, + "loss": 1.5368, + "step": 35579 + }, + { + "epoch": 0.46234587252677284, + "grad_norm": 0.3148528039455414, + "learning_rate": 0.00010755273658352722, + "loss": 1.4602, + "step": 35580 + }, + { + "epoch": 0.4623588670706887, + "grad_norm": 0.4157966375350952, + "learning_rate": 0.00010755013712161583, + "loss": 1.4566, + "step": 35581 + }, + { + "epoch": 0.4623718616146046, + "grad_norm": 0.5455573797225952, + "learning_rate": 0.00010754753765970444, + "loss": 1.1508, + "step": 35582 + }, + { + "epoch": 0.46238485615852043, + "grad_norm": 0.5336974859237671, + "learning_rate": 0.00010754493819779305, + "loss": 1.3638, + "step": 35583 + }, + { + "epoch": 0.46239785070243633, + "grad_norm": 0.4476361572742462, + "learning_rate": 0.00010754233873588169, + "loss": 1.541, + "step": 35584 + }, + { + "epoch": 0.4624108452463522, + "grad_norm": 0.2974816560745239, + "learning_rate": 0.0001075397392739703, + "loss": 1.4409, + "step": 35585 + }, + { + "epoch": 0.4624238397902681, + "grad_norm": 0.46337515115737915, + "learning_rate": 0.00010753713981205891, + "loss": 1.2421, + "step": 35586 + }, + { + "epoch": 0.4624368343341839, + "grad_norm": 0.4262804687023163, + "learning_rate": 0.00010753454035014751, + "loss": 1.3876, + "step": 35587 + }, + { + "epoch": 0.4624498288780998, + "grad_norm": 0.3468291461467743, + "learning_rate": 0.00010753194088823615, + "loss": 1.4673, + "step": 35588 + }, + { + "epoch": 0.46246282342201567, + "grad_norm": 0.36996781826019287, + "learning_rate": 0.00010752934142632476, + "loss": 1.3925, + "step": 35589 + }, + { + "epoch": 0.46247581796593157, + "grad_norm": 0.3559987246990204, + "learning_rate": 0.00010752674196441337, + "loss": 1.3626, + "step": 35590 + }, + { + "epoch": 0.4624888125098474, + "grad_norm": 0.41341087222099304, + "learning_rate": 0.00010752414250250198, + "loss": 1.4617, + "step": 35591 + }, + { + "epoch": 0.4625018070537633, + "grad_norm": 0.39191341400146484, + "learning_rate": 0.0001075215430405906, + "loss": 1.3471, + "step": 35592 + }, + { + "epoch": 0.46251480159767916, + "grad_norm": 0.36694279313087463, + "learning_rate": 0.00010751894357867922, + "loss": 1.5318, + "step": 35593 + }, + { + "epoch": 0.46252779614159506, + "grad_norm": 0.36124345660209656, + "learning_rate": 0.00010751634411676783, + "loss": 1.3142, + "step": 35594 + }, + { + "epoch": 0.4625407906855109, + "grad_norm": 0.3686319589614868, + "learning_rate": 0.00010751374465485644, + "loss": 1.6122, + "step": 35595 + }, + { + "epoch": 0.4625537852294268, + "grad_norm": 0.23160549998283386, + "learning_rate": 0.00010751114519294508, + "loss": 1.3645, + "step": 35596 + }, + { + "epoch": 0.46256677977334265, + "grad_norm": 0.44235989451408386, + "learning_rate": 0.00010750854573103369, + "loss": 1.3019, + "step": 35597 + }, + { + "epoch": 0.46257977431725855, + "grad_norm": 0.4268600642681122, + "learning_rate": 0.0001075059462691223, + "loss": 1.5811, + "step": 35598 + }, + { + "epoch": 0.4625927688611744, + "grad_norm": 0.3864820599555969, + "learning_rate": 0.0001075033468072109, + "loss": 1.3508, + "step": 35599 + }, + { + "epoch": 0.4626057634050903, + "grad_norm": 0.34672778844833374, + "learning_rate": 0.00010750074734529953, + "loss": 1.2393, + "step": 35600 + }, + { + "epoch": 0.46261875794900614, + "grad_norm": 0.4089890718460083, + "learning_rate": 0.00010749814788338815, + "loss": 1.3372, + "step": 35601 + }, + { + "epoch": 0.46263175249292204, + "grad_norm": 0.48859190940856934, + "learning_rate": 0.00010749554842147676, + "loss": 1.4937, + "step": 35602 + }, + { + "epoch": 0.4626447470368379, + "grad_norm": 0.3934830129146576, + "learning_rate": 0.00010749294895956537, + "loss": 1.3033, + "step": 35603 + }, + { + "epoch": 0.4626577415807538, + "grad_norm": 0.4286805987358093, + "learning_rate": 0.00010749034949765399, + "loss": 1.6952, + "step": 35604 + }, + { + "epoch": 0.46267073612466963, + "grad_norm": 0.41911569237709045, + "learning_rate": 0.0001074877500357426, + "loss": 1.4163, + "step": 35605 + }, + { + "epoch": 0.46268373066858554, + "grad_norm": 0.29294317960739136, + "learning_rate": 0.00010748515057383121, + "loss": 1.3951, + "step": 35606 + }, + { + "epoch": 0.4626967252125014, + "grad_norm": 0.3785363733768463, + "learning_rate": 0.00010748255111191982, + "loss": 1.3576, + "step": 35607 + }, + { + "epoch": 0.4627097197564173, + "grad_norm": 0.457661509513855, + "learning_rate": 0.00010747995165000846, + "loss": 1.3655, + "step": 35608 + }, + { + "epoch": 0.4627227143003331, + "grad_norm": 0.4276477098464966, + "learning_rate": 0.00010747735218809707, + "loss": 1.3633, + "step": 35609 + }, + { + "epoch": 0.462735708844249, + "grad_norm": 0.39906471967697144, + "learning_rate": 0.00010747475272618568, + "loss": 1.4687, + "step": 35610 + }, + { + "epoch": 0.4627487033881649, + "grad_norm": 0.3625788390636444, + "learning_rate": 0.00010747215326427431, + "loss": 1.5682, + "step": 35611 + }, + { + "epoch": 0.4627616979320808, + "grad_norm": 0.38163772225379944, + "learning_rate": 0.00010746955380236292, + "loss": 1.3378, + "step": 35612 + }, + { + "epoch": 0.4627746924759966, + "grad_norm": 0.47390687465667725, + "learning_rate": 0.00010746695434045153, + "loss": 1.3152, + "step": 35613 + }, + { + "epoch": 0.4627876870199125, + "grad_norm": 0.36197197437286377, + "learning_rate": 0.00010746435487854014, + "loss": 1.3028, + "step": 35614 + }, + { + "epoch": 0.46280068156382836, + "grad_norm": 0.4514126777648926, + "learning_rate": 0.00010746175541662878, + "loss": 1.5198, + "step": 35615 + }, + { + "epoch": 0.46281367610774427, + "grad_norm": 0.5091932415962219, + "learning_rate": 0.00010745915595471738, + "loss": 1.4468, + "step": 35616 + }, + { + "epoch": 0.4628266706516601, + "grad_norm": 0.38111212849617004, + "learning_rate": 0.00010745655649280599, + "loss": 1.4489, + "step": 35617 + }, + { + "epoch": 0.462839665195576, + "grad_norm": 0.4679569900035858, + "learning_rate": 0.0001074539570308946, + "loss": 1.5628, + "step": 35618 + }, + { + "epoch": 0.46285265973949186, + "grad_norm": 0.47226688265800476, + "learning_rate": 0.00010745135756898324, + "loss": 1.3734, + "step": 35619 + }, + { + "epoch": 0.46286565428340776, + "grad_norm": 0.49694281816482544, + "learning_rate": 0.00010744875810707185, + "loss": 1.5586, + "step": 35620 + }, + { + "epoch": 0.4628786488273236, + "grad_norm": 0.5413099527359009, + "learning_rate": 0.00010744615864516046, + "loss": 1.3559, + "step": 35621 + }, + { + "epoch": 0.4628916433712395, + "grad_norm": 0.3707817792892456, + "learning_rate": 0.00010744355918324907, + "loss": 1.3946, + "step": 35622 + }, + { + "epoch": 0.46290463791515535, + "grad_norm": 0.41101813316345215, + "learning_rate": 0.0001074409597213377, + "loss": 1.3852, + "step": 35623 + }, + { + "epoch": 0.46291763245907125, + "grad_norm": 0.2865454852581024, + "learning_rate": 0.0001074383602594263, + "loss": 1.1666, + "step": 35624 + }, + { + "epoch": 0.4629306270029871, + "grad_norm": 0.45384377241134644, + "learning_rate": 0.00010743576079751492, + "loss": 1.3639, + "step": 35625 + }, + { + "epoch": 0.462943621546903, + "grad_norm": 0.3580816984176636, + "learning_rate": 0.00010743316133560353, + "loss": 1.2505, + "step": 35626 + }, + { + "epoch": 0.46295661609081884, + "grad_norm": 0.458852082490921, + "learning_rate": 0.00010743056187369217, + "loss": 1.4553, + "step": 35627 + }, + { + "epoch": 0.46296961063473474, + "grad_norm": 0.41107693314552307, + "learning_rate": 0.00010742796241178076, + "loss": 1.4153, + "step": 35628 + }, + { + "epoch": 0.46298260517865064, + "grad_norm": 0.39565780758857727, + "learning_rate": 0.00010742536294986937, + "loss": 1.2137, + "step": 35629 + }, + { + "epoch": 0.4629955997225665, + "grad_norm": 0.40981370210647583, + "learning_rate": 0.00010742276348795798, + "loss": 1.2962, + "step": 35630 + }, + { + "epoch": 0.4630085942664824, + "grad_norm": 0.4478282332420349, + "learning_rate": 0.00010742016402604662, + "loss": 1.2749, + "step": 35631 + }, + { + "epoch": 0.46302158881039823, + "grad_norm": 0.5453441739082336, + "learning_rate": 0.00010741756456413523, + "loss": 1.6012, + "step": 35632 + }, + { + "epoch": 0.46303458335431413, + "grad_norm": 0.3136277198791504, + "learning_rate": 0.00010741496510222384, + "loss": 1.3389, + "step": 35633 + }, + { + "epoch": 0.46304757789823, + "grad_norm": 0.44794800877571106, + "learning_rate": 0.00010741236564031246, + "loss": 1.4151, + "step": 35634 + }, + { + "epoch": 0.4630605724421459, + "grad_norm": 0.42392364144325256, + "learning_rate": 0.00010740976617840108, + "loss": 1.3304, + "step": 35635 + }, + { + "epoch": 0.4630735669860617, + "grad_norm": 0.376111775636673, + "learning_rate": 0.00010740716671648969, + "loss": 1.4198, + "step": 35636 + }, + { + "epoch": 0.4630865615299776, + "grad_norm": 0.4805116653442383, + "learning_rate": 0.0001074045672545783, + "loss": 1.3806, + "step": 35637 + }, + { + "epoch": 0.46309955607389347, + "grad_norm": 0.4525004029273987, + "learning_rate": 0.00010740196779266691, + "loss": 1.3111, + "step": 35638 + }, + { + "epoch": 0.46311255061780937, + "grad_norm": 0.4817827641963959, + "learning_rate": 0.00010739936833075555, + "loss": 1.5681, + "step": 35639 + }, + { + "epoch": 0.4631255451617252, + "grad_norm": 0.3991628587245941, + "learning_rate": 0.00010739676886884416, + "loss": 1.1895, + "step": 35640 + }, + { + "epoch": 0.4631385397056411, + "grad_norm": 0.35473111271858215, + "learning_rate": 0.00010739416940693276, + "loss": 1.3662, + "step": 35641 + }, + { + "epoch": 0.46315153424955696, + "grad_norm": 0.5121421813964844, + "learning_rate": 0.00010739156994502137, + "loss": 1.5058, + "step": 35642 + }, + { + "epoch": 0.46316452879347286, + "grad_norm": 0.4408717155456543, + "learning_rate": 0.00010738897048311001, + "loss": 1.3839, + "step": 35643 + }, + { + "epoch": 0.4631775233373887, + "grad_norm": 0.3886417746543884, + "learning_rate": 0.00010738637102119862, + "loss": 1.5396, + "step": 35644 + }, + { + "epoch": 0.4631905178813046, + "grad_norm": 0.46628519892692566, + "learning_rate": 0.00010738377155928723, + "loss": 1.537, + "step": 35645 + }, + { + "epoch": 0.46320351242522045, + "grad_norm": 0.41789451241493225, + "learning_rate": 0.00010738117209737584, + "loss": 1.3968, + "step": 35646 + }, + { + "epoch": 0.46321650696913635, + "grad_norm": 0.346975177526474, + "learning_rate": 0.00010737857263546447, + "loss": 1.3174, + "step": 35647 + }, + { + "epoch": 0.4632295015130522, + "grad_norm": 0.41786059737205505, + "learning_rate": 0.00010737597317355308, + "loss": 1.3239, + "step": 35648 + }, + { + "epoch": 0.4632424960569681, + "grad_norm": 0.3134523034095764, + "learning_rate": 0.00010737337371164169, + "loss": 1.3524, + "step": 35649 + }, + { + "epoch": 0.46325549060088395, + "grad_norm": 0.34346750378608704, + "learning_rate": 0.00010737077424973032, + "loss": 1.5149, + "step": 35650 + }, + { + "epoch": 0.46326848514479985, + "grad_norm": 0.38218626379966736, + "learning_rate": 0.00010736817478781894, + "loss": 1.4093, + "step": 35651 + }, + { + "epoch": 0.4632814796887157, + "grad_norm": 0.36848539113998413, + "learning_rate": 0.00010736557532590755, + "loss": 1.4656, + "step": 35652 + }, + { + "epoch": 0.4632944742326316, + "grad_norm": 0.427184522151947, + "learning_rate": 0.00010736297586399616, + "loss": 1.4659, + "step": 35653 + }, + { + "epoch": 0.46330746877654744, + "grad_norm": 0.4232839047908783, + "learning_rate": 0.00010736037640208478, + "loss": 1.5128, + "step": 35654 + }, + { + "epoch": 0.46332046332046334, + "grad_norm": 0.42893972992897034, + "learning_rate": 0.00010735777694017339, + "loss": 1.367, + "step": 35655 + }, + { + "epoch": 0.4633334578643792, + "grad_norm": 0.5212001800537109, + "learning_rate": 0.000107355177478262, + "loss": 1.3552, + "step": 35656 + }, + { + "epoch": 0.4633464524082951, + "grad_norm": 0.3655027449131012, + "learning_rate": 0.00010735257801635062, + "loss": 1.5283, + "step": 35657 + }, + { + "epoch": 0.46335944695221093, + "grad_norm": 0.4473346471786499, + "learning_rate": 0.00010734997855443924, + "loss": 1.5149, + "step": 35658 + }, + { + "epoch": 0.46337244149612683, + "grad_norm": 0.3939672112464905, + "learning_rate": 0.00010734737909252785, + "loss": 1.2939, + "step": 35659 + }, + { + "epoch": 0.4633854360400427, + "grad_norm": 0.3965427279472351, + "learning_rate": 0.00010734477963061646, + "loss": 1.5198, + "step": 35660 + }, + { + "epoch": 0.4633984305839586, + "grad_norm": 0.3588354289531708, + "learning_rate": 0.00010734218016870507, + "loss": 1.3263, + "step": 35661 + }, + { + "epoch": 0.4634114251278744, + "grad_norm": 0.33331888914108276, + "learning_rate": 0.00010733958070679371, + "loss": 1.3672, + "step": 35662 + }, + { + "epoch": 0.4634244196717903, + "grad_norm": 0.28365492820739746, + "learning_rate": 0.00010733698124488232, + "loss": 1.4528, + "step": 35663 + }, + { + "epoch": 0.46343741421570617, + "grad_norm": 0.36620762944221497, + "learning_rate": 0.00010733438178297093, + "loss": 1.1314, + "step": 35664 + }, + { + "epoch": 0.46345040875962207, + "grad_norm": 0.21242588758468628, + "learning_rate": 0.00010733178232105954, + "loss": 1.2866, + "step": 35665 + }, + { + "epoch": 0.4634634033035379, + "grad_norm": 0.4560832381248474, + "learning_rate": 0.00010732918285914817, + "loss": 1.3328, + "step": 35666 + }, + { + "epoch": 0.4634763978474538, + "grad_norm": 0.46458005905151367, + "learning_rate": 0.00010732658339723678, + "loss": 1.2755, + "step": 35667 + }, + { + "epoch": 0.46348939239136966, + "grad_norm": 0.3883011043071747, + "learning_rate": 0.00010732398393532539, + "loss": 1.4119, + "step": 35668 + }, + { + "epoch": 0.46350238693528556, + "grad_norm": 0.31358370184898376, + "learning_rate": 0.000107321384473414, + "loss": 1.3884, + "step": 35669 + }, + { + "epoch": 0.4635153814792014, + "grad_norm": 0.3999238610267639, + "learning_rate": 0.00010731878501150262, + "loss": 1.4935, + "step": 35670 + }, + { + "epoch": 0.4635283760231173, + "grad_norm": 0.4081012010574341, + "learning_rate": 0.00010731618554959124, + "loss": 1.4539, + "step": 35671 + }, + { + "epoch": 0.46354137056703315, + "grad_norm": 0.4759165346622467, + "learning_rate": 0.00010731358608767985, + "loss": 1.4321, + "step": 35672 + }, + { + "epoch": 0.46355436511094905, + "grad_norm": 0.36935994029045105, + "learning_rate": 0.00010731098662576846, + "loss": 1.5067, + "step": 35673 + }, + { + "epoch": 0.4635673596548649, + "grad_norm": 0.3831806480884552, + "learning_rate": 0.0001073083871638571, + "loss": 1.1431, + "step": 35674 + }, + { + "epoch": 0.4635803541987808, + "grad_norm": 0.5320271253585815, + "learning_rate": 0.0001073057877019457, + "loss": 1.4181, + "step": 35675 + }, + { + "epoch": 0.46359334874269664, + "grad_norm": 0.47719433903694153, + "learning_rate": 0.00010730318824003432, + "loss": 1.4503, + "step": 35676 + }, + { + "epoch": 0.46360634328661254, + "grad_norm": 0.408518522977829, + "learning_rate": 0.00010730058877812293, + "loss": 1.4137, + "step": 35677 + }, + { + "epoch": 0.4636193378305284, + "grad_norm": 0.47981736063957214, + "learning_rate": 0.00010729798931621155, + "loss": 1.3976, + "step": 35678 + }, + { + "epoch": 0.4636323323744443, + "grad_norm": 0.4576692283153534, + "learning_rate": 0.00010729538985430016, + "loss": 1.4868, + "step": 35679 + }, + { + "epoch": 0.46364532691836013, + "grad_norm": 0.38052305579185486, + "learning_rate": 0.00010729279039238877, + "loss": 1.4702, + "step": 35680 + }, + { + "epoch": 0.46365832146227604, + "grad_norm": 0.4078505337238312, + "learning_rate": 0.00010729019093047739, + "loss": 1.345, + "step": 35681 + }, + { + "epoch": 0.4636713160061919, + "grad_norm": 0.40665605664253235, + "learning_rate": 0.00010728759146856602, + "loss": 1.2464, + "step": 35682 + }, + { + "epoch": 0.4636843105501078, + "grad_norm": 0.47068604826927185, + "learning_rate": 0.00010728499200665462, + "loss": 1.2225, + "step": 35683 + }, + { + "epoch": 0.4636973050940236, + "grad_norm": 0.366054892539978, + "learning_rate": 0.00010728239254474323, + "loss": 1.3867, + "step": 35684 + }, + { + "epoch": 0.4637102996379395, + "grad_norm": 0.45152339339256287, + "learning_rate": 0.00010727979308283187, + "loss": 1.4901, + "step": 35685 + }, + { + "epoch": 0.46372329418185537, + "grad_norm": 0.3877542316913605, + "learning_rate": 0.00010727719362092048, + "loss": 1.5086, + "step": 35686 + }, + { + "epoch": 0.4637362887257713, + "grad_norm": 0.40926992893218994, + "learning_rate": 0.00010727459415900909, + "loss": 1.4042, + "step": 35687 + }, + { + "epoch": 0.4637492832696871, + "grad_norm": 0.42088642716407776, + "learning_rate": 0.0001072719946970977, + "loss": 1.2724, + "step": 35688 + }, + { + "epoch": 0.463762277813603, + "grad_norm": 0.433689147233963, + "learning_rate": 0.00010726939523518633, + "loss": 1.3573, + "step": 35689 + }, + { + "epoch": 0.46377527235751886, + "grad_norm": 0.3792582154273987, + "learning_rate": 0.00010726679577327494, + "loss": 1.3351, + "step": 35690 + }, + { + "epoch": 0.46378826690143476, + "grad_norm": 0.3365780711174011, + "learning_rate": 0.00010726419631136355, + "loss": 1.381, + "step": 35691 + }, + { + "epoch": 0.4638012614453506, + "grad_norm": 0.39113849401474, + "learning_rate": 0.00010726159684945216, + "loss": 1.5173, + "step": 35692 + }, + { + "epoch": 0.4638142559892665, + "grad_norm": 0.5222766399383545, + "learning_rate": 0.0001072589973875408, + "loss": 1.4688, + "step": 35693 + }, + { + "epoch": 0.46382725053318236, + "grad_norm": 0.32109957933425903, + "learning_rate": 0.00010725639792562941, + "loss": 1.3709, + "step": 35694 + }, + { + "epoch": 0.46384024507709826, + "grad_norm": 0.4745495617389679, + "learning_rate": 0.00010725379846371802, + "loss": 1.3546, + "step": 35695 + }, + { + "epoch": 0.4638532396210141, + "grad_norm": 0.3927459120750427, + "learning_rate": 0.00010725119900180662, + "loss": 1.4778, + "step": 35696 + }, + { + "epoch": 0.46386623416493, + "grad_norm": 0.42750415205955505, + "learning_rate": 0.00010724859953989526, + "loss": 1.5684, + "step": 35697 + }, + { + "epoch": 0.46387922870884585, + "grad_norm": 0.2993466854095459, + "learning_rate": 0.00010724600007798387, + "loss": 1.2642, + "step": 35698 + }, + { + "epoch": 0.46389222325276175, + "grad_norm": 0.40829238295555115, + "learning_rate": 0.00010724340061607248, + "loss": 1.4845, + "step": 35699 + }, + { + "epoch": 0.4639052177966776, + "grad_norm": 0.4349399209022522, + "learning_rate": 0.00010724080115416109, + "loss": 1.3762, + "step": 35700 + }, + { + "epoch": 0.4639182123405935, + "grad_norm": 0.44317662715911865, + "learning_rate": 0.00010723820169224971, + "loss": 1.5711, + "step": 35701 + }, + { + "epoch": 0.46393120688450934, + "grad_norm": 0.4665480852127075, + "learning_rate": 0.00010723560223033832, + "loss": 1.4481, + "step": 35702 + }, + { + "epoch": 0.46394420142842524, + "grad_norm": 0.37910211086273193, + "learning_rate": 0.00010723300276842693, + "loss": 1.3125, + "step": 35703 + }, + { + "epoch": 0.46395719597234114, + "grad_norm": 0.3588291108608246, + "learning_rate": 0.00010723040330651555, + "loss": 1.4044, + "step": 35704 + }, + { + "epoch": 0.463970190516257, + "grad_norm": 0.32809385657310486, + "learning_rate": 0.00010722780384460418, + "loss": 1.2349, + "step": 35705 + }, + { + "epoch": 0.4639831850601729, + "grad_norm": 0.38716432452201843, + "learning_rate": 0.0001072252043826928, + "loss": 1.4886, + "step": 35706 + }, + { + "epoch": 0.46399617960408873, + "grad_norm": 0.3898129165172577, + "learning_rate": 0.0001072226049207814, + "loss": 1.4676, + "step": 35707 + }, + { + "epoch": 0.46400917414800463, + "grad_norm": 0.4012194275856018, + "learning_rate": 0.00010722000545887002, + "loss": 1.4584, + "step": 35708 + }, + { + "epoch": 0.4640221686919205, + "grad_norm": 0.3880549967288971, + "learning_rate": 0.00010721740599695864, + "loss": 1.3998, + "step": 35709 + }, + { + "epoch": 0.4640351632358364, + "grad_norm": 0.33373868465423584, + "learning_rate": 0.00010721480653504725, + "loss": 1.3762, + "step": 35710 + }, + { + "epoch": 0.4640481577797522, + "grad_norm": 0.4542563855648041, + "learning_rate": 0.00010721220707313586, + "loss": 1.5327, + "step": 35711 + }, + { + "epoch": 0.4640611523236681, + "grad_norm": 0.3873993456363678, + "learning_rate": 0.00010720960761122447, + "loss": 1.3017, + "step": 35712 + }, + { + "epoch": 0.46407414686758397, + "grad_norm": 0.4531840980052948, + "learning_rate": 0.0001072070081493131, + "loss": 1.3204, + "step": 35713 + }, + { + "epoch": 0.46408714141149987, + "grad_norm": 0.33776605129241943, + "learning_rate": 0.00010720440868740171, + "loss": 1.6071, + "step": 35714 + }, + { + "epoch": 0.4641001359554157, + "grad_norm": 0.4142756760120392, + "learning_rate": 0.00010720180922549032, + "loss": 1.3591, + "step": 35715 + }, + { + "epoch": 0.4641131304993316, + "grad_norm": 0.36697250604629517, + "learning_rate": 0.00010719920976357893, + "loss": 1.3722, + "step": 35716 + }, + { + "epoch": 0.46412612504324746, + "grad_norm": 0.345456600189209, + "learning_rate": 0.00010719661030166757, + "loss": 1.3067, + "step": 35717 + }, + { + "epoch": 0.46413911958716336, + "grad_norm": 0.3545323610305786, + "learning_rate": 0.00010719401083975618, + "loss": 1.4086, + "step": 35718 + }, + { + "epoch": 0.4641521141310792, + "grad_norm": 0.38930609822273254, + "learning_rate": 0.00010719141137784479, + "loss": 1.4761, + "step": 35719 + }, + { + "epoch": 0.4641651086749951, + "grad_norm": 0.40637439489364624, + "learning_rate": 0.0001071888119159334, + "loss": 1.6331, + "step": 35720 + }, + { + "epoch": 0.46417810321891095, + "grad_norm": 0.38163551688194275, + "learning_rate": 0.00010718621245402203, + "loss": 1.2254, + "step": 35721 + }, + { + "epoch": 0.46419109776282685, + "grad_norm": 0.4472687840461731, + "learning_rate": 0.00010718361299211064, + "loss": 1.4667, + "step": 35722 + }, + { + "epoch": 0.4642040923067427, + "grad_norm": 0.35414546728134155, + "learning_rate": 0.00010718101353019925, + "loss": 1.2344, + "step": 35723 + }, + { + "epoch": 0.4642170868506586, + "grad_norm": 0.4409354031085968, + "learning_rate": 0.00010717841406828789, + "loss": 1.5897, + "step": 35724 + }, + { + "epoch": 0.46423008139457445, + "grad_norm": 0.510757565498352, + "learning_rate": 0.00010717581460637648, + "loss": 1.3826, + "step": 35725 + }, + { + "epoch": 0.46424307593849035, + "grad_norm": 0.521203339099884, + "learning_rate": 0.0001071732151444651, + "loss": 1.5454, + "step": 35726 + }, + { + "epoch": 0.4642560704824062, + "grad_norm": 0.3957363963127136, + "learning_rate": 0.0001071706156825537, + "loss": 1.5085, + "step": 35727 + }, + { + "epoch": 0.4642690650263221, + "grad_norm": 0.5520777702331543, + "learning_rate": 0.00010716801622064234, + "loss": 1.4804, + "step": 35728 + }, + { + "epoch": 0.46428205957023794, + "grad_norm": 0.32879137992858887, + "learning_rate": 0.00010716541675873095, + "loss": 1.4548, + "step": 35729 + }, + { + "epoch": 0.46429505411415384, + "grad_norm": 0.299887478351593, + "learning_rate": 0.00010716281729681957, + "loss": 1.1202, + "step": 35730 + }, + { + "epoch": 0.4643080486580697, + "grad_norm": 0.3300218880176544, + "learning_rate": 0.00010716021783490818, + "loss": 1.3384, + "step": 35731 + }, + { + "epoch": 0.4643210432019856, + "grad_norm": 0.36573949456214905, + "learning_rate": 0.0001071576183729968, + "loss": 1.2967, + "step": 35732 + }, + { + "epoch": 0.46433403774590143, + "grad_norm": 0.4675024747848511, + "learning_rate": 0.00010715501891108541, + "loss": 1.3292, + "step": 35733 + }, + { + "epoch": 0.46434703228981733, + "grad_norm": 0.37884876132011414, + "learning_rate": 0.00010715241944917402, + "loss": 1.4362, + "step": 35734 + }, + { + "epoch": 0.4643600268337332, + "grad_norm": 0.35983002185821533, + "learning_rate": 0.00010714981998726263, + "loss": 1.283, + "step": 35735 + }, + { + "epoch": 0.4643730213776491, + "grad_norm": 0.32473140954971313, + "learning_rate": 0.00010714722052535127, + "loss": 1.2105, + "step": 35736 + }, + { + "epoch": 0.4643860159215649, + "grad_norm": 0.4478898048400879, + "learning_rate": 0.00010714462106343988, + "loss": 1.4179, + "step": 35737 + }, + { + "epoch": 0.4643990104654808, + "grad_norm": 0.37548181414604187, + "learning_rate": 0.00010714202160152848, + "loss": 1.4042, + "step": 35738 + }, + { + "epoch": 0.46441200500939667, + "grad_norm": 0.34154483675956726, + "learning_rate": 0.00010713942213961709, + "loss": 1.3242, + "step": 35739 + }, + { + "epoch": 0.46442499955331257, + "grad_norm": 0.5131894946098328, + "learning_rate": 0.00010713682267770573, + "loss": 1.3617, + "step": 35740 + }, + { + "epoch": 0.4644379940972284, + "grad_norm": 0.40826016664505005, + "learning_rate": 0.00010713422321579434, + "loss": 1.5178, + "step": 35741 + }, + { + "epoch": 0.4644509886411443, + "grad_norm": 0.5026918649673462, + "learning_rate": 0.00010713162375388295, + "loss": 1.4838, + "step": 35742 + }, + { + "epoch": 0.46446398318506016, + "grad_norm": 0.394258052110672, + "learning_rate": 0.00010712902429197156, + "loss": 1.2799, + "step": 35743 + }, + { + "epoch": 0.46447697772897606, + "grad_norm": 0.3919675350189209, + "learning_rate": 0.00010712642483006019, + "loss": 1.493, + "step": 35744 + }, + { + "epoch": 0.4644899722728919, + "grad_norm": 0.4354751408100128, + "learning_rate": 0.0001071238253681488, + "loss": 1.1643, + "step": 35745 + }, + { + "epoch": 0.4645029668168078, + "grad_norm": 0.45295071601867676, + "learning_rate": 0.00010712122590623741, + "loss": 1.4635, + "step": 35746 + }, + { + "epoch": 0.46451596136072365, + "grad_norm": 0.4486791491508484, + "learning_rate": 0.00010711862644432602, + "loss": 1.4049, + "step": 35747 + }, + { + "epoch": 0.46452895590463955, + "grad_norm": 0.505698561668396, + "learning_rate": 0.00010711602698241466, + "loss": 1.4294, + "step": 35748 + }, + { + "epoch": 0.4645419504485554, + "grad_norm": 0.4510813355445862, + "learning_rate": 0.00010711342752050327, + "loss": 1.3102, + "step": 35749 + }, + { + "epoch": 0.4645549449924713, + "grad_norm": 0.3974493145942688, + "learning_rate": 0.00010711082805859188, + "loss": 1.3553, + "step": 35750 + }, + { + "epoch": 0.46456793953638714, + "grad_norm": 0.40255334973335266, + "learning_rate": 0.00010710822859668048, + "loss": 1.2446, + "step": 35751 + }, + { + "epoch": 0.46458093408030304, + "grad_norm": 0.4307808578014374, + "learning_rate": 0.00010710562913476911, + "loss": 1.4597, + "step": 35752 + }, + { + "epoch": 0.4645939286242189, + "grad_norm": 0.33516693115234375, + "learning_rate": 0.00010710302967285773, + "loss": 1.3647, + "step": 35753 + }, + { + "epoch": 0.4646069231681348, + "grad_norm": 0.43638190627098083, + "learning_rate": 0.00010710043021094634, + "loss": 1.6564, + "step": 35754 + }, + { + "epoch": 0.46461991771205063, + "grad_norm": 0.3485713303089142, + "learning_rate": 0.00010709783074903495, + "loss": 1.4224, + "step": 35755 + }, + { + "epoch": 0.46463291225596653, + "grad_norm": 0.35191163420677185, + "learning_rate": 0.00010709523128712357, + "loss": 1.1552, + "step": 35756 + }, + { + "epoch": 0.4646459067998824, + "grad_norm": 0.3450961709022522, + "learning_rate": 0.00010709263182521218, + "loss": 1.5828, + "step": 35757 + }, + { + "epoch": 0.4646589013437983, + "grad_norm": 0.43623632192611694, + "learning_rate": 0.0001070900323633008, + "loss": 1.5343, + "step": 35758 + }, + { + "epoch": 0.4646718958877141, + "grad_norm": 0.26587674021720886, + "learning_rate": 0.00010708743290138943, + "loss": 1.2452, + "step": 35759 + }, + { + "epoch": 0.46468489043163, + "grad_norm": 0.2967045307159424, + "learning_rate": 0.00010708483343947804, + "loss": 1.2431, + "step": 35760 + }, + { + "epoch": 0.46469788497554587, + "grad_norm": 0.36637943983078003, + "learning_rate": 0.00010708223397756665, + "loss": 1.5139, + "step": 35761 + }, + { + "epoch": 0.4647108795194618, + "grad_norm": 0.3731740415096283, + "learning_rate": 0.00010707963451565526, + "loss": 1.1725, + "step": 35762 + }, + { + "epoch": 0.4647238740633776, + "grad_norm": 0.38088053464889526, + "learning_rate": 0.00010707703505374389, + "loss": 1.4296, + "step": 35763 + }, + { + "epoch": 0.4647368686072935, + "grad_norm": 0.44334372878074646, + "learning_rate": 0.0001070744355918325, + "loss": 1.3919, + "step": 35764 + }, + { + "epoch": 0.46474986315120936, + "grad_norm": 0.27963680028915405, + "learning_rate": 0.00010707183612992111, + "loss": 1.5254, + "step": 35765 + }, + { + "epoch": 0.46476285769512526, + "grad_norm": 0.4246121048927307, + "learning_rate": 0.00010706923666800972, + "loss": 1.3882, + "step": 35766 + }, + { + "epoch": 0.4647758522390411, + "grad_norm": 0.4229174852371216, + "learning_rate": 0.00010706663720609835, + "loss": 1.3092, + "step": 35767 + }, + { + "epoch": 0.464788846782957, + "grad_norm": 0.3841497004032135, + "learning_rate": 0.00010706403774418696, + "loss": 1.4034, + "step": 35768 + }, + { + "epoch": 0.46480184132687286, + "grad_norm": 0.4514086842536926, + "learning_rate": 0.00010706143828227557, + "loss": 1.3607, + "step": 35769 + }, + { + "epoch": 0.46481483587078876, + "grad_norm": 0.4733964204788208, + "learning_rate": 0.00010705883882036418, + "loss": 1.5485, + "step": 35770 + }, + { + "epoch": 0.4648278304147046, + "grad_norm": 0.3369889557361603, + "learning_rate": 0.00010705623935845282, + "loss": 1.2684, + "step": 35771 + }, + { + "epoch": 0.4648408249586205, + "grad_norm": 0.3376230001449585, + "learning_rate": 0.00010705363989654143, + "loss": 1.314, + "step": 35772 + }, + { + "epoch": 0.46485381950253635, + "grad_norm": 0.38799983263015747, + "learning_rate": 0.00010705104043463004, + "loss": 1.1607, + "step": 35773 + }, + { + "epoch": 0.46486681404645225, + "grad_norm": 0.361493319272995, + "learning_rate": 0.00010704844097271865, + "loss": 1.3729, + "step": 35774 + }, + { + "epoch": 0.4648798085903681, + "grad_norm": 0.40936052799224854, + "learning_rate": 0.00010704584151080727, + "loss": 1.6361, + "step": 35775 + }, + { + "epoch": 0.464892803134284, + "grad_norm": 0.37491458654403687, + "learning_rate": 0.00010704324204889589, + "loss": 1.451, + "step": 35776 + }, + { + "epoch": 0.46490579767819984, + "grad_norm": 0.40805554389953613, + "learning_rate": 0.0001070406425869845, + "loss": 1.3071, + "step": 35777 + }, + { + "epoch": 0.46491879222211574, + "grad_norm": 0.47869864106178284, + "learning_rate": 0.00010703804312507311, + "loss": 1.4213, + "step": 35778 + }, + { + "epoch": 0.4649317867660316, + "grad_norm": 0.4659895896911621, + "learning_rate": 0.00010703544366316174, + "loss": 1.3119, + "step": 35779 + }, + { + "epoch": 0.4649447813099475, + "grad_norm": 0.3218241333961487, + "learning_rate": 0.00010703284420125034, + "loss": 1.4794, + "step": 35780 + }, + { + "epoch": 0.4649577758538634, + "grad_norm": 0.3543262481689453, + "learning_rate": 0.00010703024473933895, + "loss": 1.3566, + "step": 35781 + }, + { + "epoch": 0.46497077039777923, + "grad_norm": 0.4492974281311035, + "learning_rate": 0.00010702764527742756, + "loss": 1.4828, + "step": 35782 + }, + { + "epoch": 0.46498376494169513, + "grad_norm": 0.39959627389907837, + "learning_rate": 0.0001070250458155162, + "loss": 1.3616, + "step": 35783 + }, + { + "epoch": 0.464996759485611, + "grad_norm": 0.36129021644592285, + "learning_rate": 0.00010702244635360481, + "loss": 1.3493, + "step": 35784 + }, + { + "epoch": 0.4650097540295269, + "grad_norm": 0.3807162642478943, + "learning_rate": 0.00010701984689169342, + "loss": 1.4973, + "step": 35785 + }, + { + "epoch": 0.4650227485734427, + "grad_norm": 0.38066765666007996, + "learning_rate": 0.00010701724742978204, + "loss": 1.3361, + "step": 35786 + }, + { + "epoch": 0.4650357431173586, + "grad_norm": 0.3309139013290405, + "learning_rate": 0.00010701464796787066, + "loss": 1.5189, + "step": 35787 + }, + { + "epoch": 0.46504873766127447, + "grad_norm": 0.3651766777038574, + "learning_rate": 0.00010701204850595927, + "loss": 1.5088, + "step": 35788 + }, + { + "epoch": 0.46506173220519037, + "grad_norm": 0.4041731655597687, + "learning_rate": 0.00010700944904404788, + "loss": 1.4147, + "step": 35789 + }, + { + "epoch": 0.4650747267491062, + "grad_norm": 0.3698086738586426, + "learning_rate": 0.00010700684958213649, + "loss": 1.4646, + "step": 35790 + }, + { + "epoch": 0.4650877212930221, + "grad_norm": 0.49434903264045715, + "learning_rate": 0.00010700425012022513, + "loss": 1.3139, + "step": 35791 + }, + { + "epoch": 0.46510071583693796, + "grad_norm": 0.39973777532577515, + "learning_rate": 0.00010700165065831374, + "loss": 1.2901, + "step": 35792 + }, + { + "epoch": 0.46511371038085386, + "grad_norm": 0.5106237530708313, + "learning_rate": 0.00010699905119640234, + "loss": 1.3648, + "step": 35793 + }, + { + "epoch": 0.4651267049247697, + "grad_norm": 0.43667319416999817, + "learning_rate": 0.00010699645173449095, + "loss": 1.4993, + "step": 35794 + }, + { + "epoch": 0.4651396994686856, + "grad_norm": 0.3264627754688263, + "learning_rate": 0.00010699385227257959, + "loss": 1.4228, + "step": 35795 + }, + { + "epoch": 0.46515269401260145, + "grad_norm": 0.4814237654209137, + "learning_rate": 0.0001069912528106682, + "loss": 1.4044, + "step": 35796 + }, + { + "epoch": 0.46516568855651735, + "grad_norm": 0.29576122760772705, + "learning_rate": 0.00010698865334875681, + "loss": 1.0928, + "step": 35797 + }, + { + "epoch": 0.4651786831004332, + "grad_norm": 0.41525232791900635, + "learning_rate": 0.00010698605388684543, + "loss": 1.53, + "step": 35798 + }, + { + "epoch": 0.4651916776443491, + "grad_norm": 0.4054959714412689, + "learning_rate": 0.00010698345442493404, + "loss": 1.4397, + "step": 35799 + }, + { + "epoch": 0.46520467218826494, + "grad_norm": 0.36523696780204773, + "learning_rate": 0.00010698085496302266, + "loss": 1.5171, + "step": 35800 + }, + { + "epoch": 0.46521766673218085, + "grad_norm": 0.347986102104187, + "learning_rate": 0.00010697825550111127, + "loss": 1.4319, + "step": 35801 + }, + { + "epoch": 0.4652306612760967, + "grad_norm": 0.42010366916656494, + "learning_rate": 0.0001069756560391999, + "loss": 1.5793, + "step": 35802 + }, + { + "epoch": 0.4652436558200126, + "grad_norm": 0.36897042393684387, + "learning_rate": 0.00010697305657728852, + "loss": 1.4585, + "step": 35803 + }, + { + "epoch": 0.46525665036392844, + "grad_norm": 0.35504722595214844, + "learning_rate": 0.00010697045711537713, + "loss": 1.3805, + "step": 35804 + }, + { + "epoch": 0.46526964490784434, + "grad_norm": 0.3372567892074585, + "learning_rate": 0.00010696785765346572, + "loss": 1.3338, + "step": 35805 + }, + { + "epoch": 0.4652826394517602, + "grad_norm": 0.35057783126831055, + "learning_rate": 0.00010696525819155436, + "loss": 1.3047, + "step": 35806 + }, + { + "epoch": 0.4652956339956761, + "grad_norm": 0.47402527928352356, + "learning_rate": 0.00010696265872964297, + "loss": 1.3457, + "step": 35807 + }, + { + "epoch": 0.46530862853959193, + "grad_norm": 0.2996128797531128, + "learning_rate": 0.00010696005926773158, + "loss": 1.417, + "step": 35808 + }, + { + "epoch": 0.46532162308350783, + "grad_norm": 0.39967092871665955, + "learning_rate": 0.0001069574598058202, + "loss": 1.336, + "step": 35809 + }, + { + "epoch": 0.4653346176274237, + "grad_norm": 0.34992873668670654, + "learning_rate": 0.00010695486034390882, + "loss": 1.4117, + "step": 35810 + }, + { + "epoch": 0.4653476121713396, + "grad_norm": 0.359475702047348, + "learning_rate": 0.00010695226088199743, + "loss": 1.3689, + "step": 35811 + }, + { + "epoch": 0.4653606067152554, + "grad_norm": 0.45986121892929077, + "learning_rate": 0.00010694966142008604, + "loss": 1.3775, + "step": 35812 + }, + { + "epoch": 0.4653736012591713, + "grad_norm": 0.3535115718841553, + "learning_rate": 0.00010694706195817465, + "loss": 1.5041, + "step": 35813 + }, + { + "epoch": 0.46538659580308717, + "grad_norm": 0.34997013211250305, + "learning_rate": 0.00010694446249626329, + "loss": 1.2499, + "step": 35814 + }, + { + "epoch": 0.46539959034700307, + "grad_norm": 0.4915198087692261, + "learning_rate": 0.0001069418630343519, + "loss": 1.4018, + "step": 35815 + }, + { + "epoch": 0.4654125848909189, + "grad_norm": 0.35682377219200134, + "learning_rate": 0.00010693926357244051, + "loss": 1.5198, + "step": 35816 + }, + { + "epoch": 0.4654255794348348, + "grad_norm": 0.41563135385513306, + "learning_rate": 0.00010693666411052912, + "loss": 1.35, + "step": 35817 + }, + { + "epoch": 0.46543857397875066, + "grad_norm": 0.34668147563934326, + "learning_rate": 0.00010693406464861775, + "loss": 1.4723, + "step": 35818 + }, + { + "epoch": 0.46545156852266656, + "grad_norm": 0.4953675866127014, + "learning_rate": 0.00010693146518670636, + "loss": 1.2904, + "step": 35819 + }, + { + "epoch": 0.4654645630665824, + "grad_norm": 0.3268541395664215, + "learning_rate": 0.00010692886572479497, + "loss": 1.2884, + "step": 35820 + }, + { + "epoch": 0.4654775576104983, + "grad_norm": 0.34770333766937256, + "learning_rate": 0.00010692626626288358, + "loss": 1.3666, + "step": 35821 + }, + { + "epoch": 0.46549055215441415, + "grad_norm": 0.40397119522094727, + "learning_rate": 0.0001069236668009722, + "loss": 1.319, + "step": 35822 + }, + { + "epoch": 0.46550354669833005, + "grad_norm": 0.5016922354698181, + "learning_rate": 0.00010692106733906082, + "loss": 1.3252, + "step": 35823 + }, + { + "epoch": 0.4655165412422459, + "grad_norm": 0.3737371265888214, + "learning_rate": 0.00010691846787714943, + "loss": 1.4927, + "step": 35824 + }, + { + "epoch": 0.4655295357861618, + "grad_norm": 0.3321307897567749, + "learning_rate": 0.00010691586841523804, + "loss": 1.3554, + "step": 35825 + }, + { + "epoch": 0.46554253033007764, + "grad_norm": 0.4948640763759613, + "learning_rate": 0.00010691326895332668, + "loss": 1.492, + "step": 35826 + }, + { + "epoch": 0.46555552487399354, + "grad_norm": 0.4327353239059448, + "learning_rate": 0.00010691066949141529, + "loss": 1.3797, + "step": 35827 + }, + { + "epoch": 0.4655685194179094, + "grad_norm": 0.4067939221858978, + "learning_rate": 0.0001069080700295039, + "loss": 1.5799, + "step": 35828 + }, + { + "epoch": 0.4655815139618253, + "grad_norm": 0.2698366343975067, + "learning_rate": 0.00010690547056759251, + "loss": 1.2538, + "step": 35829 + }, + { + "epoch": 0.46559450850574113, + "grad_norm": 0.3806721270084381, + "learning_rate": 0.00010690287110568113, + "loss": 1.4179, + "step": 35830 + }, + { + "epoch": 0.46560750304965703, + "grad_norm": 0.3619813621044159, + "learning_rate": 0.00010690027164376974, + "loss": 1.1798, + "step": 35831 + }, + { + "epoch": 0.4656204975935729, + "grad_norm": 0.393474817276001, + "learning_rate": 0.00010689767218185835, + "loss": 1.2657, + "step": 35832 + }, + { + "epoch": 0.4656334921374888, + "grad_norm": 0.4190252721309662, + "learning_rate": 0.00010689507271994699, + "loss": 1.3939, + "step": 35833 + }, + { + "epoch": 0.4656464866814046, + "grad_norm": 0.37279942631721497, + "learning_rate": 0.0001068924732580356, + "loss": 1.2785, + "step": 35834 + }, + { + "epoch": 0.4656594812253205, + "grad_norm": 0.4239563047885895, + "learning_rate": 0.0001068898737961242, + "loss": 1.4672, + "step": 35835 + }, + { + "epoch": 0.46567247576923637, + "grad_norm": 0.4341200292110443, + "learning_rate": 0.00010688727433421281, + "loss": 1.4673, + "step": 35836 + }, + { + "epoch": 0.46568547031315227, + "grad_norm": 0.49018964171409607, + "learning_rate": 0.00010688467487230145, + "loss": 1.4178, + "step": 35837 + }, + { + "epoch": 0.4656984648570681, + "grad_norm": 0.38861218094825745, + "learning_rate": 0.00010688207541039006, + "loss": 1.2747, + "step": 35838 + }, + { + "epoch": 0.465711459400984, + "grad_norm": 0.4218166768550873, + "learning_rate": 0.00010687947594847867, + "loss": 1.5383, + "step": 35839 + }, + { + "epoch": 0.46572445394489986, + "grad_norm": 0.8024581670761108, + "learning_rate": 0.00010687687648656728, + "loss": 1.3643, + "step": 35840 + }, + { + "epoch": 0.46573744848881576, + "grad_norm": 0.400626003742218, + "learning_rate": 0.00010687427702465591, + "loss": 1.3545, + "step": 35841 + }, + { + "epoch": 0.4657504430327316, + "grad_norm": 0.37545350193977356, + "learning_rate": 0.00010687167756274452, + "loss": 1.3793, + "step": 35842 + }, + { + "epoch": 0.4657634375766475, + "grad_norm": 0.3697686791419983, + "learning_rate": 0.00010686907810083313, + "loss": 1.268, + "step": 35843 + }, + { + "epoch": 0.46577643212056336, + "grad_norm": 0.2834019958972931, + "learning_rate": 0.00010686647863892174, + "loss": 1.3754, + "step": 35844 + }, + { + "epoch": 0.46578942666447926, + "grad_norm": 0.3353450298309326, + "learning_rate": 0.00010686387917701038, + "loss": 1.5451, + "step": 35845 + }, + { + "epoch": 0.4658024212083951, + "grad_norm": 0.34883835911750793, + "learning_rate": 0.00010686127971509899, + "loss": 1.3699, + "step": 35846 + }, + { + "epoch": 0.465815415752311, + "grad_norm": 0.3946426212787628, + "learning_rate": 0.00010685868025318759, + "loss": 1.328, + "step": 35847 + }, + { + "epoch": 0.46582841029622685, + "grad_norm": 0.4300050735473633, + "learning_rate": 0.0001068560807912762, + "loss": 1.4763, + "step": 35848 + }, + { + "epoch": 0.46584140484014275, + "grad_norm": 0.32959356904029846, + "learning_rate": 0.00010685348132936484, + "loss": 1.4269, + "step": 35849 + }, + { + "epoch": 0.4658543993840586, + "grad_norm": 0.37990960478782654, + "learning_rate": 0.00010685088186745345, + "loss": 1.3496, + "step": 35850 + }, + { + "epoch": 0.4658673939279745, + "grad_norm": 0.34632784128189087, + "learning_rate": 0.00010684828240554206, + "loss": 1.2209, + "step": 35851 + }, + { + "epoch": 0.46588038847189034, + "grad_norm": 0.3987644910812378, + "learning_rate": 0.00010684568294363067, + "loss": 1.3843, + "step": 35852 + }, + { + "epoch": 0.46589338301580624, + "grad_norm": 0.29834672808647156, + "learning_rate": 0.00010684308348171929, + "loss": 1.2073, + "step": 35853 + }, + { + "epoch": 0.4659063775597221, + "grad_norm": 0.40767669677734375, + "learning_rate": 0.0001068404840198079, + "loss": 1.4338, + "step": 35854 + }, + { + "epoch": 0.465919372103638, + "grad_norm": 0.3947330713272095, + "learning_rate": 0.00010683788455789651, + "loss": 1.2716, + "step": 35855 + }, + { + "epoch": 0.46593236664755383, + "grad_norm": 0.35361751914024353, + "learning_rate": 0.00010683528509598513, + "loss": 1.4845, + "step": 35856 + }, + { + "epoch": 0.46594536119146973, + "grad_norm": 0.39010900259017944, + "learning_rate": 0.00010683268563407376, + "loss": 1.4867, + "step": 35857 + }, + { + "epoch": 0.46595835573538563, + "grad_norm": 0.3032688796520233, + "learning_rate": 0.00010683008617216237, + "loss": 1.2686, + "step": 35858 + }, + { + "epoch": 0.4659713502793015, + "grad_norm": 0.3854002356529236, + "learning_rate": 0.00010682748671025099, + "loss": 1.6548, + "step": 35859 + }, + { + "epoch": 0.4659843448232174, + "grad_norm": 0.39423030614852905, + "learning_rate": 0.00010682488724833958, + "loss": 1.4078, + "step": 35860 + }, + { + "epoch": 0.4659973393671332, + "grad_norm": 0.36734601855278015, + "learning_rate": 0.00010682228778642822, + "loss": 1.4832, + "step": 35861 + }, + { + "epoch": 0.4660103339110491, + "grad_norm": 0.42032724618911743, + "learning_rate": 0.00010681968832451683, + "loss": 1.4374, + "step": 35862 + }, + { + "epoch": 0.46602332845496497, + "grad_norm": 0.3666577935218811, + "learning_rate": 0.00010681708886260544, + "loss": 1.4527, + "step": 35863 + }, + { + "epoch": 0.46603632299888087, + "grad_norm": 0.3579496145248413, + "learning_rate": 0.00010681448940069405, + "loss": 1.13, + "step": 35864 + }, + { + "epoch": 0.4660493175427967, + "grad_norm": 0.30104896426200867, + "learning_rate": 0.00010681188993878268, + "loss": 1.4693, + "step": 35865 + }, + { + "epoch": 0.4660623120867126, + "grad_norm": 0.44286391139030457, + "learning_rate": 0.00010680929047687129, + "loss": 1.3679, + "step": 35866 + }, + { + "epoch": 0.46607530663062846, + "grad_norm": 0.3589864671230316, + "learning_rate": 0.0001068066910149599, + "loss": 1.5356, + "step": 35867 + }, + { + "epoch": 0.46608830117454436, + "grad_norm": 0.3279612362384796, + "learning_rate": 0.00010680409155304851, + "loss": 1.3134, + "step": 35868 + }, + { + "epoch": 0.4661012957184602, + "grad_norm": 0.3832318186759949, + "learning_rate": 0.00010680149209113715, + "loss": 1.1652, + "step": 35869 + }, + { + "epoch": 0.4661142902623761, + "grad_norm": 0.3821352422237396, + "learning_rate": 0.00010679889262922576, + "loss": 1.4673, + "step": 35870 + }, + { + "epoch": 0.46612728480629195, + "grad_norm": 0.3807497024536133, + "learning_rate": 0.00010679629316731437, + "loss": 1.4372, + "step": 35871 + }, + { + "epoch": 0.46614027935020785, + "grad_norm": 0.4463353753089905, + "learning_rate": 0.000106793693705403, + "loss": 1.2001, + "step": 35872 + }, + { + "epoch": 0.4661532738941237, + "grad_norm": 0.4181680679321289, + "learning_rate": 0.0001067910942434916, + "loss": 1.4157, + "step": 35873 + }, + { + "epoch": 0.4661662684380396, + "grad_norm": 0.4164464771747589, + "learning_rate": 0.00010678849478158022, + "loss": 1.3284, + "step": 35874 + }, + { + "epoch": 0.46617926298195544, + "grad_norm": 0.2525698244571686, + "learning_rate": 0.00010678589531966883, + "loss": 1.1925, + "step": 35875 + }, + { + "epoch": 0.46619225752587135, + "grad_norm": 0.311665803194046, + "learning_rate": 0.00010678329585775745, + "loss": 1.3553, + "step": 35876 + }, + { + "epoch": 0.4662052520697872, + "grad_norm": 0.42646273970603943, + "learning_rate": 0.00010678069639584606, + "loss": 1.3552, + "step": 35877 + }, + { + "epoch": 0.4662182466137031, + "grad_norm": 0.43143296241760254, + "learning_rate": 0.00010677809693393467, + "loss": 1.4818, + "step": 35878 + }, + { + "epoch": 0.46623124115761894, + "grad_norm": 0.39880892634391785, + "learning_rate": 0.00010677549747202329, + "loss": 1.217, + "step": 35879 + }, + { + "epoch": 0.46624423570153484, + "grad_norm": 0.30916833877563477, + "learning_rate": 0.00010677289801011192, + "loss": 1.1269, + "step": 35880 + }, + { + "epoch": 0.4662572302454507, + "grad_norm": 0.4107491672039032, + "learning_rate": 0.00010677029854820053, + "loss": 1.4896, + "step": 35881 + }, + { + "epoch": 0.4662702247893666, + "grad_norm": 0.5182363390922546, + "learning_rate": 0.00010676769908628915, + "loss": 1.415, + "step": 35882 + }, + { + "epoch": 0.46628321933328243, + "grad_norm": 0.41161927580833435, + "learning_rate": 0.00010676509962437776, + "loss": 1.3017, + "step": 35883 + }, + { + "epoch": 0.46629621387719833, + "grad_norm": 0.5144016146659851, + "learning_rate": 0.00010676250016246638, + "loss": 1.4735, + "step": 35884 + }, + { + "epoch": 0.4663092084211142, + "grad_norm": 0.39478158950805664, + "learning_rate": 0.00010675990070055499, + "loss": 1.4625, + "step": 35885 + }, + { + "epoch": 0.4663222029650301, + "grad_norm": 0.36517956852912903, + "learning_rate": 0.0001067573012386436, + "loss": 1.3703, + "step": 35886 + }, + { + "epoch": 0.4663351975089459, + "grad_norm": 0.3599850833415985, + "learning_rate": 0.00010675470177673221, + "loss": 1.3172, + "step": 35887 + }, + { + "epoch": 0.4663481920528618, + "grad_norm": 0.3892595171928406, + "learning_rate": 0.00010675210231482085, + "loss": 1.291, + "step": 35888 + }, + { + "epoch": 0.46636118659677767, + "grad_norm": 0.35782092809677124, + "learning_rate": 0.00010674950285290945, + "loss": 1.345, + "step": 35889 + }, + { + "epoch": 0.46637418114069357, + "grad_norm": 0.4267702102661133, + "learning_rate": 0.00010674690339099806, + "loss": 1.2876, + "step": 35890 + }, + { + "epoch": 0.4663871756846094, + "grad_norm": 0.4545315206050873, + "learning_rate": 0.00010674430392908667, + "loss": 1.4805, + "step": 35891 + }, + { + "epoch": 0.4664001702285253, + "grad_norm": 0.3772906959056854, + "learning_rate": 0.00010674170446717531, + "loss": 1.3717, + "step": 35892 + }, + { + "epoch": 0.46641316477244116, + "grad_norm": 0.609588086605072, + "learning_rate": 0.00010673910500526392, + "loss": 1.5314, + "step": 35893 + }, + { + "epoch": 0.46642615931635706, + "grad_norm": 0.4107741117477417, + "learning_rate": 0.00010673650554335253, + "loss": 1.408, + "step": 35894 + }, + { + "epoch": 0.4664391538602729, + "grad_norm": 0.34701094031333923, + "learning_rate": 0.00010673390608144114, + "loss": 1.5644, + "step": 35895 + }, + { + "epoch": 0.4664521484041888, + "grad_norm": 0.4221905469894409, + "learning_rate": 0.00010673130661952977, + "loss": 1.3026, + "step": 35896 + }, + { + "epoch": 0.46646514294810465, + "grad_norm": 0.490134596824646, + "learning_rate": 0.00010672870715761838, + "loss": 1.5241, + "step": 35897 + }, + { + "epoch": 0.46647813749202055, + "grad_norm": 0.4115753471851349, + "learning_rate": 0.00010672610769570699, + "loss": 1.4513, + "step": 35898 + }, + { + "epoch": 0.4664911320359364, + "grad_norm": 0.38451868295669556, + "learning_rate": 0.0001067235082337956, + "loss": 1.3489, + "step": 35899 + }, + { + "epoch": 0.4665041265798523, + "grad_norm": 0.43244633078575134, + "learning_rate": 0.00010672090877188424, + "loss": 1.371, + "step": 35900 + }, + { + "epoch": 0.46651712112376814, + "grad_norm": 0.36380696296691895, + "learning_rate": 0.00010671830930997285, + "loss": 1.4971, + "step": 35901 + }, + { + "epoch": 0.46653011566768404, + "grad_norm": 0.4719443619251251, + "learning_rate": 0.00010671570984806145, + "loss": 1.6278, + "step": 35902 + }, + { + "epoch": 0.4665431102115999, + "grad_norm": 0.344825804233551, + "learning_rate": 0.00010671311038615006, + "loss": 1.3417, + "step": 35903 + }, + { + "epoch": 0.4665561047555158, + "grad_norm": 0.40015196800231934, + "learning_rate": 0.0001067105109242387, + "loss": 1.3532, + "step": 35904 + }, + { + "epoch": 0.46656909929943163, + "grad_norm": 0.45742979645729065, + "learning_rate": 0.0001067079114623273, + "loss": 1.5532, + "step": 35905 + }, + { + "epoch": 0.46658209384334753, + "grad_norm": 0.33324095606803894, + "learning_rate": 0.00010670531200041592, + "loss": 1.3681, + "step": 35906 + }, + { + "epoch": 0.4665950883872634, + "grad_norm": 0.1700827032327652, + "learning_rate": 0.00010670271253850454, + "loss": 1.2759, + "step": 35907 + }, + { + "epoch": 0.4666080829311793, + "grad_norm": 0.3229829967021942, + "learning_rate": 0.00010670011307659315, + "loss": 1.3664, + "step": 35908 + }, + { + "epoch": 0.4666210774750951, + "grad_norm": 0.539949893951416, + "learning_rate": 0.00010669751361468176, + "loss": 1.4729, + "step": 35909 + }, + { + "epoch": 0.466634072019011, + "grad_norm": 0.42248407006263733, + "learning_rate": 0.00010669491415277037, + "loss": 1.4785, + "step": 35910 + }, + { + "epoch": 0.46664706656292687, + "grad_norm": 0.42746496200561523, + "learning_rate": 0.00010669231469085901, + "loss": 1.3603, + "step": 35911 + }, + { + "epoch": 0.46666006110684277, + "grad_norm": 0.3592301309108734, + "learning_rate": 0.00010668971522894762, + "loss": 1.2564, + "step": 35912 + }, + { + "epoch": 0.4666730556507586, + "grad_norm": 0.42517951130867004, + "learning_rate": 0.00010668711576703623, + "loss": 1.3646, + "step": 35913 + }, + { + "epoch": 0.4666860501946745, + "grad_norm": 0.4356907308101654, + "learning_rate": 0.00010668451630512484, + "loss": 1.5278, + "step": 35914 + }, + { + "epoch": 0.46669904473859036, + "grad_norm": 0.3531091809272766, + "learning_rate": 0.00010668191684321347, + "loss": 1.5002, + "step": 35915 + }, + { + "epoch": 0.46671203928250626, + "grad_norm": 0.35297533869743347, + "learning_rate": 0.00010667931738130208, + "loss": 1.6151, + "step": 35916 + }, + { + "epoch": 0.4667250338264221, + "grad_norm": 0.41558703780174255, + "learning_rate": 0.00010667671791939069, + "loss": 1.4992, + "step": 35917 + }, + { + "epoch": 0.466738028370338, + "grad_norm": 0.4532746970653534, + "learning_rate": 0.0001066741184574793, + "loss": 1.223, + "step": 35918 + }, + { + "epoch": 0.46675102291425385, + "grad_norm": 0.3878476917743683, + "learning_rate": 0.00010667151899556793, + "loss": 1.3433, + "step": 35919 + }, + { + "epoch": 0.46676401745816976, + "grad_norm": 0.43468937277793884, + "learning_rate": 0.00010666891953365654, + "loss": 1.2907, + "step": 35920 + }, + { + "epoch": 0.4667770120020856, + "grad_norm": 0.3536732494831085, + "learning_rate": 0.00010666632007174515, + "loss": 1.4719, + "step": 35921 + }, + { + "epoch": 0.4667900065460015, + "grad_norm": 0.47575825452804565, + "learning_rate": 0.00010666372060983376, + "loss": 1.6255, + "step": 35922 + }, + { + "epoch": 0.46680300108991735, + "grad_norm": 0.4070780575275421, + "learning_rate": 0.0001066611211479224, + "loss": 1.5798, + "step": 35923 + }, + { + "epoch": 0.46681599563383325, + "grad_norm": 0.41202831268310547, + "learning_rate": 0.00010665852168601101, + "loss": 1.3252, + "step": 35924 + }, + { + "epoch": 0.4668289901777491, + "grad_norm": 0.3353930115699768, + "learning_rate": 0.00010665592222409962, + "loss": 1.3818, + "step": 35925 + }, + { + "epoch": 0.466841984721665, + "grad_norm": 0.3823460042476654, + "learning_rate": 0.00010665332276218823, + "loss": 1.4417, + "step": 35926 + }, + { + "epoch": 0.46685497926558084, + "grad_norm": 0.4184887707233429, + "learning_rate": 0.00010665072330027685, + "loss": 1.4133, + "step": 35927 + }, + { + "epoch": 0.46686797380949674, + "grad_norm": 0.3773918151855469, + "learning_rate": 0.00010664812383836546, + "loss": 1.3159, + "step": 35928 + }, + { + "epoch": 0.4668809683534126, + "grad_norm": 0.4540043771266937, + "learning_rate": 0.00010664552437645408, + "loss": 1.3884, + "step": 35929 + }, + { + "epoch": 0.4668939628973285, + "grad_norm": 0.46505650877952576, + "learning_rate": 0.00010664292491454269, + "loss": 1.591, + "step": 35930 + }, + { + "epoch": 0.46690695744124433, + "grad_norm": 0.4571464955806732, + "learning_rate": 0.00010664032545263131, + "loss": 1.454, + "step": 35931 + }, + { + "epoch": 0.46691995198516023, + "grad_norm": 0.3607877194881439, + "learning_rate": 0.00010663772599071992, + "loss": 1.2839, + "step": 35932 + }, + { + "epoch": 0.46693294652907613, + "grad_norm": 0.44766515493392944, + "learning_rate": 0.00010663512652880853, + "loss": 1.4078, + "step": 35933 + }, + { + "epoch": 0.466945941072992, + "grad_norm": 0.47576385736465454, + "learning_rate": 0.00010663252706689714, + "loss": 1.4613, + "step": 35934 + }, + { + "epoch": 0.4669589356169079, + "grad_norm": 0.3513639271259308, + "learning_rate": 0.00010662992760498578, + "loss": 1.2397, + "step": 35935 + }, + { + "epoch": 0.4669719301608237, + "grad_norm": 0.5041713714599609, + "learning_rate": 0.00010662732814307439, + "loss": 1.5897, + "step": 35936 + }, + { + "epoch": 0.4669849247047396, + "grad_norm": 0.3738866448402405, + "learning_rate": 0.000106624728681163, + "loss": 1.4591, + "step": 35937 + }, + { + "epoch": 0.46699791924865547, + "grad_norm": 0.2819693684577942, + "learning_rate": 0.00010662212921925161, + "loss": 1.1778, + "step": 35938 + }, + { + "epoch": 0.46701091379257137, + "grad_norm": 0.36967772245407104, + "learning_rate": 0.00010661952975734024, + "loss": 1.287, + "step": 35939 + }, + { + "epoch": 0.4670239083364872, + "grad_norm": 0.29896414279937744, + "learning_rate": 0.00010661693029542885, + "loss": 1.4467, + "step": 35940 + }, + { + "epoch": 0.4670369028804031, + "grad_norm": 0.36833497881889343, + "learning_rate": 0.00010661433083351746, + "loss": 1.5933, + "step": 35941 + }, + { + "epoch": 0.46704989742431896, + "grad_norm": 0.36904987692832947, + "learning_rate": 0.00010661173137160607, + "loss": 1.4527, + "step": 35942 + }, + { + "epoch": 0.46706289196823486, + "grad_norm": 0.3404006361961365, + "learning_rate": 0.00010660913190969471, + "loss": 1.5577, + "step": 35943 + }, + { + "epoch": 0.4670758865121507, + "grad_norm": 0.3427305817604065, + "learning_rate": 0.00010660653244778331, + "loss": 1.3337, + "step": 35944 + }, + { + "epoch": 0.4670888810560666, + "grad_norm": 0.41978099942207336, + "learning_rate": 0.00010660393298587192, + "loss": 1.3852, + "step": 35945 + }, + { + "epoch": 0.46710187559998245, + "grad_norm": 0.40083521604537964, + "learning_rate": 0.00010660133352396056, + "loss": 1.3534, + "step": 35946 + }, + { + "epoch": 0.46711487014389835, + "grad_norm": 0.5257160067558289, + "learning_rate": 0.00010659873406204917, + "loss": 1.4199, + "step": 35947 + }, + { + "epoch": 0.4671278646878142, + "grad_norm": 0.43245503306388855, + "learning_rate": 0.00010659613460013778, + "loss": 1.4935, + "step": 35948 + }, + { + "epoch": 0.4671408592317301, + "grad_norm": 0.41445133090019226, + "learning_rate": 0.00010659353513822639, + "loss": 1.5975, + "step": 35949 + }, + { + "epoch": 0.46715385377564594, + "grad_norm": 0.3176117539405823, + "learning_rate": 0.00010659093567631501, + "loss": 1.407, + "step": 35950 + }, + { + "epoch": 0.46716684831956184, + "grad_norm": 0.36174675822257996, + "learning_rate": 0.00010658833621440362, + "loss": 1.3153, + "step": 35951 + }, + { + "epoch": 0.4671798428634777, + "grad_norm": 0.45502999424934387, + "learning_rate": 0.00010658573675249224, + "loss": 1.5634, + "step": 35952 + }, + { + "epoch": 0.4671928374073936, + "grad_norm": 0.39223194122314453, + "learning_rate": 0.00010658313729058085, + "loss": 1.2524, + "step": 35953 + }, + { + "epoch": 0.46720583195130944, + "grad_norm": 0.43161889910697937, + "learning_rate": 0.00010658053782866948, + "loss": 1.2831, + "step": 35954 + }, + { + "epoch": 0.46721882649522534, + "grad_norm": 0.4511711299419403, + "learning_rate": 0.0001065779383667581, + "loss": 1.3442, + "step": 35955 + }, + { + "epoch": 0.4672318210391412, + "grad_norm": 0.3932650685310364, + "learning_rate": 0.0001065753389048467, + "loss": 1.3912, + "step": 35956 + }, + { + "epoch": 0.4672448155830571, + "grad_norm": 0.38475996255874634, + "learning_rate": 0.0001065727394429353, + "loss": 1.3811, + "step": 35957 + }, + { + "epoch": 0.46725781012697293, + "grad_norm": 1.0559144020080566, + "learning_rate": 0.00010657013998102394, + "loss": 1.394, + "step": 35958 + }, + { + "epoch": 0.46727080467088883, + "grad_norm": 0.4285690188407898, + "learning_rate": 0.00010656754051911255, + "loss": 1.2494, + "step": 35959 + }, + { + "epoch": 0.4672837992148047, + "grad_norm": 0.45992931723594666, + "learning_rate": 0.00010656494105720116, + "loss": 1.4023, + "step": 35960 + }, + { + "epoch": 0.4672967937587206, + "grad_norm": 0.4039871096611023, + "learning_rate": 0.00010656234159528977, + "loss": 1.4978, + "step": 35961 + }, + { + "epoch": 0.4673097883026364, + "grad_norm": 0.3369520604610443, + "learning_rate": 0.0001065597421333784, + "loss": 1.3861, + "step": 35962 + }, + { + "epoch": 0.4673227828465523, + "grad_norm": 0.3670004606246948, + "learning_rate": 0.00010655714267146701, + "loss": 1.4783, + "step": 35963 + }, + { + "epoch": 0.46733577739046817, + "grad_norm": 0.464018315076828, + "learning_rate": 0.00010655454320955562, + "loss": 1.454, + "step": 35964 + }, + { + "epoch": 0.46734877193438407, + "grad_norm": 0.3082347810268402, + "learning_rate": 0.00010655194374764423, + "loss": 1.1011, + "step": 35965 + }, + { + "epoch": 0.4673617664782999, + "grad_norm": 0.46055933833122253, + "learning_rate": 0.00010654934428573287, + "loss": 1.3702, + "step": 35966 + }, + { + "epoch": 0.4673747610222158, + "grad_norm": 0.4583047032356262, + "learning_rate": 0.00010654674482382148, + "loss": 1.4097, + "step": 35967 + }, + { + "epoch": 0.46738775556613166, + "grad_norm": 0.34755653142929077, + "learning_rate": 0.00010654414536191009, + "loss": 1.2728, + "step": 35968 + }, + { + "epoch": 0.46740075011004756, + "grad_norm": 0.44869697093963623, + "learning_rate": 0.00010654154589999869, + "loss": 1.4725, + "step": 35969 + }, + { + "epoch": 0.4674137446539634, + "grad_norm": 0.3317490816116333, + "learning_rate": 0.00010653894643808733, + "loss": 1.244, + "step": 35970 + }, + { + "epoch": 0.4674267391978793, + "grad_norm": 0.37015002965927124, + "learning_rate": 0.00010653634697617594, + "loss": 1.3567, + "step": 35971 + }, + { + "epoch": 0.46743973374179515, + "grad_norm": 0.3949768543243408, + "learning_rate": 0.00010653374751426455, + "loss": 1.2916, + "step": 35972 + }, + { + "epoch": 0.46745272828571105, + "grad_norm": 0.3815925121307373, + "learning_rate": 0.00010653114805235316, + "loss": 1.2673, + "step": 35973 + }, + { + "epoch": 0.4674657228296269, + "grad_norm": 0.35038211941719055, + "learning_rate": 0.00010652854859044178, + "loss": 1.284, + "step": 35974 + }, + { + "epoch": 0.4674787173735428, + "grad_norm": 0.3745165169239044, + "learning_rate": 0.0001065259491285304, + "loss": 1.3205, + "step": 35975 + }, + { + "epoch": 0.46749171191745864, + "grad_norm": 0.5041611194610596, + "learning_rate": 0.000106523349666619, + "loss": 1.4573, + "step": 35976 + }, + { + "epoch": 0.46750470646137454, + "grad_norm": 0.364639550447464, + "learning_rate": 0.00010652075020470762, + "loss": 1.3509, + "step": 35977 + }, + { + "epoch": 0.4675177010052904, + "grad_norm": 0.3491594195365906, + "learning_rate": 0.00010651815074279626, + "loss": 1.1403, + "step": 35978 + }, + { + "epoch": 0.4675306955492063, + "grad_norm": 0.3863477110862732, + "learning_rate": 0.00010651555128088487, + "loss": 1.5309, + "step": 35979 + }, + { + "epoch": 0.46754369009312213, + "grad_norm": 0.41628289222717285, + "learning_rate": 0.00010651295181897348, + "loss": 1.5199, + "step": 35980 + }, + { + "epoch": 0.46755668463703803, + "grad_norm": 0.41859912872314453, + "learning_rate": 0.0001065103523570621, + "loss": 1.3811, + "step": 35981 + }, + { + "epoch": 0.4675696791809539, + "grad_norm": 0.3865775465965271, + "learning_rate": 0.00010650775289515071, + "loss": 1.332, + "step": 35982 + }, + { + "epoch": 0.4675826737248698, + "grad_norm": 0.32492291927337646, + "learning_rate": 0.00010650515343323932, + "loss": 1.3899, + "step": 35983 + }, + { + "epoch": 0.4675956682687856, + "grad_norm": 0.3021368384361267, + "learning_rate": 0.00010650255397132793, + "loss": 1.2621, + "step": 35984 + }, + { + "epoch": 0.4676086628127015, + "grad_norm": 0.3440239727497101, + "learning_rate": 0.00010649995450941657, + "loss": 1.1482, + "step": 35985 + }, + { + "epoch": 0.46762165735661737, + "grad_norm": 0.4269804358482361, + "learning_rate": 0.00010649735504750517, + "loss": 1.5055, + "step": 35986 + }, + { + "epoch": 0.46763465190053327, + "grad_norm": 0.3038952648639679, + "learning_rate": 0.00010649475558559378, + "loss": 1.3385, + "step": 35987 + }, + { + "epoch": 0.4676476464444491, + "grad_norm": 0.44491007924079895, + "learning_rate": 0.00010649215612368239, + "loss": 1.3492, + "step": 35988 + }, + { + "epoch": 0.467660640988365, + "grad_norm": 0.3509809672832489, + "learning_rate": 0.00010648955666177103, + "loss": 1.5611, + "step": 35989 + }, + { + "epoch": 0.46767363553228086, + "grad_norm": 0.31764981150627136, + "learning_rate": 0.00010648695719985964, + "loss": 1.3228, + "step": 35990 + }, + { + "epoch": 0.46768663007619676, + "grad_norm": 0.4681842625141144, + "learning_rate": 0.00010648435773794825, + "loss": 1.3906, + "step": 35991 + }, + { + "epoch": 0.4676996246201126, + "grad_norm": 0.3917446732521057, + "learning_rate": 0.00010648175827603686, + "loss": 1.4447, + "step": 35992 + }, + { + "epoch": 0.4677126191640285, + "grad_norm": 0.4038437306880951, + "learning_rate": 0.00010647915881412549, + "loss": 1.3509, + "step": 35993 + }, + { + "epoch": 0.46772561370794435, + "grad_norm": 0.32946309447288513, + "learning_rate": 0.0001064765593522141, + "loss": 1.0545, + "step": 35994 + }, + { + "epoch": 0.46773860825186026, + "grad_norm": 0.44185343384742737, + "learning_rate": 0.00010647395989030271, + "loss": 1.417, + "step": 35995 + }, + { + "epoch": 0.4677516027957761, + "grad_norm": 0.3740565776824951, + "learning_rate": 0.00010647136042839132, + "loss": 1.3879, + "step": 35996 + }, + { + "epoch": 0.467764597339692, + "grad_norm": 0.5356307625770569, + "learning_rate": 0.00010646876096647996, + "loss": 1.4336, + "step": 35997 + }, + { + "epoch": 0.46777759188360785, + "grad_norm": 0.46441715955734253, + "learning_rate": 0.00010646616150456857, + "loss": 1.4545, + "step": 35998 + }, + { + "epoch": 0.46779058642752375, + "grad_norm": 0.410014808177948, + "learning_rate": 0.00010646356204265717, + "loss": 1.3621, + "step": 35999 + }, + { + "epoch": 0.4678035809714396, + "grad_norm": 0.4279406666755676, + "learning_rate": 0.00010646096258074578, + "loss": 1.2898, + "step": 36000 + }, + { + "epoch": 0.4678165755153555, + "grad_norm": 0.48044198751449585, + "learning_rate": 0.00010645836311883442, + "loss": 1.1618, + "step": 36001 + }, + { + "epoch": 0.46782957005927134, + "grad_norm": 0.45341384410858154, + "learning_rate": 0.00010645576365692303, + "loss": 1.4305, + "step": 36002 + }, + { + "epoch": 0.46784256460318724, + "grad_norm": 0.40557536482810974, + "learning_rate": 0.00010645316419501164, + "loss": 1.2023, + "step": 36003 + }, + { + "epoch": 0.4678555591471031, + "grad_norm": 0.3305848240852356, + "learning_rate": 0.00010645056473310025, + "loss": 1.3105, + "step": 36004 + }, + { + "epoch": 0.467868553691019, + "grad_norm": 0.4526832401752472, + "learning_rate": 0.00010644796527118887, + "loss": 1.4626, + "step": 36005 + }, + { + "epoch": 0.46788154823493483, + "grad_norm": 0.47868451476097107, + "learning_rate": 0.00010644536580927748, + "loss": 1.367, + "step": 36006 + }, + { + "epoch": 0.46789454277885073, + "grad_norm": 0.4907841980457306, + "learning_rate": 0.0001064427663473661, + "loss": 1.3488, + "step": 36007 + }, + { + "epoch": 0.4679075373227666, + "grad_norm": 0.39040088653564453, + "learning_rate": 0.0001064401668854547, + "loss": 1.3353, + "step": 36008 + }, + { + "epoch": 0.4679205318666825, + "grad_norm": 0.37176477909088135, + "learning_rate": 0.00010643756742354334, + "loss": 1.2878, + "step": 36009 + }, + { + "epoch": 0.4679335264105984, + "grad_norm": 0.3674590587615967, + "learning_rate": 0.00010643496796163195, + "loss": 1.3325, + "step": 36010 + }, + { + "epoch": 0.4679465209545142, + "grad_norm": 0.33845943212509155, + "learning_rate": 0.00010643236849972055, + "loss": 1.459, + "step": 36011 + }, + { + "epoch": 0.4679595154984301, + "grad_norm": 0.4534614086151123, + "learning_rate": 0.00010642976903780916, + "loss": 1.5258, + "step": 36012 + }, + { + "epoch": 0.46797251004234597, + "grad_norm": 0.35866764187812805, + "learning_rate": 0.0001064271695758978, + "loss": 1.2132, + "step": 36013 + }, + { + "epoch": 0.46798550458626187, + "grad_norm": 0.3718441426753998, + "learning_rate": 0.00010642457011398641, + "loss": 1.4307, + "step": 36014 + }, + { + "epoch": 0.4679984991301777, + "grad_norm": 0.36010098457336426, + "learning_rate": 0.00010642197065207502, + "loss": 1.3033, + "step": 36015 + }, + { + "epoch": 0.4680114936740936, + "grad_norm": 0.37122467160224915, + "learning_rate": 0.00010641937119016363, + "loss": 1.3975, + "step": 36016 + }, + { + "epoch": 0.46802448821800946, + "grad_norm": 0.39414042234420776, + "learning_rate": 0.00010641677172825226, + "loss": 1.3809, + "step": 36017 + }, + { + "epoch": 0.46803748276192536, + "grad_norm": 0.4476868510246277, + "learning_rate": 0.00010641417226634087, + "loss": 1.3161, + "step": 36018 + }, + { + "epoch": 0.4680504773058412, + "grad_norm": 0.4619145393371582, + "learning_rate": 0.00010641157280442948, + "loss": 1.3019, + "step": 36019 + }, + { + "epoch": 0.4680634718497571, + "grad_norm": 0.4380275309085846, + "learning_rate": 0.00010640897334251812, + "loss": 1.3842, + "step": 36020 + }, + { + "epoch": 0.46807646639367295, + "grad_norm": 0.35434839129447937, + "learning_rate": 0.00010640637388060673, + "loss": 1.2279, + "step": 36021 + }, + { + "epoch": 0.46808946093758885, + "grad_norm": 0.3855600953102112, + "learning_rate": 0.00010640377441869534, + "loss": 1.6073, + "step": 36022 + }, + { + "epoch": 0.4681024554815047, + "grad_norm": 0.40054965019226074, + "learning_rate": 0.00010640117495678395, + "loss": 1.3063, + "step": 36023 + }, + { + "epoch": 0.4681154500254206, + "grad_norm": 0.37935322523117065, + "learning_rate": 0.00010639857549487258, + "loss": 1.4915, + "step": 36024 + }, + { + "epoch": 0.46812844456933644, + "grad_norm": 0.43313562870025635, + "learning_rate": 0.00010639597603296119, + "loss": 1.31, + "step": 36025 + }, + { + "epoch": 0.46814143911325234, + "grad_norm": 0.3727060854434967, + "learning_rate": 0.0001063933765710498, + "loss": 1.2527, + "step": 36026 + }, + { + "epoch": 0.4681544336571682, + "grad_norm": 0.3850649893283844, + "learning_rate": 0.00010639077710913841, + "loss": 1.4338, + "step": 36027 + }, + { + "epoch": 0.4681674282010841, + "grad_norm": 0.45306897163391113, + "learning_rate": 0.00010638817764722703, + "loss": 1.5184, + "step": 36028 + }, + { + "epoch": 0.46818042274499994, + "grad_norm": 0.4427480399608612, + "learning_rate": 0.00010638557818531564, + "loss": 1.606, + "step": 36029 + }, + { + "epoch": 0.46819341728891584, + "grad_norm": 0.393987774848938, + "learning_rate": 0.00010638297872340425, + "loss": 1.3951, + "step": 36030 + }, + { + "epoch": 0.4682064118328317, + "grad_norm": 0.40190625190734863, + "learning_rate": 0.00010638037926149287, + "loss": 1.4212, + "step": 36031 + }, + { + "epoch": 0.4682194063767476, + "grad_norm": 0.41310665011405945, + "learning_rate": 0.0001063777797995815, + "loss": 1.5712, + "step": 36032 + }, + { + "epoch": 0.4682324009206634, + "grad_norm": 0.2970741093158722, + "learning_rate": 0.00010637518033767011, + "loss": 1.5025, + "step": 36033 + }, + { + "epoch": 0.46824539546457933, + "grad_norm": 0.4671795964241028, + "learning_rate": 0.00010637258087575873, + "loss": 1.5475, + "step": 36034 + }, + { + "epoch": 0.4682583900084952, + "grad_norm": 0.5359475016593933, + "learning_rate": 0.00010636998141384734, + "loss": 1.552, + "step": 36035 + }, + { + "epoch": 0.4682713845524111, + "grad_norm": 0.4483073353767395, + "learning_rate": 0.00010636738195193596, + "loss": 1.5673, + "step": 36036 + }, + { + "epoch": 0.4682843790963269, + "grad_norm": 0.3945772051811218, + "learning_rate": 0.00010636478249002457, + "loss": 1.4136, + "step": 36037 + }, + { + "epoch": 0.4682973736402428, + "grad_norm": 0.45493283867836, + "learning_rate": 0.00010636218302811318, + "loss": 1.3821, + "step": 36038 + }, + { + "epoch": 0.46831036818415867, + "grad_norm": 0.37724894285202026, + "learning_rate": 0.0001063595835662018, + "loss": 1.3312, + "step": 36039 + }, + { + "epoch": 0.46832336272807457, + "grad_norm": 0.30855613946914673, + "learning_rate": 0.00010635698410429043, + "loss": 1.4425, + "step": 36040 + }, + { + "epoch": 0.4683363572719904, + "grad_norm": 0.35224148631095886, + "learning_rate": 0.00010635438464237903, + "loss": 1.3959, + "step": 36041 + }, + { + "epoch": 0.4683493518159063, + "grad_norm": 0.2714838981628418, + "learning_rate": 0.00010635178518046764, + "loss": 1.3151, + "step": 36042 + }, + { + "epoch": 0.46836234635982216, + "grad_norm": 0.42199426889419556, + "learning_rate": 0.00010634918571855625, + "loss": 1.4751, + "step": 36043 + }, + { + "epoch": 0.46837534090373806, + "grad_norm": 0.2458931803703308, + "learning_rate": 0.00010634658625664489, + "loss": 1.209, + "step": 36044 + }, + { + "epoch": 0.4683883354476539, + "grad_norm": 0.40135452151298523, + "learning_rate": 0.0001063439867947335, + "loss": 1.4309, + "step": 36045 + }, + { + "epoch": 0.4684013299915698, + "grad_norm": 0.4182669520378113, + "learning_rate": 0.00010634138733282211, + "loss": 1.4965, + "step": 36046 + }, + { + "epoch": 0.46841432453548565, + "grad_norm": 0.5029852986335754, + "learning_rate": 0.00010633878787091072, + "loss": 1.5349, + "step": 36047 + }, + { + "epoch": 0.46842731907940155, + "grad_norm": 0.40614351630210876, + "learning_rate": 0.00010633618840899935, + "loss": 1.2453, + "step": 36048 + }, + { + "epoch": 0.4684403136233174, + "grad_norm": 0.5176266431808472, + "learning_rate": 0.00010633358894708796, + "loss": 1.3921, + "step": 36049 + }, + { + "epoch": 0.4684533081672333, + "grad_norm": 0.40352463722229004, + "learning_rate": 0.00010633098948517657, + "loss": 1.4319, + "step": 36050 + }, + { + "epoch": 0.46846630271114914, + "grad_norm": 0.4335116446018219, + "learning_rate": 0.00010632839002326518, + "loss": 1.305, + "step": 36051 + }, + { + "epoch": 0.46847929725506504, + "grad_norm": 0.3310672640800476, + "learning_rate": 0.00010632579056135382, + "loss": 1.2521, + "step": 36052 + }, + { + "epoch": 0.4684922917989809, + "grad_norm": 0.40273770689964294, + "learning_rate": 0.00010632319109944241, + "loss": 1.3588, + "step": 36053 + }, + { + "epoch": 0.4685052863428968, + "grad_norm": 0.36818644404411316, + "learning_rate": 0.00010632059163753103, + "loss": 1.2179, + "step": 36054 + }, + { + "epoch": 0.46851828088681263, + "grad_norm": 0.4689251780509949, + "learning_rate": 0.00010631799217561966, + "loss": 1.4448, + "step": 36055 + }, + { + "epoch": 0.46853127543072853, + "grad_norm": 0.3490266501903534, + "learning_rate": 0.00010631539271370827, + "loss": 1.2343, + "step": 36056 + }, + { + "epoch": 0.4685442699746444, + "grad_norm": 0.5208413600921631, + "learning_rate": 0.00010631279325179689, + "loss": 1.5922, + "step": 36057 + }, + { + "epoch": 0.4685572645185603, + "grad_norm": 0.3412632644176483, + "learning_rate": 0.0001063101937898855, + "loss": 1.3741, + "step": 36058 + }, + { + "epoch": 0.4685702590624761, + "grad_norm": 0.5062688589096069, + "learning_rate": 0.00010630759432797412, + "loss": 1.502, + "step": 36059 + }, + { + "epoch": 0.468583253606392, + "grad_norm": 0.45065438747406006, + "learning_rate": 0.00010630499486606273, + "loss": 1.3298, + "step": 36060 + }, + { + "epoch": 0.46859624815030787, + "grad_norm": 0.4056037664413452, + "learning_rate": 0.00010630239540415134, + "loss": 1.2575, + "step": 36061 + }, + { + "epoch": 0.46860924269422377, + "grad_norm": 0.34066224098205566, + "learning_rate": 0.00010629979594223995, + "loss": 1.2316, + "step": 36062 + }, + { + "epoch": 0.4686222372381396, + "grad_norm": 0.3735927939414978, + "learning_rate": 0.00010629719648032859, + "loss": 1.3024, + "step": 36063 + }, + { + "epoch": 0.4686352317820555, + "grad_norm": 0.5083009004592896, + "learning_rate": 0.0001062945970184172, + "loss": 1.2928, + "step": 36064 + }, + { + "epoch": 0.46864822632597136, + "grad_norm": 0.38366150856018066, + "learning_rate": 0.00010629199755650581, + "loss": 1.4082, + "step": 36065 + }, + { + "epoch": 0.46866122086988726, + "grad_norm": 0.4026401937007904, + "learning_rate": 0.00010628939809459441, + "loss": 1.4272, + "step": 36066 + }, + { + "epoch": 0.4686742154138031, + "grad_norm": 0.4562194347381592, + "learning_rate": 0.00010628679863268305, + "loss": 1.5625, + "step": 36067 + }, + { + "epoch": 0.468687209957719, + "grad_norm": 0.4081043004989624, + "learning_rate": 0.00010628419917077166, + "loss": 1.5431, + "step": 36068 + }, + { + "epoch": 0.46870020450163485, + "grad_norm": 0.5094277858734131, + "learning_rate": 0.00010628159970886027, + "loss": 1.4673, + "step": 36069 + }, + { + "epoch": 0.46871319904555075, + "grad_norm": 0.24190929532051086, + "learning_rate": 0.00010627900024694888, + "loss": 1.2242, + "step": 36070 + }, + { + "epoch": 0.4687261935894666, + "grad_norm": 0.4343477785587311, + "learning_rate": 0.0001062764007850375, + "loss": 1.3976, + "step": 36071 + }, + { + "epoch": 0.4687391881333825, + "grad_norm": 0.3419545292854309, + "learning_rate": 0.00010627380132312612, + "loss": 1.4483, + "step": 36072 + }, + { + "epoch": 0.46875218267729835, + "grad_norm": 0.38569071888923645, + "learning_rate": 0.00010627120186121473, + "loss": 1.5035, + "step": 36073 + }, + { + "epoch": 0.46876517722121425, + "grad_norm": 0.3363886773586273, + "learning_rate": 0.00010626860239930334, + "loss": 1.2194, + "step": 36074 + }, + { + "epoch": 0.4687781717651301, + "grad_norm": 0.3996689021587372, + "learning_rate": 0.00010626600293739198, + "loss": 1.5933, + "step": 36075 + }, + { + "epoch": 0.468791166309046, + "grad_norm": 0.359098881483078, + "learning_rate": 0.00010626340347548059, + "loss": 1.2473, + "step": 36076 + }, + { + "epoch": 0.46880416085296184, + "grad_norm": 0.4002552032470703, + "learning_rate": 0.0001062608040135692, + "loss": 1.4233, + "step": 36077 + }, + { + "epoch": 0.46881715539687774, + "grad_norm": 0.3113689720630646, + "learning_rate": 0.00010625820455165781, + "loss": 1.2333, + "step": 36078 + }, + { + "epoch": 0.4688301499407936, + "grad_norm": 0.4215928912162781, + "learning_rate": 0.00010625560508974643, + "loss": 1.3374, + "step": 36079 + }, + { + "epoch": 0.4688431444847095, + "grad_norm": 0.3352264165878296, + "learning_rate": 0.00010625300562783504, + "loss": 1.3097, + "step": 36080 + }, + { + "epoch": 0.46885613902862533, + "grad_norm": 0.4702156186103821, + "learning_rate": 0.00010625040616592366, + "loss": 1.5179, + "step": 36081 + }, + { + "epoch": 0.46886913357254123, + "grad_norm": 0.4073571264743805, + "learning_rate": 0.00010624780670401227, + "loss": 1.4341, + "step": 36082 + }, + { + "epoch": 0.4688821281164571, + "grad_norm": 0.4177555739879608, + "learning_rate": 0.00010624520724210089, + "loss": 1.3475, + "step": 36083 + }, + { + "epoch": 0.468895122660373, + "grad_norm": 0.35482218861579895, + "learning_rate": 0.0001062426077801895, + "loss": 1.3678, + "step": 36084 + }, + { + "epoch": 0.4689081172042889, + "grad_norm": 0.41617903113365173, + "learning_rate": 0.00010624000831827811, + "loss": 1.5235, + "step": 36085 + }, + { + "epoch": 0.4689211117482047, + "grad_norm": 0.37736591696739197, + "learning_rate": 0.00010623740885636672, + "loss": 1.3755, + "step": 36086 + }, + { + "epoch": 0.4689341062921206, + "grad_norm": 0.4453181326389313, + "learning_rate": 0.00010623480939445536, + "loss": 1.3108, + "step": 36087 + }, + { + "epoch": 0.46894710083603647, + "grad_norm": 0.3362225592136383, + "learning_rate": 0.00010623220993254397, + "loss": 1.3551, + "step": 36088 + }, + { + "epoch": 0.46896009537995237, + "grad_norm": 0.3900918662548065, + "learning_rate": 0.00010622961047063258, + "loss": 1.4762, + "step": 36089 + }, + { + "epoch": 0.4689730899238682, + "grad_norm": 0.41579288244247437, + "learning_rate": 0.0001062270110087212, + "loss": 1.4177, + "step": 36090 + }, + { + "epoch": 0.4689860844677841, + "grad_norm": 0.30973920226097107, + "learning_rate": 0.00010622441154680982, + "loss": 0.9713, + "step": 36091 + }, + { + "epoch": 0.46899907901169996, + "grad_norm": 0.4345738887786865, + "learning_rate": 0.00010622181208489843, + "loss": 1.4786, + "step": 36092 + }, + { + "epoch": 0.46901207355561586, + "grad_norm": 0.3496934771537781, + "learning_rate": 0.00010621921262298704, + "loss": 1.3796, + "step": 36093 + }, + { + "epoch": 0.4690250680995317, + "grad_norm": 0.43879497051239014, + "learning_rate": 0.00010621661316107568, + "loss": 1.28, + "step": 36094 + }, + { + "epoch": 0.4690380626434476, + "grad_norm": 0.43912777304649353, + "learning_rate": 0.00010621401369916428, + "loss": 1.4733, + "step": 36095 + }, + { + "epoch": 0.46905105718736345, + "grad_norm": 0.3175583481788635, + "learning_rate": 0.00010621141423725289, + "loss": 1.2069, + "step": 36096 + }, + { + "epoch": 0.46906405173127935, + "grad_norm": 0.3942506015300751, + "learning_rate": 0.0001062088147753415, + "loss": 1.4992, + "step": 36097 + }, + { + "epoch": 0.4690770462751952, + "grad_norm": 0.4250079095363617, + "learning_rate": 0.00010620621531343014, + "loss": 1.4789, + "step": 36098 + }, + { + "epoch": 0.4690900408191111, + "grad_norm": 0.40334662795066833, + "learning_rate": 0.00010620361585151875, + "loss": 1.3985, + "step": 36099 + }, + { + "epoch": 0.46910303536302694, + "grad_norm": 0.360211044549942, + "learning_rate": 0.00010620101638960736, + "loss": 1.324, + "step": 36100 + }, + { + "epoch": 0.46911602990694284, + "grad_norm": 0.43988221883773804, + "learning_rate": 0.00010619841692769597, + "loss": 1.485, + "step": 36101 + }, + { + "epoch": 0.4691290244508587, + "grad_norm": 0.3439774513244629, + "learning_rate": 0.0001061958174657846, + "loss": 1.2393, + "step": 36102 + }, + { + "epoch": 0.4691420189947746, + "grad_norm": 0.4070330858230591, + "learning_rate": 0.0001061932180038732, + "loss": 1.4063, + "step": 36103 + }, + { + "epoch": 0.46915501353869044, + "grad_norm": 0.4167833626270294, + "learning_rate": 0.00010619061854196182, + "loss": 1.2319, + "step": 36104 + }, + { + "epoch": 0.46916800808260634, + "grad_norm": 0.3463385999202728, + "learning_rate": 0.00010618801908005043, + "loss": 1.4347, + "step": 36105 + }, + { + "epoch": 0.4691810026265222, + "grad_norm": 0.38114556670188904, + "learning_rate": 0.00010618541961813906, + "loss": 1.4742, + "step": 36106 + }, + { + "epoch": 0.4691939971704381, + "grad_norm": 0.3757008910179138, + "learning_rate": 0.00010618282015622768, + "loss": 1.4366, + "step": 36107 + }, + { + "epoch": 0.4692069917143539, + "grad_norm": 0.43338048458099365, + "learning_rate": 0.00010618022069431627, + "loss": 1.1882, + "step": 36108 + }, + { + "epoch": 0.46921998625826983, + "grad_norm": 0.3667354881763458, + "learning_rate": 0.00010617762123240488, + "loss": 1.343, + "step": 36109 + }, + { + "epoch": 0.4692329808021857, + "grad_norm": 0.33875957131385803, + "learning_rate": 0.00010617502177049352, + "loss": 1.2361, + "step": 36110 + }, + { + "epoch": 0.4692459753461016, + "grad_norm": 0.4703972339630127, + "learning_rate": 0.00010617242230858213, + "loss": 1.3811, + "step": 36111 + }, + { + "epoch": 0.4692589698900174, + "grad_norm": 0.43020904064178467, + "learning_rate": 0.00010616982284667074, + "loss": 1.47, + "step": 36112 + }, + { + "epoch": 0.4692719644339333, + "grad_norm": 0.45791110396385193, + "learning_rate": 0.00010616722338475935, + "loss": 1.3289, + "step": 36113 + }, + { + "epoch": 0.46928495897784916, + "grad_norm": 0.37642526626586914, + "learning_rate": 0.00010616462392284798, + "loss": 1.5013, + "step": 36114 + }, + { + "epoch": 0.46929795352176507, + "grad_norm": 0.45203515887260437, + "learning_rate": 0.00010616202446093659, + "loss": 1.3386, + "step": 36115 + }, + { + "epoch": 0.4693109480656809, + "grad_norm": 0.3627721071243286, + "learning_rate": 0.0001061594249990252, + "loss": 1.4903, + "step": 36116 + }, + { + "epoch": 0.4693239426095968, + "grad_norm": 0.454927533864975, + "learning_rate": 0.00010615682553711381, + "loss": 1.2457, + "step": 36117 + }, + { + "epoch": 0.46933693715351266, + "grad_norm": 0.3289344310760498, + "learning_rate": 0.00010615422607520245, + "loss": 1.3405, + "step": 36118 + }, + { + "epoch": 0.46934993169742856, + "grad_norm": 0.3561593294143677, + "learning_rate": 0.00010615162661329106, + "loss": 1.2045, + "step": 36119 + }, + { + "epoch": 0.4693629262413444, + "grad_norm": 0.46802428364753723, + "learning_rate": 0.00010614902715137967, + "loss": 1.4741, + "step": 36120 + }, + { + "epoch": 0.4693759207852603, + "grad_norm": 0.46398335695266724, + "learning_rate": 0.00010614642768946827, + "loss": 1.5371, + "step": 36121 + }, + { + "epoch": 0.46938891532917615, + "grad_norm": 0.40989935398101807, + "learning_rate": 0.00010614382822755691, + "loss": 1.3655, + "step": 36122 + }, + { + "epoch": 0.46940190987309205, + "grad_norm": 0.3533971607685089, + "learning_rate": 0.00010614122876564552, + "loss": 1.3547, + "step": 36123 + }, + { + "epoch": 0.4694149044170079, + "grad_norm": 0.31980404257774353, + "learning_rate": 0.00010613862930373413, + "loss": 1.2353, + "step": 36124 + }, + { + "epoch": 0.4694278989609238, + "grad_norm": 0.35931599140167236, + "learning_rate": 0.00010613602984182274, + "loss": 1.4091, + "step": 36125 + }, + { + "epoch": 0.46944089350483964, + "grad_norm": 0.4648946523666382, + "learning_rate": 0.00010613343037991136, + "loss": 1.4173, + "step": 36126 + }, + { + "epoch": 0.46945388804875554, + "grad_norm": 0.37911030650138855, + "learning_rate": 0.00010613083091799998, + "loss": 1.3801, + "step": 36127 + }, + { + "epoch": 0.4694668825926714, + "grad_norm": 0.4516185224056244, + "learning_rate": 0.00010612823145608859, + "loss": 1.4116, + "step": 36128 + }, + { + "epoch": 0.4694798771365873, + "grad_norm": 0.4491935968399048, + "learning_rate": 0.00010612563199417722, + "loss": 1.409, + "step": 36129 + }, + { + "epoch": 0.46949287168050313, + "grad_norm": 0.3451153635978699, + "learning_rate": 0.00010612303253226584, + "loss": 1.3469, + "step": 36130 + }, + { + "epoch": 0.46950586622441903, + "grad_norm": 0.3741462528705597, + "learning_rate": 0.00010612043307035445, + "loss": 1.2449, + "step": 36131 + }, + { + "epoch": 0.4695188607683349, + "grad_norm": 0.42647701501846313, + "learning_rate": 0.00010611783360844306, + "loss": 1.4542, + "step": 36132 + }, + { + "epoch": 0.4695318553122508, + "grad_norm": 0.4272676706314087, + "learning_rate": 0.00010611523414653168, + "loss": 1.2582, + "step": 36133 + }, + { + "epoch": 0.4695448498561666, + "grad_norm": 0.42716488242149353, + "learning_rate": 0.00010611263468462029, + "loss": 1.5576, + "step": 36134 + }, + { + "epoch": 0.4695578444000825, + "grad_norm": 0.46606266498565674, + "learning_rate": 0.0001061100352227089, + "loss": 1.506, + "step": 36135 + }, + { + "epoch": 0.46957083894399837, + "grad_norm": 0.3855647146701813, + "learning_rate": 0.00010610743576079751, + "loss": 1.5454, + "step": 36136 + }, + { + "epoch": 0.46958383348791427, + "grad_norm": 0.36141031980514526, + "learning_rate": 0.00010610483629888614, + "loss": 1.3278, + "step": 36137 + }, + { + "epoch": 0.4695968280318301, + "grad_norm": 0.47629648447036743, + "learning_rate": 0.00010610223683697475, + "loss": 1.5046, + "step": 36138 + }, + { + "epoch": 0.469609822575746, + "grad_norm": 0.47294700145721436, + "learning_rate": 0.00010609963737506336, + "loss": 1.4909, + "step": 36139 + }, + { + "epoch": 0.46962281711966186, + "grad_norm": 0.4332755208015442, + "learning_rate": 0.00010609703791315197, + "loss": 1.433, + "step": 36140 + }, + { + "epoch": 0.46963581166357776, + "grad_norm": 0.4063400328159332, + "learning_rate": 0.00010609443845124061, + "loss": 1.4572, + "step": 36141 + }, + { + "epoch": 0.4696488062074936, + "grad_norm": 0.352501779794693, + "learning_rate": 0.00010609183898932922, + "loss": 1.1663, + "step": 36142 + }, + { + "epoch": 0.4696618007514095, + "grad_norm": 0.4391920268535614, + "learning_rate": 0.00010608923952741783, + "loss": 1.5187, + "step": 36143 + }, + { + "epoch": 0.46967479529532535, + "grad_norm": 0.3294104337692261, + "learning_rate": 0.00010608664006550644, + "loss": 1.3397, + "step": 36144 + }, + { + "epoch": 0.46968778983924125, + "grad_norm": 0.31026867032051086, + "learning_rate": 0.00010608404060359507, + "loss": 1.1955, + "step": 36145 + }, + { + "epoch": 0.4697007843831571, + "grad_norm": 0.4363054037094116, + "learning_rate": 0.00010608144114168368, + "loss": 1.4178, + "step": 36146 + }, + { + "epoch": 0.469713778927073, + "grad_norm": 0.4318297505378723, + "learning_rate": 0.00010607884167977229, + "loss": 1.4472, + "step": 36147 + }, + { + "epoch": 0.46972677347098885, + "grad_norm": 0.4540519714355469, + "learning_rate": 0.0001060762422178609, + "loss": 1.3004, + "step": 36148 + }, + { + "epoch": 0.46973976801490475, + "grad_norm": 0.3911750912666321, + "learning_rate": 0.00010607364275594954, + "loss": 1.347, + "step": 36149 + }, + { + "epoch": 0.4697527625588206, + "grad_norm": 0.4785829782485962, + "learning_rate": 0.00010607104329403814, + "loss": 1.2467, + "step": 36150 + }, + { + "epoch": 0.4697657571027365, + "grad_norm": 0.36873337626457214, + "learning_rate": 0.00010606844383212675, + "loss": 1.3852, + "step": 36151 + }, + { + "epoch": 0.46977875164665234, + "grad_norm": 0.3357747793197632, + "learning_rate": 0.00010606584437021536, + "loss": 1.611, + "step": 36152 + }, + { + "epoch": 0.46979174619056824, + "grad_norm": 0.45603954792022705, + "learning_rate": 0.000106063244908304, + "loss": 1.4254, + "step": 36153 + }, + { + "epoch": 0.4698047407344841, + "grad_norm": 0.4814583659172058, + "learning_rate": 0.0001060606454463926, + "loss": 1.3516, + "step": 36154 + }, + { + "epoch": 0.4698177352784, + "grad_norm": 0.4520021975040436, + "learning_rate": 0.00010605804598448122, + "loss": 1.3535, + "step": 36155 + }, + { + "epoch": 0.46983072982231583, + "grad_norm": 0.37971118092536926, + "learning_rate": 0.00010605544652256983, + "loss": 1.4058, + "step": 36156 + }, + { + "epoch": 0.46984372436623173, + "grad_norm": 0.4446203410625458, + "learning_rate": 0.00010605284706065845, + "loss": 1.4121, + "step": 36157 + }, + { + "epoch": 0.4698567189101476, + "grad_norm": 0.38405925035476685, + "learning_rate": 0.00010605024759874706, + "loss": 1.31, + "step": 36158 + }, + { + "epoch": 0.4698697134540635, + "grad_norm": 0.46519026160240173, + "learning_rate": 0.00010604764813683567, + "loss": 1.1816, + "step": 36159 + }, + { + "epoch": 0.4698827079979793, + "grad_norm": 0.3670690655708313, + "learning_rate": 0.00010604504867492429, + "loss": 1.4234, + "step": 36160 + }, + { + "epoch": 0.4698957025418952, + "grad_norm": 0.4492076337337494, + "learning_rate": 0.00010604244921301292, + "loss": 1.5289, + "step": 36161 + }, + { + "epoch": 0.4699086970858111, + "grad_norm": 0.39481213688850403, + "learning_rate": 0.00010603984975110153, + "loss": 1.3903, + "step": 36162 + }, + { + "epoch": 0.46992169162972697, + "grad_norm": 0.4510393738746643, + "learning_rate": 0.00010603725028919013, + "loss": 1.4053, + "step": 36163 + }, + { + "epoch": 0.46993468617364287, + "grad_norm": 0.36320334672927856, + "learning_rate": 0.00010603465082727874, + "loss": 1.2432, + "step": 36164 + }, + { + "epoch": 0.4699476807175587, + "grad_norm": 0.4383608102798462, + "learning_rate": 0.00010603205136536738, + "loss": 1.3882, + "step": 36165 + }, + { + "epoch": 0.4699606752614746, + "grad_norm": 0.41992902755737305, + "learning_rate": 0.00010602945190345599, + "loss": 1.2374, + "step": 36166 + }, + { + "epoch": 0.46997366980539046, + "grad_norm": 0.398365318775177, + "learning_rate": 0.0001060268524415446, + "loss": 1.3987, + "step": 36167 + }, + { + "epoch": 0.46998666434930636, + "grad_norm": 0.47090840339660645, + "learning_rate": 0.00010602425297963323, + "loss": 1.5249, + "step": 36168 + }, + { + "epoch": 0.4699996588932222, + "grad_norm": 0.37851324677467346, + "learning_rate": 0.00010602165351772184, + "loss": 1.2214, + "step": 36169 + }, + { + "epoch": 0.4700126534371381, + "grad_norm": 0.3921733796596527, + "learning_rate": 0.00010601905405581045, + "loss": 1.374, + "step": 36170 + }, + { + "epoch": 0.47002564798105395, + "grad_norm": 0.504626989364624, + "learning_rate": 0.00010601645459389906, + "loss": 1.451, + "step": 36171 + }, + { + "epoch": 0.47003864252496985, + "grad_norm": 0.4621210992336273, + "learning_rate": 0.0001060138551319877, + "loss": 1.7462, + "step": 36172 + }, + { + "epoch": 0.4700516370688857, + "grad_norm": 0.32782498002052307, + "learning_rate": 0.00010601125567007631, + "loss": 1.3671, + "step": 36173 + }, + { + "epoch": 0.4700646316128016, + "grad_norm": 0.25939950346946716, + "learning_rate": 0.00010600865620816492, + "loss": 1.2563, + "step": 36174 + }, + { + "epoch": 0.47007762615671744, + "grad_norm": 0.4152630865573883, + "learning_rate": 0.00010600605674625352, + "loss": 1.2558, + "step": 36175 + }, + { + "epoch": 0.47009062070063334, + "grad_norm": 0.31859588623046875, + "learning_rate": 0.00010600345728434216, + "loss": 1.5866, + "step": 36176 + }, + { + "epoch": 0.4701036152445492, + "grad_norm": 0.4142383933067322, + "learning_rate": 0.00010600085782243077, + "loss": 1.5393, + "step": 36177 + }, + { + "epoch": 0.4701166097884651, + "grad_norm": 0.45889878273010254, + "learning_rate": 0.00010599825836051938, + "loss": 1.4033, + "step": 36178 + }, + { + "epoch": 0.47012960433238093, + "grad_norm": 0.38180506229400635, + "learning_rate": 0.00010599565889860799, + "loss": 1.4913, + "step": 36179 + }, + { + "epoch": 0.47014259887629684, + "grad_norm": 0.4229165315628052, + "learning_rate": 0.00010599305943669661, + "loss": 1.4389, + "step": 36180 + }, + { + "epoch": 0.4701555934202127, + "grad_norm": 0.5370358228683472, + "learning_rate": 0.00010599045997478522, + "loss": 1.4634, + "step": 36181 + }, + { + "epoch": 0.4701685879641286, + "grad_norm": 0.3709205090999603, + "learning_rate": 0.00010598786051287383, + "loss": 1.4102, + "step": 36182 + }, + { + "epoch": 0.4701815825080444, + "grad_norm": 0.41304388642311096, + "learning_rate": 0.00010598526105096245, + "loss": 1.4797, + "step": 36183 + }, + { + "epoch": 0.4701945770519603, + "grad_norm": 0.37425580620765686, + "learning_rate": 0.00010598266158905108, + "loss": 1.4339, + "step": 36184 + }, + { + "epoch": 0.4702075715958762, + "grad_norm": 0.48043403029441833, + "learning_rate": 0.0001059800621271397, + "loss": 1.3831, + "step": 36185 + }, + { + "epoch": 0.4702205661397921, + "grad_norm": 0.49232208728790283, + "learning_rate": 0.0001059774626652283, + "loss": 1.4723, + "step": 36186 + }, + { + "epoch": 0.4702335606837079, + "grad_norm": 0.3689571022987366, + "learning_rate": 0.00010597486320331692, + "loss": 1.42, + "step": 36187 + }, + { + "epoch": 0.4702465552276238, + "grad_norm": 0.43442389369010925, + "learning_rate": 0.00010597226374140554, + "loss": 1.3438, + "step": 36188 + }, + { + "epoch": 0.47025954977153966, + "grad_norm": 0.3032548129558563, + "learning_rate": 0.00010596966427949415, + "loss": 1.333, + "step": 36189 + }, + { + "epoch": 0.47027254431545557, + "grad_norm": 0.3889909088611603, + "learning_rate": 0.00010596706481758276, + "loss": 1.4233, + "step": 36190 + }, + { + "epoch": 0.4702855388593714, + "grad_norm": 0.6050897240638733, + "learning_rate": 0.00010596446535567137, + "loss": 1.3794, + "step": 36191 + }, + { + "epoch": 0.4702985334032873, + "grad_norm": 0.35007837414741516, + "learning_rate": 0.00010596186589376, + "loss": 1.5835, + "step": 36192 + }, + { + "epoch": 0.47031152794720316, + "grad_norm": 0.36003994941711426, + "learning_rate": 0.00010595926643184861, + "loss": 1.4002, + "step": 36193 + }, + { + "epoch": 0.47032452249111906, + "grad_norm": 0.490487277507782, + "learning_rate": 0.00010595666696993722, + "loss": 1.3871, + "step": 36194 + }, + { + "epoch": 0.4703375170350349, + "grad_norm": 0.4509434103965759, + "learning_rate": 0.00010595406750802583, + "loss": 1.4859, + "step": 36195 + }, + { + "epoch": 0.4703505115789508, + "grad_norm": 0.41708970069885254, + "learning_rate": 0.00010595146804611447, + "loss": 1.3753, + "step": 36196 + }, + { + "epoch": 0.47036350612286665, + "grad_norm": 0.4493131935596466, + "learning_rate": 0.00010594886858420308, + "loss": 1.3018, + "step": 36197 + }, + { + "epoch": 0.47037650066678255, + "grad_norm": 0.44261786341667175, + "learning_rate": 0.00010594626912229169, + "loss": 1.3897, + "step": 36198 + }, + { + "epoch": 0.4703894952106984, + "grad_norm": 0.5020904541015625, + "learning_rate": 0.0001059436696603803, + "loss": 1.5236, + "step": 36199 + }, + { + "epoch": 0.4704024897546143, + "grad_norm": 0.34281373023986816, + "learning_rate": 0.00010594107019846893, + "loss": 1.3461, + "step": 36200 + }, + { + "epoch": 0.47041548429853014, + "grad_norm": 0.411078542470932, + "learning_rate": 0.00010593847073655754, + "loss": 1.3418, + "step": 36201 + }, + { + "epoch": 0.47042847884244604, + "grad_norm": 0.4068673551082611, + "learning_rate": 0.00010593587127464615, + "loss": 1.4778, + "step": 36202 + }, + { + "epoch": 0.4704414733863619, + "grad_norm": 0.40526413917541504, + "learning_rate": 0.00010593327181273479, + "loss": 1.3133, + "step": 36203 + }, + { + "epoch": 0.4704544679302778, + "grad_norm": 0.31847894191741943, + "learning_rate": 0.0001059306723508234, + "loss": 1.4498, + "step": 36204 + }, + { + "epoch": 0.47046746247419363, + "grad_norm": 0.38429343700408936, + "learning_rate": 0.000105928072888912, + "loss": 1.4061, + "step": 36205 + }, + { + "epoch": 0.47048045701810953, + "grad_norm": 0.34437814354896545, + "learning_rate": 0.0001059254734270006, + "loss": 1.3711, + "step": 36206 + }, + { + "epoch": 0.4704934515620254, + "grad_norm": 0.40567654371261597, + "learning_rate": 0.00010592287396508924, + "loss": 1.3001, + "step": 36207 + }, + { + "epoch": 0.4705064461059413, + "grad_norm": 0.40077120065689087, + "learning_rate": 0.00010592027450317785, + "loss": 1.4081, + "step": 36208 + }, + { + "epoch": 0.4705194406498571, + "grad_norm": 0.4596070945262909, + "learning_rate": 0.00010591767504126646, + "loss": 1.5516, + "step": 36209 + }, + { + "epoch": 0.470532435193773, + "grad_norm": 0.4369564950466156, + "learning_rate": 0.00010591507557935508, + "loss": 1.5924, + "step": 36210 + }, + { + "epoch": 0.47054542973768887, + "grad_norm": 0.4103551506996155, + "learning_rate": 0.0001059124761174437, + "loss": 1.2621, + "step": 36211 + }, + { + "epoch": 0.47055842428160477, + "grad_norm": 0.4694206416606903, + "learning_rate": 0.00010590987665553231, + "loss": 1.3921, + "step": 36212 + }, + { + "epoch": 0.4705714188255206, + "grad_norm": 0.5605316758155823, + "learning_rate": 0.00010590727719362092, + "loss": 1.5591, + "step": 36213 + }, + { + "epoch": 0.4705844133694365, + "grad_norm": 0.44309210777282715, + "learning_rate": 0.00010590467773170953, + "loss": 1.3643, + "step": 36214 + }, + { + "epoch": 0.47059740791335236, + "grad_norm": 0.4272473156452179, + "learning_rate": 0.00010590207826979817, + "loss": 1.2634, + "step": 36215 + }, + { + "epoch": 0.47061040245726826, + "grad_norm": 0.49067726731300354, + "learning_rate": 0.00010589947880788678, + "loss": 1.5069, + "step": 36216 + }, + { + "epoch": 0.4706233970011841, + "grad_norm": 0.4125564992427826, + "learning_rate": 0.00010589687934597538, + "loss": 1.3796, + "step": 36217 + }, + { + "epoch": 0.4706363915451, + "grad_norm": 0.4353954493999481, + "learning_rate": 0.00010589427988406399, + "loss": 1.4034, + "step": 36218 + }, + { + "epoch": 0.47064938608901585, + "grad_norm": 0.3864523768424988, + "learning_rate": 0.00010589168042215263, + "loss": 1.4728, + "step": 36219 + }, + { + "epoch": 0.47066238063293175, + "grad_norm": 0.3618791997432709, + "learning_rate": 0.00010588908096024124, + "loss": 1.2799, + "step": 36220 + }, + { + "epoch": 0.4706753751768476, + "grad_norm": 0.4934016764163971, + "learning_rate": 0.00010588648149832985, + "loss": 1.3761, + "step": 36221 + }, + { + "epoch": 0.4706883697207635, + "grad_norm": 0.3758346736431122, + "learning_rate": 0.00010588388203641846, + "loss": 1.3483, + "step": 36222 + }, + { + "epoch": 0.47070136426467934, + "grad_norm": 0.4102843999862671, + "learning_rate": 0.00010588128257450709, + "loss": 1.6026, + "step": 36223 + }, + { + "epoch": 0.47071435880859525, + "grad_norm": 0.3525262773036957, + "learning_rate": 0.0001058786831125957, + "loss": 1.3236, + "step": 36224 + }, + { + "epoch": 0.4707273533525111, + "grad_norm": 0.37301939725875854, + "learning_rate": 0.00010587608365068431, + "loss": 1.3284, + "step": 36225 + }, + { + "epoch": 0.470740347896427, + "grad_norm": 0.4437795579433441, + "learning_rate": 0.00010587348418877292, + "loss": 1.3931, + "step": 36226 + }, + { + "epoch": 0.47075334244034284, + "grad_norm": 0.3568170368671417, + "learning_rate": 0.00010587088472686156, + "loss": 1.2669, + "step": 36227 + }, + { + "epoch": 0.47076633698425874, + "grad_norm": 0.3910858929157257, + "learning_rate": 0.00010586828526495017, + "loss": 1.4464, + "step": 36228 + }, + { + "epoch": 0.4707793315281746, + "grad_norm": 0.3587120473384857, + "learning_rate": 0.00010586568580303878, + "loss": 1.3654, + "step": 36229 + }, + { + "epoch": 0.4707923260720905, + "grad_norm": 0.4077349901199341, + "learning_rate": 0.00010586308634112738, + "loss": 1.3685, + "step": 36230 + }, + { + "epoch": 0.47080532061600633, + "grad_norm": 0.3619917035102844, + "learning_rate": 0.00010586048687921601, + "loss": 1.3385, + "step": 36231 + }, + { + "epoch": 0.47081831515992223, + "grad_norm": 0.33306464552879333, + "learning_rate": 0.00010585788741730462, + "loss": 1.3687, + "step": 36232 + }, + { + "epoch": 0.4708313097038381, + "grad_norm": 0.4068826735019684, + "learning_rate": 0.00010585528795539324, + "loss": 1.4446, + "step": 36233 + }, + { + "epoch": 0.470844304247754, + "grad_norm": 0.4388924837112427, + "learning_rate": 0.00010585268849348185, + "loss": 1.3201, + "step": 36234 + }, + { + "epoch": 0.4708572987916698, + "grad_norm": 0.40682846307754517, + "learning_rate": 0.00010585008903157047, + "loss": 1.4232, + "step": 36235 + }, + { + "epoch": 0.4708702933355857, + "grad_norm": 0.45418500900268555, + "learning_rate": 0.00010584748956965908, + "loss": 1.3276, + "step": 36236 + }, + { + "epoch": 0.4708832878795016, + "grad_norm": 0.35614830255508423, + "learning_rate": 0.00010584489010774769, + "loss": 1.294, + "step": 36237 + }, + { + "epoch": 0.47089628242341747, + "grad_norm": 0.3680225908756256, + "learning_rate": 0.0001058422906458363, + "loss": 1.5783, + "step": 36238 + }, + { + "epoch": 0.47090927696733337, + "grad_norm": 0.4316016137599945, + "learning_rate": 0.00010583969118392494, + "loss": 1.4524, + "step": 36239 + }, + { + "epoch": 0.4709222715112492, + "grad_norm": 0.504129946231842, + "learning_rate": 0.00010583709172201355, + "loss": 1.3935, + "step": 36240 + }, + { + "epoch": 0.4709352660551651, + "grad_norm": 0.5460162162780762, + "learning_rate": 0.00010583449226010216, + "loss": 1.4641, + "step": 36241 + }, + { + "epoch": 0.47094826059908096, + "grad_norm": 0.42141902446746826, + "learning_rate": 0.00010583189279819079, + "loss": 1.3465, + "step": 36242 + }, + { + "epoch": 0.47096125514299686, + "grad_norm": 0.3766111135482788, + "learning_rate": 0.0001058292933362794, + "loss": 1.6174, + "step": 36243 + }, + { + "epoch": 0.4709742496869127, + "grad_norm": 0.4819640517234802, + "learning_rate": 0.00010582669387436801, + "loss": 1.282, + "step": 36244 + }, + { + "epoch": 0.4709872442308286, + "grad_norm": 0.37777456641197205, + "learning_rate": 0.00010582409441245662, + "loss": 1.3853, + "step": 36245 + }, + { + "epoch": 0.47100023877474445, + "grad_norm": 0.4598681628704071, + "learning_rate": 0.00010582149495054526, + "loss": 1.2891, + "step": 36246 + }, + { + "epoch": 0.47101323331866035, + "grad_norm": 0.4218175709247589, + "learning_rate": 0.00010581889548863386, + "loss": 1.2058, + "step": 36247 + }, + { + "epoch": 0.4710262278625762, + "grad_norm": 0.3776571452617645, + "learning_rate": 0.00010581629602672247, + "loss": 1.3393, + "step": 36248 + }, + { + "epoch": 0.4710392224064921, + "grad_norm": 0.4550991654396057, + "learning_rate": 0.00010581369656481108, + "loss": 1.4118, + "step": 36249 + }, + { + "epoch": 0.47105221695040794, + "grad_norm": 0.45695239305496216, + "learning_rate": 0.00010581109710289972, + "loss": 1.3107, + "step": 36250 + }, + { + "epoch": 0.47106521149432384, + "grad_norm": 0.46862199902534485, + "learning_rate": 0.00010580849764098833, + "loss": 1.5385, + "step": 36251 + }, + { + "epoch": 0.4710782060382397, + "grad_norm": 0.48222196102142334, + "learning_rate": 0.00010580589817907694, + "loss": 1.5039, + "step": 36252 + }, + { + "epoch": 0.4710912005821556, + "grad_norm": 0.4670088291168213, + "learning_rate": 0.00010580329871716555, + "loss": 1.3562, + "step": 36253 + }, + { + "epoch": 0.47110419512607143, + "grad_norm": 0.3355959355831146, + "learning_rate": 0.00010580069925525417, + "loss": 1.0809, + "step": 36254 + }, + { + "epoch": 0.47111718966998734, + "grad_norm": 0.4073513448238373, + "learning_rate": 0.00010579809979334278, + "loss": 1.3185, + "step": 36255 + }, + { + "epoch": 0.4711301842139032, + "grad_norm": 0.5075585842132568, + "learning_rate": 0.0001057955003314314, + "loss": 1.6212, + "step": 36256 + }, + { + "epoch": 0.4711431787578191, + "grad_norm": 0.44583189487457275, + "learning_rate": 0.00010579290086952, + "loss": 1.4864, + "step": 36257 + }, + { + "epoch": 0.4711561733017349, + "grad_norm": 0.3509584665298462, + "learning_rate": 0.00010579030140760864, + "loss": 1.3396, + "step": 36258 + }, + { + "epoch": 0.4711691678456508, + "grad_norm": 0.40845754742622375, + "learning_rate": 0.00010578770194569724, + "loss": 1.3736, + "step": 36259 + }, + { + "epoch": 0.47118216238956667, + "grad_norm": 0.4085879325866699, + "learning_rate": 0.00010578510248378585, + "loss": 1.442, + "step": 36260 + }, + { + "epoch": 0.4711951569334826, + "grad_norm": 0.3478677570819855, + "learning_rate": 0.00010578250302187446, + "loss": 1.2844, + "step": 36261 + }, + { + "epoch": 0.4712081514773984, + "grad_norm": 0.33995118737220764, + "learning_rate": 0.0001057799035599631, + "loss": 1.4346, + "step": 36262 + }, + { + "epoch": 0.4712211460213143, + "grad_norm": 0.3997320532798767, + "learning_rate": 0.00010577730409805171, + "loss": 1.4871, + "step": 36263 + }, + { + "epoch": 0.47123414056523016, + "grad_norm": 0.4496322572231293, + "learning_rate": 0.00010577470463614032, + "loss": 1.3367, + "step": 36264 + }, + { + "epoch": 0.47124713510914606, + "grad_norm": 0.394094854593277, + "learning_rate": 0.00010577210517422893, + "loss": 1.4986, + "step": 36265 + }, + { + "epoch": 0.4712601296530619, + "grad_norm": 0.4957236349582672, + "learning_rate": 0.00010576950571231756, + "loss": 1.3492, + "step": 36266 + }, + { + "epoch": 0.4712731241969778, + "grad_norm": 0.5052787065505981, + "learning_rate": 0.00010576690625040617, + "loss": 1.3471, + "step": 36267 + }, + { + "epoch": 0.47128611874089366, + "grad_norm": 0.44330331683158875, + "learning_rate": 0.00010576430678849478, + "loss": 1.4152, + "step": 36268 + }, + { + "epoch": 0.47129911328480956, + "grad_norm": 0.36066582798957825, + "learning_rate": 0.00010576170732658339, + "loss": 1.3811, + "step": 36269 + }, + { + "epoch": 0.4713121078287254, + "grad_norm": 0.4625958204269409, + "learning_rate": 0.00010575910786467203, + "loss": 1.5199, + "step": 36270 + }, + { + "epoch": 0.4713251023726413, + "grad_norm": 0.33577123284339905, + "learning_rate": 0.00010575650840276064, + "loss": 1.2746, + "step": 36271 + }, + { + "epoch": 0.47133809691655715, + "grad_norm": 0.43062230944633484, + "learning_rate": 0.00010575390894084924, + "loss": 1.2863, + "step": 36272 + }, + { + "epoch": 0.47135109146047305, + "grad_norm": 0.41660013794898987, + "learning_rate": 0.00010575130947893785, + "loss": 1.3425, + "step": 36273 + }, + { + "epoch": 0.4713640860043889, + "grad_norm": 0.4076140224933624, + "learning_rate": 0.00010574871001702649, + "loss": 1.2959, + "step": 36274 + }, + { + "epoch": 0.4713770805483048, + "grad_norm": 0.4022112190723419, + "learning_rate": 0.0001057461105551151, + "loss": 1.5153, + "step": 36275 + }, + { + "epoch": 0.47139007509222064, + "grad_norm": 0.42412760853767395, + "learning_rate": 0.00010574351109320371, + "loss": 1.4482, + "step": 36276 + }, + { + "epoch": 0.47140306963613654, + "grad_norm": 0.4352869391441345, + "learning_rate": 0.00010574091163129232, + "loss": 1.3564, + "step": 36277 + }, + { + "epoch": 0.4714160641800524, + "grad_norm": 0.3720398247241974, + "learning_rate": 0.00010573831216938094, + "loss": 1.2117, + "step": 36278 + }, + { + "epoch": 0.4714290587239683, + "grad_norm": 0.40006303787231445, + "learning_rate": 0.00010573571270746956, + "loss": 1.2733, + "step": 36279 + }, + { + "epoch": 0.47144205326788413, + "grad_norm": 0.4771147072315216, + "learning_rate": 0.00010573311324555817, + "loss": 1.4599, + "step": 36280 + }, + { + "epoch": 0.47145504781180003, + "grad_norm": 0.4586285352706909, + "learning_rate": 0.0001057305137836468, + "loss": 1.4978, + "step": 36281 + }, + { + "epoch": 0.4714680423557159, + "grad_norm": 0.4073632061481476, + "learning_rate": 0.00010572791432173542, + "loss": 1.4803, + "step": 36282 + }, + { + "epoch": 0.4714810368996318, + "grad_norm": 0.3795454800128937, + "learning_rate": 0.00010572531485982403, + "loss": 1.4133, + "step": 36283 + }, + { + "epoch": 0.4714940314435476, + "grad_norm": 0.41940581798553467, + "learning_rate": 0.00010572271539791264, + "loss": 1.4424, + "step": 36284 + }, + { + "epoch": 0.4715070259874635, + "grad_norm": 0.3589300513267517, + "learning_rate": 0.00010572011593600126, + "loss": 1.3594, + "step": 36285 + }, + { + "epoch": 0.47152002053137937, + "grad_norm": 0.353777676820755, + "learning_rate": 0.00010571751647408987, + "loss": 1.4152, + "step": 36286 + }, + { + "epoch": 0.47153301507529527, + "grad_norm": 0.42814257740974426, + "learning_rate": 0.00010571491701217848, + "loss": 1.3781, + "step": 36287 + }, + { + "epoch": 0.4715460096192111, + "grad_norm": 0.3771434724330902, + "learning_rate": 0.0001057123175502671, + "loss": 1.1632, + "step": 36288 + }, + { + "epoch": 0.471559004163127, + "grad_norm": 0.39288365840911865, + "learning_rate": 0.00010570971808835572, + "loss": 1.505, + "step": 36289 + }, + { + "epoch": 0.47157199870704286, + "grad_norm": 0.44034984707832336, + "learning_rate": 0.00010570711862644433, + "loss": 1.335, + "step": 36290 + }, + { + "epoch": 0.47158499325095876, + "grad_norm": 0.3641902506351471, + "learning_rate": 0.00010570451916453294, + "loss": 1.3598, + "step": 36291 + }, + { + "epoch": 0.4715979877948746, + "grad_norm": 0.4167522192001343, + "learning_rate": 0.00010570191970262155, + "loss": 1.5018, + "step": 36292 + }, + { + "epoch": 0.4716109823387905, + "grad_norm": 0.4199286997318268, + "learning_rate": 0.00010569932024071019, + "loss": 1.4505, + "step": 36293 + }, + { + "epoch": 0.47162397688270635, + "grad_norm": 0.5240686535835266, + "learning_rate": 0.0001056967207787988, + "loss": 1.4292, + "step": 36294 + }, + { + "epoch": 0.47163697142662225, + "grad_norm": 0.407202810049057, + "learning_rate": 0.00010569412131688741, + "loss": 1.4553, + "step": 36295 + }, + { + "epoch": 0.4716499659705381, + "grad_norm": 0.29592278599739075, + "learning_rate": 0.00010569152185497602, + "loss": 1.3777, + "step": 36296 + }, + { + "epoch": 0.471662960514454, + "grad_norm": 0.3960427939891815, + "learning_rate": 0.00010568892239306465, + "loss": 1.3455, + "step": 36297 + }, + { + "epoch": 0.47167595505836984, + "grad_norm": 0.40755268931388855, + "learning_rate": 0.00010568632293115326, + "loss": 1.221, + "step": 36298 + }, + { + "epoch": 0.47168894960228575, + "grad_norm": 0.373852014541626, + "learning_rate": 0.00010568372346924187, + "loss": 1.3157, + "step": 36299 + }, + { + "epoch": 0.4717019441462016, + "grad_norm": 0.4340220093727112, + "learning_rate": 0.00010568112400733048, + "loss": 1.3878, + "step": 36300 + }, + { + "epoch": 0.4717149386901175, + "grad_norm": 0.48475176095962524, + "learning_rate": 0.0001056785245454191, + "loss": 1.5269, + "step": 36301 + }, + { + "epoch": 0.47172793323403334, + "grad_norm": 0.4359365403652191, + "learning_rate": 0.00010567592508350772, + "loss": 1.4827, + "step": 36302 + }, + { + "epoch": 0.47174092777794924, + "grad_norm": 0.4574839770793915, + "learning_rate": 0.00010567332562159633, + "loss": 1.3209, + "step": 36303 + }, + { + "epoch": 0.4717539223218651, + "grad_norm": 0.3582971394062042, + "learning_rate": 0.00010567072615968494, + "loss": 1.2927, + "step": 36304 + }, + { + "epoch": 0.471766916865781, + "grad_norm": 0.46742722392082214, + "learning_rate": 0.00010566812669777358, + "loss": 1.3667, + "step": 36305 + }, + { + "epoch": 0.47177991140969683, + "grad_norm": 0.3916938006877899, + "learning_rate": 0.00010566552723586219, + "loss": 1.5274, + "step": 36306 + }, + { + "epoch": 0.47179290595361273, + "grad_norm": 0.44973987340927124, + "learning_rate": 0.0001056629277739508, + "loss": 1.5543, + "step": 36307 + }, + { + "epoch": 0.4718059004975286, + "grad_norm": 0.2684609591960907, + "learning_rate": 0.00010566032831203941, + "loss": 1.3546, + "step": 36308 + }, + { + "epoch": 0.4718188950414445, + "grad_norm": 0.3880734443664551, + "learning_rate": 0.00010565772885012803, + "loss": 1.3502, + "step": 36309 + }, + { + "epoch": 0.4718318895853603, + "grad_norm": 0.31345921754837036, + "learning_rate": 0.00010565512938821664, + "loss": 1.2805, + "step": 36310 + }, + { + "epoch": 0.4718448841292762, + "grad_norm": 0.3631223440170288, + "learning_rate": 0.00010565252992630525, + "loss": 1.4915, + "step": 36311 + }, + { + "epoch": 0.47185787867319207, + "grad_norm": 0.3622957766056061, + "learning_rate": 0.00010564993046439387, + "loss": 1.3546, + "step": 36312 + }, + { + "epoch": 0.47187087321710797, + "grad_norm": 0.4121716618537903, + "learning_rate": 0.0001056473310024825, + "loss": 1.5297, + "step": 36313 + }, + { + "epoch": 0.47188386776102387, + "grad_norm": 0.3836487829685211, + "learning_rate": 0.0001056447315405711, + "loss": 1.239, + "step": 36314 + }, + { + "epoch": 0.4718968623049397, + "grad_norm": 0.3267301321029663, + "learning_rate": 0.00010564213207865971, + "loss": 1.4449, + "step": 36315 + }, + { + "epoch": 0.4719098568488556, + "grad_norm": 0.3430098295211792, + "learning_rate": 0.00010563953261674835, + "loss": 1.3664, + "step": 36316 + }, + { + "epoch": 0.47192285139277146, + "grad_norm": 0.3294520676136017, + "learning_rate": 0.00010563693315483696, + "loss": 1.4008, + "step": 36317 + }, + { + "epoch": 0.47193584593668736, + "grad_norm": 0.41586562991142273, + "learning_rate": 0.00010563433369292557, + "loss": 1.4017, + "step": 36318 + }, + { + "epoch": 0.4719488404806032, + "grad_norm": 0.3817497491836548, + "learning_rate": 0.00010563173423101418, + "loss": 1.2898, + "step": 36319 + }, + { + "epoch": 0.4719618350245191, + "grad_norm": 0.3957844376564026, + "learning_rate": 0.00010562913476910281, + "loss": 1.3424, + "step": 36320 + }, + { + "epoch": 0.47197482956843495, + "grad_norm": 0.45254358649253845, + "learning_rate": 0.00010562653530719142, + "loss": 1.468, + "step": 36321 + }, + { + "epoch": 0.47198782411235085, + "grad_norm": 0.44081953167915344, + "learning_rate": 0.00010562393584528003, + "loss": 1.3416, + "step": 36322 + }, + { + "epoch": 0.4720008186562667, + "grad_norm": 0.4020347595214844, + "learning_rate": 0.00010562133638336864, + "loss": 1.2994, + "step": 36323 + }, + { + "epoch": 0.4720138132001826, + "grad_norm": 0.2760070264339447, + "learning_rate": 0.00010561873692145728, + "loss": 1.3504, + "step": 36324 + }, + { + "epoch": 0.47202680774409844, + "grad_norm": 0.33787959814071655, + "learning_rate": 0.00010561613745954589, + "loss": 1.3264, + "step": 36325 + }, + { + "epoch": 0.47203980228801434, + "grad_norm": 0.40708714723587036, + "learning_rate": 0.0001056135379976345, + "loss": 1.5959, + "step": 36326 + }, + { + "epoch": 0.4720527968319302, + "grad_norm": 0.4376273453235626, + "learning_rate": 0.0001056109385357231, + "loss": 1.3635, + "step": 36327 + }, + { + "epoch": 0.4720657913758461, + "grad_norm": 0.4282427430152893, + "learning_rate": 0.00010560833907381174, + "loss": 1.2953, + "step": 36328 + }, + { + "epoch": 0.47207878591976193, + "grad_norm": 0.466429740190506, + "learning_rate": 0.00010560573961190035, + "loss": 1.4216, + "step": 36329 + }, + { + "epoch": 0.47209178046367783, + "grad_norm": 0.4172934889793396, + "learning_rate": 0.00010560314014998896, + "loss": 1.3632, + "step": 36330 + }, + { + "epoch": 0.4721047750075937, + "grad_norm": 0.394612193107605, + "learning_rate": 0.00010560054068807757, + "loss": 1.3904, + "step": 36331 + }, + { + "epoch": 0.4721177695515096, + "grad_norm": 0.3224700391292572, + "learning_rate": 0.00010559794122616619, + "loss": 1.2999, + "step": 36332 + }, + { + "epoch": 0.4721307640954254, + "grad_norm": 0.4206881821155548, + "learning_rate": 0.0001055953417642548, + "loss": 1.3353, + "step": 36333 + }, + { + "epoch": 0.4721437586393413, + "grad_norm": 0.40402668714523315, + "learning_rate": 0.00010559274230234341, + "loss": 1.4871, + "step": 36334 + }, + { + "epoch": 0.47215675318325717, + "grad_norm": 0.3820200264453888, + "learning_rate": 0.00010559014284043203, + "loss": 1.3762, + "step": 36335 + }, + { + "epoch": 0.4721697477271731, + "grad_norm": 0.45298463106155396, + "learning_rate": 0.00010558754337852066, + "loss": 1.2478, + "step": 36336 + }, + { + "epoch": 0.4721827422710889, + "grad_norm": 0.3556426465511322, + "learning_rate": 0.00010558494391660927, + "loss": 1.4383, + "step": 36337 + }, + { + "epoch": 0.4721957368150048, + "grad_norm": 0.4001607298851013, + "learning_rate": 0.00010558234445469788, + "loss": 1.3547, + "step": 36338 + }, + { + "epoch": 0.47220873135892066, + "grad_norm": 0.40171200037002563, + "learning_rate": 0.00010557974499278648, + "loss": 1.5491, + "step": 36339 + }, + { + "epoch": 0.47222172590283656, + "grad_norm": 0.3648524880409241, + "learning_rate": 0.00010557714553087512, + "loss": 1.4328, + "step": 36340 + }, + { + "epoch": 0.4722347204467524, + "grad_norm": 0.3911897540092468, + "learning_rate": 0.00010557454606896373, + "loss": 1.3298, + "step": 36341 + }, + { + "epoch": 0.4722477149906683, + "grad_norm": 0.4505600035190582, + "learning_rate": 0.00010557194660705234, + "loss": 1.4038, + "step": 36342 + }, + { + "epoch": 0.47226070953458416, + "grad_norm": 0.44918322563171387, + "learning_rate": 0.00010556934714514095, + "loss": 1.4143, + "step": 36343 + }, + { + "epoch": 0.47227370407850006, + "grad_norm": 0.4704125225543976, + "learning_rate": 0.00010556674768322958, + "loss": 1.3556, + "step": 36344 + }, + { + "epoch": 0.4722866986224159, + "grad_norm": 0.3804517388343811, + "learning_rate": 0.00010556414822131819, + "loss": 1.4992, + "step": 36345 + }, + { + "epoch": 0.4722996931663318, + "grad_norm": 0.33664894104003906, + "learning_rate": 0.0001055615487594068, + "loss": 1.5141, + "step": 36346 + }, + { + "epoch": 0.47231268771024765, + "grad_norm": 0.3504297137260437, + "learning_rate": 0.00010555894929749541, + "loss": 1.4532, + "step": 36347 + }, + { + "epoch": 0.47232568225416355, + "grad_norm": 0.4832122325897217, + "learning_rate": 0.00010555634983558405, + "loss": 1.221, + "step": 36348 + }, + { + "epoch": 0.4723386767980794, + "grad_norm": 0.3068981468677521, + "learning_rate": 0.00010555375037367266, + "loss": 1.3667, + "step": 36349 + }, + { + "epoch": 0.4723516713419953, + "grad_norm": 0.4074302017688751, + "learning_rate": 0.00010555115091176127, + "loss": 1.5373, + "step": 36350 + }, + { + "epoch": 0.47236466588591114, + "grad_norm": 0.43595632910728455, + "learning_rate": 0.00010554855144984988, + "loss": 1.3623, + "step": 36351 + }, + { + "epoch": 0.47237766042982704, + "grad_norm": 0.38892650604248047, + "learning_rate": 0.0001055459519879385, + "loss": 1.2509, + "step": 36352 + }, + { + "epoch": 0.4723906549737429, + "grad_norm": 0.39849451184272766, + "learning_rate": 0.00010554335252602712, + "loss": 1.3672, + "step": 36353 + }, + { + "epoch": 0.4724036495176588, + "grad_norm": 0.4168280363082886, + "learning_rate": 0.00010554075306411573, + "loss": 1.2418, + "step": 36354 + }, + { + "epoch": 0.47241664406157463, + "grad_norm": 0.44292542338371277, + "learning_rate": 0.00010553815360220437, + "loss": 1.4295, + "step": 36355 + }, + { + "epoch": 0.47242963860549053, + "grad_norm": 0.33598068356513977, + "learning_rate": 0.00010553555414029296, + "loss": 1.4127, + "step": 36356 + }, + { + "epoch": 0.4724426331494064, + "grad_norm": 0.3550907373428345, + "learning_rate": 0.00010553295467838157, + "loss": 1.3317, + "step": 36357 + }, + { + "epoch": 0.4724556276933223, + "grad_norm": 0.3526102304458618, + "learning_rate": 0.00010553035521647018, + "loss": 1.1941, + "step": 36358 + }, + { + "epoch": 0.4724686222372381, + "grad_norm": 0.3644999563694, + "learning_rate": 0.00010552775575455882, + "loss": 1.3664, + "step": 36359 + }, + { + "epoch": 0.472481616781154, + "grad_norm": 0.47779789566993713, + "learning_rate": 0.00010552515629264743, + "loss": 1.5838, + "step": 36360 + }, + { + "epoch": 0.47249461132506987, + "grad_norm": 0.3704841434955597, + "learning_rate": 0.00010552255683073604, + "loss": 1.4593, + "step": 36361 + }, + { + "epoch": 0.47250760586898577, + "grad_norm": 0.49029144644737244, + "learning_rate": 0.00010551995736882466, + "loss": 1.5015, + "step": 36362 + }, + { + "epoch": 0.4725206004129016, + "grad_norm": 0.35636621713638306, + "learning_rate": 0.00010551735790691328, + "loss": 1.4052, + "step": 36363 + }, + { + "epoch": 0.4725335949568175, + "grad_norm": 0.3917786180973053, + "learning_rate": 0.00010551475844500189, + "loss": 1.4715, + "step": 36364 + }, + { + "epoch": 0.47254658950073336, + "grad_norm": 0.37086358666419983, + "learning_rate": 0.0001055121589830905, + "loss": 1.2612, + "step": 36365 + }, + { + "epoch": 0.47255958404464926, + "grad_norm": 0.38486021757125854, + "learning_rate": 0.00010550955952117911, + "loss": 1.383, + "step": 36366 + }, + { + "epoch": 0.4725725785885651, + "grad_norm": 0.34516850113868713, + "learning_rate": 0.00010550696005926775, + "loss": 1.3181, + "step": 36367 + }, + { + "epoch": 0.472585573132481, + "grad_norm": 0.44308778643608093, + "learning_rate": 0.00010550436059735636, + "loss": 1.376, + "step": 36368 + }, + { + "epoch": 0.47259856767639685, + "grad_norm": 0.3153568208217621, + "learning_rate": 0.00010550176113544496, + "loss": 1.3916, + "step": 36369 + }, + { + "epoch": 0.47261156222031275, + "grad_norm": 0.43162310123443604, + "learning_rate": 0.00010549916167353357, + "loss": 1.482, + "step": 36370 + }, + { + "epoch": 0.4726245567642286, + "grad_norm": 0.3791247606277466, + "learning_rate": 0.00010549656221162221, + "loss": 1.7045, + "step": 36371 + }, + { + "epoch": 0.4726375513081445, + "grad_norm": 0.4568774104118347, + "learning_rate": 0.00010549396274971082, + "loss": 1.2982, + "step": 36372 + }, + { + "epoch": 0.47265054585206034, + "grad_norm": 0.40682920813560486, + "learning_rate": 0.00010549136328779943, + "loss": 1.3991, + "step": 36373 + }, + { + "epoch": 0.47266354039597624, + "grad_norm": 0.5829497575759888, + "learning_rate": 0.00010548876382588804, + "loss": 1.3291, + "step": 36374 + }, + { + "epoch": 0.4726765349398921, + "grad_norm": 0.39848124980926514, + "learning_rate": 0.00010548616436397667, + "loss": 1.3377, + "step": 36375 + }, + { + "epoch": 0.472689529483808, + "grad_norm": 0.5966970324516296, + "learning_rate": 0.00010548356490206528, + "loss": 1.294, + "step": 36376 + }, + { + "epoch": 0.47270252402772384, + "grad_norm": 0.3550399839878082, + "learning_rate": 0.00010548096544015389, + "loss": 1.4642, + "step": 36377 + }, + { + "epoch": 0.47271551857163974, + "grad_norm": 0.41060203313827515, + "learning_rate": 0.0001054783659782425, + "loss": 1.459, + "step": 36378 + }, + { + "epoch": 0.4727285131155556, + "grad_norm": 0.44106003642082214, + "learning_rate": 0.00010547576651633114, + "loss": 1.4886, + "step": 36379 + }, + { + "epoch": 0.4727415076594715, + "grad_norm": 0.4346536695957184, + "learning_rate": 0.00010547316705441975, + "loss": 1.4245, + "step": 36380 + }, + { + "epoch": 0.47275450220338733, + "grad_norm": 0.4286676347255707, + "learning_rate": 0.00010547056759250834, + "loss": 1.5067, + "step": 36381 + }, + { + "epoch": 0.47276749674730323, + "grad_norm": 0.45711028575897217, + "learning_rate": 0.00010546796813059696, + "loss": 1.4612, + "step": 36382 + }, + { + "epoch": 0.4727804912912191, + "grad_norm": 0.3808406889438629, + "learning_rate": 0.0001054653686686856, + "loss": 1.4254, + "step": 36383 + }, + { + "epoch": 0.472793485835135, + "grad_norm": 0.3095826804637909, + "learning_rate": 0.0001054627692067742, + "loss": 1.3425, + "step": 36384 + }, + { + "epoch": 0.4728064803790508, + "grad_norm": 0.4152957499027252, + "learning_rate": 0.00010546016974486282, + "loss": 1.4307, + "step": 36385 + }, + { + "epoch": 0.4728194749229667, + "grad_norm": 0.31122854351997375, + "learning_rate": 0.00010545757028295143, + "loss": 1.2689, + "step": 36386 + }, + { + "epoch": 0.47283246946688257, + "grad_norm": 0.3050546646118164, + "learning_rate": 0.00010545497082104005, + "loss": 1.2804, + "step": 36387 + }, + { + "epoch": 0.47284546401079847, + "grad_norm": 0.38738322257995605, + "learning_rate": 0.00010545237135912866, + "loss": 1.337, + "step": 36388 + }, + { + "epoch": 0.47285845855471437, + "grad_norm": 0.3867758512496948, + "learning_rate": 0.00010544977189721727, + "loss": 1.438, + "step": 36389 + }, + { + "epoch": 0.4728714530986302, + "grad_norm": 0.418655663728714, + "learning_rate": 0.00010544717243530591, + "loss": 1.4436, + "step": 36390 + }, + { + "epoch": 0.4728844476425461, + "grad_norm": 0.4404284656047821, + "learning_rate": 0.00010544457297339452, + "loss": 1.5482, + "step": 36391 + }, + { + "epoch": 0.47289744218646196, + "grad_norm": 0.4149889647960663, + "learning_rate": 0.00010544197351148313, + "loss": 1.4142, + "step": 36392 + }, + { + "epoch": 0.47291043673037786, + "grad_norm": 0.42605578899383545, + "learning_rate": 0.00010543937404957174, + "loss": 1.6962, + "step": 36393 + }, + { + "epoch": 0.4729234312742937, + "grad_norm": 0.3961978256702423, + "learning_rate": 0.00010543677458766037, + "loss": 1.4155, + "step": 36394 + }, + { + "epoch": 0.4729364258182096, + "grad_norm": 0.4792509078979492, + "learning_rate": 0.00010543417512574898, + "loss": 1.4575, + "step": 36395 + }, + { + "epoch": 0.47294942036212545, + "grad_norm": 0.3547539710998535, + "learning_rate": 0.00010543157566383759, + "loss": 1.4044, + "step": 36396 + }, + { + "epoch": 0.47296241490604135, + "grad_norm": 0.3621586561203003, + "learning_rate": 0.0001054289762019262, + "loss": 1.1845, + "step": 36397 + }, + { + "epoch": 0.4729754094499572, + "grad_norm": 0.46153756976127625, + "learning_rate": 0.00010542637674001483, + "loss": 1.4107, + "step": 36398 + }, + { + "epoch": 0.4729884039938731, + "grad_norm": 0.3382875621318817, + "learning_rate": 0.00010542377727810344, + "loss": 1.3004, + "step": 36399 + }, + { + "epoch": 0.47300139853778894, + "grad_norm": 0.5748820900917053, + "learning_rate": 0.00010542117781619205, + "loss": 1.5482, + "step": 36400 + }, + { + "epoch": 0.47301439308170484, + "grad_norm": 0.4077175557613373, + "learning_rate": 0.00010541857835428066, + "loss": 1.2882, + "step": 36401 + }, + { + "epoch": 0.4730273876256207, + "grad_norm": 0.45156893134117126, + "learning_rate": 0.0001054159788923693, + "loss": 1.2087, + "step": 36402 + }, + { + "epoch": 0.4730403821695366, + "grad_norm": 0.43064963817596436, + "learning_rate": 0.00010541337943045791, + "loss": 1.5427, + "step": 36403 + }, + { + "epoch": 0.47305337671345243, + "grad_norm": 0.3077194094657898, + "learning_rate": 0.00010541077996854652, + "loss": 1.21, + "step": 36404 + }, + { + "epoch": 0.47306637125736833, + "grad_norm": 0.46736305952072144, + "learning_rate": 0.00010540818050663513, + "loss": 1.3801, + "step": 36405 + }, + { + "epoch": 0.4730793658012842, + "grad_norm": 0.4443642497062683, + "learning_rate": 0.00010540558104472375, + "loss": 1.5445, + "step": 36406 + }, + { + "epoch": 0.4730923603452001, + "grad_norm": 0.3929472863674164, + "learning_rate": 0.00010540298158281236, + "loss": 1.2371, + "step": 36407 + }, + { + "epoch": 0.4731053548891159, + "grad_norm": 0.41135480999946594, + "learning_rate": 0.00010540038212090098, + "loss": 1.2134, + "step": 36408 + }, + { + "epoch": 0.4731183494330318, + "grad_norm": 0.3893527388572693, + "learning_rate": 0.00010539778265898959, + "loss": 1.1874, + "step": 36409 + }, + { + "epoch": 0.47313134397694767, + "grad_norm": 0.4100229740142822, + "learning_rate": 0.00010539518319707822, + "loss": 1.3522, + "step": 36410 + }, + { + "epoch": 0.47314433852086357, + "grad_norm": 0.5223816633224487, + "learning_rate": 0.00010539258373516682, + "loss": 1.3644, + "step": 36411 + }, + { + "epoch": 0.4731573330647794, + "grad_norm": 0.5095726847648621, + "learning_rate": 0.00010538998427325543, + "loss": 1.4357, + "step": 36412 + }, + { + "epoch": 0.4731703276086953, + "grad_norm": 0.29291126132011414, + "learning_rate": 0.00010538738481134404, + "loss": 1.1927, + "step": 36413 + }, + { + "epoch": 0.47318332215261116, + "grad_norm": 0.49224966764450073, + "learning_rate": 0.00010538478534943268, + "loss": 1.4498, + "step": 36414 + }, + { + "epoch": 0.47319631669652706, + "grad_norm": 0.3949586749076843, + "learning_rate": 0.00010538218588752129, + "loss": 1.3013, + "step": 36415 + }, + { + "epoch": 0.4732093112404429, + "grad_norm": 0.5705249309539795, + "learning_rate": 0.0001053795864256099, + "loss": 1.2863, + "step": 36416 + }, + { + "epoch": 0.4732223057843588, + "grad_norm": 0.6028515100479126, + "learning_rate": 0.00010537698696369851, + "loss": 1.4612, + "step": 36417 + }, + { + "epoch": 0.47323530032827466, + "grad_norm": 0.5409745573997498, + "learning_rate": 0.00010537438750178714, + "loss": 1.2278, + "step": 36418 + }, + { + "epoch": 0.47324829487219056, + "grad_norm": 0.4503304958343506, + "learning_rate": 0.00010537178803987575, + "loss": 1.661, + "step": 36419 + }, + { + "epoch": 0.4732612894161064, + "grad_norm": 0.47031888365745544, + "learning_rate": 0.00010536918857796436, + "loss": 1.2354, + "step": 36420 + }, + { + "epoch": 0.4732742839600223, + "grad_norm": 0.33157649636268616, + "learning_rate": 0.00010536658911605297, + "loss": 1.2154, + "step": 36421 + }, + { + "epoch": 0.47328727850393815, + "grad_norm": 0.3777521848678589, + "learning_rate": 0.00010536398965414161, + "loss": 1.4627, + "step": 36422 + }, + { + "epoch": 0.47330027304785405, + "grad_norm": 0.3475722372531891, + "learning_rate": 0.00010536139019223021, + "loss": 1.4711, + "step": 36423 + }, + { + "epoch": 0.4733132675917699, + "grad_norm": 0.4043409824371338, + "learning_rate": 0.00010535879073031882, + "loss": 1.3512, + "step": 36424 + }, + { + "epoch": 0.4733262621356858, + "grad_norm": 0.46424999833106995, + "learning_rate": 0.00010535619126840743, + "loss": 1.5604, + "step": 36425 + }, + { + "epoch": 0.47333925667960164, + "grad_norm": 0.29085108637809753, + "learning_rate": 0.00010535359180649607, + "loss": 1.4514, + "step": 36426 + }, + { + "epoch": 0.47335225122351754, + "grad_norm": 0.35116955637931824, + "learning_rate": 0.00010535099234458468, + "loss": 1.4099, + "step": 36427 + }, + { + "epoch": 0.4733652457674334, + "grad_norm": 0.35608649253845215, + "learning_rate": 0.00010534839288267329, + "loss": 1.2669, + "step": 36428 + }, + { + "epoch": 0.4733782403113493, + "grad_norm": 0.4304165244102478, + "learning_rate": 0.00010534579342076191, + "loss": 1.394, + "step": 36429 + }, + { + "epoch": 0.47339123485526513, + "grad_norm": 0.5003201961517334, + "learning_rate": 0.00010534319395885052, + "loss": 1.4146, + "step": 36430 + }, + { + "epoch": 0.47340422939918103, + "grad_norm": 0.373423308134079, + "learning_rate": 0.00010534059449693914, + "loss": 1.2369, + "step": 36431 + }, + { + "epoch": 0.4734172239430969, + "grad_norm": 0.40230199694633484, + "learning_rate": 0.00010533799503502775, + "loss": 1.6951, + "step": 36432 + }, + { + "epoch": 0.4734302184870128, + "grad_norm": 0.41639116406440735, + "learning_rate": 0.00010533539557311638, + "loss": 1.5146, + "step": 36433 + }, + { + "epoch": 0.4734432130309286, + "grad_norm": 0.33416110277175903, + "learning_rate": 0.000105332796111205, + "loss": 1.2722, + "step": 36434 + }, + { + "epoch": 0.4734562075748445, + "grad_norm": 0.36078763008117676, + "learning_rate": 0.0001053301966492936, + "loss": 1.5051, + "step": 36435 + }, + { + "epoch": 0.47346920211876037, + "grad_norm": 0.3371979594230652, + "learning_rate": 0.0001053275971873822, + "loss": 1.3258, + "step": 36436 + }, + { + "epoch": 0.47348219666267627, + "grad_norm": 0.3522584140300751, + "learning_rate": 0.00010532499772547084, + "loss": 1.4431, + "step": 36437 + }, + { + "epoch": 0.4734951912065921, + "grad_norm": 0.44137823581695557, + "learning_rate": 0.00010532239826355945, + "loss": 1.3905, + "step": 36438 + }, + { + "epoch": 0.473508185750508, + "grad_norm": 0.409078985452652, + "learning_rate": 0.00010531979880164806, + "loss": 1.3691, + "step": 36439 + }, + { + "epoch": 0.47352118029442386, + "grad_norm": 0.4190751314163208, + "learning_rate": 0.00010531719933973667, + "loss": 1.2521, + "step": 36440 + }, + { + "epoch": 0.47353417483833976, + "grad_norm": 0.5178038477897644, + "learning_rate": 0.0001053145998778253, + "loss": 1.4101, + "step": 36441 + }, + { + "epoch": 0.4735471693822556, + "grad_norm": 0.37695392966270447, + "learning_rate": 0.00010531200041591391, + "loss": 1.7097, + "step": 36442 + }, + { + "epoch": 0.4735601639261715, + "grad_norm": 0.40205806493759155, + "learning_rate": 0.00010530940095400252, + "loss": 1.3161, + "step": 36443 + }, + { + "epoch": 0.47357315847008735, + "grad_norm": 0.39444082975387573, + "learning_rate": 0.00010530680149209113, + "loss": 1.1433, + "step": 36444 + }, + { + "epoch": 0.47358615301400325, + "grad_norm": 0.5432342290878296, + "learning_rate": 0.00010530420203017977, + "loss": 1.5462, + "step": 36445 + }, + { + "epoch": 0.4735991475579191, + "grad_norm": 0.40053340792655945, + "learning_rate": 0.00010530160256826838, + "loss": 1.2002, + "step": 36446 + }, + { + "epoch": 0.473612142101835, + "grad_norm": 0.40720003843307495, + "learning_rate": 0.00010529900310635699, + "loss": 1.3962, + "step": 36447 + }, + { + "epoch": 0.47362513664575084, + "grad_norm": 0.4189891517162323, + "learning_rate": 0.0001052964036444456, + "loss": 1.4713, + "step": 36448 + }, + { + "epoch": 0.47363813118966674, + "grad_norm": 0.3476797342300415, + "learning_rate": 0.00010529380418253423, + "loss": 1.3817, + "step": 36449 + }, + { + "epoch": 0.4736511257335826, + "grad_norm": 0.44074803590774536, + "learning_rate": 0.00010529120472062284, + "loss": 1.2675, + "step": 36450 + }, + { + "epoch": 0.4736641202774985, + "grad_norm": 0.34805727005004883, + "learning_rate": 0.00010528860525871145, + "loss": 1.2348, + "step": 36451 + }, + { + "epoch": 0.47367711482141434, + "grad_norm": 0.40573394298553467, + "learning_rate": 0.00010528600579680006, + "loss": 1.3978, + "step": 36452 + }, + { + "epoch": 0.47369010936533024, + "grad_norm": 0.41177698969841003, + "learning_rate": 0.00010528340633488868, + "loss": 1.2685, + "step": 36453 + }, + { + "epoch": 0.4737031039092461, + "grad_norm": 0.41897687315940857, + "learning_rate": 0.0001052808068729773, + "loss": 1.3308, + "step": 36454 + }, + { + "epoch": 0.473716098453162, + "grad_norm": 0.33357471227645874, + "learning_rate": 0.0001052782074110659, + "loss": 1.3519, + "step": 36455 + }, + { + "epoch": 0.4737290929970778, + "grad_norm": 0.4196988344192505, + "learning_rate": 0.00010527560794915452, + "loss": 1.4598, + "step": 36456 + }, + { + "epoch": 0.47374208754099373, + "grad_norm": 0.4668128788471222, + "learning_rate": 0.00010527300848724316, + "loss": 1.5618, + "step": 36457 + }, + { + "epoch": 0.4737550820849096, + "grad_norm": 0.40311822295188904, + "learning_rate": 0.00010527040902533177, + "loss": 1.4825, + "step": 36458 + }, + { + "epoch": 0.4737680766288255, + "grad_norm": 0.3562118411064148, + "learning_rate": 0.00010526780956342038, + "loss": 1.2388, + "step": 36459 + }, + { + "epoch": 0.4737810711727413, + "grad_norm": 0.43443742394447327, + "learning_rate": 0.00010526521010150899, + "loss": 1.4315, + "step": 36460 + }, + { + "epoch": 0.4737940657166572, + "grad_norm": 0.36753880977630615, + "learning_rate": 0.00010526261063959761, + "loss": 1.2069, + "step": 36461 + }, + { + "epoch": 0.47380706026057307, + "grad_norm": 0.38647881150245667, + "learning_rate": 0.00010526001117768622, + "loss": 1.2645, + "step": 36462 + }, + { + "epoch": 0.47382005480448897, + "grad_norm": 0.4401029050350189, + "learning_rate": 0.00010525741171577483, + "loss": 1.4452, + "step": 36463 + }, + { + "epoch": 0.4738330493484048, + "grad_norm": 0.41859200596809387, + "learning_rate": 0.00010525481225386347, + "loss": 1.2808, + "step": 36464 + }, + { + "epoch": 0.4738460438923207, + "grad_norm": 0.39292094111442566, + "learning_rate": 0.00010525221279195207, + "loss": 1.5636, + "step": 36465 + }, + { + "epoch": 0.4738590384362366, + "grad_norm": 0.38070419430732727, + "learning_rate": 0.00010524961333004068, + "loss": 1.5081, + "step": 36466 + }, + { + "epoch": 0.47387203298015246, + "grad_norm": 0.2946373522281647, + "learning_rate": 0.00010524701386812929, + "loss": 1.2769, + "step": 36467 + }, + { + "epoch": 0.47388502752406836, + "grad_norm": 0.5027557015419006, + "learning_rate": 0.00010524441440621793, + "loss": 1.5535, + "step": 36468 + }, + { + "epoch": 0.4738980220679842, + "grad_norm": 0.42623329162597656, + "learning_rate": 0.00010524181494430654, + "loss": 1.3172, + "step": 36469 + }, + { + "epoch": 0.4739110166119001, + "grad_norm": 0.3594422936439514, + "learning_rate": 0.00010523921548239515, + "loss": 1.5453, + "step": 36470 + }, + { + "epoch": 0.47392401115581595, + "grad_norm": 0.40100613236427307, + "learning_rate": 0.00010523661602048376, + "loss": 1.274, + "step": 36471 + }, + { + "epoch": 0.47393700569973185, + "grad_norm": 0.4887884557247162, + "learning_rate": 0.00010523401655857239, + "loss": 1.4617, + "step": 36472 + }, + { + "epoch": 0.4739500002436477, + "grad_norm": 0.3954545259475708, + "learning_rate": 0.000105231417096661, + "loss": 1.3242, + "step": 36473 + }, + { + "epoch": 0.4739629947875636, + "grad_norm": 0.4259144961833954, + "learning_rate": 0.00010522881763474961, + "loss": 1.2899, + "step": 36474 + }, + { + "epoch": 0.47397598933147944, + "grad_norm": 0.3954712152481079, + "learning_rate": 0.00010522621817283822, + "loss": 1.4941, + "step": 36475 + }, + { + "epoch": 0.47398898387539534, + "grad_norm": 0.2992852032184601, + "learning_rate": 0.00010522361871092686, + "loss": 1.2807, + "step": 36476 + }, + { + "epoch": 0.4740019784193112, + "grad_norm": 0.41336286067962646, + "learning_rate": 0.00010522101924901547, + "loss": 1.5216, + "step": 36477 + }, + { + "epoch": 0.4740149729632271, + "grad_norm": 0.4363195598125458, + "learning_rate": 0.00010521841978710407, + "loss": 1.2006, + "step": 36478 + }, + { + "epoch": 0.47402796750714293, + "grad_norm": 0.4402675926685333, + "learning_rate": 0.00010521582032519268, + "loss": 1.3084, + "step": 36479 + }, + { + "epoch": 0.47404096205105883, + "grad_norm": 0.40029609203338623, + "learning_rate": 0.00010521322086328131, + "loss": 1.2702, + "step": 36480 + }, + { + "epoch": 0.4740539565949747, + "grad_norm": 0.4639246463775635, + "learning_rate": 0.00010521062140136993, + "loss": 1.3475, + "step": 36481 + }, + { + "epoch": 0.4740669511388906, + "grad_norm": 0.38363412022590637, + "learning_rate": 0.00010520802193945854, + "loss": 1.4488, + "step": 36482 + }, + { + "epoch": 0.4740799456828064, + "grad_norm": 0.37412697076797485, + "learning_rate": 0.00010520542247754715, + "loss": 1.4547, + "step": 36483 + }, + { + "epoch": 0.4740929402267223, + "grad_norm": 0.3534381091594696, + "learning_rate": 0.00010520282301563577, + "loss": 1.27, + "step": 36484 + }, + { + "epoch": 0.47410593477063817, + "grad_norm": 0.2850721478462219, + "learning_rate": 0.00010520022355372438, + "loss": 1.0935, + "step": 36485 + }, + { + "epoch": 0.47411892931455407, + "grad_norm": 0.31452807784080505, + "learning_rate": 0.000105197624091813, + "loss": 1.4362, + "step": 36486 + }, + { + "epoch": 0.4741319238584699, + "grad_norm": 0.4571937918663025, + "learning_rate": 0.0001051950246299016, + "loss": 1.4426, + "step": 36487 + }, + { + "epoch": 0.4741449184023858, + "grad_norm": 0.35101813077926636, + "learning_rate": 0.00010519242516799024, + "loss": 1.3846, + "step": 36488 + }, + { + "epoch": 0.47415791294630166, + "grad_norm": 0.3200477957725525, + "learning_rate": 0.00010518982570607885, + "loss": 1.3804, + "step": 36489 + }, + { + "epoch": 0.47417090749021756, + "grad_norm": 0.5071653127670288, + "learning_rate": 0.00010518722624416746, + "loss": 1.3948, + "step": 36490 + }, + { + "epoch": 0.4741839020341334, + "grad_norm": 0.40244919061660767, + "learning_rate": 0.00010518462678225606, + "loss": 1.3707, + "step": 36491 + }, + { + "epoch": 0.4741968965780493, + "grad_norm": 0.41514334082603455, + "learning_rate": 0.0001051820273203447, + "loss": 1.5459, + "step": 36492 + }, + { + "epoch": 0.47420989112196515, + "grad_norm": 0.48066726326942444, + "learning_rate": 0.00010517942785843331, + "loss": 1.4711, + "step": 36493 + }, + { + "epoch": 0.47422288566588106, + "grad_norm": 0.30234599113464355, + "learning_rate": 0.00010517682839652192, + "loss": 1.3041, + "step": 36494 + }, + { + "epoch": 0.4742358802097969, + "grad_norm": 0.3929380774497986, + "learning_rate": 0.00010517422893461053, + "loss": 1.5004, + "step": 36495 + }, + { + "epoch": 0.4742488747537128, + "grad_norm": 0.43211257457733154, + "learning_rate": 0.00010517162947269916, + "loss": 1.4852, + "step": 36496 + }, + { + "epoch": 0.47426186929762865, + "grad_norm": 0.3946191966533661, + "learning_rate": 0.00010516903001078777, + "loss": 1.4782, + "step": 36497 + }, + { + "epoch": 0.47427486384154455, + "grad_norm": 0.442671000957489, + "learning_rate": 0.00010516643054887638, + "loss": 1.3607, + "step": 36498 + }, + { + "epoch": 0.4742878583854604, + "grad_norm": 0.3253670036792755, + "learning_rate": 0.00010516383108696499, + "loss": 1.4483, + "step": 36499 + }, + { + "epoch": 0.4743008529293763, + "grad_norm": 0.4677087068557739, + "learning_rate": 0.00010516123162505363, + "loss": 1.3958, + "step": 36500 + }, + { + "epoch": 0.47431384747329214, + "grad_norm": 0.33318865299224854, + "learning_rate": 0.00010515863216314224, + "loss": 1.3909, + "step": 36501 + }, + { + "epoch": 0.47432684201720804, + "grad_norm": 0.23662245273590088, + "learning_rate": 0.00010515603270123085, + "loss": 1.0205, + "step": 36502 + }, + { + "epoch": 0.4743398365611239, + "grad_norm": 0.30335503816604614, + "learning_rate": 0.00010515343323931947, + "loss": 1.0193, + "step": 36503 + }, + { + "epoch": 0.4743528311050398, + "grad_norm": 0.39821380376815796, + "learning_rate": 0.00010515083377740809, + "loss": 1.2421, + "step": 36504 + }, + { + "epoch": 0.47436582564895563, + "grad_norm": 0.4266816973686218, + "learning_rate": 0.0001051482343154967, + "loss": 1.2585, + "step": 36505 + }, + { + "epoch": 0.47437882019287153, + "grad_norm": 0.44197338819503784, + "learning_rate": 0.00010514563485358531, + "loss": 1.3504, + "step": 36506 + }, + { + "epoch": 0.4743918147367874, + "grad_norm": 0.3546936511993408, + "learning_rate": 0.00010514303539167393, + "loss": 1.2846, + "step": 36507 + }, + { + "epoch": 0.4744048092807033, + "grad_norm": 0.4064730107784271, + "learning_rate": 0.00010514043592976254, + "loss": 1.3243, + "step": 36508 + }, + { + "epoch": 0.4744178038246191, + "grad_norm": 0.42016807198524475, + "learning_rate": 0.00010513783646785115, + "loss": 1.3705, + "step": 36509 + }, + { + "epoch": 0.474430798368535, + "grad_norm": 0.4166208505630493, + "learning_rate": 0.00010513523700593976, + "loss": 1.3553, + "step": 36510 + }, + { + "epoch": 0.47444379291245087, + "grad_norm": 0.3893529772758484, + "learning_rate": 0.0001051326375440284, + "loss": 1.4648, + "step": 36511 + }, + { + "epoch": 0.47445678745636677, + "grad_norm": 0.4707178473472595, + "learning_rate": 0.00010513003808211701, + "loss": 1.3906, + "step": 36512 + }, + { + "epoch": 0.4744697820002826, + "grad_norm": 0.3569815158843994, + "learning_rate": 0.00010512743862020562, + "loss": 1.2468, + "step": 36513 + }, + { + "epoch": 0.4744827765441985, + "grad_norm": 0.5508162379264832, + "learning_rate": 0.00010512483915829424, + "loss": 1.5861, + "step": 36514 + }, + { + "epoch": 0.47449577108811436, + "grad_norm": 0.47720858454704285, + "learning_rate": 0.00010512223969638286, + "loss": 1.4275, + "step": 36515 + }, + { + "epoch": 0.47450876563203026, + "grad_norm": 0.492017924785614, + "learning_rate": 0.00010511964023447147, + "loss": 1.4554, + "step": 36516 + }, + { + "epoch": 0.4745217601759461, + "grad_norm": 0.3388065695762634, + "learning_rate": 0.00010511704077256008, + "loss": 1.5623, + "step": 36517 + }, + { + "epoch": 0.474534754719862, + "grad_norm": 0.490404337644577, + "learning_rate": 0.00010511444131064869, + "loss": 1.3821, + "step": 36518 + }, + { + "epoch": 0.47454774926377785, + "grad_norm": 0.452975869178772, + "learning_rate": 0.00010511184184873733, + "loss": 1.454, + "step": 36519 + }, + { + "epoch": 0.47456074380769375, + "grad_norm": 0.43254348635673523, + "learning_rate": 0.00010510924238682593, + "loss": 1.3016, + "step": 36520 + }, + { + "epoch": 0.4745737383516096, + "grad_norm": 0.4077914357185364, + "learning_rate": 0.00010510664292491454, + "loss": 1.3422, + "step": 36521 + }, + { + "epoch": 0.4745867328955255, + "grad_norm": 0.49535638093948364, + "learning_rate": 0.00010510404346300315, + "loss": 1.3262, + "step": 36522 + }, + { + "epoch": 0.47459972743944134, + "grad_norm": 0.49975574016571045, + "learning_rate": 0.00010510144400109179, + "loss": 1.5343, + "step": 36523 + }, + { + "epoch": 0.47461272198335724, + "grad_norm": 0.31914758682250977, + "learning_rate": 0.0001050988445391804, + "loss": 1.2919, + "step": 36524 + }, + { + "epoch": 0.4746257165272731, + "grad_norm": 0.38303831219673157, + "learning_rate": 0.00010509624507726901, + "loss": 1.4184, + "step": 36525 + }, + { + "epoch": 0.474638711071189, + "grad_norm": 0.38439470529556274, + "learning_rate": 0.00010509364561535762, + "loss": 1.5687, + "step": 36526 + }, + { + "epoch": 0.47465170561510484, + "grad_norm": 0.3509228527545929, + "learning_rate": 0.00010509104615344625, + "loss": 1.4026, + "step": 36527 + }, + { + "epoch": 0.47466470015902074, + "grad_norm": 0.3775359094142914, + "learning_rate": 0.00010508844669153486, + "loss": 1.375, + "step": 36528 + }, + { + "epoch": 0.4746776947029366, + "grad_norm": 0.4158408045768738, + "learning_rate": 0.00010508584722962347, + "loss": 1.4074, + "step": 36529 + }, + { + "epoch": 0.4746906892468525, + "grad_norm": 0.3964065611362457, + "learning_rate": 0.00010508324776771208, + "loss": 1.3681, + "step": 36530 + }, + { + "epoch": 0.4747036837907683, + "grad_norm": 0.3993860185146332, + "learning_rate": 0.00010508064830580072, + "loss": 1.2859, + "step": 36531 + }, + { + "epoch": 0.47471667833468423, + "grad_norm": 0.4389224648475647, + "learning_rate": 0.00010507804884388933, + "loss": 1.4054, + "step": 36532 + }, + { + "epoch": 0.4747296728786001, + "grad_norm": 0.37321236729621887, + "learning_rate": 0.00010507544938197792, + "loss": 1.5025, + "step": 36533 + }, + { + "epoch": 0.474742667422516, + "grad_norm": 0.49972763657569885, + "learning_rate": 0.00010507284992006654, + "loss": 1.409, + "step": 36534 + }, + { + "epoch": 0.4747556619664318, + "grad_norm": 0.43644779920578003, + "learning_rate": 0.00010507025045815517, + "loss": 1.4305, + "step": 36535 + }, + { + "epoch": 0.4747686565103477, + "grad_norm": 0.3297608196735382, + "learning_rate": 0.00010506765099624378, + "loss": 1.3592, + "step": 36536 + }, + { + "epoch": 0.47478165105426356, + "grad_norm": 0.4776918292045593, + "learning_rate": 0.0001050650515343324, + "loss": 1.4418, + "step": 36537 + }, + { + "epoch": 0.47479464559817947, + "grad_norm": 0.4637238681316376, + "learning_rate": 0.00010506245207242102, + "loss": 1.3202, + "step": 36538 + }, + { + "epoch": 0.4748076401420953, + "grad_norm": 0.35447439551353455, + "learning_rate": 0.00010505985261050963, + "loss": 1.5826, + "step": 36539 + }, + { + "epoch": 0.4748206346860112, + "grad_norm": 0.5458242297172546, + "learning_rate": 0.00010505725314859824, + "loss": 1.548, + "step": 36540 + }, + { + "epoch": 0.47483362922992706, + "grad_norm": 0.5168476104736328, + "learning_rate": 0.00010505465368668685, + "loss": 1.4047, + "step": 36541 + }, + { + "epoch": 0.47484662377384296, + "grad_norm": 0.4175431728363037, + "learning_rate": 0.00010505205422477549, + "loss": 1.5352, + "step": 36542 + }, + { + "epoch": 0.47485961831775886, + "grad_norm": 0.45470473170280457, + "learning_rate": 0.0001050494547628641, + "loss": 1.4061, + "step": 36543 + }, + { + "epoch": 0.4748726128616747, + "grad_norm": 0.43922320008277893, + "learning_rate": 0.00010504685530095271, + "loss": 1.2992, + "step": 36544 + }, + { + "epoch": 0.4748856074055906, + "grad_norm": 0.3885059356689453, + "learning_rate": 0.00010504425583904131, + "loss": 1.242, + "step": 36545 + }, + { + "epoch": 0.47489860194950645, + "grad_norm": 0.2863497734069824, + "learning_rate": 0.00010504165637712995, + "loss": 1.1572, + "step": 36546 + }, + { + "epoch": 0.47491159649342235, + "grad_norm": 0.39101648330688477, + "learning_rate": 0.00010503905691521856, + "loss": 1.3434, + "step": 36547 + }, + { + "epoch": 0.4749245910373382, + "grad_norm": 0.41483575105667114, + "learning_rate": 0.00010503645745330717, + "loss": 1.5211, + "step": 36548 + }, + { + "epoch": 0.4749375855812541, + "grad_norm": 0.4933433532714844, + "learning_rate": 0.00010503385799139578, + "loss": 1.3986, + "step": 36549 + }, + { + "epoch": 0.47495058012516994, + "grad_norm": 0.37598884105682373, + "learning_rate": 0.0001050312585294844, + "loss": 1.3217, + "step": 36550 + }, + { + "epoch": 0.47496357466908584, + "grad_norm": 0.3773234188556671, + "learning_rate": 0.00010502865906757302, + "loss": 1.328, + "step": 36551 + }, + { + "epoch": 0.4749765692130017, + "grad_norm": 0.3681739866733551, + "learning_rate": 0.00010502605960566163, + "loss": 1.2298, + "step": 36552 + }, + { + "epoch": 0.4749895637569176, + "grad_norm": 0.48149263858795166, + "learning_rate": 0.00010502346014375024, + "loss": 1.548, + "step": 36553 + }, + { + "epoch": 0.47500255830083343, + "grad_norm": 0.4425208568572998, + "learning_rate": 0.00010502086068183888, + "loss": 1.4054, + "step": 36554 + }, + { + "epoch": 0.47501555284474933, + "grad_norm": 0.4074486792087555, + "learning_rate": 0.00010501826121992749, + "loss": 1.4345, + "step": 36555 + }, + { + "epoch": 0.4750285473886652, + "grad_norm": 0.3843323588371277, + "learning_rate": 0.0001050156617580161, + "loss": 1.286, + "step": 36556 + }, + { + "epoch": 0.4750415419325811, + "grad_norm": 0.22954733669757843, + "learning_rate": 0.00010501306229610471, + "loss": 1.0923, + "step": 36557 + }, + { + "epoch": 0.4750545364764969, + "grad_norm": 0.31725403666496277, + "learning_rate": 0.00010501046283419333, + "loss": 1.1898, + "step": 36558 + }, + { + "epoch": 0.4750675310204128, + "grad_norm": 0.41434192657470703, + "learning_rate": 0.00010500786337228194, + "loss": 1.2978, + "step": 36559 + }, + { + "epoch": 0.47508052556432867, + "grad_norm": 0.4388503432273865, + "learning_rate": 0.00010500526391037056, + "loss": 1.412, + "step": 36560 + }, + { + "epoch": 0.47509352010824457, + "grad_norm": 0.34441328048706055, + "learning_rate": 0.00010500266444845917, + "loss": 1.6324, + "step": 36561 + }, + { + "epoch": 0.4751065146521604, + "grad_norm": 0.32147669792175293, + "learning_rate": 0.00010500006498654779, + "loss": 1.1815, + "step": 36562 + }, + { + "epoch": 0.4751195091960763, + "grad_norm": 0.6119058132171631, + "learning_rate": 0.0001049974655246364, + "loss": 1.4997, + "step": 36563 + }, + { + "epoch": 0.47513250373999216, + "grad_norm": 0.28984853625297546, + "learning_rate": 0.00010499486606272501, + "loss": 1.1602, + "step": 36564 + }, + { + "epoch": 0.47514549828390806, + "grad_norm": 0.39859700202941895, + "learning_rate": 0.00010499226660081362, + "loss": 1.5069, + "step": 36565 + }, + { + "epoch": 0.4751584928278239, + "grad_norm": 0.3677665591239929, + "learning_rate": 0.00010498966713890226, + "loss": 1.4703, + "step": 36566 + }, + { + "epoch": 0.4751714873717398, + "grad_norm": 0.32948583364486694, + "learning_rate": 0.00010498706767699087, + "loss": 1.185, + "step": 36567 + }, + { + "epoch": 0.47518448191565565, + "grad_norm": 0.34933292865753174, + "learning_rate": 0.00010498446821507948, + "loss": 1.1793, + "step": 36568 + }, + { + "epoch": 0.47519747645957156, + "grad_norm": 0.33874472975730896, + "learning_rate": 0.0001049818687531681, + "loss": 1.1813, + "step": 36569 + }, + { + "epoch": 0.4752104710034874, + "grad_norm": 0.3756799101829529, + "learning_rate": 0.00010497926929125672, + "loss": 1.5125, + "step": 36570 + }, + { + "epoch": 0.4752234655474033, + "grad_norm": 0.30656367540359497, + "learning_rate": 0.00010497666982934533, + "loss": 1.1415, + "step": 36571 + }, + { + "epoch": 0.47523646009131915, + "grad_norm": 0.3955113887786865, + "learning_rate": 0.00010497407036743394, + "loss": 1.4206, + "step": 36572 + }, + { + "epoch": 0.47524945463523505, + "grad_norm": 0.45511651039123535, + "learning_rate": 0.00010497147090552255, + "loss": 1.5368, + "step": 36573 + }, + { + "epoch": 0.4752624491791509, + "grad_norm": 0.43069085478782654, + "learning_rate": 0.00010496887144361119, + "loss": 1.2757, + "step": 36574 + }, + { + "epoch": 0.4752754437230668, + "grad_norm": 0.35471856594085693, + "learning_rate": 0.00010496627198169979, + "loss": 1.3408, + "step": 36575 + }, + { + "epoch": 0.47528843826698264, + "grad_norm": 0.3758014142513275, + "learning_rate": 0.0001049636725197884, + "loss": 1.2191, + "step": 36576 + }, + { + "epoch": 0.47530143281089854, + "grad_norm": 0.46816834807395935, + "learning_rate": 0.00010496107305787704, + "loss": 1.366, + "step": 36577 + }, + { + "epoch": 0.4753144273548144, + "grad_norm": 0.305191308259964, + "learning_rate": 0.00010495847359596565, + "loss": 1.4379, + "step": 36578 + }, + { + "epoch": 0.4753274218987303, + "grad_norm": 0.46390971541404724, + "learning_rate": 0.00010495587413405426, + "loss": 1.3382, + "step": 36579 + }, + { + "epoch": 0.47534041644264613, + "grad_norm": 0.4372372031211853, + "learning_rate": 0.00010495327467214287, + "loss": 1.413, + "step": 36580 + }, + { + "epoch": 0.47535341098656203, + "grad_norm": 0.4454105496406555, + "learning_rate": 0.0001049506752102315, + "loss": 1.3159, + "step": 36581 + }, + { + "epoch": 0.4753664055304779, + "grad_norm": 0.49519380927085876, + "learning_rate": 0.0001049480757483201, + "loss": 1.4488, + "step": 36582 + }, + { + "epoch": 0.4753794000743938, + "grad_norm": 0.41635656356811523, + "learning_rate": 0.00010494547628640872, + "loss": 1.3412, + "step": 36583 + }, + { + "epoch": 0.4753923946183096, + "grad_norm": 0.392839252948761, + "learning_rate": 0.00010494287682449733, + "loss": 1.2337, + "step": 36584 + }, + { + "epoch": 0.4754053891622255, + "grad_norm": 0.44731637835502625, + "learning_rate": 0.00010494027736258596, + "loss": 1.5325, + "step": 36585 + }, + { + "epoch": 0.47541838370614137, + "grad_norm": 0.39682236313819885, + "learning_rate": 0.00010493767790067458, + "loss": 1.494, + "step": 36586 + }, + { + "epoch": 0.47543137825005727, + "grad_norm": 0.399547278881073, + "learning_rate": 0.00010493507843876317, + "loss": 1.486, + "step": 36587 + }, + { + "epoch": 0.4754443727939731, + "grad_norm": 0.3670264482498169, + "learning_rate": 0.00010493247897685178, + "loss": 1.5844, + "step": 36588 + }, + { + "epoch": 0.475457367337889, + "grad_norm": 0.4040701389312744, + "learning_rate": 0.00010492987951494042, + "loss": 1.5111, + "step": 36589 + }, + { + "epoch": 0.47547036188180486, + "grad_norm": 0.47769248485565186, + "learning_rate": 0.00010492728005302903, + "loss": 1.4891, + "step": 36590 + }, + { + "epoch": 0.47548335642572076, + "grad_norm": 0.42718306183815, + "learning_rate": 0.00010492468059111764, + "loss": 1.5126, + "step": 36591 + }, + { + "epoch": 0.4754963509696366, + "grad_norm": 0.3339137136936188, + "learning_rate": 0.00010492208112920625, + "loss": 1.2913, + "step": 36592 + }, + { + "epoch": 0.4755093455135525, + "grad_norm": 0.35551586747169495, + "learning_rate": 0.00010491948166729488, + "loss": 1.4391, + "step": 36593 + }, + { + "epoch": 0.47552234005746835, + "grad_norm": 0.542618453502655, + "learning_rate": 0.00010491688220538349, + "loss": 1.349, + "step": 36594 + }, + { + "epoch": 0.47553533460138425, + "grad_norm": 0.3384997546672821, + "learning_rate": 0.0001049142827434721, + "loss": 1.2489, + "step": 36595 + }, + { + "epoch": 0.4755483291453001, + "grad_norm": 0.3902345895767212, + "learning_rate": 0.00010491168328156071, + "loss": 1.3553, + "step": 36596 + }, + { + "epoch": 0.475561323689216, + "grad_norm": 0.46782544255256653, + "learning_rate": 0.00010490908381964935, + "loss": 1.5085, + "step": 36597 + }, + { + "epoch": 0.47557431823313184, + "grad_norm": 0.33022165298461914, + "learning_rate": 0.00010490648435773796, + "loss": 1.2355, + "step": 36598 + }, + { + "epoch": 0.47558731277704774, + "grad_norm": 0.46046265959739685, + "learning_rate": 0.00010490388489582657, + "loss": 1.5841, + "step": 36599 + }, + { + "epoch": 0.4756003073209636, + "grad_norm": 0.3958882987499237, + "learning_rate": 0.00010490128543391517, + "loss": 1.3544, + "step": 36600 + }, + { + "epoch": 0.4756133018648795, + "grad_norm": 0.38472363352775574, + "learning_rate": 0.00010489868597200381, + "loss": 1.3404, + "step": 36601 + }, + { + "epoch": 0.47562629640879533, + "grad_norm": 0.3318668305873871, + "learning_rate": 0.00010489608651009242, + "loss": 1.4403, + "step": 36602 + }, + { + "epoch": 0.47563929095271124, + "grad_norm": 0.35916343331336975, + "learning_rate": 0.00010489348704818103, + "loss": 1.3123, + "step": 36603 + }, + { + "epoch": 0.4756522854966271, + "grad_norm": 0.39827048778533936, + "learning_rate": 0.00010489088758626964, + "loss": 1.5645, + "step": 36604 + }, + { + "epoch": 0.475665280040543, + "grad_norm": 0.43854910135269165, + "learning_rate": 0.00010488828812435826, + "loss": 1.2894, + "step": 36605 + }, + { + "epoch": 0.4756782745844588, + "grad_norm": 0.4121466279029846, + "learning_rate": 0.00010488568866244688, + "loss": 1.4554, + "step": 36606 + }, + { + "epoch": 0.4756912691283747, + "grad_norm": 0.35306692123413086, + "learning_rate": 0.00010488308920053549, + "loss": 1.1623, + "step": 36607 + }, + { + "epoch": 0.4757042636722906, + "grad_norm": 0.34602034091949463, + "learning_rate": 0.0001048804897386241, + "loss": 1.345, + "step": 36608 + }, + { + "epoch": 0.4757172582162065, + "grad_norm": 0.32779160141944885, + "learning_rate": 0.00010487789027671273, + "loss": 1.4213, + "step": 36609 + }, + { + "epoch": 0.4757302527601223, + "grad_norm": 0.3613099455833435, + "learning_rate": 0.00010487529081480135, + "loss": 1.4689, + "step": 36610 + }, + { + "epoch": 0.4757432473040382, + "grad_norm": 0.39151838421821594, + "learning_rate": 0.00010487269135288996, + "loss": 1.5746, + "step": 36611 + }, + { + "epoch": 0.47575624184795406, + "grad_norm": 0.4524723291397095, + "learning_rate": 0.00010487009189097858, + "loss": 1.3822, + "step": 36612 + }, + { + "epoch": 0.47576923639186997, + "grad_norm": 0.3835708200931549, + "learning_rate": 0.00010486749242906719, + "loss": 1.3472, + "step": 36613 + }, + { + "epoch": 0.4757822309357858, + "grad_norm": 0.4190136194229126, + "learning_rate": 0.0001048648929671558, + "loss": 1.6112, + "step": 36614 + }, + { + "epoch": 0.4757952254797017, + "grad_norm": 0.39557453989982605, + "learning_rate": 0.00010486229350524441, + "loss": 1.4293, + "step": 36615 + }, + { + "epoch": 0.47580822002361756, + "grad_norm": 0.4559950530529022, + "learning_rate": 0.00010485969404333305, + "loss": 1.4053, + "step": 36616 + }, + { + "epoch": 0.47582121456753346, + "grad_norm": 0.4888871908187866, + "learning_rate": 0.00010485709458142165, + "loss": 1.34, + "step": 36617 + }, + { + "epoch": 0.47583420911144936, + "grad_norm": 0.5446009635925293, + "learning_rate": 0.00010485449511951026, + "loss": 1.3928, + "step": 36618 + }, + { + "epoch": 0.4758472036553652, + "grad_norm": 0.4229432940483093, + "learning_rate": 0.00010485189565759887, + "loss": 1.5064, + "step": 36619 + }, + { + "epoch": 0.4758601981992811, + "grad_norm": 0.3369017541408539, + "learning_rate": 0.00010484929619568751, + "loss": 1.558, + "step": 36620 + }, + { + "epoch": 0.47587319274319695, + "grad_norm": 0.31404536962509155, + "learning_rate": 0.00010484669673377612, + "loss": 1.2749, + "step": 36621 + }, + { + "epoch": 0.47588618728711285, + "grad_norm": 0.3842542767524719, + "learning_rate": 0.00010484409727186473, + "loss": 1.3331, + "step": 36622 + }, + { + "epoch": 0.4758991818310287, + "grad_norm": 0.35615262389183044, + "learning_rate": 0.00010484149780995334, + "loss": 1.2082, + "step": 36623 + }, + { + "epoch": 0.4759121763749446, + "grad_norm": 0.42282161116600037, + "learning_rate": 0.00010483889834804197, + "loss": 1.5044, + "step": 36624 + }, + { + "epoch": 0.47592517091886044, + "grad_norm": 0.3108629882335663, + "learning_rate": 0.00010483629888613058, + "loss": 1.4641, + "step": 36625 + }, + { + "epoch": 0.47593816546277634, + "grad_norm": 0.477609783411026, + "learning_rate": 0.00010483369942421919, + "loss": 1.3815, + "step": 36626 + }, + { + "epoch": 0.4759511600066922, + "grad_norm": 0.3897896707057953, + "learning_rate": 0.0001048310999623078, + "loss": 1.3263, + "step": 36627 + }, + { + "epoch": 0.4759641545506081, + "grad_norm": 0.3767321705818176, + "learning_rate": 0.00010482850050039644, + "loss": 1.5048, + "step": 36628 + }, + { + "epoch": 0.47597714909452393, + "grad_norm": 0.4252581298351288, + "learning_rate": 0.00010482590103848503, + "loss": 1.4949, + "step": 36629 + }, + { + "epoch": 0.47599014363843983, + "grad_norm": 0.3715384602546692, + "learning_rate": 0.00010482330157657365, + "loss": 1.4018, + "step": 36630 + }, + { + "epoch": 0.4760031381823557, + "grad_norm": 0.43687939643859863, + "learning_rate": 0.00010482070211466226, + "loss": 1.3171, + "step": 36631 + }, + { + "epoch": 0.4760161327262716, + "grad_norm": 0.3893674910068512, + "learning_rate": 0.0001048181026527509, + "loss": 1.2424, + "step": 36632 + }, + { + "epoch": 0.4760291272701874, + "grad_norm": 0.3173537254333496, + "learning_rate": 0.0001048155031908395, + "loss": 1.2627, + "step": 36633 + }, + { + "epoch": 0.4760421218141033, + "grad_norm": 0.37747469544410706, + "learning_rate": 0.00010481290372892812, + "loss": 1.2943, + "step": 36634 + }, + { + "epoch": 0.47605511635801917, + "grad_norm": 0.38996821641921997, + "learning_rate": 0.00010481030426701673, + "loss": 1.3556, + "step": 36635 + }, + { + "epoch": 0.47606811090193507, + "grad_norm": 0.4663032591342926, + "learning_rate": 0.00010480770480510535, + "loss": 1.6733, + "step": 36636 + }, + { + "epoch": 0.4760811054458509, + "grad_norm": 0.3623739778995514, + "learning_rate": 0.00010480510534319396, + "loss": 1.4272, + "step": 36637 + }, + { + "epoch": 0.4760940999897668, + "grad_norm": 0.39605262875556946, + "learning_rate": 0.00010480250588128257, + "loss": 1.4316, + "step": 36638 + }, + { + "epoch": 0.47610709453368266, + "grad_norm": 0.42283758521080017, + "learning_rate": 0.00010479990641937118, + "loss": 1.3129, + "step": 36639 + }, + { + "epoch": 0.47612008907759856, + "grad_norm": 0.3668553829193115, + "learning_rate": 0.00010479730695745982, + "loss": 1.5104, + "step": 36640 + }, + { + "epoch": 0.4761330836215144, + "grad_norm": 0.38374996185302734, + "learning_rate": 0.00010479470749554843, + "loss": 1.446, + "step": 36641 + }, + { + "epoch": 0.4761460781654303, + "grad_norm": 0.45210209488868713, + "learning_rate": 0.00010479210803363703, + "loss": 1.2748, + "step": 36642 + }, + { + "epoch": 0.47615907270934615, + "grad_norm": 0.5122808218002319, + "learning_rate": 0.00010478950857172564, + "loss": 1.4623, + "step": 36643 + }, + { + "epoch": 0.47617206725326205, + "grad_norm": 0.42158788442611694, + "learning_rate": 0.00010478690910981428, + "loss": 1.4949, + "step": 36644 + }, + { + "epoch": 0.4761850617971779, + "grad_norm": 0.37460780143737793, + "learning_rate": 0.00010478430964790289, + "loss": 1.4465, + "step": 36645 + }, + { + "epoch": 0.4761980563410938, + "grad_norm": 0.4514116942882538, + "learning_rate": 0.0001047817101859915, + "loss": 1.47, + "step": 36646 + }, + { + "epoch": 0.47621105088500965, + "grad_norm": 0.37568414211273193, + "learning_rate": 0.00010477911072408011, + "loss": 1.2241, + "step": 36647 + }, + { + "epoch": 0.47622404542892555, + "grad_norm": 0.42487233877182007, + "learning_rate": 0.00010477651126216874, + "loss": 1.3388, + "step": 36648 + }, + { + "epoch": 0.4762370399728414, + "grad_norm": 0.4071959853172302, + "learning_rate": 0.00010477391180025735, + "loss": 1.4931, + "step": 36649 + }, + { + "epoch": 0.4762500345167573, + "grad_norm": 0.3203359842300415, + "learning_rate": 0.00010477131233834596, + "loss": 1.2717, + "step": 36650 + }, + { + "epoch": 0.47626302906067314, + "grad_norm": 0.46125444769859314, + "learning_rate": 0.0001047687128764346, + "loss": 1.2407, + "step": 36651 + }, + { + "epoch": 0.47627602360458904, + "grad_norm": 0.3418574035167694, + "learning_rate": 0.00010476611341452321, + "loss": 1.2622, + "step": 36652 + }, + { + "epoch": 0.4762890181485049, + "grad_norm": 0.32993462681770325, + "learning_rate": 0.00010476351395261182, + "loss": 1.4335, + "step": 36653 + }, + { + "epoch": 0.4763020126924208, + "grad_norm": 0.4010336697101593, + "learning_rate": 0.00010476091449070043, + "loss": 1.5089, + "step": 36654 + }, + { + "epoch": 0.47631500723633663, + "grad_norm": 0.3179466128349304, + "learning_rate": 0.00010475831502878905, + "loss": 1.3824, + "step": 36655 + }, + { + "epoch": 0.47632800178025253, + "grad_norm": 0.4837910830974579, + "learning_rate": 0.00010475571556687767, + "loss": 1.5247, + "step": 36656 + }, + { + "epoch": 0.4763409963241684, + "grad_norm": 0.3144797682762146, + "learning_rate": 0.00010475311610496628, + "loss": 1.2134, + "step": 36657 + }, + { + "epoch": 0.4763539908680843, + "grad_norm": 0.441284716129303, + "learning_rate": 0.00010475051664305489, + "loss": 1.557, + "step": 36658 + }, + { + "epoch": 0.4763669854120001, + "grad_norm": 0.36776813864707947, + "learning_rate": 0.00010474791718114351, + "loss": 1.5993, + "step": 36659 + }, + { + "epoch": 0.476379979955916, + "grad_norm": 0.43245264887809753, + "learning_rate": 0.00010474531771923212, + "loss": 1.4374, + "step": 36660 + }, + { + "epoch": 0.47639297449983187, + "grad_norm": 0.3009413480758667, + "learning_rate": 0.00010474271825732073, + "loss": 1.3869, + "step": 36661 + }, + { + "epoch": 0.47640596904374777, + "grad_norm": 0.44934988021850586, + "learning_rate": 0.00010474011879540934, + "loss": 1.4371, + "step": 36662 + }, + { + "epoch": 0.4764189635876636, + "grad_norm": 0.29567110538482666, + "learning_rate": 0.00010473751933349798, + "loss": 1.2406, + "step": 36663 + }, + { + "epoch": 0.4764319581315795, + "grad_norm": 0.43415939807891846, + "learning_rate": 0.0001047349198715866, + "loss": 1.4056, + "step": 36664 + }, + { + "epoch": 0.47644495267549536, + "grad_norm": 0.4963655471801758, + "learning_rate": 0.0001047323204096752, + "loss": 1.5326, + "step": 36665 + }, + { + "epoch": 0.47645794721941126, + "grad_norm": 0.3924178183078766, + "learning_rate": 0.00010472972094776382, + "loss": 1.6769, + "step": 36666 + }, + { + "epoch": 0.4764709417633271, + "grad_norm": 0.3905147910118103, + "learning_rate": 0.00010472712148585244, + "loss": 1.3099, + "step": 36667 + }, + { + "epoch": 0.476483936307243, + "grad_norm": 0.3200380802154541, + "learning_rate": 0.00010472452202394105, + "loss": 1.4758, + "step": 36668 + }, + { + "epoch": 0.47649693085115885, + "grad_norm": 0.41207966208457947, + "learning_rate": 0.00010472192256202966, + "loss": 1.3918, + "step": 36669 + }, + { + "epoch": 0.47650992539507475, + "grad_norm": 0.4541751742362976, + "learning_rate": 0.00010471932310011827, + "loss": 1.4424, + "step": 36670 + }, + { + "epoch": 0.4765229199389906, + "grad_norm": 0.4057697653770447, + "learning_rate": 0.0001047167236382069, + "loss": 1.3574, + "step": 36671 + }, + { + "epoch": 0.4765359144829065, + "grad_norm": 0.3971586525440216, + "learning_rate": 0.00010471412417629551, + "loss": 1.4402, + "step": 36672 + }, + { + "epoch": 0.47654890902682234, + "grad_norm": 0.3684231638908386, + "learning_rate": 0.00010471152471438412, + "loss": 1.3239, + "step": 36673 + }, + { + "epoch": 0.47656190357073824, + "grad_norm": 0.3162926137447357, + "learning_rate": 0.00010470892525247273, + "loss": 1.5398, + "step": 36674 + }, + { + "epoch": 0.4765748981146541, + "grad_norm": 0.37970682978630066, + "learning_rate": 0.00010470632579056137, + "loss": 1.3114, + "step": 36675 + }, + { + "epoch": 0.47658789265857, + "grad_norm": 0.36380571126937866, + "learning_rate": 0.00010470372632864998, + "loss": 1.3879, + "step": 36676 + }, + { + "epoch": 0.47660088720248583, + "grad_norm": 0.43981075286865234, + "learning_rate": 0.00010470112686673859, + "loss": 1.4355, + "step": 36677 + }, + { + "epoch": 0.47661388174640174, + "grad_norm": 0.37799328565597534, + "learning_rate": 0.0001046985274048272, + "loss": 1.301, + "step": 36678 + }, + { + "epoch": 0.4766268762903176, + "grad_norm": 0.4454060196876526, + "learning_rate": 0.00010469592794291583, + "loss": 1.3866, + "step": 36679 + }, + { + "epoch": 0.4766398708342335, + "grad_norm": 0.27437472343444824, + "learning_rate": 0.00010469332848100444, + "loss": 1.3548, + "step": 36680 + }, + { + "epoch": 0.4766528653781493, + "grad_norm": 0.3962048888206482, + "learning_rate": 0.00010469072901909305, + "loss": 1.3742, + "step": 36681 + }, + { + "epoch": 0.4766658599220652, + "grad_norm": 0.27999815344810486, + "learning_rate": 0.00010468812955718166, + "loss": 1.5608, + "step": 36682 + }, + { + "epoch": 0.47667885446598107, + "grad_norm": 0.3239463269710541, + "learning_rate": 0.0001046855300952703, + "loss": 1.2827, + "step": 36683 + }, + { + "epoch": 0.476691849009897, + "grad_norm": 0.39868465065956116, + "learning_rate": 0.0001046829306333589, + "loss": 1.3061, + "step": 36684 + }, + { + "epoch": 0.4767048435538128, + "grad_norm": 0.3230758011341095, + "learning_rate": 0.0001046803311714475, + "loss": 1.4353, + "step": 36685 + }, + { + "epoch": 0.4767178380977287, + "grad_norm": 0.41743355989456177, + "learning_rate": 0.00010467773170953614, + "loss": 1.3816, + "step": 36686 + }, + { + "epoch": 0.47673083264164456, + "grad_norm": 0.40204691886901855, + "learning_rate": 0.00010467513224762475, + "loss": 1.211, + "step": 36687 + }, + { + "epoch": 0.47674382718556046, + "grad_norm": 0.45041894912719727, + "learning_rate": 0.00010467253278571336, + "loss": 1.3671, + "step": 36688 + }, + { + "epoch": 0.4767568217294763, + "grad_norm": 0.4229316711425781, + "learning_rate": 0.00010466993332380198, + "loss": 1.5293, + "step": 36689 + }, + { + "epoch": 0.4767698162733922, + "grad_norm": 0.43961793184280396, + "learning_rate": 0.0001046673338618906, + "loss": 1.4271, + "step": 36690 + }, + { + "epoch": 0.47678281081730806, + "grad_norm": 0.3478478789329529, + "learning_rate": 0.00010466473439997921, + "loss": 1.3915, + "step": 36691 + }, + { + "epoch": 0.47679580536122396, + "grad_norm": 0.38445302844047546, + "learning_rate": 0.00010466213493806782, + "loss": 1.2262, + "step": 36692 + }, + { + "epoch": 0.4768087999051398, + "grad_norm": 0.3939126431941986, + "learning_rate": 0.00010465953547615643, + "loss": 1.2389, + "step": 36693 + }, + { + "epoch": 0.4768217944490557, + "grad_norm": 0.39434489607810974, + "learning_rate": 0.00010465693601424507, + "loss": 1.4247, + "step": 36694 + }, + { + "epoch": 0.4768347889929716, + "grad_norm": 0.3936539590358734, + "learning_rate": 0.00010465433655233368, + "loss": 1.3082, + "step": 36695 + }, + { + "epoch": 0.47684778353688745, + "grad_norm": 0.28185704350471497, + "learning_rate": 0.00010465173709042229, + "loss": 1.1577, + "step": 36696 + }, + { + "epoch": 0.47686077808080335, + "grad_norm": 0.44948533177375793, + "learning_rate": 0.00010464913762851089, + "loss": 1.3791, + "step": 36697 + }, + { + "epoch": 0.4768737726247192, + "grad_norm": 0.3790379464626312, + "learning_rate": 0.00010464653816659953, + "loss": 1.292, + "step": 36698 + }, + { + "epoch": 0.4768867671686351, + "grad_norm": 0.4188931882381439, + "learning_rate": 0.00010464393870468814, + "loss": 1.2847, + "step": 36699 + }, + { + "epoch": 0.47689976171255094, + "grad_norm": 0.39663296937942505, + "learning_rate": 0.00010464133924277675, + "loss": 1.3052, + "step": 36700 + }, + { + "epoch": 0.47691275625646684, + "grad_norm": 0.3618800640106201, + "learning_rate": 0.00010463873978086536, + "loss": 1.419, + "step": 36701 + }, + { + "epoch": 0.4769257508003827, + "grad_norm": 0.40006691217422485, + "learning_rate": 0.00010463614031895399, + "loss": 1.5252, + "step": 36702 + }, + { + "epoch": 0.4769387453442986, + "grad_norm": 0.47297531366348267, + "learning_rate": 0.0001046335408570426, + "loss": 1.5889, + "step": 36703 + }, + { + "epoch": 0.47695173988821443, + "grad_norm": 0.4589444100856781, + "learning_rate": 0.00010463094139513121, + "loss": 1.5531, + "step": 36704 + }, + { + "epoch": 0.47696473443213033, + "grad_norm": 0.3990491032600403, + "learning_rate": 0.00010462834193321982, + "loss": 1.4231, + "step": 36705 + }, + { + "epoch": 0.4769777289760462, + "grad_norm": 0.4526325762271881, + "learning_rate": 0.00010462574247130846, + "loss": 1.3566, + "step": 36706 + }, + { + "epoch": 0.4769907235199621, + "grad_norm": 0.4384739398956299, + "learning_rate": 0.00010462314300939707, + "loss": 1.4456, + "step": 36707 + }, + { + "epoch": 0.4770037180638779, + "grad_norm": 0.37170732021331787, + "learning_rate": 0.00010462054354748568, + "loss": 1.3146, + "step": 36708 + }, + { + "epoch": 0.4770167126077938, + "grad_norm": 0.3383696973323822, + "learning_rate": 0.00010461794408557428, + "loss": 1.3831, + "step": 36709 + }, + { + "epoch": 0.47702970715170967, + "grad_norm": 0.43649008870124817, + "learning_rate": 0.00010461534462366291, + "loss": 1.5844, + "step": 36710 + }, + { + "epoch": 0.47704270169562557, + "grad_norm": 0.38313889503479004, + "learning_rate": 0.00010461274516175152, + "loss": 1.4672, + "step": 36711 + }, + { + "epoch": 0.4770556962395414, + "grad_norm": 0.35692378878593445, + "learning_rate": 0.00010461014569984014, + "loss": 1.4175, + "step": 36712 + }, + { + "epoch": 0.4770686907834573, + "grad_norm": 0.4566735625267029, + "learning_rate": 0.00010460754623792875, + "loss": 1.5214, + "step": 36713 + }, + { + "epoch": 0.47708168532737316, + "grad_norm": 0.4379623830318451, + "learning_rate": 0.00010460494677601737, + "loss": 1.4347, + "step": 36714 + }, + { + "epoch": 0.47709467987128906, + "grad_norm": 0.47974613308906555, + "learning_rate": 0.00010460234731410598, + "loss": 1.403, + "step": 36715 + }, + { + "epoch": 0.4771076744152049, + "grad_norm": 0.34905222058296204, + "learning_rate": 0.00010459974785219459, + "loss": 1.3785, + "step": 36716 + }, + { + "epoch": 0.4771206689591208, + "grad_norm": 0.2997996211051941, + "learning_rate": 0.0001045971483902832, + "loss": 1.4868, + "step": 36717 + }, + { + "epoch": 0.47713366350303665, + "grad_norm": 0.40797334909439087, + "learning_rate": 0.00010459454892837184, + "loss": 1.3461, + "step": 36718 + }, + { + "epoch": 0.47714665804695255, + "grad_norm": 0.3335047662258148, + "learning_rate": 0.00010459194946646045, + "loss": 1.4098, + "step": 36719 + }, + { + "epoch": 0.4771596525908684, + "grad_norm": 0.48377835750579834, + "learning_rate": 0.00010458935000454906, + "loss": 1.4923, + "step": 36720 + }, + { + "epoch": 0.4771726471347843, + "grad_norm": 0.4701981544494629, + "learning_rate": 0.00010458675054263767, + "loss": 1.3209, + "step": 36721 + }, + { + "epoch": 0.47718564167870015, + "grad_norm": 0.35936102271080017, + "learning_rate": 0.0001045841510807263, + "loss": 1.438, + "step": 36722 + }, + { + "epoch": 0.47719863622261605, + "grad_norm": 0.5204327702522278, + "learning_rate": 0.00010458155161881491, + "loss": 1.5019, + "step": 36723 + }, + { + "epoch": 0.4772116307665319, + "grad_norm": 0.38770392537117004, + "learning_rate": 0.00010457895215690352, + "loss": 1.299, + "step": 36724 + }, + { + "epoch": 0.4772246253104478, + "grad_norm": 0.4550442397594452, + "learning_rate": 0.00010457635269499216, + "loss": 1.4058, + "step": 36725 + }, + { + "epoch": 0.47723761985436364, + "grad_norm": 0.40520885586738586, + "learning_rate": 0.00010457375323308076, + "loss": 1.4137, + "step": 36726 + }, + { + "epoch": 0.47725061439827954, + "grad_norm": 0.38424715399742126, + "learning_rate": 0.00010457115377116937, + "loss": 1.4156, + "step": 36727 + }, + { + "epoch": 0.4772636089421954, + "grad_norm": 0.40615326166152954, + "learning_rate": 0.00010456855430925798, + "loss": 1.5019, + "step": 36728 + }, + { + "epoch": 0.4772766034861113, + "grad_norm": 0.3706136643886566, + "learning_rate": 0.00010456595484734662, + "loss": 1.4819, + "step": 36729 + }, + { + "epoch": 0.47728959803002713, + "grad_norm": 0.3748354911804199, + "learning_rate": 0.00010456335538543523, + "loss": 1.474, + "step": 36730 + }, + { + "epoch": 0.47730259257394303, + "grad_norm": 0.5021761059761047, + "learning_rate": 0.00010456075592352384, + "loss": 1.441, + "step": 36731 + }, + { + "epoch": 0.4773155871178589, + "grad_norm": 0.3414623737335205, + "learning_rate": 0.00010455815646161245, + "loss": 1.1933, + "step": 36732 + }, + { + "epoch": 0.4773285816617748, + "grad_norm": 0.46403640508651733, + "learning_rate": 0.00010455555699970107, + "loss": 1.3232, + "step": 36733 + }, + { + "epoch": 0.4773415762056906, + "grad_norm": 0.34871384501457214, + "learning_rate": 0.00010455295753778968, + "loss": 1.0337, + "step": 36734 + }, + { + "epoch": 0.4773545707496065, + "grad_norm": 0.5148264765739441, + "learning_rate": 0.0001045503580758783, + "loss": 1.3805, + "step": 36735 + }, + { + "epoch": 0.47736756529352237, + "grad_norm": 0.4619481861591339, + "learning_rate": 0.0001045477586139669, + "loss": 1.3468, + "step": 36736 + }, + { + "epoch": 0.47738055983743827, + "grad_norm": 0.3305440843105316, + "learning_rate": 0.00010454515915205554, + "loss": 1.2342, + "step": 36737 + }, + { + "epoch": 0.4773935543813541, + "grad_norm": 0.44203656911849976, + "learning_rate": 0.00010454255969014416, + "loss": 1.2909, + "step": 36738 + }, + { + "epoch": 0.47740654892527, + "grad_norm": 0.37053728103637695, + "learning_rate": 0.00010453996022823275, + "loss": 1.1953, + "step": 36739 + }, + { + "epoch": 0.47741954346918586, + "grad_norm": 0.3549066185951233, + "learning_rate": 0.00010453736076632136, + "loss": 1.4241, + "step": 36740 + }, + { + "epoch": 0.47743253801310176, + "grad_norm": 0.4641919434070587, + "learning_rate": 0.00010453476130441, + "loss": 1.2895, + "step": 36741 + }, + { + "epoch": 0.4774455325570176, + "grad_norm": 0.3824481666088104, + "learning_rate": 0.00010453216184249861, + "loss": 1.3224, + "step": 36742 + }, + { + "epoch": 0.4774585271009335, + "grad_norm": 0.5151437520980835, + "learning_rate": 0.00010452956238058722, + "loss": 1.3017, + "step": 36743 + }, + { + "epoch": 0.47747152164484935, + "grad_norm": 0.4059079885482788, + "learning_rate": 0.00010452696291867583, + "loss": 1.3959, + "step": 36744 + }, + { + "epoch": 0.47748451618876525, + "grad_norm": 0.5170286297798157, + "learning_rate": 0.00010452436345676446, + "loss": 1.3562, + "step": 36745 + }, + { + "epoch": 0.4774975107326811, + "grad_norm": 0.4208987057209015, + "learning_rate": 0.00010452176399485307, + "loss": 1.3479, + "step": 36746 + }, + { + "epoch": 0.477510505276597, + "grad_norm": 0.38757646083831787, + "learning_rate": 0.00010451916453294168, + "loss": 1.3206, + "step": 36747 + }, + { + "epoch": 0.47752349982051284, + "grad_norm": 0.4275495111942291, + "learning_rate": 0.00010451656507103029, + "loss": 1.3001, + "step": 36748 + }, + { + "epoch": 0.47753649436442874, + "grad_norm": 0.4168182909488678, + "learning_rate": 0.00010451396560911893, + "loss": 1.5246, + "step": 36749 + }, + { + "epoch": 0.4775494889083446, + "grad_norm": 0.4191395044326782, + "learning_rate": 0.00010451136614720754, + "loss": 1.3147, + "step": 36750 + }, + { + "epoch": 0.4775624834522605, + "grad_norm": 0.4682447016239166, + "learning_rate": 0.00010450876668529614, + "loss": 1.3921, + "step": 36751 + }, + { + "epoch": 0.47757547799617633, + "grad_norm": 0.5731196999549866, + "learning_rate": 0.00010450616722338475, + "loss": 1.3467, + "step": 36752 + }, + { + "epoch": 0.47758847254009223, + "grad_norm": 0.3306954801082611, + "learning_rate": 0.00010450356776147339, + "loss": 1.3271, + "step": 36753 + }, + { + "epoch": 0.4776014670840081, + "grad_norm": 0.3633701801300049, + "learning_rate": 0.000104500968299562, + "loss": 1.4138, + "step": 36754 + }, + { + "epoch": 0.477614461627924, + "grad_norm": 0.3771771788597107, + "learning_rate": 0.00010449836883765061, + "loss": 1.2619, + "step": 36755 + }, + { + "epoch": 0.4776274561718398, + "grad_norm": 0.7041621804237366, + "learning_rate": 0.00010449576937573922, + "loss": 1.35, + "step": 36756 + }, + { + "epoch": 0.4776404507157557, + "grad_norm": 0.43414121866226196, + "learning_rate": 0.00010449316991382784, + "loss": 1.3361, + "step": 36757 + }, + { + "epoch": 0.47765344525967157, + "grad_norm": 0.4131770730018616, + "learning_rate": 0.00010449057045191645, + "loss": 1.264, + "step": 36758 + }, + { + "epoch": 0.4776664398035875, + "grad_norm": 0.48765864968299866, + "learning_rate": 0.00010448797099000507, + "loss": 1.4956, + "step": 36759 + }, + { + "epoch": 0.4776794343475033, + "grad_norm": 0.36018362641334534, + "learning_rate": 0.0001044853715280937, + "loss": 1.4049, + "step": 36760 + }, + { + "epoch": 0.4776924288914192, + "grad_norm": 0.4293461740016937, + "learning_rate": 0.00010448277206618231, + "loss": 1.3266, + "step": 36761 + }, + { + "epoch": 0.47770542343533506, + "grad_norm": 0.41854169964790344, + "learning_rate": 0.00010448017260427093, + "loss": 1.3138, + "step": 36762 + }, + { + "epoch": 0.47771841797925096, + "grad_norm": 0.38281795382499695, + "learning_rate": 0.00010447757314235954, + "loss": 1.4074, + "step": 36763 + }, + { + "epoch": 0.4777314125231668, + "grad_norm": 0.40058383345603943, + "learning_rate": 0.00010447497368044816, + "loss": 1.4795, + "step": 36764 + }, + { + "epoch": 0.4777444070670827, + "grad_norm": 0.41241368651390076, + "learning_rate": 0.00010447237421853677, + "loss": 1.5214, + "step": 36765 + }, + { + "epoch": 0.47775740161099856, + "grad_norm": 0.40768712759017944, + "learning_rate": 0.00010446977475662538, + "loss": 1.3962, + "step": 36766 + }, + { + "epoch": 0.47777039615491446, + "grad_norm": 0.48448818922042847, + "learning_rate": 0.000104467175294714, + "loss": 1.3316, + "step": 36767 + }, + { + "epoch": 0.4777833906988303, + "grad_norm": 0.3831901252269745, + "learning_rate": 0.00010446457583280262, + "loss": 1.5922, + "step": 36768 + }, + { + "epoch": 0.4777963852427462, + "grad_norm": 0.2974262535572052, + "learning_rate": 0.00010446197637089123, + "loss": 1.413, + "step": 36769 + }, + { + "epoch": 0.4778093797866621, + "grad_norm": 0.4693393409252167, + "learning_rate": 0.00010445937690897984, + "loss": 1.448, + "step": 36770 + }, + { + "epoch": 0.47782237433057795, + "grad_norm": 0.38295403122901917, + "learning_rate": 0.00010445677744706845, + "loss": 1.1731, + "step": 36771 + }, + { + "epoch": 0.47783536887449385, + "grad_norm": 0.3569636046886444, + "learning_rate": 0.00010445417798515709, + "loss": 1.5175, + "step": 36772 + }, + { + "epoch": 0.4778483634184097, + "grad_norm": 0.35745254158973694, + "learning_rate": 0.0001044515785232457, + "loss": 1.3236, + "step": 36773 + }, + { + "epoch": 0.4778613579623256, + "grad_norm": 0.3957768380641937, + "learning_rate": 0.00010444897906133431, + "loss": 1.5653, + "step": 36774 + }, + { + "epoch": 0.47787435250624144, + "grad_norm": 0.38975584506988525, + "learning_rate": 0.00010444637959942292, + "loss": 1.46, + "step": 36775 + }, + { + "epoch": 0.47788734705015734, + "grad_norm": 0.36142873764038086, + "learning_rate": 0.00010444378013751155, + "loss": 1.4422, + "step": 36776 + }, + { + "epoch": 0.4779003415940732, + "grad_norm": 0.35678035020828247, + "learning_rate": 0.00010444118067560016, + "loss": 1.3891, + "step": 36777 + }, + { + "epoch": 0.4779133361379891, + "grad_norm": 0.2892293930053711, + "learning_rate": 0.00010443858121368877, + "loss": 1.1081, + "step": 36778 + }, + { + "epoch": 0.47792633068190493, + "grad_norm": 0.5017591714859009, + "learning_rate": 0.00010443598175177738, + "loss": 1.4209, + "step": 36779 + }, + { + "epoch": 0.47793932522582083, + "grad_norm": 0.402207612991333, + "learning_rate": 0.00010443338228986602, + "loss": 1.4113, + "step": 36780 + }, + { + "epoch": 0.4779523197697367, + "grad_norm": 0.39133259654045105, + "learning_rate": 0.00010443078282795461, + "loss": 1.3819, + "step": 36781 + }, + { + "epoch": 0.4779653143136526, + "grad_norm": 0.28217756748199463, + "learning_rate": 0.00010442818336604323, + "loss": 1.2681, + "step": 36782 + }, + { + "epoch": 0.4779783088575684, + "grad_norm": 0.29145336151123047, + "learning_rate": 0.00010442558390413184, + "loss": 1.24, + "step": 36783 + }, + { + "epoch": 0.4779913034014843, + "grad_norm": 0.33243465423583984, + "learning_rate": 0.00010442298444222047, + "loss": 1.3431, + "step": 36784 + }, + { + "epoch": 0.47800429794540017, + "grad_norm": 0.41443878412246704, + "learning_rate": 0.00010442038498030909, + "loss": 1.3205, + "step": 36785 + }, + { + "epoch": 0.47801729248931607, + "grad_norm": 0.47226008772850037, + "learning_rate": 0.0001044177855183977, + "loss": 1.46, + "step": 36786 + }, + { + "epoch": 0.4780302870332319, + "grad_norm": 0.4109139144420624, + "learning_rate": 0.00010441518605648631, + "loss": 1.4472, + "step": 36787 + }, + { + "epoch": 0.4780432815771478, + "grad_norm": 0.3589145839214325, + "learning_rate": 0.00010441258659457493, + "loss": 1.2934, + "step": 36788 + }, + { + "epoch": 0.47805627612106366, + "grad_norm": 0.4250454306602478, + "learning_rate": 0.00010440998713266354, + "loss": 1.298, + "step": 36789 + }, + { + "epoch": 0.47806927066497956, + "grad_norm": 0.3481067419052124, + "learning_rate": 0.00010440738767075215, + "loss": 1.2307, + "step": 36790 + }, + { + "epoch": 0.4780822652088954, + "grad_norm": 0.3831326961517334, + "learning_rate": 0.00010440478820884076, + "loss": 1.4705, + "step": 36791 + }, + { + "epoch": 0.4780952597528113, + "grad_norm": 0.402998149394989, + "learning_rate": 0.0001044021887469294, + "loss": 1.4058, + "step": 36792 + }, + { + "epoch": 0.47810825429672715, + "grad_norm": 0.3856375813484192, + "learning_rate": 0.000104399589285018, + "loss": 1.4211, + "step": 36793 + }, + { + "epoch": 0.47812124884064305, + "grad_norm": 0.38131654262542725, + "learning_rate": 0.00010439698982310661, + "loss": 1.1716, + "step": 36794 + }, + { + "epoch": 0.4781342433845589, + "grad_norm": 0.3915690779685974, + "learning_rate": 0.00010439439036119522, + "loss": 1.2754, + "step": 36795 + }, + { + "epoch": 0.4781472379284748, + "grad_norm": 0.47065722942352295, + "learning_rate": 0.00010439179089928386, + "loss": 1.4023, + "step": 36796 + }, + { + "epoch": 0.47816023247239064, + "grad_norm": 0.3348385691642761, + "learning_rate": 0.00010438919143737247, + "loss": 1.2122, + "step": 36797 + }, + { + "epoch": 0.47817322701630655, + "grad_norm": 0.4033292829990387, + "learning_rate": 0.00010438659197546108, + "loss": 1.3138, + "step": 36798 + }, + { + "epoch": 0.4781862215602224, + "grad_norm": 0.4553394913673401, + "learning_rate": 0.0001043839925135497, + "loss": 1.5404, + "step": 36799 + }, + { + "epoch": 0.4781992161041383, + "grad_norm": 0.5414491891860962, + "learning_rate": 0.00010438139305163832, + "loss": 1.4562, + "step": 36800 + }, + { + "epoch": 0.47821221064805414, + "grad_norm": 0.42657220363616943, + "learning_rate": 0.00010437879358972693, + "loss": 1.5404, + "step": 36801 + }, + { + "epoch": 0.47822520519197004, + "grad_norm": 0.3631247878074646, + "learning_rate": 0.00010437619412781554, + "loss": 1.2505, + "step": 36802 + }, + { + "epoch": 0.4782381997358859, + "grad_norm": 0.34981024265289307, + "learning_rate": 0.00010437359466590418, + "loss": 1.4006, + "step": 36803 + }, + { + "epoch": 0.4782511942798018, + "grad_norm": 0.4517924189567566, + "learning_rate": 0.00010437099520399279, + "loss": 1.2859, + "step": 36804 + }, + { + "epoch": 0.47826418882371763, + "grad_norm": 0.36635440587997437, + "learning_rate": 0.0001043683957420814, + "loss": 1.349, + "step": 36805 + }, + { + "epoch": 0.47827718336763353, + "grad_norm": 0.39470964670181274, + "learning_rate": 0.00010436579628017, + "loss": 1.5561, + "step": 36806 + }, + { + "epoch": 0.4782901779115494, + "grad_norm": 0.3966015875339508, + "learning_rate": 0.00010436319681825863, + "loss": 1.2293, + "step": 36807 + }, + { + "epoch": 0.4783031724554653, + "grad_norm": 0.45323726534843445, + "learning_rate": 0.00010436059735634725, + "loss": 1.4741, + "step": 36808 + }, + { + "epoch": 0.4783161669993811, + "grad_norm": 0.3808179795742035, + "learning_rate": 0.00010435799789443586, + "loss": 1.3603, + "step": 36809 + }, + { + "epoch": 0.478329161543297, + "grad_norm": 0.3350905776023865, + "learning_rate": 0.00010435539843252447, + "loss": 1.3055, + "step": 36810 + }, + { + "epoch": 0.47834215608721287, + "grad_norm": 0.402875691652298, + "learning_rate": 0.00010435279897061309, + "loss": 1.4636, + "step": 36811 + }, + { + "epoch": 0.47835515063112877, + "grad_norm": 0.3703416585922241, + "learning_rate": 0.0001043501995087017, + "loss": 1.3091, + "step": 36812 + }, + { + "epoch": 0.4783681451750446, + "grad_norm": 0.31940892338752747, + "learning_rate": 0.00010434760004679031, + "loss": 1.4521, + "step": 36813 + }, + { + "epoch": 0.4783811397189605, + "grad_norm": 0.3889502286911011, + "learning_rate": 0.00010434500058487892, + "loss": 1.3578, + "step": 36814 + }, + { + "epoch": 0.47839413426287636, + "grad_norm": 0.3272900879383087, + "learning_rate": 0.00010434240112296756, + "loss": 1.1229, + "step": 36815 + }, + { + "epoch": 0.47840712880679226, + "grad_norm": 0.4614048898220062, + "learning_rate": 0.00010433980166105617, + "loss": 1.3222, + "step": 36816 + }, + { + "epoch": 0.4784201233507081, + "grad_norm": 0.44233179092407227, + "learning_rate": 0.00010433720219914478, + "loss": 1.4231, + "step": 36817 + }, + { + "epoch": 0.478433117894624, + "grad_norm": 0.37301528453826904, + "learning_rate": 0.0001043346027372334, + "loss": 1.2571, + "step": 36818 + }, + { + "epoch": 0.47844611243853985, + "grad_norm": 0.34721657633781433, + "learning_rate": 0.00010433200327532202, + "loss": 1.4824, + "step": 36819 + }, + { + "epoch": 0.47845910698245575, + "grad_norm": 0.34401950240135193, + "learning_rate": 0.00010432940381341063, + "loss": 1.5498, + "step": 36820 + }, + { + "epoch": 0.4784721015263716, + "grad_norm": 0.46975505352020264, + "learning_rate": 0.00010432680435149924, + "loss": 1.4109, + "step": 36821 + }, + { + "epoch": 0.4784850960702875, + "grad_norm": 0.43760818243026733, + "learning_rate": 0.00010432420488958785, + "loss": 1.2845, + "step": 36822 + }, + { + "epoch": 0.47849809061420334, + "grad_norm": 0.4412611126899719, + "learning_rate": 0.00010432160542767648, + "loss": 1.5004, + "step": 36823 + }, + { + "epoch": 0.47851108515811924, + "grad_norm": 0.4533897936344147, + "learning_rate": 0.00010431900596576509, + "loss": 1.4093, + "step": 36824 + }, + { + "epoch": 0.4785240797020351, + "grad_norm": 0.2852482199668884, + "learning_rate": 0.0001043164065038537, + "loss": 1.2727, + "step": 36825 + }, + { + "epoch": 0.478537074245951, + "grad_norm": 0.3644379675388336, + "learning_rate": 0.00010431380704194231, + "loss": 1.1859, + "step": 36826 + }, + { + "epoch": 0.47855006878986683, + "grad_norm": 0.4098767936229706, + "learning_rate": 0.00010431120758003095, + "loss": 1.3433, + "step": 36827 + }, + { + "epoch": 0.47856306333378273, + "grad_norm": 0.3520409166812897, + "learning_rate": 0.00010430860811811956, + "loss": 1.4782, + "step": 36828 + }, + { + "epoch": 0.4785760578776986, + "grad_norm": 0.3600299060344696, + "learning_rate": 0.00010430600865620817, + "loss": 1.4798, + "step": 36829 + }, + { + "epoch": 0.4785890524216145, + "grad_norm": 0.3805144131183624, + "learning_rate": 0.00010430340919429678, + "loss": 1.3649, + "step": 36830 + }, + { + "epoch": 0.4786020469655303, + "grad_norm": 0.3655862510204315, + "learning_rate": 0.0001043008097323854, + "loss": 1.5389, + "step": 36831 + }, + { + "epoch": 0.4786150415094462, + "grad_norm": 0.3184344172477722, + "learning_rate": 0.00010429821027047402, + "loss": 1.337, + "step": 36832 + }, + { + "epoch": 0.47862803605336207, + "grad_norm": 0.2409345954656601, + "learning_rate": 0.00010429561080856263, + "loss": 1.1638, + "step": 36833 + }, + { + "epoch": 0.47864103059727797, + "grad_norm": 0.3876078724861145, + "learning_rate": 0.00010429301134665127, + "loss": 1.1411, + "step": 36834 + }, + { + "epoch": 0.4786540251411938, + "grad_norm": 0.4354804754257202, + "learning_rate": 0.00010429041188473986, + "loss": 1.1954, + "step": 36835 + }, + { + "epoch": 0.4786670196851097, + "grad_norm": 0.41031157970428467, + "learning_rate": 0.00010428781242282847, + "loss": 1.2795, + "step": 36836 + }, + { + "epoch": 0.47868001422902556, + "grad_norm": 0.43959349393844604, + "learning_rate": 0.00010428521296091708, + "loss": 1.3217, + "step": 36837 + }, + { + "epoch": 0.47869300877294146, + "grad_norm": 0.47185084223747253, + "learning_rate": 0.00010428261349900572, + "loss": 1.3637, + "step": 36838 + }, + { + "epoch": 0.4787060033168573, + "grad_norm": 0.42578238248825073, + "learning_rate": 0.00010428001403709433, + "loss": 1.3615, + "step": 36839 + }, + { + "epoch": 0.4787189978607732, + "grad_norm": 0.4404887557029724, + "learning_rate": 0.00010427741457518294, + "loss": 1.4168, + "step": 36840 + }, + { + "epoch": 0.47873199240468906, + "grad_norm": 0.3674813508987427, + "learning_rate": 0.00010427481511327156, + "loss": 1.3231, + "step": 36841 + }, + { + "epoch": 0.47874498694860496, + "grad_norm": 0.42611491680145264, + "learning_rate": 0.00010427221565136018, + "loss": 1.4343, + "step": 36842 + }, + { + "epoch": 0.4787579814925208, + "grad_norm": 0.4254211187362671, + "learning_rate": 0.00010426961618944879, + "loss": 1.3801, + "step": 36843 + }, + { + "epoch": 0.4787709760364367, + "grad_norm": 0.4078112542629242, + "learning_rate": 0.0001042670167275374, + "loss": 1.3886, + "step": 36844 + }, + { + "epoch": 0.47878397058035255, + "grad_norm": 0.46777769923210144, + "learning_rate": 0.00010426441726562601, + "loss": 1.471, + "step": 36845 + }, + { + "epoch": 0.47879696512426845, + "grad_norm": 0.474729984998703, + "learning_rate": 0.00010426181780371465, + "loss": 1.3362, + "step": 36846 + }, + { + "epoch": 0.47880995966818435, + "grad_norm": 0.37238064408302307, + "learning_rate": 0.00010425921834180326, + "loss": 1.3655, + "step": 36847 + }, + { + "epoch": 0.4788229542121002, + "grad_norm": 0.40230315923690796, + "learning_rate": 0.00010425661887989186, + "loss": 1.3212, + "step": 36848 + }, + { + "epoch": 0.4788359487560161, + "grad_norm": 0.4350760579109192, + "learning_rate": 0.00010425401941798047, + "loss": 1.4981, + "step": 36849 + }, + { + "epoch": 0.47884894329993194, + "grad_norm": 0.3952096104621887, + "learning_rate": 0.00010425141995606911, + "loss": 1.2596, + "step": 36850 + }, + { + "epoch": 0.47886193784384784, + "grad_norm": 0.4227961599826813, + "learning_rate": 0.00010424882049415772, + "loss": 1.3455, + "step": 36851 + }, + { + "epoch": 0.4788749323877637, + "grad_norm": 0.34530699253082275, + "learning_rate": 0.00010424622103224633, + "loss": 1.4071, + "step": 36852 + }, + { + "epoch": 0.4788879269316796, + "grad_norm": 0.3591969311237335, + "learning_rate": 0.00010424362157033494, + "loss": 1.241, + "step": 36853 + }, + { + "epoch": 0.47890092147559543, + "grad_norm": 0.3156588077545166, + "learning_rate": 0.00010424102210842357, + "loss": 1.4248, + "step": 36854 + }, + { + "epoch": 0.47891391601951133, + "grad_norm": 0.2812151610851288, + "learning_rate": 0.00010423842264651218, + "loss": 1.0086, + "step": 36855 + }, + { + "epoch": 0.4789269105634272, + "grad_norm": 0.314841628074646, + "learning_rate": 0.00010423582318460079, + "loss": 1.4356, + "step": 36856 + }, + { + "epoch": 0.4789399051073431, + "grad_norm": 0.4753133952617645, + "learning_rate": 0.0001042332237226894, + "loss": 1.3079, + "step": 36857 + }, + { + "epoch": 0.4789528996512589, + "grad_norm": 0.36326488852500916, + "learning_rate": 0.00010423062426077804, + "loss": 1.3631, + "step": 36858 + }, + { + "epoch": 0.4789658941951748, + "grad_norm": 0.4626297950744629, + "learning_rate": 0.00010422802479886665, + "loss": 1.4716, + "step": 36859 + }, + { + "epoch": 0.47897888873909067, + "grad_norm": 0.3644844591617584, + "learning_rate": 0.00010422542533695526, + "loss": 1.299, + "step": 36860 + }, + { + "epoch": 0.47899188328300657, + "grad_norm": 0.4530969262123108, + "learning_rate": 0.00010422282587504386, + "loss": 1.4728, + "step": 36861 + }, + { + "epoch": 0.4790048778269224, + "grad_norm": 0.41439902782440186, + "learning_rate": 0.0001042202264131325, + "loss": 1.4121, + "step": 36862 + }, + { + "epoch": 0.4790178723708383, + "grad_norm": 0.4508451521396637, + "learning_rate": 0.0001042176269512211, + "loss": 1.4569, + "step": 36863 + }, + { + "epoch": 0.47903086691475416, + "grad_norm": 0.3606402575969696, + "learning_rate": 0.00010421502748930972, + "loss": 1.4459, + "step": 36864 + }, + { + "epoch": 0.47904386145867006, + "grad_norm": 0.3238128423690796, + "learning_rate": 0.00010421242802739833, + "loss": 1.3325, + "step": 36865 + }, + { + "epoch": 0.4790568560025859, + "grad_norm": 0.38639646768569946, + "learning_rate": 0.00010420982856548695, + "loss": 1.4325, + "step": 36866 + }, + { + "epoch": 0.4790698505465018, + "grad_norm": 0.44729191064834595, + "learning_rate": 0.00010420722910357556, + "loss": 1.4411, + "step": 36867 + }, + { + "epoch": 0.47908284509041765, + "grad_norm": 0.46440497040748596, + "learning_rate": 0.00010420462964166417, + "loss": 1.3538, + "step": 36868 + }, + { + "epoch": 0.47909583963433355, + "grad_norm": 0.4782659411430359, + "learning_rate": 0.00010420203017975278, + "loss": 1.4666, + "step": 36869 + }, + { + "epoch": 0.4791088341782494, + "grad_norm": 0.32168638706207275, + "learning_rate": 0.00010419943071784142, + "loss": 1.3294, + "step": 36870 + }, + { + "epoch": 0.4791218287221653, + "grad_norm": 0.46257731318473816, + "learning_rate": 0.00010419683125593003, + "loss": 1.3517, + "step": 36871 + }, + { + "epoch": 0.47913482326608114, + "grad_norm": 0.44591307640075684, + "learning_rate": 0.00010419423179401864, + "loss": 1.4001, + "step": 36872 + }, + { + "epoch": 0.47914781780999705, + "grad_norm": 0.38360798358917236, + "learning_rate": 0.00010419163233210727, + "loss": 1.2712, + "step": 36873 + }, + { + "epoch": 0.4791608123539129, + "grad_norm": 0.37663766741752625, + "learning_rate": 0.00010418903287019588, + "loss": 1.4411, + "step": 36874 + }, + { + "epoch": 0.4791738068978288, + "grad_norm": 0.49456003308296204, + "learning_rate": 0.00010418643340828449, + "loss": 1.2319, + "step": 36875 + }, + { + "epoch": 0.47918680144174464, + "grad_norm": 0.4339449107646942, + "learning_rate": 0.0001041838339463731, + "loss": 1.4641, + "step": 36876 + }, + { + "epoch": 0.47919979598566054, + "grad_norm": 0.3863827586174011, + "learning_rate": 0.00010418123448446173, + "loss": 1.3991, + "step": 36877 + }, + { + "epoch": 0.4792127905295764, + "grad_norm": 0.3222728371620178, + "learning_rate": 0.00010417863502255034, + "loss": 1.3971, + "step": 36878 + }, + { + "epoch": 0.4792257850734923, + "grad_norm": 0.4629512429237366, + "learning_rate": 0.00010417603556063895, + "loss": 1.4163, + "step": 36879 + }, + { + "epoch": 0.47923877961740813, + "grad_norm": 0.3564627468585968, + "learning_rate": 0.00010417343609872756, + "loss": 1.3156, + "step": 36880 + }, + { + "epoch": 0.47925177416132403, + "grad_norm": 0.4132477045059204, + "learning_rate": 0.0001041708366368162, + "loss": 1.4443, + "step": 36881 + }, + { + "epoch": 0.4792647687052399, + "grad_norm": 0.4465820789337158, + "learning_rate": 0.00010416823717490481, + "loss": 1.2271, + "step": 36882 + }, + { + "epoch": 0.4792777632491558, + "grad_norm": 0.44737479090690613, + "learning_rate": 0.00010416563771299342, + "loss": 1.5571, + "step": 36883 + }, + { + "epoch": 0.4792907577930716, + "grad_norm": 0.40295544266700745, + "learning_rate": 0.00010416303825108203, + "loss": 1.4604, + "step": 36884 + }, + { + "epoch": 0.4793037523369875, + "grad_norm": 0.3811386227607727, + "learning_rate": 0.00010416043878917065, + "loss": 1.2811, + "step": 36885 + }, + { + "epoch": 0.47931674688090337, + "grad_norm": 0.2680179178714752, + "learning_rate": 0.00010415783932725926, + "loss": 1.1362, + "step": 36886 + }, + { + "epoch": 0.47932974142481927, + "grad_norm": 0.4625927805900574, + "learning_rate": 0.00010415523986534787, + "loss": 1.3885, + "step": 36887 + }, + { + "epoch": 0.4793427359687351, + "grad_norm": 0.503130316734314, + "learning_rate": 0.00010415264040343649, + "loss": 1.4347, + "step": 36888 + }, + { + "epoch": 0.479355730512651, + "grad_norm": 0.37750691175460815, + "learning_rate": 0.00010415004094152512, + "loss": 1.2633, + "step": 36889 + }, + { + "epoch": 0.47936872505656686, + "grad_norm": 0.4137214124202728, + "learning_rate": 0.00010414744147961372, + "loss": 1.2447, + "step": 36890 + }, + { + "epoch": 0.47938171960048276, + "grad_norm": 0.41304516792297363, + "learning_rate": 0.00010414484201770233, + "loss": 1.4316, + "step": 36891 + }, + { + "epoch": 0.4793947141443986, + "grad_norm": 0.45908215641975403, + "learning_rate": 0.00010414224255579094, + "loss": 1.442, + "step": 36892 + }, + { + "epoch": 0.4794077086883145, + "grad_norm": 0.4554549753665924, + "learning_rate": 0.00010413964309387958, + "loss": 1.3797, + "step": 36893 + }, + { + "epoch": 0.47942070323223035, + "grad_norm": 0.37162935733795166, + "learning_rate": 0.00010413704363196819, + "loss": 1.432, + "step": 36894 + }, + { + "epoch": 0.47943369777614625, + "grad_norm": 0.4456905424594879, + "learning_rate": 0.0001041344441700568, + "loss": 1.4346, + "step": 36895 + }, + { + "epoch": 0.4794466923200621, + "grad_norm": 0.42448943853378296, + "learning_rate": 0.00010413184470814541, + "loss": 1.5145, + "step": 36896 + }, + { + "epoch": 0.479459686863978, + "grad_norm": 0.27512335777282715, + "learning_rate": 0.00010412924524623404, + "loss": 1.2322, + "step": 36897 + }, + { + "epoch": 0.47947268140789384, + "grad_norm": 0.31826987862586975, + "learning_rate": 0.00010412664578432265, + "loss": 1.5515, + "step": 36898 + }, + { + "epoch": 0.47948567595180974, + "grad_norm": 0.43376055359840393, + "learning_rate": 0.00010412404632241126, + "loss": 1.4572, + "step": 36899 + }, + { + "epoch": 0.4794986704957256, + "grad_norm": 0.5189838409423828, + "learning_rate": 0.00010412144686049987, + "loss": 1.5409, + "step": 36900 + }, + { + "epoch": 0.4795116650396415, + "grad_norm": 0.3767385482788086, + "learning_rate": 0.00010411884739858851, + "loss": 1.3058, + "step": 36901 + }, + { + "epoch": 0.47952465958355733, + "grad_norm": 0.40498068928718567, + "learning_rate": 0.00010411624793667712, + "loss": 1.4339, + "step": 36902 + }, + { + "epoch": 0.47953765412747323, + "grad_norm": 0.3439054489135742, + "learning_rate": 0.00010411364847476572, + "loss": 1.4034, + "step": 36903 + }, + { + "epoch": 0.4795506486713891, + "grad_norm": 0.29294559359550476, + "learning_rate": 0.00010411104901285433, + "loss": 1.1773, + "step": 36904 + }, + { + "epoch": 0.479563643215305, + "grad_norm": 0.2612760066986084, + "learning_rate": 0.00010410844955094297, + "loss": 1.2663, + "step": 36905 + }, + { + "epoch": 0.4795766377592208, + "grad_norm": 0.4474465548992157, + "learning_rate": 0.00010410585008903158, + "loss": 1.4763, + "step": 36906 + }, + { + "epoch": 0.4795896323031367, + "grad_norm": 0.32873237133026123, + "learning_rate": 0.00010410325062712019, + "loss": 1.446, + "step": 36907 + }, + { + "epoch": 0.47960262684705257, + "grad_norm": 0.30595964193344116, + "learning_rate": 0.0001041006511652088, + "loss": 1.36, + "step": 36908 + }, + { + "epoch": 0.47961562139096847, + "grad_norm": 0.45038941502571106, + "learning_rate": 0.00010409805170329742, + "loss": 1.4713, + "step": 36909 + }, + { + "epoch": 0.4796286159348843, + "grad_norm": 0.4217996895313263, + "learning_rate": 0.00010409545224138603, + "loss": 1.2901, + "step": 36910 + }, + { + "epoch": 0.4796416104788002, + "grad_norm": 0.49533677101135254, + "learning_rate": 0.00010409285277947465, + "loss": 1.404, + "step": 36911 + }, + { + "epoch": 0.47965460502271606, + "grad_norm": 0.4637061655521393, + "learning_rate": 0.00010409025331756328, + "loss": 1.3639, + "step": 36912 + }, + { + "epoch": 0.47966759956663196, + "grad_norm": 0.42781180143356323, + "learning_rate": 0.0001040876538556519, + "loss": 1.3643, + "step": 36913 + }, + { + "epoch": 0.4796805941105478, + "grad_norm": 0.3007575571537018, + "learning_rate": 0.0001040850543937405, + "loss": 1.3189, + "step": 36914 + }, + { + "epoch": 0.4796935886544637, + "grad_norm": 0.2834354341030121, + "learning_rate": 0.0001040824549318291, + "loss": 1.2854, + "step": 36915 + }, + { + "epoch": 0.47970658319837955, + "grad_norm": 0.36999163031578064, + "learning_rate": 0.00010407985546991774, + "loss": 1.2525, + "step": 36916 + }, + { + "epoch": 0.47971957774229546, + "grad_norm": 0.5623327493667603, + "learning_rate": 0.00010407725600800635, + "loss": 1.4012, + "step": 36917 + }, + { + "epoch": 0.4797325722862113, + "grad_norm": 0.35479265451431274, + "learning_rate": 0.00010407465654609496, + "loss": 1.2478, + "step": 36918 + }, + { + "epoch": 0.4797455668301272, + "grad_norm": 0.4296862483024597, + "learning_rate": 0.00010407205708418357, + "loss": 1.457, + "step": 36919 + }, + { + "epoch": 0.47975856137404305, + "grad_norm": 0.49682939052581787, + "learning_rate": 0.0001040694576222722, + "loss": 1.408, + "step": 36920 + }, + { + "epoch": 0.47977155591795895, + "grad_norm": 0.40639010071754456, + "learning_rate": 0.00010406685816036081, + "loss": 1.4331, + "step": 36921 + }, + { + "epoch": 0.47978455046187485, + "grad_norm": 0.39165592193603516, + "learning_rate": 0.00010406425869844942, + "loss": 1.4263, + "step": 36922 + }, + { + "epoch": 0.4797975450057907, + "grad_norm": 0.41559869050979614, + "learning_rate": 0.00010406165923653803, + "loss": 1.4388, + "step": 36923 + }, + { + "epoch": 0.4798105395497066, + "grad_norm": 0.3431210517883301, + "learning_rate": 0.00010405905977462667, + "loss": 1.4256, + "step": 36924 + }, + { + "epoch": 0.47982353409362244, + "grad_norm": 0.39178362488746643, + "learning_rate": 0.00010405646031271528, + "loss": 1.4861, + "step": 36925 + }, + { + "epoch": 0.47983652863753834, + "grad_norm": 0.4565635919570923, + "learning_rate": 0.00010405386085080389, + "loss": 1.3268, + "step": 36926 + }, + { + "epoch": 0.4798495231814542, + "grad_norm": 0.4415455758571625, + "learning_rate": 0.0001040512613888925, + "loss": 1.5069, + "step": 36927 + }, + { + "epoch": 0.4798625177253701, + "grad_norm": 0.47685813903808594, + "learning_rate": 0.00010404866192698113, + "loss": 1.4058, + "step": 36928 + }, + { + "epoch": 0.47987551226928593, + "grad_norm": 0.41550666093826294, + "learning_rate": 0.00010404606246506974, + "loss": 1.3966, + "step": 36929 + }, + { + "epoch": 0.47988850681320183, + "grad_norm": 0.34026575088500977, + "learning_rate": 0.00010404346300315835, + "loss": 1.3131, + "step": 36930 + }, + { + "epoch": 0.4799015013571177, + "grad_norm": 0.4055057466030121, + "learning_rate": 0.00010404086354124696, + "loss": 1.4229, + "step": 36931 + }, + { + "epoch": 0.4799144959010336, + "grad_norm": 0.4502200484275818, + "learning_rate": 0.00010403826407933558, + "loss": 1.4216, + "step": 36932 + }, + { + "epoch": 0.4799274904449494, + "grad_norm": 0.4330025017261505, + "learning_rate": 0.0001040356646174242, + "loss": 1.2733, + "step": 36933 + }, + { + "epoch": 0.4799404849888653, + "grad_norm": 0.3005753457546234, + "learning_rate": 0.0001040330651555128, + "loss": 1.4774, + "step": 36934 + }, + { + "epoch": 0.47995347953278117, + "grad_norm": 0.41319289803504944, + "learning_rate": 0.00010403046569360142, + "loss": 1.3624, + "step": 36935 + }, + { + "epoch": 0.47996647407669707, + "grad_norm": 0.3968539535999298, + "learning_rate": 0.00010402786623169005, + "loss": 1.309, + "step": 36936 + }, + { + "epoch": 0.4799794686206129, + "grad_norm": 0.3321796655654907, + "learning_rate": 0.00010402526676977867, + "loss": 1.4588, + "step": 36937 + }, + { + "epoch": 0.4799924631645288, + "grad_norm": 0.38623952865600586, + "learning_rate": 0.00010402266730786728, + "loss": 1.3366, + "step": 36938 + }, + { + "epoch": 0.48000545770844466, + "grad_norm": 0.38911959528923035, + "learning_rate": 0.00010402006784595589, + "loss": 1.4242, + "step": 36939 + }, + { + "epoch": 0.48001845225236056, + "grad_norm": 0.3286307454109192, + "learning_rate": 0.00010401746838404451, + "loss": 1.3574, + "step": 36940 + }, + { + "epoch": 0.4800314467962764, + "grad_norm": 0.4136251211166382, + "learning_rate": 0.00010401486892213312, + "loss": 1.4438, + "step": 36941 + }, + { + "epoch": 0.4800444413401923, + "grad_norm": 0.33114486932754517, + "learning_rate": 0.00010401226946022173, + "loss": 1.2895, + "step": 36942 + }, + { + "epoch": 0.48005743588410815, + "grad_norm": 0.3806767165660858, + "learning_rate": 0.00010400966999831034, + "loss": 1.4772, + "step": 36943 + }, + { + "epoch": 0.48007043042802405, + "grad_norm": 0.34614482522010803, + "learning_rate": 0.00010400707053639898, + "loss": 1.3742, + "step": 36944 + }, + { + "epoch": 0.4800834249719399, + "grad_norm": 0.4848361909389496, + "learning_rate": 0.00010400447107448758, + "loss": 1.5499, + "step": 36945 + }, + { + "epoch": 0.4800964195158558, + "grad_norm": 0.3694782257080078, + "learning_rate": 0.00010400187161257619, + "loss": 1.331, + "step": 36946 + }, + { + "epoch": 0.48010941405977164, + "grad_norm": 0.31021466851234436, + "learning_rate": 0.00010399927215066483, + "loss": 1.2764, + "step": 36947 + }, + { + "epoch": 0.48012240860368754, + "grad_norm": 0.4170472025871277, + "learning_rate": 0.00010399667268875344, + "loss": 1.3761, + "step": 36948 + }, + { + "epoch": 0.4801354031476034, + "grad_norm": 0.5247902870178223, + "learning_rate": 0.00010399407322684205, + "loss": 1.4719, + "step": 36949 + }, + { + "epoch": 0.4801483976915193, + "grad_norm": 0.40657395124435425, + "learning_rate": 0.00010399147376493066, + "loss": 1.5526, + "step": 36950 + }, + { + "epoch": 0.48016139223543514, + "grad_norm": 0.3284232020378113, + "learning_rate": 0.00010398887430301929, + "loss": 1.272, + "step": 36951 + }, + { + "epoch": 0.48017438677935104, + "grad_norm": 0.34033140540122986, + "learning_rate": 0.0001039862748411079, + "loss": 1.299, + "step": 36952 + }, + { + "epoch": 0.4801873813232669, + "grad_norm": 0.34505438804626465, + "learning_rate": 0.00010398367537919651, + "loss": 1.198, + "step": 36953 + }, + { + "epoch": 0.4802003758671828, + "grad_norm": 0.32588204741477966, + "learning_rate": 0.00010398107591728512, + "loss": 1.2629, + "step": 36954 + }, + { + "epoch": 0.48021337041109863, + "grad_norm": 0.4872061312198639, + "learning_rate": 0.00010397847645537376, + "loss": 1.4692, + "step": 36955 + }, + { + "epoch": 0.48022636495501453, + "grad_norm": 0.38380318880081177, + "learning_rate": 0.00010397587699346237, + "loss": 1.2054, + "step": 36956 + }, + { + "epoch": 0.4802393594989304, + "grad_norm": 0.4731670022010803, + "learning_rate": 0.00010397327753155097, + "loss": 1.3808, + "step": 36957 + }, + { + "epoch": 0.4802523540428463, + "grad_norm": 0.4408159852027893, + "learning_rate": 0.00010397067806963958, + "loss": 1.331, + "step": 36958 + }, + { + "epoch": 0.4802653485867621, + "grad_norm": 0.4369319975376129, + "learning_rate": 0.00010396807860772821, + "loss": 1.4748, + "step": 36959 + }, + { + "epoch": 0.480278343130678, + "grad_norm": 0.4115489721298218, + "learning_rate": 0.00010396547914581683, + "loss": 1.435, + "step": 36960 + }, + { + "epoch": 0.48029133767459387, + "grad_norm": 0.434533029794693, + "learning_rate": 0.00010396287968390544, + "loss": 1.588, + "step": 36961 + }, + { + "epoch": 0.48030433221850977, + "grad_norm": 0.42437365651130676, + "learning_rate": 0.00010396028022199405, + "loss": 1.4981, + "step": 36962 + }, + { + "epoch": 0.4803173267624256, + "grad_norm": 0.4613930583000183, + "learning_rate": 0.00010395768076008267, + "loss": 1.338, + "step": 36963 + }, + { + "epoch": 0.4803303213063415, + "grad_norm": 0.5235838890075684, + "learning_rate": 0.00010395508129817128, + "loss": 1.3152, + "step": 36964 + }, + { + "epoch": 0.48034331585025736, + "grad_norm": 0.48191890120506287, + "learning_rate": 0.0001039524818362599, + "loss": 1.4145, + "step": 36965 + }, + { + "epoch": 0.48035631039417326, + "grad_norm": 0.40093857049942017, + "learning_rate": 0.0001039498823743485, + "loss": 1.3925, + "step": 36966 + }, + { + "epoch": 0.4803693049380891, + "grad_norm": 0.42404696345329285, + "learning_rate": 0.00010394728291243714, + "loss": 1.4142, + "step": 36967 + }, + { + "epoch": 0.480382299482005, + "grad_norm": 0.39218422770500183, + "learning_rate": 0.00010394468345052575, + "loss": 1.2273, + "step": 36968 + }, + { + "epoch": 0.48039529402592085, + "grad_norm": 0.4282079339027405, + "learning_rate": 0.00010394208398861436, + "loss": 1.3739, + "step": 36969 + }, + { + "epoch": 0.48040828856983675, + "grad_norm": 0.37125587463378906, + "learning_rate": 0.00010393948452670296, + "loss": 1.0972, + "step": 36970 + }, + { + "epoch": 0.4804212831137526, + "grad_norm": 0.39503636956214905, + "learning_rate": 0.0001039368850647916, + "loss": 1.4776, + "step": 36971 + }, + { + "epoch": 0.4804342776576685, + "grad_norm": 0.3474675118923187, + "learning_rate": 0.00010393428560288021, + "loss": 1.4618, + "step": 36972 + }, + { + "epoch": 0.48044727220158434, + "grad_norm": 0.31738564372062683, + "learning_rate": 0.00010393168614096882, + "loss": 1.2595, + "step": 36973 + }, + { + "epoch": 0.48046026674550024, + "grad_norm": 0.3933612108230591, + "learning_rate": 0.00010392908667905743, + "loss": 1.4622, + "step": 36974 + }, + { + "epoch": 0.4804732612894161, + "grad_norm": 0.4312123656272888, + "learning_rate": 0.00010392648721714606, + "loss": 1.4021, + "step": 36975 + }, + { + "epoch": 0.480486255833332, + "grad_norm": 0.36490944027900696, + "learning_rate": 0.00010392388775523467, + "loss": 1.3627, + "step": 36976 + }, + { + "epoch": 0.48049925037724783, + "grad_norm": 0.45459774136543274, + "learning_rate": 0.00010392128829332328, + "loss": 1.4745, + "step": 36977 + }, + { + "epoch": 0.48051224492116373, + "grad_norm": 0.3800516426563263, + "learning_rate": 0.00010391868883141189, + "loss": 1.4159, + "step": 36978 + }, + { + "epoch": 0.4805252394650796, + "grad_norm": 0.4457409977912903, + "learning_rate": 0.00010391608936950053, + "loss": 1.379, + "step": 36979 + }, + { + "epoch": 0.4805382340089955, + "grad_norm": 0.4562298059463501, + "learning_rate": 0.00010391348990758914, + "loss": 1.276, + "step": 36980 + }, + { + "epoch": 0.4805512285529113, + "grad_norm": 0.3313678205013275, + "learning_rate": 0.00010391089044567775, + "loss": 1.4453, + "step": 36981 + }, + { + "epoch": 0.4805642230968272, + "grad_norm": 0.47662195563316345, + "learning_rate": 0.00010390829098376636, + "loss": 1.6202, + "step": 36982 + }, + { + "epoch": 0.48057721764074307, + "grad_norm": 0.4243510067462921, + "learning_rate": 0.00010390569152185499, + "loss": 1.4882, + "step": 36983 + }, + { + "epoch": 0.48059021218465897, + "grad_norm": 0.41490447521209717, + "learning_rate": 0.0001039030920599436, + "loss": 1.5299, + "step": 36984 + }, + { + "epoch": 0.4806032067285748, + "grad_norm": 0.35059666633605957, + "learning_rate": 0.00010390049259803221, + "loss": 1.2663, + "step": 36985 + }, + { + "epoch": 0.4806162012724907, + "grad_norm": 0.33567747473716736, + "learning_rate": 0.00010389789313612085, + "loss": 1.212, + "step": 36986 + }, + { + "epoch": 0.48062919581640656, + "grad_norm": 0.42701902985572815, + "learning_rate": 0.00010389529367420944, + "loss": 1.3989, + "step": 36987 + }, + { + "epoch": 0.48064219036032246, + "grad_norm": 0.43308764696121216, + "learning_rate": 0.00010389269421229805, + "loss": 1.3765, + "step": 36988 + }, + { + "epoch": 0.4806551849042383, + "grad_norm": 0.32546907663345337, + "learning_rate": 0.00010389009475038666, + "loss": 1.3773, + "step": 36989 + }, + { + "epoch": 0.4806681794481542, + "grad_norm": 0.4054744839668274, + "learning_rate": 0.0001038874952884753, + "loss": 1.3867, + "step": 36990 + }, + { + "epoch": 0.48068117399207005, + "grad_norm": 0.4095016121864319, + "learning_rate": 0.00010388489582656391, + "loss": 1.4368, + "step": 36991 + }, + { + "epoch": 0.48069416853598596, + "grad_norm": 0.37877586483955383, + "learning_rate": 0.00010388229636465252, + "loss": 1.5284, + "step": 36992 + }, + { + "epoch": 0.4807071630799018, + "grad_norm": 0.4957578480243683, + "learning_rate": 0.00010387969690274114, + "loss": 1.2913, + "step": 36993 + }, + { + "epoch": 0.4807201576238177, + "grad_norm": 0.38614529371261597, + "learning_rate": 0.00010387709744082976, + "loss": 1.3449, + "step": 36994 + }, + { + "epoch": 0.48073315216773355, + "grad_norm": 0.4471305012702942, + "learning_rate": 0.00010387449797891837, + "loss": 1.2938, + "step": 36995 + }, + { + "epoch": 0.48074614671164945, + "grad_norm": 0.41543376445770264, + "learning_rate": 0.00010387189851700698, + "loss": 1.4888, + "step": 36996 + }, + { + "epoch": 0.4807591412555653, + "grad_norm": 0.34518080949783325, + "learning_rate": 0.00010386929905509559, + "loss": 1.4321, + "step": 36997 + }, + { + "epoch": 0.4807721357994812, + "grad_norm": 0.3992246687412262, + "learning_rate": 0.00010386669959318423, + "loss": 1.4655, + "step": 36998 + }, + { + "epoch": 0.4807851303433971, + "grad_norm": 0.38836345076560974, + "learning_rate": 0.00010386410013127283, + "loss": 1.5897, + "step": 36999 + }, + { + "epoch": 0.48079812488731294, + "grad_norm": 0.45370543003082275, + "learning_rate": 0.00010386150066936144, + "loss": 1.4741, + "step": 37000 + }, + { + "epoch": 0.48081111943122884, + "grad_norm": 0.442688912153244, + "learning_rate": 0.00010385890120745005, + "loss": 1.4485, + "step": 37001 + }, + { + "epoch": 0.4808241139751447, + "grad_norm": 0.3650788962841034, + "learning_rate": 0.00010385630174553869, + "loss": 1.2932, + "step": 37002 + }, + { + "epoch": 0.4808371085190606, + "grad_norm": 0.3345593214035034, + "learning_rate": 0.0001038537022836273, + "loss": 1.4152, + "step": 37003 + }, + { + "epoch": 0.48085010306297643, + "grad_norm": 0.45666927099227905, + "learning_rate": 0.00010385110282171591, + "loss": 1.3923, + "step": 37004 + }, + { + "epoch": 0.48086309760689233, + "grad_norm": 0.34880349040031433, + "learning_rate": 0.00010384850335980452, + "loss": 1.359, + "step": 37005 + }, + { + "epoch": 0.4808760921508082, + "grad_norm": 0.4138614237308502, + "learning_rate": 0.00010384590389789315, + "loss": 1.3376, + "step": 37006 + }, + { + "epoch": 0.4808890866947241, + "grad_norm": 0.4633117914199829, + "learning_rate": 0.00010384330443598176, + "loss": 1.4949, + "step": 37007 + }, + { + "epoch": 0.4809020812386399, + "grad_norm": 0.3345903158187866, + "learning_rate": 0.00010384070497407037, + "loss": 1.3897, + "step": 37008 + }, + { + "epoch": 0.4809150757825558, + "grad_norm": 0.4556865692138672, + "learning_rate": 0.00010383810551215898, + "loss": 1.4332, + "step": 37009 + }, + { + "epoch": 0.48092807032647167, + "grad_norm": 0.41012030839920044, + "learning_rate": 0.00010383550605024762, + "loss": 1.3258, + "step": 37010 + }, + { + "epoch": 0.48094106487038757, + "grad_norm": 0.411729097366333, + "learning_rate": 0.00010383290658833623, + "loss": 1.5882, + "step": 37011 + }, + { + "epoch": 0.4809540594143034, + "grad_norm": 0.3972322642803192, + "learning_rate": 0.00010383030712642482, + "loss": 1.2105, + "step": 37012 + }, + { + "epoch": 0.4809670539582193, + "grad_norm": 0.3828829824924469, + "learning_rate": 0.00010382770766451344, + "loss": 1.3966, + "step": 37013 + }, + { + "epoch": 0.48098004850213516, + "grad_norm": 0.33546182513237, + "learning_rate": 0.00010382510820260207, + "loss": 1.3979, + "step": 37014 + }, + { + "epoch": 0.48099304304605106, + "grad_norm": 0.4454078674316406, + "learning_rate": 0.00010382250874069068, + "loss": 1.2634, + "step": 37015 + }, + { + "epoch": 0.4810060375899669, + "grad_norm": 0.375124990940094, + "learning_rate": 0.0001038199092787793, + "loss": 1.2616, + "step": 37016 + }, + { + "epoch": 0.4810190321338828, + "grad_norm": 0.30314013361930847, + "learning_rate": 0.0001038173098168679, + "loss": 1.3906, + "step": 37017 + }, + { + "epoch": 0.48103202667779865, + "grad_norm": 0.4833199977874756, + "learning_rate": 0.00010381471035495653, + "loss": 1.6719, + "step": 37018 + }, + { + "epoch": 0.48104502122171455, + "grad_norm": 0.2718501091003418, + "learning_rate": 0.00010381211089304514, + "loss": 1.3718, + "step": 37019 + }, + { + "epoch": 0.4810580157656304, + "grad_norm": 0.3884361684322357, + "learning_rate": 0.00010380951143113375, + "loss": 1.3511, + "step": 37020 + }, + { + "epoch": 0.4810710103095463, + "grad_norm": 0.3924647271633148, + "learning_rate": 0.00010380691196922239, + "loss": 1.194, + "step": 37021 + }, + { + "epoch": 0.48108400485346214, + "grad_norm": 0.44883644580841064, + "learning_rate": 0.000103804312507311, + "loss": 1.249, + "step": 37022 + }, + { + "epoch": 0.48109699939737804, + "grad_norm": 0.3738546669483185, + "learning_rate": 0.00010380171304539961, + "loss": 1.2924, + "step": 37023 + }, + { + "epoch": 0.4811099939412939, + "grad_norm": 0.43245530128479004, + "learning_rate": 0.00010379911358348822, + "loss": 1.3289, + "step": 37024 + }, + { + "epoch": 0.4811229884852098, + "grad_norm": 0.41660571098327637, + "learning_rate": 0.00010379651412157685, + "loss": 1.3814, + "step": 37025 + }, + { + "epoch": 0.48113598302912564, + "grad_norm": 0.4113090932369232, + "learning_rate": 0.00010379391465966546, + "loss": 1.4656, + "step": 37026 + }, + { + "epoch": 0.48114897757304154, + "grad_norm": 0.40604138374328613, + "learning_rate": 0.00010379131519775407, + "loss": 1.4552, + "step": 37027 + }, + { + "epoch": 0.4811619721169574, + "grad_norm": 0.3968238830566406, + "learning_rate": 0.00010378871573584268, + "loss": 1.4294, + "step": 37028 + }, + { + "epoch": 0.4811749666608733, + "grad_norm": 0.4388469159603119, + "learning_rate": 0.0001037861162739313, + "loss": 1.6759, + "step": 37029 + }, + { + "epoch": 0.4811879612047891, + "grad_norm": 0.46258074045181274, + "learning_rate": 0.00010378351681201992, + "loss": 1.5526, + "step": 37030 + }, + { + "epoch": 0.48120095574870503, + "grad_norm": 0.490599125623703, + "learning_rate": 0.00010378091735010853, + "loss": 1.5859, + "step": 37031 + }, + { + "epoch": 0.4812139502926209, + "grad_norm": 0.4031514823436737, + "learning_rate": 0.00010377831788819714, + "loss": 1.443, + "step": 37032 + }, + { + "epoch": 0.4812269448365368, + "grad_norm": 0.39933234453201294, + "learning_rate": 0.00010377571842628578, + "loss": 1.4039, + "step": 37033 + }, + { + "epoch": 0.4812399393804526, + "grad_norm": 0.3389892578125, + "learning_rate": 0.00010377311896437439, + "loss": 1.2539, + "step": 37034 + }, + { + "epoch": 0.4812529339243685, + "grad_norm": 0.4143637418746948, + "learning_rate": 0.000103770519502463, + "loss": 1.3954, + "step": 37035 + }, + { + "epoch": 0.48126592846828437, + "grad_norm": 0.42558053135871887, + "learning_rate": 0.00010376792004055161, + "loss": 1.3849, + "step": 37036 + }, + { + "epoch": 0.48127892301220027, + "grad_norm": 0.2608511447906494, + "learning_rate": 0.00010376532057864023, + "loss": 1.2232, + "step": 37037 + }, + { + "epoch": 0.4812919175561161, + "grad_norm": 0.3625309467315674, + "learning_rate": 0.00010376272111672884, + "loss": 1.399, + "step": 37038 + }, + { + "epoch": 0.481304912100032, + "grad_norm": 0.3443760871887207, + "learning_rate": 0.00010376012165481745, + "loss": 1.302, + "step": 37039 + }, + { + "epoch": 0.48131790664394786, + "grad_norm": 0.40133070945739746, + "learning_rate": 0.00010375752219290607, + "loss": 1.43, + "step": 37040 + }, + { + "epoch": 0.48133090118786376, + "grad_norm": 0.41258370876312256, + "learning_rate": 0.00010375492273099469, + "loss": 1.5163, + "step": 37041 + }, + { + "epoch": 0.4813438957317796, + "grad_norm": 0.40800005197525024, + "learning_rate": 0.0001037523232690833, + "loss": 1.5115, + "step": 37042 + }, + { + "epoch": 0.4813568902756955, + "grad_norm": 0.24220822751522064, + "learning_rate": 0.00010374972380717191, + "loss": 1.248, + "step": 37043 + }, + { + "epoch": 0.48136988481961135, + "grad_norm": 0.49225279688835144, + "learning_rate": 0.00010374712434526052, + "loss": 1.732, + "step": 37044 + }, + { + "epoch": 0.48138287936352725, + "grad_norm": 0.35616499185562134, + "learning_rate": 0.00010374452488334916, + "loss": 1.2149, + "step": 37045 + }, + { + "epoch": 0.4813958739074431, + "grad_norm": 0.3266451954841614, + "learning_rate": 0.00010374192542143777, + "loss": 1.2898, + "step": 37046 + }, + { + "epoch": 0.481408868451359, + "grad_norm": 0.43068891763687134, + "learning_rate": 0.00010373932595952638, + "loss": 1.2392, + "step": 37047 + }, + { + "epoch": 0.48142186299527484, + "grad_norm": 0.3272014260292053, + "learning_rate": 0.000103736726497615, + "loss": 1.365, + "step": 37048 + }, + { + "epoch": 0.48143485753919074, + "grad_norm": 0.37535202503204346, + "learning_rate": 0.00010373412703570362, + "loss": 1.5827, + "step": 37049 + }, + { + "epoch": 0.4814478520831066, + "grad_norm": 0.5277919769287109, + "learning_rate": 0.00010373152757379223, + "loss": 1.5126, + "step": 37050 + }, + { + "epoch": 0.4814608466270225, + "grad_norm": 0.42219579219818115, + "learning_rate": 0.00010372892811188084, + "loss": 1.3183, + "step": 37051 + }, + { + "epoch": 0.48147384117093833, + "grad_norm": 0.38642510771751404, + "learning_rate": 0.00010372632864996945, + "loss": 1.6148, + "step": 37052 + }, + { + "epoch": 0.48148683571485423, + "grad_norm": 0.3439028263092041, + "learning_rate": 0.00010372372918805809, + "loss": 1.2162, + "step": 37053 + }, + { + "epoch": 0.4814998302587701, + "grad_norm": 0.45564141869544983, + "learning_rate": 0.00010372112972614669, + "loss": 1.1664, + "step": 37054 + }, + { + "epoch": 0.481512824802686, + "grad_norm": 0.331000417470932, + "learning_rate": 0.0001037185302642353, + "loss": 1.2467, + "step": 37055 + }, + { + "epoch": 0.4815258193466018, + "grad_norm": 0.4542824625968933, + "learning_rate": 0.00010371593080232391, + "loss": 1.4139, + "step": 37056 + }, + { + "epoch": 0.4815388138905177, + "grad_norm": 0.41777390241622925, + "learning_rate": 0.00010371333134041255, + "loss": 1.4423, + "step": 37057 + }, + { + "epoch": 0.48155180843443357, + "grad_norm": 0.4126218259334564, + "learning_rate": 0.00010371073187850116, + "loss": 1.3907, + "step": 37058 + }, + { + "epoch": 0.48156480297834947, + "grad_norm": 0.4334541857242584, + "learning_rate": 0.00010370813241658977, + "loss": 1.4799, + "step": 37059 + }, + { + "epoch": 0.4815777975222653, + "grad_norm": 0.3993145525455475, + "learning_rate": 0.00010370553295467839, + "loss": 1.2806, + "step": 37060 + }, + { + "epoch": 0.4815907920661812, + "grad_norm": 0.4233919382095337, + "learning_rate": 0.000103702933492767, + "loss": 1.4537, + "step": 37061 + }, + { + "epoch": 0.48160378661009706, + "grad_norm": 0.37088096141815186, + "learning_rate": 0.00010370033403085561, + "loss": 1.348, + "step": 37062 + }, + { + "epoch": 0.48161678115401296, + "grad_norm": 0.4031206965446472, + "learning_rate": 0.00010369773456894423, + "loss": 1.5072, + "step": 37063 + }, + { + "epoch": 0.4816297756979288, + "grad_norm": 0.4621102213859558, + "learning_rate": 0.00010369513510703286, + "loss": 1.5552, + "step": 37064 + }, + { + "epoch": 0.4816427702418447, + "grad_norm": 0.43671950697898865, + "learning_rate": 0.00010369253564512147, + "loss": 1.2471, + "step": 37065 + }, + { + "epoch": 0.48165576478576055, + "grad_norm": 0.5634618997573853, + "learning_rate": 0.00010368993618321009, + "loss": 1.443, + "step": 37066 + }, + { + "epoch": 0.48166875932967645, + "grad_norm": 0.35647347569465637, + "learning_rate": 0.00010368733672129868, + "loss": 1.503, + "step": 37067 + }, + { + "epoch": 0.4816817538735923, + "grad_norm": 0.40893930196762085, + "learning_rate": 0.00010368473725938732, + "loss": 1.3356, + "step": 37068 + }, + { + "epoch": 0.4816947484175082, + "grad_norm": 0.4411131739616394, + "learning_rate": 0.00010368213779747593, + "loss": 1.4587, + "step": 37069 + }, + { + "epoch": 0.48170774296142405, + "grad_norm": 0.508825957775116, + "learning_rate": 0.00010367953833556454, + "loss": 1.4409, + "step": 37070 + }, + { + "epoch": 0.48172073750533995, + "grad_norm": 0.4320303201675415, + "learning_rate": 0.00010367693887365315, + "loss": 1.2929, + "step": 37071 + }, + { + "epoch": 0.4817337320492558, + "grad_norm": 0.37547269463539124, + "learning_rate": 0.00010367433941174178, + "loss": 1.5674, + "step": 37072 + }, + { + "epoch": 0.4817467265931717, + "grad_norm": 0.5642324686050415, + "learning_rate": 0.00010367173994983039, + "loss": 1.4954, + "step": 37073 + }, + { + "epoch": 0.48175972113708754, + "grad_norm": 0.406318724155426, + "learning_rate": 0.000103669140487919, + "loss": 1.4271, + "step": 37074 + }, + { + "epoch": 0.48177271568100344, + "grad_norm": 0.43324995040893555, + "learning_rate": 0.00010366654102600761, + "loss": 1.2625, + "step": 37075 + }, + { + "epoch": 0.48178571022491934, + "grad_norm": 0.32785969972610474, + "learning_rate": 0.00010366394156409625, + "loss": 1.1535, + "step": 37076 + }, + { + "epoch": 0.4817987047688352, + "grad_norm": 0.5001927018165588, + "learning_rate": 0.00010366134210218486, + "loss": 1.5432, + "step": 37077 + }, + { + "epoch": 0.4818116993127511, + "grad_norm": 0.37814438343048096, + "learning_rate": 0.00010365874264027347, + "loss": 1.327, + "step": 37078 + }, + { + "epoch": 0.48182469385666693, + "grad_norm": 0.41916894912719727, + "learning_rate": 0.00010365614317836207, + "loss": 1.218, + "step": 37079 + }, + { + "epoch": 0.48183768840058283, + "grad_norm": 0.40626901388168335, + "learning_rate": 0.0001036535437164507, + "loss": 1.3679, + "step": 37080 + }, + { + "epoch": 0.4818506829444987, + "grad_norm": 0.43168550729751587, + "learning_rate": 0.00010365094425453932, + "loss": 1.4426, + "step": 37081 + }, + { + "epoch": 0.4818636774884146, + "grad_norm": 0.4354667067527771, + "learning_rate": 0.00010364834479262793, + "loss": 1.482, + "step": 37082 + }, + { + "epoch": 0.4818766720323304, + "grad_norm": 0.34500545263290405, + "learning_rate": 0.00010364574533071654, + "loss": 1.4075, + "step": 37083 + }, + { + "epoch": 0.4818896665762463, + "grad_norm": 0.8157206177711487, + "learning_rate": 0.00010364314586880516, + "loss": 1.2444, + "step": 37084 + }, + { + "epoch": 0.48190266112016217, + "grad_norm": 0.334079384803772, + "learning_rate": 0.00010364054640689377, + "loss": 1.3834, + "step": 37085 + }, + { + "epoch": 0.48191565566407807, + "grad_norm": 0.3525972068309784, + "learning_rate": 0.00010363794694498239, + "loss": 1.3397, + "step": 37086 + }, + { + "epoch": 0.4819286502079939, + "grad_norm": 0.42938733100891113, + "learning_rate": 0.000103635347483071, + "loss": 1.3493, + "step": 37087 + }, + { + "epoch": 0.4819416447519098, + "grad_norm": 0.35131803154945374, + "learning_rate": 0.00010363274802115963, + "loss": 1.3878, + "step": 37088 + }, + { + "epoch": 0.48195463929582566, + "grad_norm": 0.43994802236557007, + "learning_rate": 0.00010363014855924825, + "loss": 1.2969, + "step": 37089 + }, + { + "epoch": 0.48196763383974156, + "grad_norm": 0.20950429141521454, + "learning_rate": 0.00010362754909733686, + "loss": 1.1947, + "step": 37090 + }, + { + "epoch": 0.4819806283836574, + "grad_norm": 0.47065064311027527, + "learning_rate": 0.00010362494963542547, + "loss": 1.3354, + "step": 37091 + }, + { + "epoch": 0.4819936229275733, + "grad_norm": 0.2952079772949219, + "learning_rate": 0.00010362235017351409, + "loss": 1.5077, + "step": 37092 + }, + { + "epoch": 0.48200661747148915, + "grad_norm": 0.4206715524196625, + "learning_rate": 0.0001036197507116027, + "loss": 1.3999, + "step": 37093 + }, + { + "epoch": 0.48201961201540505, + "grad_norm": 0.4758639633655548, + "learning_rate": 0.00010361715124969131, + "loss": 1.4574, + "step": 37094 + }, + { + "epoch": 0.4820326065593209, + "grad_norm": 0.3254123032093048, + "learning_rate": 0.00010361455178777995, + "loss": 1.4146, + "step": 37095 + }, + { + "epoch": 0.4820456011032368, + "grad_norm": 0.3778538703918457, + "learning_rate": 0.00010361195232586855, + "loss": 1.3906, + "step": 37096 + }, + { + "epoch": 0.48205859564715264, + "grad_norm": 0.4666554629802704, + "learning_rate": 0.00010360935286395716, + "loss": 1.3955, + "step": 37097 + }, + { + "epoch": 0.48207159019106854, + "grad_norm": 0.424966424703598, + "learning_rate": 0.00010360675340204577, + "loss": 1.2918, + "step": 37098 + }, + { + "epoch": 0.4820845847349844, + "grad_norm": 0.4617466628551483, + "learning_rate": 0.00010360415394013441, + "loss": 1.3452, + "step": 37099 + }, + { + "epoch": 0.4820975792789003, + "grad_norm": 0.413888543844223, + "learning_rate": 0.00010360155447822302, + "loss": 1.3404, + "step": 37100 + }, + { + "epoch": 0.48211057382281614, + "grad_norm": 0.34735554456710815, + "learning_rate": 0.00010359895501631163, + "loss": 1.3954, + "step": 37101 + }, + { + "epoch": 0.48212356836673204, + "grad_norm": 0.40625226497650146, + "learning_rate": 0.00010359635555440024, + "loss": 1.4646, + "step": 37102 + }, + { + "epoch": 0.4821365629106479, + "grad_norm": 0.4119158387184143, + "learning_rate": 0.00010359375609248887, + "loss": 1.2361, + "step": 37103 + }, + { + "epoch": 0.4821495574545638, + "grad_norm": 0.3501068949699402, + "learning_rate": 0.00010359115663057748, + "loss": 1.4238, + "step": 37104 + }, + { + "epoch": 0.4821625519984796, + "grad_norm": 0.36393558979034424, + "learning_rate": 0.00010358855716866609, + "loss": 1.2528, + "step": 37105 + }, + { + "epoch": 0.48217554654239553, + "grad_norm": 0.47136953473091125, + "learning_rate": 0.0001035859577067547, + "loss": 1.194, + "step": 37106 + }, + { + "epoch": 0.4821885410863114, + "grad_norm": 0.4365573525428772, + "learning_rate": 0.00010358335824484334, + "loss": 1.4229, + "step": 37107 + }, + { + "epoch": 0.4822015356302273, + "grad_norm": 0.4839194118976593, + "learning_rate": 0.00010358075878293195, + "loss": 1.4068, + "step": 37108 + }, + { + "epoch": 0.4822145301741431, + "grad_norm": 0.4345313012599945, + "learning_rate": 0.00010357815932102055, + "loss": 1.4235, + "step": 37109 + }, + { + "epoch": 0.482227524718059, + "grad_norm": 0.4125238060951233, + "learning_rate": 0.00010357555985910916, + "loss": 1.2842, + "step": 37110 + }, + { + "epoch": 0.48224051926197486, + "grad_norm": 0.457811564207077, + "learning_rate": 0.0001035729603971978, + "loss": 1.4778, + "step": 37111 + }, + { + "epoch": 0.48225351380589077, + "grad_norm": 0.49580860137939453, + "learning_rate": 0.0001035703609352864, + "loss": 1.5206, + "step": 37112 + }, + { + "epoch": 0.4822665083498066, + "grad_norm": 0.42422953248023987, + "learning_rate": 0.00010356776147337502, + "loss": 1.4659, + "step": 37113 + }, + { + "epoch": 0.4822795028937225, + "grad_norm": 0.4670541286468506, + "learning_rate": 0.00010356516201146363, + "loss": 1.5048, + "step": 37114 + }, + { + "epoch": 0.48229249743763836, + "grad_norm": 0.5135405659675598, + "learning_rate": 0.00010356256254955225, + "loss": 1.3222, + "step": 37115 + }, + { + "epoch": 0.48230549198155426, + "grad_norm": 0.45545876026153564, + "learning_rate": 0.00010355996308764086, + "loss": 1.4475, + "step": 37116 + }, + { + "epoch": 0.4823184865254701, + "grad_norm": 0.39179643988609314, + "learning_rate": 0.00010355736362572947, + "loss": 1.383, + "step": 37117 + }, + { + "epoch": 0.482331481069386, + "grad_norm": 0.46913668513298035, + "learning_rate": 0.00010355476416381808, + "loss": 1.48, + "step": 37118 + }, + { + "epoch": 0.48234447561330185, + "grad_norm": 0.32873645424842834, + "learning_rate": 0.00010355216470190672, + "loss": 1.4631, + "step": 37119 + }, + { + "epoch": 0.48235747015721775, + "grad_norm": 0.3252403736114502, + "learning_rate": 0.00010354956523999533, + "loss": 1.329, + "step": 37120 + }, + { + "epoch": 0.4823704647011336, + "grad_norm": 0.4843003451824188, + "learning_rate": 0.00010354696577808393, + "loss": 1.2881, + "step": 37121 + }, + { + "epoch": 0.4823834592450495, + "grad_norm": 0.3669763207435608, + "learning_rate": 0.00010354436631617254, + "loss": 1.3835, + "step": 37122 + }, + { + "epoch": 0.48239645378896534, + "grad_norm": 0.4133894741535187, + "learning_rate": 0.00010354176685426118, + "loss": 1.2308, + "step": 37123 + }, + { + "epoch": 0.48240944833288124, + "grad_norm": 0.40545177459716797, + "learning_rate": 0.00010353916739234979, + "loss": 1.3157, + "step": 37124 + }, + { + "epoch": 0.4824224428767971, + "grad_norm": 0.4551369845867157, + "learning_rate": 0.0001035365679304384, + "loss": 1.4425, + "step": 37125 + }, + { + "epoch": 0.482435437420713, + "grad_norm": 0.447760671377182, + "learning_rate": 0.00010353396846852701, + "loss": 1.5473, + "step": 37126 + }, + { + "epoch": 0.48244843196462883, + "grad_norm": 0.453803151845932, + "learning_rate": 0.00010353136900661564, + "loss": 1.6073, + "step": 37127 + }, + { + "epoch": 0.48246142650854473, + "grad_norm": 0.4930380880832672, + "learning_rate": 0.00010352876954470425, + "loss": 1.4331, + "step": 37128 + }, + { + "epoch": 0.4824744210524606, + "grad_norm": 0.3659795820713043, + "learning_rate": 0.00010352617008279286, + "loss": 1.4077, + "step": 37129 + }, + { + "epoch": 0.4824874155963765, + "grad_norm": 0.40258270502090454, + "learning_rate": 0.00010352357062088147, + "loss": 1.2885, + "step": 37130 + }, + { + "epoch": 0.4825004101402923, + "grad_norm": 0.4075701832771301, + "learning_rate": 0.00010352097115897011, + "loss": 1.4023, + "step": 37131 + }, + { + "epoch": 0.4825134046842082, + "grad_norm": 0.3460240364074707, + "learning_rate": 0.00010351837169705872, + "loss": 1.4505, + "step": 37132 + }, + { + "epoch": 0.48252639922812407, + "grad_norm": 0.39133960008621216, + "learning_rate": 0.00010351577223514733, + "loss": 1.4253, + "step": 37133 + }, + { + "epoch": 0.48253939377203997, + "grad_norm": 0.3858380615711212, + "learning_rate": 0.00010351317277323595, + "loss": 1.3297, + "step": 37134 + }, + { + "epoch": 0.4825523883159558, + "grad_norm": 0.4109053313732147, + "learning_rate": 0.00010351057331132457, + "loss": 1.2859, + "step": 37135 + }, + { + "epoch": 0.4825653828598717, + "grad_norm": 0.3750620186328888, + "learning_rate": 0.00010350797384941318, + "loss": 1.4771, + "step": 37136 + }, + { + "epoch": 0.48257837740378756, + "grad_norm": 0.3399812579154968, + "learning_rate": 0.00010350537438750179, + "loss": 1.0581, + "step": 37137 + }, + { + "epoch": 0.48259137194770346, + "grad_norm": 0.41491013765335083, + "learning_rate": 0.00010350277492559041, + "loss": 1.5068, + "step": 37138 + }, + { + "epoch": 0.4826043664916193, + "grad_norm": 0.35757890343666077, + "learning_rate": 0.00010350017546367902, + "loss": 1.5266, + "step": 37139 + }, + { + "epoch": 0.4826173610355352, + "grad_norm": 0.43575409054756165, + "learning_rate": 0.00010349757600176763, + "loss": 1.4045, + "step": 37140 + }, + { + "epoch": 0.48263035557945105, + "grad_norm": 0.2864450216293335, + "learning_rate": 0.00010349497653985624, + "loss": 1.2481, + "step": 37141 + }, + { + "epoch": 0.48264335012336695, + "grad_norm": 0.4132607877254486, + "learning_rate": 0.00010349237707794488, + "loss": 1.5628, + "step": 37142 + }, + { + "epoch": 0.4826563446672828, + "grad_norm": 0.37440818548202515, + "learning_rate": 0.0001034897776160335, + "loss": 1.2088, + "step": 37143 + }, + { + "epoch": 0.4826693392111987, + "grad_norm": 0.44413092732429504, + "learning_rate": 0.0001034871781541221, + "loss": 1.3784, + "step": 37144 + }, + { + "epoch": 0.48268233375511455, + "grad_norm": 0.3561576306819916, + "learning_rate": 0.00010348457869221072, + "loss": 1.3075, + "step": 37145 + }, + { + "epoch": 0.48269532829903045, + "grad_norm": 0.3423554599285126, + "learning_rate": 0.00010348197923029934, + "loss": 1.3032, + "step": 37146 + }, + { + "epoch": 0.4827083228429463, + "grad_norm": 0.4929901361465454, + "learning_rate": 0.00010347937976838795, + "loss": 1.4953, + "step": 37147 + }, + { + "epoch": 0.4827213173868622, + "grad_norm": 0.4789813756942749, + "learning_rate": 0.00010347678030647656, + "loss": 1.2363, + "step": 37148 + }, + { + "epoch": 0.48273431193077804, + "grad_norm": 0.3408944308757782, + "learning_rate": 0.00010347418084456517, + "loss": 1.3726, + "step": 37149 + }, + { + "epoch": 0.48274730647469394, + "grad_norm": 0.2718135118484497, + "learning_rate": 0.00010347158138265381, + "loss": 1.2941, + "step": 37150 + }, + { + "epoch": 0.48276030101860984, + "grad_norm": 0.43935272097587585, + "learning_rate": 0.00010346898192074241, + "loss": 1.2502, + "step": 37151 + }, + { + "epoch": 0.4827732955625257, + "grad_norm": 0.4239197373390198, + "learning_rate": 0.00010346638245883102, + "loss": 1.1811, + "step": 37152 + }, + { + "epoch": 0.4827862901064416, + "grad_norm": 0.43971508741378784, + "learning_rate": 0.00010346378299691963, + "loss": 1.3636, + "step": 37153 + }, + { + "epoch": 0.48279928465035743, + "grad_norm": 0.39191752672195435, + "learning_rate": 0.00010346118353500827, + "loss": 1.3805, + "step": 37154 + }, + { + "epoch": 0.48281227919427333, + "grad_norm": 0.24109315872192383, + "learning_rate": 0.00010345858407309688, + "loss": 1.2716, + "step": 37155 + }, + { + "epoch": 0.4828252737381892, + "grad_norm": 0.45072266459465027, + "learning_rate": 0.00010345598461118549, + "loss": 1.5644, + "step": 37156 + }, + { + "epoch": 0.4828382682821051, + "grad_norm": 0.3615192770957947, + "learning_rate": 0.0001034533851492741, + "loss": 1.469, + "step": 37157 + }, + { + "epoch": 0.4828512628260209, + "grad_norm": 0.48113152384757996, + "learning_rate": 0.00010345078568736272, + "loss": 1.4839, + "step": 37158 + }, + { + "epoch": 0.4828642573699368, + "grad_norm": 0.40739676356315613, + "learning_rate": 0.00010344818622545134, + "loss": 1.2597, + "step": 37159 + }, + { + "epoch": 0.48287725191385267, + "grad_norm": 0.5238981246948242, + "learning_rate": 0.00010344558676353995, + "loss": 1.396, + "step": 37160 + }, + { + "epoch": 0.48289024645776857, + "grad_norm": 0.35567423701286316, + "learning_rate": 0.00010344298730162856, + "loss": 1.4781, + "step": 37161 + }, + { + "epoch": 0.4829032410016844, + "grad_norm": 0.47920507192611694, + "learning_rate": 0.0001034403878397172, + "loss": 1.5047, + "step": 37162 + }, + { + "epoch": 0.4829162355456003, + "grad_norm": 0.36311790347099304, + "learning_rate": 0.00010343778837780579, + "loss": 1.5195, + "step": 37163 + }, + { + "epoch": 0.48292923008951616, + "grad_norm": 0.410348504781723, + "learning_rate": 0.0001034351889158944, + "loss": 1.3969, + "step": 37164 + }, + { + "epoch": 0.48294222463343206, + "grad_norm": 0.4122743308544159, + "learning_rate": 0.00010343258945398302, + "loss": 1.2918, + "step": 37165 + }, + { + "epoch": 0.4829552191773479, + "grad_norm": 0.30620068311691284, + "learning_rate": 0.00010342998999207165, + "loss": 1.2396, + "step": 37166 + }, + { + "epoch": 0.4829682137212638, + "grad_norm": 0.47491100430488586, + "learning_rate": 0.00010342739053016026, + "loss": 1.5018, + "step": 37167 + }, + { + "epoch": 0.48298120826517965, + "grad_norm": 0.44979533553123474, + "learning_rate": 0.00010342479106824887, + "loss": 1.2966, + "step": 37168 + }, + { + "epoch": 0.48299420280909555, + "grad_norm": 0.35663050413131714, + "learning_rate": 0.0001034221916063375, + "loss": 1.3509, + "step": 37169 + }, + { + "epoch": 0.4830071973530114, + "grad_norm": 0.412049263715744, + "learning_rate": 0.00010341959214442611, + "loss": 1.3382, + "step": 37170 + }, + { + "epoch": 0.4830201918969273, + "grad_norm": 0.42856305837631226, + "learning_rate": 0.00010341699268251472, + "loss": 1.5078, + "step": 37171 + }, + { + "epoch": 0.48303318644084314, + "grad_norm": 0.3330698609352112, + "learning_rate": 0.00010341439322060333, + "loss": 1.4553, + "step": 37172 + }, + { + "epoch": 0.48304618098475904, + "grad_norm": 0.4218689501285553, + "learning_rate": 0.00010341179375869197, + "loss": 1.2499, + "step": 37173 + }, + { + "epoch": 0.4830591755286749, + "grad_norm": 0.3141877353191376, + "learning_rate": 0.00010340919429678058, + "loss": 1.483, + "step": 37174 + }, + { + "epoch": 0.4830721700725908, + "grad_norm": 0.39505767822265625, + "learning_rate": 0.00010340659483486919, + "loss": 1.3425, + "step": 37175 + }, + { + "epoch": 0.48308516461650663, + "grad_norm": 0.36962851881980896, + "learning_rate": 0.00010340399537295779, + "loss": 1.2817, + "step": 37176 + }, + { + "epoch": 0.48309815916042254, + "grad_norm": 0.49067923426628113, + "learning_rate": 0.00010340139591104643, + "loss": 1.4719, + "step": 37177 + }, + { + "epoch": 0.4831111537043384, + "grad_norm": 0.3833983540534973, + "learning_rate": 0.00010339879644913504, + "loss": 1.4658, + "step": 37178 + }, + { + "epoch": 0.4831241482482543, + "grad_norm": 0.39992755651474, + "learning_rate": 0.00010339619698722365, + "loss": 1.3551, + "step": 37179 + }, + { + "epoch": 0.4831371427921701, + "grad_norm": 0.3843168020248413, + "learning_rate": 0.00010339359752531226, + "loss": 1.3564, + "step": 37180 + }, + { + "epoch": 0.483150137336086, + "grad_norm": 0.467490553855896, + "learning_rate": 0.00010339099806340088, + "loss": 1.4634, + "step": 37181 + }, + { + "epoch": 0.4831631318800019, + "grad_norm": 0.5566713213920593, + "learning_rate": 0.0001033883986014895, + "loss": 1.3063, + "step": 37182 + }, + { + "epoch": 0.4831761264239178, + "grad_norm": 0.36853906512260437, + "learning_rate": 0.0001033857991395781, + "loss": 1.5132, + "step": 37183 + }, + { + "epoch": 0.4831891209678336, + "grad_norm": 0.36428385972976685, + "learning_rate": 0.00010338319967766672, + "loss": 1.4014, + "step": 37184 + }, + { + "epoch": 0.4832021155117495, + "grad_norm": 0.45255762338638306, + "learning_rate": 0.00010338060021575536, + "loss": 1.4467, + "step": 37185 + }, + { + "epoch": 0.48321511005566536, + "grad_norm": 0.34628692269325256, + "learning_rate": 0.00010337800075384397, + "loss": 1.3885, + "step": 37186 + }, + { + "epoch": 0.48322810459958127, + "grad_norm": 0.46361613273620605, + "learning_rate": 0.00010337540129193258, + "loss": 1.5286, + "step": 37187 + }, + { + "epoch": 0.4832410991434971, + "grad_norm": 0.4115747809410095, + "learning_rate": 0.00010337280183002119, + "loss": 1.3885, + "step": 37188 + }, + { + "epoch": 0.483254093687413, + "grad_norm": 0.3923777639865875, + "learning_rate": 0.00010337020236810981, + "loss": 1.4679, + "step": 37189 + }, + { + "epoch": 0.48326708823132886, + "grad_norm": 0.4257797598838806, + "learning_rate": 0.00010336760290619842, + "loss": 1.5457, + "step": 37190 + }, + { + "epoch": 0.48328008277524476, + "grad_norm": 0.4121578633785248, + "learning_rate": 0.00010336500344428703, + "loss": 1.4574, + "step": 37191 + }, + { + "epoch": 0.4832930773191606, + "grad_norm": 0.38433706760406494, + "learning_rate": 0.00010336240398237565, + "loss": 1.4184, + "step": 37192 + }, + { + "epoch": 0.4833060718630765, + "grad_norm": 0.4299604296684265, + "learning_rate": 0.00010335980452046427, + "loss": 1.2056, + "step": 37193 + }, + { + "epoch": 0.48331906640699235, + "grad_norm": 0.41900190711021423, + "learning_rate": 0.00010335720505855288, + "loss": 1.3693, + "step": 37194 + }, + { + "epoch": 0.48333206095090825, + "grad_norm": 0.3741609454154968, + "learning_rate": 0.00010335460559664149, + "loss": 1.3709, + "step": 37195 + }, + { + "epoch": 0.4833450554948241, + "grad_norm": 0.39460936188697815, + "learning_rate": 0.0001033520061347301, + "loss": 1.2977, + "step": 37196 + }, + { + "epoch": 0.48335805003874, + "grad_norm": 0.3713022768497467, + "learning_rate": 0.00010334940667281874, + "loss": 1.3617, + "step": 37197 + }, + { + "epoch": 0.48337104458265584, + "grad_norm": 0.42974406480789185, + "learning_rate": 0.00010334680721090735, + "loss": 1.5333, + "step": 37198 + }, + { + "epoch": 0.48338403912657174, + "grad_norm": 0.41206085681915283, + "learning_rate": 0.00010334420774899596, + "loss": 1.4894, + "step": 37199 + }, + { + "epoch": 0.4833970336704876, + "grad_norm": 0.2533392012119293, + "learning_rate": 0.00010334160828708457, + "loss": 1.3807, + "step": 37200 + }, + { + "epoch": 0.4834100282144035, + "grad_norm": 0.37510421872138977, + "learning_rate": 0.0001033390088251732, + "loss": 1.3113, + "step": 37201 + }, + { + "epoch": 0.48342302275831933, + "grad_norm": 0.32097989320755005, + "learning_rate": 0.00010333640936326181, + "loss": 1.3608, + "step": 37202 + }, + { + "epoch": 0.48343601730223523, + "grad_norm": 0.48167771100997925, + "learning_rate": 0.00010333380990135042, + "loss": 1.3791, + "step": 37203 + }, + { + "epoch": 0.4834490118461511, + "grad_norm": 0.466338574886322, + "learning_rate": 0.00010333121043943903, + "loss": 1.5742, + "step": 37204 + }, + { + "epoch": 0.483462006390067, + "grad_norm": 0.43769240379333496, + "learning_rate": 0.00010332861097752766, + "loss": 1.4878, + "step": 37205 + }, + { + "epoch": 0.4834750009339828, + "grad_norm": 0.42374518513679504, + "learning_rate": 0.00010332601151561627, + "loss": 1.2254, + "step": 37206 + }, + { + "epoch": 0.4834879954778987, + "grad_norm": 0.35992535948753357, + "learning_rate": 0.00010332341205370488, + "loss": 1.3777, + "step": 37207 + }, + { + "epoch": 0.48350099002181457, + "grad_norm": 0.4110301733016968, + "learning_rate": 0.00010332081259179352, + "loss": 1.347, + "step": 37208 + }, + { + "epoch": 0.48351398456573047, + "grad_norm": 0.4136654734611511, + "learning_rate": 0.00010331821312988213, + "loss": 1.5877, + "step": 37209 + }, + { + "epoch": 0.4835269791096463, + "grad_norm": 0.40613579750061035, + "learning_rate": 0.00010331561366797074, + "loss": 1.3394, + "step": 37210 + }, + { + "epoch": 0.4835399736535622, + "grad_norm": 0.4845450520515442, + "learning_rate": 0.00010331301420605935, + "loss": 1.3919, + "step": 37211 + }, + { + "epoch": 0.48355296819747806, + "grad_norm": 0.3628627061843872, + "learning_rate": 0.00010331041474414797, + "loss": 1.3804, + "step": 37212 + }, + { + "epoch": 0.48356596274139396, + "grad_norm": 0.40418869256973267, + "learning_rate": 0.00010330781528223658, + "loss": 1.3988, + "step": 37213 + }, + { + "epoch": 0.4835789572853098, + "grad_norm": 0.33032387495040894, + "learning_rate": 0.0001033052158203252, + "loss": 1.4622, + "step": 37214 + }, + { + "epoch": 0.4835919518292257, + "grad_norm": 0.4323943257331848, + "learning_rate": 0.0001033026163584138, + "loss": 1.6106, + "step": 37215 + }, + { + "epoch": 0.48360494637314155, + "grad_norm": 0.37121543288230896, + "learning_rate": 0.00010330001689650244, + "loss": 1.3792, + "step": 37216 + }, + { + "epoch": 0.48361794091705745, + "grad_norm": 0.4017961323261261, + "learning_rate": 0.00010329741743459105, + "loss": 1.4648, + "step": 37217 + }, + { + "epoch": 0.4836309354609733, + "grad_norm": 0.416471391916275, + "learning_rate": 0.00010329481797267965, + "loss": 1.6313, + "step": 37218 + }, + { + "epoch": 0.4836439300048892, + "grad_norm": 0.34949880838394165, + "learning_rate": 0.00010329221851076826, + "loss": 1.2322, + "step": 37219 + }, + { + "epoch": 0.48365692454880505, + "grad_norm": 0.47480031847953796, + "learning_rate": 0.0001032896190488569, + "loss": 1.4028, + "step": 37220 + }, + { + "epoch": 0.48366991909272095, + "grad_norm": 0.4453912377357483, + "learning_rate": 0.00010328701958694551, + "loss": 1.3699, + "step": 37221 + }, + { + "epoch": 0.4836829136366368, + "grad_norm": 0.41161414980888367, + "learning_rate": 0.00010328442012503412, + "loss": 1.2339, + "step": 37222 + }, + { + "epoch": 0.4836959081805527, + "grad_norm": 0.3298141360282898, + "learning_rate": 0.00010328182066312273, + "loss": 1.5895, + "step": 37223 + }, + { + "epoch": 0.48370890272446854, + "grad_norm": 0.34714871644973755, + "learning_rate": 0.00010327922120121136, + "loss": 1.3556, + "step": 37224 + }, + { + "epoch": 0.48372189726838444, + "grad_norm": 0.48577478528022766, + "learning_rate": 0.00010327662173929997, + "loss": 1.2358, + "step": 37225 + }, + { + "epoch": 0.4837348918123003, + "grad_norm": 0.3123428523540497, + "learning_rate": 0.00010327402227738858, + "loss": 1.4468, + "step": 37226 + }, + { + "epoch": 0.4837478863562162, + "grad_norm": 0.4715064764022827, + "learning_rate": 0.00010327142281547719, + "loss": 1.4772, + "step": 37227 + }, + { + "epoch": 0.4837608809001321, + "grad_norm": 0.4288148880004883, + "learning_rate": 0.00010326882335356583, + "loss": 1.4387, + "step": 37228 + }, + { + "epoch": 0.48377387544404793, + "grad_norm": 0.3944775462150574, + "learning_rate": 0.00010326622389165444, + "loss": 1.5412, + "step": 37229 + }, + { + "epoch": 0.48378686998796383, + "grad_norm": 0.3985206186771393, + "learning_rate": 0.00010326362442974305, + "loss": 1.3873, + "step": 37230 + }, + { + "epoch": 0.4837998645318797, + "grad_norm": 0.46342065930366516, + "learning_rate": 0.00010326102496783165, + "loss": 1.5622, + "step": 37231 + }, + { + "epoch": 0.4838128590757956, + "grad_norm": 0.441742867231369, + "learning_rate": 0.00010325842550592029, + "loss": 1.453, + "step": 37232 + }, + { + "epoch": 0.4838258536197114, + "grad_norm": 0.39225175976753235, + "learning_rate": 0.0001032558260440089, + "loss": 1.4559, + "step": 37233 + }, + { + "epoch": 0.4838388481636273, + "grad_norm": 0.3734672963619232, + "learning_rate": 0.00010325322658209751, + "loss": 1.2709, + "step": 37234 + }, + { + "epoch": 0.48385184270754317, + "grad_norm": 0.3944225609302521, + "learning_rate": 0.00010325062712018612, + "loss": 1.3263, + "step": 37235 + }, + { + "epoch": 0.48386483725145907, + "grad_norm": 0.46858030557632446, + "learning_rate": 0.00010324802765827474, + "loss": 1.4601, + "step": 37236 + }, + { + "epoch": 0.4838778317953749, + "grad_norm": 0.36646565794944763, + "learning_rate": 0.00010324542819636335, + "loss": 1.1824, + "step": 37237 + }, + { + "epoch": 0.4838908263392908, + "grad_norm": 0.3870489001274109, + "learning_rate": 0.00010324282873445197, + "loss": 1.3307, + "step": 37238 + }, + { + "epoch": 0.48390382088320666, + "grad_norm": 0.44329240918159485, + "learning_rate": 0.00010324022927254058, + "loss": 1.2801, + "step": 37239 + }, + { + "epoch": 0.48391681542712256, + "grad_norm": 0.38681545853614807, + "learning_rate": 0.00010323762981062921, + "loss": 1.3987, + "step": 37240 + }, + { + "epoch": 0.4839298099710384, + "grad_norm": 0.4820795953273773, + "learning_rate": 0.00010323503034871783, + "loss": 1.3726, + "step": 37241 + }, + { + "epoch": 0.4839428045149543, + "grad_norm": 0.29340028762817383, + "learning_rate": 0.00010323243088680644, + "loss": 1.4184, + "step": 37242 + }, + { + "epoch": 0.48395579905887015, + "grad_norm": 0.34914180636405945, + "learning_rate": 0.00010322983142489506, + "loss": 1.1709, + "step": 37243 + }, + { + "epoch": 0.48396879360278605, + "grad_norm": 0.3743126392364502, + "learning_rate": 0.00010322723196298367, + "loss": 1.4648, + "step": 37244 + }, + { + "epoch": 0.4839817881467019, + "grad_norm": 0.37745729088783264, + "learning_rate": 0.00010322463250107228, + "loss": 1.1361, + "step": 37245 + }, + { + "epoch": 0.4839947826906178, + "grad_norm": 0.4105580449104309, + "learning_rate": 0.0001032220330391609, + "loss": 1.4258, + "step": 37246 + }, + { + "epoch": 0.48400777723453364, + "grad_norm": 0.5369956493377686, + "learning_rate": 0.00010321943357724952, + "loss": 1.3895, + "step": 37247 + }, + { + "epoch": 0.48402077177844954, + "grad_norm": 0.39933329820632935, + "learning_rate": 0.00010321683411533813, + "loss": 1.3426, + "step": 37248 + }, + { + "epoch": 0.4840337663223654, + "grad_norm": 0.43829116225242615, + "learning_rate": 0.00010321423465342674, + "loss": 1.378, + "step": 37249 + }, + { + "epoch": 0.4840467608662813, + "grad_norm": 0.4701855480670929, + "learning_rate": 0.00010321163519151535, + "loss": 1.251, + "step": 37250 + }, + { + "epoch": 0.48405975541019713, + "grad_norm": 0.43581947684288025, + "learning_rate": 0.00010320903572960399, + "loss": 1.536, + "step": 37251 + }, + { + "epoch": 0.48407274995411304, + "grad_norm": 0.5097048878669739, + "learning_rate": 0.0001032064362676926, + "loss": 1.3069, + "step": 37252 + }, + { + "epoch": 0.4840857444980289, + "grad_norm": 0.40764036774635315, + "learning_rate": 0.00010320383680578121, + "loss": 1.3116, + "step": 37253 + }, + { + "epoch": 0.4840987390419448, + "grad_norm": 0.48757895827293396, + "learning_rate": 0.00010320123734386982, + "loss": 1.5043, + "step": 37254 + }, + { + "epoch": 0.4841117335858606, + "grad_norm": 0.4363167881965637, + "learning_rate": 0.00010319863788195845, + "loss": 1.3365, + "step": 37255 + }, + { + "epoch": 0.4841247281297765, + "grad_norm": 0.375189870595932, + "learning_rate": 0.00010319603842004706, + "loss": 1.3692, + "step": 37256 + }, + { + "epoch": 0.48413772267369237, + "grad_norm": 0.38907134532928467, + "learning_rate": 0.00010319343895813567, + "loss": 1.3163, + "step": 37257 + }, + { + "epoch": 0.4841507172176083, + "grad_norm": 0.5125067234039307, + "learning_rate": 0.00010319083949622428, + "loss": 1.4085, + "step": 37258 + }, + { + "epoch": 0.4841637117615241, + "grad_norm": 0.39381349086761475, + "learning_rate": 0.00010318824003431292, + "loss": 1.342, + "step": 37259 + }, + { + "epoch": 0.48417670630544, + "grad_norm": 0.42297041416168213, + "learning_rate": 0.00010318564057240151, + "loss": 1.6378, + "step": 37260 + }, + { + "epoch": 0.48418970084935586, + "grad_norm": 0.4624077379703522, + "learning_rate": 0.00010318304111049013, + "loss": 1.5711, + "step": 37261 + }, + { + "epoch": 0.48420269539327176, + "grad_norm": 0.35115256905555725, + "learning_rate": 0.00010318044164857874, + "loss": 1.5156, + "step": 37262 + }, + { + "epoch": 0.4842156899371876, + "grad_norm": 0.35407575964927673, + "learning_rate": 0.00010317784218666737, + "loss": 1.2877, + "step": 37263 + }, + { + "epoch": 0.4842286844811035, + "grad_norm": 0.3609103560447693, + "learning_rate": 0.00010317524272475599, + "loss": 1.2367, + "step": 37264 + }, + { + "epoch": 0.48424167902501936, + "grad_norm": 0.5343315601348877, + "learning_rate": 0.0001031726432628446, + "loss": 1.575, + "step": 37265 + }, + { + "epoch": 0.48425467356893526, + "grad_norm": 0.3595285415649414, + "learning_rate": 0.00010317004380093321, + "loss": 1.2495, + "step": 37266 + }, + { + "epoch": 0.4842676681128511, + "grad_norm": 0.4144512116909027, + "learning_rate": 0.00010316744433902183, + "loss": 1.5043, + "step": 37267 + }, + { + "epoch": 0.484280662656767, + "grad_norm": 0.36911237239837646, + "learning_rate": 0.00010316484487711044, + "loss": 1.1594, + "step": 37268 + }, + { + "epoch": 0.48429365720068285, + "grad_norm": 0.40948423743247986, + "learning_rate": 0.00010316224541519905, + "loss": 1.33, + "step": 37269 + }, + { + "epoch": 0.48430665174459875, + "grad_norm": 0.3155144453048706, + "learning_rate": 0.00010315964595328766, + "loss": 1.3853, + "step": 37270 + }, + { + "epoch": 0.4843196462885146, + "grad_norm": 0.37012192606925964, + "learning_rate": 0.0001031570464913763, + "loss": 1.3815, + "step": 37271 + }, + { + "epoch": 0.4843326408324305, + "grad_norm": 0.5392526388168335, + "learning_rate": 0.00010315444702946491, + "loss": 1.5236, + "step": 37272 + }, + { + "epoch": 0.48434563537634634, + "grad_norm": 0.40471458435058594, + "learning_rate": 0.00010315184756755351, + "loss": 1.3766, + "step": 37273 + }, + { + "epoch": 0.48435862992026224, + "grad_norm": 0.375217080116272, + "learning_rate": 0.00010314924810564212, + "loss": 1.3948, + "step": 37274 + }, + { + "epoch": 0.4843716244641781, + "grad_norm": 0.4086691439151764, + "learning_rate": 0.00010314664864373076, + "loss": 1.4578, + "step": 37275 + }, + { + "epoch": 0.484384619008094, + "grad_norm": 0.34421971440315247, + "learning_rate": 0.00010314404918181937, + "loss": 1.2724, + "step": 37276 + }, + { + "epoch": 0.48439761355200983, + "grad_norm": 0.4066241383552551, + "learning_rate": 0.00010314144971990798, + "loss": 1.5079, + "step": 37277 + }, + { + "epoch": 0.48441060809592573, + "grad_norm": 0.5166919231414795, + "learning_rate": 0.00010313885025799659, + "loss": 1.4763, + "step": 37278 + }, + { + "epoch": 0.4844236026398416, + "grad_norm": 0.4615444242954254, + "learning_rate": 0.00010313625079608522, + "loss": 1.5526, + "step": 37279 + }, + { + "epoch": 0.4844365971837575, + "grad_norm": 0.42272472381591797, + "learning_rate": 0.00010313365133417383, + "loss": 1.3383, + "step": 37280 + }, + { + "epoch": 0.4844495917276733, + "grad_norm": 0.3164714574813843, + "learning_rate": 0.00010313105187226244, + "loss": 1.326, + "step": 37281 + }, + { + "epoch": 0.4844625862715892, + "grad_norm": 0.4370535612106323, + "learning_rate": 0.00010312845241035108, + "loss": 1.4286, + "step": 37282 + }, + { + "epoch": 0.48447558081550507, + "grad_norm": 0.43848681449890137, + "learning_rate": 0.00010312585294843969, + "loss": 1.4454, + "step": 37283 + }, + { + "epoch": 0.48448857535942097, + "grad_norm": 0.542762279510498, + "learning_rate": 0.0001031232534865283, + "loss": 1.5935, + "step": 37284 + }, + { + "epoch": 0.4845015699033368, + "grad_norm": 0.3079347312450409, + "learning_rate": 0.00010312065402461691, + "loss": 1.3663, + "step": 37285 + }, + { + "epoch": 0.4845145644472527, + "grad_norm": 0.3275371193885803, + "learning_rate": 0.00010311805456270553, + "loss": 1.288, + "step": 37286 + }, + { + "epoch": 0.48452755899116856, + "grad_norm": 0.4336625039577484, + "learning_rate": 0.00010311545510079415, + "loss": 1.2743, + "step": 37287 + }, + { + "epoch": 0.48454055353508446, + "grad_norm": 0.3128984868526459, + "learning_rate": 0.00010311285563888276, + "loss": 1.0264, + "step": 37288 + }, + { + "epoch": 0.4845535480790003, + "grad_norm": 0.40120580792427063, + "learning_rate": 0.00010311025617697137, + "loss": 1.3331, + "step": 37289 + }, + { + "epoch": 0.4845665426229162, + "grad_norm": 0.4253067672252655, + "learning_rate": 0.00010310765671505999, + "loss": 1.4092, + "step": 37290 + }, + { + "epoch": 0.48457953716683205, + "grad_norm": 0.5073135495185852, + "learning_rate": 0.0001031050572531486, + "loss": 1.5909, + "step": 37291 + }, + { + "epoch": 0.48459253171074795, + "grad_norm": 0.3991956114768982, + "learning_rate": 0.00010310245779123721, + "loss": 1.3166, + "step": 37292 + }, + { + "epoch": 0.4846055262546638, + "grad_norm": 0.46882501244544983, + "learning_rate": 0.00010309985832932582, + "loss": 1.5447, + "step": 37293 + }, + { + "epoch": 0.4846185207985797, + "grad_norm": 0.35512053966522217, + "learning_rate": 0.00010309725886741446, + "loss": 1.3552, + "step": 37294 + }, + { + "epoch": 0.48463151534249554, + "grad_norm": 0.4463934898376465, + "learning_rate": 0.00010309465940550307, + "loss": 1.5584, + "step": 37295 + }, + { + "epoch": 0.48464450988641145, + "grad_norm": 0.38918331265449524, + "learning_rate": 0.00010309205994359168, + "loss": 1.4079, + "step": 37296 + }, + { + "epoch": 0.4846575044303273, + "grad_norm": 0.4286218285560608, + "learning_rate": 0.0001030894604816803, + "loss": 1.5875, + "step": 37297 + }, + { + "epoch": 0.4846704989742432, + "grad_norm": 0.3851255774497986, + "learning_rate": 0.00010308686101976892, + "loss": 1.5539, + "step": 37298 + }, + { + "epoch": 0.48468349351815904, + "grad_norm": 0.47664251923561096, + "learning_rate": 0.00010308426155785753, + "loss": 1.2369, + "step": 37299 + }, + { + "epoch": 0.48469648806207494, + "grad_norm": 0.3159470558166504, + "learning_rate": 0.00010308166209594614, + "loss": 1.4097, + "step": 37300 + }, + { + "epoch": 0.4847094826059908, + "grad_norm": 0.3819974958896637, + "learning_rate": 0.00010307906263403475, + "loss": 1.3115, + "step": 37301 + }, + { + "epoch": 0.4847224771499067, + "grad_norm": 0.44311949610710144, + "learning_rate": 0.00010307646317212338, + "loss": 1.4432, + "step": 37302 + }, + { + "epoch": 0.4847354716938226, + "grad_norm": 0.39163726568222046, + "learning_rate": 0.00010307386371021199, + "loss": 1.4441, + "step": 37303 + }, + { + "epoch": 0.48474846623773843, + "grad_norm": 0.3820706903934479, + "learning_rate": 0.0001030712642483006, + "loss": 1.3656, + "step": 37304 + }, + { + "epoch": 0.48476146078165433, + "grad_norm": 0.4349375367164612, + "learning_rate": 0.00010306866478638921, + "loss": 1.378, + "step": 37305 + }, + { + "epoch": 0.4847744553255702, + "grad_norm": 0.38828837871551514, + "learning_rate": 0.00010306606532447785, + "loss": 1.2803, + "step": 37306 + }, + { + "epoch": 0.4847874498694861, + "grad_norm": 0.3196861743927002, + "learning_rate": 0.00010306346586256646, + "loss": 1.3451, + "step": 37307 + }, + { + "epoch": 0.4848004444134019, + "grad_norm": 0.40137940645217896, + "learning_rate": 0.00010306086640065507, + "loss": 1.217, + "step": 37308 + }, + { + "epoch": 0.4848134389573178, + "grad_norm": 0.4123448431491852, + "learning_rate": 0.00010305826693874368, + "loss": 1.3543, + "step": 37309 + }, + { + "epoch": 0.48482643350123367, + "grad_norm": 0.38477200269699097, + "learning_rate": 0.0001030556674768323, + "loss": 1.292, + "step": 37310 + }, + { + "epoch": 0.48483942804514957, + "grad_norm": 0.2858336865901947, + "learning_rate": 0.00010305306801492092, + "loss": 1.2048, + "step": 37311 + }, + { + "epoch": 0.4848524225890654, + "grad_norm": 0.4781274199485779, + "learning_rate": 0.00010305046855300953, + "loss": 1.2614, + "step": 37312 + }, + { + "epoch": 0.4848654171329813, + "grad_norm": 0.41455015540122986, + "learning_rate": 0.00010304786909109814, + "loss": 1.4943, + "step": 37313 + }, + { + "epoch": 0.48487841167689716, + "grad_norm": 0.3462524712085724, + "learning_rate": 0.00010304526962918678, + "loss": 1.3053, + "step": 37314 + }, + { + "epoch": 0.48489140622081306, + "grad_norm": 0.5161040425300598, + "learning_rate": 0.00010304267016727537, + "loss": 1.4796, + "step": 37315 + }, + { + "epoch": 0.4849044007647289, + "grad_norm": 0.3433058559894562, + "learning_rate": 0.00010304007070536398, + "loss": 1.4006, + "step": 37316 + }, + { + "epoch": 0.4849173953086448, + "grad_norm": 0.4409097135066986, + "learning_rate": 0.00010303747124345262, + "loss": 1.3829, + "step": 37317 + }, + { + "epoch": 0.48493038985256065, + "grad_norm": 0.46034157276153564, + "learning_rate": 0.00010303487178154123, + "loss": 1.4914, + "step": 37318 + }, + { + "epoch": 0.48494338439647655, + "grad_norm": 0.45123717188835144, + "learning_rate": 0.00010303227231962984, + "loss": 1.4212, + "step": 37319 + }, + { + "epoch": 0.4849563789403924, + "grad_norm": 0.3614075779914856, + "learning_rate": 0.00010302967285771845, + "loss": 1.507, + "step": 37320 + }, + { + "epoch": 0.4849693734843083, + "grad_norm": 0.33442336320877075, + "learning_rate": 0.00010302707339580708, + "loss": 1.4008, + "step": 37321 + }, + { + "epoch": 0.48498236802822414, + "grad_norm": 0.3285088539123535, + "learning_rate": 0.00010302447393389569, + "loss": 1.3035, + "step": 37322 + }, + { + "epoch": 0.48499536257214004, + "grad_norm": 0.3431672155857086, + "learning_rate": 0.0001030218744719843, + "loss": 1.3545, + "step": 37323 + }, + { + "epoch": 0.4850083571160559, + "grad_norm": 0.33698031306266785, + "learning_rate": 0.00010301927501007291, + "loss": 1.2774, + "step": 37324 + }, + { + "epoch": 0.4850213516599718, + "grad_norm": 0.40808069705963135, + "learning_rate": 0.00010301667554816155, + "loss": 1.4727, + "step": 37325 + }, + { + "epoch": 0.48503434620388763, + "grad_norm": 0.40479081869125366, + "learning_rate": 0.00010301407608625016, + "loss": 1.3594, + "step": 37326 + }, + { + "epoch": 0.48504734074780353, + "grad_norm": 0.43282344937324524, + "learning_rate": 0.00010301147662433876, + "loss": 1.3836, + "step": 37327 + }, + { + "epoch": 0.4850603352917194, + "grad_norm": 0.3177891969680786, + "learning_rate": 0.00010300887716242737, + "loss": 1.5342, + "step": 37328 + }, + { + "epoch": 0.4850733298356353, + "grad_norm": 0.46543434262275696, + "learning_rate": 0.00010300627770051601, + "loss": 1.4264, + "step": 37329 + }, + { + "epoch": 0.4850863243795511, + "grad_norm": 0.37158024311065674, + "learning_rate": 0.00010300367823860462, + "loss": 1.3821, + "step": 37330 + }, + { + "epoch": 0.485099318923467, + "grad_norm": 0.3421456217765808, + "learning_rate": 0.00010300107877669323, + "loss": 1.3901, + "step": 37331 + }, + { + "epoch": 0.48511231346738287, + "grad_norm": 0.4326415956020355, + "learning_rate": 0.00010299847931478184, + "loss": 1.6147, + "step": 37332 + }, + { + "epoch": 0.4851253080112988, + "grad_norm": 0.43219414353370667, + "learning_rate": 0.00010299587985287046, + "loss": 1.5064, + "step": 37333 + }, + { + "epoch": 0.4851383025552146, + "grad_norm": 0.4410184919834137, + "learning_rate": 0.00010299328039095908, + "loss": 1.5092, + "step": 37334 + }, + { + "epoch": 0.4851512970991305, + "grad_norm": 0.3448207676410675, + "learning_rate": 0.00010299068092904769, + "loss": 1.2973, + "step": 37335 + }, + { + "epoch": 0.48516429164304636, + "grad_norm": 0.31727349758148193, + "learning_rate": 0.0001029880814671363, + "loss": 1.2197, + "step": 37336 + }, + { + "epoch": 0.48517728618696226, + "grad_norm": 0.4648221731185913, + "learning_rate": 0.00010298548200522494, + "loss": 1.3931, + "step": 37337 + }, + { + "epoch": 0.4851902807308781, + "grad_norm": 0.3920912742614746, + "learning_rate": 0.00010298288254331355, + "loss": 1.3688, + "step": 37338 + }, + { + "epoch": 0.485203275274794, + "grad_norm": 0.5389154553413391, + "learning_rate": 0.00010298028308140216, + "loss": 1.5543, + "step": 37339 + }, + { + "epoch": 0.48521626981870986, + "grad_norm": 0.39465224742889404, + "learning_rate": 0.00010297768361949075, + "loss": 1.5527, + "step": 37340 + }, + { + "epoch": 0.48522926436262576, + "grad_norm": 0.4820897877216339, + "learning_rate": 0.00010297508415757939, + "loss": 1.5539, + "step": 37341 + }, + { + "epoch": 0.4852422589065416, + "grad_norm": 0.41805174946784973, + "learning_rate": 0.000102972484695668, + "loss": 1.4556, + "step": 37342 + }, + { + "epoch": 0.4852552534504575, + "grad_norm": 0.3391643762588501, + "learning_rate": 0.00010296988523375661, + "loss": 1.3382, + "step": 37343 + }, + { + "epoch": 0.48526824799437335, + "grad_norm": 0.4018622636795044, + "learning_rate": 0.00010296728577184523, + "loss": 1.4233, + "step": 37344 + }, + { + "epoch": 0.48528124253828925, + "grad_norm": 0.2915011942386627, + "learning_rate": 0.00010296468630993385, + "loss": 1.3075, + "step": 37345 + }, + { + "epoch": 0.4852942370822051, + "grad_norm": 0.36640891432762146, + "learning_rate": 0.00010296208684802246, + "loss": 1.2986, + "step": 37346 + }, + { + "epoch": 0.485307231626121, + "grad_norm": 0.29109883308410645, + "learning_rate": 0.00010295948738611107, + "loss": 1.1549, + "step": 37347 + }, + { + "epoch": 0.48532022617003684, + "grad_norm": 0.4269489347934723, + "learning_rate": 0.00010295688792419968, + "loss": 1.4679, + "step": 37348 + }, + { + "epoch": 0.48533322071395274, + "grad_norm": 0.3677750527858734, + "learning_rate": 0.00010295428846228832, + "loss": 1.3904, + "step": 37349 + }, + { + "epoch": 0.4853462152578686, + "grad_norm": 0.4265616536140442, + "learning_rate": 0.00010295168900037693, + "loss": 1.3991, + "step": 37350 + }, + { + "epoch": 0.4853592098017845, + "grad_norm": 0.4789171814918518, + "learning_rate": 0.00010294908953846554, + "loss": 1.6488, + "step": 37351 + }, + { + "epoch": 0.48537220434570033, + "grad_norm": 0.4408518373966217, + "learning_rate": 0.00010294649007655415, + "loss": 1.4688, + "step": 37352 + }, + { + "epoch": 0.48538519888961623, + "grad_norm": 0.3400838375091553, + "learning_rate": 0.00010294389061464278, + "loss": 1.3789, + "step": 37353 + }, + { + "epoch": 0.4853981934335321, + "grad_norm": 0.41636016964912415, + "learning_rate": 0.00010294129115273139, + "loss": 1.4442, + "step": 37354 + }, + { + "epoch": 0.485411187977448, + "grad_norm": 0.5291188955307007, + "learning_rate": 0.00010293869169082, + "loss": 1.4994, + "step": 37355 + }, + { + "epoch": 0.4854241825213638, + "grad_norm": 0.4217016398906708, + "learning_rate": 0.00010293609222890864, + "loss": 1.4671, + "step": 37356 + }, + { + "epoch": 0.4854371770652797, + "grad_norm": 0.37189963459968567, + "learning_rate": 0.00010293349276699724, + "loss": 1.2521, + "step": 37357 + }, + { + "epoch": 0.48545017160919557, + "grad_norm": 0.5843049883842468, + "learning_rate": 0.00010293089330508585, + "loss": 1.4887, + "step": 37358 + }, + { + "epoch": 0.48546316615311147, + "grad_norm": 0.43090713024139404, + "learning_rate": 0.00010292829384317446, + "loss": 1.3354, + "step": 37359 + }, + { + "epoch": 0.4854761606970273, + "grad_norm": 0.3094753324985504, + "learning_rate": 0.0001029256943812631, + "loss": 1.4077, + "step": 37360 + }, + { + "epoch": 0.4854891552409432, + "grad_norm": 0.3762724995613098, + "learning_rate": 0.0001029230949193517, + "loss": 1.3662, + "step": 37361 + }, + { + "epoch": 0.48550214978485906, + "grad_norm": 0.41917455196380615, + "learning_rate": 0.00010292049545744032, + "loss": 1.6372, + "step": 37362 + }, + { + "epoch": 0.48551514432877496, + "grad_norm": 0.3764312267303467, + "learning_rate": 0.00010291789599552893, + "loss": 1.5298, + "step": 37363 + }, + { + "epoch": 0.4855281388726908, + "grad_norm": 0.3844401240348816, + "learning_rate": 0.00010291529653361755, + "loss": 1.279, + "step": 37364 + }, + { + "epoch": 0.4855411334166067, + "grad_norm": 0.34436261653900146, + "learning_rate": 0.00010291269707170616, + "loss": 1.2351, + "step": 37365 + }, + { + "epoch": 0.48555412796052255, + "grad_norm": 0.42525362968444824, + "learning_rate": 0.00010291009760979477, + "loss": 1.4303, + "step": 37366 + }, + { + "epoch": 0.48556712250443845, + "grad_norm": 0.40984097123146057, + "learning_rate": 0.00010290749814788339, + "loss": 1.4325, + "step": 37367 + }, + { + "epoch": 0.4855801170483543, + "grad_norm": 0.42023766040802, + "learning_rate": 0.00010290489868597202, + "loss": 1.4536, + "step": 37368 + }, + { + "epoch": 0.4855931115922702, + "grad_norm": 0.31130924820899963, + "learning_rate": 0.00010290229922406062, + "loss": 1.4841, + "step": 37369 + }, + { + "epoch": 0.48560610613618604, + "grad_norm": 0.4077592194080353, + "learning_rate": 0.00010289969976214923, + "loss": 1.4717, + "step": 37370 + }, + { + "epoch": 0.48561910068010194, + "grad_norm": 0.486330509185791, + "learning_rate": 0.00010289710030023784, + "loss": 1.419, + "step": 37371 + }, + { + "epoch": 0.4856320952240178, + "grad_norm": 0.40063440799713135, + "learning_rate": 0.00010289450083832648, + "loss": 1.4058, + "step": 37372 + }, + { + "epoch": 0.4856450897679337, + "grad_norm": 0.4688246548175812, + "learning_rate": 0.00010289190137641509, + "loss": 1.5506, + "step": 37373 + }, + { + "epoch": 0.48565808431184954, + "grad_norm": 0.4947185218334198, + "learning_rate": 0.0001028893019145037, + "loss": 1.3609, + "step": 37374 + }, + { + "epoch": 0.48567107885576544, + "grad_norm": 0.3377351760864258, + "learning_rate": 0.00010288670245259231, + "loss": 1.4575, + "step": 37375 + }, + { + "epoch": 0.4856840733996813, + "grad_norm": 0.4332377016544342, + "learning_rate": 0.00010288410299068094, + "loss": 1.3543, + "step": 37376 + }, + { + "epoch": 0.4856970679435972, + "grad_norm": 0.5080888271331787, + "learning_rate": 0.00010288150352876955, + "loss": 1.2991, + "step": 37377 + }, + { + "epoch": 0.48571006248751303, + "grad_norm": 0.3906863033771515, + "learning_rate": 0.00010287890406685816, + "loss": 1.4482, + "step": 37378 + }, + { + "epoch": 0.48572305703142893, + "grad_norm": 0.3924091160297394, + "learning_rate": 0.00010287630460494677, + "loss": 1.296, + "step": 37379 + }, + { + "epoch": 0.48573605157534483, + "grad_norm": 0.49593600630760193, + "learning_rate": 0.00010287370514303541, + "loss": 1.4692, + "step": 37380 + }, + { + "epoch": 0.4857490461192607, + "grad_norm": 0.40313035249710083, + "learning_rate": 0.00010287110568112402, + "loss": 1.3329, + "step": 37381 + }, + { + "epoch": 0.4857620406631766, + "grad_norm": 0.35620787739753723, + "learning_rate": 0.00010286850621921262, + "loss": 1.2116, + "step": 37382 + }, + { + "epoch": 0.4857750352070924, + "grad_norm": 0.39096513390541077, + "learning_rate": 0.00010286590675730123, + "loss": 1.477, + "step": 37383 + }, + { + "epoch": 0.4857880297510083, + "grad_norm": 0.3853517770767212, + "learning_rate": 0.00010286330729538987, + "loss": 1.4554, + "step": 37384 + }, + { + "epoch": 0.48580102429492417, + "grad_norm": 0.452470600605011, + "learning_rate": 0.00010286070783347848, + "loss": 1.2566, + "step": 37385 + }, + { + "epoch": 0.48581401883884007, + "grad_norm": 0.4798089563846588, + "learning_rate": 0.00010285810837156709, + "loss": 1.3867, + "step": 37386 + }, + { + "epoch": 0.4858270133827559, + "grad_norm": 0.42197123169898987, + "learning_rate": 0.0001028555089096557, + "loss": 1.5808, + "step": 37387 + }, + { + "epoch": 0.4858400079266718, + "grad_norm": 0.356563925743103, + "learning_rate": 0.00010285290944774432, + "loss": 1.3457, + "step": 37388 + }, + { + "epoch": 0.48585300247058766, + "grad_norm": 0.48244062066078186, + "learning_rate": 0.00010285030998583293, + "loss": 1.3217, + "step": 37389 + }, + { + "epoch": 0.48586599701450356, + "grad_norm": 0.4336363673210144, + "learning_rate": 0.00010284771052392155, + "loss": 1.4281, + "step": 37390 + }, + { + "epoch": 0.4858789915584194, + "grad_norm": 0.30372154712677, + "learning_rate": 0.00010284511106201018, + "loss": 1.1803, + "step": 37391 + }, + { + "epoch": 0.4858919861023353, + "grad_norm": 0.4251910150051117, + "learning_rate": 0.0001028425116000988, + "loss": 1.3865, + "step": 37392 + }, + { + "epoch": 0.48590498064625115, + "grad_norm": 0.41322511434555054, + "learning_rate": 0.0001028399121381874, + "loss": 1.2287, + "step": 37393 + }, + { + "epoch": 0.48591797519016705, + "grad_norm": 0.3717118501663208, + "learning_rate": 0.00010283731267627602, + "loss": 1.3218, + "step": 37394 + }, + { + "epoch": 0.4859309697340829, + "grad_norm": 0.36697232723236084, + "learning_rate": 0.00010283471321436464, + "loss": 1.4102, + "step": 37395 + }, + { + "epoch": 0.4859439642779988, + "grad_norm": 0.34533339738845825, + "learning_rate": 0.00010283211375245325, + "loss": 1.4473, + "step": 37396 + }, + { + "epoch": 0.48595695882191464, + "grad_norm": 0.3668302595615387, + "learning_rate": 0.00010282951429054186, + "loss": 1.4159, + "step": 37397 + }, + { + "epoch": 0.48596995336583054, + "grad_norm": 0.34340932965278625, + "learning_rate": 0.00010282691482863047, + "loss": 1.4616, + "step": 37398 + }, + { + "epoch": 0.4859829479097464, + "grad_norm": 0.43444743752479553, + "learning_rate": 0.0001028243153667191, + "loss": 1.3618, + "step": 37399 + }, + { + "epoch": 0.4859959424536623, + "grad_norm": 0.42633339762687683, + "learning_rate": 0.00010282171590480771, + "loss": 1.4518, + "step": 37400 + }, + { + "epoch": 0.48600893699757813, + "grad_norm": 0.2840575575828552, + "learning_rate": 0.00010281911644289632, + "loss": 1.0753, + "step": 37401 + }, + { + "epoch": 0.48602193154149403, + "grad_norm": 0.3826866149902344, + "learning_rate": 0.00010281651698098493, + "loss": 1.2496, + "step": 37402 + }, + { + "epoch": 0.4860349260854099, + "grad_norm": 0.32163798809051514, + "learning_rate": 0.00010281391751907357, + "loss": 1.1408, + "step": 37403 + }, + { + "epoch": 0.4860479206293258, + "grad_norm": 0.43064308166503906, + "learning_rate": 0.00010281131805716218, + "loss": 1.364, + "step": 37404 + }, + { + "epoch": 0.4860609151732416, + "grad_norm": 0.34890252351760864, + "learning_rate": 0.00010280871859525079, + "loss": 1.5047, + "step": 37405 + }, + { + "epoch": 0.4860739097171575, + "grad_norm": 0.46419596672058105, + "learning_rate": 0.0001028061191333394, + "loss": 1.3789, + "step": 37406 + }, + { + "epoch": 0.48608690426107337, + "grad_norm": 0.439878910779953, + "learning_rate": 0.00010280351967142803, + "loss": 1.7154, + "step": 37407 + }, + { + "epoch": 0.48609989880498927, + "grad_norm": 0.47209402918815613, + "learning_rate": 0.00010280092020951664, + "loss": 1.2824, + "step": 37408 + }, + { + "epoch": 0.4861128933489051, + "grad_norm": 0.5429522395133972, + "learning_rate": 0.00010279832074760525, + "loss": 1.4048, + "step": 37409 + }, + { + "epoch": 0.486125887892821, + "grad_norm": 0.38462162017822266, + "learning_rate": 0.00010279572128569386, + "loss": 1.4376, + "step": 37410 + }, + { + "epoch": 0.48613888243673686, + "grad_norm": 0.3830171823501587, + "learning_rate": 0.00010279312182378248, + "loss": 1.4787, + "step": 37411 + }, + { + "epoch": 0.48615187698065276, + "grad_norm": 0.2915896475315094, + "learning_rate": 0.0001027905223618711, + "loss": 1.2151, + "step": 37412 + }, + { + "epoch": 0.4861648715245686, + "grad_norm": 0.523921549320221, + "learning_rate": 0.0001027879228999597, + "loss": 1.4017, + "step": 37413 + }, + { + "epoch": 0.4861778660684845, + "grad_norm": 0.4897431433200836, + "learning_rate": 0.00010278532343804832, + "loss": 1.3038, + "step": 37414 + }, + { + "epoch": 0.48619086061240036, + "grad_norm": 0.3100225627422333, + "learning_rate": 0.00010278272397613695, + "loss": 1.1286, + "step": 37415 + }, + { + "epoch": 0.48620385515631626, + "grad_norm": 0.3244931697845459, + "learning_rate": 0.00010278012451422557, + "loss": 1.3238, + "step": 37416 + }, + { + "epoch": 0.4862168497002321, + "grad_norm": 0.5111957788467407, + "learning_rate": 0.00010277752505231418, + "loss": 1.603, + "step": 37417 + }, + { + "epoch": 0.486229844244148, + "grad_norm": 0.40632325410842896, + "learning_rate": 0.00010277492559040279, + "loss": 1.2489, + "step": 37418 + }, + { + "epoch": 0.48624283878806385, + "grad_norm": 0.5140093564987183, + "learning_rate": 0.00010277232612849141, + "loss": 1.3283, + "step": 37419 + }, + { + "epoch": 0.48625583333197975, + "grad_norm": 0.34243983030319214, + "learning_rate": 0.00010276972666658002, + "loss": 1.3781, + "step": 37420 + }, + { + "epoch": 0.4862688278758956, + "grad_norm": 0.4177440106868744, + "learning_rate": 0.00010276712720466863, + "loss": 1.2799, + "step": 37421 + }, + { + "epoch": 0.4862818224198115, + "grad_norm": 0.4968373775482178, + "learning_rate": 0.00010276452774275724, + "loss": 1.4755, + "step": 37422 + }, + { + "epoch": 0.48629481696372734, + "grad_norm": 0.44247928261756897, + "learning_rate": 0.00010276192828084588, + "loss": 1.3122, + "step": 37423 + }, + { + "epoch": 0.48630781150764324, + "grad_norm": 0.4300784170627594, + "learning_rate": 0.00010275932881893448, + "loss": 1.2055, + "step": 37424 + }, + { + "epoch": 0.4863208060515591, + "grad_norm": 0.3489111363887787, + "learning_rate": 0.00010275672935702309, + "loss": 1.1697, + "step": 37425 + }, + { + "epoch": 0.486333800595475, + "grad_norm": 0.4759625792503357, + "learning_rate": 0.0001027541298951117, + "loss": 1.3755, + "step": 37426 + }, + { + "epoch": 0.48634679513939083, + "grad_norm": 0.45506060123443604, + "learning_rate": 0.00010275153043320034, + "loss": 1.4272, + "step": 37427 + }, + { + "epoch": 0.48635978968330673, + "grad_norm": 0.5025519728660583, + "learning_rate": 0.00010274893097128895, + "loss": 1.5254, + "step": 37428 + }, + { + "epoch": 0.4863727842272226, + "grad_norm": 0.4095103442668915, + "learning_rate": 0.00010274633150937756, + "loss": 1.5412, + "step": 37429 + }, + { + "epoch": 0.4863857787711385, + "grad_norm": 0.39778974652290344, + "learning_rate": 0.00010274373204746619, + "loss": 1.3251, + "step": 37430 + }, + { + "epoch": 0.4863987733150543, + "grad_norm": 0.4651887118816376, + "learning_rate": 0.0001027411325855548, + "loss": 1.5838, + "step": 37431 + }, + { + "epoch": 0.4864117678589702, + "grad_norm": 0.3230370283126831, + "learning_rate": 0.00010273853312364341, + "loss": 1.7236, + "step": 37432 + }, + { + "epoch": 0.48642476240288607, + "grad_norm": 0.3638368248939514, + "learning_rate": 0.00010273593366173202, + "loss": 1.4804, + "step": 37433 + }, + { + "epoch": 0.48643775694680197, + "grad_norm": 0.413433313369751, + "learning_rate": 0.00010273333419982066, + "loss": 1.5228, + "step": 37434 + }, + { + "epoch": 0.4864507514907178, + "grad_norm": 0.30923259258270264, + "learning_rate": 0.00010273073473790927, + "loss": 1.2484, + "step": 37435 + }, + { + "epoch": 0.4864637460346337, + "grad_norm": 0.5297190546989441, + "learning_rate": 0.00010272813527599788, + "loss": 1.6067, + "step": 37436 + }, + { + "epoch": 0.48647674057854956, + "grad_norm": 0.49403342604637146, + "learning_rate": 0.00010272553581408648, + "loss": 1.4081, + "step": 37437 + }, + { + "epoch": 0.48648973512246546, + "grad_norm": 0.4745197892189026, + "learning_rate": 0.00010272293635217511, + "loss": 1.4268, + "step": 37438 + }, + { + "epoch": 0.4865027296663813, + "grad_norm": 0.3124939799308777, + "learning_rate": 0.00010272033689026372, + "loss": 1.4561, + "step": 37439 + }, + { + "epoch": 0.4865157242102972, + "grad_norm": 0.35278668999671936, + "learning_rate": 0.00010271773742835234, + "loss": 1.5195, + "step": 37440 + }, + { + "epoch": 0.48652871875421305, + "grad_norm": 0.43536508083343506, + "learning_rate": 0.00010271513796644095, + "loss": 1.3404, + "step": 37441 + }, + { + "epoch": 0.48654171329812895, + "grad_norm": 0.45758774876594543, + "learning_rate": 0.00010271253850452957, + "loss": 1.402, + "step": 37442 + }, + { + "epoch": 0.4865547078420448, + "grad_norm": 0.4135849177837372, + "learning_rate": 0.00010270993904261818, + "loss": 1.5382, + "step": 37443 + }, + { + "epoch": 0.4865677023859607, + "grad_norm": 0.4844321310520172, + "learning_rate": 0.00010270733958070679, + "loss": 1.4689, + "step": 37444 + }, + { + "epoch": 0.48658069692987654, + "grad_norm": 0.3898744285106659, + "learning_rate": 0.0001027047401187954, + "loss": 1.2793, + "step": 37445 + }, + { + "epoch": 0.48659369147379244, + "grad_norm": 0.4363359212875366, + "learning_rate": 0.00010270214065688404, + "loss": 1.4954, + "step": 37446 + }, + { + "epoch": 0.4866066860177083, + "grad_norm": 0.30738916993141174, + "learning_rate": 0.00010269954119497265, + "loss": 1.1919, + "step": 37447 + }, + { + "epoch": 0.4866196805616242, + "grad_norm": 0.368707537651062, + "learning_rate": 0.00010269694173306126, + "loss": 1.2665, + "step": 37448 + }, + { + "epoch": 0.48663267510554004, + "grad_norm": 0.3278559446334839, + "learning_rate": 0.00010269434227114987, + "loss": 1.2652, + "step": 37449 + }, + { + "epoch": 0.48664566964945594, + "grad_norm": 0.40336546301841736, + "learning_rate": 0.0001026917428092385, + "loss": 1.5781, + "step": 37450 + }, + { + "epoch": 0.4866586641933718, + "grad_norm": 0.31379133462905884, + "learning_rate": 0.00010268914334732711, + "loss": 1.3177, + "step": 37451 + }, + { + "epoch": 0.4866716587372877, + "grad_norm": 0.30316630005836487, + "learning_rate": 0.00010268654388541572, + "loss": 1.3077, + "step": 37452 + }, + { + "epoch": 0.4866846532812035, + "grad_norm": 0.3171525299549103, + "learning_rate": 0.00010268394442350433, + "loss": 1.1972, + "step": 37453 + }, + { + "epoch": 0.48669764782511943, + "grad_norm": 0.35748907923698425, + "learning_rate": 0.00010268134496159296, + "loss": 1.3951, + "step": 37454 + }, + { + "epoch": 0.48671064236903533, + "grad_norm": 0.3408093750476837, + "learning_rate": 0.00010267874549968157, + "loss": 1.3051, + "step": 37455 + }, + { + "epoch": 0.4867236369129512, + "grad_norm": 0.3556941747665405, + "learning_rate": 0.00010267614603777018, + "loss": 1.3009, + "step": 37456 + }, + { + "epoch": 0.4867366314568671, + "grad_norm": 0.4327068626880646, + "learning_rate": 0.00010267354657585879, + "loss": 1.4912, + "step": 37457 + }, + { + "epoch": 0.4867496260007829, + "grad_norm": 0.29644715785980225, + "learning_rate": 0.00010267094711394743, + "loss": 1.2904, + "step": 37458 + }, + { + "epoch": 0.4867626205446988, + "grad_norm": 0.4700661599636078, + "learning_rate": 0.00010266834765203604, + "loss": 1.4357, + "step": 37459 + }, + { + "epoch": 0.48677561508861467, + "grad_norm": 0.3661230504512787, + "learning_rate": 0.00010266574819012465, + "loss": 1.4969, + "step": 37460 + }, + { + "epoch": 0.48678860963253057, + "grad_norm": 0.4363194704055786, + "learning_rate": 0.00010266314872821326, + "loss": 1.256, + "step": 37461 + }, + { + "epoch": 0.4868016041764464, + "grad_norm": 0.4148009121417999, + "learning_rate": 0.00010266054926630188, + "loss": 1.2995, + "step": 37462 + }, + { + "epoch": 0.4868145987203623, + "grad_norm": 0.28058937191963196, + "learning_rate": 0.0001026579498043905, + "loss": 1.3074, + "step": 37463 + }, + { + "epoch": 0.48682759326427816, + "grad_norm": 0.4307621121406555, + "learning_rate": 0.0001026553503424791, + "loss": 1.4343, + "step": 37464 + }, + { + "epoch": 0.48684058780819406, + "grad_norm": 0.4508236050605774, + "learning_rate": 0.00010265275088056774, + "loss": 1.398, + "step": 37465 + }, + { + "epoch": 0.4868535823521099, + "grad_norm": 0.3972441554069519, + "learning_rate": 0.00010265015141865634, + "loss": 1.3853, + "step": 37466 + }, + { + "epoch": 0.4868665768960258, + "grad_norm": 0.37339210510253906, + "learning_rate": 0.00010264755195674495, + "loss": 1.3517, + "step": 37467 + }, + { + "epoch": 0.48687957143994165, + "grad_norm": 0.43052566051483154, + "learning_rate": 0.00010264495249483356, + "loss": 1.5119, + "step": 37468 + }, + { + "epoch": 0.48689256598385755, + "grad_norm": 0.38009142875671387, + "learning_rate": 0.0001026423530329222, + "loss": 1.4826, + "step": 37469 + }, + { + "epoch": 0.4869055605277734, + "grad_norm": 0.4008859694004059, + "learning_rate": 0.00010263975357101081, + "loss": 1.374, + "step": 37470 + }, + { + "epoch": 0.4869185550716893, + "grad_norm": 0.38185590505599976, + "learning_rate": 0.00010263715410909942, + "loss": 1.417, + "step": 37471 + }, + { + "epoch": 0.48693154961560514, + "grad_norm": 0.4229472875595093, + "learning_rate": 0.00010263455464718803, + "loss": 1.3421, + "step": 37472 + }, + { + "epoch": 0.48694454415952104, + "grad_norm": 0.3342346251010895, + "learning_rate": 0.00010263195518527666, + "loss": 1.298, + "step": 37473 + }, + { + "epoch": 0.4869575387034369, + "grad_norm": 0.40374815464019775, + "learning_rate": 0.00010262935572336527, + "loss": 1.3916, + "step": 37474 + }, + { + "epoch": 0.4869705332473528, + "grad_norm": 0.30591002106666565, + "learning_rate": 0.00010262675626145388, + "loss": 1.1477, + "step": 37475 + }, + { + "epoch": 0.48698352779126863, + "grad_norm": 0.3657781183719635, + "learning_rate": 0.00010262415679954249, + "loss": 1.3495, + "step": 37476 + }, + { + "epoch": 0.48699652233518453, + "grad_norm": 0.3618139326572418, + "learning_rate": 0.00010262155733763113, + "loss": 1.2657, + "step": 37477 + }, + { + "epoch": 0.4870095168791004, + "grad_norm": 0.3728991448879242, + "learning_rate": 0.00010261895787571974, + "loss": 1.4445, + "step": 37478 + }, + { + "epoch": 0.4870225114230163, + "grad_norm": 0.39317935705184937, + "learning_rate": 0.00010261635841380834, + "loss": 1.4489, + "step": 37479 + }, + { + "epoch": 0.4870355059669321, + "grad_norm": 0.37254852056503296, + "learning_rate": 0.00010261375895189695, + "loss": 1.1889, + "step": 37480 + }, + { + "epoch": 0.487048500510848, + "grad_norm": 0.4452288746833801, + "learning_rate": 0.00010261115948998559, + "loss": 1.4366, + "step": 37481 + }, + { + "epoch": 0.48706149505476387, + "grad_norm": 0.4262430965900421, + "learning_rate": 0.0001026085600280742, + "loss": 1.5928, + "step": 37482 + }, + { + "epoch": 0.48707448959867977, + "grad_norm": 0.428069144487381, + "learning_rate": 0.00010260596056616281, + "loss": 1.2481, + "step": 37483 + }, + { + "epoch": 0.4870874841425956, + "grad_norm": 0.40225380659103394, + "learning_rate": 0.00010260336110425142, + "loss": 1.3509, + "step": 37484 + }, + { + "epoch": 0.4871004786865115, + "grad_norm": 0.32335609197616577, + "learning_rate": 0.00010260076164234004, + "loss": 1.4448, + "step": 37485 + }, + { + "epoch": 0.48711347323042736, + "grad_norm": 0.41441813111305237, + "learning_rate": 0.00010259816218042866, + "loss": 1.4519, + "step": 37486 + }, + { + "epoch": 0.48712646777434326, + "grad_norm": 0.4198852777481079, + "learning_rate": 0.00010259556271851727, + "loss": 1.4545, + "step": 37487 + }, + { + "epoch": 0.4871394623182591, + "grad_norm": 0.384016215801239, + "learning_rate": 0.00010259296325660588, + "loss": 1.1892, + "step": 37488 + }, + { + "epoch": 0.487152456862175, + "grad_norm": 0.4397445619106293, + "learning_rate": 0.00010259036379469452, + "loss": 1.4366, + "step": 37489 + }, + { + "epoch": 0.48716545140609085, + "grad_norm": 0.4898951053619385, + "learning_rate": 0.00010258776433278313, + "loss": 1.4704, + "step": 37490 + }, + { + "epoch": 0.48717844595000676, + "grad_norm": 0.4236140549182892, + "learning_rate": 0.00010258516487087174, + "loss": 1.3478, + "step": 37491 + }, + { + "epoch": 0.4871914404939226, + "grad_norm": 0.3601870834827423, + "learning_rate": 0.00010258256540896033, + "loss": 1.3734, + "step": 37492 + }, + { + "epoch": 0.4872044350378385, + "grad_norm": 0.3674706518650055, + "learning_rate": 0.00010257996594704897, + "loss": 1.3232, + "step": 37493 + }, + { + "epoch": 0.48721742958175435, + "grad_norm": 0.4053773581981659, + "learning_rate": 0.00010257736648513758, + "loss": 1.4695, + "step": 37494 + }, + { + "epoch": 0.48723042412567025, + "grad_norm": 0.3641587793827057, + "learning_rate": 0.0001025747670232262, + "loss": 1.5134, + "step": 37495 + }, + { + "epoch": 0.4872434186695861, + "grad_norm": 0.3774276375770569, + "learning_rate": 0.0001025721675613148, + "loss": 1.5459, + "step": 37496 + }, + { + "epoch": 0.487256413213502, + "grad_norm": 0.48215439915657043, + "learning_rate": 0.00010256956809940343, + "loss": 1.4752, + "step": 37497 + }, + { + "epoch": 0.48726940775741784, + "grad_norm": 0.32459592819213867, + "learning_rate": 0.00010256696863749204, + "loss": 1.5509, + "step": 37498 + }, + { + "epoch": 0.48728240230133374, + "grad_norm": 0.3688407242298126, + "learning_rate": 0.00010256436917558065, + "loss": 1.5481, + "step": 37499 + }, + { + "epoch": 0.4872953968452496, + "grad_norm": 0.3435598909854889, + "learning_rate": 0.00010256176971366926, + "loss": 1.4308, + "step": 37500 + }, + { + "epoch": 0.4873083913891655, + "grad_norm": 0.4777471125125885, + "learning_rate": 0.0001025591702517579, + "loss": 1.5712, + "step": 37501 + }, + { + "epoch": 0.48732138593308133, + "grad_norm": 0.414863646030426, + "learning_rate": 0.00010255657078984651, + "loss": 1.3593, + "step": 37502 + }, + { + "epoch": 0.48733438047699723, + "grad_norm": 0.31028228998184204, + "learning_rate": 0.00010255397132793512, + "loss": 1.0727, + "step": 37503 + }, + { + "epoch": 0.4873473750209131, + "grad_norm": 0.39195629954338074, + "learning_rate": 0.00010255137186602375, + "loss": 1.2482, + "step": 37504 + }, + { + "epoch": 0.487360369564829, + "grad_norm": 0.4595802426338196, + "learning_rate": 0.00010254877240411236, + "loss": 1.4205, + "step": 37505 + }, + { + "epoch": 0.4873733641087448, + "grad_norm": 0.36123180389404297, + "learning_rate": 0.00010254617294220097, + "loss": 1.3052, + "step": 37506 + }, + { + "epoch": 0.4873863586526607, + "grad_norm": 0.28263694047927856, + "learning_rate": 0.00010254357348028958, + "loss": 1.182, + "step": 37507 + }, + { + "epoch": 0.48739935319657657, + "grad_norm": 0.36781421303749084, + "learning_rate": 0.0001025409740183782, + "loss": 1.3414, + "step": 37508 + }, + { + "epoch": 0.48741234774049247, + "grad_norm": 0.36337050795555115, + "learning_rate": 0.00010253837455646682, + "loss": 1.397, + "step": 37509 + }, + { + "epoch": 0.4874253422844083, + "grad_norm": 0.4091852307319641, + "learning_rate": 0.00010253577509455543, + "loss": 1.5214, + "step": 37510 + }, + { + "epoch": 0.4874383368283242, + "grad_norm": 0.5029363036155701, + "learning_rate": 0.00010253317563264404, + "loss": 1.4945, + "step": 37511 + }, + { + "epoch": 0.48745133137224006, + "grad_norm": 0.3988547623157501, + "learning_rate": 0.00010253057617073268, + "loss": 1.3492, + "step": 37512 + }, + { + "epoch": 0.48746432591615596, + "grad_norm": 0.3847947120666504, + "learning_rate": 0.00010252797670882129, + "loss": 1.4872, + "step": 37513 + }, + { + "epoch": 0.4874773204600718, + "grad_norm": 0.4792209267616272, + "learning_rate": 0.0001025253772469099, + "loss": 1.4328, + "step": 37514 + }, + { + "epoch": 0.4874903150039877, + "grad_norm": 0.39129284024238586, + "learning_rate": 0.00010252277778499851, + "loss": 1.2987, + "step": 37515 + }, + { + "epoch": 0.48750330954790355, + "grad_norm": 0.4576675593852997, + "learning_rate": 0.00010252017832308713, + "loss": 1.5845, + "step": 37516 + }, + { + "epoch": 0.48751630409181945, + "grad_norm": 0.4679252803325653, + "learning_rate": 0.00010251757886117574, + "loss": 1.4264, + "step": 37517 + }, + { + "epoch": 0.4875292986357353, + "grad_norm": 0.3119916021823883, + "learning_rate": 0.00010251497939926435, + "loss": 1.4119, + "step": 37518 + }, + { + "epoch": 0.4875422931796512, + "grad_norm": 0.34806376695632935, + "learning_rate": 0.00010251237993735297, + "loss": 1.3144, + "step": 37519 + }, + { + "epoch": 0.48755528772356704, + "grad_norm": 0.5126516819000244, + "learning_rate": 0.0001025097804754416, + "loss": 1.5164, + "step": 37520 + }, + { + "epoch": 0.48756828226748294, + "grad_norm": 0.60047847032547, + "learning_rate": 0.0001025071810135302, + "loss": 1.4571, + "step": 37521 + }, + { + "epoch": 0.4875812768113988, + "grad_norm": 0.43262067437171936, + "learning_rate": 0.00010250458155161881, + "loss": 1.3572, + "step": 37522 + }, + { + "epoch": 0.4875942713553147, + "grad_norm": 0.3384360074996948, + "learning_rate": 0.00010250198208970742, + "loss": 1.2004, + "step": 37523 + }, + { + "epoch": 0.48760726589923054, + "grad_norm": 0.396616131067276, + "learning_rate": 0.00010249938262779606, + "loss": 1.3489, + "step": 37524 + }, + { + "epoch": 0.48762026044314644, + "grad_norm": 0.42384424805641174, + "learning_rate": 0.00010249678316588467, + "loss": 1.3587, + "step": 37525 + }, + { + "epoch": 0.4876332549870623, + "grad_norm": 0.38283175230026245, + "learning_rate": 0.00010249418370397328, + "loss": 1.182, + "step": 37526 + }, + { + "epoch": 0.4876462495309782, + "grad_norm": 0.2308875024318695, + "learning_rate": 0.0001024915842420619, + "loss": 1.1661, + "step": 37527 + }, + { + "epoch": 0.487659244074894, + "grad_norm": 0.36702489852905273, + "learning_rate": 0.00010248898478015052, + "loss": 1.3899, + "step": 37528 + }, + { + "epoch": 0.48767223861880993, + "grad_norm": 0.441802978515625, + "learning_rate": 0.00010248638531823913, + "loss": 1.2033, + "step": 37529 + }, + { + "epoch": 0.4876852331627258, + "grad_norm": 0.4085250794887543, + "learning_rate": 0.00010248378585632774, + "loss": 1.5424, + "step": 37530 + }, + { + "epoch": 0.4876982277066417, + "grad_norm": 0.33680853247642517, + "learning_rate": 0.00010248118639441635, + "loss": 1.3212, + "step": 37531 + }, + { + "epoch": 0.4877112222505576, + "grad_norm": 0.5441884398460388, + "learning_rate": 0.00010247858693250499, + "loss": 1.4323, + "step": 37532 + }, + { + "epoch": 0.4877242167944734, + "grad_norm": 0.3772077262401581, + "learning_rate": 0.00010247598747059359, + "loss": 1.5213, + "step": 37533 + }, + { + "epoch": 0.4877372113383893, + "grad_norm": 0.3871772587299347, + "learning_rate": 0.0001024733880086822, + "loss": 1.2222, + "step": 37534 + }, + { + "epoch": 0.48775020588230517, + "grad_norm": 0.39174339175224304, + "learning_rate": 0.00010247078854677081, + "loss": 1.4126, + "step": 37535 + }, + { + "epoch": 0.48776320042622107, + "grad_norm": 0.39793696999549866, + "learning_rate": 0.00010246818908485945, + "loss": 1.5365, + "step": 37536 + }, + { + "epoch": 0.4877761949701369, + "grad_norm": 0.4222224950790405, + "learning_rate": 0.00010246558962294806, + "loss": 1.4894, + "step": 37537 + }, + { + "epoch": 0.4877891895140528, + "grad_norm": 0.30023080110549927, + "learning_rate": 0.00010246299016103667, + "loss": 1.2925, + "step": 37538 + }, + { + "epoch": 0.48780218405796866, + "grad_norm": 0.4016575515270233, + "learning_rate": 0.00010246039069912528, + "loss": 1.2311, + "step": 37539 + }, + { + "epoch": 0.48781517860188456, + "grad_norm": 0.4106665253639221, + "learning_rate": 0.0001024577912372139, + "loss": 1.3298, + "step": 37540 + }, + { + "epoch": 0.4878281731458004, + "grad_norm": 0.4347458481788635, + "learning_rate": 0.00010245519177530251, + "loss": 1.4243, + "step": 37541 + }, + { + "epoch": 0.4878411676897163, + "grad_norm": 0.496207594871521, + "learning_rate": 0.00010245259231339113, + "loss": 1.5363, + "step": 37542 + }, + { + "epoch": 0.48785416223363215, + "grad_norm": 0.3152560591697693, + "learning_rate": 0.00010244999285147976, + "loss": 1.3085, + "step": 37543 + }, + { + "epoch": 0.48786715677754805, + "grad_norm": 0.4746745228767395, + "learning_rate": 0.00010244739338956837, + "loss": 1.4684, + "step": 37544 + }, + { + "epoch": 0.4878801513214639, + "grad_norm": 0.42024633288383484, + "learning_rate": 0.00010244479392765699, + "loss": 1.2928, + "step": 37545 + }, + { + "epoch": 0.4878931458653798, + "grad_norm": 0.4881391227245331, + "learning_rate": 0.00010244219446574558, + "loss": 1.4311, + "step": 37546 + }, + { + "epoch": 0.48790614040929564, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010243959500383422, + "loss": 1.3778, + "step": 37547 + }, + { + "epoch": 0.48791913495321154, + "grad_norm": 0.3820206820964813, + "learning_rate": 0.00010243699554192283, + "loss": 1.4847, + "step": 37548 + }, + { + "epoch": 0.4879321294971274, + "grad_norm": 0.460351824760437, + "learning_rate": 0.00010243439608001144, + "loss": 1.5352, + "step": 37549 + }, + { + "epoch": 0.4879451240410433, + "grad_norm": 0.4360557198524475, + "learning_rate": 0.00010243179661810005, + "loss": 1.3239, + "step": 37550 + }, + { + "epoch": 0.48795811858495913, + "grad_norm": 0.4283132553100586, + "learning_rate": 0.00010242919715618868, + "loss": 1.5656, + "step": 37551 + }, + { + "epoch": 0.48797111312887503, + "grad_norm": 0.5098937749862671, + "learning_rate": 0.00010242659769427729, + "loss": 1.4628, + "step": 37552 + }, + { + "epoch": 0.4879841076727909, + "grad_norm": 0.5641338229179382, + "learning_rate": 0.0001024239982323659, + "loss": 1.3679, + "step": 37553 + }, + { + "epoch": 0.4879971022167068, + "grad_norm": 0.35181304812431335, + "learning_rate": 0.00010242139877045451, + "loss": 1.2374, + "step": 37554 + }, + { + "epoch": 0.4880100967606226, + "grad_norm": 0.4925462305545807, + "learning_rate": 0.00010241879930854315, + "loss": 1.3761, + "step": 37555 + }, + { + "epoch": 0.4880230913045385, + "grad_norm": 0.4691649377346039, + "learning_rate": 0.00010241619984663176, + "loss": 1.412, + "step": 37556 + }, + { + "epoch": 0.48803608584845437, + "grad_norm": 0.3909376859664917, + "learning_rate": 0.00010241360038472037, + "loss": 1.4768, + "step": 37557 + }, + { + "epoch": 0.48804908039237027, + "grad_norm": 0.42458003759384155, + "learning_rate": 0.00010241100092280898, + "loss": 1.4398, + "step": 37558 + }, + { + "epoch": 0.4880620749362861, + "grad_norm": 0.4558770954608917, + "learning_rate": 0.0001024084014608976, + "loss": 1.256, + "step": 37559 + }, + { + "epoch": 0.488075069480202, + "grad_norm": 0.39437049627304077, + "learning_rate": 0.00010240580199898622, + "loss": 1.6077, + "step": 37560 + }, + { + "epoch": 0.48808806402411786, + "grad_norm": 0.3441692590713501, + "learning_rate": 0.00010240320253707483, + "loss": 1.4481, + "step": 37561 + }, + { + "epoch": 0.48810105856803376, + "grad_norm": 0.3714215159416199, + "learning_rate": 0.00010240060307516344, + "loss": 1.2567, + "step": 37562 + }, + { + "epoch": 0.4881140531119496, + "grad_norm": 0.3831939399242401, + "learning_rate": 0.00010239800361325206, + "loss": 1.3913, + "step": 37563 + }, + { + "epoch": 0.4881270476558655, + "grad_norm": 0.5002745389938354, + "learning_rate": 0.00010239540415134067, + "loss": 1.2416, + "step": 37564 + }, + { + "epoch": 0.48814004219978135, + "grad_norm": 0.46416550874710083, + "learning_rate": 0.00010239280468942929, + "loss": 1.6217, + "step": 37565 + }, + { + "epoch": 0.48815303674369726, + "grad_norm": 0.47698646783828735, + "learning_rate": 0.0001023902052275179, + "loss": 1.243, + "step": 37566 + }, + { + "epoch": 0.4881660312876131, + "grad_norm": 0.3888038992881775, + "learning_rate": 0.00010238760576560653, + "loss": 1.3425, + "step": 37567 + }, + { + "epoch": 0.488179025831529, + "grad_norm": 0.36654165387153625, + "learning_rate": 0.00010238500630369514, + "loss": 1.4717, + "step": 37568 + }, + { + "epoch": 0.48819202037544485, + "grad_norm": 0.3708533048629761, + "learning_rate": 0.00010238240684178376, + "loss": 1.4543, + "step": 37569 + }, + { + "epoch": 0.48820501491936075, + "grad_norm": 0.3568820059299469, + "learning_rate": 0.00010237980737987237, + "loss": 1.3072, + "step": 37570 + }, + { + "epoch": 0.4882180094632766, + "grad_norm": 0.4768930971622467, + "learning_rate": 0.00010237720791796099, + "loss": 1.3287, + "step": 37571 + }, + { + "epoch": 0.4882310040071925, + "grad_norm": 0.3724338114261627, + "learning_rate": 0.0001023746084560496, + "loss": 1.3614, + "step": 37572 + }, + { + "epoch": 0.48824399855110834, + "grad_norm": 0.3386293947696686, + "learning_rate": 0.00010237200899413821, + "loss": 1.096, + "step": 37573 + }, + { + "epoch": 0.48825699309502424, + "grad_norm": 0.4686654508113861, + "learning_rate": 0.00010236940953222682, + "loss": 1.6467, + "step": 37574 + }, + { + "epoch": 0.4882699876389401, + "grad_norm": 0.38210493326187134, + "learning_rate": 0.00010236681007031545, + "loss": 1.5063, + "step": 37575 + }, + { + "epoch": 0.488282982182856, + "grad_norm": 0.38487550616264343, + "learning_rate": 0.00010236421060840406, + "loss": 1.2872, + "step": 37576 + }, + { + "epoch": 0.48829597672677183, + "grad_norm": 0.46951600909233093, + "learning_rate": 0.00010236161114649267, + "loss": 1.5463, + "step": 37577 + }, + { + "epoch": 0.48830897127068773, + "grad_norm": 0.3652660846710205, + "learning_rate": 0.00010235901168458131, + "loss": 1.3737, + "step": 37578 + }, + { + "epoch": 0.4883219658146036, + "grad_norm": 0.3704793155193329, + "learning_rate": 0.00010235641222266992, + "loss": 1.4383, + "step": 37579 + }, + { + "epoch": 0.4883349603585195, + "grad_norm": 0.39712756872177124, + "learning_rate": 0.00010235381276075853, + "loss": 1.4024, + "step": 37580 + }, + { + "epoch": 0.4883479549024353, + "grad_norm": 0.39344578981399536, + "learning_rate": 0.00010235121329884714, + "loss": 1.3374, + "step": 37581 + }, + { + "epoch": 0.4883609494463512, + "grad_norm": 0.41694313287734985, + "learning_rate": 0.00010234861383693577, + "loss": 1.3853, + "step": 37582 + }, + { + "epoch": 0.48837394399026707, + "grad_norm": 0.3142113983631134, + "learning_rate": 0.00010234601437502438, + "loss": 1.2767, + "step": 37583 + }, + { + "epoch": 0.48838693853418297, + "grad_norm": 0.39100587368011475, + "learning_rate": 0.00010234341491311299, + "loss": 1.2014, + "step": 37584 + }, + { + "epoch": 0.4883999330780988, + "grad_norm": 0.41757529973983765, + "learning_rate": 0.0001023408154512016, + "loss": 1.5005, + "step": 37585 + }, + { + "epoch": 0.4884129276220147, + "grad_norm": 0.3109648525714874, + "learning_rate": 0.00010233821598929024, + "loss": 1.3451, + "step": 37586 + }, + { + "epoch": 0.48842592216593056, + "grad_norm": 0.4887288510799408, + "learning_rate": 0.00010233561652737885, + "loss": 1.5348, + "step": 37587 + }, + { + "epoch": 0.48843891670984646, + "grad_norm": 0.3605212867259979, + "learning_rate": 0.00010233301706546744, + "loss": 1.4812, + "step": 37588 + }, + { + "epoch": 0.4884519112537623, + "grad_norm": 0.41585421562194824, + "learning_rate": 0.00010233041760355606, + "loss": 1.3376, + "step": 37589 + }, + { + "epoch": 0.4884649057976782, + "grad_norm": 0.47421103715896606, + "learning_rate": 0.0001023278181416447, + "loss": 1.6893, + "step": 37590 + }, + { + "epoch": 0.48847790034159405, + "grad_norm": 0.40023812651634216, + "learning_rate": 0.0001023252186797333, + "loss": 1.4516, + "step": 37591 + }, + { + "epoch": 0.48849089488550995, + "grad_norm": 0.40364524722099304, + "learning_rate": 0.00010232261921782192, + "loss": 1.2565, + "step": 37592 + }, + { + "epoch": 0.4885038894294258, + "grad_norm": 0.4134129583835602, + "learning_rate": 0.00010232001975591053, + "loss": 1.366, + "step": 37593 + }, + { + "epoch": 0.4885168839733417, + "grad_norm": 0.6423876881599426, + "learning_rate": 0.00010231742029399915, + "loss": 1.455, + "step": 37594 + }, + { + "epoch": 0.48852987851725754, + "grad_norm": 0.4163878262042999, + "learning_rate": 0.00010231482083208776, + "loss": 1.4924, + "step": 37595 + }, + { + "epoch": 0.48854287306117344, + "grad_norm": 0.3769245147705078, + "learning_rate": 0.00010231222137017637, + "loss": 1.5278, + "step": 37596 + }, + { + "epoch": 0.4885558676050893, + "grad_norm": 0.3027964234352112, + "learning_rate": 0.00010230962190826498, + "loss": 1.2966, + "step": 37597 + }, + { + "epoch": 0.4885688621490052, + "grad_norm": 0.47191503643989563, + "learning_rate": 0.00010230702244635362, + "loss": 1.3539, + "step": 37598 + }, + { + "epoch": 0.48858185669292103, + "grad_norm": 0.46247878670692444, + "learning_rate": 0.00010230442298444223, + "loss": 1.5996, + "step": 37599 + }, + { + "epoch": 0.48859485123683694, + "grad_norm": 0.5506212115287781, + "learning_rate": 0.00010230182352253084, + "loss": 1.587, + "step": 37600 + }, + { + "epoch": 0.4886078457807528, + "grad_norm": 0.34305495023727417, + "learning_rate": 0.00010229922406061944, + "loss": 1.2006, + "step": 37601 + }, + { + "epoch": 0.4886208403246687, + "grad_norm": 0.3670308291912079, + "learning_rate": 0.00010229662459870808, + "loss": 1.5751, + "step": 37602 + }, + { + "epoch": 0.4886338348685845, + "grad_norm": 0.38744115829467773, + "learning_rate": 0.00010229402513679669, + "loss": 1.2699, + "step": 37603 + }, + { + "epoch": 0.4886468294125004, + "grad_norm": 0.3548983037471771, + "learning_rate": 0.0001022914256748853, + "loss": 1.4695, + "step": 37604 + }, + { + "epoch": 0.4886598239564163, + "grad_norm": 0.4576367139816284, + "learning_rate": 0.00010228882621297391, + "loss": 1.2714, + "step": 37605 + }, + { + "epoch": 0.4886728185003322, + "grad_norm": 0.3778396546840668, + "learning_rate": 0.00010228622675106254, + "loss": 1.3412, + "step": 37606 + }, + { + "epoch": 0.4886858130442481, + "grad_norm": 0.30134060978889465, + "learning_rate": 0.00010228362728915115, + "loss": 1.2226, + "step": 37607 + }, + { + "epoch": 0.4886988075881639, + "grad_norm": 0.4299182593822479, + "learning_rate": 0.00010228102782723976, + "loss": 1.6, + "step": 37608 + }, + { + "epoch": 0.4887118021320798, + "grad_norm": 0.38845837116241455, + "learning_rate": 0.00010227842836532837, + "loss": 1.4929, + "step": 37609 + }, + { + "epoch": 0.48872479667599567, + "grad_norm": 0.3826586902141571, + "learning_rate": 0.00010227582890341701, + "loss": 1.3567, + "step": 37610 + }, + { + "epoch": 0.48873779121991157, + "grad_norm": 0.40499347448349, + "learning_rate": 0.00010227322944150562, + "loss": 1.3441, + "step": 37611 + }, + { + "epoch": 0.4887507857638274, + "grad_norm": 0.42666059732437134, + "learning_rate": 0.00010227062997959423, + "loss": 1.3862, + "step": 37612 + }, + { + "epoch": 0.4887637803077433, + "grad_norm": 0.37470707297325134, + "learning_rate": 0.00010226803051768284, + "loss": 1.4041, + "step": 37613 + }, + { + "epoch": 0.48877677485165916, + "grad_norm": 0.33899909257888794, + "learning_rate": 0.00010226543105577146, + "loss": 1.4744, + "step": 37614 + }, + { + "epoch": 0.48878976939557506, + "grad_norm": 0.3962824046611786, + "learning_rate": 0.00010226283159386008, + "loss": 1.3148, + "step": 37615 + }, + { + "epoch": 0.4888027639394909, + "grad_norm": 0.4831918179988861, + "learning_rate": 0.00010226023213194869, + "loss": 1.3275, + "step": 37616 + }, + { + "epoch": 0.4888157584834068, + "grad_norm": 0.3771432638168335, + "learning_rate": 0.00010225763267003731, + "loss": 1.4849, + "step": 37617 + }, + { + "epoch": 0.48882875302732265, + "grad_norm": 0.32906803488731384, + "learning_rate": 0.00010225503320812592, + "loss": 1.3202, + "step": 37618 + }, + { + "epoch": 0.48884174757123855, + "grad_norm": 0.40759846568107605, + "learning_rate": 0.00010225243374621453, + "loss": 1.5794, + "step": 37619 + }, + { + "epoch": 0.4888547421151544, + "grad_norm": 0.42669370770454407, + "learning_rate": 0.00010224983428430314, + "loss": 1.2868, + "step": 37620 + }, + { + "epoch": 0.4888677366590703, + "grad_norm": 0.3904821276664734, + "learning_rate": 0.00010224723482239178, + "loss": 1.4206, + "step": 37621 + }, + { + "epoch": 0.48888073120298614, + "grad_norm": 0.4412972927093506, + "learning_rate": 0.00010224463536048039, + "loss": 1.4858, + "step": 37622 + }, + { + "epoch": 0.48889372574690204, + "grad_norm": 0.37078166007995605, + "learning_rate": 0.000102242035898569, + "loss": 1.732, + "step": 37623 + }, + { + "epoch": 0.4889067202908179, + "grad_norm": 0.3906558156013489, + "learning_rate": 0.00010223943643665761, + "loss": 1.4493, + "step": 37624 + }, + { + "epoch": 0.4889197148347338, + "grad_norm": 0.4475751519203186, + "learning_rate": 0.00010223683697474624, + "loss": 1.4804, + "step": 37625 + }, + { + "epoch": 0.48893270937864963, + "grad_norm": 0.3865770101547241, + "learning_rate": 0.00010223423751283485, + "loss": 1.5469, + "step": 37626 + }, + { + "epoch": 0.48894570392256553, + "grad_norm": 0.40844962000846863, + "learning_rate": 0.00010223163805092346, + "loss": 1.4274, + "step": 37627 + }, + { + "epoch": 0.4889586984664814, + "grad_norm": 0.3634379804134369, + "learning_rate": 0.00010222903858901207, + "loss": 1.5192, + "step": 37628 + }, + { + "epoch": 0.4889716930103973, + "grad_norm": 0.46022528409957886, + "learning_rate": 0.00010222643912710071, + "loss": 1.389, + "step": 37629 + }, + { + "epoch": 0.4889846875543131, + "grad_norm": 0.3965018391609192, + "learning_rate": 0.00010222383966518931, + "loss": 1.4472, + "step": 37630 + }, + { + "epoch": 0.488997682098229, + "grad_norm": 0.4179500639438629, + "learning_rate": 0.00010222124020327792, + "loss": 1.153, + "step": 37631 + }, + { + "epoch": 0.48901067664214487, + "grad_norm": 0.37954720854759216, + "learning_rate": 0.00010221864074136653, + "loss": 1.405, + "step": 37632 + }, + { + "epoch": 0.48902367118606077, + "grad_norm": 0.4242473542690277, + "learning_rate": 0.00010221604127945517, + "loss": 1.3825, + "step": 37633 + }, + { + "epoch": 0.4890366657299766, + "grad_norm": 0.4180866777896881, + "learning_rate": 0.00010221344181754378, + "loss": 1.4142, + "step": 37634 + }, + { + "epoch": 0.4890496602738925, + "grad_norm": 0.4099809527397156, + "learning_rate": 0.00010221084235563239, + "loss": 1.5064, + "step": 37635 + }, + { + "epoch": 0.48906265481780836, + "grad_norm": 0.356748104095459, + "learning_rate": 0.000102208242893721, + "loss": 1.5494, + "step": 37636 + }, + { + "epoch": 0.48907564936172426, + "grad_norm": 0.4501465857028961, + "learning_rate": 0.00010220564343180962, + "loss": 1.5439, + "step": 37637 + }, + { + "epoch": 0.4890886439056401, + "grad_norm": 0.4067637026309967, + "learning_rate": 0.00010220304396989824, + "loss": 1.37, + "step": 37638 + }, + { + "epoch": 0.489101638449556, + "grad_norm": 0.4249280095100403, + "learning_rate": 0.00010220044450798685, + "loss": 1.3493, + "step": 37639 + }, + { + "epoch": 0.48911463299347185, + "grad_norm": 0.3765154778957367, + "learning_rate": 0.00010219784504607546, + "loss": 1.4876, + "step": 37640 + }, + { + "epoch": 0.48912762753738775, + "grad_norm": 0.41758087277412415, + "learning_rate": 0.0001021952455841641, + "loss": 1.4805, + "step": 37641 + }, + { + "epoch": 0.4891406220813036, + "grad_norm": 0.43847647309303284, + "learning_rate": 0.0001021926461222527, + "loss": 1.4674, + "step": 37642 + }, + { + "epoch": 0.4891536166252195, + "grad_norm": 0.47920501232147217, + "learning_rate": 0.0001021900466603413, + "loss": 1.3971, + "step": 37643 + }, + { + "epoch": 0.48916661116913535, + "grad_norm": 0.3893868327140808, + "learning_rate": 0.00010218744719842991, + "loss": 1.4283, + "step": 37644 + }, + { + "epoch": 0.48917960571305125, + "grad_norm": 0.3971408009529114, + "learning_rate": 0.00010218484773651855, + "loss": 1.2859, + "step": 37645 + }, + { + "epoch": 0.4891926002569671, + "grad_norm": 0.3316631317138672, + "learning_rate": 0.00010218224827460716, + "loss": 1.3039, + "step": 37646 + }, + { + "epoch": 0.489205594800883, + "grad_norm": 0.3308313488960266, + "learning_rate": 0.00010217964881269577, + "loss": 1.3086, + "step": 37647 + }, + { + "epoch": 0.48921858934479884, + "grad_norm": 0.43697389960289, + "learning_rate": 0.00010217704935078439, + "loss": 1.306, + "step": 37648 + }, + { + "epoch": 0.48923158388871474, + "grad_norm": 0.44559353590011597, + "learning_rate": 0.00010217444988887301, + "loss": 1.4088, + "step": 37649 + }, + { + "epoch": 0.4892445784326306, + "grad_norm": 0.35831785202026367, + "learning_rate": 0.00010217185042696162, + "loss": 1.5006, + "step": 37650 + }, + { + "epoch": 0.4892575729765465, + "grad_norm": 0.4659610390663147, + "learning_rate": 0.00010216925096505023, + "loss": 1.4051, + "step": 37651 + }, + { + "epoch": 0.48927056752046233, + "grad_norm": 0.4211437404155731, + "learning_rate": 0.00010216665150313887, + "loss": 1.3132, + "step": 37652 + }, + { + "epoch": 0.48928356206437823, + "grad_norm": 0.3066316545009613, + "learning_rate": 0.00010216405204122748, + "loss": 1.25, + "step": 37653 + }, + { + "epoch": 0.4892965566082941, + "grad_norm": 0.4758424162864685, + "learning_rate": 0.00010216145257931609, + "loss": 1.6608, + "step": 37654 + }, + { + "epoch": 0.48930955115221, + "grad_norm": 0.3729299008846283, + "learning_rate": 0.0001021588531174047, + "loss": 1.5147, + "step": 37655 + }, + { + "epoch": 0.4893225456961258, + "grad_norm": 0.35858896374702454, + "learning_rate": 0.00010215625365549333, + "loss": 1.5821, + "step": 37656 + }, + { + "epoch": 0.4893355402400417, + "grad_norm": 0.28876805305480957, + "learning_rate": 0.00010215365419358194, + "loss": 1.3293, + "step": 37657 + }, + { + "epoch": 0.48934853478395757, + "grad_norm": 0.376742422580719, + "learning_rate": 0.00010215105473167055, + "loss": 1.413, + "step": 37658 + }, + { + "epoch": 0.48936152932787347, + "grad_norm": 0.43379685282707214, + "learning_rate": 0.00010214845526975916, + "loss": 1.4242, + "step": 37659 + }, + { + "epoch": 0.4893745238717893, + "grad_norm": 0.41608256101608276, + "learning_rate": 0.00010214585580784778, + "loss": 1.4972, + "step": 37660 + }, + { + "epoch": 0.4893875184157052, + "grad_norm": 0.4789860248565674, + "learning_rate": 0.0001021432563459364, + "loss": 1.3728, + "step": 37661 + }, + { + "epoch": 0.48940051295962106, + "grad_norm": 0.42090538144111633, + "learning_rate": 0.000102140656884025, + "loss": 1.3533, + "step": 37662 + }, + { + "epoch": 0.48941350750353696, + "grad_norm": 0.3533662259578705, + "learning_rate": 0.00010213805742211362, + "loss": 1.4003, + "step": 37663 + }, + { + "epoch": 0.4894265020474528, + "grad_norm": 0.4685594439506531, + "learning_rate": 0.00010213545796020226, + "loss": 1.529, + "step": 37664 + }, + { + "epoch": 0.4894394965913687, + "grad_norm": 0.4252340793609619, + "learning_rate": 0.00010213285849829087, + "loss": 1.5663, + "step": 37665 + }, + { + "epoch": 0.48945249113528455, + "grad_norm": 0.4390921890735626, + "learning_rate": 0.00010213025903637948, + "loss": 1.2333, + "step": 37666 + }, + { + "epoch": 0.48946548567920045, + "grad_norm": 0.5438433289527893, + "learning_rate": 0.00010212765957446809, + "loss": 1.4152, + "step": 37667 + }, + { + "epoch": 0.4894784802231163, + "grad_norm": 0.47453826665878296, + "learning_rate": 0.00010212506011255671, + "loss": 1.4378, + "step": 37668 + }, + { + "epoch": 0.4894914747670322, + "grad_norm": 0.45358943939208984, + "learning_rate": 0.00010212246065064532, + "loss": 1.2644, + "step": 37669 + }, + { + "epoch": 0.48950446931094804, + "grad_norm": 0.35541918873786926, + "learning_rate": 0.00010211986118873393, + "loss": 1.4041, + "step": 37670 + }, + { + "epoch": 0.48951746385486394, + "grad_norm": 0.4046880602836609, + "learning_rate": 0.00010211726172682255, + "loss": 1.4442, + "step": 37671 + }, + { + "epoch": 0.4895304583987798, + "grad_norm": 0.39502188563346863, + "learning_rate": 0.00010211466226491117, + "loss": 1.5049, + "step": 37672 + }, + { + "epoch": 0.4895434529426957, + "grad_norm": 0.44652628898620605, + "learning_rate": 0.00010211206280299978, + "loss": 1.3335, + "step": 37673 + }, + { + "epoch": 0.48955644748661153, + "grad_norm": 0.34121567010879517, + "learning_rate": 0.00010210946334108839, + "loss": 1.2714, + "step": 37674 + }, + { + "epoch": 0.48956944203052744, + "grad_norm": 0.351938933134079, + "learning_rate": 0.000102106863879177, + "loss": 1.3949, + "step": 37675 + }, + { + "epoch": 0.4895824365744433, + "grad_norm": 0.3815155327320099, + "learning_rate": 0.00010210426441726564, + "loss": 1.6603, + "step": 37676 + }, + { + "epoch": 0.4895954311183592, + "grad_norm": 0.32269617915153503, + "learning_rate": 0.00010210166495535425, + "loss": 1.4687, + "step": 37677 + }, + { + "epoch": 0.489608425662275, + "grad_norm": 0.3727602958679199, + "learning_rate": 0.00010209906549344286, + "loss": 1.4158, + "step": 37678 + }, + { + "epoch": 0.4896214202061909, + "grad_norm": 0.3991808593273163, + "learning_rate": 0.00010209646603153147, + "loss": 1.4439, + "step": 37679 + }, + { + "epoch": 0.4896344147501068, + "grad_norm": 0.42426377534866333, + "learning_rate": 0.0001020938665696201, + "loss": 1.3166, + "step": 37680 + }, + { + "epoch": 0.4896474092940227, + "grad_norm": 0.42968907952308655, + "learning_rate": 0.00010209126710770871, + "loss": 1.3213, + "step": 37681 + }, + { + "epoch": 0.4896604038379385, + "grad_norm": 0.4454449713230133, + "learning_rate": 0.00010208866764579732, + "loss": 1.4413, + "step": 37682 + }, + { + "epoch": 0.4896733983818544, + "grad_norm": 0.3516608476638794, + "learning_rate": 0.00010208606818388593, + "loss": 1.1771, + "step": 37683 + }, + { + "epoch": 0.4896863929257703, + "grad_norm": 0.3978886902332306, + "learning_rate": 0.00010208346872197457, + "loss": 1.5342, + "step": 37684 + }, + { + "epoch": 0.48969938746968616, + "grad_norm": 0.2885812819004059, + "learning_rate": 0.00010208086926006317, + "loss": 1.1951, + "step": 37685 + }, + { + "epoch": 0.48971238201360207, + "grad_norm": 0.5368260145187378, + "learning_rate": 0.00010207826979815178, + "loss": 1.4905, + "step": 37686 + }, + { + "epoch": 0.4897253765575179, + "grad_norm": 0.5116410255432129, + "learning_rate": 0.00010207567033624039, + "loss": 1.5111, + "step": 37687 + }, + { + "epoch": 0.4897383711014338, + "grad_norm": 0.4339734613895416, + "learning_rate": 0.00010207307087432903, + "loss": 1.1969, + "step": 37688 + }, + { + "epoch": 0.48975136564534966, + "grad_norm": 0.37022683024406433, + "learning_rate": 0.00010207047141241764, + "loss": 1.5182, + "step": 37689 + }, + { + "epoch": 0.48976436018926556, + "grad_norm": 0.31715548038482666, + "learning_rate": 0.00010206787195050625, + "loss": 1.4243, + "step": 37690 + }, + { + "epoch": 0.4897773547331814, + "grad_norm": 0.4087826907634735, + "learning_rate": 0.00010206527248859487, + "loss": 1.3323, + "step": 37691 + }, + { + "epoch": 0.4897903492770973, + "grad_norm": 0.37552472949028015, + "learning_rate": 0.00010206267302668348, + "loss": 1.4163, + "step": 37692 + }, + { + "epoch": 0.48980334382101315, + "grad_norm": 0.4141238331794739, + "learning_rate": 0.0001020600735647721, + "loss": 1.2786, + "step": 37693 + }, + { + "epoch": 0.48981633836492905, + "grad_norm": 0.5030965805053711, + "learning_rate": 0.0001020574741028607, + "loss": 1.5075, + "step": 37694 + }, + { + "epoch": 0.4898293329088449, + "grad_norm": 0.3129291832447052, + "learning_rate": 0.00010205487464094934, + "loss": 1.4336, + "step": 37695 + }, + { + "epoch": 0.4898423274527608, + "grad_norm": 0.5202312469482422, + "learning_rate": 0.00010205227517903795, + "loss": 1.4532, + "step": 37696 + }, + { + "epoch": 0.48985532199667664, + "grad_norm": 0.3756698966026306, + "learning_rate": 0.00010204967571712657, + "loss": 1.3933, + "step": 37697 + }, + { + "epoch": 0.48986831654059254, + "grad_norm": 0.41458624601364136, + "learning_rate": 0.00010204707625521516, + "loss": 1.4659, + "step": 37698 + }, + { + "epoch": 0.4898813110845084, + "grad_norm": 0.2936573624610901, + "learning_rate": 0.0001020444767933038, + "loss": 1.3798, + "step": 37699 + }, + { + "epoch": 0.4898943056284243, + "grad_norm": 0.43088358640670776, + "learning_rate": 0.00010204187733139241, + "loss": 1.3154, + "step": 37700 + }, + { + "epoch": 0.48990730017234013, + "grad_norm": 0.3775368928909302, + "learning_rate": 0.00010203927786948102, + "loss": 1.5167, + "step": 37701 + }, + { + "epoch": 0.48992029471625603, + "grad_norm": 0.32226327061653137, + "learning_rate": 0.00010203667840756963, + "loss": 1.4052, + "step": 37702 + }, + { + "epoch": 0.4899332892601719, + "grad_norm": 0.4110506772994995, + "learning_rate": 0.00010203407894565826, + "loss": 1.6387, + "step": 37703 + }, + { + "epoch": 0.4899462838040878, + "grad_norm": 0.3727943003177643, + "learning_rate": 0.00010203147948374687, + "loss": 1.185, + "step": 37704 + }, + { + "epoch": 0.4899592783480036, + "grad_norm": 0.4041294455528259, + "learning_rate": 0.00010202888002183548, + "loss": 1.6619, + "step": 37705 + }, + { + "epoch": 0.4899722728919195, + "grad_norm": 0.3528921902179718, + "learning_rate": 0.00010202628055992409, + "loss": 1.418, + "step": 37706 + }, + { + "epoch": 0.48998526743583537, + "grad_norm": 0.3151302635669708, + "learning_rate": 0.00010202368109801273, + "loss": 1.3567, + "step": 37707 + }, + { + "epoch": 0.48999826197975127, + "grad_norm": 0.28825438022613525, + "learning_rate": 0.00010202108163610134, + "loss": 1.3763, + "step": 37708 + }, + { + "epoch": 0.4900112565236671, + "grad_norm": 0.35700079798698425, + "learning_rate": 0.00010201848217418995, + "loss": 1.0927, + "step": 37709 + }, + { + "epoch": 0.490024251067583, + "grad_norm": 0.2915344536304474, + "learning_rate": 0.00010201588271227855, + "loss": 1.4072, + "step": 37710 + }, + { + "epoch": 0.49003724561149886, + "grad_norm": 0.428102970123291, + "learning_rate": 0.00010201328325036719, + "loss": 1.4446, + "step": 37711 + }, + { + "epoch": 0.49005024015541476, + "grad_norm": 0.3339066505432129, + "learning_rate": 0.0001020106837884558, + "loss": 1.1874, + "step": 37712 + }, + { + "epoch": 0.4900632346993306, + "grad_norm": 0.3950852155685425, + "learning_rate": 0.00010200808432654441, + "loss": 1.3711, + "step": 37713 + }, + { + "epoch": 0.4900762292432465, + "grad_norm": 0.30970823764801025, + "learning_rate": 0.00010200548486463302, + "loss": 1.2791, + "step": 37714 + }, + { + "epoch": 0.49008922378716235, + "grad_norm": 0.3111138343811035, + "learning_rate": 0.00010200288540272164, + "loss": 1.3152, + "step": 37715 + }, + { + "epoch": 0.49010221833107825, + "grad_norm": 0.3655491769313812, + "learning_rate": 0.00010200028594081025, + "loss": 1.4, + "step": 37716 + }, + { + "epoch": 0.4901152128749941, + "grad_norm": 0.2901107370853424, + "learning_rate": 0.00010199768647889886, + "loss": 1.4504, + "step": 37717 + }, + { + "epoch": 0.49012820741891, + "grad_norm": 0.3774755001068115, + "learning_rate": 0.00010199508701698748, + "loss": 1.5366, + "step": 37718 + }, + { + "epoch": 0.49014120196282585, + "grad_norm": 0.36180928349494934, + "learning_rate": 0.00010199248755507611, + "loss": 1.1822, + "step": 37719 + }, + { + "epoch": 0.49015419650674175, + "grad_norm": 0.4644837975502014, + "learning_rate": 0.00010198988809316472, + "loss": 1.3597, + "step": 37720 + }, + { + "epoch": 0.4901671910506576, + "grad_norm": 0.35096731781959534, + "learning_rate": 0.00010198728863125334, + "loss": 1.5003, + "step": 37721 + }, + { + "epoch": 0.4901801855945735, + "grad_norm": 0.38475143909454346, + "learning_rate": 0.00010198468916934195, + "loss": 1.3354, + "step": 37722 + }, + { + "epoch": 0.49019318013848934, + "grad_norm": 0.43894243240356445, + "learning_rate": 0.00010198208970743057, + "loss": 1.4731, + "step": 37723 + }, + { + "epoch": 0.49020617468240524, + "grad_norm": 0.4135128855705261, + "learning_rate": 0.00010197949024551918, + "loss": 1.3538, + "step": 37724 + }, + { + "epoch": 0.4902191692263211, + "grad_norm": 0.3755813241004944, + "learning_rate": 0.00010197689078360779, + "loss": 1.2868, + "step": 37725 + }, + { + "epoch": 0.490232163770237, + "grad_norm": 0.4043888449668884, + "learning_rate": 0.00010197429132169643, + "loss": 1.2852, + "step": 37726 + }, + { + "epoch": 0.49024515831415283, + "grad_norm": 0.4277477264404297, + "learning_rate": 0.00010197169185978503, + "loss": 1.2776, + "step": 37727 + }, + { + "epoch": 0.49025815285806873, + "grad_norm": 0.4013367295265198, + "learning_rate": 0.00010196909239787364, + "loss": 1.5241, + "step": 37728 + }, + { + "epoch": 0.4902711474019846, + "grad_norm": 0.4822542071342468, + "learning_rate": 0.00010196649293596225, + "loss": 1.4106, + "step": 37729 + }, + { + "epoch": 0.4902841419459005, + "grad_norm": 0.45983362197875977, + "learning_rate": 0.00010196389347405089, + "loss": 1.446, + "step": 37730 + }, + { + "epoch": 0.4902971364898163, + "grad_norm": 0.45816919207572937, + "learning_rate": 0.0001019612940121395, + "loss": 1.4044, + "step": 37731 + }, + { + "epoch": 0.4903101310337322, + "grad_norm": 0.36364760994911194, + "learning_rate": 0.00010195869455022811, + "loss": 1.3194, + "step": 37732 + }, + { + "epoch": 0.49032312557764807, + "grad_norm": 0.4717870354652405, + "learning_rate": 0.00010195609508831672, + "loss": 1.5687, + "step": 37733 + }, + { + "epoch": 0.49033612012156397, + "grad_norm": 0.3057982921600342, + "learning_rate": 0.00010195349562640535, + "loss": 1.2409, + "step": 37734 + }, + { + "epoch": 0.4903491146654798, + "grad_norm": 0.36223775148391724, + "learning_rate": 0.00010195089616449396, + "loss": 1.4644, + "step": 37735 + }, + { + "epoch": 0.4903621092093957, + "grad_norm": 0.4018089771270752, + "learning_rate": 0.00010194829670258257, + "loss": 1.3294, + "step": 37736 + }, + { + "epoch": 0.49037510375331156, + "grad_norm": 0.2628757059574127, + "learning_rate": 0.00010194569724067118, + "loss": 1.3375, + "step": 37737 + }, + { + "epoch": 0.49038809829722746, + "grad_norm": 0.4035939574241638, + "learning_rate": 0.00010194309777875982, + "loss": 1.37, + "step": 37738 + }, + { + "epoch": 0.4904010928411433, + "grad_norm": 0.4631110727787018, + "learning_rate": 0.00010194049831684843, + "loss": 1.3162, + "step": 37739 + }, + { + "epoch": 0.4904140873850592, + "grad_norm": 0.37760648131370544, + "learning_rate": 0.00010193789885493702, + "loss": 1.4845, + "step": 37740 + }, + { + "epoch": 0.49042708192897505, + "grad_norm": 0.4469343423843384, + "learning_rate": 0.00010193529939302564, + "loss": 1.533, + "step": 37741 + }, + { + "epoch": 0.49044007647289095, + "grad_norm": 0.5700821876525879, + "learning_rate": 0.00010193269993111427, + "loss": 1.5045, + "step": 37742 + }, + { + "epoch": 0.4904530710168068, + "grad_norm": 0.36509913206100464, + "learning_rate": 0.00010193010046920288, + "loss": 1.3962, + "step": 37743 + }, + { + "epoch": 0.4904660655607227, + "grad_norm": 0.4578285217285156, + "learning_rate": 0.0001019275010072915, + "loss": 1.4907, + "step": 37744 + }, + { + "epoch": 0.49047906010463854, + "grad_norm": 0.3795306086540222, + "learning_rate": 0.0001019249015453801, + "loss": 1.2938, + "step": 37745 + }, + { + "epoch": 0.49049205464855444, + "grad_norm": 0.37952038645744324, + "learning_rate": 0.00010192230208346873, + "loss": 1.3877, + "step": 37746 + }, + { + "epoch": 0.4905050491924703, + "grad_norm": 0.39149701595306396, + "learning_rate": 0.00010191970262155734, + "loss": 1.3631, + "step": 37747 + }, + { + "epoch": 0.4905180437363862, + "grad_norm": 0.4413684010505676, + "learning_rate": 0.00010191710315964595, + "loss": 1.5862, + "step": 37748 + }, + { + "epoch": 0.49053103828030203, + "grad_norm": 0.4371064603328705, + "learning_rate": 0.00010191450369773456, + "loss": 1.4724, + "step": 37749 + }, + { + "epoch": 0.49054403282421793, + "grad_norm": 0.4671314060688019, + "learning_rate": 0.0001019119042358232, + "loss": 1.4231, + "step": 37750 + }, + { + "epoch": 0.4905570273681338, + "grad_norm": 0.4213501811027527, + "learning_rate": 0.00010190930477391181, + "loss": 1.3799, + "step": 37751 + }, + { + "epoch": 0.4905700219120497, + "grad_norm": 0.4420798122882843, + "learning_rate": 0.00010190670531200041, + "loss": 1.5231, + "step": 37752 + }, + { + "epoch": 0.4905830164559655, + "grad_norm": 0.4246748983860016, + "learning_rate": 0.00010190410585008902, + "loss": 1.4932, + "step": 37753 + }, + { + "epoch": 0.4905960109998814, + "grad_norm": 0.3557030260562897, + "learning_rate": 0.00010190150638817766, + "loss": 1.1935, + "step": 37754 + }, + { + "epoch": 0.49060900554379727, + "grad_norm": 0.32457053661346436, + "learning_rate": 0.00010189890692626627, + "loss": 1.2368, + "step": 37755 + }, + { + "epoch": 0.4906220000877132, + "grad_norm": 0.3853646516799927, + "learning_rate": 0.00010189630746435488, + "loss": 1.4284, + "step": 37756 + }, + { + "epoch": 0.490634994631629, + "grad_norm": 0.410287469625473, + "learning_rate": 0.00010189370800244349, + "loss": 1.3091, + "step": 37757 + }, + { + "epoch": 0.4906479891755449, + "grad_norm": 0.34918105602264404, + "learning_rate": 0.00010189110854053212, + "loss": 1.2867, + "step": 37758 + }, + { + "epoch": 0.49066098371946076, + "grad_norm": 0.41428571939468384, + "learning_rate": 0.00010188850907862073, + "loss": 1.291, + "step": 37759 + }, + { + "epoch": 0.49067397826337666, + "grad_norm": 0.3405683636665344, + "learning_rate": 0.00010188590961670934, + "loss": 1.2035, + "step": 37760 + }, + { + "epoch": 0.49068697280729257, + "grad_norm": 0.4508454203605652, + "learning_rate": 0.00010188331015479795, + "loss": 1.4995, + "step": 37761 + }, + { + "epoch": 0.4906999673512084, + "grad_norm": 0.39756643772125244, + "learning_rate": 0.00010188071069288659, + "loss": 1.5601, + "step": 37762 + }, + { + "epoch": 0.4907129618951243, + "grad_norm": 0.39856451749801636, + "learning_rate": 0.0001018781112309752, + "loss": 1.2366, + "step": 37763 + }, + { + "epoch": 0.49072595643904016, + "grad_norm": 0.32271090149879456, + "learning_rate": 0.00010187551176906381, + "loss": 1.3093, + "step": 37764 + }, + { + "epoch": 0.49073895098295606, + "grad_norm": 0.4080948829650879, + "learning_rate": 0.00010187291230715243, + "loss": 1.4169, + "step": 37765 + }, + { + "epoch": 0.4907519455268719, + "grad_norm": 0.3551578223705292, + "learning_rate": 0.00010187031284524104, + "loss": 1.0811, + "step": 37766 + }, + { + "epoch": 0.4907649400707878, + "grad_norm": 0.38670992851257324, + "learning_rate": 0.00010186771338332966, + "loss": 1.3593, + "step": 37767 + }, + { + "epoch": 0.49077793461470365, + "grad_norm": 0.3447142541408539, + "learning_rate": 0.00010186511392141827, + "loss": 1.3037, + "step": 37768 + }, + { + "epoch": 0.49079092915861955, + "grad_norm": 0.4881947338581085, + "learning_rate": 0.00010186251445950689, + "loss": 1.2725, + "step": 37769 + }, + { + "epoch": 0.4908039237025354, + "grad_norm": 0.3881729245185852, + "learning_rate": 0.0001018599149975955, + "loss": 1.3703, + "step": 37770 + }, + { + "epoch": 0.4908169182464513, + "grad_norm": 0.3906041979789734, + "learning_rate": 0.00010185731553568411, + "loss": 1.4503, + "step": 37771 + }, + { + "epoch": 0.49082991279036714, + "grad_norm": 0.43087923526763916, + "learning_rate": 0.00010185471607377272, + "loss": 1.3202, + "step": 37772 + }, + { + "epoch": 0.49084290733428304, + "grad_norm": 0.39940306544303894, + "learning_rate": 0.00010185211661186136, + "loss": 1.4144, + "step": 37773 + }, + { + "epoch": 0.4908559018781989, + "grad_norm": 0.3941690921783447, + "learning_rate": 0.00010184951714994997, + "loss": 1.0833, + "step": 37774 + }, + { + "epoch": 0.4908688964221148, + "grad_norm": 0.3753497302532196, + "learning_rate": 0.00010184691768803858, + "loss": 1.2368, + "step": 37775 + }, + { + "epoch": 0.49088189096603063, + "grad_norm": 0.2929832935333252, + "learning_rate": 0.0001018443182261272, + "loss": 1.3899, + "step": 37776 + }, + { + "epoch": 0.49089488550994653, + "grad_norm": 0.3228476345539093, + "learning_rate": 0.00010184171876421582, + "loss": 1.2458, + "step": 37777 + }, + { + "epoch": 0.4909078800538624, + "grad_norm": 0.4038845896720886, + "learning_rate": 0.00010183911930230443, + "loss": 1.4574, + "step": 37778 + }, + { + "epoch": 0.4909208745977783, + "grad_norm": 0.41748881340026855, + "learning_rate": 0.00010183651984039304, + "loss": 1.4961, + "step": 37779 + }, + { + "epoch": 0.4909338691416941, + "grad_norm": 0.47549504041671753, + "learning_rate": 0.00010183392037848165, + "loss": 1.472, + "step": 37780 + }, + { + "epoch": 0.49094686368561, + "grad_norm": 0.4866889417171478, + "learning_rate": 0.00010183132091657028, + "loss": 1.4587, + "step": 37781 + }, + { + "epoch": 0.49095985822952587, + "grad_norm": 0.37075915932655334, + "learning_rate": 0.00010182872145465889, + "loss": 1.3511, + "step": 37782 + }, + { + "epoch": 0.49097285277344177, + "grad_norm": 0.38241830468177795, + "learning_rate": 0.0001018261219927475, + "loss": 1.4365, + "step": 37783 + }, + { + "epoch": 0.4909858473173576, + "grad_norm": 0.43948444724082947, + "learning_rate": 0.00010182352253083611, + "loss": 1.4915, + "step": 37784 + }, + { + "epoch": 0.4909988418612735, + "grad_norm": 0.5068712830543518, + "learning_rate": 0.00010182092306892475, + "loss": 1.4348, + "step": 37785 + }, + { + "epoch": 0.49101183640518936, + "grad_norm": 0.4951256513595581, + "learning_rate": 0.00010181832360701336, + "loss": 1.4303, + "step": 37786 + }, + { + "epoch": 0.49102483094910526, + "grad_norm": 0.4821823835372925, + "learning_rate": 0.00010181572414510197, + "loss": 1.5166, + "step": 37787 + }, + { + "epoch": 0.4910378254930211, + "grad_norm": 0.47578558325767517, + "learning_rate": 0.00010181312468319058, + "loss": 1.5105, + "step": 37788 + }, + { + "epoch": 0.491050820036937, + "grad_norm": 0.33312493562698364, + "learning_rate": 0.0001018105252212792, + "loss": 1.3488, + "step": 37789 + }, + { + "epoch": 0.49106381458085285, + "grad_norm": 0.40121138095855713, + "learning_rate": 0.00010180792575936782, + "loss": 1.3689, + "step": 37790 + }, + { + "epoch": 0.49107680912476875, + "grad_norm": 0.48584118485450745, + "learning_rate": 0.00010180532629745643, + "loss": 1.3175, + "step": 37791 + }, + { + "epoch": 0.4910898036686846, + "grad_norm": 0.4626264274120331, + "learning_rate": 0.00010180272683554504, + "loss": 1.5454, + "step": 37792 + }, + { + "epoch": 0.4911027982126005, + "grad_norm": 0.42534276843070984, + "learning_rate": 0.00010180012737363368, + "loss": 1.4869, + "step": 37793 + }, + { + "epoch": 0.49111579275651635, + "grad_norm": 0.4280928671360016, + "learning_rate": 0.00010179752791172227, + "loss": 1.348, + "step": 37794 + }, + { + "epoch": 0.49112878730043225, + "grad_norm": 0.3070835769176483, + "learning_rate": 0.00010179492844981088, + "loss": 1.0555, + "step": 37795 + }, + { + "epoch": 0.4911417818443481, + "grad_norm": 0.3631468415260315, + "learning_rate": 0.0001017923289878995, + "loss": 1.2557, + "step": 37796 + }, + { + "epoch": 0.491154776388264, + "grad_norm": 0.27955323457717896, + "learning_rate": 0.00010178972952598813, + "loss": 1.2874, + "step": 37797 + }, + { + "epoch": 0.49116777093217984, + "grad_norm": 0.49570029973983765, + "learning_rate": 0.00010178713006407674, + "loss": 1.4581, + "step": 37798 + }, + { + "epoch": 0.49118076547609574, + "grad_norm": 0.5180969834327698, + "learning_rate": 0.00010178453060216535, + "loss": 1.5302, + "step": 37799 + }, + { + "epoch": 0.4911937600200116, + "grad_norm": 0.4341398775577545, + "learning_rate": 0.00010178193114025398, + "loss": 1.3904, + "step": 37800 + }, + { + "epoch": 0.4912067545639275, + "grad_norm": 0.402722030878067, + "learning_rate": 0.00010177933167834259, + "loss": 1.5318, + "step": 37801 + }, + { + "epoch": 0.49121974910784333, + "grad_norm": 0.28964880108833313, + "learning_rate": 0.0001017767322164312, + "loss": 1.2989, + "step": 37802 + }, + { + "epoch": 0.49123274365175923, + "grad_norm": 0.4683733284473419, + "learning_rate": 0.00010177413275451981, + "loss": 1.277, + "step": 37803 + }, + { + "epoch": 0.4912457381956751, + "grad_norm": 0.3469351828098297, + "learning_rate": 0.00010177153329260845, + "loss": 1.4016, + "step": 37804 + }, + { + "epoch": 0.491258732739591, + "grad_norm": 0.37269794940948486, + "learning_rate": 0.00010176893383069706, + "loss": 1.4587, + "step": 37805 + }, + { + "epoch": 0.4912717272835068, + "grad_norm": 0.354561448097229, + "learning_rate": 0.00010176633436878567, + "loss": 1.2053, + "step": 37806 + }, + { + "epoch": 0.4912847218274227, + "grad_norm": 0.2931618392467499, + "learning_rate": 0.00010176373490687427, + "loss": 1.3359, + "step": 37807 + }, + { + "epoch": 0.49129771637133857, + "grad_norm": 0.4960029721260071, + "learning_rate": 0.00010176113544496291, + "loss": 1.4593, + "step": 37808 + }, + { + "epoch": 0.49131071091525447, + "grad_norm": 0.4757802188396454, + "learning_rate": 0.00010175853598305152, + "loss": 1.2738, + "step": 37809 + }, + { + "epoch": 0.4913237054591703, + "grad_norm": 0.4474790692329407, + "learning_rate": 0.00010175593652114013, + "loss": 1.352, + "step": 37810 + }, + { + "epoch": 0.4913367000030862, + "grad_norm": 0.3817029893398285, + "learning_rate": 0.00010175333705922874, + "loss": 1.5706, + "step": 37811 + }, + { + "epoch": 0.49134969454700206, + "grad_norm": 0.5639182329177856, + "learning_rate": 0.00010175073759731736, + "loss": 1.2678, + "step": 37812 + }, + { + "epoch": 0.49136268909091796, + "grad_norm": 0.307658314704895, + "learning_rate": 0.00010174813813540598, + "loss": 1.1545, + "step": 37813 + }, + { + "epoch": 0.4913756836348338, + "grad_norm": 0.525376558303833, + "learning_rate": 0.00010174553867349459, + "loss": 1.4462, + "step": 37814 + }, + { + "epoch": 0.4913886781787497, + "grad_norm": 0.4329998195171356, + "learning_rate": 0.0001017429392115832, + "loss": 1.5696, + "step": 37815 + }, + { + "epoch": 0.49140167272266555, + "grad_norm": 0.5260769128799438, + "learning_rate": 0.00010174033974967184, + "loss": 1.2952, + "step": 37816 + }, + { + "epoch": 0.49141466726658145, + "grad_norm": 0.4057026505470276, + "learning_rate": 0.00010173774028776045, + "loss": 1.2807, + "step": 37817 + }, + { + "epoch": 0.4914276618104973, + "grad_norm": 0.4023517072200775, + "learning_rate": 0.00010173514082584906, + "loss": 1.1707, + "step": 37818 + }, + { + "epoch": 0.4914406563544132, + "grad_norm": 0.44709914922714233, + "learning_rate": 0.00010173254136393767, + "loss": 1.2879, + "step": 37819 + }, + { + "epoch": 0.49145365089832904, + "grad_norm": 0.4642372727394104, + "learning_rate": 0.00010172994190202629, + "loss": 1.4165, + "step": 37820 + }, + { + "epoch": 0.49146664544224494, + "grad_norm": 0.5262324810028076, + "learning_rate": 0.0001017273424401149, + "loss": 1.3735, + "step": 37821 + }, + { + "epoch": 0.4914796399861608, + "grad_norm": 0.3654259443283081, + "learning_rate": 0.00010172474297820351, + "loss": 1.3548, + "step": 37822 + }, + { + "epoch": 0.4914926345300767, + "grad_norm": 0.43560153245925903, + "learning_rate": 0.00010172214351629213, + "loss": 1.2534, + "step": 37823 + }, + { + "epoch": 0.49150562907399253, + "grad_norm": 0.46837756037712097, + "learning_rate": 0.00010171954405438075, + "loss": 1.3993, + "step": 37824 + }, + { + "epoch": 0.49151862361790843, + "grad_norm": 0.5833989977836609, + "learning_rate": 0.00010171694459246936, + "loss": 1.4821, + "step": 37825 + }, + { + "epoch": 0.4915316181618243, + "grad_norm": 0.4069930911064148, + "learning_rate": 0.00010171434513055797, + "loss": 1.6053, + "step": 37826 + }, + { + "epoch": 0.4915446127057402, + "grad_norm": 0.42060330510139465, + "learning_rate": 0.00010171174566864658, + "loss": 1.4774, + "step": 37827 + }, + { + "epoch": 0.491557607249656, + "grad_norm": 0.438220739364624, + "learning_rate": 0.00010170914620673522, + "loss": 1.2956, + "step": 37828 + }, + { + "epoch": 0.4915706017935719, + "grad_norm": 0.38657090067863464, + "learning_rate": 0.00010170654674482383, + "loss": 1.3139, + "step": 37829 + }, + { + "epoch": 0.49158359633748777, + "grad_norm": 0.5094223618507385, + "learning_rate": 0.00010170394728291244, + "loss": 1.2433, + "step": 37830 + }, + { + "epoch": 0.49159659088140367, + "grad_norm": 0.36727893352508545, + "learning_rate": 0.00010170134782100105, + "loss": 1.3583, + "step": 37831 + }, + { + "epoch": 0.4916095854253195, + "grad_norm": 0.37785881757736206, + "learning_rate": 0.00010169874835908968, + "loss": 1.3786, + "step": 37832 + }, + { + "epoch": 0.4916225799692354, + "grad_norm": 0.3355453908443451, + "learning_rate": 0.00010169614889717829, + "loss": 1.253, + "step": 37833 + }, + { + "epoch": 0.49163557451315126, + "grad_norm": 0.3342718780040741, + "learning_rate": 0.0001016935494352669, + "loss": 1.3224, + "step": 37834 + }, + { + "epoch": 0.49164856905706716, + "grad_norm": 0.4494706392288208, + "learning_rate": 0.00010169094997335551, + "loss": 1.5681, + "step": 37835 + }, + { + "epoch": 0.49166156360098306, + "grad_norm": 0.3821546733379364, + "learning_rate": 0.00010168835051144414, + "loss": 1.4115, + "step": 37836 + }, + { + "epoch": 0.4916745581448989, + "grad_norm": 0.45482900738716125, + "learning_rate": 0.00010168575104953275, + "loss": 1.4022, + "step": 37837 + }, + { + "epoch": 0.4916875526888148, + "grad_norm": 0.3593827188014984, + "learning_rate": 0.00010168315158762136, + "loss": 1.5212, + "step": 37838 + }, + { + "epoch": 0.49170054723273066, + "grad_norm": 0.40328511595726013, + "learning_rate": 0.00010168055212571, + "loss": 1.402, + "step": 37839 + }, + { + "epoch": 0.49171354177664656, + "grad_norm": 0.3104299008846283, + "learning_rate": 0.0001016779526637986, + "loss": 1.2954, + "step": 37840 + }, + { + "epoch": 0.4917265363205624, + "grad_norm": 0.39228469133377075, + "learning_rate": 0.00010167535320188722, + "loss": 1.302, + "step": 37841 + }, + { + "epoch": 0.4917395308644783, + "grad_norm": 0.4786580801010132, + "learning_rate": 0.00010167275373997583, + "loss": 1.4602, + "step": 37842 + }, + { + "epoch": 0.49175252540839415, + "grad_norm": 0.5236978530883789, + "learning_rate": 0.00010167015427806445, + "loss": 1.4209, + "step": 37843 + }, + { + "epoch": 0.49176551995231005, + "grad_norm": 0.28668534755706787, + "learning_rate": 0.00010166755481615306, + "loss": 1.1925, + "step": 37844 + }, + { + "epoch": 0.4917785144962259, + "grad_norm": 0.3784046471118927, + "learning_rate": 0.00010166495535424167, + "loss": 1.3308, + "step": 37845 + }, + { + "epoch": 0.4917915090401418, + "grad_norm": 0.42403244972229004, + "learning_rate": 0.00010166235589233029, + "loss": 1.4588, + "step": 37846 + }, + { + "epoch": 0.49180450358405764, + "grad_norm": 0.45586347579956055, + "learning_rate": 0.00010165975643041892, + "loss": 1.4756, + "step": 37847 + }, + { + "epoch": 0.49181749812797354, + "grad_norm": 0.40865907073020935, + "learning_rate": 0.00010165715696850753, + "loss": 1.4586, + "step": 37848 + }, + { + "epoch": 0.4918304926718894, + "grad_norm": 0.46635472774505615, + "learning_rate": 0.00010165455750659613, + "loss": 1.5099, + "step": 37849 + }, + { + "epoch": 0.4918434872158053, + "grad_norm": 0.5214419364929199, + "learning_rate": 0.00010165195804468474, + "loss": 1.5649, + "step": 37850 + }, + { + "epoch": 0.49185648175972113, + "grad_norm": 0.4230841398239136, + "learning_rate": 0.00010164935858277338, + "loss": 1.319, + "step": 37851 + }, + { + "epoch": 0.49186947630363703, + "grad_norm": 0.4479019045829773, + "learning_rate": 0.00010164675912086199, + "loss": 1.341, + "step": 37852 + }, + { + "epoch": 0.4918824708475529, + "grad_norm": 0.3817611336708069, + "learning_rate": 0.0001016441596589506, + "loss": 1.3844, + "step": 37853 + }, + { + "epoch": 0.4918954653914688, + "grad_norm": 0.3892451226711273, + "learning_rate": 0.00010164156019703921, + "loss": 1.405, + "step": 37854 + }, + { + "epoch": 0.4919084599353846, + "grad_norm": 0.33282044529914856, + "learning_rate": 0.00010163896073512784, + "loss": 1.1758, + "step": 37855 + }, + { + "epoch": 0.4919214544793005, + "grad_norm": 0.37161576747894287, + "learning_rate": 0.00010163636127321645, + "loss": 1.2568, + "step": 37856 + }, + { + "epoch": 0.49193444902321637, + "grad_norm": 0.47352680563926697, + "learning_rate": 0.00010163376181130506, + "loss": 1.3742, + "step": 37857 + }, + { + "epoch": 0.49194744356713227, + "grad_norm": 0.5469909310340881, + "learning_rate": 0.00010163116234939367, + "loss": 1.1786, + "step": 37858 + }, + { + "epoch": 0.4919604381110481, + "grad_norm": 0.4582150876522064, + "learning_rate": 0.00010162856288748231, + "loss": 1.3074, + "step": 37859 + }, + { + "epoch": 0.491973432654964, + "grad_norm": 0.4114942252635956, + "learning_rate": 0.00010162596342557092, + "loss": 1.3766, + "step": 37860 + }, + { + "epoch": 0.49198642719887986, + "grad_norm": 0.3566601276397705, + "learning_rate": 0.00010162336396365953, + "loss": 1.3678, + "step": 37861 + }, + { + "epoch": 0.49199942174279576, + "grad_norm": 0.3429954946041107, + "learning_rate": 0.00010162076450174813, + "loss": 1.2739, + "step": 37862 + }, + { + "epoch": 0.4920124162867116, + "grad_norm": 0.33289653062820435, + "learning_rate": 0.00010161816503983677, + "loss": 1.5205, + "step": 37863 + }, + { + "epoch": 0.4920254108306275, + "grad_norm": 0.4267808794975281, + "learning_rate": 0.00010161556557792538, + "loss": 1.4195, + "step": 37864 + }, + { + "epoch": 0.49203840537454335, + "grad_norm": 0.4022567868232727, + "learning_rate": 0.00010161296611601399, + "loss": 1.4574, + "step": 37865 + }, + { + "epoch": 0.49205139991845925, + "grad_norm": 0.3995455503463745, + "learning_rate": 0.0001016103666541026, + "loss": 1.5695, + "step": 37866 + }, + { + "epoch": 0.4920643944623751, + "grad_norm": 0.31412267684936523, + "learning_rate": 0.00010160776719219122, + "loss": 1.5187, + "step": 37867 + }, + { + "epoch": 0.492077389006291, + "grad_norm": 0.4527989625930786, + "learning_rate": 0.00010160516773027983, + "loss": 1.4445, + "step": 37868 + }, + { + "epoch": 0.49209038355020684, + "grad_norm": 0.45526984333992004, + "learning_rate": 0.00010160256826836844, + "loss": 1.2516, + "step": 37869 + }, + { + "epoch": 0.49210337809412275, + "grad_norm": 0.5005548596382141, + "learning_rate": 0.00010159996880645706, + "loss": 1.4749, + "step": 37870 + }, + { + "epoch": 0.4921163726380386, + "grad_norm": 0.41235893964767456, + "learning_rate": 0.0001015973693445457, + "loss": 1.3417, + "step": 37871 + }, + { + "epoch": 0.4921293671819545, + "grad_norm": 0.4259625971317291, + "learning_rate": 0.0001015947698826343, + "loss": 1.4757, + "step": 37872 + }, + { + "epoch": 0.49214236172587034, + "grad_norm": 0.3119475841522217, + "learning_rate": 0.00010159217042072292, + "loss": 1.4265, + "step": 37873 + }, + { + "epoch": 0.49215535626978624, + "grad_norm": 0.44098174571990967, + "learning_rate": 0.00010158957095881154, + "loss": 1.3387, + "step": 37874 + }, + { + "epoch": 0.4921683508137021, + "grad_norm": 0.44032105803489685, + "learning_rate": 0.00010158697149690015, + "loss": 1.4495, + "step": 37875 + }, + { + "epoch": 0.492181345357618, + "grad_norm": 0.415775865316391, + "learning_rate": 0.00010158437203498876, + "loss": 1.5054, + "step": 37876 + }, + { + "epoch": 0.49219433990153383, + "grad_norm": 0.427044153213501, + "learning_rate": 0.00010158177257307737, + "loss": 1.4435, + "step": 37877 + }, + { + "epoch": 0.49220733444544973, + "grad_norm": 0.3788157105445862, + "learning_rate": 0.000101579173111166, + "loss": 1.2697, + "step": 37878 + }, + { + "epoch": 0.4922203289893656, + "grad_norm": 0.41348734498023987, + "learning_rate": 0.00010157657364925461, + "loss": 1.2653, + "step": 37879 + }, + { + "epoch": 0.4922333235332815, + "grad_norm": 0.41792169213294983, + "learning_rate": 0.00010157397418734322, + "loss": 1.4294, + "step": 37880 + }, + { + "epoch": 0.4922463180771973, + "grad_norm": 0.4320823848247528, + "learning_rate": 0.00010157137472543183, + "loss": 1.4542, + "step": 37881 + }, + { + "epoch": 0.4922593126211132, + "grad_norm": 0.48169055581092834, + "learning_rate": 0.00010156877526352047, + "loss": 1.3879, + "step": 37882 + }, + { + "epoch": 0.49227230716502907, + "grad_norm": 0.420049786567688, + "learning_rate": 0.00010156617580160908, + "loss": 1.4864, + "step": 37883 + }, + { + "epoch": 0.49228530170894497, + "grad_norm": 0.3211154043674469, + "learning_rate": 0.00010156357633969769, + "loss": 1.591, + "step": 37884 + }, + { + "epoch": 0.4922982962528608, + "grad_norm": 0.4874553680419922, + "learning_rate": 0.0001015609768777863, + "loss": 1.4195, + "step": 37885 + }, + { + "epoch": 0.4923112907967767, + "grad_norm": 0.42579734325408936, + "learning_rate": 0.00010155837741587493, + "loss": 1.3392, + "step": 37886 + }, + { + "epoch": 0.49232428534069256, + "grad_norm": 0.3239997923374176, + "learning_rate": 0.00010155577795396354, + "loss": 1.2086, + "step": 37887 + }, + { + "epoch": 0.49233727988460846, + "grad_norm": 0.344494104385376, + "learning_rate": 0.00010155317849205215, + "loss": 1.1687, + "step": 37888 + }, + { + "epoch": 0.4923502744285243, + "grad_norm": 0.5118855237960815, + "learning_rate": 0.00010155057903014076, + "loss": 1.566, + "step": 37889 + }, + { + "epoch": 0.4923632689724402, + "grad_norm": 0.38154539465904236, + "learning_rate": 0.0001015479795682294, + "loss": 1.4061, + "step": 37890 + }, + { + "epoch": 0.49237626351635605, + "grad_norm": 0.3532242774963379, + "learning_rate": 0.000101545380106318, + "loss": 1.2443, + "step": 37891 + }, + { + "epoch": 0.49238925806027195, + "grad_norm": 0.3777334690093994, + "learning_rate": 0.0001015427806444066, + "loss": 1.3516, + "step": 37892 + }, + { + "epoch": 0.4924022526041878, + "grad_norm": 0.4226819574832916, + "learning_rate": 0.00010154018118249522, + "loss": 1.3131, + "step": 37893 + }, + { + "epoch": 0.4924152471481037, + "grad_norm": 0.383455365896225, + "learning_rate": 0.00010153758172058385, + "loss": 1.4163, + "step": 37894 + }, + { + "epoch": 0.49242824169201954, + "grad_norm": 0.4080467224121094, + "learning_rate": 0.00010153498225867246, + "loss": 1.4903, + "step": 37895 + }, + { + "epoch": 0.49244123623593544, + "grad_norm": 0.5001527667045593, + "learning_rate": 0.00010153238279676108, + "loss": 1.3848, + "step": 37896 + }, + { + "epoch": 0.4924542307798513, + "grad_norm": 0.2543201446533203, + "learning_rate": 0.00010152978333484969, + "loss": 1.1539, + "step": 37897 + }, + { + "epoch": 0.4924672253237672, + "grad_norm": 0.45297348499298096, + "learning_rate": 0.00010152718387293831, + "loss": 1.326, + "step": 37898 + }, + { + "epoch": 0.49248021986768303, + "grad_norm": 0.3008664548397064, + "learning_rate": 0.00010152458441102692, + "loss": 1.1783, + "step": 37899 + }, + { + "epoch": 0.49249321441159893, + "grad_norm": 0.39896097779273987, + "learning_rate": 0.00010152198494911553, + "loss": 1.4346, + "step": 37900 + }, + { + "epoch": 0.4925062089555148, + "grad_norm": 0.43429243564605713, + "learning_rate": 0.00010151938548720414, + "loss": 1.3471, + "step": 37901 + }, + { + "epoch": 0.4925192034994307, + "grad_norm": 0.3289016783237457, + "learning_rate": 0.00010151678602529278, + "loss": 1.2925, + "step": 37902 + }, + { + "epoch": 0.4925321980433465, + "grad_norm": 0.3625752031803131, + "learning_rate": 0.00010151418656338139, + "loss": 1.4543, + "step": 37903 + }, + { + "epoch": 0.4925451925872624, + "grad_norm": 0.3840138912200928, + "learning_rate": 0.00010151158710146999, + "loss": 1.2094, + "step": 37904 + }, + { + "epoch": 0.49255818713117827, + "grad_norm": 0.44661855697631836, + "learning_rate": 0.0001015089876395586, + "loss": 1.3202, + "step": 37905 + }, + { + "epoch": 0.49257118167509417, + "grad_norm": 0.5159459710121155, + "learning_rate": 0.00010150638817764724, + "loss": 1.5165, + "step": 37906 + }, + { + "epoch": 0.49258417621901, + "grad_norm": 0.41345009207725525, + "learning_rate": 0.00010150378871573585, + "loss": 1.5094, + "step": 37907 + }, + { + "epoch": 0.4925971707629259, + "grad_norm": 0.4461956024169922, + "learning_rate": 0.00010150118925382446, + "loss": 1.5059, + "step": 37908 + }, + { + "epoch": 0.49261016530684176, + "grad_norm": 0.39571529626846313, + "learning_rate": 0.00010149858979191307, + "loss": 1.3985, + "step": 37909 + }, + { + "epoch": 0.49262315985075766, + "grad_norm": 0.4356691241264343, + "learning_rate": 0.0001014959903300017, + "loss": 1.3349, + "step": 37910 + }, + { + "epoch": 0.4926361543946735, + "grad_norm": 0.4733138084411621, + "learning_rate": 0.00010149339086809031, + "loss": 1.368, + "step": 37911 + }, + { + "epoch": 0.4926491489385894, + "grad_norm": 0.32814696431159973, + "learning_rate": 0.00010149079140617892, + "loss": 1.3374, + "step": 37912 + }, + { + "epoch": 0.4926621434825053, + "grad_norm": 0.4171048700809479, + "learning_rate": 0.00010148819194426756, + "loss": 1.3955, + "step": 37913 + }, + { + "epoch": 0.49267513802642116, + "grad_norm": 0.4319785237312317, + "learning_rate": 0.00010148559248235617, + "loss": 1.4779, + "step": 37914 + }, + { + "epoch": 0.49268813257033706, + "grad_norm": 0.424127995967865, + "learning_rate": 0.00010148299302044478, + "loss": 1.5297, + "step": 37915 + }, + { + "epoch": 0.4927011271142529, + "grad_norm": 0.37903890013694763, + "learning_rate": 0.00010148039355853338, + "loss": 1.2705, + "step": 37916 + }, + { + "epoch": 0.4927141216581688, + "grad_norm": 0.35672634840011597, + "learning_rate": 0.00010147779409662201, + "loss": 1.6217, + "step": 37917 + }, + { + "epoch": 0.49272711620208465, + "grad_norm": 0.3906625807285309, + "learning_rate": 0.00010147519463471062, + "loss": 1.2418, + "step": 37918 + }, + { + "epoch": 0.49274011074600055, + "grad_norm": 0.40231311321258545, + "learning_rate": 0.00010147259517279924, + "loss": 1.4281, + "step": 37919 + }, + { + "epoch": 0.4927531052899164, + "grad_norm": 0.4384446442127228, + "learning_rate": 0.00010146999571088785, + "loss": 1.4005, + "step": 37920 + }, + { + "epoch": 0.4927660998338323, + "grad_norm": 0.4290167987346649, + "learning_rate": 0.00010146739624897647, + "loss": 1.3525, + "step": 37921 + }, + { + "epoch": 0.49277909437774814, + "grad_norm": 0.46006879210472107, + "learning_rate": 0.00010146479678706508, + "loss": 1.4284, + "step": 37922 + }, + { + "epoch": 0.49279208892166404, + "grad_norm": 0.4378680884838104, + "learning_rate": 0.00010146219732515369, + "loss": 1.4095, + "step": 37923 + }, + { + "epoch": 0.4928050834655799, + "grad_norm": 0.2542547285556793, + "learning_rate": 0.0001014595978632423, + "loss": 1.1708, + "step": 37924 + }, + { + "epoch": 0.4928180780094958, + "grad_norm": 0.39742404222488403, + "learning_rate": 0.00010145699840133094, + "loss": 1.3833, + "step": 37925 + }, + { + "epoch": 0.49283107255341163, + "grad_norm": 0.4774917662143707, + "learning_rate": 0.00010145439893941955, + "loss": 1.286, + "step": 37926 + }, + { + "epoch": 0.49284406709732753, + "grad_norm": 0.5005674958229065, + "learning_rate": 0.00010145179947750816, + "loss": 1.3239, + "step": 37927 + }, + { + "epoch": 0.4928570616412434, + "grad_norm": 0.40683239698410034, + "learning_rate": 0.00010144920001559677, + "loss": 1.4927, + "step": 37928 + }, + { + "epoch": 0.4928700561851593, + "grad_norm": 0.466808944940567, + "learning_rate": 0.0001014466005536854, + "loss": 1.4526, + "step": 37929 + }, + { + "epoch": 0.4928830507290751, + "grad_norm": 0.5208826065063477, + "learning_rate": 0.00010144400109177401, + "loss": 1.3834, + "step": 37930 + }, + { + "epoch": 0.492896045272991, + "grad_norm": 0.43565046787261963, + "learning_rate": 0.00010144140162986262, + "loss": 1.4564, + "step": 37931 + }, + { + "epoch": 0.49290903981690687, + "grad_norm": 0.430633544921875, + "learning_rate": 0.00010143880216795123, + "loss": 1.4805, + "step": 37932 + }, + { + "epoch": 0.49292203436082277, + "grad_norm": 0.47882312536239624, + "learning_rate": 0.00010143620270603986, + "loss": 1.5159, + "step": 37933 + }, + { + "epoch": 0.4929350289047386, + "grad_norm": 0.42850902676582336, + "learning_rate": 0.00010143360324412847, + "loss": 1.3549, + "step": 37934 + }, + { + "epoch": 0.4929480234486545, + "grad_norm": 0.41156506538391113, + "learning_rate": 0.00010143100378221708, + "loss": 1.327, + "step": 37935 + }, + { + "epoch": 0.49296101799257036, + "grad_norm": 0.4543415904045105, + "learning_rate": 0.00010142840432030569, + "loss": 1.368, + "step": 37936 + }, + { + "epoch": 0.49297401253648626, + "grad_norm": 0.4010559916496277, + "learning_rate": 0.00010142580485839433, + "loss": 1.2507, + "step": 37937 + }, + { + "epoch": 0.4929870070804021, + "grad_norm": 0.5090144872665405, + "learning_rate": 0.00010142320539648294, + "loss": 1.4973, + "step": 37938 + }, + { + "epoch": 0.493000001624318, + "grad_norm": 0.39104536175727844, + "learning_rate": 0.00010142060593457155, + "loss": 1.4625, + "step": 37939 + }, + { + "epoch": 0.49301299616823385, + "grad_norm": 0.38204225897789, + "learning_rate": 0.00010141800647266016, + "loss": 1.3766, + "step": 37940 + }, + { + "epoch": 0.49302599071214975, + "grad_norm": 0.44885388016700745, + "learning_rate": 0.00010141540701074878, + "loss": 1.4984, + "step": 37941 + }, + { + "epoch": 0.4930389852560656, + "grad_norm": 0.5591496825218201, + "learning_rate": 0.0001014128075488374, + "loss": 1.4563, + "step": 37942 + }, + { + "epoch": 0.4930519797999815, + "grad_norm": 0.48022717237472534, + "learning_rate": 0.000101410208086926, + "loss": 1.439, + "step": 37943 + }, + { + "epoch": 0.49306497434389734, + "grad_norm": 0.4503045678138733, + "learning_rate": 0.00010140760862501462, + "loss": 1.5237, + "step": 37944 + }, + { + "epoch": 0.49307796888781324, + "grad_norm": 0.3211735486984253, + "learning_rate": 0.00010140500916310326, + "loss": 1.4258, + "step": 37945 + }, + { + "epoch": 0.4930909634317291, + "grad_norm": 0.5986831188201904, + "learning_rate": 0.00010140240970119185, + "loss": 1.4574, + "step": 37946 + }, + { + "epoch": 0.493103957975645, + "grad_norm": 0.42015311121940613, + "learning_rate": 0.00010139981023928046, + "loss": 1.4285, + "step": 37947 + }, + { + "epoch": 0.49311695251956084, + "grad_norm": 0.5040203332901001, + "learning_rate": 0.0001013972107773691, + "loss": 1.3174, + "step": 37948 + }, + { + "epoch": 0.49312994706347674, + "grad_norm": 0.41828641295433044, + "learning_rate": 0.00010139461131545771, + "loss": 1.5426, + "step": 37949 + }, + { + "epoch": 0.4931429416073926, + "grad_norm": 0.39809930324554443, + "learning_rate": 0.00010139201185354632, + "loss": 1.4105, + "step": 37950 + }, + { + "epoch": 0.4931559361513085, + "grad_norm": 0.35233503580093384, + "learning_rate": 0.00010138941239163493, + "loss": 1.5495, + "step": 37951 + }, + { + "epoch": 0.49316893069522433, + "grad_norm": 0.3805144727230072, + "learning_rate": 0.00010138681292972356, + "loss": 1.4797, + "step": 37952 + }, + { + "epoch": 0.49318192523914023, + "grad_norm": 0.4082876741886139, + "learning_rate": 0.00010138421346781217, + "loss": 1.4722, + "step": 37953 + }, + { + "epoch": 0.4931949197830561, + "grad_norm": 0.35256877541542053, + "learning_rate": 0.00010138161400590078, + "loss": 1.3716, + "step": 37954 + }, + { + "epoch": 0.493207914326972, + "grad_norm": 0.49692463874816895, + "learning_rate": 0.00010137901454398939, + "loss": 1.3552, + "step": 37955 + }, + { + "epoch": 0.4932209088708878, + "grad_norm": 0.4395529627799988, + "learning_rate": 0.00010137641508207803, + "loss": 1.214, + "step": 37956 + }, + { + "epoch": 0.4932339034148037, + "grad_norm": 0.4587078094482422, + "learning_rate": 0.00010137381562016664, + "loss": 1.3421, + "step": 37957 + }, + { + "epoch": 0.49324689795871957, + "grad_norm": 0.4206991493701935, + "learning_rate": 0.00010137121615825524, + "loss": 1.4955, + "step": 37958 + }, + { + "epoch": 0.49325989250263547, + "grad_norm": 0.3584602177143097, + "learning_rate": 0.00010136861669634385, + "loss": 1.3994, + "step": 37959 + }, + { + "epoch": 0.4932728870465513, + "grad_norm": 0.4250337779521942, + "learning_rate": 0.00010136601723443249, + "loss": 1.4151, + "step": 37960 + }, + { + "epoch": 0.4932858815904672, + "grad_norm": 0.4662173390388489, + "learning_rate": 0.0001013634177725211, + "loss": 1.4237, + "step": 37961 + }, + { + "epoch": 0.49329887613438306, + "grad_norm": 0.388580858707428, + "learning_rate": 0.00010136081831060971, + "loss": 1.2744, + "step": 37962 + }, + { + "epoch": 0.49331187067829896, + "grad_norm": 0.32799094915390015, + "learning_rate": 0.00010135821884869832, + "loss": 1.1594, + "step": 37963 + }, + { + "epoch": 0.4933248652222148, + "grad_norm": 0.4237835109233856, + "learning_rate": 0.00010135561938678694, + "loss": 1.5096, + "step": 37964 + }, + { + "epoch": 0.4933378597661307, + "grad_norm": 0.3883950114250183, + "learning_rate": 0.00010135301992487556, + "loss": 1.3216, + "step": 37965 + }, + { + "epoch": 0.49335085431004655, + "grad_norm": 0.35629549622535706, + "learning_rate": 0.00010135042046296417, + "loss": 1.3716, + "step": 37966 + }, + { + "epoch": 0.49336384885396245, + "grad_norm": 0.40324509143829346, + "learning_rate": 0.00010134782100105278, + "loss": 1.4888, + "step": 37967 + }, + { + "epoch": 0.4933768433978783, + "grad_norm": 0.4503353536128998, + "learning_rate": 0.00010134522153914141, + "loss": 1.194, + "step": 37968 + }, + { + "epoch": 0.4933898379417942, + "grad_norm": 0.3677539825439453, + "learning_rate": 0.00010134262207723003, + "loss": 1.1002, + "step": 37969 + }, + { + "epoch": 0.49340283248571004, + "grad_norm": 0.3776630759239197, + "learning_rate": 0.00010134002261531864, + "loss": 1.235, + "step": 37970 + }, + { + "epoch": 0.49341582702962594, + "grad_norm": 0.3005678057670593, + "learning_rate": 0.00010133742315340723, + "loss": 1.2759, + "step": 37971 + }, + { + "epoch": 0.4934288215735418, + "grad_norm": 0.37763339281082153, + "learning_rate": 0.00010133482369149587, + "loss": 1.3875, + "step": 37972 + }, + { + "epoch": 0.4934418161174577, + "grad_norm": 0.3466019630432129, + "learning_rate": 0.00010133222422958448, + "loss": 1.4156, + "step": 37973 + }, + { + "epoch": 0.49345481066137353, + "grad_norm": 0.3192394971847534, + "learning_rate": 0.0001013296247676731, + "loss": 1.3902, + "step": 37974 + }, + { + "epoch": 0.49346780520528943, + "grad_norm": 0.33244433999061584, + "learning_rate": 0.0001013270253057617, + "loss": 1.6655, + "step": 37975 + }, + { + "epoch": 0.4934807997492053, + "grad_norm": 0.4626505672931671, + "learning_rate": 0.00010132442584385033, + "loss": 1.4667, + "step": 37976 + }, + { + "epoch": 0.4934937942931212, + "grad_norm": 0.5244022011756897, + "learning_rate": 0.00010132182638193894, + "loss": 1.2626, + "step": 37977 + }, + { + "epoch": 0.493506788837037, + "grad_norm": 0.41166967153549194, + "learning_rate": 0.00010131922692002755, + "loss": 1.4199, + "step": 37978 + }, + { + "epoch": 0.4935197833809529, + "grad_norm": 0.3562083840370178, + "learning_rate": 0.00010131662745811616, + "loss": 1.4426, + "step": 37979 + }, + { + "epoch": 0.49353277792486877, + "grad_norm": 0.4225623309612274, + "learning_rate": 0.0001013140279962048, + "loss": 1.4487, + "step": 37980 + }, + { + "epoch": 0.49354577246878467, + "grad_norm": 0.3182221055030823, + "learning_rate": 0.00010131142853429341, + "loss": 1.3752, + "step": 37981 + }, + { + "epoch": 0.4935587670127005, + "grad_norm": 0.37063705921173096, + "learning_rate": 0.00010130882907238202, + "loss": 1.4088, + "step": 37982 + }, + { + "epoch": 0.4935717615566164, + "grad_norm": 0.3993184268474579, + "learning_rate": 0.00010130622961047063, + "loss": 1.4983, + "step": 37983 + }, + { + "epoch": 0.49358475610053226, + "grad_norm": 0.41423940658569336, + "learning_rate": 0.00010130363014855926, + "loss": 1.2387, + "step": 37984 + }, + { + "epoch": 0.49359775064444816, + "grad_norm": 0.3359740972518921, + "learning_rate": 0.00010130103068664787, + "loss": 1.4156, + "step": 37985 + }, + { + "epoch": 0.493610745188364, + "grad_norm": 0.37544915080070496, + "learning_rate": 0.00010129843122473648, + "loss": 1.3729, + "step": 37986 + }, + { + "epoch": 0.4936237397322799, + "grad_norm": 0.4646163880825043, + "learning_rate": 0.00010129583176282512, + "loss": 1.5846, + "step": 37987 + }, + { + "epoch": 0.4936367342761958, + "grad_norm": 0.4488069713115692, + "learning_rate": 0.00010129323230091371, + "loss": 1.4046, + "step": 37988 + }, + { + "epoch": 0.49364972882011166, + "grad_norm": 0.42781439423561096, + "learning_rate": 0.00010129063283900233, + "loss": 1.5327, + "step": 37989 + }, + { + "epoch": 0.49366272336402756, + "grad_norm": 0.38465964794158936, + "learning_rate": 0.00010128803337709094, + "loss": 1.5006, + "step": 37990 + }, + { + "epoch": 0.4936757179079434, + "grad_norm": 0.4315025210380554, + "learning_rate": 0.00010128543391517957, + "loss": 1.5311, + "step": 37991 + }, + { + "epoch": 0.4936887124518593, + "grad_norm": 0.3920021653175354, + "learning_rate": 0.00010128283445326819, + "loss": 1.5596, + "step": 37992 + }, + { + "epoch": 0.49370170699577515, + "grad_norm": 0.39746302366256714, + "learning_rate": 0.0001012802349913568, + "loss": 1.2285, + "step": 37993 + }, + { + "epoch": 0.49371470153969105, + "grad_norm": 0.32346588373184204, + "learning_rate": 0.00010127763552944541, + "loss": 1.3456, + "step": 37994 + }, + { + "epoch": 0.4937276960836069, + "grad_norm": 0.43340837955474854, + "learning_rate": 0.00010127503606753403, + "loss": 1.5953, + "step": 37995 + }, + { + "epoch": 0.4937406906275228, + "grad_norm": 0.4179762899875641, + "learning_rate": 0.00010127243660562264, + "loss": 1.4177, + "step": 37996 + }, + { + "epoch": 0.49375368517143864, + "grad_norm": 0.2999354600906372, + "learning_rate": 0.00010126983714371125, + "loss": 1.1806, + "step": 37997 + }, + { + "epoch": 0.49376667971535454, + "grad_norm": 0.441028356552124, + "learning_rate": 0.00010126723768179986, + "loss": 1.2673, + "step": 37998 + }, + { + "epoch": 0.4937796742592704, + "grad_norm": 0.3732874095439911, + "learning_rate": 0.0001012646382198885, + "loss": 1.3215, + "step": 37999 + }, + { + "epoch": 0.4937926688031863, + "grad_norm": 0.3397403657436371, + "learning_rate": 0.0001012620387579771, + "loss": 1.3268, + "step": 38000 + }, + { + "epoch": 0.49380566334710213, + "grad_norm": 0.4676308333873749, + "learning_rate": 0.00010125943929606571, + "loss": 1.3105, + "step": 38001 + }, + { + "epoch": 0.49381865789101803, + "grad_norm": 0.4722161293029785, + "learning_rate": 0.00010125683983415432, + "loss": 1.3708, + "step": 38002 + }, + { + "epoch": 0.4938316524349339, + "grad_norm": 0.2911146283149719, + "learning_rate": 0.00010125424037224296, + "loss": 1.2611, + "step": 38003 + }, + { + "epoch": 0.4938446469788498, + "grad_norm": 0.4634213447570801, + "learning_rate": 0.00010125164091033157, + "loss": 1.3869, + "step": 38004 + }, + { + "epoch": 0.4938576415227656, + "grad_norm": 0.3736705183982849, + "learning_rate": 0.00010124904144842018, + "loss": 1.4028, + "step": 38005 + }, + { + "epoch": 0.4938706360666815, + "grad_norm": 0.36504679918289185, + "learning_rate": 0.00010124644198650879, + "loss": 1.4478, + "step": 38006 + }, + { + "epoch": 0.49388363061059737, + "grad_norm": 0.41356122493743896, + "learning_rate": 0.00010124384252459742, + "loss": 1.4253, + "step": 38007 + }, + { + "epoch": 0.49389662515451327, + "grad_norm": 0.3205413818359375, + "learning_rate": 0.00010124124306268603, + "loss": 1.332, + "step": 38008 + }, + { + "epoch": 0.4939096196984291, + "grad_norm": 0.48153674602508545, + "learning_rate": 0.00010123864360077464, + "loss": 1.4926, + "step": 38009 + }, + { + "epoch": 0.493922614242345, + "grad_norm": 0.33620941638946533, + "learning_rate": 0.00010123604413886325, + "loss": 1.5743, + "step": 38010 + }, + { + "epoch": 0.49393560878626086, + "grad_norm": 0.3998759388923645, + "learning_rate": 0.00010123344467695189, + "loss": 1.3283, + "step": 38011 + }, + { + "epoch": 0.49394860333017676, + "grad_norm": 0.47262445092201233, + "learning_rate": 0.0001012308452150405, + "loss": 1.366, + "step": 38012 + }, + { + "epoch": 0.4939615978740926, + "grad_norm": 0.4668542742729187, + "learning_rate": 0.0001012282457531291, + "loss": 1.4604, + "step": 38013 + }, + { + "epoch": 0.4939745924180085, + "grad_norm": 0.4626637101173401, + "learning_rate": 0.00010122564629121771, + "loss": 1.565, + "step": 38014 + }, + { + "epoch": 0.49398758696192435, + "grad_norm": 0.45223796367645264, + "learning_rate": 0.00010122304682930635, + "loss": 1.3877, + "step": 38015 + }, + { + "epoch": 0.49400058150584025, + "grad_norm": 0.3459091782569885, + "learning_rate": 0.00010122044736739496, + "loss": 1.3367, + "step": 38016 + }, + { + "epoch": 0.4940135760497561, + "grad_norm": 0.3948054313659668, + "learning_rate": 0.00010121784790548357, + "loss": 1.2261, + "step": 38017 + }, + { + "epoch": 0.494026570593672, + "grad_norm": 0.3407975435256958, + "learning_rate": 0.00010121524844357218, + "loss": 1.3084, + "step": 38018 + }, + { + "epoch": 0.49403956513758784, + "grad_norm": 0.3612014949321747, + "learning_rate": 0.0001012126489816608, + "loss": 1.3939, + "step": 38019 + }, + { + "epoch": 0.49405255968150374, + "grad_norm": 0.29565030336380005, + "learning_rate": 0.00010121004951974941, + "loss": 1.4792, + "step": 38020 + }, + { + "epoch": 0.4940655542254196, + "grad_norm": 0.3904449939727783, + "learning_rate": 0.00010120745005783802, + "loss": 1.3609, + "step": 38021 + }, + { + "epoch": 0.4940785487693355, + "grad_norm": 0.4191446900367737, + "learning_rate": 0.00010120485059592666, + "loss": 1.3346, + "step": 38022 + }, + { + "epoch": 0.49409154331325134, + "grad_norm": 0.45718222856521606, + "learning_rate": 0.00010120225113401527, + "loss": 1.3972, + "step": 38023 + }, + { + "epoch": 0.49410453785716724, + "grad_norm": 0.39793214201927185, + "learning_rate": 0.00010119965167210388, + "loss": 1.4745, + "step": 38024 + }, + { + "epoch": 0.4941175324010831, + "grad_norm": 0.42547401785850525, + "learning_rate": 0.0001011970522101925, + "loss": 1.4827, + "step": 38025 + }, + { + "epoch": 0.494130526944999, + "grad_norm": 0.415231853723526, + "learning_rate": 0.00010119445274828112, + "loss": 1.4155, + "step": 38026 + }, + { + "epoch": 0.4941435214889148, + "grad_norm": 0.4106965959072113, + "learning_rate": 0.00010119185328636973, + "loss": 1.5041, + "step": 38027 + }, + { + "epoch": 0.49415651603283073, + "grad_norm": 0.3416058421134949, + "learning_rate": 0.00010118925382445834, + "loss": 1.2758, + "step": 38028 + }, + { + "epoch": 0.4941695105767466, + "grad_norm": 0.42380955815315247, + "learning_rate": 0.00010118665436254695, + "loss": 1.2997, + "step": 38029 + }, + { + "epoch": 0.4941825051206625, + "grad_norm": 0.3871223032474518, + "learning_rate": 0.00010118405490063558, + "loss": 1.2986, + "step": 38030 + }, + { + "epoch": 0.4941954996645783, + "grad_norm": 0.44861894845962524, + "learning_rate": 0.00010118145543872419, + "loss": 1.3392, + "step": 38031 + }, + { + "epoch": 0.4942084942084942, + "grad_norm": 0.4205858111381531, + "learning_rate": 0.0001011788559768128, + "loss": 1.5934, + "step": 38032 + }, + { + "epoch": 0.49422148875241007, + "grad_norm": 0.4282827377319336, + "learning_rate": 0.00010117625651490141, + "loss": 1.422, + "step": 38033 + }, + { + "epoch": 0.49423448329632597, + "grad_norm": 0.47219786047935486, + "learning_rate": 0.00010117365705299005, + "loss": 1.4074, + "step": 38034 + }, + { + "epoch": 0.4942474778402418, + "grad_norm": 0.27033543586730957, + "learning_rate": 0.00010117105759107866, + "loss": 1.3483, + "step": 38035 + }, + { + "epoch": 0.4942604723841577, + "grad_norm": 0.43650442361831665, + "learning_rate": 0.00010116845812916727, + "loss": 1.5132, + "step": 38036 + }, + { + "epoch": 0.49427346692807356, + "grad_norm": 0.39370986819267273, + "learning_rate": 0.00010116585866725588, + "loss": 1.3898, + "step": 38037 + }, + { + "epoch": 0.49428646147198946, + "grad_norm": 0.41571998596191406, + "learning_rate": 0.0001011632592053445, + "loss": 1.4039, + "step": 38038 + }, + { + "epoch": 0.4942994560159053, + "grad_norm": 0.39217615127563477, + "learning_rate": 0.00010116065974343312, + "loss": 1.2554, + "step": 38039 + }, + { + "epoch": 0.4943124505598212, + "grad_norm": 0.2416711300611496, + "learning_rate": 0.00010115806028152173, + "loss": 1.3932, + "step": 38040 + }, + { + "epoch": 0.49432544510373705, + "grad_norm": 0.4639711380004883, + "learning_rate": 0.00010115546081961034, + "loss": 1.5079, + "step": 38041 + }, + { + "epoch": 0.49433843964765295, + "grad_norm": 0.3492540121078491, + "learning_rate": 0.00010115286135769896, + "loss": 1.3331, + "step": 38042 + }, + { + "epoch": 0.4943514341915688, + "grad_norm": 0.46703892946243286, + "learning_rate": 0.00010115026189578757, + "loss": 1.4683, + "step": 38043 + }, + { + "epoch": 0.4943644287354847, + "grad_norm": 0.5077219605445862, + "learning_rate": 0.00010114766243387618, + "loss": 1.4193, + "step": 38044 + }, + { + "epoch": 0.49437742327940054, + "grad_norm": 0.4638369083404541, + "learning_rate": 0.0001011450629719648, + "loss": 1.4103, + "step": 38045 + }, + { + "epoch": 0.49439041782331644, + "grad_norm": 0.24610057473182678, + "learning_rate": 0.00010114246351005343, + "loss": 1.3075, + "step": 38046 + }, + { + "epoch": 0.4944034123672323, + "grad_norm": 0.4314155578613281, + "learning_rate": 0.00010113986404814204, + "loss": 1.4749, + "step": 38047 + }, + { + "epoch": 0.4944164069111482, + "grad_norm": 0.4158130884170532, + "learning_rate": 0.00010113726458623066, + "loss": 1.2552, + "step": 38048 + }, + { + "epoch": 0.49442940145506403, + "grad_norm": 0.447780042886734, + "learning_rate": 0.00010113466512431927, + "loss": 1.386, + "step": 38049 + }, + { + "epoch": 0.49444239599897993, + "grad_norm": 0.36035841703414917, + "learning_rate": 0.00010113206566240789, + "loss": 1.4623, + "step": 38050 + }, + { + "epoch": 0.4944553905428958, + "grad_norm": 0.5047826766967773, + "learning_rate": 0.0001011294662004965, + "loss": 1.4424, + "step": 38051 + }, + { + "epoch": 0.4944683850868117, + "grad_norm": 0.39765664935112, + "learning_rate": 0.00010112686673858511, + "loss": 1.2496, + "step": 38052 + }, + { + "epoch": 0.4944813796307275, + "grad_norm": 0.3847762644290924, + "learning_rate": 0.00010112426727667372, + "loss": 1.3808, + "step": 38053 + }, + { + "epoch": 0.4944943741746434, + "grad_norm": 0.34074097871780396, + "learning_rate": 0.00010112166781476236, + "loss": 1.4476, + "step": 38054 + }, + { + "epoch": 0.49450736871855927, + "grad_norm": 0.3757489323616028, + "learning_rate": 0.00010111906835285096, + "loss": 1.4605, + "step": 38055 + }, + { + "epoch": 0.49452036326247517, + "grad_norm": 0.30726081132888794, + "learning_rate": 0.00010111646889093957, + "loss": 1.1356, + "step": 38056 + }, + { + "epoch": 0.494533357806391, + "grad_norm": 0.4372595250606537, + "learning_rate": 0.00010111386942902818, + "loss": 1.3606, + "step": 38057 + }, + { + "epoch": 0.4945463523503069, + "grad_norm": 0.3018917441368103, + "learning_rate": 0.00010111126996711682, + "loss": 1.3616, + "step": 38058 + }, + { + "epoch": 0.49455934689422276, + "grad_norm": 0.3442380726337433, + "learning_rate": 0.00010110867050520543, + "loss": 1.5974, + "step": 38059 + }, + { + "epoch": 0.49457234143813866, + "grad_norm": 0.44635239243507385, + "learning_rate": 0.00010110607104329404, + "loss": 1.4694, + "step": 38060 + }, + { + "epoch": 0.4945853359820545, + "grad_norm": 0.4729903042316437, + "learning_rate": 0.00010110347158138267, + "loss": 1.5467, + "step": 38061 + }, + { + "epoch": 0.4945983305259704, + "grad_norm": 0.4541347622871399, + "learning_rate": 0.00010110087211947128, + "loss": 1.2839, + "step": 38062 + }, + { + "epoch": 0.49461132506988625, + "grad_norm": 0.4162755012512207, + "learning_rate": 0.00010109827265755989, + "loss": 1.5634, + "step": 38063 + }, + { + "epoch": 0.49462431961380215, + "grad_norm": 0.37976840138435364, + "learning_rate": 0.0001010956731956485, + "loss": 1.3361, + "step": 38064 + }, + { + "epoch": 0.49463731415771806, + "grad_norm": 0.4237111210823059, + "learning_rate": 0.00010109307373373714, + "loss": 1.4506, + "step": 38065 + }, + { + "epoch": 0.4946503087016339, + "grad_norm": 0.4040474593639374, + "learning_rate": 0.00010109047427182575, + "loss": 1.3602, + "step": 38066 + }, + { + "epoch": 0.4946633032455498, + "grad_norm": 0.49096718430519104, + "learning_rate": 0.00010108787480991436, + "loss": 1.4101, + "step": 38067 + }, + { + "epoch": 0.49467629778946565, + "grad_norm": 0.3960370421409607, + "learning_rate": 0.00010108527534800296, + "loss": 1.2334, + "step": 38068 + }, + { + "epoch": 0.49468929233338155, + "grad_norm": 0.7094560265541077, + "learning_rate": 0.0001010826758860916, + "loss": 1.5798, + "step": 38069 + }, + { + "epoch": 0.4947022868772974, + "grad_norm": 0.4135470390319824, + "learning_rate": 0.0001010800764241802, + "loss": 1.4375, + "step": 38070 + }, + { + "epoch": 0.4947152814212133, + "grad_norm": 0.5411341190338135, + "learning_rate": 0.00010107747696226882, + "loss": 1.5351, + "step": 38071 + }, + { + "epoch": 0.49472827596512914, + "grad_norm": 0.4583306312561035, + "learning_rate": 0.00010107487750035743, + "loss": 1.3228, + "step": 38072 + }, + { + "epoch": 0.49474127050904504, + "grad_norm": 0.437509149312973, + "learning_rate": 0.00010107227803844605, + "loss": 1.5489, + "step": 38073 + }, + { + "epoch": 0.4947542650529609, + "grad_norm": 0.4036235213279724, + "learning_rate": 0.00010106967857653466, + "loss": 1.5532, + "step": 38074 + }, + { + "epoch": 0.4947672595968768, + "grad_norm": 0.504324197769165, + "learning_rate": 0.00010106707911462327, + "loss": 1.2805, + "step": 38075 + }, + { + "epoch": 0.49478025414079263, + "grad_norm": 0.37474584579467773, + "learning_rate": 0.00010106447965271188, + "loss": 1.4843, + "step": 38076 + }, + { + "epoch": 0.49479324868470853, + "grad_norm": 0.3595208525657654, + "learning_rate": 0.00010106188019080052, + "loss": 1.4961, + "step": 38077 + }, + { + "epoch": 0.4948062432286244, + "grad_norm": 0.414688378572464, + "learning_rate": 0.00010105928072888913, + "loss": 1.3105, + "step": 38078 + }, + { + "epoch": 0.4948192377725403, + "grad_norm": 0.4075782299041748, + "learning_rate": 0.00010105668126697774, + "loss": 1.4428, + "step": 38079 + }, + { + "epoch": 0.4948322323164561, + "grad_norm": 0.3834506869316101, + "learning_rate": 0.00010105408180506634, + "loss": 1.4417, + "step": 38080 + }, + { + "epoch": 0.494845226860372, + "grad_norm": 0.4564828872680664, + "learning_rate": 0.00010105148234315498, + "loss": 1.4381, + "step": 38081 + }, + { + "epoch": 0.49485822140428787, + "grad_norm": 0.4751845598220825, + "learning_rate": 0.00010104888288124359, + "loss": 1.4776, + "step": 38082 + }, + { + "epoch": 0.49487121594820377, + "grad_norm": 0.40767902135849, + "learning_rate": 0.0001010462834193322, + "loss": 1.4437, + "step": 38083 + }, + { + "epoch": 0.4948842104921196, + "grad_norm": 0.40385448932647705, + "learning_rate": 0.00010104368395742081, + "loss": 1.462, + "step": 38084 + }, + { + "epoch": 0.4948972050360355, + "grad_norm": 0.3900837302207947, + "learning_rate": 0.00010104108449550944, + "loss": 1.1103, + "step": 38085 + }, + { + "epoch": 0.49491019957995136, + "grad_norm": 0.4465446472167969, + "learning_rate": 0.00010103848503359805, + "loss": 1.3841, + "step": 38086 + }, + { + "epoch": 0.49492319412386726, + "grad_norm": 0.4772008955478668, + "learning_rate": 0.00010103588557168666, + "loss": 1.5146, + "step": 38087 + }, + { + "epoch": 0.4949361886677831, + "grad_norm": 0.517224133014679, + "learning_rate": 0.00010103328610977527, + "loss": 1.5706, + "step": 38088 + }, + { + "epoch": 0.494949183211699, + "grad_norm": 0.42201143503189087, + "learning_rate": 0.00010103068664786391, + "loss": 1.5045, + "step": 38089 + }, + { + "epoch": 0.49496217775561485, + "grad_norm": 0.41769346594810486, + "learning_rate": 0.00010102808718595252, + "loss": 1.2715, + "step": 38090 + }, + { + "epoch": 0.49497517229953075, + "grad_norm": 0.39498546719551086, + "learning_rate": 0.00010102548772404113, + "loss": 1.2837, + "step": 38091 + }, + { + "epoch": 0.4949881668434466, + "grad_norm": 0.4292343556880951, + "learning_rate": 0.00010102288826212974, + "loss": 1.4145, + "step": 38092 + }, + { + "epoch": 0.4950011613873625, + "grad_norm": 0.2744852900505066, + "learning_rate": 0.00010102028880021836, + "loss": 1.1836, + "step": 38093 + }, + { + "epoch": 0.49501415593127834, + "grad_norm": 0.47898560762405396, + "learning_rate": 0.00010101768933830698, + "loss": 1.3381, + "step": 38094 + }, + { + "epoch": 0.49502715047519424, + "grad_norm": 0.4035773277282715, + "learning_rate": 0.00010101508987639559, + "loss": 1.4612, + "step": 38095 + }, + { + "epoch": 0.4950401450191101, + "grad_norm": 0.4253118634223938, + "learning_rate": 0.00010101249041448422, + "loss": 1.2472, + "step": 38096 + }, + { + "epoch": 0.495053139563026, + "grad_norm": 0.4103727638721466, + "learning_rate": 0.00010100989095257282, + "loss": 1.3235, + "step": 38097 + }, + { + "epoch": 0.49506613410694184, + "grad_norm": 0.34784695506095886, + "learning_rate": 0.00010100729149066143, + "loss": 1.4793, + "step": 38098 + }, + { + "epoch": 0.49507912865085774, + "grad_norm": 0.3973008096218109, + "learning_rate": 0.00010100469202875004, + "loss": 1.3408, + "step": 38099 + }, + { + "epoch": 0.4950921231947736, + "grad_norm": 0.39903563261032104, + "learning_rate": 0.00010100209256683868, + "loss": 1.2727, + "step": 38100 + }, + { + "epoch": 0.4951051177386895, + "grad_norm": 0.4041145145893097, + "learning_rate": 0.00010099949310492729, + "loss": 1.513, + "step": 38101 + }, + { + "epoch": 0.4951181122826053, + "grad_norm": 0.4685211479663849, + "learning_rate": 0.0001009968936430159, + "loss": 1.5599, + "step": 38102 + }, + { + "epoch": 0.49513110682652123, + "grad_norm": 0.4299374520778656, + "learning_rate": 0.00010099429418110451, + "loss": 1.5151, + "step": 38103 + }, + { + "epoch": 0.4951441013704371, + "grad_norm": 0.4188407063484192, + "learning_rate": 0.00010099169471919314, + "loss": 1.2669, + "step": 38104 + }, + { + "epoch": 0.495157095914353, + "grad_norm": 0.4171513319015503, + "learning_rate": 0.00010098909525728175, + "loss": 1.3941, + "step": 38105 + }, + { + "epoch": 0.4951700904582688, + "grad_norm": 0.40999868512153625, + "learning_rate": 0.00010098649579537036, + "loss": 1.4132, + "step": 38106 + }, + { + "epoch": 0.4951830850021847, + "grad_norm": 0.30557218194007874, + "learning_rate": 0.00010098389633345897, + "loss": 1.373, + "step": 38107 + }, + { + "epoch": 0.49519607954610056, + "grad_norm": 0.42449942231178284, + "learning_rate": 0.00010098129687154761, + "loss": 1.4162, + "step": 38108 + }, + { + "epoch": 0.49520907409001647, + "grad_norm": 0.46149638295173645, + "learning_rate": 0.00010097869740963622, + "loss": 1.4531, + "step": 38109 + }, + { + "epoch": 0.4952220686339323, + "grad_norm": 0.445597380399704, + "learning_rate": 0.00010097609794772482, + "loss": 1.3368, + "step": 38110 + }, + { + "epoch": 0.4952350631778482, + "grad_norm": 0.5412014126777649, + "learning_rate": 0.00010097349848581343, + "loss": 1.4705, + "step": 38111 + }, + { + "epoch": 0.49524805772176406, + "grad_norm": 0.4127027690410614, + "learning_rate": 0.00010097089902390207, + "loss": 1.2966, + "step": 38112 + }, + { + "epoch": 0.49526105226567996, + "grad_norm": 0.3508322536945343, + "learning_rate": 0.00010096829956199068, + "loss": 1.3049, + "step": 38113 + }, + { + "epoch": 0.4952740468095958, + "grad_norm": 0.4369240701198578, + "learning_rate": 0.00010096570010007929, + "loss": 1.3868, + "step": 38114 + }, + { + "epoch": 0.4952870413535117, + "grad_norm": 0.3675299882888794, + "learning_rate": 0.0001009631006381679, + "loss": 1.3366, + "step": 38115 + }, + { + "epoch": 0.49530003589742755, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010096050117625652, + "loss": 1.6476, + "step": 38116 + }, + { + "epoch": 0.49531303044134345, + "grad_norm": 0.49533969163894653, + "learning_rate": 0.00010095790171434513, + "loss": 1.3325, + "step": 38117 + }, + { + "epoch": 0.4953260249852593, + "grad_norm": 0.3428540825843811, + "learning_rate": 0.00010095530225243375, + "loss": 1.4273, + "step": 38118 + }, + { + "epoch": 0.4953390195291752, + "grad_norm": 0.4104280173778534, + "learning_rate": 0.00010095270279052236, + "loss": 1.4006, + "step": 38119 + }, + { + "epoch": 0.49535201407309104, + "grad_norm": 0.3324749171733856, + "learning_rate": 0.000100950103328611, + "loss": 1.1713, + "step": 38120 + }, + { + "epoch": 0.49536500861700694, + "grad_norm": 0.4256455600261688, + "learning_rate": 0.0001009475038666996, + "loss": 1.4412, + "step": 38121 + }, + { + "epoch": 0.4953780031609228, + "grad_norm": 0.42627793550491333, + "learning_rate": 0.0001009449044047882, + "loss": 1.4096, + "step": 38122 + }, + { + "epoch": 0.4953909977048387, + "grad_norm": 0.3781941533088684, + "learning_rate": 0.00010094230494287681, + "loss": 1.3487, + "step": 38123 + }, + { + "epoch": 0.49540399224875453, + "grad_norm": 0.3275192975997925, + "learning_rate": 0.00010093970548096545, + "loss": 1.479, + "step": 38124 + }, + { + "epoch": 0.49541698679267043, + "grad_norm": 0.270579069852829, + "learning_rate": 0.00010093710601905406, + "loss": 1.3982, + "step": 38125 + }, + { + "epoch": 0.4954299813365863, + "grad_norm": 0.3849242925643921, + "learning_rate": 0.00010093450655714267, + "loss": 1.4817, + "step": 38126 + }, + { + "epoch": 0.4954429758805022, + "grad_norm": 0.4871739447116852, + "learning_rate": 0.00010093190709523128, + "loss": 1.447, + "step": 38127 + }, + { + "epoch": 0.495455970424418, + "grad_norm": 0.34128451347351074, + "learning_rate": 0.00010092930763331991, + "loss": 1.2267, + "step": 38128 + }, + { + "epoch": 0.4954689649683339, + "grad_norm": 0.3936249613761902, + "learning_rate": 0.00010092670817140852, + "loss": 1.2636, + "step": 38129 + }, + { + "epoch": 0.49548195951224977, + "grad_norm": 0.3078303635120392, + "learning_rate": 0.00010092410870949713, + "loss": 1.5205, + "step": 38130 + }, + { + "epoch": 0.49549495405616567, + "grad_norm": 0.4474124610424042, + "learning_rate": 0.00010092150924758574, + "loss": 1.3391, + "step": 38131 + }, + { + "epoch": 0.4955079486000815, + "grad_norm": 0.3340831398963928, + "learning_rate": 0.00010091890978567438, + "loss": 1.3877, + "step": 38132 + }, + { + "epoch": 0.4955209431439974, + "grad_norm": 0.3591429889202118, + "learning_rate": 0.00010091631032376299, + "loss": 1.4964, + "step": 38133 + }, + { + "epoch": 0.49553393768791326, + "grad_norm": 0.40667182207107544, + "learning_rate": 0.0001009137108618516, + "loss": 1.3645, + "step": 38134 + }, + { + "epoch": 0.49554693223182916, + "grad_norm": 0.4645344913005829, + "learning_rate": 0.00010091111139994023, + "loss": 1.4408, + "step": 38135 + }, + { + "epoch": 0.495559926775745, + "grad_norm": 0.5129203200340271, + "learning_rate": 0.00010090851193802884, + "loss": 1.4038, + "step": 38136 + }, + { + "epoch": 0.4955729213196609, + "grad_norm": 0.37310346961021423, + "learning_rate": 0.00010090591247611745, + "loss": 1.3076, + "step": 38137 + }, + { + "epoch": 0.49558591586357675, + "grad_norm": 0.4951123595237732, + "learning_rate": 0.00010090331301420606, + "loss": 1.3905, + "step": 38138 + }, + { + "epoch": 0.49559891040749265, + "grad_norm": 0.40237176418304443, + "learning_rate": 0.00010090071355229468, + "loss": 1.2664, + "step": 38139 + }, + { + "epoch": 0.49561190495140856, + "grad_norm": 0.4635857343673706, + "learning_rate": 0.0001008981140903833, + "loss": 1.2616, + "step": 38140 + }, + { + "epoch": 0.4956248994953244, + "grad_norm": 0.5286194682121277, + "learning_rate": 0.0001008955146284719, + "loss": 1.3302, + "step": 38141 + }, + { + "epoch": 0.4956378940392403, + "grad_norm": 0.5238842368125916, + "learning_rate": 0.00010089291516656052, + "loss": 1.5706, + "step": 38142 + }, + { + "epoch": 0.49565088858315615, + "grad_norm": 0.4664166569709778, + "learning_rate": 0.00010089031570464915, + "loss": 1.6148, + "step": 38143 + }, + { + "epoch": 0.49566388312707205, + "grad_norm": 0.44058454036712646, + "learning_rate": 0.00010088771624273777, + "loss": 1.4235, + "step": 38144 + }, + { + "epoch": 0.4956768776709879, + "grad_norm": 0.5516623854637146, + "learning_rate": 0.00010088511678082638, + "loss": 1.3676, + "step": 38145 + }, + { + "epoch": 0.4956898722149038, + "grad_norm": 0.4085869789123535, + "learning_rate": 0.00010088251731891499, + "loss": 1.3035, + "step": 38146 + }, + { + "epoch": 0.49570286675881964, + "grad_norm": 0.4047647714614868, + "learning_rate": 0.00010087991785700361, + "loss": 1.3883, + "step": 38147 + }, + { + "epoch": 0.49571586130273554, + "grad_norm": 0.34162425994873047, + "learning_rate": 0.00010087731839509222, + "loss": 1.3107, + "step": 38148 + }, + { + "epoch": 0.4957288558466514, + "grad_norm": 0.45023488998413086, + "learning_rate": 0.00010087471893318083, + "loss": 1.5084, + "step": 38149 + }, + { + "epoch": 0.4957418503905673, + "grad_norm": 0.40335461497306824, + "learning_rate": 0.00010087211947126944, + "loss": 1.4245, + "step": 38150 + }, + { + "epoch": 0.49575484493448313, + "grad_norm": 0.4403659701347351, + "learning_rate": 0.00010086952000935808, + "loss": 1.3721, + "step": 38151 + }, + { + "epoch": 0.49576783947839903, + "grad_norm": 0.4788984954357147, + "learning_rate": 0.00010086692054744668, + "loss": 1.4036, + "step": 38152 + }, + { + "epoch": 0.4957808340223149, + "grad_norm": 0.31269076466560364, + "learning_rate": 0.00010086432108553529, + "loss": 1.2356, + "step": 38153 + }, + { + "epoch": 0.4957938285662308, + "grad_norm": 0.2996072769165039, + "learning_rate": 0.0001008617216236239, + "loss": 1.4761, + "step": 38154 + }, + { + "epoch": 0.4958068231101466, + "grad_norm": 0.446087509393692, + "learning_rate": 0.00010085912216171254, + "loss": 1.3956, + "step": 38155 + }, + { + "epoch": 0.4958198176540625, + "grad_norm": 0.2937715947628021, + "learning_rate": 0.00010085652269980115, + "loss": 1.5011, + "step": 38156 + }, + { + "epoch": 0.49583281219797837, + "grad_norm": 0.3581866919994354, + "learning_rate": 0.00010085392323788976, + "loss": 1.4644, + "step": 38157 + }, + { + "epoch": 0.49584580674189427, + "grad_norm": 0.3880802392959595, + "learning_rate": 0.00010085132377597837, + "loss": 1.3598, + "step": 38158 + }, + { + "epoch": 0.4958588012858101, + "grad_norm": 0.5193506479263306, + "learning_rate": 0.000100848724314067, + "loss": 1.2529, + "step": 38159 + }, + { + "epoch": 0.495871795829726, + "grad_norm": 0.4425947666168213, + "learning_rate": 0.00010084612485215561, + "loss": 1.1278, + "step": 38160 + }, + { + "epoch": 0.49588479037364186, + "grad_norm": 0.38908982276916504, + "learning_rate": 0.00010084352539024422, + "loss": 1.4684, + "step": 38161 + }, + { + "epoch": 0.49589778491755776, + "grad_norm": 0.3877604603767395, + "learning_rate": 0.00010084092592833283, + "loss": 1.4409, + "step": 38162 + }, + { + "epoch": 0.4959107794614736, + "grad_norm": 0.4295874834060669, + "learning_rate": 0.00010083832646642147, + "loss": 1.24, + "step": 38163 + }, + { + "epoch": 0.4959237740053895, + "grad_norm": 0.3835473656654358, + "learning_rate": 0.00010083572700451007, + "loss": 1.3604, + "step": 38164 + }, + { + "epoch": 0.49593676854930535, + "grad_norm": 0.4596775472164154, + "learning_rate": 0.00010083312754259868, + "loss": 1.203, + "step": 38165 + }, + { + "epoch": 0.49594976309322125, + "grad_norm": 0.48684290051460266, + "learning_rate": 0.00010083052808068729, + "loss": 1.3622, + "step": 38166 + }, + { + "epoch": 0.4959627576371371, + "grad_norm": 0.3998570740222931, + "learning_rate": 0.00010082792861877593, + "loss": 1.3433, + "step": 38167 + }, + { + "epoch": 0.495975752181053, + "grad_norm": 0.42381227016448975, + "learning_rate": 0.00010082532915686454, + "loss": 1.2974, + "step": 38168 + }, + { + "epoch": 0.49598874672496884, + "grad_norm": 0.46143975853919983, + "learning_rate": 0.00010082272969495315, + "loss": 1.3469, + "step": 38169 + }, + { + "epoch": 0.49600174126888474, + "grad_norm": 0.3237660527229309, + "learning_rate": 0.00010082013023304176, + "loss": 1.5836, + "step": 38170 + }, + { + "epoch": 0.4960147358128006, + "grad_norm": 0.40177983045578003, + "learning_rate": 0.00010081753077113038, + "loss": 1.4561, + "step": 38171 + }, + { + "epoch": 0.4960277303567165, + "grad_norm": 0.39886289834976196, + "learning_rate": 0.000100814931309219, + "loss": 1.3258, + "step": 38172 + }, + { + "epoch": 0.49604072490063233, + "grad_norm": 0.44304659962654114, + "learning_rate": 0.0001008123318473076, + "loss": 1.6858, + "step": 38173 + }, + { + "epoch": 0.49605371944454824, + "grad_norm": 0.4643542766571045, + "learning_rate": 0.00010080973238539624, + "loss": 1.6115, + "step": 38174 + }, + { + "epoch": 0.4960667139884641, + "grad_norm": 0.371380090713501, + "learning_rate": 0.00010080713292348485, + "loss": 1.3763, + "step": 38175 + }, + { + "epoch": 0.49607970853238, + "grad_norm": 0.44603514671325684, + "learning_rate": 0.00010080453346157346, + "loss": 1.3742, + "step": 38176 + }, + { + "epoch": 0.4960927030762958, + "grad_norm": 0.35637059807777405, + "learning_rate": 0.00010080193399966206, + "loss": 1.1879, + "step": 38177 + }, + { + "epoch": 0.4961056976202117, + "grad_norm": 0.45656099915504456, + "learning_rate": 0.0001007993345377507, + "loss": 1.3443, + "step": 38178 + }, + { + "epoch": 0.4961186921641276, + "grad_norm": 0.39014869928359985, + "learning_rate": 0.00010079673507583931, + "loss": 1.3491, + "step": 38179 + }, + { + "epoch": 0.4961316867080435, + "grad_norm": 0.23265451192855835, + "learning_rate": 0.00010079413561392792, + "loss": 1.4216, + "step": 38180 + }, + { + "epoch": 0.4961446812519593, + "grad_norm": 0.40407150983810425, + "learning_rate": 0.00010079153615201653, + "loss": 1.3644, + "step": 38181 + }, + { + "epoch": 0.4961576757958752, + "grad_norm": 0.3973194360733032, + "learning_rate": 0.00010078893669010516, + "loss": 1.45, + "step": 38182 + }, + { + "epoch": 0.49617067033979106, + "grad_norm": 0.46652403473854065, + "learning_rate": 0.00010078633722819377, + "loss": 1.4179, + "step": 38183 + }, + { + "epoch": 0.49618366488370697, + "grad_norm": 0.4539624750614166, + "learning_rate": 0.00010078373776628238, + "loss": 1.2773, + "step": 38184 + }, + { + "epoch": 0.4961966594276228, + "grad_norm": 0.4824382960796356, + "learning_rate": 0.00010078113830437099, + "loss": 1.443, + "step": 38185 + }, + { + "epoch": 0.4962096539715387, + "grad_norm": 0.42717623710632324, + "learning_rate": 0.00010077853884245963, + "loss": 1.3168, + "step": 38186 + }, + { + "epoch": 0.49622264851545456, + "grad_norm": 0.4497928023338318, + "learning_rate": 0.00010077593938054824, + "loss": 1.2448, + "step": 38187 + }, + { + "epoch": 0.49623564305937046, + "grad_norm": 0.2845083475112915, + "learning_rate": 0.00010077333991863685, + "loss": 1.2271, + "step": 38188 + }, + { + "epoch": 0.4962486376032863, + "grad_norm": 0.380996435880661, + "learning_rate": 0.00010077074045672546, + "loss": 1.3912, + "step": 38189 + }, + { + "epoch": 0.4962616321472022, + "grad_norm": 0.4074379503726959, + "learning_rate": 0.00010076814099481409, + "loss": 1.5681, + "step": 38190 + }, + { + "epoch": 0.49627462669111805, + "grad_norm": 0.3348466157913208, + "learning_rate": 0.0001007655415329027, + "loss": 1.2558, + "step": 38191 + }, + { + "epoch": 0.49628762123503395, + "grad_norm": 0.40191611647605896, + "learning_rate": 0.00010076294207099131, + "loss": 1.5807, + "step": 38192 + }, + { + "epoch": 0.4963006157789498, + "grad_norm": 0.333392471075058, + "learning_rate": 0.00010076034260907992, + "loss": 1.3125, + "step": 38193 + }, + { + "epoch": 0.4963136103228657, + "grad_norm": 0.4480747580528259, + "learning_rate": 0.00010075774314716854, + "loss": 1.3607, + "step": 38194 + }, + { + "epoch": 0.49632660486678154, + "grad_norm": 0.3587522506713867, + "learning_rate": 0.00010075514368525715, + "loss": 1.2453, + "step": 38195 + }, + { + "epoch": 0.49633959941069744, + "grad_norm": 0.46608126163482666, + "learning_rate": 0.00010075254422334576, + "loss": 1.5219, + "step": 38196 + }, + { + "epoch": 0.4963525939546133, + "grad_norm": 0.3593595325946808, + "learning_rate": 0.00010074994476143438, + "loss": 1.3089, + "step": 38197 + }, + { + "epoch": 0.4963655884985292, + "grad_norm": 0.47288548946380615, + "learning_rate": 0.00010074734529952301, + "loss": 1.5494, + "step": 38198 + }, + { + "epoch": 0.49637858304244503, + "grad_norm": 0.47059309482574463, + "learning_rate": 0.00010074474583761162, + "loss": 1.2216, + "step": 38199 + }, + { + "epoch": 0.49639157758636093, + "grad_norm": 0.38760021328926086, + "learning_rate": 0.00010074214637570024, + "loss": 1.2507, + "step": 38200 + }, + { + "epoch": 0.4964045721302768, + "grad_norm": 0.489494264125824, + "learning_rate": 0.00010073954691378885, + "loss": 1.5185, + "step": 38201 + }, + { + "epoch": 0.4964175666741927, + "grad_norm": 0.38859695196151733, + "learning_rate": 0.00010073694745187747, + "loss": 1.4479, + "step": 38202 + }, + { + "epoch": 0.4964305612181085, + "grad_norm": 0.5058392286300659, + "learning_rate": 0.00010073434798996608, + "loss": 1.5596, + "step": 38203 + }, + { + "epoch": 0.4964435557620244, + "grad_norm": 0.4065495729446411, + "learning_rate": 0.00010073174852805469, + "loss": 1.3671, + "step": 38204 + }, + { + "epoch": 0.49645655030594027, + "grad_norm": 0.5195760726928711, + "learning_rate": 0.0001007291490661433, + "loss": 1.5253, + "step": 38205 + }, + { + "epoch": 0.49646954484985617, + "grad_norm": 0.3791457414627075, + "learning_rate": 0.00010072654960423193, + "loss": 1.2625, + "step": 38206 + }, + { + "epoch": 0.496482539393772, + "grad_norm": 0.34332340955734253, + "learning_rate": 0.00010072395014232054, + "loss": 1.4659, + "step": 38207 + }, + { + "epoch": 0.4964955339376879, + "grad_norm": 0.46960121393203735, + "learning_rate": 0.00010072135068040915, + "loss": 1.532, + "step": 38208 + }, + { + "epoch": 0.49650852848160376, + "grad_norm": 0.3552747368812561, + "learning_rate": 0.00010071875121849779, + "loss": 1.4114, + "step": 38209 + }, + { + "epoch": 0.49652152302551966, + "grad_norm": 0.43553653359413147, + "learning_rate": 0.0001007161517565864, + "loss": 1.3434, + "step": 38210 + }, + { + "epoch": 0.4965345175694355, + "grad_norm": 0.43856823444366455, + "learning_rate": 0.00010071355229467501, + "loss": 1.4191, + "step": 38211 + }, + { + "epoch": 0.4965475121133514, + "grad_norm": 0.3386930823326111, + "learning_rate": 0.00010071095283276362, + "loss": 1.1422, + "step": 38212 + }, + { + "epoch": 0.49656050665726725, + "grad_norm": 0.34609055519104004, + "learning_rate": 0.00010070835337085225, + "loss": 1.3922, + "step": 38213 + }, + { + "epoch": 0.49657350120118315, + "grad_norm": 0.48085880279541016, + "learning_rate": 0.00010070575390894086, + "loss": 1.3229, + "step": 38214 + }, + { + "epoch": 0.496586495745099, + "grad_norm": 0.3659428358078003, + "learning_rate": 0.00010070315444702947, + "loss": 1.2003, + "step": 38215 + }, + { + "epoch": 0.4965994902890149, + "grad_norm": 0.4486415982246399, + "learning_rate": 0.00010070055498511808, + "loss": 1.5332, + "step": 38216 + }, + { + "epoch": 0.4966124848329308, + "grad_norm": 0.3283880054950714, + "learning_rate": 0.00010069795552320672, + "loss": 1.2941, + "step": 38217 + }, + { + "epoch": 0.49662547937684665, + "grad_norm": 0.3791065812110901, + "learning_rate": 0.00010069535606129533, + "loss": 1.6234, + "step": 38218 + }, + { + "epoch": 0.49663847392076255, + "grad_norm": 0.36310726404190063, + "learning_rate": 0.00010069275659938392, + "loss": 1.3119, + "step": 38219 + }, + { + "epoch": 0.4966514684646784, + "grad_norm": 0.32768315076828003, + "learning_rate": 0.00010069015713747254, + "loss": 1.1876, + "step": 38220 + }, + { + "epoch": 0.4966644630085943, + "grad_norm": 0.3908523917198181, + "learning_rate": 0.00010068755767556117, + "loss": 1.5275, + "step": 38221 + }, + { + "epoch": 0.49667745755251014, + "grad_norm": 0.40550461411476135, + "learning_rate": 0.00010068495821364978, + "loss": 1.3288, + "step": 38222 + }, + { + "epoch": 0.49669045209642604, + "grad_norm": 0.4728432893753052, + "learning_rate": 0.0001006823587517384, + "loss": 1.4515, + "step": 38223 + }, + { + "epoch": 0.4967034466403419, + "grad_norm": 0.4008466303348541, + "learning_rate": 0.000100679759289827, + "loss": 1.4084, + "step": 38224 + }, + { + "epoch": 0.4967164411842578, + "grad_norm": 0.45927658677101135, + "learning_rate": 0.00010067715982791563, + "loss": 1.4489, + "step": 38225 + }, + { + "epoch": 0.49672943572817363, + "grad_norm": 0.4260965585708618, + "learning_rate": 0.00010067456036600424, + "loss": 1.3681, + "step": 38226 + }, + { + "epoch": 0.49674243027208953, + "grad_norm": 0.5579988360404968, + "learning_rate": 0.00010067196090409285, + "loss": 1.3052, + "step": 38227 + }, + { + "epoch": 0.4967554248160054, + "grad_norm": 0.39217960834503174, + "learning_rate": 0.00010066936144218146, + "loss": 1.6686, + "step": 38228 + }, + { + "epoch": 0.4967684193599213, + "grad_norm": 0.42197975516319275, + "learning_rate": 0.0001006667619802701, + "loss": 1.2826, + "step": 38229 + }, + { + "epoch": 0.4967814139038371, + "grad_norm": 0.41482314467430115, + "learning_rate": 0.00010066416251835871, + "loss": 1.3626, + "step": 38230 + }, + { + "epoch": 0.496794408447753, + "grad_norm": 0.32807472348213196, + "learning_rate": 0.00010066156305644732, + "loss": 1.1998, + "step": 38231 + }, + { + "epoch": 0.49680740299166887, + "grad_norm": 0.44601133465766907, + "learning_rate": 0.00010065896359453592, + "loss": 1.4028, + "step": 38232 + }, + { + "epoch": 0.49682039753558477, + "grad_norm": 0.31964483857154846, + "learning_rate": 0.00010065636413262456, + "loss": 1.2503, + "step": 38233 + }, + { + "epoch": 0.4968333920795006, + "grad_norm": 0.41199636459350586, + "learning_rate": 0.00010065376467071317, + "loss": 1.4452, + "step": 38234 + }, + { + "epoch": 0.4968463866234165, + "grad_norm": 0.3892856538295746, + "learning_rate": 0.00010065116520880178, + "loss": 1.422, + "step": 38235 + }, + { + "epoch": 0.49685938116733236, + "grad_norm": 0.3586713373661041, + "learning_rate": 0.00010064856574689039, + "loss": 1.3028, + "step": 38236 + }, + { + "epoch": 0.49687237571124826, + "grad_norm": 0.37764057517051697, + "learning_rate": 0.00010064596628497902, + "loss": 1.4124, + "step": 38237 + }, + { + "epoch": 0.4968853702551641, + "grad_norm": 0.3756992816925049, + "learning_rate": 0.00010064336682306763, + "loss": 1.3134, + "step": 38238 + }, + { + "epoch": 0.49689836479908, + "grad_norm": 0.40205854177474976, + "learning_rate": 0.00010064076736115624, + "loss": 1.4236, + "step": 38239 + }, + { + "epoch": 0.49691135934299585, + "grad_norm": 0.3651718199253082, + "learning_rate": 0.00010063816789924485, + "loss": 1.2067, + "step": 38240 + }, + { + "epoch": 0.49692435388691175, + "grad_norm": 0.32568302750587463, + "learning_rate": 0.00010063556843733349, + "loss": 1.3528, + "step": 38241 + }, + { + "epoch": 0.4969373484308276, + "grad_norm": 0.38428765535354614, + "learning_rate": 0.0001006329689754221, + "loss": 1.3754, + "step": 38242 + }, + { + "epoch": 0.4969503429747435, + "grad_norm": 0.5420585870742798, + "learning_rate": 0.00010063036951351071, + "loss": 1.4364, + "step": 38243 + }, + { + "epoch": 0.49696333751865934, + "grad_norm": 0.35904571413993835, + "learning_rate": 0.0001006277700515993, + "loss": 1.312, + "step": 38244 + }, + { + "epoch": 0.49697633206257524, + "grad_norm": 0.32716450095176697, + "learning_rate": 0.00010062517058968794, + "loss": 1.4958, + "step": 38245 + }, + { + "epoch": 0.4969893266064911, + "grad_norm": 0.39068934321403503, + "learning_rate": 0.00010062257112777656, + "loss": 1.306, + "step": 38246 + }, + { + "epoch": 0.497002321150407, + "grad_norm": 0.41192135214805603, + "learning_rate": 0.00010061997166586517, + "loss": 1.4447, + "step": 38247 + }, + { + "epoch": 0.49701531569432283, + "grad_norm": 0.38641610741615295, + "learning_rate": 0.00010061737220395379, + "loss": 1.4291, + "step": 38248 + }, + { + "epoch": 0.49702831023823874, + "grad_norm": 0.37446409463882446, + "learning_rate": 0.0001006147727420424, + "loss": 1.5246, + "step": 38249 + }, + { + "epoch": 0.4970413047821546, + "grad_norm": 0.47554197907447815, + "learning_rate": 0.00010061217328013101, + "loss": 1.3895, + "step": 38250 + }, + { + "epoch": 0.4970542993260705, + "grad_norm": 0.34382396936416626, + "learning_rate": 0.00010060957381821962, + "loss": 1.2602, + "step": 38251 + }, + { + "epoch": 0.4970672938699863, + "grad_norm": 0.38223326206207275, + "learning_rate": 0.00010060697435630826, + "loss": 1.4299, + "step": 38252 + }, + { + "epoch": 0.4970802884139022, + "grad_norm": 0.37078580260276794, + "learning_rate": 0.00010060437489439687, + "loss": 1.4051, + "step": 38253 + }, + { + "epoch": 0.4970932829578181, + "grad_norm": 0.45896589756011963, + "learning_rate": 0.00010060177543248548, + "loss": 1.4015, + "step": 38254 + }, + { + "epoch": 0.497106277501734, + "grad_norm": 0.5382101535797119, + "learning_rate": 0.0001005991759705741, + "loss": 1.5385, + "step": 38255 + }, + { + "epoch": 0.4971192720456498, + "grad_norm": 0.44979390501976013, + "learning_rate": 0.00010059657650866272, + "loss": 1.4383, + "step": 38256 + }, + { + "epoch": 0.4971322665895657, + "grad_norm": 0.42438235878944397, + "learning_rate": 0.00010059397704675133, + "loss": 1.4412, + "step": 38257 + }, + { + "epoch": 0.49714526113348156, + "grad_norm": 0.39806994795799255, + "learning_rate": 0.00010059137758483994, + "loss": 1.5253, + "step": 38258 + }, + { + "epoch": 0.49715825567739746, + "grad_norm": 0.3334304690361023, + "learning_rate": 0.00010058877812292855, + "loss": 1.1342, + "step": 38259 + }, + { + "epoch": 0.4971712502213133, + "grad_norm": 0.39490842819213867, + "learning_rate": 0.00010058617866101719, + "loss": 1.3501, + "step": 38260 + }, + { + "epoch": 0.4971842447652292, + "grad_norm": 0.28818637132644653, + "learning_rate": 0.00010058357919910579, + "loss": 1.2178, + "step": 38261 + }, + { + "epoch": 0.49719723930914506, + "grad_norm": 0.38212302327156067, + "learning_rate": 0.0001005809797371944, + "loss": 1.5273, + "step": 38262 + }, + { + "epoch": 0.49721023385306096, + "grad_norm": 0.37797409296035767, + "learning_rate": 0.00010057838027528301, + "loss": 1.3841, + "step": 38263 + }, + { + "epoch": 0.4972232283969768, + "grad_norm": 0.4057205021381378, + "learning_rate": 0.00010057578081337165, + "loss": 1.3569, + "step": 38264 + }, + { + "epoch": 0.4972362229408927, + "grad_norm": 0.4671332538127899, + "learning_rate": 0.00010057318135146026, + "loss": 1.4952, + "step": 38265 + }, + { + "epoch": 0.49724921748480855, + "grad_norm": 0.33796024322509766, + "learning_rate": 0.00010057058188954887, + "loss": 1.5194, + "step": 38266 + }, + { + "epoch": 0.49726221202872445, + "grad_norm": 0.3439628779888153, + "learning_rate": 0.00010056798242763748, + "loss": 1.3359, + "step": 38267 + }, + { + "epoch": 0.4972752065726403, + "grad_norm": 0.3598017990589142, + "learning_rate": 0.0001005653829657261, + "loss": 1.3682, + "step": 38268 + }, + { + "epoch": 0.4972882011165562, + "grad_norm": 0.3693104088306427, + "learning_rate": 0.00010056278350381471, + "loss": 1.3575, + "step": 38269 + }, + { + "epoch": 0.49730119566047204, + "grad_norm": 0.373563677072525, + "learning_rate": 0.00010056018404190333, + "loss": 1.2384, + "step": 38270 + }, + { + "epoch": 0.49731419020438794, + "grad_norm": 0.4228780269622803, + "learning_rate": 0.00010055758457999194, + "loss": 1.4962, + "step": 38271 + }, + { + "epoch": 0.4973271847483038, + "grad_norm": 0.47320982813835144, + "learning_rate": 0.00010055498511808057, + "loss": 1.2786, + "step": 38272 + }, + { + "epoch": 0.4973401792922197, + "grad_norm": 0.5204222202301025, + "learning_rate": 0.00010055238565616919, + "loss": 1.2811, + "step": 38273 + }, + { + "epoch": 0.49735317383613553, + "grad_norm": 0.32358795404434204, + "learning_rate": 0.00010054978619425778, + "loss": 1.3961, + "step": 38274 + }, + { + "epoch": 0.49736616838005143, + "grad_norm": 0.35843977332115173, + "learning_rate": 0.0001005471867323464, + "loss": 1.4548, + "step": 38275 + }, + { + "epoch": 0.4973791629239673, + "grad_norm": 0.4199320375919342, + "learning_rate": 0.00010054458727043503, + "loss": 1.3252, + "step": 38276 + }, + { + "epoch": 0.4973921574678832, + "grad_norm": 0.4474913775920868, + "learning_rate": 0.00010054198780852364, + "loss": 1.4012, + "step": 38277 + }, + { + "epoch": 0.497405152011799, + "grad_norm": 0.37559759616851807, + "learning_rate": 0.00010053938834661225, + "loss": 1.2223, + "step": 38278 + }, + { + "epoch": 0.4974181465557149, + "grad_norm": 0.346578449010849, + "learning_rate": 0.00010053678888470086, + "loss": 1.3298, + "step": 38279 + }, + { + "epoch": 0.49743114109963077, + "grad_norm": 0.3738930821418762, + "learning_rate": 0.00010053418942278949, + "loss": 1.282, + "step": 38280 + }, + { + "epoch": 0.49744413564354667, + "grad_norm": 0.4037734866142273, + "learning_rate": 0.0001005315899608781, + "loss": 1.341, + "step": 38281 + }, + { + "epoch": 0.4974571301874625, + "grad_norm": 0.4619690179824829, + "learning_rate": 0.00010052899049896671, + "loss": 1.6006, + "step": 38282 + }, + { + "epoch": 0.4974701247313784, + "grad_norm": 0.3990549147129059, + "learning_rate": 0.00010052639103705535, + "loss": 1.4708, + "step": 38283 + }, + { + "epoch": 0.49748311927529426, + "grad_norm": 0.3018411099910736, + "learning_rate": 0.00010052379157514396, + "loss": 1.1721, + "step": 38284 + }, + { + "epoch": 0.49749611381921016, + "grad_norm": 0.42095980048179626, + "learning_rate": 0.00010052119211323257, + "loss": 1.3796, + "step": 38285 + }, + { + "epoch": 0.497509108363126, + "grad_norm": 0.4593113362789154, + "learning_rate": 0.00010051859265132117, + "loss": 1.37, + "step": 38286 + }, + { + "epoch": 0.4975221029070419, + "grad_norm": 0.4668925404548645, + "learning_rate": 0.0001005159931894098, + "loss": 1.5363, + "step": 38287 + }, + { + "epoch": 0.49753509745095775, + "grad_norm": 0.3969510793685913, + "learning_rate": 0.00010051339372749842, + "loss": 1.5789, + "step": 38288 + }, + { + "epoch": 0.49754809199487365, + "grad_norm": 0.44508442282676697, + "learning_rate": 0.00010051079426558703, + "loss": 1.3749, + "step": 38289 + }, + { + "epoch": 0.4975610865387895, + "grad_norm": 0.4621100127696991, + "learning_rate": 0.00010050819480367564, + "loss": 1.501, + "step": 38290 + }, + { + "epoch": 0.4975740810827054, + "grad_norm": 0.5140964388847351, + "learning_rate": 0.00010050559534176426, + "loss": 1.4981, + "step": 38291 + }, + { + "epoch": 0.49758707562662124, + "grad_norm": 0.47806936502456665, + "learning_rate": 0.00010050299587985287, + "loss": 1.3719, + "step": 38292 + }, + { + "epoch": 0.49760007017053715, + "grad_norm": 0.43101707100868225, + "learning_rate": 0.00010050039641794149, + "loss": 1.3017, + "step": 38293 + }, + { + "epoch": 0.49761306471445305, + "grad_norm": 0.3664492666721344, + "learning_rate": 0.0001004977969560301, + "loss": 1.4411, + "step": 38294 + }, + { + "epoch": 0.4976260592583689, + "grad_norm": 0.40567442774772644, + "learning_rate": 0.00010049519749411873, + "loss": 1.4569, + "step": 38295 + }, + { + "epoch": 0.4976390538022848, + "grad_norm": 0.36975419521331787, + "learning_rate": 0.00010049259803220735, + "loss": 1.431, + "step": 38296 + }, + { + "epoch": 0.49765204834620064, + "grad_norm": 0.350332111120224, + "learning_rate": 0.00010048999857029596, + "loss": 1.3976, + "step": 38297 + }, + { + "epoch": 0.49766504289011654, + "grad_norm": 0.5274938344955444, + "learning_rate": 0.00010048739910838457, + "loss": 1.4596, + "step": 38298 + }, + { + "epoch": 0.4976780374340324, + "grad_norm": 0.4731733202934265, + "learning_rate": 0.00010048479964647319, + "loss": 1.3106, + "step": 38299 + }, + { + "epoch": 0.4976910319779483, + "grad_norm": 0.378708153963089, + "learning_rate": 0.0001004822001845618, + "loss": 1.319, + "step": 38300 + }, + { + "epoch": 0.49770402652186413, + "grad_norm": 0.3066316545009613, + "learning_rate": 0.00010047960072265041, + "loss": 1.2844, + "step": 38301 + }, + { + "epoch": 0.49771702106578003, + "grad_norm": 0.38162386417388916, + "learning_rate": 0.00010047700126073902, + "loss": 1.5328, + "step": 38302 + }, + { + "epoch": 0.4977300156096959, + "grad_norm": 0.41711825132369995, + "learning_rate": 0.00010047440179882765, + "loss": 1.5256, + "step": 38303 + }, + { + "epoch": 0.4977430101536118, + "grad_norm": 0.4321335554122925, + "learning_rate": 0.00010047180233691626, + "loss": 1.3369, + "step": 38304 + }, + { + "epoch": 0.4977560046975276, + "grad_norm": 0.38317978382110596, + "learning_rate": 0.00010046920287500487, + "loss": 1.1886, + "step": 38305 + }, + { + "epoch": 0.4977689992414435, + "grad_norm": 0.43253058195114136, + "learning_rate": 0.00010046660341309348, + "loss": 1.3418, + "step": 38306 + }, + { + "epoch": 0.49778199378535937, + "grad_norm": 0.3355443775653839, + "learning_rate": 0.00010046400395118212, + "loss": 1.3033, + "step": 38307 + }, + { + "epoch": 0.49779498832927527, + "grad_norm": 0.34117597341537476, + "learning_rate": 0.00010046140448927073, + "loss": 1.2317, + "step": 38308 + }, + { + "epoch": 0.4978079828731911, + "grad_norm": 0.3505120575428009, + "learning_rate": 0.00010045880502735934, + "loss": 1.5223, + "step": 38309 + }, + { + "epoch": 0.497820977417107, + "grad_norm": 0.4736119508743286, + "learning_rate": 0.00010045620556544795, + "loss": 1.372, + "step": 38310 + }, + { + "epoch": 0.49783397196102286, + "grad_norm": 0.35333389043807983, + "learning_rate": 0.00010045360610353658, + "loss": 1.3972, + "step": 38311 + }, + { + "epoch": 0.49784696650493876, + "grad_norm": 0.3708445727825165, + "learning_rate": 0.00010045100664162519, + "loss": 1.2807, + "step": 38312 + }, + { + "epoch": 0.4978599610488546, + "grad_norm": 0.39415881037712097, + "learning_rate": 0.0001004484071797138, + "loss": 1.4524, + "step": 38313 + }, + { + "epoch": 0.4978729555927705, + "grad_norm": 0.3399812579154968, + "learning_rate": 0.00010044580771780241, + "loss": 1.1114, + "step": 38314 + }, + { + "epoch": 0.49788595013668635, + "grad_norm": 0.38117313385009766, + "learning_rate": 0.00010044320825589105, + "loss": 1.3756, + "step": 38315 + }, + { + "epoch": 0.49789894468060225, + "grad_norm": 0.3988131582736969, + "learning_rate": 0.00010044060879397965, + "loss": 1.4865, + "step": 38316 + }, + { + "epoch": 0.4979119392245181, + "grad_norm": 0.4566514492034912, + "learning_rate": 0.00010043800933206826, + "loss": 1.4258, + "step": 38317 + }, + { + "epoch": 0.497924933768434, + "grad_norm": 0.31859347224235535, + "learning_rate": 0.00010043540987015687, + "loss": 1.4005, + "step": 38318 + }, + { + "epoch": 0.49793792831234984, + "grad_norm": 0.41094642877578735, + "learning_rate": 0.0001004328104082455, + "loss": 1.2045, + "step": 38319 + }, + { + "epoch": 0.49795092285626574, + "grad_norm": 0.2256426066160202, + "learning_rate": 0.00010043021094633412, + "loss": 1.1688, + "step": 38320 + }, + { + "epoch": 0.4979639174001816, + "grad_norm": 0.4185001254081726, + "learning_rate": 0.00010042761148442273, + "loss": 1.2655, + "step": 38321 + }, + { + "epoch": 0.4979769119440975, + "grad_norm": 0.38996875286102295, + "learning_rate": 0.00010042501202251135, + "loss": 1.4205, + "step": 38322 + }, + { + "epoch": 0.49798990648801333, + "grad_norm": 0.3991868197917938, + "learning_rate": 0.00010042241256059996, + "loss": 1.1118, + "step": 38323 + }, + { + "epoch": 0.49800290103192923, + "grad_norm": 0.3766166865825653, + "learning_rate": 0.00010041981309868857, + "loss": 1.3052, + "step": 38324 + }, + { + "epoch": 0.4980158955758451, + "grad_norm": 0.447470486164093, + "learning_rate": 0.00010041721363677718, + "loss": 1.2146, + "step": 38325 + }, + { + "epoch": 0.498028890119761, + "grad_norm": 0.3612557351589203, + "learning_rate": 0.00010041461417486582, + "loss": 1.3292, + "step": 38326 + }, + { + "epoch": 0.4980418846636768, + "grad_norm": 0.43454593420028687, + "learning_rate": 0.00010041201471295443, + "loss": 1.3403, + "step": 38327 + }, + { + "epoch": 0.4980548792075927, + "grad_norm": 0.3371090888977051, + "learning_rate": 0.00010040941525104303, + "loss": 1.305, + "step": 38328 + }, + { + "epoch": 0.49806787375150857, + "grad_norm": 0.3942055404186249, + "learning_rate": 0.00010040681578913164, + "loss": 1.3997, + "step": 38329 + }, + { + "epoch": 0.4980808682954245, + "grad_norm": 0.44579729437828064, + "learning_rate": 0.00010040421632722028, + "loss": 1.5097, + "step": 38330 + }, + { + "epoch": 0.4980938628393403, + "grad_norm": 0.42233356833457947, + "learning_rate": 0.00010040161686530889, + "loss": 1.315, + "step": 38331 + }, + { + "epoch": 0.4981068573832562, + "grad_norm": 0.3737773001194, + "learning_rate": 0.0001003990174033975, + "loss": 1.5219, + "step": 38332 + }, + { + "epoch": 0.49811985192717206, + "grad_norm": 0.37200042605400085, + "learning_rate": 0.00010039641794148611, + "loss": 1.4068, + "step": 38333 + }, + { + "epoch": 0.49813284647108796, + "grad_norm": 0.4165978729724884, + "learning_rate": 0.00010039381847957474, + "loss": 1.344, + "step": 38334 + }, + { + "epoch": 0.4981458410150038, + "grad_norm": 0.40556177496910095, + "learning_rate": 0.00010039121901766335, + "loss": 1.5079, + "step": 38335 + }, + { + "epoch": 0.4981588355589197, + "grad_norm": 0.38753145933151245, + "learning_rate": 0.00010038861955575196, + "loss": 1.321, + "step": 38336 + }, + { + "epoch": 0.49817183010283556, + "grad_norm": 0.360279381275177, + "learning_rate": 0.00010038602009384057, + "loss": 1.3479, + "step": 38337 + }, + { + "epoch": 0.49818482464675146, + "grad_norm": 0.414491206407547, + "learning_rate": 0.00010038342063192921, + "loss": 1.365, + "step": 38338 + }, + { + "epoch": 0.4981978191906673, + "grad_norm": 0.4049168825149536, + "learning_rate": 0.00010038082117001782, + "loss": 1.2997, + "step": 38339 + }, + { + "epoch": 0.4982108137345832, + "grad_norm": 0.37992316484451294, + "learning_rate": 0.00010037822170810643, + "loss": 1.3229, + "step": 38340 + }, + { + "epoch": 0.49822380827849905, + "grad_norm": 0.4375469386577606, + "learning_rate": 0.00010037562224619503, + "loss": 1.4972, + "step": 38341 + }, + { + "epoch": 0.49823680282241495, + "grad_norm": 0.36174288392066956, + "learning_rate": 0.00010037302278428367, + "loss": 1.4114, + "step": 38342 + }, + { + "epoch": 0.4982497973663308, + "grad_norm": 0.34159186482429504, + "learning_rate": 0.00010037042332237228, + "loss": 1.2551, + "step": 38343 + }, + { + "epoch": 0.4982627919102467, + "grad_norm": 0.34516406059265137, + "learning_rate": 0.00010036782386046089, + "loss": 1.0434, + "step": 38344 + }, + { + "epoch": 0.49827578645416254, + "grad_norm": 0.39406538009643555, + "learning_rate": 0.0001003652243985495, + "loss": 1.4022, + "step": 38345 + }, + { + "epoch": 0.49828878099807844, + "grad_norm": 0.402893990278244, + "learning_rate": 0.00010036262493663812, + "loss": 1.3157, + "step": 38346 + }, + { + "epoch": 0.4983017755419943, + "grad_norm": 0.41723233461380005, + "learning_rate": 0.00010036002547472673, + "loss": 1.3904, + "step": 38347 + }, + { + "epoch": 0.4983147700859102, + "grad_norm": 0.4517935514450073, + "learning_rate": 0.00010035742601281534, + "loss": 1.4417, + "step": 38348 + }, + { + "epoch": 0.49832776462982603, + "grad_norm": 0.46642187237739563, + "learning_rate": 0.00010035482655090396, + "loss": 1.5132, + "step": 38349 + }, + { + "epoch": 0.49834075917374193, + "grad_norm": 0.3711899220943451, + "learning_rate": 0.0001003522270889926, + "loss": 1.2458, + "step": 38350 + }, + { + "epoch": 0.4983537537176578, + "grad_norm": 0.4891204237937927, + "learning_rate": 0.0001003496276270812, + "loss": 1.5312, + "step": 38351 + }, + { + "epoch": 0.4983667482615737, + "grad_norm": 0.33175399899482727, + "learning_rate": 0.00010034702816516982, + "loss": 1.3662, + "step": 38352 + }, + { + "epoch": 0.4983797428054895, + "grad_norm": 0.37135010957717896, + "learning_rate": 0.00010034442870325843, + "loss": 1.3684, + "step": 38353 + }, + { + "epoch": 0.4983927373494054, + "grad_norm": 0.32203182578086853, + "learning_rate": 0.00010034182924134705, + "loss": 1.5153, + "step": 38354 + }, + { + "epoch": 0.49840573189332127, + "grad_norm": 0.3203030526638031, + "learning_rate": 0.00010033922977943566, + "loss": 1.2636, + "step": 38355 + }, + { + "epoch": 0.49841872643723717, + "grad_norm": 0.34687569737434387, + "learning_rate": 0.00010033663031752427, + "loss": 1.3934, + "step": 38356 + }, + { + "epoch": 0.498431720981153, + "grad_norm": 0.34121039509773254, + "learning_rate": 0.00010033403085561291, + "loss": 1.4566, + "step": 38357 + }, + { + "epoch": 0.4984447155250689, + "grad_norm": 0.44214165210723877, + "learning_rate": 0.00010033143139370151, + "loss": 1.452, + "step": 38358 + }, + { + "epoch": 0.49845771006898476, + "grad_norm": 0.39845171570777893, + "learning_rate": 0.00010032883193179012, + "loss": 1.331, + "step": 38359 + }, + { + "epoch": 0.49847070461290066, + "grad_norm": 0.4112400412559509, + "learning_rate": 0.00010032623246987873, + "loss": 1.6181, + "step": 38360 + }, + { + "epoch": 0.4984836991568165, + "grad_norm": 0.36326760053634644, + "learning_rate": 0.00010032363300796737, + "loss": 1.4904, + "step": 38361 + }, + { + "epoch": 0.4984966937007324, + "grad_norm": 0.4529985189437866, + "learning_rate": 0.00010032103354605598, + "loss": 1.4531, + "step": 38362 + }, + { + "epoch": 0.49850968824464825, + "grad_norm": 0.3607131540775299, + "learning_rate": 0.00010031843408414459, + "loss": 1.17, + "step": 38363 + }, + { + "epoch": 0.49852268278856415, + "grad_norm": 0.4582368731498718, + "learning_rate": 0.0001003158346222332, + "loss": 1.4177, + "step": 38364 + }, + { + "epoch": 0.49853567733248, + "grad_norm": 0.36332833766937256, + "learning_rate": 0.00010031323516032183, + "loss": 1.4414, + "step": 38365 + }, + { + "epoch": 0.4985486718763959, + "grad_norm": 0.38659659028053284, + "learning_rate": 0.00010031063569841044, + "loss": 1.2598, + "step": 38366 + }, + { + "epoch": 0.49856166642031174, + "grad_norm": 0.502297580242157, + "learning_rate": 0.00010030803623649905, + "loss": 1.567, + "step": 38367 + }, + { + "epoch": 0.49857466096422765, + "grad_norm": 0.37501534819602966, + "learning_rate": 0.00010030543677458766, + "loss": 1.205, + "step": 38368 + }, + { + "epoch": 0.49858765550814355, + "grad_norm": 0.39188697934150696, + "learning_rate": 0.0001003028373126763, + "loss": 1.3353, + "step": 38369 + }, + { + "epoch": 0.4986006500520594, + "grad_norm": 0.4263515770435333, + "learning_rate": 0.0001003002378507649, + "loss": 1.4962, + "step": 38370 + }, + { + "epoch": 0.4986136445959753, + "grad_norm": 0.4459836184978485, + "learning_rate": 0.0001002976383888535, + "loss": 1.483, + "step": 38371 + }, + { + "epoch": 0.49862663913989114, + "grad_norm": 0.41951653361320496, + "learning_rate": 0.00010029503892694212, + "loss": 1.4803, + "step": 38372 + }, + { + "epoch": 0.49863963368380704, + "grad_norm": 0.5109822750091553, + "learning_rate": 0.00010029243946503075, + "loss": 1.4027, + "step": 38373 + }, + { + "epoch": 0.4986526282277229, + "grad_norm": 0.405787855386734, + "learning_rate": 0.00010028984000311936, + "loss": 1.2191, + "step": 38374 + }, + { + "epoch": 0.4986656227716388, + "grad_norm": 0.4110288918018341, + "learning_rate": 0.00010028724054120798, + "loss": 1.1934, + "step": 38375 + }, + { + "epoch": 0.49867861731555463, + "grad_norm": 0.42943495512008667, + "learning_rate": 0.00010028464107929659, + "loss": 1.4349, + "step": 38376 + }, + { + "epoch": 0.49869161185947053, + "grad_norm": 0.37985706329345703, + "learning_rate": 0.00010028204161738521, + "loss": 1.2487, + "step": 38377 + }, + { + "epoch": 0.4987046064033864, + "grad_norm": 0.480416476726532, + "learning_rate": 0.00010027944215547382, + "loss": 1.4297, + "step": 38378 + }, + { + "epoch": 0.4987176009473023, + "grad_norm": 0.3507551848888397, + "learning_rate": 0.00010027684269356243, + "loss": 1.3726, + "step": 38379 + }, + { + "epoch": 0.4987305954912181, + "grad_norm": 0.3495630919933319, + "learning_rate": 0.00010027424323165104, + "loss": 1.4046, + "step": 38380 + }, + { + "epoch": 0.498743590035134, + "grad_norm": 0.41064468026161194, + "learning_rate": 0.00010027164376973968, + "loss": 1.3079, + "step": 38381 + }, + { + "epoch": 0.49875658457904987, + "grad_norm": 0.37248387932777405, + "learning_rate": 0.00010026904430782829, + "loss": 1.583, + "step": 38382 + }, + { + "epoch": 0.49876957912296577, + "grad_norm": 0.5132301449775696, + "learning_rate": 0.00010026644484591689, + "loss": 1.3397, + "step": 38383 + }, + { + "epoch": 0.4987825736668816, + "grad_norm": 0.43373948335647583, + "learning_rate": 0.0001002638453840055, + "loss": 1.5919, + "step": 38384 + }, + { + "epoch": 0.4987955682107975, + "grad_norm": 0.42042919993400574, + "learning_rate": 0.00010026124592209414, + "loss": 1.2714, + "step": 38385 + }, + { + "epoch": 0.49880856275471336, + "grad_norm": 0.47053229808807373, + "learning_rate": 0.00010025864646018275, + "loss": 1.5696, + "step": 38386 + }, + { + "epoch": 0.49882155729862926, + "grad_norm": 0.3646489381790161, + "learning_rate": 0.00010025604699827136, + "loss": 1.4927, + "step": 38387 + }, + { + "epoch": 0.4988345518425451, + "grad_norm": 0.47750934958457947, + "learning_rate": 0.00010025344753635997, + "loss": 1.3547, + "step": 38388 + }, + { + "epoch": 0.498847546386461, + "grad_norm": 0.38673293590545654, + "learning_rate": 0.0001002508480744486, + "loss": 1.4759, + "step": 38389 + }, + { + "epoch": 0.49886054093037685, + "grad_norm": 0.37473753094673157, + "learning_rate": 0.00010024824861253721, + "loss": 1.5265, + "step": 38390 + }, + { + "epoch": 0.49887353547429275, + "grad_norm": 0.4663291871547699, + "learning_rate": 0.00010024564915062582, + "loss": 1.3869, + "step": 38391 + }, + { + "epoch": 0.4988865300182086, + "grad_norm": 0.45083218812942505, + "learning_rate": 0.00010024304968871443, + "loss": 1.5214, + "step": 38392 + }, + { + "epoch": 0.4988995245621245, + "grad_norm": 0.30985012650489807, + "learning_rate": 0.00010024045022680307, + "loss": 1.4202, + "step": 38393 + }, + { + "epoch": 0.49891251910604034, + "grad_norm": 0.2806839048862457, + "learning_rate": 0.00010023785076489168, + "loss": 1.2232, + "step": 38394 + }, + { + "epoch": 0.49892551364995624, + "grad_norm": 0.501163125038147, + "learning_rate": 0.00010023525130298029, + "loss": 1.4695, + "step": 38395 + }, + { + "epoch": 0.4989385081938721, + "grad_norm": 0.4245503544807434, + "learning_rate": 0.00010023265184106891, + "loss": 1.3073, + "step": 38396 + }, + { + "epoch": 0.498951502737788, + "grad_norm": 0.2749496102333069, + "learning_rate": 0.00010023005237915752, + "loss": 1.3263, + "step": 38397 + }, + { + "epoch": 0.49896449728170383, + "grad_norm": 0.5117831826210022, + "learning_rate": 0.00010022745291724613, + "loss": 1.5459, + "step": 38398 + }, + { + "epoch": 0.49897749182561973, + "grad_norm": 0.3722374439239502, + "learning_rate": 0.00010022485345533475, + "loss": 1.2086, + "step": 38399 + }, + { + "epoch": 0.4989904863695356, + "grad_norm": 0.2619899809360504, + "learning_rate": 0.00010022225399342337, + "loss": 1.3916, + "step": 38400 + }, + { + "epoch": 0.4990034809134515, + "grad_norm": 0.4068641662597656, + "learning_rate": 0.00010021965453151198, + "loss": 1.5562, + "step": 38401 + }, + { + "epoch": 0.4990164754573673, + "grad_norm": 0.44853901863098145, + "learning_rate": 0.00010021705506960059, + "loss": 1.3538, + "step": 38402 + }, + { + "epoch": 0.4990294700012832, + "grad_norm": 0.3799043893814087, + "learning_rate": 0.0001002144556076892, + "loss": 1.5034, + "step": 38403 + }, + { + "epoch": 0.49904246454519907, + "grad_norm": 0.3458322584629059, + "learning_rate": 0.00010021185614577784, + "loss": 1.3797, + "step": 38404 + }, + { + "epoch": 0.49905545908911497, + "grad_norm": 0.4032128155231476, + "learning_rate": 0.00010020925668386645, + "loss": 1.4429, + "step": 38405 + }, + { + "epoch": 0.4990684536330308, + "grad_norm": 0.4673705995082855, + "learning_rate": 0.00010020665722195506, + "loss": 1.2205, + "step": 38406 + }, + { + "epoch": 0.4990814481769467, + "grad_norm": 0.40213948488235474, + "learning_rate": 0.00010020405776004367, + "loss": 1.357, + "step": 38407 + }, + { + "epoch": 0.49909444272086256, + "grad_norm": 0.27652838826179504, + "learning_rate": 0.0001002014582981323, + "loss": 1.1771, + "step": 38408 + }, + { + "epoch": 0.49910743726477846, + "grad_norm": 0.3292090892791748, + "learning_rate": 0.00010019885883622091, + "loss": 1.2742, + "step": 38409 + }, + { + "epoch": 0.4991204318086943, + "grad_norm": 0.3631819486618042, + "learning_rate": 0.00010019625937430952, + "loss": 1.3741, + "step": 38410 + }, + { + "epoch": 0.4991334263526102, + "grad_norm": 0.34049108624458313, + "learning_rate": 0.00010019365991239813, + "loss": 1.3688, + "step": 38411 + }, + { + "epoch": 0.49914642089652606, + "grad_norm": 0.43600165843963623, + "learning_rate": 0.00010019106045048676, + "loss": 1.5146, + "step": 38412 + }, + { + "epoch": 0.49915941544044196, + "grad_norm": 0.4129011332988739, + "learning_rate": 0.00010018846098857537, + "loss": 1.4571, + "step": 38413 + }, + { + "epoch": 0.4991724099843578, + "grad_norm": 0.44655948877334595, + "learning_rate": 0.00010018586152666398, + "loss": 1.3895, + "step": 38414 + }, + { + "epoch": 0.4991854045282737, + "grad_norm": 0.437338650226593, + "learning_rate": 0.00010018326206475259, + "loss": 1.3987, + "step": 38415 + }, + { + "epoch": 0.49919839907218955, + "grad_norm": 0.3843490481376648, + "learning_rate": 0.00010018066260284123, + "loss": 1.2321, + "step": 38416 + }, + { + "epoch": 0.49921139361610545, + "grad_norm": 0.437357634305954, + "learning_rate": 0.00010017806314092984, + "loss": 1.4667, + "step": 38417 + }, + { + "epoch": 0.4992243881600213, + "grad_norm": 0.41078388690948486, + "learning_rate": 0.00010017546367901845, + "loss": 1.1576, + "step": 38418 + }, + { + "epoch": 0.4992373827039372, + "grad_norm": 0.3237869441509247, + "learning_rate": 0.00010017286421710706, + "loss": 1.3675, + "step": 38419 + }, + { + "epoch": 0.49925037724785304, + "grad_norm": 0.40829840302467346, + "learning_rate": 0.00010017026475519568, + "loss": 1.4013, + "step": 38420 + }, + { + "epoch": 0.49926337179176894, + "grad_norm": 0.4732895493507385, + "learning_rate": 0.0001001676652932843, + "loss": 1.2678, + "step": 38421 + }, + { + "epoch": 0.4992763663356848, + "grad_norm": 0.42385509610176086, + "learning_rate": 0.0001001650658313729, + "loss": 1.4338, + "step": 38422 + }, + { + "epoch": 0.4992893608796007, + "grad_norm": 0.5055204033851624, + "learning_rate": 0.00010016246636946152, + "loss": 1.2829, + "step": 38423 + }, + { + "epoch": 0.49930235542351653, + "grad_norm": 0.4948206841945648, + "learning_rate": 0.00010015986690755015, + "loss": 1.4836, + "step": 38424 + }, + { + "epoch": 0.49931534996743243, + "grad_norm": 0.3998386263847351, + "learning_rate": 0.00010015726744563875, + "loss": 1.4044, + "step": 38425 + }, + { + "epoch": 0.4993283445113483, + "grad_norm": 0.3959960341453552, + "learning_rate": 0.00010015466798372736, + "loss": 1.5296, + "step": 38426 + }, + { + "epoch": 0.4993413390552642, + "grad_norm": 0.40329161286354065, + "learning_rate": 0.00010015206852181597, + "loss": 1.4067, + "step": 38427 + }, + { + "epoch": 0.49935433359918, + "grad_norm": 0.42405247688293457, + "learning_rate": 0.00010014946905990461, + "loss": 1.5782, + "step": 38428 + }, + { + "epoch": 0.4993673281430959, + "grad_norm": 0.3255196213722229, + "learning_rate": 0.00010014686959799322, + "loss": 1.3941, + "step": 38429 + }, + { + "epoch": 0.49938032268701177, + "grad_norm": 0.36559823155403137, + "learning_rate": 0.00010014427013608183, + "loss": 1.4683, + "step": 38430 + }, + { + "epoch": 0.49939331723092767, + "grad_norm": 0.4623715281486511, + "learning_rate": 0.00010014167067417046, + "loss": 1.49, + "step": 38431 + }, + { + "epoch": 0.4994063117748435, + "grad_norm": 0.41243699193000793, + "learning_rate": 0.00010013907121225907, + "loss": 1.2236, + "step": 38432 + }, + { + "epoch": 0.4994193063187594, + "grad_norm": 0.39457741379737854, + "learning_rate": 0.00010013647175034768, + "loss": 1.3908, + "step": 38433 + }, + { + "epoch": 0.49943230086267526, + "grad_norm": 0.38218870759010315, + "learning_rate": 0.00010013387228843629, + "loss": 1.5183, + "step": 38434 + }, + { + "epoch": 0.49944529540659116, + "grad_norm": 0.47133591771125793, + "learning_rate": 0.00010013127282652493, + "loss": 1.3844, + "step": 38435 + }, + { + "epoch": 0.499458289950507, + "grad_norm": 0.3822469711303711, + "learning_rate": 0.00010012867336461354, + "loss": 1.3654, + "step": 38436 + }, + { + "epoch": 0.4994712844944229, + "grad_norm": 0.4620053172111511, + "learning_rate": 0.00010012607390270215, + "loss": 1.3816, + "step": 38437 + }, + { + "epoch": 0.49948427903833875, + "grad_norm": 0.4246847331523895, + "learning_rate": 0.00010012347444079075, + "loss": 1.3448, + "step": 38438 + }, + { + "epoch": 0.49949727358225465, + "grad_norm": 0.40838822722435, + "learning_rate": 0.00010012087497887939, + "loss": 1.2657, + "step": 38439 + }, + { + "epoch": 0.4995102681261705, + "grad_norm": 0.48224392533302307, + "learning_rate": 0.000100118275516968, + "loss": 1.4957, + "step": 38440 + }, + { + "epoch": 0.4995232626700864, + "grad_norm": 0.4037822186946869, + "learning_rate": 0.00010011567605505661, + "loss": 1.3534, + "step": 38441 + }, + { + "epoch": 0.49953625721400224, + "grad_norm": 0.45346084237098694, + "learning_rate": 0.00010011307659314522, + "loss": 1.5174, + "step": 38442 + }, + { + "epoch": 0.49954925175791814, + "grad_norm": 0.45121335983276367, + "learning_rate": 0.00010011047713123384, + "loss": 1.5282, + "step": 38443 + }, + { + "epoch": 0.499562246301834, + "grad_norm": 0.39843347668647766, + "learning_rate": 0.00010010787766932245, + "loss": 1.3467, + "step": 38444 + }, + { + "epoch": 0.4995752408457499, + "grad_norm": 0.3001526892185211, + "learning_rate": 0.00010010527820741107, + "loss": 1.2267, + "step": 38445 + }, + { + "epoch": 0.4995882353896658, + "grad_norm": 0.3839610517024994, + "learning_rate": 0.00010010267874549968, + "loss": 1.3876, + "step": 38446 + }, + { + "epoch": 0.49960122993358164, + "grad_norm": 0.3017396330833435, + "learning_rate": 0.00010010007928358831, + "loss": 1.3724, + "step": 38447 + }, + { + "epoch": 0.49961422447749754, + "grad_norm": 0.33791786432266235, + "learning_rate": 0.00010009747982167693, + "loss": 1.2568, + "step": 38448 + }, + { + "epoch": 0.4996272190214134, + "grad_norm": 0.575527012348175, + "learning_rate": 0.00010009488035976554, + "loss": 1.5191, + "step": 38449 + }, + { + "epoch": 0.4996402135653293, + "grad_norm": 0.32862597703933716, + "learning_rate": 0.00010009228089785413, + "loss": 1.4963, + "step": 38450 + }, + { + "epoch": 0.49965320810924513, + "grad_norm": 0.42390745878219604, + "learning_rate": 0.00010008968143594277, + "loss": 1.2572, + "step": 38451 + }, + { + "epoch": 0.49966620265316103, + "grad_norm": 0.38366666436195374, + "learning_rate": 0.00010008708197403138, + "loss": 1.4559, + "step": 38452 + }, + { + "epoch": 0.4996791971970769, + "grad_norm": 0.35566309094429016, + "learning_rate": 0.00010008448251212, + "loss": 1.4391, + "step": 38453 + }, + { + "epoch": 0.4996921917409928, + "grad_norm": 0.48613864183425903, + "learning_rate": 0.0001000818830502086, + "loss": 1.4462, + "step": 38454 + }, + { + "epoch": 0.4997051862849086, + "grad_norm": 0.45702236890792847, + "learning_rate": 0.00010007928358829723, + "loss": 1.4134, + "step": 38455 + }, + { + "epoch": 0.4997181808288245, + "grad_norm": 0.4362688362598419, + "learning_rate": 0.00010007668412638584, + "loss": 1.4077, + "step": 38456 + }, + { + "epoch": 0.49973117537274037, + "grad_norm": 0.40399742126464844, + "learning_rate": 0.00010007408466447445, + "loss": 1.3296, + "step": 38457 + }, + { + "epoch": 0.49974416991665627, + "grad_norm": 0.36409419775009155, + "learning_rate": 0.00010007148520256306, + "loss": 1.2316, + "step": 38458 + }, + { + "epoch": 0.4997571644605721, + "grad_norm": 0.3736996054649353, + "learning_rate": 0.0001000688857406517, + "loss": 1.2609, + "step": 38459 + }, + { + "epoch": 0.499770159004488, + "grad_norm": 0.5145350694656372, + "learning_rate": 0.00010006628627874031, + "loss": 1.4305, + "step": 38460 + }, + { + "epoch": 0.49978315354840386, + "grad_norm": 0.5018911361694336, + "learning_rate": 0.00010006368681682892, + "loss": 1.3918, + "step": 38461 + }, + { + "epoch": 0.49979614809231976, + "grad_norm": 0.38357535004615784, + "learning_rate": 0.00010006108735491753, + "loss": 1.2867, + "step": 38462 + }, + { + "epoch": 0.4998091426362356, + "grad_norm": 0.47817549109458923, + "learning_rate": 0.00010005848789300616, + "loss": 1.3797, + "step": 38463 + }, + { + "epoch": 0.4998221371801515, + "grad_norm": 0.5503483414649963, + "learning_rate": 0.00010005588843109477, + "loss": 1.546, + "step": 38464 + }, + { + "epoch": 0.49983513172406735, + "grad_norm": 0.46716955304145813, + "learning_rate": 0.00010005328896918338, + "loss": 1.4489, + "step": 38465 + }, + { + "epoch": 0.49984812626798325, + "grad_norm": 0.3614872694015503, + "learning_rate": 0.00010005068950727199, + "loss": 1.1966, + "step": 38466 + }, + { + "epoch": 0.4998611208118991, + "grad_norm": 0.36548638343811035, + "learning_rate": 0.00010004809004536061, + "loss": 1.3136, + "step": 38467 + }, + { + "epoch": 0.499874115355815, + "grad_norm": 0.3795667290687561, + "learning_rate": 0.00010004549058344923, + "loss": 1.1901, + "step": 38468 + }, + { + "epoch": 0.49988710989973084, + "grad_norm": 0.4318860173225403, + "learning_rate": 0.00010004289112153784, + "loss": 1.2734, + "step": 38469 + }, + { + "epoch": 0.49990010444364674, + "grad_norm": 0.3916262686252594, + "learning_rate": 0.00010004029165962647, + "loss": 1.383, + "step": 38470 + }, + { + "epoch": 0.4999130989875626, + "grad_norm": 0.45661166310310364, + "learning_rate": 0.00010003769219771509, + "loss": 1.4667, + "step": 38471 + }, + { + "epoch": 0.4999260935314785, + "grad_norm": 0.3498865067958832, + "learning_rate": 0.0001000350927358037, + "loss": 1.4206, + "step": 38472 + }, + { + "epoch": 0.49993908807539433, + "grad_norm": 0.41647881269454956, + "learning_rate": 0.00010003249327389231, + "loss": 1.072, + "step": 38473 + }, + { + "epoch": 0.49995208261931023, + "grad_norm": 0.3985632658004761, + "learning_rate": 0.00010002989381198093, + "loss": 1.4362, + "step": 38474 + }, + { + "epoch": 0.4999650771632261, + "grad_norm": 0.41346773505210876, + "learning_rate": 0.00010002729435006954, + "loss": 1.3832, + "step": 38475 + }, + { + "epoch": 0.499978071707142, + "grad_norm": 0.3916938900947571, + "learning_rate": 0.00010002469488815815, + "loss": 1.3308, + "step": 38476 + }, + { + "epoch": 0.4999910662510578, + "grad_norm": 0.40697628259658813, + "learning_rate": 0.00010002209542624676, + "loss": 1.4365, + "step": 38477 + }, + { + "epoch": 0.5000040607949737, + "grad_norm": 0.39586490392684937, + "learning_rate": 0.0001000194959643354, + "loss": 1.3116, + "step": 38478 + }, + { + "epoch": 0.5000170553388896, + "grad_norm": 0.4302930533885956, + "learning_rate": 0.00010001689650242401, + "loss": 1.2871, + "step": 38479 + }, + { + "epoch": 0.5000300498828054, + "grad_norm": 0.39699772000312805, + "learning_rate": 0.00010001429704051261, + "loss": 1.4254, + "step": 38480 + }, + { + "epoch": 0.5000430444267213, + "grad_norm": 0.33105841279029846, + "learning_rate": 0.00010001169757860122, + "loss": 1.3135, + "step": 38481 + }, + { + "epoch": 0.5000560389706372, + "grad_norm": 0.363977313041687, + "learning_rate": 0.00010000909811668986, + "loss": 1.1808, + "step": 38482 + }, + { + "epoch": 0.5000690335145531, + "grad_norm": 0.3896322548389435, + "learning_rate": 0.00010000649865477847, + "loss": 1.2437, + "step": 38483 + }, + { + "epoch": 0.5000820280584689, + "grad_norm": 0.4553676247596741, + "learning_rate": 0.00010000389919286708, + "loss": 1.3405, + "step": 38484 + }, + { + "epoch": 0.5000950226023848, + "grad_norm": 0.31064674258232117, + "learning_rate": 0.00010000129973095569, + "loss": 1.3983, + "step": 38485 + }, + { + "epoch": 0.5001080171463007, + "grad_norm": 0.5178108811378479, + "learning_rate": 9.99987002690443e-05, + "loss": 1.3524, + "step": 38486 + }, + { + "epoch": 0.5001210116902166, + "grad_norm": 0.4099293351173401, + "learning_rate": 9.999610080713293e-05, + "loss": 1.4344, + "step": 38487 + }, + { + "epoch": 0.5001340062341324, + "grad_norm": 0.34738999605178833, + "learning_rate": 9.999350134522154e-05, + "loss": 1.1937, + "step": 38488 + }, + { + "epoch": 0.5001470007780483, + "grad_norm": 0.4242170751094818, + "learning_rate": 9.999090188331016e-05, + "loss": 1.5287, + "step": 38489 + }, + { + "epoch": 0.5001599953219642, + "grad_norm": 0.36498019099235535, + "learning_rate": 9.998830242139877e-05, + "loss": 1.3708, + "step": 38490 + }, + { + "epoch": 0.5001729898658801, + "grad_norm": 0.37078672647476196, + "learning_rate": 9.99857029594874e-05, + "loss": 1.3509, + "step": 38491 + }, + { + "epoch": 0.5001859844097959, + "grad_norm": 0.4015561044216156, + "learning_rate": 9.9983103497576e-05, + "loss": 1.4638, + "step": 38492 + }, + { + "epoch": 0.5001989789537118, + "grad_norm": 0.39904215931892395, + "learning_rate": 9.998050403566462e-05, + "loss": 1.1768, + "step": 38493 + }, + { + "epoch": 0.5002119734976277, + "grad_norm": 0.5054426193237305, + "learning_rate": 9.997790457375323e-05, + "loss": 1.5331, + "step": 38494 + }, + { + "epoch": 0.5002249680415436, + "grad_norm": 0.39435285329818726, + "learning_rate": 9.997530511184186e-05, + "loss": 1.4728, + "step": 38495 + }, + { + "epoch": 0.5002379625854594, + "grad_norm": 0.3388907015323639, + "learning_rate": 9.997270564993047e-05, + "loss": 1.4266, + "step": 38496 + }, + { + "epoch": 0.5002509571293753, + "grad_norm": 0.4165821373462677, + "learning_rate": 9.997010618801909e-05, + "loss": 1.3252, + "step": 38497 + }, + { + "epoch": 0.5002639516732912, + "grad_norm": 0.42864224314689636, + "learning_rate": 9.99675067261077e-05, + "loss": 1.3126, + "step": 38498 + }, + { + "epoch": 0.5002769462172071, + "grad_norm": 0.3871716558933258, + "learning_rate": 9.996490726419631e-05, + "loss": 1.3973, + "step": 38499 + }, + { + "epoch": 0.5002899407611229, + "grad_norm": 0.5376816987991333, + "learning_rate": 9.996230780228492e-05, + "loss": 1.5091, + "step": 38500 + }, + { + "epoch": 0.5003029353050388, + "grad_norm": 0.4054618775844574, + "learning_rate": 9.995970834037355e-05, + "loss": 1.3816, + "step": 38501 + }, + { + "epoch": 0.5003159298489547, + "grad_norm": 0.42202767729759216, + "learning_rate": 9.995710887846216e-05, + "loss": 1.4288, + "step": 38502 + }, + { + "epoch": 0.5003289243928706, + "grad_norm": 0.48954129219055176, + "learning_rate": 9.995450941655078e-05, + "loss": 1.3647, + "step": 38503 + }, + { + "epoch": 0.5003419189367864, + "grad_norm": 0.3736002445220947, + "learning_rate": 9.99519099546394e-05, + "loss": 1.3228, + "step": 38504 + }, + { + "epoch": 0.5003549134807023, + "grad_norm": 0.4459300935268402, + "learning_rate": 9.9949310492728e-05, + "loss": 1.3371, + "step": 38505 + }, + { + "epoch": 0.5003679080246182, + "grad_norm": 0.48064523935317993, + "learning_rate": 9.994671103081663e-05, + "loss": 1.322, + "step": 38506 + }, + { + "epoch": 0.5003809025685341, + "grad_norm": 0.39444148540496826, + "learning_rate": 9.994411156890524e-05, + "loss": 1.4052, + "step": 38507 + }, + { + "epoch": 0.5003938971124499, + "grad_norm": 0.4147461950778961, + "learning_rate": 9.994151210699387e-05, + "loss": 1.4575, + "step": 38508 + }, + { + "epoch": 0.5004068916563658, + "grad_norm": 0.33071663975715637, + "learning_rate": 9.993891264508248e-05, + "loss": 1.3274, + "step": 38509 + }, + { + "epoch": 0.5004198862002817, + "grad_norm": 0.38374415040016174, + "learning_rate": 9.993631318317109e-05, + "loss": 1.3927, + "step": 38510 + }, + { + "epoch": 0.5004328807441976, + "grad_norm": 0.3648830056190491, + "learning_rate": 9.99337137212597e-05, + "loss": 1.4289, + "step": 38511 + }, + { + "epoch": 0.5004458752881134, + "grad_norm": 0.43707484006881714, + "learning_rate": 9.993111425934832e-05, + "loss": 1.4538, + "step": 38512 + }, + { + "epoch": 0.5004588698320293, + "grad_norm": 0.4712204933166504, + "learning_rate": 9.992851479743693e-05, + "loss": 1.312, + "step": 38513 + }, + { + "epoch": 0.5004718643759452, + "grad_norm": 0.36538490653038025, + "learning_rate": 9.992591533552556e-05, + "loss": 1.1525, + "step": 38514 + }, + { + "epoch": 0.500484858919861, + "grad_norm": 0.46205270290374756, + "learning_rate": 9.992331587361417e-05, + "loss": 1.3015, + "step": 38515 + }, + { + "epoch": 0.5004978534637768, + "grad_norm": 0.39334115386009216, + "learning_rate": 9.992071641170278e-05, + "loss": 1.455, + "step": 38516 + }, + { + "epoch": 0.5005108480076927, + "grad_norm": 0.38330259919166565, + "learning_rate": 9.991811694979139e-05, + "loss": 1.2619, + "step": 38517 + }, + { + "epoch": 0.5005238425516086, + "grad_norm": 0.3695862591266632, + "learning_rate": 9.991551748788002e-05, + "loss": 1.395, + "step": 38518 + }, + { + "epoch": 0.5005368370955245, + "grad_norm": 0.3695862591266632, + "learning_rate": 9.991291802596863e-05, + "loss": 1.5547, + "step": 38519 + }, + { + "epoch": 0.5005498316394403, + "grad_norm": 0.34565216302871704, + "learning_rate": 9.991031856405725e-05, + "loss": 1.3367, + "step": 38520 + }, + { + "epoch": 0.5005628261833562, + "grad_norm": 0.3844248950481415, + "learning_rate": 9.990771910214586e-05, + "loss": 1.3098, + "step": 38521 + }, + { + "epoch": 0.5005758207272721, + "grad_norm": 0.4144669771194458, + "learning_rate": 9.990511964023447e-05, + "loss": 1.6605, + "step": 38522 + }, + { + "epoch": 0.500588815271188, + "grad_norm": 0.43518581986427307, + "learning_rate": 9.990252017832308e-05, + "loss": 1.4268, + "step": 38523 + }, + { + "epoch": 0.5006018098151039, + "grad_norm": 0.4018695652484894, + "learning_rate": 9.989992071641171e-05, + "loss": 1.4326, + "step": 38524 + }, + { + "epoch": 0.5006148043590197, + "grad_norm": 0.4548830986022949, + "learning_rate": 9.989732125450032e-05, + "loss": 1.4517, + "step": 38525 + }, + { + "epoch": 0.5006277989029356, + "grad_norm": 0.44514504075050354, + "learning_rate": 9.989472179258894e-05, + "loss": 1.8069, + "step": 38526 + }, + { + "epoch": 0.5006407934468515, + "grad_norm": 0.3615792393684387, + "learning_rate": 9.989212233067755e-05, + "loss": 1.301, + "step": 38527 + }, + { + "epoch": 0.5006537879907674, + "grad_norm": 0.3245408535003662, + "learning_rate": 9.988952286876617e-05, + "loss": 1.3064, + "step": 38528 + }, + { + "epoch": 0.5006667825346832, + "grad_norm": 0.4558870494365692, + "learning_rate": 9.988692340685478e-05, + "loss": 1.4604, + "step": 38529 + }, + { + "epoch": 0.5006797770785991, + "grad_norm": 0.46221646666526794, + "learning_rate": 9.98843239449434e-05, + "loss": 1.5992, + "step": 38530 + }, + { + "epoch": 0.500692771622515, + "grad_norm": 0.38172295689582825, + "learning_rate": 9.988172448303201e-05, + "loss": 1.3839, + "step": 38531 + }, + { + "epoch": 0.5007057661664309, + "grad_norm": 0.4264618158340454, + "learning_rate": 9.987912502112064e-05, + "loss": 1.3797, + "step": 38532 + }, + { + "epoch": 0.5007187607103467, + "grad_norm": 0.32991787791252136, + "learning_rate": 9.987652555920925e-05, + "loss": 1.2988, + "step": 38533 + }, + { + "epoch": 0.5007317552542626, + "grad_norm": 0.38463470339775085, + "learning_rate": 9.987392609729786e-05, + "loss": 1.2009, + "step": 38534 + }, + { + "epoch": 0.5007447497981785, + "grad_norm": 0.430191308259964, + "learning_rate": 9.987132663538647e-05, + "loss": 1.3779, + "step": 38535 + }, + { + "epoch": 0.5007577443420944, + "grad_norm": 0.4753797948360443, + "learning_rate": 9.98687271734751e-05, + "loss": 1.3091, + "step": 38536 + }, + { + "epoch": 0.5007707388860102, + "grad_norm": 0.4227485954761505, + "learning_rate": 9.98661277115637e-05, + "loss": 1.6048, + "step": 38537 + }, + { + "epoch": 0.5007837334299261, + "grad_norm": 0.5142156481742859, + "learning_rate": 9.986352824965233e-05, + "loss": 1.4795, + "step": 38538 + }, + { + "epoch": 0.500796727973842, + "grad_norm": 0.44958633184432983, + "learning_rate": 9.986092878774094e-05, + "loss": 1.471, + "step": 38539 + }, + { + "epoch": 0.5008097225177579, + "grad_norm": 0.39768242835998535, + "learning_rate": 9.985832932582956e-05, + "loss": 1.2665, + "step": 38540 + }, + { + "epoch": 0.5008227170616737, + "grad_norm": 0.4176120162010193, + "learning_rate": 9.985572986391816e-05, + "loss": 1.5671, + "step": 38541 + }, + { + "epoch": 0.5008357116055896, + "grad_norm": 0.5168113112449646, + "learning_rate": 9.985313040200679e-05, + "loss": 1.2514, + "step": 38542 + }, + { + "epoch": 0.5008487061495055, + "grad_norm": 0.46453577280044556, + "learning_rate": 9.985053094009541e-05, + "loss": 1.2353, + "step": 38543 + }, + { + "epoch": 0.5008617006934214, + "grad_norm": 0.4418941140174866, + "learning_rate": 9.984793147818402e-05, + "loss": 1.4894, + "step": 38544 + }, + { + "epoch": 0.5008746952373372, + "grad_norm": 0.40218988060951233, + "learning_rate": 9.984533201627265e-05, + "loss": 1.3141, + "step": 38545 + }, + { + "epoch": 0.5008876897812531, + "grad_norm": 0.4375206232070923, + "learning_rate": 9.984273255436126e-05, + "loss": 1.1583, + "step": 38546 + }, + { + "epoch": 0.500900684325169, + "grad_norm": 0.45491257309913635, + "learning_rate": 9.984013309244987e-05, + "loss": 1.4359, + "step": 38547 + }, + { + "epoch": 0.5009136788690849, + "grad_norm": 0.4308406412601471, + "learning_rate": 9.983753363053848e-05, + "loss": 1.5101, + "step": 38548 + }, + { + "epoch": 0.5009266734130007, + "grad_norm": 0.25547799468040466, + "learning_rate": 9.98349341686271e-05, + "loss": 1.2887, + "step": 38549 + }, + { + "epoch": 0.5009396679569166, + "grad_norm": 0.3874829411506653, + "learning_rate": 9.983233470671571e-05, + "loss": 1.2919, + "step": 38550 + }, + { + "epoch": 0.5009526625008325, + "grad_norm": 0.47302722930908203, + "learning_rate": 9.982973524480434e-05, + "loss": 1.2919, + "step": 38551 + }, + { + "epoch": 0.5009656570447484, + "grad_norm": 0.2810894548892975, + "learning_rate": 9.982713578289295e-05, + "loss": 1.3824, + "step": 38552 + }, + { + "epoch": 0.5009786515886642, + "grad_norm": 0.43941783905029297, + "learning_rate": 9.982453632098156e-05, + "loss": 1.2925, + "step": 38553 + }, + { + "epoch": 0.5009916461325801, + "grad_norm": 0.40376999974250793, + "learning_rate": 9.982193685907017e-05, + "loss": 1.5715, + "step": 38554 + }, + { + "epoch": 0.501004640676496, + "grad_norm": 0.3744410574436188, + "learning_rate": 9.98193373971588e-05, + "loss": 1.3405, + "step": 38555 + }, + { + "epoch": 0.5010176352204119, + "grad_norm": 0.44319969415664673, + "learning_rate": 9.981673793524741e-05, + "loss": 1.4598, + "step": 38556 + }, + { + "epoch": 0.5010306297643277, + "grad_norm": 0.3882030248641968, + "learning_rate": 9.981413847333603e-05, + "loss": 1.1661, + "step": 38557 + }, + { + "epoch": 0.5010436243082436, + "grad_norm": 0.37690043449401855, + "learning_rate": 9.981153901142464e-05, + "loss": 1.5137, + "step": 38558 + }, + { + "epoch": 0.5010566188521595, + "grad_norm": 0.38836580514907837, + "learning_rate": 9.980893954951325e-05, + "loss": 1.3738, + "step": 38559 + }, + { + "epoch": 0.5010696133960754, + "grad_norm": 0.41097086668014526, + "learning_rate": 9.980634008760186e-05, + "loss": 1.5838, + "step": 38560 + }, + { + "epoch": 0.5010826079399912, + "grad_norm": 0.4168701469898224, + "learning_rate": 9.980374062569049e-05, + "loss": 1.3225, + "step": 38561 + }, + { + "epoch": 0.501095602483907, + "grad_norm": 0.3650151789188385, + "learning_rate": 9.98011411637791e-05, + "loss": 1.7385, + "step": 38562 + }, + { + "epoch": 0.501108597027823, + "grad_norm": 0.4143138825893402, + "learning_rate": 9.979854170186772e-05, + "loss": 1.3563, + "step": 38563 + }, + { + "epoch": 0.5011215915717389, + "grad_norm": 0.34220296144485474, + "learning_rate": 9.979594223995634e-05, + "loss": 1.484, + "step": 38564 + }, + { + "epoch": 0.5011345861156546, + "grad_norm": 0.38319987058639526, + "learning_rate": 9.979334277804495e-05, + "loss": 1.3797, + "step": 38565 + }, + { + "epoch": 0.5011475806595705, + "grad_norm": 0.3852391839027405, + "learning_rate": 9.979074331613356e-05, + "loss": 1.2817, + "step": 38566 + }, + { + "epoch": 0.5011605752034864, + "grad_norm": 0.39859554171562195, + "learning_rate": 9.978814385422218e-05, + "loss": 1.3184, + "step": 38567 + }, + { + "epoch": 0.5011735697474023, + "grad_norm": 0.5399718284606934, + "learning_rate": 9.978554439231079e-05, + "loss": 1.2967, + "step": 38568 + }, + { + "epoch": 0.5011865642913181, + "grad_norm": 0.45720845460891724, + "learning_rate": 9.978294493039942e-05, + "loss": 1.4042, + "step": 38569 + }, + { + "epoch": 0.501199558835234, + "grad_norm": 0.42943382263183594, + "learning_rate": 9.978034546848803e-05, + "loss": 1.4733, + "step": 38570 + }, + { + "epoch": 0.5012125533791499, + "grad_norm": 0.3036157786846161, + "learning_rate": 9.977774600657664e-05, + "loss": 1.202, + "step": 38571 + }, + { + "epoch": 0.5012255479230658, + "grad_norm": 0.434392511844635, + "learning_rate": 9.977514654466525e-05, + "loss": 1.429, + "step": 38572 + }, + { + "epoch": 0.5012385424669816, + "grad_norm": 0.42071211338043213, + "learning_rate": 9.977254708275387e-05, + "loss": 1.3919, + "step": 38573 + }, + { + "epoch": 0.5012515370108975, + "grad_norm": 0.34020864963531494, + "learning_rate": 9.976994762084249e-05, + "loss": 1.4991, + "step": 38574 + }, + { + "epoch": 0.5012645315548134, + "grad_norm": 0.36502739787101746, + "learning_rate": 9.976734815893111e-05, + "loss": 1.4106, + "step": 38575 + }, + { + "epoch": 0.5012775260987293, + "grad_norm": 0.28484347462654114, + "learning_rate": 9.976474869701972e-05, + "loss": 1.3428, + "step": 38576 + }, + { + "epoch": 0.5012905206426451, + "grad_norm": 0.448452353477478, + "learning_rate": 9.976214923510833e-05, + "loss": 1.2886, + "step": 38577 + }, + { + "epoch": 0.501303515186561, + "grad_norm": 0.4120859205722809, + "learning_rate": 9.975954977319694e-05, + "loss": 1.2847, + "step": 38578 + }, + { + "epoch": 0.5013165097304769, + "grad_norm": 0.3343343436717987, + "learning_rate": 9.975695031128557e-05, + "loss": 1.2084, + "step": 38579 + }, + { + "epoch": 0.5013295042743928, + "grad_norm": 0.344948410987854, + "learning_rate": 9.975435084937419e-05, + "loss": 1.2986, + "step": 38580 + }, + { + "epoch": 0.5013424988183086, + "grad_norm": 0.42767760157585144, + "learning_rate": 9.97517513874628e-05, + "loss": 1.3068, + "step": 38581 + }, + { + "epoch": 0.5013554933622245, + "grad_norm": 0.376751571893692, + "learning_rate": 9.974915192555143e-05, + "loss": 1.3869, + "step": 38582 + }, + { + "epoch": 0.5013684879061404, + "grad_norm": 0.44073906540870667, + "learning_rate": 9.974655246364002e-05, + "loss": 1.3422, + "step": 38583 + }, + { + "epoch": 0.5013814824500563, + "grad_norm": 0.34657928347587585, + "learning_rate": 9.974395300172865e-05, + "loss": 1.5945, + "step": 38584 + }, + { + "epoch": 0.5013944769939721, + "grad_norm": 0.4015274941921234, + "learning_rate": 9.974135353981726e-05, + "loss": 1.3947, + "step": 38585 + }, + { + "epoch": 0.501407471537888, + "grad_norm": 0.42863163352012634, + "learning_rate": 9.973875407790588e-05, + "loss": 1.4117, + "step": 38586 + }, + { + "epoch": 0.5014204660818039, + "grad_norm": 0.436213880777359, + "learning_rate": 9.97361546159945e-05, + "loss": 1.4672, + "step": 38587 + }, + { + "epoch": 0.5014334606257198, + "grad_norm": 0.382867693901062, + "learning_rate": 9.973355515408312e-05, + "loss": 1.3857, + "step": 38588 + }, + { + "epoch": 0.5014464551696356, + "grad_norm": 0.4462325870990753, + "learning_rate": 9.973095569217172e-05, + "loss": 1.5183, + "step": 38589 + }, + { + "epoch": 0.5014594497135515, + "grad_norm": 0.34867414832115173, + "learning_rate": 9.972835623026034e-05, + "loss": 1.3722, + "step": 38590 + }, + { + "epoch": 0.5014724442574674, + "grad_norm": 0.4105249047279358, + "learning_rate": 9.972575676834895e-05, + "loss": 1.2695, + "step": 38591 + }, + { + "epoch": 0.5014854388013833, + "grad_norm": 0.43936577439308167, + "learning_rate": 9.972315730643758e-05, + "loss": 1.3983, + "step": 38592 + }, + { + "epoch": 0.5014984333452991, + "grad_norm": 0.4319203197956085, + "learning_rate": 9.972055784452619e-05, + "loss": 1.5116, + "step": 38593 + }, + { + "epoch": 0.501511427889215, + "grad_norm": 0.433403342962265, + "learning_rate": 9.971795838261481e-05, + "loss": 1.4435, + "step": 38594 + }, + { + "epoch": 0.5015244224331309, + "grad_norm": 0.35866764187812805, + "learning_rate": 9.971535892070341e-05, + "loss": 1.3241, + "step": 38595 + }, + { + "epoch": 0.5015374169770468, + "grad_norm": 0.41441044211387634, + "learning_rate": 9.971275945879203e-05, + "loss": 1.3137, + "step": 38596 + }, + { + "epoch": 0.5015504115209627, + "grad_norm": 0.4729395806789398, + "learning_rate": 9.971015999688065e-05, + "loss": 1.308, + "step": 38597 + }, + { + "epoch": 0.5015634060648785, + "grad_norm": 0.36779484152793884, + "learning_rate": 9.970756053496927e-05, + "loss": 1.5165, + "step": 38598 + }, + { + "epoch": 0.5015764006087944, + "grad_norm": 0.3837573528289795, + "learning_rate": 9.970496107305788e-05, + "loss": 1.4158, + "step": 38599 + }, + { + "epoch": 0.5015893951527103, + "grad_norm": 0.44191521406173706, + "learning_rate": 9.97023616111465e-05, + "loss": 1.4352, + "step": 38600 + }, + { + "epoch": 0.5016023896966262, + "grad_norm": 0.4868957996368408, + "learning_rate": 9.969976214923512e-05, + "loss": 1.4142, + "step": 38601 + }, + { + "epoch": 0.501615384240542, + "grad_norm": 0.3141123056411743, + "learning_rate": 9.969716268732373e-05, + "loss": 1.3555, + "step": 38602 + }, + { + "epoch": 0.5016283787844579, + "grad_norm": 0.5004069805145264, + "learning_rate": 9.969456322541234e-05, + "loss": 1.45, + "step": 38603 + }, + { + "epoch": 0.5016413733283738, + "grad_norm": 0.3260367214679718, + "learning_rate": 9.969196376350096e-05, + "loss": 1.3377, + "step": 38604 + }, + { + "epoch": 0.5016543678722897, + "grad_norm": 0.42804521322250366, + "learning_rate": 9.968936430158957e-05, + "loss": 1.3283, + "step": 38605 + }, + { + "epoch": 0.5016673624162055, + "grad_norm": 0.525400698184967, + "learning_rate": 9.96867648396782e-05, + "loss": 1.3515, + "step": 38606 + }, + { + "epoch": 0.5016803569601214, + "grad_norm": 0.37241220474243164, + "learning_rate": 9.968416537776681e-05, + "loss": 1.3381, + "step": 38607 + }, + { + "epoch": 0.5016933515040373, + "grad_norm": 0.36682796478271484, + "learning_rate": 9.968156591585542e-05, + "loss": 1.3541, + "step": 38608 + }, + { + "epoch": 0.5017063460479532, + "grad_norm": 0.457168847322464, + "learning_rate": 9.967896645394403e-05, + "loss": 1.3386, + "step": 38609 + }, + { + "epoch": 0.501719340591869, + "grad_norm": 0.4600222706794739, + "learning_rate": 9.967636699203266e-05, + "loss": 1.3159, + "step": 38610 + }, + { + "epoch": 0.5017323351357849, + "grad_norm": 0.4694440960884094, + "learning_rate": 9.967376753012127e-05, + "loss": 1.5498, + "step": 38611 + }, + { + "epoch": 0.5017453296797008, + "grad_norm": 0.4455929696559906, + "learning_rate": 9.967116806820989e-05, + "loss": 1.4967, + "step": 38612 + }, + { + "epoch": 0.5017583242236167, + "grad_norm": 0.5129976868629456, + "learning_rate": 9.96685686062985e-05, + "loss": 1.6864, + "step": 38613 + }, + { + "epoch": 0.5017713187675324, + "grad_norm": 0.3728010356426239, + "learning_rate": 9.966596914438711e-05, + "loss": 1.2252, + "step": 38614 + }, + { + "epoch": 0.5017843133114483, + "grad_norm": 0.34261587262153625, + "learning_rate": 9.966336968247572e-05, + "loss": 1.2211, + "step": 38615 + }, + { + "epoch": 0.5017973078553642, + "grad_norm": 0.43765607476234436, + "learning_rate": 9.966077022056435e-05, + "loss": 1.5892, + "step": 38616 + }, + { + "epoch": 0.5018103023992802, + "grad_norm": 0.34832432866096497, + "learning_rate": 9.965817075865297e-05, + "loss": 1.234, + "step": 38617 + }, + { + "epoch": 0.5018232969431959, + "grad_norm": 0.47121068835258484, + "learning_rate": 9.965557129674158e-05, + "loss": 1.4693, + "step": 38618 + }, + { + "epoch": 0.5018362914871118, + "grad_norm": 0.47093334794044495, + "learning_rate": 9.96529718348302e-05, + "loss": 1.3836, + "step": 38619 + }, + { + "epoch": 0.5018492860310277, + "grad_norm": 0.41128021478652954, + "learning_rate": 9.96503723729188e-05, + "loss": 1.4557, + "step": 38620 + }, + { + "epoch": 0.5018622805749436, + "grad_norm": 0.4500844180583954, + "learning_rate": 9.964777291100743e-05, + "loss": 1.4342, + "step": 38621 + }, + { + "epoch": 0.5018752751188594, + "grad_norm": 0.37473171949386597, + "learning_rate": 9.964517344909604e-05, + "loss": 1.1323, + "step": 38622 + }, + { + "epoch": 0.5018882696627753, + "grad_norm": 0.389114111661911, + "learning_rate": 9.964257398718467e-05, + "loss": 1.3187, + "step": 38623 + }, + { + "epoch": 0.5019012642066912, + "grad_norm": 0.40775954723358154, + "learning_rate": 9.963997452527328e-05, + "loss": 1.5444, + "step": 38624 + }, + { + "epoch": 0.5019142587506071, + "grad_norm": 0.4420469105243683, + "learning_rate": 9.963737506336189e-05, + "loss": 1.544, + "step": 38625 + }, + { + "epoch": 0.5019272532945229, + "grad_norm": 0.34388697147369385, + "learning_rate": 9.96347756014505e-05, + "loss": 1.1994, + "step": 38626 + }, + { + "epoch": 0.5019402478384388, + "grad_norm": 0.3449591100215912, + "learning_rate": 9.963217613953912e-05, + "loss": 1.1656, + "step": 38627 + }, + { + "epoch": 0.5019532423823547, + "grad_norm": 0.46845781803131104, + "learning_rate": 9.962957667762773e-05, + "loss": 1.4602, + "step": 38628 + }, + { + "epoch": 0.5019662369262706, + "grad_norm": 0.31290364265441895, + "learning_rate": 9.962697721571636e-05, + "loss": 1.3737, + "step": 38629 + }, + { + "epoch": 0.5019792314701864, + "grad_norm": 0.40682268142700195, + "learning_rate": 9.962437775380497e-05, + "loss": 1.4536, + "step": 38630 + }, + { + "epoch": 0.5019922260141023, + "grad_norm": 0.4838141202926636, + "learning_rate": 9.962177829189358e-05, + "loss": 1.347, + "step": 38631 + }, + { + "epoch": 0.5020052205580182, + "grad_norm": 0.4010850191116333, + "learning_rate": 9.961917882998219e-05, + "loss": 1.2836, + "step": 38632 + }, + { + "epoch": 0.5020182151019341, + "grad_norm": 0.3325026333332062, + "learning_rate": 9.961657936807082e-05, + "loss": 1.3295, + "step": 38633 + }, + { + "epoch": 0.5020312096458499, + "grad_norm": 0.38825252652168274, + "learning_rate": 9.961397990615943e-05, + "loss": 1.4019, + "step": 38634 + }, + { + "epoch": 0.5020442041897658, + "grad_norm": 0.2932383120059967, + "learning_rate": 9.961138044424805e-05, + "loss": 1.4335, + "step": 38635 + }, + { + "epoch": 0.5020571987336817, + "grad_norm": 0.4004434049129486, + "learning_rate": 9.960878098233666e-05, + "loss": 1.4192, + "step": 38636 + }, + { + "epoch": 0.5020701932775976, + "grad_norm": 0.4199526309967041, + "learning_rate": 9.960618152042527e-05, + "loss": 1.2629, + "step": 38637 + }, + { + "epoch": 0.5020831878215134, + "grad_norm": 0.38937196135520935, + "learning_rate": 9.960358205851388e-05, + "loss": 1.3638, + "step": 38638 + }, + { + "epoch": 0.5020961823654293, + "grad_norm": 0.4084378778934479, + "learning_rate": 9.960098259660251e-05, + "loss": 1.5108, + "step": 38639 + }, + { + "epoch": 0.5021091769093452, + "grad_norm": 0.38959378004074097, + "learning_rate": 9.959838313469112e-05, + "loss": 1.4613, + "step": 38640 + }, + { + "epoch": 0.5021221714532611, + "grad_norm": 0.43904009461402893, + "learning_rate": 9.959578367277974e-05, + "loss": 1.4719, + "step": 38641 + }, + { + "epoch": 0.5021351659971769, + "grad_norm": 0.4299328029155731, + "learning_rate": 9.959318421086835e-05, + "loss": 1.2845, + "step": 38642 + }, + { + "epoch": 0.5021481605410928, + "grad_norm": 0.4287305176258087, + "learning_rate": 9.959058474895698e-05, + "loss": 1.5222, + "step": 38643 + }, + { + "epoch": 0.5021611550850087, + "grad_norm": 0.34740859270095825, + "learning_rate": 9.958798528704558e-05, + "loss": 1.3907, + "step": 38644 + }, + { + "epoch": 0.5021741496289246, + "grad_norm": 0.33845412731170654, + "learning_rate": 9.95853858251342e-05, + "loss": 1.4029, + "step": 38645 + }, + { + "epoch": 0.5021871441728404, + "grad_norm": 0.41307365894317627, + "learning_rate": 9.958278636322281e-05, + "loss": 1.3485, + "step": 38646 + }, + { + "epoch": 0.5022001387167563, + "grad_norm": 0.4106910228729248, + "learning_rate": 9.958018690131144e-05, + "loss": 1.3823, + "step": 38647 + }, + { + "epoch": 0.5022131332606722, + "grad_norm": 0.40577781200408936, + "learning_rate": 9.957758743940005e-05, + "loss": 1.5168, + "step": 38648 + }, + { + "epoch": 0.5022261278045881, + "grad_norm": 0.5476912260055542, + "learning_rate": 9.957498797748867e-05, + "loss": 1.4737, + "step": 38649 + }, + { + "epoch": 0.5022391223485039, + "grad_norm": 0.4123094975948334, + "learning_rate": 9.957238851557727e-05, + "loss": 1.4891, + "step": 38650 + }, + { + "epoch": 0.5022521168924198, + "grad_norm": 0.43160751461982727, + "learning_rate": 9.95697890536659e-05, + "loss": 1.3068, + "step": 38651 + }, + { + "epoch": 0.5022651114363357, + "grad_norm": 0.3636293411254883, + "learning_rate": 9.95671895917545e-05, + "loss": 1.2352, + "step": 38652 + }, + { + "epoch": 0.5022781059802516, + "grad_norm": 0.4430590569972992, + "learning_rate": 9.956459012984313e-05, + "loss": 1.4589, + "step": 38653 + }, + { + "epoch": 0.5022911005241674, + "grad_norm": 0.4093201160430908, + "learning_rate": 9.956199066793175e-05, + "loss": 1.5918, + "step": 38654 + }, + { + "epoch": 0.5023040950680833, + "grad_norm": 0.406325101852417, + "learning_rate": 9.955939120602036e-05, + "loss": 1.3187, + "step": 38655 + }, + { + "epoch": 0.5023170896119992, + "grad_norm": 0.3986407220363617, + "learning_rate": 9.955679174410898e-05, + "loss": 1.332, + "step": 38656 + }, + { + "epoch": 0.5023300841559151, + "grad_norm": 0.39856845140457153, + "learning_rate": 9.955419228219759e-05, + "loss": 1.4349, + "step": 38657 + }, + { + "epoch": 0.5023430786998309, + "grad_norm": 0.38072022795677185, + "learning_rate": 9.955159282028621e-05, + "loss": 1.3082, + "step": 38658 + }, + { + "epoch": 0.5023560732437468, + "grad_norm": 0.2640657424926758, + "learning_rate": 9.954899335837482e-05, + "loss": 1.4964, + "step": 38659 + }, + { + "epoch": 0.5023690677876627, + "grad_norm": 0.4046262204647064, + "learning_rate": 9.954639389646345e-05, + "loss": 1.2537, + "step": 38660 + }, + { + "epoch": 0.5023820623315786, + "grad_norm": 0.41886183619499207, + "learning_rate": 9.954379443455206e-05, + "loss": 1.1939, + "step": 38661 + }, + { + "epoch": 0.5023950568754944, + "grad_norm": 0.32854607701301575, + "learning_rate": 9.954119497264067e-05, + "loss": 1.3968, + "step": 38662 + }, + { + "epoch": 0.5024080514194103, + "grad_norm": 0.3777220845222473, + "learning_rate": 9.953859551072928e-05, + "loss": 1.4049, + "step": 38663 + }, + { + "epoch": 0.5024210459633262, + "grad_norm": 0.47061794996261597, + "learning_rate": 9.95359960488179e-05, + "loss": 1.4819, + "step": 38664 + }, + { + "epoch": 0.502434040507242, + "grad_norm": 0.3082543909549713, + "learning_rate": 9.953339658690651e-05, + "loss": 1.2269, + "step": 38665 + }, + { + "epoch": 0.5024470350511578, + "grad_norm": 0.3603893518447876, + "learning_rate": 9.953079712499514e-05, + "loss": 1.383, + "step": 38666 + }, + { + "epoch": 0.5024600295950737, + "grad_norm": 0.3619822859764099, + "learning_rate": 9.952819766308375e-05, + "loss": 1.4762, + "step": 38667 + }, + { + "epoch": 0.5024730241389896, + "grad_norm": 0.34904932975769043, + "learning_rate": 9.952559820117236e-05, + "loss": 1.5327, + "step": 38668 + }, + { + "epoch": 0.5024860186829055, + "grad_norm": 0.44304129481315613, + "learning_rate": 9.952299873926097e-05, + "loss": 1.3144, + "step": 38669 + }, + { + "epoch": 0.5024990132268213, + "grad_norm": 0.25501298904418945, + "learning_rate": 9.95203992773496e-05, + "loss": 1.2992, + "step": 38670 + }, + { + "epoch": 0.5025120077707372, + "grad_norm": 0.34362560510635376, + "learning_rate": 9.95177998154382e-05, + "loss": 1.377, + "step": 38671 + }, + { + "epoch": 0.5025250023146531, + "grad_norm": 0.4482221305370331, + "learning_rate": 9.951520035352683e-05, + "loss": 1.367, + "step": 38672 + }, + { + "epoch": 0.502537996858569, + "grad_norm": 0.41808897256851196, + "learning_rate": 9.951260089161544e-05, + "loss": 1.491, + "step": 38673 + }, + { + "epoch": 0.5025509914024849, + "grad_norm": 0.3629595935344696, + "learning_rate": 9.951000142970405e-05, + "loss": 1.2651, + "step": 38674 + }, + { + "epoch": 0.5025639859464007, + "grad_norm": 0.4167031943798065, + "learning_rate": 9.950740196779266e-05, + "loss": 1.5082, + "step": 38675 + }, + { + "epoch": 0.5025769804903166, + "grad_norm": 0.40896928310394287, + "learning_rate": 9.950480250588129e-05, + "loss": 1.344, + "step": 38676 + }, + { + "epoch": 0.5025899750342325, + "grad_norm": 0.4458857476711273, + "learning_rate": 9.95022030439699e-05, + "loss": 1.4298, + "step": 38677 + }, + { + "epoch": 0.5026029695781484, + "grad_norm": 0.4971230626106262, + "learning_rate": 9.949960358205852e-05, + "loss": 1.4142, + "step": 38678 + }, + { + "epoch": 0.5026159641220642, + "grad_norm": 0.47731155157089233, + "learning_rate": 9.949700412014713e-05, + "loss": 1.2656, + "step": 38679 + }, + { + "epoch": 0.5026289586659801, + "grad_norm": 0.5380831360816956, + "learning_rate": 9.949440465823575e-05, + "loss": 1.4705, + "step": 38680 + }, + { + "epoch": 0.502641953209896, + "grad_norm": 0.39273661375045776, + "learning_rate": 9.949180519632436e-05, + "loss": 1.3998, + "step": 38681 + }, + { + "epoch": 0.5026549477538119, + "grad_norm": 0.49421244859695435, + "learning_rate": 9.948920573441298e-05, + "loss": 1.6533, + "step": 38682 + }, + { + "epoch": 0.5026679422977277, + "grad_norm": 0.4012127220630646, + "learning_rate": 9.948660627250159e-05, + "loss": 1.3524, + "step": 38683 + }, + { + "epoch": 0.5026809368416436, + "grad_norm": 0.3115918040275574, + "learning_rate": 9.948400681059022e-05, + "loss": 1.2522, + "step": 38684 + }, + { + "epoch": 0.5026939313855595, + "grad_norm": 0.3512091040611267, + "learning_rate": 9.948140734867883e-05, + "loss": 1.2833, + "step": 38685 + }, + { + "epoch": 0.5027069259294754, + "grad_norm": 0.45960733294487, + "learning_rate": 9.947880788676744e-05, + "loss": 1.3587, + "step": 38686 + }, + { + "epoch": 0.5027199204733912, + "grad_norm": 0.35364776849746704, + "learning_rate": 9.947620842485605e-05, + "loss": 1.3527, + "step": 38687 + }, + { + "epoch": 0.5027329150173071, + "grad_norm": 0.361489474773407, + "learning_rate": 9.947360896294467e-05, + "loss": 1.4331, + "step": 38688 + }, + { + "epoch": 0.502745909561223, + "grad_norm": 0.45128002762794495, + "learning_rate": 9.947100950103328e-05, + "loss": 1.4558, + "step": 38689 + }, + { + "epoch": 0.5027589041051389, + "grad_norm": 0.3806818723678589, + "learning_rate": 9.946841003912191e-05, + "loss": 1.3243, + "step": 38690 + }, + { + "epoch": 0.5027718986490547, + "grad_norm": 0.2227061241865158, + "learning_rate": 9.946581057721053e-05, + "loss": 1.1331, + "step": 38691 + }, + { + "epoch": 0.5027848931929706, + "grad_norm": 0.4346643388271332, + "learning_rate": 9.946321111529913e-05, + "loss": 1.3201, + "step": 38692 + }, + { + "epoch": 0.5027978877368865, + "grad_norm": 0.37980860471725464, + "learning_rate": 9.946061165338776e-05, + "loss": 1.4911, + "step": 38693 + }, + { + "epoch": 0.5028108822808024, + "grad_norm": 0.5231842994689941, + "learning_rate": 9.945801219147637e-05, + "loss": 1.5323, + "step": 38694 + }, + { + "epoch": 0.5028238768247182, + "grad_norm": 0.37290629744529724, + "learning_rate": 9.945541272956499e-05, + "loss": 1.4404, + "step": 38695 + }, + { + "epoch": 0.5028368713686341, + "grad_norm": 0.3669590651988983, + "learning_rate": 9.94528132676536e-05, + "loss": 1.4594, + "step": 38696 + }, + { + "epoch": 0.50284986591255, + "grad_norm": 0.4750474691390991, + "learning_rate": 9.945021380574223e-05, + "loss": 1.3476, + "step": 38697 + }, + { + "epoch": 0.5028628604564659, + "grad_norm": 0.48590344190597534, + "learning_rate": 9.944761434383082e-05, + "loss": 1.4391, + "step": 38698 + }, + { + "epoch": 0.5028758550003817, + "grad_norm": 0.3370203673839569, + "learning_rate": 9.944501488191945e-05, + "loss": 1.3266, + "step": 38699 + }, + { + "epoch": 0.5028888495442976, + "grad_norm": 0.34570881724357605, + "learning_rate": 9.944241542000806e-05, + "loss": 1.3115, + "step": 38700 + }, + { + "epoch": 0.5029018440882135, + "grad_norm": 0.43542373180389404, + "learning_rate": 9.943981595809668e-05, + "loss": 1.2441, + "step": 38701 + }, + { + "epoch": 0.5029148386321294, + "grad_norm": 0.46609562635421753, + "learning_rate": 9.94372164961853e-05, + "loss": 1.471, + "step": 38702 + }, + { + "epoch": 0.5029278331760452, + "grad_norm": 0.5439035892486572, + "learning_rate": 9.943461703427392e-05, + "loss": 1.3798, + "step": 38703 + }, + { + "epoch": 0.5029408277199611, + "grad_norm": 0.4468325078487396, + "learning_rate": 9.943201757236253e-05, + "loss": 1.3957, + "step": 38704 + }, + { + "epoch": 0.502953822263877, + "grad_norm": 0.499202162027359, + "learning_rate": 9.942941811045114e-05, + "loss": 1.5699, + "step": 38705 + }, + { + "epoch": 0.5029668168077929, + "grad_norm": 0.3493598401546478, + "learning_rate": 9.942681864853975e-05, + "loss": 1.4434, + "step": 38706 + }, + { + "epoch": 0.5029798113517087, + "grad_norm": 0.41630589962005615, + "learning_rate": 9.942421918662838e-05, + "loss": 1.2732, + "step": 38707 + }, + { + "epoch": 0.5029928058956246, + "grad_norm": 0.3793299198150635, + "learning_rate": 9.942161972471699e-05, + "loss": 1.4012, + "step": 38708 + }, + { + "epoch": 0.5030058004395405, + "grad_norm": 0.417537659406662, + "learning_rate": 9.941902026280561e-05, + "loss": 1.4816, + "step": 38709 + }, + { + "epoch": 0.5030187949834564, + "grad_norm": 0.4429754614830017, + "learning_rate": 9.941642080089422e-05, + "loss": 1.5595, + "step": 38710 + }, + { + "epoch": 0.5030317895273722, + "grad_norm": 0.39553695917129517, + "learning_rate": 9.941382133898283e-05, + "loss": 1.4568, + "step": 38711 + }, + { + "epoch": 0.503044784071288, + "grad_norm": 0.37764808535575867, + "learning_rate": 9.941122187707144e-05, + "loss": 1.276, + "step": 38712 + }, + { + "epoch": 0.503057778615204, + "grad_norm": 0.37186166644096375, + "learning_rate": 9.940862241516007e-05, + "loss": 1.3691, + "step": 38713 + }, + { + "epoch": 0.5030707731591199, + "grad_norm": 0.3819955885410309, + "learning_rate": 9.940602295324868e-05, + "loss": 1.3951, + "step": 38714 + }, + { + "epoch": 0.5030837677030356, + "grad_norm": 0.321040540933609, + "learning_rate": 9.94034234913373e-05, + "loss": 1.3353, + "step": 38715 + }, + { + "epoch": 0.5030967622469515, + "grad_norm": 0.3601124882698059, + "learning_rate": 9.940082402942592e-05, + "loss": 1.4068, + "step": 38716 + }, + { + "epoch": 0.5031097567908674, + "grad_norm": 0.43082499504089355, + "learning_rate": 9.939822456751453e-05, + "loss": 1.4183, + "step": 38717 + }, + { + "epoch": 0.5031227513347833, + "grad_norm": 0.3668937683105469, + "learning_rate": 9.939562510560314e-05, + "loss": 1.4715, + "step": 38718 + }, + { + "epoch": 0.5031357458786991, + "grad_norm": 0.4214024543762207, + "learning_rate": 9.939302564369176e-05, + "loss": 1.5099, + "step": 38719 + }, + { + "epoch": 0.503148740422615, + "grad_norm": 0.36616218090057373, + "learning_rate": 9.939042618178037e-05, + "loss": 1.3421, + "step": 38720 + }, + { + "epoch": 0.5031617349665309, + "grad_norm": 0.3801290690898895, + "learning_rate": 9.9387826719869e-05, + "loss": 1.4003, + "step": 38721 + }, + { + "epoch": 0.5031747295104468, + "grad_norm": 0.3637678623199463, + "learning_rate": 9.938522725795761e-05, + "loss": 1.3397, + "step": 38722 + }, + { + "epoch": 0.5031877240543626, + "grad_norm": 0.4232293665409088, + "learning_rate": 9.938262779604622e-05, + "loss": 1.5857, + "step": 38723 + }, + { + "epoch": 0.5032007185982785, + "grad_norm": 0.5212748646736145, + "learning_rate": 9.938002833413483e-05, + "loss": 1.4806, + "step": 38724 + }, + { + "epoch": 0.5032137131421944, + "grad_norm": 0.41660821437835693, + "learning_rate": 9.937742887222345e-05, + "loss": 1.46, + "step": 38725 + }, + { + "epoch": 0.5032267076861103, + "grad_norm": 0.46641552448272705, + "learning_rate": 9.937482941031207e-05, + "loss": 1.307, + "step": 38726 + }, + { + "epoch": 0.5032397022300261, + "grad_norm": 0.35546934604644775, + "learning_rate": 9.937222994840069e-05, + "loss": 1.1158, + "step": 38727 + }, + { + "epoch": 0.503252696773942, + "grad_norm": 0.36998599767684937, + "learning_rate": 9.93696304864893e-05, + "loss": 1.4282, + "step": 38728 + }, + { + "epoch": 0.5032656913178579, + "grad_norm": 0.42102518677711487, + "learning_rate": 9.936703102457791e-05, + "loss": 1.275, + "step": 38729 + }, + { + "epoch": 0.5032786858617738, + "grad_norm": 0.4139999747276306, + "learning_rate": 9.936443156266654e-05, + "loss": 1.5319, + "step": 38730 + }, + { + "epoch": 0.5032916804056896, + "grad_norm": 0.4046679437160492, + "learning_rate": 9.936183210075515e-05, + "loss": 1.4493, + "step": 38731 + }, + { + "epoch": 0.5033046749496055, + "grad_norm": 0.4516332447528839, + "learning_rate": 9.935923263884377e-05, + "loss": 1.55, + "step": 38732 + }, + { + "epoch": 0.5033176694935214, + "grad_norm": 0.45310911536216736, + "learning_rate": 9.935663317693238e-05, + "loss": 1.4751, + "step": 38733 + }, + { + "epoch": 0.5033306640374373, + "grad_norm": 0.4741683006286621, + "learning_rate": 9.9354033715021e-05, + "loss": 1.3601, + "step": 38734 + }, + { + "epoch": 0.5033436585813531, + "grad_norm": 0.32811546325683594, + "learning_rate": 9.93514342531096e-05, + "loss": 1.2244, + "step": 38735 + }, + { + "epoch": 0.503356653125269, + "grad_norm": 0.38871097564697266, + "learning_rate": 9.934883479119823e-05, + "loss": 1.4579, + "step": 38736 + }, + { + "epoch": 0.5033696476691849, + "grad_norm": 0.4746989905834198, + "learning_rate": 9.934623532928684e-05, + "loss": 1.4118, + "step": 38737 + }, + { + "epoch": 0.5033826422131008, + "grad_norm": 0.38275283575057983, + "learning_rate": 9.934363586737546e-05, + "loss": 1.3145, + "step": 38738 + }, + { + "epoch": 0.5033956367570166, + "grad_norm": 0.3828461468219757, + "learning_rate": 9.934103640546408e-05, + "loss": 1.4685, + "step": 38739 + }, + { + "epoch": 0.5034086313009325, + "grad_norm": 0.44905635714530945, + "learning_rate": 9.933843694355269e-05, + "loss": 1.6201, + "step": 38740 + }, + { + "epoch": 0.5034216258448484, + "grad_norm": 0.36987483501434326, + "learning_rate": 9.93358374816413e-05, + "loss": 1.3206, + "step": 38741 + }, + { + "epoch": 0.5034346203887643, + "grad_norm": 0.4826483726501465, + "learning_rate": 9.933323801972992e-05, + "loss": 1.422, + "step": 38742 + }, + { + "epoch": 0.5034476149326801, + "grad_norm": 0.35331761837005615, + "learning_rate": 9.933063855781853e-05, + "loss": 1.6108, + "step": 38743 + }, + { + "epoch": 0.503460609476596, + "grad_norm": 0.5542351007461548, + "learning_rate": 9.932803909590716e-05, + "loss": 1.4403, + "step": 38744 + }, + { + "epoch": 0.5034736040205119, + "grad_norm": 0.37678608298301697, + "learning_rate": 9.932543963399577e-05, + "loss": 1.3974, + "step": 38745 + }, + { + "epoch": 0.5034865985644278, + "grad_norm": 0.38738587498664856, + "learning_rate": 9.932284017208439e-05, + "loss": 1.2872, + "step": 38746 + }, + { + "epoch": 0.5034995931083436, + "grad_norm": 0.3126541078090668, + "learning_rate": 9.932024071017299e-05, + "loss": 1.1688, + "step": 38747 + }, + { + "epoch": 0.5035125876522595, + "grad_norm": 0.3494420647621155, + "learning_rate": 9.931764124826161e-05, + "loss": 1.3304, + "step": 38748 + }, + { + "epoch": 0.5035255821961754, + "grad_norm": 0.43524011969566345, + "learning_rate": 9.931504178635023e-05, + "loss": 1.3694, + "step": 38749 + }, + { + "epoch": 0.5035385767400913, + "grad_norm": 0.3468901216983795, + "learning_rate": 9.931244232443885e-05, + "loss": 1.4129, + "step": 38750 + }, + { + "epoch": 0.5035515712840072, + "grad_norm": 0.37986692786216736, + "learning_rate": 9.930984286252746e-05, + "loss": 1.3642, + "step": 38751 + }, + { + "epoch": 0.503564565827923, + "grad_norm": 0.4223717749118805, + "learning_rate": 9.930724340061609e-05, + "loss": 1.6082, + "step": 38752 + }, + { + "epoch": 0.5035775603718389, + "grad_norm": 0.42933374643325806, + "learning_rate": 9.930464393870468e-05, + "loss": 1.186, + "step": 38753 + }, + { + "epoch": 0.5035905549157548, + "grad_norm": 0.34949082136154175, + "learning_rate": 9.930204447679331e-05, + "loss": 1.3982, + "step": 38754 + }, + { + "epoch": 0.5036035494596707, + "grad_norm": 0.37462541460990906, + "learning_rate": 9.929944501488192e-05, + "loss": 1.4427, + "step": 38755 + }, + { + "epoch": 0.5036165440035865, + "grad_norm": 0.4260838031768799, + "learning_rate": 9.929684555297054e-05, + "loss": 1.3889, + "step": 38756 + }, + { + "epoch": 0.5036295385475024, + "grad_norm": 0.2994789183139801, + "learning_rate": 9.929424609105915e-05, + "loss": 1.3296, + "step": 38757 + }, + { + "epoch": 0.5036425330914183, + "grad_norm": 0.41050267219543457, + "learning_rate": 9.929164662914778e-05, + "loss": 1.3246, + "step": 38758 + }, + { + "epoch": 0.5036555276353342, + "grad_norm": 0.4634935259819031, + "learning_rate": 9.928904716723638e-05, + "loss": 1.5292, + "step": 38759 + }, + { + "epoch": 0.50366852217925, + "grad_norm": 0.4125591814517975, + "learning_rate": 9.9286447705325e-05, + "loss": 1.2944, + "step": 38760 + }, + { + "epoch": 0.5036815167231659, + "grad_norm": 0.3591509759426117, + "learning_rate": 9.928384824341361e-05, + "loss": 1.2711, + "step": 38761 + }, + { + "epoch": 0.5036945112670818, + "grad_norm": 0.4216979742050171, + "learning_rate": 9.928124878150224e-05, + "loss": 1.5474, + "step": 38762 + }, + { + "epoch": 0.5037075058109977, + "grad_norm": 0.3817099332809448, + "learning_rate": 9.927864931959085e-05, + "loss": 1.4197, + "step": 38763 + }, + { + "epoch": 0.5037205003549134, + "grad_norm": 0.3384837508201599, + "learning_rate": 9.927604985767947e-05, + "loss": 1.2255, + "step": 38764 + }, + { + "epoch": 0.5037334948988293, + "grad_norm": 0.42455190420150757, + "learning_rate": 9.927345039576808e-05, + "loss": 1.5031, + "step": 38765 + }, + { + "epoch": 0.5037464894427452, + "grad_norm": 0.42044880986213684, + "learning_rate": 9.927085093385669e-05, + "loss": 1.2972, + "step": 38766 + }, + { + "epoch": 0.5037594839866611, + "grad_norm": 0.3709751069545746, + "learning_rate": 9.926825147194532e-05, + "loss": 1.3916, + "step": 38767 + }, + { + "epoch": 0.5037724785305769, + "grad_norm": 0.40946483612060547, + "learning_rate": 9.926565201003393e-05, + "loss": 1.339, + "step": 38768 + }, + { + "epoch": 0.5037854730744928, + "grad_norm": 0.3375990688800812, + "learning_rate": 9.926305254812255e-05, + "loss": 1.3489, + "step": 38769 + }, + { + "epoch": 0.5037984676184087, + "grad_norm": 0.524073600769043, + "learning_rate": 9.926045308621116e-05, + "loss": 1.5011, + "step": 38770 + }, + { + "epoch": 0.5038114621623246, + "grad_norm": 0.4260469675064087, + "learning_rate": 9.925785362429977e-05, + "loss": 1.3027, + "step": 38771 + }, + { + "epoch": 0.5038244567062404, + "grad_norm": 0.34187766909599304, + "learning_rate": 9.925525416238839e-05, + "loss": 1.2047, + "step": 38772 + }, + { + "epoch": 0.5038374512501563, + "grad_norm": 0.3449990749359131, + "learning_rate": 9.925265470047701e-05, + "loss": 1.1701, + "step": 38773 + }, + { + "epoch": 0.5038504457940722, + "grad_norm": 0.43694669008255005, + "learning_rate": 9.925005523856562e-05, + "loss": 1.4895, + "step": 38774 + }, + { + "epoch": 0.5038634403379881, + "grad_norm": 0.4068067967891693, + "learning_rate": 9.924745577665425e-05, + "loss": 1.3221, + "step": 38775 + }, + { + "epoch": 0.5038764348819039, + "grad_norm": 0.5102118253707886, + "learning_rate": 9.924485631474286e-05, + "loss": 1.5115, + "step": 38776 + }, + { + "epoch": 0.5038894294258198, + "grad_norm": 0.44939833879470825, + "learning_rate": 9.924225685283147e-05, + "loss": 1.7191, + "step": 38777 + }, + { + "epoch": 0.5039024239697357, + "grad_norm": 0.4997943639755249, + "learning_rate": 9.923965739092008e-05, + "loss": 1.2966, + "step": 38778 + }, + { + "epoch": 0.5039154185136516, + "grad_norm": 0.36760884523391724, + "learning_rate": 9.92370579290087e-05, + "loss": 1.5071, + "step": 38779 + }, + { + "epoch": 0.5039284130575674, + "grad_norm": 0.43209052085876465, + "learning_rate": 9.923445846709731e-05, + "loss": 1.3443, + "step": 38780 + }, + { + "epoch": 0.5039414076014833, + "grad_norm": 0.38719630241394043, + "learning_rate": 9.923185900518594e-05, + "loss": 1.3434, + "step": 38781 + }, + { + "epoch": 0.5039544021453992, + "grad_norm": 0.3981705904006958, + "learning_rate": 9.922925954327455e-05, + "loss": 1.3683, + "step": 38782 + }, + { + "epoch": 0.5039673966893151, + "grad_norm": 0.3411445617675781, + "learning_rate": 9.922666008136316e-05, + "loss": 1.2915, + "step": 38783 + }, + { + "epoch": 0.5039803912332309, + "grad_norm": 0.37403544783592224, + "learning_rate": 9.922406061945177e-05, + "loss": 1.3626, + "step": 38784 + }, + { + "epoch": 0.5039933857771468, + "grad_norm": 0.37046870589256287, + "learning_rate": 9.92214611575404e-05, + "loss": 1.3146, + "step": 38785 + }, + { + "epoch": 0.5040063803210627, + "grad_norm": 0.43736395239830017, + "learning_rate": 9.9218861695629e-05, + "loss": 1.5803, + "step": 38786 + }, + { + "epoch": 0.5040193748649786, + "grad_norm": 0.39136070013046265, + "learning_rate": 9.921626223371763e-05, + "loss": 1.4286, + "step": 38787 + }, + { + "epoch": 0.5040323694088944, + "grad_norm": 0.3854241669178009, + "learning_rate": 9.921366277180624e-05, + "loss": 1.4189, + "step": 38788 + }, + { + "epoch": 0.5040453639528103, + "grad_norm": 0.2745227515697479, + "learning_rate": 9.921106330989485e-05, + "loss": 1.3281, + "step": 38789 + }, + { + "epoch": 0.5040583584967262, + "grad_norm": 0.3839605748653412, + "learning_rate": 9.920846384798346e-05, + "loss": 1.3319, + "step": 38790 + }, + { + "epoch": 0.5040713530406421, + "grad_norm": 0.3870304524898529, + "learning_rate": 9.920586438607209e-05, + "loss": 1.1957, + "step": 38791 + }, + { + "epoch": 0.5040843475845579, + "grad_norm": 0.407545268535614, + "learning_rate": 9.92032649241607e-05, + "loss": 1.5558, + "step": 38792 + }, + { + "epoch": 0.5040973421284738, + "grad_norm": 0.44787779450416565, + "learning_rate": 9.920066546224932e-05, + "loss": 1.2102, + "step": 38793 + }, + { + "epoch": 0.5041103366723897, + "grad_norm": 0.32703739404678345, + "learning_rate": 9.919806600033793e-05, + "loss": 1.5048, + "step": 38794 + }, + { + "epoch": 0.5041233312163056, + "grad_norm": 0.3944266438484192, + "learning_rate": 9.919546653842655e-05, + "loss": 1.2845, + "step": 38795 + }, + { + "epoch": 0.5041363257602214, + "grad_norm": 0.42107751965522766, + "learning_rate": 9.919286707651516e-05, + "loss": 1.3035, + "step": 38796 + }, + { + "epoch": 0.5041493203041373, + "grad_norm": 0.4647158980369568, + "learning_rate": 9.919026761460378e-05, + "loss": 1.3693, + "step": 38797 + }, + { + "epoch": 0.5041623148480532, + "grad_norm": 0.498198926448822, + "learning_rate": 9.918766815269239e-05, + "loss": 1.3105, + "step": 38798 + }, + { + "epoch": 0.5041753093919691, + "grad_norm": 0.32860368490219116, + "learning_rate": 9.918506869078102e-05, + "loss": 1.3167, + "step": 38799 + }, + { + "epoch": 0.5041883039358849, + "grad_norm": 0.5228272676467896, + "learning_rate": 9.918246922886963e-05, + "loss": 1.5022, + "step": 38800 + }, + { + "epoch": 0.5042012984798008, + "grad_norm": 0.3779371678829193, + "learning_rate": 9.917986976695824e-05, + "loss": 1.2422, + "step": 38801 + }, + { + "epoch": 0.5042142930237167, + "grad_norm": 0.488620400428772, + "learning_rate": 9.917727030504686e-05, + "loss": 1.472, + "step": 38802 + }, + { + "epoch": 0.5042272875676326, + "grad_norm": 0.3369017541408539, + "learning_rate": 9.917467084313547e-05, + "loss": 1.4109, + "step": 38803 + }, + { + "epoch": 0.5042402821115484, + "grad_norm": 0.44672322273254395, + "learning_rate": 9.91720713812241e-05, + "loss": 1.2417, + "step": 38804 + }, + { + "epoch": 0.5042532766554643, + "grad_norm": 0.37324458360671997, + "learning_rate": 9.916947191931271e-05, + "loss": 1.395, + "step": 38805 + }, + { + "epoch": 0.5042662711993802, + "grad_norm": 0.39913618564605713, + "learning_rate": 9.916687245740133e-05, + "loss": 1.2996, + "step": 38806 + }, + { + "epoch": 0.5042792657432961, + "grad_norm": 0.3723594546318054, + "learning_rate": 9.916427299548994e-05, + "loss": 1.2374, + "step": 38807 + }, + { + "epoch": 0.5042922602872119, + "grad_norm": 0.3540034890174866, + "learning_rate": 9.916167353357855e-05, + "loss": 1.2051, + "step": 38808 + }, + { + "epoch": 0.5043052548311278, + "grad_norm": 0.3539971113204956, + "learning_rate": 9.915907407166717e-05, + "loss": 1.3571, + "step": 38809 + }, + { + "epoch": 0.5043182493750437, + "grad_norm": 0.4498256742954254, + "learning_rate": 9.915647460975579e-05, + "loss": 1.3756, + "step": 38810 + }, + { + "epoch": 0.5043312439189596, + "grad_norm": 0.44468775391578674, + "learning_rate": 9.91538751478444e-05, + "loss": 1.5797, + "step": 38811 + }, + { + "epoch": 0.5043442384628753, + "grad_norm": 0.4160136878490448, + "learning_rate": 9.915127568593303e-05, + "loss": 1.2572, + "step": 38812 + }, + { + "epoch": 0.5043572330067913, + "grad_norm": 0.39796027541160583, + "learning_rate": 9.914867622402164e-05, + "loss": 1.4628, + "step": 38813 + }, + { + "epoch": 0.5043702275507072, + "grad_norm": 0.3821994662284851, + "learning_rate": 9.914607676211025e-05, + "loss": 1.3208, + "step": 38814 + }, + { + "epoch": 0.504383222094623, + "grad_norm": 0.40463051199913025, + "learning_rate": 9.914347730019886e-05, + "loss": 1.41, + "step": 38815 + }, + { + "epoch": 0.5043962166385388, + "grad_norm": 0.3927914500236511, + "learning_rate": 9.914087783828748e-05, + "loss": 1.3729, + "step": 38816 + }, + { + "epoch": 0.5044092111824547, + "grad_norm": 0.453128844499588, + "learning_rate": 9.91382783763761e-05, + "loss": 1.4636, + "step": 38817 + }, + { + "epoch": 0.5044222057263706, + "grad_norm": 0.40080153942108154, + "learning_rate": 9.913567891446472e-05, + "loss": 1.3695, + "step": 38818 + }, + { + "epoch": 0.5044352002702865, + "grad_norm": 0.4682604670524597, + "learning_rate": 9.913307945255333e-05, + "loss": 1.5759, + "step": 38819 + }, + { + "epoch": 0.5044481948142023, + "grad_norm": 0.4354437589645386, + "learning_rate": 9.913047999064194e-05, + "loss": 1.3116, + "step": 38820 + }, + { + "epoch": 0.5044611893581182, + "grad_norm": 0.3466787040233612, + "learning_rate": 9.912788052873055e-05, + "loss": 1.353, + "step": 38821 + }, + { + "epoch": 0.5044741839020341, + "grad_norm": 0.4939839243888855, + "learning_rate": 9.912528106681918e-05, + "loss": 1.5435, + "step": 38822 + }, + { + "epoch": 0.50448717844595, + "grad_norm": 0.43136346340179443, + "learning_rate": 9.912268160490779e-05, + "loss": 1.4541, + "step": 38823 + }, + { + "epoch": 0.5045001729898658, + "grad_norm": 0.4758949875831604, + "learning_rate": 9.912008214299641e-05, + "loss": 1.4675, + "step": 38824 + }, + { + "epoch": 0.5045131675337817, + "grad_norm": 0.39081668853759766, + "learning_rate": 9.911748268108502e-05, + "loss": 1.4176, + "step": 38825 + }, + { + "epoch": 0.5045261620776976, + "grad_norm": 0.4540448486804962, + "learning_rate": 9.911488321917363e-05, + "loss": 1.5372, + "step": 38826 + }, + { + "epoch": 0.5045391566216135, + "grad_norm": 0.41783830523490906, + "learning_rate": 9.911228375726224e-05, + "loss": 1.5114, + "step": 38827 + }, + { + "epoch": 0.5045521511655294, + "grad_norm": 0.41544675827026367, + "learning_rate": 9.910968429535087e-05, + "loss": 1.2909, + "step": 38828 + }, + { + "epoch": 0.5045651457094452, + "grad_norm": 0.44033315777778625, + "learning_rate": 9.910708483343948e-05, + "loss": 1.3254, + "step": 38829 + }, + { + "epoch": 0.5045781402533611, + "grad_norm": 0.41473838686943054, + "learning_rate": 9.91044853715281e-05, + "loss": 1.3971, + "step": 38830 + }, + { + "epoch": 0.504591134797277, + "grad_norm": 0.3708394169807434, + "learning_rate": 9.910188590961671e-05, + "loss": 1.38, + "step": 38831 + }, + { + "epoch": 0.5046041293411929, + "grad_norm": 0.35531821846961975, + "learning_rate": 9.909928644770533e-05, + "loss": 1.2907, + "step": 38832 + }, + { + "epoch": 0.5046171238851087, + "grad_norm": 0.39857718348503113, + "learning_rate": 9.909668698579394e-05, + "loss": 1.2973, + "step": 38833 + }, + { + "epoch": 0.5046301184290246, + "grad_norm": 0.4134206175804138, + "learning_rate": 9.909408752388256e-05, + "loss": 1.3216, + "step": 38834 + }, + { + "epoch": 0.5046431129729405, + "grad_norm": 0.38989394903182983, + "learning_rate": 9.909148806197117e-05, + "loss": 1.3817, + "step": 38835 + }, + { + "epoch": 0.5046561075168564, + "grad_norm": 0.41405975818634033, + "learning_rate": 9.90888886000598e-05, + "loss": 1.4141, + "step": 38836 + }, + { + "epoch": 0.5046691020607722, + "grad_norm": 0.40060386061668396, + "learning_rate": 9.908628913814841e-05, + "loss": 1.2089, + "step": 38837 + }, + { + "epoch": 0.5046820966046881, + "grad_norm": 0.43636980652809143, + "learning_rate": 9.908368967623702e-05, + "loss": 1.3592, + "step": 38838 + }, + { + "epoch": 0.504695091148604, + "grad_norm": 0.4051605761051178, + "learning_rate": 9.908109021432564e-05, + "loss": 1.4045, + "step": 38839 + }, + { + "epoch": 0.5047080856925199, + "grad_norm": 0.3659214973449707, + "learning_rate": 9.907849075241425e-05, + "loss": 1.2664, + "step": 38840 + }, + { + "epoch": 0.5047210802364357, + "grad_norm": 0.296008437871933, + "learning_rate": 9.907589129050288e-05, + "loss": 1.3665, + "step": 38841 + }, + { + "epoch": 0.5047340747803516, + "grad_norm": 0.45115482807159424, + "learning_rate": 9.907329182859149e-05, + "loss": 1.5345, + "step": 38842 + }, + { + "epoch": 0.5047470693242675, + "grad_norm": 0.38254332542419434, + "learning_rate": 9.90706923666801e-05, + "loss": 1.5103, + "step": 38843 + }, + { + "epoch": 0.5047600638681834, + "grad_norm": 0.5251911878585815, + "learning_rate": 9.906809290476871e-05, + "loss": 1.4813, + "step": 38844 + }, + { + "epoch": 0.5047730584120992, + "grad_norm": 0.3894405961036682, + "learning_rate": 9.906549344285734e-05, + "loss": 1.3541, + "step": 38845 + }, + { + "epoch": 0.5047860529560151, + "grad_norm": 0.40301117300987244, + "learning_rate": 9.906289398094595e-05, + "loss": 1.4809, + "step": 38846 + }, + { + "epoch": 0.504799047499931, + "grad_norm": 0.3689195215702057, + "learning_rate": 9.906029451903457e-05, + "loss": 1.2862, + "step": 38847 + }, + { + "epoch": 0.5048120420438469, + "grad_norm": 0.4489176273345947, + "learning_rate": 9.905769505712318e-05, + "loss": 1.4535, + "step": 38848 + }, + { + "epoch": 0.5048250365877627, + "grad_norm": 0.4705716073513031, + "learning_rate": 9.90550955952118e-05, + "loss": 1.4097, + "step": 38849 + }, + { + "epoch": 0.5048380311316786, + "grad_norm": 0.3903043270111084, + "learning_rate": 9.90524961333004e-05, + "loss": 1.3098, + "step": 38850 + }, + { + "epoch": 0.5048510256755945, + "grad_norm": 0.40608447790145874, + "learning_rate": 9.904989667138903e-05, + "loss": 1.5943, + "step": 38851 + }, + { + "epoch": 0.5048640202195104, + "grad_norm": 0.44438889622688293, + "learning_rate": 9.904729720947764e-05, + "loss": 1.2881, + "step": 38852 + }, + { + "epoch": 0.5048770147634262, + "grad_norm": 0.3680826723575592, + "learning_rate": 9.904469774756626e-05, + "loss": 1.4008, + "step": 38853 + }, + { + "epoch": 0.5048900093073421, + "grad_norm": 0.32832080125808716, + "learning_rate": 9.904209828565487e-05, + "loss": 1.2712, + "step": 38854 + }, + { + "epoch": 0.504903003851258, + "grad_norm": 0.3889751732349396, + "learning_rate": 9.90394988237435e-05, + "loss": 1.1345, + "step": 38855 + }, + { + "epoch": 0.5049159983951739, + "grad_norm": 0.43394264578819275, + "learning_rate": 9.90368993618321e-05, + "loss": 1.4156, + "step": 38856 + }, + { + "epoch": 0.5049289929390897, + "grad_norm": 0.313986212015152, + "learning_rate": 9.903429989992072e-05, + "loss": 1.4014, + "step": 38857 + }, + { + "epoch": 0.5049419874830056, + "grad_norm": 0.45069023966789246, + "learning_rate": 9.903170043800933e-05, + "loss": 1.44, + "step": 38858 + }, + { + "epoch": 0.5049549820269215, + "grad_norm": 0.3618573844432831, + "learning_rate": 9.902910097609796e-05, + "loss": 1.4056, + "step": 38859 + }, + { + "epoch": 0.5049679765708374, + "grad_norm": 0.37697628140449524, + "learning_rate": 9.902650151418657e-05, + "loss": 1.3655, + "step": 38860 + }, + { + "epoch": 0.5049809711147532, + "grad_norm": 0.41419944167137146, + "learning_rate": 9.902390205227519e-05, + "loss": 1.4111, + "step": 38861 + }, + { + "epoch": 0.504993965658669, + "grad_norm": 0.39759454131126404, + "learning_rate": 9.902130259036379e-05, + "loss": 1.4643, + "step": 38862 + }, + { + "epoch": 0.505006960202585, + "grad_norm": 0.4913485646247864, + "learning_rate": 9.901870312845241e-05, + "loss": 1.5706, + "step": 38863 + }, + { + "epoch": 0.5050199547465009, + "grad_norm": 0.3975781500339508, + "learning_rate": 9.901610366654102e-05, + "loss": 1.3024, + "step": 38864 + }, + { + "epoch": 0.5050329492904166, + "grad_norm": 0.3487274646759033, + "learning_rate": 9.901350420462965e-05, + "loss": 1.4192, + "step": 38865 + }, + { + "epoch": 0.5050459438343325, + "grad_norm": 0.3969731032848358, + "learning_rate": 9.901090474271826e-05, + "loss": 1.2585, + "step": 38866 + }, + { + "epoch": 0.5050589383782484, + "grad_norm": 0.3640156686306, + "learning_rate": 9.900830528080688e-05, + "loss": 1.2446, + "step": 38867 + }, + { + "epoch": 0.5050719329221643, + "grad_norm": 0.40280553698539734, + "learning_rate": 9.90057058188955e-05, + "loss": 1.1372, + "step": 38868 + }, + { + "epoch": 0.5050849274660801, + "grad_norm": 0.39918363094329834, + "learning_rate": 9.90031063569841e-05, + "loss": 1.302, + "step": 38869 + }, + { + "epoch": 0.505097922009996, + "grad_norm": 0.3400992155075073, + "learning_rate": 9.900050689507272e-05, + "loss": 1.5995, + "step": 38870 + }, + { + "epoch": 0.5051109165539119, + "grad_norm": 0.37101495265960693, + "learning_rate": 9.899790743316134e-05, + "loss": 1.4047, + "step": 38871 + }, + { + "epoch": 0.5051239110978278, + "grad_norm": 0.4693084955215454, + "learning_rate": 9.899530797124995e-05, + "loss": 1.4236, + "step": 38872 + }, + { + "epoch": 0.5051369056417436, + "grad_norm": 0.3461991846561432, + "learning_rate": 9.899270850933858e-05, + "loss": 1.4447, + "step": 38873 + }, + { + "epoch": 0.5051499001856595, + "grad_norm": 0.4144139885902405, + "learning_rate": 9.899010904742719e-05, + "loss": 1.3334, + "step": 38874 + }, + { + "epoch": 0.5051628947295754, + "grad_norm": 0.4546304941177368, + "learning_rate": 9.89875095855158e-05, + "loss": 1.3361, + "step": 38875 + }, + { + "epoch": 0.5051758892734913, + "grad_norm": 0.32750195264816284, + "learning_rate": 9.898491012360442e-05, + "loss": 1.2921, + "step": 38876 + }, + { + "epoch": 0.5051888838174071, + "grad_norm": 0.346220999956131, + "learning_rate": 9.898231066169303e-05, + "loss": 1.358, + "step": 38877 + }, + { + "epoch": 0.505201878361323, + "grad_norm": 0.4384554624557495, + "learning_rate": 9.897971119978166e-05, + "loss": 1.3008, + "step": 38878 + }, + { + "epoch": 0.5052148729052389, + "grad_norm": 0.405154287815094, + "learning_rate": 9.897711173787027e-05, + "loss": 1.3757, + "step": 38879 + }, + { + "epoch": 0.5052278674491548, + "grad_norm": 0.39751484990119934, + "learning_rate": 9.897451227595888e-05, + "loss": 1.5712, + "step": 38880 + }, + { + "epoch": 0.5052408619930706, + "grad_norm": 0.37242722511291504, + "learning_rate": 9.897191281404749e-05, + "loss": 1.4477, + "step": 38881 + }, + { + "epoch": 0.5052538565369865, + "grad_norm": 0.4137822389602661, + "learning_rate": 9.896931335213612e-05, + "loss": 1.4915, + "step": 38882 + }, + { + "epoch": 0.5052668510809024, + "grad_norm": 0.3734016716480255, + "learning_rate": 9.896671389022473e-05, + "loss": 1.3301, + "step": 38883 + }, + { + "epoch": 0.5052798456248183, + "grad_norm": 0.4223211109638214, + "learning_rate": 9.896411442831335e-05, + "loss": 1.5394, + "step": 38884 + }, + { + "epoch": 0.5052928401687341, + "grad_norm": 0.3834093511104584, + "learning_rate": 9.896151496640196e-05, + "loss": 1.4788, + "step": 38885 + }, + { + "epoch": 0.50530583471265, + "grad_norm": 0.40939652919769287, + "learning_rate": 9.895891550449057e-05, + "loss": 1.3724, + "step": 38886 + }, + { + "epoch": 0.5053188292565659, + "grad_norm": 0.4255099296569824, + "learning_rate": 9.895631604257918e-05, + "loss": 1.398, + "step": 38887 + }, + { + "epoch": 0.5053318238004818, + "grad_norm": 0.4145475924015045, + "learning_rate": 9.895371658066781e-05, + "loss": 1.3874, + "step": 38888 + }, + { + "epoch": 0.5053448183443976, + "grad_norm": 0.35068678855895996, + "learning_rate": 9.895111711875642e-05, + "loss": 1.2846, + "step": 38889 + }, + { + "epoch": 0.5053578128883135, + "grad_norm": 0.3572659492492676, + "learning_rate": 9.894851765684504e-05, + "loss": 1.4202, + "step": 38890 + }, + { + "epoch": 0.5053708074322294, + "grad_norm": 0.4885404109954834, + "learning_rate": 9.894591819493366e-05, + "loss": 1.4595, + "step": 38891 + }, + { + "epoch": 0.5053838019761453, + "grad_norm": 0.3293319344520569, + "learning_rate": 9.894331873302227e-05, + "loss": 1.2981, + "step": 38892 + }, + { + "epoch": 0.5053967965200611, + "grad_norm": 0.43456119298934937, + "learning_rate": 9.894071927111088e-05, + "loss": 1.251, + "step": 38893 + }, + { + "epoch": 0.505409791063977, + "grad_norm": 0.39235758781433105, + "learning_rate": 9.89381198091995e-05, + "loss": 1.5713, + "step": 38894 + }, + { + "epoch": 0.5054227856078929, + "grad_norm": 0.39271003007888794, + "learning_rate": 9.893552034728811e-05, + "loss": 1.4574, + "step": 38895 + }, + { + "epoch": 0.5054357801518088, + "grad_norm": 0.4353560209274292, + "learning_rate": 9.893292088537674e-05, + "loss": 1.3783, + "step": 38896 + }, + { + "epoch": 0.5054487746957246, + "grad_norm": 0.41434329748153687, + "learning_rate": 9.893032142346535e-05, + "loss": 1.3336, + "step": 38897 + }, + { + "epoch": 0.5054617692396405, + "grad_norm": 0.3412036597728729, + "learning_rate": 9.892772196155396e-05, + "loss": 1.1938, + "step": 38898 + }, + { + "epoch": 0.5054747637835564, + "grad_norm": 0.37865379452705383, + "learning_rate": 9.892512249964257e-05, + "loss": 1.1599, + "step": 38899 + }, + { + "epoch": 0.5054877583274723, + "grad_norm": 0.7900471687316895, + "learning_rate": 9.89225230377312e-05, + "loss": 1.4303, + "step": 38900 + }, + { + "epoch": 0.5055007528713881, + "grad_norm": 0.42494142055511475, + "learning_rate": 9.89199235758198e-05, + "loss": 1.3065, + "step": 38901 + }, + { + "epoch": 0.505513747415304, + "grad_norm": 0.3619062602519989, + "learning_rate": 9.891732411390843e-05, + "loss": 1.4312, + "step": 38902 + }, + { + "epoch": 0.5055267419592199, + "grad_norm": 0.4374343752861023, + "learning_rate": 9.891472465199704e-05, + "loss": 1.2428, + "step": 38903 + }, + { + "epoch": 0.5055397365031358, + "grad_norm": 0.3707488775253296, + "learning_rate": 9.891212519008565e-05, + "loss": 1.4428, + "step": 38904 + }, + { + "epoch": 0.5055527310470517, + "grad_norm": 0.427950918674469, + "learning_rate": 9.890952572817426e-05, + "loss": 1.349, + "step": 38905 + }, + { + "epoch": 0.5055657255909675, + "grad_norm": 0.39433446526527405, + "learning_rate": 9.890692626626289e-05, + "loss": 1.3938, + "step": 38906 + }, + { + "epoch": 0.5055787201348834, + "grad_norm": 0.44621676206588745, + "learning_rate": 9.89043268043515e-05, + "loss": 1.5067, + "step": 38907 + }, + { + "epoch": 0.5055917146787993, + "grad_norm": 0.4836346507072449, + "learning_rate": 9.890172734244012e-05, + "loss": 1.5258, + "step": 38908 + }, + { + "epoch": 0.5056047092227152, + "grad_norm": 0.34925729036331177, + "learning_rate": 9.889912788052873e-05, + "loss": 1.2934, + "step": 38909 + }, + { + "epoch": 0.505617703766631, + "grad_norm": 0.4168814420700073, + "learning_rate": 9.889652841861736e-05, + "loss": 1.339, + "step": 38910 + }, + { + "epoch": 0.5056306983105469, + "grad_norm": 0.311712384223938, + "learning_rate": 9.889392895670596e-05, + "loss": 1.1879, + "step": 38911 + }, + { + "epoch": 0.5056436928544628, + "grad_norm": 0.3923894166946411, + "learning_rate": 9.889132949479458e-05, + "loss": 1.3416, + "step": 38912 + }, + { + "epoch": 0.5056566873983787, + "grad_norm": 0.32185640931129456, + "learning_rate": 9.88887300328832e-05, + "loss": 1.3659, + "step": 38913 + }, + { + "epoch": 0.5056696819422944, + "grad_norm": 0.4125804603099823, + "learning_rate": 9.888613057097182e-05, + "loss": 1.2537, + "step": 38914 + }, + { + "epoch": 0.5056826764862103, + "grad_norm": 0.4877341091632843, + "learning_rate": 9.888353110906044e-05, + "loss": 1.4629, + "step": 38915 + }, + { + "epoch": 0.5056956710301262, + "grad_norm": 0.25266018509864807, + "learning_rate": 9.888093164714905e-05, + "loss": 1.1514, + "step": 38916 + }, + { + "epoch": 0.5057086655740421, + "grad_norm": 0.39433324337005615, + "learning_rate": 9.887833218523766e-05, + "loss": 1.2624, + "step": 38917 + }, + { + "epoch": 0.5057216601179579, + "grad_norm": 0.40970510244369507, + "learning_rate": 9.887573272332627e-05, + "loss": 1.316, + "step": 38918 + }, + { + "epoch": 0.5057346546618738, + "grad_norm": 0.41925790905952454, + "learning_rate": 9.88731332614149e-05, + "loss": 1.4473, + "step": 38919 + }, + { + "epoch": 0.5057476492057897, + "grad_norm": 0.4835989475250244, + "learning_rate": 9.887053379950351e-05, + "loss": 1.4043, + "step": 38920 + }, + { + "epoch": 0.5057606437497056, + "grad_norm": 0.4846076965332031, + "learning_rate": 9.886793433759213e-05, + "loss": 1.2623, + "step": 38921 + }, + { + "epoch": 0.5057736382936214, + "grad_norm": 0.39049088954925537, + "learning_rate": 9.886533487568074e-05, + "loss": 1.3817, + "step": 38922 + }, + { + "epoch": 0.5057866328375373, + "grad_norm": 0.4195057451725006, + "learning_rate": 9.886273541376935e-05, + "loss": 1.4288, + "step": 38923 + }, + { + "epoch": 0.5057996273814532, + "grad_norm": 0.34025663137435913, + "learning_rate": 9.886013595185797e-05, + "loss": 1.206, + "step": 38924 + }, + { + "epoch": 0.5058126219253691, + "grad_norm": 0.3454870879650116, + "learning_rate": 9.885753648994659e-05, + "loss": 1.4942, + "step": 38925 + }, + { + "epoch": 0.5058256164692849, + "grad_norm": 0.4575982391834259, + "learning_rate": 9.88549370280352e-05, + "loss": 1.3897, + "step": 38926 + }, + { + "epoch": 0.5058386110132008, + "grad_norm": 0.442023903131485, + "learning_rate": 9.885233756612382e-05, + "loss": 1.3317, + "step": 38927 + }, + { + "epoch": 0.5058516055571167, + "grad_norm": 0.32065555453300476, + "learning_rate": 9.884973810421244e-05, + "loss": 1.2771, + "step": 38928 + }, + { + "epoch": 0.5058646001010326, + "grad_norm": 0.3846389949321747, + "learning_rate": 9.884713864230105e-05, + "loss": 1.467, + "step": 38929 + }, + { + "epoch": 0.5058775946449484, + "grad_norm": 0.4648045003414154, + "learning_rate": 9.884453918038966e-05, + "loss": 1.3416, + "step": 38930 + }, + { + "epoch": 0.5058905891888643, + "grad_norm": 0.3404772877693176, + "learning_rate": 9.884193971847828e-05, + "loss": 1.5463, + "step": 38931 + }, + { + "epoch": 0.5059035837327802, + "grad_norm": 0.4219919443130493, + "learning_rate": 9.883934025656689e-05, + "loss": 1.4257, + "step": 38932 + }, + { + "epoch": 0.5059165782766961, + "grad_norm": 0.6982499957084656, + "learning_rate": 9.883674079465552e-05, + "loss": 1.3381, + "step": 38933 + }, + { + "epoch": 0.5059295728206119, + "grad_norm": 0.3580003082752228, + "learning_rate": 9.883414133274413e-05, + "loss": 1.451, + "step": 38934 + }, + { + "epoch": 0.5059425673645278, + "grad_norm": 0.39006948471069336, + "learning_rate": 9.883154187083274e-05, + "loss": 1.3216, + "step": 38935 + }, + { + "epoch": 0.5059555619084437, + "grad_norm": 0.6378609538078308, + "learning_rate": 9.882894240892135e-05, + "loss": 1.2982, + "step": 38936 + }, + { + "epoch": 0.5059685564523596, + "grad_norm": 0.3807947635650635, + "learning_rate": 9.882634294700997e-05, + "loss": 1.2957, + "step": 38937 + }, + { + "epoch": 0.5059815509962754, + "grad_norm": 0.4316331446170807, + "learning_rate": 9.882374348509859e-05, + "loss": 1.4239, + "step": 38938 + }, + { + "epoch": 0.5059945455401913, + "grad_norm": 0.38596537709236145, + "learning_rate": 9.882114402318721e-05, + "loss": 1.4194, + "step": 38939 + }, + { + "epoch": 0.5060075400841072, + "grad_norm": 0.41458335518836975, + "learning_rate": 9.881854456127582e-05, + "loss": 1.239, + "step": 38940 + }, + { + "epoch": 0.5060205346280231, + "grad_norm": 0.4952690303325653, + "learning_rate": 9.881594509936443e-05, + "loss": 1.5313, + "step": 38941 + }, + { + "epoch": 0.5060335291719389, + "grad_norm": 0.4348819851875305, + "learning_rate": 9.881334563745304e-05, + "loss": 1.2912, + "step": 38942 + }, + { + "epoch": 0.5060465237158548, + "grad_norm": 0.4328376054763794, + "learning_rate": 9.881074617554167e-05, + "loss": 1.5716, + "step": 38943 + }, + { + "epoch": 0.5060595182597707, + "grad_norm": 0.3950212299823761, + "learning_rate": 9.880814671363028e-05, + "loss": 1.3277, + "step": 38944 + }, + { + "epoch": 0.5060725128036866, + "grad_norm": 0.4061471223831177, + "learning_rate": 9.88055472517189e-05, + "loss": 1.4136, + "step": 38945 + }, + { + "epoch": 0.5060855073476024, + "grad_norm": 0.35404038429260254, + "learning_rate": 9.880294778980751e-05, + "loss": 1.4198, + "step": 38946 + }, + { + "epoch": 0.5060985018915183, + "grad_norm": 0.3556307852268219, + "learning_rate": 9.880034832789612e-05, + "loss": 1.2953, + "step": 38947 + }, + { + "epoch": 0.5061114964354342, + "grad_norm": 0.4540856182575226, + "learning_rate": 9.879774886598474e-05, + "loss": 1.336, + "step": 38948 + }, + { + "epoch": 0.5061244909793501, + "grad_norm": 0.5268908739089966, + "learning_rate": 9.879514940407336e-05, + "loss": 1.3452, + "step": 38949 + }, + { + "epoch": 0.5061374855232659, + "grad_norm": 0.4249366521835327, + "learning_rate": 9.879254994216198e-05, + "loss": 1.3342, + "step": 38950 + }, + { + "epoch": 0.5061504800671818, + "grad_norm": 0.42012661695480347, + "learning_rate": 9.87899504802506e-05, + "loss": 1.349, + "step": 38951 + }, + { + "epoch": 0.5061634746110977, + "grad_norm": 0.43349936604499817, + "learning_rate": 9.878735101833922e-05, + "loss": 1.4633, + "step": 38952 + }, + { + "epoch": 0.5061764691550136, + "grad_norm": 0.4422926902770996, + "learning_rate": 9.878475155642782e-05, + "loss": 1.3919, + "step": 38953 + }, + { + "epoch": 0.5061894636989294, + "grad_norm": 0.4255259335041046, + "learning_rate": 9.878215209451644e-05, + "loss": 1.4353, + "step": 38954 + }, + { + "epoch": 0.5062024582428453, + "grad_norm": 0.38872551918029785, + "learning_rate": 9.877955263260505e-05, + "loss": 1.2441, + "step": 38955 + }, + { + "epoch": 0.5062154527867612, + "grad_norm": 0.4083339273929596, + "learning_rate": 9.877695317069368e-05, + "loss": 1.4566, + "step": 38956 + }, + { + "epoch": 0.5062284473306771, + "grad_norm": 0.3543145954608917, + "learning_rate": 9.877435370878229e-05, + "loss": 1.3937, + "step": 38957 + }, + { + "epoch": 0.5062414418745929, + "grad_norm": 0.3212784230709076, + "learning_rate": 9.877175424687091e-05, + "loss": 1.1536, + "step": 38958 + }, + { + "epoch": 0.5062544364185088, + "grad_norm": 0.25021910667419434, + "learning_rate": 9.876915478495951e-05, + "loss": 1.3443, + "step": 38959 + }, + { + "epoch": 0.5062674309624247, + "grad_norm": 0.31177476048469543, + "learning_rate": 9.876655532304813e-05, + "loss": 1.2439, + "step": 38960 + }, + { + "epoch": 0.5062804255063406, + "grad_norm": 0.3537243902683258, + "learning_rate": 9.876395586113675e-05, + "loss": 1.5179, + "step": 38961 + }, + { + "epoch": 0.5062934200502563, + "grad_norm": 0.44082117080688477, + "learning_rate": 9.876135639922537e-05, + "loss": 1.4276, + "step": 38962 + }, + { + "epoch": 0.5063064145941722, + "grad_norm": 0.36659103631973267, + "learning_rate": 9.875875693731398e-05, + "loss": 1.327, + "step": 38963 + }, + { + "epoch": 0.5063194091380882, + "grad_norm": 0.3681972622871399, + "learning_rate": 9.87561574754026e-05, + "loss": 1.4731, + "step": 38964 + }, + { + "epoch": 0.506332403682004, + "grad_norm": 0.3321286141872406, + "learning_rate": 9.87535580134912e-05, + "loss": 1.2838, + "step": 38965 + }, + { + "epoch": 0.5063453982259198, + "grad_norm": 0.4140274226665497, + "learning_rate": 9.875095855157983e-05, + "loss": 1.5132, + "step": 38966 + }, + { + "epoch": 0.5063583927698357, + "grad_norm": 0.4781914949417114, + "learning_rate": 9.874835908966844e-05, + "loss": 1.4204, + "step": 38967 + }, + { + "epoch": 0.5063713873137516, + "grad_norm": 0.42894309759140015, + "learning_rate": 9.874575962775706e-05, + "loss": 1.3322, + "step": 38968 + }, + { + "epoch": 0.5063843818576675, + "grad_norm": 0.3696809411048889, + "learning_rate": 9.874316016584567e-05, + "loss": 1.0725, + "step": 38969 + }, + { + "epoch": 0.5063973764015833, + "grad_norm": 0.3757809102535248, + "learning_rate": 9.87405607039343e-05, + "loss": 1.3738, + "step": 38970 + }, + { + "epoch": 0.5064103709454992, + "grad_norm": 0.4483136832714081, + "learning_rate": 9.873796124202291e-05, + "loss": 1.3979, + "step": 38971 + }, + { + "epoch": 0.5064233654894151, + "grad_norm": 0.3810258209705353, + "learning_rate": 9.873536178011152e-05, + "loss": 1.3499, + "step": 38972 + }, + { + "epoch": 0.506436360033331, + "grad_norm": 0.43310102820396423, + "learning_rate": 9.873276231820013e-05, + "loss": 1.2242, + "step": 38973 + }, + { + "epoch": 0.5064493545772468, + "grad_norm": 0.5236282348632812, + "learning_rate": 9.873016285628876e-05, + "loss": 1.3746, + "step": 38974 + }, + { + "epoch": 0.5064623491211627, + "grad_norm": 0.37277260422706604, + "learning_rate": 9.872756339437737e-05, + "loss": 1.2893, + "step": 38975 + }, + { + "epoch": 0.5064753436650786, + "grad_norm": 0.4919889569282532, + "learning_rate": 9.872496393246599e-05, + "loss": 1.3748, + "step": 38976 + }, + { + "epoch": 0.5064883382089945, + "grad_norm": 0.47015029191970825, + "learning_rate": 9.87223644705546e-05, + "loss": 1.3494, + "step": 38977 + }, + { + "epoch": 0.5065013327529104, + "grad_norm": 0.41589295864105225, + "learning_rate": 9.871976500864321e-05, + "loss": 1.1958, + "step": 38978 + }, + { + "epoch": 0.5065143272968262, + "grad_norm": 0.45331794023513794, + "learning_rate": 9.871716554673182e-05, + "loss": 1.5814, + "step": 38979 + }, + { + "epoch": 0.5065273218407421, + "grad_norm": 0.3352465033531189, + "learning_rate": 9.871456608482045e-05, + "loss": 1.3548, + "step": 38980 + }, + { + "epoch": 0.506540316384658, + "grad_norm": 0.359514981508255, + "learning_rate": 9.871196662290906e-05, + "loss": 1.2862, + "step": 38981 + }, + { + "epoch": 0.5065533109285739, + "grad_norm": 0.4791959822177887, + "learning_rate": 9.870936716099768e-05, + "loss": 1.3639, + "step": 38982 + }, + { + "epoch": 0.5065663054724897, + "grad_norm": 0.43867239356040955, + "learning_rate": 9.87067676990863e-05, + "loss": 1.3326, + "step": 38983 + }, + { + "epoch": 0.5065793000164056, + "grad_norm": 0.44322293996810913, + "learning_rate": 9.87041682371749e-05, + "loss": 1.3724, + "step": 38984 + }, + { + "epoch": 0.5065922945603215, + "grad_norm": 0.41319164633750916, + "learning_rate": 9.870156877526352e-05, + "loss": 1.2863, + "step": 38985 + }, + { + "epoch": 0.5066052891042374, + "grad_norm": 0.31432044506073, + "learning_rate": 9.869896931335214e-05, + "loss": 1.2856, + "step": 38986 + }, + { + "epoch": 0.5066182836481532, + "grad_norm": 0.4204002618789673, + "learning_rate": 9.869636985144077e-05, + "loss": 1.3697, + "step": 38987 + }, + { + "epoch": 0.5066312781920691, + "grad_norm": 0.3710818290710449, + "learning_rate": 9.869377038952938e-05, + "loss": 1.5202, + "step": 38988 + }, + { + "epoch": 0.506644272735985, + "grad_norm": 0.39185553789138794, + "learning_rate": 9.869117092761799e-05, + "loss": 1.1378, + "step": 38989 + }, + { + "epoch": 0.5066572672799009, + "grad_norm": 0.4404742121696472, + "learning_rate": 9.86885714657066e-05, + "loss": 1.3626, + "step": 38990 + }, + { + "epoch": 0.5066702618238167, + "grad_norm": 0.3249245882034302, + "learning_rate": 9.868597200379522e-05, + "loss": 1.2427, + "step": 38991 + }, + { + "epoch": 0.5066832563677326, + "grad_norm": 0.4008246660232544, + "learning_rate": 9.868337254188383e-05, + "loss": 1.4032, + "step": 38992 + }, + { + "epoch": 0.5066962509116485, + "grad_norm": 0.4222026467323303, + "learning_rate": 9.868077307997246e-05, + "loss": 1.5242, + "step": 38993 + }, + { + "epoch": 0.5067092454555644, + "grad_norm": 0.5606344938278198, + "learning_rate": 9.867817361806107e-05, + "loss": 1.4008, + "step": 38994 + }, + { + "epoch": 0.5067222399994802, + "grad_norm": 0.4248940646648407, + "learning_rate": 9.867557415614968e-05, + "loss": 1.2444, + "step": 38995 + }, + { + "epoch": 0.5067352345433961, + "grad_norm": 0.47065213322639465, + "learning_rate": 9.867297469423829e-05, + "loss": 1.4587, + "step": 38996 + }, + { + "epoch": 0.506748229087312, + "grad_norm": 0.36539891362190247, + "learning_rate": 9.867037523232692e-05, + "loss": 1.3329, + "step": 38997 + }, + { + "epoch": 0.5067612236312279, + "grad_norm": 0.43173322081565857, + "learning_rate": 9.866777577041553e-05, + "loss": 1.5516, + "step": 38998 + }, + { + "epoch": 0.5067742181751437, + "grad_norm": 0.37651559710502625, + "learning_rate": 9.866517630850415e-05, + "loss": 1.2981, + "step": 38999 + }, + { + "epoch": 0.5067872127190596, + "grad_norm": 0.43880975246429443, + "learning_rate": 9.866257684659276e-05, + "loss": 1.5334, + "step": 39000 + }, + { + "epoch": 0.5068002072629755, + "grad_norm": 0.5258310437202454, + "learning_rate": 9.865997738468137e-05, + "loss": 1.412, + "step": 39001 + }, + { + "epoch": 0.5068132018068914, + "grad_norm": 0.364282488822937, + "learning_rate": 9.865737792276998e-05, + "loss": 1.4415, + "step": 39002 + }, + { + "epoch": 0.5068261963508072, + "grad_norm": 0.425642728805542, + "learning_rate": 9.865477846085861e-05, + "loss": 1.3678, + "step": 39003 + }, + { + "epoch": 0.5068391908947231, + "grad_norm": 0.4209429621696472, + "learning_rate": 9.865217899894722e-05, + "loss": 1.621, + "step": 39004 + }, + { + "epoch": 0.506852185438639, + "grad_norm": 0.3565278649330139, + "learning_rate": 9.864957953703584e-05, + "loss": 1.3581, + "step": 39005 + }, + { + "epoch": 0.5068651799825549, + "grad_norm": 0.44529885053634644, + "learning_rate": 9.864698007512445e-05, + "loss": 1.415, + "step": 39006 + }, + { + "epoch": 0.5068781745264707, + "grad_norm": 0.48272332549095154, + "learning_rate": 9.864438061321307e-05, + "loss": 1.4084, + "step": 39007 + }, + { + "epoch": 0.5068911690703866, + "grad_norm": 0.4302612543106079, + "learning_rate": 9.864178115130168e-05, + "loss": 1.2888, + "step": 39008 + }, + { + "epoch": 0.5069041636143025, + "grad_norm": 0.4196456968784332, + "learning_rate": 9.86391816893903e-05, + "loss": 1.4415, + "step": 39009 + }, + { + "epoch": 0.5069171581582184, + "grad_norm": 0.4898987114429474, + "learning_rate": 9.863658222747891e-05, + "loss": 1.487, + "step": 39010 + }, + { + "epoch": 0.5069301527021342, + "grad_norm": 0.3473092019557953, + "learning_rate": 9.863398276556754e-05, + "loss": 1.353, + "step": 39011 + }, + { + "epoch": 0.50694314724605, + "grad_norm": 0.37998655438423157, + "learning_rate": 9.863138330365615e-05, + "loss": 1.376, + "step": 39012 + }, + { + "epoch": 0.506956141789966, + "grad_norm": 0.4746607542037964, + "learning_rate": 9.862878384174477e-05, + "loss": 1.5109, + "step": 39013 + }, + { + "epoch": 0.5069691363338819, + "grad_norm": 0.32741448283195496, + "learning_rate": 9.862618437983337e-05, + "loss": 1.323, + "step": 39014 + }, + { + "epoch": 0.5069821308777976, + "grad_norm": 0.39243850111961365, + "learning_rate": 9.8623584917922e-05, + "loss": 1.5047, + "step": 39015 + }, + { + "epoch": 0.5069951254217135, + "grad_norm": 0.4433901906013489, + "learning_rate": 9.86209854560106e-05, + "loss": 1.4594, + "step": 39016 + }, + { + "epoch": 0.5070081199656294, + "grad_norm": 0.3639475405216217, + "learning_rate": 9.861838599409923e-05, + "loss": 1.4614, + "step": 39017 + }, + { + "epoch": 0.5070211145095453, + "grad_norm": 0.3475119471549988, + "learning_rate": 9.861578653218784e-05, + "loss": 1.3216, + "step": 39018 + }, + { + "epoch": 0.5070341090534611, + "grad_norm": 0.37647977471351624, + "learning_rate": 9.861318707027646e-05, + "loss": 1.5529, + "step": 39019 + }, + { + "epoch": 0.507047103597377, + "grad_norm": 0.31217002868652344, + "learning_rate": 9.861058760836506e-05, + "loss": 1.5192, + "step": 39020 + }, + { + "epoch": 0.5070600981412929, + "grad_norm": 0.39649656414985657, + "learning_rate": 9.860798814645369e-05, + "loss": 1.4413, + "step": 39021 + }, + { + "epoch": 0.5070730926852088, + "grad_norm": 0.26817792654037476, + "learning_rate": 9.86053886845423e-05, + "loss": 1.3552, + "step": 39022 + }, + { + "epoch": 0.5070860872291246, + "grad_norm": 0.42410463094711304, + "learning_rate": 9.860278922263092e-05, + "loss": 1.3251, + "step": 39023 + }, + { + "epoch": 0.5070990817730405, + "grad_norm": 0.4862684905529022, + "learning_rate": 9.860018976071955e-05, + "loss": 1.3354, + "step": 39024 + }, + { + "epoch": 0.5071120763169564, + "grad_norm": 0.384348064661026, + "learning_rate": 9.859759029880816e-05, + "loss": 1.3511, + "step": 39025 + }, + { + "epoch": 0.5071250708608723, + "grad_norm": 0.47815629839897156, + "learning_rate": 9.859499083689677e-05, + "loss": 1.4294, + "step": 39026 + }, + { + "epoch": 0.5071380654047881, + "grad_norm": 0.26062968373298645, + "learning_rate": 9.859239137498538e-05, + "loss": 1.1733, + "step": 39027 + }, + { + "epoch": 0.507151059948704, + "grad_norm": 0.35596537590026855, + "learning_rate": 9.8589791913074e-05, + "loss": 1.5939, + "step": 39028 + }, + { + "epoch": 0.5071640544926199, + "grad_norm": 0.47660031914711, + "learning_rate": 9.858719245116261e-05, + "loss": 1.3364, + "step": 39029 + }, + { + "epoch": 0.5071770490365358, + "grad_norm": 0.7936979532241821, + "learning_rate": 9.858459298925124e-05, + "loss": 1.2979, + "step": 39030 + }, + { + "epoch": 0.5071900435804516, + "grad_norm": 0.4277774691581726, + "learning_rate": 9.858199352733985e-05, + "loss": 1.2778, + "step": 39031 + }, + { + "epoch": 0.5072030381243675, + "grad_norm": 0.42370083928108215, + "learning_rate": 9.857939406542846e-05, + "loss": 1.3735, + "step": 39032 + }, + { + "epoch": 0.5072160326682834, + "grad_norm": 0.28332576155662537, + "learning_rate": 9.857679460351707e-05, + "loss": 1.2286, + "step": 39033 + }, + { + "epoch": 0.5072290272121993, + "grad_norm": 0.4282630980014801, + "learning_rate": 9.85741951416057e-05, + "loss": 1.3522, + "step": 39034 + }, + { + "epoch": 0.5072420217561151, + "grad_norm": 0.2657581567764282, + "learning_rate": 9.857159567969431e-05, + "loss": 1.2845, + "step": 39035 + }, + { + "epoch": 0.507255016300031, + "grad_norm": 0.5029018521308899, + "learning_rate": 9.856899621778293e-05, + "loss": 1.3669, + "step": 39036 + }, + { + "epoch": 0.5072680108439469, + "grad_norm": 0.44701889157295227, + "learning_rate": 9.856639675587154e-05, + "loss": 1.4461, + "step": 39037 + }, + { + "epoch": 0.5072810053878628, + "grad_norm": 0.45277971029281616, + "learning_rate": 9.856379729396015e-05, + "loss": 1.4303, + "step": 39038 + }, + { + "epoch": 0.5072939999317786, + "grad_norm": 0.3852752447128296, + "learning_rate": 9.856119783204876e-05, + "loss": 1.4924, + "step": 39039 + }, + { + "epoch": 0.5073069944756945, + "grad_norm": 0.3843984007835388, + "learning_rate": 9.855859837013739e-05, + "loss": 1.4405, + "step": 39040 + }, + { + "epoch": 0.5073199890196104, + "grad_norm": 0.29779189825057983, + "learning_rate": 9.8555998908226e-05, + "loss": 1.5695, + "step": 39041 + }, + { + "epoch": 0.5073329835635263, + "grad_norm": 0.5727490186691284, + "learning_rate": 9.855339944631462e-05, + "loss": 1.3484, + "step": 39042 + }, + { + "epoch": 0.5073459781074421, + "grad_norm": 0.39712148904800415, + "learning_rate": 9.855079998440324e-05, + "loss": 1.5144, + "step": 39043 + }, + { + "epoch": 0.507358972651358, + "grad_norm": 0.4007851779460907, + "learning_rate": 9.854820052249185e-05, + "loss": 1.3443, + "step": 39044 + }, + { + "epoch": 0.5073719671952739, + "grad_norm": 0.3793415129184723, + "learning_rate": 9.854560106058046e-05, + "loss": 1.3422, + "step": 39045 + }, + { + "epoch": 0.5073849617391898, + "grad_norm": 0.46783238649368286, + "learning_rate": 9.854300159866908e-05, + "loss": 1.5594, + "step": 39046 + }, + { + "epoch": 0.5073979562831056, + "grad_norm": 0.38685062527656555, + "learning_rate": 9.854040213675769e-05, + "loss": 1.4416, + "step": 39047 + }, + { + "epoch": 0.5074109508270215, + "grad_norm": 0.49881842732429504, + "learning_rate": 9.853780267484632e-05, + "loss": 1.3938, + "step": 39048 + }, + { + "epoch": 0.5074239453709374, + "grad_norm": 0.4347364902496338, + "learning_rate": 9.853520321293493e-05, + "loss": 1.497, + "step": 39049 + }, + { + "epoch": 0.5074369399148533, + "grad_norm": 0.45195472240448, + "learning_rate": 9.853260375102354e-05, + "loss": 1.4096, + "step": 39050 + }, + { + "epoch": 0.5074499344587691, + "grad_norm": 0.37484413385391235, + "learning_rate": 9.853000428911215e-05, + "loss": 1.4546, + "step": 39051 + }, + { + "epoch": 0.507462929002685, + "grad_norm": 0.37836262583732605, + "learning_rate": 9.852740482720077e-05, + "loss": 1.1198, + "step": 39052 + }, + { + "epoch": 0.5074759235466009, + "grad_norm": 0.32535144686698914, + "learning_rate": 9.852480536528939e-05, + "loss": 1.4362, + "step": 39053 + }, + { + "epoch": 0.5074889180905168, + "grad_norm": 0.45424124598503113, + "learning_rate": 9.852220590337801e-05, + "loss": 1.3379, + "step": 39054 + }, + { + "epoch": 0.5075019126344327, + "grad_norm": 0.36031848192214966, + "learning_rate": 9.851960644146662e-05, + "loss": 1.4038, + "step": 39055 + }, + { + "epoch": 0.5075149071783485, + "grad_norm": 0.40901780128479004, + "learning_rate": 9.851700697955523e-05, + "loss": 1.5447, + "step": 39056 + }, + { + "epoch": 0.5075279017222644, + "grad_norm": 0.427380234003067, + "learning_rate": 9.851440751764384e-05, + "loss": 1.492, + "step": 39057 + }, + { + "epoch": 0.5075408962661803, + "grad_norm": 0.38958027958869934, + "learning_rate": 9.851180805573247e-05, + "loss": 1.4365, + "step": 39058 + }, + { + "epoch": 0.5075538908100962, + "grad_norm": 0.4590432643890381, + "learning_rate": 9.850920859382108e-05, + "loss": 1.4199, + "step": 39059 + }, + { + "epoch": 0.507566885354012, + "grad_norm": 0.47250598669052124, + "learning_rate": 9.85066091319097e-05, + "loss": 1.3995, + "step": 39060 + }, + { + "epoch": 0.5075798798979279, + "grad_norm": 0.36515259742736816, + "learning_rate": 9.850400966999833e-05, + "loss": 1.2916, + "step": 39061 + }, + { + "epoch": 0.5075928744418438, + "grad_norm": 0.3330346345901489, + "learning_rate": 9.850141020808692e-05, + "loss": 1.5427, + "step": 39062 + }, + { + "epoch": 0.5076058689857597, + "grad_norm": 0.42624878883361816, + "learning_rate": 9.849881074617555e-05, + "loss": 1.312, + "step": 39063 + }, + { + "epoch": 0.5076188635296754, + "grad_norm": 0.44789859652519226, + "learning_rate": 9.849621128426416e-05, + "loss": 1.23, + "step": 39064 + }, + { + "epoch": 0.5076318580735913, + "grad_norm": 0.37938860058784485, + "learning_rate": 9.849361182235278e-05, + "loss": 1.4681, + "step": 39065 + }, + { + "epoch": 0.5076448526175072, + "grad_norm": 0.3430190682411194, + "learning_rate": 9.84910123604414e-05, + "loss": 1.4245, + "step": 39066 + }, + { + "epoch": 0.5076578471614231, + "grad_norm": 0.4929802417755127, + "learning_rate": 9.848841289853002e-05, + "loss": 1.4092, + "step": 39067 + }, + { + "epoch": 0.5076708417053389, + "grad_norm": 0.44188693165779114, + "learning_rate": 9.848581343661862e-05, + "loss": 1.2332, + "step": 39068 + }, + { + "epoch": 0.5076838362492548, + "grad_norm": 0.391022264957428, + "learning_rate": 9.848321397470724e-05, + "loss": 1.2075, + "step": 39069 + }, + { + "epoch": 0.5076968307931707, + "grad_norm": 0.41861647367477417, + "learning_rate": 9.848061451279585e-05, + "loss": 1.4019, + "step": 39070 + }, + { + "epoch": 0.5077098253370866, + "grad_norm": 0.3480142056941986, + "learning_rate": 9.847801505088448e-05, + "loss": 1.4242, + "step": 39071 + }, + { + "epoch": 0.5077228198810024, + "grad_norm": 0.3744737207889557, + "learning_rate": 9.847541558897309e-05, + "loss": 1.4627, + "step": 39072 + }, + { + "epoch": 0.5077358144249183, + "grad_norm": 0.4499123692512512, + "learning_rate": 9.847281612706171e-05, + "loss": 1.4574, + "step": 39073 + }, + { + "epoch": 0.5077488089688342, + "grad_norm": 0.34960511326789856, + "learning_rate": 9.847021666515032e-05, + "loss": 1.3298, + "step": 39074 + }, + { + "epoch": 0.5077618035127501, + "grad_norm": 0.36144495010375977, + "learning_rate": 9.846761720323893e-05, + "loss": 1.2475, + "step": 39075 + }, + { + "epoch": 0.5077747980566659, + "grad_norm": 0.41158127784729004, + "learning_rate": 9.846501774132754e-05, + "loss": 1.3089, + "step": 39076 + }, + { + "epoch": 0.5077877926005818, + "grad_norm": 0.33869001269340515, + "learning_rate": 9.846241827941617e-05, + "loss": 1.4093, + "step": 39077 + }, + { + "epoch": 0.5078007871444977, + "grad_norm": 0.3813190162181854, + "learning_rate": 9.845981881750478e-05, + "loss": 1.3357, + "step": 39078 + }, + { + "epoch": 0.5078137816884136, + "grad_norm": 0.36648333072662354, + "learning_rate": 9.84572193555934e-05, + "loss": 1.3756, + "step": 39079 + }, + { + "epoch": 0.5078267762323294, + "grad_norm": 0.4366242289543152, + "learning_rate": 9.845461989368202e-05, + "loss": 1.3843, + "step": 39080 + }, + { + "epoch": 0.5078397707762453, + "grad_norm": 0.4592461585998535, + "learning_rate": 9.845202043177063e-05, + "loss": 1.3387, + "step": 39081 + }, + { + "epoch": 0.5078527653201612, + "grad_norm": 0.35144972801208496, + "learning_rate": 9.844942096985924e-05, + "loss": 1.2979, + "step": 39082 + }, + { + "epoch": 0.5078657598640771, + "grad_norm": 0.4178408086299896, + "learning_rate": 9.844682150794786e-05, + "loss": 1.2145, + "step": 39083 + }, + { + "epoch": 0.5078787544079929, + "grad_norm": 0.5404536128044128, + "learning_rate": 9.844422204603647e-05, + "loss": 1.435, + "step": 39084 + }, + { + "epoch": 0.5078917489519088, + "grad_norm": 0.42538902163505554, + "learning_rate": 9.84416225841251e-05, + "loss": 1.5258, + "step": 39085 + }, + { + "epoch": 0.5079047434958247, + "grad_norm": 0.3881109654903412, + "learning_rate": 9.843902312221371e-05, + "loss": 1.5943, + "step": 39086 + }, + { + "epoch": 0.5079177380397406, + "grad_norm": 0.373271107673645, + "learning_rate": 9.843642366030232e-05, + "loss": 1.4006, + "step": 39087 + }, + { + "epoch": 0.5079307325836564, + "grad_norm": 0.38096803426742554, + "learning_rate": 9.843382419839093e-05, + "loss": 1.2663, + "step": 39088 + }, + { + "epoch": 0.5079437271275723, + "grad_norm": 0.3490661680698395, + "learning_rate": 9.843122473647955e-05, + "loss": 1.3182, + "step": 39089 + }, + { + "epoch": 0.5079567216714882, + "grad_norm": 0.3150794804096222, + "learning_rate": 9.842862527456817e-05, + "loss": 1.4092, + "step": 39090 + }, + { + "epoch": 0.5079697162154041, + "grad_norm": 0.4496360719203949, + "learning_rate": 9.842602581265679e-05, + "loss": 1.4594, + "step": 39091 + }, + { + "epoch": 0.5079827107593199, + "grad_norm": 0.4075656533241272, + "learning_rate": 9.84234263507454e-05, + "loss": 1.3182, + "step": 39092 + }, + { + "epoch": 0.5079957053032358, + "grad_norm": 0.4551810920238495, + "learning_rate": 9.842082688883401e-05, + "loss": 1.3865, + "step": 39093 + }, + { + "epoch": 0.5080086998471517, + "grad_norm": 0.34509798884391785, + "learning_rate": 9.841822742692262e-05, + "loss": 1.4242, + "step": 39094 + }, + { + "epoch": 0.5080216943910676, + "grad_norm": 0.4474751949310303, + "learning_rate": 9.841562796501125e-05, + "loss": 1.3955, + "step": 39095 + }, + { + "epoch": 0.5080346889349834, + "grad_norm": 0.3474105894565582, + "learning_rate": 9.841302850309986e-05, + "loss": 1.3519, + "step": 39096 + }, + { + "epoch": 0.5080476834788993, + "grad_norm": 0.294693261384964, + "learning_rate": 9.841042904118848e-05, + "loss": 1.3867, + "step": 39097 + }, + { + "epoch": 0.5080606780228152, + "grad_norm": 0.37856578826904297, + "learning_rate": 9.84078295792771e-05, + "loss": 1.3108, + "step": 39098 + }, + { + "epoch": 0.5080736725667311, + "grad_norm": 0.41657671332359314, + "learning_rate": 9.84052301173657e-05, + "loss": 1.2933, + "step": 39099 + }, + { + "epoch": 0.5080866671106469, + "grad_norm": 0.4832209348678589, + "learning_rate": 9.840263065545433e-05, + "loss": 1.504, + "step": 39100 + }, + { + "epoch": 0.5080996616545628, + "grad_norm": 0.32037147879600525, + "learning_rate": 9.840003119354294e-05, + "loss": 1.3887, + "step": 39101 + }, + { + "epoch": 0.5081126561984787, + "grad_norm": 0.4562925398349762, + "learning_rate": 9.839743173163156e-05, + "loss": 1.338, + "step": 39102 + }, + { + "epoch": 0.5081256507423946, + "grad_norm": 0.41976720094680786, + "learning_rate": 9.839483226972018e-05, + "loss": 1.7332, + "step": 39103 + }, + { + "epoch": 0.5081386452863104, + "grad_norm": 0.4253847897052765, + "learning_rate": 9.839223280780879e-05, + "loss": 1.4224, + "step": 39104 + }, + { + "epoch": 0.5081516398302263, + "grad_norm": 0.3479412794113159, + "learning_rate": 9.83896333458974e-05, + "loss": 1.5214, + "step": 39105 + }, + { + "epoch": 0.5081646343741422, + "grad_norm": 0.46863052248954773, + "learning_rate": 9.838703388398602e-05, + "loss": 1.4047, + "step": 39106 + }, + { + "epoch": 0.5081776289180581, + "grad_norm": 0.33401528000831604, + "learning_rate": 9.838443442207463e-05, + "loss": 1.4721, + "step": 39107 + }, + { + "epoch": 0.5081906234619739, + "grad_norm": 0.42769506573677063, + "learning_rate": 9.838183496016326e-05, + "loss": 1.5025, + "step": 39108 + }, + { + "epoch": 0.5082036180058898, + "grad_norm": 0.38392290472984314, + "learning_rate": 9.837923549825187e-05, + "loss": 1.5524, + "step": 39109 + }, + { + "epoch": 0.5082166125498057, + "grad_norm": 0.39655613899230957, + "learning_rate": 9.837663603634048e-05, + "loss": 1.4895, + "step": 39110 + }, + { + "epoch": 0.5082296070937216, + "grad_norm": 0.24204331636428833, + "learning_rate": 9.837403657442909e-05, + "loss": 1.3271, + "step": 39111 + }, + { + "epoch": 0.5082426016376373, + "grad_norm": 0.3175000250339508, + "learning_rate": 9.837143711251771e-05, + "loss": 1.357, + "step": 39112 + }, + { + "epoch": 0.5082555961815532, + "grad_norm": 0.3003745675086975, + "learning_rate": 9.836883765060633e-05, + "loss": 1.3441, + "step": 39113 + }, + { + "epoch": 0.5082685907254691, + "grad_norm": 0.43871405720710754, + "learning_rate": 9.836623818869495e-05, + "loss": 1.3118, + "step": 39114 + }, + { + "epoch": 0.508281585269385, + "grad_norm": 0.37494778633117676, + "learning_rate": 9.836363872678356e-05, + "loss": 1.3706, + "step": 39115 + }, + { + "epoch": 0.5082945798133008, + "grad_norm": 0.48025625944137573, + "learning_rate": 9.836103926487219e-05, + "loss": 1.4312, + "step": 39116 + }, + { + "epoch": 0.5083075743572167, + "grad_norm": 0.33407461643218994, + "learning_rate": 9.835843980296078e-05, + "loss": 1.3946, + "step": 39117 + }, + { + "epoch": 0.5083205689011326, + "grad_norm": 0.3723430633544922, + "learning_rate": 9.835584034104941e-05, + "loss": 1.422, + "step": 39118 + }, + { + "epoch": 0.5083335634450485, + "grad_norm": 0.38238099217414856, + "learning_rate": 9.835324087913802e-05, + "loss": 1.4006, + "step": 39119 + }, + { + "epoch": 0.5083465579889643, + "grad_norm": 0.33316078782081604, + "learning_rate": 9.835064141722664e-05, + "loss": 1.159, + "step": 39120 + }, + { + "epoch": 0.5083595525328802, + "grad_norm": 0.39543792605400085, + "learning_rate": 9.834804195531525e-05, + "loss": 1.3235, + "step": 39121 + }, + { + "epoch": 0.5083725470767961, + "grad_norm": 0.39916759729385376, + "learning_rate": 9.834544249340388e-05, + "loss": 1.5048, + "step": 39122 + }, + { + "epoch": 0.508385541620712, + "grad_norm": 0.30125388503074646, + "learning_rate": 9.834284303149248e-05, + "loss": 1.4396, + "step": 39123 + }, + { + "epoch": 0.5083985361646278, + "grad_norm": 0.38635626435279846, + "learning_rate": 9.83402435695811e-05, + "loss": 1.4106, + "step": 39124 + }, + { + "epoch": 0.5084115307085437, + "grad_norm": 0.363315612077713, + "learning_rate": 9.833764410766971e-05, + "loss": 1.2013, + "step": 39125 + }, + { + "epoch": 0.5084245252524596, + "grad_norm": 0.25509172677993774, + "learning_rate": 9.833504464575834e-05, + "loss": 1.3336, + "step": 39126 + }, + { + "epoch": 0.5084375197963755, + "grad_norm": 0.36476683616638184, + "learning_rate": 9.833244518384695e-05, + "loss": 1.4105, + "step": 39127 + }, + { + "epoch": 0.5084505143402913, + "grad_norm": 0.3915826082229614, + "learning_rate": 9.832984572193557e-05, + "loss": 1.2316, + "step": 39128 + }, + { + "epoch": 0.5084635088842072, + "grad_norm": 0.4001163840293884, + "learning_rate": 9.832724626002417e-05, + "loss": 1.4599, + "step": 39129 + }, + { + "epoch": 0.5084765034281231, + "grad_norm": 0.36212319135665894, + "learning_rate": 9.832464679811279e-05, + "loss": 1.325, + "step": 39130 + }, + { + "epoch": 0.508489497972039, + "grad_norm": 0.3813785910606384, + "learning_rate": 9.83220473362014e-05, + "loss": 1.3975, + "step": 39131 + }, + { + "epoch": 0.5085024925159549, + "grad_norm": 0.37070780992507935, + "learning_rate": 9.831944787429003e-05, + "loss": 1.3775, + "step": 39132 + }, + { + "epoch": 0.5085154870598707, + "grad_norm": 0.4906761646270752, + "learning_rate": 9.831684841237864e-05, + "loss": 1.4019, + "step": 39133 + }, + { + "epoch": 0.5085284816037866, + "grad_norm": 0.5181283354759216, + "learning_rate": 9.831424895046726e-05, + "loss": 1.3807, + "step": 39134 + }, + { + "epoch": 0.5085414761477025, + "grad_norm": 0.3241029381752014, + "learning_rate": 9.831164948855587e-05, + "loss": 1.364, + "step": 39135 + }, + { + "epoch": 0.5085544706916184, + "grad_norm": 0.4425843358039856, + "learning_rate": 9.830905002664449e-05, + "loss": 1.1851, + "step": 39136 + }, + { + "epoch": 0.5085674652355342, + "grad_norm": 0.2734488248825073, + "learning_rate": 9.830645056473311e-05, + "loss": 1.4825, + "step": 39137 + }, + { + "epoch": 0.5085804597794501, + "grad_norm": 0.38445886969566345, + "learning_rate": 9.830385110282172e-05, + "loss": 1.2179, + "step": 39138 + }, + { + "epoch": 0.508593454323366, + "grad_norm": 0.45285069942474365, + "learning_rate": 9.830125164091035e-05, + "loss": 1.2794, + "step": 39139 + }, + { + "epoch": 0.5086064488672819, + "grad_norm": 0.43517419695854187, + "learning_rate": 9.829865217899896e-05, + "loss": 1.3594, + "step": 39140 + }, + { + "epoch": 0.5086194434111977, + "grad_norm": 0.4396021068096161, + "learning_rate": 9.829605271708757e-05, + "loss": 1.4293, + "step": 39141 + }, + { + "epoch": 0.5086324379551136, + "grad_norm": 0.42517006397247314, + "learning_rate": 9.829345325517618e-05, + "loss": 1.2422, + "step": 39142 + }, + { + "epoch": 0.5086454324990295, + "grad_norm": 0.3792587220668793, + "learning_rate": 9.82908537932648e-05, + "loss": 1.3222, + "step": 39143 + }, + { + "epoch": 0.5086584270429454, + "grad_norm": 0.47380226850509644, + "learning_rate": 9.828825433135341e-05, + "loss": 1.3948, + "step": 39144 + }, + { + "epoch": 0.5086714215868612, + "grad_norm": 0.37331247329711914, + "learning_rate": 9.828565486944204e-05, + "loss": 1.3523, + "step": 39145 + }, + { + "epoch": 0.5086844161307771, + "grad_norm": 0.3739831745624542, + "learning_rate": 9.828305540753065e-05, + "loss": 1.1868, + "step": 39146 + }, + { + "epoch": 0.508697410674693, + "grad_norm": 0.3964419662952423, + "learning_rate": 9.828045594561926e-05, + "loss": 1.2513, + "step": 39147 + }, + { + "epoch": 0.5087104052186089, + "grad_norm": 0.39319562911987305, + "learning_rate": 9.827785648370787e-05, + "loss": 1.3898, + "step": 39148 + }, + { + "epoch": 0.5087233997625247, + "grad_norm": 0.4910820722579956, + "learning_rate": 9.82752570217965e-05, + "loss": 1.2613, + "step": 39149 + }, + { + "epoch": 0.5087363943064406, + "grad_norm": 0.4563422203063965, + "learning_rate": 9.82726575598851e-05, + "loss": 1.4288, + "step": 39150 + }, + { + "epoch": 0.5087493888503565, + "grad_norm": 0.4918176829814911, + "learning_rate": 9.827005809797373e-05, + "loss": 1.505, + "step": 39151 + }, + { + "epoch": 0.5087623833942724, + "grad_norm": 0.33197033405303955, + "learning_rate": 9.826745863606234e-05, + "loss": 1.2825, + "step": 39152 + }, + { + "epoch": 0.5087753779381882, + "grad_norm": 0.3984086215496063, + "learning_rate": 9.826485917415095e-05, + "loss": 1.3721, + "step": 39153 + }, + { + "epoch": 0.5087883724821041, + "grad_norm": 0.5225964188575745, + "learning_rate": 9.826225971223956e-05, + "loss": 1.3568, + "step": 39154 + }, + { + "epoch": 0.50880136702602, + "grad_norm": 0.45638221502304077, + "learning_rate": 9.825966025032819e-05, + "loss": 1.3787, + "step": 39155 + }, + { + "epoch": 0.5088143615699359, + "grad_norm": 0.44852665066719055, + "learning_rate": 9.82570607884168e-05, + "loss": 1.3562, + "step": 39156 + }, + { + "epoch": 0.5088273561138517, + "grad_norm": 0.37948957085609436, + "learning_rate": 9.825446132650542e-05, + "loss": 1.4091, + "step": 39157 + }, + { + "epoch": 0.5088403506577676, + "grad_norm": 0.20879511535167694, + "learning_rate": 9.825186186459403e-05, + "loss": 1.1155, + "step": 39158 + }, + { + "epoch": 0.5088533452016835, + "grad_norm": 0.45720037817955017, + "learning_rate": 9.824926240268265e-05, + "loss": 1.4597, + "step": 39159 + }, + { + "epoch": 0.5088663397455994, + "grad_norm": 0.3994595408439636, + "learning_rate": 9.824666294077126e-05, + "loss": 1.5436, + "step": 39160 + }, + { + "epoch": 0.5088793342895152, + "grad_norm": 0.3915811777114868, + "learning_rate": 9.824406347885988e-05, + "loss": 1.3103, + "step": 39161 + }, + { + "epoch": 0.508892328833431, + "grad_norm": 0.41099846363067627, + "learning_rate": 9.824146401694849e-05, + "loss": 1.263, + "step": 39162 + }, + { + "epoch": 0.508905323377347, + "grad_norm": 0.4201980531215668, + "learning_rate": 9.823886455503712e-05, + "loss": 1.3202, + "step": 39163 + }, + { + "epoch": 0.5089183179212629, + "grad_norm": 0.41457399725914, + "learning_rate": 9.823626509312573e-05, + "loss": 1.37, + "step": 39164 + }, + { + "epoch": 0.5089313124651786, + "grad_norm": 0.31223076581954956, + "learning_rate": 9.823366563121434e-05, + "loss": 1.4406, + "step": 39165 + }, + { + "epoch": 0.5089443070090945, + "grad_norm": 0.4344368577003479, + "learning_rate": 9.823106616930295e-05, + "loss": 1.3944, + "step": 39166 + }, + { + "epoch": 0.5089573015530104, + "grad_norm": 0.3481143116950989, + "learning_rate": 9.822846670739157e-05, + "loss": 1.3729, + "step": 39167 + }, + { + "epoch": 0.5089702960969263, + "grad_norm": 0.41374996304512024, + "learning_rate": 9.822586724548018e-05, + "loss": 1.4794, + "step": 39168 + }, + { + "epoch": 0.5089832906408421, + "grad_norm": 0.3319886028766632, + "learning_rate": 9.822326778356881e-05, + "loss": 1.2633, + "step": 39169 + }, + { + "epoch": 0.508996285184758, + "grad_norm": 0.4248144030570984, + "learning_rate": 9.822066832165742e-05, + "loss": 1.4331, + "step": 39170 + }, + { + "epoch": 0.5090092797286739, + "grad_norm": 0.4264313578605652, + "learning_rate": 9.821806885974603e-05, + "loss": 1.2653, + "step": 39171 + }, + { + "epoch": 0.5090222742725898, + "grad_norm": 0.3684108853340149, + "learning_rate": 9.821546939783464e-05, + "loss": 1.2743, + "step": 39172 + }, + { + "epoch": 0.5090352688165056, + "grad_norm": 0.4924924373626709, + "learning_rate": 9.821286993592327e-05, + "loss": 1.4386, + "step": 39173 + }, + { + "epoch": 0.5090482633604215, + "grad_norm": 0.4306740164756775, + "learning_rate": 9.821027047401189e-05, + "loss": 1.4667, + "step": 39174 + }, + { + "epoch": 0.5090612579043374, + "grad_norm": 0.4500599801540375, + "learning_rate": 9.82076710121005e-05, + "loss": 1.3339, + "step": 39175 + }, + { + "epoch": 0.5090742524482533, + "grad_norm": 0.34499624371528625, + "learning_rate": 9.820507155018913e-05, + "loss": 1.2237, + "step": 39176 + }, + { + "epoch": 0.5090872469921691, + "grad_norm": 0.3862970173358917, + "learning_rate": 9.820247208827774e-05, + "loss": 1.2948, + "step": 39177 + }, + { + "epoch": 0.509100241536085, + "grad_norm": 0.33407288789749146, + "learning_rate": 9.819987262636635e-05, + "loss": 1.4451, + "step": 39178 + }, + { + "epoch": 0.5091132360800009, + "grad_norm": 0.3596533238887787, + "learning_rate": 9.819727316445496e-05, + "loss": 1.4126, + "step": 39179 + }, + { + "epoch": 0.5091262306239168, + "grad_norm": 0.35112443566322327, + "learning_rate": 9.819467370254358e-05, + "loss": 1.2163, + "step": 39180 + }, + { + "epoch": 0.5091392251678326, + "grad_norm": 0.30794188380241394, + "learning_rate": 9.81920742406322e-05, + "loss": 1.1117, + "step": 39181 + }, + { + "epoch": 0.5091522197117485, + "grad_norm": 0.4689164161682129, + "learning_rate": 9.818947477872082e-05, + "loss": 1.3546, + "step": 39182 + }, + { + "epoch": 0.5091652142556644, + "grad_norm": 0.5049583315849304, + "learning_rate": 9.818687531680943e-05, + "loss": 1.4558, + "step": 39183 + }, + { + "epoch": 0.5091782087995803, + "grad_norm": 0.45010024309158325, + "learning_rate": 9.818427585489804e-05, + "loss": 1.1078, + "step": 39184 + }, + { + "epoch": 0.5091912033434961, + "grad_norm": 0.49955010414123535, + "learning_rate": 9.818167639298665e-05, + "loss": 1.5761, + "step": 39185 + }, + { + "epoch": 0.509204197887412, + "grad_norm": 0.38533174991607666, + "learning_rate": 9.817907693107528e-05, + "loss": 1.4038, + "step": 39186 + }, + { + "epoch": 0.5092171924313279, + "grad_norm": 0.376058429479599, + "learning_rate": 9.817647746916389e-05, + "loss": 1.3392, + "step": 39187 + }, + { + "epoch": 0.5092301869752438, + "grad_norm": 0.4223913252353668, + "learning_rate": 9.817387800725251e-05, + "loss": 1.3768, + "step": 39188 + }, + { + "epoch": 0.5092431815191596, + "grad_norm": 0.3299860656261444, + "learning_rate": 9.817127854534112e-05, + "loss": 1.2803, + "step": 39189 + }, + { + "epoch": 0.5092561760630755, + "grad_norm": 0.40014559030532837, + "learning_rate": 9.816867908342973e-05, + "loss": 1.2292, + "step": 39190 + }, + { + "epoch": 0.5092691706069914, + "grad_norm": 0.4480552673339844, + "learning_rate": 9.816607962151834e-05, + "loss": 1.3112, + "step": 39191 + }, + { + "epoch": 0.5092821651509073, + "grad_norm": 0.4934820234775543, + "learning_rate": 9.816348015960697e-05, + "loss": 1.4381, + "step": 39192 + }, + { + "epoch": 0.5092951596948231, + "grad_norm": 0.35670486092567444, + "learning_rate": 9.816088069769558e-05, + "loss": 1.4641, + "step": 39193 + }, + { + "epoch": 0.509308154238739, + "grad_norm": 0.36212441325187683, + "learning_rate": 9.81582812357842e-05, + "loss": 1.2408, + "step": 39194 + }, + { + "epoch": 0.5093211487826549, + "grad_norm": 0.2892414331436157, + "learning_rate": 9.815568177387282e-05, + "loss": 1.2475, + "step": 39195 + }, + { + "epoch": 0.5093341433265708, + "grad_norm": 0.33519724011421204, + "learning_rate": 9.815308231196143e-05, + "loss": 1.4386, + "step": 39196 + }, + { + "epoch": 0.5093471378704866, + "grad_norm": 0.4160432517528534, + "learning_rate": 9.815048285005004e-05, + "loss": 1.4025, + "step": 39197 + }, + { + "epoch": 0.5093601324144025, + "grad_norm": 0.3615199625492096, + "learning_rate": 9.814788338813866e-05, + "loss": 1.337, + "step": 39198 + }, + { + "epoch": 0.5093731269583184, + "grad_norm": 0.4740733206272125, + "learning_rate": 9.814528392622727e-05, + "loss": 1.2486, + "step": 39199 + }, + { + "epoch": 0.5093861215022343, + "grad_norm": 0.3478700816631317, + "learning_rate": 9.81426844643159e-05, + "loss": 1.3605, + "step": 39200 + }, + { + "epoch": 0.5093991160461501, + "grad_norm": 0.42419081926345825, + "learning_rate": 9.814008500240451e-05, + "loss": 1.4241, + "step": 39201 + }, + { + "epoch": 0.509412110590066, + "grad_norm": 0.4373956024646759, + "learning_rate": 9.813748554049312e-05, + "loss": 1.355, + "step": 39202 + }, + { + "epoch": 0.5094251051339819, + "grad_norm": 0.34196627140045166, + "learning_rate": 9.813488607858173e-05, + "loss": 1.1621, + "step": 39203 + }, + { + "epoch": 0.5094380996778978, + "grad_norm": 0.39044463634490967, + "learning_rate": 9.813228661667035e-05, + "loss": 1.3256, + "step": 39204 + }, + { + "epoch": 0.5094510942218136, + "grad_norm": 0.414628267288208, + "learning_rate": 9.812968715475897e-05, + "loss": 1.2813, + "step": 39205 + }, + { + "epoch": 0.5094640887657295, + "grad_norm": 0.4861219525337219, + "learning_rate": 9.812708769284759e-05, + "loss": 1.6509, + "step": 39206 + }, + { + "epoch": 0.5094770833096454, + "grad_norm": 0.41792428493499756, + "learning_rate": 9.81244882309362e-05, + "loss": 1.651, + "step": 39207 + }, + { + "epoch": 0.5094900778535613, + "grad_norm": 0.4257180988788605, + "learning_rate": 9.812188876902481e-05, + "loss": 1.3115, + "step": 39208 + }, + { + "epoch": 0.5095030723974772, + "grad_norm": 0.46537283062934875, + "learning_rate": 9.811928930711342e-05, + "loss": 1.5039, + "step": 39209 + }, + { + "epoch": 0.509516066941393, + "grad_norm": 0.3252220153808594, + "learning_rate": 9.811668984520205e-05, + "loss": 1.3486, + "step": 39210 + }, + { + "epoch": 0.5095290614853089, + "grad_norm": 0.4003971815109253, + "learning_rate": 9.811409038329067e-05, + "loss": 1.2687, + "step": 39211 + }, + { + "epoch": 0.5095420560292248, + "grad_norm": 0.48370397090911865, + "learning_rate": 9.811149092137928e-05, + "loss": 1.4693, + "step": 39212 + }, + { + "epoch": 0.5095550505731407, + "grad_norm": 0.3933558464050293, + "learning_rate": 9.810889145946789e-05, + "loss": 1.3913, + "step": 39213 + }, + { + "epoch": 0.5095680451170564, + "grad_norm": 0.36668825149536133, + "learning_rate": 9.81062919975565e-05, + "loss": 1.1639, + "step": 39214 + }, + { + "epoch": 0.5095810396609723, + "grad_norm": 0.39010128378868103, + "learning_rate": 9.810369253564513e-05, + "loss": 1.5945, + "step": 39215 + }, + { + "epoch": 0.5095940342048882, + "grad_norm": 0.38122034072875977, + "learning_rate": 9.810109307373374e-05, + "loss": 1.5294, + "step": 39216 + }, + { + "epoch": 0.5096070287488041, + "grad_norm": 0.47352004051208496, + "learning_rate": 9.809849361182236e-05, + "loss": 1.5688, + "step": 39217 + }, + { + "epoch": 0.5096200232927199, + "grad_norm": 0.37617090344429016, + "learning_rate": 9.809589414991097e-05, + "loss": 1.4715, + "step": 39218 + }, + { + "epoch": 0.5096330178366358, + "grad_norm": 0.4637976884841919, + "learning_rate": 9.80932946879996e-05, + "loss": 1.52, + "step": 39219 + }, + { + "epoch": 0.5096460123805517, + "grad_norm": 0.3982177972793579, + "learning_rate": 9.80906952260882e-05, + "loss": 1.3725, + "step": 39220 + }, + { + "epoch": 0.5096590069244676, + "grad_norm": 0.3927983045578003, + "learning_rate": 9.808809576417682e-05, + "loss": 1.4453, + "step": 39221 + }, + { + "epoch": 0.5096720014683834, + "grad_norm": 0.46731603145599365, + "learning_rate": 9.808549630226543e-05, + "loss": 1.4025, + "step": 39222 + }, + { + "epoch": 0.5096849960122993, + "grad_norm": 0.4086238145828247, + "learning_rate": 9.808289684035406e-05, + "loss": 1.3231, + "step": 39223 + }, + { + "epoch": 0.5096979905562152, + "grad_norm": 0.424526572227478, + "learning_rate": 9.808029737844267e-05, + "loss": 1.5135, + "step": 39224 + }, + { + "epoch": 0.5097109851001311, + "grad_norm": 0.4239373505115509, + "learning_rate": 9.807769791653129e-05, + "loss": 1.3703, + "step": 39225 + }, + { + "epoch": 0.5097239796440469, + "grad_norm": 0.3453957438468933, + "learning_rate": 9.807509845461989e-05, + "loss": 1.3502, + "step": 39226 + }, + { + "epoch": 0.5097369741879628, + "grad_norm": 0.4526543617248535, + "learning_rate": 9.807249899270851e-05, + "loss": 1.4249, + "step": 39227 + }, + { + "epoch": 0.5097499687318787, + "grad_norm": 0.4349013566970825, + "learning_rate": 9.806989953079712e-05, + "loss": 1.4725, + "step": 39228 + }, + { + "epoch": 0.5097629632757946, + "grad_norm": 0.351409375667572, + "learning_rate": 9.806730006888575e-05, + "loss": 1.3898, + "step": 39229 + }, + { + "epoch": 0.5097759578197104, + "grad_norm": 0.3883218765258789, + "learning_rate": 9.806470060697436e-05, + "loss": 1.2212, + "step": 39230 + }, + { + "epoch": 0.5097889523636263, + "grad_norm": 0.34148740768432617, + "learning_rate": 9.806210114506298e-05, + "loss": 1.4885, + "step": 39231 + }, + { + "epoch": 0.5098019469075422, + "grad_norm": 0.32814446091651917, + "learning_rate": 9.805950168315158e-05, + "loss": 1.352, + "step": 39232 + }, + { + "epoch": 0.5098149414514581, + "grad_norm": 0.3692649006843567, + "learning_rate": 9.80569022212402e-05, + "loss": 1.3238, + "step": 39233 + }, + { + "epoch": 0.5098279359953739, + "grad_norm": 0.39818865060806274, + "learning_rate": 9.805430275932882e-05, + "loss": 1.357, + "step": 39234 + }, + { + "epoch": 0.5098409305392898, + "grad_norm": 0.5230275988578796, + "learning_rate": 9.805170329741744e-05, + "loss": 1.4339, + "step": 39235 + }, + { + "epoch": 0.5098539250832057, + "grad_norm": 0.4490581452846527, + "learning_rate": 9.804910383550605e-05, + "loss": 1.4215, + "step": 39236 + }, + { + "epoch": 0.5098669196271216, + "grad_norm": 0.47048327326774597, + "learning_rate": 9.804650437359468e-05, + "loss": 1.2651, + "step": 39237 + }, + { + "epoch": 0.5098799141710374, + "grad_norm": 0.36902928352355957, + "learning_rate": 9.804390491168329e-05, + "loss": 1.4508, + "step": 39238 + }, + { + "epoch": 0.5098929087149533, + "grad_norm": 0.36316853761672974, + "learning_rate": 9.80413054497719e-05, + "loss": 1.2027, + "step": 39239 + }, + { + "epoch": 0.5099059032588692, + "grad_norm": 0.28990671038627625, + "learning_rate": 9.803870598786051e-05, + "loss": 1.4653, + "step": 39240 + }, + { + "epoch": 0.5099188978027851, + "grad_norm": 0.4087803363800049, + "learning_rate": 9.803610652594913e-05, + "loss": 1.4851, + "step": 39241 + }, + { + "epoch": 0.5099318923467009, + "grad_norm": 0.40900230407714844, + "learning_rate": 9.803350706403775e-05, + "loss": 1.4249, + "step": 39242 + }, + { + "epoch": 0.5099448868906168, + "grad_norm": 0.43276965618133545, + "learning_rate": 9.803090760212637e-05, + "loss": 1.2779, + "step": 39243 + }, + { + "epoch": 0.5099578814345327, + "grad_norm": 0.4382627308368683, + "learning_rate": 9.802830814021498e-05, + "loss": 1.3701, + "step": 39244 + }, + { + "epoch": 0.5099708759784486, + "grad_norm": 0.3575490415096283, + "learning_rate": 9.802570867830359e-05, + "loss": 1.3614, + "step": 39245 + }, + { + "epoch": 0.5099838705223644, + "grad_norm": 0.4890463054180145, + "learning_rate": 9.80231092163922e-05, + "loss": 1.4129, + "step": 39246 + }, + { + "epoch": 0.5099968650662803, + "grad_norm": 0.5059266686439514, + "learning_rate": 9.802050975448083e-05, + "loss": 1.5434, + "step": 39247 + }, + { + "epoch": 0.5100098596101962, + "grad_norm": 0.3912336826324463, + "learning_rate": 9.801791029256945e-05, + "loss": 1.2874, + "step": 39248 + }, + { + "epoch": 0.5100228541541121, + "grad_norm": 0.5042760968208313, + "learning_rate": 9.801531083065806e-05, + "loss": 1.3362, + "step": 39249 + }, + { + "epoch": 0.5100358486980279, + "grad_norm": 0.4623928964138031, + "learning_rate": 9.801271136874667e-05, + "loss": 1.4268, + "step": 39250 + }, + { + "epoch": 0.5100488432419438, + "grad_norm": 0.36923345923423767, + "learning_rate": 9.801011190683528e-05, + "loss": 1.3538, + "step": 39251 + }, + { + "epoch": 0.5100618377858597, + "grad_norm": 0.34344062209129333, + "learning_rate": 9.800751244492391e-05, + "loss": 1.0877, + "step": 39252 + }, + { + "epoch": 0.5100748323297756, + "grad_norm": 0.38448214530944824, + "learning_rate": 9.800491298301252e-05, + "loss": 1.377, + "step": 39253 + }, + { + "epoch": 0.5100878268736914, + "grad_norm": 0.37683412432670593, + "learning_rate": 9.800231352110114e-05, + "loss": 1.1389, + "step": 39254 + }, + { + "epoch": 0.5101008214176073, + "grad_norm": 0.4299337565898895, + "learning_rate": 9.799971405918976e-05, + "loss": 1.4249, + "step": 39255 + }, + { + "epoch": 0.5101138159615232, + "grad_norm": 0.3300829529762268, + "learning_rate": 9.799711459727837e-05, + "loss": 1.6148, + "step": 39256 + }, + { + "epoch": 0.5101268105054391, + "grad_norm": 0.4856196939945221, + "learning_rate": 9.799451513536698e-05, + "loss": 1.5518, + "step": 39257 + }, + { + "epoch": 0.5101398050493549, + "grad_norm": 0.35708701610565186, + "learning_rate": 9.79919156734556e-05, + "loss": 1.4513, + "step": 39258 + }, + { + "epoch": 0.5101527995932708, + "grad_norm": 0.3373710811138153, + "learning_rate": 9.798931621154421e-05, + "loss": 1.325, + "step": 39259 + }, + { + "epoch": 0.5101657941371867, + "grad_norm": 0.4144609570503235, + "learning_rate": 9.798671674963284e-05, + "loss": 1.4631, + "step": 39260 + }, + { + "epoch": 0.5101787886811026, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.798411728772145e-05, + "loss": 1.4006, + "step": 39261 + }, + { + "epoch": 0.5101917832250183, + "grad_norm": 0.35719284415245056, + "learning_rate": 9.798151782581006e-05, + "loss": 1.2885, + "step": 39262 + }, + { + "epoch": 0.5102047777689342, + "grad_norm": 0.39487341046333313, + "learning_rate": 9.797891836389867e-05, + "loss": 1.3962, + "step": 39263 + }, + { + "epoch": 0.5102177723128501, + "grad_norm": 0.45949462056159973, + "learning_rate": 9.79763189019873e-05, + "loss": 1.5093, + "step": 39264 + }, + { + "epoch": 0.510230766856766, + "grad_norm": 0.42805638909339905, + "learning_rate": 9.79737194400759e-05, + "loss": 1.5651, + "step": 39265 + }, + { + "epoch": 0.5102437614006818, + "grad_norm": 0.4566091299057007, + "learning_rate": 9.797111997816453e-05, + "loss": 1.5625, + "step": 39266 + }, + { + "epoch": 0.5102567559445977, + "grad_norm": 0.34924131631851196, + "learning_rate": 9.796852051625314e-05, + "loss": 1.447, + "step": 39267 + }, + { + "epoch": 0.5102697504885136, + "grad_norm": 0.5608017444610596, + "learning_rate": 9.796592105434175e-05, + "loss": 1.4102, + "step": 39268 + }, + { + "epoch": 0.5102827450324295, + "grad_norm": 0.3222692906856537, + "learning_rate": 9.796332159243036e-05, + "loss": 1.1683, + "step": 39269 + }, + { + "epoch": 0.5102957395763453, + "grad_norm": 0.4071926772594452, + "learning_rate": 9.796072213051899e-05, + "loss": 1.5976, + "step": 39270 + }, + { + "epoch": 0.5103087341202612, + "grad_norm": 0.36689645051956177, + "learning_rate": 9.79581226686076e-05, + "loss": 1.3351, + "step": 39271 + }, + { + "epoch": 0.5103217286641771, + "grad_norm": 0.44225069880485535, + "learning_rate": 9.795552320669622e-05, + "loss": 1.2521, + "step": 39272 + }, + { + "epoch": 0.510334723208093, + "grad_norm": 0.4344242215156555, + "learning_rate": 9.795292374478483e-05, + "loss": 1.3839, + "step": 39273 + }, + { + "epoch": 0.5103477177520088, + "grad_norm": 0.3886382579803467, + "learning_rate": 9.795032428287344e-05, + "loss": 1.4903, + "step": 39274 + }, + { + "epoch": 0.5103607122959247, + "grad_norm": 0.48813384771347046, + "learning_rate": 9.794772482096206e-05, + "loss": 1.4209, + "step": 39275 + }, + { + "epoch": 0.5103737068398406, + "grad_norm": 0.4030526280403137, + "learning_rate": 9.794512535905068e-05, + "loss": 1.5349, + "step": 39276 + }, + { + "epoch": 0.5103867013837565, + "grad_norm": 0.44391322135925293, + "learning_rate": 9.794252589713929e-05, + "loss": 1.374, + "step": 39277 + }, + { + "epoch": 0.5103996959276723, + "grad_norm": 0.4312180280685425, + "learning_rate": 9.793992643522792e-05, + "loss": 1.3403, + "step": 39278 + }, + { + "epoch": 0.5104126904715882, + "grad_norm": 0.31165820360183716, + "learning_rate": 9.793732697331653e-05, + "loss": 1.0907, + "step": 39279 + }, + { + "epoch": 0.5104256850155041, + "grad_norm": 0.49674123525619507, + "learning_rate": 9.793472751140515e-05, + "loss": 1.5206, + "step": 39280 + }, + { + "epoch": 0.51043867955942, + "grad_norm": 0.4100476801395416, + "learning_rate": 9.793212804949375e-05, + "loss": 1.3613, + "step": 39281 + }, + { + "epoch": 0.5104516741033359, + "grad_norm": 0.4155726730823517, + "learning_rate": 9.792952858758237e-05, + "loss": 1.4438, + "step": 39282 + }, + { + "epoch": 0.5104646686472517, + "grad_norm": 0.38523587584495544, + "learning_rate": 9.792692912567098e-05, + "loss": 1.6815, + "step": 39283 + }, + { + "epoch": 0.5104776631911676, + "grad_norm": 0.3895767331123352, + "learning_rate": 9.792432966375961e-05, + "loss": 1.5873, + "step": 39284 + }, + { + "epoch": 0.5104906577350835, + "grad_norm": 0.4110410511493683, + "learning_rate": 9.792173020184823e-05, + "loss": 1.4763, + "step": 39285 + }, + { + "epoch": 0.5105036522789994, + "grad_norm": 0.2810681462287903, + "learning_rate": 9.791913073993684e-05, + "loss": 1.3509, + "step": 39286 + }, + { + "epoch": 0.5105166468229152, + "grad_norm": 0.43798908591270447, + "learning_rate": 9.791653127802545e-05, + "loss": 1.163, + "step": 39287 + }, + { + "epoch": 0.5105296413668311, + "grad_norm": 0.3786788284778595, + "learning_rate": 9.791393181611407e-05, + "loss": 1.3499, + "step": 39288 + }, + { + "epoch": 0.510542635910747, + "grad_norm": 0.3901594281196594, + "learning_rate": 9.791133235420269e-05, + "loss": 1.3245, + "step": 39289 + }, + { + "epoch": 0.5105556304546629, + "grad_norm": 0.3720622956752777, + "learning_rate": 9.79087328922913e-05, + "loss": 1.2647, + "step": 39290 + }, + { + "epoch": 0.5105686249985787, + "grad_norm": 0.5036496520042419, + "learning_rate": 9.790613343037993e-05, + "loss": 1.3893, + "step": 39291 + }, + { + "epoch": 0.5105816195424946, + "grad_norm": 0.3622131943702698, + "learning_rate": 9.790353396846854e-05, + "loss": 1.1889, + "step": 39292 + }, + { + "epoch": 0.5105946140864105, + "grad_norm": 0.33060070872306824, + "learning_rate": 9.790093450655715e-05, + "loss": 1.6825, + "step": 39293 + }, + { + "epoch": 0.5106076086303264, + "grad_norm": 0.36880016326904297, + "learning_rate": 9.789833504464576e-05, + "loss": 1.3704, + "step": 39294 + }, + { + "epoch": 0.5106206031742422, + "grad_norm": 0.3040933609008789, + "learning_rate": 9.789573558273438e-05, + "loss": 1.287, + "step": 39295 + }, + { + "epoch": 0.5106335977181581, + "grad_norm": 0.3780258297920227, + "learning_rate": 9.7893136120823e-05, + "loss": 1.1544, + "step": 39296 + }, + { + "epoch": 0.510646592262074, + "grad_norm": 0.33794379234313965, + "learning_rate": 9.789053665891162e-05, + "loss": 1.3383, + "step": 39297 + }, + { + "epoch": 0.5106595868059899, + "grad_norm": 0.34207865595817566, + "learning_rate": 9.788793719700023e-05, + "loss": 1.402, + "step": 39298 + }, + { + "epoch": 0.5106725813499057, + "grad_norm": 0.3777764141559601, + "learning_rate": 9.788533773508884e-05, + "loss": 1.5684, + "step": 39299 + }, + { + "epoch": 0.5106855758938216, + "grad_norm": 0.43486225605010986, + "learning_rate": 9.788273827317745e-05, + "loss": 1.3576, + "step": 39300 + }, + { + "epoch": 0.5106985704377375, + "grad_norm": 0.34987571835517883, + "learning_rate": 9.788013881126608e-05, + "loss": 1.6069, + "step": 39301 + }, + { + "epoch": 0.5107115649816534, + "grad_norm": 0.37480953335762024, + "learning_rate": 9.787753934935469e-05, + "loss": 1.0948, + "step": 39302 + }, + { + "epoch": 0.5107245595255692, + "grad_norm": 0.5006477236747742, + "learning_rate": 9.787493988744331e-05, + "loss": 1.5047, + "step": 39303 + }, + { + "epoch": 0.5107375540694851, + "grad_norm": 0.28755712509155273, + "learning_rate": 9.787234042553192e-05, + "loss": 1.2244, + "step": 39304 + }, + { + "epoch": 0.510750548613401, + "grad_norm": 0.47323811054229736, + "learning_rate": 9.786974096362053e-05, + "loss": 1.4976, + "step": 39305 + }, + { + "epoch": 0.5107635431573169, + "grad_norm": 0.4157203733921051, + "learning_rate": 9.786714150170914e-05, + "loss": 1.4582, + "step": 39306 + }, + { + "epoch": 0.5107765377012327, + "grad_norm": 0.4243549704551697, + "learning_rate": 9.786454203979777e-05, + "loss": 1.5274, + "step": 39307 + }, + { + "epoch": 0.5107895322451486, + "grad_norm": 0.32249003648757935, + "learning_rate": 9.786194257788638e-05, + "loss": 1.3357, + "step": 39308 + }, + { + "epoch": 0.5108025267890645, + "grad_norm": 0.4227658212184906, + "learning_rate": 9.7859343115975e-05, + "loss": 1.4523, + "step": 39309 + }, + { + "epoch": 0.5108155213329804, + "grad_norm": 0.3778490424156189, + "learning_rate": 9.785674365406361e-05, + "loss": 1.3866, + "step": 39310 + }, + { + "epoch": 0.5108285158768961, + "grad_norm": 0.395142138004303, + "learning_rate": 9.785414419215223e-05, + "loss": 1.3202, + "step": 39311 + }, + { + "epoch": 0.510841510420812, + "grad_norm": 0.552781879901886, + "learning_rate": 9.785154473024084e-05, + "loss": 1.6138, + "step": 39312 + }, + { + "epoch": 0.510854504964728, + "grad_norm": 0.4020445644855499, + "learning_rate": 9.784894526832946e-05, + "loss": 1.5534, + "step": 39313 + }, + { + "epoch": 0.5108674995086439, + "grad_norm": 0.3734736144542694, + "learning_rate": 9.784634580641807e-05, + "loss": 1.2685, + "step": 39314 + }, + { + "epoch": 0.5108804940525596, + "grad_norm": 0.34066110849380493, + "learning_rate": 9.78437463445067e-05, + "loss": 1.2275, + "step": 39315 + }, + { + "epoch": 0.5108934885964755, + "grad_norm": 0.39267152547836304, + "learning_rate": 9.784114688259531e-05, + "loss": 1.3644, + "step": 39316 + }, + { + "epoch": 0.5109064831403914, + "grad_norm": 0.3191804587841034, + "learning_rate": 9.783854742068392e-05, + "loss": 1.3228, + "step": 39317 + }, + { + "epoch": 0.5109194776843073, + "grad_norm": 0.41228315234184265, + "learning_rate": 9.783594795877253e-05, + "loss": 1.5185, + "step": 39318 + }, + { + "epoch": 0.5109324722282231, + "grad_norm": 0.4741588532924652, + "learning_rate": 9.783334849686115e-05, + "loss": 1.2785, + "step": 39319 + }, + { + "epoch": 0.510945466772139, + "grad_norm": 0.3594357669353485, + "learning_rate": 9.783074903494976e-05, + "loss": 1.2011, + "step": 39320 + }, + { + "epoch": 0.5109584613160549, + "grad_norm": 0.3708149492740631, + "learning_rate": 9.782814957303839e-05, + "loss": 1.4429, + "step": 39321 + }, + { + "epoch": 0.5109714558599708, + "grad_norm": 0.36683204770088196, + "learning_rate": 9.782555011112701e-05, + "loss": 1.2601, + "step": 39322 + }, + { + "epoch": 0.5109844504038866, + "grad_norm": 0.35370591282844543, + "learning_rate": 9.782295064921561e-05, + "loss": 1.4153, + "step": 39323 + }, + { + "epoch": 0.5109974449478025, + "grad_norm": 0.5464272499084473, + "learning_rate": 9.782035118730424e-05, + "loss": 1.3843, + "step": 39324 + }, + { + "epoch": 0.5110104394917184, + "grad_norm": 0.38672149181365967, + "learning_rate": 9.781775172539285e-05, + "loss": 1.4909, + "step": 39325 + }, + { + "epoch": 0.5110234340356343, + "grad_norm": 0.37967124581336975, + "learning_rate": 9.781515226348147e-05, + "loss": 1.4825, + "step": 39326 + }, + { + "epoch": 0.5110364285795501, + "grad_norm": 0.40882858633995056, + "learning_rate": 9.781255280157008e-05, + "loss": 1.4778, + "step": 39327 + }, + { + "epoch": 0.511049423123466, + "grad_norm": 0.43306443095207214, + "learning_rate": 9.78099533396587e-05, + "loss": 1.2781, + "step": 39328 + }, + { + "epoch": 0.5110624176673819, + "grad_norm": 0.3639138340950012, + "learning_rate": 9.78073538777473e-05, + "loss": 1.2938, + "step": 39329 + }, + { + "epoch": 0.5110754122112978, + "grad_norm": 0.3407760262489319, + "learning_rate": 9.780475441583593e-05, + "loss": 1.4925, + "step": 39330 + }, + { + "epoch": 0.5110884067552136, + "grad_norm": 0.4410576820373535, + "learning_rate": 9.780215495392454e-05, + "loss": 1.5092, + "step": 39331 + }, + { + "epoch": 0.5111014012991295, + "grad_norm": 0.4604591727256775, + "learning_rate": 9.779955549201316e-05, + "loss": 1.3982, + "step": 39332 + }, + { + "epoch": 0.5111143958430454, + "grad_norm": 0.45295262336730957, + "learning_rate": 9.779695603010177e-05, + "loss": 1.4926, + "step": 39333 + }, + { + "epoch": 0.5111273903869613, + "grad_norm": 0.5161308646202087, + "learning_rate": 9.77943565681904e-05, + "loss": 1.4366, + "step": 39334 + }, + { + "epoch": 0.5111403849308771, + "grad_norm": 0.38867849111557007, + "learning_rate": 9.7791757106279e-05, + "loss": 1.5089, + "step": 39335 + }, + { + "epoch": 0.511153379474793, + "grad_norm": 0.3477202355861664, + "learning_rate": 9.778915764436762e-05, + "loss": 1.2176, + "step": 39336 + }, + { + "epoch": 0.5111663740187089, + "grad_norm": 0.41309410333633423, + "learning_rate": 9.778655818245623e-05, + "loss": 1.3876, + "step": 39337 + }, + { + "epoch": 0.5111793685626248, + "grad_norm": 0.4485919177532196, + "learning_rate": 9.778395872054486e-05, + "loss": 1.5452, + "step": 39338 + }, + { + "epoch": 0.5111923631065406, + "grad_norm": 0.4592667520046234, + "learning_rate": 9.778135925863347e-05, + "loss": 1.3997, + "step": 39339 + }, + { + "epoch": 0.5112053576504565, + "grad_norm": 0.34632593393325806, + "learning_rate": 9.777875979672209e-05, + "loss": 1.2485, + "step": 39340 + }, + { + "epoch": 0.5112183521943724, + "grad_norm": 0.41406524181365967, + "learning_rate": 9.77761603348107e-05, + "loss": 1.3343, + "step": 39341 + }, + { + "epoch": 0.5112313467382883, + "grad_norm": 0.26073506474494934, + "learning_rate": 9.777356087289931e-05, + "loss": 1.1468, + "step": 39342 + }, + { + "epoch": 0.5112443412822041, + "grad_norm": 0.4435719847679138, + "learning_rate": 9.777096141098792e-05, + "loss": 1.407, + "step": 39343 + }, + { + "epoch": 0.51125733582612, + "grad_norm": 0.5298016667366028, + "learning_rate": 9.776836194907655e-05, + "loss": 1.2463, + "step": 39344 + }, + { + "epoch": 0.5112703303700359, + "grad_norm": 0.5006462335586548, + "learning_rate": 9.776576248716516e-05, + "loss": 1.4353, + "step": 39345 + }, + { + "epoch": 0.5112833249139518, + "grad_norm": 0.4609855115413666, + "learning_rate": 9.776316302525378e-05, + "loss": 1.3395, + "step": 39346 + }, + { + "epoch": 0.5112963194578676, + "grad_norm": 0.3615628182888031, + "learning_rate": 9.77605635633424e-05, + "loss": 1.2369, + "step": 39347 + }, + { + "epoch": 0.5113093140017835, + "grad_norm": 0.4094845950603485, + "learning_rate": 9.7757964101431e-05, + "loss": 1.426, + "step": 39348 + }, + { + "epoch": 0.5113223085456994, + "grad_norm": 0.3444294333457947, + "learning_rate": 9.775536463951962e-05, + "loss": 1.481, + "step": 39349 + }, + { + "epoch": 0.5113353030896153, + "grad_norm": 0.38126417994499207, + "learning_rate": 9.775276517760824e-05, + "loss": 1.4755, + "step": 39350 + }, + { + "epoch": 0.5113482976335311, + "grad_norm": 0.29789432883262634, + "learning_rate": 9.775016571569685e-05, + "loss": 1.2326, + "step": 39351 + }, + { + "epoch": 0.511361292177447, + "grad_norm": 0.3457271456718445, + "learning_rate": 9.774756625378548e-05, + "loss": 1.1788, + "step": 39352 + }, + { + "epoch": 0.5113742867213629, + "grad_norm": 0.4097297489643097, + "learning_rate": 9.774496679187409e-05, + "loss": 1.4295, + "step": 39353 + }, + { + "epoch": 0.5113872812652788, + "grad_norm": 0.4819372296333313, + "learning_rate": 9.77423673299627e-05, + "loss": 1.4335, + "step": 39354 + }, + { + "epoch": 0.5114002758091946, + "grad_norm": 0.29294726252555847, + "learning_rate": 9.773976786805131e-05, + "loss": 1.2147, + "step": 39355 + }, + { + "epoch": 0.5114132703531105, + "grad_norm": 0.3596588671207428, + "learning_rate": 9.773716840613993e-05, + "loss": 1.4775, + "step": 39356 + }, + { + "epoch": 0.5114262648970264, + "grad_norm": 0.4221736788749695, + "learning_rate": 9.773456894422854e-05, + "loss": 1.3847, + "step": 39357 + }, + { + "epoch": 0.5114392594409423, + "grad_norm": 0.3831305205821991, + "learning_rate": 9.773196948231717e-05, + "loss": 1.344, + "step": 39358 + }, + { + "epoch": 0.5114522539848582, + "grad_norm": 0.4026622474193573, + "learning_rate": 9.772937002040578e-05, + "loss": 1.2924, + "step": 39359 + }, + { + "epoch": 0.511465248528774, + "grad_norm": 0.36827659606933594, + "learning_rate": 9.772677055849439e-05, + "loss": 1.2902, + "step": 39360 + }, + { + "epoch": 0.5114782430726899, + "grad_norm": 0.32670995593070984, + "learning_rate": 9.772417109658302e-05, + "loss": 1.3947, + "step": 39361 + }, + { + "epoch": 0.5114912376166058, + "grad_norm": 0.28596577048301697, + "learning_rate": 9.772157163467163e-05, + "loss": 1.1981, + "step": 39362 + }, + { + "epoch": 0.5115042321605217, + "grad_norm": 0.3686067759990692, + "learning_rate": 9.771897217276025e-05, + "loss": 1.1774, + "step": 39363 + }, + { + "epoch": 0.5115172267044374, + "grad_norm": 0.4227554500102997, + "learning_rate": 9.771637271084886e-05, + "loss": 1.2169, + "step": 39364 + }, + { + "epoch": 0.5115302212483533, + "grad_norm": 0.39554816484451294, + "learning_rate": 9.771377324893747e-05, + "loss": 1.3379, + "step": 39365 + }, + { + "epoch": 0.5115432157922692, + "grad_norm": 0.39307141304016113, + "learning_rate": 9.771117378702608e-05, + "loss": 1.4063, + "step": 39366 + }, + { + "epoch": 0.5115562103361851, + "grad_norm": 0.40418288111686707, + "learning_rate": 9.770857432511471e-05, + "loss": 1.3704, + "step": 39367 + }, + { + "epoch": 0.5115692048801009, + "grad_norm": 0.40767961740493774, + "learning_rate": 9.770597486320332e-05, + "loss": 1.4369, + "step": 39368 + }, + { + "epoch": 0.5115821994240168, + "grad_norm": 0.48716607689857483, + "learning_rate": 9.770337540129194e-05, + "loss": 1.3882, + "step": 39369 + }, + { + "epoch": 0.5115951939679327, + "grad_norm": 0.4574127793312073, + "learning_rate": 9.770077593938055e-05, + "loss": 1.5414, + "step": 39370 + }, + { + "epoch": 0.5116081885118486, + "grad_norm": 0.35702288150787354, + "learning_rate": 9.769817647746917e-05, + "loss": 1.2803, + "step": 39371 + }, + { + "epoch": 0.5116211830557644, + "grad_norm": 0.43470922112464905, + "learning_rate": 9.769557701555778e-05, + "loss": 1.3974, + "step": 39372 + }, + { + "epoch": 0.5116341775996803, + "grad_norm": 0.3507862985134125, + "learning_rate": 9.76929775536464e-05, + "loss": 1.4185, + "step": 39373 + }, + { + "epoch": 0.5116471721435962, + "grad_norm": 0.4715103805065155, + "learning_rate": 9.769037809173501e-05, + "loss": 1.5131, + "step": 39374 + }, + { + "epoch": 0.5116601666875121, + "grad_norm": 0.3318878710269928, + "learning_rate": 9.768777862982364e-05, + "loss": 1.1944, + "step": 39375 + }, + { + "epoch": 0.5116731612314279, + "grad_norm": 0.4630686044692993, + "learning_rate": 9.768517916791225e-05, + "loss": 1.2553, + "step": 39376 + }, + { + "epoch": 0.5116861557753438, + "grad_norm": 0.43654918670654297, + "learning_rate": 9.768257970600086e-05, + "loss": 1.3665, + "step": 39377 + }, + { + "epoch": 0.5116991503192597, + "grad_norm": 0.392220675945282, + "learning_rate": 9.767998024408947e-05, + "loss": 1.2356, + "step": 39378 + }, + { + "epoch": 0.5117121448631756, + "grad_norm": 0.49385005235671997, + "learning_rate": 9.76773807821781e-05, + "loss": 1.5824, + "step": 39379 + }, + { + "epoch": 0.5117251394070914, + "grad_norm": 0.3665119707584381, + "learning_rate": 9.76747813202667e-05, + "loss": 1.3948, + "step": 39380 + }, + { + "epoch": 0.5117381339510073, + "grad_norm": 0.3519100844860077, + "learning_rate": 9.767218185835533e-05, + "loss": 1.1824, + "step": 39381 + }, + { + "epoch": 0.5117511284949232, + "grad_norm": 0.44039052724838257, + "learning_rate": 9.766958239644394e-05, + "loss": 1.3796, + "step": 39382 + }, + { + "epoch": 0.5117641230388391, + "grad_norm": 0.4043805003166199, + "learning_rate": 9.766698293453256e-05, + "loss": 1.4108, + "step": 39383 + }, + { + "epoch": 0.5117771175827549, + "grad_norm": 0.4009416401386261, + "learning_rate": 9.766438347262116e-05, + "loss": 1.2185, + "step": 39384 + }, + { + "epoch": 0.5117901121266708, + "grad_norm": 0.39937108755111694, + "learning_rate": 9.766178401070979e-05, + "loss": 1.3887, + "step": 39385 + }, + { + "epoch": 0.5118031066705867, + "grad_norm": 0.3723333477973938, + "learning_rate": 9.76591845487984e-05, + "loss": 1.3458, + "step": 39386 + }, + { + "epoch": 0.5118161012145026, + "grad_norm": 0.42617493867874146, + "learning_rate": 9.765658508688702e-05, + "loss": 1.3766, + "step": 39387 + }, + { + "epoch": 0.5118290957584184, + "grad_norm": 0.4693741798400879, + "learning_rate": 9.765398562497563e-05, + "loss": 1.3107, + "step": 39388 + }, + { + "epoch": 0.5118420903023343, + "grad_norm": 0.3658781051635742, + "learning_rate": 9.765138616306426e-05, + "loss": 1.2589, + "step": 39389 + }, + { + "epoch": 0.5118550848462502, + "grad_norm": 0.35842278599739075, + "learning_rate": 9.764878670115285e-05, + "loss": 1.3256, + "step": 39390 + }, + { + "epoch": 0.5118680793901661, + "grad_norm": 0.46786054968833923, + "learning_rate": 9.764618723924148e-05, + "loss": 1.5239, + "step": 39391 + }, + { + "epoch": 0.5118810739340819, + "grad_norm": 0.5196093916893005, + "learning_rate": 9.764358777733009e-05, + "loss": 1.406, + "step": 39392 + }, + { + "epoch": 0.5118940684779978, + "grad_norm": 0.5426972508430481, + "learning_rate": 9.764098831541871e-05, + "loss": 1.3829, + "step": 39393 + }, + { + "epoch": 0.5119070630219137, + "grad_norm": 0.3667483329772949, + "learning_rate": 9.763838885350733e-05, + "loss": 1.4969, + "step": 39394 + }, + { + "epoch": 0.5119200575658296, + "grad_norm": 0.42266958951950073, + "learning_rate": 9.763578939159595e-05, + "loss": 1.2915, + "step": 39395 + }, + { + "epoch": 0.5119330521097454, + "grad_norm": 0.3246303200721741, + "learning_rate": 9.763318992968456e-05, + "loss": 1.2917, + "step": 39396 + }, + { + "epoch": 0.5119460466536613, + "grad_norm": 0.4075184762477875, + "learning_rate": 9.763059046777317e-05, + "loss": 1.596, + "step": 39397 + }, + { + "epoch": 0.5119590411975772, + "grad_norm": 0.34920117259025574, + "learning_rate": 9.76279910058618e-05, + "loss": 1.4556, + "step": 39398 + }, + { + "epoch": 0.5119720357414931, + "grad_norm": 0.38396698236465454, + "learning_rate": 9.762539154395041e-05, + "loss": 1.2795, + "step": 39399 + }, + { + "epoch": 0.5119850302854089, + "grad_norm": 0.32200586795806885, + "learning_rate": 9.762279208203903e-05, + "loss": 1.5126, + "step": 39400 + }, + { + "epoch": 0.5119980248293248, + "grad_norm": 0.3854883313179016, + "learning_rate": 9.762019262012764e-05, + "loss": 1.3088, + "step": 39401 + }, + { + "epoch": 0.5120110193732407, + "grad_norm": 0.46787258982658386, + "learning_rate": 9.761759315821625e-05, + "loss": 1.4954, + "step": 39402 + }, + { + "epoch": 0.5120240139171566, + "grad_norm": 0.3591521084308624, + "learning_rate": 9.761499369630486e-05, + "loss": 1.2844, + "step": 39403 + }, + { + "epoch": 0.5120370084610724, + "grad_norm": 0.32724836468696594, + "learning_rate": 9.761239423439349e-05, + "loss": 1.2669, + "step": 39404 + }, + { + "epoch": 0.5120500030049883, + "grad_norm": 0.3592516779899597, + "learning_rate": 9.76097947724821e-05, + "loss": 1.2995, + "step": 39405 + }, + { + "epoch": 0.5120629975489042, + "grad_norm": 0.38421496748924255, + "learning_rate": 9.760719531057072e-05, + "loss": 1.2846, + "step": 39406 + }, + { + "epoch": 0.5120759920928201, + "grad_norm": 0.4078837037086487, + "learning_rate": 9.760459584865934e-05, + "loss": 1.3143, + "step": 39407 + }, + { + "epoch": 0.5120889866367359, + "grad_norm": 0.3870948851108551, + "learning_rate": 9.760199638674795e-05, + "loss": 1.267, + "step": 39408 + }, + { + "epoch": 0.5121019811806518, + "grad_norm": 0.4500604271888733, + "learning_rate": 9.759939692483656e-05, + "loss": 1.3788, + "step": 39409 + }, + { + "epoch": 0.5121149757245677, + "grad_norm": 0.3979552984237671, + "learning_rate": 9.759679746292518e-05, + "loss": 1.4474, + "step": 39410 + }, + { + "epoch": 0.5121279702684836, + "grad_norm": 0.4356045424938202, + "learning_rate": 9.759419800101379e-05, + "loss": 1.3508, + "step": 39411 + }, + { + "epoch": 0.5121409648123993, + "grad_norm": 0.37838214635849, + "learning_rate": 9.759159853910242e-05, + "loss": 1.2991, + "step": 39412 + }, + { + "epoch": 0.5121539593563152, + "grad_norm": 0.3353853225708008, + "learning_rate": 9.758899907719103e-05, + "loss": 1.3965, + "step": 39413 + }, + { + "epoch": 0.5121669539002311, + "grad_norm": 0.36219966411590576, + "learning_rate": 9.758639961527964e-05, + "loss": 1.2188, + "step": 39414 + }, + { + "epoch": 0.512179948444147, + "grad_norm": 0.3651082515716553, + "learning_rate": 9.758380015336825e-05, + "loss": 1.3183, + "step": 39415 + }, + { + "epoch": 0.5121929429880628, + "grad_norm": 0.4207509160041809, + "learning_rate": 9.758120069145687e-05, + "loss": 1.4562, + "step": 39416 + }, + { + "epoch": 0.5122059375319787, + "grad_norm": 0.3921317160129547, + "learning_rate": 9.757860122954549e-05, + "loss": 1.256, + "step": 39417 + }, + { + "epoch": 0.5122189320758946, + "grad_norm": 0.47295695543289185, + "learning_rate": 9.757600176763411e-05, + "loss": 1.3844, + "step": 39418 + }, + { + "epoch": 0.5122319266198105, + "grad_norm": 0.3705073595046997, + "learning_rate": 9.757340230572272e-05, + "loss": 1.4691, + "step": 39419 + }, + { + "epoch": 0.5122449211637263, + "grad_norm": 0.3770361542701721, + "learning_rate": 9.757080284381133e-05, + "loss": 1.3377, + "step": 39420 + }, + { + "epoch": 0.5122579157076422, + "grad_norm": 0.34595030546188354, + "learning_rate": 9.756820338189994e-05, + "loss": 1.4983, + "step": 39421 + }, + { + "epoch": 0.5122709102515581, + "grad_norm": 0.4200112223625183, + "learning_rate": 9.756560391998857e-05, + "loss": 1.3209, + "step": 39422 + }, + { + "epoch": 0.512283904795474, + "grad_norm": 0.44866743683815, + "learning_rate": 9.756300445807718e-05, + "loss": 1.3751, + "step": 39423 + }, + { + "epoch": 0.5122968993393898, + "grad_norm": 0.43044301867485046, + "learning_rate": 9.75604049961658e-05, + "loss": 1.4599, + "step": 39424 + }, + { + "epoch": 0.5123098938833057, + "grad_norm": 0.31085312366485596, + "learning_rate": 9.755780553425441e-05, + "loss": 1.3919, + "step": 39425 + }, + { + "epoch": 0.5123228884272216, + "grad_norm": 0.42447277903556824, + "learning_rate": 9.755520607234302e-05, + "loss": 1.4848, + "step": 39426 + }, + { + "epoch": 0.5123358829711375, + "grad_norm": 0.41111063957214355, + "learning_rate": 9.755260661043164e-05, + "loss": 1.5893, + "step": 39427 + }, + { + "epoch": 0.5123488775150533, + "grad_norm": 0.3333425223827362, + "learning_rate": 9.755000714852026e-05, + "loss": 1.183, + "step": 39428 + }, + { + "epoch": 0.5123618720589692, + "grad_norm": 0.3708946108818054, + "learning_rate": 9.754740768660887e-05, + "loss": 1.2738, + "step": 39429 + }, + { + "epoch": 0.5123748666028851, + "grad_norm": 0.4210493564605713, + "learning_rate": 9.75448082246975e-05, + "loss": 1.3708, + "step": 39430 + }, + { + "epoch": 0.512387861146801, + "grad_norm": 0.41169503331184387, + "learning_rate": 9.75422087627861e-05, + "loss": 1.3323, + "step": 39431 + }, + { + "epoch": 0.5124008556907168, + "grad_norm": 0.41974085569381714, + "learning_rate": 9.753960930087472e-05, + "loss": 1.5254, + "step": 39432 + }, + { + "epoch": 0.5124138502346327, + "grad_norm": 0.4018036723136902, + "learning_rate": 9.753700983896334e-05, + "loss": 1.2555, + "step": 39433 + }, + { + "epoch": 0.5124268447785486, + "grad_norm": 0.4323458969593048, + "learning_rate": 9.753441037705195e-05, + "loss": 1.5239, + "step": 39434 + }, + { + "epoch": 0.5124398393224645, + "grad_norm": 0.48945188522338867, + "learning_rate": 9.753181091514058e-05, + "loss": 1.3043, + "step": 39435 + }, + { + "epoch": 0.5124528338663804, + "grad_norm": 0.3842877447605133, + "learning_rate": 9.752921145322919e-05, + "loss": 1.3522, + "step": 39436 + }, + { + "epoch": 0.5124658284102962, + "grad_norm": 0.3731060028076172, + "learning_rate": 9.752661199131781e-05, + "loss": 1.2713, + "step": 39437 + }, + { + "epoch": 0.5124788229542121, + "grad_norm": 0.4615040123462677, + "learning_rate": 9.752401252940642e-05, + "loss": 1.3999, + "step": 39438 + }, + { + "epoch": 0.512491817498128, + "grad_norm": 0.45647865533828735, + "learning_rate": 9.752141306749503e-05, + "loss": 1.3285, + "step": 39439 + }, + { + "epoch": 0.5125048120420439, + "grad_norm": 0.4202924072742462, + "learning_rate": 9.751881360558365e-05, + "loss": 1.4438, + "step": 39440 + }, + { + "epoch": 0.5125178065859597, + "grad_norm": 0.32791075110435486, + "learning_rate": 9.751621414367227e-05, + "loss": 1.3588, + "step": 39441 + }, + { + "epoch": 0.5125308011298756, + "grad_norm": 0.3537735044956207, + "learning_rate": 9.751361468176088e-05, + "loss": 1.4101, + "step": 39442 + }, + { + "epoch": 0.5125437956737915, + "grad_norm": 0.4185813367366791, + "learning_rate": 9.75110152198495e-05, + "loss": 1.3588, + "step": 39443 + }, + { + "epoch": 0.5125567902177074, + "grad_norm": 0.35989633202552795, + "learning_rate": 9.750841575793812e-05, + "loss": 1.3411, + "step": 39444 + }, + { + "epoch": 0.5125697847616232, + "grad_norm": 0.26838019490242004, + "learning_rate": 9.750581629602673e-05, + "loss": 1.2136, + "step": 39445 + }, + { + "epoch": 0.5125827793055391, + "grad_norm": 0.5781556367874146, + "learning_rate": 9.750321683411534e-05, + "loss": 1.462, + "step": 39446 + }, + { + "epoch": 0.512595773849455, + "grad_norm": 0.37682318687438965, + "learning_rate": 9.750061737220396e-05, + "loss": 1.2769, + "step": 39447 + }, + { + "epoch": 0.5126087683933709, + "grad_norm": 0.41718778014183044, + "learning_rate": 9.749801791029257e-05, + "loss": 1.3063, + "step": 39448 + }, + { + "epoch": 0.5126217629372867, + "grad_norm": 0.47583910822868347, + "learning_rate": 9.74954184483812e-05, + "loss": 1.3256, + "step": 39449 + }, + { + "epoch": 0.5126347574812026, + "grad_norm": 0.42350900173187256, + "learning_rate": 9.749281898646981e-05, + "loss": 1.5627, + "step": 39450 + }, + { + "epoch": 0.5126477520251185, + "grad_norm": 0.49644577503204346, + "learning_rate": 9.749021952455842e-05, + "loss": 1.3146, + "step": 39451 + }, + { + "epoch": 0.5126607465690344, + "grad_norm": 0.4172748625278473, + "learning_rate": 9.748762006264703e-05, + "loss": 1.4177, + "step": 39452 + }, + { + "epoch": 0.5126737411129502, + "grad_norm": 0.5760789513587952, + "learning_rate": 9.748502060073566e-05, + "loss": 1.324, + "step": 39453 + }, + { + "epoch": 0.5126867356568661, + "grad_norm": 0.363398015499115, + "learning_rate": 9.748242113882427e-05, + "loss": 1.5696, + "step": 39454 + }, + { + "epoch": 0.512699730200782, + "grad_norm": 0.41613560914993286, + "learning_rate": 9.747982167691289e-05, + "loss": 1.4868, + "step": 39455 + }, + { + "epoch": 0.5127127247446979, + "grad_norm": 0.4418090879917145, + "learning_rate": 9.74772222150015e-05, + "loss": 1.3785, + "step": 39456 + }, + { + "epoch": 0.5127257192886137, + "grad_norm": 0.28069639205932617, + "learning_rate": 9.747462275309011e-05, + "loss": 1.2093, + "step": 39457 + }, + { + "epoch": 0.5127387138325296, + "grad_norm": 0.4408092796802521, + "learning_rate": 9.747202329117872e-05, + "loss": 1.1734, + "step": 39458 + }, + { + "epoch": 0.5127517083764455, + "grad_norm": 0.3360148072242737, + "learning_rate": 9.746942382926735e-05, + "loss": 1.2117, + "step": 39459 + }, + { + "epoch": 0.5127647029203614, + "grad_norm": 0.42431241273880005, + "learning_rate": 9.746682436735596e-05, + "loss": 1.462, + "step": 39460 + }, + { + "epoch": 0.5127776974642771, + "grad_norm": 0.4087930917739868, + "learning_rate": 9.746422490544458e-05, + "loss": 1.4759, + "step": 39461 + }, + { + "epoch": 0.512790692008193, + "grad_norm": 0.5163681507110596, + "learning_rate": 9.74616254435332e-05, + "loss": 1.4411, + "step": 39462 + }, + { + "epoch": 0.512803686552109, + "grad_norm": 0.3801685571670532, + "learning_rate": 9.74590259816218e-05, + "loss": 1.5252, + "step": 39463 + }, + { + "epoch": 0.5128166810960249, + "grad_norm": 0.5072066783905029, + "learning_rate": 9.745642651971042e-05, + "loss": 1.3552, + "step": 39464 + }, + { + "epoch": 0.5128296756399406, + "grad_norm": 0.3318372070789337, + "learning_rate": 9.745382705779904e-05, + "loss": 1.4463, + "step": 39465 + }, + { + "epoch": 0.5128426701838565, + "grad_norm": 0.5001636147499084, + "learning_rate": 9.745122759588765e-05, + "loss": 1.4276, + "step": 39466 + }, + { + "epoch": 0.5128556647277724, + "grad_norm": 0.3696518540382385, + "learning_rate": 9.744862813397628e-05, + "loss": 1.2401, + "step": 39467 + }, + { + "epoch": 0.5128686592716883, + "grad_norm": 0.29905152320861816, + "learning_rate": 9.744602867206489e-05, + "loss": 1.1754, + "step": 39468 + }, + { + "epoch": 0.5128816538156041, + "grad_norm": 0.334574818611145, + "learning_rate": 9.74434292101535e-05, + "loss": 1.2266, + "step": 39469 + }, + { + "epoch": 0.51289464835952, + "grad_norm": 0.4141849875450134, + "learning_rate": 9.744082974824212e-05, + "loss": 1.3044, + "step": 39470 + }, + { + "epoch": 0.5129076429034359, + "grad_norm": 0.4707565903663635, + "learning_rate": 9.743823028633073e-05, + "loss": 1.4208, + "step": 39471 + }, + { + "epoch": 0.5129206374473518, + "grad_norm": 0.2881731390953064, + "learning_rate": 9.743563082441936e-05, + "loss": 1.2949, + "step": 39472 + }, + { + "epoch": 0.5129336319912676, + "grad_norm": 0.48114830255508423, + "learning_rate": 9.743303136250797e-05, + "loss": 1.2387, + "step": 39473 + }, + { + "epoch": 0.5129466265351835, + "grad_norm": 0.481478214263916, + "learning_rate": 9.743043190059658e-05, + "loss": 1.2808, + "step": 39474 + }, + { + "epoch": 0.5129596210790994, + "grad_norm": 0.3560812175273895, + "learning_rate": 9.742783243868519e-05, + "loss": 1.5284, + "step": 39475 + }, + { + "epoch": 0.5129726156230153, + "grad_norm": 0.41663411259651184, + "learning_rate": 9.742523297677381e-05, + "loss": 1.4942, + "step": 39476 + }, + { + "epoch": 0.5129856101669311, + "grad_norm": 0.38447460532188416, + "learning_rate": 9.742263351486243e-05, + "loss": 1.4993, + "step": 39477 + }, + { + "epoch": 0.512998604710847, + "grad_norm": 0.36054709553718567, + "learning_rate": 9.742003405295105e-05, + "loss": 1.2638, + "step": 39478 + }, + { + "epoch": 0.5130115992547629, + "grad_norm": 0.4156758785247803, + "learning_rate": 9.741743459103966e-05, + "loss": 1.431, + "step": 39479 + }, + { + "epoch": 0.5130245937986788, + "grad_norm": 0.3394497334957123, + "learning_rate": 9.741483512912827e-05, + "loss": 1.3306, + "step": 39480 + }, + { + "epoch": 0.5130375883425946, + "grad_norm": 0.46876341104507446, + "learning_rate": 9.741223566721688e-05, + "loss": 1.5192, + "step": 39481 + }, + { + "epoch": 0.5130505828865105, + "grad_norm": 0.4184974730014801, + "learning_rate": 9.740963620530551e-05, + "loss": 1.3944, + "step": 39482 + }, + { + "epoch": 0.5130635774304264, + "grad_norm": 0.46014732122421265, + "learning_rate": 9.740703674339412e-05, + "loss": 1.3378, + "step": 39483 + }, + { + "epoch": 0.5130765719743423, + "grad_norm": 0.37448692321777344, + "learning_rate": 9.740443728148274e-05, + "loss": 1.3002, + "step": 39484 + }, + { + "epoch": 0.5130895665182581, + "grad_norm": 0.40476009249687195, + "learning_rate": 9.740183781957135e-05, + "loss": 1.5501, + "step": 39485 + }, + { + "epoch": 0.513102561062174, + "grad_norm": 0.41558411717414856, + "learning_rate": 9.739923835765998e-05, + "loss": 1.4701, + "step": 39486 + }, + { + "epoch": 0.5131155556060899, + "grad_norm": 0.44448649883270264, + "learning_rate": 9.739663889574858e-05, + "loss": 1.5139, + "step": 39487 + }, + { + "epoch": 0.5131285501500058, + "grad_norm": 0.43545255064964294, + "learning_rate": 9.73940394338372e-05, + "loss": 1.3608, + "step": 39488 + }, + { + "epoch": 0.5131415446939216, + "grad_norm": 0.42789942026138306, + "learning_rate": 9.739143997192581e-05, + "loss": 1.362, + "step": 39489 + }, + { + "epoch": 0.5131545392378375, + "grad_norm": 0.35289517045021057, + "learning_rate": 9.738884051001444e-05, + "loss": 1.1577, + "step": 39490 + }, + { + "epoch": 0.5131675337817534, + "grad_norm": 0.3240531384944916, + "learning_rate": 9.738624104810305e-05, + "loss": 1.4687, + "step": 39491 + }, + { + "epoch": 0.5131805283256693, + "grad_norm": 0.4367976486682892, + "learning_rate": 9.738364158619167e-05, + "loss": 1.4106, + "step": 39492 + }, + { + "epoch": 0.5131935228695851, + "grad_norm": 0.3416028916835785, + "learning_rate": 9.738104212428027e-05, + "loss": 1.2996, + "step": 39493 + }, + { + "epoch": 0.513206517413501, + "grad_norm": 0.393548846244812, + "learning_rate": 9.737844266236889e-05, + "loss": 1.1513, + "step": 39494 + }, + { + "epoch": 0.5132195119574169, + "grad_norm": 0.4714078903198242, + "learning_rate": 9.73758432004575e-05, + "loss": 1.5379, + "step": 39495 + }, + { + "epoch": 0.5132325065013328, + "grad_norm": 0.3536761999130249, + "learning_rate": 9.737324373854613e-05, + "loss": 1.3223, + "step": 39496 + }, + { + "epoch": 0.5132455010452486, + "grad_norm": 0.4359242618083954, + "learning_rate": 9.737064427663474e-05, + "loss": 1.4809, + "step": 39497 + }, + { + "epoch": 0.5132584955891645, + "grad_norm": 0.38661080598831177, + "learning_rate": 9.736804481472336e-05, + "loss": 1.3359, + "step": 39498 + }, + { + "epoch": 0.5132714901330804, + "grad_norm": 0.49218955636024475, + "learning_rate": 9.736544535281197e-05, + "loss": 1.5116, + "step": 39499 + }, + { + "epoch": 0.5132844846769963, + "grad_norm": 0.3969157934188843, + "learning_rate": 9.736284589090059e-05, + "loss": 1.592, + "step": 39500 + }, + { + "epoch": 0.5132974792209121, + "grad_norm": 0.3856186270713806, + "learning_rate": 9.73602464289892e-05, + "loss": 1.3175, + "step": 39501 + }, + { + "epoch": 0.513310473764828, + "grad_norm": 0.6555805206298828, + "learning_rate": 9.735764696707782e-05, + "loss": 1.4554, + "step": 39502 + }, + { + "epoch": 0.5133234683087439, + "grad_norm": 0.4810357093811035, + "learning_rate": 9.735504750516643e-05, + "loss": 1.4785, + "step": 39503 + }, + { + "epoch": 0.5133364628526598, + "grad_norm": 0.41863593459129333, + "learning_rate": 9.735244804325506e-05, + "loss": 1.2289, + "step": 39504 + }, + { + "epoch": 0.5133494573965756, + "grad_norm": 0.4902775287628174, + "learning_rate": 9.734984858134367e-05, + "loss": 1.4196, + "step": 39505 + }, + { + "epoch": 0.5133624519404915, + "grad_norm": 0.319583922624588, + "learning_rate": 9.734724911943228e-05, + "loss": 1.2695, + "step": 39506 + }, + { + "epoch": 0.5133754464844074, + "grad_norm": 0.43820762634277344, + "learning_rate": 9.73446496575209e-05, + "loss": 1.582, + "step": 39507 + }, + { + "epoch": 0.5133884410283233, + "grad_norm": 0.3769552409648895, + "learning_rate": 9.734205019560951e-05, + "loss": 1.3192, + "step": 39508 + }, + { + "epoch": 0.513401435572239, + "grad_norm": 0.3626807928085327, + "learning_rate": 9.733945073369814e-05, + "loss": 1.3745, + "step": 39509 + }, + { + "epoch": 0.513414430116155, + "grad_norm": 0.3693755567073822, + "learning_rate": 9.733685127178675e-05, + "loss": 1.3985, + "step": 39510 + }, + { + "epoch": 0.5134274246600709, + "grad_norm": 0.37674593925476074, + "learning_rate": 9.733425180987536e-05, + "loss": 1.3443, + "step": 39511 + }, + { + "epoch": 0.5134404192039868, + "grad_norm": 0.36541858315467834, + "learning_rate": 9.733165234796397e-05, + "loss": 1.3516, + "step": 39512 + }, + { + "epoch": 0.5134534137479027, + "grad_norm": 0.3605427145957947, + "learning_rate": 9.73290528860526e-05, + "loss": 1.2629, + "step": 39513 + }, + { + "epoch": 0.5134664082918184, + "grad_norm": 0.38582688570022583, + "learning_rate": 9.73264534241412e-05, + "loss": 1.346, + "step": 39514 + }, + { + "epoch": 0.5134794028357343, + "grad_norm": 0.3768206834793091, + "learning_rate": 9.732385396222983e-05, + "loss": 1.5225, + "step": 39515 + }, + { + "epoch": 0.5134923973796502, + "grad_norm": 0.47960832715034485, + "learning_rate": 9.732125450031844e-05, + "loss": 1.336, + "step": 39516 + }, + { + "epoch": 0.5135053919235661, + "grad_norm": 0.3815993368625641, + "learning_rate": 9.731865503840705e-05, + "loss": 1.3352, + "step": 39517 + }, + { + "epoch": 0.5135183864674819, + "grad_norm": 0.45172834396362305, + "learning_rate": 9.731605557649566e-05, + "loss": 1.4784, + "step": 39518 + }, + { + "epoch": 0.5135313810113978, + "grad_norm": 0.3518627882003784, + "learning_rate": 9.731345611458429e-05, + "loss": 1.3606, + "step": 39519 + }, + { + "epoch": 0.5135443755553137, + "grad_norm": 0.38611143827438354, + "learning_rate": 9.73108566526729e-05, + "loss": 1.1186, + "step": 39520 + }, + { + "epoch": 0.5135573700992296, + "grad_norm": 0.4205883741378784, + "learning_rate": 9.730825719076152e-05, + "loss": 1.5008, + "step": 39521 + }, + { + "epoch": 0.5135703646431454, + "grad_norm": 0.4126805365085602, + "learning_rate": 9.730565772885013e-05, + "loss": 1.3918, + "step": 39522 + }, + { + "epoch": 0.5135833591870613, + "grad_norm": 0.35251301527023315, + "learning_rate": 9.730305826693875e-05, + "loss": 1.2244, + "step": 39523 + }, + { + "epoch": 0.5135963537309772, + "grad_norm": 0.48083367943763733, + "learning_rate": 9.730045880502736e-05, + "loss": 1.5146, + "step": 39524 + }, + { + "epoch": 0.5136093482748931, + "grad_norm": 0.44507211446762085, + "learning_rate": 9.729785934311598e-05, + "loss": 1.3203, + "step": 39525 + }, + { + "epoch": 0.5136223428188089, + "grad_norm": 0.4734783172607422, + "learning_rate": 9.729525988120459e-05, + "loss": 1.3451, + "step": 39526 + }, + { + "epoch": 0.5136353373627248, + "grad_norm": 0.2835654020309448, + "learning_rate": 9.729266041929322e-05, + "loss": 1.2581, + "step": 39527 + }, + { + "epoch": 0.5136483319066407, + "grad_norm": 0.3735317587852478, + "learning_rate": 9.729006095738183e-05, + "loss": 1.4256, + "step": 39528 + }, + { + "epoch": 0.5136613264505566, + "grad_norm": 0.4685862958431244, + "learning_rate": 9.728746149547044e-05, + "loss": 1.3742, + "step": 39529 + }, + { + "epoch": 0.5136743209944724, + "grad_norm": 0.3782171308994293, + "learning_rate": 9.728486203355905e-05, + "loss": 1.4356, + "step": 39530 + }, + { + "epoch": 0.5136873155383883, + "grad_norm": 0.40176665782928467, + "learning_rate": 9.728226257164767e-05, + "loss": 1.3824, + "step": 39531 + }, + { + "epoch": 0.5137003100823042, + "grad_norm": 0.3948041796684265, + "learning_rate": 9.727966310973628e-05, + "loss": 1.3063, + "step": 39532 + }, + { + "epoch": 0.5137133046262201, + "grad_norm": 0.4475201964378357, + "learning_rate": 9.727706364782491e-05, + "loss": 1.4033, + "step": 39533 + }, + { + "epoch": 0.5137262991701359, + "grad_norm": 0.41835859417915344, + "learning_rate": 9.727446418591352e-05, + "loss": 1.4636, + "step": 39534 + }, + { + "epoch": 0.5137392937140518, + "grad_norm": 0.39145171642303467, + "learning_rate": 9.727186472400213e-05, + "loss": 1.4561, + "step": 39535 + }, + { + "epoch": 0.5137522882579677, + "grad_norm": 0.3844452500343323, + "learning_rate": 9.726926526209074e-05, + "loss": 1.3852, + "step": 39536 + }, + { + "epoch": 0.5137652828018836, + "grad_norm": 0.3905602693557739, + "learning_rate": 9.726666580017937e-05, + "loss": 1.2901, + "step": 39537 + }, + { + "epoch": 0.5137782773457994, + "grad_norm": 0.40003377199172974, + "learning_rate": 9.726406633826798e-05, + "loss": 1.3927, + "step": 39538 + }, + { + "epoch": 0.5137912718897153, + "grad_norm": 0.4175763428211212, + "learning_rate": 9.72614668763566e-05, + "loss": 1.5071, + "step": 39539 + }, + { + "epoch": 0.5138042664336312, + "grad_norm": 0.5333362221717834, + "learning_rate": 9.725886741444521e-05, + "loss": 1.5085, + "step": 39540 + }, + { + "epoch": 0.5138172609775471, + "grad_norm": 0.4528881311416626, + "learning_rate": 9.725626795253384e-05, + "loss": 1.3516, + "step": 39541 + }, + { + "epoch": 0.5138302555214629, + "grad_norm": 0.36816319823265076, + "learning_rate": 9.725366849062243e-05, + "loss": 1.3222, + "step": 39542 + }, + { + "epoch": 0.5138432500653788, + "grad_norm": 0.4030925929546356, + "learning_rate": 9.725106902871106e-05, + "loss": 1.5036, + "step": 39543 + }, + { + "epoch": 0.5138562446092947, + "grad_norm": 0.44089964032173157, + "learning_rate": 9.724846956679968e-05, + "loss": 1.3359, + "step": 39544 + }, + { + "epoch": 0.5138692391532106, + "grad_norm": 0.2526116967201233, + "learning_rate": 9.72458701048883e-05, + "loss": 1.3223, + "step": 39545 + }, + { + "epoch": 0.5138822336971264, + "grad_norm": 0.39631563425064087, + "learning_rate": 9.724327064297692e-05, + "loss": 1.2672, + "step": 39546 + }, + { + "epoch": 0.5138952282410423, + "grad_norm": 0.440799355506897, + "learning_rate": 9.724067118106553e-05, + "loss": 1.3313, + "step": 39547 + }, + { + "epoch": 0.5139082227849582, + "grad_norm": 0.3612401485443115, + "learning_rate": 9.723807171915414e-05, + "loss": 1.3459, + "step": 39548 + }, + { + "epoch": 0.5139212173288741, + "grad_norm": 0.4177757203578949, + "learning_rate": 9.723547225724275e-05, + "loss": 1.3998, + "step": 39549 + }, + { + "epoch": 0.5139342118727899, + "grad_norm": 0.42302751541137695, + "learning_rate": 9.723287279533138e-05, + "loss": 1.4905, + "step": 39550 + }, + { + "epoch": 0.5139472064167058, + "grad_norm": 0.3923909366130829, + "learning_rate": 9.723027333341999e-05, + "loss": 1.4384, + "step": 39551 + }, + { + "epoch": 0.5139602009606217, + "grad_norm": 0.45204275846481323, + "learning_rate": 9.722767387150861e-05, + "loss": 1.5433, + "step": 39552 + }, + { + "epoch": 0.5139731955045376, + "grad_norm": 0.3555446267127991, + "learning_rate": 9.722507440959722e-05, + "loss": 1.3435, + "step": 39553 + }, + { + "epoch": 0.5139861900484534, + "grad_norm": 0.4486521780490875, + "learning_rate": 9.722247494768583e-05, + "loss": 1.5393, + "step": 39554 + }, + { + "epoch": 0.5139991845923693, + "grad_norm": 0.41485920548439026, + "learning_rate": 9.721987548577444e-05, + "loss": 1.5329, + "step": 39555 + }, + { + "epoch": 0.5140121791362852, + "grad_norm": 0.42809656262397766, + "learning_rate": 9.721727602386307e-05, + "loss": 1.252, + "step": 39556 + }, + { + "epoch": 0.5140251736802011, + "grad_norm": 0.3964829742908478, + "learning_rate": 9.721467656195168e-05, + "loss": 1.3866, + "step": 39557 + }, + { + "epoch": 0.5140381682241169, + "grad_norm": 0.41932404041290283, + "learning_rate": 9.72120771000403e-05, + "loss": 1.4595, + "step": 39558 + }, + { + "epoch": 0.5140511627680328, + "grad_norm": 0.30819523334503174, + "learning_rate": 9.720947763812892e-05, + "loss": 1.2134, + "step": 39559 + }, + { + "epoch": 0.5140641573119487, + "grad_norm": 0.4475322365760803, + "learning_rate": 9.720687817621753e-05, + "loss": 1.4524, + "step": 39560 + }, + { + "epoch": 0.5140771518558646, + "grad_norm": 0.3976454734802246, + "learning_rate": 9.720427871430614e-05, + "loss": 1.3617, + "step": 39561 + }, + { + "epoch": 0.5140901463997803, + "grad_norm": 0.474162220954895, + "learning_rate": 9.720167925239476e-05, + "loss": 1.3463, + "step": 39562 + }, + { + "epoch": 0.5141031409436962, + "grad_norm": 0.3635695278644562, + "learning_rate": 9.719907979048337e-05, + "loss": 1.3694, + "step": 39563 + }, + { + "epoch": 0.5141161354876121, + "grad_norm": 0.40568265318870544, + "learning_rate": 9.7196480328572e-05, + "loss": 1.3383, + "step": 39564 + }, + { + "epoch": 0.514129130031528, + "grad_norm": 0.46171995997428894, + "learning_rate": 9.719388086666061e-05, + "loss": 1.6526, + "step": 39565 + }, + { + "epoch": 0.5141421245754438, + "grad_norm": 0.29516011476516724, + "learning_rate": 9.719128140474922e-05, + "loss": 1.2467, + "step": 39566 + }, + { + "epoch": 0.5141551191193597, + "grad_norm": 0.3648897111415863, + "learning_rate": 9.718868194283783e-05, + "loss": 1.2264, + "step": 39567 + }, + { + "epoch": 0.5141681136632756, + "grad_norm": 0.9092336893081665, + "learning_rate": 9.718608248092645e-05, + "loss": 1.6395, + "step": 39568 + }, + { + "epoch": 0.5141811082071915, + "grad_norm": 0.3998500108718872, + "learning_rate": 9.718348301901507e-05, + "loss": 1.3913, + "step": 39569 + }, + { + "epoch": 0.5141941027511073, + "grad_norm": 0.39480289816856384, + "learning_rate": 9.718088355710369e-05, + "loss": 1.3587, + "step": 39570 + }, + { + "epoch": 0.5142070972950232, + "grad_norm": 0.4250936806201935, + "learning_rate": 9.71782840951923e-05, + "loss": 1.2375, + "step": 39571 + }, + { + "epoch": 0.5142200918389391, + "grad_norm": 0.42685559391975403, + "learning_rate": 9.717568463328091e-05, + "loss": 1.3746, + "step": 39572 + }, + { + "epoch": 0.514233086382855, + "grad_norm": 0.6645125150680542, + "learning_rate": 9.717308517136952e-05, + "loss": 1.4749, + "step": 39573 + }, + { + "epoch": 0.5142460809267708, + "grad_norm": 0.42402684688568115, + "learning_rate": 9.717048570945815e-05, + "loss": 1.5369, + "step": 39574 + }, + { + "epoch": 0.5142590754706867, + "grad_norm": 0.38953161239624023, + "learning_rate": 9.716788624754676e-05, + "loss": 1.4201, + "step": 39575 + }, + { + "epoch": 0.5142720700146026, + "grad_norm": 0.44474655389785767, + "learning_rate": 9.716528678563538e-05, + "loss": 1.3513, + "step": 39576 + }, + { + "epoch": 0.5142850645585185, + "grad_norm": 0.3826843500137329, + "learning_rate": 9.7162687323724e-05, + "loss": 1.3743, + "step": 39577 + }, + { + "epoch": 0.5142980591024343, + "grad_norm": 0.4314892888069153, + "learning_rate": 9.71600878618126e-05, + "loss": 1.3754, + "step": 39578 + }, + { + "epoch": 0.5143110536463502, + "grad_norm": 0.4289112687110901, + "learning_rate": 9.715748839990122e-05, + "loss": 1.3172, + "step": 39579 + }, + { + "epoch": 0.5143240481902661, + "grad_norm": 0.3868785500526428, + "learning_rate": 9.715488893798984e-05, + "loss": 1.4085, + "step": 39580 + }, + { + "epoch": 0.514337042734182, + "grad_norm": 0.4201281666755676, + "learning_rate": 9.715228947607846e-05, + "loss": 1.4202, + "step": 39581 + }, + { + "epoch": 0.5143500372780978, + "grad_norm": 0.4700254797935486, + "learning_rate": 9.714969001416708e-05, + "loss": 1.4354, + "step": 39582 + }, + { + "epoch": 0.5143630318220137, + "grad_norm": 0.4276207983493805, + "learning_rate": 9.714709055225569e-05, + "loss": 1.3266, + "step": 39583 + }, + { + "epoch": 0.5143760263659296, + "grad_norm": 0.47100603580474854, + "learning_rate": 9.71444910903443e-05, + "loss": 1.4775, + "step": 39584 + }, + { + "epoch": 0.5143890209098455, + "grad_norm": 0.4283423125743866, + "learning_rate": 9.714189162843292e-05, + "loss": 1.2042, + "step": 39585 + }, + { + "epoch": 0.5144020154537613, + "grad_norm": 0.38923966884613037, + "learning_rate": 9.713929216652153e-05, + "loss": 1.3739, + "step": 39586 + }, + { + "epoch": 0.5144150099976772, + "grad_norm": 0.3972034454345703, + "learning_rate": 9.713669270461016e-05, + "loss": 1.3613, + "step": 39587 + }, + { + "epoch": 0.5144280045415931, + "grad_norm": 0.35842642188072205, + "learning_rate": 9.713409324269877e-05, + "loss": 1.4798, + "step": 39588 + }, + { + "epoch": 0.514440999085509, + "grad_norm": 0.2264215052127838, + "learning_rate": 9.713149378078739e-05, + "loss": 1.2238, + "step": 39589 + }, + { + "epoch": 0.5144539936294249, + "grad_norm": 0.4745984971523285, + "learning_rate": 9.712889431887599e-05, + "loss": 1.3859, + "step": 39590 + }, + { + "epoch": 0.5144669881733407, + "grad_norm": 0.4654080271720886, + "learning_rate": 9.712629485696461e-05, + "loss": 1.5567, + "step": 39591 + }, + { + "epoch": 0.5144799827172566, + "grad_norm": 0.41530296206474304, + "learning_rate": 9.712369539505323e-05, + "loss": 1.4064, + "step": 39592 + }, + { + "epoch": 0.5144929772611725, + "grad_norm": 0.46620091795921326, + "learning_rate": 9.712109593314185e-05, + "loss": 1.4118, + "step": 39593 + }, + { + "epoch": 0.5145059718050884, + "grad_norm": 0.4257492125034332, + "learning_rate": 9.711849647123046e-05, + "loss": 1.53, + "step": 39594 + }, + { + "epoch": 0.5145189663490042, + "grad_norm": 0.34607985615730286, + "learning_rate": 9.711589700931909e-05, + "loss": 1.4411, + "step": 39595 + }, + { + "epoch": 0.5145319608929201, + "grad_norm": 0.35827481746673584, + "learning_rate": 9.711329754740768e-05, + "loss": 1.4432, + "step": 39596 + }, + { + "epoch": 0.514544955436836, + "grad_norm": 0.37934985756874084, + "learning_rate": 9.711069808549631e-05, + "loss": 1.5502, + "step": 39597 + }, + { + "epoch": 0.5145579499807519, + "grad_norm": 0.33695855736732483, + "learning_rate": 9.710809862358492e-05, + "loss": 1.0542, + "step": 39598 + }, + { + "epoch": 0.5145709445246677, + "grad_norm": 0.41489583253860474, + "learning_rate": 9.710549916167354e-05, + "loss": 1.2033, + "step": 39599 + }, + { + "epoch": 0.5145839390685836, + "grad_norm": 0.5226443409919739, + "learning_rate": 9.710289969976215e-05, + "loss": 1.5031, + "step": 39600 + }, + { + "epoch": 0.5145969336124995, + "grad_norm": 0.488183856010437, + "learning_rate": 9.710030023785078e-05, + "loss": 1.4077, + "step": 39601 + }, + { + "epoch": 0.5146099281564154, + "grad_norm": 0.4549994170665741, + "learning_rate": 9.709770077593939e-05, + "loss": 1.4703, + "step": 39602 + }, + { + "epoch": 0.5146229227003312, + "grad_norm": 0.46214890480041504, + "learning_rate": 9.7095101314028e-05, + "loss": 1.4448, + "step": 39603 + }, + { + "epoch": 0.5146359172442471, + "grad_norm": 0.35513556003570557, + "learning_rate": 9.709250185211661e-05, + "loss": 1.1875, + "step": 39604 + }, + { + "epoch": 0.514648911788163, + "grad_norm": 0.441771924495697, + "learning_rate": 9.708990239020524e-05, + "loss": 1.4226, + "step": 39605 + }, + { + "epoch": 0.5146619063320789, + "grad_norm": 0.3566654622554779, + "learning_rate": 9.708730292829385e-05, + "loss": 1.4356, + "step": 39606 + }, + { + "epoch": 0.5146749008759947, + "grad_norm": 0.4244002103805542, + "learning_rate": 9.708470346638247e-05, + "loss": 1.3938, + "step": 39607 + }, + { + "epoch": 0.5146878954199106, + "grad_norm": 0.46922242641448975, + "learning_rate": 9.708210400447108e-05, + "loss": 1.5661, + "step": 39608 + }, + { + "epoch": 0.5147008899638265, + "grad_norm": 0.34383782744407654, + "learning_rate": 9.707950454255969e-05, + "loss": 1.3873, + "step": 39609 + }, + { + "epoch": 0.5147138845077424, + "grad_norm": 0.3621924817562103, + "learning_rate": 9.70769050806483e-05, + "loss": 1.4737, + "step": 39610 + }, + { + "epoch": 0.5147268790516581, + "grad_norm": 0.41974350810050964, + "learning_rate": 9.707430561873693e-05, + "loss": 1.2503, + "step": 39611 + }, + { + "epoch": 0.514739873595574, + "grad_norm": 0.4380539655685425, + "learning_rate": 9.707170615682554e-05, + "loss": 1.4793, + "step": 39612 + }, + { + "epoch": 0.51475286813949, + "grad_norm": 0.4075803756713867, + "learning_rate": 9.706910669491416e-05, + "loss": 1.3166, + "step": 39613 + }, + { + "epoch": 0.5147658626834059, + "grad_norm": 0.3006149232387543, + "learning_rate": 9.706650723300277e-05, + "loss": 1.1098, + "step": 39614 + }, + { + "epoch": 0.5147788572273216, + "grad_norm": 0.48501020669937134, + "learning_rate": 9.706390777109139e-05, + "loss": 1.2855, + "step": 39615 + }, + { + "epoch": 0.5147918517712375, + "grad_norm": 0.3923147916793823, + "learning_rate": 9.706130830918e-05, + "loss": 1.3287, + "step": 39616 + }, + { + "epoch": 0.5148048463151534, + "grad_norm": 0.33656594157218933, + "learning_rate": 9.705870884726862e-05, + "loss": 1.3181, + "step": 39617 + }, + { + "epoch": 0.5148178408590693, + "grad_norm": 0.3882770240306854, + "learning_rate": 9.705610938535724e-05, + "loss": 1.3012, + "step": 39618 + }, + { + "epoch": 0.5148308354029851, + "grad_norm": 0.3996526598930359, + "learning_rate": 9.705350992344586e-05, + "loss": 1.2602, + "step": 39619 + }, + { + "epoch": 0.514843829946901, + "grad_norm": 0.4422810971736908, + "learning_rate": 9.705091046153447e-05, + "loss": 1.3228, + "step": 39620 + }, + { + "epoch": 0.5148568244908169, + "grad_norm": 0.4335898756980896, + "learning_rate": 9.704831099962308e-05, + "loss": 1.3476, + "step": 39621 + }, + { + "epoch": 0.5148698190347328, + "grad_norm": 0.4514157474040985, + "learning_rate": 9.70457115377117e-05, + "loss": 1.398, + "step": 39622 + }, + { + "epoch": 0.5148828135786486, + "grad_norm": 0.448866069316864, + "learning_rate": 9.704311207580031e-05, + "loss": 1.5592, + "step": 39623 + }, + { + "epoch": 0.5148958081225645, + "grad_norm": 0.42010459303855896, + "learning_rate": 9.704051261388894e-05, + "loss": 1.5571, + "step": 39624 + }, + { + "epoch": 0.5149088026664804, + "grad_norm": 0.4195231795310974, + "learning_rate": 9.703791315197755e-05, + "loss": 1.4552, + "step": 39625 + }, + { + "epoch": 0.5149217972103963, + "grad_norm": 0.4092920124530792, + "learning_rate": 9.703531369006616e-05, + "loss": 1.429, + "step": 39626 + }, + { + "epoch": 0.5149347917543121, + "grad_norm": 0.43227365612983704, + "learning_rate": 9.703271422815477e-05, + "loss": 1.3812, + "step": 39627 + }, + { + "epoch": 0.514947786298228, + "grad_norm": 0.4294111132621765, + "learning_rate": 9.70301147662434e-05, + "loss": 1.397, + "step": 39628 + }, + { + "epoch": 0.5149607808421439, + "grad_norm": 0.4405810534954071, + "learning_rate": 9.7027515304332e-05, + "loss": 1.6005, + "step": 39629 + }, + { + "epoch": 0.5149737753860598, + "grad_norm": 0.3107149302959442, + "learning_rate": 9.702491584242063e-05, + "loss": 1.2027, + "step": 39630 + }, + { + "epoch": 0.5149867699299756, + "grad_norm": 0.49265652894973755, + "learning_rate": 9.702231638050924e-05, + "loss": 1.3595, + "step": 39631 + }, + { + "epoch": 0.5149997644738915, + "grad_norm": 0.4751925766468048, + "learning_rate": 9.701971691859785e-05, + "loss": 1.3913, + "step": 39632 + }, + { + "epoch": 0.5150127590178074, + "grad_norm": 0.3191274404525757, + "learning_rate": 9.701711745668646e-05, + "loss": 1.1649, + "step": 39633 + }, + { + "epoch": 0.5150257535617233, + "grad_norm": 0.403568834066391, + "learning_rate": 9.701451799477509e-05, + "loss": 1.4818, + "step": 39634 + }, + { + "epoch": 0.5150387481056391, + "grad_norm": 0.4050021171569824, + "learning_rate": 9.70119185328637e-05, + "loss": 1.4224, + "step": 39635 + }, + { + "epoch": 0.515051742649555, + "grad_norm": 0.4036855101585388, + "learning_rate": 9.700931907095232e-05, + "loss": 1.598, + "step": 39636 + }, + { + "epoch": 0.5150647371934709, + "grad_norm": 0.6103993654251099, + "learning_rate": 9.700671960904093e-05, + "loss": 1.5138, + "step": 39637 + }, + { + "epoch": 0.5150777317373868, + "grad_norm": 0.4858066141605377, + "learning_rate": 9.700412014712954e-05, + "loss": 1.3041, + "step": 39638 + }, + { + "epoch": 0.5150907262813026, + "grad_norm": 0.4496895968914032, + "learning_rate": 9.700152068521816e-05, + "loss": 1.4757, + "step": 39639 + }, + { + "epoch": 0.5151037208252185, + "grad_norm": 0.3962342441082001, + "learning_rate": 9.699892122330678e-05, + "loss": 1.3397, + "step": 39640 + }, + { + "epoch": 0.5151167153691344, + "grad_norm": 0.4005768895149231, + "learning_rate": 9.699632176139539e-05, + "loss": 1.2683, + "step": 39641 + }, + { + "epoch": 0.5151297099130503, + "grad_norm": 0.4648260474205017, + "learning_rate": 9.699372229948402e-05, + "loss": 1.3596, + "step": 39642 + }, + { + "epoch": 0.5151427044569661, + "grad_norm": 0.269114226102829, + "learning_rate": 9.699112283757263e-05, + "loss": 1.5445, + "step": 39643 + }, + { + "epoch": 0.515155699000882, + "grad_norm": 0.3677331209182739, + "learning_rate": 9.698852337566125e-05, + "loss": 1.1855, + "step": 39644 + }, + { + "epoch": 0.5151686935447979, + "grad_norm": 0.38550108671188354, + "learning_rate": 9.698592391374985e-05, + "loss": 1.3194, + "step": 39645 + }, + { + "epoch": 0.5151816880887138, + "grad_norm": 0.424008309841156, + "learning_rate": 9.698332445183847e-05, + "loss": 1.3432, + "step": 39646 + }, + { + "epoch": 0.5151946826326296, + "grad_norm": 0.38239532709121704, + "learning_rate": 9.698072498992708e-05, + "loss": 1.2656, + "step": 39647 + }, + { + "epoch": 0.5152076771765455, + "grad_norm": 0.4066025912761688, + "learning_rate": 9.697812552801571e-05, + "loss": 1.3697, + "step": 39648 + }, + { + "epoch": 0.5152206717204614, + "grad_norm": 0.46402406692504883, + "learning_rate": 9.697552606610432e-05, + "loss": 1.466, + "step": 39649 + }, + { + "epoch": 0.5152336662643773, + "grad_norm": 0.370604008436203, + "learning_rate": 9.697292660419294e-05, + "loss": 1.1553, + "step": 39650 + }, + { + "epoch": 0.5152466608082931, + "grad_norm": 0.33773642778396606, + "learning_rate": 9.697032714228154e-05, + "loss": 1.3052, + "step": 39651 + }, + { + "epoch": 0.515259655352209, + "grad_norm": 0.4245837926864624, + "learning_rate": 9.696772768037017e-05, + "loss": 1.1692, + "step": 39652 + }, + { + "epoch": 0.5152726498961249, + "grad_norm": 0.37763452529907227, + "learning_rate": 9.696512821845878e-05, + "loss": 1.3546, + "step": 39653 + }, + { + "epoch": 0.5152856444400408, + "grad_norm": 0.3826252818107605, + "learning_rate": 9.69625287565474e-05, + "loss": 1.3515, + "step": 39654 + }, + { + "epoch": 0.5152986389839566, + "grad_norm": 0.40364518761634827, + "learning_rate": 9.695992929463603e-05, + "loss": 1.3175, + "step": 39655 + }, + { + "epoch": 0.5153116335278725, + "grad_norm": 0.3601331114768982, + "learning_rate": 9.695732983272464e-05, + "loss": 1.3983, + "step": 39656 + }, + { + "epoch": 0.5153246280717884, + "grad_norm": 0.4910103678703308, + "learning_rate": 9.695473037081325e-05, + "loss": 1.4178, + "step": 39657 + }, + { + "epoch": 0.5153376226157043, + "grad_norm": 0.3635617196559906, + "learning_rate": 9.695213090890186e-05, + "loss": 1.4992, + "step": 39658 + }, + { + "epoch": 0.51535061715962, + "grad_norm": 0.4244706630706787, + "learning_rate": 9.694953144699048e-05, + "loss": 1.3825, + "step": 39659 + }, + { + "epoch": 0.515363611703536, + "grad_norm": 0.4413972795009613, + "learning_rate": 9.69469319850791e-05, + "loss": 1.2561, + "step": 39660 + }, + { + "epoch": 0.5153766062474519, + "grad_norm": 0.4252251088619232, + "learning_rate": 9.694433252316772e-05, + "loss": 1.5165, + "step": 39661 + }, + { + "epoch": 0.5153896007913678, + "grad_norm": 0.38003408908843994, + "learning_rate": 9.694173306125633e-05, + "loss": 1.4168, + "step": 39662 + }, + { + "epoch": 0.5154025953352837, + "grad_norm": 0.4139915108680725, + "learning_rate": 9.693913359934494e-05, + "loss": 1.34, + "step": 39663 + }, + { + "epoch": 0.5154155898791994, + "grad_norm": 0.41516634821891785, + "learning_rate": 9.693653413743355e-05, + "loss": 1.3181, + "step": 39664 + }, + { + "epoch": 0.5154285844231153, + "grad_norm": 0.40572717785835266, + "learning_rate": 9.693393467552218e-05, + "loss": 1.3159, + "step": 39665 + }, + { + "epoch": 0.5154415789670312, + "grad_norm": 0.43985527753829956, + "learning_rate": 9.693133521361079e-05, + "loss": 1.3505, + "step": 39666 + }, + { + "epoch": 0.5154545735109471, + "grad_norm": 0.38294097781181335, + "learning_rate": 9.692873575169941e-05, + "loss": 1.4556, + "step": 39667 + }, + { + "epoch": 0.5154675680548629, + "grad_norm": 0.40433546900749207, + "learning_rate": 9.692613628978802e-05, + "loss": 1.2838, + "step": 39668 + }, + { + "epoch": 0.5154805625987788, + "grad_norm": 0.33904653787612915, + "learning_rate": 9.692353682787663e-05, + "loss": 1.4997, + "step": 39669 + }, + { + "epoch": 0.5154935571426947, + "grad_norm": 0.4727816581726074, + "learning_rate": 9.692093736596524e-05, + "loss": 1.3736, + "step": 39670 + }, + { + "epoch": 0.5155065516866106, + "grad_norm": 0.4772983193397522, + "learning_rate": 9.691833790405387e-05, + "loss": 1.3239, + "step": 39671 + }, + { + "epoch": 0.5155195462305264, + "grad_norm": 0.295014888048172, + "learning_rate": 9.691573844214248e-05, + "loss": 1.1912, + "step": 39672 + }, + { + "epoch": 0.5155325407744423, + "grad_norm": 0.3867585361003876, + "learning_rate": 9.69131389802311e-05, + "loss": 1.4461, + "step": 39673 + }, + { + "epoch": 0.5155455353183582, + "grad_norm": 0.39707034826278687, + "learning_rate": 9.691053951831971e-05, + "loss": 1.4088, + "step": 39674 + }, + { + "epoch": 0.5155585298622741, + "grad_norm": 0.27397042512893677, + "learning_rate": 9.690794005640833e-05, + "loss": 1.1664, + "step": 39675 + }, + { + "epoch": 0.5155715244061899, + "grad_norm": 0.39887845516204834, + "learning_rate": 9.690534059449694e-05, + "loss": 1.4501, + "step": 39676 + }, + { + "epoch": 0.5155845189501058, + "grad_norm": 0.3815763592720032, + "learning_rate": 9.690274113258556e-05, + "loss": 1.277, + "step": 39677 + }, + { + "epoch": 0.5155975134940217, + "grad_norm": 0.44615644216537476, + "learning_rate": 9.690014167067417e-05, + "loss": 1.4794, + "step": 39678 + }, + { + "epoch": 0.5156105080379376, + "grad_norm": 0.4535128176212311, + "learning_rate": 9.68975422087628e-05, + "loss": 1.3566, + "step": 39679 + }, + { + "epoch": 0.5156235025818534, + "grad_norm": 0.37491074204444885, + "learning_rate": 9.689494274685141e-05, + "loss": 1.1551, + "step": 39680 + }, + { + "epoch": 0.5156364971257693, + "grad_norm": 0.3964325785636902, + "learning_rate": 9.689234328494002e-05, + "loss": 1.4746, + "step": 39681 + }, + { + "epoch": 0.5156494916696852, + "grad_norm": 0.4339883327484131, + "learning_rate": 9.688974382302863e-05, + "loss": 1.4004, + "step": 39682 + }, + { + "epoch": 0.5156624862136011, + "grad_norm": 0.42420902848243713, + "learning_rate": 9.688714436111725e-05, + "loss": 1.343, + "step": 39683 + }, + { + "epoch": 0.5156754807575169, + "grad_norm": 0.4737933874130249, + "learning_rate": 9.688454489920586e-05, + "loss": 1.5135, + "step": 39684 + }, + { + "epoch": 0.5156884753014328, + "grad_norm": 0.39823395013809204, + "learning_rate": 9.688194543729449e-05, + "loss": 1.3754, + "step": 39685 + }, + { + "epoch": 0.5157014698453487, + "grad_norm": 0.3858322203159332, + "learning_rate": 9.68793459753831e-05, + "loss": 1.455, + "step": 39686 + }, + { + "epoch": 0.5157144643892646, + "grad_norm": 0.385077565908432, + "learning_rate": 9.687674651347171e-05, + "loss": 1.3362, + "step": 39687 + }, + { + "epoch": 0.5157274589331804, + "grad_norm": 0.3967994153499603, + "learning_rate": 9.687414705156032e-05, + "loss": 1.519, + "step": 39688 + }, + { + "epoch": 0.5157404534770963, + "grad_norm": 0.40071114897727966, + "learning_rate": 9.687154758964895e-05, + "loss": 1.4978, + "step": 39689 + }, + { + "epoch": 0.5157534480210122, + "grad_norm": 0.38770627975463867, + "learning_rate": 9.686894812773756e-05, + "loss": 1.4567, + "step": 39690 + }, + { + "epoch": 0.5157664425649281, + "grad_norm": 0.4315352737903595, + "learning_rate": 9.686634866582618e-05, + "loss": 1.4558, + "step": 39691 + }, + { + "epoch": 0.5157794371088439, + "grad_norm": 0.36199596524238586, + "learning_rate": 9.68637492039148e-05, + "loss": 1.4656, + "step": 39692 + }, + { + "epoch": 0.5157924316527598, + "grad_norm": 0.4555199146270752, + "learning_rate": 9.68611497420034e-05, + "loss": 1.4621, + "step": 39693 + }, + { + "epoch": 0.5158054261966757, + "grad_norm": 0.3427242040634155, + "learning_rate": 9.685855028009203e-05, + "loss": 1.2882, + "step": 39694 + }, + { + "epoch": 0.5158184207405916, + "grad_norm": 0.3931252062320709, + "learning_rate": 9.685595081818064e-05, + "loss": 1.4466, + "step": 39695 + }, + { + "epoch": 0.5158314152845074, + "grad_norm": 0.41686004400253296, + "learning_rate": 9.685335135626926e-05, + "loss": 1.4782, + "step": 39696 + }, + { + "epoch": 0.5158444098284233, + "grad_norm": 0.448379784822464, + "learning_rate": 9.685075189435787e-05, + "loss": 1.4749, + "step": 39697 + }, + { + "epoch": 0.5158574043723392, + "grad_norm": 0.4665580093860626, + "learning_rate": 9.68481524324465e-05, + "loss": 1.4845, + "step": 39698 + }, + { + "epoch": 0.5158703989162551, + "grad_norm": 0.344743013381958, + "learning_rate": 9.68455529705351e-05, + "loss": 1.4014, + "step": 39699 + }, + { + "epoch": 0.5158833934601709, + "grad_norm": 0.17722588777542114, + "learning_rate": 9.684295350862372e-05, + "loss": 0.9985, + "step": 39700 + }, + { + "epoch": 0.5158963880040868, + "grad_norm": 0.3760932385921478, + "learning_rate": 9.684035404671233e-05, + "loss": 1.5692, + "step": 39701 + }, + { + "epoch": 0.5159093825480027, + "grad_norm": 0.3735870122909546, + "learning_rate": 9.683775458480096e-05, + "loss": 1.3973, + "step": 39702 + }, + { + "epoch": 0.5159223770919186, + "grad_norm": 0.42061847448349, + "learning_rate": 9.683515512288957e-05, + "loss": 1.4381, + "step": 39703 + }, + { + "epoch": 0.5159353716358344, + "grad_norm": 0.4074678122997284, + "learning_rate": 9.683255566097819e-05, + "loss": 1.3, + "step": 39704 + }, + { + "epoch": 0.5159483661797503, + "grad_norm": 0.4492395520210266, + "learning_rate": 9.68299561990668e-05, + "loss": 1.4017, + "step": 39705 + }, + { + "epoch": 0.5159613607236662, + "grad_norm": 0.35534337162971497, + "learning_rate": 9.682735673715541e-05, + "loss": 1.48, + "step": 39706 + }, + { + "epoch": 0.5159743552675821, + "grad_norm": 0.38247454166412354, + "learning_rate": 9.682475727524402e-05, + "loss": 1.2794, + "step": 39707 + }, + { + "epoch": 0.5159873498114979, + "grad_norm": 0.4027169644832611, + "learning_rate": 9.682215781333265e-05, + "loss": 1.3626, + "step": 39708 + }, + { + "epoch": 0.5160003443554138, + "grad_norm": 0.42067059874534607, + "learning_rate": 9.681955835142126e-05, + "loss": 1.5148, + "step": 39709 + }, + { + "epoch": 0.5160133388993297, + "grad_norm": 0.43157386779785156, + "learning_rate": 9.681695888950988e-05, + "loss": 1.2895, + "step": 39710 + }, + { + "epoch": 0.5160263334432456, + "grad_norm": 0.4166816473007202, + "learning_rate": 9.68143594275985e-05, + "loss": 1.3646, + "step": 39711 + }, + { + "epoch": 0.5160393279871613, + "grad_norm": 0.4115425944328308, + "learning_rate": 9.68117599656871e-05, + "loss": 1.2935, + "step": 39712 + }, + { + "epoch": 0.5160523225310772, + "grad_norm": 0.3313804566860199, + "learning_rate": 9.680916050377572e-05, + "loss": 1.4153, + "step": 39713 + }, + { + "epoch": 0.5160653170749931, + "grad_norm": 0.4072376787662506, + "learning_rate": 9.680656104186434e-05, + "loss": 1.4457, + "step": 39714 + }, + { + "epoch": 0.516078311618909, + "grad_norm": 0.4364437162876129, + "learning_rate": 9.680396157995295e-05, + "loss": 1.4303, + "step": 39715 + }, + { + "epoch": 0.5160913061628248, + "grad_norm": 0.3545844554901123, + "learning_rate": 9.680136211804158e-05, + "loss": 1.3587, + "step": 39716 + }, + { + "epoch": 0.5161043007067407, + "grad_norm": 0.4468756318092346, + "learning_rate": 9.679876265613019e-05, + "loss": 1.3551, + "step": 39717 + }, + { + "epoch": 0.5161172952506566, + "grad_norm": 0.4774535894393921, + "learning_rate": 9.67961631942188e-05, + "loss": 1.3784, + "step": 39718 + }, + { + "epoch": 0.5161302897945725, + "grad_norm": 0.2824321389198303, + "learning_rate": 9.679356373230741e-05, + "loss": 1.372, + "step": 39719 + }, + { + "epoch": 0.5161432843384883, + "grad_norm": 0.32836994528770447, + "learning_rate": 9.679096427039603e-05, + "loss": 1.4254, + "step": 39720 + }, + { + "epoch": 0.5161562788824042, + "grad_norm": 0.3381592333316803, + "learning_rate": 9.678836480848465e-05, + "loss": 1.3536, + "step": 39721 + }, + { + "epoch": 0.5161692734263201, + "grad_norm": 0.42833465337753296, + "learning_rate": 9.678576534657327e-05, + "loss": 1.5646, + "step": 39722 + }, + { + "epoch": 0.516182267970236, + "grad_norm": 0.31299206614494324, + "learning_rate": 9.678316588466188e-05, + "loss": 1.2645, + "step": 39723 + }, + { + "epoch": 0.5161952625141518, + "grad_norm": 0.4437852203845978, + "learning_rate": 9.678056642275049e-05, + "loss": 1.429, + "step": 39724 + }, + { + "epoch": 0.5162082570580677, + "grad_norm": 0.42146965861320496, + "learning_rate": 9.67779669608391e-05, + "loss": 1.3933, + "step": 39725 + }, + { + "epoch": 0.5162212516019836, + "grad_norm": 0.385261207818985, + "learning_rate": 9.677536749892773e-05, + "loss": 1.384, + "step": 39726 + }, + { + "epoch": 0.5162342461458995, + "grad_norm": 0.4106166362762451, + "learning_rate": 9.677276803701634e-05, + "loss": 1.3839, + "step": 39727 + }, + { + "epoch": 0.5162472406898153, + "grad_norm": 0.441175252199173, + "learning_rate": 9.677016857510496e-05, + "loss": 1.348, + "step": 39728 + }, + { + "epoch": 0.5162602352337312, + "grad_norm": 0.40549442172050476, + "learning_rate": 9.676756911319357e-05, + "loss": 1.3247, + "step": 39729 + }, + { + "epoch": 0.5162732297776471, + "grad_norm": 0.43162834644317627, + "learning_rate": 9.676496965128218e-05, + "loss": 1.5191, + "step": 39730 + }, + { + "epoch": 0.516286224321563, + "grad_norm": 0.3366645574569702, + "learning_rate": 9.676237018937081e-05, + "loss": 1.4965, + "step": 39731 + }, + { + "epoch": 0.5162992188654788, + "grad_norm": 0.45134490728378296, + "learning_rate": 9.675977072745942e-05, + "loss": 1.4184, + "step": 39732 + }, + { + "epoch": 0.5163122134093947, + "grad_norm": 0.49368637800216675, + "learning_rate": 9.675717126554804e-05, + "loss": 1.3428, + "step": 39733 + }, + { + "epoch": 0.5163252079533106, + "grad_norm": 0.4337972104549408, + "learning_rate": 9.675457180363666e-05, + "loss": 1.4095, + "step": 39734 + }, + { + "epoch": 0.5163382024972265, + "grad_norm": 0.4925230145454407, + "learning_rate": 9.675197234172527e-05, + "loss": 1.2615, + "step": 39735 + }, + { + "epoch": 0.5163511970411423, + "grad_norm": 0.42128655314445496, + "learning_rate": 9.674937287981388e-05, + "loss": 1.3554, + "step": 39736 + }, + { + "epoch": 0.5163641915850582, + "grad_norm": 0.31247860193252563, + "learning_rate": 9.67467734179025e-05, + "loss": 1.3697, + "step": 39737 + }, + { + "epoch": 0.5163771861289741, + "grad_norm": 0.4025348424911499, + "learning_rate": 9.674417395599111e-05, + "loss": 1.3012, + "step": 39738 + }, + { + "epoch": 0.51639018067289, + "grad_norm": 0.36298179626464844, + "learning_rate": 9.674157449407974e-05, + "loss": 1.1191, + "step": 39739 + }, + { + "epoch": 0.5164031752168059, + "grad_norm": 0.3341827988624573, + "learning_rate": 9.673897503216835e-05, + "loss": 1.1723, + "step": 39740 + }, + { + "epoch": 0.5164161697607217, + "grad_norm": 0.5221813321113586, + "learning_rate": 9.673637557025696e-05, + "loss": 1.4704, + "step": 39741 + }, + { + "epoch": 0.5164291643046376, + "grad_norm": 0.4140227437019348, + "learning_rate": 9.673377610834557e-05, + "loss": 1.3517, + "step": 39742 + }, + { + "epoch": 0.5164421588485535, + "grad_norm": 0.43374601006507874, + "learning_rate": 9.67311766464342e-05, + "loss": 1.1578, + "step": 39743 + }, + { + "epoch": 0.5164551533924694, + "grad_norm": 0.37288859486579895, + "learning_rate": 9.67285771845228e-05, + "loss": 1.3072, + "step": 39744 + }, + { + "epoch": 0.5164681479363852, + "grad_norm": 0.3364974856376648, + "learning_rate": 9.672597772261143e-05, + "loss": 1.4689, + "step": 39745 + }, + { + "epoch": 0.5164811424803011, + "grad_norm": 0.42575275897979736, + "learning_rate": 9.672337826070004e-05, + "loss": 1.1894, + "step": 39746 + }, + { + "epoch": 0.516494137024217, + "grad_norm": 0.343377947807312, + "learning_rate": 9.672077879878866e-05, + "loss": 1.3062, + "step": 39747 + }, + { + "epoch": 0.5165071315681329, + "grad_norm": 0.408843457698822, + "learning_rate": 9.671817933687726e-05, + "loss": 1.2573, + "step": 39748 + }, + { + "epoch": 0.5165201261120487, + "grad_norm": 0.4673076868057251, + "learning_rate": 9.671557987496589e-05, + "loss": 1.2183, + "step": 39749 + }, + { + "epoch": 0.5165331206559646, + "grad_norm": 0.40669989585876465, + "learning_rate": 9.67129804130545e-05, + "loss": 1.2672, + "step": 39750 + }, + { + "epoch": 0.5165461151998805, + "grad_norm": 0.3623637855052948, + "learning_rate": 9.671038095114312e-05, + "loss": 1.4054, + "step": 39751 + }, + { + "epoch": 0.5165591097437964, + "grad_norm": 0.4518243372440338, + "learning_rate": 9.670778148923173e-05, + "loss": 1.4425, + "step": 39752 + }, + { + "epoch": 0.5165721042877122, + "grad_norm": 0.38863709568977356, + "learning_rate": 9.670518202732036e-05, + "loss": 1.0689, + "step": 39753 + }, + { + "epoch": 0.5165850988316281, + "grad_norm": 0.4522719383239746, + "learning_rate": 9.670258256540896e-05, + "loss": 1.2337, + "step": 39754 + }, + { + "epoch": 0.516598093375544, + "grad_norm": 0.4815397560596466, + "learning_rate": 9.669998310349758e-05, + "loss": 1.4118, + "step": 39755 + }, + { + "epoch": 0.5166110879194599, + "grad_norm": 0.5005756616592407, + "learning_rate": 9.669738364158619e-05, + "loss": 1.043, + "step": 39756 + }, + { + "epoch": 0.5166240824633757, + "grad_norm": 0.4224078357219696, + "learning_rate": 9.669478417967481e-05, + "loss": 1.434, + "step": 39757 + }, + { + "epoch": 0.5166370770072916, + "grad_norm": 0.4652017652988434, + "learning_rate": 9.669218471776343e-05, + "loss": 1.2413, + "step": 39758 + }, + { + "epoch": 0.5166500715512075, + "grad_norm": 0.24948734045028687, + "learning_rate": 9.668958525585205e-05, + "loss": 1.2508, + "step": 39759 + }, + { + "epoch": 0.5166630660951234, + "grad_norm": 0.35661581158638, + "learning_rate": 9.668698579394065e-05, + "loss": 1.1946, + "step": 39760 + }, + { + "epoch": 0.5166760606390391, + "grad_norm": 0.4033752679824829, + "learning_rate": 9.668438633202927e-05, + "loss": 1.4716, + "step": 39761 + }, + { + "epoch": 0.516689055182955, + "grad_norm": 0.47122421860694885, + "learning_rate": 9.668178687011788e-05, + "loss": 1.41, + "step": 39762 + }, + { + "epoch": 0.516702049726871, + "grad_norm": 0.3335406184196472, + "learning_rate": 9.667918740820651e-05, + "loss": 1.3852, + "step": 39763 + }, + { + "epoch": 0.5167150442707868, + "grad_norm": 0.5164328217506409, + "learning_rate": 9.667658794629512e-05, + "loss": 1.5719, + "step": 39764 + }, + { + "epoch": 0.5167280388147026, + "grad_norm": 0.4332098662853241, + "learning_rate": 9.667398848438374e-05, + "loss": 1.5024, + "step": 39765 + }, + { + "epoch": 0.5167410333586185, + "grad_norm": 0.40759408473968506, + "learning_rate": 9.667138902247235e-05, + "loss": 1.3069, + "step": 39766 + }, + { + "epoch": 0.5167540279025344, + "grad_norm": 0.38008755445480347, + "learning_rate": 9.666878956056096e-05, + "loss": 1.4045, + "step": 39767 + }, + { + "epoch": 0.5167670224464503, + "grad_norm": 0.45996609330177307, + "learning_rate": 9.666619009864959e-05, + "loss": 1.4022, + "step": 39768 + }, + { + "epoch": 0.5167800169903661, + "grad_norm": 0.543403148651123, + "learning_rate": 9.66635906367382e-05, + "loss": 1.624, + "step": 39769 + }, + { + "epoch": 0.516793011534282, + "grad_norm": 0.3805897831916809, + "learning_rate": 9.666099117482682e-05, + "loss": 1.2429, + "step": 39770 + }, + { + "epoch": 0.5168060060781979, + "grad_norm": 0.3329651951789856, + "learning_rate": 9.665839171291544e-05, + "loss": 1.4482, + "step": 39771 + }, + { + "epoch": 0.5168190006221138, + "grad_norm": 0.4025849997997284, + "learning_rate": 9.665579225100405e-05, + "loss": 1.4648, + "step": 39772 + }, + { + "epoch": 0.5168319951660296, + "grad_norm": 0.37977755069732666, + "learning_rate": 9.665319278909266e-05, + "loss": 1.4802, + "step": 39773 + }, + { + "epoch": 0.5168449897099455, + "grad_norm": 0.42401570081710815, + "learning_rate": 9.665059332718128e-05, + "loss": 1.4939, + "step": 39774 + }, + { + "epoch": 0.5168579842538614, + "grad_norm": 0.37666407227516174, + "learning_rate": 9.664799386526989e-05, + "loss": 1.3415, + "step": 39775 + }, + { + "epoch": 0.5168709787977773, + "grad_norm": 0.40926486253738403, + "learning_rate": 9.664539440335852e-05, + "loss": 1.2638, + "step": 39776 + }, + { + "epoch": 0.5168839733416931, + "grad_norm": 0.43744245171546936, + "learning_rate": 9.664279494144713e-05, + "loss": 1.4516, + "step": 39777 + }, + { + "epoch": 0.516896967885609, + "grad_norm": 0.41427087783813477, + "learning_rate": 9.664019547953574e-05, + "loss": 1.4958, + "step": 39778 + }, + { + "epoch": 0.5169099624295249, + "grad_norm": 0.42561405897140503, + "learning_rate": 9.663759601762435e-05, + "loss": 1.4731, + "step": 39779 + }, + { + "epoch": 0.5169229569734408, + "grad_norm": 0.3866291046142578, + "learning_rate": 9.663499655571297e-05, + "loss": 1.41, + "step": 39780 + }, + { + "epoch": 0.5169359515173566, + "grad_norm": 0.40552273392677307, + "learning_rate": 9.663239709380159e-05, + "loss": 1.2356, + "step": 39781 + }, + { + "epoch": 0.5169489460612725, + "grad_norm": 0.42753568291664124, + "learning_rate": 9.662979763189021e-05, + "loss": 1.4893, + "step": 39782 + }, + { + "epoch": 0.5169619406051884, + "grad_norm": 0.4228987991809845, + "learning_rate": 9.662719816997882e-05, + "loss": 1.4465, + "step": 39783 + }, + { + "epoch": 0.5169749351491043, + "grad_norm": 0.33820393681526184, + "learning_rate": 9.662459870806743e-05, + "loss": 1.2627, + "step": 39784 + }, + { + "epoch": 0.5169879296930201, + "grad_norm": 0.38927245140075684, + "learning_rate": 9.662199924615604e-05, + "loss": 1.2624, + "step": 39785 + }, + { + "epoch": 0.517000924236936, + "grad_norm": 0.4500969350337982, + "learning_rate": 9.661939978424467e-05, + "loss": 1.3489, + "step": 39786 + }, + { + "epoch": 0.5170139187808519, + "grad_norm": 0.47534242272377014, + "learning_rate": 9.661680032233328e-05, + "loss": 1.3918, + "step": 39787 + }, + { + "epoch": 0.5170269133247678, + "grad_norm": 0.4441378116607666, + "learning_rate": 9.66142008604219e-05, + "loss": 1.3423, + "step": 39788 + }, + { + "epoch": 0.5170399078686836, + "grad_norm": 0.45378854870796204, + "learning_rate": 9.661160139851051e-05, + "loss": 1.295, + "step": 39789 + }, + { + "epoch": 0.5170529024125995, + "grad_norm": 0.4024064242839813, + "learning_rate": 9.660900193659912e-05, + "loss": 1.3863, + "step": 39790 + }, + { + "epoch": 0.5170658969565154, + "grad_norm": 0.3129641115665436, + "learning_rate": 9.660640247468774e-05, + "loss": 1.2861, + "step": 39791 + }, + { + "epoch": 0.5170788915004313, + "grad_norm": 0.5997129082679749, + "learning_rate": 9.660380301277636e-05, + "loss": 1.4397, + "step": 39792 + }, + { + "epoch": 0.5170918860443471, + "grad_norm": 0.34037235379219055, + "learning_rate": 9.660120355086497e-05, + "loss": 1.5064, + "step": 39793 + }, + { + "epoch": 0.517104880588263, + "grad_norm": 0.4425402879714966, + "learning_rate": 9.65986040889536e-05, + "loss": 1.5402, + "step": 39794 + }, + { + "epoch": 0.5171178751321789, + "grad_norm": 0.4149685502052307, + "learning_rate": 9.65960046270422e-05, + "loss": 1.4033, + "step": 39795 + }, + { + "epoch": 0.5171308696760948, + "grad_norm": 0.40826696157455444, + "learning_rate": 9.659340516513082e-05, + "loss": 1.4657, + "step": 39796 + }, + { + "epoch": 0.5171438642200106, + "grad_norm": 0.27776089310646057, + "learning_rate": 9.659080570321943e-05, + "loss": 1.3309, + "step": 39797 + }, + { + "epoch": 0.5171568587639265, + "grad_norm": 0.4421710669994354, + "learning_rate": 9.658820624130805e-05, + "loss": 1.4369, + "step": 39798 + }, + { + "epoch": 0.5171698533078424, + "grad_norm": 0.4532667398452759, + "learning_rate": 9.658560677939666e-05, + "loss": 1.5156, + "step": 39799 + }, + { + "epoch": 0.5171828478517583, + "grad_norm": 0.40888866782188416, + "learning_rate": 9.658300731748529e-05, + "loss": 1.3444, + "step": 39800 + }, + { + "epoch": 0.5171958423956741, + "grad_norm": 0.4366384744644165, + "learning_rate": 9.65804078555739e-05, + "loss": 1.4396, + "step": 39801 + }, + { + "epoch": 0.51720883693959, + "grad_norm": 0.4267401397228241, + "learning_rate": 9.657780839366251e-05, + "loss": 1.3359, + "step": 39802 + }, + { + "epoch": 0.5172218314835059, + "grad_norm": 0.372907817363739, + "learning_rate": 9.657520893175112e-05, + "loss": 1.4627, + "step": 39803 + }, + { + "epoch": 0.5172348260274218, + "grad_norm": 0.40663430094718933, + "learning_rate": 9.657260946983975e-05, + "loss": 1.3293, + "step": 39804 + }, + { + "epoch": 0.5172478205713376, + "grad_norm": 0.32831236720085144, + "learning_rate": 9.657001000792837e-05, + "loss": 1.4796, + "step": 39805 + }, + { + "epoch": 0.5172608151152535, + "grad_norm": 0.41455942392349243, + "learning_rate": 9.656741054601698e-05, + "loss": 1.2911, + "step": 39806 + }, + { + "epoch": 0.5172738096591694, + "grad_norm": 0.4237598478794098, + "learning_rate": 9.65648110841056e-05, + "loss": 1.3466, + "step": 39807 + }, + { + "epoch": 0.5172868042030853, + "grad_norm": 0.5078092813491821, + "learning_rate": 9.656221162219422e-05, + "loss": 1.4124, + "step": 39808 + }, + { + "epoch": 0.517299798747001, + "grad_norm": 0.43971511721611023, + "learning_rate": 9.655961216028283e-05, + "loss": 1.4124, + "step": 39809 + }, + { + "epoch": 0.517312793290917, + "grad_norm": 0.378388911485672, + "learning_rate": 9.655701269837144e-05, + "loss": 1.3398, + "step": 39810 + }, + { + "epoch": 0.5173257878348329, + "grad_norm": 0.3556126356124878, + "learning_rate": 9.655441323646006e-05, + "loss": 1.1744, + "step": 39811 + }, + { + "epoch": 0.5173387823787488, + "grad_norm": 0.35911452770233154, + "learning_rate": 9.655181377454867e-05, + "loss": 1.2825, + "step": 39812 + }, + { + "epoch": 0.5173517769226645, + "grad_norm": 0.3927856981754303, + "learning_rate": 9.65492143126373e-05, + "loss": 1.1968, + "step": 39813 + }, + { + "epoch": 0.5173647714665804, + "grad_norm": 0.31085440516471863, + "learning_rate": 9.654661485072591e-05, + "loss": 1.3163, + "step": 39814 + }, + { + "epoch": 0.5173777660104963, + "grad_norm": 0.3252396583557129, + "learning_rate": 9.654401538881452e-05, + "loss": 1.3741, + "step": 39815 + }, + { + "epoch": 0.5173907605544122, + "grad_norm": 0.4776071310043335, + "learning_rate": 9.654141592690313e-05, + "loss": 1.4171, + "step": 39816 + }, + { + "epoch": 0.5174037550983281, + "grad_norm": 0.33488571643829346, + "learning_rate": 9.653881646499176e-05, + "loss": 1.2847, + "step": 39817 + }, + { + "epoch": 0.5174167496422439, + "grad_norm": 0.43468913435935974, + "learning_rate": 9.653621700308037e-05, + "loss": 1.2042, + "step": 39818 + }, + { + "epoch": 0.5174297441861598, + "grad_norm": 0.3939213752746582, + "learning_rate": 9.653361754116899e-05, + "loss": 1.5204, + "step": 39819 + }, + { + "epoch": 0.5174427387300757, + "grad_norm": 0.4318459630012512, + "learning_rate": 9.65310180792576e-05, + "loss": 1.5786, + "step": 39820 + }, + { + "epoch": 0.5174557332739916, + "grad_norm": 0.36560386419296265, + "learning_rate": 9.652841861734621e-05, + "loss": 1.2486, + "step": 39821 + }, + { + "epoch": 0.5174687278179074, + "grad_norm": 0.473521888256073, + "learning_rate": 9.652581915543482e-05, + "loss": 1.4041, + "step": 39822 + }, + { + "epoch": 0.5174817223618233, + "grad_norm": 0.5077974796295166, + "learning_rate": 9.652321969352345e-05, + "loss": 1.449, + "step": 39823 + }, + { + "epoch": 0.5174947169057392, + "grad_norm": 0.41945451498031616, + "learning_rate": 9.652062023161206e-05, + "loss": 1.4509, + "step": 39824 + }, + { + "epoch": 0.5175077114496551, + "grad_norm": 0.4337124228477478, + "learning_rate": 9.651802076970068e-05, + "loss": 1.5603, + "step": 39825 + }, + { + "epoch": 0.5175207059935709, + "grad_norm": 0.3450642228126526, + "learning_rate": 9.65154213077893e-05, + "loss": 1.3649, + "step": 39826 + }, + { + "epoch": 0.5175337005374868, + "grad_norm": 0.4827076196670532, + "learning_rate": 9.65128218458779e-05, + "loss": 1.435, + "step": 39827 + }, + { + "epoch": 0.5175466950814027, + "grad_norm": 0.37749630212783813, + "learning_rate": 9.651022238396652e-05, + "loss": 1.4146, + "step": 39828 + }, + { + "epoch": 0.5175596896253186, + "grad_norm": 0.34670841693878174, + "learning_rate": 9.650762292205514e-05, + "loss": 1.2127, + "step": 39829 + }, + { + "epoch": 0.5175726841692344, + "grad_norm": 0.4276048541069031, + "learning_rate": 9.650502346014375e-05, + "loss": 1.7008, + "step": 39830 + }, + { + "epoch": 0.5175856787131503, + "grad_norm": 0.3143344223499298, + "learning_rate": 9.650242399823238e-05, + "loss": 1.2626, + "step": 39831 + }, + { + "epoch": 0.5175986732570662, + "grad_norm": 0.3719705045223236, + "learning_rate": 9.649982453632099e-05, + "loss": 1.3233, + "step": 39832 + }, + { + "epoch": 0.5176116678009821, + "grad_norm": 0.37398555874824524, + "learning_rate": 9.64972250744096e-05, + "loss": 1.2199, + "step": 39833 + }, + { + "epoch": 0.5176246623448979, + "grad_norm": 0.5182440876960754, + "learning_rate": 9.649462561249821e-05, + "loss": 1.4561, + "step": 39834 + }, + { + "epoch": 0.5176376568888138, + "grad_norm": 0.3954979181289673, + "learning_rate": 9.649202615058683e-05, + "loss": 1.2075, + "step": 39835 + }, + { + "epoch": 0.5176506514327297, + "grad_norm": 0.2489006072282791, + "learning_rate": 9.648942668867544e-05, + "loss": 1.1991, + "step": 39836 + }, + { + "epoch": 0.5176636459766456, + "grad_norm": 0.4750337600708008, + "learning_rate": 9.648682722676407e-05, + "loss": 1.4766, + "step": 39837 + }, + { + "epoch": 0.5176766405205614, + "grad_norm": 0.43312010169029236, + "learning_rate": 9.648422776485268e-05, + "loss": 1.4388, + "step": 39838 + }, + { + "epoch": 0.5176896350644773, + "grad_norm": 0.408152312040329, + "learning_rate": 9.648162830294129e-05, + "loss": 1.3923, + "step": 39839 + }, + { + "epoch": 0.5177026296083932, + "grad_norm": 0.4312520921230316, + "learning_rate": 9.64790288410299e-05, + "loss": 1.3184, + "step": 39840 + }, + { + "epoch": 0.5177156241523091, + "grad_norm": 0.3170221149921417, + "learning_rate": 9.647642937911853e-05, + "loss": 1.1946, + "step": 39841 + }, + { + "epoch": 0.5177286186962249, + "grad_norm": 0.4742594361305237, + "learning_rate": 9.647382991720715e-05, + "loss": 1.4002, + "step": 39842 + }, + { + "epoch": 0.5177416132401408, + "grad_norm": 0.42664453387260437, + "learning_rate": 9.647123045529576e-05, + "loss": 1.4475, + "step": 39843 + }, + { + "epoch": 0.5177546077840567, + "grad_norm": 0.4326609969139099, + "learning_rate": 9.646863099338437e-05, + "loss": 1.3398, + "step": 39844 + }, + { + "epoch": 0.5177676023279726, + "grad_norm": 0.44383418560028076, + "learning_rate": 9.646603153147298e-05, + "loss": 1.4035, + "step": 39845 + }, + { + "epoch": 0.5177805968718884, + "grad_norm": 0.3674788475036621, + "learning_rate": 9.646343206956161e-05, + "loss": 1.424, + "step": 39846 + }, + { + "epoch": 0.5177935914158043, + "grad_norm": 0.3745240271091461, + "learning_rate": 9.646083260765022e-05, + "loss": 1.2085, + "step": 39847 + }, + { + "epoch": 0.5178065859597202, + "grad_norm": 0.34725555777549744, + "learning_rate": 9.645823314573884e-05, + "loss": 1.3426, + "step": 39848 + }, + { + "epoch": 0.5178195805036361, + "grad_norm": 0.5519569516181946, + "learning_rate": 9.645563368382745e-05, + "loss": 1.4524, + "step": 39849 + }, + { + "epoch": 0.5178325750475519, + "grad_norm": 0.4079243838787079, + "learning_rate": 9.645303422191608e-05, + "loss": 1.2667, + "step": 39850 + }, + { + "epoch": 0.5178455695914678, + "grad_norm": 0.433389276266098, + "learning_rate": 9.645043476000468e-05, + "loss": 1.457, + "step": 39851 + }, + { + "epoch": 0.5178585641353837, + "grad_norm": 0.4642544090747833, + "learning_rate": 9.64478352980933e-05, + "loss": 1.4968, + "step": 39852 + }, + { + "epoch": 0.5178715586792996, + "grad_norm": 0.5511395335197449, + "learning_rate": 9.644523583618191e-05, + "loss": 1.4382, + "step": 39853 + }, + { + "epoch": 0.5178845532232154, + "grad_norm": 0.3607686460018158, + "learning_rate": 9.644263637427054e-05, + "loss": 1.3017, + "step": 39854 + }, + { + "epoch": 0.5178975477671313, + "grad_norm": 0.31322386860847473, + "learning_rate": 9.644003691235915e-05, + "loss": 1.4886, + "step": 39855 + }, + { + "epoch": 0.5179105423110472, + "grad_norm": 0.3395468294620514, + "learning_rate": 9.643743745044777e-05, + "loss": 1.1908, + "step": 39856 + }, + { + "epoch": 0.5179235368549631, + "grad_norm": 0.4174630045890808, + "learning_rate": 9.643483798853637e-05, + "loss": 1.3528, + "step": 39857 + }, + { + "epoch": 0.5179365313988789, + "grad_norm": 0.34309256076812744, + "learning_rate": 9.6432238526625e-05, + "loss": 1.1808, + "step": 39858 + }, + { + "epoch": 0.5179495259427948, + "grad_norm": 0.4298976957798004, + "learning_rate": 9.64296390647136e-05, + "loss": 1.2467, + "step": 39859 + }, + { + "epoch": 0.5179625204867107, + "grad_norm": 0.26785096526145935, + "learning_rate": 9.642703960280223e-05, + "loss": 1.264, + "step": 39860 + }, + { + "epoch": 0.5179755150306266, + "grad_norm": 0.43830612301826477, + "learning_rate": 9.642444014089084e-05, + "loss": 1.3387, + "step": 39861 + }, + { + "epoch": 0.5179885095745423, + "grad_norm": 0.45528340339660645, + "learning_rate": 9.642184067897946e-05, + "loss": 1.3545, + "step": 39862 + }, + { + "epoch": 0.5180015041184582, + "grad_norm": 0.4136285185813904, + "learning_rate": 9.641924121706806e-05, + "loss": 1.4656, + "step": 39863 + }, + { + "epoch": 0.5180144986623741, + "grad_norm": 0.4535764753818512, + "learning_rate": 9.641664175515669e-05, + "loss": 1.4776, + "step": 39864 + }, + { + "epoch": 0.51802749320629, + "grad_norm": 0.365784227848053, + "learning_rate": 9.64140422932453e-05, + "loss": 1.2931, + "step": 39865 + }, + { + "epoch": 0.5180404877502058, + "grad_norm": 0.43346884846687317, + "learning_rate": 9.641144283133392e-05, + "loss": 1.4059, + "step": 39866 + }, + { + "epoch": 0.5180534822941217, + "grad_norm": 0.3246438503265381, + "learning_rate": 9.640884336942253e-05, + "loss": 1.3472, + "step": 39867 + }, + { + "epoch": 0.5180664768380376, + "grad_norm": 0.482332319021225, + "learning_rate": 9.640624390751116e-05, + "loss": 1.2797, + "step": 39868 + }, + { + "epoch": 0.5180794713819535, + "grad_norm": 0.4121081233024597, + "learning_rate": 9.640364444559977e-05, + "loss": 1.5128, + "step": 39869 + }, + { + "epoch": 0.5180924659258693, + "grad_norm": 0.37354040145874023, + "learning_rate": 9.640104498368838e-05, + "loss": 1.4179, + "step": 39870 + }, + { + "epoch": 0.5181054604697852, + "grad_norm": 0.5037780404090881, + "learning_rate": 9.639844552177699e-05, + "loss": 1.5753, + "step": 39871 + }, + { + "epoch": 0.5181184550137011, + "grad_norm": 0.40410736203193665, + "learning_rate": 9.639584605986561e-05, + "loss": 1.3085, + "step": 39872 + }, + { + "epoch": 0.518131449557617, + "grad_norm": 0.4117945432662964, + "learning_rate": 9.639324659795423e-05, + "loss": 1.1837, + "step": 39873 + }, + { + "epoch": 0.5181444441015328, + "grad_norm": 0.5411643981933594, + "learning_rate": 9.639064713604285e-05, + "loss": 1.3347, + "step": 39874 + }, + { + "epoch": 0.5181574386454487, + "grad_norm": 0.4504357874393463, + "learning_rate": 9.638804767413146e-05, + "loss": 1.3478, + "step": 39875 + }, + { + "epoch": 0.5181704331893646, + "grad_norm": 0.6726453304290771, + "learning_rate": 9.638544821222007e-05, + "loss": 1.4861, + "step": 39876 + }, + { + "epoch": 0.5181834277332805, + "grad_norm": 0.5539174675941467, + "learning_rate": 9.638284875030868e-05, + "loss": 1.5024, + "step": 39877 + }, + { + "epoch": 0.5181964222771963, + "grad_norm": 0.5677586197853088, + "learning_rate": 9.638024928839731e-05, + "loss": 1.588, + "step": 39878 + }, + { + "epoch": 0.5182094168211122, + "grad_norm": 0.3257874846458435, + "learning_rate": 9.637764982648593e-05, + "loss": 1.3361, + "step": 39879 + }, + { + "epoch": 0.5182224113650281, + "grad_norm": 0.3789377212524414, + "learning_rate": 9.637505036457454e-05, + "loss": 1.5491, + "step": 39880 + }, + { + "epoch": 0.518235405908944, + "grad_norm": 0.35225480794906616, + "learning_rate": 9.637245090266315e-05, + "loss": 1.3352, + "step": 39881 + }, + { + "epoch": 0.5182484004528598, + "grad_norm": 0.35333380103111267, + "learning_rate": 9.636985144075176e-05, + "loss": 1.458, + "step": 39882 + }, + { + "epoch": 0.5182613949967757, + "grad_norm": 0.3810538053512573, + "learning_rate": 9.636725197884039e-05, + "loss": 1.5951, + "step": 39883 + }, + { + "epoch": 0.5182743895406916, + "grad_norm": 0.5290937423706055, + "learning_rate": 9.6364652516929e-05, + "loss": 1.2644, + "step": 39884 + }, + { + "epoch": 0.5182873840846075, + "grad_norm": 0.33791014552116394, + "learning_rate": 9.636205305501762e-05, + "loss": 1.4552, + "step": 39885 + }, + { + "epoch": 0.5183003786285233, + "grad_norm": 0.36395886540412903, + "learning_rate": 9.635945359310623e-05, + "loss": 1.2726, + "step": 39886 + }, + { + "epoch": 0.5183133731724392, + "grad_norm": 0.4548851251602173, + "learning_rate": 9.635685413119485e-05, + "loss": 1.364, + "step": 39887 + }, + { + "epoch": 0.5183263677163551, + "grad_norm": 0.36717653274536133, + "learning_rate": 9.635425466928346e-05, + "loss": 1.4723, + "step": 39888 + }, + { + "epoch": 0.518339362260271, + "grad_norm": 0.4422442615032196, + "learning_rate": 9.635165520737208e-05, + "loss": 1.3459, + "step": 39889 + }, + { + "epoch": 0.5183523568041868, + "grad_norm": 0.4063000977039337, + "learning_rate": 9.634905574546069e-05, + "loss": 1.3891, + "step": 39890 + }, + { + "epoch": 0.5183653513481027, + "grad_norm": 0.4366697072982788, + "learning_rate": 9.634645628354932e-05, + "loss": 1.3686, + "step": 39891 + }, + { + "epoch": 0.5183783458920186, + "grad_norm": 0.2905206084251404, + "learning_rate": 9.634385682163793e-05, + "loss": 1.3038, + "step": 39892 + }, + { + "epoch": 0.5183913404359345, + "grad_norm": 0.450033962726593, + "learning_rate": 9.634125735972654e-05, + "loss": 1.2095, + "step": 39893 + }, + { + "epoch": 0.5184043349798504, + "grad_norm": 0.2715192437171936, + "learning_rate": 9.633865789781515e-05, + "loss": 1.4236, + "step": 39894 + }, + { + "epoch": 0.5184173295237662, + "grad_norm": 0.40323707461357117, + "learning_rate": 9.633605843590377e-05, + "loss": 1.2899, + "step": 39895 + }, + { + "epoch": 0.5184303240676821, + "grad_norm": 0.46104416251182556, + "learning_rate": 9.633345897399238e-05, + "loss": 1.4659, + "step": 39896 + }, + { + "epoch": 0.518443318611598, + "grad_norm": 0.4568787217140198, + "learning_rate": 9.633085951208101e-05, + "loss": 1.7088, + "step": 39897 + }, + { + "epoch": 0.5184563131555139, + "grad_norm": 0.44228917360305786, + "learning_rate": 9.632826005016962e-05, + "loss": 1.4369, + "step": 39898 + }, + { + "epoch": 0.5184693076994297, + "grad_norm": 0.40326258540153503, + "learning_rate": 9.632566058825823e-05, + "loss": 1.3603, + "step": 39899 + }, + { + "epoch": 0.5184823022433456, + "grad_norm": 0.5430493354797363, + "learning_rate": 9.632306112634684e-05, + "loss": 1.4205, + "step": 39900 + }, + { + "epoch": 0.5184952967872615, + "grad_norm": 0.3709203898906708, + "learning_rate": 9.632046166443547e-05, + "loss": 1.357, + "step": 39901 + }, + { + "epoch": 0.5185082913311774, + "grad_norm": 0.4018144905567169, + "learning_rate": 9.631786220252408e-05, + "loss": 1.2487, + "step": 39902 + }, + { + "epoch": 0.5185212858750932, + "grad_norm": 0.3554931581020355, + "learning_rate": 9.63152627406127e-05, + "loss": 1.3591, + "step": 39903 + }, + { + "epoch": 0.5185342804190091, + "grad_norm": 0.42651045322418213, + "learning_rate": 9.631266327870131e-05, + "loss": 1.3985, + "step": 39904 + }, + { + "epoch": 0.518547274962925, + "grad_norm": 0.35157230496406555, + "learning_rate": 9.631006381678992e-05, + "loss": 1.4667, + "step": 39905 + }, + { + "epoch": 0.5185602695068409, + "grad_norm": 0.4239737391471863, + "learning_rate": 9.630746435487853e-05, + "loss": 1.4522, + "step": 39906 + }, + { + "epoch": 0.5185732640507567, + "grad_norm": 0.3595733046531677, + "learning_rate": 9.630486489296716e-05, + "loss": 1.5159, + "step": 39907 + }, + { + "epoch": 0.5185862585946726, + "grad_norm": 0.39164459705352783, + "learning_rate": 9.630226543105577e-05, + "loss": 1.5744, + "step": 39908 + }, + { + "epoch": 0.5185992531385885, + "grad_norm": 0.35062751173973083, + "learning_rate": 9.62996659691444e-05, + "loss": 1.3477, + "step": 39909 + }, + { + "epoch": 0.5186122476825044, + "grad_norm": 0.5065329670906067, + "learning_rate": 9.6297066507233e-05, + "loss": 1.3549, + "step": 39910 + }, + { + "epoch": 0.5186252422264201, + "grad_norm": 0.41593268513679504, + "learning_rate": 9.629446704532163e-05, + "loss": 1.4537, + "step": 39911 + }, + { + "epoch": 0.518638236770336, + "grad_norm": 0.4158235192298889, + "learning_rate": 9.629186758341023e-05, + "loss": 1.2937, + "step": 39912 + }, + { + "epoch": 0.518651231314252, + "grad_norm": 0.3900696635246277, + "learning_rate": 9.628926812149885e-05, + "loss": 1.5217, + "step": 39913 + }, + { + "epoch": 0.5186642258581678, + "grad_norm": 0.47524160146713257, + "learning_rate": 9.628666865958746e-05, + "loss": 1.5086, + "step": 39914 + }, + { + "epoch": 0.5186772204020836, + "grad_norm": 0.40152740478515625, + "learning_rate": 9.628406919767609e-05, + "loss": 1.5261, + "step": 39915 + }, + { + "epoch": 0.5186902149459995, + "grad_norm": 0.4031531810760498, + "learning_rate": 9.628146973576471e-05, + "loss": 1.2405, + "step": 39916 + }, + { + "epoch": 0.5187032094899154, + "grad_norm": 0.42054039239883423, + "learning_rate": 9.627887027385332e-05, + "loss": 1.3224, + "step": 39917 + }, + { + "epoch": 0.5187162040338313, + "grad_norm": 0.44588547945022583, + "learning_rate": 9.627627081194193e-05, + "loss": 1.3928, + "step": 39918 + }, + { + "epoch": 0.5187291985777471, + "grad_norm": 0.4453255534172058, + "learning_rate": 9.627367135003054e-05, + "loss": 1.4701, + "step": 39919 + }, + { + "epoch": 0.518742193121663, + "grad_norm": 0.4229930341243744, + "learning_rate": 9.627107188811917e-05, + "loss": 1.4649, + "step": 39920 + }, + { + "epoch": 0.5187551876655789, + "grad_norm": 0.32511550188064575, + "learning_rate": 9.626847242620778e-05, + "loss": 1.3025, + "step": 39921 + }, + { + "epoch": 0.5187681822094948, + "grad_norm": 0.46386486291885376, + "learning_rate": 9.62658729642964e-05, + "loss": 1.3834, + "step": 39922 + }, + { + "epoch": 0.5187811767534106, + "grad_norm": 0.43011224269866943, + "learning_rate": 9.626327350238502e-05, + "loss": 1.4239, + "step": 39923 + }, + { + "epoch": 0.5187941712973265, + "grad_norm": 0.4847968518733978, + "learning_rate": 9.626067404047363e-05, + "loss": 1.4064, + "step": 39924 + }, + { + "epoch": 0.5188071658412424, + "grad_norm": 0.3036381006240845, + "learning_rate": 9.625807457856224e-05, + "loss": 1.4301, + "step": 39925 + }, + { + "epoch": 0.5188201603851583, + "grad_norm": 0.5004080533981323, + "learning_rate": 9.625547511665086e-05, + "loss": 1.3712, + "step": 39926 + }, + { + "epoch": 0.5188331549290741, + "grad_norm": 0.4350956082344055, + "learning_rate": 9.625287565473947e-05, + "loss": 1.3599, + "step": 39927 + }, + { + "epoch": 0.51884614947299, + "grad_norm": 0.36283931136131287, + "learning_rate": 9.62502761928281e-05, + "loss": 1.2028, + "step": 39928 + }, + { + "epoch": 0.5188591440169059, + "grad_norm": 0.5054888725280762, + "learning_rate": 9.624767673091671e-05, + "loss": 1.3245, + "step": 39929 + }, + { + "epoch": 0.5188721385608218, + "grad_norm": 0.45617255568504333, + "learning_rate": 9.624507726900532e-05, + "loss": 1.2946, + "step": 39930 + }, + { + "epoch": 0.5188851331047376, + "grad_norm": 0.4867818057537079, + "learning_rate": 9.624247780709393e-05, + "loss": 1.5105, + "step": 39931 + }, + { + "epoch": 0.5188981276486535, + "grad_norm": 0.3775416612625122, + "learning_rate": 9.623987834518255e-05, + "loss": 1.479, + "step": 39932 + }, + { + "epoch": 0.5189111221925694, + "grad_norm": 0.4536754786968231, + "learning_rate": 9.623727888327117e-05, + "loss": 1.3278, + "step": 39933 + }, + { + "epoch": 0.5189241167364853, + "grad_norm": 0.4363205134868622, + "learning_rate": 9.623467942135979e-05, + "loss": 1.5842, + "step": 39934 + }, + { + "epoch": 0.5189371112804011, + "grad_norm": 0.40178975462913513, + "learning_rate": 9.62320799594484e-05, + "loss": 1.3725, + "step": 39935 + }, + { + "epoch": 0.518950105824317, + "grad_norm": 0.34961575269699097, + "learning_rate": 9.622948049753701e-05, + "loss": 1.348, + "step": 39936 + }, + { + "epoch": 0.5189631003682329, + "grad_norm": 0.341223806142807, + "learning_rate": 9.622688103562562e-05, + "loss": 1.2666, + "step": 39937 + }, + { + "epoch": 0.5189760949121488, + "grad_norm": 0.417461097240448, + "learning_rate": 9.622428157371425e-05, + "loss": 1.5806, + "step": 39938 + }, + { + "epoch": 0.5189890894560646, + "grad_norm": 0.39730849862098694, + "learning_rate": 9.622168211180286e-05, + "loss": 1.4125, + "step": 39939 + }, + { + "epoch": 0.5190020839999805, + "grad_norm": 0.4442359209060669, + "learning_rate": 9.621908264989148e-05, + "loss": 1.3605, + "step": 39940 + }, + { + "epoch": 0.5190150785438964, + "grad_norm": 0.41551077365875244, + "learning_rate": 9.62164831879801e-05, + "loss": 1.373, + "step": 39941 + }, + { + "epoch": 0.5190280730878123, + "grad_norm": 0.4439445436000824, + "learning_rate": 9.62138837260687e-05, + "loss": 1.3941, + "step": 39942 + }, + { + "epoch": 0.5190410676317281, + "grad_norm": 0.4588831663131714, + "learning_rate": 9.621128426415732e-05, + "loss": 1.3986, + "step": 39943 + }, + { + "epoch": 0.519054062175644, + "grad_norm": 0.46232840418815613, + "learning_rate": 9.620868480224594e-05, + "loss": 1.4797, + "step": 39944 + }, + { + "epoch": 0.5190670567195599, + "grad_norm": 0.34890320897102356, + "learning_rate": 9.620608534033455e-05, + "loss": 1.4271, + "step": 39945 + }, + { + "epoch": 0.5190800512634758, + "grad_norm": 0.4595262110233307, + "learning_rate": 9.620348587842318e-05, + "loss": 1.4983, + "step": 39946 + }, + { + "epoch": 0.5190930458073916, + "grad_norm": 0.3593679368495941, + "learning_rate": 9.620088641651179e-05, + "loss": 1.182, + "step": 39947 + }, + { + "epoch": 0.5191060403513075, + "grad_norm": 0.413931280374527, + "learning_rate": 9.61982869546004e-05, + "loss": 1.3121, + "step": 39948 + }, + { + "epoch": 0.5191190348952234, + "grad_norm": 0.4017539322376251, + "learning_rate": 9.619568749268901e-05, + "loss": 1.1954, + "step": 39949 + }, + { + "epoch": 0.5191320294391393, + "grad_norm": 0.45622384548187256, + "learning_rate": 9.619308803077763e-05, + "loss": 1.4627, + "step": 39950 + }, + { + "epoch": 0.5191450239830551, + "grad_norm": 0.38428208231925964, + "learning_rate": 9.619048856886624e-05, + "loss": 1.3374, + "step": 39951 + }, + { + "epoch": 0.519158018526971, + "grad_norm": 0.3753044903278351, + "learning_rate": 9.618788910695487e-05, + "loss": 1.3387, + "step": 39952 + }, + { + "epoch": 0.5191710130708869, + "grad_norm": 0.39697080850601196, + "learning_rate": 9.618528964504349e-05, + "loss": 1.3966, + "step": 39953 + }, + { + "epoch": 0.5191840076148028, + "grad_norm": 0.46103984117507935, + "learning_rate": 9.618269018313209e-05, + "loss": 1.3871, + "step": 39954 + }, + { + "epoch": 0.5191970021587186, + "grad_norm": 0.4444766640663147, + "learning_rate": 9.618009072122071e-05, + "loss": 1.3533, + "step": 39955 + }, + { + "epoch": 0.5192099967026345, + "grad_norm": 0.3617517352104187, + "learning_rate": 9.617749125930933e-05, + "loss": 1.485, + "step": 39956 + }, + { + "epoch": 0.5192229912465504, + "grad_norm": 0.4572962522506714, + "learning_rate": 9.617489179739795e-05, + "loss": 1.2551, + "step": 39957 + }, + { + "epoch": 0.5192359857904663, + "grad_norm": 0.4589724838733673, + "learning_rate": 9.617229233548656e-05, + "loss": 1.5344, + "step": 39958 + }, + { + "epoch": 0.519248980334382, + "grad_norm": 0.4317861795425415, + "learning_rate": 9.616969287357519e-05, + "loss": 1.2749, + "step": 39959 + }, + { + "epoch": 0.519261974878298, + "grad_norm": 0.4390122592449188, + "learning_rate": 9.616709341166378e-05, + "loss": 1.4224, + "step": 39960 + }, + { + "epoch": 0.5192749694222139, + "grad_norm": 0.41167759895324707, + "learning_rate": 9.616449394975241e-05, + "loss": 1.3775, + "step": 39961 + }, + { + "epoch": 0.5192879639661298, + "grad_norm": 0.39599475264549255, + "learning_rate": 9.616189448784102e-05, + "loss": 1.3691, + "step": 39962 + }, + { + "epoch": 0.5193009585100455, + "grad_norm": 0.39877018332481384, + "learning_rate": 9.615929502592964e-05, + "loss": 1.516, + "step": 39963 + }, + { + "epoch": 0.5193139530539614, + "grad_norm": 0.3969963788986206, + "learning_rate": 9.615669556401825e-05, + "loss": 1.3909, + "step": 39964 + }, + { + "epoch": 0.5193269475978773, + "grad_norm": 0.358370840549469, + "learning_rate": 9.615409610210688e-05, + "loss": 1.3347, + "step": 39965 + }, + { + "epoch": 0.5193399421417932, + "grad_norm": 0.39317548274993896, + "learning_rate": 9.615149664019548e-05, + "loss": 1.4535, + "step": 39966 + }, + { + "epoch": 0.5193529366857091, + "grad_norm": 0.32412514090538025, + "learning_rate": 9.61488971782841e-05, + "loss": 1.4027, + "step": 39967 + }, + { + "epoch": 0.5193659312296249, + "grad_norm": 0.3520224988460541, + "learning_rate": 9.614629771637271e-05, + "loss": 1.4431, + "step": 39968 + }, + { + "epoch": 0.5193789257735408, + "grad_norm": 0.4667593538761139, + "learning_rate": 9.614369825446134e-05, + "loss": 1.5183, + "step": 39969 + }, + { + "epoch": 0.5193919203174567, + "grad_norm": 0.4510015547275543, + "learning_rate": 9.614109879254995e-05, + "loss": 1.2715, + "step": 39970 + }, + { + "epoch": 0.5194049148613726, + "grad_norm": 0.47441092133522034, + "learning_rate": 9.613849933063857e-05, + "loss": 1.4617, + "step": 39971 + }, + { + "epoch": 0.5194179094052884, + "grad_norm": 0.4711161255836487, + "learning_rate": 9.613589986872718e-05, + "loss": 1.4063, + "step": 39972 + }, + { + "epoch": 0.5194309039492043, + "grad_norm": 0.32250046730041504, + "learning_rate": 9.613330040681579e-05, + "loss": 1.3051, + "step": 39973 + }, + { + "epoch": 0.5194438984931202, + "grad_norm": 0.42298680543899536, + "learning_rate": 9.61307009449044e-05, + "loss": 1.3107, + "step": 39974 + }, + { + "epoch": 0.5194568930370361, + "grad_norm": 0.3625754117965698, + "learning_rate": 9.612810148299303e-05, + "loss": 1.3295, + "step": 39975 + }, + { + "epoch": 0.5194698875809519, + "grad_norm": 0.44999998807907104, + "learning_rate": 9.612550202108164e-05, + "loss": 1.5497, + "step": 39976 + }, + { + "epoch": 0.5194828821248678, + "grad_norm": 0.36292049288749695, + "learning_rate": 9.612290255917026e-05, + "loss": 1.3161, + "step": 39977 + }, + { + "epoch": 0.5194958766687837, + "grad_norm": 0.40168920159339905, + "learning_rate": 9.612030309725887e-05, + "loss": 1.376, + "step": 39978 + }, + { + "epoch": 0.5195088712126996, + "grad_norm": 0.41131851077079773, + "learning_rate": 9.611770363534749e-05, + "loss": 1.4771, + "step": 39979 + }, + { + "epoch": 0.5195218657566154, + "grad_norm": 0.3358024060726166, + "learning_rate": 9.61151041734361e-05, + "loss": 1.4687, + "step": 39980 + }, + { + "epoch": 0.5195348603005313, + "grad_norm": 0.41045722365379333, + "learning_rate": 9.611250471152472e-05, + "loss": 1.512, + "step": 39981 + }, + { + "epoch": 0.5195478548444472, + "grad_norm": 0.37855643033981323, + "learning_rate": 9.610990524961333e-05, + "loss": 1.457, + "step": 39982 + }, + { + "epoch": 0.5195608493883631, + "grad_norm": 0.6064843535423279, + "learning_rate": 9.610730578770196e-05, + "loss": 1.4736, + "step": 39983 + }, + { + "epoch": 0.5195738439322789, + "grad_norm": 0.43973588943481445, + "learning_rate": 9.610470632579057e-05, + "loss": 1.4998, + "step": 39984 + }, + { + "epoch": 0.5195868384761948, + "grad_norm": 0.45167720317840576, + "learning_rate": 9.610210686387918e-05, + "loss": 1.4831, + "step": 39985 + }, + { + "epoch": 0.5195998330201107, + "grad_norm": 0.4005807936191559, + "learning_rate": 9.609950740196779e-05, + "loss": 1.4136, + "step": 39986 + }, + { + "epoch": 0.5196128275640266, + "grad_norm": 0.31736108660697937, + "learning_rate": 9.609690794005641e-05, + "loss": 1.2567, + "step": 39987 + }, + { + "epoch": 0.5196258221079424, + "grad_norm": 0.4036203622817993, + "learning_rate": 9.609430847814502e-05, + "loss": 1.3578, + "step": 39988 + }, + { + "epoch": 0.5196388166518583, + "grad_norm": 0.3702143430709839, + "learning_rate": 9.609170901623365e-05, + "loss": 1.378, + "step": 39989 + }, + { + "epoch": 0.5196518111957742, + "grad_norm": 0.4238418638706207, + "learning_rate": 9.608910955432226e-05, + "loss": 1.537, + "step": 39990 + }, + { + "epoch": 0.5196648057396901, + "grad_norm": 0.43518656492233276, + "learning_rate": 9.608651009241087e-05, + "loss": 1.257, + "step": 39991 + }, + { + "epoch": 0.5196778002836059, + "grad_norm": 0.42913341522216797, + "learning_rate": 9.60839106304995e-05, + "loss": 1.4435, + "step": 39992 + }, + { + "epoch": 0.5196907948275218, + "grad_norm": 0.4853131175041199, + "learning_rate": 9.60813111685881e-05, + "loss": 1.4001, + "step": 39993 + }, + { + "epoch": 0.5197037893714377, + "grad_norm": 0.3982795774936676, + "learning_rate": 9.607871170667673e-05, + "loss": 1.1793, + "step": 39994 + }, + { + "epoch": 0.5197167839153536, + "grad_norm": 0.5129113793373108, + "learning_rate": 9.607611224476534e-05, + "loss": 1.4162, + "step": 39995 + }, + { + "epoch": 0.5197297784592694, + "grad_norm": 0.40881386399269104, + "learning_rate": 9.607351278285395e-05, + "loss": 1.5367, + "step": 39996 + }, + { + "epoch": 0.5197427730031853, + "grad_norm": 0.4761309325695038, + "learning_rate": 9.607091332094256e-05, + "loss": 1.5653, + "step": 39997 + }, + { + "epoch": 0.5197557675471012, + "grad_norm": 0.477521687746048, + "learning_rate": 9.606831385903119e-05, + "loss": 1.2835, + "step": 39998 + }, + { + "epoch": 0.5197687620910171, + "grad_norm": 0.3403060734272003, + "learning_rate": 9.60657143971198e-05, + "loss": 1.0931, + "step": 39999 + }, + { + "epoch": 0.5197817566349329, + "grad_norm": 0.3844358026981354, + "learning_rate": 9.606311493520842e-05, + "loss": 1.5826, + "step": 40000 + } + ], + "logging_steps": 1, + "max_steps": 76955, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.5725484746843685e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}