{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21796183443796985, "eval_steps": 500, "global_step": 2450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.7801254392598194e-05, "grad_norm": 1.3386272192001343, "learning_rate": 1.9999999961857163e-05, "loss": 2.2105, "step": 1 }, { "epoch": 5.560250878519639e-05, "grad_norm": 1.1603680849075317, "learning_rate": 1.9999999847428643e-05, "loss": 2.5266, "step": 2 }, { "epoch": 8.340376317779459e-05, "grad_norm": 1.0131351947784424, "learning_rate": 1.9999999656714445e-05, "loss": 2.323, "step": 3 }, { "epoch": 0.00011120501757039278, "grad_norm": 1.0777212381362915, "learning_rate": 1.9999999389714576e-05, "loss": 2.1585, "step": 4 }, { "epoch": 0.00013900627196299098, "grad_norm": 1.0180584192276, "learning_rate": 1.9999999046429026e-05, "loss": 2.3234, "step": 5 }, { "epoch": 0.00016680752635558918, "grad_norm": 0.9856230616569519, "learning_rate": 1.9999998626857804e-05, "loss": 2.2657, "step": 6 }, { "epoch": 0.00019460878074818735, "grad_norm": 1.0257734060287476, "learning_rate": 1.9999998131000916e-05, "loss": 2.3998, "step": 7 }, { "epoch": 0.00022241003514078555, "grad_norm": 1.112367033958435, "learning_rate": 1.9999997558858363e-05, "loss": 2.4801, "step": 8 }, { "epoch": 0.0002502112895333837, "grad_norm": 1.0615919828414917, "learning_rate": 1.9999996910430152e-05, "loss": 2.0438, "step": 9 }, { "epoch": 0.00027801254392598195, "grad_norm": 1.0746697187423706, "learning_rate": 1.999999618571628e-05, "loss": 2.2881, "step": 10 }, { "epoch": 0.0003058137983185801, "grad_norm": 1.0217502117156982, "learning_rate": 1.9999995384716762e-05, "loss": 2.282, "step": 11 }, { "epoch": 0.00033361505271117835, "grad_norm": 0.9601636528968811, "learning_rate": 1.9999994507431596e-05, "loss": 2.1897, "step": 12 }, { "epoch": 0.00036141630710377653, "grad_norm": 1.0465426445007324, "learning_rate": 1.9999993553860796e-05, "loss": 2.3409, "step": 13 }, { "epoch": 0.0003892175614963747, "grad_norm": 0.9713447690010071, "learning_rate": 1.9999992524004366e-05, "loss": 2.1595, "step": 14 }, { "epoch": 0.00041701881588897293, "grad_norm": 0.9324694275856018, "learning_rate": 1.9999991417862312e-05, "loss": 2.4275, "step": 15 }, { "epoch": 0.0004448200702815711, "grad_norm": 0.9952303171157837, "learning_rate": 1.9999990235434645e-05, "loss": 2.2186, "step": 16 }, { "epoch": 0.0004726213246741693, "grad_norm": 1.0026544332504272, "learning_rate": 1.999998897672137e-05, "loss": 2.4889, "step": 17 }, { "epoch": 0.0005004225790667675, "grad_norm": 0.9121339321136475, "learning_rate": 1.9999987641722504e-05, "loss": 2.1681, "step": 18 }, { "epoch": 0.0005282238334593657, "grad_norm": 1.0004253387451172, "learning_rate": 1.9999986230438054e-05, "loss": 2.3554, "step": 19 }, { "epoch": 0.0005560250878519639, "grad_norm": 0.9249687790870667, "learning_rate": 1.9999984742868025e-05, "loss": 2.3414, "step": 20 }, { "epoch": 0.0005838263422445621, "grad_norm": 0.9404823780059814, "learning_rate": 1.999998317901244e-05, "loss": 1.9877, "step": 21 }, { "epoch": 0.0006116275966371603, "grad_norm": 0.9161513447761536, "learning_rate": 1.99999815388713e-05, "loss": 2.276, "step": 22 }, { "epoch": 0.0006394288510297584, "grad_norm": 0.9387003779411316, "learning_rate": 1.999997982244462e-05, "loss": 2.4599, "step": 23 }, { "epoch": 0.0006672301054223567, "grad_norm": 0.9360546469688416, "learning_rate": 1.9999978029732416e-05, "loss": 2.26, "step": 24 }, { "epoch": 0.0006950313598149549, "grad_norm": 0.9499924182891846, "learning_rate": 1.9999976160734703e-05, "loss": 2.2495, "step": 25 }, { "epoch": 0.0007228326142075531, "grad_norm": 0.9392028450965881, "learning_rate": 1.9999974215451487e-05, "loss": 2.2908, "step": 26 }, { "epoch": 0.0007506338686001512, "grad_norm": 0.932240903377533, "learning_rate": 1.9999972193882794e-05, "loss": 2.4504, "step": 27 }, { "epoch": 0.0007784351229927494, "grad_norm": 0.9948500990867615, "learning_rate": 1.9999970096028633e-05, "loss": 2.5611, "step": 28 }, { "epoch": 0.0008062363773853476, "grad_norm": 0.9813898801803589, "learning_rate": 1.9999967921889018e-05, "loss": 2.1209, "step": 29 }, { "epoch": 0.0008340376317779459, "grad_norm": 0.9719802141189575, "learning_rate": 1.999996567146397e-05, "loss": 2.4723, "step": 30 }, { "epoch": 0.000861838886170544, "grad_norm": 0.9253854751586914, "learning_rate": 1.9999963344753506e-05, "loss": 2.4117, "step": 31 }, { "epoch": 0.0008896401405631422, "grad_norm": 0.9216965436935425, "learning_rate": 1.999996094175764e-05, "loss": 2.046, "step": 32 }, { "epoch": 0.0009174413949557404, "grad_norm": 0.936556339263916, "learning_rate": 1.9999958462476394e-05, "loss": 2.3441, "step": 33 }, { "epoch": 0.0009452426493483386, "grad_norm": 0.9563581347465515, "learning_rate": 1.9999955906909786e-05, "loss": 2.0949, "step": 34 }, { "epoch": 0.0009730439037409368, "grad_norm": 0.9302502870559692, "learning_rate": 1.9999953275057835e-05, "loss": 2.0818, "step": 35 }, { "epoch": 0.001000845158133535, "grad_norm": 0.9053279757499695, "learning_rate": 1.999995056692056e-05, "loss": 2.1552, "step": 36 }, { "epoch": 0.001028646412526133, "grad_norm": 0.9565393924713135, "learning_rate": 1.9999947782497985e-05, "loss": 2.012, "step": 37 }, { "epoch": 0.0010564476669187315, "grad_norm": 0.9636730551719666, "learning_rate": 1.999994492179013e-05, "loss": 2.5128, "step": 38 }, { "epoch": 0.0010842489213113296, "grad_norm": 0.9518397450447083, "learning_rate": 1.9999941984797012e-05, "loss": 2.2401, "step": 39 }, { "epoch": 0.0011120501757039278, "grad_norm": 0.9769637584686279, "learning_rate": 1.9999938971518657e-05, "loss": 2.282, "step": 40 }, { "epoch": 0.001139851430096526, "grad_norm": 0.9409621357917786, "learning_rate": 1.9999935881955093e-05, "loss": 2.2772, "step": 41 }, { "epoch": 0.0011676526844891242, "grad_norm": 0.9893931746482849, "learning_rate": 1.9999932716106335e-05, "loss": 2.5441, "step": 42 }, { "epoch": 0.0011954539388817223, "grad_norm": 0.9539863467216492, "learning_rate": 1.9999929473972414e-05, "loss": 2.1744, "step": 43 }, { "epoch": 0.0012232551932743205, "grad_norm": 0.9548373818397522, "learning_rate": 1.999992615555335e-05, "loss": 1.968, "step": 44 }, { "epoch": 0.0012510564476669187, "grad_norm": 0.9413251876831055, "learning_rate": 1.999992276084917e-05, "loss": 2.0788, "step": 45 }, { "epoch": 0.0012788577020595169, "grad_norm": 0.9034217000007629, "learning_rate": 1.9999919289859902e-05, "loss": 2.119, "step": 46 }, { "epoch": 0.001306658956452115, "grad_norm": 0.9652527570724487, "learning_rate": 1.999991574258557e-05, "loss": 2.3709, "step": 47 }, { "epoch": 0.0013344602108447134, "grad_norm": 0.9130657315254211, "learning_rate": 1.9999912119026196e-05, "loss": 2.0305, "step": 48 }, { "epoch": 0.0013622614652373116, "grad_norm": 0.959320068359375, "learning_rate": 1.999990841918182e-05, "loss": 2.0622, "step": 49 }, { "epoch": 0.0013900627196299098, "grad_norm": 1.0403366088867188, "learning_rate": 1.999990464305246e-05, "loss": 2.1067, "step": 50 }, { "epoch": 0.001417863974022508, "grad_norm": 0.9955949783325195, "learning_rate": 1.9999900790638152e-05, "loss": 2.1958, "step": 51 }, { "epoch": 0.0014456652284151061, "grad_norm": 0.9905858635902405, "learning_rate": 1.999989686193892e-05, "loss": 2.5736, "step": 52 }, { "epoch": 0.0014734664828077043, "grad_norm": 0.9309405088424683, "learning_rate": 1.9999892856954793e-05, "loss": 2.3492, "step": 53 }, { "epoch": 0.0015012677372003025, "grad_norm": 0.9667283296585083, "learning_rate": 1.9999888775685802e-05, "loss": 2.4044, "step": 54 }, { "epoch": 0.0015290689915929006, "grad_norm": 0.9356451034545898, "learning_rate": 1.9999884618131988e-05, "loss": 2.1564, "step": 55 }, { "epoch": 0.0015568702459854988, "grad_norm": 0.9150791168212891, "learning_rate": 1.999988038429337e-05, "loss": 2.045, "step": 56 }, { "epoch": 0.001584671500378097, "grad_norm": 1.039883017539978, "learning_rate": 1.9999876074169985e-05, "loss": 2.4869, "step": 57 }, { "epoch": 0.0016124727547706952, "grad_norm": 0.9824104905128479, "learning_rate": 1.9999871687761868e-05, "loss": 1.8094, "step": 58 }, { "epoch": 0.0016402740091632935, "grad_norm": 0.9961601495742798, "learning_rate": 1.999986722506905e-05, "loss": 2.3793, "step": 59 }, { "epoch": 0.0016680752635558917, "grad_norm": 1.1470301151275635, "learning_rate": 1.9999862686091565e-05, "loss": 2.2851, "step": 60 }, { "epoch": 0.00169587651794849, "grad_norm": 1.0058739185333252, "learning_rate": 1.9999858070829447e-05, "loss": 2.3094, "step": 61 }, { "epoch": 0.001723677772341088, "grad_norm": 1.0498450994491577, "learning_rate": 1.9999853379282735e-05, "loss": 2.2641, "step": 62 }, { "epoch": 0.0017514790267336862, "grad_norm": 0.9721875786781311, "learning_rate": 1.999984861145146e-05, "loss": 2.146, "step": 63 }, { "epoch": 0.0017792802811262844, "grad_norm": 1.0139849185943604, "learning_rate": 1.9999843767335663e-05, "loss": 2.2511, "step": 64 }, { "epoch": 0.0018070815355188826, "grad_norm": 1.0162826776504517, "learning_rate": 1.999983884693538e-05, "loss": 2.2973, "step": 65 }, { "epoch": 0.0018348827899114808, "grad_norm": 1.0020618438720703, "learning_rate": 1.9999833850250644e-05, "loss": 2.3906, "step": 66 }, { "epoch": 0.001862684044304079, "grad_norm": 1.0440468788146973, "learning_rate": 1.99998287772815e-05, "loss": 2.3258, "step": 67 }, { "epoch": 0.0018904852986966771, "grad_norm": 1.0328731536865234, "learning_rate": 1.999982362802798e-05, "loss": 2.1352, "step": 68 }, { "epoch": 0.0019182865530892753, "grad_norm": 1.0517632961273193, "learning_rate": 1.9999818402490125e-05, "loss": 2.3609, "step": 69 }, { "epoch": 0.0019460878074818737, "grad_norm": 1.0437003374099731, "learning_rate": 1.999981310066798e-05, "loss": 2.3489, "step": 70 }, { "epoch": 0.0019738890618744716, "grad_norm": 1.0550068616867065, "learning_rate": 1.9999807722561578e-05, "loss": 2.2211, "step": 71 }, { "epoch": 0.00200169031626707, "grad_norm": 1.0007189512252808, "learning_rate": 1.9999802268170963e-05, "loss": 2.0665, "step": 72 }, { "epoch": 0.002029491570659668, "grad_norm": 0.9840867519378662, "learning_rate": 1.9999796737496178e-05, "loss": 2.1628, "step": 73 }, { "epoch": 0.002057292825052266, "grad_norm": 1.080731749534607, "learning_rate": 1.9999791130537267e-05, "loss": 2.0314, "step": 74 }, { "epoch": 0.0020850940794448648, "grad_norm": 0.961958110332489, "learning_rate": 1.9999785447294267e-05, "loss": 2.0913, "step": 75 }, { "epoch": 0.002112895333837463, "grad_norm": 1.0624992847442627, "learning_rate": 1.9999779687767226e-05, "loss": 2.4478, "step": 76 }, { "epoch": 0.002140696588230061, "grad_norm": 1.0067684650421143, "learning_rate": 1.9999773851956185e-05, "loss": 2.2355, "step": 77 }, { "epoch": 0.0021684978426226593, "grad_norm": 1.041019082069397, "learning_rate": 1.9999767939861193e-05, "loss": 2.0397, "step": 78 }, { "epoch": 0.0021962990970152575, "grad_norm": 1.0057357549667358, "learning_rate": 1.9999761951482292e-05, "loss": 2.3636, "step": 79 }, { "epoch": 0.0022241003514078556, "grad_norm": 1.0208083391189575, "learning_rate": 1.9999755886819526e-05, "loss": 2.2067, "step": 80 }, { "epoch": 0.002251901605800454, "grad_norm": 1.1258176565170288, "learning_rate": 1.9999749745872942e-05, "loss": 2.6462, "step": 81 }, { "epoch": 0.002279702860193052, "grad_norm": 0.9876511096954346, "learning_rate": 1.999974352864259e-05, "loss": 2.197, "step": 82 }, { "epoch": 0.00230750411458565, "grad_norm": 1.0398801565170288, "learning_rate": 1.9999737235128515e-05, "loss": 2.2492, "step": 83 }, { "epoch": 0.0023353053689782483, "grad_norm": 1.1035606861114502, "learning_rate": 1.9999730865330763e-05, "loss": 2.1908, "step": 84 }, { "epoch": 0.0023631066233708465, "grad_norm": 1.0301669836044312, "learning_rate": 1.9999724419249386e-05, "loss": 2.1138, "step": 85 }, { "epoch": 0.0023909078777634447, "grad_norm": 1.0599033832550049, "learning_rate": 1.9999717896884432e-05, "loss": 2.3766, "step": 86 }, { "epoch": 0.002418709132156043, "grad_norm": 1.0322849750518799, "learning_rate": 1.9999711298235955e-05, "loss": 2.242, "step": 87 }, { "epoch": 0.002446510386548641, "grad_norm": 1.100713849067688, "learning_rate": 1.9999704623303996e-05, "loss": 2.5872, "step": 88 }, { "epoch": 0.002474311640941239, "grad_norm": 1.0419594049453735, "learning_rate": 1.999969787208861e-05, "loss": 2.0488, "step": 89 }, { "epoch": 0.0025021128953338374, "grad_norm": 1.04029381275177, "learning_rate": 1.9999691044589854e-05, "loss": 2.0487, "step": 90 }, { "epoch": 0.0025299141497264355, "grad_norm": 1.0287145376205444, "learning_rate": 1.9999684140807773e-05, "loss": 2.3412, "step": 91 }, { "epoch": 0.0025577154041190337, "grad_norm": 1.1112746000289917, "learning_rate": 1.9999677160742426e-05, "loss": 2.3723, "step": 92 }, { "epoch": 0.002585516658511632, "grad_norm": 1.0892740488052368, "learning_rate": 1.999967010439386e-05, "loss": 2.5003, "step": 93 }, { "epoch": 0.00261331791290423, "grad_norm": 1.0428860187530518, "learning_rate": 1.9999662971762127e-05, "loss": 2.1988, "step": 94 }, { "epoch": 0.0026411191672968282, "grad_norm": 1.0085920095443726, "learning_rate": 1.9999655762847293e-05, "loss": 2.0561, "step": 95 }, { "epoch": 0.002668920421689427, "grad_norm": 1.0214283466339111, "learning_rate": 1.99996484776494e-05, "loss": 2.1516, "step": 96 }, { "epoch": 0.002696721676082025, "grad_norm": 1.0548397302627563, "learning_rate": 1.999964111616851e-05, "loss": 2.1519, "step": 97 }, { "epoch": 0.002724522930474623, "grad_norm": 1.1266776323318481, "learning_rate": 1.9999633678404684e-05, "loss": 2.4213, "step": 98 }, { "epoch": 0.0027523241848672214, "grad_norm": 1.0543451309204102, "learning_rate": 1.999962616435797e-05, "loss": 2.049, "step": 99 }, { "epoch": 0.0027801254392598195, "grad_norm": 1.0210249423980713, "learning_rate": 1.999961857402843e-05, "loss": 2.13, "step": 100 }, { "epoch": 0.0028079266936524177, "grad_norm": 0.9730202555656433, "learning_rate": 1.9999610907416118e-05, "loss": 1.9835, "step": 101 }, { "epoch": 0.002835727948045016, "grad_norm": 1.1717532873153687, "learning_rate": 1.99996031645211e-05, "loss": 2.2918, "step": 102 }, { "epoch": 0.002863529202437614, "grad_norm": 1.0638781785964966, "learning_rate": 1.9999595345343424e-05, "loss": 2.2488, "step": 103 }, { "epoch": 0.0028913304568302122, "grad_norm": 0.984815776348114, "learning_rate": 1.9999587449883154e-05, "loss": 2.3585, "step": 104 }, { "epoch": 0.0029191317112228104, "grad_norm": 1.1235684156417847, "learning_rate": 1.9999579478140358e-05, "loss": 2.2554, "step": 105 }, { "epoch": 0.0029469329656154086, "grad_norm": 1.0674397945404053, "learning_rate": 1.999957143011509e-05, "loss": 2.257, "step": 106 }, { "epoch": 0.0029747342200080067, "grad_norm": 1.0589383840560913, "learning_rate": 1.999956330580741e-05, "loss": 2.2385, "step": 107 }, { "epoch": 0.003002535474400605, "grad_norm": 1.2297029495239258, "learning_rate": 1.9999555105217383e-05, "loss": 2.3857, "step": 108 }, { "epoch": 0.003030336728793203, "grad_norm": 1.1495815515518188, "learning_rate": 1.9999546828345072e-05, "loss": 2.1412, "step": 109 }, { "epoch": 0.0030581379831858013, "grad_norm": 1.172291874885559, "learning_rate": 1.9999538475190533e-05, "loss": 2.4039, "step": 110 }, { "epoch": 0.0030859392375783994, "grad_norm": 1.081527829170227, "learning_rate": 1.9999530045753843e-05, "loss": 2.1043, "step": 111 }, { "epoch": 0.0031137404919709976, "grad_norm": 1.0823593139648438, "learning_rate": 1.9999521540035056e-05, "loss": 2.1938, "step": 112 }, { "epoch": 0.003141541746363596, "grad_norm": 1.0946345329284668, "learning_rate": 1.999951295803424e-05, "loss": 2.4797, "step": 113 }, { "epoch": 0.003169343000756194, "grad_norm": 1.1632286310195923, "learning_rate": 1.999950429975146e-05, "loss": 2.3821, "step": 114 }, { "epoch": 0.003197144255148792, "grad_norm": 1.115511417388916, "learning_rate": 1.999949556518678e-05, "loss": 2.3643, "step": 115 }, { "epoch": 0.0032249455095413903, "grad_norm": 1.1243259906768799, "learning_rate": 1.999948675434027e-05, "loss": 2.4053, "step": 116 }, { "epoch": 0.0032527467639339885, "grad_norm": 1.0602004528045654, "learning_rate": 1.9999477867211997e-05, "loss": 2.2727, "step": 117 }, { "epoch": 0.003280548018326587, "grad_norm": 1.0885320901870728, "learning_rate": 1.999946890380203e-05, "loss": 2.3424, "step": 118 }, { "epoch": 0.0033083492727191853, "grad_norm": 1.2160863876342773, "learning_rate": 1.9999459864110434e-05, "loss": 2.4823, "step": 119 }, { "epoch": 0.0033361505271117834, "grad_norm": 1.073218584060669, "learning_rate": 1.9999450748137277e-05, "loss": 2.2351, "step": 120 }, { "epoch": 0.0033639517815043816, "grad_norm": 1.0930808782577515, "learning_rate": 1.9999441555882633e-05, "loss": 2.2819, "step": 121 }, { "epoch": 0.00339175303589698, "grad_norm": 1.147673487663269, "learning_rate": 1.9999432287346567e-05, "loss": 2.4726, "step": 122 }, { "epoch": 0.003419554290289578, "grad_norm": 1.1316108703613281, "learning_rate": 1.9999422942529157e-05, "loss": 2.3182, "step": 123 }, { "epoch": 0.003447355544682176, "grad_norm": 1.036102533340454, "learning_rate": 1.9999413521430466e-05, "loss": 2.2087, "step": 124 }, { "epoch": 0.0034751567990747743, "grad_norm": 1.0413788557052612, "learning_rate": 1.9999404024050573e-05, "loss": 2.2349, "step": 125 }, { "epoch": 0.0035029580534673725, "grad_norm": 1.173976182937622, "learning_rate": 1.9999394450389543e-05, "loss": 2.0392, "step": 126 }, { "epoch": 0.0035307593078599707, "grad_norm": 1.1466434001922607, "learning_rate": 1.9999384800447456e-05, "loss": 2.3033, "step": 127 }, { "epoch": 0.003558560562252569, "grad_norm": 1.1636332273483276, "learning_rate": 1.9999375074224384e-05, "loss": 2.2482, "step": 128 }, { "epoch": 0.003586361816645167, "grad_norm": 1.161712408065796, "learning_rate": 1.99993652717204e-05, "loss": 2.259, "step": 129 }, { "epoch": 0.003614163071037765, "grad_norm": 1.1004548072814941, "learning_rate": 1.9999355392935576e-05, "loss": 2.1649, "step": 130 }, { "epoch": 0.0036419643254303634, "grad_norm": 1.095963954925537, "learning_rate": 1.9999345437869992e-05, "loss": 2.1103, "step": 131 }, { "epoch": 0.0036697655798229615, "grad_norm": 1.0763362646102905, "learning_rate": 1.9999335406523725e-05, "loss": 2.5131, "step": 132 }, { "epoch": 0.0036975668342155597, "grad_norm": 1.116161823272705, "learning_rate": 1.9999325298896845e-05, "loss": 2.1684, "step": 133 }, { "epoch": 0.003725368088608158, "grad_norm": 1.0859335660934448, "learning_rate": 1.999931511498943e-05, "loss": 2.1617, "step": 134 }, { "epoch": 0.003753169343000756, "grad_norm": 1.1181995868682861, "learning_rate": 1.9999304854801565e-05, "loss": 2.0858, "step": 135 }, { "epoch": 0.0037809705973933542, "grad_norm": 1.156731128692627, "learning_rate": 1.9999294518333322e-05, "loss": 2.0743, "step": 136 }, { "epoch": 0.0038087718517859524, "grad_norm": 1.0622448921203613, "learning_rate": 1.9999284105584784e-05, "loss": 2.2431, "step": 137 }, { "epoch": 0.0038365731061785506, "grad_norm": 1.1988381147384644, "learning_rate": 1.9999273616556024e-05, "loss": 2.2102, "step": 138 }, { "epoch": 0.003864374360571149, "grad_norm": 1.1849607229232788, "learning_rate": 1.9999263051247132e-05, "loss": 2.4513, "step": 139 }, { "epoch": 0.0038921756149637473, "grad_norm": 1.1514637470245361, "learning_rate": 1.999925240965818e-05, "loss": 2.4173, "step": 140 }, { "epoch": 0.003919976869356345, "grad_norm": 1.1059222221374512, "learning_rate": 1.999924169178925e-05, "loss": 2.3223, "step": 141 }, { "epoch": 0.003947778123748943, "grad_norm": 1.1028525829315186, "learning_rate": 1.9999230897640423e-05, "loss": 2.3995, "step": 142 }, { "epoch": 0.0039755793781415414, "grad_norm": 1.1293936967849731, "learning_rate": 1.999922002721179e-05, "loss": 2.0917, "step": 143 }, { "epoch": 0.00400338063253414, "grad_norm": 1.1188113689422607, "learning_rate": 1.9999209080503422e-05, "loss": 2.1301, "step": 144 }, { "epoch": 0.004031181886926738, "grad_norm": 1.1235722303390503, "learning_rate": 1.9999198057515415e-05, "loss": 2.1448, "step": 145 }, { "epoch": 0.004058983141319336, "grad_norm": 1.1035553216934204, "learning_rate": 1.9999186958247843e-05, "loss": 2.107, "step": 146 }, { "epoch": 0.004086784395711934, "grad_norm": 1.1216835975646973, "learning_rate": 1.9999175782700793e-05, "loss": 2.2124, "step": 147 }, { "epoch": 0.004114585650104532, "grad_norm": 1.1839274168014526, "learning_rate": 1.999916453087435e-05, "loss": 2.1785, "step": 148 }, { "epoch": 0.0041423869044971305, "grad_norm": 1.1069618463516235, "learning_rate": 1.9999153202768603e-05, "loss": 2.0667, "step": 149 }, { "epoch": 0.0041701881588897295, "grad_norm": 1.1783491373062134, "learning_rate": 1.999914179838364e-05, "loss": 2.496, "step": 150 }, { "epoch": 0.004197989413282328, "grad_norm": 1.1272366046905518, "learning_rate": 1.9999130317719543e-05, "loss": 1.9779, "step": 151 }, { "epoch": 0.004225790667674926, "grad_norm": 1.1723352670669556, "learning_rate": 1.99991187607764e-05, "loss": 2.2702, "step": 152 }, { "epoch": 0.004253591922067524, "grad_norm": 1.1620469093322754, "learning_rate": 1.9999107127554297e-05, "loss": 2.263, "step": 153 }, { "epoch": 0.004281393176460122, "grad_norm": 1.1044648885726929, "learning_rate": 1.999909541805333e-05, "loss": 2.155, "step": 154 }, { "epoch": 0.00430919443085272, "grad_norm": 1.1630659103393555, "learning_rate": 1.9999083632273584e-05, "loss": 2.0963, "step": 155 }, { "epoch": 0.0043369956852453186, "grad_norm": 1.1296038627624512, "learning_rate": 1.9999071770215154e-05, "loss": 2.3582, "step": 156 }, { "epoch": 0.004364796939637917, "grad_norm": 1.0691393613815308, "learning_rate": 1.999905983187812e-05, "loss": 2.0368, "step": 157 }, { "epoch": 0.004392598194030515, "grad_norm": 1.0658721923828125, "learning_rate": 1.999904781726258e-05, "loss": 2.0881, "step": 158 }, { "epoch": 0.004420399448423113, "grad_norm": 1.1333595514297485, "learning_rate": 1.9999035726368623e-05, "loss": 2.3787, "step": 159 }, { "epoch": 0.004448200702815711, "grad_norm": 1.1318788528442383, "learning_rate": 1.9999023559196345e-05, "loss": 2.4706, "step": 160 }, { "epoch": 0.004476001957208309, "grad_norm": 1.1712559461593628, "learning_rate": 1.9999011315745835e-05, "loss": 2.0052, "step": 161 }, { "epoch": 0.004503803211600908, "grad_norm": 1.1999025344848633, "learning_rate": 1.999899899601719e-05, "loss": 2.3613, "step": 162 }, { "epoch": 0.004531604465993506, "grad_norm": 1.1536388397216797, "learning_rate": 1.99989866000105e-05, "loss": 2.4171, "step": 163 }, { "epoch": 0.004559405720386104, "grad_norm": 1.190962791442871, "learning_rate": 1.999897412772586e-05, "loss": 2.4344, "step": 164 }, { "epoch": 0.004587206974778702, "grad_norm": 1.157014012336731, "learning_rate": 1.999896157916337e-05, "loss": 2.1682, "step": 165 }, { "epoch": 0.0046150082291713, "grad_norm": 1.1295533180236816, "learning_rate": 1.9998948954323125e-05, "loss": 2.5532, "step": 166 }, { "epoch": 0.0046428094835638985, "grad_norm": 1.1318200826644897, "learning_rate": 1.9998936253205213e-05, "loss": 2.2569, "step": 167 }, { "epoch": 0.004670610737956497, "grad_norm": 1.175905466079712, "learning_rate": 1.9998923475809742e-05, "loss": 2.1654, "step": 168 }, { "epoch": 0.004698411992349095, "grad_norm": 1.1348875761032104, "learning_rate": 1.99989106221368e-05, "loss": 2.1114, "step": 169 }, { "epoch": 0.004726213246741693, "grad_norm": 1.1247903108596802, "learning_rate": 1.9998897692186493e-05, "loss": 2.3323, "step": 170 }, { "epoch": 0.004754014501134291, "grad_norm": 1.2056310176849365, "learning_rate": 1.999888468595891e-05, "loss": 2.0983, "step": 171 }, { "epoch": 0.004781815755526889, "grad_norm": 1.1028159856796265, "learning_rate": 1.9998871603454163e-05, "loss": 2.2841, "step": 172 }, { "epoch": 0.0048096170099194875, "grad_norm": 1.1404368877410889, "learning_rate": 1.999885844467234e-05, "loss": 2.1063, "step": 173 }, { "epoch": 0.004837418264312086, "grad_norm": 1.0591777563095093, "learning_rate": 1.9998845209613548e-05, "loss": 1.913, "step": 174 }, { "epoch": 0.004865219518704684, "grad_norm": 1.1495721340179443, "learning_rate": 1.9998831898277885e-05, "loss": 2.1078, "step": 175 }, { "epoch": 0.004893020773097282, "grad_norm": 1.1986558437347412, "learning_rate": 1.9998818510665456e-05, "loss": 2.0418, "step": 176 }, { "epoch": 0.00492082202748988, "grad_norm": 1.1243244409561157, "learning_rate": 1.9998805046776357e-05, "loss": 2.1728, "step": 177 }, { "epoch": 0.004948623281882478, "grad_norm": 1.0615439414978027, "learning_rate": 1.9998791506610698e-05, "loss": 2.4303, "step": 178 }, { "epoch": 0.0049764245362750766, "grad_norm": 1.1140798330307007, "learning_rate": 1.9998777890168577e-05, "loss": 2.1822, "step": 179 }, { "epoch": 0.005004225790667675, "grad_norm": 1.1956887245178223, "learning_rate": 1.99987641974501e-05, "loss": 2.1391, "step": 180 }, { "epoch": 0.005032027045060273, "grad_norm": 1.246429443359375, "learning_rate": 1.9998750428455373e-05, "loss": 2.4827, "step": 181 }, { "epoch": 0.005059828299452871, "grad_norm": 1.2714473009109497, "learning_rate": 1.9998736583184496e-05, "loss": 2.2301, "step": 182 }, { "epoch": 0.005087629553845469, "grad_norm": 1.1799248456954956, "learning_rate": 1.999872266163758e-05, "loss": 2.2893, "step": 183 }, { "epoch": 0.005115430808238067, "grad_norm": 1.1697207689285278, "learning_rate": 1.999870866381473e-05, "loss": 2.6932, "step": 184 }, { "epoch": 0.005143232062630666, "grad_norm": 1.1122878789901733, "learning_rate": 1.9998694589716046e-05, "loss": 2.14, "step": 185 }, { "epoch": 0.005171033317023264, "grad_norm": 1.2276055812835693, "learning_rate": 1.9998680439341645e-05, "loss": 2.1487, "step": 186 }, { "epoch": 0.005198834571415862, "grad_norm": 1.2141193151474, "learning_rate": 1.9998666212691636e-05, "loss": 2.1219, "step": 187 }, { "epoch": 0.00522663582580846, "grad_norm": 1.1240246295928955, "learning_rate": 1.999865190976612e-05, "loss": 2.3281, "step": 188 }, { "epoch": 0.005254437080201058, "grad_norm": 1.2413674592971802, "learning_rate": 1.9998637530565202e-05, "loss": 2.1402, "step": 189 }, { "epoch": 0.0052822383345936565, "grad_norm": 1.1016952991485596, "learning_rate": 1.9998623075089005e-05, "loss": 2.0606, "step": 190 }, { "epoch": 0.005310039588986255, "grad_norm": 1.119667887687683, "learning_rate": 1.9998608543337633e-05, "loss": 2.3831, "step": 191 }, { "epoch": 0.005337840843378854, "grad_norm": 1.1289544105529785, "learning_rate": 1.9998593935311195e-05, "loss": 2.2553, "step": 192 }, { "epoch": 0.005365642097771452, "grad_norm": 1.26864755153656, "learning_rate": 1.9998579251009806e-05, "loss": 2.135, "step": 193 }, { "epoch": 0.00539344335216405, "grad_norm": 1.1740165948867798, "learning_rate": 1.999856449043357e-05, "loss": 2.4579, "step": 194 }, { "epoch": 0.005421244606556648, "grad_norm": 1.186118245124817, "learning_rate": 1.9998549653582612e-05, "loss": 2.1699, "step": 195 }, { "epoch": 0.005449045860949246, "grad_norm": 1.411658525466919, "learning_rate": 1.9998534740457037e-05, "loss": 2.3127, "step": 196 }, { "epoch": 0.0054768471153418445, "grad_norm": 1.1610398292541504, "learning_rate": 1.999851975105696e-05, "loss": 2.1869, "step": 197 }, { "epoch": 0.005504648369734443, "grad_norm": 1.1175934076309204, "learning_rate": 1.9998504685382496e-05, "loss": 2.3615, "step": 198 }, { "epoch": 0.005532449624127041, "grad_norm": 1.1365805864334106, "learning_rate": 1.999848954343376e-05, "loss": 2.3088, "step": 199 }, { "epoch": 0.005560250878519639, "grad_norm": 1.2173691987991333, "learning_rate": 1.9998474325210867e-05, "loss": 2.4849, "step": 200 }, { "epoch": 0.005588052132912237, "grad_norm": 1.2359378337860107, "learning_rate": 1.9998459030713936e-05, "loss": 2.4016, "step": 201 }, { "epoch": 0.005615853387304835, "grad_norm": 1.1392202377319336, "learning_rate": 1.9998443659943077e-05, "loss": 2.0882, "step": 202 }, { "epoch": 0.005643654641697434, "grad_norm": 1.159311056137085, "learning_rate": 1.9998428212898412e-05, "loss": 2.2791, "step": 203 }, { "epoch": 0.005671455896090032, "grad_norm": 1.1994978189468384, "learning_rate": 1.999841268958006e-05, "loss": 2.0961, "step": 204 }, { "epoch": 0.00569925715048263, "grad_norm": 1.1242446899414062, "learning_rate": 1.9998397089988137e-05, "loss": 2.1686, "step": 205 }, { "epoch": 0.005727058404875228, "grad_norm": 1.2168036699295044, "learning_rate": 1.9998381414122763e-05, "loss": 2.2435, "step": 206 }, { "epoch": 0.005754859659267826, "grad_norm": 1.148865818977356, "learning_rate": 1.9998365661984057e-05, "loss": 2.2307, "step": 207 }, { "epoch": 0.0057826609136604245, "grad_norm": 1.2594053745269775, "learning_rate": 1.9998349833572142e-05, "loss": 2.1741, "step": 208 }, { "epoch": 0.005810462168053023, "grad_norm": 1.1274986267089844, "learning_rate": 1.9998333928887136e-05, "loss": 2.1917, "step": 209 }, { "epoch": 0.005838263422445621, "grad_norm": 1.2244343757629395, "learning_rate": 1.9998317947929155e-05, "loss": 1.9637, "step": 210 }, { "epoch": 0.005866064676838219, "grad_norm": 1.136908769607544, "learning_rate": 1.999830189069833e-05, "loss": 2.2904, "step": 211 }, { "epoch": 0.005893865931230817, "grad_norm": 1.1897549629211426, "learning_rate": 1.999828575719478e-05, "loss": 2.2452, "step": 212 }, { "epoch": 0.005921667185623415, "grad_norm": 1.1481176614761353, "learning_rate": 1.9998269547418627e-05, "loss": 2.0026, "step": 213 }, { "epoch": 0.0059494684400160135, "grad_norm": 1.2160489559173584, "learning_rate": 1.9998253261369996e-05, "loss": 1.9002, "step": 214 }, { "epoch": 0.005977269694408612, "grad_norm": 1.1582677364349365, "learning_rate": 1.9998236899049013e-05, "loss": 2.1698, "step": 215 }, { "epoch": 0.00600507094880121, "grad_norm": 1.158765435218811, "learning_rate": 1.99982204604558e-05, "loss": 2.2893, "step": 216 }, { "epoch": 0.006032872203193808, "grad_norm": 1.133265733718872, "learning_rate": 1.999820394559048e-05, "loss": 2.3633, "step": 217 }, { "epoch": 0.006060673457586406, "grad_norm": 1.2613697052001953, "learning_rate": 1.9998187354453184e-05, "loss": 2.23, "step": 218 }, { "epoch": 0.006088474711979004, "grad_norm": 1.1500599384307861, "learning_rate": 1.999817068704404e-05, "loss": 2.3103, "step": 219 }, { "epoch": 0.0061162759663716025, "grad_norm": 1.1793016195297241, "learning_rate": 1.9998153943363168e-05, "loss": 2.3407, "step": 220 }, { "epoch": 0.006144077220764201, "grad_norm": 1.0700280666351318, "learning_rate": 1.99981371234107e-05, "loss": 2.133, "step": 221 }, { "epoch": 0.006171878475156799, "grad_norm": 1.2073957920074463, "learning_rate": 1.9998120227186765e-05, "loss": 2.1209, "step": 222 }, { "epoch": 0.006199679729549397, "grad_norm": 1.169073224067688, "learning_rate": 1.9998103254691487e-05, "loss": 2.1885, "step": 223 }, { "epoch": 0.006227480983941995, "grad_norm": 1.128422498703003, "learning_rate": 1.9998086205925005e-05, "loss": 2.5044, "step": 224 }, { "epoch": 0.006255282238334593, "grad_norm": 1.1118581295013428, "learning_rate": 1.9998069080887438e-05, "loss": 2.4033, "step": 225 }, { "epoch": 0.006283083492727192, "grad_norm": 1.2448819875717163, "learning_rate": 1.9998051879578925e-05, "loss": 2.0687, "step": 226 }, { "epoch": 0.00631088474711979, "grad_norm": 1.1881904602050781, "learning_rate": 1.9998034601999597e-05, "loss": 2.045, "step": 227 }, { "epoch": 0.006338686001512388, "grad_norm": 1.1464701890945435, "learning_rate": 1.9998017248149578e-05, "loss": 2.2586, "step": 228 }, { "epoch": 0.006366487255904986, "grad_norm": 1.2277913093566895, "learning_rate": 1.9997999818029006e-05, "loss": 2.2768, "step": 229 }, { "epoch": 0.006394288510297584, "grad_norm": 1.1781984567642212, "learning_rate": 1.9997982311638012e-05, "loss": 2.2699, "step": 230 }, { "epoch": 0.0064220897646901825, "grad_norm": 1.2156535387039185, "learning_rate": 1.9997964728976736e-05, "loss": 2.4397, "step": 231 }, { "epoch": 0.006449891019082781, "grad_norm": 1.1331546306610107, "learning_rate": 1.9997947070045304e-05, "loss": 2.4782, "step": 232 }, { "epoch": 0.006477692273475379, "grad_norm": 1.3272883892059326, "learning_rate": 1.9997929334843853e-05, "loss": 1.8357, "step": 233 }, { "epoch": 0.006505493527867977, "grad_norm": 1.2408573627471924, "learning_rate": 1.999791152337252e-05, "loss": 2.2326, "step": 234 }, { "epoch": 0.006533294782260576, "grad_norm": 1.2071201801300049, "learning_rate": 1.999789363563144e-05, "loss": 2.3973, "step": 235 }, { "epoch": 0.006561096036653174, "grad_norm": 1.2091608047485352, "learning_rate": 1.9997875671620747e-05, "loss": 2.222, "step": 236 }, { "epoch": 0.006588897291045772, "grad_norm": 1.1463544368743896, "learning_rate": 1.9997857631340584e-05, "loss": 2.0699, "step": 237 }, { "epoch": 0.0066166985454383705, "grad_norm": 1.2473269701004028, "learning_rate": 1.999783951479108e-05, "loss": 2.1138, "step": 238 }, { "epoch": 0.006644499799830969, "grad_norm": 1.2098089456558228, "learning_rate": 1.9997821321972383e-05, "loss": 2.1912, "step": 239 }, { "epoch": 0.006672301054223567, "grad_norm": 1.1255178451538086, "learning_rate": 1.9997803052884626e-05, "loss": 2.2018, "step": 240 }, { "epoch": 0.006700102308616165, "grad_norm": 1.0956746339797974, "learning_rate": 1.999778470752795e-05, "loss": 2.262, "step": 241 }, { "epoch": 0.006727903563008763, "grad_norm": 1.12589430809021, "learning_rate": 1.999776628590249e-05, "loss": 2.3262, "step": 242 }, { "epoch": 0.006755704817401361, "grad_norm": 1.1384801864624023, "learning_rate": 1.9997747788008393e-05, "loss": 2.2085, "step": 243 }, { "epoch": 0.00678350607179396, "grad_norm": 1.1941229104995728, "learning_rate": 1.99977292138458e-05, "loss": 1.8827, "step": 244 }, { "epoch": 0.006811307326186558, "grad_norm": 1.1836212873458862, "learning_rate": 1.9997710563414848e-05, "loss": 2.1355, "step": 245 }, { "epoch": 0.006839108580579156, "grad_norm": 1.1552331447601318, "learning_rate": 1.9997691836715685e-05, "loss": 2.1136, "step": 246 }, { "epoch": 0.006866909834971754, "grad_norm": 1.1576809883117676, "learning_rate": 1.999767303374845e-05, "loss": 1.8707, "step": 247 }, { "epoch": 0.006894711089364352, "grad_norm": 1.217590093612671, "learning_rate": 1.9997654154513288e-05, "loss": 1.9808, "step": 248 }, { "epoch": 0.0069225123437569504, "grad_norm": 1.0830674171447754, "learning_rate": 1.9997635199010336e-05, "loss": 2.2103, "step": 249 }, { "epoch": 0.006950313598149549, "grad_norm": 1.3201905488967896, "learning_rate": 1.999761616723975e-05, "loss": 2.0579, "step": 250 }, { "epoch": 0.006978114852542147, "grad_norm": 1.1891950368881226, "learning_rate": 1.999759705920167e-05, "loss": 2.2141, "step": 251 }, { "epoch": 0.007005916106934745, "grad_norm": 1.2047144174575806, "learning_rate": 1.9997577874896242e-05, "loss": 2.1733, "step": 252 }, { "epoch": 0.007033717361327343, "grad_norm": 1.1163572072982788, "learning_rate": 1.999755861432361e-05, "loss": 2.2826, "step": 253 }, { "epoch": 0.007061518615719941, "grad_norm": 1.3825294971466064, "learning_rate": 1.9997539277483924e-05, "loss": 2.1507, "step": 254 }, { "epoch": 0.0070893198701125395, "grad_norm": 1.4220788478851318, "learning_rate": 1.999751986437733e-05, "loss": 2.0164, "step": 255 }, { "epoch": 0.007117121124505138, "grad_norm": 1.1253244876861572, "learning_rate": 1.999750037500398e-05, "loss": 2.1497, "step": 256 }, { "epoch": 0.007144922378897736, "grad_norm": 1.1598966121673584, "learning_rate": 1.9997480809364015e-05, "loss": 2.2188, "step": 257 }, { "epoch": 0.007172723633290334, "grad_norm": 1.1915944814682007, "learning_rate": 1.999746116745759e-05, "loss": 2.0826, "step": 258 }, { "epoch": 0.007200524887682932, "grad_norm": 1.2803897857666016, "learning_rate": 1.9997441449284854e-05, "loss": 2.2626, "step": 259 }, { "epoch": 0.00722832614207553, "grad_norm": 1.1314741373062134, "learning_rate": 1.9997421654845956e-05, "loss": 2.1059, "step": 260 }, { "epoch": 0.0072561273964681285, "grad_norm": 1.1593464612960815, "learning_rate": 1.9997401784141048e-05, "loss": 2.2332, "step": 261 }, { "epoch": 0.007283928650860727, "grad_norm": 1.1185193061828613, "learning_rate": 1.9997381837170283e-05, "loss": 2.1661, "step": 262 }, { "epoch": 0.007311729905253325, "grad_norm": 1.2017245292663574, "learning_rate": 1.999736181393381e-05, "loss": 2.3657, "step": 263 }, { "epoch": 0.007339531159645923, "grad_norm": 1.2290972471237183, "learning_rate": 1.9997341714431784e-05, "loss": 2.3996, "step": 264 }, { "epoch": 0.007367332414038521, "grad_norm": 1.1724745035171509, "learning_rate": 1.9997321538664355e-05, "loss": 2.2115, "step": 265 }, { "epoch": 0.007395133668431119, "grad_norm": 1.1552612781524658, "learning_rate": 1.9997301286631682e-05, "loss": 1.8123, "step": 266 }, { "epoch": 0.007422934922823718, "grad_norm": 1.200886845588684, "learning_rate": 1.999728095833392e-05, "loss": 2.4026, "step": 267 }, { "epoch": 0.007450736177216316, "grad_norm": 1.148847222328186, "learning_rate": 1.9997260553771218e-05, "loss": 1.7639, "step": 268 }, { "epoch": 0.007478537431608914, "grad_norm": 1.2669720649719238, "learning_rate": 1.9997240072943735e-05, "loss": 2.2422, "step": 269 }, { "epoch": 0.007506338686001512, "grad_norm": 1.2966080904006958, "learning_rate": 1.9997219515851628e-05, "loss": 2.3122, "step": 270 }, { "epoch": 0.00753413994039411, "grad_norm": 1.333851933479309, "learning_rate": 1.999719888249505e-05, "loss": 2.3629, "step": 271 }, { "epoch": 0.0075619411947867084, "grad_norm": 1.2224901914596558, "learning_rate": 1.9997178172874165e-05, "loss": 2.3491, "step": 272 }, { "epoch": 0.007589742449179307, "grad_norm": 1.236270546913147, "learning_rate": 1.9997157386989124e-05, "loss": 2.0446, "step": 273 }, { "epoch": 0.007617543703571905, "grad_norm": 1.2419363260269165, "learning_rate": 1.9997136524840093e-05, "loss": 2.3756, "step": 274 }, { "epoch": 0.007645344957964503, "grad_norm": 1.180742859840393, "learning_rate": 1.9997115586427225e-05, "loss": 2.3076, "step": 275 }, { "epoch": 0.007673146212357101, "grad_norm": 1.2005637884140015, "learning_rate": 1.999709457175068e-05, "loss": 2.2688, "step": 276 }, { "epoch": 0.0077009474667497, "grad_norm": 1.264400839805603, "learning_rate": 1.9997073480810624e-05, "loss": 2.1371, "step": 277 }, { "epoch": 0.007728748721142298, "grad_norm": 1.2124438285827637, "learning_rate": 1.999705231360721e-05, "loss": 2.14, "step": 278 }, { "epoch": 0.0077565499755348965, "grad_norm": 1.192994475364685, "learning_rate": 1.9997031070140603e-05, "loss": 2.0863, "step": 279 }, { "epoch": 0.007784351229927495, "grad_norm": 1.2407389879226685, "learning_rate": 1.9997009750410967e-05, "loss": 2.3014, "step": 280 }, { "epoch": 0.007812152484320093, "grad_norm": 1.179093360900879, "learning_rate": 1.9996988354418466e-05, "loss": 2.4778, "step": 281 }, { "epoch": 0.00783995373871269, "grad_norm": 1.1964751482009888, "learning_rate": 1.9996966882163255e-05, "loss": 2.1327, "step": 282 }, { "epoch": 0.007867754993105288, "grad_norm": 1.2292656898498535, "learning_rate": 1.9996945333645507e-05, "loss": 2.2744, "step": 283 }, { "epoch": 0.007895556247497887, "grad_norm": 1.177311897277832, "learning_rate": 1.999692370886538e-05, "loss": 1.8938, "step": 284 }, { "epoch": 0.007923357501890485, "grad_norm": 1.2083489894866943, "learning_rate": 1.9996902007823044e-05, "loss": 2.4551, "step": 285 }, { "epoch": 0.007951158756283083, "grad_norm": 1.2135870456695557, "learning_rate": 1.9996880230518658e-05, "loss": 2.4491, "step": 286 }, { "epoch": 0.007978960010675681, "grad_norm": 1.240782380104065, "learning_rate": 1.9996858376952396e-05, "loss": 2.1782, "step": 287 }, { "epoch": 0.00800676126506828, "grad_norm": 1.1914286613464355, "learning_rate": 1.999683644712442e-05, "loss": 2.0576, "step": 288 }, { "epoch": 0.008034562519460877, "grad_norm": 1.2376099824905396, "learning_rate": 1.99968144410349e-05, "loss": 2.1571, "step": 289 }, { "epoch": 0.008062363773853476, "grad_norm": 1.1904844045639038, "learning_rate": 1.9996792358683995e-05, "loss": 2.2738, "step": 290 }, { "epoch": 0.008090165028246074, "grad_norm": 1.206160306930542, "learning_rate": 1.9996770200071885e-05, "loss": 2.4953, "step": 291 }, { "epoch": 0.008117966282638672, "grad_norm": 1.2402923107147217, "learning_rate": 1.999674796519874e-05, "loss": 2.3178, "step": 292 }, { "epoch": 0.00814576753703127, "grad_norm": 1.266143560409546, "learning_rate": 1.9996725654064716e-05, "loss": 2.3691, "step": 293 }, { "epoch": 0.008173568791423868, "grad_norm": 1.1773747205734253, "learning_rate": 1.9996703266669993e-05, "loss": 1.9016, "step": 294 }, { "epoch": 0.008201370045816466, "grad_norm": 1.1454988718032837, "learning_rate": 1.9996680803014742e-05, "loss": 2.2614, "step": 295 }, { "epoch": 0.008229171300209065, "grad_norm": 1.129171371459961, "learning_rate": 1.9996658263099133e-05, "loss": 2.0591, "step": 296 }, { "epoch": 0.008256972554601663, "grad_norm": 1.3212676048278809, "learning_rate": 1.9996635646923338e-05, "loss": 2.5014, "step": 297 }, { "epoch": 0.008284773808994261, "grad_norm": 1.2573388814926147, "learning_rate": 1.9996612954487527e-05, "loss": 2.1511, "step": 298 }, { "epoch": 0.00831257506338686, "grad_norm": 1.1956983804702759, "learning_rate": 1.9996590185791876e-05, "loss": 2.1612, "step": 299 }, { "epoch": 0.008340376317779459, "grad_norm": 1.2361464500427246, "learning_rate": 1.999656734083656e-05, "loss": 2.1309, "step": 300 }, { "epoch": 0.008368177572172057, "grad_norm": 1.1856281757354736, "learning_rate": 1.9996544419621746e-05, "loss": 2.2978, "step": 301 }, { "epoch": 0.008395978826564655, "grad_norm": 1.1988948583602905, "learning_rate": 1.9996521422147616e-05, "loss": 2.2372, "step": 302 }, { "epoch": 0.008423780080957254, "grad_norm": 1.1382933855056763, "learning_rate": 1.999649834841434e-05, "loss": 2.3066, "step": 303 }, { "epoch": 0.008451581335349852, "grad_norm": 1.245491623878479, "learning_rate": 1.9996475198422102e-05, "loss": 2.2183, "step": 304 }, { "epoch": 0.00847938258974245, "grad_norm": 1.175065040588379, "learning_rate": 1.9996451972171074e-05, "loss": 2.19, "step": 305 }, { "epoch": 0.008507183844135048, "grad_norm": 1.1517988443374634, "learning_rate": 1.9996428669661433e-05, "loss": 2.1713, "step": 306 }, { "epoch": 0.008534985098527646, "grad_norm": 1.241449236869812, "learning_rate": 1.9996405290893355e-05, "loss": 2.2101, "step": 307 }, { "epoch": 0.008562786352920244, "grad_norm": 1.368135690689087, "learning_rate": 1.999638183586702e-05, "loss": 2.4745, "step": 308 }, { "epoch": 0.008590587607312843, "grad_norm": 1.1619513034820557, "learning_rate": 1.999635830458261e-05, "loss": 2.036, "step": 309 }, { "epoch": 0.00861838886170544, "grad_norm": 1.2862197160720825, "learning_rate": 1.9996334697040302e-05, "loss": 2.4662, "step": 310 }, { "epoch": 0.008646190116098039, "grad_norm": 1.2846444845199585, "learning_rate": 1.999631101324027e-05, "loss": 2.3629, "step": 311 }, { "epoch": 0.008673991370490637, "grad_norm": 1.1459872722625732, "learning_rate": 1.9996287253182706e-05, "loss": 2.1387, "step": 312 }, { "epoch": 0.008701792624883235, "grad_norm": 1.2167809009552002, "learning_rate": 1.9996263416867786e-05, "loss": 2.3122, "step": 313 }, { "epoch": 0.008729593879275833, "grad_norm": 1.2156113386154175, "learning_rate": 1.999623950429569e-05, "loss": 2.3352, "step": 314 }, { "epoch": 0.008757395133668432, "grad_norm": 1.1823561191558838, "learning_rate": 1.99962155154666e-05, "loss": 2.0944, "step": 315 }, { "epoch": 0.00878519638806103, "grad_norm": 1.2361655235290527, "learning_rate": 1.99961914503807e-05, "loss": 2.2105, "step": 316 }, { "epoch": 0.008812997642453628, "grad_norm": 1.2121020555496216, "learning_rate": 1.9996167309038178e-05, "loss": 2.1835, "step": 317 }, { "epoch": 0.008840798896846226, "grad_norm": 1.2943947315216064, "learning_rate": 1.9996143091439217e-05, "loss": 2.1378, "step": 318 }, { "epoch": 0.008868600151238824, "grad_norm": 1.3286017179489136, "learning_rate": 1.9996118797583993e-05, "loss": 2.2766, "step": 319 }, { "epoch": 0.008896401405631423, "grad_norm": 1.2502901554107666, "learning_rate": 1.9996094427472704e-05, "loss": 2.3142, "step": 320 }, { "epoch": 0.00892420266002402, "grad_norm": 4.016668319702148, "learning_rate": 1.9996069981105525e-05, "loss": 2.597, "step": 321 }, { "epoch": 0.008952003914416619, "grad_norm": 1.2570430040359497, "learning_rate": 1.999604545848265e-05, "loss": 2.4194, "step": 322 }, { "epoch": 0.008979805168809217, "grad_norm": 1.2298004627227783, "learning_rate": 1.9996020859604262e-05, "loss": 2.5186, "step": 323 }, { "epoch": 0.009007606423201815, "grad_norm": 1.1344186067581177, "learning_rate": 1.999599618447055e-05, "loss": 2.2583, "step": 324 }, { "epoch": 0.009035407677594413, "grad_norm": 1.1378529071807861, "learning_rate": 1.9995971433081702e-05, "loss": 2.0913, "step": 325 }, { "epoch": 0.009063208931987012, "grad_norm": 1.1911808252334595, "learning_rate": 1.999594660543791e-05, "loss": 2.1768, "step": 326 }, { "epoch": 0.00909101018637961, "grad_norm": 1.1516472101211548, "learning_rate": 1.9995921701539355e-05, "loss": 2.2729, "step": 327 }, { "epoch": 0.009118811440772208, "grad_norm": 1.1454017162322998, "learning_rate": 1.9995896721386233e-05, "loss": 1.8774, "step": 328 }, { "epoch": 0.009146612695164806, "grad_norm": 1.2178415060043335, "learning_rate": 1.999587166497874e-05, "loss": 2.125, "step": 329 }, { "epoch": 0.009174413949557404, "grad_norm": 1.1714255809783936, "learning_rate": 1.9995846532317054e-05, "loss": 2.3367, "step": 330 }, { "epoch": 0.009202215203950002, "grad_norm": 1.1687825918197632, "learning_rate": 1.9995821323401377e-05, "loss": 2.0562, "step": 331 }, { "epoch": 0.0092300164583426, "grad_norm": 1.225373387336731, "learning_rate": 1.99957960382319e-05, "loss": 2.5194, "step": 332 }, { "epoch": 0.009257817712735199, "grad_norm": 1.1686179637908936, "learning_rate": 1.999577067680881e-05, "loss": 2.5327, "step": 333 }, { "epoch": 0.009285618967127797, "grad_norm": 1.226623296737671, "learning_rate": 1.9995745239132308e-05, "loss": 1.789, "step": 334 }, { "epoch": 0.009313420221520395, "grad_norm": 2.9716482162475586, "learning_rate": 1.9995719725202583e-05, "loss": 2.1625, "step": 335 }, { "epoch": 0.009341221475912993, "grad_norm": 1.1785285472869873, "learning_rate": 1.999569413501983e-05, "loss": 1.8064, "step": 336 }, { "epoch": 0.009369022730305591, "grad_norm": 1.239641785621643, "learning_rate": 1.9995668468584245e-05, "loss": 2.2974, "step": 337 }, { "epoch": 0.00939682398469819, "grad_norm": 1.1642181873321533, "learning_rate": 1.9995642725896028e-05, "loss": 2.2843, "step": 338 }, { "epoch": 0.009424625239090788, "grad_norm": 1.2472692728042603, "learning_rate": 1.9995616906955367e-05, "loss": 2.3215, "step": 339 }, { "epoch": 0.009452426493483386, "grad_norm": 1.2883131504058838, "learning_rate": 1.9995591011762466e-05, "loss": 1.9569, "step": 340 }, { "epoch": 0.009480227747875984, "grad_norm": 1.2761942148208618, "learning_rate": 1.9995565040317518e-05, "loss": 2.1518, "step": 341 }, { "epoch": 0.009508029002268582, "grad_norm": 1.605558156967163, "learning_rate": 1.9995538992620725e-05, "loss": 1.999, "step": 342 }, { "epoch": 0.00953583025666118, "grad_norm": 1.1872611045837402, "learning_rate": 1.9995512868672286e-05, "loss": 2.3401, "step": 343 }, { "epoch": 0.009563631511053779, "grad_norm": 1.2010524272918701, "learning_rate": 1.9995486668472395e-05, "loss": 2.1019, "step": 344 }, { "epoch": 0.009591432765446377, "grad_norm": 1.200452446937561, "learning_rate": 1.9995460392021257e-05, "loss": 2.2417, "step": 345 }, { "epoch": 0.009619234019838975, "grad_norm": 1.2160141468048096, "learning_rate": 1.9995434039319068e-05, "loss": 2.2987, "step": 346 }, { "epoch": 0.009647035274231573, "grad_norm": 1.2686721086502075, "learning_rate": 1.9995407610366034e-05, "loss": 1.8622, "step": 347 }, { "epoch": 0.009674836528624171, "grad_norm": 1.1203049421310425, "learning_rate": 1.9995381105162352e-05, "loss": 2.2375, "step": 348 }, { "epoch": 0.00970263778301677, "grad_norm": 1.2720309495925903, "learning_rate": 1.9995354523708226e-05, "loss": 2.2997, "step": 349 }, { "epoch": 0.009730439037409368, "grad_norm": 1.2582367658615112, "learning_rate": 1.999532786600386e-05, "loss": 2.0737, "step": 350 }, { "epoch": 0.009758240291801966, "grad_norm": 1.439640760421753, "learning_rate": 1.999530113204946e-05, "loss": 1.702, "step": 351 }, { "epoch": 0.009786041546194564, "grad_norm": 1.2243410348892212, "learning_rate": 1.9995274321845222e-05, "loss": 2.1211, "step": 352 }, { "epoch": 0.009813842800587162, "grad_norm": 1.1874209642410278, "learning_rate": 1.9995247435391357e-05, "loss": 2.1846, "step": 353 }, { "epoch": 0.00984164405497976, "grad_norm": 1.1727927923202515, "learning_rate": 1.9995220472688067e-05, "loss": 2.0035, "step": 354 }, { "epoch": 0.009869445309372359, "grad_norm": 1.1343703269958496, "learning_rate": 1.999519343373556e-05, "loss": 2.0503, "step": 355 }, { "epoch": 0.009897246563764957, "grad_norm": 1.1845941543579102, "learning_rate": 1.999516631853404e-05, "loss": 1.8049, "step": 356 }, { "epoch": 0.009925047818157555, "grad_norm": 1.1968684196472168, "learning_rate": 1.9995139127083712e-05, "loss": 1.9737, "step": 357 }, { "epoch": 0.009952849072550153, "grad_norm": 1.2261815071105957, "learning_rate": 1.999511185938479e-05, "loss": 2.1651, "step": 358 }, { "epoch": 0.009980650326942751, "grad_norm": 1.1724697351455688, "learning_rate": 1.9995084515437476e-05, "loss": 2.3121, "step": 359 }, { "epoch": 0.01000845158133535, "grad_norm": 1.2035807371139526, "learning_rate": 1.9995057095241984e-05, "loss": 2.1951, "step": 360 }, { "epoch": 0.010036252835727948, "grad_norm": 1.1641165018081665, "learning_rate": 1.9995029598798517e-05, "loss": 2.015, "step": 361 }, { "epoch": 0.010064054090120546, "grad_norm": 1.2423123121261597, "learning_rate": 1.999500202610729e-05, "loss": 2.1349, "step": 362 }, { "epoch": 0.010091855344513144, "grad_norm": 1.3289189338684082, "learning_rate": 1.999497437716851e-05, "loss": 1.9513, "step": 363 }, { "epoch": 0.010119656598905742, "grad_norm": 1.2354518175125122, "learning_rate": 1.999494665198239e-05, "loss": 2.029, "step": 364 }, { "epoch": 0.01014745785329834, "grad_norm": 1.3132226467132568, "learning_rate": 1.999491885054914e-05, "loss": 2.2365, "step": 365 }, { "epoch": 0.010175259107690938, "grad_norm": 1.216260552406311, "learning_rate": 1.999489097286897e-05, "loss": 2.1655, "step": 366 }, { "epoch": 0.010203060362083537, "grad_norm": 1.2158098220825195, "learning_rate": 1.99948630189421e-05, "loss": 2.1189, "step": 367 }, { "epoch": 0.010230861616476135, "grad_norm": 1.2435050010681152, "learning_rate": 1.9994834988768736e-05, "loss": 2.3598, "step": 368 }, { "epoch": 0.010258662870868733, "grad_norm": 1.1722849607467651, "learning_rate": 1.9994806882349095e-05, "loss": 2.1073, "step": 369 }, { "epoch": 0.010286464125261331, "grad_norm": 1.2009717226028442, "learning_rate": 1.999477869968339e-05, "loss": 2.2609, "step": 370 }, { "epoch": 0.01031426537965393, "grad_norm": 1.1167720556259155, "learning_rate": 1.999475044077184e-05, "loss": 2.2551, "step": 371 }, { "epoch": 0.010342066634046528, "grad_norm": 1.108054518699646, "learning_rate": 1.9994722105614655e-05, "loss": 2.1521, "step": 372 }, { "epoch": 0.010369867888439126, "grad_norm": 1.2709945440292358, "learning_rate": 1.9994693694212053e-05, "loss": 2.2846, "step": 373 }, { "epoch": 0.010397669142831724, "grad_norm": 1.1911569833755493, "learning_rate": 1.9994665206564252e-05, "loss": 2.1756, "step": 374 }, { "epoch": 0.010425470397224322, "grad_norm": 1.1784591674804688, "learning_rate": 1.9994636642671468e-05, "loss": 2.2175, "step": 375 }, { "epoch": 0.01045327165161692, "grad_norm": 1.1652233600616455, "learning_rate": 1.999460800253392e-05, "loss": 1.9948, "step": 376 }, { "epoch": 0.010481072906009518, "grad_norm": 1.2901479005813599, "learning_rate": 1.999457928615183e-05, "loss": 2.2778, "step": 377 }, { "epoch": 0.010508874160402117, "grad_norm": 1.2542388439178467, "learning_rate": 1.9994550493525407e-05, "loss": 2.1594, "step": 378 }, { "epoch": 0.010536675414794715, "grad_norm": 1.413255214691162, "learning_rate": 1.9994521624654876e-05, "loss": 2.0555, "step": 379 }, { "epoch": 0.010564476669187313, "grad_norm": 1.174635648727417, "learning_rate": 1.999449267954046e-05, "loss": 2.1037, "step": 380 }, { "epoch": 0.010592277923579911, "grad_norm": 1.1941524744033813, "learning_rate": 1.999446365818238e-05, "loss": 2.1442, "step": 381 }, { "epoch": 0.01062007917797251, "grad_norm": 1.2522538900375366, "learning_rate": 1.9994434560580854e-05, "loss": 1.9233, "step": 382 }, { "epoch": 0.010647880432365107, "grad_norm": 1.221524715423584, "learning_rate": 1.9994405386736104e-05, "loss": 2.14, "step": 383 }, { "epoch": 0.010675681686757707, "grad_norm": 1.232542872428894, "learning_rate": 1.9994376136648353e-05, "loss": 2.1618, "step": 384 }, { "epoch": 0.010703482941150306, "grad_norm": 1.269310712814331, "learning_rate": 1.9994346810317825e-05, "loss": 2.5878, "step": 385 }, { "epoch": 0.010731284195542904, "grad_norm": 1.2510594129562378, "learning_rate": 1.9994317407744746e-05, "loss": 2.15, "step": 386 }, { "epoch": 0.010759085449935502, "grad_norm": 1.2017520666122437, "learning_rate": 1.9994287928929338e-05, "loss": 2.3673, "step": 387 }, { "epoch": 0.0107868867043281, "grad_norm": 1.2362123727798462, "learning_rate": 1.9994258373871823e-05, "loss": 2.2315, "step": 388 }, { "epoch": 0.010814687958720698, "grad_norm": 1.2072386741638184, "learning_rate": 1.9994228742572432e-05, "loss": 1.9465, "step": 389 }, { "epoch": 0.010842489213113296, "grad_norm": 1.1547166109085083, "learning_rate": 1.999419903503139e-05, "loss": 2.3037, "step": 390 }, { "epoch": 0.010870290467505895, "grad_norm": 1.2988885641098022, "learning_rate": 1.9994169251248913e-05, "loss": 1.8762, "step": 391 }, { "epoch": 0.010898091721898493, "grad_norm": 1.2310724258422852, "learning_rate": 1.9994139391225246e-05, "loss": 2.1536, "step": 392 }, { "epoch": 0.010925892976291091, "grad_norm": 1.1934928894042969, "learning_rate": 1.9994109454960603e-05, "loss": 2.2205, "step": 393 }, { "epoch": 0.010953694230683689, "grad_norm": 1.2438488006591797, "learning_rate": 1.9994079442455217e-05, "loss": 2.0007, "step": 394 }, { "epoch": 0.010981495485076287, "grad_norm": 1.2168654203414917, "learning_rate": 1.999404935370932e-05, "loss": 2.1511, "step": 395 }, { "epoch": 0.011009296739468885, "grad_norm": 1.2364530563354492, "learning_rate": 1.9994019188723136e-05, "loss": 2.0775, "step": 396 }, { "epoch": 0.011037097993861484, "grad_norm": 1.1661148071289062, "learning_rate": 1.99939889474969e-05, "loss": 2.3728, "step": 397 }, { "epoch": 0.011064899248254082, "grad_norm": 1.1998419761657715, "learning_rate": 1.999395863003084e-05, "loss": 2.0848, "step": 398 }, { "epoch": 0.01109270050264668, "grad_norm": 1.3086804151535034, "learning_rate": 1.9993928236325186e-05, "loss": 2.3736, "step": 399 }, { "epoch": 0.011120501757039278, "grad_norm": 1.129580020904541, "learning_rate": 1.9993897766380174e-05, "loss": 2.0703, "step": 400 }, { "epoch": 0.011148303011431876, "grad_norm": 1.2022536993026733, "learning_rate": 1.999386722019603e-05, "loss": 2.2668, "step": 401 }, { "epoch": 0.011176104265824474, "grad_norm": 1.2621519565582275, "learning_rate": 1.9993836597772994e-05, "loss": 2.144, "step": 402 }, { "epoch": 0.011203905520217073, "grad_norm": 1.2360827922821045, "learning_rate": 1.99938058991113e-05, "loss": 2.1746, "step": 403 }, { "epoch": 0.01123170677460967, "grad_norm": 1.2271840572357178, "learning_rate": 1.9993775124211178e-05, "loss": 2.4581, "step": 404 }, { "epoch": 0.011259508029002269, "grad_norm": 1.1490858793258667, "learning_rate": 1.9993744273072856e-05, "loss": 2.1479, "step": 405 }, { "epoch": 0.011287309283394867, "grad_norm": 1.17617666721344, "learning_rate": 1.9993713345696582e-05, "loss": 1.9352, "step": 406 }, { "epoch": 0.011315110537787465, "grad_norm": 1.1602685451507568, "learning_rate": 1.9993682342082593e-05, "loss": 1.9926, "step": 407 }, { "epoch": 0.011342911792180064, "grad_norm": 1.2253341674804688, "learning_rate": 1.999365126223111e-05, "loss": 1.8753, "step": 408 }, { "epoch": 0.011370713046572662, "grad_norm": 1.1766576766967773, "learning_rate": 1.9993620106142386e-05, "loss": 2.0156, "step": 409 }, { "epoch": 0.01139851430096526, "grad_norm": 1.1876331567764282, "learning_rate": 1.9993588873816646e-05, "loss": 2.2175, "step": 410 }, { "epoch": 0.011426315555357858, "grad_norm": 1.76826012134552, "learning_rate": 1.999355756525414e-05, "loss": 2.3987, "step": 411 }, { "epoch": 0.011454116809750456, "grad_norm": 1.2139486074447632, "learning_rate": 1.99935261804551e-05, "loss": 2.1007, "step": 412 }, { "epoch": 0.011481918064143054, "grad_norm": 1.1700291633605957, "learning_rate": 1.9993494719419768e-05, "loss": 2.2082, "step": 413 }, { "epoch": 0.011509719318535653, "grad_norm": 1.1571215391159058, "learning_rate": 1.9993463182148377e-05, "loss": 2.2047, "step": 414 }, { "epoch": 0.01153752057292825, "grad_norm": 1.1979042291641235, "learning_rate": 1.999343156864118e-05, "loss": 2.1323, "step": 415 }, { "epoch": 0.011565321827320849, "grad_norm": 1.2227662801742554, "learning_rate": 1.9993399878898406e-05, "loss": 2.2673, "step": 416 }, { "epoch": 0.011593123081713447, "grad_norm": 1.1977609395980835, "learning_rate": 1.9993368112920307e-05, "loss": 2.1903, "step": 417 }, { "epoch": 0.011620924336106045, "grad_norm": 1.1764262914657593, "learning_rate": 1.999333627070712e-05, "loss": 2.3888, "step": 418 }, { "epoch": 0.011648725590498643, "grad_norm": 1.2286403179168701, "learning_rate": 1.9993304352259086e-05, "loss": 2.3855, "step": 419 }, { "epoch": 0.011676526844891242, "grad_norm": 1.1996562480926514, "learning_rate": 1.9993272357576453e-05, "loss": 2.2075, "step": 420 }, { "epoch": 0.01170432809928384, "grad_norm": 1.2185468673706055, "learning_rate": 1.9993240286659463e-05, "loss": 2.152, "step": 421 }, { "epoch": 0.011732129353676438, "grad_norm": 1.1974788904190063, "learning_rate": 1.9993208139508358e-05, "loss": 2.297, "step": 422 }, { "epoch": 0.011759930608069036, "grad_norm": 1.2041329145431519, "learning_rate": 1.9993175916123387e-05, "loss": 2.2113, "step": 423 }, { "epoch": 0.011787731862461634, "grad_norm": 1.219990611076355, "learning_rate": 1.9993143616504797e-05, "loss": 2.3692, "step": 424 }, { "epoch": 0.011815533116854232, "grad_norm": 1.2000436782836914, "learning_rate": 1.999311124065283e-05, "loss": 2.3672, "step": 425 }, { "epoch": 0.01184333437124683, "grad_norm": 1.2684118747711182, "learning_rate": 1.9993078788567738e-05, "loss": 2.1843, "step": 426 }, { "epoch": 0.011871135625639429, "grad_norm": 1.24372398853302, "learning_rate": 1.999304626024976e-05, "loss": 2.1906, "step": 427 }, { "epoch": 0.011898936880032027, "grad_norm": 1.2300208806991577, "learning_rate": 1.9993013655699154e-05, "loss": 2.1919, "step": 428 }, { "epoch": 0.011926738134424625, "grad_norm": 1.2356122732162476, "learning_rate": 1.9992980974916164e-05, "loss": 2.3626, "step": 429 }, { "epoch": 0.011954539388817223, "grad_norm": 1.2899771928787231, "learning_rate": 1.999294821790104e-05, "loss": 1.9316, "step": 430 }, { "epoch": 0.011982340643209822, "grad_norm": 1.1712349653244019, "learning_rate": 1.9992915384654033e-05, "loss": 2.0873, "step": 431 }, { "epoch": 0.01201014189760242, "grad_norm": 1.2467706203460693, "learning_rate": 1.999288247517539e-05, "loss": 2.419, "step": 432 }, { "epoch": 0.012037943151995018, "grad_norm": 1.2197586297988892, "learning_rate": 1.9992849489465365e-05, "loss": 2.3779, "step": 433 }, { "epoch": 0.012065744406387616, "grad_norm": 1.157510757446289, "learning_rate": 1.999281642752421e-05, "loss": 2.3001, "step": 434 }, { "epoch": 0.012093545660780214, "grad_norm": 1.2590513229370117, "learning_rate": 1.9992783289352177e-05, "loss": 2.4025, "step": 435 }, { "epoch": 0.012121346915172812, "grad_norm": 1.2335513830184937, "learning_rate": 1.9992750074949513e-05, "loss": 2.24, "step": 436 }, { "epoch": 0.01214914816956541, "grad_norm": 1.2725893259048462, "learning_rate": 1.9992716784316483e-05, "loss": 2.205, "step": 437 }, { "epoch": 0.012176949423958009, "grad_norm": 1.1728627681732178, "learning_rate": 1.999268341745333e-05, "loss": 2.1782, "step": 438 }, { "epoch": 0.012204750678350607, "grad_norm": 1.1822483539581299, "learning_rate": 1.9992649974360312e-05, "loss": 2.359, "step": 439 }, { "epoch": 0.012232551932743205, "grad_norm": 1.3972904682159424, "learning_rate": 1.9992616455037686e-05, "loss": 1.9882, "step": 440 }, { "epoch": 0.012260353187135803, "grad_norm": 1.1683534383773804, "learning_rate": 1.999258285948571e-05, "loss": 2.155, "step": 441 }, { "epoch": 0.012288154441528401, "grad_norm": 1.2812308073043823, "learning_rate": 1.9992549187704632e-05, "loss": 2.3385, "step": 442 }, { "epoch": 0.012315955695921, "grad_norm": 1.0444228649139404, "learning_rate": 1.9992515439694715e-05, "loss": 2.3281, "step": 443 }, { "epoch": 0.012343756950313598, "grad_norm": 1.3510922193527222, "learning_rate": 1.9992481615456217e-05, "loss": 2.3721, "step": 444 }, { "epoch": 0.012371558204706196, "grad_norm": 1.147889256477356, "learning_rate": 1.9992447714989395e-05, "loss": 2.1489, "step": 445 }, { "epoch": 0.012399359459098794, "grad_norm": 1.2484749555587769, "learning_rate": 1.9992413738294504e-05, "loss": 2.201, "step": 446 }, { "epoch": 0.012427160713491392, "grad_norm": 1.3927819728851318, "learning_rate": 1.9992379685371808e-05, "loss": 2.0761, "step": 447 }, { "epoch": 0.01245496196788399, "grad_norm": 1.2036995887756348, "learning_rate": 1.9992345556221564e-05, "loss": 2.2006, "step": 448 }, { "epoch": 0.012482763222276589, "grad_norm": 1.2021297216415405, "learning_rate": 1.9992311350844033e-05, "loss": 1.8693, "step": 449 }, { "epoch": 0.012510564476669187, "grad_norm": 1.2200133800506592, "learning_rate": 1.9992277069239477e-05, "loss": 2.1441, "step": 450 }, { "epoch": 0.012538365731061785, "grad_norm": 1.2220898866653442, "learning_rate": 1.9992242711408156e-05, "loss": 2.1723, "step": 451 }, { "epoch": 0.012566166985454383, "grad_norm": 1.155791163444519, "learning_rate": 1.9992208277350334e-05, "loss": 1.9258, "step": 452 }, { "epoch": 0.012593968239846981, "grad_norm": 1.2435553073883057, "learning_rate": 1.9992173767066272e-05, "loss": 2.1352, "step": 453 }, { "epoch": 0.01262176949423958, "grad_norm": 1.1029977798461914, "learning_rate": 1.9992139180556234e-05, "loss": 2.2182, "step": 454 }, { "epoch": 0.012649570748632178, "grad_norm": 1.2091385126113892, "learning_rate": 1.9992104517820482e-05, "loss": 2.116, "step": 455 }, { "epoch": 0.012677372003024776, "grad_norm": 1.1786929368972778, "learning_rate": 1.9992069778859283e-05, "loss": 2.3876, "step": 456 }, { "epoch": 0.012705173257417374, "grad_norm": 1.1083950996398926, "learning_rate": 1.99920349636729e-05, "loss": 2.0256, "step": 457 }, { "epoch": 0.012732974511809972, "grad_norm": 1.229845643043518, "learning_rate": 1.9992000072261605e-05, "loss": 2.2203, "step": 458 }, { "epoch": 0.01276077576620257, "grad_norm": 1.1881684064865112, "learning_rate": 1.9991965104625654e-05, "loss": 2.1497, "step": 459 }, { "epoch": 0.012788577020595169, "grad_norm": 1.1735217571258545, "learning_rate": 1.9991930060765316e-05, "loss": 2.1847, "step": 460 }, { "epoch": 0.012816378274987767, "grad_norm": 1.165432095527649, "learning_rate": 1.9991894940680864e-05, "loss": 2.544, "step": 461 }, { "epoch": 0.012844179529380365, "grad_norm": 1.2097011804580688, "learning_rate": 1.9991859744372563e-05, "loss": 2.0172, "step": 462 }, { "epoch": 0.012871980783772963, "grad_norm": 1.1680452823638916, "learning_rate": 1.999182447184068e-05, "loss": 2.1463, "step": 463 }, { "epoch": 0.012899782038165561, "grad_norm": 1.2568109035491943, "learning_rate": 1.9991789123085485e-05, "loss": 2.0812, "step": 464 }, { "epoch": 0.01292758329255816, "grad_norm": 1.2364802360534668, "learning_rate": 1.999175369810725e-05, "loss": 2.1338, "step": 465 }, { "epoch": 0.012955384546950758, "grad_norm": 1.4788399934768677, "learning_rate": 1.9991718196906246e-05, "loss": 2.2164, "step": 466 }, { "epoch": 0.012983185801343356, "grad_norm": 1.2310047149658203, "learning_rate": 1.9991682619482734e-05, "loss": 2.2073, "step": 467 }, { "epoch": 0.013010987055735954, "grad_norm": 1.1002541780471802, "learning_rate": 1.9991646965837e-05, "loss": 2.0099, "step": 468 }, { "epoch": 0.013038788310128554, "grad_norm": 1.3662173748016357, "learning_rate": 1.99916112359693e-05, "loss": 2.3071, "step": 469 }, { "epoch": 0.013066589564521152, "grad_norm": 1.2921808958053589, "learning_rate": 1.999157542987992e-05, "loss": 1.955, "step": 470 }, { "epoch": 0.01309439081891375, "grad_norm": 1.1646687984466553, "learning_rate": 1.9991539547569128e-05, "loss": 2.004, "step": 471 }, { "epoch": 0.013122192073306348, "grad_norm": 1.1669816970825195, "learning_rate": 1.99915035890372e-05, "loss": 2.2564, "step": 472 }, { "epoch": 0.013149993327698947, "grad_norm": 1.3151882886886597, "learning_rate": 1.9991467554284405e-05, "loss": 2.1042, "step": 473 }, { "epoch": 0.013177794582091545, "grad_norm": 1.2540489435195923, "learning_rate": 1.999143144331102e-05, "loss": 2.4068, "step": 474 }, { "epoch": 0.013205595836484143, "grad_norm": 1.1625101566314697, "learning_rate": 1.9991395256117325e-05, "loss": 1.955, "step": 475 }, { "epoch": 0.013233397090876741, "grad_norm": 1.1614941358566284, "learning_rate": 1.999135899270359e-05, "loss": 2.1963, "step": 476 }, { "epoch": 0.01326119834526934, "grad_norm": 1.1700503826141357, "learning_rate": 1.9991322653070096e-05, "loss": 1.988, "step": 477 }, { "epoch": 0.013288999599661937, "grad_norm": 1.4293177127838135, "learning_rate": 1.9991286237217114e-05, "loss": 2.5614, "step": 478 }, { "epoch": 0.013316800854054536, "grad_norm": 1.2783358097076416, "learning_rate": 1.9991249745144933e-05, "loss": 2.1271, "step": 479 }, { "epoch": 0.013344602108447134, "grad_norm": 1.2052714824676514, "learning_rate": 1.9991213176853823e-05, "loss": 2.0725, "step": 480 }, { "epoch": 0.013372403362839732, "grad_norm": 1.2063096761703491, "learning_rate": 1.9991176532344063e-05, "loss": 2.2797, "step": 481 }, { "epoch": 0.01340020461723233, "grad_norm": 1.1941556930541992, "learning_rate": 1.9991139811615934e-05, "loss": 2.3628, "step": 482 }, { "epoch": 0.013428005871624928, "grad_norm": 1.2268918752670288, "learning_rate": 1.9991103014669717e-05, "loss": 2.3303, "step": 483 }, { "epoch": 0.013455807126017526, "grad_norm": 1.2953178882598877, "learning_rate": 1.9991066141505693e-05, "loss": 2.4511, "step": 484 }, { "epoch": 0.013483608380410125, "grad_norm": 1.2567760944366455, "learning_rate": 1.999102919212414e-05, "loss": 2.0718, "step": 485 }, { "epoch": 0.013511409634802723, "grad_norm": 1.264397144317627, "learning_rate": 1.9990992166525346e-05, "loss": 2.8764, "step": 486 }, { "epoch": 0.013539210889195321, "grad_norm": 1.3823630809783936, "learning_rate": 1.9990955064709586e-05, "loss": 2.2197, "step": 487 }, { "epoch": 0.01356701214358792, "grad_norm": 1.206826090812683, "learning_rate": 1.9990917886677147e-05, "loss": 2.1708, "step": 488 }, { "epoch": 0.013594813397980517, "grad_norm": 1.1796847581863403, "learning_rate": 1.9990880632428314e-05, "loss": 2.1593, "step": 489 }, { "epoch": 0.013622614652373116, "grad_norm": 1.1794111728668213, "learning_rate": 1.999084330196337e-05, "loss": 1.9458, "step": 490 }, { "epoch": 0.013650415906765714, "grad_norm": 1.1478537321090698, "learning_rate": 1.9990805895282596e-05, "loss": 2.3299, "step": 491 }, { "epoch": 0.013678217161158312, "grad_norm": 1.1743508577346802, "learning_rate": 1.9990768412386282e-05, "loss": 2.406, "step": 492 }, { "epoch": 0.01370601841555091, "grad_norm": 1.3193660974502563, "learning_rate": 1.9990730853274716e-05, "loss": 2.1875, "step": 493 }, { "epoch": 0.013733819669943508, "grad_norm": 1.2479784488677979, "learning_rate": 1.9990693217948176e-05, "loss": 2.367, "step": 494 }, { "epoch": 0.013761620924336106, "grad_norm": 1.1647697687149048, "learning_rate": 1.9990655506406957e-05, "loss": 2.1107, "step": 495 }, { "epoch": 0.013789422178728705, "grad_norm": 1.0985122919082642, "learning_rate": 1.9990617718651343e-05, "loss": 2.0489, "step": 496 }, { "epoch": 0.013817223433121303, "grad_norm": 1.240638017654419, "learning_rate": 1.9990579854681625e-05, "loss": 2.3695, "step": 497 }, { "epoch": 0.013845024687513901, "grad_norm": 1.1942298412322998, "learning_rate": 1.9990541914498088e-05, "loss": 1.7991, "step": 498 }, { "epoch": 0.013872825941906499, "grad_norm": 1.3533128499984741, "learning_rate": 1.9990503898101027e-05, "loss": 2.0025, "step": 499 }, { "epoch": 0.013900627196299097, "grad_norm": 1.1640307903289795, "learning_rate": 1.9990465805490723e-05, "loss": 2.0252, "step": 500 }, { "epoch": 0.013928428450691695, "grad_norm": 1.1996756792068481, "learning_rate": 1.9990427636667475e-05, "loss": 1.9937, "step": 501 }, { "epoch": 0.013956229705084294, "grad_norm": 1.345767617225647, "learning_rate": 1.999038939163157e-05, "loss": 2.2924, "step": 502 }, { "epoch": 0.013984030959476892, "grad_norm": 1.2127631902694702, "learning_rate": 1.9990351070383302e-05, "loss": 2.2439, "step": 503 }, { "epoch": 0.01401183221386949, "grad_norm": 1.1649333238601685, "learning_rate": 1.999031267292296e-05, "loss": 2.3577, "step": 504 }, { "epoch": 0.014039633468262088, "grad_norm": 1.1551506519317627, "learning_rate": 1.999027419925084e-05, "loss": 2.2696, "step": 505 }, { "epoch": 0.014067434722654686, "grad_norm": 1.1893254518508911, "learning_rate": 1.9990235649367238e-05, "loss": 1.9466, "step": 506 }, { "epoch": 0.014095235977047284, "grad_norm": 1.2400295734405518, "learning_rate": 1.9990197023272443e-05, "loss": 2.2831, "step": 507 }, { "epoch": 0.014123037231439883, "grad_norm": 1.2162057161331177, "learning_rate": 1.999015832096675e-05, "loss": 2.2291, "step": 508 }, { "epoch": 0.01415083848583248, "grad_norm": 1.3146461248397827, "learning_rate": 1.9990119542450456e-05, "loss": 2.3881, "step": 509 }, { "epoch": 0.014178639740225079, "grad_norm": 1.1921062469482422, "learning_rate": 1.999008068772386e-05, "loss": 2.1862, "step": 510 }, { "epoch": 0.014206440994617677, "grad_norm": 1.2418739795684814, "learning_rate": 1.999004175678725e-05, "loss": 2.1341, "step": 511 }, { "epoch": 0.014234242249010275, "grad_norm": 1.066643476486206, "learning_rate": 1.9990002749640932e-05, "loss": 1.8328, "step": 512 }, { "epoch": 0.014262043503402873, "grad_norm": 1.1948403120040894, "learning_rate": 1.99899636662852e-05, "loss": 2.3416, "step": 513 }, { "epoch": 0.014289844757795472, "grad_norm": 1.2083033323287964, "learning_rate": 1.9989924506720343e-05, "loss": 2.438, "step": 514 }, { "epoch": 0.01431764601218807, "grad_norm": 1.126063585281372, "learning_rate": 1.998988527094668e-05, "loss": 2.3264, "step": 515 }, { "epoch": 0.014345447266580668, "grad_norm": 1.2375507354736328, "learning_rate": 1.9989845958964488e-05, "loss": 2.2139, "step": 516 }, { "epoch": 0.014373248520973266, "grad_norm": 1.2445451021194458, "learning_rate": 1.9989806570774086e-05, "loss": 2.0511, "step": 517 }, { "epoch": 0.014401049775365864, "grad_norm": 1.203120231628418, "learning_rate": 1.998976710637576e-05, "loss": 2.127, "step": 518 }, { "epoch": 0.014428851029758463, "grad_norm": 1.2312647104263306, "learning_rate": 1.9989727565769823e-05, "loss": 2.2258, "step": 519 }, { "epoch": 0.01445665228415106, "grad_norm": 1.1408109664916992, "learning_rate": 1.9989687948956568e-05, "loss": 2.0787, "step": 520 }, { "epoch": 0.014484453538543659, "grad_norm": 1.2608133554458618, "learning_rate": 1.9989648255936298e-05, "loss": 2.4488, "step": 521 }, { "epoch": 0.014512254792936257, "grad_norm": 1.1921988725662231, "learning_rate": 1.9989608486709317e-05, "loss": 2.3327, "step": 522 }, { "epoch": 0.014540056047328855, "grad_norm": 1.1758983135223389, "learning_rate": 1.9989568641275934e-05, "loss": 2.1224, "step": 523 }, { "epoch": 0.014567857301721453, "grad_norm": 1.1631217002868652, "learning_rate": 1.9989528719636446e-05, "loss": 1.965, "step": 524 }, { "epoch": 0.014595658556114052, "grad_norm": 1.1465047597885132, "learning_rate": 1.9989488721791157e-05, "loss": 2.0759, "step": 525 }, { "epoch": 0.01462345981050665, "grad_norm": 1.35430908203125, "learning_rate": 1.9989448647740376e-05, "loss": 2.054, "step": 526 }, { "epoch": 0.014651261064899248, "grad_norm": 1.243316888809204, "learning_rate": 1.9989408497484412e-05, "loss": 2.2183, "step": 527 }, { "epoch": 0.014679062319291846, "grad_norm": 1.1052221059799194, "learning_rate": 1.998936827102356e-05, "loss": 2.0553, "step": 528 }, { "epoch": 0.014706863573684444, "grad_norm": 1.3163663148880005, "learning_rate": 1.9989327968358137e-05, "loss": 2.1821, "step": 529 }, { "epoch": 0.014734664828077042, "grad_norm": 1.2163243293762207, "learning_rate": 1.998928758948845e-05, "loss": 2.1194, "step": 530 }, { "epoch": 0.01476246608246964, "grad_norm": 1.1873992681503296, "learning_rate": 1.99892471344148e-05, "loss": 1.9937, "step": 531 }, { "epoch": 0.014790267336862239, "grad_norm": 1.25604248046875, "learning_rate": 1.9989206603137503e-05, "loss": 2.4695, "step": 532 }, { "epoch": 0.014818068591254837, "grad_norm": 1.2191027402877808, "learning_rate": 1.998916599565686e-05, "loss": 2.3145, "step": 533 }, { "epoch": 0.014845869845647435, "grad_norm": 1.1769269704818726, "learning_rate": 1.998912531197319e-05, "loss": 2.0663, "step": 534 }, { "epoch": 0.014873671100040033, "grad_norm": 1.2733705043792725, "learning_rate": 1.9989084552086796e-05, "loss": 2.1585, "step": 535 }, { "epoch": 0.014901472354432631, "grad_norm": 1.1710221767425537, "learning_rate": 1.9989043715997996e-05, "loss": 2.1951, "step": 536 }, { "epoch": 0.01492927360882523, "grad_norm": 1.2470922470092773, "learning_rate": 1.9989002803707093e-05, "loss": 2.1155, "step": 537 }, { "epoch": 0.014957074863217828, "grad_norm": 1.136382818222046, "learning_rate": 1.9988961815214407e-05, "loss": 2.2543, "step": 538 }, { "epoch": 0.014984876117610426, "grad_norm": 1.2709037065505981, "learning_rate": 1.9988920750520247e-05, "loss": 2.2249, "step": 539 }, { "epoch": 0.015012677372003024, "grad_norm": 1.1925733089447021, "learning_rate": 1.9988879609624923e-05, "loss": 2.0649, "step": 540 }, { "epoch": 0.015040478626395622, "grad_norm": 1.2169063091278076, "learning_rate": 1.9988838392528757e-05, "loss": 2.0692, "step": 541 }, { "epoch": 0.01506827988078822, "grad_norm": 1.1559888124465942, "learning_rate": 1.9988797099232057e-05, "loss": 2.2054, "step": 542 }, { "epoch": 0.015096081135180819, "grad_norm": 1.2127629518508911, "learning_rate": 1.998875572973514e-05, "loss": 2.2204, "step": 543 }, { "epoch": 0.015123882389573417, "grad_norm": 1.2398165464401245, "learning_rate": 1.9988714284038322e-05, "loss": 2.1682, "step": 544 }, { "epoch": 0.015151683643966015, "grad_norm": 1.1545902490615845, "learning_rate": 1.998867276214192e-05, "loss": 2.0712, "step": 545 }, { "epoch": 0.015179484898358613, "grad_norm": 1.2115375995635986, "learning_rate": 1.9988631164046245e-05, "loss": 2.1612, "step": 546 }, { "epoch": 0.015207286152751211, "grad_norm": 1.3551746606826782, "learning_rate": 1.998858948975162e-05, "loss": 2.4521, "step": 547 }, { "epoch": 0.01523508740714381, "grad_norm": 1.15744948387146, "learning_rate": 1.9988547739258364e-05, "loss": 2.3232, "step": 548 }, { "epoch": 0.015262888661536408, "grad_norm": 1.3266716003417969, "learning_rate": 1.9988505912566793e-05, "loss": 2.0525, "step": 549 }, { "epoch": 0.015290689915929006, "grad_norm": 1.1887346506118774, "learning_rate": 1.998846400967722e-05, "loss": 2.2242, "step": 550 }, { "epoch": 0.015318491170321604, "grad_norm": 1.2454662322998047, "learning_rate": 1.9988422030589982e-05, "loss": 2.376, "step": 551 }, { "epoch": 0.015346292424714202, "grad_norm": 1.2171859741210938, "learning_rate": 1.998837997530538e-05, "loss": 2.1107, "step": 552 }, { "epoch": 0.0153740936791068, "grad_norm": 1.1975688934326172, "learning_rate": 1.9988337843823744e-05, "loss": 2.0292, "step": 553 }, { "epoch": 0.0154018949334994, "grad_norm": 1.3241117000579834, "learning_rate": 1.9988295636145395e-05, "loss": 2.3117, "step": 554 }, { "epoch": 0.015429696187891999, "grad_norm": 1.2358787059783936, "learning_rate": 1.9988253352270653e-05, "loss": 2.2417, "step": 555 }, { "epoch": 0.015457497442284597, "grad_norm": 1.113571286201477, "learning_rate": 1.9988210992199846e-05, "loss": 2.0011, "step": 556 }, { "epoch": 0.015485298696677195, "grad_norm": 1.1680878400802612, "learning_rate": 1.998816855593329e-05, "loss": 2.3552, "step": 557 }, { "epoch": 0.015513099951069793, "grad_norm": 1.211004376411438, "learning_rate": 1.9988126043471314e-05, "loss": 1.9817, "step": 558 }, { "epoch": 0.015540901205462391, "grad_norm": 1.2137372493743896, "learning_rate": 1.9988083454814235e-05, "loss": 1.9472, "step": 559 }, { "epoch": 0.01556870245985499, "grad_norm": 1.1469380855560303, "learning_rate": 1.998804078996239e-05, "loss": 1.8946, "step": 560 }, { "epoch": 0.015596503714247588, "grad_norm": 1.159514307975769, "learning_rate": 1.9987998048916092e-05, "loss": 2.0307, "step": 561 }, { "epoch": 0.015624304968640186, "grad_norm": 1.1782461404800415, "learning_rate": 1.9987955231675674e-05, "loss": 2.255, "step": 562 }, { "epoch": 0.015652106223032782, "grad_norm": 1.1787811517715454, "learning_rate": 1.998791233824146e-05, "loss": 1.9867, "step": 563 }, { "epoch": 0.01567990747742538, "grad_norm": 1.2824307680130005, "learning_rate": 1.998786936861378e-05, "loss": 2.3431, "step": 564 }, { "epoch": 0.01570770873181798, "grad_norm": 1.2485274076461792, "learning_rate": 1.998782632279296e-05, "loss": 2.2123, "step": 565 }, { "epoch": 0.015735509986210577, "grad_norm": 1.1772146224975586, "learning_rate": 1.9987783200779327e-05, "loss": 2.2834, "step": 566 }, { "epoch": 0.015763311240603175, "grad_norm": 1.2476216554641724, "learning_rate": 1.9987740002573213e-05, "loss": 2.1938, "step": 567 }, { "epoch": 0.015791112494995773, "grad_norm": 1.2516151666641235, "learning_rate": 1.9987696728174947e-05, "loss": 2.3275, "step": 568 }, { "epoch": 0.01581891374938837, "grad_norm": 1.2438257932662964, "learning_rate": 1.9987653377584855e-05, "loss": 2.1676, "step": 569 }, { "epoch": 0.01584671500378097, "grad_norm": 1.2634676694869995, "learning_rate": 1.998760995080327e-05, "loss": 2.4909, "step": 570 }, { "epoch": 0.015874516258173568, "grad_norm": 1.1434614658355713, "learning_rate": 1.9987566447830533e-05, "loss": 2.0277, "step": 571 }, { "epoch": 0.015902317512566166, "grad_norm": 1.257112741470337, "learning_rate": 1.9987522868666956e-05, "loss": 2.1011, "step": 572 }, { "epoch": 0.015930118766958764, "grad_norm": 1.1984972953796387, "learning_rate": 1.998747921331289e-05, "loss": 2.1098, "step": 573 }, { "epoch": 0.015957920021351362, "grad_norm": 1.2184396982192993, "learning_rate": 1.9987435481768654e-05, "loss": 2.2491, "step": 574 }, { "epoch": 0.01598572127574396, "grad_norm": 1.1133394241333008, "learning_rate": 1.9987391674034592e-05, "loss": 2.0023, "step": 575 }, { "epoch": 0.01601352253013656, "grad_norm": 1.268955945968628, "learning_rate": 1.9987347790111035e-05, "loss": 2.4468, "step": 576 }, { "epoch": 0.016041323784529157, "grad_norm": 1.221291422843933, "learning_rate": 1.9987303829998318e-05, "loss": 2.2031, "step": 577 }, { "epoch": 0.016069125038921755, "grad_norm": 1.3600314855575562, "learning_rate": 1.9987259793696772e-05, "loss": 2.0595, "step": 578 }, { "epoch": 0.016096926293314353, "grad_norm": 1.188693881034851, "learning_rate": 1.9987215681206734e-05, "loss": 2.2596, "step": 579 }, { "epoch": 0.01612472754770695, "grad_norm": 1.1366506814956665, "learning_rate": 1.998717149252855e-05, "loss": 2.1841, "step": 580 }, { "epoch": 0.01615252880209955, "grad_norm": 1.1239315271377563, "learning_rate": 1.9987127227662545e-05, "loss": 2.1071, "step": 581 }, { "epoch": 0.016180330056492147, "grad_norm": 1.2930082082748413, "learning_rate": 1.9987082886609062e-05, "loss": 2.3554, "step": 582 }, { "epoch": 0.016208131310884746, "grad_norm": 1.1686736345291138, "learning_rate": 1.998703846936844e-05, "loss": 2.1747, "step": 583 }, { "epoch": 0.016235932565277344, "grad_norm": 1.2470439672470093, "learning_rate": 1.9986993975941016e-05, "loss": 1.888, "step": 584 }, { "epoch": 0.016263733819669942, "grad_norm": 1.2435041666030884, "learning_rate": 1.9986949406327133e-05, "loss": 2.0442, "step": 585 }, { "epoch": 0.01629153507406254, "grad_norm": 1.2208584547042847, "learning_rate": 1.9986904760527123e-05, "loss": 2.1521, "step": 586 }, { "epoch": 0.01631933632845514, "grad_norm": 1.3955239057540894, "learning_rate": 1.9986860038541335e-05, "loss": 2.3352, "step": 587 }, { "epoch": 0.016347137582847737, "grad_norm": 1.1917351484298706, "learning_rate": 1.9986815240370106e-05, "loss": 2.4006, "step": 588 }, { "epoch": 0.016374938837240335, "grad_norm": 1.2494513988494873, "learning_rate": 1.998677036601378e-05, "loss": 2.1055, "step": 589 }, { "epoch": 0.016402740091632933, "grad_norm": 1.105431079864502, "learning_rate": 1.9986725415472698e-05, "loss": 1.9929, "step": 590 }, { "epoch": 0.01643054134602553, "grad_norm": 1.250173568725586, "learning_rate": 1.99866803887472e-05, "loss": 1.9954, "step": 591 }, { "epoch": 0.01645834260041813, "grad_norm": 1.2721790075302124, "learning_rate": 1.9986635285837633e-05, "loss": 2.3216, "step": 592 }, { "epoch": 0.016486143854810727, "grad_norm": 1.2010505199432373, "learning_rate": 1.9986590106744344e-05, "loss": 2.045, "step": 593 }, { "epoch": 0.016513945109203326, "grad_norm": 1.1454036235809326, "learning_rate": 1.998654485146767e-05, "loss": 2.209, "step": 594 }, { "epoch": 0.016541746363595924, "grad_norm": 1.0905146598815918, "learning_rate": 1.998649952000796e-05, "loss": 2.2252, "step": 595 }, { "epoch": 0.016569547617988522, "grad_norm": 1.1966474056243896, "learning_rate": 1.9986454112365565e-05, "loss": 2.0473, "step": 596 }, { "epoch": 0.016597348872381124, "grad_norm": 1.2300399541854858, "learning_rate": 1.998640862854082e-05, "loss": 1.9271, "step": 597 }, { "epoch": 0.01662515012677372, "grad_norm": 1.257317066192627, "learning_rate": 1.9986363068534084e-05, "loss": 2.3952, "step": 598 }, { "epoch": 0.01665295138116632, "grad_norm": 1.2372100353240967, "learning_rate": 1.9986317432345696e-05, "loss": 2.2366, "step": 599 }, { "epoch": 0.016680752635558918, "grad_norm": 1.2256265878677368, "learning_rate": 1.9986271719976006e-05, "loss": 2.3936, "step": 600 }, { "epoch": 0.016708553889951516, "grad_norm": 1.2395943403244019, "learning_rate": 1.9986225931425368e-05, "loss": 2.2231, "step": 601 }, { "epoch": 0.016736355144344114, "grad_norm": 1.2437233924865723, "learning_rate": 1.9986180066694125e-05, "loss": 2.1512, "step": 602 }, { "epoch": 0.016764156398736713, "grad_norm": 1.264139175415039, "learning_rate": 1.998613412578263e-05, "loss": 2.2327, "step": 603 }, { "epoch": 0.01679195765312931, "grad_norm": 1.3560538291931152, "learning_rate": 1.998608810869123e-05, "loss": 1.8434, "step": 604 }, { "epoch": 0.01681975890752191, "grad_norm": 1.2186620235443115, "learning_rate": 1.998604201542028e-05, "loss": 2.2922, "step": 605 }, { "epoch": 0.016847560161914507, "grad_norm": 1.1597272157669067, "learning_rate": 1.9985995845970132e-05, "loss": 2.4213, "step": 606 }, { "epoch": 0.016875361416307105, "grad_norm": 1.2454571723937988, "learning_rate": 1.9985949600341136e-05, "loss": 2.1212, "step": 607 }, { "epoch": 0.016903162670699703, "grad_norm": 1.18659245967865, "learning_rate": 1.998590327853364e-05, "loss": 2.3679, "step": 608 }, { "epoch": 0.0169309639250923, "grad_norm": 1.1465864181518555, "learning_rate": 1.998585688054801e-05, "loss": 1.9965, "step": 609 }, { "epoch": 0.0169587651794849, "grad_norm": 1.2818816900253296, "learning_rate": 1.9985810406384587e-05, "loss": 2.1055, "step": 610 }, { "epoch": 0.016986566433877498, "grad_norm": 1.2758877277374268, "learning_rate": 1.9985763856043734e-05, "loss": 2.0941, "step": 611 }, { "epoch": 0.017014367688270096, "grad_norm": 1.2042895555496216, "learning_rate": 1.9985717229525804e-05, "loss": 2.3862, "step": 612 }, { "epoch": 0.017042168942662694, "grad_norm": 1.153221607208252, "learning_rate": 1.9985670526831148e-05, "loss": 2.2583, "step": 613 }, { "epoch": 0.017069970197055293, "grad_norm": 1.248835802078247, "learning_rate": 1.998562374796013e-05, "loss": 2.2712, "step": 614 }, { "epoch": 0.01709777145144789, "grad_norm": 1.2551778554916382, "learning_rate": 1.99855768929131e-05, "loss": 2.0149, "step": 615 }, { "epoch": 0.01712557270584049, "grad_norm": 1.2017302513122559, "learning_rate": 1.9985529961690423e-05, "loss": 2.3387, "step": 616 }, { "epoch": 0.017153373960233087, "grad_norm": 1.219696044921875, "learning_rate": 1.998548295429245e-05, "loss": 2.3516, "step": 617 }, { "epoch": 0.017181175214625685, "grad_norm": 1.2442412376403809, "learning_rate": 1.998543587071954e-05, "loss": 2.0989, "step": 618 }, { "epoch": 0.017208976469018283, "grad_norm": 1.1917599439620972, "learning_rate": 1.9985388710972053e-05, "loss": 2.2159, "step": 619 }, { "epoch": 0.01723677772341088, "grad_norm": 1.2734211683273315, "learning_rate": 1.9985341475050353e-05, "loss": 2.0669, "step": 620 }, { "epoch": 0.01726457897780348, "grad_norm": 1.18391752243042, "learning_rate": 1.9985294162954797e-05, "loss": 2.0566, "step": 621 }, { "epoch": 0.017292380232196078, "grad_norm": 1.2043577432632446, "learning_rate": 1.9985246774685744e-05, "loss": 2.1514, "step": 622 }, { "epoch": 0.017320181486588676, "grad_norm": 1.261600375175476, "learning_rate": 1.998519931024356e-05, "loss": 2.2223, "step": 623 }, { "epoch": 0.017347982740981274, "grad_norm": 1.246983289718628, "learning_rate": 1.998515176962861e-05, "loss": 2.0185, "step": 624 }, { "epoch": 0.017375783995373872, "grad_norm": 1.2273379564285278, "learning_rate": 1.998510415284124e-05, "loss": 2.1714, "step": 625 }, { "epoch": 0.01740358524976647, "grad_norm": 1.4262818098068237, "learning_rate": 1.9985056459881833e-05, "loss": 1.8271, "step": 626 }, { "epoch": 0.01743138650415907, "grad_norm": 1.2087355852127075, "learning_rate": 1.998500869075074e-05, "loss": 1.9074, "step": 627 }, { "epoch": 0.017459187758551667, "grad_norm": 1.3462269306182861, "learning_rate": 1.9984960845448335e-05, "loss": 2.0181, "step": 628 }, { "epoch": 0.017486989012944265, "grad_norm": 1.3233671188354492, "learning_rate": 1.9984912923974976e-05, "loss": 2.1588, "step": 629 }, { "epoch": 0.017514790267336863, "grad_norm": 1.3851628303527832, "learning_rate": 1.998486492633103e-05, "loss": 1.9172, "step": 630 }, { "epoch": 0.01754259152172946, "grad_norm": 1.3632209300994873, "learning_rate": 1.9984816852516865e-05, "loss": 2.3658, "step": 631 }, { "epoch": 0.01757039277612206, "grad_norm": 1.2222546339035034, "learning_rate": 1.9984768702532845e-05, "loss": 2.0563, "step": 632 }, { "epoch": 0.017598194030514658, "grad_norm": 1.2702916860580444, "learning_rate": 1.998472047637934e-05, "loss": 2.1224, "step": 633 }, { "epoch": 0.017625995284907256, "grad_norm": 1.2321580648422241, "learning_rate": 1.9984672174056714e-05, "loss": 2.3075, "step": 634 }, { "epoch": 0.017653796539299854, "grad_norm": 1.232852578163147, "learning_rate": 1.998462379556534e-05, "loss": 2.3048, "step": 635 }, { "epoch": 0.017681597793692452, "grad_norm": 1.4964567422866821, "learning_rate": 1.9984575340905584e-05, "loss": 2.1412, "step": 636 }, { "epoch": 0.01770939904808505, "grad_norm": 1.2850028276443481, "learning_rate": 1.9984526810077817e-05, "loss": 2.0015, "step": 637 }, { "epoch": 0.01773720030247765, "grad_norm": 1.2436912059783936, "learning_rate": 1.9984478203082412e-05, "loss": 2.141, "step": 638 }, { "epoch": 0.017765001556870247, "grad_norm": 1.3351807594299316, "learning_rate": 1.9984429519919733e-05, "loss": 2.3343, "step": 639 }, { "epoch": 0.017792802811262845, "grad_norm": 1.328674077987671, "learning_rate": 1.998438076059016e-05, "loss": 2.121, "step": 640 }, { "epoch": 0.017820604065655443, "grad_norm": 1.2353153228759766, "learning_rate": 1.9984331925094056e-05, "loss": 2.233, "step": 641 }, { "epoch": 0.01784840532004804, "grad_norm": 1.2795600891113281, "learning_rate": 1.99842830134318e-05, "loss": 2.4106, "step": 642 }, { "epoch": 0.01787620657444064, "grad_norm": 1.251430869102478, "learning_rate": 1.998423402560376e-05, "loss": 2.1163, "step": 643 }, { "epoch": 0.017904007828833238, "grad_norm": 1.3501018285751343, "learning_rate": 1.9984184961610315e-05, "loss": 2.3498, "step": 644 }, { "epoch": 0.017931809083225836, "grad_norm": 1.3220409154891968, "learning_rate": 1.9984135821451837e-05, "loss": 1.9086, "step": 645 }, { "epoch": 0.017959610337618434, "grad_norm": 1.2476192712783813, "learning_rate": 1.99840866051287e-05, "loss": 2.1961, "step": 646 }, { "epoch": 0.017987411592011032, "grad_norm": 1.2205522060394287, "learning_rate": 1.998403731264128e-05, "loss": 2.0777, "step": 647 }, { "epoch": 0.01801521284640363, "grad_norm": 1.204083800315857, "learning_rate": 1.9983987943989954e-05, "loss": 2.1591, "step": 648 }, { "epoch": 0.01804301410079623, "grad_norm": 1.2453559637069702, "learning_rate": 1.9983938499175097e-05, "loss": 2.074, "step": 649 }, { "epoch": 0.018070815355188827, "grad_norm": 1.2716399431228638, "learning_rate": 1.9983888978197087e-05, "loss": 2.2509, "step": 650 }, { "epoch": 0.018098616609581425, "grad_norm": 1.231399416923523, "learning_rate": 1.99838393810563e-05, "loss": 2.3122, "step": 651 }, { "epoch": 0.018126417863974023, "grad_norm": 1.2742851972579956, "learning_rate": 1.998378970775312e-05, "loss": 2.4068, "step": 652 }, { "epoch": 0.01815421911836662, "grad_norm": 1.23708176612854, "learning_rate": 1.998373995828792e-05, "loss": 2.0379, "step": 653 }, { "epoch": 0.01818202037275922, "grad_norm": 1.2650407552719116, "learning_rate": 1.9983690132661083e-05, "loss": 2.3771, "step": 654 }, { "epoch": 0.018209821627151818, "grad_norm": 1.2577096223831177, "learning_rate": 1.9983640230872988e-05, "loss": 2.069, "step": 655 }, { "epoch": 0.018237622881544416, "grad_norm": 1.1885590553283691, "learning_rate": 1.9983590252924013e-05, "loss": 2.2888, "step": 656 }, { "epoch": 0.018265424135937014, "grad_norm": 1.1356934309005737, "learning_rate": 1.9983540198814546e-05, "loss": 1.9905, "step": 657 }, { "epoch": 0.018293225390329612, "grad_norm": 1.322825312614441, "learning_rate": 1.9983490068544957e-05, "loss": 1.9269, "step": 658 }, { "epoch": 0.01832102664472221, "grad_norm": 1.2157461643218994, "learning_rate": 1.9983439862115642e-05, "loss": 2.1992, "step": 659 }, { "epoch": 0.01834882789911481, "grad_norm": 1.126942753791809, "learning_rate": 1.9983389579526975e-05, "loss": 2.1162, "step": 660 }, { "epoch": 0.018376629153507407, "grad_norm": 1.192244529724121, "learning_rate": 1.9983339220779343e-05, "loss": 1.8943, "step": 661 }, { "epoch": 0.018404430407900005, "grad_norm": 1.2397443056106567, "learning_rate": 1.9983288785873133e-05, "loss": 2.0936, "step": 662 }, { "epoch": 0.018432231662292603, "grad_norm": 1.1834206581115723, "learning_rate": 1.998323827480872e-05, "loss": 2.6984, "step": 663 }, { "epoch": 0.0184600329166852, "grad_norm": 1.2986841201782227, "learning_rate": 1.99831876875865e-05, "loss": 1.9282, "step": 664 }, { "epoch": 0.0184878341710778, "grad_norm": 1.257500410079956, "learning_rate": 1.9983137024206853e-05, "loss": 2.3582, "step": 665 }, { "epoch": 0.018515635425470398, "grad_norm": 1.2042535543441772, "learning_rate": 1.998308628467017e-05, "loss": 2.2107, "step": 666 }, { "epoch": 0.018543436679862996, "grad_norm": 1.210256576538086, "learning_rate": 1.998303546897683e-05, "loss": 2.3727, "step": 667 }, { "epoch": 0.018571237934255594, "grad_norm": 1.1807748079299927, "learning_rate": 1.998298457712723e-05, "loss": 2.4549, "step": 668 }, { "epoch": 0.018599039188648192, "grad_norm": 1.261179804801941, "learning_rate": 1.9982933609121757e-05, "loss": 2.2887, "step": 669 }, { "epoch": 0.01862684044304079, "grad_norm": 1.1307488679885864, "learning_rate": 1.9982882564960793e-05, "loss": 2.1788, "step": 670 }, { "epoch": 0.01865464169743339, "grad_norm": 1.2333331108093262, "learning_rate": 1.998283144464473e-05, "loss": 2.1402, "step": 671 }, { "epoch": 0.018682442951825987, "grad_norm": 1.321232557296753, "learning_rate": 1.9982780248173962e-05, "loss": 1.889, "step": 672 }, { "epoch": 0.018710244206218585, "grad_norm": 1.2325247526168823, "learning_rate": 1.9982728975548876e-05, "loss": 2.5717, "step": 673 }, { "epoch": 0.018738045460611183, "grad_norm": 1.1923264265060425, "learning_rate": 1.998267762676986e-05, "loss": 2.5212, "step": 674 }, { "epoch": 0.01876584671500378, "grad_norm": 1.1908127069473267, "learning_rate": 1.9982626201837318e-05, "loss": 2.4925, "step": 675 }, { "epoch": 0.01879364796939638, "grad_norm": 1.3857849836349487, "learning_rate": 1.9982574700751627e-05, "loss": 2.0457, "step": 676 }, { "epoch": 0.018821449223788977, "grad_norm": 1.0952517986297607, "learning_rate": 1.9982523123513186e-05, "loss": 2.1416, "step": 677 }, { "epoch": 0.018849250478181576, "grad_norm": 1.2316055297851562, "learning_rate": 1.9982471470122396e-05, "loss": 2.3593, "step": 678 }, { "epoch": 0.018877051732574174, "grad_norm": 1.2756849527359009, "learning_rate": 1.998241974057964e-05, "loss": 2.1556, "step": 679 }, { "epoch": 0.018904852986966772, "grad_norm": 1.2000385522842407, "learning_rate": 1.9982367934885317e-05, "loss": 2.0713, "step": 680 }, { "epoch": 0.01893265424135937, "grad_norm": 1.192880630493164, "learning_rate": 1.998231605303982e-05, "loss": 2.058, "step": 681 }, { "epoch": 0.01896045549575197, "grad_norm": 1.1940876245498657, "learning_rate": 1.9982264095043553e-05, "loss": 2.2095, "step": 682 }, { "epoch": 0.018988256750144566, "grad_norm": 1.1619784832000732, "learning_rate": 1.99822120608969e-05, "loss": 2.1234, "step": 683 }, { "epoch": 0.019016058004537165, "grad_norm": 1.1772998571395874, "learning_rate": 1.998215995060027e-05, "loss": 2.045, "step": 684 }, { "epoch": 0.019043859258929763, "grad_norm": 1.2283533811569214, "learning_rate": 1.9982107764154052e-05, "loss": 2.2667, "step": 685 }, { "epoch": 0.01907166051332236, "grad_norm": 1.168361783027649, "learning_rate": 1.9982055501558647e-05, "loss": 2.0106, "step": 686 }, { "epoch": 0.01909946176771496, "grad_norm": 1.2390023469924927, "learning_rate": 1.9982003162814455e-05, "loss": 2.2291, "step": 687 }, { "epoch": 0.019127263022107557, "grad_norm": 1.2544379234313965, "learning_rate": 1.9981950747921872e-05, "loss": 2.1755, "step": 688 }, { "epoch": 0.019155064276500156, "grad_norm": 1.1726070642471313, "learning_rate": 1.9981898256881302e-05, "loss": 1.9248, "step": 689 }, { "epoch": 0.019182865530892754, "grad_norm": 1.0936195850372314, "learning_rate": 1.9981845689693137e-05, "loss": 2.1226, "step": 690 }, { "epoch": 0.019210666785285352, "grad_norm": 1.2380118370056152, "learning_rate": 1.9981793046357788e-05, "loss": 2.3945, "step": 691 }, { "epoch": 0.01923846803967795, "grad_norm": 1.2669039964675903, "learning_rate": 1.9981740326875657e-05, "loss": 2.0678, "step": 692 }, { "epoch": 0.019266269294070548, "grad_norm": 1.1941074132919312, "learning_rate": 1.9981687531247135e-05, "loss": 2.2239, "step": 693 }, { "epoch": 0.019294070548463146, "grad_norm": 1.2309935092926025, "learning_rate": 1.9981634659472637e-05, "loss": 1.9371, "step": 694 }, { "epoch": 0.019321871802855745, "grad_norm": 1.1997584104537964, "learning_rate": 1.998158171155256e-05, "loss": 2.1311, "step": 695 }, { "epoch": 0.019349673057248343, "grad_norm": 1.250409483909607, "learning_rate": 1.9981528687487307e-05, "loss": 2.0872, "step": 696 }, { "epoch": 0.01937747431164094, "grad_norm": 1.2198785543441772, "learning_rate": 1.9981475587277287e-05, "loss": 2.0706, "step": 697 }, { "epoch": 0.01940527556603354, "grad_norm": 1.185333490371704, "learning_rate": 1.9981422410922898e-05, "loss": 2.0161, "step": 698 }, { "epoch": 0.019433076820426137, "grad_norm": 1.1606931686401367, "learning_rate": 1.9981369158424557e-05, "loss": 1.9743, "step": 699 }, { "epoch": 0.019460878074818735, "grad_norm": 1.227649450302124, "learning_rate": 1.9981315829782656e-05, "loss": 2.137, "step": 700 }, { "epoch": 0.019488679329211334, "grad_norm": 1.1613130569458008, "learning_rate": 1.9981262424997613e-05, "loss": 2.252, "step": 701 }, { "epoch": 0.019516480583603932, "grad_norm": 1.1844148635864258, "learning_rate": 1.9981208944069834e-05, "loss": 2.1621, "step": 702 }, { "epoch": 0.01954428183799653, "grad_norm": 1.2287095785140991, "learning_rate": 1.9981155386999722e-05, "loss": 2.2433, "step": 703 }, { "epoch": 0.019572083092389128, "grad_norm": 1.2302637100219727, "learning_rate": 1.9981101753787685e-05, "loss": 2.2381, "step": 704 }, { "epoch": 0.019599884346781726, "grad_norm": 1.3193657398223877, "learning_rate": 1.998104804443414e-05, "loss": 1.7743, "step": 705 }, { "epoch": 0.019627685601174324, "grad_norm": 1.1896389722824097, "learning_rate": 1.998099425893949e-05, "loss": 2.0243, "step": 706 }, { "epoch": 0.019655486855566923, "grad_norm": 1.2087386846542358, "learning_rate": 1.9980940397304148e-05, "loss": 2.0851, "step": 707 }, { "epoch": 0.01968328810995952, "grad_norm": 1.2358545064926147, "learning_rate": 1.9980886459528525e-05, "loss": 2.2631, "step": 708 }, { "epoch": 0.01971108936435212, "grad_norm": 1.1981942653656006, "learning_rate": 1.9980832445613033e-05, "loss": 2.0958, "step": 709 }, { "epoch": 0.019738890618744717, "grad_norm": 1.314407467842102, "learning_rate": 1.998077835555808e-05, "loss": 2.228, "step": 710 }, { "epoch": 0.019766691873137315, "grad_norm": 1.1909462213516235, "learning_rate": 1.998072418936408e-05, "loss": 2.1032, "step": 711 }, { "epoch": 0.019794493127529914, "grad_norm": 1.1363677978515625, "learning_rate": 1.998066994703145e-05, "loss": 2.0222, "step": 712 }, { "epoch": 0.01982229438192251, "grad_norm": 1.2856818437576294, "learning_rate": 1.9980615628560603e-05, "loss": 2.2121, "step": 713 }, { "epoch": 0.01985009563631511, "grad_norm": 1.2553746700286865, "learning_rate": 1.998056123395195e-05, "loss": 2.1594, "step": 714 }, { "epoch": 0.019877896890707708, "grad_norm": 1.285005807876587, "learning_rate": 1.9980506763205907e-05, "loss": 2.3685, "step": 715 }, { "epoch": 0.019905698145100306, "grad_norm": 1.2738608121871948, "learning_rate": 1.998045221632289e-05, "loss": 2.3711, "step": 716 }, { "epoch": 0.019933499399492904, "grad_norm": 1.2335405349731445, "learning_rate": 1.9980397593303314e-05, "loss": 2.2163, "step": 717 }, { "epoch": 0.019961300653885503, "grad_norm": 1.2515232563018799, "learning_rate": 1.99803428941476e-05, "loss": 2.244, "step": 718 }, { "epoch": 0.0199891019082781, "grad_norm": 1.1644277572631836, "learning_rate": 1.9980288118856164e-05, "loss": 2.225, "step": 719 }, { "epoch": 0.0200169031626707, "grad_norm": 1.3331873416900635, "learning_rate": 1.9980233267429422e-05, "loss": 2.4956, "step": 720 }, { "epoch": 0.020044704417063297, "grad_norm": 1.2012351751327515, "learning_rate": 1.9980178339867788e-05, "loss": 2.165, "step": 721 }, { "epoch": 0.020072505671455895, "grad_norm": 1.2640992403030396, "learning_rate": 1.9980123336171688e-05, "loss": 2.128, "step": 722 }, { "epoch": 0.020100306925848493, "grad_norm": 1.1183795928955078, "learning_rate": 1.998006825634154e-05, "loss": 2.3794, "step": 723 }, { "epoch": 0.02012810818024109, "grad_norm": 1.225666880607605, "learning_rate": 1.9980013100377762e-05, "loss": 2.3802, "step": 724 }, { "epoch": 0.02015590943463369, "grad_norm": 1.163535714149475, "learning_rate": 1.9979957868280776e-05, "loss": 2.0247, "step": 725 }, { "epoch": 0.020183710689026288, "grad_norm": 1.2289594411849976, "learning_rate": 1.9979902560051006e-05, "loss": 2.3907, "step": 726 }, { "epoch": 0.020211511943418886, "grad_norm": 1.4199200868606567, "learning_rate": 1.997984717568887e-05, "loss": 2.4884, "step": 727 }, { "epoch": 0.020239313197811484, "grad_norm": 1.2021894454956055, "learning_rate": 1.9979791715194793e-05, "loss": 2.1647, "step": 728 }, { "epoch": 0.020267114452204082, "grad_norm": 1.1944386959075928, "learning_rate": 1.9979736178569195e-05, "loss": 2.2057, "step": 729 }, { "epoch": 0.02029491570659668, "grad_norm": 1.192878246307373, "learning_rate": 1.9979680565812503e-05, "loss": 2.1056, "step": 730 }, { "epoch": 0.02032271696098928, "grad_norm": 1.2096731662750244, "learning_rate": 1.997962487692514e-05, "loss": 2.2514, "step": 731 }, { "epoch": 0.020350518215381877, "grad_norm": 1.216076135635376, "learning_rate": 1.997956911190753e-05, "loss": 1.9725, "step": 732 }, { "epoch": 0.020378319469774475, "grad_norm": 1.2456984519958496, "learning_rate": 1.9979513270760102e-05, "loss": 2.4213, "step": 733 }, { "epoch": 0.020406120724167073, "grad_norm": 1.1114468574523926, "learning_rate": 1.9979457353483274e-05, "loss": 2.3038, "step": 734 }, { "epoch": 0.02043392197855967, "grad_norm": 1.2675299644470215, "learning_rate": 1.997940136007748e-05, "loss": 2.063, "step": 735 }, { "epoch": 0.02046172323295227, "grad_norm": 1.2021836042404175, "learning_rate": 1.9979345290543146e-05, "loss": 2.4236, "step": 736 }, { "epoch": 0.020489524487344868, "grad_norm": 1.210781216621399, "learning_rate": 1.9979289144880696e-05, "loss": 2.1191, "step": 737 }, { "epoch": 0.020517325741737466, "grad_norm": 1.3336102962493896, "learning_rate": 1.9979232923090564e-05, "loss": 2.1687, "step": 738 }, { "epoch": 0.020545126996130064, "grad_norm": 1.1894129514694214, "learning_rate": 1.9979176625173176e-05, "loss": 2.4436, "step": 739 }, { "epoch": 0.020572928250522662, "grad_norm": 1.2080823183059692, "learning_rate": 1.9979120251128963e-05, "loss": 2.0472, "step": 740 }, { "epoch": 0.02060072950491526, "grad_norm": 1.1765656471252441, "learning_rate": 1.9979063800958348e-05, "loss": 2.4114, "step": 741 }, { "epoch": 0.02062853075930786, "grad_norm": 1.1623668670654297, "learning_rate": 1.997900727466177e-05, "loss": 2.2214, "step": 742 }, { "epoch": 0.020656332013700457, "grad_norm": 1.1521472930908203, "learning_rate": 1.9978950672239658e-05, "loss": 1.942, "step": 743 }, { "epoch": 0.020684133268093055, "grad_norm": 1.2251015901565552, "learning_rate": 1.997889399369244e-05, "loss": 2.2925, "step": 744 }, { "epoch": 0.020711934522485653, "grad_norm": 1.1769088506698608, "learning_rate": 1.9978837239020555e-05, "loss": 2.1981, "step": 745 }, { "epoch": 0.02073973577687825, "grad_norm": 1.2776894569396973, "learning_rate": 1.997878040822443e-05, "loss": 2.3054, "step": 746 }, { "epoch": 0.02076753703127085, "grad_norm": 1.174149751663208, "learning_rate": 1.9978723501304502e-05, "loss": 2.073, "step": 747 }, { "epoch": 0.020795338285663448, "grad_norm": 1.321775197982788, "learning_rate": 1.9978666518261204e-05, "loss": 2.3072, "step": 748 }, { "epoch": 0.020823139540056046, "grad_norm": 1.2209161520004272, "learning_rate": 1.9978609459094968e-05, "loss": 2.0594, "step": 749 }, { "epoch": 0.020850940794448644, "grad_norm": 1.1712366342544556, "learning_rate": 1.9978552323806236e-05, "loss": 2.0999, "step": 750 }, { "epoch": 0.020878742048841242, "grad_norm": 1.286587119102478, "learning_rate": 1.9978495112395436e-05, "loss": 2.4217, "step": 751 }, { "epoch": 0.02090654330323384, "grad_norm": 1.1362825632095337, "learning_rate": 1.997843782486301e-05, "loss": 2.1712, "step": 752 }, { "epoch": 0.02093434455762644, "grad_norm": 1.2130725383758545, "learning_rate": 1.9978380461209396e-05, "loss": 1.9151, "step": 753 }, { "epoch": 0.020962145812019037, "grad_norm": 1.259271502494812, "learning_rate": 1.9978323021435027e-05, "loss": 2.0506, "step": 754 }, { "epoch": 0.020989947066411635, "grad_norm": 1.2244856357574463, "learning_rate": 1.997826550554034e-05, "loss": 2.1452, "step": 755 }, { "epoch": 0.021017748320804233, "grad_norm": 1.2241125106811523, "learning_rate": 1.997820791352578e-05, "loss": 2.2393, "step": 756 }, { "epoch": 0.02104554957519683, "grad_norm": 1.1169238090515137, "learning_rate": 1.9978150245391782e-05, "loss": 2.2939, "step": 757 }, { "epoch": 0.02107335082958943, "grad_norm": 1.1493014097213745, "learning_rate": 1.997809250113879e-05, "loss": 2.1045, "step": 758 }, { "epoch": 0.021101152083982028, "grad_norm": 1.3108172416687012, "learning_rate": 1.9978034680767234e-05, "loss": 1.9569, "step": 759 }, { "epoch": 0.021128953338374626, "grad_norm": 1.261904239654541, "learning_rate": 1.9977976784277572e-05, "loss": 2.2159, "step": 760 }, { "epoch": 0.021156754592767224, "grad_norm": 1.2068402767181396, "learning_rate": 1.9977918811670233e-05, "loss": 1.9211, "step": 761 }, { "epoch": 0.021184555847159822, "grad_norm": 1.1185142993927002, "learning_rate": 1.997786076294566e-05, "loss": 2.3487, "step": 762 }, { "epoch": 0.02121235710155242, "grad_norm": 1.133130431175232, "learning_rate": 1.99778026381043e-05, "loss": 1.9135, "step": 763 }, { "epoch": 0.02124015835594502, "grad_norm": 1.1313711404800415, "learning_rate": 1.9977744437146596e-05, "loss": 2.2874, "step": 764 }, { "epoch": 0.021267959610337617, "grad_norm": 1.2762500047683716, "learning_rate": 1.997768616007299e-05, "loss": 2.0451, "step": 765 }, { "epoch": 0.021295760864730215, "grad_norm": 1.293135643005371, "learning_rate": 1.9977627806883925e-05, "loss": 2.1638, "step": 766 }, { "epoch": 0.021323562119122817, "grad_norm": 1.1720689535140991, "learning_rate": 1.9977569377579853e-05, "loss": 2.0025, "step": 767 }, { "epoch": 0.021351363373515415, "grad_norm": 1.1355785131454468, "learning_rate": 1.9977510872161213e-05, "loss": 2.0474, "step": 768 }, { "epoch": 0.021379164627908013, "grad_norm": 1.1804016828536987, "learning_rate": 1.9977452290628454e-05, "loss": 1.9131, "step": 769 }, { "epoch": 0.02140696588230061, "grad_norm": 1.206207275390625, "learning_rate": 1.9977393632982023e-05, "loss": 2.1392, "step": 770 }, { "epoch": 0.02143476713669321, "grad_norm": 1.1796706914901733, "learning_rate": 1.9977334899222366e-05, "loss": 2.0633, "step": 771 }, { "epoch": 0.021462568391085807, "grad_norm": 1.3576617240905762, "learning_rate": 1.997727608934993e-05, "loss": 2.3956, "step": 772 }, { "epoch": 0.021490369645478406, "grad_norm": 1.3414121866226196, "learning_rate": 1.9977217203365173e-05, "loss": 2.0177, "step": 773 }, { "epoch": 0.021518170899871004, "grad_norm": 1.2675609588623047, "learning_rate": 1.997715824126853e-05, "loss": 2.1115, "step": 774 }, { "epoch": 0.021545972154263602, "grad_norm": 1.2006562948226929, "learning_rate": 1.9977099203060458e-05, "loss": 2.3346, "step": 775 }, { "epoch": 0.0215737734086562, "grad_norm": 1.1671146154403687, "learning_rate": 1.997704008874141e-05, "loss": 2.1002, "step": 776 }, { "epoch": 0.0216015746630488, "grad_norm": 1.205952525138855, "learning_rate": 1.9976980898311835e-05, "loss": 2.2487, "step": 777 }, { "epoch": 0.021629375917441396, "grad_norm": 1.2839131355285645, "learning_rate": 1.9976921631772178e-05, "loss": 2.2774, "step": 778 }, { "epoch": 0.021657177171833995, "grad_norm": 1.1342172622680664, "learning_rate": 1.9976862289122904e-05, "loss": 2.4841, "step": 779 }, { "epoch": 0.021684978426226593, "grad_norm": 1.2011606693267822, "learning_rate": 1.9976802870364453e-05, "loss": 2.0749, "step": 780 }, { "epoch": 0.02171277968061919, "grad_norm": 1.3241729736328125, "learning_rate": 1.9976743375497284e-05, "loss": 2.4125, "step": 781 }, { "epoch": 0.02174058093501179, "grad_norm": 1.259620189666748, "learning_rate": 1.9976683804521853e-05, "loss": 2.2754, "step": 782 }, { "epoch": 0.021768382189404387, "grad_norm": 1.1774060726165771, "learning_rate": 1.997662415743861e-05, "loss": 1.8514, "step": 783 }, { "epoch": 0.021796183443796985, "grad_norm": 1.1984117031097412, "learning_rate": 1.997656443424801e-05, "loss": 2.3338, "step": 784 }, { "epoch": 0.021823984698189584, "grad_norm": 1.1380529403686523, "learning_rate": 1.9976504634950516e-05, "loss": 2.1094, "step": 785 }, { "epoch": 0.021851785952582182, "grad_norm": 1.169572353363037, "learning_rate": 1.9976444759546574e-05, "loss": 2.1083, "step": 786 }, { "epoch": 0.02187958720697478, "grad_norm": 1.3705956935882568, "learning_rate": 1.997638480803665e-05, "loss": 2.4184, "step": 787 }, { "epoch": 0.021907388461367378, "grad_norm": 1.205291748046875, "learning_rate": 1.997632478042119e-05, "loss": 2.316, "step": 788 }, { "epoch": 0.021935189715759976, "grad_norm": 1.2233227491378784, "learning_rate": 1.9976264676700668e-05, "loss": 2.2446, "step": 789 }, { "epoch": 0.021962990970152575, "grad_norm": 1.17822265625, "learning_rate": 1.9976204496875525e-05, "loss": 1.9613, "step": 790 }, { "epoch": 0.021990792224545173, "grad_norm": 1.2377842664718628, "learning_rate": 1.9976144240946234e-05, "loss": 2.3576, "step": 791 }, { "epoch": 0.02201859347893777, "grad_norm": 1.2520421743392944, "learning_rate": 1.9976083908913248e-05, "loss": 2.1096, "step": 792 }, { "epoch": 0.02204639473333037, "grad_norm": 1.255120038986206, "learning_rate": 1.9976023500777025e-05, "loss": 2.0929, "step": 793 }, { "epoch": 0.022074195987722967, "grad_norm": 1.2712846994400024, "learning_rate": 1.9975963016538032e-05, "loss": 2.2336, "step": 794 }, { "epoch": 0.022101997242115565, "grad_norm": 1.3197938203811646, "learning_rate": 1.9975902456196724e-05, "loss": 2.5907, "step": 795 }, { "epoch": 0.022129798496508164, "grad_norm": 1.227813720703125, "learning_rate": 1.9975841819753572e-05, "loss": 2.2756, "step": 796 }, { "epoch": 0.022157599750900762, "grad_norm": 1.2126960754394531, "learning_rate": 1.9975781107209032e-05, "loss": 2.2228, "step": 797 }, { "epoch": 0.02218540100529336, "grad_norm": 1.2741320133209229, "learning_rate": 1.9975720318563566e-05, "loss": 2.1291, "step": 798 }, { "epoch": 0.022213202259685958, "grad_norm": 1.2088127136230469, "learning_rate": 1.997565945381764e-05, "loss": 2.2795, "step": 799 }, { "epoch": 0.022241003514078556, "grad_norm": 1.3392537832260132, "learning_rate": 1.9975598512971718e-05, "loss": 2.2347, "step": 800 }, { "epoch": 0.022268804768471154, "grad_norm": 1.172756552696228, "learning_rate": 1.9975537496026268e-05, "loss": 1.9746, "step": 801 }, { "epoch": 0.022296606022863753, "grad_norm": 1.2117072343826294, "learning_rate": 1.9975476402981752e-05, "loss": 2.1345, "step": 802 }, { "epoch": 0.02232440727725635, "grad_norm": 1.2100775241851807, "learning_rate": 1.997541523383863e-05, "loss": 2.1182, "step": 803 }, { "epoch": 0.02235220853164895, "grad_norm": 1.256800651550293, "learning_rate": 1.9975353988597384e-05, "loss": 1.9512, "step": 804 }, { "epoch": 0.022380009786041547, "grad_norm": 1.2266587018966675, "learning_rate": 1.997529266725847e-05, "loss": 2.3236, "step": 805 }, { "epoch": 0.022407811040434145, "grad_norm": 1.1271886825561523, "learning_rate": 1.997523126982236e-05, "loss": 1.9396, "step": 806 }, { "epoch": 0.022435612294826743, "grad_norm": 1.2485790252685547, "learning_rate": 1.9975169796289514e-05, "loss": 2.0774, "step": 807 }, { "epoch": 0.02246341354921934, "grad_norm": 1.2536906003952026, "learning_rate": 1.9975108246660412e-05, "loss": 2.1345, "step": 808 }, { "epoch": 0.02249121480361194, "grad_norm": 1.3038263320922852, "learning_rate": 1.997504662093552e-05, "loss": 1.9354, "step": 809 }, { "epoch": 0.022519016058004538, "grad_norm": 1.2137815952301025, "learning_rate": 1.9974984919115306e-05, "loss": 2.0208, "step": 810 }, { "epoch": 0.022546817312397136, "grad_norm": 1.2528146505355835, "learning_rate": 1.9974923141200244e-05, "loss": 2.1669, "step": 811 }, { "epoch": 0.022574618566789734, "grad_norm": 1.2399553060531616, "learning_rate": 1.99748612871908e-05, "loss": 1.9972, "step": 812 }, { "epoch": 0.022602419821182333, "grad_norm": 1.2212539911270142, "learning_rate": 1.9974799357087452e-05, "loss": 2.4435, "step": 813 }, { "epoch": 0.02263022107557493, "grad_norm": 1.1948703527450562, "learning_rate": 1.997473735089067e-05, "loss": 2.1545, "step": 814 }, { "epoch": 0.02265802232996753, "grad_norm": 1.3263567686080933, "learning_rate": 1.9974675268600926e-05, "loss": 2.1599, "step": 815 }, { "epoch": 0.022685823584360127, "grad_norm": 1.3392012119293213, "learning_rate": 1.997461311021869e-05, "loss": 2.1442, "step": 816 }, { "epoch": 0.022713624838752725, "grad_norm": 1.1028655767440796, "learning_rate": 1.9974550875744447e-05, "loss": 1.9841, "step": 817 }, { "epoch": 0.022741426093145323, "grad_norm": 1.3606761693954468, "learning_rate": 1.997448856517866e-05, "loss": 2.2069, "step": 818 }, { "epoch": 0.02276922734753792, "grad_norm": 1.2940837144851685, "learning_rate": 1.997442617852181e-05, "loss": 2.4033, "step": 819 }, { "epoch": 0.02279702860193052, "grad_norm": 1.3072127103805542, "learning_rate": 1.9974363715774376e-05, "loss": 2.3834, "step": 820 }, { "epoch": 0.022824829856323118, "grad_norm": 1.354925513267517, "learning_rate": 1.997430117693683e-05, "loss": 2.4105, "step": 821 }, { "epoch": 0.022852631110715716, "grad_norm": 1.1742846965789795, "learning_rate": 1.997423856200965e-05, "loss": 2.2817, "step": 822 }, { "epoch": 0.022880432365108314, "grad_norm": 1.412327527999878, "learning_rate": 1.997417587099331e-05, "loss": 1.8978, "step": 823 }, { "epoch": 0.022908233619500912, "grad_norm": 1.2942640781402588, "learning_rate": 1.9974113103888294e-05, "loss": 2.2575, "step": 824 }, { "epoch": 0.02293603487389351, "grad_norm": 1.2054380178451538, "learning_rate": 1.997405026069508e-05, "loss": 2.4047, "step": 825 }, { "epoch": 0.02296383612828611, "grad_norm": 1.2491945028305054, "learning_rate": 1.9973987341414146e-05, "loss": 2.0614, "step": 826 }, { "epoch": 0.022991637382678707, "grad_norm": 1.2348061800003052, "learning_rate": 1.997392434604597e-05, "loss": 2.141, "step": 827 }, { "epoch": 0.023019438637071305, "grad_norm": 1.266965627670288, "learning_rate": 1.9973861274591034e-05, "loss": 2.1899, "step": 828 }, { "epoch": 0.023047239891463903, "grad_norm": 1.2975566387176514, "learning_rate": 1.9973798127049822e-05, "loss": 2.2302, "step": 829 }, { "epoch": 0.0230750411458565, "grad_norm": 1.2624861001968384, "learning_rate": 1.9973734903422815e-05, "loss": 1.9821, "step": 830 }, { "epoch": 0.0231028424002491, "grad_norm": 1.1645833253860474, "learning_rate": 1.997367160371049e-05, "loss": 2.1387, "step": 831 }, { "epoch": 0.023130643654641698, "grad_norm": 1.211867094039917, "learning_rate": 1.997360822791334e-05, "loss": 2.0474, "step": 832 }, { "epoch": 0.023158444909034296, "grad_norm": 1.1930607557296753, "learning_rate": 1.997354477603184e-05, "loss": 2.3738, "step": 833 }, { "epoch": 0.023186246163426894, "grad_norm": 1.1596156358718872, "learning_rate": 1.997348124806647e-05, "loss": 2.1786, "step": 834 }, { "epoch": 0.023214047417819492, "grad_norm": 1.2294217348098755, "learning_rate": 1.9973417644017728e-05, "loss": 2.1692, "step": 835 }, { "epoch": 0.02324184867221209, "grad_norm": 1.1390455961227417, "learning_rate": 1.9973353963886088e-05, "loss": 2.1287, "step": 836 }, { "epoch": 0.02326964992660469, "grad_norm": 1.1800559759140015, "learning_rate": 1.997329020767204e-05, "loss": 1.8649, "step": 837 }, { "epoch": 0.023297451180997287, "grad_norm": 1.2710946798324585, "learning_rate": 1.9973226375376073e-05, "loss": 2.5752, "step": 838 }, { "epoch": 0.023325252435389885, "grad_norm": 1.2589893341064453, "learning_rate": 1.997316246699867e-05, "loss": 2.1195, "step": 839 }, { "epoch": 0.023353053689782483, "grad_norm": 1.130064845085144, "learning_rate": 1.997309848254032e-05, "loss": 2.0024, "step": 840 }, { "epoch": 0.02338085494417508, "grad_norm": 1.2818498611450195, "learning_rate": 1.997303442200151e-05, "loss": 2.0472, "step": 841 }, { "epoch": 0.02340865619856768, "grad_norm": 1.2507104873657227, "learning_rate": 1.9972970285382728e-05, "loss": 2.0062, "step": 842 }, { "epoch": 0.023436457452960278, "grad_norm": 1.2920022010803223, "learning_rate": 1.997290607268447e-05, "loss": 1.8601, "step": 843 }, { "epoch": 0.023464258707352876, "grad_norm": 1.1889721155166626, "learning_rate": 1.9972841783907216e-05, "loss": 1.9619, "step": 844 }, { "epoch": 0.023492059961745474, "grad_norm": 1.2898443937301636, "learning_rate": 1.997277741905146e-05, "loss": 2.3165, "step": 845 }, { "epoch": 0.023519861216138072, "grad_norm": 1.2144360542297363, "learning_rate": 1.9972712978117695e-05, "loss": 2.0499, "step": 846 }, { "epoch": 0.02354766247053067, "grad_norm": 1.2242858409881592, "learning_rate": 1.9972648461106417e-05, "loss": 2.4849, "step": 847 }, { "epoch": 0.02357546372492327, "grad_norm": 1.2607983350753784, "learning_rate": 1.9972583868018107e-05, "loss": 2.2616, "step": 848 }, { "epoch": 0.023603264979315867, "grad_norm": 1.2975778579711914, "learning_rate": 1.9972519198853267e-05, "loss": 1.8682, "step": 849 }, { "epoch": 0.023631066233708465, "grad_norm": 1.2547056674957275, "learning_rate": 1.9972454453612384e-05, "loss": 2.1272, "step": 850 }, { "epoch": 0.023658867488101063, "grad_norm": 1.2257258892059326, "learning_rate": 1.9972389632295955e-05, "loss": 2.3855, "step": 851 }, { "epoch": 0.02368666874249366, "grad_norm": 1.2778208255767822, "learning_rate": 1.9972324734904478e-05, "loss": 2.2279, "step": 852 }, { "epoch": 0.02371446999688626, "grad_norm": 1.2723045349121094, "learning_rate": 1.9972259761438438e-05, "loss": 1.9987, "step": 853 }, { "epoch": 0.023742271251278858, "grad_norm": 1.2511473894119263, "learning_rate": 1.9972194711898343e-05, "loss": 2.4474, "step": 854 }, { "epoch": 0.023770072505671456, "grad_norm": 1.1924453973770142, "learning_rate": 1.997212958628468e-05, "loss": 2.2983, "step": 855 }, { "epoch": 0.023797873760064054, "grad_norm": 1.209864854812622, "learning_rate": 1.9972064384597952e-05, "loss": 2.4734, "step": 856 }, { "epoch": 0.023825675014456652, "grad_norm": 1.186599850654602, "learning_rate": 1.9971999106838653e-05, "loss": 2.0542, "step": 857 }, { "epoch": 0.02385347626884925, "grad_norm": 1.2470898628234863, "learning_rate": 1.9971933753007284e-05, "loss": 2.0414, "step": 858 }, { "epoch": 0.02388127752324185, "grad_norm": 1.1702929735183716, "learning_rate": 1.9971868323104333e-05, "loss": 2.2207, "step": 859 }, { "epoch": 0.023909078777634447, "grad_norm": 1.2147670984268188, "learning_rate": 1.9971802817130312e-05, "loss": 2.1178, "step": 860 }, { "epoch": 0.023936880032027045, "grad_norm": 1.231812834739685, "learning_rate": 1.9971737235085716e-05, "loss": 2.2784, "step": 861 }, { "epoch": 0.023964681286419643, "grad_norm": 1.1119379997253418, "learning_rate": 1.9971671576971045e-05, "loss": 1.9365, "step": 862 }, { "epoch": 0.02399248254081224, "grad_norm": 1.1899245977401733, "learning_rate": 1.99716058427868e-05, "loss": 2.0711, "step": 863 }, { "epoch": 0.02402028379520484, "grad_norm": 1.222233533859253, "learning_rate": 1.997154003253348e-05, "loss": 2.427, "step": 864 }, { "epoch": 0.024048085049597438, "grad_norm": 1.250306248664856, "learning_rate": 1.9971474146211594e-05, "loss": 2.3759, "step": 865 }, { "epoch": 0.024075886303990036, "grad_norm": 1.2401456832885742, "learning_rate": 1.997140818382164e-05, "loss": 2.2159, "step": 866 }, { "epoch": 0.024103687558382634, "grad_norm": 1.1653214693069458, "learning_rate": 1.997134214536412e-05, "loss": 2.1235, "step": 867 }, { "epoch": 0.024131488812775232, "grad_norm": 1.2160677909851074, "learning_rate": 1.997127603083954e-05, "loss": 2.2018, "step": 868 }, { "epoch": 0.02415929006716783, "grad_norm": 1.1776270866394043, "learning_rate": 1.99712098402484e-05, "loss": 2.2642, "step": 869 }, { "epoch": 0.02418709132156043, "grad_norm": 1.2102035284042358, "learning_rate": 1.997114357359121e-05, "loss": 2.1751, "step": 870 }, { "epoch": 0.024214892575953027, "grad_norm": 1.2097270488739014, "learning_rate": 1.9971077230868475e-05, "loss": 2.0304, "step": 871 }, { "epoch": 0.024242693830345625, "grad_norm": 1.1696282625198364, "learning_rate": 1.99710108120807e-05, "loss": 2.0043, "step": 872 }, { "epoch": 0.024270495084738223, "grad_norm": 1.1803638935089111, "learning_rate": 1.997094431722839e-05, "loss": 2.0164, "step": 873 }, { "epoch": 0.02429829633913082, "grad_norm": 1.192821741104126, "learning_rate": 1.9970877746312055e-05, "loss": 2.0407, "step": 874 }, { "epoch": 0.02432609759352342, "grad_norm": 1.2372760772705078, "learning_rate": 1.9970811099332206e-05, "loss": 2.2748, "step": 875 }, { "epoch": 0.024353898847916017, "grad_norm": 1.2641793489456177, "learning_rate": 1.997074437628934e-05, "loss": 2.0915, "step": 876 }, { "epoch": 0.024381700102308616, "grad_norm": 1.2403004169464111, "learning_rate": 1.997067757718398e-05, "loss": 2.2518, "step": 877 }, { "epoch": 0.024409501356701214, "grad_norm": 1.3031015396118164, "learning_rate": 1.9970610702016623e-05, "loss": 2.3842, "step": 878 }, { "epoch": 0.024437302611093812, "grad_norm": 1.2453633546829224, "learning_rate": 1.997054375078779e-05, "loss": 2.3276, "step": 879 }, { "epoch": 0.02446510386548641, "grad_norm": 1.3694889545440674, "learning_rate": 1.9970476723497985e-05, "loss": 1.8899, "step": 880 }, { "epoch": 0.02449290511987901, "grad_norm": 1.1380016803741455, "learning_rate": 1.997040962014772e-05, "loss": 1.9125, "step": 881 }, { "epoch": 0.024520706374271607, "grad_norm": 1.2116196155548096, "learning_rate": 1.997034244073751e-05, "loss": 2.3157, "step": 882 }, { "epoch": 0.024548507628664205, "grad_norm": 1.3349127769470215, "learning_rate": 1.9970275185267863e-05, "loss": 2.129, "step": 883 }, { "epoch": 0.024576308883056803, "grad_norm": 1.1835367679595947, "learning_rate": 1.9970207853739297e-05, "loss": 2.2019, "step": 884 }, { "epoch": 0.0246041101374494, "grad_norm": 1.2861647605895996, "learning_rate": 1.997014044615232e-05, "loss": 2.4828, "step": 885 }, { "epoch": 0.024631911391842, "grad_norm": 1.1594455242156982, "learning_rate": 1.9970072962507455e-05, "loss": 2.2684, "step": 886 }, { "epoch": 0.024659712646234597, "grad_norm": 1.2256097793579102, "learning_rate": 1.9970005402805207e-05, "loss": 2.0016, "step": 887 }, { "epoch": 0.024687513900627196, "grad_norm": 1.1204482316970825, "learning_rate": 1.9969937767046096e-05, "loss": 2.2654, "step": 888 }, { "epoch": 0.024715315155019794, "grad_norm": 1.2038068771362305, "learning_rate": 1.996987005523064e-05, "loss": 2.006, "step": 889 }, { "epoch": 0.024743116409412392, "grad_norm": 1.289568305015564, "learning_rate": 1.9969802267359348e-05, "loss": 2.2139, "step": 890 }, { "epoch": 0.02477091766380499, "grad_norm": 1.2928622961044312, "learning_rate": 1.9969734403432746e-05, "loss": 2.1335, "step": 891 }, { "epoch": 0.024798718918197588, "grad_norm": 1.2417336702346802, "learning_rate": 1.9969666463451346e-05, "loss": 2.5345, "step": 892 }, { "epoch": 0.024826520172590186, "grad_norm": 1.2544372081756592, "learning_rate": 1.996959844741567e-05, "loss": 2.1018, "step": 893 }, { "epoch": 0.024854321426982785, "grad_norm": 1.1650993824005127, "learning_rate": 1.9969530355326233e-05, "loss": 2.1896, "step": 894 }, { "epoch": 0.024882122681375383, "grad_norm": 1.211014747619629, "learning_rate": 1.996946218718356e-05, "loss": 2.0694, "step": 895 }, { "epoch": 0.02490992393576798, "grad_norm": 1.3171793222427368, "learning_rate": 1.9969393942988166e-05, "loss": 2.1444, "step": 896 }, { "epoch": 0.02493772519016058, "grad_norm": 1.2558826208114624, "learning_rate": 1.996932562274057e-05, "loss": 2.3299, "step": 897 }, { "epoch": 0.024965526444553177, "grad_norm": 1.2251890897750854, "learning_rate": 1.99692572264413e-05, "loss": 2.4285, "step": 898 }, { "epoch": 0.024993327698945775, "grad_norm": 1.223473310470581, "learning_rate": 1.9969188754090874e-05, "loss": 2.1837, "step": 899 }, { "epoch": 0.025021128953338374, "grad_norm": 1.1845606565475464, "learning_rate": 1.9969120205689815e-05, "loss": 2.0004, "step": 900 }, { "epoch": 0.025048930207730972, "grad_norm": 1.1684184074401855, "learning_rate": 1.9969051581238644e-05, "loss": 2.0444, "step": 901 }, { "epoch": 0.02507673146212357, "grad_norm": 1.2012953758239746, "learning_rate": 1.9968982880737885e-05, "loss": 2.0893, "step": 902 }, { "epoch": 0.025104532716516168, "grad_norm": 1.1470839977264404, "learning_rate": 1.9968914104188064e-05, "loss": 2.2284, "step": 903 }, { "epoch": 0.025132333970908766, "grad_norm": 1.1849442720413208, "learning_rate": 1.9968845251589704e-05, "loss": 2.1385, "step": 904 }, { "epoch": 0.025160135225301365, "grad_norm": 1.150512933731079, "learning_rate": 1.996877632294333e-05, "loss": 2.113, "step": 905 }, { "epoch": 0.025187936479693963, "grad_norm": 1.270215630531311, "learning_rate": 1.9968707318249472e-05, "loss": 1.9331, "step": 906 }, { "epoch": 0.02521573773408656, "grad_norm": 1.2315492630004883, "learning_rate": 1.996863823750865e-05, "loss": 2.4386, "step": 907 }, { "epoch": 0.02524353898847916, "grad_norm": 1.1641958951950073, "learning_rate": 1.9968569080721395e-05, "loss": 1.8773, "step": 908 }, { "epoch": 0.025271340242871757, "grad_norm": 1.1815314292907715, "learning_rate": 1.9968499847888238e-05, "loss": 2.141, "step": 909 }, { "epoch": 0.025299141497264355, "grad_norm": 1.1533674001693726, "learning_rate": 1.9968430539009696e-05, "loss": 2.1526, "step": 910 }, { "epoch": 0.025326942751656954, "grad_norm": 1.2788751125335693, "learning_rate": 1.9968361154086308e-05, "loss": 1.9305, "step": 911 }, { "epoch": 0.02535474400604955, "grad_norm": 1.1481788158416748, "learning_rate": 1.99682916931186e-05, "loss": 2.1194, "step": 912 }, { "epoch": 0.02538254526044215, "grad_norm": 1.088585376739502, "learning_rate": 1.9968222156107098e-05, "loss": 2.2148, "step": 913 }, { "epoch": 0.025410346514834748, "grad_norm": 1.1896530389785767, "learning_rate": 1.996815254305234e-05, "loss": 2.2067, "step": 914 }, { "epoch": 0.025438147769227346, "grad_norm": 1.2413997650146484, "learning_rate": 1.9968082853954853e-05, "loss": 2.1593, "step": 915 }, { "epoch": 0.025465949023619944, "grad_norm": 1.1861114501953125, "learning_rate": 1.9968013088815167e-05, "loss": 2.1959, "step": 916 }, { "epoch": 0.025493750278012543, "grad_norm": 1.113680362701416, "learning_rate": 1.996794324763382e-05, "loss": 1.8211, "step": 917 }, { "epoch": 0.02552155153240514, "grad_norm": 1.2193152904510498, "learning_rate": 1.996787333041134e-05, "loss": 1.9778, "step": 918 }, { "epoch": 0.02554935278679774, "grad_norm": 1.1727924346923828, "learning_rate": 1.9967803337148258e-05, "loss": 2.0699, "step": 919 }, { "epoch": 0.025577154041190337, "grad_norm": 1.1627894639968872, "learning_rate": 1.9967733267845112e-05, "loss": 1.9798, "step": 920 }, { "epoch": 0.025604955295582935, "grad_norm": 1.1728607416152954, "learning_rate": 1.9967663122502435e-05, "loss": 2.0131, "step": 921 }, { "epoch": 0.025632756549975533, "grad_norm": 1.1787608861923218, "learning_rate": 1.9967592901120768e-05, "loss": 2.1375, "step": 922 }, { "epoch": 0.02566055780436813, "grad_norm": 1.252601146697998, "learning_rate": 1.9967522603700634e-05, "loss": 1.9697, "step": 923 }, { "epoch": 0.02568835905876073, "grad_norm": 1.1952341794967651, "learning_rate": 1.996745223024258e-05, "loss": 2.1945, "step": 924 }, { "epoch": 0.025716160313153328, "grad_norm": 1.2065502405166626, "learning_rate": 1.9967381780747143e-05, "loss": 1.995, "step": 925 }, { "epoch": 0.025743961567545926, "grad_norm": 1.2076259851455688, "learning_rate": 1.9967311255214853e-05, "loss": 2.2967, "step": 926 }, { "epoch": 0.025771762821938524, "grad_norm": 1.3130898475646973, "learning_rate": 1.9967240653646252e-05, "loss": 2.2247, "step": 927 }, { "epoch": 0.025799564076331123, "grad_norm": 1.262519359588623, "learning_rate": 1.9967169976041885e-05, "loss": 2.1642, "step": 928 }, { "epoch": 0.02582736533072372, "grad_norm": 1.1608154773712158, "learning_rate": 1.996709922240228e-05, "loss": 2.26, "step": 929 }, { "epoch": 0.02585516658511632, "grad_norm": 1.2211614847183228, "learning_rate": 1.9967028392727982e-05, "loss": 2.1833, "step": 930 }, { "epoch": 0.025882967839508917, "grad_norm": 1.1924750804901123, "learning_rate": 1.996695748701953e-05, "loss": 2.453, "step": 931 }, { "epoch": 0.025910769093901515, "grad_norm": 1.1862854957580566, "learning_rate": 1.9966886505277466e-05, "loss": 2.245, "step": 932 }, { "epoch": 0.025938570348294113, "grad_norm": 1.2379851341247559, "learning_rate": 1.996681544750233e-05, "loss": 2.3586, "step": 933 }, { "epoch": 0.02596637160268671, "grad_norm": 1.2045191526412964, "learning_rate": 1.9966744313694668e-05, "loss": 2.1428, "step": 934 }, { "epoch": 0.02599417285707931, "grad_norm": 1.2450040578842163, "learning_rate": 1.9966673103855017e-05, "loss": 2.0352, "step": 935 }, { "epoch": 0.026021974111471908, "grad_norm": 1.2457891702651978, "learning_rate": 1.9966601817983928e-05, "loss": 2.3093, "step": 936 }, { "epoch": 0.02604977536586451, "grad_norm": 1.309598445892334, "learning_rate": 1.9966530456081937e-05, "loss": 2.003, "step": 937 }, { "epoch": 0.026077576620257108, "grad_norm": 1.195908784866333, "learning_rate": 1.9966459018149592e-05, "loss": 2.2885, "step": 938 }, { "epoch": 0.026105377874649706, "grad_norm": 1.197320580482483, "learning_rate": 1.9966387504187438e-05, "loss": 2.1546, "step": 939 }, { "epoch": 0.026133179129042304, "grad_norm": 1.1319479942321777, "learning_rate": 1.9966315914196016e-05, "loss": 2.1724, "step": 940 }, { "epoch": 0.026160980383434902, "grad_norm": 1.2361865043640137, "learning_rate": 1.9966244248175882e-05, "loss": 2.4079, "step": 941 }, { "epoch": 0.0261887816378275, "grad_norm": 1.2233473062515259, "learning_rate": 1.9966172506127575e-05, "loss": 2.0898, "step": 942 }, { "epoch": 0.0262165828922201, "grad_norm": 1.2142386436462402, "learning_rate": 1.9966100688051645e-05, "loss": 2.0604, "step": 943 }, { "epoch": 0.026244384146612697, "grad_norm": 1.2224408388137817, "learning_rate": 1.9966028793948635e-05, "loss": 1.9997, "step": 944 }, { "epoch": 0.026272185401005295, "grad_norm": 1.3289246559143066, "learning_rate": 1.99659568238191e-05, "loss": 2.0368, "step": 945 }, { "epoch": 0.026299986655397893, "grad_norm": 1.3365048170089722, "learning_rate": 1.9965884777663586e-05, "loss": 2.4926, "step": 946 }, { "epoch": 0.02632778790979049, "grad_norm": 1.2010939121246338, "learning_rate": 1.9965812655482644e-05, "loss": 2.1108, "step": 947 }, { "epoch": 0.02635558916418309, "grad_norm": 1.270769715309143, "learning_rate": 1.9965740457276822e-05, "loss": 2.0306, "step": 948 }, { "epoch": 0.026383390418575688, "grad_norm": 1.2005252838134766, "learning_rate": 1.9965668183046673e-05, "loss": 2.2448, "step": 949 }, { "epoch": 0.026411191672968286, "grad_norm": 1.2237684726715088, "learning_rate": 1.996559583279275e-05, "loss": 2.0992, "step": 950 }, { "epoch": 0.026438992927360884, "grad_norm": 1.2444114685058594, "learning_rate": 1.9965523406515598e-05, "loss": 1.9986, "step": 951 }, { "epoch": 0.026466794181753482, "grad_norm": 1.210436463356018, "learning_rate": 1.9965450904215773e-05, "loss": 2.0745, "step": 952 }, { "epoch": 0.02649459543614608, "grad_norm": 1.15388822555542, "learning_rate": 1.9965378325893833e-05, "loss": 2.1821, "step": 953 }, { "epoch": 0.02652239669053868, "grad_norm": 1.2099543809890747, "learning_rate": 1.9965305671550323e-05, "loss": 2.2429, "step": 954 }, { "epoch": 0.026550197944931277, "grad_norm": 1.2937488555908203, "learning_rate": 1.9965232941185807e-05, "loss": 2.0954, "step": 955 }, { "epoch": 0.026577999199323875, "grad_norm": 1.2517366409301758, "learning_rate": 1.9965160134800832e-05, "loss": 2.1413, "step": 956 }, { "epoch": 0.026605800453716473, "grad_norm": 1.20096755027771, "learning_rate": 1.9965087252395954e-05, "loss": 1.9805, "step": 957 }, { "epoch": 0.02663360170810907, "grad_norm": 1.2287697792053223, "learning_rate": 1.9965014293971733e-05, "loss": 2.1718, "step": 958 }, { "epoch": 0.02666140296250167, "grad_norm": 1.3404163122177124, "learning_rate": 1.9964941259528725e-05, "loss": 2.056, "step": 959 }, { "epoch": 0.026689204216894268, "grad_norm": 1.39670729637146, "learning_rate": 1.9964868149067484e-05, "loss": 2.1225, "step": 960 }, { "epoch": 0.026717005471286866, "grad_norm": 1.2729413509368896, "learning_rate": 1.9964794962588573e-05, "loss": 2.0507, "step": 961 }, { "epoch": 0.026744806725679464, "grad_norm": 1.2338212728500366, "learning_rate": 1.996472170009254e-05, "loss": 2.1727, "step": 962 }, { "epoch": 0.026772607980072062, "grad_norm": 1.1539911031723022, "learning_rate": 1.9964648361579954e-05, "loss": 2.1867, "step": 963 }, { "epoch": 0.02680040923446466, "grad_norm": 1.3844999074935913, "learning_rate": 1.9964574947051374e-05, "loss": 2.3182, "step": 964 }, { "epoch": 0.02682821048885726, "grad_norm": 1.4040056467056274, "learning_rate": 1.9964501456507354e-05, "loss": 1.9272, "step": 965 }, { "epoch": 0.026856011743249857, "grad_norm": 1.3569767475128174, "learning_rate": 1.996442788994846e-05, "loss": 2.1046, "step": 966 }, { "epoch": 0.026883812997642455, "grad_norm": 1.1690312623977661, "learning_rate": 1.996435424737525e-05, "loss": 2.211, "step": 967 }, { "epoch": 0.026911614252035053, "grad_norm": 1.244255781173706, "learning_rate": 1.9964280528788286e-05, "loss": 2.0303, "step": 968 }, { "epoch": 0.02693941550642765, "grad_norm": 1.380043864250183, "learning_rate": 1.996420673418813e-05, "loss": 2.239, "step": 969 }, { "epoch": 0.02696721676082025, "grad_norm": 1.4245936870574951, "learning_rate": 1.996413286357535e-05, "loss": 2.3724, "step": 970 }, { "epoch": 0.026995018015212847, "grad_norm": 1.265580415725708, "learning_rate": 1.9964058916950502e-05, "loss": 2.0965, "step": 971 }, { "epoch": 0.027022819269605446, "grad_norm": 1.1776010990142822, "learning_rate": 1.9963984894314157e-05, "loss": 2.2965, "step": 972 }, { "epoch": 0.027050620523998044, "grad_norm": 1.408167839050293, "learning_rate": 1.996391079566687e-05, "loss": 1.88, "step": 973 }, { "epoch": 0.027078421778390642, "grad_norm": 1.1978322267532349, "learning_rate": 1.996383662100922e-05, "loss": 2.1543, "step": 974 }, { "epoch": 0.02710622303278324, "grad_norm": 1.5202429294586182, "learning_rate": 1.9963762370341764e-05, "loss": 2.0315, "step": 975 }, { "epoch": 0.02713402428717584, "grad_norm": 1.3757271766662598, "learning_rate": 1.9963688043665067e-05, "loss": 2.143, "step": 976 }, { "epoch": 0.027161825541568436, "grad_norm": 1.272028923034668, "learning_rate": 1.9963613640979703e-05, "loss": 2.1482, "step": 977 }, { "epoch": 0.027189626795961035, "grad_norm": 1.36763596534729, "learning_rate": 1.9963539162286234e-05, "loss": 2.164, "step": 978 }, { "epoch": 0.027217428050353633, "grad_norm": 1.3678679466247559, "learning_rate": 1.996346460758523e-05, "loss": 2.1165, "step": 979 }, { "epoch": 0.02724522930474623, "grad_norm": 1.333658218383789, "learning_rate": 1.996338997687726e-05, "loss": 2.1342, "step": 980 }, { "epoch": 0.02727303055913883, "grad_norm": 1.132538914680481, "learning_rate": 1.996331527016289e-05, "loss": 2.1657, "step": 981 }, { "epoch": 0.027300831813531427, "grad_norm": 3.729539632797241, "learning_rate": 1.9963240487442696e-05, "loss": 2.2474, "step": 982 }, { "epoch": 0.027328633067924026, "grad_norm": 1.1967582702636719, "learning_rate": 1.9963165628717242e-05, "loss": 2.2778, "step": 983 }, { "epoch": 0.027356434322316624, "grad_norm": 1.242608666419983, "learning_rate": 1.9963090693987105e-05, "loss": 2.3494, "step": 984 }, { "epoch": 0.027384235576709222, "grad_norm": 1.2497773170471191, "learning_rate": 1.996301568325285e-05, "loss": 2.2541, "step": 985 }, { "epoch": 0.02741203683110182, "grad_norm": 1.218598484992981, "learning_rate": 1.996294059651506e-05, "loss": 2.353, "step": 986 }, { "epoch": 0.027439838085494418, "grad_norm": 1.2289186716079712, "learning_rate": 1.9962865433774296e-05, "loss": 2.0852, "step": 987 }, { "epoch": 0.027467639339887016, "grad_norm": 1.1605364084243774, "learning_rate": 1.9962790195031136e-05, "loss": 2.0297, "step": 988 }, { "epoch": 0.027495440594279615, "grad_norm": 1.1647799015045166, "learning_rate": 1.9962714880286156e-05, "loss": 2.0389, "step": 989 }, { "epoch": 0.027523241848672213, "grad_norm": 1.2514933347702026, "learning_rate": 1.9962639489539928e-05, "loss": 2.1524, "step": 990 }, { "epoch": 0.02755104310306481, "grad_norm": 1.1304253339767456, "learning_rate": 1.996256402279303e-05, "loss": 2.4878, "step": 991 }, { "epoch": 0.02757884435745741, "grad_norm": 1.1576107740402222, "learning_rate": 1.996248848004603e-05, "loss": 2.1372, "step": 992 }, { "epoch": 0.027606645611850007, "grad_norm": 1.2313652038574219, "learning_rate": 1.996241286129952e-05, "loss": 2.1677, "step": 993 }, { "epoch": 0.027634446866242605, "grad_norm": 1.1331932544708252, "learning_rate": 1.996233716655406e-05, "loss": 2.1478, "step": 994 }, { "epoch": 0.027662248120635204, "grad_norm": 1.2116929292678833, "learning_rate": 1.9962261395810233e-05, "loss": 1.9809, "step": 995 }, { "epoch": 0.027690049375027802, "grad_norm": 1.1891601085662842, "learning_rate": 1.996218554906862e-05, "loss": 2.0752, "step": 996 }, { "epoch": 0.0277178506294204, "grad_norm": 1.1840407848358154, "learning_rate": 1.9962109626329796e-05, "loss": 2.2663, "step": 997 }, { "epoch": 0.027745651883812998, "grad_norm": 1.1401996612548828, "learning_rate": 1.9962033627594348e-05, "loss": 1.8309, "step": 998 }, { "epoch": 0.027773453138205596, "grad_norm": 1.19683039188385, "learning_rate": 1.9961957552862845e-05, "loss": 2.225, "step": 999 }, { "epoch": 0.027801254392598194, "grad_norm": 1.2513744831085205, "learning_rate": 1.9961881402135872e-05, "loss": 2.2907, "step": 1000 }, { "epoch": 0.027829055646990793, "grad_norm": 1.2874199151992798, "learning_rate": 1.9961805175414014e-05, "loss": 2.1521, "step": 1001 }, { "epoch": 0.02785685690138339, "grad_norm": 1.2811918258666992, "learning_rate": 1.9961728872697845e-05, "loss": 2.4166, "step": 1002 }, { "epoch": 0.02788465815577599, "grad_norm": 1.1523001194000244, "learning_rate": 1.9961652493987952e-05, "loss": 2.0137, "step": 1003 }, { "epoch": 0.027912459410168587, "grad_norm": 1.2701681852340698, "learning_rate": 1.9961576039284917e-05, "loss": 2.3181, "step": 1004 }, { "epoch": 0.027940260664561185, "grad_norm": 1.2041475772857666, "learning_rate": 1.996149950858932e-05, "loss": 2.136, "step": 1005 }, { "epoch": 0.027968061918953784, "grad_norm": 1.2551193237304688, "learning_rate": 1.996142290190175e-05, "loss": 2.1385, "step": 1006 }, { "epoch": 0.02799586317334638, "grad_norm": 1.3142166137695312, "learning_rate": 1.9961346219222786e-05, "loss": 2.0986, "step": 1007 }, { "epoch": 0.02802366442773898, "grad_norm": 1.3179500102996826, "learning_rate": 1.996126946055302e-05, "loss": 1.9588, "step": 1008 }, { "epoch": 0.028051465682131578, "grad_norm": 1.2392182350158691, "learning_rate": 1.996119262589303e-05, "loss": 1.9821, "step": 1009 }, { "epoch": 0.028079266936524176, "grad_norm": 1.2514424324035645, "learning_rate": 1.9961115715243405e-05, "loss": 1.7303, "step": 1010 }, { "epoch": 0.028107068190916774, "grad_norm": 1.1712360382080078, "learning_rate": 1.9961038728604736e-05, "loss": 2.3482, "step": 1011 }, { "epoch": 0.028134869445309373, "grad_norm": 1.297306776046753, "learning_rate": 1.9960961665977602e-05, "loss": 2.2864, "step": 1012 }, { "epoch": 0.02816267069970197, "grad_norm": 1.2256876230239868, "learning_rate": 1.9960884527362595e-05, "loss": 1.9132, "step": 1013 }, { "epoch": 0.02819047195409457, "grad_norm": 1.1851500272750854, "learning_rate": 1.996080731276031e-05, "loss": 2.2018, "step": 1014 }, { "epoch": 0.028218273208487167, "grad_norm": 1.1611994504928589, "learning_rate": 1.9960730022171323e-05, "loss": 2.0485, "step": 1015 }, { "epoch": 0.028246074462879765, "grad_norm": 1.2068134546279907, "learning_rate": 1.9960652655596236e-05, "loss": 2.0731, "step": 1016 }, { "epoch": 0.028273875717272363, "grad_norm": 1.2434766292572021, "learning_rate": 1.9960575213035632e-05, "loss": 2.006, "step": 1017 }, { "epoch": 0.02830167697166496, "grad_norm": 1.1272668838500977, "learning_rate": 1.99604976944901e-05, "loss": 2.2904, "step": 1018 }, { "epoch": 0.02832947822605756, "grad_norm": 1.3019568920135498, "learning_rate": 1.996042009996024e-05, "loss": 1.9977, "step": 1019 }, { "epoch": 0.028357279480450158, "grad_norm": 1.2907110452651978, "learning_rate": 1.9960342429446637e-05, "loss": 2.637, "step": 1020 }, { "epoch": 0.028385080734842756, "grad_norm": 1.2333530187606812, "learning_rate": 1.9960264682949883e-05, "loss": 2.1714, "step": 1021 }, { "epoch": 0.028412881989235354, "grad_norm": 1.2873077392578125, "learning_rate": 1.9960186860470573e-05, "loss": 2.1424, "step": 1022 }, { "epoch": 0.028440683243627952, "grad_norm": 1.2203079462051392, "learning_rate": 1.9960108962009302e-05, "loss": 2.0282, "step": 1023 }, { "epoch": 0.02846848449802055, "grad_norm": 1.3269989490509033, "learning_rate": 1.9960030987566664e-05, "loss": 1.9208, "step": 1024 }, { "epoch": 0.02849628575241315, "grad_norm": 1.184553861618042, "learning_rate": 1.9959952937143254e-05, "loss": 2.0436, "step": 1025 }, { "epoch": 0.028524087006805747, "grad_norm": 1.1386909484863281, "learning_rate": 1.9959874810739666e-05, "loss": 2.0725, "step": 1026 }, { "epoch": 0.028551888261198345, "grad_norm": 1.1467082500457764, "learning_rate": 1.9959796608356495e-05, "loss": 2.0196, "step": 1027 }, { "epoch": 0.028579689515590943, "grad_norm": 1.3204232454299927, "learning_rate": 1.9959718329994343e-05, "loss": 2.4722, "step": 1028 }, { "epoch": 0.02860749076998354, "grad_norm": 1.3122261762619019, "learning_rate": 1.99596399756538e-05, "loss": 2.1236, "step": 1029 }, { "epoch": 0.02863529202437614, "grad_norm": 1.187410593032837, "learning_rate": 1.9959561545335466e-05, "loss": 2.0056, "step": 1030 }, { "epoch": 0.028663093278768738, "grad_norm": 1.2071913480758667, "learning_rate": 1.9959483039039943e-05, "loss": 1.9091, "step": 1031 }, { "epoch": 0.028690894533161336, "grad_norm": 1.19940984249115, "learning_rate": 1.9959404456767824e-05, "loss": 2.2006, "step": 1032 }, { "epoch": 0.028718695787553934, "grad_norm": 1.2214174270629883, "learning_rate": 1.9959325798519717e-05, "loss": 1.9134, "step": 1033 }, { "epoch": 0.028746497041946532, "grad_norm": 1.1674891710281372, "learning_rate": 1.9959247064296213e-05, "loss": 2.2128, "step": 1034 }, { "epoch": 0.02877429829633913, "grad_norm": 1.1695327758789062, "learning_rate": 1.9959168254097917e-05, "loss": 2.3981, "step": 1035 }, { "epoch": 0.02880209955073173, "grad_norm": 1.8407140970230103, "learning_rate": 1.9959089367925427e-05, "loss": 2.4234, "step": 1036 }, { "epoch": 0.028829900805124327, "grad_norm": 1.2921278476715088, "learning_rate": 1.995901040577935e-05, "loss": 2.442, "step": 1037 }, { "epoch": 0.028857702059516925, "grad_norm": 1.1945892572402954, "learning_rate": 1.9958931367660283e-05, "loss": 2.1827, "step": 1038 }, { "epoch": 0.028885503313909523, "grad_norm": 1.1993122100830078, "learning_rate": 1.9958852253568835e-05, "loss": 2.0781, "step": 1039 }, { "epoch": 0.02891330456830212, "grad_norm": 1.1627475023269653, "learning_rate": 1.9958773063505603e-05, "loss": 2.1021, "step": 1040 }, { "epoch": 0.02894110582269472, "grad_norm": 1.1768262386322021, "learning_rate": 1.9958693797471196e-05, "loss": 1.9968, "step": 1041 }, { "epoch": 0.028968907077087318, "grad_norm": 1.2578184604644775, "learning_rate": 1.9958614455466218e-05, "loss": 2.174, "step": 1042 }, { "epoch": 0.028996708331479916, "grad_norm": 1.1763030290603638, "learning_rate": 1.9958535037491272e-05, "loss": 1.9156, "step": 1043 }, { "epoch": 0.029024509585872514, "grad_norm": 1.1976784467697144, "learning_rate": 1.9958455543546967e-05, "loss": 2.298, "step": 1044 }, { "epoch": 0.029052310840265112, "grad_norm": 1.1906042098999023, "learning_rate": 1.9958375973633904e-05, "loss": 1.9622, "step": 1045 }, { "epoch": 0.02908011209465771, "grad_norm": 1.137036919593811, "learning_rate": 1.9958296327752695e-05, "loss": 2.14, "step": 1046 }, { "epoch": 0.02910791334905031, "grad_norm": 1.1943312883377075, "learning_rate": 1.995821660590395e-05, "loss": 1.9127, "step": 1047 }, { "epoch": 0.029135714603442907, "grad_norm": 1.1805248260498047, "learning_rate": 1.9958136808088266e-05, "loss": 2.0205, "step": 1048 }, { "epoch": 0.029163515857835505, "grad_norm": 1.2263551950454712, "learning_rate": 1.9958056934306263e-05, "loss": 2.0036, "step": 1049 }, { "epoch": 0.029191317112228103, "grad_norm": 1.2468680143356323, "learning_rate": 1.9957976984558547e-05, "loss": 2.2292, "step": 1050 }, { "epoch": 0.0292191183666207, "grad_norm": 1.1622015237808228, "learning_rate": 1.9957896958845724e-05, "loss": 1.9233, "step": 1051 }, { "epoch": 0.0292469196210133, "grad_norm": 1.1971123218536377, "learning_rate": 1.995781685716841e-05, "loss": 1.908, "step": 1052 }, { "epoch": 0.029274720875405898, "grad_norm": 1.1625900268554688, "learning_rate": 1.9957736679527214e-05, "loss": 2.1119, "step": 1053 }, { "epoch": 0.029302522129798496, "grad_norm": 1.2680864334106445, "learning_rate": 1.9957656425922745e-05, "loss": 2.1662, "step": 1054 }, { "epoch": 0.029330323384191094, "grad_norm": 1.2665683031082153, "learning_rate": 1.9957576096355623e-05, "loss": 2.2731, "step": 1055 }, { "epoch": 0.029358124638583692, "grad_norm": 1.1756962537765503, "learning_rate": 1.995749569082645e-05, "loss": 1.8945, "step": 1056 }, { "epoch": 0.02938592589297629, "grad_norm": 1.231147050857544, "learning_rate": 1.9957415209335845e-05, "loss": 2.2691, "step": 1057 }, { "epoch": 0.02941372714736889, "grad_norm": 1.1365751028060913, "learning_rate": 1.9957334651884426e-05, "loss": 2.2011, "step": 1058 }, { "epoch": 0.029441528401761487, "grad_norm": 1.3910654783248901, "learning_rate": 1.99572540184728e-05, "loss": 1.8944, "step": 1059 }, { "epoch": 0.029469329656154085, "grad_norm": 1.1403043270111084, "learning_rate": 1.9957173309101586e-05, "loss": 1.9612, "step": 1060 }, { "epoch": 0.029497130910546683, "grad_norm": 1.2612004280090332, "learning_rate": 1.99570925237714e-05, "loss": 1.9225, "step": 1061 }, { "epoch": 0.02952493216493928, "grad_norm": 1.324126958847046, "learning_rate": 1.9957011662482854e-05, "loss": 2.2878, "step": 1062 }, { "epoch": 0.02955273341933188, "grad_norm": 1.2408560514450073, "learning_rate": 1.9956930725236573e-05, "loss": 2.0419, "step": 1063 }, { "epoch": 0.029580534673724478, "grad_norm": 1.1242367029190063, "learning_rate": 1.9956849712033167e-05, "loss": 2.166, "step": 1064 }, { "epoch": 0.029608335928117076, "grad_norm": 1.2456222772598267, "learning_rate": 1.9956768622873258e-05, "loss": 2.1787, "step": 1065 }, { "epoch": 0.029636137182509674, "grad_norm": 1.216109037399292, "learning_rate": 1.995668745775746e-05, "loss": 2.1263, "step": 1066 }, { "epoch": 0.029663938436902272, "grad_norm": 1.3324060440063477, "learning_rate": 1.9956606216686398e-05, "loss": 2.0309, "step": 1067 }, { "epoch": 0.02969173969129487, "grad_norm": 1.1500639915466309, "learning_rate": 1.995652489966069e-05, "loss": 2.0669, "step": 1068 }, { "epoch": 0.02971954094568747, "grad_norm": 1.14754056930542, "learning_rate": 1.9956443506680955e-05, "loss": 2.1229, "step": 1069 }, { "epoch": 0.029747342200080067, "grad_norm": 1.2698420286178589, "learning_rate": 1.9956362037747814e-05, "loss": 2.2509, "step": 1070 }, { "epoch": 0.029775143454472665, "grad_norm": 1.3290386199951172, "learning_rate": 1.995628049286189e-05, "loss": 2.0892, "step": 1071 }, { "epoch": 0.029802944708865263, "grad_norm": 1.2239036560058594, "learning_rate": 1.9956198872023802e-05, "loss": 1.9137, "step": 1072 }, { "epoch": 0.02983074596325786, "grad_norm": 1.0657373666763306, "learning_rate": 1.9956117175234176e-05, "loss": 2.1124, "step": 1073 }, { "epoch": 0.02985854721765046, "grad_norm": 1.1941990852355957, "learning_rate": 1.9956035402493633e-05, "loss": 2.0913, "step": 1074 }, { "epoch": 0.029886348472043058, "grad_norm": 1.1982321739196777, "learning_rate": 1.9955953553802797e-05, "loss": 2.2047, "step": 1075 }, { "epoch": 0.029914149726435656, "grad_norm": 1.1913443803787231, "learning_rate": 1.9955871629162294e-05, "loss": 2.0933, "step": 1076 }, { "epoch": 0.029941950980828254, "grad_norm": 1.229921817779541, "learning_rate": 1.995578962857275e-05, "loss": 2.1698, "step": 1077 }, { "epoch": 0.029969752235220852, "grad_norm": 1.3010627031326294, "learning_rate": 1.995570755203479e-05, "loss": 2.2013, "step": 1078 }, { "epoch": 0.02999755348961345, "grad_norm": 1.3838484287261963, "learning_rate": 1.9955625399549034e-05, "loss": 2.0114, "step": 1079 }, { "epoch": 0.03002535474400605, "grad_norm": 1.1230838298797607, "learning_rate": 1.9955543171116115e-05, "loss": 1.9423, "step": 1080 }, { "epoch": 0.030053155998398647, "grad_norm": 1.1773936748504639, "learning_rate": 1.9955460866736658e-05, "loss": 1.9109, "step": 1081 }, { "epoch": 0.030080957252791245, "grad_norm": 1.1244663000106812, "learning_rate": 1.995537848641129e-05, "loss": 2.0523, "step": 1082 }, { "epoch": 0.030108758507183843, "grad_norm": 1.2206745147705078, "learning_rate": 1.9955296030140645e-05, "loss": 2.1959, "step": 1083 }, { "epoch": 0.03013655976157644, "grad_norm": 1.2758944034576416, "learning_rate": 1.9955213497925347e-05, "loss": 2.1895, "step": 1084 }, { "epoch": 0.03016436101596904, "grad_norm": 1.1278725862503052, "learning_rate": 1.9955130889766027e-05, "loss": 2.1269, "step": 1085 }, { "epoch": 0.030192162270361637, "grad_norm": 1.244704008102417, "learning_rate": 1.9955048205663316e-05, "loss": 2.0834, "step": 1086 }, { "epoch": 0.030219963524754236, "grad_norm": 1.1638721227645874, "learning_rate": 1.995496544561784e-05, "loss": 2.1059, "step": 1087 }, { "epoch": 0.030247764779146834, "grad_norm": 1.2541478872299194, "learning_rate": 1.9954882609630234e-05, "loss": 1.9934, "step": 1088 }, { "epoch": 0.030275566033539432, "grad_norm": 1.2810208797454834, "learning_rate": 1.995479969770113e-05, "loss": 2.0637, "step": 1089 }, { "epoch": 0.03030336728793203, "grad_norm": 1.2203056812286377, "learning_rate": 1.9954716709831165e-05, "loss": 2.3893, "step": 1090 }, { "epoch": 0.03033116854232463, "grad_norm": 1.3014075756072998, "learning_rate": 1.9954633646020964e-05, "loss": 2.0799, "step": 1091 }, { "epoch": 0.030358969796717226, "grad_norm": 1.1742225885391235, "learning_rate": 1.9954550506271163e-05, "loss": 2.1218, "step": 1092 }, { "epoch": 0.030386771051109825, "grad_norm": 1.2535860538482666, "learning_rate": 1.9954467290582398e-05, "loss": 2.2108, "step": 1093 }, { "epoch": 0.030414572305502423, "grad_norm": 1.243058681488037, "learning_rate": 1.9954383998955303e-05, "loss": 2.2864, "step": 1094 }, { "epoch": 0.03044237355989502, "grad_norm": 1.186172604560852, "learning_rate": 1.9954300631390515e-05, "loss": 1.802, "step": 1095 }, { "epoch": 0.03047017481428762, "grad_norm": 1.134049415588379, "learning_rate": 1.9954217187888668e-05, "loss": 2.3547, "step": 1096 }, { "epoch": 0.030497976068680217, "grad_norm": 1.2576265335083008, "learning_rate": 1.99541336684504e-05, "loss": 2.06, "step": 1097 }, { "epoch": 0.030525777323072815, "grad_norm": 1.247373104095459, "learning_rate": 1.9954050073076347e-05, "loss": 2.1219, "step": 1098 }, { "epoch": 0.030553578577465414, "grad_norm": 1.1981040239334106, "learning_rate": 1.9953966401767144e-05, "loss": 2.3722, "step": 1099 }, { "epoch": 0.030581379831858012, "grad_norm": 1.2057154178619385, "learning_rate": 1.9953882654523432e-05, "loss": 2.3227, "step": 1100 }, { "epoch": 0.03060918108625061, "grad_norm": 1.3580480813980103, "learning_rate": 1.9953798831345853e-05, "loss": 2.1041, "step": 1101 }, { "epoch": 0.030636982340643208, "grad_norm": 1.3682360649108887, "learning_rate": 1.9953714932235043e-05, "loss": 2.0802, "step": 1102 }, { "epoch": 0.030664783595035806, "grad_norm": 1.2071720361709595, "learning_rate": 1.9953630957191642e-05, "loss": 2.1642, "step": 1103 }, { "epoch": 0.030692584849428405, "grad_norm": 1.2239891290664673, "learning_rate": 1.9953546906216292e-05, "loss": 2.3356, "step": 1104 }, { "epoch": 0.030720386103821003, "grad_norm": 1.3360637426376343, "learning_rate": 1.9953462779309635e-05, "loss": 2.0391, "step": 1105 }, { "epoch": 0.0307481873582136, "grad_norm": 1.2143646478652954, "learning_rate": 1.995337857647231e-05, "loss": 2.0625, "step": 1106 }, { "epoch": 0.0307759886126062, "grad_norm": 1.3532516956329346, "learning_rate": 1.9953294297704957e-05, "loss": 2.3019, "step": 1107 }, { "epoch": 0.0308037898669988, "grad_norm": 1.2752262353897095, "learning_rate": 1.9953209943008223e-05, "loss": 1.9904, "step": 1108 }, { "epoch": 0.0308315911213914, "grad_norm": 1.2009601593017578, "learning_rate": 1.9953125512382755e-05, "loss": 2.2993, "step": 1109 }, { "epoch": 0.030859392375783997, "grad_norm": 1.3744828701019287, "learning_rate": 1.995304100582919e-05, "loss": 2.1834, "step": 1110 }, { "epoch": 0.030887193630176595, "grad_norm": 1.1483668088912964, "learning_rate": 1.9952956423348175e-05, "loss": 1.947, "step": 1111 }, { "epoch": 0.030914994884569193, "grad_norm": 1.2159770727157593, "learning_rate": 1.9952871764940358e-05, "loss": 2.0011, "step": 1112 }, { "epoch": 0.03094279613896179, "grad_norm": 1.2391875982284546, "learning_rate": 1.995278703060638e-05, "loss": 2.171, "step": 1113 }, { "epoch": 0.03097059739335439, "grad_norm": 1.1962138414382935, "learning_rate": 1.9952702220346895e-05, "loss": 2.2387, "step": 1114 }, { "epoch": 0.030998398647746988, "grad_norm": 1.2280415296554565, "learning_rate": 1.9952617334162544e-05, "loss": 2.2794, "step": 1115 }, { "epoch": 0.031026199902139586, "grad_norm": 1.204247236251831, "learning_rate": 1.995253237205397e-05, "loss": 2.1823, "step": 1116 }, { "epoch": 0.031054001156532184, "grad_norm": 1.2137607336044312, "learning_rate": 1.9952447334021833e-05, "loss": 2.094, "step": 1117 }, { "epoch": 0.031081802410924782, "grad_norm": 1.1906448602676392, "learning_rate": 1.995236222006677e-05, "loss": 2.1256, "step": 1118 }, { "epoch": 0.03110960366531738, "grad_norm": 1.2307296991348267, "learning_rate": 1.995227703018944e-05, "loss": 2.1785, "step": 1119 }, { "epoch": 0.03113740491970998, "grad_norm": 1.2234601974487305, "learning_rate": 1.995219176439049e-05, "loss": 1.929, "step": 1120 }, { "epoch": 0.031165206174102577, "grad_norm": 1.2915749549865723, "learning_rate": 1.9952106422670564e-05, "loss": 2.232, "step": 1121 }, { "epoch": 0.031193007428495175, "grad_norm": 1.1794281005859375, "learning_rate": 1.995202100503032e-05, "loss": 2.0648, "step": 1122 }, { "epoch": 0.031220808682887773, "grad_norm": 1.2773468494415283, "learning_rate": 1.995193551147041e-05, "loss": 2.1231, "step": 1123 }, { "epoch": 0.03124860993728037, "grad_norm": 1.207288146018982, "learning_rate": 1.9951849941991485e-05, "loss": 2.0407, "step": 1124 }, { "epoch": 0.031276411191672966, "grad_norm": 1.245851993560791, "learning_rate": 1.9951764296594193e-05, "loss": 2.0487, "step": 1125 }, { "epoch": 0.031304212446065564, "grad_norm": 1.2381623983383179, "learning_rate": 1.9951678575279195e-05, "loss": 2.1621, "step": 1126 }, { "epoch": 0.03133201370045816, "grad_norm": 1.095927357673645, "learning_rate": 1.9951592778047137e-05, "loss": 2.0513, "step": 1127 }, { "epoch": 0.03135981495485076, "grad_norm": 1.219295620918274, "learning_rate": 1.9951506904898677e-05, "loss": 1.6493, "step": 1128 }, { "epoch": 0.03138761620924336, "grad_norm": 1.1498454809188843, "learning_rate": 1.9951420955834477e-05, "loss": 2.055, "step": 1129 }, { "epoch": 0.03141541746363596, "grad_norm": 1.1925979852676392, "learning_rate": 1.9951334930855178e-05, "loss": 1.9847, "step": 1130 }, { "epoch": 0.031443218718028555, "grad_norm": 1.166666030883789, "learning_rate": 1.9951248829961453e-05, "loss": 1.9483, "step": 1131 }, { "epoch": 0.03147101997242115, "grad_norm": 1.2226347923278809, "learning_rate": 1.9951162653153945e-05, "loss": 2.0816, "step": 1132 }, { "epoch": 0.03149882122681375, "grad_norm": 1.1555098295211792, "learning_rate": 1.995107640043332e-05, "loss": 2.3051, "step": 1133 }, { "epoch": 0.03152662248120635, "grad_norm": 1.1961783170700073, "learning_rate": 1.9950990071800227e-05, "loss": 2.2332, "step": 1134 }, { "epoch": 0.03155442373559895, "grad_norm": 1.2447867393493652, "learning_rate": 1.9950903667255334e-05, "loss": 2.1319, "step": 1135 }, { "epoch": 0.031582224989991546, "grad_norm": 1.1414600610733032, "learning_rate": 1.99508171867993e-05, "loss": 2.041, "step": 1136 }, { "epoch": 0.031610026244384144, "grad_norm": 1.2904462814331055, "learning_rate": 1.9950730630432776e-05, "loss": 2.0305, "step": 1137 }, { "epoch": 0.03163782749877674, "grad_norm": 1.2137258052825928, "learning_rate": 1.9950643998156428e-05, "loss": 1.9885, "step": 1138 }, { "epoch": 0.03166562875316934, "grad_norm": 1.2385977506637573, "learning_rate": 1.995055728997092e-05, "loss": 1.9072, "step": 1139 }, { "epoch": 0.03169343000756194, "grad_norm": 1.209688425064087, "learning_rate": 1.9950470505876905e-05, "loss": 2.0882, "step": 1140 }, { "epoch": 0.03172123126195454, "grad_norm": 1.216902732849121, "learning_rate": 1.9950383645875055e-05, "loss": 1.9768, "step": 1141 }, { "epoch": 0.031749032516347135, "grad_norm": 1.2072303295135498, "learning_rate": 1.9950296709966025e-05, "loss": 1.9941, "step": 1142 }, { "epoch": 0.03177683377073973, "grad_norm": 1.2400919198989868, "learning_rate": 1.995020969815048e-05, "loss": 2.1502, "step": 1143 }, { "epoch": 0.03180463502513233, "grad_norm": 1.2420872449874878, "learning_rate": 1.9950122610429083e-05, "loss": 2.0663, "step": 1144 }, { "epoch": 0.03183243627952493, "grad_norm": 1.231480598449707, "learning_rate": 1.9950035446802503e-05, "loss": 1.8362, "step": 1145 }, { "epoch": 0.03186023753391753, "grad_norm": 1.2560821771621704, "learning_rate": 1.99499482072714e-05, "loss": 2.2129, "step": 1146 }, { "epoch": 0.031888038788310126, "grad_norm": 1.1720819473266602, "learning_rate": 1.9949860891836444e-05, "loss": 1.7033, "step": 1147 }, { "epoch": 0.031915840042702724, "grad_norm": 1.2021994590759277, "learning_rate": 1.9949773500498297e-05, "loss": 2.0746, "step": 1148 }, { "epoch": 0.03194364129709532, "grad_norm": 1.2418798208236694, "learning_rate": 1.9949686033257625e-05, "loss": 2.3414, "step": 1149 }, { "epoch": 0.03197144255148792, "grad_norm": 1.288629174232483, "learning_rate": 1.9949598490115097e-05, "loss": 1.9546, "step": 1150 }, { "epoch": 0.03199924380588052, "grad_norm": 1.2021105289459229, "learning_rate": 1.9949510871071383e-05, "loss": 2.0022, "step": 1151 }, { "epoch": 0.03202704506027312, "grad_norm": 1.2205355167388916, "learning_rate": 1.994942317612715e-05, "loss": 1.77, "step": 1152 }, { "epoch": 0.032054846314665715, "grad_norm": 1.2284846305847168, "learning_rate": 1.9949335405283068e-05, "loss": 1.9673, "step": 1153 }, { "epoch": 0.03208264756905831, "grad_norm": 1.2520922422409058, "learning_rate": 1.99492475585398e-05, "loss": 1.9999, "step": 1154 }, { "epoch": 0.03211044882345091, "grad_norm": 1.373508095741272, "learning_rate": 1.9949159635898025e-05, "loss": 2.0016, "step": 1155 }, { "epoch": 0.03213825007784351, "grad_norm": 1.2249436378479004, "learning_rate": 1.994907163735841e-05, "loss": 1.983, "step": 1156 }, { "epoch": 0.03216605133223611, "grad_norm": 1.1260356903076172, "learning_rate": 1.9948983562921625e-05, "loss": 1.915, "step": 1157 }, { "epoch": 0.032193852586628706, "grad_norm": 1.3283015489578247, "learning_rate": 1.9948895412588344e-05, "loss": 1.9554, "step": 1158 }, { "epoch": 0.032221653841021304, "grad_norm": 1.3630905151367188, "learning_rate": 1.994880718635924e-05, "loss": 1.7028, "step": 1159 }, { "epoch": 0.0322494550954139, "grad_norm": 1.2104308605194092, "learning_rate": 1.994871888423498e-05, "loss": 2.2851, "step": 1160 }, { "epoch": 0.0322772563498065, "grad_norm": 1.169094443321228, "learning_rate": 1.9948630506216244e-05, "loss": 1.8951, "step": 1161 }, { "epoch": 0.0323050576041991, "grad_norm": 1.2502542734146118, "learning_rate": 1.9948542052303707e-05, "loss": 1.9785, "step": 1162 }, { "epoch": 0.0323328588585917, "grad_norm": 1.2500983476638794, "learning_rate": 1.994845352249804e-05, "loss": 1.9055, "step": 1163 }, { "epoch": 0.032360660112984295, "grad_norm": 1.2414236068725586, "learning_rate": 1.994836491679992e-05, "loss": 2.0378, "step": 1164 }, { "epoch": 0.03238846136737689, "grad_norm": 1.2330596446990967, "learning_rate": 1.994827623521002e-05, "loss": 2.3401, "step": 1165 }, { "epoch": 0.03241626262176949, "grad_norm": 1.2064566612243652, "learning_rate": 1.9948187477729023e-05, "loss": 2.1369, "step": 1166 }, { "epoch": 0.03244406387616209, "grad_norm": 1.1876155138015747, "learning_rate": 1.9948098644357598e-05, "loss": 2.2563, "step": 1167 }, { "epoch": 0.03247186513055469, "grad_norm": 1.197113037109375, "learning_rate": 1.994800973509643e-05, "loss": 2.2184, "step": 1168 }, { "epoch": 0.032499666384947286, "grad_norm": 1.1548502445220947, "learning_rate": 1.994792074994619e-05, "loss": 2.1214, "step": 1169 }, { "epoch": 0.032527467639339884, "grad_norm": 1.2515721321105957, "learning_rate": 1.9947831688907567e-05, "loss": 1.9023, "step": 1170 }, { "epoch": 0.03255526889373248, "grad_norm": 1.1699787378311157, "learning_rate": 1.9947742551981228e-05, "loss": 2.3737, "step": 1171 }, { "epoch": 0.03258307014812508, "grad_norm": 1.241152048110962, "learning_rate": 1.9947653339167862e-05, "loss": 1.8987, "step": 1172 }, { "epoch": 0.03261087140251768, "grad_norm": 1.1819188594818115, "learning_rate": 1.9947564050468148e-05, "loss": 1.9338, "step": 1173 }, { "epoch": 0.03263867265691028, "grad_norm": 1.1759090423583984, "learning_rate": 1.9947474685882765e-05, "loss": 1.9941, "step": 1174 }, { "epoch": 0.032666473911302875, "grad_norm": 1.1870983839035034, "learning_rate": 1.9947385245412398e-05, "loss": 2.0557, "step": 1175 }, { "epoch": 0.03269427516569547, "grad_norm": 1.2893280982971191, "learning_rate": 1.994729572905772e-05, "loss": 2.3451, "step": 1176 }, { "epoch": 0.03272207642008807, "grad_norm": 1.1660308837890625, "learning_rate": 1.9947206136819427e-05, "loss": 2.3187, "step": 1177 }, { "epoch": 0.03274987767448067, "grad_norm": 1.184065580368042, "learning_rate": 1.9947116468698195e-05, "loss": 2.3414, "step": 1178 }, { "epoch": 0.03277767892887327, "grad_norm": 1.1744658946990967, "learning_rate": 1.994702672469471e-05, "loss": 1.9344, "step": 1179 }, { "epoch": 0.032805480183265866, "grad_norm": 1.2392834424972534, "learning_rate": 1.9946936904809656e-05, "loss": 1.9973, "step": 1180 }, { "epoch": 0.032833281437658464, "grad_norm": 1.1947425603866577, "learning_rate": 1.994684700904372e-05, "loss": 2.1257, "step": 1181 }, { "epoch": 0.03286108269205106, "grad_norm": 1.278005599975586, "learning_rate": 1.994675703739758e-05, "loss": 2.1848, "step": 1182 }, { "epoch": 0.03288888394644366, "grad_norm": 1.2695791721343994, "learning_rate": 1.9946666989871932e-05, "loss": 2.1256, "step": 1183 }, { "epoch": 0.03291668520083626, "grad_norm": 1.2813076972961426, "learning_rate": 1.994657686646746e-05, "loss": 2.1609, "step": 1184 }, { "epoch": 0.03294448645522886, "grad_norm": 1.1698096990585327, "learning_rate": 1.994648666718485e-05, "loss": 2.19, "step": 1185 }, { "epoch": 0.032972287709621455, "grad_norm": 1.3370317220687866, "learning_rate": 1.994639639202479e-05, "loss": 2.4573, "step": 1186 }, { "epoch": 0.03300008896401405, "grad_norm": 1.2400381565093994, "learning_rate": 1.994630604098797e-05, "loss": 1.9748, "step": 1187 }, { "epoch": 0.03302789021840665, "grad_norm": 1.2346843481063843, "learning_rate": 1.9946215614075078e-05, "loss": 1.7313, "step": 1188 }, { "epoch": 0.03305569147279925, "grad_norm": 1.229712963104248, "learning_rate": 1.9946125111286808e-05, "loss": 2.1962, "step": 1189 }, { "epoch": 0.03308349272719185, "grad_norm": 1.167667269706726, "learning_rate": 1.9946034532623843e-05, "loss": 2.2228, "step": 1190 }, { "epoch": 0.033111293981584446, "grad_norm": 1.3459688425064087, "learning_rate": 1.994594387808688e-05, "loss": 1.9967, "step": 1191 }, { "epoch": 0.033139095235977044, "grad_norm": 1.1765429973602295, "learning_rate": 1.994585314767661e-05, "loss": 2.1523, "step": 1192 }, { "epoch": 0.03316689649036965, "grad_norm": 1.2633529901504517, "learning_rate": 1.994576234139372e-05, "loss": 2.4357, "step": 1193 }, { "epoch": 0.03319469774476225, "grad_norm": 1.1808453798294067, "learning_rate": 1.9945671459238908e-05, "loss": 2.1974, "step": 1194 }, { "epoch": 0.033222498999154845, "grad_norm": 1.246795415878296, "learning_rate": 1.9945580501212867e-05, "loss": 2.0818, "step": 1195 }, { "epoch": 0.03325030025354744, "grad_norm": 1.1750390529632568, "learning_rate": 1.9945489467316287e-05, "loss": 2.3099, "step": 1196 }, { "epoch": 0.03327810150794004, "grad_norm": 1.2320919036865234, "learning_rate": 1.9945398357549868e-05, "loss": 2.2075, "step": 1197 }, { "epoch": 0.03330590276233264, "grad_norm": 1.1250507831573486, "learning_rate": 1.99453071719143e-05, "loss": 2.1369, "step": 1198 }, { "epoch": 0.03333370401672524, "grad_norm": 1.2306535243988037, "learning_rate": 1.9945215910410285e-05, "loss": 1.9981, "step": 1199 }, { "epoch": 0.033361505271117836, "grad_norm": 1.298494815826416, "learning_rate": 1.994512457303851e-05, "loss": 2.4643, "step": 1200 }, { "epoch": 0.033389306525510434, "grad_norm": 1.2741076946258545, "learning_rate": 1.9945033159799677e-05, "loss": 2.1157, "step": 1201 }, { "epoch": 0.03341710777990303, "grad_norm": 1.171218991279602, "learning_rate": 1.9944941670694486e-05, "loss": 1.994, "step": 1202 }, { "epoch": 0.03344490903429563, "grad_norm": 1.195442795753479, "learning_rate": 1.994485010572363e-05, "loss": 2.2476, "step": 1203 }, { "epoch": 0.03347271028868823, "grad_norm": 1.1938704252243042, "learning_rate": 1.994475846488781e-05, "loss": 2.0609, "step": 1204 }, { "epoch": 0.03350051154308083, "grad_norm": 1.2483707666397095, "learning_rate": 1.9944666748187725e-05, "loss": 2.407, "step": 1205 }, { "epoch": 0.033528312797473425, "grad_norm": 1.2502087354660034, "learning_rate": 1.9944574955624073e-05, "loss": 2.0348, "step": 1206 }, { "epoch": 0.03355611405186602, "grad_norm": 1.15809965133667, "learning_rate": 1.9944483087197555e-05, "loss": 1.9172, "step": 1207 }, { "epoch": 0.03358391530625862, "grad_norm": 1.1779600381851196, "learning_rate": 1.9944391142908875e-05, "loss": 2.188, "step": 1208 }, { "epoch": 0.03361171656065122, "grad_norm": 1.1537083387374878, "learning_rate": 1.994429912275873e-05, "loss": 1.8845, "step": 1209 }, { "epoch": 0.03363951781504382, "grad_norm": 1.3165799379348755, "learning_rate": 1.9944207026747824e-05, "loss": 1.8535, "step": 1210 }, { "epoch": 0.033667319069436416, "grad_norm": 1.1551237106323242, "learning_rate": 1.9944114854876858e-05, "loss": 2.0169, "step": 1211 }, { "epoch": 0.033695120323829014, "grad_norm": 1.3037774562835693, "learning_rate": 1.994402260714654e-05, "loss": 2.1481, "step": 1212 }, { "epoch": 0.03372292157822161, "grad_norm": 1.2613604068756104, "learning_rate": 1.9943930283557567e-05, "loss": 2.3124, "step": 1213 }, { "epoch": 0.03375072283261421, "grad_norm": 1.2166721820831299, "learning_rate": 1.9943837884110647e-05, "loss": 1.8837, "step": 1214 }, { "epoch": 0.03377852408700681, "grad_norm": 1.4294248819351196, "learning_rate": 1.9943745408806486e-05, "loss": 2.2923, "step": 1215 }, { "epoch": 0.03380632534139941, "grad_norm": 1.2085946798324585, "learning_rate": 1.9943652857645784e-05, "loss": 2.2501, "step": 1216 }, { "epoch": 0.033834126595792005, "grad_norm": 1.2135885953903198, "learning_rate": 1.9943560230629254e-05, "loss": 1.9125, "step": 1217 }, { "epoch": 0.0338619278501846, "grad_norm": 1.1430251598358154, "learning_rate": 1.99434675277576e-05, "loss": 2.1407, "step": 1218 }, { "epoch": 0.0338897291045772, "grad_norm": 1.2234485149383545, "learning_rate": 1.994337474903152e-05, "loss": 2.0618, "step": 1219 }, { "epoch": 0.0339175303589698, "grad_norm": 1.269737958908081, "learning_rate": 1.9943281894451743e-05, "loss": 1.9456, "step": 1220 }, { "epoch": 0.0339453316133624, "grad_norm": 1.4007192850112915, "learning_rate": 1.9943188964018958e-05, "loss": 2.2358, "step": 1221 }, { "epoch": 0.033973132867754996, "grad_norm": 1.1576035022735596, "learning_rate": 1.994309595773388e-05, "loss": 2.277, "step": 1222 }, { "epoch": 0.034000934122147594, "grad_norm": 1.237837314605713, "learning_rate": 1.994300287559722e-05, "loss": 2.0874, "step": 1223 }, { "epoch": 0.03402873537654019, "grad_norm": 1.3552136421203613, "learning_rate": 1.994290971760969e-05, "loss": 2.2833, "step": 1224 }, { "epoch": 0.03405653663093279, "grad_norm": 1.2551783323287964, "learning_rate": 1.9942816483771997e-05, "loss": 2.0957, "step": 1225 }, { "epoch": 0.03408433788532539, "grad_norm": 1.2986005544662476, "learning_rate": 1.994272317408485e-05, "loss": 1.9666, "step": 1226 }, { "epoch": 0.03411213913971799, "grad_norm": 1.1070493459701538, "learning_rate": 1.9942629788548965e-05, "loss": 2.0234, "step": 1227 }, { "epoch": 0.034139940394110585, "grad_norm": 1.237081527709961, "learning_rate": 1.9942536327165056e-05, "loss": 2.2096, "step": 1228 }, { "epoch": 0.03416774164850318, "grad_norm": 1.233892560005188, "learning_rate": 1.994244278993383e-05, "loss": 2.1286, "step": 1229 }, { "epoch": 0.03419554290289578, "grad_norm": 1.1330904960632324, "learning_rate": 1.9942349176856005e-05, "loss": 2.2142, "step": 1230 }, { "epoch": 0.03422334415728838, "grad_norm": 1.2502447366714478, "learning_rate": 1.9942255487932293e-05, "loss": 2.0501, "step": 1231 }, { "epoch": 0.03425114541168098, "grad_norm": 1.1978919506072998, "learning_rate": 1.9942161723163415e-05, "loss": 2.399, "step": 1232 }, { "epoch": 0.034278946666073576, "grad_norm": 1.2179925441741943, "learning_rate": 1.9942067882550075e-05, "loss": 2.0406, "step": 1233 }, { "epoch": 0.034306747920466174, "grad_norm": 1.2188806533813477, "learning_rate": 1.9941973966093e-05, "loss": 1.8611, "step": 1234 }, { "epoch": 0.03433454917485877, "grad_norm": 1.2227526903152466, "learning_rate": 1.9941879973792897e-05, "loss": 2.3055, "step": 1235 }, { "epoch": 0.03436235042925137, "grad_norm": 1.4342420101165771, "learning_rate": 1.994178590565049e-05, "loss": 1.7664, "step": 1236 }, { "epoch": 0.03439015168364397, "grad_norm": 1.1872097253799438, "learning_rate": 1.9941691761666488e-05, "loss": 1.8616, "step": 1237 }, { "epoch": 0.03441795293803657, "grad_norm": 1.1868524551391602, "learning_rate": 1.994159754184162e-05, "loss": 2.3633, "step": 1238 }, { "epoch": 0.034445754192429165, "grad_norm": 1.235103726387024, "learning_rate": 1.99415032461766e-05, "loss": 2.367, "step": 1239 }, { "epoch": 0.03447355544682176, "grad_norm": 1.2777996063232422, "learning_rate": 1.9941408874672148e-05, "loss": 2.0897, "step": 1240 }, { "epoch": 0.03450135670121436, "grad_norm": 1.260987639427185, "learning_rate": 1.9941314427328982e-05, "loss": 1.7813, "step": 1241 }, { "epoch": 0.03452915795560696, "grad_norm": 1.2262928485870361, "learning_rate": 1.994121990414782e-05, "loss": 1.9662, "step": 1242 }, { "epoch": 0.03455695920999956, "grad_norm": 1.1867263317108154, "learning_rate": 1.9941125305129395e-05, "loss": 1.9841, "step": 1243 }, { "epoch": 0.034584760464392156, "grad_norm": 1.1628460884094238, "learning_rate": 1.9941030630274414e-05, "loss": 2.075, "step": 1244 }, { "epoch": 0.034612561718784754, "grad_norm": 1.313145399093628, "learning_rate": 1.9940935879583607e-05, "loss": 1.8662, "step": 1245 }, { "epoch": 0.03464036297317735, "grad_norm": 1.2847847938537598, "learning_rate": 1.9940841053057696e-05, "loss": 2.3493, "step": 1246 }, { "epoch": 0.03466816422756995, "grad_norm": 1.1442865133285522, "learning_rate": 1.9940746150697407e-05, "loss": 2.3079, "step": 1247 }, { "epoch": 0.03469596548196255, "grad_norm": 1.181789755821228, "learning_rate": 1.9940651172503457e-05, "loss": 2.1245, "step": 1248 }, { "epoch": 0.03472376673635515, "grad_norm": 1.20806884765625, "learning_rate": 1.994055611847657e-05, "loss": 2.2601, "step": 1249 }, { "epoch": 0.034751567990747745, "grad_norm": 1.3058011531829834, "learning_rate": 1.9940460988617483e-05, "loss": 2.031, "step": 1250 }, { "epoch": 0.03477936924514034, "grad_norm": 1.3279777765274048, "learning_rate": 1.9940365782926913e-05, "loss": 2.2339, "step": 1251 }, { "epoch": 0.03480717049953294, "grad_norm": 1.2800029516220093, "learning_rate": 1.9940270501405582e-05, "loss": 1.8893, "step": 1252 }, { "epoch": 0.03483497175392554, "grad_norm": 1.2503271102905273, "learning_rate": 1.9940175144054227e-05, "loss": 2.19, "step": 1253 }, { "epoch": 0.03486277300831814, "grad_norm": 1.2150905132293701, "learning_rate": 1.994007971087357e-05, "loss": 1.9969, "step": 1254 }, { "epoch": 0.034890574262710736, "grad_norm": 1.1979856491088867, "learning_rate": 1.9939984201864338e-05, "loss": 1.9877, "step": 1255 }, { "epoch": 0.034918375517103334, "grad_norm": 1.145242691040039, "learning_rate": 1.9939888617027265e-05, "loss": 1.9254, "step": 1256 }, { "epoch": 0.03494617677149593, "grad_norm": 1.2482068538665771, "learning_rate": 1.9939792956363074e-05, "loss": 2.1297, "step": 1257 }, { "epoch": 0.03497397802588853, "grad_norm": 1.1406564712524414, "learning_rate": 1.9939697219872497e-05, "loss": 1.9243, "step": 1258 }, { "epoch": 0.03500177928028113, "grad_norm": 1.23075270652771, "learning_rate": 1.993960140755627e-05, "loss": 1.9748, "step": 1259 }, { "epoch": 0.03502958053467373, "grad_norm": 1.2925224304199219, "learning_rate": 1.9939505519415113e-05, "loss": 1.993, "step": 1260 }, { "epoch": 0.035057381789066325, "grad_norm": 1.2474255561828613, "learning_rate": 1.9939409555449762e-05, "loss": 2.1092, "step": 1261 }, { "epoch": 0.03508518304345892, "grad_norm": 1.1872620582580566, "learning_rate": 1.9939313515660952e-05, "loss": 2.0986, "step": 1262 }, { "epoch": 0.03511298429785152, "grad_norm": 1.2687238454818726, "learning_rate": 1.9939217400049413e-05, "loss": 2.086, "step": 1263 }, { "epoch": 0.03514078555224412, "grad_norm": 1.3478718996047974, "learning_rate": 1.9939121208615878e-05, "loss": 2.0384, "step": 1264 }, { "epoch": 0.03516858680663672, "grad_norm": 1.2817445993423462, "learning_rate": 1.9939024941361083e-05, "loss": 2.3707, "step": 1265 }, { "epoch": 0.035196388061029316, "grad_norm": 1.1764520406723022, "learning_rate": 1.9938928598285764e-05, "loss": 2.0378, "step": 1266 }, { "epoch": 0.035224189315421914, "grad_norm": 1.3202978372573853, "learning_rate": 1.9938832179390652e-05, "loss": 2.1597, "step": 1267 }, { "epoch": 0.03525199056981451, "grad_norm": 1.2804394960403442, "learning_rate": 1.9938735684676482e-05, "loss": 2.005, "step": 1268 }, { "epoch": 0.03527979182420711, "grad_norm": 1.2151662111282349, "learning_rate": 1.9938639114143994e-05, "loss": 2.0953, "step": 1269 }, { "epoch": 0.03530759307859971, "grad_norm": 1.2177338600158691, "learning_rate": 1.993854246779392e-05, "loss": 2.1351, "step": 1270 }, { "epoch": 0.035335394332992306, "grad_norm": 1.2052662372589111, "learning_rate": 1.9938445745626998e-05, "loss": 1.9961, "step": 1271 }, { "epoch": 0.035363195587384905, "grad_norm": 1.2249430418014526, "learning_rate": 1.9938348947643975e-05, "loss": 2.0807, "step": 1272 }, { "epoch": 0.0353909968417775, "grad_norm": 1.1924840211868286, "learning_rate": 1.9938252073845573e-05, "loss": 1.9688, "step": 1273 }, { "epoch": 0.0354187980961701, "grad_norm": 1.2458494901657104, "learning_rate": 1.9938155124232545e-05, "loss": 2.1907, "step": 1274 }, { "epoch": 0.0354465993505627, "grad_norm": 1.212618350982666, "learning_rate": 1.9938058098805624e-05, "loss": 2.0753, "step": 1275 }, { "epoch": 0.0354744006049553, "grad_norm": 1.3073194026947021, "learning_rate": 1.9937960997565554e-05, "loss": 2.1649, "step": 1276 }, { "epoch": 0.035502201859347896, "grad_norm": 1.2116751670837402, "learning_rate": 1.9937863820513074e-05, "loss": 1.9639, "step": 1277 }, { "epoch": 0.035530003113740494, "grad_norm": 1.3377214670181274, "learning_rate": 1.993776656764892e-05, "loss": 1.9996, "step": 1278 }, { "epoch": 0.03555780436813309, "grad_norm": 1.2161060571670532, "learning_rate": 1.993766923897384e-05, "loss": 2.133, "step": 1279 }, { "epoch": 0.03558560562252569, "grad_norm": 1.2157573699951172, "learning_rate": 1.993757183448858e-05, "loss": 2.1304, "step": 1280 }, { "epoch": 0.03561340687691829, "grad_norm": 1.203014612197876, "learning_rate": 1.9937474354193873e-05, "loss": 2.1614, "step": 1281 }, { "epoch": 0.035641208131310886, "grad_norm": 1.2651705741882324, "learning_rate": 1.9937376798090473e-05, "loss": 2.0316, "step": 1282 }, { "epoch": 0.035669009385703485, "grad_norm": 1.1783782243728638, "learning_rate": 1.993727916617911e-05, "loss": 2.1759, "step": 1283 }, { "epoch": 0.03569681064009608, "grad_norm": 1.3300323486328125, "learning_rate": 1.9937181458460548e-05, "loss": 1.9178, "step": 1284 }, { "epoch": 0.03572461189448868, "grad_norm": 1.1590920686721802, "learning_rate": 1.993708367493552e-05, "loss": 2.1405, "step": 1285 }, { "epoch": 0.03575241314888128, "grad_norm": 1.2339591979980469, "learning_rate": 1.993698581560477e-05, "loss": 2.3958, "step": 1286 }, { "epoch": 0.03578021440327388, "grad_norm": 1.196839451789856, "learning_rate": 1.9936887880469054e-05, "loss": 2.1815, "step": 1287 }, { "epoch": 0.035808015657666475, "grad_norm": 1.107629418373108, "learning_rate": 1.993678986952911e-05, "loss": 1.9937, "step": 1288 }, { "epoch": 0.035835816912059074, "grad_norm": 1.2039545774459839, "learning_rate": 1.9936691782785692e-05, "loss": 2.1481, "step": 1289 }, { "epoch": 0.03586361816645167, "grad_norm": 1.2216969728469849, "learning_rate": 1.9936593620239543e-05, "loss": 2.4732, "step": 1290 }, { "epoch": 0.03589141942084427, "grad_norm": 1.2105244398117065, "learning_rate": 1.993649538189142e-05, "loss": 2.1444, "step": 1291 }, { "epoch": 0.03591922067523687, "grad_norm": 1.2554609775543213, "learning_rate": 1.993639706774206e-05, "loss": 2.3951, "step": 1292 }, { "epoch": 0.035947021929629466, "grad_norm": 1.2524057626724243, "learning_rate": 1.9936298677792222e-05, "loss": 2.7142, "step": 1293 }, { "epoch": 0.035974823184022064, "grad_norm": 1.2760930061340332, "learning_rate": 1.9936200212042658e-05, "loss": 2.239, "step": 1294 }, { "epoch": 0.03600262443841466, "grad_norm": 1.2530438899993896, "learning_rate": 1.9936101670494113e-05, "loss": 2.3079, "step": 1295 }, { "epoch": 0.03603042569280726, "grad_norm": 1.165138602256775, "learning_rate": 1.9936003053147345e-05, "loss": 2.2541, "step": 1296 }, { "epoch": 0.03605822694719986, "grad_norm": 1.3255267143249512, "learning_rate": 1.9935904360003098e-05, "loss": 1.9934, "step": 1297 }, { "epoch": 0.03608602820159246, "grad_norm": 1.1804821491241455, "learning_rate": 1.9935805591062132e-05, "loss": 2.2363, "step": 1298 }, { "epoch": 0.036113829455985055, "grad_norm": 1.2411335706710815, "learning_rate": 1.9935706746325198e-05, "loss": 2.4325, "step": 1299 }, { "epoch": 0.036141630710377654, "grad_norm": 1.2910631895065308, "learning_rate": 1.993560782579305e-05, "loss": 2.043, "step": 1300 }, { "epoch": 0.03616943196477025, "grad_norm": 1.2932283878326416, "learning_rate": 1.9935508829466444e-05, "loss": 1.9993, "step": 1301 }, { "epoch": 0.03619723321916285, "grad_norm": 1.1941739320755005, "learning_rate": 1.993540975734613e-05, "loss": 2.1648, "step": 1302 }, { "epoch": 0.03622503447355545, "grad_norm": 1.2468361854553223, "learning_rate": 1.9935310609432873e-05, "loss": 2.3949, "step": 1303 }, { "epoch": 0.036252835727948046, "grad_norm": 1.1922541856765747, "learning_rate": 1.9935211385727422e-05, "loss": 2.1483, "step": 1304 }, { "epoch": 0.036280636982340644, "grad_norm": 1.305637001991272, "learning_rate": 1.9935112086230537e-05, "loss": 1.9588, "step": 1305 }, { "epoch": 0.03630843823673324, "grad_norm": 1.2783691883087158, "learning_rate": 1.9935012710942977e-05, "loss": 2.0894, "step": 1306 }, { "epoch": 0.03633623949112584, "grad_norm": 1.2845031023025513, "learning_rate": 1.993491325986549e-05, "loss": 2.1892, "step": 1307 }, { "epoch": 0.03636404074551844, "grad_norm": 1.2030986547470093, "learning_rate": 1.993481373299885e-05, "loss": 2.1292, "step": 1308 }, { "epoch": 0.03639184199991104, "grad_norm": 1.1553462743759155, "learning_rate": 1.9934714130343807e-05, "loss": 2.0662, "step": 1309 }, { "epoch": 0.036419643254303635, "grad_norm": 1.1582826375961304, "learning_rate": 1.993461445190112e-05, "loss": 2.0414, "step": 1310 }, { "epoch": 0.03644744450869623, "grad_norm": 1.252187728881836, "learning_rate": 1.993451469767155e-05, "loss": 2.2474, "step": 1311 }, { "epoch": 0.03647524576308883, "grad_norm": 1.140934705734253, "learning_rate": 1.9934414867655863e-05, "loss": 2.0253, "step": 1312 }, { "epoch": 0.03650304701748143, "grad_norm": 1.2355079650878906, "learning_rate": 1.993431496185482e-05, "loss": 1.7059, "step": 1313 }, { "epoch": 0.03653084827187403, "grad_norm": 1.1832714080810547, "learning_rate": 1.9934214980269178e-05, "loss": 2.1603, "step": 1314 }, { "epoch": 0.036558649526266626, "grad_norm": 1.2211263179779053, "learning_rate": 1.9934114922899704e-05, "loss": 2.381, "step": 1315 }, { "epoch": 0.036586450780659224, "grad_norm": 1.185011386871338, "learning_rate": 1.9934014789747154e-05, "loss": 1.7521, "step": 1316 }, { "epoch": 0.03661425203505182, "grad_norm": 1.2781494855880737, "learning_rate": 1.9933914580812305e-05, "loss": 2.3463, "step": 1317 }, { "epoch": 0.03664205328944442, "grad_norm": 1.2361035346984863, "learning_rate": 1.993381429609591e-05, "loss": 2.0101, "step": 1318 }, { "epoch": 0.03666985454383702, "grad_norm": 1.4974513053894043, "learning_rate": 1.9933713935598737e-05, "loss": 2.0372, "step": 1319 }, { "epoch": 0.03669765579822962, "grad_norm": 1.296120047569275, "learning_rate": 1.9933613499321552e-05, "loss": 1.9587, "step": 1320 }, { "epoch": 0.036725457052622215, "grad_norm": 1.3080729246139526, "learning_rate": 1.993351298726512e-05, "loss": 2.4506, "step": 1321 }, { "epoch": 0.03675325830701481, "grad_norm": 1.1910256147384644, "learning_rate": 1.9933412399430215e-05, "loss": 1.9246, "step": 1322 }, { "epoch": 0.03678105956140741, "grad_norm": 1.1428884267807007, "learning_rate": 1.9933311735817597e-05, "loss": 1.7913, "step": 1323 }, { "epoch": 0.03680886081580001, "grad_norm": 1.296375036239624, "learning_rate": 1.9933210996428033e-05, "loss": 2.0665, "step": 1324 }, { "epoch": 0.03683666207019261, "grad_norm": 1.1729846000671387, "learning_rate": 1.9933110181262297e-05, "loss": 2.0154, "step": 1325 }, { "epoch": 0.036864463324585206, "grad_norm": 1.1968066692352295, "learning_rate": 1.9933009290321158e-05, "loss": 2.0311, "step": 1326 }, { "epoch": 0.036892264578977804, "grad_norm": 1.1854437589645386, "learning_rate": 1.993290832360538e-05, "loss": 1.9113, "step": 1327 }, { "epoch": 0.0369200658333704, "grad_norm": 1.307992696762085, "learning_rate": 1.9932807281115733e-05, "loss": 2.1646, "step": 1328 }, { "epoch": 0.036947867087763, "grad_norm": 1.2731554508209229, "learning_rate": 1.9932706162852993e-05, "loss": 2.0648, "step": 1329 }, { "epoch": 0.0369756683421556, "grad_norm": 1.2269641160964966, "learning_rate": 1.9932604968817932e-05, "loss": 1.9379, "step": 1330 }, { "epoch": 0.0370034695965482, "grad_norm": 1.1382948160171509, "learning_rate": 1.9932503699011318e-05, "loss": 2.1067, "step": 1331 }, { "epoch": 0.037031270850940795, "grad_norm": 1.1890647411346436, "learning_rate": 1.9932402353433928e-05, "loss": 2.1453, "step": 1332 }, { "epoch": 0.03705907210533339, "grad_norm": 1.2212414741516113, "learning_rate": 1.9932300932086526e-05, "loss": 2.0558, "step": 1333 }, { "epoch": 0.03708687335972599, "grad_norm": 1.2819281816482544, "learning_rate": 1.9932199434969893e-05, "loss": 2.075, "step": 1334 }, { "epoch": 0.03711467461411859, "grad_norm": 1.1629246473312378, "learning_rate": 1.9932097862084804e-05, "loss": 1.9659, "step": 1335 }, { "epoch": 0.03714247586851119, "grad_norm": 1.216323971748352, "learning_rate": 1.993199621343203e-05, "loss": 2.0514, "step": 1336 }, { "epoch": 0.037170277122903786, "grad_norm": 1.208565592765808, "learning_rate": 1.9931894489012353e-05, "loss": 2.1969, "step": 1337 }, { "epoch": 0.037198078377296384, "grad_norm": 1.1699680089950562, "learning_rate": 1.9931792688826538e-05, "loss": 2.1373, "step": 1338 }, { "epoch": 0.03722587963168898, "grad_norm": 1.3332046270370483, "learning_rate": 1.993169081287537e-05, "loss": 2.1323, "step": 1339 }, { "epoch": 0.03725368088608158, "grad_norm": 1.243316411972046, "learning_rate": 1.9931588861159622e-05, "loss": 2.1358, "step": 1340 }, { "epoch": 0.03728148214047418, "grad_norm": 1.2496333122253418, "learning_rate": 1.9931486833680078e-05, "loss": 2.0367, "step": 1341 }, { "epoch": 0.03730928339486678, "grad_norm": 1.2973045110702515, "learning_rate": 1.993138473043751e-05, "loss": 2.1235, "step": 1342 }, { "epoch": 0.037337084649259375, "grad_norm": 1.1528412103652954, "learning_rate": 1.9931282551432703e-05, "loss": 2.0156, "step": 1343 }, { "epoch": 0.03736488590365197, "grad_norm": 1.414344310760498, "learning_rate": 1.9931180296666427e-05, "loss": 2.0637, "step": 1344 }, { "epoch": 0.03739268715804457, "grad_norm": 1.234403371810913, "learning_rate": 1.9931077966139468e-05, "loss": 1.7512, "step": 1345 }, { "epoch": 0.03742048841243717, "grad_norm": 1.1327723264694214, "learning_rate": 1.9930975559852607e-05, "loss": 2.3068, "step": 1346 }, { "epoch": 0.03744828966682977, "grad_norm": 1.1688581705093384, "learning_rate": 1.9930873077806626e-05, "loss": 2.0789, "step": 1347 }, { "epoch": 0.037476090921222366, "grad_norm": 1.2273043394088745, "learning_rate": 1.9930770520002305e-05, "loss": 2.0584, "step": 1348 }, { "epoch": 0.037503892175614964, "grad_norm": 1.240781307220459, "learning_rate": 1.9930667886440425e-05, "loss": 2.1094, "step": 1349 }, { "epoch": 0.03753169343000756, "grad_norm": 1.3256891965866089, "learning_rate": 1.9930565177121775e-05, "loss": 1.917, "step": 1350 }, { "epoch": 0.03755949468440016, "grad_norm": 1.2659461498260498, "learning_rate": 1.9930462392047127e-05, "loss": 2.0421, "step": 1351 }, { "epoch": 0.03758729593879276, "grad_norm": 1.2482260465621948, "learning_rate": 1.9930359531217278e-05, "loss": 2.1722, "step": 1352 }, { "epoch": 0.03761509719318536, "grad_norm": 1.269026756286621, "learning_rate": 1.9930256594633003e-05, "loss": 2.2754, "step": 1353 }, { "epoch": 0.037642898447577955, "grad_norm": 1.1114495992660522, "learning_rate": 1.9930153582295092e-05, "loss": 2.0431, "step": 1354 }, { "epoch": 0.03767069970197055, "grad_norm": 1.1889506578445435, "learning_rate": 1.993005049420433e-05, "loss": 2.2152, "step": 1355 }, { "epoch": 0.03769850095636315, "grad_norm": 1.3291246891021729, "learning_rate": 1.9929947330361507e-05, "loss": 2.094, "step": 1356 }, { "epoch": 0.03772630221075575, "grad_norm": 1.208566427230835, "learning_rate": 1.99298440907674e-05, "loss": 1.9577, "step": 1357 }, { "epoch": 0.03775410346514835, "grad_norm": 1.2206013202667236, "learning_rate": 1.992974077542281e-05, "loss": 2.1404, "step": 1358 }, { "epoch": 0.037781904719540946, "grad_norm": 1.2113780975341797, "learning_rate": 1.9929637384328512e-05, "loss": 2.1329, "step": 1359 }, { "epoch": 0.037809705973933544, "grad_norm": 1.2720178365707397, "learning_rate": 1.9929533917485302e-05, "loss": 2.4281, "step": 1360 }, { "epoch": 0.03783750722832614, "grad_norm": 1.1791713237762451, "learning_rate": 1.992943037489397e-05, "loss": 2.2822, "step": 1361 }, { "epoch": 0.03786530848271874, "grad_norm": 1.1562762260437012, "learning_rate": 1.9929326756555304e-05, "loss": 2.1181, "step": 1362 }, { "epoch": 0.03789310973711134, "grad_norm": 1.26426362991333, "learning_rate": 1.992922306247009e-05, "loss": 2.3012, "step": 1363 }, { "epoch": 0.03792091099150394, "grad_norm": 1.301809549331665, "learning_rate": 1.992911929263913e-05, "loss": 2.0932, "step": 1364 }, { "epoch": 0.037948712245896535, "grad_norm": 1.2428250312805176, "learning_rate": 1.9929015447063206e-05, "loss": 2.2456, "step": 1365 }, { "epoch": 0.03797651350028913, "grad_norm": 1.2517924308776855, "learning_rate": 1.992891152574311e-05, "loss": 2.4408, "step": 1366 }, { "epoch": 0.03800431475468173, "grad_norm": 1.2607048749923706, "learning_rate": 1.992880752867964e-05, "loss": 2.305, "step": 1367 }, { "epoch": 0.03803211600907433, "grad_norm": 1.2496546506881714, "learning_rate": 1.992870345587359e-05, "loss": 2.095, "step": 1368 }, { "epoch": 0.03805991726346693, "grad_norm": 1.2667412757873535, "learning_rate": 1.992859930732575e-05, "loss": 2.057, "step": 1369 }, { "epoch": 0.038087718517859526, "grad_norm": 1.13648521900177, "learning_rate": 1.9928495083036917e-05, "loss": 2.2642, "step": 1370 }, { "epoch": 0.038115519772252124, "grad_norm": 1.2686041593551636, "learning_rate": 1.992839078300788e-05, "loss": 2.2598, "step": 1371 }, { "epoch": 0.03814332102664472, "grad_norm": 1.2756860256195068, "learning_rate": 1.9928286407239447e-05, "loss": 2.5115, "step": 1372 }, { "epoch": 0.03817112228103732, "grad_norm": 1.2359508275985718, "learning_rate": 1.9928181955732402e-05, "loss": 1.7061, "step": 1373 }, { "epoch": 0.03819892353542992, "grad_norm": 1.2201504707336426, "learning_rate": 1.9928077428487548e-05, "loss": 2.1326, "step": 1374 }, { "epoch": 0.03822672478982252, "grad_norm": 1.1320239305496216, "learning_rate": 1.992797282550568e-05, "loss": 1.9853, "step": 1375 }, { "epoch": 0.038254526044215115, "grad_norm": 1.2424898147583008, "learning_rate": 1.9927868146787596e-05, "loss": 2.4017, "step": 1376 }, { "epoch": 0.03828232729860771, "grad_norm": 1.1589460372924805, "learning_rate": 1.9927763392334095e-05, "loss": 2.1795, "step": 1377 }, { "epoch": 0.03831012855300031, "grad_norm": 1.1879429817199707, "learning_rate": 1.992765856214598e-05, "loss": 2.1876, "step": 1378 }, { "epoch": 0.03833792980739291, "grad_norm": 1.2368412017822266, "learning_rate": 1.9927553656224048e-05, "loss": 2.3027, "step": 1379 }, { "epoch": 0.03836573106178551, "grad_norm": 1.220084547996521, "learning_rate": 1.9927448674569097e-05, "loss": 2.2704, "step": 1380 }, { "epoch": 0.038393532316178106, "grad_norm": 1.2466228008270264, "learning_rate": 1.9927343617181927e-05, "loss": 2.4138, "step": 1381 }, { "epoch": 0.038421333570570704, "grad_norm": 1.2966084480285645, "learning_rate": 1.9927238484063346e-05, "loss": 2.7252, "step": 1382 }, { "epoch": 0.0384491348249633, "grad_norm": 1.145957350730896, "learning_rate": 1.992713327521415e-05, "loss": 2.3152, "step": 1383 }, { "epoch": 0.0384769360793559, "grad_norm": 1.1721484661102295, "learning_rate": 1.992702799063514e-05, "loss": 2.105, "step": 1384 }, { "epoch": 0.0385047373337485, "grad_norm": 1.1776816844940186, "learning_rate": 1.9926922630327127e-05, "loss": 1.9537, "step": 1385 }, { "epoch": 0.038532538588141096, "grad_norm": 1.2493082284927368, "learning_rate": 1.992681719429091e-05, "loss": 1.9924, "step": 1386 }, { "epoch": 0.038560339842533695, "grad_norm": 1.1525123119354248, "learning_rate": 1.9926711682527294e-05, "loss": 2.0866, "step": 1387 }, { "epoch": 0.03858814109692629, "grad_norm": 1.2515438795089722, "learning_rate": 1.9926606095037083e-05, "loss": 2.1452, "step": 1388 }, { "epoch": 0.03861594235131889, "grad_norm": 1.2053173780441284, "learning_rate": 1.9926500431821083e-05, "loss": 2.2153, "step": 1389 }, { "epoch": 0.03864374360571149, "grad_norm": 1.1855990886688232, "learning_rate": 1.99263946928801e-05, "loss": 2.1029, "step": 1390 }, { "epoch": 0.03867154486010409, "grad_norm": 1.2139908075332642, "learning_rate": 1.992628887821494e-05, "loss": 2.1839, "step": 1391 }, { "epoch": 0.038699346114496685, "grad_norm": 1.1785370111465454, "learning_rate": 1.992618298782641e-05, "loss": 2.4724, "step": 1392 }, { "epoch": 0.038727147368889284, "grad_norm": 1.17913818359375, "learning_rate": 1.9926077021715325e-05, "loss": 2.0771, "step": 1393 }, { "epoch": 0.03875494862328188, "grad_norm": 1.117510437965393, "learning_rate": 1.9925970979882482e-05, "loss": 1.9172, "step": 1394 }, { "epoch": 0.03878274987767448, "grad_norm": 1.2049167156219482, "learning_rate": 1.99258648623287e-05, "loss": 1.8222, "step": 1395 }, { "epoch": 0.03881055113206708, "grad_norm": 1.1876420974731445, "learning_rate": 1.9925758669054777e-05, "loss": 1.8541, "step": 1396 }, { "epoch": 0.038838352386459676, "grad_norm": 1.1437922716140747, "learning_rate": 1.9925652400061535e-05, "loss": 2.1097, "step": 1397 }, { "epoch": 0.038866153640852275, "grad_norm": 1.1680666208267212, "learning_rate": 1.9925546055349775e-05, "loss": 2.0993, "step": 1398 }, { "epoch": 0.03889395489524487, "grad_norm": 1.170271635055542, "learning_rate": 1.9925439634920316e-05, "loss": 2.1513, "step": 1399 }, { "epoch": 0.03892175614963747, "grad_norm": 1.2494733333587646, "learning_rate": 1.992533313877397e-05, "loss": 1.9584, "step": 1400 }, { "epoch": 0.03894955740403007, "grad_norm": 1.2351012229919434, "learning_rate": 1.992522656691154e-05, "loss": 2.4126, "step": 1401 }, { "epoch": 0.03897735865842267, "grad_norm": 1.2524217367172241, "learning_rate": 1.9925119919333845e-05, "loss": 2.2178, "step": 1402 }, { "epoch": 0.039005159912815265, "grad_norm": 1.1896950006484985, "learning_rate": 1.9925013196041702e-05, "loss": 2.1478, "step": 1403 }, { "epoch": 0.039032961167207864, "grad_norm": 1.194324254989624, "learning_rate": 1.9924906397035922e-05, "loss": 1.8919, "step": 1404 }, { "epoch": 0.03906076242160046, "grad_norm": 1.3205488920211792, "learning_rate": 1.9924799522317317e-05, "loss": 2.5284, "step": 1405 }, { "epoch": 0.03908856367599306, "grad_norm": 1.224345326423645, "learning_rate": 1.9924692571886702e-05, "loss": 2.211, "step": 1406 }, { "epoch": 0.03911636493038566, "grad_norm": 1.2657428979873657, "learning_rate": 1.99245855457449e-05, "loss": 1.9687, "step": 1407 }, { "epoch": 0.039144166184778256, "grad_norm": 1.271405816078186, "learning_rate": 1.9924478443892724e-05, "loss": 2.24, "step": 1408 }, { "epoch": 0.039171967439170854, "grad_norm": 1.298462986946106, "learning_rate": 1.9924371266330987e-05, "loss": 2.2879, "step": 1409 }, { "epoch": 0.03919976869356345, "grad_norm": 1.3107045888900757, "learning_rate": 1.9924264013060513e-05, "loss": 2.0442, "step": 1410 }, { "epoch": 0.03922756994795605, "grad_norm": 1.2571825981140137, "learning_rate": 1.9924156684082115e-05, "loss": 1.8838, "step": 1411 }, { "epoch": 0.03925537120234865, "grad_norm": 1.2439790964126587, "learning_rate": 1.9924049279396615e-05, "loss": 2.1171, "step": 1412 }, { "epoch": 0.03928317245674125, "grad_norm": 1.259441614151001, "learning_rate": 1.9923941799004827e-05, "loss": 1.9855, "step": 1413 }, { "epoch": 0.039310973711133845, "grad_norm": 1.199937343597412, "learning_rate": 1.9923834242907576e-05, "loss": 2.1268, "step": 1414 }, { "epoch": 0.039338774965526443, "grad_norm": 1.1825830936431885, "learning_rate": 1.9923726611105684e-05, "loss": 2.1801, "step": 1415 }, { "epoch": 0.03936657621991904, "grad_norm": 1.191115379333496, "learning_rate": 1.992361890359997e-05, "loss": 1.9817, "step": 1416 }, { "epoch": 0.03939437747431164, "grad_norm": 1.3097801208496094, "learning_rate": 1.992351112039125e-05, "loss": 2.1628, "step": 1417 }, { "epoch": 0.03942217872870424, "grad_norm": 1.3246010541915894, "learning_rate": 1.9923403261480356e-05, "loss": 2.3863, "step": 1418 }, { "epoch": 0.039449979983096836, "grad_norm": 1.2098909616470337, "learning_rate": 1.9923295326868106e-05, "loss": 2.186, "step": 1419 }, { "epoch": 0.039477781237489434, "grad_norm": 1.2373130321502686, "learning_rate": 1.9923187316555325e-05, "loss": 2.1276, "step": 1420 }, { "epoch": 0.03950558249188203, "grad_norm": 1.2550370693206787, "learning_rate": 1.992307923054283e-05, "loss": 2.1903, "step": 1421 }, { "epoch": 0.03953338374627463, "grad_norm": 1.1255377531051636, "learning_rate": 1.9922971068831455e-05, "loss": 2.0589, "step": 1422 }, { "epoch": 0.03956118500066723, "grad_norm": 1.301255464553833, "learning_rate": 1.992286283142202e-05, "loss": 2.0217, "step": 1423 }, { "epoch": 0.03958898625505983, "grad_norm": 1.1865506172180176, "learning_rate": 1.9922754518315353e-05, "loss": 1.9126, "step": 1424 }, { "epoch": 0.039616787509452425, "grad_norm": 1.2210986614227295, "learning_rate": 1.9922646129512277e-05, "loss": 2.3116, "step": 1425 }, { "epoch": 0.03964458876384502, "grad_norm": 1.2077000141143799, "learning_rate": 1.9922537665013623e-05, "loss": 2.085, "step": 1426 }, { "epoch": 0.03967239001823762, "grad_norm": 1.2165359258651733, "learning_rate": 1.992242912482022e-05, "loss": 2.3008, "step": 1427 }, { "epoch": 0.03970019127263022, "grad_norm": 1.2171224355697632, "learning_rate": 1.9922320508932884e-05, "loss": 2.1229, "step": 1428 }, { "epoch": 0.03972799252702282, "grad_norm": 1.2421091794967651, "learning_rate": 1.992221181735246e-05, "loss": 1.8808, "step": 1429 }, { "epoch": 0.039755793781415416, "grad_norm": 1.2045314311981201, "learning_rate": 1.9922103050079764e-05, "loss": 2.2347, "step": 1430 }, { "epoch": 0.039783595035808014, "grad_norm": 1.1925619840621948, "learning_rate": 1.9921994207115633e-05, "loss": 1.9731, "step": 1431 }, { "epoch": 0.03981139629020061, "grad_norm": 1.1867597103118896, "learning_rate": 1.9921885288460894e-05, "loss": 2.3045, "step": 1432 }, { "epoch": 0.03983919754459321, "grad_norm": 1.1583549976348877, "learning_rate": 1.992177629411638e-05, "loss": 1.8489, "step": 1433 }, { "epoch": 0.03986699879898581, "grad_norm": 1.169317364692688, "learning_rate": 1.992166722408292e-05, "loss": 2.1237, "step": 1434 }, { "epoch": 0.03989480005337841, "grad_norm": 1.231044054031372, "learning_rate": 1.992155807836135e-05, "loss": 2.0402, "step": 1435 }, { "epoch": 0.039922601307771005, "grad_norm": 1.1068332195281982, "learning_rate": 1.9921448856952503e-05, "loss": 1.7569, "step": 1436 }, { "epoch": 0.0399504025621636, "grad_norm": 1.3066352605819702, "learning_rate": 1.9921339559857206e-05, "loss": 2.0545, "step": 1437 }, { "epoch": 0.0399782038165562, "grad_norm": 1.244352102279663, "learning_rate": 1.9921230187076297e-05, "loss": 1.9405, "step": 1438 }, { "epoch": 0.0400060050709488, "grad_norm": 1.311156153678894, "learning_rate": 1.9921120738610605e-05, "loss": 2.0538, "step": 1439 }, { "epoch": 0.0400338063253414, "grad_norm": 1.2236918210983276, "learning_rate": 1.9921011214460978e-05, "loss": 1.8599, "step": 1440 }, { "epoch": 0.040061607579733996, "grad_norm": 1.2163811922073364, "learning_rate": 1.992090161462824e-05, "loss": 2.1559, "step": 1441 }, { "epoch": 0.040089408834126594, "grad_norm": 1.2320597171783447, "learning_rate": 1.9920791939113228e-05, "loss": 2.1001, "step": 1442 }, { "epoch": 0.04011721008851919, "grad_norm": 1.1804983615875244, "learning_rate": 1.992068218791678e-05, "loss": 2.0221, "step": 1443 }, { "epoch": 0.04014501134291179, "grad_norm": 1.154345154762268, "learning_rate": 1.9920572361039737e-05, "loss": 2.1917, "step": 1444 }, { "epoch": 0.04017281259730439, "grad_norm": 1.1380411386489868, "learning_rate": 1.992046245848293e-05, "loss": 2.2144, "step": 1445 }, { "epoch": 0.04020061385169699, "grad_norm": 1.188504934310913, "learning_rate": 1.992035248024721e-05, "loss": 1.9563, "step": 1446 }, { "epoch": 0.040228415106089585, "grad_norm": 1.194416880607605, "learning_rate": 1.9920242426333398e-05, "loss": 2.4288, "step": 1447 }, { "epoch": 0.04025621636048218, "grad_norm": 1.1254068613052368, "learning_rate": 1.9920132296742347e-05, "loss": 1.972, "step": 1448 }, { "epoch": 0.04028401761487478, "grad_norm": 1.2054961919784546, "learning_rate": 1.9920022091474892e-05, "loss": 2.1187, "step": 1449 }, { "epoch": 0.04031181886926738, "grad_norm": 1.1669386625289917, "learning_rate": 1.9919911810531874e-05, "loss": 2.0905, "step": 1450 }, { "epoch": 0.12908678439571195, "grad_norm": 0.674976646900177, "learning_rate": 1.918883128019115e-05, "loss": 2.0356, "step": 1451 }, { "epoch": 0.12917574840976825, "grad_norm": 0.7310806512832642, "learning_rate": 1.9187728205932974e-05, "loss": 2.1057, "step": 1452 }, { "epoch": 0.12926471242382456, "grad_norm": 0.7335729598999023, "learning_rate": 1.91866244139214e-05, "loss": 2.024, "step": 1453 }, { "epoch": 0.12935367643788087, "grad_norm": 0.7030975818634033, "learning_rate": 1.9185519904242647e-05, "loss": 2.1634, "step": 1454 }, { "epoch": 0.12944264045193718, "grad_norm": 0.7368497848510742, "learning_rate": 1.9184414676983006e-05, "loss": 2.0831, "step": 1455 }, { "epoch": 0.12953160446599352, "grad_norm": 0.7524637579917908, "learning_rate": 1.9183308732228827e-05, "loss": 2.2064, "step": 1456 }, { "epoch": 0.12962056848004982, "grad_norm": 0.7159258127212524, "learning_rate": 1.9182202070066494e-05, "loss": 1.9866, "step": 1457 }, { "epoch": 0.12970953249410613, "grad_norm": 0.7139582633972168, "learning_rate": 1.918109469058247e-05, "loss": 2.1594, "step": 1458 }, { "epoch": 0.12979849650816244, "grad_norm": 0.6831613183021545, "learning_rate": 1.9179986593863257e-05, "loss": 2.1885, "step": 1459 }, { "epoch": 0.12988746052221875, "grad_norm": 0.7433372735977173, "learning_rate": 1.9178877779995423e-05, "loss": 2.1902, "step": 1460 }, { "epoch": 0.1299764245362751, "grad_norm": 0.7060792446136475, "learning_rate": 1.917776824906559e-05, "loss": 2.0515, "step": 1461 }, { "epoch": 0.1300653885503314, "grad_norm": 0.7091501951217651, "learning_rate": 1.9176658001160443e-05, "loss": 2.1377, "step": 1462 }, { "epoch": 0.1301543525643877, "grad_norm": 0.7096594572067261, "learning_rate": 1.91755470363667e-05, "loss": 2.1208, "step": 1463 }, { "epoch": 0.130243316578444, "grad_norm": 0.6679684519767761, "learning_rate": 1.9174435354771167e-05, "loss": 1.9931, "step": 1464 }, { "epoch": 0.13033228059250032, "grad_norm": 0.7237833738327026, "learning_rate": 1.9173322956460675e-05, "loss": 2.19, "step": 1465 }, { "epoch": 0.13042124460655666, "grad_norm": 0.716782808303833, "learning_rate": 1.9172209841522134e-05, "loss": 2.2026, "step": 1466 }, { "epoch": 0.13051020862061297, "grad_norm": 0.7138167023658752, "learning_rate": 1.91710960100425e-05, "loss": 2.1479, "step": 1467 }, { "epoch": 0.13059917263466927, "grad_norm": 0.6813187599182129, "learning_rate": 1.9169981462108788e-05, "loss": 2.1735, "step": 1468 }, { "epoch": 0.13068813664872558, "grad_norm": 0.730536937713623, "learning_rate": 1.9168866197808064e-05, "loss": 2.1064, "step": 1469 }, { "epoch": 0.1307771006627819, "grad_norm": 0.6633700728416443, "learning_rate": 1.9167750217227454e-05, "loss": 2.2471, "step": 1470 }, { "epoch": 0.13086606467683823, "grad_norm": 0.7200865745544434, "learning_rate": 1.9166633520454144e-05, "loss": 2.1037, "step": 1471 }, { "epoch": 0.13095502869089454, "grad_norm": 0.6877714991569519, "learning_rate": 1.9165516107575365e-05, "loss": 2.2933, "step": 1472 }, { "epoch": 0.13104399270495085, "grad_norm": 0.7162935137748718, "learning_rate": 1.9164397978678412e-05, "loss": 2.3149, "step": 1473 }, { "epoch": 0.13113295671900715, "grad_norm": 0.6733060479164124, "learning_rate": 1.916327913385064e-05, "loss": 2.1126, "step": 1474 }, { "epoch": 0.13122192073306346, "grad_norm": 0.7055361866950989, "learning_rate": 1.9162159573179446e-05, "loss": 2.2286, "step": 1475 }, { "epoch": 0.1313108847471198, "grad_norm": 0.7420381307601929, "learning_rate": 1.9161039296752296e-05, "loss": 2.2274, "step": 1476 }, { "epoch": 0.1313998487611761, "grad_norm": 0.7491070032119751, "learning_rate": 1.9159918304656703e-05, "loss": 2.135, "step": 1477 }, { "epoch": 0.13148881277523242, "grad_norm": 0.6901069283485413, "learning_rate": 1.9158796596980242e-05, "loss": 1.8872, "step": 1478 }, { "epoch": 0.13157777678928873, "grad_norm": 0.7616465091705322, "learning_rate": 1.9157674173810544e-05, "loss": 2.2045, "step": 1479 }, { "epoch": 0.13166674080334503, "grad_norm": 0.6756492257118225, "learning_rate": 1.915655103523529e-05, "loss": 2.1401, "step": 1480 }, { "epoch": 0.13175570481740137, "grad_norm": 0.7493202686309814, "learning_rate": 1.915542718134223e-05, "loss": 2.1485, "step": 1481 }, { "epoch": 0.13184466883145768, "grad_norm": 0.7584684491157532, "learning_rate": 1.9154302612219144e-05, "loss": 2.1721, "step": 1482 }, { "epoch": 0.131933632845514, "grad_norm": 0.6957008838653564, "learning_rate": 1.9153177327953897e-05, "loss": 2.139, "step": 1483 }, { "epoch": 0.1320225968595703, "grad_norm": 0.672055184841156, "learning_rate": 1.9152051328634393e-05, "loss": 2.2069, "step": 1484 }, { "epoch": 0.1321115608736266, "grad_norm": 0.6958116292953491, "learning_rate": 1.9150924614348594e-05, "loss": 2.1101, "step": 1485 }, { "epoch": 0.13220052488768294, "grad_norm": 0.7473313808441162, "learning_rate": 1.9149797185184526e-05, "loss": 2.0489, "step": 1486 }, { "epoch": 0.13228948890173925, "grad_norm": 0.6686244010925293, "learning_rate": 1.9148669041230257e-05, "loss": 2.1462, "step": 1487 }, { "epoch": 0.13237845291579556, "grad_norm": 0.6933692097663879, "learning_rate": 1.9147540182573925e-05, "loss": 2.1164, "step": 1488 }, { "epoch": 0.13246741692985187, "grad_norm": 0.6905208230018616, "learning_rate": 1.9146410609303716e-05, "loss": 2.1732, "step": 1489 }, { "epoch": 0.13255638094390818, "grad_norm": 0.7162343859672546, "learning_rate": 1.9145280321507872e-05, "loss": 2.0226, "step": 1490 }, { "epoch": 0.1326453449579645, "grad_norm": 0.6874192357063293, "learning_rate": 1.914414931927469e-05, "loss": 2.107, "step": 1491 }, { "epoch": 0.13273430897202082, "grad_norm": 0.7397066354751587, "learning_rate": 1.914301760269253e-05, "loss": 2.037, "step": 1492 }, { "epoch": 0.13282327298607713, "grad_norm": 0.7085984349250793, "learning_rate": 1.91418851718498e-05, "loss": 2.0926, "step": 1493 }, { "epoch": 0.13291223700013344, "grad_norm": 0.6834755539894104, "learning_rate": 1.9140752026834966e-05, "loss": 2.1119, "step": 1494 }, { "epoch": 0.13300120101418977, "grad_norm": 0.7041957974433899, "learning_rate": 1.913961816773655e-05, "loss": 2.1888, "step": 1495 }, { "epoch": 0.13309016502824608, "grad_norm": 0.7494297027587891, "learning_rate": 1.9138483594643133e-05, "loss": 2.1941, "step": 1496 }, { "epoch": 0.1331791290423024, "grad_norm": 0.6943486928939819, "learning_rate": 1.9137348307643344e-05, "loss": 2.1824, "step": 1497 }, { "epoch": 0.1332680930563587, "grad_norm": 0.690528154373169, "learning_rate": 1.913621230682588e-05, "loss": 2.1335, "step": 1498 }, { "epoch": 0.133357057070415, "grad_norm": 0.7519603967666626, "learning_rate": 1.9135075592279475e-05, "loss": 2.0838, "step": 1499 }, { "epoch": 0.13344602108447134, "grad_norm": 0.7231424450874329, "learning_rate": 1.9133938164092942e-05, "loss": 2.0468, "step": 1500 }, { "epoch": 0.13353498509852765, "grad_norm": 0.6963239908218384, "learning_rate": 1.913280002235513e-05, "loss": 2.2255, "step": 1501 }, { "epoch": 0.13362394911258396, "grad_norm": 0.7087194919586182, "learning_rate": 1.9131661167154954e-05, "loss": 2.0769, "step": 1502 }, { "epoch": 0.13371291312664027, "grad_norm": 0.7032302618026733, "learning_rate": 1.9130521598581385e-05, "loss": 2.1591, "step": 1503 }, { "epoch": 0.13380187714069658, "grad_norm": 0.7066627144813538, "learning_rate": 1.9129381316723442e-05, "loss": 2.0189, "step": 1504 }, { "epoch": 0.13389084115475292, "grad_norm": 0.6774100661277771, "learning_rate": 1.912824032167021e-05, "loss": 1.9907, "step": 1505 }, { "epoch": 0.13397980516880922, "grad_norm": 0.8225818276405334, "learning_rate": 1.9127098613510825e-05, "loss": 2.3418, "step": 1506 }, { "epoch": 0.13406876918286553, "grad_norm": 0.70097815990448, "learning_rate": 1.9125956192334473e-05, "loss": 2.2569, "step": 1507 }, { "epoch": 0.13415773319692184, "grad_norm": 0.6995531916618347, "learning_rate": 1.91248130582304e-05, "loss": 2.2324, "step": 1508 }, { "epoch": 0.13424669721097815, "grad_norm": 0.6896299123764038, "learning_rate": 1.9123669211287916e-05, "loss": 2.0234, "step": 1509 }, { "epoch": 0.1343356612250345, "grad_norm": 0.6727869510650635, "learning_rate": 1.9122524651596376e-05, "loss": 2.1753, "step": 1510 }, { "epoch": 0.1344246252390908, "grad_norm": 0.7001076936721802, "learning_rate": 1.912137937924519e-05, "loss": 1.9737, "step": 1511 }, { "epoch": 0.1345135892531471, "grad_norm": 0.7247818112373352, "learning_rate": 1.9120233394323833e-05, "loss": 1.9521, "step": 1512 }, { "epoch": 0.1346025532672034, "grad_norm": 0.7155790328979492, "learning_rate": 1.9119086696921826e-05, "loss": 2.0935, "step": 1513 }, { "epoch": 0.13469151728125972, "grad_norm": 0.8023723363876343, "learning_rate": 1.911793928712876e-05, "loss": 2.2189, "step": 1514 }, { "epoch": 0.13478048129531606, "grad_norm": 0.6678867340087891, "learning_rate": 1.9116791165034258e-05, "loss": 2.1952, "step": 1515 }, { "epoch": 0.13486944530937237, "grad_norm": 0.6956883072853088, "learning_rate": 1.9115642330728018e-05, "loss": 2.1945, "step": 1516 }, { "epoch": 0.13495840932342867, "grad_norm": 0.7054136991500854, "learning_rate": 1.911449278429979e-05, "loss": 2.1408, "step": 1517 }, { "epoch": 0.13504737333748498, "grad_norm": 0.6842026710510254, "learning_rate": 1.9113342525839372e-05, "loss": 2.2023, "step": 1518 }, { "epoch": 0.1351363373515413, "grad_norm": 0.7804849147796631, "learning_rate": 1.9112191555436632e-05, "loss": 2.0387, "step": 1519 }, { "epoch": 0.13522530136559763, "grad_norm": 0.7178310751914978, "learning_rate": 1.9111039873181478e-05, "loss": 2.1217, "step": 1520 }, { "epoch": 0.13531426537965394, "grad_norm": 0.7458540201187134, "learning_rate": 1.910988747916388e-05, "loss": 1.9931, "step": 1521 }, { "epoch": 0.13540322939371024, "grad_norm": 0.7119858860969543, "learning_rate": 1.9108734373473874e-05, "loss": 1.9836, "step": 1522 }, { "epoch": 0.13549219340776655, "grad_norm": 0.721994936466217, "learning_rate": 1.9107580556201527e-05, "loss": 2.152, "step": 1523 }, { "epoch": 0.13558115742182286, "grad_norm": 0.7333192825317383, "learning_rate": 1.9106426027436985e-05, "loss": 2.2056, "step": 1524 }, { "epoch": 0.1356701214358792, "grad_norm": 0.7366089820861816, "learning_rate": 1.9105270787270442e-05, "loss": 2.1027, "step": 1525 }, { "epoch": 0.1357590854499355, "grad_norm": 0.7128590941429138, "learning_rate": 1.910411483579214e-05, "loss": 2.0367, "step": 1526 }, { "epoch": 0.13584804946399182, "grad_norm": 0.7528856992721558, "learning_rate": 1.9102958173092387e-05, "loss": 2.0628, "step": 1527 }, { "epoch": 0.13593701347804812, "grad_norm": 0.724431037902832, "learning_rate": 1.9101800799261543e-05, "loss": 2.2369, "step": 1528 }, { "epoch": 0.13602597749210443, "grad_norm": 0.7336472272872925, "learning_rate": 1.910064271439002e-05, "loss": 2.1532, "step": 1529 }, { "epoch": 0.13611494150616077, "grad_norm": 0.7338613867759705, "learning_rate": 1.9099483918568294e-05, "loss": 1.9967, "step": 1530 }, { "epoch": 0.13620390552021708, "grad_norm": 0.6688590049743652, "learning_rate": 1.9098324411886883e-05, "loss": 2.0811, "step": 1531 }, { "epoch": 0.1362928695342734, "grad_norm": 0.831134021282196, "learning_rate": 1.9097164194436378e-05, "loss": 2.1027, "step": 1532 }, { "epoch": 0.1363818335483297, "grad_norm": 0.8285362124443054, "learning_rate": 1.909600326630741e-05, "loss": 2.0172, "step": 1533 }, { "epoch": 0.136470797562386, "grad_norm": 0.7233934998512268, "learning_rate": 1.9094841627590673e-05, "loss": 2.2165, "step": 1534 }, { "epoch": 0.13655976157644234, "grad_norm": 0.6699313521385193, "learning_rate": 1.9093679278376913e-05, "loss": 2.0704, "step": 1535 }, { "epoch": 0.13664872559049865, "grad_norm": 0.7230476140975952, "learning_rate": 1.909251621875694e-05, "loss": 2.1482, "step": 1536 }, { "epoch": 0.13673768960455496, "grad_norm": 0.767360508441925, "learning_rate": 1.9091352448821607e-05, "loss": 2.1803, "step": 1537 }, { "epoch": 0.13682665361861127, "grad_norm": 0.6728501915931702, "learning_rate": 1.9090187968661834e-05, "loss": 2.1466, "step": 1538 }, { "epoch": 0.13691561763266757, "grad_norm": 0.6831271648406982, "learning_rate": 1.9089022778368584e-05, "loss": 2.1824, "step": 1539 }, { "epoch": 0.1370045816467239, "grad_norm": 0.7068498730659485, "learning_rate": 1.908785687803289e-05, "loss": 2.2738, "step": 1540 }, { "epoch": 0.13709354566078022, "grad_norm": 0.6927396655082703, "learning_rate": 1.9086690267745835e-05, "loss": 2.0958, "step": 1541 }, { "epoch": 0.13718250967483653, "grad_norm": 0.6951919198036194, "learning_rate": 1.9085522947598542e-05, "loss": 2.0689, "step": 1542 }, { "epoch": 0.13727147368889284, "grad_norm": 0.7033616900444031, "learning_rate": 1.9084354917682218e-05, "loss": 2.1778, "step": 1543 }, { "epoch": 0.13736043770294915, "grad_norm": 0.7442080974578857, "learning_rate": 1.9083186178088103e-05, "loss": 2.2193, "step": 1544 }, { "epoch": 0.13744940171700548, "grad_norm": 0.7784643173217773, "learning_rate": 1.90820167289075e-05, "loss": 2.1083, "step": 1545 }, { "epoch": 0.1375383657310618, "grad_norm": 0.6760720014572144, "learning_rate": 1.908084657023177e-05, "loss": 2.2061, "step": 1546 }, { "epoch": 0.1376273297451181, "grad_norm": 0.7411883473396301, "learning_rate": 1.9079675702152327e-05, "loss": 2.1173, "step": 1547 }, { "epoch": 0.1377162937591744, "grad_norm": 0.730983555316925, "learning_rate": 1.907850412476064e-05, "loss": 2.1532, "step": 1548 }, { "epoch": 0.13780525777323072, "grad_norm": 0.715723991394043, "learning_rate": 1.9077331838148228e-05, "loss": 2.0931, "step": 1549 }, { "epoch": 0.13789422178728705, "grad_norm": 0.6817996501922607, "learning_rate": 1.9076158842406677e-05, "loss": 2.1363, "step": 1550 }, { "epoch": 0.13798318580134336, "grad_norm": 0.7068785429000854, "learning_rate": 1.9074985137627623e-05, "loss": 2.1468, "step": 1551 }, { "epoch": 0.13807214981539967, "grad_norm": 0.7507544755935669, "learning_rate": 1.9073810723902757e-05, "loss": 2.1402, "step": 1552 }, { "epoch": 0.13816111382945598, "grad_norm": 0.742301881313324, "learning_rate": 1.9072635601323817e-05, "loss": 2.0598, "step": 1553 }, { "epoch": 0.1382500778435123, "grad_norm": 0.7125744819641113, "learning_rate": 1.9071459769982615e-05, "loss": 2.1225, "step": 1554 }, { "epoch": 0.13833904185756862, "grad_norm": 0.6999467611312866, "learning_rate": 1.9070283229971007e-05, "loss": 1.9213, "step": 1555 }, { "epoch": 0.13842800587162493, "grad_norm": 0.6858512163162231, "learning_rate": 1.9069105981380898e-05, "loss": 2.0182, "step": 1556 }, { "epoch": 0.13851696988568124, "grad_norm": 0.7477670907974243, "learning_rate": 1.906792802430426e-05, "loss": 2.2359, "step": 1557 }, { "epoch": 0.13860593389973755, "grad_norm": 0.7466025352478027, "learning_rate": 1.9066749358833117e-05, "loss": 2.1252, "step": 1558 }, { "epoch": 0.13869489791379386, "grad_norm": 0.7181077599525452, "learning_rate": 1.9065569985059542e-05, "loss": 2.1922, "step": 1559 }, { "epoch": 0.1387838619278502, "grad_norm": 0.7508081793785095, "learning_rate": 1.9064389903075676e-05, "loss": 2.2926, "step": 1560 }, { "epoch": 0.1388728259419065, "grad_norm": 0.7642948031425476, "learning_rate": 1.906320911297371e-05, "loss": 2.2114, "step": 1561 }, { "epoch": 0.1389617899559628, "grad_norm": 0.7403438687324524, "learning_rate": 1.9062027614845877e-05, "loss": 2.1091, "step": 1562 }, { "epoch": 0.13905075397001912, "grad_norm": 0.6996174454689026, "learning_rate": 1.9060845408784486e-05, "loss": 2.1427, "step": 1563 }, { "epoch": 0.13913971798407543, "grad_norm": 0.7214592695236206, "learning_rate": 1.905966249488189e-05, "loss": 2.2872, "step": 1564 }, { "epoch": 0.13922868199813176, "grad_norm": 0.7074094414710999, "learning_rate": 1.9058478873230494e-05, "loss": 2.1028, "step": 1565 }, { "epoch": 0.13931764601218807, "grad_norm": 0.7104836702346802, "learning_rate": 1.9057294543922768e-05, "loss": 1.9901, "step": 1566 }, { "epoch": 0.13940661002624438, "grad_norm": 0.7215651273727417, "learning_rate": 1.9056109507051236e-05, "loss": 2.089, "step": 1567 }, { "epoch": 0.1394955740403007, "grad_norm": 0.6993376612663269, "learning_rate": 1.9054923762708472e-05, "loss": 2.1071, "step": 1568 }, { "epoch": 0.139584538054357, "grad_norm": 0.7807263731956482, "learning_rate": 1.90537373109871e-05, "loss": 2.127, "step": 1569 }, { "epoch": 0.13967350206841334, "grad_norm": 0.816089928150177, "learning_rate": 1.905255015197982e-05, "loss": 2.2639, "step": 1570 }, { "epoch": 0.13976246608246964, "grad_norm": 0.7122036218643188, "learning_rate": 1.9051362285779363e-05, "loss": 2.2795, "step": 1571 }, { "epoch": 0.13985143009652595, "grad_norm": 0.7390177249908447, "learning_rate": 1.905017371247853e-05, "loss": 2.0848, "step": 1572 }, { "epoch": 0.13994039411058226, "grad_norm": 0.699128270149231, "learning_rate": 1.9048984432170175e-05, "loss": 2.1444, "step": 1573 }, { "epoch": 0.14002935812463857, "grad_norm": 0.7561851739883423, "learning_rate": 1.9047794444947204e-05, "loss": 2.0741, "step": 1574 }, { "epoch": 0.1401183221386949, "grad_norm": 0.7206645011901855, "learning_rate": 1.9046603750902578e-05, "loss": 2.0631, "step": 1575 }, { "epoch": 0.14020728615275121, "grad_norm": 0.7066382765769958, "learning_rate": 1.9045412350129314e-05, "loss": 2.0786, "step": 1576 }, { "epoch": 0.14029625016680752, "grad_norm": 0.7629712820053101, "learning_rate": 1.9044220242720494e-05, "loss": 2.2756, "step": 1577 }, { "epoch": 0.14038521418086383, "grad_norm": 0.730331301689148, "learning_rate": 1.904302742876924e-05, "loss": 2.1321, "step": 1578 }, { "epoch": 0.14047417819492017, "grad_norm": 0.7184867262840271, "learning_rate": 1.9041833908368736e-05, "loss": 2.1348, "step": 1579 }, { "epoch": 0.14056314220897648, "grad_norm": 0.7042089104652405, "learning_rate": 1.904063968161222e-05, "loss": 2.2112, "step": 1580 }, { "epoch": 0.14065210622303279, "grad_norm": 0.7240262627601624, "learning_rate": 1.9039444748592984e-05, "loss": 2.0821, "step": 1581 }, { "epoch": 0.1407410702370891, "grad_norm": 0.717921793460846, "learning_rate": 1.9038249109404386e-05, "loss": 2.2045, "step": 1582 }, { "epoch": 0.1408300342511454, "grad_norm": 0.692244291305542, "learning_rate": 1.903705276413982e-05, "loss": 2.1747, "step": 1583 }, { "epoch": 0.14091899826520174, "grad_norm": 0.7028403282165527, "learning_rate": 1.9035855712892753e-05, "loss": 1.9697, "step": 1584 }, { "epoch": 0.14100796227925805, "grad_norm": 0.6866932511329651, "learning_rate": 1.9034657955756695e-05, "loss": 2.1732, "step": 1585 }, { "epoch": 0.14109692629331436, "grad_norm": 0.7175632119178772, "learning_rate": 1.903345949282522e-05, "loss": 2.3072, "step": 1586 }, { "epoch": 0.14118589030737067, "grad_norm": 0.7163522839546204, "learning_rate": 1.903226032419195e-05, "loss": 2.2625, "step": 1587 }, { "epoch": 0.14127485432142697, "grad_norm": 0.7228026986122131, "learning_rate": 1.9031060449950568e-05, "loss": 2.1505, "step": 1588 }, { "epoch": 0.1413638183354833, "grad_norm": 0.7363160252571106, "learning_rate": 1.9029859870194806e-05, "loss": 2.2136, "step": 1589 }, { "epoch": 0.14145278234953962, "grad_norm": 0.7044832706451416, "learning_rate": 1.9028658585018455e-05, "loss": 2.1166, "step": 1590 }, { "epoch": 0.14154174636359593, "grad_norm": 0.7056241035461426, "learning_rate": 1.902745659451536e-05, "loss": 1.944, "step": 1591 }, { "epoch": 0.14163071037765224, "grad_norm": 0.687555730342865, "learning_rate": 1.9026253898779426e-05, "loss": 2.1491, "step": 1592 }, { "epoch": 0.14171967439170854, "grad_norm": 0.727637767791748, "learning_rate": 1.90250504979046e-05, "loss": 2.0347, "step": 1593 }, { "epoch": 0.14180863840576488, "grad_norm": 0.71175217628479, "learning_rate": 1.9023846391984905e-05, "loss": 2.1626, "step": 1594 }, { "epoch": 0.1418976024198212, "grad_norm": 0.7223698496818542, "learning_rate": 1.9022641581114392e-05, "loss": 1.9723, "step": 1595 }, { "epoch": 0.1419865664338775, "grad_norm": 0.7376037240028381, "learning_rate": 1.9021436065387195e-05, "loss": 2.0388, "step": 1596 }, { "epoch": 0.1420755304479338, "grad_norm": 0.6674297451972961, "learning_rate": 1.9020229844897483e-05, "loss": 1.9178, "step": 1597 }, { "epoch": 0.14216449446199012, "grad_norm": 0.6752891540527344, "learning_rate": 1.9019022919739486e-05, "loss": 1.9869, "step": 1598 }, { "epoch": 0.14225345847604645, "grad_norm": 0.8238239288330078, "learning_rate": 1.9017815290007497e-05, "loss": 2.2705, "step": 1599 }, { "epoch": 0.14234242249010276, "grad_norm": 0.7054691910743713, "learning_rate": 1.901660695579585e-05, "loss": 2.127, "step": 1600 }, { "epoch": 0.14243138650415907, "grad_norm": 0.723223090171814, "learning_rate": 1.9015397917198947e-05, "loss": 2.1819, "step": 1601 }, { "epoch": 0.14252035051821538, "grad_norm": 0.7419965267181396, "learning_rate": 1.901418817431123e-05, "loss": 2.0055, "step": 1602 }, { "epoch": 0.14260931453227169, "grad_norm": 0.7081524133682251, "learning_rate": 1.9012977727227214e-05, "loss": 2.0034, "step": 1603 }, { "epoch": 0.14269827854632802, "grad_norm": 0.7017990350723267, "learning_rate": 1.901176657604146e-05, "loss": 2.2058, "step": 1604 }, { "epoch": 0.14278724256038433, "grad_norm": 0.7065199613571167, "learning_rate": 1.901055472084858e-05, "loss": 2.0947, "step": 1605 }, { "epoch": 0.14287620657444064, "grad_norm": 0.7354803681373596, "learning_rate": 1.9009342161743248e-05, "loss": 2.0925, "step": 1606 }, { "epoch": 0.14296517058849695, "grad_norm": 0.7831696271896362, "learning_rate": 1.9008128898820188e-05, "loss": 1.9525, "step": 1607 }, { "epoch": 0.14305413460255326, "grad_norm": 0.6968514919281006, "learning_rate": 1.900691493217418e-05, "loss": 1.9783, "step": 1608 }, { "epoch": 0.1431430986166096, "grad_norm": 0.741797924041748, "learning_rate": 1.9005700261900063e-05, "loss": 2.2641, "step": 1609 }, { "epoch": 0.1432320626306659, "grad_norm": 0.7444104552268982, "learning_rate": 1.900448488809273e-05, "loss": 2.0563, "step": 1610 }, { "epoch": 0.1433210266447222, "grad_norm": 0.6823083758354187, "learning_rate": 1.900326881084712e-05, "loss": 2.05, "step": 1611 }, { "epoch": 0.14340999065877852, "grad_norm": 0.7570714950561523, "learning_rate": 1.9002052030258244e-05, "loss": 1.9717, "step": 1612 }, { "epoch": 0.14349895467283483, "grad_norm": 0.9097091555595398, "learning_rate": 1.900083454642115e-05, "loss": 2.0523, "step": 1613 }, { "epoch": 0.14358791868689116, "grad_norm": 0.7547870874404907, "learning_rate": 1.899961635943095e-05, "loss": 2.1613, "step": 1614 }, { "epoch": 0.14367688270094747, "grad_norm": 0.7002084851264954, "learning_rate": 1.8998397469382812e-05, "loss": 2.093, "step": 1615 }, { "epoch": 0.14376584671500378, "grad_norm": 0.7612960934638977, "learning_rate": 1.8997177876371958e-05, "loss": 2.1898, "step": 1616 }, { "epoch": 0.1438548107290601, "grad_norm": 0.7901351451873779, "learning_rate": 1.899595758049366e-05, "loss": 2.1686, "step": 1617 }, { "epoch": 0.1439437747431164, "grad_norm": 0.7260928153991699, "learning_rate": 1.899473658184325e-05, "loss": 2.08, "step": 1618 }, { "epoch": 0.14403273875717273, "grad_norm": 0.7595137357711792, "learning_rate": 1.8993514880516114e-05, "loss": 2.1567, "step": 1619 }, { "epoch": 0.14412170277122904, "grad_norm": 0.6946915984153748, "learning_rate": 1.899229247660769e-05, "loss": 2.1557, "step": 1620 }, { "epoch": 0.14421066678528535, "grad_norm": 0.8131952285766602, "learning_rate": 1.8991069370213477e-05, "loss": 2.042, "step": 1621 }, { "epoch": 0.14429963079934166, "grad_norm": 0.7136657238006592, "learning_rate": 1.8989845561429025e-05, "loss": 2.1521, "step": 1622 }, { "epoch": 0.14438859481339797, "grad_norm": 0.694391667842865, "learning_rate": 1.8988621050349936e-05, "loss": 2.1343, "step": 1623 }, { "epoch": 0.1444775588274543, "grad_norm": 0.8052846789360046, "learning_rate": 1.898739583707187e-05, "loss": 2.0542, "step": 1624 }, { "epoch": 0.14456652284151061, "grad_norm": 0.7174726128578186, "learning_rate": 1.8986169921690546e-05, "loss": 2.0926, "step": 1625 }, { "epoch": 0.14465548685556692, "grad_norm": 0.7309580445289612, "learning_rate": 1.898494330430173e-05, "loss": 2.1391, "step": 1626 }, { "epoch": 0.14474445086962323, "grad_norm": 0.7136374711990356, "learning_rate": 1.8983715985001245e-05, "loss": 1.9662, "step": 1627 }, { "epoch": 0.14483341488367954, "grad_norm": 0.7026463150978088, "learning_rate": 1.8982487963884975e-05, "loss": 2.082, "step": 1628 }, { "epoch": 0.14492237889773588, "grad_norm": 0.7408065795898438, "learning_rate": 1.898125924104885e-05, "loss": 2.0902, "step": 1629 }, { "epoch": 0.14501134291179218, "grad_norm": 0.6933066248893738, "learning_rate": 1.898002981658886e-05, "loss": 2.1383, "step": 1630 }, { "epoch": 0.1451003069258485, "grad_norm": 0.7678527235984802, "learning_rate": 1.897879969060105e-05, "loss": 2.0613, "step": 1631 }, { "epoch": 0.1451892709399048, "grad_norm": 0.7678179740905762, "learning_rate": 1.8977568863181517e-05, "loss": 2.2704, "step": 1632 }, { "epoch": 0.1452782349539611, "grad_norm": 0.7525485157966614, "learning_rate": 1.8976337334426417e-05, "loss": 2.1422, "step": 1633 }, { "epoch": 0.14536719896801745, "grad_norm": 0.8016729354858398, "learning_rate": 1.8975105104431953e-05, "loss": 2.1039, "step": 1634 }, { "epoch": 0.14545616298207376, "grad_norm": 0.7251212000846863, "learning_rate": 1.8973872173294394e-05, "loss": 2.2837, "step": 1635 }, { "epoch": 0.14554512699613006, "grad_norm": 0.7730448246002197, "learning_rate": 1.8972638541110053e-05, "loss": 2.0775, "step": 1636 }, { "epoch": 0.14563409101018637, "grad_norm": 0.6937607526779175, "learning_rate": 1.8971404207975303e-05, "loss": 2.0895, "step": 1637 }, { "epoch": 0.14572305502424268, "grad_norm": 0.7883414030075073, "learning_rate": 1.8970169173986573e-05, "loss": 2.1418, "step": 1638 }, { "epoch": 0.14581201903829902, "grad_norm": 0.734897255897522, "learning_rate": 1.8968933439240347e-05, "loss": 2.1223, "step": 1639 }, { "epoch": 0.14590098305235533, "grad_norm": 0.7466630935668945, "learning_rate": 1.8967697003833156e-05, "loss": 1.9625, "step": 1640 }, { "epoch": 0.14598994706641163, "grad_norm": 0.7184388637542725, "learning_rate": 1.8966459867861596e-05, "loss": 2.2619, "step": 1641 }, { "epoch": 0.14607891108046794, "grad_norm": 0.8266288042068481, "learning_rate": 1.896522203142231e-05, "loss": 2.2181, "step": 1642 }, { "epoch": 0.14616787509452425, "grad_norm": 0.6852174401283264, "learning_rate": 1.8963983494611998e-05, "loss": 2.1574, "step": 1643 }, { "epoch": 0.1462568391085806, "grad_norm": 0.7667473554611206, "learning_rate": 1.8962744257527423e-05, "loss": 2.1507, "step": 1644 }, { "epoch": 0.1463458031226369, "grad_norm": 0.7464703917503357, "learning_rate": 1.8961504320265386e-05, "loss": 2.0421, "step": 1645 }, { "epoch": 0.1464347671366932, "grad_norm": 0.772483766078949, "learning_rate": 1.896026368292276e-05, "loss": 2.2768, "step": 1646 }, { "epoch": 0.14652373115074951, "grad_norm": 0.7036101222038269, "learning_rate": 1.8959022345596455e-05, "loss": 1.9817, "step": 1647 }, { "epoch": 0.14661269516480582, "grad_norm": 0.6918807625770569, "learning_rate": 1.8957780308383458e-05, "loss": 2.1934, "step": 1648 }, { "epoch": 0.14670165917886216, "grad_norm": 0.732772707939148, "learning_rate": 1.8956537571380788e-05, "loss": 2.0385, "step": 1649 }, { "epoch": 0.14679062319291847, "grad_norm": 0.7011646628379822, "learning_rate": 1.895529413468553e-05, "loss": 2.1243, "step": 1650 }, { "epoch": 0.14687958720697478, "grad_norm": 0.7015833854675293, "learning_rate": 1.895404999839483e-05, "loss": 2.0698, "step": 1651 }, { "epoch": 0.14696855122103109, "grad_norm": 0.6898995041847229, "learning_rate": 1.895280516260587e-05, "loss": 2.0514, "step": 1652 }, { "epoch": 0.1470575152350874, "grad_norm": 0.7405246496200562, "learning_rate": 1.8951559627415906e-05, "loss": 2.0706, "step": 1653 }, { "epoch": 0.14714647924914373, "grad_norm": 0.6931696534156799, "learning_rate": 1.8950313392922236e-05, "loss": 2.0784, "step": 1654 }, { "epoch": 0.14723544326320004, "grad_norm": 0.6846954226493835, "learning_rate": 1.8949066459222217e-05, "loss": 2.1389, "step": 1655 }, { "epoch": 0.14732440727725635, "grad_norm": 0.7054144144058228, "learning_rate": 1.8947818826413266e-05, "loss": 2.021, "step": 1656 }, { "epoch": 0.14741337129131266, "grad_norm": 0.7543538808822632, "learning_rate": 1.8946570494592838e-05, "loss": 2.1464, "step": 1657 }, { "epoch": 0.14750233530536896, "grad_norm": 0.7679184675216675, "learning_rate": 1.8945321463858468e-05, "loss": 1.9925, "step": 1658 }, { "epoch": 0.1475912993194253, "grad_norm": 0.7037811279296875, "learning_rate": 1.894407173430772e-05, "loss": 2.2312, "step": 1659 }, { "epoch": 0.1476802633334816, "grad_norm": 0.6890579462051392, "learning_rate": 1.894282130603823e-05, "loss": 2.1696, "step": 1660 }, { "epoch": 0.14776922734753792, "grad_norm": 0.7776521444320679, "learning_rate": 1.8941570179147678e-05, "loss": 2.2939, "step": 1661 }, { "epoch": 0.14785819136159423, "grad_norm": 0.7288386225700378, "learning_rate": 1.8940318353733808e-05, "loss": 2.1992, "step": 1662 }, { "epoch": 0.14794715537565056, "grad_norm": 0.7131479978561401, "learning_rate": 1.8939065829894412e-05, "loss": 2.1939, "step": 1663 }, { "epoch": 0.14803611938970687, "grad_norm": 0.697925329208374, "learning_rate": 1.893781260772734e-05, "loss": 2.1373, "step": 1664 }, { "epoch": 0.14812508340376318, "grad_norm": 0.7242127060890198, "learning_rate": 1.893655868733049e-05, "loss": 2.4302, "step": 1665 }, { "epoch": 0.1482140474178195, "grad_norm": 0.7666712999343872, "learning_rate": 1.893530406880182e-05, "loss": 2.0462, "step": 1666 }, { "epoch": 0.1483030114318758, "grad_norm": 0.780622124671936, "learning_rate": 1.8934048752239345e-05, "loss": 2.2201, "step": 1667 }, { "epoch": 0.14839197544593213, "grad_norm": 0.7040262222290039, "learning_rate": 1.893279273774113e-05, "loss": 2.0862, "step": 1668 }, { "epoch": 0.14848093945998844, "grad_norm": 0.7368348836898804, "learning_rate": 1.8931536025405298e-05, "loss": 2.2721, "step": 1669 }, { "epoch": 0.14856990347404475, "grad_norm": 0.7601885795593262, "learning_rate": 1.893027861533002e-05, "loss": 2.0413, "step": 1670 }, { "epoch": 0.14865886748810106, "grad_norm": 0.6626439094543457, "learning_rate": 1.8929020507613533e-05, "loss": 1.9587, "step": 1671 }, { "epoch": 0.14874783150215737, "grad_norm": 0.6983657479286194, "learning_rate": 1.892776170235411e-05, "loss": 1.9111, "step": 1672 }, { "epoch": 0.1488367955162137, "grad_norm": 0.7465906739234924, "learning_rate": 1.8926502199650105e-05, "loss": 1.9884, "step": 1673 }, { "epoch": 0.14892575953027, "grad_norm": 0.6806813478469849, "learning_rate": 1.89252419995999e-05, "loss": 2.1236, "step": 1674 }, { "epoch": 0.14901472354432632, "grad_norm": 0.70137619972229, "learning_rate": 1.8923981102301944e-05, "loss": 2.0311, "step": 1675 }, { "epoch": 0.14910368755838263, "grad_norm": 0.7313404083251953, "learning_rate": 1.8922719507854748e-05, "loss": 2.1564, "step": 1676 }, { "epoch": 0.14919265157243894, "grad_norm": 0.703441321849823, "learning_rate": 1.8921457216356857e-05, "loss": 2.2297, "step": 1677 }, { "epoch": 0.14928161558649528, "grad_norm": 0.7462134957313538, "learning_rate": 1.8920194227906892e-05, "loss": 2.115, "step": 1678 }, { "epoch": 0.14937057960055158, "grad_norm": 0.7192675471305847, "learning_rate": 1.8918930542603514e-05, "loss": 2.2432, "step": 1679 }, { "epoch": 0.1494595436146079, "grad_norm": 0.7085431218147278, "learning_rate": 1.8917666160545446e-05, "loss": 2.3091, "step": 1680 }, { "epoch": 0.1495485076286642, "grad_norm": 0.718258261680603, "learning_rate": 1.8916401081831453e-05, "loss": 2.1966, "step": 1681 }, { "epoch": 0.1496374716427205, "grad_norm": 0.7071862816810608, "learning_rate": 1.891513530656038e-05, "loss": 2.171, "step": 1682 }, { "epoch": 0.14972643565677685, "grad_norm": 0.7258570194244385, "learning_rate": 1.89138688348311e-05, "loss": 2.1344, "step": 1683 }, { "epoch": 0.14981539967083315, "grad_norm": 0.7308819890022278, "learning_rate": 1.8912601666742552e-05, "loss": 2.1188, "step": 1684 }, { "epoch": 0.14990436368488946, "grad_norm": 0.7914236783981323, "learning_rate": 1.891133380239373e-05, "loss": 2.1013, "step": 1685 }, { "epoch": 0.14999332769894577, "grad_norm": 0.792649507522583, "learning_rate": 1.891006524188368e-05, "loss": 2.1814, "step": 1686 }, { "epoch": 0.15008229171300208, "grad_norm": 0.777113139629364, "learning_rate": 1.8908795985311503e-05, "loss": 1.9792, "step": 1687 }, { "epoch": 0.15017125572705842, "grad_norm": 0.7165171504020691, "learning_rate": 1.8907526032776356e-05, "loss": 2.16, "step": 1688 }, { "epoch": 0.15026021974111473, "grad_norm": 0.8429832458496094, "learning_rate": 1.890625538437745e-05, "loss": 2.1278, "step": 1689 }, { "epoch": 0.15034918375517103, "grad_norm": 0.7465900778770447, "learning_rate": 1.890498404021404e-05, "loss": 2.1063, "step": 1690 }, { "epoch": 0.15043814776922734, "grad_norm": 0.7323991060256958, "learning_rate": 1.8903712000385454e-05, "loss": 2.0112, "step": 1691 }, { "epoch": 0.15052711178328365, "grad_norm": 0.7859575748443604, "learning_rate": 1.8902439264991063e-05, "loss": 2.0312, "step": 1692 }, { "epoch": 0.15061607579734, "grad_norm": 0.7432184219360352, "learning_rate": 1.890116583413029e-05, "loss": 1.9236, "step": 1693 }, { "epoch": 0.1507050398113963, "grad_norm": 0.753281831741333, "learning_rate": 1.8899891707902623e-05, "loss": 2.0403, "step": 1694 }, { "epoch": 0.1507940038254526, "grad_norm": 0.7935951948165894, "learning_rate": 1.8898616886407595e-05, "loss": 2.0814, "step": 1695 }, { "epoch": 0.1508829678395089, "grad_norm": 0.7128658890724182, "learning_rate": 1.8897341369744794e-05, "loss": 1.9356, "step": 1696 }, { "epoch": 0.15097193185356522, "grad_norm": 0.7423327565193176, "learning_rate": 1.8896065158013865e-05, "loss": 2.0001, "step": 1697 }, { "epoch": 0.15106089586762156, "grad_norm": 0.7703441381454468, "learning_rate": 1.8894788251314507e-05, "loss": 1.8965, "step": 1698 }, { "epoch": 0.15114985988167787, "grad_norm": 0.7241037487983704, "learning_rate": 1.889351064974648e-05, "loss": 2.1048, "step": 1699 }, { "epoch": 0.15123882389573418, "grad_norm": 0.7374036908149719, "learning_rate": 1.8892232353409582e-05, "loss": 2.1671, "step": 1700 }, { "epoch": 0.15132778790979048, "grad_norm": 0.7733006477355957, "learning_rate": 1.8890953362403677e-05, "loss": 2.1208, "step": 1701 }, { "epoch": 0.1514167519238468, "grad_norm": 0.7187023758888245, "learning_rate": 1.8889673676828682e-05, "loss": 2.1337, "step": 1702 }, { "epoch": 0.15150571593790313, "grad_norm": 0.7157388925552368, "learning_rate": 1.888839329678457e-05, "loss": 2.119, "step": 1703 }, { "epoch": 0.15159467995195944, "grad_norm": 0.713214635848999, "learning_rate": 1.8887112222371363e-05, "loss": 2.2152, "step": 1704 }, { "epoch": 0.15168364396601575, "grad_norm": 0.7415915131568909, "learning_rate": 1.888583045368914e-05, "loss": 2.0911, "step": 1705 }, { "epoch": 0.15177260798007206, "grad_norm": 0.721565306186676, "learning_rate": 1.8884547990838027e-05, "loss": 1.976, "step": 1706 }, { "epoch": 0.15186157199412836, "grad_norm": 0.7319061160087585, "learning_rate": 1.8883264833918222e-05, "loss": 2.0447, "step": 1707 }, { "epoch": 0.1519505360081847, "grad_norm": 0.7613155245780945, "learning_rate": 1.888198098302996e-05, "loss": 2.0663, "step": 1708 }, { "epoch": 0.152039500022241, "grad_norm": 0.7512582540512085, "learning_rate": 1.888069643827354e-05, "loss": 2.2097, "step": 1709 }, { "epoch": 0.15212846403629732, "grad_norm": 0.6775617599487305, "learning_rate": 1.8879411199749306e-05, "loss": 2.0195, "step": 1710 }, { "epoch": 0.15221742805035363, "grad_norm": 0.7454359531402588, "learning_rate": 1.887812526755767e-05, "loss": 2.0821, "step": 1711 }, { "epoch": 0.15230639206440993, "grad_norm": 0.717721164226532, "learning_rate": 1.8876838641799083e-05, "loss": 2.0334, "step": 1712 }, { "epoch": 0.15239535607846627, "grad_norm": 0.9763962030410767, "learning_rate": 1.8875551322574065e-05, "loss": 2.0207, "step": 1713 }, { "epoch": 0.15248432009252258, "grad_norm": 0.7153776288032532, "learning_rate": 1.8874263309983175e-05, "loss": 2.0679, "step": 1714 }, { "epoch": 0.1525732841065789, "grad_norm": 0.6894409656524658, "learning_rate": 1.8872974604127035e-05, "loss": 2.2202, "step": 1715 }, { "epoch": 0.1526622481206352, "grad_norm": 0.7402191758155823, "learning_rate": 1.887168520510632e-05, "loss": 2.1805, "step": 1716 }, { "epoch": 0.1527512121346915, "grad_norm": 0.7610843181610107, "learning_rate": 1.8870395113021766e-05, "loss": 2.2017, "step": 1717 }, { "epoch": 0.15284017614874784, "grad_norm": 0.7854196429252625, "learning_rate": 1.8869104327974145e-05, "loss": 2.0792, "step": 1718 }, { "epoch": 0.15292914016280415, "grad_norm": 0.7620598673820496, "learning_rate": 1.88678128500643e-05, "loss": 2.2147, "step": 1719 }, { "epoch": 0.15301810417686046, "grad_norm": 0.8224397897720337, "learning_rate": 1.8866520679393127e-05, "loss": 2.0944, "step": 1720 }, { "epoch": 0.15310706819091677, "grad_norm": 0.7002196311950684, "learning_rate": 1.8865227816061568e-05, "loss": 2.0543, "step": 1721 }, { "epoch": 0.15319603220497308, "grad_norm": 0.7724828720092773, "learning_rate": 1.8863934260170612e-05, "loss": 2.0252, "step": 1722 }, { "epoch": 0.1532849962190294, "grad_norm": 0.7003974318504333, "learning_rate": 1.8862640011821328e-05, "loss": 2.0419, "step": 1723 }, { "epoch": 0.15337396023308572, "grad_norm": 0.7713096141815186, "learning_rate": 1.886134507111482e-05, "loss": 2.0754, "step": 1724 }, { "epoch": 0.15346292424714203, "grad_norm": 0.7424819469451904, "learning_rate": 1.8860049438152247e-05, "loss": 2.222, "step": 1725 }, { "epoch": 0.15355188826119834, "grad_norm": 0.6938824653625488, "learning_rate": 1.8858753113034823e-05, "loss": 2.1414, "step": 1726 }, { "epoch": 0.15364085227525465, "grad_norm": 0.705534040927887, "learning_rate": 1.885745609586382e-05, "loss": 2.1389, "step": 1727 }, { "epoch": 0.15372981628931098, "grad_norm": 0.7659321427345276, "learning_rate": 1.8856158386740566e-05, "loss": 2.0364, "step": 1728 }, { "epoch": 0.1538187803033673, "grad_norm": 0.7210482358932495, "learning_rate": 1.8854859985766433e-05, "loss": 2.1499, "step": 1729 }, { "epoch": 0.1539077443174236, "grad_norm": 0.726554811000824, "learning_rate": 1.8853560893042858e-05, "loss": 1.8068, "step": 1730 }, { "epoch": 0.1539967083314799, "grad_norm": 0.6968569755554199, "learning_rate": 1.8852261108671324e-05, "loss": 2.1237, "step": 1731 }, { "epoch": 0.15408567234553622, "grad_norm": 0.8003910183906555, "learning_rate": 1.8850960632753375e-05, "loss": 2.0449, "step": 1732 }, { "epoch": 0.15417463635959255, "grad_norm": 0.6938436627388, "learning_rate": 1.8849659465390602e-05, "loss": 2.0797, "step": 1733 }, { "epoch": 0.15426360037364886, "grad_norm": 0.7101855278015137, "learning_rate": 1.8848357606684655e-05, "loss": 2.1846, "step": 1734 }, { "epoch": 0.15435256438770517, "grad_norm": 0.7504428625106812, "learning_rate": 1.8847055056737236e-05, "loss": 2.11, "step": 1735 }, { "epoch": 0.15444152840176148, "grad_norm": 0.7059917449951172, "learning_rate": 1.8845751815650103e-05, "loss": 1.9762, "step": 1736 }, { "epoch": 0.1545304924158178, "grad_norm": 0.6485599875450134, "learning_rate": 1.8844447883525062e-05, "loss": 1.9545, "step": 1737 }, { "epoch": 0.15461945642987412, "grad_norm": 0.7002295851707458, "learning_rate": 1.884314326046398e-05, "loss": 2.0011, "step": 1738 }, { "epoch": 0.15470842044393043, "grad_norm": 0.7534664869308472, "learning_rate": 1.8841837946568776e-05, "loss": 1.9783, "step": 1739 }, { "epoch": 0.15479738445798674, "grad_norm": 0.7672422528266907, "learning_rate": 1.884053194194142e-05, "loss": 1.9927, "step": 1740 }, { "epoch": 0.15488634847204305, "grad_norm": 0.6991240978240967, "learning_rate": 1.8839225246683942e-05, "loss": 2.2359, "step": 1741 }, { "epoch": 0.15497531248609936, "grad_norm": 0.7343803644180298, "learning_rate": 1.8837917860898417e-05, "loss": 2.0843, "step": 1742 }, { "epoch": 0.1550642765001557, "grad_norm": 0.7663724422454834, "learning_rate": 1.8836609784686985e-05, "loss": 2.1153, "step": 1743 }, { "epoch": 0.155153240514212, "grad_norm": 0.7693855166435242, "learning_rate": 1.883530101815183e-05, "loss": 2.1338, "step": 1744 }, { "epoch": 0.1552422045282683, "grad_norm": 0.7001520395278931, "learning_rate": 1.8833991561395194e-05, "loss": 2.1277, "step": 1745 }, { "epoch": 0.15533116854232462, "grad_norm": 0.7208811640739441, "learning_rate": 1.8832681414519376e-05, "loss": 2.1053, "step": 1746 }, { "epoch": 0.15542013255638096, "grad_norm": 0.7156041860580444, "learning_rate": 1.883137057762672e-05, "loss": 2.0689, "step": 1747 }, { "epoch": 0.15550909657043727, "grad_norm": 0.7706544995307922, "learning_rate": 1.883005905081964e-05, "loss": 2.1309, "step": 1748 }, { "epoch": 0.15559806058449357, "grad_norm": 0.7086490988731384, "learning_rate": 1.8828746834200582e-05, "loss": 2.0511, "step": 1749 }, { "epoch": 0.15568702459854988, "grad_norm": 0.7971568703651428, "learning_rate": 1.8827433927872066e-05, "loss": 2.1786, "step": 1750 }, { "epoch": 0.1557759886126062, "grad_norm": 0.8119637966156006, "learning_rate": 1.882612033193665e-05, "loss": 2.0407, "step": 1751 }, { "epoch": 0.15586495262666253, "grad_norm": 0.6752327084541321, "learning_rate": 1.882480604649696e-05, "loss": 2.0103, "step": 1752 }, { "epoch": 0.15595391664071884, "grad_norm": 0.7700290679931641, "learning_rate": 1.8823491071655663e-05, "loss": 2.0433, "step": 1753 }, { "epoch": 0.15604288065477515, "grad_norm": 0.7152250409126282, "learning_rate": 1.8822175407515492e-05, "loss": 2.1931, "step": 1754 }, { "epoch": 0.15613184466883145, "grad_norm": 0.7262698411941528, "learning_rate": 1.8820859054179225e-05, "loss": 1.9912, "step": 1755 }, { "epoch": 0.15622080868288776, "grad_norm": 0.7028684616088867, "learning_rate": 1.8819542011749702e-05, "loss": 2.0953, "step": 1756 }, { "epoch": 0.1563097726969441, "grad_norm": 0.7653725743293762, "learning_rate": 1.88182242803298e-05, "loss": 2.0136, "step": 1757 }, { "epoch": 0.1563987367110004, "grad_norm": 0.7374569177627563, "learning_rate": 1.8816905860022468e-05, "loss": 1.966, "step": 1758 }, { "epoch": 0.15648770072505672, "grad_norm": 0.7701684832572937, "learning_rate": 1.8815586750930705e-05, "loss": 2.0522, "step": 1759 }, { "epoch": 0.15657666473911303, "grad_norm": 0.7594572901725769, "learning_rate": 1.8814266953157557e-05, "loss": 2.0601, "step": 1760 }, { "epoch": 0.15666562875316933, "grad_norm": 0.7379246354103088, "learning_rate": 1.881294646680613e-05, "loss": 2.0749, "step": 1761 }, { "epoch": 0.15675459276722567, "grad_norm": 0.7488964200019836, "learning_rate": 1.8811625291979575e-05, "loss": 2.0425, "step": 1762 }, { "epoch": 0.15684355678128198, "grad_norm": 0.8486132621765137, "learning_rate": 1.8810303428781112e-05, "loss": 2.0423, "step": 1763 }, { "epoch": 0.1569325207953383, "grad_norm": 0.7446539402008057, "learning_rate": 1.8808980877314003e-05, "loss": 2.1037, "step": 1764 }, { "epoch": 0.1570214848093946, "grad_norm": 0.7896617650985718, "learning_rate": 1.8807657637681567e-05, "loss": 2.0395, "step": 1765 }, { "epoch": 0.1571104488234509, "grad_norm": 0.7322534918785095, "learning_rate": 1.8806333709987177e-05, "loss": 2.2084, "step": 1766 }, { "epoch": 0.15719941283750724, "grad_norm": 0.7449268698692322, "learning_rate": 1.8805009094334258e-05, "loss": 2.1782, "step": 1767 }, { "epoch": 0.15728837685156355, "grad_norm": 0.7084174156188965, "learning_rate": 1.8803683790826288e-05, "loss": 2.072, "step": 1768 }, { "epoch": 0.15737734086561986, "grad_norm": 0.769329309463501, "learning_rate": 1.8802357799566807e-05, "loss": 2.0874, "step": 1769 }, { "epoch": 0.15746630487967617, "grad_norm": 0.7894322872161865, "learning_rate": 1.8801031120659396e-05, "loss": 2.0483, "step": 1770 }, { "epoch": 0.15755526889373248, "grad_norm": 0.7232023477554321, "learning_rate": 1.8799703754207705e-05, "loss": 2.1506, "step": 1771 }, { "epoch": 0.1576442329077888, "grad_norm": 0.7480355501174927, "learning_rate": 1.8798375700315417e-05, "loss": 2.1665, "step": 1772 }, { "epoch": 0.15773319692184512, "grad_norm": 0.7537410855293274, "learning_rate": 1.8797046959086288e-05, "loss": 2.1898, "step": 1773 }, { "epoch": 0.15782216093590143, "grad_norm": 0.7657075524330139, "learning_rate": 1.8795717530624125e-05, "loss": 2.1121, "step": 1774 }, { "epoch": 0.15791112494995774, "grad_norm": 0.7194181084632874, "learning_rate": 1.8794387415032777e-05, "loss": 2.0995, "step": 1775 }, { "epoch": 0.15800008896401405, "grad_norm": 0.8271670341491699, "learning_rate": 1.8793056612416155e-05, "loss": 2.1438, "step": 1776 }, { "epoch": 0.15808905297807038, "grad_norm": 0.7701734304428101, "learning_rate": 1.8791725122878223e-05, "loss": 2.0079, "step": 1777 }, { "epoch": 0.1581780169921267, "grad_norm": 0.7514343857765198, "learning_rate": 1.8790392946522995e-05, "loss": 2.0762, "step": 1778 }, { "epoch": 0.158266981006183, "grad_norm": 0.7932415008544922, "learning_rate": 1.8789060083454548e-05, "loss": 2.003, "step": 1779 }, { "epoch": 0.1583559450202393, "grad_norm": 0.9084933996200562, "learning_rate": 1.8787726533777003e-05, "loss": 1.9776, "step": 1780 }, { "epoch": 0.15844490903429562, "grad_norm": 0.7867962121963501, "learning_rate": 1.878639229759454e-05, "loss": 2.1112, "step": 1781 }, { "epoch": 0.15853387304835195, "grad_norm": 0.7356816530227661, "learning_rate": 1.878505737501139e-05, "loss": 2.0141, "step": 1782 }, { "epoch": 0.15862283706240826, "grad_norm": 0.7537601590156555, "learning_rate": 1.8783721766131836e-05, "loss": 2.0923, "step": 1783 }, { "epoch": 0.15871180107646457, "grad_norm": 0.7181324362754822, "learning_rate": 1.8782385471060217e-05, "loss": 2.1452, "step": 1784 }, { "epoch": 0.15880076509052088, "grad_norm": 0.727722704410553, "learning_rate": 1.878104848990093e-05, "loss": 2.0917, "step": 1785 }, { "epoch": 0.1588897291045772, "grad_norm": 0.743998110294342, "learning_rate": 1.8779710822758416e-05, "loss": 2.0438, "step": 1786 }, { "epoch": 0.15897869311863352, "grad_norm": 0.7641489505767822, "learning_rate": 1.8778372469737177e-05, "loss": 2.236, "step": 1787 }, { "epoch": 0.15906765713268983, "grad_norm": 0.7348562479019165, "learning_rate": 1.8777033430941768e-05, "loss": 1.9987, "step": 1788 }, { "epoch": 0.15915662114674614, "grad_norm": 0.779440701007843, "learning_rate": 1.8775693706476793e-05, "loss": 1.918, "step": 1789 }, { "epoch": 0.15924558516080245, "grad_norm": 0.8021536469459534, "learning_rate": 1.8774353296446914e-05, "loss": 2.0355, "step": 1790 }, { "epoch": 0.15933454917485876, "grad_norm": 0.7122755646705627, "learning_rate": 1.8773012200956843e-05, "loss": 2.1684, "step": 1791 }, { "epoch": 0.1594235131889151, "grad_norm": 0.759109377861023, "learning_rate": 1.8771670420111354e-05, "loss": 1.9955, "step": 1792 }, { "epoch": 0.1595124772029714, "grad_norm": 0.7963847517967224, "learning_rate": 1.8770327954015258e-05, "loss": 2.1885, "step": 1793 }, { "epoch": 0.1596014412170277, "grad_norm": 0.7539779543876648, "learning_rate": 1.8768984802773435e-05, "loss": 2.0795, "step": 1794 }, { "epoch": 0.15969040523108402, "grad_norm": 0.7149888277053833, "learning_rate": 1.8767640966490816e-05, "loss": 1.9932, "step": 1795 }, { "epoch": 0.15977936924514033, "grad_norm": 0.7326191663742065, "learning_rate": 1.876629644527238e-05, "loss": 2.0157, "step": 1796 }, { "epoch": 0.15986833325919667, "grad_norm": 0.743001401424408, "learning_rate": 1.8764951239223158e-05, "loss": 2.2578, "step": 1797 }, { "epoch": 0.15995729727325297, "grad_norm": 0.7044163942337036, "learning_rate": 1.8763605348448244e-05, "loss": 2.119, "step": 1798 }, { "epoch": 0.16004626128730928, "grad_norm": 0.7616197466850281, "learning_rate": 1.876225877305278e-05, "loss": 2.0961, "step": 1799 }, { "epoch": 0.1601352253013656, "grad_norm": 0.7645540833473206, "learning_rate": 1.876091151314196e-05, "loss": 2.1012, "step": 1800 }, { "epoch": 0.1602241893154219, "grad_norm": 0.8000994920730591, "learning_rate": 1.8759563568821037e-05, "loss": 2.0713, "step": 1801 }, { "epoch": 0.16031315332947824, "grad_norm": 0.734710156917572, "learning_rate": 1.8758214940195307e-05, "loss": 2.1271, "step": 1802 }, { "epoch": 0.16040211734353454, "grad_norm": 0.7325761914253235, "learning_rate": 1.8756865627370132e-05, "loss": 2.1531, "step": 1803 }, { "epoch": 0.16049108135759085, "grad_norm": 0.7637733221054077, "learning_rate": 1.8755515630450913e-05, "loss": 2.1232, "step": 1804 }, { "epoch": 0.16058004537164716, "grad_norm": 0.8559128046035767, "learning_rate": 1.8754164949543123e-05, "loss": 2.2097, "step": 1805 }, { "epoch": 0.16066900938570347, "grad_norm": 0.7366340756416321, "learning_rate": 1.875281358475227e-05, "loss": 2.1654, "step": 1806 }, { "epoch": 0.1607579733997598, "grad_norm": 0.7143263220787048, "learning_rate": 1.8751461536183933e-05, "loss": 2.2115, "step": 1807 }, { "epoch": 0.16084693741381612, "grad_norm": 0.8203288316726685, "learning_rate": 1.8750108803943728e-05, "loss": 2.0976, "step": 1808 }, { "epoch": 0.16093590142787242, "grad_norm": 0.7197508811950684, "learning_rate": 1.8748755388137333e-05, "loss": 2.0084, "step": 1809 }, { "epoch": 0.16102486544192873, "grad_norm": 0.7975771427154541, "learning_rate": 1.874740128887048e-05, "loss": 2.092, "step": 1810 }, { "epoch": 0.16111382945598504, "grad_norm": 0.7872530221939087, "learning_rate": 1.874604650624895e-05, "loss": 2.0387, "step": 1811 }, { "epoch": 0.16120279347004138, "grad_norm": 0.705989420413971, "learning_rate": 1.874469104037858e-05, "loss": 2.1177, "step": 1812 }, { "epoch": 0.1612917574840977, "grad_norm": 0.703155517578125, "learning_rate": 1.8743334891365263e-05, "loss": 2.132, "step": 1813 }, { "epoch": 0.161380721498154, "grad_norm": 0.7616347670555115, "learning_rate": 1.8741978059314937e-05, "loss": 1.9902, "step": 1814 }, { "epoch": 0.1614696855122103, "grad_norm": 0.7730119228363037, "learning_rate": 1.8740620544333604e-05, "loss": 2.0752, "step": 1815 }, { "epoch": 0.1615586495262666, "grad_norm": 0.7856339812278748, "learning_rate": 1.8739262346527316e-05, "loss": 2.1875, "step": 1816 }, { "epoch": 0.16164761354032295, "grad_norm": 0.7071665525436401, "learning_rate": 1.8737903466002173e-05, "loss": 2.1505, "step": 1817 }, { "epoch": 0.16173657755437926, "grad_norm": 0.7310065031051636, "learning_rate": 1.873654390286433e-05, "loss": 1.9654, "step": 1818 }, { "epoch": 0.16182554156843557, "grad_norm": 0.7112226486206055, "learning_rate": 1.8735183657220003e-05, "loss": 2.187, "step": 1819 }, { "epoch": 0.16191450558249187, "grad_norm": 0.6939377784729004, "learning_rate": 1.8733822729175452e-05, "loss": 2.0761, "step": 1820 }, { "epoch": 0.16200346959654818, "grad_norm": 0.6940298080444336, "learning_rate": 1.873246111883699e-05, "loss": 2.0908, "step": 1821 }, { "epoch": 0.16209243361060452, "grad_norm": 0.7029221653938293, "learning_rate": 1.8731098826310993e-05, "loss": 2.1953, "step": 1822 }, { "epoch": 0.16218139762466083, "grad_norm": 0.727691650390625, "learning_rate": 1.8729735851703884e-05, "loss": 2.0464, "step": 1823 }, { "epoch": 0.16227036163871714, "grad_norm": 0.7493777871131897, "learning_rate": 1.872837219512214e-05, "loss": 2.0195, "step": 1824 }, { "epoch": 0.16235932565277345, "grad_norm": 0.7697184085845947, "learning_rate": 1.8727007856672285e-05, "loss": 2.0733, "step": 1825 }, { "epoch": 0.16244828966682975, "grad_norm": 0.7314260601997375, "learning_rate": 1.872564283646091e-05, "loss": 2.0274, "step": 1826 }, { "epoch": 0.1625372536808861, "grad_norm": 0.6913999915122986, "learning_rate": 1.872427713459465e-05, "loss": 2.0319, "step": 1827 }, { "epoch": 0.1626262176949424, "grad_norm": 0.7095019221305847, "learning_rate": 1.872291075118019e-05, "loss": 2.2865, "step": 1828 }, { "epoch": 0.1627151817089987, "grad_norm": 0.7772275805473328, "learning_rate": 1.872154368632428e-05, "loss": 2.0656, "step": 1829 }, { "epoch": 0.16280414572305502, "grad_norm": 0.7674510478973389, "learning_rate": 1.872017594013371e-05, "loss": 2.1981, "step": 1830 }, { "epoch": 0.16289310973711135, "grad_norm": 0.7497212290763855, "learning_rate": 1.871880751271533e-05, "loss": 2.0903, "step": 1831 }, { "epoch": 0.16298207375116766, "grad_norm": 0.7360950112342834, "learning_rate": 1.871743840417605e-05, "loss": 2.1679, "step": 1832 }, { "epoch": 0.16307103776522397, "grad_norm": 0.8901522159576416, "learning_rate": 1.871606861462282e-05, "loss": 2.1009, "step": 1833 }, { "epoch": 0.16316000177928028, "grad_norm": 0.8145518898963928, "learning_rate": 1.871469814416265e-05, "loss": 2.293, "step": 1834 }, { "epoch": 0.1632489657933366, "grad_norm": 0.7363942265510559, "learning_rate": 1.8713326992902602e-05, "loss": 2.0026, "step": 1835 }, { "epoch": 0.16333792980739292, "grad_norm": 0.7192262411117554, "learning_rate": 1.8711955160949792e-05, "loss": 2.1988, "step": 1836 }, { "epoch": 0.16342689382144923, "grad_norm": 0.723246693611145, "learning_rate": 1.871058264841139e-05, "loss": 2.1663, "step": 1837 }, { "epoch": 0.16351585783550554, "grad_norm": 0.853582501411438, "learning_rate": 1.8709209455394614e-05, "loss": 2.2892, "step": 1838 }, { "epoch": 0.16360482184956185, "grad_norm": 0.8126729130744934, "learning_rate": 1.8707835582006743e-05, "loss": 2.0741, "step": 1839 }, { "epoch": 0.16369378586361816, "grad_norm": 0.7468144297599792, "learning_rate": 1.8706461028355107e-05, "loss": 1.9585, "step": 1840 }, { "epoch": 0.1637827498776745, "grad_norm": 0.7093182802200317, "learning_rate": 1.870508579454708e-05, "loss": 2.0454, "step": 1841 }, { "epoch": 0.1638717138917308, "grad_norm": 0.8176747560501099, "learning_rate": 1.8703709880690103e-05, "loss": 2.1492, "step": 1842 }, { "epoch": 0.1639606779057871, "grad_norm": 0.7589290738105774, "learning_rate": 1.870233328689166e-05, "loss": 2.2183, "step": 1843 }, { "epoch": 0.16404964191984342, "grad_norm": 0.7100753784179688, "learning_rate": 1.8700956013259293e-05, "loss": 2.131, "step": 1844 }, { "epoch": 0.16413860593389973, "grad_norm": 0.7215257883071899, "learning_rate": 1.8699578059900597e-05, "loss": 2.0446, "step": 1845 }, { "epoch": 0.16422756994795606, "grad_norm": 0.7379798889160156, "learning_rate": 1.8698199426923217e-05, "loss": 2.0686, "step": 1846 }, { "epoch": 0.16431653396201237, "grad_norm": 0.7155280113220215, "learning_rate": 1.8696820114434854e-05, "loss": 2.1562, "step": 1847 }, { "epoch": 0.16440549797606868, "grad_norm": 0.7534448504447937, "learning_rate": 1.8695440122543262e-05, "loss": 2.0457, "step": 1848 }, { "epoch": 0.164494461990125, "grad_norm": 0.7414466142654419, "learning_rate": 1.8694059451356247e-05, "loss": 2.0605, "step": 1849 }, { "epoch": 0.1645834260041813, "grad_norm": 0.7555286884307861, "learning_rate": 1.8692678100981663e-05, "loss": 2.2114, "step": 1850 }, { "epoch": 0.16467239001823764, "grad_norm": 0.7113584280014038, "learning_rate": 1.869129607152743e-05, "loss": 2.0994, "step": 1851 }, { "epoch": 0.16476135403229394, "grad_norm": 0.6962443590164185, "learning_rate": 1.8689913363101507e-05, "loss": 2.0234, "step": 1852 }, { "epoch": 0.16485031804635025, "grad_norm": 0.7367153763771057, "learning_rate": 1.8688529975811918e-05, "loss": 2.0091, "step": 1853 }, { "epoch": 0.16493928206040656, "grad_norm": 0.7367897033691406, "learning_rate": 1.8687145909766734e-05, "loss": 2.1639, "step": 1854 }, { "epoch": 0.16502824607446287, "grad_norm": 0.7595120668411255, "learning_rate": 1.8685761165074073e-05, "loss": 2.0679, "step": 1855 }, { "epoch": 0.1651172100885192, "grad_norm": 0.7509666681289673, "learning_rate": 1.8684375741842123e-05, "loss": 2.1355, "step": 1856 }, { "epoch": 0.16520617410257551, "grad_norm": 0.7923156023025513, "learning_rate": 1.8682989640179103e-05, "loss": 2.2245, "step": 1857 }, { "epoch": 0.16529513811663182, "grad_norm": 0.7212690114974976, "learning_rate": 1.86816028601933e-05, "loss": 1.8301, "step": 1858 }, { "epoch": 0.16538410213068813, "grad_norm": 0.7804785966873169, "learning_rate": 1.8680215401993057e-05, "loss": 1.9919, "step": 1859 }, { "epoch": 0.16547306614474444, "grad_norm": 0.7560901641845703, "learning_rate": 1.867882726568676e-05, "loss": 2.0787, "step": 1860 }, { "epoch": 0.16556203015880078, "grad_norm": 0.7424188256263733, "learning_rate": 1.8677438451382844e-05, "loss": 2.2917, "step": 1861 }, { "epoch": 0.16565099417285709, "grad_norm": 0.711249053478241, "learning_rate": 1.8676048959189813e-05, "loss": 2.063, "step": 1862 }, { "epoch": 0.1657399581869134, "grad_norm": 0.701790452003479, "learning_rate": 1.8674658789216214e-05, "loss": 2.1327, "step": 1863 }, { "epoch": 0.1658289222009697, "grad_norm": 0.7108314633369446, "learning_rate": 1.8673267941570646e-05, "loss": 2.037, "step": 1864 }, { "epoch": 0.165917886215026, "grad_norm": 0.7222524285316467, "learning_rate": 1.8671876416361763e-05, "loss": 2.1311, "step": 1865 }, { "epoch": 0.16600685022908235, "grad_norm": 0.7131462693214417, "learning_rate": 1.8670484213698277e-05, "loss": 2.0107, "step": 1866 }, { "epoch": 0.16609581424313866, "grad_norm": 0.8341764211654663, "learning_rate": 1.8669091333688944e-05, "loss": 1.9434, "step": 1867 }, { "epoch": 0.16618477825719496, "grad_norm": 0.7179468274116516, "learning_rate": 1.8667697776442576e-05, "loss": 2.3947, "step": 1868 }, { "epoch": 0.16627374227125127, "grad_norm": 0.7234111428260803, "learning_rate": 1.8666303542068038e-05, "loss": 2.069, "step": 1869 }, { "epoch": 0.16636270628530758, "grad_norm": 0.7480978965759277, "learning_rate": 1.8664908630674258e-05, "loss": 2.2225, "step": 1870 }, { "epoch": 0.16645167029936392, "grad_norm": 0.6910400390625, "learning_rate": 1.8663513042370195e-05, "loss": 2.2105, "step": 1871 }, { "epoch": 0.16654063431342023, "grad_norm": 0.7273512482643127, "learning_rate": 1.8662116777264882e-05, "loss": 2.1312, "step": 1872 }, { "epoch": 0.16662959832747654, "grad_norm": 0.7401614189147949, "learning_rate": 1.8660719835467394e-05, "loss": 2.1228, "step": 1873 }, { "epoch": 0.16671856234153284, "grad_norm": 0.7384092211723328, "learning_rate": 1.8659322217086863e-05, "loss": 2.091, "step": 1874 }, { "epoch": 0.16680752635558915, "grad_norm": 0.7078136801719666, "learning_rate": 1.865792392223247e-05, "loss": 2.1224, "step": 1875 }, { "epoch": 0.1668964903696455, "grad_norm": 0.708893358707428, "learning_rate": 1.8656524951013454e-05, "loss": 2.0841, "step": 1876 }, { "epoch": 0.1669854543837018, "grad_norm": 0.7142239212989807, "learning_rate": 1.8655125303539097e-05, "loss": 2.138, "step": 1877 }, { "epoch": 0.1670744183977581, "grad_norm": 0.7485974431037903, "learning_rate": 1.8653724979918745e-05, "loss": 2.0516, "step": 1878 }, { "epoch": 0.16716338241181442, "grad_norm": 0.7050961256027222, "learning_rate": 1.8652323980261794e-05, "loss": 2.0073, "step": 1879 }, { "epoch": 0.16725234642587072, "grad_norm": 0.7545443773269653, "learning_rate": 1.865092230467769e-05, "loss": 2.1348, "step": 1880 }, { "epoch": 0.16734131043992706, "grad_norm": 0.7435582280158997, "learning_rate": 1.8649519953275934e-05, "loss": 2.0748, "step": 1881 }, { "epoch": 0.16743027445398337, "grad_norm": 0.7553359270095825, "learning_rate": 1.8648116926166078e-05, "loss": 2.154, "step": 1882 }, { "epoch": 0.16751923846803968, "grad_norm": 0.7540184259414673, "learning_rate": 1.8646713223457728e-05, "loss": 2.1608, "step": 1883 }, { "epoch": 0.16760820248209599, "grad_norm": 0.7073484063148499, "learning_rate": 1.8645308845260542e-05, "loss": 2.2049, "step": 1884 }, { "epoch": 0.1676971664961523, "grad_norm": 0.7543067932128906, "learning_rate": 1.864390379168423e-05, "loss": 2.025, "step": 1885 }, { "epoch": 0.16778613051020863, "grad_norm": 0.7582738399505615, "learning_rate": 1.864249806283856e-05, "loss": 2.0858, "step": 1886 }, { "epoch": 0.16787509452426494, "grad_norm": 0.7203623652458191, "learning_rate": 1.864109165883335e-05, "loss": 2.0574, "step": 1887 }, { "epoch": 0.16796405853832125, "grad_norm": 0.7144920229911804, "learning_rate": 1.863968457977846e-05, "loss": 2.0281, "step": 1888 }, { "epoch": 0.16805302255237756, "grad_norm": 0.7496237754821777, "learning_rate": 1.8638276825783816e-05, "loss": 1.95, "step": 1889 }, { "epoch": 0.16814198656643387, "grad_norm": 0.755275547504425, "learning_rate": 1.86368683969594e-05, "loss": 2.2377, "step": 1890 }, { "epoch": 0.1682309505804902, "grad_norm": 0.7724699974060059, "learning_rate": 1.863545929341523e-05, "loss": 2.0523, "step": 1891 }, { "epoch": 0.1683199145945465, "grad_norm": 0.7186881899833679, "learning_rate": 1.86340495152614e-05, "loss": 2.0724, "step": 1892 }, { "epoch": 0.16840887860860282, "grad_norm": 0.7525007128715515, "learning_rate": 1.863263906260803e-05, "loss": 2.1485, "step": 1893 }, { "epoch": 0.16849784262265913, "grad_norm": 0.7314103841781616, "learning_rate": 1.8631227935565305e-05, "loss": 2.1517, "step": 1894 }, { "epoch": 0.16858680663671544, "grad_norm": 0.7624326944351196, "learning_rate": 1.8629816134243473e-05, "loss": 2.0776, "step": 1895 }, { "epoch": 0.16867577065077177, "grad_norm": 0.712710440158844, "learning_rate": 1.862840365875282e-05, "loss": 2.1549, "step": 1896 }, { "epoch": 0.16876473466482808, "grad_norm": 0.7075045704841614, "learning_rate": 1.862699050920369e-05, "loss": 2.232, "step": 1897 }, { "epoch": 0.1688536986788844, "grad_norm": 0.7915934324264526, "learning_rate": 1.8625576685706484e-05, "loss": 2.1458, "step": 1898 }, { "epoch": 0.1689426626929407, "grad_norm": 0.7355189919471741, "learning_rate": 1.8624162188371643e-05, "loss": 2.1983, "step": 1899 }, { "epoch": 0.169031626706997, "grad_norm": 0.7223502397537231, "learning_rate": 1.8622747017309676e-05, "loss": 2.0683, "step": 1900 }, { "epoch": 0.16912059072105334, "grad_norm": 0.7345114946365356, "learning_rate": 1.862133117263113e-05, "loss": 2.0412, "step": 1901 }, { "epoch": 0.16920955473510965, "grad_norm": 0.7882584929466248, "learning_rate": 1.861991465444662e-05, "loss": 2.1158, "step": 1902 }, { "epoch": 0.16929851874916596, "grad_norm": 0.7379816770553589, "learning_rate": 1.8618497462866802e-05, "loss": 2.1331, "step": 1903 }, { "epoch": 0.16938748276322227, "grad_norm": 0.7374833822250366, "learning_rate": 1.8617079598002385e-05, "loss": 2.2117, "step": 1904 }, { "epoch": 0.16947644677727858, "grad_norm": 0.8703639507293701, "learning_rate": 1.861566105996414e-05, "loss": 2.1359, "step": 1905 }, { "epoch": 0.16956541079133491, "grad_norm": 0.7541050910949707, "learning_rate": 1.861424184886288e-05, "loss": 2.0179, "step": 1906 }, { "epoch": 0.16965437480539122, "grad_norm": 0.7221600413322449, "learning_rate": 1.8612821964809477e-05, "loss": 2.0993, "step": 1907 }, { "epoch": 0.16974333881944753, "grad_norm": 0.7363880276679993, "learning_rate": 1.8611401407914854e-05, "loss": 2.086, "step": 1908 }, { "epoch": 0.16983230283350384, "grad_norm": 0.7541742324829102, "learning_rate": 1.8609980178289988e-05, "loss": 2.0277, "step": 1909 }, { "epoch": 0.16992126684756015, "grad_norm": 0.7416980862617493, "learning_rate": 1.86085582760459e-05, "loss": 2.1452, "step": 1910 }, { "epoch": 0.17001023086161648, "grad_norm": 0.697801411151886, "learning_rate": 1.8607135701293674e-05, "loss": 2.1716, "step": 1911 }, { "epoch": 0.1700991948756728, "grad_norm": 0.7489839792251587, "learning_rate": 1.8605712454144446e-05, "loss": 2.0901, "step": 1912 }, { "epoch": 0.1701881588897291, "grad_norm": 0.7638895511627197, "learning_rate": 1.8604288534709397e-05, "loss": 1.9408, "step": 1913 }, { "epoch": 0.1702771229037854, "grad_norm": 0.7614314556121826, "learning_rate": 1.8602863943099768e-05, "loss": 2.1526, "step": 1914 }, { "epoch": 0.17036608691784172, "grad_norm": 0.749198317527771, "learning_rate": 1.8601438679426843e-05, "loss": 2.1285, "step": 1915 }, { "epoch": 0.17045505093189806, "grad_norm": 0.7942161560058594, "learning_rate": 1.8600012743801973e-05, "loss": 1.9493, "step": 1916 }, { "epoch": 0.17054401494595436, "grad_norm": 0.7613053321838379, "learning_rate": 1.8598586136336553e-05, "loss": 2.1215, "step": 1917 }, { "epoch": 0.17063297896001067, "grad_norm": 0.7468259930610657, "learning_rate": 1.8597158857142022e-05, "loss": 2.0766, "step": 1918 }, { "epoch": 0.17072194297406698, "grad_norm": 0.7206287980079651, "learning_rate": 1.8595730906329886e-05, "loss": 2.1261, "step": 1919 }, { "epoch": 0.17081090698812332, "grad_norm": 0.7447630167007446, "learning_rate": 1.8594302284011704e-05, "loss": 2.1041, "step": 1920 }, { "epoch": 0.17089987100217963, "grad_norm": 0.7892324328422546, "learning_rate": 1.859287299029907e-05, "loss": 2.001, "step": 1921 }, { "epoch": 0.17098883501623593, "grad_norm": 0.7449338436126709, "learning_rate": 1.8591443025303646e-05, "loss": 2.0803, "step": 1922 }, { "epoch": 0.17107779903029224, "grad_norm": 0.7706722617149353, "learning_rate": 1.859001238913714e-05, "loss": 2.3637, "step": 1923 }, { "epoch": 0.17116676304434855, "grad_norm": 0.7476236820220947, "learning_rate": 1.8588581081911323e-05, "loss": 2.0552, "step": 1924 }, { "epoch": 0.1712557270584049, "grad_norm": 0.7348268628120422, "learning_rate": 1.8587149103738003e-05, "loss": 1.9755, "step": 1925 }, { "epoch": 0.1713446910724612, "grad_norm": 0.7174257636070251, "learning_rate": 1.858571645472905e-05, "loss": 2.0591, "step": 1926 }, { "epoch": 0.1714336550865175, "grad_norm": 0.8517979979515076, "learning_rate": 1.858428313499638e-05, "loss": 2.2137, "step": 1927 }, { "epoch": 0.17152261910057381, "grad_norm": 0.740403413772583, "learning_rate": 1.858284914465197e-05, "loss": 2.0894, "step": 1928 }, { "epoch": 0.17161158311463012, "grad_norm": 0.7610679268836975, "learning_rate": 1.8581414483807838e-05, "loss": 2.2276, "step": 1929 }, { "epoch": 0.17170054712868646, "grad_norm": 0.6924216747283936, "learning_rate": 1.857997915257607e-05, "loss": 2.0484, "step": 1930 }, { "epoch": 0.17178951114274277, "grad_norm": 0.702710747718811, "learning_rate": 1.8578543151068786e-05, "loss": 2.1306, "step": 1931 }, { "epoch": 0.17187847515679908, "grad_norm": 0.762301504611969, "learning_rate": 1.8577106479398173e-05, "loss": 2.1962, "step": 1932 }, { "epoch": 0.17196743917085539, "grad_norm": 0.7432664632797241, "learning_rate": 1.857566913767647e-05, "loss": 2.0497, "step": 1933 }, { "epoch": 0.1720564031849117, "grad_norm": 0.7619324922561646, "learning_rate": 1.8574231126015952e-05, "loss": 2.0108, "step": 1934 }, { "epoch": 0.17214536719896803, "grad_norm": 0.7168024778366089, "learning_rate": 1.8572792444528963e-05, "loss": 2.1529, "step": 1935 }, { "epoch": 0.17223433121302434, "grad_norm": 0.7577253580093384, "learning_rate": 1.8571353093327897e-05, "loss": 2.0845, "step": 1936 }, { "epoch": 0.17232329522708065, "grad_norm": 0.7009834051132202, "learning_rate": 1.8569913072525192e-05, "loss": 2.1419, "step": 1937 }, { "epoch": 0.17241225924113696, "grad_norm": 0.7270769476890564, "learning_rate": 1.8568472382233352e-05, "loss": 1.9507, "step": 1938 }, { "epoch": 0.17250122325519326, "grad_norm": 0.75755375623703, "learning_rate": 1.8567031022564915e-05, "loss": 2.0653, "step": 1939 }, { "epoch": 0.1725901872692496, "grad_norm": 0.7334719896316528, "learning_rate": 1.8565588993632488e-05, "loss": 2.2284, "step": 1940 }, { "epoch": 0.1726791512833059, "grad_norm": 0.7682117223739624, "learning_rate": 1.856414629554872e-05, "loss": 2.1101, "step": 1941 }, { "epoch": 0.17276811529736222, "grad_norm": 0.7219837307929993, "learning_rate": 1.8562702928426318e-05, "loss": 2.1949, "step": 1942 }, { "epoch": 0.17285707931141853, "grad_norm": 0.7939475178718567, "learning_rate": 1.856125889237804e-05, "loss": 2.1708, "step": 1943 }, { "epoch": 0.17294604332547484, "grad_norm": 0.8877162337303162, "learning_rate": 1.8559814187516692e-05, "loss": 2.0469, "step": 1944 }, { "epoch": 0.17303500733953117, "grad_norm": 0.7914113998413086, "learning_rate": 1.8558368813955136e-05, "loss": 1.9695, "step": 1945 }, { "epoch": 0.17312397135358748, "grad_norm": 0.7338616847991943, "learning_rate": 1.8556922771806293e-05, "loss": 2.3152, "step": 1946 }, { "epoch": 0.1732129353676438, "grad_norm": 0.8243101835250854, "learning_rate": 1.8555476061183117e-05, "loss": 1.9247, "step": 1947 }, { "epoch": 0.1733018993817001, "grad_norm": 0.7637730240821838, "learning_rate": 1.8554028682198634e-05, "loss": 2.1164, "step": 1948 }, { "epoch": 0.1733908633957564, "grad_norm": 0.7518311738967896, "learning_rate": 1.8552580634965914e-05, "loss": 2.1484, "step": 1949 }, { "epoch": 0.17347982740981274, "grad_norm": 0.7514322400093079, "learning_rate": 1.855113191959808e-05, "loss": 2.1461, "step": 1950 }, { "epoch": 0.17356879142386905, "grad_norm": 0.8137099742889404, "learning_rate": 1.8549682536208306e-05, "loss": 2.0429, "step": 1951 }, { "epoch": 0.17365775543792536, "grad_norm": 0.7096027731895447, "learning_rate": 1.8548232484909815e-05, "loss": 2.0246, "step": 1952 }, { "epoch": 0.17374671945198167, "grad_norm": 0.9244053959846497, "learning_rate": 1.8546781765815892e-05, "loss": 1.9989, "step": 1953 }, { "epoch": 0.17383568346603798, "grad_norm": 0.7113448977470398, "learning_rate": 1.8545330379039867e-05, "loss": 2.1591, "step": 1954 }, { "epoch": 0.1739246474800943, "grad_norm": 0.8011485934257507, "learning_rate": 1.8543878324695122e-05, "loss": 2.0654, "step": 1955 }, { "epoch": 0.17401361149415062, "grad_norm": 0.8973109722137451, "learning_rate": 1.8542425602895096e-05, "loss": 2.2378, "step": 1956 }, { "epoch": 0.17410257550820693, "grad_norm": 0.7187478542327881, "learning_rate": 1.8540972213753275e-05, "loss": 2.1477, "step": 1957 }, { "epoch": 0.17419153952226324, "grad_norm": 0.7342544794082642, "learning_rate": 1.8539518157383198e-05, "loss": 2.1177, "step": 1958 }, { "epoch": 0.17428050353631955, "grad_norm": 0.7432228326797485, "learning_rate": 1.8538063433898458e-05, "loss": 2.0946, "step": 1959 }, { "epoch": 0.17436946755037588, "grad_norm": 0.7029479146003723, "learning_rate": 1.85366080434127e-05, "loss": 2.0983, "step": 1960 }, { "epoch": 0.1744584315644322, "grad_norm": 0.7784577012062073, "learning_rate": 1.8535151986039617e-05, "loss": 2.0171, "step": 1961 }, { "epoch": 0.1745473955784885, "grad_norm": 0.7120226621627808, "learning_rate": 1.853369526189296e-05, "loss": 2.231, "step": 1962 }, { "epoch": 0.1746363595925448, "grad_norm": 0.7883275747299194, "learning_rate": 1.8532237871086535e-05, "loss": 2.2354, "step": 1963 }, { "epoch": 0.17472532360660112, "grad_norm": 0.7253494262695312, "learning_rate": 1.8530779813734186e-05, "loss": 2.0291, "step": 1964 }, { "epoch": 0.17481428762065745, "grad_norm": 0.7413527965545654, "learning_rate": 1.852932108994982e-05, "loss": 2.0373, "step": 1965 }, { "epoch": 0.17490325163471376, "grad_norm": 0.8719160556793213, "learning_rate": 1.85278616998474e-05, "loss": 2.0239, "step": 1966 }, { "epoch": 0.17499221564877007, "grad_norm": 0.7891694903373718, "learning_rate": 1.8526401643540924e-05, "loss": 2.3019, "step": 1967 }, { "epoch": 0.17508117966282638, "grad_norm": 0.7814561128616333, "learning_rate": 1.852494092114446e-05, "loss": 2.0794, "step": 1968 }, { "epoch": 0.1751701436768827, "grad_norm": 0.7497658133506775, "learning_rate": 1.8523479532772122e-05, "loss": 1.9785, "step": 1969 }, { "epoch": 0.17525910769093903, "grad_norm": 0.8714814782142639, "learning_rate": 1.852201747853807e-05, "loss": 2.0924, "step": 1970 }, { "epoch": 0.17534807170499533, "grad_norm": 0.7860372066497803, "learning_rate": 1.8520554758556525e-05, "loss": 1.9245, "step": 1971 }, { "epoch": 0.17543703571905164, "grad_norm": 0.7438045144081116, "learning_rate": 1.8519091372941755e-05, "loss": 2.0612, "step": 1972 }, { "epoch": 0.17552599973310795, "grad_norm": 0.7341151237487793, "learning_rate": 1.851762732180808e-05, "loss": 2.1661, "step": 1973 }, { "epoch": 0.17561496374716426, "grad_norm": 0.7238994240760803, "learning_rate": 1.8516162605269873e-05, "loss": 2.162, "step": 1974 }, { "epoch": 0.1757039277612206, "grad_norm": 0.7861316204071045, "learning_rate": 1.8514697223441555e-05, "loss": 2.1072, "step": 1975 }, { "epoch": 0.1757928917752769, "grad_norm": 0.7153291702270508, "learning_rate": 1.8513231176437612e-05, "loss": 2.1006, "step": 1976 }, { "epoch": 0.1758818557893332, "grad_norm": 0.7574006915092468, "learning_rate": 1.851176446437257e-05, "loss": 2.1611, "step": 1977 }, { "epoch": 0.17597081980338952, "grad_norm": 0.7300019264221191, "learning_rate": 1.8510297087361003e-05, "loss": 1.9121, "step": 1978 }, { "epoch": 0.17605978381744583, "grad_norm": 0.7209392786026001, "learning_rate": 1.850882904551755e-05, "loss": 2.0348, "step": 1979 }, { "epoch": 0.17614874783150217, "grad_norm": 0.7444897294044495, "learning_rate": 1.8507360338956896e-05, "loss": 2.2022, "step": 1980 }, { "epoch": 0.17623771184555848, "grad_norm": 0.7467645406723022, "learning_rate": 1.8505890967793775e-05, "loss": 2.1208, "step": 1981 }, { "epoch": 0.17632667585961478, "grad_norm": 0.738163948059082, "learning_rate": 1.850442093214298e-05, "loss": 2.1139, "step": 1982 }, { "epoch": 0.1764156398736711, "grad_norm": 0.7563031315803528, "learning_rate": 1.8502950232119342e-05, "loss": 2.0689, "step": 1983 }, { "epoch": 0.1765046038877274, "grad_norm": 0.7431614398956299, "learning_rate": 1.8501478867837766e-05, "loss": 2.1835, "step": 1984 }, { "epoch": 0.17659356790178374, "grad_norm": 0.7453591823577881, "learning_rate": 1.8500006839413183e-05, "loss": 2.1909, "step": 1985 }, { "epoch": 0.17668253191584005, "grad_norm": 0.7018469572067261, "learning_rate": 1.84985341469606e-05, "loss": 2.1315, "step": 1986 }, { "epoch": 0.17677149592989636, "grad_norm": 0.753649890422821, "learning_rate": 1.849706079059506e-05, "loss": 2.0804, "step": 1987 }, { "epoch": 0.17686045994395266, "grad_norm": 0.7390051484107971, "learning_rate": 1.8495586770431666e-05, "loss": 2.0815, "step": 1988 }, { "epoch": 0.17694942395800897, "grad_norm": 0.7527368068695068, "learning_rate": 1.8494112086585567e-05, "loss": 2.3614, "step": 1989 }, { "epoch": 0.1770383879720653, "grad_norm": 0.7226595282554626, "learning_rate": 1.8492636739171966e-05, "loss": 2.0088, "step": 1990 }, { "epoch": 0.17712735198612162, "grad_norm": 0.7631362080574036, "learning_rate": 1.849116072830612e-05, "loss": 2.0889, "step": 1991 }, { "epoch": 0.17721631600017793, "grad_norm": 0.7226073741912842, "learning_rate": 1.8489684054103335e-05, "loss": 2.1921, "step": 1992 }, { "epoch": 0.17730528001423423, "grad_norm": 0.6849701404571533, "learning_rate": 1.8488206716678974e-05, "loss": 2.1173, "step": 1993 }, { "epoch": 0.17739424402829054, "grad_norm": 0.7053691148757935, "learning_rate": 1.848672871614844e-05, "loss": 2.0466, "step": 1994 }, { "epoch": 0.17748320804234688, "grad_norm": 0.7058512568473816, "learning_rate": 1.8485250052627205e-05, "loss": 2.1056, "step": 1995 }, { "epoch": 0.1775721720564032, "grad_norm": 0.6904292702674866, "learning_rate": 1.8483770726230777e-05, "loss": 2.1447, "step": 1996 }, { "epoch": 0.1776611360704595, "grad_norm": 0.7385074496269226, "learning_rate": 1.8482290737074725e-05, "loss": 2.0178, "step": 1997 }, { "epoch": 0.1777501000845158, "grad_norm": 0.7233278155326843, "learning_rate": 1.8480810085274664e-05, "loss": 2.1163, "step": 1998 }, { "epoch": 0.1778390640985721, "grad_norm": 0.7351945638656616, "learning_rate": 1.847932877094627e-05, "loss": 2.0682, "step": 1999 }, { "epoch": 0.17792802811262845, "grad_norm": 0.7012211084365845, "learning_rate": 1.8477846794205258e-05, "loss": 2.0747, "step": 2000 }, { "epoch": 0.17801699212668476, "grad_norm": 0.73421311378479, "learning_rate": 1.8476364155167406e-05, "loss": 2.1078, "step": 2001 }, { "epoch": 0.17810595614074107, "grad_norm": 0.6934822201728821, "learning_rate": 1.8474880853948538e-05, "loss": 2.1157, "step": 2002 }, { "epoch": 0.17819492015479738, "grad_norm": 0.7698958516120911, "learning_rate": 1.8473396890664527e-05, "loss": 2.1707, "step": 2003 }, { "epoch": 0.1782838841688537, "grad_norm": 0.7333460450172424, "learning_rate": 1.8471912265431308e-05, "loss": 2.1271, "step": 2004 }, { "epoch": 0.17837284818291002, "grad_norm": 0.7325608134269714, "learning_rate": 1.8470426978364857e-05, "loss": 2.0501, "step": 2005 }, { "epoch": 0.17846181219696633, "grad_norm": 0.736074686050415, "learning_rate": 1.8468941029581203e-05, "loss": 2.0562, "step": 2006 }, { "epoch": 0.17855077621102264, "grad_norm": 0.7414700984954834, "learning_rate": 1.8467454419196436e-05, "loss": 2.1475, "step": 2007 }, { "epoch": 0.17863974022507895, "grad_norm": 0.8054749369621277, "learning_rate": 1.846596714732669e-05, "loss": 1.9846, "step": 2008 }, { "epoch": 0.17872870423913528, "grad_norm": 0.7651209831237793, "learning_rate": 1.846447921408815e-05, "loss": 2.0877, "step": 2009 }, { "epoch": 0.1788176682531916, "grad_norm": 0.7956200838088989, "learning_rate": 1.846299061959705e-05, "loss": 2.0253, "step": 2010 }, { "epoch": 0.1789066322672479, "grad_norm": 0.7639230489730835, "learning_rate": 1.846150136396969e-05, "loss": 2.0835, "step": 2011 }, { "epoch": 0.1789955962813042, "grad_norm": 0.7276865839958191, "learning_rate": 1.846001144732241e-05, "loss": 2.1073, "step": 2012 }, { "epoch": 0.17908456029536052, "grad_norm": 0.7728212475776672, "learning_rate": 1.8458520869771595e-05, "loss": 2.0879, "step": 2013 }, { "epoch": 0.17917352430941685, "grad_norm": 0.6928873658180237, "learning_rate": 1.8457029631433703e-05, "loss": 2.1217, "step": 2014 }, { "epoch": 0.17926248832347316, "grad_norm": 0.7800984978675842, "learning_rate": 1.8455537732425223e-05, "loss": 2.0639, "step": 2015 }, { "epoch": 0.17935145233752947, "grad_norm": 0.757377028465271, "learning_rate": 1.84540451728627e-05, "loss": 2.017, "step": 2016 }, { "epoch": 0.17944041635158578, "grad_norm": 0.693466305732727, "learning_rate": 1.8452551952862744e-05, "loss": 1.989, "step": 2017 }, { "epoch": 0.1795293803656421, "grad_norm": 0.7197025418281555, "learning_rate": 1.8451058072541998e-05, "loss": 2.1787, "step": 2018 }, { "epoch": 0.17961834437969842, "grad_norm": 0.7140913605690002, "learning_rate": 1.8449563532017168e-05, "loss": 2.0594, "step": 2019 }, { "epoch": 0.17970730839375473, "grad_norm": 0.8315644264221191, "learning_rate": 1.844806833140501e-05, "loss": 2.0811, "step": 2020 }, { "epoch": 0.17979627240781104, "grad_norm": 0.7547711133956909, "learning_rate": 1.8446572470822333e-05, "loss": 2.1553, "step": 2021 }, { "epoch": 0.17988523642186735, "grad_norm": 0.6921864151954651, "learning_rate": 1.8445075950385992e-05, "loss": 2.0358, "step": 2022 }, { "epoch": 0.17997420043592366, "grad_norm": 0.7265222072601318, "learning_rate": 1.844357877021289e-05, "loss": 2.0626, "step": 2023 }, { "epoch": 0.18006316444998, "grad_norm": 0.6996111869812012, "learning_rate": 1.844208093042e-05, "loss": 2.098, "step": 2024 }, { "epoch": 0.1801521284640363, "grad_norm": 0.7017983198165894, "learning_rate": 1.8440582431124325e-05, "loss": 2.025, "step": 2025 }, { "epoch": 0.1802410924780926, "grad_norm": 0.7135716080665588, "learning_rate": 1.8439083272442934e-05, "loss": 1.9293, "step": 2026 }, { "epoch": 0.18033005649214892, "grad_norm": 0.711635172367096, "learning_rate": 1.8437583454492944e-05, "loss": 2.0207, "step": 2027 }, { "epoch": 0.18041902050620523, "grad_norm": 0.7058443427085876, "learning_rate": 1.843608297739152e-05, "loss": 2.1201, "step": 2028 }, { "epoch": 0.18050798452026157, "grad_norm": 0.7396076321601868, "learning_rate": 1.8434581841255877e-05, "loss": 2.0232, "step": 2029 }, { "epoch": 0.18059694853431787, "grad_norm": 0.7142722010612488, "learning_rate": 1.8433080046203293e-05, "loss": 2.0403, "step": 2030 }, { "epoch": 0.18068591254837418, "grad_norm": 0.7487977147102356, "learning_rate": 1.8431577592351077e-05, "loss": 2.1648, "step": 2031 }, { "epoch": 0.1807748765624305, "grad_norm": 0.7571637034416199, "learning_rate": 1.8430074479816616e-05, "loss": 2.1551, "step": 2032 }, { "epoch": 0.1808638405764868, "grad_norm": 0.6937898993492126, "learning_rate": 1.8428570708717325e-05, "loss": 2.1368, "step": 2033 }, { "epoch": 0.18095280459054314, "grad_norm": 0.7531159520149231, "learning_rate": 1.8427066279170684e-05, "loss": 2.0148, "step": 2034 }, { "epoch": 0.18104176860459945, "grad_norm": 0.7772713303565979, "learning_rate": 1.842556119129422e-05, "loss": 2.0339, "step": 2035 }, { "epoch": 0.18113073261865575, "grad_norm": 0.7375651001930237, "learning_rate": 1.8424055445205513e-05, "loss": 2.1175, "step": 2036 }, { "epoch": 0.18121969663271206, "grad_norm": 0.6963471174240112, "learning_rate": 1.842254904102219e-05, "loss": 2.0138, "step": 2037 }, { "epoch": 0.18130866064676837, "grad_norm": 0.7259635329246521, "learning_rate": 1.8421041978861937e-05, "loss": 2.2592, "step": 2038 }, { "epoch": 0.1813976246608247, "grad_norm": 0.8020951151847839, "learning_rate": 1.8419534258842483e-05, "loss": 2.1428, "step": 2039 }, { "epoch": 0.18148658867488102, "grad_norm": 0.7641043663024902, "learning_rate": 1.8418025881081612e-05, "loss": 1.9517, "step": 2040 }, { "epoch": 0.18157555268893733, "grad_norm": 0.7566503286361694, "learning_rate": 1.8416516845697162e-05, "loss": 2.1682, "step": 2041 }, { "epoch": 0.18166451670299363, "grad_norm": 0.771088182926178, "learning_rate": 1.841500715280702e-05, "loss": 2.1033, "step": 2042 }, { "epoch": 0.18175348071704994, "grad_norm": 0.767106294631958, "learning_rate": 1.8413496802529123e-05, "loss": 2.3175, "step": 2043 }, { "epoch": 0.18184244473110628, "grad_norm": 0.7438821792602539, "learning_rate": 1.8411985794981463e-05, "loss": 2.077, "step": 2044 }, { "epoch": 0.1819314087451626, "grad_norm": 0.7870458364486694, "learning_rate": 1.8410474130282078e-05, "loss": 2.1919, "step": 2045 }, { "epoch": 0.1820203727592189, "grad_norm": 0.760523796081543, "learning_rate": 1.8408961808549065e-05, "loss": 2.1029, "step": 2046 }, { "epoch": 0.1821093367732752, "grad_norm": 0.7154877185821533, "learning_rate": 1.8407448829900568e-05, "loss": 2.2724, "step": 2047 }, { "epoch": 0.1821983007873315, "grad_norm": 0.792496919631958, "learning_rate": 1.8405935194454775e-05, "loss": 2.0299, "step": 2048 }, { "epoch": 0.18228726480138785, "grad_norm": 0.7246788144111633, "learning_rate": 1.8404420902329942e-05, "loss": 2.2161, "step": 2049 }, { "epoch": 0.18237622881544416, "grad_norm": 0.7912760972976685, "learning_rate": 1.840290595364436e-05, "loss": 2.1386, "step": 2050 }, { "epoch": 0.18246519282950047, "grad_norm": 0.7123632431030273, "learning_rate": 1.8401390348516384e-05, "loss": 2.0757, "step": 2051 }, { "epoch": 0.18255415684355678, "grad_norm": 0.7240727543830872, "learning_rate": 1.8399874087064408e-05, "loss": 2.0307, "step": 2052 }, { "epoch": 0.18264312085761308, "grad_norm": 0.7183453440666199, "learning_rate": 1.8398357169406883e-05, "loss": 1.9844, "step": 2053 }, { "epoch": 0.18273208487166942, "grad_norm": 0.7336275577545166, "learning_rate": 1.839683959566232e-05, "loss": 1.9868, "step": 2054 }, { "epoch": 0.18282104888572573, "grad_norm": 0.8274102807044983, "learning_rate": 1.839532136594927e-05, "loss": 1.8717, "step": 2055 }, { "epoch": 0.18291001289978204, "grad_norm": 0.7818873524665833, "learning_rate": 1.8393802480386332e-05, "loss": 2.1902, "step": 2056 }, { "epoch": 0.18299897691383835, "grad_norm": 0.7319453358650208, "learning_rate": 1.8392282939092172e-05, "loss": 1.9805, "step": 2057 }, { "epoch": 0.18308794092789465, "grad_norm": 0.7468681931495667, "learning_rate": 1.8390762742185494e-05, "loss": 2.032, "step": 2058 }, { "epoch": 0.183176904941951, "grad_norm": 0.7650938034057617, "learning_rate": 1.8389241889785054e-05, "loss": 1.9859, "step": 2059 }, { "epoch": 0.1832658689560073, "grad_norm": 0.7147865891456604, "learning_rate": 1.8387720382009665e-05, "loss": 2.0682, "step": 2060 }, { "epoch": 0.1833548329700636, "grad_norm": 0.7502256631851196, "learning_rate": 1.838619821897819e-05, "loss": 2.1689, "step": 2061 }, { "epoch": 0.18344379698411992, "grad_norm": 0.7414448261260986, "learning_rate": 1.838467540080954e-05, "loss": 2.2427, "step": 2062 }, { "epoch": 0.18353276099817623, "grad_norm": 0.771108090877533, "learning_rate": 1.8383151927622684e-05, "loss": 1.9484, "step": 2063 }, { "epoch": 0.18362172501223256, "grad_norm": 0.7370500564575195, "learning_rate": 1.8381627799536623e-05, "loss": 2.1278, "step": 2064 }, { "epoch": 0.18371068902628887, "grad_norm": 0.7501896023750305, "learning_rate": 1.838010301667044e-05, "loss": 2.1838, "step": 2065 }, { "epoch": 0.18379965304034518, "grad_norm": 0.7344055771827698, "learning_rate": 1.837857757914324e-05, "loss": 2.1099, "step": 2066 }, { "epoch": 0.1838886170544015, "grad_norm": 0.7788003087043762, "learning_rate": 1.83770514870742e-05, "loss": 2.1253, "step": 2067 }, { "epoch": 0.1839775810684578, "grad_norm": 0.7370340824127197, "learning_rate": 1.8375524740582535e-05, "loss": 2.0867, "step": 2068 }, { "epoch": 0.18406654508251413, "grad_norm": 0.8314010500907898, "learning_rate": 1.8373997339787517e-05, "loss": 2.2112, "step": 2069 }, { "epoch": 0.18415550909657044, "grad_norm": 0.7510485649108887, "learning_rate": 1.8372469284808468e-05, "loss": 2.1686, "step": 2070 }, { "epoch": 0.18424447311062675, "grad_norm": 0.7852067947387695, "learning_rate": 1.837094057576476e-05, "loss": 2.0942, "step": 2071 }, { "epoch": 0.18433343712468306, "grad_norm": 0.7058443427085876, "learning_rate": 1.8369411212775822e-05, "loss": 2.1433, "step": 2072 }, { "epoch": 0.18442240113873937, "grad_norm": 0.7200643420219421, "learning_rate": 1.836788119596112e-05, "loss": 2.2078, "step": 2073 }, { "epoch": 0.1845113651527957, "grad_norm": 0.7197245955467224, "learning_rate": 1.8366350525440186e-05, "loss": 2.2011, "step": 2074 }, { "epoch": 0.184600329166852, "grad_norm": 0.7659677267074585, "learning_rate": 1.83648192013326e-05, "loss": 2.1798, "step": 2075 }, { "epoch": 0.18468929318090832, "grad_norm": 0.7264163494110107, "learning_rate": 1.8363287223757985e-05, "loss": 2.1537, "step": 2076 }, { "epoch": 0.18477825719496463, "grad_norm": 0.7348256707191467, "learning_rate": 1.8361754592836017e-05, "loss": 2.0537, "step": 2077 }, { "epoch": 0.18486722120902094, "grad_norm": 0.8363910913467407, "learning_rate": 1.8360221308686438e-05, "loss": 2.0761, "step": 2078 }, { "epoch": 0.18495618522307727, "grad_norm": 0.8249756097793579, "learning_rate": 1.8358687371429025e-05, "loss": 2.2193, "step": 2079 }, { "epoch": 0.18504514923713358, "grad_norm": 0.7469397783279419, "learning_rate": 1.8357152781183606e-05, "loss": 2.0071, "step": 2080 }, { "epoch": 0.1851341132511899, "grad_norm": 0.7809965014457703, "learning_rate": 1.8355617538070064e-05, "loss": 1.9269, "step": 2081 }, { "epoch": 0.1852230772652462, "grad_norm": 0.7024915218353271, "learning_rate": 1.8354081642208345e-05, "loss": 2.0428, "step": 2082 }, { "epoch": 0.1853120412793025, "grad_norm": 0.7563740611076355, "learning_rate": 1.8352545093718423e-05, "loss": 2.1205, "step": 2083 }, { "epoch": 0.18540100529335884, "grad_norm": 0.8411794900894165, "learning_rate": 1.8351007892720336e-05, "loss": 1.9766, "step": 2084 }, { "epoch": 0.18548996930741515, "grad_norm": 0.748618483543396, "learning_rate": 1.8349470039334173e-05, "loss": 2.0933, "step": 2085 }, { "epoch": 0.18557893332147146, "grad_norm": 0.7395220398902893, "learning_rate": 1.8347931533680077e-05, "loss": 2.1114, "step": 2086 }, { "epoch": 0.18566789733552777, "grad_norm": 0.762856662273407, "learning_rate": 1.8346392375878232e-05, "loss": 2.0595, "step": 2087 }, { "epoch": 0.1857568613495841, "grad_norm": 0.7675796151161194, "learning_rate": 1.834485256604888e-05, "loss": 2.1888, "step": 2088 }, { "epoch": 0.18584582536364042, "grad_norm": 0.7634822130203247, "learning_rate": 1.834331210431231e-05, "loss": 2.059, "step": 2089 }, { "epoch": 0.18593478937769672, "grad_norm": 0.7941399812698364, "learning_rate": 1.834177099078887e-05, "loss": 2.0229, "step": 2090 }, { "epoch": 0.18602375339175303, "grad_norm": 0.7735574841499329, "learning_rate": 1.8340229225598945e-05, "loss": 2.1619, "step": 2091 }, { "epoch": 0.18611271740580934, "grad_norm": 0.7174379825592041, "learning_rate": 1.833868680886299e-05, "loss": 2.1127, "step": 2092 }, { "epoch": 0.18620168141986568, "grad_norm": 0.7338917255401611, "learning_rate": 1.8337143740701488e-05, "loss": 1.9325, "step": 2093 }, { "epoch": 0.186290645433922, "grad_norm": 0.7710809111595154, "learning_rate": 1.8335600021234992e-05, "loss": 2.1756, "step": 2094 }, { "epoch": 0.1863796094479783, "grad_norm": 0.7515461444854736, "learning_rate": 1.8334055650584094e-05, "loss": 2.0517, "step": 2095 }, { "epoch": 0.1864685734620346, "grad_norm": 0.7146453261375427, "learning_rate": 1.833251062886945e-05, "loss": 2.1044, "step": 2096 }, { "epoch": 0.1865575374760909, "grad_norm": 0.7388179898262024, "learning_rate": 1.833096495621175e-05, "loss": 2.102, "step": 2097 }, { "epoch": 0.18664650149014725, "grad_norm": 0.7374165058135986, "learning_rate": 1.832941863273175e-05, "loss": 1.9547, "step": 2098 }, { "epoch": 0.18673546550420356, "grad_norm": 0.7537643313407898, "learning_rate": 1.8327871658550243e-05, "loss": 1.9902, "step": 2099 }, { "epoch": 0.18682442951825987, "grad_norm": 0.7149966359138489, "learning_rate": 1.832632403378808e-05, "loss": 1.9424, "step": 2100 }, { "epoch": 0.18691339353231617, "grad_norm": 0.766290009021759, "learning_rate": 1.8324775758566173e-05, "loss": 2.1324, "step": 2101 }, { "epoch": 0.18700235754637248, "grad_norm": 0.7302610278129578, "learning_rate": 1.8323226833005464e-05, "loss": 1.9702, "step": 2102 }, { "epoch": 0.18709132156042882, "grad_norm": 0.7499619126319885, "learning_rate": 1.8321677257226964e-05, "loss": 2.2301, "step": 2103 }, { "epoch": 0.18718028557448513, "grad_norm": 0.7148032784461975, "learning_rate": 1.8320127031351723e-05, "loss": 2.0181, "step": 2104 }, { "epoch": 0.18726924958854144, "grad_norm": 0.7233209609985352, "learning_rate": 1.831857615550084e-05, "loss": 2.0054, "step": 2105 }, { "epoch": 0.18735821360259775, "grad_norm": 0.7492173314094543, "learning_rate": 1.8317024629795485e-05, "loss": 2.1464, "step": 2106 }, { "epoch": 0.18744717761665405, "grad_norm": 0.7252806425094604, "learning_rate": 1.8315472454356856e-05, "loss": 2.0476, "step": 2107 }, { "epoch": 0.1875361416307104, "grad_norm": 0.7296537160873413, "learning_rate": 1.831391962930621e-05, "loss": 2.1372, "step": 2108 }, { "epoch": 0.1876251056447667, "grad_norm": 0.7488391399383545, "learning_rate": 1.8312366154764857e-05, "loss": 2.0685, "step": 2109 }, { "epoch": 0.187714069658823, "grad_norm": 0.7503678798675537, "learning_rate": 1.831081203085415e-05, "loss": 2.0783, "step": 2110 }, { "epoch": 0.18780303367287932, "grad_norm": 0.7349790334701538, "learning_rate": 1.830925725769551e-05, "loss": 2.2206, "step": 2111 }, { "epoch": 0.18789199768693562, "grad_norm": 0.7055210471153259, "learning_rate": 1.830770183541039e-05, "loss": 2.1122, "step": 2112 }, { "epoch": 0.18798096170099196, "grad_norm": 0.6954486966133118, "learning_rate": 1.83061457641203e-05, "loss": 2.1333, "step": 2113 }, { "epoch": 0.18806992571504827, "grad_norm": 0.7280832529067993, "learning_rate": 1.8304589043946804e-05, "loss": 2.1047, "step": 2114 }, { "epoch": 0.18815888972910458, "grad_norm": 0.7758644819259644, "learning_rate": 1.8303031675011515e-05, "loss": 2.2298, "step": 2115 }, { "epoch": 0.1882478537431609, "grad_norm": 0.7210476398468018, "learning_rate": 1.8301473657436095e-05, "loss": 2.1613, "step": 2116 }, { "epoch": 0.1883368177572172, "grad_norm": 0.7422409653663635, "learning_rate": 1.8299914991342254e-05, "loss": 2.0808, "step": 2117 }, { "epoch": 0.18842578177127353, "grad_norm": 0.7623264789581299, "learning_rate": 1.8298355676851764e-05, "loss": 2.2243, "step": 2118 }, { "epoch": 0.18851474578532984, "grad_norm": 0.7579565048217773, "learning_rate": 1.8296795714086434e-05, "loss": 2.1216, "step": 2119 }, { "epoch": 0.18860370979938615, "grad_norm": 0.7255637049674988, "learning_rate": 1.829523510316813e-05, "loss": 2.1232, "step": 2120 }, { "epoch": 0.18869267381344246, "grad_norm": 0.7768126130104065, "learning_rate": 1.8293673844218775e-05, "loss": 2.3129, "step": 2121 }, { "epoch": 0.18878163782749877, "grad_norm": 0.7062876224517822, "learning_rate": 1.8292111937360325e-05, "loss": 2.0558, "step": 2122 }, { "epoch": 0.1888706018415551, "grad_norm": 0.7025502920150757, "learning_rate": 1.829054938271481e-05, "loss": 1.9205, "step": 2123 }, { "epoch": 0.1889595658556114, "grad_norm": 0.7081987261772156, "learning_rate": 1.8288986180404285e-05, "loss": 2.2188, "step": 2124 }, { "epoch": 0.18904852986966772, "grad_norm": 0.7518102526664734, "learning_rate": 1.8287422330550878e-05, "loss": 2.0556, "step": 2125 }, { "epoch": 0.18913749388372403, "grad_norm": 0.7027660012245178, "learning_rate": 1.8285857833276756e-05, "loss": 2.0148, "step": 2126 }, { "epoch": 0.18922645789778034, "grad_norm": 0.7800981998443604, "learning_rate": 1.8284292688704138e-05, "loss": 2.0019, "step": 2127 }, { "epoch": 0.18931542191183667, "grad_norm": 0.7294033169746399, "learning_rate": 1.8282726896955295e-05, "loss": 2.0659, "step": 2128 }, { "epoch": 0.18940438592589298, "grad_norm": 0.7423782348632812, "learning_rate": 1.828116045815255e-05, "loss": 2.1216, "step": 2129 }, { "epoch": 0.1894933499399493, "grad_norm": 0.730366587638855, "learning_rate": 1.827959337241827e-05, "loss": 2.057, "step": 2130 }, { "epoch": 0.1895823139540056, "grad_norm": 0.7573592662811279, "learning_rate": 1.827802563987488e-05, "loss": 1.967, "step": 2131 }, { "epoch": 0.1896712779680619, "grad_norm": 0.7229348421096802, "learning_rate": 1.827645726064485e-05, "loss": 1.9861, "step": 2132 }, { "epoch": 0.18976024198211824, "grad_norm": 0.7455100417137146, "learning_rate": 1.827488823485071e-05, "loss": 2.2631, "step": 2133 }, { "epoch": 0.18984920599617455, "grad_norm": 0.7316442131996155, "learning_rate": 1.8273318562615027e-05, "loss": 2.1154, "step": 2134 }, { "epoch": 0.18993817001023086, "grad_norm": 0.7868925929069519, "learning_rate": 1.827174824406043e-05, "loss": 2.0532, "step": 2135 }, { "epoch": 0.19002713402428717, "grad_norm": 0.7164347171783447, "learning_rate": 1.827017727930959e-05, "loss": 2.1691, "step": 2136 }, { "epoch": 0.19011609803834348, "grad_norm": 0.7586401700973511, "learning_rate": 1.826860566848523e-05, "loss": 2.0139, "step": 2137 }, { "epoch": 0.19020506205239981, "grad_norm": 0.761176586151123, "learning_rate": 1.8267033411710132e-05, "loss": 2.0618, "step": 2138 }, { "epoch": 0.19029402606645612, "grad_norm": 0.8018330335617065, "learning_rate": 1.8265460509107114e-05, "loss": 2.1487, "step": 2139 }, { "epoch": 0.19038299008051243, "grad_norm": 0.7229007482528687, "learning_rate": 1.8263886960799062e-05, "loss": 2.1218, "step": 2140 }, { "epoch": 0.19047195409456874, "grad_norm": 0.7522724866867065, "learning_rate": 1.8262312766908898e-05, "loss": 2.0609, "step": 2141 }, { "epoch": 0.19056091810862505, "grad_norm": 0.7667751312255859, "learning_rate": 1.8260737927559598e-05, "loss": 1.9292, "step": 2142 }, { "epoch": 0.19064988212268139, "grad_norm": 0.7181320786476135, "learning_rate": 1.825916244287419e-05, "loss": 2.2225, "step": 2143 }, { "epoch": 0.1907388461367377, "grad_norm": 0.7324398159980774, "learning_rate": 1.8257586312975758e-05, "loss": 2.0173, "step": 2144 }, { "epoch": 0.190827810150794, "grad_norm": 0.719336986541748, "learning_rate": 1.8256009537987424e-05, "loss": 2.1789, "step": 2145 }, { "epoch": 0.1909167741648503, "grad_norm": 0.7633517384529114, "learning_rate": 1.825443211803237e-05, "loss": 2.1408, "step": 2146 }, { "epoch": 0.19100573817890662, "grad_norm": 0.7911515235900879, "learning_rate": 1.825285405323383e-05, "loss": 2.2164, "step": 2147 }, { "epoch": 0.19109470219296296, "grad_norm": 0.7910372614860535, "learning_rate": 1.8251275343715075e-05, "loss": 2.2683, "step": 2148 }, { "epoch": 0.19118366620701926, "grad_norm": 0.824871301651001, "learning_rate": 1.8249695989599433e-05, "loss": 2.0175, "step": 2149 }, { "epoch": 0.19127263022107557, "grad_norm": 0.8306604623794556, "learning_rate": 1.8248115991010296e-05, "loss": 2.1806, "step": 2150 }, { "epoch": 0.19136159423513188, "grad_norm": 0.6689299941062927, "learning_rate": 1.824653534807109e-05, "loss": 2.0515, "step": 2151 }, { "epoch": 0.1914505582491882, "grad_norm": 0.8022832274436951, "learning_rate": 1.8244954060905294e-05, "loss": 2.0564, "step": 2152 }, { "epoch": 0.19153952226324453, "grad_norm": 0.7581310272216797, "learning_rate": 1.8243372129636447e-05, "loss": 1.9507, "step": 2153 }, { "epoch": 0.19162848627730084, "grad_norm": 0.7916885614395142, "learning_rate": 1.8241789554388118e-05, "loss": 2.0495, "step": 2154 }, { "epoch": 0.19171745029135714, "grad_norm": 0.7129745483398438, "learning_rate": 1.8240206335283947e-05, "loss": 2.0925, "step": 2155 }, { "epoch": 0.19180641430541345, "grad_norm": 0.8071553111076355, "learning_rate": 1.8238622472447617e-05, "loss": 2.101, "step": 2156 }, { "epoch": 0.19189537831946976, "grad_norm": 0.8280145525932312, "learning_rate": 1.823703796600286e-05, "loss": 2.0314, "step": 2157 }, { "epoch": 0.1919843423335261, "grad_norm": 0.6875373125076294, "learning_rate": 1.8235452816073456e-05, "loss": 1.9566, "step": 2158 }, { "epoch": 0.1920733063475824, "grad_norm": 0.6827114224433899, "learning_rate": 1.8233867022783243e-05, "loss": 2.215, "step": 2159 }, { "epoch": 0.19216227036163872, "grad_norm": 0.7586964964866638, "learning_rate": 1.82322805862561e-05, "loss": 2.1334, "step": 2160 }, { "epoch": 0.19225123437569502, "grad_norm": 0.7525504231452942, "learning_rate": 1.8230693506615965e-05, "loss": 2.0065, "step": 2161 }, { "epoch": 0.19234019838975133, "grad_norm": 0.7927106022834778, "learning_rate": 1.822910578398682e-05, "loss": 2.1167, "step": 2162 }, { "epoch": 0.19242916240380767, "grad_norm": 0.7581115365028381, "learning_rate": 1.82275174184927e-05, "loss": 2.0724, "step": 2163 }, { "epoch": 0.19251812641786398, "grad_norm": 0.763451337814331, "learning_rate": 1.822592841025769e-05, "loss": 2.311, "step": 2164 }, { "epoch": 0.19260709043192029, "grad_norm": 0.7517302632331848, "learning_rate": 1.822433875940592e-05, "loss": 2.1078, "step": 2165 }, { "epoch": 0.1926960544459766, "grad_norm": 0.7358631491661072, "learning_rate": 1.822274846606158e-05, "loss": 2.139, "step": 2166 }, { "epoch": 0.1927850184600329, "grad_norm": 0.7151654362678528, "learning_rate": 1.8221157530348904e-05, "loss": 2.0972, "step": 2167 }, { "epoch": 0.19287398247408924, "grad_norm": 0.7208395600318909, "learning_rate": 1.821956595239218e-05, "loss": 2.086, "step": 2168 }, { "epoch": 0.19296294648814555, "grad_norm": 0.7754930853843689, "learning_rate": 1.8217973732315735e-05, "loss": 2.0987, "step": 2169 }, { "epoch": 0.19305191050220186, "grad_norm": 0.7317039370536804, "learning_rate": 1.8216380870243963e-05, "loss": 2.1341, "step": 2170 }, { "epoch": 0.19314087451625817, "grad_norm": 0.7459531426429749, "learning_rate": 1.8214787366301297e-05, "loss": 1.9928, "step": 2171 }, { "epoch": 0.1932298385303145, "grad_norm": 0.7220882177352905, "learning_rate": 1.8213193220612223e-05, "loss": 2.0186, "step": 2172 }, { "epoch": 0.1933188025443708, "grad_norm": 0.7332236170768738, "learning_rate": 1.8211598433301278e-05, "loss": 2.1364, "step": 2173 }, { "epoch": 0.19340776655842712, "grad_norm": 0.7779446840286255, "learning_rate": 1.8210003004493044e-05, "loss": 2.0137, "step": 2174 }, { "epoch": 0.19349673057248343, "grad_norm": 0.7737859487533569, "learning_rate": 1.820840693431216e-05, "loss": 1.9768, "step": 2175 }, { "epoch": 0.19358569458653974, "grad_norm": 0.7356739044189453, "learning_rate": 1.8206810222883317e-05, "loss": 2.1709, "step": 2176 }, { "epoch": 0.19367465860059607, "grad_norm": 0.7163736820220947, "learning_rate": 1.8205212870331245e-05, "loss": 1.9595, "step": 2177 }, { "epoch": 0.19376362261465238, "grad_norm": 0.7441657185554504, "learning_rate": 1.8203614876780732e-05, "loss": 2.2272, "step": 2178 }, { "epoch": 0.1938525866287087, "grad_norm": 0.6907464265823364, "learning_rate": 1.8202016242356618e-05, "loss": 2.0931, "step": 2179 }, { "epoch": 0.193941550642765, "grad_norm": 0.753607988357544, "learning_rate": 1.8200416967183785e-05, "loss": 2.172, "step": 2180 }, { "epoch": 0.1940305146568213, "grad_norm": 0.8176886439323425, "learning_rate": 1.819881705138717e-05, "loss": 1.9563, "step": 2181 }, { "epoch": 0.19411947867087764, "grad_norm": 0.7674827575683594, "learning_rate": 1.8197216495091767e-05, "loss": 2.0113, "step": 2182 }, { "epoch": 0.19420844268493395, "grad_norm": 0.7029126286506653, "learning_rate": 1.8195615298422606e-05, "loss": 2.017, "step": 2183 }, { "epoch": 0.19429740669899026, "grad_norm": 0.765516459941864, "learning_rate": 1.8194013461504774e-05, "loss": 2.1312, "step": 2184 }, { "epoch": 0.19438637071304657, "grad_norm": 0.7445728778839111, "learning_rate": 1.819241098446341e-05, "loss": 2.0534, "step": 2185 }, { "epoch": 0.19447533472710288, "grad_norm": 0.7007355093955994, "learning_rate": 1.81908078674237e-05, "loss": 2.139, "step": 2186 }, { "epoch": 0.19456429874115921, "grad_norm": 0.81076979637146, "learning_rate": 1.8189204110510884e-05, "loss": 2.0106, "step": 2187 }, { "epoch": 0.19465326275521552, "grad_norm": 0.9036375284194946, "learning_rate": 1.8187599713850243e-05, "loss": 2.09, "step": 2188 }, { "epoch": 0.19474222676927183, "grad_norm": 0.75246661901474, "learning_rate": 1.818599467756712e-05, "loss": 2.0266, "step": 2189 }, { "epoch": 0.19483119078332814, "grad_norm": 0.7718597054481506, "learning_rate": 1.8184389001786895e-05, "loss": 2.0303, "step": 2190 }, { "epoch": 0.19492015479738445, "grad_norm": 0.7287378311157227, "learning_rate": 1.8182782686635013e-05, "loss": 2.1759, "step": 2191 }, { "epoch": 0.19500911881144078, "grad_norm": 0.7947046160697937, "learning_rate": 1.8181175732236957e-05, "loss": 1.9177, "step": 2192 }, { "epoch": 0.1950980828254971, "grad_norm": 0.7834987044334412, "learning_rate": 1.8179568138718257e-05, "loss": 2.0793, "step": 2193 }, { "epoch": 0.1951870468395534, "grad_norm": 0.7171180248260498, "learning_rate": 1.8177959906204513e-05, "loss": 2.1872, "step": 2194 }, { "epoch": 0.1952760108536097, "grad_norm": 0.7793415188789368, "learning_rate": 1.8176351034821352e-05, "loss": 2.1427, "step": 2195 }, { "epoch": 0.19536497486766602, "grad_norm": 0.7188016772270203, "learning_rate": 1.817474152469446e-05, "loss": 2.0571, "step": 2196 }, { "epoch": 0.19545393888172236, "grad_norm": 0.7348620891571045, "learning_rate": 1.817313137594958e-05, "loss": 2.1281, "step": 2197 }, { "epoch": 0.19554290289577866, "grad_norm": 0.7179164886474609, "learning_rate": 1.817152058871249e-05, "loss": 2.13, "step": 2198 }, { "epoch": 0.19563186690983497, "grad_norm": 0.6666889786720276, "learning_rate": 1.8169909163109037e-05, "loss": 1.9474, "step": 2199 }, { "epoch": 0.19572083092389128, "grad_norm": 0.753353476524353, "learning_rate": 1.8168297099265094e-05, "loss": 2.0625, "step": 2200 }, { "epoch": 0.1958097949379476, "grad_norm": 0.7233378887176514, "learning_rate": 1.8166684397306607e-05, "loss": 2.0136, "step": 2201 }, { "epoch": 0.19589875895200393, "grad_norm": 0.7174249887466431, "learning_rate": 1.8165071057359557e-05, "loss": 2.0743, "step": 2202 }, { "epoch": 0.19598772296606023, "grad_norm": 1.4288454055786133, "learning_rate": 1.816345707954998e-05, "loss": 2.1428, "step": 2203 }, { "epoch": 0.19607668698011654, "grad_norm": 0.7448567748069763, "learning_rate": 1.8161842464003965e-05, "loss": 2.0614, "step": 2204 }, { "epoch": 0.19616565099417285, "grad_norm": 0.7051004767417908, "learning_rate": 1.8160227210847642e-05, "loss": 2.186, "step": 2205 }, { "epoch": 0.19625461500822916, "grad_norm": 0.7209646701812744, "learning_rate": 1.81586113202072e-05, "loss": 2.1272, "step": 2206 }, { "epoch": 0.1963435790222855, "grad_norm": 0.6927865743637085, "learning_rate": 1.8156994792208868e-05, "loss": 2.1162, "step": 2207 }, { "epoch": 0.1964325430363418, "grad_norm": 0.7197739481925964, "learning_rate": 1.8155377626978934e-05, "loss": 2.0727, "step": 2208 }, { "epoch": 0.19652150705039811, "grad_norm": 0.7035665512084961, "learning_rate": 1.8153759824643735e-05, "loss": 1.9936, "step": 2209 }, { "epoch": 0.19661047106445442, "grad_norm": 0.7248867154121399, "learning_rate": 1.8152141385329654e-05, "loss": 2.1462, "step": 2210 }, { "epoch": 0.19669943507851073, "grad_norm": 0.714458703994751, "learning_rate": 1.8150522309163125e-05, "loss": 1.9628, "step": 2211 }, { "epoch": 0.19678839909256707, "grad_norm": 0.753125786781311, "learning_rate": 1.814890259627063e-05, "loss": 2.0382, "step": 2212 }, { "epoch": 0.19687736310662338, "grad_norm": 0.7153069376945496, "learning_rate": 1.8147282246778698e-05, "loss": 2.1014, "step": 2213 }, { "epoch": 0.19696632712067969, "grad_norm": 0.7221390008926392, "learning_rate": 1.814566126081392e-05, "loss": 2.0143, "step": 2214 }, { "epoch": 0.197055291134736, "grad_norm": 0.7878203392028809, "learning_rate": 1.814403963850293e-05, "loss": 1.9553, "step": 2215 }, { "epoch": 0.1971442551487923, "grad_norm": 0.7212650775909424, "learning_rate": 1.8142417379972405e-05, "loss": 2.0352, "step": 2216 }, { "epoch": 0.19723321916284864, "grad_norm": 0.7349398732185364, "learning_rate": 1.8140794485349077e-05, "loss": 1.9693, "step": 2217 }, { "epoch": 0.19732218317690495, "grad_norm": 0.7072976231575012, "learning_rate": 1.8139170954759728e-05, "loss": 2.0427, "step": 2218 }, { "epoch": 0.19741114719096126, "grad_norm": 0.735213041305542, "learning_rate": 1.8137546788331192e-05, "loss": 2.0162, "step": 2219 }, { "epoch": 0.19750011120501756, "grad_norm": 0.7231016159057617, "learning_rate": 1.813592198619035e-05, "loss": 2.1026, "step": 2220 }, { "epoch": 0.19758907521907387, "grad_norm": 0.9179976582527161, "learning_rate": 1.8134296548464136e-05, "loss": 2.067, "step": 2221 }, { "epoch": 0.1976780392331302, "grad_norm": 0.7928861379623413, "learning_rate": 1.8132670475279522e-05, "loss": 2.1027, "step": 2222 }, { "epoch": 0.19776700324718652, "grad_norm": 0.7634360790252686, "learning_rate": 1.813104376676355e-05, "loss": 1.9809, "step": 2223 }, { "epoch": 0.19785596726124283, "grad_norm": 0.7589354515075684, "learning_rate": 1.8129416423043287e-05, "loss": 2.2864, "step": 2224 }, { "epoch": 0.19794493127529914, "grad_norm": 0.8142014145851135, "learning_rate": 1.812778844424587e-05, "loss": 2.0984, "step": 2225 }, { "epoch": 0.19803389528935544, "grad_norm": 0.7325040102005005, "learning_rate": 1.812615983049848e-05, "loss": 1.9662, "step": 2226 }, { "epoch": 0.19812285930341178, "grad_norm": 0.7087334394454956, "learning_rate": 1.8124530581928343e-05, "loss": 1.9485, "step": 2227 }, { "epoch": 0.1982118233174681, "grad_norm": 0.7379799485206604, "learning_rate": 1.812290069866273e-05, "loss": 2.0432, "step": 2228 }, { "epoch": 0.1983007873315244, "grad_norm": 0.75636225938797, "learning_rate": 1.8121270180828985e-05, "loss": 2.1122, "step": 2229 }, { "epoch": 0.1983897513455807, "grad_norm": 0.7488731145858765, "learning_rate": 1.8119639028554475e-05, "loss": 2.134, "step": 2230 }, { "epoch": 0.19847871535963701, "grad_norm": 0.7486355304718018, "learning_rate": 1.8118007241966628e-05, "loss": 2.0741, "step": 2231 }, { "epoch": 0.19856767937369335, "grad_norm": 0.8399962782859802, "learning_rate": 1.8116374821192917e-05, "loss": 1.966, "step": 2232 }, { "epoch": 0.19865664338774966, "grad_norm": 0.7797385454177856, "learning_rate": 1.8114741766360875e-05, "loss": 2.176, "step": 2233 }, { "epoch": 0.19874560740180597, "grad_norm": 0.7697577476501465, "learning_rate": 1.8113108077598077e-05, "loss": 2.17, "step": 2234 }, { "epoch": 0.19883457141586228, "grad_norm": 0.7305457592010498, "learning_rate": 1.8111473755032142e-05, "loss": 2.2541, "step": 2235 }, { "epoch": 0.19892353542991859, "grad_norm": 0.7676687240600586, "learning_rate": 1.8109838798790753e-05, "loss": 2.1598, "step": 2236 }, { "epoch": 0.19901249944397492, "grad_norm": 0.7123486399650574, "learning_rate": 1.8108203209001628e-05, "loss": 2.05, "step": 2237 }, { "epoch": 0.19910146345803123, "grad_norm": 0.7144032716751099, "learning_rate": 1.8106566985792544e-05, "loss": 2.1911, "step": 2238 }, { "epoch": 0.19919042747208754, "grad_norm": 0.7907541394233704, "learning_rate": 1.810493012929132e-05, "loss": 2.2045, "step": 2239 }, { "epoch": 0.19927939148614385, "grad_norm": 0.7238190770149231, "learning_rate": 1.810329263962584e-05, "loss": 2.2175, "step": 2240 }, { "epoch": 0.19936835550020016, "grad_norm": 0.7524806261062622, "learning_rate": 1.810165451692401e-05, "loss": 1.9976, "step": 2241 }, { "epoch": 0.1994573195142565, "grad_norm": 0.7729364037513733, "learning_rate": 1.8100015761313813e-05, "loss": 2.0594, "step": 2242 }, { "epoch": 0.1995462835283128, "grad_norm": 0.7671797275543213, "learning_rate": 1.8098376372923264e-05, "loss": 2.1386, "step": 2243 }, { "epoch": 0.1996352475423691, "grad_norm": 0.7543330192565918, "learning_rate": 1.809673635188044e-05, "loss": 2.2023, "step": 2244 }, { "epoch": 0.19972421155642542, "grad_norm": 1.3953800201416016, "learning_rate": 1.809509569831345e-05, "loss": 2.0617, "step": 2245 }, { "epoch": 0.19981317557048173, "grad_norm": 0.7541524171829224, "learning_rate": 1.809345441235047e-05, "loss": 2.0326, "step": 2246 }, { "epoch": 0.19990213958453806, "grad_norm": 0.755070149898529, "learning_rate": 1.8091812494119727e-05, "loss": 1.977, "step": 2247 }, { "epoch": 0.19999110359859437, "grad_norm": 0.7555477619171143, "learning_rate": 1.8090169943749477e-05, "loss": 2.1461, "step": 2248 }, { "epoch": 0.20008006761265068, "grad_norm": 0.7238770723342896, "learning_rate": 1.808852676136804e-05, "loss": 2.0286, "step": 2249 }, { "epoch": 0.200169031626707, "grad_norm": 0.6943899393081665, "learning_rate": 1.8086882947103787e-05, "loss": 2.1216, "step": 2250 }, { "epoch": 0.2002579956407633, "grad_norm": 0.7399301528930664, "learning_rate": 1.8085238501085127e-05, "loss": 1.9737, "step": 2251 }, { "epoch": 0.20034695965481963, "grad_norm": 0.8017804622650146, "learning_rate": 1.8083593423440534e-05, "loss": 2.1018, "step": 2252 }, { "epoch": 0.20043592366887594, "grad_norm": 0.7284921407699585, "learning_rate": 1.8081947714298516e-05, "loss": 2.1887, "step": 2253 }, { "epoch": 0.20052488768293225, "grad_norm": 0.7191417217254639, "learning_rate": 1.8080301373787643e-05, "loss": 2.0676, "step": 2254 }, { "epoch": 0.20061385169698856, "grad_norm": 0.710736095905304, "learning_rate": 1.8078654402036526e-05, "loss": 2.1072, "step": 2255 }, { "epoch": 0.20070281571104487, "grad_norm": 0.7189115285873413, "learning_rate": 1.8077006799173827e-05, "loss": 2.0067, "step": 2256 }, { "epoch": 0.2007917797251012, "grad_norm": 0.7840230464935303, "learning_rate": 1.807535856532826e-05, "loss": 2.0834, "step": 2257 }, { "epoch": 0.2008807437391575, "grad_norm": 0.7374019622802734, "learning_rate": 1.807370970062858e-05, "loss": 2.1566, "step": 2258 }, { "epoch": 0.20096970775321382, "grad_norm": 0.7476488351821899, "learning_rate": 1.8072060205203604e-05, "loss": 1.976, "step": 2259 }, { "epoch": 0.20105867176727013, "grad_norm": 0.7184441089630127, "learning_rate": 1.8070410079182198e-05, "loss": 2.1008, "step": 2260 }, { "epoch": 0.20114763578132647, "grad_norm": 0.767291784286499, "learning_rate": 1.8068759322693258e-05, "loss": 2.1431, "step": 2261 }, { "epoch": 0.20123659979538278, "grad_norm": 0.7106056213378906, "learning_rate": 1.8067107935865752e-05, "loss": 2.0672, "step": 2262 }, { "epoch": 0.20132556380943908, "grad_norm": 0.7268100380897522, "learning_rate": 1.806545591882868e-05, "loss": 1.9841, "step": 2263 }, { "epoch": 0.2014145278234954, "grad_norm": 0.7567141056060791, "learning_rate": 1.806380327171111e-05, "loss": 2.2254, "step": 2264 }, { "epoch": 0.2015034918375517, "grad_norm": 0.7048068046569824, "learning_rate": 1.806214999464214e-05, "loss": 2.0391, "step": 2265 }, { "epoch": 0.20159245585160804, "grad_norm": 0.7185144424438477, "learning_rate": 1.8060496087750926e-05, "loss": 1.9601, "step": 2266 }, { "epoch": 0.20168141986566435, "grad_norm": 0.7298046350479126, "learning_rate": 1.8058841551166676e-05, "loss": 2.1313, "step": 2267 }, { "epoch": 0.20177038387972066, "grad_norm": 0.7284688949584961, "learning_rate": 1.805718638501864e-05, "loss": 2.0824, "step": 2268 }, { "epoch": 0.20185934789377696, "grad_norm": 0.7986564636230469, "learning_rate": 1.8055530589436122e-05, "loss": 2.119, "step": 2269 }, { "epoch": 0.20194831190783327, "grad_norm": 0.7779829502105713, "learning_rate": 1.805387416454848e-05, "loss": 2.0624, "step": 2270 }, { "epoch": 0.2020372759218896, "grad_norm": 0.7167750597000122, "learning_rate": 1.8052217110485104e-05, "loss": 2.0948, "step": 2271 }, { "epoch": 0.20212623993594592, "grad_norm": 0.7124595046043396, "learning_rate": 1.8050559427375455e-05, "loss": 2.0361, "step": 2272 }, { "epoch": 0.20221520395000223, "grad_norm": 0.7592333555221558, "learning_rate": 1.804890111534903e-05, "loss": 2.0946, "step": 2273 }, { "epoch": 0.20230416796405853, "grad_norm": 0.8425976634025574, "learning_rate": 1.8047242174535374e-05, "loss": 2.0474, "step": 2274 }, { "epoch": 0.20239313197811484, "grad_norm": 0.7282816171646118, "learning_rate": 1.804558260506409e-05, "loss": 1.9423, "step": 2275 }, { "epoch": 0.20248209599217118, "grad_norm": 0.7455686330795288, "learning_rate": 1.804392240706482e-05, "loss": 2.2573, "step": 2276 }, { "epoch": 0.2025710600062275, "grad_norm": 0.8618271350860596, "learning_rate": 1.8042261580667264e-05, "loss": 2.1233, "step": 2277 }, { "epoch": 0.2026600240202838, "grad_norm": 0.7810195088386536, "learning_rate": 1.8040600126001163e-05, "loss": 1.9051, "step": 2278 }, { "epoch": 0.2027489880343401, "grad_norm": 0.6963299512863159, "learning_rate": 1.803893804319632e-05, "loss": 1.9474, "step": 2279 }, { "epoch": 0.2028379520483964, "grad_norm": 0.7719687223434448, "learning_rate": 1.803727533238257e-05, "loss": 2.105, "step": 2280 }, { "epoch": 0.20292691606245275, "grad_norm": 0.7455959320068359, "learning_rate": 1.8035611993689805e-05, "loss": 2.1497, "step": 2281 }, { "epoch": 0.20301588007650906, "grad_norm": 0.8464915156364441, "learning_rate": 1.803394802724797e-05, "loss": 1.8759, "step": 2282 }, { "epoch": 0.20310484409056537, "grad_norm": 0.7682441473007202, "learning_rate": 1.8032283433187056e-05, "loss": 2.1567, "step": 2283 }, { "epoch": 0.20319380810462168, "grad_norm": 0.732213020324707, "learning_rate": 1.80306182116371e-05, "loss": 2.0421, "step": 2284 }, { "epoch": 0.20328277211867798, "grad_norm": 0.7447060942649841, "learning_rate": 1.8028952362728197e-05, "loss": 2.2534, "step": 2285 }, { "epoch": 0.20337173613273432, "grad_norm": 0.8191580772399902, "learning_rate": 1.8027285886590475e-05, "loss": 2.1752, "step": 2286 }, { "epoch": 0.20346070014679063, "grad_norm": 0.7398326992988586, "learning_rate": 1.8025618783354123e-05, "loss": 2.0579, "step": 2287 }, { "epoch": 0.20354966416084694, "grad_norm": 0.7366246581077576, "learning_rate": 1.8023951053149384e-05, "loss": 2.0705, "step": 2288 }, { "epoch": 0.20363862817490325, "grad_norm": 0.8348613977432251, "learning_rate": 1.8022282696106536e-05, "loss": 2.0301, "step": 2289 }, { "epoch": 0.20372759218895956, "grad_norm": 0.7894786596298218, "learning_rate": 1.8020613712355915e-05, "loss": 1.958, "step": 2290 }, { "epoch": 0.2038165562030159, "grad_norm": 0.8159781098365784, "learning_rate": 1.80189441020279e-05, "loss": 2.1502, "step": 2291 }, { "epoch": 0.2039055202170722, "grad_norm": 0.7401807308197021, "learning_rate": 1.8017273865252925e-05, "loss": 2.0858, "step": 2292 }, { "epoch": 0.2039944842311285, "grad_norm": 0.7309886813163757, "learning_rate": 1.801560300216147e-05, "loss": 2.0389, "step": 2293 }, { "epoch": 0.20408344824518482, "grad_norm": 0.758003830909729, "learning_rate": 1.8013931512884066e-05, "loss": 2.0082, "step": 2294 }, { "epoch": 0.20417241225924113, "grad_norm": 0.7481549382209778, "learning_rate": 1.8012259397551287e-05, "loss": 2.1098, "step": 2295 }, { "epoch": 0.20426137627329746, "grad_norm": 0.7673386335372925, "learning_rate": 1.8010586656293764e-05, "loss": 2.0889, "step": 2296 }, { "epoch": 0.20435034028735377, "grad_norm": 0.7797995805740356, "learning_rate": 1.8008913289242173e-05, "loss": 2.1047, "step": 2297 }, { "epoch": 0.20443930430141008, "grad_norm": 0.7464137673377991, "learning_rate": 1.8007239296527234e-05, "loss": 2.0284, "step": 2298 }, { "epoch": 0.2045282683154664, "grad_norm": 0.7340493202209473, "learning_rate": 1.800556467827973e-05, "loss": 2.071, "step": 2299 }, { "epoch": 0.2046172323295227, "grad_norm": 0.7217735648155212, "learning_rate": 1.8003889434630473e-05, "loss": 2.069, "step": 2300 }, { "epoch": 0.20470619634357903, "grad_norm": 0.7397622466087341, "learning_rate": 1.800221356571034e-05, "loss": 2.0124, "step": 2301 }, { "epoch": 0.20479516035763534, "grad_norm": 0.7743487358093262, "learning_rate": 1.8000537071650256e-05, "loss": 1.9323, "step": 2302 }, { "epoch": 0.20488412437169165, "grad_norm": 0.7376651763916016, "learning_rate": 1.799885995258118e-05, "loss": 2.0496, "step": 2303 }, { "epoch": 0.20497308838574796, "grad_norm": 0.7325171828269958, "learning_rate": 1.7997182208634137e-05, "loss": 2.1613, "step": 2304 }, { "epoch": 0.20506205239980427, "grad_norm": 0.7556714415550232, "learning_rate": 1.7995503839940194e-05, "loss": 2.1581, "step": 2305 }, { "epoch": 0.2051510164138606, "grad_norm": 0.7820920348167419, "learning_rate": 1.7993824846630464e-05, "loss": 2.1223, "step": 2306 }, { "epoch": 0.2052399804279169, "grad_norm": 0.7620216012001038, "learning_rate": 1.7992145228836108e-05, "loss": 2.2376, "step": 2307 }, { "epoch": 0.20532894444197322, "grad_norm": 0.7632612586021423, "learning_rate": 1.7990464986688346e-05, "loss": 1.9742, "step": 2308 }, { "epoch": 0.20541790845602953, "grad_norm": 0.733502209186554, "learning_rate": 1.7988784120318437e-05, "loss": 1.9119, "step": 2309 }, { "epoch": 0.20550687247008584, "grad_norm": 0.7716120481491089, "learning_rate": 1.7987102629857692e-05, "loss": 2.0186, "step": 2310 }, { "epoch": 0.20559583648414217, "grad_norm": 0.784528374671936, "learning_rate": 1.7985420515437472e-05, "loss": 2.205, "step": 2311 }, { "epoch": 0.20568480049819848, "grad_norm": 0.7187865972518921, "learning_rate": 1.7983737777189182e-05, "loss": 1.9044, "step": 2312 }, { "epoch": 0.2057737645122548, "grad_norm": 0.7409180998802185, "learning_rate": 1.798205441524428e-05, "loss": 1.9772, "step": 2313 }, { "epoch": 0.2058627285263111, "grad_norm": 0.7581562995910645, "learning_rate": 1.7980370429734274e-05, "loss": 2.0967, "step": 2314 }, { "epoch": 0.2059516925403674, "grad_norm": 0.7608269453048706, "learning_rate": 1.7978685820790718e-05, "loss": 2.1137, "step": 2315 }, { "epoch": 0.20604065655442375, "grad_norm": 0.7396580576896667, "learning_rate": 1.797700058854521e-05, "loss": 2.0543, "step": 2316 }, { "epoch": 0.20612962056848005, "grad_norm": 0.6879896521568298, "learning_rate": 1.7975314733129414e-05, "loss": 2.1251, "step": 2317 }, { "epoch": 0.20621858458253636, "grad_norm": 0.7197800278663635, "learning_rate": 1.7973628254675014e-05, "loss": 1.9892, "step": 2318 }, { "epoch": 0.20630754859659267, "grad_norm": 0.7490249276161194, "learning_rate": 1.7971941153313772e-05, "loss": 2.0891, "step": 2319 }, { "epoch": 0.20639651261064898, "grad_norm": 0.7407044172286987, "learning_rate": 1.7970253429177477e-05, "loss": 1.922, "step": 2320 }, { "epoch": 0.20648547662470532, "grad_norm": 0.7748199105262756, "learning_rate": 1.7968565082397983e-05, "loss": 1.9993, "step": 2321 }, { "epoch": 0.20657444063876162, "grad_norm": 0.7430422306060791, "learning_rate": 1.7966876113107183e-05, "loss": 1.9298, "step": 2322 }, { "epoch": 0.20666340465281793, "grad_norm": 0.7413453459739685, "learning_rate": 1.796518652143702e-05, "loss": 2.1816, "step": 2323 }, { "epoch": 0.20675236866687424, "grad_norm": 0.7570335865020752, "learning_rate": 1.7963496307519483e-05, "loss": 2.2593, "step": 2324 }, { "epoch": 0.20684133268093055, "grad_norm": 0.7523745894432068, "learning_rate": 1.796180547148662e-05, "loss": 2.149, "step": 2325 }, { "epoch": 0.2069302966949869, "grad_norm": 0.7065045237541199, "learning_rate": 1.7960114013470513e-05, "loss": 2.121, "step": 2326 }, { "epoch": 0.2070192607090432, "grad_norm": 0.7854615449905396, "learning_rate": 1.7958421933603307e-05, "loss": 1.939, "step": 2327 }, { "epoch": 0.2071082247230995, "grad_norm": 0.7629469633102417, "learning_rate": 1.7956729232017184e-05, "loss": 2.0449, "step": 2328 }, { "epoch": 0.2071971887371558, "grad_norm": 0.7909695506095886, "learning_rate": 1.795503590884438e-05, "loss": 1.9392, "step": 2329 }, { "epoch": 0.20728615275121212, "grad_norm": 0.7536750435829163, "learning_rate": 1.7953341964217183e-05, "loss": 2.0701, "step": 2330 }, { "epoch": 0.20737511676526846, "grad_norm": 0.7511408925056458, "learning_rate": 1.795164739826792e-05, "loss": 1.9626, "step": 2331 }, { "epoch": 0.20746408077932477, "grad_norm": 0.7249951362609863, "learning_rate": 1.7949952211128977e-05, "loss": 2.095, "step": 2332 }, { "epoch": 0.20755304479338108, "grad_norm": 0.746575117111206, "learning_rate": 1.794825640293278e-05, "loss": 2.1728, "step": 2333 }, { "epoch": 0.20764200880743738, "grad_norm": 0.7784445881843567, "learning_rate": 1.7946559973811807e-05, "loss": 2.1077, "step": 2334 }, { "epoch": 0.2077309728214937, "grad_norm": 0.7920525670051575, "learning_rate": 1.7944862923898583e-05, "loss": 2.0554, "step": 2335 }, { "epoch": 0.20781993683555003, "grad_norm": 0.7228875160217285, "learning_rate": 1.794316525332569e-05, "loss": 2.1251, "step": 2336 }, { "epoch": 0.20790890084960634, "grad_norm": 0.793457567691803, "learning_rate": 1.7941466962225743e-05, "loss": 2.0407, "step": 2337 }, { "epoch": 0.20799786486366265, "grad_norm": 0.8384448885917664, "learning_rate": 1.7939768050731422e-05, "loss": 2.0921, "step": 2338 }, { "epoch": 0.20808682887771895, "grad_norm": 0.6929352283477783, "learning_rate": 1.7938068518975443e-05, "loss": 2.1362, "step": 2339 }, { "epoch": 0.20817579289177526, "grad_norm": 0.7608261704444885, "learning_rate": 1.793636836709057e-05, "loss": 2.1314, "step": 2340 }, { "epoch": 0.2082647569058316, "grad_norm": 0.7612526416778564, "learning_rate": 1.793466759520963e-05, "loss": 2.0709, "step": 2341 }, { "epoch": 0.2083537209198879, "grad_norm": 0.7767353057861328, "learning_rate": 1.7932966203465486e-05, "loss": 2.1235, "step": 2342 }, { "epoch": 0.20844268493394422, "grad_norm": 0.7376976013183594, "learning_rate": 1.793126419199105e-05, "loss": 2.0773, "step": 2343 }, { "epoch": 0.20853164894800053, "grad_norm": 0.7758573293685913, "learning_rate": 1.792956156091928e-05, "loss": 2.1574, "step": 2344 }, { "epoch": 0.20862061296205686, "grad_norm": 0.7994840741157532, "learning_rate": 1.79278583103832e-05, "loss": 2.1307, "step": 2345 }, { "epoch": 0.20870957697611317, "grad_norm": 0.859292209148407, "learning_rate": 1.7926154440515858e-05, "loss": 1.9864, "step": 2346 }, { "epoch": 0.20879854099016948, "grad_norm": 0.769569993019104, "learning_rate": 1.7924449951450366e-05, "loss": 2.1622, "step": 2347 }, { "epoch": 0.2088875050042258, "grad_norm": 0.7359813451766968, "learning_rate": 1.792274484331988e-05, "loss": 2.1402, "step": 2348 }, { "epoch": 0.2089764690182821, "grad_norm": 0.7828568816184998, "learning_rate": 1.7921039116257603e-05, "loss": 1.9673, "step": 2349 }, { "epoch": 0.20906543303233843, "grad_norm": 0.7542172074317932, "learning_rate": 1.791933277039679e-05, "loss": 2.0782, "step": 2350 }, { "epoch": 0.20915439704639474, "grad_norm": 0.7541742920875549, "learning_rate": 1.7917625805870745e-05, "loss": 1.9591, "step": 2351 }, { "epoch": 0.20924336106045105, "grad_norm": 0.7631632685661316, "learning_rate": 1.7915918222812808e-05, "loss": 2.0099, "step": 2352 }, { "epoch": 0.20933232507450736, "grad_norm": 0.7919346690177917, "learning_rate": 1.7914210021356387e-05, "loss": 2.0782, "step": 2353 }, { "epoch": 0.20942128908856367, "grad_norm": 0.7607916593551636, "learning_rate": 1.7912501201634924e-05, "loss": 2.1793, "step": 2354 }, { "epoch": 0.20951025310262, "grad_norm": 0.7556031346321106, "learning_rate": 1.791079176378191e-05, "loss": 2.0543, "step": 2355 }, { "epoch": 0.2095992171166763, "grad_norm": 0.7733479738235474, "learning_rate": 1.7909081707930895e-05, "loss": 1.8587, "step": 2356 }, { "epoch": 0.20968818113073262, "grad_norm": 0.7201804518699646, "learning_rate": 1.7907371034215467e-05, "loss": 1.9511, "step": 2357 }, { "epoch": 0.20977714514478893, "grad_norm": 0.7367768287658691, "learning_rate": 1.7905659742769265e-05, "loss": 2.1404, "step": 2358 }, { "epoch": 0.20986610915884524, "grad_norm": 0.7400447130203247, "learning_rate": 1.7903947833725973e-05, "loss": 2.1581, "step": 2359 }, { "epoch": 0.20995507317290157, "grad_norm": 0.7392048239707947, "learning_rate": 1.7902235307219333e-05, "loss": 2.2371, "step": 2360 }, { "epoch": 0.21004403718695788, "grad_norm": 0.7503815293312073, "learning_rate": 1.7900522163383124e-05, "loss": 2.1357, "step": 2361 }, { "epoch": 0.2101330012010142, "grad_norm": 0.7807356715202332, "learning_rate": 1.7898808402351184e-05, "loss": 2.1801, "step": 2362 }, { "epoch": 0.2102219652150705, "grad_norm": 0.7442643642425537, "learning_rate": 1.789709402425739e-05, "loss": 2.002, "step": 2363 }, { "epoch": 0.2103109292291268, "grad_norm": 0.7328593134880066, "learning_rate": 1.7895379029235668e-05, "loss": 2.0367, "step": 2364 }, { "epoch": 0.21039989324318314, "grad_norm": 0.7681505084037781, "learning_rate": 1.789366341742e-05, "loss": 2.1577, "step": 2365 }, { "epoch": 0.21048885725723945, "grad_norm": 0.7399666905403137, "learning_rate": 1.7891947188944406e-05, "loss": 2.0344, "step": 2366 }, { "epoch": 0.21057782127129576, "grad_norm": 0.7388241291046143, "learning_rate": 1.789023034394297e-05, "loss": 2.1328, "step": 2367 }, { "epoch": 0.21066678528535207, "grad_norm": 0.7271691560745239, "learning_rate": 1.7888512882549797e-05, "loss": 2.0624, "step": 2368 }, { "epoch": 0.21075574929940838, "grad_norm": 0.7275827527046204, "learning_rate": 1.788679480489907e-05, "loss": 2.1188, "step": 2369 }, { "epoch": 0.21084471331346472, "grad_norm": 1.121680498123169, "learning_rate": 1.7885076111125e-05, "loss": 2.0202, "step": 2370 }, { "epoch": 0.21093367732752102, "grad_norm": 0.7453483939170837, "learning_rate": 1.7883356801361854e-05, "loss": 2.2465, "step": 2371 }, { "epoch": 0.21102264134157733, "grad_norm": 0.7643547654151917, "learning_rate": 1.788163687574395e-05, "loss": 2.1335, "step": 2372 }, { "epoch": 0.21111160535563364, "grad_norm": 0.7481558918952942, "learning_rate": 1.787991633440564e-05, "loss": 2.0779, "step": 2373 }, { "epoch": 0.21120056936968995, "grad_norm": 0.7154313325881958, "learning_rate": 1.787819517748135e-05, "loss": 1.9624, "step": 2374 }, { "epoch": 0.2112895333837463, "grad_norm": 0.751154899597168, "learning_rate": 1.7876473405105525e-05, "loss": 2.0742, "step": 2375 }, { "epoch": 0.2113784973978026, "grad_norm": 0.7602733373641968, "learning_rate": 1.7874751017412676e-05, "loss": 2.0392, "step": 2376 }, { "epoch": 0.2114674614118589, "grad_norm": 0.7421813011169434, "learning_rate": 1.787302801453736e-05, "loss": 2.0666, "step": 2377 }, { "epoch": 0.2115564254259152, "grad_norm": 0.8586877584457397, "learning_rate": 1.7871304396614172e-05, "loss": 2.2146, "step": 2378 }, { "epoch": 0.21164538943997152, "grad_norm": 0.7398556470870972, "learning_rate": 1.786958016377777e-05, "loss": 2.0294, "step": 2379 }, { "epoch": 0.21173435345402786, "grad_norm": 0.7347520589828491, "learning_rate": 1.7867855316162846e-05, "loss": 2.0026, "step": 2380 }, { "epoch": 0.21182331746808417, "grad_norm": 0.7748924493789673, "learning_rate": 1.7866129853904154e-05, "loss": 2.0842, "step": 2381 }, { "epoch": 0.21191228148214047, "grad_norm": 0.7678163051605225, "learning_rate": 1.7864403777136486e-05, "loss": 2.0533, "step": 2382 }, { "epoch": 0.21200124549619678, "grad_norm": 0.6897913813591003, "learning_rate": 1.7862677085994683e-05, "loss": 2.0121, "step": 2383 }, { "epoch": 0.2120902095102531, "grad_norm": 0.747669517993927, "learning_rate": 1.7860949780613636e-05, "loss": 1.9881, "step": 2384 }, { "epoch": 0.21217917352430943, "grad_norm": 0.7802873253822327, "learning_rate": 1.7859221861128284e-05, "loss": 2.2047, "step": 2385 }, { "epoch": 0.21226813753836574, "grad_norm": 0.757056713104248, "learning_rate": 1.7857493327673615e-05, "loss": 2.3182, "step": 2386 }, { "epoch": 0.21235710155242205, "grad_norm": 0.7974374890327454, "learning_rate": 1.785576418038466e-05, "loss": 2.0605, "step": 2387 }, { "epoch": 0.21244606556647835, "grad_norm": 0.8644691705703735, "learning_rate": 1.7854034419396504e-05, "loss": 2.2487, "step": 2388 }, { "epoch": 0.21253502958053466, "grad_norm": 0.7458727359771729, "learning_rate": 1.785230404484428e-05, "loss": 2.0921, "step": 2389 }, { "epoch": 0.212623993594591, "grad_norm": 0.7764711976051331, "learning_rate": 1.7850573056863156e-05, "loss": 2.2676, "step": 2390 }, { "epoch": 0.2127129576086473, "grad_norm": 0.7715417742729187, "learning_rate": 1.784884145558837e-05, "loss": 2.0355, "step": 2391 }, { "epoch": 0.21280192162270362, "grad_norm": 0.7392106652259827, "learning_rate": 1.7847109241155195e-05, "loss": 1.972, "step": 2392 }, { "epoch": 0.21289088563675992, "grad_norm": 0.7099721431732178, "learning_rate": 1.7845376413698946e-05, "loss": 1.9882, "step": 2393 }, { "epoch": 0.21297984965081623, "grad_norm": 0.7655950784683228, "learning_rate": 1.7843642973355e-05, "loss": 2.0889, "step": 2394 }, { "epoch": 0.21306881366487257, "grad_norm": 0.750331461429596, "learning_rate": 1.784190892025877e-05, "loss": 2.1297, "step": 2395 }, { "epoch": 0.21315777767892888, "grad_norm": 0.7808218002319336, "learning_rate": 1.7840174254545725e-05, "loss": 2.0628, "step": 2396 }, { "epoch": 0.2132467416929852, "grad_norm": 0.758748471736908, "learning_rate": 1.7838438976351374e-05, "loss": 2.1118, "step": 2397 }, { "epoch": 0.2133357057070415, "grad_norm": 0.7616027593612671, "learning_rate": 1.7836703085811288e-05, "loss": 2.0059, "step": 2398 }, { "epoch": 0.2134246697210978, "grad_norm": 0.7204197645187378, "learning_rate": 1.7834966583061067e-05, "loss": 2.1071, "step": 2399 }, { "epoch": 0.21351363373515414, "grad_norm": 0.7476840615272522, "learning_rate": 1.7833229468236367e-05, "loss": 2.0474, "step": 2400 }, { "epoch": 0.21360259774921045, "grad_norm": 0.794748067855835, "learning_rate": 1.7831491741472905e-05, "loss": 2.1335, "step": 2401 }, { "epoch": 0.21369156176326676, "grad_norm": 0.799299418926239, "learning_rate": 1.782975340290642e-05, "loss": 2.0839, "step": 2402 }, { "epoch": 0.21378052577732307, "grad_norm": 0.7506393194198608, "learning_rate": 1.7828014452672718e-05, "loss": 2.1597, "step": 2403 }, { "epoch": 0.21386948979137937, "grad_norm": 0.6996781229972839, "learning_rate": 1.7826274890907653e-05, "loss": 1.9818, "step": 2404 }, { "epoch": 0.2139584538054357, "grad_norm": 0.7256022095680237, "learning_rate": 1.782453471774711e-05, "loss": 1.9338, "step": 2405 }, { "epoch": 0.21404741781949202, "grad_norm": 0.7982615828514099, "learning_rate": 1.7822793933327043e-05, "loss": 2.0831, "step": 2406 }, { "epoch": 0.21413638183354833, "grad_norm": 0.80601966381073, "learning_rate": 1.7821052537783437e-05, "loss": 2.1154, "step": 2407 }, { "epoch": 0.21422534584760464, "grad_norm": 0.7382056713104248, "learning_rate": 1.7819310531252334e-05, "loss": 1.996, "step": 2408 }, { "epoch": 0.21431430986166095, "grad_norm": 0.7119576930999756, "learning_rate": 1.781756791386982e-05, "loss": 2.0615, "step": 2409 }, { "epoch": 0.21440327387571728, "grad_norm": 0.7528584003448486, "learning_rate": 1.7815824685772032e-05, "loss": 2.0152, "step": 2410 }, { "epoch": 0.2144922378897736, "grad_norm": 0.760177493095398, "learning_rate": 1.781408084709515e-05, "loss": 2.0751, "step": 2411 }, { "epoch": 0.2145812019038299, "grad_norm": 0.7331271767616272, "learning_rate": 1.781233639797541e-05, "loss": 2.2108, "step": 2412 }, { "epoch": 0.2146701659178862, "grad_norm": 0.7643139362335205, "learning_rate": 1.7810591338549078e-05, "loss": 2.09, "step": 2413 }, { "epoch": 0.21475912993194252, "grad_norm": 0.7814044952392578, "learning_rate": 1.780884566895249e-05, "loss": 2.101, "step": 2414 }, { "epoch": 0.21484809394599885, "grad_norm": 0.7639012336730957, "learning_rate": 1.780709938932202e-05, "loss": 2.1016, "step": 2415 }, { "epoch": 0.21493705796005516, "grad_norm": 0.7651916146278381, "learning_rate": 1.7805352499794077e-05, "loss": 2.0866, "step": 2416 }, { "epoch": 0.21502602197411147, "grad_norm": 0.8419439792633057, "learning_rate": 1.780360500050514e-05, "loss": 2.0285, "step": 2417 }, { "epoch": 0.21511498598816778, "grad_norm": 0.7349149584770203, "learning_rate": 1.7801856891591725e-05, "loss": 2.0657, "step": 2418 }, { "epoch": 0.2152039500022241, "grad_norm": 0.7283397912979126, "learning_rate": 1.7800108173190392e-05, "loss": 2.25, "step": 2419 }, { "epoch": 0.21529291401628042, "grad_norm": 0.7257175445556641, "learning_rate": 1.7798358845437754e-05, "loss": 2.0573, "step": 2420 }, { "epoch": 0.21538187803033673, "grad_norm": 0.7287864089012146, "learning_rate": 1.779660890847047e-05, "loss": 2.1065, "step": 2421 }, { "epoch": 0.21547084204439304, "grad_norm": 0.7335583567619324, "learning_rate": 1.7794858362425245e-05, "loss": 2.0787, "step": 2422 }, { "epoch": 0.21555980605844935, "grad_norm": 0.7335202693939209, "learning_rate": 1.7793107207438836e-05, "loss": 2.1018, "step": 2423 }, { "epoch": 0.21564877007250566, "grad_norm": 0.7552435398101807, "learning_rate": 1.7791355443648045e-05, "loss": 2.0344, "step": 2424 }, { "epoch": 0.215737734086562, "grad_norm": 0.8239918351173401, "learning_rate": 1.7789603071189716e-05, "loss": 2.1336, "step": 2425 }, { "epoch": 0.2158266981006183, "grad_norm": 0.7392275929450989, "learning_rate": 1.778785009020075e-05, "loss": 2.1326, "step": 2426 }, { "epoch": 0.2159156621146746, "grad_norm": 0.7165732979774475, "learning_rate": 1.7786096500818094e-05, "loss": 2.0248, "step": 2427 }, { "epoch": 0.21600462612873092, "grad_norm": 0.7192463874816895, "learning_rate": 1.7784342303178737e-05, "loss": 2.1153, "step": 2428 }, { "epoch": 0.21609359014278726, "grad_norm": 0.7708304524421692, "learning_rate": 1.7782587497419715e-05, "loss": 2.0723, "step": 2429 }, { "epoch": 0.21618255415684356, "grad_norm": 0.7654867172241211, "learning_rate": 1.7780832083678122e-05, "loss": 2.1657, "step": 2430 }, { "epoch": 0.21627151817089987, "grad_norm": 0.7315652370452881, "learning_rate": 1.7779076062091088e-05, "loss": 2.1893, "step": 2431 }, { "epoch": 0.21636048218495618, "grad_norm": 0.7834140658378601, "learning_rate": 1.7777319432795793e-05, "loss": 2.1145, "step": 2432 }, { "epoch": 0.2164494461990125, "grad_norm": 0.8263528943061829, "learning_rate": 1.7775562195929474e-05, "loss": 2.1006, "step": 2433 }, { "epoch": 0.21653841021306883, "grad_norm": 0.7348493337631226, "learning_rate": 1.77738043516294e-05, "loss": 2.0518, "step": 2434 }, { "epoch": 0.21662737422712514, "grad_norm": 0.7220854759216309, "learning_rate": 1.7772045900032898e-05, "loss": 1.9473, "step": 2435 }, { "epoch": 0.21671633824118144, "grad_norm": 0.7541854977607727, "learning_rate": 1.777028684127734e-05, "loss": 2.1233, "step": 2436 }, { "epoch": 0.21680530225523775, "grad_norm": 0.7490376234054565, "learning_rate": 1.7768527175500146e-05, "loss": 2.073, "step": 2437 }, { "epoch": 0.21689426626929406, "grad_norm": 0.7713085412979126, "learning_rate": 1.7766766902838782e-05, "loss": 2.0832, "step": 2438 }, { "epoch": 0.2169832302833504, "grad_norm": 0.7217466831207275, "learning_rate": 1.7765006023430764e-05, "loss": 2.0231, "step": 2439 }, { "epoch": 0.2170721942974067, "grad_norm": 0.7221043109893799, "learning_rate": 1.776324453741365e-05, "loss": 2.0966, "step": 2440 }, { "epoch": 0.21716115831146302, "grad_norm": 0.7518332600593567, "learning_rate": 1.7761482444925052e-05, "loss": 2.017, "step": 2441 }, { "epoch": 0.21725012232551932, "grad_norm": 0.7476770281791687, "learning_rate": 1.7759719746102623e-05, "loss": 2.0013, "step": 2442 }, { "epoch": 0.21733908633957563, "grad_norm": 0.7386400103569031, "learning_rate": 1.775795644108407e-05, "loss": 2.0768, "step": 2443 }, { "epoch": 0.21742805035363197, "grad_norm": 0.7734381556510925, "learning_rate": 1.7756192530007142e-05, "loss": 2.1892, "step": 2444 }, { "epoch": 0.21751701436768828, "grad_norm": 0.7441405653953552, "learning_rate": 1.7754428013009634e-05, "loss": 2.0316, "step": 2445 }, { "epoch": 0.21760597838174459, "grad_norm": 0.7056363821029663, "learning_rate": 1.7752662890229398e-05, "loss": 1.9907, "step": 2446 }, { "epoch": 0.2176949423958009, "grad_norm": 0.7250891327857971, "learning_rate": 1.7750897161804325e-05, "loss": 2.1385, "step": 2447 }, { "epoch": 0.2177839064098572, "grad_norm": 0.7067894339561462, "learning_rate": 1.7749130827872352e-05, "loss": 2.2149, "step": 2448 }, { "epoch": 0.21787287042391354, "grad_norm": 0.7777976393699646, "learning_rate": 1.774736388857147e-05, "loss": 1.9735, "step": 2449 }, { "epoch": 0.21796183443796985, "grad_norm": 0.7721734642982483, "learning_rate": 1.7745596344039712e-05, "loss": 2.0531, "step": 2450 } ], "logging_steps": 1, "max_steps": 11240, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.69102700724224e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }