{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 426, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002347417840375587, "grad_norm": 6.74153470993042, "learning_rate": 2.325581395348837e-06, "loss": 8.0682, "step": 1 }, { "epoch": 0.004694835680751174, "grad_norm": 5.902026176452637, "learning_rate": 4.651162790697674e-06, "loss": 7.5085, "step": 2 }, { "epoch": 0.007042253521126761, "grad_norm": 7.1663970947265625, "learning_rate": 6.976744186046512e-06, "loss": 8.4092, "step": 3 }, { "epoch": 0.009389671361502348, "grad_norm": 4.929966449737549, "learning_rate": 9.302325581395349e-06, "loss": 6.8045, "step": 4 }, { "epoch": 0.011737089201877934, "grad_norm": 5.453227996826172, "learning_rate": 1.1627906976744187e-05, "loss": 7.5007, "step": 5 }, { "epoch": 0.014084507042253521, "grad_norm": 5.242559909820557, "learning_rate": 1.3953488372093024e-05, "loss": 7.1794, "step": 6 }, { "epoch": 0.01643192488262911, "grad_norm": 5.786001682281494, "learning_rate": 1.6279069767441862e-05, "loss": 7.5557, "step": 7 }, { "epoch": 0.018779342723004695, "grad_norm": 37.475582122802734, "learning_rate": 1.8604651162790697e-05, "loss": 6.7967, "step": 8 }, { "epoch": 0.02112676056338028, "grad_norm": 8.487391471862793, "learning_rate": 2.0930232558139536e-05, "loss": 6.8769, "step": 9 }, { "epoch": 0.023474178403755867, "grad_norm": 7.1259026527404785, "learning_rate": 2.3255813953488374e-05, "loss": 6.6679, "step": 10 }, { "epoch": 0.025821596244131457, "grad_norm": 6.672307014465332, "learning_rate": 2.5581395348837212e-05, "loss": 6.381, "step": 11 }, { "epoch": 0.028169014084507043, "grad_norm": 13.447758674621582, "learning_rate": 2.7906976744186048e-05, "loss": 6.1116, "step": 12 }, { "epoch": 0.03051643192488263, "grad_norm": 5.074902534484863, "learning_rate": 3.0232558139534883e-05, "loss": 5.7065, "step": 13 }, { "epoch": 0.03286384976525822, "grad_norm": 5.343297481536865, "learning_rate": 3.2558139534883724e-05, "loss": 6.07, "step": 14 }, { "epoch": 0.035211267605633804, "grad_norm": 4.750032901763916, "learning_rate": 3.488372093023256e-05, "loss": 5.7021, "step": 15 }, { "epoch": 0.03755868544600939, "grad_norm": 4.497730731964111, "learning_rate": 3.7209302325581394e-05, "loss": 5.2804, "step": 16 }, { "epoch": 0.03990610328638498, "grad_norm": 6.505649566650391, "learning_rate": 3.953488372093023e-05, "loss": 5.7704, "step": 17 }, { "epoch": 0.04225352112676056, "grad_norm": 3.8866803646087646, "learning_rate": 4.186046511627907e-05, "loss": 5.3131, "step": 18 }, { "epoch": 0.04460093896713615, "grad_norm": 4.154186248779297, "learning_rate": 4.418604651162791e-05, "loss": 5.2177, "step": 19 }, { "epoch": 0.046948356807511735, "grad_norm": 3.877696990966797, "learning_rate": 4.651162790697675e-05, "loss": 5.0364, "step": 20 }, { "epoch": 0.04929577464788732, "grad_norm": 4.082431793212891, "learning_rate": 4.883720930232558e-05, "loss": 5.0723, "step": 21 }, { "epoch": 0.051643192488262914, "grad_norm": 4.014376163482666, "learning_rate": 5.1162790697674425e-05, "loss": 5.0463, "step": 22 }, { "epoch": 0.0539906103286385, "grad_norm": 3.7503790855407715, "learning_rate": 5.348837209302326e-05, "loss": 4.8761, "step": 23 }, { "epoch": 0.056338028169014086, "grad_norm": 3.149980306625366, "learning_rate": 5.5813953488372095e-05, "loss": 4.5262, "step": 24 }, { "epoch": 0.05868544600938967, "grad_norm": 3.164466619491577, "learning_rate": 5.8139534883720933e-05, "loss": 4.4134, "step": 25 }, { "epoch": 0.06103286384976526, "grad_norm": 3.258887767791748, "learning_rate": 6.0465116279069765e-05, "loss": 4.4837, "step": 26 }, { "epoch": 0.06338028169014084, "grad_norm": 2.9795877933502197, "learning_rate": 6.27906976744186e-05, "loss": 4.268, "step": 27 }, { "epoch": 0.06572769953051644, "grad_norm": 3.418090581893921, "learning_rate": 6.511627906976745e-05, "loss": 4.1638, "step": 28 }, { "epoch": 0.06807511737089202, "grad_norm": 3.0284407138824463, "learning_rate": 6.744186046511628e-05, "loss": 4.1912, "step": 29 }, { "epoch": 0.07042253521126761, "grad_norm": 2.4764463901519775, "learning_rate": 6.976744186046513e-05, "loss": 3.7192, "step": 30 }, { "epoch": 0.07276995305164319, "grad_norm": 3.433175563812256, "learning_rate": 7.209302325581396e-05, "loss": 4.0516, "step": 31 }, { "epoch": 0.07511737089201878, "grad_norm": 3.6180222034454346, "learning_rate": 7.441860465116279e-05, "loss": 4.1871, "step": 32 }, { "epoch": 0.07746478873239436, "grad_norm": 9.763015747070312, "learning_rate": 7.674418604651163e-05, "loss": 3.9995, "step": 33 }, { "epoch": 0.07981220657276995, "grad_norm": 3.139212131500244, "learning_rate": 7.906976744186047e-05, "loss": 3.7272, "step": 34 }, { "epoch": 0.08215962441314555, "grad_norm": 2.6827597618103027, "learning_rate": 8.139534883720931e-05, "loss": 3.751, "step": 35 }, { "epoch": 0.08450704225352113, "grad_norm": 2.9310972690582275, "learning_rate": 8.372093023255814e-05, "loss": 4.0055, "step": 36 }, { "epoch": 0.08685446009389672, "grad_norm": 2.362100839614868, "learning_rate": 8.604651162790697e-05, "loss": 3.6492, "step": 37 }, { "epoch": 0.0892018779342723, "grad_norm": 2.7767703533172607, "learning_rate": 8.837209302325582e-05, "loss": 3.8801, "step": 38 }, { "epoch": 0.09154929577464789, "grad_norm": 2.6525282859802246, "learning_rate": 9.069767441860465e-05, "loss": 3.5582, "step": 39 }, { "epoch": 0.09389671361502347, "grad_norm": 2.6659605503082275, "learning_rate": 9.30232558139535e-05, "loss": 3.7146, "step": 40 }, { "epoch": 0.09624413145539906, "grad_norm": 2.707975149154663, "learning_rate": 9.534883720930233e-05, "loss": 3.59, "step": 41 }, { "epoch": 0.09859154929577464, "grad_norm": 2.599092721939087, "learning_rate": 9.767441860465116e-05, "loss": 3.4901, "step": 42 }, { "epoch": 0.10093896713615023, "grad_norm": 2.611809492111206, "learning_rate": 0.0001, "loss": 3.7515, "step": 43 }, { "epoch": 0.10328638497652583, "grad_norm": 2.4812469482421875, "learning_rate": 9.999962299929265e-05, "loss": 3.6521, "step": 44 }, { "epoch": 0.1056338028169014, "grad_norm": 2.396311044692993, "learning_rate": 9.999849200285573e-05, "loss": 3.6281, "step": 45 }, { "epoch": 0.107981220657277, "grad_norm": 2.4779536724090576, "learning_rate": 9.999660702774474e-05, "loss": 3.5034, "step": 46 }, { "epoch": 0.11032863849765258, "grad_norm": 2.3948028087615967, "learning_rate": 9.999396810238513e-05, "loss": 3.5579, "step": 47 }, { "epoch": 0.11267605633802817, "grad_norm": 2.401021957397461, "learning_rate": 9.999057526657196e-05, "loss": 3.4793, "step": 48 }, { "epoch": 0.11502347417840375, "grad_norm": 2.6258273124694824, "learning_rate": 9.998642857146934e-05, "loss": 3.4992, "step": 49 }, { "epoch": 0.11737089201877934, "grad_norm": 4.11713171005249, "learning_rate": 9.99815280796095e-05, "loss": 3.2179, "step": 50 }, { "epoch": 0.11971830985915492, "grad_norm": 2.6189980506896973, "learning_rate": 9.997587386489202e-05, "loss": 3.4057, "step": 51 }, { "epoch": 0.12206572769953052, "grad_norm": 3.7957706451416016, "learning_rate": 9.996946601258261e-05, "loss": 3.509, "step": 52 }, { "epoch": 0.12441314553990611, "grad_norm": 2.797846555709839, "learning_rate": 9.996230461931186e-05, "loss": 3.1791, "step": 53 }, { "epoch": 0.1267605633802817, "grad_norm": 3.2530624866485596, "learning_rate": 9.995438979307381e-05, "loss": 3.3071, "step": 54 }, { "epoch": 0.12910798122065728, "grad_norm": 2.044215679168701, "learning_rate": 9.994572165322422e-05, "loss": 3.3685, "step": 55 }, { "epoch": 0.13145539906103287, "grad_norm": 2.2636239528656006, "learning_rate": 9.993630033047891e-05, "loss": 3.4271, "step": 56 }, { "epoch": 0.13380281690140844, "grad_norm": 4.127092361450195, "learning_rate": 9.992612596691171e-05, "loss": 3.3618, "step": 57 }, { "epoch": 0.13615023474178403, "grad_norm": 2.1330583095550537, "learning_rate": 9.991519871595227e-05, "loss": 3.1341, "step": 58 }, { "epoch": 0.13849765258215962, "grad_norm": 2.255793571472168, "learning_rate": 9.990351874238388e-05, "loss": 3.1382, "step": 59 }, { "epoch": 0.14084507042253522, "grad_norm": 2.0289318561553955, "learning_rate": 9.989108622234084e-05, "loss": 3.0725, "step": 60 }, { "epoch": 0.1431924882629108, "grad_norm": 2.0822529792785645, "learning_rate": 9.987790134330593e-05, "loss": 3.1886, "step": 61 }, { "epoch": 0.14553990610328638, "grad_norm": 2.202343702316284, "learning_rate": 9.986396430410749e-05, "loss": 3.1895, "step": 62 }, { "epoch": 0.14788732394366197, "grad_norm": 2.0780410766601562, "learning_rate": 9.984927531491648e-05, "loss": 3.2399, "step": 63 }, { "epoch": 0.15023474178403756, "grad_norm": 1.9594764709472656, "learning_rate": 9.983383459724322e-05, "loss": 3.194, "step": 64 }, { "epoch": 0.15258215962441316, "grad_norm": 2.6781210899353027, "learning_rate": 9.981764238393424e-05, "loss": 3.3434, "step": 65 }, { "epoch": 0.15492957746478872, "grad_norm": 1.913921594619751, "learning_rate": 9.980069891916854e-05, "loss": 2.9046, "step": 66 }, { "epoch": 0.1572769953051643, "grad_norm": 2.063204050064087, "learning_rate": 9.978300445845404e-05, "loss": 3.1134, "step": 67 }, { "epoch": 0.1596244131455399, "grad_norm": 2.063624620437622, "learning_rate": 9.976455926862371e-05, "loss": 3.1757, "step": 68 }, { "epoch": 0.1619718309859155, "grad_norm": 2.1056790351867676, "learning_rate": 9.974536362783156e-05, "loss": 3.1839, "step": 69 }, { "epoch": 0.1643192488262911, "grad_norm": 2.1855363845825195, "learning_rate": 9.972541782554836e-05, "loss": 3.1744, "step": 70 }, { "epoch": 0.16666666666666666, "grad_norm": 2.163306951522827, "learning_rate": 9.970472216255741e-05, "loss": 3.0721, "step": 71 }, { "epoch": 0.16901408450704225, "grad_norm": 2.0321338176727295, "learning_rate": 9.968327695094987e-05, "loss": 3.2354, "step": 72 }, { "epoch": 0.17136150234741784, "grad_norm": 2.473968505859375, "learning_rate": 9.966108251412014e-05, "loss": 3.265, "step": 73 }, { "epoch": 0.17370892018779344, "grad_norm": 2.1940367221832275, "learning_rate": 9.963813918676095e-05, "loss": 3.2263, "step": 74 }, { "epoch": 0.176056338028169, "grad_norm": 2.01611590385437, "learning_rate": 9.961444731485836e-05, "loss": 3.0679, "step": 75 }, { "epoch": 0.1784037558685446, "grad_norm": 2.064678430557251, "learning_rate": 9.959000725568641e-05, "loss": 3.0248, "step": 76 }, { "epoch": 0.1807511737089202, "grad_norm": 2.0593502521514893, "learning_rate": 9.956481937780193e-05, "loss": 3.1276, "step": 77 }, { "epoch": 0.18309859154929578, "grad_norm": 2.0508029460906982, "learning_rate": 9.953888406103883e-05, "loss": 2.9601, "step": 78 }, { "epoch": 0.18544600938967137, "grad_norm": 2.0320143699645996, "learning_rate": 9.951220169650239e-05, "loss": 3.2927, "step": 79 }, { "epoch": 0.18779342723004694, "grad_norm": 2.048593759536743, "learning_rate": 9.948477268656345e-05, "loss": 2.9041, "step": 80 }, { "epoch": 0.19014084507042253, "grad_norm": 1.9130347967147827, "learning_rate": 9.945659744485225e-05, "loss": 2.846, "step": 81 }, { "epoch": 0.19248826291079812, "grad_norm": 1.9843111038208008, "learning_rate": 9.942767639625224e-05, "loss": 2.925, "step": 82 }, { "epoch": 0.19483568075117372, "grad_norm": 1.976904034614563, "learning_rate": 9.939800997689362e-05, "loss": 3.3067, "step": 83 }, { "epoch": 0.19718309859154928, "grad_norm": 1.9135102033615112, "learning_rate": 9.936759863414685e-05, "loss": 3.0173, "step": 84 }, { "epoch": 0.19953051643192488, "grad_norm": 1.8903708457946777, "learning_rate": 9.933644282661586e-05, "loss": 3.0035, "step": 85 }, { "epoch": 0.20187793427230047, "grad_norm": 2.104926109313965, "learning_rate": 9.930454302413108e-05, "loss": 3.1789, "step": 86 }, { "epoch": 0.20422535211267606, "grad_norm": 2.187955379486084, "learning_rate": 9.927189970774245e-05, "loss": 3.0865, "step": 87 }, { "epoch": 0.20657276995305165, "grad_norm": 1.9850472211837769, "learning_rate": 9.923851336971208e-05, "loss": 3.0692, "step": 88 }, { "epoch": 0.20892018779342722, "grad_norm": 2.022155284881592, "learning_rate": 9.920438451350694e-05, "loss": 3.0953, "step": 89 }, { "epoch": 0.2112676056338028, "grad_norm": 2.039721727371216, "learning_rate": 9.916951365379112e-05, "loss": 3.1641, "step": 90 }, { "epoch": 0.2136150234741784, "grad_norm": 1.8413914442062378, "learning_rate": 9.913390131641815e-05, "loss": 2.9263, "step": 91 }, { "epoch": 0.215962441314554, "grad_norm": 3.865889310836792, "learning_rate": 9.909754803842313e-05, "loss": 2.888, "step": 92 }, { "epoch": 0.21830985915492956, "grad_norm": 2.0847761631011963, "learning_rate": 9.906045436801448e-05, "loss": 3.0763, "step": 93 }, { "epoch": 0.22065727699530516, "grad_norm": 1.9045860767364502, "learning_rate": 9.902262086456582e-05, "loss": 2.9922, "step": 94 }, { "epoch": 0.22300469483568075, "grad_norm": 2.059549331665039, "learning_rate": 9.898404809860744e-05, "loss": 2.9984, "step": 95 }, { "epoch": 0.22535211267605634, "grad_norm": 1.974882960319519, "learning_rate": 9.894473665181776e-05, "loss": 3.0983, "step": 96 }, { "epoch": 0.22769953051643194, "grad_norm": 1.9072291851043701, "learning_rate": 9.89046871170145e-05, "loss": 3.011, "step": 97 }, { "epoch": 0.2300469483568075, "grad_norm": 2.395671844482422, "learning_rate": 9.886390009814579e-05, "loss": 2.8887, "step": 98 }, { "epoch": 0.2323943661971831, "grad_norm": 1.9982821941375732, "learning_rate": 9.882237621028101e-05, "loss": 3.0971, "step": 99 }, { "epoch": 0.2347417840375587, "grad_norm": 1.750945806503296, "learning_rate": 9.878011607960156e-05, "loss": 2.9084, "step": 100 }, { "epoch": 0.23708920187793428, "grad_norm": 1.9544695615768433, "learning_rate": 9.873712034339143e-05, "loss": 3.0739, "step": 101 }, { "epoch": 0.23943661971830985, "grad_norm": 1.9211771488189697, "learning_rate": 9.869338965002752e-05, "loss": 2.8189, "step": 102 }, { "epoch": 0.24178403755868544, "grad_norm": 1.9226988554000854, "learning_rate": 9.864892465896994e-05, "loss": 2.8616, "step": 103 }, { "epoch": 0.24413145539906103, "grad_norm": 1.9747202396392822, "learning_rate": 9.860372604075199e-05, "loss": 3.0266, "step": 104 }, { "epoch": 0.24647887323943662, "grad_norm": 1.8386709690093994, "learning_rate": 9.855779447697013e-05, "loss": 2.7453, "step": 105 }, { "epoch": 0.24882629107981222, "grad_norm": 1.9151238203048706, "learning_rate": 9.851113066027364e-05, "loss": 2.8376, "step": 106 }, { "epoch": 0.2511737089201878, "grad_norm": 2.1199045181274414, "learning_rate": 9.846373529435418e-05, "loss": 3.0903, "step": 107 }, { "epoch": 0.2535211267605634, "grad_norm": 1.742322325706482, "learning_rate": 9.841560909393523e-05, "loss": 2.7989, "step": 108 }, { "epoch": 0.25586854460093894, "grad_norm": 1.8751287460327148, "learning_rate": 9.836675278476124e-05, "loss": 2.9556, "step": 109 }, { "epoch": 0.25821596244131456, "grad_norm": 1.8210289478302002, "learning_rate": 9.831716710358673e-05, "loss": 2.7869, "step": 110 }, { "epoch": 0.2605633802816901, "grad_norm": 1.9199378490447998, "learning_rate": 9.82668527981652e-05, "loss": 2.8579, "step": 111 }, { "epoch": 0.26291079812206575, "grad_norm": 2.19356107711792, "learning_rate": 9.821581062723779e-05, "loss": 2.8818, "step": 112 }, { "epoch": 0.2652582159624413, "grad_norm": 1.755565881729126, "learning_rate": 9.816404136052186e-05, "loss": 2.7002, "step": 113 }, { "epoch": 0.2676056338028169, "grad_norm": 2.039726972579956, "learning_rate": 9.811154577869943e-05, "loss": 2.9508, "step": 114 }, { "epoch": 0.2699530516431925, "grad_norm": 1.824286699295044, "learning_rate": 9.805832467340538e-05, "loss": 2.8587, "step": 115 }, { "epoch": 0.27230046948356806, "grad_norm": 1.9687395095825195, "learning_rate": 9.800437884721545e-05, "loss": 2.9892, "step": 116 }, { "epoch": 0.2746478873239437, "grad_norm": 1.9100291728973389, "learning_rate": 9.794970911363426e-05, "loss": 2.9608, "step": 117 }, { "epoch": 0.27699530516431925, "grad_norm": 1.8510876893997192, "learning_rate": 9.78943162970829e-05, "loss": 2.8168, "step": 118 }, { "epoch": 0.2793427230046948, "grad_norm": 3.4659063816070557, "learning_rate": 9.783820123288664e-05, "loss": 2.8401, "step": 119 }, { "epoch": 0.28169014084507044, "grad_norm": 1.894242286682129, "learning_rate": 9.778136476726223e-05, "loss": 2.7963, "step": 120 }, { "epoch": 0.284037558685446, "grad_norm": 2.108842372894287, "learning_rate": 9.772380775730516e-05, "loss": 2.8951, "step": 121 }, { "epoch": 0.2863849765258216, "grad_norm": 1.8829090595245361, "learning_rate": 9.766553107097681e-05, "loss": 3.0315, "step": 122 }, { "epoch": 0.2887323943661972, "grad_norm": 1.8636614084243774, "learning_rate": 9.760653558709122e-05, "loss": 2.979, "step": 123 }, { "epoch": 0.29107981220657275, "grad_norm": 1.8715219497680664, "learning_rate": 9.754682219530199e-05, "loss": 2.9428, "step": 124 }, { "epoch": 0.2934272300469484, "grad_norm": 1.785217046737671, "learning_rate": 9.748639179608872e-05, "loss": 2.6377, "step": 125 }, { "epoch": 0.29577464788732394, "grad_norm": 2.0240108966827393, "learning_rate": 9.742524530074355e-05, "loss": 2.7979, "step": 126 }, { "epoch": 0.2981220657276995, "grad_norm": 2.643691062927246, "learning_rate": 9.736338363135738e-05, "loss": 2.9472, "step": 127 }, { "epoch": 0.3004694835680751, "grad_norm": 1.8176496028900146, "learning_rate": 9.730080772080592e-05, "loss": 2.8626, "step": 128 }, { "epoch": 0.3028169014084507, "grad_norm": 2.218360662460327, "learning_rate": 9.723751851273568e-05, "loss": 2.8979, "step": 129 }, { "epoch": 0.3051643192488263, "grad_norm": 1.8391715288162231, "learning_rate": 9.71735169615497e-05, "loss": 2.8412, "step": 130 }, { "epoch": 0.3075117370892019, "grad_norm": 2.497711420059204, "learning_rate": 9.710880403239317e-05, "loss": 2.7567, "step": 131 }, { "epoch": 0.30985915492957744, "grad_norm": 1.859116554260254, "learning_rate": 9.704338070113893e-05, "loss": 2.8, "step": 132 }, { "epoch": 0.31220657276995306, "grad_norm": 1.7876489162445068, "learning_rate": 9.697724795437264e-05, "loss": 2.7589, "step": 133 }, { "epoch": 0.3145539906103286, "grad_norm": 1.6835530996322632, "learning_rate": 9.6910406789378e-05, "loss": 2.6982, "step": 134 }, { "epoch": 0.31690140845070425, "grad_norm": 1.7438956499099731, "learning_rate": 9.684285821412165e-05, "loss": 2.7689, "step": 135 }, { "epoch": 0.3192488262910798, "grad_norm": 1.9094929695129395, "learning_rate": 9.677460324723806e-05, "loss": 2.8394, "step": 136 }, { "epoch": 0.3215962441314554, "grad_norm": 1.8689284324645996, "learning_rate": 9.670564291801401e-05, "loss": 2.7924, "step": 137 }, { "epoch": 0.323943661971831, "grad_norm": 1.8969767093658447, "learning_rate": 9.663597826637325e-05, "loss": 2.8311, "step": 138 }, { "epoch": 0.32629107981220656, "grad_norm": 1.758320689201355, "learning_rate": 9.656561034286069e-05, "loss": 2.769, "step": 139 }, { "epoch": 0.3286384976525822, "grad_norm": 1.7114989757537842, "learning_rate": 9.64945402086266e-05, "loss": 2.7606, "step": 140 }, { "epoch": 0.33098591549295775, "grad_norm": 1.708143711090088, "learning_rate": 9.642276893541063e-05, "loss": 2.6031, "step": 141 }, { "epoch": 0.3333333333333333, "grad_norm": 1.8093912601470947, "learning_rate": 9.63502976055256e-05, "loss": 2.6895, "step": 142 }, { "epoch": 0.33568075117370894, "grad_norm": 2.6389997005462646, "learning_rate": 9.627712731184122e-05, "loss": 2.7398, "step": 143 }, { "epoch": 0.3380281690140845, "grad_norm": 2.378230094909668, "learning_rate": 9.620325915776758e-05, "loss": 2.6429, "step": 144 }, { "epoch": 0.3403755868544601, "grad_norm": 1.9288902282714844, "learning_rate": 9.612869425723854e-05, "loss": 3.024, "step": 145 }, { "epoch": 0.3427230046948357, "grad_norm": 1.8169496059417725, "learning_rate": 9.605343373469491e-05, "loss": 2.7254, "step": 146 }, { "epoch": 0.34507042253521125, "grad_norm": 1.8046923875808716, "learning_rate": 9.59774787250675e-05, "loss": 2.7404, "step": 147 }, { "epoch": 0.3474178403755869, "grad_norm": 1.9261105060577393, "learning_rate": 9.590083037376001e-05, "loss": 2.856, "step": 148 }, { "epoch": 0.34976525821596244, "grad_norm": 1.845190405845642, "learning_rate": 9.582348983663173e-05, "loss": 2.8376, "step": 149 }, { "epoch": 0.352112676056338, "grad_norm": 2.6601450443267822, "learning_rate": 9.574545827998017e-05, "loss": 2.7516, "step": 150 }, { "epoch": 0.3544600938967136, "grad_norm": 1.7026342153549194, "learning_rate": 9.566673688052339e-05, "loss": 2.5917, "step": 151 }, { "epoch": 0.3568075117370892, "grad_norm": 1.6950021982192993, "learning_rate": 9.558732682538233e-05, "loss": 2.6247, "step": 152 }, { "epoch": 0.3591549295774648, "grad_norm": 2.047011613845825, "learning_rate": 9.550722931206286e-05, "loss": 2.8301, "step": 153 }, { "epoch": 0.3615023474178404, "grad_norm": 1.7364177703857422, "learning_rate": 9.542644554843778e-05, "loss": 2.6797, "step": 154 }, { "epoch": 0.36384976525821594, "grad_norm": 1.6910269260406494, "learning_rate": 9.53449767527285e-05, "loss": 2.6877, "step": 155 }, { "epoch": 0.36619718309859156, "grad_norm": 1.8064078092575073, "learning_rate": 9.526282415348677e-05, "loss": 2.6628, "step": 156 }, { "epoch": 0.3685446009389671, "grad_norm": 49.98039245605469, "learning_rate": 9.517998898957611e-05, "loss": 2.7553, "step": 157 }, { "epoch": 0.37089201877934275, "grad_norm": 1.855872392654419, "learning_rate": 9.509647251015314e-05, "loss": 2.8635, "step": 158 }, { "epoch": 0.3732394366197183, "grad_norm": 1.932474970817566, "learning_rate": 9.501227597464875e-05, "loss": 2.868, "step": 159 }, { "epoch": 0.3755868544600939, "grad_norm": 1.7808077335357666, "learning_rate": 9.492740065274904e-05, "loss": 2.5917, "step": 160 }, { "epoch": 0.3779342723004695, "grad_norm": 1.772858738899231, "learning_rate": 9.484184782437628e-05, "loss": 2.6197, "step": 161 }, { "epoch": 0.38028169014084506, "grad_norm": 2.6756508350372314, "learning_rate": 9.475561877966956e-05, "loss": 2.6527, "step": 162 }, { "epoch": 0.3826291079812207, "grad_norm": 1.6639829874038696, "learning_rate": 9.46687148189653e-05, "loss": 2.633, "step": 163 }, { "epoch": 0.38497652582159625, "grad_norm": 1.8140541315078735, "learning_rate": 9.458113725277769e-05, "loss": 2.7801, "step": 164 }, { "epoch": 0.3873239436619718, "grad_norm": 11.151857376098633, "learning_rate": 9.449288740177891e-05, "loss": 2.7566, "step": 165 }, { "epoch": 0.38967136150234744, "grad_norm": 2.033475637435913, "learning_rate": 9.440396659677919e-05, "loss": 2.6477, "step": 166 }, { "epoch": 0.392018779342723, "grad_norm": 2.4777884483337402, "learning_rate": 9.43143761787068e-05, "loss": 3.0192, "step": 167 }, { "epoch": 0.39436619718309857, "grad_norm": 1.8000035285949707, "learning_rate": 9.422411749858779e-05, "loss": 2.8657, "step": 168 }, { "epoch": 0.3967136150234742, "grad_norm": 1.6467283964157104, "learning_rate": 9.413319191752559e-05, "loss": 2.6207, "step": 169 }, { "epoch": 0.39906103286384975, "grad_norm": 1.7085107564926147, "learning_rate": 9.404160080668055e-05, "loss": 2.8527, "step": 170 }, { "epoch": 0.4014084507042254, "grad_norm": 1.721285104751587, "learning_rate": 9.394934554724921e-05, "loss": 2.7701, "step": 171 }, { "epoch": 0.40375586854460094, "grad_norm": 1.7729564905166626, "learning_rate": 9.385642753044348e-05, "loss": 2.738, "step": 172 }, { "epoch": 0.4061032863849765, "grad_norm": 1.7724652290344238, "learning_rate": 9.37628481574697e-05, "loss": 2.8383, "step": 173 }, { "epoch": 0.4084507042253521, "grad_norm": 1.6301813125610352, "learning_rate": 9.366860883950745e-05, "loss": 2.5906, "step": 174 }, { "epoch": 0.4107981220657277, "grad_norm": 1.8605163097381592, "learning_rate": 9.357371099768833e-05, "loss": 2.7792, "step": 175 }, { "epoch": 0.4131455399061033, "grad_norm": 1.8396178483963013, "learning_rate": 9.347815606307445e-05, "loss": 2.8461, "step": 176 }, { "epoch": 0.4154929577464789, "grad_norm": 1.747826337814331, "learning_rate": 9.338194547663694e-05, "loss": 2.8471, "step": 177 }, { "epoch": 0.41784037558685444, "grad_norm": 1.8125886917114258, "learning_rate": 9.328508068923418e-05, "loss": 2.6038, "step": 178 }, { "epoch": 0.42018779342723006, "grad_norm": 2.1431937217712402, "learning_rate": 9.31875631615899e-05, "loss": 2.9192, "step": 179 }, { "epoch": 0.4225352112676056, "grad_norm": 1.6201167106628418, "learning_rate": 9.308939436427115e-05, "loss": 2.7844, "step": 180 }, { "epoch": 0.42488262910798125, "grad_norm": 1.9184616804122925, "learning_rate": 9.299057577766622e-05, "loss": 2.9886, "step": 181 }, { "epoch": 0.4272300469483568, "grad_norm": 1.6579173803329468, "learning_rate": 9.289110889196214e-05, "loss": 2.7774, "step": 182 }, { "epoch": 0.4295774647887324, "grad_norm": 1.7431479692459106, "learning_rate": 9.27909952071224e-05, "loss": 2.4447, "step": 183 }, { "epoch": 0.431924882629108, "grad_norm": 1.8576884269714355, "learning_rate": 9.269023623286417e-05, "loss": 2.6607, "step": 184 }, { "epoch": 0.43427230046948356, "grad_norm": 1.9461067914962769, "learning_rate": 9.258883348863566e-05, "loss": 2.6777, "step": 185 }, { "epoch": 0.43661971830985913, "grad_norm": 2.585401773452759, "learning_rate": 9.248678850359309e-05, "loss": 2.6043, "step": 186 }, { "epoch": 0.43896713615023475, "grad_norm": 1.724517822265625, "learning_rate": 9.238410281657775e-05, "loss": 2.8784, "step": 187 }, { "epoch": 0.4413145539906103, "grad_norm": 1.6239253282546997, "learning_rate": 9.228077797609269e-05, "loss": 2.5965, "step": 188 }, { "epoch": 0.44366197183098594, "grad_norm": 1.6929664611816406, "learning_rate": 9.217681554027945e-05, "loss": 2.7162, "step": 189 }, { "epoch": 0.4460093896713615, "grad_norm": 5.029322147369385, "learning_rate": 9.207221707689447e-05, "loss": 2.5703, "step": 190 }, { "epoch": 0.44835680751173707, "grad_norm": 1.8209108114242554, "learning_rate": 9.196698416328557e-05, "loss": 2.7178, "step": 191 }, { "epoch": 0.4507042253521127, "grad_norm": 1.71205472946167, "learning_rate": 9.186111838636804e-05, "loss": 2.7824, "step": 192 }, { "epoch": 0.45305164319248825, "grad_norm": 1.7310357093811035, "learning_rate": 9.175462134260083e-05, "loss": 2.535, "step": 193 }, { "epoch": 0.45539906103286387, "grad_norm": 1.7333024740219116, "learning_rate": 9.16474946379623e-05, "loss": 2.5513, "step": 194 }, { "epoch": 0.45774647887323944, "grad_norm": 1.8149805068969727, "learning_rate": 9.153973988792626e-05, "loss": 2.644, "step": 195 }, { "epoch": 0.460093896713615, "grad_norm": 2.832623243331909, "learning_rate": 9.143135871743736e-05, "loss": 2.5307, "step": 196 }, { "epoch": 0.4624413145539906, "grad_norm": 1.854354977607727, "learning_rate": 9.132235276088671e-05, "loss": 2.5903, "step": 197 }, { "epoch": 0.4647887323943662, "grad_norm": 1.7942005395889282, "learning_rate": 9.121272366208722e-05, "loss": 2.609, "step": 198 }, { "epoch": 0.4671361502347418, "grad_norm": 1.8697292804718018, "learning_rate": 9.110247307424882e-05, "loss": 2.8038, "step": 199 }, { "epoch": 0.4694835680751174, "grad_norm": 1.690024495124817, "learning_rate": 9.09916026599535e-05, "loss": 2.54, "step": 200 }, { "epoch": 0.47183098591549294, "grad_norm": 1.8546425104141235, "learning_rate": 9.08801140911302e-05, "loss": 2.7207, "step": 201 }, { "epoch": 0.47417840375586856, "grad_norm": 1.7608038187026978, "learning_rate": 9.076800904902975e-05, "loss": 2.7212, "step": 202 }, { "epoch": 0.4765258215962441, "grad_norm": 1.745438814163208, "learning_rate": 9.06552892241993e-05, "loss": 2.6066, "step": 203 }, { "epoch": 0.4788732394366197, "grad_norm": 2.2760910987854004, "learning_rate": 9.054195631645704e-05, "loss": 2.7303, "step": 204 }, { "epoch": 0.4812206572769953, "grad_norm": 1.6470048427581787, "learning_rate": 9.042801203486641e-05, "loss": 2.5848, "step": 205 }, { "epoch": 0.4835680751173709, "grad_norm": 1.7573647499084473, "learning_rate": 9.03134580977104e-05, "loss": 2.6186, "step": 206 }, { "epoch": 0.4859154929577465, "grad_norm": 1.668116569519043, "learning_rate": 9.019829623246563e-05, "loss": 2.5771, "step": 207 }, { "epoch": 0.48826291079812206, "grad_norm": 1.744146466255188, "learning_rate": 9.008252817577628e-05, "loss": 2.6314, "step": 208 }, { "epoch": 0.49061032863849763, "grad_norm": 1.6404486894607544, "learning_rate": 8.99661556734279e-05, "loss": 2.4092, "step": 209 }, { "epoch": 0.49295774647887325, "grad_norm": 1.5752975940704346, "learning_rate": 8.984918048032116e-05, "loss": 2.4866, "step": 210 }, { "epoch": 0.4953051643192488, "grad_norm": 1.5991661548614502, "learning_rate": 8.973160436044526e-05, "loss": 2.6372, "step": 211 }, { "epoch": 0.49765258215962443, "grad_norm": 1.7967761754989624, "learning_rate": 8.961342908685142e-05, "loss": 2.8232, "step": 212 }, { "epoch": 0.5, "grad_norm": 1.6597074270248413, "learning_rate": 8.949465644162611e-05, "loss": 2.6634, "step": 213 }, { "epoch": 0.5023474178403756, "grad_norm": 1.6096906661987305, "learning_rate": 8.937528821586416e-05, "loss": 2.5347, "step": 214 }, { "epoch": 0.5046948356807511, "grad_norm": 2.293700695037842, "learning_rate": 8.92553262096418e-05, "loss": 2.5662, "step": 215 }, { "epoch": 0.5070422535211268, "grad_norm": 1.690674066543579, "learning_rate": 8.913477223198951e-05, "loss": 2.6845, "step": 216 }, { "epoch": 0.5093896713615024, "grad_norm": 1.8968983888626099, "learning_rate": 8.901362810086464e-05, "loss": 2.7832, "step": 217 }, { "epoch": 0.5117370892018779, "grad_norm": 1.60163414478302, "learning_rate": 8.889189564312417e-05, "loss": 2.4938, "step": 218 }, { "epoch": 0.5140845070422535, "grad_norm": 1.7134038209915161, "learning_rate": 8.876957669449694e-05, "loss": 2.6695, "step": 219 }, { "epoch": 0.5164319248826291, "grad_norm": 1.6584218740463257, "learning_rate": 8.864667309955619e-05, "loss": 2.7791, "step": 220 }, { "epoch": 0.5187793427230047, "grad_norm": 1.782516598701477, "learning_rate": 8.852318671169162e-05, "loss": 2.5737, "step": 221 }, { "epoch": 0.5211267605633803, "grad_norm": 1.6085395812988281, "learning_rate": 8.839911939308143e-05, "loss": 2.3568, "step": 222 }, { "epoch": 0.5234741784037559, "grad_norm": 1.5431314706802368, "learning_rate": 8.827447301466432e-05, "loss": 2.4808, "step": 223 }, { "epoch": 0.5258215962441315, "grad_norm": 1.6283578872680664, "learning_rate": 8.814924945611118e-05, "loss": 2.5186, "step": 224 }, { "epoch": 0.528169014084507, "grad_norm": 1.6893575191497803, "learning_rate": 8.802345060579684e-05, "loss": 2.7792, "step": 225 }, { "epoch": 0.5305164319248826, "grad_norm": 1.9037388563156128, "learning_rate": 8.789707836077149e-05, "loss": 2.5587, "step": 226 }, { "epoch": 0.5328638497652582, "grad_norm": 1.57706618309021, "learning_rate": 8.777013462673217e-05, "loss": 2.4527, "step": 227 }, { "epoch": 0.5352112676056338, "grad_norm": 2.0377097129821777, "learning_rate": 8.764262131799401e-05, "loss": 2.6479, "step": 228 }, { "epoch": 0.5375586854460094, "grad_norm": 1.6766222715377808, "learning_rate": 8.75145403574613e-05, "loss": 2.5871, "step": 229 }, { "epoch": 0.539906103286385, "grad_norm": 1.616554617881775, "learning_rate": 8.738589367659852e-05, "loss": 2.6863, "step": 230 }, { "epoch": 0.5422535211267606, "grad_norm": 1.6433231830596924, "learning_rate": 8.725668321540128e-05, "loss": 2.5667, "step": 231 }, { "epoch": 0.5446009389671361, "grad_norm": 1.7055845260620117, "learning_rate": 8.7126910922367e-05, "loss": 2.6972, "step": 232 }, { "epoch": 0.5469483568075117, "grad_norm": 1.6490628719329834, "learning_rate": 8.699657875446551e-05, "loss": 2.663, "step": 233 }, { "epoch": 0.5492957746478874, "grad_norm": 1.7133729457855225, "learning_rate": 8.686568867710962e-05, "loss": 2.707, "step": 234 }, { "epoch": 0.5516431924882629, "grad_norm": 4.825928211212158, "learning_rate": 8.673424266412538e-05, "loss": 2.9167, "step": 235 }, { "epoch": 0.5539906103286385, "grad_norm": 3.086928367614746, "learning_rate": 8.660224269772237e-05, "loss": 2.5891, "step": 236 }, { "epoch": 0.5563380281690141, "grad_norm": 4.451052665710449, "learning_rate": 8.646969076846383e-05, "loss": 2.5727, "step": 237 }, { "epoch": 0.5586854460093896, "grad_norm": 1.7714531421661377, "learning_rate": 8.633658887523664e-05, "loss": 2.9154, "step": 238 }, { "epoch": 0.5610328638497653, "grad_norm": 1.6821494102478027, "learning_rate": 8.620293902522105e-05, "loss": 2.4986, "step": 239 }, { "epoch": 0.5633802816901409, "grad_norm": 1.5702682733535767, "learning_rate": 8.606874323386062e-05, "loss": 2.6296, "step": 240 }, { "epoch": 0.5657276995305164, "grad_norm": 1.569319486618042, "learning_rate": 8.593400352483168e-05, "loss": 2.4851, "step": 241 }, { "epoch": 0.568075117370892, "grad_norm": 1.9096845388412476, "learning_rate": 8.579872193001285e-05, "loss": 2.6623, "step": 242 }, { "epoch": 0.5704225352112676, "grad_norm": 1.598663091659546, "learning_rate": 8.56629004894544e-05, "loss": 2.6143, "step": 243 }, { "epoch": 0.5727699530516432, "grad_norm": 1.6170557737350464, "learning_rate": 8.552654125134751e-05, "loss": 2.7028, "step": 244 }, { "epoch": 0.5751173708920188, "grad_norm": 1.655263066291809, "learning_rate": 8.538964627199332e-05, "loss": 2.6627, "step": 245 }, { "epoch": 0.5774647887323944, "grad_norm": 1.723892092704773, "learning_rate": 8.525221761577204e-05, "loss": 2.8072, "step": 246 }, { "epoch": 0.57981220657277, "grad_norm": 3.896911859512329, "learning_rate": 8.511425735511167e-05, "loss": 2.6754, "step": 247 }, { "epoch": 0.5821596244131455, "grad_norm": 1.606334924697876, "learning_rate": 8.497576757045683e-05, "loss": 2.2084, "step": 248 }, { "epoch": 0.5845070422535211, "grad_norm": 1.7658907175064087, "learning_rate": 8.483675035023739e-05, "loss": 2.6787, "step": 249 }, { "epoch": 0.5868544600938967, "grad_norm": 1.6326708793640137, "learning_rate": 8.4697207790837e-05, "loss": 2.5509, "step": 250 }, { "epoch": 0.5892018779342723, "grad_norm": 1.6483526229858398, "learning_rate": 8.455714199656138e-05, "loss": 2.468, "step": 251 }, { "epoch": 0.5915492957746479, "grad_norm": 1.5847233533859253, "learning_rate": 8.441655507960667e-05, "loss": 2.5259, "step": 252 }, { "epoch": 0.5938967136150235, "grad_norm": 1.744722604751587, "learning_rate": 8.427544916002756e-05, "loss": 2.8526, "step": 253 }, { "epoch": 0.596244131455399, "grad_norm": 1.7232311964035034, "learning_rate": 8.41338263657053e-05, "loss": 2.9207, "step": 254 }, { "epoch": 0.5985915492957746, "grad_norm": 1.6628774404525757, "learning_rate": 8.399168883231564e-05, "loss": 2.5764, "step": 255 }, { "epoch": 0.6009389671361502, "grad_norm": 1.5911755561828613, "learning_rate": 8.38490387032966e-05, "loss": 2.5154, "step": 256 }, { "epoch": 0.6032863849765259, "grad_norm": 1.588707685470581, "learning_rate": 8.37058781298162e-05, "loss": 2.7086, "step": 257 }, { "epoch": 0.6056338028169014, "grad_norm": 1.6533771753311157, "learning_rate": 8.35622092707399e-05, "loss": 2.7257, "step": 258 }, { "epoch": 0.607981220657277, "grad_norm": 1.5928781032562256, "learning_rate": 8.341803429259817e-05, "loss": 2.4508, "step": 259 }, { "epoch": 0.6103286384976526, "grad_norm": 1.610063910484314, "learning_rate": 8.327335536955376e-05, "loss": 2.5847, "step": 260 }, { "epoch": 0.6126760563380281, "grad_norm": 1.5072062015533447, "learning_rate": 8.31281746833689e-05, "loss": 2.4901, "step": 261 }, { "epoch": 0.6150234741784038, "grad_norm": 1.5797420740127563, "learning_rate": 8.29824944233725e-05, "loss": 2.6541, "step": 262 }, { "epoch": 0.6173708920187794, "grad_norm": 1.695048213005066, "learning_rate": 8.283631678642694e-05, "loss": 2.6204, "step": 263 }, { "epoch": 0.6197183098591549, "grad_norm": 1.665381669998169, "learning_rate": 8.268964397689516e-05, "loss": 2.3731, "step": 264 }, { "epoch": 0.6220657276995305, "grad_norm": 1.600093960762024, "learning_rate": 8.254247820660727e-05, "loss": 2.5218, "step": 265 }, { "epoch": 0.6244131455399061, "grad_norm": 1.901977300643921, "learning_rate": 8.239482169482726e-05, "loss": 2.7333, "step": 266 }, { "epoch": 0.6267605633802817, "grad_norm": 1.5469740629196167, "learning_rate": 8.22466766682195e-05, "loss": 2.3281, "step": 267 }, { "epoch": 0.6291079812206573, "grad_norm": 1.6751726865768433, "learning_rate": 8.209804536081516e-05, "loss": 2.8722, "step": 268 }, { "epoch": 0.6314553990610329, "grad_norm": 1.8046952486038208, "learning_rate": 8.194893001397858e-05, "loss": 2.7165, "step": 269 }, { "epoch": 0.6338028169014085, "grad_norm": 1.6492973566055298, "learning_rate": 8.179933287637342e-05, "loss": 2.7958, "step": 270 }, { "epoch": 0.636150234741784, "grad_norm": 1.8264124393463135, "learning_rate": 8.164925620392871e-05, "loss": 2.6456, "step": 271 }, { "epoch": 0.6384976525821596, "grad_norm": 1.7997009754180908, "learning_rate": 8.149870225980498e-05, "loss": 2.7984, "step": 272 }, { "epoch": 0.6408450704225352, "grad_norm": 1.5842161178588867, "learning_rate": 8.134767331435991e-05, "loss": 2.4858, "step": 273 }, { "epoch": 0.6431924882629108, "grad_norm": 1.5902608633041382, "learning_rate": 8.11961716451143e-05, "loss": 2.4969, "step": 274 }, { "epoch": 0.6455399061032864, "grad_norm": 1.6638398170471191, "learning_rate": 8.104419953671759e-05, "loss": 2.5576, "step": 275 }, { "epoch": 0.647887323943662, "grad_norm": 1.6883394718170166, "learning_rate": 8.089175928091349e-05, "loss": 2.7457, "step": 276 }, { "epoch": 0.6502347417840375, "grad_norm": 1.686235785484314, "learning_rate": 8.073885317650534e-05, "loss": 2.8693, "step": 277 }, { "epoch": 0.6525821596244131, "grad_norm": 1.7170850038528442, "learning_rate": 8.058548352932158e-05, "loss": 2.6033, "step": 278 }, { "epoch": 0.6549295774647887, "grad_norm": 1.674912691116333, "learning_rate": 8.043165265218078e-05, "loss": 2.5084, "step": 279 }, { "epoch": 0.6572769953051644, "grad_norm": 1.605450987815857, "learning_rate": 8.027736286485694e-05, "loss": 2.6496, "step": 280 }, { "epoch": 0.6596244131455399, "grad_norm": 1.6739922761917114, "learning_rate": 8.01226164940444e-05, "loss": 2.6467, "step": 281 }, { "epoch": 0.6619718309859155, "grad_norm": 2.5874874591827393, "learning_rate": 7.996741587332284e-05, "loss": 2.7073, "step": 282 }, { "epoch": 0.6643192488262911, "grad_norm": 1.6068830490112305, "learning_rate": 7.981176334312199e-05, "loss": 2.6242, "step": 283 }, { "epoch": 0.6666666666666666, "grad_norm": 1.5618075132369995, "learning_rate": 7.965566125068642e-05, "loss": 2.4312, "step": 284 }, { "epoch": 0.6690140845070423, "grad_norm": 1.6623550653457642, "learning_rate": 7.949911195004008e-05, "loss": 2.7836, "step": 285 }, { "epoch": 0.6713615023474179, "grad_norm": 1.6733129024505615, "learning_rate": 7.934211780195089e-05, "loss": 2.6002, "step": 286 }, { "epoch": 0.6737089201877934, "grad_norm": 1.675602674484253, "learning_rate": 7.9184681173895e-05, "loss": 2.536, "step": 287 }, { "epoch": 0.676056338028169, "grad_norm": 1.6826605796813965, "learning_rate": 7.902680444002127e-05, "loss": 2.7009, "step": 288 }, { "epoch": 0.6784037558685446, "grad_norm": 1.4956185817718506, "learning_rate": 7.886848998111526e-05, "loss": 2.3685, "step": 289 }, { "epoch": 0.6807511737089202, "grad_norm": 1.5748240947723389, "learning_rate": 7.870974018456352e-05, "loss": 2.4995, "step": 290 }, { "epoch": 0.6830985915492958, "grad_norm": 1.536579966545105, "learning_rate": 7.855055744431747e-05, "loss": 2.5763, "step": 291 }, { "epoch": 0.6854460093896714, "grad_norm": 1.5213676691055298, "learning_rate": 7.839094416085733e-05, "loss": 2.4565, "step": 292 }, { "epoch": 0.687793427230047, "grad_norm": 1.668575406074524, "learning_rate": 7.823090274115592e-05, "loss": 2.7248, "step": 293 }, { "epoch": 0.6901408450704225, "grad_norm": 1.6413085460662842, "learning_rate": 7.807043559864242e-05, "loss": 2.4725, "step": 294 }, { "epoch": 0.6924882629107981, "grad_norm": 1.682806372642517, "learning_rate": 7.790954515316584e-05, "loss": 2.5853, "step": 295 }, { "epoch": 0.6948356807511737, "grad_norm": 1.5448178052902222, "learning_rate": 7.774823383095867e-05, "loss": 2.326, "step": 296 }, { "epoch": 0.6971830985915493, "grad_norm": 1.6233508586883545, "learning_rate": 7.75865040646002e-05, "loss": 2.4861, "step": 297 }, { "epoch": 0.6995305164319249, "grad_norm": 3.1035714149475098, "learning_rate": 7.742435829297988e-05, "loss": 2.7022, "step": 298 }, { "epoch": 0.7018779342723005, "grad_norm": 1.6291568279266357, "learning_rate": 7.726179896126055e-05, "loss": 2.4584, "step": 299 }, { "epoch": 0.704225352112676, "grad_norm": 1.5778498649597168, "learning_rate": 7.709882852084152e-05, "loss": 2.4744, "step": 300 }, { "epoch": 0.7065727699530516, "grad_norm": 1.5400241613388062, "learning_rate": 7.693544942932162e-05, "loss": 2.3768, "step": 301 }, { "epoch": 0.7089201877934272, "grad_norm": 1.661958932876587, "learning_rate": 7.677166415046225e-05, "loss": 2.6589, "step": 302 }, { "epoch": 0.7112676056338029, "grad_norm": 1.5739407539367676, "learning_rate": 7.660747515414996e-05, "loss": 2.4508, "step": 303 }, { "epoch": 0.7136150234741784, "grad_norm": 1.5613700151443481, "learning_rate": 7.644288491635952e-05, "loss": 2.4655, "step": 304 }, { "epoch": 0.715962441314554, "grad_norm": 1.6637332439422607, "learning_rate": 7.627789591911634e-05, "loss": 2.4657, "step": 305 }, { "epoch": 0.7183098591549296, "grad_norm": 1.6113313436508179, "learning_rate": 7.611251065045918e-05, "loss": 2.5994, "step": 306 }, { "epoch": 0.7206572769953051, "grad_norm": 1.656441569328308, "learning_rate": 7.594673160440258e-05, "loss": 2.5098, "step": 307 }, { "epoch": 0.7230046948356808, "grad_norm": 1.5881431102752686, "learning_rate": 7.578056128089921e-05, "loss": 2.5514, "step": 308 }, { "epoch": 0.7253521126760564, "grad_norm": 1.5691207647323608, "learning_rate": 7.56140021858023e-05, "loss": 2.3184, "step": 309 }, { "epoch": 0.7276995305164319, "grad_norm": 1.5510538816452026, "learning_rate": 7.544705683082768e-05, "loss": 2.6, "step": 310 }, { "epoch": 0.7300469483568075, "grad_norm": 1.492972731590271, "learning_rate": 7.527972773351604e-05, "loss": 2.5153, "step": 311 }, { "epoch": 0.7323943661971831, "grad_norm": 1.6028672456741333, "learning_rate": 7.511201741719489e-05, "loss": 2.6437, "step": 312 }, { "epoch": 0.7347417840375586, "grad_norm": 1.5389173030853271, "learning_rate": 7.494392841094058e-05, "loss": 2.3661, "step": 313 }, { "epoch": 0.7370892018779343, "grad_norm": 1.5952788591384888, "learning_rate": 7.477546324954002e-05, "loss": 2.5836, "step": 314 }, { "epoch": 0.7394366197183099, "grad_norm": 1.7208667993545532, "learning_rate": 7.460662447345265e-05, "loss": 2.6799, "step": 315 }, { "epoch": 0.7417840375586855, "grad_norm": 1.5389221906661987, "learning_rate": 7.443741462877202e-05, "loss": 2.4238, "step": 316 }, { "epoch": 0.744131455399061, "grad_norm": 2.6154470443725586, "learning_rate": 7.426783626718732e-05, "loss": 2.4256, "step": 317 }, { "epoch": 0.7464788732394366, "grad_norm": 1.661379337310791, "learning_rate": 7.409789194594507e-05, "loss": 2.5471, "step": 318 }, { "epoch": 0.7488262910798122, "grad_norm": 1.676131248474121, "learning_rate": 7.392758422781044e-05, "loss": 2.4158, "step": 319 }, { "epoch": 0.7511737089201878, "grad_norm": 1.7112098932266235, "learning_rate": 7.375691568102863e-05, "loss": 2.7728, "step": 320 }, { "epoch": 0.7535211267605634, "grad_norm": 1.705582857131958, "learning_rate": 7.358588887928614e-05, "loss": 2.6519, "step": 321 }, { "epoch": 0.755868544600939, "grad_norm": 1.5166276693344116, "learning_rate": 7.341450640167202e-05, "loss": 2.5022, "step": 322 }, { "epoch": 0.7582159624413145, "grad_norm": 1.5449557304382324, "learning_rate": 7.324277083263885e-05, "loss": 2.4206, "step": 323 }, { "epoch": 0.7605633802816901, "grad_norm": 1.4922847747802734, "learning_rate": 7.307068476196388e-05, "loss": 2.4094, "step": 324 }, { "epoch": 0.7629107981220657, "grad_norm": 1.6599231958389282, "learning_rate": 7.289825078470993e-05, "loss": 2.6286, "step": 325 }, { "epoch": 0.7652582159624414, "grad_norm": 1.625157356262207, "learning_rate": 7.272547150118624e-05, "loss": 2.3925, "step": 326 }, { "epoch": 0.7676056338028169, "grad_norm": 1.5863850116729736, "learning_rate": 7.255234951690932e-05, "loss": 2.5999, "step": 327 }, { "epoch": 0.7699530516431925, "grad_norm": 1.731698751449585, "learning_rate": 7.237888744256357e-05, "loss": 2.6319, "step": 328 }, { "epoch": 0.7723004694835681, "grad_norm": 1.7433608770370483, "learning_rate": 7.220508789396197e-05, "loss": 2.6355, "step": 329 }, { "epoch": 0.7746478873239436, "grad_norm": 1.8249096870422363, "learning_rate": 7.203095349200666e-05, "loss": 2.6562, "step": 330 }, { "epoch": 0.7769953051643192, "grad_norm": 1.5074961185455322, "learning_rate": 7.185648686264934e-05, "loss": 2.4015, "step": 331 }, { "epoch": 0.7793427230046949, "grad_norm": 1.6418836116790771, "learning_rate": 7.168169063685171e-05, "loss": 2.5743, "step": 332 }, { "epoch": 0.7816901408450704, "grad_norm": 1.518595814704895, "learning_rate": 7.15065674505458e-05, "loss": 2.4607, "step": 333 }, { "epoch": 0.784037558685446, "grad_norm": 1.5618414878845215, "learning_rate": 7.13311199445942e-05, "loss": 2.6172, "step": 334 }, { "epoch": 0.7863849765258216, "grad_norm": 1.5513619184494019, "learning_rate": 7.115535076475031e-05, "loss": 2.4506, "step": 335 }, { "epoch": 0.7887323943661971, "grad_norm": 1.492394208908081, "learning_rate": 7.09792625616183e-05, "loss": 2.4202, "step": 336 }, { "epoch": 0.7910798122065728, "grad_norm": 1.5783830881118774, "learning_rate": 7.080285799061324e-05, "loss": 2.5364, "step": 337 }, { "epoch": 0.7934272300469484, "grad_norm": 1.5449110269546509, "learning_rate": 7.06261397119211e-05, "loss": 2.5843, "step": 338 }, { "epoch": 0.795774647887324, "grad_norm": 1.5108921527862549, "learning_rate": 7.044911039045847e-05, "loss": 2.474, "step": 339 }, { "epoch": 0.7981220657276995, "grad_norm": 1.5703331232070923, "learning_rate": 7.027177269583256e-05, "loss": 2.6182, "step": 340 }, { "epoch": 0.8004694835680751, "grad_norm": 1.5418925285339355, "learning_rate": 7.009412930230084e-05, "loss": 2.486, "step": 341 }, { "epoch": 0.8028169014084507, "grad_norm": 1.617924690246582, "learning_rate": 6.991618288873066e-05, "loss": 2.5069, "step": 342 }, { "epoch": 0.8051643192488263, "grad_norm": 1.559870719909668, "learning_rate": 6.973793613855902e-05, "loss": 2.3108, "step": 343 }, { "epoch": 0.8075117370892019, "grad_norm": 1.569182276725769, "learning_rate": 6.955939173975191e-05, "loss": 2.4741, "step": 344 }, { "epoch": 0.8098591549295775, "grad_norm": 1.5738292932510376, "learning_rate": 6.938055238476395e-05, "loss": 2.5839, "step": 345 }, { "epoch": 0.812206572769953, "grad_norm": 1.6121405363082886, "learning_rate": 6.920142077049766e-05, "loss": 2.4468, "step": 346 }, { "epoch": 0.8145539906103286, "grad_norm": 1.474192500114441, "learning_rate": 6.902199959826286e-05, "loss": 2.3262, "step": 347 }, { "epoch": 0.8169014084507042, "grad_norm": 1.590667486190796, "learning_rate": 6.88422915737359e-05, "loss": 2.348, "step": 348 }, { "epoch": 0.8192488262910798, "grad_norm": 1.623644232749939, "learning_rate": 6.866229940691888e-05, "loss": 2.6121, "step": 349 }, { "epoch": 0.8215962441314554, "grad_norm": 1.6322968006134033, "learning_rate": 6.848202581209875e-05, "loss": 2.4334, "step": 350 }, { "epoch": 0.823943661971831, "grad_norm": 1.60962975025177, "learning_rate": 6.830147350780645e-05, "loss": 2.4688, "step": 351 }, { "epoch": 0.8262910798122066, "grad_norm": 1.553849458694458, "learning_rate": 6.812064521677579e-05, "loss": 2.558, "step": 352 }, { "epoch": 0.8286384976525821, "grad_norm": 1.4718174934387207, "learning_rate": 6.793954366590257e-05, "loss": 2.3784, "step": 353 }, { "epoch": 0.8309859154929577, "grad_norm": 1.4675418138504028, "learning_rate": 6.775817158620328e-05, "loss": 2.3752, "step": 354 }, { "epoch": 0.8333333333333334, "grad_norm": 1.591253399848938, "learning_rate": 6.7576531712774e-05, "loss": 2.6141, "step": 355 }, { "epoch": 0.8356807511737089, "grad_norm": 1.6021631956100464, "learning_rate": 6.739462678474917e-05, "loss": 2.3948, "step": 356 }, { "epoch": 0.8380281690140845, "grad_norm": 1.6207449436187744, "learning_rate": 6.721245954526025e-05, "loss": 2.4396, "step": 357 }, { "epoch": 0.8403755868544601, "grad_norm": 1.5401954650878906, "learning_rate": 6.703003274139438e-05, "loss": 2.7093, "step": 358 }, { "epoch": 0.8427230046948356, "grad_norm": 1.6794401407241821, "learning_rate": 6.684734912415289e-05, "loss": 2.5922, "step": 359 }, { "epoch": 0.8450704225352113, "grad_norm": 1.5993467569351196, "learning_rate": 6.666441144840994e-05, "loss": 2.5032, "step": 360 }, { "epoch": 0.8474178403755869, "grad_norm": 1.5863590240478516, "learning_rate": 6.648122247287083e-05, "loss": 2.4734, "step": 361 }, { "epoch": 0.8497652582159625, "grad_norm": 1.555893063545227, "learning_rate": 6.62977849600305e-05, "loss": 2.4898, "step": 362 }, { "epoch": 0.852112676056338, "grad_norm": 1.5755188465118408, "learning_rate": 6.611410167613184e-05, "loss": 2.4916, "step": 363 }, { "epoch": 0.8544600938967136, "grad_norm": 1.7223633527755737, "learning_rate": 6.593017539112396e-05, "loss": 2.5071, "step": 364 }, { "epoch": 0.8568075117370892, "grad_norm": 1.5736289024353027, "learning_rate": 6.574600887862043e-05, "loss": 2.5697, "step": 365 }, { "epoch": 0.8591549295774648, "grad_norm": 1.4946801662445068, "learning_rate": 6.55616049158575e-05, "loss": 2.4197, "step": 366 }, { "epoch": 0.8615023474178404, "grad_norm": 1.5540156364440918, "learning_rate": 6.537696628365212e-05, "loss": 2.5144, "step": 367 }, { "epoch": 0.863849765258216, "grad_norm": 1.4268841743469238, "learning_rate": 6.51920957663601e-05, "loss": 2.2671, "step": 368 }, { "epoch": 0.8661971830985915, "grad_norm": 1.5012619495391846, "learning_rate": 6.500699615183408e-05, "loss": 2.3217, "step": 369 }, { "epoch": 0.8685446009389671, "grad_norm": 1.5565698146820068, "learning_rate": 6.482167023138147e-05, "loss": 2.3991, "step": 370 }, { "epoch": 0.8708920187793427, "grad_norm": 1.6378949880599976, "learning_rate": 6.46361207997224e-05, "loss": 2.6809, "step": 371 }, { "epoch": 0.8732394366197183, "grad_norm": 1.472183346748352, "learning_rate": 6.445035065494754e-05, "loss": 2.4983, "step": 372 }, { "epoch": 0.8755868544600939, "grad_norm": 1.5875253677368164, "learning_rate": 6.426436259847595e-05, "loss": 2.5813, "step": 373 }, { "epoch": 0.8779342723004695, "grad_norm": 1.8145997524261475, "learning_rate": 6.407815943501274e-05, "loss": 2.681, "step": 374 }, { "epoch": 0.8802816901408451, "grad_norm": 1.617915391921997, "learning_rate": 6.389174397250693e-05, "loss": 2.8412, "step": 375 }, { "epoch": 0.8826291079812206, "grad_norm": 1.451400876045227, "learning_rate": 6.370511902210897e-05, "loss": 2.3213, "step": 376 }, { "epoch": 0.8849765258215962, "grad_norm": 1.6040557622909546, "learning_rate": 6.351828739812836e-05, "loss": 2.5939, "step": 377 }, { "epoch": 0.8873239436619719, "grad_norm": 1.62812340259552, "learning_rate": 6.33312519179913e-05, "loss": 2.483, "step": 378 }, { "epoch": 0.8896713615023474, "grad_norm": 1.653408169746399, "learning_rate": 6.31440154021981e-05, "loss": 2.4626, "step": 379 }, { "epoch": 0.892018779342723, "grad_norm": 1.5635944604873657, "learning_rate": 6.295658067428077e-05, "loss": 2.5795, "step": 380 }, { "epoch": 0.8943661971830986, "grad_norm": 1.5737652778625488, "learning_rate": 6.276895056076022e-05, "loss": 2.6681, "step": 381 }, { "epoch": 0.8967136150234741, "grad_norm": 1.5483229160308838, "learning_rate": 6.258112789110395e-05, "loss": 2.3571, "step": 382 }, { "epoch": 0.8990610328638498, "grad_norm": 1.458562970161438, "learning_rate": 6.239311549768311e-05, "loss": 2.3653, "step": 383 }, { "epoch": 0.9014084507042254, "grad_norm": 1.5463008880615234, "learning_rate": 6.220491621572989e-05, "loss": 2.3952, "step": 384 }, { "epoch": 0.903755868544601, "grad_norm": 1.5711524486541748, "learning_rate": 6.20165328832948e-05, "loss": 2.4119, "step": 385 }, { "epoch": 0.9061032863849765, "grad_norm": 1.5307199954986572, "learning_rate": 6.182796834120386e-05, "loss": 2.4746, "step": 386 }, { "epoch": 0.9084507042253521, "grad_norm": 1.694445013999939, "learning_rate": 6.163922543301565e-05, "loss": 2.5904, "step": 387 }, { "epoch": 0.9107981220657277, "grad_norm": 1.5599974393844604, "learning_rate": 6.145030700497857e-05, "loss": 2.4916, "step": 388 }, { "epoch": 0.9131455399061033, "grad_norm": 1.569372534751892, "learning_rate": 6.126121590598788e-05, "loss": 2.4512, "step": 389 }, { "epoch": 0.9154929577464789, "grad_norm": 1.6844699382781982, "learning_rate": 6.10719549875427e-05, "loss": 2.5407, "step": 390 }, { "epoch": 0.9178403755868545, "grad_norm": 1.5658828020095825, "learning_rate": 6.088252710370302e-05, "loss": 2.4303, "step": 391 }, { "epoch": 0.92018779342723, "grad_norm": 1.615698218345642, "learning_rate": 6.069293511104672e-05, "loss": 2.3797, "step": 392 }, { "epoch": 0.9225352112676056, "grad_norm": 1.5572597980499268, "learning_rate": 6.0503181868626394e-05, "loss": 2.43, "step": 393 }, { "epoch": 0.9248826291079812, "grad_norm": 1.5271580219268799, "learning_rate": 6.031327023792629e-05, "loss": 2.5207, "step": 394 }, { "epoch": 0.9272300469483568, "grad_norm": 1.5841909646987915, "learning_rate": 6.012320308281919e-05, "loss": 2.5794, "step": 395 }, { "epoch": 0.9295774647887324, "grad_norm": 1.4326632022857666, "learning_rate": 5.993298326952318e-05, "loss": 2.2667, "step": 396 }, { "epoch": 0.931924882629108, "grad_norm": 1.5157475471496582, "learning_rate": 5.974261366655841e-05, "loss": 2.363, "step": 397 }, { "epoch": 0.9342723004694836, "grad_norm": 1.5970475673675537, "learning_rate": 5.955209714470388e-05, "loss": 2.519, "step": 398 }, { "epoch": 0.9366197183098591, "grad_norm": 1.5131961107254028, "learning_rate": 5.9361436576954157e-05, "loss": 2.5372, "step": 399 }, { "epoch": 0.9389671361502347, "grad_norm": 1.5415359735488892, "learning_rate": 5.9170634838475955e-05, "loss": 2.2527, "step": 400 }, { "epoch": 0.9413145539906104, "grad_norm": 1.4601541757583618, "learning_rate": 5.897969480656491e-05, "loss": 2.2968, "step": 401 }, { "epoch": 0.9436619718309859, "grad_norm": 1.6026557683944702, "learning_rate": 5.8788619360602126e-05, "loss": 2.4206, "step": 402 }, { "epoch": 0.9460093896713615, "grad_norm": 1.6263443231582642, "learning_rate": 5.859741138201068e-05, "loss": 2.4799, "step": 403 }, { "epoch": 0.9483568075117371, "grad_norm": 1.715550184249878, "learning_rate": 5.8406073754212355e-05, "loss": 2.685, "step": 404 }, { "epoch": 0.9507042253521126, "grad_norm": 1.536697268486023, "learning_rate": 5.8214609362583974e-05, "loss": 2.4646, "step": 405 }, { "epoch": 0.9530516431924883, "grad_norm": 1.559288501739502, "learning_rate": 5.8023021094413966e-05, "loss": 2.4852, "step": 406 }, { "epoch": 0.9553990610328639, "grad_norm": 1.523455262184143, "learning_rate": 5.783131183885885e-05, "loss": 2.5411, "step": 407 }, { "epoch": 0.9577464788732394, "grad_norm": 1.5540595054626465, "learning_rate": 5.763948448689963e-05, "loss": 2.4401, "step": 408 }, { "epoch": 0.960093896713615, "grad_norm": 1.5767078399658203, "learning_rate": 5.744754193129818e-05, "loss": 2.5617, "step": 409 }, { "epoch": 0.9624413145539906, "grad_norm": 1.4613933563232422, "learning_rate": 5.725548706655368e-05, "loss": 2.3601, "step": 410 }, { "epoch": 0.9647887323943662, "grad_norm": 1.5543547868728638, "learning_rate": 5.706332278885893e-05, "loss": 2.4387, "step": 411 }, { "epoch": 0.9671361502347418, "grad_norm": 1.5075656175613403, "learning_rate": 5.687105199605667e-05, "loss": 2.5223, "step": 412 }, { "epoch": 0.9694835680751174, "grad_norm": 1.468519926071167, "learning_rate": 5.6678677587595884e-05, "loss": 2.3275, "step": 413 }, { "epoch": 0.971830985915493, "grad_norm": 1.6226619482040405, "learning_rate": 5.648620246448813e-05, "loss": 2.5644, "step": 414 }, { "epoch": 0.9741784037558685, "grad_norm": 1.442681908607483, "learning_rate": 5.629362952926367e-05, "loss": 2.2933, "step": 415 }, { "epoch": 0.9765258215962441, "grad_norm": 1.4851113557815552, "learning_rate": 5.610096168592785e-05, "loss": 2.3604, "step": 416 }, { "epoch": 0.9788732394366197, "grad_norm": 1.491042971611023, "learning_rate": 5.590820183991716e-05, "loss": 2.4396, "step": 417 }, { "epoch": 0.9812206572769953, "grad_norm": 1.5098199844360352, "learning_rate": 5.571535289805556e-05, "loss": 2.3233, "step": 418 }, { "epoch": 0.9835680751173709, "grad_norm": 1.4578444957733154, "learning_rate": 5.552241776851055e-05, "loss": 2.254, "step": 419 }, { "epoch": 0.9859154929577465, "grad_norm": 1.5896917581558228, "learning_rate": 5.5329399360749336e-05, "loss": 2.8669, "step": 420 }, { "epoch": 0.9882629107981221, "grad_norm": 1.4958128929138184, "learning_rate": 5.513630058549497e-05, "loss": 2.5112, "step": 421 }, { "epoch": 0.9906103286384976, "grad_norm": 1.4808344841003418, "learning_rate": 5.494312435468244e-05, "loss": 2.2581, "step": 422 }, { "epoch": 0.9929577464788732, "grad_norm": 1.4487098455429077, "learning_rate": 5.474987358141478e-05, "loss": 2.3525, "step": 423 }, { "epoch": 0.9953051643192489, "grad_norm": 1.4762784242630005, "learning_rate": 5.4556551179919134e-05, "loss": 2.3316, "step": 424 }, { "epoch": 0.9976525821596244, "grad_norm": 1.417960286140442, "learning_rate": 5.436316006550275e-05, "loss": 2.3885, "step": 425 }, { "epoch": 1.0, "grad_norm": 4.407947540283203, "learning_rate": 5.416970315450911e-05, "loss": 2.2996, "step": 426 } ], "logging_steps": 1, "max_steps": 852, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 426, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.196187718110413e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }