{ "best_metric": 1.3203791379928589, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.15088645794039984, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007544322897019992, "grad_norm": 28.610990524291992, "learning_rate": 5e-06, "loss": 8.4345, "step": 1 }, { "epoch": 0.0007544322897019992, "eval_loss": 2.208278179168701, "eval_runtime": 220.3339, "eval_samples_per_second": 10.135, "eval_steps_per_second": 5.07, "step": 1 }, { "epoch": 0.0015088645794039985, "grad_norm": 29.75434112548828, "learning_rate": 1e-05, "loss": 7.8978, "step": 2 }, { "epoch": 0.002263296869105998, "grad_norm": 25.781906127929688, "learning_rate": 1.5e-05, "loss": 8.1028, "step": 3 }, { "epoch": 0.003017729158807997, "grad_norm": 21.908246994018555, "learning_rate": 2e-05, "loss": 7.9355, "step": 4 }, { "epoch": 0.003772161448509996, "grad_norm": 21.276395797729492, "learning_rate": 2.5e-05, "loss": 7.2463, "step": 5 }, { "epoch": 0.004526593738211996, "grad_norm": 18.05831527709961, "learning_rate": 3e-05, "loss": 7.0793, "step": 6 }, { "epoch": 0.005281026027913994, "grad_norm": 18.404077529907227, "learning_rate": 3.5e-05, "loss": 6.6955, "step": 7 }, { "epoch": 0.006035458317615994, "grad_norm": 14.49231243133545, "learning_rate": 4e-05, "loss": 6.241, "step": 8 }, { "epoch": 0.0067898906073179935, "grad_norm": 13.323002815246582, "learning_rate": 4.5e-05, "loss": 6.7582, "step": 9 }, { "epoch": 0.007544322897019992, "grad_norm": 13.41299819946289, "learning_rate": 5e-05, "loss": 6.5987, "step": 10 }, { "epoch": 0.008298755186721992, "grad_norm": 12.793521881103516, "learning_rate": 5.500000000000001e-05, "loss": 6.5348, "step": 11 }, { "epoch": 0.009053187476423991, "grad_norm": 12.212225914001465, "learning_rate": 6e-05, "loss": 5.9619, "step": 12 }, { "epoch": 0.00980761976612599, "grad_norm": 15.029327392578125, "learning_rate": 6.500000000000001e-05, "loss": 6.5328, "step": 13 }, { "epoch": 0.010562052055827989, "grad_norm": 41.65517807006836, "learning_rate": 7e-05, "loss": 6.5128, "step": 14 }, { "epoch": 0.011316484345529988, "grad_norm": 26.0521240234375, "learning_rate": 7.500000000000001e-05, "loss": 6.2811, "step": 15 }, { "epoch": 0.012070916635231988, "grad_norm": 16.089492797851562, "learning_rate": 8e-05, "loss": 6.7146, "step": 16 }, { "epoch": 0.012825348924933987, "grad_norm": 11.127829551696777, "learning_rate": 8.5e-05, "loss": 6.6048, "step": 17 }, { "epoch": 0.013579781214635987, "grad_norm": 12.313581466674805, "learning_rate": 9e-05, "loss": 6.6018, "step": 18 }, { "epoch": 0.014334213504337986, "grad_norm": 9.649194717407227, "learning_rate": 9.5e-05, "loss": 6.4406, "step": 19 }, { "epoch": 0.015088645794039984, "grad_norm": 9.973852157592773, "learning_rate": 0.0001, "loss": 5.958, "step": 20 }, { "epoch": 0.015843078083741986, "grad_norm": 10.235247611999512, "learning_rate": 9.999238475781957e-05, "loss": 6.0049, "step": 21 }, { "epoch": 0.016597510373443983, "grad_norm": 12.643736839294434, "learning_rate": 9.99695413509548e-05, "loss": 5.9654, "step": 22 }, { "epoch": 0.01735194266314598, "grad_norm": 11.938157081604004, "learning_rate": 9.99314767377287e-05, "loss": 5.579, "step": 23 }, { "epoch": 0.018106374952847983, "grad_norm": 11.99202823638916, "learning_rate": 9.987820251299122e-05, "loss": 6.014, "step": 24 }, { "epoch": 0.01886080724254998, "grad_norm": 12.833913803100586, "learning_rate": 9.980973490458728e-05, "loss": 6.1423, "step": 25 }, { "epoch": 0.01961523953225198, "grad_norm": 9.916985511779785, "learning_rate": 9.972609476841367e-05, "loss": 5.8949, "step": 26 }, { "epoch": 0.02036967182195398, "grad_norm": 11.102204322814941, "learning_rate": 9.962730758206611e-05, "loss": 6.1882, "step": 27 }, { "epoch": 0.021124104111655977, "grad_norm": 10.992071151733398, "learning_rate": 9.951340343707852e-05, "loss": 6.3971, "step": 28 }, { "epoch": 0.02187853640135798, "grad_norm": 10.18704605102539, "learning_rate": 9.938441702975689e-05, "loss": 6.0592, "step": 29 }, { "epoch": 0.022632968691059976, "grad_norm": 11.332548141479492, "learning_rate": 9.924038765061042e-05, "loss": 6.082, "step": 30 }, { "epoch": 0.023387400980761978, "grad_norm": 9.355793952941895, "learning_rate": 9.908135917238321e-05, "loss": 5.669, "step": 31 }, { "epoch": 0.024141833270463976, "grad_norm": 9.461305618286133, "learning_rate": 9.890738003669029e-05, "loss": 5.5116, "step": 32 }, { "epoch": 0.024896265560165973, "grad_norm": 10.504962921142578, "learning_rate": 9.871850323926177e-05, "loss": 6.328, "step": 33 }, { "epoch": 0.025650697849867975, "grad_norm": 9.149413108825684, "learning_rate": 9.851478631379982e-05, "loss": 5.3682, "step": 34 }, { "epoch": 0.026405130139569973, "grad_norm": 9.902061462402344, "learning_rate": 9.829629131445342e-05, "loss": 5.6849, "step": 35 }, { "epoch": 0.027159562429271974, "grad_norm": 9.989099502563477, "learning_rate": 9.806308479691595e-05, "loss": 5.3639, "step": 36 }, { "epoch": 0.02791399471897397, "grad_norm": 9.415989875793457, "learning_rate": 9.781523779815179e-05, "loss": 5.2341, "step": 37 }, { "epoch": 0.028668427008675973, "grad_norm": 11.008522987365723, "learning_rate": 9.755282581475769e-05, "loss": 5.0794, "step": 38 }, { "epoch": 0.02942285929837797, "grad_norm": 10.653331756591797, "learning_rate": 9.727592877996585e-05, "loss": 6.2048, "step": 39 }, { "epoch": 0.03017729158807997, "grad_norm": 14.365373611450195, "learning_rate": 9.698463103929542e-05, "loss": 5.6984, "step": 40 }, { "epoch": 0.03093172387778197, "grad_norm": 11.19192886352539, "learning_rate": 9.667902132486009e-05, "loss": 5.2766, "step": 41 }, { "epoch": 0.03168615616748397, "grad_norm": 10.388138771057129, "learning_rate": 9.635919272833938e-05, "loss": 5.033, "step": 42 }, { "epoch": 0.03244058845718597, "grad_norm": 9.952187538146973, "learning_rate": 9.602524267262203e-05, "loss": 4.9101, "step": 43 }, { "epoch": 0.03319502074688797, "grad_norm": 10.397631645202637, "learning_rate": 9.567727288213005e-05, "loss": 5.0858, "step": 44 }, { "epoch": 0.033949453036589965, "grad_norm": 10.087145805358887, "learning_rate": 9.53153893518325e-05, "loss": 4.4193, "step": 45 }, { "epoch": 0.03470388532629196, "grad_norm": 11.928916931152344, "learning_rate": 9.493970231495835e-05, "loss": 4.2049, "step": 46 }, { "epoch": 0.03545831761599397, "grad_norm": 13.15577220916748, "learning_rate": 9.45503262094184e-05, "loss": 3.8644, "step": 47 }, { "epoch": 0.036212749905695965, "grad_norm": 13.862105369567871, "learning_rate": 9.414737964294636e-05, "loss": 3.52, "step": 48 }, { "epoch": 0.03696718219539796, "grad_norm": 9.312602996826172, "learning_rate": 9.373098535696979e-05, "loss": 1.3459, "step": 49 }, { "epoch": 0.03772161448509996, "grad_norm": 15.924839973449707, "learning_rate": 9.330127018922194e-05, "loss": 2.7023, "step": 50 }, { "epoch": 0.03772161448509996, "eval_loss": 1.6293420791625977, "eval_runtime": 222.4612, "eval_samples_per_second": 10.038, "eval_steps_per_second": 5.021, "step": 50 }, { "epoch": 0.03847604677480196, "grad_norm": 14.912610054016113, "learning_rate": 9.285836503510562e-05, "loss": 6.8462, "step": 51 }, { "epoch": 0.03923047906450396, "grad_norm": 12.471588134765625, "learning_rate": 9.24024048078213e-05, "loss": 7.0599, "step": 52 }, { "epoch": 0.03998491135420596, "grad_norm": 8.87014389038086, "learning_rate": 9.193352839727121e-05, "loss": 6.3873, "step": 53 }, { "epoch": 0.04073934364390796, "grad_norm": 8.259026527404785, "learning_rate": 9.145187862775209e-05, "loss": 5.9529, "step": 54 }, { "epoch": 0.04149377593360996, "grad_norm": 10.203092575073242, "learning_rate": 9.09576022144496e-05, "loss": 6.6684, "step": 55 }, { "epoch": 0.042248208223311955, "grad_norm": 8.597597122192383, "learning_rate": 9.045084971874738e-05, "loss": 6.3543, "step": 56 }, { "epoch": 0.04300264051301396, "grad_norm": 8.163016319274902, "learning_rate": 8.993177550236464e-05, "loss": 5.8639, "step": 57 }, { "epoch": 0.04375707280271596, "grad_norm": 8.327492713928223, "learning_rate": 8.940053768033609e-05, "loss": 6.0806, "step": 58 }, { "epoch": 0.044511505092417955, "grad_norm": 7.924893856048584, "learning_rate": 8.885729807284856e-05, "loss": 6.2455, "step": 59 }, { "epoch": 0.04526593738211995, "grad_norm": 8.130339622497559, "learning_rate": 8.83022221559489e-05, "loss": 5.4982, "step": 60 }, { "epoch": 0.04602036967182195, "grad_norm": 8.018017768859863, "learning_rate": 8.773547901113862e-05, "loss": 5.7014, "step": 61 }, { "epoch": 0.046774801961523955, "grad_norm": 8.45346736907959, "learning_rate": 8.715724127386972e-05, "loss": 6.2539, "step": 62 }, { "epoch": 0.04752923425122595, "grad_norm": 10.911635398864746, "learning_rate": 8.656768508095853e-05, "loss": 6.3726, "step": 63 }, { "epoch": 0.04828366654092795, "grad_norm": 9.591697692871094, "learning_rate": 8.596699001693255e-05, "loss": 6.1596, "step": 64 }, { "epoch": 0.04903809883062995, "grad_norm": 7.932915210723877, "learning_rate": 8.535533905932738e-05, "loss": 6.1618, "step": 65 }, { "epoch": 0.04979253112033195, "grad_norm": 8.063417434692383, "learning_rate": 8.473291852294987e-05, "loss": 5.4566, "step": 66 }, { "epoch": 0.05054696341003395, "grad_norm": 8.73833179473877, "learning_rate": 8.409991800312493e-05, "loss": 6.3322, "step": 67 }, { "epoch": 0.05130139569973595, "grad_norm": 10.085183143615723, "learning_rate": 8.345653031794292e-05, "loss": 5.8554, "step": 68 }, { "epoch": 0.05205582798943795, "grad_norm": 10.401449203491211, "learning_rate": 8.280295144952536e-05, "loss": 6.3202, "step": 69 }, { "epoch": 0.052810260279139945, "grad_norm": 8.073527336120605, "learning_rate": 8.213938048432697e-05, "loss": 5.6743, "step": 70 }, { "epoch": 0.05356469256884195, "grad_norm": 8.164782524108887, "learning_rate": 8.146601955249188e-05, "loss": 5.9845, "step": 71 }, { "epoch": 0.05431912485854395, "grad_norm": 7.460975170135498, "learning_rate": 8.07830737662829e-05, "loss": 6.0119, "step": 72 }, { "epoch": 0.055073557148245945, "grad_norm": 8.160001754760742, "learning_rate": 8.009075115760243e-05, "loss": 5.8083, "step": 73 }, { "epoch": 0.05582798943794794, "grad_norm": 7.511910438537598, "learning_rate": 7.938926261462366e-05, "loss": 5.9554, "step": 74 }, { "epoch": 0.05658242172764994, "grad_norm": 8.06807804107666, "learning_rate": 7.86788218175523e-05, "loss": 5.4149, "step": 75 }, { "epoch": 0.057336854017351946, "grad_norm": 8.23879623413086, "learning_rate": 7.795964517353735e-05, "loss": 6.0332, "step": 76 }, { "epoch": 0.058091286307053944, "grad_norm": 8.210224151611328, "learning_rate": 7.723195175075136e-05, "loss": 5.5186, "step": 77 }, { "epoch": 0.05884571859675594, "grad_norm": 8.879250526428223, "learning_rate": 7.649596321166024e-05, "loss": 5.9477, "step": 78 }, { "epoch": 0.05960015088645794, "grad_norm": 8.089262962341309, "learning_rate": 7.575190374550272e-05, "loss": 5.8029, "step": 79 }, { "epoch": 0.06035458317615994, "grad_norm": 8.588730812072754, "learning_rate": 7.500000000000001e-05, "loss": 5.6917, "step": 80 }, { "epoch": 0.06110901546586194, "grad_norm": 7.952780723571777, "learning_rate": 7.424048101231686e-05, "loss": 5.4262, "step": 81 }, { "epoch": 0.06186344775556394, "grad_norm": 7.111525058746338, "learning_rate": 7.347357813929454e-05, "loss": 5.1552, "step": 82 }, { "epoch": 0.06261788004526593, "grad_norm": 8.799844741821289, "learning_rate": 7.269952498697734e-05, "loss": 6.2327, "step": 83 }, { "epoch": 0.06337231233496794, "grad_norm": 7.895553112030029, "learning_rate": 7.191855733945387e-05, "loss": 4.8941, "step": 84 }, { "epoch": 0.06412674462466994, "grad_norm": 8.473020553588867, "learning_rate": 7.113091308703498e-05, "loss": 6.1927, "step": 85 }, { "epoch": 0.06488117691437194, "grad_norm": 8.47846508026123, "learning_rate": 7.033683215379002e-05, "loss": 5.1994, "step": 86 }, { "epoch": 0.06563560920407394, "grad_norm": 8.409833908081055, "learning_rate": 6.953655642446368e-05, "loss": 6.0418, "step": 87 }, { "epoch": 0.06639004149377593, "grad_norm": 8.531917572021484, "learning_rate": 6.873032967079561e-05, "loss": 5.0468, "step": 88 }, { "epoch": 0.06714447378347793, "grad_norm": 8.19450569152832, "learning_rate": 6.7918397477265e-05, "loss": 5.3869, "step": 89 }, { "epoch": 0.06789890607317993, "grad_norm": 8.973502159118652, "learning_rate": 6.710100716628344e-05, "loss": 5.2092, "step": 90 }, { "epoch": 0.06865333836288193, "grad_norm": 8.892091751098633, "learning_rate": 6.627840772285784e-05, "loss": 5.4139, "step": 91 }, { "epoch": 0.06940777065258392, "grad_norm": 8.022160530090332, "learning_rate": 6.545084971874738e-05, "loss": 4.7068, "step": 92 }, { "epoch": 0.07016220294228594, "grad_norm": 10.1045560836792, "learning_rate": 6.461858523613684e-05, "loss": 5.8565, "step": 93 }, { "epoch": 0.07091663523198793, "grad_norm": 9.677287101745605, "learning_rate": 6.378186779084995e-05, "loss": 5.1281, "step": 94 }, { "epoch": 0.07167106752168993, "grad_norm": 10.185721397399902, "learning_rate": 6.294095225512603e-05, "loss": 5.3468, "step": 95 }, { "epoch": 0.07242549981139193, "grad_norm": 10.971282958984375, "learning_rate": 6.209609477998338e-05, "loss": 4.6916, "step": 96 }, { "epoch": 0.07317993210109393, "grad_norm": 11.636845588684082, "learning_rate": 6.124755271719325e-05, "loss": 4.4786, "step": 97 }, { "epoch": 0.07393436439079593, "grad_norm": 10.711810111999512, "learning_rate": 6.0395584540887963e-05, "loss": 3.0341, "step": 98 }, { "epoch": 0.07468879668049792, "grad_norm": 8.94869613647461, "learning_rate": 5.9540449768827246e-05, "loss": 2.3939, "step": 99 }, { "epoch": 0.07544322897019992, "grad_norm": 8.477395057678223, "learning_rate": 5.868240888334653e-05, "loss": 1.6288, "step": 100 }, { "epoch": 0.07544322897019992, "eval_loss": 1.5527470111846924, "eval_runtime": 222.1555, "eval_samples_per_second": 10.052, "eval_steps_per_second": 5.028, "step": 100 }, { "epoch": 0.07619766125990192, "grad_norm": 12.187577247619629, "learning_rate": 5.782172325201155e-05, "loss": 6.6371, "step": 101 }, { "epoch": 0.07695209354960392, "grad_norm": 11.621190071105957, "learning_rate": 5.695865504800327e-05, "loss": 7.1557, "step": 102 }, { "epoch": 0.07770652583930593, "grad_norm": 9.95458698272705, "learning_rate": 5.6093467170257374e-05, "loss": 6.7466, "step": 103 }, { "epoch": 0.07846095812900793, "grad_norm": 7.798550605773926, "learning_rate": 5.522642316338268e-05, "loss": 6.3035, "step": 104 }, { "epoch": 0.07921539041870992, "grad_norm": 7.551530838012695, "learning_rate": 5.435778713738292e-05, "loss": 6.4349, "step": 105 }, { "epoch": 0.07996982270841192, "grad_norm": 7.4671630859375, "learning_rate": 5.348782368720626e-05, "loss": 6.3822, "step": 106 }, { "epoch": 0.08072425499811392, "grad_norm": 7.680654525756836, "learning_rate": 5.26167978121472e-05, "loss": 5.8368, "step": 107 }, { "epoch": 0.08147868728781592, "grad_norm": 7.623833656311035, "learning_rate": 5.174497483512506e-05, "loss": 6.2462, "step": 108 }, { "epoch": 0.08223311957751792, "grad_norm": 8.119511604309082, "learning_rate": 5.0872620321864185e-05, "loss": 5.8799, "step": 109 }, { "epoch": 0.08298755186721991, "grad_norm": 8.163238525390625, "learning_rate": 5e-05, "loss": 6.4398, "step": 110 }, { "epoch": 0.08374198415692191, "grad_norm": 7.675838947296143, "learning_rate": 4.912737967813583e-05, "loss": 6.1381, "step": 111 }, { "epoch": 0.08449641644662391, "grad_norm": 8.161046981811523, "learning_rate": 4.825502516487497e-05, "loss": 6.1432, "step": 112 }, { "epoch": 0.08525084873632592, "grad_norm": 8.056014060974121, "learning_rate": 4.738320218785281e-05, "loss": 5.9772, "step": 113 }, { "epoch": 0.08600528102602792, "grad_norm": 7.2145185470581055, "learning_rate": 4.6512176312793736e-05, "loss": 5.7573, "step": 114 }, { "epoch": 0.08675971331572992, "grad_norm": 7.514006614685059, "learning_rate": 4.564221286261709e-05, "loss": 5.9614, "step": 115 }, { "epoch": 0.08751414560543191, "grad_norm": 7.510403633117676, "learning_rate": 4.477357683661734e-05, "loss": 5.3342, "step": 116 }, { "epoch": 0.08826857789513391, "grad_norm": 8.066449165344238, "learning_rate": 4.390653282974264e-05, "loss": 5.5604, "step": 117 }, { "epoch": 0.08902301018483591, "grad_norm": 6.986632347106934, "learning_rate": 4.3041344951996746e-05, "loss": 5.5458, "step": 118 }, { "epoch": 0.08977744247453791, "grad_norm": 7.432390213012695, "learning_rate": 4.2178276747988446e-05, "loss": 5.9329, "step": 119 }, { "epoch": 0.0905318747642399, "grad_norm": 7.102813243865967, "learning_rate": 4.131759111665349e-05, "loss": 5.7666, "step": 120 }, { "epoch": 0.0912863070539419, "grad_norm": 7.3202714920043945, "learning_rate": 4.045955023117276e-05, "loss": 5.5461, "step": 121 }, { "epoch": 0.0920407393436439, "grad_norm": 7.094704627990723, "learning_rate": 3.960441545911204e-05, "loss": 5.6711, "step": 122 }, { "epoch": 0.09279517163334591, "grad_norm": 6.997950077056885, "learning_rate": 3.875244728280676e-05, "loss": 6.0052, "step": 123 }, { "epoch": 0.09354960392304791, "grad_norm": 7.559538841247559, "learning_rate": 3.790390522001662e-05, "loss": 4.6827, "step": 124 }, { "epoch": 0.09430403621274991, "grad_norm": 7.945469379425049, "learning_rate": 3.705904774487396e-05, "loss": 5.421, "step": 125 }, { "epoch": 0.0950584685024519, "grad_norm": 7.207925796508789, "learning_rate": 3.6218132209150045e-05, "loss": 5.95, "step": 126 }, { "epoch": 0.0958129007921539, "grad_norm": 7.7100725173950195, "learning_rate": 3.5381414763863166e-05, "loss": 6.2309, "step": 127 }, { "epoch": 0.0965673330818559, "grad_norm": 8.336740493774414, "learning_rate": 3.4549150281252636e-05, "loss": 5.8438, "step": 128 }, { "epoch": 0.0973217653715579, "grad_norm": 7.9306511878967285, "learning_rate": 3.372159227714218e-05, "loss": 6.0361, "step": 129 }, { "epoch": 0.0980761976612599, "grad_norm": 8.01220417022705, "learning_rate": 3.289899283371657e-05, "loss": 6.1294, "step": 130 }, { "epoch": 0.0988306299509619, "grad_norm": 7.395640850067139, "learning_rate": 3.2081602522734986e-05, "loss": 5.6617, "step": 131 }, { "epoch": 0.0995850622406639, "grad_norm": 7.897842884063721, "learning_rate": 3.12696703292044e-05, "loss": 5.7985, "step": 132 }, { "epoch": 0.1003394945303659, "grad_norm": 8.189974784851074, "learning_rate": 3.046344357553632e-05, "loss": 4.9965, "step": 133 }, { "epoch": 0.1010939268200679, "grad_norm": 8.065415382385254, "learning_rate": 2.9663167846209998e-05, "loss": 4.9086, "step": 134 }, { "epoch": 0.1018483591097699, "grad_norm": 8.694807052612305, "learning_rate": 2.886908691296504e-05, "loss": 4.776, "step": 135 }, { "epoch": 0.1026027913994719, "grad_norm": 7.551018714904785, "learning_rate": 2.8081442660546125e-05, "loss": 5.4173, "step": 136 }, { "epoch": 0.1033572236891739, "grad_norm": 7.498192310333252, "learning_rate": 2.7300475013022663e-05, "loss": 5.1536, "step": 137 }, { "epoch": 0.1041116559788759, "grad_norm": 7.734801769256592, "learning_rate": 2.6526421860705473e-05, "loss": 4.447, "step": 138 }, { "epoch": 0.10486608826857789, "grad_norm": 8.053444862365723, "learning_rate": 2.575951898768315e-05, "loss": 5.1973, "step": 139 }, { "epoch": 0.10562052055827989, "grad_norm": 8.864779472351074, "learning_rate": 2.500000000000001e-05, "loss": 5.1369, "step": 140 }, { "epoch": 0.10637495284798189, "grad_norm": 8.096254348754883, "learning_rate": 2.4248096254497288e-05, "loss": 4.3606, "step": 141 }, { "epoch": 0.1071293851376839, "grad_norm": 8.422588348388672, "learning_rate": 2.350403678833976e-05, "loss": 5.2434, "step": 142 }, { "epoch": 0.1078838174273859, "grad_norm": 9.554398536682129, "learning_rate": 2.2768048249248648e-05, "loss": 4.9714, "step": 143 }, { "epoch": 0.1086382497170879, "grad_norm": 9.492046356201172, "learning_rate": 2.2040354826462668e-05, "loss": 5.4207, "step": 144 }, { "epoch": 0.1093926820067899, "grad_norm": 9.344889640808105, "learning_rate": 2.132117818244771e-05, "loss": 4.7644, "step": 145 }, { "epoch": 0.11014711429649189, "grad_norm": 9.320569038391113, "learning_rate": 2.061073738537635e-05, "loss": 4.5768, "step": 146 }, { "epoch": 0.11090154658619389, "grad_norm": 9.480978965759277, "learning_rate": 1.9909248842397584e-05, "loss": 3.6032, "step": 147 }, { "epoch": 0.11165597887589589, "grad_norm": 9.053571701049805, "learning_rate": 1.9216926233717085e-05, "loss": 3.3363, "step": 148 }, { "epoch": 0.11241041116559788, "grad_norm": 7.955657958984375, "learning_rate": 1.8533980447508137e-05, "loss": 2.2038, "step": 149 }, { "epoch": 0.11316484345529988, "grad_norm": 10.95832347869873, "learning_rate": 1.7860619515673033e-05, "loss": 2.481, "step": 150 }, { "epoch": 0.11316484345529988, "eval_loss": 1.3595361709594727, "eval_runtime": 222.2461, "eval_samples_per_second": 10.047, "eval_steps_per_second": 5.026, "step": 150 }, { "epoch": 0.11391927574500188, "grad_norm": 7.0187482833862305, "learning_rate": 1.7197048550474643e-05, "loss": 5.7281, "step": 151 }, { "epoch": 0.11467370803470389, "grad_norm": 8.375664710998535, "learning_rate": 1.6543469682057106e-05, "loss": 6.2015, "step": 152 }, { "epoch": 0.11542814032440589, "grad_norm": 8.071150779724121, "learning_rate": 1.5900081996875083e-05, "loss": 6.2623, "step": 153 }, { "epoch": 0.11618257261410789, "grad_norm": 7.718661308288574, "learning_rate": 1.526708147705013e-05, "loss": 6.0401, "step": 154 }, { "epoch": 0.11693700490380989, "grad_norm": 8.218561172485352, "learning_rate": 1.4644660940672627e-05, "loss": 6.4259, "step": 155 }, { "epoch": 0.11769143719351188, "grad_norm": 7.594738960266113, "learning_rate": 1.4033009983067452e-05, "loss": 6.5689, "step": 156 }, { "epoch": 0.11844586948321388, "grad_norm": 6.898587703704834, "learning_rate": 1.3432314919041478e-05, "loss": 5.5474, "step": 157 }, { "epoch": 0.11920030177291588, "grad_norm": 7.060200214385986, "learning_rate": 1.2842758726130283e-05, "loss": 5.9418, "step": 158 }, { "epoch": 0.11995473406261788, "grad_norm": 7.08625602722168, "learning_rate": 1.22645209888614e-05, "loss": 5.9037, "step": 159 }, { "epoch": 0.12070916635231987, "grad_norm": 6.942751407623291, "learning_rate": 1.1697777844051105e-05, "loss": 6.0381, "step": 160 }, { "epoch": 0.12146359864202187, "grad_norm": 7.1135406494140625, "learning_rate": 1.1142701927151456e-05, "loss": 6.1544, "step": 161 }, { "epoch": 0.12221803093172388, "grad_norm": 6.448284149169922, "learning_rate": 1.0599462319663905e-05, "loss": 5.8762, "step": 162 }, { "epoch": 0.12297246322142588, "grad_norm": 7.199798107147217, "learning_rate": 1.006822449763537e-05, "loss": 6.246, "step": 163 }, { "epoch": 0.12372689551112788, "grad_norm": 7.024713039398193, "learning_rate": 9.549150281252633e-06, "loss": 6.028, "step": 164 }, { "epoch": 0.12448132780082988, "grad_norm": 6.762689113616943, "learning_rate": 9.042397785550405e-06, "loss": 5.8399, "step": 165 }, { "epoch": 0.12523576009053186, "grad_norm": 6.662613391876221, "learning_rate": 8.548121372247918e-06, "loss": 6.0572, "step": 166 }, { "epoch": 0.12599019238023387, "grad_norm": 7.291405200958252, "learning_rate": 8.066471602728803e-06, "loss": 5.7887, "step": 167 }, { "epoch": 0.12674462466993588, "grad_norm": 8.169452667236328, "learning_rate": 7.597595192178702e-06, "loss": 6.4472, "step": 168 }, { "epoch": 0.12749905695963787, "grad_norm": 7.192326068878174, "learning_rate": 7.1416349648943894e-06, "loss": 5.6836, "step": 169 }, { "epoch": 0.12825348924933988, "grad_norm": 7.195889949798584, "learning_rate": 6.698729810778065e-06, "loss": 5.4774, "step": 170 }, { "epoch": 0.12900792153904186, "grad_norm": 7.8230204582214355, "learning_rate": 6.269014643030213e-06, "loss": 5.5761, "step": 171 }, { "epoch": 0.12976235382874388, "grad_norm": 7.158181667327881, "learning_rate": 5.852620357053651e-06, "loss": 6.0725, "step": 172 }, { "epoch": 0.13051678611844586, "grad_norm": 7.191005706787109, "learning_rate": 5.449673790581611e-06, "loss": 6.3648, "step": 173 }, { "epoch": 0.13127121840814787, "grad_norm": 7.722646236419678, "learning_rate": 5.060297685041659e-06, "loss": 5.8579, "step": 174 }, { "epoch": 0.13202565069784986, "grad_norm": 7.786721229553223, "learning_rate": 4.684610648167503e-06, "loss": 6.0884, "step": 175 }, { "epoch": 0.13278008298755187, "grad_norm": 7.266101360321045, "learning_rate": 4.322727117869951e-06, "loss": 5.5713, "step": 176 }, { "epoch": 0.13353451527725388, "grad_norm": 8.33176326751709, "learning_rate": 3.974757327377981e-06, "loss": 6.0155, "step": 177 }, { "epoch": 0.13428894756695586, "grad_norm": 7.038465976715088, "learning_rate": 3.6408072716606346e-06, "loss": 5.3413, "step": 178 }, { "epoch": 0.13504337985665787, "grad_norm": 7.81002140045166, "learning_rate": 3.3209786751399187e-06, "loss": 5.9929, "step": 179 }, { "epoch": 0.13579781214635986, "grad_norm": 7.73304557800293, "learning_rate": 3.0153689607045845e-06, "loss": 6.365, "step": 180 }, { "epoch": 0.13655224443606187, "grad_norm": 7.7701544761657715, "learning_rate": 2.724071220034158e-06, "loss": 5.4176, "step": 181 }, { "epoch": 0.13730667672576385, "grad_norm": 6.981159687042236, "learning_rate": 2.4471741852423237e-06, "loss": 4.9839, "step": 182 }, { "epoch": 0.13806110901546587, "grad_norm": 7.628045558929443, "learning_rate": 2.1847622018482283e-06, "loss": 5.467, "step": 183 }, { "epoch": 0.13881554130516785, "grad_norm": 7.262263298034668, "learning_rate": 1.9369152030840556e-06, "loss": 5.3167, "step": 184 }, { "epoch": 0.13956997359486986, "grad_norm": 8.176560401916504, "learning_rate": 1.70370868554659e-06, "loss": 5.5774, "step": 185 }, { "epoch": 0.14032440588457187, "grad_norm": 8.238152503967285, "learning_rate": 1.4852136862001764e-06, "loss": 5.6847, "step": 186 }, { "epoch": 0.14107883817427386, "grad_norm": 7.836312770843506, "learning_rate": 1.2814967607382432e-06, "loss": 5.6294, "step": 187 }, { "epoch": 0.14183327046397587, "grad_norm": 8.08753490447998, "learning_rate": 1.0926199633097157e-06, "loss": 5.3877, "step": 188 }, { "epoch": 0.14258770275367785, "grad_norm": 7.177961349487305, "learning_rate": 9.186408276168013e-07, "loss": 4.799, "step": 189 }, { "epoch": 0.14334213504337986, "grad_norm": 7.347300052642822, "learning_rate": 7.596123493895991e-07, "loss": 4.6541, "step": 190 }, { "epoch": 0.14409656733308185, "grad_norm": 8.560412406921387, "learning_rate": 6.15582970243117e-07, "loss": 5.5527, "step": 191 }, { "epoch": 0.14485099962278386, "grad_norm": 8.744789123535156, "learning_rate": 4.865965629214819e-07, "loss": 5.1754, "step": 192 }, { "epoch": 0.14560543191248584, "grad_norm": 8.127669334411621, "learning_rate": 3.7269241793390085e-07, "loss": 4.7211, "step": 193 }, { "epoch": 0.14635986420218786, "grad_norm": 8.923094749450684, "learning_rate": 2.7390523158633554e-07, "loss": 5.0364, "step": 194 }, { "epoch": 0.14711429649188984, "grad_norm": 9.52410888671875, "learning_rate": 1.9026509541272275e-07, "loss": 4.7301, "step": 195 }, { "epoch": 0.14786872878159185, "grad_norm": 9.446085929870605, "learning_rate": 1.2179748700879012e-07, "loss": 4.8496, "step": 196 }, { "epoch": 0.14862316107129386, "grad_norm": 9.303170204162598, "learning_rate": 6.852326227130834e-08, "loss": 4.8698, "step": 197 }, { "epoch": 0.14937759336099585, "grad_norm": 10.187568664550781, "learning_rate": 3.04586490452119e-08, "loss": 3.2888, "step": 198 }, { "epoch": 0.15013202565069786, "grad_norm": 7.18109655380249, "learning_rate": 7.615242180436522e-09, "loss": 2.0294, "step": 199 }, { "epoch": 0.15088645794039984, "grad_norm": 7.182096004486084, "learning_rate": 0.0, "loss": 1.7007, "step": 200 }, { "epoch": 0.15088645794039984, "eval_loss": 1.3203791379928589, "eval_runtime": 222.0788, "eval_samples_per_second": 10.055, "eval_steps_per_second": 5.03, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.865573512997765e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }