{ "best_metric": 3.2505762577056885, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.12972271769093563, "eval_steps": 100, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006486135884546781, "grad_norm": 3.839925765991211, "learning_rate": 2e-05, "loss": 5.3671, "step": 1 }, { "epoch": 0.0006486135884546781, "eval_loss": 5.341527462005615, "eval_runtime": 35.046, "eval_samples_per_second": 58.666, "eval_steps_per_second": 14.666, "step": 1 }, { "epoch": 0.0012972271769093563, "grad_norm": 3.8855299949645996, "learning_rate": 4e-05, "loss": 5.2735, "step": 2 }, { "epoch": 0.0019458407653640344, "grad_norm": 3.7578446865081787, "learning_rate": 6e-05, "loss": 4.8599, "step": 3 }, { "epoch": 0.0025944543538187126, "grad_norm": 2.840263843536377, "learning_rate": 8e-05, "loss": 4.3011, "step": 4 }, { "epoch": 0.0032430679422733905, "grad_norm": 3.2963178157806396, "learning_rate": 0.0001, "loss": 4.7049, "step": 5 }, { "epoch": 0.003891681530728069, "grad_norm": 2.3765435218811035, "learning_rate": 0.00012, "loss": 4.4563, "step": 6 }, { "epoch": 0.004540295119182747, "grad_norm": 3.072453022003174, "learning_rate": 0.00014, "loss": 4.4638, "step": 7 }, { "epoch": 0.005188908707637425, "grad_norm": 3.7620046138763428, "learning_rate": 0.00016, "loss": 4.8635, "step": 8 }, { "epoch": 0.0058375222960921035, "grad_norm": 2.274141311645508, "learning_rate": 0.00018, "loss": 4.0879, "step": 9 }, { "epoch": 0.006486135884546781, "grad_norm": 2.1749279499053955, "learning_rate": 0.0002, "loss": 4.0116, "step": 10 }, { "epoch": 0.007134749473001459, "grad_norm": 2.7090888023376465, "learning_rate": 0.0001999999477769797, "loss": 4.1352, "step": 11 }, { "epoch": 0.007783363061456138, "grad_norm": 2.8192594051361084, "learning_rate": 0.00019999979110797331, "loss": 3.7506, "step": 12 }, { "epoch": 0.008431976649910815, "grad_norm": 2.1979215145111084, "learning_rate": 0.00019999952999314444, "loss": 4.0342, "step": 13 }, { "epoch": 0.009080590238365494, "grad_norm": 1.9726805686950684, "learning_rate": 0.0001999991644327659, "loss": 3.6903, "step": 14 }, { "epoch": 0.009729203826820172, "grad_norm": 2.4161455631256104, "learning_rate": 0.0001999986944272194, "loss": 3.2464, "step": 15 }, { "epoch": 0.01037781741527485, "grad_norm": 1.8676812648773193, "learning_rate": 0.00019999811997699593, "loss": 3.7741, "step": 16 }, { "epoch": 0.011026431003729529, "grad_norm": 1.7903848886489868, "learning_rate": 0.00019999744108269542, "loss": 3.7455, "step": 17 }, { "epoch": 0.011675044592184207, "grad_norm": 2.043623924255371, "learning_rate": 0.00019999665774502696, "loss": 3.8827, "step": 18 }, { "epoch": 0.012323658180638884, "grad_norm": 1.8445640802383423, "learning_rate": 0.00019999576996480872, "loss": 3.9371, "step": 19 }, { "epoch": 0.012972271769093562, "grad_norm": 1.5927119255065918, "learning_rate": 0.000199994777742968, "loss": 3.5422, "step": 20 }, { "epoch": 0.01362088535754824, "grad_norm": 1.3268831968307495, "learning_rate": 0.0001999936810805411, "loss": 3.286, "step": 21 }, { "epoch": 0.014269498946002919, "grad_norm": 1.617830514907837, "learning_rate": 0.0001999924799786734, "loss": 3.3049, "step": 22 }, { "epoch": 0.014918112534457597, "grad_norm": 1.592024803161621, "learning_rate": 0.00019999117443861942, "loss": 3.3464, "step": 23 }, { "epoch": 0.015566726122912275, "grad_norm": 1.5516711473464966, "learning_rate": 0.00019998976446174277, "loss": 3.399, "step": 24 }, { "epoch": 0.016215339711366954, "grad_norm": 1.7870945930480957, "learning_rate": 0.00019998825004951612, "loss": 3.5775, "step": 25 }, { "epoch": 0.01686395329982163, "grad_norm": 1.6130069494247437, "learning_rate": 0.00019998663120352118, "loss": 3.6604, "step": 26 }, { "epoch": 0.01751256688827631, "grad_norm": 1.4511877298355103, "learning_rate": 0.00019998490792544883, "loss": 3.243, "step": 27 }, { "epoch": 0.018161180476730987, "grad_norm": 1.6252033710479736, "learning_rate": 0.0001999830802170989, "loss": 3.6021, "step": 28 }, { "epoch": 0.018809794065185667, "grad_norm": 1.7277567386627197, "learning_rate": 0.00019998114808038043, "loss": 3.5944, "step": 29 }, { "epoch": 0.019458407653640344, "grad_norm": 1.6731386184692383, "learning_rate": 0.00019997911151731134, "loss": 3.3969, "step": 30 }, { "epoch": 0.02010702124209502, "grad_norm": 1.6243277788162231, "learning_rate": 0.00019997697053001886, "loss": 3.035, "step": 31 }, { "epoch": 0.0207556348305497, "grad_norm": 1.773876667022705, "learning_rate": 0.00019997472512073912, "loss": 3.6559, "step": 32 }, { "epoch": 0.021404248419004377, "grad_norm": 1.47771155834198, "learning_rate": 0.00019997237529181737, "loss": 3.103, "step": 33 }, { "epoch": 0.022052862007459057, "grad_norm": 1.3901262283325195, "learning_rate": 0.0001999699210457079, "loss": 2.8133, "step": 34 }, { "epoch": 0.022701475595913734, "grad_norm": 1.6069719791412354, "learning_rate": 0.00019996736238497406, "loss": 3.3517, "step": 35 }, { "epoch": 0.023350089184368414, "grad_norm": 1.5524790287017822, "learning_rate": 0.0001999646993122883, "loss": 3.3211, "step": 36 }, { "epoch": 0.02399870277282309, "grad_norm": 1.6193372011184692, "learning_rate": 0.0001999619318304321, "loss": 3.5321, "step": 37 }, { "epoch": 0.024647316361277767, "grad_norm": 1.6295925378799438, "learning_rate": 0.00019995905994229593, "loss": 3.4257, "step": 38 }, { "epoch": 0.025295929949732447, "grad_norm": 1.9165164232254028, "learning_rate": 0.00019995608365087946, "loss": 3.9521, "step": 39 }, { "epoch": 0.025944543538187124, "grad_norm": 1.5593171119689941, "learning_rate": 0.0001999530029592912, "loss": 3.4656, "step": 40 }, { "epoch": 0.026593157126641804, "grad_norm": 1.691826581954956, "learning_rate": 0.0001999498178707489, "loss": 3.2846, "step": 41 }, { "epoch": 0.02724177071509648, "grad_norm": 1.709882140159607, "learning_rate": 0.00019994652838857917, "loss": 3.4973, "step": 42 }, { "epoch": 0.02789038430355116, "grad_norm": 1.5011334419250488, "learning_rate": 0.00019994313451621783, "loss": 3.0221, "step": 43 }, { "epoch": 0.028538997892005837, "grad_norm": 1.3794457912445068, "learning_rate": 0.0001999396362572096, "loss": 3.5346, "step": 44 }, { "epoch": 0.029187611480460517, "grad_norm": 1.6306489706039429, "learning_rate": 0.00019993603361520828, "loss": 3.4094, "step": 45 }, { "epoch": 0.029836225068915194, "grad_norm": 1.2462743520736694, "learning_rate": 0.00019993232659397666, "loss": 3.1227, "step": 46 }, { "epoch": 0.03048483865736987, "grad_norm": 1.4848414659500122, "learning_rate": 0.00019992851519738664, "loss": 3.2734, "step": 47 }, { "epoch": 0.03113345224582455, "grad_norm": 1.7278729677200317, "learning_rate": 0.00019992459942941906, "loss": 3.3184, "step": 48 }, { "epoch": 0.03178206583427923, "grad_norm": 1.4307690858840942, "learning_rate": 0.00019992057929416371, "loss": 3.5227, "step": 49 }, { "epoch": 0.03243067942273391, "grad_norm": 1.530920147895813, "learning_rate": 0.00019991645479581956, "loss": 3.1131, "step": 50 }, { "epoch": 0.03307929301118859, "grad_norm": 1.5259414911270142, "learning_rate": 0.00019991222593869444, "loss": 3.7062, "step": 51 }, { "epoch": 0.03372790659964326, "grad_norm": 1.562098503112793, "learning_rate": 0.0001999078927272052, "loss": 3.3501, "step": 52 }, { "epoch": 0.03437652018809794, "grad_norm": 1.539969801902771, "learning_rate": 0.00019990345516587775, "loss": 3.7154, "step": 53 }, { "epoch": 0.03502513377655262, "grad_norm": 1.8281248807907104, "learning_rate": 0.00019989891325934692, "loss": 3.5978, "step": 54 }, { "epoch": 0.035673747365007294, "grad_norm": 1.5411529541015625, "learning_rate": 0.00019989426701235653, "loss": 3.3103, "step": 55 }, { "epoch": 0.036322360953461974, "grad_norm": 1.3391532897949219, "learning_rate": 0.00019988951642975947, "loss": 2.9469, "step": 56 }, { "epoch": 0.036970974541916654, "grad_norm": 1.5546962022781372, "learning_rate": 0.00019988466151651748, "loss": 3.5001, "step": 57 }, { "epoch": 0.037619588130371334, "grad_norm": 1.673667550086975, "learning_rate": 0.00019987970227770135, "loss": 3.5981, "step": 58 }, { "epoch": 0.03826820171882601, "grad_norm": 1.5357543230056763, "learning_rate": 0.00019987463871849078, "loss": 3.2209, "step": 59 }, { "epoch": 0.03891681530728069, "grad_norm": 1.5452758073806763, "learning_rate": 0.0001998694708441745, "loss": 2.9202, "step": 60 }, { "epoch": 0.03956542889573537, "grad_norm": 1.5281524658203125, "learning_rate": 0.00019986419866015013, "loss": 3.1143, "step": 61 }, { "epoch": 0.04021404248419004, "grad_norm": 1.6060402393341064, "learning_rate": 0.00019985882217192423, "loss": 3.5452, "step": 62 }, { "epoch": 0.04086265607264472, "grad_norm": 1.6011849641799927, "learning_rate": 0.00019985334138511237, "loss": 3.5304, "step": 63 }, { "epoch": 0.0415112696610994, "grad_norm": 1.8889275789260864, "learning_rate": 0.00019984775630543902, "loss": 3.2174, "step": 64 }, { "epoch": 0.04215988324955408, "grad_norm": 1.7960286140441895, "learning_rate": 0.00019984206693873753, "loss": 3.6113, "step": 65 }, { "epoch": 0.042808496838008754, "grad_norm": 1.4898135662078857, "learning_rate": 0.00019983627329095028, "loss": 3.2973, "step": 66 }, { "epoch": 0.043457110426463434, "grad_norm": 1.5716447830200195, "learning_rate": 0.00019983037536812842, "loss": 3.011, "step": 67 }, { "epoch": 0.044105724014918114, "grad_norm": 1.4708287715911865, "learning_rate": 0.00019982437317643217, "loss": 3.3225, "step": 68 }, { "epoch": 0.04475433760337279, "grad_norm": 1.4985527992248535, "learning_rate": 0.00019981826672213054, "loss": 3.1602, "step": 69 }, { "epoch": 0.04540295119182747, "grad_norm": 1.4717998504638672, "learning_rate": 0.0001998120560116015, "loss": 3.2304, "step": 70 }, { "epoch": 0.04605156478028215, "grad_norm": 1.9946448802947998, "learning_rate": 0.00019980574105133187, "loss": 3.3665, "step": 71 }, { "epoch": 0.04670017836873683, "grad_norm": 1.5488526821136475, "learning_rate": 0.00019979932184791742, "loss": 3.0899, "step": 72 }, { "epoch": 0.0473487919571915, "grad_norm": 1.3588461875915527, "learning_rate": 0.00019979279840806271, "loss": 3.0037, "step": 73 }, { "epoch": 0.04799740554564618, "grad_norm": 1.368888258934021, "learning_rate": 0.00019978617073858123, "loss": 3.1688, "step": 74 }, { "epoch": 0.04864601913410086, "grad_norm": 1.7766389846801758, "learning_rate": 0.00019977943884639534, "loss": 3.0139, "step": 75 }, { "epoch": 0.049294632722555534, "grad_norm": 1.4920904636383057, "learning_rate": 0.0001997726027385362, "loss": 3.2154, "step": 76 }, { "epoch": 0.049943246311010214, "grad_norm": 1.707010269165039, "learning_rate": 0.00019976566242214388, "loss": 3.3891, "step": 77 }, { "epoch": 0.050591859899464894, "grad_norm": 1.7228715419769287, "learning_rate": 0.00019975861790446722, "loss": 3.4335, "step": 78 }, { "epoch": 0.051240473487919574, "grad_norm": 1.423966646194458, "learning_rate": 0.000199751469192864, "loss": 3.0397, "step": 79 }, { "epoch": 0.05188908707637425, "grad_norm": 1.5038292407989502, "learning_rate": 0.00019974421629480075, "loss": 3.0318, "step": 80 }, { "epoch": 0.05253770066482893, "grad_norm": 1.4808934926986694, "learning_rate": 0.00019973685921785282, "loss": 3.1422, "step": 81 }, { "epoch": 0.05318631425328361, "grad_norm": 1.4777363538742065, "learning_rate": 0.00019972939796970436, "loss": 3.3775, "step": 82 }, { "epoch": 0.05383492784173829, "grad_norm": 1.7979999780654907, "learning_rate": 0.00019972183255814843, "loss": 3.3686, "step": 83 }, { "epoch": 0.05448354143019296, "grad_norm": 1.5056769847869873, "learning_rate": 0.00019971416299108672, "loss": 3.2013, "step": 84 }, { "epoch": 0.05513215501864764, "grad_norm": 1.389157772064209, "learning_rate": 0.0001997063892765298, "loss": 3.0217, "step": 85 }, { "epoch": 0.05578076860710232, "grad_norm": 1.7881611585617065, "learning_rate": 0.00019969851142259706, "loss": 3.1829, "step": 86 }, { "epoch": 0.056429382195556994, "grad_norm": 1.3970147371292114, "learning_rate": 0.0001996905294375166, "loss": 3.059, "step": 87 }, { "epoch": 0.057077995784011674, "grad_norm": 1.655731439590454, "learning_rate": 0.0001996824433296252, "loss": 3.3041, "step": 88 }, { "epoch": 0.057726609372466355, "grad_norm": 1.7357268333435059, "learning_rate": 0.0001996742531073686, "loss": 3.4154, "step": 89 }, { "epoch": 0.058375222960921035, "grad_norm": 1.6906143426895142, "learning_rate": 0.00019966595877930106, "loss": 3.186, "step": 90 }, { "epoch": 0.05902383654937571, "grad_norm": 1.3434598445892334, "learning_rate": 0.00019965756035408573, "loss": 3.0845, "step": 91 }, { "epoch": 0.05967245013783039, "grad_norm": 1.5761163234710693, "learning_rate": 0.00019964905784049442, "loss": 3.2079, "step": 92 }, { "epoch": 0.06032106372628507, "grad_norm": 1.366127371788025, "learning_rate": 0.00019964045124740772, "loss": 3.0052, "step": 93 }, { "epoch": 0.06096967731473974, "grad_norm": 1.5109351873397827, "learning_rate": 0.0001996317405838148, "loss": 3.0047, "step": 94 }, { "epoch": 0.06161829090319442, "grad_norm": 1.2576245069503784, "learning_rate": 0.0001996229258588136, "loss": 3.1236, "step": 95 }, { "epoch": 0.0622669044916491, "grad_norm": 1.4992951154708862, "learning_rate": 0.0001996140070816108, "loss": 3.1032, "step": 96 }, { "epoch": 0.06291551808010377, "grad_norm": 1.4879511594772339, "learning_rate": 0.0001996049842615217, "loss": 3.2732, "step": 97 }, { "epoch": 0.06356413166855845, "grad_norm": 1.2676115036010742, "learning_rate": 0.00019959585740797028, "loss": 3.0975, "step": 98 }, { "epoch": 0.06421274525701313, "grad_norm": 1.826397180557251, "learning_rate": 0.00019958662653048913, "loss": 3.3263, "step": 99 }, { "epoch": 0.06486135884546781, "grad_norm": 1.3498917818069458, "learning_rate": 0.00019957729163871962, "loss": 3.1325, "step": 100 }, { "epoch": 0.06486135884546781, "eval_loss": 3.3947603702545166, "eval_runtime": 35.0846, "eval_samples_per_second": 58.601, "eval_steps_per_second": 14.65, "step": 100 }, { "epoch": 0.0655099724339225, "grad_norm": 1.445410966873169, "learning_rate": 0.00019956785274241164, "loss": 3.236, "step": 101 }, { "epoch": 0.06615858602237717, "grad_norm": 1.6176825761795044, "learning_rate": 0.00019955830985142367, "loss": 3.0255, "step": 102 }, { "epoch": 0.06680719961083184, "grad_norm": 1.61312735080719, "learning_rate": 0.000199548662975723, "loss": 3.2001, "step": 103 }, { "epoch": 0.06745581319928652, "grad_norm": 1.36332368850708, "learning_rate": 0.00019953891212538534, "loss": 3.0994, "step": 104 }, { "epoch": 0.0681044267877412, "grad_norm": 1.7672650814056396, "learning_rate": 0.00019952905731059506, "loss": 3.2036, "step": 105 }, { "epoch": 0.06875304037619588, "grad_norm": 1.484741449356079, "learning_rate": 0.00019951909854164517, "loss": 3.0926, "step": 106 }, { "epoch": 0.06940165396465056, "grad_norm": 1.6508300304412842, "learning_rate": 0.00019950903582893718, "loss": 3.3921, "step": 107 }, { "epoch": 0.07005026755310524, "grad_norm": 1.5160471200942993, "learning_rate": 0.0001994988691829812, "loss": 3.2314, "step": 108 }, { "epoch": 0.07069888114155992, "grad_norm": 1.6546351909637451, "learning_rate": 0.00019948859861439587, "loss": 3.6566, "step": 109 }, { "epoch": 0.07134749473001459, "grad_norm": 1.401430368423462, "learning_rate": 0.00019947822413390843, "loss": 3.0595, "step": 110 }, { "epoch": 0.07199610831846927, "grad_norm": 1.4741144180297852, "learning_rate": 0.0001994677457523546, "loss": 2.5762, "step": 111 }, { "epoch": 0.07264472190692395, "grad_norm": 1.4336308240890503, "learning_rate": 0.0001994571634806786, "loss": 3.2864, "step": 112 }, { "epoch": 0.07329333549537863, "grad_norm": 1.3772560358047485, "learning_rate": 0.00019944647732993324, "loss": 2.9711, "step": 113 }, { "epoch": 0.07394194908383331, "grad_norm": 1.8636032342910767, "learning_rate": 0.0001994356873112798, "loss": 3.6032, "step": 114 }, { "epoch": 0.07459056267228799, "grad_norm": 1.5523511171340942, "learning_rate": 0.00019942479343598794, "loss": 2.9138, "step": 115 }, { "epoch": 0.07523917626074267, "grad_norm": 1.3805291652679443, "learning_rate": 0.00019941379571543596, "loss": 3.1352, "step": 116 }, { "epoch": 0.07588778984919733, "grad_norm": 1.7657650709152222, "learning_rate": 0.00019940269416111054, "loss": 3.6572, "step": 117 }, { "epoch": 0.07653640343765201, "grad_norm": 1.0355079174041748, "learning_rate": 0.00019939148878460677, "loss": 3.1429, "step": 118 }, { "epoch": 0.0771850170261067, "grad_norm": 2.0980677604675293, "learning_rate": 0.00019938017959762822, "loss": 3.6367, "step": 119 }, { "epoch": 0.07783363061456137, "grad_norm": 1.554826259613037, "learning_rate": 0.00019936876661198692, "loss": 3.0011, "step": 120 }, { "epoch": 0.07848224420301605, "grad_norm": 1.5516928434371948, "learning_rate": 0.0001993572498396033, "loss": 2.8849, "step": 121 }, { "epoch": 0.07913085779147074, "grad_norm": 1.8022054433822632, "learning_rate": 0.00019934562929250612, "loss": 3.3661, "step": 122 }, { "epoch": 0.07977947137992542, "grad_norm": 1.523295521736145, "learning_rate": 0.00019933390498283262, "loss": 3.37, "step": 123 }, { "epoch": 0.08042808496838008, "grad_norm": 1.5638176202774048, "learning_rate": 0.0001993220769228284, "loss": 3.0696, "step": 124 }, { "epoch": 0.08107669855683476, "grad_norm": 1.4551162719726562, "learning_rate": 0.00019931014512484732, "loss": 2.9773, "step": 125 }, { "epoch": 0.08172531214528944, "grad_norm": 1.8125625848770142, "learning_rate": 0.00019929810960135172, "loss": 3.5409, "step": 126 }, { "epoch": 0.08237392573374412, "grad_norm": 1.598616361618042, "learning_rate": 0.0001992859703649122, "loss": 3.2174, "step": 127 }, { "epoch": 0.0830225393221988, "grad_norm": 1.6178065538406372, "learning_rate": 0.00019927372742820779, "loss": 3.1763, "step": 128 }, { "epoch": 0.08367115291065348, "grad_norm": 1.5398484468460083, "learning_rate": 0.00019926138080402566, "loss": 3.174, "step": 129 }, { "epoch": 0.08431976649910816, "grad_norm": 1.6358362436294556, "learning_rate": 0.0001992489305052614, "loss": 3.3533, "step": 130 }, { "epoch": 0.08496838008756283, "grad_norm": 1.5181739330291748, "learning_rate": 0.00019923637654491888, "loss": 3.0561, "step": 131 }, { "epoch": 0.08561699367601751, "grad_norm": 1.4952844381332397, "learning_rate": 0.00019922371893611022, "loss": 2.8942, "step": 132 }, { "epoch": 0.08626560726447219, "grad_norm": 1.55109441280365, "learning_rate": 0.00019921095769205574, "loss": 2.9548, "step": 133 }, { "epoch": 0.08691422085292687, "grad_norm": 1.258230209350586, "learning_rate": 0.00019919809282608407, "loss": 2.6494, "step": 134 }, { "epoch": 0.08756283444138155, "grad_norm": 1.5716472864151, "learning_rate": 0.0001991851243516321, "loss": 3.0505, "step": 135 }, { "epoch": 0.08821144802983623, "grad_norm": 2.0663163661956787, "learning_rate": 0.00019917205228224481, "loss": 3.699, "step": 136 }, { "epoch": 0.08886006161829091, "grad_norm": 1.6701781749725342, "learning_rate": 0.00019915887663157555, "loss": 3.3952, "step": 137 }, { "epoch": 0.08950867520674557, "grad_norm": 1.5281134843826294, "learning_rate": 0.0001991455974133857, "loss": 3.1934, "step": 138 }, { "epoch": 0.09015728879520025, "grad_norm": 1.577763557434082, "learning_rate": 0.00019913221464154488, "loss": 3.266, "step": 139 }, { "epoch": 0.09080590238365494, "grad_norm": 1.510722041130066, "learning_rate": 0.0001991187283300309, "loss": 3.0935, "step": 140 }, { "epoch": 0.09145451597210962, "grad_norm": 1.5357921123504639, "learning_rate": 0.00019910513849292962, "loss": 3.1124, "step": 141 }, { "epoch": 0.0921031295605643, "grad_norm": 1.441179633140564, "learning_rate": 0.00019909144514443518, "loss": 3.0029, "step": 142 }, { "epoch": 0.09275174314901898, "grad_norm": 1.5612683296203613, "learning_rate": 0.00019907764829884964, "loss": 2.9997, "step": 143 }, { "epoch": 0.09340035673747366, "grad_norm": 1.4757554531097412, "learning_rate": 0.00019906374797058334, "loss": 3.0236, "step": 144 }, { "epoch": 0.09404897032592832, "grad_norm": 1.4126044511795044, "learning_rate": 0.00019904974417415456, "loss": 2.7912, "step": 145 }, { "epoch": 0.094697583914383, "grad_norm": 1.751099944114685, "learning_rate": 0.00019903563692418976, "loss": 3.2925, "step": 146 }, { "epoch": 0.09534619750283768, "grad_norm": 1.6099004745483398, "learning_rate": 0.00019902142623542336, "loss": 3.0836, "step": 147 }, { "epoch": 0.09599481109129236, "grad_norm": 1.4375733137130737, "learning_rate": 0.0001990071121226979, "loss": 3.0289, "step": 148 }, { "epoch": 0.09664342467974704, "grad_norm": 1.8070977926254272, "learning_rate": 0.0001989926946009639, "loss": 3.1598, "step": 149 }, { "epoch": 0.09729203826820172, "grad_norm": 1.9119011163711548, "learning_rate": 0.00019897817368527985, "loss": 3.484, "step": 150 }, { "epoch": 0.0979406518566564, "grad_norm": 1.6774824857711792, "learning_rate": 0.00019896354939081233, "loss": 3.3953, "step": 151 }, { "epoch": 0.09858926544511107, "grad_norm": 1.5909019708633423, "learning_rate": 0.00019894882173283578, "loss": 3.3354, "step": 152 }, { "epoch": 0.09923787903356575, "grad_norm": 1.9849345684051514, "learning_rate": 0.00019893399072673268, "loss": 3.6584, "step": 153 }, { "epoch": 0.09988649262202043, "grad_norm": 1.6064544916152954, "learning_rate": 0.00019891905638799346, "loss": 2.8525, "step": 154 }, { "epoch": 0.10053510621047511, "grad_norm": 1.518021821975708, "learning_rate": 0.0001989040187322164, "loss": 3.1254, "step": 155 }, { "epoch": 0.10118371979892979, "grad_norm": 1.3452222347259521, "learning_rate": 0.00019888887777510776, "loss": 2.8711, "step": 156 }, { "epoch": 0.10183233338738447, "grad_norm": 1.5667740106582642, "learning_rate": 0.00019887363353248168, "loss": 2.8838, "step": 157 }, { "epoch": 0.10248094697583915, "grad_norm": 1.9317669868469238, "learning_rate": 0.0001988582860202601, "loss": 3.6402, "step": 158 }, { "epoch": 0.10312956056429382, "grad_norm": 1.5328751802444458, "learning_rate": 0.00019884283525447297, "loss": 3.4408, "step": 159 }, { "epoch": 0.1037781741527485, "grad_norm": 1.5413289070129395, "learning_rate": 0.00019882728125125799, "loss": 3.2537, "step": 160 }, { "epoch": 0.10442678774120318, "grad_norm": 1.7009034156799316, "learning_rate": 0.00019881162402686064, "loss": 3.2318, "step": 161 }, { "epoch": 0.10507540132965786, "grad_norm": 1.3361696004867554, "learning_rate": 0.00019879586359763436, "loss": 3.042, "step": 162 }, { "epoch": 0.10572401491811254, "grad_norm": 1.5617996454238892, "learning_rate": 0.0001987799999800402, "loss": 3.2803, "step": 163 }, { "epoch": 0.10637262850656722, "grad_norm": 1.399532437324524, "learning_rate": 0.00019876403319064715, "loss": 3.0391, "step": 164 }, { "epoch": 0.1070212420950219, "grad_norm": 1.429004192352295, "learning_rate": 0.0001987479632461319, "loss": 2.9923, "step": 165 }, { "epoch": 0.10766985568347658, "grad_norm": 1.4386272430419922, "learning_rate": 0.0001987317901632788, "loss": 2.7631, "step": 166 }, { "epoch": 0.10831846927193124, "grad_norm": 1.4610167741775513, "learning_rate": 0.00019871551395898003, "loss": 3.152, "step": 167 }, { "epoch": 0.10896708286038592, "grad_norm": 1.275974154472351, "learning_rate": 0.00019869913465023548, "loss": 3.0508, "step": 168 }, { "epoch": 0.1096156964488406, "grad_norm": 1.895013451576233, "learning_rate": 0.00019868265225415265, "loss": 3.4938, "step": 169 }, { "epoch": 0.11026431003729528, "grad_norm": 1.966009497642517, "learning_rate": 0.0001986660667879467, "loss": 3.4529, "step": 170 }, { "epoch": 0.11091292362574996, "grad_norm": 1.5443345308303833, "learning_rate": 0.0001986493782689406, "loss": 2.8802, "step": 171 }, { "epoch": 0.11156153721420464, "grad_norm": 1.4930187463760376, "learning_rate": 0.00019863258671456478, "loss": 3.1778, "step": 172 }, { "epoch": 0.11221015080265932, "grad_norm": 1.4915255308151245, "learning_rate": 0.00019861569214235737, "loss": 3.0042, "step": 173 }, { "epoch": 0.11285876439111399, "grad_norm": 1.735092043876648, "learning_rate": 0.00019859869456996407, "loss": 3.5419, "step": 174 }, { "epoch": 0.11350737797956867, "grad_norm": 1.5523368120193481, "learning_rate": 0.00019858159401513819, "loss": 3.0072, "step": 175 }, { "epoch": 0.11415599156802335, "grad_norm": 1.5719223022460938, "learning_rate": 0.00019856439049574057, "loss": 3.0198, "step": 176 }, { "epoch": 0.11480460515647803, "grad_norm": 1.835192084312439, "learning_rate": 0.0001985470840297396, "loss": 3.0267, "step": 177 }, { "epoch": 0.11545321874493271, "grad_norm": 1.9231184720993042, "learning_rate": 0.00019852967463521124, "loss": 3.1645, "step": 178 }, { "epoch": 0.11610183233338739, "grad_norm": 1.5600292682647705, "learning_rate": 0.00019851216233033884, "loss": 3.0769, "step": 179 }, { "epoch": 0.11675044592184207, "grad_norm": 1.7042049169540405, "learning_rate": 0.00019849454713341338, "loss": 3.4191, "step": 180 }, { "epoch": 0.11739905951029674, "grad_norm": 1.5651507377624512, "learning_rate": 0.0001984768290628332, "loss": 2.922, "step": 181 }, { "epoch": 0.11804767309875142, "grad_norm": 1.469074010848999, "learning_rate": 0.0001984590081371041, "loss": 2.9323, "step": 182 }, { "epoch": 0.1186962866872061, "grad_norm": 1.4901165962219238, "learning_rate": 0.00019844108437483938, "loss": 2.9711, "step": 183 }, { "epoch": 0.11934490027566078, "grad_norm": 1.219331979751587, "learning_rate": 0.00019842305779475968, "loss": 2.4844, "step": 184 }, { "epoch": 0.11999351386411546, "grad_norm": 1.562193751335144, "learning_rate": 0.00019840492841569307, "loss": 3.0347, "step": 185 }, { "epoch": 0.12064212745257014, "grad_norm": 1.3141801357269287, "learning_rate": 0.0001983866962565749, "loss": 2.8955, "step": 186 }, { "epoch": 0.12129074104102482, "grad_norm": 1.7868186235427856, "learning_rate": 0.00019836836133644802, "loss": 3.371, "step": 187 }, { "epoch": 0.12193935462947948, "grad_norm": 1.5469980239868164, "learning_rate": 0.0001983499236744625, "loss": 3.1649, "step": 188 }, { "epoch": 0.12258796821793416, "grad_norm": 1.5688115358352661, "learning_rate": 0.00019833138328987572, "loss": 3.102, "step": 189 }, { "epoch": 0.12323658180638884, "grad_norm": 1.9711377620697021, "learning_rate": 0.00019831274020205242, "loss": 3.2971, "step": 190 }, { "epoch": 0.12388519539484352, "grad_norm": 1.5055005550384521, "learning_rate": 0.00019829399443046454, "loss": 3.0587, "step": 191 }, { "epoch": 0.1245338089832982, "grad_norm": 1.5554633140563965, "learning_rate": 0.00019827514599469128, "loss": 3.2015, "step": 192 }, { "epoch": 0.12518242257175288, "grad_norm": 1.476672887802124, "learning_rate": 0.00019825619491441914, "loss": 3.0183, "step": 193 }, { "epoch": 0.12583103616020755, "grad_norm": 1.6196556091308594, "learning_rate": 0.0001982371412094417, "loss": 3.1581, "step": 194 }, { "epoch": 0.12647964974866224, "grad_norm": 1.4986144304275513, "learning_rate": 0.0001982179848996599, "loss": 3.2815, "step": 195 }, { "epoch": 0.1271282633371169, "grad_norm": 1.511244773864746, "learning_rate": 0.00019819872600508162, "loss": 3.1595, "step": 196 }, { "epoch": 0.1277768769255716, "grad_norm": 1.9068719148635864, "learning_rate": 0.00019817936454582212, "loss": 3.3151, "step": 197 }, { "epoch": 0.12842549051402627, "grad_norm": 2.2623164653778076, "learning_rate": 0.00019815990054210361, "loss": 3.6853, "step": 198 }, { "epoch": 0.12907410410248094, "grad_norm": 1.4253089427947998, "learning_rate": 0.00019814033401425554, "loss": 2.919, "step": 199 }, { "epoch": 0.12972271769093563, "grad_norm": 1.7597752809524536, "learning_rate": 0.0001981206649827143, "loss": 3.3967, "step": 200 }, { "epoch": 0.12972271769093563, "eval_loss": 3.2505762577056885, "eval_runtime": 35.0313, "eval_samples_per_second": 58.69, "eval_steps_per_second": 14.673, "step": 200 } ], "logging_steps": 1, "max_steps": 3084, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4419086016512e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }