{ "best_metric": 3.268723964691162, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.010232010845931496, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.1160054229657485e-05, "grad_norm": 9.063485145568848, "learning_rate": 1.004e-05, "loss": 6.7778, "step": 1 }, { "epoch": 5.1160054229657485e-05, "eval_loss": 3.955782651901245, "eval_runtime": 64.0286, "eval_samples_per_second": 128.536, "eval_steps_per_second": 32.142, "step": 1 }, { "epoch": 0.00010232010845931497, "grad_norm": 12.084126472473145, "learning_rate": 2.008e-05, "loss": 7.1737, "step": 2 }, { "epoch": 0.00015348016268897246, "grad_norm": 12.5093412399292, "learning_rate": 3.012e-05, "loss": 7.628, "step": 3 }, { "epoch": 0.00020464021691862994, "grad_norm": 7.164158821105957, "learning_rate": 4.016e-05, "loss": 6.7344, "step": 4 }, { "epoch": 0.0002558002711482874, "grad_norm": 6.665700435638428, "learning_rate": 5.02e-05, "loss": 7.4547, "step": 5 }, { "epoch": 0.0003069603253779449, "grad_norm": 5.877810955047607, "learning_rate": 6.024e-05, "loss": 7.6542, "step": 6 }, { "epoch": 0.00035812037960760237, "grad_norm": 6.6811909675598145, "learning_rate": 7.028e-05, "loss": 7.4317, "step": 7 }, { "epoch": 0.0004092804338372599, "grad_norm": 6.906431198120117, "learning_rate": 8.032e-05, "loss": 7.48, "step": 8 }, { "epoch": 0.00046044048806691734, "grad_norm": 7.104148864746094, "learning_rate": 9.036000000000001e-05, "loss": 7.1284, "step": 9 }, { "epoch": 0.0005116005422965748, "grad_norm": 6.226468563079834, "learning_rate": 0.0001004, "loss": 7.3501, "step": 10 }, { "epoch": 0.0005627605965262323, "grad_norm": 5.7838616371154785, "learning_rate": 9.987157894736842e-05, "loss": 6.8874, "step": 11 }, { "epoch": 0.0006139206507558898, "grad_norm": 6.224602222442627, "learning_rate": 9.934315789473684e-05, "loss": 7.1344, "step": 12 }, { "epoch": 0.0006650807049855473, "grad_norm": 6.317062854766846, "learning_rate": 9.881473684210525e-05, "loss": 7.2425, "step": 13 }, { "epoch": 0.0007162407592152047, "grad_norm": 6.039802074432373, "learning_rate": 9.828631578947369e-05, "loss": 7.3656, "step": 14 }, { "epoch": 0.0007674008134448623, "grad_norm": 7.281157970428467, "learning_rate": 9.77578947368421e-05, "loss": 7.5598, "step": 15 }, { "epoch": 0.0008185608676745198, "grad_norm": 6.830419063568115, "learning_rate": 9.722947368421052e-05, "loss": 7.2359, "step": 16 }, { "epoch": 0.0008697209219041772, "grad_norm": 6.214624404907227, "learning_rate": 9.670105263157895e-05, "loss": 7.699, "step": 17 }, { "epoch": 0.0009208809761338347, "grad_norm": 7.471404075622559, "learning_rate": 9.617263157894737e-05, "loss": 7.4639, "step": 18 }, { "epoch": 0.0009720410303634922, "grad_norm": 6.599513530731201, "learning_rate": 9.564421052631579e-05, "loss": 6.6926, "step": 19 }, { "epoch": 0.0010232010845931496, "grad_norm": 7.531161308288574, "learning_rate": 9.511578947368421e-05, "loss": 6.8589, "step": 20 }, { "epoch": 0.0010743611388228072, "grad_norm": 6.904527187347412, "learning_rate": 9.458736842105264e-05, "loss": 6.7745, "step": 21 }, { "epoch": 0.0011255211930524646, "grad_norm": 7.319172382354736, "learning_rate": 9.405894736842106e-05, "loss": 7.0661, "step": 22 }, { "epoch": 0.001176681247282122, "grad_norm": 7.7111406326293945, "learning_rate": 9.353052631578947e-05, "loss": 7.3184, "step": 23 }, { "epoch": 0.0012278413015117797, "grad_norm": 7.416265487670898, "learning_rate": 9.300210526315789e-05, "loss": 7.1042, "step": 24 }, { "epoch": 0.001279001355741437, "grad_norm": 7.195783615112305, "learning_rate": 9.247368421052631e-05, "loss": 6.9308, "step": 25 }, { "epoch": 0.0013301614099710947, "grad_norm": 7.958686351776123, "learning_rate": 9.194526315789473e-05, "loss": 7.1668, "step": 26 }, { "epoch": 0.001381321464200752, "grad_norm": 8.478933334350586, "learning_rate": 9.141684210526316e-05, "loss": 7.3691, "step": 27 }, { "epoch": 0.0014324815184304095, "grad_norm": 7.790161609649658, "learning_rate": 9.088842105263158e-05, "loss": 7.1757, "step": 28 }, { "epoch": 0.001483641572660067, "grad_norm": 8.151640892028809, "learning_rate": 9.036000000000001e-05, "loss": 7.3144, "step": 29 }, { "epoch": 0.0015348016268897245, "grad_norm": 8.261672019958496, "learning_rate": 8.983157894736843e-05, "loss": 6.9852, "step": 30 }, { "epoch": 0.001585961681119382, "grad_norm": 8.49923038482666, "learning_rate": 8.930315789473684e-05, "loss": 6.4607, "step": 31 }, { "epoch": 0.0016371217353490395, "grad_norm": 9.094529151916504, "learning_rate": 8.877473684210526e-05, "loss": 6.8413, "step": 32 }, { "epoch": 0.001688281789578697, "grad_norm": 8.042215347290039, "learning_rate": 8.824631578947368e-05, "loss": 6.0975, "step": 33 }, { "epoch": 0.0017394418438083543, "grad_norm": 9.398799896240234, "learning_rate": 8.771789473684211e-05, "loss": 7.8556, "step": 34 }, { "epoch": 0.001790601898038012, "grad_norm": 9.225303649902344, "learning_rate": 8.718947368421053e-05, "loss": 6.9043, "step": 35 }, { "epoch": 0.0018417619522676694, "grad_norm": 9.216371536254883, "learning_rate": 8.666105263157895e-05, "loss": 6.8808, "step": 36 }, { "epoch": 0.001892922006497327, "grad_norm": 9.414981842041016, "learning_rate": 8.613263157894737e-05, "loss": 6.8462, "step": 37 }, { "epoch": 0.0019440820607269844, "grad_norm": 9.462505340576172, "learning_rate": 8.560421052631578e-05, "loss": 7.0395, "step": 38 }, { "epoch": 0.001995242114956642, "grad_norm": 9.712154388427734, "learning_rate": 8.50757894736842e-05, "loss": 6.9949, "step": 39 }, { "epoch": 0.002046402169186299, "grad_norm": 9.938891410827637, "learning_rate": 8.454736842105263e-05, "loss": 6.9219, "step": 40 }, { "epoch": 0.0020975622234159566, "grad_norm": 9.004804611206055, "learning_rate": 8.401894736842106e-05, "loss": 6.9582, "step": 41 }, { "epoch": 0.0021487222776456144, "grad_norm": 10.507999420166016, "learning_rate": 8.349052631578948e-05, "loss": 7.1598, "step": 42 }, { "epoch": 0.002199882331875272, "grad_norm": 11.1851167678833, "learning_rate": 8.29621052631579e-05, "loss": 6.6988, "step": 43 }, { "epoch": 0.0022510423861049293, "grad_norm": 11.41895866394043, "learning_rate": 8.243368421052632e-05, "loss": 6.4008, "step": 44 }, { "epoch": 0.0023022024403345867, "grad_norm": 12.590848922729492, "learning_rate": 8.190526315789474e-05, "loss": 6.8194, "step": 45 }, { "epoch": 0.002353362494564244, "grad_norm": 12.010416030883789, "learning_rate": 8.137684210526315e-05, "loss": 5.9006, "step": 46 }, { "epoch": 0.002404522548793902, "grad_norm": 14.762430191040039, "learning_rate": 8.084842105263157e-05, "loss": 6.6724, "step": 47 }, { "epoch": 0.0024556826030235593, "grad_norm": 16.334985733032227, "learning_rate": 8.032e-05, "loss": 7.9172, "step": 48 }, { "epoch": 0.0025068426572532167, "grad_norm": 19.476001739501953, "learning_rate": 7.979157894736842e-05, "loss": 7.183, "step": 49 }, { "epoch": 0.002558002711482874, "grad_norm": 22.785118103027344, "learning_rate": 7.926315789473684e-05, "loss": 8.0195, "step": 50 }, { "epoch": 0.002558002711482874, "eval_loss": 3.542884588241577, "eval_runtime": 63.9961, "eval_samples_per_second": 128.602, "eval_steps_per_second": 32.158, "step": 50 }, { "epoch": 0.0026091627657125315, "grad_norm": 9.901399612426758, "learning_rate": 7.873473684210526e-05, "loss": 7.1914, "step": 51 }, { "epoch": 0.0026603228199421894, "grad_norm": 9.308576583862305, "learning_rate": 7.820631578947369e-05, "loss": 7.345, "step": 52 }, { "epoch": 0.0027114828741718468, "grad_norm": 8.333325386047363, "learning_rate": 7.76778947368421e-05, "loss": 7.2228, "step": 53 }, { "epoch": 0.002762642928401504, "grad_norm": 8.715147972106934, "learning_rate": 7.714947368421052e-05, "loss": 7.3092, "step": 54 }, { "epoch": 0.0028138029826311616, "grad_norm": 7.331003189086914, "learning_rate": 7.662105263157896e-05, "loss": 7.0676, "step": 55 }, { "epoch": 0.002864963036860819, "grad_norm": 6.238088130950928, "learning_rate": 7.609263157894737e-05, "loss": 7.1804, "step": 56 }, { "epoch": 0.0029161230910904764, "grad_norm": 5.0990214347839355, "learning_rate": 7.556421052631579e-05, "loss": 6.9847, "step": 57 }, { "epoch": 0.002967283145320134, "grad_norm": 4.999057292938232, "learning_rate": 7.503578947368421e-05, "loss": 6.7341, "step": 58 }, { "epoch": 0.0030184431995497916, "grad_norm": 4.857077598571777, "learning_rate": 7.450736842105263e-05, "loss": 6.9527, "step": 59 }, { "epoch": 0.003069603253779449, "grad_norm": 4.997248649597168, "learning_rate": 7.397894736842105e-05, "loss": 6.8346, "step": 60 }, { "epoch": 0.0031207633080091064, "grad_norm": 5.417945861816406, "learning_rate": 7.345052631578948e-05, "loss": 6.7909, "step": 61 }, { "epoch": 0.003171923362238764, "grad_norm": 5.509094715118408, "learning_rate": 7.29221052631579e-05, "loss": 6.9353, "step": 62 }, { "epoch": 0.0032230834164684217, "grad_norm": 5.141371726989746, "learning_rate": 7.239368421052631e-05, "loss": 6.4616, "step": 63 }, { "epoch": 0.003274243470698079, "grad_norm": 5.105563163757324, "learning_rate": 7.186526315789474e-05, "loss": 6.7637, "step": 64 }, { "epoch": 0.0033254035249277365, "grad_norm": 5.5586395263671875, "learning_rate": 7.133684210526316e-05, "loss": 6.8437, "step": 65 }, { "epoch": 0.003376563579157394, "grad_norm": 6.44944429397583, "learning_rate": 7.080842105263158e-05, "loss": 7.5635, "step": 66 }, { "epoch": 0.0034277236333870513, "grad_norm": 5.609267234802246, "learning_rate": 7.028e-05, "loss": 6.8409, "step": 67 }, { "epoch": 0.0034788836876167087, "grad_norm": 5.41196870803833, "learning_rate": 6.975157894736843e-05, "loss": 6.9099, "step": 68 }, { "epoch": 0.0035300437418463665, "grad_norm": 5.729660511016846, "learning_rate": 6.922315789473685e-05, "loss": 7.0213, "step": 69 }, { "epoch": 0.003581203796076024, "grad_norm": 5.930596351623535, "learning_rate": 6.869473684210527e-05, "loss": 6.9063, "step": 70 }, { "epoch": 0.0036323638503056813, "grad_norm": 5.334259510040283, "learning_rate": 6.816631578947368e-05, "loss": 6.5692, "step": 71 }, { "epoch": 0.0036835239045353387, "grad_norm": 5.7713799476623535, "learning_rate": 6.76378947368421e-05, "loss": 6.6977, "step": 72 }, { "epoch": 0.003734683958764996, "grad_norm": 5.6330060958862305, "learning_rate": 6.710947368421052e-05, "loss": 6.8352, "step": 73 }, { "epoch": 0.003785844012994654, "grad_norm": 6.375953674316406, "learning_rate": 6.658105263157894e-05, "loss": 6.8407, "step": 74 }, { "epoch": 0.0038370040672243114, "grad_norm": 5.348406791687012, "learning_rate": 6.605263157894737e-05, "loss": 6.0472, "step": 75 }, { "epoch": 0.003888164121453969, "grad_norm": 6.372185707092285, "learning_rate": 6.55242105263158e-05, "loss": 7.0043, "step": 76 }, { "epoch": 0.003939324175683627, "grad_norm": 7.178797721862793, "learning_rate": 6.499578947368422e-05, "loss": 7.0941, "step": 77 }, { "epoch": 0.003990484229913284, "grad_norm": 6.4347405433654785, "learning_rate": 6.446736842105264e-05, "loss": 6.4544, "step": 78 }, { "epoch": 0.004041644284142941, "grad_norm": 6.168828010559082, "learning_rate": 6.393894736842105e-05, "loss": 6.6922, "step": 79 }, { "epoch": 0.004092804338372598, "grad_norm": 8.033903121948242, "learning_rate": 6.341052631578947e-05, "loss": 7.0748, "step": 80 }, { "epoch": 0.004143964392602256, "grad_norm": 8.563255310058594, "learning_rate": 6.288210526315789e-05, "loss": 7.3227, "step": 81 }, { "epoch": 0.004195124446831913, "grad_norm": 7.053371429443359, "learning_rate": 6.235368421052632e-05, "loss": 6.9836, "step": 82 }, { "epoch": 0.004246284501061571, "grad_norm": 7.234203815460205, "learning_rate": 6.182526315789474e-05, "loss": 6.6639, "step": 83 }, { "epoch": 0.004297444555291229, "grad_norm": 8.589826583862305, "learning_rate": 6.129684210526316e-05, "loss": 6.4636, "step": 84 }, { "epoch": 0.004348604609520886, "grad_norm": 7.279289245605469, "learning_rate": 6.076842105263158e-05, "loss": 6.5707, "step": 85 }, { "epoch": 0.004399764663750544, "grad_norm": 8.435927391052246, "learning_rate": 6.024e-05, "loss": 7.0168, "step": 86 }, { "epoch": 0.004450924717980201, "grad_norm": 8.27949333190918, "learning_rate": 5.971157894736842e-05, "loss": 6.6047, "step": 87 }, { "epoch": 0.0045020847722098585, "grad_norm": 8.873299598693848, "learning_rate": 5.9183157894736835e-05, "loss": 7.204, "step": 88 }, { "epoch": 0.004553244826439516, "grad_norm": 8.857370376586914, "learning_rate": 5.8654736842105267e-05, "loss": 6.9514, "step": 89 }, { "epoch": 0.004604404880669173, "grad_norm": 9.384210586547852, "learning_rate": 5.8126315789473684e-05, "loss": 7.2892, "step": 90 }, { "epoch": 0.004655564934898831, "grad_norm": 8.983156204223633, "learning_rate": 5.759789473684211e-05, "loss": 6.4263, "step": 91 }, { "epoch": 0.004706724989128488, "grad_norm": 10.15875244140625, "learning_rate": 5.706947368421053e-05, "loss": 6.957, "step": 92 }, { "epoch": 0.004757885043358146, "grad_norm": 11.718667030334473, "learning_rate": 5.6541052631578945e-05, "loss": 6.7773, "step": 93 }, { "epoch": 0.004809045097587804, "grad_norm": 10.486905097961426, "learning_rate": 5.601263157894736e-05, "loss": 5.8637, "step": 94 }, { "epoch": 0.004860205151817461, "grad_norm": 10.54555606842041, "learning_rate": 5.5484210526315794e-05, "loss": 6.2613, "step": 95 }, { "epoch": 0.004911365206047119, "grad_norm": 11.35007381439209, "learning_rate": 5.495578947368421e-05, "loss": 6.6358, "step": 96 }, { "epoch": 0.004962525260276776, "grad_norm": 13.453088760375977, "learning_rate": 5.442736842105264e-05, "loss": 6.6338, "step": 97 }, { "epoch": 0.005013685314506433, "grad_norm": 12.99913215637207, "learning_rate": 5.3898947368421055e-05, "loss": 6.1289, "step": 98 }, { "epoch": 0.005064845368736091, "grad_norm": 13.534591674804688, "learning_rate": 5.337052631578947e-05, "loss": 5.9654, "step": 99 }, { "epoch": 0.005116005422965748, "grad_norm": 21.44630241394043, "learning_rate": 5.284210526315789e-05, "loss": 7.7421, "step": 100 }, { "epoch": 0.005116005422965748, "eval_loss": 3.3566391468048096, "eval_runtime": 63.8318, "eval_samples_per_second": 128.933, "eval_steps_per_second": 32.241, "step": 100 }, { "epoch": 0.005167165477195406, "grad_norm": 4.078703880310059, "learning_rate": 5.231368421052631e-05, "loss": 6.6078, "step": 101 }, { "epoch": 0.005218325531425063, "grad_norm": 4.873344898223877, "learning_rate": 5.178526315789474e-05, "loss": 6.8522, "step": 102 }, { "epoch": 0.005269485585654721, "grad_norm": 4.799216270446777, "learning_rate": 5.1256842105263165e-05, "loss": 6.7426, "step": 103 }, { "epoch": 0.005320645639884379, "grad_norm": 4.976231575012207, "learning_rate": 5.072842105263158e-05, "loss": 6.9461, "step": 104 }, { "epoch": 0.005371805694114036, "grad_norm": 5.191263675689697, "learning_rate": 5.02e-05, "loss": 6.6738, "step": 105 }, { "epoch": 0.0054229657483436935, "grad_norm": 4.8230671882629395, "learning_rate": 4.967157894736842e-05, "loss": 7.0215, "step": 106 }, { "epoch": 0.0054741258025733505, "grad_norm": 4.912371635437012, "learning_rate": 4.914315789473684e-05, "loss": 6.9355, "step": 107 }, { "epoch": 0.005525285856803008, "grad_norm": 4.830647945404053, "learning_rate": 4.861473684210526e-05, "loss": 7.2809, "step": 108 }, { "epoch": 0.005576445911032665, "grad_norm": 4.306668758392334, "learning_rate": 4.8086315789473686e-05, "loss": 6.548, "step": 109 }, { "epoch": 0.005627605965262323, "grad_norm": 4.5233354568481445, "learning_rate": 4.7557894736842104e-05, "loss": 6.7959, "step": 110 }, { "epoch": 0.005678766019491981, "grad_norm": 4.432839393615723, "learning_rate": 4.702947368421053e-05, "loss": 6.397, "step": 111 }, { "epoch": 0.005729926073721638, "grad_norm": 4.1401286125183105, "learning_rate": 4.6501052631578946e-05, "loss": 6.4082, "step": 112 }, { "epoch": 0.005781086127951296, "grad_norm": 4.629195213317871, "learning_rate": 4.5972631578947364e-05, "loss": 6.6789, "step": 113 }, { "epoch": 0.005832246182180953, "grad_norm": 4.584342956542969, "learning_rate": 4.544421052631579e-05, "loss": 6.9578, "step": 114 }, { "epoch": 0.005883406236410611, "grad_norm": 5.518434524536133, "learning_rate": 4.4915789473684213e-05, "loss": 7.2951, "step": 115 }, { "epoch": 0.005934566290640268, "grad_norm": 5.404613494873047, "learning_rate": 4.438736842105263e-05, "loss": 6.7455, "step": 116 }, { "epoch": 0.005985726344869925, "grad_norm": 5.240906238555908, "learning_rate": 4.3858947368421056e-05, "loss": 6.6538, "step": 117 }, { "epoch": 0.006036886399099583, "grad_norm": 5.181666374206543, "learning_rate": 4.3330526315789474e-05, "loss": 6.4923, "step": 118 }, { "epoch": 0.00608804645332924, "grad_norm": 4.942501068115234, "learning_rate": 4.280210526315789e-05, "loss": 6.9221, "step": 119 }, { "epoch": 0.006139206507558898, "grad_norm": 5.363955020904541, "learning_rate": 4.2273684210526317e-05, "loss": 6.5048, "step": 120 }, { "epoch": 0.006190366561788556, "grad_norm": 5.2177205085754395, "learning_rate": 4.174526315789474e-05, "loss": 6.3778, "step": 121 }, { "epoch": 0.006241526616018213, "grad_norm": 5.227365493774414, "learning_rate": 4.121684210526316e-05, "loss": 6.63, "step": 122 }, { "epoch": 0.006292686670247871, "grad_norm": 5.972108364105225, "learning_rate": 4.068842105263158e-05, "loss": 6.8807, "step": 123 }, { "epoch": 0.006343846724477528, "grad_norm": 5.9163665771484375, "learning_rate": 4.016e-05, "loss": 7.0169, "step": 124 }, { "epoch": 0.0063950067787071855, "grad_norm": 5.8213582038879395, "learning_rate": 3.963157894736842e-05, "loss": 6.8512, "step": 125 }, { "epoch": 0.006446166832936843, "grad_norm": 5.801515579223633, "learning_rate": 3.9103157894736844e-05, "loss": 6.5021, "step": 126 }, { "epoch": 0.0064973268871665, "grad_norm": 7.357461452484131, "learning_rate": 3.857473684210526e-05, "loss": 7.2509, "step": 127 }, { "epoch": 0.006548486941396158, "grad_norm": 6.292891979217529, "learning_rate": 3.804631578947369e-05, "loss": 6.6632, "step": 128 }, { "epoch": 0.006599646995625815, "grad_norm": 7.3089470863342285, "learning_rate": 3.7517894736842105e-05, "loss": 6.5396, "step": 129 }, { "epoch": 0.006650807049855473, "grad_norm": 6.743093967437744, "learning_rate": 3.698947368421052e-05, "loss": 6.3143, "step": 130 }, { "epoch": 0.00670196710408513, "grad_norm": 7.874321937561035, "learning_rate": 3.646105263157895e-05, "loss": 6.9193, "step": 131 }, { "epoch": 0.006753127158314788, "grad_norm": 8.004316329956055, "learning_rate": 3.593263157894737e-05, "loss": 7.1274, "step": 132 }, { "epoch": 0.006804287212544446, "grad_norm": 8.134577751159668, "learning_rate": 3.540421052631579e-05, "loss": 6.7643, "step": 133 }, { "epoch": 0.0068554472667741026, "grad_norm": 9.081040382385254, "learning_rate": 3.4875789473684215e-05, "loss": 7.4408, "step": 134 }, { "epoch": 0.00690660732100376, "grad_norm": 8.707175254821777, "learning_rate": 3.434736842105263e-05, "loss": 7.259, "step": 135 }, { "epoch": 0.006957767375233417, "grad_norm": 7.119297981262207, "learning_rate": 3.381894736842105e-05, "loss": 6.0159, "step": 136 }, { "epoch": 0.007008927429463075, "grad_norm": 7.995966911315918, "learning_rate": 3.329052631578947e-05, "loss": 6.9343, "step": 137 }, { "epoch": 0.007060087483692733, "grad_norm": 8.485739707946777, "learning_rate": 3.27621052631579e-05, "loss": 6.6603, "step": 138 }, { "epoch": 0.00711124753792239, "grad_norm": 8.533065795898438, "learning_rate": 3.223368421052632e-05, "loss": 6.6126, "step": 139 }, { "epoch": 0.007162407592152048, "grad_norm": 8.851434707641602, "learning_rate": 3.1705263157894736e-05, "loss": 6.6025, "step": 140 }, { "epoch": 0.007213567646381705, "grad_norm": 9.988713264465332, "learning_rate": 3.117684210526316e-05, "loss": 6.7507, "step": 141 }, { "epoch": 0.007264727700611363, "grad_norm": 8.809107780456543, "learning_rate": 3.064842105263158e-05, "loss": 6.4291, "step": 142 }, { "epoch": 0.0073158877548410205, "grad_norm": 9.391690254211426, "learning_rate": 3.012e-05, "loss": 6.8171, "step": 143 }, { "epoch": 0.0073670478090706775, "grad_norm": 9.753740310668945, "learning_rate": 2.9591578947368418e-05, "loss": 6.3895, "step": 144 }, { "epoch": 0.007418207863300335, "grad_norm": 11.239912986755371, "learning_rate": 2.9063157894736842e-05, "loss": 7.2081, "step": 145 }, { "epoch": 0.007469367917529992, "grad_norm": 11.756484985351562, "learning_rate": 2.8534736842105264e-05, "loss": 7.7236, "step": 146 }, { "epoch": 0.00752052797175965, "grad_norm": 13.649311065673828, "learning_rate": 2.800631578947368e-05, "loss": 6.8186, "step": 147 }, { "epoch": 0.007571688025989308, "grad_norm": 13.54671859741211, "learning_rate": 2.7477894736842106e-05, "loss": 7.2639, "step": 148 }, { "epoch": 0.007622848080218965, "grad_norm": 17.210609436035156, "learning_rate": 2.6949473684210527e-05, "loss": 6.5821, "step": 149 }, { "epoch": 0.007674008134448623, "grad_norm": 20.728883743286133, "learning_rate": 2.6421052631578945e-05, "loss": 7.2953, "step": 150 }, { "epoch": 0.007674008134448623, "eval_loss": 3.283073663711548, "eval_runtime": 63.7554, "eval_samples_per_second": 129.087, "eval_steps_per_second": 32.28, "step": 150 }, { "epoch": 0.00772516818867828, "grad_norm": 2.60485577583313, "learning_rate": 2.589263157894737e-05, "loss": 6.6848, "step": 151 }, { "epoch": 0.007776328242907938, "grad_norm": 3.393989086151123, "learning_rate": 2.536421052631579e-05, "loss": 6.8256, "step": 152 }, { "epoch": 0.007827488297137595, "grad_norm": 3.4821159839630127, "learning_rate": 2.483578947368421e-05, "loss": 6.7886, "step": 153 }, { "epoch": 0.007878648351367253, "grad_norm": 3.74564528465271, "learning_rate": 2.430736842105263e-05, "loss": 6.4158, "step": 154 }, { "epoch": 0.00792980840559691, "grad_norm": 3.815945863723755, "learning_rate": 2.3778947368421052e-05, "loss": 6.9564, "step": 155 }, { "epoch": 0.007980968459826567, "grad_norm": 3.868741750717163, "learning_rate": 2.3250526315789473e-05, "loss": 6.3913, "step": 156 }, { "epoch": 0.008032128514056224, "grad_norm": 4.156867027282715, "learning_rate": 2.2722105263157894e-05, "loss": 6.8321, "step": 157 }, { "epoch": 0.008083288568285883, "grad_norm": 4.067819595336914, "learning_rate": 2.2193684210526316e-05, "loss": 6.4717, "step": 158 }, { "epoch": 0.00813444862251554, "grad_norm": 4.351383209228516, "learning_rate": 2.1665263157894737e-05, "loss": 6.5168, "step": 159 }, { "epoch": 0.008185608676745197, "grad_norm": 4.422007083892822, "learning_rate": 2.1136842105263158e-05, "loss": 6.9062, "step": 160 }, { "epoch": 0.008236768730974856, "grad_norm": 4.846063613891602, "learning_rate": 2.060842105263158e-05, "loss": 7.1057, "step": 161 }, { "epoch": 0.008287928785204512, "grad_norm": 4.516020774841309, "learning_rate": 2.008e-05, "loss": 6.7208, "step": 162 }, { "epoch": 0.00833908883943417, "grad_norm": 4.626501083374023, "learning_rate": 1.9551578947368422e-05, "loss": 6.6746, "step": 163 }, { "epoch": 0.008390248893663826, "grad_norm": 4.8145432472229, "learning_rate": 1.9023157894736843e-05, "loss": 6.7835, "step": 164 }, { "epoch": 0.008441408947893485, "grad_norm": 5.256274700164795, "learning_rate": 1.849473684210526e-05, "loss": 6.7433, "step": 165 }, { "epoch": 0.008492569002123142, "grad_norm": 4.914257526397705, "learning_rate": 1.7966315789473686e-05, "loss": 6.5287, "step": 166 }, { "epoch": 0.008543729056352799, "grad_norm": 5.295276165008545, "learning_rate": 1.7437894736842107e-05, "loss": 6.8411, "step": 167 }, { "epoch": 0.008594889110582458, "grad_norm": 5.221895694732666, "learning_rate": 1.6909473684210525e-05, "loss": 6.9334, "step": 168 }, { "epoch": 0.008646049164812115, "grad_norm": 6.2948808670043945, "learning_rate": 1.638105263157895e-05, "loss": 6.8033, "step": 169 }, { "epoch": 0.008697209219041772, "grad_norm": 5.295489311218262, "learning_rate": 1.5852631578947368e-05, "loss": 6.2837, "step": 170 }, { "epoch": 0.00874836927327143, "grad_norm": 5.55963134765625, "learning_rate": 1.532421052631579e-05, "loss": 6.435, "step": 171 }, { "epoch": 0.008799529327501087, "grad_norm": 6.1554365158081055, "learning_rate": 1.4795789473684209e-05, "loss": 7.6007, "step": 172 }, { "epoch": 0.008850689381730744, "grad_norm": 5.635270118713379, "learning_rate": 1.4267368421052632e-05, "loss": 6.5427, "step": 173 }, { "epoch": 0.008901849435960401, "grad_norm": 6.0492167472839355, "learning_rate": 1.3738947368421053e-05, "loss": 6.7604, "step": 174 }, { "epoch": 0.00895300949019006, "grad_norm": 6.073276042938232, "learning_rate": 1.3210526315789473e-05, "loss": 6.5833, "step": 175 }, { "epoch": 0.009004169544419717, "grad_norm": 6.390398979187012, "learning_rate": 1.2682105263157896e-05, "loss": 7.1092, "step": 176 }, { "epoch": 0.009055329598649374, "grad_norm": 6.145637035369873, "learning_rate": 1.2153684210526315e-05, "loss": 6.8361, "step": 177 }, { "epoch": 0.009106489652879033, "grad_norm": 6.2100629806518555, "learning_rate": 1.1625263157894737e-05, "loss": 6.5985, "step": 178 }, { "epoch": 0.00915764970710869, "grad_norm": 7.055151462554932, "learning_rate": 1.1096842105263158e-05, "loss": 5.8342, "step": 179 }, { "epoch": 0.009208809761338347, "grad_norm": 7.049131870269775, "learning_rate": 1.0568421052631579e-05, "loss": 7.0179, "step": 180 }, { "epoch": 0.009259969815568005, "grad_norm": 7.064737796783447, "learning_rate": 1.004e-05, "loss": 6.5794, "step": 181 }, { "epoch": 0.009311129869797662, "grad_norm": 7.239538669586182, "learning_rate": 9.511578947368422e-06, "loss": 7.1518, "step": 182 }, { "epoch": 0.00936228992402732, "grad_norm": 6.966012001037598, "learning_rate": 8.983157894736843e-06, "loss": 6.2703, "step": 183 }, { "epoch": 0.009413449978256976, "grad_norm": 8.423846244812012, "learning_rate": 8.454736842105263e-06, "loss": 6.9322, "step": 184 }, { "epoch": 0.009464610032486635, "grad_norm": 7.523478984832764, "learning_rate": 7.926315789473684e-06, "loss": 6.1557, "step": 185 }, { "epoch": 0.009515770086716292, "grad_norm": 8.170174598693848, "learning_rate": 7.397894736842104e-06, "loss": 6.5374, "step": 186 }, { "epoch": 0.009566930140945949, "grad_norm": 8.937209129333496, "learning_rate": 6.8694736842105265e-06, "loss": 6.0916, "step": 187 }, { "epoch": 0.009618090195175608, "grad_norm": 9.289628028869629, "learning_rate": 6.341052631578948e-06, "loss": 6.795, "step": 188 }, { "epoch": 0.009669250249405265, "grad_norm": 8.454782485961914, "learning_rate": 5.812631578947368e-06, "loss": 7.3334, "step": 189 }, { "epoch": 0.009720410303634922, "grad_norm": 9.165085792541504, "learning_rate": 5.2842105263157896e-06, "loss": 6.4959, "step": 190 }, { "epoch": 0.009771570357864579, "grad_norm": 9.678885459899902, "learning_rate": 4.755789473684211e-06, "loss": 6.7935, "step": 191 }, { "epoch": 0.009822730412094237, "grad_norm": 10.571404457092285, "learning_rate": 4.227368421052631e-06, "loss": 7.1292, "step": 192 }, { "epoch": 0.009873890466323894, "grad_norm": 10.899333000183105, "learning_rate": 3.698947368421052e-06, "loss": 6.9073, "step": 193 }, { "epoch": 0.009925050520553551, "grad_norm": 10.557672500610352, "learning_rate": 3.170526315789474e-06, "loss": 6.1996, "step": 194 }, { "epoch": 0.00997621057478321, "grad_norm": 12.432440757751465, "learning_rate": 2.6421052631578948e-06, "loss": 7.3089, "step": 195 }, { "epoch": 0.010027370629012867, "grad_norm": 11.616739273071289, "learning_rate": 2.1136842105263157e-06, "loss": 6.8059, "step": 196 }, { "epoch": 0.010078530683242524, "grad_norm": 12.100770950317383, "learning_rate": 1.585263157894737e-06, "loss": 7.8332, "step": 197 }, { "epoch": 0.010129690737472183, "grad_norm": 13.274076461791992, "learning_rate": 1.0568421052631578e-06, "loss": 6.9364, "step": 198 }, { "epoch": 0.01018085079170184, "grad_norm": 14.706048011779785, "learning_rate": 5.284210526315789e-07, "loss": 6.9485, "step": 199 }, { "epoch": 0.010232010845931496, "grad_norm": 20.332195281982422, "learning_rate": 0.0, "loss": 7.482, "step": 200 }, { "epoch": 0.010232010845931496, "eval_loss": 3.268723964691162, "eval_runtime": 63.6451, "eval_samples_per_second": 129.311, "eval_steps_per_second": 32.336, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 945204718731264.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }