Fill-Mask
Transformers
PyTorch
bert
Inference Endpoints
manchuBERT / trainer_state.json
seemdog's picture
Rename checkpoint/trainer_state.json to trainer_state.json
55ddb41 verified
{
"best_metric": 2.4363410472869873,
"best_model_checkpoint": "/home/seemdog/manchu_BERT/1002_BERT_DA_1.0/checkpoint-86000",
"epoch": 9.964620917517031,
"global_step": 213000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"learning_rate": 4.9766081871345035e-05,
"loss": 6.1581,
"step": 1000
},
{
"epoch": 0.05,
"eval_loss": 5.5598931312561035,
"eval_runtime": 54.891,
"eval_samples_per_second": 120.767,
"eval_steps_per_second": 1.895,
"step": 1000
},
{
"epoch": 0.09,
"learning_rate": 4.953216374269006e-05,
"loss": 5.3713,
"step": 2000
},
{
"epoch": 0.09,
"eval_loss": 4.843267440795898,
"eval_runtime": 54.8945,
"eval_samples_per_second": 120.759,
"eval_steps_per_second": 1.895,
"step": 2000
},
{
"epoch": 0.14,
"learning_rate": 4.9298245614035086e-05,
"loss": 4.7624,
"step": 3000
},
{
"epoch": 0.14,
"eval_loss": 4.427705764770508,
"eval_runtime": 54.9095,
"eval_samples_per_second": 120.726,
"eval_steps_per_second": 1.894,
"step": 3000
},
{
"epoch": 0.19,
"learning_rate": 4.906432748538012e-05,
"loss": 4.2884,
"step": 4000
},
{
"epoch": 0.19,
"eval_loss": 4.152446746826172,
"eval_runtime": 54.9536,
"eval_samples_per_second": 120.629,
"eval_steps_per_second": 1.893,
"step": 4000
},
{
"epoch": 0.23,
"learning_rate": 4.883040935672515e-05,
"loss": 3.908,
"step": 5000
},
{
"epoch": 0.23,
"eval_loss": 3.943004608154297,
"eval_runtime": 54.9769,
"eval_samples_per_second": 120.578,
"eval_steps_per_second": 1.892,
"step": 5000
},
{
"epoch": 0.28,
"learning_rate": 4.859649122807018e-05,
"loss": 3.6357,
"step": 6000
},
{
"epoch": 0.28,
"eval_loss": 3.7840378284454346,
"eval_runtime": 54.9612,
"eval_samples_per_second": 120.612,
"eval_steps_per_second": 1.892,
"step": 6000
},
{
"epoch": 0.33,
"learning_rate": 4.836257309941521e-05,
"loss": 3.442,
"step": 7000
},
{
"epoch": 0.33,
"eval_loss": 3.6515119075775146,
"eval_runtime": 55.0182,
"eval_samples_per_second": 120.487,
"eval_steps_per_second": 1.89,
"step": 7000
},
{
"epoch": 0.37,
"learning_rate": 4.8128654970760235e-05,
"loss": 3.2982,
"step": 8000
},
{
"epoch": 0.37,
"eval_loss": 3.5147831439971924,
"eval_runtime": 54.9459,
"eval_samples_per_second": 120.646,
"eval_steps_per_second": 1.893,
"step": 8000
},
{
"epoch": 0.42,
"learning_rate": 4.789473684210526e-05,
"loss": 3.1681,
"step": 9000
},
{
"epoch": 0.42,
"eval_loss": 3.4453866481781006,
"eval_runtime": 54.9741,
"eval_samples_per_second": 120.584,
"eval_steps_per_second": 1.892,
"step": 9000
},
{
"epoch": 0.47,
"learning_rate": 4.7660818713450294e-05,
"loss": 3.0515,
"step": 10000
},
{
"epoch": 0.47,
"eval_loss": 3.3482985496520996,
"eval_runtime": 54.9922,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 10000
},
{
"epoch": 0.51,
"learning_rate": 4.7426900584795326e-05,
"loss": 2.9408,
"step": 11000
},
{
"epoch": 0.51,
"eval_loss": 3.274308919906616,
"eval_runtime": 55.0307,
"eval_samples_per_second": 120.46,
"eval_steps_per_second": 1.89,
"step": 11000
},
{
"epoch": 0.56,
"learning_rate": 4.719298245614036e-05,
"loss": 2.8601,
"step": 12000
},
{
"epoch": 0.56,
"eval_loss": 3.2094714641571045,
"eval_runtime": 54.9444,
"eval_samples_per_second": 120.649,
"eval_steps_per_second": 1.893,
"step": 12000
},
{
"epoch": 0.61,
"learning_rate": 4.695906432748538e-05,
"loss": 2.7866,
"step": 13000
},
{
"epoch": 0.61,
"eval_loss": 3.1299281120300293,
"eval_runtime": 54.9484,
"eval_samples_per_second": 120.64,
"eval_steps_per_second": 1.893,
"step": 13000
},
{
"epoch": 0.65,
"learning_rate": 4.672514619883041e-05,
"loss": 2.7094,
"step": 14000
},
{
"epoch": 0.65,
"eval_loss": 3.096022844314575,
"eval_runtime": 55.155,
"eval_samples_per_second": 120.189,
"eval_steps_per_second": 1.886,
"step": 14000
},
{
"epoch": 0.7,
"learning_rate": 4.649122807017544e-05,
"loss": 2.6424,
"step": 15000
},
{
"epoch": 0.7,
"eval_loss": 3.060807228088379,
"eval_runtime": 55.1935,
"eval_samples_per_second": 120.105,
"eval_steps_per_second": 1.884,
"step": 15000
},
{
"epoch": 0.75,
"learning_rate": 4.625730994152047e-05,
"loss": 2.5729,
"step": 16000
},
{
"epoch": 0.75,
"eval_loss": 3.0170695781707764,
"eval_runtime": 55.2064,
"eval_samples_per_second": 120.077,
"eval_steps_per_second": 1.884,
"step": 16000
},
{
"epoch": 0.8,
"learning_rate": 4.60233918128655e-05,
"loss": 2.5108,
"step": 17000
},
{
"epoch": 0.8,
"eval_loss": 2.9729015827178955,
"eval_runtime": 55.2048,
"eval_samples_per_second": 120.08,
"eval_steps_per_second": 1.884,
"step": 17000
},
{
"epoch": 0.84,
"learning_rate": 4.5789473684210527e-05,
"loss": 2.4538,
"step": 18000
},
{
"epoch": 0.84,
"eval_loss": 2.9392964839935303,
"eval_runtime": 55.2009,
"eval_samples_per_second": 120.089,
"eval_steps_per_second": 1.884,
"step": 18000
},
{
"epoch": 0.89,
"learning_rate": 4.555555555555556e-05,
"loss": 2.3941,
"step": 19000
},
{
"epoch": 0.89,
"eval_loss": 2.900946617126465,
"eval_runtime": 55.1868,
"eval_samples_per_second": 120.119,
"eval_steps_per_second": 1.885,
"step": 19000
},
{
"epoch": 0.94,
"learning_rate": 4.5321637426900585e-05,
"loss": 2.3341,
"step": 20000
},
{
"epoch": 0.94,
"eval_loss": 2.87040376663208,
"eval_runtime": 55.0611,
"eval_samples_per_second": 120.393,
"eval_steps_per_second": 1.889,
"step": 20000
},
{
"epoch": 0.98,
"learning_rate": 4.508771929824562e-05,
"loss": 2.2797,
"step": 21000
},
{
"epoch": 0.98,
"eval_loss": 2.8554604053497314,
"eval_runtime": 54.9944,
"eval_samples_per_second": 120.54,
"eval_steps_per_second": 1.891,
"step": 21000
},
{
"epoch": 1.03,
"learning_rate": 4.485380116959065e-05,
"loss": 2.2284,
"step": 22000
},
{
"epoch": 1.03,
"eval_loss": 2.8280177116394043,
"eval_runtime": 54.9695,
"eval_samples_per_second": 120.594,
"eval_steps_per_second": 1.892,
"step": 22000
},
{
"epoch": 1.08,
"learning_rate": 4.4619883040935676e-05,
"loss": 2.1651,
"step": 23000
},
{
"epoch": 1.08,
"eval_loss": 2.7877776622772217,
"eval_runtime": 54.9786,
"eval_samples_per_second": 120.574,
"eval_steps_per_second": 1.892,
"step": 23000
},
{
"epoch": 1.12,
"learning_rate": 4.43859649122807e-05,
"loss": 2.1267,
"step": 24000
},
{
"epoch": 1.12,
"eval_loss": 2.7796318531036377,
"eval_runtime": 55.0112,
"eval_samples_per_second": 120.503,
"eval_steps_per_second": 1.891,
"step": 24000
},
{
"epoch": 1.17,
"learning_rate": 4.4152046783625734e-05,
"loss": 2.0887,
"step": 25000
},
{
"epoch": 1.17,
"eval_loss": 2.7155935764312744,
"eval_runtime": 54.9846,
"eval_samples_per_second": 120.561,
"eval_steps_per_second": 1.891,
"step": 25000
},
{
"epoch": 1.22,
"learning_rate": 4.3918128654970766e-05,
"loss": 2.0477,
"step": 26000
},
{
"epoch": 1.22,
"eval_loss": 2.7347090244293213,
"eval_runtime": 54.9797,
"eval_samples_per_second": 120.572,
"eval_steps_per_second": 1.892,
"step": 26000
},
{
"epoch": 1.26,
"learning_rate": 4.368421052631579e-05,
"loss": 2.0055,
"step": 27000
},
{
"epoch": 1.26,
"eval_loss": 2.7260184288024902,
"eval_runtime": 54.9686,
"eval_samples_per_second": 120.596,
"eval_steps_per_second": 1.892,
"step": 27000
},
{
"epoch": 1.31,
"learning_rate": 4.345029239766082e-05,
"loss": 1.9738,
"step": 28000
},
{
"epoch": 1.31,
"eval_loss": 2.7053301334381104,
"eval_runtime": 54.975,
"eval_samples_per_second": 120.582,
"eval_steps_per_second": 1.892,
"step": 28000
},
{
"epoch": 1.36,
"learning_rate": 4.321637426900585e-05,
"loss": 1.9336,
"step": 29000
},
{
"epoch": 1.36,
"eval_loss": 2.6540746688842773,
"eval_runtime": 54.9866,
"eval_samples_per_second": 120.557,
"eval_steps_per_second": 1.891,
"step": 29000
},
{
"epoch": 1.4,
"learning_rate": 4.298245614035088e-05,
"loss": 1.9008,
"step": 30000
},
{
"epoch": 1.4,
"eval_loss": 2.6721866130828857,
"eval_runtime": 54.9707,
"eval_samples_per_second": 120.592,
"eval_steps_per_second": 1.892,
"step": 30000
},
{
"epoch": 1.45,
"learning_rate": 4.274853801169591e-05,
"loss": 1.8603,
"step": 31000
},
{
"epoch": 1.45,
"eval_loss": 2.6387619972229004,
"eval_runtime": 54.9719,
"eval_samples_per_second": 120.589,
"eval_steps_per_second": 1.892,
"step": 31000
},
{
"epoch": 1.5,
"learning_rate": 4.251461988304094e-05,
"loss": 1.8291,
"step": 32000
},
{
"epoch": 1.5,
"eval_loss": 2.640782594680786,
"eval_runtime": 54.9643,
"eval_samples_per_second": 120.606,
"eval_steps_per_second": 1.892,
"step": 32000
},
{
"epoch": 1.54,
"learning_rate": 4.228070175438597e-05,
"loss": 1.8059,
"step": 33000
},
{
"epoch": 1.54,
"eval_loss": 2.614128589630127,
"eval_runtime": 54.9538,
"eval_samples_per_second": 120.629,
"eval_steps_per_second": 1.893,
"step": 33000
},
{
"epoch": 1.59,
"learning_rate": 4.204678362573099e-05,
"loss": 1.7663,
"step": 34000
},
{
"epoch": 1.59,
"eval_loss": 2.618607997894287,
"eval_runtime": 55.0051,
"eval_samples_per_second": 120.516,
"eval_steps_per_second": 1.891,
"step": 34000
},
{
"epoch": 1.64,
"learning_rate": 4.1812865497076025e-05,
"loss": 1.7322,
"step": 35000
},
{
"epoch": 1.64,
"eval_loss": 2.6462574005126953,
"eval_runtime": 54.9802,
"eval_samples_per_second": 120.571,
"eval_steps_per_second": 1.892,
"step": 35000
},
{
"epoch": 1.68,
"learning_rate": 4.157894736842106e-05,
"loss": 1.7187,
"step": 36000
},
{
"epoch": 1.68,
"eval_loss": 2.5989272594451904,
"eval_runtime": 54.9619,
"eval_samples_per_second": 120.611,
"eval_steps_per_second": 1.892,
"step": 36000
},
{
"epoch": 1.73,
"learning_rate": 4.134502923976608e-05,
"loss": 1.6852,
"step": 37000
},
{
"epoch": 1.73,
"eval_loss": 2.5719058513641357,
"eval_runtime": 54.9667,
"eval_samples_per_second": 120.6,
"eval_steps_per_second": 1.892,
"step": 37000
},
{
"epoch": 1.78,
"learning_rate": 4.111111111111111e-05,
"loss": 1.6649,
"step": 38000
},
{
"epoch": 1.78,
"eval_loss": 2.57804012298584,
"eval_runtime": 54.9675,
"eval_samples_per_second": 120.598,
"eval_steps_per_second": 1.892,
"step": 38000
},
{
"epoch": 1.82,
"learning_rate": 4.087719298245614e-05,
"loss": 1.6285,
"step": 39000
},
{
"epoch": 1.82,
"eval_loss": 2.5606088638305664,
"eval_runtime": 55.1929,
"eval_samples_per_second": 120.106,
"eval_steps_per_second": 1.884,
"step": 39000
},
{
"epoch": 1.87,
"learning_rate": 4.0643274853801174e-05,
"loss": 1.6033,
"step": 40000
},
{
"epoch": 1.87,
"eval_loss": 2.570094585418701,
"eval_runtime": 55.1572,
"eval_samples_per_second": 120.184,
"eval_steps_per_second": 1.886,
"step": 40000
},
{
"epoch": 1.92,
"learning_rate": 4.04093567251462e-05,
"loss": 1.5833,
"step": 41000
},
{
"epoch": 1.92,
"eval_loss": 2.5516393184661865,
"eval_runtime": 55.1223,
"eval_samples_per_second": 120.26,
"eval_steps_per_second": 1.887,
"step": 41000
},
{
"epoch": 1.96,
"learning_rate": 4.017543859649123e-05,
"loss": 1.5701,
"step": 42000
},
{
"epoch": 1.96,
"eval_loss": 2.544060707092285,
"eval_runtime": 54.9919,
"eval_samples_per_second": 120.545,
"eval_steps_per_second": 1.891,
"step": 42000
},
{
"epoch": 2.01,
"learning_rate": 3.994152046783626e-05,
"loss": 1.5252,
"step": 43000
},
{
"epoch": 2.01,
"eval_loss": 2.545295476913452,
"eval_runtime": 54.9924,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 43000
},
{
"epoch": 2.06,
"learning_rate": 3.970760233918129e-05,
"loss": 1.5019,
"step": 44000
},
{
"epoch": 2.06,
"eval_loss": 2.547807216644287,
"eval_runtime": 55.007,
"eval_samples_per_second": 120.512,
"eval_steps_per_second": 1.891,
"step": 44000
},
{
"epoch": 2.11,
"learning_rate": 3.9473684210526316e-05,
"loss": 1.4789,
"step": 45000
},
{
"epoch": 2.11,
"eval_loss": 2.541635036468506,
"eval_runtime": 54.9822,
"eval_samples_per_second": 120.566,
"eval_steps_per_second": 1.892,
"step": 45000
},
{
"epoch": 2.15,
"learning_rate": 3.923976608187135e-05,
"loss": 1.4611,
"step": 46000
},
{
"epoch": 2.15,
"eval_loss": 2.526390790939331,
"eval_runtime": 54.9826,
"eval_samples_per_second": 120.565,
"eval_steps_per_second": 1.892,
"step": 46000
},
{
"epoch": 2.2,
"learning_rate": 3.9005847953216374e-05,
"loss": 1.4413,
"step": 47000
},
{
"epoch": 2.2,
"eval_loss": 2.5193886756896973,
"eval_runtime": 54.9793,
"eval_samples_per_second": 120.573,
"eval_steps_per_second": 1.892,
"step": 47000
},
{
"epoch": 2.25,
"learning_rate": 3.877192982456141e-05,
"loss": 1.4106,
"step": 48000
},
{
"epoch": 2.25,
"eval_loss": 2.504810094833374,
"eval_runtime": 55.0248,
"eval_samples_per_second": 120.473,
"eval_steps_per_second": 1.89,
"step": 48000
},
{
"epoch": 2.29,
"learning_rate": 3.853801169590643e-05,
"loss": 1.3928,
"step": 49000
},
{
"epoch": 2.29,
"eval_loss": 2.5266056060791016,
"eval_runtime": 55.1287,
"eval_samples_per_second": 120.246,
"eval_steps_per_second": 1.886,
"step": 49000
},
{
"epoch": 2.34,
"learning_rate": 3.8304093567251465e-05,
"loss": 1.3857,
"step": 50000
},
{
"epoch": 2.34,
"eval_loss": 2.5026743412017822,
"eval_runtime": 55.0968,
"eval_samples_per_second": 120.315,
"eval_steps_per_second": 1.888,
"step": 50000
},
{
"epoch": 2.39,
"learning_rate": 3.80701754385965e-05,
"loss": 1.3682,
"step": 51000
},
{
"epoch": 2.39,
"eval_loss": 2.5191988945007324,
"eval_runtime": 55.0835,
"eval_samples_per_second": 120.345,
"eval_steps_per_second": 1.888,
"step": 51000
},
{
"epoch": 2.43,
"learning_rate": 3.783625730994152e-05,
"loss": 1.337,
"step": 52000
},
{
"epoch": 2.43,
"eval_loss": 2.4917993545532227,
"eval_runtime": 55.1615,
"eval_samples_per_second": 120.175,
"eval_steps_per_second": 1.885,
"step": 52000
},
{
"epoch": 2.48,
"learning_rate": 3.760233918128655e-05,
"loss": 1.3314,
"step": 53000
},
{
"epoch": 2.48,
"eval_loss": 2.503882646560669,
"eval_runtime": 55.1711,
"eval_samples_per_second": 120.153,
"eval_steps_per_second": 1.885,
"step": 53000
},
{
"epoch": 2.53,
"learning_rate": 3.736842105263158e-05,
"loss": 1.3213,
"step": 54000
},
{
"epoch": 2.53,
"eval_loss": 2.5335164070129395,
"eval_runtime": 55.1504,
"eval_samples_per_second": 120.199,
"eval_steps_per_second": 1.886,
"step": 54000
},
{
"epoch": 2.57,
"learning_rate": 3.713450292397661e-05,
"loss": 1.2901,
"step": 55000
},
{
"epoch": 2.57,
"eval_loss": 2.5040109157562256,
"eval_runtime": 55.1836,
"eval_samples_per_second": 120.126,
"eval_steps_per_second": 1.885,
"step": 55000
},
{
"epoch": 2.62,
"learning_rate": 3.690058479532164e-05,
"loss": 1.2927,
"step": 56000
},
{
"epoch": 2.62,
"eval_loss": 2.4990580081939697,
"eval_runtime": 55.1982,
"eval_samples_per_second": 120.095,
"eval_steps_per_second": 1.884,
"step": 56000
},
{
"epoch": 2.67,
"learning_rate": 3.6666666666666666e-05,
"loss": 1.2631,
"step": 57000
},
{
"epoch": 2.67,
"eval_loss": 2.500002861022949,
"eval_runtime": 55.1671,
"eval_samples_per_second": 120.162,
"eval_steps_per_second": 1.885,
"step": 57000
},
{
"epoch": 2.71,
"learning_rate": 3.64327485380117e-05,
"loss": 1.2526,
"step": 58000
},
{
"epoch": 2.71,
"eval_loss": 2.484260320663452,
"eval_runtime": 55.0693,
"eval_samples_per_second": 120.376,
"eval_steps_per_second": 1.889,
"step": 58000
},
{
"epoch": 2.76,
"learning_rate": 3.6198830409356724e-05,
"loss": 1.2371,
"step": 59000
},
{
"epoch": 2.76,
"eval_loss": 2.480639696121216,
"eval_runtime": 55.0676,
"eval_samples_per_second": 120.379,
"eval_steps_per_second": 1.889,
"step": 59000
},
{
"epoch": 2.81,
"learning_rate": 3.5964912280701756e-05,
"loss": 1.2194,
"step": 60000
},
{
"epoch": 2.81,
"eval_loss": 2.480283498764038,
"eval_runtime": 54.981,
"eval_samples_per_second": 120.569,
"eval_steps_per_second": 1.892,
"step": 60000
},
{
"epoch": 2.85,
"learning_rate": 3.573099415204679e-05,
"loss": 1.2103,
"step": 61000
},
{
"epoch": 2.85,
"eval_loss": 2.4655823707580566,
"eval_runtime": 54.9896,
"eval_samples_per_second": 120.55,
"eval_steps_per_second": 1.891,
"step": 61000
},
{
"epoch": 2.9,
"learning_rate": 3.5497076023391815e-05,
"loss": 1.1954,
"step": 62000
},
{
"epoch": 2.9,
"eval_loss": 2.467862367630005,
"eval_runtime": 55.0349,
"eval_samples_per_second": 120.451,
"eval_steps_per_second": 1.89,
"step": 62000
},
{
"epoch": 2.95,
"learning_rate": 3.526315789473684e-05,
"loss": 1.1841,
"step": 63000
},
{
"epoch": 2.95,
"eval_loss": 2.4734864234924316,
"eval_runtime": 55.0767,
"eval_samples_per_second": 120.359,
"eval_steps_per_second": 1.888,
"step": 63000
},
{
"epoch": 2.99,
"learning_rate": 3.502923976608187e-05,
"loss": 1.1697,
"step": 64000
},
{
"epoch": 2.99,
"eval_loss": 2.4691245555877686,
"eval_runtime": 55.01,
"eval_samples_per_second": 120.505,
"eval_steps_per_second": 1.891,
"step": 64000
},
{
"epoch": 3.04,
"learning_rate": 3.4795321637426905e-05,
"loss": 1.1488,
"step": 65000
},
{
"epoch": 3.04,
"eval_loss": 2.50709867477417,
"eval_runtime": 55.0061,
"eval_samples_per_second": 120.514,
"eval_steps_per_second": 1.891,
"step": 65000
},
{
"epoch": 3.09,
"learning_rate": 3.456140350877193e-05,
"loss": 1.1343,
"step": 66000
},
{
"epoch": 3.09,
"eval_loss": 2.464665412902832,
"eval_runtime": 54.9972,
"eval_samples_per_second": 120.533,
"eval_steps_per_second": 1.891,
"step": 66000
},
{
"epoch": 3.13,
"learning_rate": 3.432748538011696e-05,
"loss": 1.1285,
"step": 67000
},
{
"epoch": 3.13,
"eval_loss": 2.4716575145721436,
"eval_runtime": 54.9735,
"eval_samples_per_second": 120.585,
"eval_steps_per_second": 1.892,
"step": 67000
},
{
"epoch": 3.18,
"learning_rate": 3.409356725146199e-05,
"loss": 1.1124,
"step": 68000
},
{
"epoch": 3.18,
"eval_loss": 2.476966619491577,
"eval_runtime": 55.0007,
"eval_samples_per_second": 120.526,
"eval_steps_per_second": 1.891,
"step": 68000
},
{
"epoch": 3.23,
"learning_rate": 3.385964912280702e-05,
"loss": 1.1097,
"step": 69000
},
{
"epoch": 3.23,
"eval_loss": 2.487794876098633,
"eval_runtime": 54.9919,
"eval_samples_per_second": 120.545,
"eval_steps_per_second": 1.891,
"step": 69000
},
{
"epoch": 3.27,
"learning_rate": 3.362573099415205e-05,
"loss": 1.0956,
"step": 70000
},
{
"epoch": 3.27,
"eval_loss": 2.4818880558013916,
"eval_runtime": 55.0269,
"eval_samples_per_second": 120.468,
"eval_steps_per_second": 1.89,
"step": 70000
},
{
"epoch": 3.32,
"learning_rate": 3.339181286549708e-05,
"loss": 1.088,
"step": 71000
},
{
"epoch": 3.32,
"eval_loss": 2.4609289169311523,
"eval_runtime": 54.9477,
"eval_samples_per_second": 120.642,
"eval_steps_per_second": 1.893,
"step": 71000
},
{
"epoch": 3.37,
"learning_rate": 3.3157894736842106e-05,
"loss": 1.0728,
"step": 72000
},
{
"epoch": 3.37,
"eval_loss": 2.4839322566986084,
"eval_runtime": 54.9672,
"eval_samples_per_second": 120.599,
"eval_steps_per_second": 1.892,
"step": 72000
},
{
"epoch": 3.42,
"learning_rate": 3.292397660818713e-05,
"loss": 1.0587,
"step": 73000
},
{
"epoch": 3.42,
"eval_loss": 2.4727675914764404,
"eval_runtime": 55.0507,
"eval_samples_per_second": 120.416,
"eval_steps_per_second": 1.889,
"step": 73000
},
{
"epoch": 3.46,
"learning_rate": 3.2690058479532164e-05,
"loss": 1.0534,
"step": 74000
},
{
"epoch": 3.46,
"eval_loss": 2.4812207221984863,
"eval_runtime": 54.9899,
"eval_samples_per_second": 120.549,
"eval_steps_per_second": 1.891,
"step": 74000
},
{
"epoch": 3.51,
"learning_rate": 3.24561403508772e-05,
"loss": 1.0455,
"step": 75000
},
{
"epoch": 3.51,
"eval_loss": 2.469550609588623,
"eval_runtime": 54.9765,
"eval_samples_per_second": 120.579,
"eval_steps_per_second": 1.892,
"step": 75000
},
{
"epoch": 3.56,
"learning_rate": 3.222222222222223e-05,
"loss": 1.0402,
"step": 76000
},
{
"epoch": 3.56,
"eval_loss": 2.458113431930542,
"eval_runtime": 54.9925,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 76000
},
{
"epoch": 3.6,
"learning_rate": 3.198830409356725e-05,
"loss": 1.0227,
"step": 77000
},
{
"epoch": 3.6,
"eval_loss": 2.4712133407592773,
"eval_runtime": 54.9707,
"eval_samples_per_second": 120.592,
"eval_steps_per_second": 1.892,
"step": 77000
},
{
"epoch": 3.65,
"learning_rate": 3.175438596491228e-05,
"loss": 1.0172,
"step": 78000
},
{
"epoch": 3.65,
"eval_loss": 2.4822046756744385,
"eval_runtime": 54.9842,
"eval_samples_per_second": 120.562,
"eval_steps_per_second": 1.891,
"step": 78000
},
{
"epoch": 3.7,
"learning_rate": 3.152046783625731e-05,
"loss": 0.9947,
"step": 79000
},
{
"epoch": 3.7,
"eval_loss": 2.455008029937744,
"eval_runtime": 54.9636,
"eval_samples_per_second": 120.607,
"eval_steps_per_second": 1.892,
"step": 79000
},
{
"epoch": 3.74,
"learning_rate": 3.128654970760234e-05,
"loss": 0.9924,
"step": 80000
},
{
"epoch": 3.74,
"eval_loss": 2.440960168838501,
"eval_runtime": 54.9708,
"eval_samples_per_second": 120.591,
"eval_steps_per_second": 1.892,
"step": 80000
},
{
"epoch": 3.79,
"learning_rate": 3.105263157894737e-05,
"loss": 0.9863,
"step": 81000
},
{
"epoch": 3.79,
"eval_loss": 2.454493761062622,
"eval_runtime": 54.966,
"eval_samples_per_second": 120.602,
"eval_steps_per_second": 1.892,
"step": 81000
},
{
"epoch": 3.84,
"learning_rate": 3.08187134502924e-05,
"loss": 0.9793,
"step": 82000
},
{
"epoch": 3.84,
"eval_loss": 2.482584238052368,
"eval_runtime": 55.0651,
"eval_samples_per_second": 120.385,
"eval_steps_per_second": 1.889,
"step": 82000
},
{
"epoch": 3.88,
"learning_rate": 3.058479532163743e-05,
"loss": 0.9639,
"step": 83000
},
{
"epoch": 3.88,
"eval_loss": 2.4847776889801025,
"eval_runtime": 55.089,
"eval_samples_per_second": 120.332,
"eval_steps_per_second": 1.888,
"step": 83000
},
{
"epoch": 3.93,
"learning_rate": 3.035087719298246e-05,
"loss": 0.9584,
"step": 84000
},
{
"epoch": 3.93,
"eval_loss": 2.4647934436798096,
"eval_runtime": 55.1206,
"eval_samples_per_second": 120.263,
"eval_steps_per_second": 1.887,
"step": 84000
},
{
"epoch": 3.98,
"learning_rate": 3.0116959064327488e-05,
"loss": 0.9508,
"step": 85000
},
{
"epoch": 3.98,
"eval_loss": 2.445103406906128,
"eval_runtime": 55.0978,
"eval_samples_per_second": 120.313,
"eval_steps_per_second": 1.888,
"step": 85000
},
{
"epoch": 4.02,
"learning_rate": 2.9883040935672517e-05,
"loss": 0.9425,
"step": 86000
},
{
"epoch": 4.02,
"eval_loss": 2.4363410472869873,
"eval_runtime": 55.0773,
"eval_samples_per_second": 120.358,
"eval_steps_per_second": 1.888,
"step": 86000
},
{
"epoch": 4.07,
"learning_rate": 2.9649122807017543e-05,
"loss": 0.9301,
"step": 87000
},
{
"epoch": 4.07,
"eval_loss": 2.4576821327209473,
"eval_runtime": 55.0908,
"eval_samples_per_second": 120.329,
"eval_steps_per_second": 1.888,
"step": 87000
},
{
"epoch": 4.12,
"learning_rate": 2.9415204678362572e-05,
"loss": 0.922,
"step": 88000
},
{
"epoch": 4.12,
"eval_loss": 2.487666130065918,
"eval_runtime": 55.1028,
"eval_samples_per_second": 120.302,
"eval_steps_per_second": 1.887,
"step": 88000
},
{
"epoch": 4.16,
"learning_rate": 2.9181286549707604e-05,
"loss": 0.9102,
"step": 89000
},
{
"epoch": 4.16,
"eval_loss": 2.462902784347534,
"eval_runtime": 55.0955,
"eval_samples_per_second": 120.318,
"eval_steps_per_second": 1.888,
"step": 89000
},
{
"epoch": 4.21,
"learning_rate": 2.8947368421052634e-05,
"loss": 0.9081,
"step": 90000
},
{
"epoch": 4.21,
"eval_loss": 2.4494595527648926,
"eval_runtime": 55.0849,
"eval_samples_per_second": 120.341,
"eval_steps_per_second": 1.888,
"step": 90000
},
{
"epoch": 4.26,
"learning_rate": 2.8713450292397666e-05,
"loss": 0.8956,
"step": 91000
},
{
"epoch": 4.26,
"eval_loss": 2.466681718826294,
"eval_runtime": 55.0767,
"eval_samples_per_second": 120.359,
"eval_steps_per_second": 1.888,
"step": 91000
},
{
"epoch": 4.3,
"learning_rate": 2.847953216374269e-05,
"loss": 0.8932,
"step": 92000
},
{
"epoch": 4.3,
"eval_loss": 2.4637372493743896,
"eval_runtime": 55.0713,
"eval_samples_per_second": 120.371,
"eval_steps_per_second": 1.888,
"step": 92000
},
{
"epoch": 4.35,
"learning_rate": 2.824561403508772e-05,
"loss": 0.8845,
"step": 93000
},
{
"epoch": 4.35,
"eval_loss": 2.4586174488067627,
"eval_runtime": 55.0741,
"eval_samples_per_second": 120.365,
"eval_steps_per_second": 1.888,
"step": 93000
},
{
"epoch": 4.4,
"learning_rate": 2.801169590643275e-05,
"loss": 0.877,
"step": 94000
},
{
"epoch": 4.4,
"eval_loss": 2.471717357635498,
"eval_runtime": 55.0727,
"eval_samples_per_second": 120.368,
"eval_steps_per_second": 1.888,
"step": 94000
},
{
"epoch": 4.44,
"learning_rate": 2.777777777777778e-05,
"loss": 0.8713,
"step": 95000
},
{
"epoch": 4.44,
"eval_loss": 2.4618284702301025,
"eval_runtime": 55.0799,
"eval_samples_per_second": 120.352,
"eval_steps_per_second": 1.888,
"step": 95000
},
{
"epoch": 4.49,
"learning_rate": 2.754385964912281e-05,
"loss": 0.8768,
"step": 96000
},
{
"epoch": 4.49,
"eval_loss": 2.4480040073394775,
"eval_runtime": 55.1696,
"eval_samples_per_second": 120.157,
"eval_steps_per_second": 1.885,
"step": 96000
},
{
"epoch": 4.54,
"learning_rate": 2.7309941520467834e-05,
"loss": 0.8662,
"step": 97000
},
{
"epoch": 4.54,
"eval_loss": 2.468902349472046,
"eval_runtime": 55.1714,
"eval_samples_per_second": 120.153,
"eval_steps_per_second": 1.885,
"step": 97000
},
{
"epoch": 4.58,
"learning_rate": 2.7076023391812866e-05,
"loss": 0.8622,
"step": 98000
},
{
"epoch": 4.58,
"eval_loss": 2.4613983631134033,
"eval_runtime": 55.1613,
"eval_samples_per_second": 120.175,
"eval_steps_per_second": 1.885,
"step": 98000
},
{
"epoch": 4.63,
"learning_rate": 2.6842105263157896e-05,
"loss": 0.8497,
"step": 99000
},
{
"epoch": 4.63,
"eval_loss": 2.488284111022949,
"eval_runtime": 55.1664,
"eval_samples_per_second": 120.164,
"eval_steps_per_second": 1.885,
"step": 99000
},
{
"epoch": 4.68,
"learning_rate": 2.6608187134502928e-05,
"loss": 0.8399,
"step": 100000
},
{
"epoch": 4.68,
"eval_loss": 2.486598253250122,
"eval_runtime": 55.142,
"eval_samples_per_second": 120.217,
"eval_steps_per_second": 1.886,
"step": 100000
},
{
"epoch": 4.73,
"learning_rate": 2.6374269005847957e-05,
"loss": 0.8397,
"step": 101000
},
{
"epoch": 4.73,
"eval_loss": 2.490933895111084,
"eval_runtime": 55.1377,
"eval_samples_per_second": 120.226,
"eval_steps_per_second": 1.886,
"step": 101000
},
{
"epoch": 4.77,
"learning_rate": 2.6140350877192983e-05,
"loss": 0.8266,
"step": 102000
},
{
"epoch": 4.77,
"eval_loss": 2.4587643146514893,
"eval_runtime": 55.0944,
"eval_samples_per_second": 120.321,
"eval_steps_per_second": 1.888,
"step": 102000
},
{
"epoch": 4.82,
"learning_rate": 2.5906432748538012e-05,
"loss": 0.8231,
"step": 103000
},
{
"epoch": 4.82,
"eval_loss": 2.4951488971710205,
"eval_runtime": 55.155,
"eval_samples_per_second": 120.189,
"eval_steps_per_second": 1.886,
"step": 103000
},
{
"epoch": 4.87,
"learning_rate": 2.567251461988304e-05,
"loss": 0.8189,
"step": 104000
},
{
"epoch": 4.87,
"eval_loss": 2.458134889602661,
"eval_runtime": 55.0735,
"eval_samples_per_second": 120.366,
"eval_steps_per_second": 1.888,
"step": 104000
},
{
"epoch": 4.91,
"learning_rate": 2.5438596491228074e-05,
"loss": 0.8155,
"step": 105000
},
{
"epoch": 4.91,
"eval_loss": 2.448225736618042,
"eval_runtime": 55.0955,
"eval_samples_per_second": 120.318,
"eval_steps_per_second": 1.888,
"step": 105000
},
{
"epoch": 4.96,
"learning_rate": 2.5204678362573103e-05,
"loss": 0.8059,
"step": 106000
},
{
"epoch": 4.96,
"eval_loss": 2.489133358001709,
"eval_runtime": 55.1106,
"eval_samples_per_second": 120.285,
"eval_steps_per_second": 1.887,
"step": 106000
},
{
"epoch": 5.01,
"learning_rate": 2.4970760233918132e-05,
"loss": 0.8085,
"step": 107000
},
{
"epoch": 5.01,
"eval_loss": 2.491405487060547,
"eval_runtime": 55.0557,
"eval_samples_per_second": 120.405,
"eval_steps_per_second": 1.889,
"step": 107000
},
{
"epoch": 5.05,
"learning_rate": 2.4736842105263158e-05,
"loss": 0.7851,
"step": 108000
},
{
"epoch": 5.05,
"eval_loss": 2.486567735671997,
"eval_runtime": 55.0714,
"eval_samples_per_second": 120.371,
"eval_steps_per_second": 1.888,
"step": 108000
},
{
"epoch": 5.1,
"learning_rate": 2.450292397660819e-05,
"loss": 0.7827,
"step": 109000
},
{
"epoch": 5.1,
"eval_loss": 2.480097532272339,
"eval_runtime": 55.0814,
"eval_samples_per_second": 120.349,
"eval_steps_per_second": 1.888,
"step": 109000
},
{
"epoch": 5.15,
"learning_rate": 2.4269005847953216e-05,
"loss": 0.7813,
"step": 110000
},
{
"epoch": 5.15,
"eval_loss": 2.4855968952178955,
"eval_runtime": 55.078,
"eval_samples_per_second": 120.357,
"eval_steps_per_second": 1.888,
"step": 110000
},
{
"epoch": 5.19,
"learning_rate": 2.4035087719298245e-05,
"loss": 0.7829,
"step": 111000
},
{
"epoch": 5.19,
"eval_loss": 2.462341785430908,
"eval_runtime": 55.0705,
"eval_samples_per_second": 120.373,
"eval_steps_per_second": 1.888,
"step": 111000
},
{
"epoch": 5.24,
"learning_rate": 2.3801169590643278e-05,
"loss": 0.7724,
"step": 112000
},
{
"epoch": 5.24,
"eval_loss": 2.478029251098633,
"eval_runtime": 55.0837,
"eval_samples_per_second": 120.344,
"eval_steps_per_second": 1.888,
"step": 112000
},
{
"epoch": 5.29,
"learning_rate": 2.3567251461988303e-05,
"loss": 0.7646,
"step": 113000
},
{
"epoch": 5.29,
"eval_loss": 2.4587323665618896,
"eval_runtime": 55.1053,
"eval_samples_per_second": 120.297,
"eval_steps_per_second": 1.887,
"step": 113000
},
{
"epoch": 5.33,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.7604,
"step": 114000
},
{
"epoch": 5.33,
"eval_loss": 2.453984498977661,
"eval_runtime": 55.0903,
"eval_samples_per_second": 120.33,
"eval_steps_per_second": 1.888,
"step": 114000
},
{
"epoch": 5.38,
"learning_rate": 2.309941520467836e-05,
"loss": 0.7518,
"step": 115000
},
{
"epoch": 5.38,
"eval_loss": 2.488924026489258,
"eval_runtime": 55.1009,
"eval_samples_per_second": 120.307,
"eval_steps_per_second": 1.887,
"step": 115000
},
{
"epoch": 5.43,
"learning_rate": 2.2865497076023394e-05,
"loss": 0.7515,
"step": 116000
},
{
"epoch": 5.43,
"eval_loss": 2.4510860443115234,
"eval_runtime": 55.088,
"eval_samples_per_second": 120.335,
"eval_steps_per_second": 1.888,
"step": 116000
},
{
"epoch": 5.47,
"learning_rate": 2.2631578947368423e-05,
"loss": 0.7511,
"step": 117000
},
{
"epoch": 5.47,
"eval_loss": 2.468933343887329,
"eval_runtime": 55.0676,
"eval_samples_per_second": 120.379,
"eval_steps_per_second": 1.889,
"step": 117000
},
{
"epoch": 5.52,
"learning_rate": 2.2397660818713452e-05,
"loss": 0.7424,
"step": 118000
},
{
"epoch": 5.52,
"eval_loss": 2.4676008224487305,
"eval_runtime": 55.1052,
"eval_samples_per_second": 120.297,
"eval_steps_per_second": 1.887,
"step": 118000
},
{
"epoch": 5.57,
"learning_rate": 2.216374269005848e-05,
"loss": 0.7327,
"step": 119000
},
{
"epoch": 5.57,
"eval_loss": 2.482384443283081,
"eval_runtime": 55.0883,
"eval_samples_per_second": 120.334,
"eval_steps_per_second": 1.888,
"step": 119000
},
{
"epoch": 5.61,
"learning_rate": 2.1929824561403507e-05,
"loss": 0.7349,
"step": 120000
},
{
"epoch": 5.61,
"eval_loss": 2.450364351272583,
"eval_runtime": 55.0642,
"eval_samples_per_second": 120.387,
"eval_steps_per_second": 1.889,
"step": 120000
},
{
"epoch": 5.66,
"learning_rate": 2.169590643274854e-05,
"loss": 0.7307,
"step": 121000
},
{
"epoch": 5.66,
"eval_loss": 2.4753456115722656,
"eval_runtime": 55.0827,
"eval_samples_per_second": 120.346,
"eval_steps_per_second": 1.888,
"step": 121000
},
{
"epoch": 5.71,
"learning_rate": 2.146198830409357e-05,
"loss": 0.7269,
"step": 122000
},
{
"epoch": 5.71,
"eval_loss": 2.463690757751465,
"eval_runtime": 55.087,
"eval_samples_per_second": 120.337,
"eval_steps_per_second": 1.888,
"step": 122000
},
{
"epoch": 5.75,
"learning_rate": 2.1228070175438598e-05,
"loss": 0.7175,
"step": 123000
},
{
"epoch": 5.75,
"eval_loss": 2.4744393825531006,
"eval_runtime": 55.0809,
"eval_samples_per_second": 120.35,
"eval_steps_per_second": 1.888,
"step": 123000
},
{
"epoch": 5.8,
"learning_rate": 2.0994152046783627e-05,
"loss": 0.7178,
"step": 124000
},
{
"epoch": 5.8,
"eval_loss": 2.4851980209350586,
"eval_runtime": 55.0877,
"eval_samples_per_second": 120.335,
"eval_steps_per_second": 1.888,
"step": 124000
},
{
"epoch": 5.85,
"learning_rate": 2.0760233918128656e-05,
"loss": 0.7048,
"step": 125000
},
{
"epoch": 5.85,
"eval_loss": 2.5102007389068604,
"eval_runtime": 55.1078,
"eval_samples_per_second": 120.291,
"eval_steps_per_second": 1.887,
"step": 125000
},
{
"epoch": 5.89,
"learning_rate": 2.0526315789473685e-05,
"loss": 0.7072,
"step": 126000
},
{
"epoch": 5.89,
"eval_loss": 2.5026237964630127,
"eval_runtime": 55.1176,
"eval_samples_per_second": 120.27,
"eval_steps_per_second": 1.887,
"step": 126000
},
{
"epoch": 5.94,
"learning_rate": 2.0292397660818714e-05,
"loss": 0.7054,
"step": 127000
},
{
"epoch": 5.94,
"eval_loss": 2.4804298877716064,
"eval_runtime": 55.0663,
"eval_samples_per_second": 120.382,
"eval_steps_per_second": 1.889,
"step": 127000
},
{
"epoch": 5.99,
"learning_rate": 2.0058479532163744e-05,
"loss": 0.7019,
"step": 128000
},
{
"epoch": 5.99,
"eval_loss": 2.4398744106292725,
"eval_runtime": 54.9972,
"eval_samples_per_second": 120.533,
"eval_steps_per_second": 1.891,
"step": 128000
},
{
"epoch": 6.03,
"learning_rate": 1.9824561403508773e-05,
"loss": 0.6942,
"step": 129000
},
{
"epoch": 6.03,
"eval_loss": 2.4618844985961914,
"eval_runtime": 55.1004,
"eval_samples_per_second": 120.308,
"eval_steps_per_second": 1.887,
"step": 129000
},
{
"epoch": 6.08,
"learning_rate": 1.9590643274853802e-05,
"loss": 0.6842,
"step": 130000
},
{
"epoch": 6.08,
"eval_loss": 2.496403217315674,
"eval_runtime": 55.0871,
"eval_samples_per_second": 120.337,
"eval_steps_per_second": 1.888,
"step": 130000
},
{
"epoch": 6.13,
"learning_rate": 1.935672514619883e-05,
"loss": 0.6859,
"step": 131000
},
{
"epoch": 6.13,
"eval_loss": 2.483705520629883,
"eval_runtime": 55.0869,
"eval_samples_per_second": 120.337,
"eval_steps_per_second": 1.888,
"step": 131000
},
{
"epoch": 6.18,
"learning_rate": 1.9122807017543863e-05,
"loss": 0.6742,
"step": 132000
},
{
"epoch": 6.18,
"eval_loss": 2.489377498626709,
"eval_runtime": 55.1198,
"eval_samples_per_second": 120.265,
"eval_steps_per_second": 1.887,
"step": 132000
},
{
"epoch": 6.22,
"learning_rate": 1.888888888888889e-05,
"loss": 0.6818,
"step": 133000
},
{
"epoch": 6.22,
"eval_loss": 2.507904052734375,
"eval_runtime": 55.1222,
"eval_samples_per_second": 120.26,
"eval_steps_per_second": 1.887,
"step": 133000
},
{
"epoch": 6.27,
"learning_rate": 1.8654970760233918e-05,
"loss": 0.6742,
"step": 134000
},
{
"epoch": 6.27,
"eval_loss": 2.4935832023620605,
"eval_runtime": 55.1223,
"eval_samples_per_second": 120.26,
"eval_steps_per_second": 1.887,
"step": 134000
},
{
"epoch": 6.32,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.6756,
"step": 135000
},
{
"epoch": 6.32,
"eval_loss": 2.512763023376465,
"eval_runtime": 55.167,
"eval_samples_per_second": 120.162,
"eval_steps_per_second": 1.885,
"step": 135000
},
{
"epoch": 6.36,
"learning_rate": 1.8187134502923976e-05,
"loss": 0.6635,
"step": 136000
},
{
"epoch": 6.36,
"eval_loss": 2.5170469284057617,
"eval_runtime": 55.1756,
"eval_samples_per_second": 120.144,
"eval_steps_per_second": 1.885,
"step": 136000
},
{
"epoch": 6.41,
"learning_rate": 1.795321637426901e-05,
"loss": 0.6645,
"step": 137000
},
{
"epoch": 6.41,
"eval_loss": 2.5008370876312256,
"eval_runtime": 55.1095,
"eval_samples_per_second": 120.288,
"eval_steps_per_second": 1.887,
"step": 137000
},
{
"epoch": 6.46,
"learning_rate": 1.7719298245614035e-05,
"loss": 0.6617,
"step": 138000
},
{
"epoch": 6.46,
"eval_loss": 2.503709316253662,
"eval_runtime": 55.1047,
"eval_samples_per_second": 120.298,
"eval_steps_per_second": 1.887,
"step": 138000
},
{
"epoch": 6.5,
"learning_rate": 1.7485380116959067e-05,
"loss": 0.6574,
"step": 139000
},
{
"epoch": 6.5,
"eval_loss": 2.4953572750091553,
"eval_runtime": 55.0727,
"eval_samples_per_second": 120.368,
"eval_steps_per_second": 1.888,
"step": 139000
},
{
"epoch": 6.55,
"learning_rate": 1.7251461988304093e-05,
"loss": 0.6519,
"step": 140000
},
{
"epoch": 6.55,
"eval_loss": 2.519571304321289,
"eval_runtime": 55.1072,
"eval_samples_per_second": 120.293,
"eval_steps_per_second": 1.887,
"step": 140000
},
{
"epoch": 6.6,
"learning_rate": 1.7017543859649125e-05,
"loss": 0.6453,
"step": 141000
},
{
"epoch": 6.6,
"eval_loss": 2.485342502593994,
"eval_runtime": 55.0939,
"eval_samples_per_second": 120.322,
"eval_steps_per_second": 1.888,
"step": 141000
},
{
"epoch": 6.64,
"learning_rate": 1.6783625730994155e-05,
"loss": 0.6445,
"step": 142000
},
{
"epoch": 6.64,
"eval_loss": 2.485079765319824,
"eval_runtime": 55.093,
"eval_samples_per_second": 120.324,
"eval_steps_per_second": 1.888,
"step": 142000
},
{
"epoch": 6.69,
"learning_rate": 1.654970760233918e-05,
"loss": 0.643,
"step": 143000
},
{
"epoch": 6.69,
"eval_loss": 2.4923973083496094,
"eval_runtime": 55.1032,
"eval_samples_per_second": 120.302,
"eval_steps_per_second": 1.887,
"step": 143000
},
{
"epoch": 6.74,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.6373,
"step": 144000
},
{
"epoch": 6.74,
"eval_loss": 2.5037529468536377,
"eval_runtime": 55.0798,
"eval_samples_per_second": 120.353,
"eval_steps_per_second": 1.888,
"step": 144000
},
{
"epoch": 6.78,
"learning_rate": 1.608187134502924e-05,
"loss": 0.6292,
"step": 145000
},
{
"epoch": 6.78,
"eval_loss": 2.488449811935425,
"eval_runtime": 55.097,
"eval_samples_per_second": 120.315,
"eval_steps_per_second": 1.888,
"step": 145000
},
{
"epoch": 6.83,
"learning_rate": 1.584795321637427e-05,
"loss": 0.6386,
"step": 146000
},
{
"epoch": 6.83,
"eval_loss": 2.482603073120117,
"eval_runtime": 55.1088,
"eval_samples_per_second": 120.289,
"eval_steps_per_second": 1.887,
"step": 146000
},
{
"epoch": 6.88,
"learning_rate": 1.56140350877193e-05,
"loss": 0.6357,
"step": 147000
},
{
"epoch": 6.88,
"eval_loss": 2.482375144958496,
"eval_runtime": 55.1247,
"eval_samples_per_second": 120.255,
"eval_steps_per_second": 1.887,
"step": 147000
},
{
"epoch": 6.92,
"learning_rate": 1.538011695906433e-05,
"loss": 0.6251,
"step": 148000
},
{
"epoch": 6.92,
"eval_loss": 2.4937736988067627,
"eval_runtime": 55.1287,
"eval_samples_per_second": 120.246,
"eval_steps_per_second": 1.886,
"step": 148000
},
{
"epoch": 6.97,
"learning_rate": 1.5146198830409358e-05,
"loss": 0.624,
"step": 149000
},
{
"epoch": 6.97,
"eval_loss": 2.5023653507232666,
"eval_runtime": 55.1273,
"eval_samples_per_second": 120.249,
"eval_steps_per_second": 1.887,
"step": 149000
},
{
"epoch": 7.02,
"learning_rate": 1.4912280701754386e-05,
"loss": 0.6238,
"step": 150000
},
{
"epoch": 7.02,
"eval_loss": 2.520798444747925,
"eval_runtime": 55.0799,
"eval_samples_per_second": 120.352,
"eval_steps_per_second": 1.888,
"step": 150000
},
{
"epoch": 7.06,
"learning_rate": 1.4678362573099417e-05,
"loss": 0.6165,
"step": 151000
},
{
"epoch": 7.06,
"eval_loss": 2.5339748859405518,
"eval_runtime": 55.1169,
"eval_samples_per_second": 120.272,
"eval_steps_per_second": 1.887,
"step": 151000
},
{
"epoch": 7.11,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.6119,
"step": 152000
},
{
"epoch": 7.11,
"eval_loss": 2.5113964080810547,
"eval_runtime": 55.0891,
"eval_samples_per_second": 120.332,
"eval_steps_per_second": 1.888,
"step": 152000
},
{
"epoch": 7.16,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.6089,
"step": 153000
},
{
"epoch": 7.16,
"eval_loss": 2.52811861038208,
"eval_runtime": 55.112,
"eval_samples_per_second": 120.282,
"eval_steps_per_second": 1.887,
"step": 153000
},
{
"epoch": 7.2,
"learning_rate": 1.3976608187134504e-05,
"loss": 0.6035,
"step": 154000
},
{
"epoch": 7.2,
"eval_loss": 2.5194358825683594,
"eval_runtime": 55.1145,
"eval_samples_per_second": 120.277,
"eval_steps_per_second": 1.887,
"step": 154000
},
{
"epoch": 7.25,
"learning_rate": 1.3742690058479531e-05,
"loss": 0.6018,
"step": 155000
},
{
"epoch": 7.25,
"eval_loss": 2.5066628456115723,
"eval_runtime": 55.0979,
"eval_samples_per_second": 120.313,
"eval_steps_per_second": 1.888,
"step": 155000
},
{
"epoch": 7.3,
"learning_rate": 1.3508771929824562e-05,
"loss": 0.6016,
"step": 156000
},
{
"epoch": 7.3,
"eval_loss": 2.490973711013794,
"eval_runtime": 54.9953,
"eval_samples_per_second": 120.538,
"eval_steps_per_second": 1.891,
"step": 156000
},
{
"epoch": 7.34,
"learning_rate": 1.327485380116959e-05,
"loss": 0.6013,
"step": 157000
},
{
"epoch": 7.34,
"eval_loss": 2.489246368408203,
"eval_runtime": 54.9888,
"eval_samples_per_second": 120.552,
"eval_steps_per_second": 1.891,
"step": 157000
},
{
"epoch": 7.39,
"learning_rate": 1.304093567251462e-05,
"loss": 0.5958,
"step": 158000
},
{
"epoch": 7.39,
"eval_loss": 2.528749704360962,
"eval_runtime": 54.994,
"eval_samples_per_second": 120.54,
"eval_steps_per_second": 1.891,
"step": 158000
},
{
"epoch": 7.44,
"learning_rate": 1.2807017543859651e-05,
"loss": 0.5925,
"step": 159000
},
{
"epoch": 7.44,
"eval_loss": 2.528515100479126,
"eval_runtime": 54.9798,
"eval_samples_per_second": 120.571,
"eval_steps_per_second": 1.892,
"step": 159000
},
{
"epoch": 7.49,
"learning_rate": 1.2573099415204679e-05,
"loss": 0.5908,
"step": 160000
},
{
"epoch": 7.49,
"eval_loss": 2.510267734527588,
"eval_runtime": 55.0014,
"eval_samples_per_second": 120.524,
"eval_steps_per_second": 1.891,
"step": 160000
},
{
"epoch": 7.53,
"learning_rate": 1.2339181286549708e-05,
"loss": 0.587,
"step": 161000
},
{
"epoch": 7.53,
"eval_loss": 2.533625602722168,
"eval_runtime": 54.9987,
"eval_samples_per_second": 120.53,
"eval_steps_per_second": 1.891,
"step": 161000
},
{
"epoch": 7.58,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.5851,
"step": 162000
},
{
"epoch": 7.58,
"eval_loss": 2.538762331008911,
"eval_runtime": 54.9696,
"eval_samples_per_second": 120.594,
"eval_steps_per_second": 1.892,
"step": 162000
},
{
"epoch": 7.63,
"learning_rate": 1.1871345029239766e-05,
"loss": 0.579,
"step": 163000
},
{
"epoch": 7.63,
"eval_loss": 2.5098183155059814,
"eval_runtime": 54.9924,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 163000
},
{
"epoch": 7.67,
"learning_rate": 1.1637426900584795e-05,
"loss": 0.5764,
"step": 164000
},
{
"epoch": 7.67,
"eval_loss": 2.5329983234405518,
"eval_runtime": 55.0148,
"eval_samples_per_second": 120.495,
"eval_steps_per_second": 1.89,
"step": 164000
},
{
"epoch": 7.72,
"learning_rate": 1.1403508771929824e-05,
"loss": 0.5781,
"step": 165000
},
{
"epoch": 7.72,
"eval_loss": 2.512319803237915,
"eval_runtime": 54.9674,
"eval_samples_per_second": 120.599,
"eval_steps_per_second": 1.892,
"step": 165000
},
{
"epoch": 7.77,
"learning_rate": 1.1169590643274855e-05,
"loss": 0.5758,
"step": 166000
},
{
"epoch": 7.77,
"eval_loss": 2.5034148693084717,
"eval_runtime": 54.9854,
"eval_samples_per_second": 120.559,
"eval_steps_per_second": 1.891,
"step": 166000
},
{
"epoch": 7.81,
"learning_rate": 1.0935672514619884e-05,
"loss": 0.5792,
"step": 167000
},
{
"epoch": 7.81,
"eval_loss": 2.525723934173584,
"eval_runtime": 55.017,
"eval_samples_per_second": 120.49,
"eval_steps_per_second": 1.89,
"step": 167000
},
{
"epoch": 7.86,
"learning_rate": 1.0701754385964913e-05,
"loss": 0.5745,
"step": 168000
},
{
"epoch": 7.86,
"eval_loss": 2.526042938232422,
"eval_runtime": 54.987,
"eval_samples_per_second": 120.556,
"eval_steps_per_second": 1.891,
"step": 168000
},
{
"epoch": 7.91,
"learning_rate": 1.0467836257309941e-05,
"loss": 0.5702,
"step": 169000
},
{
"epoch": 7.91,
"eval_loss": 2.5171217918395996,
"eval_runtime": 54.976,
"eval_samples_per_second": 120.58,
"eval_steps_per_second": 1.892,
"step": 169000
},
{
"epoch": 7.95,
"learning_rate": 1.023391812865497e-05,
"loss": 0.5714,
"step": 170000
},
{
"epoch": 7.95,
"eval_loss": 2.509648323059082,
"eval_runtime": 54.9828,
"eval_samples_per_second": 120.565,
"eval_steps_per_second": 1.892,
"step": 170000
},
{
"epoch": 8.0,
"learning_rate": 1e-05,
"loss": 0.5692,
"step": 171000
},
{
"epoch": 8.0,
"eval_loss": 2.4963207244873047,
"eval_runtime": 54.9818,
"eval_samples_per_second": 120.567,
"eval_steps_per_second": 1.892,
"step": 171000
},
{
"epoch": 8.05,
"learning_rate": 9.76608187134503e-06,
"loss": 0.5541,
"step": 172000
},
{
"epoch": 8.05,
"eval_loss": 2.5158822536468506,
"eval_runtime": 54.9875,
"eval_samples_per_second": 120.555,
"eval_steps_per_second": 1.891,
"step": 172000
},
{
"epoch": 8.09,
"learning_rate": 9.532163742690059e-06,
"loss": 0.5609,
"step": 173000
},
{
"epoch": 8.09,
"eval_loss": 2.52651047706604,
"eval_runtime": 54.9727,
"eval_samples_per_second": 120.587,
"eval_steps_per_second": 1.892,
"step": 173000
},
{
"epoch": 8.14,
"learning_rate": 9.298245614035088e-06,
"loss": 0.5567,
"step": 174000
},
{
"epoch": 8.14,
"eval_loss": 2.529944658279419,
"eval_runtime": 54.9646,
"eval_samples_per_second": 120.605,
"eval_steps_per_second": 1.892,
"step": 174000
},
{
"epoch": 8.19,
"learning_rate": 9.064327485380117e-06,
"loss": 0.5593,
"step": 175000
},
{
"epoch": 8.19,
"eval_loss": 2.5352935791015625,
"eval_runtime": 54.9969,
"eval_samples_per_second": 120.534,
"eval_steps_per_second": 1.891,
"step": 175000
},
{
"epoch": 8.23,
"learning_rate": 8.830409356725146e-06,
"loss": 0.5537,
"step": 176000
},
{
"epoch": 8.23,
"eval_loss": 2.5415403842926025,
"eval_runtime": 54.9924,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 176000
},
{
"epoch": 8.28,
"learning_rate": 8.596491228070176e-06,
"loss": 0.5465,
"step": 177000
},
{
"epoch": 8.28,
"eval_loss": 2.5204358100891113,
"eval_runtime": 55.0062,
"eval_samples_per_second": 120.514,
"eval_steps_per_second": 1.891,
"step": 177000
},
{
"epoch": 8.33,
"learning_rate": 8.362573099415205e-06,
"loss": 0.548,
"step": 178000
},
{
"epoch": 8.33,
"eval_loss": 2.5008552074432373,
"eval_runtime": 54.9691,
"eval_samples_per_second": 120.595,
"eval_steps_per_second": 1.892,
"step": 178000
},
{
"epoch": 8.37,
"learning_rate": 8.128654970760234e-06,
"loss": 0.5477,
"step": 179000
},
{
"epoch": 8.37,
"eval_loss": 2.5255722999572754,
"eval_runtime": 54.9912,
"eval_samples_per_second": 120.547,
"eval_steps_per_second": 1.891,
"step": 179000
},
{
"epoch": 8.42,
"learning_rate": 7.894736842105263e-06,
"loss": 0.5393,
"step": 180000
},
{
"epoch": 8.42,
"eval_loss": 2.49310564994812,
"eval_runtime": 54.9871,
"eval_samples_per_second": 120.556,
"eval_steps_per_second": 1.891,
"step": 180000
},
{
"epoch": 8.47,
"learning_rate": 7.660818713450294e-06,
"loss": 0.5441,
"step": 181000
},
{
"epoch": 8.47,
"eval_loss": 2.5206234455108643,
"eval_runtime": 54.9863,
"eval_samples_per_second": 120.557,
"eval_steps_per_second": 1.891,
"step": 181000
},
{
"epoch": 8.51,
"learning_rate": 7.426900584795322e-06,
"loss": 0.5419,
"step": 182000
},
{
"epoch": 8.51,
"eval_loss": 2.511657476425171,
"eval_runtime": 54.9931,
"eval_samples_per_second": 120.542,
"eval_steps_per_second": 1.891,
"step": 182000
},
{
"epoch": 8.56,
"learning_rate": 7.192982456140351e-06,
"loss": 0.5377,
"step": 183000
},
{
"epoch": 8.56,
"eval_loss": 2.534726142883301,
"eval_runtime": 55.0074,
"eval_samples_per_second": 120.511,
"eval_steps_per_second": 1.891,
"step": 183000
},
{
"epoch": 8.61,
"learning_rate": 6.95906432748538e-06,
"loss": 0.5375,
"step": 184000
},
{
"epoch": 8.61,
"eval_loss": 2.4978044033050537,
"eval_runtime": 55.0077,
"eval_samples_per_second": 120.51,
"eval_steps_per_second": 1.891,
"step": 184000
},
{
"epoch": 8.65,
"learning_rate": 6.725146198830409e-06,
"loss": 0.5375,
"step": 185000
},
{
"epoch": 8.65,
"eval_loss": 2.4929347038269043,
"eval_runtime": 54.9953,
"eval_samples_per_second": 120.537,
"eval_steps_per_second": 1.891,
"step": 185000
},
{
"epoch": 8.7,
"learning_rate": 6.4912280701754385e-06,
"loss": 0.5354,
"step": 186000
},
{
"epoch": 8.7,
"eval_loss": 2.4908556938171387,
"eval_runtime": 55.0037,
"eval_samples_per_second": 120.519,
"eval_steps_per_second": 1.891,
"step": 186000
},
{
"epoch": 8.75,
"learning_rate": 6.2573099415204685e-06,
"loss": 0.5318,
"step": 187000
},
{
"epoch": 8.75,
"eval_loss": 2.531054973602295,
"eval_runtime": 54.9993,
"eval_samples_per_second": 120.529,
"eval_steps_per_second": 1.891,
"step": 187000
},
{
"epoch": 8.8,
"learning_rate": 6.023391812865498e-06,
"loss": 0.5338,
"step": 188000
},
{
"epoch": 8.8,
"eval_loss": 2.5138602256774902,
"eval_runtime": 54.9949,
"eval_samples_per_second": 120.539,
"eval_steps_per_second": 1.891,
"step": 188000
},
{
"epoch": 8.84,
"learning_rate": 5.789473684210527e-06,
"loss": 0.5247,
"step": 189000
},
{
"epoch": 8.84,
"eval_loss": 2.5182831287384033,
"eval_runtime": 54.9996,
"eval_samples_per_second": 120.528,
"eval_steps_per_second": 1.891,
"step": 189000
},
{
"epoch": 8.89,
"learning_rate": 5.555555555555556e-06,
"loss": 0.5249,
"step": 190000
},
{
"epoch": 8.89,
"eval_loss": 2.5073628425598145,
"eval_runtime": 54.9824,
"eval_samples_per_second": 120.566,
"eval_steps_per_second": 1.892,
"step": 190000
},
{
"epoch": 8.94,
"learning_rate": 5.321637426900585e-06,
"loss": 0.5266,
"step": 191000
},
{
"epoch": 8.94,
"eval_loss": 2.5005078315734863,
"eval_runtime": 54.9464,
"eval_samples_per_second": 120.645,
"eval_steps_per_second": 1.893,
"step": 191000
},
{
"epoch": 8.98,
"learning_rate": 5.087719298245614e-06,
"loss": 0.5279,
"step": 192000
},
{
"epoch": 8.98,
"eval_loss": 2.5144731998443604,
"eval_runtime": 54.9856,
"eval_samples_per_second": 120.559,
"eval_steps_per_second": 1.891,
"step": 192000
},
{
"epoch": 9.03,
"learning_rate": 4.853801169590644e-06,
"loss": 0.5231,
"step": 193000
},
{
"epoch": 9.03,
"eval_loss": 2.5163862705230713,
"eval_runtime": 54.965,
"eval_samples_per_second": 120.604,
"eval_steps_per_second": 1.892,
"step": 193000
},
{
"epoch": 9.08,
"learning_rate": 4.619883040935673e-06,
"loss": 0.5157,
"step": 194000
},
{
"epoch": 9.08,
"eval_loss": 2.4902589321136475,
"eval_runtime": 54.9685,
"eval_samples_per_second": 120.596,
"eval_steps_per_second": 1.892,
"step": 194000
},
{
"epoch": 9.12,
"learning_rate": 4.3859649122807014e-06,
"loss": 0.5153,
"step": 195000
},
{
"epoch": 9.12,
"eval_loss": 2.5248496532440186,
"eval_runtime": 55.0107,
"eval_samples_per_second": 120.504,
"eval_steps_per_second": 1.891,
"step": 195000
},
{
"epoch": 9.17,
"learning_rate": 4.152046783625731e-06,
"loss": 0.5238,
"step": 196000
},
{
"epoch": 9.17,
"eval_loss": 2.4956910610198975,
"eval_runtime": 54.9681,
"eval_samples_per_second": 120.597,
"eval_steps_per_second": 1.892,
"step": 196000
},
{
"epoch": 9.22,
"learning_rate": 3.9181286549707605e-06,
"loss": 0.5229,
"step": 197000
},
{
"epoch": 9.22,
"eval_loss": 2.509634256362915,
"eval_runtime": 55.0395,
"eval_samples_per_second": 120.441,
"eval_steps_per_second": 1.89,
"step": 197000
},
{
"epoch": 9.26,
"learning_rate": 3.6842105263157892e-06,
"loss": 0.5099,
"step": 198000
},
{
"epoch": 9.26,
"eval_loss": 2.505375862121582,
"eval_runtime": 54.9659,
"eval_samples_per_second": 120.602,
"eval_steps_per_second": 1.892,
"step": 198000
},
{
"epoch": 9.31,
"learning_rate": 3.4502923976608188e-06,
"loss": 0.5164,
"step": 199000
},
{
"epoch": 9.31,
"eval_loss": 2.512755870819092,
"eval_runtime": 54.9727,
"eval_samples_per_second": 120.587,
"eval_steps_per_second": 1.892,
"step": 199000
},
{
"epoch": 9.36,
"learning_rate": 3.216374269005848e-06,
"loss": 0.5147,
"step": 200000
},
{
"epoch": 9.36,
"eval_loss": 2.5104758739471436,
"eval_runtime": 54.9829,
"eval_samples_per_second": 120.565,
"eval_steps_per_second": 1.891,
"step": 200000
},
{
"epoch": 9.4,
"learning_rate": 2.9824561403508774e-06,
"loss": 0.5092,
"step": 201000
},
{
"epoch": 9.4,
"eval_loss": 2.5510807037353516,
"eval_runtime": 54.9886,
"eval_samples_per_second": 120.552,
"eval_steps_per_second": 1.891,
"step": 201000
},
{
"epoch": 9.45,
"learning_rate": 2.7485380116959066e-06,
"loss": 0.5123,
"step": 202000
},
{
"epoch": 9.45,
"eval_loss": 2.4837098121643066,
"eval_runtime": 54.9612,
"eval_samples_per_second": 120.612,
"eval_steps_per_second": 1.892,
"step": 202000
},
{
"epoch": 9.5,
"learning_rate": 2.5146198830409357e-06,
"loss": 0.5077,
"step": 203000
},
{
"epoch": 9.5,
"eval_loss": 2.5026121139526367,
"eval_runtime": 55.0018,
"eval_samples_per_second": 120.523,
"eval_steps_per_second": 1.891,
"step": 203000
},
{
"epoch": 9.54,
"learning_rate": 2.2807017543859652e-06,
"loss": 0.5112,
"step": 204000
},
{
"epoch": 9.54,
"eval_loss": 2.514636278152466,
"eval_runtime": 54.9811,
"eval_samples_per_second": 120.569,
"eval_steps_per_second": 1.892,
"step": 204000
},
{
"epoch": 9.59,
"learning_rate": 2.0467836257309943e-06,
"loss": 0.5033,
"step": 205000
},
{
"epoch": 9.59,
"eval_loss": 2.537416696548462,
"eval_runtime": 54.983,
"eval_samples_per_second": 120.565,
"eval_steps_per_second": 1.891,
"step": 205000
},
{
"epoch": 9.64,
"learning_rate": 1.8128654970760235e-06,
"loss": 0.5111,
"step": 206000
},
{
"epoch": 9.64,
"eval_loss": 2.515895366668701,
"eval_runtime": 54.9923,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 1.891,
"step": 206000
},
{
"epoch": 9.68,
"learning_rate": 1.5789473684210528e-06,
"loss": 0.5119,
"step": 207000
},
{
"epoch": 9.68,
"eval_loss": 2.5189149379730225,
"eval_runtime": 54.9887,
"eval_samples_per_second": 120.552,
"eval_steps_per_second": 1.891,
"step": 207000
},
{
"epoch": 9.73,
"learning_rate": 1.345029239766082e-06,
"loss": 0.5022,
"step": 208000
},
{
"epoch": 9.73,
"eval_loss": 2.506300926208496,
"eval_runtime": 54.9799,
"eval_samples_per_second": 120.571,
"eval_steps_per_second": 1.892,
"step": 208000
},
{
"epoch": 9.78,
"learning_rate": 1.1111111111111112e-06,
"loss": 0.5051,
"step": 209000
},
{
"epoch": 9.78,
"eval_loss": 2.4811651706695557,
"eval_runtime": 54.958,
"eval_samples_per_second": 120.619,
"eval_steps_per_second": 1.892,
"step": 209000
},
{
"epoch": 9.82,
"learning_rate": 8.771929824561404e-07,
"loss": 0.5028,
"step": 210000
},
{
"epoch": 9.82,
"eval_loss": 2.4914138317108154,
"eval_runtime": 55.0024,
"eval_samples_per_second": 120.522,
"eval_steps_per_second": 1.891,
"step": 210000
},
{
"epoch": 9.87,
"learning_rate": 6.432748538011697e-07,
"loss": 0.5066,
"step": 211000
},
{
"epoch": 9.87,
"eval_loss": 2.5056285858154297,
"eval_runtime": 54.9649,
"eval_samples_per_second": 120.604,
"eval_steps_per_second": 1.892,
"step": 211000
},
{
"epoch": 9.92,
"learning_rate": 4.093567251461989e-07,
"loss": 0.5058,
"step": 212000
},
{
"epoch": 9.92,
"eval_loss": 2.53446102142334,
"eval_runtime": 54.9817,
"eval_samples_per_second": 120.567,
"eval_steps_per_second": 1.892,
"step": 212000
},
{
"epoch": 9.96,
"learning_rate": 1.7543859649122808e-07,
"loss": 0.507,
"step": 213000
},
{
"epoch": 9.96,
"eval_loss": 2.507356882095337,
"eval_runtime": 55.001,
"eval_samples_per_second": 120.525,
"eval_steps_per_second": 1.891,
"step": 213000
}
],
"max_steps": 213750,
"num_train_epochs": 10,
"total_flos": 4.4847043698061394e+18,
"trial_name": null,
"trial_params": null
}