{ "best_metric": null, "best_model_checkpoint": null, "epoch": 90.0, "global_step": 29790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "learning_rate": 0.00013193067062605056, "loss": 17.6929, "step": 100 }, { "epoch": 0.3, "learning_rate": 0.00011254053736637465, "loss": 2.9597, "step": 200 }, { "epoch": 0.45, "learning_rate": 8.317464938377179e-05, "loss": 2.8924, "step": 300 }, { "epoch": 0.6, "learning_rate": 5.04999004691124e-05, "loss": 2.854, "step": 400 }, { "epoch": 0.75, "learning_rate": 2.1934390180170752e-05, "loss": 2.7631, "step": 500 }, { "epoch": 0.9, "learning_rate": 3.963303752660306e-06, "loss": 2.827, "step": 600 }, { "epoch": 1.06, "learning_rate": 6.665901596802237e-07, "loss": 2.7348, "step": 700 }, { "epoch": 1.21, "learning_rate": 1.2792697375844372e-05, "loss": 2.5285, "step": 800 }, { "epoch": 1.36, "learning_rate": 3.758865334885973e-05, "loss": 2.532, "step": 900 }, { "epoch": 1.51, "learning_rate": 6.942506914238186e-05, "loss": 2.5207, "step": 1000 }, { "epoch": 1.66, "learning_rate": 0.00010107417071793643, "loss": 2.5964, "step": 1100 }, { "epoch": 1.81, "learning_rate": 0.00012535070970483918, "loss": 2.651, "step": 1200 }, { "epoch": 1.96, "learning_rate": 0.00013674321963112474, "loss": 2.6082, "step": 1300 }, { "epoch": 2.11, "learning_rate": 0.0001326652759440951, "loss": 2.4929, "step": 1400 }, { "epoch": 2.26, "learning_rate": 0.00011404268812251294, "loss": 2.4281, "step": 1500 }, { "epoch": 2.41, "learning_rate": 8.510331453017784e-05, "loss": 2.4293, "step": 1600 }, { "epoch": 2.56, "learning_rate": 5.2417218031482405e-05, "loss": 2.4101, "step": 1700 }, { "epoch": 2.71, "learning_rate": 2.3405074409295983e-05, "loss": 2.3587, "step": 1800 }, { "epoch": 2.87, "learning_rate": 4.653467399454465e-06, "loss": 2.307, "step": 1900 }, { "epoch": 3.02, "learning_rate": 4.1954639981762966e-07, "loss": 2.3605, "step": 2000 }, { "epoch": 3.17, "learning_rate": 1.1664532186009215e-05, "loss": 2.137, "step": 2100 }, { "epoch": 3.32, "learning_rate": 3.5835492391032385e-05, "loss": 2.0627, "step": 2200 }, { "epoch": 3.47, "learning_rate": 6.744492993645014e-05, "loss": 2.1204, "step": 2300 }, { "epoch": 3.62, "learning_rate": 9.931660131723702e-05, "loss": 2.1823, "step": 2400 }, { "epoch": 3.77, "learning_rate": 0.00012421472847147613, "loss": 2.2277, "step": 2500 }, { "epoch": 3.92, "learning_rate": 0.000136486726691794, "loss": 2.2512, "step": 2600 }, { "epoch": 4.07, "learning_rate": 0.00013334650250861535, "loss": 2.0589, "step": 2700 }, { "epoch": 4.22, "learning_rate": 0.00011550697633903009, "loss": 2.0364, "step": 2800 }, { "epoch": 4.37, "learning_rate": 8.701822922681755e-05, "loss": 2.079, "step": 2900 }, { "epoch": 4.52, "learning_rate": 5.4348018978061915e-05, "loss": 2.0451, "step": 3000 }, { "epoch": 4.68, "learning_rate": 2.4913414743980736e-05, "loss": 2.0148, "step": 3100 }, { "epoch": 4.83, "learning_rate": 5.396910863624203e-06, "loss": 2.0322, "step": 3200 }, { "epoch": 4.98, "learning_rate": 2.2931013159956905e-07, "loss": 2.0037, "step": 3300 }, { "epoch": 5.13, "learning_rate": 1.0583805241907789e-05, "loss": 1.7858, "step": 3400 }, { "epoch": 5.28, "learning_rate": 3.410963059878637e-05, "loss": 1.7749, "step": 3500 }, { "epoch": 5.43, "learning_rate": 6.546575312706341e-05, "loss": 1.753, "step": 3600 }, { "epoch": 5.58, "learning_rate": 9.753343905259311e-05, "loss": 1.8774, "step": 3700 }, { "epoch": 5.73, "learning_rate": 0.00012303240942344423, "loss": 1.8664, "step": 3800 }, { "epoch": 5.88, "learning_rate": 0.0001361736709920872, "loss": 1.9253, "step": 3900 }, { "epoch": 6.03, "learning_rate": 0.00013397378272625676, "loss": 1.895, "step": 4000 }, { "epoch": 6.18, "learning_rate": 0.00011693218198091279, "loss": 1.6965, "step": 4100 }, { "epoch": 6.33, "learning_rate": 8.891779797974934e-05, "loss": 1.7647, "step": 4200 }, { "epoch": 6.49, "learning_rate": 5.629069457859385e-05, "loss": 1.7246, "step": 4300 }, { "epoch": 6.64, "learning_rate": 2.645815444528567e-05, "loss": 1.7252, "step": 4400 }, { "epoch": 6.79, "learning_rate": 6.193014713115989e-06, "loss": 1.7153, "step": 4500 }, { "epoch": 6.94, "learning_rate": 9.603985859484616e-08, "loss": 1.7007, "step": 4600 }, { "epoch": 7.09, "learning_rate": 9.55141699790193e-06, "loss": 1.5635, "step": 4700 }, { "epoch": 7.24, "learning_rate": 3.2412505948464725e-05, "loss": 1.4871, "step": 4800 }, { "epoch": 7.39, "learning_rate": 6.348918775091967e-05, "loss": 1.4965, "step": 4900 }, { "epoch": 7.54, "learning_rate": 9.572616964271242e-05, "loss": 1.5202, "step": 5000 }, { "epoch": 7.69, "learning_rate": 0.00012180473766095787, "loss": 1.5508, "step": 5100 }, { "epoch": 7.84, "learning_rate": 0.0001358043133678906, "loss": 1.658, "step": 5200 }, { "epoch": 7.99, "learning_rate": 0.0001345465939513953, "loss": 1.6614, "step": 5300 }, { "epoch": 8.14, "learning_rate": 0.0001183171175764834, "loss": 1.4421, "step": 5400 }, { "epoch": 8.3, "learning_rate": 9.080043808116885e-05, "loss": 1.4517, "step": 5500 }, { "epoch": 8.45, "learning_rate": 5.824362620893997e-05, "loss": 1.5004, "step": 5600 }, { "epoch": 8.6, "learning_rate": 2.803800644656531e-05, "loss": 1.4732, "step": 5700 }, { "epoch": 8.75, "learning_rate": 7.041115639600712e-06, "loss": 1.3825, "step": 5800 }, { "epoch": 8.9, "learning_rate": 1.9846620691268856e-08, "loss": 1.3723, "step": 5900 }, { "epoch": 9.05, "learning_rate": 8.568227632876734e-06, "loss": 1.3401, "step": 6000 }, { "epoch": 9.2, "learning_rate": 3.074553247281859e-05, "loss": 1.2335, "step": 6100 }, { "epoch": 9.35, "learning_rate": 6.151688066888818e-05, "loss": 1.211, "step": 6200 }, { "epoch": 9.5, "learning_rate": 9.389629889221367e-05, "loss": 1.2744, "step": 6300 }, { "epoch": 9.65, "learning_rate": 0.00012053273607180507, "loss": 1.3462, "step": 6400 }, { "epoch": 9.8, "learning_rate": 0.0001353789615654724, "loss": 1.3833, "step": 6500 }, { "epoch": 9.95, "learning_rate": 0.00013506445892160198, "loss": 1.383, "step": 6600 }, { "epoch": 10.11, "learning_rate": 1.3445492880161939e-06, "loss": 1.2022, "step": 6700 }, { "epoch": 10.26, "learning_rate": 1.1808914090066666e-06, "loss": 1.1142, "step": 6800 }, { "epoch": 10.41, "learning_rate": 9.048790708431603e-07, "loss": 1.1743, "step": 6900 }, { "epoch": 10.56, "learning_rate": 5.791749433756745e-07, "loss": 1.1411, "step": 7000 }, { "epoch": 10.71, "learning_rate": 2.7772314933464675e-07, "loss": 1.2239, "step": 7100 }, { "epoch": 10.86, "learning_rate": 6.896184033086834e-08, "loss": 1.1843, "step": 7200 }, { "epoch": 11.01, "learning_rate": 2.8578527430069923e-10, "loss": 1.1689, "step": 7300 }, { "epoch": 11.16, "learning_rate": 8.568227632876734e-08, "loss": 1.105, "step": 7400 }, { "epoch": 11.31, "learning_rate": 3.074553247281864e-07, "loss": 1.1371, "step": 7500 }, { "epoch": 11.46, "learning_rate": 6.151688066888805e-07, "loss": 1.1963, "step": 7600 }, { "epoch": 11.61, "learning_rate": 9.389629889221367e-07, "loss": 1.153, "step": 7700 }, { "epoch": 11.76, "learning_rate": 1.2053273607180523e-06, "loss": 1.1951, "step": 7800 }, { "epoch": 11.92, "learning_rate": 1.353789615654724e-06, "loss": 1.1274, "step": 7900 }, { "epoch": 12.07, "learning_rate": 1.363372539869449e-07, "loss": 1.1804, "step": 8000 }, { "epoch": 12.22, "learning_rate": 1.2362930050095064e-07, "loss": 1.1455, "step": 8100 }, { "epoch": 12.37, "learning_rate": 9.842812692084315e-08, "loss": 1.1767, "step": 8200 }, { "epoch": 12.52, "learning_rate": 6.645511813249419e-08, "loss": 1.147, "step": 8300 }, { "epoch": 12.67, "learning_rate": 3.496905867845774e-08, "loss": 1.114, "step": 8400 }, { "epoch": 12.82, "learning_rate": 1.1118181719517309e-08, "loss": 1.1379, "step": 8500 }, { "epoch": 12.97, "learning_rate": 3.173163117047257e-10, "loss": 1.1261, "step": 8600 }, { "epoch": 13.0, "eval_loss": 3.459977149963379, "eval_runtime": 0.891, "eval_samples_per_second": 76.319, "eval_steps_per_second": 76.319, "step": 8619 }, { "epoch": 13.12, "learning_rate": 5.0185668342927494e-09, "loss": 1.1202, "step": 8700 }, { "epoch": 13.27, "learning_rate": 2.415461538642524e-08, "loss": 1.1387, "step": 8800 }, { "epoch": 13.42, "learning_rate": 5.338103337974863e-08, "loss": 1.1257, "step": 8900 }, { "epoch": 13.57, "learning_rate": 8.606259068723572e-08, "loss": 1.1419, "step": 9000 }, { "epoch": 13.73, "learning_rate": 1.1477964205183373e-07, "loss": 1.1491, "step": 9100 }, { "epoch": 13.88, "learning_rate": 1.330125980932328e-07, "loss": 1.141, "step": 9200 }, { "epoch": 14.0, "eval_loss": 3.4634451866149902, "eval_runtime": 0.8581, "eval_samples_per_second": 79.247, "eval_steps_per_second": 79.247, "step": 9282 }, { "epoch": 14.03, "learning_rate": 1.3662205797010195e-07, "loss": 1.1135, "step": 9300 }, { "epoch": 14.18, "learning_rate": 1.2497725716841956e-07, "loss": 1.1462, "step": 9400 }, { "epoch": 14.33, "learning_rate": 1.0049124874956241e-07, "loss": 1.1353, "step": 9500 }, { "epoch": 14.48, "learning_rate": 6.876501764792728e-08, "loss": 1.1573, "step": 9600 }, { "epoch": 14.63, "learning_rate": 3.700132283557427e-08, "loss": 1.1477, "step": 9700 }, { "epoch": 14.78, "learning_rate": 1.2411428614983945e-08, "loss": 1.0973, "step": 9800 }, { "epoch": 14.93, "learning_rate": 5.779420298980625e-10, "loss": 1.1278, "step": 9900 }, { "epoch": 15.0, "eval_loss": 3.4664907455444336, "eval_runtime": 0.883, "eval_samples_per_second": 77.009, "eval_steps_per_second": 77.009, "step": 9945 }, { "epoch": 15.08, "learning_rate": 4.187401906767171e-09, "loss": 1.125, "step": 10000 }, { "epoch": 15.23, "learning_rate": 2.2420357948166246e-08, "loss": 1.1384, "step": 10100 }, { "epoch": 15.38, "learning_rate": 5.1137409312764306e-08, "loss": 1.1209, "step": 10200 }, { "epoch": 15.54, "learning_rate": 8.381896662025135e-08, "loss": 1.1044, "step": 10300 }, { "epoch": 15.69, "learning_rate": 1.1304538461357478e-07, "loss": 1.1637, "step": 10400 }, { "epoch": 15.84, "learning_rate": 1.3218143316570722e-07, "loss": 1.1661, "step": 10500 }, { "epoch": 15.99, "learning_rate": 1.368826836882953e-07, "loss": 1.1183, "step": 10600 }, { "epoch": 16.0, "eval_loss": 3.4696619510650635, "eval_runtime": 0.872, "eval_samples_per_second": 77.979, "eval_steps_per_second": 77.979, "step": 10608 }, { "epoch": 16.14, "learning_rate": 1.2608181828048274e-07, "loss": 1.1561, "step": 10700 }, { "epoch": 16.29, "learning_rate": 1.0223094132154226e-07, "loss": 1.1467, "step": 10800 }, { "epoch": 16.44, "learning_rate": 7.074488186750591e-08, "loss": 1.1874, "step": 10900 }, { "epoch": 16.59, "learning_rate": 3.8771873079156884e-08, "loss": 1.1281, "step": 11000 }, { "epoch": 16.74, "learning_rate": 1.3570699499049336e-08, "loss": 1.0814, "step": 11100 }, { "epoch": 16.89, "learning_rate": 8.627460130551071e-10, "loss": 1.1048, "step": 11200 }, { "epoch": 17.0, "eval_loss": 3.4714086055755615, "eval_runtime": 0.874, "eval_samples_per_second": 77.801, "eval_steps_per_second": 77.801, "step": 11271 }, { "epoch": 17.04, "learning_rate": 3.5330803651032965e-09, "loss": 1.1379, "step": 11300 }, { "epoch": 17.19, "learning_rate": 2.0975460526287258e-08, "loss": 1.1625, "step": 11400 }, { "epoch": 17.35, "learning_rate": 4.922996891933192e-08, "loss": 1.1556, "step": 11500 }, { "epoch": 17.5, "learning_rate": 8.188202660552875e-08, "loss": 1.1082, "step": 11600 }, { "epoch": 17.65, "learning_rate": 1.115186855828889e-07, "loss": 1.1591, "step": 11700 }, { "epoch": 17.8, "learning_rate": 1.3141157932507567e-07, "loss": 1.119, "step": 11800 }, { "epoch": 17.95, "learning_rate": 1.3704445380800497e-07, "loss": 1.1061, "step": 11900 }, { "epoch": 18.0, "eval_loss": 3.4751551151275635, "eval_runtime": 0.87, "eval_samples_per_second": 78.161, "eval_steps_per_second": 78.161, "step": 11934 }, { "epoch": 18.1, "learning_rate": 1.2713848593130804e-07, "loss": 1.1182, "step": 12000 }, { "epoch": 18.25, "learning_rate": 1.0394261282124642e-07, "loss": 1.1469, "step": 12100 }, { "epoch": 18.4, "learning_rate": 7.272295898605559e-08, "loss": 1.1398, "step": 12200 }, { "epoch": 18.55, "learning_rate": 4.05672759158415e-08, "loss": 1.0718, "step": 12300 }, { "epoch": 18.7, "learning_rate": 1.4775820423167982e-08, "loss": 1.1535, "step": 12400 }, { "epoch": 18.85, "learning_rate": 1.2039882169440381e-09, "loss": 1.1471, "step": 12500 }, { "epoch": 19.0, "eval_loss": 3.4772818088531494, "eval_runtime": 0.871, "eval_samples_per_second": 78.07, "eval_steps_per_second": 78.07, "step": 12597 }, { "epoch": 19.0, "learning_rate": 2.9329721396952097e-09, "loss": 1.1334, "step": 12600 }, { "epoch": 19.16, "learning_rate": 1.9570243547781353e-08, "loss": 1.1311, "step": 12700 }, { "epoch": 19.31, "learning_rate": 4.733866750445838e-08, "loss": 1.129, "step": 12800 }, { "epoch": 19.46, "learning_rate": 7.993402009629649e-08, "loss": 1.0708, "step": 12900 }, { "epoch": 19.61, "learning_rate": 1.0995622699438409e-07, "loss": 1.1266, "step": 13000 }, { "epoch": 19.76, "learning_rate": 1.3058939130244216e-07, "loss": 1.146, "step": 13100 }, { "epoch": 19.91, "learning_rate": 1.3716110242414843e-07, "loss": 1.1402, "step": 13200 }, { "epoch": 20.0, "eval_loss": 3.4797611236572266, "eval_runtime": 0.848, "eval_samples_per_second": 80.192, "eval_steps_per_second": 80.192, "step": 13260 }, { "epoch": 20.06, "learning_rate": 1.2830956053906914e-07, "loss": 1.1516, "step": 13300 }, { "epoch": 20.21, "learning_rate": 1.0590224609743171e-07, "loss": 1.1088, "step": 13400 }, { "epoch": 20.36, "learning_rate": 7.502625830800328e-08, "loss": 1.1331, "step": 13500 }, { "epoch": 20.51, "learning_rate": 4.269132667061271e-08, "loss": 1.157, "step": 13600 }, { "epoch": 20.66, "learning_rate": 1.6238402532934746e-08, "loss": 1.1155, "step": 13700 }, { "epoch": 20.81, "learning_rate": 1.673053844704988e-09, "loss": 1.1693, "step": 13800 }, { "epoch": 20.97, "learning_rate": 2.3020300149297553e-09, "loss": 1.0847, "step": 13900 }, { "epoch": 21.0, "eval_loss": 3.4811391830444336, "eval_runtime": 0.8641, "eval_samples_per_second": 78.699, "eval_steps_per_second": 78.699, "step": 13923 }, { "epoch": 21.12, "learning_rate": 1.7982535520914407e-08, "loss": 1.1415, "step": 14000 }, { "epoch": 21.27, "learning_rate": 4.515464856300149e-08, "loss": 1.1699, "step": 14100 }, { "epoch": 21.42, "learning_rate": 7.764952480614377e-08, "loss": 1.0972, "step": 14200 }, { "epoch": 21.57, "learning_rate": 1.0808990091715443e-07, "loss": 1.1351, "step": 14300 }, { "epoch": 21.72, "learning_rate": 1.295649436664554e-07, "loss": 1.1021, "step": 14400 }, { "epoch": 21.87, "learning_rate": 1.3719920609842196e-07, "loss": 1.1462, "step": 14500 }, { "epoch": 22.0, "eval_loss": 3.48413348197937, "eval_runtime": 0.887, "eval_samples_per_second": 76.663, "eval_steps_per_second": 76.663, "step": 14586 }, { "epoch": 22.02, "learning_rate": 1.292594929888637e-07, "loss": 1.1323, "step": 14600 }, { "epoch": 22.17, "learning_rate": 1.0754834557415543e-07, "loss": 1.1255, "step": 14700 }, { "epoch": 22.32, "learning_rate": 7.699481330029191e-08, "loss": 1.1254, "step": 14800 }, { "epoch": 22.47, "learning_rate": 4.453541907189201e-08, "loss": 1.1578, "step": 14900 }, { "epoch": 22.62, "learning_rate": 1.7539370793205484e-08, "loss": 1.1509, "step": 15000 }, { "epoch": 22.78, "learning_rate": 2.135541078398113e-09, "loss": 1.096, "step": 15100 }, { "epoch": 22.93, "learning_rate": 1.821038434527568e-09, "loss": 1.1107, "step": 15200 }, { "epoch": 23.0, "eval_loss": 3.485179901123047, "eval_runtime": 0.876, "eval_samples_per_second": 77.624, "eval_steps_per_second": 77.624, "step": 15249 }, { "epoch": 23.08, "learning_rate": 1.6667263928194815e-08, "loss": 1.1112, "step": 15300 }, { "epoch": 23.23, "learning_rate": 4.330370110778617e-08, "loss": 1.0894, "step": 15400 }, { "epoch": 23.38, "learning_rate": 7.568311933111189e-08, "loss": 1.1571, "step": 15500 }, { "epoch": 23.53, "learning_rate": 1.0645446752718106e-07, "loss": 1.0807, "step": 15600 }, { "epoch": 23.68, "learning_rate": 1.286317723671232e-07, "loss": 1.1122, "step": 15700 }, { "epoch": 23.83, "learning_rate": 1.3718015337930875e-07, "loss": 1.2016, "step": 15800 }, { "epoch": 23.98, "learning_rate": 1.301588843603994e-07, "loss": 1.1192, "step": 15900 }, { "epoch": 24.0, "eval_loss": 3.487265110015869, "eval_runtime": 1.004, "eval_samples_per_second": 67.732, "eval_steps_per_second": 67.732, "step": 15912 }, { "epoch": 24.13, "learning_rate": 1.0916199355343483e-07, "loss": 1.1713, "step": 16000 }, { "epoch": 24.28, "learning_rate": 7.895637379105998e-08, "loss": 1.1412, "step": 16100 }, { "epoch": 24.43, "learning_rate": 4.639956191883156e-08, "loss": 1.1021, "step": 16200 }, { "epoch": 24.59, "learning_rate": 1.8882882423516715e-08, "loss": 1.1353, "step": 16300 }, { "epoch": 24.74, "learning_rate": 2.653406048604756e-09, "loss": 1.0801, "step": 16400 }, { "epoch": 24.89, "learning_rate": 1.3956866321093081e-09, "loss": 1.0868, "step": 16500 }, { "epoch": 25.0, "eval_loss": 3.487858295440674, "eval_runtime": 0.854, "eval_samples_per_second": 79.625, "eval_steps_per_second": 79.625, "step": 16575 }, { "epoch": 25.04, "learning_rate": 1.539526233904202e-08, "loss": 1.0812, "step": 16600 }, { "epoch": 25.19, "learning_rate": 4.1473830357287196e-08, "loss": 1.104, "step": 16700 }, { "epoch": 25.34, "learning_rate": 7.371081224908017e-08, "loss": 1.0943, "step": 16800 }, { "epoch": 25.49, "learning_rate": 1.0478749405153535e-07, "loss": 1.097, "step": 16900 }, { "epoch": 25.64, "learning_rate": 1.2764858300209813e-07, "loss": 1.1316, "step": 17000 }, { "epoch": 25.79, "learning_rate": 1.3710396014140516e-07, "loss": 1.1679, "step": 17100 }, { "epoch": 25.94, "learning_rate": 1.31006985286884e-07, "loss": 1.1313, "step": 17200 }, { "epoch": 26.0, "eval_loss": 3.489774703979492, "eval_runtime": 0.859, "eval_samples_per_second": 79.161, "eval_steps_per_second": 79.161, "step": 17238 }, { "epoch": 26.09, "learning_rate": 1.1100177565244e-07, "loss": 1.1454, "step": 17300 }, { "epoch": 26.24, "learning_rate": 8.123384073832201e-08, "loss": 1.0682, "step": 17400 }, { "epoch": 26.4, "learning_rate": 4.859766378694414e-08, "loss": 1.1414, "step": 17500 }, { "epoch": 26.55, "learning_rate": 2.050258742073295e-08, "loss": 1.146, "step": 17600 }, { "epoch": 26.7, "learning_rate": 3.326994962912756e-09, "loss": 1.145, "step": 17700 }, { "epoch": 26.85, "learning_rate": 9.702355760016704e-10, "loss": 1.1254, "step": 17800 }, { "epoch": 27.0, "learning_rate": 1.3967360831826574e-08, "loss": 1.1033, "step": 17900 }, { "epoch": 27.0, "eval_loss": 3.491461992263794, "eval_runtime": 0.878, "eval_samples_per_second": 77.447, "eval_steps_per_second": 77.447, "step": 17901 }, { "epoch": 27.15, "learning_rate": 3.936765275044558e-08, "loss": 1.1389, "step": 18000 }, { "epoch": 27.3, "learning_rate": 7.140452090629836e-08, "loss": 1.1195, "step": 18100 }, { "epoch": 27.45, "learning_rate": 1.0280468285978317e-07, "loss": 1.0567, "step": 18200 }, { "epoch": 27.6, "learning_rate": 1.2643940663167218e-07, "loss": 1.1261, "step": 18300 }, { "epoch": 27.75, "learning_rate": 1.369429361019015e-07, "loss": 1.1393, "step": 18400 }, { "epoch": 27.9, "learning_rate": 1.3193067062605075e-07, "loss": 1.1578, "step": 18500 }, { "epoch": 28.0, "eval_loss": 3.4938805103302, "eval_runtime": 0.864, "eval_samples_per_second": 78.704, "eval_steps_per_second": 78.704, "step": 18564 }, { "epoch": 28.05, "learning_rate": 1.1254053736637487e-07, "loss": 1.136, "step": 18600 }, { "epoch": 28.21, "learning_rate": 8.317464938377186e-08, "loss": 1.102, "step": 18700 }, { "epoch": 28.36, "learning_rate": 5.0499900469112783e-08, "loss": 1.1325, "step": 18800 }, { "epoch": 28.51, "learning_rate": 2.193439018017092e-08, "loss": 1.1511, "step": 18900 }, { "epoch": 28.66, "learning_rate": 3.963303752660322e-09, "loss": 1.118, "step": 19000 }, { "epoch": 28.81, "learning_rate": 6.66590159680239e-10, "loss": 1.1024, "step": 19100 }, { "epoch": 28.96, "learning_rate": 1.2792697375844275e-08, "loss": 1.0987, "step": 19200 }, { "epoch": 29.0, "eval_loss": 3.4947092533111572, "eval_runtime": 0.879, "eval_samples_per_second": 77.363, "eval_steps_per_second": 77.363, "step": 19227 }, { "epoch": 29.11, "learning_rate": 3.7294625797960914e-08, "loss": 1.135, "step": 19300 }, { "epoch": 29.26, "learning_rate": 6.909504912431355e-08, "loss": 1.1308, "step": 19400 }, { "epoch": 29.41, "learning_rate": 1.0078308218564023e-07, "loss": 1.0927, "step": 19500 }, { "epoch": 29.56, "learning_rate": 1.2516463805405508e-07, "loss": 1.1707, "step": 19600 }, { "epoch": 29.71, "learning_rate": 1.367044084314665e-07, "loss": 1.1217, "step": 19700 }, { "epoch": 29.86, "learning_rate": 1.3278253644438908e-07, "loss": 1.0779, "step": 19800 }, { "epoch": 30.0, "eval_loss": 3.497239112854004, "eval_runtime": 0.879, "eval_samples_per_second": 77.362, "eval_steps_per_second": 77.362, "step": 19890 }, { "epoch": 60.12, "learning_rate": 8.693541350698569e-09, "loss": 1.5417, "step": 19900 }, { "epoch": 60.42, "learning_rate": 6.223106707028002e-08, "loss": 1.3927, "step": 20000 }, { "epoch": 60.73, "learning_rate": 1.2172156539717049e-07, "loss": 1.3567, "step": 20100 }, { "epoch": 61.0, "eval_loss": 3.4576382637023926, "eval_runtime": 0.952, "eval_samples_per_second": 35.715, "eval_steps_per_second": 35.715, "step": 20191 }, { "epoch": 61.03, "learning_rate": 1.3426435048201084e-07, "loss": 1.3891, "step": 20200 }, { "epoch": 61.33, "learning_rate": 8.877437479246501e-08, "loss": 1.3829, "step": 20300 }, { "epoch": 61.63, "learning_rate": 2.5454714388004032e-08, "loss": 1.3482, "step": 20400 }, { "epoch": 61.93, "learning_rate": 2.6594266338289214e-10, "loss": 1.3278, "step": 20500 }, { "epoch": 62.0, "eval_loss": 3.452810525894165, "eval_runtime": 0.964, "eval_samples_per_second": 35.269, "eval_steps_per_second": 35.269, "step": 20522 }, { "epoch": 62.24, "learning_rate": 3.54693620295034e-08, "loss": 1.3495, "step": 20600 }, { "epoch": 62.54, "learning_rate": 9.995293681528824e-08, "loss": 1.3372, "step": 20700 }, { "epoch": 62.84, "learning_rate": 1.3672745077248097e-07, "loss": 1.3292, "step": 20800 }, { "epoch": 63.0, "eval_loss": 3.4468021392822266, "eval_runtime": 0.9878, "eval_samples_per_second": 34.42, "eval_steps_per_second": 34.42, "step": 20853 }, { "epoch": 63.14, "learning_rate": 1.1329236813994995e-07, "loss": 1.3209, "step": 20900 }, { "epoch": 63.44, "learning_rate": 5.035911808981744e-08, "loss": 1.3127, "step": 21000 }, { "epoch": 63.75, "learning_rate": 3.5467727381560775e-09, "loss": 1.3285, "step": 21100 }, { "epoch": 64.0, "eval_loss": 3.443075180053711, "eval_runtime": 0.9684, "eval_samples_per_second": 35.111, "eval_steps_per_second": 35.111, "step": 21184 }, { "epoch": 64.05, "learning_rate": 1.4227090415615305e-08, "loss": 1.2899, "step": 21200 }, { "epoch": 64.35, "learning_rate": 7.296103279743161e-08, "loss": 1.3259, "step": 21300 }, { "epoch": 64.65, "learning_rate": 1.2784078685324094e-07, "loss": 1.308, "step": 21400 }, { "epoch": 64.95, "learning_rate": 1.303647894806954e-07, "loss": 1.3032, "step": 21500 }, { "epoch": 65.0, "eval_loss": 3.437013626098633, "eval_runtime": 0.9607, "eval_samples_per_second": 35.392, "eval_steps_per_second": 35.392, "step": 21515 }, { "epoch": 65.26, "learning_rate": 7.830238067025156e-08, "loss": 1.2946, "step": 21600 }, { "epoch": 65.56, "learning_rate": 1.766521352829689e-08, "loss": 1.3209, "step": 21700 }, { "epoch": 65.86, "learning_rate": 2.043130777021614e-09, "loss": 1.318, "step": 21800 }, { "epoch": 66.0, "eval_loss": 3.4345176219940186, "eval_runtime": 0.9583, "eval_samples_per_second": 35.479, "eval_steps_per_second": 35.479, "step": 21846 }, { "epoch": 66.16, "learning_rate": 4.5242597940677724e-08, "loss": 1.2637, "step": 21900 }, { "epoch": 66.47, "learning_rate": 1.090848418327998e-07, "loss": 1.2813, "step": 22000 }, { "epoch": 66.77, "learning_rate": 1.3714744085178092e-07, "loss": 1.3003, "step": 22100 }, { "epoch": 67.0, "eval_loss": 3.428901195526123, "eval_runtime": 0.9557, "eval_samples_per_second": 35.575, "eval_steps_per_second": 35.575, "step": 22177 }, { "epoch": 67.07, "learning_rate": 1.0462926495336562e-07, "loss": 1.3045, "step": 22200 }, { "epoch": 67.37, "learning_rate": 4.026918856301729e-08, "loss": 1.2654, "step": 22300 }, { "epoch": 67.67, "learning_rate": 9.472826654548543e-10, "loss": 1.2689, "step": 22400 }, { "epoch": 67.98, "learning_rate": 2.141541401022131e-08, "loss": 1.3202, "step": 22500 }, { "epoch": 68.0, "eval_loss": 3.427440881729126, "eval_runtime": 0.958, "eval_samples_per_second": 35.49, "eval_steps_per_second": 35.49, "step": 22508 }, { "epoch": 68.28, "learning_rate": 8.358428206955252e-08, "loss": 1.3008, "step": 22600 }, { "epoch": 68.58, "learning_rate": 1.3251035932481603e-07, "loss": 1.2628, "step": 22700 }, { "epoch": 68.88, "learning_rate": 1.2495381600320813e-07, "loss": 1.2643, "step": 22800 }, { "epoch": 69.0, "eval_loss": 3.4231972694396973, "eval_runtime": 0.9561, "eval_samples_per_second": 35.563, "eval_steps_per_second": 35.563, "step": 22839 }, { "epoch": 69.18, "learning_rate": 6.759296488244086e-08, "loss": 1.2717, "step": 22900 }, { "epoch": 69.49, "learning_rate": 1.1122110047934364e-08, "loss": 1.2759, "step": 23000 }, { "epoch": 69.79, "learning_rate": 5.44899573655199e-09, "loss": 1.2862, "step": 23100 }, { "epoch": 70.0, "eval_loss": 3.4223098754882812, "eval_runtime": 1.0568, "eval_samples_per_second": 32.174, "eval_steps_per_second": 32.174, "step": 23170 }, { "epoch": 70.09, "learning_rate": 5.492884226688333e-08, "loss": 1.2962, "step": 23200 }, { "epoch": 70.39, "learning_rate": 1.1675016817694061e-07, "loss": 1.2762, "step": 23300 }, { "epoch": 70.69, "learning_rate": 1.3601739630719386e-07, "loss": 1.2783, "step": 23400 }, { "epoch": 71.0, "learning_rate": 9.570255892993473e-08, "loss": 1.2597, "step": 23500 }, { "epoch": 71.0, "eval_loss": 3.418621063232422, "eval_runtime": 1.0452, "eval_samples_per_second": 32.531, "eval_steps_per_second": 32.531, "step": 23501 }, { "epoch": 71.3, "learning_rate": 3.143505458484596e-08, "loss": 1.3153, "step": 23600 }, { "epoch": 71.6, "learning_rate": 1.3141045706995258e-11, "loss": 1.245, "step": 23700 }, { "epoch": 71.9, "learning_rate": 2.920683930724396e-08, "loss": 1.2426, "step": 23800 }, { "epoch": 72.0, "eval_loss": 3.4176230430603027, "eval_runtime": 1.0483, "eval_samples_per_second": 32.434, "eval_steps_per_second": 32.434, "step": 23832 }, { "epoch": 72.21, "learning_rate": 9.32153777956087e-08, "loss": 1.276, "step": 23900 }, { "epoch": 72.51, "learning_rate": 1.3546936727073083e-07, "loss": 1.2707, "step": 24000 }, { "epoch": 72.81, "learning_rate": 1.1862562768173958e-07, "loss": 1.2539, "step": 24100 }, { "epoch": 73.0, "eval_loss": 3.4151535034179688, "eval_runtime": 1.0259, "eval_samples_per_second": 33.143, "eval_steps_per_second": 33.143, "step": 24163 }, { "epoch": 73.11, "learning_rate": 5.7570298965124436e-08, "loss": 1.2827, "step": 24200 }, { "epoch": 73.41, "learning_rate": 6.262786177113756e-09, "loss": 1.2277, "step": 24300 }, { "epoch": 73.72, "learning_rate": 1.0047581324385604e-08, "loss": 1.2604, "step": 24400 }, { "epoch": 74.0, "eval_loss": 3.414668083190918, "eval_runtime": 1.0049, "eval_samples_per_second": 33.834, "eval_steps_per_second": 33.834, "step": 24494 }, { "epoch": 74.02, "learning_rate": 6.55797626990514e-08, "loss": 1.2588, "step": 24500 }, { "epoch": 74.32, "learning_rate": 1.2378116576678553e-07, "loss": 1.2695, "step": 24600 }, { "epoch": 74.62, "learning_rate": 1.3321462445238648e-07, "loss": 1.2576, "step": 24700 }, { "epoch": 74.92, "learning_rate": 8.554304797283439e-08, "loss": 1.263, "step": 24800 }, { "epoch": 75.0, "eval_loss": 3.412790298461914, "eval_runtime": 1.012, "eval_samples_per_second": 33.597, "eval_steps_per_second": 33.597, "step": 24825 }, { "epoch": 75.23, "learning_rate": 2.2897564912636232e-08, "loss": 1.2362, "step": 24900 }, { "epoch": 75.53, "learning_rate": 6.429250232458032e-10, "loss": 1.2653, "step": 25000 }, { "epoch": 75.83, "learning_rate": 3.844730720753521e-08, "loss": 1.2642, "step": 25100 }, { "epoch": 76.0, "eval_loss": 3.412682294845581, "eval_runtime": 1.006, "eval_samples_per_second": 33.797, "eval_steps_per_second": 33.797, "step": 25156 }, { "epoch": 76.13, "learning_rate": 1.0289999999999885e-07, "loss": 1.2254, "step": 25200 }, { "epoch": 76.44, "learning_rate": 1.370390800033334e-07, "loss": 1.2543, "step": 25300 }, { "epoch": 76.74, "learning_rate": 1.106931521097187e-07, "loss": 1.2694, "step": 25400 }, { "epoch": 77.0, "eval_loss": 3.4108803272247314, "eval_runtime": 1.004, "eval_samples_per_second": 33.865, "eval_steps_per_second": 33.865, "step": 25487 }, { "epoch": 77.04, "learning_rate": 4.7146188818785276e-08, "loss": 1.25, "step": 25500 }, { "epoch": 77.34, "learning_rate": 2.559648997196144e-09, "loss": 1.2614, "step": 25600 }, { "epoch": 77.64, "learning_rate": 1.6338171238500445e-08, "loss": 1.2549, "step": 25700 }, { "epoch": 77.95, "learning_rate": 7.63045897109416e-08, "loss": 1.2251, "step": 25800 }, { "epoch": 78.0, "eval_loss": 3.4106197357177734, "eval_runtime": 1.0035, "eval_samples_per_second": 33.882, "eval_steps_per_second": 33.882, "step": 25818 }, { "epoch": 78.25, "learning_rate": 1.2946185514557884e-07, "loss": 1.2965, "step": 25900 }, { "epoch": 78.55, "learning_rate": 1.288307033641468e-07, "loss": 1.2137, "step": 26000 }, { "epoch": 78.85, "learning_rate": 7.496893292971942e-08, "loss": 1.2673, "step": 26100 }, { "epoch": 79.0, "eval_loss": 3.409691333770752, "eval_runtime": 1.0062, "eval_samples_per_second": 33.792, "eval_steps_per_second": 33.792, "step": 26149 }, { "epoch": 79.15, "learning_rate": 1.5478434602829786e-08, "loss": 1.225, "step": 26200 }, { "epoch": 79.46, "learning_rate": 2.9356495179890543e-09, "loss": 1.2545, "step": 26300 }, { "epoch": 79.76, "learning_rate": 4.842562520753367e-08, "loss": 1.233, "step": 26400 }, { "epoch": 80.0, "eval_loss": 3.409552574157715, "eval_runtime": 1.0059, "eval_samples_per_second": 33.8, "eval_steps_per_second": 33.8, "step": 26480 }, { "epoch": 80.06, "learning_rate": 1.1174528561199565e-07, "loss": 1.2286, "step": 26500 }, { "epoch": 80.36, "learning_rate": 1.3693405733661706e-07, "loss": 1.2464, "step": 26600 }, { "epoch": 80.66, "learning_rate": 1.0173063797049695e-07, "loss": 1.2497, "step": 26700 }, { "epoch": 80.97, "learning_rate": 3.7247063184712124e-08, "loss": 1.2408, "step": 26800 }, { "epoch": 81.0, "eval_loss": 3.4087181091308594, "eval_runtime": 1.0321, "eval_samples_per_second": 32.942, "eval_steps_per_second": 32.942, "step": 26811 }, { "epoch": 81.27, "learning_rate": 4.725492275192098e-10, "loss": 1.2461, "step": 26900 }, { "epoch": 81.57, "learning_rate": 2.3400433722385994e-08, "loss": 1.2495, "step": 27000 }, { "epoch": 81.87, "learning_rate": 8.619280748313942e-08, "loss": 1.2579, "step": 27100 }, { "epoch": 82.0, "eval_loss": 3.408783435821533, "eval_runtime": 1.0924, "eval_samples_per_second": 31.123, "eval_steps_per_second": 31.123, "step": 27142 }, { "epoch": 82.18, "learning_rate": 1.3343703098390236e-07, "loss": 1.2153, "step": 27200 }, { "epoch": 82.48, "learning_rate": 1.233796611423087e-07, "loss": 1.2398, "step": 27300 }, { "epoch": 82.78, "learning_rate": 6.490918819320101e-08, "loss": 1.2346, "step": 27400 }, { "epoch": 83.0, "eval_loss": 3.408099412918091, "eval_runtime": 1.0406, "eval_samples_per_second": 32.673, "eval_steps_per_second": 32.673, "step": 27473 }, { "epoch": 83.08, "learning_rate": 9.700576468258166e-09, "loss": 1.2543, "step": 27500 }, { "epoch": 83.38, "learning_rate": 6.546026505537958e-09, "loss": 1.247, "step": 27600 }, { "epoch": 83.69, "learning_rate": 5.823346268095307e-08, "loss": 1.2225, "step": 27700 }, { "epoch": 83.99, "learning_rate": 1.190826247529784e-07, "loss": 1.2298, "step": 27800 }, { "epoch": 84.0, "eval_loss": 3.4082374572753906, "eval_runtime": 1.0242, "eval_samples_per_second": 33.197, "eval_steps_per_second": 33.197, "step": 27804 }, { "epoch": 84.29, "learning_rate": 1.3531631337483422e-07, "loss": 1.2426, "step": 27900 }, { "epoch": 84.59, "learning_rate": 9.258753872080096e-08, "loss": 1.2389, "step": 28000 }, { "epoch": 84.89, "learning_rate": 2.865908591672022e-08, "loss": 1.219, "step": 28100 }, { "epoch": 85.0, "eval_loss": 3.407871961593628, "eval_runtime": 0.9538, "eval_samples_per_second": 35.648, "eval_steps_per_second": 35.648, "step": 28135 }, { "epoch": 85.2, "learning_rate": 2.956617282787915e-11, "loss": 1.2712, "step": 28200 }, { "epoch": 85.5, "learning_rate": 3.2001142050499094e-08, "loss": 1.2184, "step": 28300 }, { "epoch": 85.8, "learning_rate": 9.631801263387146e-08, "loss": 1.2515, "step": 28400 }, { "epoch": 86.0, "eval_loss": 3.4080073833465576, "eval_runtime": 0.951, "eval_samples_per_second": 35.752, "eval_steps_per_second": 35.752, "step": 28466 }, { "epoch": 86.1, "learning_rate": 1.361382913144042e-07, "loss": 1.2196, "step": 28500 }, { "epoch": 86.4, "learning_rate": 1.162696600432059e-07, "loss": 1.2085, "step": 28600 }, { "epoch": 86.71, "learning_rate": 5.427159389331479e-08, "loss": 1.2316, "step": 28700 }, { "epoch": 87.0, "eval_loss": 3.408378839492798, "eval_runtime": 0.9581, "eval_samples_per_second": 35.486, "eval_steps_per_second": 35.486, "step": 28797 }, { "epoch": 87.01, "learning_rate": 4.936669277050985e-09, "loss": 1.2578, "step": 28800 }, { "epoch": 87.31, "learning_rate": 1.1866046596701257e-08, "loss": 1.24, "step": 28900 }, { "epoch": 87.61, "learning_rate": 6.893568909023435e-08, "loss": 1.2342, "step": 29000 }, { "epoch": 87.92, "learning_rate": 1.2570865668292598e-07, "loss": 1.2085, "step": 29100 }, { "epoch": 88.0, "eval_loss": 3.4085471630096436, "eval_runtime": 1.0334, "eval_samples_per_second": 32.901, "eval_steps_per_second": 32.901, "step": 29128 }, { "epoch": 88.22, "learning_rate": 1.32010204287465e-07, "loss": 1.2504, "step": 29200 }, { "epoch": 88.52, "learning_rate": 8.227115773311707e-08, "loss": 1.2419, "step": 29300 }, { "epoch": 88.82, "learning_rate": 2.0449831823058302e-08, "loss": 1.2334, "step": 29400 }, { "epoch": 89.0, "eval_loss": 3.4085144996643066, "eval_runtime": 1.0283, "eval_samples_per_second": 33.063, "eval_steps_per_second": 33.063, "step": 29459 }, { "epoch": 89.12, "learning_rate": 1.1826036928064389e-09, "loss": 1.2213, "step": 29500 }, { "epoch": 89.43, "learning_rate": 4.088198736612909e-08, "loss": 1.2165, "step": 29600 }, { "epoch": 89.73, "learning_rate": 1.0519885794950143e-07, "loss": 1.2263, "step": 29700 }, { "epoch": 90.0, "eval_loss": 3.408367156982422, "eval_runtime": 1.0186, "eval_samples_per_second": 33.38, "eval_steps_per_second": 33.38, "step": 29790 }, { "epoch": 90.0, "step": 29790, "total_flos": 1.037068075008e+16, "train_loss": 0.13739024364296004, "train_runtime": 461.3754, "train_samples_per_second": 64.568, "train_steps_per_second": 64.568 } ], "max_steps": 29790, "num_train_epochs": 90, "total_flos": 1.037068075008e+16, "trial_name": null, "trial_params": null }