|
{ |
|
"best_metric": 0.5539772727272727, |
|
"best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-1600", |
|
"epoch": 119.00833333333334, |
|
"eval_steps": 500, |
|
"global_step": 1920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005208333333333333, |
|
"grad_norm": 3.7305831909179688, |
|
"learning_rate": 2.604166666666667e-06, |
|
"loss": 4.2451, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008333333333333333, |
|
"eval_accuracy": 0.01065340909090909, |
|
"eval_loss": 4.228378772735596, |
|
"eval_runtime": 133.9626, |
|
"eval_samples_per_second": 10.51, |
|
"eval_steps_per_second": 0.164, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.0020833333333334, |
|
"grad_norm": 3.0969808101654053, |
|
"learning_rate": 5.208333333333334e-06, |
|
"loss": 4.231, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0072916666666667, |
|
"grad_norm": 4.606812000274658, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 4.2251, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0083333333333333, |
|
"eval_accuracy": 0.01065340909090909, |
|
"eval_loss": 4.215222358703613, |
|
"eval_runtime": 129.5768, |
|
"eval_samples_per_second": 10.866, |
|
"eval_steps_per_second": 0.17, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.004166666666667, |
|
"grad_norm": 3.574765682220459, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 4.2276, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0083333333333333, |
|
"eval_accuracy": 0.012073863636363636, |
|
"eval_loss": 4.209568500518799, |
|
"eval_runtime": 130.2735, |
|
"eval_samples_per_second": 10.808, |
|
"eval_steps_per_second": 0.169, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.0010416666666666, |
|
"grad_norm": 4.630516052246094, |
|
"learning_rate": 1.3020833333333334e-05, |
|
"loss": 4.2126, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.00625, |
|
"grad_norm": 3.0715420246124268, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 4.2146, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.0083333333333333, |
|
"eval_accuracy": 0.014914772727272728, |
|
"eval_loss": 4.212440490722656, |
|
"eval_runtime": 128.256, |
|
"eval_samples_per_second": 10.978, |
|
"eval_steps_per_second": 0.172, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 4.003125, |
|
"grad_norm": 2.934682846069336, |
|
"learning_rate": 1.8229166666666668e-05, |
|
"loss": 4.2235, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.008333333333334, |
|
"grad_norm": 8.70511245727539, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 4.2217, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.008333333333334, |
|
"eval_accuracy": 0.01775568181818182, |
|
"eval_loss": 4.2041802406311035, |
|
"eval_runtime": 126.2921, |
|
"eval_samples_per_second": 11.149, |
|
"eval_steps_per_second": 0.174, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.005208333333333, |
|
"grad_norm": 2.5297322273254395, |
|
"learning_rate": 2.34375e-05, |
|
"loss": 4.2091, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 5.008333333333334, |
|
"eval_accuracy": 0.0234375, |
|
"eval_loss": 4.205115795135498, |
|
"eval_runtime": 127.2102, |
|
"eval_samples_per_second": 11.068, |
|
"eval_steps_per_second": 0.173, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 6.002083333333333, |
|
"grad_norm": 2.4549720287323, |
|
"learning_rate": 2.604166666666667e-05, |
|
"loss": 4.2256, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.007291666666666, |
|
"grad_norm": 2.3255746364593506, |
|
"learning_rate": 2.8645833333333333e-05, |
|
"loss": 4.2085, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 6.008333333333334, |
|
"eval_accuracy": 0.018465909090909092, |
|
"eval_loss": 4.193885803222656, |
|
"eval_runtime": 130.9427, |
|
"eval_samples_per_second": 10.753, |
|
"eval_steps_per_second": 0.168, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 7.004166666666666, |
|
"grad_norm": 2.676177978515625, |
|
"learning_rate": 3.125e-05, |
|
"loss": 4.2044, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 7.008333333333334, |
|
"eval_accuracy": 0.0390625, |
|
"eval_loss": 4.1791582107543945, |
|
"eval_runtime": 125.2796, |
|
"eval_samples_per_second": 11.239, |
|
"eval_steps_per_second": 0.176, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 8.001041666666667, |
|
"grad_norm": 2.616762161254883, |
|
"learning_rate": 3.385416666666667e-05, |
|
"loss": 4.1911, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 8.00625, |
|
"grad_norm": 2.983841896057129, |
|
"learning_rate": 3.6458333333333336e-05, |
|
"loss": 4.1624, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 8.008333333333333, |
|
"eval_accuracy": 0.022017045454545456, |
|
"eval_loss": 4.201480388641357, |
|
"eval_runtime": 136.9987, |
|
"eval_samples_per_second": 10.277, |
|
"eval_steps_per_second": 0.161, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 9.003125, |
|
"grad_norm": 2.874126434326172, |
|
"learning_rate": 3.90625e-05, |
|
"loss": 4.1349, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 9.008333333333333, |
|
"grad_norm": 10.345636367797852, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 4.1253, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 9.008333333333333, |
|
"eval_accuracy": 0.029829545454545456, |
|
"eval_loss": 4.1215105056762695, |
|
"eval_runtime": 132.9763, |
|
"eval_samples_per_second": 10.588, |
|
"eval_steps_per_second": 0.165, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 10.005208333333334, |
|
"grad_norm": 3.5383424758911133, |
|
"learning_rate": 4.4270833333333337e-05, |
|
"loss": 4.0308, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.008333333333333, |
|
"eval_accuracy": 0.07102272727272728, |
|
"eval_loss": 4.002456188201904, |
|
"eval_runtime": 127.9472, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 0.172, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 11.002083333333333, |
|
"grad_norm": 4.1187543869018555, |
|
"learning_rate": 4.6875e-05, |
|
"loss": 3.9438, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 11.007291666666667, |
|
"grad_norm": 4.370882034301758, |
|
"learning_rate": 4.947916666666667e-05, |
|
"loss": 3.8065, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 11.008333333333333, |
|
"eval_accuracy": 0.08380681818181818, |
|
"eval_loss": 3.872328758239746, |
|
"eval_runtime": 142.2129, |
|
"eval_samples_per_second": 9.901, |
|
"eval_steps_per_second": 0.155, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 12.004166666666666, |
|
"grad_norm": 3.4101152420043945, |
|
"learning_rate": 4.976851851851852e-05, |
|
"loss": 3.7614, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 12.008333333333333, |
|
"eval_accuracy": 0.09943181818181818, |
|
"eval_loss": 3.7138783931732178, |
|
"eval_runtime": 125.568, |
|
"eval_samples_per_second": 11.213, |
|
"eval_steps_per_second": 0.175, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 13.001041666666667, |
|
"grad_norm": 4.585361480712891, |
|
"learning_rate": 4.947916666666667e-05, |
|
"loss": 3.6302, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 13.00625, |
|
"grad_norm": 4.209935665130615, |
|
"learning_rate": 4.9189814814814815e-05, |
|
"loss": 3.4761, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 13.008333333333333, |
|
"eval_accuracy": 0.1434659090909091, |
|
"eval_loss": 3.6160459518432617, |
|
"eval_runtime": 133.4296, |
|
"eval_samples_per_second": 10.552, |
|
"eval_steps_per_second": 0.165, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 14.003125, |
|
"grad_norm": 4.6234049797058105, |
|
"learning_rate": 4.8900462962962965e-05, |
|
"loss": 3.3764, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 14.008333333333333, |
|
"grad_norm": 13.464088439941406, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 3.278, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 14.008333333333333, |
|
"eval_accuracy": 0.1924715909090909, |
|
"eval_loss": 3.3939576148986816, |
|
"eval_runtime": 129.3255, |
|
"eval_samples_per_second": 10.887, |
|
"eval_steps_per_second": 0.17, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 15.005208333333334, |
|
"grad_norm": 4.547211647033691, |
|
"learning_rate": 4.8321759259259265e-05, |
|
"loss": 3.0999, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 15.008333333333333, |
|
"eval_accuracy": 0.2080965909090909, |
|
"eval_loss": 3.3182637691497803, |
|
"eval_runtime": 123.805, |
|
"eval_samples_per_second": 11.373, |
|
"eval_steps_per_second": 0.178, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 16.002083333333335, |
|
"grad_norm": 4.825346946716309, |
|
"learning_rate": 4.803240740740741e-05, |
|
"loss": 3.0213, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 16.007291666666667, |
|
"grad_norm": 5.93734884262085, |
|
"learning_rate": 4.774305555555556e-05, |
|
"loss": 2.9721, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 16.008333333333333, |
|
"eval_accuracy": 0.24857954545454544, |
|
"eval_loss": 3.159555673599243, |
|
"eval_runtime": 124.115, |
|
"eval_samples_per_second": 11.344, |
|
"eval_steps_per_second": 0.177, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 17.004166666666666, |
|
"grad_norm": 4.84201717376709, |
|
"learning_rate": 4.745370370370371e-05, |
|
"loss": 2.8064, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 17.008333333333333, |
|
"eval_accuracy": 0.26704545454545453, |
|
"eval_loss": 3.023179054260254, |
|
"eval_runtime": 131.2545, |
|
"eval_samples_per_second": 10.727, |
|
"eval_steps_per_second": 0.168, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 18.001041666666666, |
|
"grad_norm": 5.273012161254883, |
|
"learning_rate": 4.716435185185186e-05, |
|
"loss": 2.813, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 18.00625, |
|
"grad_norm": 5.6201276779174805, |
|
"learning_rate": 4.6875e-05, |
|
"loss": 2.6554, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 18.008333333333333, |
|
"eval_accuracy": 0.2911931818181818, |
|
"eval_loss": 2.9448060989379883, |
|
"eval_runtime": 123.4251, |
|
"eval_samples_per_second": 11.408, |
|
"eval_steps_per_second": 0.178, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 19.003125, |
|
"grad_norm": 7.461442470550537, |
|
"learning_rate": 4.658564814814815e-05, |
|
"loss": 2.6668, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 19.008333333333333, |
|
"grad_norm": 14.555140495300293, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 2.5052, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 19.008333333333333, |
|
"eval_accuracy": 0.3309659090909091, |
|
"eval_loss": 2.828484535217285, |
|
"eval_runtime": 127.0838, |
|
"eval_samples_per_second": 11.079, |
|
"eval_steps_per_second": 0.173, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 20.005208333333332, |
|
"grad_norm": 6.753479957580566, |
|
"learning_rate": 4.6006944444444444e-05, |
|
"loss": 2.4322, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 20.008333333333333, |
|
"eval_accuracy": 0.34019886363636365, |
|
"eval_loss": 2.7478535175323486, |
|
"eval_runtime": 125.7771, |
|
"eval_samples_per_second": 11.194, |
|
"eval_steps_per_second": 0.175, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 21.002083333333335, |
|
"grad_norm": 5.665611743927002, |
|
"learning_rate": 4.5717592592592594e-05, |
|
"loss": 2.4376, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 21.007291666666667, |
|
"grad_norm": 5.6526570320129395, |
|
"learning_rate": 4.5428240740740744e-05, |
|
"loss": 2.3193, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 21.008333333333333, |
|
"eval_accuracy": 0.3309659090909091, |
|
"eval_loss": 2.794116735458374, |
|
"eval_runtime": 126.5702, |
|
"eval_samples_per_second": 11.124, |
|
"eval_steps_per_second": 0.174, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 22.004166666666666, |
|
"grad_norm": 6.312890529632568, |
|
"learning_rate": 4.5138888888888894e-05, |
|
"loss": 2.2565, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 22.008333333333333, |
|
"eval_accuracy": 0.3671875, |
|
"eval_loss": 2.6383402347564697, |
|
"eval_runtime": 123.0345, |
|
"eval_samples_per_second": 11.444, |
|
"eval_steps_per_second": 0.179, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 23.001041666666666, |
|
"grad_norm": 5.865190505981445, |
|
"learning_rate": 4.484953703703704e-05, |
|
"loss": 2.1257, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 23.00625, |
|
"grad_norm": 5.813269138336182, |
|
"learning_rate": 4.456018518518519e-05, |
|
"loss": 2.1405, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 23.008333333333333, |
|
"eval_accuracy": 0.36079545454545453, |
|
"eval_loss": 2.5905861854553223, |
|
"eval_runtime": 126.2698, |
|
"eval_samples_per_second": 11.151, |
|
"eval_steps_per_second": 0.174, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 24.003125, |
|
"grad_norm": 6.6013569831848145, |
|
"learning_rate": 4.4270833333333337e-05, |
|
"loss": 1.9715, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 24.008333333333333, |
|
"grad_norm": 14.786352157592773, |
|
"learning_rate": 4.3981481481481486e-05, |
|
"loss": 2.1049, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 24.008333333333333, |
|
"eval_accuracy": 0.37855113636363635, |
|
"eval_loss": 2.5515265464782715, |
|
"eval_runtime": 134.242, |
|
"eval_samples_per_second": 10.489, |
|
"eval_steps_per_second": 0.164, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 25.005208333333332, |
|
"grad_norm": 6.399519920349121, |
|
"learning_rate": 4.369212962962963e-05, |
|
"loss": 1.8424, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 25.008333333333333, |
|
"eval_accuracy": 0.39204545454545453, |
|
"eval_loss": 2.4692277908325195, |
|
"eval_runtime": 134.5245, |
|
"eval_samples_per_second": 10.466, |
|
"eval_steps_per_second": 0.164, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 26.002083333333335, |
|
"grad_norm": 5.429991245269775, |
|
"learning_rate": 4.340277777777778e-05, |
|
"loss": 2.0616, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 26.007291666666667, |
|
"grad_norm": 6.501383304595947, |
|
"learning_rate": 4.311342592592593e-05, |
|
"loss": 1.8685, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 26.008333333333333, |
|
"eval_accuracy": 0.4275568181818182, |
|
"eval_loss": 2.4325406551361084, |
|
"eval_runtime": 138.3084, |
|
"eval_samples_per_second": 10.18, |
|
"eval_steps_per_second": 0.159, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 27.004166666666666, |
|
"grad_norm": 6.909026622772217, |
|
"learning_rate": 4.282407407407408e-05, |
|
"loss": 1.7478, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 27.008333333333333, |
|
"eval_accuracy": 0.4147727272727273, |
|
"eval_loss": 2.416501522064209, |
|
"eval_runtime": 128.9504, |
|
"eval_samples_per_second": 10.919, |
|
"eval_steps_per_second": 0.171, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 28.001041666666666, |
|
"grad_norm": 6.359575271606445, |
|
"learning_rate": 4.253472222222222e-05, |
|
"loss": 2.0103, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 28.00625, |
|
"grad_norm": 7.018951416015625, |
|
"learning_rate": 4.224537037037037e-05, |
|
"loss": 1.7072, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 28.008333333333333, |
|
"eval_accuracy": 0.4268465909090909, |
|
"eval_loss": 2.3617048263549805, |
|
"eval_runtime": 126.6826, |
|
"eval_samples_per_second": 11.114, |
|
"eval_steps_per_second": 0.174, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 29.003125, |
|
"grad_norm": 5.399332523345947, |
|
"learning_rate": 4.195601851851852e-05, |
|
"loss": 1.8236, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 29.008333333333333, |
|
"grad_norm": 14.481021881103516, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.7206, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 29.008333333333333, |
|
"eval_accuracy": 0.4303977272727273, |
|
"eval_loss": 2.372326135635376, |
|
"eval_runtime": 128.5457, |
|
"eval_samples_per_second": 10.953, |
|
"eval_steps_per_second": 0.171, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 30.005208333333332, |
|
"grad_norm": 7.11575984954834, |
|
"learning_rate": 4.1377314814814815e-05, |
|
"loss": 1.693, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 30.008333333333333, |
|
"eval_accuracy": 0.4424715909090909, |
|
"eval_loss": 2.2890071868896484, |
|
"eval_runtime": 128.9784, |
|
"eval_samples_per_second": 10.917, |
|
"eval_steps_per_second": 0.171, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 31.002083333333335, |
|
"grad_norm": 7.223865985870361, |
|
"learning_rate": 4.1087962962962965e-05, |
|
"loss": 1.6722, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 31.007291666666667, |
|
"grad_norm": 7.608266353607178, |
|
"learning_rate": 4.0798611111111115e-05, |
|
"loss": 1.6347, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 31.008333333333333, |
|
"eval_accuracy": 0.44105113636363635, |
|
"eval_loss": 2.244246482849121, |
|
"eval_runtime": 129.1752, |
|
"eval_samples_per_second": 10.9, |
|
"eval_steps_per_second": 0.17, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 32.00416666666667, |
|
"grad_norm": 7.774643898010254, |
|
"learning_rate": 4.0509259259259265e-05, |
|
"loss": 1.5276, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 32.00833333333333, |
|
"eval_accuracy": 0.46732954545454547, |
|
"eval_loss": 2.210369110107422, |
|
"eval_runtime": 120.0766, |
|
"eval_samples_per_second": 11.726, |
|
"eval_steps_per_second": 0.183, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 33.001041666666666, |
|
"grad_norm": 6.766725063323975, |
|
"learning_rate": 4.021990740740741e-05, |
|
"loss": 1.5203, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 33.00625, |
|
"grad_norm": 5.995940208435059, |
|
"learning_rate": 3.993055555555556e-05, |
|
"loss": 1.4576, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 33.00833333333333, |
|
"eval_accuracy": 0.45454545454545453, |
|
"eval_loss": 2.227916717529297, |
|
"eval_runtime": 124.0445, |
|
"eval_samples_per_second": 11.351, |
|
"eval_steps_per_second": 0.177, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 34.003125, |
|
"grad_norm": 6.610184669494629, |
|
"learning_rate": 3.964120370370371e-05, |
|
"loss": 1.5347, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 34.00833333333333, |
|
"grad_norm": 24.694988250732422, |
|
"learning_rate": 3.935185185185186e-05, |
|
"loss": 1.5455, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 34.00833333333333, |
|
"eval_accuracy": 0.4524147727272727, |
|
"eval_loss": 2.204953908920288, |
|
"eval_runtime": 131.2835, |
|
"eval_samples_per_second": 10.725, |
|
"eval_steps_per_second": 0.168, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 35.005208333333336, |
|
"grad_norm": 6.918396472930908, |
|
"learning_rate": 3.90625e-05, |
|
"loss": 1.4485, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 35.00833333333333, |
|
"eval_accuracy": 0.4737215909090909, |
|
"eval_loss": 2.1584527492523193, |
|
"eval_runtime": 134.6721, |
|
"eval_samples_per_second": 10.455, |
|
"eval_steps_per_second": 0.163, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 36.00208333333333, |
|
"grad_norm": 6.077505588531494, |
|
"learning_rate": 3.877314814814815e-05, |
|
"loss": 1.4391, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 36.00729166666667, |
|
"grad_norm": 6.617316722869873, |
|
"learning_rate": 3.84837962962963e-05, |
|
"loss": 1.3896, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 36.00833333333333, |
|
"eval_accuracy": 0.4446022727272727, |
|
"eval_loss": 2.1850500106811523, |
|
"eval_runtime": 129.9914, |
|
"eval_samples_per_second": 10.831, |
|
"eval_steps_per_second": 0.169, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 37.00416666666667, |
|
"grad_norm": 7.900727272033691, |
|
"learning_rate": 3.8194444444444444e-05, |
|
"loss": 1.3766, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 37.00833333333333, |
|
"eval_accuracy": 0.4872159090909091, |
|
"eval_loss": 2.118501663208008, |
|
"eval_runtime": 131.0307, |
|
"eval_samples_per_second": 10.746, |
|
"eval_steps_per_second": 0.168, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 38.001041666666666, |
|
"grad_norm": 6.658583641052246, |
|
"learning_rate": 3.7905092592592594e-05, |
|
"loss": 1.287, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 38.00625, |
|
"grad_norm": 5.483121395111084, |
|
"learning_rate": 3.7615740740740744e-05, |
|
"loss": 1.4035, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 38.00833333333333, |
|
"eval_accuracy": 0.4794034090909091, |
|
"eval_loss": 2.116427183151245, |
|
"eval_runtime": 133.3027, |
|
"eval_samples_per_second": 10.562, |
|
"eval_steps_per_second": 0.165, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 39.003125, |
|
"grad_norm": 6.027029991149902, |
|
"learning_rate": 3.7326388888888893e-05, |
|
"loss": 1.416, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 39.00833333333333, |
|
"grad_norm": 17.550609588623047, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 1.5892, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 39.00833333333333, |
|
"eval_accuracy": 0.48011363636363635, |
|
"eval_loss": 2.102943181991577, |
|
"eval_runtime": 124.9558, |
|
"eval_samples_per_second": 11.268, |
|
"eval_steps_per_second": 0.176, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 40.005208333333336, |
|
"grad_norm": 8.286770820617676, |
|
"learning_rate": 3.6747685185185186e-05, |
|
"loss": 1.3647, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 40.00833333333333, |
|
"eval_accuracy": 0.4928977272727273, |
|
"eval_loss": 2.0912482738494873, |
|
"eval_runtime": 125.7809, |
|
"eval_samples_per_second": 11.194, |
|
"eval_steps_per_second": 0.175, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 41.00208333333333, |
|
"grad_norm": 6.9772138595581055, |
|
"learning_rate": 3.6458333333333336e-05, |
|
"loss": 1.3769, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 41.00729166666667, |
|
"grad_norm": 6.215446472167969, |
|
"learning_rate": 3.6168981481481486e-05, |
|
"loss": 1.388, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 41.00833333333333, |
|
"eval_accuracy": 0.47301136363636365, |
|
"eval_loss": 2.1330864429473877, |
|
"eval_runtime": 134.2267, |
|
"eval_samples_per_second": 10.49, |
|
"eval_steps_per_second": 0.164, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 42.00416666666667, |
|
"grad_norm": 6.233066082000732, |
|
"learning_rate": 3.587962962962963e-05, |
|
"loss": 1.3425, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 42.00833333333333, |
|
"eval_accuracy": 0.4794034090909091, |
|
"eval_loss": 2.143657922744751, |
|
"eval_runtime": 126.4352, |
|
"eval_samples_per_second": 11.136, |
|
"eval_steps_per_second": 0.174, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 43.001041666666666, |
|
"grad_norm": 7.223750591278076, |
|
"learning_rate": 3.559027777777778e-05, |
|
"loss": 1.1958, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 43.00625, |
|
"grad_norm": 6.594886779785156, |
|
"learning_rate": 3.530092592592593e-05, |
|
"loss": 1.2909, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 43.00833333333333, |
|
"eval_accuracy": 0.4715909090909091, |
|
"eval_loss": 2.109005928039551, |
|
"eval_runtime": 129.7594, |
|
"eval_samples_per_second": 10.851, |
|
"eval_steps_per_second": 0.17, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 44.003125, |
|
"grad_norm": 5.431155681610107, |
|
"learning_rate": 3.501157407407408e-05, |
|
"loss": 1.2277, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 44.00833333333333, |
|
"grad_norm": 23.15688133239746, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 1.2757, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 44.00833333333333, |
|
"eval_accuracy": 0.4900568181818182, |
|
"eval_loss": 2.0685956478118896, |
|
"eval_runtime": 117.5767, |
|
"eval_samples_per_second": 11.975, |
|
"eval_steps_per_second": 0.187, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 45.005208333333336, |
|
"grad_norm": 6.826360702514648, |
|
"learning_rate": 3.443287037037037e-05, |
|
"loss": 1.181, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 45.00833333333333, |
|
"eval_accuracy": 0.4893465909090909, |
|
"eval_loss": 2.0484628677368164, |
|
"eval_runtime": 122.183, |
|
"eval_samples_per_second": 11.524, |
|
"eval_steps_per_second": 0.18, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 46.00208333333333, |
|
"grad_norm": 6.446974277496338, |
|
"learning_rate": 3.414351851851852e-05, |
|
"loss": 1.2034, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 46.00729166666667, |
|
"grad_norm": 6.026095390319824, |
|
"learning_rate": 3.385416666666667e-05, |
|
"loss": 1.1825, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 46.00833333333333, |
|
"eval_accuracy": 0.484375, |
|
"eval_loss": 2.0560503005981445, |
|
"eval_runtime": 129.8384, |
|
"eval_samples_per_second": 10.844, |
|
"eval_steps_per_second": 0.169, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 47.00416666666667, |
|
"grad_norm": 7.490734577178955, |
|
"learning_rate": 3.3564814814814815e-05, |
|
"loss": 1.1594, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 47.00833333333333, |
|
"eval_accuracy": 0.49644886363636365, |
|
"eval_loss": 2.0327041149139404, |
|
"eval_runtime": 125.1147, |
|
"eval_samples_per_second": 11.254, |
|
"eval_steps_per_second": 0.176, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 48.001041666666666, |
|
"grad_norm": 6.074591636657715, |
|
"learning_rate": 3.3275462962962965e-05, |
|
"loss": 1.0989, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 48.00625, |
|
"grad_norm": 7.924504280090332, |
|
"learning_rate": 3.2986111111111115e-05, |
|
"loss": 1.1699, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 48.00833333333333, |
|
"eval_accuracy": 0.4765625, |
|
"eval_loss": 2.095003366470337, |
|
"eval_runtime": 131.8398, |
|
"eval_samples_per_second": 10.68, |
|
"eval_steps_per_second": 0.167, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 49.003125, |
|
"grad_norm": 7.409806728363037, |
|
"learning_rate": 3.2696759259259265e-05, |
|
"loss": 1.2027, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 49.00833333333333, |
|
"grad_norm": 23.586345672607422, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 1.1908, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 49.00833333333333, |
|
"eval_accuracy": 0.4850852272727273, |
|
"eval_loss": 2.0465078353881836, |
|
"eval_runtime": 129.3358, |
|
"eval_samples_per_second": 10.886, |
|
"eval_steps_per_second": 0.17, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 50.005208333333336, |
|
"grad_norm": 7.079756736755371, |
|
"learning_rate": 3.211805555555556e-05, |
|
"loss": 1.1149, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 50.00833333333333, |
|
"eval_accuracy": 0.48792613636363635, |
|
"eval_loss": 2.0569851398468018, |
|
"eval_runtime": 123.867, |
|
"eval_samples_per_second": 11.367, |
|
"eval_steps_per_second": 0.178, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 51.00208333333333, |
|
"grad_norm": 5.848146438598633, |
|
"learning_rate": 3.182870370370371e-05, |
|
"loss": 1.0907, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 51.00729166666667, |
|
"grad_norm": 7.725104808807373, |
|
"learning_rate": 3.153935185185186e-05, |
|
"loss": 1.1388, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 51.00833333333333, |
|
"eval_accuracy": 0.4978693181818182, |
|
"eval_loss": 2.0232162475585938, |
|
"eval_runtime": 124.9726, |
|
"eval_samples_per_second": 11.266, |
|
"eval_steps_per_second": 0.176, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 52.00416666666667, |
|
"grad_norm": 6.411397457122803, |
|
"learning_rate": 3.125e-05, |
|
"loss": 1.0421, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 52.00833333333333, |
|
"eval_accuracy": 0.49857954545454547, |
|
"eval_loss": 2.0132648944854736, |
|
"eval_runtime": 130.0486, |
|
"eval_samples_per_second": 10.827, |
|
"eval_steps_per_second": 0.169, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 53.001041666666666, |
|
"grad_norm": 6.417796611785889, |
|
"learning_rate": 3.0960648148148144e-05, |
|
"loss": 1.128, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 53.00625, |
|
"grad_norm": 6.483155727386475, |
|
"learning_rate": 3.06712962962963e-05, |
|
"loss": 1.1243, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 53.00833333333333, |
|
"eval_accuracy": 0.4900568181818182, |
|
"eval_loss": 2.0420944690704346, |
|
"eval_runtime": 131.0098, |
|
"eval_samples_per_second": 10.747, |
|
"eval_steps_per_second": 0.168, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 54.003125, |
|
"grad_norm": 7.885180950164795, |
|
"learning_rate": 3.0381944444444444e-05, |
|
"loss": 1.0331, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 54.00833333333333, |
|
"grad_norm": 28.23676300048828, |
|
"learning_rate": 3.0092592592592593e-05, |
|
"loss": 1.1064, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 54.00833333333333, |
|
"eval_accuracy": 0.5042613636363636, |
|
"eval_loss": 1.9613640308380127, |
|
"eval_runtime": 129.6513, |
|
"eval_samples_per_second": 10.86, |
|
"eval_steps_per_second": 0.17, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 55.005208333333336, |
|
"grad_norm": 7.0548319816589355, |
|
"learning_rate": 2.980324074074074e-05, |
|
"loss": 0.9778, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 55.00833333333333, |
|
"eval_accuracy": 0.5071022727272727, |
|
"eval_loss": 1.9938956499099731, |
|
"eval_runtime": 125.6294, |
|
"eval_samples_per_second": 11.208, |
|
"eval_steps_per_second": 0.175, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 56.00208333333333, |
|
"grad_norm": 7.993034362792969, |
|
"learning_rate": 2.951388888888889e-05, |
|
"loss": 1.0192, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 56.00729166666667, |
|
"grad_norm": 8.833446502685547, |
|
"learning_rate": 2.9224537037037036e-05, |
|
"loss": 1.1417, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 56.00833333333333, |
|
"eval_accuracy": 0.5106534090909091, |
|
"eval_loss": 1.977372646331787, |
|
"eval_runtime": 141.0715, |
|
"eval_samples_per_second": 9.981, |
|
"eval_steps_per_second": 0.156, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 57.00416666666667, |
|
"grad_norm": 6.480681896209717, |
|
"learning_rate": 2.8935185185185186e-05, |
|
"loss": 1.0578, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 57.00833333333333, |
|
"eval_accuracy": 0.5305397727272727, |
|
"eval_loss": 1.9625455141067505, |
|
"eval_runtime": 137.3354, |
|
"eval_samples_per_second": 10.252, |
|
"eval_steps_per_second": 0.16, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 58.001041666666666, |
|
"grad_norm": 6.03174352645874, |
|
"learning_rate": 2.8645833333333333e-05, |
|
"loss": 0.9695, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 58.00625, |
|
"grad_norm": 5.930421352386475, |
|
"learning_rate": 2.8356481481481483e-05, |
|
"loss": 1.0904, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 58.00833333333333, |
|
"eval_accuracy": 0.5056818181818182, |
|
"eval_loss": 1.971279501914978, |
|
"eval_runtime": 124.5828, |
|
"eval_samples_per_second": 11.302, |
|
"eval_steps_per_second": 0.177, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 59.003125, |
|
"grad_norm": 7.177443027496338, |
|
"learning_rate": 2.806712962962963e-05, |
|
"loss": 1.0493, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 59.00833333333333, |
|
"grad_norm": 32.896873474121094, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.2569, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 59.00833333333333, |
|
"eval_accuracy": 0.5255681818181818, |
|
"eval_loss": 1.9495558738708496, |
|
"eval_runtime": 125.0434, |
|
"eval_samples_per_second": 11.26, |
|
"eval_steps_per_second": 0.176, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 60.005208333333336, |
|
"grad_norm": 6.914553642272949, |
|
"learning_rate": 2.7488425925925926e-05, |
|
"loss": 1.076, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 60.00833333333333, |
|
"eval_accuracy": 0.5369318181818182, |
|
"eval_loss": 1.9237945079803467, |
|
"eval_runtime": 130.5511, |
|
"eval_samples_per_second": 10.785, |
|
"eval_steps_per_second": 0.169, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 61.00208333333333, |
|
"grad_norm": 6.131436347961426, |
|
"learning_rate": 2.7199074074074076e-05, |
|
"loss": 0.9972, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 61.00729166666667, |
|
"grad_norm": 8.375364303588867, |
|
"learning_rate": 2.6909722222222222e-05, |
|
"loss": 1.018, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 61.00833333333333, |
|
"eval_accuracy": 0.515625, |
|
"eval_loss": 1.9578070640563965, |
|
"eval_runtime": 128.8964, |
|
"eval_samples_per_second": 10.924, |
|
"eval_steps_per_second": 0.171, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 62.00416666666667, |
|
"grad_norm": 8.409364700317383, |
|
"learning_rate": 2.6620370370370372e-05, |
|
"loss": 0.8569, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 62.00833333333333, |
|
"eval_accuracy": 0.5184659090909091, |
|
"eval_loss": 1.9410430192947388, |
|
"eval_runtime": 126.0956, |
|
"eval_samples_per_second": 11.166, |
|
"eval_steps_per_second": 0.174, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 63.001041666666666, |
|
"grad_norm": 6.331228256225586, |
|
"learning_rate": 2.633101851851852e-05, |
|
"loss": 1.1689, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 63.00625, |
|
"grad_norm": 5.7633161544799805, |
|
"learning_rate": 2.604166666666667e-05, |
|
"loss": 0.9847, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 63.00833333333333, |
|
"eval_accuracy": 0.5134943181818182, |
|
"eval_loss": 1.965501308441162, |
|
"eval_runtime": 134.8984, |
|
"eval_samples_per_second": 10.437, |
|
"eval_steps_per_second": 0.163, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 64.003125, |
|
"grad_norm": 6.778160572052002, |
|
"learning_rate": 2.5752314814814815e-05, |
|
"loss": 1.0541, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 64.00833333333334, |
|
"grad_norm": 15.068023681640625, |
|
"learning_rate": 2.5462962962962965e-05, |
|
"loss": 0.8992, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 64.00833333333334, |
|
"eval_accuracy": 0.5184659090909091, |
|
"eval_loss": 1.97407865524292, |
|
"eval_runtime": 131.1607, |
|
"eval_samples_per_second": 10.735, |
|
"eval_steps_per_second": 0.168, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 65.00520833333333, |
|
"grad_norm": 6.6322126388549805, |
|
"learning_rate": 2.517361111111111e-05, |
|
"loss": 0.9781, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 65.00833333333334, |
|
"eval_accuracy": 0.5248579545454546, |
|
"eval_loss": 1.9591399431228638, |
|
"eval_runtime": 130.8261, |
|
"eval_samples_per_second": 10.762, |
|
"eval_steps_per_second": 0.168, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 66.00208333333333, |
|
"grad_norm": 6.270596504211426, |
|
"learning_rate": 2.488425925925926e-05, |
|
"loss": 1.1274, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 66.00729166666666, |
|
"grad_norm": 5.216790676116943, |
|
"learning_rate": 2.4594907407407408e-05, |
|
"loss": 0.9016, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 66.00833333333334, |
|
"eval_accuracy": 0.5134943181818182, |
|
"eval_loss": 1.9802237749099731, |
|
"eval_runtime": 138.409, |
|
"eval_samples_per_second": 10.173, |
|
"eval_steps_per_second": 0.159, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 67.00416666666666, |
|
"grad_norm": 4.889580249786377, |
|
"learning_rate": 2.4305555555555558e-05, |
|
"loss": 0.9443, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 67.00833333333334, |
|
"eval_accuracy": 0.5035511363636364, |
|
"eval_loss": 1.9881755113601685, |
|
"eval_runtime": 132.577, |
|
"eval_samples_per_second": 10.62, |
|
"eval_steps_per_second": 0.166, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 68.00104166666667, |
|
"grad_norm": 8.338197708129883, |
|
"learning_rate": 2.4016203703703704e-05, |
|
"loss": 0.9719, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 68.00625, |
|
"grad_norm": 7.402223110198975, |
|
"learning_rate": 2.3726851851851854e-05, |
|
"loss": 0.9359, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 68.00833333333334, |
|
"eval_accuracy": 0.5092329545454546, |
|
"eval_loss": 2.0045785903930664, |
|
"eval_runtime": 122.9735, |
|
"eval_samples_per_second": 11.45, |
|
"eval_steps_per_second": 0.179, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 69.003125, |
|
"grad_norm": 5.299436092376709, |
|
"learning_rate": 2.34375e-05, |
|
"loss": 0.9554, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 69.00833333333334, |
|
"grad_norm": 22.75337028503418, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 0.7735, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 69.00833333333334, |
|
"eval_accuracy": 0.5063920454545454, |
|
"eval_loss": 2.017183542251587, |
|
"eval_runtime": 132.6432, |
|
"eval_samples_per_second": 10.615, |
|
"eval_steps_per_second": 0.166, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 70.00520833333333, |
|
"grad_norm": 6.869340896606445, |
|
"learning_rate": 2.2858796296296297e-05, |
|
"loss": 0.9405, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 70.00833333333334, |
|
"eval_accuracy": 0.5269886363636364, |
|
"eval_loss": 1.955207109451294, |
|
"eval_runtime": 135.5316, |
|
"eval_samples_per_second": 10.389, |
|
"eval_steps_per_second": 0.162, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 71.00208333333333, |
|
"grad_norm": 7.737886428833008, |
|
"learning_rate": 2.2569444444444447e-05, |
|
"loss": 0.9419, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 71.00729166666666, |
|
"grad_norm": 6.463024139404297, |
|
"learning_rate": 2.2280092592592593e-05, |
|
"loss": 0.9709, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 71.00833333333334, |
|
"eval_accuracy": 0.5227272727272727, |
|
"eval_loss": 1.9572663307189941, |
|
"eval_runtime": 134.3176, |
|
"eval_samples_per_second": 10.483, |
|
"eval_steps_per_second": 0.164, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 72.00416666666666, |
|
"grad_norm": 6.5754923820495605, |
|
"learning_rate": 2.1990740740740743e-05, |
|
"loss": 0.9914, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 72.00833333333334, |
|
"eval_accuracy": 0.5248579545454546, |
|
"eval_loss": 1.9767735004425049, |
|
"eval_runtime": 125.4669, |
|
"eval_samples_per_second": 11.222, |
|
"eval_steps_per_second": 0.175, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 73.00104166666667, |
|
"grad_norm": 6.831247329711914, |
|
"learning_rate": 2.170138888888889e-05, |
|
"loss": 0.8804, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 73.00625, |
|
"grad_norm": 7.8077392578125, |
|
"learning_rate": 2.141203703703704e-05, |
|
"loss": 0.8487, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 73.00833333333334, |
|
"eval_accuracy": 0.5326704545454546, |
|
"eval_loss": 1.9569602012634277, |
|
"eval_runtime": 134.6123, |
|
"eval_samples_per_second": 10.46, |
|
"eval_steps_per_second": 0.163, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 74.003125, |
|
"grad_norm": 5.786107063293457, |
|
"learning_rate": 2.1122685185185186e-05, |
|
"loss": 0.9529, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 74.00833333333334, |
|
"grad_norm": 21.245386123657227, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.835, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 74.00833333333334, |
|
"eval_accuracy": 0.5241477272727273, |
|
"eval_loss": 1.9758590459823608, |
|
"eval_runtime": 129.6528, |
|
"eval_samples_per_second": 10.86, |
|
"eval_steps_per_second": 0.17, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 75.00520833333333, |
|
"grad_norm": 5.697115421295166, |
|
"learning_rate": 2.0543981481481483e-05, |
|
"loss": 0.8914, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 75.00833333333334, |
|
"eval_accuracy": 0.5298295454545454, |
|
"eval_loss": 1.9309029579162598, |
|
"eval_runtime": 130.377, |
|
"eval_samples_per_second": 10.799, |
|
"eval_steps_per_second": 0.169, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 76.00208333333333, |
|
"grad_norm": 4.525904655456543, |
|
"learning_rate": 2.0254629629629632e-05, |
|
"loss": 0.9268, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 76.00729166666666, |
|
"grad_norm": 6.657559871673584, |
|
"learning_rate": 1.996527777777778e-05, |
|
"loss": 0.9242, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 76.00833333333334, |
|
"eval_accuracy": 0.5241477272727273, |
|
"eval_loss": 1.9594990015029907, |
|
"eval_runtime": 130.7531, |
|
"eval_samples_per_second": 10.768, |
|
"eval_steps_per_second": 0.168, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 77.00416666666666, |
|
"grad_norm": 6.962241172790527, |
|
"learning_rate": 1.967592592592593e-05, |
|
"loss": 0.8235, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 77.00833333333334, |
|
"eval_accuracy": 0.5276988636363636, |
|
"eval_loss": 1.9556376934051514, |
|
"eval_runtime": 124.6215, |
|
"eval_samples_per_second": 11.298, |
|
"eval_steps_per_second": 0.177, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 78.00104166666667, |
|
"grad_norm": 6.555630207061768, |
|
"learning_rate": 1.9386574074074075e-05, |
|
"loss": 1.1044, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 78.00625, |
|
"grad_norm": 7.0458455085754395, |
|
"learning_rate": 1.9097222222222222e-05, |
|
"loss": 0.8664, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 78.00833333333334, |
|
"eval_accuracy": 0.5134943181818182, |
|
"eval_loss": 1.978991985321045, |
|
"eval_runtime": 130.4341, |
|
"eval_samples_per_second": 10.795, |
|
"eval_steps_per_second": 0.169, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 79.003125, |
|
"grad_norm": 4.815821170806885, |
|
"learning_rate": 1.8807870370370372e-05, |
|
"loss": 0.7884, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 79.00833333333334, |
|
"grad_norm": 26.361722946166992, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.8699, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 79.00833333333334, |
|
"eval_accuracy": 0.5227272727272727, |
|
"eval_loss": 1.9835097789764404, |
|
"eval_runtime": 119.0563, |
|
"eval_samples_per_second": 11.826, |
|
"eval_steps_per_second": 0.185, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 80.00520833333333, |
|
"grad_norm": 4.6080474853515625, |
|
"learning_rate": 1.8229166666666668e-05, |
|
"loss": 0.9112, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 80.00833333333334, |
|
"eval_accuracy": 0.5291193181818182, |
|
"eval_loss": 1.9426430463790894, |
|
"eval_runtime": 137.1174, |
|
"eval_samples_per_second": 10.269, |
|
"eval_steps_per_second": 0.16, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 81.00208333333333, |
|
"grad_norm": 5.045470237731934, |
|
"learning_rate": 1.7939814814814815e-05, |
|
"loss": 0.8392, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 81.00729166666666, |
|
"grad_norm": 6.203034400939941, |
|
"learning_rate": 1.7650462962962965e-05, |
|
"loss": 0.7901, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 81.00833333333334, |
|
"eval_accuracy": 0.5255681818181818, |
|
"eval_loss": 1.959786295890808, |
|
"eval_runtime": 120.2871, |
|
"eval_samples_per_second": 11.705, |
|
"eval_steps_per_second": 0.183, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 82.00416666666666, |
|
"grad_norm": 6.214341163635254, |
|
"learning_rate": 1.736111111111111e-05, |
|
"loss": 0.8186, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 82.00833333333334, |
|
"eval_accuracy": 0.5319602272727273, |
|
"eval_loss": 1.9397464990615845, |
|
"eval_runtime": 130.0941, |
|
"eval_samples_per_second": 10.823, |
|
"eval_steps_per_second": 0.169, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 83.00104166666667, |
|
"grad_norm": 5.813119411468506, |
|
"learning_rate": 1.707175925925926e-05, |
|
"loss": 0.7438, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 83.00625, |
|
"grad_norm": 5.390239715576172, |
|
"learning_rate": 1.6782407407407408e-05, |
|
"loss": 0.8229, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 83.00833333333334, |
|
"eval_accuracy": 0.5326704545454546, |
|
"eval_loss": 1.938385009765625, |
|
"eval_runtime": 124.8818, |
|
"eval_samples_per_second": 11.275, |
|
"eval_steps_per_second": 0.176, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 84.003125, |
|
"grad_norm": 5.821295738220215, |
|
"learning_rate": 1.6493055555555557e-05, |
|
"loss": 1.0251, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 84.00833333333334, |
|
"grad_norm": 23.789480209350586, |
|
"learning_rate": 1.6203703703703704e-05, |
|
"loss": 0.9063, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 84.00833333333334, |
|
"eval_accuracy": 0.5291193181818182, |
|
"eval_loss": 1.9323524236679077, |
|
"eval_runtime": 122.6737, |
|
"eval_samples_per_second": 11.478, |
|
"eval_steps_per_second": 0.179, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 85.00520833333333, |
|
"grad_norm": 7.094452381134033, |
|
"learning_rate": 1.5914351851851854e-05, |
|
"loss": 0.8843, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 85.00833333333334, |
|
"eval_accuracy": 0.5369318181818182, |
|
"eval_loss": 1.9315829277038574, |
|
"eval_runtime": 124.9222, |
|
"eval_samples_per_second": 11.271, |
|
"eval_steps_per_second": 0.176, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 86.00208333333333, |
|
"grad_norm": 5.776772975921631, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 0.7684, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 86.00729166666666, |
|
"grad_norm": 8.940707206726074, |
|
"learning_rate": 1.533564814814815e-05, |
|
"loss": 0.7904, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 86.00833333333334, |
|
"eval_accuracy": 0.5376420454545454, |
|
"eval_loss": 1.926943302154541, |
|
"eval_runtime": 126.2416, |
|
"eval_samples_per_second": 11.153, |
|
"eval_steps_per_second": 0.174, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 87.00416666666666, |
|
"grad_norm": 6.900567054748535, |
|
"learning_rate": 1.5046296296296297e-05, |
|
"loss": 0.7942, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 87.00833333333334, |
|
"eval_accuracy": 0.5291193181818182, |
|
"eval_loss": 1.9505839347839355, |
|
"eval_runtime": 129.4495, |
|
"eval_samples_per_second": 10.877, |
|
"eval_steps_per_second": 0.17, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 88.00104166666667, |
|
"grad_norm": 6.561887741088867, |
|
"learning_rate": 1.4756944444444445e-05, |
|
"loss": 0.7708, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 88.00625, |
|
"grad_norm": 6.302940368652344, |
|
"learning_rate": 1.4467592592592593e-05, |
|
"loss": 0.8798, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 88.00833333333334, |
|
"eval_accuracy": 0.5404829545454546, |
|
"eval_loss": 1.9184972047805786, |
|
"eval_runtime": 136.5919, |
|
"eval_samples_per_second": 10.308, |
|
"eval_steps_per_second": 0.161, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 89.003125, |
|
"grad_norm": 6.015424728393555, |
|
"learning_rate": 1.4178240740740741e-05, |
|
"loss": 0.7788, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 89.00833333333334, |
|
"grad_norm": 9.586054801940918, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.7678, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 89.00833333333334, |
|
"eval_accuracy": 0.5326704545454546, |
|
"eval_loss": 1.9361698627471924, |
|
"eval_runtime": 128.5409, |
|
"eval_samples_per_second": 10.954, |
|
"eval_steps_per_second": 0.171, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 90.00520833333333, |
|
"grad_norm": 6.406522750854492, |
|
"learning_rate": 1.3599537037037038e-05, |
|
"loss": 0.7589, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 90.00833333333334, |
|
"eval_accuracy": 0.5276988636363636, |
|
"eval_loss": 1.9496175050735474, |
|
"eval_runtime": 128.3627, |
|
"eval_samples_per_second": 10.969, |
|
"eval_steps_per_second": 0.171, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 91.00208333333333, |
|
"grad_norm": 5.885864734649658, |
|
"learning_rate": 1.3310185185185186e-05, |
|
"loss": 0.7746, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 91.00729166666666, |
|
"grad_norm": 6.763460159301758, |
|
"learning_rate": 1.3020833333333334e-05, |
|
"loss": 0.6679, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 91.00833333333334, |
|
"eval_accuracy": 0.5298295454545454, |
|
"eval_loss": 1.9507235288619995, |
|
"eval_runtime": 133.1729, |
|
"eval_samples_per_second": 10.573, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 92.00416666666666, |
|
"grad_norm": 8.202223777770996, |
|
"learning_rate": 1.2731481481481482e-05, |
|
"loss": 0.8042, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 92.00833333333334, |
|
"eval_accuracy": 0.5369318181818182, |
|
"eval_loss": 1.950987696647644, |
|
"eval_runtime": 134.0073, |
|
"eval_samples_per_second": 10.507, |
|
"eval_steps_per_second": 0.164, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 93.00104166666667, |
|
"grad_norm": 6.163456439971924, |
|
"learning_rate": 1.244212962962963e-05, |
|
"loss": 0.8758, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 93.00625, |
|
"grad_norm": 8.015287399291992, |
|
"learning_rate": 1.2152777777777779e-05, |
|
"loss": 0.7722, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 93.00833333333334, |
|
"eval_accuracy": 0.5333806818181818, |
|
"eval_loss": 1.9502681493759155, |
|
"eval_runtime": 125.0273, |
|
"eval_samples_per_second": 11.262, |
|
"eval_steps_per_second": 0.176, |
|
"step": 1504 |
|
}, |
|
{ |
|
"epoch": 94.003125, |
|
"grad_norm": 5.918884754180908, |
|
"learning_rate": 1.1863425925925927e-05, |
|
"loss": 0.7579, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 94.00833333333334, |
|
"grad_norm": 18.762502670288086, |
|
"learning_rate": 1.1574074074074075e-05, |
|
"loss": 0.6831, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 94.00833333333334, |
|
"eval_accuracy": 0.5348011363636364, |
|
"eval_loss": 1.9531010389328003, |
|
"eval_runtime": 120.9603, |
|
"eval_samples_per_second": 11.64, |
|
"eval_steps_per_second": 0.182, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 95.00520833333333, |
|
"grad_norm": 7.426961421966553, |
|
"learning_rate": 1.1284722222222223e-05, |
|
"loss": 0.766, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 95.00833333333334, |
|
"eval_accuracy": 0.5383522727272727, |
|
"eval_loss": 1.9345489740371704, |
|
"eval_runtime": 129.3794, |
|
"eval_samples_per_second": 10.883, |
|
"eval_steps_per_second": 0.17, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 96.00208333333333, |
|
"grad_norm": 5.175162315368652, |
|
"learning_rate": 1.0995370370370372e-05, |
|
"loss": 0.7168, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 96.00729166666666, |
|
"grad_norm": 7.063689708709717, |
|
"learning_rate": 1.070601851851852e-05, |
|
"loss": 0.8099, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 96.00833333333334, |
|
"eval_accuracy": 0.5376420454545454, |
|
"eval_loss": 1.9349414110183716, |
|
"eval_runtime": 123.5725, |
|
"eval_samples_per_second": 11.394, |
|
"eval_steps_per_second": 0.178, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 97.00416666666666, |
|
"grad_norm": 6.602786064147949, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 0.7513, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 97.00833333333334, |
|
"eval_accuracy": 0.5461647727272727, |
|
"eval_loss": 1.9238044023513794, |
|
"eval_runtime": 139.032, |
|
"eval_samples_per_second": 10.127, |
|
"eval_steps_per_second": 0.158, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 98.00104166666667, |
|
"grad_norm": 5.149336338043213, |
|
"learning_rate": 1.0127314814814816e-05, |
|
"loss": 0.8236, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 98.00625, |
|
"grad_norm": 8.144251823425293, |
|
"learning_rate": 9.837962962962964e-06, |
|
"loss": 0.6561, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 98.00833333333334, |
|
"eval_accuracy": 0.5426136363636364, |
|
"eval_loss": 1.9337557554244995, |
|
"eval_runtime": 133.1537, |
|
"eval_samples_per_second": 10.574, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 99.003125, |
|
"grad_norm": 5.338318347930908, |
|
"learning_rate": 9.548611111111111e-06, |
|
"loss": 0.778, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 99.00833333333334, |
|
"grad_norm": 15.469060897827148, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.7423, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 99.00833333333334, |
|
"eval_accuracy": 0.5539772727272727, |
|
"eval_loss": 1.901895523071289, |
|
"eval_runtime": 131.3276, |
|
"eval_samples_per_second": 10.721, |
|
"eval_steps_per_second": 0.168, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 100.00520833333333, |
|
"grad_norm": 4.922194957733154, |
|
"learning_rate": 8.969907407407407e-06, |
|
"loss": 0.7739, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 100.00833333333334, |
|
"eval_accuracy": 0.5504261363636364, |
|
"eval_loss": 1.916474461555481, |
|
"eval_runtime": 124.6426, |
|
"eval_samples_per_second": 11.296, |
|
"eval_steps_per_second": 0.177, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 101.00208333333333, |
|
"grad_norm": 7.351630210876465, |
|
"learning_rate": 8.680555555555556e-06, |
|
"loss": 0.6743, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 101.00729166666666, |
|
"grad_norm": 5.356750965118408, |
|
"learning_rate": 8.391203703703704e-06, |
|
"loss": 0.6562, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 101.00833333333334, |
|
"eval_accuracy": 0.5433238636363636, |
|
"eval_loss": 1.9271347522735596, |
|
"eval_runtime": 121.1594, |
|
"eval_samples_per_second": 11.621, |
|
"eval_steps_per_second": 0.182, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 102.00416666666666, |
|
"grad_norm": 6.858729839324951, |
|
"learning_rate": 8.101851851851852e-06, |
|
"loss": 0.7182, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 102.00833333333334, |
|
"eval_accuracy": 0.5440340909090909, |
|
"eval_loss": 1.9096194505691528, |
|
"eval_runtime": 133.4895, |
|
"eval_samples_per_second": 10.548, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 103.00104166666667, |
|
"grad_norm": 6.165463924407959, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.75, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 103.00625, |
|
"grad_norm": 7.791365623474121, |
|
"learning_rate": 7.523148148148148e-06, |
|
"loss": 0.6898, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 103.00833333333334, |
|
"eval_accuracy": 0.5482954545454546, |
|
"eval_loss": 1.9213480949401855, |
|
"eval_runtime": 121.4191, |
|
"eval_samples_per_second": 11.596, |
|
"eval_steps_per_second": 0.181, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 104.003125, |
|
"grad_norm": 7.106443405151367, |
|
"learning_rate": 7.2337962962962966e-06, |
|
"loss": 0.8279, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 104.00833333333334, |
|
"grad_norm": 11.49569320678711, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 0.6541, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 104.00833333333334, |
|
"eval_accuracy": 0.5433238636363636, |
|
"eval_loss": 1.926275372505188, |
|
"eval_runtime": 128.2374, |
|
"eval_samples_per_second": 10.98, |
|
"eval_steps_per_second": 0.172, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 105.00520833333333, |
|
"grad_norm": 5.9457316398620605, |
|
"learning_rate": 6.655092592592593e-06, |
|
"loss": 0.7131, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 105.00833333333334, |
|
"eval_accuracy": 0.546875, |
|
"eval_loss": 1.9148098230361938, |
|
"eval_runtime": 128.976, |
|
"eval_samples_per_second": 10.917, |
|
"eval_steps_per_second": 0.171, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 106.00208333333333, |
|
"grad_norm": 7.0640716552734375, |
|
"learning_rate": 6.365740740740741e-06, |
|
"loss": 0.7, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 106.00729166666666, |
|
"grad_norm": 5.8036699295043945, |
|
"learning_rate": 6.076388888888889e-06, |
|
"loss": 0.7076, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 106.00833333333334, |
|
"eval_accuracy": 0.5454545454545454, |
|
"eval_loss": 1.9192754030227661, |
|
"eval_runtime": 128.1384, |
|
"eval_samples_per_second": 10.988, |
|
"eval_steps_per_second": 0.172, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 107.00416666666666, |
|
"grad_norm": 6.425831317901611, |
|
"learning_rate": 5.787037037037038e-06, |
|
"loss": 0.7822, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 107.00833333333334, |
|
"eval_accuracy": 0.5440340909090909, |
|
"eval_loss": 1.916624903678894, |
|
"eval_runtime": 128.0714, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 0.172, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 108.00104166666667, |
|
"grad_norm": 5.592070579528809, |
|
"learning_rate": 5.497685185185186e-06, |
|
"loss": 0.7629, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 108.00625, |
|
"grad_norm": 6.38240909576416, |
|
"learning_rate": 5.208333333333334e-06, |
|
"loss": 0.6955, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 108.00833333333334, |
|
"eval_accuracy": 0.5490056818181818, |
|
"eval_loss": 1.9166675806045532, |
|
"eval_runtime": 133.0053, |
|
"eval_samples_per_second": 10.586, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 109.003125, |
|
"grad_norm": 4.68496036529541, |
|
"learning_rate": 4.918981481481482e-06, |
|
"loss": 0.8267, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 109.00833333333334, |
|
"grad_norm": 22.761470794677734, |
|
"learning_rate": 4.6296296296296296e-06, |
|
"loss": 0.6939, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 109.00833333333334, |
|
"eval_accuracy": 0.5426136363636364, |
|
"eval_loss": 1.9129440784454346, |
|
"eval_runtime": 123.3598, |
|
"eval_samples_per_second": 11.414, |
|
"eval_steps_per_second": 0.178, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 110.00520833333333, |
|
"grad_norm": 6.633167266845703, |
|
"learning_rate": 4.340277777777778e-06, |
|
"loss": 0.7149, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 110.00833333333334, |
|
"eval_accuracy": 0.5355113636363636, |
|
"eval_loss": 1.9237289428710938, |
|
"eval_runtime": 133.1436, |
|
"eval_samples_per_second": 10.575, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 111.00208333333333, |
|
"grad_norm": 6.027252674102783, |
|
"learning_rate": 4.050925925925926e-06, |
|
"loss": 0.7654, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 111.00729166666666, |
|
"grad_norm": 7.40338134765625, |
|
"learning_rate": 3.761574074074074e-06, |
|
"loss": 0.7341, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 111.00833333333334, |
|
"eval_accuracy": 0.5433238636363636, |
|
"eval_loss": 1.9047017097473145, |
|
"eval_runtime": 133.6961, |
|
"eval_samples_per_second": 10.531, |
|
"eval_steps_per_second": 0.165, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 112.00416666666666, |
|
"grad_norm": 6.874715328216553, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 0.7101, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 112.00833333333334, |
|
"eval_accuracy": 0.5433238636363636, |
|
"eval_loss": 1.9010353088378906, |
|
"eval_runtime": 130.9035, |
|
"eval_samples_per_second": 10.756, |
|
"eval_steps_per_second": 0.168, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 113.00104166666667, |
|
"grad_norm": 6.984443664550781, |
|
"learning_rate": 3.1828703703703706e-06, |
|
"loss": 0.6031, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 113.00625, |
|
"grad_norm": 7.135616302490234, |
|
"learning_rate": 2.893518518518519e-06, |
|
"loss": 0.764, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 113.00833333333334, |
|
"eval_accuracy": 0.5454545454545454, |
|
"eval_loss": 1.9023563861846924, |
|
"eval_runtime": 128.9627, |
|
"eval_samples_per_second": 10.918, |
|
"eval_steps_per_second": 0.171, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 114.003125, |
|
"grad_norm": 5.495314121246338, |
|
"learning_rate": 2.604166666666667e-06, |
|
"loss": 0.7183, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 114.00833333333334, |
|
"grad_norm": 25.88104248046875, |
|
"learning_rate": 2.3148148148148148e-06, |
|
"loss": 0.667, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 114.00833333333334, |
|
"eval_accuracy": 0.5475852272727273, |
|
"eval_loss": 1.9040805101394653, |
|
"eval_runtime": 132.319, |
|
"eval_samples_per_second": 10.641, |
|
"eval_steps_per_second": 0.166, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 115.00520833333333, |
|
"grad_norm": 7.5589280128479, |
|
"learning_rate": 2.025462962962963e-06, |
|
"loss": 0.7465, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 115.00833333333334, |
|
"eval_accuracy": 0.5482954545454546, |
|
"eval_loss": 1.9005664587020874, |
|
"eval_runtime": 129.3051, |
|
"eval_samples_per_second": 10.889, |
|
"eval_steps_per_second": 0.17, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 116.00208333333333, |
|
"grad_norm": 8.857230186462402, |
|
"learning_rate": 1.7361111111111112e-06, |
|
"loss": 0.5929, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 116.00729166666666, |
|
"grad_norm": 5.749704837799072, |
|
"learning_rate": 1.4467592592592594e-06, |
|
"loss": 0.6935, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 116.00833333333334, |
|
"eval_accuracy": 0.5461647727272727, |
|
"eval_loss": 1.901587724685669, |
|
"eval_runtime": 135.6846, |
|
"eval_samples_per_second": 10.377, |
|
"eval_steps_per_second": 0.162, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 117.00416666666666, |
|
"grad_norm": 6.5324883460998535, |
|
"learning_rate": 1.1574074074074074e-06, |
|
"loss": 0.7306, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 117.00833333333334, |
|
"eval_accuracy": 0.5482954545454546, |
|
"eval_loss": 1.9008712768554688, |
|
"eval_runtime": 123.3184, |
|
"eval_samples_per_second": 11.418, |
|
"eval_steps_per_second": 0.178, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 118.00104166666667, |
|
"grad_norm": 6.570457458496094, |
|
"learning_rate": 8.680555555555556e-07, |
|
"loss": 0.6663, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 118.00625, |
|
"grad_norm": 4.956265449523926, |
|
"learning_rate": 5.787037037037037e-07, |
|
"loss": 0.6578, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 118.00833333333334, |
|
"eval_accuracy": 0.5482954545454546, |
|
"eval_loss": 1.9007748365402222, |
|
"eval_runtime": 136.6271, |
|
"eval_samples_per_second": 10.305, |
|
"eval_steps_per_second": 0.161, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 119.003125, |
|
"grad_norm": 7.002548694610596, |
|
"learning_rate": 2.8935185185185185e-07, |
|
"loss": 0.6235, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 119.00833333333334, |
|
"grad_norm": 2.7440524101257324, |
|
"learning_rate": 0.0, |
|
"loss": 0.6427, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 119.00833333333334, |
|
"eval_accuracy": 0.5504261363636364, |
|
"eval_loss": 1.9013855457305908, |
|
"eval_runtime": 128.5584, |
|
"eval_samples_per_second": 10.952, |
|
"eval_steps_per_second": 0.171, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 119.00833333333334, |
|
"step": 1920, |
|
"total_flos": 1.4452753827235627e+20, |
|
"train_loss": 1.5160956154266994, |
|
"train_runtime": 26874.3406, |
|
"train_samples_per_second": 4.572, |
|
"train_steps_per_second": 0.071 |
|
}, |
|
{ |
|
"epoch": 119.00833333333334, |
|
"eval_accuracy": 0.5539772727272727, |
|
"eval_loss": 1.901895523071289, |
|
"eval_runtime": 137.4484, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 0.16, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 119.00833333333334, |
|
"eval_accuracy": 0.5539772727272727, |
|
"eval_loss": 1.9018956422805786, |
|
"eval_runtime": 133.7632, |
|
"eval_samples_per_second": 10.526, |
|
"eval_steps_per_second": 0.164, |
|
"step": 1920 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1920, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4452753827235627e+20, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|