{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 3750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.0283896923065186, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 1.5494, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.980957508087158, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 1.5781, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.3325070142745972, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 1.271, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1161530017852783, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 1.1263, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.9847109317779541, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.4759, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.3176578283309937, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 1.1906, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.9614243507385254, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 1.3136, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0635404586791992, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 1.159, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.4873822927474976, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 1.0645, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.666663646697998, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.3174, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9259383678436279, |
|
"learning_rate": 5.8666666666666675e-06, |
|
"loss": 1.0041, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.422500729560852, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 1.1208, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.513822317123413, |
|
"learning_rate": 6.9333333333333344e-06, |
|
"loss": 0.9806, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.26381254196167, |
|
"learning_rate": 7.4666666666666675e-06, |
|
"loss": 0.9904, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.899075984954834, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.9365, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7061178088188171, |
|
"learning_rate": 8.533333333333335e-06, |
|
"loss": 0.8843, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0236766338348389, |
|
"learning_rate": 9.066666666666667e-06, |
|
"loss": 0.9131, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.3964245319366455, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.9142, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.7908815145492554, |
|
"learning_rate": 1.0133333333333335e-05, |
|
"loss": 0.8973, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9264830946922302, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.8407, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.5779850482940674, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.8228, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.6980341672897339, |
|
"learning_rate": 1.1733333333333335e-05, |
|
"loss": 0.799, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.9194531440734863, |
|
"learning_rate": 1.2266666666666667e-05, |
|
"loss": 0.7759, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.0350003242492676, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.7466, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.474932909011841, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.7317, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.7944034337997437, |
|
"learning_rate": 1.3866666666666669e-05, |
|
"loss": 0.7046, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.945058584213257, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.6625, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.820989966392517, |
|
"learning_rate": 1.4933333333333335e-05, |
|
"loss": 0.7307, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.9544613361358643, |
|
"learning_rate": 1.546666666666667e-05, |
|
"loss": 0.7996, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.9499707221984863, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.706, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 5.53220796585083, |
|
"learning_rate": 1.6533333333333333e-05, |
|
"loss": 0.6941, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.7142622470855713, |
|
"learning_rate": 1.706666666666667e-05, |
|
"loss": 0.746, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 4.010003089904785, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.7262, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 2.3094098567962646, |
|
"learning_rate": 1.8133333333333335e-05, |
|
"loss": 0.6552, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 5.371938228607178, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.677, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.662387728691101, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.6551, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.2856159210205078, |
|
"learning_rate": 1.9733333333333336e-05, |
|
"loss": 0.7076, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.2962098121643066, |
|
"learning_rate": 1.999989169177959e-05, |
|
"loss": 0.6818, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.3778997659683228, |
|
"learning_rate": 1.9999025240093045e-05, |
|
"loss": 0.6823, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.4772292375564575, |
|
"learning_rate": 1.999729241179462e-05, |
|
"loss": 0.7704, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 1.13938307762146, |
|
"learning_rate": 1.999469335702714e-05, |
|
"loss": 0.6668, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 2.4890644550323486, |
|
"learning_rate": 1.9991228300988586e-05, |
|
"loss": 0.6448, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.2066543102264404, |
|
"learning_rate": 1.998689754391257e-05, |
|
"loss": 0.7159, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.4606579542160034, |
|
"learning_rate": 1.998170146104234e-05, |
|
"loss": 0.6443, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 5.692836284637451, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.7253, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 1.7549424171447754, |
|
"learning_rate": 1.9968715193738738e-05, |
|
"loss": 0.6349, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.5602545738220215, |
|
"learning_rate": 1.9960926134514875e-05, |
|
"loss": 0.6793, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.393797755241394, |
|
"learning_rate": 1.9952273999818312e-05, |
|
"loss": 0.6686, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.6151896119117737, |
|
"learning_rate": 1.9942759539322845e-05, |
|
"loss": 0.6584, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.0221006870269775, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 0.6771, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.0078063011169434, |
|
"learning_rate": 1.9921147013144782e-05, |
|
"loss": 0.6664, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.788282871246338, |
|
"learning_rate": 1.990905082010344e-05, |
|
"loss": 0.6243, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.064715623855591, |
|
"learning_rate": 1.9896096046383456e-05, |
|
"loss": 0.6253, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.5293374061584473, |
|
"learning_rate": 1.988228381446553e-05, |
|
"loss": 0.6362, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.461493730545044, |
|
"learning_rate": 1.9867615321125796e-05, |
|
"loss": 0.6517, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.1433868408203125, |
|
"learning_rate": 1.985209183733209e-05, |
|
"loss": 0.6849, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 1.6532901525497437, |
|
"learning_rate": 1.983571470813386e-05, |
|
"loss": 0.6298, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 3.705383539199829, |
|
"learning_rate": 1.9818485352545595e-05, |
|
"loss": 0.6588, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.4615492820739746, |
|
"learning_rate": 1.980040526342388e-05, |
|
"loss": 0.6154, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.8189066052436829, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.6393, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.1122651100158691, |
|
"learning_rate": 1.9761699224434476e-05, |
|
"loss": 0.6245, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.4684017896652222, |
|
"learning_rate": 1.9741076628294387e-05, |
|
"loss": 0.6592, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.9914065599441528, |
|
"learning_rate": 1.9719610005785466e-05, |
|
"loss": 0.6262, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.7366482019424438, |
|
"learning_rate": 1.969730121690698e-05, |
|
"loss": 0.672, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.544377326965332, |
|
"learning_rate": 1.967415219462864e-05, |
|
"loss": 0.6057, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.9553754329681396, |
|
"learning_rate": 1.9650164944723116e-05, |
|
"loss": 0.6142, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.1661672592163086, |
|
"learning_rate": 1.9625341545592226e-05, |
|
"loss": 0.6238, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 3.7167468070983887, |
|
"learning_rate": 1.9599684148086876e-05, |
|
"loss": 0.7166, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.688824415206909, |
|
"learning_rate": 1.9573194975320672e-05, |
|
"loss": 0.6769, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 4.10930061340332, |
|
"learning_rate": 1.954587632247732e-05, |
|
"loss": 0.6199, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.5201390981674194, |
|
"learning_rate": 1.951773055661174e-05, |
|
"loss": 0.6242, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 3.6892731189727783, |
|
"learning_rate": 1.9488760116444966e-05, |
|
"loss": 0.6245, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.8859150409698486, |
|
"learning_rate": 1.9458967512152872e-05, |
|
"loss": 0.628, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.4320142269134521, |
|
"learning_rate": 1.9428355325148632e-05, |
|
"loss": 0.5806, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.4816261529922485, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.5818, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.1367580890655518, |
|
"learning_rate": 1.9364682883494892e-05, |
|
"loss": 0.6387, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.2321407794952393, |
|
"learning_rate": 1.9331628145814587e-05, |
|
"loss": 0.6207, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 3.410268783569336, |
|
"learning_rate": 1.9297764858882516e-05, |
|
"loss": 0.5868, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 3.323219060897827, |
|
"learning_rate": 1.926309595682066e-05, |
|
"loss": 0.5444, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 2.397799015045166, |
|
"learning_rate": 1.9227624443554425e-05, |
|
"loss": 0.5891, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 9.090506553649902, |
|
"learning_rate": 1.9191353392552346e-05, |
|
"loss": 0.5453, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 1.3556101322174072, |
|
"learning_rate": 1.9154285946559792e-05, |
|
"loss": 0.6406, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.056227684020996, |
|
"learning_rate": 1.911642531732666e-05, |
|
"loss": 0.5613, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.2134612798690796, |
|
"learning_rate": 1.907777478532909e-05, |
|
"loss": 0.6439, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.153582811355591, |
|
"learning_rate": 1.9038337699485207e-05, |
|
"loss": 0.6268, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.4763509035110474, |
|
"learning_rate": 1.8998117476864984e-05, |
|
"loss": 0.6358, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 2.120673656463623, |
|
"learning_rate": 1.895711760239413e-05, |
|
"loss": 0.5479, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 3.2643983364105225, |
|
"learning_rate": 1.8915341628552166e-05, |
|
"loss": 0.5908, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.7468228340148926, |
|
"learning_rate": 1.8872793175064594e-05, |
|
"loss": 0.6167, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.6314669847488403, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.605, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.2006853818893433, |
|
"learning_rate": 1.8785393642396976e-05, |
|
"loss": 0.6374, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 5.40316915512085, |
|
"learning_rate": 1.8740550136046195e-05, |
|
"loss": 0.5919, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 1.0310533046722412, |
|
"learning_rate": 1.869494929505219e-05, |
|
"loss": 0.596, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 3.058582067489624, |
|
"learning_rate": 1.8653264281300622e-05, |
|
"loss": 0.5709, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.8612210750579834, |
|
"learning_rate": 1.8606235443821602e-05, |
|
"loss": 0.6734, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.0079221725463867, |
|
"learning_rate": 1.8558460909544564e-05, |
|
"loss": 0.5874, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.8223471641540527, |
|
"learning_rate": 1.850994481794692e-05, |
|
"loss": 0.6199, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 2.1497292518615723, |
|
"learning_rate": 1.846069137275914e-05, |
|
"loss": 0.5527, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.5918537378311157, |
|
"learning_rate": 1.8410704841600506e-05, |
|
"loss": 0.5998, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 4.21558952331543, |
|
"learning_rate": 1.8359989555609355e-05, |
|
"loss": 0.6357, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 2.512099504470825, |
|
"learning_rate": 1.830854990906779e-05, |
|
"loss": 0.6116, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.5760135650634766, |
|
"learning_rate": 1.825639035902093e-05, |
|
"loss": 0.54, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.5788276195526123, |
|
"learning_rate": 1.8203515424890738e-05, |
|
"loss": 0.6258, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.5123496055603027, |
|
"learning_rate": 1.814992968808442e-05, |
|
"loss": 0.5147, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 1.539919376373291, |
|
"learning_rate": 1.809563779159746e-05, |
|
"loss": 0.569, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.6823704242706299, |
|
"learning_rate": 1.8040644439611348e-05, |
|
"loss": 0.5588, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 1.1113232374191284, |
|
"learning_rate": 1.798495439708594e-05, |
|
"loss": 0.5692, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 1.6908786296844482, |
|
"learning_rate": 1.792857248934663e-05, |
|
"loss": 0.6102, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 1.7746518850326538, |
|
"learning_rate": 1.7871503601666233e-05, |
|
"loss": 0.5706, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.2888718843460083, |
|
"learning_rate": 1.7813752678841702e-05, |
|
"loss": 0.5964, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 2.7955427169799805, |
|
"learning_rate": 1.7755324724765688e-05, |
|
"loss": 0.6055, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 1.4672576189041138, |
|
"learning_rate": 1.7696224801992947e-05, |
|
"loss": 0.5548, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 2.2973852157592773, |
|
"learning_rate": 1.7636458031301725e-05, |
|
"loss": 0.5967, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 2.6734001636505127, |
|
"learning_rate": 1.7576029591250036e-05, |
|
"loss": 0.5567, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 2.127830743789673, |
|
"learning_rate": 1.7514944717726962e-05, |
|
"loss": 0.6065, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 2.201108455657959, |
|
"learning_rate": 1.7453208703499006e-05, |
|
"loss": 0.566, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 3.8374786376953125, |
|
"learning_rate": 1.739082689775146e-05, |
|
"loss": 0.55, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 2.7282190322875977, |
|
"learning_rate": 1.732780470562496e-05, |
|
"loss": 0.5748, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.8128880262374878, |
|
"learning_rate": 1.7264147587747097e-05, |
|
"loss": 0.6309, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 2.522096633911133, |
|
"learning_rate": 1.7199861059759338e-05, |
|
"loss": 0.5504, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 1.186700463294983, |
|
"learning_rate": 1.7134950691839063e-05, |
|
"loss": 0.5741, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 4.312258720397949, |
|
"learning_rate": 1.7069422108216973e-05, |
|
"loss": 0.575, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 3.402963399887085, |
|
"learning_rate": 1.7003280986689733e-05, |
|
"loss": 0.5842, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 2.346266269683838, |
|
"learning_rate": 1.693653305812805e-05, |
|
"loss": 0.5877, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.304891586303711, |
|
"learning_rate": 1.686918410598009e-05, |
|
"loss": 0.6153, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 2.874284505844116, |
|
"learning_rate": 1.6801239965770366e-05, |
|
"loss": 0.5776, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 3.4333293437957764, |
|
"learning_rate": 1.6732706524594138e-05, |
|
"loss": 0.5099, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 1.4809489250183105, |
|
"learning_rate": 1.6663589720607287e-05, |
|
"loss": 0.5535, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 3.005042314529419, |
|
"learning_rate": 1.659389554251181e-05, |
|
"loss": 0.561, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 1.8546274900436401, |
|
"learning_rate": 1.652363002903693e-05, |
|
"loss": 0.555, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 10.558731079101562, |
|
"learning_rate": 1.6452799268415857e-05, |
|
"loss": 0.5333, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 2.2776925563812256, |
|
"learning_rate": 1.6381409397858257e-05, |
|
"loss": 0.5941, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 2.6703994274139404, |
|
"learning_rate": 1.6309466603018497e-05, |
|
"loss": 0.5676, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 3.8042726516723633, |
|
"learning_rate": 1.6236977117459693e-05, |
|
"loss": 0.5609, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 1.2701845169067383, |
|
"learning_rate": 1.616394722211357e-05, |
|
"loss": 0.5702, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 1.392269253730774, |
|
"learning_rate": 1.6090383244736256e-05, |
|
"loss": 0.5388, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 2.634445905685425, |
|
"learning_rate": 1.6016291559360023e-05, |
|
"loss": 0.573, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 3.3800487518310547, |
|
"learning_rate": 1.5941678585740976e-05, |
|
"loss": 0.5522, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 2.935079336166382, |
|
"learning_rate": 1.5866550788802815e-05, |
|
"loss": 0.5615, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.9093431234359741, |
|
"learning_rate": 1.579091467807668e-05, |
|
"loss": 0.5537, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 2.825533628463745, |
|
"learning_rate": 1.5714776807137128e-05, |
|
"loss": 0.6197, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 3.172933578491211, |
|
"learning_rate": 1.5638143773034268e-05, |
|
"loss": 0.5415, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.2932432889938354, |
|
"learning_rate": 1.556102221572219e-05, |
|
"loss": 0.5792, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 2.1744470596313477, |
|
"learning_rate": 1.5483418817483607e-05, |
|
"loss": 0.52, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 2.4148924350738525, |
|
"learning_rate": 1.540534030235087e-05, |
|
"loss": 0.602, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 2.427771806716919, |
|
"learning_rate": 1.5326793435523374e-05, |
|
"loss": 0.515, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 1.566942811012268, |
|
"learning_rate": 1.5247785022781343e-05, |
|
"loss": 0.5795, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 1.7555649280548096, |
|
"learning_rate": 1.5168321909896171e-05, |
|
"loss": 0.5819, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 1.6367748975753784, |
|
"learning_rate": 1.5088410982037251e-05, |
|
"loss": 0.5244, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.9994490146636963, |
|
"learning_rate": 1.50080591631754e-05, |
|
"loss": 0.5823, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 6.0024261474609375, |
|
"learning_rate": 1.4927273415482916e-05, |
|
"loss": 0.5641, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 1.50034499168396, |
|
"learning_rate": 1.484606073873035e-05, |
|
"loss": 0.5325, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 12.008216857910156, |
|
"learning_rate": 1.4764428169679987e-05, |
|
"loss": 0.5384, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 1.6358847618103027, |
|
"learning_rate": 1.4682382781476146e-05, |
|
"loss": 0.5822, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 2.520883321762085, |
|
"learning_rate": 1.4599931683032327e-05, |
|
"loss": 0.5256, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 1.136460304260254, |
|
"learning_rate": 1.4517082018415231e-05, |
|
"loss": 0.5589, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 2.730435848236084, |
|
"learning_rate": 1.4433840966225772e-05, |
|
"loss": 0.4939, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 0.9575507044792175, |
|
"learning_rate": 1.4350215738977077e-05, |
|
"loss": 0.5277, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 1.9613964557647705, |
|
"learning_rate": 1.4266213582469543e-05, |
|
"loss": 0.5457, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 2.436429500579834, |
|
"learning_rate": 1.4181841775163014e-05, |
|
"loss": 0.547, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 2.1270251274108887, |
|
"learning_rate": 1.409710762754615e-05, |
|
"loss": 0.5422, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 2.4047000408172607, |
|
"learning_rate": 1.4012018481502975e-05, |
|
"loss": 0.5219, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 1.1183472871780396, |
|
"learning_rate": 1.3926581709676752e-05, |
|
"loss": 0.5088, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.5600000000000005, |
|
"grad_norm": 1.89410400390625, |
|
"learning_rate": 1.3840804714831164e-05, |
|
"loss": 0.5707, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 1.2478140592575073, |
|
"learning_rate": 1.3754694929208891e-05, |
|
"loss": 0.5893, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 4.104971885681152, |
|
"learning_rate": 1.3668259813887644e-05, |
|
"loss": 0.5768, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 1.4798212051391602, |
|
"learning_rate": 1.3581506858133677e-05, |
|
"loss": 0.5223, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 3.716698408126831, |
|
"learning_rate": 1.3494443578752893e-05, |
|
"loss": 0.5208, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 2.887411117553711, |
|
"learning_rate": 1.340707751943952e-05, |
|
"loss": 0.5415, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 3.6842939853668213, |
|
"learning_rate": 1.3319416250122484e-05, |
|
"loss": 0.5272, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"grad_norm": 4.174267292022705, |
|
"learning_rate": 1.3231467366309523e-05, |
|
"loss": 0.5341, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 4.511580467224121, |
|
"learning_rate": 1.3143238488429042e-05, |
|
"loss": 0.573, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 1.930474042892456, |
|
"learning_rate": 1.3054737261169838e-05, |
|
"loss": 0.5134, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 1.2770944833755493, |
|
"learning_rate": 1.2965971352818736e-05, |
|
"loss": 0.4917, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 2.182473659515381, |
|
"learning_rate": 1.287694845459613e-05, |
|
"loss": 0.5725, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 4.948633670806885, |
|
"learning_rate": 1.2787676279989594e-05, |
|
"loss": 0.5122, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 2.5241198539733887, |
|
"learning_rate": 1.2698162564085536e-05, |
|
"loss": 0.4839, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 1.8158693313598633, |
|
"learning_rate": 1.2608415062898971e-05, |
|
"loss": 0.4502, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 3.540010929107666, |
|
"learning_rate": 1.2518441552701493e-05, |
|
"loss": 0.5585, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 1.9738972187042236, |
|
"learning_rate": 1.2428249829347509e-05, |
|
"loss": 0.4918, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 1.0376015901565552, |
|
"learning_rate": 1.2337847707598738e-05, |
|
"loss": 0.4989, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 2.5017688274383545, |
|
"learning_rate": 1.2247243020447104e-05, |
|
"loss": 0.4962, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 1.2260102033615112, |
|
"learning_rate": 1.2156443618436033e-05, |
|
"loss": 0.5316, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 2.4500176906585693, |
|
"learning_rate": 1.2065457368980236e-05, |
|
"loss": 0.4841, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 3.0221171379089355, |
|
"learning_rate": 1.197429215568403e-05, |
|
"loss": 0.5507, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 5.26624059677124, |
|
"learning_rate": 1.1882955877658252e-05, |
|
"loss": 0.532, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 2.411428213119507, |
|
"learning_rate": 1.1791456448835825e-05, |
|
"loss": 0.4802, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 2.334620952606201, |
|
"learning_rate": 1.169980179728606e-05, |
|
"loss": 0.5331, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.5600000000000005, |
|
"grad_norm": 2.4401047229766846, |
|
"learning_rate": 1.1607999864527718e-05, |
|
"loss": 0.4994, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 2.3867135047912598, |
|
"learning_rate": 1.1516058604840891e-05, |
|
"loss": 0.5124, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 2.3309555053710938, |
|
"learning_rate": 1.1423985984577813e-05, |
|
"loss": 0.574, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"grad_norm": 1.1885383129119873, |
|
"learning_rate": 1.1331789981472603e-05, |
|
"loss": 0.5361, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 1.6586416959762573, |
|
"learning_rate": 1.1239478583950019e-05, |
|
"loss": 0.5388, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 1.3869335651397705, |
|
"learning_rate": 1.1147059790433296e-05, |
|
"loss": 0.536, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 1.5383076667785645, |
|
"learning_rate": 1.1054541608651121e-05, |
|
"loss": 0.5165, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 1.1627497673034668, |
|
"learning_rate": 1.0961932054943778e-05, |
|
"loss": 0.5369, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 1.4803476333618164, |
|
"learning_rate": 1.0869239153568575e-05, |
|
"loss": 0.548, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 1.503915786743164, |
|
"learning_rate": 1.0776470936004572e-05, |
|
"loss": 0.5377, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 1.9053574800491333, |
|
"learning_rate": 1.0683635440256689e-05, |
|
"loss": 0.5249, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 2.171719551086426, |
|
"learning_rate": 1.059074071015923e-05, |
|
"loss": 0.5162, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 3.7397103309631348, |
|
"learning_rate": 1.0497794794678923e-05, |
|
"loss": 0.5067, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 3.237569570541382, |
|
"learning_rate": 1.0404805747217525e-05, |
|
"loss": 0.4901, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 2.6131529808044434, |
|
"learning_rate": 1.0311781624914e-05, |
|
"loss": 0.4834, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"grad_norm": 2.543020009994507, |
|
"learning_rate": 1.0228036587536431e-05, |
|
"loss": 0.4991, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 2.420510768890381, |
|
"learning_rate": 1.013496803077246e-05, |
|
"loss": 0.5326, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"grad_norm": 1.7979626655578613, |
|
"learning_rate": 1.0041887779554041e-05, |
|
"loss": 0.501, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 3.0351650714874268, |
|
"learning_rate": 9.948803898922586e-06, |
|
"loss": 0.5263, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 2.1602799892425537, |
|
"learning_rate": 9.85572445423399e-06, |
|
"loss": 0.505, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 2.298388957977295, |
|
"learning_rate": 9.762657510459784e-06, |
|
"loss": 0.4962, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 1.9878581762313843, |
|
"learning_rate": 9.669611131488346e-06, |
|
"loss": 0.5086, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 3.1122074127197266, |
|
"learning_rate": 9.576593379426196e-06, |
|
"loss": 0.5105, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"grad_norm": 1.8491990566253662, |
|
"learning_rate": 9.483612313899436e-06, |
|
"loss": 0.5028, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 2.2476413249969482, |
|
"learning_rate": 9.390675991355435e-06, |
|
"loss": 0.5273, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 8.56, |
|
"grad_norm": 3.6653342247009277, |
|
"learning_rate": 9.297792464364748e-06, |
|
"loss": 0.4313, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.8962536454200745, |
|
"learning_rate": 9.204969780923404e-06, |
|
"loss": 0.5045, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 6.7541823387146, |
|
"learning_rate": 9.112215983755573e-06, |
|
"loss": 0.4818, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 3.15523362159729, |
|
"learning_rate": 9.019539109616694e-06, |
|
"loss": 0.4779, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 1.048300862312317, |
|
"learning_rate": 8.926947188597133e-06, |
|
"loss": 0.4815, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 4.415710926055908, |
|
"learning_rate": 8.8344482434264e-06, |
|
"loss": 0.5259, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 4.474966049194336, |
|
"learning_rate": 8.742050288778e-06, |
|
"loss": 0.5378, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 3.487746477127075, |
|
"learning_rate": 8.649761330575009e-06, |
|
"loss": 0.5144, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 1.44117271900177, |
|
"learning_rate": 8.557589365296385e-06, |
|
"loss": 0.5383, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 2.9286913871765137, |
|
"learning_rate": 8.4655423792841e-06, |
|
"loss": 0.4653, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 1.4172818660736084, |
|
"learning_rate": 8.373628348051165e-06, |
|
"loss": 0.4868, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 1.9049030542373657, |
|
"learning_rate": 8.281855235590574e-06, |
|
"loss": 0.5606, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 3.6202874183654785, |
|
"learning_rate": 8.19023099368526e-06, |
|
"loss": 0.4717, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 1.4381736516952515, |
|
"learning_rate": 8.098763561219101e-06, |
|
"loss": 0.4578, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 3.8551642894744873, |
|
"learning_rate": 8.007460863489042e-06, |
|
"loss": 0.4553, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"grad_norm": 2.2333943843841553, |
|
"learning_rate": 7.91633081151841e-06, |
|
"loss": 0.4861, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 3.517455816268921, |
|
"learning_rate": 7.825381301371452e-06, |
|
"loss": 0.4518, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 1.2912664413452148, |
|
"learning_rate": 7.734620213469166e-06, |
|
"loss": 0.4832, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 3.6948964595794678, |
|
"learning_rate": 7.644055411906493e-06, |
|
"loss": 0.4969, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 1.5953376293182373, |
|
"learning_rate": 7.553694743770928e-06, |
|
"loss": 0.4606, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 2.7939870357513428, |
|
"learning_rate": 7.463546038462602e-06, |
|
"loss": 0.5225, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 1.0297088623046875, |
|
"learning_rate": 7.373617107015889e-06, |
|
"loss": 0.529, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"grad_norm": 2.87479305267334, |
|
"learning_rate": 7.283915741422611e-06, |
|
"loss": 0.5134, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 3.1623082160949707, |
|
"learning_rate": 7.194449713956908e-06, |
|
"loss": 0.4509, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 1.8917375802993774, |
|
"learning_rate": 7.105226776501772e-06, |
|
"loss": 0.5175, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 1.6095237731933594, |
|
"learning_rate": 7.016254659877398e-06, |
|
"loss": 0.4742, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 3.2498207092285156, |
|
"learning_rate": 6.927541073171333e-06, |
|
"loss": 0.4605, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"grad_norm": 1.7395751476287842, |
|
"learning_rate": 6.839093703070512e-06, |
|
"loss": 0.4987, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 2.4571480751037598, |
|
"learning_rate": 6.750920213195238e-06, |
|
"loss": 0.4829, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 9.72, |
|
"grad_norm": 4.019631385803223, |
|
"learning_rate": 6.6630282434351535e-06, |
|
"loss": 0.4842, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 2.756540298461914, |
|
"learning_rate": 6.575425409287292e-06, |
|
"loss": 0.5198, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 2.0040042400360107, |
|
"learning_rate": 6.488119301196201e-06, |
|
"loss": 0.5239, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 7.419244766235352, |
|
"learning_rate": 6.4011174838962706e-06, |
|
"loss": 0.4636, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 2.879230260848999, |
|
"learning_rate": 6.314427495756283e-06, |
|
"loss": 0.4693, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 2.1217892169952393, |
|
"learning_rate": 6.228056848126236e-06, |
|
"loss": 0.475, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"grad_norm": 2.1474809646606445, |
|
"learning_rate": 6.142013024686509e-06, |
|
"loss": 0.4995, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 4.3783721923828125, |
|
"learning_rate": 6.056303480799449e-06, |
|
"loss": 0.486, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"grad_norm": 1.397594928741455, |
|
"learning_rate": 5.970935642863375e-06, |
|
"loss": 0.4537, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 10.08, |
|
"grad_norm": 3.2472903728485107, |
|
"learning_rate": 5.885916907669114e-06, |
|
"loss": 0.3856, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"grad_norm": 3.1067910194396973, |
|
"learning_rate": 5.801254641759103e-06, |
|
"loss": 0.4705, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 2.5792055130004883, |
|
"learning_rate": 5.716956180789098e-06, |
|
"loss": 0.5011, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"grad_norm": 3.1249446868896484, |
|
"learning_rate": 5.6330288288925805e-06, |
|
"loss": 0.462, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 10.24, |
|
"grad_norm": 4.708195209503174, |
|
"learning_rate": 5.549479858047875e-06, |
|
"loss": 0.5043, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"grad_norm": 1.4850634336471558, |
|
"learning_rate": 5.466316507448049e-06, |
|
"loss": 0.5244, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"grad_norm": 1.0298686027526855, |
|
"learning_rate": 5.3835459828736945e-06, |
|
"loss": 0.4362, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"grad_norm": 2.251105546951294, |
|
"learning_rate": 5.30117545606854e-06, |
|
"loss": 0.4788, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 4.622702121734619, |
|
"learning_rate": 5.219212064118079e-06, |
|
"loss": 0.4265, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"grad_norm": 3.4977996349334717, |
|
"learning_rate": 5.137662908831147e-06, |
|
"loss": 0.5, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"grad_norm": 5.369349002838135, |
|
"learning_rate": 5.056535056124592e-06, |
|
"loss": 0.4409, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 10.52, |
|
"grad_norm": 2.318140983581543, |
|
"learning_rate": 4.97583553541102e-06, |
|
"loss": 0.4594, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 10.56, |
|
"grad_norm": 2.4116406440734863, |
|
"learning_rate": 4.895571338989754e-06, |
|
"loss": 0.4953, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 3.0506629943847656, |
|
"learning_rate": 4.8157494214409475e-06, |
|
"loss": 0.4795, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"grad_norm": 1.8125630617141724, |
|
"learning_rate": 4.736376699023023e-06, |
|
"loss": 0.481, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"grad_norm": 1.9686360359191895, |
|
"learning_rate": 4.6574600490733794e-06, |
|
"loss": 0.4713, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 10.72, |
|
"grad_norm": 2.8264060020446777, |
|
"learning_rate": 4.579006309412533e-06, |
|
"loss": 0.4501, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"grad_norm": 1.9835346937179565, |
|
"learning_rate": 4.501022277751602e-06, |
|
"loss": 0.4754, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 4.487490653991699, |
|
"learning_rate": 4.423514711103355e-06, |
|
"loss": 0.5056, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 10.84, |
|
"grad_norm": 3.1984522342681885, |
|
"learning_rate": 4.346490325196704e-06, |
|
"loss": 0.4415, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 10.88, |
|
"grad_norm": 2.0367348194122314, |
|
"learning_rate": 4.26995579389485e-06, |
|
"loss": 0.5117, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 1.78911292552948, |
|
"learning_rate": 4.193917748616979e-06, |
|
"loss": 0.475, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 10.96, |
|
"grad_norm": 2.0589475631713867, |
|
"learning_rate": 4.118382777763711e-06, |
|
"loss": 0.4363, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 3.54664945602417, |
|
"learning_rate": 4.04335742614622e-06, |
|
"loss": 0.4665, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"grad_norm": 1.714920997619629, |
|
"learning_rate": 3.968848194419163e-06, |
|
"loss": 0.4515, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 11.161652565002441, |
|
"learning_rate": 3.894861538517401e-06, |
|
"loss": 0.4285, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 11.12, |
|
"grad_norm": 2.627831220626831, |
|
"learning_rate": 3.821403869096658e-06, |
|
"loss": 0.4343, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 11.16, |
|
"grad_norm": 2.6865172386169434, |
|
"learning_rate": 3.748481550978017e-06, |
|
"loss": 0.4766, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 4.996657848358154, |
|
"learning_rate": 3.6761009025964657e-06, |
|
"loss": 0.4096, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 1.6282066106796265, |
|
"learning_rate": 3.604268195453421e-06, |
|
"loss": 0.4622, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 11.28, |
|
"grad_norm": 1.6030402183532715, |
|
"learning_rate": 3.5329896535733133e-06, |
|
"loss": 0.4437, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 11.32, |
|
"grad_norm": 2.8916399478912354, |
|
"learning_rate": 3.462271452964321e-06, |
|
"loss": 0.4871, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 11.36, |
|
"grad_norm": 2.375190019607544, |
|
"learning_rate": 3.3921197210832235e-06, |
|
"loss": 0.4575, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 4.021700382232666, |
|
"learning_rate": 3.3225405363045016e-06, |
|
"loss": 0.4699, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 11.44, |
|
"grad_norm": 1.7844669818878174, |
|
"learning_rate": 3.2535399273936407e-06, |
|
"loss": 0.4648, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 11.48, |
|
"grad_norm": 2.744528293609619, |
|
"learning_rate": 3.1851238729848033e-06, |
|
"loss": 0.3923, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 11.52, |
|
"grad_norm": 1.6203703880310059, |
|
"learning_rate": 3.11729830106276e-06, |
|
"loss": 0.4717, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 1.696370244026184, |
|
"learning_rate": 3.0500690884492836e-06, |
|
"loss": 0.4556, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 4.04744291305542, |
|
"learning_rate": 2.983442060293926e-06, |
|
"loss": 0.4785, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 11.64, |
|
"grad_norm": 2.629739284515381, |
|
"learning_rate": 2.917422989569311e-06, |
|
"loss": 0.463, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 11.68, |
|
"grad_norm": 2.43945050239563, |
|
"learning_rate": 2.852017596570901e-06, |
|
"loss": 0.4551, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 6.5788116455078125, |
|
"learning_rate": 2.7872315484213954e-06, |
|
"loss": 0.4501, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 11.76, |
|
"grad_norm": 2.3283305168151855, |
|
"learning_rate": 2.723070458579653e-06, |
|
"loss": 0.4338, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"grad_norm": 4.168436527252197, |
|
"learning_rate": 2.6595398863543407e-06, |
|
"loss": 0.4744, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 11.84, |
|
"grad_norm": 3.3579213619232178, |
|
"learning_rate": 2.596645336422219e-06, |
|
"loss": 0.4257, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 4.208755970001221, |
|
"learning_rate": 2.5343922583512026e-06, |
|
"loss": 0.4676, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"grad_norm": 2.752279281616211, |
|
"learning_rate": 2.472786046128156e-06, |
|
"loss": 0.455, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 11.96, |
|
"grad_norm": 3.5390079021453857, |
|
"learning_rate": 2.411832037691545e-06, |
|
"loss": 0.4646, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 2.805065870285034, |
|
"learning_rate": 2.3515355144689155e-06, |
|
"loss": 0.4774, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"grad_norm": 2.3203487396240234, |
|
"learning_rate": 2.2919017009192703e-06, |
|
"loss": 0.4333, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 12.08, |
|
"grad_norm": 2.62695050239563, |
|
"learning_rate": 2.2329357640804118e-06, |
|
"loss": 0.456, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 12.12, |
|
"grad_norm": 1.8128643035888672, |
|
"learning_rate": 2.1746428131212126e-06, |
|
"loss": 0.4054, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 12.16, |
|
"grad_norm": 1.5641071796417236, |
|
"learning_rate": 2.117027898898948e-06, |
|
"loss": 0.4875, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 2.1929521560668945, |
|
"learning_rate": 2.0600960135216463e-06, |
|
"loss": 0.4041, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"grad_norm": 1.746474027633667, |
|
"learning_rate": 2.003852089915548e-06, |
|
"loss": 0.5115, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 12.28, |
|
"grad_norm": 2.731269121170044, |
|
"learning_rate": 1.9483010013976766e-06, |
|
"loss": 0.4459, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"grad_norm": 1.9771169424057007, |
|
"learning_rate": 1.8934475612536019e-06, |
|
"loss": 0.3677, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"grad_norm": 3.043891191482544, |
|
"learning_rate": 1.8392965223203707e-06, |
|
"loss": 0.4353, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 3.361074447631836, |
|
"learning_rate": 1.7858525765747047e-06, |
|
"loss": 0.4578, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 12.44, |
|
"grad_norm": 3.8681182861328125, |
|
"learning_rate": 1.7331203547264452e-06, |
|
"loss": 0.4057, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"grad_norm": 2.581637382507324, |
|
"learning_rate": 1.6811044258173425e-06, |
|
"loss": 0.4532, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 12.52, |
|
"grad_norm": 2.8616292476654053, |
|
"learning_rate": 1.629809296825139e-06, |
|
"loss": 0.4551, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"grad_norm": 2.4617111682891846, |
|
"learning_rate": 1.579239412273078e-06, |
|
"loss": 0.4388, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 14.222563743591309, |
|
"learning_rate": 1.5293991538447882e-06, |
|
"loss": 0.412, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"grad_norm": 3.5921924114227295, |
|
"learning_rate": 1.4802928400046457e-06, |
|
"loss": 0.4517, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 12.68, |
|
"grad_norm": 2.4046990871429443, |
|
"learning_rate": 1.4319247256235713e-06, |
|
"loss": 0.4893, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 12.72, |
|
"grad_norm": 2.5496039390563965, |
|
"learning_rate": 1.3842990016103886e-06, |
|
"loss": 0.4305, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 12.76, |
|
"grad_norm": 2.3980159759521484, |
|
"learning_rate": 1.3374197945486833e-06, |
|
"loss": 0.3833, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 1.515519142150879, |
|
"learning_rate": 1.2912911663392468e-06, |
|
"loss": 0.4513, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 12.84, |
|
"grad_norm": 2.939988136291504, |
|
"learning_rate": 1.245917113848144e-06, |
|
"loss": 0.4712, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 12.88, |
|
"grad_norm": 1.846997857093811, |
|
"learning_rate": 1.2013015685603813e-06, |
|
"loss": 0.4789, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 12.92, |
|
"grad_norm": 2.960897445678711, |
|
"learning_rate": 1.1574483962392768e-06, |
|
"loss": 0.4128, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 1.8638381958007812, |
|
"learning_rate": 1.114361396591498e-06, |
|
"loss": 0.4949, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 2.134097099304199, |
|
"learning_rate": 1.0720443029378303e-06, |
|
"loss": 0.4167, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 13.04, |
|
"grad_norm": 2.2456820011138916, |
|
"learning_rate": 1.0305007818897006e-06, |
|
"loss": 0.4483, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 13.08, |
|
"grad_norm": 9.045520782470703, |
|
"learning_rate": 9.897344330314862e-07, |
|
"loss": 0.454, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 2.646930694580078, |
|
"learning_rate": 9.497487886086132e-07, |
|
"loss": 0.4438, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 13.16, |
|
"grad_norm": 4.203260898590088, |
|
"learning_rate": 9.105473132215126e-07, |
|
"loss": 0.3904, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 3.177109479904175, |
|
"learning_rate": 8.721334035254203e-07, |
|
"loss": 0.4128, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 13.24, |
|
"grad_norm": 1.825671911239624, |
|
"learning_rate": 8.345103879360695e-07, |
|
"loss": 0.4479, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 3.2267651557922363, |
|
"learning_rate": 7.976815263412963e-07, |
|
"loss": 0.3928, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 13.32, |
|
"grad_norm": 3.6811180114746094, |
|
"learning_rate": 7.616500098185908e-07, |
|
"loss": 0.4163, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 13.36, |
|
"grad_norm": 3.258467435836792, |
|
"learning_rate": 7.264189603585892e-07, |
|
"loss": 0.4186, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 1.861440658569336, |
|
"learning_rate": 6.919914305945774e-07, |
|
"loss": 0.4416, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"grad_norm": 3.303765296936035, |
|
"learning_rate": 6.58370403537989e-07, |
|
"loss": 0.3958, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 13.48, |
|
"grad_norm": 1.9865639209747314, |
|
"learning_rate": 6.255587923199313e-07, |
|
"loss": 0.4424, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 13.52, |
|
"grad_norm": 2.3877573013305664, |
|
"learning_rate": 5.935594399387856e-07, |
|
"loss": 0.4778, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 13.56, |
|
"grad_norm": 2.6566929817199707, |
|
"learning_rate": 5.623751190138682e-07, |
|
"loss": 0.4045, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 4.0699052810668945, |
|
"learning_rate": 5.320085315451862e-07, |
|
"loss": 0.4275, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"grad_norm": 4.235281944274902, |
|
"learning_rate": 5.024623086793323e-07, |
|
"loss": 0.4346, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 13.68, |
|
"grad_norm": 1.7936820983886719, |
|
"learning_rate": 4.737390104814954e-07, |
|
"loss": 0.4343, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 13.72, |
|
"grad_norm": 4.244424343109131, |
|
"learning_rate": 4.458411257136486e-07, |
|
"loss": 0.4355, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 13.76, |
|
"grad_norm": 3.0799527168273926, |
|
"learning_rate": 4.1877107161890416e-07, |
|
"loss": 0.4407, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"grad_norm": 3.4574971199035645, |
|
"learning_rate": 3.9253119371206684e-07, |
|
"loss": 0.4369, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 13.84, |
|
"grad_norm": 1.3924965858459473, |
|
"learning_rate": 3.671237655764104e-07, |
|
"loss": 0.4823, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 13.88, |
|
"grad_norm": 1.9487115144729614, |
|
"learning_rate": 3.4255098866667114e-07, |
|
"loss": 0.4566, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 2.6502346992492676, |
|
"learning_rate": 3.188149921183115e-07, |
|
"loss": 0.4824, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 13.96, |
|
"grad_norm": 3.1728081703186035, |
|
"learning_rate": 2.959178325630296e-07, |
|
"loss": 0.3983, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 2.273251533508301, |
|
"learning_rate": 2.7386149395056463e-07, |
|
"loss": 0.4541, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"grad_norm": 2.2681076526641846, |
|
"learning_rate": 2.526478873767946e-07, |
|
"loss": 0.4667, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 14.08, |
|
"grad_norm": 2.3255577087402344, |
|
"learning_rate": 2.322788509181484e-07, |
|
"loss": 0.441, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 14.12, |
|
"grad_norm": 1.8558521270751953, |
|
"learning_rate": 2.1275614947233624e-07, |
|
"loss": 0.4294, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 14.16, |
|
"grad_norm": 3.000098943710327, |
|
"learning_rate": 1.9408147460544203e-07, |
|
"loss": 0.4245, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 3.5599377155303955, |
|
"learning_rate": 1.7625644440534384e-07, |
|
"loss": 0.3884, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 14.24, |
|
"grad_norm": 3.208889961242676, |
|
"learning_rate": 1.5928260334151847e-07, |
|
"loss": 0.4434, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 14.28, |
|
"grad_norm": 3.4625139236450195, |
|
"learning_rate": 1.4316142213121386e-07, |
|
"loss": 0.4474, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 14.32, |
|
"grad_norm": 4.883245944976807, |
|
"learning_rate": 1.2789429761202565e-07, |
|
"loss": 0.3927, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"grad_norm": 4.052615165710449, |
|
"learning_rate": 1.134825526208605e-07, |
|
"loss": 0.3967, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 1.4959968328475952, |
|
"learning_rate": 9.992743587931674e-08, |
|
"loss": 0.4807, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 14.44, |
|
"grad_norm": 2.757499933242798, |
|
"learning_rate": 8.723012188549318e-08, |
|
"loss": 0.4506, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 14.48, |
|
"grad_norm": 2.0996549129486084, |
|
"learning_rate": 7.539171081221597e-08, |
|
"loss": 0.3934, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"grad_norm": 2.4962105751037598, |
|
"learning_rate": 6.44132284117216e-08, |
|
"loss": 0.4549, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 14.56, |
|
"grad_norm": 1.9287108182907104, |
|
"learning_rate": 5.429562592677018e-08, |
|
"loss": 0.4208, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"grad_norm": 4.647493362426758, |
|
"learning_rate": 4.503978000823028e-08, |
|
"loss": 0.4376, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 14.64, |
|
"grad_norm": 1.24240243434906, |
|
"learning_rate": 3.6646492639118567e-08, |
|
"loss": 0.4562, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 2.359651565551758, |
|
"learning_rate": 2.911649106511316e-08, |
|
"loss": 0.42, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 14.72, |
|
"grad_norm": 2.7108898162841797, |
|
"learning_rate": 2.2450427731534052e-08, |
|
"loss": 0.4489, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 14.76, |
|
"grad_norm": 1.4741981029510498, |
|
"learning_rate": 1.664888022682165e-08, |
|
"loss": 0.4532, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"grad_norm": 1.676648497581482, |
|
"learning_rate": 1.1712351232480157e-08, |
|
"loss": 0.4375, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 1.2909196615219116, |
|
"learning_rate": 7.641268479531283e-09, |
|
"loss": 0.4574, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 14.88, |
|
"grad_norm": 3.2380595207214355, |
|
"learning_rate": 4.435984711446128e-09, |
|
"loss": 0.3538, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 14.92, |
|
"grad_norm": 1.9485658407211304, |
|
"learning_rate": 2.0967776535851802e-09, |
|
"loss": 0.3884, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 14.96, |
|
"grad_norm": 2.0014994144439697, |
|
"learning_rate": 6.238499891353389e-10, |
|
"loss": 0.4266, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 2.8281970024108887, |
|
"learning_rate": 1.7329341542859922e-11, |
|
"loss": 0.4743, |
|
"step": 3750 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.815859931388314e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |