|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.048128, |
|
"eval_steps": 100, |
|
"global_step": 1024, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01024, |
|
"grad_norm": 269.9034729003906, |
|
"learning_rate": 0.0001999995200527669, |
|
"loss": 624.6766, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02048, |
|
"grad_norm": 73.23262786865234, |
|
"learning_rate": 0.000199941931959037, |
|
"loss": 544.9223, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03072, |
|
"grad_norm": 95.47492218017578, |
|
"learning_rate": 0.00019978841775475367, |
|
"loss": 511.6501, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04096, |
|
"grad_norm": 145.1549530029297, |
|
"learning_rate": 0.00019953912478568305, |
|
"loss": 497.5438, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 150.53515625, |
|
"learning_rate": 0.00019919429232781712, |
|
"loss": 497.208, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06144, |
|
"grad_norm": 151.51736450195312, |
|
"learning_rate": 0.0001987542513577122, |
|
"loss": 496.9036, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07168, |
|
"grad_norm": 159.39337158203125, |
|
"learning_rate": 0.0001982771584048096, |
|
"loss": 489.2719, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08192, |
|
"grad_norm": 148.92176818847656, |
|
"learning_rate": 0.00019765746006440455, |
|
"loss": 482.9451, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09216, |
|
"grad_norm": 139.00486755371094, |
|
"learning_rate": 0.0001970195706599109, |
|
"loss": 476.9848, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 157.25439453125, |
|
"learning_rate": 0.00019622236172137374, |
|
"loss": 471.4595, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"eval_loss": 7.320011615753174, |
|
"eval_runtime": 3.4958, |
|
"eval_samples_per_second": 143.029, |
|
"eval_steps_per_second": 9.154, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11264, |
|
"grad_norm": 141.9667510986328, |
|
"learning_rate": 0.0001953327967844356, |
|
"loss": 468.9138, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12288, |
|
"grad_norm": 99.36428833007812, |
|
"learning_rate": 0.0001943517296699384, |
|
"loss": 468.5555, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13312, |
|
"grad_norm": 98.90169525146484, |
|
"learning_rate": 0.00019328010202420258, |
|
"loss": 463.3139, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14336, |
|
"grad_norm": 77.31971740722656, |
|
"learning_rate": 0.00019211894241521758, |
|
"loss": 458.0901, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 134.48403930664062, |
|
"learning_rate": 0.0001908693653454033, |
|
"loss": 454.8131, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16384, |
|
"grad_norm": 95.27815246582031, |
|
"learning_rate": 0.00018953257018189024, |
|
"loss": 454.0167, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17408, |
|
"grad_norm": 87.9055404663086, |
|
"learning_rate": 0.00018810984000534458, |
|
"loss": 449.3531, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18432, |
|
"grad_norm": 116.1905288696289, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 447.6146, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19456, |
|
"grad_norm": 81.48722076416016, |
|
"learning_rate": 0.00018501211803518468, |
|
"loss": 450.6066, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 75.57894897460938, |
|
"learning_rate": 0.00018334009949228061, |
|
"loss": 448.9498, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"eval_loss": 6.992556571960449, |
|
"eval_runtime": 3.4911, |
|
"eval_samples_per_second": 143.219, |
|
"eval_steps_per_second": 9.166, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21504, |
|
"grad_norm": 160.8426513671875, |
|
"learning_rate": 0.00018158808958398338, |
|
"loss": 449.3857, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22528, |
|
"grad_norm": 86.6707992553711, |
|
"learning_rate": 0.00017975776992173344, |
|
"loss": 449.0133, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.23552, |
|
"grad_norm": 65.60407257080078, |
|
"learning_rate": 0.00017785089728011798, |
|
"loss": 446.8142, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.24576, |
|
"grad_norm": 125.04796600341797, |
|
"learning_rate": 0.00017586930191068655, |
|
"loss": 446.0437, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 89.98313903808594, |
|
"learning_rate": 0.00017381488578524173, |
|
"loss": 445.3744, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26624, |
|
"grad_norm": 269.5317077636719, |
|
"learning_rate": 0.00017168962077029147, |
|
"loss": 446.719, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.27648, |
|
"grad_norm": 73.03639221191406, |
|
"learning_rate": 0.00016949554673441534, |
|
"loss": 448.0971, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.28672, |
|
"grad_norm": 122.57962799072266, |
|
"learning_rate": 0.00016723476959036083, |
|
"loss": 448.991, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.29696, |
|
"grad_norm": 81.79669189453125, |
|
"learning_rate": 0.0001649094592737497, |
|
"loss": 444.1866, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 74.33326721191406, |
|
"learning_rate": 0.00016252184766033342, |
|
"loss": 436.623, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"eval_loss": 6.777511119842529, |
|
"eval_runtime": 3.4227, |
|
"eval_samples_per_second": 146.083, |
|
"eval_steps_per_second": 9.349, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.31744, |
|
"grad_norm": 113.46410369873047, |
|
"learning_rate": 0.0001600742264237979, |
|
"loss": 435.7422, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.32768, |
|
"grad_norm": 99.42645263671875, |
|
"learning_rate": 0.00015756894483617267, |
|
"loss": 439.4858, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.33792, |
|
"grad_norm": 328.0025634765625, |
|
"learning_rate": 0.0001550084075129563, |
|
"loss": 447.5792, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.34816, |
|
"grad_norm": 82.54906463623047, |
|
"learning_rate": 0.00015239507210512194, |
|
"loss": 446.5024, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 62.32942581176758, |
|
"learning_rate": 0.00014973144694021876, |
|
"loss": 437.9146, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.36864, |
|
"grad_norm": 64.61022186279297, |
|
"learning_rate": 0.00014702008861483266, |
|
"loss": 430.4142, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.37888, |
|
"grad_norm": 133.777587890625, |
|
"learning_rate": 0.00014426359954071796, |
|
"loss": 428.6971, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.38912, |
|
"grad_norm": 218.512939453125, |
|
"learning_rate": 0.00014146462544695426, |
|
"loss": 435.1475, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.39936, |
|
"grad_norm": 125.56941986083984, |
|
"learning_rate": 0.00013862585284052714, |
|
"loss": 445.5835, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 109.06041717529297, |
|
"learning_rate": 0.00013575000642776893, |
|
"loss": 446.3095, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"eval_loss": 6.905622959136963, |
|
"eval_runtime": 3.4269, |
|
"eval_samples_per_second": 145.903, |
|
"eval_steps_per_second": 9.338, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41984, |
|
"grad_norm": 69.12989044189453, |
|
"learning_rate": 0.0001328398464991355, |
|
"loss": 438.9709, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43008, |
|
"grad_norm": 68.11474609375, |
|
"learning_rate": 0.00012989816627982848, |
|
"loss": 432.2964, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.44032, |
|
"grad_norm": 65.17674255371094, |
|
"learning_rate": 0.00012692778924880603, |
|
"loss": 428.2125, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.45056, |
|
"grad_norm": 114.95523834228516, |
|
"learning_rate": 0.0001239315664287558, |
|
"loss": 426.8882, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 185.0157470703125, |
|
"learning_rate": 0.00012091237364963071, |
|
"loss": 435.8043, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.47104, |
|
"grad_norm": 92.59754180908203, |
|
"learning_rate": 0.00011787310878837422, |
|
"loss": 440.9751, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.48128, |
|
"grad_norm": 75.24162292480469, |
|
"learning_rate": 0.00011481668898748475, |
|
"loss": 439.3276, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.49152, |
|
"grad_norm": 55.42325210571289, |
|
"learning_rate": 0.00011174604785508813, |
|
"loss": 432.4603, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.50176, |
|
"grad_norm": 62.27671813964844, |
|
"learning_rate": 0.00010866413264920678, |
|
"loss": 427.5299, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 65.43367767333984, |
|
"learning_rate": 0.00010557390144892684, |
|
"loss": 425.4595, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_loss": 6.613161087036133, |
|
"eval_runtime": 3.4182, |
|
"eval_samples_per_second": 146.277, |
|
"eval_steps_per_second": 9.362, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.52224, |
|
"grad_norm": 175.58470153808594, |
|
"learning_rate": 0.0001024783203151793, |
|
"loss": 425.5378, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.53248, |
|
"grad_norm": 199.92291259765625, |
|
"learning_rate": 9.938036044386005e-05, |
|
"loss": 431.3893, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.54272, |
|
"grad_norm": 212.65650939941406, |
|
"learning_rate": 9.628299531402117e-05, |
|
"loss": 443.9659, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.55296, |
|
"grad_norm": 93.11270141601562, |
|
"learning_rate": 9.318919783387094e-05, |
|
"loss": 443.3476, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 93.02433013916016, |
|
"learning_rate": 9.010193748732155e-05, |
|
"loss": 438.1048, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.57344, |
|
"grad_norm": 72.0661849975586, |
|
"learning_rate": 8.702417748382385e-05, |
|
"loss": 431.1463, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.58368, |
|
"grad_norm": 67.0578842163086, |
|
"learning_rate": 8.395887191422397e-05, |
|
"loss": 427.2931, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.59392, |
|
"grad_norm": 85.532958984375, |
|
"learning_rate": 8.090896291537273e-05, |
|
"loss": 424.9293, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.60416, |
|
"grad_norm": 72.48572540283203, |
|
"learning_rate": 7.787737784620803e-05, |
|
"loss": 424.9051, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 237.96592712402344, |
|
"learning_rate": 7.486702647802213e-05, |
|
"loss": 425.6438, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"eval_loss": 6.683709621429443, |
|
"eval_runtime": 3.436, |
|
"eval_samples_per_second": 145.519, |
|
"eval_steps_per_second": 9.313, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.62464, |
|
"grad_norm": 178.17239379882812, |
|
"learning_rate": 7.188079820160904e-05, |
|
"loss": 432.3896, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.63488, |
|
"grad_norm": 84.38874053955078, |
|
"learning_rate": 6.892155925397436e-05, |
|
"loss": 434.9848, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.64512, |
|
"grad_norm": 66.67383575439453, |
|
"learning_rate": 6.59921499672677e-05, |
|
"loss": 433.8923, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.65536, |
|
"grad_norm": 74.11187744140625, |
|
"learning_rate": 6.309538204257977e-05, |
|
"loss": 430.2817, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 95.32003784179688, |
|
"learning_rate": 6.02340358512196e-05, |
|
"loss": 427.1533, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.67584, |
|
"grad_norm": 71.91348266601562, |
|
"learning_rate": 5.7410857766062966e-05, |
|
"loss": 425.3034, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.68608, |
|
"grad_norm": 95.72642517089844, |
|
"learning_rate": 5.4628557525532976e-05, |
|
"loss": 425.3343, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.69632, |
|
"grad_norm": 161.08612060546875, |
|
"learning_rate": 5.188980563274315e-05, |
|
"loss": 426.5362, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.70656, |
|
"grad_norm": 130.4775848388672, |
|
"learning_rate": 4.9197230792299195e-05, |
|
"loss": 431.4921, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 102.47798919677734, |
|
"learning_rate": 4.6553417387219886e-05, |
|
"loss": 432.9831, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"eval_loss": 6.725553512573242, |
|
"eval_runtime": 3.4338, |
|
"eval_samples_per_second": 145.61, |
|
"eval_steps_per_second": 9.319, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.72704, |
|
"grad_norm": 73.72420501708984, |
|
"learning_rate": 4.421777466693434e-05, |
|
"loss": 431.4859, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.73728, |
|
"grad_norm": 83.63558197021484, |
|
"learning_rate": 4.167355837898584e-05, |
|
"loss": 428.698, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.74752, |
|
"grad_norm": 69.27027893066406, |
|
"learning_rate": 3.918532488602094e-05, |
|
"loss": 428.0623, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.75776, |
|
"grad_norm": 107.68405151367188, |
|
"learning_rate": 3.675546244046228e-05, |
|
"loss": 425.6424, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 96.42312622070312, |
|
"learning_rate": 3.438630326912414e-05, |
|
"loss": 425.8188, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.77824, |
|
"grad_norm": 116.1615982055664, |
|
"learning_rate": 3.208012133469799e-05, |
|
"loss": 425.9528, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.78848, |
|
"grad_norm": 91.75414276123047, |
|
"learning_rate": 2.9839130153161154e-05, |
|
"loss": 426.8583, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.79872, |
|
"grad_norm": 79.31800079345703, |
|
"learning_rate": 2.766548066920338e-05, |
|
"loss": 425.4576, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.80896, |
|
"grad_norm": 108.06861114501953, |
|
"learning_rate": 2.5561259191710407e-05, |
|
"loss": 425.0249, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 86.25403594970703, |
|
"learning_rate": 2.3528485391286147e-05, |
|
"loss": 426.0778, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"eval_loss": 6.630011558532715, |
|
"eval_runtime": 3.4108, |
|
"eval_samples_per_second": 146.591, |
|
"eval_steps_per_second": 9.382, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82944, |
|
"grad_norm": 80.1161117553711, |
|
"learning_rate": 2.1569110361735677e-05, |
|
"loss": 426.9529, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.83968, |
|
"grad_norm": 92.57079315185547, |
|
"learning_rate": 2e-05, |
|
"loss": 425.7674, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.84992, |
|
"grad_norm": 76.28582000732422, |
|
"learning_rate": 2e-05, |
|
"loss": 425.3365, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.86016, |
|
"grad_norm": 111.58943176269531, |
|
"learning_rate": 2e-05, |
|
"loss": 424.6131, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 158.44044494628906, |
|
"learning_rate": 2e-05, |
|
"loss": 425.0354, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.88064, |
|
"grad_norm": 101.99372100830078, |
|
"learning_rate": 2e-05, |
|
"loss": 424.9413, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.89088, |
|
"grad_norm": 140.2552490234375, |
|
"learning_rate": 2e-05, |
|
"loss": 426.5773, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.90112, |
|
"grad_norm": 117.06301879882812, |
|
"learning_rate": 2e-05, |
|
"loss": 427.2063, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.91136, |
|
"grad_norm": 147.27670288085938, |
|
"learning_rate": 2e-05, |
|
"loss": 427.2577, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 109.34888458251953, |
|
"learning_rate": 2e-05, |
|
"loss": 428.4192, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"eval_loss": 6.663826942443848, |
|
"eval_runtime": 3.429, |
|
"eval_samples_per_second": 145.817, |
|
"eval_steps_per_second": 9.332, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.93184, |
|
"grad_norm": 145.62522888183594, |
|
"learning_rate": 2e-05, |
|
"loss": 428.9891, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.94208, |
|
"grad_norm": 90.70750427246094, |
|
"learning_rate": 2e-05, |
|
"loss": 429.0984, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.95232, |
|
"grad_norm": 92.83578491210938, |
|
"learning_rate": 2e-05, |
|
"loss": 429.0002, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.96256, |
|
"grad_norm": 125.1180648803711, |
|
"learning_rate": 2e-05, |
|
"loss": 428.4883, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 132.3828125, |
|
"learning_rate": 2e-05, |
|
"loss": 428.6993, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.98304, |
|
"grad_norm": 100.2248306274414, |
|
"learning_rate": 2e-05, |
|
"loss": 429.164, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.99328, |
|
"grad_norm": 112.38407897949219, |
|
"learning_rate": 2e-05, |
|
"loss": 430.1383, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.003072, |
|
"grad_norm": 104.0753173828125, |
|
"learning_rate": 2e-05, |
|
"loss": 410.847, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.013312, |
|
"grad_norm": 137.40553283691406, |
|
"learning_rate": 2e-05, |
|
"loss": 431.1839, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.023552, |
|
"grad_norm": 191.53709411621094, |
|
"learning_rate": 2e-05, |
|
"loss": 431.9959, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.023552, |
|
"eval_loss": 6.718299865722656, |
|
"eval_runtime": 3.4132, |
|
"eval_samples_per_second": 146.49, |
|
"eval_steps_per_second": 9.375, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.033792, |
|
"grad_norm": 188.12156677246094, |
|
"learning_rate": 2e-05, |
|
"loss": 432.0317, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.044032, |
|
"grad_norm": 119.64492797851562, |
|
"learning_rate": 2e-05, |
|
"loss": 432.8214, |
|
"step": 1020 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1024, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1024, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.134583528711782e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|