{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.048128,
"eval_steps": 100,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01024,
"grad_norm": 269.9034729003906,
"learning_rate": 0.0001999995200527669,
"loss": 624.6766,
"step": 10
},
{
"epoch": 0.02048,
"grad_norm": 73.23262786865234,
"learning_rate": 0.000199941931959037,
"loss": 544.9223,
"step": 20
},
{
"epoch": 0.03072,
"grad_norm": 95.47492218017578,
"learning_rate": 0.00019978841775475367,
"loss": 511.6501,
"step": 30
},
{
"epoch": 0.04096,
"grad_norm": 145.1549530029297,
"learning_rate": 0.00019953912478568305,
"loss": 497.5438,
"step": 40
},
{
"epoch": 0.0512,
"grad_norm": 150.53515625,
"learning_rate": 0.00019919429232781712,
"loss": 497.208,
"step": 50
},
{
"epoch": 0.06144,
"grad_norm": 151.51736450195312,
"learning_rate": 0.0001987542513577122,
"loss": 496.9036,
"step": 60
},
{
"epoch": 0.07168,
"grad_norm": 159.39337158203125,
"learning_rate": 0.0001982771584048096,
"loss": 489.2719,
"step": 70
},
{
"epoch": 0.08192,
"grad_norm": 148.92176818847656,
"learning_rate": 0.00019765746006440455,
"loss": 482.9451,
"step": 80
},
{
"epoch": 0.09216,
"grad_norm": 139.00486755371094,
"learning_rate": 0.0001970195706599109,
"loss": 476.9848,
"step": 90
},
{
"epoch": 0.1024,
"grad_norm": 157.25439453125,
"learning_rate": 0.00019622236172137374,
"loss": 471.4595,
"step": 100
},
{
"epoch": 0.1024,
"eval_loss": 7.320011615753174,
"eval_runtime": 3.4958,
"eval_samples_per_second": 143.029,
"eval_steps_per_second": 9.154,
"step": 100
},
{
"epoch": 0.11264,
"grad_norm": 141.9667510986328,
"learning_rate": 0.0001953327967844356,
"loss": 468.9138,
"step": 110
},
{
"epoch": 0.12288,
"grad_norm": 99.36428833007812,
"learning_rate": 0.0001943517296699384,
"loss": 468.5555,
"step": 120
},
{
"epoch": 0.13312,
"grad_norm": 98.90169525146484,
"learning_rate": 0.00019328010202420258,
"loss": 463.3139,
"step": 130
},
{
"epoch": 0.14336,
"grad_norm": 77.31971740722656,
"learning_rate": 0.00019211894241521758,
"loss": 458.0901,
"step": 140
},
{
"epoch": 0.1536,
"grad_norm": 134.48403930664062,
"learning_rate": 0.0001908693653454033,
"loss": 454.8131,
"step": 150
},
{
"epoch": 0.16384,
"grad_norm": 95.27815246582031,
"learning_rate": 0.00018953257018189024,
"loss": 454.0167,
"step": 160
},
{
"epoch": 0.17408,
"grad_norm": 87.9055404663086,
"learning_rate": 0.00018810984000534458,
"loss": 449.3531,
"step": 170
},
{
"epoch": 0.18432,
"grad_norm": 116.1905288696289,
"learning_rate": 0.00018660254037844388,
"loss": 447.6146,
"step": 180
},
{
"epoch": 0.19456,
"grad_norm": 81.48722076416016,
"learning_rate": 0.00018501211803518468,
"loss": 450.6066,
"step": 190
},
{
"epoch": 0.2048,
"grad_norm": 75.57894897460938,
"learning_rate": 0.00018334009949228061,
"loss": 448.9498,
"step": 200
},
{
"epoch": 0.2048,
"eval_loss": 6.992556571960449,
"eval_runtime": 3.4911,
"eval_samples_per_second": 143.219,
"eval_steps_per_second": 9.166,
"step": 200
},
{
"epoch": 0.21504,
"grad_norm": 160.8426513671875,
"learning_rate": 0.00018158808958398338,
"loss": 449.3857,
"step": 210
},
{
"epoch": 0.22528,
"grad_norm": 86.6707992553711,
"learning_rate": 0.00017975776992173344,
"loss": 449.0133,
"step": 220
},
{
"epoch": 0.23552,
"grad_norm": 65.60407257080078,
"learning_rate": 0.00017785089728011798,
"loss": 446.8142,
"step": 230
},
{
"epoch": 0.24576,
"grad_norm": 125.04796600341797,
"learning_rate": 0.00017586930191068655,
"loss": 446.0437,
"step": 240
},
{
"epoch": 0.256,
"grad_norm": 89.98313903808594,
"learning_rate": 0.00017381488578524173,
"loss": 445.3744,
"step": 250
},
{
"epoch": 0.26624,
"grad_norm": 269.5317077636719,
"learning_rate": 0.00017168962077029147,
"loss": 446.719,
"step": 260
},
{
"epoch": 0.27648,
"grad_norm": 73.03639221191406,
"learning_rate": 0.00016949554673441534,
"loss": 448.0971,
"step": 270
},
{
"epoch": 0.28672,
"grad_norm": 122.57962799072266,
"learning_rate": 0.00016723476959036083,
"loss": 448.991,
"step": 280
},
{
"epoch": 0.29696,
"grad_norm": 81.79669189453125,
"learning_rate": 0.0001649094592737497,
"loss": 444.1866,
"step": 290
},
{
"epoch": 0.3072,
"grad_norm": 74.33326721191406,
"learning_rate": 0.00016252184766033342,
"loss": 436.623,
"step": 300
},
{
"epoch": 0.3072,
"eval_loss": 6.777511119842529,
"eval_runtime": 3.4227,
"eval_samples_per_second": 146.083,
"eval_steps_per_second": 9.349,
"step": 300
},
{
"epoch": 0.31744,
"grad_norm": 113.46410369873047,
"learning_rate": 0.0001600742264237979,
"loss": 435.7422,
"step": 310
},
{
"epoch": 0.32768,
"grad_norm": 99.42645263671875,
"learning_rate": 0.00015756894483617267,
"loss": 439.4858,
"step": 320
},
{
"epoch": 0.33792,
"grad_norm": 328.0025634765625,
"learning_rate": 0.0001550084075129563,
"loss": 447.5792,
"step": 330
},
{
"epoch": 0.34816,
"grad_norm": 82.54906463623047,
"learning_rate": 0.00015239507210512194,
"loss": 446.5024,
"step": 340
},
{
"epoch": 0.3584,
"grad_norm": 62.32942581176758,
"learning_rate": 0.00014973144694021876,
"loss": 437.9146,
"step": 350
},
{
"epoch": 0.36864,
"grad_norm": 64.61022186279297,
"learning_rate": 0.00014702008861483266,
"loss": 430.4142,
"step": 360
},
{
"epoch": 0.37888,
"grad_norm": 133.777587890625,
"learning_rate": 0.00014426359954071796,
"loss": 428.6971,
"step": 370
},
{
"epoch": 0.38912,
"grad_norm": 218.512939453125,
"learning_rate": 0.00014146462544695426,
"loss": 435.1475,
"step": 380
},
{
"epoch": 0.39936,
"grad_norm": 125.56941986083984,
"learning_rate": 0.00013862585284052714,
"loss": 445.5835,
"step": 390
},
{
"epoch": 0.4096,
"grad_norm": 109.06041717529297,
"learning_rate": 0.00013575000642776893,
"loss": 446.3095,
"step": 400
},
{
"epoch": 0.4096,
"eval_loss": 6.905622959136963,
"eval_runtime": 3.4269,
"eval_samples_per_second": 145.903,
"eval_steps_per_second": 9.338,
"step": 400
},
{
"epoch": 0.41984,
"grad_norm": 69.12989044189453,
"learning_rate": 0.0001328398464991355,
"loss": 438.9709,
"step": 410
},
{
"epoch": 0.43008,
"grad_norm": 68.11474609375,
"learning_rate": 0.00012989816627982848,
"loss": 432.2964,
"step": 420
},
{
"epoch": 0.44032,
"grad_norm": 65.17674255371094,
"learning_rate": 0.00012692778924880603,
"loss": 428.2125,
"step": 430
},
{
"epoch": 0.45056,
"grad_norm": 114.95523834228516,
"learning_rate": 0.0001239315664287558,
"loss": 426.8882,
"step": 440
},
{
"epoch": 0.4608,
"grad_norm": 185.0157470703125,
"learning_rate": 0.00012091237364963071,
"loss": 435.8043,
"step": 450
},
{
"epoch": 0.47104,
"grad_norm": 92.59754180908203,
"learning_rate": 0.00011787310878837422,
"loss": 440.9751,
"step": 460
},
{
"epoch": 0.48128,
"grad_norm": 75.24162292480469,
"learning_rate": 0.00011481668898748475,
"loss": 439.3276,
"step": 470
},
{
"epoch": 0.49152,
"grad_norm": 55.42325210571289,
"learning_rate": 0.00011174604785508813,
"loss": 432.4603,
"step": 480
},
{
"epoch": 0.50176,
"grad_norm": 62.27671813964844,
"learning_rate": 0.00010866413264920678,
"loss": 427.5299,
"step": 490
},
{
"epoch": 0.512,
"grad_norm": 65.43367767333984,
"learning_rate": 0.00010557390144892684,
"loss": 425.4595,
"step": 500
},
{
"epoch": 0.512,
"eval_loss": 6.613161087036133,
"eval_runtime": 3.4182,
"eval_samples_per_second": 146.277,
"eval_steps_per_second": 9.362,
"step": 500
},
{
"epoch": 0.52224,
"grad_norm": 175.58470153808594,
"learning_rate": 0.0001024783203151793,
"loss": 425.5378,
"step": 510
},
{
"epoch": 0.53248,
"grad_norm": 199.92291259765625,
"learning_rate": 9.938036044386005e-05,
"loss": 431.3893,
"step": 520
},
{
"epoch": 0.54272,
"grad_norm": 212.65650939941406,
"learning_rate": 9.628299531402117e-05,
"loss": 443.9659,
"step": 530
},
{
"epoch": 0.55296,
"grad_norm": 93.11270141601562,
"learning_rate": 9.318919783387094e-05,
"loss": 443.3476,
"step": 540
},
{
"epoch": 0.5632,
"grad_norm": 93.02433013916016,
"learning_rate": 9.010193748732155e-05,
"loss": 438.1048,
"step": 550
},
{
"epoch": 0.57344,
"grad_norm": 72.0661849975586,
"learning_rate": 8.702417748382385e-05,
"loss": 431.1463,
"step": 560
},
{
"epoch": 0.58368,
"grad_norm": 67.0578842163086,
"learning_rate": 8.395887191422397e-05,
"loss": 427.2931,
"step": 570
},
{
"epoch": 0.59392,
"grad_norm": 85.532958984375,
"learning_rate": 8.090896291537273e-05,
"loss": 424.9293,
"step": 580
},
{
"epoch": 0.60416,
"grad_norm": 72.48572540283203,
"learning_rate": 7.787737784620803e-05,
"loss": 424.9051,
"step": 590
},
{
"epoch": 0.6144,
"grad_norm": 237.96592712402344,
"learning_rate": 7.486702647802213e-05,
"loss": 425.6438,
"step": 600
},
{
"epoch": 0.6144,
"eval_loss": 6.683709621429443,
"eval_runtime": 3.436,
"eval_samples_per_second": 145.519,
"eval_steps_per_second": 9.313,
"step": 600
},
{
"epoch": 0.62464,
"grad_norm": 178.17239379882812,
"learning_rate": 7.188079820160904e-05,
"loss": 432.3896,
"step": 610
},
{
"epoch": 0.63488,
"grad_norm": 84.38874053955078,
"learning_rate": 6.892155925397436e-05,
"loss": 434.9848,
"step": 620
},
{
"epoch": 0.64512,
"grad_norm": 66.67383575439453,
"learning_rate": 6.59921499672677e-05,
"loss": 433.8923,
"step": 630
},
{
"epoch": 0.65536,
"grad_norm": 74.11187744140625,
"learning_rate": 6.309538204257977e-05,
"loss": 430.2817,
"step": 640
},
{
"epoch": 0.6656,
"grad_norm": 95.32003784179688,
"learning_rate": 6.02340358512196e-05,
"loss": 427.1533,
"step": 650
},
{
"epoch": 0.67584,
"grad_norm": 71.91348266601562,
"learning_rate": 5.7410857766062966e-05,
"loss": 425.3034,
"step": 660
},
{
"epoch": 0.68608,
"grad_norm": 95.72642517089844,
"learning_rate": 5.4628557525532976e-05,
"loss": 425.3343,
"step": 670
},
{
"epoch": 0.69632,
"grad_norm": 161.08612060546875,
"learning_rate": 5.188980563274315e-05,
"loss": 426.5362,
"step": 680
},
{
"epoch": 0.70656,
"grad_norm": 130.4775848388672,
"learning_rate": 4.9197230792299195e-05,
"loss": 431.4921,
"step": 690
},
{
"epoch": 0.7168,
"grad_norm": 102.47798919677734,
"learning_rate": 4.6553417387219886e-05,
"loss": 432.9831,
"step": 700
},
{
"epoch": 0.7168,
"eval_loss": 6.725553512573242,
"eval_runtime": 3.4338,
"eval_samples_per_second": 145.61,
"eval_steps_per_second": 9.319,
"step": 700
},
{
"epoch": 0.72704,
"grad_norm": 73.72420501708984,
"learning_rate": 4.421777466693434e-05,
"loss": 431.4859,
"step": 710
},
{
"epoch": 0.73728,
"grad_norm": 83.63558197021484,
"learning_rate": 4.167355837898584e-05,
"loss": 428.698,
"step": 720
},
{
"epoch": 0.74752,
"grad_norm": 69.27027893066406,
"learning_rate": 3.918532488602094e-05,
"loss": 428.0623,
"step": 730
},
{
"epoch": 0.75776,
"grad_norm": 107.68405151367188,
"learning_rate": 3.675546244046228e-05,
"loss": 425.6424,
"step": 740
},
{
"epoch": 0.768,
"grad_norm": 96.42312622070312,
"learning_rate": 3.438630326912414e-05,
"loss": 425.8188,
"step": 750
},
{
"epoch": 0.77824,
"grad_norm": 116.1615982055664,
"learning_rate": 3.208012133469799e-05,
"loss": 425.9528,
"step": 760
},
{
"epoch": 0.78848,
"grad_norm": 91.75414276123047,
"learning_rate": 2.9839130153161154e-05,
"loss": 426.8583,
"step": 770
},
{
"epoch": 0.79872,
"grad_norm": 79.31800079345703,
"learning_rate": 2.766548066920338e-05,
"loss": 425.4576,
"step": 780
},
{
"epoch": 0.80896,
"grad_norm": 108.06861114501953,
"learning_rate": 2.5561259191710407e-05,
"loss": 425.0249,
"step": 790
},
{
"epoch": 0.8192,
"grad_norm": 86.25403594970703,
"learning_rate": 2.3528485391286147e-05,
"loss": 426.0778,
"step": 800
},
{
"epoch": 0.8192,
"eval_loss": 6.630011558532715,
"eval_runtime": 3.4108,
"eval_samples_per_second": 146.591,
"eval_steps_per_second": 9.382,
"step": 800
},
{
"epoch": 0.82944,
"grad_norm": 80.1161117553711,
"learning_rate": 2.1569110361735677e-05,
"loss": 426.9529,
"step": 810
},
{
"epoch": 0.83968,
"grad_norm": 92.57079315185547,
"learning_rate": 2e-05,
"loss": 425.7674,
"step": 820
},
{
"epoch": 0.84992,
"grad_norm": 76.28582000732422,
"learning_rate": 2e-05,
"loss": 425.3365,
"step": 830
},
{
"epoch": 0.86016,
"grad_norm": 111.58943176269531,
"learning_rate": 2e-05,
"loss": 424.6131,
"step": 840
},
{
"epoch": 0.8704,
"grad_norm": 158.44044494628906,
"learning_rate": 2e-05,
"loss": 425.0354,
"step": 850
},
{
"epoch": 0.88064,
"grad_norm": 101.99372100830078,
"learning_rate": 2e-05,
"loss": 424.9413,
"step": 860
},
{
"epoch": 0.89088,
"grad_norm": 140.2552490234375,
"learning_rate": 2e-05,
"loss": 426.5773,
"step": 870
},
{
"epoch": 0.90112,
"grad_norm": 117.06301879882812,
"learning_rate": 2e-05,
"loss": 427.2063,
"step": 880
},
{
"epoch": 0.91136,
"grad_norm": 147.27670288085938,
"learning_rate": 2e-05,
"loss": 427.2577,
"step": 890
},
{
"epoch": 0.9216,
"grad_norm": 109.34888458251953,
"learning_rate": 2e-05,
"loss": 428.4192,
"step": 900
},
{
"epoch": 0.9216,
"eval_loss": 6.663826942443848,
"eval_runtime": 3.429,
"eval_samples_per_second": 145.817,
"eval_steps_per_second": 9.332,
"step": 900
},
{
"epoch": 0.93184,
"grad_norm": 145.62522888183594,
"learning_rate": 2e-05,
"loss": 428.9891,
"step": 910
},
{
"epoch": 0.94208,
"grad_norm": 90.70750427246094,
"learning_rate": 2e-05,
"loss": 429.0984,
"step": 920
},
{
"epoch": 0.95232,
"grad_norm": 92.83578491210938,
"learning_rate": 2e-05,
"loss": 429.0002,
"step": 930
},
{
"epoch": 0.96256,
"grad_norm": 125.1180648803711,
"learning_rate": 2e-05,
"loss": 428.4883,
"step": 940
},
{
"epoch": 0.9728,
"grad_norm": 132.3828125,
"learning_rate": 2e-05,
"loss": 428.6993,
"step": 950
},
{
"epoch": 0.98304,
"grad_norm": 100.2248306274414,
"learning_rate": 2e-05,
"loss": 429.164,
"step": 960
},
{
"epoch": 0.99328,
"grad_norm": 112.38407897949219,
"learning_rate": 2e-05,
"loss": 430.1383,
"step": 970
},
{
"epoch": 1.003072,
"grad_norm": 104.0753173828125,
"learning_rate": 2e-05,
"loss": 410.847,
"step": 980
},
{
"epoch": 1.013312,
"grad_norm": 137.40553283691406,
"learning_rate": 2e-05,
"loss": 431.1839,
"step": 990
},
{
"epoch": 1.023552,
"grad_norm": 191.53709411621094,
"learning_rate": 2e-05,
"loss": 431.9959,
"step": 1000
},
{
"epoch": 1.023552,
"eval_loss": 6.718299865722656,
"eval_runtime": 3.4132,
"eval_samples_per_second": 146.49,
"eval_steps_per_second": 9.375,
"step": 1000
},
{
"epoch": 1.033792,
"grad_norm": 188.12156677246094,
"learning_rate": 2e-05,
"loss": 432.0317,
"step": 1010
},
{
"epoch": 1.044032,
"grad_norm": 119.64492797851562,
"learning_rate": 2e-05,
"loss": 432.8214,
"step": 1020
}
],
"logging_steps": 10,
"max_steps": 1024,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.134583528711782e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}