{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2080,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009615384615384616,
"grad_norm": 9.013119895723426,
"learning_rate": 4.807692307692308e-07,
"loss": 1.3011,
"step": 10
},
{
"epoch": 0.019230769230769232,
"grad_norm": 8.170199017296364,
"learning_rate": 9.615384615384617e-07,
"loss": 1.1715,
"step": 20
},
{
"epoch": 0.028846153846153848,
"grad_norm": 5.232892874296514,
"learning_rate": 1.4423076923076922e-06,
"loss": 0.9015,
"step": 30
},
{
"epoch": 0.038461538461538464,
"grad_norm": 2.862322298571592,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.7978,
"step": 40
},
{
"epoch": 0.04807692307692308,
"grad_norm": 2.7249762448136132,
"learning_rate": 2.403846153846154e-06,
"loss": 0.7609,
"step": 50
},
{
"epoch": 0.057692307692307696,
"grad_norm": 3.169444209352466,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.8146,
"step": 60
},
{
"epoch": 0.0673076923076923,
"grad_norm": 2.6189394700073807,
"learning_rate": 3.365384615384616e-06,
"loss": 0.777,
"step": 70
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.4397555428298077,
"learning_rate": 3.846153846153847e-06,
"loss": 0.78,
"step": 80
},
{
"epoch": 0.08653846153846154,
"grad_norm": 2.812934479310922,
"learning_rate": 4.326923076923077e-06,
"loss": 0.7721,
"step": 90
},
{
"epoch": 0.09615384615384616,
"grad_norm": 2.5334288416130746,
"learning_rate": 4.807692307692308e-06,
"loss": 0.7669,
"step": 100
},
{
"epoch": 0.10576923076923077,
"grad_norm": 2.342690607930004,
"learning_rate": 5.288461538461539e-06,
"loss": 0.7459,
"step": 110
},
{
"epoch": 0.11538461538461539,
"grad_norm": 2.3269599095205264,
"learning_rate": 5.769230769230769e-06,
"loss": 0.7851,
"step": 120
},
{
"epoch": 0.125,
"grad_norm": 2.42709508438667,
"learning_rate": 6.25e-06,
"loss": 0.7703,
"step": 130
},
{
"epoch": 0.1346153846153846,
"grad_norm": 2.6377643494039273,
"learning_rate": 6.730769230769232e-06,
"loss": 0.7496,
"step": 140
},
{
"epoch": 0.14423076923076922,
"grad_norm": 2.875686037935072,
"learning_rate": 7.211538461538462e-06,
"loss": 0.784,
"step": 150
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.1768155795174917,
"learning_rate": 7.692307692307694e-06,
"loss": 0.8022,
"step": 160
},
{
"epoch": 0.16346153846153846,
"grad_norm": 2.46654983618583,
"learning_rate": 8.173076923076923e-06,
"loss": 0.7741,
"step": 170
},
{
"epoch": 0.17307692307692307,
"grad_norm": 2.4450582546281554,
"learning_rate": 8.653846153846155e-06,
"loss": 0.7827,
"step": 180
},
{
"epoch": 0.18269230769230768,
"grad_norm": 2.4657718631331567,
"learning_rate": 9.134615384615384e-06,
"loss": 0.7894,
"step": 190
},
{
"epoch": 0.19230769230769232,
"grad_norm": 2.54311809351163,
"learning_rate": 9.615384615384616e-06,
"loss": 0.8164,
"step": 200
},
{
"epoch": 0.20192307692307693,
"grad_norm": 2.097681181559205,
"learning_rate": 9.999971836433636e-06,
"loss": 0.8139,
"step": 210
},
{
"epoch": 0.21153846153846154,
"grad_norm": 2.2579921721680516,
"learning_rate": 9.998986144924253e-06,
"loss": 0.7919,
"step": 220
},
{
"epoch": 0.22115384615384615,
"grad_norm": 2.1097886183574466,
"learning_rate": 9.996592592355083e-06,
"loss": 0.8056,
"step": 230
},
{
"epoch": 0.23076923076923078,
"grad_norm": 2.22507095955906,
"learning_rate": 9.992791852820709e-06,
"loss": 0.8108,
"step": 240
},
{
"epoch": 0.2403846153846154,
"grad_norm": 2.1931221071365425,
"learning_rate": 9.987584996720813e-06,
"loss": 0.8041,
"step": 250
},
{
"epoch": 0.25,
"grad_norm": 2.1220375928827044,
"learning_rate": 9.980973490458728e-06,
"loss": 0.8149,
"step": 260
},
{
"epoch": 0.25961538461538464,
"grad_norm": 2.1068957326041904,
"learning_rate": 9.972959196028456e-06,
"loss": 0.7914,
"step": 270
},
{
"epoch": 0.2692307692307692,
"grad_norm": 1.9372196057401814,
"learning_rate": 9.96354437049027e-06,
"loss": 0.8033,
"step": 280
},
{
"epoch": 0.27884615384615385,
"grad_norm": 2.2606013253960846,
"learning_rate": 9.952731665335071e-06,
"loss": 0.8039,
"step": 290
},
{
"epoch": 0.28846153846153844,
"grad_norm": 2.1614476516877406,
"learning_rate": 9.940524125737641e-06,
"loss": 0.7647,
"step": 300
},
{
"epoch": 0.2980769230769231,
"grad_norm": 2.220222508508704,
"learning_rate": 9.92692518969903e-06,
"loss": 0.7901,
"step": 310
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.201097779810873,
"learning_rate": 9.911938687078324e-06,
"loss": 0.7964,
"step": 320
},
{
"epoch": 0.3173076923076923,
"grad_norm": 2.029609328977134,
"learning_rate": 9.895568838514042e-06,
"loss": 0.7764,
"step": 330
},
{
"epoch": 0.3269230769230769,
"grad_norm": 2.2142291176229834,
"learning_rate": 9.87782025423547e-06,
"loss": 0.8005,
"step": 340
},
{
"epoch": 0.33653846153846156,
"grad_norm": 2.084315605131572,
"learning_rate": 9.8586979327643e-06,
"loss": 0.7795,
"step": 350
},
{
"epoch": 0.34615384615384615,
"grad_norm": 2.3024285135825457,
"learning_rate": 9.838207259506891e-06,
"loss": 0.7935,
"step": 360
},
{
"epoch": 0.3557692307692308,
"grad_norm": 2.5288625621043326,
"learning_rate": 9.816354005237583e-06,
"loss": 0.7749,
"step": 370
},
{
"epoch": 0.36538461538461536,
"grad_norm": 2.0010687704613743,
"learning_rate": 9.793144324473473e-06,
"loss": 0.7988,
"step": 380
},
{
"epoch": 0.375,
"grad_norm": 2.1536447186627496,
"learning_rate": 9.768584753741134e-06,
"loss": 0.7955,
"step": 390
},
{
"epoch": 0.38461538461538464,
"grad_norm": 2.025116331685143,
"learning_rate": 9.742682209735727e-06,
"loss": 0.8014,
"step": 400
},
{
"epoch": 0.3942307692307692,
"grad_norm": 2.027340612157234,
"learning_rate": 9.715443987373062e-06,
"loss": 0.8341,
"step": 410
},
{
"epoch": 0.40384615384615385,
"grad_norm": 1.968065863088579,
"learning_rate": 9.686877757735126e-06,
"loss": 0.7922,
"step": 420
},
{
"epoch": 0.41346153846153844,
"grad_norm": 2.016993760012052,
"learning_rate": 9.656991565909703e-06,
"loss": 0.8328,
"step": 430
},
{
"epoch": 0.4230769230769231,
"grad_norm": 2.1902508822456213,
"learning_rate": 9.62579382872462e-06,
"loss": 0.8029,
"step": 440
},
{
"epoch": 0.4326923076923077,
"grad_norm": 2.0187820173893534,
"learning_rate": 9.593293332377325e-06,
"loss": 0.7634,
"step": 450
},
{
"epoch": 0.4423076923076923,
"grad_norm": 2.1364408496498437,
"learning_rate": 9.55949922996045e-06,
"loss": 0.774,
"step": 460
},
{
"epoch": 0.4519230769230769,
"grad_norm": 2.0116381073985945,
"learning_rate": 9.52442103888402e-06,
"loss": 0.8331,
"step": 470
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.100311438402205,
"learning_rate": 9.488068638195072e-06,
"loss": 0.7478,
"step": 480
},
{
"epoch": 0.47115384615384615,
"grad_norm": 2.059472131869095,
"learning_rate": 9.450452265795423e-06,
"loss": 0.8202,
"step": 490
},
{
"epoch": 0.4807692307692308,
"grad_norm": 2.074026027178722,
"learning_rate": 9.411582515558391e-06,
"loss": 0.8177,
"step": 500
},
{
"epoch": 0.4807692307692308,
"eval_loss": 0.7822802066802979,
"eval_runtime": 31.3783,
"eval_samples_per_second": 58.926,
"eval_steps_per_second": 7.394,
"step": 500
},
{
"epoch": 0.49038461538461536,
"grad_norm": 1.9938002782487565,
"learning_rate": 9.371470334345232e-06,
"loss": 0.804,
"step": 510
},
{
"epoch": 0.5,
"grad_norm": 1.9596865504803214,
"learning_rate": 9.330127018922195e-06,
"loss": 0.7844,
"step": 520
},
{
"epoch": 0.5096153846153846,
"grad_norm": 1.8726235477694055,
"learning_rate": 9.287564212779012e-06,
"loss": 0.756,
"step": 530
},
{
"epoch": 0.5192307692307693,
"grad_norm": 2.091468778530183,
"learning_rate": 9.243793902849764e-06,
"loss": 0.8057,
"step": 540
},
{
"epoch": 0.5288461538461539,
"grad_norm": 2.032448528673805,
"learning_rate": 9.198828416136991e-06,
"loss": 0.8065,
"step": 550
},
{
"epoch": 0.5384615384615384,
"grad_norm": 2.0648947345620137,
"learning_rate": 9.152680416240059e-06,
"loss": 0.8101,
"step": 560
},
{
"epoch": 0.5480769230769231,
"grad_norm": 1.864661141720549,
"learning_rate": 9.10536289978872e-06,
"loss": 0.7597,
"step": 570
},
{
"epoch": 0.5576923076923077,
"grad_norm": 1.8389259729678664,
"learning_rate": 9.056889192782865e-06,
"loss": 0.806,
"step": 580
},
{
"epoch": 0.5673076923076923,
"grad_norm": 1.997501225318945,
"learning_rate": 9.007272946839559e-06,
"loss": 0.7696,
"step": 590
},
{
"epoch": 0.5769230769230769,
"grad_norm": 1.9510249081212931,
"learning_rate": 8.95652813534831e-06,
"loss": 0.7779,
"step": 600
},
{
"epoch": 0.5865384615384616,
"grad_norm": 2.043423090422744,
"learning_rate": 8.90466904953579e-06,
"loss": 0.8142,
"step": 610
},
{
"epoch": 0.5961538461538461,
"grad_norm": 2.1520676063407,
"learning_rate": 8.851710294440974e-06,
"loss": 0.804,
"step": 620
},
{
"epoch": 0.6057692307692307,
"grad_norm": 1.8353482991425896,
"learning_rate": 8.797666784801954e-06,
"loss": 0.7701,
"step": 630
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.8647174071179153,
"learning_rate": 8.742553740855507e-06,
"loss": 0.7641,
"step": 640
},
{
"epoch": 0.625,
"grad_norm": 1.859731623435647,
"learning_rate": 8.68638668405062e-06,
"loss": 0.8293,
"step": 650
},
{
"epoch": 0.6346153846153846,
"grad_norm": 2.035763511676626,
"learning_rate": 8.629181432677213e-06,
"loss": 0.8386,
"step": 660
},
{
"epoch": 0.6442307692307693,
"grad_norm": 1.9498309927555528,
"learning_rate": 8.570954097411224e-06,
"loss": 0.826,
"step": 670
},
{
"epoch": 0.6538461538461539,
"grad_norm": 2.157631582471241,
"learning_rate": 8.511721076777388e-06,
"loss": 0.7933,
"step": 680
},
{
"epoch": 0.6634615384615384,
"grad_norm": 1.817540210529388,
"learning_rate": 8.451499052530923e-06,
"loss": 0.794,
"step": 690
},
{
"epoch": 0.6730769230769231,
"grad_norm": 2.003699550055361,
"learning_rate": 8.390304984959455e-06,
"loss": 0.8152,
"step": 700
},
{
"epoch": 0.6826923076923077,
"grad_norm": 1.7853207622424534,
"learning_rate": 8.328156108106518e-06,
"loss": 0.7869,
"step": 710
},
{
"epoch": 0.6923076923076923,
"grad_norm": 1.951815011854146,
"learning_rate": 8.265069924917925e-06,
"loss": 0.8086,
"step": 720
},
{
"epoch": 0.7019230769230769,
"grad_norm": 1.904892870037223,
"learning_rate": 8.20106420231244e-06,
"loss": 0.7959,
"step": 730
},
{
"epoch": 0.7115384615384616,
"grad_norm": 1.9417662866059235,
"learning_rate": 8.136156966178082e-06,
"loss": 0.8072,
"step": 740
},
{
"epoch": 0.7211538461538461,
"grad_norm": 1.8480817149859605,
"learning_rate": 8.070366496295505e-06,
"loss": 0.7814,
"step": 750
},
{
"epoch": 0.7307692307692307,
"grad_norm": 1.8327729681591678,
"learning_rate": 8.003711321189895e-06,
"loss": 0.7786,
"step": 760
},
{
"epoch": 0.7403846153846154,
"grad_norm": 2.1198206040027494,
"learning_rate": 7.93621021291277e-06,
"loss": 0.7485,
"step": 770
},
{
"epoch": 0.75,
"grad_norm": 1.9784614334378419,
"learning_rate": 7.86788218175523e-06,
"loss": 0.7736,
"step": 780
},
{
"epoch": 0.7596153846153846,
"grad_norm": 1.733613545143569,
"learning_rate": 7.798746470894113e-06,
"loss": 0.7613,
"step": 790
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.7905885212318795,
"learning_rate": 7.728822550972523e-06,
"loss": 0.7902,
"step": 800
},
{
"epoch": 0.7788461538461539,
"grad_norm": 2.02868421572116,
"learning_rate": 7.658130114616364e-06,
"loss": 0.7962,
"step": 810
},
{
"epoch": 0.7884615384615384,
"grad_norm": 1.8163329758157731,
"learning_rate": 7.586689070888284e-06,
"loss": 0.7625,
"step": 820
},
{
"epoch": 0.7980769230769231,
"grad_norm": 2.0352374441953542,
"learning_rate": 7.5145195396807244e-06,
"loss": 0.7836,
"step": 830
},
{
"epoch": 0.8076923076923077,
"grad_norm": 2.0684855751019144,
"learning_rate": 7.441641846049557e-06,
"loss": 0.8016,
"step": 840
},
{
"epoch": 0.8173076923076923,
"grad_norm": 2.042151868500934,
"learning_rate": 7.368076514489947e-06,
"loss": 0.8012,
"step": 850
},
{
"epoch": 0.8269230769230769,
"grad_norm": 1.9253750246288859,
"learning_rate": 7.2938442631560714e-06,
"loss": 0.799,
"step": 860
},
{
"epoch": 0.8365384615384616,
"grad_norm": 1.8896143274467088,
"learning_rate": 7.218965998026259e-06,
"loss": 0.8099,
"step": 870
},
{
"epoch": 0.8461538461538461,
"grad_norm": 1.7590979198633765,
"learning_rate": 7.143462807015271e-06,
"loss": 0.7726,
"step": 880
},
{
"epoch": 0.8557692307692307,
"grad_norm": 1.9332789610600787,
"learning_rate": 7.067355954035316e-06,
"loss": 0.808,
"step": 890
},
{
"epoch": 0.8653846153846154,
"grad_norm": 1.8265190820741775,
"learning_rate": 6.990666873007506e-06,
"loss": 0.7604,
"step": 900
},
{
"epoch": 0.875,
"grad_norm": 1.8486951872238018,
"learning_rate": 6.913417161825449e-06,
"loss": 0.7861,
"step": 910
},
{
"epoch": 0.8846153846153846,
"grad_norm": 2.0473377615419412,
"learning_rate": 6.8356285762726385e-06,
"loss": 0.7618,
"step": 920
},
{
"epoch": 0.8942307692307693,
"grad_norm": 1.7908009618022396,
"learning_rate": 6.757323023895388e-06,
"loss": 0.7547,
"step": 930
},
{
"epoch": 0.9038461538461539,
"grad_norm": 1.7499349918180362,
"learning_rate": 6.678522557833025e-06,
"loss": 0.7595,
"step": 940
},
{
"epoch": 0.9134615384615384,
"grad_norm": 1.9193288351945685,
"learning_rate": 6.599249370607083e-06,
"loss": 0.7465,
"step": 950
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.7382141318072688,
"learning_rate": 6.519525787871235e-06,
"loss": 0.7829,
"step": 960
},
{
"epoch": 0.9326923076923077,
"grad_norm": 1.6639815518816288,
"learning_rate": 6.439374262123731e-06,
"loss": 0.7483,
"step": 970
},
{
"epoch": 0.9423076923076923,
"grad_norm": 2.040809216745246,
"learning_rate": 6.358817366384122e-06,
"loss": 0.7695,
"step": 980
},
{
"epoch": 0.9519230769230769,
"grad_norm": 1.9765992795556684,
"learning_rate": 6.277877787836034e-06,
"loss": 0.8039,
"step": 990
},
{
"epoch": 0.9615384615384616,
"grad_norm": 1.7379165898055615,
"learning_rate": 6.1965783214377895e-06,
"loss": 0.7708,
"step": 1000
},
{
"epoch": 0.9615384615384616,
"eval_loss": 0.757188618183136,
"eval_runtime": 30.718,
"eval_samples_per_second": 60.193,
"eval_steps_per_second": 7.553,
"step": 1000
},
{
"epoch": 0.9711538461538461,
"grad_norm": 1.91002204642886,
"learning_rate": 6.114941863502682e-06,
"loss": 0.791,
"step": 1010
},
{
"epoch": 0.9807692307692307,
"grad_norm": 1.7547449600143168,
"learning_rate": 6.032991405250702e-06,
"loss": 0.7584,
"step": 1020
},
{
"epoch": 0.9903846153846154,
"grad_norm": 1.7518146591240198,
"learning_rate": 5.950750026333534e-06,
"loss": 0.7911,
"step": 1030
},
{
"epoch": 1.0,
"grad_norm": 1.8856712522978225,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.7762,
"step": 1040
},
{
"epoch": 1.0096153846153846,
"grad_norm": 2.6930087290136417,
"learning_rate": 5.785487228246339e-06,
"loss": 0.6172,
"step": 1050
},
{
"epoch": 1.0192307692307692,
"grad_norm": 2.0128509132494217,
"learning_rate": 5.7025123519254644e-06,
"loss": 0.5335,
"step": 1060
},
{
"epoch": 1.0288461538461537,
"grad_norm": 2.081106502499814,
"learning_rate": 5.619339627529876e-06,
"loss": 0.5475,
"step": 1070
},
{
"epoch": 1.0384615384615385,
"grad_norm": 2.089872779664623,
"learning_rate": 5.53599247893724e-06,
"loss": 0.5491,
"step": 1080
},
{
"epoch": 1.0480769230769231,
"grad_norm": 2.487028007682742,
"learning_rate": 5.45249437914819e-06,
"loss": 0.5396,
"step": 1090
},
{
"epoch": 1.0576923076923077,
"grad_norm": 2.1183702883792015,
"learning_rate": 5.368868843675642e-06,
"loss": 0.5602,
"step": 1100
},
{
"epoch": 1.0673076923076923,
"grad_norm": 2.231623903215653,
"learning_rate": 5.285139423922138e-06,
"loss": 0.5341,
"step": 1110
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.8590648999967128,
"learning_rate": 5.201329700547077e-06,
"loss": 0.5499,
"step": 1120
},
{
"epoch": 1.0865384615384615,
"grad_norm": 2.031456951588171,
"learning_rate": 5.117463276825711e-06,
"loss": 0.5541,
"step": 1130
},
{
"epoch": 1.0961538461538463,
"grad_norm": 1.996025281675884,
"learning_rate": 5.033563772001782e-06,
"loss": 0.5229,
"step": 1140
},
{
"epoch": 1.1057692307692308,
"grad_norm": 2.01803070793536,
"learning_rate": 4.949654814635623e-06,
"loss": 0.5534,
"step": 1150
},
{
"epoch": 1.1153846153846154,
"grad_norm": 2.0131103606600163,
"learning_rate": 4.865760035949695e-06,
"loss": 0.5503,
"step": 1160
},
{
"epoch": 1.125,
"grad_norm": 1.9109776540155083,
"learning_rate": 4.781903063173321e-06,
"loss": 0.5587,
"step": 1170
},
{
"epoch": 1.1346153846153846,
"grad_norm": 2.240195731738998,
"learning_rate": 4.69810751288857e-06,
"loss": 0.5345,
"step": 1180
},
{
"epoch": 1.1442307692307692,
"grad_norm": 2.21815291482556,
"learning_rate": 4.61439698437914e-06,
"loss": 0.5105,
"step": 1190
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.959424085301074,
"learning_rate": 4.530795052984104e-06,
"loss": 0.5472,
"step": 1200
},
{
"epoch": 1.1634615384615385,
"grad_norm": 1.9724626057174217,
"learning_rate": 4.447325263458401e-06,
"loss": 0.5425,
"step": 1210
},
{
"epoch": 1.1730769230769231,
"grad_norm": 1.9404277504052594,
"learning_rate": 4.364011123341947e-06,
"loss": 0.5383,
"step": 1220
},
{
"epoch": 1.1826923076923077,
"grad_norm": 1.975957154057773,
"learning_rate": 4.280876096339222e-06,
"loss": 0.5635,
"step": 1230
},
{
"epoch": 1.1923076923076923,
"grad_norm": 3.7893846099890203,
"learning_rate": 4.1979435957111984e-06,
"loss": 0.5508,
"step": 1240
},
{
"epoch": 1.2019230769230769,
"grad_norm": 2.088379071314605,
"learning_rate": 4.11523697768149e-06,
"loss": 0.5917,
"step": 1250
},
{
"epoch": 1.2115384615384615,
"grad_norm": 2.0699986543823914,
"learning_rate": 4.032779534858544e-06,
"loss": 0.5549,
"step": 1260
},
{
"epoch": 1.2211538461538463,
"grad_norm": 2.4531750427167744,
"learning_rate": 3.9505944896757635e-06,
"loss": 0.5314,
"step": 1270
},
{
"epoch": 1.2307692307692308,
"grad_norm": 2.130560765169762,
"learning_rate": 3.86870498785139e-06,
"loss": 0.5327,
"step": 1280
},
{
"epoch": 1.2403846153846154,
"grad_norm": 2.2163755882504947,
"learning_rate": 3.7871340918699945e-06,
"loss": 0.509,
"step": 1290
},
{
"epoch": 1.25,
"grad_norm": 2.1107171131060136,
"learning_rate": 3.705904774487396e-06,
"loss": 0.5153,
"step": 1300
},
{
"epoch": 1.2596153846153846,
"grad_norm": 2.1786622272309413,
"learning_rate": 3.6250399122608713e-06,
"loss": 0.5589,
"step": 1310
},
{
"epoch": 1.2692307692307692,
"grad_norm": 1.9962608207111503,
"learning_rate": 3.544562279106436e-06,
"loss": 0.5425,
"step": 1320
},
{
"epoch": 1.2788461538461537,
"grad_norm": 1.8464394968329445,
"learning_rate": 3.464494539885047e-06,
"loss": 0.5349,
"step": 1330
},
{
"epoch": 1.2884615384615383,
"grad_norm": 2.138267726845245,
"learning_rate": 3.3848592440195118e-06,
"loss": 0.5461,
"step": 1340
},
{
"epoch": 1.2980769230769231,
"grad_norm": 2.0087619862264767,
"learning_rate": 3.3056788191439116e-06,
"loss": 0.559,
"step": 1350
},
{
"epoch": 1.3076923076923077,
"grad_norm": 2.1318708987886854,
"learning_rate": 3.226975564787322e-06,
"loss": 0.5326,
"step": 1360
},
{
"epoch": 1.3173076923076923,
"grad_norm": 2.0595039744355725,
"learning_rate": 3.148771646093608e-06,
"loss": 0.5215,
"step": 1370
},
{
"epoch": 1.3269230769230769,
"grad_norm": 2.106731128516135,
"learning_rate": 3.0710890875790745e-06,
"loss": 0.5205,
"step": 1380
},
{
"epoch": 1.3365384615384617,
"grad_norm": 2.015598237138461,
"learning_rate": 2.993949766929711e-06,
"loss": 0.5155,
"step": 1390
},
{
"epoch": 1.3461538461538463,
"grad_norm": 2.089932096970866,
"learning_rate": 2.917375408839803e-06,
"loss": 0.5321,
"step": 1400
},
{
"epoch": 1.3557692307692308,
"grad_norm": 2.1011121825872414,
"learning_rate": 2.8413875788936067e-06,
"loss": 0.5654,
"step": 1410
},
{
"epoch": 1.3653846153846154,
"grad_norm": 2.5049128653170607,
"learning_rate": 2.766007677491871e-06,
"loss": 0.5332,
"step": 1420
},
{
"epoch": 1.375,
"grad_norm": 2.199796386555421,
"learning_rate": 2.6912569338248317e-06,
"loss": 0.5129,
"step": 1430
},
{
"epoch": 1.3846153846153846,
"grad_norm": 2.122643085103428,
"learning_rate": 2.6171563998934605e-06,
"loss": 0.5315,
"step": 1440
},
{
"epoch": 1.3942307692307692,
"grad_norm": 1.9680294145904305,
"learning_rate": 2.5437269445806146e-06,
"loss": 0.5388,
"step": 1450
},
{
"epoch": 1.4038461538461537,
"grad_norm": 2.1649286481102505,
"learning_rate": 2.4709892477737263e-06,
"loss": 0.5449,
"step": 1460
},
{
"epoch": 1.4134615384615383,
"grad_norm": 1.9108821348630196,
"learning_rate": 2.3989637945407547e-06,
"loss": 0.5124,
"step": 1470
},
{
"epoch": 1.4230769230769231,
"grad_norm": 1.8705842302466216,
"learning_rate": 2.3276708693609947e-06,
"loss": 0.5271,
"step": 1480
},
{
"epoch": 1.4326923076923077,
"grad_norm": 2.218892686030775,
"learning_rate": 2.2571305504123547e-06,
"loss": 0.536,
"step": 1490
},
{
"epoch": 1.4423076923076923,
"grad_norm": 2.151762003437118,
"learning_rate": 2.187362703916766e-06,
"loss": 0.5513,
"step": 1500
},
{
"epoch": 1.4423076923076923,
"eval_loss": 0.7693456411361694,
"eval_runtime": 31.5006,
"eval_samples_per_second": 58.697,
"eval_steps_per_second": 7.365,
"step": 1500
},
{
"epoch": 1.4519230769230769,
"grad_norm": 2.2335086500452768,
"learning_rate": 2.1183869785452744e-06,
"loss": 0.5343,
"step": 1510
},
{
"epoch": 1.4615384615384617,
"grad_norm": 2.201272848216364,
"learning_rate": 2.050222799884387e-06,
"loss": 0.5203,
"step": 1520
},
{
"epoch": 1.4711538461538463,
"grad_norm": 2.245522812744825,
"learning_rate": 1.9828893649652653e-06,
"loss": 0.5492,
"step": 1530
},
{
"epoch": 1.4807692307692308,
"grad_norm": 2.4219731931643134,
"learning_rate": 1.9164056368572847e-06,
"loss": 0.531,
"step": 1540
},
{
"epoch": 1.4903846153846154,
"grad_norm": 1.9852458155211052,
"learning_rate": 1.8507903393274622e-06,
"loss": 0.5413,
"step": 1550
},
{
"epoch": 1.5,
"grad_norm": 2.057532782936977,
"learning_rate": 1.7860619515673034e-06,
"loss": 0.5271,
"step": 1560
},
{
"epoch": 1.5096153846153846,
"grad_norm": 2.3471986544512187,
"learning_rate": 1.7222387029885268e-06,
"loss": 0.5579,
"step": 1570
},
{
"epoch": 1.5192307692307692,
"grad_norm": 2.12927805118844,
"learning_rate": 1.6593385680891139e-06,
"loss": 0.524,
"step": 1580
},
{
"epoch": 1.5288461538461537,
"grad_norm": 2.3452167989714785,
"learning_rate": 1.5973792613911698e-06,
"loss": 0.5263,
"step": 1590
},
{
"epoch": 1.5384615384615383,
"grad_norm": 2.3303569386029115,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.5075,
"step": 1600
},
{
"epoch": 1.5480769230769231,
"grad_norm": 2.3477866769254514,
"learning_rate": 1.476352660949802e-06,
"loss": 0.5611,
"step": 1610
},
{
"epoch": 1.5576923076923077,
"grad_norm": 1.9831558221073002,
"learning_rate": 1.4173194518453415e-06,
"loss": 0.5124,
"step": 1620
},
{
"epoch": 1.5673076923076923,
"grad_norm": 2.5946859927202732,
"learning_rate": 1.3592952306210589e-06,
"loss": 0.5193,
"step": 1630
},
{
"epoch": 1.5769230769230769,
"grad_norm": 2.3095838494615433,
"learning_rate": 1.3022963385988153e-06,
"loss": 0.5374,
"step": 1640
},
{
"epoch": 1.5865384615384617,
"grad_norm": 2.0028485071747806,
"learning_rate": 1.246338828337707e-06,
"loss": 0.512,
"step": 1650
},
{
"epoch": 1.5961538461538463,
"grad_norm": 2.029535183303286,
"learning_rate": 1.1914384591132045e-06,
"loss": 0.5128,
"step": 1660
},
{
"epoch": 1.6057692307692308,
"grad_norm": 2.147413310284848,
"learning_rate": 1.1376106924788594e-06,
"loss": 0.5316,
"step": 1670
},
{
"epoch": 1.6153846153846154,
"grad_norm": 2.3899145312729693,
"learning_rate": 1.0848706879118893e-06,
"loss": 0.5658,
"step": 1680
},
{
"epoch": 1.625,
"grad_norm": 2.0348123827167335,
"learning_rate": 1.0332332985438248e-06,
"loss": 0.5269,
"step": 1690
},
{
"epoch": 1.6346153846153846,
"grad_norm": 1.8531683575644768,
"learning_rate": 9.82713066977427e-07,
"loss": 0.5405,
"step": 1700
},
{
"epoch": 1.6442307692307692,
"grad_norm": 1.9592530320960306,
"learning_rate": 9.333242211910687e-07,
"loss": 0.5184,
"step": 1710
},
{
"epoch": 1.6538461538461537,
"grad_norm": 1.9954409383029053,
"learning_rate": 8.850806705317183e-07,
"loss": 0.4852,
"step": 1720
},
{
"epoch": 1.6634615384615383,
"grad_norm": 2.098998703354889,
"learning_rate": 8.379960017976546e-07,
"loss": 0.5139,
"step": 1730
},
{
"epoch": 1.6730769230769231,
"grad_norm": 2.5491851954980658,
"learning_rate": 7.920834754120305e-07,
"loss": 0.5392,
"step": 1740
},
{
"epoch": 1.6826923076923077,
"grad_norm": 2.3935228227975,
"learning_rate": 7.473560216883524e-07,
"loss": 0.533,
"step": 1750
},
{
"epoch": 1.6923076923076923,
"grad_norm": 2.170304242236588,
"learning_rate": 7.03826237188916e-07,
"loss": 0.5523,
"step": 1760
},
{
"epoch": 1.7019230769230769,
"grad_norm": 2.238634395178553,
"learning_rate": 6.615063811772532e-07,
"loss": 0.5122,
"step": 1770
},
{
"epoch": 1.7115384615384617,
"grad_norm": 2.0570639625612985,
"learning_rate": 6.204083721655607e-07,
"loss": 0.4928,
"step": 1780
},
{
"epoch": 1.7211538461538463,
"grad_norm": 2.103504186819526,
"learning_rate": 5.805437845580958e-07,
"loss": 0.5203,
"step": 1790
},
{
"epoch": 1.7307692307692308,
"grad_norm": 2.197105961623965,
"learning_rate": 5.41923845391486e-07,
"loss": 0.5124,
"step": 1800
},
{
"epoch": 1.7403846153846154,
"grad_norm": 2.1152456056469227,
"learning_rate": 5.045594311728708e-07,
"loss": 0.4916,
"step": 1810
},
{
"epoch": 1.75,
"grad_norm": 2.0939964989579973,
"learning_rate": 4.6846106481675035e-07,
"loss": 0.5111,
"step": 1820
},
{
"epoch": 1.7596153846153846,
"grad_norm": 2.4677432405873256,
"learning_rate": 4.336389126814311e-07,
"loss": 0.5346,
"step": 1830
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.9790895263145134,
"learning_rate": 4.001027817058789e-07,
"loss": 0.5051,
"step": 1840
},
{
"epoch": 1.7788461538461537,
"grad_norm": 2.2327516887468417,
"learning_rate": 3.6786211664779583e-07,
"loss": 0.5376,
"step": 1850
},
{
"epoch": 1.7884615384615383,
"grad_norm": 2.0971604625086777,
"learning_rate": 3.369259974236988e-07,
"loss": 0.4795,
"step": 1860
},
{
"epoch": 1.7980769230769231,
"grad_norm": 2.124716857194864,
"learning_rate": 3.0730313655175647e-07,
"loss": 0.5388,
"step": 1870
},
{
"epoch": 1.8076923076923077,
"grad_norm": 2.1095913682765977,
"learning_rate": 2.790018766980773e-07,
"loss": 0.545,
"step": 1880
},
{
"epoch": 1.8173076923076923,
"grad_norm": 2.0130617931416275,
"learning_rate": 2.520301883271797e-07,
"loss": 0.5169,
"step": 1890
},
{
"epoch": 1.8269230769230769,
"grad_norm": 2.083745653036941,
"learning_rate": 2.2639566745727203e-07,
"loss": 0.4928,
"step": 1900
},
{
"epoch": 1.8365384615384617,
"grad_norm": 2.220101162444105,
"learning_rate": 2.0210553352098815e-07,
"loss": 0.5075,
"step": 1910
},
{
"epoch": 1.8461538461538463,
"grad_norm": 2.308065945824805,
"learning_rate": 1.7916662733218848e-07,
"loss": 0.5116,
"step": 1920
},
{
"epoch": 1.8557692307692308,
"grad_norm": 1.9607562596689283,
"learning_rate": 1.575854091593837e-07,
"loss": 0.5183,
"step": 1930
},
{
"epoch": 1.8653846153846154,
"grad_norm": 2.1443103265105634,
"learning_rate": 1.3736795690633353e-07,
"loss": 0.5123,
"step": 1940
},
{
"epoch": 1.875,
"grad_norm": 1.9320175413394869,
"learning_rate": 1.185199644003332e-07,
"loss": 0.4931,
"step": 1950
},
{
"epoch": 1.8846153846153846,
"grad_norm": 2.2160725485465567,
"learning_rate": 1.0104673978866164e-07,
"loss": 0.513,
"step": 1960
},
{
"epoch": 1.8942307692307692,
"grad_norm": 2.2787872837737697,
"learning_rate": 8.495320404365348e-08,
"loss": 0.5062,
"step": 1970
},
{
"epoch": 1.9038461538461537,
"grad_norm": 2.168799751436692,
"learning_rate": 7.024388957680705e-08,
"loss": 0.5168,
"step": 1980
},
{
"epoch": 1.9134615384615383,
"grad_norm": 2.064037817068646,
"learning_rate": 5.6922938962329364e-08,
"loss": 0.5154,
"step": 1990
},
{
"epoch": 1.9230769230769231,
"grad_norm": 2.085831001764275,
"learning_rate": 4.499410377045765e-08,
"loss": 0.5059,
"step": 2000
},
{
"epoch": 1.9230769230769231,
"eval_loss": 0.7636520862579346,
"eval_runtime": 31.002,
"eval_samples_per_second": 59.641,
"eval_steps_per_second": 7.483,
"step": 2000
},
{
"epoch": 1.9326923076923077,
"grad_norm": 2.0866363192357245,
"learning_rate": 3.446074351091566e-08,
"loss": 0.535,
"step": 2010
},
{
"epoch": 1.9423076923076923,
"grad_norm": 1.9303541701779643,
"learning_rate": 2.5325824686772138e-08,
"loss": 0.5193,
"step": 2020
},
{
"epoch": 1.9519230769230769,
"grad_norm": 2.035058945568711,
"learning_rate": 1.7591919958986348e-08,
"loss": 0.5242,
"step": 2030
},
{
"epoch": 1.9615384615384617,
"grad_norm": 1.906676822404482,
"learning_rate": 1.1261207421874309e-08,
"loss": 0.4826,
"step": 2040
},
{
"epoch": 1.9711538461538463,
"grad_norm": 2.0734277093423965,
"learning_rate": 6.335469989692255e-09,
"loss": 0.5064,
"step": 2050
},
{
"epoch": 1.9807692307692308,
"grad_norm": 2.320470177512831,
"learning_rate": 2.816094894513843e-09,
"loss": 0.5264,
"step": 2060
},
{
"epoch": 1.9903846153846154,
"grad_norm": 2.2494248679357955,
"learning_rate": 7.040732955487795e-10,
"loss": 0.51,
"step": 2070
},
{
"epoch": 2.0,
"grad_norm": 2.2699709074042307,
"learning_rate": 0.0,
"loss": 0.5211,
"step": 2080
},
{
"epoch": 2.0,
"step": 2080,
"total_flos": 34318199685120.0,
"train_loss": 0.6641153321816371,
"train_runtime": 2465.8736,
"train_samples_per_second": 13.491,
"train_steps_per_second": 0.844
}
],
"logging_steps": 10,
"max_steps": 2080,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 34318199685120.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}