{
"best_metric": 0.032022152096033096,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 3.004424778761062,
"eval_steps": 50,
"global_step": 169,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017699115044247787,
"grad_norm": 1.0721659660339355,
"learning_rate": 1e-05,
"loss": 0.9325,
"step": 1
},
{
"epoch": 0.017699115044247787,
"eval_loss": 1.3935922384262085,
"eval_runtime": 6.5562,
"eval_samples_per_second": 14.49,
"eval_steps_per_second": 3.661,
"step": 1
},
{
"epoch": 0.035398230088495575,
"grad_norm": 1.2187706232070923,
"learning_rate": 2e-05,
"loss": 1.1,
"step": 2
},
{
"epoch": 0.05309734513274336,
"grad_norm": 1.1939740180969238,
"learning_rate": 3e-05,
"loss": 1.0582,
"step": 3
},
{
"epoch": 0.07079646017699115,
"grad_norm": 1.1176855564117432,
"learning_rate": 4e-05,
"loss": 1.0807,
"step": 4
},
{
"epoch": 0.08849557522123894,
"grad_norm": 1.3982747793197632,
"learning_rate": 5e-05,
"loss": 1.2339,
"step": 5
},
{
"epoch": 0.10619469026548672,
"grad_norm": 1.3325220346450806,
"learning_rate": 6e-05,
"loss": 1.1153,
"step": 6
},
{
"epoch": 0.12389380530973451,
"grad_norm": 1.265844702720642,
"learning_rate": 7e-05,
"loss": 0.958,
"step": 7
},
{
"epoch": 0.1415929203539823,
"grad_norm": 1.1271923780441284,
"learning_rate": 8e-05,
"loss": 0.7678,
"step": 8
},
{
"epoch": 0.1592920353982301,
"grad_norm": 1.0350992679595947,
"learning_rate": 9e-05,
"loss": 0.586,
"step": 9
},
{
"epoch": 0.17699115044247787,
"grad_norm": 1.1207027435302734,
"learning_rate": 0.0001,
"loss": 0.5873,
"step": 10
},
{
"epoch": 0.19469026548672566,
"grad_norm": 1.0533980131149292,
"learning_rate": 9.999024041442456e-05,
"loss": 0.4849,
"step": 11
},
{
"epoch": 0.21238938053097345,
"grad_norm": 1.0255162715911865,
"learning_rate": 9.99609654676786e-05,
"loss": 0.4021,
"step": 12
},
{
"epoch": 0.23008849557522124,
"grad_norm": 1.1924771070480347,
"learning_rate": 9.991218658821608e-05,
"loss": 0.4131,
"step": 13
},
{
"epoch": 0.24778761061946902,
"grad_norm": 1.1136771440505981,
"learning_rate": 9.984392281850293e-05,
"loss": 0.2954,
"step": 14
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.9027830958366394,
"learning_rate": 9.97562008075832e-05,
"loss": 0.2755,
"step": 15
},
{
"epoch": 0.2831858407079646,
"grad_norm": 0.6252666711807251,
"learning_rate": 9.964905480067586e-05,
"loss": 0.1862,
"step": 16
},
{
"epoch": 0.3008849557522124,
"grad_norm": 0.5391494631767273,
"learning_rate": 9.952252662580579e-05,
"loss": 0.1796,
"step": 17
},
{
"epoch": 0.3185840707964602,
"grad_norm": 0.5780326724052429,
"learning_rate": 9.937666567747501e-05,
"loss": 0.1702,
"step": 18
},
{
"epoch": 0.336283185840708,
"grad_norm": 0.579827070236206,
"learning_rate": 9.921152889737984e-05,
"loss": 0.1355,
"step": 19
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.5776816010475159,
"learning_rate": 9.902718075218176e-05,
"loss": 0.1651,
"step": 20
},
{
"epoch": 0.37168141592920356,
"grad_norm": 0.5652853846549988,
"learning_rate": 9.882369320834069e-05,
"loss": 0.1378,
"step": 21
},
{
"epoch": 0.3893805309734513,
"grad_norm": 0.5807967782020569,
"learning_rate": 9.860114570402054e-05,
"loss": 0.1444,
"step": 22
},
{
"epoch": 0.40707964601769914,
"grad_norm": 0.46384990215301514,
"learning_rate": 9.835962511807786e-05,
"loss": 0.1163,
"step": 23
},
{
"epoch": 0.4247787610619469,
"grad_norm": 0.5239526033401489,
"learning_rate": 9.809922573614569e-05,
"loss": 0.1363,
"step": 24
},
{
"epoch": 0.4424778761061947,
"grad_norm": 0.5285527110099792,
"learning_rate": 9.782004921382612e-05,
"loss": 0.1366,
"step": 25
},
{
"epoch": 0.46017699115044247,
"grad_norm": 0.6417925953865051,
"learning_rate": 9.752220453700556e-05,
"loss": 0.1262,
"step": 26
},
{
"epoch": 0.4778761061946903,
"grad_norm": 0.5849817395210266,
"learning_rate": 9.720580797930845e-05,
"loss": 0.1117,
"step": 27
},
{
"epoch": 0.49557522123893805,
"grad_norm": 0.6580327749252319,
"learning_rate": 9.687098305670605e-05,
"loss": 0.1371,
"step": 28
},
{
"epoch": 0.5132743362831859,
"grad_norm": 0.43546798825263977,
"learning_rate": 9.651786047929773e-05,
"loss": 0.1155,
"step": 29
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.42823654413223267,
"learning_rate": 9.614657810028402e-05,
"loss": 0.1186,
"step": 30
},
{
"epoch": 0.5486725663716814,
"grad_norm": 0.39037245512008667,
"learning_rate": 9.575728086215092e-05,
"loss": 0.1127,
"step": 31
},
{
"epoch": 0.5663716814159292,
"grad_norm": 0.3906686305999756,
"learning_rate": 9.535012074008687e-05,
"loss": 0.0985,
"step": 32
},
{
"epoch": 0.584070796460177,
"grad_norm": 0.32296302914619446,
"learning_rate": 9.492525668265399e-05,
"loss": 0.0729,
"step": 33
},
{
"epoch": 0.6017699115044248,
"grad_norm": 0.3804223835468292,
"learning_rate": 9.448285454973738e-05,
"loss": 0.0887,
"step": 34
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.528093695640564,
"learning_rate": 9.402308704779599e-05,
"loss": 0.0873,
"step": 35
},
{
"epoch": 0.6371681415929203,
"grad_norm": 0.43243497610092163,
"learning_rate": 9.354613366244108e-05,
"loss": 0.1039,
"step": 36
},
{
"epoch": 0.6548672566371682,
"grad_norm": 0.33139994740486145,
"learning_rate": 9.305218058836778e-05,
"loss": 0.0861,
"step": 37
},
{
"epoch": 0.672566371681416,
"grad_norm": 0.5364376902580261,
"learning_rate": 9.254142065666801e-05,
"loss": 0.0946,
"step": 38
},
{
"epoch": 0.6902654867256637,
"grad_norm": 0.3789397180080414,
"learning_rate": 9.201405325955221e-05,
"loss": 0.0787,
"step": 39
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.445186585187912,
"learning_rate": 9.14702842725101e-05,
"loss": 0.0822,
"step": 40
},
{
"epoch": 0.7256637168141593,
"grad_norm": 0.3613705039024353,
"learning_rate": 9.091032597394012e-05,
"loss": 0.0774,
"step": 41
},
{
"epoch": 0.7433628318584071,
"grad_norm": 0.3099222779273987,
"learning_rate": 9.033439696227965e-05,
"loss": 0.0521,
"step": 42
},
{
"epoch": 0.7610619469026548,
"grad_norm": 0.27817392349243164,
"learning_rate": 8.974272207066767e-05,
"loss": 0.0867,
"step": 43
},
{
"epoch": 0.7787610619469026,
"grad_norm": 0.2684073746204376,
"learning_rate": 8.913553227917367e-05,
"loss": 0.061,
"step": 44
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.2929267883300781,
"learning_rate": 8.851306462462688e-05,
"loss": 0.0698,
"step": 45
},
{
"epoch": 0.8141592920353983,
"grad_norm": 0.3591836392879486,
"learning_rate": 8.787556210808101e-05,
"loss": 0.0653,
"step": 46
},
{
"epoch": 0.831858407079646,
"grad_norm": 0.36023128032684326,
"learning_rate": 8.722327359995064e-05,
"loss": 0.0733,
"step": 47
},
{
"epoch": 0.8495575221238938,
"grad_norm": 0.34157848358154297,
"learning_rate": 8.655645374285637e-05,
"loss": 0.0611,
"step": 48
},
{
"epoch": 0.8672566371681416,
"grad_norm": 0.3172074854373932,
"learning_rate": 8.587536285221656e-05,
"loss": 0.0689,
"step": 49
},
{
"epoch": 0.8849557522123894,
"grad_norm": 0.27993178367614746,
"learning_rate": 8.518026681462448e-05,
"loss": 0.0566,
"step": 50
},
{
"epoch": 0.8849557522123894,
"eval_loss": 0.06235470622777939,
"eval_runtime": 6.691,
"eval_samples_per_second": 14.198,
"eval_steps_per_second": 3.587,
"step": 50
},
{
"epoch": 0.9026548672566371,
"grad_norm": 0.21425798535346985,
"learning_rate": 8.44714369840506e-05,
"loss": 0.0481,
"step": 51
},
{
"epoch": 0.9203539823008849,
"grad_norm": 0.2822091579437256,
"learning_rate": 8.374915007591053e-05,
"loss": 0.0539,
"step": 52
},
{
"epoch": 0.9380530973451328,
"grad_norm": 0.3757535517215729,
"learning_rate": 8.301368805903988e-05,
"loss": 0.0578,
"step": 53
},
{
"epoch": 0.9557522123893806,
"grad_norm": 0.36872968077659607,
"learning_rate": 8.226533804561827e-05,
"loss": 0.0614,
"step": 54
},
{
"epoch": 0.9734513274336283,
"grad_norm": 0.4218049943447113,
"learning_rate": 8.150439217908556e-05,
"loss": 0.0822,
"step": 55
},
{
"epoch": 0.9911504424778761,
"grad_norm": 0.5659615993499756,
"learning_rate": 8.073114752009387e-05,
"loss": 0.0642,
"step": 56
},
{
"epoch": 1.0132743362831858,
"grad_norm": 0.9227009415626526,
"learning_rate": 7.994590593054001e-05,
"loss": 0.127,
"step": 57
},
{
"epoch": 1.0309734513274336,
"grad_norm": 0.20931123197078705,
"learning_rate": 7.91489739557236e-05,
"loss": 0.0463,
"step": 58
},
{
"epoch": 1.0486725663716814,
"grad_norm": 0.25171875953674316,
"learning_rate": 7.83406627046769e-05,
"loss": 0.0586,
"step": 59
},
{
"epoch": 1.0663716814159292,
"grad_norm": 0.24994538724422455,
"learning_rate": 7.752128772871292e-05,
"loss": 0.0424,
"step": 60
},
{
"epoch": 1.084070796460177,
"grad_norm": 0.23162053525447845,
"learning_rate": 7.669116889823955e-05,
"loss": 0.0436,
"step": 61
},
{
"epoch": 1.1017699115044248,
"grad_norm": 0.2777061462402344,
"learning_rate": 7.585063027788731e-05,
"loss": 0.043,
"step": 62
},
{
"epoch": 1.1194690265486726,
"grad_norm": 0.2399117797613144,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0475,
"step": 63
},
{
"epoch": 1.1371681415929205,
"grad_norm": 0.2516414225101471,
"learning_rate": 7.413961013653726e-05,
"loss": 0.0434,
"step": 64
},
{
"epoch": 1.154867256637168,
"grad_norm": 0.15977510809898376,
"learning_rate": 7.326979656943906e-05,
"loss": 0.0282,
"step": 65
},
{
"epoch": 1.1725663716814159,
"grad_norm": 0.23954959213733673,
"learning_rate": 7.239089885950316e-05,
"loss": 0.0324,
"step": 66
},
{
"epoch": 1.1902654867256637,
"grad_norm": 0.15419161319732666,
"learning_rate": 7.150326011382604e-05,
"loss": 0.0235,
"step": 67
},
{
"epoch": 1.2079646017699115,
"grad_norm": 0.2992177903652191,
"learning_rate": 7.060722685185961e-05,
"loss": 0.0317,
"step": 68
},
{
"epoch": 1.2256637168141593,
"grad_norm": 0.2721240222454071,
"learning_rate": 6.970314887013584e-05,
"loss": 0.0441,
"step": 69
},
{
"epoch": 1.2433628318584071,
"grad_norm": 0.19432015717029572,
"learning_rate": 6.879137910571191e-05,
"loss": 0.0235,
"step": 70
},
{
"epoch": 1.261061946902655,
"grad_norm": 0.2833891212940216,
"learning_rate": 6.787227349838947e-05,
"loss": 0.0408,
"step": 71
},
{
"epoch": 1.2787610619469025,
"grad_norm": 0.24525463581085205,
"learning_rate": 6.694619085176159e-05,
"loss": 0.0469,
"step": 72
},
{
"epoch": 1.2964601769911503,
"grad_norm": 0.2467905730009079,
"learning_rate": 6.601349269314188e-05,
"loss": 0.0446,
"step": 73
},
{
"epoch": 1.3141592920353982,
"grad_norm": 0.18387003242969513,
"learning_rate": 6.507454313243015e-05,
"loss": 0.0334,
"step": 74
},
{
"epoch": 1.331858407079646,
"grad_norm": 0.22559407353401184,
"learning_rate": 6.412970871996995e-05,
"loss": 0.0292,
"step": 75
},
{
"epoch": 1.3495575221238938,
"grad_norm": 0.18541377782821655,
"learning_rate": 6.317935830345338e-05,
"loss": 0.0351,
"step": 76
},
{
"epoch": 1.3672566371681416,
"grad_norm": 0.17538294196128845,
"learning_rate": 6.222386288392913e-05,
"loss": 0.0329,
"step": 77
},
{
"epoch": 1.3849557522123894,
"grad_norm": 0.21896906197071075,
"learning_rate": 6.126359547096975e-05,
"loss": 0.0313,
"step": 78
},
{
"epoch": 1.4026548672566372,
"grad_norm": 0.1702110767364502,
"learning_rate": 6.029893093705492e-05,
"loss": 0.0168,
"step": 79
},
{
"epoch": 1.420353982300885,
"grad_norm": 0.279254674911499,
"learning_rate": 5.9330245871227454e-05,
"loss": 0.0371,
"step": 80
},
{
"epoch": 1.4380530973451329,
"grad_norm": 0.20467723906040192,
"learning_rate": 5.835791843207916e-05,
"loss": 0.0251,
"step": 81
},
{
"epoch": 1.4557522123893805,
"grad_norm": 0.23105274140834808,
"learning_rate": 5.738232820012407e-05,
"loss": 0.0243,
"step": 82
},
{
"epoch": 1.4734513274336283,
"grad_norm": 0.2554505467414856,
"learning_rate": 5.640385602961634e-05,
"loss": 0.0226,
"step": 83
},
{
"epoch": 1.491150442477876,
"grad_norm": 0.3279307186603546,
"learning_rate": 5.5422883899871284e-05,
"loss": 0.0181,
"step": 84
},
{
"epoch": 1.508849557522124,
"grad_norm": 0.23609845340251923,
"learning_rate": 5.4439794766146746e-05,
"loss": 0.03,
"step": 85
},
{
"epoch": 1.5265486725663717,
"grad_norm": 0.19427117705345154,
"learning_rate": 5.34549724101439e-05,
"loss": 0.0283,
"step": 86
},
{
"epoch": 1.5442477876106193,
"grad_norm": 0.21057020127773285,
"learning_rate": 5.246880129018516e-05,
"loss": 0.0328,
"step": 87
},
{
"epoch": 1.5619469026548671,
"grad_norm": 0.157065749168396,
"learning_rate": 5.148166639112799e-05,
"loss": 0.0265,
"step": 88
},
{
"epoch": 1.579646017699115,
"grad_norm": 0.21837662160396576,
"learning_rate": 5.049395307407329e-05,
"loss": 0.0353,
"step": 89
},
{
"epoch": 1.5973451327433628,
"grad_norm": 0.2608608305454254,
"learning_rate": 4.950604692592672e-05,
"loss": 0.0445,
"step": 90
},
{
"epoch": 1.6150442477876106,
"grad_norm": 0.29214149713516235,
"learning_rate": 4.851833360887201e-05,
"loss": 0.0319,
"step": 91
},
{
"epoch": 1.6327433628318584,
"grad_norm": 0.3026266396045685,
"learning_rate": 4.7531198709814854e-05,
"loss": 0.0387,
"step": 92
},
{
"epoch": 1.6504424778761062,
"grad_norm": 0.1753360629081726,
"learning_rate": 4.654502758985611e-05,
"loss": 0.0178,
"step": 93
},
{
"epoch": 1.668141592920354,
"grad_norm": 0.29309213161468506,
"learning_rate": 4.5560205233853266e-05,
"loss": 0.0279,
"step": 94
},
{
"epoch": 1.6858407079646018,
"grad_norm": 0.1859235018491745,
"learning_rate": 4.4577116100128735e-05,
"loss": 0.0225,
"step": 95
},
{
"epoch": 1.7035398230088497,
"grad_norm": 0.21251869201660156,
"learning_rate": 4.3596143970383664e-05,
"loss": 0.028,
"step": 96
},
{
"epoch": 1.7212389380530975,
"grad_norm": 0.2775147259235382,
"learning_rate": 4.2617671799875944e-05,
"loss": 0.0323,
"step": 97
},
{
"epoch": 1.7389380530973453,
"grad_norm": 0.3155283033847809,
"learning_rate": 4.1642081567920846e-05,
"loss": 0.0194,
"step": 98
},
{
"epoch": 1.7566371681415929,
"grad_norm": 0.13303042948246002,
"learning_rate": 4.066975412877255e-05,
"loss": 0.0211,
"step": 99
},
{
"epoch": 1.7743362831858407,
"grad_norm": 0.19871264696121216,
"learning_rate": 3.970106906294509e-05,
"loss": 0.0302,
"step": 100
},
{
"epoch": 1.7743362831858407,
"eval_loss": 0.0379580594599247,
"eval_runtime": 6.6861,
"eval_samples_per_second": 14.209,
"eval_steps_per_second": 3.59,
"step": 100
},
{
"epoch": 1.7920353982300885,
"grad_norm": 0.20991753041744232,
"learning_rate": 3.873640452903026e-05,
"loss": 0.0253,
"step": 101
},
{
"epoch": 1.8097345132743363,
"grad_norm": 0.3313141167163849,
"learning_rate": 3.777613711607087e-05,
"loss": 0.0289,
"step": 102
},
{
"epoch": 1.827433628318584,
"grad_norm": 0.19206620752811432,
"learning_rate": 3.682064169654663e-05,
"loss": 0.0265,
"step": 103
},
{
"epoch": 1.8451327433628317,
"grad_norm": 0.4013370871543884,
"learning_rate": 3.587029128003006e-05,
"loss": 0.0308,
"step": 104
},
{
"epoch": 1.8628318584070795,
"grad_norm": 0.18024751543998718,
"learning_rate": 3.492545686756986e-05,
"loss": 0.0162,
"step": 105
},
{
"epoch": 1.8805309734513274,
"grad_norm": 0.2305198758840561,
"learning_rate": 3.3986507306858125e-05,
"loss": 0.0284,
"step": 106
},
{
"epoch": 1.8982300884955752,
"grad_norm": 0.19050440192222595,
"learning_rate": 3.3053809148238426e-05,
"loss": 0.0227,
"step": 107
},
{
"epoch": 1.915929203539823,
"grad_norm": 0.15006108582019806,
"learning_rate": 3.212772650161056e-05,
"loss": 0.0191,
"step": 108
},
{
"epoch": 1.9336283185840708,
"grad_norm": 0.2283119261264801,
"learning_rate": 3.12086208942881e-05,
"loss": 0.0227,
"step": 109
},
{
"epoch": 1.9513274336283186,
"grad_norm": 0.2260059416294098,
"learning_rate": 3.0296851129864168e-05,
"loss": 0.0253,
"step": 110
},
{
"epoch": 1.9690265486725664,
"grad_norm": 0.23939450085163116,
"learning_rate": 2.9392773148140408e-05,
"loss": 0.0265,
"step": 111
},
{
"epoch": 1.9867256637168142,
"grad_norm": 0.1815921664237976,
"learning_rate": 2.8496739886173995e-05,
"loss": 0.0216,
"step": 112
},
{
"epoch": 2.0088495575221237,
"grad_norm": 0.24775980412960052,
"learning_rate": 2.7609101140496863e-05,
"loss": 0.0232,
"step": 113
},
{
"epoch": 2.0265486725663715,
"grad_norm": 0.12413739413022995,
"learning_rate": 2.6730203430560947e-05,
"loss": 0.0206,
"step": 114
},
{
"epoch": 2.0442477876106193,
"grad_norm": 0.1339850276708603,
"learning_rate": 2.5860389863462765e-05,
"loss": 0.0196,
"step": 115
},
{
"epoch": 2.061946902654867,
"grad_norm": 0.13854354619979858,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0159,
"step": 116
},
{
"epoch": 2.079646017699115,
"grad_norm": 0.1154838278889656,
"learning_rate": 2.414936972211272e-05,
"loss": 0.0142,
"step": 117
},
{
"epoch": 2.0973451327433628,
"grad_norm": 0.10285928845405579,
"learning_rate": 2.3308831101760486e-05,
"loss": 0.01,
"step": 118
},
{
"epoch": 2.1150442477876106,
"grad_norm": 0.1806727647781372,
"learning_rate": 2.247871227128709e-05,
"loss": 0.0187,
"step": 119
},
{
"epoch": 2.1327433628318584,
"grad_norm": 0.19010406732559204,
"learning_rate": 2.1659337295323118e-05,
"loss": 0.0235,
"step": 120
},
{
"epoch": 2.150442477876106,
"grad_norm": 0.1291082799434662,
"learning_rate": 2.0851026044276406e-05,
"loss": 0.0144,
"step": 121
},
{
"epoch": 2.168141592920354,
"grad_norm": 0.16923344135284424,
"learning_rate": 2.005409406946e-05,
"loss": 0.0138,
"step": 122
},
{
"epoch": 2.185840707964602,
"grad_norm": 0.14110802114009857,
"learning_rate": 1.9268852479906147e-05,
"loss": 0.0119,
"step": 123
},
{
"epoch": 2.2035398230088497,
"grad_norm": 0.10906057059764862,
"learning_rate": 1.849560782091445e-05,
"loss": 0.009,
"step": 124
},
{
"epoch": 2.2212389380530975,
"grad_norm": 0.1177993193268776,
"learning_rate": 1.7734661954381754e-05,
"loss": 0.014,
"step": 125
},
{
"epoch": 2.2389380530973453,
"grad_norm": 0.10654302686452866,
"learning_rate": 1.6986311940960147e-05,
"loss": 0.0101,
"step": 126
},
{
"epoch": 2.256637168141593,
"grad_norm": 0.16150303184986115,
"learning_rate": 1.6250849924089484e-05,
"loss": 0.0168,
"step": 127
},
{
"epoch": 2.274336283185841,
"grad_norm": 0.12198394536972046,
"learning_rate": 1.552856301594942e-05,
"loss": 0.0136,
"step": 128
},
{
"epoch": 2.2920353982300883,
"grad_norm": 0.16058827936649323,
"learning_rate": 1.4819733185375534e-05,
"loss": 0.0196,
"step": 129
},
{
"epoch": 2.309734513274336,
"grad_norm": 0.19370707869529724,
"learning_rate": 1.4124637147783432e-05,
"loss": 0.0205,
"step": 130
},
{
"epoch": 2.327433628318584,
"grad_norm": 0.0963333398103714,
"learning_rate": 1.3443546257143624e-05,
"loss": 0.011,
"step": 131
},
{
"epoch": 2.3451327433628317,
"grad_norm": 0.1633400171995163,
"learning_rate": 1.277672640004936e-05,
"loss": 0.0208,
"step": 132
},
{
"epoch": 2.3628318584070795,
"grad_norm": 0.1224859431385994,
"learning_rate": 1.2124437891918993e-05,
"loss": 0.015,
"step": 133
},
{
"epoch": 2.3805309734513274,
"grad_norm": 0.10458586364984512,
"learning_rate": 1.1486935375373126e-05,
"loss": 0.0101,
"step": 134
},
{
"epoch": 2.398230088495575,
"grad_norm": 0.15571683645248413,
"learning_rate": 1.0864467720826343e-05,
"loss": 0.0162,
"step": 135
},
{
"epoch": 2.415929203539823,
"grad_norm": 0.08478286117315292,
"learning_rate": 1.0257277929332332e-05,
"loss": 0.0072,
"step": 136
},
{
"epoch": 2.433628318584071,
"grad_norm": 0.10365074872970581,
"learning_rate": 9.66560303772035e-06,
"loss": 0.0095,
"step": 137
},
{
"epoch": 2.4513274336283186,
"grad_norm": 0.07361367344856262,
"learning_rate": 9.08967402605988e-06,
"loss": 0.0076,
"step": 138
},
{
"epoch": 2.4690265486725664,
"grad_norm": 0.13091503083705902,
"learning_rate": 8.529715727489912e-06,
"loss": 0.0082,
"step": 139
},
{
"epoch": 2.4867256637168142,
"grad_norm": 0.11134269833564758,
"learning_rate": 7.985946740447791e-06,
"loss": 0.0079,
"step": 140
},
{
"epoch": 2.504424778761062,
"grad_norm": 0.18341496586799622,
"learning_rate": 7.458579343331995e-06,
"loss": 0.0129,
"step": 141
},
{
"epoch": 2.52212389380531,
"grad_norm": 0.11337222903966904,
"learning_rate": 6.947819411632223e-06,
"loss": 0.0173,
"step": 142
},
{
"epoch": 2.5398230088495577,
"grad_norm": 0.20701636373996735,
"learning_rate": 6.45386633755894e-06,
"loss": 0.0175,
"step": 143
},
{
"epoch": 2.557522123893805,
"grad_norm": 0.20868322253227234,
"learning_rate": 5.976912952204017e-06,
"loss": 0.0147,
"step": 144
},
{
"epoch": 2.5752212389380533,
"grad_norm": 0.10272178053855896,
"learning_rate": 5.51714545026264e-06,
"loss": 0.0128,
"step": 145
},
{
"epoch": 2.5929203539823007,
"grad_norm": 0.14340701699256897,
"learning_rate": 5.074743317346009e-06,
"loss": 0.0136,
"step": 146
},
{
"epoch": 2.6106194690265485,
"grad_norm": 0.12963014841079712,
"learning_rate": 4.649879259913137e-06,
"loss": 0.0093,
"step": 147
},
{
"epoch": 2.6283185840707963,
"grad_norm": 0.12100816518068314,
"learning_rate": 4.242719137849077e-06,
"loss": 0.0107,
"step": 148
},
{
"epoch": 2.646017699115044,
"grad_norm": 0.09687142819166183,
"learning_rate": 3.853421899715992e-06,
"loss": 0.0082,
"step": 149
},
{
"epoch": 2.663716814159292,
"grad_norm": 0.09663711488246918,
"learning_rate": 3.4821395207022766e-06,
"loss": 0.0081,
"step": 150
},
{
"epoch": 2.663716814159292,
"eval_loss": 0.032022152096033096,
"eval_runtime": 6.6983,
"eval_samples_per_second": 14.183,
"eval_steps_per_second": 3.583,
"step": 150
},
{
"epoch": 2.6814159292035398,
"grad_norm": 0.12883880734443665,
"learning_rate": 3.1290169432939553e-06,
"loss": 0.0114,
"step": 151
},
{
"epoch": 2.6991150442477876,
"grad_norm": 0.16089491546154022,
"learning_rate": 2.794192020691544e-06,
"loss": 0.0165,
"step": 152
},
{
"epoch": 2.7168141592920354,
"grad_norm": 0.25050851702690125,
"learning_rate": 2.4777954629944477e-06,
"loss": 0.0182,
"step": 153
},
{
"epoch": 2.734513274336283,
"grad_norm": 0.09175170958042145,
"learning_rate": 2.179950786173879e-06,
"loss": 0.0057,
"step": 154
},
{
"epoch": 2.752212389380531,
"grad_norm": 0.1262611448764801,
"learning_rate": 1.9007742638543102e-06,
"loss": 0.0119,
"step": 155
},
{
"epoch": 2.769911504424779,
"grad_norm": 0.13375544548034668,
"learning_rate": 1.6403748819221466e-06,
"loss": 0.0125,
"step": 156
},
{
"epoch": 2.7876106194690267,
"grad_norm": 0.09264933317899704,
"learning_rate": 1.3988542959794627e-06,
"loss": 0.0109,
"step": 157
},
{
"epoch": 2.8053097345132745,
"grad_norm": 0.10276864469051361,
"learning_rate": 1.1763067916593262e-06,
"loss": 0.0125,
"step": 158
},
{
"epoch": 2.823008849557522,
"grad_norm": 0.08319110423326492,
"learning_rate": 9.728192478182574e-07,
"loss": 0.0082,
"step": 159
},
{
"epoch": 2.84070796460177,
"grad_norm": 0.168908029794693,
"learning_rate": 7.884711026201585e-07,
"loss": 0.0144,
"step": 160
},
{
"epoch": 2.8584070796460175,
"grad_norm": 0.1352635771036148,
"learning_rate": 6.233343225249933e-07,
"loss": 0.0158,
"step": 161
},
{
"epoch": 2.8761061946902657,
"grad_norm": 0.09505495429039001,
"learning_rate": 4.774733741942206e-07,
"loss": 0.0099,
"step": 162
},
{
"epoch": 2.893805309734513,
"grad_norm": 0.0929616317152977,
"learning_rate": 3.5094519932415417e-07,
"loss": 0.0078,
"step": 163
},
{
"epoch": 2.911504424778761,
"grad_norm": 0.10294952243566513,
"learning_rate": 2.437991924167937e-07,
"loss": 0.012,
"step": 164
},
{
"epoch": 2.9292035398230087,
"grad_norm": 0.125913605093956,
"learning_rate": 1.560771814970885e-07,
"loss": 0.0088,
"step": 165
},
{
"epoch": 2.9469026548672566,
"grad_norm": 0.14280952513217926,
"learning_rate": 8.781341178393244e-08,
"loss": 0.011,
"step": 166
},
{
"epoch": 2.9646017699115044,
"grad_norm": 0.1293206512928009,
"learning_rate": 3.9034532321408076e-08,
"loss": 0.0099,
"step": 167
},
{
"epoch": 2.982300884955752,
"grad_norm": 0.1486680805683136,
"learning_rate": 9.75958557545842e-09,
"loss": 0.0107,
"step": 168
},
{
"epoch": 3.004424778761062,
"grad_norm": 0.1905655860900879,
"learning_rate": 0.0,
"loss": 0.0132,
"step": 169
}
],
"logging_steps": 1,
"max_steps": 169,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.248537858501509e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}