latest_9 / trainer_state.json
krishnakalyan3's picture
Upload folder using huggingface_hub (#1)
eb47cc7 verified
{
"best_metric": 0.121661689779634,
"best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-1280",
"epoch": 0.128,
"eval_steps": 10,
"global_step": 1280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 0.11198576539754868,
"learning_rate": 1e-05,
"loss": 0.126,
"step": 10
},
{
"epoch": 0.001,
"eval_cos_sim": 0.8696296215057373,
"eval_loss": 0.13132101871716445,
"eval_runtime": 191.9539,
"eval_samples_per_second": 20.838,
"eval_steps_per_second": 1.302,
"step": 10
},
{
"epoch": 0.002,
"grad_norm": 0.19444850087165833,
"learning_rate": 2e-05,
"loss": 0.1267,
"step": 20
},
{
"epoch": 0.002,
"eval_cos_sim": 0.8698329329490662,
"eval_loss": 0.1311149292205519,
"eval_runtime": 177.5098,
"eval_samples_per_second": 22.534,
"eval_steps_per_second": 1.408,
"step": 20
},
{
"epoch": 0.003,
"grad_norm": 0.12954622507095337,
"learning_rate": 3e-05,
"loss": 0.1271,
"step": 30
},
{
"epoch": 0.003,
"eval_cos_sim": 0.8700494766235352,
"eval_loss": 0.1309011602615065,
"eval_runtime": 179.7068,
"eval_samples_per_second": 22.258,
"eval_steps_per_second": 1.391,
"step": 30
},
{
"epoch": 0.004,
"grad_norm": 0.11514733731746674,
"learning_rate": 4e-05,
"loss": 0.1265,
"step": 40
},
{
"epoch": 0.004,
"eval_cos_sim": 0.870728075504303,
"eval_loss": 0.13021534349667496,
"eval_runtime": 174.4918,
"eval_samples_per_second": 22.924,
"eval_steps_per_second": 1.433,
"step": 40
},
{
"epoch": 0.005,
"grad_norm": 0.34224584698677063,
"learning_rate": 5e-05,
"loss": 0.1273,
"step": 50
},
{
"epoch": 0.005,
"eval_cos_sim": 0.8705285787582397,
"eval_loss": 0.1304176144813246,
"eval_runtime": 175.5157,
"eval_samples_per_second": 22.79,
"eval_steps_per_second": 1.424,
"step": 50
},
{
"epoch": 0.006,
"grad_norm": 0.1085827499628067,
"learning_rate": 4.517892759404963e-05,
"loss": 0.125,
"step": 60
},
{
"epoch": 0.006,
"eval_cos_sim": 0.8709338903427124,
"eval_loss": 0.130007851145143,
"eval_runtime": 173.9237,
"eval_samples_per_second": 22.999,
"eval_steps_per_second": 1.437,
"step": 60
},
{
"epoch": 0.007,
"grad_norm": 0.11786766350269318,
"learning_rate": 3.257512950767182e-05,
"loss": 0.1291,
"step": 70
},
{
"epoch": 0.007,
"eval_cos_sim": 0.8714690208435059,
"eval_loss": 0.12946533443676894,
"eval_runtime": 177.0345,
"eval_samples_per_second": 22.594,
"eval_steps_per_second": 1.412,
"step": 70
},
{
"epoch": 0.008,
"grad_norm": 0.10741184651851654,
"learning_rate": 1.7049711594019046e-05,
"loss": 0.1285,
"step": 80
},
{
"epoch": 0.008,
"eval_cos_sim": 0.8719983696937561,
"eval_loss": 0.1289418597434706,
"eval_runtime": 178.6566,
"eval_samples_per_second": 22.389,
"eval_steps_per_second": 1.399,
"step": 80
},
{
"epoch": 0.009,
"grad_norm": 0.12072350829839706,
"learning_rate": 4.590606964640023e-06,
"loss": 0.125,
"step": 90
},
{
"epoch": 0.009,
"eval_cos_sim": 0.8721248507499695,
"eval_loss": 0.12881728055226274,
"eval_runtime": 181.5969,
"eval_samples_per_second": 22.027,
"eval_steps_per_second": 1.377,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 0.11123672872781754,
"learning_rate": 4.999688473794144e-05,
"loss": 0.1249,
"step": 100
},
{
"epoch": 0.01,
"eval_cos_sim": 0.8721336722373962,
"eval_loss": 0.12880885388600297,
"eval_runtime": 174.6097,
"eval_samples_per_second": 22.908,
"eval_steps_per_second": 1.432,
"step": 100
},
{
"epoch": 0.011,
"grad_norm": 0.11100038141012192,
"learning_rate": 4.494343314093799e-05,
"loss": 0.1246,
"step": 110
},
{
"epoch": 0.011,
"eval_cos_sim": 0.8723854422569275,
"eval_loss": 0.1285583892081923,
"eval_runtime": 180.7772,
"eval_samples_per_second": 22.127,
"eval_steps_per_second": 1.383,
"step": 110
},
{
"epoch": 0.012,
"grad_norm": 0.11933281272649765,
"learning_rate": 3.219808272827916e-05,
"loss": 0.1265,
"step": 120
},
{
"epoch": 0.012,
"eval_cos_sim": 0.8727645874023438,
"eval_loss": 0.12819017722355788,
"eval_runtime": 176.8881,
"eval_samples_per_second": 22.613,
"eval_steps_per_second": 1.413,
"step": 120
},
{
"epoch": 0.013,
"grad_norm": 0.11295568197965622,
"learning_rate": 1.667653407425597e-05,
"loss": 0.1256,
"step": 130
},
{
"epoch": 0.013,
"eval_cos_sim": 0.8724489808082581,
"eval_loss": 0.12850400116192764,
"eval_runtime": 176.2937,
"eval_samples_per_second": 22.689,
"eval_steps_per_second": 1.418,
"step": 130
},
{
"epoch": 0.014,
"grad_norm": 0.10013717412948608,
"learning_rate": 4.365227971950606e-06,
"loss": 0.1252,
"step": 140
},
{
"epoch": 0.014,
"eval_cos_sim": 0.8726389408111572,
"eval_loss": 0.1283098426078505,
"eval_runtime": 175.1837,
"eval_samples_per_second": 22.833,
"eval_steps_per_second": 1.427,
"step": 140
},
{
"epoch": 0.015,
"grad_norm": 0.08663387596607208,
"learning_rate": 4.998753972815435e-05,
"loss": 0.1252,
"step": 150
},
{
"epoch": 0.015,
"eval_cos_sim": 0.8726971745491028,
"eval_loss": 0.12825069954144425,
"eval_runtime": 179.297,
"eval_samples_per_second": 22.309,
"eval_steps_per_second": 1.394,
"step": 150
},
{
"epoch": 0.016,
"grad_norm": 0.10253303498029709,
"learning_rate": 4.47029683661798e-05,
"loss": 0.1258,
"step": 160
},
{
"epoch": 0.016,
"eval_cos_sim": 0.8739002346992493,
"eval_loss": 0.12703985621678301,
"eval_runtime": 175.0922,
"eval_samples_per_second": 22.845,
"eval_steps_per_second": 1.428,
"step": 160
},
{
"epoch": 0.017,
"grad_norm": 0.11590978503227234,
"learning_rate": 3.1819242035765096e-05,
"loss": 0.1219,
"step": 170
},
{
"epoch": 0.017,
"eval_cos_sim": 0.8737954497337341,
"eval_loss": 0.12715704419362017,
"eval_runtime": 180.7326,
"eval_samples_per_second": 22.132,
"eval_steps_per_second": 1.383,
"step": 170
},
{
"epoch": 0.018,
"grad_norm": 0.09687651693820953,
"learning_rate": 1.6305430936700428e-05,
"loss": 0.1244,
"step": 180
},
{
"epoch": 0.018,
"eval_cos_sim": 0.8735443353652954,
"eval_loss": 0.12740902497517534,
"eval_runtime": 177.9084,
"eval_samples_per_second": 22.483,
"eval_steps_per_second": 1.405,
"step": 180
},
{
"epoch": 0.019,
"grad_norm": 0.10086172819137573,
"learning_rate": 4.144991597052059e-06,
"loss": 0.1258,
"step": 190
},
{
"epoch": 0.019,
"eval_cos_sim": 0.8735744953155518,
"eval_loss": 0.12737621738659807,
"eval_runtime": 174.0483,
"eval_samples_per_second": 22.982,
"eval_steps_per_second": 1.436,
"step": 190
},
{
"epoch": 0.02,
"grad_norm": 0.09316889941692352,
"learning_rate": 4.9971967299611097e-05,
"loss": 0.122,
"step": 200
},
{
"epoch": 0.02,
"eval_cos_sim": 0.8735851645469666,
"eval_loss": 0.12736523821103043,
"eval_runtime": 176.3327,
"eval_samples_per_second": 22.684,
"eval_steps_per_second": 1.418,
"step": 200
},
{
"epoch": 0.021,
"grad_norm": 0.10805534571409225,
"learning_rate": 4.4457593198638246e-05,
"loss": 0.1256,
"step": 210
},
{
"epoch": 0.021,
"eval_cos_sim": 0.8735992312431335,
"eval_loss": 0.12734888651120133,
"eval_runtime": 177.4342,
"eval_samples_per_second": 22.544,
"eval_steps_per_second": 1.409,
"step": 210
},
{
"epoch": 0.022,
"grad_norm": 0.14335550367832184,
"learning_rate": 3.143870184517241e-05,
"loss": 0.1228,
"step": 220
},
{
"epoch": 0.022,
"eval_cos_sim": 0.8742734789848328,
"eval_loss": 0.1266735837672896,
"eval_runtime": 174.698,
"eval_samples_per_second": 22.897,
"eval_steps_per_second": 1.431,
"step": 220
},
{
"epoch": 0.023,
"grad_norm": 0.10455214232206345,
"learning_rate": 1.5936494668034417e-05,
"loss": 0.1235,
"step": 230
},
{
"epoch": 0.023,
"eval_cos_sim": 0.874700129032135,
"eval_loss": 0.12624898936497636,
"eval_runtime": 175.2174,
"eval_samples_per_second": 22.829,
"eval_steps_per_second": 1.427,
"step": 230
},
{
"epoch": 0.024,
"grad_norm": 0.10344243049621582,
"learning_rate": 3.9299527274662355e-06,
"loss": 0.1258,
"step": 240
},
{
"epoch": 0.024,
"eval_cos_sim": 0.8746932148933411,
"eval_loss": 0.1262588949416823,
"eval_runtime": 178.5496,
"eval_samples_per_second": 22.403,
"eval_steps_per_second": 1.4,
"step": 240
},
{
"epoch": 0.025,
"grad_norm": 0.1515665352344513,
"learning_rate": 4.9950171333287335e-05,
"loss": 0.1259,
"step": 250
},
{
"epoch": 0.025,
"eval_cos_sim": 0.8746062517166138,
"eval_loss": 0.1263456218455977,
"eval_runtime": 181.2208,
"eval_samples_per_second": 22.073,
"eval_steps_per_second": 1.38,
"step": 250
},
{
"epoch": 0.026,
"grad_norm": 0.08521851152181625,
"learning_rate": 4.420736879094929e-05,
"loss": 0.123,
"step": 260
},
{
"epoch": 0.026,
"eval_cos_sim": 0.8742081522941589,
"eval_loss": 0.1267440173839278,
"eval_runtime": 172.3377,
"eval_samples_per_second": 23.21,
"eval_steps_per_second": 1.451,
"step": 260
},
{
"epoch": 0.027,
"grad_norm": 0.24638278782367706,
"learning_rate": 3.105655699509455e-05,
"loss": 0.1246,
"step": 270
},
{
"epoch": 0.027,
"eval_cos_sim": 0.8748664259910583,
"eval_loss": 0.12609003236042926,
"eval_runtime": 175.6344,
"eval_samples_per_second": 22.775,
"eval_steps_per_second": 1.423,
"step": 270
},
{
"epoch": 0.028,
"grad_norm": 0.09267835319042206,
"learning_rate": 1.5569817214910634e-05,
"loss": 0.1246,
"step": 280
},
{
"epoch": 0.028,
"eval_cos_sim": 0.8748399615287781,
"eval_loss": 0.12611397721516557,
"eval_runtime": 175.9072,
"eval_samples_per_second": 22.739,
"eval_steps_per_second": 1.421,
"step": 280
},
{
"epoch": 0.029,
"grad_norm": 0.1712462306022644,
"learning_rate": 3.720164955387656e-06,
"loss": 0.1243,
"step": 290
},
{
"epoch": 0.029,
"eval_cos_sim": 0.8749127388000488,
"eval_loss": 0.1260433347438521,
"eval_runtime": 176.0561,
"eval_samples_per_second": 22.72,
"eval_steps_per_second": 1.42,
"step": 290
},
{
"epoch": 0.03,
"grad_norm": 0.07719286531209946,
"learning_rate": 4.992215726119483e-05,
"loss": 0.1227,
"step": 300
},
{
"epoch": 0.03,
"eval_cos_sim": 0.8748821020126343,
"eval_loss": 0.1260761695121474,
"eval_runtime": 174.2263,
"eval_samples_per_second": 22.959,
"eval_steps_per_second": 1.435,
"step": 300
},
{
"epoch": 0.031,
"grad_norm": 0.08637545257806778,
"learning_rate": 4.395235750428112e-05,
"loss": 0.1222,
"step": 310
},
{
"epoch": 0.031,
"eval_cos_sim": 0.8745994567871094,
"eval_loss": 0.12635979654538104,
"eval_runtime": 179.4806,
"eval_samples_per_second": 22.287,
"eval_steps_per_second": 1.393,
"step": 310
},
{
"epoch": 0.032,
"grad_norm": 0.0923767164349556,
"learning_rate": 3.0672902724039794e-05,
"loss": 0.1232,
"step": 320
},
{
"epoch": 0.032,
"eval_cos_sim": 0.8750612735748291,
"eval_loss": 0.1258947375034041,
"eval_runtime": 181.1338,
"eval_samples_per_second": 22.083,
"eval_steps_per_second": 1.38,
"step": 320
},
{
"epoch": 0.033,
"grad_norm": 0.08724959194660187,
"learning_rate": 1.5205489961037645e-05,
"loss": 0.1236,
"step": 330
},
{
"epoch": 0.033,
"eval_cos_sim": 0.8755974173545837,
"eval_loss": 0.125363212845201,
"eval_runtime": 198.751,
"eval_samples_per_second": 20.126,
"eval_steps_per_second": 1.258,
"step": 330
},
{
"epoch": 0.034,
"grad_norm": 0.07283046841621399,
"learning_rate": 3.5156805643271896e-06,
"loss": 0.1239,
"step": 340
},
{
"epoch": 0.034,
"eval_cos_sim": 0.8756656646728516,
"eval_loss": 0.12529714014279317,
"eval_runtime": 187.9639,
"eval_samples_per_second": 21.281,
"eval_steps_per_second": 1.33,
"step": 340
},
{
"epoch": 0.035,
"grad_norm": 0.15486685931682587,
"learning_rate": 4.9887932065027656e-05,
"loss": 0.1231,
"step": 350
},
{
"epoch": 0.035,
"eval_cos_sim": 0.8756564259529114,
"eval_loss": 0.12530613209950398,
"eval_runtime": 194.2503,
"eval_samples_per_second": 20.592,
"eval_steps_per_second": 1.287,
"step": 350
},
{
"epoch": 0.036,
"grad_norm": 0.07505682110786438,
"learning_rate": 4.369262289279271e-05,
"loss": 0.1233,
"step": 360
},
{
"epoch": 0.036,
"eval_cos_sim": 0.8755001425743103,
"eval_loss": 0.12546515204655598,
"eval_runtime": 194.8309,
"eval_samples_per_second": 20.531,
"eval_steps_per_second": 1.283,
"step": 360
},
{
"epoch": 0.037,
"grad_norm": 0.09688587486743927,
"learning_rate": 3.0287834646695457e-05,
"loss": 0.1259,
"step": 370
},
{
"epoch": 0.037,
"eval_cos_sim": 0.8756394386291504,
"eval_loss": 0.1253258285735793,
"eval_runtime": 188.2216,
"eval_samples_per_second": 21.252,
"eval_steps_per_second": 1.328,
"step": 370
},
{
"epoch": 0.038,
"grad_norm": 0.07268425822257996,
"learning_rate": 1.4843603704405253e-05,
"loss": 0.1247,
"step": 380
},
{
"epoch": 0.038,
"eval_cos_sim": 0.8758111596107483,
"eval_loss": 0.12515661337124775,
"eval_runtime": 189.0095,
"eval_samples_per_second": 21.163,
"eval_steps_per_second": 1.323,
"step": 380
},
{
"epoch": 0.039,
"grad_norm": 0.09875091165304184,
"learning_rate": 3.316550516082126e-06,
"loss": 0.1229,
"step": 390
},
{
"epoch": 0.039,
"eval_cos_sim": 0.8758672475814819,
"eval_loss": 0.12509912636029194,
"eval_runtime": 235.6105,
"eval_samples_per_second": 16.977,
"eval_steps_per_second": 1.061,
"step": 390
},
{
"epoch": 0.04,
"grad_norm": 0.0792056992650032,
"learning_rate": 4.98475042744222e-05,
"loss": 0.1246,
"step": 400
},
{
"epoch": 0.04,
"eval_cos_sim": 0.8759932518005371,
"eval_loss": 0.12497495915638873,
"eval_runtime": 200.3436,
"eval_samples_per_second": 19.966,
"eval_steps_per_second": 1.248,
"step": 400
},
{
"epoch": 0.041,
"grad_norm": 0.10644775629043579,
"learning_rate": 4.3428229687794505e-05,
"loss": 0.1224,
"step": 410
},
{
"epoch": 0.041,
"eval_cos_sim": 0.8761371374130249,
"eval_loss": 0.12483511426197956,
"eval_runtime": 197.5074,
"eval_samples_per_second": 20.252,
"eval_steps_per_second": 1.266,
"step": 410
},
{
"epoch": 0.042,
"grad_norm": 0.09292006492614746,
"learning_rate": 2.9901448730099503e-05,
"loss": 0.1239,
"step": 420
},
{
"epoch": 0.042,
"eval_cos_sim": 0.876413881778717,
"eval_loss": 0.12455732419239948,
"eval_runtime": 187.5784,
"eval_samples_per_second": 21.324,
"eval_steps_per_second": 1.333,
"step": 420
},
{
"epoch": 0.043,
"grad_norm": 0.08105887472629547,
"learning_rate": 1.448424863465538e-05,
"loss": 0.1231,
"step": 430
},
{
"epoch": 0.043,
"eval_cos_sim": 0.876311719417572,
"eval_loss": 0.12465796377407977,
"eval_runtime": 203.0598,
"eval_samples_per_second": 19.699,
"eval_steps_per_second": 1.231,
"step": 430
},
{
"epoch": 0.044,
"grad_norm": 0.15435349941253662,
"learning_rate": 3.1228244380351547e-06,
"loss": 0.1225,
"step": 440
},
{
"epoch": 0.044,
"eval_cos_sim": 0.8762248754501343,
"eval_loss": 0.12474570634114215,
"eval_runtime": 199.1025,
"eval_samples_per_second": 20.09,
"eval_steps_per_second": 1.256,
"step": 440
},
{
"epoch": 0.045,
"grad_norm": 0.09370752424001694,
"learning_rate": 4.980088396483146e-05,
"loss": 0.1228,
"step": 450
},
{
"epoch": 0.045,
"eval_cos_sim": 0.8761196136474609,
"eval_loss": 0.12484796597706745,
"eval_runtime": 192.1246,
"eval_samples_per_second": 20.82,
"eval_steps_per_second": 1.301,
"step": 450
},
{
"epoch": 0.046,
"grad_norm": 0.08999752253293991,
"learning_rate": 4.3159243781616026e-05,
"loss": 0.1229,
"step": 460
},
{
"epoch": 0.046,
"eval_cos_sim": 0.8762247562408447,
"eval_loss": 0.12473729922520588,
"eval_runtime": 196.5532,
"eval_samples_per_second": 20.351,
"eval_steps_per_second": 1.272,
"step": 460
},
{
"epoch": 0.047,
"grad_norm": 0.0809365063905716,
"learning_rate": 2.9513841269722613e-05,
"loss": 0.124,
"step": 470
},
{
"epoch": 0.047,
"eval_cos_sim": 0.8765152096748352,
"eval_loss": 0.12444968440281817,
"eval_runtime": 204.1545,
"eval_samples_per_second": 19.593,
"eval_steps_per_second": 1.225,
"step": 470
},
{
"epoch": 0.048,
"grad_norm": 0.08176057785749435,
"learning_rate": 1.4127514310605238e-05,
"loss": 0.123,
"step": 480
},
{
"epoch": 0.048,
"eval_cos_sim": 0.876448929309845,
"eval_loss": 0.12451095607029865,
"eval_runtime": 198.7286,
"eval_samples_per_second": 20.128,
"eval_steps_per_second": 1.258,
"step": 480
},
{
"epoch": 0.049,
"grad_norm": 0.09636738151311874,
"learning_rate": 2.934550610786291e-06,
"loss": 0.1236,
"step": 490
},
{
"epoch": 0.049,
"eval_cos_sim": 0.8765274882316589,
"eval_loss": 0.12443248560177753,
"eval_runtime": 196.3413,
"eval_samples_per_second": 20.373,
"eval_steps_per_second": 1.273,
"step": 490
},
{
"epoch": 0.05,
"grad_norm": 0.08814109116792679,
"learning_rate": 4.974808275501392e-05,
"loss": 0.123,
"step": 500
},
{
"epoch": 0.05,
"eval_cos_sim": 0.8765753507614136,
"eval_loss": 0.12438686539875934,
"eval_runtime": 191.2687,
"eval_samples_per_second": 20.913,
"eval_steps_per_second": 1.307,
"step": 500
},
{
"epoch": 0.051,
"grad_norm": 0.08511923253536224,
"learning_rate": 4.2885732211184324e-05,
"loss": 0.1246,
"step": 510
},
{
"epoch": 0.051,
"eval_cos_sim": 0.8767162561416626,
"eval_loss": 0.12425224568592975,
"eval_runtime": 173.2088,
"eval_samples_per_second": 23.094,
"eval_steps_per_second": 1.443,
"step": 510
},
{
"epoch": 0.052,
"grad_norm": 0.0837215781211853,
"learning_rate": 2.9125108865470048e-05,
"loss": 0.1221,
"step": 520
},
{
"epoch": 0.052,
"eval_cos_sim": 0.876861572265625,
"eval_loss": 0.1241044213985152,
"eval_runtime": 174.8239,
"eval_samples_per_second": 22.88,
"eval_steps_per_second": 1.43,
"step": 520
},
{
"epoch": 0.053,
"grad_norm": 0.09207245707511902,
"learning_rate": 1.3773489637927061e-05,
"loss": 0.1229,
"step": 530
},
{
"epoch": 0.053,
"eval_cos_sim": 0.8767414093017578,
"eval_loss": 0.12421691825138996,
"eval_runtime": 173.8268,
"eval_samples_per_second": 23.011,
"eval_steps_per_second": 1.438,
"step": 530
},
{
"epoch": 0.054,
"grad_norm": 0.0655718669295311,
"learning_rate": 2.7517759561205253e-06,
"loss": 0.1221,
"step": 540
},
{
"epoch": 0.054,
"eval_cos_sim": 0.8767919540405273,
"eval_loss": 0.1241676082824416,
"eval_runtime": 179.6327,
"eval_samples_per_second": 22.268,
"eval_steps_per_second": 1.392,
"step": 540
},
{
"epoch": 0.055,
"grad_norm": 0.21964910626411438,
"learning_rate": 4.968911380413809e-05,
"loss": 0.1243,
"step": 550
},
{
"epoch": 0.055,
"eval_cos_sim": 0.8768623471260071,
"eval_loss": 0.12409912397610615,
"eval_runtime": 172.7843,
"eval_samples_per_second": 23.15,
"eval_steps_per_second": 1.447,
"step": 550
},
{
"epoch": 0.056,
"grad_norm": 0.08817338943481445,
"learning_rate": 4.260776314131676e-05,
"loss": 0.1222,
"step": 560
},
{
"epoch": 0.056,
"eval_cos_sim": 0.8767062425613403,
"eval_loss": 0.12425821544873188,
"eval_runtime": 172.6396,
"eval_samples_per_second": 23.17,
"eval_steps_per_second": 1.448,
"step": 560
},
{
"epoch": 0.057,
"grad_norm": 0.06475117802619934,
"learning_rate": 2.873534839760646e-05,
"loss": 0.1232,
"step": 570
},
{
"epoch": 0.057,
"eval_cos_sim": 0.8768667578697205,
"eval_loss": 0.12410461117970416,
"eval_runtime": 172.7054,
"eval_samples_per_second": 23.161,
"eval_steps_per_second": 1.448,
"step": 570
},
{
"epoch": 0.058,
"grad_norm": 0.07474437355995178,
"learning_rate": 1.342226284699138e-05,
"loss": 0.1227,
"step": 580
},
{
"epoch": 0.058,
"eval_cos_sim": 0.8771414160728455,
"eval_loss": 0.12382852866398761,
"eval_runtime": 175.1422,
"eval_samples_per_second": 22.839,
"eval_steps_per_second": 1.427,
"step": 580
},
{
"epoch": 0.059,
"grad_norm": 0.07362603396177292,
"learning_rate": 2.5745460253134484e-06,
"loss": 0.1234,
"step": 590
},
{
"epoch": 0.059,
"eval_cos_sim": 0.8771759271621704,
"eval_loss": 0.12379106380688618,
"eval_runtime": 174.7169,
"eval_samples_per_second": 22.894,
"eval_steps_per_second": 1.431,
"step": 590
},
{
"epoch": 0.06,
"grad_norm": 0.07593993842601776,
"learning_rate": 4.962399180850275e-05,
"loss": 0.1232,
"step": 600
},
{
"epoch": 0.06,
"eval_cos_sim": 0.877038300037384,
"eval_loss": 0.12392904116856525,
"eval_runtime": 172.4786,
"eval_samples_per_second": 23.191,
"eval_steps_per_second": 1.449,
"step": 600
},
{
"epoch": 0.061,
"grad_norm": 0.07887241989374161,
"learning_rate": 4.2325405847733254e-05,
"loss": 0.1235,
"step": 610
},
{
"epoch": 0.061,
"eval_cos_sim": 0.8767529726028442,
"eval_loss": 0.12422390153157184,
"eval_runtime": 173.6696,
"eval_samples_per_second": 23.032,
"eval_steps_per_second": 1.44,
"step": 610
},
{
"epoch": 0.062,
"grad_norm": 0.17296281456947327,
"learning_rate": 2.834465700261192e-05,
"loss": 0.1204,
"step": 620
},
{
"epoch": 0.062,
"eval_cos_sim": 0.8772019743919373,
"eval_loss": 0.12377139737355183,
"eval_runtime": 179.9864,
"eval_samples_per_second": 22.224,
"eval_steps_per_second": 1.389,
"step": 620
},
{
"epoch": 0.063,
"grad_norm": 0.06920995563268661,
"learning_rate": 1.3073921470877709e-05,
"loss": 0.1245,
"step": 630
},
{
"epoch": 0.063,
"eval_cos_sim": 0.8773365616798401,
"eval_loss": 0.12363236000287008,
"eval_runtime": 173.1204,
"eval_samples_per_second": 23.105,
"eval_steps_per_second": 1.444,
"step": 630
},
{
"epoch": 0.064,
"grad_norm": 0.08347232639789581,
"learning_rate": 2.4029049877794472e-06,
"loss": 0.1217,
"step": 640
},
{
"epoch": 0.064,
"eval_cos_sim": 0.8773410320281982,
"eval_loss": 0.12362796523320149,
"eval_runtime": 172.0713,
"eval_samples_per_second": 23.246,
"eval_steps_per_second": 1.453,
"step": 640
},
{
"epoch": 0.065,
"grad_norm": 0.07459770888090134,
"learning_rate": 4.955273299787453e-05,
"loss": 0.1223,
"step": 650
},
{
"epoch": 0.065,
"eval_cos_sim": 0.8773767948150635,
"eval_loss": 0.12359384205090472,
"eval_runtime": 173.2149,
"eval_samples_per_second": 23.093,
"eval_steps_per_second": 1.443,
"step": 650
},
{
"epoch": 0.066,
"grad_norm": 0.0831998735666275,
"learning_rate": 4.203873069979081e-05,
"loss": 0.1231,
"step": 660
},
{
"epoch": 0.066,
"eval_cos_sim": 0.8774532675743103,
"eval_loss": 0.12351777221905659,
"eval_runtime": 171.902,
"eval_samples_per_second": 23.269,
"eval_steps_per_second": 1.454,
"step": 660
},
{
"epoch": 0.067,
"grad_norm": 0.07724840193986893,
"learning_rate": 2.7953132048972646e-05,
"loss": 0.122,
"step": 670
},
{
"epoch": 0.067,
"eval_cos_sim": 0.877151608467102,
"eval_loss": 0.12382214214550921,
"eval_runtime": 173.6766,
"eval_samples_per_second": 23.031,
"eval_steps_per_second": 1.439,
"step": 670
},
{
"epoch": 0.068,
"grad_norm": 0.0648268312215805,
"learning_rate": 1.2728552323560239e-05,
"loss": 0.1227,
"step": 680
},
{
"epoch": 0.068,
"eval_cos_sim": 0.8769506216049194,
"eval_loss": 0.12402295615422199,
"eval_runtime": 171.7424,
"eval_samples_per_second": 23.291,
"eval_steps_per_second": 1.456,
"step": 680
},
{
"epoch": 0.069,
"grad_norm": 0.08475865423679352,
"learning_rate": 2.2368956200634283e-06,
"loss": 0.1274,
"step": 690
},
{
"epoch": 0.069,
"eval_cos_sim": 0.8771329522132874,
"eval_loss": 0.12383969738232563,
"eval_runtime": 174.2776,
"eval_samples_per_second": 22.952,
"eval_steps_per_second": 1.434,
"step": 690
},
{
"epoch": 0.07,
"grad_norm": 0.06382860988378525,
"learning_rate": 4.947535513144286e-05,
"loss": 0.122,
"step": 700
},
{
"epoch": 0.07,
"eval_cos_sim": 0.8775114417076111,
"eval_loss": 0.12346241619336079,
"eval_runtime": 185.1334,
"eval_samples_per_second": 21.606,
"eval_steps_per_second": 1.35,
"step": 700
},
{
"epoch": 0.071,
"grad_norm": 0.07273228466510773,
"learning_rate": 4.174780914294635e-05,
"loss": 0.1228,
"step": 710
},
{
"epoch": 0.071,
"eval_cos_sim": 0.8777372241020203,
"eval_loss": 0.12323929693448019,
"eval_runtime": 170.2151,
"eval_samples_per_second": 23.5,
"eval_steps_per_second": 1.469,
"step": 710
},
{
"epoch": 0.072,
"grad_norm": 0.08377543836832047,
"learning_rate": 2.756087111291529e-05,
"loss": 0.1209,
"step": 720
},
{
"epoch": 0.072,
"eval_cos_sim": 0.8776744604110718,
"eval_loss": 0.12329552843319844,
"eval_runtime": 173.1907,
"eval_samples_per_second": 23.096,
"eval_steps_per_second": 1.443,
"step": 720
},
{
"epoch": 0.073,
"grad_norm": 0.08579932153224945,
"learning_rate": 1.2386241478270527e-05,
"loss": 0.1234,
"step": 730
},
{
"epoch": 0.073,
"eval_cos_sim": 0.8776343464851379,
"eval_loss": 0.12333650018917988,
"eval_runtime": 172.2784,
"eval_samples_per_second": 23.218,
"eval_steps_per_second": 1.451,
"step": 730
},
{
"epoch": 0.074,
"grad_norm": 0.07494545727968216,
"learning_rate": 2.0765592951802664e-06,
"loss": 0.1209,
"step": 740
},
{
"epoch": 0.074,
"eval_cos_sim": 0.8777279853820801,
"eval_loss": 0.12324421884762715,
"eval_runtime": 172.9417,
"eval_samples_per_second": 23.129,
"eval_steps_per_second": 1.446,
"step": 740
},
{
"epoch": 0.075,
"grad_norm": 0.07511463761329651,
"learning_rate": 4.9391877493394335e-05,
"loss": 0.1222,
"step": 750
},
{
"epoch": 0.075,
"eval_cos_sim": 0.8777404427528381,
"eval_loss": 0.12323040797459553,
"eval_runtime": 173.813,
"eval_samples_per_second": 23.013,
"eval_steps_per_second": 1.438,
"step": 750
},
{
"epoch": 0.076,
"grad_norm": 0.08240217715501785,
"learning_rate": 4.1452713680951016e-05,
"loss": 0.1237,
"step": 760
},
{
"epoch": 0.076,
"eval_cos_sim": 0.8776569366455078,
"eval_loss": 0.1233164258216567,
"eval_runtime": 173.6453,
"eval_samples_per_second": 23.035,
"eval_steps_per_second": 1.44,
"step": 760
},
{
"epoch": 0.077,
"grad_norm": 0.07817904651165009,
"learning_rate": 2.716797195408887e-05,
"loss": 0.1215,
"step": 770
},
{
"epoch": 0.077,
"eval_cos_sim": 0.8779506683349609,
"eval_loss": 0.12303087331997822,
"eval_runtime": 198.4978,
"eval_samples_per_second": 20.151,
"eval_steps_per_second": 1.259,
"step": 770
},
{
"epoch": 0.078,
"grad_norm": 0.06472489982843399,
"learning_rate": 1.2047074246048157e-05,
"loss": 0.1222,
"step": 780
},
{
"epoch": 0.078,
"eval_cos_sim": 0.8780341148376465,
"eval_loss": 0.12294723345982503,
"eval_runtime": 187.0246,
"eval_samples_per_second": 21.388,
"eval_steps_per_second": 1.337,
"step": 780
},
{
"epoch": 0.079,
"grad_norm": 0.06511878967285156,
"learning_rate": 1.921935972303521e-06,
"loss": 0.1211,
"step": 790
},
{
"epoch": 0.079,
"eval_cos_sim": 0.8780234456062317,
"eval_loss": 0.1229577579711623,
"eval_runtime": 170.8199,
"eval_samples_per_second": 23.416,
"eval_steps_per_second": 1.464,
"step": 790
},
{
"epoch": 0.08,
"grad_norm": 0.08275925368070602,
"learning_rate": 4.9302320888106454e-05,
"loss": 0.1234,
"step": 800
},
{
"epoch": 0.08,
"eval_cos_sim": 0.8778801560401917,
"eval_loss": 0.1230986237739272,
"eval_runtime": 175.6448,
"eval_samples_per_second": 22.773,
"eval_steps_per_second": 1.423,
"step": 800
},
{
"epoch": 0.081,
"grad_norm": 0.06466321647167206,
"learning_rate": 4.115351785778022e-05,
"loss": 0.1215,
"step": 810
},
{
"epoch": 0.081,
"eval_cos_sim": 0.877547025680542,
"eval_loss": 0.12342484547841023,
"eval_runtime": 173.845,
"eval_samples_per_second": 23.009,
"eval_steps_per_second": 1.438,
"step": 810
},
{
"epoch": 0.082,
"grad_norm": 0.060175709426403046,
"learning_rate": 2.6774532491200373e-05,
"loss": 0.1237,
"step": 820
},
{
"epoch": 0.082,
"eval_cos_sim": 0.8778981566429138,
"eval_loss": 0.1230772545551009,
"eval_runtime": 174.1784,
"eval_samples_per_second": 22.965,
"eval_steps_per_second": 1.435,
"step": 820
},
{
"epoch": 0.083,
"grad_norm": 0.06948266923427582,
"learning_rate": 1.1711135154477437e-05,
"loss": 0.1213,
"step": 830
},
{
"epoch": 0.083,
"eval_cos_sim": 0.8779332041740417,
"eval_loss": 0.12304716589199971,
"eval_runtime": 171.7677,
"eval_samples_per_second": 23.287,
"eval_steps_per_second": 1.455,
"step": 830
},
{
"epoch": 0.084,
"grad_norm": 0.0633857399225235,
"learning_rate": 1.7730641868067276e-06,
"loss": 0.1212,
"step": 840
},
{
"epoch": 0.084,
"eval_cos_sim": 0.8779239058494568,
"eval_loss": 0.12305730154263447,
"eval_runtime": 172.6941,
"eval_samples_per_second": 23.162,
"eval_steps_per_second": 1.448,
"step": 840
},
{
"epoch": 0.085,
"grad_norm": 0.07013432681560516,
"learning_rate": 4.9206707634962714e-05,
"loss": 0.1219,
"step": 850
},
{
"epoch": 0.085,
"eval_cos_sim": 0.8781536221504211,
"eval_loss": 0.12283129765736531,
"eval_runtime": 178.3382,
"eval_samples_per_second": 22.429,
"eval_steps_per_second": 1.402,
"step": 850
},
{
"epoch": 0.086,
"grad_norm": 0.0714387595653534,
"learning_rate": 4.085029623930606e-05,
"loss": 0.1214,
"step": 860
},
{
"epoch": 0.086,
"eval_cos_sim": 0.8783000111579895,
"eval_loss": 0.12268445636975239,
"eval_runtime": 180.4291,
"eval_samples_per_second": 22.169,
"eval_steps_per_second": 1.386,
"step": 860
},
{
"epoch": 0.087,
"grad_norm": 0.07285313308238983,
"learning_rate": 2.638065077761282e-05,
"loss": 0.1211,
"step": 870
},
{
"epoch": 0.087,
"eval_cos_sim": 0.8782742619514465,
"eval_loss": 0.12271090867268514,
"eval_runtime": 174.6757,
"eval_samples_per_second": 22.9,
"eval_steps_per_second": 1.431,
"step": 870
},
{
"epoch": 0.088,
"grad_norm": 0.1114286258816719,
"learning_rate": 1.1378507926623341e-05,
"loss": 0.1203,
"step": 880
},
{
"epoch": 0.088,
"eval_cos_sim": 0.8782421946525574,
"eval_loss": 0.12274044944989156,
"eval_runtime": 173.5126,
"eval_samples_per_second": 23.053,
"eval_steps_per_second": 1.441,
"step": 880
},
{
"epoch": 0.089,
"grad_norm": 0.07392691820859909,
"learning_rate": 1.6299810406600836e-06,
"loss": 0.1222,
"step": 890
},
{
"epoch": 0.089,
"eval_cos_sim": 0.8782600164413452,
"eval_loss": 0.12272232272374105,
"eval_runtime": 173.9745,
"eval_samples_per_second": 22.992,
"eval_steps_per_second": 1.437,
"step": 890
},
{
"epoch": 0.09,
"grad_norm": 0.1509944051504135,
"learning_rate": 4.9105061562790325e-05,
"loss": 0.1211,
"step": 900
},
{
"epoch": 0.09,
"eval_cos_sim": 0.8785330653190613,
"eval_loss": 0.12244940116154622,
"eval_runtime": 174.6529,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 1.431,
"step": 900
},
{
"epoch": 0.091,
"grad_norm": 0.07572964578866959,
"learning_rate": 4.0543124394712475e-05,
"loss": 0.1234,
"step": 910
},
{
"epoch": 0.091,
"eval_cos_sim": 0.8782286643981934,
"eval_loss": 0.1227607171748824,
"eval_runtime": 174.4786,
"eval_samples_per_second": 22.925,
"eval_steps_per_second": 1.433,
"step": 910
},
{
"epoch": 0.092,
"grad_norm": 0.07199128717184067,
"learning_rate": 2.5986424976906166e-05,
"loss": 0.1202,
"step": 920
},
{
"epoch": 0.092,
"eval_cos_sim": 0.8780964612960815,
"eval_loss": 0.12288942649113606,
"eval_runtime": 175.9134,
"eval_samples_per_second": 22.738,
"eval_steps_per_second": 1.421,
"step": 920
},
{
"epoch": 0.093,
"grad_norm": 0.07497607171535492,
"learning_rate": 1.1049275460163872e-05,
"loss": 0.123,
"step": 930
},
{
"epoch": 0.093,
"eval_cos_sim": 0.8781337141990662,
"eval_loss": 0.12284465791928242,
"eval_runtime": 174.1009,
"eval_samples_per_second": 22.975,
"eval_steps_per_second": 1.436,
"step": 930
},
{
"epoch": 0.094,
"grad_norm": 0.056581463664770126,
"learning_rate": 1.4927221931830576e-06,
"loss": 0.1218,
"step": 940
},
{
"epoch": 0.094,
"eval_cos_sim": 0.8781940340995789,
"eval_loss": 0.12278383018719624,
"eval_runtime": 180.3511,
"eval_samples_per_second": 22.179,
"eval_steps_per_second": 1.386,
"step": 940
},
{
"epoch": 0.095,
"grad_norm": 0.06227719038724899,
"learning_rate": 4.8997408003921384e-05,
"loss": 0.1216,
"step": 950
},
{
"epoch": 0.095,
"eval_cos_sim": 0.8782709836959839,
"eval_loss": 0.12271020819889973,
"eval_runtime": 174.3195,
"eval_samples_per_second": 22.946,
"eval_steps_per_second": 1.434,
"step": 950
},
{
"epoch": 0.096,
"grad_norm": 0.07964574545621872,
"learning_rate": 4.02320788776628e-05,
"loss": 0.1205,
"step": 960
},
{
"epoch": 0.096,
"eval_cos_sim": 0.8782918453216553,
"eval_loss": 0.12269965698468159,
"eval_runtime": 171.8922,
"eval_samples_per_second": 23.27,
"eval_steps_per_second": 1.454,
"step": 960
},
{
"epoch": 0.097,
"grad_norm": 0.059999242424964905,
"learning_rate": 2.559195333841573e-05,
"loss": 0.1224,
"step": 970
},
{
"epoch": 0.097,
"eval_cos_sim": 0.8782675862312317,
"eval_loss": 0.12272447182881306,
"eval_runtime": 178.4336,
"eval_samples_per_second": 22.417,
"eval_steps_per_second": 1.401,
"step": 970
},
{
"epoch": 0.098,
"grad_norm": 0.07078584283590317,
"learning_rate": 1.0723519806732741e-05,
"loss": 0.1226,
"step": 980
},
{
"epoch": 0.098,
"eval_cos_sim": 0.8782561421394348,
"eval_loss": 0.12273399831997822,
"eval_runtime": 172.0171,
"eval_samples_per_second": 23.254,
"eval_steps_per_second": 1.453,
"step": 980
},
{
"epoch": 0.099,
"grad_norm": 0.0700722336769104,
"learning_rate": 1.3613218521583647e-06,
"loss": 0.1189,
"step": 990
},
{
"epoch": 0.099,
"eval_cos_sim": 0.8782747387886047,
"eval_loss": 0.1227147035812087,
"eval_runtime": 174.8389,
"eval_samples_per_second": 22.878,
"eval_steps_per_second": 1.43,
"step": 990
},
{
"epoch": 0.1,
"grad_norm": 0.06270556151866913,
"learning_rate": 4.888377378787991e-05,
"loss": 0.1209,
"step": 1000
},
{
"epoch": 0.1,
"eval_cos_sim": 0.8783043622970581,
"eval_loss": 0.12268760301815938,
"eval_runtime": 171.6574,
"eval_samples_per_second": 23.302,
"eval_steps_per_second": 1.456,
"step": 1000
},
{
"epoch": 0.101,
"grad_norm": 0.059303585439920425,
"learning_rate": 3.9917237207221514e-05,
"loss": 0.1206,
"step": 1010
},
{
"epoch": 0.101,
"eval_cos_sim": 0.8785374760627747,
"eval_loss": 0.12245997311818074,
"eval_runtime": 173.2279,
"eval_samples_per_second": 23.091,
"eval_steps_per_second": 1.443,
"step": 1010
},
{
"epoch": 0.102,
"grad_norm": 0.06463504582643509,
"learning_rate": 2.519733417274297e-05,
"loss": 0.122,
"step": 1020
},
{
"epoch": 0.102,
"eval_cos_sim": 0.8785625100135803,
"eval_loss": 0.12243694259869527,
"eval_runtime": 179.8429,
"eval_samples_per_second": 22.242,
"eval_steps_per_second": 1.39,
"step": 1020
},
{
"epoch": 0.103,
"grad_norm": 0.06594408303499222,
"learning_rate": 1.0401322151467458e-05,
"loss": 0.1226,
"step": 1030
},
{
"epoch": 0.103,
"eval_cos_sim": 0.8784922361373901,
"eval_loss": 0.1225029034827895,
"eval_runtime": 171.8585,
"eval_samples_per_second": 23.275,
"eval_steps_per_second": 1.455,
"step": 1030
},
{
"epoch": 0.104,
"grad_norm": 0.061140164732933044,
"learning_rate": 1.2358127653053858e-06,
"loss": 0.122,
"step": 1040
},
{
"epoch": 0.104,
"eval_cos_sim": 0.8785346746444702,
"eval_loss": 0.12245874931561421,
"eval_runtime": 170.3116,
"eval_samples_per_second": 23.486,
"eval_steps_per_second": 1.468,
"step": 1040
},
{
"epoch": 0.105,
"grad_norm": 0.06770511716604233,
"learning_rate": 4.876418723469453e-05,
"loss": 0.1196,
"step": 1050
},
{
"epoch": 0.105,
"eval_cos_sim": 0.878551721572876,
"eval_loss": 0.12243552591549825,
"eval_runtime": 173.9331,
"eval_samples_per_second": 22.997,
"eval_steps_per_second": 1.437,
"step": 1050
},
{
"epoch": 0.106,
"grad_norm": 0.06050929054617882,
"learning_rate": 3.959867784853255e-05,
"loss": 0.1219,
"step": 1060
},
{
"epoch": 0.106,
"eval_cos_sim": 0.8784484267234802,
"eval_loss": 0.12253486802327107,
"eval_runtime": 175.2374,
"eval_samples_per_second": 22.826,
"eval_steps_per_second": 1.427,
"step": 1060
},
{
"epoch": 0.107,
"grad_norm": 0.07329047471284866,
"learning_rate": 2.4802665827257035e-05,
"loss": 0.1214,
"step": 1070
},
{
"epoch": 0.107,
"eval_cos_sim": 0.8785268068313599,
"eval_loss": 0.12246101453053426,
"eval_runtime": 172.381,
"eval_samples_per_second": 23.204,
"eval_steps_per_second": 1.45,
"step": 1070
},
{
"epoch": 0.108,
"grad_norm": 0.061687979847192764,
"learning_rate": 1.0082762792778497e-05,
"loss": 0.1206,
"step": 1080
},
{
"epoch": 0.108,
"eval_cos_sim": 0.8787024617195129,
"eval_loss": 0.12228504302250813,
"eval_runtime": 171.0068,
"eval_samples_per_second": 23.391,
"eval_steps_per_second": 1.462,
"step": 1080
},
{
"epoch": 0.109,
"grad_norm": 0.06697102636098862,
"learning_rate": 1.1162262121200917e-06,
"loss": 0.1216,
"step": 1090
},
{
"epoch": 0.109,
"eval_cos_sim": 0.8787557482719421,
"eval_loss": 0.12223189308392476,
"eval_runtime": 172.5647,
"eval_samples_per_second": 23.18,
"eval_steps_per_second": 1.449,
"step": 1090
},
{
"epoch": 0.11,
"grad_norm": 0.06245901808142662,
"learning_rate": 4.8638678147841726e-05,
"loss": 0.1224,
"step": 1100
},
{
"epoch": 0.11,
"eval_cos_sim": 0.878864049911499,
"eval_loss": 0.12212434603917073,
"eval_runtime": 177.5612,
"eval_samples_per_second": 22.527,
"eval_steps_per_second": 1.408,
"step": 1100
},
{
"epoch": 0.111,
"grad_norm": 0.07445187121629715,
"learning_rate": 3.9276480193267495e-05,
"loss": 0.1226,
"step": 1110
},
{
"epoch": 0.111,
"eval_cos_sim": 0.8787615895271301,
"eval_loss": 0.12223191478001545,
"eval_runtime": 170.2386,
"eval_samples_per_second": 23.496,
"eval_steps_per_second": 1.469,
"step": 1110
},
{
"epoch": 0.112,
"grad_norm": 0.06328488141298294,
"learning_rate": 2.4408046661584553e-05,
"loss": 0.1205,
"step": 1120
},
{
"epoch": 0.112,
"eval_cos_sim": 0.8786949515342712,
"eval_loss": 0.12229911091076802,
"eval_runtime": 173.6977,
"eval_samples_per_second": 23.029,
"eval_steps_per_second": 1.439,
"step": 1120
},
{
"epoch": 0.113,
"grad_norm": 0.1140422523021698,
"learning_rate": 9.767921122337203e-06,
"loss": 0.1213,
"step": 1130
},
{
"epoch": 0.113,
"eval_cos_sim": 0.8787314295768738,
"eval_loss": 0.12225894191014242,
"eval_runtime": 176.5254,
"eval_samples_per_second": 22.66,
"eval_steps_per_second": 1.416,
"step": 1130
},
{
"epoch": 0.114,
"grad_norm": 0.07940120995044708,
"learning_rate": 1.0025919960786169e-06,
"loss": 0.1216,
"step": 1140
},
{
"epoch": 0.114,
"eval_cos_sim": 0.878764271736145,
"eval_loss": 0.12222567083584737,
"eval_runtime": 173.6241,
"eval_samples_per_second": 23.038,
"eval_steps_per_second": 1.44,
"step": 1140
},
{
"epoch": 0.115,
"grad_norm": 0.06326926499605179,
"learning_rate": 4.850727780681685e-05,
"loss": 0.121,
"step": 1150
},
{
"epoch": 0.115,
"eval_cos_sim": 0.8787913918495178,
"eval_loss": 0.1222020423625655,
"eval_runtime": 197.6043,
"eval_samples_per_second": 20.242,
"eval_steps_per_second": 1.265,
"step": 1150
},
{
"epoch": 0.116,
"grad_norm": 0.06304363161325455,
"learning_rate": 3.89507245398359e-05,
"loss": 0.1212,
"step": 1160
},
{
"epoch": 0.116,
"eval_cos_sim": 0.8788431286811829,
"eval_loss": 0.1221448552821822,
"eval_runtime": 180.7769,
"eval_samples_per_second": 22.127,
"eval_steps_per_second": 1.383,
"step": 1160
},
{
"epoch": 0.117,
"grad_norm": 0.06048878654837608,
"learning_rate": 2.4013575023093562e-05,
"loss": 0.121,
"step": 1170
},
{
"epoch": 0.117,
"eval_cos_sim": 0.8789100050926208,
"eval_loss": 0.12207724287259053,
"eval_runtime": 175.5012,
"eval_samples_per_second": 22.792,
"eval_steps_per_second": 1.424,
"step": 1170
},
{
"epoch": 0.118,
"grad_norm": 0.060076240450143814,
"learning_rate": 9.456875605287529e-06,
"loss": 0.1208,
"step": 1180
},
{
"epoch": 0.118,
"eval_cos_sim": 0.8789265751838684,
"eval_loss": 0.12206284239041279,
"eval_runtime": 179.6264,
"eval_samples_per_second": 22.268,
"eval_steps_per_second": 1.392,
"step": 1180
},
{
"epoch": 0.119,
"grad_norm": 0.06535797566175461,
"learning_rate": 8.949384372096747e-07,
"loss": 0.1224,
"step": 1190
},
{
"epoch": 0.119,
"eval_cos_sim": 0.8789151310920715,
"eval_loss": 0.12207536175000142,
"eval_runtime": 173.573,
"eval_samples_per_second": 23.045,
"eval_steps_per_second": 1.44,
"step": 1190
},
{
"epoch": 0.12,
"grad_norm": 0.051111843436956406,
"learning_rate": 4.8370018959339916e-05,
"loss": 0.1216,
"step": 1200
},
{
"epoch": 0.12,
"eval_cos_sim": 0.878704845905304,
"eval_loss": 0.1222877917503066,
"eval_runtime": 170.7747,
"eval_samples_per_second": 23.423,
"eval_steps_per_second": 1.464,
"step": 1200
},
{
"epoch": 0.121,
"grad_norm": 0.07394807785749435,
"learning_rate": 3.862149207337666e-05,
"loss": 0.1227,
"step": 1210
},
{
"epoch": 0.121,
"eval_cos_sim": 0.8786987662315369,
"eval_loss": 0.12228692223774862,
"eval_runtime": 172.7735,
"eval_samples_per_second": 23.152,
"eval_steps_per_second": 1.447,
"step": 1210
},
{
"epoch": 0.122,
"grad_norm": 0.06019896641373634,
"learning_rate": 2.3619349222387182e-05,
"loss": 0.1194,
"step": 1220
},
{
"epoch": 0.122,
"eval_cos_sim": 0.8791972398757935,
"eval_loss": 0.12178870942341757,
"eval_runtime": 171.5715,
"eval_samples_per_second": 23.314,
"eval_steps_per_second": 1.457,
"step": 1220
},
{
"epoch": 0.123,
"grad_norm": 0.05350535735487938,
"learning_rate": 9.149703760694162e-06,
"loss": 0.1214,
"step": 1230
},
{
"epoch": 0.123,
"eval_cos_sim": 0.8792542219161987,
"eval_loss": 0.12173621847378684,
"eval_runtime": 173.1804,
"eval_samples_per_second": 23.097,
"eval_steps_per_second": 1.444,
"step": 1230
},
{
"epoch": 0.124,
"grad_norm": 0.06338366866111755,
"learning_rate": 7.932923650373624e-07,
"loss": 0.1194,
"step": 1240
},
{
"epoch": 0.124,
"eval_cos_sim": 0.8792427182197571,
"eval_loss": 0.12174849869954062,
"eval_runtime": 172.0716,
"eval_samples_per_second": 23.246,
"eval_steps_per_second": 1.453,
"step": 1240
},
{
"epoch": 0.125,
"grad_norm": 0.052142199128866196,
"learning_rate": 4.822693581319333e-05,
"loss": 0.12,
"step": 1250
},
{
"epoch": 0.125,
"eval_cos_sim": 0.8787649869918823,
"eval_loss": 0.1222243664478011,
"eval_runtime": 172.6696,
"eval_samples_per_second": 23.166,
"eval_steps_per_second": 1.448,
"step": 1250
},
{
"epoch": 0.126,
"grad_norm": 0.0695052519440651,
"learning_rate": 3.828886484552269e-05,
"loss": 0.1213,
"step": 1260
},
{
"epoch": 0.126,
"eval_cos_sim": 0.8785125017166138,
"eval_loss": 0.12247128774868916,
"eval_runtime": 182.4937,
"eval_samples_per_second": 21.919,
"eval_steps_per_second": 1.37,
"step": 1260
},
{
"epoch": 0.127,
"grad_norm": 0.07181504368782043,
"learning_rate": 2.3225467508799494e-05,
"loss": 0.1216,
"step": 1270
},
{
"epoch": 0.127,
"eval_cos_sim": 0.8791427612304688,
"eval_loss": 0.12184033658253621,
"eval_runtime": 172.8353,
"eval_samples_per_second": 23.143,
"eval_steps_per_second": 1.446,
"step": 1270
},
{
"epoch": 0.128,
"grad_norm": 0.06035405769944191,
"learning_rate": 8.846482142219678e-06,
"loss": 0.12,
"step": 1280
},
{
"epoch": 0.128,
"eval_cos_sim": 0.8793256282806396,
"eval_loss": 0.121661689779634,
"eval_runtime": 173.4166,
"eval_samples_per_second": 23.066,
"eval_steps_per_second": 1.442,
"step": 1280
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 110,
"trial_name": null,
"trial_params": null
}