{ "best_metric": 0.7105076909065247, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.2170138888888889, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00043402777777777775, "eval_loss": 1.0486223697662354, "eval_runtime": 49.5892, "eval_samples_per_second": 9.78, "eval_steps_per_second": 2.46, "step": 1 }, { "epoch": 0.004340277777777778, "grad_norm": 1.30818772315979, "learning_rate": 4.2600000000000005e-05, "loss": 0.912, "step": 10 }, { "epoch": 0.008680555555555556, "grad_norm": 0.8930862545967102, "learning_rate": 8.520000000000001e-05, "loss": 0.8451, "step": 20 }, { "epoch": 0.013020833333333334, "grad_norm": 1.0381461381912231, "learning_rate": 0.0001278, "loss": 0.5786, "step": 30 }, { "epoch": 0.017361111111111112, "grad_norm": 1.2415528297424316, "learning_rate": 0.00017040000000000002, "loss": 0.6245, "step": 40 }, { "epoch": 0.021701388888888888, "grad_norm": 0.8442565202713013, "learning_rate": 0.000213, "loss": 0.4496, "step": 50 }, { "epoch": 0.021701388888888888, "eval_loss": 0.7272957563400269, "eval_runtime": 49.5625, "eval_samples_per_second": 9.786, "eval_steps_per_second": 2.462, "step": 50 }, { "epoch": 0.026041666666666668, "grad_norm": 1.2658860683441162, "learning_rate": 0.00021274057135267128, "loss": 0.8503, "step": 60 }, { "epoch": 0.030381944444444444, "grad_norm": 1.3917545080184937, "learning_rate": 0.00021196354932097723, "loss": 0.6671, "step": 70 }, { "epoch": 0.034722222222222224, "grad_norm": 1.3726558685302734, "learning_rate": 0.0002106727194781503, "loss": 0.7479, "step": 80 }, { "epoch": 0.0390625, "grad_norm": 1.26426100730896, "learning_rate": 0.00020887437061743096, "loss": 0.8955, "step": 90 }, { "epoch": 0.043402777777777776, "grad_norm": 0.8687089085578918, "learning_rate": 0.00020657726411369925, "loss": 0.8438, "step": 100 }, { "epoch": 0.043402777777777776, "eval_loss": 0.7392935752868652, "eval_runtime": 49.5607, "eval_samples_per_second": 9.786, "eval_steps_per_second": 2.462, "step": 100 }, { "epoch": 0.04774305555555555, "grad_norm": 0.9549110531806946, "learning_rate": 0.000203792591238937, "loss": 0.7712, "step": 110 }, { "epoch": 0.052083333333333336, "grad_norm": 1.3070204257965088, "learning_rate": 0.0002005339186394757, "loss": 0.669, "step": 120 }, { "epoch": 0.05642361111111111, "grad_norm": 1.2052979469299316, "learning_rate": 0.00019681712224065936, "loss": 0.616, "step": 130 }, { "epoch": 0.06076388888888889, "grad_norm": 0.6207852363586426, "learning_rate": 0.0001926603099009319, "loss": 0.7878, "step": 140 }, { "epoch": 0.06510416666666667, "grad_norm": 0.7814576029777527, "learning_rate": 0.00018808373319217114, "loss": 0.4888, "step": 150 }, { "epoch": 0.06510416666666667, "eval_loss": 0.7211872339248657, "eval_runtime": 49.5742, "eval_samples_per_second": 9.783, "eval_steps_per_second": 2.461, "step": 150 }, { "epoch": 0.06944444444444445, "grad_norm": 0.8724411129951477, "learning_rate": 0.00018310968873606635, "loss": 0.7598, "step": 160 }, { "epoch": 0.07378472222222222, "grad_norm": 1.6653738021850586, "learning_rate": 0.0001777624095772184, "loss": 0.8254, "step": 170 }, { "epoch": 0.078125, "grad_norm": 1.2747048139572144, "learning_rate": 0.0001720679471221826, "loss": 0.8594, "step": 180 }, { "epoch": 0.08246527777777778, "grad_norm": 1.4110088348388672, "learning_rate": 0.00016605404421963453, "loss": 0.7506, "step": 190 }, { "epoch": 0.08680555555555555, "grad_norm": 2.366544485092163, "learning_rate": 0.00015975, "loss": 0.9703, "step": 200 }, { "epoch": 0.08680555555555555, "eval_loss": 0.7333279252052307, "eval_runtime": 49.6093, "eval_samples_per_second": 9.776, "eval_steps_per_second": 2.459, "step": 200 }, { "epoch": 0.09114583333333333, "grad_norm": 1.0888848304748535, "learning_rate": 0.00015318652713303674, "loss": 0.5779, "step": 210 }, { "epoch": 0.0954861111111111, "grad_norm": 0.9510017037391663, "learning_rate": 0.00014639560219879464, "loss": 0.7727, "step": 220 }, { "epoch": 0.0998263888888889, "grad_norm": 0.347385048866272, "learning_rate": 0.0001394103099009319, "loss": 0.6551, "step": 230 }, { "epoch": 0.10416666666666667, "grad_norm": 1.4552265405654907, "learning_rate": 0.0001322646818813646, "loss": 0.6279, "step": 240 }, { "epoch": 0.10850694444444445, "grad_norm": 1.388659119606018, "learning_rate": 0.0001249935309215281, "loss": 0.7938, "step": 250 }, { "epoch": 0.10850694444444445, "eval_loss": 0.7187796831130981, "eval_runtime": 49.5717, "eval_samples_per_second": 9.784, "eval_steps_per_second": 2.461, "step": 250 }, { "epoch": 0.11284722222222222, "grad_norm": 0.9789396524429321, "learning_rate": 0.0001176322813380051, "loss": 0.8391, "step": 260 }, { "epoch": 0.1171875, "grad_norm": 1.4175513982772827, "learning_rate": 0.00011021679639881638, "loss": 0.8921, "step": 270 }, { "epoch": 0.12152777777777778, "grad_norm": 1.268831729888916, "learning_rate": 0.00010278320360118368, "loss": 0.5002, "step": 280 }, { "epoch": 0.12586805555555555, "grad_norm": 0.8849596977233887, "learning_rate": 9.536771866199493e-05, "loss": 0.602, "step": 290 }, { "epoch": 0.13020833333333334, "grad_norm": 1.0034514665603638, "learning_rate": 8.800646907847192e-05, "loss": 0.8343, "step": 300 }, { "epoch": 0.13020833333333334, "eval_loss": 0.7189058065414429, "eval_runtime": 49.5602, "eval_samples_per_second": 9.786, "eval_steps_per_second": 2.462, "step": 300 }, { "epoch": 0.1345486111111111, "grad_norm": 0.5106523036956787, "learning_rate": 8.07353181186354e-05, "loss": 0.715, "step": 310 }, { "epoch": 0.1388888888888889, "grad_norm": 0.9988264441490173, "learning_rate": 7.35896900990681e-05, "loss": 0.8572, "step": 320 }, { "epoch": 0.14322916666666666, "grad_norm": 1.3042094707489014, "learning_rate": 6.660439780120536e-05, "loss": 0.7248, "step": 330 }, { "epoch": 0.14756944444444445, "grad_norm": 0.9319302439689636, "learning_rate": 5.981347286696324e-05, "loss": 0.873, "step": 340 }, { "epoch": 0.1519097222222222, "grad_norm": 1.1595097780227661, "learning_rate": 5.325000000000002e-05, "loss": 0.5117, "step": 350 }, { "epoch": 0.1519097222222222, "eval_loss": 0.7169974446296692, "eval_runtime": 49.5441, "eval_samples_per_second": 9.789, "eval_steps_per_second": 2.462, "step": 350 }, { "epoch": 0.15625, "grad_norm": 1.3601690530776978, "learning_rate": 4.6945955780365475e-05, "loss": 0.8535, "step": 360 }, { "epoch": 0.1605902777777778, "grad_norm": 0.8528502583503723, "learning_rate": 4.0932052877817393e-05, "loss": 0.7758, "step": 370 }, { "epoch": 0.16493055555555555, "grad_norm": 0.9841234683990479, "learning_rate": 3.523759042278163e-05, "loss": 0.6716, "step": 380 }, { "epoch": 0.16927083333333334, "grad_norm": 0.7331548929214478, "learning_rate": 2.989031126393367e-05, "loss": 0.9835, "step": 390 }, { "epoch": 0.1736111111111111, "grad_norm": 1.9498828649520874, "learning_rate": 2.4916266807828855e-05, "loss": 0.9712, "step": 400 }, { "epoch": 0.1736111111111111, "eval_loss": 0.7105076909065247, "eval_runtime": 49.5867, "eval_samples_per_second": 9.781, "eval_steps_per_second": 2.46, "step": 400 }, { "epoch": 0.1779513888888889, "grad_norm": 1.3422185182571411, "learning_rate": 2.033969009906811e-05, "loss": 0.8554, "step": 410 }, { "epoch": 0.18229166666666666, "grad_norm": 1.4816110134124756, "learning_rate": 1.6182877759340637e-05, "loss": 0.6037, "step": 420 }, { "epoch": 0.18663194444444445, "grad_norm": 0.8355005383491516, "learning_rate": 1.2466081360524275e-05, "loss": 0.6468, "step": 430 }, { "epoch": 0.1909722222222222, "grad_norm": 0.657798171043396, "learning_rate": 9.207408761062996e-06, "loss": 0.48, "step": 440 }, { "epoch": 0.1953125, "grad_norm": 1.6586601734161377, "learning_rate": 6.422735886300764e-06, "loss": 0.6231, "step": 450 }, { "epoch": 0.1953125, "eval_loss": 0.712680459022522, "eval_runtime": 49.6689, "eval_samples_per_second": 9.765, "eval_steps_per_second": 2.456, "step": 450 }, { "epoch": 0.1996527777777778, "grad_norm": 0.8355949521064758, "learning_rate": 4.125629382569038e-06, "loss": 0.6722, "step": 460 }, { "epoch": 0.20399305555555555, "grad_norm": 1.7075914144515991, "learning_rate": 2.327280521849694e-06, "loss": 0.66, "step": 470 }, { "epoch": 0.20833333333333334, "grad_norm": 0.7151721119880676, "learning_rate": 1.0364506790227565e-06, "loss": 0.6677, "step": 480 }, { "epoch": 0.2126736111111111, "grad_norm": 0.9590463042259216, "learning_rate": 2.5942864732872295e-07, "loss": 0.9534, "step": 490 }, { "epoch": 0.2170138888888889, "grad_norm": 2.80539870262146, "learning_rate": 0.0, "loss": 0.8418, "step": 500 }, { "epoch": 0.2170138888888889, "eval_loss": 0.7122297286987305, "eval_runtime": 49.691, "eval_samples_per_second": 9.76, "eval_steps_per_second": 2.455, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.64727392017449e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }