{ "best_metric": 8.305614471435547, "best_model_checkpoint": "miner_id_24/checkpoint-350", "epoch": 1.0012771392081736, "eval_steps": 50, "global_step": 392, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002554278416347382, "eval_loss": 10.56713581085205, "eval_runtime": 1.4121, "eval_samples_per_second": 116.843, "eval_steps_per_second": 29.742, "step": 1 }, { "epoch": 0.02554278416347382, "grad_norm": 1.1778725385665894, "learning_rate": 4.34e-05, "loss": 10.552, "step": 10 }, { "epoch": 0.05108556832694764, "grad_norm": 1.8060922622680664, "learning_rate": 8.68e-05, "loss": 10.3268, "step": 20 }, { "epoch": 0.07662835249042145, "grad_norm": 1.1790982484817505, "learning_rate": 0.0001302, "loss": 9.6125, "step": 30 }, { "epoch": 0.10217113665389528, "grad_norm": 0.8280765414237976, "learning_rate": 0.0001736, "loss": 9.0527, "step": 40 }, { "epoch": 0.1277139208173691, "grad_norm": 0.7909080386161804, "learning_rate": 0.000217, "loss": 8.9191, "step": 50 }, { "epoch": 0.1277139208173691, "eval_loss": 8.911927223205566, "eval_runtime": 1.3472, "eval_samples_per_second": 122.473, "eval_steps_per_second": 31.175, "step": 50 }, { "epoch": 0.1532567049808429, "grad_norm": 0.5643665194511414, "learning_rate": 0.00021654255185873147, "loss": 8.9193, "step": 60 }, { "epoch": 0.17879948914431673, "grad_norm": 0.5557847023010254, "learning_rate": 0.0002151740647400309, "loss": 8.851, "step": 70 }, { "epoch": 0.20434227330779056, "grad_norm": 0.6045961976051331, "learning_rate": 0.00021290607803355364, "loss": 8.8021, "step": 80 }, { "epoch": 0.22988505747126436, "grad_norm": 0.514235258102417, "learning_rate": 0.0002097577159107891, "loss": 8.7032, "step": 90 }, { "epoch": 0.2554278416347382, "grad_norm": 1.2574142217636108, "learning_rate": 0.00020575552606576667, "loss": 8.654, "step": 100 }, { "epoch": 0.2554278416347382, "eval_loss": 8.747037887573242, "eval_runtime": 1.4221, "eval_samples_per_second": 116.024, "eval_steps_per_second": 29.533, "step": 100 }, { "epoch": 0.280970625798212, "grad_norm": 0.5849463939666748, "learning_rate": 0.00020093325585895849, "loss": 8.7102, "step": 110 }, { "epoch": 0.3065134099616858, "grad_norm": 0.5225157141685486, "learning_rate": 0.0001953315677519829, "loss": 8.7207, "step": 120 }, { "epoch": 0.33205619412515963, "grad_norm": 0.49455714225769043, "learning_rate": 0.00018899769643262803, "loss": 8.6173, "step": 130 }, { "epoch": 0.35759897828863346, "grad_norm": 0.5805558562278748, "learning_rate": 0.0001819850505213929, "loss": 8.5269, "step": 140 }, { "epoch": 0.3831417624521073, "grad_norm": 0.805574893951416, "learning_rate": 0.00017435276221804648, "loss": 8.3712, "step": 150 }, { "epoch": 0.3831417624521073, "eval_loss": 8.502363204956055, "eval_runtime": 1.3704, "eval_samples_per_second": 120.401, "eval_steps_per_second": 30.647, "step": 150 }, { "epoch": 0.4086845466155811, "grad_norm": 0.8308420181274414, "learning_rate": 0.00016616518868568545, "loss": 8.4712, "step": 160 }, { "epoch": 0.4342273307790549, "grad_norm": 0.8022027015686035, "learning_rate": 0.00015749136937673149, "loss": 8.4756, "step": 170 }, { "epoch": 0.45977011494252873, "grad_norm": 0.4959689676761627, "learning_rate": 0.00014840444387681742, "loss": 8.4593, "step": 180 }, { "epoch": 0.48531289910600256, "grad_norm": 0.9598937034606934, "learning_rate": 0.00013898103517543227, "loss": 8.2892, "step": 190 }, { "epoch": 0.5108556832694764, "grad_norm": 1.3006800413131714, "learning_rate": 0.0001293006035637256, "loss": 8.3764, "step": 200 }, { "epoch": 0.5108556832694764, "eval_loss": 8.402132987976074, "eval_runtime": 1.3881, "eval_samples_per_second": 118.864, "eval_steps_per_second": 30.256, "step": 200 }, { "epoch": 0.5363984674329502, "grad_norm": 0.5018998980522156, "learning_rate": 0.0001194447766075491, "loss": 8.4948, "step": 210 }, { "epoch": 0.561941251596424, "grad_norm": 0.6962450742721558, "learning_rate": 0.00010949666084555282, "loss": 8.436, "step": 220 }, { "epoch": 0.5874840357598978, "grad_norm": 0.6272366046905518, "learning_rate": 9.954014101625195e-05, "loss": 8.4713, "step": 230 }, { "epoch": 0.6130268199233716, "grad_norm": 0.5077914595603943, "learning_rate": 8.965917272313806e-05, "loss": 8.295, "step": 240 }, { "epoch": 0.6385696040868455, "grad_norm": 1.2477344274520874, "learning_rate": 7.993707450224101e-05, "loss": 8.3127, "step": 250 }, { "epoch": 0.6385696040868455, "eval_loss": 8.343680381774902, "eval_runtime": 1.3407, "eval_samples_per_second": 123.074, "eval_steps_per_second": 31.328, "step": 250 }, { "epoch": 0.6641123882503193, "grad_norm": 1.0321159362792969, "learning_rate": 7.045582526158615e-05, "loss": 8.3004, "step": 260 }, { "epoch": 0.6896551724137931, "grad_norm": 0.7519944906234741, "learning_rate": 6.129537301669508e-05, "loss": 8.4156, "step": 270 }, { "epoch": 0.7151979565772669, "grad_norm": 0.6195271611213684, "learning_rate": 5.253296075102587e-05, "loss": 8.3163, "step": 280 }, { "epoch": 0.7407407407407407, "grad_norm": 0.6736447811126709, "learning_rate": 4.4242475085849676e-05, "loss": 8.2839, "step": 290 }, { "epoch": 0.7662835249042146, "grad_norm": 1.1920803785324097, "learning_rate": 3.649382325172409e-05, "loss": 8.2662, "step": 300 }, { "epoch": 0.7662835249042146, "eval_loss": 8.317304611206055, "eval_runtime": 1.3354, "eval_samples_per_second": 123.555, "eval_steps_per_second": 31.45, "step": 300 }, { "epoch": 0.7918263090676884, "grad_norm": 0.7110276222229004, "learning_rate": 2.9352343615079657e-05, "loss": 8.3509, "step": 310 }, { "epoch": 0.8173690932311622, "grad_norm": 0.5742895007133484, "learning_rate": 2.287825473049131e-05, "loss": 8.2672, "step": 320 }, { "epoch": 0.842911877394636, "grad_norm": 0.6707843542098999, "learning_rate": 1.7126147564349132e-05, "loss": 8.2678, "step": 330 }, { "epoch": 0.8684546615581098, "grad_norm": 0.69368976354599, "learning_rate": 1.214452517161218e-05, "loss": 8.2203, "step": 340 }, { "epoch": 0.8939974457215837, "grad_norm": 1.245832920074463, "learning_rate": 7.975393707194009e-06, "loss": 8.2846, "step": 350 }, { "epoch": 0.8939974457215837, "eval_loss": 8.305614471435547, "eval_runtime": 1.3367, "eval_samples_per_second": 123.435, "eval_steps_per_second": 31.42, "step": 350 }, { "epoch": 0.9195402298850575, "grad_norm": 0.44001325964927673, "learning_rate": 4.653908220663318e-06, "loss": 8.3403, "step": 360 }, { "epoch": 0.9450830140485313, "grad_norm": 0.5452269315719604, "learning_rate": 2.208076220998228e-06, "loss": 8.333, "step": 370 }, { "epoch": 0.9706257982120051, "grad_norm": 1.2281514406204224, "learning_rate": 6.585215110024809e-07, "loss": 8.293, "step": 380 }, { "epoch": 0.9961685823754789, "grad_norm": 0.7243633270263672, "learning_rate": 1.8310282784486043e-08, "loss": 8.2837, "step": 390 } ], "logging_steps": 10, "max_steps": 392, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4054460525445120.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }