{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 16.931909561157227, "learning_rate": 2.53968253968254e-06, "loss": 17.7745, "step": 10 }, { "epoch": 0.16, "grad_norm": 20.85945701599121, "learning_rate": 5.7142857142857145e-06, "loss": 16.3129, "step": 20 }, { "epoch": 0.24, "grad_norm": 17.975467681884766, "learning_rate": 8.888888888888888e-06, "loss": 12.0213, "step": 30 }, { "epoch": 0.32, "grad_norm": 6.495634078979492, "learning_rate": 1.2063492063492064e-05, "loss": 6.7596, "step": 40 }, { "epoch": 0.4, "grad_norm": 5.885069847106934, "learning_rate": 1.523809523809524e-05, "loss": 3.8509, "step": 50 }, { "epoch": 0.48, "grad_norm": 4.575489521026611, "learning_rate": 1.8412698412698415e-05, "loss": 2.0809, "step": 60 }, { "epoch": 0.56, "grad_norm": 5.4581708908081055, "learning_rate": 1.999609421031453e-05, "loss": 1.503, "step": 70 }, { "epoch": 0.64, "grad_norm": 1.8591735363006592, "learning_rate": 1.9964866196679105e-05, "loss": 1.2903, "step": 80 }, { "epoch": 0.72, "grad_norm": 2.716611385345459, "learning_rate": 1.990250772639552e-05, "loss": 1.174, "step": 90 }, { "epoch": 0.8, "grad_norm": 1.5368252992630005, "learning_rate": 1.9809213608668188e-05, "loss": 1.1583, "step": 100 }, { "epoch": 0.88, "grad_norm": 1.6447538137435913, "learning_rate": 1.96852752963305e-05, "loss": 1.1363, "step": 110 }, { "epoch": 0.96, "grad_norm": 2.368715286254883, "learning_rate": 1.9531079975339912e-05, "loss": 1.1077, "step": 120 }, { "epoch": 1.04, "grad_norm": 2.465362787246704, "learning_rate": 1.9347109355200672e-05, "loss": 1.0916, "step": 130 }, { "epoch": 1.12, "grad_norm": 1.520612359046936, "learning_rate": 1.9133938164092942e-05, "loss": 1.0992, "step": 140 }, { "epoch": 1.2, "grad_norm": 1.3989702463150024, "learning_rate": 1.8892232353409582e-05, "loss": 1.0849, "step": 150 }, { "epoch": 1.28, "grad_norm": 1.8318055868148804, "learning_rate": 1.8622747017309676e-05, "loss": 1.0918, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 1.3566697835922241, "learning_rate": 1.832632403378808e-05, "loss": 1.0673, "step": 170 }, { "epoch": 1.44, "grad_norm": 1.595423936843872, "learning_rate": 1.8003889434630473e-05, "loss": 1.0789, "step": 180 }, { "epoch": 1.52, "grad_norm": 1.4190820455551147, "learning_rate": 1.765645051247007e-05, "loss": 1.0618, "step": 190 }, { "epoch": 1.6, "grad_norm": 1.8037772178649902, "learning_rate": 1.728509267398376e-05, "loss": 1.0658, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 2.374105453491211, "learning_rate": 1.6890976049058267e-05, "loss": 1.0481, "step": 210 }, { "epoch": 1.76, "grad_norm": 2.712444543838501, "learning_rate": 1.6475331866519387e-05, "loss": 1.0856, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 0.846947193145752, "learning_rate": 1.6039458607746614e-05, "loss": 1.0575, "step": 230 }, { "epoch": 1.92, "grad_norm": 1.0007365942001343, "learning_rate": 1.558471795018936e-05, "loss": 1.0391, "step": 240 }, { "epoch": 2.0, "grad_norm": 1.445178747177124, "learning_rate": 1.5112530513457236e-05, "loss": 1.0378, "step": 250 }, { "epoch": 2.08, "grad_norm": 1.1010501384735107, "learning_rate": 1.4624371421273823e-05, "loss": 1.0421, "step": 260 }, { "epoch": 2.16, "grad_norm": 1.2421690225601196, "learning_rate": 1.4121765693158364e-05, "loss": 1.0279, "step": 270 }, { "epoch": 2.24, "grad_norm": 1.585425853729248, "learning_rate": 1.3606283480231957e-05, "loss": 1.036, "step": 280 }, { "epoch": 2.32, "grad_norm": 1.0381609201431274, "learning_rate": 1.3079535160031598e-05, "loss": 1.0348, "step": 290 }, { "epoch": 2.4, "grad_norm": 1.5096782445907593, "learning_rate": 1.2543166305656099e-05, "loss": 1.0362, "step": 300 }, { "epoch": 2.48, "grad_norm": 0.7411025762557983, "learning_rate": 1.1998852544960266e-05, "loss": 1.0293, "step": 310 }, { "epoch": 2.56, "grad_norm": 1.0825903415679932, "learning_rate": 1.1448294325857387e-05, "loss": 1.0287, "step": 320 }, { "epoch": 2.64, "grad_norm": 1.187018871307373, "learning_rate": 1.0893211604083325e-05, "loss": 1.0302, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 1.1337244510650635, "learning_rate": 1.0335338470017742e-05, "loss": 1.0358, "step": 340 }, { "epoch": 2.8, "grad_norm": 1.3198221921920776, "learning_rate": 9.776417731348416e-06, "loss": 1.028, "step": 350 }, { "epoch": 2.88, "grad_norm": 1.04779851436615, "learning_rate": 9.218195468502462e-06, "loss": 1.0221, "step": 360 }, { "epoch": 2.96, "grad_norm": 1.228852391242981, "learning_rate": 8.662415579853492e-06, "loss": 1.0152, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.8538860082626343, "learning_rate": 8.110814333745496e-06, "loss": 1.0264, "step": 380 }, { "epoch": 3.12, "grad_norm": 1.0245864391326904, "learning_rate": 7.56511494435318e-06, "loss": 1.0161, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.983771800994873, "learning_rate": 7.027022188323716e-06, "loss": 1.0131, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.8150555491447449, "learning_rate": 6.498217079017818e-06, "loss": 1.0271, "step": 410 }, { "epoch": 3.36, "grad_norm": 1.072811484336853, "learning_rate": 5.980351614987759e-06, "loss": 1.0062, "step": 420 }, { "epoch": 3.44, "grad_norm": 1.0907390117645264, "learning_rate": 5.475043619098334e-06, "loss": 1.0137, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.8887478113174438, "learning_rate": 4.983871684413363e-06, "loss": 0.9897, "step": 440 }, { "epoch": 3.6, "grad_norm": 0.8817629218101501, "learning_rate": 4.508370242636968e-06, "loss": 1.0112, "step": 450 }, { "epoch": 3.68, "grad_norm": 1.7927525043487549, "learning_rate": 4.050024770515869e-06, "loss": 0.9977, "step": 460 }, { "epoch": 3.76, "grad_norm": 1.2345198392868042, "learning_rate": 3.6102671491780393e-06, "loss": 0.9906, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.9548463225364685, "learning_rate": 3.1904711909051933e-06, "loss": 1.0106, "step": 480 }, { "epoch": 3.92, "grad_norm": 1.2012273073196411, "learning_rate": 2.7919483473136678e-06, "loss": 0.9938, "step": 490 }, { "epoch": 4.0, "grad_norm": 1.8137465715408325, "learning_rate": 2.4159436123512737e-06, "loss": 0.9991, "step": 500 }, { "epoch": 4.08, "grad_norm": 0.9726725220680237, "learning_rate": 2.0636316329094317e-06, "loss": 0.9852, "step": 510 }, { "epoch": 4.16, "grad_norm": 0.9943102598190308, "learning_rate": 1.7361130392009407e-06, "loss": 0.9887, "step": 520 }, { "epoch": 4.24, "grad_norm": 0.8023040890693665, "learning_rate": 1.4344110063675143e-06, "loss": 0.9813, "step": 530 }, { "epoch": 4.32, "grad_norm": 0.987560510635376, "learning_rate": 1.1594680580585815e-06, "loss": 0.9934, "step": 540 }, { "epoch": 4.4, "grad_norm": 0.8008418083190918, "learning_rate": 9.121431219671096e-07, "loss": 0.9944, "step": 550 }, { "epoch": 4.48, "grad_norm": 1.4193716049194336, "learning_rate": 6.932088465209941e-07, "loss": 0.9935, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 1.3066496849060059, "learning_rate": 5.033491871127105e-07, "loss": 0.9825, "step": 570 }, { "epoch": 4.64, "grad_norm": 0.9865421056747437, "learning_rate": 3.4315726940795436e-07, "loss": 0.9849, "step": 580 }, { "epoch": 4.72, "grad_norm": 0.7975912690162659, "learning_rate": 2.1313353640827207e-07, "loss": 0.9803, "step": 590 }, { "epoch": 4.8, "grad_norm": 0.8560538291931152, "learning_rate": 1.1368418505635303e-07, "loss": 0.9821, "step": 600 }, { "epoch": 4.88, "grad_norm": 0.9261242747306824, "learning_rate": 4.5119897268023347e-08, "loss": 0.9878, "step": 610 }, { "epoch": 4.96, "grad_norm": 0.7569891214370728, "learning_rate": 7.654869355252503e-09, "loss": 0.9881, "step": 620 }, { "epoch": 5.0, "step": 625, "total_flos": 2.425513594532659e+16, "train_loss": 1.886077099609375, "train_runtime": 28535.108, "train_samples_per_second": 0.175, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.425513594532659e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }