{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05735918320523116, "eval_steps": 10, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001147183664104623, "eval_loss": 1.873344898223877, "eval_runtime": 12.7647, "eval_samples_per_second": 515.797, "eval_steps_per_second": 8.069, "step": 10 }, { "epoch": 0.002294367328209246, "eval_loss": 1.8726389408111572, "eval_runtime": 12.8667, "eval_samples_per_second": 511.709, "eval_steps_per_second": 8.005, "step": 20 }, { "epoch": 0.0034415509923138693, "eval_loss": 1.8714078664779663, "eval_runtime": 12.9103, "eval_samples_per_second": 509.979, "eval_steps_per_second": 7.978, "step": 30 }, { "epoch": 0.004588734656418492, "eval_loss": 1.8696790933609009, "eval_runtime": 12.947, "eval_samples_per_second": 508.534, "eval_steps_per_second": 7.955, "step": 40 }, { "epoch": 0.0057359183205231154, "eval_loss": 1.8675329685211182, "eval_runtime": 12.9458, "eval_samples_per_second": 508.582, "eval_steps_per_second": 7.956, "step": 50 }, { "epoch": 0.006883101984627739, "eval_loss": 1.8649154901504517, "eval_runtime": 13.0432, "eval_samples_per_second": 504.785, "eval_steps_per_second": 7.897, "step": 60 }, { "epoch": 0.008030285648732363, "eval_loss": 1.8619294166564941, "eval_runtime": 13.0638, "eval_samples_per_second": 503.988, "eval_steps_per_second": 7.884, "step": 70 }, { "epoch": 0.009177469312836984, "eval_loss": 1.8583979606628418, "eval_runtime": 13.0482, "eval_samples_per_second": 504.592, "eval_steps_per_second": 7.894, "step": 80 }, { "epoch": 0.010324652976941608, "eval_loss": 1.85438871383667, "eval_runtime": 13.0615, "eval_samples_per_second": 504.075, "eval_steps_per_second": 7.886, "step": 90 }, { "epoch": 0.011471836641046231, "grad_norm": 9.938580513000488, "learning_rate": 3.8226299694189603e-07, "loss": 3.1046, "step": 100 }, { "epoch": 0.011471836641046231, "eval_loss": 1.849947214126587, "eval_runtime": 13.0663, "eval_samples_per_second": 503.89, "eval_steps_per_second": 7.883, "step": 100 }, { "epoch": 0.012619020305150854, "eval_loss": 1.8451412916183472, "eval_runtime": 12.9771, "eval_samples_per_second": 507.357, "eval_steps_per_second": 7.937, "step": 110 }, { "epoch": 0.013766203969255477, "eval_loss": 1.8399487733840942, "eval_runtime": 13.0209, "eval_samples_per_second": 505.648, "eval_steps_per_second": 7.91, "step": 120 }, { "epoch": 0.0149133876333601, "eval_loss": 1.8342881202697754, "eval_runtime": 13.0369, "eval_samples_per_second": 505.028, "eval_steps_per_second": 7.901, "step": 130 }, { "epoch": 0.016060571297464726, "eval_loss": 1.8283486366271973, "eval_runtime": 13.0149, "eval_samples_per_second": 505.88, "eval_steps_per_second": 7.914, "step": 140 }, { "epoch": 0.017207754961569347, "eval_loss": 1.822334885597229, "eval_runtime": 13.0213, "eval_samples_per_second": 505.632, "eval_steps_per_second": 7.91, "step": 150 }, { "epoch": 0.01835493862567397, "eval_loss": 1.8158738613128662, "eval_runtime": 13.0599, "eval_samples_per_second": 504.14, "eval_steps_per_second": 7.887, "step": 160 }, { "epoch": 0.019502122289778594, "eval_loss": 1.8090614080429077, "eval_runtime": 13.034, "eval_samples_per_second": 505.14, "eval_steps_per_second": 7.902, "step": 170 }, { "epoch": 0.020649305953883215, "eval_loss": 1.8015782833099365, "eval_runtime": 13.0665, "eval_samples_per_second": 503.885, "eval_steps_per_second": 7.883, "step": 180 }, { "epoch": 0.02179648961798784, "eval_loss": 1.793796420097351, "eval_runtime": 13.0555, "eval_samples_per_second": 504.31, "eval_steps_per_second": 7.889, "step": 190 }, { "epoch": 0.022943673282092462, "grad_norm": 4.906337738037109, "learning_rate": 7.645259938837921e-07, "loss": 3.0303, "step": 200 }, { "epoch": 0.022943673282092462, "eval_loss": 1.785815715789795, "eval_runtime": 12.9925, "eval_samples_per_second": 506.754, "eval_steps_per_second": 7.928, "step": 200 }, { "epoch": 0.024090856946197087, "eval_loss": 1.7775053977966309, "eval_runtime": 13.0639, "eval_samples_per_second": 503.986, "eval_steps_per_second": 7.884, "step": 210 }, { "epoch": 0.025238040610301708, "eval_loss": 1.7692992687225342, "eval_runtime": 13.0129, "eval_samples_per_second": 505.96, "eval_steps_per_second": 7.915, "step": 220 }, { "epoch": 0.026385224274406333, "eval_loss": 1.760453224182129, "eval_runtime": 13.0078, "eval_samples_per_second": 506.158, "eval_steps_per_second": 7.918, "step": 230 }, { "epoch": 0.027532407938510955, "eval_loss": 1.751396656036377, "eval_runtime": 12.9957, "eval_samples_per_second": 506.628, "eval_steps_per_second": 7.926, "step": 240 }, { "epoch": 0.02867959160261558, "eval_loss": 1.7417218685150146, "eval_runtime": 12.9774, "eval_samples_per_second": 507.344, "eval_steps_per_second": 7.937, "step": 250 }, { "epoch": 0.0298267752667202, "eval_loss": 1.7319914102554321, "eval_runtime": 13.0219, "eval_samples_per_second": 505.611, "eval_steps_per_second": 7.91, "step": 260 }, { "epoch": 0.030973958930824826, "eval_loss": 1.7227253913879395, "eval_runtime": 13.0026, "eval_samples_per_second": 506.361, "eval_steps_per_second": 7.922, "step": 270 }, { "epoch": 0.03212114259492945, "eval_loss": 1.7133797407150269, "eval_runtime": 12.9757, "eval_samples_per_second": 507.409, "eval_steps_per_second": 7.938, "step": 280 }, { "epoch": 0.03326832625903407, "eval_loss": 1.704041600227356, "eval_runtime": 12.9845, "eval_samples_per_second": 507.065, "eval_steps_per_second": 7.933, "step": 290 }, { "epoch": 0.034415509923138694, "grad_norm": 4.665822505950928, "learning_rate": 1.1467889908256882e-06, "loss": 2.9459, "step": 300 }, { "epoch": 0.034415509923138694, "eval_loss": 1.6940686702728271, "eval_runtime": 13.0019, "eval_samples_per_second": 506.387, "eval_steps_per_second": 7.922, "step": 300 }, { "epoch": 0.035562693587243316, "eval_loss": 1.683342695236206, "eval_runtime": 13.0065, "eval_samples_per_second": 506.209, "eval_steps_per_second": 7.919, "step": 310 }, { "epoch": 0.03670987725134794, "eval_loss": 1.6724653244018555, "eval_runtime": 13.0129, "eval_samples_per_second": 505.96, "eval_steps_per_second": 7.915, "step": 320 }, { "epoch": 0.037857060915452566, "eval_loss": 1.6614341735839844, "eval_runtime": 12.9921, "eval_samples_per_second": 506.769, "eval_steps_per_second": 7.928, "step": 330 }, { "epoch": 0.03900424457955719, "eval_loss": 1.6510112285614014, "eval_runtime": 13.0242, "eval_samples_per_second": 505.52, "eval_steps_per_second": 7.908, "step": 340 }, { "epoch": 0.04015142824366181, "eval_loss": 1.6401513814926147, "eval_runtime": 12.9214, "eval_samples_per_second": 509.542, "eval_steps_per_second": 7.971, "step": 350 }, { "epoch": 0.04129861190776643, "eval_loss": 1.6295816898345947, "eval_runtime": 12.9563, "eval_samples_per_second": 508.171, "eval_steps_per_second": 7.95, "step": 360 }, { "epoch": 0.04244579557187106, "eval_loss": 1.6187150478363037, "eval_runtime": 12.9758, "eval_samples_per_second": 507.405, "eval_steps_per_second": 7.938, "step": 370 }, { "epoch": 0.04359297923597568, "eval_loss": 1.607272982597351, "eval_runtime": 12.9876, "eval_samples_per_second": 506.947, "eval_steps_per_second": 7.931, "step": 380 }, { "epoch": 0.0447401629000803, "eval_loss": 1.5961676836013794, "eval_runtime": 12.9782, "eval_samples_per_second": 507.313, "eval_steps_per_second": 7.936, "step": 390 }, { "epoch": 0.045887346564184923, "grad_norm": 4.870114326477051, "learning_rate": 1.5290519877675841e-06, "loss": 2.7813, "step": 400 }, { "epoch": 0.045887346564184923, "eval_loss": 1.5848218202590942, "eval_runtime": 12.9783, "eval_samples_per_second": 507.309, "eval_steps_per_second": 7.936, "step": 400 }, { "epoch": 0.04703453022828955, "eval_loss": 1.5734797716140747, "eval_runtime": 12.9739, "eval_samples_per_second": 507.482, "eval_steps_per_second": 7.939, "step": 410 }, { "epoch": 0.04818171389239417, "eval_loss": 1.562021255493164, "eval_runtime": 12.9388, "eval_samples_per_second": 508.855, "eval_steps_per_second": 7.961, "step": 420 }, { "epoch": 0.049328897556498795, "eval_loss": 1.5495364665985107, "eval_runtime": 12.9412, "eval_samples_per_second": 508.764, "eval_steps_per_second": 7.959, "step": 430 }, { "epoch": 0.050476081220603417, "eval_loss": 1.5375314950942993, "eval_runtime": 12.9686, "eval_samples_per_second": 507.687, "eval_steps_per_second": 7.942, "step": 440 }, { "epoch": 0.051623264884708045, "eval_loss": 1.525598168373108, "eval_runtime": 12.9695, "eval_samples_per_second": 507.651, "eval_steps_per_second": 7.942, "step": 450 }, { "epoch": 0.052770448548812667, "eval_loss": 1.5132672786712646, "eval_runtime": 12.8961, "eval_samples_per_second": 510.543, "eval_steps_per_second": 7.987, "step": 460 }, { "epoch": 0.05391763221291729, "eval_loss": 1.5012215375900269, "eval_runtime": 12.9428, "eval_samples_per_second": 508.7, "eval_steps_per_second": 7.958, "step": 470 }, { "epoch": 0.05506481587702191, "eval_loss": 1.4892219305038452, "eval_runtime": 12.9208, "eval_samples_per_second": 509.567, "eval_steps_per_second": 7.972, "step": 480 }, { "epoch": 0.05621199954112653, "eval_loss": 1.4768636226654053, "eval_runtime": 12.9423, "eval_samples_per_second": 508.721, "eval_steps_per_second": 7.958, "step": 490 }, { "epoch": 0.05735918320523116, "grad_norm": 4.155641555786133, "learning_rate": 1.9113149847094803e-06, "loss": 2.6308, "step": 500 }, { "epoch": 0.05735918320523116, "eval_loss": 1.4640088081359863, "eval_runtime": 12.8729, "eval_samples_per_second": 511.462, "eval_steps_per_second": 8.001, "step": 500 } ], "logging_steps": 100, "max_steps": 26151, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }