{ "best_metric": null, "best_model_checkpoint": null, "epoch": 17.094017094017094, "eval_steps": 1000, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005698005698005698, "grad_norm": 6.307312488555908, "learning_rate": 0.0, "loss": 3.0911, "step": 1 }, { "epoch": 0.2849002849002849, "grad_norm": 4.076864719390869, "learning_rate": 0.0001977977977977978, "loss": 1.4649, "step": 50 }, { "epoch": 0.5698005698005698, "grad_norm": 2.819526195526123, "learning_rate": 0.0001952952952952953, "loss": 1.1601, "step": 100 }, { "epoch": 0.8547008547008547, "grad_norm": 3.7976534366607666, "learning_rate": 0.00019279279279279282, "loss": 1.1284, "step": 150 }, { "epoch": 1.1396011396011396, "grad_norm": 3.787907838821411, "learning_rate": 0.0001902902902902903, "loss": 0.91, "step": 200 }, { "epoch": 1.4245014245014245, "grad_norm": 3.807788133621216, "learning_rate": 0.0001877877877877878, "loss": 0.7007, "step": 250 }, { "epoch": 1.7094017094017095, "grad_norm": 3.6782045364379883, "learning_rate": 0.00018528528528528532, "loss": 0.6793, "step": 300 }, { "epoch": 1.9943019943019942, "grad_norm": 3.648346424102783, "learning_rate": 0.0001827827827827828, "loss": 0.7325, "step": 350 }, { "epoch": 2.2792022792022792, "grad_norm": 3.3711447715759277, "learning_rate": 0.00018028028028028027, "loss": 0.3936, "step": 400 }, { "epoch": 2.564102564102564, "grad_norm": 3.362125873565674, "learning_rate": 0.00017777777777777779, "loss": 0.3844, "step": 450 }, { "epoch": 2.849002849002849, "grad_norm": 3.8006858825683594, "learning_rate": 0.00017527527527527528, "loss": 0.4327, "step": 500 }, { "epoch": 3.133903133903134, "grad_norm": 3.940809488296509, "learning_rate": 0.00017277277277277277, "loss": 0.3364, "step": 550 }, { "epoch": 3.4188034188034186, "grad_norm": 3.061803102493286, "learning_rate": 0.00017027027027027028, "loss": 0.2445, "step": 600 }, { "epoch": 3.7037037037037037, "grad_norm": 3.389284372329712, "learning_rate": 0.00016776776776776777, "loss": 0.2597, "step": 650 }, { "epoch": 3.9886039886039883, "grad_norm": 3.320084810256958, "learning_rate": 0.00016526526526526526, "loss": 0.2698, "step": 700 }, { "epoch": 4.273504273504273, "grad_norm": 2.7199738025665283, "learning_rate": 0.00016276276276276275, "loss": 0.1781, "step": 750 }, { "epoch": 4.5584045584045585, "grad_norm": 3.226743459701538, "learning_rate": 0.00016026026026026027, "loss": 0.1902, "step": 800 }, { "epoch": 4.843304843304844, "grad_norm": 4.62879753112793, "learning_rate": 0.00015775775775775776, "loss": 0.209, "step": 850 }, { "epoch": 5.128205128205128, "grad_norm": 2.747284412384033, "learning_rate": 0.00015525525525525525, "loss": 0.1786, "step": 900 }, { "epoch": 5.413105413105413, "grad_norm": 2.259187936782837, "learning_rate": 0.00015275275275275277, "loss": 0.1536, "step": 950 }, { "epoch": 5.698005698005698, "grad_norm": 1.9622772932052612, "learning_rate": 0.00015025025025025026, "loss": 0.156, "step": 1000 }, { "epoch": 5.982905982905983, "grad_norm": 2.236668825149536, "learning_rate": 0.00014774774774774775, "loss": 0.1618, "step": 1050 }, { "epoch": 6.267806267806268, "grad_norm": 3.983106851577759, "learning_rate": 0.00014524524524524526, "loss": 0.1273, "step": 1100 }, { "epoch": 6.552706552706553, "grad_norm": 1.7318657636642456, "learning_rate": 0.00014274274274274275, "loss": 0.1334, "step": 1150 }, { "epoch": 6.837606837606837, "grad_norm": 1.885850191116333, "learning_rate": 0.00014024024024024024, 
"loss": 0.1329, "step": 1200 }, { "epoch": 7.122507122507122, "grad_norm": 4.529383659362793, "learning_rate": 0.00013773773773773776, "loss": 0.1313, "step": 1250 }, { "epoch": 7.407407407407407, "grad_norm": 2.217284679412842, "learning_rate": 0.00013523523523523525, "loss": 0.114, "step": 1300 }, { "epoch": 7.6923076923076925, "grad_norm": 2.4019832611083984, "learning_rate": 0.00013273273273273274, "loss": 0.1175, "step": 1350 }, { "epoch": 7.977207977207978, "grad_norm": 2.7499570846557617, "learning_rate": 0.00013023023023023023, "loss": 0.1229, "step": 1400 }, { "epoch": 8.262108262108262, "grad_norm": 1.7632925510406494, "learning_rate": 0.00012772772772772775, "loss": 0.0974, "step": 1450 }, { "epoch": 8.547008547008547, "grad_norm": 1.798587441444397, "learning_rate": 0.00012522522522522524, "loss": 0.1031, "step": 1500 }, { "epoch": 8.831908831908832, "grad_norm": 2.925875663757324, "learning_rate": 0.00012272272272272273, "loss": 0.1098, "step": 1550 }, { "epoch": 9.116809116809117, "grad_norm": 2.593219757080078, "learning_rate": 0.00012022022022022023, "loss": 0.0979, "step": 1600 }, { "epoch": 9.401709401709402, "grad_norm": 2.457099199295044, "learning_rate": 0.00011771771771771771, "loss": 0.0917, "step": 1650 }, { "epoch": 9.686609686609687, "grad_norm": 2.513124465942383, "learning_rate": 0.00011521521521521521, "loss": 0.0884, "step": 1700 }, { "epoch": 9.971509971509972, "grad_norm": 0.9835783243179321, "learning_rate": 0.00011271271271271271, "loss": 0.0891, "step": 1750 }, { "epoch": 10.256410256410255, "grad_norm": 1.4291648864746094, "learning_rate": 0.0001102102102102102, "loss": 0.0792, "step": 1800 }, { "epoch": 10.54131054131054, "grad_norm": 0.974391758441925, "learning_rate": 0.00010770770770770771, "loss": 0.0834, "step": 1850 }, { "epoch": 10.826210826210826, "grad_norm": 2.3604581356048584, "learning_rate": 0.0001052052052052052, "loss": 0.0829, "step": 1900 }, { "epoch": 11.11111111111111, "grad_norm": 1.6176166534423828, "learning_rate": 0.0001027027027027027, "loss": 0.08, "step": 1950 }, { "epoch": 11.396011396011396, "grad_norm": 1.9266570806503296, "learning_rate": 0.0001002002002002002, "loss": 0.0763, "step": 2000 }, { "epoch": 11.68091168091168, "grad_norm": 2.1315979957580566, "learning_rate": 9.76976976976977e-05, "loss": 0.0769, "step": 2050 }, { "epoch": 11.965811965811966, "grad_norm": 1.4953100681304932, "learning_rate": 9.51951951951952e-05, "loss": 0.0787, "step": 2100 }, { "epoch": 12.250712250712251, "grad_norm": 1.715878963470459, "learning_rate": 9.26926926926927e-05, "loss": 0.0699, "step": 2150 }, { "epoch": 12.535612535612536, "grad_norm": 0.5827972888946533, "learning_rate": 9.019019019019019e-05, "loss": 0.0738, "step": 2200 }, { "epoch": 12.820512820512821, "grad_norm": 0.3549967408180237, "learning_rate": 8.76876876876877e-05, "loss": 0.0743, "step": 2250 }, { "epoch": 13.105413105413106, "grad_norm": 2.361628532409668, "learning_rate": 8.518518518518518e-05, "loss": 0.0692, "step": 2300 }, { "epoch": 13.39031339031339, "grad_norm": 1.2454289197921753, "learning_rate": 8.268268268268269e-05, "loss": 0.0711, "step": 2350 }, { "epoch": 13.675213675213675, "grad_norm": 1.1497353315353394, "learning_rate": 8.018018018018019e-05, "loss": 0.0688, "step": 2400 }, { "epoch": 13.96011396011396, "grad_norm": 2.107264995574951, "learning_rate": 7.767767767767768e-05, "loss": 0.0706, "step": 2450 }, { "epoch": 14.245014245014245, "grad_norm": 0.4413589537143707, "learning_rate": 7.517517517517519e-05, "loss": 0.0664, "step": 2500 }, { 
"epoch": 14.52991452991453, "grad_norm": 1.1788307428359985, "learning_rate": 7.267267267267268e-05, "loss": 0.0658, "step": 2550 }, { "epoch": 14.814814814814815, "grad_norm": 0.5639584064483643, "learning_rate": 7.017017017017016e-05, "loss": 0.063, "step": 2600 }, { "epoch": 15.0997150997151, "grad_norm": 0.2644463777542114, "learning_rate": 6.766766766766767e-05, "loss": 0.0653, "step": 2650 }, { "epoch": 15.384615384615385, "grad_norm": 0.10696238279342651, "learning_rate": 6.516516516516516e-05, "loss": 0.0614, "step": 2700 }, { "epoch": 15.66951566951567, "grad_norm": 1.1413618326187134, "learning_rate": 6.266266266266266e-05, "loss": 0.0637, "step": 2750 }, { "epoch": 15.954415954415955, "grad_norm": 0.09536276012659073, "learning_rate": 6.016016016016016e-05, "loss": 0.0617, "step": 2800 }, { "epoch": 16.23931623931624, "grad_norm": 0.09245631843805313, "learning_rate": 5.765765765765766e-05, "loss": 0.06, "step": 2850 }, { "epoch": 16.524216524216524, "grad_norm": 1.483127474784851, "learning_rate": 5.515515515515516e-05, "loss": 0.0605, "step": 2900 }, { "epoch": 16.80911680911681, "grad_norm": 0.12009686231613159, "learning_rate": 5.2652652652652655e-05, "loss": 0.062, "step": 2950 }, { "epoch": 17.094017094017094, "grad_norm": 0.0926225408911705, "learning_rate": 5.015015015015015e-05, "loss": 0.0586, "step": 3000 } ], "logging_steps": 50, "max_steps": 4001, "num_input_tokens_seen": 0, "num_train_epochs": 23, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.09931874049065e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }