{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5000.0,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 100.0,
      "grad_norm": 0.0051079667173326015,
      "learning_rate": 0.00019616000000000002,
      "loss": 0.1768,
      "step": 100
    },
    {
      "epoch": 200.0,
      "grad_norm": 0.0013037772150710225,
      "learning_rate": 0.00019216,
      "loss": 0.0,
      "step": 200
    },
    {
      "epoch": 300.0,
      "grad_norm": 0.00030190523830242455,
      "learning_rate": 0.00018816000000000001,
      "loss": 0.0,
      "step": 300
    },
    {
      "epoch": 400.0,
      "grad_norm": 0.00017417919298168272,
      "learning_rate": 0.00018416,
      "loss": 0.0,
      "step": 400
    },
    {
      "epoch": 500.0,
      "grad_norm": 0.00014137968537397683,
      "learning_rate": 0.00018016,
      "loss": 0.0,
      "step": 500
    },
    {
      "epoch": 600.0,
      "grad_norm": 0.0001240275305463001,
      "learning_rate": 0.00017616000000000002,
      "loss": 0.0,
      "step": 600
    },
    {
      "epoch": 700.0,
      "grad_norm": 9.807997412281111e-05,
      "learning_rate": 0.00017216,
      "loss": 0.0,
      "step": 700
    },
    {
      "epoch": 800.0,
      "grad_norm": 6.768624734831974e-05,
      "learning_rate": 0.00016816000000000002,
      "loss": 0.0,
      "step": 800
    },
    {
      "epoch": 900.0,
      "grad_norm": 5.961491842754185e-05,
      "learning_rate": 0.00016416,
      "loss": 0.0,
      "step": 900
    },
    {
      "epoch": 1000.0,
      "grad_norm": 5.017322473577224e-05,
      "learning_rate": 0.00016016,
      "loss": 0.0,
      "step": 1000
    },
    {
      "epoch": 1100.0,
      "grad_norm": 5.257365410216153e-05,
      "learning_rate": 0.00015616000000000002,
      "loss": 0.0,
      "step": 1100
    },
    {
      "epoch": 1200.0,
      "grad_norm": 5.0212354835821316e-05,
      "learning_rate": 0.00015216,
      "loss": 0.0,
      "step": 1200
    },
    {
      "epoch": 1300.0,
      "grad_norm": 0.00011130324128316715,
      "learning_rate": 0.00014816000000000002,
      "loss": 0.0,
      "step": 1300
    },
    {
      "epoch": 1400.0,
      "grad_norm": 3.4537704777903855e-05,
      "learning_rate": 0.00014416,
      "loss": 0.0,
      "step": 1400
    },
    {
      "epoch": 1500.0,
      "grad_norm": 2.7689882699633017e-05,
      "learning_rate": 0.00014016,
      "loss": 0.0,
      "step": 1500
    },
    {
      "epoch": 1600.0,
      "grad_norm": 2.726606180658564e-05,
      "learning_rate": 0.00013616,
      "loss": 0.0,
      "step": 1600
    },
    {
      "epoch": 1700.0,
      "grad_norm": 2.1775686036562547e-05,
      "learning_rate": 0.00013216,
      "loss": 0.0,
      "step": 1700
    },
    {
      "epoch": 1800.0,
      "grad_norm": 2.3525770302512683e-05,
      "learning_rate": 0.00012816000000000002,
      "loss": 0.0,
      "step": 1800
    },
    {
      "epoch": 1900.0,
      "grad_norm": 1.902567055367399e-05,
      "learning_rate": 0.00012416,
      "loss": 0.0,
      "step": 1900
    },
    {
      "epoch": 2000.0,
      "grad_norm": 2.1888447008677758e-05,
      "learning_rate": 0.00012016,
      "loss": 0.0,
      "step": 2000
    },
    {
      "epoch": 2100.0,
      "grad_norm": 1.896571302495431e-05,
      "learning_rate": 0.00011616,
      "loss": 0.0,
      "step": 2100
    },
    {
      "epoch": 2200.0,
      "grad_norm": 1.5480936781386845e-05,
      "learning_rate": 0.00011216,
      "loss": 0.0,
      "step": 2200
    },
    {
      "epoch": 2300.0,
      "grad_norm": 1.3961292097519618e-05,
      "learning_rate": 0.00010816,
      "loss": 0.0,
      "step": 2300
    },
    {
      "epoch": 2400.0,
      "grad_norm": 1.4109475159784779e-05,
      "learning_rate": 0.00010416000000000002,
      "loss": 0.0,
      "step": 2400
    },
    {
      "epoch": 2500.0,
      "grad_norm": 1.2665558642765973e-05,
      "learning_rate": 0.00010016,
      "loss": 0.0,
      "step": 2500
    },
    {
      "epoch": 2600.0,
      "grad_norm": 1.5646817701053806e-05,
      "learning_rate": 9.616e-05,
      "loss": 0.0,
      "step": 2600
    },
    {
      "epoch": 2700.0,
      "grad_norm": 1.2876950677309651e-05,
      "learning_rate": 9.216e-05,
      "loss": 0.0,
      "step": 2700
    },
    {
      "epoch": 2800.0,
      "grad_norm": 1.2121616236981936e-05,
      "learning_rate": 8.816000000000001e-05,
      "loss": 0.0,
      "step": 2800
    },
    {
      "epoch": 2900.0,
      "grad_norm": 1.4524578546115663e-05,
      "learning_rate": 8.416000000000001e-05,
      "loss": 0.0,
      "step": 2900
    },
    {
      "epoch": 3000.0,
      "grad_norm": 1.1223896763112862e-05,
      "learning_rate": 8.016e-05,
      "loss": 0.0,
      "step": 3000
    },
    {
      "epoch": 3100.0,
      "grad_norm": 8.85269673744915e-06,
      "learning_rate": 7.616e-05,
      "loss": 0.0,
      "step": 3100
    },
    {
      "epoch": 3200.0,
      "grad_norm": 1.264509955944959e-05,
      "learning_rate": 7.216e-05,
      "loss": 0.0,
      "step": 3200
    },
    {
      "epoch": 3300.0,
      "grad_norm": 8.284540854219813e-06,
      "learning_rate": 6.816e-05,
      "loss": 0.0,
      "step": 3300
    },
    {
      "epoch": 3400.0,
      "grad_norm": 8.871616046235431e-06,
      "learning_rate": 6.416e-05,
      "loss": 0.0,
      "step": 3400
    },
    {
      "epoch": 3500.0,
      "grad_norm": 9.966872312361374e-06,
      "learning_rate": 6.016000000000001e-05,
      "loss": 0.0,
      "step": 3500
    },
    {
      "epoch": 3600.0,
      "grad_norm": 2.9739601814071648e-05,
      "learning_rate": 5.6160000000000004e-05,
      "loss": 0.0,
      "step": 3600
    },
    {
      "epoch": 3700.0,
      "grad_norm": 7.714033927186392e-06,
      "learning_rate": 5.2159999999999995e-05,
      "loss": 0.0,
      "step": 3700
    },
    {
      "epoch": 3800.0,
      "grad_norm": 1.497406901762588e-05,
      "learning_rate": 4.816e-05,
      "loss": 0.0,
      "step": 3800
    },
    {
      "epoch": 3900.0,
      "grad_norm": 7.307490250241244e-06,
      "learning_rate": 4.4160000000000004e-05,
      "loss": 0.0,
      "step": 3900
    },
    {
      "epoch": 4000.0,
      "grad_norm": 6.682894763798686e-06,
      "learning_rate": 4.016e-05,
      "loss": 0.0,
      "step": 4000
    },
    {
      "epoch": 4100.0,
      "grad_norm": 7.749928954581264e-06,
      "learning_rate": 3.616e-05,
      "loss": 0.0,
      "step": 4100
    },
    {
      "epoch": 4200.0,
      "grad_norm": 1.01770574474358e-05,
      "learning_rate": 3.2160000000000004e-05,
      "loss": 0.0,
      "step": 4200
    },
    {
      "epoch": 4300.0,
      "grad_norm": 6.606936040043365e-06,
      "learning_rate": 2.816e-05,
      "loss": 0.0,
      "step": 4300
    },
    {
      "epoch": 4400.0,
      "grad_norm": 6.749212843715213e-06,
      "learning_rate": 2.4160000000000002e-05,
      "loss": 0.0,
      "step": 4400
    },
    {
      "epoch": 4500.0,
      "grad_norm": 8.575744686822873e-06,
      "learning_rate": 2.016e-05,
      "loss": 0.0,
      "step": 4500
    },
    {
      "epoch": 4600.0,
      "grad_norm": 6.673930329270661e-06,
      "learning_rate": 1.616e-05,
      "loss": 0.0,
      "step": 4600
    },
    {
      "epoch": 4700.0,
      "grad_norm": 6.32612272966071e-06,
      "learning_rate": 1.216e-05,
      "loss": 0.0,
      "step": 4700
    },
    {
      "epoch": 4800.0,
      "grad_norm": 6.985771960899001e-06,
      "learning_rate": 8.160000000000001e-06,
      "loss": 0.0,
      "step": 4800
    },
    {
      "epoch": 4900.0,
      "grad_norm": 5.245818101684563e-06,
      "learning_rate": 4.16e-06,
      "loss": 0.0,
      "step": 4900
    },
    {
      "epoch": 5000.0,
      "grad_norm": 5.854470146005042e-06,
      "learning_rate": 1.6e-07,
      "loss": 0.0,
      "step": 5000
    }
  ],
  "logging_steps": 100,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5000,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6755965747200000.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}