{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 1000,
  "global_step": 1110,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04504504504504504,
      "grad_norm": 3.3412909507751465,
      "learning_rate": 1.801801801801802e-05,
      "loss": 1.2791,
      "step": 10
    },
    {
      "epoch": 0.09009009009009009,
      "grad_norm": 1.4792481660842896,
      "learning_rate": 3.603603603603604e-05,
      "loss": 1.0799,
      "step": 20
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 1.3788402080535889,
      "learning_rate": 5.405405405405406e-05,
      "loss": 0.7657,
      "step": 30
    },
    {
      "epoch": 0.18018018018018017,
      "grad_norm": 0.7668061256408691,
      "learning_rate": 7.207207207207208e-05,
      "loss": 0.5807,
      "step": 40
    },
    {
      "epoch": 0.22522522522522523,
      "grad_norm": 0.7166613936424255,
      "learning_rate": 9.009009009009009e-05,
      "loss": 0.6035,
      "step": 50
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.9661350846290588,
      "learning_rate": 0.00010810810810810812,
      "loss": 0.5441,
      "step": 60
    },
    {
      "epoch": 0.3153153153153153,
      "grad_norm": 0.7341681122779846,
      "learning_rate": 0.00012612612612612612,
      "loss": 0.6031,
      "step": 70
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 1.3319752216339111,
      "learning_rate": 0.00014414414414414415,
      "loss": 0.5686,
      "step": 80
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 0.7269447445869446,
      "learning_rate": 0.00016216216216216218,
      "loss": 0.4978,
      "step": 90
    },
    {
      "epoch": 0.45045045045045046,
      "grad_norm": 0.46780362725257874,
      "learning_rate": 0.00018018018018018018,
      "loss": 0.473,
      "step": 100
    },
    {
      "epoch": 0.4954954954954955,
      "grad_norm": 0.728823184967041,
      "learning_rate": 0.0001981981981981982,
      "loss": 0.5892,
      "step": 110
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.5054831504821777,
      "learning_rate": 0.0001999599507118322,
      "loss": 0.5117,
      "step": 120
    },
    {
      "epoch": 0.5855855855855856,
      "grad_norm": 0.6193355321884155,
      "learning_rate": 0.00019982154991201608,
      "loss": 0.4759,
      "step": 130
    },
    {
      "epoch": 0.6306306306306306,
      "grad_norm": 0.3995501399040222,
      "learning_rate": 0.00019958443999073397,
      "loss": 0.3928,
      "step": 140
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 0.5497131943702698,
      "learning_rate": 0.0001992488554155135,
      "loss": 0.4507,
      "step": 150
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 1.0290075540542603,
      "learning_rate": 0.00019881512803111796,
      "loss": 0.4766,
      "step": 160
    },
    {
      "epoch": 0.7657657657657657,
      "grad_norm": 0.6826834678649902,
      "learning_rate": 0.00019828368673139947,
      "loss": 0.5236,
      "step": 170
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.6135373711585999,
      "learning_rate": 0.00019765505703518496,
      "loss": 0.4454,
      "step": 180
    },
    {
      "epoch": 0.8558558558558559,
      "grad_norm": 0.6352598667144775,
      "learning_rate": 0.00019692986056661356,
      "loss": 0.508,
      "step": 190
    },
    {
      "epoch": 0.9009009009009009,
      "grad_norm": 0.5680545568466187,
      "learning_rate": 0.0001961088144404403,
      "loss": 0.5896,
      "step": 200
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 0.3386252820491791,
      "learning_rate": 0.00019519273055291266,
      "loss": 0.4729,
      "step": 210
    },
    {
      "epoch": 0.990990990990991,
      "grad_norm": 0.358553409576416,
      "learning_rate": 0.0001941825147789225,
      "loss": 0.45,
      "step": 220
    },
    {
      "epoch": 1.0360360360360361,
      "grad_norm": 0.668021023273468,
      "learning_rate": 0.0001930791660762262,
      "loss": 0.4162,
      "step": 230
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.5463367700576782,
      "learning_rate": 0.00019188377549761963,
      "loss": 0.4445,
      "step": 240
    },
    {
      "epoch": 1.1261261261261262,
      "grad_norm": 0.7385880351066589,
      "learning_rate": 0.000190597525112044,
      "loss": 0.3849,
      "step": 250
    },
    {
      "epoch": 1.1711711711711712,
      "grad_norm": 0.6837536692619324,
      "learning_rate": 0.0001892216868356904,
      "loss": 0.4652,
      "step": 260
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 0.866578221321106,
      "learning_rate": 0.00018775762117425777,
      "loss": 0.4648,
      "step": 270
    },
    {
      "epoch": 1.2612612612612613,
      "grad_norm": 0.6583455204963684,
      "learning_rate": 0.00018620677587760916,
      "loss": 0.3848,
      "step": 280
    },
    {
      "epoch": 1.3063063063063063,
      "grad_norm": 0.6937561631202698,
      "learning_rate": 0.00018457068450815562,
      "loss": 0.4532,
      "step": 290
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 0.5930050611495972,
      "learning_rate": 0.00018285096492438424,
      "loss": 0.5282,
      "step": 300
    },
    {
      "epoch": 1.3963963963963963,
      "grad_norm": 1.1432991027832031,
      "learning_rate": 0.0001810493176810292,
      "loss": 0.4369,
      "step": 310
    },
    {
      "epoch": 1.4414414414414414,
      "grad_norm": 0.5736434459686279,
      "learning_rate": 0.00017916752434746856,
      "loss": 0.4434,
      "step": 320
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 0.5792914032936096,
      "learning_rate": 0.00017720744574600863,
      "loss": 0.4434,
      "step": 330
    },
    {
      "epoch": 1.5315315315315314,
      "grad_norm": 0.7626290917396545,
      "learning_rate": 0.00017517102011179933,
      "loss": 0.4226,
      "step": 340
    },
    {
      "epoch": 1.5765765765765765,
      "grad_norm": 0.6746386289596558,
      "learning_rate": 0.00017306026117619889,
      "loss": 0.4126,
      "step": 350
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 0.5064342617988586,
      "learning_rate": 0.00017087725617548385,
      "loss": 0.3926,
      "step": 360
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.793991208076477,
      "learning_rate": 0.0001686241637868734,
      "loss": 0.4437,
      "step": 370
    },
    {
      "epoch": 1.7117117117117115,
      "grad_norm": 0.6197868585586548,
      "learning_rate": 0.00016630321199390867,
      "loss": 0.3932,
      "step": 380
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 0.5004612803459167,
      "learning_rate": 0.0001639166958832985,
      "loss": 0.3883,
      "step": 390
    },
    {
      "epoch": 1.8018018018018018,
      "grad_norm": 0.7265865206718445,
      "learning_rate": 0.00016146697537540924,
      "loss": 0.4453,
      "step": 400
    },
    {
      "epoch": 1.8468468468468469,
      "grad_norm": 0.5155379772186279,
      "learning_rate": 0.00015895647289064396,
      "loss": 0.48,
      "step": 410
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.5756716132164001,
      "learning_rate": 0.0001563876709540178,
      "loss": 0.4874,
      "step": 420
    },
    {
      "epoch": 1.936936936936937,
      "grad_norm": 0.7919459342956543,
      "learning_rate": 0.00015376310974029873,
      "loss": 0.4075,
      "step": 430
    },
    {
      "epoch": 1.981981981981982,
      "grad_norm": 0.5977569818496704,
      "learning_rate": 0.0001510853845621409,
      "loss": 0.504,
      "step": 440
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 0.601466178894043,
      "learning_rate": 0.00014835714330369446,
      "loss": 0.3732,
      "step": 450
    },
    {
      "epoch": 2.0720720720720722,
      "grad_norm": 0.575406014919281,
      "learning_rate": 0.00014558108380223012,
      "loss": 0.3317,
      "step": 460
    },
    {
      "epoch": 2.1171171171171173,
      "grad_norm": 1.0440267324447632,
      "learning_rate": 0.00014275995118036693,
      "loss": 0.3896,
      "step": 470
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.6646713614463806,
      "learning_rate": 0.00013989653513154165,
      "loss": 0.3478,
      "step": 480
    },
    {
      "epoch": 2.2072072072072073,
      "grad_norm": 0.7202288508415222,
      "learning_rate": 0.00013699366716140435,
      "loss": 0.3712,
      "step": 490
    },
    {
      "epoch": 2.2522522522522523,
      "grad_norm": 0.6566169261932373,
      "learning_rate": 0.00013405421778786737,
      "loss": 0.3548,
      "step": 500
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 1.0158584117889404,
      "learning_rate": 0.00013108109370257712,
      "loss": 0.3404,
      "step": 510
    },
    {
      "epoch": 2.3423423423423424,
      "grad_norm": 0.7582752108573914,
      "learning_rate": 0.00012807723489661495,
      "loss": 0.374,
      "step": 520
    },
    {
      "epoch": 2.3873873873873874,
      "grad_norm": 0.7685467600822449,
      "learning_rate": 0.00012504561175326985,
      "loss": 0.3127,
      "step": 530
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.9077286124229431,
      "learning_rate": 0.00012198922211075778,
      "loss": 0.353,
      "step": 540
    },
    {
      "epoch": 2.4774774774774775,
      "grad_norm": 0.9107437133789062,
      "learning_rate": 0.00011891108829779165,
      "loss": 0.3531,
      "step": 550
    },
    {
      "epoch": 2.5225225225225225,
      "grad_norm": 1.1385325193405151,
      "learning_rate": 0.0001158142541449341,
      "loss": 0.3695,
      "step": 560
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 0.9225629568099976,
      "learning_rate": 0.00011270178197468789,
      "loss": 0.3606,
      "step": 570
    },
    {
      "epoch": 2.6126126126126126,
      "grad_norm": 0.6338076591491699,
      "learning_rate": 0.00010957674957330042,
      "loss": 0.312,
      "step": 580
    },
    {
      "epoch": 2.6576576576576576,
      "grad_norm": 1.3998136520385742,
      "learning_rate": 0.00010644224714727681,
      "loss": 0.4027,
      "step": 590
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 0.598822832107544,
      "learning_rate": 0.00010330137426761135,
      "loss": 0.3496,
      "step": 600
    },
    {
      "epoch": 2.7477477477477477,
      "grad_norm": 0.9068642854690552,
      "learning_rate": 0.00010015723680475846,
      "loss": 0.3489,
      "step": 610
    },
    {
      "epoch": 2.7927927927927927,
      "grad_norm": 0.4025176167488098,
      "learning_rate": 9.70129438573747e-05,
      "loss": 0.3296,
      "step": 620
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 0.6708613634109497,
      "learning_rate": 9.38716046778684e-05,
      "loss": 0.3004,
      "step": 630
    },
    {
      "epoch": 2.8828828828828827,
      "grad_norm": 0.7858556509017944,
      "learning_rate": 9.07363255977973e-05,
      "loss": 0.3716,
      "step": 640
    },
    {
      "epoch": 2.9279279279279278,
      "grad_norm": 0.6855165958404541,
      "learning_rate": 8.76102069561545e-05,
      "loss": 0.311,
      "step": 650
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 0.6526620388031006,
      "learning_rate": 8.449634003358022e-05,
      "loss": 0.3488,
      "step": 660
    },
    {
      "epoch": 3.018018018018018,
      "grad_norm": 0.3698066174983978,
      "learning_rate": 8.13978039955308e-05,
      "loss": 0.2858,
      "step": 670
    },
    {
      "epoch": 3.063063063063063,
      "grad_norm": 0.8586738705635071,
      "learning_rate": 7.831766284742807e-05,
      "loss": 0.2565,
      "step": 680
    },
    {
      "epoch": 3.108108108108108,
      "grad_norm": 0.8718597292900085,
      "learning_rate": 7.525896240479976e-05,
      "loss": 0.2173,
      "step": 690
    },
    {
      "epoch": 3.153153153153153,
      "grad_norm": 0.7671772241592407,
      "learning_rate": 7.222472728140695e-05,
      "loss": 0.2548,
      "step": 700
    },
    {
      "epoch": 3.1981981981981984,
      "grad_norm": 1.2702572345733643,
      "learning_rate": 6.921795789833723e-05,
      "loss": 0.2638,
      "step": 710
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 1.4898873567581177,
      "learning_rate": 6.624162751702076e-05,
      "loss": 0.2623,
      "step": 720
    },
    {
      "epoch": 3.2882882882882885,
      "grad_norm": 1.0137726068496704,
      "learning_rate": 6.329867929910347e-05,
      "loss": 0.2938,
      "step": 730
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.9631416201591492,
      "learning_rate": 6.039202339608432e-05,
      "loss": 0.2443,
      "step": 740
    },
    {
      "epoch": 3.3783783783783785,
      "grad_norm": 0.8912140130996704,
      "learning_rate": 5.752453407159522e-05,
      "loss": 0.2359,
      "step": 750
    },
    {
      "epoch": 3.4234234234234235,
      "grad_norm": 0.9686083793640137,
      "learning_rate": 5.469904685916861e-05,
      "loss": 0.2465,
      "step": 760
    },
    {
      "epoch": 3.4684684684684686,
      "grad_norm": 1.942658543586731,
      "learning_rate": 5.191835575830352e-05,
      "loss": 0.3042,
      "step": 770
    },
    {
      "epoch": 3.5135135135135136,
      "grad_norm": 1.2755067348480225,
      "learning_rate": 4.918521047160308e-05,
      "loss": 0.2885,
      "step": 780
    },
    {
      "epoch": 3.5585585585585586,
      "grad_norm": 0.8992679715156555,
      "learning_rate": 4.650231368571486e-05,
      "loss": 0.2728,
      "step": 790
    },
    {
      "epoch": 3.6036036036036037,
      "grad_norm": 2.1154401302337646,
      "learning_rate": 4.387231839876349e-05,
      "loss": 0.258,
      "step": 800
    },
    {
      "epoch": 3.6486486486486487,
      "grad_norm": 0.6957826018333435,
      "learning_rate": 4.129782529691815e-05,
      "loss": 0.3219,
      "step": 810
    },
    {
      "epoch": 3.6936936936936937,
      "grad_norm": 0.8093072175979614,
      "learning_rate": 3.878138018268866e-05,
      "loss": 0.2318,
      "step": 820
    },
    {
      "epoch": 3.7387387387387387,
      "grad_norm": 0.900164008140564,
      "learning_rate": 3.632547145749395e-05,
      "loss": 0.3025,
      "step": 830
    },
    {
      "epoch": 3.7837837837837838,
      "grad_norm": 1.3732051849365234,
      "learning_rate": 3.393252766099187e-05,
      "loss": 0.2744,
      "step": 840
    },
    {
      "epoch": 3.828828828828829,
      "grad_norm": 1.3438997268676758,
      "learning_rate": 3.1604915069603436e-05,
      "loss": 0.2663,
      "step": 850
    },
    {
      "epoch": 3.873873873873874,
      "grad_norm": 0.7277224063873291,
      "learning_rate": 2.9344935356606773e-05,
      "loss": 0.2058,
      "step": 860
    },
    {
      "epoch": 3.918918918918919,
      "grad_norm": 0.9671199321746826,
      "learning_rate": 2.7154823316113932e-05,
      "loss": 0.2466,
      "step": 870
    },
    {
      "epoch": 3.963963963963964,
      "grad_norm": 1.0856068134307861,
      "learning_rate": 2.5036744653181753e-05,
      "loss": 0.2695,
      "step": 880
    },
    {
      "epoch": 4.009009009009009,
      "grad_norm": 0.7191686034202576,
      "learning_rate": 2.29927938422419e-05,
      "loss": 0.2252,
      "step": 890
    },
    {
      "epoch": 4.054054054054054,
      "grad_norm": 0.9094095826148987,
      "learning_rate": 2.102499205596743e-05,
      "loss": 0.2067,
      "step": 900
    },
    {
      "epoch": 4.099099099099099,
      "grad_norm": 1.2016669511795044,
      "learning_rate": 1.913528516662452e-05,
      "loss": 0.2165,
      "step": 910
    },
    {
      "epoch": 4.1441441441441444,
      "grad_norm": 1.6922552585601807,
      "learning_rate": 1.7325541821885384e-05,
      "loss": 0.2102,
      "step": 920
    },
    {
      "epoch": 4.1891891891891895,
      "grad_norm": 1.52359139919281,
      "learning_rate": 1.5597551597004966e-05,
      "loss": 0.1765,
      "step": 930
    },
    {
      "epoch": 4.2342342342342345,
      "grad_norm": 1.333765983581543,
      "learning_rate": 1.3953023225189243e-05,
      "loss": 0.2147,
      "step": 940
    },
    {
      "epoch": 4.2792792792792795,
      "grad_norm": 0.9832772016525269,
      "learning_rate": 1.23935829079042e-05,
      "loss": 0.2068,
      "step": 950
    },
    {
      "epoch": 4.324324324324325,
      "grad_norm": 0.7258216738700867,
      "learning_rate": 1.0920772706797167e-05,
      "loss": 0.1884,
      "step": 960
    },
    {
      "epoch": 4.36936936936937,
      "grad_norm": 1.0229756832122803,
      "learning_rate": 9.536049018820192e-06,
      "loss": 0.2135,
      "step": 970
    },
    {
      "epoch": 4.414414414414415,
      "grad_norm": 1.0085179805755615,
      "learning_rate": 8.240781136063346e-06,
      "loss": 0.1831,
      "step": 980
    },
    {
      "epoch": 4.45945945945946,
      "grad_norm": 0.7446288466453552,
      "learning_rate": 7.03624989172228e-06,
      "loss": 0.198,
      "step": 990
    },
    {
      "epoch": 4.504504504504505,
      "grad_norm": 0.8291650414466858,
      "learning_rate": 5.9236463935389065e-06,
      "loss": 0.2189,
      "step": 1000
    },
    {
      "epoch": 4.504504504504505,
      "eval_loss": 0.9809222221374512,
      "eval_runtime": 10.6739,
      "eval_samples_per_second": 35.039,
      "eval_steps_per_second": 4.403,
      "step": 1000
    },
    {
      "epoch": 4.54954954954955,
      "grad_norm": 1.1298563480377197,
      "learning_rate": 4.904070845967468e-06,
      "loss": 0.1889,
      "step": 1010
    },
    {
      "epoch": 4.594594594594595,
      "grad_norm": 1.0232703685760498,
      "learning_rate": 3.9785314622310495e-06,
      "loss": 0.1891,
      "step": 1020
    },
    {
      "epoch": 4.63963963963964,
      "grad_norm": 1.2712104320526123,
      "learning_rate": 3.1479434673440167e-06,
      "loss": 0.1879,
      "step": 1030
    },
    {
      "epoch": 4.684684684684685,
      "grad_norm": 1.564489722251892,
      "learning_rate": 2.4131281930864002e-06,
      "loss": 0.1972,
      "step": 1040
    },
    {
      "epoch": 4.72972972972973,
      "grad_norm": 1.4100459814071655,
      "learning_rate": 1.7748122658251876e-06,
      "loss": 0.201,
      "step": 1050
    },
    {
      "epoch": 4.774774774774775,
      "grad_norm": 1.3149417638778687,
      "learning_rate": 1.2336268879856727e-06,
      "loss": 0.1876,
      "step": 1060
    },
    {
      "epoch": 4.81981981981982,
      "grad_norm": 0.8505904674530029,
      "learning_rate": 7.901072138831511e-07,
      "loss": 0.1722,
      "step": 1070
    },
    {
      "epoch": 4.864864864864865,
      "grad_norm": 2.1957037448883057,
      "learning_rate": 4.44691820532539e-07,
      "loss": 0.1917,
      "step": 1080
    },
    {
      "epoch": 4.90990990990991,
      "grad_norm": 1.9867583513259888,
      "learning_rate": 1.977222739588891e-07,
      "loss": 0.2082,
      "step": 1090
    },
    {
      "epoch": 4.954954954954955,
      "grad_norm": 1.539480447769165,
      "learning_rate": 4.9442791437848136e-08,
      "loss": 0.2052,
      "step": 1100
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.6789027452468872,
      "learning_rate": 0.0,
      "loss": 0.197,
      "step": 1110
    },
    {
      "epoch": 5.0,
      "step": 1110,
      "total_flos": 1.01086802968209e+18,
      "train_loss": 0.36593411194311604,
      "train_runtime": 2755.6936,
      "train_samples_per_second": 12.872,
      "train_steps_per_second": 0.403
    }
  ],
  "logging_steps": 10,
  "max_steps": 1110,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.01086802968209e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}