Upload model

Browse files

Files changed (8) hide show

config.json +40 -0
generation_config.json +8 -0
pytorch_model.bin +3 -0
special_tokens_map.json +24 -0
tokenizer.json +0 -0
tokenizer_config.json +26 -0
trainer_state.json +1947 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "/home/madaraszg/mamba/test_mamba_wsd/checkpoint-570000",
+  "architectures": [
+    "MambaForCausalLM"
+  ],
+  "bos_token_id": 0,
+  "conv_kernel": 4,
+  "d_inner": 1536,
+  "d_model": 768,
+  "eos_token_id": 0,
+  "expand": 2,
+  "fused_add_norm": true,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.1,
+  "intermediate_size": 1536,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "mamba",
+  "n_layer": 24,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "pad_vocab_size_multiple": 8,
+  "rescale_prenorm_residual": false,
+  "residual_in_fp32": true,
+  "rms_norm": true,
+  "ssm_cfg": {},
+  "state_size": 16,
+  "time_step_floor": 0.0001,
+  "time_step_init_scheme": "random",
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 48,
+  "time_step_scale": 1.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "use_bias": false,
+  "use_cache": false,
+  "use_conv_bias": true,
+  "vocab_size": 52000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.41.2",
+  "use_cache": false
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a311dc3bee49f979d95d6c130e6b6b4df18d8a6b9fca0ce3a9c6e17a08c9061
+size 521905938

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "max_length": 2048,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "stride": 0,
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<|endoftext|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1947 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9786327442581368,
+  "eval_steps": 15000,
+  "global_step": 370000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.003967430044289744,
+      "grad_norm": 1.5698450803756714,
+      "learning_rate": 2e-05,
+      "loss": 4.231,
+      "step": 1500
+    },
+    {
+      "epoch": 0.007934860088579488,
+      "grad_norm": 1.8081231117248535,
+      "learning_rate": 2e-05,
+      "loss": 4.1347,
+      "step": 3000
+    },
+    {
+      "epoch": 0.011902290132869232,
+      "grad_norm": 1.1643619537353516,
+      "learning_rate": 2e-05,
+      "loss": 4.1056,
+      "step": 4500
+    },
+    {
+      "epoch": 0.015869720177158975,
+      "grad_norm": 1.3358386754989624,
+      "learning_rate": 2e-05,
+      "loss": 4.0762,
+      "step": 6000
+    },
+    {
+      "epoch": 0.019837150221448722,
+      "grad_norm": 1.3741443157196045,
+      "learning_rate": 2e-05,
+      "loss": 4.0576,
+      "step": 7500
+    },
+    {
+      "epoch": 0.023804580265738465,
+      "grad_norm": 1.3668568134307861,
+      "learning_rate": 2e-05,
+      "loss": 4.0384,
+      "step": 9000
+    },
+    {
+      "epoch": 0.027772010310028208,
+      "grad_norm": 1.714483618736267,
+      "learning_rate": 2e-05,
+      "loss": 4.0187,
+      "step": 10500
+    },
+    {
+      "epoch": 0.03173944035431795,
+      "grad_norm": 1.3867794275283813,
+      "learning_rate": 2e-05,
+      "loss": 4.0159,
+      "step": 12000
+    },
+    {
+      "epoch": 0.0357068703986077,
+      "grad_norm": 1.603532075881958,
+      "learning_rate": 2e-05,
+      "loss": 3.9967,
+      "step": 13500
+    },
+    {
+      "epoch": 0.039674300442897444,
+      "grad_norm": 1.5365407466888428,
+      "learning_rate": 2e-05,
+      "loss": 3.9848,
+      "step": 15000
+    },
+    {
+      "epoch": 0.039674300442897444,
+      "eval_loss": 3.973594903945923,
+      "eval_runtime": 509.3826,
+      "eval_samples_per_second": 131.161,
+      "eval_steps_per_second": 32.791,
+      "step": 15000
+    },
+    {
+      "epoch": 0.04364173048718718,
+      "grad_norm": 1.728336215019226,
+      "learning_rate": 2e-05,
+      "loss": 3.9812,
+      "step": 16500
+    },
+    {
+      "epoch": 0.04760916053147693,
+      "grad_norm": 1.8370418548583984,
+      "learning_rate": 2e-05,
+      "loss": 3.9633,
+      "step": 18000
+    },
+    {
+      "epoch": 0.051576590575766676,
+      "grad_norm": 1.6304991245269775,
+      "learning_rate": 2e-05,
+      "loss": 3.9549,
+      "step": 19500
+    },
+    {
+      "epoch": 0.055544020620056415,
+      "grad_norm": 1.4472306966781616,
+      "learning_rate": 2e-05,
+      "loss": 3.9518,
+      "step": 21000
+    },
+    {
+      "epoch": 0.05951145066434616,
+      "grad_norm": 2.202906370162964,
+      "learning_rate": 2e-05,
+      "loss": 3.9497,
+      "step": 22500
+    },
+    {
+      "epoch": 0.0634788807086359,
+      "grad_norm": 1.5186316967010498,
+      "learning_rate": 2e-05,
+      "loss": 3.9362,
+      "step": 24000
+    },
+    {
+      "epoch": 0.06744631075292565,
+      "grad_norm": 2.0651211738586426,
+      "learning_rate": 2e-05,
+      "loss": 3.932,
+      "step": 25500
+    },
+    {
+      "epoch": 0.0714137407972154,
+      "grad_norm": 1.750518560409546,
+      "learning_rate": 2e-05,
+      "loss": 3.9276,
+      "step": 27000
+    },
+    {
+      "epoch": 0.07538117084150514,
+      "grad_norm": 2.3646767139434814,
+      "learning_rate": 2e-05,
+      "loss": 3.9108,
+      "step": 28500
+    },
+    {
+      "epoch": 0.07934860088579489,
+      "grad_norm": 1.3822572231292725,
+      "learning_rate": 2e-05,
+      "loss": 3.9242,
+      "step": 30000
+    },
+    {
+      "epoch": 0.07934860088579489,
+      "eval_loss": 3.8984525203704834,
+      "eval_runtime": 509.4664,
+      "eval_samples_per_second": 131.139,
+      "eval_steps_per_second": 32.785,
+      "step": 30000
+    },
+    {
+      "epoch": 0.08331603093008462,
+      "grad_norm": 1.394733190536499,
+      "learning_rate": 2e-05,
+      "loss": 3.9112,
+      "step": 31500
+    },
+    {
+      "epoch": 0.08728346097437437,
+      "grad_norm": 2.038745641708374,
+      "learning_rate": 2e-05,
+      "loss": 3.9065,
+      "step": 33000
+    },
+    {
+      "epoch": 0.09125089101866411,
+      "grad_norm": 1.3629461526870728,
+      "learning_rate": 2e-05,
+      "loss": 3.8944,
+      "step": 34500
+    },
+    {
+      "epoch": 0.09521832106295386,
+      "grad_norm": 1.59019136428833,
+      "learning_rate": 2e-05,
+      "loss": 3.899,
+      "step": 36000
+    },
+    {
+      "epoch": 0.0991857511072436,
+      "grad_norm": 1.7221766710281372,
+      "learning_rate": 2e-05,
+      "loss": 3.8944,
+      "step": 37500
+    },
+    {
+      "epoch": 0.10315318115153335,
+      "grad_norm": 1.5057470798492432,
+      "learning_rate": 2e-05,
+      "loss": 3.8929,
+      "step": 39000
+    },
+    {
+      "epoch": 0.10712061119582308,
+      "grad_norm": 1.812050223350525,
+      "learning_rate": 2e-05,
+      "loss": 3.8788,
+      "step": 40500
+    },
+    {
+      "epoch": 0.11108804124011283,
+      "grad_norm": 1.7102704048156738,
+      "learning_rate": 2e-05,
+      "loss": 3.8753,
+      "step": 42000
+    },
+    {
+      "epoch": 0.11505547128440258,
+      "grad_norm": 1.408778190612793,
+      "learning_rate": 2e-05,
+      "loss": 3.8705,
+      "step": 43500
+    },
+    {
+      "epoch": 0.11902290132869232,
+      "grad_norm": 1.8378455638885498,
+      "learning_rate": 2e-05,
+      "loss": 3.8678,
+      "step": 45000
+    },
+    {
+      "epoch": 0.11902290132869232,
+      "eval_loss": 3.851247787475586,
+      "eval_runtime": 509.1413,
+      "eval_samples_per_second": 131.223,
+      "eval_steps_per_second": 32.806,
+      "step": 45000
+    },
+    {
+      "epoch": 0.12299033137298207,
+      "grad_norm": 1.7914859056472778,
+      "learning_rate": 2e-05,
+      "loss": 3.864,
+      "step": 46500
+    },
+    {
+      "epoch": 0.1269577614172718,
+      "grad_norm": 1.6122477054595947,
+      "learning_rate": 2e-05,
+      "loss": 3.8561,
+      "step": 48000
+    },
+    {
+      "epoch": 0.13092519146156156,
+      "grad_norm": 1.827723741531372,
+      "learning_rate": 2e-05,
+      "loss": 3.8639,
+      "step": 49500
+    },
+    {
+      "epoch": 0.1348926215058513,
+      "grad_norm": 1.9765942096710205,
+      "learning_rate": 2e-05,
+      "loss": 3.8486,
+      "step": 51000
+    },
+    {
+      "epoch": 0.13886005155014106,
+      "grad_norm": 1.3416199684143066,
+      "learning_rate": 2e-05,
+      "loss": 3.8514,
+      "step": 52500
+    },
+    {
+      "epoch": 0.1428274815944308,
+      "grad_norm": 2.1308629512786865,
+      "learning_rate": 2e-05,
+      "loss": 3.834,
+      "step": 54000
+    },
+    {
+      "epoch": 0.14679491163872052,
+      "grad_norm": 1.630344271659851,
+      "learning_rate": 2e-05,
+      "loss": 3.8438,
+      "step": 55500
+    },
+    {
+      "epoch": 0.15076234168301028,
+      "grad_norm": 1.311733603477478,
+      "learning_rate": 2e-05,
+      "loss": 3.8387,
+      "step": 57000
+    },
+    {
+      "epoch": 0.15472977172730001,
+      "grad_norm": 1.2008074522018433,
+      "learning_rate": 2e-05,
+      "loss": 3.831,
+      "step": 58500
+    },
+    {
+      "epoch": 0.15869720177158977,
+      "grad_norm": 1.804685115814209,
+      "learning_rate": 2e-05,
+      "loss": 3.8369,
+      "step": 60000
+    },
+    {
+      "epoch": 0.15869720177158977,
+      "eval_loss": 3.8154687881469727,
+      "eval_runtime": 509.3502,
+      "eval_samples_per_second": 131.169,
+      "eval_steps_per_second": 32.793,
+      "step": 60000
+    },
+    {
+      "epoch": 0.1626646318158795,
+      "grad_norm": 2.4677481651306152,
+      "learning_rate": 2e-05,
+      "loss": 3.8405,
+      "step": 61500
+    },
+    {
+      "epoch": 0.16663206186016924,
+      "grad_norm": 3.788940668106079,
+      "learning_rate": 2e-05,
+      "loss": 3.8371,
+      "step": 63000
+    },
+    {
+      "epoch": 0.170599491904459,
+      "grad_norm": 1.9231897592544556,
+      "learning_rate": 2e-05,
+      "loss": 3.8197,
+      "step": 64500
+    },
+    {
+      "epoch": 0.17456692194874873,
+      "grad_norm": 2.9989049434661865,
+      "learning_rate": 2e-05,
+      "loss": 3.8248,
+      "step": 66000
+    },
+    {
+      "epoch": 0.1785343519930385,
+      "grad_norm": 1.5718803405761719,
+      "learning_rate": 2e-05,
+      "loss": 3.818,
+      "step": 67500
+    },
+    {
+      "epoch": 0.18250178203732823,
+      "grad_norm": 2.03694748878479,
+      "learning_rate": 2e-05,
+      "loss": 3.8277,
+      "step": 69000
+    },
+    {
+      "epoch": 0.18646921208161796,
+      "grad_norm": 3.6177451610565186,
+      "learning_rate": 2e-05,
+      "loss": 3.8079,
+      "step": 70500
+    },
+    {
+      "epoch": 0.19043664212590772,
+      "grad_norm": 1.6187258958816528,
+      "learning_rate": 2e-05,
+      "loss": 3.8093,
+      "step": 72000
+    },
+    {
+      "epoch": 0.19440407217019745,
+      "grad_norm": 1.4624522924423218,
+      "learning_rate": 2e-05,
+      "loss": 3.8206,
+      "step": 73500
+    },
+    {
+      "epoch": 0.1983715022144872,
+      "grad_norm": 1.632415533065796,
+      "learning_rate": 2e-05,
+      "loss": 3.8037,
+      "step": 75000
+    },
+    {
+      "epoch": 0.1983715022144872,
+      "eval_loss": 3.787764072418213,
+      "eval_runtime": 515.1464,
+      "eval_samples_per_second": 129.693,
+      "eval_steps_per_second": 32.424,
+      "step": 75000
+    },
+    {
+      "epoch": 0.20233893225877694,
+      "grad_norm": 1.420238971710205,
+      "learning_rate": 2e-05,
+      "loss": 3.8085,
+      "step": 76500
+    },
+    {
+      "epoch": 0.2063063623030667,
+      "grad_norm": 1.5415395498275757,
+      "learning_rate": 2e-05,
+      "loss": 3.8071,
+      "step": 78000
+    },
+    {
+      "epoch": 0.21027379234735644,
+      "grad_norm": 1.9765852689743042,
+      "learning_rate": 2e-05,
+      "loss": 3.797,
+      "step": 79500
+    },
+    {
+      "epoch": 0.21424122239164617,
+      "grad_norm": 2.0840232372283936,
+      "learning_rate": 2e-05,
+      "loss": 3.7955,
+      "step": 81000
+    },
+    {
+      "epoch": 0.21820865243593593,
+      "grad_norm": 1.6770437955856323,
+      "learning_rate": 2e-05,
+      "loss": 3.7961,
+      "step": 82500
+    },
+    {
+      "epoch": 0.22217608248022566,
+      "grad_norm": 2.207707166671753,
+      "learning_rate": 2e-05,
+      "loss": 3.7832,
+      "step": 84000
+    },
+    {
+      "epoch": 0.22614351252451542,
+      "grad_norm": 1.7404212951660156,
+      "learning_rate": 2e-05,
+      "loss": 3.7895,
+      "step": 85500
+    },
+    {
+      "epoch": 0.23011094256880515,
+      "grad_norm": 1.492181658744812,
+      "learning_rate": 2e-05,
+      "loss": 3.7876,
+      "step": 87000
+    },
+    {
+      "epoch": 0.2340783726130949,
+      "grad_norm": 1.1175812482833862,
+      "learning_rate": 2e-05,
+      "loss": 3.792,
+      "step": 88500
+    },
+    {
+      "epoch": 0.23804580265738465,
+      "grad_norm": 1.816019892692566,
+      "learning_rate": 2e-05,
+      "loss": 3.7788,
+      "step": 90000
+    },
+    {
+      "epoch": 0.23804580265738465,
+      "eval_loss": 3.7638604640960693,
+      "eval_runtime": 517.2086,
+      "eval_samples_per_second": 129.176,
+      "eval_steps_per_second": 32.295,
+      "step": 90000
+    },
+    {
+      "epoch": 0.24201323270167438,
+      "grad_norm": 2.108680009841919,
+      "learning_rate": 2e-05,
+      "loss": 3.7833,
+      "step": 91500
+    },
+    {
+      "epoch": 0.24598066274596414,
+      "grad_norm": 1.6215394735336304,
+      "learning_rate": 2e-05,
+      "loss": 3.7684,
+      "step": 93000
+    },
+    {
+      "epoch": 0.24994809279025387,
+      "grad_norm": 1.3144052028656006,
+      "learning_rate": 2e-05,
+      "loss": 3.7716,
+      "step": 94500
+    },
+    {
+      "epoch": 0.2539155228345436,
+      "grad_norm": 1.788275957107544,
+      "learning_rate": 2e-05,
+      "loss": 3.7698,
+      "step": 96000
+    },
+    {
+      "epoch": 0.25788295287883334,
+      "grad_norm": 2.0639145374298096,
+      "learning_rate": 2e-05,
+      "loss": 3.7684,
+      "step": 97500
+    },
+    {
+      "epoch": 0.2618503829231231,
+      "grad_norm": 1.38687002658844,
+      "learning_rate": 2e-05,
+      "loss": 3.7696,
+      "step": 99000
+    },
+    {
+      "epoch": 0.26581781296741286,
+      "grad_norm": 1.4817004203796387,
+      "learning_rate": 2e-05,
+      "loss": 3.7673,
+      "step": 100500
+    },
+    {
+      "epoch": 0.2697852430117026,
+      "grad_norm": 1.7259806394577026,
+      "learning_rate": 2e-05,
+      "loss": 3.7602,
+      "step": 102000
+    },
+    {
+      "epoch": 0.2737526730559923,
+      "grad_norm": 1.340493083000183,
+      "learning_rate": 2e-05,
+      "loss": 3.7514,
+      "step": 103500
+    },
+    {
+      "epoch": 0.2777201031002821,
+      "grad_norm": 2.0217068195343018,
+      "learning_rate": 2e-05,
+      "loss": 3.7618,
+      "step": 105000
+    },
+    {
+      "epoch": 0.2777201031002821,
+      "eval_loss": 3.743399143218994,
+      "eval_runtime": 527.3734,
+      "eval_samples_per_second": 126.686,
+      "eval_steps_per_second": 31.672,
+      "step": 105000
+    },
+    {
+      "epoch": 0.28168753314457184,
+      "grad_norm": 1.1739765405654907,
+      "learning_rate": 2e-05,
+      "loss": 3.7599,
+      "step": 106500
+    },
+    {
+      "epoch": 0.2856549631888616,
+      "grad_norm": 1.692695140838623,
+      "learning_rate": 2e-05,
+      "loss": 3.7447,
+      "step": 108000
+    },
+    {
+      "epoch": 0.2896223932331513,
+      "grad_norm": 1.8593926429748535,
+      "learning_rate": 2e-05,
+      "loss": 3.7533,
+      "step": 109500
+    },
+    {
+      "epoch": 0.29358982327744104,
+      "grad_norm": 2.278003454208374,
+      "learning_rate": 2e-05,
+      "loss": 3.7555,
+      "step": 111000
+    },
+    {
+      "epoch": 0.29755725332173083,
+      "grad_norm": 1.2629317045211792,
+      "learning_rate": 2e-05,
+      "loss": 3.7502,
+      "step": 112500
+    },
+    {
+      "epoch": 0.30152468336602056,
+      "grad_norm": 1.8427543640136719,
+      "learning_rate": 2e-05,
+      "loss": 3.7669,
+      "step": 114000
+    },
+    {
+      "epoch": 0.3054921134103103,
+      "grad_norm": 2.1399271488189697,
+      "learning_rate": 2e-05,
+      "loss": 3.7441,
+      "step": 115500
+    },
+    {
+      "epoch": 0.30945954345460003,
+      "grad_norm": 1.726097822189331,
+      "learning_rate": 2e-05,
+      "loss": 3.7413,
+      "step": 117000
+    },
+    {
+      "epoch": 0.31342697349888976,
+      "grad_norm": 2.9889960289001465,
+      "learning_rate": 2e-05,
+      "loss": 3.7435,
+      "step": 118500
+    },
+    {
+      "epoch": 0.31739440354317955,
+      "grad_norm": 1.6429541110992432,
+      "learning_rate": 2e-05,
+      "loss": 3.7321,
+      "step": 120000
+    },
+    {
+      "epoch": 0.31739440354317955,
+      "eval_loss": 3.725595235824585,
+      "eval_runtime": 507.8039,
+      "eval_samples_per_second": 131.568,
+      "eval_steps_per_second": 32.893,
+      "step": 120000
+    },
+    {
+      "epoch": 0.3213618335874693,
+      "grad_norm": 1.523931622505188,
+      "learning_rate": 2e-05,
+      "loss": 3.7449,
+      "step": 121500
+    },
+    {
+      "epoch": 0.325329263631759,
+      "grad_norm": 1.1351861953735352,
+      "learning_rate": 2e-05,
+      "loss": 3.74,
+      "step": 123000
+    },
+    {
+      "epoch": 0.32929669367604875,
+      "grad_norm": 1.9448051452636719,
+      "learning_rate": 2e-05,
+      "loss": 3.738,
+      "step": 124500
+    },
+    {
+      "epoch": 0.3332641237203385,
+      "grad_norm": 2.2584104537963867,
+      "learning_rate": 2e-05,
+      "loss": 3.7349,
+      "step": 126000
+    },
+    {
+      "epoch": 0.33723155376462827,
+      "grad_norm": 1.9008771181106567,
+      "learning_rate": 2e-05,
+      "loss": 3.7294,
+      "step": 127500
+    },
+    {
+      "epoch": 0.341198983808918,
+      "grad_norm": 1.5307859182357788,
+      "learning_rate": 2e-05,
+      "loss": 3.7374,
+      "step": 129000
+    },
+    {
+      "epoch": 0.34516641385320773,
+      "grad_norm": 1.4004909992218018,
+      "learning_rate": 2e-05,
+      "loss": 3.739,
+      "step": 130500
+    },
+    {
+      "epoch": 0.34913384389749746,
+      "grad_norm": 2.54671573638916,
+      "learning_rate": 2e-05,
+      "loss": 3.7185,
+      "step": 132000
+    },
+    {
+      "epoch": 0.3531012739417872,
+      "grad_norm": 1.5276941061019897,
+      "learning_rate": 2e-05,
+      "loss": 3.7359,
+      "step": 133500
+    },
+    {
+      "epoch": 0.357068703986077,
+      "grad_norm": 1.8586397171020508,
+      "learning_rate": 2e-05,
+      "loss": 3.7275,
+      "step": 135000
+    },
+    {
+      "epoch": 0.357068703986077,
+      "eval_loss": 3.708439826965332,
+      "eval_runtime": 508.0774,
+      "eval_samples_per_second": 131.498,
+      "eval_steps_per_second": 32.875,
+      "step": 135000
+    },
+    {
+      "epoch": 0.3610361340303667,
+      "grad_norm": 1.7233084440231323,
+      "learning_rate": 2e-05,
+      "loss": 3.7294,
+      "step": 136500
+    },
+    {
+      "epoch": 0.36500356407465645,
+      "grad_norm": 1.4811041355133057,
+      "learning_rate": 2e-05,
+      "loss": 3.7377,
+      "step": 138000
+    },
+    {
+      "epoch": 0.3689709941189462,
+      "grad_norm": 1.778673768043518,
+      "learning_rate": 2e-05,
+      "loss": 3.7252,
+      "step": 139500
+    },
+    {
+      "epoch": 0.3729384241632359,
+      "grad_norm": 1.6126329898834229,
+      "learning_rate": 2e-05,
+      "loss": 3.7139,
+      "step": 141000
+    },
+    {
+      "epoch": 0.3769058542075257,
+      "grad_norm": 1.7368344068527222,
+      "learning_rate": 2e-05,
+      "loss": 3.7165,
+      "step": 142500
+    },
+    {
+      "epoch": 0.38087328425181544,
+      "grad_norm": 1.5401374101638794,
+      "learning_rate": 2e-05,
+      "loss": 3.7141,
+      "step": 144000
+    },
+    {
+      "epoch": 0.38484071429610517,
+      "grad_norm": 1.4228651523590088,
+      "learning_rate": 2e-05,
+      "loss": 3.7132,
+      "step": 145500
+    },
+    {
+      "epoch": 0.3888081443403949,
+      "grad_norm": 1.5804122686386108,
+      "learning_rate": 2e-05,
+      "loss": 3.7199,
+      "step": 147000
+    },
+    {
+      "epoch": 0.3927755743846847,
+      "grad_norm": 1.7531423568725586,
+      "learning_rate": 2e-05,
+      "loss": 3.7251,
+      "step": 148500
+    },
+    {
+      "epoch": 0.3967430044289744,
+      "grad_norm": 2.62064528465271,
+      "learning_rate": 2e-05,
+      "loss": 3.7075,
+      "step": 150000
+    },
+    {
+      "epoch": 0.3967430044289744,
+      "eval_loss": 3.693021535873413,
+      "eval_runtime": 508.4788,
+      "eval_samples_per_second": 131.394,
+      "eval_steps_per_second": 32.849,
+      "step": 150000
+    },
+    {
+      "epoch": 0.40071043447326415,
+      "grad_norm": 2.2958357334136963,
+      "learning_rate": 2e-05,
+      "loss": 3.7145,
+      "step": 151500
+    },
+    {
+      "epoch": 0.4046778645175539,
+      "grad_norm": 1.2222336530685425,
+      "learning_rate": 2e-05,
+      "loss": 3.7229,
+      "step": 153000
+    },
+    {
+      "epoch": 0.4086452945618436,
+      "grad_norm": 1.8726732730865479,
+      "learning_rate": 2e-05,
+      "loss": 3.7075,
+      "step": 154500
+    },
+    {
+      "epoch": 0.4126127246061334,
+      "grad_norm": 2.425828218460083,
+      "learning_rate": 2e-05,
+      "loss": 3.7002,
+      "step": 156000
+    },
+    {
+      "epoch": 0.41658015465042314,
+      "grad_norm": 1.6482657194137573,
+      "learning_rate": 2e-05,
+      "loss": 3.7024,
+      "step": 157500
+    },
+    {
+      "epoch": 0.4205475846947129,
+      "grad_norm": 2.0351409912109375,
+      "learning_rate": 2e-05,
+      "loss": 3.7079,
+      "step": 159000
+    },
+    {
+      "epoch": 0.4245150147390026,
+      "grad_norm": 1.5541174411773682,
+      "learning_rate": 2e-05,
+      "loss": 3.6976,
+      "step": 160500
+    },
+    {
+      "epoch": 0.42848244478329234,
+      "grad_norm": 2.035759687423706,
+      "learning_rate": 2e-05,
+      "loss": 3.6955,
+      "step": 162000
+    },
+    {
+      "epoch": 0.4324498748275821,
+      "grad_norm": 2.4348483085632324,
+      "learning_rate": 2e-05,
+      "loss": 3.6893,
+      "step": 163500
+    },
+    {
+      "epoch": 0.43641730487187186,
+      "grad_norm": 2.0152392387390137,
+      "learning_rate": 2e-05,
+      "loss": 3.6942,
+      "step": 165000
+    },
+    {
+      "epoch": 0.43641730487187186,
+      "eval_loss": 3.679271697998047,
+      "eval_runtime": 508.381,
+      "eval_samples_per_second": 131.419,
+      "eval_steps_per_second": 32.855,
+      "step": 165000
+    },
+    {
+      "epoch": 0.4403847349161616,
+      "grad_norm": 2.1153664588928223,
+      "learning_rate": 2e-05,
+      "loss": 3.6959,
+      "step": 166500
+    },
+    {
+      "epoch": 0.4443521649604513,
+      "grad_norm": 1.2098298072814941,
+      "learning_rate": 2e-05,
+      "loss": 3.6839,
+      "step": 168000
+    },
+    {
+      "epoch": 0.44831959500474106,
+      "grad_norm": 2.01379132270813,
+      "learning_rate": 2e-05,
+      "loss": 3.687,
+      "step": 169500
+    },
+    {
+      "epoch": 0.45228702504903084,
+      "grad_norm": 1.823215365409851,
+      "learning_rate": 2e-05,
+      "loss": 3.6932,
+      "step": 171000
+    },
+    {
+      "epoch": 0.4562544550933206,
+      "grad_norm": 2.030813694000244,
+      "learning_rate": 2e-05,
+      "loss": 3.6907,
+      "step": 172500
+    },
+    {
+      "epoch": 0.4602218851376103,
+      "grad_norm": 1.4791704416275024,
+      "learning_rate": 2e-05,
+      "loss": 3.6929,
+      "step": 174000
+    },
+    {
+      "epoch": 0.46418931518190004,
+      "grad_norm": 1.747860312461853,
+      "learning_rate": 2e-05,
+      "loss": 3.6912,
+      "step": 175500
+    },
+    {
+      "epoch": 0.4681567452261898,
+      "grad_norm": 1.837263584136963,
+      "learning_rate": 2e-05,
+      "loss": 3.6975,
+      "step": 177000
+    },
+    {
+      "epoch": 0.47212417527047956,
+      "grad_norm": 1.7791885137557983,
+      "learning_rate": 2e-05,
+      "loss": 3.687,
+      "step": 178500
+    },
+    {
+      "epoch": 0.4760916053147693,
+      "grad_norm": 1.7485198974609375,
+      "learning_rate": 2e-05,
+      "loss": 3.6875,
+      "step": 180000
+    },
+    {
+      "epoch": 0.4760916053147693,
+      "eval_loss": 3.666386127471924,
+      "eval_runtime": 508.5011,
+      "eval_samples_per_second": 131.388,
+      "eval_steps_per_second": 32.848,
+      "step": 180000
+    },
+    {
+      "epoch": 0.480059035359059,
+      "grad_norm": 1.6664392948150635,
+      "learning_rate": 2e-05,
+      "loss": 3.6763,
+      "step": 181500
+    },
+    {
+      "epoch": 0.48402646540334876,
+      "grad_norm": 1.7725111246109009,
+      "learning_rate": 2e-05,
+      "loss": 3.6824,
+      "step": 183000
+    },
+    {
+      "epoch": 0.4879938954476385,
+      "grad_norm": 2.1763925552368164,
+      "learning_rate": 2e-05,
+      "loss": 3.6885,
+      "step": 184500
+    },
+    {
+      "epoch": 0.4919613254919283,
+      "grad_norm": 1.492720603942871,
+      "learning_rate": 2e-05,
+      "loss": 3.6825,
+      "step": 186000
+    },
+    {
+      "epoch": 0.495928755536218,
+      "grad_norm": 1.365192174911499,
+      "learning_rate": 2e-05,
+      "loss": 3.6879,
+      "step": 187500
+    },
+    {
+      "epoch": 0.49989618558050775,
+      "grad_norm": 1.8424572944641113,
+      "learning_rate": 2e-05,
+      "loss": 3.6825,
+      "step": 189000
+    },
+    {
+      "epoch": 0.5038636156247975,
+      "grad_norm": 1.6156889200210571,
+      "learning_rate": 2e-05,
+      "loss": 3.6819,
+      "step": 190500
+    },
+    {
+      "epoch": 0.5078310456690872,
+      "grad_norm": 1.3726621866226196,
+      "learning_rate": 2e-05,
+      "loss": 3.6766,
+      "step": 192000
+    },
+    {
+      "epoch": 0.511798475713377,
+      "grad_norm": 1.5357227325439453,
+      "learning_rate": 2e-05,
+      "loss": 3.6686,
+      "step": 193500
+    },
+    {
+      "epoch": 0.5157659057576667,
+      "grad_norm": 1.6667983531951904,
+      "learning_rate": 2e-05,
+      "loss": 3.673,
+      "step": 195000
+    },
+    {
+      "epoch": 0.5157659057576667,
+      "eval_loss": 3.6544015407562256,
+      "eval_runtime": 508.3154,
+      "eval_samples_per_second": 131.436,
+      "eval_steps_per_second": 32.86,
+      "step": 195000
+    },
+    {
+      "epoch": 0.5197333358019565,
+      "grad_norm": 1.6955657005310059,
+      "learning_rate": 2e-05,
+      "loss": 3.6681,
+      "step": 196500
+    },
+    {
+      "epoch": 0.5237007658462463,
+      "grad_norm": 2.2115893363952637,
+      "learning_rate": 2e-05,
+      "loss": 3.6631,
+      "step": 198000
+    },
+    {
+      "epoch": 0.5276681958905359,
+      "grad_norm": 1.6372627019882202,
+      "learning_rate": 2e-05,
+      "loss": 3.6663,
+      "step": 199500
+    },
+    {
+      "epoch": 0.5316356259348257,
+      "grad_norm": 1.719591498374939,
+      "learning_rate": 2e-05,
+      "loss": 3.6668,
+      "step": 201000
+    },
+    {
+      "epoch": 0.5356030559791154,
+      "grad_norm": 1.9526976346969604,
+      "learning_rate": 2e-05,
+      "loss": 3.6614,
+      "step": 202500
+    },
+    {
+      "epoch": 0.5395704860234052,
+      "grad_norm": 1.4962718486785889,
+      "learning_rate": 2e-05,
+      "loss": 3.6541,
+      "step": 204000
+    },
+    {
+      "epoch": 0.543537916067695,
+      "grad_norm": 1.552954077720642,
+      "learning_rate": 2e-05,
+      "loss": 3.667,
+      "step": 205500
+    },
+    {
+      "epoch": 0.5475053461119846,
+      "grad_norm": 2.0610477924346924,
+      "learning_rate": 2e-05,
+      "loss": 3.6551,
+      "step": 207000
+    },
+    {
+      "epoch": 0.5514727761562744,
+      "grad_norm": 1.989310383796692,
+      "learning_rate": 2e-05,
+      "loss": 3.6638,
+      "step": 208500
+    },
+    {
+      "epoch": 0.5554402062005642,
+      "grad_norm": 1.2716268301010132,
+      "learning_rate": 2e-05,
+      "loss": 3.6599,
+      "step": 210000
+    },
+    {
+      "epoch": 0.5554402062005642,
+      "eval_loss": 3.6426613330841064,
+      "eval_runtime": 508.1725,
+      "eval_samples_per_second": 131.473,
+      "eval_steps_per_second": 32.869,
+      "step": 210000
+    },
+    {
+      "epoch": 0.5594076362448539,
+      "grad_norm": 1.81135892868042,
+      "learning_rate": 2e-05,
+      "loss": 3.6569,
+      "step": 211500
+    },
+    {
+      "epoch": 0.5633750662891437,
+      "grad_norm": 1.3680062294006348,
+      "learning_rate": 2e-05,
+      "loss": 3.6584,
+      "step": 213000
+    },
+    {
+      "epoch": 0.5673424963334334,
+      "grad_norm": 1.6358596086502075,
+      "learning_rate": 2e-05,
+      "loss": 3.6474,
+      "step": 214500
+    },
+    {
+      "epoch": 0.5713099263777232,
+      "grad_norm": 1.3548320531845093,
+      "learning_rate": 2e-05,
+      "loss": 3.6492,
+      "step": 216000
+    },
+    {
+      "epoch": 0.5752773564220129,
+      "grad_norm": 1.5863239765167236,
+      "learning_rate": 2e-05,
+      "loss": 3.6549,
+      "step": 217500
+    },
+    {
+      "epoch": 0.5792447864663026,
+      "grad_norm": 1.269634485244751,
+      "learning_rate": 2e-05,
+      "loss": 3.6405,
+      "step": 219000
+    },
+    {
+      "epoch": 0.5832122165105924,
+      "grad_norm": 1.405115008354187,
+      "learning_rate": 2e-05,
+      "loss": 3.6517,
+      "step": 220500
+    },
+    {
+      "epoch": 0.5871796465548821,
+      "grad_norm": 1.9946074485778809,
+      "learning_rate": 2e-05,
+      "loss": 3.6555,
+      "step": 222000
+    },
+    {
+      "epoch": 0.5911470765991719,
+      "grad_norm": 1.3142012357711792,
+      "learning_rate": 2e-05,
+      "loss": 3.6596,
+      "step": 223500
+    },
+    {
+      "epoch": 0.5951145066434617,
+      "grad_norm": 2.1748433113098145,
+      "learning_rate": 2e-05,
+      "loss": 3.6562,
+      "step": 225000
+    },
+    {
+      "epoch": 0.5951145066434617,
+      "eval_loss": 3.6325650215148926,
+      "eval_runtime": 508.3225,
+      "eval_samples_per_second": 131.434,
+      "eval_steps_per_second": 32.859,
+      "step": 225000
+    },
+    {
+      "epoch": 0.5990819366877513,
+      "grad_norm": 1.736243724822998,
+      "learning_rate": 2e-05,
+      "loss": 3.6535,
+      "step": 226500
+    },
+    {
+      "epoch": 0.6030493667320411,
+      "grad_norm": 1.4503512382507324,
+      "learning_rate": 2e-05,
+      "loss": 3.6472,
+      "step": 228000
+    },
+    {
+      "epoch": 0.6070167967763308,
+      "grad_norm": 1.7718149423599243,
+      "learning_rate": 2e-05,
+      "loss": 3.652,
+      "step": 229500
+    },
+    {
+      "epoch": 0.6109842268206206,
+      "grad_norm": 1.7987645864486694,
+      "learning_rate": 2e-05,
+      "loss": 3.6393,
+      "step": 231000
+    },
+    {
+      "epoch": 0.6149516568649104,
+      "grad_norm": 2.3214948177337646,
+      "learning_rate": 2e-05,
+      "loss": 3.6552,
+      "step": 232500
+    },
+    {
+      "epoch": 0.6189190869092001,
+      "grad_norm": 1.3199760913848877,
+      "learning_rate": 2e-05,
+      "loss": 3.6537,
+      "step": 234000
+    },
+    {
+      "epoch": 0.6228865169534898,
+      "grad_norm": 2.081068992614746,
+      "learning_rate": 2e-05,
+      "loss": 3.6401,
+      "step": 235500
+    },
+    {
+      "epoch": 0.6268539469977795,
+      "grad_norm": 1.275061845779419,
+      "learning_rate": 2e-05,
+      "loss": 3.6443,
+      "step": 237000
+    },
+    {
+      "epoch": 0.6308213770420693,
+      "grad_norm": 1.7260355949401855,
+      "learning_rate": 2e-05,
+      "loss": 3.6342,
+      "step": 238500
+    },
+    {
+      "epoch": 0.6347888070863591,
+      "grad_norm": 1.4915698766708374,
+      "learning_rate": 2e-05,
+      "loss": 3.6409,
+      "step": 240000
+    },
+    {
+      "epoch": 0.6347888070863591,
+      "eval_loss": 3.6224632263183594,
+      "eval_runtime": 509.0689,
+      "eval_samples_per_second": 131.242,
+      "eval_steps_per_second": 32.811,
+      "step": 240000
+    },
+    {
+      "epoch": 0.6387562371306488,
+      "grad_norm": 2.0755701065063477,
+      "learning_rate": 2e-05,
+      "loss": 3.6467,
+      "step": 241500
+    },
+    {
+      "epoch": 0.6427236671749386,
+      "grad_norm": 1.1929559707641602,
+      "learning_rate": 2e-05,
+      "loss": 3.6437,
+      "step": 243000
+    },
+    {
+      "epoch": 0.6466910972192282,
+      "grad_norm": 1.8279653787612915,
+      "learning_rate": 2e-05,
+      "loss": 3.6362,
+      "step": 244500
+    },
+    {
+      "epoch": 0.650658527263518,
+      "grad_norm": 2.0271174907684326,
+      "learning_rate": 2e-05,
+      "loss": 3.6278,
+      "step": 246000
+    },
+    {
+      "epoch": 0.6546259573078078,
+      "grad_norm": 2.5957841873168945,
+      "learning_rate": 2e-05,
+      "loss": 3.6292,
+      "step": 247500
+    },
+    {
+      "epoch": 0.6585933873520975,
+      "grad_norm": 1.8614776134490967,
+      "learning_rate": 2e-05,
+      "loss": 3.6334,
+      "step": 249000
+    },
+    {
+      "epoch": 0.6625608173963873,
+      "grad_norm": 1.9923863410949707,
+      "learning_rate": 2e-05,
+      "loss": 3.6389,
+      "step": 250500
+    },
+    {
+      "epoch": 0.666528247440677,
+      "grad_norm": 1.719926357269287,
+      "learning_rate": 2e-05,
+      "loss": 3.6319,
+      "step": 252000
+    },
+    {
+      "epoch": 0.6704956774849667,
+      "grad_norm": 1.542592167854309,
+      "learning_rate": 2e-05,
+      "loss": 3.6308,
+      "step": 253500
+    },
+    {
+      "epoch": 0.6744631075292565,
+      "grad_norm": 1.6688200235366821,
+      "learning_rate": 2e-05,
+      "loss": 3.6306,
+      "step": 255000
+    },
+    {
+      "epoch": 0.6744631075292565,
+      "eval_loss": 3.6125638484954834,
+      "eval_runtime": 508.6624,
+      "eval_samples_per_second": 131.346,
+      "eval_steps_per_second": 32.837,
+      "step": 255000
+    },
+    {
+      "epoch": 0.6784305375735462,
+      "grad_norm": 2.0490431785583496,
+      "learning_rate": 2e-05,
+      "loss": 3.6257,
+      "step": 256500
+    },
+    {
+      "epoch": 0.682397967617836,
+      "grad_norm": 1.5656068325042725,
+      "learning_rate": 2e-05,
+      "loss": 3.6327,
+      "step": 258000
+    },
+    {
+      "epoch": 0.6863653976621257,
+      "grad_norm": 1.741310715675354,
+      "learning_rate": 2e-05,
+      "loss": 3.624,
+      "step": 259500
+    },
+    {
+      "epoch": 0.6903328277064155,
+      "grad_norm": 1.5890847444534302,
+      "learning_rate": 2e-05,
+      "loss": 3.631,
+      "step": 261000
+    },
+    {
+      "epoch": 0.6943002577507053,
+      "grad_norm": 2.6518447399139404,
+      "learning_rate": 2e-05,
+      "loss": 3.6219,
+      "step": 262500
+    },
+    {
+      "epoch": 0.6982676877949949,
+      "grad_norm": 1.344978928565979,
+      "learning_rate": 2e-05,
+      "loss": 3.637,
+      "step": 264000
+    },
+    {
+      "epoch": 0.7022351178392847,
+      "grad_norm": 1.8812129497528076,
+      "learning_rate": 2e-05,
+      "loss": 3.6273,
+      "step": 265500
+    },
+    {
+      "epoch": 0.7062025478835744,
+      "grad_norm": 1.59878408908844,
+      "learning_rate": 2e-05,
+      "loss": 3.6204,
+      "step": 267000
+    },
+    {
+      "epoch": 0.7101699779278642,
+      "grad_norm": 2.1738855838775635,
+      "learning_rate": 2e-05,
+      "loss": 3.6113,
+      "step": 268500
+    },
+    {
+      "epoch": 0.714137407972154,
+      "grad_norm": 1.292671799659729,
+      "learning_rate": 2e-05,
+      "loss": 3.6244,
+      "step": 270000
+    },
+    {
+      "epoch": 0.714137407972154,
+      "eval_loss": 3.6037514209747314,
+      "eval_runtime": 508.7856,
+      "eval_samples_per_second": 131.315,
+      "eval_steps_per_second": 32.829,
+      "step": 270000
+    },
+    {
+      "epoch": 0.7181048380164436,
+      "grad_norm": 1.5464812517166138,
+      "learning_rate": 2e-05,
+      "loss": 3.618,
+      "step": 271500
+    },
+    {
+      "epoch": 0.7220722680607334,
+      "grad_norm": 1.2013261318206787,
+      "learning_rate": 2e-05,
+      "loss": 3.6131,
+      "step": 273000
+    },
+    {
+      "epoch": 0.7260396981050231,
+      "grad_norm": 1.424142599105835,
+      "learning_rate": 2e-05,
+      "loss": 3.6223,
+      "step": 274500
+    },
+    {
+      "epoch": 0.7300071281493129,
+      "grad_norm": 2.2044925689697266,
+      "learning_rate": 2e-05,
+      "loss": 3.6141,
+      "step": 276000
+    },
+    {
+      "epoch": 0.7339745581936027,
+      "grad_norm": 1.668161392211914,
+      "learning_rate": 2e-05,
+      "loss": 3.6179,
+      "step": 277500
+    },
+    {
+      "epoch": 0.7379419882378924,
+      "grad_norm": 2.1473095417022705,
+      "learning_rate": 2e-05,
+      "loss": 3.6185,
+      "step": 279000
+    },
+    {
+      "epoch": 0.7419094182821822,
+      "grad_norm": 2.150111436843872,
+      "learning_rate": 2e-05,
+      "loss": 3.6244,
+      "step": 280500
+    },
+    {
+      "epoch": 0.7458768483264718,
+      "grad_norm": 1.4405452013015747,
+      "learning_rate": 2e-05,
+      "loss": 3.6174,
+      "step": 282000
+    },
+    {
+      "epoch": 0.7498442783707616,
+      "grad_norm": 1.622565507888794,
+      "learning_rate": 2e-05,
+      "loss": 3.6176,
+      "step": 283500
+    },
+    {
+      "epoch": 0.7538117084150514,
+      "grad_norm": 1.6604636907577515,
+      "learning_rate": 2e-05,
+      "loss": 3.6026,
+      "step": 285000
+    },
+    {
+      "epoch": 0.7538117084150514,
+      "eval_loss": 3.594251871109009,
+      "eval_runtime": 508.3923,
+      "eval_samples_per_second": 131.416,
+      "eval_steps_per_second": 32.855,
+      "step": 285000
+    },
+    {
+      "epoch": 0.7577791384593411,
+      "grad_norm": 1.5415109395980835,
+      "learning_rate": 2e-05,
+      "loss": 3.6221,
+      "step": 286500
+    },
+    {
+      "epoch": 0.7617465685036309,
+      "grad_norm": 2.0804574489593506,
+      "learning_rate": 2e-05,
+      "loss": 3.6095,
+      "step": 288000
+    },
+    {
+      "epoch": 0.7657139985479205,
+      "grad_norm": 1.6679505109786987,
+      "learning_rate": 2e-05,
+      "loss": 3.6175,
+      "step": 289500
+    },
+    {
+      "epoch": 0.7696814285922103,
+      "grad_norm": 1.562348484992981,
+      "learning_rate": 2e-05,
+      "loss": 3.604,
+      "step": 291000
+    },
+    {
+      "epoch": 0.7736488586365001,
+      "grad_norm": 1.4263495206832886,
+      "learning_rate": 2e-05,
+      "loss": 3.6253,
+      "step": 292500
+    },
+    {
+      "epoch": 0.7776162886807898,
+      "grad_norm": 2.324164390563965,
+      "learning_rate": 2e-05,
+      "loss": 3.6093,
+      "step": 294000
+    },
+    {
+      "epoch": 0.7815837187250796,
+      "grad_norm": 1.5391567945480347,
+      "learning_rate": 2e-05,
+      "loss": 3.5999,
+      "step": 295500
+    },
+    {
+      "epoch": 0.7855511487693694,
+      "grad_norm": 1.6802319288253784,
+      "learning_rate": 2e-05,
+      "loss": 3.6148,
+      "step": 297000
+    },
+    {
+      "epoch": 0.789518578813659,
+      "grad_norm": 1.5229837894439697,
+      "learning_rate": 2e-05,
+      "loss": 3.6033,
+      "step": 298500
+    },
+    {
+      "epoch": 0.7934860088579488,
+      "grad_norm": 1.6937259435653687,
+      "learning_rate": 2e-05,
+      "loss": 3.606,
+      "step": 300000
+    },
+    {
+      "epoch": 0.7934860088579488,
+      "eval_loss": 3.5862793922424316,
+      "eval_runtime": 512.2421,
+      "eval_samples_per_second": 130.429,
+      "eval_steps_per_second": 32.608,
+      "step": 300000
+    },
+    {
+      "epoch": 0.7974534389022385,
+      "grad_norm": 1.4837462902069092,
+      "learning_rate": 2e-05,
+      "loss": 3.6056,
+      "step": 301500
+    },
+    {
+      "epoch": 0.8014208689465283,
+      "grad_norm": 2.499969482421875,
+      "learning_rate": 2e-05,
+      "loss": 3.5886,
+      "step": 303000
+    },
+    {
+      "epoch": 0.8053882989908181,
+      "grad_norm": 1.9505125284194946,
+      "learning_rate": 2e-05,
+      "loss": 3.6012,
+      "step": 304500
+    },
+    {
+      "epoch": 0.8093557290351078,
+      "grad_norm": 1.3550680875778198,
+      "learning_rate": 2e-05,
+      "loss": 3.604,
+      "step": 306000
+    },
+    {
+      "epoch": 0.8133231590793976,
+      "grad_norm": 1.6309428215026855,
+      "learning_rate": 2e-05,
+      "loss": 3.5924,
+      "step": 307500
+    },
+    {
+      "epoch": 0.8172905891236872,
+      "grad_norm": 1.4535800218582153,
+      "learning_rate": 2e-05,
+      "loss": 3.6027,
+      "step": 309000
+    },
+    {
+      "epoch": 0.821258019167977,
+      "grad_norm": 1.4342435598373413,
+      "learning_rate": 2e-05,
+      "loss": 3.5984,
+      "step": 310500
+    },
+    {
+      "epoch": 0.8252254492122668,
+      "grad_norm": 1.4876697063446045,
+      "learning_rate": 2e-05,
+      "loss": 3.608,
+      "step": 312000
+    },
+    {
+      "epoch": 0.8291928792565565,
+      "grad_norm": 1.625393271446228,
+      "learning_rate": 2e-05,
+      "loss": 3.5937,
+      "step": 313500
+    },
+    {
+      "epoch": 0.8331603093008463,
+      "grad_norm": 1.6351014375686646,
+      "learning_rate": 2e-05,
+      "loss": 3.5954,
+      "step": 315000
+    },
+    {
+      "epoch": 0.8331603093008463,
+      "eval_loss": 3.578728675842285,
+      "eval_runtime": 509.839,
+      "eval_samples_per_second": 131.043,
+      "eval_steps_per_second": 32.761,
+      "step": 315000
+    },
+    {
+      "epoch": 0.837127739345136,
+      "grad_norm": 1.5755149126052856,
+      "learning_rate": 2e-05,
+      "loss": 3.5986,
+      "step": 316500
+    },
+    {
+      "epoch": 0.8410951693894257,
+      "grad_norm": 2.0601248741149902,
+      "learning_rate": 2e-05,
+      "loss": 3.5942,
+      "step": 318000
+    },
+    {
+      "epoch": 0.8450625994337155,
+      "grad_norm": 1.9317686557769775,
+      "learning_rate": 2e-05,
+      "loss": 3.5993,
+      "step": 319500
+    },
+    {
+      "epoch": 0.8490300294780052,
+      "grad_norm": 2.375718593597412,
+      "learning_rate": 2e-05,
+      "loss": 3.5833,
+      "step": 321000
+    },
+    {
+      "epoch": 0.852997459522295,
+      "grad_norm": 2.2386367321014404,
+      "learning_rate": 2e-05,
+      "loss": 3.587,
+      "step": 322500
+    },
+    {
+      "epoch": 0.8569648895665847,
+      "grad_norm": 1.9626859426498413,
+      "learning_rate": 2e-05,
+      "loss": 3.5951,
+      "step": 324000
+    },
+    {
+      "epoch": 0.8609323196108745,
+      "grad_norm": 1.279280424118042,
+      "learning_rate": 2e-05,
+      "loss": 3.5881,
+      "step": 325500
+    },
+    {
+      "epoch": 0.8648997496551643,
+      "grad_norm": 1.8277925252914429,
+      "learning_rate": 2e-05,
+      "loss": 3.5933,
+      "step": 327000
+    },
+    {
+      "epoch": 0.8688671796994539,
+      "grad_norm": 1.578514575958252,
+      "learning_rate": 2e-05,
+      "loss": 3.5936,
+      "step": 328500
+    },
+    {
+      "epoch": 0.8728346097437437,
+      "grad_norm": 1.8757489919662476,
+      "learning_rate": 2e-05,
+      "loss": 3.5997,
+      "step": 330000
+    },
+    {
+      "epoch": 0.8728346097437437,
+      "eval_loss": 3.5698554515838623,
+      "eval_runtime": 508.3917,
+      "eval_samples_per_second": 131.416,
+      "eval_steps_per_second": 32.855,
+      "step": 330000
+    },
+    {
+      "epoch": 0.8768020397880334,
+      "grad_norm": 1.7685048580169678,
+      "learning_rate": 2e-05,
+      "loss": 3.5876,
+      "step": 331500
+    },
+    {
+      "epoch": 0.8807694698323232,
+      "grad_norm": 1.6386830806732178,
+      "learning_rate": 2e-05,
+      "loss": 3.5908,
+      "step": 333000
+    },
+    {
+      "epoch": 0.884736899876613,
+      "grad_norm": 1.6260840892791748,
+      "learning_rate": 2e-05,
+      "loss": 3.5839,
+      "step": 334500
+    },
+    {
+      "epoch": 0.8887043299209026,
+      "grad_norm": 1.721689224243164,
+      "learning_rate": 2e-05,
+      "loss": 3.5876,
+      "step": 336000
+    },
+    {
+      "epoch": 0.8926717599651924,
+      "grad_norm": 2.114492654800415,
+      "learning_rate": 2e-05,
+      "loss": 3.5752,
+      "step": 337500
+    },
+    {
+      "epoch": 0.8966391900094821,
+      "grad_norm": 1.563759446144104,
+      "learning_rate": 2e-05,
+      "loss": 3.5679,
+      "step": 339000
+    },
+    {
+      "epoch": 0.9006066200537719,
+      "grad_norm": 1.5705732107162476,
+      "learning_rate": 2e-05,
+      "loss": 3.5934,
+      "step": 340500
+    },
+    {
+      "epoch": 0.9045740500980617,
+      "grad_norm": 1.3177400827407837,
+      "learning_rate": 2e-05,
+      "loss": 3.5942,
+      "step": 342000
+    },
+    {
+      "epoch": 0.9085414801423514,
+      "grad_norm": 2.3175089359283447,
+      "learning_rate": 2e-05,
+      "loss": 3.5812,
+      "step": 343500
+    },
+    {
+      "epoch": 0.9125089101866412,
+      "grad_norm": 1.7788249254226685,
+      "learning_rate": 2e-05,
+      "loss": 3.5801,
+      "step": 345000
+    },
+    {
+      "epoch": 0.9125089101866412,
+      "eval_loss": 3.562117338180542,
+      "eval_runtime": 508.9343,
+      "eval_samples_per_second": 131.276,
+      "eval_steps_per_second": 32.82,
+      "step": 345000
+    },
+    {
+      "epoch": 0.9164763402309308,
+      "grad_norm": 1.8623145818710327,
+      "learning_rate": 2e-05,
+      "loss": 3.5826,
+      "step": 346500
+    },
+    {
+      "epoch": 0.9204437702752206,
+      "grad_norm": 1.7332490682601929,
+      "learning_rate": 2e-05,
+      "loss": 3.5857,
+      "step": 348000
+    },
+    {
+      "epoch": 0.9244112003195104,
+      "grad_norm": 2.2232964038848877,
+      "learning_rate": 2e-05,
+      "loss": 3.5789,
+      "step": 349500
+    },
+    {
+      "epoch": 0.9283786303638001,
+      "grad_norm": 1.4348344802856445,
+      "learning_rate": 2e-05,
+      "loss": 3.5736,
+      "step": 351000
+    },
+    {
+      "epoch": 0.9323460604080899,
+      "grad_norm": 1.3817743062973022,
+      "learning_rate": 2e-05,
+      "loss": 3.5759,
+      "step": 352500
+    },
+    {
+      "epoch": 0.9363134904523795,
+      "grad_norm": 1.8987672328948975,
+      "learning_rate": 2e-05,
+      "loss": 3.5743,
+      "step": 354000
+    },
+    {
+      "epoch": 0.9402809204966693,
+      "grad_norm": 1.5825597047805786,
+      "learning_rate": 2e-05,
+      "loss": 3.5746,
+      "step": 355500
+    },
+    {
+      "epoch": 0.9442483505409591,
+      "grad_norm": 1.5452762842178345,
+      "learning_rate": 2e-05,
+      "loss": 3.5781,
+      "step": 357000
+    },
+    {
+      "epoch": 0.9482157805852488,
+      "grad_norm": 1.3254438638687134,
+      "learning_rate": 2e-05,
+      "loss": 3.5868,
+      "step": 358500
+    },
+    {
+      "epoch": 0.9521832106295386,
+      "grad_norm": 2.779953956604004,
+      "learning_rate": 2e-05,
+      "loss": 3.5705,
+      "step": 360000
+    },
+    {
+      "epoch": 0.9521832106295386,
+      "eval_loss": 3.5557963848114014,
+      "eval_runtime": 508.3414,
+      "eval_samples_per_second": 131.429,
+      "eval_steps_per_second": 32.858,
+      "step": 360000
+    },
+    {
+      "epoch": 0.9561506406738283,
+      "grad_norm": 1.8877593278884888,
+      "learning_rate": 2e-05,
+      "loss": 3.561,
+      "step": 361500
+    },
+    {
+      "epoch": 0.960118070718118,
+      "grad_norm": 1.5967822074890137,
+      "learning_rate": 2e-05,
+      "loss": 3.5633,
+      "step": 363000
+    },
+    {
+      "epoch": 0.9640855007624078,
+      "grad_norm": 2.1027090549468994,
+      "learning_rate": 2e-05,
+      "loss": 3.576,
+      "step": 364500
+    },
+    {
+      "epoch": 0.9680529308066975,
+      "grad_norm": 1.8715656995773315,
+      "learning_rate": 2e-05,
+      "loss": 3.5809,
+      "step": 366000
+    },
+    {
+      "epoch": 0.9720203608509873,
+      "grad_norm": 1.6980156898498535,
+      "learning_rate": 2e-05,
+      "loss": 3.5678,
+      "step": 367500
+    },
+    {
+      "epoch": 0.975987790895277,
+      "grad_norm": 1.4774205684661865,
+      "learning_rate": 2e-05,
+      "loss": 3.5771,
+      "step": 369000
+    }
+  ],
+  "logging_steps": 1500,
+  "max_steps": 378078,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.653629565827236e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0002e7e084213a534b889c72e039fd6fc9abae9b0498d394c5c7323149eb7ad
+size 5112